예제 #1
0
def ImportDataSet(calculationObject, baseFolder, datasetId, importSettings):
    """Import a complete dataset into the index database and its own database.

    First removes the dataset's entry from the index (so a failed import
    leaves nothing visible), then — unless importSettings['ConfigOnly'] —
    recreates the dataset database from scratch, imports all data tables,
    2D data tables, the reference genome (if present) and workspaces, and
    finally re-registers the dataset in the index.

    :param calculationObject: project logging/progress/credential object.
    :param baseFolder: folder containing one sub-folder per dataset.
    :param datasetId: dataset identifier; also used as the database name.
    :param importSettings: dict of import options; 'ConfigOnly' skips the
        database drop/recreate and records an import time of 0.
    """
    # datasetId is interpolated verbatim into SQL statements below; these two
    # checks ensure it is a safe identifier before any statement is built.
    Utils.CheckSafeIdentifier(datasetId)
    with calculationObject.LogHeader(
            'Importing dataset {0}'.format(datasetId)):
        calculationObject.Log('Import settings: ' + str(importSettings))
        DQXUtils.CheckValidIdentifier(datasetId)
        datasetFolder = os.path.join(baseFolder, datasetId)
        indexDb = config.DB

        # Fail early if the caller may not write the index or the dataset db.
        calculationObject.credentialInfo.VerifyCanDo(
            DQXDbTools.DbOperationWrite(indexDb, 'datasetindex'))
        calculationObject.credentialInfo.VerifyCanDo(
            DQXDbTools.DbOperationWrite(datasetId))

        # Remove current reference in the index first: if import fails, nothing will show up
        ImpUtils.ExecuteSQL(
            calculationObject, indexDb,
            'DELETE FROM datasetindex WHERE id="{0}"'.format(datasetId))

        globalSettings = SettingsLoader.SettingsLoader(
            os.path.join(datasetFolder, 'settings'))
        globalSettings.RequireTokens(['Name'])
        globalSettings.AddTokenIfMissing('Description', '')

        print('Global settings: ' + str(globalSettings.Get()))

        if not importSettings['ConfigOnly']:
            # Dropping existing database
            calculationObject.SetInfo('Dropping database')
            print('Dropping database')
            try:
                ImpUtils.ExecuteSQL(
                    calculationObject, indexDb,
                    'DROP DATABASE IF EXISTS {0}'.format(datasetId))
            except Exception as e:
                # Best effort: keep going if the drop fails, but log why
                # instead of silently swallowing every possible error.
                calculationObject.Log(
                    'WARNING: failed to drop database {0}: {1}'.format(
                        datasetId, e))
            ImpUtils.ExecuteSQL(calculationObject, indexDb,
                                'CREATE DATABASE {0}'.format(datasetId))

            # Creating new database
            scriptPath = os.path.dirname(os.path.realpath(__file__))
            calculationObject.SetInfo('Creating database')
            print('Creating new database')
            ImpUtils.ExecuteSQLScript(calculationObject,
                                      scriptPath + '/createdataset.sql',
                                      datasetId)

        # Clear the per-dataset catalog tables before re-importing.
        ImpUtils.ExecuteSQL(calculationObject, datasetId,
                            'DELETE FROM propertycatalog')
        ImpUtils.ExecuteSQL(calculationObject, datasetId,
                            'DELETE FROM summaryvalues')
        ImpUtils.ExecuteSQL(calculationObject, datasetId,
                            'DELETE FROM tablecatalog')
        ImpUtils.ExecuteSQL(calculationObject, datasetId,
                            'DELETE FROM settings')
        ImpUtils.ExecuteSQL(calculationObject, datasetId,
                            'DELETE FROM customdatacatalog')

        datatables = []

        if globalSettings.HasToken('DataTables'):
            if not type(globalSettings['DataTables']) is list:
                raise Exception('DataTables token should be a list')
            datatables = globalSettings['DataTables']

        # Add any datatable folder found on disk that is not already listed.
        for tableDir in os.listdir(os.path.join(datasetFolder, 'datatables')):
            if os.path.isdir(os.path.join(datasetFolder, 'datatables', tableDir)):
                if tableDir not in datatables:
                    datatables.append(tableDir)
        print('Data tables: ' + str(datatables))
        for datatable in datatables:
            ImportDataTable.ImportDataTable(
                calculationObject, datasetId, datatable,
                os.path.join(datasetFolder, 'datatables', datatable),
                importSettings)

        # 2D data tables are optional; absence of the token means none.
        try:
            datatables_2D = globalSettings['2D_DataTables']
        except KeyError:
            datatables_2D = []
        if type(datatables_2D) is not list:
            raise TypeError('2D_DataTables token should be a list')
        for datatable in datatables_2D:
            Import2DDataTable.ImportDataTable(
                calculationObject, datasetId, datatable,
                os.path.join(datasetFolder, '2D_datatables', datatable),
                importSettings)

        if os.path.exists(os.path.join(datasetFolder, 'refgenome')):
            ImportRefGenome.ImportRefGenome(
                calculationObject, datasetId,
                os.path.join(datasetFolder, 'refgenome'), importSettings)
            globalSettings.AddTokenIfMissing('hasGenomeBrowser', True)

        ImportWorkspaces.ImportWorkspaces(calculationObject, datasetFolder,
                                          datasetId, importSettings)

        # Global settings
        print('Defining global settings')
        ImpUtils.ImportGlobalSettings(calculationObject, datasetId,
                                      globalSettings)

        # Finalise: register dataset
        print('Registering dataset')
        importtime = 0
        if not importSettings['ConfigOnly']:
            importtime = time.time()
        # NOTE(review): globalSettings['Name'] is interpolated unescaped; a
        # name containing a double quote would break this statement — confirm
        # upstream validation or switch to a parameterized ImpUtils call.
        ImpUtils.ExecuteSQL(
            calculationObject, indexDb,
            'INSERT INTO datasetindex VALUES ("{0}", "{1}", "{2}")'.format(
                datasetId, globalSettings['Name'], str(math.ceil(importtime))))
예제 #2
0
def LoadTable(calculationObject, sourceFileName, databaseid, tableid, columns, loadSettings, importSettings, allowSubSampling):
    """Bulk-load a tab-separated source file into a new MySQL table.

    Reads ``sourceFileName`` (first line = tab-separated column headers),
    converts each cell to a SQL literal according to the column descriptions
    in ``columns``, streams batched multi-row INSERT statements into a
    temporary SQL file, creates the destination table schema, and executes
    the dump against ``databaseid``. Optionally builds a companion
    ``<tableid>_SORTRAND`` table in random row order for sub-sampling.

    :param calculationObject: project logging/progress object.
    :param sourceFileName: path of the tab-separated input file.
    :param databaseid: name of the target database.
    :param tableid: name of the table to (re)create and fill.
    :param columns: list of dicts, each with at least 'name', 'DataType' and
        'ReadData'. Mutated in place: keys 'IsString', 'IsValue', 'IsDate',
        'IsBoolean' and 'MaxLen' are added below.
    :param loadSettings: dict; 'PrimKey' names the primary key column, or
        the sentinel 'AutoKey' for an auto-increment key.
    :param importSettings: dict; 'ScopeStr' caps the number of imported
        lines ('1k', '10k', '100k', '1M', '10M'; anything else = no limit).
    :param allowSubSampling: when True, adds a _randomval_ column and builds
        the <tableid>_SORTRAND table.
    """

    def DecoId(id):
        # Wrap an identifier in MySQL backtick quotes.
        return '`' + id + '`'

    def EncodeCell(icontent, col):
        # Convert one raw text cell into a SQL literal, driven by the type
        # flags (IsString/IsValue/IsDate/IsBoolean) computed on `col` below.
        content = icontent
        if col['IsString']:
            if len(icontent) == 0:
                content = "''"
            else:
                try:
                    # NOTE(review): written for Python 2 str semantics; under
                    # Python 3 encode() returns bytes and the str replace()
                    # calls below would fail — confirm runtime before porting.
                    content = content.encode('ascii', 'ignore')
                except UnicodeDecodeError:
                    print('Unable to encode '+content)
                    content='*failed encoding*'
                # Normalise Windows-1252-style quote bytes to a plain quote.
                content = content.replace("\x92", "'")
                content = content.replace("\xC2", "'")
                content = content.replace("\x91", "'")
                # Escape for inclusion in a single-quoted SQL literal.
                content = content.replace("'", "\\'")
                content = content.replace('\r\n', '\\n')
                content = content.replace('\n\r', '\\n')
                content = '\'' + content + '\''

        if col['IsValue']:
            # Common "missing value" spellings become SQL NULL.
            if (content == 'NA') or (content == '') or (content == 'None') or (content == 'NULL') or (content == 'null') or (content == 'inf') or (content == '-'):
                content = 'NULL'

        if col['IsDate']:
            if len(content)>=5:
                try:
                    dt = dateutil.parser.parse(content)
                    # Days since the Unix epoch, shifted to a Julian day
                    # number (2440587.5 = JD of 1970-01-01 00:00).
                    tmdiff  =(dt - datetime.datetime(1970, 1, 1)).days
                    tmdiff += 2440587.5 +0.5 # note: extra 0.5 because we set datestamp at noon
                    content = str(tmdiff)
                except:
                    print('ERROR: date parsing string '+content)
                    content = 'NULL'
            else:
                # Too short to be a parseable date.
                content = 'NULL'

        if col['IsBoolean']:
            # Accept several truthy/falsy spellings; anything else stays NULL.
            vl = content
            content = 'NULL'
            if (vl.lower() == 'true') or (vl.lower() == 'yes') or (vl.lower() == 'y') or (vl == '1'):
                content = '1'
            if (vl.lower() == 'false') or (vl.lower() == 'no') or (vl.lower() == 'n') or (vl == '0'):
                content = '0'

        return content

    calculationObject.Log('Loading table {0} from {1}'.format(tableid, sourceFileName))
    colDict = {col['name']: col for col in columns}
    colNameList = [col['name'] for col in columns]
    primkey = loadSettings['PrimKey']
    # 'AutoKey' is a sentinel: no key column in the file; use AUTO_INCREMENT.
    autoPrimKey = (primkey == 'AutoKey')
    print('Column info: ' + str(columns))
    print('Primary key: ' + primkey)

    # Pre-compute per-column type flags used by EncodeCell; MaxLen tracks the
    # longest encoded string so varchar sizes can be chosen after the scan.
    for col in columns:
        col['IsString'] = (col['DataType'] == 'Text')
        col['IsValue'] = ImpUtils.IsValueDataTypeIdenfifier(col['DataType'])
        col['IsDate'] = ImpUtils.IsDateDataTypeIdenfifier(col['DataType'])
        col['IsBoolean'] = (col['DataType'] == 'Boolean')
        col['MaxLen'] = 0

    destFileName = ImpUtils.GetTempFileName()

    # Translate the import-scope setting into a line cap (-1 = unlimited).
    maxLineCount = -1
    if importSettings['ScopeStr'] == '1k':
        maxLineCount = 1000
    if importSettings['ScopeStr'] == '10k':
        maxLineCount = 10000
    if importSettings['ScopeStr'] == '100k':
        maxLineCount = 100000
    if importSettings['ScopeStr'] == '1M':
        maxLineCount = 1000000
    if importSettings['ScopeStr'] == '10M':
        maxLineCount = 10000000

    with open(sourceFileName, 'r') as ifp:
        if ifp is None:
            raise Exception('Unable to read file '+sourceFileName)
        with open(destFileName, 'w') as ofp:
            if ofp is None:
                raise Exception('Unable to write to temporary file ' + destFileName)
            # The header line defines the file's own column order.
            fileColNames = ifp.readline().rstrip('\n\r').split('\t')
            calculationObject.Log('File columns: ' + str(fileColNames))
            fileColIndex = {fileColNames[i]: i for i in range(len(fileColNames))}
            if not(autoPrimKey) and (primkey not in fileColIndex):
                raise Exception('File is missing primary key '+primkey)
            for col in columns:
                colname = col['name']
                if (col['ReadData']) and (colname not in fileColIndex):
                    raise Exception('File is missing column '+colname)

            # Rows are batched into multi-row INSERT statements of this size.
            blockSize = 499
            blockStarted = False

            lineCount = 0
            for line in ifp:
                line = line.rstrip('\r\n')
                if len(line) > 0:
                    sourceCells = line.split('\t')
                    writeCells = []
                    for col in columns:
                        # Columns absent from the file are written as NULL.
                        content = 'NULL'
                        if col['name'] in fileColIndex:
                            content = sourceCells[fileColIndex[col['name']]]
                            content = EncodeCell(content, col)
                        writeCells.append(content)
                        if col['IsString']:
                            col['MaxLen'] = max(col['MaxLen'], len(content))

                    if not(blockStarted):
                        # Begin a new INSERT statement for the next batch.
                        ofp.write('INSERT INTO {0} ({1}) VALUES '.format(DecoId(tableid), ', '.join([DecoId(col) for col in colNameList])))
                        blockStarted = True
                        blockNr = 0

                    if blockNr > 0:
                        ofp.write(',')
                    ofp.write('(')
                    ofp.write(','.join(writeCells))
                    ofp.write(')')
                    blockNr += 1
                    if blockNr >= blockSize:
                        # Close the current INSERT; the next row starts a new one.
                        ofp.write(';\n')
                        blockStarted = False
                    lineCount += 1
                    if lineCount % 250000 == 0:
                        calculationObject.Log('Line '+str(lineCount))
                    if (maxLineCount>0) and (lineCount >= maxLineCount):
                        calculationObject.Log('WARNING:Terminating import at {0} lines'.format(lineCount))
                        break

    calculationObject.Log('Column sizes: '+str({col['name']: col['MaxLen'] for col in columns}))

    # Build the CREATE TABLE statement; Text columns get a varchar sized to
    # the longest value actually seen during the scan above.
    calculationObject.Log('Creating schema')
    scr = ImpUtils.SQLScript(calculationObject)
    scr.AddCommand('drop table if exists {0};'.format(DecoId(tableid)))
    sql = 'CREATE TABLE {0} (\n'.format(DecoId(tableid))
    colTokens = []
    if autoPrimKey:
        colTokens.append("{0} int AUTO_INCREMENT PRIMARY KEY".format(primkey))
    if allowSubSampling:
        colTokens.append("_randomval_ double")
    for col in columns:
        st = DecoId(col['name'])
        typestr = ''
        if col['DataType'] == 'Text':
            typestr = 'varchar({0})'.format(max(1, col['MaxLen']))
        if len(typestr) == 0:
            typestr = ImpUtils.GetSQLDataType(col['DataType'])
        if len(typestr) == 0:
            raise Exception('Invalid property data type ' + col['DataType'])
        st += ' '+typestr
        colTokens.append(st)
    sql += ', '.join(colTokens)
    sql +=')'
    scr.AddCommand(sql)
    calculationObject.Log('Creation statement: '+sql)
    if not(autoPrimKey):
        scr.AddCommand('create unique index {0}_{1} ON {0}({1})'.format(tableid, primkey))
    for col in columns:
        if ('Index' in col) and (col['Index']) and (col['name'] != primkey):
            scr.AddCommand('create index {0}_{1} ON {0}({1})'.format(tableid, col['name']))
    scr.Execute(databaseid)

    calculationObject.Log('Importing data')
    ImpUtils.ExecuteSQLScript(calculationObject, destFileName, databaseid)

    if allowSubSampling:
        # Build <tableid>_SORTRAND: a copy of the table in random order with
        # a dense AUTO_INCREMENT key (RandPrimKey), so a random sub-sample
        # can later be taken as a contiguous key range.
        with calculationObject.LogHeader('Create subsampling table'):
            calculationObject.Log('Creating random data column')
            sql = "UPDATE {0} SET _randomval_=RAND()".format(tableid)
            ImpUtils.ExecuteSQL(calculationObject, databaseid, sql)
            sql = "CREATE INDEX {0}_randomindex ON {0}(_randomval_)".format(tableid)
            ImpUtils.ExecuteSQL(calculationObject, databaseid, sql)
            sql = "DROP TABLE IF EXISTS {0}_SORTRAND".format(tableid)
            ImpUtils.ExecuteSQL(calculationObject, databaseid, sql)
            sql = "CREATE TABLE {0}_SORTRAND LIKE {0}".format(tableid)
            ImpUtils.ExecuteSQL(calculationObject, databaseid, sql)
            if autoPrimKey:
                # Recreate AutoKey without AUTO_INCREMENT so the copy keeps
                # the original key values.
                calculationObject.Log('Restructuring AutoKey')
                sql = "alter table {0}_SORTRAND drop column AutoKey".format(tableid)
                ImpUtils.ExecuteSQL(calculationObject, databaseid, sql)
                sql = "alter table {0}_SORTRAND add column AutoKey int FIRST".format(tableid)
                ImpUtils.ExecuteSQL(calculationObject, databaseid, sql)
                sql = "create index idx_autokey on {0}_SORTRAND(AutoKey)".format(tableid)
                ImpUtils.ExecuteSQL(calculationObject, databaseid, sql)
            sql = "alter table {0}_SORTRAND add column RandPrimKey int AUTO_INCREMENT PRIMARY KEY".format(tableid)
            ImpUtils.ExecuteSQL(calculationObject, databaseid, sql)
            sql = "insert into {0}_SORTRAND select *,0 from {0} order by _randomval_".format(tableid)
            sql += ' LIMIT 5000000' # NOTE: there is little point in importing more than that!
            ImpUtils.ExecuteSQL(calculationObject, databaseid, sql)


    os.remove(destFileName)
예제 #3
0
def LoadTable0(calculationObject, sourceFileName, databaseid, tableid, columns, loadSettings):
    """Load a source file into a freshly created datatable via VTTable.

    Reads the whole file into an in-memory VTTable, restricts and reorders
    it to the declared columns, converts value/boolean columns from text,
    creates the SQL table and bulk-loads the rows from a temporary SQL dump.
    """
    calculationObject.Log('Loading table {0} from {1}'.format(tableid, sourceFileName))

    columnsByName = {spec['name']: spec for spec in columns}
    orderedNames = [spec['name'] for spec in columns]
    print('Column info: '+str(columns))

    # Read everything as text first; typed conversion happens below.
    table = VTTable.VTTable()
    table.allColumnsText = True
    try:
        table.LoadFile(sourceFileName, loadSettings['MaxTableSize'])
    except Exception as exc:
        raise Exception('Error while reading file: '+str(exc))
    calculationObject.Log('---- ORIG TABLE ----')
    with calculationObject.LogDataDump():
        table.PrintRows(0, 9)

    # Every declared column must be present in the file, and the declared
    # primary key must be one of the declared columns.
    for spec in columns:
        if not table.IsColumnPresent(spec['name']):
            raise Exception('Missing column "{0}" in datatable "{1}"'.format(spec['name'], tableid))

    if loadSettings['PrimKey'] not in columnsByName:
        raise Exception('Missing primary key column "{0}" in datatable "{1}"'.format(loadSettings['PrimKey'], tableid))

    # Keep only the declared columns, in declaration order.
    for name in table.GetColList():
        if name not in columnsByName:
            table.DropCol(name)
    table.ArrangeColumns(orderedNames)

    # Convert value-typed and boolean columns from their text form.
    for spec in columns:
        name = spec['name']
        if ImpUtils.IsValueDataTypeIdenfifier(spec['DataType']):
            table.ConvertColToValue(name)
        if spec['DataType'] == 'Boolean':
            table.MapCol(name, ImpUtils.convertToBooleanInt)
            table.ConvertColToValue(name)
    calculationObject.Log('---- PROCESSED TABLE ----')
    with calculationObject.LogDataDump():
        table.PrintRows(0, 9)

    # Build the CREATE TABLE statement; Text columns get a varchar sized to
    # the longest value actually present in the loaded data.
    columnDefs = []
    for spec in columns:
        name = spec['name']
        columnIndex = table.GetColNr(name)
        sqlType = ''
        if spec['DataType'] == 'Text':
            longest = 1
            for rowIndex in table.GetRowNrRange():
                longest = max(longest, len(table.GetValue(rowIndex, columnIndex)))
            sqlType = 'varchar({0})'.format(longest)
        if len(sqlType) == 0:
            sqlType = ImpUtils.GetSQLDataType(spec['DataType'])
        columnDefs.append(name + ' ' + sqlType)
    createcmd = 'CREATE TABLE {0} ('.format(tableid) + ', '.join(columnDefs) + ')'

    calculationObject.Log('Creating datatable')
    script = ImpUtils.SQLScript(calculationObject)
    script.AddCommand('drop table if exists {0}'.format(tableid))
    script.AddCommand(createcmd)
    script.AddCommand('create unique index {0}_{1} ON {0}({1})'.format(tableid, loadSettings['PrimKey']))
    script.Execute(databaseid)

    calculationObject.Log('Loading datatable values')
    dumpFileName = ImpUtils.GetTempFileName()
    table.SaveSQLDump(dumpFileName, tableid)
    ImpUtils.ExecuteSQLScript(calculationObject, dumpFileName, databaseid)
    os.remove(dumpFileName)
예제 #4
0
def ImportRefGenome(calculationObject, datasetId, folder, importSettings):
    """Import reference-genome data for a dataset.

    Imports summary data and genome settings, then — depending on
    importSettings['ScopeStr'] — converts the reference sequence, loads the
    chromosomes table, and converts/imports the GFF annotation.

    :param calculationObject: project logging/progress object.
    :param datasetId: dataset identifier (also the target database name).
    :param folder: the dataset's 'refgenome' folder, expected to contain
        'settings', 'chromosomes', and optionally 'refsequence.fa' and
        'annotation.gff'.
    :param importSettings: dict; only when 'ScopeStr' == 'all' are the
        reference sequence and annotation converted.
    """
    with calculationObject.LogHeader('Importing reference genome data'):

        ImportRefGenomeSummaryData(calculationObject, datasetId, folder,
                                   importSettings)

        settings = SettingsLoader.SettingsLoader(
            os.path.join(folder, 'settings'))
        settings.DefineKnownTokens(['AnnotMaxViewportSize', 'RefSequenceSumm'])
        print('Settings: ' + str(settings.Get()))
        ImpUtils.ImportGlobalSettings(calculationObject, datasetId, settings)

        # Import reference genome
        if importSettings['ScopeStr'] == 'all':
            refsequencefile = os.path.join(folder, 'refsequence.fa')
            if os.path.exists(refsequencefile):
                with calculationObject.LogHeader(
                        'Converting reference genome'):
                    # The converter works in place, so copy the FASTA file
                    # into the dataset's summary-tracks folder first.
                    destfolder = config.BASEDIR + '/SummaryTracks/' + datasetId + '/Sequence'
                    if not os.path.exists(destfolder):
                        os.makedirs(destfolder)
                    tempfastafile = destfolder + '/refsequence.fa'
                    shutil.copyfile(refsequencefile, tempfastafile)
                    ImpUtils.RunConvertor(calculationObject,
                                          'Fasta2FilterBankData', destfolder,
                                          ['refsequence.fa'])
            else:
                calculationObject.Log(
                    'WARNING: missing reference sequence file')
        else:
            # Partial-scope imports skip the (expensive) sequence conversion.
            calculationObject.Log(
                'WARNING: Skipping converting reference genome')

        # Import chromosomes
        with calculationObject.LogHeader('Loading chromosomes'):
            tb = VTTable.VTTable()
            tb.allColumnsText = True
            try:
                tb.LoadFile(os.path.join(folder, 'chromosomes'))
            except Exception as e:
                raise Exception('Error while reading chromosomes file: ' +
                                str(e))
            # Rename file columns to the schema expected by the database.
            tb.RequireColumnSet(['chrom', 'length'])
            tb.RenameCol('chrom', 'id')
            tb.RenameCol('length', 'len')
            tb.ConvertColToValue('len')
            with calculationObject.LogDataDump():
                tb.PrintRows(0, 99)
            # Replace the chromosomes table contents via a temporary dump.
            sqlfile = ImpUtils.GetTempFileName()
            tb.SaveSQLDump(sqlfile, 'chromosomes')
            ImpUtils.ExecuteSQL(calculationObject, datasetId,
                                'DELETE FROM chromosomes')
            ImpUtils.ExecuteSQLScript(calculationObject, sqlfile, datasetId)
            os.remove(sqlfile)

        if importSettings['ScopeStr'] == 'all':
            # Import annotation
            with calculationObject.LogHeader('Converting annotation'):
                # ParseGFF writes its outputs next to the input file, so run
                # it on a copy in the temp directory and clean up afterwards.
                tempgfffile = ImpUtils.GetTempFileName()
                temppath = os.path.dirname(tempgfffile)
                shutil.copyfile(os.path.join(folder, 'annotation.gff'),
                                tempgfffile)
                ImpUtils.RunConvertor(calculationObject, 'ParseGFF', temppath,
                                      [os.path.basename(tempgfffile)])
                print('Importing annotation')
                ImpUtils.ExecuteSQLScript(
                    calculationObject,
                    os.path.join(temppath, 'annotation_dump.sql'), datasetId)
                os.remove(tempgfffile)
                os.remove(os.path.join(temppath, 'annotation.txt'))
                os.remove(os.path.join(temppath, 'annotation_dump.sql'))
                os.remove(os.path.join(temppath, 'annotation_create.sql'))
        else:
            calculationObject.Log('WARNING: Skipping converting annotation')