예제 #1
0
def ImportDataTable(calculationObject, datasetId, tableid, folder, importSettings):
    Utils.CheckSafeIdentifier(tableid)
    global tableOrder
    with calculationObject.LogHeader('Importing datatable {0}'.format(tableid)):
        print('Source: ' + folder)
        DQXUtils.CheckValidIdentifier(tableid)

        calculationObject.credentialInfo.VerifyCanDo(DQXDbTools.DbOperationWrite(datasetId, 'tablecatalog'))
        calculationObject.credentialInfo.VerifyCanDo(DQXDbTools.DbOperationWrite(datasetId, 'propertycatalog'))
        calculationObject.credentialInfo.VerifyCanDo(DQXDbTools.DbOperationWrite(datasetId, 'relations'))
        calculationObject.credentialInfo.VerifyCanDo(DQXDbTools.DbOperationWrite(datasetId, 'tablebasedsummaryvalues'))

        tableSettings = SettingsLoader.SettingsLoader(os.path.join(os.path.join(folder, 'settings')))
        tableSettings.RequireTokens(['NameSingle', 'NamePlural', 'PrimKey'])
        tableSettings.AddTokenIfMissing('IsPositionOnGenome', False)
        tableSettings.AddTokenIfMissing('IsRegionOnGenome', False)
        tableSettings.AddTokenIfMissing('MaxTableSize', None)
        tableSettings.AddTokenIfMissing('MaxCountQueryRecords', 200000)
        tableSettings.AddTokenIfMissing('MaxCountQueryAggregated', 1000000)
        tableSettings.AddTokenIfMissing('AllowSubSampling', False)
        extraSettings = tableSettings.Clone()
        extraSettings.DropTokens(['PrimKey', 'Properties'])

        if tableSettings['MaxTableSize'] is not None:
            print('WARNING: table size limited to '+str(tableSettings['MaxTableSize']))

        # Drop existing tablecatalog record
        sql = "DELETE FROM tablecatalog WHERE id='{0}'".format(tableid)
        ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)

        # Add to tablecatalog
        extraSettings.ConvertStringsToSafeSQL()
        sql = "INSERT INTO tablecatalog VALUES ('{0}', '{1}', '{2}', {3}, '{4}', {5})".format(
            tableid,
            tableSettings['NamePlural'],
            tableSettings['PrimKey'],
            tableSettings['IsPositionOnGenome'],
            extraSettings.ToJSON(),
            tableOrder
        )
        ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
        tableOrder += 1

        properties = ImpUtils.LoadPropertyInfo(calculationObject, tableSettings, os.path.join(folder, 'data'))

        sql = "DELETE FROM propertycatalog WHERE tableid='{0}'".format(tableid)
        ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
        sql = "DELETE FROM relations WHERE childtableid='{0}'".format(tableid)
        ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)

        ranknr = 0
        for property in properties:
            propid = property['propid']
            settings = property['Settings']
            extraSettings = settings.Clone()
            extraSettings.DropTokens(['Name', 'DataType', 'Order'])
            sql = "INSERT INTO propertycatalog VALUES ('', 'fixed', '{0}', '{1}', '{2}', '{3}', {4}, '{5}')".format(
                settings['DataType'],
                propid,
                tableid,
                settings['Name'],
                0,
                extraSettings.ToJSON()
            )
            ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
            if settings.HasToken('Relation'):
                relationSettings = settings.GetSubSettings('Relation')
                calculationObject.Log('Creating relation: '+relationSettings.ToJSON())
                relationSettings.RequireTokens(['TableId'])
                relationSettings.AddTokenIfMissing('ForwardName', 'belongs to')
                relationSettings.AddTokenIfMissing('ReverseName', 'has')
                sql = "INSERT INTO relations VALUES ('{0}', '{1}', '{2}', '{3}', '{4}', '{5}')".format(
                    tableid,
                    propid,
                    relationSettings['TableId'],
                    '',
                    relationSettings['ForwardName'],
                    relationSettings['ReverseName']
                )
                ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
            ranknr += 1



        propidList = []
        propDict = {}
        for property in properties:
            propDict[property['propid']] = property
            propidList.append(property['propid'])

        if tableSettings['IsPositionOnGenome']:
            if not tableSettings.HasToken('Chromosome'):
                raise Exception('Missing settings tag Chromosome in genome position datatable.')
            if not tableSettings.HasToken('Position'):
                raise Exception('Missing settings tag Position in genome position datatable.')
            if tableSettings['Chromosome'] not in propDict:
                 raise Exception('Genome position datatable {0} is missing property "{1}"'.format(tableid, tableSettings['Chromosome']))
            if tableSettings['Position'] not in propDict:
                 raise Exception('Genome position datatable {0} is missing property "{1}"'.format(tableid, tableSettings['Position']))

        if tableSettings['IsRegionOnGenome']:
            if not tableSettings.HasToken('Chromosome'):
                raise Exception('Missing setting "Chromosome" in genome region datatable {0}'.format(tableid))
            if not tableSettings.HasToken('RegionStart'):
                raise Exception('Missing setting "RegionStart" in genome region datatable {0}'.format(tableid))
            if not tableSettings.HasToken('RegionStop'):
                raise Exception('Missing setting "RegionStop" in genome region datatable {0}'.format(tableid))
            if tableSettings['Chromosome'] not in propDict:
                 raise Exception('Genome region datatable {0} is missing property "{1}"'.format(tableid, tableSettings['Chromosome']))
            if tableSettings['RegionStart'] not in propDict:
                 raise Exception('Genome region datatable {0} is missing property "{1}"'.format(tableid, tableSettings['RegionStart']))
            if tableSettings['RegionStop'] not in propDict:
                 raise Exception('Genome region datatable {0} is missing property "{1}"'.format(tableid, tableSettings['RegionStop']))
            propDict[tableSettings['Chromosome']]['Settings'].SetToken('Index', True)
            propDict[tableSettings['RegionStart']]['Settings'].SetToken('Index', True)
            propDict[tableSettings['RegionStop']]['Settings'].SetToken('Index', True)


        if not importSettings['ConfigOnly']:
            columns = [ {
                            'name': prop['propid'],
                            'DataType': prop['DataType'],
                            'Index': prop['Settings']['Index'],
                            'ReadData': prop['Settings']['ReadData']
                        }
                        for prop in properties if (prop['propid'] != 'AutoKey')]
            LoadTable.LoadTable(
                calculationObject,
                os.path.join(folder, 'data'),
                datasetId,
                tableid,
                columns,
                tableSettings,
                importSettings,
                tableSettings['AllowSubSampling']
            )
            if tableSettings['IsPositionOnGenome']:
                calculationObject.Log('Indexing chromosome')
                scr = ImpUtils.SQLScript(calculationObject)
                scr.AddCommand('create index {0}_chrompos ON {0}({1},{2})'.format(
                    tableid,
                    tableSettings['Chromosome'],
                    tableSettings['Position']
                ))
                scr.Execute(datasetId)



        print('Creating summary values')
        sql = "DELETE FROM summaryvalues WHERE tableid='{0}'".format(tableid)
        ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
        for property in properties:
            propid = property['propid']
            settings = property['Settings']

            if (settings.HasToken('SummaryValues')) and ImpUtils.IsValueDataTypeIdenfifier(property['DataType']):
                with calculationObject.LogHeader('Creating numerical summary values for {0}.{1}'.format(tableid,propid)):
                    summSettings = settings.GetSubSettings('SummaryValues')
                    if settings.HasToken('minval'):
                        summSettings.AddTokenIfMissing('MinVal', settings['minval'])
                    if settings.HasToken('maxval'):
                        summSettings.AddTokenIfMissing('MaxVal', settings['maxval'])
                    sourceFileName = os.path.join(folder, 'data')
                    destFolder = os.path.join(config.BASEDIR, 'SummaryTracks', datasetId, propid)
                    if not os.path.exists(destFolder):
                        os.makedirs(destFolder)
                    dataFileName = os.path.join(destFolder, propid)
                    if not importSettings['ConfigOnly']:
                        ImpUtils.ExtractColumns(calculationObject, sourceFileName, dataFileName, [tableSettings['Chromosome'], tableSettings['Position'], propid], False, importSettings)
                    ImpUtils.CreateSummaryValues_Value(
                        calculationObject,
                        summSettings,
                        datasetId,
                        tableid,
                        'fixed',
                        '',
                        propid,
                        settings['Name'],
                        dataFileName,
                        importSettings
                    )

            if (settings.HasToken('SummaryValues')) and (property['DataType'] == 'Text'):
                with calculationObject.LogHeader('Creating categorical summary values for {0}.{1}'.format(tableid,propid)):
                    summSettings = settings.GetSubSettings('SummaryValues')
                    sourceFileName = os.path.join(folder, 'data')
                    destFolder = os.path.join(config.BASEDIR, 'SummaryTracks', datasetId, propid)
                    if not os.path.exists(destFolder):
                        os.makedirs(destFolder)
                    dataFileName = os.path.join(destFolder, propid)
                    if not importSettings['ConfigOnly']:
                        ImpUtils.ExtractColumns(calculationObject, sourceFileName, dataFileName, [tableSettings['Chromosome'], tableSettings['Position'], propid], False, importSettings)
                    categories = []
                    if settings.HasToken('categoryColors'):
                        stt = settings.GetSubSettings('categoryColors')
                        categories = [x for x in stt.Get()]
                        summSettings.AddTokenIfMissing('Categories', categories)
                    ImpUtils.CreateSummaryValues_Categorical(
                        calculationObject,
                        summSettings,
                        datasetId,
                        tableid,
                        'fixed',
                        '',
                        propid,
                        settings['Name'],
                        dataFileName,
                        importSettings
                    )

        sql = "DELETE FROM tablebasedsummaryvalues WHERE tableid='{0}'".format(tableid)
        ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)

        if tableSettings.HasToken('TableBasedSummaryValues'):
            calculationObject.Log('Processing table-based summary values')
            if not type(tableSettings['TableBasedSummaryValues']) is list:
                raise Exception('TableBasedSummaryValues token should be a list')
            for stt in tableSettings['TableBasedSummaryValues']:
                summSettings = SettingsLoader.SettingsLoader()
                summSettings.LoadDict(stt)
                summSettings.RequireTokens(['Id', 'Name', 'MaxVal', 'BlockSizeMax'])
                summSettings.AddTokenIfMissing('MinVal', 0)
                summSettings.AddTokenIfMissing('BlockSizeMin', 1)
                summSettings.DefineKnownTokens(['channelColor'])
                summaryid = summSettings['Id']
                with calculationObject.LogHeader('Table based summary value {0}, {1}'.format(tableid, summaryid)):
                    extraSummSettings = summSettings.Clone()
                    extraSummSettings.DropTokens(['Id', 'Name', 'MinVal', 'MaxVal', 'BlockSizeMin', 'BlockSizeMax'])
                    sql = "INSERT INTO tablebasedsummaryvalues VALUES ('{0}', '{1}', '{2}', '{3}', {4}, {5}, {6}, 0)".format(
                        tableid,
                        summaryid,
                        summSettings['Name'],
                        extraSummSettings.ToJSON(),
                        summSettings['MinVal'],
                        summSettings['MaxVal'],
                        summSettings['BlockSizeMin']
                    )
                    ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
                    if importSettings['ScopeStr'] == 'all':
                        itemtracknr = 0
                        for fileid in os.listdir(os.path.join(folder, summaryid)):
                            if not(os.path.isdir(os.path.join(folder, summaryid, fileid))):
                                itemtracknr += 1
                                calculationObject.Log('Processing {0}: {1}'.format(itemtracknr, fileid))
                                destFolder = os.path.join(config.BASEDIR, 'SummaryTracks', datasetId, 'TableTracks', tableid, summaryid, fileid)
                                calculationObject.Log('Destination: '+destFolder)
                                if not os.path.exists(destFolder):
                                    os.makedirs(destFolder)
                                shutil.copyfile(os.path.join(folder, summaryid, fileid), os.path.join(destFolder, summaryid+'_'+fileid))
                                ImpUtils.ExecuteFilterbankSummary_Value(calculationObject, destFolder, summaryid+'_'+fileid, summSettings)
예제 #2
0
def ImportCustomData(calculationObject, datasetId, workspaceid, tableid,
                     sourceid, folder, importSettings):

    with calculationObject.LogHeader('Importing custom data'):
        print('Importing custom data into {0}, {1}, {2} FROM {3}'.format(
            datasetId, workspaceid, tableid, folder))

        credInfo = calculationObject.credentialInfo

        if not ImpUtils.IsDatasetPresentInServer(
                calculationObject.credentialInfo, datasetId):
            raise Exception(
                'Dataset {0} is not found. Please import the dataset first'.
                format(datasetId))

        db = DQXDbTools.OpenDatabase(credInfo, datasetId)
        calculationObject.credentialInfo.VerifyCanDo(
            DQXDbTools.DbOperationWrite(datasetId, 'propertycatalog'))
        calculationObject.credentialInfo.VerifyCanDo(
            DQXDbTools.DbOperationWrite(datasetId, 'workspaces'))

        cur = db.cursor()
        cur.execute(
            'SELECT primkey, settings FROM tablecatalog WHERE id="{0}"'.format(
                tableid))
        row = cur.fetchone()
        if row is None:
            raise Exception(
                'Unable to find table record for table {0} in dataset {1}'.
                format(tableid, datasetId))
        primkey = row[0]
        tableSettingsStr = row[1]
        db.close()

        ImpUtils.ExecuteSQL(
            calculationObject, datasetId,
            'DELETE FROM customdatacatalog WHERE tableid="{tableid}" and sourceid="{sourceid}"'
            .format(tableid=tableid, sourceid=sourceid))

        tableSettings = SettingsLoader.SettingsLoader()
        tableSettings.LoadDict(simplejson.loads(tableSettingsStr,
                                                strict=False))

        allowSubSampling = tableSettings['AllowSubSampling']

        isPositionOnGenome = False
        if tableSettings.HasToken(
                'IsPositionOnGenome') and tableSettings['IsPositionOnGenome']:
            isPositionOnGenome = True
            chromField = tableSettings['Chromosome']
            posField = tableSettings['Position']

        settings = SettingsLoader.SettingsLoader(
            os.path.join(os.path.join(folder, 'settings')))

        ImpUtils.ExecuteSQL(
            calculationObject, datasetId,
            "INSERT INTO customdatacatalog VALUES ('{tableid}', '{sourceid}', '{settings}')"
            .format(tableid=tableid,
                    sourceid=sourceid,
                    settings=settings.ToJSON()))

        properties = ImpUtils.LoadPropertyInfo(calculationObject, settings,
                                               os.path.join(folder, 'data'))

        # remove primary key, just in case
        properties = [prop for prop in properties if prop['propid'] != primkey]

        sourcetable = Utils.GetTableWorkspaceProperties(workspaceid, tableid)
        print('Source table: ' + sourcetable)

        #Get list of existing properties
        db = DQXDbTools.OpenDatabase(calculationObject.credentialInfo,
                                     datasetId)
        cur = db.cursor()
        cur.execute(
            'SELECT propid FROM propertycatalog WHERE (workspaceid="{0}") and (source="custom") and (tableid="{1}")'
            .format(workspaceid, tableid))
        existingProperties = [row[0] for row in cur.fetchall()]
        print('Existing properties: ' + str(existingProperties))
        db.close()

        propidList = []
        for property in properties:
            DQXUtils.CheckValidIdentifier(property['propid'])
            propidList.append(property['propid'])

        db = DQXDbTools.OpenDatabase(calculationObject.credentialInfo,
                                     datasetId)
        cur = db.cursor()

        if not importSettings['ConfigOnly']:
            # Dropping columns that will be replaced
            toRemoveExistingProperties = []
            for existProperty in existingProperties:
                if existProperty in propidList:
                    toRemoveExistingProperties.append(existProperty)
            print('Removing outdated information:')
            if len(toRemoveExistingProperties) > 0:
                for prop in toRemoveExistingProperties:
                    print('Removing outdated information: {0} {1} {2}'.format(
                        workspaceid, prop, tableid))
                    sql = 'DELETE FROM propertycatalog WHERE (workspaceid="{0}") and (propid="{1}") and (tableid="{2}")'.format(
                        workspaceid, prop, tableid)
                    print(sql)
                    cur.execute(sql)
                sql = "ALTER TABLE {0} ".format(sourcetable)
                for prop in toRemoveExistingProperties:
                    if prop != toRemoveExistingProperties[0]:
                        sql += ", "
                    sql += "DROP COLUMN {0}".format(prop)
                calculationObject.LogSQLCommand(sql)
                cur.execute(sql)

        ranknr = 0
        for property in properties:
            propid = property['propid']
            settings = property['Settings']
            extraSettings = settings.Clone()
            extraSettings.DropTokens(
                ['Name', 'DataType', 'Order', 'SummaryValues'])
            print('Create property catalog entry for {0} {1} {2}'.format(
                workspaceid, tableid, propid))
            sql = "DELETE FROM propertycatalog WHERE (workspaceid='{0}') and (propid='{1}') and (tableid='{2}')".format(
                workspaceid, propid, tableid)
            ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
            sql = "INSERT INTO propertycatalog VALUES ('{0}', 'custom', '{1}', '{2}', '{3}', '{4}', {5}, '{6}')".format(
                workspaceid, settings['DataType'], propid, tableid,
                settings['Name'], 0, extraSettings.ToJSON())
            ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
            ranknr += 1

        propDict = {}
        for property in properties:
            propDict[property['propid']] = property

        if not importSettings['ConfigOnly']:
            tmptable = Utils.GetTempID()
            columns = [{
                'name': prop['propid'],
                'DataType': prop['DataType'],
                'Index': prop['Settings']['Index'],
                'ReadData': prop['Settings']['ReadData']
            } for prop in properties]
            columns.append({
                'name': primkey,
                'DataType': 'Text',
                'Index': False,
                'ReadData': True
            })
            # print('----------------------------------------------------------------')
            # print(str(columns))
            # print('----------------------------------------------------------------')
            LoadTable.LoadTable(calculationObject,
                                os.path.join(folder, 'data'), datasetId,
                                tmptable, columns, {'PrimKey': primkey},
                                importSettings, False)

            print('Creating new columns')
            calculationObject.Log(
                'WARNING: better mechanism to determine column types needed here'
            )  #TODO: implement
            frst = True
            sql = "ALTER TABLE {0} ".format(sourcetable)
            for property in properties:
                propid = property['propid']
                if not frst:
                    sql += " ,"
                sqldatatype = ImpUtils.GetSQLDataType(property['DataType'])
                sql += "ADD COLUMN {0} {1}".format(propid, sqldatatype)
                frst = False
                calculationObject.LogSQLCommand(sql)
            cur.execute(sql)

            print('Joining information')
            frst = True
            credInfo.VerifyCanDo(
                DQXDbTools.DbOperationWrite(datasetId, sourcetable))
            sql = "update {0} left join {1} on {0}.{2}={1}.{2} set ".format(
                sourcetable, tmptable, primkey)
            for property in properties:
                propid = property['propid']
                if not frst:
                    sql += " ,"
                sql += "{0}.{2}={1}.{2}".format(sourcetable, tmptable, propid)
                frst = False
                calculationObject.LogSQLCommand(sql)
            cur.execute(sql)

            print('Cleaning up')
            cur.execute("DROP TABLE {0}".format(tmptable))

        if not importSettings['ConfigOnly']:
            print('Updating view')
            Utils.UpdateTableInfoView(workspaceid, tableid, allowSubSampling,
                                      cur)

        db.commit()
        db.close()

        print('Creating summary values')
        for property in properties:
            propid = property['propid']
            settings = property['Settings']
            if settings.HasToken('SummaryValues'):
                with calculationObject.LogHeader(
                        'Creating summary values for custom data {0}'.format(
                            tableid)):
                    summSettings = settings.GetSubSettings('SummaryValues')
                    if settings.HasToken('minval'):
                        summSettings.AddTokenIfMissing('MinVal',
                                                       settings['minval'])
                    summSettings.AddTokenIfMissing('MaxVal',
                                                   settings['maxval'])
                    destFolder = os.path.join(config.BASEDIR, 'SummaryTracks',
                                              datasetId, propid)
                    if not os.path.exists(destFolder):
                        os.makedirs(destFolder)
                    dataFileName = os.path.join(destFolder, propid)

                    if not isPositionOnGenome:
                        raise Exception(
                            'Summary values defined for non-position table')

                    if not importSettings['ConfigOnly']:
                        calculationObject.Log('Extracting data to ' +
                                              dataFileName)
                        script = ImpUtils.SQLScript(calculationObject)
                        script.AddCommand(
                            "SELECT {2} as chrom, {3} as pos, {0} FROM {1} ORDER BY {2},{3}"
                            .format(
                                propid,
                                Utils.GetTableWorkspaceView(
                                    workspaceid, tableid), chromField,
                                posField))
                        script.Execute(datasetId, dataFileName)
                        calculationObject.LogFileTop(dataFileName, 10)

                    ImpUtils.CreateSummaryValues_Value(
                        calculationObject, summSettings, datasetId, tableid,
                        'custom', workspaceid, propid, settings['Name'],
                        dataFileName, importSettings)