def ImportRefGenomeSummaryData(calculationObject, datasetId, folder, importSettings):
    if not os.path.exists(os.path.join(folder, 'summaryvalues')):
        return
    calculationObject.credentialInfo.VerifyCanDo(DQXDbTools.DbOperationWrite(datasetId, 'summaryvalues'))
    summaryids = []
    for dir in os.listdir(os.path.join(folder, 'summaryvalues')):
        if os.path.isdir(os.path.join(folder, 'summaryvalues', dir)):
            summaryids.append(dir)
    for summaryid in summaryids:
        with calculationObject.LogHeader('Importing reference genome summary data ' + summaryid):
            DQXUtils.CheckValidIdentifier(summaryid)
            destFolder = os.path.join(config.BASEDIR, 'SummaryTracks', datasetId, summaryid)
            if not os.path.exists(destFolder):
                os.makedirs(destFolder)
            dataFileName = os.path.join(destFolder, summaryid)
            shutil.copyfile(os.path.join(folder, 'summaryvalues', summaryid, 'values'), dataFileName)
            settings = SettingsLoader.SettingsLoader(os.path.join(folder, 'summaryvalues', summaryid, 'settings'))
            settings.RequireTokens(['Name', 'MaxVal', 'BlockSizeMax'])
            settings.AddTokenIfMissing('MinVal', 0)
            settings.AddTokenIfMissing('BlockSizeMin', 1)
            settings.AddTokenIfMissing('ChannelColor', 'rgb(0,0,0)')
            settings.AddTokenIfMissing('Order', 99999)
            settings.DefineKnownTokens(['channelColor'])
            settings.AddTokenIfMissing('ScopeStr', importSettings['ScopeStr'])
            print('SETTINGS: ' + settings.ToJSON())
            if importSettings['ScopeStr'] == 'all':
                print('Executing filter bank')
                ImpUtils.ExecuteFilterbankSummary_Value(calculationObject, destFolder, summaryid, settings)
            else:
                calculationObject.Log('WARNING: Skipping filterbanking genome summary data')
            extraSettings = settings.Clone()
            extraSettings.DropTokens(['Name', 'Order', 'MinVal', 'MaxVal', 'BlockSizeMin', 'BlockSizeMax'])
            sql = "INSERT INTO summaryvalues VALUES ('', 'fixed', '{0}', '-', '{1}', {2}, '{3}', {4}, {5}, {6})".format(
                summaryid,
                settings['Name'],
                settings['Order'],
                extraSettings.ToJSON(),
                settings['MinVal'],
                settings['MaxVal'],
                settings['BlockSizeMin'])
            ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
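# A minimal sketch of the source layout ImportRefGenomeSummaryData expects,
# reconstructed from the reads above (the file contents described here are
# assumptions, not a definitive spec):
#
#   <folder>/summaryvalues/
#       <summaryid>/
#           values      # raw track data, copied verbatim to SummaryTracks/<datasetId>/<summaryid>
#           settings    # must define Name, MaxVal and BlockSizeMax; MinVal, BlockSizeMin,
#                       # ChannelColor and Order fall back to the defaults set above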
def ImportDataTable(calculation_object, dataset_id, tableid, folder, import_settings):
    global tableOrder, property_order
    with calculation_object.LogHeader('Importing 2D datatable {0}'.format(tableid)):
        print('Source: ' + folder)
        DQXUtils.CheckValidIdentifier(tableid)
        table_settings = SettingsLoader.SettingsLoader(os.path.join(folder, 'settings'))
        table_settings.RequireTokens(['NameSingle', 'NamePlural', 'FirstArrayDimension'])
        table_settings.AddTokenIfMissing('ShowInGenomeBrowser', False)
        table_settings.AddTokenIfMissing('ColumnDataTable', '')
        table_settings.AddTokenIfMissing('RowDataTable', '')
        extra_settings = table_settings.Clone()
        extra_settings.DropTokens(['ColumnDataTable', 'ColumnIndexField', 'RowDataTable', 'RowIndexField', 'Properties'])
        remote_hdf5 = h5py.File(os.path.join(folder, 'data.hdf5'), 'r')
        # Check that the referenced tables exist and have the specified index fields.
        if table_settings['ColumnDataTable']:
            sql = "SELECT id FROM tablecatalog WHERE id = '{0}'".format(table_settings['ColumnDataTable'])
            id = ImpUtils.ExecuteSQLQuery(calculation_object, dataset_id, sql)
            try:
                id = id[0][0]
            except IndexError:
                raise Exception("Index Table " + table_settings['ColumnDataTable'] + " doesn't exist")
            sql = "SELECT {0} FROM {1} LIMIT 1".format(table_settings['ColumnIndexField'], table_settings['ColumnDataTable'])
            try:
                # Probe only; the result is unused.
                field = ImpUtils.ExecuteSQLQuery(calculation_object, dataset_id, sql)
            except Exception:
                raise Exception(table_settings['ColumnIndexField'] + " column index field doesn't exist in table " + table_settings['ColumnDataTable'])
        if table_settings['RowDataTable']:
            sql = "SELECT id FROM tablecatalog WHERE id = '{0}'".format(table_settings['RowDataTable'])
            id = ImpUtils.ExecuteSQLQuery(calculation_object, dataset_id, sql)
            try:
                id = id[0][0]
            except IndexError:
                raise Exception("Index Table " + table_settings['RowDataTable'] + " doesn't exist")
            sql = "SELECT {0} FROM {1} LIMIT 1".format(table_settings['RowIndexField'], table_settings['RowDataTable'])
            try:
                field = ImpUtils.ExecuteSQLQuery(calculation_object, dataset_id, sql)
            except Exception:
                raise Exception(table_settings['RowIndexField'] + " row index field doesn't exist in table " + table_settings['RowDataTable'])
        if table_settings['ShowInGenomeBrowser']:
            sql = "SELECT IsPositionOnGenome FROM tablecatalog WHERE id='{0}' ".format(table_settings['ColumnDataTable'])
            is_position = ImpUtils.ExecuteSQLQuery(calculation_object, dataset_id, sql)[0][0]
            if not is_position:
                raise Exception(table_settings['ColumnDataTable'] + ' is not a genomic position based table (IsPositionOnGenome in config), but you have asked to use this table as a column index on a genome browseable 2D array.')
        if table_settings['FirstArrayDimension'] not in ['column', 'row']:
            raise Exception("FirstArrayDimension must be column or row")
        # Add to 2D_tablecatalog
        extra_settings.ConvertStringsToSafeSQL()
        sql = "INSERT INTO 2D_tablecatalog VALUES ('{0}', '{1}', '{2}', '{3}', '{4}', '{5}', {6})".format(
            tableid,
            table_settings['NamePlural'],
            table_settings['ColumnDataTable'],
            table_settings['RowDataTable'],
            table_settings['FirstArrayDimension'],
            extra_settings.ToJSON(),
            tableOrder)
        ImpUtils.ExecuteSQL(calculation_object, dataset_id, sql)
        tableOrder += 1
        for property in table_settings['Properties']:
            extra_settings = copy.deepcopy(property)
            dtype = arraybuffer._strict_dtype_string(remote_hdf5[property['Id']].dtype)
            del extra_settings['Id']
            del extra_settings['Name']
            sql = "INSERT INTO 2D_propertycatalog VALUES ('{0}', '{1}', '{2}', '{3}', '{4}', {5}, '{6}', '{7}')".format(
                property['Id'],
                tableid,
                table_settings['ColumnDataTable'],
                table_settings['RowDataTable'],
                property['Name'],
                property_order,
                dtype,
                simplejson.dumps(extra_settings))
            ImpUtils.ExecuteSQL(calculation_object, dataset_id, sql)
            property_order += 1
        if not import_settings['ConfigOnly']:
            # Insert an index column into the index tables
            if table_settings['ColumnDataTable']:
                if table_settings.HasToken('ColumnIndexArray'):
                    # We have an array that matches a column in the 1D SQL table, so we add an index to that table.
                    # First create a temporary table holding the index array.
                    try:
                        column_index = remote_hdf5[table_settings['ColumnIndexArray']]
                    except KeyError:
                        raise Exception("HDF5 doesn't contain {0} at the root".format(table_settings['ColumnIndexArray']))
                    for property in table_settings['Properties']:
                        if len(column_index) != remote_hdf5[property['Id']].shape[0 if table_settings['FirstArrayDimension'] == 'column' else 1]:
                            raise Exception("Property {0} has a different column length to the column index".format(property['Id']))
                    sql = ImpUtils.Numpy_to_SQL().create_table('TempColIndex', table_settings['ColumnIndexField'], column_index)
                    ImpUtils.ExecuteSQLGenerator(calculation_object, dataset_id, sql)
                    # We have a datatable: add an index to it, then copy that index across to the data table.
                    sql = """ALTER TABLE `TempColIndex` ADD `index` INT DEFAULT NULL;
                             SELECT @i:=-1;
                             UPDATE `TempColIndex` SET `index` = @i:=@i+1;
                             ALTER TABLE `{0}` ADD `{2}_column_index` INT DEFAULT NULL;
                             UPDATE `{0}` INNER JOIN `TempColIndex` ON `{0}`.`{1}` = `TempColIndex`.`{1}` SET `{0}`.`{2}_column_index` = `TempColIndex`.`index`;
                             DROP TABLE `TempColIndex`""".format(
                        table_settings['ColumnDataTable'],
                        table_settings['ColumnIndexField'],
                        tableid)
                    ImpUtils.ExecuteSQL(calculation_object, dataset_id, sql)
                    # Now check we have no NULLs
                    sql = "SELECT `{1}_column_index` FROM `{0}` WHERE `{1}_column_index` IS NULL".format(
                        table_settings['ColumnDataTable'], tableid)
                    nulls = ImpUtils.ExecuteSQLQuery(calculation_object, dataset_id, sql)
                    if len(nulls) > 0:
                        raise Exception("Not all rows in {0} have a corresponding column in 2D datatable {1}".format(
                            table_settings['ColumnDataTable'], tableid))
                else:
                    # No array of keys into a column was given, so the HDF5 data is taken to be in the same
                    # order as the sorted "ColumnIndexField"; index by that column in order.
                    sql = """ALTER TABLE `{0}` ADD `{2}_column_index` INT DEFAULT NULL;
                             SELECT @i:=-1;
                             UPDATE `{0}` SET `{2}_column_index` = @i:=@i+1 ORDER BY `{1}`;""".format(
                        table_settings['ColumnDataTable'],
                        table_settings['ColumnIndexField'],
                        tableid)
                    ImpUtils.ExecuteSQL(calculation_object, dataset_id, sql)
            if table_settings['RowDataTable']:
                if table_settings.HasToken('RowIndexArray'):
                    # Same as the column case: build a temporary table from the row index array.
                    try:
                        row_index = remote_hdf5[table_settings['RowIndexArray']]
                    except KeyError:
                        raise Exception("HDF5 doesn't contain {0} at the root".format(table_settings['RowIndexArray']))
                    for property in table_settings['Properties']:
                        if len(row_index) != remote_hdf5[property['Id']].shape[0 if table_settings['FirstArrayDimension'] == 'row' else 1]:
                            raise Exception("Property {0} has a different row length to the row index".format(property['Id']))
                    sql = ImpUtils.Numpy_to_SQL().create_table('TempRowIndex', table_settings['RowIndexField'], row_index)
                    ImpUtils.ExecuteSQLGenerator(calculation_object, dataset_id, sql)
                    # We have a datatable: add an index to it, then copy that index across to the data table.
                    sql = """ALTER TABLE `TempRowIndex` ADD `index` INT DEFAULT NULL;
                             SELECT @i:=-1;
                             UPDATE `TempRowIndex` SET `index` = @i:=@i+1;
                             ALTER TABLE `{0}` ADD `{2}_row_index` INT DEFAULT NULL;
                             UPDATE `{0}` INNER JOIN `TempRowIndex` ON `{0}`.`{1}` = `TempRowIndex`.`{1}` SET `{0}`.`{2}_row_index` = `TempRowIndex`.`index`;
                             DROP TABLE `TempRowIndex`""".format(
                        table_settings['RowDataTable'],
                        table_settings['RowIndexField'],
                        tableid)
                    ImpUtils.ExecuteSQL(calculation_object, dataset_id, sql)
                    # Now check we have no NULLs
                    sql = "SELECT `{1}_row_index` FROM `{0}` WHERE `{1}_row_index` IS NULL".format(
                        table_settings['RowDataTable'], tableid)
                    nulls = ImpUtils.ExecuteSQLQuery(calculation_object, dataset_id, sql)
                    if len(nulls) > 0:
                        raise Exception("Not all rows in {0} have a corresponding row in 2D datatable {1}".format(
                            table_settings['RowDataTable'], tableid))
                else:
                    # No array of keys into a column was given, so the HDF5 data is taken to be in the same
                    # order as the sorted "RowIndexField"; index by that column in order.
                    sql = """ALTER TABLE `{0}` ADD `{2}_row_index` INT DEFAULT NULL;
                             SELECT @i:=-1;
                             UPDATE `{0}` SET `{2}_row_index` = @i:=@i+1 ORDER BY `{1}`;""".format(
                        table_settings['RowDataTable'],
                        table_settings['RowIndexField'],
                        tableid)
                    ImpUtils.ExecuteSQL(calculation_object, dataset_id, sql)
            # We have the indexes; now make a local copy of the HDF5 data for each property.
            ImpUtils.mkdir(os.path.join(config.BASEDIR, '2D_data'))
            local_hdf5 = h5py.File(os.path.join(config.BASEDIR, '2D_data', dataset_id + '_' + tableid + '.hdf5'),
                                   'w', libver='latest')
            print('Copying HDF5 datasets')
            for property in table_settings['Properties']:
                print('..', property)
                local_hdf5.copy(remote_hdf5[property['Id']], property['Id'])
                print('done')
            print('all copies complete')
            local_hdf5.close()
        remote_hdf5.close()
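# The `@i:=@i+1` idiom used above is MySQL user-variable row numbering. A
# standalone sketch with hypothetical table/column names, assuming a MySQL
# backend:
#
#   ALTER TABLE `samples` ADD `genotypes_row_index` INT DEFAULT NULL;
#   SELECT @i:=-1;
#   UPDATE `samples` SET `genotypes_row_index` = @i:=@i+1 ORDER BY `sample_id`;
#
# Each row then carries its 0-based position in `sample_id` order, which is
# what lets SQL rows be matched to positions along an HDF5 array dimension.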
def ImportDataTable(calculationObject, datasetId, tableid, folder, importSettings):
    Utils.CheckSafeIdentifier(tableid)
    global tableOrder
    with calculationObject.LogHeader('Importing datatable {0}'.format(tableid)):
        print('Source: ' + folder)
        DQXUtils.CheckValidIdentifier(tableid)
        calculationObject.credentialInfo.VerifyCanDo(DQXDbTools.DbOperationWrite(datasetId, 'tablecatalog'))
        calculationObject.credentialInfo.VerifyCanDo(DQXDbTools.DbOperationWrite(datasetId, 'propertycatalog'))
        calculationObject.credentialInfo.VerifyCanDo(DQXDbTools.DbOperationWrite(datasetId, 'relations'))
        calculationObject.credentialInfo.VerifyCanDo(DQXDbTools.DbOperationWrite(datasetId, 'tablebasedsummaryvalues'))
        tableSettings = SettingsLoader.SettingsLoader(os.path.join(folder, 'settings'))
        tableSettings.RequireTokens(['NameSingle', 'NamePlural', 'PrimKey'])
        tableSettings.AddTokenIfMissing('IsPositionOnGenome', False)
        tableSettings.AddTokenIfMissing('IsRegionOnGenome', False)
        tableSettings.AddTokenIfMissing('MaxTableSize', None)
        tableSettings.AddTokenIfMissing('MaxCountQueryRecords', 200000)
        tableSettings.AddTokenIfMissing('MaxCountQueryAggregated', 1000000)
        tableSettings.AddTokenIfMissing('AllowSubSampling', False)
        extraSettings = tableSettings.Clone()
        extraSettings.DropTokens(['PrimKey', 'Properties'])
        if tableSettings['MaxTableSize'] is not None:
            print('WARNING: table size limited to ' + str(tableSettings['MaxTableSize']))
        # Drop any existing tablecatalog record
        sql = "DELETE FROM tablecatalog WHERE id='{0}'".format(tableid)
        ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
        # Add to tablecatalog
        extraSettings.ConvertStringsToSafeSQL()
        sql = "INSERT INTO tablecatalog VALUES ('{0}', '{1}', '{2}', {3}, '{4}', {5})".format(
            tableid,
            tableSettings['NamePlural'],
            tableSettings['PrimKey'],
            tableSettings['IsPositionOnGenome'],
            extraSettings.ToJSON(),
            tableOrder)
        ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
        tableOrder += 1
        properties = ImpUtils.LoadPropertyInfo(calculationObject, tableSettings, os.path.join(folder, 'data'))
        sql = "DELETE FROM propertycatalog WHERE tableid='{0}'".format(tableid)
        ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
        sql = "DELETE FROM relations WHERE childtableid='{0}'".format(tableid)
        ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
        ranknr = 0
        for property in properties:
            propid = property['propid']
            settings = property['Settings']
            extraSettings = settings.Clone()
            extraSettings.DropTokens(['Name', 'DataType', 'Order'])
            sql = "INSERT INTO propertycatalog VALUES ('', 'fixed', '{0}', '{1}', '{2}', '{3}', {4}, '{5}')".format(
                settings['DataType'],
                propid,
                tableid,
                settings['Name'],
                0,
                extraSettings.ToJSON())
            ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
            if settings.HasToken('Relation'):
                relationSettings = settings.GetSubSettings('Relation')
                calculationObject.Log('Creating relation: ' + relationSettings.ToJSON())
                relationSettings.RequireTokens(['TableId'])
                relationSettings.AddTokenIfMissing('ForwardName', 'belongs to')
                relationSettings.AddTokenIfMissing('ReverseName', 'has')
                sql = "INSERT INTO relations VALUES ('{0}', '{1}', '{2}', '{3}', '{4}', '{5}')".format(
                    tableid,
                    propid,
                    relationSettings['TableId'],
                    '',
                    relationSettings['ForwardName'],
                    relationSettings['ReverseName'])
                ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
            ranknr += 1
        propidList = []
        propDict = {}
        for property in properties:
            propDict[property['propid']] = property
            propidList.append(property['propid'])
        if tableSettings['IsPositionOnGenome']:
            if not tableSettings.HasToken('Chromosome'):
                raise Exception('Missing setting "Chromosome" in genome position datatable {0}'.format(tableid))
            if not tableSettings.HasToken('Position'):
                raise Exception('Missing setting "Position" in genome position datatable {0}'.format(tableid))
            if tableSettings['Chromosome'] not in propDict:
                raise Exception('Genome position datatable {0} is missing property "{1}"'.format(tableid, tableSettings['Chromosome']))
            if tableSettings['Position'] not in propDict:
                raise Exception('Genome position datatable {0} is missing property "{1}"'.format(tableid, tableSettings['Position']))
        if tableSettings['IsRegionOnGenome']:
            if not tableSettings.HasToken('Chromosome'):
                raise Exception('Missing setting "Chromosome" in genome region datatable {0}'.format(tableid))
            if not tableSettings.HasToken('RegionStart'):
                raise Exception('Missing setting "RegionStart" in genome region datatable {0}'.format(tableid))
            if not tableSettings.HasToken('RegionStop'):
                raise Exception('Missing setting "RegionStop" in genome region datatable {0}'.format(tableid))
            if tableSettings['Chromosome'] not in propDict:
                raise Exception('Genome region datatable {0} is missing property "{1}"'.format(tableid, tableSettings['Chromosome']))
            if tableSettings['RegionStart'] not in propDict:
                raise Exception('Genome region datatable {0} is missing property "{1}"'.format(tableid, tableSettings['RegionStart']))
            if tableSettings['RegionStop'] not in propDict:
                raise Exception('Genome region datatable {0} is missing property "{1}"'.format(tableid, tableSettings['RegionStop']))
            propDict[tableSettings['Chromosome']]['Settings'].SetToken('Index', True)
            propDict[tableSettings['RegionStart']]['Settings'].SetToken('Index', True)
            propDict[tableSettings['RegionStop']]['Settings'].SetToken('Index', True)
        if not importSettings['ConfigOnly']:
            columns = [{'name': prop['propid'],
                        'DataType': prop['DataType'],
                        'Index': prop['Settings']['Index'],
                        'ReadData': prop['Settings']['ReadData']}
                       for prop in properties if prop['propid'] != 'AutoKey']
            LoadTable.LoadTable(calculationObject, os.path.join(folder, 'data'), datasetId, tableid, columns,
                                tableSettings, importSettings, tableSettings['AllowSubSampling'])
            if tableSettings['IsPositionOnGenome']:
                calculationObject.Log('Indexing chromosome')
                scr = ImpUtils.SQLScript(calculationObject)
                scr.AddCommand('create index {0}_chrompos ON {0}({1},{2})'.format(
                    tableid,
                    tableSettings['Chromosome'],
                    tableSettings['Position']))
                scr.Execute(datasetId)
        print('Creating summary values')
        sql = "DELETE FROM summaryvalues WHERE tableid='{0}'".format(tableid)
        ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
        for property in properties:
            propid = property['propid']
            settings = property['Settings']
            # Note: IsValueDataTypeIdenfifier is the spelling of this helper in ImpUtils.
            if settings.HasToken('SummaryValues') and ImpUtils.IsValueDataTypeIdenfifier(property['DataType']):
                with calculationObject.LogHeader('Creating numerical summary values for {0}.{1}'.format(tableid, propid)):
                    summSettings = settings.GetSubSettings('SummaryValues')
                    if settings.HasToken('minval'):
                        summSettings.AddTokenIfMissing('MinVal', settings['minval'])
                    if settings.HasToken('maxval'):
                        summSettings.AddTokenIfMissing('MaxVal', settings['maxval'])
                    sourceFileName = os.path.join(folder, 'data')
                    destFolder = os.path.join(config.BASEDIR, 'SummaryTracks', datasetId, propid)
                    if not os.path.exists(destFolder):
                        os.makedirs(destFolder)
                    dataFileName = os.path.join(destFolder, propid)
                    if not importSettings['ConfigOnly']:
                        ImpUtils.ExtractColumns(calculationObject, sourceFileName, dataFileName,
                                                [tableSettings['Chromosome'], tableSettings['Position'], propid],
                                                False, importSettings)
                    ImpUtils.CreateSummaryValues_Value(
                        calculationObject,
                        summSettings,
                        datasetId,
                        tableid,
                        'fixed',
                        '',
                        propid,
                        settings['Name'],
                        dataFileName,
                        importSettings)
            if settings.HasToken('SummaryValues') and property['DataType'] == 'Text':
                with calculationObject.LogHeader('Creating categorical summary values for {0}.{1}'.format(tableid, propid)):
                    summSettings = settings.GetSubSettings('SummaryValues')
                    sourceFileName = os.path.join(folder, 'data')
                    destFolder = os.path.join(config.BASEDIR, 'SummaryTracks', datasetId, propid)
                    if not os.path.exists(destFolder):
                        os.makedirs(destFolder)
                    dataFileName = os.path.join(destFolder, propid)
                    if not importSettings['ConfigOnly']:
                        ImpUtils.ExtractColumns(calculationObject, sourceFileName, dataFileName,
                                                [tableSettings['Chromosome'], tableSettings['Position'], propid],
                                                False, importSettings)
                    categories = []
                    if settings.HasToken('categoryColors'):
                        stt = settings.GetSubSettings('categoryColors')
                        categories = [x for x in stt.Get()]
                    summSettings.AddTokenIfMissing('Categories', categories)
                    ImpUtils.CreateSummaryValues_Categorical(
                        calculationObject,
                        summSettings,
                        datasetId,
                        tableid,
                        'fixed',
                        '',
                        propid,
                        settings['Name'],
                        dataFileName,
                        importSettings)
        sql = "DELETE FROM tablebasedsummaryvalues WHERE tableid='{0}'".format(tableid)
        ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
        if tableSettings.HasToken('TableBasedSummaryValues'):
            calculationObject.Log('Processing table-based summary values')
            if not type(tableSettings['TableBasedSummaryValues']) is list:
                raise Exception('TableBasedSummaryValues token should be a list')
            for stt in tableSettings['TableBasedSummaryValues']:
                summSettings = SettingsLoader.SettingsLoader()
                summSettings.LoadDict(stt)
                summSettings.RequireTokens(['Id', 'Name', 'MaxVal', 'BlockSizeMax'])
                summSettings.AddTokenIfMissing('MinVal', 0)
                summSettings.AddTokenIfMissing('BlockSizeMin', 1)
                summSettings.DefineKnownTokens(['channelColor'])
                summaryid = summSettings['Id']
                with calculationObject.LogHeader('Table based summary value {0}, {1}'.format(tableid, summaryid)):
                    extraSummSettings = summSettings.Clone()
                    extraSummSettings.DropTokens(['Id', 'Name', 'MinVal', 'MaxVal', 'BlockSizeMin', 'BlockSizeMax'])
                    sql = "INSERT INTO tablebasedsummaryvalues VALUES ('{0}', '{1}', '{2}', '{3}', {4}, {5}, {6}, 0)".format(
                        tableid,
                        summaryid,
                        summSettings['Name'],
                        extraSummSettings.ToJSON(),
                        summSettings['MinVal'],
                        summSettings['MaxVal'],
                        summSettings['BlockSizeMin'])
                    ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
                    if importSettings['ScopeStr'] == 'all':
                        itemtracknr = 0
                        for fileid in os.listdir(os.path.join(folder, summaryid)):
                            if not os.path.isdir(os.path.join(folder, summaryid, fileid)):
                                itemtracknr += 1
                                calculationObject.Log('Processing {0}: {1}'.format(itemtracknr, fileid))
                                destFolder = os.path.join(config.BASEDIR, 'SummaryTracks', datasetId,
                                                          'TableTracks', tableid, summaryid, fileid)
                                calculationObject.Log('Destination: ' + destFolder)
                                if not os.path.exists(destFolder):
                                    os.makedirs(destFolder)
                                shutil.copyfile(os.path.join(folder, summaryid, fileid),
                                                os.path.join(destFolder, summaryid + '_' + fileid))
                                ImpUtils.ExecuteFilterbankSummary_Value(calculationObject, destFolder,
                                                                        summaryid + '_' + fileid, summSettings)
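# A hedged example of one TableBasedSummaryValues entry, derived from the token
# checks above and shown in YAML-style notation for readability (the on-disk
# settings format is whatever SettingsLoader parses; all values illustrative):
#
#   TableBasedSummaryValues:
#   - Id: coverage              # required
#     Name: Sequence coverage   # required
#     MaxVal: 100               # required
#     BlockSizeMax: 1000        # required
#     MinVal: 0                 # optional, defaults to 0
#     BlockSizeMin: 1           # optional, defaults to 1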
def ImportWorkspace(calculationObject, datasetId, workspaceid, folder, importSettings):
    Utils.CheckSafeIdentifier(workspaceid)
    with calculationObject.LogHeader('Importing workspace {0}.{1}'.format(datasetId, workspaceid)):
        print('Source directory: ' + folder)
        settings = SettingsLoader.SettingsLoader(os.path.join(folder, 'settings'))
        settings.RequireTokens(['Name'])
        print(settings.ToJSON())
        workspaceName = settings['Name']
        if not ImpUtils.IsDatasetPresentInServer(calculationObject.credentialInfo, datasetId):
            raise Exception('Dataset {0} is not found. Please import the dataset first'.format(datasetId))
        db = DQXDbTools.OpenDatabase(calculationObject.credentialInfo, datasetId)
        cur = db.cursor()

        def execSQL(cmd):
            calculationObject.LogSQLCommand(cmd)
            cur.execute(cmd)

        cur.execute('SELECT id, primkey, settings FROM tablecatalog')
        tables = [{'id': row[0], 'primkey': row[1], 'settingsStr': row[2]} for row in cur.fetchall()]
        tableMap = {table['id']: table for table in tables}
        for table in tables:
            tableSettings = SettingsLoader.SettingsLoader()
            tableSettings.LoadDict(simplejson.loads(table['settingsStr'], strict=False))
            table['settings'] = tableSettings
        if not importSettings['ConfigOnly']:
            for table in tables:
                tableid = table['id']
                print('Re-creating custom data table for ' + tableid)
                execSQL("DROP TABLE IF EXISTS {0}".format(Utils.GetTableWorkspaceProperties(workspaceid, tableid)))
                execSQL("CREATE TABLE {0} (StoredSelection TINYINT DEFAULT 0) AS SELECT {1} FROM {2}".format(
                    Utils.GetTableWorkspaceProperties(workspaceid, tableid), table['primkey'], tableid))
                execSQL("create unique index {1} on {0}({1})".format(
                    Utils.GetTableWorkspaceProperties(workspaceid, tableid), table['primkey']))
                execSQL("create index idx_StoredSelection on {0}(StoredSelection)".format(
                    Utils.GetTableWorkspaceProperties(workspaceid, tableid)))
        print('Removing existing workspace properties')
        execSQL("DELETE FROM propertycatalog WHERE workspaceid='{0}'".format(workspaceid))
        calculationObject.Log('Creating StoredSelection columns')
        for table in tables:
            tableid = table['id']
            sett = '{"CanUpdate": true, "Index": false, "ReadData": false, "showInTable": false, "Search": "None"}'
            cmd = "INSERT INTO propertycatalog VALUES ('{0}', 'custom', 'Boolean', 'StoredSelection', '{1}', 'Stored selection', 0, '{2}')".format(
                workspaceid, tableid, sett)
            execSQL(cmd)
        print('Re-creating workspaces record')
        execSQL("DELETE FROM workspaces WHERE id='{0}'".format(workspaceid))
        execSQL('INSERT INTO workspaces VALUES ("{0}", "{1}")'.format(workspaceid, workspaceName))
        if not importSettings['ConfigOnly']:
            print('Updating views')
            for table in tables:
                Utils.UpdateTableInfoView(workspaceid, table['id'], table['settings']['AllowSubSampling'], cur)
        db.commit()
        db.close()
        print('Scanning for custom data')
        if os.path.exists(os.path.join(folder, 'customdata')):
            for tableid in os.listdir(os.path.join(folder, 'customdata')):
                if os.path.isdir(os.path.join(folder, 'customdata', tableid)):
                    if tableid not in tableMap:
                        raise Exception('Invalid table id ' + tableid)
                    # Read the optional settings file
                    customdatalist = None
                    settingsfilename = os.path.join(folder, 'customdata', tableid, 'settings')
                    if os.path.isfile(settingsfilename):
                        cSettings = SettingsLoader.SettingsLoader(settingsfilename)
                        if cSettings.HasToken('CustomData'):
                            if not type(cSettings['CustomData']) is list:
                                raise Exception('CustomData token should be a list')
                            customdatalist = cSettings['CustomData']
                            print('Custom data list taken from settings')
                    if customdatalist is None:
                        # Alternatively, just use the list of folders
                        customdatalist = []
                        for customid in os.listdir(os.path.join(folder, 'customdata', tableid)):
                            if os.path.isdir(os.path.join(folder, 'customdata', tableid, customid)):
                                customdatalist.append(customid)
                        print('Custom data list taken from folders')
                    print('Custom data list: ' + str(customdatalist))
                    for customid in customdatalist:
                        ImportCustomData(calculationObject, datasetId, workspaceid, tableid, customid,
                                         os.path.join(folder, 'customdata', tableid, customid), importSettings)
        else:
            print('Directory not present')
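# Folder layout scanned by the custom-data section above (names other than
# 'customdata', 'settings' and 'data' are illustrative):
#
#   <folder>/customdata/
#       <tableid>/
#           settings        # optional; may define a CustomData list to restrict/order the imports
#           <customid>/
#               settings    # per-source property definitions
#               data        # column data keyed on the table's primary key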
def ImportCustomData(calculationObject, datasetId, workspaceid, tableid, sourceid, folder, importSettings):
    with calculationObject.LogHeader('Importing custom data'):
        print('Importing custom data into {0}, {1}, {2} from {3}'.format(datasetId, workspaceid, tableid, folder))
        credInfo = calculationObject.credentialInfo
        if not ImpUtils.IsDatasetPresentInServer(credInfo, datasetId):
            raise Exception('Dataset {0} is not found. Please import the dataset first'.format(datasetId))
        db = DQXDbTools.OpenDatabase(credInfo, datasetId)
        credInfo.VerifyCanDo(DQXDbTools.DbOperationWrite(datasetId, 'propertycatalog'))
        credInfo.VerifyCanDo(DQXDbTools.DbOperationWrite(datasetId, 'workspaces'))
        cur = db.cursor()
        cur.execute('SELECT primkey, settings FROM tablecatalog WHERE id="{0}"'.format(tableid))
        row = cur.fetchone()
        if row is None:
            raise Exception('Unable to find table record for table {0} in dataset {1}'.format(tableid, datasetId))
        primkey = row[0]
        tableSettingsStr = row[1]
        db.close()
        ImpUtils.ExecuteSQL(calculationObject, datasetId,
                            'DELETE FROM customdatacatalog WHERE tableid="{tableid}" AND sourceid="{sourceid}"'.format(
                                tableid=tableid, sourceid=sourceid))
        tableSettings = SettingsLoader.SettingsLoader()
        tableSettings.LoadDict(simplejson.loads(tableSettingsStr, strict=False))
        allowSubSampling = tableSettings['AllowSubSampling']
        isPositionOnGenome = False
        if tableSettings.HasToken('IsPositionOnGenome') and tableSettings['IsPositionOnGenome']:
            isPositionOnGenome = True
            chromField = tableSettings['Chromosome']
            posField = tableSettings['Position']
        settings = SettingsLoader.SettingsLoader(os.path.join(folder, 'settings'))
        ImpUtils.ExecuteSQL(calculationObject, datasetId,
                            "INSERT INTO customdatacatalog VALUES ('{tableid}', '{sourceid}', '{settings}')".format(
                                tableid=tableid, sourceid=sourceid, settings=settings.ToJSON()))
        properties = ImpUtils.LoadPropertyInfo(calculationObject, settings, os.path.join(folder, 'data'))
        # Remove the primary key, just in case
        properties = [prop for prop in properties if prop['propid'] != primkey]
        sourcetable = Utils.GetTableWorkspaceProperties(workspaceid, tableid)
        print('Source table: ' + sourcetable)
        # Get the list of existing properties
        db = DQXDbTools.OpenDatabase(credInfo, datasetId)
        cur = db.cursor()
        cur.execute('SELECT propid FROM propertycatalog WHERE (workspaceid="{0}") and (source="custom") and (tableid="{1}")'.format(
            workspaceid, tableid))
        existingProperties = [row[0] for row in cur.fetchall()]
        print('Existing properties: ' + str(existingProperties))
        db.close()
        propidList = []
        for property in properties:
            DQXUtils.CheckValidIdentifier(property['propid'])
            propidList.append(property['propid'])
        db = DQXDbTools.OpenDatabase(credInfo, datasetId)
        cur = db.cursor()
        if not importSettings['ConfigOnly']:
            # Drop columns that will be replaced
            toRemoveExistingProperties = [existProperty for existProperty in existingProperties
                                          if existProperty in propidList]
            if len(toRemoveExistingProperties) > 0:
                for prop in toRemoveExistingProperties:
                    print('Removing outdated information: {0} {1} {2}'.format(workspaceid, prop, tableid))
                    sql = 'DELETE FROM propertycatalog WHERE (workspaceid="{0}") and (propid="{1}") and (tableid="{2}")'.format(
                        workspaceid, prop, tableid)
                    print(sql)
                    cur.execute(sql)
                sql = "ALTER TABLE {0} ".format(sourcetable)
                for prop in toRemoveExistingProperties:
                    if prop != toRemoveExistingProperties[0]:
                        sql += ", "
                    sql += "DROP COLUMN {0}".format(prop)
                calculationObject.LogSQLCommand(sql)
                cur.execute(sql)
        ranknr = 0
        for property in properties:
            propid = property['propid']
            settings = property['Settings']
            extraSettings = settings.Clone()
            extraSettings.DropTokens(['Name', 'DataType', 'Order', 'SummaryValues'])
            print('Create property catalog entry for {0} {1} {2}'.format(workspaceid, tableid, propid))
            sql = "DELETE FROM propertycatalog WHERE (workspaceid='{0}') and (propid='{1}') and (tableid='{2}')".format(
                workspaceid, propid, tableid)
            ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
            sql = "INSERT INTO propertycatalog VALUES ('{0}', 'custom', '{1}', '{2}', '{3}', '{4}', {5}, '{6}')".format(
                workspaceid,
                settings['DataType'],
                propid,
                tableid,
                settings['Name'],
                0,
                extraSettings.ToJSON())
            ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
            ranknr += 1
        propDict = {}
        for property in properties:
            propDict[property['propid']] = property
        if not importSettings['ConfigOnly']:
            tmptable = Utils.GetTempID()
            columns = [{'name': prop['propid'],
                        'DataType': prop['DataType'],
                        'Index': prop['Settings']['Index'],
                        'ReadData': prop['Settings']['ReadData']}
                       for prop in properties]
            columns.append({'name': primkey, 'DataType': 'Text', 'Index': False, 'ReadData': True})
            LoadTable.LoadTable(calculationObject, os.path.join(folder, 'data'), datasetId, tmptable, columns,
                                {'PrimKey': primkey}, importSettings, False)
            print('Creating new columns')
            calculationObject.Log('WARNING: better mechanism to determine column types needed here')  # TODO: implement
            frst = True
            sql = "ALTER TABLE {0} ".format(sourcetable)
            for property in properties:
                propid = property['propid']
                if not frst:
                    sql += " ,"
                sqldatatype = ImpUtils.GetSQLDataType(property['DataType'])
                sql += "ADD COLUMN {0} {1}".format(propid, sqldatatype)
                frst = False
            calculationObject.LogSQLCommand(sql)
            cur.execute(sql)
            print('Joining information')
            frst = True
            credInfo.VerifyCanDo(DQXDbTools.DbOperationWrite(datasetId, sourcetable))
            sql = "update {0} left join {1} on {0}.{2}={1}.{2} set ".format(sourcetable, tmptable, primkey)
            for property in properties:
                propid = property['propid']
                if not frst:
                    sql += " ,"
                sql += "{0}.{2}={1}.{2}".format(sourcetable, tmptable, propid)
                frst = False
            calculationObject.LogSQLCommand(sql)
            cur.execute(sql)
            print('Cleaning up')
            cur.execute("DROP TABLE {0}".format(tmptable))
            print('Updating view')
            Utils.UpdateTableInfoView(workspaceid, tableid, allowSubSampling, cur)
        db.commit()
        db.close()
        print('Creating summary values')
        for property in properties:
            propid = property['propid']
            settings = property['Settings']
            if settings.HasToken('SummaryValues'):
                with calculationObject.LogHeader('Creating summary values for custom data {0}'.format(tableid)):
                    summSettings = settings.GetSubSettings('SummaryValues')
                    if settings.HasToken('minval'):
                        summSettings.AddTokenIfMissing('MinVal', settings['minval'])
                    if settings.HasToken('maxval'):
                        summSettings.AddTokenIfMissing('MaxVal', settings['maxval'])
                    destFolder = os.path.join(config.BASEDIR, 'SummaryTracks', datasetId, propid)
                    if not os.path.exists(destFolder):
                        os.makedirs(destFolder)
                    dataFileName = os.path.join(destFolder, propid)
                    if not isPositionOnGenome:
                        raise Exception('Summary values defined for non-position table')
                    if not importSettings['ConfigOnly']:
                        calculationObject.Log('Extracting data to ' + dataFileName)
                        script = ImpUtils.SQLScript(calculationObject)
                        script.AddCommand("SELECT {2} as chrom, {3} as pos, {0} FROM {1} ORDER BY {2},{3}".format(
                            propid,
                            Utils.GetTableWorkspaceView(workspaceid, tableid),
                            chromField,
                            posField))
                        script.Execute(datasetId, dataFileName)
                        calculationObject.LogFileTop(dataFileName, 10)
                    ImpUtils.CreateSummaryValues_Value(
                        calculationObject,
                        summSettings,
                        datasetId,
                        tableid,
                        'custom',
                        workspaceid,
                        propid,
                        settings['Name'],
                        dataFileName,
                        importSettings)
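# The join step above merges the temporary upload table into the workspace
# table with a single multi-column UPDATE. A sketch of the generated SQL with
# hypothetical table and column names:
#
#   UPDATE ws_workspace1_variants LEFT JOIN tmp_ab12
#       ON ws_workspace1_variants.SnpName = tmp_ab12.SnpName
#   SET ws_workspace1_variants.Quality = tmp_ab12.Quality,
#       ws_workspace1_variants.Filter  = tmp_ab12.Filter;
#
# Because of the LEFT JOIN, workspace rows with no match in the upload are
# left NULL in the newly added columns.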
def __init__(self):
    self.name = self.__class__.__name__
    self.logger = logger
    self._settings_loader = SettingsLoader()
    self.logger.debug('Module {0} initialized'.format(self.name))
def ImportDataSet(calculationObject, baseFolder, datasetId, importSettings):
    Utils.CheckSafeIdentifier(datasetId)
    with calculationObject.LogHeader('Importing dataset {0}'.format(datasetId)):
        calculationObject.Log('Import settings: ' + str(importSettings))
        DQXUtils.CheckValidIdentifier(datasetId)
        datasetFolder = os.path.join(baseFolder, datasetId)
        indexDb = config.DB
        calculationObject.credentialInfo.VerifyCanDo(DQXDbTools.DbOperationWrite(indexDb, 'datasetindex'))
        calculationObject.credentialInfo.VerifyCanDo(DQXDbTools.DbOperationWrite(datasetId))
        # Remove the current reference in the index first: if the import fails, nothing will show up
        ImpUtils.ExecuteSQL(calculationObject, indexDb, 'DELETE FROM datasetindex WHERE id="{0}"'.format(datasetId))
        globalSettings = SettingsLoader.SettingsLoader(os.path.join(datasetFolder, 'settings'))
        globalSettings.RequireTokens(['Name'])
        globalSettings.AddTokenIfMissing('Description', '')
        print('Global settings: ' + str(globalSettings.Get()))
        if not importSettings['ConfigOnly']:
            # Drop the existing database and create a fresh one
            calculationObject.SetInfo('Dropping database')
            print('Dropping database')
            try:
                ImpUtils.ExecuteSQL(calculationObject, indexDb, 'DROP DATABASE IF EXISTS {0}'.format(datasetId))
            except Exception:
                pass
            ImpUtils.ExecuteSQL(calculationObject, indexDb, 'CREATE DATABASE {0}'.format(datasetId))
            scriptPath = os.path.dirname(os.path.realpath(__file__))
            calculationObject.SetInfo('Creating database')
            print('Creating new database')
            ImpUtils.ExecuteSQLScript(calculationObject, scriptPath + '/createdataset.sql', datasetId)
        ImpUtils.ExecuteSQL(calculationObject, datasetId, 'DELETE FROM propertycatalog')
        ImpUtils.ExecuteSQL(calculationObject, datasetId, 'DELETE FROM summaryvalues')
        ImpUtils.ExecuteSQL(calculationObject, datasetId, 'DELETE FROM tablecatalog')
        ImpUtils.ExecuteSQL(calculationObject, datasetId, 'DELETE FROM settings')
        ImpUtils.ExecuteSQL(calculationObject, datasetId, 'DELETE FROM customdatacatalog')
        datatables = []
        if globalSettings.HasToken('DataTables'):
            if not type(globalSettings['DataTables']) is list:
                raise Exception('DataTables token should be a list')
            datatables = globalSettings['DataTables']
        for dir in os.listdir(os.path.join(datasetFolder, 'datatables')):
            if os.path.isdir(os.path.join(datasetFolder, 'datatables', dir)):
                if dir not in datatables:
                    datatables.append(dir)
        print('Data tables: ' + str(datatables))
        for datatable in datatables:
            ImportDataTable.ImportDataTable(calculationObject, datasetId, datatable,
                                            os.path.join(datasetFolder, 'datatables', datatable), importSettings)
        try:
            datatables_2D = globalSettings['2D_DataTables']
        except KeyError:
            datatables_2D = []
        if type(datatables_2D) is not list:
            raise TypeError('2D_DataTables token should be a list')
        for datatable in datatables_2D:
            Import2DDataTable.ImportDataTable(calculationObject, datasetId, datatable,
                                              os.path.join(datasetFolder, '2D_datatables', datatable), importSettings)
        if os.path.exists(os.path.join(datasetFolder, 'refgenome')):
            ImportRefGenome.ImportRefGenome(calculationObject, datasetId,
                                            os.path.join(datasetFolder, 'refgenome'), importSettings)
            globalSettings.AddTokenIfMissing('hasGenomeBrowser', True)
        ImportWorkspaces.ImportWorkspaces(calculationObject, datasetFolder, datasetId, importSettings)
        # Global settings
        print('Defining global settings')
        ImpUtils.ImportGlobalSettings(calculationObject, datasetId, globalSettings)
        # Finalise: register the dataset
        print('Registering dataset')
        importtime = 0
        if not importSettings['ConfigOnly']:
            importtime = time.time()
        ImpUtils.ExecuteSQL(calculationObject, indexDb,
                            'INSERT INTO datasetindex VALUES ("{0}", "{1}", "{2}")'.format(
                                datasetId, globalSettings['Name'], str(math.ceil(importtime))))
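# Hedged usage sketch. The importSettings keys read directly in the functions
# above are 'ConfigOnly' and 'ScopeStr'; the paths and ids here are
# hypothetical:
#
#   ImportDataSet(calculationObject, '/data/sourcedata', 'mydataset',
#                 {'ConfigOnly': False, 'ScopeStr': 'all'})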
def LoadPropertyInfo(calculationObject, impSettings, datafile):
    calculationObject.Log('Determining properties')
    properties = []
    propidMap = {}
    autoPrimKey = impSettings.HasToken('PrimKey') and (impSettings['PrimKey'] == 'AutoKey')
    if autoPrimKey:
        propid = impSettings['PrimKey']
        property = {'propid': propid}
        propidMap[propid] = property
        properties.append(property)
        settings = SettingsLoader.SettingsLoader()
        settings.LoadDict({
            'Name': impSettings['PrimKey'],
            'ShowInTable': True,
            'DataType': 'Value',
            'DecimDigits': 0
        })
        property['Settings'] = settings
    if impSettings.HasToken('Properties'):
        if not type(impSettings['Properties']) is list:
            raise Exception('Properties token should be a list')
        for propSource in impSettings['Properties']:
            if 'Id' not in propSource:
                raise Exception('Property is missing Id field')
            propids = propSource['Id']
            # An Id token may hold a comma-separated list of property ids that share one definition
            for propid in propids.split(','):
                propid = propid.strip()
                if propid in propidMap:
                    property = propidMap[propid]
                    settings = property['Settings']
                else:
                    property = {'propid': propid}
                    settings = SettingsLoader.SettingsLoader()
                    settings.LoadDict({})
                    property['Settings'] = settings
                    propidMap[propid] = property
                    properties.append(property)
                DQXUtils.CheckValidIdentifier(propid)
                settings.AddDict(propSource)
    if impSettings.HasToken('AutoScanProperties') and impSettings['AutoScanProperties']:
        calculationObject.Log('Auto determining columns')
        tb = VTTable.VTTable()
        tb.allColumnsText = True
        try:
            tb.LoadFile(datafile, 9999)
        except Exception as e:
            raise Exception('Error while reading data file: ' + str(e))
        with calculationObject.LogDataDump():
            tb.PrintRows(0, 9)
        for propid in tb.GetColList():
            if propid not in propidMap:
                property = {'propid': propid}
                colnr = tb.GetColNr(propid)
                cnt_tot = 0
                cnt_isnumber = 0
                cnt_isbool = 0
                for rownr in tb.GetRowNrRange():
                    val = tb.GetValue(rownr, colnr)
                    if val is not None:
                        cnt_tot += 1
                        try:
                            float(val)
                            cnt_isnumber += 1
                        except ValueError:
                            pass
                        if val in ['True', 'true', 'TRUE', 'False', 'false', 'FALSE', '1', '0']:
                            cnt_isbool += 1
                property['DataType'] = 'Text'
                if (cnt_isnumber > 0.75 * cnt_tot) and (cnt_isnumber > cnt_isbool):
                    property['DataType'] = 'Value'
                if (cnt_isbool == cnt_tot) and (cnt_isbool >= cnt_isnumber):
                    property['DataType'] = 'Boolean'
                DQXUtils.CheckValidIdentifier(propid)
                settings = SettingsLoader.SettingsLoader()
                settings.LoadDict({})
                settings.AddTokenIfMissing('Name', propid)
                settings.AddTokenIfMissing('DataType', property['DataType'])
                property['Settings'] = settings
                properties.append(property)
                propidMap[propid] = property
    for property in properties:
        settings = property['Settings']
        settings.AddTokenIfMissing('Index', False)
        settings.AddTokenIfMissing('Search', 'None')
        settings.DefineKnownTokens(['isCategorical', 'minval', 'maxval', 'decimDigits', 'showInBrowser',
                                    'showInTable', 'categoryColors', 'channelName', 'channelColor',
                                    'connectLines', 'SummaryValues'])
        settings.RequireTokens(['DataType'])
        settings.ConvertToken_Boolean('isCategorical')
        if settings.HasToken('isCategorical') and settings['isCategorical']:
            settings.SetToken('Index', True)  # Categorical data types are always indexed
        if settings.HasToken('Relation'):
            settings.SetToken('Index', True)  # Relation child properties are always indexed
        if settings['Search'] not in ['None', 'StartPattern', 'Pattern', 'Match']:
            raise Exception('Property "Search" token should be None, StartPattern, Pattern or Match')
        if settings['Search'] in ['StartPattern', 'Pattern', 'Match']:
            settings.SetToken('Index', True)  # Use an index to speed up searches
        settings.AddTokenIfMissing('Name', property['propid'])
        settings.AddTokenIfMissing('ReadData', True)
        settings.ConvertToken_Boolean('ReadData')
        settings.AddTokenIfMissing('CanUpdate', False)
        settings.ConvertToken_Boolean('CanUpdate')
        settings.ConvertStringsToSafeSQL()
        property['DataType'] = settings['DataType']
    if len(properties) == 0:
        raise Exception('No properties defined. Use "AutoScanProperties: true" or a "Properties" list to define them')
    calculationObject.Log('Properties found:')
    with calculationObject.LogDataDump():
        for property in properties:
            calculationObject.Log(str(property) + ' | ' + property['Settings'].ToJSON())
    for property in properties:
        Utils.CheckSafeIdentifier(property['propid'])
    return properties
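# A hedged example of a Properties list as consumed above, in YAML-style
# notation for readability (the on-disk format is whatever SettingsLoader
# parses; all ids and values are illustrative):
#
#   Properties:
#   - Id: chrom,pos          # comma-separated ids share this definition
#     Index: true
#   - Id: quality
#     Name: Mapping quality
#     DataType: Value        # Text, Value or Boolean
#     Search: None           # one of None, StartPattern, Pattern, Match
#     isCategorical: false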
def ImportRefGenome(calculationObject, datasetId, folder, importSettings):
    with calculationObject.LogHeader('Importing reference genome data'):
        ImportRefGenomeSummaryData(calculationObject, datasetId, folder, importSettings)
        settings = SettingsLoader.SettingsLoader(os.path.join(folder, 'settings'))
        settings.DefineKnownTokens(['AnnotMaxViewportSize', 'RefSequenceSumm'])
        print('Settings: ' + str(settings.Get()))
        ImpUtils.ImportGlobalSettings(calculationObject, datasetId, settings)
        # Import the reference genome sequence
        if importSettings['ScopeStr'] == 'all':
            refsequencefile = os.path.join(folder, 'refsequence.fa')
            if os.path.exists(refsequencefile):
                with calculationObject.LogHeader('Converting reference genome'):
                    destfolder = os.path.join(config.BASEDIR, 'SummaryTracks', datasetId, 'Sequence')
                    if not os.path.exists(destfolder):
                        os.makedirs(destfolder)
                    tempfastafile = os.path.join(destfolder, 'refsequence.fa')
                    shutil.copyfile(refsequencefile, tempfastafile)
                    ImpUtils.RunConvertor(calculationObject, 'Fasta2FilterBankData', destfolder, ['refsequence.fa'])
            else:
                calculationObject.Log('WARNING: missing reference sequence file')
        else:
            calculationObject.Log('WARNING: Skipping converting reference genome')
        # Import chromosomes
        with calculationObject.LogHeader('Loading chromosomes'):
            tb = VTTable.VTTable()
            tb.allColumnsText = True
            try:
                tb.LoadFile(os.path.join(folder, 'chromosomes'))
            except Exception as e:
                raise Exception('Error while reading chromosomes file: ' + str(e))
            tb.RequireColumnSet(['chrom', 'length'])
            tb.RenameCol('chrom', 'id')
            tb.RenameCol('length', 'len')
            tb.ConvertColToValue('len')
            with calculationObject.LogDataDump():
                tb.PrintRows(0, 99)
            sqlfile = ImpUtils.GetTempFileName()
            tb.SaveSQLDump(sqlfile, 'chromosomes')
            ImpUtils.ExecuteSQL(calculationObject, datasetId, 'DELETE FROM chromosomes')
            ImpUtils.ExecuteSQLScript(calculationObject, sqlfile, datasetId)
            os.remove(sqlfile)
        if importSettings['ScopeStr'] == 'all':
            # Import annotation
            with calculationObject.LogHeader('Converting annotation'):
                tempgfffile = ImpUtils.GetTempFileName()
                temppath = os.path.dirname(tempgfffile)
                shutil.copyfile(os.path.join(folder, 'annotation.gff'), tempgfffile)
                ImpUtils.RunConvertor(calculationObject, 'ParseGFF', temppath, [os.path.basename(tempgfffile)])
                print('Importing annotation')
                ImpUtils.ExecuteSQLScript(calculationObject, os.path.join(temppath, 'annotation_dump.sql'), datasetId)
                os.remove(tempgfffile)
                os.remove(os.path.join(temppath, 'annotation.txt'))
                os.remove(os.path.join(temppath, 'annotation_dump.sql'))
                os.remove(os.path.join(temppath, 'annotation_create.sql'))
        else:
            calculationObject.Log('WARNING: Skipping converting annotation')
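# Source layout consumed by ImportRefGenome, per the reads above
# ('refsequence.fa' and 'annotation.gff' are only used when ScopeStr is 'all';
# the summaryvalues subfolder is handled by ImportRefGenomeSummaryData):
#
#   <folder>/settings          # reference genome settings (e.g. AnnotMaxViewportSize)
#   <folder>/refsequence.fa    # reference sequence, converted via Fasta2FilterBankData
#   <folder>/chromosomes       # table with 'chrom' and 'length' columns
#   <folder>/annotation.gff    # gene annotation, parsed via ParseGFF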