def ImportDataTable(calculationObject, datasetId, tableid, folder, importSettings):
    """Import one datatable from ``folder`` into dataset ``datasetId``.

    Registers the table in ``tablecatalog``, its columns in ``propertycatalog``
    and ``relations``, bulk-loads the data (unless ``importSettings['ConfigOnly']``),
    and generates numerical/categorical/table-based summary-value tracks.
    Uses the module-global ``tableOrder`` as a running display-order counter.

    NOTE(review): SQL statements are assembled by string formatting. ``tableid``
    is vetted by CheckSafeIdentifier/CheckValidIdentifier, but settings-derived
    values are interpolated as-is — assumes trusted settings files; confirm.
    """
    Utils.CheckSafeIdentifier(tableid)
    global tableOrder
    with calculationObject.LogHeader('Importing datatable {0}'.format(tableid)):
        print('Source: ' + folder)
        DQXUtils.CheckValidIdentifier(tableid)
        # Fail early if the caller lacks write access to any catalog touched below.
        calculationObject.credentialInfo.VerifyCanDo(DQXDbTools.DbOperationWrite(datasetId, 'tablecatalog'))
        calculationObject.credentialInfo.VerifyCanDo(DQXDbTools.DbOperationWrite(datasetId, 'propertycatalog'))
        calculationObject.credentialInfo.VerifyCanDo(DQXDbTools.DbOperationWrite(datasetId, 'relations'))
        calculationObject.credentialInfo.VerifyCanDo(DQXDbTools.DbOperationWrite(datasetId, 'tablebasedsummaryvalues'))
        # Load the table's settings file and fill in defaults.
        tableSettings = SettingsLoader.SettingsLoader(os.path.join(os.path.join(folder, 'settings')))
        tableSettings.RequireTokens(['NameSingle', 'NamePlural', 'PrimKey'])
        tableSettings.AddTokenIfMissing('IsPositionOnGenome', False)
        tableSettings.AddTokenIfMissing('IsRegionOnGenome', False)
        tableSettings.AddTokenIfMissing('MaxTableSize', None)
        tableSettings.AddTokenIfMissing('MaxCountQueryRecords', 200000)
        tableSettings.AddTokenIfMissing('MaxCountQueryAggregated', 1000000)
        tableSettings.AddTokenIfMissing('AllowSubSampling', False)
        # Everything except PrimKey/Properties is stored as a JSON blob in tablecatalog.
        extraSettings = tableSettings.Clone()
        extraSettings.DropTokens(['PrimKey', 'Properties'])
        if tableSettings['MaxTableSize'] is not None:
            print('WARNING: table size limited to '+str(tableSettings['MaxTableSize']))
        # Drop existing tablecatalog record
        sql = "DELETE FROM tablecatalog WHERE id='{0}'".format(tableid)
        ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
        # Add to tablecatalog
        extraSettings.ConvertStringsToSafeSQL()
        sql = "INSERT INTO tablecatalog VALUES ('{0}', '{1}', '{2}', {3}, '{4}', {5})".format(
            tableid,
            tableSettings['NamePlural'],
            tableSettings['PrimKey'],
            tableSettings['IsPositionOnGenome'],
            extraSettings.ToJSON(),
            tableOrder
        )
        ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
        tableOrder += 1
        properties = ImpUtils.LoadPropertyInfo(calculationObject, tableSettings, os.path.join(folder, 'data'))
        # Replace any previous property/relation records for this table.
        sql = "DELETE FROM propertycatalog WHERE tableid='{0}'".format(tableid)
        ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
        sql = "DELETE FROM relations WHERE childtableid='{0}'".format(tableid)
        ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
        ranknr = 0  # incremented per property but not otherwise used here
        for property in properties:
            propid = property['propid']
            settings = property['Settings']
            extraSettings = settings.Clone()
            extraSettings.DropTokens(['Name', 'DataType', 'Order'])
            sql = "INSERT INTO propertycatalog VALUES ('', 'fixed', '{0}', '{1}', '{2}', '{3}', {4}, '{5}')".format(
                settings['DataType'],
                propid,
                tableid,
                settings['Name'],
                0,
                extraSettings.ToJSON()
            )
            ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
            # A property may declare a foreign-key style relation to another table.
            if settings.HasToken('Relation'):
                relationSettings = settings.GetSubSettings('Relation')
                calculationObject.Log('Creating relation: '+relationSettings.ToJSON())
                relationSettings.RequireTokens(['TableId'])
                relationSettings.AddTokenIfMissing('ForwardName', 'belongs to')
                relationSettings.AddTokenIfMissing('ReverseName', 'has')
                sql = "INSERT INTO relations VALUES ('{0}', '{1}', '{2}', '{3}', '{4}', '{5}')".format(
                    tableid,
                    propid,
                    relationSettings['TableId'],
                    '',
                    relationSettings['ForwardName'],
                    relationSettings['ReverseName']
                )
                ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
            ranknr += 1
        # Index the properties by id for the validation checks below.
        propidList = []
        propDict = {}
        for property in properties:
            propDict[property['propid']] = property
            propidList.append(property['propid'])
        # Positional tables must declare Chromosome/Position and those must exist as properties.
        if tableSettings['IsPositionOnGenome']:
            if not tableSettings.HasToken('Chromosome'):
                raise Exception('Missing settings tag Chromosome in genome position datatable.')
            if not tableSettings.HasToken('Position'):
                raise Exception('Missing settings tag Position in genome position datatable.')
            if tableSettings['Chromosome'] not in propDict:
                raise Exception('Genome position datatable {0} is missing property "{1}"'.format(tableid, tableSettings['Chromosome']))
            if tableSettings['Position'] not in propDict:
                raise Exception('Genome position datatable {0} is missing property "{1}"'.format(tableid, tableSettings['Position']))
        # Region tables must declare Chromosome/RegionStart/RegionStop; those columns get indexed.
        if tableSettings['IsRegionOnGenome']:
            if not tableSettings.HasToken('Chromosome'):
                raise Exception('Missing setting "Chromosome" in genome region datatable {0}'.format(tableid))
            if not tableSettings.HasToken('RegionStart'):
                raise Exception('Missing setting "RegionStart" in genome region datatable {0}'.format(tableid))
            if not tableSettings.HasToken('RegionStop'):
                raise Exception('Missing setting "RegionStop" in genome region datatable {0}'.format(tableid))
            if tableSettings['Chromosome'] not in propDict:
                raise Exception('Genome region datatable {0} is missing property "{1}"'.format(tableid, tableSettings['Chromosome']))
            if tableSettings['RegionStart'] not in propDict:
                raise Exception('Genome region datatable {0} is missing property "{1}"'.format(tableid, tableSettings['RegionStart']))
            if tableSettings['RegionStop'] not in propDict:
                raise Exception('Genome region datatable {0} is missing property "{1}"'.format(tableid, tableSettings['RegionStop']))
            propDict[tableSettings['Chromosome']]['Settings'].SetToken('Index', True)
            propDict[tableSettings['RegionStart']]['Settings'].SetToken('Index', True)
            propDict[tableSettings['RegionStop']]['Settings'].SetToken('Index', True)
        # Bulk-load the actual data unless this is a configuration-only run.
        if not importSettings['ConfigOnly']:
            columns = [ {
                            'name': prop['propid'],
                            'DataType': prop['DataType'],
                            'Index': prop['Settings']['Index'],
                            'ReadData': prop['Settings']['ReadData']
                        }
                        for prop in properties if (prop['propid'] != 'AutoKey')]
            LoadTable.LoadTable(
                calculationObject,
                os.path.join(folder, 'data'),
                datasetId,
                tableid,
                columns,
                tableSettings,
                importSettings,
                tableSettings['AllowSubSampling']
            )
            if tableSettings['IsPositionOnGenome']:
                calculationObject.Log('Indexing chromosome')
                # Compound (chromosome, position) index speeds up genome-range queries.
                scr = ImpUtils.SQLScript(calculationObject)
                scr.AddCommand('create index {0}_chrompos ON {0}({1},{2})'.format(
                    tableid,
                    tableSettings['Chromosome'],
                    tableSettings['Position']
                ))
                scr.Execute(datasetId)
        print('Creating summary values')
        sql = "DELETE FROM summaryvalues WHERE tableid='{0}'".format(tableid)
        ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
        for property in properties:
            propid = property['propid']
            settings = property['Settings']
            # Numerical summary track for value-typed properties with SummaryValues settings.
            if (settings.HasToken('SummaryValues')) and ImpUtils.IsValueDataTypeIdenfifier(property['DataType']):
                with calculationObject.LogHeader('Creating numerical summary values for {0}.{1}'.format(tableid,propid)):
                    summSettings = settings.GetSubSettings('SummaryValues')
                    if settings.HasToken('minval'):
                        summSettings.AddTokenIfMissing('MinVal', settings['minval'])
                    if settings.HasToken('maxval'):
                        summSettings.AddTokenIfMissing('MaxVal', settings['maxval'])
                    sourceFileName = os.path.join(folder, 'data')
                    destFolder = os.path.join(config.BASEDIR, 'SummaryTracks', datasetId, propid)
                    if not os.path.exists(destFolder):
                        os.makedirs(destFolder)
                    dataFileName = os.path.join(destFolder, propid)
                    if not importSettings['ConfigOnly']:
                        # Extract (chrom, pos, value) columns into the track's data file.
                        ImpUtils.ExtractColumns(calculationObject, sourceFileName, dataFileName, [tableSettings['Chromosome'], tableSettings['Position'], propid], False, importSettings)
                    ImpUtils.CreateSummaryValues_Value(
                        calculationObject,
                        summSettings,
                        datasetId,
                        tableid,
                        'fixed',
                        '',
                        propid,
                        settings['Name'],
                        dataFileName,
                        importSettings
                    )
            # Categorical summary track for Text properties with SummaryValues settings.
            if (settings.HasToken('SummaryValues')) and (property['DataType'] == 'Text'):
                with calculationObject.LogHeader('Creating categorical summary values for {0}.{1}'.format(tableid,propid)):
                    summSettings = settings.GetSubSettings('SummaryValues')
                    sourceFileName = os.path.join(folder, 'data')
                    destFolder = os.path.join(config.BASEDIR, 'SummaryTracks', datasetId, propid)
                    if not os.path.exists(destFolder):
                        os.makedirs(destFolder)
                    dataFileName = os.path.join(destFolder, propid)
                    if not importSettings['ConfigOnly']:
                        ImpUtils.ExtractColumns(calculationObject, sourceFileName, dataFileName, [tableSettings['Chromosome'], tableSettings['Position'], propid], False, importSettings)
                    # Category list defaults to the keys of categoryColors, if declared.
                    categories = []
                    if settings.HasToken('categoryColors'):
                        stt = settings.GetSubSettings('categoryColors')
                        categories = [x for x in stt.Get()]
                    summSettings.AddTokenIfMissing('Categories', categories)
                    ImpUtils.CreateSummaryValues_Categorical(
                        calculationObject,
                        summSettings,
                        datasetId,
                        tableid,
                        'fixed',
                        '',
                        propid,
                        settings['Name'],
                        dataFileName,
                        importSettings
                    )
        sql = "DELETE FROM tablebasedsummaryvalues WHERE tableid='{0}'".format(tableid)
        ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
        # Table-based summary values: one filterbank track per data file in the track folder.
        if tableSettings.HasToken('TableBasedSummaryValues'):
            calculationObject.Log('Processing table-based summary values')
            if not type(tableSettings['TableBasedSummaryValues']) is list:
                raise Exception('TableBasedSummaryValues token should be a list')
            for stt in tableSettings['TableBasedSummaryValues']:
                summSettings = SettingsLoader.SettingsLoader()
                summSettings.LoadDict(stt)
                summSettings.RequireTokens(['Id', 'Name', 'MaxVal', 'BlockSizeMax'])
                summSettings.AddTokenIfMissing('MinVal', 0)
                summSettings.AddTokenIfMissing('BlockSizeMin', 1)
                summSettings.DefineKnownTokens(['channelColor'])
                summaryid = summSettings['Id']
                with calculationObject.LogHeader('Table based summary value {0}, {1}'.format(tableid, summaryid)):
                    extraSummSettings = summSettings.Clone()
                    extraSummSettings.DropTokens(['Id', 'Name', 'MinVal', 'MaxVal', 'BlockSizeMin', 'BlockSizeMax'])
                    sql = "INSERT INTO tablebasedsummaryvalues VALUES ('{0}', '{1}', '{2}', '{3}', {4}, {5}, {6}, 0)".format(
                        tableid,
                        summaryid,
                        summSettings['Name'],
                        extraSummSettings.ToJSON(),
                        summSettings['MinVal'],
                        summSettings['MaxVal'],
                        summSettings['BlockSizeMin']
                    )
                    ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
                    # Heavy filterbank generation is only done on full-scope imports.
                    if importSettings['ScopeStr'] == 'all':
                        itemtracknr = 0
                        for fileid in os.listdir(os.path.join(folder, summaryid)):
                            if not(os.path.isdir(os.path.join(folder, summaryid, fileid))):
                                itemtracknr += 1
                                calculationObject.Log('Processing {0}: {1}'.format(itemtracknr, fileid))
                                destFolder = os.path.join(config.BASEDIR, 'SummaryTracks', datasetId, 'TableTracks', tableid, summaryid, fileid)
                                calculationObject.Log('Destination: '+destFolder)
                                if not os.path.exists(destFolder):
                                    os.makedirs(destFolder)
                                shutil.copyfile(os.path.join(folder, summaryid, fileid), os.path.join(destFolder, summaryid+'_'+fileid))
                                ImpUtils.ExecuteFilterbankSummary_Value(calculationObject, destFolder, summaryid+'_'+fileid, summSettings)
def LoadTable0(calculationObject, sourceFileName, databaseid, tableid, columns, loadSettings):
    """Load a tab-delimited file into DB table ``tableid`` via an in-memory VTTable.

    Reads the whole file into a VTTable, validates/reorders/converts the columns
    described by ``columns``, then (re)creates the SQL table and bulk-loads it
    from a generated SQL dump file.

    NOTE(review): the trailing ``0`` in the name suggests this is a legacy
    variant of ``LoadTable`` (which streams instead of loading into memory) —
    confirm before relying on it for large files.
    """
    calculationObject.Log('Loading table {0} from {1}'.format(tableid, sourceFileName))
    colDict = {col['name']: col for col in columns}
    colNameList = [col['name'] for col in columns]
    print('Column info: '+str(columns))
    # Load everything as text first; typed conversion happens per column below.
    tb = VTTable.VTTable()
    tb.allColumnsText = True
    try:
        tb.LoadFile(sourceFileName, loadSettings['MaxTableSize'])
    except Exception as e:
        raise Exception('Error while reading file: '+str(e))
    calculationObject.Log('---- ORIG TABLE ----')
    with calculationObject.LogDataDump():
        tb.PrintRows(0, 9)
    # All declared columns (and the primary key) must be present in the file.
    for col in columns:
        if not tb.IsColumnPresent(col['name']):
            raise Exception('Missing column "{0}" in datatable "{1}"'.format(col['name'], tableid))
    if loadSettings['PrimKey'] not in colDict:
        raise Exception('Missing primary key column "{0}" in datatable "{1}"'.format(loadSettings['PrimKey'], tableid))
    # Drop undeclared file columns and put the rest in the declared order.
    for col in tb.GetColList():
        if col not in colDict:
            tb.DropCol(col)
    tb.ArrangeColumns(colNameList)
    # Convert text columns to their declared types (numeric values, booleans as 0/1).
    for col in columns:
        colname = col['name']
        if ImpUtils.IsValueDataTypeIdenfifier(col['DataType']):
            tb.ConvertColToValue(colname)
        if col['DataType'] == 'Boolean':
            tb.MapCol(colname, ImpUtils.convertToBooleanInt)
            tb.ConvertColToValue(colname)
    calculationObject.Log('---- PROCESSED TABLE ----')
    with calculationObject.LogDataDump():
        tb.PrintRows(0, 9)
    # Build the CREATE TABLE statement; Text columns get a varchar sized to the
    # longest observed value, other types use the project's SQL type mapping.
    createcmd = 'CREATE TABLE {0} ('.format(tableid)
    frst = True
    for col in columns:
        if not frst:
            createcmd += ', '
        colname = col['name']
        colnr = tb.GetColNr(colname)
        datatypestr = ''
        if col['DataType'] == 'Text':
            maxlength = 1
            for rownr in tb.GetRowNrRange():
                maxlength = max(maxlength, len(tb.GetValue(rownr, colnr)))
            datatypestr = 'varchar({0})'.format(maxlength)
        if len(datatypestr) == 0:
            datatypestr = ImpUtils.GetSQLDataType(col['DataType'])
        createcmd += colname + ' ' + datatypestr
        frst = False
    createcmd += ')'
    calculationObject.Log('Creating datatable')
    scr = ImpUtils.SQLScript(calculationObject)
    scr.AddCommand('drop table if exists {0}'.format(tableid))
    scr.AddCommand(createcmd)
    scr.AddCommand('create unique index {0}_{1} ON {0}({1})'.format(tableid, loadSettings['PrimKey']))
    scr.Execute(databaseid)
    calculationObject.Log('Loading datatable values')
    # Dump the in-memory table to a temporary SQL file and execute it.
    sqlfile = ImpUtils.GetTempFileName()
    tb.SaveSQLDump(sqlfile, tableid)
    ImpUtils.ExecuteSQLScript(calculationObject, sqlfile, databaseid)
    os.remove(sqlfile)
def ImportCustomData(calculationObject, datasetId, workspaceid, tableid, sourceid, folder, importSettings):
    """Import workspace-scoped custom properties for an existing datatable.

    Loads custom columns from ``folder`` into the workspace's property table
    for ``tableid``: registers them in ``customdatacatalog``/``propertycatalog``,
    stages the data in a temporary table, joins it onto the workspace table on
    the primary key, then rebuilds summary-value tracks where configured.
    Requires the dataset (and its tablecatalog record) to exist already.
    """
    with calculationObject.LogHeader('Importing custom data'):
        print('Importing custom data into {0}, {1}, {2} FROM {3}'.format(
            datasetId, workspaceid, tableid, folder))
        credInfo = calculationObject.credentialInfo
        if not ImpUtils.IsDatasetPresentInServer(
                calculationObject.credentialInfo, datasetId):
            raise Exception(
                'Dataset {0} is not found. Please import the dataset first'.
                format(datasetId))
        db = DQXDbTools.OpenDatabase(credInfo, datasetId)
        calculationObject.credentialInfo.VerifyCanDo(
            DQXDbTools.DbOperationWrite(datasetId, 'propertycatalog'))
        calculationObject.credentialInfo.VerifyCanDo(
            DQXDbTools.DbOperationWrite(datasetId, 'workspaces'))
        # Fetch the parent table's primary key and stored settings blob.
        cur = db.cursor()
        cur.execute(
            'SELECT primkey, settings FROM tablecatalog WHERE id="{0}"'.format(
                tableid))
        row = cur.fetchone()
        if row is None:
            raise Exception(
                'Unable to find table record for table {0} in dataset {1}'.
                format(tableid, datasetId))
        primkey = row[0]
        tableSettingsStr = row[1]
        db.close()
        # Replace any previous customdatacatalog record for this (table, source).
        ImpUtils.ExecuteSQL(
            calculationObject, datasetId,
            'DELETE FROM customdatacatalog WHERE tableid="{tableid}" and sourceid="{sourceid}"'
            .format(tableid=tableid, sourceid=sourceid))
        tableSettings = SettingsLoader.SettingsLoader()
        tableSettings.LoadDict(simplejson.loads(tableSettingsStr, strict=False))
        allowSubSampling = tableSettings['AllowSubSampling']
        # Positional tables expose chrom/pos fields used for summary extraction below.
        isPositionOnGenome = False
        if tableSettings.HasToken(
                'IsPositionOnGenome') and tableSettings['IsPositionOnGenome']:
            isPositionOnGenome = True
            chromField = tableSettings['Chromosome']
            posField = tableSettings['Position']
        settings = SettingsLoader.SettingsLoader(
            os.path.join(os.path.join(folder, 'settings')))
        ImpUtils.ExecuteSQL(
            calculationObject, datasetId,
            "INSERT INTO customdatacatalog VALUES ('{tableid}', '{sourceid}', '{settings}')"
            .format(tableid=tableid, sourceid=sourceid,
                    settings=settings.ToJSON()))
        properties = ImpUtils.LoadPropertyInfo(calculationObject, settings, os.path.join(folder, 'data'))
        # remove primary key, just in case
        properties = [prop for prop in properties if prop['propid'] != primkey]
        sourcetable = Utils.GetTableWorkspaceProperties(workspaceid, tableid)
        print('Source table: ' + sourcetable)
        #Get list of existing properties
        db = DQXDbTools.OpenDatabase(calculationObject.credentialInfo, datasetId)
        cur = db.cursor()
        cur.execute(
            'SELECT propid FROM propertycatalog WHERE (workspaceid="{0}") and (source="custom") and (tableid="{1}")'
            .format(workspaceid, tableid))
        existingProperties = [row[0] for row in cur.fetchall()]
        print('Existing properties: ' + str(existingProperties))
        db.close()
        propidList = []
        for property in properties:
            DQXUtils.CheckValidIdentifier(property['propid'])
            propidList.append(property['propid'])
        db = DQXDbTools.OpenDatabase(calculationObject.credentialInfo, datasetId)
        cur = db.cursor()
        if not importSettings['ConfigOnly']:
            # Dropping columns that will be replaced
            toRemoveExistingProperties = []
            for existProperty in existingProperties:
                if existProperty in propidList:
                    toRemoveExistingProperties.append(existProperty)
            print('Removing outdated information:')
            if len(toRemoveExistingProperties) > 0:
                for prop in toRemoveExistingProperties:
                    print('Removing outdated information: {0} {1} {2}'.format(
                        workspaceid, prop, tableid))
                    sql = 'DELETE FROM propertycatalog WHERE (workspaceid="{0}") and (propid="{1}") and (tableid="{2}")'.format(
                        workspaceid, prop, tableid)
                    print(sql)
                    cur.execute(sql)
                # Single ALTER TABLE dropping all superseded columns at once.
                sql = "ALTER TABLE {0} ".format(sourcetable)
                for prop in toRemoveExistingProperties:
                    if prop != toRemoveExistingProperties[0]:
                        sql += ", "
                    sql += "DROP COLUMN {0}".format(prop)
                calculationObject.LogSQLCommand(sql)
                cur.execute(sql)
        # Register each custom property in the property catalog.
        ranknr = 0  # incremented per property but not otherwise used here
        for property in properties:
            propid = property['propid']
            settings = property['Settings']
            extraSettings = settings.Clone()
            extraSettings.DropTokens(
                ['Name', 'DataType', 'Order', 'SummaryValues'])
            print('Create property catalog entry for {0} {1} {2}'.format(
                workspaceid, tableid, propid))
            sql = "DELETE FROM propertycatalog WHERE (workspaceid='{0}') and (propid='{1}') and (tableid='{2}')".format(
                workspaceid, propid, tableid)
            ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
            sql = "INSERT INTO propertycatalog VALUES ('{0}', 'custom', '{1}', '{2}', '{3}', '{4}', {5}, '{6}')".format(
                workspaceid, settings['DataType'], propid, tableid,
                settings['Name'], 0, extraSettings.ToJSON())
            ImpUtils.ExecuteSQL(calculationObject, datasetId, sql)
            ranknr += 1
        propDict = {}
        for property in properties:
            propDict[property['propid']] = property
        if not importSettings['ConfigOnly']:
            # Stage the custom data in a temp table, then join it onto the
            # workspace property table via the primary key.
            tmptable = Utils.GetTempID()
            columns = [{
                'name': prop['propid'],
                'DataType': prop['DataType'],
                'Index': prop['Settings']['Index'],
                'ReadData': prop['Settings']['ReadData']
            } for prop in properties]
            columns.append({
                'name': primkey,
                'DataType': 'Text',
                'Index': False,
                'ReadData': True
            })
            LoadTable.LoadTable(calculationObject, os.path.join(folder, 'data'),
                                datasetId, tmptable, columns,
                                {'PrimKey': primkey}, importSettings, False)
            print('Creating new columns')
            calculationObject.Log(
                'WARNING: better mechanism to determine column types needed here'
            )  #TODO: implement
            frst = True
            sql = "ALTER TABLE {0} ".format(sourcetable)
            for property in properties:
                propid = property['propid']
                if not frst:
                    sql += " ,"
                sqldatatype = ImpUtils.GetSQLDataType(property['DataType'])
                sql += "ADD COLUMN {0} {1}".format(propid, sqldatatype)
                frst = False
            calculationObject.LogSQLCommand(sql)
            cur.execute(sql)
            print('Joining information')
            frst = True
            credInfo.VerifyCanDo(
                DQXDbTools.DbOperationWrite(datasetId, sourcetable))
            sql = "update {0} left join {1} on {0}.{2}={1}.{2} set ".format(
                sourcetable, tmptable, primkey)
            for property in properties:
                propid = property['propid']
                if not frst:
                    sql += " ,"
                sql += "{0}.{2}={1}.{2}".format(sourcetable, tmptable, propid)
                frst = False
            calculationObject.LogSQLCommand(sql)
            cur.execute(sql)
            print('Cleaning up')
            cur.execute("DROP TABLE {0}".format(tmptable))
        if not importSettings['ConfigOnly']:
            print('Updating view')
            Utils.UpdateTableInfoView(workspaceid, tableid, allowSubSampling,
                                      cur)
        db.commit()
        db.close()
        print('Creating summary values')
        for property in properties:
            propid = property['propid']
            settings = property['Settings']
            if settings.HasToken('SummaryValues'):
                with calculationObject.LogHeader(
                        'Creating summary values for custom data {0}'.format(
                            tableid)):
                    summSettings = settings.GetSubSettings('SummaryValues')
                    if settings.HasToken('minval'):
                        summSettings.AddTokenIfMissing('MinVal',
                                                       settings['minval'])
                        # NOTE(review): MaxVal is only filled in when minval is
                        # present — confirm this asymmetry is intentional (the
                        # fixed-data path guards MaxVal on 'maxval' separately).
                        summSettings.AddTokenIfMissing('MaxVal',
                                                       settings['maxval'])
                    destFolder = os.path.join(config.BASEDIR, 'SummaryTracks',
                                              datasetId, propid)
                    if not os.path.exists(destFolder):
                        os.makedirs(destFolder)
                    dataFileName = os.path.join(destFolder, propid)
                    if not isPositionOnGenome:
                        raise Exception(
                            'Summary values defined for non-position table')
                    if not importSettings['ConfigOnly']:
                        # Export (chrom, pos, value) ordered by position, then
                        # build the summary track from the exported file.
                        calculationObject.Log('Extracting data to ' +
                                              dataFileName)
                        script = ImpUtils.SQLScript(calculationObject)
                        script.AddCommand(
                            "SELECT {2} as chrom, {3} as pos, {0} FROM {1} ORDER BY {2},{3}"
                            .format(
                                propid,
                                Utils.GetTableWorkspaceView(
                                    workspaceid, tableid), chromField,
                                posField))
                        script.Execute(datasetId, dataFileName)
                        calculationObject.LogFileTop(dataFileName, 10)
                        ImpUtils.CreateSummaryValues_Value(
                            calculationObject, summSettings, datasetId,
                            tableid, 'custom', workspaceid, propid,
                            settings['Name'], dataFileName, importSettings)
def LoadTable(calculationObject, sourceFileName, databaseid, tableid, columns, loadSettings, importSettings, allowSubSampling):
    """Stream a tab-delimited file into DB table ``tableid``.

    Unlike ``LoadTable0`` this does not hold the table in memory: it converts
    the input line-by-line into batched ``INSERT`` statements in a temporary
    SQL file, creates the table schema (varchar sizes from observed maxima),
    executes the dump, and optionally builds a ``<tableid>_SORTRAND`` companion
    table with a random ordering for sub-sampling.

    Side effects: mutates each dict in ``columns`` (adds IsString/IsValue/
    IsDate/IsBoolean/MaxLen), drops and recreates ``tableid``, and removes the
    temporary dump file on success.
    """
    def DecoId(id):
        # Backtick-quote an identifier for MySQL.
        return '`' + id + '`'

    def EncodeCell(icontent, col):
        # Convert one raw text cell into a SQL literal according to the
        # column's type flags (set on `columns` in the caller below).
        content = icontent
        if col['IsString']:
            if len(icontent) == 0:
                content = "''"
            else:
                try:
                    content = content.encode('ascii', 'ignore')
                except UnicodeDecodeError:
                    print('Unable to encode '+content)
                    content='*failed encoding*'
                # Repair common cp1252 smart-quote bytes, then escape for SQL.
                content = content.replace("\x92", "'")
                content = content.replace("\xC2", "'")
                content = content.replace("\x91", "'")
                #filter(lambda x: x in string.printable, val)
                content = content.replace("'", "\\'")
                content = content.replace('\r\n', '\\n')
                content = content.replace('\n\r', '\\n')
                content = '\'' + content + '\''
        if col['IsValue']:
            # Recognised missing-value markers become SQL NULL.
            if (content == 'NA') or (content == '') or (content == 'None') or (content == 'NULL') or (content == 'null') or (content == 'inf') or (content == '-'):
                content = 'NULL'
        if col['IsDate']:
            if len(content)>=5:
                try:
                    dt = dateutil.parser.parse(content)
                    # Days since epoch converted to a Julian day number.
                    tmdiff =(dt - datetime.datetime(1970, 1, 1)).days
                    tmdiff += 2440587.5 +0.5 # note: extra 0.5 because we set datestamp at noon
                    content = str(tmdiff)
                except Exception:
                    # FIX: was a bare `except:`, which also swallowed
                    # KeyboardInterrupt/SystemExit during long imports.
                    print('ERROR: date parsing string '+content)
                    content = 'NULL'
            else:
                content = 'NULL'
        if col['IsBoolean']:
            vl = content
            content = 'NULL'
            if (vl.lower() == 'true') or (vl.lower() == 'yes') or (vl.lower() == 'y') or (vl == '1'):
                content = '1'
            if (vl.lower() == 'false') or (vl.lower() == 'no') or (vl.lower() == 'n') or (vl == '0'):
                content = '0'
        return content

    calculationObject.Log('Loading table {0} from {1}'.format(tableid, sourceFileName))
    colDict = {col['name']: col for col in columns}
    colNameList = [col['name'] for col in columns]
    primkey = loadSettings['PrimKey']
    # 'AutoKey' means the DB generates the primary key; it need not be in the file.
    autoPrimKey = (primkey == 'AutoKey')
    print('Column info: ' + str(columns))
    print('Primary key: ' + primkey)
    # Precompute per-column type flags used by EncodeCell, and track the
    # maximum string length for varchar sizing.
    for col in columns:
        col['IsString'] = (col['DataType'] == 'Text')
        col['IsValue'] = ImpUtils.IsValueDataTypeIdenfifier(col['DataType'])
        col['IsDate'] = ImpUtils.IsDateDataTypeIdenfifier(col['DataType'])
        col['IsBoolean'] = (col['DataType'] == 'Boolean')
        col['MaxLen'] = 0
    destFileName = ImpUtils.GetTempFileName()
    # ScopeStr limits how many data lines are imported (-1 = unlimited).
    maxLineCount = -1
    if importSettings['ScopeStr'] == '1k':
        maxLineCount = 1000
    if importSettings['ScopeStr'] == '10k':
        maxLineCount = 10000
    if importSettings['ScopeStr'] == '100k':
        maxLineCount = 100000
    if importSettings['ScopeStr'] == '1M':
        maxLineCount = 1000000
    if importSettings['ScopeStr'] == '10M':
        maxLineCount = 10000000
    with open(sourceFileName, 'r') as ifp:
        if ifp is None:
            raise Exception('Unable to read file '+sourceFileName)
        with open(destFileName, 'w') as ofp:
            if ofp is None:
                raise Exception('Unable to write to temporary file ' + destFileName)
            # First line is the header; map column name -> field position.
            fileColNames = ifp.readline().rstrip('\n\r').split('\t')
            calculationObject.Log('File columns: ' + str(fileColNames))
            fileColIndex = {fileColNames[i]: i for i in range(len(fileColNames))}
            if not(autoPrimKey) and (primkey not in fileColIndex):
                raise Exception('File is missing primary key '+primkey)
            for col in columns:
                colname = col['name']
                if (col['ReadData']) and (colname not in fileColIndex):
                    raise Exception('File is missing column '+colname)
            # Batch rows into multi-row INSERT statements of up to blockSize rows.
            blockSize = 499
            blockStarted = False
            lineCount = 0
            for line in ifp:
                line = line.rstrip('\r\n')
                if len(line) > 0:
                    sourceCells = line.split('\t')
                    writeCells = []
                    for col in columns:
                        content = 'NULL'  # columns absent from the file load as NULL
                        if col['name'] in fileColIndex:
                            content = sourceCells[fileColIndex[col['name']]]
                            content = EncodeCell(content, col)
                        writeCells.append(content)
                        if col['IsString']:
                            col['MaxLen'] = max(col['MaxLen'], len(content))
                    if not(blockStarted):
                        ofp.write('INSERT INTO {0} ({1}) VALUES '.format(DecoId(tableid), ', '.join([DecoId(col) for col in colNameList])))
                        blockStarted = True
                        blockNr = 0
                    if blockNr > 0:
                        ofp.write(',')
                    ofp.write('(')
                    ofp.write(','.join(writeCells))
                    ofp.write(')')
                    blockNr += 1
                    if blockNr >= blockSize:
                        ofp.write(';\n')
                        blockStarted = False
                    lineCount += 1
                    if lineCount % 250000 == 0:
                        calculationObject.Log('Line '+str(lineCount))
                    if (maxLineCount>0) and (lineCount >= maxLineCount):
                        calculationObject.Log('WARNING:Terminating import at {0} lines'.format(lineCount))
                        break
    calculationObject.Log('Column sizes: '+str({col['name']: col['MaxLen'] for col in columns}))
    calculationObject.Log('Creating schema')
    scr = ImpUtils.SQLScript(calculationObject)
    scr.AddCommand('drop table if exists {0};'.format(DecoId(tableid)))
    sql = 'CREATE TABLE {0} (\n'.format(DecoId(tableid))
    colTokens = []
    if autoPrimKey:
        colTokens.append("{0} int AUTO_INCREMENT PRIMARY KEY".format(primkey))
    if allowSubSampling:
        # Extra column holding a uniform random value for sub-sampled queries.
        colTokens.append("_randomval_ double")
    for col in columns:
        st = DecoId(col['name'])
        typestr = ''
        if col['DataType'] == 'Text':
            # Size varchar to the longest encoded value seen (at least 1).
            typestr = 'varchar({0})'.format(max(1, col['MaxLen']))
        if len(typestr) == 0:
            typestr = ImpUtils.GetSQLDataType(col['DataType'])
        if len(typestr) == 0:
            raise Exception('Invalid property data type ' + col['DataType'])
        st += ' '+typestr
        colTokens.append(st)
    sql += ', '.join(colTokens)
    sql +=')'
    scr.AddCommand(sql)
    calculationObject.Log('Creation statement: '+sql)
    if not(autoPrimKey):
        scr.AddCommand('create unique index {0}_{1} ON {0}({1})'.format(tableid, primkey))
    for col in columns:
        if ('Index' in col) and (col['Index']) and (col['name'] != primkey):
            scr.AddCommand('create index {0}_{1} ON {0}({1})'.format(tableid, col['name']))
    scr.Execute(databaseid)
    calculationObject.Log('Importing data')
    ImpUtils.ExecuteSQLScript(calculationObject, destFileName, databaseid)
    if allowSubSampling:
        with calculationObject.LogHeader('Create subsampling table'):
            # Populate the random column, then materialise a copy of the table
            # ordered by it; RandPrimKey gives O(1) access to a random prefix.
            calculationObject.Log('Creating random data column')
            sql = "UPDATE {0} SET _randomval_=RAND()".format(tableid)
            ImpUtils.ExecuteSQL(calculationObject, databaseid, sql)
            sql = "CREATE INDEX {0}_randomindex ON {0}(_randomval_)".format(tableid)
            ImpUtils.ExecuteSQL(calculationObject, databaseid, sql)
            sql = "DROP TABLE IF EXISTS {0}_SORTRAND".format(tableid)
            ImpUtils.ExecuteSQL(calculationObject, databaseid, sql)
            sql = "CREATE TABLE {0}_SORTRAND LIKE {0}".format(tableid)
            ImpUtils.ExecuteSQL(calculationObject, databaseid, sql)
            if autoPrimKey:
                # AutoKey must lose its AUTO_INCREMENT in the copy so the
                # original key values can be inserted verbatim.
                calculationObject.Log('Restructuring AutoKey')
                sql = "alter table {0}_SORTRAND drop column AutoKey".format(tableid)
                ImpUtils.ExecuteSQL(calculationObject, databaseid, sql)
                sql = "alter table {0}_SORTRAND add column AutoKey int FIRST".format(tableid)
                ImpUtils.ExecuteSQL(calculationObject, databaseid, sql)
                sql = "create index idx_autokey on {0}_SORTRAND(AutoKey)".format(tableid)
                ImpUtils.ExecuteSQL(calculationObject, databaseid, sql)
            sql = "alter table {0}_SORTRAND add column RandPrimKey int AUTO_INCREMENT PRIMARY KEY".format(tableid)
            ImpUtils.ExecuteSQL(calculationObject, databaseid, sql)
            sql = "insert into {0}_SORTRAND select *,0 from {0} order by _randomval_".format(tableid)
            sql += ' LIMIT 5000000' # NOTE: there is little point in importing more than that!
            ImpUtils.ExecuteSQL(calculationObject, databaseid, sql)
    os.remove(destFileName)