def writeAccBCP():
    '''
    # requires:
    #
    # effects:
    # Creates appropriate BCP records
    #
    # returns:
    # nothing
    #
    '''

    global accKey, userKey

    # records that require a reference

    results = db.sql('select _Object_key, _LogicalDB_key, accID, private, geneID ' + \
        'from WRK_EntrezGene_Bucket0 ' + \
        'where taxID = %s and refRequired = 1 ' % (taxId), 'auto')

    for r in results:

        if r['_Object_key'] == -1:
            objectKey = geneIDtoMarkerKey[r['geneID']]
        else:
            objectKey = r['_Object_key']

        prefixPart, numericPart = accessionlib.split_accnum(r['accID'])

        accFile.write('%d|%s|%s|%s|%d|%d|%s|%d|1|%s|%s|%s|%s\n' \
            % (accKey, r['accID'], mgi_utils.prvalue(prefixPart), mgi_utils.prvalue(numericPart),
               r['_LogicalDB_key'], objectKey, mgiTypeKey, r['private'],
               userKey, userKey, loaddate, loaddate))

        accrefFile.write('%d|%s|%s|%s|%s|%s\n' \
            % (accKey, referenceKey, userKey, userKey, loaddate, loaddate))

        accKey = accKey + 1

    # records that don't require a reference

    results = db.sql('select _Object_key, _LogicalDB_key, accID, private, geneID ' + \
        'from WRK_EntrezGene_Bucket0 ' + \
        'where taxID = %s and refRequired = 0' % (taxId), 'auto')

    for r in results:

        if r['_Object_key'] == -1:
            objectKey = geneIDtoMarkerKey[r['geneID']]
        else:
            objectKey = r['_Object_key']

        prefixPart, numericPart = accessionlib.split_accnum(r['accID'])

        accFile.write('%d|%s|%s|%s|%d|%d|%s|%d|1|%s|%s|%s|%s\n' \
            % (accKey, r['accID'], mgi_utils.prvalue(prefixPart), mgi_utils.prvalue(numericPart),
               r['_LogicalDB_key'], objectKey, mgiTypeKey, r['private'],
               userKey, userKey, loaddate, loaddate))

        accKey = accKey + 1
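
# Nearly every loader in this section calls accessionlib.split_accnum to split
# an accession ID into its prefix and trailing numeric part. The sketch below
# is a minimal stand-in illustrating the assumed behavior (string prefix,
# integer numeric part, None when there is no numeric suffix); it is not the
# actual accessionlib implementation.
def split_accnum_demo():
    import re

    def split_accnum(accID):
        # 'MGI:97490' -> ('MGI:', 97490); a purely numeric ID yields ('', n)
        m = re.match(r'^(.*?)(\d+)$', accID)
        if m is None:
            return accID, None      # no numeric suffix at all
        return m.group(1), int(m.group(2))

    assert split_accnum('MGI:97490') == ('MGI:', 97490)
    assert split_accnum('NM_013627') == ('NM_', 13627)

split_accnum_demo()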
def createBCPFile():
    global accKey

    print('Create the bcp file for the GENSAT associations')

    #
    # Find the marker key that the EntrezGene ID should be associated with.
    # Do not make an association for any EntrezGene IDs that are on the
    # discrepancy report.
    #
    cmds = []
    cmds.append('select t.entrezgeneID, a._Object_key as markerKey ' + \
                'from ' + tempTable + ' t, ACC_Accession a ' + \
                'where lower(t.entrezgeneID) = lower(a.accID) and ' + \
                      'a._MGIType_key = ' + str(markerMGITypeKey) + ' and ' + \
                      'a._LogicalDB_key = ' + str(egLogicalDBKey) + ' ' + \
                'order by t.entrezgeneID')

    results = db.sql(cmds, 'auto')

    count = 0

    #
    # Write the records to the bcp file.
    #
    for r in results[0]:
        entrezgeneID = r['entrezgeneID']
        markerKey = r['markerKey']

        #
        # Skip the EntrezGene ID if it was written to the discrepancy report.
        #
        if entrezgeneID in badIDs:
            continue

        #
        # Get the prefix and numeric parts of the EntrezGene ID and write
        # a record to the bcp file.
        #
        (prefixPart, numericPart) = accessionlib.split_accnum(entrezgeneID)

        fpAccBCPFile.write(str(accKey) + TAB + \
                           entrezgeneID + TAB + \
                           prefixPart + TAB + \
                           str(numericPart) + TAB + \
                           str(gensatLogicalDBKey) + TAB + \
                           str(markerKey) + TAB + \
                           str(markerMGITypeKey) + TAB + \
                           PRIVATE + TAB + PREFERRED + TAB + \
                           str(createdByKey) + TAB + \
                           str(createdByKey) + TAB + \
                           loadDate + TAB + \
                           loadDate + NL)

        count = count + 1
        accKey = accKey + 1

    print('Number of GENSAT associations: ' + str(count))

    return
def writeMGPOutput():
    # Purpose: writes to the Accession, AccessionReference & StrainMarker
    #   BCP files and to the Gene Model and GM Assoc files if there are no errors
    # Returns: 1 if error, else 0
    # Assumes: file descriptors have been initialized
    # Effects: writes to the file system
    # Throws: Nothing

    global nextSMKey, nextAccKey, mgpSkipCt, totalLoadedCt

    # for markers with >1 strain-specific MGP ID,
    # report and load a strain marker and an accession for each MGP ID
    strainMarkerInputList = qcDict['mgi_mgp']

    for strainMarkerInputDict in strainMarkerInputList:
        for strain in strainMarkerInputDict:
            strainMarkerObjectsList = strainMarkerInputDict[strain]
            for coordsForMarkerList in strainMarkerObjectsList:
                for strainMarkerObject in coordsForMarkerList:
                    # write out to the bcp files
                    mgpID = strainMarkerObject.mgpID
                    mgiID = strainMarkerObject.markerID
                    markerKey = strainMarkerObject.markerKey
                    if mgiID.find('TEMP:') == 0:  # temp ID for a no-marker strain marker
                        markerKey = ''
                    strainKey = strainMarkerObject.strainKey
                    chr = strainMarkerObject.chr
                    start = strainMarkerObject.start
                    end = strainMarkerObject.end
                    strand = strainMarkerObject.strand
                    description = strainMarkerObject.description
                    biotype = strainMarkerObject.biotype
                    totalLoadedCt += 1

                    fpStrainMarkerFile.write('%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s' \
                        % (nextSMKey, TAB, strainKey, TAB, markerKey, TAB, mgpRefsKey, TAB,
                           userKey, TAB, userKey, TAB, loaddate, TAB, loaddate, CRT))

                    prefixPart, numericPart = accessionlib.split_accnum(mgpID)
                    fpAccFile.write('%s%s%s%s%s%s%s%s%s%s%s%s%s%s0%s1%s%s%s%s%s%s%s%s%s' \
                        % (nextAccKey, TAB, mgpID, TAB, prefixPart, TAB, numericPart, TAB,
                           mgpLDBKey, TAB, nextSMKey, TAB, mgiTypeKey, TAB, TAB, TAB,
                           userKey, TAB, userKey, TAB, loaddate, TAB, loaddate, CRT))

                    fpAccRefFile.write('%s%s%s%s%s%s%s%s%s%s%s%s' \
                        % (nextAccKey, TAB, mgpRefsKey, TAB, userKey, TAB, userKey, TAB,
                           loaddate, TAB, loaddate, CRT))

                    fpGmMgpFile.write('%s%s%s%s%s%s%s%s%s%s%s%s' \
                        % (mgpID, TAB, chr, TAB, start, TAB, end, TAB, strand, TAB, description, CRT))

                    fpBiotypeMgpFile.write('%s%s%s%s' % (mgpID, TAB, biotype, CRT))

                    nextSMKey += 1
                    nextAccKey += 1

    return 0
def processSusceptibility():

    # DO formatted file
    doFileName = None
    # DO file pointer
    doFile = None

    # insert statement
    INSERT_ACCESSION = '''insert into ACC_Accession
        values ((select max(_Accession_key) + 1 from ACC_Accession),
                '%s', '%s', %s, 15, %s, 13, 0, 0)
    '''

    doFileName = os.environ['OBO_FILE']
    doFile = open(doFileName, 'r')

    omimIdValue = 'id: OMIM:'
    relValue = 'relationship: RO:0003304'
    skipValue = 'OMIM:000000'
    foundOMIM = 0

    for line in doFile.readlines():

        # find [Term]
        # find relationship: RO:0003304

        # lines from readlines() keep their trailing newline, so strip
        # before comparing
        if line.strip() == '[Term]':
            foundOMIM = 0

        elif line[:9] == omimIdValue:
            omimId = line[4:-1]
            if omimId == skipValue:
                continue
            foundOMIM = 1

        elif foundOMIM and line[:24] == relValue:
            tokens = line[25:-1].split(' ')
            doId = tokens[0]
            prefixPart, numericPart = accessionlib.split_accnum(omimId)
            objectKey = loadlib.verifyObject(doId, 13, None, None, None)
            addSQL = INSERT_ACCESSION % (omimId, prefixPart, numericPart, objectKey)
            db.sql(addSQL, None)

        else:
            continue

    doFile.close()
    db.commit()
    return 0
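
# A small, self-contained sketch of the OBO scan above, run against an inline
# stanza instead of OBO_FILE. The term ID and DO ID here are made up for
# illustration; only the line-prefix matching mirrors the loader.
def obo_scan_demo():
    stanza = """[Term]
id: OMIM:100100
name: some susceptibility term
relationship: RO:0003304 DOID:0050177 ! predisposes towards
"""
    foundOMIM = 0
    for line in stanza.splitlines():
        if line.strip() == '[Term]':
            foundOMIM = 0
        elif line.startswith('id: OMIM:'):
            omimId = line[4:]                # 'OMIM:100100'
            foundOMIM = 1
        elif foundOMIM and line.startswith('relationship: RO:0003304'):
            doId = line[25:].split(' ')[0]   # 'DOID:0050177'
            print(omimId, '->', doId)

obo_scan_demo()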
def process():
    global propertiesDict, expCount, loadedCount, inDbCount, invalidSampleCountDict
    global invalidReleaseDateDict, invalidUpdateDateDict, noIdList
    global nextExptKey, nextAccKey, nextExptVarKey, nextPropKey
    global updateExptCount

    for f in jFile['experiments']['experiment']:
        expCount += 1

        # definitions with SUPERSERIES text get a different evaluation state
        # than the load default; evaluation date and evaluated-by are set
        # by the load (default null)
        isSuperSeries = 0
        evalStateToUseKey = defaultEvalStateTermKey

        try:
            # description is str or list
            allDescription = f['description']
            description = allDescription['text']  # experiment, one

            # US108 'clean up URLs that appear in description field'. All
            # URLs that need to be cleaned up are in list-type descriptions.
            # Example of an element in a description list with a URL that we
            # need to parse:
            # {'a': {'href': 'http://lgsun.grc.nia.nih.gov/ANOVA/', 'target': '_blank', '$': 'http://lgsun.grc.nia.nih.gov/ANOVA/'}}
            if type(description) == list:
                listDescript = ''
                for d in description:
                    if type(d) == dict:
                        if 'a' in d:
                            url = d['a']['$']
                            listDescript = listDescript + url
                        # skip these: {"br":null}
                        elif 'br' in d:
                            continue
                    else:
                        listDescript = listDescript + str(d)
                description = listDescript
        except:
            description = ''

        if description == None:  # {'text': None, 'id': None}
            description = ''
        description = str.strip(description)

        if description.find(SUPERSERIES) != -1:
            evalStateToUseKey = altEvalStateTermKey
            isSuperSeries = 1

        try:
            name = f['name']
            if type(name) == list:
                name = '|'.join(name)
        except:
            name = ''
        name = str.strip(name)

        try:
            primaryID = str.strip(f['accession'])  # accession
        except:
            primaryID = ''

        try:
            sampleCount = f['samples']  # property, one
        except:
            sampleCount = ''

        try:
            releasedate = f['releasedate']  # experiment, one
        except:
            releasedate = ''

        try:
            # experimentalfactor.name, list or dict
            expFactor = f['experimentalfactor']
            if type(expFactor) == dict:
                expFactorList = [expFactor]
            else:
                expFactorList = expFactor
            expFactorSet = set()
            for e in expFactorList:  # property, many stored individually
                # weed out dups
                expFactorSet.add(e['name'])
            expFactorList = list(expFactorSet)
        except:
            expFactorList = []

        try:
            lastupdatedate = f['lastupdatedate']  # experiment, one
        except:
            lastupdatedate = ''

        try:
            # provider.contact, dictionary or list of dictionaries; need
            # to remove exact dups
            providerList = []
            if type(f['provider']) != list:
                providerList = [f['provider']['contact']]
            else:
                for p in f['provider']:
                    if p['contact'] != None:
                        providerList.append(p['contact'])
            providerSet = set(providerList)
            providerList = list(providerSet)
        except:
            providerList = []

        try:
            # experimenttype is str or list; property, many stored individually
            if type(f['experimenttype']) != list:
                experimenttypeList = [f['experimenttype']]
            else:
                experimenttypeList = f['experimenttype']
        except:
            experimenttypeList = []

        # pick the first valid experiment type and translate it to populate
        # the exptype key
        exptTypeKey = 0
        for exp in experimenttypeList:
            if exp in exptTypeTransDict:
                exptTypeKey = exptTypeTransDict[exp]
                break
        if exptTypeKey == 0:
            exptTypeKey = exptTypeNRKey  # Not Resolved

        try:
            # PubMed IDs - bibliography.accession
            # TR13116/check for duplicate pubmed IDs
            bibliographyList = []
            if type(f['bibliography']) == dict:  # dictionary
                if str(f['bibliography']['accession']) not in bibliographyList:
                    bibliographyList.append(str(f['bibliography']['accession']))
            else:  # list
                for b in f['bibliography']:  # for each dict in the list
                    if 'accession' in b:
                        if str(b['accession']) not in bibliographyList:
                            bibliographyList.append(str(b['accession']))
        except:
            bibliographyList = []

        # the templates for properties:
        propertyTemplate = "#====#%s%s%s#=#%s%s%s%s%s#==#%s#===#%s%s%s%s%s%s%s%s%s" % (TAB, propTypeKey, TAB, TAB, nextExptKey, TAB, mgiTypeKey, TAB, TAB, TAB, userKey, TAB, userKey, TAB, loadDate, TAB, loadDate, CRT)
        propertyUpdateTemplate = "#====#%s%s%s#=#%s#=====#%s%s%s#==#%s#===#%s%s%s%s%s%s%s%s%s" % (TAB, propTypeKey, TAB, TAB, TAB, mgiTypeKey, TAB, TAB, TAB, userKey, TAB, userKey, TAB, loadDate, TAB, loadDate, CRT)

        #
        # update pubmed ID properties if this ID is already in the database
        #
        if primaryID in primaryIdDict:
            inDbCount += 1

            # not all experiments have pubmed IDs
            if primaryID in pubMedByExptDict:
                # get the list of pubmed IDs for this expt in the database
                dbBibList = pubMedByExptDict[primaryID]

                # get the set of incoming pubmed IDs not in the database
                newSet = set(bibliographyList).difference(set(dbBibList))

                # if we have new pubmed IDs, add them to the database
                if newSet:
                    updateExpKey = primaryIdDict[primaryID]

                    # get the next sequenceNum for this expt's pubmed IDs
                    # in the database
                    results = db.sql('''select max(sequenceNum) + 1 as nextNum
                        from MGI_Property p
                        where p._Object_key = %s
                        and p._PropertyTerm_key = 20475430
                        and p._PropertyType_key = 1002''' % updateExpKey, 'auto')
                    nextSeqNum = results[0]['nextNum']

                    updateExptCount += 1

                    for b in newSet:
                        toLoad = propertyUpdateTemplate.replace('#=#', str(pubmedPropKey)).replace('#==#', str(b)).replace('#===#', str(nextSeqNum)).replace('#====#', str(nextPropKey)).replace('#=====#', str(updateExpKey))
                        fpPropertyBcp.write(toLoad)
                        nextPropKey += 1

            # continue so we don't dup what is in the db
            continue

        prefixPartPrimary, numericPartPrimary = accessionlib.split_accnum(primaryID)

        #
        # Do QC checks.
        # If there are errors, skip to the next experiment.
        #
        if doQcChecks(primaryID, name, sampleCount, releasedate, lastupdatedate):
            continue

        # calculate the secondary GEO ID for AE GEO IDs
        geoID = calculateGeoId(primaryID)

        #
        # now write out to the bcp files
        #
        loadedCount += 1

        # GXD_Experiment
        # many optional nulls - create the insert string
        line = '%s%s%s%s' % (nextExptKey, TAB, sourceKey, TAB)
        if name != '':
            line = line + name + TAB

        if description != '' and description != None:
            line = line + description + TAB
        else:
            line = line + TAB

        if releasedate != '':
            line = line + releasedate + TAB
        else:
            line = line + TAB

        if lastupdatedate != '':
            line = line + lastupdatedate + TAB
        else:
            line = line + TAB

        # evaluated date is today if this is a superseries
        if isSuperSeries:
            line = line + loadDate + TAB
        else:
            # evaluated_date is null
            line = line + TAB

        line = line + str(evalStateToUseKey) + TAB

        if isSuperSeries:
            line = line + str(altCurStateTermKey) + TAB
        else:
            line = line + str(curStateTermKey) + TAB

        line = line + str(studyTypeTermKey) + TAB
        line = line + str(exptTypeKey) + TAB

        # evalByKey is null unless isSuperSeries is true, then it is the
        # load user
        if isSuperSeries:
            line = line + str(userKey) + TAB
        else:
            line = line + TAB

        # initialCurByKey, lastCurByKey, initialCurDate, lastCurDate all null
        line = line + TAB + TAB + TAB + TAB

        # created and modified by
        line = line + str(userKey) + TAB + str(userKey) + TAB

        # creation and modification date
        line = line + loadDate + TAB + loadDate + CRT

        fpExperimentBcp.write(line)

        # Primary Accession
        fpAccBcp.write('%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s' \
            % (nextAccKey, TAB, primaryID, TAB, prefixPartPrimary, TAB, numericPartPrimary, TAB,
               aeLdbKey, TAB, nextExptKey, TAB, mgiTypeKey, TAB, private, TAB, isPreferred, TAB,
               userKey, TAB, userKey, TAB, loadDate, TAB, loadDate, CRT))
        nextAccKey += 1

        # Secondary Accession
        if geoID != '':
            prefixPartSecondary, numericPartSecondary = accessionlib.split_accnum(geoID)
            fpAccBcp.write('%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s' \
                % (nextAccKey, TAB, geoID, TAB, prefixPartSecondary, TAB, numericPartSecondary, TAB,
                   geoLdbKey, TAB, nextExptKey, TAB, mgiTypeKey, TAB, private, TAB, notPreferred, TAB,
                   userKey, TAB, userKey, TAB, loadDate, TAB, loadDate, CRT))
            nextAccKey += 1

        # Variable
        fpVariableBcp.write('%s%s%s%s%s%s' % (nextExptVarKey, TAB, nextExptKey, TAB, exptVariableTermKey, CRT))
        nextExptVarKey += 1

        #
        # Properties
        #
        # name (0,1, pipe-delim)
        # sampleCount (0,1)
        # expFactorList (0-n)
        # providerList (0-n)
        # experimenttypeList (0-n)
        # bibliographyList (0-n)
        #
        # propName, value and sequenceNum are filled in later

        if name != '':
            toLoad = propertyTemplate.replace('#=#', str(namePropKey)).replace('#==#', name).replace('#===#', '1').replace('#====#', str(nextPropKey))
            fpPropertyBcp.write(toLoad)
            nextPropKey += 1

        if sampleCount != '':
            toLoad = propertyTemplate.replace('#=#', str(sampleCountPropKey)).replace('#==#', str(sampleCount)).replace('#===#', '1').replace('#====#', str(nextPropKey))
            fpPropertyBcp.write(toLoad)
            nextPropKey += 1

        seqNumCt = 1
        for e in expFactorList:
            toLoad = propertyTemplate.replace('#=#', str(expFactorPropKey)).replace('#==#', e).replace('#===#', str(seqNumCt)).replace('#====#', str(nextPropKey))
            fpPropertyBcp.write(toLoad)
            seqNumCt += 1
            nextPropKey += 1

        seqNumCt = 1
        for p in providerList:
            toLoad = propertyTemplate.replace('#=#', str(contactNamePropKey)).replace('#==#', p).replace('#===#', str(seqNumCt)).replace('#====#', str(nextPropKey))
            fpPropertyBcp.write(toLoad)
            seqNumCt += 1
            nextPropKey += 1

        seqNumCt = 1
        for e in experimenttypeList:
            toLoad = propertyTemplate.replace('#=#', str(expTypePropKey)).replace('#==#', e).replace('#===#', str(seqNumCt)).replace('#====#', str(nextPropKey))
            fpPropertyBcp.write(toLoad)
            seqNumCt += 1
            nextPropKey += 1

        seqNumCt = 1
        for b in bibliographyList:
            toLoad = propertyTemplate.replace('#=#', str(pubmedPropKey)).replace('#==#', str(b)).replace('#===#', str(seqNumCt)).replace('#====#', str(nextPropKey))
            fpPropertyBcp.write(toLoad)
            seqNumCt += 1
            nextPropKey += 1

        nextExptKey += 1

    return
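
# The property rows above are built by stamping per-property values into a
# pre-formatted template whose placeholders (#=# term key, #==# value,
# #===# sequence number, #====# property key) are substituted with replace().
# None of the placeholders is a substring of another, so the replace order is
# safe. A self-contained sketch with made-up keys:
def property_template_demo():
    TAB, CRT = '\t', '\n'
    # hypothetical keys, for illustration only
    propTypeKey, exptKey, mgiTypeKey, userKey = 1002, 999, 42, 1001
    loadDate = '2024-01-01'

    template = "#====#%s%s%s#=#%s%s%s%s%s#==#%s#===#%s%s%s%s%s%s%s%s%s" % (
        TAB, propTypeKey, TAB, TAB, exptKey, TAB, mgiTypeKey, TAB, TAB, TAB,
        userKey, TAB, userKey, TAB, loadDate, TAB, loadDate, CRT)

    # stamp one 'name' property: propKey=5000, termKey=20475428, seqNum=1
    row = template.replace('#=#', '20475428') \
                  .replace('#==#', 'my experiment name') \
                  .replace('#===#', '1') \
                  .replace('#====#', '5000')
    print(row.split(TAB))

property_template_demo()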
def processFile():
    global primerKey, refKey, aliasKey, accKey, mgiKey

    lineNum = 0

    # For each line in the input file
    for line in inputFile.readlines():
        error = 0
        lineNum = lineNum + 1

        # Split the line into tokens
        tokens = line[:-1].split('\t')

        try:
            markerSymbol = tokens[0]  # not used
            markerIDs = tokens[1].split('|')
            name = tokens[2]
            jnum = tokens[3]
            regionCovered = tokens[4]
            sequence1 = tokens[5]
            sequence2 = tokens[6]
            productSize = tokens[7]
            notes = tokens[8]
            sequenceIDs = tokens[9]
            aliasList = tokens[10].split('|')
            createdBy = tokens[11]
        except:
            exit(1, 'Invalid Line (%d): %s\n' % (lineNum, line))

        # marker IDs
        markerList = []
        for markerID in markerIDs:
            markerKey = loadlib.verifyMarker(markerID, lineNum, errorFile)
            if len(markerID) > 0 and markerKey == 0:
                errorFile.write('Invalid Marker: %s, %s\n' % (name, markerID))
                error = 1
            elif len(markerID) > 0:
                markerList.append(markerKey)

        referenceKey = loadlib.verifyReference(jnum, lineNum, errorFile)
        createdByKey = loadlib.verifyUser(createdBy, lineNum, errorFile)

        # sequence IDs
        seqAccList = sequenceIDs.split('|')

        # if errors, continue to the next record
        if error:
            continue

        # if no errors, process the primer
        primerFile.write('%d\t%s\t\t%d\t%d\t%d\t%s\t%s\t%s\t\t\t%s\t%s\t%s\t%s\t%s\n' \
            % (primerKey, name, NA, vectorKey, segmentTypeKey,
               mgi_utils.prvalue(sequence1), mgi_utils.prvalue(sequence2),
               mgi_utils.prvalue(regionCovered), mgi_utils.prvalue(productSize),
               createdByKey, createdByKey, loaddate, loaddate))

        for markerKey in markerList:
            if markerList.count(markerKey) == 1:
                markerFile.write('%s\t%s\t%d\t%s\t%s\t%s\t%s\t%s\n' \
                    % (primerKey, markerKey, referenceKey, relationship,
                       createdByKey, createdByKey, loaddate, loaddate))
            else:
                errorFile.write('Invalid Marker Duplicate: %s, %s\n' % (name, markerID))

        refFile.write('%s\t%s\t%s\t0\t0\t%s\t%s\t%s\t%s\n' \
            % (refKey, primerKey, referenceKey, createdByKey, createdByKey, loaddate, loaddate))

        # aliases
        for alias in aliasList:
            if len(alias) == 0:
                continue
            aliasFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
                % (aliasKey, refKey, alias, createdByKey, createdByKey, loaddate, loaddate))
            aliasKey = aliasKey + 1

        # MGI Accession ID for the primer
        accFile.write('%s\t%s%d\t%s\t%s\t1\t%d\t%d\t0\t1\t%s\t%s\t%s\t%s\n' \
            % (accKey, mgiPrefix, mgiKey, mgiPrefix, mgiKey, primerKey, mgiTypeKey,
               createdByKey, createdByKey, loaddate, loaddate))

        newPrimerFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%d\n' \
            % (markerSymbol, '|'.join(markerIDs), name, jnum, regionCovered,
               sequence1, sequence2, productSize, notes, sequenceIDs, createdBy,
               mgiPrefix, mgiKey))

        accKey = accKey + 1
        mgiKey = mgiKey + 1

        # sequence accession ids
        for acc in seqAccList:
            if len(acc) == 0:
                continue
            prefixPart, numericPart = accessionlib.split_accnum(acc)
            accFile.write('%s\t%s\t%s\t%s\t%s\t%d\t%d\t0\t1\t%s\t%s\t%s\t%s\n' \
                % (accKey, acc, prefixPart, numericPart, logicalDBKey, primerKey,
                   mgiTypeKey, createdByKey, createdByKey, loaddate, loaddate))
            accRefFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' \
                % (accKey, referenceKey, createdByKey, createdByKey, loaddate, loaddate))
            accKey = accKey + 1

        # notes
        if len(notes) > 0:
            noteFile.write('%s\t1\t%s\t%s\t%s\n' % (primerKey, notes, loaddate, loaddate))

        refKey = refKey + 1
        primerKey = primerKey + 1

    # end of "for line in inputFile.readlines():"

    #
    # Update the AccessionMax value
    #
    if not DEBUG:
        db.sql('select * from ACC_setMax (%d)' % (lineNum), None)
def process(expFile):
    global expCount, exptLoadedCount, updateExptCount, updateExptList
    global nextExptKey, nextAccKey, nextExptVarKey, nextPropKey
    global expSkippedNotInDbTransIsSuperseriesSet, expSkippedNoSampleList
    global expIdsInDbSet, expLoadedNoSampleList, expIdsInDbNoSamplesSet
    global expSkippedNotInDbNoTransSet, expSkippedMaxSamplesSet

    f = open(expFile, encoding='utf-8', errors='replace')
    context = ET.iterparse(f, events=("start", "end"))
    context = iter(context)

    level = 0
    expID = ''
    title = ''
    summary = ''
    pdat = ''
    gdsType = ''
    exptType = ''
    n_samples = ''
    pubmedList = []
    sampleList = []         # list of sample IDs
    isSuperSeries = 'no'    # flag to indicate the expt is a superseries; skip
    exptTypeKey = 0         # if 0, the chosen gdsType did not translate; skip
    isTpr = 0

    try:
        for event, elem in context:
            # end of a record - process it, then reset everything
            if event == 'end' and elem.tag == 'DocumentSummary':
                expCount += 1
                skip = 0
                #print('\n\nexpID: %s' % expID)
                allExptIdList.append(expID)

                #
                # Experiment is in the database:
                # add new pubmed IDs, and add raw sample data to those
                # curated experiments that do not have it
                #
                if expID in primaryIdDict:
                    updateExpKey = primaryIdDict[expID]

                    #
                    # check for additional pubmed IDs
                    #
                    propertyUpdateTemplate = "#====#%s%s%s#=#%s#=====#%s%s%s#==#%s#===#%s%s%s%s%s%s%s%s%s" % (TAB, propTypeKey, TAB, TAB, TAB, exptMgiTypeKey, TAB, TAB, TAB, userKey, TAB, userKey, TAB, loadDate, TAB, loadDate, CRT)

                    skip = 1
                    expIdsInDbSet.add(expID)
                    #print(' expIdInDb skip')

                    # not all experiments have pubmed IDs in the database;
                    # assigning an empty list assures we pick up this case
                    dbBibList = []

                    if expID in pubMedByExptDict:
                        # get the list of pubmed IDs for this expt in the database
                        dbBibList = pubMedByExptDict[expID]

                    # get the set of incoming pubmed IDs not in the database
                    newSet = set(pubmedList).difference(set(dbBibList))

                    # if we have new pubmed IDs, add them to the database
                    if newSet:
                        #print('found new pubmed ids: %s' % newSet)

                        # get the next property sequence number for this
                        # expt's pubmed IDs in the database
                        results = db.sql('''select max(sequenceNum) + 1 as nextNum
                            from MGI_Property p
                            where p._Object_key = %s
                            and p._PropertyTerm_key = 20475430
                            and p._PropertyType_key = 1002''' % updateExpKey, 'auto')

                        nextSeqNum = results[0]['nextNum']
                        if nextSeqNum == None:
                            nextSeqNum = 1

                        updateExptCount += 1
                        updateExptList.append(expID)

                        for b in newSet:
                            toLoad = propertyUpdateTemplate.replace('#=#', str(pubmedPropKey)).replace('#==#', str(b)).replace('#===#', str(nextSeqNum)).replace('#====#', str(nextPropKey)).replace('#=====#', str(updateExpKey))
                            fpPropertyBcp.write(toLoad)
                            nextPropKey += 1

                    # if there's no raw sample data for the existing experiment, add it
                    if expID in curatedGeoNoRawSampleDict:
                        ret = processSamples(expID, 'true')  # a list of sample info
                        # 1 means there was no sample file; 2 means there was
                        # a parsing error
                        if ret == 1 or ret == 2:
                            print('expt inDb returnCode for %s: %s' % (expID, ret))
                        else:
                            #print('expt inDb adding samples for key: %s expID: %s sampleInfo:%s' % (updateExpKey, expID, ret))
                            expIdsInDbNoSamplesSet.add(expID)

                            #
                            # GXD_HTRawSample and MGI_KeyValue BCP for
                            # curated experiments (no raw sample data)
                            #
                            processSampleBcp(ret, updateExpKey)

                typeList = list(map(str.strip, gdsType.split(';')))

                if skip != 1:
                    (exptTypeKey, exptType) = processExperimentType(typeList)
                    if exptTypeKey == 0:
                        # expts whose type doesn't translate and that are not
                        # already in the db
                        expSkippedNotInDbNoTransSet.add(expID)
                        skip = 1
                        #print(' expIdNotInDbNoTrans skip')

                if skip != 1 and isSuperSeries == 'yes':
                    # superseries not already caught because of an
                    # untranslated exptType or because they are already in the DB
                    expSkippedNotInDbTransIsSuperseriesSet.add(expID)
                    skip = 1

                if skip != 1 and int(n_samples) > int(maxSamples):
                    expSkippedMaxSamplesSet.add(expID)
                    skip = 1

                #print('exptTypeKey: %s isSuperSeries: %s skip: %s' % (exptTypeKey, isSuperSeries, skip))
                if skip != 1:
                    exptLoadedCount += 1
                    createExpObject = 0

                    # now process the samples
                    ret = processSamples(expID, 'false')
                    #print('ret: %s' % ret)

                    if ret == 1:
                        expLoadedNoSampleList.append('expID: %s' % (expID))
                        createExpObject = 1
                    elif ret == 2:
                        expSkippedNoSampleList.append('expID: %s' % (expID))
                        exptLoadedCount -= 1  # decrement the loaded count
                    else:
                        # list of sampleStrings representing each sample for
                        # the current experiment
                        sampleList = ret
                        createExpObject = 1

                    if createExpObject:
                        # catenate the global overallDesign parsed from the
                        # samples to the experiment summary
                        description = '%s %s' % (summary, overallDesign)
                        description = description.replace('\t', ' ')
                        description = description.replace('\n', ' ')

                        if runParsingReports == 'true':
                            fpExpParsingFile.write('%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s' \
                                % (expID, TAB, ', '.join(sampleList), TAB, title, TAB,
                                   description, TAB, isSuperSeries, TAB, pdat, TAB,
                                   exptType, TAB, n_samples, TAB, ', '.join(pubmedList), CRT))

                        #
                        # GXD_HTExperiment BCP
                        #
                        line = '%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s' % (nextExptKey, TAB, sourceKey, TAB, title, TAB, description, TAB, pdat, TAB, releasedate, TAB, evalDate, TAB, evalStateTermKey, TAB, curStateTermKey, TAB, studyTypeTermKey, TAB, exptTypeKey, TAB, evalByKey, TAB, initCurByKey, TAB, lastCurByKey, TAB, initCurDate, TAB, lastCurDate, TAB, confidence, TAB, userKey, TAB, userKey, TAB, loadDate, TAB, loadDate, CRT)
                        #print('line: %s' % line)
                        fpExperimentBcp.write(line)

                        #
                        # GXD_HTVariable BCP
                        #
                        fpVariableBcp.write('%s%s%s%s%s%s' % (nextExptVarKey, TAB, nextExptKey, TAB, exptVariableTermKey, CRT))
                        nextExptVarKey += 1

                        #
                        # ACC_Accession BCP
                        #
                        prefixPart, numericPart = accessionlib.split_accnum(expID)
                        fpAccBcp.write('%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s' \
                            % (nextAccKey, TAB, expID, TAB, prefixPart, TAB, numericPart, TAB,
                               geoLdbKey, TAB, nextExptKey, TAB, exptMgiTypeKey, TAB, private, TAB,
                               isPreferred, TAB, userKey, TAB, userKey, TAB, loadDate, TAB, loadDate, CRT))
                        nextAccKey += 1

                        #
                        # Experiment Properties
                        #
                        # title (1) experiment name
                        #     namePropKey = 20475428
                        # n_samples (1) count of samples
                        #     sampleCountPropKey = 20475424
                        # typeList (1-n) raw experiment types
                        #     expTypePropKey = 20475425
                        # pubmedList (0-n) pubmed Ids
                        #     pubmedPropKey = 20475430
                        # description (1) sample overalldesign + expt summary
                        #     descriptionPropKey = 87508020

                        # the template for properties:
                        propertyTemplate = "#====#%s%s%s#=#%s%s%s%s%s#==#%s#===#%s%s%s%s%s%s%s%s%s" % (TAB, propTypeKey, TAB, TAB, nextExptKey, TAB, exptMgiTypeKey, TAB, TAB, TAB, userKey, TAB, userKey, TAB, loadDate, TAB, loadDate, CRT)

                        if title != '':
                            toLoad = propertyTemplate.replace('#=#', str(namePropKey)).replace('#==#', title).replace('#===#', '1').replace('#====#', str(nextPropKey))
                            fpPropertyBcp.write(toLoad)
                            nextPropKey += 1

                        if n_samples != '':
                            toLoad = propertyTemplate.replace('#=#', str(sampleCountPropKey)).replace('#==#', str(n_samples)).replace('#===#', '1').replace('#====#', str(nextPropKey))
                            fpPropertyBcp.write(toLoad)
                            nextPropKey += 1

                        seqNumCt = 1
                        for e in typeList:
                            toLoad = propertyTemplate.replace('#=#', str(expTypePropKey)).replace('#==#', e).replace('#===#', str(seqNumCt)).replace('#====#', str(nextPropKey))
                            fpPropertyBcp.write(toLoad)
                            seqNumCt += 1
                            nextPropKey += 1

                        seqNumCt = 1
                        for b in pubmedList:
                            toLoad = propertyTemplate.replace('#=#', str(pubmedPropKey)).replace('#==#', str(b)).replace('#===#', str(seqNumCt)).replace('#====#', str(nextPropKey))
                            fpPropertyBcp.write(toLoad)
                            seqNumCt += 1
                            nextPropKey += 1

                        if description != '':
                            toLoad = propertyTemplate.replace('#=#', str(descriptionPropKey)).replace('#==#', description).replace('#===#', '1').replace('#====#', str(nextPropKey))
                            fpPropertyBcp.write(toLoad)
                            nextPropKey += 1

                        #
                        # GXD_HTRawSample and MGI_KeyValue BCP
                        #
                        # ret from processSamples = 1 means there was no sample
                        # file, so the experiment is created, but with no samples
                        if ret != 1:
                            processSampleBcp(sampleList, nextExptKey)

                        # now increment the experiment key
                        nextExptKey += 1

                # reset for the next record
                title = ''
                summary = ''
                isSuperSeries = 'no'
                pdat = ''
                gdsType = ''
                n_samples = ''
                pubmedList = []
                sampleList = []
                exptTypeKey = 0

            if level == 4:
                # an Accession tag at level 4 tells us we have a new record
                if elem.tag == 'Accession':
                    expID = elem.text
                elif elem.tag == 'title':
                    title = elem.text
                elif elem.tag == 'summary':
                    summary = elem.text
                    if summary.find(SUPERSERIES) != -1:
                        isSuperSeries = 'yes'
                        #print('isSuperSeries: %s' % expID)
                elif elem.tag == 'PDAT':
                    pdat = elem.text
                elif elem.tag == 'gdsType':
                    gdsType = elem.text
                elif elem.tag == 'n_samples':
                    n_samples = elem.text

            if event == 'start':
                level += 1
                #print('level: %s elemTag: %s elemText: %s' % (level, elem.tag, elem.text))
            elif elem.tag == 'int':
                id = elem.text
                #print('id: %s' % id)
                pubmedList.append(id)
            elif level == 6 and elem.tag == 'Accession':
                sampleList.append(elem.text)

            if event == 'end':
                level -= 1
                elem.clear()
    except:
        print('Parsing error: on %s' % expFile)
        return 1

    return 0
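
# A stripped-down sketch of the iterparse pattern used above: track nesting
# depth with start/end events and clear each element once processed to bound
# memory on large files. The XML here is a placeholder, not the real
# eSummary schema.
import xml.etree.ElementTree as ET
from io import StringIO

def iterparse_demo():
    xml = '<root><DocumentSummary><Accession>GSE1</Accession></DocumentSummary></root>'
    level = 0
    for event, elem in ET.iterparse(StringIO(xml), events=('start', 'end')):
        if event == 'start':
            level += 1
        else:  # 'end': text content is complete at this point
            if elem.tag == 'Accession':
                print('record at depth %d: %s' % (level, elem.text))
            level -= 1
            elem.clear()  # free children we no longer need

iterparse_demo()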
def process():
    """
    Query the database to determine if dummy sequences need to be created.
    Generates the appropriate BCP files.
    """

    global seqKey, assocKey, accKey

    # generate a table of all mouse molecular segment Acc IDs whose GenBank
    # seq IDs are not represented as Sequence objects
    db.sql("""select a.accID, a._LogicalDB_key, ps._Organism_key
        INTO TEMPORARY TABLE probeaccs1
        from ACC_Accession a, PRB_Probe p, PRB_Source ps
        where a._MGIType_key = 3
        and a._LogicalDB_key = 9
        and a._Object_key = p._Probe_key
        and p._Source_key = ps._Source_key
        and ps._Organism_key = 1
        and not exists (select 1 from ACC_Accession s
            where s._MGIType_key = 19
            and s._LogicalDB_key = a._LogicalDB_key
            and lower(s.accID) = lower(a.accID))""", None)

    # generate a table of all mouse marker Acc IDs whose GenBank, SWISSProt,
    # RefSeq, TrEMBL IDs are not represented as Sequence objects
    db.sql("""select a.accID, a._LogicalDB_key, m._Organism_key
        INTO TEMPORARY TABLE markeraccs1
        from ACC_Accession a, MRK_Marker m
        where a._MGIType_key = 2
        and a._LogicalDB_key in (9,13,27,41)
        and a._Object_key = m._Marker_key
        and m._Organism_key = 1
        and m._Marker_Status_key in (1,2)
        and not exists (select 1 from ACC_Accession s
            where s._MGIType_key = 19
            and s._LogicalDB_key = a._LogicalDB_key
            and lower(s.accID) = lower(a.accID))""", None)

    # generate a table of all non-mouse molecular segment Acc IDs whose
    # GenBank seq IDs are not represented as Sequence objects
    db.sql("""select a.accID, a._LogicalDB_key, s._Organism_key
        INTO TEMPORARY TABLE probeaccs2
        from ACC_Accession a, PRB_Probe p, PRB_Source s
        where a._MGIType_key = 3
        and a._LogicalDB_key = 9
        and a._Object_key = p._Probe_key
        and p._Source_key = s._Source_key
        and s._Organism_key != 1
        and not exists (select 1 from ACC_Accession s
            where s._MGIType_key = 19
            and s._LogicalDB_key = a._LogicalDB_key
            and lower(s.accID) = lower(a.accID))""", None)

    # generate a table of all non-mouse marker Acc IDs whose GenBank,
    # SWISSProt, RefSeq, TrEMBL IDs are not represented as Sequence objects
    db.sql("""select a.accID, a._LogicalDB_key, m._Organism_key
        INTO TEMPORARY TABLE markeraccs2
        from ACC_Accession a, MRK_Marker m
        where a._MGIType_key = 2
        and a._LogicalDB_key in (9,13,27,41)
        and a._Object_key = m._Marker_key
        and m._Organism_key != 1
        and not exists (select 1 from ACC_Accession s
            where s._MGIType_key = 19
            and s._LogicalDB_key = a._LogicalDB_key
            and lower(s.accID) = lower(a.accID))""", None)

    # union these 4 sets together to form one unique set
    db.sql("""select accID, _LogicalDB_key, _Organism_key
        INTO TEMPORARY TABLE allaccs
        from probeaccs1
        union
        select accID, _LogicalDB_key, _Organism_key
        from markeraccs1
        union
        select accID, _LogicalDB_key, _Organism_key
        from probeaccs2
        union
        select accID, _LogicalDB_key, _Organism_key
        from markeraccs2""", None)

    results = db.sql('select * from allaccs', 'auto')

    for r in results:

        accID = r['accID']
        logicalDB = r['_LogicalDB_key']
        organism = r['_Organism_key']

        if organism == 1:
            sourceKey = mouseSourceKey
        else:
            sourceKey = nonmouseSourceKey

        virtual = 1

        # change values for specific cases
        # types: 316347 (DNA), 316346 (RNA), 316348 (polypeptide), 316349 (not loaded)
        # quality: 316338 (high), 316339 (medium), 316340 (low), 316341 (not loaded)
        # provider: 316380 (GenBank/EMBL/DDBJ), 316372 (RefSeq)
        #           316384 (SwissProt), 316385 (TrEMBL)

        if logicalDB == 9:      # GenBank
            typeKey = 316349
            qualityKey = 316341
            providerKey = 316380
            virtual = 0
        elif logicalDB == 27:   # RefSeq
            typeKey = 316349
            qualityKey = 316338
            providerKey = 316372
        elif logicalDB == 13:   # SwissProt
            typeKey = 316348
            qualityKey = 316338
            providerKey = 316384
        elif logicalDB == 41:   # TrEMBL
            typeKey = 316348
            qualityKey = 316340
            providerKey = 316385

        seqFile.write(mgi_utils.prvalue(seqKey) + DL + \
            mgi_utils.prvalue(typeKey) + DL + \
            mgi_utils.prvalue(qualityKey) + DL + \
            mgi_utils.prvalue(statusKey) + DL + \
            mgi_utils.prvalue(providerKey) + DL + \
            mgi_utils.prvalue(organism) + DL + \
            DL + DL + DL + DL + \
            mgi_utils.prvalue(virtual) + DL + \
            DL + \
            loaddate + DL + loaddate + DL + \
            str(userKey) + DL + str(userKey) + DL + \
            loaddate + DL + loaddate + NL)

        rawFile.write(mgi_utils.prvalue(seqKey) + DL + \
            notLoaded + DL + \
            notLoaded + DL + \
            notLoaded + DL + \
            notLoaded + DL + \
            notLoaded + DL + \
            notLoaded + DL + \
            notLoaded + DL + \
            notLoaded + DL + \
            str(userKey) + DL + str(userKey) + DL + \
            loaddate + DL + loaddate + NL)

        sourceFile.write(mgi_utils.prvalue(assocKey) + DL + \
            mgi_utils.prvalue(seqKey) + DL + \
            mgi_utils.prvalue(sourceKey) + DL + \
            str(userKey) + DL + str(userKey) + DL + \
            loaddate + DL + loaddate + NL)

        prefixPart, numericPart = accessionlib.split_accnum(accID)
        accFile.write(mgi_utils.prvalue(accKey) + DL + \
            mgi_utils.prvalue(accID) + DL + \
            mgi_utils.prvalue(prefixPart) + DL + \
            mgi_utils.prvalue(numericPart) + DL + \
            mgi_utils.prvalue(logicalDB) + DL + \
            mgi_utils.prvalue(seqKey) + DL + \
            mgi_utils.prvalue(mgiTypeKey) + DL + \
            '0' + DL + \
            '1' + DL + \
            str(userKey) + DL + str(userKey) + DL + \
            loaddate + DL + loaddate + NL)

        seqKey = seqKey + 1
        assocKey = assocKey + 1
        accKey = accKey + 1
def processFile():
    global probeKey, refKey, aliasKey, accKey, mgiKey

    lineNum = 0

    # For each line in the input file
    for line in inputFile.readlines():
        error = 0
        lineNum = lineNum + 1

        # Split the line into tokens
        tokens = line[:-1].split('\t')

        try:
            name = tokens[0]
            jnum = tokens[1]
            parentID = tokens[2]
            sourceName = tokens[3]
            organism = tokens[4]
            strain = tokens[5]
            tissue = tokens[6]
            gender = tokens[7]
            cellLine = tokens[8]
            age = tokens[9]
            vectorType = tokens[10]
            segmentType = tokens[11]
            regionCovered = tokens[12]
            insertSite = tokens[13]
            insertSize = tokens[14]
            markerIDs = tokens[15].split('|')
            relationship = tokens[16]
            sequenceIDs = tokens[17]
            aliasList = tokens[18].split('|')
            notes = tokens[19]
            rawnotes = tokens[20]
            createdBy = tokens[21]
        except:
            exit(1, 'Invalid Line (%d): %s\n' % (lineNum, line))

        isParent = 0
        isSource = 0
        parentProbeKey = ''
        sourceKey = 0

        if parentID != '':
            isParent = 1
        if sourceName != '':
            isSource = 1

        if not isParent and not isSource:
            organismKey = sourceloadlib.verifyOrganism(organism, lineNum, errorFile)
            strainKey = sourceloadlib.verifyStrain(strain, lineNum, errorFile)
            tissueKey = sourceloadlib.verifyTissue(tissue, lineNum, errorFile)
            genderKey = sourceloadlib.verifyGender(gender, lineNum, errorFile)
            cellLineKey = sourceloadlib.verifyCellLine(cellLine, lineNum, errorFile)
            vectorKey = sourceloadlib.verifyVectorType(vectorType, lineNum, errorFile)
            segmentTypeKey = sourceloadlib.verifySegmentType(segmentType, lineNum, errorFile)
            sourceKey = sourceloadlib.verifySource(segmentTypeKey, \
                vectorKey, organismKey, strainKey, \
                tissueKey, genderKey, cellLineKey, age, lineNum, errorFile)

            if organismKey == 0 or strainKey == 0 or tissueKey == 0 or \
               genderKey == 0 or cellLineKey == 0 or vectorKey == 0 or \
               segmentTypeKey == 0 or sourceKey == 0:
                errorFile.write('%s, %s, %s, %s, %s, %s, %s, %s\n' \
                    % (segmentType, vectorType, organism, strain, tissue, gender, cellLine, age))
                error = 1

        elif not isParent and isSource:
            vectorKey = sourceloadlib.verifyVectorType(vectorType, lineNum, errorFile)
            segmentTypeKey = sourceloadlib.verifySegmentType(segmentType, lineNum, errorFile)
            sourceKey = sourceloadlib.verifyLibrary(sourceName, lineNum, errorFile)

            if vectorKey == 0 or segmentTypeKey == 0 or sourceKey == 0:
                error = 1

        # parent given = yes, source given = yes or no (ignored)
        else:
            parentProbeKey, sourceKey = verifyParentProbe(parentID, lineNum, errorFile)
            vectorKey = sourceloadlib.verifyVectorType(vectorType, lineNum, errorFile)
            segmentTypeKey = sourceloadlib.verifySegmentType(segmentType, lineNum, errorFile)

            if parentProbeKey == 0 or sourceKey == 0 or vectorKey == 0 or segmentTypeKey == 0:
                error = 1

        referenceKey = loadlib.verifyReference(jnum, lineNum, errorFile)
        createdByKey = loadlib.verifyUser(createdBy, lineNum, errorFile)

        if referenceKey == 0:
            errorFile.write('Invalid Reference: %s\n' % (jnum))
            error = 1

        if createdByKey == 0:
            errorFile.write('Invalid Creator: %s\n\n' % (createdBy))
            error = 1

        # marker IDs
        markerList = []
        for markerID in markerIDs:
            markerKey = loadlib.verifyMarker(markerID, lineNum, errorFile)
            if len(markerID) > 0 and markerKey == 0:
                errorFile.write('Invalid Marker: %s, %s\n' % (name, markerID))
                error = 1
            elif len(markerID) > 0:
                markerList.append(markerKey)

        # sequence IDs
        seqAccDict = {}
        for seqID in sequenceIDs.split('|'):
            if len(seqID) > 0:
                [logicalDB, acc] = seqID.split(':')
                logicalDBKey = loadlib.verifyLogicalDB(logicalDB, lineNum, errorFile)
                if logicalDBKey > 0:
                    seqAccDict[acc] = logicalDBKey

        # if errors, continue to the next record
        if error:
            continue

        # if no errors, process the probe
        probeFile.write('%d\t%s\t%s\t%s\t%s\t%s\t\t\t%s\t%s\t%s\t\t%s\t%s\t%s\t%s\n' \
            % (probeKey, name, parentProbeKey, sourceKey, vectorKey, segmentTypeKey,
               mgi_utils.prvalue(regionCovered), mgi_utils.prvalue(insertSite),
               mgi_utils.prvalue(insertSize), createdByKey, createdByKey, loaddate, loaddate))

        for markerKey in markerList:
            if markerList.count(markerKey) == 1:
                markerFile.write('%s\t%s\t%d\t%s\t%s\t%s\t%s\t%s\n' \
                    % (probeKey, markerKey, referenceKey, relationship,
                       createdByKey, createdByKey, loaddate, loaddate))
            else:
                errorFile.write('Invalid Marker Duplicate: %s, %s\n' % (name, markerID))

        refFile.write('%s\t%s\t%s\t0\t0\t%s\t%s\t%s\t%s\n' \
            % (refKey, probeKey, referenceKey, createdByKey, createdByKey, loaddate, loaddate))

        # aliases
        for alias in aliasList:
            if len(alias) == 0:
                continue
            aliasFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
                % (aliasKey, refKey, alias, createdByKey, createdByKey, loaddate, loaddate))
            aliasKey = aliasKey + 1

        # MGI Accession ID for the probe
        accFile.write('%s\t%s%d\t%s\t%s\t1\t%d\t%d\t0\t1\t%s\t%s\t%s\t%s\n' \
            % (accKey, mgiPrefix, mgiKey, mgiPrefix, mgiKey, probeKey, mgiTypeKey,
               createdByKey, createdByKey, loaddate, loaddate))

        # Print out a new text file and attach the new MGI Probe IDs as the last field
        newProbeFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%d\n' \
            % (name, jnum, \
               mgi_utils.prvalue(sourceName), \
               organism, \
               mgi_utils.prvalue(strain), \
               mgi_utils.prvalue(tissue), \
               mgi_utils.prvalue(gender), \
               mgi_utils.prvalue(cellLine), \
               mgi_utils.prvalue(age), \
               mgi_utils.prvalue(vectorType), \
               mgi_utils.prvalue(segmentType), \
               mgi_utils.prvalue(regionCovered), \
               mgi_utils.prvalue(insertSite), \
               mgi_utils.prvalue(insertSize), \
               '|'.join(markerIDs), \
               relationship, \
               mgi_utils.prvalue(sequenceIDs), \
               '|'.join(aliasList), \
               mgi_utils.prvalue(notes), \
               createdBy, mgiPrefix, mgiKey))

        # Print out a raw note file
        if len(rawnotes) > 0:
            rawNoteFile.write('%s%d\t%s\n' % (mgiPrefix, mgiKey, rawnotes))

        # Notes
        if len(notes) > 0:
            noteFile.write('%s\t%s\t%s\t%s\n' % (probeKey, notes, loaddate, loaddate))

        accKey = accKey + 1
        mgiKey = mgiKey + 1

        # sequence accession ids
        for acc in seqAccDict.keys():
            prefixPart, numericPart = accessionlib.split_accnum(acc)
            accFile.write('%s\t%s\t%s\t%s\t%s\t%d\t%d\t0\t1\t%s\t%s\t%s\t%s\n' \
                % (accKey, acc, prefixPart, numericPart, seqAccDict[acc], probeKey,
                   mgiTypeKey, createdByKey, createdByKey, loaddate, loaddate))
            accRefFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' \
                % (accKey, referenceKey, createdByKey, createdByKey, loaddate, loaddate))
            accKey = accKey + 1

        refKey = refKey + 1
        probeKey = probeKey + 1

    # end of "for line in inputFile.readlines():"

    #
    # Update the AccessionMax value
    #
    if not DEBUG:
        db.sql('select * from ACC_setMax (%d)' % (lineNum), None)
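
# The sequenceIDs column above packs 'logicalDB:accID' pairs separated by '|'.
# A tiny self-contained sketch of that parsing, with a made-up dictionary in
# place of loadlib.verifyLogicalDB:
def parse_seq_ids(sequenceIDs, ldbKeys={'GenBank': 9, 'RefSeq': 27}):
    seqAccDict = {}
    for seqID in sequenceIDs.split('|'):
        if len(seqID) > 0:
            logicalDB, acc = seqID.split(':')
            key = ldbKeys.get(logicalDB, 0)   # 0 means unknown logical DB
            if key > 0:
                seqAccDict[acc] = key
    return seqAccDict

print(parse_seq_ids('GenBank:AF152963|RefSeq:NM_013627'))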
def writeB6Output():
    # Purpose: parses the output line dictionary;
    #   writes to the Accession, AccessionReference & StrainMarker BCP files
    #   if there are no errors
    # Returns: 1 if error, else 0
    # Assumes: file descriptors have been initialized
    # Effects: writes to the file system
    # Throws: Nothing

    global nextSMKey, nextAccKey, totalLoadedCt, b6LoadedCt

    description = ''
    for mgiID in b6ToLoadDict:
        #print('writeB6Output mgiID: "%s"' % mgiID)
        lineList = b6ToLoadDict[mgiID]
        #print('writeB6Output lineList: %s' % lineList)
        qNameSet = set()

        # Resolve MGI ID
        if mgiID not in markerLookup:
            print('%s in MGI GFF File, but NOT IN MGI' % (mgiID))
            continue

        marker = markerLookup[mgiID]
        markerKey = marker.markerKey
        symbol = marker.symbol

        if len(lineList) == 1:
            # This is a non-BlatAlignment gene/pseudogene
            #print('This is a non-BlatAlignment gene/pseudogene and nextSMKey: %s' % nextSMKey)
            line = lineList[0]
            chr, start, end, strand, smID, mgiID, biotype, gmIdString, qName, description = parseB6Feature(line, 'f')

            fpStrainMarkerFile.write('%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s' \
                % (nextSMKey, TAB, b6StrainKey, TAB, markerKey, TAB, b6RefsKey, TAB,
                   userKey, TAB, userKey, TAB, loaddate, TAB, loaddate, CRT))

            prefixPart, numericPart = accessionlib.split_accnum(smID)
            fpAccFile.write('%s%s%s%s%s%s%s%s%s%s%s%s%s%s0%s1%s%s%s%s%s%s%s%s%s' \
                % (nextAccKey, TAB, smID, TAB, prefixPart, TAB, numericPart, TAB,
                   msgLDBKey, TAB, nextSMKey, TAB, mgiTypeKey, TAB, TAB, TAB,
                   userKey, TAB, userKey, TAB, loaddate, TAB, loaddate, CRT))

            fpAccRefFile.write('%s%s%s%s%s%s%s%s%s%s%s%s' \
                % (nextAccKey, TAB, b6RefsKey, TAB, userKey, TAB, userKey, TAB,
                   loaddate, TAB, loaddate, CRT))

            fpGmB6File.write('%s%s%s%s%s%s%s%s%s%s%s%s' \
                % (smID, TAB, chr, TAB, start, TAB, end, TAB, strand, TAB, description, CRT))

            fpBiotypeB6File.write('%s%s%s%s' % (smID, TAB, biotype, CRT))

            nextAccKey += 1
            nextSMKey += 1

            # get gmIDs from the input file for the sequence description, if
            # they exist
            # gmIDs example:
            # Dbxref=miRBase:MI0005004,ENSEMBL:ENSMUSG00000076010,NCBI_Gene:751557
            if gmIdString != '':
                gmIdList = gmIdString.split(',')

        else:
            # This is a BlatAlignment set
            #print('this is a BlatAlignment set nextSMKey: %s' % nextSMKey)

            # The first line is the feature line; the following lines are
            # BlatAlignments
            featureLine = lineList[0]
            chr, start, end, strand, smID, mgiID, biotype, gmIdString, qName, description = parseB6Feature(featureLine, 'bf')

            # the remainder of the list are blat hits - save their qNames to a set
            lineList = lineList[1:]  # remove the blat feature
            for line in lineList:
                j1, j2, j3, j4, j5, j6, j7, j8, qName, j9 = parseB6Feature(line, 'b')
                qNameSet.add(qName.strip())

            # set the qNames (GenBank IDs) in the description string
            description = description % ','.join(str(s) for s in qNameSet)

            #
            # Create the strain marker and its accession ID
            #
            fpStrainMarkerFile.write('%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s' \
                % (nextSMKey, TAB, b6StrainKey, TAB, markerKey, TAB, b6RefsKey, TAB,
                   userKey, TAB, userKey, TAB, loaddate, TAB, loaddate, CRT))

            prefixPart, numericPart = accessionlib.split_accnum(smID)
            fpAccFile.write('%s%s%s%s%s%s%s%s%s%s%s%s%s%s0%s1%s%s%s%s%s%s%s%s%s' \
                % (nextAccKey, TAB, smID, TAB, prefixPart, TAB, numericPart, TAB,
                   msgLDBKey, TAB, nextSMKey, TAB, mgiTypeKey, TAB, TAB, TAB,
                   userKey, TAB, userKey, TAB, loaddate, TAB, loaddate, CRT))

            fpAccRefFile.write('%s%s%s%s%s%s%s%s%s%s%s%s' \
                % (nextAccKey, TAB, b6RefsKey, TAB, userKey, TAB, userKey, TAB,
                   loaddate, TAB, loaddate, CRT))

            nextAccKey += 1

            fpGmB6File.write('%s%s%s%s%s%s%s%s%s%s%s%s' \
                % (smID, TAB, chr, TAB, start, TAB, end, TAB, strand, TAB, description, CRT))

            fpBiotypeB6File.write('%s%s%s%s' % (smID, TAB, biotype, CRT))

            # 6/12 GF-184: removed all associated IDs from the strain gene;
            # for blat hits in the input file, the GenBank IDs that were
            # blatted for coordinates are carried in the description instead
            nextSMKey += 1

        totalLoadedCt += 1
        b6LoadedCt += 1

    return 0