Ejemplo n.º 1
def writeAccBCP():
    """Write the accession BCP records for the current organism.

    requires:
        Module-level ``db``, ``taxId``, ``geneIDtoMarkerKey``, ``mgiTypeKey``,
        ``referenceKey``, ``userKey``, ``loaddate`` and the open output files
        ``accFile`` and ``accrefFile``.
    effects:
        Creates appropriate BCP records: one accession record per row of
        WRK_EntrezGene_Bucket0 for ``taxId`` and, for rows whose
        ``refRequired`` is 1, one accession-reference record as well.
        Advances the global ``accKey`` once per row.
    returns:
        nothing
    """
    global accKey, userKey

    # NOTE(review): the opening of the accession write call was missing from
    # this block (only its twelve-item argument tuple survived).  The target
    # handle ``accFile`` and the hard-coded "preferred = 1" column between
    # ``private`` and the created-by key are reconstructed -- confirm them
    # against the ACC_Accession column layout before relying on this.
    accFormat = '%d|%s|%s|%s|%s|%s|%s|%s|1|%s|%s|%s|%s\n'
    accRefFormat = '%d|%s|%s|%s|%s|%s\n'

    # Records that require a reference are written first, then the others.
    for refRequired in (1, 0):

        results = db.sql('select _Object_key, _LogicalDB_key, accID, private, geneID ' + \
            'from WRK_EntrezGene_Bucket0 ' + \
            'where taxID = %s and refRequired = %d' % (taxId, refRequired), 'auto')

        for r in results:

            # An object key of -1 is resolved through the geneID lookup;
            # any other value is used as is.
            if r['_Object_key'] == -1:
                objectKey = geneIDtoMarkerKey[r['geneID']]
            else:
                objectKey = r['_Object_key']

            prefixPart, numericPart = accessionlib.split_accnum(r['accID'])

            accFile.write(accFormat \
                % (accKey, r['accID'], mgi_utils.prvalue(prefixPart), mgi_utils.prvalue(numericPart), r['_LogicalDB_key'], objectKey, mgiTypeKey, r['private'], userKey, userKey, loaddate, loaddate))

            if refRequired:
                accrefFile.write(accRefFormat % (accKey, referenceKey, userKey, userKey, loaddate, loaddate))

            accKey = accKey + 1
Ejemplo n.º 2
def createBCPFile():
    """Create the accession bcp file for the GENSAT associations.

    Joins the temp table of EntrezGene IDs to ACC_Accession to find the
    marker key each ID belongs to, then writes one tab-delimited accession
    record per ID to ``fpAccBCPFile``.  IDs present in ``badIDs`` (those
    written to the discrepancy report) get no record.

    Effects: advances the global ``accKey`` once per record written and
    prints the number of associations created.
    Returns: nothing
    """
    global accKey

    print('Create the bcp file for the GENSAT associations')

    # Find the marker key that the EntrezGene ID should be associated with.
    cmds = []
    cmds.append('select t.entrezgeneID, a._Object_key as markerKey ' + \
                'from ' + tempTable + ' t, ACC_Accession a ' + \
                'where lower(t.entrezgeneID) = lower(a.accID) and ' + \
                      'a._MGIType_key = ' + str(markerMGITypeKey) + ' and ' + \
                      'a._LogicalDB_key = ' + str(egLogicalDBKey) + ' ' + \
                'order by t.entrezgeneID')

    results = db.sql(cmds, 'auto')

    count = 0

    # Write the records to the bcp file.
    for r in results[0]:
        entrezgeneID = r['entrezgeneID']
        markerKey = r['markerKey']

        # Skip the EntrezGene ID if it was written to the discrepancy report.
        # (The 'continue' was missing, which left this 'if' without a body.)
        if entrezgeneID in badIDs:
            continue

        # Get the prefix and numeric parts of the EntrezGene ID and write
        # a record to the bcp file.
        (prefixPart, numericPart) = accessionlib.split_accnum(entrezgeneID)

        fields = [
            str(accKey),
            entrezgeneID,
            prefixPart,
            str(numericPart),
            str(gensatLogicalDBKey),
            str(markerKey),
            str(markerMGITypeKey),
            PRIVATE,
            PREFERRED,
            str(createdByKey),
            str(createdByKey),
            loadDate,
            loadDate,
        ]
        fpAccBCPFile.write(TAB.join(fields) + NL)

        count = count + 1
        accKey = accKey + 1

    print('Number of GENSAT associations: ' + str(count))

Ejemplo n.º 3
def createBCPFile():
    """Create the accession bcp file for the GENSAT associations.

    Joins the temp table of EntrezGene IDs to ACC_Accession to find the
    marker key each ID belongs to, then writes one tab-delimited accession
    record per ID to ``fpAccBCPFile``.  IDs present in ``badIDs`` (those
    written to the discrepancy report) get no record.

    Effects: advances the global ``accKey`` once per record written and
    prints the number of associations created.
    Returns: nothing
    """
    global accKey

    print('Create the bcp file for the GENSAT associations')

    # Find the marker key that the EntrezGene ID should be associated with.
    cmds = []
    cmds.append('select t.entrezgeneID, a._Object_key as markerKey ' + \
                'from ' + tempTable + ' t, ACC_Accession a ' + \
                'where lower(t.entrezgeneID) = lower(a.accID) and ' + \
                      'a._MGIType_key = ' + str(markerMGITypeKey) + ' and ' + \
                      'a._LogicalDB_key = ' + str(egLogicalDBKey) + ' ' + \
                'order by t.entrezgeneID')

    results = db.sql(cmds, 'auto')

    count = 0

    # Write the records to the bcp file.
    for r in results[0]:
        entrezgeneID = r['entrezgeneID']
        markerKey = r['markerKey']

        # Skip the EntrezGene ID if it was written to the discrepancy report.
        # (The 'continue' was missing, which left this 'if' without a body.)
        if entrezgeneID in badIDs:
            continue

        # Get the prefix and numeric parts of the EntrezGene ID and write
        # a record to the bcp file.
        (prefixPart, numericPart) = accessionlib.split_accnum(entrezgeneID)

        fields = [
            str(accKey),
            entrezgeneID,
            prefixPart,
            str(numericPart),
            str(gensatLogicalDBKey),
            str(markerKey),
            str(markerMGITypeKey),
            PRIVATE,
            PREFERRED,
            str(createdByKey),
            str(createdByKey),
            loadDate,
            loadDate,
        ]
        fpAccBCPFile.write(TAB.join(fields) + NL)

        count = count + 1
        accKey = accKey + 1

    print('Number of GENSAT associations: ' + str(count))

Ejemplo n.º 4
def writeMGPOutput():
    # Purpose: writes to Accession, AccessionReference & StrainMarker
    #   BCP file and Gene Model and GM Assoc files if there are no errors
    # Returns: 1 if error, else 0
    #   (the only return visible in this block is the final 'return 0')
    # Assumes: file descriptors have been initialized
    # Effects: writes to the file system; increments totalLoadedCt,
    #   nextSMKey and nextAccKey once per strain marker object
    # Throws: Nothing
    # NOTE(review): this block does not parse as written -- two statements
    #   are incomplete (flagged below).  The code is left untouched.

    global nextSMKey, nextAccKey, mgpSkipCt, totalLoadedCt

    # for markers with >1 strain specific MGP ID
    # report and load a strain marker and a accession for each MGP ID
    strainMarkerInputList = qcDict['mgi_mgp']
    # Nesting: list of dicts keyed by strain -> list of lists of
    # strain marker objects.
    for strainMarkerInputDict in strainMarkerInputList:
        for strain in strainMarkerInputDict:
            strainMarkerObjectsList = strainMarkerInputDict[strain]
            for coordsForMarkerList in strainMarkerObjectsList:
                for strainMarkerObject in coordsForMarkerList:
                    # write out to bcp file
                    mgpID = strainMarkerObject.mgpID
                    mgiID = strainMarkerObject.markerID
                    markerKey = strainMarkerObject.markerKey
                    # NOTE(review): the argument to find() is missing; it is
                    # presumably the prefix that identifies a temporary
                    # marker ID -- restore it from the original source.
                    if mgiID.find(
                    ) == 0:  # temp ID for no marker strain marker
                        markerKey = ''

                    strainKey = strainMarkerObject.strainKey
                    chr = strainMarkerObject.chr
                    start = strainMarkerObject.start
                    end = strainMarkerObject.end
                    strand = strainMarkerObject.strand
                    description = strainMarkerObject.description
                    biotype = strainMarkerObject.biotype

                    totalLoadedCt += 1
                    # NOTE(review): the opening of a write call is missing
                    # here; the surviving arguments (nextSMKey, strainKey,
                    # markerKey, mgpRefsKey, ...) look like the strain
                    # marker record -- confirm which file handle it used.
                        '%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s' %
                        (nextSMKey, TAB, strainKey, TAB, markerKey, TAB,
                         mgpRefsKey, TAB, userKey, TAB, userKey, TAB, loaddate,
                         TAB, loaddate, CRT))

                    prefixPart, numericPart = accessionlib.split_accnum(mgpID)

                    # Accession record: the literal 0 and 1 in the format
                    # fill the two columns between mgiTypeKey and userKey.
                    fpAccFile.write('%s%s%s%s%s%s%s%s%s%s%s%s%s%s0%s1%s%s%s%s%s%s%s%s%s' \
                    % (nextAccKey, TAB, mgpID, TAB, prefixPart, TAB, numericPart, TAB, mgpLDBKey, TAB, nextSMKey, TAB, mgiTypeKey, TAB, TAB, TAB, userKey, TAB, userKey, TAB, loaddate, TAB, loaddate, CRT))
                    # Accession reference record for the same accession key.
                    fpAccRefFile.write('%s%s%s%s%s%s%s%s%s%s%s%s' \
                    % (nextAccKey, TAB, mgpRefsKey, TAB, userKey, TAB, userKey, TAB, loaddate, TAB, loaddate, CRT))
                    # Gene model record: ID, coordinates, strand, description.
                    fpGmMgpFile.write('%s%s%s%s%s%s%s%s%s%s%s%s' %
                                      (mgpID, TAB, chr, TAB, start, TAB, end,
                                       TAB, strand, TAB, description, CRT))
                    # Biotype association record for the MGP ID.
                    fpBiotypeMgpFile.write('%s%s%s%s' %
                                           (mgpID, TAB, biotype, CRT))
                    nextSMKey += 1
                    nextAccKey += 1

    return 0
Ejemplo n.º 5
def processSusceptibility():
    """Add OMIM accession IDs to terms named by RO:0003304 relationships.

    Reads the OBO file named by the ``OBO_FILE`` environment variable.  For
    every ``[Term]`` stanza whose id is an OMIM ID (other than the
    placeholder ``OMIM:000000``), each ``relationship: RO:0003304 <id>`` line
    causes the OMIM ID to be inserted into ACC_Accession for the object that
    ``loadlib.verifyObject`` returns for ``<id>``.

    Returns: 0
    Raises: KeyError if ``OBO_FILE`` is not set; IOError/OSError if the file
    cannot be opened.
    """

    # insert statement
    # (the closing quotes of this literal were missing in the original)
    INSERT_ACCESSION = '''insert into ACC_Accession 
      values ((select max(_Accession_key) + 1 from ACC_Accession), 
           '%s', '%s', %s, 15, %s, 13, 0, 0)
      '''

    omimIdValue = 'id: OMIM:'
    relValue = 'relationship: RO:0003304'
    skipValue = 'OMIM:000000'
    foundOMIM = 0
    omimId = None

    # 'with' guarantees the file is closed; the original never closed it.
    with open(os.environ['OBO_FILE'], 'r') as doFile:

        for rawLine in doFile:

            # Lines keep their trailing newline when read, so the original
            # test "line == '[Term]'" could never match and foundOMIM was
            # never reset between stanzas.  Compare without the newline.
            line = rawLine.rstrip('\n')

            # find [Term]
            # find relationship: RO:0003304

            if line == '[Term]':
                foundOMIM = 0

            elif line.startswith(omimIdValue):
                # drop the leading 'id: '
                omimId = line[4:]
                if omimId == skipValue:
                    continue
                foundOMIM = 1

            elif foundOMIM and line.startswith(relValue):

                # the related term ID is the first token after the relation
                tokens = line[len(relValue) + 1:].split(' ')
                doId = tokens[0]

                prefixPart, numericPart = accessionlib.split_accnum(omimId)
                objectKey = loadlib.verifyObject(doId, 13, None, None, None)

                # NOTE(review): the statement is built by string formatting
                # from file content; switch to bound parameters if the db
                # module supports them.
                addSQL = INSERT_ACCESSION % (omimId, prefixPart, numericPart, objectKey)
                db.sql(addSQL, None)

    return 0
Ejemplo n.º 6
def process():
    # Purpose: walks every entry of jFile['experiments']['experiment'],
    #   pulls the experiment attributes out of it and builds the
    #   tab-delimited experiment, accession, variable and property records.
    # Effects: advances the global counters and next-key values declared
    #   below as records are built.
    # NOTE(review): this block does not parse as written.  Several
    #   statements appear to be missing (try/except and else lines, list
    #   appends, 'continue', and the openings of the bcp write calls).  The
    #   notes below mark the gaps; the code itself is left untouched.
    global propertiesDict, expCount, loadedCount, inDbCount, invalidSampleCountDict
    global invalidReleaseDateDict, invalidUpdateDateDict, noIdList
    global nextExptKey, nextAccKey, nextExptVarKey, nextPropKey
    global updateExptCount

    for f in jFile['experiments']['experiment']:
        expCount += 1
        # definitions with SUPERSERIES text get different evaluation state
        # than the load default and evaluation date and evaluated by are set
        # by the load (default null)
        isSuperSeries = 0
        evalStateToUseKey = defaultEvalStateTermKey

        # NOTE(review): the lines below sit one level deeper than the loop
        # body; presumably a 'try:' opened here and the bare
        # "description = ''" further down was its except fallback -- confirm.
            # description is str.or list
            allDescription = f['description']
            description = allDescription['text']  # experiment, one

            # US108 'clean up URLs that appear in description field'. All
            # URLs that need to be cleaned up are the listType description
            # example of element in a description list with URL that we
            # need to parse:
            # {'a': {'href': 'http://lgsun.grc.nia.nih.gov/ANOVA/', 'target': '_blank', '$': 'http://lgsun.grc.nia.nih.gov/ANOVA/'}}
            if type(description) == list:
                listDescript = ''
                for d in description:
                    if type(d) == dict:
                        if 'a' in d:
                            url = d['a']['$']
                            listDescript = listDescript + url
                        # skip these: {"br":null}
                        # NOTE(review): the body of the 'br' branch and the
                        # else line for non-dict items appear to be missing.
                        elif 'br' in d:
                        listDescript = listDescript + str(d)
                description = listDescript
            description = ''
        if description == None:  #  {'text': None, 'id': None}
            description = ''

        description = str.strip(description)

        if description.find(SUPERSERIES) != -1:
            evalStateToUseKey = altEvalStateTermKey
            isSuperSeries = 1

        # NOTE(review): same pattern for each attribute read below -- an
        # indented read followed by a bare empty-value assignment, which is
        # presumably the except fallback of a missing 'try:' -- confirm.
            name = f['name']
            if type(name) == list:
                name = '|'.join(name)
            name = ''
        name = str.strip(name)

            primaryID = str.strip(f['accession'])  # accession
            primaryID = ''

            sampleCount = f['samples']  # property, one
            sampleCount = ''

            releasedate = f['releasedate']  # experiment, one
            releasedate = ''

            # experimentalfactor.name
            # list or dict
            expFactor = f['experimentalfactor']
            if type(expFactor) == dict:
                expFactorList = [expFactor]
                expFactorList = expFactor
            expFactorSet = set()
            # NOTE(review): the loop body that fills expFactorSet is missing.
            for e in expFactorList:  #property, many stored individ.
                # weed out dups
            expFactorList = list(expFactorSet)
            expFactorList = []

            lastupdatedate = f['lastupdatedate']  # experiment, one
            lastupdatedate = ''

            # provider.contact, dictionary or list of dictionaries; need
            # to remove exact dups
            providerList = []
            if type(f['provider']) != list:
                providerList = [f['provider']['contact']]
                # NOTE(review): an else line and the statement that collects
                # each non-null contact appear to be missing below.
                for p in f['provider']:
                    if p['contact'] != None:
            providerSet = set(providerList)
            providerList = list(providerSet)
            providerList = []

            # experimenttype is str.or list, property,
            # many stored individ
            if type(f['experimenttype']) != list:
                experimenttypeList = [f['experimenttype']]
                experimenttypeList = f['experimenttype']
            experimenttypeList = []

        # pick first valid experiment type and translate it to populate the
        # exptype key
        # NOTE(review): as written the loop keeps the LAST translatable type;
        # a 'break' may be missing -- confirm.
        exptTypeKey = 0
        for exp in experimenttypeList:
            if exp in exptTypeTransDict:
                exptTypeKey = exptTypeTransDict[exp]
        if exptTypeKey == 0:
            exptTypeKey = exptTypeNRKey  # Not Resolved

            # PubMed IDs - bibliography.accession
            # TR13116/check for duplicate pubmedids
            # NOTE(review): the statements that add an accession to
            # bibliographyList are missing from both branches below.
            bibliographyList = []
            if type(f['bibliography']) == dict:  # dictionary
                if str(f['bibliography']['accession']) not in bibliographyList:
            else:  # ListType
                for b in f['bibliography']:  # for each dict in the list
                    if 'accession' in b:
                        if str(b['accession']) not in bibliographyList:
            bibliographyList = []

        # the template for properties:
        # '#=#', '#==#', '#===#', '#====#' and '#=====#' are placeholders
        # that are substituted with str.replace() when a property row is
        # built further down.
        propertyTemplate = "#====#%s%s%s#=#%s%s%s%s%s#==#%s#===#%s%s%s%s%s%s%s%s%s" % (
            TAB, propTypeKey, TAB, TAB, nextExptKey, TAB, mgiTypeKey, TAB, TAB,
            TAB, userKey, TAB, userKey, TAB, loadDate, TAB, loadDate, CRT)
        propertyUpdateTemplate = "#====#%s%s%s#=#%s#=====#%s%s%s#==#%s#===#%s%s%s%s%s%s%s%s%s" % (
            TAB, propTypeKey, TAB, TAB, TAB, mgiTypeKey, TAB, TAB, TAB,
            userKey, TAB, userKey, TAB, loadDate, TAB, loadDate, CRT)
        # update pubmed ID properties, if this ID already in the database
        if primaryID in primaryIdDict:
            inDbCount += 1

            # not all experiments have pubmed IDs
            if primaryID in pubMedByExptDict:
                # get the list of pubmed Ids for this expt in the database
                dbBibList = pubMedByExptDict[primaryID]

                # get the set of incoming pubmed IDs not in the database
                newSet = set(bibliographyList).difference(set(dbBibList))

                # if we have new pubmed IDs, add them to the database
                if newSet:
                    updateExpKey = primaryIdDict[primaryID]

                    # get next sequenceNum for this expt's pubmed ID
                    # in the database
                    # NOTE(review): the db.sql call below is never closed,
                    # and the replace() chain and write for each new pubmed
                    # ID in the loop after it are truncated.
                    results = db.sql(
                        '''select max(sequenceNum) + 1 
                        as nextNum
                        from MGI_Property p
                        where p._Object_key =  %s
                        and p._PropertyTerm_key = 20475430
                        and p._PropertyType_key = 1002''' % updateExpKey,

                    nextSeqNum = results[0]['nextNum']
                    if newSet:
                        updateExptCount += 1
                    for b in newSet:
                        toLoad = propertyUpdateTemplate.replace(
                            '#=#', str(pubmedPropKey)).replace(
                        nextPropKey += 1
            # continue so we don't dup what is in the db
            # NOTE(review): the 'continue' this comment refers to is missing,
            # as are the argument of split_accnum( and the rest of the
            # doQcChecks condition below.
        prefixPartPrimary, numericPartPrimary = accessionlib.split_accnum(
        # Do QC checks
        # If there are errors, skip to the next experiment
        if doQcChecks(primaryID, name, sampleCount, releasedate,

        # calculate secondary GEO ID for AE GEO IDs
        geoID = calculateGeoId(primaryID)

        # now write out to bcp files
        loadedCount += 1

        # GXD_Experiment
        # many optional nulls - create the insert string
        # NOTE(review): each optional column below shows both the "value"
        # and the "empty" append with no else between them; the else lines
        # are presumably missing -- confirm.
        line = '%s%s%s%s' % (nextExptKey, TAB, sourceKey, TAB)
        if name != '':
            line = line + name + TAB

        if description != '' and description != None:
            line = line + description + TAB
            line = line + TAB

        if releasedate != '':
            line = line + releasedate + TAB
            line = line + TAB

        if lastupdatedate != '':
            line = line + lastupdatedate + TAB
            line = line + TAB

        # evaluated data is today
        if isSuperSeries:
            line = line + loadDate + TAB
            # evaluated_date is null
            line = line + TAB

        line = line + str(evalStateToUseKey) + TAB

        if isSuperSeries:
            line = line + str(altCurStateTermKey) + TAB
            line = line + str(curStateTermKey) + TAB

        line = line + str(studyTypeTermKey) + TAB
        line = line + str(exptTypeKey) + TAB

        # evalByKey  is null unless isSuperSeries is true then
        # it is load user
        if isSuperSeries:
            line = line + str(userKey) + TAB
            line = line + TAB

        # initialCurByKey, lastCurByKey, initialCurDate, lastCurDate
        # all null
        line = line + TAB + TAB + TAB + TAB

        # created and modified by
        line = line + str(userKey) + TAB + str(userKey) + TAB

        # creation and modification date
        line = line + loadDate + TAB + loadDate + CRT


        # Primary Accession
        # NOTE(review): the statement that writes 'line' and the opening of
        # the accession write call are missing; only the format and its
        # arguments survive.
            '%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s' %
            (nextAccKey, TAB, primaryID, TAB, prefixPartPrimary, TAB,
             numericPartPrimary, TAB, aeLdbKey, TAB, nextExptKey, TAB,
             mgiTypeKey, TAB, private, TAB, isPreferred, TAB, userKey, TAB,
             userKey, TAB, loadDate, TAB, loadDate, CRT))
        nextAccKey += 1

        # Secondary Accession
        # NOTE(review): the argument of split_accnum( and the opening of the
        # write call are missing here as well.
        if geoID != '':
            prefixPartSecondary, numericPartSecondary = accessionlib.split_accnum(
                '%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s' %
                (nextAccKey, TAB, geoID, TAB, prefixPartSecondary, TAB,
                 numericPartSecondary, TAB, geoLdbKey, TAB, nextExptKey, TAB,
                 mgiTypeKey, TAB, private, TAB, notPreferred, TAB, userKey,
                 TAB, userKey, TAB, loadDate, TAB, loadDate, CRT))
            nextAccKey += 1

        # Variable
        # NOTE(review): opening of the write call is missing.
            '%s%s%s%s%s%s' %
            (nextExptVarKey, TAB, nextExptKey, TAB, exptVariableTermKey, CRT))
        nextExptVarKey += 1

        # Properties

        # name (0,1, pipe-delim)
        # sampleCount (0,1)
        # expFactorList (0-n)
        # providerList (0-n)
        # experimenttypeList (0-n)
        # bibliographyList (0-n)
        # propName, value and sequenceNum to be filled in later
        # NOTE(review): every section below builds 'toLoad' but no write of
        # it is visible; the write statements are presumably missing.
        if name != '':
            toLoad = propertyTemplate.replace('#=#', str(namePropKey)).replace(
                '#==#', name).replace('#===#',
                                      '1').replace('#====#', str(nextPropKey))
            nextPropKey += 1

        if sampleCount != '':
            toLoad = propertyTemplate.replace(
                '#=#', str(sampleCountPropKey)).replace(
                    '#==#', str(sampleCount)).replace('#===#', '1').replace(
                        '#====#', str(nextPropKey))
            nextPropKey += 1

        # multi-valued properties restart their sequence number at 1
        seqNumCt = 1
        for e in expFactorList:
            toLoad = propertyTemplate.replace(
                '#=#', str(expFactorPropKey)).replace('#==#', e).replace(
                    '#===#', str(seqNumCt)).replace('#====#', str(nextPropKey))
            seqNumCt += 1
            nextPropKey += 1

        seqNumCt = 1
        for p in providerList:
            toLoad = propertyTemplate.replace(
                '#=#', str(contactNamePropKey)).replace('#==#', p).replace(
                    '#===#', str(seqNumCt)).replace('#====#', str(nextPropKey))
            seqNumCt += 1
            nextPropKey += 1

        seqNumCt = 1
        for e in experimenttypeList:
            toLoad = propertyTemplate.replace(
                '#=#', str(expTypePropKey)).replace('#==#', e).replace(
                    '#===#', str(seqNumCt)).replace('#====#', str(nextPropKey))
            seqNumCt += 1
            nextPropKey += 1

        seqNumCt = 1
        for b in bibliographyList:
            toLoad = propertyTemplate.replace(
                '#=#', str(pubmedPropKey)).replace('#==#', str(b)).replace(
                    '#===#', str(seqNumCt)).replace('#====#', str(nextPropKey))
            seqNumCt += 1
            nextPropKey += 1

        nextExptKey += 1

Ejemplo n.º 7
def processFile():
    """Load every primer record of the input file into the BCP files.

    Each input line carries twelve tab-delimited fields: marker symbol,
    marker IDs ('|'-delimited), name, J number, region covered, sequence 1,
    sequence 2, product size, notes, sequence IDs ('|'-delimited), aliases
    ('|'-delimited) and created-by login.  For every line that passes
    verification the primer, marker, reference, alias, accession,
    accession-reference and note records are written and the global key
    counters are advanced.

    A line with fewer than twelve fields is reported through ``exit``.
    A line with an invalid marker is reported to ``errorFile`` and skipped.
    After the last line ``ACC_setMax`` is called with the number of lines
    read, unless ``DEBUG`` is set.
    """

    global primerKey, refKey, aliasKey, accKey, mgiKey

    lineNum = 0

    # For each line in the input file
    for lineNum, line in enumerate(inputFile, 1):

        error = 0

        # Split the line into tokens
        tokens = line.rstrip('\n').split('\t')

        try:
            markerSymbol = tokens[0]    # only echoed to newPrimerFile
            markerIDs = tokens[1].split('|')
            name = tokens[2]
            jnum = tokens[3]
            regionCovered = tokens[4]
            sequence1 = tokens[5]
            sequence2 = tokens[6]
            productSize = tokens[7]
            notes = tokens[8]
            sequenceIDs = tokens[9]
            aliasList = tokens[10].split('|')
            createdBy = tokens[11]
        except IndexError:
            exit(1, 'Invalid Line (%d): %s\n' % (lineNum, line))

        # marker IDs

        markerList = []
        for markerID in markerIDs:

            markerKey = loadlib.verifyMarker(markerID, lineNum, errorFile)

            if len(markerID) > 0 and markerKey == 0:
                errorFile.write('Invalid Marker:  %s, %s\n' % (name, markerID))
                error = 1
            elif len(markerID) > 0:
                markerList.append(markerKey)

        referenceKey = loadlib.verifyReference(jnum, lineNum, errorFile)
        createdByKey = loadlib.verifyUser(createdBy, lineNum, errorFile)

        # sequence IDs
        seqAccList = sequenceIDs.split('|')

        # if errors, continue to next record
        if error:
            continue

        # if no errors, process the primer

        primerFile.write('%d\t%s\t\t%d\t%d\t%s\t%s\t%s\t%s\t\t\t%s\t%s\t%s\t%s\t%s\n' \
            % (primerKey, name, NA, vectorKey, segmentTypeKey, mgi_utils.prvalue(sequence1), \
            mgi_utils.prvalue(sequence2), mgi_utils.prvalue(regionCovered), mgi_utils.prvalue(productSize), \
            createdByKey, createdByKey, loaddate, loaddate))

        # A marker listed more than once gets no association at all; it is
        # reported once per occurrence instead.
        # NOTE(review): markerID here is whatever the verification loop left
        # behind (the last ID on the line), not necessarily the duplicate.
        for markerKey in markerList:
            if markerList.count(markerKey) == 1:
                markerFile.write('%s\t%s\t%d\t%s\t%s\t%s\t%s\t%s\n' \
                    % (primerKey, markerKey, referenceKey, relationship, createdByKey, createdByKey, loaddate, loaddate))
            else:
                errorFile.write('Invalid Marker Duplicate:  %s, %s\n' % (name, markerID))

        refFile.write('%s\t%s\t%s\t0\t0\t%s\t%s\t%s\t%s\n' % (refKey, primerKey, referenceKey, createdByKey, createdByKey, loaddate, loaddate))

        # aliases

        for alias in aliasList:
            if len(alias) == 0:
                continue
            aliasFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
                    % (aliasKey, refKey, alias, createdByKey, createdByKey, loaddate, loaddate))
            aliasKey = aliasKey + 1

        # MGI Accession ID for the marker

        accFile.write('%s\t%s%d\t%s\t%s\t1\t%d\t%d\t0\t1\t%s\t%s\t%s\t%s\n' \
            % (accKey, mgiPrefix, mgiKey, mgiPrefix, mgiKey, primerKey, mgiTypeKey, createdByKey, createdByKey, loaddate, loaddate))

        # echo the input record with the newly assigned MGI ID appended
        newPrimerFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%d\n' \
            % (markerSymbol, '|'.join(markerIDs), name, jnum, regionCovered, sequence1, sequence2, productSize, notes, sequenceIDs, createdBy, mgiPrefix, mgiKey))

        accKey = accKey + 1
        mgiKey = mgiKey + 1

        # sequence accession ids
        for acc in seqAccList:

            if len(acc) == 0:
                continue

            prefixPart, numericPart = accessionlib.split_accnum(acc)
            accFile.write('%s\t%s\t%s\t%s\t%s\t%d\t%d\t0\t1\t%s\t%s\t%s\t%s\n' \
                % (accKey, acc, prefixPart, numericPart, logicalDBKey, primerKey, mgiTypeKey, createdByKey, createdByKey, loaddate, loaddate))
            accRefFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' \
                % (accKey, referenceKey, createdByKey, createdByKey, loaddate, loaddate))
            accKey = accKey + 1

        # notes

        if len(notes) > 0:
            noteFile.write('%s|1\t%s\t%s\t%s\n' \
                % (primerKey, notes, loaddate, loaddate))

        refKey = refKey + 1
        primerKey = primerKey + 1

    #   end of "for line in inputFile"

    # Update the AccessionMax value

    if not DEBUG:
        db.sql('select * from ACC_setMax (%d)' % (lineNum), None)
Ejemplo n.º 8
def processFile():
    """Load every primer record of the input file into the BCP files.

    Each input line carries twelve tab-delimited fields: marker symbol,
    marker IDs ('|'-delimited), name, J number, region covered, sequence 1,
    sequence 2, product size, notes, sequence IDs ('|'-delimited), aliases
    ('|'-delimited) and created-by login.  For every line that passes
    verification the primer, marker, reference, alias, accession,
    accession-reference and note records are written and the global key
    counters are advanced.

    A line with fewer than twelve fields is reported through ``exit``.
    A line with an invalid marker is reported to ``errorFile`` and skipped.
    After the last line ``ACC_setMax`` is called with the number of lines
    read, unless ``DEBUG`` is set.
    """

    global primerKey, refKey, aliasKey, accKey, mgiKey

    lineNum = 0

    # For each line in the input file
    for lineNum, line in enumerate(inputFile, 1):

        error = 0

        # Split the line into tokens
        tokens = line.rstrip('\n').split('\t')

        try:
            markerSymbol = tokens[0]  # only echoed to newPrimerFile
            markerIDs = tokens[1].split('|')
            name = tokens[2]
            jnum = tokens[3]
            regionCovered = tokens[4]
            sequence1 = tokens[5]
            sequence2 = tokens[6]
            productSize = tokens[7]
            notes = tokens[8]
            sequenceIDs = tokens[9]
            aliasList = tokens[10].split('|')
            createdBy = tokens[11]
        except IndexError:
            exit(1, 'Invalid Line (%d): %s\n' % (lineNum, line))

        # marker IDs

        markerList = []
        for markerID in markerIDs:

            markerKey = loadlib.verifyMarker(markerID, lineNum, errorFile)

            if len(markerID) > 0 and markerKey == 0:
                errorFile.write('Invalid Marker:  %s, %s\n' % (name, markerID))
                error = 1
            elif len(markerID) > 0:
                markerList.append(markerKey)

        referenceKey = loadlib.verifyReference(jnum, lineNum, errorFile)
        createdByKey = loadlib.verifyUser(createdBy, lineNum, errorFile)

        # sequence IDs
        seqAccList = sequenceIDs.split('|')

        # if errors, continue to next record
        if error:
            continue

        # if no errors, process the primer

        primerFile.write('%d\t%s\t\t%d\t%d\t%s\t%s\t%s\t%s\t\t\t%s\t%s\t%s\t%s\t%s\n' \
            % (primerKey, name, NA, vectorKey, segmentTypeKey, mgi_utils.prvalue(sequence1), \
            mgi_utils.prvalue(sequence2), mgi_utils.prvalue(regionCovered), mgi_utils.prvalue(productSize), \
            createdByKey, createdByKey, loaddate, loaddate))

        # A marker listed more than once gets no association at all; it is
        # reported once per occurrence instead.
        # NOTE(review): markerID here is whatever the verification loop left
        # behind (the last ID on the line), not necessarily the duplicate.
        for markerKey in markerList:
            if markerList.count(markerKey) == 1:
                markerFile.write('%s\t%s\t%d\t%s\t%s\t%s\t%s\t%s\n' \
                    % (primerKey, markerKey, referenceKey, relationship, createdByKey, createdByKey, loaddate, loaddate))
            else:
                errorFile.write('Invalid Marker Duplicate:  %s, %s\n' %
                                (name, markerID))

        refFile.write('%s\t%s\t%s\t0\t0\t%s\t%s\t%s\t%s\n' %
                      (refKey, primerKey, referenceKey, createdByKey,
                       createdByKey, loaddate, loaddate))

        # aliases

        for alias in aliasList:
            if len(alias) == 0:
                continue
            aliasFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
                    % (aliasKey, refKey, alias, createdByKey, createdByKey, loaddate, loaddate))
            aliasKey = aliasKey + 1

        # MGI Accession ID for the marker

        accFile.write('%s\t%s%d\t%s\t%s\t1\t%d\t%d\t0\t1\t%s\t%s\t%s\t%s\n' \
            % (accKey, mgiPrefix, mgiKey, mgiPrefix, mgiKey, primerKey, mgiTypeKey, createdByKey, createdByKey, loaddate, loaddate))

        # echo the input record with the newly assigned MGI ID appended
        newPrimerFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%d\n' \
            % (markerSymbol, '|'.join(markerIDs), name, jnum, regionCovered, sequence1, sequence2, productSize, notes, sequenceIDs, createdBy, mgiPrefix, mgiKey))

        accKey = accKey + 1
        mgiKey = mgiKey + 1

        # sequence accession ids
        for acc in seqAccList:

            if len(acc) == 0:
                continue

            prefixPart, numericPart = accessionlib.split_accnum(acc)
            accFile.write('%s\t%s\t%s\t%s\t%s\t%d\t%d\t0\t1\t%s\t%s\t%s\t%s\n' \
                % (accKey, acc, prefixPart, numericPart, logicalDBKey, primerKey, mgiTypeKey, createdByKey, createdByKey, loaddate, loaddate))
            accRefFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' \
                % (accKey, referenceKey, createdByKey, createdByKey, loaddate, loaddate))
            accKey = accKey + 1

        # notes

        if len(notes) > 0:
            noteFile.write('%s|1\t%s\t%s\t%s\n' \
                % (primerKey, notes, loaddate, loaddate))

        refKey = refKey + 1
        primerKey = primerKey + 1

    #   end of "for line in inputFile"

    # Update the AccessionMax value

    if not DEBUG:
        db.sql('select * from ACC_setMax (%d)' % (lineNum), None)
Ejemplo n.º 9
def process(expFile):
    # Purpose: stream-parse the XML file 'expFile' with ET.iterparse. Values
    #   for the current experiment are collected from level-4 elements; when
    #   a DocumentSummary element ends, the experiment is either treated as
    #   already in the database (expID in primaryIdDict) or evaluated as a
    #   new experiment, and the per-record variables are reset.
    # Returns: 0 at the end of the function; 1 directly after printing
    #   'Parsing error'
    # NOTE(review): this listing appears to have lost a number of statements
    #   (an enclosing statement for the main loop, several 'else:' branches,
    #   the opening lines of some write/replace calls). The gaps are flagged
    #   inline below; confirm against the original script before relying on
    #   any of this logic.
    global expCount, exptLoadedCount, updateExptCount, updateExptList
    global nextExptKey, nextAccKey, nextExptVarKey, nextPropKey
    global expSkippedNotInDbTransIsSuperseriesSet, expSkippedNoSampleList
    global expIdsInDbSet, expLoadedNoSampleList, expIdsInDbNoSamplesSet
    global expSkippedNotInDbNoTransSet, expSkippedMaxSamplesSet

    # undecodable bytes are replaced rather than raising (errors='replace')
    f = open(expFile, encoding='utf-8', errors='replace')
    context = ET.iterparse(f, events=("start", "end"))
    context = iter(context)

    # per-experiment state, reset at the end of each DocumentSummary
    level = 0
    expID = ''
    title = ''
    summary = ''
    pdat = ''
    gdsType = ''
    exptType = ''
    n_samples = ''
    pubmedList = []
    sampleList = []  # list of sampleIDs

    isSuperSeries = 'no'  # flag to indicate expt is superseries, skip
    exptTypeKey = 0  # if 0 chosen gdstype did not translate, skip
    isTpr = 0
    # NOTE(review): the loop below is indented one level deeper than the
    # statements above; the statement that opened this level (presumably
    # 'try:') is not present in this listing.
        for event, elem in context:
            # end of a record - reset everything
            if event == 'end' and elem.tag == 'DocumentSummary':
                expCount += 1
                skip = 0
                #print('\n\nexpID: %s' % expID)

                # Experiment is in the database
                # add new pubmed ids
                # add raw sample data to those curated experiments that do not have it
                if expID in primaryIdDict:
                    updateExpKey = primaryIdDict[expID]
                    # check for additional pubmed IDs
                    # the '#...#' markers are placeholders filled in later
                    # with str.replace
                    propertyUpdateTemplate = "#====#%s%s%s#=#%s#=====#%s%s%s#==#%s#===#%s%s%s%s%s%s%s%s%s" % (
                        TAB, propTypeKey, TAB, TAB, TAB, exptMgiTypeKey, TAB,
                        TAB, TAB, userKey, TAB, userKey, TAB, loadDate, TAB,
                        loadDate, CRT)
                    # existing experiments are never created again below
                    skip = 1
                    #print('    expIdInDb skip')

                    # not all experiments have pubmed IDs in the database
                    # assigning empty list assures we pick up this case
                    dbBibList = []

                    if expID in pubMedByExptDict:
                        # get the list of pubmed Ids for this expt in the database
                        dbBibList = pubMedByExptDict[expID]

                    # get the set of incoming pubmed IDs not in the database
                    newSet = set(pubmedList).difference(set(dbBibList))

                    # if we have new pubmed IDs, add them to the database
                    if newSet:
                        #print('found new pubmed ids: %s' % newSet)

                        # get next sequenceNum for this expt's pubmed ID
                        # in the database

                        # get the next property sequence number
                        results = db.sql(
                            '''select max(sequenceNum) + 1
                        as nextNum
                        from MGI_Property p
                        where p._Object_key =  %s
                        and p._PropertyTerm_key = 20475430
                        and p._PropertyType_key = 1002''' % updateExpKey,
                        # NOTE(review): the db.sql( call above is never
                        # closed in this listing; its final argument and
                        # ')' seem to be missing.
                        nextSeqNum = results[0]['nextNum']

                        # max() yields no value when the experiment has no
                        # such property rows yet, so numbering starts at 1
                        if nextSeqNum == None:
                            nextSeqNum = 1

                        updateExptCount += 1

                        for b in newSet:
                            toLoad = propertyUpdateTemplate.replace(
                                '#=#', str(pubmedPropKey)).replace(
                                    '#==#', str(b)).replace(
                                        '#===#', str(nextSeqNum)).replace(
                                                '#=====#', str(updateExpKey))
                            # NOTE(review): toLoad is built but not written
                            # anywhere, the '#====#' placeholder is not
                            # replaced and nextSeqNum is not advanced in the
                            # visible code - lines are presumably missing.
                            nextPropKey += 1

                    # if there's no raw sample data for the existing experiment, add it
                    if expID in curatedGeoNoRawSampleDict:
                        ret = processSamples(expID,
                                             'true')  # a list of sample info

                        # 1 means there was no sample file or 2 means there was
                        # a parsing error
                        if ret == 1 or ret == 2:
                            print('expt inDb returnCode for %s: %s' %
                                  (expID, ret))
                            # NOTE(review): as shown, processSampleBcp runs
                            # when ret is the error code 1 or 2; an 'else:'
                            # before the call below is presumably missing.
                            #print('expt inDb adding samples for key: %s expID: %s sampleInfo:%s' % (updateExpKey, expID, ret))
                            # GXD_HTRawSample and MGI_KeyValue BCP for
                            # curated experiments (no raw sample data)
                            processSampleBcp(ret, updateExpKey)

                # gdsType is a ';'-separated list of raw experiment types
                typeList = list(map(str.strip, gdsType.split(';')))

                if skip != 1:
                    (exptTypeKey, exptType) = processExperimentType(typeList)
                    if exptTypeKey == 0:
                        # expts whose type doesn't translate and is not already in the db
                        skip = 1
                        #print('    expIdNotInDbNoTrans skip')

                if skip != 1 and isSuperSeries == 'yes':
                    # number of superseries not already caught because of un translated
                    # exptType or already in DB
                    skip = 1

                # experiments with more samples than maxSamples are skipped
                if skip != 1 and int(n_samples) > int(maxSamples):
                    skip = 1
                #print('exptTypeKey: %s isSuperSeries: %s skip: %s' % (exptTypeKey, isSuperSeries, skip))
                if skip != 1:
                    exptLoadedCount += 1
                    createExpObject = 0
                    # now process the samples
                    ret = processSamples(expID, 'false')
                    #print('ret: %s' % ret)
                    if ret == 1:
                        expLoadedNoSampleList.append('expID: %s' % (expID))
                        createExpObject = 1
                    elif ret == 2:
                        expSkippedNoSampleList.append('expID: %s' % (expID))
                        exptLoadedCount -= 1  # decrement the loaded count
                        # NOTE(review): an 'else:' is presumably missing
                        # here; as shown, the error code 2 is stored as the
                        # sample list and the experiment is still created.
                        sampleList = ret  #  list of sampleString's representing each
                        #  sample for the current experiment
                        createExpObject = 1
                    if createExpObject:
                        # catenate the global overallDesign parsed from the sample to the
                        # experiment summary
                        description = '%s %s' % (summary, overallDesign)
                        # tabs and newlines are replaced by spaces
                        description = description.replace('\t', ' ')
                        description = description.replace('\n', ' ')

                        if runParsingReports == 'true':
                                # NOTE(review): the call that consumes this
                                # formatted report line is missing from the
                                # listing.
                                '%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s' %
                                (expID, TAB, ', '.join(sampleList), TAB, title,
                                 TAB, description, TAB, isSuperSeries, TAB,
                                 pdat, TAB, exptType, TAB, n_samples, TAB,
                                 ', '.join(pubmedList), CRT))

                        # GXD_HTExperiment BCP
                        # NOTE(review): 'line' is built but not written in
                        # the visible code.

                        line = '%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s' % (
                            nextExptKey, TAB, sourceKey, TAB, title, TAB,
                            description, TAB, pdat, TAB, releasedate, TAB,
                            evalDate, TAB, evalStateTermKey, TAB,
                            curStateTermKey, TAB, studyTypeTermKey, TAB,
                            exptTypeKey, TAB, evalByKey, TAB, initCurByKey,
                            TAB, lastCurByKey, TAB, initCurDate, TAB,
                            lastCurDate, TAB, confidence, TAB, userKey, TAB,
                            userKey, TAB, loadDate, TAB, loadDate, CRT)
                        #print('line: %s' % line)

                        # GXD_HTVariable BCP
                        fpVariableBcp.write('%s%s%s%s%s%s' %
                                            (nextExptVarKey, TAB, nextExptKey,
                                             TAB, exptVariableTermKey, CRT))
                        nextExptVarKey += 1

                        # ACC_Accession BCP
                        # NOTE(review): the argument of split_accnum( and
                        # the write call with its format string are missing
                        # between the next two lines.
                        prefixPart, numericPart = accessionlib.split_accnum(
                            % (nextAccKey, TAB, expID, TAB, prefixPart, TAB,
                               numericPart, TAB, geoLdbKey, TAB, nextExptKey,
                               TAB, exptMgiTypeKey, TAB, private, TAB,
                               isPreferred, TAB, userKey, TAB, userKey, TAB,
                               loadDate, TAB, loadDate, CRT))
                        nextAccKey += 1

                        # Experiment Properties

                        # title (1) experiment name
                        #   namePropKey = 20475428
                        # n_samples (1) count of samples
                        #   sampleCountPropKey = 20475424
                        # typeList (1-n) raw experiment types
                        #   expTypePropKey = 20475425
                        # pubmedList (0-n) pubmed Ids
                        #   pubmedPropKey = 20475430
                        # description (1) sample overalldesign + expt summary
                        #   descriptionPropKey = 87508020

                        # the template for properties:
                        # '#====#', '#=#', '#==#' and '#===#' are placeholders
                        # substituted per property with str.replace
                        propertyTemplate = "#====#%s%s%s#=#%s%s%s%s%s#==#%s#===#%s%s%s%s%s%s%s%s%s" % (
                            TAB, propTypeKey, TAB, TAB, nextExptKey, TAB,
                            exptMgiTypeKey, TAB, TAB, TAB, userKey, TAB,
                            userKey, TAB, loadDate, TAB, loadDate, CRT)

                        # NOTE(review): in each property block below toLoad
                        # is never written in the visible code, and several
                        # .replace( chains are truncated (e.g. replace(title)
                        # has a single argument).
                        if title != '':
                            toLoad = propertyTemplate.replace(
                                '#=#', str(namePropKey)).replace(
                                    title).replace('#===#', '1').replace(
                                        '#====#', str(nextPropKey))
                            nextPropKey += 1

                        if n_samples != '':
                            toLoad = propertyTemplate.replace(
                                '#=#', str(sampleCountPropKey)).replace(
                                    '#==#', str(n_samples)).replace(
                            nextPropKey += 1

                        # one property per raw experiment type, numbered
                        # from 1
                        seqNumCt = 1
                        for e in typeList:
                            toLoad = propertyTemplate.replace(
                                '#=#', str(expTypePropKey)).replace(
                                    '#==#', e).replace('#===#',
                            seqNumCt += 1
                            nextPropKey += 1

                        # NOTE(review): seqNumCt is not reset before this
                        # loop in the visible code - confirm whether pubmed
                        # properties should restart at 1.
                        for b in pubmedList:
                            toLoad = propertyTemplate.replace(
                                '#=#', str(pubmedPropKey)).replace(
                            seqNumCt += 1
                            nextPropKey += 1

                        # NOTE(review): this block repeats the title
                        # property above; given the property list in the
                        # comment, it presumably was the description
                        # property (descriptionPropKey) - confirm.
                        if title != '':
                            toLoad = propertyTemplate.replace(
                                '#=#', str(namePropKey)).replace(
                                    title).replace('#===#', '1').replace(
                                        '#====#', str(nextPropKey))
                            nextPropKey += 1

                        # GXD_HTRawSample and MGI_KeyValue BCP
                        # ret from processSample = 1 means there was no sample file
                        # so experiment is created, but no samples
                        if ret != 1:
                            processSampleBcp(sampleList, nextExptKey)

                        # now increment the experiment key
                        nextExptKey += 1
                # reset the per-record state for the next DocumentSummary
                # (expID is not reset here; it is reassigned when the next
                # level-4 Accession element is seen)
                title = ''
                summary = ''
                isSuperSeries = 'no'
                pdat = ''
                gdsType = ''
                n_samples = ''
                pubmedList = []
                sampleList = []
                exptTypeKey = 0

            # level-4 elements carry the experiment attributes
            if level == 4:
                # Accession tag at level 4 tells us we have a new record
                if elem.tag == 'Accession':
                    expID = elem.text
                elif elem.tag == 'title':
                    title = elem.text
                elif elem.tag == 'summary':
                    summary = elem.text
                    # a summary containing the SUPERSERIES text marks the
                    # experiment as a superseries
                    if summary.find(SUPERSERIES) != -1:
                        isSuperSeries = 'yes'
                        #print('isSuperSeries: %s' % expID)
                elif elem.tag == 'PDAT':
                    pdat = elem.text
                elif elem.tag == 'gdsType':
                    gdsType = elem.text
                elif elem.tag == 'n_samples':
                    n_samples = elem.text
            # 'start' events increase the nesting level, 'end' events
            # decrease it
            if event == 'start':
                level += 1
                #print('level: %s elemTag: %s elemText: %s' % (level, elem.tag, elem.text))
            # NOTE(review): the two 'elif' branches below are incomplete in
            # this listing (the second has no body, and the surrounding
            # 'if' they belonged to is unclear); nothing visible uses 'id'.
            elif elem.tag == 'int':
                id = elem.text
                #print('id: %s' % id)
            elif level == 6 and elem.tag == 'Accession':
            if event == 'end':
                level -= 1

        # NOTE(review): the two lines below presumably belonged to an
        # 'except' clause whose header is missing from this listing.
        print('Parsing error: on %s' % expFile)
        return 1
    return 0
Ejemplo n.º 10
def process():
    """
    Query the database to determine if dummy sequences need to be created
    and generate the appropriate BCP files.

    Four temporary tables are built, holding the accession IDs of mouse and
    non-mouse molecular segments and markers that have no matching Sequence
    accession (_MGIType_key = 19). They are unioned into the temporary table
    'allaccs'. For every row of 'allaccs' one record is written to each of
    seqFile, rawFile, sourceFile and accFile, and the global keys seqKey,
    assocKey and accKey are advanced.

    Reads the module-level names db, seqFile, rawFile, sourceFile, accFile,
    mouseSourceKey, nonmouseSourceKey, statusKey, mgiTypeKey, userKey,
    loaddate, notLoaded, DL and NL.

    Returns nothing.
    """

    global seqKey, assocKey, accKey

    # generate table of all mouse molecular segments Acc IDs whose GenBank
    # SeqIDs are not represented as Sequence objects.

    db.sql("""select a.accID, a._LogicalDB_key, ps._Organism_key
        INTO TEMPORARY TABLE probeaccs1
        from ACC_Accession a, PRB_Probe p, PRB_Source ps
        where a._MGIType_key = 3
        and a._LogicalDB_key = 9
        and a._Object_key = p._Probe_key
        and p._Source_key = ps._Source_key
        and ps._Organism_key = 1
        and not exists (select 1 from ACC_Accession s
            where s._MGIType_key = 19
            and s._LogicalDB_key = a._LogicalDB_key
            and lower(s.accID) = lower(a.accID)
        )""", None)

    # generate table of all mouse marker Acc IDs whose GenBank, SWISSProt,
    # RefSeq, TrEMBL IDs are not represented as Sequence objects.

    db.sql("""select a.accID, a._LogicalDB_key, m._Organism_key
        INTO TEMPORARY TABLE markeraccs1
        from ACC_Accession a, MRK_Marker m
        where a._MGIType_key = 2
        and a._LogicalDB_key in (9,13,27,41)
        and a._Object_key = m._Marker_key
        and m._Organism_key = 1
        and m._Marker_Status_key in (1,2)
        and not exists (select 1 from ACC_Accession s
            where s._MGIType_key = 19
            and s._LogicalDB_key = a._LogicalDB_key
            and lower(s.accID) = lower(a.accID)
        )""", None)

    # generate table of all non-mouse molecular segments Acc IDs whose
    # GenBank SeqIDs are not represented as Sequence objects.
    # (the source alias is 'ps' so it cannot be confused with the 's' alias
    # used inside the "not exists" sub-select)

    db.sql("""select a.accID, a._LogicalDB_key, ps._Organism_key
        INTO TEMPORARY TABLE probeaccs2
        from ACC_Accession a, PRB_Probe p, PRB_Source ps
        where a._MGIType_key = 3
        and a._LogicalDB_key = 9
        and a._Object_key = p._Probe_key
        and p._Source_key = ps._Source_key
        and ps._Organism_key != 1
        and not exists (select 1 from ACC_Accession s
            where s._MGIType_key = 19
            and s._LogicalDB_key = a._LogicalDB_key
            and lower(s.accID) = lower(a.accID)
        )""", None)

    # generate table of all non-mouse marker Acc IDs whose GenBank,
    # SWISSProt, RefSeq, TrEMBL IDs are not represented as Sequence objects.

    db.sql("""select a.accID, a._LogicalDB_key, m._Organism_key
        INTO TEMPORARY TABLE markeraccs2
        from ACC_Accession a, MRK_Marker m
        where a._MGIType_key = 2
        and a._LogicalDB_key in (9,13,27,41)
        and a._Object_key = m._Marker_key
        and m._Organism_key != 1
        and not exists (select 1 from ACC_Accession s
            where s._MGIType_key = 19
            and s._LogicalDB_key = a._LogicalDB_key
            and lower(s.accID) = lower(a.accID)
        )""", None)

    # union these 4 sets together to form one unique set

    db.sql('select accID, _LogicalDB_key, _Organism_key ' +
           'INTO TEMPORARY TABLE allaccs ' +
           'from probeaccs1 ' +
           'union ' +
           'select accID, _LogicalDB_key, _Organism_key ' +
           'from markeraccs1 ' +
           'union ' +
           'select accID, _LogicalDB_key, _Organism_key ' +
           'from probeaccs2 ' +
           'union ' +
           'select accID, _LogicalDB_key, _Organism_key ' +
           'from markeraccs2', None)

    results = db.sql('select * from allaccs', 'auto')
    for r in results:

        accID = r['accID']
        logicalDB = r['_LogicalDB_key']
        organism = r['_Organism_key']

        # organism 1 is mouse (see the queries above)
        if organism == 1:
            sourceKey = mouseSourceKey
        else:
            sourceKey = nonmouseSourceKey

        virtual = 1

        # change values for specific cases

        # types:  316347 (DNA), 316346 (RNA), 316348 (polypeptide), 316349 (not loaded)
        # quality:  316338 (high), 316339 (medium), 316340 (low), 316341 (not loaded)
        # provider: 316380 (GenBank/EMBL/DDBJ), 316372 (RefSeq)
        #           316384 (SwissProt), 316385 (TrEMBL)
        #
        # The queries above only select logical DBs 9, 13, 27 and 41, so one
        # of these branches always assigns the three keys.

        if logicalDB == 9:        # GenBank
            typeKey = 316349
            qualityKey = 316341
            providerKey = 316380
            virtual = 0

        elif logicalDB == 27:     # RefSeq
            typeKey = 316349
            qualityKey = 316338
            providerKey = 316372

        elif logicalDB == 13:     # SwissProt
            typeKey = 316348
            qualityKey = 316338
            providerKey = 316384

        elif logicalDB == 41:     # TrEMBL
            typeKey = 316348
            qualityKey = 316340
            providerKey = 316385

        user = str(userKey)

        # sequence record; the empty strings are columns left blank
        seqFile.write(DL.join([
            mgi_utils.prvalue(seqKey),
            mgi_utils.prvalue(typeKey),
            mgi_utils.prvalue(qualityKey),
            mgi_utils.prvalue(statusKey),
            mgi_utils.prvalue(providerKey),
            mgi_utils.prvalue(organism),
            '', '', '', '',
            mgi_utils.prvalue(virtual),
            '',
            loaddate, loaddate,
            user, user,
            loaddate, loaddate]) + NL)

        # raw sequence record: eight "not loaded" columns
        rawFile.write(DL.join(
            [mgi_utils.prvalue(seqKey)] +
            [notLoaded] * 8 +
            [user, user, loaddate, loaddate]) + NL)

        # sequence/source association record
        sourceFile.write(DL.join([
            mgi_utils.prvalue(assocKey),
            mgi_utils.prvalue(seqKey),
            mgi_utils.prvalue(sourceKey),
            user, user,
            loaddate, loaddate]) + NL)

        # accession record for the new sequence
        prefixPart, numericPart = accessionlib.split_accnum(accID)
        accFile.write(DL.join([
            mgi_utils.prvalue(accKey),
            mgi_utils.prvalue(accID),
            mgi_utils.prvalue(prefixPart),
            mgi_utils.prvalue(numericPart),
            mgi_utils.prvalue(logicalDB),
            mgi_utils.prvalue(seqKey),
            mgi_utils.prvalue(mgiTypeKey),
            '0',
            '1',
            user, user,
            loaddate, loaddate]) + NL)

        seqKey = seqKey + 1
        assocKey = assocKey + 1
        accKey = accKey + 1
Ejemplo n.º 11
def processFile():
    # Purpose: read the tab-delimited probe input file and, for every line
    #   that passes verification, write the probe, marker, reference, alias,
    #   accession, accession-reference and note records plus a copy of the
    #   input line with the new MGI ID appended
    # Returns: nothing
    # Assumes: inputFile, errorFile and the output files are open; the key
    #   counters (probeKey, refKey, aliasKey, accKey, mgiKey) are initialized
    # Effects: writes to the output files, advances the global key counters
    #   and, unless DEBUG is set, calls ACC_setMax with the number of lines
    #   read
    # Throws: calls exit(1, ...) on a line with fewer than 22 fields

    global probeKey, refKey, aliasKey, accKey, mgiKey

    lineNum = 0
    # For each line in the input file

    for line in inputFile.readlines():

        error = 0
        lineNum = lineNum + 1

        # Split the line into tokens; only the newline is stripped so a
        # final line without one does not lose its last character
        tokens = line.rstrip('\n').split('\t')

        try:
            name = tokens[0]
            jnum = tokens[1]
            parentID = tokens[2]
            sourceName = tokens[3]
            organism = tokens[4]
            strain = tokens[5]
            tissue = tokens[6]
            gender = tokens[7]
            cellLine = tokens[8]
            age = tokens[9]
            vectorType = tokens[10]
            segmentType = tokens[11]
            regionCovered = tokens[12]
            insertSite = tokens[13]
            insertSize = tokens[14]
            markerIDs = tokens[15].split('|')
            relationship = tokens[16]
            sequenceIDs = tokens[17]
            aliasList = tokens[18].split('|')
            notes = tokens[19]
            rawnotes = tokens[20]
            createdBy = tokens[21]
        except IndexError:
            # too few fields on the line
            exit(1, 'Invalid Line (%d): %s\n' % (lineNum, line))

        isParent = parentID != ''
        isSource = sourceName != ''
        parentProbeKey = ''
        sourceKey = 0

        # parent from = no, source given = no: resolve an anonymous source
        if not isParent and not isSource:
            organismKey = sourceloadlib.verifyOrganism(organism, lineNum, errorFile)
            strainKey = sourceloadlib.verifyStrain(strain, lineNum, errorFile)
            tissueKey = sourceloadlib.verifyTissue(tissue, lineNum, errorFile)
            genderKey = sourceloadlib.verifyGender(gender, lineNum, errorFile)
            cellLineKey = sourceloadlib.verifyCellLine(cellLine, lineNum, errorFile)
            vectorKey = sourceloadlib.verifyVectorType(vectorType, lineNum, errorFile)
            segmentTypeKey = sourceloadlib.verifySegmentType(segmentType, lineNum, errorFile)
            sourceKey = sourceloadlib.verifySource(
                segmentTypeKey, vectorKey, organismKey, strainKey,
                tissueKey, genderKey, cellLineKey, age, lineNum, errorFile)

            if organismKey == 0 or strainKey == 0 or tissueKey == 0 or \
               genderKey == 0 or cellLineKey == 0 or vectorKey == 0 or \
               segmentTypeKey == 0 or sourceKey == 0:
                errorFile.write('%s, %s, %s, %s, %s, %s, %s, %s\n'
                                % (segmentType, vectorType, organism, strain,
                                   tissue, gender, cellLine, age))
                error = 1

        # parent from = no, source given = yes: look up the named library
        elif not isParent and isSource:
            vectorKey = sourceloadlib.verifyVectorType(vectorType, lineNum, errorFile)
            segmentTypeKey = sourceloadlib.verifySegmentType(segmentType, lineNum, errorFile)
            sourceKey = sourceloadlib.verifyLibrary(sourceName, lineNum, errorFile)

            if vectorKey == 0 or segmentTypeKey == 0 or sourceKey == 0:
                error = 1

        # parent from = yes, source given = yes or no (ignored)
        else:
            parentProbeKey, sourceKey = verifyParentProbe(parentID, lineNum, errorFile)
            vectorKey = sourceloadlib.verifyVectorType(vectorType, lineNum, errorFile)
            segmentTypeKey = sourceloadlib.verifySegmentType(segmentType, lineNum, errorFile)

            if parentProbeKey == 0 or sourceKey == 0 or vectorKey == 0 or segmentTypeKey == 0:
                error = 1

        referenceKey = loadlib.verifyReference(jnum, lineNum, errorFile)
        createdByKey = loadlib.verifyUser(createdBy, lineNum, errorFile)

        if referenceKey == 0:
            errorFile.write('Invalid Reference:  %s\n' % (jnum))
            error = 1

        if createdByKey == 0:
            errorFile.write('Invalid Creator:  %s\n\n' % (createdBy))
            error = 1

        # marker IDs

        markerList = []
        for markerID in markerIDs:

            markerKey = loadlib.verifyMarker(markerID, lineNum, errorFile)

            if len(markerID) > 0 and markerKey == 0:
                errorFile.write('Invalid Marker:  %s, %s\n' % (name, markerID))
                error = 1
            elif len(markerID) > 0:
                markerList.append(markerKey)

        # sequence IDs: "logicalDB:accID" pairs separated by '|'
        seqAccDict = {}
        for seqID in sequenceIDs.split('|'):
            if len(seqID) > 0:
                [logicalDB, acc] = seqID.split(':')
                logicalDBKey = loadlib.verifyLogicalDB(logicalDB, lineNum, errorFile)
                if logicalDBKey > 0:
                    seqAccDict[acc] = logicalDBKey

        # if errors, continue to next record
        if error:
            continue

        # if no errors, process the probe

        probeFile.write('%d\t%s\t%s\t%s\t%s\t%s\t\t\t%s\t%s\t%s\t\t%s\t%s\t%s\t%s\n'
                        % (probeKey, name, parentProbeKey, sourceKey, vectorKey,
                           segmentTypeKey, mgi_utils.prvalue(regionCovered),
                           mgi_utils.prvalue(insertSite),
                           mgi_utils.prvalue(insertSize),
                           createdByKey, createdByKey, loaddate, loaddate))

        # a marker listed more than once on the line is reported instead of
        # being associated with the probe
        for markerKey in markerList:
            if markerList.count(markerKey) == 1:
                markerFile.write('%s\t%s\t%d\t%s\t%s\t%s\t%s\t%s\n'
                                 % (probeKey, markerKey, referenceKey, relationship,
                                    createdByKey, createdByKey, loaddate, loaddate))
            else:
                # NOTE(review): markerID is the last ID of the marker loop
                # above, not necessarily the duplicated one - kept as in the
                # original; confirm which value should be reported.
                errorFile.write('Invalid Marker Duplicate:  %s, %s\n' % (name, markerID))

        refFile.write('%s\t%s\t%s\t0\t0\t%s\t%s\t%s\t%s\n'
                      % (refKey, probeKey, referenceKey,
                         createdByKey, createdByKey, loaddate, loaddate))

        # aliases

        for alias in aliasList:
            if len(alias) == 0:
                continue
            aliasFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n'
                            % (aliasKey, refKey, alias,
                               createdByKey, createdByKey, loaddate, loaddate))
            aliasKey = aliasKey + 1

        # MGI Accession ID for the marker

        accFile.write('%s\t%s%d\t%s\t%s\t1\t%d\t%d\t0\t1\t%s\t%s\t%s\t%s\n'
                      % (accKey, mgiPrefix, mgiKey, mgiPrefix, mgiKey, probeKey,
                         mgiTypeKey, createdByKey, createdByKey, loaddate, loaddate))

        # Print out a new text file and attach the new MGI Probe IDs as the last field
        # NOTE(review): regionCovered and insertSite are concatenated into a
        # single column (no tab between them), exactly as in the original
        # statement - confirm that this is the intended layout.

        newProbeFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%d\n'
                           % (name, jnum,
                              mgi_utils.prvalue(sourceName),
                              organism,
                              mgi_utils.prvalue(strain),
                              mgi_utils.prvalue(tissue),
                              mgi_utils.prvalue(gender),
                              mgi_utils.prvalue(cellLine),
                              mgi_utils.prvalue(age),
                              mgi_utils.prvalue(vectorType),
                              mgi_utils.prvalue(segmentType),
                              mgi_utils.prvalue(regionCovered) +
                              mgi_utils.prvalue(insertSite),
                              mgi_utils.prvalue(insertSize),
                              '|'.join(markerIDs),
                              relationship,
                              mgi_utils.prvalue(sequenceIDs),
                              '|'.join(aliasList),
                              mgi_utils.prvalue(notes),
                              createdBy, mgiPrefix, mgiKey))

        # Print out a raw note file

        if len(rawnotes) > 0:
            rawNoteFile.write('%s%d\t%s\n' % (mgiPrefix, mgiKey, rawnotes))

        # Notes

        if len(notes) > 0:
            noteFile.write('%s\t%s\t%s\t%s\n' % (probeKey, notes, loaddate, loaddate))

        accKey = accKey + 1
        mgiKey = mgiKey + 1

        # sequence accession ids
        for acc, seqLogicalDBKey in seqAccDict.items():
            prefixPart, numericPart = accessionlib.split_accnum(acc)
            accFile.write('%s\t%s\t%s\t%s\t%s\t%d\t%d\t0\t1\t%s\t%s\t%s\t%s\n'
                          % (accKey, acc, prefixPart, numericPart, seqLogicalDBKey,
                             probeKey, mgiTypeKey, createdByKey, createdByKey,
                             loaddate, loaddate))
            accRefFile.write('%s\t%s\t%s\t%s\t%s\t%s\n'
                             % (accKey, referenceKey, createdByKey, createdByKey,
                                loaddate, loaddate))
            accKey = accKey + 1

        refKey = refKey + 1
        probeKey = probeKey + 1

    #	end of "for line in inputFile.readlines():"

    # Update the AccessionMax value

    if not DEBUG:
        db.sql('select * from ACC_setMax (%d)' % (lineNum), None)
Ejemplo n.º 12
def writeB6Output():
    # Purpose: parses the output line dictionary
    # writes to Accession, AccessionReference & StrainMarker BCP file
    # if there are no errors
    # Returns: 1 if error, else 0
    #   (the only return statement visible in this listing returns 0)
    # Assumes: file descriptors have been initialized
    # Effects: writes to the file system
    # Throws: Nothing
    # NOTE(review): this listing appears to have lost several statements
    #   (the opening lines of the strain-marker write calls, an 'else:' or
    #   'continue' after the marker lookup check, the body of the gmIdString
    #   check). The gaps are flagged inline; confirm against the original.

    global nextSMKey, nextAccKey, totalLoadedCt, b6LoadedCt
    description = ''
    # b6ToLoadDict maps an MGI ID to the list of input lines for it
    for mgiID in b6ToLoadDict:
        #print 'writeB6Output mgiID: "%s"' % mgiID
        lineList = b6ToLoadDict[mgiID]
        #print 'writeB6Output lineList: %s' % lineList
        qNameSet = set()

        # Resolve MGI ID
        if mgiID not in markerLookup:
            print('%s in MGI GFF File, but NOT IN MGI' % (mgiID))
            # NOTE(review): as shown, the lookup below runs only for IDs
            # that are NOT in markerLookup; a 'continue'/'else:' is
            # presumably missing after the print.
            marker = markerLookup[mgiID]
            markerKey = marker.markerKey
            symbol = marker.symbol
        if len(lineList) == 1:  # This is non-BlatAlignment gene/pseudogene
            #print 'This is non-BlatAlignment gene/pseudogene and nextSMKey: %s' % nextSMKey
            line = lineList[0]
            chr, start, end, strand, smID, mgiID, biotype, gmIdString, qName, description = parseB6Feature(
                line, 'f')
                # NOTE(review): the call that writes this strain-marker
                # record (and its closing arguments) is missing from the
                # listing.
                '%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s' %
                (nextSMKey, TAB, b6StrainKey, TAB, markerKey, TAB, b6RefsKey,
                 TAB, userKey, TAB, userKey, TAB, loaddate, TAB, loaddate,

            # accession record for the strain marker ID
            prefixPart, numericPart = accessionlib.split_accnum(smID)

            fpAccFile.write('%s%s%s%s%s%s%s%s%s%s%s%s%s%s0%s1%s%s%s%s%s%s%s%s%s' \
                % (nextAccKey, TAB, smID, TAB, prefixPart, TAB, numericPart, TAB, msgLDBKey, TAB, nextSMKey, TAB, mgiTypeKey, TAB, TAB, TAB, userKey, TAB, userKey, TAB, loaddate, TAB, loaddate, CRT))

            # reference for the accession record just written
            fpAccRefFile.write('%s%s%s%s%s%s%s%s%s%s%s%s' \
                % (nextAccKey, TAB, b6RefsKey, TAB, userKey, TAB, userKey, TAB, loaddate, TAB, loaddate, CRT))

            # coordinates/description and biotype, keyed by strain marker ID
            fpGmB6File.write('%s%s%s%s%s%s%s%s%s%s%s%s' %
                             (smID, TAB, chr, TAB, start, TAB, end, TAB,
                              strand, TAB, description, CRT))
            fpBiotypeB6File.write('%s%s%s%s' % (smID, TAB, biotype, CRT))

            nextAccKey += 1

            # get gmIDs from the input file for the sequence description, if they exist
            # gmIDs example:
            # Dbxref=miRBase:MI0005004,ENSEMBL:ENSMUSG00000076010,NCBI_Gene:751557
            # NOTE(review): the body of the check below is missing, and
            # gmIdList is not used by any visible statement.
            if gmIdString == '':
            gmIdList = gmIdString.split(',')

            nextSMKey += 1
        else:  # This is BlatAlignment set
            #print 'this is a BlatAlignment set nextSMKey: %s' % nextSMKey
            # The first line is feature line, the following are BlatAlignments
            featureLine = lineList[0]

            chr, start, end, strand, smID, mgiID, biotype, gmIdString, qName, description = parseB6Feature(
                featureLine, 'bf')

            # remainder of the list are blat hits - save them to a set
            lineList = lineList[1:]  # remove the blat feature
            for line in lineList:
                # only qName is used from each blat hit; j1..j9 are ignored
                j1, j2, j3, j4, j5, j6, j7, j8, qName, j9 = parseB6Feature(
                    line, 'b')
                # NOTE(review): nothing visible adds qName to qNameSet, so
                # the set is empty when joined below - a line is presumably
                # missing here.

            # set the qNames (genbank IDs) in the description string
            description = description % ','.join(str(s) for s in qNameSet)

            # Create the strain marker and its accession ID
                # NOTE(review): the call that writes this strain-marker
                # record (and its closing arguments) is missing from the
                # listing.
                '%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s' %
                (nextSMKey, TAB, b6StrainKey, TAB, markerKey, TAB, b6RefsKey,
                 TAB, userKey, TAB, userKey, TAB, loaddate, TAB, loaddate,

            prefixPart, numericPart = accessionlib.split_accnum(smID)
            fpAccFile.write('%s%s%s%s%s%s%s%s%s%s%s%s%s%s0%s1%s%s%s%s%s%s%s%s%s' \
                % (nextAccKey, TAB, smID, TAB, prefixPart, TAB, numericPart, TAB, msgLDBKey, TAB, nextSMKey, TAB, mgiTypeKey, TAB, TAB, TAB, userKey, TAB, userKey, TAB, loaddate, TAB, loaddate, CRT))

            fpAccRefFile.write('%s%s%s%s%s%s%s%s%s%s%s%s' \
                % (nextAccKey, TAB, b6RefsKey, TAB, userKey, TAB, userKey, TAB, loaddate, TAB, loaddate, CRT))

            nextAccKey += 1

            fpGmB6File.write('%s%s%s%s%s%s%s%s%s%s%s%s' %
                             (smID, TAB, chr, TAB, start, TAB, end, TAB,
                              strand, TAB, description, CRT))
            fpBiotypeB6File.write('%s%s%s%s' % (smID, TAB, biotype, CRT))

            # 6/12 GF-184, removed all associated IDs from strain gene
            # for blat hits in the input file associate the GenBank IDs that was
            # blatted for coordinates
            nextSMKey += 1
        # one strain marker counted per MGI ID, in both branches
        totalLoadedCt += 1
        b6LoadedCt += 1
    return 0
Ejemplo n.º 13
def processFile():
    # purpose:
    #   Reads inputFile line by line, splits each line on tabs into 22
    #   fields, checks the controlled fields through sourceloadlib /
    #   loadlib / verifyParentProbe, and writes tab-delimited rows to
    #   probeFile, markerFile, refFile, aliasFile, accFile, accRefFile,
    #   newProbeFile, rawNoteFile and noteFile.
    # effects:
    #   Advances the global counters probeKey, refKey, aliasKey, accKey
    #   and mgiKey as rows are written.  When DEBUG is false, calls
    #   ACC_setMax with the number of input lines read.
    # returns:
    #   no return statement appears in the visible portion
    #
    # NOTE(review): this listing appears to have lost lines (dangling
    # call openers, "if"/"elif" headers with no body, missing branch
    # headers).  The suspected gaps are flagged inline below; restore them
    # from the original file before running this code.

    global probeKey, refKey, aliasKey, accKey, mgiKey

    lineNum = 0
    # For each line in the input file

    for line in inputFile.readlines():

        # error is set to 1 by any failed verification below
        error = 0
        lineNum = lineNum + 1

        # Split the line into tokens
        # line[:-1] drops the final character (presumably the newline)
        tokens = string.split(line[:-1], '\t')

        # NOTE(review): the extra indent on the assignments below and the
        # trailing exit(...) suggest a try/except wrapper was lost here
        # -- TODO confirm against the original file.
            name = tokens[0]
            jnum = tokens[1]
            parentID = tokens[2]
            sourceName = tokens[3]
            organism = tokens[4]
            strain = tokens[5]
            tissue = tokens[6]
            gender = tokens[7]
            cellLine = tokens[8]
            age = tokens[9]
            vectorType = tokens[10]
            segmentType = tokens[11]
            regionCovered = tokens[12]
            insertSite = tokens[13]
            insertSize = tokens[14]
            # markerIDs and aliasList are '|'-delimited lists
            markerIDs = string.split(tokens[15], '|')
            relationship = tokens[16]
            # sequenceIDs stays a raw string here; it is split on '|' later
            sequenceIDs = tokens[17]
            aliasList = string.split(tokens[18], '|')
            notes = tokens[19]
            rawnotes = tokens[20]
            createdBy = tokens[21]
            # exit is called with a status and a message, so it is
            # presumably a file-level helper, not the builtin -- verify
            exit(1, 'Invalid Line (%d): %s\n' % (lineNum, line))

        # isParent / isSource record whether parentID / sourceName were
        # supplied (non-empty) on this line
        isParent = 0
        isSource = 0
        parentProbeKey = ''
        sourceKey = 0

        if parentID != '':
            isParent = 1

        if sourceName != '':
            isSource = 1

        # no parent and no named source: verify each source attribute and
        # then the source itself; a key of 0 is treated as a failure
        if not isParent and not isSource:
            # NOTE(review): the verifyOrganism, verifyCellLine and
            # verifyVectorType calls below are left open -- their
            # continuation lines appear to be missing.
            organismKey = sourceloadlib.verifyOrganism(organism, lineNum,
            strainKey = sourceloadlib.verifyStrain(strain, lineNum, errorFile)
            tissueKey = sourceloadlib.verifyTissue(tissue, lineNum, errorFile)
            genderKey = sourceloadlib.verifyGender(gender, lineNum, errorFile)
            cellLineKey = sourceloadlib.verifyCellLine(cellLine, lineNum,
            vectorKey = sourceloadlib.verifyVectorType(vectorType, lineNum,
            segmentTypeKey = sourceloadlib.verifySegmentType(
                segmentType, lineNum, errorFile)
            sourceKey = sourceloadlib.verifySource(segmentTypeKey, \
         vectorKey, organismKey, strainKey, \
         tissueKey, genderKey, cellLineKey, age, lineNum, errorFile)

            # log the full attribute set once if any lookup returned 0
            if organismKey == 0 or strainKey == 0 or tissueKey == 0 or \
                      genderKey == 0 or cellLineKey == 0 or vectorKey == 0 or \
                      segmentTypeKey == 0 or sourceKey == 0:
                errorFile.write('%s, %s, %s, %s, %s, %s, %s, %s\n' %
                                (segmentType, vectorType, organism, strain,
                                 tissue, gender, cellLine, age))
                error = 1

        # no parent, but a source name was given: look the source up by
        # name via verifyLibrary
        elif not isParent and isSource:
            # NOTE(review): the verifyVectorType and verifyLibrary calls
            # below are left open -- continuation lines appear missing.
            vectorKey = sourceloadlib.verifyVectorType(vectorType, lineNum,
            segmentTypeKey = sourceloadlib.verifySegmentType(
                segmentType, lineNum, errorFile)
            sourceKey = sourceloadlib.verifyLibrary(sourceName, lineNum,

            if vectorKey == 0 or segmentTypeKey == 0 or sourceKey == 0:
                error = 1

# parent from = yes, source given = yes or no (ignored)
            # NOTE(review): no branch header is visible for this case; an
            # "else:" appears to be missing before the next statement.
            # verifyParentProbe supplies both the parent probe key and the
            # source key here.
            parentProbeKey, sourceKey = verifyParentProbe(
                parentID, lineNum, errorFile)
            # NOTE(review): verifyVectorType call left open below.
            vectorKey = sourceloadlib.verifyVectorType(vectorType, lineNum,
            segmentTypeKey = sourceloadlib.verifySegmentType(
                segmentType, lineNum, errorFile)

            if parentProbeKey == 0 or sourceKey == 0 or vectorKey == 0 or segmentTypeKey == 0:
                error = 1

        # reference (J number) and creating user are verified for every line
        referenceKey = loadlib.verifyReference(jnum, lineNum, errorFile)
        createdByKey = loadlib.verifyUser(createdBy, lineNum, errorFile)

        if referenceKey == 0:
            errorFile.write('Invalid Reference:  %s\n' % (jnum))
            error = 1

        if createdByKey == 0:
            errorFile.write('Invalid Creator:  %s\n\n' % (createdBy))
            error = 1

# marker IDs

        markerList = []
        for markerID in markerIDs:

            markerKey = loadlib.verifyMarker(markerID, lineNum, errorFile)

            # an empty markerID is tolerated; only a non-empty ID that
            # fails verification is reported as an error
            if len(markerID) > 0 and markerKey == 0:
                errorFile.write('Invalid Marker:  %s, %s\n' % (name, markerID))
                error = 1
            elif len(markerID) > 0:
                # NOTE(review): this branch has no body in the listing.
                # markerList is never filled in the visible code, so the
                # missing line presumably appends markerKey -- TODO confirm.

# sequence IDs
        # each non-empty entry is "logicalDB:accession"; seqAccDict maps the
        # accession to its logical DB key when that key is > 0
        seqAccDict = {}
        for seqID in string.split(sequenceIDs, '|'):
            if len(seqID) > 0:
                [logicalDB, acc] = string.split(seqID, ':')
                # NOTE(review): verifyLogicalDB call left open below.
                logicalDBKey = loadlib.verifyLogicalDB(logicalDB, lineNum,
                if logicalDBKey > 0:
                    seqAccDict[acc] = logicalDBKey

        # if errors, continue to next record
        if error:
            # NOTE(review): no body is visible; per the comment above a
            # "continue" appears to be missing.

        # if no errors, process the probe

        # probe row: keyed by probeKey; created-by and modified-by both use
        # createdByKey, and both dates use loaddate
        probeFile.write('%d\t%s\t%s\t%s\t%s\t%s\t\t\t%s\t%s\t%s\t\t%s\t%s\t%s\t%s\n' \
            % (probeKey, name, parentProbeKey, sourceKey, vectorKey, segmentTypeKey, mgi_utils.prvalue(regionCovered), \
     mgi_utils.prvalue(insertSite), mgi_utils.prvalue(insertSize), createdByKey, createdByKey, loaddate, loaddate))

        # probe/marker rows: written only for marker keys that occur exactly
        # once in markerList
        # NOTE(review): the errorFile.write below follows the markerFile.write
        # with no visible "else:", which appears to be missing.  It also
        # reports markerID, which still holds the last value from the
        # "for markerID in markerIDs" loop above rather than the duplicated
        # marker.
        for markerKey in markerList:
            if markerList.count(markerKey) == 1:
                markerFile.write('%s\t%s\t%d\t%s\t%s\t%s\t%s\t%s\n' \
      % (probeKey, markerKey, referenceKey, relationship, createdByKey, createdByKey, loaddate, loaddate))
                errorFile.write('Invalid Marker Duplicate:  %s, %s\n' %
                                (name, markerID))

        # probe/reference row: keyed by refKey, links probeKey to referenceKey
        refFile.write('%s\t%s\t%s\t0\t0\t%s\t%s\t%s\t%s\n' \
  % (refKey, probeKey, referenceKey, createdByKey, createdByKey, loaddate, loaddate))

        # aliases

        # each alias row is keyed by aliasKey and points at refKey
        for alias in aliasList:
            if len(alias) == 0:
                # NOTE(review): no body is visible; presumably a "continue"
                # that skips empty aliases -- TODO confirm.
            aliasFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
      % (aliasKey, refKey, alias, createdByKey, createdByKey, loaddate, loaddate))
            aliasKey = aliasKey + 1

        # MGI Accession ID (mgiPrefix + mgiKey) attached to probeKey

        accFile.write('%s\t%s%d\t%s\t%s\t1\t%d\t%d\t0\t1\t%s\t%s\t%s\t%s\n' \
            % (accKey, mgiPrefix, mgiKey, mgiPrefix, mgiKey, probeKey, mgiTypeKey, createdByKey, createdByKey, loaddate, loaddate))

        # Print out a new text file and attach the new MGI Probe IDs as the last field
        # (parentID is not echoed into this file)
        # NOTE(review): regionCovered and insertSite are joined with "+" into
        # a single output field below -- confirm this is intended.

        newProbeFile.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%d\n' \
     % (name, jnum, \
     mgi_utils.prvalue(sourceName), \
     organism, \
     mgi_utils.prvalue(strain), \
     mgi_utils.prvalue(tissue), \
     mgi_utils.prvalue(gender), \
     mgi_utils.prvalue(cellLine), \
     mgi_utils.prvalue(age), \
     mgi_utils.prvalue(vectorType), \
     mgi_utils.prvalue(segmentType), \
     mgi_utils.prvalue(regionCovered) + \
     mgi_utils.prvalue(insertSite), \
     mgi_utils.prvalue(insertSize), \
     string.join(markerIDs, '|'), \
     relationship, \
     mgi_utils.prvalue(sequenceIDs), \
     string.join(aliasList, '|'), \
     mgi_utils.prvalue(notes), \
     createdBy, mgiPrefix, mgiKey))

        # Print out a raw note file
        # (keyed by the new MGI ID rather than by probeKey)

        if len(rawnotes) > 0:
            rawNoteFile.write('%s%d\t%s\n' % (mgiPrefix, mgiKey, rawnotes))

# Notes

        if len(notes) > 0:
            noteFile.write('%s\t%s\t%s\t%s\n' %
                           (probeKey, notes, loaddate, loaddate))

        # the MGI accession row above used the current accKey / mgiKey;
        # advance both before writing the sequence accessions
        accKey = accKey + 1
        mgiKey = mgiKey + 1

        # sequence accession ids
        # one accFile row plus one accRefFile row per accession, each
        # taking its own accKey
        for acc in seqAccDict.keys():
            prefixPart, numericPart = accessionlib.split_accnum(acc)
            accFile.write('%s\t%s\t%s\t%s\t%s\t%d\t%d\t0\t1\t%s\t%s\t%s\t%s\n' \
                % (accKey, acc, prefixPart, numericPart, seqAccDict[acc], probeKey, mgiTypeKey, createdByKey, createdByKey, loaddate, loaddate))
            accRefFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' \
                % (accKey, referenceKey, createdByKey, createdByKey, loaddate, loaddate))
            accKey = accKey + 1

        # one reference row and one probe row were written for this line
        refKey = refKey + 1
        probeKey = probeKey + 1

    #	end of "for line in inputFile.readlines():"

    # Update the AccessionMax value
    # (lineNum, the count of input lines read, is the argument passed)

    if not DEBUG:
        db.sql('select * from ACC_setMax (%d)' % (lineNum), None)