def printMissingSequences(missingSequences=None, databaseVersion=None, outputDirectory=None, fileVersion='1.0', verbose=False):
    """Write the names of the missing reference alleles to a text file.

    The file is called ``<databaseVersion>_Missing_Reference_Alleles.txt`` and
    is created inside ``outputDirectory``. It starts with a four-line ``#``
    header (filename, date, version, author) followed by one allele name per
    line, ordered by locus and then by allele group.

    Args:
        missingSequences: Allele sequence objects to list; each one must
            expose an ``alleleName`` attribute.
        databaseVersion: Release string used as the filename prefix.
        outputDirectory: Directory the file is written to.
        fileVersion: Value written to the ``# version:`` header line.
        verbose: When true, print a progress message (also forwarded to
            ``clusterSequences``).
    """
    outputFileNameShort = databaseVersion + '_Missing_Reference_Alleles.txt'
    outputFileNameFull = join(outputDirectory, outputFileNameShort)

    if verbose:
        print('Writing a list of ' + str(len(missingSequences))
              + ' missing allele sequences to:' + str(outputFileNameFull))

    # The context manager guarantees the handle is closed even if clustering
    # or one of the writes raises.
    with open(outputFileNameFull, 'w') as outputFile:
        outputFile.write('# filename: ' + str(outputFileNameShort) + '\n')
        outputFile.write('# date: ' + datetime.today().strftime('%Y-%m-%d') + '\n')
        outputFile.write('# version: ' + str(fileVersion) + '\n')
        outputFile.write('# author: ' + 'Ben Matern <*****@*****.**>' + '\n')

        # Clustering is used here only to get a stable locus/group ordering.
        alleleClusters = clusterSequences(alleleSequences=missingSequences, verbose=verbose)
        for locus in sorted(alleleClusters.keys()):
            groupsForLocus = alleleClusters[locus]
            for alleleGroup in sorted(groupsForLocus.keys()):
                for allele in groupsForLocus[alleleGroup]:
                    outputFile.write(str(allele.alleleName) + '\n')
def printSequenceCountsPerLocus(alleleSequences=None, outputFilename=None, verbose=None, imgtReleaseVersion=None, delimiter='\t'):
    """Write a delimited table with the number of sequences per locus.

    The first row is a header (``Locus`` and ``<imgtReleaseVersion> Reference
    Count``), followed by one row per locus in sorted order and a final
    ``Total`` row holding ``len(alleleSequences)``.

    Args:
        alleleSequences: Allele sequence objects to count.
        outputFilename: Path of the file to create (overwritten if present).
        verbose: When true, print a progress message (also forwarded to
            ``clusterSequences``).
        imgtReleaseVersion: Release string used in the count column header.
        delimiter: Column separator.
    """
    if verbose:
        print('Creating sequence counts per locus:' + str(outputFilename))

    # The context manager guarantees the handle is closed even if clustering
    # or one of the writes raises.
    with open(outputFilename, 'w') as outputFile:
        outputFile.write('Locus' + delimiter + imgtReleaseVersion + ' Reference Count' + '\n')

        alleleClusters = clusterSequences(alleleSequences=alleleSequences, verbose=verbose)
        for locus in sorted(alleleClusters.keys()):
            # A locus count is the sum of the sizes of all its allele groups.
            seqCount = sum(len(groupMembers) for groupMembers in alleleClusters[locus].values())
            outputFile.write(locus + delimiter + str(seqCount) + '\n')

        # Grand total over every input sequence.
        outputFile.write('Total' + delimiter + str(len(alleleSequences)) + '\n')
def printSequenceList(alleleSequences=None, databaseVersion=None, outputDirectory=None, fileVersion='1.0', verbose=False, alleleDescriptionLookup=None):
    """Write the annotated list of reference alleles to a tab-separated file.

    The file is called ``<databaseVersion>_Reference_Alleles.txt`` and is
    created inside ``outputDirectory``. It starts with a four-line ``#``
    header (filename, date, version, author) and a column header row, followed
    by one row per allele: accession number, locus, allele name, description.
    Rows are ordered by locus and then by allele group.

    The description is built as follows:
      * ``'<locus> Locus Reference;'`` is used as a prefix when the allele
        name contains, or is contained in, one of the names returned by
        ``getLocusReferences()``.
      * The entry from ``alleleDescriptionLookup`` is appended when the allele
        name is a key there; otherwise ``'<locus without HLA->*<group>
        Reference'`` is appended.

    As a side effect, ``allele.description`` is set on every allele written.

    Args:
        alleleSequences: Allele sequence objects; each must expose
            ``alleleName`` (of the form ``LOCUS*field:field...``) and
            ``accessionNumber``.
        databaseVersion: Release string used in the filename and column header.
        outputDirectory: Directory the file is written to.
        fileVersion: Value written to the ``# version:`` header line.
        verbose: When true, print a progress message (also forwarded to
            ``clusterSequences``).
        alleleDescriptionLookup: Mapping of allele name to a pre-assigned
            description. ``None`` is treated as an empty mapping.
    """
    # The parameter defaults to None; treat that as "no special descriptions"
    # rather than failing on the membership test below.
    if alleleDescriptionLookup is None:
        alleleDescriptionLookup = {}

    outputFileNameShort = databaseVersion + '_Reference_Alleles.txt'
    outputFileNameFull = join(outputDirectory, outputFileNameShort)

    if verbose:
        print('Writing a list of ' + str(len(alleleSequences))
              + ' allele sequences to:' + str(outputFileNameFull))

    # The context manager guarantees the handle is closed even if a helper
    # call or one of the writes raises.
    with open(outputFileNameFull, 'w') as outputFile:
        outputFile.write('# filename: ' + str(outputFileNameShort) + '\n')
        outputFile.write('# date: ' + datetime.today().strftime('%Y-%m-%d') + '\n')
        outputFile.write('# version: ' + str(fileVersion) + '\n')
        outputFile.write('# author: ' + 'Ben Matern <*****@*****.**>' + '\n')
        outputFile.write('IPD-IMGT/HLA Database ' + str(databaseVersion) + ' Accession Number\tLocus\tIPD-IMGT/HLA Database '
                         + str(databaseVersion) + ' Allele Name\tDescription\n')

        # De-duplicate once instead of rebuilding the set for every allele.
        uniqueLocusReferences = set(getLocusReferences())

        # Cluster to sort them.
        alleleClusters = clusterSequences(alleleSequences=alleleSequences, verbose=verbose)
        for locus in sorted(alleleClusters.keys()):
            groupsForLocus = alleleClusters[locus]
            for alleleGroup in sorted(groupsForLocus.keys()):
                for allele in groupsForLocus[alleleGroup]:
                    alleleName = allele.alleleName
                    currentLocus, nomenclatureFields = alleleName.split('*')
                    currentGroup = str(nomenclatureFields.split(':')[0])

                    # Is it a locus reference? Substring match in either
                    # direction, because the two names may differ in length.
                    isLocusReference = any(
                        alleleName in locusReference or locusReference in alleleName
                        for locusReference in uniqueLocusReferences)
                    description = currentLocus + ' Locus Reference;' if isLocusReference else ''

                    if alleleName in alleleDescriptionLookup:
                        # We already have a description (serotype or DP references).
                        description += alleleDescriptionLookup[alleleName]
                    else:
                        # Otherwise it's a group reference.
                        description += currentLocus.replace('HLA-', '') + '*' + currentGroup + ' Reference'

                    allele.description = description
                    outputFile.write(str(allele.accessionNumber) + '\t' + currentLocus + '\t'
                                     + str(alleleName) + '\t' + str(description) + '\n')
def printSequences(alleleSequences=None, outputFilename=None, verbose=False):
    """Write allele sequences to a FASTA file.

    Each record is a ``>`` line carrying the allele name followed by one line
    holding the value of ``getSequence()``. Records are ordered by locus and
    then by allele group.

    Args:
        alleleSequences: Allele sequence objects; each must expose
            ``alleleName`` and ``getSequence()``.
        outputFilename: Path of the file to create (overwritten if present).
        verbose: When true, print a progress message (also forwarded to
            ``clusterSequences``).
    """
    if verbose:
        print('Writing ' + str(len(alleleSequences)) + ' allele sequences to:' + str(outputFilename))

    # The context manager guarantees the handle is closed even if clustering
    # or one of the writes raises.
    with open(outputFilename, 'w') as outputFile:
        # Clustering is used here only to get a stable locus/group ordering.
        alleleClusters = clusterSequences(alleleSequences=alleleSequences, verbose=verbose)
        for locus in sorted(alleleClusters.keys()):
            groupsForLocus = alleleClusters[locus]
            for alleleGroup in sorted(groupsForLocus.keys()):
                for alleleSequence in groupsForLocus[alleleGroup]:
                    outputFile.write('>' + str(alleleSequence.alleleName) + '\n')
                    outputFile.write(str(alleleSequence.getSequence()) + '\n')
def printSequenceDetails(alleleSequences=None, outputFilename=None, verbose=False, delimiter='\t', imgtReleaseVersion=None):
    """Write a delimited table of per-allele sequence details.

    Columns (each header is prefixed with ``imgtReleaseVersion``): allele
    name, total sequence length, 5'UTR length, 3'UTR length and CWD status.
    Rows are ordered by locus and then by allele group.

    Args:
        alleleSequences: Allele sequence objects; each must expose
            ``alleleName``, ``getSequence()``, ``featureSequences`` (indexed
            by feature name) and ``cwdStatus``.
        outputFilename: Path of the file to create (overwritten if present).
        verbose: When true, print a progress message (also forwarded to
            ``clusterSequences``).
        delimiter: Column separator.
        imgtReleaseVersion: Release string used as the column-header prefix.
    """
    if verbose:
        print('Creating Sequence Details file:' + str(outputFilename))

    # The context manager guarantees the handle is closed even if clustering
    # or one of the writes raises.
    with open(outputFilename, 'w') as outputFile:
        outputFile.write(imgtReleaseVersion + ' Allele Name'
                         + delimiter + imgtReleaseVersion + ' Sequence Length'
                         + delimiter + imgtReleaseVersion + " 5'UTR Length"
                         + delimiter + imgtReleaseVersion + " 3'UTR Length"
                         + delimiter + imgtReleaseVersion + ' CWD Status' + '\n')

        alleleClusters = clusterSequences(alleleSequences=alleleSequences, verbose=verbose)
        for locus in sorted(alleleClusters.keys()):
            groupsForLocus = alleleClusters[locus]
            for alleleGroup in sorted(groupsForLocus.keys()):
                for alleleSequence in groupsForLocus[alleleGroup]:
                    # TODO: Placeholder handling for missing features; only
                    # full-length sequences are expected here for now. A
                    # missing UTR is reported with length 0.
                    features = alleleSequence.featureSequences
                    utr5Sequence = features['5UTR'] if '5UTR' in features else ''
                    utr3Sequence = features['3UTR'] if '3UTR' in features else ''

                    outputFile.write(alleleSequence.alleleName
                                     + delimiter + str(len(alleleSequence.getSequence()))
                                     + delimiter + str(len(utr5Sequence))
                                     + delimiter + str(len(utr3Sequence))
                                     + delimiter + str(alleleSequence.cwdStatus) + '\n')
# TODO: The database version from the XML file may be slightly different than the provided release number, due to minor versioning. # I am naming files based on the "0" version but there might be 3.42.1 for example. # Not completely accurate but its more consistent this way. May cause confusion. if (databaseVersion != args.release): print( 'Warning! the latest IPD-IMGT/HLA xml file shows a different (newer?) release date (' + str(databaseVersion) + ') than the provided release version (' + str(args.release) + ')') if (args.supplementary): printSequences(alleleSequences=alleleSequences, outputFilename=join( supplementalFileDirectory, str(args.release) + '_FullLengthSequences.fasta'), verbose=verbose) alleleSequenceClusters = clusterSequences(alleleSequences=alleleSequences, verbose=verbose) newReferenceSequences, missingSequences, alleleDescriptionLookup = createReferenceSequences( clusteredFullLenAlleleSequences=alleleSequenceClusters, verbose=verbose, imgtReleaseVersion=args.release) printSequences(alleleSequences=newReferenceSequences, outputFilename=join( outputDirectory, str(args.release) + '_ReferenceSequences.fasta'), verbose=verbose) printSequenceList(alleleSequences=newReferenceSequences, databaseVersion=args.release, outputDirectory=outputDirectory, verbose=verbose, fileVersion=args.version, alleleDescriptionLookup=alleleDescriptionLookup)