def convert(sequenceData, lastTranscriptID, fivePrimeSequence, fivePrimeUTRs, cdss, stopCodon, direction):
    """Place the ``lee_TIS_dict`` start sites of one transcript on its sequence.

    The transcript sequence is assembled as ``fivePrimeSequence`` + every CDS
    block + the optional stop codon.  Entries of the module-level
    ``lee_TIS_dict`` are then looked up by ``(transcript ID without its
    version suffix, uORF number)``.  For each entry, every occurrence of the
    entry's codon in the transcript sequence is collected and
    ``closest_match`` picks the one nearest to the expected position.  Sites
    no more than 5 bases from the expectation are handed to
    ``writeUORFOutput``; the others are logged to ``tooFarleeuorfsFile``.

    Parameters:
        sequenceData: passed unchanged to ``gencode.readSequence``.
        lastTranscriptID: transcript ID; everything from the first '.' on is
            dropped before the dictionary lookup.
        fivePrimeSequence: 5' UTR sequence, prepended to the CDS sequence.
        fivePrimeUTRs: UTR records; the first one supplies 'chromosome'.
        cdss: CDS records, each with 'start' and 'end'.
        stopCodon: stop-codon record with 'start'/'end', or a falsy value.
        direction: strand marker forwarded to the helpers.

    NOTE(review): the source reached review with its line structure
    collapsed.  The nesting used here (in particular ``uORF_number += 1`` at
    loop level rather than inside the ``else``) is the most plausible
    reading -- confirm against the original script.
    """
    # Arbitrary cut-off: most sites were observed to be off by 0 or 1.
    max_offset = 5

    blocks = cdss[:]
    cds_parts = []
    for block in blocks:
        cds_parts.append(gencode.readSequence(sequenceData, block['start'], block['end'], direction))
    cds_text = "".join(cds_parts)

    stop_text = ""
    if stopCodon:
        # The stop codon always comes last in the block list.
        blocks.append(stopCodon)
        stop_text += gencode.readSequence(sequenceData, stopCodon['start'], stopCodon['end'], direction)

    transcript_text = fivePrimeSequence + cds_text + stop_text

    # Strip the version number so the ID matches the dictionary's keys.
    ensembl_id = lastTranscriptID.split('.')[0]

    uorf_number = 1
    while True:
        entry = lee_TIS_dict.get((ensembl_id, uorf_number))
        if not entry:
            break
        refseq_id, offset_to_atis, annotation, rltm_rchx, codon, uorf_number = entry

        # Expected position: offset relative to where the CDS begins.
        expected = offset_to_atis + transcript_text.find(cds_text)

        # Gather every (possibly overlapping) occurrence of the codon.
        hits = []
        hit = transcript_text.find(codon)
        while hit != -1:
            hits.append(hit)
            hit = transcript_text.find(codon, hit + 1)

        best = closest_match(hits, expected)
        utr_record = fivePrimeUTRs[0]
        uorf_id = refseq_id + ':' + annotation + '_' + codon

        if abs(best - expected) <= max_offset:
            writeUORFOutput(uorf_id, fivePrimeUTRs, blocks, best, best + 3,
                            direction, rltm_rchx)
        else:
            fields = [
                utr_record['chromosome'],
                str(best - 1),
                str(best + 3),
                uorf_id,
                rltm_rchx,
                direction,
                str(best - expected),
            ]
            tooFarleeuorfsFile.write('\t'.join(fields) + '\n')

        uorf_number += 1
def transcriptCallback(records, sequenceData, lastTranscriptID, fivePrimeUTRs, fivePrimeContent, cdss, stopCodon, direction):
    """Emit the CDS annotation lines and the coding sequence of a transcript.

    For every CDS record the sequence is read with ``gencode.readSequence``
    and the record's 'line' value is written to ``cdsAnnotationFile``.  When
    at least one base of CDS sequence was read, a header
    ``>geneID|transcriptID|geneName`` followed by the CDS sequence plus the
    optional stop-codon sequence is written to ``cdsSequenceFile``.

    Parameters:
        records: records of the transcript; the first one supplies 'geneID'
            and 'geneName' for the header.
        sequenceData: passed unchanged to ``gencode.readSequence``.
        lastTranscriptID: transcript ID used in the header.
        fivePrimeUTRs, fivePrimeContent: unused here.
        cdss: CDS records with 'start', 'end' and 'line'.
        stopCodon: stop-codon record with 'start'/'end', or a falsy value.
        direction: strand marker forwarded to ``gencode.readSequence``.

    NOTE(review): a second function named ``transcriptCallback`` follows in
    this source; if both live in one module the later definition replaces
    this one.
    """
    pieces = []
    for block in cdss:
        # Read first, then log the annotation line, one block at a time.
        pieces.append(gencode.readSequence(sequenceData, block['start'], block['end'], direction))
        cdsAnnotationFile.write(block['line'])
    coding = "".join(pieces)

    stop_text = ""
    if stopCodon:
        stop_text += gencode.readSequence(sequenceData, stopCodon['start'], stopCodon['end'], direction)

    if not coding:
        return

    header_fields = [
        str(records[0]['geneID']),
        str(lastTranscriptID),
        str(records[0]["geneName"]),
    ]
    cdsSequenceFile.write(">" + "|".join(header_fields) + "\n")
    cdsSequenceFile.write(coding + stop_text + "\n")
def transcriptCallback(records, sequenceData, lastTranscriptID, fivePrimeUTRs, fivePrimeContent, cdss, stopCodon, direction):
    """Write NAGNAG annotations for the leading edge of each non-first CDS.

    For every CDS record except the first, a window is read with
    ``gencode.readSequence``: ``start - 6 .. start + 2`` on the '+' strand,
    ``end - 2 .. end + 6`` on the '-' strand.  Within that window the pairs
    at indices 1:3 and 7:9 are checked for an 'A' in first place or a 'G' in
    second place, and the pair at 4:6 is compared with "AG".  A tab-separated
    line is written to ``nagAnnotationFile`` whenever either side pair
    matches at least partially.

    Parameters:
        records: records of the transcript; the first one supplies the
            chromosome, gene and transcript-type columns.
        sequenceData: passed to ``gencode.readSequence``; its length is the
            upper bound for the window.
        lastTranscriptID: transcript ID column.
        fivePrimeUTRs, fivePrimeContent, stopCodon: unused here.
        cdss: CDS records with 'start' and 'end'.
        direction: '+' or '-'.
    """
    # Introns are scanned via the CDS blocks that follow them, so the first
    # CDS block is skipped.
    for block in cdss[1:]:
        if direction == '+':
            window_start, window_end = block['start'] - 6, block['start'] + 2
        elif direction == '-':
            window_start, window_end = block['end'] - 2, block['end'] + 6

        # Window runs past the sequence data: dump diagnostics and skip.
        # Single-argument parenthesised print emits the same text as the
        # print statement.
        if window_end > len(sequenceData):
            print('once')
            print(block['start'])
            print(block['end'])
            print(block)
            print(len(sequenceData))
            continue

        window = gencode.readSequence(sequenceData, window_start, window_end, direction)

        # Window layout: N A G | N A G | N A G
        first_pair = window[1:3]
        middle_pair = window[4:6]
        last_pair = window[7:9]

        lower_position = 'NA'
        higher_position = 'NA'

        first_partial = first_pair[0] == 'A' or first_pair[1] == 'G'
        if first_partial:
            if direction == '+':
                lower_position = str(window_start + 1)
            elif direction == '-':
                higher_position = str(window_end - 2)

        last_partial = last_pair[0] == 'A' or last_pair[1] == 'G'
        if last_partial:
            if direction == '+':
                higher_position = str(window_start + 7)
            elif direction == '-':
                lower_position = str(window_end - 8)

        acceptor_case = 'SpliceAcceptorIsAG' if middle_pair == "AG" else 'SpliceAcceptorIsNotAG'

        if first_pair == "AG" or last_pair == "AG":
            ag_case = 'FoundAGOnSides'
        elif first_partial or last_partial:
            ag_case = 'FoundEitherAOrGOnSides'
        else:
            ag_case = None

        if not ag_case:
            continue

        head = records[0]
        columns = [
            head['chromosome'],
            'mayur',
            'NAGNAG',
            head['geneID'],
            head['geneName'],
            head['geneType'],
            lastTranscriptID,
            head['transcriptType'],
            direction,
            str(window_start),
            str(window_end),
            window,
            lower_position,
            higher_position,
            acceptor_case,
            ag_case,
        ]
        nagAnnotationFile.write("\t".join(columns) + "\n")
def findKozak2(outputFile, sequenceData, transcript, cdss, stopCodon, direction):
    """Report reading frames that start at each 'ATG' inside the CDS.

    The CDS blocks (plus the optional stop codon) are concatenated into one
    sequence.  For every 'ATG' found in the CDS part, the sequence is walked
    in steps of three bases until 'TAG', 'TAA' or 'TGA' is met; the span is
    then passed to ``writeKozakOutput2`` under the name
    ``<transcript>.kozak_cds.<n>``.  When the walk reaches the end of the
    sequence, more bases are read from ``sequenceData`` beyond the last
    block and the walk continues on the extended sequence.

    Parameters:
        outputFile: forwarded to ``writeKozakOutput2``.
        sequenceData: passed to ``gencode.readSequence``; its length bounds
            the extension.
        transcript: prefix of the generated names.
        cdss: CDS records with 'start' and 'end'; the first block also
            supplies the fields copied into the synthetic genome record.
        stopCodon: stop-codon record with 'start'/'end', or a falsy value.
        direction: '+' or '-'.

    NOTE(review): the source reached review with its line structure
    collapsed; the nesting below (extension step inside the inner scan loop)
    is the most plausible reading -- confirm against the original script.
    """
    cdsSequence = ""
    cdssAndStopCodon = cdss[:]
    for cds in cdssAndStopCodon:
        cdsSequence += gencode.readSequence(sequenceData, cds['start'], cds['end'], direction)
    stopCodonSequence = ""
    if stopCodon:
        #stop codon is always last
        cdssAndStopCodon.append(stopCodon)
        stopCodonSequence += gencode.readSequence(sequenceData, stopCodon['start'], stopCodon['end'], direction)
    # Bases read beyond the last block; filled in lazily by the scan below.
    afterCDSAndStopCodonSequence = ""
    # Synthetic record of type 'genome' built from the first block's fields;
    # 'start' and 'end' are only added once an extension has been read.
    genomeRecord = {
        'chromosome': cdssAndStopCodon[0]['chromosome'],
        'type': 'genome',
        'transcriptName': cdssAndStopCodon[0]['transcriptName'],
        'geneID': cdssAndStopCodon[0]['geneID'],
        'geneName': cdssAndStopCodon[0]['geneName'],
        'transcriptStatus': cdssAndStopCodon[0]['transcriptStatus'],
        'geneStatus': cdssAndStopCodon[0]['geneStatus'],
        'levelNumber': ''
    }
    sequence = cdsSequence + stopCodonSequence + afterCDSAndStopCodonSequence
    # Start codons are searched in the CDS only; stops in the full sequence.
    atgSequenceLength = len(cdsSequence)
    atgOffsetToSearch = 0
    sequenceLength = len(sequence)
    transcriptNumber = 1
    # How far past the last block the next extension reaches; grows by 300
    # on every extension.
    genomeLength = 300
    while atgOffsetToSearch < atgSequenceLength - 2:
        atgIndex = cdsSequence.find('ATG', atgOffsetToSearch)
        if atgIndex == -1:
            break
        # Resume the next search one base later so overlapping ATGs count.
        atgOffsetToSearch = atgIndex + 1
        if atgIndex < atgSequenceLength - 2:
            endIndex = atgIndex + 3
            while endIndex < sequenceLength - 2:
                threeCharacterSequence = sequence[endIndex:endIndex + 3]
                if threeCharacterSequence == 'TAG' or threeCharacterSequence == 'TAA' or threeCharacterSequence == 'TGA':
                    # NOTE(review): kozakSequence and found are assigned but
                    # never read inside this function.
                    kozakSequence = sequence[atgIndex:endIndex + 3]
                    writeKozakOutput2(
                        outputFile, transcript + ".kozak_cds."
                        + str(transcriptNumber), cdssAndStopCodon, genomeRecord,
                        atgIndex, endIndex + 3, direction)
                    transcriptNumber += 1
                    found = True
                    break
                # Stay in frame with the ATG.
                endIndex += 3
                # Ran off the end without a stop codon: read further bases
                # beyond the last block and keep scanning.
                if endIndex >= sequenceLength - 2 and sequenceLength < len(
                        sequenceData):
                    if direction == '+':
                        afterCDSAndStopCodonSequence = gencode.readSequence(
                            sequenceData, cdssAndStopCodon[-1]['end'] + 1,
                            min(cdssAndStopCodon[-1]['end'] + genomeLength,
                                len(sequenceData)), direction)
                        genomeRecord['start'] = cdssAndStopCodon[-1]['end'] + 1
                        genomeRecord['end'] = len(sequenceData)
                    elif direction == '-':
                        afterCDSAndStopCodonSequence = gencode.readSequence(
                            sequenceData,
                            max(cdssAndStopCodon[-1]['start'] - genomeLength, 1),
                            cdssAndStopCodon[-1]['start'] - 1, direction)
                        genomeRecord['start'] = 1
                        genomeRecord['end'] = cdssAndStopCodon[-1]['start'] - 1
                    sequence = cdsSequence + stopCodonSequence + afterCDSAndStopCodonSequence
                    sequenceLength = len(sequence)
                    genomeLength += 300
def findKozak(utrOutputFile, cdsOutputFile, transcript, sequenceData, fivePrimeSequence, fivePrimeUTRs, cdss, stopCodon, direction):
    """Report every 'ATG' in the 5' UTR and in the CDS of a transcript.

    The search string is: up to 9 flanking bases before the UTR + the 5' UTR
    sequence + the CDS sequence + up to 7 flanking bases after the CDS.  Each
    'ATG' whose start lies in the UTR part is passed to ``writeKozakOutput``
    with ``utrOutputFile`` and the label "kozak_utr"; each one starting in
    the CDS part goes to ``cdsOutputFile`` with the label "kozak_cds".  The
    running version number restarts at 1 for each of the two regions.

    Parameters:
        utrOutputFile, cdsOutputFile: forwarded to ``writeKozakOutput``.
        transcript: forwarded to ``writeKozakOutput``.
        sequenceData: passed to ``gencode.readSequence``; its length bounds
            the trailing flank.
        fivePrimeSequence: 5' UTR sequence.
        fivePrimeUTRs: UTR records with 'start'.
        cdss: CDS records with 'start' and 'end'.
        stopCodon: unused here.
        direction: "+" selects the first UTR / last CDS block for the
            flanks, anything else the last UTR / first CDS block.

    NOTE(review): for the non-"+" case the flanks are still read at
    ``start - 9 .. start - 1`` and ``end + 1 .. end + 7``; confirm that this
    matches the record ordering and the semantics of
    ``gencode.readSequence`` on the reverse strand.
    """
    coding = ""
    for block in cdss:
        coding += gencode.readSequence(sequenceData, block['start'], block['end'], direction)

    # Which UTR / CDS block anchors the flanking reads.
    utr_pick, cds_pick = (0, -1) if direction == "+" else (-1, 0)

    leading_flank = ""
    utr_start = fivePrimeUTRs[utr_pick]['start']
    if utr_start - 9 > 0:
        leading_flank = gencode.readSequence(sequenceData, utr_start - 9, utr_start - 1, direction)

    trailing_flank = ""
    cds_end = cdss[cds_pick]['end']
    if cds_end + 7 <= len(sequenceData):
        trailing_flank = gencode.readSequence(sequenceData, cds_end + 1, cds_end + 7, direction)

    full = leading_flank + fivePrimeSequence + coding + trailing_flank
    utr_begin = len(leading_flank)
    cds_begin = utr_begin + len(fivePrimeSequence)

    # (output file, records, first index of region, region length, label)
    regions = (
        (utrOutputFile, fivePrimeUTRs, utr_begin, len(fivePrimeSequence), "kozak_utr"),
        (cdsOutputFile, cdss, cds_begin, len(coding), "kozak_cds"),
    )
    for out_file, region_records, begin, length, label in regions:
        # An ATG must start at least three bases before the region ends.
        limit = begin + length - 2
        version = 1
        cursor = begin
        while cursor < limit:
            hit = full.find('ATG', cursor)
            if hit == -1:
                break
            if hit < limit:
                writeKozakOutput(out_file, region_records[0], hit,
                                 hit - begin + 1, length, full, label,
                                 transcript, version, direction)
                version += 1
            cursor = hit + 1
def findUORF(outputFile, outputSequenceFile, sequenceData, divisibleByThree, transcript, fivePrimeSequence, fivePrimeUTRs, cdss, stopCodon, direction):
    """Find upstream ORFs that begin with ``StartCodon`` in the 5' UTR.

    The transcript sequence is ``fivePrimeSequence`` + all CDS blocks + the
    optional stop codon.  For each occurrence of the module-level
    ``StartCodon`` in the UTR, the sequence is scanned base by base for the
    first 'TAG', 'TAA' or 'TGA' (the first in-frame one when
    ``divisibleByThree`` is true).  A hit is written as a two-line FASTA-like
    entry to ``outputSequenceFile`` and passed to ``writeUORFOutput``.  The
    header's last field tells where the ORF ends: "UTRonly", "CDSpartial" or
    "CDSfull".

    Parameters:
        outputFile: forwarded to ``writeUORFOutput``.
        outputSequenceFile: receives the header and sequence lines.
        sequenceData: passed unchanged to ``gencode.readSequence``.
        divisibleByThree: when true, only stop codons in frame with the
            start codon terminate an ORF.
        transcript: prefix of the generated ORF names.
        fivePrimeSequence: 5' UTR sequence searched for start codons.
        fivePrimeUTRs: UTR records; the first supplies 'chromosome',
            'geneID' and 'geneName'.
        cdss: CDS records with 'start' and 'end'.
        stopCodon: stop-codon record with 'start'/'end', or a falsy value.
        direction: strand marker written to the header and forwarded on.
    """
    stop_codons = ('TAG', 'TAA', 'TGA')

    blocks = cdss[:]
    coding = ""
    for block in blocks:
        coding += gencode.readSequence(sequenceData, block['start'], block['end'], direction)

    stop_text = ""
    if stopCodon:
        # The stop codon always comes last in the block list.
        blocks.append(stopCodon)
        stop_text += gencode.readSequence(sequenceData, stopCodon['start'], stopCodon['end'], direction)

    full = fivePrimeSequence + coding + stop_text
    utr_length = len(fivePrimeSequence)
    coding_end = utr_length + len(coding)
    scan_limit = len(full) - 2
    start_limit = utr_length - 2

    orf_count = 1
    cursor = 0
    while cursor < start_limit:
        orf_start = fivePrimeSequence.find(StartCodon, cursor)
        if orf_start == -1:
            break
        # Continue one base further next time so overlapping starts count.
        cursor = orf_start + 1
        if orf_start >= start_limit:
            continue

        probe = orf_start + 3
        while probe < scan_limit:
            is_stop = full[probe:probe + 3] in stop_codons
            if is_stop and (not divisibleByThree or (probe - orf_start) % 3 == 0):
                orf_end = probe + 3
                orf_text = full[orf_start:orf_end]

                if orf_end <= utr_length:
                    reach = "UTRonly"
                elif orf_end <= coding_end:
                    reach = "CDSpartial"
                else:
                    reach = "CDSfull"

                utr_record = fivePrimeUTRs[0]
                header_fields = [
                    utr_record["chromosome"],
                    utr_record["geneID"],
                    transcript + ".uORF_" + StartCodon + "." + str(orf_count),
                    direction,
                    utr_record["geneName"],
                    reach,
                ]
                outputSequenceFile.write("|".join(header_fields) + "\n")
                outputSequenceFile.write(orf_text + "\n")
                writeUORFOutput(
                    outputFile,
                    transcript + ".uORF_" + StartCodon + "." + str(orf_count),
                    fivePrimeUTRs, blocks, orf_start, orf_end, direction)
                orf_count += 1
                break
            probe += 1