def cutSequence(sequence, motif):
    """Takes a sequence and a motif and returns the sequence cut at this motif.

    Input:
    sequence = a SeqRecord, a Seq or a plain string; matching is done on the
               upper-cased sequence, so it is case-insensitive
    motif = a string containing a '-' at the cut site ; e.g. 'g-aattc' for
            EcoRI. If no '-' is present the cut is placed at the end of the
            motif.

    Output:
    a list of SeqRecord fragments, in sequence order. Each fragment is named
    '<record name>_<motif>_<start>_<end>' (the record name part is only
    present for SeqRecord input), where start and end are 1-based inclusive
    positions in the input sequence. Empty fragments are never returned; an
    empty motif, or a motif that does not occur, yields the whole sequence as
    a single fragment.
    """
    # clean the cutting motif
    pattern = motif.replace('-', '').upper()
    cutSite = motif.find('-')
    if cutSite == -1:
        # if no '-' is found the cut is at the end of the motif
        cutSite = len(motif)

    # extract the sequence string
    if hasattr(sequence, 'seq'):
        # SeqRecord: keep its name as a prefix of the fragment names
        sequenceString = str(sequence.seq).upper()
        name = sequence.name + '_' + motif + '_'
    else:
        # Seq or plain string
        sequenceString = str(sequence).upper()
        name = motif + '_'

    fragments = []
    lastCut = 0  # 0-based position where the current fragment starts
    # str.find returns -1 (not 0) when nothing is found; 0 is a valid site.
    # An empty pattern would match everywhere, so it is treated as "no site".
    nextSite = sequenceString.find(pattern) if pattern else -1
    while nextSite != -1:
        cut = nextSite + cutSite
        if cut > lastCut:
            # NOTE(review): the fragment is handed to SeqRecord as a plain
            # string, exactly as the previous implementation did; confirm the
            # installed Biopython accepts that.
            newFragment = SeqRecord(sequenceString[lastCut:cut])
            newFragment.name = name + str(lastCut + 1) + '_' + str(cut)
            fragments.append(newFragment)
            lastCut = cut
        # Resume after the cut (a cut site is consumed), but always move at
        # least one base forward so a motif starting with '-' cannot match
        # the same site forever.
        nextSite = sequenceString.find(pattern, max(cut, nextSite + 1))

    # add the remaining sequence
    if lastCut < len(sequenceString):
        lastFragment = SeqRecord(sequenceString[lastCut:])
        lastFragment.name = name + str(lastCut + 1) + '_' + str(len(sequenceString))
        fragments.append(lastFragment)
    return fragments
def constructRandomChunks(sequences, numChunks=10, chunksize=1000):
    """Cut random chunks out of every sequence record.

    Input:
    sequences = an iterable of records exposing len(), .seq and .description
    numChunks = number of chunks drawn from each record
    chunksize = length of each chunk; a record shorter than chunksize yields
                its whole sequence as the chunk instead of raising ValueError

    Output:
    a tuple (rndChunks, mapping): rndChunks is the shuffled list of SeqRecord
    chunks, each identified by a fresh UUID string, and mapping associates
    every chunk id with the description of the record it was cut from.
    """
    rndChunks = []
    mapping = {}
    for s in sequences:
        # Largest valid start position; clamped at 0 so random.randint never
        # receives an empty range for short records.
        lastStart = max(0, len(s) - chunksize)
        for _ in range(numChunks):
            rndPos = random.randint(0, lastStart)
            rndSeq = s.seq[rndPos:rndPos + chunksize]
            rndID = str(uuid.uuid1())
            rndChunks.append(SeqRecord(rndSeq, rndID))
            mapping[rndID] = s.description
    # Shuffle so chunk order does not reveal the source record
    random.shuffle(rndChunks)
    return rndChunks, mapping
def cutSequence(sequence, motif):
    """Takes a sequence and a motif and returns the sequence cut at this motif.

    Input:
    sequence = a SeqRecord, a Seq or a plain string; matching is done on the
               upper-cased sequence, so it is case-insensitive
    motif = a string containing a '-' at the cut site ; e.g. 'g-aattc' for
            EcoRI. If no '-' is present the cut is placed at the end of the
            motif.

    Output:
    a list of SeqRecord fragments, in sequence order. Each fragment is named
    '<record name>_<motif>_<start>_<end>' (the record name part is only
    present for SeqRecord input), where start and end are 1-based inclusive
    positions in the input sequence. Empty fragments are never returned; an
    empty motif, or a motif that does not occur, yields the whole sequence as
    a single fragment.
    """
    # clean the cutting motif
    pattern = motif.replace('-', '').upper()
    cutSite = motif.find('-')
    if cutSite == -1:
        # if no '-' is found the cut is at the end of the motif
        cutSite = len(motif)

    # extract the sequence string
    if hasattr(sequence, 'seq'):
        # SeqRecord: keep its name as a prefix of the fragment names
        sequenceString = str(sequence.seq).upper()
        name = sequence.name + '_' + motif + '_'
    else:
        # Seq or plain string
        sequenceString = str(sequence).upper()
        name = motif + '_'

    fragments = []
    lastCut = 0  # 0-based position where the current fragment starts
    # str.find returns -1 (not 0) when nothing is found; 0 is a valid site.
    # An empty pattern would match everywhere, so it is treated as "no site".
    nextSite = sequenceString.find(pattern) if pattern else -1
    while nextSite != -1:
        cut = nextSite + cutSite
        if cut > lastCut:
            # NOTE(review): the fragment is handed to SeqRecord as a plain
            # string, exactly as the previous implementation did; confirm the
            # installed Biopython accepts that.
            newFragment = SeqRecord(sequenceString[lastCut:cut])
            newFragment.name = name + str(lastCut + 1) + '_' + str(cut)
            fragments.append(newFragment)
            lastCut = cut
        # Resume after the cut (a cut site is consumed), but always move at
        # least one base forward so a motif starting with '-' cannot match
        # the same site forever.
        nextSite = sequenceString.find(pattern, max(cut, nextSite + 1))

    # add the remaining sequence
    if lastCut < len(sequenceString):
        lastFragment = SeqRecord(sequenceString[lastCut:])
        lastFragment.name = name + str(lastCut + 1) + '_' + str(len(sequenceString))
        fragments.append(lastFragment)
    return fragments
def main():
    """Build a codon-based DNA alignment from unaligned DNA fasta sequences.

    The DNA sequences are read from the file named by the first command-line
    argument (or from fileinput's default source when no argument is given),
    translated with NCBI translation table 11, aligned as proteins with
    muscle, and the protein alignment is mapped back onto the original
    codons. The DNA alignment is written in phylip format to
    '<input name up to the first dot>_aln.phy', or to 'out_aln.phy' when no
    file name was given.
    """
    # Configuration
    # Select the desired NCBI translation table
    translationTable = 11

    # Open the DNA sequence file and read the fasta sequences into a dictionary
    if len(argv) > 1:
        dnaFileName = argv[1]
    else:
        dnaFileName = None
    dnaSeqFile = fileinput.input(dnaFileName)
    try:
        dnaSeqDict = SeqIO.to_dict(SeqIO.parse(dnaSeqFile, "fasta"))
    finally:
        # The dictionary is fully built at this point, so the input can be
        # released even if parsing failed.
        dnaSeqFile.close()

    # Translate the sequences, replacing stop codons with X (unknown aa) so
    # muscle doesn't drop them
    aaSeqRecords = []
    for key in dnaSeqDict:
        translated = dnaSeqDict[key].seq.translate(table=translationTable)
        noStopCodonSeq = str(translated).replace('*', 'X')
        aaSeqRecords.append(SeqRecord(Seq(noStopCodonSeq), id=key))

    # Align the aa sequences
    commandLine = str(MuscleCommandline(seqtype='protein'))
    # don't pipe stderr or muscle hangs
    childProcess = subprocess.Popen(commandLine,
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    shell=(sys.platform != "win32"))
    SeqIO.write(aaSeqRecords, childProcess.stdin, "fasta")
    childProcess.stdin.close()
    try:
        aaAlignment = AlignIO.read(childProcess.stdout, "fasta")
    finally:
        # Release the pipe and reap the child so no zombie process is left
        childProcess.stdout.close()
        childProcess.wait()

    # Convert the aa alignment into a dna alignment
    dnaAlignment = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
    for taxon in aaAlignment:
        dnaSource = dnaSeqDict[taxon.id].seq
        aaCount = 0
        codons = []
        for aaResidue in taxon.seq:
            if aaResidue == '-':
                # an alignment gap spans a whole codon
                codons.append('---')
            else:
                codons.append(str(dnaSource[aaCount * 3:aaCount * 3 + 3]))
                aaCount += 1
        # As we add the sequences to the alignment remove the gene name from
        # the sequence id so the taxon names match the PAML constraint tree
        dnaAlignment.add_sequence(taxon.id.split('_')[0], ''.join(codons))

    if dnaFileName:
        outFileName = dnaFileName.split('.')[0] + '_aln.phy'
    else:
        outFileName = 'out_aln.phy'

    # Biopython doesn't tag interleaved phylip files with the ' I' flag that
    # PAML requires. The flag is deliberately NOT appended here: with it the
    # alignment can no longer be opened by Biopython-based scripts (for
    # manual editing etc). pamlize.py can add the 'I' right before using paml.
    with open(outFileName, 'w+') as outFile:
        AlignIO.write([dnaAlignment], outFile, "phylip")
# Read the first alignment from each of the two clade fasta files. The
# builtin next() is used instead of the iterator's .next() method, which only
# exists on Python 2.
alignmentNoGenSignalIterator = AlignIO.parse(fastaFileCladeNoGeneralSignal, "fasta", alphabet=Gapped(IUPAC.ExtendedIUPACProtein(), "-"))
alignmentIterator = AlignIO.parse(fastaFileClade, "fasta", alphabet=Gapped(IUPAC.ExtendedIUPACProtein(), "-"))
noGenSignalAlignment = next(alignmentNoGenSignalIterator)

# Output files: the query sequence and the alignment without that sequence
queryFasta = resultFolder + "/" + "Query_%d.faa" % (entryToTest,)
ownCladeProfile = resultFolder + "/" + "ForOwnCladeProfile_%d.faa" % (entryToTest,)

# Write row entryToTest-1 of the alignment, with its gaps stripped, as the
# query sequence
alignmentWithSignal = next(alignmentIterator)
desiredSeqString = str(alignmentWithSignal[entryToTest - 1].seq)
desiredSeqString = desiredSeqString.replace("-", "")
seqNoGaps = Seq(desiredSeqString, alphabet=IUPAC.ExtendedIUPACProtein())
seqRecNoGaps = SeqRecord(seq=seqNoGaps, id=alignmentWithSignal[entryToTest - 1].id)
SeqIO.write(seqRecNoGaps, queryFasta, "fasta")

# Here we remove the desired element: keep every row of the other alignment
# except row entryToTest-1
newTestAlignment = []
for i, record in enumerate(noGenSignalAlignment):
    if i != entryToTest - 1:
        newTestAlignment.append(record)
newAlignment = MultipleSeqAlignment(newTestAlignment)
AlignIO.write(newAlignment, ownCladeProfile, "fasta")