def cutSequence(sequence, motif) :
    """Cut a sequence at every occurrence of a motif and return the fragments.

    Input:
    sequence = the sequence: an object with a 'seq' attribute (treated as a
               SeqRecord), or anything whose str() is the sequence text
               (a Seq or a plain string)
    motif = a string containing a '-' at the cut site ; e.g. 'g-aattc' for
            EcoRI. If no '-' is present the cut is made at the end of the
            motif.
    Output:
    a list of SeqRecord fragments in sequence order. Matching is
    case-insensitive and the fragments hold the upper-cased text. Each
    fragment is named '<motif>_<start>_<end>' (prefixed with
    '<record name>_' for SeqRecord input), where start and end are 1-based
    inclusive positions in the input sequence. Empty fragments are never
    returned, so an empty input gives an empty list.
    """
    # clean the cutting motif
    pattern = motif.replace('-', '').upper()
    cutSite = motif.find('-')
    if cutSite == -1:
        cutSite = len(motif)  # if no '-' is found the cut is at the end of the motif

    # extract the sequence string
    # NOTE(review): str() is used instead of the old '.data' attribute on the
    # assumption that str(Seq) yields the full sequence text - confirm
    # against the Biopython version in use.
    recordSeq = getattr(sequence, 'seq', None)
    if recordSeq is not None:
        sequenceString = str(recordSeq).upper()
        name = sequence.name + '_' + motif + '_'
    else:
        sequenceString = str(sequence).upper()
        name = motif + '_'

    fragments = []
    lastPosition = len(sequenceString)
    fragmentStart = 0  # 0-based index where the current fragment begins

    # An empty pattern would match at every index, so treat it as "no site".
    if pattern:
        nextSite = sequenceString.find(pattern)
    else:
        nextSite = -1

    # str.find reports a miss with -1, so a hit at index 0 is a real site.
    while nextSite != -1:
        cutPosition = nextSite + cutSite
        # Skip cuts that would produce an empty fragment (site at the very
        # start of the sequence with the cut in front of the motif).
        if cutPosition > fragmentStart:
            newFragment = SeqRecord(sequenceString[fragmentStart:cutPosition])
            newFragment.name = name + str(fragmentStart + 1) + '_' + str(cutPosition)
            fragments.append(newFragment)
            fragmentStart = cutPosition
        # Resume at the cut, but always at least one character past the
        # current site so a cut offset of 0 cannot find the same site again.
        nextSite = sequenceString.find(pattern, max(cutPosition, nextSite + 1))

    # add the remaining sequence
    if fragmentStart < lastPosition:
        lastFragment = SeqRecord(sequenceString[fragmentStart:])
        lastFragment.name = name + str(fragmentStart + 1) + '_' + str(lastPosition)
        fragments.append(lastFragment)
    return fragments
Example #2
0
def constructRandomChunks(sequences, numChunks=10, chunksize=1000):
    """Construct random sequence chunks from each input record.

    For every record in ``sequences``, ``numChunks`` slices of at most
    ``chunksize`` positions are taken from ``record.seq`` at random start
    positions. Each slice is wrapped in a SeqRecord whose id is a freshly
    generated UUID string.

    Input:
    sequences = iterable of records providing len(), a 'seq' attribute that
                can be sliced, and a 'description' attribute
    numChunks = number of chunks drawn per record
    chunksize = length of each chunk; a record shorter than this yields
                chunks covering the whole record
    Output:
    a tuple (rndChunks, mapping): the chunk SeqRecords in shuffled order,
    and a dict mapping each chunk id to the description of the record it
    was cut from
    """
    rndChunks = []
    mapping = {}
    for s in sequences:
        # Clamp the last valid start to 0: for a record shorter than
        # chunksize the upper bound would be negative and randint would
        # raise ValueError on the empty range.
        lastStart = max(0, len(s) - chunksize)
        for _ in range(numChunks):
            rndPos = random.randint(0, lastStart)
            rndSeq = s.seq[rndPos:rndPos + chunksize]
            rndID = str(uuid.uuid1())
            rndChunks.append(SeqRecord(rndSeq, rndID))
            mapping[rndID] = s.description
    # Shuffle so chunks from the same record are not adjacent in the output.
    random.shuffle(rndChunks)
    return rndChunks, mapping
def cutSequence(sequence, motif):
    """Cut a sequence at every occurrence of a motif and return the fragments.

    Input:
    sequence = the sequence: an object with a 'seq' attribute (treated as a
               SeqRecord), or anything whose str() is the sequence text
               (a Seq or a plain string)
    motif = a string containing a '-' at the cut site ; e.g. 'g-aattc' for
            EcoRI. If no '-' is present the cut is made at the end of the
            motif.
    Output:
    a list of SeqRecord fragments in sequence order. Matching is
    case-insensitive and the fragments hold the upper-cased text. Each
    fragment is named '<motif>_<start>_<end>' (prefixed with
    '<record name>_' for SeqRecord input), where start and end are 1-based
    inclusive positions in the input sequence. Empty fragments are never
    returned, so an empty input gives an empty list.
    """
    # clean the cutting motif
    pattern = motif.replace('-', '').upper()
    cutSite = motif.find('-')
    if cutSite == -1:
        cutSite = len(motif)  # if no '-' is found the cut is at the end of the motif

    # extract the sequence string
    # NOTE(review): str() is used instead of the old '.data' attribute on the
    # assumption that str(Seq) yields the full sequence text - confirm
    # against the Biopython version in use.
    recordSeq = getattr(sequence, 'seq', None)
    if recordSeq is not None:
        sequenceString = str(recordSeq).upper()
        name = sequence.name + '_' + motif + '_'
    else:
        sequenceString = str(sequence).upper()
        name = motif + '_'

    fragments = []
    lastPosition = len(sequenceString)
    fragmentStart = 0  # 0-based index where the current fragment begins

    # An empty pattern would match at every index, so treat it as "no site".
    if pattern:
        nextSite = sequenceString.find(pattern)
    else:
        nextSite = -1

    # str.find reports a miss with -1, so a hit at index 0 is a real site.
    while nextSite != -1:
        cutPosition = nextSite + cutSite
        # Skip cuts that would produce an empty fragment (site at the very
        # start of the sequence with the cut in front of the motif).
        if cutPosition > fragmentStart:
            newFragment = SeqRecord(sequenceString[fragmentStart:cutPosition])
            newFragment.name = name + str(fragmentStart + 1) + '_' + str(cutPosition)
            fragments.append(newFragment)
            fragmentStart = cutPosition
        # Resume at the cut, but always at least one character past the
        # current site so a cut offset of 0 cannot find the same site again.
        nextSite = sequenceString.find(pattern, max(cutPosition, nextSite + 1))

    # add the remaining sequence
    if fragmentStart < lastPosition:
        lastFragment = SeqRecord(sequenceString[fragmentStart:])
        lastFragment.name = name + str(fragmentStart + 1) + '_' + str(lastPosition)
        fragments.append(lastFragment)
    return fragments
def main():
    """Align coding DNA sequences codon-by-codon via their protein translation.

    Steps:
    1. Read FASTA DNA sequences from the file named by the first
       command-line argument, or from fileinput's default source when no
       argument is given.
    2. Translate each sequence with NCBI translation table 11 and replace
       stop codons ('*') with 'X'.
    3. Pipe the proteins through muscle and read back the aligned FASTA.
    4. Rebuild a DNA alignment by emitting '---' for every protein gap and
       the next source codon for every residue.
    5. Write the DNA alignment in PHYLIP format to '<input>_aln.phy'
       (or 'out_aln.phy' when there is no input file name).
    """
    # Configuration
    # Select the desired NCBI translation table
    translationTable = 11

    # Open the DNA sequence file and read the fasta sequences into a dictionary
    if (len(argv) > 1):
        dnaFileName = argv[1]
    else:
        dnaFileName = None
    dnaSeqFile = fileinput.input(dnaFileName)
    try:
        # to_dict consumes the whole parser, so the input can be closed
        # as soon as it returns (or fails).
        dnaSeqDict = SeqIO.to_dict(SeqIO.parse(dnaSeqFile, "fasta"))
    finally:
        dnaSeqFile.close()

    # Translate the sequences, replacing stop codons with X (unknown aa)
    # so muscle doesn't drop them
    aaSeqRecords = []
    for key in dnaSeqDict:
        aaText = str(dnaSeqDict[key].seq.translate(table=translationTable))
        aaSeqRecords.append(SeqRecord(Seq(aaText.replace('*', 'X')), id=key))

    # Align the aa sequences
    commandLine = str(MuscleCommandline(seqtype='protein'))
    childProcess = subprocess.Popen(commandLine, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=(sys.platform!="win32")) #don't pipe stderr or muscle hangs
    try:
        try:
            SeqIO.write(aaSeqRecords, childProcess.stdin, "fasta")
        finally:
            # Closing stdin signals end-of-input to muscle; do it even on
            # failure so the child is not left waiting for more data.
            childProcess.stdin.close()
        aaAlignment = AlignIO.read(childProcess.stdout, "fasta")
    finally:
        # Release the pipe and reap the child so it does not linger.
        childProcess.stdout.close()
        childProcess.wait()

    # Convert the aa alignment into a dna alignment
    dnaAlignment = Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
    for taxon in aaAlignment:
        aaCount = 0
        codons = []
        for aaResidue in taxon.seq:
            if (aaResidue == '-'):
                codons.append('---')
            else:
                # aaCount counts only real residues, so it indexes the
                # ungapped source DNA in steps of one codon.
                codons.append(str(dnaSeqDict[taxon.id].seq[aaCount*3:aaCount*3+3]))
                aaCount += 1
        # As we add the sequences to the alignment remove gene name from the sequence id so the taxon names match the PAML constraint tree
        dnaAlignment.add_sequence(taxon.id.split('_')[0], ''.join(codons))

    # NOTE(review): split('.')[0] keeps only the text before the FIRST dot,
    # so names such as './seqs.fasta' or 'a.b.fasta' lose more than the
    # extension. Left unchanged because existing output names may be
    # relied upon - confirm before switching to os.path.splitext.
    if (dnaFileName):
        outFileName = dnaFileName.split('.')[0] + '_aln.phy'
    else:
        outFileName = 'out_aln.phy'

    # The PHYLIP header is deliberately written without the interleaved 'I'
    # tag that PAML requires: with the tag present the file can no longer be
    # opened by Biopython-based scripts (e.g. for manual editing), so the tag
    # is added by pamlize.py right before running PAML instead.
    with open(outFileName, 'w') as outFile:
        AlignIO.write([dnaAlignment], outFile, "phylip")
Example #5
0
    alignmentNoGenSignalIterator = AlignIO.parse(fastaFileCladeNoGeneralSignal,"fasta",alphabet=Gapped(IUPAC.ExtendedIUPACProtein(),"-"));
    alignmentIterator = AlignIO.parse(fastaFileClade,"fasta",alphabet=Gapped(IUPAC.ExtendedIUPACProtein(),"-"));

    noGenSignalAlignment = alignmentNoGenSignalIterator.next()
    queryFasta = resultFolder+"/"+"Query_%d.faa" % (entryToTest,)
    ownCladeProfile = resultFolder+"/"+"ForOwnCladeProfile_%d.faa" % (entryToTest,)
    #print testAlignment[entryToTest].id
    #print testAlignment[entryToTest].seq

    alignmentWithSignal = alignmentIterator.next()
    desiredSeqString = str(alignmentWithSignal[entryToTest-1].seq)
    desiredSeqString = desiredSeqString.replace("-", "")
    #print desiredSeqString
    seqNoGaps = Seq(desiredSeqString, alphabet=IUPAC.ExtendedIUPACProtein())
    #print seqNoGaps
    seqRecNoGaps = SeqRecord(seq=seqNoGaps, id=alignmentWithSignal[entryToTest-1].id)
    #print seqRecNoGaps.seq
    #print seqRecNoGaps.id
    SeqIO.write(seqRecNoGaps, queryFasta, "fasta")
    
    #print "Number of entries: ",len(testAlignment)
    # Here we remove the desired element
    newTestAlignment = []
    for i in range(len(noGenSignalAlignment)):
        if i != entryToTest-1 :
            newTestAlignment.append(noGenSignalAlignment[i])

    newAlignment = MultipleSeqAlignment(newTestAlignment)

    AlignIO.write(newAlignment, ownCladeProfile,"fasta")
    #print "Number of entries after: ",len(newTestAlignment)