# For more information: http://emboss.sourceforge.net/download/

Entrez.email = "*****@*****.**"

if __name__ == "__main__":
    with open('data/data.txt') as dataset:
        ids = dataset.read().split()

    handle = Entrez.efetch(db = 'nucleotide', id = ids, rettype = "fasta")
    records = list(SeqIO.parse(handle, 'fasta'))

    for i, r in enumerate(records):
        with open(ids[i], 'w') as f:
            SeqIO.write(r, f, 'fasta')

    needle_cline = NeedleCommandline()
    needle_cline.asequence = ids[0]
    needle_cline.bsequence = ids[1]
    needle_cline.outfile = "rosalind_need_output.txt"
    needle_cline.gapopen = 10
    needle_cline.gapextend = 1
    needle_cline.endopen = 10
    needle_cline.endextend = 1
    needle_cline.endweight = True
    needle_cline()

    with open('rosalind_need_output.txt') as f:
        output = f.readlines()

    for line in output:
        if 'Score:' in line:
Example #2
0
def needle_alignment_emboss(s1, s2):
    """Globally align two protein sequences with EMBOSS needle.

    Both sequences are passed to needle literally through the ``asis:``
    prefix and the alignment is parsed from the child's standard output.

    Args:
        s1: first sequence, as a plain string.
        s2: second sequence, as a plain string.

    Returns:
        The object produced by ``AlignIO.read(..., "emboss")`` for the
        needle output.
    """
    import subprocess
    from Bio.Emboss.Applications import NeedleCommandline
    from Bio import AlignIO
    cline = NeedleCommandline(auto=True, sprotein=True, stdout=True, gapopen=10, gapextend=1)
    cline.asequence = "asis:" + s1
    cline.bsequence = "asis:" + s2
    # NOTE(review): the sequences end up inside a shell command string; only
    # call this with trusted input.
    process = subprocess.Popen(str(cline), shell=True, stdout=subprocess.PIPE,
            universal_newlines=True)
    try:
        return AlignIO.read(process.stdout, "emboss")
    finally:
        # Close the pipe and reap the child so repeated calls do not leak
        # file descriptors or leave zombie processes behind.
        process.stdout.close()
        process.wait()
Example #3
0
def needle_alignment(s1, s2):
    '''
DESCRIPTION

    Does a Needleman-Wunsch Alignment of sequence s1 and s2 (both given as
    plain protein sequence strings) with EMBOSS needle and returns the
    alignment object parsed by Bio.AlignIO from needle's output.
    '''
    import subprocess
    from Bio.Emboss.Applications import NeedleCommandline
    from Bio import AlignIO
    cline = NeedleCommandline(auto=True, sprotein=True, stdout=True, gapopen=10, gapextend=1)
    cline.asequence = "asis:" + s1
    cline.bsequence = "asis:" + s2
    # universal_newlines=True opens the pipe in text mode; the "emboss"
    # parser works on text lines and chokes on a bytes pipe under Python 3.
    # NOTE(review): the sequences end up inside a shell command string; only
    # call this with trusted input.
    process = subprocess.Popen(str(cline), shell=True, stdout=subprocess.PIPE,
                               universal_newlines=True)
    try:
        return AlignIO.read(process.stdout, "emboss")
    finally:
        # Close the pipe and reap the child so repeated calls do not leak
        # file descriptors or leave zombie processes behind.
        process.stdout.close()
        process.wait()
Example #4
0
 def test_needle_needs_output(self):
     """Needle with neither outfile nor stdout/filter must fail to render."""
     needle_options = dict(
         cmd=exes["needle"],
         asequence="asis:ACCCGGGCGCGGT",
         bsequence="asis:ACCCGAGCGCGGT",
         gapopen=10,
         gapextend=0.5,
         auto=True,
     )
     cline = NeedleCommandline(**needle_options)
     # -auto was requested, but no output destination of any kind was set.
     self.assertTrue(cline.auto)
     self.assertFalse(cline.stdout)
     self.assertFalse(cline.filter)
     self.assertIsNone(cline.outfile)
     # Turning the command line into a string is what triggers validation.
     with self.assertRaises(ValueError):
         str(cline)
Example #5
0
def main():
    """Align the FASTA data of two GenBank ids given on the command line.

    Expects exactly two arguments; otherwise prints a usage line and exits
    with status 1. The needle report is written to
    ``<id1>_<id2>_needle.txt`` and the process then exits with status 0.
    """
    if len(sys.argv) != 3:
        print('usage {0:s} genbank_id1 genbank_id2'.format(sys.argv[0]))
        sys.exit(1)

    genbank_a, genbank_b = sys.argv[1:3]
    # presumably getFasta returns something needle accepts as a sequence
    # argument (e.g. a file name) - verify against its definition
    sequence_a = getFasta(genbank_a)
    sequence_b = getFasta(genbank_b)
    report_name = '{0:s}_{1:s}_needle.txt'.format(genbank_a, genbank_b)

    run_needle_cmd = NeedleCommandline(
        'needle',
        asequence=sequence_a,
        bsequence=sequence_b,
        gapopen=10,
        gapextend=1,
        endweight=True,
        endopen=10,
        endextend=1,
        outfile=report_name,
    )
    # The captured stdout/stderr of needle are not needed here.
    run_needle_cmd()
    sys.exit(0)
Example #6
0
def _needle(fa, fb, needlefile, a, b, results):
    """
    Run a single needle job and append its summary line to `results`.

    `fa` and `fb` are the two input sequence files, `needlefile` the report
    needle writes. The appended entry is a tab-joined line of `a`, `b` and
    the `identity` and `score` attributes of NeedleHeader(needlefile).
    The three files are handed to FileShredder afterwards (presumably to
    delete them - confirm against its definition).
    """
    from Bio.Emboss.Applications import NeedleCommandline

    run_job = NeedleCommandline(
        asequence=fa,
        bsequence=fb,
        gapopen=10,
        gapextend=0.5,
        outfile=needlefile,
    )
    run_job()

    header = NeedleHeader(needlefile)
    FileShredder([fa, fb, needlefile], verbose=False)
    results.append("\t".join((a, b, header.identity, header.score)))
Example #7
0
def needle(args):
    """
    %prog needle pairs a.pep.fasta b.pep.fasta

    Take protein pairs and needle them.
    """
    from Bio.Emboss.Applications import NeedleCommandline

    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.base import FileShredder

    # The docstring above doubles as the OptionParser usage text, so it is
    # runtime data and must keep its "%prog ..." form.
    p = OptionParser(needle.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    pairsfile, apep, bpep = args
    afasta = Fasta(apep)
    bfasta = Fasta(bpep)

    # Scratch files, rewritten for every pair and shredded after each run.
    afile = pairsfile + "_a.fasta"
    bfile = pairsfile + "_b.fasta"
    needlefile = pairsfile + "_ab.needle"

    # Context managers guarantee the handles are closed even if a row is
    # malformed or needle fails part-way through.
    with open(pairsfile) as fp:
        for row in fp:
            a, b = row.split()
            a = afasta[a]
            b = bfasta[b]
            with open(afile, "w") as fa:
                SeqIO.write([a], fa, "fasta")
            with open(bfile, "w") as fb:
                SeqIO.write([b], fb, "fasta")

            needle_cline = NeedleCommandline(asequence=afile,
                                             bsequence=bfile,
                                             gapopen=10,
                                             gapextend=0.5,
                                             outfile=needlefile)
            stdout, stderr = needle_cline()
            # Written this way (instead of the Python 2 print statement) so
            # the module parses under both Python 2 and Python 3.
            sys.stderr.write(stdout + stderr + "\n")
            nh = NeedleHeader(needlefile)
            print("\t".join((a.id, b.id, nh.identity, nh.score)))
            FileShredder([afile, bfile, needlefile])
def needle_align(fa1, fa2, gapopen=10.0, gapextend=0.5):
    ''' Uses needle to align two fastas with default gap penalties

        fa1, fa2: filenames of fastas to pairwise align. Must exist on disk when
                  command is called.
        gapopen: gap open penalty [default = 10.0]
        gapextend: gap extend penalty [default = 0.5]

        Returns the alignment parsed by AlignIO.read from needle's
        FASTA-formatted output (a MultipleSeqAlignment object).

    '''

    # Ask needle itself to write to standard output (-stdout, with -auto to
    # suppress prompts) instead of naming the POSIX-only '/dev/stdout' device
    # as the output file, so the call also works where that path is missing.
    needle_cmd = NeedleCommandline(asequence=fa1, bsequence=fa2,
                                   stdout=True, auto=True, aformat='fasta',
                                   gapopen=gapopen, gapextend=gapextend)
    # Calling the command line object returns (stdout, stderr) as text.
    aln_text, _ = needle_cmd()

    return AlignIO.read(StringIO(aln_text), format='fasta')
Example #9
0
def cluster_seq_support_nw(seq_dict, ident_thresh=0.90):
    """Count, per sequence, how many sequences are near-identical to it.

    Every pair of records is aligned globally with EMBOSS needle (gap open
    10, gap extend 0.5, report written to TEMP_DIR/needle.txt). The pairwise
    identity is the number of identical alignment columns divided by the
    alignment length.

    Args:
        seq_dict: mapping of identifier -> record; each record must expose
            its sequence as ``.seq``.
        ident_thresh: identity a pair must strictly exceed to count.

    Returns:
        dict mapping each identifier to the number of sequences (itself
        included, since self-identity is 1.0) whose identity to it is
        greater than ``ident_thresh``.
    """
    # Materialise the view: dict.items() cannot be indexed on Python 3.
    items = list(seq_dict.items())
    ident_matrix = np.identity(len(items))

    for ind1, (gi1, sr1) in enumerate(items):
        # Only the lower triangle is aligned; the matrix is filled
        # symmetrically below.
        for ind2 in range(ind1):
            gi2, sr2 = items[ind2]
            needle_out = TEMP_DIR + "/needle.txt"
            # str() so a plain string (not a Seq) is placed on the command line.
            needle_cline = NeedleCommandline(asequence="asis::" + str(sr1.seq),
                                             bsequence="asis::" + str(sr2.seq),
                                             gapopen=10,
                                             gapextend=0.5,
                                             outfile=needle_out)
            needle_cline()
            align = AlignIO.read(needle_out, "emboss")
            l1 = align[0].seq
            l2 = align[1].seq

            matches = sum(aa1 == aa2 for aa1, aa2 in zip(l1, l2))
            identity = matches / float(len(l1))
            ident_matrix[ind1, ind2] = identity
            ident_matrix[ind2, ind1] = identity

    # Crude clustering: support = number of matrix entries in the row that
    # exceed the threshold.
    support = dict()
    for i, (gi, _) in enumerate(items):
        support[gi] = int(np.count_nonzero(ident_matrix[i] > ident_thresh))

    return support
def call_emboss(emboss_tool, aseq, bseq, outfile):
    """Run EMBOSS needle or water on two sequences.

    Args:
        emboss_tool: command name or path of the executable. A value
            containing "needle" selects the global aligner; otherwise one
            containing "water" selects the local aligner.
        aseq: first sequence argument passed to the tool.
        bseq: second sequence argument passed to the tool.
        outfile: path of the alignment report to write.

    Returns:
        None. The result is the file written by the tool.

    Raises:
        ValueError: if ``emboss_tool`` names neither needle nor water.
    """
    if 'needle' in emboss_tool:  # global alignment
        commandline_cls = NeedleCommandline
    elif 'water' in emboss_tool:  # local alignment
        commandline_cls = WaterCommandline
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on `tool`.
        raise ValueError(
            "unsupported EMBOSS tool %r: expected 'needle' or 'water'"
            % (emboss_tool,))

    # Both tools take the same options with the same gap penalties.
    tool = commandline_cls(emboss_tool,
                           asequence=aseq,
                           bsequence=bseq,
                           gapopen=10,
                           gapextend=0.5,
                           outfile=outfile)
    tool()

    return None
Example #11
0
def run_needle(needle_exe,aseq,bseq,outfile):
    """
    Run EMBOSS needle on two sequence files with fixed gap penalties.

    Args:
        needle_exe (str): path to EMBOSS needle executable
        aseq (str): path to first sequence file
        bseq (str): path to second sequence file
        outfile (str): path to the output file to generate

    Returns:
        None; the alignment report is the file written by needle.
    """
    needle_options = {
        "asequence": aseq,
        "bsequence": bseq,
        "gapopen": 10,
        "gapextend": 0.5,
        "outfile": outfile,
    }
    run_cmd = NeedleCommandline(needle_exe, **needle_options)

    # Captured stdout/stderr are intentionally discarded.
    run_cmd()
Example #12
0
 def test_needle_piped(self):
     """Run needle with asis trick, output piped to stdout."""
     needle_exe = exes["needle"]
     cline = NeedleCommandline(
         cmd=needle_exe,
         asequence="asis:ACCCGGGCGCGGT",
         bsequence="asis:ACCCGAGCGCGGT",
         gapopen=10,
         gapextend=0.5,
         auto=True,
         filter=True,
     )
     # The rendered command must list the switches first, then the options.
     expected_cmd = "".join([
         needle_exe,
         " -auto -filter",
         " -asequence=asis:ACCCGGGCGCGGT",
         " -bsequence=asis:ACCCGAGCGCGGT",
         " -gapopen=10 -gapextend=0.5",
     ])
     self.assertEqual(str(cline), expected_cmd)

     # Launch the tool with all three standard streams piped.
     proc = subprocess.Popen(
         str(cline),
         stdin=subprocess.PIPE,
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
         universal_newlines=True,
         shell=(sys.platform != "win32"),
     )
     # Nothing is fed to the child, so close its stdin straight away.
     proc.stdin.close()

     # The alignment must be readable directly from the child's stdout.
     align = AlignIO.read(proc.stdout, "emboss")
     self.assertEqual(len(align), 2)
     first, second = align[0], align[1]
     self.assertEqual(first.seq, "ACCCGGGCGCGGT")
     self.assertEqual(second.seq, "ACCCGAGCGCGGT")

     # No error output and a clean exit status are expected.
     self.assertEqual(proc.stderr.read(), "")
     self.assertEqual(0, proc.wait())
     proc.stdout.close()
     proc.stderr.close()
Example #13
0
 def align_genbank(self, query, frames):
     """Aligns the chosen GenBank record to the query sequence with the
     'needle' program in EMBOSS.

     Args:
         query: record whose ``.seq`` is aligned against the GenBank file.
         frames: pair of BLAST hit frames, each one of
             {-3, -2, -1, 1, 2, 3}.

     The alignment report is written to ``self.needle_out.name``; nothing
     is returned.
     """
     #   First, we check the frames of the hits from the BLAST record. If
     #   they have different signs, then we have to RC one of the sequences.
     #   We choose to RC the query, since it is much simpler, and will not
     #   introduce errors. If both frames have the same sign (both positive
     #   OR both negative) the two sequences already share an orientation
     #   and no RC is needed; the previous check only recognised the
     #   both-positive case and wrongly reverse-complemented when both
     #   frames were negative.
     same_orientation = (frames[0] > 0) == (frames[1] > 0)
     if same_orientation:
         snp_seq = query.seq
     else:
         snp_seq = query.seq.reverse_complement()
     #   Build the 'needle' command
     needle_cmd = NeedleCommandline(asequence=self.genbank_seq.name,
                                    bsequence='asis:' + str(snp_seq),
                                    gapopen=self.gapopen,
                                    gapextend=self.gapextend,
                                    outfile=self.needle_out.name)
     #   Run it!
     needle_cmd()
     return
Example #14
0
def needle(*id, gop=10, gex=0.5, out='emb.aln'):
    """Global alignment with the Needleman method (EMBOSS needle).

    The first identifier is written to 'seqa.fas' and all remaining ones to
    'seqb.fas' through mkfasx; needle then aligns the two files.

    Args:
        *id: sequence identifiers; the first is the query, the rest are the
            targets.
        gop: gap opening penalty.
        gex: gap extension penalty.
        out: path of the needle report to write.

    Returns:
        The alignment read from ``out`` when fewer than three identifiers
        were given (a single pairwise alignment), otherwise None.
    """
    temp_files = ('seqa.fas', 'seqb.fas')
    try:
        mkfasx(temp_files[0], id[0])
        mkfasx(temp_files[1], *id[1:])

        needle_cline = NeedleCommandline(asequence=temp_files[0],
                                         bsequence=temp_files[1],
                                         gapopen=gop,
                                         gapextend=gex,
                                         outfile=out)
        needle_cline()
    finally:
        # Remove the scratch FASTA files even when needle (or mkfasx) fails.
        for temp_file in temp_files:
            if os.path.exists(temp_file):
                os.remove(temp_file)

    if len(id) < 3:
        return AlignIO.read(out, "emboss")
    return None
Example #15
0
def alignment_filter(seqs, template, gapopen=10, gapextend=0.5, lo_cutoff=300,
                     hi_cutoff=1000, cleanup=True):
    """Filter sequences by their needle alignment against a template.

    The sequences are written to a temporary file, aligned against
    ``template`` with EMBOSS needle, and the resulting alignments are passed
    to ``cull_alignments`` together with the two cutoffs.

    Args:
        seqs: sequence records to filter (written out in 'fastq' format).
        template: template sequence, passed to needle via ``asis::``.
        gapopen: needle gap opening penalty.
        gapextend: needle gap extension penalty.
        lo_cutoff: lower cutoff forwarded to ``cull_alignments``.
        hi_cutoff: upper cutoff forwarded to ``cull_alignments``.
        cleanup: remove the temporary files afterwards when true.

    Returns:
        Whatever ``cull_alignments`` returns (a sized collection; its length
        is logged).
    """
    text_logger = logging.getLogger(__name__+'.text_logger')
    text_logger.info('Started alignment-based filtering')
    start_n_seqs = len(seqs)

    # Save the template and sequences as temporary fasta files
    # Probably some hacking that can be done in the NeedleCommandline stuff
    seqs_f_name = 'tempseq.fa'
    ofilen = 'temp_'+str(uuid.uuid4())+'.needle'

    try:
        with open(seqs_f_name, 'w') as sh:
            SeqIO.write(seqs, sh, 'fastq')

        # Generate alignment command, run the alignment.
        # The gap penalties are logged with %s: they may be floats (the
        # default gapextend is 0.5) and %i would silently truncate them.
        text_logger.info("""Began EMBOSS needle routine with settings:\ngapopen:
                    %s\ngapextend: %s\nlo_cutoff: %i\nhi_cutoff: %i""",
                     gapopen, gapextend, lo_cutoff, hi_cutoff)
        needle_cline = NeedleCommandline(asequence='asis::{}'.format(template),
                                         bsequence=seqs_f_name, gapopen=gapopen,
                                         gapextend=gapextend, outfile=ofilen)
        needle_cline()
        text_logger.info('Finished EMBOSS needle routine')

        # AlignIO.parse is lazy, so the handle has to stay open while
        # cull_alignments consumes it; the with-block closes it afterwards.
        with open(ofilen) as aln_handle:
            aln_data = AlignIO.parse(aln_handle, "emboss")
            new_seqs = cull_alignments(aln_data, lo_cutoff=lo_cutoff,
                                       hi_cutoff=hi_cutoff)
    finally:
        # Exit routine: also runs when the alignment step fails, so
        # temporary files do not pile up.
        if cleanup:
            text_logger.info('Cleaning up temp files')
            for temp_name in (seqs_f_name, ofilen):
                if os.path.exists(temp_name):
                    os.remove(temp_name)

    text_logger.info("""Finished alignment-based filtering. Kept %i of %i
                     sequences.""", len(new_seqs), start_n_seqs)
    return new_seqs
Example #16
0
 def global_align(self, aseq, bseq):
     """
     Perform a global alignment using EMBOSS needle with input string sequences aseq and bseq

     The needle report is written to a named temporary file that is removed
     when this method returns. On success the report lines are parsed by
     self.read_needle_out and its result is returned; on a non-zero exit
     status an error is printed and ('', '', '') is returned.
     TODO: maybe combine with Query.global_align
     """
     with NamedTemporaryFile(mode='w+') as temp:
         needle_cline = NeedleCommandline(asequence="asis:"+aseq,
                                          bsequence="asis:"+bseq,
                                          datafile='EBLOSUM62',
                                          gapopen=self.needle_gapopen,
                                          gapextend=self.needle_gapextend,
                                          outfile=temp.name)
         child = subprocess.Popen(str(needle_cline), shell=True, stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
         # communicate() drains both pipes while waiting. A bare wait() can
         # deadlock once the child fills an unread pipe buffer.
         child.communicate()
         if child.returncode == 0:
             temp.seek(0)
             needle_res = self.read_needle_out(temp.readlines())
         else:
             print('ERROR: Non-zero return code from needle alignment (generate)')
             needle_res = ('', '', '')
     return needle_res
Example #17
0
 def test_needle_file(self):
     """needle with the asis trick, output to a file."""
     # Setup,
     cline = NeedleCommandline(cmd=exes["needle"])
     cline.set_parameter("-asequence", "asis:ACCCGGGCGCGGT")
     cline.set_parameter("-bsequence", "asis:ACCCGAGCGCGGT")
     cline.set_parameter("-gapopen", "10")
     cline.set_parameter("-gapextend", "0.5")
     # EMBOSS would guess this, but let's be explicit:
     cline.set_parameter("-snucleotide", "True")
     cline.set_parameter("-outfile", "Emboss/temp with space.needle")
     # repr() of the command line must round-trip through eval().
     self.assertEqual(str(eval(repr(cline))), str(cline))
     # Run the tool. universal_newlines=True gives text (not bytes) output so
     # the string comparisons below also work on Python 3.
     child = subprocess.Popen(str(cline),
                              stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              universal_newlines=True,
                              shell=(sys.platform!="win32"))
     out, err = child.communicate()
     return_code = child.returncode
     # Check it worked,
     errors = err.strip()
     self.assertTrue(errors.startswith("Needleman-Wunsch global alignment"), errors)
     self.assertEqual(out.strip(), "")
     if return_code != 0:
         # Show the failing command line before the assertion fires.
         sys.stderr.write("\n%s\n" % cline)
     self.assertEqual(return_code, 0)
     filename = cline.outfile
     self.assertTrue(os.path.isfile(filename))
     # Check we can parse the output...
     with open(filename) as handle:
         align = AlignIO.read(handle, "emboss")
     self.assertEqual(len(align), 2)
     self.assertEqual(str(align[0].seq), "ACCCGGGCGCGGT")
     self.assertEqual(str(align[1].seq), "ACCCGAGCGCGGT")
     # Clean up,
     os.remove(filename)
Example #18
0
def get_hist_ss(test_seq, type='Unknown', debug=0):
    """Return the secondary-structure elements of a histone sequence.

    The query is aligned globally (EMBOSS needle) to one of four built-in
    template sequences (H3, H4, H2A, H2B) and the template's annotated
    element ranges are transferred onto the query through the alignment.
    All positions are 0-based indices into ``test_seq`` (not PDB numbering).

    Args:
        test_seq: query sequence object.
        type: histone type ('H3', 'H4', 'H2A' or 'H2B'). With the default
            'Unknown' the type is determined by a blastp search of the query
            against the four templates (best e-value wins).
        debug: when truthy, print intermediate results.

    Returns:
        Tuple ``(hist_identified, ss_test)``: the histone type used and a
        dict mapping element name to ``[start, stop]`` (inclusive) in the
        query, or ``[-1, -1]`` when the element could not be mapped.
    """

    # Template sequences (1kx5, per the original author's note).
    # NOTE(review): IUPAC alphabets were removed from recent Biopython
    # releases - confirm the Biopython version this module targets.
    templ_H3 = Seq(
        "ARTKQTARKSTGGKAPRKQLATKAARKSAPATGGVKKPHRYRPGTVALREIRRYQKSTELLIRKLPFQRLVREIAQDFKTDLRFQSSAVMALQEASEAYLVALFEDTNLCAIHAKRVTIMPKDIQLARRIRGERA",
        IUPAC.protein)
    templ_H4 = Seq(
        "SGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKRISGLIYEETRGVLKVFLENVIRDAVTYTEHAKRKTVTAMDVVYALKRQGRTLYGFGG",
        IUPAC.protein)
    templ_H2A = Seq(
        "SGRGKQGGKTRAKAKTRSSRAGLQFPVGRVHRLLRKGNYAERVGAGAPVYLAAVLEYLTAEILELAGNAARDNKKTRIIPRHLQLAVRNDEELNKLLGRVTIAQGGVLPNIQSVLLPKKTESSKSKSK",
        IUPAC.protein)
    templ_H2B = Seq(
        "AKSAPAPKKGSKKAVTKTQKKDGKKRRKTRKESYAIYVYKVLKQVHPDTGISSKAMSIMNSFVNDVFERIAGEASRLAHYNKRSTITSREIQTAVRLLLPGELAKHAVSEGTKAVTKYTSAK",
        IUPAC.protein)

    # 'element_name': [start, stop]; start and stop are inclusive, as in a
    # PDB file, but 1 has been subtracted from the PDB values because these
    # are array indices starting from 0.
    # Numbering differs between symmetrical chains and between 1kx5 and
    # 1aoi; the minimum length of the alpha helices over all chains in 1kx5
    # was taken.
    # Docking domain: amino acids 80-119 in the paper by Luger (1aoi), but
    # defined as 80-118 here (as in the authors' JMB paper).

    ss_templ_H3 = {
        'alphaN': [43, 56],
        'alpha1': [62, 76],
        'alpha2': [84, 113],
        'alpha3': [119, 130],
        'loopL1': [78, 83],
        'loopL2': [114, 118],
        'beta1': [82, 83],
        'beta2': [117, 118],
        'mgarg1': [62, 62],
        'mgarg2': [82, 82],
        'mgarg3': [48, 48]
    }
    ss_templ_H4 = {
        'alpha1ext': [23, 28],
        'alpha1': [29, 40],
        'alpha2': [48, 75],
        'alpha3': [81, 92],
        'loopL1': [41, 47],
        'loopL2': [76, 81],
        'beta1': [44, 45],
        'beta2': [79, 80],
        'beta3': [95, 97],
        'mgarg1': [44, 44]
    }
    # Docking domain defined as in Suto, Luger 2000.
    ss_templ_H2A = {
        'alpha1ext': [15, 21],
        'alpha1': [25, 36],
        'alpha2': [45, 72],
        'alpha3': [78, 88],
        'alpha3ext': [89, 96],
        'loopL1': [37, 44],
        'loopL2': [73, 77],
        'beta1': [41, 42],
        'beta2': [76, 77],
        'beta3': [99, 101],
        'docking domain': [80, 118],
        'mgarg1': [41, 41],
        'mgarg2': [76, 76]
    }

    ss_templ_H2B = {
        'alpha1': [33, 45],
        'alpha2': [51, 80],
        'alpha3': [86, 98],
        'alphaC': [99, 119],
        'loopL1': [46, 50],
        'loopL2': [81, 85],
        'beta1': [49, 50],
        'beta2': [84, 85],
        'mgarg1': [29, 29]
    }

    ss_templ = {
        'H3': ss_templ_H3,
        'H4': ss_templ_H4,
        'H2A': ss_templ_H2A,
        'H2B': ss_templ_H2B
    }
    templ = {
        'H3': templ_H3,
        'H4': templ_H4,
        'H2A': templ_H2A,
        'H2B': templ_H2B
    }

    # Records used to build the small BLAST database of templates.
    my_records = [
        SeqRecord(templ_H3, id='H3', name='H3'),
        SeqRecord(templ_H4, id='H4', name='H4'),
        SeqRecord(templ_H2A, id='H2A', name='H2A'),
        SeqRecord(templ_H2B, id='H2B', name='H2B')
    ]

    # Unique base names so concurrent calls do not clash in TEMP_DIR.
    n1 = str(uuid.uuid4())
    n2 = str(uuid.uuid4())

    faa_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".faa")
    fastan2_file = os.path.join(CONFIG.TEMP_DIR, n2 + ".fasta")
    fastan1_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".fasta")
    db_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".db")
    xml_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".xml")
    txt_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".txt")

    phr_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".db.phr")
    pin_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".db.pin")
    psq_file = os.path.join(CONFIG.TEMP_DIR, n1 + ".db.psq")

    SeqIO.write([SeqRecord(test_seq, id='Query', name='Query')], fastan2_file,
                'fasta')

    if (type == 'Unknown'):
        # Use BLAST to find out which histone the query is.
        SeqIO.write(my_records, faa_file, "fasta")
        os.system('makeblastdb -dbtype prot -in %s -out %s > /dev/null' %
                  (faa_file, db_file))

        blastp_cline = NcbiblastpCommandline(query=fastan2_file,
                                             db=db_file,
                                             evalue=100,
                                             outfmt=5,
                                             out=xml_file)
        blastp_cline()

        with open(xml_file, 'r') as xml_handle:
            blast_record = NCBIXML.read(xml_handle)

        # Collect (title, e-value) for every HSP and keep the first HSP with
        # the smallest e-value.
        sname = list()
        evalue = list()
        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:
                sname.append(alignment.title)
                evalue.append(hsp.expect)
        # The second whitespace-separated token of the hit title is used as
        # the histone name.
        hist_identified = sname[evalue.index(min(evalue))].split()[1]
    else:
        hist_identified = type

    if (debug): print('Most likely this is histone:')
    if (debug): print(hist_identified)
    if (debug): print('Blast alignment')

    # The secondary structure is transferred from the template through a
    # global alignment, so write the chosen template out for needle.
    SeqIO.write([
        SeqRecord(
            templ[hist_identified], id=hist_identified, name=hist_identified)
    ], fastan1_file, 'fasta')

    # Needleman-Wunsch global alignment of template (row 0) and query (row 1).
    needle_cline = NeedleCommandline(asequence=fastan1_file,
                                     bsequence=fastan2_file,
                                     gapopen=20,
                                     gapextend=1,
                                     outfile=txt_file)
    needle_cline()

    align = AlignIO.read(txt_file, "emboss")
    if (debug):
        print(align)

    # Build the correspondence tables: entry j holds the alignment column of
    # the j-th (ungapped) residue of the sequence.
    ss_test = dict()
    hist = templ[hist_identified]

    corrsp_hist = list(range(len(hist)))
    k = 0  # number of gap columns seen so far in this row
    for i, a in enumerate(align[0]):
        if (a == '-'):
            k = k + 1
        else:
            corrsp_hist[i - k] = i
    if (debug): print(corrsp_hist)

    corrsp_test = list(range(len(test_seq)))
    k = 0
    for i, a in enumerate(align[1]):
        if (a == '-'):
            k = k + 1
        else:
            corrsp_test[i - k] = i
    if (debug): print(corrsp_test)

    for key, value in ss_templ[hist_identified].items():
        if (debug): print('Checking %s' % key)
        start_in_aln = corrsp_hist[value[0]]
        if (debug): print('Start in aln %d' % start_in_aln)

        end_in_aln = corrsp_hist[value[1]]
        if (debug): print('End in aln %d' % end_in_aln)

        # Find the query residue at the start column; if that column is a
        # gap in the query, move the start rightwards until a residue is hit.
        for k in range(len(align[0])):
            try:
                start_in_test_seq = corrsp_test.index(start_in_aln + k)
            except ValueError:  # list.index: column not occupied by query
                start_in_test_seq = -1
                if (debug): print("Trying to move start")
                continue
            break

        # Same for the end column, moving leftwards.
        for k in range(len(align[0])):
            try:
                end_in_test_seq = corrsp_test.index(end_in_aln - k)
            except ValueError:
                end_in_test_seq = -1
                if (debug): print('Trying to move end')
                continue
            break

        if ((start_in_test_seq == -1) | (end_in_test_seq == -1) |
            (start_in_test_seq > end_in_test_seq)):
            ss_test[key] = [-1, -1]
        else:
            ss_test[key] = [start_in_test_seq, end_in_test_seq]
        if (debug): print(ss_test[key])

    # Remove the temporary files directly instead of shelling out to `rm`,
    # which broke on paths containing spaces or shell metacharacters.
    if (type == 'Unknown'):
        temp_files = (faa_file, phr_file, pin_file, psq_file, fastan2_file,
                      xml_file, txt_file, fastan1_file)
    else:
        temp_files = (fastan2_file, txt_file, fastan1_file)
    for temp_file in temp_files:
        try:
            os.remove(temp_file)
        except OSError:
            # Best-effort cleanup, as before: `rm` also carried on when one
            # of the files was missing.
            pass

    return hist_identified, ss_test
    def alignedvariants(self, threshold=0.9):
        """Merge supported haplotypes by their alignment to the reference.

        Every record in ``self.seq_obj`` whose description carries
        ``posterior=`` >= ``threshold`` and ``ave_reads=`` >= 1 is aligned to
        ``self.ref`` with EMBOSS needle. Insertions relative to the reference
        are dropped and deletions are filled with the reference base; the
        ``ave_reads`` of records that become identical are summed.

        Args:
            threshold: minimal posterior for a record to be considered.

        Returns:
            ``self.dna_seqs`` after one SeqRecord per distinct reconstructed
            sequence has been appended to it.
        """
        import subprocess
        import re
        import hashlib
        from Bio.Emboss.Applications import NeedleCommandline
        from pythonlib import Alignment

        files = []
        var_dict = {}
        for i, s in enumerate(self.seq_obj):
            m_obj = re.search(r'posterior=(.*)\s*ave_reads=(.*)', s.description)
            post, ave_reads = map(float, (m_obj.group(1), m_obj.group(2)))
            if post < threshold or ave_reads < 1.:
                continue
            if post > 1.0:
                print('WARNING: posterior=', post, file=sys.stderr)
            outfile = 'tmp%d.needle' % i
            files.append(outfile)
            # str(seq) replaces the removed Seq.tostring(); gaps at either
            # end of the haplotype are stripped before alignment.
            needle_cline = NeedleCommandline(
                asequence='asis:%s' % self.ref,
                bsequence='asis:%s' % str(s.seq).strip('-'),
                outfile=outfile, gapopen=10.0, gapextend=0.5,
                aformat='markx10')
            needle_cline.auto = True

            try:
                retcode = subprocess.call(str(needle_cline), shell=True)
            except OSError as ee:
                # `ee` used to be undefined here, turning the intended error
                # message into a NameError.
                sys.exit('Execution of needle failed: %s' % ee)
            if retcode < 0:
                sys.exit('Child needle was terminated by signal %d' %
                         -retcode)

            tal = Alignment.alignfile2dict([outfile],
                                           'support_seqs%d' % i,
                                           10.0,
                                           0.5,
                                           Verbose=False)
            os.remove(outfile)
            # First (only expected) key; next(iter()) works on Python 2 and 3.
            ka = next(iter(tal))
            this = tal[ka]['asis']

            # Walk the alignment column by column (seq_a: reference,
            # seq_b: haplotype).
            this_seq = []
            for ref_char, hap_char in zip(this.seq_a, this.seq_b):
                if hap_char == '-':
                    # Deletion in the haplotype: fall back to the reference.
                    assert ref_char != '-', 'gap-gap?'
                    this_seq.append(ref_char)
                elif ref_char != '-':
                    this_seq.append(hap_char)
                # Otherwise: insertion relative to the reference, skipped.
            ws = ''.join(this_seq)
            var_dict[ws] = var_dict.get(ws, 0) + ave_reads

        for k, v in var_dict.items():
            ts = Seq(k, IUPAC.unambiguous_dna)
            # hashlib needs bytes on Python 3, hence the encode().
            tsr = SeqRecord(ts, id=hashlib.sha224(k.encode()).hexdigest(),
                            name='Reconstructed local hap')
            tsr.description = 'ave_reads=%f' % v
            self.dna_seqs.append(tsr)
        print('%d haplotypes have support >=%f'\
              % (len(files), threshold), file=sys.stderr)
        return self.dna_seqs
Example #20
0
    def GetExec(self, optList, frame):
        """Build the needle command line for the "embossn" type command.

        Reads the sequences and options from the GUI widgets held by `frame`
        and `self`, stores `frame`, the output path and the output type on
        `self`, and returns the command line as a string (it is not run
        here). `optList` is not used.
        """
        self.frame = frame
        self.outfile = r"C:\Users\francis\Documents\Monguis\BioGui\plugins\needle.txt"
        self.outtype = "fasta"

        seq_boxes = self.frame.paramBoxes
        cline = NeedleCommandline(
            r"C:/mEMBOSS/needle.exe",
            asequence=str(seq_boxes[1].GetValue()),
            bsequence=str(seq_boxes[3].GetValue()))
        cline.outfile = self.outfile
        cline.gapopen = self.param[7].GetValue()
        cline.gapextend = self.param[9].GetValue()
        cline.similarity = True if self.param[10].GetValue() else False

        # Tell needle explicitly which alphabet the sequences use.
        alphabet = self.frame.abet
        if alphabet == "AA":
            cline.snucleotide = False
            cline.sprotein = True
        elif alphabet in ("DNA", "RNA"):
            cline.snucleotide = True
            cline.sprotein = False

        # Optional scoring matrix file, only when one was entered.
        if self.frame.options:
            matrix_name = self.boxList[3].GetValue()
            if matrix_name != '':
                cline.datafile = str(matrix_name)
        return str(cline)
Example #21
0
    def binomial(self):
        """Binomial test of window similarity for every pair in each cluster.

        For each cluster in ``self.bestWin`` and each pair of window ids
        whose parent sequences differ (the part before the first '_'), two
        needle alignments are run: one of the full sequences (from
        ``self.fastaFile``) and one of the windows (from
        "windows_sequence.fasta"). The number of mismatches in the window
        alignment is tested against the overall divergence of the full
        sequences with ``stats.binom.cdf``.

        Writes pairs with p-value <= 1e-20 to "reversa_result.txt" and all
        results to "results.csv". Returns None.

        Raises:
            KeyError: if a sequence or window id is missing from its FASTA
                file (previously a stale value from an earlier pair was
                silently reused).
            ValueError: if a needle report lacks the Identity/Gaps lines.
        """
        patt = re.compile(r'_')
        pattern = re.compile(r'\#\sIdentity\:\s+[0-9\/]+\s+\(([0-9\.]+)\%\)')
        patt1 = re.compile(r'\#\sIdentity\:\s+(\d+)\/(\d+)\s+\([0-9\.]+\%\)')
        patt2 = re.compile(r'\#\sGaps\:\s+(\d+)\/\d+')
        results = []

        # id -> sequence lookups, loaded on first use so the FASTA files are
        # parsed once instead of once per pair (and not at all if no pair
        # needs them).
        full_seqs = None
        window_seqs = None

        with open("reversa_result.txt", "w") as out:
            for j, i in self.bestWin.items():
                out.write('cluster %s \n' % j)

                for a, b in itertools.combinations(i, 2):
                    seq1 = patt.split(a)[0]
                    seq2 = patt.split(b)[0]
                    if seq1 == seq2:
                        # Windows of the same parent sequence are not compared.
                        continue

                    if full_seqs is None:
                        full_seqs = {
                            rec.id: str(rec.seq)
                            for rec in SeqIO.parse(self.fastaFile, "fasta")
                        }
                    aseq = full_seqs[seq1]
                    bseq = full_seqs[seq2]

                    needle_cline = NeedleCommandline(
                        asequence="asis:%s" % aseq,
                        bsequence="asis:%s" % bseq,
                        gapopen=10,
                        gapextend=0.5,
                        outfile="needle.txt")
                    needle_cline()

                    # Percent identity of the full-sequence alignment.
                    identity = None
                    with open('needle.txt') as report:
                        for line in report:
                            ident = pattern.search(line)
                            if ident is not None:
                                identity = ident.group(1)
                    if identity is None:
                        raise ValueError(
                            "no Identity line found in needle.txt")

                    if window_seqs is None:
                        window_seqs = {
                            rec.id: str(rec.seq)
                            for rec in SeqIO.parse("windows_sequence.fasta",
                                                   "fasta")
                        }
                    awin = window_seqs[a]
                    bwin = window_seqs[b]

                    ncline = NeedleCommandline(asequence="asis:%s" % awin,
                                               bsequence="asis:%s" % bwin,
                                               gapopen=10,
                                               gapextend=0.5,
                                               outfile="needle1.txt")
                    ncline()

                    # Identical positions, alignment length and gap count of
                    # the window alignment.
                    iwin = ilen = gap = None
                    with open('needle1.txt') as report:
                        for line1 in report:
                            identwin = patt1.search(line1)
                            if identwin is not None:
                                iwin = identwin.group(1)
                                ilen = identwin.group(2)
                            gapPatt = patt2.search(line1)
                            if gapPatt is not None:
                                gap = gapPatt.group(1)
                    if iwin is None or gap is None:
                        raise ValueError(
                            "no Identity/Gaps lines found in needle1.txt")

                    # Mismatches = columns that are neither identical nor gaps.
                    difWin = int(ilen) - (int(iwin) + int(gap))
                    size = self.sizeWin
                    totalDif = 1 - (float(identity) / 100)
                    pvalue = stats.binom.cdf(difWin, size, totalDif)

                    results.append({'seq1': a, 'seq2': b, 'pvalue': pvalue})

                    if pvalue <= 1.00e-20:
                        out.write('combination: %s  %s pvalue: %s \n' %
                                  (a, b, pvalue))

        dfResults = pd.DataFrame(results)
        dfResults.to_csv('results.csv')
Example #22
0
                      a[i:j].replace('-', ''), b[i:j].replace('-', '')]))
# too slow


### use linux server with EMBOSS pairwise alignment programs installed
from Bio import SeqIO
from Bio.Emboss.Applications import NeedleCommandline
from Bio import AlignIO 

# Split the multi-record FASTA input into one file per record, because
# needle is given one file per sequence below.
seqfiles = []
for i in SeqIO.parse('rosalind_laff.txt', 'fasta'):
    seq_file = i.id + '.txt'
    SeqIO.write(i, seq_file, 'fasta')
    seqfiles.append(seq_file)

# Global alignment of the first two records; the report goes to needle.txt.
needle_cline = NeedleCommandline(asequence=seqfiles[0], bsequence=seqfiles[1],
                                 gapopen=11, gapextend=1, outfile="needle.txt")
needle_cline()

aln = AlignIO.read('needle.txt', "emboss")
a, b = [str(i.seq) for i in aln]

# The alignment score is not taken from the parsed alignment; it is read
# from the first line of the needle report that mentions 'Score'.
score = None
with open('needle.txt') as needle_report:
    for ln in needle_report:
        if 'Score' in ln:
            print(ln)
            score = int(float(ln.rstrip().split()[-1]))
            break
if score is None:
    raise ValueError("no 'Score' line found in needle.txt")

# Output: the score followed by both aligned sequences with gaps removed.
with open('rosalind_laff_sub.txt', 'wt') as result:
    result.write('\n'.join([str(score),
                            a.replace('-', ''), b.replace('-', '')]))
Example #23
0
 def test_needle_file(self):
     """Run needle on two "asis:" literal sequences and parse its output file."""
     # Build the command line one option at a time.
     needle = NeedleCommandline(cmd=exes["needle"])
     options = (
         ("-asequence", "asis:ACCCGGGCGCGGT"),
         ("-bsequence", "asis:ACCCGAGCGCGGT"),
         ("-gapopen", "10"),
         ("-gapextend", "0.5"),
         # EMBOSS would guess the sequence type, but be explicit.
         ("-snucleotide", "True"),
         ("-outfile", "Emboss/temp with space.needle"),
     )
     for option, value in options:
         needle.set_parameter(option, value)
     # repr() has to evaluate back to an equivalent command line.
     self.assertEqual(str(eval(repr(needle))), str(needle))
     # Execute the tool.
     stdout, stderr = needle()
     # needle reports its banner on stderr and nothing on stdout.
     banner_seen = stderr.strip().startswith("Needleman-Wunsch global alignment")
     self.assertTrue(banner_seen, stderr)
     self.assertEqual(stdout.strip(), "")
     out_path = needle.outfile
     self.assertTrue(os.path.isfile(out_path),
                     "Missing output file %r from:\n%s" % (out_path, needle))
     # The output must be readable as an EMBOSS alignment of two rows.
     alignment = AlignIO.read(out_path, "emboss")
     self.assertEqual(len(alignment), 2)
     first_row, second_row = alignment[0], alignment[1]
     self.assertEqual(str(first_row.seq), "ACCCGGGCGCGGT")
     self.assertEqual(str(second_row.seq), "ACCCGAGCGCGGT")
     # Remove the generated file.
     os.remove(out_path)
        print "continue"
        continue
    else:
        print "good length = %i" % seqlength
        print "break"
        break
newrefrec = temprec
SeqIO.write(newrefrec, refoutfile, "fasta")


x=0
for item in protlist:
    x = x + len(item.seq)

avg = x/len(protlist)
print avg
needle_cline = NeedleCommandline()
needle_cline.asequence=refoutfile
needle_cline.bsequence=prot_outfile
needle_cline.gapopen=10
needle_cline.gapextend=0.5
needle_cline.outfile=alignment_out
print needle_cline
#stdout, stderr = needle_cline()

#logstring = stdout+stderr
#logout = open(logfile, "w")
#logout.write(logstring)
#logout.close()

Example #25
0
def renumber_noInputAlign(pdbfile,refseqfile,selection="protein",\
	outfile="renumbered.pdb",newAA=None,first=1):
	'''
	Renumber pdb file (pdbfile) according to reference sequence in refseqfile.
	Pdb sequence is extracted and aligned with reference sequence using needle
	from EMBOSS.
	- pdbfile: original structure file
	- refseqfile: .fasta file containing the reference sequence by which to
	renumber
	- selection: atom selection(s) in the structure file to renumber.
	Either a comma separated string or an iterable of selection strings;
	each selection is renumbered in turn.
	- outfile: output structure file
	- newAA: comma separated list of unrepresented amino acids
		XXXYCA:
		XXX = three letter abbreviation as in pdbfile
		Y = one letter code in the alignment
		CA = atom to use as CA if different from "CA", eg
		C1 in PVL of 1JEN
	- first: forwarded unchanged to gpdb.renumber_aln (presumably the
	number given to the first residue -- confirm against gpdb)

	Exits the interpreter with status 1 when pdbfile or refseqfile does
	not exist. Temporary files are written to the system temp directory;
	only the temporary reference sequence file is removed afterwards.
	'''
	# A plain string used to be iterated character by character, which
	# broke the documented comma separated form (and the default value).
	if isinstance(selection, str):
		selections = [part.strip() for part in selection.split(",") if part.strip()]
	else:
		selections = selection
	tmp=tempfile.gettempdir()
	tmp_refseqfile="%s/refseq.fasta"%tmp
	# First "name.ext"-like token of the path is used as the sequence id.
	pdbID = re.search(r"\w+\.\w+", pdbfile).group(0)
	tmp_pdbseqfile="%s/%s.fasta"%(tmp,pdbID)
	tmp_needle="%s/needle.out"%tmp
	if os.path.exists(refseqfile):
		refseqRec = SeqIO.read(refseqfile,"fasta",alphabet=IUPAC.protein )
		# Fixed id so the reference row can be found in the alignment below.
		refseqRec.id = "refseq"
		SeqIO.write(refseqRec,tmp_refseqfile,"fasta")
	else:
		print ("ERROR, no such file: %s"%refseqfile)
		exit(1)

	if os.path.exists(pdbfile):
		structure=parsePDB("%s"%pdbfile)
		updateAA(structure,newAA)
	else:
		print ("ERROR, no such file: %s"%pdbfile)
		exit(1)

	modified_selections = []
	for polymer in selections:
		currentSel = structure.select("protein and name CA and %s"%polymer)
		if currentSel:
			# One-letter sequence of the selected CA atoms, in structure order.
			pdbseq_str=''.join([oneletter[i] for i in currentSel.getResnames()])
			pdbseqRec=SeqRecord(Seq(pdbseq_str,IUPAC.protein),id=pdbID)
			SeqIO.write(pdbseqRec,tmp_pdbseqfile,"fasta")

			needle_cli = NeedleCommandline(asequence=tmp_pdbseqfile,bsequence=tmp_refseqfile,\
				gapopen=10,gapextend=0.5,outfile=tmp_needle)
			needle_cli()
			aln = AlignIO.read(tmp_needle, "emboss",alphabet=IUPAC.protein )
			# os.remove(tmp_needle)
			# os.remove(tmp_pdbseqfile)

			gpdb.renumber_aln(aln,"refseq",pdbID,first)
			pdbRenSeq = gpdb.seqbyname(aln, pdbID)
			gpdb.renumber_struct(structure, pdbRenSeq,polymer)
			pdbRenSeq.annotations["resnum"]=str(pdbRenSeq.letter_annotations["resnum"])
			modified_selections.append(polymer)
			# seems to be the only way to store pret residue annotations
			# AlignIO.write(aln,"pdb.outseq","seqxml")
		else:
			print ('ERROR: Selection \"%s\" has zero CA atoms'%polymer)

	if writePDB(outfile, structure):
		print ("Wrote renumbered %s selections from %s to %s"%\
				(str(modified_selections),pdbfile,outfile))
	os.remove(tmp_refseqfile)
# Write every fetched record to its own FASTA file named after the record id.
for i in records:
    record_file = f"{i.id}.fasta"
    with open(record_file, "w") as output_handle:
        SeqIO.write(i, output_handle, "fasta")
    files_created.append(record_file)

# pairwise alignment

# Output name: search terms joined by "_", first character dropped, ".txt".
output_file = "_".join(search)
output_file = output_file[1:]
output_file = output_file + ".txt"

pairwise = NeedleCommandline(
    asequence=files_created[0],
    bsequence=files_created[1],
    gapopen=10,
    gapextend=1,
    endopen=10,
    endextend=1,
    endweight=True,
    # emboss.sourceforge.net/apps/cvs/emboss/apps/needle.html
    outfile=output_file)

# With shell=True the command must be a single string; a one-element list
# only happens to work on POSIX.
subprocess.run(str(pairwise), shell=True, check=True)

# Echo every line of the needle report that mentions the score.
with open(output_file, "r") as f:
    for line in f:
        if "Score" in line:
            print(line)
Example #27
0
def GetExec(inF, outF):
    """Run the mEMBOSS needle executable and wait for it to finish.

    inF and outF are passed to NeedleCommandline as ``infile`` and
    ``outfile``.
    """
    # NOTE(review): needle normally takes asequence/bsequence; confirm that
    # ``infile`` is an accepted option for NeedleCommandline.
    plugin_exe = r"C:/mEMBOSS/needle.exe"
    cline = NeedleCommandline(plugin_exe, infile=inF, outfile=outF)
    # This is a plain function, so the command line is the local ``cline``;
    # the previous ``self.cline`` raised NameError.
    p = subprocess.Popen(str(cline))
    p.wait()
Example #28
0
def each_needle_run(pair_gene_dir, tmp_gene_converted_dir,
                    pair_gene_alignment_dir, og_id, strain_dict):
    """
    This function is used to call Needle program to do pairwise sequence alignment
    :param pair_gene_dir: each homologous gene directory
    :param tmp_gene_converted_dir: used to put some temporary files and will be deleted in the end
    :param pair_gene_alignment_dir: each orthologous gene pair-wised alignment directory
    :param og_id: each orthologous gene id
    :param strain_dict: inherit from load_strains_label function with strain information
    :return: the alignment result of each gene as one tab separated line
             (og_id, the two gene ids, identity percentage, annotation), or
             an empty string when the Needle output has no 'Identity' line
    """
    if not os.path.exists(pair_gene_dir):
        logger.error("There is no directory contains gene file, please check.")
        logger.error(last_exception())
        sys.exit(1)
    tmp_gene_fasta = os.path.join(pair_gene_dir, og_id + '.fasta')
    converted_records = []
    # Header layout: fig|<strain id>.peg.<gene number> <annotation>
    re_pattern = re.compile(r'fig\|(\d+\.\d+)\.peg\.(\d+)\s(.*)')
    in_pattern = re.compile(r'Identity.*\((\d+\.\d+)%\)')
    annotation = ''
    og_list = []
    for record in SeqIO.parse(tmp_gene_fasta, 'fasta'):
        m = re.search(re_pattern, record.description)
        strain_id = m.group(1)
        gene_id = '{0}.peg.{1}'.format(strain_id, m.group(2))
        og_list.append(gene_id)
        # The annotation of the last record is the one that is reported.
        annotation = m.group(3)
        record.id = strain_dict[strain_id][0]
        final_record = SeqRecord(record.seq, record.id, description='')
        converted_records.append(final_record)
    # Only the first two records are aligned, each from its own temp file.
    the_strain_fasta = os.path.join(tmp_gene_converted_dir, 'a.fasta')
    other_strain_fasta = os.path.join(tmp_gene_converted_dir, 'b.fasta')
    SeqIO.write(converted_records[0], the_strain_fasta, 'fasta')
    SeqIO.write(converted_records[1], other_strain_fasta, 'fasta')
    result_file = os.path.join(pair_gene_alignment_dir,
                               "{0}.txt".format(og_id))
    needle_cline = NeedleCommandline()
    needle_cline.asequence = the_strain_fasta
    needle_cline.bsequence = other_strain_fasta
    needle_cline.gapopen = 10
    needle_cline.gapextend = 0.5
    needle_cline.outfile = result_file
    # Needle's console output is discarded; the context manager closes the
    # null device handle, which was previously left open on every call.
    with open(os.devnull, 'w') as devnull:
        try:
            subprocess.call(str(needle_cline),
                            shell=True,
                            stdout=devnull,
                            stderr=devnull)
        except OSError:
            logger.info(
                'Try to call Needle program failed, please check if Needle has been installed successfully.'
            )
            logger.error(last_exception())
            sys.exit(1)
    os.remove(the_strain_fasta)
    os.remove(other_strain_fasta)
    gene_alignment_result = ''
    with open(result_file, 'r') as f:
        for a_line in f:
            if 'Identity' in a_line:
                m = re.search(in_pattern, a_line.strip())
                identity = m.group(1)
                gene_alignment_result = '{0}\t[{1}|{2}]\t{3}\t{4}\n'.format(
                    og_id, og_list[0], og_list[1], identity, annotation)
    return gene_alignment_result
def f2():
    """Align every protein against its reference with needle and print the tool output.

    For each protein name the reference FASTA and the sample FASTA are
    aligned with the EPAM40 data file; scores go to a per-protein text file.
    """
    proteins = [
        'NSP1', 'NSP2', 'NSP3', 'NSP4', 'NSP5', 'NSP6', 'NSP7', 'NSP8',
        'NSP9', 'NSP10', 'NSP11', 'NSP12', 'NSP13', 'NSP14', 'NSP15',
        'NSP16', 'Spike', 'NS3', 'E', 'M', 'NS6', 'NS7a', 'NS7b', 'NS8', 'N',
    ]
    for protein in proteins:
        run_needle = NeedleCommandline(
            asequence=f'{protein}referance.fasta',
            bsequence=f'{protein}.fasta',
            gapopen=10,
            gapextend=0.5,
            datafile='EPAM40',
            outfile=f'{protein}needlePAM40.txt',
            aformat='score',
            nobrief=True,
        )
        captured_out, captured_err = run_needle()
        print(captured_out + captured_err)
Example #30
0
        len_seq = len(read.seq)
        length += float(len_seq)
        length2 += float(len_seq * len_seq)
        #        readdict[read.] = [seq,len_seq]
        n += 1.

# Mean and sample standard deviation of the read lengths accumulated above.
meanlr = length / n
variance = (n * length2 - length * length) / (n * n - n)
stdlr = math.sqrt(variance)
allowed_length = [meanlr - acclength * stdlr, meanlr + (1 + acclength) * stdlr]
print >> sys.stderr, 'Allowed interval for length is', allowed_length

# Scoring and output format shared by both needle runs.
needle_settings = dict(gapopen=6.0, gapextend=3.0, aformat='markx10')

# Each alignment is skipped when its output file is already present.
if not os.path.isfile('tmp_align_f.needle'):
    print >> sys.stderr, 'Aligning back...'
    cmline_forw = NeedleCommandline(asequence=options.ref,
                                    bsequence=f_fasta_forward_filename,
                                    outfile='tmp_align_f.needle',
                                    **needle_settings)
    child_process_forw = subprocess.call(str(cmline_forw), shell=True)

if not os.path.isfile('tmp_align_r.needle'):
    print >> sys.stderr, '...and forth'
    cmline_rev = NeedleCommandline(asequence=options.ref,
                                   bsequence=f_fasta_reverse_filename,
                                   outfile='tmp_align_r.needle',
                                   **needle_settings)
    child_process_rev = subprocess.call(str(cmline_rev), shell=True)

diff_ident = []
Example #31
0
    def getIdentity(self):
        """Align every listed protein pair with needle and summarise identities.

        For each file in self.listFiles the matching pairs file
        (self.workPath + <name> + '.txt') is read line by line; each line
        names one protein from self.queryProt and one from the current
        file. The two sequences are aligned with needle and the identity
        fraction is collected. One tab separated row per file (name,
        number of pairs, mean, standard deviation, pairs divided by the
        number of records in self.queryProt) is written to
        'orthologs_probabilities.csv'.

        Returns self.listFiles.
        """
        self.getFiles()
        needleOut = self.workPath + 'needle.txt'
        dfResult = 'orthologs_probabilities.csv'

        patt = re.compile(r'.*\|(.*)\s+.*\|(.*)\s+\d+')
        pattFile = re.compile(r'\.')
        identPatt = re.compile(r'.*Identity\:\s+(\d+)\/(\d+)')

        hostSeq = SeqIO.to_dict(SeqIO.parse(self.queryProt, "fasta"))
        hostCds = len(hostSeq)

        with open(dfResult, 'w') as handle:
            handle.write('name\torthologs\tmean\tstd\tPr_orthologs\n')

            for fileName in self.listFiles:
                qSeq = os.path.join(self.protFolder, fileName)
                queryGen = SeqIO.to_dict(SeqIO.parse(qSeq, "fasta"))
                # Name is everything before the first dot of the file name.
                name = str(pattFile.split(fileName)[0])
                pairsFile = self.workPath + name + '.txt'

                ident = []

                with open(pairsFile, 'r') as pairs:
                    for line in pairs:
                        # A blank (e.g. trailing) line carries no pair.
                        if not line.strip():
                            continue
                        objM = patt.match(line)

                        # The two ids may come in either order; whichever
                        # is a key of hostSeq is the host protein.
                        if objM.group(1) in hostSeq:
                            hostProt = objM.group(1)
                            qProt = objM.group(2)
                        else:
                            hostProt = objM.group(2)
                            qProt = objM.group(1)

                        needle_cline = NeedleCommandline(
                            asequence='asis:%s' % str(hostSeq[hostProt].seq),
                            bsequence='asis:%s' % str(queryGen[qProt].seq),
                            gapopen=10,
                            gapextend=0.5,
                            outfile=needleOut)
                        needle_cline()

                        with open(needleOut, 'r') as needleReport:
                            for liFile in needleReport:
                                objIdent = identPatt.match(liFile)
                                if objIdent is not None:
                                    # identical positions / alignment length
                                    scoreIdent = float(objIdent.group(1)) / float(
                                        objIdent.group(2))
                                    ident.append(scoreIdent)

                orthoTotal = len(ident)
                a = np.array(ident)
                meanQuery = np.mean(a)
                stdQuery = np.std(a)
                orthoPr = float(orthoTotal) / hostCds
                handle.write('%s\t%s\t%s\t%s\t%f\n' %
                             (name, orthoTotal, meanQuery, stdQuery, orthoPr))
        # The scratch file only exists if at least one alignment ran.
        if os.path.exists(needleOut):
            os.remove(needleOut)
        return self.listFiles
def global_align(seq_record1, seq_record2):
    """Globally align two sequence records with EMBOSS needle.

    Both sequences are upper-cased (the records are modified in place) and
    written to temporary FASTA files. The records are then checked against
    the ambiguous DNA alphabet and, failing that, the protein alphabet,
    using _verify_alphabet from Bio.Alphabet; the alphabet that was tried
    last stays assigned to both sequences. needle is run with gap open 10
    and gap extend 0.5 and its "srspair" output is parsed.

    Returns the alignment read by AlignIO from needle's standard output.
    Raises ValueError when the sequences fit neither alphabet.
    """

    from Bio.Alphabet import IUPAC
    from Bio.Alphabet import _verify_alphabet

    seq_record1.seq = seq_record1.seq.upper()
    seq_record2.seq = seq_record2.seq.upper()

    seq1_file = NamedTemporaryFile()
    seq2_file = NamedTemporaryFile()
    try:
        SeqIO.write(seq_record1, seq1_file, "fasta")
        seq1_file.flush()
        SeqIO.write(seq_record2, seq2_file, "fasta")
        seq2_file.flush()

        # The needle invocation is identical for DNA and protein input, so
        # the alphabets are simply tried in order.
        for alphabet in (IUPAC.ambiguous_dna, IUPAC.protein):
            seq_record1.seq.alphabet = alphabet
            seq_record2.seq.alphabet = alphabet
            if not (_verify_alphabet(seq_record1.seq)
                    and _verify_alphabet(seq_record2.seq)):
                continue

            needle_cline = NeedleCommandline(asequence=seq1_file.name,
                                             bsequence=seq2_file.name,
                                             stdout=True,
                                             gapopen=10,
                                             gapextend=0.5,
                                             auto=True,
                                             aformat="srspair")
            stdout, stderr = needle_cline()
            return AlignIO.read(StringIO.StringIO(stdout), "emboss")
    finally:
        # Closing a NamedTemporaryFile also deletes it.
        seq1_file.close()
        seq2_file.close()

    # A bare string cannot be raised; use a real exception type.
    raise ValueError("unkown alphabet!")
Example #33
0
from Bio import SeqIO, Entrez
Entrez.email = "*****@*****.**"


if __name__ == "__main__":
    # The dataset is a whitespace separated list of nucleotide record ids.
    with open(os.path.join('data', 'rosalind_need.txt')) as dataset:
        ids = dataset.read().split()

    # Fetch all records in one request and parse them as FASTA.
    handle = Entrez.efetch(db='nucleotide', id=ids, rettype="fasta")
    records = list(SeqIO.parse(handle, 'fasta'))

    # Write each record to a file named after the id at the same position;
    # assumes records come back in request order -- TODO confirm.
    for i, r in enumerate(records):
        with open(ids[i], 'w') as f:
            SeqIO.write(r, f, 'fasta')

    # Global alignment of the first two records, with end gaps penalised
    # the same way as internal gaps (open 11, extend 1).
    needle_cline = NeedleCommandline()
    needle_cline.asequence = ids[0]
    needle_cline.bsequence = ids[1]
    needle_cline.outfile = "need.txt"
    needle_cline.gapopen = 11
    needle_cline.gapextend = 1
    needle_cline.endopen = 11
    needle_cline.endextend = 1
    needle_cline.endweight = True

    needle_cline()

    # Load the whole needle report for the scan that follows.
    with open('need.txt') as f:
        output = f.readlines()

    for line in output:
Example #34
0
def transfer_features_from_template_to_query(template_features,
                                             query_file,
                                             save_dir="",
                                             save_not_found=False):
    """Transfer features from template to query. Positions are defined in the
    template and we use needle to find the corresponding positions in the query.

    Parameters:
    -----------
    template_features: QuerySet of Feature django models
        The features that relate to the template.
    query_file: str
        Path to FASTA file containing query sequence
    save_dir: str
        Path to save temp files.
    save_not_found: bool
        Add Features even if they weren't found. Indices will be (-1, -1)

    Yields:
    -------
    A Feature django model with the name of the feature and position relative to the query
    """
    if len(template_features) == 0:
        return

    # Unique file name so concurrent calls do not share a needle report.
    n2 = str(uuid.uuid4())
    template = template_features.first().template
    template_file = template.path()
    needle_results = os.path.join(save_dir, "needle_{}.txt".format(n2))
    # Prefer a needle binary next to the Python executable, else use PATH.
    cmd = os.path.join(os.path.dirname(sys.executable), "needle")

    if not os.path.isfile(cmd):
        cmd = "needle"
    needle_cline = NeedleCommandline(cmd=cmd,
                                     asequence=template_file,
                                     bsequence=query_file,
                                     gapopen=10,
                                     gapextend=1,
                                     outfile=needle_results)
    needle_cline()

    # Remove the report as soon as it has been parsed. Doing this after the
    # last yield left the file behind whenever the caller stopped iterating
    # early or parsing failed.
    try:
        align = AlignIO.read(needle_results, "emboss")
    finally:
        if os.path.exists(needle_results):
            os.remove(needle_results)
    core_histone = align[0]
    query = align[1]

    # corresponding_hist[p]: alignment column of ungapped template position p.
    corresponding_hist = list(range(len(template.get_sequence())))
    k = 0
    for i, core_histone_position in enumerate(core_histone):
        if core_histone_position == "-":
            k += 1
        else:
            corresponding_hist[i - k] = i

    # corresponding_test[p]: alignment column of ungapped query position p.
    corresponding_test = list(
        range(len(next(SeqIO.parse(query_file, "fasta")))))
    k = 0
    for i, query_position in enumerate(query):
        if query_position == "-":
            k += 1
        else:
            corresponding_test[i - k] = i

    for feature in template_features:
        start_in_aln = corresponding_hist[feature.start]
        end_in_aln = corresponding_hist[feature.end]
        start_in_test_seq = -1
        end_in_test_seq = -1

        # Walk right from the start column until a column that holds a
        # query residue is found.
        for k in range(len(core_histone)):
            try:
                start_in_test_seq = corresponding_test.index(start_in_aln + k)
                break
            except ValueError:
                continue

        # Walk left from the end column in the same way.
        for k in range(len(core_histone)):
            try:
                end_in_test_seq = corresponding_test.index(end_in_aln - k)
                break
            except ValueError:
                continue

        feature_id = "{}_{}".format(
            os.path.splitext(query_file)[0], feature.id)
        if start_in_test_seq == -1 or end_in_test_seq == -1 or start_in_test_seq > end_in_test_seq:
            if save_not_found:
                yield Feature(
                    id=feature_id,
                    name=feature.name,
                    description=feature.description,
                    start=-1,
                    end=-1,
                    color=feature.color,
                )
        else:
            yield Feature(
                id=feature_id,
                name=feature.name,
                description=feature.description,
                start=start_in_test_seq,
                end=end_in_test_seq,
                color=feature.color,
            )
Example #35
0
 def test_needle_file(self):
     """Run needle on two "asis:" literal sequences and parse its output file."""
     #Build the command line one option at a time,
     needle = NeedleCommandline(cmd=exes["needle"])
     options = (
         ("-asequence", "asis:ACCCGGGCGCGGT"),
         ("-bsequence", "asis:ACCCGAGCGCGGT"),
         ("-gapopen", "10"),
         ("-gapextend", "0.5"),
         #EMBOSS would guess the sequence type, but be explicit:
         ("-snucleotide", "True"),
         ("-outfile", "Emboss/temp with space.needle"),
     )
     for option, value in options:
         needle.set_parameter(option, value)
     #repr() has to evaluate back to an equivalent command line,
     self.assertEqual(str(eval(repr(needle))), str(needle))
     #Execute the tool,
     result, out, err = generic_run(needle)
     #The banner is expected on stderr and nothing on stdout,
     errors = err.read().strip()
     self.assert_(errors.startswith("Needleman-Wunsch global alignment"), errors)
     self.assertEqual(out.read().strip(), "")
     #Show the command line when the tool failed,
     if result.return_code != 0:
         print >> sys.stderr, "\n%s"%needle
     self.assertEqual(result.return_code, 0)
     out_path = result.get_result("outfile")
     self.assertEqual(out_path, "Emboss/temp with space.needle")
     assert os.path.isfile(out_path)
     #The output must be readable as an EMBOSS alignment of two rows,
     alignment = AlignIO.read(open(out_path),"emboss")
     self.assertEqual(len(alignment), 2)
     self.assertEqual(str(alignment[0].seq), "ACCCGGGCGCGGT")
     self.assertEqual(str(alignment[1].seq), "ACCCGAGCGCGGT")
     #Remove the generated file,
     os.remove(out_path)
Example #36
0
 def test_needle_file(self):
     """Run needle on two "asis:" literal sequences and parse its output file."""
     #Build the command line one option at a time,
     needle = NeedleCommandline(cmd=exes["needle"])
     options = (
         ("-asequence", "asis:ACCCGGGCGCGGT"),
         ("-bsequence", "asis:ACCCGAGCGCGGT"),
         ("-gapopen", "10"),
         ("-gapextend", "0.5"),
         #EMBOSS would guess the sequence type, but be explicit:
         ("-snucleotide", "True"),
         ("-outfile", "Emboss/temp with space.needle"),
     )
     for option, value in options:
         needle.set_parameter(option, value)
     #repr() has to evaluate back to an equivalent command line,
     self.assertEqual(str(eval(repr(needle))), str(needle))
     #Execute the tool,
     result, out, err = generic_run(needle)
     #The banner is expected on stderr and nothing on stdout,
     errors = err.read().strip()
     self.assert_(errors.startswith("Needleman-Wunsch global alignment"),
                  errors)
     self.assertEqual(out.read().strip(), "")
     #Show the command line when the tool failed,
     if result.return_code != 0:
         print >> sys.stderr, "\n%s" % needle
     self.assertEqual(result.return_code, 0)
     out_path = result.get_result("outfile")
     self.assertEqual(out_path, "Emboss/temp with space.needle")
     assert os.path.isfile(out_path)
     #The output must be readable as an EMBOSS alignment of two rows,
     alignment = AlignIO.read(open(out_path), "emboss")
     self.assertEqual(len(alignment), 2)
     self.assertEqual(str(alignment[0].seq), "ACCCGGGCGCGGT")
     self.assertEqual(str(alignment[1].seq), "ACCCGAGCGCGGT")
     #Remove the generated file,
     os.remove(out_path)
Example #37
0
                )  # Output the input sequence restructured as Dunlop reference
        else:
            print(sequence.id + " lacks the origin of replication")
            continue

    # NCCR BLAST
    block = NcbiblastnCommandline(query="target_sequence",
                                  subject="source/NCCR_BKTyper.fasta",
                                  outfmt=6,
                                  word_size=12,
                                  perc_identity=75,
                                  evalue=0.05)()[0]  ###
    # VP1 Needleman and Wunch
    a = NeedleCommandline(asequence="target_sequence", \
       bsequence="source/VP1_Dunlop.fasta", \
       gapopen=10, \
       gapextend=0.5, \
       outfile="needle_fname")

    a()  # execute the alignment
    # Export the alignment back to Python
    VP1_alignment = AlignIO.read("needle_fname", "emboss")

    # Call functions based on mode
    NCCR = NCCR_complex = subgroup = subgroup_detail = 'NA'  # definition of table objects
    motif_list = (open("source/motif_list.txt", "r"))
    vp1_db_file = (open("source/VP1_BKTyper_MLtree_list.fasta", "r"))
    vp1_db = vp1_db_file.read()

    if sys.argv[2] == 'VP1':
        (subgroup, subgroup_detail) = VP1_classification(VP1_alignment)
Example #38
0
 def test_needle_file(self):
     """Run needle on two "asis:" literal sequences and parse its output file."""
     # Build the command line one option at a time.
     needle = NeedleCommandline(cmd=exes["needle"])
     options = (
         ("-asequence", "asis:ACCCGGGCGCGGT"),
         ("-bsequence", "asis:ACCCGAGCGCGGT"),
         ("-gapopen", "10"),
         ("-gapextend", "0.5"),
         # EMBOSS would guess the sequence type, but be explicit.
         ("-snucleotide", "True"),
         ("-outfile", "Emboss/temp with space.needle"),
     )
     for option, value in options:
         needle.set_parameter(option, value)
     # repr() has to evaluate back to an equivalent command line.
     self.assertEqual(str(eval(repr(needle))), str(needle))
     # Execute the tool.
     stdout, stderr = needle()
     # The banner is expected on stderr and nothing on stdout.
     banner_seen = stderr.strip().startswith(
         "Needleman-Wunsch global alignment")
     self.assertTrue(banner_seen, stderr)
     self.assertEqual(stdout.strip(), "")
     out_path = needle.outfile
     missing_msg = "Missing output file %r from:\n%s" % (out_path, needle)
     self.assertTrue(os.path.isfile(out_path), missing_msg)
     # The output must be readable as an EMBOSS alignment of two rows.
     alignment = AlignIO.read(out_path, "emboss")
     self.assertEqual(len(alignment), 2)
     first_row, second_row = alignment[0], alignment[1]
     self.assertEqual(str(first_row.seq), "ACCCGGGCGCGGT")
     self.assertEqual(str(second_row.seq), "ACCCGAGCGCGGT")
     # Remove the generated file.
     os.remove(out_path)
Example #39
0
    def GetExec(self, optList, frame):
        """Respond to the "embossn" type command.

        Stores frame, the output path and the output type on the instance,
        builds a needle command line from the values held by the GUI
        widgets and returns it as a string. optList is not used.
        """
        self.frame = frame
        plugin_exe = r"C:/mEMBOSS/needle.exe"
        self.outfile = r"C:\Users\francis\Documents\Monguis\BioGui\plugins\needle.txt"
        self.outtype = "fasta"

        # Both input sequences come from the frame's parameter boxes.
        seq_a = str(self.frame.paramBoxes[1].GetValue())
        seq_b = str(self.frame.paramBoxes[3].GetValue())
        cline = NeedleCommandline(plugin_exe, asequence=seq_a, bsequence=seq_b)
        cline.outfile = self.outfile
        cline.gapopen = self.param[7].GetValue()
        cline.gapextend = self.param[9].GetValue()
        cline.similarity = bool(self.param[10].GetValue())

        # Set the sequence type flags from the frame's alphabet; any other
        # alphabet value leaves both flags untouched.
        alphabet = self.frame.abet
        if alphabet == "AA":
            cline.snucleotide = False
            cline.sprotein = True
        elif alphabet in ("DNA", "RNA"):
            cline.snucleotide = True
            cline.sprotein = False

        # An optional data file is only passed on when one was entered.
        if self.frame.options:
            t = self.boxList[3].GetValue()
            if t != '':
                cline.datafile = str(t)
        return str(cline)