Example #1
0
 def test_Mafft_with_Clustalw_output(self):
     """Run MAFFT with the clustalout option and check the output header."""
     cmdline = MafftCommandline(mafft_exe)
     # Configure the wrapper through its property interface.
     cmdline.input = self.infile1
     cmdline.clustalout = True
     # repr() must evaluate back to an equivalent command line.
     rebuilt = eval(repr(cmdline))
     self.assertEqual(str(rebuilt), str(cmdline))
     out, err = cmdline()
     # The header wording varies, e.g.
     # "CLUSTAL format alignment by MAFFT ..." or
     # "CLUSTAL (-like) formatted alignment by MAFFT FFT-NS-2 (v6.240)"
     self.assertTrue(out.startswith("CLUSTAL"), out)
     self.assertNotIn("$#=0", err)
Example #2
0
 def test_Mafft_with_Clustalw_output(self):
     """Simple round-trip through app with clustal output.

     Runs the command via ``Application.generic_run`` and checks the
     return code, the CLUSTAL header on stdout, the absence of the
     ``$#=0`` marker on stderr and the exact command line that was run.
     """
     cmdline = MafftCommandline(mafft_exe)
     # Use some properties:
     cmdline.input = self.infile1
     cmdline.clustalout = True
     self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
     # The first value returned is the result object (it carries
     # ``return_code`` and ``_cl``), not a stdin handle.
     result, stdout, stderr = Application.generic_run(cmdline)
     self.assertEqual(result.return_code, 0)
     # assertTrue replaces the deprecated ``assert_`` alias.
     self.assertTrue(
         stdout.read().startswith("CLUSTAL format alignment by MAFFT"))
     self.assertTrue("$#=0" not in stderr.read())
     self.assertEqual(str(result._cl),
                      mafft_exe + " --clustalout Fasta/f002")
Example #3
0
def mafft_align(fa_path, afa_path):
    """Align an amino acid FASTA file with MAFFT.

    Takes amino-acid seqs from fa_path and writes the aligned amino-acids
    (MAFFT's stdout) to afa_path. MAFFT's stderr is written next to it as
    ``<afa_path>.err``.

    """
    mafft_call = MafftCommandline(input=fa_path)
    mafft_call.maxiterate = 1000
    mafft_call.retree = 2
    stdout, stderr = mafft_call()
    # Context managers guarantee both files are flushed and closed.
    with open(afa_path, "w") as aligned_out:
        aligned_out.write(stdout)
    with open("%s.err" % afa_path, "w") as err_out:
        err_out.write(stderr)
Example #4
0
 def align(cls, seq_records, outfile=None):
     '''Align the given sequences with MAFFT.
     @param seq_records: a list of SeqRecords objects
     @param outfile: a filename for the output alignment or None
     @return: None if the command line run fails; False if it succeeds but
     the output file is missing or empty; otherwise an AlignmentExt object
     when no outfile was given, or True when one was.'''
     # Without an explicit outfile, align into a temporary file that is
     # removed again before returning.
     remove_out = not outfile
     if remove_out:
         outfile = mktmp_name('.aln.fasta')
     msafile = mktmp_fasta(seq_records)
     args = dict(thread=-1, input=msafile)
     if len(seq_records) < 10000:
         args['auto'] = True
     else:
         args.update(parttree=True, partsize=1000)
     ali = None
     if run_cline(MafftCommandline(**args), stdout=outfile):
         has_output = os.path.isfile(outfile) and os.path.getsize(outfile) > 0
         if not has_output:
             ali = False
         elif remove_out:
             ali = AlignmentExt.from_msa(AlignIO.read(outfile, 'fasta'))
         else:
             ali = True
     if remove_out:
         safe_unlink(outfile)
     safe_unlink(msafile)
     return ali
def align_sequences(fasta_temp_dir, alignment_temp_dir, wd):
    """Align every ``*.fasta`` file in fasta_temp_dir with MAFFT.

    Changes the working directory to fasta_temp_dir. For each FASTA file,
    the alignment is written to
    ``<alignment_temp_dir>/<name>.alignment.fasta`` unless that file
    already exists. ``wd`` is accepted but not used.
    """
    os.chdir(fasta_temp_dir)

    print('aligning each sample sequence to reference genome')
    for count, fasta_name in enumerate(glob.glob('*.fasta'), 1):
        print(count)
        sample_seq_name = fasta_name.split('.fasta')[0]

        # Destination of this sample's alignment.
        alignment_file_name = os.path.join(
            alignment_temp_dir, '%s.alignment.fasta' % sample_seq_name)

        # Skip samples that were already aligned.
        if os.path.isfile(alignment_file_name):
            continue

        mafft_cline = MafftCommandline(input=fasta_name)
        print(mafft_cline)
        aligned_text, _ = mafft_cline()
        with open(alignment_file_name, 'w') as handle:
            handle.write(aligned_text)
Example #6
0
 def test_Mafft_with_Clustalw_output(self):
     """Simple round-trip through app with clustal output.

     Runs the command via ``Application.generic_run`` and checks the
     return code, that stdout starts with ``CLUSTAL``, that stderr lacks
     the ``$#=0`` marker, and the exact command line that was run.
     """
     cmdline = MafftCommandline(mafft_exe)
     # Use some properties:
     cmdline.input = self.infile1
     cmdline.clustalout = True
     self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
     result, stdout, stderr = Application.generic_run(cmdline)
     self.assertEqual(result.return_code, 0)
     output = stdout.read()
     # e.g. "CLUSTAL format alignment by MAFFT ..."
     # or "CLUSTAL (-like) formatted alignment by MAFFT FFT-NS-2 (v6.240)"
     # assertTrue replaces the deprecated ``assert_`` alias.
     self.assertTrue(output.startswith("CLUSTAL"), output)
     self.assertTrue("$#=0" not in stderr.read())
     self.assertEqual(str(result._cl),
                      mafft_exe + " --clustalout Fasta/f002")
Example #7
0
def mafft_align(file):
    """Align ``file`` with MAFFT (auto option) and save the result.

    The alignment (MAFFT's stdout) is written to the input path with its
    extension replaced by ``.fasta.mafft``.
    """
    cline = MafftCommandline(input=file, auto=True)
    aligned_text, _ = cline()
    stem = os.path.splitext(file)[0]
    with open(f"{stem}.fasta.mafft", "w") as aligned:
        aligned.write(aligned_text)
Example #8
0
def call_mafft(genefile):
    """Call MAFFT to generate an alignment.

    The alignment is written to ``genefile`` with ``_prot.fasta`` replaced
    by ``_aligned.fasta``. Any exception is printed and reported as a
    failure rather than propagated.

    Parameters
    ----------
    genefile : str
        a string with the name/path for
        the FASTA file.

    Returns
    -------
    bool
        True if successful, False otherwise.
    """
    # NOTE(review): if genefile does not contain "_prot.fasta" the output
    # path equals the input path -- confirm callers always pass that suffix.
    try:
        options = dict(
            input=genefile,
            adjustdirection=True,
            treeout=True,
            thread=1,
            retree=1,
            maxiterate=0,
        )
        aligned_text, _ = MafftCommandline(**options)()
        destination = genefile.replace("_prot.fasta", "_aligned.fasta")
        with open(destination, "w") as handle:
            handle.write(aligned_text)
    except Exception as e:
        print(e)
        return False
    return True
Example #9
0
def mafft_alignment(mafft_cmd, *args):
    """Align the given sequences with MAFFT and return them upper-cased.

    Each positional sequence is written to a temporary FASTA file under
    ``/dev/shm`` with its position as the header, aligned with the MAFFT
    executable ``mafft_cmd``, and read back.

    Returns a list of aligned sequence strings (upper case), in the order
    the records appear in MAFFT's output. The temporary files are removed
    even when alignment or parsing fails.
    """
    # NOTE(review): fixed temp paths are shared by concurrent callers --
    # confirm this is only ever used from a single process.
    fa_fpath = '/dev/shm/tmp.fa'
    mafft_fpath = '/dev/shm/tmp.mafft'

    try:
        # Write seqs to fasta file
        with open(fa_fpath, 'w') as out:
            for i, s in enumerate(args):
                out.write('>{}\n'.format(i))
                out.write('{}\n'.format(s))

        # Align
        mf_cline = MafftCommandline(mafft_cmd, input=fa_fpath)
        stdout, stderr = mf_cline()
        with open(mafft_fpath, 'w') as out:
            out.write(stdout)

        # Read output; the handle is closed once parsing has finished.
        # NOTE(review): the index is the parse position, so the sort below
        # keeps file order; if ordering by header id was intended, confirm.
        with open(mafft_fpath) as handle:
            alignment = [
                (i, str(rec.seq))
                for i, rec in enumerate(SeqIO.parse(handle, 'fasta'))
            ]
    finally:
        # Delete files, whether or not the steps above succeeded.
        for tmp_path in (fa_fpath, mafft_fpath):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    return [s.upper() for i, s in sorted(alignment)]
Example #10
0
def pool_write_microalignment(mblocknum, targetdata, extendedsourcedata,
                              nbinitialsource, all_ids, msamethod):
    """Align the sequence segments of one block and return them by id.

    ``mblocknum`` is indexed as ``(number, mblock)`` where ``mblock`` maps
    a sequence id to a pair of positions. For every gene in ``targetdata``
    and each of the first ``nbinitialsource`` entries of
    ``extendedsourcedata`` whose id is in ``mblock`` with end > start, the
    slice ``seq[start:end]`` is written to a temporary FASTA file. The file
    is aligned with MUSCLE when ``msamethod == "muscle"`` and with MAFFT
    otherwise.

    Returns a dict mapping each aligned record id to its aligned sequence;
    ids from ``all_ids`` that were not aligned map to a string of ``-`` as
    long as the alignment (empty when nothing was aligned). The temporary
    files are removed before returning, also on failure.
    """
    block_number = mblocknum[0]
    mblock = mblocknum[1]
    input_muscle_file = "input_muscle.fasta" + str(block_number)
    output_muscle_file = "output_muscle.fasta" + str(block_number)

    def _has_segment(seqid):
        # A usable segment needs an entry whose end lies after its start.
        return seqid in mblock and mblock[seqid][1] > mblock[seqid][0]

    aln = {}
    try:
        nbseq = 0
        with open(input_muscle_file, "w") as input_muscle:
            for geneid, geneseq in targetdata:
                if _has_segment(geneid):
                    start, end = mblock[geneid][0], mblock[geneid][1]
                    input_muscle.write(">" + geneid + "\n" +
                                       geneseq[start:end] + "\n")
                    nbseq += 1

            for j in range(nbinitialsource):
                cdsid, cdsseq, _cdsgeneid, _unused = extendedsourcedata[j]
                if _has_segment(cdsid):
                    start, end = mblock[cdsid][0], mblock[cdsid][1]
                    input_muscle.write(">" + cdsid + "\n" +
                                       cdsseq[start:end] + "\n")
                    nbseq += 1

        msa = []
        if nbseq > 0:
            if msamethod == "muscle":
                muscle_cline = MuscleCommandline(input=input_muscle_file,
                                                 out=output_muscle_file,
                                                 gapopen=-800.0)
                muscle_cline()
            else:  # msamethod == "mafft"
                mafft_cline = MafftCommandline(input=input_muscle_file)
                stdout, _stderr = mafft_cline()
                with open(output_muscle_file, "w") as handle:
                    handle.write(stdout)
            msa = AlignIO.read(output_muscle_file, "fasta")
        else:
            # Nothing to align: leave an empty output file behind so the
            # cleanup below treats both cases alike.
            open(output_muscle_file, "w").close()

        length = 0
        for record in msa:
            aln[record.id] = record.seq
            length = len(record.seq)

        # Ids missing from the alignment become all-gap rows.
        for seq_id in all_ids:
            aln.setdefault(seq_id, '-' * length)
    finally:
        for tmp_path in (input_muscle_file, output_muscle_file):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
    return aln
Example #11
0
def do_alignment(fasta, threads):
    """Align ``fasta`` with MAFFT and return the parsed alignment.

    ``threads`` is converted with ``int()`` and passed as MAFFT's thread
    option. MAFFT's stdout is parsed as FASTA by ``AlignIO.read``.
    """
    cline = MafftCommandline(input=fasta, retree=1, maxiterate=0,
                             thread=int(threads))
    aligned_fasta, _unused_stderr = cline()
    return AlignIO.read(io.StringIO(aligned_fasta), "fasta")
def MakeAlignments(seqs, name, path):
    """Align exported data with MAFFT unless the output already exists.

    The alignment of the FASTA file ``seqs`` (MAFFT auto and reorder
    options) is written to ``path + name + '_aligned.txt'``. Nothing is
    done when that file is already present.
    """
    out_path = path + name + '_aligned.txt'
    if os.path.isfile(out_path):
        return
    mafft_cline = MafftCommandline(input=seqs, auto=True, reorder=True)
    stdout, stderr = mafft_cline()
    # The context manager closes the file even if the write fails.
    with open(out_path, 'w') as handle:
        handle.write(stdout)
Example #13
0
 def test_Mafft_with_options(self):
     """Round-trip with an input file plus options; result read from stdout."""
     cmdline = MafftCommandline(mafft_exe)
     # Set everything through set_parameter, in this order.
     settings = (
         ("input", self.infile1),
         ("maxiterate", 100),
         ("--localpair", True),
     )
     for name, value in settings:
         cmdline.set_parameter(name, value)
     rebuilt = eval(repr(cmdline))
     self.assertEqual(str(rebuilt), str(cmdline))
     out, err = cmdline()
     self.assertTrue(out.startswith(">gi|1348912|gb|G26680|G26680"))
     self.assertNotIn("$#=0", err)
 def run_mafft(mafftdir, outputDir, fastaFileList, outputFileNameList,
               startNum, endNum, algorithm):
     """Align ``fastaFileList[startNum:endNum]`` with MAFFT.

     For each index ``i`` in ``range(startNum, endNum)`` the file
     ``fastaFileList[i]`` is aligned and the cleaned stdout is written to
     ``outputFileNameList[i]``. The executable is ``mafft.bat`` on Windows
     and ``mafft`` elsewhere, both looked up in ``mafftdir``.

     ``algorithm`` may be None or one of 'genafpair', 'localpair' or
     'globalpair' (case-insensitive); other values are ignored.
     ``outputDir`` is accepted but not used.

     Raises Exception with MAFFT's stderr when MAFFT prints nothing.
     """
     # Set up
     import platform
     exe_name = 'mafft.bat' if platform.system() == 'Windows' else 'mafft'
     executable = os.path.join(mafftdir, exe_name)
     pair_modes = ('genafpair', 'localpair', 'globalpair')
     # Trailing lines that are not part of the alignment; MAFFT's batch
     # wrapper sometimes appends the 'Terminate ...' prompt.
     junk_lines = ('', 'Terminate batch job (Y/N)?')
     for i in range(startNum, endNum):
         # Run MAFFT
         mafft_cline = MafftCommandline(executable, input=fastaFileList[i])
         if algorithm is not None:
             mode = algorithm.lower()
             if mode in pair_modes:
                 setattr(mafft_cline, mode, True)
         stdout, stderr = mafft_cline()
         if stdout == '':
             raise Exception('MAFFT error text below' + str(stderr))
         # Process MAFFT output. After split('\n') no element contains a
         # newline, so compare against the bare text; the emptiness check
         # avoids an IndexError when every line is junk.
         lines = stdout.split('\n')
         while lines and lines[-1].strip() in junk_lines:
             del lines[-1]
         stdout = '\n'.join(lines)
         # Create output alignment files
         with open(outputFileNameList[i], 'w') as fileOut:
             fileOut.write(stdout)
Example #15
0
    def mafft_align(query_seq, target_seq, query_name, target_name, align_method="local", directory="./", quiet=False):
        """Pairwise-align a query against a target with MAFFT.

        The two sequences are written (target first) to a timestamped
        temporary FASTA file in ``directory`` and aligned as nucleotides
        with maxiterate=1000, using the localpair option when
        ``align_method == "local"`` and globalpair otherwise.

        Returns ``(id, seq, id, seq)`` for the first and second record of
        the alignment. The temporary file is removed even on failure.
        """
        # Timestamp prefix, e.g. '20160809_144522_' for 2016-08-09 14:45:22,
        # to make the file name unique.
        file_name = directory + datetime.now().strftime("%Y%m%d_%H%M%S_") + query_name + ".fasta"
        try:
            with open(file_name, 'w') as data_out:
                data_out.write(">{}\n{}\n>{}\n{}".format(target_name, target_seq, query_name, query_seq))
            pair_option = "localpair" if align_method == "local" else "globalpair"
            # file_name already includes ``directory``; prefixing it again
            # points MAFFT at a non-existent path for anything but "./".
            mafft_cline = MafftCommandline(input=file_name, nuc=True, maxiterate=1000,
                                           quiet=quiet, **{pair_option: True})
            out, _ = mafft_cline()
            align = AlignIO.read(StringIO(out), "fasta")
            my_list = list(align)
        finally:
            if os.path.exists(file_name):
                os.remove(file_name)
        # target name, target seq, query name, query seq
        return my_list[0].id, my_list[0].seq, my_list[1].id, my_list[1].seq
def align(fasta):
    """Align ``fasta`` with MAFFT and return the first two aligned rows.

    MAFFT needs to be in the path. The input is passed as a path relative
    to the current directory. Returns a two-element list of strings.
    """
    rel_path = os.path.relpath(fasta)
    aligned_text, _ = MafftCommandline(input=rel_path)()
    msa = AlignIO.read(StringIO(aligned_text), "fasta")
    return [str(msa[row].seq) for row in (0, 1)]
Example #17
0
 def test_Mafft_with_Clustalw_output(self):
     """Simple round-trip through app with clustal output.

     Runs the command line in a subprocess and checks the return code,
     that stdout starts with ``CLUSTAL`` and that stderr lacks ``$#=0``.
     """
     cmdline = MafftCommandline(mafft_exe)
     # Use some properties:
     cmdline.input = self.infile1
     cmdline.clustalout = True
     self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
     child = subprocess.Popen(str(cmdline),
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              universal_newlines=True,
                              shell=(sys.platform != "win32"))
     # communicate() drains both pipes while waiting; wait() alone can
     # deadlock once a pipe buffer fills. Text mode keeps the str
     # comparisons below valid.
     stdoutdata, stderrdata = child.communicate()
     return_code = child.returncode
     self.assertEqual(return_code, 0)
     # e.g. "CLUSTAL format alignment by MAFFT ..."
     # or "CLUSTAL (-like) formatted alignment by MAFFT FFT-NS-2 (v6.240)"
     self.assertTrue(stdoutdata.startswith("CLUSTAL"), stdoutdata)
     self.assertTrue("$#=0" not in stderrdata)
     del child
Example #18
0
def create_msa(fasta_infile, msa_fasta, msa_phy):
    """Create a multiple sequence alignment with MAFFT in PHYLIP format.

    The MAFFT output for ``fasta_infile`` is saved as FASTA in
    ``msa_fasta`` and then converted to relaxed PHYLIP in ``msa_phy``.
    """
    cline = MafftCommandline(input=fasta_infile)
    aligned_text, _ = cline()
    # Keep the FASTA alignment on disk; the conversion reads it back.
    with open(msa_fasta, 'w') as handle:
        handle.write(aligned_text)
    AlignIO.convert(msa_fasta, "fasta", msa_phy, "phylip-relaxed")
Example #19
0
def mafft(infile):
    """Align ``infile`` with MAFFT and save the result in CLUSTAL format.

    The command line is printed before it runs. The output path is
    ``infile`` with ``.fasta`` replaced by ``_mafft.aln``.
    """
    from Bio.Align.Applications import MafftCommandline
    from io import StringIO
    from Bio import AlignIO
    cline = MafftCommandline("mafft", input=infile)
    print(cline)
    aligned_text, _ = cline()
    msa = AlignIO.read(StringIO(aligned_text), "fasta")
    AlignIO.write(msa, infile.replace('.fasta', '_mafft.aln'), "clustal")
Example #20
0
    def codon_align(self, alignment_tool="mafft", prune=True, verbose=0):
        ''' takes a nucleotide alignment, translates it, aligns the amino acids, pads the gaps
        note that this suppresses any compensated frameshift mutations

        Works inside ``self.run_dir`` (created, entered, then removed).
        On success sets ``self.aln`` and ``self.sequence_lookup``.

        Parameters:
        - alignment_tool: ['mafft', 'muscle'] the commandline tool to use
        - prune: when true, sequences with a stop codon before the last
          position of their translation are left out
        - verbose: when true, print the id of each discarded sequence
        '''
        from Bio import AlignIO, SeqIO
        from Bio.SeqRecord import SeqRecord
        make_dir(self.run_dir)
        os.chdir(self.run_dir)

        # translate
        aa_seqs = {}
        bad_seq = 0
        for seq in self.seqs.values():
            tempseq = seq.seq.translate()
            has_internal_stop = '*' in str(tempseq)[:-1]
            # use only sequences that translate without trouble
            if not has_internal_stop or prune == False:
                aa_seqs[seq.id] = SeqRecord(tempseq, id=seq.id)
                aa_seqs[seq.id].attributes = seq.attributes
            else:
                if verbose: print(seq.id, "has premature stops, discarding")
            bad_seq += has_internal_stop

        print('Number of sequences with stops:', bad_seq, 'out of total',
              len(self.seqs))
        tmpfname = 'temp_in.fasta'
        SeqIO.write(aa_seqs.values(), tmpfname, 'fasta')

        if alignment_tool == 'muscle':
            from Bio.Align.Applications import MuscleCommandline
            cline = MuscleCommandline(input=tmpfname,
                                      out=tmpfname[:-5] + 'aligned.fasta')
            cline()
            aln_aa = AlignIO.read(tmpfname[:-5] + 'aligned.fasta', "fasta")
        elif alignment_tool == 'mafft':
            from Bio.Align.Applications import MafftCommandline
            # The StringIO module only exists on Python 2; fall back to
            # io.StringIO so this branch also imports on Python 3.
            try:
                from StringIO import StringIO
            except ImportError:
                from io import StringIO
            mafft_cline = MafftCommandline(input=tmpfname)
            stdout, stderr = mafft_cline()
            aln_aa = AlignIO.read(StringIO(stdout), "fasta")
        else:
            # NOTE(review): this early return leaves the process inside
            # run_dir and does not remove it -- confirm that is intended.
            print('Alignment tool not supported:', alignment_tool)
            return

        #generate nucleotide alignment
        self.aln = pad_nucleotide_sequences(aln_aa, self.seqs)
        self.sequence_lookup = {seq.id: seq for seq in self.aln}
        # add attributes to alignment
        for seq in self.seqs.values():
            if seq.id in self.sequence_lookup:
                self.sequence_lookup[seq.id].attributes = seq.attributes
        os.chdir('..')
        remove_dir(self.run_dir)
Example #21
0
def align_with_mafft(filepath, localpair=False, maxiterate=1000):
    """
    Align the file at the given filepath using MAFFT.
    :param filepath: The file to align
    :param localpair: Value for MAFFT's localpair option (the l-insi method)
    :param maxiterate: Value for MAFFT's maxiterate option
    :return: The MAFFT alignment, parsed from stdout as FASTA
    """
    cline = MafftCommandline(input=filepath, localpair=localpair, maxiterate=maxiterate)
    aligned_text, _ = cline()
    return AlignIO.read(io.StringIO(aligned_text), "fasta")
Example #22
0
 def test_Mafft_simple(self):
     """Round-trip with only an input file; the result arrives on stdout."""
     # The input is supplied as a keyword argument at construction time.
     cmdline = MafftCommandline(mafft_exe, input=self.infile1)
     rebuilt = eval(repr(cmdline))
     self.assertEqual(str(rebuilt), str(cmdline))
     out, err = cmdline()
     self.assertTrue(out.startswith(">gi|1348912|gb|G26680|G26680"))
     self.assertTrue("Progressive alignment ..." in err, err)
     self.assertNotIn("$#=0", err)
Example #23
0
 def align_cluster(self, cluster_file):
     """
     Worker function for align_clusters.
     Takes a FASTA file containing an unaligned sequence cluster, aligns
     it with MAFFT (auto and adjustdirection options) and writes the
     result under ``alignments/``. Returns the alignment file path.
     """
     cline = MafftCommandline(input=cluster_file)
     for flag in ("--auto", "--adjustdirection"):
         cline.set_parameter(flag, True)
     color = Color()
     print(color.red + str(cline) + color.done)
     sys.stdout.flush()
     # Replace everything before the first "/" with "alignments"; a bare
     # file name simply goes into "alignments/".
     _, slash, remainder = cluster_file.partition("/")
     alignment_file = "alignments/" + (remainder if slash else cluster_file)
     aligned_text, _ = cline()
     with open(alignment_file, "w") as handle:
         handle.write(aligned_text)
     return alignment_file
Example #24
0
 def align(self):
     """Align ``self.pair_pep_file`` into ``self.prot_align_file``.

     Uses MAFFT or MUSCLE depending on ``self.align_software``; any other
     value does nothing.
     """
     tool = self.align_software
     if tool == 'mafft':
         cline = MafftCommandline(
             cmd=self.mafft_path, input=self.pair_pep_file, auto=True)
         aligned_text, _ = cline()
         # MAFFT prints to stdout: parse it and write it out as FASTA.
         msa = AlignIO.read(StringIO(aligned_text), "fasta")
         AlignIO.write(msa, self.prot_align_file, "fasta")
     elif tool == 'muscle':
         cline = MuscleCommandline(
             cmd=self.muscle_path, input=self.pair_pep_file,
             out=self.prot_align_file, seqtype="protein", clwstrict=True)
         cline()
Example #25
0
def align_fasta(in_file_loc):
    """Align ``in_file_loc`` with MAFFT and write the result as ``.aln``.

    The output path is everything before the first ``.fa`` in the input
    path, plus ``.aln``.
    """
    out_path = in_file_loc.partition(".fa")[0] + ".aln"

    cline = MafftCommandline(input=in_file_loc)

    # MAFFT prints the alignment to stdout; persist it.
    aligned_text, _ = cline()
    with open(out_path, "w") as handle:
        handle.write(aligned_text)
Example #26
0
 def test_Mafft_with_Clustalw_output(self):
     """Run MAFFT in a subprocess with the clustalout option enabled."""
     cmdline = MafftCommandline(mafft_exe)
     # Configure the wrapper through its property interface.
     cmdline.input = self.infile1
     cmdline.clustalout = True
     rebuilt = eval(repr(cmdline))
     self.assertEqual(str(rebuilt), str(cmdline))
     use_shell = sys.platform != "win32"
     child = subprocess.Popen(str(cmdline),
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              universal_newlines=True,
                              shell=use_shell)
     out, err = child.communicate()
     return_code = child.returncode
     self.assertEqual(return_code, 0, "Got error code %i back from:\n%s"
                      % (return_code, cmdline))
     # The header wording varies, e.g.
     # "CLUSTAL format alignment by MAFFT ..." or
     # "CLUSTAL (-like) formatted alignment by MAFFT FFT-NS-2 (v6.240)"
     self.assertTrue(out.startswith("CLUSTAL"), out)
     self.assertNotIn("$#=0", err)
     del child
Example #27
0
def call_mafft_0(in_file, out_file):
    """Align ``in_file`` with MAFFT, write stdout to ``out_file``, return it.

    The command line is printed before it runs. No executable path is
    passed to ``MafftCommandline``.
    """
    cline = MafftCommandline(input=in_file)
    print(cline)
    aligned_text, _ = cline()
    with open(out_file, "w") as handle:
        handle.write(aligned_text)
    return out_file
Example #28
0
 def test_Mafft_with_options(self):
     """Round-trip with an input file plus options; result read from stdout."""
     cmdline = MafftCommandline(mafft_exe)
     # Set everything through set_parameter, in this order.
     for name, value in (("input", self.infile1),
                         ("maxiterate", 100),
                         ("--localpair", True)):
         cmdline.set_parameter(name, value)
     expected = str(cmdline)
     self.assertEqual(str(eval(repr(cmdline))), expected)
     out, err = cmdline()
     self.assertTrue(out.startswith(">gi|1348912|gb|G26680|G26680"))
     self.assertNotIn("$#=0", err)
Example #29
0
def run_msa(fasta_path, out_dir, bubble_num):
    """Run MAFFT on ``fasta_path`` and save the MSA for one bubble.

    The alignment (MAFFT's stdout) is written to
    ``<out_dir>/msa-<bubble_num>.fasta``. ``bubble_num`` may be a string
    or a number.
    """
    mafft_cline = MafftCommandline(input=fasta_path)

    print('Performing MSA on bubble number', bubble_num)

    # run MAFFT
    stdout, stderr = mafft_cline()

    # Formatting (rather than ``+``) also accepts an integer bubble_num,
    # which plain concatenation rejects with a TypeError.
    msa_path = os.path.join(out_dir, 'msa-{}.fasta'.format(bubble_num))
    with open(msa_path, 'w') as fh:
        fh.write(stdout)
Example #30
0
 def align_cluster(self, cluster_file):
     """
     Worker function for align_clusters.
     Takes a FASTA file containing an unaligned sequence cluster and
     aligns it with MAFFT (auto and adjustdirection options), writing the
     result under ``alignments/``.

     Returns the alignment file path. If running MAFFT or writing the
     file fails, an error message is printed and the path is still
     returned.
     """
     mafft_cline = MafftCommandline(input=cluster_file)
     mafft_cline.set_parameter("--auto", True)
     mafft_cline.set_parameter("--adjustdirection", True)
     color = Color()
     print(color.red + str(mafft_cline) + color.done)
     sys.stdout.flush()
     # Replace everything before the first "/" with "alignments"; a bare
     # file name simply goes into "alignments/".
     if "/" in cluster_file:
         alignment_file = "alignments" + cluster_file[cluster_file.index("/"):]
     else:
         alignment_file = "alignments/" + cluster_file
     try:
         stdout, stderr = mafft_cline()
         with open(alignment_file, "w") as handle:
             handle.write(stdout)
     except Exception:
         # ``Exception`` rather than a bare except, so KeyboardInterrupt
         # and SystemExit still propagate.
         print(
             color.red +
             "Error: alignment file not generated. Please check your MAFFT installation."
             + color.done)
     return alignment_file
Example #31
0
 def align(self):
     """Align ``self.sequencefile`` into ``self.alignfile``.

     Uses MAFFT or MUSCLE depending on ``self.align_software``; any other
     value does nothing.
     """
     tool = self.align_software
     if tool == 'mafft':
         cline = MafftCommandline(cmd=self.mafft_path,
                                  input=self.sequencefile,
                                  auto=True)
         aligned_text, _ = cline()
         # MAFFT prints to stdout: parse it and write it out as FASTA.
         msa = AlignIO.read(StringIO(aligned_text), "fasta")
         AlignIO.write(msa, self.alignfile, "fasta")
     elif tool == 'muscle':
         cline = MuscleCommandline(cmd=self.muscle_path,
                                   input=self.sequencefile,
                                   out=self.alignfile)
         cline()
Example #32
0
 def test_Mafft_with_PHYLIP_output(self):
     """Round-trip with the phylipout option; names appear shortened."""
     cmdline = MafftCommandline(mafft_exe,
                                input=self.infile1,
                                phylipout=True)
     rebuilt = eval(repr(cmdline))
     self.assertEqual(str(rebuilt), str(cmdline))
     out, err = cmdline()
     # e.g. " 3 706\n" but allow some variation in the column count
     self.assertTrue(out.startswith(" 3 70"), out)
     # The short name is present, the full identifier is not.
     self.assertTrue("gi|1348912 " in out, out)
     self.assertTrue("gi|1348912|gb|G26680|G26680" not in out, out)
     self.assertNotIn("$#=0", err)
Example #33
0
def alignSeqs(in_file, out_file, mafft_bat):
    """Align ``in_file`` with the MAFFT executable ``mafft_bat``.

    MAFFT's stdout is written to ``out_file`` and its stderr to
    ``error.txt`` in the current directory. Always returns True.
    """
    aligned_text, error_text = MafftCommandline(mafft_bat, input=in_file)()

    for destination, content in ((out_file, aligned_text),
                                 ('error.txt', error_text)):
        with open(destination, 'w+') as handle:
            handle.write(content)

    return True
Example #34
0
 def test_Mafft_simple(self):
     """Round-trip with only an input file; the result arrives on stdout."""
     # The input is supplied as a keyword argument at construction time.
     cmdline = MafftCommandline(mafft_exe, input=self.infile1)
     rebuilt = eval(repr(cmdline))
     self.assertEqual(str(rebuilt), str(cmdline))
     out, err = cmdline()
     self.assertTrue(out.startswith(">gi|1348912|gb|G26680|G26680"))
     # Used to get "Progressive alignment ..." but in v7.245
     # became "Progressive alignment 1/2..." and "Progressive alignment 2/2..."
     progress_markers = ("Progressive alignment ...",
                         "Progressive alignment 1/")
     self.assertTrue(any(marker in err for marker in progress_markers),
                     err)
     self.assertNotIn("$#=0", err)
Example #35
0
 def test_Mafft_with_PHYLIP_namelength(self):
     """Check PHYLIP output keeps full names when namelength is 50."""
     cmdline = MafftCommandline(mafft_exe, input=self.infile1,
                                phylipout=True, namelength=50)
     rebuilt = eval(repr(cmdline))
     self.assertEqual(str(rebuilt), str(cmdline))
     out, err = cmdline()
     # e.g. " 3 706\n" or " 3 681" but allow some variation in the column count
     accepted_headers = (" 3 68", " 3 69", " 3 70")
     self.assertTrue(out.startswith(accepted_headers), out)
     self.assertTrue("gi|1348912|gb|G26680|G26680" in out, out)
     self.assertNotIn("$#=0", err)
Example #36
0
def call_mafft(path_to_save, genefile):
    """Align ``genefile`` with MAFFT and write the result to ``path_to_save``.

    Prints a progress line with the base name of ``genefile``. Any
    exception is printed and reported as a failure rather than propagated.

    Returns True on success, False otherwise.
    """
    try:
        # print() with a single argument behaves the same on Python 2 and
        # Python 3; the statement form is a syntax error on Python 3.
        print("maffting " + os.path.basename(genefile))
        mafft_cline = MafftCommandline(input=genefile)
        stdout, stderr = mafft_cline()
        with open(path_to_save, "w") as handle:
            handle.write(stdout)
        return True

    except Exception as e:
        print(e)
        return False
def align_seqs(pool_input):
    """Align one sequence collection and optionally trim and filter it.

    ``pool_input`` is a 13-item sequence: counter, total,
    sequence_collection, aligner, gap_opening_penalty,
    gap_extension_penalty, no_trim, trimal_setting, window_size,
    seq_proportion, conserve_alignment_percentage, min_length, outdir.

    The alignment is written to ``outdir`` under the collection's base
    name with ``sequence_collection_locus_`` removed. Unless ``no_trim``
    is set, ``trimal`` then rewrites that file in place. If ``min_length``
    is set and the alignment is shorter, the file is deleted.

    Returns the locus name (file name without ``.fasta``) when the
    alignment was deleted for being too short, otherwise None.

    Raises ValueError if ``aligner`` is neither 'mafft' nor 'muscle'.
    """
    (counter, total, sequence_collection, aligner, gap_opening_penalty,
     gap_extension_penalty, no_trim, trimal_setting, window_size,
     seq_proportion, conserve_alignment_percentage, min_length,
     outdir) = pool_input
    filename = os.path.basename(sequence_collection).replace(
        'sequence_collection_locus_', '')
    if aligner == 'mafft':
        cline = MafftCommandline(input=sequence_collection,
                                 adjustdirection=True,
                                 maxiterate=1000,
                                 op=gap_opening_penalty,
                                 ep=gap_extension_penalty)
    elif aligner == 'muscle':
        cline = MuscleCommandline(input=sequence_collection,
                                  maxiters=1000,
                                  gapopen=gap_opening_penalty,
                                  gapextend=gap_extension_penalty)
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError('Unsupported aligner: %r' % (aligner,))
    stdout, stderr = cline()
    alignment_out = os.path.join(outdir, filename)
    sys.stdout.write('\rAligning sequence collections %i/%i ' %
                     (int(counter + 1), total))
    sys.stdout.flush()
    with open(alignment_out, "w") as handle:
        handle.write(stdout)

    if not no_trim:
        # trim alignments with trimal
        if trimal_setting != 'manual':
            cmd = [
                "trimal", "-in", alignment_out, "-out", alignment_out,
                '-%s' % trimal_setting
            ]
        else:
            cmd = [
                "trimal", "-in", alignment_out, "-out", alignment_out, '-w',
                str(window_size), '-gt',
                str(seq_proportion), '-cons',
                str(conserve_alignment_percentage)
            ]
        # Run trimal and wait for it to finish; its output is not used.
        proc = subprocess.Popen(cmd,
                                stderr=subprocess.PIPE,
                                stdout=subprocess.PIPE)
        proc.communicate()
    if min_length:
        align = AlignIO.read(alignment_out, "fasta")
        al_length = len(align[0])
        if al_length < min_length:
            # delete file if smaller than minlength
            os.remove(alignment_out)
            # Return locus name in case alignment is too short
            return filename.replace('.fasta', '')
Example #38
0
def mafft(in_file: str):
    """Run MAFFT on ``in_file`` and store what it prints as the alignment.

    MAFFT's standard output is written to ``gisaid_results/aligned.fasta``;
    the command line that was executed is echoed to standard output.

    Args:
        in_file [str]: Input file handed to MAFFT.
    """
    aligner = MafftCommandline(input=in_file)

    # Run first; the command line is only echoed after MAFFT has returned.
    aligned_text, _stderr_text = aligner()
    print(aligner)

    destination = "gisaid_results/aligned.fasta"
    with open(destination, "w") as out_handle:
        out_handle.write(aligned_text)
Example #39
0
 def test_Mafft_with_options(self):
     """Simple round-trip through app with infile and options.

     Result passed to stdout.
     """
     cmdline = MafftCommandline(mafft_exe)
     cmdline.set_parameter("input", self.infile1)
     cmdline.set_parameter("maxiterate", 100)
     cmdline.set_parameter("--localpair", True)
     # repr() must produce an expression that rebuilds the same command line.
     self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
     result, stdout, stderr = Application.generic_run(cmdline)
     self.assertEqual(result.return_code, 0)
     # assertTrue instead of assert_: the alias is deprecated and was removed
     # from unittest in Python 3.12.
     self.assertTrue(stdout.read().startswith(">gi|1348912|gb|G26680|G26680"))
     self.assertTrue("$#=0" not in stderr.read())
     # The command line that was actually run must match the options set above.
     self.assertEqual(str(result._cl),
                      mafft_exe + " --localpair --maxiterate 100 Fasta/f002")
Example #40
0
 def test_Mafft_with_options(self):
     """Simple round-trip through app with infile and options.

     Result passed to stdout.
     """
     cmdline = MafftCommandline(mafft_exe)
     cmdline.set_parameter("input", self.infile1)
     cmdline.set_parameter("maxiterate", 100)
     cmdline.set_parameter("--localpair", True)
     # repr() must produce an expression that rebuilds the same command line.
     self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
     child = subprocess.Popen(str(cmdline),
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              # Text mode, so the output compares against str.
                              universal_newlines=True,
                              shell=(sys.platform != "win32"))
     # communicate() drains both pipes while waiting; wait() combined with
     # PIPEs can deadlock once the child fills an OS pipe buffer.
     stdoutdata, stderrdata = child.communicate()
     return_code = child.returncode
     self.assertEqual(return_code, 0, "Got error code %i back from:\n%s"
                      % (return_code, cmdline))
     # assertTrue instead of the deprecated assert_ alias.
     self.assertTrue(stdoutdata.startswith(">gi|1348912|gb|G26680|G26680"))
     self.assertTrue("$#=0" not in stderrdata)
     del child
Example #41
0
 def test_Mafft_with_options(self):
     """Round-trip a MAFFT command line built from an input file and options.

     The alignment is read back from the child's standard output.
     """
     mafft_cmd = MafftCommandline(mafft_exe)
     for option, value in (("input", self.infile1),
                           ("maxiterate", 100),
                           ("--localpair", True)):
         mafft_cmd.set_parameter(option, value)
     # repr() must evaluate back to an equivalent command line.
     rebuilt = eval(repr(mafft_cmd))
     self.assertEqual(str(rebuilt), str(mafft_cmd))
     # The command string goes through a shell everywhere except on Windows.
     use_shell = sys.platform != "win32"
     process = subprocess.Popen(str(mafft_cmd),
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                universal_newlines=True,
                                shell=use_shell)
     out_text, err_text = process.communicate()
     exit_status = process.returncode
     self.assertEqual(exit_status, 0, "Got error code %i back from:\n%s"
                      % (exit_status, mafft_cmd))
     self.assertTrue(out_text.startswith(">gi|1348912|gb|G26680|G26680"))
     self.assertTrue("$#=0" not in err_text)
     del process
Example #42
0
 def test_Mafft_with_complex_command_line(self):
     """Check that a many-option command line round-trips and then runs."""
     mafft_cmd = MafftCommandline(mafft_exe)
     # Option names are given both with and without leading dashes; the
     # order of the calls is kept exactly as listed here.
     settings = (
         ("input", self.infile1),
         ("--localpair", True),
         ("--weighti", 4.2),
         ("retree", 5),
         ("maxiterate", 200),
         ("--nofft", True),
         ("op", 2.04),
         ("--ep", 0.51),
         ("--lop", 0.233),
         ("lep", 0.2),
         ("--reorder", True),
         ("--treeout", True),
         ("nuc", True),
     )
     for option, value in settings:
         mafft_cmd.set_parameter(option, value)
     # repr() must evaluate back to an equivalent command line.
     rebuilt = eval(repr(mafft_cmd))
     self.assertEqual(str(rebuilt), str(mafft_cmd))
     expected = mafft_exe + (
         " --localpair --weighti 4.2 --retree 5 "
         "--maxiterate 200 --nofft --op 2.04 --ep 0.51"
         " --lop 0.233 --lep 0.2 --reorder --treeout"
         " --nuc Fasta/f002"
     )
     self.assertEqual(str(mafft_cmd), expected)
     # Calling the command line object runs it and returns both streams.
     out_text, err_text = mafft_cmd()
     self.assertTrue(out_text.startswith(">gi|1348912|gb|G26680|G26680"))
     self.assertTrue("$#=0" not in err_text)
Example #43
0
 def test_Mafft_with_complex_command_line(self):
     """Check that a many-option command line round-trips and runs as a child process."""
     mafft_cmd = MafftCommandline(mafft_exe)
     # Option names are given both with and without leading dashes; the
     # order of the calls is kept exactly as listed here.
     settings = (
         ("input", self.infile1),
         ("--localpair", True),
         ("--weighti", 4.2),
         ("retree", 5),
         ("maxiterate", 200),
         ("--nofft", True),
         ("op", 2.04),
         ("--ep", 0.51),
         ("--lop", 0.233),
         ("lep", 0.2),
         ("--reorder", True),
         ("--treeout", True),
         ("nuc", True),
     )
     for option, value in settings:
         mafft_cmd.set_parameter(option, value)
     # repr() must evaluate back to an equivalent command line.
     rebuilt = eval(repr(mafft_cmd))
     self.assertEqual(str(rebuilt), str(mafft_cmd))
     expected = mafft_exe + (
         " --localpair --weighti 4.2 --retree 5 "
         "--maxiterate 200 --nofft --op 2.04 --ep 0.51"
         " --lop 0.233 --lep 0.2 --reorder --treeout"
         " --nuc Fasta/f002"
     )
     self.assertEqual(str(mafft_cmd), expected)
     # The command string goes through a shell everywhere except on Windows.
     use_shell = sys.platform != "win32"
     process = subprocess.Popen(str(mafft_cmd),
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                universal_newlines=True,
                                shell=use_shell)
     out_text, err_text = process.communicate()
     exit_status = process.returncode
     self.assertEqual(exit_status, 0, "Got error code %i back from:\n%s"
                      % (exit_status, mafft_cmd))
     self.assertTrue(out_text.startswith(">gi|1348912|gb|G26680|G26680"))
     self.assertTrue("$#=0" not in err_text)
     del process
Example #44
0
 def test_Mafft_with_complex_command_line(self):
     """Round-trip with complex command line."""
     cmdline = MafftCommandline(mafft_exe)
     cmdline.set_parameter("input", self.infile1)
     cmdline.set_parameter("--localpair", True)
     cmdline.set_parameter("--weighti", 4.2)
     cmdline.set_parameter("retree", 5)
     cmdline.set_parameter("maxiterate", 200)
     cmdline.set_parameter("--nofft", True)
     cmdline.set_parameter("op", 2.04)
     cmdline.set_parameter("--ep", 0.51)
     cmdline.set_parameter("--lop", 0.233)
     cmdline.set_parameter("lep", 0.2)
     cmdline.set_parameter("--reorder", True)
     cmdline.set_parameter("--treeout", True)
     cmdline.set_parameter("nuc", True)
     # repr() must produce an expression that rebuilds the same command line.
     self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
     self.assertEqual(str(cmdline), mafft_exe
                      + " --localpair --weighti 4.2 --retree 5 "
                      + "--maxiterate 200 --nofft --op 2.04 --ep 0.51"
                      + " --lop 0.233 --lep 0.2 --reorder --treeout"
                      + " --nuc Fasta/f002")
     child = subprocess.Popen(str(cmdline),
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              # Text mode, so the output compares against str.
                              universal_newlines=True,
                              shell=(sys.platform != "win32"))
     # communicate() drains both pipes while waiting; wait() combined with
     # PIPEs can deadlock once the child fills an OS pipe buffer.
     stdoutdata, stderrdata = child.communicate()
     return_code = child.returncode
     self.assertEqual(return_code, 0, "Got error code %i back from:\n%s"
                      % (return_code, cmdline))
     # assertTrue instead of the deprecated assert_ alias.
     self.assertTrue(stdoutdata.startswith(">gi|1348912|gb|G26680|G26680"))
     self.assertTrue("$#=0" not in stderrdata)
     del child
Example #45
0
def main():
    """Look up taxids, download their sequences and align them with MAFFT.

    Steps, in order:

    1. Read names from the module-level ``taxa_file`` (the first
       whitespace-separated token of every non-blank line), look up a taxid
       for each with ``get_taxon_id`` and write the name/taxid pairs to
       ``taxids.txt``.
    2. Fetch the records for every taxid other than ``"not found"`` with
       ``get_sequences``; all records are kept.
    3. Write the records to ``output_unaligned_gb_format.fasta`` and, with a
       ``>Organism_accession_description`` header, to
       ``output_unaligned_custom_format.fasta``.
    4. Align the custom-format file with MAFFT into
       ``output_aligned.fasta``. Any failure in this step is reported and
       the alignment is skipped.
    """
    print("\n\nmatrix_maker.py\n\n")

    print("Getting all taxid...\n")
    print("Writing taxids to file taxids.txt...\n")
    import time
    taxids = []
    # Both files are closed even if a lookup fails part-way through.
    with open("taxids.txt", "w") as taxids_file, open(taxa_file) as name_file:
        for line in name_file:
            fields = line.split()
            if not fields:
                # A blank line has no name to look up; skip it rather than
                # fail with an IndexError.
                continue
            name = fields[0]
            taxid = get_taxon_id(name)
            name_taxid_text = name + "\t" + taxid
            print(name_taxid_text)
            taxids_file.write(name_taxid_text + "\n")
            taxids.append(taxid)
            # dont overload genbank
            time.sleep(0.1)

    print("\nDownloading sequences for each taxid...\n")
    from Bio import Entrez
    from Bio import SeqIO
    final_records = []
    for taxid in taxids:
        if taxid == "not found":
            continue
        # Every record is kept (not only the longest one per taxon).
        final_records.extend(get_sequences(taxid))
        # dont overload genbank
        time.sleep(0.2)

    print("\nGenerating unaligned FASTA file with GenBank formatted description...\n")
    SeqIO.write(final_records, "output_unaligned_gb_format.fasta", "fasta")

    print("Generating unaligned FASTA file with custom formatted description...\n")
    with open("output_unaligned_custom_format.fasta", "w") as unaligned_file:
        for record in final_records:
            organism = record.annotations["organism"]
            # remove the organism name from the description
            description = record.description.replace(organism + " ", "")
            # custom format for Andrew: >Organism name_accession_description
            description = organism + "_" + record.id + "_" + description
            description = description.replace(" ", "_")
            unaligned_file.write(">" + description + "\n")
            unaligned_file.write(str(record.seq) + "\n")

    print("Making alignment with MAFFT...")
    try:
        from Bio.Align.Applications import MafftCommandline
        mafft_cline = MafftCommandline(input="output_unaligned_custom_format.fasta")
        mafft_cline.set_parameter("--auto", True)
        mafft_cline.set_parameter("--adjustdirection", True)
        print(str(mafft_cline))
        stdout, stderr = mafft_cline()
        print("Writing alignment to FASTA file...\n")
        with open("output_aligned.fasta", "w") as handle:
            handle.write(stdout)
    except Exception as err:
        # Still best-effort, but Ctrl-C / SystemExit are no longer swallowed
        # and the underlying cause is shown.
        print("Problem finding MAFFT, alignment skipped.")
        print("Reason: %s" % err)

    print("Done!\n")
Example #46
0
def _open_csv(path):
    """Open ``path`` for reading in the mode the ``csv`` module expects.

    Python 3's ``csv.reader`` needs a text-mode file opened with
    ``newline=""``; Python 2's needs a binary-mode file.
    """
    if sys.version_info[0] >= 3:
        return open(path, "r", newline="")
    return open(path, "rb")


def main():
    """Download sequences per gene and taxon, align them and summarise.

    Command line options: ``--email`` (required), ``--species`` and
    ``--genes`` (required, must be existing files), ``--max_seq_length`` and
    ``--taxids`` (optional). A missing or invalid required option prints a
    message and exits with status 0.

    Steps, in order:

    1. Read the genes file: ``name,include,alias,...`` rows create a
       ``Gene``; ``name,exclude,term,...`` rows add exclusions to genes
       already created with that name.
    2. Load known taxids from ``--taxids`` if given, then walk the species
       file, asking NCBI for taxids that are not known yet, attaching
       synonyms, and writing every binomial/taxid pair to ``taxids.csv``.
    3. For each gene, fetch sequences for every taxon that has a taxid,
       write the longest sequence per taxon to ``<gene>.fasta`` and align it
       with MAFFT into ``aligned_<gene>.fasta``. A failing alignment is
       reported and skipped.
    4. Write ``result.csv`` with one row per taxon and, per gene, the id of
       its longest sequence.
    """
    # parse the command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--email", "-e", help="Email address for NCBI database searches.")
    parser.add_argument("--genes", "-g", help="Text file that contains a list of all gene names.")
    parser.add_argument(
        "--max_seq_length",
        "-m",
        help="Optional. Sets the maximum sequence length to include. Use this to exclude genomes.",
    )
    parser.add_argument(
        "--species", "-s", help="Text file that contains a list of all species binomials and their synonyms."
    )
    parser.add_argument(
        "--taxids",
        "-t",
        help="Optional. Text file that contains a list of all taxids. Use this to avoid repeating the NCBI taxid lookups.",
    )
    args = parser.parse_args()

    print("\n\nmatrix_maker.py\n\n")

    if not args.email:
        print(
            "NCBI requires an email address for database searches. Please use the --email flag to specify an email address.\n"
        )
        sys.exit(0)
    email = args.email

    if not args.species or not os.path.isfile(args.species):
        print("Please specify a valid list of taxa to search for.\n")
        sys.exit(0)

    # -1 is passed on when no maximum length was requested.
    max_seq_length = int(args.max_seq_length) if args.max_seq_length else -1

    if not args.genes or not os.path.isfile(args.genes):
        print("Please specify a valid list of genes to search for.\n")
        sys.exit(0)

    # read in gene names....
    # format of file:
    # gene_name,include,rbcL,RBCL
    # gene_name,exclude,RRRBCL
    genes = []
    with _open_csv(args.genes) as csvfile:
        for row in csv.reader(csvfile, delimiter=","):
            if len(row) < 2:
                # Blank or truncated row: there is no action column to read.
                continue
            terms = [field for field in row[2:] if field != ""]
            if row[1] == "include":
                gene = Gene(row[0])
                gene.gene_names.extend(terms)
                genes.append(gene)
            elif row[1] == "exclude":
                # Exclusions only apply to genes already declared above.
                for gene in genes:
                    if gene.name == row[0]:
                        gene.exclusions.extend(terms)

    # list of all taxon objects
    taxa = []

    # check for taxid
    print("Checking for taxids csv file...")
    if args.taxids and os.path.isfile(args.taxids):
        with _open_csv(args.taxids) as csvfile:
            print("Found taxids csv file, reading taxids...\n")
            for row in csv.reader(csvfile, delimiter=","):
                if len(row) >= 2:
                    taxa.append(Taxon(row[0], row[1]))
    else:
        print("No taxids csv file found.\n")

    # open species list file, get synonyms and any missing taxids
    with _open_csv(args.species) as csvfile, open("taxids.csv", "w") as taxids_file:
        print("Checking list of species, getting missing taxids from NCBI...")
        # Count the lines up front for the progress display; the counting
        # handle is closed again straight away.
        with open(args.species) as line_counter:
            num_lines = sum(1 for _ in line_counter)
        for i, row in enumerate(csv.reader(csvfile, delimiter=","), 1):
            # update status
            percent = str(round(100 * i / float(num_lines), 2))
            sys.stdout.write("\r" + "Completed: " + str(i) + "/" + str(num_lines) + " (" + percent + "%)")
            sys.stdout.flush()
            if not row:
                # Blank line in the species list: nothing to look up.
                continue
            # check to see if we already have a taxid for this species
            taxon = next((known for known in taxa if known.binomial == row[0]), None)
            if taxon is None:
                # get the taxid from NCBI
                taxon = Taxon(row[0])
                taxon.get_taxid(email)
                # dont overload genbank
                time.sleep(0.1)
                taxa.append(taxon)
            taxids_file.write(taxon.binomial + "," + taxon.taxid + "\n")
            # add synonyms
            taxon.synonyms.extend(row[1:])
    print("\nWriting all taxids to file taxids.csv...")

    print("\nDownloading sequences from NCBI...")
    for gene in genes:
        print("\nSearching for gene: " + gene.name)
        total = len(taxa)
        for i, taxon in enumerate(taxa, 1):
            # update status; the count shown uses the same total as the
            # percentage (the number of taxa, not the species-file lines)
            percent = str(round(100 * i / float(total), 2))
            sys.stdout.write("\r" + "Completed: " + str(i) + "/" + str(total) + " (" + percent + "%)")
            sys.stdout.flush()
            if taxon.taxid != "not found":
                taxon.get_sequences(email, gene)
                # dont overload genbank
                time.sleep(0.2)

        print("\nGenerating unaligned FASTA file...")
        with open(gene.name + ".fasta", "w") as unaligned_file:
            for taxon in taxa:
                record = taxon.get_longest_seq(gene.name, max_seq_length)
                # Identity test: never relies on the record's own ==/!=.
                if record is not None:
                    # output format: >binomial_accession_description
                    description = taxon.binomial + "_" + record.id + "_" + record.description
                    description = description.replace(" ", "_")
                    unaligned_file.write(">" + description + "\n")
                    unaligned_file.write(str(record.seq) + "\n\n")

        print("Making alignment with MAFFT...")
        try:
            from Bio.Align.Applications import MafftCommandline

            mafft_cline = MafftCommandline(input=gene.name + ".fasta")
            mafft_cline.set_parameter("--auto", True)
            mafft_cline.set_parameter("--adjustdirection", True)
            print(str(mafft_cline))
            stdout, stderr = mafft_cline()
            print("Writing alignment to FASTA file...")
            with open("aligned_" + gene.name + ".fasta", "w") as handle:
                handle.write(stdout)
        except Exception as err:
            # Still best-effort, but Ctrl-C / SystemExit are no longer
            # swallowed and the underlying cause is shown.
            print("Problem finding MAFFT, alignment skipped.")
            print("Reason: %s" % err)

    print("\nGenerating summary results spreadsheet...\n")
    with open("result.csv", "w") as summary:
        header = "taxon," + "".join(gene.name + "," for gene in genes)
        summary.write(header + "\n")
        for taxon in taxa:
            accessions = taxon.binomial + ","
            for gene in genes:
                # each column will be the longest sequences accession
                record = taxon.get_longest_seq(gene.name, max_seq_length)
                if record is not None:
                    accessions += record.id + ","
                else:
                    accessions += ","
            summary.write(accessions + "\n")
    print("Done!\n")
Example #47
0
 def test_Mafft_with_complex_command_line(self):
     """Round-trip with complex command line."""
     cmdline = MafftCommandline(mafft_exe)
     cmdline.set_parameter("input", self.infile1)
     cmdline.set_parameter("--localpair", True)
     cmdline.set_parameter("--weighti", 4.2)
     cmdline.set_parameter("retree", 5)
     cmdline.set_parameter("maxiterate", 200)
     cmdline.set_parameter("--nofft", True)
     cmdline.set_parameter("op", 2.04)
     cmdline.set_parameter("--ep", 0.51)
     cmdline.set_parameter("--lop", 0.233)
     cmdline.set_parameter("lep", 0.2)
     cmdline.set_parameter("--reorder", True)
     cmdline.set_parameter("--treeout", True)
     cmdline.set_parameter("nuc", True)
     # repr() must produce an expression that rebuilds the same command line.
     self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
     result, stdout, stderr = Application.generic_run(cmdline)
     self.assertEqual(result.return_code, 0)
     # assertTrue instead of assert_: the alias is deprecated and was removed
     # from unittest in Python 3.12.
     self.assertTrue(stdout.read().startswith(">gi|1348912|gb|G26680|G26680"))
     self.assertTrue("$#=0" not in stderr.read())
     # The command line that was actually run must match the options set above.
     self.assertEqual(str(result._cl), mafft_exe
                      + " --localpair --weighti 4.2 --retree 5 "
                      + "--maxiterate 200 --nofft --op 2.04 --ep 0.51"
                      + " --lop 0.233 --lep 0.2 --reorder --treeout"
                      + " --nuc Fasta/f002")
Example #48
0
        atpA_records.append(SeqIO.read(handle, 'fasta'))
        handle.close()
        sleep(0.02)
SeqIO.write(atpA_records, "atpA_unaligned.fasta", "fasta")

# Fetch every non-blank rbcL accession from the NCBI nucleotide database as
# FASTA text and collect the parsed records.
for accession in rbcL_accessions:
    if accession.strip() != '':
        handle = Entrez.efetch(db='nucleotide', rettype='fasta', retmode='text', id=accession)
        # SeqIO.read expects exactly one record in the handle.
        rbcL_records.append(SeqIO.read(handle, 'fasta'))
        handle.close()
        # Short pause between requests (presumably to go easy on NCBI).
        sleep(0.02)
SeqIO.write(rbcL_records, "rbcL_unaligned.fasta", "fasta")


# Align the atpA sequences; MAFFT's standard output is the alignment.
print("Aligning atpA with MAFFT...")
mafft_cline = MafftCommandline(input="atpA_unaligned.fasta")
mafft_cline.set_parameter("--auto", True)
mafft_cline.set_parameter("--adjustdirection", True)
# Echo the command line that is about to be run.
print(str(mafft_cline))
stdout, stderr = mafft_cline()

print("Writing atpA alignment to FASTA file...")
with open("atpA_aligned.fasta", "w") as handle:
    handle.write(stdout)

# Same alignment settings for rbcL; note that mafft_cline, stdout and stderr
# are rebound here, replacing the atpA values.
print("Aligning rbcL with MAFFT...")
mafft_cline = MafftCommandline(input="rbcL_unaligned.fasta")
mafft_cline.set_parameter("--auto", True)
mafft_cline.set_parameter("--adjustdirection", True)
print(str(mafft_cline))
stdout, stderr = mafft_cline()