Esempio n. 1
0
 def align_cluster(self, cluster_file):
     """
     Worker function for align_clusters.

     Aligns the unaligned FASTA cluster in ``cluster_file`` with MAFFT
     (``--auto --adjustdirection``) and writes MAFFT's stdout to a file
     below ``alignments``.

     The output path is ``alignments`` followed by everything from the
     first "/" of ``cluster_file`` onwards, or ``alignments/<cluster_file>``
     when the name contains no "/".

     Returns the alignment file path. The path is returned even when the
     alignment failed; in that case an error is printed and the file may
     be missing or incomplete.
     """
     mafft_cline = MafftCommandline(input=cluster_file)
     mafft_cline.set_parameter("--auto", True)
     mafft_cline.set_parameter("--adjustdirection", True)
     color = Color()
     # Echo the command line before running it.
     print(color.red + str(mafft_cline) + color.done)
     sys.stdout.flush()
     if "/" in cluster_file:
         alignment_file = "alignments" + cluster_file[cluster_file.index("/"):]
     else:
         alignment_file = "alignments/" + cluster_file
     try:
         stdout, stderr = mafft_cline()
         with open(alignment_file, "w") as handle:
             handle.write(stdout)
     except Exception as err:
         # Catch Exception instead of using a bare except so that
         # KeyboardInterrupt and SystemExit still stop the run, and show the
         # underlying cause because not every failure is an installation
         # problem (e.g. the output file could not be written).
         print(
             color.red +
             "Error: alignment file not generated. Please check your MAFFT installation."
             + color.done)
         print(color.red + str(err) + color.done)
     return alignment_file
Esempio n. 2
0
def executeMafft(mafft_exe, directory='', gap_penalty=10.0):
    """Align every ``.fasta`` file in ``<directory>sequences/`` with MAFFT.

    For each input file ``<name>.fasta`` the alignment printed by MAFFT is
    written to ``<directory>aligned_contigs/<stem>_aligned.fasta``, where
    ``<stem>`` is the file name up to its first ".". The output directory
    is created when it does not exist.

    Parameters:
        mafft_exe: path or name of the MAFFT executable. An empty string
            writes a message to stderr and exits with status -1.
        directory: base directory holding ``sequences/``; a trailing "/"
            is added when missing. Defaults to the current directory.
        gap_penalty: value passed to MAFFT's ``--op`` option.
    """
    import os
    import sys

    if len(directory) > 0 and directory[-1] != '/':
        directory += '/'

    if len(mafft_exe) == 0:
        sys.stderr.write('Install mafft before execution.')
        sys.exit(-1)

    # Imported only after the arguments are validated, so that a missing
    # executable is reported as such rather than as an import problem.
    from Bio.Align.Applications import MafftCommandline

    after = directory + 'aligned_contigs/'
    if not os.path.exists(after):
        os.mkdir(after)

    seq_dir = directory + 'sequences/'
    for seqfile in os.listdir(seq_dir):
        if not seqfile.endswith('.fasta'):
            continue

        transcript = seqfile[:seqfile.find('.')]
        mafft_cline = MafftCommandline(mafft_exe, input=seq_dir + seqfile)
        mafft_cline.set_parameter('--op', gap_penalty)
        # Run MAFFT first and open the output file afterwards, so a failed
        # run does not leave an empty, unclosed output file behind.
        stdout, stderr = mafft_cline()
        with open(after + transcript + '_aligned.fasta', 'w') as writefile:
            writefile.write(stdout)
Esempio n. 3
0
 def align_cluster(self, cluster_file):
     """
     Worker function for align_clusters.
     Takes the path of a FASTA file holding an unaligned sequence cluster,
     aligns it with MAFFT, writes MAFFT's stdout below "alignments" and
     returns the path of the alignment file.
     """
     command = MafftCommandline(input=cluster_file)
     for flag in ("--auto", "--adjustdirection"):
         command.set_parameter(flag, True)
     highlight = Color()
     print(highlight.red + str(command) + highlight.done)
     sys.stdout.flush()
     # Everything from the first "/" onwards is kept below "alignments".
     slash_at = cluster_file.find("/")
     if slash_at == -1:
         alignment_file = "alignments/" + cluster_file
     else:
         alignment_file = "alignments" + cluster_file[slash_at:]
     aligned_text, _ = command()
     with open(alignment_file, "w") as out:
         out.write(aligned_text)
     return alignment_file
Esempio n. 4
0
 def test_Mafft_with_options(self):
     """Round-trip an input file plus options through MAFFT; the result arrives on stdout."""
     mafft = MafftCommandline(mafft_exe)
     options = (
         ("input", self.infile1),
         ("maxiterate", 100),
         ("--localpair", True),
     )
     for option, value in options:
         mafft.set_parameter(option, value)
     # repr() must evaluate back to an equivalent command line.
     rebuilt = eval(repr(mafft))
     self.assertEqual(str(rebuilt), str(mafft))
     out_text, err_text = mafft()
     self.assertTrue(out_text.startswith(">gi|1348912|gb|G26680|G26680"))
     self.assertNotIn("$#=0", err_text)
Esempio n. 5
0
 def test_Mafft_with_options(self):
     """Run MAFFT with an input file and two options and inspect stdout/stderr."""
     app = MafftCommandline(mafft_exe)
     app.set_parameter("input", self.infile1)
     app.set_parameter("maxiterate", 100)
     app.set_parameter("--localpair", True)
     # The repr() of the command line has to evaluate to an equal command.
     from_repr = str(eval(repr(app)))
     as_text = str(app)
     self.assertEqual(from_repr, as_text)
     captured = app()
     alignment, messages = captured
     expected_start = ">gi|1348912|gb|G26680|G26680"
     self.assertTrue(alignment.startswith(expected_start))
     self.assertNotIn("$#=0", messages)
Esempio n. 6
0
 def test_Mafft_with_options(self):
     """Simple round-trip through app with infile and options.

     Runs MAFFT on ``self.infile1`` with ``--localpair --maxiterate 100``
     and checks the return code, the start of the alignment on stdout,
     the stderr content and the command line that was executed.
     """
     cmdline = MafftCommandline(mafft_exe)
     cmdline.set_parameter("input", self.infile1)
     cmdline.set_parameter("maxiterate", 100)
     cmdline.set_parameter("--localpair", True)
     # repr() of the command line must evaluate back to an equivalent object.
     self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
     # NOTE(review): Application.generic_run looks like an API of older
     # Biopython releases - confirm the targeted version still provides it.
     result, stdout, stderr = Application.generic_run(cmdline)
     self.assertEqual(result.return_code, 0)
     # assertTrue replaces the deprecated assert_ alias, which recent
     # unittest versions no longer provide.
     self.assertTrue(stdout.read().startswith(">gi|1348912|gb|G26680|G26680"))
     self.assertTrue("$#=0" not in stderr.read())
     self.assertEqual(str(result._cl),
                      mafft_exe + " --localpair --maxiterate 100 Fasta/f002")
Esempio n. 7
0
 def test_Mafft_with_options(self):
     """Simple round-trip through app with infile and options.

     Runs the MAFFT command line in a child process and checks its exit
     status, the start of the alignment on stdout and the stderr content.
     """
     cmdline = MafftCommandline(mafft_exe)
     cmdline.set_parameter("input", self.infile1)
     cmdline.set_parameter("maxiterate", 100)
     cmdline.set_parameter("--localpair", True)
     # repr() of the command line must evaluate back to an equivalent object.
     self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
     # universal_newlines=True makes the pipes return text, so the string
     # comparisons below also work where pipes would otherwise yield bytes.
     child = subprocess.Popen(str(cmdline),
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              universal_newlines=True,
                              shell=(sys.platform != "win32"))
     # communicate() drains both pipes while waiting; calling wait() first
     # can deadlock once the child fills a pipe buffer.
     stdoutdata, stderrdata = child.communicate()
     return_code = child.returncode
     self.assertEqual(return_code, 0, "Got error code %i back from:\n%s"
                      % (return_code, cmdline))
     # assertTrue replaces the deprecated assert_ alias.
     self.assertTrue(stdoutdata.startswith(">gi|1348912|gb|G26680|G26680"))
     self.assertTrue("$#=0" not in stderrdata)
     del child
Esempio n. 8
0
 def test_Mafft_with_options(self):
     """Run MAFFT on an input file with options in a child process.

     The alignment is expected on stdout.
     """
     mafft = MafftCommandline(mafft_exe)
     for option, value in (("input", self.infile1),
                           ("maxiterate", 100),
                           ("--localpair", True)):
         mafft.set_parameter(option, value)
     # repr() must evaluate back to an equivalent command line.
     self.assertEqual(str(eval(repr(mafft))), str(mafft))
     # A shell is used everywhere except on Windows.
     use_shell = sys.platform != "win32"
     process = subprocess.Popen(
         str(mafft),
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
         universal_newlines=True,
         shell=use_shell,
     )
     out_text, err_text = process.communicate()
     exit_status = process.returncode
     failure_note = "Got error code %i back from:\n%s" % (exit_status, mafft)
     self.assertEqual(exit_status, 0, failure_note)
     self.assertTrue(out_text.startswith(">gi|1348912|gb|G26680|G26680"))
     self.assertTrue("$#=0" not in err_text)
     del process
Esempio n. 9
0
        handle.close()
        sleep(0.02)
# Write the collected atpA records to an unaligned FASTA file.
SeqIO.write(atpA_records, "atpA_unaligned.fasta", "fasta")

# Fetch every non-blank rbcL accession from the "nucleotide" database as
# FASTA text and collect the parsed records.
for accession in rbcL_accessions:
    if accession.strip() != '':
        handle = Entrez.efetch(db='nucleotide', rettype='fasta', retmode='text', id=accession)
        rbcL_records.append(SeqIO.read(handle, 'fasta'))
        handle.close()
        # Short pause between consecutive requests.
        sleep(0.02)
SeqIO.write(rbcL_records, "rbcL_unaligned.fasta", "fasta")


print("Aligning atpA with MAFFT...")
# Build the MAFFT command line with --auto and --adjustdirection, print it,
# then run it; the call returns MAFFT's stdout and stderr.
mafft_cline = MafftCommandline(input="atpA_unaligned.fasta")
mafft_cline.set_parameter("--auto", True)
mafft_cline.set_parameter("--adjustdirection", True)
print(str(mafft_cline))
stdout, stderr = mafft_cline()

print("Writing atpA alignment to FASTA file...")
# MAFFT's stdout is saved as the alignment file.
with open("atpA_aligned.fasta", "w") as handle:
    handle.write(stdout)

print("Aligning rbcL with MAFFT...")
# Same MAFFT settings as for atpA, applied to the rbcL file.
mafft_cline = MafftCommandline(input="rbcL_unaligned.fasta")
mafft_cline.set_parameter("--auto", True)
mafft_cline.set_parameter("--adjustdirection", True)
print(str(mafft_cline))
stdout, stderr = mafft_cline()
Esempio n. 10
0
 def test_Mafft_with_complex_command_line(self):
     """Round-trip with complex command line.

     Sets a mix of switches and valued options, runs MAFFT and checks the
     return code, the start of the alignment on stdout, the stderr content
     and the exact command line that was executed.
     """
     cmdline = MafftCommandline(mafft_exe)
     cmdline.set_parameter("input", self.infile1)
     cmdline.set_parameter("--localpair", True)
     cmdline.set_parameter("--weighti", 4.2)
     cmdline.set_parameter("retree", 5)
     cmdline.set_parameter("maxiterate", 200)
     cmdline.set_parameter("--nofft", True)
     cmdline.set_parameter("op", 2.04)
     cmdline.set_parameter("--ep", 0.51)
     cmdline.set_parameter("--lop", 0.233)
     cmdline.set_parameter("lep", 0.2)
     cmdline.set_parameter("--reorder", True)
     cmdline.set_parameter("--treeout", True)
     cmdline.set_parameter("nuc", True)
     # repr() of the command line must evaluate back to an equivalent object.
     self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
     # NOTE(review): Application.generic_run looks like an API of older
     # Biopython releases - confirm the targeted version still provides it.
     result, stdout, stderr = Application.generic_run(cmdline)
     self.assertEqual(result.return_code, 0)
     # assertTrue replaces the deprecated assert_ alias, which recent
     # unittest versions no longer provide.
     self.assertTrue(stdout.read().startswith(">gi|1348912|gb|G26680|G26680"))
     self.assertTrue("$#=0" not in stderr.read())
     self.assertEqual(str(result._cl), mafft_exe
                      + " --localpair --weighti 4.2 --retree 5 "
                      + "--maxiterate 200 --nofft --op 2.04 --ep 0.51"
                      + " --lop 0.233 --lep 0.2 --reorder --treeout"
                      + " --nuc Fasta/f002")
Esempio n. 11
0
def main():
    """Run the matrix_maker pipeline from the command line.

    Steps:

    1. Parse ``--email``, ``--genes``, ``--species`` and the optional
       ``--max_seq_length`` / ``--taxids`` arguments.
    2. Read the gene definitions (``include`` / ``exclude`` rows).
    3. Build the list of taxa: taxids found in the optional taxids file are
       reused, the others are looked up, and all of them are written to
       ``taxids.csv``.
    4. For each gene, have every taxon fetch its sequences, write the
       longest one per taxon to ``<gene>.fasta`` and align that file with
       MAFFT into ``aligned_<gene>.fasta``.
    5. Write ``result.csv`` with, per taxon, the record id of the longest
       sequence for each gene.

    Exits with status 1, after printing a message, when the email address,
    the species list or the gene list is missing or not a file.
    """
    # parse the command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--email", "-e", help="Email address for NCBI database searches.")
    parser.add_argument("--genes", "-g", help="Text file that contains a list of all gene names.")
    parser.add_argument(
        "--max_seq_length",
        "-m",
        help="Optional. Sets the maximum sequence length to include. Use this to exclude genomes.",
    )
    parser.add_argument(
        "--species", "-s", help="Text file that contains a list of all species binomials and their synonyms."
    )
    parser.add_argument(
        "--taxids",
        "-t",
        help="Optional. Text file that contains a list of all taxids. Use this to avoid repeating the NCBI taxid lookups.",
    )
    args = parser.parse_args()

    print("\n\nmatrix_maker.py\n\n")

    # Usage errors exit with a non-zero status so that calling scripts can
    # tell a failed run from a successful one.
    if not args.email:
        print(
            "NCBI requires an email address for database searches. Please use the --email flag to specify an email address.\n"
        )
        sys.exit(1)
    email = args.email

    if not args.species or not os.path.isfile(args.species):
        print("Please specify a valid list of taxa to search for.\n")
        sys.exit(1)

    # -1 is passed on to get_longest_seq when no maximum was given.
    if args.max_seq_length:
        max_seq_length = int(args.max_seq_length)
    else:
        max_seq_length = -1

    if not args.genes or not os.path.isfile(args.genes):
        print("Please specify a valid list of genes to search for.\n")
        sys.exit(1)

    # read in gene names....
    # format of file:
    # gene_name,include,rbcL,RBCL
    # gene_name,exclude,RRRBCL
    # NOTE(review): the CSV files are opened in "rb" mode, which the csv
    # module expects on Python 2; Python 3 needs text mode (newline="").
    # Confirm the target interpreter before changing it.
    genes = []
    with open(args.genes, "rb") as csvfile:
        genereader = csv.reader(csvfile, delimiter=",")
        for row in genereader:
            if len(row) < 2:
                # blank or incomplete line: nothing to include or exclude
                continue
            terms = [term for term in row[2:] if term != ""]
            if row[1] == "include":
                gene = Gene(row[0])
                gene.gene_names.extend(terms)
                genes.append(gene)
            if row[1] == "exclude":
                for gene in genes:
                    if gene.name == row[0]:
                        gene.exclusions.extend(terms)

    # list of all taxon objects
    taxa = []

    # check for taxid
    print("Checking for taxids csv file...")
    if args.taxids and os.path.isfile(args.taxids):
        with open(args.taxids, "rb") as csvfile:
            print("Found taxids csv file, reading taxids...\n")
            taxidsreader = csv.reader(csvfile, delimiter=",")
            for row in taxidsreader:
                taxa.append(Taxon(row[0], row[1]))
    else:
        print("No taxids csv file found.\n")

    # count the species lines for the progress display
    with open(args.species) as species_lines:
        num_lines = sum(1 for line in species_lines)

    # open species list file, get synonyms and any missing taxids
    with open(args.species, "rb") as csvfile, open("taxids.csv", "w") as taxids_file:
        print("Checking list of species, getting missing taxids from NCBI...")
        namesreader = csv.reader(csvfile, delimiter=",")
        for i, row in enumerate(namesreader, 1):
            # update status
            percent = str(round(100 * i / float(num_lines), 2))
            sys.stdout.write("\r" + "Completed: " + str(i) + "/" + str(num_lines) + " (" + percent + "%)")
            sys.stdout.flush()
            if not row:
                # blank line: no species name to process
                continue
            # check to see if we already have a taxid for this species
            taxon = None
            for known in taxa:
                if known.binomial == row[0]:
                    taxon = known
                    break
            if taxon is None:
                # get the taxid from NCBI
                taxon = Taxon(row[0])
                taxon.get_taxid(email)
                # dont overload genbank
                time.sleep(0.1)
                taxa.append(taxon)
            taxids_file.write(taxon.binomial + "," + taxon.taxid + "\n")
            # add synonyms
            taxon.synonyms.extend(row[1:])
    print("\nWriting all taxids to file taxids.csv...")

    print("\nDownloading sequences from NCBI...")
    num_taxa = len(taxa)
    for gene in genes:
        print("\nSearching for gene: " + gene.name)
        for i, taxon in enumerate(taxa, 1):
            # update status; the total is the number of taxa, which can
            # differ from the number of lines in the species file
            percent = str(round(100 * i / float(num_taxa), 2))
            sys.stdout.write("\r" + "Completed: " + str(i) + "/" + str(num_taxa) + " (" + percent + "%)")
            sys.stdout.flush()
            if taxon.taxid != "not found":
                taxon.get_sequences(email, gene)
                # dont overload genbank
                time.sleep(0.2)

        print("\nGenerating unaligned FASTA file...")
        with open(gene.name + ".fasta", "w") as unaligned_file:
            for taxon in taxa:
                record = taxon.get_longest_seq(gene.name, max_seq_length)
                # identity test: comparing a sequence record with != is
                # not supported by recent Biopython releases
                if record is not None:
                    # output format: >binomial_accession_description
                    description = taxon.binomial + "_" + record.id + "_" + record.description
                    description = description.replace(" ", "_")
                    unaligned_file.write(">" + description + "\n")
                    unaligned_file.write(str(record.seq) + "\n\n")

        print("Making alignment with MAFFT...")
        try:
            from Bio.Align.Applications import MafftCommandline

            mafft_cline = MafftCommandline(input=gene.name + ".fasta")
            mafft_cline.set_parameter("--auto", True)
            mafft_cline.set_parameter("--adjustdirection", True)
            print(str(mafft_cline))
            stdout, stderr = mafft_cline()
            print("Writing alignment to FASTA file...")
            with open("aligned_" + gene.name + ".fasta", "w") as handle:
                handle.write(stdout)
        except Exception as err:
            # The alignment stays optional, but Ctrl-C is no longer
            # swallowed and the actual cause is shown.
            print("Problem finding MAFFT, alignment skipped.")
            print("Details: " + str(err))

    print("\nGenerating summary results spreadsheet...\n")
    with open("result.csv", "w") as summary:
        summary.write("taxon," + "".join(gene.name + "," for gene in genes) + "\n")
        for taxon in taxa:
            cells = [taxon.binomial]
            for gene in genes:
                # each column will be the longest sequences accession
                record = taxon.get_longest_seq(gene.name, max_seq_length)
                cells.append(record.id if record is not None else "")
            # every cell, including the last, is followed by a comma
            summary.write(",".join(cells) + ",\n")
    print("Done!\n")
Esempio n. 12
0
def main():
    """Download sequences for a list of taxa and align them with MAFFT.

    Steps:

    1. Read taxon names (first whitespace-separated field of each line)
       from the file named by the module-level ``taxa_file``, look up a
       taxid for each with ``get_taxon_id`` and write "name<TAB>taxid"
       lines to ``taxids.txt``.
    2. Fetch the records of every taxid other than "not found" with
       ``get_sequences``; all records are kept.
    3. Write the records to ``output_unaligned_gb_format.fasta`` and, with
       ">Organism_accession_description" headers, to
       ``output_unaligned_custom_format.fasta``.
    4. Align the custom-format file with MAFFT into
       ``output_aligned.fasta``; a failure here is reported and skipped.
    """
    import time
    from Bio import SeqIO

    print("\n\nmatrix_maker.py\n\n")

    print("Getting all taxid...\n")
    print("Writing taxids to file taxids.txt...\n")
    taxids = []
    with open(taxa_file) as name_file, open("taxids.txt", "w") as taxids_file:
        for line in name_file:
            fields = line.split()
            if not fields:
                # blank line: no taxon name to look up
                continue
            name = fields[0]
            taxid = get_taxon_id(name)
            name_taxid_text = name + "\t" + taxid
            print(name_taxid_text)
            taxids_file.write(name_taxid_text + "\n")
            taxids.append(taxid)
            # dont overload genbank
            time.sleep(0.1)

    print("\nDownloading sequences for each taxid...\n")
    final_records = []
    for taxid in taxids:
        if taxid != "not found":
            # keep all records (no longest-per-taxon selection)
            final_records.extend(get_sequences(taxid))
            # dont overload genbank
            time.sleep(0.2)

    print("\nGenerating unaligned FASTA file with GenBank formatted description...\n")
    SeqIO.write(final_records, "output_unaligned_gb_format.fasta", "fasta")

    print("Generating unaligned FASTA file with custom formatted description...\n")
    with open("output_unaligned_custom_format.fasta", "w") as unaligned_file:
        for record in final_records:
            organism = record.annotations["organism"]
            # remove the organism name from the description
            description = record.description.replace(organism + " ", "")
            # custom format for Andrew: >Organism name_accession_description
            description = organism + "_" + record.id + "_" + description
            description = description.replace(" ", "_")
            unaligned_file.write(">" + description + "\n")
            unaligned_file.write(str(record.seq) + "\n")

    print("Making alignment with MAFFT...")
    try:
        from Bio.Align.Applications import MafftCommandline
        mafft_cline = MafftCommandline(input="output_unaligned_custom_format.fasta")
        mafft_cline.set_parameter("--auto", True)
        mafft_cline.set_parameter("--adjustdirection", True)
        print(str(mafft_cline))
        stdout, stderr = mafft_cline()
        print("Writing alignment to FASTA file...\n")
        with open("output_aligned.fasta", "w") as handle:
            handle.write(stdout)
    except Exception as err:
        # The alignment stays optional, but Ctrl-C is no longer swallowed
        # and the actual cause is shown.
        print("Problem finding MAFFT, alignment skipped.")
        print("Details: " + str(err))

    print("Done!\n")
Esempio n. 13
0
 def test_Mafft_with_complex_command_line(self):
     """Build a long MAFFT command line, check its text form and run it in a child process."""
     mafft = MafftCommandline(mafft_exe)
     settings = [
         ("input", self.infile1),
         ("--localpair", True),
         ("--weighti", 4.2),
         ("retree", 5),
         ("maxiterate", 200),
         ("--nofft", True),
         ("op", 2.04),
         ("--ep", 0.51),
         ("--lop", 0.233),
         ("lep", 0.2),
         ("--reorder", True),
         ("--treeout", True),
         ("nuc", True),
     ]
     for option, value in settings:
         mafft.set_parameter(option, value)
     # repr() must evaluate back to an equivalent command line.
     self.assertEqual(str(eval(repr(mafft))), str(mafft))
     expected_text = mafft_exe + (
         " --localpair --weighti 4.2 --retree 5 "
         "--maxiterate 200 --nofft --op 2.04 --ep 0.51"
         " --lop 0.233 --lep 0.2 --reorder --treeout"
         " --nuc Fasta/f002"
     )
     self.assertEqual(str(mafft), expected_text)
     # A shell is used everywhere except on Windows.
     use_shell = sys.platform != "win32"
     process = subprocess.Popen(
         str(mafft),
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
         universal_newlines=True,
         shell=use_shell,
     )
     out_text, err_text = process.communicate()
     exit_status = process.returncode
     failure_note = "Got error code %i back from:\n%s" % (exit_status, mafft)
     self.assertEqual(exit_status, 0, failure_note)
     self.assertTrue(out_text.startswith(">gi|1348912|gb|G26680|G26680"))
     self.assertTrue("$#=0" not in err_text)
     del process
Esempio n. 14
0
def main():
    """Run the matrix_maker pipeline from the command line.

    Steps:

    1. Parse ``--email``, ``--genes``, ``--species`` and the optional
       ``--max_seq_length`` / ``--taxids`` arguments.
    2. Read the gene definitions (``include`` / ``exclude`` rows).
    3. Build the list of taxa: taxids found in the optional taxids file are
       reused, the others are looked up, and all of them are written to
       ``taxids.csv``.
    4. For each gene, have every taxon fetch its sequences, write the
       longest one per taxon to ``<gene>.fasta`` and align that file with
       MAFFT into ``aligned_<gene>.fasta``.
    5. Write ``result.csv`` with, per taxon, the record id of the longest
       sequence for each gene.

    Exits with status 1, after printing a message, when the email address,
    the species list or the gene list is missing or not a file.
    """
    # parse the command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--email",
                        "-e",
                        help="Email address for NCBI database searches.")
    parser.add_argument(
        "--genes",
        "-g",
        help=
        "Text file that defines the gene regions of interest using both include and exclude terms."
    )
    parser.add_argument(
        "--max_seq_length",
        "-m",
        help=
        "Optional. Sets the maximum sequence length to include. Use this to exclude genomes."
    )
    parser.add_argument(
        "--species",
        "-s",
        help=
        "Text file that contains a list of all species binomials and their synonyms."
    )
    parser.add_argument(
        "--taxids",
        "-t",
        help=
        "Optional. Text file that contains a list of all taxids. Use this to avoid repeating the NCBI taxid lookups."
    )
    args = parser.parse_args()

    print("\n\nmatrix_maker.py\n\n")

    # Usage errors exit with a non-zero status so that calling scripts can
    # tell a failed run from a successful one.
    if not args.email:
        print(
            "NCBI requires an email address for database searches. Please use the --email flag to specify an email address.\n"
        )
        sys.exit(1)
    email = args.email

    if not args.species or not os.path.isfile(args.species):
        print("Please specify a valid list of taxa to search for.\n")
        sys.exit(1)

    # -1 is passed on to get_longest_seq when no maximum was given.
    if args.max_seq_length:
        max_seq_length = int(args.max_seq_length)
    else:
        max_seq_length = -1

    if not args.genes or not os.path.isfile(args.genes):
        print("Please specify a valid list of genes to search for.\n")
        sys.exit(1)

    # read in gene names....
    # format of file:
    # gene_name,include,rbcL,RBCL
    # gene_name,exclude,RRRBCL
    # NOTE(review): the CSV files are opened in 'rb' mode, which the csv
    # module expects on Python 2; Python 3 needs text mode (newline="").
    # Confirm the target interpreter before changing it.
    genes = []
    with open(args.genes, 'rb') as csvfile:
        genereader = csv.reader(csvfile, delimiter=",")
        for row in genereader:
            if len(row) < 2:
                # blank or incomplete line: nothing to include or exclude
                continue
            terms = [term for term in row[2:] if term != ""]
            if row[1] == "include":
                gene = Gene(row[0])
                gene.gene_names.extend(terms)
                genes.append(gene)
            if row[1] == "exclude":
                for gene in genes:
                    if gene.name == row[0]:
                        gene.exclusions.extend(terms)

    # list of all taxon objects
    taxa = []

    # check for taxid
    print("Checking for taxids csv file...")
    if args.taxids and os.path.isfile(args.taxids):
        with open(args.taxids, 'rb') as csvfile:
            print("Found taxids csv file, reading taxids...\n")
            taxidsreader = csv.reader(csvfile, delimiter=",")
            for row in taxidsreader:
                taxa.append(Taxon(row[0], row[1]))
    else:
        print("No taxids csv file found.\n")

    # count the species lines for the progress display
    with open(args.species) as species_lines:
        num_lines = sum(1 for line in species_lines)

    # open species list file, get synonyms and any missing taxids
    with open(args.species, 'rb') as csvfile, open("taxids.csv", "w") as taxids_file:
        print("Checking list of species, getting missing taxids from NCBI...")
        namesreader = csv.reader(csvfile, delimiter=",")
        for i, row in enumerate(namesreader, 1):
            # update status
            percent = str(round(100 * i / float(num_lines), 2))
            sys.stdout.write('\r' + 'Completed: ' + str(i) + '/' +
                             str(num_lines) + ' (' + percent + '%)')
            sys.stdout.flush()
            if not row:
                # blank line: no species name to process
                continue
            # check to see if we already have a taxid for this species
            taxon = None
            for known in taxa:
                if known.binomial == row[0]:
                    taxon = known
                    break
            if taxon is not None:
                taxids_file.write(taxon.binomial + "," + taxon.taxid + "\n")
                # add synonyms
                taxon.synonyms.extend(row[1:])
            else:
                taxon = Taxon(row[0])
                # add synonyms
                taxon.synonyms.extend(row[1:])
                taxa.append(taxon)
                # get the taxid from NCBI
                taxon.get_taxid(email)
                # dont overload genbank
                time.sleep(0.1)
                taxids_file.write(taxon.binomial + "," + taxon.taxid + "\n")
    print("\nWriting all taxids to file taxids.csv...")

    print("\nDownloading sequences from NCBI...")
    num_taxa = len(taxa)
    for gene in genes:
        print("\nSearching for gene: " + gene.name)
        for i, taxon in enumerate(taxa, 1):
            # update status; the total is the number of taxa, which can
            # differ from the number of lines in the species file
            percent = str(round(100 * i / float(num_taxa), 2))
            sys.stdout.write('\r' + 'Completed: ' + str(i) + '/' +
                             str(num_taxa) + ' (' + percent + '%)')
            sys.stdout.flush()
            if taxon.taxid != "not found":
                taxon.get_sequences(email, gene)
                # dont overload genbank
                time.sleep(0.2)

        print("\nGenerating unaligned FASTA file...")
        with open(gene.name + ".fasta", "w") as unaligned_file:
            for taxon in taxa:
                record = taxon.get_longest_seq(gene.name, max_seq_length)
                # isinstance also accepts subclasses of SeqRecord
                if isinstance(record, Bio.SeqRecord.SeqRecord):
                    # output format: >binomial_accession_description
                    description = taxon.binomial + "_" + record.id + "_" + record.description
                    description = description.replace(" ", "_")
                    unaligned_file.write(">" + description + "\n")
                    unaligned_file.write(str(record.seq) + "\n\n")

        print("Making alignment with MAFFT...")
        try:
            from Bio.Align.Applications import MafftCommandline
            mafft_cline = MafftCommandline(input=gene.name + ".fasta")
            mafft_cline.set_parameter("--auto", True)
            mafft_cline.set_parameter("--adjustdirection", True)
            print(str(mafft_cline))
            stdout, stderr = mafft_cline()
            print("Writing alignment to FASTA file...")
            with open("aligned_" + gene.name + ".fasta", "w") as handle:
                handle.write(stdout)
        except Exception as err:
            # The alignment stays optional, but Ctrl-C is no longer
            # swallowed and the actual cause is shown.
            print("Problem finding MAFFT, alignment skipped.")
            print("Details: " + str(err))

    print("\nGenerating summary results spreadsheet...\n")
    with open("result.csv", "w") as summary:
        summary.write("taxon," + "".join(gene.name + "," for gene in genes) + "\n")
        for taxon in taxa:
            cells = [taxon.binomial]
            for gene in genes:
                # each column will be the longest sequences accession
                record = taxon.get_longest_seq(gene.name, max_seq_length)
                if isinstance(record, Bio.SeqRecord.SeqRecord):
                    cells.append(record.id)
                else:
                    cells.append("")
            # every cell, including the last, is followed by a comma
            summary.write(",".join(cells) + ",\n")
    print("Done!\n")
Esempio n. 15
0
 def test_Mafft_with_complex_command_line(self):
     """Round-trip with complex command line.

     Sets a mix of switches and valued options, checks the generated
     command line text, then runs it in a child process and checks the
     exit status, the start of the alignment on stdout and stderr.
     """
     cmdline = MafftCommandline(mafft_exe)
     cmdline.set_parameter("input", self.infile1)
     cmdline.set_parameter("--localpair", True)
     cmdline.set_parameter("--weighti", 4.2)
     cmdline.set_parameter("retree", 5)
     cmdline.set_parameter("maxiterate", 200)
     cmdline.set_parameter("--nofft", True)
     cmdline.set_parameter("op", 2.04)
     cmdline.set_parameter("--ep", 0.51)
     cmdline.set_parameter("--lop", 0.233)
     cmdline.set_parameter("lep", 0.2)
     cmdline.set_parameter("--reorder", True)
     cmdline.set_parameter("--treeout", True)
     cmdline.set_parameter("nuc", True)
     # repr() of the command line must evaluate back to an equivalent object.
     self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
     self.assertEqual(str(cmdline), mafft_exe
                      + " --localpair --weighti 4.2 --retree 5 "
                      + "--maxiterate 200 --nofft --op 2.04 --ep 0.51"
                      + " --lop 0.233 --lep 0.2 --reorder --treeout"
                      + " --nuc Fasta/f002")
     # universal_newlines=True makes the pipes return text, so the string
     # comparisons below also work where pipes would otherwise yield bytes.
     child = subprocess.Popen(str(cmdline),
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              universal_newlines=True,
                              shell=(sys.platform != "win32"))
     # communicate() drains both pipes while waiting; calling wait() first
     # can deadlock once the child fills a pipe buffer.
     stdoutdata, stderrdata = child.communicate()
     return_code = child.returncode
     self.assertEqual(return_code, 0, "Got error code %i back from:\n%s"
                      % (return_code, cmdline))
     # assertTrue replaces the deprecated assert_ alias.
     self.assertTrue(stdoutdata.startswith(">gi|1348912|gb|G26680|G26680"))
     self.assertTrue("$#=0" not in stderrdata)
     del child
Esempio n. 16
0
 def test_Mafft_with_complex_command_line(self):
     """Build a long MAFFT command line, check its text form, then run it."""
     mafft = MafftCommandline(mafft_exe)
     settings = [
         ("input", self.infile1),
         ("--localpair", True),
         ("--weighti", 4.2),
         ("retree", 5),
         ("maxiterate", 200),
         ("--nofft", True),
         ("op", 2.04),
         ("--ep", 0.51),
         ("--lop", 0.233),
         ("lep", 0.2),
         ("--reorder", True),
         ("--treeout", True),
         ("nuc", True),
     ]
     for option, value in settings:
         mafft.set_parameter(option, value)
     # repr() must evaluate back to an equivalent command line.
     self.assertEqual(str(eval(repr(mafft))), str(mafft))
     expected_text = mafft_exe + (
         " --localpair --weighti 4.2 --retree 5 "
         "--maxiterate 200 --nofft --op 2.04 --ep 0.51"
         " --lop 0.233 --lep 0.2 --reorder --treeout"
         " --nuc Fasta/f002"
     )
     self.assertEqual(str(mafft), expected_text)
     out_text, err_text = mafft()
     self.assertTrue(out_text.startswith(">gi|1348912|gb|G26680|G26680"))
     self.assertTrue("$#=0" not in err_text)
Esempio n. 17
0
 def test_Mafft_with_complex_command_line(self):
     """Exercise MAFFT with many options: verify the command text, then the run output."""
     app = MafftCommandline(mafft_exe)
     set_option = app.set_parameter
     set_option("input", self.infile1)
     set_option("--localpair", True)
     set_option("--weighti", 4.2)
     set_option("retree", 5)
     set_option("maxiterate", 200)
     set_option("--nofft", True)
     set_option("op", 2.04)
     set_option("--ep", 0.51)
     set_option("--lop", 0.233)
     set_option("lep", 0.2)
     set_option("--reorder", True)
     set_option("--treeout", True)
     set_option("nuc", True)
     # The repr() of the command line has to evaluate to an equal command.
     from_repr = str(eval(repr(app)))
     self.assertEqual(from_repr, str(app))
     option_text = "".join([
         " --localpair --weighti 4.2 --retree 5 ",
         "--maxiterate 200 --nofft --op 2.04 --ep 0.51",
         " --lop 0.233 --lep 0.2 --reorder --treeout",
         " --nuc Fasta/f002",
     ])
     self.assertEqual(str(app), mafft_exe + option_text)
     captured = app()
     alignment, messages = captured
     expected_start = ">gi|1348912|gb|G26680|G26680"
     self.assertTrue(alignment.startswith(expected_start))
     self.assertTrue("$#=0" not in messages)
Esempio n. 18
0
    for j, accession in enumerate(accessions[i]):
        if accession.strip() != '':
            handle = Entrez.efetch(db='nucleotide',
                                   rettype='fasta',
                                   retmode='text',
                                   id=accession)
            record = SeqIO.read(handle, 'fasta')
            records[i].append(
                SeqRecord(Seq(str(record.seq), IUPAC.ambiguous_dna),
                          id=taxa[j],
                          description=""))
            handle.close()
            sleep(0.02)
    SeqIO.write(records[i], "sequences_unaligned/" + genes[i] + ".fasta",
                "fasta")

# Align each gene's unaligned FASTA file with MAFFT and save the alignment.
for i, gene in enumerate(genes):
    print("Aligning " + gene + " with MAFFT...")
    # Input is the per-gene file under sequences_unaligned/.
    mafft_cline = MafftCommandline(input="sequences_unaligned/" + genes[i] +
                                   ".fasta")
    mafft_cline.set_parameter("--auto", True)
    mafft_cline.set_parameter("--adjustdirection", True)
    # Print the command line before running it.
    print(str(mafft_cline))
    stdout, stderr = mafft_cline()

    print("Writing " + gene + " alignment to FASTA file...")
    # MAFFT's stdout is saved as the alignment file under sequences_aligned/.
    with open("sequences_aligned/" + genes[i] + ".fasta", "w") as handle:
        handle.write(stdout)

print("Done.")