예제 #1
0
def runDNAdist(seqout, outbase):
    """
    Run DNAdist on the seqboot output.

    Builds an ``FDNADistCommandline`` with ``method="j"`` that reads
    ``seqout`` and writes to ``outbase + ".dnadist"``, then executes it.

    Args:
        seqout: path of the sequence file to read (the seqboot output).
        outbase: base name for the output; ``".dnadist"`` is appended.

    Returns:
        The path of the distance file that the command was told to write.
    """
    # Parenthesised single-argument form prints the same text on Python 2 and 3.
    print("running DNA distance")
    distout = outbase + ".dnadist"
    dnadist_cline = FDNADistCommandline(sequence=seqout,
                                        outfile=distout,
                                        method="j")
    # Calling the command-line object runs the program; its captured
    # output is not needed here.
    dnadist_cline()
    return distout
예제 #2
0
 def distances_from_alignment(self, filename, DNA=True):
     """Run a distance-matrix program on an alignment and check it wrote a file.

     ``filename`` must exist. When ``DNA`` is true the fdnadist wrapper is
     used, otherwise the fprotdist wrapper; both run with ``method="j"`` and
     write to ``"test_file"``. The matrix itself is not parsed, only the
     existence of the output file is asserted.
     """
     self.assertTrue(os.path.isfile(filename), "Missing %s" % filename)
     # Pick the wrapper class and the key of its executable in ``exes``.
     if DNA:
         wrapper, exe_key = FDNADistCommandline, "fdnadist"
     else:
         wrapper, exe_key = FProtDistCommandline, "fprotdist"
     cline = wrapper(exes[exe_key],
                     method="j",
                     sequence=filename,
                     outfile="test_file",
                     auto=True)
     _out, _err = cline()
     # The distance matrix is not parsed here, so only check the file exists.
     self.assertTrue(os.path.isfile("test_file"))
def compute_pairwise_matrices():
    """Launch one fdnadist process per pairwise alignment and wait for them all.

    Every entry of ``data/alignments`` is fed to an ``FDNADistCommandline``
    (``method="f"``). The output file is named after the first 24 characters
    of the alignment filename plus ``"_matrix"`` and ``-odirectory2
    data/matrices`` is appended to the command. All processes are started up
    front; the function then polls them and prints a "Done matrix" line as
    each one finishes.
    """
    import time  # local import: only needed for the polling delay below

    # List of all pairwise alignments previously completed.
    alignments = os.listdir("data/alignments")
    fline = FDNADistCommandline()
    fline.method = "f"
    fline.stdout = True
    end_filename = 24  # filename end before ".phy"
    processes = []
    for matrix_id, alignment in enumerate(alignments, 1):
        fline.sequence = "data/alignments/" + alignment
        # Gives a string of the form ISL_XXXXXX_vs_ISL_YYYYYY_matrix.
        fline.outfile = alignment[:end_filename] + "_matrix"
        print(str(fline))
        # NOTE(review): the command is a shell string built from directory
        # entries; names containing shell metacharacters would be interpreted
        # by the shell.
        p = subprocess.Popen(str(fline) + " -odirectory2 data/matrices ",
                             shell=True)
        processes.append((matrix_id, p))

    # Rebuild the pending list on every pass instead of removing entries from
    # the list being iterated, and sleep between passes so the wait loop does
    # not compete with the child processes for CPU.
    while processes:
        still_running = []
        for matrix_id, proc in processes:
            if proc.poll() is None:
                still_running.append((matrix_id, proc))
            else:
                print("Done matrix %d" % matrix_id)
        processes = still_running
        if processes:
            time.sleep(0.1)
예제 #4
0
def build_nj_phylip(alignment, outfile, outgroup, work_dir="."):
    """
    build neighbor joining tree of DNA seqs with PHYLIP in EMBOSS

    PHYLIP manual
    http://evolution.genetics.washington.edu/phylip/doc/

    Steps, all operating on files next to ``<work_dir>/work/aln.phy``:

    1. write ``alignment`` in PHYLIP format;
    2. resample it with fseqboot (100 replicates, fixed seed);
    3. compute distances for the replicates with fdnadist;
    4. build trees with fneighbor and merge them with fconsense;
    5. compute distances for the original alignment and pass them, with the
       consensus tree, to ``run_ffitch`` to obtain branch lengths;
    6. optionally reroot with ``smart_reroot``, copy node supports from the
       consensus tree onto the branch-length tree and write it to ``outfile``.

    Args:
        alignment: alignment object accepted by ``AlignIO.write``.
        outfile: path of the final tree file. If ``outgroup`` is given and
            ``smart_reroot`` returns the rooted-tree path it was handed,
            ``".unrooted"`` is removed from this name.
        outgroup: outgroup passed to ``smart_reroot``; falsy to skip rerooting.
        work_dir: directory containing the ``work`` subdirectory used for
            intermediate files (assumed to exist already).

    Returns:
        ``(outfile, phy_file)`` when the tree file exists and is non-empty,
        otherwise ``None`` (also when the alignment cannot be written).
    """

    phy_file = op.join(work_dir, "work", "aln.phy")
    try:
        # Close the handle before fseqboot reads the file so that the
        # alignment is fully flushed to disk.
        with open(phy_file, "w") as handle:
            AlignIO.write(alignment, handle, "phylip")
    except ValueError:
        sys.stderr.write(
            "Repeated seq name, possibly due to truncation. "
            "NJ tree not built.\n")
        return None

    # Every intermediate file shares the alignment path minus its extension.
    base = phy_file.rsplit(".", 1)[0]

    seqboot_out = base + ".fseqboot"
    seqboot_cl = FSeqBootCommandline(FPHYLIP_BIN("fseqboot"),
        sequence=phy_file, outfile=seqboot_out,
        seqtype="d", reps=100, seed=12345)
    seqboot_cl()
    logging.debug("Resampling alignment: %s", seqboot_cl)

    dnadist_out = base + ".fdnadist"
    dnadist_cl = FDNADistCommandline(FPHYLIP_BIN("fdnadist"),
        sequence=seqboot_out, outfile=dnadist_out, method="f")
    dnadist_cl()
    logging.debug(
        "Calculating distance for bootstrapped alignments: %s", dnadist_cl)

    neighbor_out = base + ".njtree"
    neighbor_report = base + ".fneighbor"
    neighbor_cl = FNeighborCommandline(FPHYLIP_BIN("fneighbor"),
        datafile=dnadist_out, outfile=neighbor_report,
        outtreefile=neighbor_out)
    neighbor_cl()
    logging.debug("Building Neighbor Joining tree: %s", neighbor_cl)

    consense_out = base + ".consensustree.nodesupport"
    consense_report = base + ".fconsense"
    consense_cl = FConsenseCommandline(FPHYLIP_BIN("fconsense"),
        intreefile=neighbor_out, outfile=consense_report,
        outtreefile=consense_out)
    consense_cl()
    logging.debug("Building consensus tree: %s", consense_cl)

    # distance without bootstrapping
    dnadist_out0 = base + ".fdnadist0"
    dnadist_cl0 = FDNADistCommandline(FPHYLIP_BIN("fdnadist"),
        sequence=phy_file, outfile=dnadist_out0, method="f")
    dnadist_cl0()
    logging.debug(
        "Calculating distance for original alignment: %s", dnadist_cl0)

    # infer branch length on consensus tree
    consensustree1 = base + ".consensustree.branchlength"
    run_ffitch(distfile=dnadist_out0, outtreefile=consensustree1,
            intreefile=consense_out)

    # write final tree
    ct_s = Tree(consense_out)

    if outgroup:
        t1 = consensustree1 + ".rooted"
        t2 = smart_reroot(consensustree1, outgroup, t1)
        if t2 == t1:
            outfile = outfile.replace(".unrooted", "")
        ct_b = Tree(t2)
    else:
        ct_b = Tree(consensustree1)

    # Key each multi-leaf clade of the support tree by its sorted leaf names.
    # NOTE(review): the distance field of the consensus tree presumably holds
    # replicate counts (out of 100), hence the division -- confirm.
    nodesupport = {}
    for node in ct_s.traverse("postorder"):
        node_children = tuple(sorted([f.name for f in node]))
        if len(node_children) > 1:
            nodesupport[node_children] = node.dist / 100.

    # Transfer each support onto the matching clade of the branch-length tree.
    for k, v in nodesupport.items():
        ct_b.get_common_ancestor(*k).support = v
    print(ct_b)
    ct_b.write(format=0, outfile=outfile)

    # A missing or empty output file is treated as failure.
    try:
        s = op.getsize(outfile)
    except OSError:
        s = 0
    if s:
        logging.debug("NJ tree printed to %s", outfile)
        return outfile, phy_file
    else:
        logging.debug("Something was wrong. NJ tree was not built.")
        return None
    infile_Formatted.write("\n")
    with open("query_searchSequences_dna.fasta",'r') as infile_sequences:
        for record in SeqIO.parse("query_searchSequences_dna.fasta", "fasta"):
            speciesName = '{:<10}'.format(record.id[:10]) #This line ensures that any species' name that is greater than 10 characters, gets cut to 10 characters. And any species' name that is less than
            #10 characters, gets increased to 10 characters using spaces. 
            speciesSequence = record.seq[:numberOfPositions] #This line cuts each sequence to the length stored in 'numberOfPositions' so that each sequence is the same length.
            infile_Formatted.write(str(speciesName))
            infile_Formatted.write(str(speciesSequence))
            infile_Formatted.write("\n")


## Distance matrix: run fdnadist on the "FormattedInputFile.txt" file created above.
# The first argument, "/usr/local/bin/fdnadist", is the location of the application.
# `sequence` is the file holding the sequences to compute the distance matrix for,
# `method` selects the distance algorithm (here "f", the F84 distance model), and
# `outfile` is the name of the file the results are written to.
FDNADist_matrix = FDNADistCommandline(
    "/usr/local/bin/fdnadist",
    sequence="FormattedInputFile.txt",
    method="f",
    outfile="distanceMatrix.fdnadist")

# The command object must be called for anything to be written to the outfile.
stdout, stderr = FDNADist_matrix()


## Phylogenetic tree: run fneighbor on the "distanceMatrix.fdnadist" file created above.
# The first argument, "/usr/local/bin/fneighbor", is the location of the application.
# `datafile` is the file holding the distance matrix used to build the tree and
# `outfile` is the name of the file the results are written to.
FNeighbor_tree = FNeighborCommandline(
    "/usr/local/bin/fneighbor",
    datafile="distanceMatrix.fdnadist",
    outfile="treeFile.fneighbor")

# The command object must be called for anything to be written to the outfile.
stdout, stderr = FNeighbor_tree()


##The next part of the code creates alignments between each and every one of the sequences in the "query_searchSequences_dna.fasta" using the NeedleallCommandline.
#The next few lines create an input file for NeedleallCommandline. The FASTA file called 'query_searchSequences_dna.fasta' cannot be used because it contains spaces between each record and for
#some reason the spaces affect the Needleall alignment. So the next few lines create a FASTA file without spaces between each record called 'needlallAlignmentInput_Nucleotide.fasta'. The sequences are also all