def runDNAdist(seqout, outbase):
    """Run DNAdist (EMBOSS ``fdnadist``) on the seqboot output.

    Args:
        seqout: Path of the sequence file handed to the wrapper as ``sequence``.
        outbase: Prefix of the result file; ``".dnadist"`` is appended to it.

    Returns:
        The path of the distance file, i.e. ``outbase + ".dnadist"``.
    """
    # Parenthesised form prints the same text under Python 2 and Python 3.
    print("running DNA distance")
    distout = outbase + ".dnadist"
    dnadist_cline = FDNADistCommandline(sequence=seqout, outfile=distout, method="j")
    # Calling the wrapper object is what actually runs the program; the
    # captured output it returns is not needed here.
    dnadist_cline()
    return distout
def distances_from_alignment(self, filename, DNA=True):
    """Verify that a distance matrix can be produced from an alignment file.

    ``fdnadist`` is used when ``DNA`` is true and ``fprotdist`` otherwise; in
    both cases the result is written to ``test_file``.
    """
    self.assertTrue(os.path.isfile(filename), "Missing %s" % filename)
    # Both programs take the same options, so only the wrapper class and the
    # key of its executable differ between the two cases.
    if DNA:
        wrapper, exe_key = FDNADistCommandline, "fdnadist"
    else:
        wrapper, exe_key = FProtDistCommandline, "fprotdist"
    cline = wrapper(exes[exe_key], method="j", sequence=filename,
                    outfile="test_file", auto=True)
    cline()
    # biopython can't grok distance matrices, so we'll just check it exists
    self.assertTrue(os.path.isfile("test_file"))
def compute_pairwise_matrices():
    """Compute a distance matrix for every alignment in ``data/alignments``.

    One ``fdnadist`` process is started per alignment file (all of them are
    launched before any is waited for), with its output directed to
    ``data/matrices``. The function then blocks until every process has
    exited, printing ``Done matrix <id>`` as each one finishes.
    """
    import time  # local import: only needed for the polling delay below

    fline = FDNADistCommandline()  # create FDNADist object
    fline.method = "f"
    fline.stdout = True
    end_filename = 24  # filename end before ".phy", see below

    # get list of all pairwise alignments previously completed
    running = []
    for matrix_id, alignment in enumerate(os.listdir("data/alignments"), 1):
        fline.sequence = "data/alignments/" + alignment  # set sequence parameter
        # to get string ISL_XXXXXX_vs_ISL_YYYYYY_matrix
        fline.outfile = alignment[:end_filename] + "_matrix"
        command = str(fline)
        print(command)
        # NOTE(review): the command string is executed through the shell and
        # contains file names taken from the directory listing; confirm those
        # names are trusted.
        proc = subprocess.Popen(command + " -odirectory2 data/matrices ", shell=True)
        running.append((matrix_id, proc))

    # Poll until every process has exited. The list is rebuilt on each pass
    # instead of being mutated while it is iterated, and the short sleep keeps
    # the loop from spinning at full CPU while the children work.
    while running:
        still_running = []
        for matrix_id, proc in running:
            if proc.poll() is None:
                still_running.append((matrix_id, proc))
            else:
                print("Done matrix %d" % matrix_id)
        running = still_running
        if running:
            time.sleep(0.1)
def build_nj_phylip(alignment, outfile, outgroup, work_dir="."):
    """
    build neighbor joining tree of DNA seqs with PHYLIP in EMBOSS

    PHYLIP manual
    http://evolution.genetics.washington.edu/phylip/doc/

    The alignment is written to ``<work_dir>/work/aln.phy`` and then run
    through fseqboot, fdnadist, fneighbor and fconsense. A second fdnadist run
    on the original alignment feeds ``run_ffitch``, and the resulting tree
    (re-rooted when ``outgroup`` is given) is annotated with node support and
    written to ``outfile``.

    Returns ``(outfile, phy_file)`` when a non-empty tree file was written.
    Returns ``None`` when the alignment could not be written in PHYLIP format
    or the tree file is missing or empty. When re-rooting succeeds,
    ``".unrooted"`` is removed from the returned ``outfile`` name.
    """
    phy_file = op.join(work_dir, "work", "aln.phy")
    try:
        # The context manager guarantees the alignment is flushed and closed
        # before the external programs below read it.
        with open(phy_file, "w") as handle:
            AlignIO.write(alignment, handle, "phylip")
    except ValueError:
        sys.stderr.write(
            "Repeated seq name, possibly due to truncation. NJ tree not built.\n")
        return None

    # Every intermediate file shares the alignment path minus its extension.
    prefix = phy_file.rsplit(".", 1)[0]

    seqboot_out = prefix + ".fseqboot"
    seqboot_cl = FSeqBootCommandline(FPHYLIP_BIN("fseqboot"),
        sequence=phy_file, outfile=seqboot_out,
        seqtype="d", reps=100, seed=12345)
    seqboot_cl()
    logging.debug("Resampling alignment: %s", seqboot_cl)

    dnadist_out = prefix + ".fdnadist"
    dnadist_cl = FDNADistCommandline(FPHYLIP_BIN("fdnadist"),
        sequence=seqboot_out, outfile=dnadist_out, method="f")
    dnadist_cl()
    logging.debug(
        "Calculating distance for bootstrapped alignments: %s", dnadist_cl)

    neighbor_out = prefix + ".njtree"
    e = prefix + ".fneighbor"
    neighbor_cl = FNeighborCommandline(FPHYLIP_BIN("fneighbor"),
        datafile=dnadist_out, outfile=e, outtreefile=neighbor_out)
    neighbor_cl()
    logging.debug("Building Neighbor Joining tree: %s", neighbor_cl)

    consense_out = prefix + ".consensustree.nodesupport"
    e = prefix + ".fconsense"
    consense_cl = FConsenseCommandline(FPHYLIP_BIN("fconsense"),
        intreefile=neighbor_out, outfile=e, outtreefile=consense_out)
    consense_cl()
    logging.debug("Building consensus tree: %s", consense_cl)

    # distance without bootstrapping
    dnadist_out0 = prefix + ".fdnadist0"
    dnadist_cl0 = FDNADistCommandline(FPHYLIP_BIN("fdnadist"),
        sequence=phy_file, outfile=dnadist_out0, method="f")
    dnadist_cl0()
    logging.debug(
        "Calculating distance for original alignment: %s", dnadist_cl0)

    # infer branch length on consensus tree
    consensustree1 = prefix + ".consensustree.branchlength"
    run_ffitch(distfile=dnadist_out0, outtreefile=consensustree1,
        intreefile=consense_out)

    # write final tree
    ct_s = Tree(consense_out)

    if outgroup:
        t1 = consensustree1 + ".rooted"
        t2 = smart_reroot(consensustree1, outgroup, t1)
        # smart_reroot handing back the rooted path is taken as success, so
        # the output name drops its ".unrooted" marker.
        if t2 == t1:
            outfile = outfile.replace(".unrooted", "")
        ct_b = Tree(t2)
    else:
        ct_b = Tree(consensustree1)

    # Map each clade of the consensus tree (keyed by its sorted names) to a
    # support value, then copy it onto the matching ancestor in the tree that
    # carries branch lengths.
    # NOTE(review): the division by 100 presumably matches reps=100 above,
    # turning a replicate count into a fraction -- confirm.
    nodesupport = {}
    for node in ct_s.traverse("postorder"):
        node_children = tuple(sorted([f.name for f in node]))
        if len(node_children) > 1:
            nodesupport[node_children] = node.dist / 100.

    for k, v in nodesupport.items():
        ct_b.get_common_ancestor(*k).support = v
    print(ct_b)
    ct_b.write(format=0, outfile=outfile)

    # A missing or empty output file both count as failure.
    try:
        s = op.getsize(outfile)
    except OSError:
        s = 0
    if s:
        logging.debug("NJ tree printed to %s", outfile)
        return outfile, phy_file
    else:
        logging.debug("Something was wrong. NJ tree was not built.")
        return None
infile_Formatted.write("\n") with open("query_searchSequences_dna.fasta",'r') as infile_sequences: for record in SeqIO.parse("query_searchSequences_dna.fasta", "fasta"): speciesName = '{:<10}'.format(record.id[:10]) #This line ensures that any species' name that is greater than 10 characters, gets cut to 10 characters. And any species' name that is less than #10 characters, gets increased to 10 characters using spaces. speciesSequence = record.seq[:numberOfPositions] #This line cuts each sequence to the length stored in 'numberOfPositions' so that each sequence is the same length. infile_Formatted.write(str(speciesName)) infile_Formatted.write(str(speciesSequence)) infile_Formatted.write("\n") ##The next part of the code uses FDNADistCommandline and the "FormattedInputFile.txt" file created above to create a distance matrix. #The first part is "/usr/local/bin/fdnadist", which tells Python where it can find this application. The sequence parameter tells FDNADistCommandline which file you would like to use that contains #the sequences you want to compute a distance matrix for. The method parameter tells FDNADistCommandline which distance matrix algorithm you would like to use; in this case, f means FDNADistCommandline #will use the F84 distance model. The outfile parameter specifies the name of the outfile in which FDNADistCommandline will write the results to. FDNADist_matrix = FDNADistCommandline("/usr/local/bin/fdnadist",sequence="FormattedInputFile.txt",method="f",outfile="distanceMatrix.fdnadist") stdout, stderr = FDNADist_matrix() #This line is required in order for FDNADistCommandline to actually write anything to the outfile. ##The next part of the code uses FNeighborCommandline and the "distanceMatrix.fdnadist" file created above to create a phylogenetic tree. #The first part is "/usr/local/bin/fneighbor", which tells Python where it can find this application. 
The datafile parameter tells FNeighborCommandline which file you would like to use that contains #the distance matrix that will be used to create the phylogenetic tree. The outfile parameter specifies the name of the outfile in which FNeighborCommandline will write the results to. FNeighbor_tree = FNeighborCommandline("/usr/local/bin/fneighbor",datafile="distanceMatrix.fdnadist",outfile="treeFile.fneighbor") stdout, stderr = FNeighbor_tree() #This line is required in order for FNeighborCommandline to actually write anything to the outfile. ##The next part of the code creates alignments between each and every one of the sequences in the "query_searchSequences_dna.fasta" using the NeedleallCommandline. #The next few lines create an input file for NeedleallCommandline. The FASTA file called 'query_searchSequences_dna.fasta' cannot be used because it contains spaces between each record and for #some reason the spaces affect the Needleall alignemnt. So the next few lines create a FASTA without spaces between each record called 'needlallAlignmentInput_Nucleotide.fasta'. The sequences are also all