def fasta_to_tree(DIR, fasta, num_cores, seqtype, num_seq_cutoff=NUM_SEQ_CUTOFF):
    """
    given a fasta file, align, trim the alignment and build a tree
    choose appropriate tools depending on the size of the fasta file
    """
    if DIR[-1] != "/":
        DIR += "/"
    seqcount, maxlen = get_fasta_size(DIR + fasta)
    assert seqcount >= 4, "Less than four sequences in " + DIR + fasta
    print fasta, seqcount, "sequences"
    tree = None
    if seqcount >= num_seq_cutoff:  # large cluster
        print "running pasta"
        alignment = pasta(DIR, fasta, num_cores, seqtype)
        cleaned = pxclsq(DIR, alignment, 0.01, seqtype)
        if len(read_fasta_file(DIR + cleaned)) >= 4:
            tree = fasttree(DIR, cleaned, seqtype)
        else:
            print "Less than 4 taxa in", cleaned
    else:  # small cluster
        alignment = mafft(DIR, fasta, num_cores, seqtype)
        cleaned = pxclsq(DIR, alignment, 0.1, seqtype)
        if len(read_fasta_file(DIR + cleaned)) >= 4:
            tree = raxml(DIR, cleaned, num_cores, seqtype)
        else:
            print "Less than 4 taxa in", cleaned
    return tree
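# Usage sketch (hypothetical directory, file name and core count; assumes
# the pasta/mafft/pxclsq/fasttree/raxml helpers defined in this module):
#
#   tree = fasta_to_tree("clusters/", "cluster1.fa", 4, "aa")
#
# Clusters with at least num_seq_cutoff sequences are aligned with pasta
# and get a fasttree tree; smaller clusters go through mafft and raxml.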
def refine(query_fasta, start_fasta, deep_paralog_cutoff, num_cores):
    gene_name = get_filename_from_path(query_fasta)[1].split(".")[0]
    outdir, fasta = get_filename_from_path(start_fasta)
    deep_paralog_cutoff = float(deep_paralog_cutoff)
    query_ids = [s.name for s in seq.read_fasta_file(query_fasta)]
    new_fasta = []  # list of output refined fasta files
    print outdir, fasta

    # make a tree from the start_fasta
    tree = fasta_to_tree.fasta_to_tree(outdir, fasta, num_cores, "aa")
    if tree is None:
        return []
    with open(tree, "r") as infile:
        intree = newick3.parse(infile.readline())
    root = trim_tips.trim(intree,
                          relative_cutoff=deep_paralog_cutoff,
                          absolute_cutoff=deep_paralog_cutoff * 2)
    if os.path.exists(outdir + fasta + ".pasta.aln-cln"):
        clnfile = outdir + fasta + ".pasta.aln-cln"
    else:
        clnfile = outdir + fasta + ".mafft.aln-cln"
    root = mask_tips_by_taxonID_transcripts.mask(root,
                                                 clnfile=clnfile,
                                                 para="y",
                                                 ignore=GENOMES)
    if root is not None:
        with open(tree + ".tt.mm", "w") as outfile:
            outfile.write(newick3.tostring(root) + "\n")
        subtrees = cut_long_internal_branches.cut_long_internal_branches(
            root, cutoff=deep_paralog_cutoff)
        count = 0
        base_name = fasta.split(".")[0]
        seqDICT = {}  # key is seqid, value is seq
        for s in seq.read_fasta_file(start_fasta):
            seqDICT[s.name] = s.seq
        for tree in subtrees:
            if tree is None:
                continue
            label_set = set(tree_utils.get_front_labels(tree))
            if len(label_set) > 4 and len(label_set & set(query_ids)) > 0:
                count += 1
                with open(outdir + base_name + "_" + str(count) + ".subtree",
                          "w") as outfile:
                    outfile.write(newick3.tostring(tree) + ";\n")
                with open(outdir + base_name + "_" + str(count) + ".fa",
                          "w") as outfile:
                    for seqid in tree_utils.get_front_labels(tree):
                        try:
                            outfile.write(">" + seqid + "\n"
                                          + seqDICT[seqid] + "\n")
                        except KeyError:
                            print seqid, "not found in fasta file"
                new_fasta.append(outdir + base_name + "_" + str(count) + ".fa")
    return new_fasta
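# Each retained subtree with more than 4 tips and at least one query id
# yields a pair of files named from the input base name and a running
# counter, e.g. (hypothetical base name) cluster1_1.subtree and
# cluster1_1.fa, then cluster1_2.subtree and cluster1_2.fa, and so on.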
def mcl_to_fasta(all_fasta, mcl_outfile, minimal_taxa, outdir):
    print "Reading mcl output file"
    clusterDICT = {}  # key is seqID, value is clusterID
    count = 0
    if outdir[-1] != "/":
        outdir += "/"
    with open(mcl_outfile, "rU") as infile:
        for line in infile:
            if len(line) < 3:
                continue  # ignore empty lines
            spls = line.strip().split("\t")
            if len(set(i.split("@")[0] for i in spls)) >= minimal_taxa:
                count += 1
                clusterID = str(count)
                for seqID in spls:
                    clusterDICT[seqID] = clusterID
    print count, "clusters with at least", minimal_taxa, "taxa read"
    print "Reading the fasta file", all_fasta
    for s in read_fasta_file(all_fasta):
        seqid, seq = s.name, s.seq
        try:
            clusterID = clusterDICT[seqid]
            with open(outdir + "cluster" + clusterID + ".fa", "a") as outfile:
                outfile.write(">" + seqid + "\n" + seq + "\n")
        except KeyError:
            pass  # seqs that did not go into a cluster with enough taxa
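# The MCL output is one cluster per line with tab-separated sequence IDs,
# which this pipeline assumes follow the taxonID@seqID convention, e.g.
# (hypothetical IDs):
#
#   taxonA@seq1  taxonA@seq2  taxonB@seq7  taxonC@seq3
#
# Splitting each ID on "@" recovers the taxon ID, so the line above counts
# as three taxa when checked against minimal_taxa.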
def ortho_to_aln(alndir, tredir, outdir, ortho_tree_file_ending=".tre"):
    """
    read the final homologs and write an individual alignment file for
    each ortholog, shortening each seq id to its taxon id
    """
    if alndir[-1] != "/":
        alndir += "/"
    if tredir[-1] != "/":
        tredir += "/"
    if outdir[-1] != "/":
        outdir += "/"
    filecount = 0
    for i in os.listdir(tredir):
        if i.endswith(ortho_tree_file_ending):
            filecount += 1
            print i
            # read the alignment into a dictionary
            seqDICT = {}  # key is seqID, value is seq
            for s in read_fasta_file(alndir + i.split(".")[0] + ".fa.mafft.aln"):
                seqDICT[s.name] = s.seq
            # read in the tree tips and write the output alignment
            with open(tredir + i, "r") as infile:
                intree = newick3.parse(infile.readline())
            labels = tree_utils.get_front_labels(intree)
            with open(outdir + i.replace(ortho_tree_file_ending, ".aln"),
                      "w") as outfile:
                for lab in labels:
                    outfile.write(">" + tree_utils.get_name(lab) + "\n"
                                  + seqDICT[lab] + "\n")
    assert filecount > 0, \
        "No file ending with " + ortho_tree_file_ending + " found in " + tredir
def phyutility(DIR, alignment, min_col_occup, seqtype, min_chr=10):
    """
    remove columns with occupancy lower than min_col_occup,
    then remove seqs shorter than min_chr after filtering columns
    """
    if DIR[-1] != "/":
        DIR += "/"
    cleaned = alignment + "-cln"
    if os.path.exists(DIR + cleaned):
        return cleaned
    assert alignment.endswith(".aln"), \
        "phyutility infile " + alignment + " does not end with .aln"
    assert os.stat(DIR + alignment).st_size > 0, DIR + alignment + " is empty"
    assert seqtype == "aa" or seqtype == "dna", "Input data type: dna or aa"
    if seqtype == "aa":
        cmd = ["phyutility", "-aa", "-clean", str(min_col_occup), "-in",
               DIR + alignment, "-out", DIR + alignment + "-pht"]
    else:
        cmd = ["phyutility", "-clean", str(min_col_occup), "-in",
               DIR + alignment, "-out", DIR + alignment + "-pht"]
    print " ".join(cmd)
    os.system(" ".join(cmd))
    assert os.path.exists(DIR + alignment + "-pht"), "Error phyutility"
    # remove empty and very short seqs
    with open(DIR + cleaned, "w") as outfile:
        for s in read_fasta_file(DIR + alignment + "-pht"):
            if len(s.seq.replace("-", "")) >= min_chr:
                outfile.write(s.get_fasta())
    os.remove(DIR + alignment + "-pht")
    return cleaned
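# For an amino-acid alignment, the wrapper above builds a command like
# (hypothetical file names and occupancy value):
#
#   phyutility -aa -clean 0.3 -in DIR/cluster1.fa.mafft.aln -out DIR/cluster1.fa.mafft.aln-pht
#
# and then drops sequences with fewer than min_chr unambiguous characters
# from the -pht output before writing the final .aln-cln file.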
def main(fasta, treDIR, tree_file_ending, outDIR):
    if treDIR[-1] != "/":
        treDIR += "/"
    if outDIR[-1] != "/":
        outDIR += "/"
    print "Reading fasta file", fasta
    seqDICT = {}  # key is seqID, value is seq
    for s in read_fasta_file(fasta):
        seqDICT[s.name] = s.seq
    print "Writing fasta files"
    filecount = 0
    for i in os.listdir(treDIR):
        if i.endswith(tree_file_ending):
            print i
            filecount += 1
            with open(treDIR + i, "r") as infile:
                intree = newick3.parse(infile.readline())
            clusterID = tree_utils.get_clusterID(i)
            if clusterID.endswith("rr"):
                outname = outDIR + clusterID + "_rr.fa"
            else:
                outname = outDIR + clusterID + "rr.fa"
            with open(outname, "w") as outfile:
                for label in tree_utils.get_front_labels(intree):
                    outfile.write(">" + label + "\n" + seqDICT[label] + "\n")
    assert filecount > 0, \
        "No file ending with " + tree_file_ending + " found in " + treDIR
def main(treDIR, clnDIR, para, intree_file_ending=INTREE_FILE_ENDING):
    if treDIR[-1] != "/":
        treDIR += "/"
    if clnDIR[-1] != "/":
        clnDIR += "/"
    assert para == "y" or para == "n", "mask paraphyletic tips? (y/n)"
    mask_para = (para == "y")
    filecount = 0

    filematch = {}  # key is clusterID, value is the .aln-cln file
    for i in os.listdir(clnDIR):
        if i.endswith(".aln-cln"):
            clusterID = get_clusterID(i)
            assert clusterID not in filematch, \
                "The clusterID " + clusterID + " repeats in " + clnDIR
            filematch[clusterID] = i

    for i in os.listdir(treDIR):
        if i.endswith(intree_file_ending):
            with open(treDIR + i, "r") as infile:
                intree = newick3.parse(infile.readline())
            print i
            clusterID = get_clusterID(i)
            filecount += 1
            chrDICT = {}  # key is seqid, value is number of unambiguous chrs
            for s in read_fasta_file(clnDIR + filematch[clusterID]):
                for ch in ["-", "X", "x", "?", "*"]:
                    s.seq = s.seq.replace(ch, "")  # ignore gaps and ambiguities
                chrDICT[s.name] = len(s.seq)
            curroot = mask_monophyletic_tips(intree, chrDICT)
            if mask_para:
                curroot = mask_paraphyletic_tips(curroot, chrDICT)
            with open(treDIR + i + ".mm", "w") as outfile:
                outfile.write(newick3.tostring(curroot) + ";\n")
    assert filecount > 0, \
        "No file ending with " + intree_file_ending + " found in " + treDIR
def raxml_bs(DIR, cleaned, num_cores, seqtype, replicates=100):
    assert cleaned.endswith(".aln-cln"), \
        "raxml infile " + cleaned + " does not end with .aln-cln"
    assert seqtype == "aa" or seqtype == "dna", "Input data type: dna or aa"
    assert len(read_fasta_file(DIR + cleaned)) >= 4, \
        "less than 4 sequences in " + DIR + cleaned
    clusterID = cleaned.split(".")[0]
    tree = DIR + clusterID + ".raxml_bs.tre"
    raw_tree = "RAxML_bipartitions." + cleaned
    model = "PROTCATWAG" if seqtype == "aa" else "GTRCAT"
    if not os.path.exists(tree) and not os.path.exists(raw_tree):
        # raxml crashes if the input file name starts with "."
        infasta = cleaned if DIR == "./" else DIR + cleaned
        cmd = ["raxml", "-T", str(num_cores),
               "-f", "a", "-x", "12345", "-#", str(replicates),
               "-p", "12345", "-s", infasta, "-n", cleaned, "-m", model]
        print " ".join(cmd)
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        out = p.communicate()
        assert p.returncode == 0, "Error raxml" + out[0]
    try:
        os.rename(raw_tree, tree)
        os.rename("RAxML_bootstrap." + cleaned,
                  DIR + clusterID + ".raxml_bs.trees")
        os.remove("RAxML_bestTree." + cleaned)
        os.remove("RAxML_info." + cleaned)
        os.remove("RAxML_log." + cleaned)
        os.remove("RAxML_parsimonyTree." + cleaned)
        os.remove("RAxML_result." + cleaned)
        os.remove("RAxML_bipartitionsBranchLabels." + cleaned)
        os.remove(DIR + cleaned + ".reduced")
    except OSError:
        pass  # no need to worry about extra intermediate files
    return tree
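# For a protein alignment, the rapid-bootstrap command assembled above
# looks like (hypothetical thread count and file name):
#
#   raxml -T 4 -f a -x 12345 -# 100 -p 12345 -s cluster1.aln-cln -n cluster1.aln-cln -m PROTCATWAG
#
# -f a runs rapid bootstrapping plus an ML search in one pass, so the
# bipartitions file can be renamed to the final .raxml_bs.tre output.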
def get_fasta_size(fasta):
    """
    given a fasta file, return the number of seqs
    and the length of the longest seq
    """
    longest = 0
    seqlist = read_fasta_file(fasta)
    for s in seqlist:
        longest = max(longest, len(s.seq.replace("-", "")))
    return len(seqlist), longest
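# Example: for a fasta file holding three sequences whose ungapped lengths
# are 120, 98 and 240, get_fasta_size returns (3, 240).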
def fasta_ok(fasta, min_count=MIN_FASTA):
    """count the number of non-empty fasta sequences"""
    if not os.path.exists(fasta):
        return False
    fasta_count = 0
    for i in seq.read_fasta_file(fasta):
        if len(i.seq) > 0:
            fasta_count += 1
    print fasta, "contains", fasta_count, "non-empty sequences"
    return fasta_count >= min_count
def mask(curroot, clnfile, para, ignore=[]):
    chrDICT = {}  # key is seqid, value is number of unambiguous chrs
    for s in read_fasta_file(clnfile):
        for ch in ["-", "X", "x", "?", "*"]:
            s.seq = s.seq.replace(ch, "")  # ignore gaps and ambiguities
        chrDICT[s.name] = len(s.seq)
    curroot = mask_monophyletic_tips(curroot, chrDICT, ignore)
    # para is the "y"/"n" flag used by the callers; testing the bare string
    # would be truthy for "n" as well
    if para == "y":
        curroot = mask_paraphyletic_tips(curroot, chrDICT, ignore)
    return curroot
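# Usage sketch (hypothetical file names; assumes the newick3 module used
# elsewhere in this pipeline):
#
#   with open("cluster1.tre") as infile:
#       root = newick3.parse(infile.readline())
#   root = mask(root, "cluster1.fa.mafft.aln-cln", para="y")
#   with open("cluster1.tre.mm", "w") as outfile:
#       outfile.write(newick3.tostring(root) + ";\n")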
def fasta_to_tree(DIR, fasta, num_cores, seqtype, num_seq_cutoff=NUM_SEQ_CUTOFF):
    """
    given a fasta file, align, trim the alignment and build a tree
    choose appropriate tools depending on the size of the fasta file
    """
    if DIR[-1] != "/":
        DIR += "/"
    seqcount, maxlen = get_fasta_size(DIR + fasta)
    assert seqcount >= 3, "Less than three sequences in " + DIR + fasta
    print fasta, seqcount, "sequences"
    tree = None
    if seqcount >= num_seq_cutoff:  # large cluster
        alignment = pasta(DIR, fasta, num_cores, seqtype)
        # phyutility can only trim gaps, so use trimal instead; trimal
        # still needs to be defined in this module (notes by Tao)
        cleaned = trimal(DIR, alignment, 0.5, 0.001)
        if len(read_fasta_file(DIR + cleaned)) >= 3:
            tree = fasttree(DIR, cleaned, seqtype)
        else:
            print "Less than 3 taxa in", cleaned
    else:  # small cluster
        alignment = mafft(DIR, fasta, num_cores, seqtype)
        # use trimal instead of phyutility here as well (added by Tao)
        cleaned = trimal(DIR, alignment, 0.5, 0.001)
        seqcount, maxlen = get_fasta_size(DIR + cleaned)
        print cleaned, seqcount, "sequences"
        if len(read_fasta_file(DIR + cleaned)) >= 3:
            # raxml replaced by fasttree (added by Tao)
            tree = fasttree(DIR, cleaned, seqtype)
        else:
            print "Less than 3 taxa in", cleaned
    return tree
def fasta_to_bs_tree(DIR, fasta, num_cores, seqtype):
    """
    given a fasta file for the final homolog, align, trim the alignment
    and build a tree with bootstrap support
    """
    if DIR[-1] != "/":
        DIR += "/"
    seqcount, maxlen = get_fasta_size(DIR + fasta)
    assert seqcount >= 4, "Less than four sequences in " + DIR + fasta
    print fasta, seqcount, "sequences"
    alignment = mafft(DIR, fasta, num_cores, seqtype)
    cleaned = phyutility(DIR, alignment, 0.2, seqtype)
    tree = None
    if len(read_fasta_file(DIR + cleaned)) >= 4:
        tree = raxml_bs(DIR, cleaned, num_cores, seqtype)
    else:
        print "Less than 4 taxa in", cleaned
    return tree
def mafft(DIR, fasta, thread, seqtype):
    if DIR[-1] != "/":
        DIR += "/"
    alignment = fasta + ".mafft.aln"
    if os.path.exists(DIR + alignment) and os.stat(DIR + alignment).st_size > 0:
        return alignment
    assert seqtype == "aa" or seqtype == "dna", "Input data type: dna or aa"
    seqlist = read_fasta_file(DIR + fasta)
    seqcount = len(seqlist)
    maxlen = 0
    for s in seqlist:
        maxlen = max(maxlen, len(s.seq))
    assert seqcount >= 4, "less than 4 sequences in " + DIR + fasta
    if seqtype == "dna":
        infasta = DIR + fasta
        seq = "--nuc"
    else:
        infasta = DIR + fasta + ".temp"
        seq = "--amino"
        # write a temp file with the aa seqs cleaned up for mafft
        with open(infasta, "w") as outfile:
            for s in seqlist:
                # replace U, which is usually not in the aa alphabet
                s.seq = s.seq.replace("U", "X")
                s.seq = s.seq.replace("u", "x")
                # remove the stop codon and any seq after it
                if "*" in s.seq:
                    s.seq = s.seq[:s.seq.find("*")]
                outfile.write(s.get_fasta())
    if seqcount >= 2000 or maxlen >= 10000:
        alg = ["--auto"]  # so that the run actually finishes!
    else:
        alg = ["--genafpair", "--maxiterate", "1000"]
    cmd = ["mafft"] + alg + [seq, "--thread", str(thread)]
    #cmd += ["--anysymbol"]  # when there are "U"s in aa sequences
    cmd += [infasta]
    print " ".join(cmd)
    with open(DIR + alignment, "w") as out:
        p = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=out)
        p.communicate()
    assert p.returncode == 0, "Error mafft"
    if seqtype == "aa":
        os.remove(DIR + fasta + ".temp")
    return alignment
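# For a small amino-acid cluster the command printed above looks like
# (hypothetical thread count and file name):
#
#   mafft --genafpair --maxiterate 1000 --amino --thread 4 DIR/cluster1.fa.temp
#
# with stdout redirected to DIR/cluster1.fa.mafft.aln. Clusters with at
# least 2000 sequences or a longest sequence of at least 10000 characters
# fall back to --auto so that the run actually finishes.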
def mcl_to_fasta(all_fasta, mcl_outfile, minimal_taxa, outdir):
    #print "Reading mcl output file"
    clusterDICT = {}  # key is seqID, value is clusterID
    minimal_taxa = int(minimal_taxa)
    count = 0
    with open(mcl_outfile, "r") as infile:
        for line in infile:
            if len(line) < 3:
                continue  # ignore empty lines
            spls = line.strip().split("\t")
            if len(set(get_name(i) for i in spls)) >= minimal_taxa:
                count += 1
                for seqID in spls:
                    clusterDICT[seqID] = str(count)
    #print "Reading the fasta file", all_fasta
    for s in read_fasta_file(all_fasta):
        try:
            clusterID = clusterDICT[s.name]
            with open(outdir + "cluster" + clusterID + ".fa", "a") as outfile:
                outfile.write(">" + s.name + "\n" + s.seq + "\n")
        except KeyError:
            pass  # seqs that did not go into a cluster with enough taxa
def blastp(query_fasta, DIR, num_cores, max_num_hits=20, min_bitscore=20.0):
    """
    same as swipe but using blastp
    """
    if DIR[-1] != "/":
        DIR += "/"
    max_num_hits = int(max_num_hits)
    min_bitscore = float(min_bitscore)

    # blastp against each taxon
    pepfiles = [i for i in os.listdir(DIR)
                if (i.endswith(".pep.fa") or i.endswith(".cdhit"))]
    datasets = [i.split(".")[0] for i in pepfiles]
    print len(pepfiles), "input peptide files read"
    assert len(set(datasets)) == len(datasets), \
        "dataset name repeats. remove duplicated sets"
    for i in os.listdir(DIR):
        if i.endswith(".pep.fa") or i.endswith(".cdhit"):
            if not os.path.exists(DIR + i + ".psd"):
                os.system("makeblastdb -in " + DIR + i +
                          " -parse_seqids -dbtype prot -out " + DIR + i)
            blastp_outname = DIR + i + "." + \
                get_filename_from_path(query_fasta).split(".")[0] + ".blastp"
            if not os.path.exists(blastp_outname):
                cmd = "blastp -db " + DIR + i
                cmd += " -query " + query_fasta
                cmd += " -num_threads " + str(num_cores)
                cmd += " -out " + blastp_outname
                cmd += " -evalue 10"
                cmd += " -max_target_seqs " + str(max_num_hits)
                cmd += " -outfmt '6 qseqid qlen sseqid slen frames pident nident length mismatch gapopen qstart qend sstart send evalue bitscore'"
                print cmd
                os.system(cmd)
                assert os.path.exists(blastp_outname), \
                    "blastp did not finish correctly"
            # blastp output columns used below (see -outfmt above):
            # 0-qseqid, 2-sseqid, 15-bitscore
            # summarize the hit seq ids
            if not os.path.exists(blastp_outname + ".hits"):
                hit_tuples = []  # a list of tuples (hit, bitscore)
                with open(blastp_outname, "r") as infile:
                    for line in infile:
                        if len(line) < 3:
                            continue  # skip empty lines
                        spls = line.strip().split("\t")
                        query, hit, bitscore = spls[0], spls[2], float(spls[-1])
                        if query != hit and bitscore >= min_bitscore:
                            hit_tuples.append((hit, bitscore))
                out = []  # unique hit ids
                for hit, bitscore in sorted(hit_tuples,
                                            key=lambda x: x[1],
                                            reverse=True):
                    if hit not in out:
                        out.append(hit)
                    if len(out) == max_num_hits:
                        break
                if len(out) == 0:
                    print "Warning: No hits found"
                with open(blastp_outname + ".hits", "w") as outfile:
                    for hit in out:
                        print hit
                        outfile.write(hit + "\n")

    # write output fasta
    outname = query_fasta.replace(".pep.fa", "_blastp.fa")
    print "Writing output fasta", outname
    outfile = open(outname, "w")
    query_seqids = []  # avoid seq id repeats
    with open(query_fasta, "r") as infile:
        for line in infile:
            outfile.write(line)  # copy over query seqs
            if line[0] == ">":
                query_seqids.append(line.strip()[1:])
    for i in os.listdir(DIR):
        if i.endswith(".pep.fa") or i.endswith(".cdhit"):
            seqDICT = {}  # key is seq name, value is seq
            for s in seq.read_fasta_file(DIR + i):
                seqDICT[s.name] = s.seq
            with open(DIR + i + "." +
                      get_filename_from_path(query_fasta).split(".")[0] +
                      ".blastp.hits", "r") as infile:
                for line in infile:
                    line = line.strip()
                    if len(line) > 0 and line not in query_seqids:
                        outfile.write(">" + line + "\n" + seqDICT[line] + "\n")
    outfile.close()
                    required=False, default=False)
parser.add_argument("-t", "--threads",
                    help="how many threads for iqtree?",
                    required=False, type=int, default=2)
if len(sys.argv[1:]) == 0:
    sys.argv.append("-h")
args = parser.parse_args()
seqfile = args.seqfile
print("running " + seqfile, file=sys.stderr)
seqs = seq.read_fasta_file(seqfile)
sw = args.sw  # sliding window size
step = args.increment  # step size
numthreads = args.threads
a_len = len(seqs[0].seq)
if a_len < sw:
    print("sequence is less than " + str(args.sw), file=sys.stderr)
    sys.exit(0)
if a_len < sw + step:
    print("wouldn't make more than one segment: " + str(a_len) + " < " +
          str(sw + step), file=sys.stderr)
    sys.exit(0)
x, segc = make_trees(seqs, seqfile, a_len, sw, step, numthreads)
y = run_iqtree(seqfile, numthreads)
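# Example of the two length guards above: with a 1000-column alignment,
# a sliding window of 500 and an increment of 100, both checks pass
# (1000 >= 500 and 1000 >= 600), so more than one segment can be made.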
def swipe(query_fasta, pepdir, outdir, num_cores, max_num_hits=10,
          min_bitscore=30.0):
    """
    get the initial fasta files
    given a pepdir with peptide fasta files that either end with .pep.fa
    or .cdhit, swipe query_fasta against each data set and write the
    output in outdir
    take the top max_num_hits hits ranked by bitscore, with evalue set
    to 10; ignore all hits with bitscore below min_bitscore or below
    0.05 of the highest hit from the query
    """
    if pepdir[-1] != "/":
        pepdir += "/"
    if outdir[-1] != "/":
        outdir += "/"
    num_cores = str(num_cores)
    max_num_hits = int(max_num_hits)
    gene_name = get_filename_from_path(query_fasta).split(".")[0]

    # swipe against each taxon
    pepfiles = [i for i in os.listdir(pepdir)
                if (i.endswith(".pep.fa") or i.endswith(".cdhit"))]
    datasets = [i.split(".")[0] for i in pepfiles]
    print len(pepfiles), "input peptide files read"
    assert len(set(datasets)) == len(datasets), \
        "dataset name repeats. remove duplicated sets"
    temp_files = []  # will remove these later
    for i in os.listdir(pepdir):
        if i.endswith(".pep.fa") or i.endswith(".cdhit"):
            if not os.path.exists(pepdir + i + ".psd"):
                os.system("makeblastdb -in " + pepdir + i +
                          " -parse_seqids -dbtype prot -out " + pepdir + i)
            swipe_outname = pepdir + i + "." + \
                get_filename_from_path(query_fasta).split(".")[0] + ".swipe"
            temp_files.append(swipe_outname)
            temp_files.append(swipe_outname + ".hits")
            if not os.path.exists(swipe_outname):
                cmd = "swipe -d " + pepdir + i + " -i " + query_fasta + \
                      " -a " + num_cores
                cmd += " -p blastp -o " + swipe_outname + " -m 8 -e 10"
                print cmd
                os.system(cmd)
                assert os.path.exists(swipe_outname), \
                    "swipe did not finish correctly"
            # swipe output columns (blast tabular, -m 8):
            # Query id, Subject id, % identity, alignment length, mismatches,
            # gap openings, q. start, q. end, s. start, s. end, e-value, bit score
            # summarize the hit seq ids
            if not os.path.exists(swipe_outname + ".hits"):
                hit_tuples = []  # a list of tuples (hit, bitscore)
                with open(swipe_outname, "r") as infile:
                    for line in infile:
                        if len(line) < 3:
                            continue  # skip empty lines
                        spls = line.strip().split("\t")
                        query, hit, bitscore = spls[0], \
                            spls[1].replace("lcl|", ""), float(spls[-1])
                        if query != hit and bitscore >= min_bitscore:
                            hit_tuples.append((hit, bitscore))
                out = []  # unique hit ids
                highest = 0.0  # store the highest bitscore
                for hit, bitscore in sorted(hit_tuples,
                                            key=lambda x: x[1],
                                            reverse=True):
                    if highest == 0.0:
                        highest = bitscore  # record the first (best) hit
                    if bitscore < 0.05 * highest:
                        break  # stop recording
                    if hit not in out:
                        out.append(hit)
                    if len(out) == max_num_hits:
                        break
                if len(out) == 0:
                    print "Warning: No hits found"
                with open(swipe_outname + ".hits", "w") as outfile:
                    for hit in out:
                        print hit
                        outfile.write(hit + "\n")

    # write the output summary files .rawswipe, .filtered_hits and .swipe.fa
    print "Writing output fasta"
    outfile1 = open(outdir + gene_name + ".rawswipe", "w")
    outfile2 = open(outdir + gene_name + ".filtered_hits", "w")
    outfile3 = open(outdir + gene_name + ".swipe.fa", "w")
    query_seqids = []  # avoid seq id repeats
    with open(query_fasta, "r") as infile:
        for line in infile:
            outfile3.write(line)  # copy over query seqs
            if line[0] == ">":
                query_seqids.append(line.strip()[1:])
    for i in os.listdir(pepdir):
        if i.endswith(".pep.fa") or i.endswith(".cdhit"):
            # copy this taxon's raw swipe output into gene_name.rawswipe
            with open(pepdir + i + "." +
                      get_filename_from_path(query_fasta).split(".")[0] +
                      ".swipe", "r") as infile:
                for line in infile:
                    outfile1.write(line)
            seqDICT = {}  # for each taxon: key is seq name, value is seq
            for s in seq.read_fasta_file(pepdir + i):
                seqDICT[s.name] = s.seq
            with open(pepdir + i + "." +
                      get_filename_from_path(query_fasta).split(".")[0] +
                      ".swipe.hits", "r") as infile:
                for line in infile:
                    line = line.strip()
                    if len(line) == 0:
                        continue
                    outfile2.write(line + "\n")
                    if line not in query_seqids:
                        outfile3.write(">" + line + "\n" + seqDICT[line] + "\n")
    outfile1.close()
    outfile2.close()
    outfile3.close()

    # remove intermediate files
    for i in temp_files:
        os.remove(i)
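# Example of the relative-bitscore cutoff above: if the best hit for a
# query scores 200 bits, hits below 10 bits (0.05 * 200) are discarded
# even when fewer than max_num_hits have been collected.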
curfas = sys.argv[1] + "/notinchildren.fas"
if not os.path.isfile(curfas):
    sys.exit(0)
outclu = sys.argv[2]
LOGFILE = sys.argv[3]
log = Logger(LOGFILE)
log.a()
tempdir = "./"
if len(sys.argv) == 5:
    tempdir = sys.argv[4]
if tempdir[-1] != "/":
    tempdir += "/"
slen = 0
seqs = seq.read_fasta_file(curfas)
if len(seqs) > 0:
    seqd = {}
    for i in seqs:
        seqd[i.name] = i
    log.w("UNCLUSTERED SEQS IN " + sys.argv[1])
    # make a blastdb of the cluster dir
    # see if there are any clusters yet
    numfiles = 0
    for i in os.listdir(outclu):
        numfiles += 1
        break
    if numfiles > 0:
        make_blast_db_from_cluster(outclu, tempdir)
        blast_file_against_db(sys.argv[1], "notinchildren.fas", tempdir)
        dclus, clus = filter_blast.process_blast_out(tempdir + tempname +
def combine_seqs(seq1, seq2):
    """fill gap positions in seq1 with the corresponding character from seq2"""
    finalseq = ""
    for i, j in zip(seq1, seq2):
        if i == "-":
            finalseq += j
        else:
            finalseq += i
    return finalseq


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print "python " + sys.argv[0] + " infile.aln outfile.aln"
        sys.exit()
    seqs = seq.read_fasta_file(sys.argv[1])
    seqs_sample = {}
    seqs_d = {}
    keep_seqs = {}
    for i in seqs:
        seqs_d[i.name] = i
        keep_seqs[i.name] = i
        spls = i.name.split("@")[0]
        try:
            seqs_sample[spls].append(i)
        except KeyError:
            seqs_sample[spls] = [i]
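# Worked example for combine_seqs (a doctest-style sketch, not in the
# original source): gap positions in seq1 are filled from seq2, all other
# positions keep seq1's character.
#
#   >>> combine_seqs("A-C-", "AGCT")
#   'AGCT'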
def concatenate(clnDIR, numofsitesFilter, numoftaxaFilter, outname):
    """filter cleaned alignments and concatenate"""
    if clnDIR[-1] != "/":
        clnDIR += "/"
    sites_filter = int(numofsitesFilter)
    taxa_filter = int(numoftaxaFilter)

    print "Filtering ortholog matrices"
    selected = []  # list of alignment file names that pass the filters
    for i in os.listdir(clnDIR):
        if i.endswith(".aln-cln"):
            seqlist = read_fasta_file(clnDIR + i)
            num_seq = len(seqlist)
            num_col = len(seqlist[0].seq)
            if num_seq >= taxa_filter and num_col >= sites_filter:
                selected.append(i)
    print len(selected), "matrices passed the filter"

    print "Getting matrix occupancy stats"
    # key is taxon name, value is [number of matrices the taxon is in,
    # total number of unambiguous characters for this taxon]
    taxon_occupancy = {}
    total_aligned_len = 0  # length of the final concatenated matrix
    cmd = "pxcat" + " -o " + outname + ".fa" + " -p " + outname + ".model" + " -s "
    for i in selected:
        cmd += clnDIR + i + " "
        seqlist = read_fasta_file(clnDIR + i)
        total_aligned_len += len(seqlist[0].seq)
        for s in seqlist:
            taxonid = get_name(s.name)
            if taxonid not in taxon_occupancy:
                taxon_occupancy[taxonid] = [0, 0]
            taxon_occupancy[taxonid][0] += 1
            taxon_occupancy[taxonid][1] += len(
                (s.seq.replace("-", "")).replace("?", ""))
    cmd += "\n"

    total_ortho = len(selected)
    with open(outname + "_taxon_occupancy_stats", "w") as outfile:
        outfile.write("taxon\t#orthologs\t#total_characters\t"
                      "perc_orthologs\tperc_characters\n")
        sum_char = 0
        for taxon in taxon_occupancy:
            times, chars = taxon_occupancy[taxon]
            sum_char += chars
            out = taxon + "\t" + str(times) + "\t" + str(chars) + "\t"
            out += str(times / float(total_ortho)) + "\t" + \
                str(chars / float(total_aligned_len)) + "\n"
            outfile.write(out)
        total_taxa = len(taxon_occupancy)
        out = "\nSupermatrix dimension " + str(total_taxa) + " taxa, "
        out += str(total_ortho) + " loci and " + \
            str(total_aligned_len) + " aligned columns\n"
        out += "Overall matrix occupancy " + \
            str(sum_char / float(total_taxa * total_aligned_len)) + "\n"
        outfile.write(out)
    print "Supermatrix taxon occupancy stats written to", \
        outname + "_taxon_occupancy_stats"

    print "Waiting for concatenation to finish. This may take several minutes..."
    with open(outname + ".temp.sh", "w") as f:
        f.write(cmd)
    os.system("bash " + outname + ".temp.sh")  # writes the .fa file
    # write the phy file
    cmd_pxs2phy = ["pxs2phy", "-o", outname + ".phy", "-s", outname + ".fa"]
    print " ".join(cmd_pxs2phy)
    os.system(" ".join(cmd_pxs2phy))
    # write the nex file
    cmd_pxs2nex = ["pxs2nex", "-o", outname + ".nex", "-s", outname + ".fa"]
    print " ".join(cmd_pxs2nex)
    os.system(" ".join(cmd_pxs2nex))
    assert os.path.exists(outname + ".phy") and \
        os.path.exists(outname + ".nex") and \
        os.path.exists(outname + ".fa"), "error concatenate"
    os.system("rm " + outname + ".temp.sh")
    print "outfiles written", outname + ".phy", outname + ".model"
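# The shell script written above contains a single pxcat call like
# (hypothetical file names):
#
#   pxcat -o out.fa -p out.model -s clnDIR/cluster1.aln-cln clnDIR/cluster2.aln-cln ...
#
# which concatenates the selected alignments into out.fa and writes the
# matching partition file out.model.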
def swipe(query_fasta, DIR, num_cores, max_num_hits=20, min_bitscore=20.0):
    """
    given a DIR with peptide fasta files that either end with .pep.fa
    or .cdhit, swipe on each one
    return a fasta file with hits from each taxon plus the queries added
    """
    if DIR[-1] != "/":
        DIR += "/"
    max_num_hits = int(max_num_hits)
    min_bitscore = float(min_bitscore)

    # swipe against each taxon
    pepfiles = [i for i in os.listdir(DIR)
                if (i.endswith(".pep.fa") or i.endswith(".cdhit"))]
    datasets = [i.split(".")[0] for i in pepfiles]
    print len(pepfiles), "input peptide files read"
    assert len(set(datasets)) == len(datasets), \
        "dataset name repeats. remove duplicated sets"
    for i in os.listdir(DIR):
        if i.endswith(".pep.fa") or i.endswith(".cdhit"):
            if not os.path.exists(DIR + i + ".psd"):
                os.system("makeblastdb -in " + DIR + i +
                          " -parse_seqids -dbtype prot -out " + DIR + i)
            swipe_outname = DIR + i + "." + \
                get_filename_from_path(query_fasta).split(".")[0] + ".swipe"
            if not os.path.exists(swipe_outname):
                cmd = "swipe -d " + DIR + i + " -i " + query_fasta + \
                      " -a " + str(num_cores)
                cmd += " -p blastp -o " + swipe_outname + " -m 8 -e 10"
                print cmd
                os.system(cmd)
                assert os.path.exists(swipe_outname), \
                    "swipe did not finish correctly"
            # swipe output columns (blast tabular, -m 8):
            # Query id, Subject id, % identity, alignment length, mismatches,
            # gap openings, q. start, q. end, s. start, s. end, e-value, bit score
            # summarize the hit seq ids
            if not os.path.exists(swipe_outname + ".hits"):
                hit_tuples = []  # a list of tuples (hit, bitscore)
                with open(swipe_outname, "r") as infile:
                    for line in infile:
                        if len(line) < 3:
                            continue  # skip empty lines
                        spls = line.strip().split("\t")
                        query, hit, bitscore = spls[0], \
                            spls[1].replace("lcl|", ""), float(spls[-1])
                        if query != hit and bitscore >= min_bitscore:
                            hit_tuples.append((hit, bitscore))
                out = []  # unique hit ids
                for hit, bitscore in sorted(hit_tuples,
                                            key=lambda x: x[1],
                                            reverse=True):
                    if hit not in out:
                        out.append(hit)
                    if len(out) == max_num_hits:
                        break
                if len(out) == 0:
                    print "Warning: No hits found"
                with open(swipe_outname + ".hits", "w") as outfile:
                    for hit in out:
                        print hit
                        outfile.write(hit + "\n")

    # write output fasta
    outname = query_fasta.replace(".pep.fa", "_swipe.fa")
    print "Writing output fasta", outname
    outfile = open(outname, "w")
    query_seqids = []  # avoid seq id repeats
    with open(query_fasta, "r") as infile:
        for line in infile:
            outfile.write(line)  # copy over query seqs
            if line[0] == ">":
                query_seqids.append(line.strip()[1:])
    for i in os.listdir(DIR):
        if i.endswith(".pep.fa") or i.endswith(".cdhit"):
            seqDICT = {}  # key is seq name, value is seq
            for s in seq.read_fasta_file(DIR + i):
                seqDICT[s.name] = s.seq
            with open(DIR + i + "." +
                      get_filename_from_path(query_fasta).split(".")[0] +
                      ".swipe.hits", "r") as infile:
                for line in infile:
                    line = line.strip()
                    if len(line) > 0 and line not in query_seqids:
                        outfile.write(">" + line + "\n" + seqDICT[line] + "\n")
    outfile.close()
                    if hit not in out:
                        out.append(hit)
                    if len(out) == max_num_hits:
                        break
                if len(out) == 0:
                    print "Warning: No hits found"
                with open(swipe_outname + ".hits", "w") as outfile:
                    for hit in out:
                        print hit
                        outfile.write(hit + "\n")

    # write the output summary files .rawswipe, .filtered_hits and .swipe.fa
    print "Writing output fasta"
    outfile1 = open(outdir + gene_name + ".rawswipe", "w")
    outfile2 = open(outdir + gene_name + ".filtered_hits", "w")
    outfile3 = open(outdir + gene_name + ".swipe.fa", "w")
    seqids_written = []  # avoid seq id repeats
    for s in read_fasta_file(query_fasta):
        seqid, seq = str(s.name), str(s.seq)
        if seqid not in seqids_written:
            outfile3.write(">" + seqid + "\n" + seq + "\n")
            seqids_written.append(seqid)
    for s in read_fasta_file(existing_homolog):
        seqid, seq = str(s.name), str(s.seq)
        if seqid not in seqids_written:
            outfile3.write(">" + seqid + "\n" + seq + "\n")
            seqids_written.append(seqid)
    for i in pepfiles:
        # copy this taxon's raw swipe output into gene_name.rawswipe
        with open(pepdir + i + "." + gene_name + ".swipe", "r") as infile:
            for line in infile:
                outfile1.write(line)
        seqDICT = {
def concatenate(clnDIR, numofsitesFilter, numoftaxaFilter, seqtype, outname):
    """filter cleaned alignments and concatenate"""
    if clnDIR[-1] != "/":
        clnDIR += "/"
    sites_filter = int(numofsitesFilter)
    taxa_filter = int(numoftaxaFilter)
    assert seqtype == "aa" or seqtype == "dna", "Input data type: dna or aa"
    model = "AUTO" if seqtype == "aa" else "DNA"

    print "Filtering ortholog matrices"
    selected = []  # list of alignment file names that pass the filters
    for i in os.listdir(clnDIR):
        if i.endswith(".aln-cln"):
            seqlist = read_fasta_file(clnDIR + i)
            num_seq = len(seqlist)
            num_col = len(seqlist[0].seq)
            if num_seq >= taxa_filter and num_col >= sites_filter:
                selected.append(i)
    print len(selected), "matrices passed the filter"

    print "Getting matrix occupancy stats"
    # key is taxon name, value is [number of matrices the taxon is in,
    # total number of unambiguous characters for this taxon]
    taxon_occupancy = {}
    total_aligned_len = 0  # length of the final concatenated matrix
    if seqtype == "aa":
        cmd = "phyutility -concat -aa -out " + outname + ".nex -in "
    else:
        cmd = "phyutility -concat -out " + outname + ".nex -in "
    for i in selected:
        cmd += clnDIR + i + " "
        seqlist = read_fasta_file(clnDIR + i)
        total_aligned_len += len(seqlist[0].seq)
        for s in seqlist:
            taxonid = get_name(s.name)
            if taxonid not in taxon_occupancy:
                taxon_occupancy[taxonid] = [0, 0]
            taxon_occupancy[taxonid][0] += 1
            taxon_occupancy[taxonid][1] += len(
                (s.seq.replace("-", "")).replace("?", ""))
    cmd += "\n"

    total_ortho = len(selected)
    with open(outname + "_taxon_occupancy_stats", "w") as outfile:
        outfile.write("taxon\t#orthologs\t#total_characters\t"
                      "perc_orthologs\tperc_characters\n")
        sum_char = 0
        for taxon in taxon_occupancy:
            times, chars = taxon_occupancy[taxon]
            sum_char += chars
            out = taxon + "\t" + str(times) + "\t" + str(chars) + "\t"
            out += str(times / float(total_ortho)) + "\t" + \
                str(chars / float(total_aligned_len)) + "\n"
            outfile.write(out)
        total_taxa = len(taxon_occupancy)
        out = "\nSupermatrix dimension " + str(total_taxa) + " taxa, "
        out += str(total_ortho) + " loci and " + \
            str(total_aligned_len) + " aligned columns\n"
        out += "Overall matrix occupancy " + \
            str(sum_char / float(total_taxa * total_aligned_len)) + "\n"
        outfile.write(out)
    print "Supermatrix taxon occupancy stats written to", \
        outname + "_taxon_occupancy_stats"

    print "Waiting for concatenation to finish. This may take several minutes..."
with open(outname + ".temp.sh", "w") as f: f.write(cmd) os.system(cmd) #convert the .nex file to .fasta and .model files for raxml infile = open(outname + ".nex", "r") outfile = open(outname + ".phy", "w") for line in infile: line = line.strip() if len(line) < 10: continue if line[0] == "#" or line[: 5] == "BEGIN" or line[: 6] == "MATRIX" or line == "END;" or line[: 6] == "FORMAT": continue if line[0] == "[": line = line.replace("[", model + ",") line = line.replace(" ]", "") line = line.replace(" cluster", "\n" + model + ",cluster") #line = line.replace(" homolog","\n"+model+",homolog") line = line.replace(" ", "=") with open(outname + ".model", "w") as outfile2: outfile2.write(line.strip() + "\n") #make sure that wc -l will get how many partitions elif line[:10] == "DIMENSIONS": ntax = (line.split("NTAX=")[1]).split(" ")[0] nchar = (line.split("NCHAR=")[1]).replace(";", "") outfile.write(ntax + " " + nchar + "\n") else: spls = line.split("\t") outfile.write(spls[0] + " " + spls[1] + "\n") infile.close() outfile.close() assert os.path.exists(outname + ".phy"), "error concatenate" os.system("rm " + outname + ".temp.sh") print "outfiles written", outname + ".phy", outname + ".model" os.system("rm " + outname + ".nex") #remove intermediate .nex file