def __init__(self, args, confs, funcs, output_dir):
    """Set up the pipeline: register step functions and their directories.

    Args:
        args: dict of pipeline arguments shared by every step.
        confs: dict of tool configurations shared by every step.
        funcs: ordered list of (step_name, step_function) pairs.
        output_dir: directory under which each step's results are stored.

    Sets self.ready to False when output_dir cannot be created.
    """
    self.nSteps = len(funcs)
    self.outputdir = os.path.abspath(output_dir)
    self.args = args
    self.confs = confs
    self.stepFuncs = {}
    self.order = list()
    self.stepDir = {}
    self.tempDir = {}
    # Map each step name to its function, its numbered results directory
    # ("step_<i>-<name>") and a "-tmp" sibling used while the step runs.
    for i, pair in enumerate(funcs):
        step_name = pair[0]
        self.stepFuncs[step_name] = pair[1]
        self.order.append(step_name)
        dir_name = self.outputdir + "/step_" + str(i + 1) + "-" + step_name
        self.stepDir[step_name] = dir_name
        self.tempDir[step_name] = dir_name + "-tmp"
    # Create the output directory with the stdlib instead of shelling out
    # to "mkdir"; exist_ok covers the already-exists case.
    try:
        if not os.path.exists(self.outputdir):
            print("Creating " + self.outputdir)
        os.makedirs(self.outputdir, exist_ok=True)
        self.ready = True
    except OSError:
        print("No permission to create " + self.outputdir + ", cannot proceed.")
        self.ready = False
def get_infernal_output(output_dir, output_file):
    """Merge per-chunk infernal result files into single .tsv/.out files.

    Concatenates every ".tsv" (tabular) and ".out" (verbose) file found in
    output_dir, stripping '#' comment lines from the merged results.

    Returns True when .tsv results were found and merged, False otherwise.
    """
    # Cut a literal ".tsv" suffix; str.rstrip(".tsv") would strip any run of
    # the characters '.', 't', 's', 'v' from the end, mangling the name.
    if output_file.endswith(".tsv"):
        base = output_file[:-len(".tsv")]
    else:
        base = output_file
    paths = getFilesWith(output_dir, ".tsv")
    paths2 = getFilesWith(output_dir, ".out")
    if len(paths) > 0:
        if len(paths2) > 0:
            results2 = " ".join(paths2)
            runCommand("cat " + results2 + " > " + base + ".out")
            erase_comments(base + ".out")
        results = " ".join(paths)
        # Write the merged table to output_file itself; the previous code
        # appended an extra ".tsv" here but then ran erase_comments on
        # output_file, so the concatenated file was never cleaned.
        runCommand("cat " + results + " > " + output_file)
        erase_comments(output_file)
        return True
    else:
        print("No results ready yet")
        return False
def run_trnascan(args, confs, tmpDir, stepDir):
    """Run tRNAscan-SE on the genome link; True when the tool exits cleanly.

    Writes the raw predictions and the statistics report into tmpDir.
    """
    trnascan_bin = confs["tRNAscan-SE"]
    raw_out = tmpDir + "/trna_raw.txt"
    stats_out = tmpDir + "/trna_stats.txt"
    parts = [trnascan_bin, "-o", raw_out, "-m", stats_out, args["genome_link"]]
    return runCommand(" ".join(parts)) == 0
def split_genome(args, confs, tmpDir, stepDir):
    """Split the input genome fasta into ~100 part files of similar size.

    Creates a symlink to the genome at args["genome_link"], reads every
    sequence, distributes whole sequences into up to n parts of roughly
    equal total length, and writes each part to
    args["data_dir"]/genome_parts/<i>.fasta.

    Returns True on success, False when args has no "genome" path.
    """
    output_dir = args["data_dir"] + "/genome_parts"
    if not "genome" in args:
        print("Cannot run annotation without a path to a genome.")
        return False
    fasta_path = os.path.abspath(args["genome"])
    # Target number of parts.
    n = 100
    #creating shortcut to genome fasta
    runCommand("ln -s " + fasta_path + " " + args["genome_link"])
    fasta_path = args["genome_link"]
    print("Reading input fasta")
    seqs = readSeqsFromFasta(fasta_path)
    # Each entry is presumably a (header, sequence) pair — TODO confirm
    # against readSeqsFromFasta.
    total_length = sum([len(entry[1]) for entry in seqs])
    print("Total length:" + str(total_length))
    max_length = int(total_length / n)
    current_length = 0
    part = []
    parts = []
    print("Spliting parts of fasta")
    cumulative = 0
    parts_done = 0
    for seq in seqs:
        # Re-balance the per-part size target from the remaining length so
        # later parts absorb earlier overshoot.
        if n > parts_done:
            max_length = int((total_length - cumulative) / (n - parts_done))
        # Close the current part once it reached the size target; a
        # sequence is never split across two parts.
        if ((current_length >= max_length)):
            parts.append(part)
            parts_done += 1
            part = []
            current_length = 0
        part.append(shortFastaHeader(seq))
        cumulative += len(seq[1])
        current_length += len(seq[1])
    # Flush the final, possibly undersized, part.
    if len(part) > 0:
        parts.append(part)
    file_names = [
        output_dir + "/" + str(i) + ".fasta" for i in range(len(parts))
    ]
    runCommand("mkdir " + output_dir)
    print("Writing fasta files")
    for i in range(len(parts)):
        writeFastaSeqs(parts[i], file_names[i])
    return True
def blast(query, db, max_evalue = 0.001, threads=8, blast_type="blastn", output = "results.tsv"):
    """Blast `query` against database `db`; True when BLAST exits cleanly.

    Results are written in tabular format (outfmt 6) to `output`.
    The thread count is capped at 8.
    """
    threads = min(threads, 8)
    print("Blasting query to DB")
    outfmt = ("'6 qaccver saccver pident length mismatch"
              " gapopen qstart qend sstart send evalue bitscore qcovs'")
    pieces = [blast_type, "-db", db, "-query", query,
              "-evalue", str(max_evalue), "-num_threads", str(threads),
              "-outfmt", outfmt, "-out", output]
    return runCommand(" ".join(pieces)) == 0
ap.add_argument( "-edb", "--extra-db", required=False, default=None, help= ("Add extra ncRNA databases for this run. Sintax: -edb db_name:db_path;db_name2:db_path2" )) return vars(ap.parse_args()) #parsing arguments cmdArgs = getArgs() outputdir = os.path.abspath(cmdArgs["output"]) if not os.path.exists(outputdir): runCommand("mkdir " + outputdir) argsfile = outputdir + "/args.json" args = {} if os.path.exists(argsfile): with open(argsfile, "r") as input_stream: content = "\n".join(input_stream.readlines()) args = eval(content) for arg in cmdArgs: if cmdArgs[arg] is not None: args[arg] = cmdArgs[arg] if not "best_hits" in args: args["best_hits"] = "False" #if not os.path.isfile(inputFasta):
def erase_comments(path):
    """Remove comment lines (those starting with '#') from `path`, in place.

    Replaces the previous "grep -v '^#' > tmp; mv tmp path" shell pipeline
    with stdlib file operations: the filtered content is written to a
    temporary sibling file first, then atomically moved over the original.
    """
    tmp = path + ".tmp"
    with open(path) as src, open(tmp, "w") as dst:
        for line in src:
            if not line.startswith("#"):
                dst.write(line)
    os.replace(tmp, path)
def infernal(fasta, cmscan, rfam, threads):
    """Run cmscan on one fasta chunk, renaming the chunk to track progress.

    The chunk is renamed to "<fasta>_running" while cmscan works; on
    success it becomes "<fasta>_done", on failure the original name is
    restored so the chunk can be retried. Results go to "<base>.tsv"
    (tblout) and "<base>.out" (full output).
    """
    # str.rstrip(".fasta") strips a character SET from the right, mangling
    # names (e.g. "seqs.fasta" -> "seq"); cut the literal suffix instead.
    if fasta.endswith(".fasta"):
        base = fasta[:-len(".fasta")]
    else:
        base = fasta
    output_name = base + ".tsv"
    output_name2 = base + ".out"
    new_fasta = fasta + "_running"
    runCommand("mv " + fasta + " " + new_fasta)
    cmd = (cmscan + " -o " + output_name2 + " --tblout " + output_name
           + " -E 0.01 --acc --cpu " + str(threads)
           + " " + rfam + " " + new_fasta)
    # Remove stale results from a previous interrupted run.
    runCommand("rm " + output_name)
    runCommand("rm " + base + ".out")
    code = runCommand(cmd)
    if code != 0:
        # Failed: restore the original name so the chunk can be retried.
        runCommand("mv " + new_fasta + " " + fasta)
    else:
        # Success: mark the chunk as done.
        runCommand("mv " + new_fasta + " " + fasta + "_done")
def run(self, start_from="-1", stop_at="-1"):
    """Run the pipeline's steps in order, resuming and stopping on request.

    Args:
        start_from: 1-based step number (int or numeric string) or a step
            name; "-1" (default) auto-resumes after completed steps.
        stop_at: 1-based step number or step name of the last step to run;
            "-1" (default) runs through the final step.
    """
    print("Running pipeline")
    # Accept either a step number or a step name for both bounds; names
    # are translated via get_step_order (defined outside this view,
    # presumably returning a 0-based index — TODO confirm).
    try:
        int(start_from)
        start_from = int(start_from)
    except ValueError:
        start_from = self.get_step_order(start_from) + 1
    try:
        int(stop_at)
        stop_at = int(stop_at)
    except ValueError:
        stop_at = self.get_step_order(stop_at)
    if not self.ready:
        # __init__ could not create the output directory.
        print("Not ready to start pipeline.")
        return
    startingStep = 1
    if start_from > 0:
        # Explicit starting step requested.
        startingStep = start_from
        if startingStep > self.nSteps:
            sys.exit("This step does not exist")
        elif startingStep > 1:
            # Refuse to start mid-pipeline when the previous step's results
            # directory is missing (get_dir appears to take a 0-based index).
            if not os.path.exists(self.get_dir(startingStep - 2)):
                sys.exit("The previous step to Step " + str(startingStep) +
                         " has not been done yet.")
        print("Starting from " + str(startingStep))
    else:
        # Auto-resume: skip every step whose results directory exists.
        # NOTE(review): assumes self.stepDir preserves insertion order and
        # that completed steps form a prefix of the pipeline — confirm.
        for name, path in self.stepDir.items():
            if os.path.exists(path):
                print("Skipping step " + str(startingStep))
                startingStep += 1
    # Determine the last step to run (1-based, inclusive).
    limit = len(self.stepFuncs)
    if stop_at > 0:
        stopAt = stop_at
        if stopAt > 0:
            limit = stopAt
            print("Stoping at " + str(limit))
    for i in range(startingStep - 1, limit):
        print(str(i))
        step = self.get_step_name(i)
        print("--- STEP " + str(i + 1) + ": " + step + " ---")
        # Each step writes into a fresh temporary directory; it is only
        # promoted to the permanent directory if the step succeeds.
        if os.path.exists(self.tempDir[step]):
            runCommand("rm -Rf " + self.tempDir[step])
        runCommand("mkdir " + self.tempDir[step])
        print(self.tempDir[step])
        # Run the step; the whole stepDir mapping is passed so a step can
        # locate the results of earlier steps.
        success = self.stepFuncs[step](self.args, self.confs,
                                       self.tempDir[step], self.stepDir)
        if success:
            # Promote: replace any stale permanent dir with the temp dir.
            if os.path.exists(self.stepDir[step]):
                runCommand("rm -Rf " + self.stepDir[step])
            runCommand("mv " + self.tempDir[step] + " " + self.stepDir[step])
        else:
            print("Step " + str(i + 1) + " was not successful.")
            break
def blast_annotate(query, db, output_dir, max_evalue = 0.0000000001, threads=8,
                   blast_type="blastn", db_id_sep_char=" ", source="db_name",
                   remaining_fasta="auto_name", run_blast=True,
                   alternative_outputdir=None):
    """Blast `query` sequences against `db` and annotate near-identical hits.

    Builds (if needed) a nucleotide BLAST database for `db`, runs the
    search, keeps alignments with >99% identity and >0.99 query coverage,
    selects best mappings per query via get_best_mapping, and writes them
    as "transcript" features to "<output_dir>/<query>.to.<db>_found.gff".

    Returns (True, gff_path) on success, (False, "") when database
    creation or the BLAST run fails.
    """
    import os
    import pandas as pd
    print("Blasting query to DB")
    # Only build the BLAST database if its .nhr index is absent.
    create_db = True
    if os.path.exists(db+".nhr"):
        create_db = False
    if create_db:
        cmd = " ".join(["makeblastdb -in " + db + " -dbtype nucl"])
        code = runCommand(cmd)
        if code != 0:
            print("Could not create database for given genome.")
            return False, ""
    db_name = os.path.basename(db).split(".")[0]
    # source="db_name" is a sentinel: use the database's basename as the
    # GFF "source" column.
    if source == "db_name":
        source = db_name
    query_name = os.path.basename(query).split(".")[0]
    search_name = query_name+".to."+db_name
    output = output_dir + "/"+search_name+"_results.tsv"
    cmd = " ".join([blast_type, "-db", db, "-query", query, "-out", output,
                    "-outfmt",
                    "'6 qaccver saccver pident length mismatch gapopen qstart qend sstart send evalue bitscore qcovs'",
                    "-num_threads", str(threads), "-evalue", str(max_evalue)])
    #cmd = " ".join([blast_type, "-d", db, "-i", query, "-e", str(max_evalue), "-a",
    #    str(threads), "-outfmt 1", "-o", output])
    if run_blast:
        code = runCommand(cmd)
        if code != 0:
            return False, ""
    else:
        # Reuse a previous run's results from an alternative directory.
        if alternative_outputdir != None:
            output = alternative_outputdir + "/"+search_name+"_results.tsv"
    print("Reading full seq names from DB")
    # Map each DB sequence id (text before db_id_sep_char) to the rest of
    # its header. Currently unused below (see commented-out full_name code).
    rnas = getFastaHeaders(db)
    full_names = dict()
    for entry in rnas:
        parts = entry.split(db_id_sep_char)
        full_names[parts[0]] = " ".join(parts[1:])
    print("Parsing blast output")
    # Query lengths are needed to compute alignment coverage.
    seqs = readSeqsFromFasta(query)
    seq_lens = {}
    for seq in seqs:
        seq_lens[seq[0].rstrip("\n").lstrip(">")] = len(seq[1])
    example_sequence_names = list(seq_lens.keys())[:5]
    print("Some sequence keys: " + str(example_sequence_names))
    # NOTE(review): the outfmt above requests 13 columns (incl. qcovs) but
    # only 12 names are given here, so columns may be mis-assigned — verify
    # against an actual results file.
    blast_df = pd.read_csv(output, sep='\t', header=None, index_col=False,
                           names=["qseqid", "sseqid", "pident", "length",
                                  "mismatch", "gapopen", "qstart", "qend",
                                  "sstart", "send", "evalue", "bitscore"])
    blast_df = blast_df.astype({"pident": 'float32', "length": 'int32',
                                "mismatch": 'int32', "gapopen": 'int32',
                                "qstart": 'int32', "qend": 'int32',
                                "sstart": 'int32', "send": 'int32',
                                "evalue": 'float64', "bitscore": 'float64'})
    print(str(blast_df.head()))
    print("Calculating coverage")
    # Coverage = aligned length relative to the full query length.
    blast_df["coverage"] = blast_df.apply(lambda row: row["length"] / seq_lens[row['qseqid']], axis=1)
    best_hits = dict()
    print(str(len(blast_df)) + " alignments")
    # Near-identity thresholds for calling a hit a genome mapping.
    min_coverage = 0.99
    min_pid = 99
    print("Filtering blast results")
    blast_df = blast_df[blast_df["pident"] > min_pid]
    print(str(len(blast_df)) + " alignments filtered by pident")
    blast_df = blast_df[blast_df["coverage"] > min_coverage]
    print(str(len(blast_df)) + " alignments filtered by coverage")
    print("Choosing best hits")
    unique = 0
    # NOTE(review): grouping by a list makes `name` a 1-tuple in newer
    # pandas, which would break the name+"."+str(i) keys below — confirm
    # the pandas version this runs against.
    for name, hits in blast_df.groupby(["qseqid"]):
        # get_best_mapping (defined outside this view) apparently returns an
        # indexable collection of hit rows, or None.
        hit = get_best_mapping(hits)
        if hit != None:
            for i in range(len(hit)):
                best_hits[name+"."+str(i)] = hit[i]
            unique += 1
    print(str(unique) + " transcripts with genome mapping.")
    print(str(len(best_hits.keys())) + " total mappings.")
    print("Writing gff file about seqs identified")
    rows = []
    for name in best_hits:
        hit = best_hits[name]
        '''print(str(hit))
        print(type(str(hit)))
        print(hi)
        print(type(hit["sstart"]))
        print(type(hit["sstart"].item()))'''
        # GFF start/end must be ordered regardless of strand.
        int_sstart = int(hit["sstart"])
        int_send = int(hit["send"])
        start = min(int_sstart, int_send)
        end = max(int_sstart, int_send)
        #full_name = "."
        #if hit["sseqid"] in full_names:
        #    full_name = full_names[hit["sseqid"]]
        row = {"seqname": hit["sseqid"], "source": source,
               "feature": "transcript", "start": str(start), "end":str(end),
               "score": ".",
               "strand": get_strand(int(hit["qstart"]),int(hit["qend"]),int(hit["sstart"]),int(hit["send"])),
               "frame": ".",
               "attribute":"ID="+name+";evalue="+str(hit["evalue"])
               +";coverage="+str(hit["coverage"])+";pident="+str(hit["pident"])}
        rows.append(row)
    gff = pd.DataFrame(rows, columns = ["seqname", "source", "feature",
                                        "start", "end", "score", "strand",
                                        "frame", "attribute"])
    gff_name = output_dir+"/"+search_name+"_found.gff"
    gff.to_csv(gff_name, sep="\t", index=False, header = False)
    print(str(len(seqs)) + " transcripts analyzed.")
    print(str(len(gff)) + " mappings detected and annotated on " + gff_name)
    '''print("Writing fasta files.")
    known = set([raw_name.split(".")[0] for raw_name in gff["seqname"].unique().tolist()])
    print("Some known sequences: " + str(list(known)[:5]))
    print("Some sequences: " + str([x[0] for x in seqs[:5]]))
    knownSeqs, unknownSeqs = filterSeqs(seqs, known)
    fasta_name = output_dir + "/" + remaining_fasta
    if remaining_fasta=="auto_name":
        fasta_name = output_dir+"/"+search_name+"_missing.fasta"
    writeFastaSeqs(unknownSeqs, fasta_name)
    writeFastaSeqs(knownSeqs, gff_name.rstrip("gff")+"fasta")
    print(str(len(gff)) + " detected and annotated on " + gff_name)
    print(str(len(unknownSeqs)) + " unknown seqs remaining on " + fasta_name)'''
    return True, gff_name
def writeFastaWithUniqueHeaders(input_fasta, base_name="Contig"):
    """Rewrite input_fasta in place with unique sequence headers.

    A copy of the original file is kept at "<input_fasta>.old_names"
    before the headers are replaced.
    """
    original_seqs = readSeqsFromFasta(input_fasta)
    backup_path = input_fasta + ".old_names"
    runCommand("cp " + input_fasta + " " + backup_path)
    writeSeqsWithUniqueHeaders(input_fasta, original_seqs, base_name=base_name)