def main():
    parser = OptionParser("usage: %prog [options]")
    parser.add_option("-d", "--dir", dest="dataset_dir", help="dir of dataset files")
    parser.add_option("-c", "--c", dest="context_dir", help="dir of context alignment files")
    (options, args) = parser.parse_args()
    dataset_dir = options.dataset_dir
    dataset_dir = check_dirname(dataset_dir)
    context_dir = options.context_dir
    context_dir = check_dirname(context_dir)
    aln_files = glob.glob(dataset_dir + "/*/*best.fas")
    for aln_file in aln_files:
        print(aln_file)
        basename_dataset_dir = aln_file.split(".phy")[0].split("_aln")[0].split(".aln")[0]
        tree = glob.glob(basename_dataset_dir + "[_.A-z]*tree*")[0]
        rooted_tree = glob.glob(basename_dataset_dir + ".r.tree")[0]
        basename = aln_file.split(".phy")[0].split("_aln")[0].split(".aln")[0].split(dataset_dir)[-1]
        context_alns = glob.glob(context_dir + "/%s[_.A-z]*.aln" % basename)
        for context_aln in context_alns:
            context_basename = context_aln.split(".aln")[0]
            ctl_gtr = context_basename + "_gtr.ctl"
            ctl_unr = context_basename + "_unr.ctl"
            output_gtr = context_basename + "_gtr.mlb"
            output_unr = context_basename + "_unr.mlb"
            write_ctl_file(ctl_gtr, context_aln, tree, output_gtr, 7, fix_alpha="1", alpha="0")
            write_ctl_file(ctl_unr, context_aln, rooted_tree, output_unr, 8, fix_alpha="1", alpha="0")

def pipeline_runner(input_dir, output_dir, ref_file, NGS_or_Cirseq, TYPE_OF_INPUT_FILE=None, start=None, end=None,
                    gaps=None, qscore=None, blast=None, rep=None, t=None, alias="pipeline"):
    input_dir = check_dirname(input_dir)
    output_dir = check_dirname(output_dir)
    ref_file = check_filename(ref_file)
    if NGS_or_Cirseq not in [1, 2]:
        raise Exception("NGS_or_Cirseq has to be 1 or 2")
    cmds = "python /sternadi/home/volume1/shared/SternLab/pipeline_runner.py -i %s -o %s -r %s -NGS_or_Cirseq %i" \
           % (input_dir, output_dir, ref_file, NGS_or_Cirseq)
    # note: both TYPE_OF_INPUT_FILE and t are forwarded with the same -t flag, so only one should be provided
    if TYPE_OF_INPUT_FILE is not None:
        cmds += " -t %s" % TYPE_OF_INPUT_FILE
    if start is not None:
        cmds += " -s %i" % start
    if end is not None:
        cmds += " -e %i" % end
    if gaps is not None:
        cmds += " -g %s" % gaps
    if qscore is not None:
        cmds += " -q %i" % qscore
    if blast is not None:
        cmds += " -b %i" % blast
    if rep is not None:
        cmds += " -rep %i" % int(rep)
    if t is not None:
        cmds += " -t %s" % t
    print(cmds)
    cmdfile = pbs_jobs.get_cmdfile_dir("pipeline.txt", alias)
    tnum = 1
    gmem = 2
    pbs_jobs.create_pbs_cmd(cmdfile=cmdfile, alias=alias, jnum=tnum, gmem=gmem, cmds=cmds, load_python=True)
    job_id = pbs_jobs.submit(cmdfile)
    return job_id

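# Minimal usage sketch for pipeline_runner (paths are hypothetical placeholders, not from this repo):
#   job_id = pipeline_runner("/my/data/fastq_dir", "/my/data/pipeline_out", "/my/data/reference.fasta",
#                            NGS_or_Cirseq=1, qscore=30, rep=2, alias="sample1_pipeline")
#   print("submitted PBS job", job_id)
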
def main():
    parser = OptionParser("usage: %prog [options]")
    parser.add_option("-d", "--dir", dest="dir", help="directory with fastml output files")
    parser.add_option("-o", "--output", dest="output", help="output file prefix")
    (options, args) = parser.parse_args()
    dir = options.dir
    dir = check_dirname(dir)
    output = options.output
    if output is None:
        output = "fastml_analysis.csv"
    files = glob.glob(dir + "*prob.marginal.txt")
    basenames = [f.split(".prob.marginal.txt")[0] for f in files]
    df = pd.DataFrame(columns=["Basename", "Mutation", "Branch", "Context", "Mutation_type",
                               "Codon_position", "APOBEC_context_GA", "APOBEC_context_CT"])
    for basename in basenames:
        print(basename)
        prob_marginal = basename + ".prob.marginal.txt"
        seq_marginal = basename + ".seq.marginal.txt"
        tree_ancestor = basename + ".tree.ancestor.txt"
        basename = basename.split("/")[-1]
        positions_to_remove = get_position_to_remove(prob_marginal)
        ancestor_info, seqs = get_sequence_and_ancestry_data(tree_ancestor, seq_marginal)
        df = go_over_positions(ancestor_info, seqs, positions_to_remove, basename, df)
    df.to_csv(dir + output, index=False)

def main():
    parser = OptionParser("usage: %prog [options]")
    parser.add_option("-d", "--dirname", dest="dirname", help="dirname that contains fastml result files")
    parser.add_option("-o", "--output", dest="output", help="output file")
    (options, args) = parser.parse_args()
    dirname = options.dirname
    dirname = check_dirname(dirname)
    output = options.output
    if output is None:
        output = dirname + "/fastml_analysis_output.csv"
    output = check_filename(output, Truefile=False)
    files = glob.glob(dirname + "/*/*.fasta")
    files = []  # note: this reset makes the fasta glob above a no-op, so the seq.joint.txt files are always used
    if files == []:
        files = glob.glob(dirname + "/*seq.joint.txt")
    basenames = [f.split(".")[0] for f in files]
    df = pd.DataFrame(columns=["family", "group", "mutation", "mutation_count_in_context",
                               "context_count_overall", "mutation_count_overall"])
    for basename in basenames:
        df = run_on_basename(basename, df)
    df.to_csv(output, index=False)  # write the collected results to the requested output file

def main():
    parser = OptionParser("usage: %prog [options]")
    parser.add_option("-d", "--dir", dest="dir", help="dir input fasta files")
    parser.add_option("-s", "--subtypes", dest="subtypes",
                      help="subtypes to keep, separated by comma. example A,B")
    (options, args) = parser.parse_args()
    dir = options.dir
    dir = check_dirname(dir)
    subtypes = options.subtypes.split(",")
    input_files = glob.glob(dir + "/*.fasta")
    for file in input_files:
        fasta = open(file, "r").read()
        basename = file.split(".fasta")[0]
        for subtype in subtypes:
            pattern = re.compile(">%s[^>]*" % subtype)
            results = pattern.findall(fasta)
            print("%s: %s: %s" % (file, subtype, str(len(results))))
            fasta_out = "".join(results)
            output_file = basename + "_%s.aln" % subtype
            output = open(output_file, "w")
            output.write(fasta_out)
            output.close()
    print("split HIV files into subtypes")

def tophat2_runner(output_dir, bowtie_reference, fastq, alias="tophat2"):
    """
    tophat2 runner
    :param output_dir: output directory
    :param bowtie_reference: bowtie reference path
    :param fastq: fastq path
    :param alias: job name (tophat2)
    :return: job id
    """
    output_dir = check_dirname(output_dir, Truedir=False)
    bowtie_reference = check_filename(bowtie_reference, Truefile=False)
    fastq = check_filename(fastq)
    cmdfile = pbs_jobs.get_cmdfile_dir("tophat2", alias)
    tnum = 1
    gmem = 2
    cmds = "/sternadi/home/volume1/taliakustin/software/tophat-2.1.1.Linux_x86_64/tophat2" \
           + " -o %s %s %s" % (output_dir, bowtie_reference, fastq)
    pbs_jobs.create_pbs_cmd(cmdfile=cmdfile, alias=alias, jnum=tnum, gmem=gmem, cmds=cmds, load_python=False)
    job_id = pbs_jobs.submit(cmdfile)
    return job_id

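# Usage sketch for tophat2_runner (hypothetical paths; assumes a bowtie index prefix built beforehand):
#   job_id = tophat2_runner("/my/data/tophat_out", "/my/data/bowtie_index/genome", "/my/data/sample.fastq")
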
def selecton_runner(codon_aln, output_dir=None, tree=None, log=None, rate=None, output=None, color=None,
                    out_tree=None, query_seq=None, model="M8", alias="selecton", use_query_seq=False):
    codon_aln = check_filename(codon_aln)
    if output_dir is None:
        base = codon_aln.split(".")[0] + "_selecton"
    else:
        base = check_dirname(output_dir)
        base = base + codon_aln.split("/")[-1].split(".")[0] + "_selecton"
    log = set_filenames_for_pbs_runs(log, base, "log.txt")
    rate = set_filenames_for_pbs_runs(rate, base, "kaks.txt")
    output = set_filenames_for_pbs_runs(output, base, "output.txt")
    color = set_filenames_for_pbs_runs(color, base, "color.txt")
    out_tree = set_filenames_for_pbs_runs(out_tree, base, "output_tree.txt")
    if query_seq is None:
        query_seq = get_longest_sequence_name_in_fasta(codon_aln)
    if model == "M8":
        model = ""
    elif model == "M8a":
        model = "-w1 -Fw"
    elif model == "M7":
        model = "-p1 -Fp"
    if tree is not None:
        tree = check_filename(tree)
        if use_query_seq == False:
            cmds = "selecton -i %s -u %s -l %s -r %s -o %s -c %s -t %s %s" \
                   % (codon_aln, tree, log, rate, output, color, out_tree, model)
        else:
            cmds = "selecton -i %s -u %s -l %s -r %s -o %s -c %s -t %s %s -q %s" \
                   % (codon_aln, tree, log, rate, output, color, out_tree, model, query_seq)
    else:
        if use_query_seq == False:
            cmds = "selecton -i %s -l %s -r %s -o %s -c %s -t %s %s" \
                   % (codon_aln, log, rate, output, color, out_tree, model)
        else:
            cmds = "selecton -i %s -l %s -r %s -o %s -c %s -t %s %s -q %s" \
                   % (codon_aln, log, rate, output, color, out_tree, model, query_seq)
    cmdfile = pbs_jobs.get_cmdfile_dir("selecton.txt", alias)
    tnum = 1
    gmem = 2
    pbs_jobs.create_pbs_cmd(cmdfile=cmdfile, alias=alias, jnum=tnum, gmem=gmem, cmds=cmds)
    job_id = pbs_jobs.submit(cmdfile)
    return job_id

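# Usage sketch for selecton_runner (hypothetical codon alignment and tree; M8a is one of the model
# strings handled above):
#   job_id = selecton_runner("/my/data/gene_codon_aln.fasta", tree="/my/data/gene.tree", model="M8a")
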
def main():
    parser = OptionParser("usage: %prog [options]")
    parser.add_option("-d", "--directory", dest="dir", help="input directory with kaks gaps files")
    parser.add_option("-o", "--output", dest="output", help="output file name")
    parser.add_option("-v", "--virus", dest="virus",
                      help="virus - tells the script how to parse filenames. "
                           "options: influenza, tilv, influenza20, thogoto")
    (options, args) = parser.parse_args()
    dir = options.dir
    output = options.output
    virus = options.virus
    dir = check_dirname(dir)
    output = check_filename(output, Truefile=False)
    files = glob.glob("%s/*kaks*gaps" % dir)
    if len(files) == 0:
        raise Exception("No files in %s" % dir)
    if virus == "influenza":  # add more virus names here if you add any
        output_text = "POS\tAMINO\tKaKs\tconfidence_interval\tvirus\tprotein\n"
    elif virus == "influenza20" or virus == "tilv" or virus == "thogoto":
        output_text = "POS\tAMINO\tKaKs\tconfidence_interval\tvirus\tprotein\tsegment\n"
    else:
        output_text = "POS\tAMINO\tKaKs\tconfidence_interval\n"
    for f in files:
        print(f)
        if virus == "influenza":
            virus_name = "Influenza " + f.split("/")[-1].split("inf")[1].split("_")[0]
            protein = f.split("/")[-1].split("_")[2]
            segment = None
        elif virus == "tilv":
            virus_name = "TiLV"
            protein = "Segment " + f.split("/")[-1].split("_")[0].split("seg")[-1]
            segment = protein
        elif virus == "influenza20":
            virus_name = "Influenza " + f.split("/")[-1].split("Segment")[0].split("_")[1]
            protein = f.split("/")[-1].split("Protein")[1].split("_")[1]
            segment = "Segment " + f.split("Segment")[1].split("_")[1]
        elif virus == "thogoto":
            virus_name = "Thogoto"
            protein = f.split("gene_")[1].split("_")[0]
            segment = "Segment " + f.split("segment_")[1].split("_")[0]
        # if adding more virus types, this is what you have to do:
        # elif virus == "XXXX":
        #     virus_name = XXX
        #     protein = XXX
        else:
            virus_name = None
            protein = None
            segment = None
        output_text += kaks_file_to_txt_delimeted_results(f, virus_name, protein, segment)
    output = open(output, "w")
    output.write(output_text)
    output.close()

def main():
    parser = OptionParser("usage: %prog [options]")
    parser.add_option("-d", "--dir", dest="dir", help="dir of temp files of cirseq pipeline run")
    parser.add_option("-o", "--output", dest="output", help="output folder to save")
    (options, args) = parser.parse_args()
    in_dir = options.dir
    out_dir = options.output
    in_dir = check_dirname(in_dir)
    out_dir = check_dirname(out_dir)
    repeat_summary = get_repeats_num(in_dir)
    make_graph(repeat_summary, out_dir)

def main():
    parser = OptionParser("usage: %prog [options]")
    parser.add_option("-d", "--dir", dest="freqs_dir", help="dir with freqs file")
    (options, args) = parser.parse_args()
    freqs_dir = options.freqs_dir
    freqs_dir = check_dirname(freqs_dir)
    freqs_files = glob.glob(freqs_dir + "/*.freqs")
    with_mutation_files = glob.glob(freqs_dir + "/*_with_mutations.csv")
    if with_mutation_files == []:
        print("Adding mutations to freqs files")
        for freqs_file in freqs_files:
            print(freqs_file)
            output = freqs_file.split(".freqs")[0] + "_with_mutations.csv"
            add_mutation_to_freq_file(output, freqs_file=freqs_file)
            with_mutation_files.append(output)
    segment_files = glob.glob(freqs_dir + "/Segment_[0-9].csv") + glob.glob(freqs_dir + "/Segment_[0-9][0-9].csv")
    if segment_files == []:
        print("Merging segments from different passages")
        for s in range(1, 11):
            specific_segment_mutation_files = glob.glob(freqs_dir + "/P*-S%s_with_mutations.csv" % s)
            segment_file, segment_freqs = merge_freqs_files(specific_segment_mutation_files,
                                                            freqs_dir + "/Segment_%i.csv" % s)
            segment_files.append(segment_file)
    filtered_files = glob.glob(freqs_dir + "/Segment_[0-9]_filtered.csv") + \
                     glob.glob(freqs_dir + "/Segment_[0-9][0-9]_filtered.csv")
    print("Filtering positions from segment csvs")
    for segment_file in segment_files:
        output = segment_file.split(".csv")[0] + "_filtered.csv"
        filtered_file, filtered_freqs = filter_freqs_for_regression_analysis(output, freqs_file=segment_file)
        filtered_files.append(filtered_file)
    merge_dfs(filtered_files, freqs_dir + "/All_segments_filtered_for_regression.csv")

def main():
    parser = OptionParser("usage: %prog [options]")
    parser.add_option("-d", "--dir", dest="dataset_dir", help="dir of dataset files")
    parser.add_option("-o", "--output", dest="output_file", help="output file name")
    (options, args) = parser.parse_args()
    dataset_dir = options.dataset_dir
    dataset_dir = check_dirname(dataset_dir)
    output_file = options.output_file
    output_file = check_filename(output_file, Truefile=False)
    aln_files = glob.glob(dataset_dir + "/*/*best.fas")
    df = pd.DataFrame(columns=["filename", "1", "0.9", "0.8", "0.7", "0.6", "0.5", "0.4", "0.3", "0.2"])
    for aln_file in aln_files:
        consensus, consensus_percentage = get_consensus_percentage(aln_file)
        filename = aln_file.split(dataset_dir)[-1]
        print(aln_file)
        df = df.append({"filename": filename,
                        "1": consensus_percentage[1],
                        "0.9": consensus_percentage[0.9],
                        "0.8": consensus_percentage[0.8],
                        "0.7": consensus_percentage[0.7],
                        "0.6": consensus_percentage[0.6],
                        "0.5": consensus_percentage[0.5],
                        "0.4": consensus_percentage[0.4],
                        "0.3": consensus_percentage[0.3],
                        "0.2": consensus_percentage[0.2]},
                       ignore_index=True)
    df.to_csv(output_file, index=False)

def fastml_runner(alignment, tree, outdir=None, alias="fastml", additional_params=None):
    """
    run fastml from phylogenyCode on cluster
    :param alignment: alignment file path
    :param tree: tree file path
    :param alias: job name (default: fastml)
    :param outdir: output directory for results (default: None - saves in the alignment's dir)
    :return: job id
    """
    alignment = check_filename(alignment)
    tree = check_filename(tree)
    if outdir is None:
        outdir = os.path.dirname(alignment)
    else:
        outdir = check_dirname(outdir)
    basename = os.path.basename(alignment).split(".")[0].split("_aln")[0]
    newick_tree = outdir + "/" + basename + ".tree.newick.txt"
    ancestor_tree = outdir + "/" + basename + ".tree.ancestor.txt"
    joint_seqs = outdir + "/" + basename + ".seq.joint.txt"
    marginal_seqs = outdir + "/" + basename + ".seq.marginal.txt"
    joint_prob = outdir + "/" + basename + ".prob.joint.txt"
    marginal_prob = outdir + "/" + basename + ".prob.marginal.txt"
    cmdfile = pbs_jobs.get_cmdfile_dir("fastml.txt", alias)
    tnum = 1
    gmem = 1
    cmds = "/sternadi/home/volume1/shared/tools/phylogenyCode/programs/fastml/fastml -s %s -t %s -mn -x %s " \
           "-y %s -j %s -k %s -d %s -e %s -qf" % (alignment, tree, newick_tree, ancestor_tree, joint_seqs,
                                                  marginal_seqs, joint_prob, marginal_prob)
    if additional_params is not None:
        cmds += " %s" % additional_params
    pbs_jobs.create_pbs_cmd(cmdfile=cmdfile, alias=alias, gmem=gmem, cmds=cmds)
    job_id = pbs_jobs.submit(cmdfile)
    return job_id

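# Usage sketch for fastml_runner (hypothetical inputs); results are written next to the alignment
# unless outdir is given:
#   job_id = fastml_runner("/my/data/family_aln.fasta", "/my/data/family.tree", alias="fastml_family")
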
def save_all_rooted_trees(tree_file, output_dir):
    """
    saves all possible rooted trees for a given tree
    :param tree_file: input tree path
    :param output_dir: output directory for trees
    :return: rooted file dictionary
    """
    # save all possible rooted trees of a given tree
    tree_file = check_filename(tree_file)
    output_dir = check_dirname(output_dir)
    basename = path.basename(path.splitext(tree_file)[0])
    treefile_out = output_dir + "/" + basename
    tree = Phylo.read(tree_file, "newick")
    clades = list(tree.find_clades())
    out_files = {}
    for clade in clades:
        if clade.name is not None:
            tree.root_with_outgroup(clade)
            if tree.rooted == True:
                outfile = treefile_out + "_%s.txt" % clade.name
                Phylo.write(tree, outfile, "newick")
                # map each outgroup clade name to its rooted tree file
                out_files[clade.name] = {"rooted_file": outfile}
    return out_files

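# Usage sketch for save_all_rooted_trees (hypothetical paths): reroot on every named clade and list the outputs.
#   rooted = save_all_rooted_trees("/my/data/family.tree", "/my/data/rooted_trees")
#   for clade_name, info in rooted.items():
#       print(clade_name, info["rooted_file"])
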
def mlbs_to_df(output, mlbs=[], dirname=None, baltimore=True):
    """
    analyzes mlb files into a dataframe - extracts lnL, base frequencies and substitution matrices
    :param output: output csv file path
    :param mlbs: list of mlb files
    :param dirname: dirname that has mlb files
    :return: output file path
    """
    if mlbs == [] and dirname is None:
        raise Exception("you need to provide mlb or dirname that contains mlbs")
    if mlbs != [] and dirname is not None:
        raise Exception("you need to provide only one - mlb or dirname")
    if dirname is not None:
        dirname = check_dirname(dirname)
        mlbs = glob.glob(dirname + "/*.mlb")
    if mlbs != []:
        mlbs = [check_filename(m) for m in mlbs]
    output = check_filename(output, Truefile=False)
    df = pd.DataFrame(columns=["mlb_file_name", "basename", "family", "protein", "group", "model", "lnL",
                               "freq_T", "freq_C", "freq_A", "freq_G",
                               "TC", "TA", "TG", "CT", "CA", "CG", "AT", "AC", "AG", "GT", "GC", "GA"])
    lnL_1 = re.compile(r"lnL.*")
    lnL_2 = re.compile(r"\-?\d*\.\d*")
    base_1 = re.compile(r"Base frequencies.*")
    base_2 = re.compile(r"0.\d+")
    rate_1 = re.compile(r"Rate matrix Q.*\n.*\n.*\n.*\n.*", re.IGNORECASE)
    rate_2 = re.compile(r"\d+.\d+")
    for mlb_file_name in mlbs:
        basename = mlb_file_name.split("/")[-1].split(".mlb")[0]
        print(mlb_file_name)
        family = mlb_file_name.split("/")[-1].split("_")[0]
        protein = mlb_file_name.split("/")[-1].split(family + "_")[1].split(".")[0]
        filename = mlb_file_name.split("/")[-1].split(family + "_")[-1].split(".mlb")[0]
        # if "_gtr" in filename or "_unrest" in filename:
        #     filename = filename.split("_gtr")[0]
        #     filename = filename.split("_unrest")[0]
        model = mlb_file_name.split(".mlb")[-1]
        mlb = open(mlb_file_name, "r").read()
        # log-likelihood
        L = lnL_1.findall(mlb)
        if len(L) != 1 or "nan" in L[0]:
            L = None
        else:
            L = float(lnL_2.findall(L[0])[0])
        # base frequencies
        B = base_1.findall(mlb)
        if len(B) != 1 or "nan" in B[0]:
            freq_T = freq_C = freq_A = freq_G = None
        else:
            B = base_2.findall(B[0])
            freq_T = float(B[0])
            freq_C = float(B[1])
            freq_A = float(B[2])
            freq_G = float(B[3])
        # substitution rate matrix
        R = rate_1.findall(mlb)
        if len(R) != 1 or "nan" in R[0]:
            TC = TA = TG = CT = CA = CG = AT = AC = AG = GT = GC = GA = None
        else:
            R = R[0].split("\n")
            first = rate_2.findall(R[1])
            TC, TA, TG = first[1], first[2], first[3]
            second = rate_2.findall(R[2])
            CT, CA, CG = second[0], second[2], second[3]
            third = rate_2.findall(R[3])
            AT, AC, AG = third[0], third[1], third[3]
            fourth = rate_2.findall(R[4])
            GT, GC, GA = fourth[0], fourth[1], fourth[2]
        row = {"mlb_file_name": mlb_file_name, "basename": basename, "family": family, "protein": protein,
               "group": filename, "model": model, "lnL": L,
               "freq_T": freq_T, "freq_C": freq_C, "freq_A": freq_A, "freq_G": freq_G,
               "TC": TC, "TA": TA, "TG": TG, "CT": CT, "CA": CA, "CG": CG,
               "AT": AT, "AC": AC, "AG": AG, "GT": GT, "GC": GC, "GA": GA}
        if baltimore == True:
            row["baltimore"] = get_baltimore_classifiaction(family)
        df = df.append(row, ignore_index=True)
    df.to_csv(output)
    return output

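# Usage sketch for mlbs_to_df (hypothetical directory of .mlb output files):
#   mlbs_to_df("/my/data/mlb_summary.csv", dirname="/my/data/mlb_files", baltimore=False)
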
def main(args):
    pipeline_path = "/sternadi/home/volume1/shared/SternLab/pipeline/runner.pl"
    NGS_or_Cirseq = args.NGS_or_Cirseq
    print("Pipeline to run: %s" % pipeline_path)
    input_dir = args.input_dir
    input_dir = check_dirname(input_dir)
    output = args.output
    output = check_dirname(output, Truedir=False)
    reference = args.reference
    reference = check_filename(reference)
    start = args.start
    end = args.end
    if start not in [1, 2, 3]:
        raise Exception("Not a valid start step - needs to be between 1:3")
    if end not in [2, 3, 4]:
        raise Exception("Not a valid end step - needs to be between 2:4")
    type_of_input_file = args.type_of_input_file
    gaps = args.gaps
    if gaps not in ["Y", "N"]:
        raise Exception("Not a valid gap - must be Y or N")
    q_score = args.q_score
    if q_score is None:
        if NGS_or_Cirseq == 1:
            q_score = 30
        else:
            q_score = 23
    blast_id = args.blast_id
    path_to_save_pipeline_summary = output + "/pipeline_summary.txt"
    print(start, end, q_score, blast_id, NGS_or_Cirseq)
    cmd = "perl {} {} {} {} {} {} {} {} {} {} {}".format(pipeline_path, input_dir, output, reference, start, end,
                                                         type_of_input_file, gaps, NGS_or_Cirseq, q_score, blast_id)
    print("running this pipeline command:")
    print(cmd)
    os.system(cmd)
    # get additional statistics about this run
    # move into the pipeline tmp directory so the shell pipelines below see the *stats and *blast files
    os.chdir(os.path.join(input_dir, "tmp"))
    # number of reads that were mapped only once
    only_once_reads = subprocess.check_output(
        "grep -P '^1\t' *stats -h | awk '{sum+=$2}END{print sum}'", shell=True)
    # number of reads that are "contributing to frequency counts"
    freq_contr = subprocess.check_output(
        "grep 'reads contributing to frequency counts' -h *stats | awk '{sum+=$1}END{print sum}'", shell=True)
    # number of bases called
    num_based_called = subprocess.check_output(
        "grep 'num bases called' *stats | awk -F = '{sum+=$2}END{print sum}'", shell=True)
    # number of reads that were mapped to the reference
    num_reads_mapped = subprocess.check_output(
        "cat *blast | awk '{print $1}' | sort | uniq | wc -l", shell=True)
    with open(path_to_save_pipeline_summary, "w") as o:
        o.write("---- Pipeline running -----\n")
        o.write("{}\n\n".format(datetime.datetime.now()))
        o.write("Pipeline command used: {}\n\n".format(cmd))
        o.write("Number of reads that were mapped only once: {}\n".format(only_once_reads))
        o.write("Number of reads that are contributing to frequency count: {}\n".format(freq_contr))
        o.write("Number of bases called: {}\n".format(num_based_called))
        o.write("Number of reads mapped to reference: {}\n".format(num_reads_mapped))
    # create a simple coverage plot
    freq_file_path = os.path.join(input_dir, [f for f in os.listdir(input_dir) if f.endswith("freq")][0])
    freq_file_path = check_filename(freq_file_path)
    label = os.path.basename(freq_file_path).split('.')[0]
    df = pd.read_csv(freq_file_path)
    df = df.drop_duplicates("Pos")
    plt.plot(df['Pos'].values, df['Read_count'].values, label=label, color='darkorange')
    plt.title("Coverage {}".format(label), fontsize=16)
    plt.xlabel("Position in the genome (bp)")
    plt.ylabel("Read count")
    plt.savefig(os.path.join(input_dir, 'coverage.png'), format='png')
    print("Ran pipeline")

def main(args):
    pipeline_path = "/sternadi/home/volume1/shared/SternLab/pipeline/runner.pl"
    NGS_or_Cirseq = args.NGS_or_Cirseq
    print("Pipeline to run: %s" % pipeline_path)
    input_dir = args.input_dir
    input_dir = check_dirname(input_dir)
    output = args.output
    output = check_dirname(output, Truedir=False)
    reference = args.ref
    reference = check_filename(reference)
    start = args.start
    end = args.end
    if start not in [1, 2, 3]:
        raise Exception("Not a valid start step - needs to be between 1:3")
    if end not in [2, 3, 4]:
        raise Exception("Not a valid end step - needs to be between 2:4")
    type_of_input_file = args.type_of_input_file
    gaps = args.gaps
    if gaps not in ["Y", "N"]:
        raise Exception("Not a valid gap - must be Y or N")
    q_score = args.q_score
    if q_score is None:
        if NGS_or_Cirseq == 1:
            q_score = 30
        else:
            q_score = 23
    blast_id = args.blast
    evalue = args.evalue
    repeats = args.repeats
    if repeats <= 0:
        raise Exception("Number of repeats should be a positive integer, entered a non-positive value")
    if repeats > 1 and NGS_or_Cirseq == 1:
        print("WARNING:: running NGS mapping with more than 1 repeat")
    if repeats == 1 and NGS_or_Cirseq == 2:
        print("WARNING:: running CirSeq mapping with 1 repeat")
    # prefix = args.prefix
    path_to_save_pipeline_summary = output + "/pipeline_summary.txt"
    print(start, end, q_score, blast_id, NGS_or_Cirseq)
    cmd = "perl {} {} {} {} {} {} {} {} {} {} {} {} {}".format(pipeline_path, input_dir, output, reference, start,
                                                               end, type_of_input_file, gaps, NGS_or_Cirseq,
                                                               q_score, blast_id, evalue, repeats)
    print("running this pipeline command:")
    print(cmd)
    os.system(cmd)
    # get additional statistics about this run
    os.chdir(os.path.join(output, "tmp"))
    os.system("pwd")
    # number of reads that were mapped only once
    only_once_reads = subprocess.getoutput("grep -P '^1\t' *stats -h | awk '{sum+=$2}END{print sum}'")
    # number of reads that were mapped exactly twice
    twice_mapped_reads = subprocess.getoutput("grep -P '^2\t' *stats -h | awk '{sum+=$2}END{print sum}'")
    # number of reads that are "contributing to frequency counts"
    freq_contr = subprocess.getoutput(
        "grep 'reads contributing to frequency counts' -h *stats | awk '{sum+=$1}END{print sum}'")
    # number of bases called
    num_based_called = subprocess.getoutput("grep 'num bases called' *stats | awk -F = '{sum+=$2}END{print sum}'")
    # number of reads that were mapped to the reference
    num_reads_mapped = subprocess.getoutput("cat *blast | awk '{print $1}' | sort | uniq | wc -l")
    # total number of reads
    num_reads = subprocess.getoutput("cat *fasta | grep '^>' | wc -l")
    with open(path_to_save_pipeline_summary, "w") as o:
        o.write("---- Pipeline running -----\n")
        o.write("{}\n\n".format(datetime.datetime.now()))
        o.write("Pipeline command used:\n{}\n\n".format(cmd))
        o.write("Blast parameters: %id for blast = {}, E value = {}\n".format(blast_id, evalue))
        o.write("Number of repeats used: {}\n".format(repeats))
        o.write("Number of reads: {}\n".format(num_reads))
        o.write("Number of reads mapped to reference: {}\n".format(num_reads_mapped))
        o.write("Number of reads that were mapped only once: {}\n".format(only_once_reads))
        o.write("Number of reads that were mapped exactly twice: {}\n".format(twice_mapped_reads))
        o.write("Number of reads that are contributing to frequency count: {}\n".format(freq_contr))
        o.write("Number of bases called: {}\n".format(num_based_called))
    # get back to the freq file directory
    os.chdir(output)
    # create a simple coverage plot
    freq_file_path = os.path.join(output, [f for f in os.listdir(output) if ".freq" in f][0])
    freq_file_path = check_filename(freq_file_path)
    label = os.path.basename(freq_file_path).split('.')[0]
    df = pd.read_csv(freq_file_path, sep='\t')
    df = df[(df.Ref != '-') & (df.Ref == df.Base)].drop_duplicates("Pos")
    plt.plot(df['Pos'].values, df['Read_count'].values, label=label, color='darkorange')
    plt.title("Coverage {}".format(label), fontsize=16)
    plt.xlabel("Position in the genome (bp)")
    plt.ylabel("Read count")
    plt.savefig(os.path.join(output, 'coverage.png'), format='png')
    print("Ran pipeline")

def r4s_runner(tree_file, seq_file, outfile, dirname, tree_outfile=None, unormelized_outfile=None, log_outfile=None,
               ref_seq=None, n_categories=4, alias="r4s"):
    """
    run rate4site on cluster
    :param tree_file: input tree file path
    :param seq_file: input sequence file path
    :param outfile: outfile path
    :param dirname: dirname for output files
    :param tree_outfile: output tree file path (default: None)
    :param unormelized_outfile: unnormalized rates output file (default: None)
    :param log_outfile: output log file (default: None)
    :param alias: job name (default: r4s)
    :return: job id
    """
    tree_file = check_filename(tree_file)
    seq_file = check_filename(seq_file)
    dirname = check_dirname(dirname)
    if tree_outfile is not None:
        tree_outfile = check_filename(tree_outfile, Truefile=False)
    else:
        tree_outfile = dirname + "/out-tree"
    if unormelized_outfile is not None:
        unormelized_outfile = check_filename(unormelized_outfile, Truefile=False)
    else:
        unormelized_outfile = dirname + "/out-unormelized"
    if log_outfile is not None:
        log_outfile = check_filename(log_outfile, Truefile=False)
    else:
        log_outfile = dirname + "/out-log"
    cmdfile = pbs_jobs.get_cmdfile_dir("r4s_cmd.txt", alias)
    tnum = 1
    gmem = 2
    ref_seq_parameter = " -a " + ref_seq if ref_seq is not None else ""
    if tree_file is not None:
        cmds = "/sternadi/home/volume1/shared/tools/rate4site" \
               + " -t " + tree_file \
               + " -s " + seq_file \
               + " -o " + outfile \
               + ref_seq_parameter \
               + " -x " + tree_outfile \
               + " -y " + unormelized_outfile \
               + " -V 10" \
               + " -l " + log_outfile \
               + " -Mh -k " + str(n_categories)  # n_categories may be an int, so cast before concatenating
    else:
        cmds = "/sternadi/home/volume1/shared/tools/rate4site" \
               + " -s " + seq_file \
               + " -o " + outfile \
               + ref_seq_parameter \
               + " -x " + tree_outfile \
               + " -y " + unormelized_outfile \
               + " -V 10" \
               + " -l " + log_outfile \
               + " -Mh -k " + str(n_categories)
    pbs_jobs.create_pbs_cmd(cmdfile=cmdfile, alias=alias, jnum=tnum, gmem=gmem, cmds=cmds)
    job_id = pbs_jobs.submit(cmdfile)
    return job_id

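# Usage sketch for r4s_runner (hypothetical inputs; the reference sequence name is a placeholder
# assumed to exist in the alignment):
#   job_id = r4s_runner("/my/data/family.tree", "/my/data/family_aln.fasta", "/my/data/r4s.out",
#                       "/my/data/r4s_out_dir", ref_seq="NC_000001", n_categories=16)
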