# Common imports for the snippets below. check_dirname, check_filename,
# pbs_jobs, write_ctl_file, set_filenames_for_pbs_runs and the other bare
# helper names are SternLab utilities that each original script imported
# from the shared repo.
import datetime
import glob
import os
import re
import subprocess
from optparse import OptionParser
from os import path

import matplotlib.pyplot as plt
import pandas as pd
from Bio import Phylo


def main():
    parser = OptionParser("usage: %prog [options]")
    parser.add_option("-d",
                      "--dir",
                      dest="dataset_dir",
                      help="dir of dataset files")
    parser.add_option("-c",
                      "--c",
                      dest="context_dir",
                      help="dir of context alignent files")
    (options, args) = parser.parse_args()

    dataset_dir = options.dataset_dir
    dataset_dir = check_dirname(dataset_dir)

    context_dir = options.context_dir
    context_dir = check_dirname(context_dir)

    aln_files = glob.glob(dataset_dir + "/*/*best.fas")
    for aln_file in aln_files:
        print(aln_file)

        basename_dataset_dir = aln_file.split(".phy")[0].split(
            "_aln")[0].split(".aln")[0]
        tree = glob.glob(basename_dataset_dir + "[_.A-Za-z]*tree*")[0]
        rooted_tree = glob.glob(basename_dataset_dir + ".r.tree")[0]
        basename = aln_file.split(".phy")[0].split("_aln")[0].split(
            ".aln")[0].split(dataset_dir)[-1]
        context_alns = glob.glob(context_dir + "/%s[_.A-Za-z]*.aln" % basename)

        for context_aln in context_alns:
            context_basename = context_aln.split(".aln")[0]
            ctl_gtr = context_basename + "_gtr.ctl"
            ctl_unr = context_basename + "_unr.ctl"
            output_gtr = context_basename + "_gtr.mlb"
            output_unr = context_basename + "_unr.mlb"

            write_ctl_file(ctl_gtr,
                           context_aln,
                           tree,
                           output_gtr,
                           7,
                           fix_alpha="1",
                           alpha="0")
            write_ctl_file(ctl_unr,
                           context_aln,
                           rooted_tree,
                           output_unr,
                           8,
                           fix_alpha="1",
                           alpha="0")
def pipeline_runner(input_dir,
                    output_dir,
                    ref_file,
                    NGS_or_Cirseq,
                    TYPE_OF_INPUT_FILE=None,
                    start=None,
                    end=None,
                    gaps=None,
                    qscore=None,
                    blast=None,
                    rep=None,
                    t=None,
                    alias="pipeline"):
    input_dir = check_dirname(input_dir)
    output_dir = check_dirname(output_dir)
    ref_file = check_filename(ref_file)
    if NGS_or_Cirseq not in [1, 2]:
        raise Exception("NGS_or_Cirseq has to be 1 or 2")
    cmds = "python /sternadi/home/volume1/shared/SternLab/pipeline_runner.py -i %s -o %s -r %s -NGS_or_Cirseq %i" \
           % (input_dir, output_dir, ref_file, NGS_or_Cirseq)
    if TYPE_OF_INPUT_FILE is not None:
        cmds += " -t %s" % TYPE_OF_INPUT_FILE
    if start is not None:
        cmds += " -s %i" % start
    if end is not None:
        cmds += " -e %i" % end
    if gaps is not None:
        cmds += " -g %s" % gaps
    if qscore is not None:
        cmds += " -q %i" % qscore
    if blast is not None:
        cmds += " -b %i" % blast
    if rep is not None:
        cmds += " -rep %i" % int(rep)
    if t is not None:
        # note: this appends -t again, the same flag TYPE_OF_INPUT_FILE sets above
        cmds += " -t %s" % t

    print(cmds)
    cmdfile = pbs_jobs.get_cmdfile_dir("pipeline.txt", alias)
    tnum = 1
    gmem = 2
    pbs_jobs.create_pbs_cmd(cmdfile=cmdfile,
                            alias=alias,
                            jnum=tnum,
                            gmem=gmem,
                            cmds=cmds,
                            load_python=True)
    job_id = pbs_jobs.submit(cmdfile)
    return job_id
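# Usage sketch (hypothetical paths; assumes a PBS cluster with the SternLab
# pipeline_runner.py available at the hard-coded path):
#
#   job_id = pipeline_runner("/some/fastq_dir", "/some/out_dir",
#                            "/some/reference.fasta", NGS_or_Cirseq=1,
#                            qscore=30, rep=1)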
def main():
    parser = OptionParser("usage: %prog [options]")
    parser.add_option("-d", "--dir", dest="dir", help="directory with fastml output files")
    parser.add_option("-o", "--output", dest="output", help="output file perfix")

    (options, args) = parser.parse_args()
    dir = options.dir
    dir = check_dirname(dir)
    output = options.output
    if output is None:
        output = "fastml_analysis.csv"

    files = glob.glob(dir + "/*prob.marginal.txt")
    basenames = [f.split(".prob.marginal.txt")[0] for f in files]
    df = pd.DataFrame(columns=["Basename", "Mutation", "Branch", "Context", "Mutation_type",
                               "Codon_position", "APOBEC_context_GA","APOBEC_context_CT"])

    for basename in basenames:
        print(basename)
        prob_marginal = basename + ".prob.marginal.txt"
        seq_marginal = basename + ".seq.marginal.txt"
        tree_ancestor = basename + ".tree.ancestor.txt"

        basename = basename.split("/")[-1]

        positions_to_remove = get_position_to_remove(prob_marginal)
        ancestor_info, seqs = get_sequence_and_ancestry_data(tree_ancestor, seq_marginal)
        df = go_over_positions(ancestor_info, seqs, positions_to_remove, basename, df)



    df.to_csv(dir + "/" + output, index=False)
def main():
    parser = OptionParser("usage: %prog [options]")
    parser.add_option("-d",
                      "--dirname",
                      dest="dirname",
                      help="dirname that contains fastml result files")
    parser.add_option("-o", "--output", dest="output", help="output file")
    (options, args) = parser.parse_args()

    dirname = options.dirname
    dirname = check_dirname(dirname)

    output = options.output
    if output is None:
        output = dirname + "/fastml_analysis_output.csv"
    output = check_filename(output, Truefile=False)

    files = glob.glob(dirname + "/*/*.fasta")
    if not files:
        # fall back to fastml joint-sequence outputs when no fasta files are found
        files = glob.glob(dirname + "/*seq.joint.txt")
    basenames = [f.split(".")[0] for f in files]

    df = pd.DataFrame(columns=[
        "family", "group", "mutation", "mutation_count_in_context",
        "context_count_overall", "mutation_count_overall"
    ])

    for basename in basenames:
        df = run_on_basename(basename, df)

    df.to_csv(output, index=False)
def main():
    parser = OptionParser("usage: %prog [options]")
    parser.add_option("-d", "--dir", dest="dir", help="dir input fasta files")
    parser.add_option("-s",
                      "--subtypes",
                      dest="subtypes",
                      help="subtypes to keep, seperated by comma. example A,B")
    (options, args) = parser.parse_args()

    dir = options.dir
    dir = check_dirname(dir)

    subtypes = options.subtypes.split(",")
    input_files = glob.glob(dir + "/*.fasta")

    for file in input_files:
        with open(file, "r") as handle:
            fasta = handle.read()
        basename = file.split(".fasta")[0]
        for subtype in subtypes:
            # grab every record whose header starts with the subtype code
            pattern = re.compile(">%s[^>]*" % subtype)
            results = pattern.findall(fasta)
            print("%s: %s: %s" % (file, subtype, len(results)))
            fasta_out = "".join(results)
            output_file = basename + "_%s.aln" % subtype
            with open(output_file, "w") as output:
                output.write(fasta_out)
    print("split HIV files into subtypes")
def tophat2_runner(output_dir, bowtie_reference, fastq, alias="tophat2"):
    """
    tophat2 runner
    :param output_dir: output directory
    :param bowtie_reference: bowtie reference path
    :param fastq: fastq path
    :param alias: job name (default: tophat2)
    :return: job id
    """
    output_dir = check_dirname(output_dir, Truedir=False)
    bowtie_reference = check_filename(bowtie_reference, Truefile=False)
    fastq = check_filename(fastq)

    cmdfile = pbs_jobs.get_cmdfile_dir("tophat2", alias)
    tnum = 1
    gmem = 2
    cmds = "/sternadi/home/volume1/taliakustin/software/tophat-2.1.1.Linux_x86_64/tophat2"\
           + " -o %s %s %s" % (output_dir, bowtie_reference, fastq)
    pbs_jobs.create_pbs_cmd(cmdfile=cmdfile,
                            alias=alias,
                            jnum=tnum,
                            gmem=gmem,
                            cmds=cmds,
                            load_python=False)
    job_id = pbs_jobs.submit(cmdfile)
    return job_id
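# Usage sketch (hypothetical paths; bowtie_reference is the bowtie index
# prefix, which is why it is checked with Truefile=False):
#
#   job_id = tophat2_runner("/some/tophat_out", "/some/indexes/ref_genome",
#                           "/some/sample.fastq")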
def selecton_runner(codon_aln,
                    output_dir=None,
                    tree=None,
                    log=None,
                    rate=None,
                    output=None,
                    color=None,
                    out_tree=None,
                    query_seq=None,
                    model="M8",
                    alias="selecton",
                    use_query_seq=False):
    codon_aln = check_filename(codon_aln)
    if output_dir is None:
        base = codon_aln.split(".")[0] + "_selecton"
    else:
        base = check_dirname(output_dir)
        base = base + "/" + codon_aln.split("/")[-1].split(".")[0] + "_selecton"
    log = set_filenames_for_pbs_runs(log, base, "log.txt")
    rate = set_filenames_for_pbs_runs(rate, base, "kaks.txt")
    output = set_filenames_for_pbs_runs(output, base, "output.txt")
    color = set_filenames_for_pbs_runs(color, base, "color.txt")
    out_tree = set_filenames_for_pbs_runs(out_tree, base, "output_tree.txt")

    if query_seq is None:
        query_seq = get_longest_sequence_name_in_fasta(codon_aln)

    if model == "M8":
        model = ""
    elif model == "M8a":
        model = "-w1 -Fw"
    elif model == "M7":
        model = "-p1 -Fp"

    if tree is not None:
        tree = check_filename(tree)
        if not use_query_seq:
            cmds = "selecton -i %s -u %s -l %s -r %s -o %s -c %s -t %s %s" \
                   % (codon_aln, tree, log, rate, output, color, out_tree, model)
        else:
            cmds = "selecton -i %s -u %s -l %s -r %s -o %s -c %s -t %s %s -q %s" \
                   % (codon_aln, tree, log, rate, output, color, out_tree, model, query_seq)
    else:
        if not use_query_seq:
            cmds = "selecton -i %s -l %s -r %s -o %s -c %s -t %s %s" \
                   % (codon_aln, log, rate, output, color, out_tree, model)
        else:
            cmds = "selecton -i %s -l %s -r %s -o %s -c %s -t %s %s -q %s" \
                   % (codon_aln, log, rate, output, color, out_tree, model, query_seq)
    cmdfile = pbs_jobs.get_cmdfile_dir("selecton.txt", alias)
    tnum = 1
    gmem = 2
    pbs_jobs.create_pbs_cmd(cmdfile=cmdfile,
                            alias=alias,
                            jnum=tnum,
                            gmem=gmem,
                            cmds=cmds)
    job_id = pbs_jobs.submit(cmdfile)
    return job_id
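# Usage sketch (hypothetical paths): model can be "M8" (selecton's default,
# passed as no extra flags), "M8a" (-w1 -Fw) or "M7" (-p1 -Fp):
#
#   job_id = selecton_runner("/some/family1_codon.aln",
#                            tree="/some/family1.tree", model="M8a")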
def main():
    parser = OptionParser("usage: %prog [options]")
    parser.add_option("-d", "--directory", dest="dir", help="input directory with kaks gaps files")
    parser.add_option("-o", "--output", dest="output", help="output file name")
    parser.add_option("-v", "--virus", dest="virus", help="virus - tells the script how to parse filenames. "
                                                          "options: influenza, tilv, influenza20, thogoto")
    (options, args) = parser.parse_args()


    dir = options.dir
    output = options.output
    virus = options.virus

    dir = check_dirname(dir)
    output = check_filename(output, Truefile=False)
    files = glob.glob("%s/*kaks*gaps" % dir)
    if len(files) == 0:
        raise Exception("No files in %s" % dir)

    if virus == "influenza": #add more virus name if you add anything
        output_text = "POS\tAMINO\tKaKs\tconfidence_interval\tvirus\tprotein\n"
    elif virus == "influenza20" or virus == "tilv" or virus == "thogoto":
        output_text = "POS\tAMINO\tKaKs\tconfidence_interval\tvirus\tprotein\tsegment\n"
    else:
        output_text = "POS\tAMINO\tKaKs\tconfidence_interval\n"

    for f in files:
        print(f)
        if virus == "influenza":
            virus_name = "Influenza " + f.split("/")[-1].split("inf")[1].split("_")[0]
            protein = f.split("/")[-1].split("_")[2]
            segment = None
        elif virus == "tilv":
            virus_name = "TiLV"
            protein = "Segment " + f.split("/")[-1].split("_")[0].split("seg")[-1]
            segment = protein
        elif virus == "influenza20":
            virus_name  = "Influenza " + f.split("/")[-1].split("Segment")[0].split("_")[1]
            protein = f.split("/")[-1].split("Protein")[1].split("_")[1]
            segment = "Segment " + f.split("Segment")[1].split("_")[1]
        elif virus == "thogoto":
            virus_name = "Thogoto"
            protein = f.split("gene_")[1].split("_")[0]
            segment = "Segment " + f.split("segment_")[1].split("_")[0]
        # if adding more virus types, follow this pattern:
        # elif virus == "XXXX":
        #     virus_name = XXX
        #     protein = XXX
        #     segment = XXX
        else:
            virus_name = None
            protein = None
            segment = None
        output_text += kaks_file_to_txt_delimeted_results(f, virus_name, protein, segment)

    output = open(output, "w")
    output.write(output_text)
    output.close()
def main():
    parser = OptionParser("usage: %prog [options]")
    parser.add_option("-d",
                      "--dir",
                      dest="dir",
                      help="dir of temp files of cirseq pipeline run")
    parser.add_option("-o",
                      "--output",
                      dest="output",
                      help="output folder to save")

    (options, args) = parser.parse_args()
    in_dir = options.dir
    out_dir = options.output
    in_dir = check_dirname(in_dir)
    out_dir = check_dirname(out_dir)

    repeat_summary = get_repeats_num(in_dir)
    make_graph(repeat_summary, out_dir)
def main():
    parser = OptionParser("usage: %prog [options]")
    parser.add_option("-d",
                      "--dir",
                      dest="freqs_dir",
                      help="dir with freqs file")

    (options, args) = parser.parse_args()
    freqs_dir = options.freqs_dir
    freqs_dir = check_dirname(freqs_dir)

    freqs_files = glob.glob(freqs_dir + "/*.freqs")

    with_mutation_files = glob.glob(freqs_dir + "/*_with_mutations.csv")
    if with_mutation_files == []:
        print("Adding mutation to freqs files")
        for freqs_file in freqs_files:
            print(freqs_file)
            output = freqs_file.split(".freqs")[0] + "_with_mutations.csv"
            add_mutation_to_freq_file(output, freqs_file=freqs_file)
            with_mutation_files.append(output)

    segment_files = glob.glob(freqs_dir + "/Segment_[0-9].csv") + glob.glob(
        freqs_dir + "/Segment_[0-9][0-9].csv")
    if segment_files == []:
        print("Merging segment from different passages")
        for s in range(1, 11):
            specific_segment_mutation_files = glob.glob(
                freqs_dir + "/P*-S%s_with_mutations.csv" % s)
            segment_file, segment_freqs = merge_freqs_files(
                specific_segment_mutation_files,
                freqs_dir + "/Segment_%i.csv" % s)
            segment_files.append(segment_file)


    filtered_files = glob.glob(freqs_dir + "/Segment_[0-9]_filtered.csv") + \
                    glob.glob(freqs_dir + "/Segment_[0-9][0-9]_filtered.csv")

    print("Filtering positions from segment csvs")
    for segment_file in segment_files:
        output = segment_file.split(".csv")[0] + "_filtered.csv"
        filtered_file, filtered_freqs = filter_freqs_for_regression_analysis(
            output, freqs_file=segment_file)
        filtered_files.append(filtered_file)

    merge_dfs(filtered_files,
              freqs_dir + "/All_segments_filtered_for_regression.csv")
def main():
    parser = OptionParser("usage: %prog [options]")
    parser.add_option("-d",
                      "--dir",
                      dest="dataset_dir",
                      help="dir of dataset files")
    parser.add_option("-o",
                      "--output",
                      dest="output_file",
                      help="output file name")
    (options, args) = parser.parse_args()

    dataset_dir = options.dataset_dir
    dataset_dir = check_dirname(dataset_dir)

    output_file = options.output_file
    output_file = check_filename(output_file, Truefile=False)

    aln_files = glob.glob(dataset_dir + "/*/*best.fas")

    df = pd.DataFrame(columns=[
        "filename", "1", "0.9", "0.8", "0.7", "0.6", "0.5", "0.4", "0.3", "0.2"
    ])

    for aln_file in aln_files:
        consensus, consensus_percentage = get_consensus_percentage(aln_file)
        filename = aln_file.split(dataset_dir)[-1]
        print(aln_file)
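        # note: DataFrame.append was removed in pandas 2.0; on newer pandas use
        # pd.concat([df, pd.DataFrame([row])], ignore_index=True) instead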
        df = df.append(
            {
                "filename": filename,
                "1": consensus_percentage[1],
                "0.9": consensus_percentage[0.9],
                "0.8": consensus_percentage[0.8],
                "0.7": consensus_percentage[0.7],
                "0.6": consensus_percentage[0.6],
                "0.5": consensus_percentage[0.5],
                "0.4": consensus_percentage[0.4],
                "0.3": consensus_percentage[0.3],
                "0.2": consensus_percentage[0.2]
            },
            ignore_index=True)

    df.to_csv(output_file, index=False)
def fastml_runner(alignment,
                  tree,
                  outdir=None,
                  alias="fastml",
                  additional_params=None):
    """
    run fastml from phylogenyCode on cluster
    :param alignment: alignment file path
    :param tree: tree file path
    :param alias: job name (default: fastml)
    :param outdir: output directory for results (default: None - saves in the alignment's dir)
    :return: job id
    """
    alignment = check_filename(alignment)
    tree = check_filename(tree)
    if outdir is None:
        outdir = os.path.dirname(alignment)
    else:
        outdir = check_dirname(outdir)
    basename = os.path.basename(alignment).split(".")[0].split("_aln")[0]
    newick_tree = outdir + "/" + basename + ".tree.newick.txt"
    ancestor_tree = outdir + "/" + basename + ".tree.ancestor.txt"
    joint_seqs = outdir + "/" + basename + ".seq.joint.txt"
    marginal_seqs = outdir + "/" + basename + ".seq.marginal.txt"
    joint_prob = outdir + "/" + basename + ".prob.joint.txt"
    marginal_prob = outdir + "/" + basename + ".prob.marginal.txt"
    cmdfile = pbs_jobs.get_cmdfile_dir("fastml.txt", alias)
    tnum = 1
    gmem = 1
    cmds = "/sternadi/home/volume1/shared/tools/phylogenyCode/programs/fastml/fastml -s %s -t %s -mn -x %s " \
           "-y %s -j %s -k %s -d %s -e %s -qf" % (alignment, tree, newick_tree, ancestor_tree, joint_seqs,
                                                 marginal_seqs, joint_prob, marginal_prob)
    if additional_params is not None:
        cmds += " %s" % additional_params
    pbs_jobs.create_pbs_cmd(cmdfile=cmdfile, alias=alias, gmem=gmem, cmds=cmds)
    job_id = pbs_jobs.submit(cmdfile)
    return job_id
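# Usage sketch (hypothetical paths): output names are derived from the
# alignment basename, e.g. family1_aln.fasta -> family1.seq.marginal.txt
# placed in outdir (the alignment's own directory by default):
#
#   job_id = fastml_runner("/some/family1_aln.fasta", "/some/family1.tree")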
def save_all_rooted_trees(tree_file, output_dir):
    """
    saves all possible rooted trees for a given tree
    :param tree_file: input tree path
    :param output_dir: output directory for trees
    :return: rooted file dictionary
    """
    # save all possible rooted tree of a given tree
    tree_file = check_filename(tree_file)
    output_dir = check_dirname(output_dir)
    basename = path.basename(path.splitext(tree_file)[0])
    treefile_out = output_dir + "/" + basename
    tree = Phylo.read(tree_file, "newick")
    clades = list(tree.find_clades())
    out_files = {}
    for clade in clades:
        if clade.name is not None:
            tree.root_with_outgroup(clade)
            if tree.rooted:
                outfile = treefile_out + "_%s.txt" % clade.name
                Phylo.write(tree, outfile, "newick")
                out_files[clade.name] = {"rooted_file": outfile}
    return out_files
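# Usage sketch (hypothetical paths): rooting on every named clade yields one
# newick file per candidate outgroup:
#
#   rooted = save_all_rooted_trees("/some/family1.tree", "/some/rooted")
#   # -> {"seq42": {"rooted_file": "/some/rooted/family1_seq42.txt"}, ...}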
def mlbs_to_df(output, mlbs=[], dirname=None, baltimore=True):
    """
    analyzes mlb files into a dataframe - extracts lnL, base frequencies and substitution matrices
    :param output: output csv file path
    :param mlbs: list of mlb files
    :param dirname: dirname that has mlb files
    :return: output file path
    """
    if mlbs == [] and dirname is None:
        raise Exception(
            "you need to provide mlb or dirname that contains mlbs")
    if mlbs != [] and dirname is not None:
        raise Exception("you need to provide only one - mlb or dirname")

    if dirname is not None:
        dirname = check_dirname(dirname)
        mlbs = glob.glob(dirname + "/*.mlb")
    if mlbs != []:
        mlbs = [check_filename(m) for m in mlbs]

    output = check_filename(output, Truefile=False)

    df = pd.DataFrame(columns=[
        "mlb_file_name", "basename", "family", "protein", "group", "model",
        "lnL", "freq_T", "freq_C", "freq_A", "freq_G", "TC", "TA", "TG", "CT",
        "CA", "CG", "AT", "AC", "AG", "GT", "GC", "GA"
    ])

    lnL_1 = re.compile(r"lnL.*")
    lnL_2 = re.compile(r"-?\d*\.\d*")
    base_1 = re.compile(r"Base frequencies.*")
    base_2 = re.compile(r"0\.\d+")
    rate_1 = re.compile(r"Rate matrix Q.*\n.*\n.*\n.*\n.*", re.IGNORECASE)
    rate_2 = re.compile(r"\d+\.\d+")
    for mlb_file_name in mlbs:
        basename = mlb_file_name.split("/")[-1].split(".mlb")[0]
        print(mlb_file_name)
        family = mlb_file_name.split("/")[-1].split("_")[0]
        protein = mlb_file_name.split("/")[-1].split(family +
                                                     "_")[1].split(".")[0]
        filename = mlb_file_name.split("/")[-1].split(family +
                                                      "_")[-1].split(".mlb")[0]
        #if "_gtr" in filename or "_unrest" in filename:
        #    filename = filename.split("_gtr")[0]
        #    filename = filename.split("_unrest")[0]
        model = mlb_file_name.split(".mlb")[-1]

        with open(mlb_file_name, "r") as handle:
            mlb = handle.read()
        L = lnL_1.findall(mlb)
        if len(L) != 1 or "nan" in L[0]:
            L = None
        else:
            L = float(lnL_2.findall(L[0])[0])

        B = base_1.findall(mlb)
        if len(B) != 1 or "nan" in B[0]:
            freq_T = freq_C = freq_A = freq_G = None
        else:
            B = base_2.findall(B[0])
            freq_T = float(B[0])
            freq_C = float(B[1])
            freq_A = float(B[2])
            freq_G = float(B[3])

        R = rate_1.findall(mlb)
        if len(R) != 1 or "nan" in R[0]:
            TC = TA = TG = CT = CA = CG = None
            AT = AC = AG = GT = GC = GA = None
        else:
            # rows 1-4 of the matrix are the T, C, A and G rows; the diagonal
            # entry is skipped in each row
            R = R[0].split("\n")
            first = rate_2.findall(R[1])
            TC, TA, TG = first[1], first[2], first[3]
            second = rate_2.findall(R[2])
            CT, CA, CG = second[0], second[2], second[3]
            third = rate_2.findall(R[3])
            AT, AC, AG = third[0], third[1], third[3]
            fourth = rate_2.findall(R[4])
            GT, GC, GA = fourth[0], fourth[1], fourth[2]

        row = {
            "mlb_file_name": mlb_file_name,
            "basename": basename,
            "family": family,
            "protein": protein,
            "group": filename,
            "model": model,
            "lnL": L,
            "freq_T": freq_T,
            "freq_C": freq_C,
            "freq_A": freq_A,
            "freq_G": freq_G,
            "TC": TC, "TA": TA, "TG": TG,
            "CT": CT, "CA": CA, "CG": CG,
            "AT": AT, "AC": AC, "AG": AG,
            "GT": GT, "GC": GC, "GA": GA,
        }
        if baltimore:
            row["baltimore"] = get_baltimore_classifiaction(family)
        # DataFrame.append was removed in pandas 2.0; use pd.concat on newer pandas
        df = df.append(row, ignore_index=True)

    df.to_csv(output)
    return output
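# Usage sketch (hypothetical paths; expects PAML baseml-style .mlb files whose
# names start with the family, e.g. family1_protA.mlb):
#
#   mlbs_to_df("/some/mlb_summary.csv", dirname="/some/mlb_dir")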
def main(args):

    pipeline_path = "/sternadi/home/volume1/shared/SternLab/pipeline/runner.pl"

    NGS_or_Cirseq = args.NGS_or_Cirseq

    print("Pipeline to run: %s" % pipeline_path)

    input_dir = args.input_dir
    input_dir = check_dirname(input_dir)
    output = args.output
    output = check_dirname(output, Truedir=False)
    reference = args.reference
    reference = check_filename(reference)

    start = args.start
    end = args.end
    if start not in [1, 2, 3]:
        raise Exception("Not a valid start step - needs to be between 1:3")
    if end not in [2, 3, 4]:
        raise Exception("Not a valid end step - needs to be between 2:4")

    type_of_input_file = args.type_of_input_file
    gaps = args.gaps
    if gaps not in ["Y", "N"]:
        raise Exception("Not a valid gap - must be Y or N")
    q_score = args.q_score

    if q_score is None:
        if NGS_or_Cirseq == 1:
            q_score = 30
        else:
            q_score = 23

    blast_id = args.blast_id

    path_to_save_pipeline_summary = output + "/pipeline_summary.txt"
    print(start, end, q_score, blast_id, NGS_or_Cirseq)

    cmd = "perl {} {} {} {} {} {} {} {} {} {} {} {}".format(
        pipeline_path, input_dir, output, reference, start, end,
        type_of_input_file, gaps, NGS_or_Cirseq, q_score, blast_id)

    print("running this pipeline command:")
    print(cmd)
    os.system(cmd)

    # get additional statistics about this run
    # (chdir so the grep commands below run inside the tmp directory)
    os.chdir(os.path.join(input_dir, "tmp"))

    # number of reads that were mapped only once
    only_once_reads = subprocess.getoutput(
        "grep -P '^1\t' *stats -h | awk '{sum+=$2}END{print sum}'")
    # number of reads that are "contributing to frequency counts"
    freq_contr = subprocess.getoutput(
        "grep 'reads contributing to frequency counts' -h *stats | awk '{sum+=$1}END{print sum}'")
    # number of bases called
    num_bases_called = subprocess.getoutput(
        "grep 'num bases called' *stats | awk -F = '{sum+=$2}END{print sum}'")
    # number of reads that were mapped to the reference
    num_reads_mapped = subprocess.getoutput(
        "cat *blast | awk '{print $1}' | sort | uniq | wc -l")

    with open(path_to_save_pipeline_summary, "w") as o:
        o.write("---- Pipeline running -----\n")
        o.write("{}\n\n".format(datetime.datetime.now()))
        o.write("Pipeline command used: {}\n\n".format(cmd))
        o.write("Number of reads that were mapped only once: {}\n".format(
            only_once_reads))
        o.write("Number of reads contributing to frequency counts: {}\n".format(
            freq_contr))
        o.write("Number of bases called: {}\n".format(num_bases_called))
        o.write("Number of reads mapped to reference: {}\n".format(
            num_reads_mapped))

    # create a simple coverage plot
    freq_file_path = os.path.join(
        input_dir, [f for f in os.listdir(input_dir) if f.endswith("freq")][0])
    freq_file_path = check_filename(freq_file_path)
    label = os.path.basename(freq_file_path).split('.')[0]

    df = pd.read_csv(freq_file_path, sep="\t")

    df = df.drop_duplicates("Pos")
    plt.plot(df['Pos'].values,
             df['Read_count'].values,
             label=label,
             color='darkorange')
    plt.title("Coverage {}".format(label), fontsize=16)
    plt.xlabel("Position in the genome (bp)")
    plt.ylabel("Read count")
    plt.savefig(os.path.join(input_dir, 'coverage.png'), format='png')

    print("Ran pipeline")
def main(args):

    pipeline_path = "/sternadi/home/volume1/shared/SternLab/pipeline/runner.pl"

    NGS_or_Cirseq = args.NGS_or_Cirseq

    print("Pipeline to run: %s" % pipeline_path)

    input_dir = args.input_dir
    input_dir = check_dirname(input_dir)
    output = args.output
    output = check_dirname(output, Truedir=False)
    reference = args.ref
    reference = check_filename(reference)

    start = args.start
    end = args.end
    if start not in [1, 2, 3]:
        raise Exception("Not a valid start step - needs to be between 1:3")
    if end not in [2, 3, 4]:
        raise Exception("Not a valid end step - needs to be between 2:4")

    type_of_input_file = args.type_of_input_file
    gaps = args.gaps
    if gaps not in ["Y", "N"]:
        raise Exception("Not a valid gap - must be Y or N")
    q_score = args.q_score

    if q_score is None:
        if NGS_or_Cirseq == 1:
            q_score = 30
        else:
            q_score = 23

    blast_id = args.blast

    evalue = args.evalue

    repeats = args.repeats
    if repeats <= 0:
        raise Exception(
            "Number of repeats should be a positive integer; got a non-positive value"
        )
    if repeats > 1 and NGS_or_Cirseq == 1:
        print("WARNING: running NGS mapping with more than 1 repeat")
    if repeats == 1 and NGS_or_Cirseq == 2:
        print("WARNING: running CirSeq mapping with 1 repeat")

    #prefix = args.prefix

    path_to_save_pipeline_summary = output + "/pipeline_summary.txt"
    print(start, end, q_score, blast_id, NGS_or_Cirseq)

    cmd = "perl {} {} {} {} {} {} {} {} {} {} {} {} {}".format(
        pipeline_path, input_dir, output, reference, start, end,
        type_of_input_file, gaps, NGS_or_Cirseq, q_score, blast_id, evalue,
        repeats)

    print("running this pipeline command:")
    print(cmd)
    os.system(cmd)

    # get additional statistics about this running
    os.chdir(os.path.join(output, "tmp"))
    os.system("pwd")

    # number of reads that were mapped only once
    only_once_reads = subprocess.getoutput(
        "grep -P '^1\t' *stats  -h | awk '{sum+=$2}END{print sum}'")
    # number of reads that were mapped exactly twice
    twice_mapped_reads = subprocess.getoutput(
        "grep -P '^2\t' *stats  -h | awk '{sum+=$2}END{print sum}'")
    #number of reads that are "contributing to frequency counts"
    freq_contr = subprocess.getoutput(
        "grep 'reads contributing to frequency counts' -h *stats | awk '{sum+=$1}END{print sum}'"
    )
    #number of bases called
    num_bases_called = subprocess.getoutput(
        "grep 'num bases called' *stats | awk -F = '{sum+=$2}END{print sum}'")
    #number of reads that were mapped to reference
    num_reads_mapped = subprocess.getoutput(
        "cat *blast | awk '{print $1}' | sort | uniq | wc -l")
    #total number of reads
    num_reads = subprocess.getoutput("cat *fasta | grep '^>' | wc -l")

    with open(path_to_save_pipeline_summary, "w") as o:
        o.write("---- Pipeline running -----\n")
        o.write("{}\n\n".format(datetime.datetime.now()))
        o.write("Pipeline command used:\n{}\n\n".format(cmd))
        o.write("Blast parameters: %id for blast = {}, E value = {}\n".format(
            blast_id, evalue))
        o.write("Number of repeats used: {}\n".format(repeats))
        o.write("Number of reads: {}\n".format(num_reads))
        o.write("Number of reads mapped to reference: {}\n".format(
            num_reads_mapped))
        o.write("Number of reads that were mapped only once: {}\n".format(
            only_once_reads))
        o.write("Number of reads that were mapped exactly twice: {}\n".format(
            twice_mapped_reads))
        o.write(
            "Number of reads that are contributing to frequency count: {}\n".
            format(freq_contr))
        o.write("Number of bases called: {}\n".format(num_based_called))

    #get back to the freq file directory

    os.chdir(output)
    # create a simple coverage plot
    freq_file_path = os.path.join(
        output, [f for f in os.listdir(output) if ".freq" in f][0])
    freq_file_path = check_filename(freq_file_path)
    label = os.path.basename(freq_file_path).split('.')[0]

    df = pd.read_csv(freq_file_path, sep='\t')

    df = df[(df.Ref != '-') & (df.Ref == df.Base)].drop_duplicates("Pos")
    plt.plot(df['Pos'].values,
             df['Read_count'].values,
             label=label,
             color='darkorange')
    plt.title("Coverage {}".format(label), fontsize=16)
    plt.xlabel("Position in the genome (bp)")
    plt.ylabel("Read count")
    plt.savefig(os.path.join(output, 'coverage.png'), format='png')

    print("Ran pipeline")
def r4s_runner(tree_file, seq_file, outfile, dirname, tree_outfile=None, unormelized_outfile=None, log_outfile=None, \
               ref_seq = None, n_categories = 4, alias = "r4s"):
    """
    run rate4site on cluster
    :param tree_file: input tree file path
    :param seq_file: input sequence file path
    :param outfile: outfile path
    :param dirname: dirname for output files
    :param tree_outfile: output tree file path (default: None)
    :param unormelized_outfile: unnormalized rates output file (default: None)
    :param log_outfile: output log file (default: None)
    :param ref_seq: reference sequence name passed to -a (default: None)
    :param n_categories: number of rate categories passed to -k (default: 4)
    :param alias: job name (default: r4s)
    :return: job id
    """
    tree_file = check_filename(tree_file)
    seq_file = check_filename(seq_file)
    dirname = check_dirname(dirname)

    if tree_outfile is not None:
        tree_outfile = check_filename(tree_outfile, Truefile=False)
    else:
        tree_outfile = dirname + "/out-tree"
    if unormelized_outfile is not None:
        unormelized_outfile = check_filename(unormelized_outfile,
                                             Truefile=False)
    else:
        unormelized_outfile = dirname + "/out-unormelized"
    if log_outfile is not None:
        log_outfile = check_filename(log_outfile, Truefile=False)
    else:
        log_outfile = dirname + "/out-log"

    cmdfile = pbs_jobs.get_cmdfile_dir("r4s_cmd.txt", alias)
    tnum = 1
    gmem = 2
    ref_seq_parameter = " -a " + ref_seq if ref_seq is not None else ""
    if tree_file is not None:
        cmds = "/sternadi/home/volume1/shared/tools/rate4site" \
               + " -t " + tree_file \
               + " -s " + seq_file \
               + " -o " + outfile \
               + ref_seq_parameter \
               + " -x " + tree_outfile \
               + " -y " + unormelized_outfile \
               + " -V 10" \
               + " -l " + log_outfile \
               + " -Mh -k " + str(n_categories)
    else:
        cmds = "/sternadi/home/volume1/shared/tools/rate4site" \
               + " -s " + seq_file \
               + " -o " + outfile \
               + ref_seq_parameter \
               + " -x " + tree_outfile \
               + " -y " + unormelized_outfile \
               + " -V 10" \
               + " -l " + log_outfile \
               + " -Mh -k " + str(n_categories)

    pbs_jobs.create_pbs_cmd(cmdfile=cmdfile,
                            alias=alias,
                            jnum=tnum,
                            gmem=gmem,
                            cmds=cmds)
    job_id = pbs_jobs.submit(cmdfile)
    return job_id
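# Usage sketch (hypothetical paths; the main rates go to outfile, with the
# unnormalized rates and log written to dirname by default):
#
#   job_id = r4s_runner("/some/family1.tree", "/some/family1.aln",
#                       "/some/r4s/rates.txt", "/some/r4s")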