def main_load_library(args):
    """Install a covidprofiler library under ``sys.base_prefix`` and index it.

    The files ``<args.prefix><suffix>`` (gff, fasta, barcode bed and version
    json) are copied into ``<sys.base_prefix>/share/covidprofiler/`` under the
    basename of ``args.prefix``.  The copied reference is then indexed with
    ``samtools faidx`` and ``bwa index``, and its sequence dictionary is
    rebuilt with ``gatk CreateSequenceDictionary``.

    Args:
        args: Parsed command-line namespace; only ``args.prefix`` is read.
    """
    share_dir = sys.base_prefix + "/share/covidprofiler"
    # Only the last path component of the prefix names the installed library.
    lib_name = args.prefix.split("/")[-1]
    suffixes = {
        "gff": ".gff",
        "ref": ".fasta",
        "barcode": ".barcode.bed",
        "version": ".version.json"
    }

    if pp.nofolder(share_dir):
        pp.run_cmd("mkdir %s " % (share_dir + "/"))

    installed_prefix = share_dir + "/" + lib_name
    for suffix in suffixes.values():
        source_path = args.prefix + suffix
        target_path = installed_prefix + suffix
        pp.run_cmd("cp %s %s" % (source_path, target_path))

    reference = installed_prefix + ".fasta"
    pp.run_cmd("samtools faidx %s" % reference)
    pp.run_cmd("bwa index %s" % reference)

    # Drop any existing dictionary before asking gatk to create a new one.
    sequence_dict = installed_prefix + ".dict"
    if os.path.isfile(sequence_dict):
        pp.run_cmd("rm %s" % sequence_dict)
    pp.run_cmd("gatk CreateSequenceDictionary -R %s" % reference)

    pp.log("Sucessfully imported library")
def main_load_library(args):
    """Install a covidprofiler library (with MSA data) and index its reference.

    Copies ``args.msa`` and ``args.meta`` plus the ``<args.prefix><suffix>``
    library files into ``<sys.base_prefix>/share/covidprofiler/`` under the
    basename of ``args.prefix``, then indexes the copied reference with
    ``samtools faidx`` and ``bwa index``.  An existing ``.dict`` file for the
    library is removed; this function does not create a new one.

    Args:
        args: Parsed command-line namespace; ``args.prefix``, ``args.msa``
            and ``args.meta`` are read.
    """
    share_dir = sys.base_prefix + "/share/covidprofiler"
    # Only the last path component of the prefix names the installed library.
    lib_name = args.prefix.split("/")[-1]
    installed_prefix = share_dir + "/" + lib_name
    suffixes = {
        "gff": ".gff",
        "ref": ".fasta",
        "barcode": ".barcode.bed",
        "version": ".version.json",
        "proteins": ".proteins.csv",
        "non_coding_bed": ".non_coding.bed"
    }

    if pp.nofolder(share_dir):
        pp.run_cmd("mkdir %s " % (share_dir + "/"))

    # The alignment and its metadata come from their own arguments rather
    # than from the library prefix.
    pp.run_cmd("cp %s %s" % (args.msa, installed_prefix + ".msa.fa"))
    pp.run_cmd("cp %s %s" % (args.meta, installed_prefix + ".msa.meta.csv"))

    for suffix in suffixes.values():
        source_path = args.prefix + suffix
        target_path = installed_prefix + suffix
        pp.run_cmd("cp %s %s" % (source_path, target_path))

    reference = installed_prefix + ".fasta"
    pp.run_cmd("samtools faidx %s" % reference)
    pp.run_cmd("bwa index %s" % reference)

    sequence_dict = installed_prefix + ".dict"
    if os.path.isfile(sequence_dict):
        pp.run_cmd("rm %s" % sequence_dict)

    pp.log("Sucessfully imported library")
Example #3
0
def main_profile(args):
    """Profile one sample against a tbprofiler database and write reports.

    Steps, in order: load the database configuration, create the output
    folder layout (``bam``, ``vcf`` and ``results`` under ``args.dir``), map
    reads to the reference unless ``args.bam`` was supplied, run
    ``pp.bam_profiler``, reformat the result with ``tbp.reformat``, write the
    JSON report plus the optional PDF/text/CSV reports, and finally move the
    intermediate files into their sub-folders.

    Args:
        args: Parsed command-line namespace.  ``args.mapper``,
            ``args.caller`` and ``args.no_trim`` are overwritten in place
            when ``args.platform`` is ``"nanopore"``.

    Raises:
        SystemExit: If neither ``args.bam`` nor ``args.read1`` is provided.
    """
    #### Setup conf dictionary ###
    if args.db == "tbdb" and not args.external_db and pp.nofile(
            sys.base_prefix + "/share/tbprofiler/tbdb.fasta"):
        pp.log(
            "Can't find the tbdb file at %s. Please run 'tb-profiler update_tbdb' to load the default library or specify another using the '--external_db' flag"
            % sys.base_prefix,
            ext=True)
    if args.external_db:
        conf = get_conf_dict(args.external_db)
    else:
        conf = get_conf_dict(sys.base_prefix +
                             "/share/tbprofiler/%s" % args.db)

    ### Create folders for results if they don't exist ###
    if pp.nofolder(args.dir):
        os.mkdir(args.dir)

    for x in ["bam", "vcf", "results"]:
        if pp.nofolder(args.dir + "/" + x):
            os.mkdir(args.dir + "/" + x)

    ### Set up platform dependant parameters ###
    if args.platform == "nanopore":
        args.mapper = "minimap2"
        args.caller = "bcftools"
        args.no_trim = True
        run_delly = False
    else:
        run_delly = not args.no_delly

    ### Setup prefix for files ###
    files_prefix = args.dir + "/" + args.prefix

    ### Create bam file if fastq has been supplied ###
    if args.bam is None:
        if not args.read1:
            sys.exit(
                "\nPlease provide a bam file or a fastq file(s)...Exiting!\n")
        if args.no_trim:
            # Paired or unpaired, no trimming
            fastq_obj = pp.fastq(args.read1, args.read2)
        elif args.read2:
            # Paired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1, args.read2)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix,
                                                 threads=args.threads)
        else:
            # Unpaired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix,
                                                 threads=args.threads)
        bam_obj = fastq_obj.map_to_ref(ref_file=conf["ref"],
                                       prefix=files_prefix,
                                       sample_name=args.prefix,
                                       aligner=args.mapper,
                                       platform=args.platform,
                                       threads=args.threads)
        bam_file = bam_obj.bam_file
    else:
        bam_file = args.bam

    # NOTE(review): looks like leftover debug output; kept so that what the
    # command prints to stdout is unchanged.
    print(args.delly_bcf_file)
    ### Run profiling module from pathogen-profiler ###
    results = pp.bam_profiler(
        conf=conf,
        bam_file=bam_file,
        prefix=files_prefix,
        platform=args.platform,
        caller=args.caller,
        threads=args.threads,
        no_flagstat=args.no_flagstat,
        run_delly=run_delly,
        calling_params=args.calling_params,
        coverage_fraction_threshold=args.coverage_fraction_threshold,
        missing_cov_threshold=args.missing_cov_threshold,
        delly_bcf_file=args.delly_bcf_file)
    # The raw profiler output goes to the current working directory, not to
    # args.dir.
    with open(args.prefix + ".tmp_results.json", "w") as tmp_handle:
        json.dump(results, tmp_handle)
    ### Reformat the results to TB-Profiler style ###
    results = tbp.reformat(results, conf, reporting_af=args.reporting_af)
    results["id"] = args.prefix
    results["tbprofiler_version"] = tbp._VERSION
    results["pipeline"] = {
        "mapper": args.mapper if not args.bam else "N/A",
        "variant_caller": args.caller
    }

    results_prefix = args.dir + "/results/" + args.prefix
    json_output = results_prefix + ".results.json"
    tex_output = results_prefix + ".results.tex"
    text_output = results_prefix + ".results.txt"
    csv_output = results_prefix + ".results.csv"

    with open(json_output, "w") as json_handle:
        json.dump(results, json_handle)
    extra_columns = [x.lower() for x in args.add_columns.split(",")
                     ] if args.add_columns else []
    if args.pdf:
        tbp.write_tex(results, conf, tex_output, extra_columns)
        pp.run_cmd("pdflatex %s" % tex_output, verbose=1)
        pp.rm_files([
            tex_output, args.dir + "/" + args.prefix + ".results.aux",
            args.dir + "/" + args.prefix + ".results.log"
        ])
    if args.txt:
        tbp.write_text(results,
                       conf,
                       text_output,
                       extra_columns,
                       reporting_af=args.reporting_af)
    if args.csv:
        tbp.write_csv(results, conf, csv_output, extra_columns)

    ### Move files to respective directories ###
    if not args.bam:
        pp.run_cmd("mv %(dir)s/%(prefix)s.bam* %(dir)s/bam/" % vars(args))
        if not args.no_trim:
            # Trimmed fastq files are intermediates created by this run.
            pp.run_cmd("rm -f %s" % " ".join(fastq_obj.files))
    pp.run_cmd("mv -f %(dir)s/%(prefix)s*.vcf.gz* %(dir)s/vcf/" % vars(args))
    if run_delly and results["delly"] == "success" and not args.delly_bcf_file:
        pp.run_cmd("mv -f %(dir)s/%(prefix)s.delly.bcf* %(dir)s/vcf/" %
                   vars(args))

    ### Add meta data to results
    # NOTE(review): every report above has already been written and this
    # function returns nothing, so these "meta_*" keys only reach the
    # in-memory dict. Confirm whether they were meant to be added before the
    # JSON dump.
    if args.meta:
        with open(args.meta) as meta_handle:
            for row in csv.DictReader(meta_handle):
                if row["id"] == results["id"]:
                    for col in row:
                        results["meta_" + col] = row[col]
    pp.log("Profiling finished sucessfully!")
Example #4
0
def main_profile(args):
    """Assign a clade and collect variants for one covidprofiler sample.

    Variants come either from an assembly (``args.fasta``) compared against
    the reference, or from reads (``args.read1`` / ``args.read2``) that are
    optionally trimmed, mapped and variant-called; in the read-based path a
    consensus fasta is also written.  The clade line is printed to stdout as
    ``<prefix>\\t<clade>`` and the clade plus variant data are written to
    ``<args.dir>/<args.prefix>.results.json``.

    Args:
        args: Parsed command-line namespace.

    Raises:
        SystemExit: With status 1 when both an assembly and reads are given,
            or when neither is given.
    """
    if pp.nofolder(args.dir):
        os.mkdir(args.dir)
    conf = get_conf_dict(sys.base_prefix + "/share/covidprofiler/%s" % args.db)

    ### Setup prefix for files ###
    files_prefix = args.dir + "/" + args.prefix

    if args.fasta:
        if args.read1 or args.read2:
            sys.stderr.write(
                "Please use --fasta or --read1/2 but not both... Exiting!\n")
            # Usage errors must not report success to the calling shell.
            sys.exit(1)
        fasta_obj = pp.fasta(args.fasta)
        wg_vcf_obj = pp.vcf(
            fasta_obj.get_ref_variants(conf["ref"],
                                       prefix=args.prefix,
                                       file_prefix=files_prefix))
    else:
        if not args.read1:
            sys.stderr.write(
                "Please provide assembly using --fasta or at least one read file using --read1... Exiting!\n"
            )
            sys.exit(1)
        ### Create bam file if fastq has been supplied ###
        if args.no_trim:
            # Paired or unpaired, no trimming
            fastq_obj = pp.fastq(args.read1, args.read2)
        elif args.read2:
            # Paired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1, args.read2)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix,
                                                 threads=args.threads)
        else:
            # Unpaired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix,
                                                 threads=args.threads)
        bam_obj = fastq_obj.map_to_ref(ref_file=conf["ref"],
                                       prefix=files_prefix,
                                       sample_name=args.prefix,
                                       aligner=args.mapper,
                                       platform=args.platform,
                                       threads=args.threads)
        wg_vcf_obj = bam_obj.call_variants(conf["ref"],
                                           args.caller,
                                           remove_missing=True)
        cp.vcf2consensus(bam_obj.bam_file, wg_vcf_obj.filename, conf["ref"],
                         wg_vcf_obj.samples[0],
                         wg_vcf_obj.prefix + ".consensus.fasta")
        if not args.no_trim:
            # Trimmed fastq files are intermediates created by this run.
            pp.run_cmd("rm -f %s" % " ".join(fastq_obj.files))

    results = {}
    barcode_mutations = wg_vcf_obj.get_bed_gt(conf["barcode"], conf["ref"])
    barcode = pp.barcode(barcode_mutations, conf["barcode"])
    # Sorting makes the joined clade string independent of barcode order.
    clade = ";".join(sorted([d["annotation"] for d in barcode]))
    sys.stdout.write("%s\t%s\n" % (args.prefix, clade))
    results["clade"] = clade

    variant_data = cp.get_variant_data(wg_vcf_obj.filename, conf["ref"],
                                       conf["gff"], conf["proteins"])
    results["variants"] = variant_data

    with open("%s.results.json" % files_prefix, "w") as results_handle:
        json.dump(results, results_handle)
Example #5
0
def main(args):
    """Call variants in target regions of a BAM file and write them as JSON.

    The configuration (``ref``, ``gff``, ``bed``, ``ann``) is taken from the
    individual arguments, or wholesale from the JSON file ``args.conf`` when
    that is given.  Small variants are called and annotated, large deletions
    from delly are appended unless ``args.no_delly`` is set, and the combined
    list is written to ``<args.out_dir>/<args.prefix>.pp-results.json``.
    Intermediate ``<args.prefix>.targets.*`` files are removed at the end.

    Args:
        args: Parsed command-line namespace.
    """
    if pp.nofolder(args.out_dir):
        pp.run_cmd("mkdir %s" % args.out_dir)
    conf = {
        "ref": args.ref,
        "gff": args.gff,
        "bed": args.bed,
        "ann": args.ann,
    }
    if args.conf:
        with open(args.conf) as conf_handle:
            conf = json.load(conf_handle)
    for x in ["ref", "gff", "bed", "ann"]:
        # .get() reports a key missing from a JSON conf the same way as an
        # unset command-line option, instead of raising KeyError here.
        if conf.get(x) is None:
            pp.log("%s variable is not defined" % x, True)
    bam_obj = pp.bam(args.bam,
                     args.prefix,
                     conf["ref"],
                     platform=args.platform)
    bcf_obj = bam_obj.call_variants(
        prefix=args.prefix + ".targets",
        call_method=args.call_method,
        gff_file=conf["gff"],
        bed_file=conf["bed"],
        mixed_as_missing=args.platform != "Illumina",
        threads=args.threads,
        min_dp=args.min_depth,
        af=args.af,
        caller=args.caller)
    csq = bcf_obj.load_csq(ann_file=conf["ann"])
    variants = []
    # Integer lookup from the second to the fourth whitespace-separated
    # column of the annotation file.
    # NOTE(review): presumably genome position -> gene position; confirm
    # against the annotation file format.
    chr2gene_pos = {}
    with open(conf["ann"]) as ann_handle:
        for line in ann_handle:
            row = line.rstrip().split()
            chr2gene_pos[int(row[1])] = int(row[3])
    # Only the first entry of csq is used.
    for var in list(csq.values())[0]:
        var["_internal_change"] = var["change"]
        var["change"] = pp.reformat_mutations(var["change"], var["type"],
                                              var["gene_id"], chr2gene_pos)
        variants.append(var)
    if not args.no_delly:
        delly_bcf = bam_obj.run_delly()
        deletions = delly_bcf.overlap_bed(conf["bed"])
        for deletion in deletions:
            # Describe the deletion with its own type and region, the same
            # values stored in the record below, rather than with fields of
            # an unrelated small variant.
            tmp_change = pp.reformat_mutations(
                "%(chr)s_%(start)s_%(end)s" % deletion, "large_deletion",
                deletion["region"], chr2gene_pos)
            tmp = {
                "genome_pos": deletion["start"],
                "gene_id": deletion["region"],
                "chr": deletion["chr"],
                "freq": 1,
                "type": "large_deletion",
                "change": tmp_change
            }
            variants.append(tmp)
    output_path = "%s/%s.pp-results.json" % (args.out_dir, args.prefix)
    with open(output_path, "w") as out_handle:
        json.dump(variants, out_handle)
    for x in [
            ".targets.bcf", ".targets.csq.bcf", ".targets.csq.bcf.csi",
            ".targets.delly.bcf", ".targets.delly.bcf.csi",
            ".targets.del_pos.bed", ".targets.gvcf.gz", ".targets.gvcf.gz.csi",
            ".targets.missing.bcf"
    ]:
        # delly outputs do not exist when delly was skipped.
        if args.no_delly and "delly" in x:
            continue
        pp.run_cmd("rm %s%s" % (args.prefix, x))