# NOTE: these examples are excerpts from a larger pipeline; they assume
# module-level imports (pathlib, os, sys, csv, shutil, datetime, time, re,
# json, collections, pandas as pd) and shared helper functions defined
# elsewhere (py3_fasta_iter, fasta_to_dct, try_except_continue_on_fail, ...).
def main(project_path, all_samples_consens_seqs, chosen_ref_scheme, run_name):

    print("aligning consensus sequence from all samples\n")
    tmp_file = pathlib.Path(project_path, "temp_aligned_file.fasta")
    mafft_cmd = f"mafft --globalpair --maxiterate 1000 {str(all_samples_consens_seqs)} > {str(tmp_file)}"

    ref_name, ref_seq = list(py3_fasta_iter(chosen_ref_scheme))[0]
    print(mafft_cmd)
    run = try_except_continue_on_fail(mafft_cmd)
    if not run:
        print(f"could not align {all_samples_consens_seqs}")
        sys.exit("exiting")
    else:
        all_samples_consens_seqs.unlink()
        os.rename(str(tmp_file), str(all_samples_consens_seqs))

        # calculate coverage
        ref_length = len(ref_seq)
        coverage_outfile = pathlib.Path(project_path,
                                        f"{run_name}_genome_coverage.csv")
        all_consensus_d = fasta_to_dct(all_samples_consens_seqs)
        ref_lookup_name = list(all_consensus_d.keys())[0]
        aligned_ref = all_consensus_d[ref_lookup_name]
        del all_consensus_d[ref_lookup_name]
        with open(coverage_outfile, 'w') as fh:
            fh.write("sample_name,genome_coverage\n")
            for v_name, v_seq in all_consensus_d.items():
                seq_coverage = 0
                for i, base in enumerate(v_seq.upper()):
                    if base != "-" and base != "N" and aligned_ref[i] != "-":
                        seq_coverage += 1
                percent_coverage = round((seq_coverage / ref_length) * 100, 2)
                fh.write(f"{v_name},{percent_coverage}\n")

    print("done")
Example #2
def main(project_path, all_samples_consens_seqs, chosen_ref_scheme, run_name):

    print("aligning consensus sequence from all samples\n")
    tmp_file = pathlib.Path(project_path, "temp_aligned_file.fasta")
    mafft_cmd = f"mafft --thread -1 --auto {str(all_samples_consens_seqs)} > {str(tmp_file)}"

    ref_name, ref_seq = list(py3_fasta_iter(chosen_ref_scheme))[0]
    print(mafft_cmd)
    run = try_except_continue_on_fail(mafft_cmd)
    if not run:
        print(f"could not align {all_samples_consens_seqs}")
        sys.exit("exiting")
    else:
        all_samples_consens_seqs.unlink()
        os.rename(str(tmp_file), str(all_samples_consens_seqs))
        # calculate & collect seq stats
        ref_length = len(ref_seq)
        seqstats_outfile = pathlib.Path(project_path,
                                        f"{run_name}_seqstats.csv")
        all_consensus_d = fasta_to_dct(all_samples_consens_seqs)
        ref_lookup_name = list(all_consensus_d.keys())[0]
        aligned_ref = all_consensus_d[ref_lookup_name]
        sample_folder = pathlib.Path(project_path, "samples")
        del all_consensus_d[ref_lookup_name]
        with open(seqstats_outfile, 'w') as fh:
            fh.write("sample_name,genome_coverage,mean_depth\n")
            for v_name, v_seq in all_consensus_d.items():
                print(v_name)
                # strip the consensus-type suffix to recover the sample name
                seqname = v_name[0:-11]
                depth_path = pathlib.Path(sample_folder, seqname,
                                          f"{seqname}_depth.csv")
                mean_depth = ""
                with open(depth_path, "r") as depth_fh:
                    # keep the value column of the last (key, value) row
                    for _, v in csv.reader(depth_fh):
                        mean_depth = v
                seq_coverage = 0
                for i, base in enumerate(v_seq.upper()):
                    if base != "-" and base != "N" and aligned_ref[i] != "-":
                        seq_coverage += 1
                percent_coverage = round((seq_coverage / ref_length) * 100, 2)
                fh.write(f"{v_name},{percent_coverage},{mean_depth}\n")

    print("done")
Example #3
def main(project_path, reference, ref_start, ref_end, min_len, max_len,
         min_depth, run_step, rerun_step_only, basecall_mode, msa_cons, artic,
         cpu_cores, gpu_cores, gpu_buffers, use_gaps, use_bwa, guppy_path,
         real_time):

    threads = cpu_cores

    # set the primer_scheme directory
    script_folder = pathlib.Path(__file__).absolute().parent
    primer_scheme_dir = pathlib.Path(script_folder, "primer-schemes")

    # get folder paths
    project_path = pathlib.Path(project_path).absolute()
    plot_folder = pathlib.Path(project_path, "seq_depth_plots")
    if os.path.exists(plot_folder):
        shutil.rmtree(plot_folder)
    plot_folder.mkdir(mode=0o777, parents=True, exist_ok=True)
    run_name = project_path.parts[-1]
    fast5_dir = pathlib.Path(project_path, "fast5")
    fastq_dir = pathlib.Path(project_path, "fastq")
    sample_names = pathlib.Path(project_path, "sample_names.csv")
    if not sample_names.is_file():
        sys.exit("Could not find sample_names.csv in project folder")
    demultiplexed_folder = pathlib.Path(project_path, "demultiplexed")
    sample_folder = pathlib.Path(project_path, "samples")
    print(sample_folder)
    time_stamp = str('{:%Y-%m-%d_%H_%M}'.format(datetime.datetime.now()))
    log_file = pathlib.Path(project_path,
                            f"{time_stamp}_{run_name}_log_file.txt")

    with open(log_file, "w") as handle:
        handle.write(f"# start of pipeline run for project: {run_name}\n")

    now = datetime.datetime.now()
    date_time = now.strftime("%d/%m/%Y, %H:%M:%S")
    print(f"\nstart time = {date_time}\n\n")
    with open(log_file, "a") as handle:
        handle.write(f"\nstart time = {date_time}\n\n")

    # set dir to project dir so that output is written in correct place by external tools
    os.chdir(project_path)

    # set the reference genome
    reference_scheme = \
        {"ChikECSA_V1_800": pathlib.Path(primer_scheme_dir, "ChikECSA800", "V1", "ChikECSA800.reference.fasta"),
         "ChikAsian_V1_400": pathlib.Path(primer_scheme_dir, "ChikAsian400", "V1", "ChikAsian400.reference.fasta"),
         "ZikaAsian_V1_400": pathlib.Path(primer_scheme_dir, "ZikaAsian400", "V1", "ZikaAsian400.reference.fasta"),
         "SARS2_V1_800": pathlib.Path(primer_scheme_dir, "SARS2_800", "V1", "SARS2_800.reference.fasta"),
         "SARS2_V1_400": pathlib.Path(primer_scheme_dir, "SARS2_400", "V1", "SARS2_400.reference.fasta"),
         "DENV1_V1_400": pathlib.Path(primer_scheme_dir, "DENV1_400", "V1", "DENV1_400.reference.fasta"),
         "DENV1_V2_400": pathlib.Path(primer_scheme_dir, "DENV1_400", "V2", "DENV1_400.reference.fasta"),
         "DENV2_V1_400": pathlib.Path(primer_scheme_dir, "DENV2_400", "V1", "DENV2_400.reference.fasta")
         }

    chosen_ref_scheme = str(reference_scheme[reference])
    chosen_ref_scheme_bed_file = chosen_ref_scheme.replace(
        ".reference.fasta", ".scheme.bed")
    scheme_name = reference.replace("_V1_", "_")
    ref_name, ref_seq = list(py3_fasta_iter(chosen_ref_scheme))[0]
    ref_name = ref_name.split()[0]
    if not ref_start:
        ref_start = 1
    if not ref_end or ref_end > len(ref_seq):
        ref_end = len(ref_seq)
    # reference_slice = f'{ref_name}:{ref_start}-{ref_end}'
    print(f"\nReference is {chosen_ref_scheme}\n")
    print(f"\nPrimer bed file is {chosen_ref_scheme_bed_file}\n")
    with open(log_file, "a") as handle:
        handle.write(
            f"\nReference is {chosen_ref_scheme}\nPrimer bed file is {chosen_ref_scheme_bed_file}\n"
        )

    if run_step == 0:
        run = gupppy_basecall(fast5_dir, guppy_path, fastq_dir, gpu_cores,
                              basecall_mode, real_time, reference,
                              script_folder)
        faildir = pathlib.Path(fastq_dir, "fail")
        if faildir.exists():
            shutil.rmtree(faildir)
        if run and not rerun_step_only:
            run_step = 1
        elif run and rerun_step_only:
            sys.exit("Run step only completed, exiting")
        else:
            sys.exit("Basecalling failed")

    if run_step == 1:
        # demultiplex
        print(f"\nrunning: demultiplexing")
        with open(log_file, "a") as handle:
            handle.write(f"\nrunning: demultiplexing")
        if not list(fastq_dir.glob("*.fastq*")):
            fastq_dir = pathlib.Path(fastq_dir, "pass")
            if not list(fastq_dir.glob("*.fastq*")):
                print(
                    f"No fastq files found in {str(fastq_dir)} or {str(fastq_dir.parent)}"
                )
                sys.exit("fastq files not found")
        run = guppy_demultiplex(fastq_dir, guppy_path, demultiplexed_folder,
                                threads, gpu_buffers, gpu_cores)
        if run and not rerun_step_only:
            run_step = 2
        elif run and rerun_step_only:
            sys.exit("demultiplexing completed, exiting")
        else:
            sys.exit("demultiplexing failed")

    if run_step == 2:

        pre_existing_files = list(demultiplexed_folder.glob("*.fastq"))
        if pre_existing_files:
            print(
                "Found existing files in top level of demultiplex folder.\nThese files will be deleted"
            )
            for file in pre_existing_files:
                os.unlink(str(file))

        for folder in demultiplexed_folder.glob("barcode*"):
            search = list(pathlib.Path(folder).glob("*.fastq"))
            if not search:
                print(f"no files in folder\nskipping folder: {folder}\n")
                continue
            if len(search) > 1:
                barcode_number = pathlib.Path(search[0]).parent.parts[-1]
                concat_outfile = f"cat_barcode_{barcode_number}.fastq"
                cat_cmd = f"cat "
                for file in search:
                    cat_cmd += f"{str(file)} "
                cat_cmd += f" > {concat_outfile}"
                try_except_exit_on_fail(cat_cmd)
                new_name = pathlib.Path(demultiplexed_folder,
                                        f"{run_name}_{barcode_number}.fastq")
                filtered_file = filter_length(concat_outfile, new_name,
                                              max_len, min_len)

                os.unlink(str(concat_outfile))
                if not filtered_file:
                    print(
                        f"no sequences in file after length filtering for {concat_outfile}\n"
                    )

            else:
                file = pathlib.Path(search[0])
                barcode_number = file.parent.parts[-1]
                new_name = pathlib.Path(demultiplexed_folder,
                                        f"{run_name}_{barcode_number}.fastq")

                filtered_file = filter_length(file, new_name, max_len, min_len)

                if not filtered_file:
                    print(
                        f"no sequences in file after length filtering for {file}\n"
                    )

        if not rerun_step_only:
            run_step = 3
        else:
            sys.exit("filtering and renaming demultiplexed files completed, exiting")

    if run_step == 3:
        # concatenate demultiplexed files for each sample and set up sample name and barcode combinations
        print(
            "collecting demultiplexed files into sample.fastq files based on specified sample barcode combinations\n"
        )
        with open(log_file, "a") as handle:
            handle.write(
                f"\ncollecting demultiplexed files into sample.fastq files based on specified sample "
                f"barcode combinations\n")

        sample_names_df = pd.read_csv(sample_names,
                                      sep=None,
                                      keep_default_na=False,
                                      na_values=['NA'],
                                      engine="python")
        sample_names_df['barcode_1'] = sample_names_df['barcode_1'].apply(
            lambda x: cat_sample_names(x, run_name))
        sample_names_df['barcode_2'] = sample_names_df['barcode_2'].apply(
            lambda x: cat_sample_names(x, run_name))
        sample_names_dict = sample_names_df.set_index('sample_name').T.to_dict(
            orient='list')
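        # sample_names.csv is assumed to look like the following, with
        # barcode_2 left blank when a sample used only one barcode:
        #   sample_name,barcode_1,barcode_2
        #   sampleA,barcode01,barcode02
        #   sampleB,barcode03,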

        for sample_name, [barcode_1, barcode_2] in sample_names_dict.items():
            sample_dir = pathlib.Path(sample_folder, sample_name)
            if not sample_dir.exists():
                pathlib.Path(sample_dir).mkdir(mode=0o777,
                                               parents=True,
                                               exist_ok=True)

            # allow for case where only one barcode was specified per sample.
            barcode_1_file = pathlib.Path(demultiplexed_folder, barcode_1)
            if barcode_2 == " ":
                barcode_2_file = ""
            else:
                barcode_2_file = pathlib.Path(demultiplexed_folder, barcode_2)

            cat_outfile = pathlib.Path(sample_dir, f"{sample_name}.fastq")
            cat_cmd = f"cat {str(barcode_1_file)} {str(barcode_2_file)} > {cat_outfile}"
            print(cat_cmd)
            run = try_except_continue_on_fail(cat_cmd)
            if not run:
                print(
                    "missing one or more demultiplexed files for this sample")
                with open(log_file, "a") as handle:
                    handle.write(
                        "\nmissing one or more demultiplexed files for this sample\n"
                    )
                continue

        if not rerun_step_only:
            run_step = 4
        else:
            sys.exit("Run step only completed, exiting")

    if run_step == 4:
        print("Running variant calling on samples")

        with open(log_file, "a") as handle:
            handle.write(f"\nRunning variant calling on samples\n")
        if use_bwa:
            make_index_cmd = f"bwa index {chosen_ref_scheme}"
            with open(log_file, "a") as handle:
                handle.write(f"\n{make_index_cmd}\n")

            try_except_exit_on_fail(make_index_cmd)

        all_sample_files = list(pathlib.Path(sample_folder).glob("*/*.fastq"))
        number_samples = len(all_sample_files)

        # make variable for project file containing all samples' consensus sequences
        project_name = project_path.parts[-1]
        all_samples_consens_seqs = pathlib.Path(
            project_path, project_name + "_all_samples.fasta")

        # initialize the file, and add reference to all consensus file
        with open(all_samples_consens_seqs, 'w') as fh:
            fh.write(f">{ref_name}\n{ref_seq}\n")
        # create an empty mapping csv for the project
        mapping_file = pathlib.Path(project_path, project_name + '_mapping.csv')
        open(mapping_file, 'w').close()

        samples_run = 1
        old_number_png_files = 0
        for sample_fastq in all_sample_files:
            # get folder paths
            sample_folder = pathlib.Path(sample_fastq).parent
            sample_name = pathlib.Path(sample_fastq).stem
            os.chdir(sample_folder)
            seq_summary_file_name = ""
            for file in project_path.glob('sequencing_summary*.txt'):
                seq_summary_file_name = file
            seq_summary_file = pathlib.Path(seq_summary_file_name).resolve()
            artic_folder = pathlib.Path(sample_folder, "artic")
            if os.path.exists(artic_folder):
                shutil.rmtree(artic_folder)
            artic_folder.mkdir(mode=0o777, parents=True, exist_ok=True)

            # check if fastq is present
            if not sample_fastq.is_file():
                print(
                    f"\nCould not find the concatenated sample fastq file: {sample_fastq}\nskipping sample"
                )
                with open(log_file, "a") as handle:
                    handle.write(
                        f"\nCould not find the concatenated sample fastq file: {sample_fastq}\nskipping sample"
                    )
                continue
            print(
                f"\n________________\n\nStarting processing sample: {sample_name}\n________________\n"
            )
            with open(log_file, "a") as handle:
                handle.write(
                    f"\n________________\n\nStarting processing sample: {sample_name}\n________________\n"
                )

            # start artic pipeline in new window
            if artic:
                print(f"\n------->Running Artic's pipeline in new window\n")
                with open(log_file, "a") as handle:
                    handle.write(
                        f"\n------->Running Artic's pipeline in new window\n\n"
                    )

                artic_cmd = f"artic minion --normalise 400 --threads {threads} --scheme-directory ~/artic-ncov2019/primer_schemes " \
                            f"--read-file {sample_fastq} --fast5-directory {fast5_dir} " \
                            f"--sequencing-summary {seq_summary_file} {scheme_name} {sample_name} " \
                            f"2>&1 | tee -a {log_file}"
                print(artic_cmd)
                try_except_continue_on_fail(
                    f"gnome-terminal -- /bin/sh -c 'conda run -n artic-ncov2019 {artic_cmd}'"
                )

                last_file_made = pathlib.Path(
                    sample_folder, sample_name + ".muscle.out.fasta")
                while not last_file_made.exists():
                    time.sleep(5)
                else:
                    time.sleep(2)
                    all_files = os.listdir(sample_folder)

                    # write consensus to master consensus file
                    artic_cons_file = pathlib.Path(
                        sample_folder, sample_name + ".consensus.fasta")
                    artic_d = fasta_to_dct(artic_cons_file)
                    with open(all_samples_consens_seqs, 'a') as fh:
                        for name, seq in artic_d.items():
                            newname = re.sub("/ARTIC.*", "_art", name)
                            fh.write(f">{newname}\n{seq.replace('-', '')}\n")

                    for filename in all_files:
                        if os.path.isfile(
                                filename) and not filename.endswith('.fastq'):
                            file = os.path.join(sample_folder, filename)
                            shutil.move(file, artic_folder)

            # start majority consensus pipeline in new window
            if msa_cons:

                print(
                    f"\n\n------->Running majority consensus pipeline in new window\n"
                )
                with open(log_file, "a") as handle:
                    handle.write(
                        f"\n\n------->Running majority consensus pipeline in new window\n"
                    )

                majority_cmd = f"python ~/nanopore_pipeline_wrapper/msa_consensus.py -in {sample_fastq} -pf {plot_folder} -lf {log_file} " \
                               f"{use_bwa} -rs {chosen_ref_scheme} -bf {chosen_ref_scheme_bed_file} " \
                               f"-t {threads} -d {min_depth} {use_gaps} -ac {all_samples_consens_seqs}"

                print(majority_cmd)
                try_except_continue_on_fail(
                    f"gnome-terminal -- /bin/sh -c 'conda run -n nanop {majority_cmd}'"
                )

                # open(f"{sample_name}_msa_from_bam_file.fasta", "w+")
                last_file_made_2 = pathlib.Path(
                    sample_folder, sample_name + "_msa_from_bam_file.fasta")
                while not last_file_made_2.exists():
                    time.sleep(5)
                else:
                    if samples_run + 1 <= number_samples:
                        print(f"\ncontinuing with sample {samples_run + 1}\n")

                # keep threads balanced
                number_png_files = len(
                    list(plot_folder.glob('*_sequencing_depth.png')))
                print(f'{number_png_files} png files created')
                difference = number_png_files - old_number_png_files
                old_number_png_files = number_png_files
                threads = threads - 1 + difference
                samples_run += 1

        # run sample summary as soon as all sequencing_depth.png files are made
        number_pngs = len(list(plot_folder.glob('*_sequencing_depth.png')))
        if number_pngs < number_samples:
            print('waiting for all msa to be completed')
        while number_pngs < number_samples:
            time.sleep(10)
            number_pngs = len(list(plot_folder.glob('*_sequencing_depth.png')))
        sample_summary(project_path, all_samples_consens_seqs,
                       chosen_ref_scheme, run_name)

    now = datetime.datetime.now()
    date_time = now.strftime("%d/%m/%Y, %H:%M:%S")
    print(f"\nend time = {date_time}\n\n")
    with open(log_file, "a") as handle:
        handle.write(f"\nend time = {date_time}\n\n")

    print("sample processing completed\n")
    with open(log_file, "a") as handle:
        handle.write(f"\nsample processing completed\n\n")

    # compress fast5 files
    targzpath = pathlib.Path(project_path.parent, run_name + ".tar.gz")
    tarcmd = f"tar cf - {fast5_dir} | pigz -7 -p 16  > {targzpath}"
    try_except_exit_on_fail(tarcmd)
    print(tarcmd)
    with open(log_file, "a") as handle:
        handle.write(f"\n{tarcmd}\n\n")
Example #4
def main(infile, plot_folder, log_file, use_minmap2, chosen_ref_scheme,
         chosen_ref_scheme_bed_file, threads, msa_cons_also, min_depth,
         use_gaps, all_samples_consens_seqs):
    # force absolute file paths
    sample_fastq = pathlib.Path(infile).absolute()
    script_folder = pathlib.Path(__file__).absolute().parent
    if not sample_fastq.is_file():
        print(
            f"could not find the concatenated sample fastq file: {sample_fastq}\nskipping sample"
        )
        with open(log_file, "a") as handle:
            handle.write(
                f"could not find the concatenated sample fastq file: {sample_fastq}\nskipping sample"
            )
        return False

    # set the reference coordinates to use
    ref_name, ref_seq = list(py3_fasta_iter(chosen_ref_scheme))[0]
    ref_name = ref_name.split()[0]
    reference_slice = f"{ref_name}:0-{len(ref_seq)}"

    # set input and output file name
    sample_name = pathlib.Path(sample_fastq).stem
    sample_folder = pathlib.Path(sample_fastq).parent
    sam_name = pathlib.Path(sample_folder, sample_name + "_mapped.sam")
    trimmed_sam_file = pathlib.Path(sample_folder,
                                    sample_name + ".primerclipped.sam")
    trimmed_bam_file = pathlib.Path(sample_folder,
                                    sample_name + ".primerclipped.bam")
    sorted_trimmed_bam_file = pathlib.Path(
        sample_folder, sample_name + ".primerclipped_sorted.bam")
    bcftools_vcf_file = pathlib.Path(sample_folder,
                                     sample_name + "_bcftools.vcf")
    bcftools_cons_file = pathlib.Path(
        sample_folder, sample_name + "_consensus_bcftools.fasta")
    msa_fasta = pathlib.Path(sample_folder,
                             sample_name + "_msa_from_bam_file.fasta")
    msa_cons = pathlib.Path(sample_folder,
                            sample_name + "_msa_consensus.fasta")

    # make sure cwd is the sample folder, as some programs output to cwd
    os.chdir(sample_folder)

    print(
        f"\n\n________________\nStarting processing sample: {sample_name}\n\n________________\n"
    )
    with open(log_file, "a") as handle:
        handle.write(
            f"\n\n________________\nStarting processing sample: {sample_name}\n\n________________\n"
        )
    if use_minmap2:
        # run read mapping using minimap2 (map-ont is minimap2's preset for
        # mapping nanopore reads to a reference; ava-ont is for read overlaps)
        print(f"\nrunning: minimap2 read mapping\n")
        minimap2_cmd = f"minimap2 -a -Y -t {threads} -x map-ont {chosen_ref_scheme} {sample_fastq} -o {sam_name} " \
                       f"2>&1 | tee -a {log_file}"
        print("\n", minimap2_cmd, "\n")
        with open(log_file, "a") as handle:
            handle.write(f"\nrunning: minimap2 read mapping\n")
            handle.write(f"{minimap2_cmd}\n")
        run = try_except_continue_on_fail(minimap2_cmd)
        if not run:
            return False
    else:
        # run read mapping using bwa
        print(f"\nrunning: bwa read mapping\n")
        bwa_cmd = f"bwa mem -t {threads} -x ont2d {chosen_ref_scheme} {sample_fastq} -o {sam_name} " \
                  f"2>&1 | tee -a {log_file}"
        print("\n", bwa_cmd, "\n")
        with open(log_file, "a") as handle:
            handle.write(f"\nrunning: bwa read mapping\n")
            handle.write(f"{bwa_cmd}\n")
        run = try_except_continue_on_fail(bwa_cmd)
        if not run:
            return False

    # remove primer sequences with custom script
    print(f"\nrunning: trim primer sequences from bam file\n")
    trim_script = pathlib.Path(script_folder, "src",
                               "clip_primers_from_bed_file.py")
    trim_primer = f"python {trim_script} -in {sam_name} -o {trimmed_sam_file} " \
                  f"-b {chosen_ref_scheme_bed_file} 2>&1 | tee -a {log_file}"
    print("\n", trim_primer, "\n")
    with open(log_file, "a") as handle:
        handle.write(
            f"\nrunning: soft clipping primer sequences from bam file\n")
        handle.write(f"{trim_primer}\n")
    run = try_except_continue_on_fail(trim_primer)
    if not run:
        return False

    # convert sam to bam
    print(f"\nrunning: sam to bam conversion of trimmed file")
    sam_bam_cmd = f"samtools view -bS {trimmed_sam_file} -o {trimmed_bam_file} 2>&1 | tee -a {log_file}"
    print("\n", sam_bam_cmd, "\n")
    with open(log_file, "a") as handle:
        handle.write(f"\nrunning: sam to bam conversion\n")
        handle.write(f"{sam_bam_cmd}\n")
    run = try_except_continue_on_fail(sam_bam_cmd)
    if not run:
        return False

    # sort bam file
    print(f"\nrunning: sorting bam file")
    sort_sam_cmd = f"samtools sort -T {sample_name} {trimmed_bam_file} -o {sorted_trimmed_bam_file} " \
                   f"2>&1 | tee -a {log_file}"
    print("\n", sort_sam_cmd, "\n")
    with open(log_file, "a") as handle:
        handle.write(f"\nrunning: sorting bam file\n{sort_sam_cmd}\n")
    run = try_except_continue_on_fail(sort_sam_cmd)
    if not run:
        return False

    # index trimmed bam file
    print(f"\nrunning: indexing bam file")
    index_bam_cmd = f"samtools index {sorted_trimmed_bam_file} 2>&1 | tee -a {log_file}"
    print("\n", index_bam_cmd, "\n")
    with open(log_file, "a") as handle:
        handle.write(f"\nrunning: indexing bam file\n")
        handle.write(f"{index_bam_cmd}\n")
    run = try_except_continue_on_fail(index_bam_cmd)
    if not run:
        return False

    # make bcftools consensus
    print(f"\nrunning: making consensuses sequence from bcftools\n")
    min_base_qual = 30  # default=13
    p_val_of_variant = 0.2  # default=0.5
    bcf_vcf_cmd = f"bcftools mpileup --threads {threads} --max-depth 10000 --min-BQ {min_base_qual} -Oz " \
                  f"-f {chosen_ref_scheme} {sorted_trimmed_bam_file} | bcftools call -c -p {p_val_of_variant} " \
                  f"--ploidy 1 -Oz -o {bcftools_vcf_file} 2>&1 | tee -a {log_file}"
    bcf_index_cmd = f"bcftools index {bcftools_vcf_file} 2>&1 | tee -a {log_file}"
    bcf_cons_cmd = f"bcftools consensus -H A -f {chosen_ref_scheme} {bcftools_vcf_file} " \
                   f"-o {bcftools_cons_file} 2>&1 | tee -a {log_file}"
    with open(log_file, "a") as handle:
        handle.write(
            f"\nrunning: making consensuses sequence from bcftools:\n")
        handle.write(f"{bcf_vcf_cmd}\n\n{bcf_index_cmd}\n\n{bcf_cons_cmd}\n")
    run = try_except_continue_on_fail(bcf_vcf_cmd)
    if not run:
        return False
    run = try_except_continue_on_fail(bcf_index_cmd)
    if not run:
        return False
    run = try_except_continue_on_fail(bcf_cons_cmd)
    if not run:
        return False

    # rename the fasta header to the sample name
    rename_fasta(bcftools_cons_file, sample_name, "bcftools_cons")
    bcf_cons_d = fasta_to_dct(bcftools_cons_file)

    # write consensus to master consensus file
    with open(all_samples_consens_seqs, 'a') as fh:
        for name, seq in bcf_cons_d.items():
            fh.write(f">{name}\n{seq.replace('-', '')}\n")

    # generate manual vcf consensus and seq depth + qual output
    depth_qual_outfile = vcf_processing(bcftools_vcf_file, min_depth,
                                        sample_folder)
    vcf_plots(depth_qual_outfile, plot_folder)

    # get json dump of reads and primer pairs
    json_files = list(
        pathlib.Path(sample_folder).glob("*read_primer_pair_lookup.json"))
    if not json_files:
        print("the json file containing primer pair depth info was not found")
        return False
    with open(str(json_files[0]), 'r') as jd:
        read_primer_pairs_dct = json.load(jd)

    primer_pair_depth_outfile = pathlib.Path(
        plot_folder, sample_name + "_per_primer_depth.png")

    primer_pairs = []
    primers_depth = []
    for primer_pair, names_list in read_primer_pairs_dct.items():
        primers_depth.append(len(names_list))
        primer_pairs.append(primer_pair)

    # guard against an empty or all-zero depth profile
    max_depth = max(primers_depth, default=0)
    percent_primers_depth = [
        round(val / max_depth * 100, 2) if max_depth else 0.0
        for val in primers_depth
    ]
    primers_and_depths = zip(primer_pairs, primers_depth)

    plot_primer_depth(primer_pairs, primers_depth, percent_primers_depth,
                      sample_name, primer_pair_depth_outfile)

    if msa_cons_also:
        # convert bam file to a multi fasta alignment
        print(
            f"\nrunning: making consensuses sequence from bam to MSA with jvarkit\n"
        )

        sam4web = pathlib.Path(script_folder, "jvarkit", "dist",
                               "sam4weblogo.jar")
        msa_from_bam = f"java -jar {sam4web} -r '{reference_slice}' -o {msa_fasta} " \
                       f"{sorted_trimmed_bam_file} 2>&1 | tee -a {log_file}"
        print(msa_from_bam)

        with open(log_file, "a") as handle:
            handle.write(
                f"\nrunning: making consensuses sequence from bam to MSA with jvarkit\n"
            )
            handle.write(f"{msa_from_bam}\n")
        run = try_except_continue_on_fail(msa_from_bam)
        if not run:
            return False

        # convert multi fasta alignment to consensus sequence
        fasta_msa_d = fasta_to_dct(msa_fasta)

        if len(fasta_msa_d) == 0:
            print(
                f"{sam_name} alignment had no sequences\nskipping to next sample\n"
            )
            with open(log_file, "a") as handle:
                handle.write(
                    f"{sam_name} alignment had no sequences\nskipping to next sample\n"
                )
            return False

        # set minimum depth for calling a position in the consensus sequence per primer region
        positional_depth = collections.defaultdict(int)
        for (primerpair, depth) in primers_and_depths:
            start_pos = int(primerpair.split("_")[0])
            end_pos = int(primerpair.split("_")[1])
            for i in range(start_pos, end_pos + 1):
                positional_depth[str(i).zfill(4)] += depth
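        # positional_depth maps a zero-padded position string (e.g. "0042") to
        # the summed read depth of all primer pairs spanning that position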

        # build the consensus sequence
        try:
            cons, depth_profile = consensus_maker(fasta_msa_d,
                                                  positional_depth, min_depth,
                                                  use_gaps)
        except IndexError as e:
            with open(log_file, "a") as handle:
                handle.write(
                    f"\nNo MSA made from Bam file\nno reads may have been mapped\n{e}\n"
                )
        else:
            with open(msa_cons, 'w') as handle:
                handle.write(f">{sample_name}\n{cons}\n")

            # write consensus to master consensus file
            with open(all_samples_consens_seqs, 'a') as fh:
                fh.write(f">{sample_name}\n{cons.replace('-', '')}\n")

            # plot depth for sample
            depth_list = depth_profile["non_gap"]
            depth_outfile = pathlib.Path(plot_folder,
                                         sample_name + "_sequencing_depth.png")
            plot_depth(depth_list, sample_name, depth_outfile)

    print(f"Completed processing sample: {sample_name}\n\n")
    with open(log_file, "a") as handle:
        handle.write(
            f"\n\n________________\nCompleted processing sample: {sample_name}\n\n________________\n"
        )

    print("done")
Example #5
def main(project_path, sample_names, reference, ref_start, ref_end, min_len, max_len, min_depth, run_step,
         rerun_step_only, basecall_mode, msa_cons_only, threads, gpu_cores, gpu_buffers, use_gaps, use_minmap2,
         guppy_path, real_time):

    # set the primer_scheme directory
    script_folder = pathlib.Path(__file__).absolute().parent
    primer_scheme_dir = pathlib.Path(script_folder, "primer-schemes")

    # get folder paths
    project_path = pathlib.Path(project_path).absolute()
    plot_folder = pathlib.Path(project_path, "seq_depth_plots")
    plot_folder.mkdir(mode=0o777, parents=True, exist_ok=True)
    run_name = project_path.parts[-1]
    fast5_dir = pathlib.Path(project_path, "fast5")
    fastq_dir = pathlib.Path(project_path, "fastq")
    sequencing_summary_file = pathlib.Path(fastq_dir, "sequencing_summary.txt")
    sample_names = pathlib.Path(sample_names).absolute()
    demultiplexed_folder = pathlib.Path(project_path, "demultiplexed")
    sample_folder = pathlib.Path(project_path, "samples")
    master_reads_file = pathlib.Path(project_path, run_name + "_all.fastq")
    time_stamp = str('{:%Y-%m-%d_%H_%M}'.format(datetime.datetime.now()))
    log_file = pathlib.Path(project_path, f"{time_stamp}_{run_name}_log_file.txt")

    with open(log_file, "w") as handle:
        handle.write(f"# start of pipeline run for project: {run_name}\n")

    # set dir to project dir so that output is written in correct place by external tools
    os.chdir(project_path)

    # set the reference genome
    reference_scheme = \
        {"ChikECSA_V1_800": pathlib.Path(primer_scheme_dir, "ChikECSA800", "V1", "ChikECSA800.reference.fasta"),
         "ChikAsian_V1_400": pathlib.Path(primer_scheme_dir, "ChikAsian400", "V1", "ChikAsian400.reference.fasta"),
         "ZikaAsian_V1_400": pathlib.Path(primer_scheme_dir, "ZikaAsian400", "V1", "ZikaAsian400.reference.fasta"),
         "SARS2_V1_800": pathlib.Path(primer_scheme_dir, "SARS2_800", "V1", "SARS2_800.reference.fasta"),
         "SARS2_V1_400": pathlib.Path(primer_scheme_dir, "SARS2_400", "V1", "SARS2_400.reference.fasta"),
         }

    chosen_ref_scheme = str(reference_scheme[reference])
    chosen_ref_scheme_bed_file = chosen_ref_scheme.replace(".reference.fasta", ".scheme.bed")
    ref_name, ref_seq = list(py3_fasta_iter(chosen_ref_scheme))[0]
    ref_name = ref_name.split()[0]
    if not ref_start:
        ref_start = 1
    if not ref_end or ref_end > len(ref_seq):
        ref_end = len(ref_seq)
    reference_slice = f'{ref_name}:{ref_start}-{ref_end}'
    print(f"\nReference is {chosen_ref_scheme}\n")
    print(f"\nPrimer bed file is {chosen_ref_scheme_bed_file}\n")
    with open(log_file, "a") as handle:
        handle.write(f"\nReference is {chosen_ref_scheme}\nPrimer bed file is {chosen_ref_scheme_bed_file}\n")

    if run_step == 0:
        run = gupppy_basecall(fast5_dir, guppy_path, fastq_dir, gpu_cores, basecall_mode, real_time, reference, script_folder)
        faildir = pathlib.Path(fastq_dir, "fail")
        if faildir.exists():
            shutil.rmtree(faildir)
        if run and not rerun_step_only:
            run_step = 1
        elif run and rerun_step_only:
            sys.exit("Run step only completed, exiting")
        else:
            sys.exit("Basecalling failed")


    if run_step == 1:
        # demultiplex
        print(f"\nrunning: demultiplexing")
        with open(log_file, "a") as handle:
            handle.write(f"\nrunning: demultiplexing")
        if not list(fastq_dir.glob("*.fastq*")):
            fastq_dir = pathlib.Path(fastq_dir, "pass")
            if not list(fastq_dir.glob("*.fastq*")):
                print(f"No fastq files found in {str(fastq_dir)} or {str(fastq_dir.parent)}")
                sys.exit("fastq files not found")
        run = guppy_demultiplex(fastq_dir, guppy_path, demultiplexed_folder, threads, gpu_buffers, gpu_cores)
        if run and not rerun_step_only:
            run_step = 2
        elif run and rerun_step_only:
            sys.exit("demultiplexing completed, exiting")
        else:
            sys.exit("demultiplexing failed")

    if run_step == 2:

        pre_existing_files = list(demultiplexed_folder.glob("*.fastq"))
        if pre_existing_files:
            print("Found existing files in top level of demultiplex folder.\nThese files will be deleted")
            for file in pre_existing_files:
                os.unlink(str(file))

        for folder in demultiplexed_folder.glob("barcode*"):
            search = list(pathlib.Path(folder).glob("*.fastq"))
            if not search:
                print(f"no files in folder\nskipping folder: {folder}\n")
                continue
            if len(search) > 1:
                barcode_number = pathlib.Path(search[0]).parent.parts[-1]
                concat_outfile = f"cat_barcode_{barcode_number}.fastq"
                cat_cmd = f"cat "
                for file in search:
                    cat_cmd += f"{str(file)} "
                cat_cmd += f" > {concat_outfile}"
                try_except_exit_on_fail(cat_cmd)
                new_name = pathlib.Path(demultiplexed_folder, f"{run_name}_{barcode_number}.fastq")
                filtered_file = filter_length(concat_outfile, new_name, max_len, min_len)

                os.unlink(str(concat_outfile))
                if not filtered_file:
                    print(f"no sequences in file after length filtering for {concat_outfile}\n")

            else:
                file = pathlib.Path(search[0])
                barcode_number = file.parent.parts[-1]
                new_name = pathlib.Path(demultiplexed_folder, f"{run_name}_{barcode_number}.fastq")

                filtered_file = filter_length(file, new_name, max_len, min_len)

                if not filtered_file:
                    print(f"no sequences in file after length filtering for {file}\n")

        if not rerun_step_only and not msa_cons_only:
            run_step = 3
        elif not rerun_step_only and msa_cons_only:
            run_step = 4
        else:
            sys.exit("filtering and renaming demultiplexed files completed, exiting")

    if run_step == 3 and not msa_cons_only:
        # index concatenated fastq with nanopolish
        print(f"\nrunning: nanopolish index on fast5/fastq files")
        with open(log_file, "a") as handle:
            handle.write(f"\nrunning: nanopolish index on fast5/fastq files\n")
            if not sequencing_summary_file.is_file():
                handle.write(f"\nSequencing summary file not found")
                nanopolish_index_cmd = f"nanopolish index -d {fast5_dir} {master_reads_file} "
            else:
                nanopolish_index_cmd = f"nanopolish index -s {sequencing_summary_file} -d {fast5_dir} " \
                    f"{master_reads_file} "
        try_except_exit_on_fail(nanopolish_index_cmd)
        if not rerun_step_only:
            run_step = 4
        else:
            sys.exit("Run step only completed, exiting")

    if run_step == 4:
        # concatenate demultiplexed files for each sample and set up sample name and barcode combinations
        print("collecting demultiplexed files into sample.fastq files based on specified sample barcode combinations\n")
        with open(log_file, "a") as handle:
            handle.write(f"\ncollecting demultiplexed files into sample.fastq files based on specified sample "
                         f"barcode combinations\n")

        sample_names_df = pd.read_csv(sample_names, sep=None, keep_default_na=False, na_values=['NA'], engine="python")
        sample_names_df['barcode_1'] = sample_names_df['barcode_1'].apply(lambda x: cat_sample_names(x, run_name))
        sample_names_df['barcode_2'] = sample_names_df['barcode_2'].apply(lambda x: cat_sample_names(x, run_name))
        sample_names_dict = sample_names_df.set_index('sample_name').T.to_dict(orient='list')

        for sample_name, [barcode_1, barcode_2] in sample_names_dict.items():
            sample_dir = pathlib.Path(sample_folder, sample_name)
            if not sample_dir.exists():
                pathlib.Path(sample_dir).mkdir(mode=0o777, parents=True, exist_ok=True)

            # allow for case where only one barcode was specified per sample.
            barcode_1_file = pathlib.Path(demultiplexed_folder, barcode_1)
            if barcode_2 == " ":
                barcode_2_file = ""
            else:
                barcode_2_file = pathlib.Path(demultiplexed_folder, barcode_2)

            cat_outfile = pathlib.Path(sample_dir, f"{sample_name}.fastq")
            cat_cmd = f"cat {str(barcode_1_file)} {str(barcode_2_file)} > {cat_outfile}"
            print(cat_cmd)
            run = try_except_continue_on_fail(cat_cmd)
            if not run:
                print("missing one or more demultiplexed files for this sample")
                with open(log_file, "a") as handle:
                    handle.write("\nmissing one or more demultiplexed files for this sample\n")
                continue
        for fastq in demultiplexed_folder.glob('*.fastq'):
            os.remove(str(fastq))
        if not rerun_step_only:
            run_step = 5
        else:
            sys.exit("Run step only completed, exiting")

    if run_step == 5:
        print("Running variant calling on samples")
        with open(log_file, "a") as handle:
            handle.write(f"\nRunning variant calling on samples\n")
        if not use_minmap2:
            make_index_cmd = f"bwa index {chosen_ref_scheme}"
            with open(log_file, "a") as handle:
                handle.write(f"\n{make_index_cmd}\n")

            try_except_exit_on_fail(make_index_cmd)

        all_sample_files = pathlib.Path(sample_folder).glob("*/*.fastq")

        # make variable for project file containing all samples' consensus sequences
        project_name = project_path.parts[-1]
        all_samples_consens_seqs = pathlib.Path(project_path, project_name + "_all_samples.fasta")

        # initialize the file, and add reference to all consensus file
        with open(all_samples_consens_seqs, 'w') as fh:
            fh.write(f">{ref_name}\n{ref_seq}\n")
        # create an empty mapping csv for the project
        mapping_file = pathlib.Path(project_path, project_name + '_mapping.csv')
        open(mapping_file, 'w').close()

        for sample_fastq in all_sample_files:
            if not sample_fastq.is_file():
                print(f"could not find the concatenated sample fastq file: {sample_fastq}\nskipping sample")
                with open(log_file, "a") as handle:
                    handle.write(f"could not find the concatenated sample fastq file: {sample_fastq}\nskipping sample")
                continue
            run = sample_analysis(sample_fastq, plot_folder, log_file, use_minmap2, chosen_ref_scheme,
                                  chosen_ref_scheme_bed_file, threads, msa_cons_only, min_depth, use_gaps,
                                  all_samples_consens_seqs)
            if not run:
                continue

        # align the master consensus file
        sample_summary(project_path, all_samples_consens_seqs, chosen_ref_scheme, run_name)

    print("sample processing completed\n")
    with open(log_file, "a") as handle:
        handle.write(f"\nsample processing completed\n\n")

    targzpath = pathlib.Path(project_path.parent, run_name + ".tar.gz")
    tarcmd = f"tar cf - {fast5_dir} | pigz -7 -p 16  > {targzpath}"
    try_except_exit_on_fail(tarcmd)
    print(tarcmd)
    with open(log_file, "a") as handle:
        handle.write(f"\n{tarcmd}\n\n")