コード例 #1
0
def bin_fastqa_file(fq_fa_lst, tax_annot_res_dir, sens, n_thr, min_qual,
                    min_qlen, min_pident, min_coverage, no_trash):
    # Function for parallel binning FASTQ and FASTA files.
    # Actually bins multiple files.
    #
    # :param fq_fa_lst: lsit of paths to FASTQ (of FASTA) file meant to be processed;
    # :type fq_fa_lst: list<str>;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: loical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;

    outdir_path = os.path.dirname(
        logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0  # counter for sequences, which pass filters
    QL_seqs_fail = 0  # counter for too short or too low-quality sequences
    align_seqs_fail = 0  # counter for sequences, which align to their best hit with too low identity or coverage

    for fq_fa_path in fq_fa_lst:

        new_dpath = get_curr_res_dpath(fq_fa_path, tax_annot_res_dir)
        tsv_res_fpath = get_res_tsv_fpath(new_dpath)
        taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy",
                                     "taxonomy.tsv")
        resfile_lines = configure_resfile_lines(tsv_res_fpath, sens,
                                                taxonomy_path)

        # Configure path to trash file
        if is_fastq(fq_fa_path):
            seq_records_generator = fastq_records
            write_fun = write_fastq_record
        else:
            seq_records_generator = fasta_records
            write_fun = write_fasta_record
        # end if

        # Make filter for quality and length
        QL_filter = get_QL_filter(fq_fa_path, min_qual, min_qlen)
        # Configure path to trash file
        if not no_trash:
            QL_trash_fpath = get_QL_trash_fpath(
                fq_fa_path,
                outdir_path,
                min_qual,
                min_qlen,
            )
        else:
            QL_trash_fpath = None
        # end if

        # Make filter for identity and coverage
        align_filter = get_align_filter(min_pident, min_coverage)
        # Configure path to this trash file
        if not no_trash:
            align_trash_fpath = get_align_trash_fpath(fq_fa_path, outdir_path,
                                                      min_pident, min_coverage)
        else:
            align_trash_fpath = None
        # end if

        # Create an iterator that will yield records
        seq_records_iterator = iter(seq_records_generator(fq_fa_path))
        # Dict for storing batches of sequences meant to be written to output files:
        to_write = dict()
        stop = False  # for outer while-loop

        while not stop:

            # Extract batch of records of 'n_thr' size and find their destination paths:
            for _ in range(n_thr):

                try:
                    fastqa_rec = next(seq_records_iterator)
                except StopIteration:
                    stop = True  # for outer while-loop
                    break
                # end try

                read_name = sys.intern(fmt_read_id(
                    fastqa_rec["seq_id"])[1:])  # get ID of the sequence

                try:
                    hit_names, *vals_to_filter = resfile_lines[
                        read_name]  # find hit corresponding to this sequence
                except KeyError:
                    printlog_error_time("Error: read `{}` not found in TSV file containing taxonomic annotation."\
                        .format(read_name))
                    printlog_error("This TSV file: `{}`".format(tsv_res_fpath))
                    printlog_error(
                        "Make sure that this read has been already processed by \
`barapost-prober.py` and `barapost-local.py`.")
                    platf_depend_exit(1)
                # end try

                # If read is found in TSV file:
                if not QL_filter(vals_to_filter):
                    # Place this sequence to QL trash file
                    to_write[read_name] = (fastqa_rec, QL_trash_fpath)
                    QL_seqs_fail += 1
                elif not align_filter(vals_to_filter):
                    # Place this sequence to QL trash file
                    to_write[read_name] = (fastqa_rec, align_trash_fpath)
                    align_seqs_fail += 1
                else:
                    for hit_name in hit_names.split("&&"):
                        # Get name of result FASTQ file to write this read in
                        binned_file_path = os.path.join(
                            outdir_path, "{}.fast{}".format(
                                hit_name,
                                'q' if is_fastq(fq_fa_path) else 'a'))
                        to_write[read_name] = (fastqa_rec, binned_file_path)
                    # end for
                    seqs_pass += 1
                # end if
            # end for

            # Write batch of records to output files:
            with write_lock:
                for record, fpath in to_write.values():
                    write_fun(fpath, record)
                # end for
            # end with
            to_write.clear()
        # end while

        with write_lock:
            # Write the rest of 'uneven' data to output files:
            if len(to_write) != 0:
                for record, fpath in to_write.values():
                    write_fun(fpath, record)
                # end for
            # end if
            sys.stdout.write('\r')
            printlog_info_time("File `{}` is binned.".format(
                os.path.basename(fq_fa_path)))
            printn(" Working...")
        # end with
    # end for

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)
コード例 #2
0
        if probing_batch_size % packet_size > 0:  # and this is ceiling
            packs_at_all += 1
        # end if

        # Pretty string to print "Request 3 out of 7" if not 'send_all'
        out_of_n = {
            "msg": " out of {}".format(packs_at_all),
            "npacks": packs_at_all
        }
    # enf if

    # Ordinal number of packet meant to be sent (will incrementing after every successful submission)
    pack_to_send = [1]  # it should be mutable

    # Choose appropriate record generator:
    packet_generator = fastq_packets if is_fastq(fq_fa_path) else fasta_packets

    # Iterate over packets in current file
    for packet in packet_generator(fq_fa_path, packet_size, num_done_seqs,
                                   packet_mode, saved_packet_size,
                                   saved_packet_mode, max_seq_len,
                                   probing_batch_size):

        # Assumption that we need to submit current packet (that we cannot just request for results)
        send = True

        # If current packet has been already send, we can try just to request for results
        if not saved_RID is None:
            send = retrieve_ready_job(saved_RID, packet, packet_size,
                                      packet_mode, pack_to_send,
                                      seqs_processed, fq_fa_path, tmp_fpath,
コード例 #3
0
def process_paral(fq_fa_list, packet_size, tax_annot_res_dir, blast_algorithm,
                  use_index, db_path, nfiles):
    # Function performs 'many_files'-parallel mode of barapost-local.py.

    # :param fq_fa_list: list of paths to FASTA and FASTQ files meant to be processed;
    # :type fq_fa_list: list<str>;
    # :param packet_size: number of sequences processed by blast in a single launching;
    # :type packet_size: int;
    # :param tax_annot_res_dir: path to ouput directory that contains taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param blast_algorithm: blast algorithm to use;
    # :type blast_algorithm: str;
    # :param use_index: logic value indicationg whether to use indes;
    # :type use_index: bool;
    # :param db_path: path to database;
    # :type db_path: str;
    # :param nfiles: total number of files;
    # :type nfiles: int;

    queries_tmp_dir = os.path.join(tax_annot_res_dir, "queries-tmp")

    # Iterate over source FASTQ and FASTA files
    for fq_fa_path in fq_fa_list:

        # Create the result directory with the name of FASTQ of FASTA file being processed:
        new_dpath = create_result_directory(fq_fa_path, tax_annot_res_dir)

        # "hname" means human readable name (i.e. without file path and extention)
        infile_hname = os.path.basename(fq_fa_path)
        infile_hname = re.search(r"(.+)\.(m)?f(ast)?(a|q)(\.gz)?$",
                                 infile_hname).group(1)

        # Look around and ckeck if there are results of previous runs of this script
        # If 'look_around' is None -- there is no data from previous run
        previous_data = look_around(new_dpath, fq_fa_path)

        if previous_data is None:  # If there is no data from previous run
            num_done_seqs = 0  # number of successfully processed sequences
            tsv_res_path = os.path.join(
                new_dpath, "classification.tsv")  # form result tsv file path
        else:  # if there is data from previous run
            num_done_seqs = previous_data[
                "n_done_reads"]  # get number of successfully processed sequences
            tsv_res_path = previous_data[
                "tsv_respath"]  # result tsv file sholud be the same as during previous run
        # end if

        how_to_open = OPEN_FUNCS[is_gzipped(fq_fa_path)]
        fmt_func = FORMATTING_FUNCS[is_gzipped(fq_fa_path)]

        if is_fastq(fq_fa_path):
            packet_generator = fastq_packets
            num_seqs = sum(
                1
                for line in how_to_open(fq_fa_path)) // 4  # 4 lines per record
        else:
            packet_generator = fasta_packets
            try:
                num_seqs = len(
                    tuple(
                        filter(
                            lambda l: True if l.startswith('>') else False,
                            map(fmt_func,
                                how_to_open(fq_fa_path).readlines()))))
            except UnicodeDecodeError as err:
                with print_lock:
                    print()
                    printlog_warning("Warning: current file is broken: {}."\
                        .format(str(err)))
                    printlog_warning("File: `{}`".format(
                        os.path.abspath(fq_fa_path)))
                    printlog_warning("This file will not be processed.")
                    continue
                # end with
            # end try
        # end if

        if num_seqs == num_done_seqs:
            with counter_lock:
                file_counter.value += 1
                i = file_counter.value  # save to local var and release lock
            # end with
            with print_lock:
                sys.stdout.write('\r')
                printlog_info_time("File #{}/{} (`{}`) has been already completely processed.".\
                    format(i, nfiles, fq_fa_path))
                printlog_info("Omitting it.")
                printn("Working...")
            # end with
            continue
        # end if

        for packet in packet_generator(fq_fa_path, packet_size, num_done_seqs):

            # Blast the packet
            align_xml_text = launch_blastn(packet["fasta"], blast_algorithm,
                                           use_index, queries_tmp_dir, db_path)

            # Cnfigure result TSV lines
            result_tsv_lines = parse_align_results_xml(align_xml_text,
                                                       packet["qual"])

            # Write the result to tsv
            write_classification(result_tsv_lines, tsv_res_path)
        # end for

        with counter_lock:
            file_counter.value += 1
            i = file_counter.value  # save to local var and release lock
        # end with
        with print_lock:
            sys.stdout.write('\r')
            printlog_info_time("File #{}/{} (`{}`) is processed.".\
                format(i, nfiles, os.path.basename(fq_fa_path)))
            printn("Working...")
        # end with
    # end for

    query_fpath = os.path.join(queries_tmp_dir,
                               "query{}_tmp.fasta".format(os.getpid()))
    remove_tmp_files(query_fpath)
コード例 #4
0
from glob import glob

try:
    opts, args = getopt.gnu_getopt(sys.argv[1:], "hvr:d:o:s:q:m:i:c:ut:n", [
        "help", "version", "taxannot-resdir=", "indir=", "outdir=",
        "binning-sensitivity=", "min-qual=", "min-seq-len=", "min-pident=",
        "min-coverage=", "untwist-fast5", "threads=", "no-trash"
    ])
except getopt.GetoptError as gerr:
    print(str(gerr))
    platf_depend_exit(2)
# end try

from src.filesystem import is_fasta, is_fastq, is_fast5

is_fastqa = lambda f: is_fasta(f) or is_fastq(f)

from datetime import datetime

now = datetime.now().strftime("%Y-%m-%d %H.%M.%S")

# |== Default parameters: ==|
fq_fa_list = list()  # list with input FASTQ and FASTA files paths
fast5_list = list()  # list with input FAST5 files paths
tax_annot_res_dir = "barapost_result"  # path to directory with classification results
indir_path = None  # path to input directory
outdir_path = "binning_result_{}".format(now.replace(
    ' ', '_'))  # path to output directory
sens = ("genus", 5)  # binning sensitivity
min_qual = 10  # minimum mean read quality to keep
min_qlen = None  # minimum seqeunce length to keep
コード例 #5
0
def process(fq_fa_list, n_thr, packet_size, tax_annot_res_dir,
            blast_algorithm, use_index, db_path):
    # Function preforms "few_files"-parallel mode.
    #
    # :param fq_fa_list: list of paths to files meant to be processed;
    # :type fq_fa_list: list<str>;
    # :param n_thr: number of threads to launch;
    # :type n_thr: int;
    # :param packet_size: number of sequences processed by blast in a single launching;
    # :type packet_size: int;
    # :param tax_annot_res_dir: path to ouput directory that contains taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param blast_algorithm: blast algorithm to use;
    # :type blast_algorithm: str;
    # :param use_index: logic value indicationg whether to use indes;
    # :type use_index: bool;
    # :param db_path: path to database;
    # :type db_path: str;

    nfiles = len(fq_fa_list)

    for i, fq_fa_path in enumerate(fq_fa_list):
        # Create the result directory with the name of FASTQ of FASTA file being processed:
        new_dpath = create_result_directory(fq_fa_path, tax_annot_res_dir)

        # "hname" means human readable name (i.e. without file path and extention)
        infile_hname = os.path.basename(fq_fa_path)
        infile_hname = re.search(r"(.+)\.(m)?f(ast)?(a|q)(\.gz)?$", infile_hname).group(1)

        # Look around and ckeck if there are results of previous runs of this script
        # If 'look_around' is None -- there is no data from previous run
        previous_data = look_around(new_dpath, fq_fa_path)

        if previous_data is None: # If there is no data from previous run
            num_done_seqs = 0 # number of successfully processed sequences
            tsv_res_path = os.path.join(new_dpath, "classification.tsv") # form result tsv file path
        else: # if there is data from previous run
            num_done_seqs = previous_data["n_done_reads"] # get number of successfully processed sequences
            tsv_res_path = previous_data["tsv_respath"] # result tsv file sholud be the same as during previous run
        # end if

        how_to_open = OPEN_FUNCS[ is_gzipped(fq_fa_path) ]
        fmt_func = FORMATTING_FUNCS[ is_gzipped(fq_fa_path) ]

        if is_fastq(fq_fa_path):
            packet_generator = fastq_packets
            num_seqs = sum(1 for line in how_to_open(fq_fa_path)) // 4 # 4 lines per record
        else:
            packet_generator = fasta_packets
            try:
                num_seqs = len(tuple(filter(lambda l: True if l.startswith('>') else False,
                    map(fmt_func, how_to_open(fq_fa_path).readlines()))))
            except UnicodeDecodeError as err:
                print()
                printlog_warning("Warning: current file is broken: {}."\
                    .format(str(err)))
                printlog_warning("File: `{}`".format(os.path.abspath(fq_fa_path)))
                printlog_warning("This file will not be processed.")
                continue
            # end try
        # end if

        packet_size = min(packet_size, num_seqs // n_thr)

        if num_seqs == num_done_seqs:
            sys.stdout.write('\r')
            printlog_info_time("File #{}/{} (`{}`) has been already completely processed."\
                .format(i+1, nfiles, fq_fa_path))
            printlog_info("Omitting it.")
            printn("Working...")
            return
        # end if

        # Get number of seqeunces to pass to each thread
        file_part_size = num_seqs // n_thr
        if num_seqs % n_thr != 0:
            file_part_size += 1
        # end if

        pool = mp.Pool(n_thr, initializer=init_proc_single_file_in_paral,
            initargs=(mp.Lock(), mp.Lock(),))

        pool.starmap(process_part_of_file, [(file_part,
            tsv_res_path,
            packet_size,
            tax_annot_res_dir,
            blast_algorithm,
            use_index,
            db_path) for file_part in packet_generator(fq_fa_path,
                file_part_size,
                num_done_seqs)])

        # Reaping zombies
        pool.close()
        pool.join()

        sys.stdout.write('\r')
        printlog_info_time("File #{}/{} (`{}`) is processed.".\
            format(i+1, nfiles, os.path.basename(fq_fa_path)))
        printn("Working...")    # end for
コード例 #6
0
ファイル: single_thread_QA.py プロジェクト: Vikash84/barapost
def bin_fastqa_file(fq_fa_path, tax_annot_res_dir, sens, min_qual, min_qlen,
                    min_pident, min_coverage, no_trash):
    # Function for single-thread binning FASTQ and FASTA files.
    #
    # :param fq_fa_path: path to FASTQ (of FASTA) file meant to be processed;
    # :type fq_fa_path: str;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: loical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;

    outdir_path = os.path.dirname(
        logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0  # counter for sequences, which pass filters
    QL_seqs_fail = 0  # counter for too short or too low-quality sequences
    align_seqs_fail = 0  # counter for sequences, which align to their best hit with too low identity or coverage

    srt_file_dict = dict(
    )  # dict containing file objects of existing output files

    new_dpath = get_curr_res_dpath(fq_fa_path, tax_annot_res_dir)
    tsv_res_fpath = get_res_tsv_fpath(new_dpath)
    taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")
    resfile_lines = configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path)

    # Configure generator, write function and path to trash file
    if is_fastq(fq_fa_path):
        seq_records_generator = fastq_records
        write_fun = write_fastq_record
    else:
        seq_records_generator = fasta_records
        write_fun = write_fasta_record
    # end if

    # Make filter for quality and length
    QL_filter = get_QL_filter(fq_fa_path, min_qual, min_qlen)
    # Configure path to trash file
    if not no_trash:
        QL_trash_fpath = get_QL_trash_fpath(
            fq_fa_path,
            outdir_path,
            min_qual,
            min_qlen,
        )
    else:
        QL_trash_fpath = None
    # end if

    # Make filter for identity and coverage
    align_filter = get_align_filter(min_pident, min_coverage)
    # Configure path to this trash file
    if not no_trash:
        align_trash_fpath = get_align_trash_fpath(fq_fa_path, outdir_path,
                                                  min_pident, min_coverage)
    else:
        align_trash_fpath = None
    # end if

    for fastq_rec in seq_records_generator(fq_fa_path):

        read_name = sys.intern(fmt_read_id(
            fastq_rec["seq_id"])[1:])  # get ID of the sequence

        try:
            hit_names, *vals_to_filter = resfile_lines[
                read_name]  # find hit corresponding to this sequence
        except KeyError:
            printlog_error_time("Error: read `{}` not found in TSV file containing taxonomic annotation."\
                .format(read_name))
            printlog_error("This TSV file: `{}`".format(tsv_res_fpath))
            printlog_error("Make sure that this read has been already \
                processed by `barapost-prober.py` and `barapost-local.py`.")
            platf_depend_exit(1)
        # end try

        # Apply filters
        if not QL_filter(vals_to_filter):
            QL_seqs_fail += 1
            # Place this sequence to QL trash file
            if QL_trash_fpath not in srt_file_dict.keys():
                srt_file_dict = update_file_dict(srt_file_dict, QL_trash_fpath)
            # end if
            write_fun(srt_file_dict[QL_trash_fpath],
                      fastq_rec)  # write current read to binned file

        elif not align_filter(vals_to_filter):
            align_seqs_fail += 1
            # Place this sequence to align_trash file
            if align_trash_fpath not in srt_file_dict.keys():
                srt_file_dict = update_file_dict(srt_file_dict,
                                                 align_trash_fpath)
            # end if
            write_fun(srt_file_dict[align_trash_fpath],
                      fastq_rec)  # write current read to binned file

        else:
            for hit_name in hit_names.split(
                    "&&"
            ):  # there can be multiple hits for single query sequence
                # Get name of result FASTQ file to write this read in
                binned_file_path = os.path.join(
                    outdir_path,
                    "{}.fast{}".format(hit_name,
                                       'q' if is_fastq(fq_fa_path) else 'a'))
                if binned_file_path not in srt_file_dict.keys():
                    srt_file_dict = update_file_dict(srt_file_dict,
                                                     binned_file_path)
                # end if
                write_fun(srt_file_dict[binned_file_path],
                          fastq_rec)  # write current read to binned file
            # end for
            seqs_pass += 1
        # end if
    # end for

    # Close all binned files
    for file_obj in filter(lambda x: not x is None, srt_file_dict.values()):
        file_obj.close()
    # end for

    sys.stdout.write('\r')
    printlog_info_time("File `{}` is binned.".format(
        os.path.basename(fq_fa_path)))
    printn(" Working...")

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)