def bin_fast5_file(f5_path, tax_annot_res_dir, sens, min_qual, min_qlen,
                   min_pident, min_coverage, no_trash):
    # Function bins FAST5 file with untwisting.
    #
    # :param f5_path: path to FAST5 file meant to be processed;
    # :type f5_path: str;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: loical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;

    outdir_path = os.path.dirname(
        logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0  # counter for sequences, which pass filters
    QL_seqs_fail = 0  # counter for too short or too low-quality sequences
    align_seqs_fail = 0  # counter for sequences, which align to their best hit with too low identity or coverage

    srt_file_dict = dict()

    index_dirpath = os.path.join(
        tax_annot_res_dir,
        index_name)  # name of directory that will contain indicies

    # Make filter for quality and length
    QL_filter = get_QL_filter(f5_path, min_qual, min_qlen)
    # Configure path to trash file
    if not no_trash:
        QL_trash_fpath = get_QL_trash_fpath(
            f5_path,
            outdir_path,
            min_qual,
            min_qlen,
        )
    else:
        QL_trash_fpath = None
    # end if

    # Make filter for identity and coverage
    align_filter = get_align_filter(min_pident, min_coverage)
    # Configure path to this trash file
    if not no_trash:
        align_trash_fpath = get_align_trash_fpath(f5_path, outdir_path,
                                                  min_pident, min_coverage)
    else:
        align_trash_fpath = None
    # end if

    # File validation:
    #   RuntimeError will be raised if FAST5 file is broken.
    try:
        # File existance checking is performed while parsing CL arguments.
        # Therefore, this if-statement will trigger only if f5_path's file is not a valid HDF5 file.
        if not h5py.is_hdf5(f5_path):
            raise RuntimeError("file is not of HDF5 (i.e. not FAST5) format")
        # end if

        from_f5 = h5py.File(f5_path, 'r')

        for _ in from_f5:
            break
        # end for
    except RuntimeError as runterr:
        printlog_error_time("FAST5 file is broken")
        printlog_error("Reading the file `{}` crashed.".format(
            os.path.basename(f5_path)))
        printlog_error("Reason: {}".format(str(runterr)))
        printlog_error("Omitting this file...")
        print()
        # Return zeroes -- inc_val won't be incremented and this file will be omitted
        return (0, 0, 0)
    # end try

    # singleFAST5 and multiFAST5 files should be processed in different ways
    # "Raw" group always in singleFAST5 root and never in multiFAST5 root
    if "Raw" in from_f5.keys():
        f5_cpy_func = copy_single_f5
    else:
        f5_cpy_func = copy_read_f5_2_f5
    # end if

    readids_to_seek = list(from_f5.keys())  # list of not-binned-yet read IDs

    # Fill the list 'readids_to_seek'
    for read_name in fast5_readids(from_f5):
        # Get rid of "read_"
        readids_to_seek.append(sys.intern(read_name))
    # end for

    # Walk through the index
    index_f5_2_tsv = open_shelve(os.path.join(index_dirpath, index_name), 'r')

    if not f5_path in index_f5_2_tsv.keys():
        printlog_error_time(
            "Source FAST5 file `{}` not found in index".format(f5_path))
        printlog_error("Try to rebuild index")
        platf_depend_exit(1)
    # end if

    for tsv_path in index_f5_2_tsv[f5_path].keys():

        read_names = index_f5_2_tsv[f5_path][tsv_path]
        taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy",
                                     "taxonomy.tsv")
        resfile_lines = configure_resfile_lines(tsv_path, sens, taxonomy_path)

        for read_name in read_names:
            try:
                hit_names, *vals_to_filter = resfile_lines[sys.intern(
                    fmt_read_id(read_name)[1:])]
            except KeyError:
                printlog_error_time("Error: missing taxonomic annotation info for read `{}`"\
                    .format(fmt_read_id(read_name)[1:]))
                printlog_error(
                    "It is stored in `{}` FAST5 file".format(f5_path))
                printlog_error(
                    "Try to make new index file (press ENTER on corresponding prompt)."
                )
                printlog_error(
                    "Or, if does not work for you, make sure that taxonomic annotation info \
for this read is present in one of TSV files generated by `barapost-prober.py` and `barapost-local.py`."
                )
                index_f5_2_tsv.close()
                platf_depend_exit(1)
            # end try

            if not QL_filter(vals_to_filter):
                # Get name of result FASTQ file to write this read in
                if QL_trash_fpath not in srt_file_dict.keys():
                    srt_file_dict = update_file_dict(srt_file_dict,
                                                     QL_trash_fpath)
                # end if
                f5_cpy_func(from_f5, read_name, srt_file_dict[QL_trash_fpath])
                QL_seqs_fail += 1
            elif not align_filter(vals_to_filter):
                # Get name of result FASTQ file to write this read in
                if align_trash_fpath not in srt_file_dict.keys():
                    srt_file_dict = update_file_dict(srt_file_dict,
                                                     align_trash_fpath)
                # end if
                f5_cpy_func(from_f5, read_name,
                            srt_file_dict[align_trash_fpath])
                align_seqs_fail += 1
            else:
                for hit_name in hit_names.split(
                        "&&"
                ):  # there can be multiple hits for single query sequence
                    # Get name of result FASTQ file to write this read in
                    binned_file_path = os.path.join(
                        outdir_path, "{}.fast5".format(hit_name))
                    if binned_file_path not in srt_file_dict.keys():
                        srt_file_dict = update_file_dict(
                            srt_file_dict, binned_file_path)
                    # end if
                    f5_cpy_func(from_f5, read_name,
                                srt_file_dict[binned_file_path])
                # end for
                seqs_pass += 1
            # end if
        # end for

    from_f5.close()
    index_f5_2_tsv.close()

    # Close all binned files
    for file_obj in filter(lambda x: not x is None, srt_file_dict.values()):
        file_obj.close()
    # end for

    sys.stdout.write('\r')
    printlog_info_time("File `{}` is binned.".format(
        os.path.basename(f5_path)))
    printn(" Working...")

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)
def map_f5reads_2_taxann(f5_path, tsv_taxann_lst, tax_annot_res_dir):
    # Function perform mapping of all reads stored in input FAST5 files
    #     to existing TSV files containing taxonomic annotation info.
    #
    # It creates an DBM index file.
    #
    # :param f5_path: path to current FAST5 file;
    # :type f5_path: str;
    # :param tsv_taxann_lst: list of path to TSV files that contain taxonomic annotation;
    # :type tsv_taxann_lst: list<str>;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;

    index_dirpath = os.path.join(
        tax_annot_res_dir,
        index_name)  # name of directory that will contain indicies

    # File validation:
    #   RuntimeError will be raised if FAST5 file is broken.
    try:
        # File existance checking is performed while parsing CL arguments.
        # Therefore, this if-statement will trigger only if f5_path's file is not a valid HDF5 file.
        if not h5py.is_hdf5(f5_path):
            raise RuntimeError("file is not of HDF5 (i.e. not FAST5) format")
        # end if

        f5_file = h5py.File(f5_path, 'r')

        for _ in f5_file:
            break
        # end for
    except RuntimeError as runterr:
        printlog_error_time("FAST5 file is broken")
        printlog_error("Reading the file `{}` crashed.".format(
            os.path.basename(f5_path)))
        printlog_error("Reason: {}".format(str(runterr)))
        printlog_error("Omitting this file...")
        print()
        return
    # end try

    readids_to_seek = list(fast5_readids(f5_file))
    idx_dict = dict()  # dictionary for index

    # This saving is needed to compare with 'len(readids_to_seek)'
    #    after all TSV will be looked through in order to
    #    determine if some reads miss taxonomic annotation.
    len_before = len(readids_to_seek)

    # Iterate over TSV-taaxnn file
    for tsv_taxann_fpath in tsv_taxann_lst:

        with open(tsv_taxann_fpath, 'r') as taxann_file:

            # Get all read IDs in current TSV
            readids_in_tsv = list(
                map(lambda l: l.split('\t')[0], taxann_file.readlines()))

            # Iterate over all other reads in current FAST5
            #    ('reversed' is necessary because we remove items from list in this loop)
            for readid in reversed(readids_to_seek):
                fmt_id = fmt_read_id(readid)[1:]
                if fmt_id in readids_in_tsv:
                    # If not first -- write data to dict (and to index later)
                    try:
                        idx_dict[tsv_taxann_fpath].append(
                            "read_" + fmt_id)  # append to existing list
                    except KeyError:
                        idx_dict[tsv_taxann_fpath] = ["read_" + fmt_id
                                                      ]  # create a new list
                    finally:
                        readids_to_seek.remove(readid)
                    # end try
                # end if
            # end for
        # end with
        if len(readids_to_seek) == 0:
            break
        # end if
    # end for

    # If after all TSV is checked but nothing have changed -- we miss taxonomic annotation
    #     for some reads! And we will write their IDs to 'missing_reads_lst.txt' file.
    if len(readids_to_seek) == len_before:
        printlog_error_time("reads from FAST5 file not found")
        printlog_error("FAST5 file: `{}`".format(f5_path))
        printlog_error("Some reads have not undergone taxonomic annotation.")
        missing_log = "missing_reads_lst.txt"
        printlog_error("List of missing reads are in following file:")
        printlog_error("{}".format(missing_log))
        with open(missing_log, 'w') as missing_logfile:
            missing_logfile.write(
                "Missing reads from file '{}':\n\n".format(f5_path))
            for readid in readids_to_seek:
                missing_logfile.write(fmt_read_id(readid) + '\n')
            # end for
        try:
            for path in glob(os.path.join(index_dirpath, '*')):
                os.unlink(path)
            # end for
            os.rmdir(index_dirpath)
        except OSError as oserr:
            printlog_error(
                "Error occured while removing index directory: {}".format(
                    oserr))
        finally:
            platf_depend_exit(3)
        # end try
    # end if

    try:
        # Open index files appending to existing data ('c' parameter)
        with open_shelve(os.path.join(index_dirpath, index_name),
                         'c') as index_f5_2_tsv:
            # Update index
            index_f5_2_tsv[f5_path] = idx_dict
        # end with
    except OSError as oserr:
        printlog_error_time("Error: cannot create index file `{}`"\
            .format(os.path.join(index_dirpath, index_name)))
        printlog_error(str(oserr))
        platf_depend_exit(1)
Example #3
0
def bin_fast5_file(f5_path, tax_annot_res_dir, sens, min_qual, min_qlen,
    min_pident, min_coverage, no_trash):
    # Function bins FAST5 file without untwisting.
    #
    # :param f5_path: path to FAST5 file meant to be processed;
    # :type f5_path: str;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: loical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;

    outdir_path = os.path.dirname(logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0 # counter for sequences, which pass filters
    QL_seqs_fail = 0 # counter for too short or too low-quality sequences
    align_seqs_fail = 0 # counter for sequences, which align to their best hit with too low identity or coverage

    srt_file_dict = dict()

    new_dpath = glob("{}{}*{}*".format(tax_annot_res_dir, os.sep, get_checkstr(f5_path)))[0]
    tsv_res_fpath = get_res_tsv_fpath(new_dpath)
    taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")
    resfile_lines = configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path)

    # Make filter for quality and length
    QL_filter = get_QL_filter(f5_path, min_qual, min_qlen)
    # Configure path to trash file
    if not no_trash:
        QL_trash_fpath = get_QL_trash_fpath(f5_path, outdir_path, min_qual, min_qlen,)
    else:
        QL_trash_fpath = None
    # end if

    # Make filter for identity and coverage
    align_filter = get_align_filter(min_pident, min_coverage)
    # Configure path to this trash file
    if not no_trash:
        align_trash_fpath = get_align_trash_fpath(f5_path, outdir_path, min_pident, min_coverage)
    else:
        align_trash_fpath = None
    # end if

    # File validation:
    #   RuntimeError will be raised if FAST5 file is broken.
    try:
        # File existance checking is performed while parsing CL arguments.
        # Therefore, this if-statement will trigger only if f5_path's file is not a valid HDF5 file.
        if not h5py.is_hdf5(f5_path):
            raise RuntimeError("file is not of HDF5 (i.e. not FAST5) format")
        # end if

        from_f5 = h5py.File(f5_path, 'r')

        for _ in from_f5:
            break
        # end for
    except RuntimeError as runterr:
        printlog_error_time("FAST5 file is broken")
        printlog_error("Reading the file `{}` crashed.".format(os.path.basename(f5_path)))
        printlog_error("Reason: {}".format( str(runterr) ))
        printlog_error("Omitting this file...")
        print()
        # Return zeroes -- inc_val won't be incremented and this file will be omitted
        return (0, 0, 0)
    # end try

    # singleFAST5 and multiFAST5 files should be processed in different ways
    # "Raw" group always in singleFAST5 root and never in multiFAST5 root
    if "Raw" in from_f5.keys():
        f5_cpy_func = copy_single_f5
    else:
        f5_cpy_func = copy_read_f5_2_f5
    # end if

    for _, read_name in enumerate(fast5_readids(from_f5)):

        try:
            hit_names, *vals_to_filter = resfile_lines[sys.intern(fmt_read_id(read_name))[1:]] # omit 'read_' in the beginning of FAST5 group's name
        except KeyError:
            printlog_error_time("Error: read `{}` not found in TSV file containing taxonomic annotation."\
                .format(fmt_read_id(read_name)))
            printlog_error("This TSV file: `{}`".format(tsv_res_fpath))
            printlog_error("Try running barapost-binning with `-u` (`--untwist-fast5`) flag.\n")
            platf_depend_exit(1)
        # end try
        # If read is found in TSV file:
        if not QL_filter(vals_to_filter):
            QL_seqs_fail += 1
            # Get name of result FASTQ file to write this read in
            if QL_trash_fpath not in srt_file_dict.keys():
                srt_file_dict = update_file_dict(srt_file_dict, QL_trash_fpath)
            # end if
            f5_cpy_func(from_f5, read_name, srt_file_dict[QL_trash_fpath])
        elif not align_filter(vals_to_filter):
            align_seqs_fail += 1
            # Get name of result FASTQ file to write this read in
            if QL_trash_fpath not in srt_file_dict.keys():
                srt_file_dict = update_file_dict(srt_file_dict, align_trash_fpath)
            # end if
            f5_cpy_func(from_f5, read_name, srt_file_dict[align_trash_fpath])
        else:
            for hit_name in hit_names.split("&&"): # there can be multiple hits for single query sequence
                # Get name of result FASTQ file to write this read in
                binned_file_path = os.path.join(outdir_path, "{}.fast5".format(hit_name))
                if binned_file_path not in srt_file_dict.keys():
                    srt_file_dict = update_file_dict(srt_file_dict, binned_file_path)
                # end if
                f5_cpy_func(from_f5, read_name, srt_file_dict[binned_file_path])
            # end for
            seqs_pass += 1
        # end if
    # end for

    from_f5.close()

    # Close all binned files
    for file_obj in filter(lambda x: not x is None, srt_file_dict.values()):
        file_obj.close()
    # end for


    sys.stdout.write('\r')
    printlog_info_time("File `{}` is binned.".format(os.path.basename(f5_path)))
    printn(" Working...")

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)