def gzip_outfiles(outdir): # Function gzips all fastq files in directory `outdir`. # # :param outdir: path to outdir; # :type outdir: str; # Get gzipping function gzip_func = _get_gzip_func() print() printlog_info_time('Gzipping output files...') # Get fastq files is_fastq = lambda x: not re.match(r'.+\.f(ast)?q$', x) is None fq_fpaths = filter(is_fastq, glob.iglob(os.path.join(outdir, '*'))) # Gzip them! for fpath in fq_fpaths: try: gzip_func(fpath) except OSError as err: printlog_info('Error: cannot gzip file `{}`: {}.'.format( fpath, err)) platf_depend_exit(1) # end try # end for printlog_info_time('Output files are gzipped.')
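# A minimal sketch of a pure-Python fallback that `_get_gzip_func()` could return
# when no system `gzip` utility is available (the helper name below is hypothetical;
# the real selection logic lives in `_get_gzip_func`):
import gzip
import os
import shutil

def _gzip_with_gzip_module(fpath):
    # Compress `fpath` into `fpath + '.gz'` and remove the uncompressed source
    with open(fpath, 'rb') as plain_file, gzip.open(fpath + '.gz', 'wb') as gz_file:
        shutil.copyfileobj(plain_file, gz_file)
    # end with
    os.unlink(fpath)
# end def _gzip_with_gzip_module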
def main(): args = handle_args() src.filesystem.make_outdir(args['o']) config_logging(args['o'], __version__, __last_update_date__) report_run_params(args) run_task_chain(args) if args['z']: src.compression.gzip_outfiles(args['o']) # end if printlog_info_time('Work is completed.')
def count_reads(fq_fpaths): # Function counts reads in the "forward" fastq file of a pair. # :param fq_fpaths: list of paths to fastq files; # :type fq_fpaths: list<str>; # Returns number of reads in "forward" file # (assuming that there are as many reads in "reverse" file). printlog_info_time('Counting reads...') # Get open function for "forward" file open_func = src.compression.provide_open_funcs(fq_fpaths)[0] # Count reads in "forward" file with open_func(fq_fpaths[0]) as infile: nreads = sum(1 for _ in infile) // 4 # end with printlog_info_time('{} reads.'.format(nreads)) return nreads
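# Equivalent minimal sketch of the counting logic without the project's helpers
# (assumes an uncompressed FASTQ file; a gzipped file would need `gzip.open`
# in text mode instead):
def _count_reads_plain(fq_fpath):
    # Each FASTQ record occupies exactly 4 lines, hence the integer division
    with open(fq_fpath) as infile:
        return sum(1 for _ in infile) // 4
# end def _count_reads_plain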
def add_lambda_phage(local_fasta, taxonomy_path): # Function adds control sequence of nanopore lambda phage DNA-CS # to 'local_fasta'. # # :param local_fasta: path to file with reference sequences to be included in database; # :type local_fasta: str; # :param taxonomy_path: path to taxonomy file; # :type taxonomy_path: str; print() printlog_info_time("Adding lambda phage control sequence...") # sys.path[0] is directory containing the script that was used to invoke the Python interpreter. # We will use it to get path to file with lambda's sequence. lambda_fpath = os.path.join(os.path.dirname(sys.path[0]), "lambda_control", "nanopore_lambda_DNA-CS_control.fasta.gz") # Check file existence if not os.path.exists(lambda_fpath): printlog_error_time( "Error: cannot find lambda phage control sequence: '{}'".format( lambda_fpath)) platf_depend_exit(1) # end if # Read lambda's sequence with open_as_gzip(lambda_fpath, 'rb') as lambda_file: lambda_fasta = lambda_file.read() # end with # Write it to db fasta file with open(local_fasta, 'wb') as db_fasta_file: db_fasta_file.write(lambda_fasta) # end with # Save lambda's taxonomy taxonomy.save_taxonomy_directly(taxonomy_path, "LAMBDA", "Lambda-phage-nanopore-control") printlog_info_time(" ok")
def launch_single_thread_binning(fpath_list, binning_func, tax_annot_res_dir, sens, min_qual, min_qlen, min_pident, min_coverage, no_trash): # Function launches single-thread binning, performed by function 'binning_func'. # # :param fpath_list: list of paths to files to process; # :type fpath_list: list<str>; # :param binning_func: function that performs binning; # :param tax_annot_res_dir: path to directory containing taxonomic annotation; # :type tax_annot_res_dir: str; # :param sens: binning sensitivity; # :type sens: str; # :param min_qual: threshold for quality filter; # :type min_qual: float; # :param min_qlen: threshold for length filter; # :type min_qlen: int (or None, if this filter is disabled); # :param min_pident: threshold for alignment identity filter; # :type min_pident: float (or None, if this filter is disabled); # :param min_coverage: threshold for alignment coverage filter; # :type min_coverage: float (or None, if this filter is disabled); # :param no_trash: logical value. True if user does NOT want to output trash files; # :type no_trash: bool; res_stats = list() num_files_total = len(fpath_list) # Bin files in a single thread: for i, fq_fa_path in enumerate(fpath_list): res_stats.append( binning_func(fq_fa_path, tax_annot_res_dir, sens, min_qual, min_qlen, min_pident, min_coverage, no_trash)) sys.stdout.write('\r') printlog_info_time("File #{}/{} `{}` is binned."\ .format(i+1, num_files_total, os.path.basename(fq_fa_path))) printn(" Working...") # end for return res_stats
def verify_cl_accessions(accs_to_download, acc_dict): # Function checks existence of GenBank records that correspond to accessions # specified with '-s' option. After checking, the function fills 'acc_dict'. # :param accs_to_download: list of accessions from command line ('-s'); # :type accs_to_download: list<str>; # :param acc_dict: dictionary {<ACCESSION>: <HIT_DEFINITION>}; # :type acc_dict: dict<str: tuple<str>>; check_connection("https://www.ncbi.nlm.nih.gov/") printlog_info_time("Verifying `-s` accessions...") sys.stdout.write("0/{}".format(len(accs_to_download))) for i, acc in enumerate(accs_to_download): server = "eutils.ncbi.nlm.nih.gov" url = "/entrez/eutils/esummary.fcgi?db=nuccore&id={}".format(acc) text = lingering_https_get_request(server, url, "record's name", acc) name = re.search( r"\<Item Name=\"Title\" Type=\"String\"\>(.+)\</Item\>", text) if name is None: printlog_info( "Cannot find GenBank record with accession '{}'".format(acc)) platf_depend_exit(1) else: name = name.group(1) # end if acc_dict[acc] = name sys.stdout.write("\r{}/{}".format(i + 1, len(accs_to_download))) # end for print() printlog_info_time("OK.")
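# Standalone sketch of the e-summary lookup performed above, using only the
# standard library (`lingering_https_get_request` adds logging and retries on
# top of this; the function name below is hypothetical):
import http.client
import re

def _fetch_record_title(acc):
    conn = http.client.HTTPSConnection("eutils.ncbi.nlm.nih.gov")
    conn.request("GET", "/entrez/eutils/esummary.fcgi?db=nuccore&id={}".format(acc))
    text = conn.getresponse().read().decode("utf-8")
    conn.close()
    match = re.search(r"<Item Name=\"Title\" Type=\"String\">(.+)</Item>", text)
    return None if match is None else match.group(1)
# end def _fetch_record_title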
def search_for_related_replicons(acc_dict): # Function searches for replicons related to those in 'hits_to_download.tsv' # or specified with '-s' option. # :param acc_dict: dictionary containing accession data of hits; # :type acc_dict: dict<str: tuple<str, str, int>>; print() printlog_info_time("Searching for related replicons...") start_accs = tuple( acc_dict.keys()) # accessions, which were "discovered" by prober for i, acc in enumerate(start_accs): printlog_info("{}. {} ({}):".format(i + 1, acc, acc_dict[acc])) # Search for related replicons: try: related_repls = _get_related_replicons(acc, acc_dict) except AttributeError: printlog_error_time( "Parsing error: cannot find replicons related to {}.".format( acc)) printlog_error("Please, contact the developer") platf_depend_exit(1) else: related_repls = _deduplicate_replicons(related_repls, acc) # end try for rel_acc, rel_def in related_repls: acc_dict[rel_acc] = rel_def # end for # end for print() if len(start_accs) != len(acc_dict): # there are some new replicons printlog_info_time("{} related replicons have been found.".\ format(len(acc_dict) - len(start_accs))) else: printlog_info_time("No related replicons found.") # end if print() # end def search_for_related_replicons
def wait_for_align(rid, rtoe, pack_to_send, filename): # Function waits until BLAST server accomplishes the request. # # :param rid: Request ID to wait for; # :type rid: str; # :param rtoe: time in seconds estimated by BLAST server needed to accomplish the request; # :type rtoe: int; # :param pack_to_send: current packet (id) number to send; # :type pack_to_send: int; # :param filename: basename of current FASTA file; # :type filename: str; # # Returns XML response ('str'). print() print("Requesting current query status. Request ID: {}".format(rid)) print(" `{}`; Submission #{}".format(filename, pack_to_send[0])) log_info("Requesting current query status.") log_info("Request ID: {}; `{}`; Submission #{}".format( rid, filename, pack_to_send[0], )) # RTOE can be zero at the very beginning of resumption if rtoe > 0: printlog_info_time( "BLAST server estimates that alignment will be accomplished in {} seconds" .format(rtoe)) printlog_info_time( "Waiting for {}+3 (+3 extra) seconds...".format(rtoe)) # Server might be wrong -- we will give it 3 extra seconds sleep(rtoe + 3) printlog_info_time( "{} seconds have passed. Checking if alignment is accomplished...". format(rtoe + 3)) # end if server = "blast.ncbi.nlm.nih.gov" wait_url = "/blast/Blast.cgi?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=" + rid whtspc_len = 6 + len("(requesting)") while True: resp_content = lingering_https_get_request(server, wait_url, "BLAST response") # if server asks to wait if "Status=WAITING" in resp_content: printn("\r{} - The request is being processed. Waiting{}{}".format( getwt(), ' ' * whtspc_len, "\033[%dD" % whtspc_len)) # indicate every 10 seconds with a dot for i in range(1, 7): sleep(10) printn( "\r{} - The request is being processed. Waiting{}".format( getwt(), '.' * i)) # end for printn("(requesting)") continue elif "Status=FAILED" in resp_content: # if job failed print() printlog_info_time("Job failed\a\n") printlog_info("Resending this packet.") return None, BlastError(2) elif "Status=UNKNOWN" in resp_content: # if job expired print() printlog_info_time("Job expired\a\n") printlog_info("Resending this packet.") return None, BlastError(1) # if results are ready elif "Status=READY" in resp_content: print() printlog_info("Result for query `{}` #{} is ready!".format( filename, pack_to_send[0])) # if there are hits if "ThereAreHits=yes" in resp_content: for i in range(15, 0, -5): print('-' * i) # end for print("-\nRetrieving results...") # Retrieve human-readable text and put it into result directory retrieve_text_url = "/Blast.cgi?CMD=Get&FORMAT_TYPE=Text&DESCRIPTIONS=1&ALIGNMENTS=1&RID=" + rid txt_align_res = lingering_https_get_request( server, retrieve_text_url, "text version of BLAST response") # Count already existing plain text files in outdir: is_txt_response = lambda f: not re.search( r"prober_blast_response_[0-9]+\.txt", f) is None outdir_path = os.path.dirname(logging.getLoggerClass( ).root.handlers[0].baseFilename) # tricky trick response_num = len( tuple(filter(is_txt_response, os.listdir(outdir_path)))) # Current txt response file will have number `response_num+1` txt_hpath = os.path.join( outdir_path, "prober_blast_response_{}.txt".format(response_num + 1)) # Write text result for a human to read with open(txt_hpath, 'w') as txt_file: txt_file.write(txt_align_res) # end with elif "ThereAreHits=no" in resp_content: # if there are no hits printlog_info_time("There are no hits. It happens.\n")
else: # probably, job has failed if execution reaches here print() printlog_info_time("Job failed\a\n") printlog_info("Resending this packet.") return None, BlastError(2) # end if break # end if # Execution should not reach here printlog_error_time( "Fatal error (-122). Please contact the developer.\a\n") platf_depend_exit(-122) # end while # Retrieve XML result retrieve_xml_url = "/Blast.cgi?CMD=Get&FORMAT_TYPE=XML&ALIGNMENTS=1&RID=" + rid xml_text = lingering_https_get_request(server, retrieve_xml_url, "XML BLAST response") if "Bad Gateway" in xml_text: print() printlog_info_time("Bad Gateway. Data from last packet has been lost.") printlog_info("Resending this packet.") return None, BlastError(1) elif "Status=FAILED" in xml_text: print() printlog_info_time("BLAST error: request failed") printlog_info("Resending this packet.") return None, BlastError(2) elif "to start it again" in xml_text: print() printlog_info_time("BLAST error") printlog_info("Resending this packet.") return None, BlastError(2) elif "[blastsrv4.REAL]" in xml_text: blastsrv4_match = re.search(r"(\[blastsrv4\.REAL\].*\))", xml_text) blastsrv4_str = "" if blastsrv4_match is None else ": {}".format( blastsrv4_match.group(1)) printlog_info_time("BLAST server error{}".format(blastsrv4_str)) # Error code 2 indicates that we need to split packet and resubmit return None, BlastError(2) # end if return xml_text, BlastError(0)
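# The polling loop above branches on plain-text status markers in the server
# response. A minimal sketch of that classification, isolated from the waiting
# and result-retrieval logic (the helper name is hypothetical):
def _classify_blast_status(resp_content):
    if "Status=WAITING" in resp_content:
        return "waiting"    # keep polling
    elif "Status=FAILED" in resp_content:
        return "failed"     # resend the packet (BlastError(2))
    elif "Status=UNKNOWN" in resp_content:
        return "expired"    # resend the packet (BlastError(1))
    elif "Status=READY" in resp_content:
        return "ready"      # retrieve text and XML results
    return "unexpected"     # treated as a failure
# end def _classify_blast_status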
def bin_fastqa_file(fq_fa_lst, tax_annot_res_dir, sens, n_thr, min_qual, min_qlen, min_pident, min_coverage, no_trash): # Function for parallel binning FASTQ and FASTA files. # Actually bins multiple files. # # :param fq_fa_lst: lsit of paths to FASTQ (of FASTA) file meant to be processed; # :type fq_fa_lst: list<str>; # :param min_qual: threshold for quality filter; # :type min_qual: float; # :param min_qlen: threshold for length filter; # :type min_qlen: int (or None, if this filter is disabled); # :param min_pident: threshold for alignment identity filter; # :type min_pident: float (or None, if this filter is disabled); # :param min_coverage: threshold for alignment coverage filter; # :type min_coverage: float (or None, if this filter is disabled); # :param no_trash: loical value. True if user does NOT want to output trash files; # :type no_trash: bool; outdir_path = os.path.dirname( logging.getLoggerClass().root.handlers[0].baseFilename) seqs_pass = 0 # counter for sequences, which pass filters QL_seqs_fail = 0 # counter for too short or too low-quality sequences align_seqs_fail = 0 # counter for sequences, which align to their best hit with too low identity or coverage for fq_fa_path in fq_fa_lst: new_dpath = get_curr_res_dpath(fq_fa_path, tax_annot_res_dir) tsv_res_fpath = get_res_tsv_fpath(new_dpath) taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv") resfile_lines = configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path) # Configure path to trash file if is_fastq(fq_fa_path): seq_records_generator = fastq_records write_fun = write_fastq_record else: seq_records_generator = fasta_records write_fun = write_fasta_record # end if # Make filter for quality and length QL_filter = get_QL_filter(fq_fa_path, min_qual, min_qlen) # Configure path to trash file if not no_trash: QL_trash_fpath = get_QL_trash_fpath( fq_fa_path, outdir_path, min_qual, min_qlen, ) else: QL_trash_fpath = None # end if # Make filter for identity and coverage align_filter = get_align_filter(min_pident, min_coverage) # Configure path to this trash file if not no_trash: align_trash_fpath = get_align_trash_fpath(fq_fa_path, outdir_path, min_pident, min_coverage) else: align_trash_fpath = None # end if # Create an iterator that will yield records seq_records_iterator = iter(seq_records_generator(fq_fa_path)) # Dict for storing batches of sequences meant to be written to output files: to_write = dict() stop = False # for outer while-loop while not stop: # Extract batch of records of 'n_thr' size and find their destination paths: for _ in range(n_thr): try: fastqa_rec = next(seq_records_iterator) except StopIteration: stop = True # for outer while-loop break # end try read_name = sys.intern(fmt_read_id( fastqa_rec["seq_id"])[1:]) # get ID of the sequence try: hit_names, *vals_to_filter = resfile_lines[ read_name] # find hit corresponding to this sequence except KeyError: printlog_error_time("Error: read `{}` not found in TSV file containing taxonomic annotation."\ .format(read_name)) printlog_error("This TSV file: `{}`".format(tsv_res_fpath)) printlog_error( "Make sure that this read has been already processed by \ `barapost-prober.py` and `barapost-local.py`.") platf_depend_exit(1) # end try # If read is found in TSV file: if not QL_filter(vals_to_filter): # Place this sequence to QL trash file to_write[read_name] = (fastqa_rec, QL_trash_fpath) QL_seqs_fail += 1 elif not align_filter(vals_to_filter): # Place this sequence to QL trash file to_write[read_name] = (fastqa_rec, align_trash_fpath) 
align_seqs_fail += 1 else: for hit_name in hit_names.split("&&"): # Get name of result FASTQ file to write this read in binned_file_path = os.path.join( outdir_path, "{}.fast{}".format( hit_name, 'q' if is_fastq(fq_fa_path) else 'a')) to_write[read_name] = (fastqa_rec, binned_file_path) # end for seqs_pass += 1 # end if # end for # Write batch of records to output files: with write_lock: for record, fpath in to_write.values(): write_fun(fpath, record) # end for # end with to_write.clear() # end while with write_lock: # Write the rest of 'uneven' data to output files: if len(to_write) != 0: for record, fpath in to_write.values(): write_fun(fpath, record) # end for # end if sys.stdout.write('\r') printlog_info_time("File `{}` is binned.".format( os.path.basename(fq_fa_path))) printn(" Working...") # end with # end for return (seqs_pass, QL_seqs_fail, align_seqs_fail)
def send_request(request, pack_to_send, packet_size, packet_mode, filename, tmp_fpath): # Function sends a request to "blast.ncbi.nlm.nih.gov/blast/Blast.cgi" # and then waits for satisfaction of the request and retrieves response text. # # :param request: request data (it is a dict that `configure_request()` function returns); # :type request: dict<dict>; # :param pack_to_send: ordinal number (like an id) of the packet meant to be sent now; # :type pack_to_send: int; # :param packet_size: number of sequences in the packet; # :type packet_size: int; # # Returns XML text of type 'str' with BLAST response. payload = request["payload"] headers = request["headers"] server = "blast.ncbi.nlm.nih.gov" url = "/blast/Blast.cgi" error = True while error: try: conn = http.client.HTTPSConnection(server) # create a connection conn.request("POST", url, payload, headers) # send the request response = conn.getresponse() # get the response response_text = str(response.read(), "utf-8") # get response text except OSError as oserr: printlog_info_time( "`https://blast.ncbi.nlm.nih.gov` is not available.") printlog_info(str(oserr)) printlog_info( "barapost will try to connect again in 30 seconds...\n") sleep(30) # if no exception occurred else: error = False # end try # end while try: rid = re.search(r"RID = (.+)", response_text).group(1) # get Request ID rtoe = int(re.search(r"RTOE = ([0-9]+)", response_text).group( 1)) # get time to wait provided by the NCBI server except AttributeError: printlog_error_time("It seems that NCBI has denied your request.") printlog_error("Response is in file `request_denial_response.html`") with open("request_denial_response.html", 'w') as den_file: den_file.write(response_text) # end with platf_depend_exit(1) finally: conn.close() # end try # Save temporary data with open(tmp_fpath, 'w') as tmpfile: tmpfile.write("Request_ID: {}\n".format(rid)) tmpfile.write("Packet_size: {}\n".format(packet_size)) tmpfile.write("Packet_mode: {}".format(packet_mode)) # end with # Wait for results of alignment return wait_for_align(rid, rtoe, pack_to_send, filename)
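# Isolated sketch of the RID/RTOE parsing done above. It raises AttributeError
# on a denial page (no "RID =" line), which is exactly what the
# `except AttributeError` branch catches (the helper name is hypothetical):
import re

def _parse_rid_rtoe(response_text):
    rid = re.search(r"RID = (.+)", response_text).group(1)
    rtoe = int(re.search(r"RTOE = ([0-9]+)", response_text).group(1))
    return rid, rtoe
# end def _parse_rid_rtoe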
def map_f5reads_2_taxann(f5_fpaths, tsv_taxann_lst, tax_annot_res_dir): # Function perform mapping of all reads stored in input FAST5 files # to existing TSV files containing taxonomic annotation info. # # It creates an DBM index file. # # :param f5_fpaths: list of paths to current FAST5 file; # :type f5_fpaths: list<str>; # :param tsv_taxann_lst: list of path to TSV files that contain taxonomic annotation; # :type tsv_taxann_lst: list<str>; # :param tax_annot_res_dir: path to directory containing taxonomic annotation; # :type tax_annot_res_dir: str; index_dirpath = os.path.join( tax_annot_res_dir, index_name) # name of directory that will contain indicies for f5_path in f5_fpaths: # File validation: # RuntimeError will be raised if FAST5 file is broken. try: # File existance checking is performed while parsing CL arguments. # Therefore, this if-statement will trigger only if f5_path's file is not a valid HDF5 file. if not h5py.is_hdf5(f5_path): raise RuntimeError( "file is not of HDF5 (i.e. not FAST5) format") # end if f5_file = h5py.File(f5_path, 'r') for _ in f5_file: break # end for except RuntimeError as runterr: with print_lock: printlog_error_time("Error: FAST5 file is broken") printlog_error("Reading the file `{}` failed.".format( os.path.basename(f5_path))) printlog_error("Reason: {}".format(str(runterr))) printlog_error("Omitting this file...") print() # end with return # end try readids_to_seek = list(fast5_readids(f5_file)) idx_dict = dict() # dictionary for index # This saving is needed to compare with 'len(readids_to_seek)' # after all TSV will be looked through in order to # determine if some reads miss taxonomic annotation. len_before = len(readids_to_seek) # Iterate over TSV-taxann file for tsv_taxann_fpath in tsv_taxann_lst: with open(tsv_taxann_fpath, 'r') as taxann_file: # Get all read IDs in current TSV readids_in_tsv = list( map(lambda l: l.split('\t')[0], taxann_file.readlines())) # Iterate over all other reads in current FAST5 # ('reversed' is necessary because we remove items from list in this loop) for readid in reversed(readids_to_seek): fmt_id = fmt_read_id(readid)[1:] if fmt_id in readids_in_tsv: # If not first -- write data to dict (and to index later) try: idx_dict[tsv_taxann_fpath].append( "read_" + fmt_id) # append to existing list except KeyError: idx_dict[tsv_taxann_fpath] = [ "read_" + fmt_id ] # create a new list finally: readids_to_seek.remove(readid) # end try # end if # end for # end with if len(readids_to_seek) == 0: break # end if # end for # Save info about reads, for which classification if not found # in any of classification files if len(readids_to_seek) != 0: not_fount_key = 'CLASSIF_NOT_FOUND' idx_dict[not_fount_key] = list() for readid in readids_to_seek: fmt_id = fmt_read_id(readid)[1:] idx_dict[not_fount_key].append("read_" + fmt_id) # end for # end if # If after all TSV is checked but nothing have changed -- we miss taxonomic annotation # for some reads! And we will write their IDs to 'missing_reads_lst.txt' file. 
if len(readids_to_seek) == len_before: with print_lock: printlog_error_time( "Error: some reads from FAST5 file not found") printlog_error("This FAST5 file: `{}`".format(f5_path)) printlog_error( "Some reads have not undergone taxonomic annotation.") missing_log = "missing_reads_lst.txt" printlog_error( "List of missing reads are in following file: `{}`".format( missing_log)) with open(missing_log, 'w') as missing_logfile: missing_logfile.write( "Missing reads from file `{}`:\n\n".format(f5_path)) for readid in readids_to_seek: missing_logfile.write(fmt_read_id(readid) + '\n') # end for try: for path in glob(os.path.join(index_dirpath, '*')): os.unlink(path) # end for os.rmdir(index_dirpath) except OSError as oserr: printlog_error_time( "Error occured while removing index directory: {}". format(oserr)) finally: platf_depend_exit(3) # end try # end with # end if with write_lock: try: # Open index files appending to existing data ('c' parameter) with open_shelve(os.path.join(index_dirpath, index_name), 'c') as index_f5_2_tsv: # Update index index_f5_2_tsv[f5_path] = idx_dict # end with except OSError as oserr: printlog_error_time( "Error: cannot create index file `{}`".format( os.path.join(index_dirpath, index_name))) printlog_error(str(oserr)) platf_depend_exit(1) # end try # end with sys.stdout.write('\r') printlog_info_time("File `{}` is processed.".format( os.path.basename(f5_path))) printn(" Working...")
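# Sketch of the index layout written above (the paths are hypothetical examples).
# The shelve is keyed by FAST5 file path; each value maps a TSV annotation file
# to the reads from that FAST5 file that were annotated in it, and the special
# 'CLASSIF_NOT_FOUND' key collects reads that have no annotation at all:
#
#   index_f5_2_tsv['/data/run1/batch_0.fast5'] = {
#       '/outdir/batch_0/classification.tsv': ['read_<id_1>', 'read_<id_2>'],
#       'CLASSIF_NOT_FOUND': ['read_<id_3>'],
#   }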
def process_paral(fq_fa_list, packet_size, tax_annot_res_dir, blast_algorithm, use_index, db_path, nfiles): # Function performs 'many_files'-parallel mode of barapost-local.py. # :param fq_fa_list: list of paths to FASTA and FASTQ files meant to be processed; # :type fq_fa_list: list<str>; # :param packet_size: number of sequences processed by blast in a single launching; # :type packet_size: int; # :param tax_annot_res_dir: path to ouput directory that contains taxonomic annotation; # :type tax_annot_res_dir: str; # :param blast_algorithm: blast algorithm to use; # :type blast_algorithm: str; # :param use_index: logic value indicationg whether to use indes; # :type use_index: bool; # :param db_path: path to database; # :type db_path: str; # :param nfiles: total number of files; # :type nfiles: int; queries_tmp_dir = os.path.join(tax_annot_res_dir, "queries-tmp") # Iterate over source FASTQ and FASTA files for fq_fa_path in fq_fa_list: # Create the result directory with the name of FASTQ of FASTA file being processed: new_dpath = create_result_directory(fq_fa_path, tax_annot_res_dir) # "hname" means human readable name (i.e. without file path and extention) infile_hname = os.path.basename(fq_fa_path) infile_hname = re.search(r"(.+)\.(m)?f(ast)?(a|q)(\.gz)?$", infile_hname).group(1) # Look around and ckeck if there are results of previous runs of this script # If 'look_around' is None -- there is no data from previous run previous_data = look_around(new_dpath, fq_fa_path) if previous_data is None: # If there is no data from previous run num_done_seqs = 0 # number of successfully processed sequences tsv_res_path = os.path.join( new_dpath, "classification.tsv") # form result tsv file path else: # if there is data from previous run num_done_seqs = previous_data[ "n_done_reads"] # get number of successfully processed sequences tsv_res_path = previous_data[ "tsv_respath"] # result tsv file sholud be the same as during previous run # end if how_to_open = OPEN_FUNCS[is_gzipped(fq_fa_path)] fmt_func = FORMATTING_FUNCS[is_gzipped(fq_fa_path)] if is_fastq(fq_fa_path): packet_generator = fastq_packets num_seqs = sum( 1 for line in how_to_open(fq_fa_path)) // 4 # 4 lines per record else: packet_generator = fasta_packets try: num_seqs = len( tuple( filter( lambda l: True if l.startswith('>') else False, map(fmt_func, how_to_open(fq_fa_path).readlines())))) except UnicodeDecodeError as err: with print_lock: print() printlog_warning("Warning: current file is broken: {}."\ .format(str(err))) printlog_warning("File: `{}`".format( os.path.abspath(fq_fa_path))) printlog_warning("This file will not be processed.") continue # end with # end try # end if if num_seqs == num_done_seqs: with counter_lock: file_counter.value += 1 i = file_counter.value # save to local var and release lock # end with with print_lock: sys.stdout.write('\r') printlog_info_time("File #{}/{} (`{}`) has been already completely processed.".\ format(i, nfiles, fq_fa_path)) printlog_info("Omitting it.") printn("Working...") # end with continue # end if for packet in packet_generator(fq_fa_path, packet_size, num_done_seqs): # Blast the packet align_xml_text = launch_blastn(packet["fasta"], blast_algorithm, use_index, queries_tmp_dir, db_path) # Cnfigure result TSV lines result_tsv_lines = parse_align_results_xml(align_xml_text, packet["qual"]) # Write the result to tsv write_classification(result_tsv_lines, tsv_res_path) # end for with counter_lock: file_counter.value += 1 i = file_counter.value # save to local var and release lock # end with 
with print_lock: sys.stdout.write('\r') printlog_info_time("File #{}/{} (`{}`) is processed.".\ format(i, nfiles, os.path.basename(fq_fa_path))) printn("Working...") # end with # end for query_fpath = os.path.join(queries_tmp_dir, "query{}_tmp.fasta".format(os.getpid())) remove_tmp_files(query_fpath)
def process(fq_fa_list, n_thr, packet_size, tax_annot_res_dir, blast_algorithm, use_index, db_path): # Function preforms "few_files"-parallel mode. # # :param fq_fa_list: list of paths to files meant to be processed; # :type fq_fa_list: list<str>; # :param n_thr: number of threads to launch; # :type n_thr: int; # :param packet_size: number of sequences processed by blast in a single launching; # :type packet_size: int; # :param tax_annot_res_dir: path to ouput directory that contains taxonomic annotation; # :type tax_annot_res_dir: str; # :param blast_algorithm: blast algorithm to use; # :type blast_algorithm: str; # :param use_index: logic value indicationg whether to use indes; # :type use_index: bool; # :param db_path: path to database; # :type db_path: str; nfiles = len(fq_fa_list) for i, fq_fa_path in enumerate(fq_fa_list): # Create the result directory with the name of FASTQ of FASTA file being processed: new_dpath = create_result_directory(fq_fa_path, tax_annot_res_dir) # "hname" means human readable name (i.e. without file path and extention) infile_hname = os.path.basename(fq_fa_path) infile_hname = re.search(r"(.+)\.(m)?f(ast)?(a|q)(\.gz)?$", infile_hname).group(1) # Look around and ckeck if there are results of previous runs of this script # If 'look_around' is None -- there is no data from previous run previous_data = look_around(new_dpath, fq_fa_path) if previous_data is None: # If there is no data from previous run num_done_seqs = 0 # number of successfully processed sequences tsv_res_path = os.path.join(new_dpath, "classification.tsv") # form result tsv file path else: # if there is data from previous run num_done_seqs = previous_data["n_done_reads"] # get number of successfully processed sequences tsv_res_path = previous_data["tsv_respath"] # result tsv file sholud be the same as during previous run # end if how_to_open = OPEN_FUNCS[ is_gzipped(fq_fa_path) ] fmt_func = FORMATTING_FUNCS[ is_gzipped(fq_fa_path) ] if is_fastq(fq_fa_path): packet_generator = fastq_packets num_seqs = sum(1 for line in how_to_open(fq_fa_path)) // 4 # 4 lines per record else: packet_generator = fasta_packets try: num_seqs = len(tuple(filter(lambda l: True if l.startswith('>') else False, map(fmt_func, how_to_open(fq_fa_path).readlines())))) except UnicodeDecodeError as err: print() printlog_warning("Warning: current file is broken: {}."\ .format(str(err))) printlog_warning("File: `{}`".format(os.path.abspath(fq_fa_path))) printlog_warning("This file will not be processed.") continue # end try # end if packet_size = min(packet_size, num_seqs // n_thr) if num_seqs == num_done_seqs: sys.stdout.write('\r') printlog_info_time("File #{}/{} (`{}`) has been already completely processed."\ .format(i+1, nfiles, fq_fa_path)) printlog_info("Omitting it.") printn("Working...") return # end if # Get number of seqeunces to pass to each thread file_part_size = num_seqs // n_thr if num_seqs % n_thr != 0: file_part_size += 1 # end if pool = mp.Pool(n_thr, initializer=init_proc_single_file_in_paral, initargs=(mp.Lock(), mp.Lock(),)) pool.starmap(process_part_of_file, [(file_part, tsv_res_path, packet_size, tax_annot_res_dir, blast_algorithm, use_index, db_path) for file_part in packet_generator(fq_fa_path, file_part_size, num_done_seqs)]) # Reaping zombies pool.close() pool.join() sys.stdout.write('\r') printlog_info_time("File #{}/{} (`{}`) is processed.".\ format(i+1, nfiles, os.path.basename(fq_fa_path))) printn("Working...") # end for
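# The chunking above is a plain ceiling division: e.g. 1003 sequences split
# among 4 threads gives file parts of 251 sequences each (the last part being
# smaller). An equivalent one-liner, shown only for illustration:
#
#   file_part_size = -(-num_seqs // n_thr)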
# Check if there is legacy taxonomy file and, if so, reformat it to new (TSV) format legacy_taxonomy_handling.check_deprecated_taxonomy(tax_annot_res_dir) if n_thr != 1: from src.spread_files_equally import spread_files_equally # end if # |=== Proceed "FAST5-untwisting" if it is enabled ===| printlog_info('-' * 30) print() if untwist_fast5 and not use_old_index: printlog_info_time("Untwisting started.") printn(" Working...") from src.binning_modules.binning_spec import get_tsv_taxann_lst tsv_taxann_lst = get_tsv_taxann_lst(tax_annot_res_dir) if n_thr == 1: for f5_path in fast5_list: utw_module.map_f5reads_2_taxann(f5_path, tsv_taxann_lst, tax_annot_res_dir) sys.stdout.write('\r') printlog_info_time("File `{}` is processed.".format( os.path.basename(f5_path))) printn(" Working...") # end for else:
def build_local_db(tax_annot_res_dir, acc_fpath, your_own_fasta_lst, accs_to_download, use_index): # Function creates a database with utilities from 'blast+' toolkit # according to acc_dict and your_own_fasta_lst. # # :param tax_annot_res_dir: path to current result directory # (each processed file has it's own result directory); # :type tax_annot_res_dir: str; # :param acc_fpath: path to file "hits_to_download.tsv"; # :type acc_fpath: str; # :param your_own_fasta_lst: list of user's fasta files to be included in database; # :type your_own_fasta_lst: list<str>; # :param accs_to_download: list of accessions from command line ('-s'); # :type accs_to_download: list<str>; # :param use_index: whether to use index; # :type use_index: str; # Returns path to created database. # Path to directory in which database will be placed db_dir = os.path.join(tax_annot_res_dir, "local_database") # Path to DBM taxonomy file taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv") try: os.makedirs(db_dir) except OSError: #If this directory exists while True: if len(os.listdir(db_dir)) == 0: # If db directory is empty -- break and build a database break else: print() printlog_info("Database directory is not empty:") printlog_info(" `{}`".format(os.path.abspath(db_dir))) printlog_info("Here is it's content:") for i, fname in enumerate(os.listdir(os.path.abspath(db_dir))): printlog_info(" {}. `{}`".format(i + 1, fname)) # end for reply = input( """\nPress ENTER to start classification using existing database. Enter 'r' to remove all files in this directory and create the database from the beginning:>>""" ) if reply == "": # Do not build a database, just return path to it. printlog_info("You have chosen to use extant database.") # Return path to DB located in this directory dbpath = next(iter(os.listdir(db_dir))) dbpath = dbpath.partition(".fasta")[0] + dbpath.partition( ".fasta")[1] # remove all after '.fasta' return os.path.join(db_dir, dbpath) elif reply == 'r': printlog_info("You have chosen to rebuild the database.") # Rename old classification files and write actual data to new one: old_classif_dirs = filter( lambda d: os.path.exists( os.path.join(d, "classification.tsv")), glob(os.path.join(tax_annot_res_dir, "*"))) old_classif_files = tuple( map(lambda f: os.path.join(f, "classification.tsv"), old_classif_dirs)) if len(old_classif_files) > 0: print() printlog_info("Renaming old classification files:") for classif_file in old_classif_files: rename_file_verbosely(classif_file) # end for # end if # Empty database directory for file in glob("{}{}*".format(db_dir, os.sep)): os.unlink(file) # end for # Break from the loop in order to build a database break else: print("Invalid reply: `{}`\n".format(reply)) continue # end if # end if # end while # end try # It is a dictionary of accessions and record names. # Accessions are keys, record names are values. acc_dict = configure_acc_dict(acc_fpath, your_own_fasta_lst, accs_to_download) if len(accs_to_download) != 0: verify_cl_accessions(accs_to_download, acc_dict) # end if # Retrieve already existing taxonomy data from taxonomy file tax_exist_accs = taxonomy.get_tax_keys(taxonomy_path) # If accession file does not exist and execution has reached here -- everything is OK -- # we are building a database from user's files only. 
if len(acc_dict) != 0: print() print("""Following sequences (and all replicons related to them) will be downloaded from Genbank for further taxonomic classification on your local machine:\n""") printlog_info( "Following sequences (and all replicons related to them) \ will be downloaded from Genbank for further taxonomic classification \ on your local machine:") for i, acc in enumerate(acc_dict.keys()): printlog_info(" {}. {} - `{}`".format(i + 1, acc, acc_dict[acc])) # end for search_for_related_replicons(acc_dict) printlog_info_time("Completing taxonomy file...") for i, acc in enumerate(acc_dict.keys()): if not acc in tax_exist_accs: taxonomy.find_taxonomy(acc, acc_dict[acc][1], taxonomy_path) # end if # Accessions can be of different length printn("\r{} - {}: {}/{}".format(getwt(), acc, i + 1, len(acc_dict)) + " " * 10 + "\b" * 10) # end for print() printlog_info_time("Taxonomy file is consistent.") # end if local_fasta = os.path.join( db_dir, "local_seq_set.fasta") # path to downloaded FASTA file add_lambda_phage(local_fasta, taxonomy_path) # add lambda phage control sequence retrieve_fastas_by_acc( acc_dict, db_dir, local_fasta) # download main fasta data from GenBank # Add 'your own' fasta files to database if not len(your_own_fasta_lst) == 0: # This variable counts sequences from local files. # It is necessary for not allowing duplicated accessions. own_seq_counter = 0 # Check if these files are assembly made by SPAdes or a5 spades_patt = r">NODE_[0-9]+" # this pattern will match sequence IDs generated y SPAdes a5_patt = r">scaffold_[0-9]+" # this pattern will match sequence IDs generated y a5 assemblies = list( ) # this list will contain paths to assembly files (SPAdes or a5) for own_fasta_path in reversed(your_own_fasta_lst): how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)] fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)] with how_to_open(own_fasta_path) as fasta_file: first_seq_id = fmt_func(fasta_file.readline( )) # get the first line in file (the first seq ID) # end with # if we've got SPAdes assembly if not re.search(spades_patt, first_seq_id) is None: assemblies.append(own_fasta_path) # Remove these file from list -- they will be processed in a specific way your_own_fasta_lst.remove(own_fasta_path) continue # end if # if we've got a5 assembly if not re.search(a5_patt, first_seq_id) is None: assemblies.append(own_fasta_path) your_own_fasta_lst.remove(own_fasta_path) continue # end if # end for # Include assemblies files in multi-fasta file # Find common prefix of all assembly paths and remove it from assembly names if len(assemblies) > 1: assemblies_formatted = tuple( map(lambda f: os.path.abspath(f).replace(os.sep, '-'), assemblies)) common_prefix = find_common_prefix(assemblies_formatted) assemblies_formatted = tuple( map(lambda f: f.replace(common_prefix, ''), assemblies_formatted)) elif len(assemblies) > 0: common_prefix = '' assemblies_formatted = tuple(map(os.path.basename, assemblies)) # end if # Add assembled sequences to database with open(local_fasta, 'a') as fasta_db: for i, assm_path in enumerate(assemblies): printlog_info("Adding `{}` to database...".format( os.path.basename(assm_path))) assm_name_fmt = assemblies_formatted[i] how_to_open = OPEN_FUNCS[is_gzipped(assm_path)] fmt_func = FORMATTING_FUNCS[is_gzipped(assm_path)] with how_to_open(assm_path) as fasta_file: for line in fasta_file: line = fmt_func(line) # You can find comments to "OWN_SEQ..." below. 
# Paths will be written to seq IDs in following way: # some-happy-path.fastq-- # in order to retrieve them securely with regex later. if line.startswith('>'): own_seq_counter += 1 own_acc = "OWN_SEQ_{}".format(own_seq_counter) own_def = "{}--".format( assm_name_fmt.replace(common_prefix, '')) + line[1:] own_def = remove_bad_chars(own_def) taxonomy.save_taxonomy_directly( taxonomy_path, own_acc, own_def) line = ">" + "{} {}".format(own_acc, own_def) # end if fasta_db.write(line + '\n') # end for # end with # end for # end with with open(local_fasta, 'a') as fasta_db: for own_fasta_path in your_own_fasta_lst: printlog_info("Adding `{}` to database...".format( os.path.basename(own_fasta_path))) how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)] fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)] with how_to_open(own_fasta_path) as fasta_file: for line in fasta_file: line = fmt_func(line) # 'makeblastdb' considers first word (sep. is space) as sequence ID # and throws an error if there are duplicated IDs. # In order not to allow this duplication we'll create our own sequence IDs: # 'OWN_SEQ_<NUMBER>' and write it in the beginning of FASTA record name. if line.startswith('>'): own_seq_counter += 1 own_acc = "OWN_SEQ_{}".format(own_seq_counter) taxonomy.save_taxonomy_directly( taxonomy_path, own_acc, line[1:]) line = ">" + own_acc + ' ' + remove_bad_chars( line[1:]) # end if fasta_db.write(line + '\n') # end for # end with # end for # end with # end if # 'lcl|ACCESSION...' entries can be given with '.1' # (or '.2', whatever) terminus by blastn. # There is no '.1' terminus in taxonomy file. # Therefore we will prune accessions in advance. print() printn("{} - Formatting accessions...".format(getwt())) log_info("Formatting accessions...") corrected_path = os.path.join(db_dir, "corrected_seqs.fasta") with open(local_fasta, 'r') as source_file, open(corrected_path, 'w') as dest_file: for line in source_file: if line.startswith('>'): line = line.strip() acc, seq_name = (line.partition(' ')[0], line.partition(' ')[2]) acc = acc.partition('.')[0] seq_name = remove_bad_chars(seq_name) seq_name = re.sub(r'[^\x00-\x7F]+', '_', seq_name) # remove non-ascii chars line = ' '.join((acc, seq_name)) + '\n' # end if dest_file.write(line) # end for # end with os.unlink(local_fasta) os.rename(corrected_path, local_fasta) sys.stdout.write("\r{} - Formatting accessions... 
ok".format(getwt())) log_info("Formatting accessions done.") # Configure command line make_db_cmd = "makeblastdb -in {} -parse_seqids -dbtype nucl".format( local_fasta) exit_code = os.system(make_db_cmd) # make a blast-format database if exit_code != 0: printlog_error_time("Error occured while making the database") platf_depend_exit(exit_code) # end if print("\033[1A{} - Database is successfully created: `{}`\n".format( getwt(), local_fasta)) log_info("Database is successfully created: `{}`".format(local_fasta)) if use_index == "true": printlog_info_time("Database index creating started") # Configure command line make_index_cmd = "makembindex -input {} -iformat blastdb -verbosity verbose".format( local_fasta) exit_code = os.system( make_index_cmd) # create an index for the database if exit_code != 0: printlog_info_time("Error occured while creating database index") platf_depend_exit(exit_code) # end if printlog_info_time("Database index has been successfully created") # end if # Gzip downloaded FASTA file printlog_info_time("Gzipping FASTA file: `{}`".format(local_fasta)) if gzip_util_found: os.system("{} -v {}".format(gzip_util, local_fasta)) else: # form .fasta.gz file 'by hand' with open(local_fasta, 'rb') as fasta_file, open_as_gzip(local_fasta + ".gz", "wb") as fagz_file: shutil_copyfileobj(fasta_file, fagz_file) # end with os.unlink(local_fasta) # remove source FASTA file, not the database # end if return local_fasta
def bin_fast5_file(f5_path, tax_annot_res_dir, sens, min_qual, min_qlen, min_pident, min_coverage, no_trash): # Function bins FAST5 file with untwisting. # # :param f5_path: path to FAST5 file meant to be processed; # :type f5_path: str; # :param tax_annot_res_dir: path to directory containing taxonomic annotation; # :type tax_annot_res_dir: str; # :param sens: binning sensitivity; # :type sens: str; # :param min_qual: threshold for quality filter; # :type min_qual: float; # :param min_qlen: threshold for length filter; # :type min_qlen: int (or None, if this filter is disabled); # :param min_pident: threshold for alignment identity filter; # :type min_pident: float (or None, if this filter is disabled); # :param min_coverage: threshold for alignment coverage filter; # :type min_coverage: float (or None, if this filter is disabled); # :param no_trash: loical value. True if user does NOT want to output trash files; # :type no_trash: bool; outdir_path = os.path.dirname( logging.getLoggerClass().root.handlers[0].baseFilename) seqs_pass = 0 # counter for sequences, which pass filters QL_seqs_fail = 0 # counter for too short or too low-quality sequences align_seqs_fail = 0 # counter for sequences, which align to their best hit with too low identity or coverage srt_file_dict = dict() index_dirpath = os.path.join( tax_annot_res_dir, index_name) # name of directory that will contain indicies # Make filter for quality and length QL_filter = get_QL_filter(f5_path, min_qual, min_qlen) # Configure path to trash file if not no_trash: QL_trash_fpath = get_QL_trash_fpath( f5_path, outdir_path, min_qual, min_qlen, ) else: QL_trash_fpath = None # end if # Make filter for identity and coverage align_filter = get_align_filter(min_pident, min_coverage) # Configure path to this trash file if not no_trash: align_trash_fpath = get_align_trash_fpath(f5_path, outdir_path, min_pident, min_coverage) else: align_trash_fpath = None # end if # File validation: # RuntimeError will be raised if FAST5 file is broken. try: # File existance checking is performed while parsing CL arguments. # Therefore, this if-statement will trigger only if f5_path's file is not a valid HDF5 file. if not h5py.is_hdf5(f5_path): raise RuntimeError("file is not of HDF5 (i.e. 
not FAST5) format") # end if from_f5 = h5py.File(f5_path, 'r') for _ in from_f5: break # end for except RuntimeError as runterr: printlog_error_time("FAST5 file is broken") printlog_error("Reading the file `{}` crashed.".format( os.path.basename(f5_path))) printlog_error("Reason: {}".format(str(runterr))) printlog_error("Omitting this file...") print() # Return zeroes -- inc_val won't be incremented and this file will be omitted return (0, 0, 0) # end try # singleFAST5 and multiFAST5 files should be processed in different ways # "Raw" group always in singleFAST5 root and never in multiFAST5 root if "Raw" in from_f5.keys(): f5_cpy_func = copy_single_f5 else: f5_cpy_func = copy_read_f5_2_f5 # end if readids_to_seek = list(from_f5.keys()) # list of not-binned-yet read IDs # Fill the list 'readids_to_seek' for read_name in fast5_readids(from_f5): # Get rid of "read_" readids_to_seek.append(sys.intern(read_name)) # end for # Walk through the index index_f5_2_tsv = open_shelve(os.path.join(index_dirpath, index_name), 'r') if not f5_path in index_f5_2_tsv.keys(): printlog_error_time( "Source FAST5 file `{}` not found in index".format(f5_path)) printlog_error("Try to rebuild index") platf_depend_exit(1) # end if for tsv_path in index_f5_2_tsv[f5_path].keys(): read_names = index_f5_2_tsv[f5_path][tsv_path] taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv") resfile_lines = configure_resfile_lines(tsv_path, sens, taxonomy_path) for read_name in read_names: try: hit_names, *vals_to_filter = resfile_lines[sys.intern( fmt_read_id(read_name)[1:])] except KeyError: printlog_error_time("Error: missing taxonomic annotation info for read `{}`"\ .format(fmt_read_id(read_name)[1:])) printlog_error( "It is stored in `{}` FAST5 file".format(f5_path)) printlog_error( "Try to make new index file (press ENTER on corresponding prompt)." ) printlog_error( "Or, if does not work for you, make sure that taxonomic annotation info \ for this read is present in one of TSV files generated by `barapost-prober.py` and `barapost-local.py`." ) index_f5_2_tsv.close() platf_depend_exit(1) # end try if not QL_filter(vals_to_filter): # Get name of result FASTQ file to write this read in if QL_trash_fpath not in srt_file_dict.keys(): srt_file_dict = update_file_dict(srt_file_dict, QL_trash_fpath) # end if f5_cpy_func(from_f5, read_name, srt_file_dict[QL_trash_fpath]) QL_seqs_fail += 1 elif not align_filter(vals_to_filter): # Get name of result FASTQ file to write this read in if align_trash_fpath not in srt_file_dict.keys(): srt_file_dict = update_file_dict(srt_file_dict, align_trash_fpath) # end if f5_cpy_func(from_f5, read_name, srt_file_dict[align_trash_fpath]) align_seqs_fail += 1 else: for hit_name in hit_names.split( "&&" ): # there can be multiple hits for single query sequence # Get name of result FASTQ file to write this read in binned_file_path = os.path.join( outdir_path, "{}.fast5".format(hit_name)) if binned_file_path not in srt_file_dict.keys(): srt_file_dict = update_file_dict( srt_file_dict, binned_file_path) # end if f5_cpy_func(from_f5, read_name, srt_file_dict[binned_file_path]) # end for seqs_pass += 1 # end if # end for from_f5.close() index_f5_2_tsv.close() # Close all binned files for file_obj in filter(lambda x: not x is None, srt_file_dict.values()): file_obj.close() # end for sys.stdout.write('\r') printlog_info_time("File `{}` is binned.".format( os.path.basename(f5_path))) printn(" Working...") return (seqs_pass, QL_seqs_fail, align_seqs_fail)
def retrieve_fastas_by_acc(acc_dict, db_dir, local_fasta): # Function downloads set of records from Genbank according to accessions passed to it. # Downloaded FASTA file will be placed in 'db_dir' directory and named 'local_seq_set.fasta' # :param acc_dict: dictionary comntaining accession data of hits; # :type acc_dict: dict<str: tuple<str, str, int>>; # :param db_dir: path to directory in which downloaded FASTA file will be placed; # :type db_dir: str; # :param local_fasta: path to file with reference sequences to be included in database; # :type local_fasta: str; # Path to file with current chunk (see below "100 accession numbers...") tmp_fasta = os.path.join(db_dir, "tmp.fasta") accessions = tuple(set(acc_dict.keys())) if len(accessions) == 0: # just in case return # end if # 100 accession numbers in order not to make too long URL # Download genomes by chunks of 100 sequences. max_accnum = 100 i = 0 accnum = len(accessions) while i < accnum: curr_accessions = accessions[i:i + max_accnum] # slice chunk accs_del_comma = ','.join( curr_accessions) # accessions must be separated by comma in url # E-utilities provide a possibility to download records from Genbank by accessions. retrieve_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?\ db=nuccore&id={}&rettype=fasta&retmode=text".format(accs_del_comma) log_info("Retrieve URL: `{}`".format(retrieve_url)) # GNU wget utility is safer, but there can be presence of absence of it :) wget_util = "wget" util_found = False for d in os.environ["PATH"].split(os.pathsep): if os.path.isdir(d) and wget_util in os.listdir(d): util_found = True break # end if # end for print() printlog_info("{} - Downloading {} reference sequences...".format( getwt(), len(curr_accessions))) if util_found: # If we have wget -- just use it wget_cmd = 'wget --no-check-certificate "{}" -O {}'.format( retrieve_url, tmp_fasta) pipe = sp_Popen(wget_cmd, shell=True) pipe.communicate() if pipe.returncode != 0: printlog_error_time( "Error occured while downloading reference sequences") platf_depend_exit(pipe.returncode) # end if else: # If there are no wget -- we will download sequences with Python disposal stop_wait = Event( ) # a flag variable that will signal waiter-function to stop executing def download_waiter(stop_wait): """ Function waits untill 'local_fasta' file is downloaded. It prints size of downloaded data to console during downloading. This function just waits -- it won't bring you the menu :). 
""" # Wait untill downloading starts while not os.path.exists(tmp_fasta): if not stop_wait.is_set(): return # end if sleep(1) # end while MB_size = 1024**2 # we will divide by it to get megabytes while stop_wait.is_set(): # Get size of downloaded data fsize = round(os.path.getsize(tmp_fasta) / MB_size, 1) # get megabytes printn("\r{} - {} MB downloaded ".format(getwt(), fsize)) sleep(1) # instant updates are not necessary # end while # Print total size of downloaded file (it can be deleted by this time) try: fsize = round(os.path.getsize(tmp_fasta) / MB_size, 1) except OSError: # We can pass this ecxeption -- we do delete this file if downloading crushes # And this function just waits :) pass # end try printlog_info("\r{} - {} MB downloaded ".format( getwt(), fsize)) # end def download_waiter error = True while error: try: waiter = Thread(target=download_waiter, args=(stop_wait, )) # create thread stop_wait.set() # raise the flag waiter.start() # start waiting urllib.request.urlretrieve( retrieve_url, tmp_fasta) # retrieve FASTA file except OSError as err: printlog_error_time( "Error occured while downloading fasta file.") printlog_error(str(err)) printlog_error( "`barapost-local.py` will try again in 30 seconds") if os.path.exists(tmp_fasta): os.unlink(tmp_fasta) # end if sleep(30) else: error = False finally: stop_wait.clear() # lower the flag waiter.join( ) # main thread will wait until waiter function ends it's work # end try # end while # end if printlog_info_time("Downloading is completed") # Write chunk to result fasta file with open(tmp_fasta, 'r') as infile, open(local_fasta, 'a') as outfile: outfile.write(infile.read()) # end with # Remove temp chunk file os.unlink(tmp_fasta) i += max_accnum # go to next chunk
def bin_fast5_file(f5_path, tax_annot_res_dir, sens, min_qual, min_qlen, min_pident, min_coverage, no_trash): # Function bins FAST5 file without untwisting. # # :param f5_path: path to FAST5 file meant to be processed; # :type f5_path: str; # :param tax_annot_res_dir: path to directory containing taxonomic annotation; # :type tax_annot_res_dir: str; # :param sens: binning sensitivity; # :type sens: str; # :param min_qual: threshold for quality filter; # :type min_qual: float; # :param min_qlen: threshold for length filter; # :type min_qlen: int (or None, if this filter is disabled); # :param min_pident: threshold for alignment identity filter; # :type min_pident: float (or None, if this filter is disabled); # :param min_coverage: threshold for alignment coverage filter; # :type min_coverage: float (or None, if this filter is disabled); # :param no_trash: logical value. True if user does NOT want to output trash files; # :type no_trash: bool; outdir_path = os.path.dirname(logging.getLoggerClass().root.handlers[0].baseFilename) seqs_pass = 0 # counter for sequences, which pass filters QL_seqs_fail = 0 # counter for too short or too low-quality sequences align_seqs_fail = 0 # counter for sequences, which align to their best hit with too low identity or coverage srt_file_dict = dict() new_dpath = glob("{}{}*{}*".format(tax_annot_res_dir, os.sep, get_checkstr(f5_path)))[0] tsv_res_fpath = get_res_tsv_fpath(new_dpath) taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv") resfile_lines = configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path) # Make filter for quality and length QL_filter = get_QL_filter(f5_path, min_qual, min_qlen) # Configure path to trash file if not no_trash: QL_trash_fpath = get_QL_trash_fpath(f5_path, outdir_path, min_qual, min_qlen,) else: QL_trash_fpath = None # end if # Make filter for identity and coverage align_filter = get_align_filter(min_pident, min_coverage) # Configure path to this trash file if not no_trash: align_trash_fpath = get_align_trash_fpath(f5_path, outdir_path, min_pident, min_coverage) else: align_trash_fpath = None # end if # File validation: # RuntimeError will be raised if FAST5 file is broken. try: # File existence checking is performed while parsing CL arguments. # Therefore, this if-statement will trigger only if f5_path's file is not a valid HDF5 file. if not h5py.is_hdf5(f5_path): raise RuntimeError("file is not of HDF5 (i.e. not FAST5) format")
# end if from_f5 = h5py.File(f5_path, 'r') for _ in from_f5: break # end for except RuntimeError as runterr: printlog_error_time("FAST5 file is broken") printlog_error("Reading the file `{}` crashed.".format(os.path.basename(f5_path))) printlog_error("Reason: {}".format( str(runterr) )) printlog_error("Omitting this file...") print() # Return zeroes -- inc_val won't be incremented and this file will be omitted return (0, 0, 0) # end try # singleFAST5 and multiFAST5 files should be processed in different ways # "Raw" group always in singleFAST5 root and never in multiFAST5 root if "Raw" in from_f5.keys(): f5_cpy_func = copy_single_f5 else: f5_cpy_func = copy_read_f5_2_f5 # end if for _, read_name in enumerate(fast5_readids(from_f5)): try: hit_names, *vals_to_filter = resfile_lines[sys.intern(fmt_read_id(read_name))[1:]] # omit 'read_' in the beginning of FAST5 group's name except KeyError: printlog_error_time("Error: read `{}` not found in TSV file containing taxonomic annotation."\ .format(fmt_read_id(read_name))) printlog_error("This TSV file: `{}`".format(tsv_res_fpath)) printlog_error("Try running barapost-binning with `-u` (`--untwist-fast5`) flag.\n") platf_depend_exit(1) # end try # If read is found in TSV file: if not QL_filter(vals_to_filter): QL_seqs_fail += 1 # Get name of result FAST5 file to write this read in if QL_trash_fpath not in srt_file_dict.keys(): srt_file_dict = update_file_dict(srt_file_dict, QL_trash_fpath) # end if f5_cpy_func(from_f5, read_name, srt_file_dict[QL_trash_fpath]) elif not align_filter(vals_to_filter): align_seqs_fail += 1 # Get name of result FAST5 file to write this read in if align_trash_fpath not in srt_file_dict.keys(): srt_file_dict = update_file_dict(srt_file_dict, align_trash_fpath) # end if f5_cpy_func(from_f5, read_name, srt_file_dict[align_trash_fpath]) else: for hit_name in hit_names.split("&&"): # there can be multiple hits for single query sequence # Get name of result FAST5 file to write this read in binned_file_path = os.path.join(outdir_path, "{}.fast5".format(hit_name)) if binned_file_path not in srt_file_dict.keys(): srt_file_dict = update_file_dict(srt_file_dict, binned_file_path) # end if f5_cpy_func(from_f5, read_name, srt_file_dict[binned_file_path]) # end for seqs_pass += 1 # end if # end for from_f5.close() # Close all binned files for file_obj in filter(lambda x: not x is None, srt_file_dict.values()): file_obj.close() # end for sys.stdout.write('\r') printlog_info_time("File `{}` is binned.".format(os.path.basename(f5_path))) printn(" Working...") return (seqs_pass, QL_seqs_fail, align_seqs_fail)
def crosstalks_runner(args): # Runner function for crosstalk detection task. # # :param args: arguments for crosstalk detection task; # :type args: CrosstalksArguments; # # Returns two collections: # 1. A collection of valid (non-crosstalk) paths. # 2. A collection of trash (crosstalk) paths. def write_positive(fastq_records, valid_files, statistics): # Function for writing non-crosstalk fastq records. src.fastq.write_fastq_records(fastq_records, valid_files) statistics.increment_positive() # end def crosstalk_write_positive def write_negative(fastq_records, trash_files, statistics): # Function for writing crosstalk fastq records. src.fastq.write_fastq_records(fastq_records, trash_files) statistics.increment_negative() # end def crosstalk_write_positive def crosstalk_iteration(fastq_records, primers, threshold, max_offset, write_positive, write_negative): not_crosstalk, fastq_records = crosstalk_pipe(primers, fastq_records, threshold, max_offset) if not_crosstalk: write_positive(fastq_records) else: write_negative(fastq_records) # end if # end def printlog_info_time('Start cross-talk detection.') # Get crosstalk pipe crosstalk_pipe = pcp.provide_crosstalk_pipe(args.cut_off_primers) # Cnofigure and open output files valid_fpaths, trash_fpaths = ofn.get_crosstalk_outfpaths( args.outdir, args.infpaths) valid_files = src.filesystem.open_files_for_appending(valid_fpaths) trash_files = src.filesystem.open_files_for_appending(trash_fpaths) # Create BinaryStatistics object crosstalk_statistics = src.binary_statistics.BinaryStatistics() # Connect non-crosstalk output files and statistics to `write_positive` function crosstalk_write_positive = functools.partial( write_positive, valid_files=valid_files, statistics=crosstalk_statistics) # Connect crosstalk output files and statistics to `write_negative` function crosstalk_write_negative = functools.partial( write_negative, trash_files=trash_files, statistics=crosstalk_statistics) # Connect arguments to `crosstalk_iteration` function # for status_bar to be wrapped around it. loaded_crosstalk_iteration = functools.partial( crosstalk_iteration, primers=args.primers, threshold=args.threshold, max_offset=args.max_offset, write_positive=crosstalk_write_positive, write_negative=crosstalk_write_negative) # Proceed src.status_bar.run_status_bar(loaded_crosstalk_iteration, args.infpaths) # Close files for file_collection in (valid_files, trash_files): src.filesystem.close_files(file_collection) # end for printlog_info_time('{} ({}%) cross-talks detected.'\ .format(crosstalk_statistics.negative_stat, crosstalk_statistics.get_negative_percents()) ) return valid_fpaths, trash_fpaths
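# The `functools.partial` calls above pre-bind the output files and the
# statistics object, so `run_status_bar` only has to supply the fastq records.
# A minimal illustration of the same pattern (all names here are hypothetical):
import functools

def _write(records, files, stats):
    files.append(records)
    stats.append(len(records))
# end def _write

def _partial_demo():
    files, stats = [], []
    bound_write = functools.partial(_write, files=files, stats=stats)
    bound_write(['@read1', 'ACGT', '+', 'IIII'])  # only the records are passed
    return files, stats
# end def _partial_demo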
def ngmerge_runner(args):
    # Runner function for the NGmerge task.
    #
    # :param args: arguments for the NGmerge task;
    # :type args: NGmergeArguments;
    #
    # Returns two collections:
    #   1. A collection of valid ("merged") paths.
    #   2. A collection of trash ("unmerged") paths.

    print()
    printlog_info_time('Running NGmerge...')

    # NGmerge puts result files into the working directory --
    #   we will temporarily go to the output directory
    old_dir = os.getcwd()
    os.chdir(args.outdir)

    # Configure output files' names
    merged_basename, unmerged_prefix = ofn.get_ngmerge_outprefixes(args.infpaths[0])

    # Configure command
    ngmerge_cmd = '{} -1 {} -2 {} -o {} -f {} -n {} -v -m {} -p {} -q {}'\
        .format(args.ngmerge, args.infpaths[0], args.infpaths[1],
                merged_basename, unmerged_prefix, args.n_thr,
                args.min_overlap, args.mismatch_frac, args.phred_offset)
    printlog_info('Command: `{}`'.format(ngmerge_cmd))

    # Run NGmerge
    print('NGmerge is doing its job silently...')
    pipe = sp.Popen(ngmerge_cmd, shell=True, stderr=sp.PIPE)
    stderr = pipe.communicate()[1].decode('utf-8')

    if pipe.returncode != 0:
        # Error
        printlog_error('Error running NGmerge: {}'.format(stderr))
        platf_depend_exit(pipe.returncode)
    # end if

    # Parse merging statistics from NGmerge's stderr
    stderr = stderr.splitlines()[1:]
    reads_pattern = r'Fragments \(pairs of reads\) analyzed: ([0-9]+)'
    merged_pattern = r'Successfully stitched: ([0-9]+)'

    # Collect statistics
    try:
        reads_processed = int(re.search(reads_pattern, stderr[0]).group(1))
        merged_reads = int(re.search(merged_pattern, stderr[1]).group(1))
    except (ValueError, AttributeError) as err:
        printlog_error('Error 78 ({}). Please, contact the developer.'.format(err))
        platf_depend_exit(78)
    # end try

    os.chdir(old_dir)  # return to the old directory

    printlog_info_time('NGmerge merged {}/{} ({}%) read pairs.'\
        .format(merged_reads, reads_processed,
                round(merged_reads / reads_processed * 100, 2)))

    # Configure absolute paths to output files.
    # The first returned value must be a collection.
    merged_fpath = os.path.join(args.outdir, merged_basename)
    unmerged_fpaths = sorted(
        glob.glob(os.path.join(args.outdir, '{}*.fastq'.format(unmerged_prefix))))

    return [merged_fpath], unmerged_fpaths
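For illustration, here is the kind of stderr text the two regular expressions above are written against, together with the same extraction logic. The numbers are made up and the exact NGmerge wording may vary between versions:

import re

# Hypothetical verbose output; only the two counted lines matter here.
example_stderr = (
    'Fragments (pairs of reads) analyzed: 125000\n'
    'Successfully stitched: 118734\n'
)
lines = example_stderr.splitlines()
reads_processed = int(re.search(r'Fragments \(pairs of reads\) analyzed: ([0-9]+)', lines[0]).group(1))
merged_reads = int(re.search(r'Successfully stitched: ([0-9]+)', lines[1]).group(1))
print(reads_processed, merged_reads)  # 125000 118734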
# Proceeding.
# The main goal of multiprocessing is to isolate processes from one another.
#
# Two situations are possible:
#   1. Number of threads <= number of files meant to be processed ('many_files' parallel mode):
#      Files will be distributed equally among processes.
#      Processes interact with one another only while printing things to the console
#      for the user's entertainment.
#   2. Number of threads > number of files meant to be processed ('few_files' parallel mode):
#      Files will be processed one by one. Each file will be divided into equal blocks,
#      and these blocks will be distributed among processes.
#      Processes interact with one another while writing to the result file and
#      while printing things to the console.
# A minimal sketch of the 'many_files' idea follows this block.

print()
printlog_info_time("Starting classification.")
printn(" Working...")

if n_thr <= len(fq_fa_list):
    if n_thr != 1:
        # Proceed with 'many_files' parallel processing
        from src.barapost_local_modules.parallel_mult_files import process
        process(fq_fa_list, n_thr, packet_size, tax_annot_res_dir,
                blast_algorithm, use_index, db_path)
    else:
        # Proceed with single-thread processing
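A minimal sketch of the 'many_files' mode described above: whole files are handed to worker processes, so the workers stay isolated except for console output. This is an illustration with a hypothetical per-file worker, not the project's `process` function:

import multiprocessing as mp
import os

def classify_one_file(fpath):
    # Hypothetical per-file worker; the real code runs classification here.
    return fpath, os.path.getsize(fpath)

def classify_many_files(fq_fa_list, n_thr):
    # Distribute whole files among `n_thr` worker processes.
    with mp.Pool(processes=n_thr) as pool:
        return pool.map(classify_one_file, fq_fa_list)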
def _get_record_title(record_id):
    # Function retrieves the title (aka definition) and accession
    #   of a GenBank record by a given accession or GI number.
    # :param record_id: accession or GI number of the record;
    # :type record_id: str;
    # Returns a tuple of two elements:
    #   (<RECORD_TITLE>, <RECORD_ACCESSION>)

    # We'll use E-utilities to communicate with GenBank
    eutils_server = "eutils.ncbi.nlm.nih.gov"
    esummary = "esummary.fcgi"  # utility name

    # Configure URL
    url = "/entrez/eutils/{}?db=nuccore&id={}".format(esummary, record_id)

    # Sometimes (I never figured out why) this XML arrives empty, and StopIteration emerges.
    # So, if we just repeat the request, everything is going to be ok.
    error = True
    print_ok = False
    while error:
        # Send the request and get the response
        summary = lingering_https_get_request(eutils_server, url,
            "e-summary of nuccore record {}".format(record_id))

        # Parse the XML that we've got
        root = ElementTree.fromstring(summary)

        # Elements of our interest are all named "Item",
        #   but they have different "Name" attributes.
        # They are children of the "DocSum" element, which is
        #   the first child of the root.
        try:
            docsum = next(iter(root))
        except StopIteration:
            print()
            printlog_info_time("Failed to retrieve data for record {}. Trying again..."
                .format(record_id))
            print_ok = True  # print this "ok" only after a successful attempt following a failure
        else:
            if print_ok:
                printlog_info("ok")
            # end if
            error = False
        # end try
    # end while

    record_title = None
    record_acc = None

    # Search for title and accession
    for item in docsum.iter("Item"):
        if item.attrib["Name"] == "Title":
            record_title = item.text
        elif item.attrib["Name"] == "AccessionVersion":
            # Remove the version just in case
            record_acc = re.search(r"(.*)\.[0-9]+", item.text).group(1)
        # end if
    # end for

    if record_title is None or record_acc is None:
        printlog_error_time("Error 8989: can't access e-summary for `{}`".format(record_id))
        platf_depend_exit(1)
    # end if

    return record_title, record_acc
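For reference, an abridged e-summary payload of the shape the parsing above expects (the field values are invented), with the same Title/AccessionVersion lookup:

from xml.etree import ElementTree

# Abridged, illustrative e-summary XML; values are made up.
example_xml = """<eSummaryResult>
  <DocSum>
    <Id>123456</Id>
    <Item Name="Title" Type="String">Example bacterium strain X chromosome, complete genome</Item>
    <Item Name="AccessionVersion" Type="String">NZ_XX000000.1</Item>
  </DocSum>
</eSummaryResult>"""

root = ElementTree.fromstring(example_xml)
docsum = next(iter(root))  # the first child is DocSum
for item in docsum.iter("Item"):
    if item.attrib["Name"] == "Title":
        print("Title:", item.text)
    elif item.attrib["Name"] == "AccessionVersion":
        print("Accession:", item.text)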
def bin_fastqa_file(fq_fa_path, tax_annot_res_dir, sens, min_qual, min_qlen,
                    min_pident, min_coverage, no_trash):
    # Function for single-thread binning of FASTQ and FASTA files.
    #
    # :param fq_fa_path: path to the FASTQ (or FASTA) file meant to be processed;
    # :type fq_fa_path: str;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param min_qual: threshold for the quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for the length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for the alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for the alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if the user does NOT want to output trash files;
    # :type no_trash: bool;

    outdir_path = os.path.dirname(
        logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0  # counter for sequences that pass the filters
    QL_seqs_fail = 0  # counter for too short or too low-quality sequences
    align_seqs_fail = 0  # counter for sequences that align to their best hit with too low identity or coverage

    srt_file_dict = dict()  # dict containing file objects of existing output files

    new_dpath = get_curr_res_dpath(fq_fa_path, tax_annot_res_dir)
    tsv_res_fpath = get_res_tsv_fpath(new_dpath)
    taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")
    resfile_lines = configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path)

    # Configure the records generator and the write function
    if is_fastq(fq_fa_path):
        seq_records_generator = fastq_records
        write_fun = write_fastq_record
    else:
        seq_records_generator = fasta_records
        write_fun = write_fasta_record
    # end if

    # Make the filter for quality and length
    QL_filter = get_QL_filter(fq_fa_path, min_qual, min_qlen)
    # Configure the path to the QL trash file
    if not no_trash:
        QL_trash_fpath = get_QL_trash_fpath(fq_fa_path, outdir_path, min_qual, min_qlen)
    else:
        QL_trash_fpath = None
    # end if

    # Make the filter for identity and coverage
    align_filter = get_align_filter(min_pident, min_coverage)
    # Configure the path to the alignment trash file
    if not no_trash:
        align_trash_fpath = get_align_trash_fpath(fq_fa_path, outdir_path,
                                                  min_pident, min_coverage)
    else:
        align_trash_fpath = None
    # end if

    for fastq_rec in seq_records_generator(fq_fa_path):

        read_name = sys.intern(fmt_read_id(fastq_rec["seq_id"])[1:])  # get the ID of the sequence

        try:
            hit_names, *vals_to_filter = resfile_lines[read_name]  # find the hit corresponding to this sequence
        except KeyError:
            printlog_error_time("Error: read `{}` not found in TSV file containing taxonomic annotation."\
                .format(read_name))
            printlog_error("This TSV file: `{}`".format(tsv_res_fpath))
            printlog_error("Make sure that this read has already been processed by "
                "`barapost-prober.py` and `barapost-local.py`.")
            platf_depend_exit(1)
        # end try

        # Apply the filters
        if not QL_filter(vals_to_filter):
            QL_seqs_fail += 1
            # Place this sequence into the QL trash file
            if QL_trash_fpath not in srt_file_dict.keys():
                srt_file_dict = update_file_dict(srt_file_dict, QL_trash_fpath)
            # end if
            write_fun(srt_file_dict[QL_trash_fpath], fastq_rec)  # write the current read to the trash file
        elif not align_filter(vals_to_filter):
            align_seqs_fail += 1
            # Place this sequence into the alignment trash file
            if align_trash_fpath not in srt_file_dict.keys():
                srt_file_dict = update_file_dict(srt_file_dict, align_trash_fpath)
            # end if
            write_fun(srt_file_dict[align_trash_fpath], fastq_rec)  # write the current read to the trash file
        else:
            # There can be multiple hits for a single query sequence
            for hit_name in hit_names.split("&&"):
                # Get the name of the result file to write this read in
                binned_file_path = os.path.join(outdir_path,
                    "{}.fast{}".format(hit_name, 'q' if is_fastq(fq_fa_path) else 'a'))
                if binned_file_path not in srt_file_dict.keys():
                    srt_file_dict = update_file_dict(srt_file_dict, binned_file_path)
                # end if
                write_fun(srt_file_dict[binned_file_path], fastq_rec)  # write the current read to the binned file
            # end for
            seqs_pass += 1
        # end if
    # end for

    # Close all binned files
    for file_obj in filter(lambda x: x is not None, srt_file_dict.values()):
        file_obj.close()
    # end for

    sys.stdout.write('\r')
    printlog_info_time("File `{}` is binned.".format(os.path.basename(fq_fa_path)))
    printn(" Working...")

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)