Example 1
def download_waiter(stop_wait):
    """
    Function waits until 'local_fasta' file is downloaded.
    It prints the size of the downloaded data to the console while downloading.
    This function just waits -- it won't bring you the menu :).
    """
    # Wait until downloading starts
    while not os.path.exists(tmp_fasta):
        if not stop_wait.is_set():
            return
        # end if
        sleep(1)
    # end while

    MB_size = 1024**2  # we will divide by it to get megabytes

    while stop_wait.is_set():
        # Get size of downloaded data
        fsize = round(os.path.getsize(tmp_fasta) / MB_size, 1)  # get megabytes
        printn("\r{} - {} MB downloaded ".format(getwt(), fsize))
        sleep(1)  # instant updates are not necessary
    # end while

    # Print total size of downloaded file (it can be deleted by this time)
    try:
        fsize = round(os.path.getsize(tmp_fasta) / MB_size, 1)
    except OSError:
        # We can ignore this exception -- we delete this file if downloading crashes,
        # and this function just waits :)
        pass
    # end try
    printlog_info("\r{} - {} MB downloaded ".format(getwt(), fsize))
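
For context, a minimal sketch of how this waiter is driven (mirroring the real wiring in Example 19 below): the threading.Event doubles as a keep-running flag that the main thread raises before starting the download and lowers when it finishes. `retrieve_url` and `tmp_fasta` are assumed to come from the enclosing scope, as in Example 19.

import urllib.request
from threading import Event, Thread

stop_wait = Event()
waiter = Thread(target=download_waiter, args=(stop_wait,))
stop_wait.set()    # raise the flag: the waiter should run and print progress
waiter.start()
try:
    urllib.request.urlretrieve(retrieve_url, tmp_fasta)  # the actual download
finally:
    stop_wait.clear()  # lower the flag: tell the waiter to stop
    waiter.join()      # wait until the waiter thread finishes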
Example 2
def ask_for_resumption():
    # Function asks the user whether to resume the previous run.
    # Returns True if the decision is to resume, else False.

    resume = None

    while resume is None:
        resume = input("""
Would you like to resume the previous run?
   1 -- Resume!
   2 -- Start from the beginning.

Enter a number (1 or 2):>> """)
        # Check if the entered value is an integer. If not, give another attempt.
        try:
            resume = int(resume)
            # Check if input number is 1 or 2
            if resume != 1 and resume != 2:
                print("\n   Not a VALID number entered!\a\n" + '~' * 20)
                resume = None
            else:
                action = "resume the previous run" if resume == 1 else "start from the beginning"
                printlog_info("You have chosen to {}.".format(action))
                print()
            # end if
        except ValueError:
            print("\nNot an integer number entered!\a\n" + '~' * 20)
            resume = None
        # end try

    return resume == 1
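
Example 16 below shows the real call site: `look_around()` invokes this prompt when it finds results of a previous run. A minimal sketch of that pattern (paths as in Example 16):

resume = ask_for_resumption()
if not resume:
    # Move old results out of the way and start over (see Example 5).
    rename_file_verbosely(tsv_res_fpath)
    rename_file_verbosely(tmp_fpath)
else:
    printlog_info("Let's try to resume...")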
Example 3
def gzip_outfiles(outdir):
    # Function gzips all fastq files in directory `outdir`.
    #
    # :param outdir: path to outdir;
    # :type outdir: str;

    # Get gzipping function
    gzip_func = _get_gzip_func()
    print()
    printlog_info_time('Gzipping output files...')

    # Get fastq files
    is_fastq = lambda x: not re.match(r'.+\.f(ast)?q$', x) is None
    fq_fpaths = filter(is_fastq, glob.iglob(os.path.join(outdir, '*')))

    # Gzip them!
    for fpath in fq_fpaths:
        try:
            gzip_func(fpath)
        except OSError as err:
            printlog_info('Error: cannot gzip file `{}`: {}.'.format(
                fpath, err))
            platf_depend_exit(1)
        # end try
    # end for

    printlog_info_time('Output files are gzipped.')
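
`_get_gzip_func` itself is not shown in these examples. Judging from Examples 4 and 11 (the pure-Python and GNU gzip variants) and the wget lookup in Example 19, a plausible sketch is a PATH scan with a pure-Python fallback; the body below is an assumption, not the project's actual code:

def _get_gzip_func():
    # Assumed sketch: prefer the GNU gzip utility if it is on PATH,
    # otherwise fall back to the pure-Python shutil implementation.
    for d in os.environ["PATH"].split(os.pathsep):
        if os.path.isdir(d) and "gzip" in os.listdir(d):
            return gzip_with_gnu_gzip  # see Example 11
        # end if
    # end for
    return gzip_with_shutil  # see Example 4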
Example 4
def gzip_with_shutil(fpath):
    # Function for gzipping using Python functionality.
    printlog_info('Gzipping `{}`'.format(fpath))
    with open(fpath, 'rb') as plain_file, gzip.open(fpath + '.gz',
                                                    'wb') as gz_file:
        shutil.copyfileobj(plain_file, gz_file)
    # end with
    os.unlink(fpath)
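
As presented, the snippet depends on module-level imports and the project's logger; a self-contained equivalent (with a plain print standing in for printlog_info) would look like this:

import gzip
import os
import shutil

def gzip_with_shutil(fpath):
    # Compress `fpath` to `fpath + '.gz'` in pure Python, then remove the original.
    print('Gzipping `{}`'.format(fpath))
    with open(fpath, 'rb') as plain_file, \
            gzip.open(fpath + '.gz', 'wb') as gz_file:
        shutil.copyfileobj(plain_file, gz_file)
    # end with
    os.unlink(fpath)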
Example 5
def rename_file_verbosely(file):
    # Function verbosely renames the file (or directory) given to it.
    # :param file: path to the file (or directory) meant to be renamed;
    # :type file: str;

    if not os.path.exists(file):
        return None
    # end if

    # Path to "file's" parent directory
    pardir = os.path.abspath(os.path.dirname(file))

    # Function can rename directories
    if os.path.isdir(file):
        is_analog = lambda f: not re.search(r"{}.*(_old_[0-9]+)?$"\
            .format(os.path.basename(file)), f) is None
        word = "directory"
        name_itself = file
        ext = ""
    else:
        is_analog = lambda f: re.search(r"(.*)\..*$", os.path.basename(file)
                                        ).group(1) in f
        word = "file"
        name_itself = re.search(r"(.*)\..*$", file).group(1)
        ext = re.search(r".*(\..*)$", file).group(1)
    # end if

    # Count files in 'pardir' that have analogous names as 'file' has:
    num_analog_files = len(list(filter(is_analog, os.listdir(pardir))))

    if re.search(r"_old_[0-9]+", file) is None:
        # Append "_old_<number>"
        new_name = name_itself + "_old_" + str(num_analog_files) + ext
    else:
        # Merely substitute new number
        new_name = file.replace(
            re.search(r"_old_([0-9]+)", file).group(1),
            str(num_analog_files + 1))
    # end if

    try:
        print()
        printlog_info(" - Renaming old {}:".format(word))
        printlog_info("  `{}` --> `{}`".format(file, new_name))
        os.rename(file, new_name)
    except OSError as err:
        printlog_error_time("Error: {} `{}` cannot be renamed:".format(
            word, str(file)))
        printlog_error(str(err))
        platf_depend_exit(1)
    # end try

    return new_name
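
Example 16 below uses this helper to move stale results aside before a fresh run; a typical call looks like this (the resulting name is illustrative):

# An existing `classification.tsv` would be renamed to
# something like `classification_old_1.tsv`.
new_name = rename_file_verbosely(tsv_res_fpath)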
Example 6
def check_deprecated_taxonomy(classif_dir):

    legacy_tax_path = os.path.join(classif_dir, "taxonomy", "taxonomy")

    if os.path.exists(legacy_tax_path):
        print()
        printlog_info("Legacy taxonomy file detected: `{}`.".format(legacy_tax_path))
        printlog_info("It will be reformatted to new format -- to plain TSV.")

        _reformat_legacy_file(legacy_tax_path)
Example 7
def search_for_related_replicons(acc_dict):
    # Function searches for replicons related to those in 'hits_to_download.tsv'
    #   or specified with the '-s' option.
    # :param acc_dict: dictionary containing accession data of hits;
    # :type acc_dict: dict<str: tuple<str, str, int>>;

    print()
    printlog_info_time("Searching for related replicons...")

    start_accs = tuple(
        acc_dict.keys())  # accessions, which were "discovered" by prober

    for i, acc in enumerate(start_accs):

        printlog_info("{}. {} ({}):".format(i + 1, acc, acc_dict[acc]))

        # Search for related replicons:
        try:
            related_repls = _get_related_replicons(acc, acc_dict)
        except AttributeError:
            printlog_error_time(
                "Parsing error: cannot find replicons related to {}.".format(
                    acc))
            printlog_error("Please, contact the developer")
            platf_depend_exit(1)
        else:
            related_repls = _deduplicate_replicons(related_repls, acc)
        # end try
        for rel_acc, rel_def in related_repls:
            acc_dict[rel_acc] = rel_def
        # end for
    # end for

    print()
    if len(start_accs) != len(acc_dict):  # there are some new replicons
        printlog_info_time("{} related replicons have been found.".\
            format(len(acc_dict) - len(start_accs)))
    else:
        printlog_info_time("No related replicons found.")
    # end if
    print()


# end def search_for_related_replicons
Example 8
def verify_taxids(taxid_list):
    # Function verifies TaxIDs passed to prober with the `-g` option.
    # Function requests the NCBI Taxonomy Browser and parses the organism's name from the HTML response.
    # What is more, this function configures the `organisms` list -- it will be included in BLAST submissions.
    #
    # :param taxid_list: list of TaxIDs. TaxIDs are strings, but they are verified to be integers
    #     during CL argument parsing;
    # :type taxid_list: list<str>;
    #
    # Returns list of strings of the following format: "<tax_name> (taxid:<TaxID>)"

    organisms = list()
    if len(taxid_list) > 0:

        printlog_info("Verifying TaxIDs:")
        for taxid in taxid_list:
            printn("   {} - ".format(taxid))
            try:
                tax_resp = lingering_https_get_request(
                    "www.ncbi.nlm.nih.gov",
                    "/Taxonomy/Browser/wwwtax.cgi?mode=Info&id={}".format(
                        taxid), "taxonomy")
                tax_name = re.search(r"Taxonomy browser \((.+?)\)",
                                     tax_resp).group(1)
            except AttributeError:
                printlog_error("\aError: TaxID not found")
                printlog_error(
                    "Please, check your TaxID: https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi"
                )
                platf_depend_exit(1)
            except OSError as oserr:
                printlog_error("Something is wrong with connection:")
                printlog_error(str(oserr))
                platf_depend_exit(-2)
            else:
                print(tax_name)
                log_info("{} - {}".format(taxid, tax_name))
                organisms.append("{} (taxid:{})".format(tax_name, taxid))
            # end try
        # end for
        print('-' * 30 + '\n')

    # end if
    return organisms
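
Example 20 below calls this as `organisms = verify_taxids(taxid_list)`. A hypothetical call with a known TaxID (562 is Escherichia coli) would yield, per the format string above:

organisms = verify_taxids(["562"])
# -> ["Escherichia coli (taxid:562)"], assuming the ID resolves at NCBI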
Example 9
    def whether_to_build_index(index_dirpath):
        # Function checks if there are any files in the index directory.
        # If there are, it asks the user whether to create a new index or use the old one.

        # :param index_dirpath: path to index directory;
        # :type index_dirpath: str;

        use_old_index = False

        if len(os.listdir(index_dirpath)) != 0:
            printlog_info(
                "Index file created by `-u` option already exists (left from previous run)."
            )

            error = True

            while error:
                reply = input("""  Press ENTER to make new index file
  or enter 'u' to use old index file:>>""")
                if reply == "":
                    try:
                        for path in glob(os.path.join(index_dirpath, '*')):
                            os.unlink(path)
                        # end for
                    except OSError as oserr:
                        printlog_error_time(
                            "Error: cannot remove old index files!")
                        printlog_error(str(oserr))
                        platf_depend_exit(1)
                    # end try
                    error = False
                elif reply == 'u':
                    use_old_index = True
                    error = False
                else:
                    print("Invalid reply!\n")
                # end if
            # end while
            printlog_info("You have chosen to {} index file.".format(
                "use old" if use_old_index else "make new"))
            print()
        # end if
        return use_old_index
Example 10
def verify_cl_accessions(accs_to_download, acc_dict):
    # Function checks existence of GenBank records that correspond to accessions
    #   specified with the '-s' option. After checking, the function fills 'acc_dict'.

    # :param accs_to_download: list of accessions from command line ('-s');
    # :type accs_to_download: list<str>;
    # :param acc_dict: dictionary {<ACCESSION>: <HIT_DEFINITION>};
    # :type acc_dict: dict<str: tuple<str>>;

    check_connection("https://www.ncbi.nlm.nih.gov/")

    printlog_info_time("Verifying `-s` accessions...")
    sys.stdout.write("0/{}".format(len(accs_to_download)))

    for i, acc in enumerate(accs_to_download):

        server = "eutils.ncbi.nlm.nih.gov"
        url = "/entrez/eutils/esummary.fcgi?db=nuccore&id={}".format(acc)
        text = lingering_https_get_request(server, url, "record's name", acc)

        name = re.search(
            r"\<Item Name=\"Title\" Type=\"String\"\>(.+)\</Item\>", text)

        if name is None:
            printlog_info(
                "Cannot find GenBank record with accession '{}'".format(acc))
            platf_depend_exit(1)
        else:
            name = name.group(1)
        # end if

        acc_dict[acc] = name
        sys.stdout.write("\r{}/{}".format(i + 1, len(accs_to_download)))
    # end for
    print()
    printlog_info_time("OK.")
Example 11
def gzip_with_gnu_gzip(fpath):
    # Function for gzipping with GNU gzip.
    printlog_info('Gzipping `{}`'.format(fpath))
    os.system('{} {}'.format(gzip_util, fpath))
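
Note that os.system with string interpolation breaks on paths containing spaces or shell metacharacters. A safer sketch using subprocess (the function name is hypothetical; `gzip_util` is the module-level global the original assumes):

import subprocess

def gzip_with_gnu_gzip_safe(fpath):
    # Same effect as above, but the path is passed as an argument list,
    # so no shell quoting issues can arise.
    printlog_info('Gzipping `{}`'.format(fpath))
    subprocess.run([gzip_util, fpath], check=True)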
Example 12
def wait_for_align(rid, rtoe, pack_to_send, filename):
    # Function waits until the BLAST server completes the request.
    #
    # :param rid: Request ID to wait for;
    # :type rid: str;
    # :param rtoe: time in seconds, estimated by the BLAST server, needed to complete the request;
    # :type rtoe: int;
    # :param pack_to_send: ordinal number (like an id) of the current packet
    #   (a list rather than an int so that it is mutable);
    # :type pack_to_send: list<int>;
    # :param filename: basename of current FASTA file;
    # :type filename: str;
    #
    # Returns a tuple: (XML response text or None, BlastError).

    print()
    print("Requesting for current query status. Request ID: {}".format(rid))
    print(" `{}`; Submission #{}".format(filename, pack_to_send[0]))
    log_info("Requesting for current query status.")
    log_info("Request ID: {}; `{}`; Submission #{}".format(
        rid,
        filename,
        pack_to_send[0],
    ))
    # RTOE can be zero at the very beginning of resumption
    if rtoe > 0:

        printlog_info_time(
            "BLAST server estimates that alignment will be accomplished in {} seconds"
            .format(rtoe))
        printlog_info_time(
            "Waiting for {}+3 (+3 extra) seconds...".format(rtoe))
        # Server might be wrong -- we will give it 3 extra seconds
        sleep(rtoe + 3)
        printlog_info_time(
            "{} seconds have passed. Checking if alignment is accomplished...".
            format(rtoe + 3))
    # end if

    server = "blast.ncbi.nlm.nih.gov"
    wait_url = "/blast/Blast.cgi?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=" + rid

    whtspc_len = 6 + len("(requesting)")

    while True:
        resp_content = lingering_https_get_request(server, wait_url,
                                                   "BLAST response")

        # if server asks to wait
        if "Status=WAITING" in resp_content:
            printn("\r{} - The request is being processed. Waiting{}{}".format(
                getwt(), ' ' * whtspc_len, "\033[%dD" % whtspc_len))
            # print a dot every 10 seconds (six dots per minute)
            for i in range(1, 7):
                sleep(10)
                printn(
                    "\r{} - The request is being processed. Waiting{}".format(
                        getwt(), '.' * i))
            # end for
            printn("(requesting)")
            continue
        elif "Status=FAILED" in resp_content:
            # if job failed
            print()
            printlog_info_time("Job failed\a\n")
            printlog_info("Resending this packet.")
            return None, BlastError(2)
        elif "Status=UNKNOWN" in resp_content:
            # if job expired
            print()
            printlog_info_time("Job expired\a\n")
            printlog_info("Resending this packet.")
            return None, BlastError(1)
        # if results are ready
        elif "Status=READY" in resp_content:
            print()
            printlog_info("Result for query `{}` #{} is ready!".format(
                filename, pack_to_send[0]))
            # if there are hits
            if "ThereAreHits=yes" in resp_content:
                for i in range(15, 0, -5):
                    print('-' * i)
                # end for
                print("-\nRetrieving results...")

                # Retrieve human-readable text and put it into result directory
                retrieve_text_url = "/Blast.cgi?CMD=Get&FORMAT_TYPE=Text&DESCRIPTIONS=1&ALIGNMENTS=1&RID=" + rid
                txt_align_res = lingering_https_get_request(
                    server, retrieve_text_url,
                    "text version of BLAST response")

                # Count already existing plain text files in outdir:
                is_txt_response = lambda f: not re.search(
                    r"prober_blast_response_[0-9]+\.txt", f) is None
                outdir_path = os.path.dirname(logging.getLoggerClass(
                ).root.handlers[0].baseFilename)  # tricky trick
                response_num = len(
                    tuple(filter(is_txt_response, os.listdir(outdir_path))))

                # Current txt response file will have number `response_num+1`
                txt_hpath = os.path.join(
                    outdir_path,
                    "prober_blast_response_{}.txt".format(response_num + 1))
                # Write text result for a human to read
                with open(txt_hpath, 'w') as txt_file:
                    txt_file.write(txt_align_res)
                # end with
            elif "ThereAreHits=no" in resp_content:
                # if there are no hits
                printlog_info_time("There are no hits. It happens.\n")
            else:
                # the job has probably failed if execution reaches here
                print()
                printlog_info_time("Job failed\a\n")
                printlog_info("Resending this packet.")
                return None, BlastError(2)
            # end if
            break
        # end if
        # Execution should not reach here
        printlog_error_time(
            "Fatal error (-122). Please contact the developer.\a\n")
        platf_depend_exit(-122)
    # end while

    # Retrieve XML result
    retrieve_xml_url = "/Blast.cgi?CMD=Get&FORMAT_TYPE=XML&ALIGNMENTS=1&RID=" + rid
    xml_text = lingering_https_get_request(server, retrieve_xml_url,
                                           "XML BLAST response")

    if "Bad Gateway" in xml_text:
        print()
        printlog_info_time("Bad Gateway. Data from last packet has been lost.")
        printlog_info("Resending this packet.")
        return None, BlastError(1)

    elif "Status=FAILED" in xml_text:
        print()
        printlog_info_time("BLAST error: request failed")
        printlog_info("Resending this packet.")
        return None, BlastError(2)

    elif "to start it again" in xml_text:
        print()
        printlog_info_time("BLAST error")
        printlog_info("Resending this packet.")
        return None, BlastError(2)

    elif "[blastsrv4.REAL]" in xml_text:
        blastsrv4_match = re.search(r"(\[blastsrv4\.REAL\].*\))", xml_text)
        blastsrv4_str = "" if blastsrv4_match is None else ": {}".format(
            blastsrv4_match.group(1))
        printlog_info_time("BLAST server error{}".format(blastsrv4_str))
        # Error code 2 indicates that we need to split the packet and resubmit
        return None, BlastError(2)
    # end if

    return xml_text, BlastError(0)
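
Stripped of progress printing and result retrieval, the polling protocol above reduces to a small loop. A distilled sketch (`poll_blast_status` is a hypothetical name, not the project's code):

def poll_blast_status(rid):
    # Poll the SearchInfo endpoint until the job is READY, FAILED or UNKNOWN.
    url = "/blast/Blast.cgi?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=" + rid
    while True:
        resp = lingering_https_get_request("blast.ncbi.nlm.nih.gov", url,
                                           "BLAST response")
        if "Status=WAITING" in resp:
            sleep(60)  # the original waits ~60 seconds per round, printing dots
            continue
        # end if
        if "Status=READY" in resp:
            return "ThereAreHits=yes" in resp  # True if hits exist
        # end if
        return None  # FAILED or UNKNOWN: the caller resends the packet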
Example 13
def ngmerge_runner(args):
    # Runner function for NGmerge task.
    #
    # :param args: arguments for NGmerge task;
    # :type args: NGmergeArguments;
    #
    # Returns two collections:
    # 1. A collection of valid ("merged") paths.
    # 2. A collection of trash ("unmerged") paths.

    print()
    printlog_info_time('Running NGmerge...')

    # NGmerge puts result files into working directory --
    #   we will temporarily go to output directory
    old_dir = os.getcwd()
    os.chdir(args.outdir)

    # Configure output files' names
    merged_basename, unmerged_prefix = ofn.get_ngmerge_outprefixes(
        args.infpaths[0])

    # Configure command
    ngmerge_cmd = '{} -1 {} -2 {} -o {} -f {} -n {} -v -m {} -p {} -q {}'\
        .format(args.ngmerge, args.infpaths[0], args.infpaths[1],
        merged_basename, unmerged_prefix, args.n_thr,
        args.min_overlap, args.mismatch_frac, args.phred_offset)
    printlog_info('Command: `{}`'.format(ngmerge_cmd))

    # Run NGmerge
    print("NGmerge is doing its job silently...")
    pipe = sp.Popen(ngmerge_cmd, shell=True, stderr=sp.PIPE)
    stderr = pipe.communicate()[1].decode('utf-8')  # run NGmerge

    if pipe.returncode != 0:
        # error
        printlog_error('Error running NGmerge: {}'.format(stderr))
        platf_depend_exit(pipe.returncode)
    # end if

    # Parse merging statistics from NGmerge's stderr
    stderr = stderr.splitlines()[1:]
    reads_pattern = r'Fragments \(pairs of reads\) analyzed: ([0-9]+)'
    merged_pattern = r'Successfully stitched: ([0-9]+)'

    # Collect statistics
    try:
        reads_processed = int(re.search(reads_pattern, stderr[0]).group(1))
        merged_reads = int(re.search(merged_pattern, stderr[1]).group(1))
    except (ValueError, AttributeError) as err:
        printlog_error(
            'Error 78 ({}). Please, contact the developer.'.format(err))
        platf_depend_exit(78)
    # end try

    os.chdir(old_dir)  # return to old dir

    printlog_info_time('NGmerge merged {}/{} ({}%) read pairs.'\
        .format(merged_reads, reads_processed,
            round(merged_reads / reads_processed * 100, 2)))

    # Configure absolute paths to output files.
    merged_fpath = os.path.join(args.outdir, merged_basename)
    unmerged_fpaths = sorted(
        glob.glob(
            os.path.join(args.outdir, '{}*.fastq'.format(unmerged_prefix))))

    # Oh yeah, first returned value must be a collection.
    return [merged_fpath], unmerged_fpaths
Example 14
def send_request(request, pack_to_send, packet_size, packet_mode, filename,
                 tmp_fpath):
    # Function sends a request to "blast.ncbi.nlm.nih.gov/blast/Blast.cgi"
    #     and then waits for satisfaction of the request and retrieves response text.
    #
    # :param request: request data (a dict that the `configure_request()` function returns);
    # :type request: dict<dict>;
    # :param pack_to_send: ordinal number (like an id) of the packet meant to be sent now
    #   (a list rather than an int so that it is mutable);
    # :type pack_to_send: list<int>;
    # :param packet_size: number of sequences in the packet;
    # :type packet_size: int;
    # :param packet_mode: packet forming mode;
    # :type packet_mode: int;
    # :param filename: basename of current FASTA file;
    # :type filename: str;
    # :param tmp_fpath: path to the temporary file where request data is saved;
    # :type tmp_fpath: str;
    #
    # Returns the result of `wait_for_align()`: a tuple (XML text or None, BlastError).

    payload = request["payload"]
    headers = request["headers"]

    server = "blast.ncbi.nlm.nih.gov"
    url = "/blast/Blast.cgi"
    error = True

    while error:
        try:
            conn = http.client.HTTPSConnection(server)  # create a connection
            conn.request("POST", url, payload, headers)  # send the request
            response = conn.getresponse()  # get the response
            response_text = str(response.read(), "utf-8")  # get response text
        except OSError as oserr:
            printlog_info_time(
                "`https://blast.ncbi.nlm.nih.gov` is not available.")
            printlog_info(str(oserr))
            printlog_info(
                "barapost will try to connect again in 30 seconds...\n")
            sleep(30)

        # if no exception occurred
        else:
            error = False
        # end try
    # end while

    try:
        rid = re.search(r"RID = (.+)",
                        response_text).group(1)  # get Request ID
        rtoe = int(re.search(r"RTOE = ([0-9]+)", response_text).group(
            1))  # get time to wait provided by the NCBI server
    except AttributeError:
        printlog_error_time("Seems, NCBI has denied your request.")
        printlog_error("Response is in file `request_denial_response.html`")
        with open("request_denial_response.html", 'w') as den_file:
            den_file.write(response_text)
        # end with
        platf_depend_exit(1)
    finally:
        conn.close()
    # end try

    # Save temporary data
    with open(tmp_fpath, 'w') as tmpfile:
        tmpfile.write("Request_ID: {}\n".format(rid))
        tmpfile.write("Packet_size: {}\n".format(packet_size))
        tmpfile.write("Packet_mode: {}".format(packet_mode))
    # end with

    # Wait for results of alignment
    return wait_for_align(rid, rtoe, pack_to_send, filename)
Example 15
def lingering_https_get_request(server, url, request_for=None, acc=None):
    # Function performs a "lingering" HTTPS request.
    # It means that the function tries to get the response
    #     again and again if the request fails.
    #
    # :param server: server address;
    # :type server: str;
    # :param url: the rest of url;
    # :type url: str;
    # :param request_for: some comment for error message;
    # :type request_for: str;
    # :param acc: GenBank accession;
    # :type acc: str;
    #
    # Returns obtained response coded in UTF-8 ('str').

    error = True

    # We can get a spurious 404 or similar due to the instability of NCBI servers.
    # Let's give it 3 attempts (with 15 sec pauses in between),
    #   and if all of them are unsuccessful -- terminate execution.
    attempt_i = 0
    max_attempts = 3

    while error:
        try:
            conn = http.client.HTTPSConnection(server,
                                               timeout=30)  # create connection
            conn.request("GET", url)  # send the request
            response = conn.getresponse()  # get the response

            if response.code != 200:
                if attempt_i < max_attempts and "ncbi.nlm.nih.gov" in server:
                    printlog_error("Error {}: {}.".format(
                        response.code, response.reason))
                    printlog_error(
                        "It may be due to instable work of NCBI servers.")
                    printlog_error("{} attempts to connect left, waiting 15 sec..."\
                        .format(max_attempts - attempt_i))
                    attempt_i += 1
                    sleep(15)
                else:
                    printlog_error("Cannot find {} for {}.".format(
                        request_for, acc))
                    printlog_error("Request failed with status code {}: {}"\
                        .format(response.code, response.reason))
                    platf_depend_exit(1)
                # end if
            # end if

            resp_content = str(response.read(), "utf-8")  # get response text
        except (OSError,\
                http.client.RemoteDisconnected,\
                socket.gaierror,\
                http.client.CannotSendRequest) as err:
            comment_str = ""
            if not request_for is None:
                comment_str += " requesting for {}".format(request_for)
                if not acc is None:
                    comment_str += " (accession: `{}`)".format(acc)
                # end if
                comment_str += '.'
            # end if
            print()
            printlog_info("Can't connect to `{}`{}".format(
                server + url, comment_str))
            printlog_info(str(err))
            printlog_info(
                "The program will sleep for 30 seconds and try to connect again."
            )
            sleep(30)
        else:
            error = False  # if no exception occurred, get out of the loop
        finally:
            conn.close()
        # end try
    # end while
    return resp_content
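
Two real call sites from Examples 8 and 10 above illustrate the signature:

# From Example 8: fetch a Taxonomy Browser page for a TaxID.
tax_resp = lingering_https_get_request(
    "www.ncbi.nlm.nih.gov",
    "/Taxonomy/Browser/wwwtax.cgi?mode=Info&id={}".format(taxid),
    "taxonomy")

# From Example 10: fetch an e-summary for an accession.
text = lingering_https_get_request(
    "eutils.ncbi.nlm.nih.gov",
    "/entrez/eutils/esummary.fcgi?db=nuccore&id={}".format(acc),
    "record's name", acc)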
Example 16
def look_around(outdir_path, new_dpath, infile_path, blast_algorithm, acc_dict,
                probing_batch_size):
    # Function looks around to check if there are results from previous run(s) of this script,
    #   in order to resume the previous run.
    #
    # Returns None if there is no result from a previous run.
    # If there are results from a previous run, returns a dict of the following structure:
    # {
    #     "RID": saved_RID <str>,
    #     "packet_size_save": saved packet size <int>,
    #     "packet_mode_save": saved packet mode <int>,
    #     "tsv_respath": path_to_tsv_file_from_previous_run <str>,
    #     "n_done_reads": number_of_successful_requests_from_current_FASTA_file <int>,
    #     "tmp_fpath": path_to_temporary_file <str>,
    #     "decr_pb": value decreasing the size of the probing batch (see below, where this variable is defined) <int>
    # }
    #
    # :param outdir_path: path to output directory;
    # :type outdir_path: str;
    # :param new_dpath: path to current (corresponding to fq_fa_path file) result directory;
    # :type new_dpath: str;
    # :param infile_path: path to current (corresponding to fq_fa_path file) FASTA file;
    # :type infile_path: str;
    # :param blast_algorithm: BLASTn algorithm to use.
    #     This parameter is necessary because it is included in the names of result files;
    # :type blast_algorithm: str;
    # :param acc_dict: dictionary of accession info of hits;
    # :type acc_dict: dict<str: tuple<str, str, int>>;
    # :param probing_batch_size: number of sequences meant to be processed in a single run;
    # :type probing_batch_size: int;

    # "hname" means human readable name (i.e. without file path and extension)
    fasta_hname = os.path.basename(infile_path)  # get rid of absolute path
    fasta_hname = re.search(r"(.*)\.(m)?f(ast)?a", fasta_hname).group(
        1)  # get rid of `.fasta` extension

    # Form path to temporary file
    tmp_fpath = "{}_{}_temp.txt".format(os.path.join(new_dpath, fasta_hname),
                                        blast_algorithm)
    # Form path to result file
    tsv_res_fpath = os.path.join(new_dpath, "classification.tsv")
    # Form path to file with hits to download
    acc_fpath = os.path.join(outdir_path, "hits_to_download.tsv")

    num_done_seqs = 0  # variable to keep number of successfully processed sequences

    resume = None
    # Check if there are results from previous run.
    if os.path.exists(tsv_res_fpath) or os.path.exists(tmp_fpath):
        print()
        printlog_info(
            "A result file from previous run is found in the directory:")
        printlog_info("   `{}`".format(new_dpath))
        # Allow politely to continue from last successfully sent packet.
        resume = ask_for_resumption()
    # end if

    if not resume:
        rename_file_verbosely(tsv_res_fpath)
        rename_file_verbosely(tmp_fpath)
        rename_file_verbosely(acc_fpath)
    else:
        printlog_info("Let's try to resume...")

        # Collect information from result file
        if os.path.exists(tsv_res_fpath):
            # There can be invalid information in this file
            try:
                with open(tsv_res_fpath, 'r') as res_file:
                    lines = res_file.readlines()
                    num_done_seqs = len(lines) - 1  # the first line is a head
                    last_line = lines[-1]
                    last_seq_id = last_line.split('\t')[0]
                # end with
                # There must be 10 columns in each row:
                if any(map(lambda l: l.count('\t') != 9, lines)):
                    raise ValueError(
                        "There must be 10 columns separated by tabs in file `classification.tsv`"
                    )
                # end if

            except Exception as err:
                printlog_error_time(
                    "\nData in classification file `{}` not found or broken. Reason:"
                    .format(tsv_res_fpath))
                printlog_error(' ' + str(err))

                # If the reason is known -- print erroneous lines
                if isinstance(err, ValueError):
                    printlog_error("Here are numbers of improper lines:")
                    for i, line in enumerate(lines):
                        if line.count('\t') != 9:
                            printlog_error(str(i + 1) + ": `{}`".format(line))
                        # end if
                    # end for
                # end if

                # Ask a user if he/she wants to start from the beginning or to quit
                error = True
                while error:
                    reply = input("""Press ENTER to start from the beginning
  or enter `q` to quit:>> """)
                    if reply == "":
                        error = False
                        printlog_info(
                            "You have chosen to start from the beginning.\n")
                        rename_file_verbosely(tsv_res_fpath)
                        rename_file_verbosely(tmp_fpath)
                        rename_file_verbosely(acc_fpath)
                        return None
                    elif reply == 'q':
                        platf_depend_exit(0)
                    else:
                        print("! - Invalid reply: `{}`\n".format(reply))
                    # end if
                # end while
            else:
                printlog_info("Last classified sequence: " + last_seq_id)
                printlog_info(
                    "{} sequences have been already processed".format(
                        num_done_seqs))
            # end try
        # end if

        # Collect information from accession file
        if os.path.exists(acc_fpath):

            # There can be invalid information in this file
            try:
                with open(acc_fpath, 'r') as acc_file:
                    lines = acc_file.readlines()[
                        9:]  # omit description and head of the table
                    local_files_filtered = list(
                        filter(lambda x: False if os.path.exists(x) else True,
                               lines))  # omit file paths
                    for line in local_files_filtered:
                        vals = line.split('\t')
                        acc = sys.intern(vals[0].strip())
                        if len(vals) == 1:
                            acc_dict[acc] = [
                                "No definition of the sequence provided", 1
                            ]
                        elif len(vals) == 2:
                            acc_dict[acc] = [vals[1].strip(), 1]
                        else:
                            acc_dict[acc] = [
                                vals[1].strip(),
                                int(vals[2].strip())
                            ]
                        # end if
                    # end for
                # end with

            except Exception as err:
                printlog_error_time(
                    "Data in accession file `{}` not found or broken. Reason:".
                    format(acc_fpath))
                printlog_error(' ' + str(err))
                printlog_error("Invalid line: `{}`".format(line))

                # Ask a user if he/she wants to start from the beginning or to quit
                error = True
                while error:
                    reply = input("""Press ENTER to start from the beginning
  or enter `q` to quit:>> """)
                    if reply == "":
                        error = False
                        printlog_info(
                            "You have chosen to start from the beginning.\n")
                        rename_file_verbosely(tsv_res_fpath)
                        rename_file_verbosely(tmp_fpath)
                        rename_file_verbosely(acc_fpath)
                        return None
                    elif reply == 'q':
                        platf_depend_exit(0)
                    else:
                        print("! - Invalid reply: `{}`\n".format(reply))
                    # end if
                # end while
            else:
                print()
                printlog_info(
                    "Here are Genbank records encountered during previous run(s):"
                )
                for acc, other_info in sorted(acc_dict.items(),
                                              key=lambda x: -x[1][1]):
                    s_letter = "s" if other_info[1] > 1 else ""
                    printlog_info(" {} hit{} - {}, `{}`".format(
                        other_info[1], s_letter, acc, other_info[0]))
                # end for
                print('-' * 20)
            # end try
        # end if

        # Get packet size, number of the last sent packet and RID from the temp file.
        # There can be invalid information in the tmp file, or the tmp file may not exist.
        try:

            with open(tmp_fpath, 'r') as tmp_file:
                temp_lines = tmp_file.readlines()
            # end with

            RID_save = re.search(r"Request_ID: (.+)",
                                 temp_lines[0]).group(1).strip()
            packet_size_save = int(
                re.search(r"Packet_size: ([0-9]*)",
                          temp_lines[1]).group(1).strip())
            packet_mode_save = int(
                re.search(r"Packet_mode: ([0-9]{1})",
                          temp_lines[2]).group(1).strip())

        except (AttributeError, OSError):

            # There is no need to disturb a user, merely proceed.
            return {
                "RID": None,
                "packet_size_save": None,
                "packet_mode_save": None,
                "tsv_respath": tsv_res_fpath,
                "n_done_reads": num_done_seqs,
                "tmp_fpath": tmp_fpath,
                "decr_pb": 0
            }
        else:
            # Let's assume that a user won't modify his/her probing_batch size between erroneous runs:
            #   subtract num_done_reads if probing_batch_size > num_done_reads.
            decr_pb = num_done_seqs if num_done_seqs < probing_batch_size else 0
            # Return data from previous run
            return {
                "RID": RID_save,
                "packet_size_save": packet_size_save,
                "packet_mode_save": packet_mode_save,
                "tsv_respath": tsv_res_fpath,
                "n_done_reads": num_done_seqs,
                "tmp_fpath": tmp_fpath,
                "decr_pb": decr_pb
            }
        # end try
    # end if

    return None
Example 17
def process(fq_fa_list, n_thr, packet_size, tax_annot_res_dir,
            blast_algorithm, use_index, db_path):
    # Function performs "few_files"-parallel mode.
    #
    # :param fq_fa_list: list of paths to files meant to be processed;
    # :type fq_fa_list: list<str>;
    # :param n_thr: number of threads to launch;
    # :type n_thr: int;
    # :param packet_size: number of sequences processed by BLAST in a single launch;
    # :type packet_size: int;
    # :param tax_annot_res_dir: path to output directory that contains taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param blast_algorithm: blast algorithm to use;
    # :type blast_algorithm: str;
    # :param use_index: boolean value indicating whether to use index;
    # :type use_index: bool;
    # :param db_path: path to database;
    # :type db_path: str;

    nfiles = len(fq_fa_list)

    for i, fq_fa_path in enumerate(fq_fa_list):
        # Create the result directory with the name of the FASTQ or FASTA file being processed:
        new_dpath = create_result_directory(fq_fa_path, tax_annot_res_dir)

        # "hname" means human readable name (i.e. without file path and extension)
        infile_hname = os.path.basename(fq_fa_path)
        infile_hname = re.search(r"(.+)\.(m)?f(ast)?(a|q)(\.gz)?$", infile_hname).group(1)

        # Look around and check if there are results of previous runs of this script.
        # If 'look_around' returns None -- there is no data from a previous run.
        previous_data = look_around(new_dpath, fq_fa_path)

        if previous_data is None: # If there is no data from previous run
            num_done_seqs = 0 # number of successfully processed sequences
            tsv_res_path = os.path.join(new_dpath, "classification.tsv") # form result tsv file path
        else: # if there is data from previous run
            num_done_seqs = previous_data["n_done_reads"] # get number of successfully processed sequences
            tsv_res_path = previous_data["tsv_respath"] # result tsv file should be the same as during previous run
        # end if

        how_to_open = OPEN_FUNCS[ is_gzipped(fq_fa_path) ]
        fmt_func = FORMATTING_FUNCS[ is_gzipped(fq_fa_path) ]

        if is_fastq(fq_fa_path):
            packet_generator = fastq_packets
            num_seqs = sum(1 for line in how_to_open(fq_fa_path)) // 4 # 4 lines per record
        else:
            packet_generator = fasta_packets
            try:
                num_seqs = len(tuple(filter(lambda l: l.startswith('>'),
                    map(fmt_func, how_to_open(fq_fa_path).readlines()))))
            except UnicodeDecodeError as err:
                print()
                printlog_warning("Warning: current file is broken: {}."\
                    .format(str(err)))
                printlog_warning("File: `{}`".format(os.path.abspath(fq_fa_path)))
                printlog_warning("This file will not be processed.")
                continue
            # end try
        # end if

        packet_size = min(packet_size, num_seqs // n_thr)

        if num_seqs == num_done_seqs:
            sys.stdout.write('\r')
            printlog_info_time("File #{}/{} (`{}`) has already been completely processed."\
                .format(i+1, nfiles, fq_fa_path))
            printlog_info("Omitting it.")
            printn("Working...")
            continue  # skip this file and proceed to the next one
        # end if

        # Get number of sequences to pass to each thread
        file_part_size = num_seqs // n_thr
        if num_seqs % n_thr != 0:
            file_part_size += 1
        # end if

        pool = mp.Pool(n_thr, initializer=init_proc_single_file_in_paral,
            initargs=(mp.Lock(), mp.Lock(),))

        pool.starmap(process_part_of_file, [(file_part,
            tsv_res_path,
            packet_size,
            tax_annot_res_dir,
            blast_algorithm,
            use_index,
            db_path) for file_part in packet_generator(fq_fa_path,
                file_part_size,
                num_done_seqs)])

        # Reaping zombies
        pool.close()
        pool.join()

        sys.stdout.write('\r')
        printlog_info_time("File #{}/{} (`{}`) is processed.".\
            format(i+1, nfiles, os.path.basename(fq_fa_path)))
        printn("Working...")
    # end for
Example 18
def _reformat_legacy_file(legacy_tax_path):

    import shelve

    # Check if this file is corrupted
    try:
        with shelve.open(legacy_tax_path, 'r') as tax_file:
            pass
        # end with
    except OSError as err:
        printlog_error("Legacy taxonomy file appears to be corrupted.")
        printlog_error("This error might be fatal.")
        str_err = str(err)
        if "dbm.gnu" in str_err and "module is not" in str_err:
            printlog_error("Installing `python3-gdbm` might solve this problem.")
        else:
            printlog_error("The program can't recover taxonomy from the broken file.")
            printlog_error("Seems, you have to annotate your sequences again.")
            printlog_error("Sorry for that :(")
        # end if
        platf_depend_exit(1)
    # end try

    new_tax_path = "{}.tsv".format(legacy_tax_path)

    taxonomy.init_tax_file(new_tax_path)

    printn("Reformatting: `{}` ->".format(legacy_tax_path))
    log_info("Reformatting: `{}` ->".format(legacy_tax_path))

    with shelve.open(legacy_tax_path, 'r') as old_tax_file, open(new_tax_path, 'w') as new_tax_file:
        for acc, taxonomy_from_file in old_tax_file.items():
            if isinstance(taxonomy_from_file, tuple):
                tax_str = taxonomy.config_taxonomy_str(taxonomy_from_file)
                new_tax_file.write("{}\n".format('\t'.join( (acc, tax_str) )))
            elif isinstance(taxonomy_from_file, str):
                new_tax_file.write("{}\n".format('\t'.join( (acc, taxonomy_from_file) )))
            else:
                # Execution must not reach here
                printlog_error_time("Fatal error 8755.")
                printlog_error("Please, contact the developer.")
                platf_depend_exit(8755)
            # end if
        # end for
    # end with

    printlog_info(" `<same_dir>/{}`".format(os.path.basename(new_tax_path)))

    try:
        renamed_legacy_file = "{}_deprecated".format(legacy_tax_path)
        os.rename(legacy_tax_path, renamed_legacy_file)
    except OSError as err:
        printlog_error_time("Cannot rename legacy taxonomy file `{}`:".format(legacy_tax_path))
        printlog_error(str(err))
        printlog_error("But it's not a problem -- we will proceed with our work.")
    else:
        printlog_info("Renamed: `{}` -> `<same_dir>/{}`".format(legacy_tax_path,
            os.path.basename(renamed_legacy_file)))
    # end try

    printlog_info("Legacy taxonomy file is reformatted to TSV format.")
Example 19
def retrieve_fastas_by_acc(acc_dict, db_dir, local_fasta):
    # Function downloads set of records from Genbank according to accessions passed to it.
    # Downloaded FASTA file will be placed in 'db_dir' directory and named 'local_seq_set.fasta'

    # :param acc_dict: dictionary containing accession data of hits;
    # :type acc_dict: dict<str: tuple<str, str, int>>;
    # :param db_dir: path to directory in which downloaded FASTA file will be placed;
    # :type db_dir: str;
    # :param local_fasta: path to file with reference sequences to be included in database;
    # :type local_fasta: str;

    # Path to file with current chunk (see below "100 accession numbers...")
    tmp_fasta = os.path.join(db_dir, "tmp.fasta")

    accessions = tuple(set(acc_dict.keys()))
    if len(accessions) == 0:  # just in case
        return
    # end if

    # 100 accession numbers in order not to make too long URL
    # Download genomes by chunks of 100 sequences.
    max_accnum = 100
    i = 0
    accnum = len(accessions)

    while i < accnum:

        curr_accessions = accessions[i:i + max_accnum]  # slice chunk

        accs_del_comma = ','.join(
            curr_accessions)  # accessions must be separated by comma in url
        # E-utilities provide a possibility to download records from Genbank by accessions.
        retrieve_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?\
db=nuccore&id={}&rettype=fasta&retmode=text".format(accs_del_comma)
        log_info("Retrieve URL: `{}`".format(retrieve_url))

        # The GNU wget utility is safer, but it may or may not be present :)
        wget_util = "wget"
        util_found = False
        for d in os.environ["PATH"].split(os.pathsep):
            if os.path.isdir(d) and wget_util in os.listdir(d):
                util_found = True
                break
            # end if
        # end for

        print()
        printlog_info("{} - Downloading {} reference sequences...".format(
            getwt(), len(curr_accessions)))

        if util_found:
            # If we have wget -- just use it

            wget_cmd = 'wget --no-check-certificate "{}" -O {}'.format(
                retrieve_url, tmp_fasta)
            pipe = sp_Popen(wget_cmd, shell=True)
            pipe.communicate()
            if pipe.returncode != 0:
                printlog_error_time(
                    "Error occurred while downloading reference sequences")
                platf_depend_exit(pipe.returncode)
            # end if

        else:
            # If there is no wget -- we will download sequences using Python's own means
            stop_wait = Event(
            )  # a flag variable that will signal waiter-function to stop executing

            def download_waiter(stop_wait):
                """
                Function waits until 'local_fasta' file is downloaded.
                It prints the size of the downloaded data to the console while downloading.
                This function just waits -- it won't bring you the menu :).
                """
                # Wait until downloading starts
                while not os.path.exists(tmp_fasta):
                    if not stop_wait.is_set():
                        return
                    # end if
                    sleep(1)
                # end while

                MB_size = 1024**2  # we will divide by it to get megabytes

                while stop_wait.is_set():
                    # Get size of downloaded data
                    fsize = round(os.path.getsize(tmp_fasta) / MB_size,
                                  1)  # get megabytes
                    printn("\r{} - {} MB downloaded ".format(getwt(), fsize))
                    sleep(1)  # instant updates are not necessary
                # end while

                # Print total size of downloaded file (it can be deleted by this time)
                try:
                    fsize = round(os.path.getsize(tmp_fasta) / MB_size, 1)
                except OSError:
                    # We can ignore this exception -- we delete this file if downloading crashes,
                    # and this function just waits :)
                    pass
                # end try
                printlog_info("\r{} - {} MB downloaded ".format(
                    getwt(), fsize))

            # end def download_waiter

            error = True
            while error:
                try:
                    waiter = Thread(target=download_waiter,
                                    args=(stop_wait, ))  # create thread
                    stop_wait.set()  # raise the flag
                    waiter.start()  # start waiting
                    urllib.request.urlretrieve(
                        retrieve_url, tmp_fasta)  # retrieve FASTA file
                except OSError as err:
                    printlog_error_time(
                        "Error occurred while downloading fasta file.")
                    printlog_error(str(err))
                    printlog_error(
                        "`barapost-local.py` will try again in 30 seconds")
                    if os.path.exists(tmp_fasta):
                        os.unlink(tmp_fasta)
                    # end if
                    sleep(30)
                else:
                    error = False
                finally:
                    stop_wait.clear()  # lower the flag
                    waiter.join(
                    )  # main thread will wait until the waiter function ends its work
                # end try
            # end while
        # end if

        printlog_info_time("Downloading is completed")

        # Write chunk to result fasta file
        with open(tmp_fasta, 'r') as infile, open(local_fasta, 'a') as outfile:
            outfile.write(infile.read())
        # end with

        # Remove temp chunk file
        os.unlink(tmp_fasta)
        i += max_accnum  # go to next chunk
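
The chunking above is the usual slice-by-stride pattern; a minimal, self-contained illustration with dummy accession numbers:

# Minimal illustration of the 100-accession chunking used above.
accessions = tuple("ACC{:03d}".format(n) for n in range(250))  # dummy IDs
max_accnum = 100
for i in range(0, len(accessions), max_accnum):
    chunk = accessions[i:i + max_accnum]
    print(len(chunk))  # prints 100, 100, 50
# end for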
Example 20
check_connection("https://blast.ncbi.nlm.nih.gov")

print("|=== barapost-prober.py (version {}) ===|\n".format(__version__))
log_info("barapost-prober.py (version {})".format(__version__))
print(get_full_time() + " - Start working\n")
log_info("Start working.")

from src.prober_modules.prober_spec import look_around
from src.prober_modules.networking import verify_taxids
from src.prober_modules.kernel import submit, retrieve_ready_job

# Make sure that TaxIDs specified by user actually exist
organisms = verify_taxids(taxid_list)

# Print information about the run
printlog_info(" - Output directory: `{}`;".format(outdir_path))
printlog_info(" - Logging to `{}`".format(
    logging.getLoggerClass().root.handlers[0].baseFilename))
if user_email != "":
    printlog_info(" - Your email: <{}>".format(user_email))
# end if
printlog_info(" - Probing batch size: {} sequences;".format(
    "all" if send_all else probing_batch_size))

mode_comment = "number of sequences" if packet_mode == 0 else "sum of sequences' lengths"
printlog_info(" - Packet forming mode: {} ({});".format(
    packet_mode, mode_comment))
del mode_comment

if packet_mode == 0:
    tmp_str = "sequences"
Example 21
def _get_record_title(record_id):
    # Function retrieves title (aka definition) and accession
    #   of a GenBank record by given accession or GI number.
    # :param record_id: accession or GI number of the record;
    # :type record_id: str;
    # Returns tuple of two elements:
    #   (<RECORD_TITLE>, <RECORD_ACCESSION>)

    # We'll use E-utilities to communicate with GenBank
    eutils_server = "eutils.ncbi.nlm.nih.gov"
    esummary = "esummary.fcgi"  # utility name

    # Configure URL
    url = "/entrez/eutils/{}?db=nuccore&id={}".format(esummary, record_id)

    # Sometimes (I never figured out why) this XML arrives empty, and StopIteration emerges.
    # So, if we just repeat this request, everything is going to be ok.
    error = True
    print_ok = False
    while error:
        # Send the request and get the response
        summary = lingering_https_get_request(
            eutils_server, url,
            "e-summary of nuccore record {}".format(record_id))

        # Parse XML that we've got
        root = ElementTree.fromstring(summary)

        # Elements of our interest are all named "Item",
        #   but they have different tags.
        # They are children of element "DocSum", which is
        #   the first child of root
        try:
            docsum = next(iter(root))  # root.getchildren() was removed in Python 3.9
        except StopIteration:
            print()
            printlog_info_time(
                "Failed to retrieve data for record {}. Trying again...".
                format(record_id))
            print_ok = True  # print this "ok" only after a successful attempt following a failure
        else:
            if print_ok:
                printlog_info("ok")
            # end if
            error = False
        # end try
    # end while

    record_title = None
    record_acc = None

    # Search for title and accession
    for item in docsum.iter("Item"):
        if item.attrib["Name"] == "Title":
            record_title = item.text
        elif item.attrib["Name"] == "AccessionVersion":
            # Remove version just in case
            record_acc = re.search(r"(.*)\.[0-9]+", item.text).group(1)
        # end if
    # end for

    if record_title is None or record_acc is None:
        printlog_error_time(
            "Error 8989: can't access e-summary for `{}`".format(record_id))
        platf_depend_exit(1)
    # end if

    return record_title, record_acc
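
A hypothetical call (NC_000913.3 is the Escherichia coli K-12 reference genome):

# Resolve an accession (or GI number) to (title, accession).
title, acc = _get_record_title("NC_000913.3")
# `acc` comes back with the version suffix stripped, e.g. "NC_000913".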
Example 22
def _split_and_resubmit(packet, packet_size, packet_mode, pack_to_send,
                        seqs_processed, fq_fa_path, tmp_fpath, taxonomy_path,
                        tsv_res_path, acc_fpath, blast_algorithm, user_email,
                        organisms, acc_dict, out_of_n):
    # :param packet: "packet" dictionary described in "barapost-prober.py" before the kernel loop:
    # :type packet: dict;
    # :param packet_size: size of the packet (see option `-c` for definition);
    # :type packet_size: int;
    # :param packet_mode: packet forming mode (see option `-c` for definition);
    # :type packet_mode: int;
    # :param pack_to_send: ordinal number of packet to send
    #   (it is a list rather than an int because it must be mutable);
    # :type pack_to_send: list<int>;
    # :param seqs_processed: number of sequences processed
    #   (it is a list rather than an int because it must be mutable);
    # :type seqs_processed: list<int>;
    # :param fq_fa_path: path to current input file;
    # :type fq_fa_path: str;
    # :param tmp_fpath: path to current temporary file;
    # :type tmp_fpath: str;
    # :param taxonomy_path: path to taxonomy file;
    # :type taxonomy_path: str;
    # :param tsv_res_path: path to current classification file;
    # :type tsv_res_path: str;
    # :param acc_fpath: path to file `hits_to_download.tsv`;
    # :type acc_fpath: str;
    # :param blast_algorithm: BLAST algorithm to use (see option `-a`);
    # :type blast_algorithm: str;
    # :param user_email: user email to send with request;
    # :type user_email: str;
    # :param organisms: list of strings specifying `nt` database slices;
    # :type organisms: list<str>;
    # :param acc_dict: accession dictionary for writing to `hits_to_download.tsv`;
    # :type acc_dict: dict<str: (str, int)>;
    # :param out_of_n: dictionary for printing how many packets left;
    # :type out_of_n: dict<str: str, str: int>;

    # Number of sequences in the packet to be split:
    pack_len = len(packet["qual"])

    if pack_len > 1:
        # Split current packet into two (of equal number of sequences) and resubmit them one-by-one

        printlog_info(
            "Splitting current packet into two and submitting each of them one-by-one."
        )

        # Update this dictionary to print how many packets left
        if not out_of_n["npacks"] is None:
            out_of_n["npacks"] += 1
            out_of_n["msg"] = " out of {}".format(out_of_n["npacks"])
        # end if

        # Calculate the size of a subpacket (halve, rounding up)
        new_pack_size_0 = (pack_len + 1) // 2

        # Split the packet
        for split_packet in fasta_packets_from_str(packet["fasta"],
                                                   new_pack_size_0):

            # Inherit quality information from the "ancestor" qual dict
            for query_name in split_packet["qual"].keys():
                split_packet["qual"][query_name] = packet["qual"][query_name]
            # end for

            # Submit subpacket
            submit(split_packet, new_pack_size_0, 0, pack_to_send,
                   seqs_processed, fq_fa_path, tmp_fpath, taxonomy_path,
                   tsv_res_path, acc_fpath, blast_algorithm, user_email,
                   organisms, acc_dict, out_of_n)
        # end for
    else:
        # Prune the only sequence in packet and resend it

        printlog_info("Current packet contains only one sequence.")
        printlog_info(
            "prober will halve this sequence and resubmit it.")

        # Calculate the new length for this sequence.
        # Generator of stripped sequence-containing lines:
        old_seq = map(str.strip, packet["fasta"].splitlines()[1:])
        old_len = len(''.join(old_seq))  # length of the old sequence
        new_len = (old_len + 1) // 2  # halve, rounding up

        packet["fasta"] = prune_seqs(packet["fasta"], new_len)

        submit(packet, packet_size, packet_mode, pack_to_send, seqs_processed,
               fq_fa_path, tmp_fpath, taxonomy_path, tsv_res_path, acc_fpath,
               blast_algorithm, user_email, organisms, acc_dict, out_of_n)
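
# A standalone illustration (not prober code) of the ceil-division halving used
# above for both packet sizes and sequence lengths: rounding up guarantees
# progress, and the schedule always terminates at 1.
def halving_schedule(n):
    sizes = [n]
    while n > 1:
        n = (n + 1) // 2  # same as n // 2, plus one if n is odd
        sizes.append(n)
    # end while
    return sizes

print(halving_schedule(10))  # [10, 5, 3, 2, 1]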
Esempio n. 23
0
for fpath in fq_fa_list:
    # Validate new_dpath existence for FASTA and FASTQ files:
    if not os.path.isdir(get_curr_res_dpath(fpath, tax_annot_res_dir)):
        printlog_error_time(
            "Error: directory that should contain taxonomic annotation results \
for the following file does not exist: `{}`.".format(os.path.basename(fpath)))
        printlog_error(
            "Please make sure that this file has already been processed \
by `barapost-prober.py` and `barapost-local.py`.")
        platf_depend_exit(1)
    # end if
# end for

sys.stdout.write('\r')
printlog_info("Primary validation...ok")
print()

is_fastQA5 = lambda f: re.search(r".*\.(m)?f(ast)?(a|q|5)(\.gz)?$", f) is not None

# Check if there are some results in output directory
if len(list(filter(is_fastQA5, os.listdir(outdir_path)))) != 0:
    printlog_info(
        "Attention! Output directory `{}` is not empty!".format(outdir_path))
    printlog_info("List of sequence-containing files in it:")
    for i, file in enumerate(filter(is_fastQA5, os.listdir(outdir_path))):
        printlog_info("  {}. `{}`".format(i + 1, file))
    # end for
    print()
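
# A quick demonstration of what `is_fastQA5` accepts (the file names below are
# invented): FASTA, FASTQ and FAST5 extensions, `.mfa`-style variants, and
# their gzipped versions.
for fname in ("reads.fastq", "reads.fq.gz", "genome.fasta",
              "scaffolds.mfa", "run1.fast5", "notes.txt"):
    print(fname, is_fastQA5(fname))
# end for
# All names except `notes.txt` yield True.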
Esempio n. 24
0
def configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path):
    # Function returns a dictionary where keys are IDs of sequences meant to be binned,
    #     and values are the corresponding hit names.
    #
    # :param tsv_res_fpath: path to the current TSV file. Binning will be performed according to this TSV file;
    # :type tsv_res_fpath: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param taxonomy_path: path to taxonomy file;
    # :type taxonomy_path: str;

    resfile_lines = dict()

    tax_dict = src.taxonomy.get_tax_dict(taxonomy_path)

    with open(tsv_res_fpath, 'r') as brpst_resfile:

        brpst_resfile.readline()  # skip the table header
        line = brpst_resfile.readline().strip()  # get the first informative line

        while line != "":
            splt = line.split('\t')
            read_name = sys.intern(splt[0])
            hit_name = splt[1]
            hit_acc = splt[2]

            try:
                quality = float(splt[8])  # we will filter by quality
            except ValueError as verr:
                if splt[8] == '-':
                    # Keep minus as quality if there is no quality information.
                    # No error will be raised.
                    quality = splt[8]
                else:
                    printlog_error_time("query quality parsing error")
                    printlog_error(str(verr))
                    printlog_error("Please, contact the developer.")
                    platf_depend_exit(1)
                # end if
            # end try

            try:
                query_len = int(splt[3])  # we will filter by length
            except ValueError as verr:
                printlog_error_time("query length parsing error")
                printlog_error(str(verr))
                printlog_error("Please, contact the developer.")
                platf_depend_exit(1)
            # end try

            try:
                pident = float(splt[5])  # we will filter by identity
            except ValueError as verr:
                if splt[5] == '-':
                    # Keep minus as identity if there is no identity information.
                    # No error will be raised.
                    pident = splt[5]
                else:
                    printlog_error_time(
                        "Alignment percent of identity parsing error")
                    printlog_error(str(verr))
                    printlog_error("Please, contact the developer.")
                    platf_depend_exit(1)
                # end if
            # end try

            try:
                coverage = float(splt[4])  # we will filter by coverage
            except ValueError as verr:
                if splt[4] == '-':
                    # Keep minus as coverage if there is no coverage information.
                    # No error will be raised.
                    coverage = splt[4]
                else:
                    printlog_error_time("alignment coverage parsing error")
                    printlog_error(str(verr))
                    printlog_error("Please, contact the developer.")
                    platf_depend_exit(1)
                # end if
            # end try

            try:
                resfile_lines[read_name] = [
                    format_taxonomy_name(hit_acc, hit_name, sens, tax_dict),
                    quality, query_len, pident, coverage
                ]
            except NoTaxonomyError:
                printlog_warning(
                    "Can't find taxonomy for reference sequence `{}`".format(
                        hit_acc))
                printlog_warning("Trying to recover taxonomy.")

                # Recover
                src.taxonomy.recover_taxonomy(hit_acc, hit_name, taxonomy_path)
                printlog_info("Taxonomy for {} is recovered.".format(hit_acc))

                # Update tax_dict
                tax_dict = src.taxonomy.get_tax_dict(taxonomy_path)

                # Format again -- with new tax_dict
                resfile_lines[read_name] = [
                    format_taxonomy_name(hit_acc, hit_name, sens, tax_dict),
                    quality, query_len, pident, coverage
                ]
            # end try

            line = brpst_resfile.readline().strip()  # get next line
        # end while
    # end with

    return resfile_lines
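
# A hedged usage sketch of `configure_resfile_lines`; the paths and the
# sensitivity value below are invented for illustration. Each value is a list:
# [formatted taxonomy name, quality, query length, pident, coverage],
# where the numeric fields may be '-' when the information is absent.
resfile_lines = configure_resfile_lines(
    "outdir/some_file/classification.tsv",  # hypothetical TSV path
    "genus",                                # hypothetical sensitivity
    "outdir/taxonomy/taxonomy.tsv")         # hypothetical taxonomy path
for read_name, (hit, quality, qlen, pident, coverage) in resfile_lines.items():
    print(read_name, hit, quality, qlen, pident, coverage)
# end for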
Esempio n. 25
0
                    format='%(levelname)s: %(asctime)s %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    filemode='w')
log_info(sys.platform)
log_info(sys.implementation)
log_info(sys.version)

print("|=== barapost-local.py (version {}) ===|\n".format(__version__))
log_info("barapost-local.py (version {})".format(__version__))
print(get_full_time() + " - Start working\n")
log_info("Start working.")

#                       |===== Proceed =====|

printlog_info(" - Logging to `{}`".format(
    logging.getLoggerClass().root.handlers[0].baseFilename))
printlog_info(" - Output directory: `{}`;".format(tax_annot_res_dir))
printlog_info(" - Packet size: {} sequences;".format(packet_size))
printlog_info(" - BLAST algorithm: {};".format(blast_algorithm))
printlog_info(" - Threads: {};".format(n_thr))
print()

s_letter = '' if len(fq_fa_list) == 1 else 's'
printlog_info(" {} file{} will be processed.".format(len(fq_fa_list),
                                                     s_letter))
if len(fq_fa_list) != 1:
    log_info("Here they are:")
else:
    log_info("Here it is:")
# end if
for i, path in enumerate(fq_fa_list):
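
# This example opens mid-call: the fragment at its top is the tail of a
# `logging.basicConfig(...)` invocation. Presumably it began like the hedged
# sketch below (the log file name is an assumption):
import logging

logging.basicConfig(filename="barapost-local.log",  # assumed log file name
                    format='%(levelname)s: %(asctime)s %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    filemode='w')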
Esempio n. 26
0
def build_local_db(tax_annot_res_dir, acc_fpath, your_own_fasta_lst,
                   accs_to_download, use_index):
    # Function creates a database with utilities from 'blast+' toolkit
    #     according to acc_dict and your_own_fasta_lst.
    #
    # :param tax_annot_res_dir: path to current result directory
    #   (each processed file has its own result directory);
    # :type tax_annot_res_dir: str;
    # :param acc_fpath: path to file "hits_to_download.tsv";
    # :type acc_fpath: str;
    # :param your_own_fasta_lst: list of user's fasta files to be included in database;
    # :type your_own_fasta_lst: list<str>;
    # :param accs_to_download: list of accessions from command line ('-s');
    # :type accs_to_download: list<str>;
    # :param use_index: whether to use index;
    # :type use_index: str;

    # Returns path to created database.

    # Path to directory in which database will be placed
    db_dir = os.path.join(tax_annot_res_dir, "local_database")
    # Path to DBM taxonomy file
    taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")

    try:
        os.makedirs(db_dir)
    except OSError:
        # The directory already exists

        while True:
            if len(os.listdir(db_dir)) == 0:
                # If db directory is empty -- break and build a database
                break
            else:
                print()
                printlog_info("Database directory is not empty:")
                printlog_info("  `{}`".format(os.path.abspath(db_dir)))
                printlog_info("Here is it's content:")
                for i, fname in enumerate(os.listdir(os.path.abspath(db_dir))):
                    printlog_info(" {}. `{}`".format(i + 1, fname))
                # end for
                reply = input(
                    """\nPress ENTER to start classification using existing database.
Enter 'r' to remove all files in this directory and create the database from the beginning:>>"""
                )

                if reply == "":
                    # Do not build a database, just return path to it.
                    printlog_info("You have chosen to use extant database.")

                    # Return path to DB located in this directory
                    dbpath = next(iter(os.listdir(db_dir)))
                    # Keep everything up to and including '.fasta'
                    prefix, sep, _ = dbpath.partition(".fasta")
                    dbpath = prefix + sep

                    return os.path.join(db_dir, dbpath)

                elif reply == 'r':

                    printlog_info("You have chosen to rebuild the database.")
                    # Rename old classification files so that up-to-date data is written to new ones:
                    old_classif_dirs = filter(
                        lambda d: os.path.exists(
                            os.path.join(d, "classification.tsv")),
                        glob(os.path.join(tax_annot_res_dir, "*")))
                    old_classif_files = tuple(
                        map(lambda f: os.path.join(f, "classification.tsv"),
                            old_classif_dirs))

                    if len(old_classif_files) > 0:
                        print()
                        printlog_info("Renaming old classification files:")
                        for classif_file in old_classif_files:
                            rename_file_verbosely(classif_file)
                        # end for
                    # end if

                    # Empty database directory
                    for file in glob("{}{}*".format(db_dir, os.sep)):
                        os.unlink(file)
                    # end for

                    # Break from the loop in order to build a database
                    break
                else:
                    print("Invalid reply: `{}`\n".format(reply))
                    continue
                # end if
            # end if
        # end while
    # end try

    # It is a dictionary of accessions and record names.
    # Accessions are keys, record names are values.
    acc_dict = configure_acc_dict(acc_fpath, your_own_fasta_lst,
                                  accs_to_download)

    if len(accs_to_download) != 0:
        verify_cl_accessions(accs_to_download, acc_dict)
    # end if

    # Retrieve already existing taxonomy data from taxonomy file
    tax_exist_accs = taxonomy.get_tax_keys(taxonomy_path)

    # If accession file does not exist and execution has reached here -- everything is OK --
    #    we are building a database from user's files only.
    if len(acc_dict) != 0:
        print()

        print("""Following sequences (and all replicons related to them)
  will be downloaded from Genbank for further taxonomic classification
  on your local machine:\n""")
        printlog_info(
            "Following sequences (and all replicons related to them) \
will be downloaded from Genbank for further taxonomic classification \
on your local machine:")
        for i, acc in enumerate(acc_dict.keys()):
            printlog_info(" {}. {} - `{}`".format(i + 1, acc, acc_dict[acc]))
        # end for

        search_for_related_replicons(acc_dict)

        printlog_info_time("Completing taxonomy file...")
        for i, acc in enumerate(acc_dict.keys()):
            if acc not in tax_exist_accs:
                taxonomy.find_taxonomy(acc, acc_dict[acc][1], taxonomy_path)
            # end if
            # Accessions can be of different length
            printn("\r{} - {}: {}/{}".format(getwt(), acc, i +
                                             1, len(acc_dict)) + " " * 10 +
                   "\b" * 10)
        # end for
        print()
        printlog_info_time("Taxonomy file is consistent.")
    # end if

    local_fasta = os.path.join(
        db_dir, "local_seq_set.fasta")  # path to downloaded FASTA file

    add_lambda_phage(local_fasta,
                     taxonomy_path)  # add lambda phage control sequence

    retrieve_fastas_by_acc(
        acc_dict, db_dir, local_fasta)  # download main fasta data from GenBank

    # Add 'your own' fasta files to database
    if len(your_own_fasta_lst) != 0:

        # This variable counts sequences from local files.
        # It is necessary to avoid duplicated accessions.
        own_seq_counter = 0

        # Check whether these files are assemblies made by SPAdes or a5
        spades_patt = r">NODE_[0-9]+"  # matches sequence IDs generated by SPAdes
        a5_patt = r">scaffold_[0-9]+"  # matches sequence IDs generated by a5
        assemblies = list()  # paths to assembly files (SPAdes or a5)

        for own_fasta_path in reversed(your_own_fasta_lst):

            how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)]
            fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)]

            with how_to_open(own_fasta_path) as fasta_file:
                # Get the first line of the file (the first seq ID)
                first_seq_id = fmt_func(fasta_file.readline())
            # end with

            # If we've got a SPAdes assembly
            if re.search(spades_patt, first_seq_id) is not None:
                assemblies.append(own_fasta_path)
                # Remove this file from the list -- it will be processed in a specific way
                your_own_fasta_lst.remove(own_fasta_path)
                continue
            # end if

            # If we've got an a5 assembly
            if re.search(a5_patt, first_seq_id) is not None:
                assemblies.append(own_fasta_path)
                your_own_fasta_lst.remove(own_fasta_path)
                continue
            # end if
        # end for

        # Include assembly files in the multi-fasta file

        # Find common prefix of all assembly paths and remove it from assembly names
        if len(assemblies) > 1:
            assemblies_formatted = tuple(
                map(lambda f: os.path.abspath(f).replace(os.sep, '-'),
                    assemblies))
            common_prefix = find_common_prefix(assemblies_formatted)
            assemblies_formatted = tuple(
                map(lambda f: f.replace(common_prefix, ''),
                    assemblies_formatted))
        elif len(assemblies) > 0:
            common_prefix = ''
            assemblies_formatted = tuple(map(os.path.basename, assemblies))
        # end if

        # Add assembled sequences to database
        with open(local_fasta, 'a') as fasta_db:
            for i, assm_path in enumerate(assemblies):
                printlog_info("Adding `{}` to database...".format(
                    os.path.basename(assm_path)))
                assm_name_fmt = assemblies_formatted[i]

                how_to_open = OPEN_FUNCS[is_gzipped(assm_path)]
                fmt_func = FORMATTING_FUNCS[is_gzipped(assm_path)]
                with how_to_open(assm_path) as fasta_file:
                    for line in fasta_file:
                        line = fmt_func(line)
                        # See the comments about "OWN_SEQ..." below.
                        # Paths are embedded in seq IDs in the following way:
                        #   some-happy-path.fastq--
                        # so that they can be reliably retrieved with a regex later.
                        if line.startswith('>'):
                            own_seq_counter += 1
                            own_acc = "OWN_SEQ_{}".format(own_seq_counter)
                            own_def = "{}--".format(
                                assm_name_fmt.replace(common_prefix,
                                                      '')) + line[1:]
                            own_def = remove_bad_chars(own_def)
                            taxonomy.save_taxonomy_directly(
                                taxonomy_path, own_acc, own_def)
                            line = ">" + "{} {}".format(own_acc, own_def)
                        # end if
                        fasta_db.write(line + '\n')
                    # end for
                # end with
            # end for
        # end with

        with open(local_fasta, 'a') as fasta_db:
            for own_fasta_path in your_own_fasta_lst:
                printlog_info("Adding `{}` to database...".format(
                    os.path.basename(own_fasta_path)))

                how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)]
                fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)]
                with how_to_open(own_fasta_path) as fasta_file:
                    for line in fasta_file:
                        line = fmt_func(line)
                        # 'makeblastdb' considers the first (space-separated) word to be the sequence ID
                        #   and throws an error if IDs are duplicated.
                        # To prevent such duplication, we create our own sequence IDs:
                        #   'OWN_SEQ_<NUMBER>', written at the beginning of the FASTA record name.
                        if line.startswith('>'):
                            own_seq_counter += 1
                            own_acc = "OWN_SEQ_{}".format(own_seq_counter)
                            taxonomy.save_taxonomy_directly(
                                taxonomy_path, own_acc, line[1:])
                            line = ">" + own_acc + ' ' + remove_bad_chars(
                                line[1:])
                        # end if
                        fasta_db.write(line + '\n')
                    # end for
                # end with
            # end for
        # end with
    # end if

    # 'lcl|ACCESSION...' entries can be given with '.1'
    #   (or '.2', whatever) terminus by blastn.
    # There is no '.1' terminus in taxonomy file.
    # Therefore we will prune accessions in advance.
    print()
    printn("{} - Formatting accessions...".format(getwt()))
    log_info("Formatting accessions...")
    corrected_path = os.path.join(db_dir, "corrected_seqs.fasta")
    with open(local_fasta, 'r') as source_file, open(corrected_path,
                                                     'w') as dest_file:
        for line in source_file:
            if line.startswith('>'):
                line = line.strip()
                acc, seq_name = (line.partition(' ')[0],
                                 line.partition(' ')[2])
                acc = acc.partition('.')[0]
                seq_name = remove_bad_chars(seq_name)
                seq_name = re.sub(r'[^\x00-\x7F]+', '_',
                                  seq_name)  # remove non-ascii chars
                line = ' '.join((acc, seq_name)) + '\n'
            # end if
            dest_file.write(line)
        # end for
    # end with
    os.unlink(local_fasta)
    os.rename(corrected_path, local_fasta)
    sys.stdout.write("\r{} - Formatting accessions... ok".format(getwt()))
    log_info("Formatting accessions done.")

    # Configure command line
    make_db_cmd = "makeblastdb -in {} -parse_seqids -dbtype nucl".format(
        local_fasta)
    exit_code = os.system(make_db_cmd)  # make a blast-format database
    if exit_code != 0:
        printlog_error_time("Error occured while making the database")
        platf_depend_exit(exit_code)
    # end if

    print("\033[1A{} - Database is successfully created: `{}`\n".format(
        getwt(), local_fasta))
    log_info("Database is successfully created: `{}`".format(local_fasta))

    if use_index == "true":
        printlog_info_time("Database index creating started")
        # Configure command line
        make_index_cmd = "makembindex -input {} -iformat blastdb -verbosity verbose".format(
            local_fasta)
        exit_code = os.system(
            make_index_cmd)  # create an index for the database
        if exit_code != 0:
            printlog_info_time("Error occured while creating database index")
            platf_depend_exit(exit_code)
        # end if

        printlog_info_time("Database index has been successfully created")
    # end if

    # Gzip downloaded FASTA file
    printlog_info_time("Gzipping FASTA file: `{}`".format(local_fasta))

    if gzip_util_found:
        os.system("{} -v {}".format(gzip_util, local_fasta))
    else:
        # form .fasta.gz file 'by hand'
        with open(local_fasta,
                  'rb') as fasta_file, open_as_gzip(local_fasta + ".gz",
                                                    "wb") as fagz_file:
            shutil_copyfileobj(fasta_file, fagz_file)
        # end with
        os.unlink(local_fasta)  # remove source FASTA file, not the database
    # end if

    return local_fasta
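
# A possible hardening, sketched here as an alternative (not the author's code):
# `subprocess.run` avoids shell-quoting problems with paths containing spaces
# and exposes the exit code directly, unlike the `os.system` calls above.
import subprocess

proc = subprocess.run(
    ["makeblastdb", "-in", local_fasta, "-parse_seqids", "-dbtype", "nucl"])
if proc.returncode != 0:
    printlog_error_time("Error occurred while making the database")
    platf_depend_exit(proc.returncode)
# end if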
Esempio n. 27
0
def report_run_params(args):
    # Function prints run parameters.
    #
    # :param args: argument dictionary returned by handle_args;
    # :type args: dict;

    print()
    printlog_info('  -- Run parameters --')
    printlog_info('  General:')
    printlog_info('- Tasks: {}.'.format(', '.join(args['tasks'])))
    printlog_info('- Forward reads: `{}`.'.format(args['1']))
    if args['2'] is not None:
        printlog_info('- Reverse reads: `{}`.'.format(args['2']))
    else:
        printlog_info('- Reverse reads: none.')
    # end if
    printlog_info('- Output directory: `{}`.'.format(args['o']))
    printlog_info('- Threads: {}.'.format(args['t']))
    printlog_info('- Gzip output files afterwards: {}.'.format(args['z']))
    printlog_info('  Crosstalks detection:')
    if args['r'] is None:
        printlog_info('- Primers: standard Illumina 16S rRNA V3-V4 primers.')
    else:
        printlog_info('- Primers file: `{}`.'.format(args['r']))
    # end if
    printlog_info('- Threshold: {}.'.format(args['x']))
    printlog_info('- Max offset: {}.'.format(args['s']))
    printlog_info('- Cut off primers: {}.'.format(args['c']))
    printlog_info('  Read merging:')
    printlog_info('- Minimum overlap: {}.'.format(args['m']))
    printlog_info('- Mismatch fraction: {}.'.format(args['p']))
    printlog_info('- Phred offset: {}.'.format(args['q']))
    printlog_info('- NGmerge path: {}.'.format(args['ngmerge-path']))
    print('-' * 10 + '\n')
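
# A hedged sketch of the `args` dictionary this function expects; the key set
# is inferred from the lookups above, and every value below is invented.
example_args = {
    'tasks': ['crosstalks', 'merging'],  # invented task names
    '1': 'sample_R1.fastq.gz', '2': 'sample_R2.fastq.gz',
    'o': 'outdir', 't': 4, 'z': True,
    'r': None, 'x': 0.52, 's': 2, 'c': True,
    'm': 20, 'p': 0.1, 'q': 33,
    'ngmerge-path': '/usr/local/bin/NGmerge',
}
report_run_params(example_args)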
Esempio n. 28
0
def submit(packet, packet_size, packet_mode, pack_to_send, seqs_processed,
           fq_fa_path, tmp_fpath, taxonomy_path, tsv_res_path, acc_fpath,
           blast_algorithm, user_email, organisms, acc_dict, out_of_n):
    # :param packet: "packet" dictionary described in "barapost-prober.py" before the kernel loop:
    # :type packet: dict;
    # :param packet_size: size of the packet (see option `-c` for definition);
    # :type packet_size: int;
    # :param packet_mode: packet forming mode (see option `-c` for definition);
    # :type packet_mode: int;
    # :param pack_to_send: ordinal number of the packet to send
    #   (it is a list rather than an int so that it is mutable);
    # :type pack_to_send: list<int>;
    # :param seqs_processed: number of sequences processed
    #   (it is a list rather than an int so that it is mutable);
    # :type seqs_processed: list<int>;
    # :param fq_fa_path: path to current input file;
    # :type fq_fa_path: str;
    # :param tmp_fpath: path to current temporary file;
    # :type tmp_fpath: str;
    # :param taxonomy_path: path to taxonomy file;
    # :type taxonomy_path: str;
    # :param tsv_res_path: path to current classification file;
    # :type tsv_res_path: str;
    # :param acc_fpath: path to file `hits_to_download.tsv`;
    # :type acc_fpath: str;
    # :param blast_algorithm: BLAST algorithm to use (see option `-a`);
    # :type blast_algorithm: str;
    # :param user_email: user email to send with the request;
    # :type user_email: str;
    # :param organisms: list of strings specifying `nt` database slices;
    # :type organisms: list<str>;
    # :param acc_dict: accession dictionary for writing to `hits_to_download.tsv`;
    # :type acc_dict: dict<str: (str, int)>;
    # :param out_of_n: dictionary for printing how many packets are left;
    # :type out_of_n: dict<str: str, str: int>;

    s_letter = 's' if len(packet["qual"]) != 1 else ''
    print()
    printlog_info("Going to BLAST (" + blast_algorithm + ")")

    # Count base pairs in packet
    lines = filter(lambda x: not x.startswith('>'),
                   packet["fasta"].splitlines())
    totalbp = len(''.join(map(lambda x: x.strip(), lines)))
    totalbp = "{:,}".format(totalbp)
    del lines

    printlog_info("Request number {}{}. Sending {} sequence{} ({} b.p. totally)."\
        .format(pack_to_send[0], out_of_n["msg"],
                len(packet["qual"]), s_letter, totalbp))

    error = BlastError(-1)

    while error.code != 0:  # until a successful attempt

        # Get the request
        request = configure_request(packet["fasta"], blast_algorithm,
                                    organisms, user_email)

        # Send the request and get BLAST XML response.
        # 'align_xml_text' will be None if an error occurs.
        align_xml_text, error = send_request(request, pack_to_send,
                                             packet_size, packet_mode,
                                             os.path.basename(fq_fa_path),
                                             tmp_fpath)

        if error.code == 0:
            # Write results and leave the loop
            _handle_result(align_xml_text, packet, taxonomy_path, tsv_res_path,
                           acc_dict, acc_fpath, seqs_processed, pack_to_send,
                           tmp_fpath)

        elif error.code == 2:
            # The NCBI BLAST server rejected the request because it contains too much data:
            #    split the packet into two or, if it contains only one sequence, prune that sequence.
            # Then resend the request.

            _split_and_resubmit(packet, packet_size, packet_mode, pack_to_send,
                                seqs_processed, fq_fa_path, tmp_fpath,
                                taxonomy_path, tsv_res_path, acc_fpath,
                                blast_algorithm, user_email, organisms,
                                acc_dict, out_of_n)

            error = BlastError(0)  # _split_and_resubmit will process the packet successfully
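
# `BlastError` is defined elsewhere in prober; judging by its use above, it is
# essentially a holder for a status code, roughly like this sketch (an assumption):
class BlastError:
    # code 0 -- success; code 2 -- request rejected as too large; -1 -- not sent yet
    def __init__(self, code):
        self.code = code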
Esempio n. 29
0
def process_paral(fq_fa_list, packet_size, tax_annot_res_dir, blast_algorithm,
                  use_index, db_path, nfiles):
    # Function performs 'many_files'-parallel mode of barapost-local.py.

    # :param fq_fa_list: list of paths to FASTA and FASTQ files meant to be processed;
    # :type fq_fa_list: list<str>;
    # :param packet_size: number of sequences processed by BLAST in a single launch;
    # :type packet_size: int;
    # :param tax_annot_res_dir: path to output directory that contains taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param blast_algorithm: blast algorithm to use;
    # :type blast_algorithm: str;
    # :param use_index: logical value indicating whether to use the index;
    # :type use_index: bool;
    # :param db_path: path to database;
    # :type db_path: str;
    # :param nfiles: total number of files;
    # :type nfiles: int;

    queries_tmp_dir = os.path.join(tax_annot_res_dir, "queries-tmp")

    # Iterate over source FASTQ and FASTA files
    for fq_fa_path in fq_fa_list:

        # Create the result directory with the name of the FASTQ or FASTA file being processed:
        new_dpath = create_result_directory(fq_fa_path, tax_annot_res_dir)

        # "hname" means human readable name (i.e. without file path and extention)
        infile_hname = os.path.basename(fq_fa_path)
        infile_hname = re.search(r"(.+)\.(m)?f(ast)?(a|q)(\.gz)?$",
                                 infile_hname).group(1)

        # Look around and check if there are results of previous runs of this script
        # If 'look_around' is None -- there is no data from previous run
        previous_data = look_around(new_dpath, fq_fa_path)

        if previous_data is None:  # If there is no data from previous run
            num_done_seqs = 0  # number of successfully processed sequences
            tsv_res_path = os.path.join(
                new_dpath, "classification.tsv")  # form result tsv file path
        else:  # if there is data from previous run
            num_done_seqs = previous_data[
                "n_done_reads"]  # get number of successfully processed sequences
            tsv_res_path = previous_data[
                "tsv_respath"]  # result tsv file should be the same as in the previous run
        # end if

        how_to_open = OPEN_FUNCS[is_gzipped(fq_fa_path)]
        fmt_func = FORMATTING_FUNCS[is_gzipped(fq_fa_path)]

        if is_fastq(fq_fa_path):
            packet_generator = fastq_packets
            num_seqs = sum(
                1 for line in how_to_open(fq_fa_path)) // 4  # 4 lines per record
        else:
            packet_generator = fasta_packets
            try:
                num_seqs = len(
                    tuple(
                        filter(lambda l: l.startswith('>'),
                               map(fmt_func,
                                   how_to_open(fq_fa_path).readlines()))))
            except UnicodeDecodeError as err:
                with print_lock:
                    print()
                    printlog_warning("Warning: current file is broken: {}."\
                        .format(str(err)))
                    printlog_warning("File: `{}`".format(
                        os.path.abspath(fq_fa_path)))
                    printlog_warning("This file will not be processed.")
                    continue
                # end with
            # end try
        # end if

        if num_seqs == num_done_seqs:
            with counter_lock:
                file_counter.value += 1
                i = file_counter.value  # save to local var and release lock
            # end with
            with print_lock:
                sys.stdout.write('\r')
                printlog_info_time("File #{}/{} (`{}`) has been already completely processed.".\
                    format(i, nfiles, fq_fa_path))
                printlog_info("Omitting it.")
                printn("Working...")
            # end with
            continue
        # end if

        for packet in packet_generator(fq_fa_path, packet_size, num_done_seqs):

            # Blast the packet
            align_xml_text = launch_blastn(packet["fasta"], blast_algorithm,
                                           use_index, queries_tmp_dir, db_path)

            # Configure result TSV lines
            result_tsv_lines = parse_align_results_xml(align_xml_text,
                                                       packet["qual"])

            # Write the result to tsv
            write_classification(result_tsv_lines, tsv_res_path)
        # end for

        with counter_lock:
            file_counter.value += 1
            i = file_counter.value  # save to local var and release lock
        # end with
        with print_lock:
            sys.stdout.write('\r')
            printlog_info_time("File #{}/{} (`{}`) is processed.".\
                format(i, nfiles, os.path.basename(fq_fa_path)))
            printn("Working...")
        # end with
    # end for

    query_fpath = os.path.join(queries_tmp_dir,
                               "query{}_tmp.fasta".format(os.getpid()))
    remove_tmp_files(query_fpath)
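
# `print_lock`, `counter_lock` and `file_counter` are shared module-level objects.
# A minimal sketch of how such a parallel launch could be wired up; the chunking
# of `fq_fa_list` into `file_chunks` and all names below are assumptions.
import multiprocessing as mp

print_lock = mp.Lock()
counter_lock = mp.Lock()
file_counter = mp.Value('i', 0)  # shared integer counter

processes = [
    mp.Process(target=process_paral,
               args=(chunk, packet_size, tax_annot_res_dir, blast_algorithm,
                     use_index, db_path, len(fq_fa_list)))
    for chunk in file_chunks  # hypothetical: fq_fa_list split across workers
]
for p in processes:
    p.start()
# end for
for p in processes:
    p.join()
# end for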