def get_res_tsv_fpath(new_dpath):
    # Function returns path to the current classification TSV file.
    # Binning will be performed according to this file.
    #
    # :param new_dpath: current result directory;
    # :type new_dpath: str;
    #
    # Returns path (str) to `classification.tsv` inside `new_dpath`.
    # Raises IndexError if the file is absent (unchanged from the original behaviour).

    if not os.path.exists(new_dpath):
        printlog_error_time(
            "Error: directory `{}` does not exist!".format(new_dpath))
        printlog_error("Please make sure you have performed taxonomic \
annotation of the following file: `{}` \
with `barapost-prober.py` and/or `barapost-local.py`".format(
            os.path.basename(new_dpath)))
        printlog_error(
            "Also this error might occur if you forget to specify result directory \
generated by `barapost-prober.py` with `-r` option.")
        platf_depend_exit(0)
    # end if

    # The redundant `lambda f: True if ... else False` filter is replaced with a
    # plain comprehension. Taking element [0] keeps the original IndexError
    # behaviour when no classification file is present.
    matches = [f for f in sorted(os.listdir(new_dpath)) if f == "classification.tsv"]
    tsv_res_fpath = matches[0]

    return os.path.join(new_dpath, tsv_res_fpath)
def create_result_directory(fq_fa_path, outdir_path):
    # Create (if needed) the per-input result directory and return its path.
    # The directory is named after the source FASTQ/FASTA file with its
    # (optionally gzipped) extension stripped.
    #
    # :param fq_fa_path: path to source FASTQ or FASTA file;
    # :type fq_fa_path: str;
    # :param outdir_path: path to directory in which result_directory will be created;
    # :type outdir_path: str;
    #
    # Returns 'str' path to the result directory.

    # Build the candidate path from the bare file name, then cut off the
    # `.fasta`/`.fastq` (possibly `.gz`-compressed) extension.
    candidate = os.path.join(outdir_path, os.path.basename(fq_fa_path))
    result_dpath = re.search(r"(.*)\.(m)?f(ast)?(a|q)(\.gz)?$",
                             candidate).group(1)

    if os.path.exists(result_dpath):
        return result_dpath
    # end if

    try:
        os.makedirs(result_dpath)
    except OSError as oserr:
        printlog_error_time(
            "Error: can't create result directory: `{}`".format(result_dpath))
        printlog_error(str(oserr))
        platf_depend_exit(1)
    # end try

    return result_dpath
def recover_taxonomy(acc, hit_def, taxonomy_path):
    # Recover a missing taxonomy entry for the given accession.
    #
    # :param acc: accession of taxonomy entry to recover;
    # :type acc: str;
    # :param hit_def: name of this sequence;
    # :type hit_def: str;
    # :param taxonomy_path: path to TSV file with taxonomy;
    # :type taxonomy_path: str;

    if acc == "LAMBDA":
        # Missing lambda-phage control taxonomy -- just add it.
        save_taxonomy_directly(taxonomy_path, acc, "Lambda-phage-nanopore-control")
    elif acc.startswith("OWN_SEQ_"):
        # An "own seq": its name is stored in the local database fasta file
        # (which may be gzip-compressed). Locate that file first.
        classif_dir = os.path.dirname(os.path.dirname(taxonomy_path))
        db_dir = os.path.join(classif_dir, "local_database")
        db_files = glob.glob("{}{}*".format(db_dir, os.sep))

        try:
            local_fasta = next(iter(filter(is_fasta, db_files)))
        except StopIteration:
            printlog_error_time(
                "Error: cannot recover taxonomy for following sequence:")
            printlog_error(" `{} - {}`.".format(acc, hit_def))
            printlog_error(
                "You can solve this problem by yourself (it's pretty simple).")
            printlog_error("Just add taxonomy line for {} to file `{}`".format(
                acc, taxonomy_path))
            printlog_error(" and run the program again.")
            platf_depend_exit(1)
        # end try

        # Scan the fasta file for the title line starting with `>ACC `.
        opener = OPEN_FUNCS[is_gzipped(local_fasta)]
        formatter = FORMATTING_FUNCS[is_gzipped(local_fasta)]
        if is_gzipped(local_fasta):
            prefix = b">" + bytes(acc, 'ascii') + b" "
        else:
            prefix = ">{} ".format(acc)
        # end if

        with opener(local_fasta) as fasta_file:
            for line in fasta_file:
                if not line.startswith(prefix):
                    continue
                # end if
                # Everything after the first space is the sequence name.
                seq_name = formatter(line).partition(' ')[2]
                save_taxonomy_directly(taxonomy_path, acc, seq_name)
                break
            # end for
        # end with
    else:
        # A regular accession: try to find taxonomy in NCBI.
        download_taxonomy(acc, hit_def, taxonomy_path)
    # end if
def _is_redundant(nc_acc, accs):
    # Function checks if "NC-or-NW"-record is redundant (if it's non-RefSeq copy already exists in acc_dict).
    #
    # :param nc_acc: accession number of NC-record;
    # :type nc_acc: str;
    # :param accs: tuple of accession numbers;
    # :type accs: tuple<str>;
    #
    # Returns tuple (identical_genbank_accession, bool: accession already in `accs`).
    # On failure of any parsing step the program exits via platf_depend_exit(1).

    # NOTE(review): `summary` is fetched but never used below -- looks like a
    # leftover from refactoring (or a deliberate warm-up request); confirm before removing.
    summary = lingering_https_get_request(
        "www.ncbi.nlm.nih.gov",
        "/nuccore/{}?report=genbank&log$=seqview".format(nc_acc), "summary",
        nc_acc)

    try:
        # Find link to Identical GenBank Record
        # Firstly, get GI number of NC seqeunce:
        get_gi_url = "/nuccore/{}?report=gilist&log$=seqview&format=text".format(
            nc_acc)
        nc_gi_text = lingering_https_get_request("www.ncbi.nlm.nih.gov",
                                                 get_gi_url,
                                                 "GI of {}".format(nc_acc),
                                                 nc_acc)
        nc_gi_text = nc_gi_text.replace('\n', '')
        # The GI number is rendered inside a <pre> element of the response page.
        nc_gi_re = re.search(r"\<pre\>([0-9]+).*\</pre\>", nc_gi_text)
        if nc_gi_re is None:
            raise _NoIdentLabelError(
                "Error 771. Accession: {}. Please, contact the developer.".
                format(nc_acc))
        # end if
        nc_gi = nc_gi_re.group(1)

        # Retrieve identical GenBank sequence accession number.
        # NCBI redirects these requests and provides necessary location in headers.
        # So, we'll follow thin link.
        identical_gb_link = "/nuccore?LinkName=nuccore_nuccore_rsgb&from_uid={}".format(
            nc_gi)
        redirect_text = _ling_https_getreq_handl_301(
            "www.ncbi.nlm.nih.gov", identical_gb_link,
            "link to identical genbank sequence", nc_acc)

        # Get accession number from the response text
        pattern = r"\<pre\>(.*).*\</pre\>"
        ident_acc_re = re.search(pattern, redirect_text.replace('\n', ''))
        if ident_acc_re is None:
            raise _NoIdentLabelError(
                "Error 773. Accession: {}. Please, contact the developer.".
                format(nc_acc))
        # end if
        # Drop the version suffix (".1", ".2", ...) from the accession.
        ident_acc = ident_acc_re.group(1).partition('.')[0]
    except (_NoIdentLabelError, _NoLinkError, _NoAccError) as err:
        printlog_error_time("Error: {}".format(err))
        platf_depend_exit(1)
    else:
        # Redundant if the identical GenBank accession is already collected.
        return ident_acc, ident_acc in accs
def rename_file_verbosely(file):
    # Verbosely rename a file (or a directory), appending/advancing an
    # "_old_<number>" suffix so previous results are preserved.
    #
    # :param file: path to file (directory) meant to be renamed;
    # :type file: str;
    #
    # Returns the new path (str), or None if `file` does not exist.

    if not os.path.exists(file):
        return None
    # end if

    # Parent directory of the target -- analog counting happens there.
    parent_dir = os.path.abspath(os.path.dirname(file))

    if os.path.isdir(file):
        # Directories keep their full name; no extension handling needed.
        is_analog = lambda f: not re.search(r"{}.*(_old_[0-9]+)?$"\
            .format(os.path.basename(file)), f) is None
        word = "directory"
        name_itself = file
        ext = ""
    else:
        # Files: split name and extension so the suffix lands before the ext.
        is_analog = lambda f: re.search(r"(.*)\..*$",
                                        os.path.basename(file)).group(1) in f
        word = "file"
        name_itself = re.search(r"(.*)\..*$", file).group(1)
        ext = re.search(r".*(\..*)$", file).group(1)
    # end if

    # How many entries in the parent directory look like analogs of `file`.
    analog_count = len(list(filter(is_analog, os.listdir(parent_dir))))

    if re.search(r"_old_[0-9]+", file) is None:
        # No suffix yet -- append "_old_<number>".
        new_name = name_itself + "_old_" + str(analog_count) + ext
    else:
        # A suffix exists -- merely substitute a new number.
        old_number = re.search(r"_old_([0-9]+)", file).group(1)
        new_name = file.replace(old_number, str(analog_count + 1))
    # end if

    try:
        print()
        printlog_info(" - Renaming old {}:".format(word))
        printlog_info("  `{}` --> `{}`".format(file, new_name))
        os.rename(file, new_name)
    except OSError as err:
        printlog_error_time("Error: {} `{}` cannot be renamed:".format(
            word, str(file)))
        printlog_error(str(err))
        platf_depend_exit(1)
    # end try

    return new_name
def update_file_dict(srt_file_dict, new_fpath):
    # Register a sorted-output file in the dict, opening it in append mode.
    # A None path is stored as-is with a None handle (no_trash mode).
    #
    # :param srt_file_dict: mapping of file path -> open file handle (or None);
    # :param new_fpath: path of the file to open and register, or None;
    #
    # Returns the (mutated) srt_file_dict.
    try:
        if new_fpath is None:
            srt_file_dict[new_fpath] = None  # handle no_trash
        else:
            # Intern the path: these keys are compared/looked up frequently.
            srt_file_dict[sys.intern(new_fpath)] = open(new_fpath, 'a')
        # end if
    except OSError as oserr:
        printlog_error_time("Error occured while opening one of result files")
        printlog_error("Errorneous file: `{}`".format(new_fpath))
        printlog_error(str(oserr))
        platf_depend_exit(1)
    # end try
    return srt_file_dict
def remove_tmp_files(*paths):
    # Function removes files passed to it.
    #
    # :param paths: an array-like collection of paths of files;
    # :type paths: list<str>;
    #
    # Nonexistent paths are silently skipped; a failed unlink is fatal.
    for path in paths:
        if not os.path.exists(path):
            continue
        # end if
        try:
            os.unlink(path)
        except OSError as oserr:
            # Bug fix: previously `.format(path)` was applied to the RETURN
            # VALUE of printlog_error_time, so the message was logged with a
            # bare `{}` placeholder and the path never appeared in it.
            printlog_error_time("Error: cannot remove file `{}`".format(path))
            printlog_error(str(oserr))
            platf_depend_exit(1)
        # end try
    # end for
def look_around(new_dpath, fq_fa_path):
    # Check whether results from a previous run of this script exist.
    #
    # Returns None if there is no result from previous run.
    # Otherwise returns a dict of the following structure:
    # {
    #     "tsv_respath": path to TSV file from previous run (str),
    #     "n_done_reads": number of successfully processed sequences (int),
    # }
    #
    # :param new_dpath: path to current (corresponding to fq_fa_path file) result directory;
    # :type new_dpath: str;
    # :param fq_fa_path: path to current (corresponding to fq_fa_path file) FASTA file;
    # :type fq_fa_path: str;

    # "hname" means human readable name (i.e. without file path and extention).
    # Kept for parity with the original code even though only the side effect
    # (an AttributeError on a non-matching name) is observable here.
    fasta_hname = re.search(r"(.*)\.(m)?f(ast)?a",
                            os.path.basename(fq_fa_path)).group(1)

    # Path to the classification file of a previous run, if any.
    tsv_res_fpath = os.path.join(new_dpath, "classification.tsv")

    if not os.path.exists(tsv_res_fpath):
        return None
    # end if

    num_done_reads = 0  # number of successfully processed sequences
    with open(tsv_res_fpath, 'r') as res_file:
        # There can be invalid information in result file
        try:
            num_done_reads = len(res_file.readlines()) - 1  # the first line is a head
        except OSError as err:
            printlog_error_time("Data in classification file `{}` is broken. Reason:"\
                .format(tsv_res_fpath))
            printlog_error(str(err))
            printlog_error("Starting from the beginning.")
            rename_file_verbosely(tsv_res_fpath)
            return None
        # end try
    # end with

    return {
        "tsv_respath": tsv_res_fpath,
        "n_done_reads": num_done_reads,
    }
def copy_single_f5(from_f5, read_name, to_f5):
    # Function copies a read with ID 'read_name'
    # from 'from_f5' singleFAST5 file to 'to_f5' multiFAST5 one.
    #
    # :param from_f5: FAST5 file object to copy a read from;
    # :type from_f5: h5py.File;
    # :param read_name: ID of a read to copy;
    # :type read_name: str;
    # :param to_f5: destination FAST5 file;
    # :type to_f5: h5py.File;
    #
    # Returns None. On a ValueError (read already present in destination)
    # the error is reported and the read is skipped.

    # Handle no_trash: a None destination means this read is being discarded.
    if to_f5 is None:
        return
    # end if

    try:
        read_group = read_name
        to_f5.create_group(read_group)  # create group in destination multi_FAST5 file

        # Copy "UniqueGlobalKey" to root of recently created group
        for ugk_subgr in from_f5["UniqueGlobalKey"]:
            from_f5.copy("UniqueGlobalKey/"+ugk_subgr, to_f5[read_group])
        # end for

        # Get data array in single-FAST5 file.
        # A single-FAST5 file holds exactly one entry under "Raw/Reads"
        # -- presumably; `next(iter(...))` takes whichever comes first.
        read_number_group = "Raw/Reads/"+next(iter(from_f5["Raw"]["Reads"]))
        # It's name in multi-FAST5 file
        read_number = re.search(r"(Read_[0-9]+)", read_number_group).group(1)

        # Copy group to multi-FAST5 file
        from_f5.copy(from_f5[read_number_group], to_f5[read_group])
        # Move data array to "Raw" group, as it is in multi-FAST5 files
        to_f5.move("{}/{}".format(read_group, read_number),
                   "{}/Raw".format(read_group))

        # Copy everything else to recently created group
        for group in from_f5:
            if group != "Raw" and group != "UniqueGlobalKey":
                from_f5.copy(group, to_f5["/{}".format(read_group)])
            # end if
        # end for
    except ValueError as err:
        # h5py raises ValueError when the destination name already exists.
        printlog_error_time("Error: `{}`".format(str(err)))
        printlog_error("Reason is probably the following:")
        printlog_error(" read that is copying to the result file is already in this file.")
        printlog_error("ID of the read: `{}`".format(read_name))
        printlog_error("File: `{}`".format(to_f5.filename))
        return
    # end try
def whether_to_build_index(index_dirpath):
    # Function checks if there are any files in index directory.
    # If there are any, it asks a user whether to create a new index or to use old one.
    #
    # :param index_dirpath: path to index directory;
    # :type index_dirpath: str;
    #
    # Returns True if the user chose to reuse the old index, False otherwise.

    use_old_index = False

    if len(os.listdir(index_dirpath)) == 0:
        # Nothing left from previous runs -- a new index will be built.
        return use_old_index
    # end if

    printlog_info(
        "Index file created by `-u` option already exists (left from previous run)."
    )

    while True:
        reply = input("""
Press ENTER to make new index file
  or enter 'u' to use old index file:>>""")
        if reply == "":
            # Rebuild: wipe old index files first.
            try:
                for path in glob(os.path.join(index_dirpath, '*')):
                    os.unlink(path)
                # end for
            except OSError as oserr:
                printlog_error_time("Error: cannot remove old index files!")
                printlog_error(str(oserr))
                platf_depend_exit(1)
            # end try
            break
        elif reply == 'u':
            use_old_index = True
            break
        else:
            print("Invalid reply!\n")
        # end if
    # end while

    printlog_info("You have chosen to {} index file.".format(
        "use old" if use_old_index else "make new"))
    print()

    return use_old_index
def launch_blastn(packet, blast_algorithm, use_index, queries_tmp_dir, db_path):
    """
    Launch the 'blastn' utility from the "BLAST+" toolkit and return its output.

    :param packet: FASTA data meant to be processed by 'blastn';
    :type packet: str;
    :param blast_algorithm: blastn algorithm to use;
    :type blast_algorithm: str;
    :param use_index: logic value indicating whether to use index;
    :type use_index: bool;
    :param queries_tmp_dir: path to directory with query files;
    :type queries_tmp_dir: str;
    :param db_path: path to database;
    :type db_path: str;

    Returns blastn's stdout decoded as UTF-8 (XML, outfmt 5).
    """

    # Packets are too large to pass to 'subprocess.Popen' via stdin,
    # so the query goes through a temporary file. The current PID is a
    # stable, per-process marker for that file's name.
    query_path = os.path.join(queries_tmp_dir,
                              "query{}_tmp.fasta".format(os.getpid()))
    with open(query_path, 'w') as query_file:
        query_file.write(packet)
    # end with

    # Configure command line
    blast_cmd = "blastn -query {} -db {} -outfmt 5 -task {} -max_target_seqs 10 -max_hsps 1 -use_index {}"\
        .format(query_path, db_path, blast_algorithm, use_index)

    pipe = sp.Popen(blast_cmd, shell=True, stdout=sp.PIPE, stderr=sp.PIPE)
    out, err = pipe.communicate()

    if pipe.returncode != 0:
        printlog_error_time(
            "Error occured while aligning a sequence against local database")
        printlog_error(err.decode("utf-8"))
        platf_depend_exit(pipe.returncode)
    # end if

    return out.decode("utf-8")
def copy_read_f5_2_f5(from_f5, read_name, to_f5):
    # Copy a read with ID 'read_name' from one multiFAST5 file to another.
    #
    # :param from_f5: FAST5 file object to copy a read from;
    # :type from_f5: h5py.File;
    # :param read_name: ID of a read to copy;
    # :type read_name: str;
    # :param to_f5: destination FAST5 file;
    # :type to_f5: h5py.File;

    # A None destination means this read is being discarded (no_trash).
    if to_f5 is None:
        return
    # end if

    try:
        from_f5.copy(read_name, to_f5)
    except ValueError as err:
        # h5py raises ValueError when the read already exists in `to_f5`.
        printlog_error_time("Error: `{}`".format(str(err)))
        printlog_error("Reason is probably the following:")
        printlog_error(" read that is copying to the result file is already in this file.")
        printlog_error("ID of the read: `{}`".format(read_name))
        printlog_error("File: `{}`".format(to_f5.filename))
    # end try
    return
def add_lambda_phage(local_fasta, taxonomy_path):
    # Write the control sequence of nanopore lambda phage DNA-CS into the
    # database fasta file and record its taxonomy.
    #
    # :param local_fasta: path to file with reference sequences to be included in database;
    # :type local_fasta: str;
    # :param taxonomy_path: path to taxonomy file;
    # :type taxonomy_path: str;

    print()
    printlog_info_time("Adding lambda phage control sequence...")

    # sys.path[0] is directory containing the script that was used to invoke
    # the Python interpreter; the control sequence ships next to it.
    control_fpath = os.path.join(os.path.dirname(sys.path[0]),
                                 "lambda_control",
                                 "nanopore_lambda_DNA-CS_control.fasta.gz")

    if not os.path.exists(control_fpath):
        printlog_error_time(
            "Error: cannot find lambda phage control sequence: '{}'".format(
                control_fpath))
        platf_depend_exit(1)
    # end if

    # Read the gzipped control sequence and append-free write it into the db file.
    with open_as_gzip(control_fpath, 'rb') as control_file:
        control_fasta = control_file.read()
    # end with
    with open(local_fasta, 'wb') as db_fasta_file:
        db_fasta_file.write(control_fasta)
    # end with

    # Record lambda's taxonomy.
    taxonomy.save_taxonomy_directly(taxonomy_path, "LAMBDA",
                                    "Lambda-phage-nanopore-control")
    printlog_info_time(" ok")
# generated by prober and barapost. printn("Primary validation...") if not untwist_fast5: for fpath in fast5_list: # Get number of directories in 'tax_annot_res_dir' where results of current FAST5 # baraposting are located. possible_fast5_resdirs_num = len( glob("{}{}*{}*".format(tax_annot_res_dir, os.sep, get_checkstr(fpath)))) if possible_fast5_resdirs_num == 1: continue # OK elif possible_fast5_resdirs_num == 0: # there is no such a directory print() printlog_error_time( "Error: classification for following FAST5 file is missing:") printlog_error(" `{}`".format(fpath)) printlog_error( "Try running barapost-binning with `-u` (`--untwist-fast5`) flag." ) print() platf_depend_exit(5) else: # there are multiple directories where prober-barapost results can be located printlog_error_time( "Error: multiple result directories match FAST5 file meant to be binned" ) printlog_error("File: `{}`".format(os.path.basename(fpath))) printlog_error("Directories:") for d in glob("{}{}*{}*".format(tax_annot_res_dir, os.sep, get_checkstr(fpath))): printlog_error(d)
def _parse_float_keep_dash(raw_value, field_label):
    # Parse one numeric column of the classification table.
    # A '-' placeholder (no data for this column) is kept as-is so that it
    # can be propagated downstream; any other non-float value is fatal.
    #
    # :param raw_value: raw column value from the TSV row;
    # :type raw_value: str;
    # :param field_label: human-readable column name used in error messages;
    # :type field_label: str;
    if raw_value == '-':
        # Keep minus as quality if there is no quality information.
        return raw_value
    # end if
    try:
        return float(raw_value)
    except ValueError as verr:
        printlog_error_time("{} parsing error".format(field_label))
        printlog_error(str(verr))
        printlog_error("Please, contact the developer.")
        platf_depend_exit(1)
    # end try
# end def


def configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path):
    # Function returns dictionary, where keys are sequence (i.e. sequences meant to be binned) IDs,
    # and values are corresponding hit names.
    #
    # :param tsv_res_fpath: path to current TSV file. Binning will be performed according to this TSV file;
    # :type tsv_res_fpath: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param taxonomy_path: path to taxonomy file;
    # :type taxonomy_path: str;
    #
    # The three float columns (quality, identity, coverage) share one parsing
    # helper; length is parsed separately because a '-' placeholder is NOT
    # allowed for it (this preserves the original behaviour exactly).

    resfile_lines = dict()
    tax_dict = src.taxonomy.get_tax_dict(taxonomy_path)

    with open(tsv_res_fpath, 'r') as brpst_resfile:
        brpst_resfile.readline()  # pass the head of the table
        line = brpst_resfile.readline().strip()  # get the first informative line

        while line != "":
            splt = line.split('\t')
            read_name = sys.intern(splt[0])
            hit_name = splt[1]
            hit_acc = splt[2]

            # we will filter by quality
            quality = _parse_float_keep_dash(splt[8], "query quality")

            try:
                query_len = int(splt[3])  # we will filter by length
            except ValueError as verr:
                printlog_error_time("query length parsing error")
                printlog_error(str(verr))
                printlog_error("Please, contact the developer.")
                platf_depend_exit(1)
            # end try

            # we will filter by identity and coverage
            pident = _parse_float_keep_dash(splt[5],
                                            "Alignment percent of identity")
            coverage = _parse_float_keep_dash(splt[4], "alignment coverage")

            try:
                resfile_lines[read_name] = [
                    format_taxonomy_name(hit_acc, hit_name, sens, tax_dict),
                    quality, query_len, pident, coverage
                ]
            except NoTaxonomyError:
                printlog_warning(
                    "Can't find taxonomy for reference sequence `{}`".format(
                        hit_acc))
                printlog_warning("Trying to recover taxonomy.")
                # Recover
                src.taxonomy.recover_taxonomy(hit_acc, hit_name, taxonomy_path)
                printlog_info("Taxonomy for {} is recovered.".format(hit_acc))
                # Update tax_dict and format again -- with new tax_dict
                tax_dict = src.taxonomy.get_tax_dict(taxonomy_path)
                resfile_lines[read_name] = [
                    format_taxonomy_name(hit_acc, hit_name, sens, tax_dict),
                    quality, query_len, pident, coverage
                ]
            # end try

            line = brpst_resfile.readline().strip()  # get next line
        # end while
    # end with

    return resfile_lines
def build_local_db(tax_annot_res_dir, acc_fpath, your_own_fasta_lst,
                   accs_to_download, use_index):
    # Function creates a database with utilities from 'blast+' toolkit
    # according to acc_dict and your_own_fasta_lst.
    #
    # :param tax_annot_res_dir: path to current result directory
    #     (each processed file has it's own result directory);
    # :type tax_annot_res_dir: str;
    # :param acc_fpath: path to file "hits_to_download.tsv";
    # :type acc_fpath: str;
    # :param your_own_fasta_lst: list of user's fasta files to be included in database;
    # :type your_own_fasta_lst: list<str>;
    # :param accs_to_download: list of accessions from command line ('-s');
    # :type accs_to_download: list<str>;
    # :param use_index: whether to use index ("true"/"false");
    # :type use_index: str;
    #
    # Returns path to created database.

    # Path to directory in which database will be placed
    db_dir = os.path.join(tax_annot_res_dir, "local_database")
    # Path to DBM taxonomy file
    taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")

    try:
        os.makedirs(db_dir)
    except OSError:
        # If this directory exists -- ask the user what to do with it.
        while True:
            if len(os.listdir(db_dir)) == 0:
                # If db directory is empty -- break and build a database
                break
            else:
                print()
                printlog_info("Database directory is not empty:")
                printlog_info("  `{}`".format(os.path.abspath(db_dir)))
                printlog_info("Here is it's content:")
                for i, fname in enumerate(os.listdir(os.path.abspath(db_dir))):
                    printlog_info(" {}. `{}`".format(i + 1, fname))
                # end for
                reply = input(
                    """\nPress ENTER to start classification using existing database.
Enter 'r' to remove all files in this directory and create the database from the beginning:>>"""
                )
                if reply == "":
                    # Do not build a database, just return path to it.
                    printlog_info("You have chosen to use extant database.")
                    # Return path to DB located in this directory
                    dbpath = next(iter(os.listdir(db_dir)))
                    # remove all after '.fasta'
                    dbpath = dbpath.partition(".fasta")[0] + dbpath.partition(
                        ".fasta")[1]
                    return os.path.join(db_dir, dbpath)
                elif reply == 'r':
                    printlog_info("You have chosen to rebuild the database.")
                    # Rename old classification files and write actual data to new one:
                    old_classif_dirs = filter(
                        lambda d: os.path.exists(
                            os.path.join(d, "classification.tsv")),
                        glob(os.path.join(tax_annot_res_dir, "*")))
                    old_classif_files = tuple(
                        map(lambda f: os.path.join(f, "classification.tsv"),
                            old_classif_dirs))
                    if len(old_classif_files) > 0:
                        print()
                        printlog_info("Renaming old classification files:")
                        for classif_file in old_classif_files:
                            rename_file_verbosely(classif_file)
                        # end for
                    # end if
                    # Empty database directory
                    for file in glob("{}{}*".format(db_dir, os.sep)):
                        os.unlink(file)
                    # end for
                    # Break from the loop in order to build a database
                    break
                else:
                    print("Invalid reply: `{}`\n".format(reply))
                    continue
                # end if
            # end if
        # end while
    # end try

    # It is a dictionary of accessions and record names.
    # Accessions are keys, record names are values.
    acc_dict = configure_acc_dict(acc_fpath, your_own_fasta_lst,
                                  accs_to_download)

    if len(accs_to_download) != 0:
        verify_cl_accessions(accs_to_download, acc_dict)
    # end if

    # Retrieve already existing taxonomy data from taxonomy file
    tax_exist_accs = taxonomy.get_tax_keys(taxonomy_path)

    # If accession file does not exist and execution has reached here -- everything is OK --
    # we are building a database from user's files only.
    if len(acc_dict) != 0:
        print()
        print("""Following sequences (and all replicons related to them) will be downloaded from Genbank
for further taxonomic classification on your local machine:\n""")
        printlog_info("Following sequences (and all replicons related to them) \
will be downloaded from Genbank for further taxonomic classification \
on your local machine:")
        for i, acc in enumerate(acc_dict.keys()):
            printlog_info(" {}. {} - `{}`".format(i + 1, acc, acc_dict[acc]))
        # end for

        search_for_related_replicons(acc_dict)

        printlog_info_time("Completing taxonomy file...")
        for i, acc in enumerate(acc_dict.keys()):
            if not acc in tax_exist_accs:
                taxonomy.find_taxonomy(acc, acc_dict[acc][1], taxonomy_path)
            # end if
            # Accessions can be of different length
            printn("\r{} - {}: {}/{}".format(getwt(), acc, i + 1,
                                             len(acc_dict)) + " " * 10 +
                   "\b" * 10)
        # end for
        print()
        printlog_info_time("Taxonomy file is consistent.")
    # end if

    local_fasta = os.path.join(
        db_dir, "local_seq_set.fasta")  # path to downloaded FASTA file

    add_lambda_phage(local_fasta,
                     taxonomy_path)  # add lambda phage control sequence

    retrieve_fastas_by_acc(
        acc_dict, db_dir, local_fasta)  # download main fasta data from GenBank

    # Add 'your own' fasta files to database
    if not len(your_own_fasta_lst) == 0:

        # This variable counts sequences from local files.
        # It is necessary for not allowing duplicated accessions.
        own_seq_counter = 0

        # Check if these files are assembly made by SPAdes or a5
        spades_patt = r">NODE_[0-9]+"  # this pattern will match sequence IDs generated y SPAdes
        a5_patt = r">scaffold_[0-9]+"  # this pattern will match sequence IDs generated y a5
        assemblies = list(
        )  # this list will contain paths to assembly files (SPAdes or a5)

        # Iterating in reverse lets us remove the current element safely.
        for own_fasta_path in reversed(your_own_fasta_lst):
            how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)]
            fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)]
            with how_to_open(own_fasta_path) as fasta_file:
                first_seq_id = fmt_func(fasta_file.readline(
                ))  # get the first line in file (the first seq ID)
            # end with

            # if we've got SPAdes assembly
            if not re.search(spades_patt, first_seq_id) is None:
                assemblies.append(own_fasta_path)
                # Remove these file from list -- they will be processed in a specific way
                your_own_fasta_lst.remove(own_fasta_path)
                continue
            # end if

            # if we've got a5 assembly
            if not re.search(a5_patt, first_seq_id) is None:
                assemblies.append(own_fasta_path)
                your_own_fasta_lst.remove(own_fasta_path)
                continue
            # end if
        # end for

        # Include assemblies files in multi-fasta file
        # Find common prefix of all assembly paths and remove it from assembly names
        if len(assemblies) > 1:
            assemblies_formatted = tuple(
                map(lambda f: os.path.abspath(f).replace(os.sep, '-'),
                    assemblies))
            common_prefix = find_common_prefix(assemblies_formatted)
            assemblies_formatted = tuple(
                map(lambda f: f.replace(common_prefix, ''),
                    assemblies_formatted))
        elif len(assemblies) > 0:
            common_prefix = ''
            assemblies_formatted = tuple(map(os.path.basename, assemblies))
        # end if

        # Add assembled sequences to database
        with open(local_fasta, 'a') as fasta_db:
            for i, assm_path in enumerate(assemblies):
                printlog_info("Adding `{}` to database...".format(
                    os.path.basename(assm_path)))
                assm_name_fmt = assemblies_formatted[i]

                how_to_open = OPEN_FUNCS[is_gzipped(assm_path)]
                fmt_func = FORMATTING_FUNCS[is_gzipped(assm_path)]
                with how_to_open(assm_path) as fasta_file:
                    for line in fasta_file:
                        line = fmt_func(line)
                        # You can find comments to "OWN_SEQ..." below.
                        # Paths will be written to seq IDs in following way:
                        #   some-happy-path.fastq--
                        # in order to retrieve them securely with regex later.
                        if line.startswith('>'):
                            own_seq_counter += 1
                            own_acc = "OWN_SEQ_{}".format(own_seq_counter)
                            own_def = "{}--".format(
                                assm_name_fmt.replace(common_prefix,
                                                      '')) + line[1:]
                            own_def = remove_bad_chars(own_def)
                            taxonomy.save_taxonomy_directly(
                                taxonomy_path, own_acc, own_def)
                            line = ">" + "{} {}".format(own_acc, own_def)
                        # end if
                        fasta_db.write(line + '\n')
                    # end for
                # end with
            # end for
        # end with

        with open(local_fasta, 'a') as fasta_db:
            for own_fasta_path in your_own_fasta_lst:
                printlog_info("Adding `{}` to database...".format(
                    os.path.basename(own_fasta_path)))

                how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)]
                fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)]
                with how_to_open(own_fasta_path) as fasta_file:
                    for line in fasta_file:
                        line = fmt_func(line)
                        # 'makeblastdb' considers first word (sep. is space) as sequence ID
                        # and throws an error if there are duplicated IDs.
                        # In order not to allow this duplication we'll create our own sequence IDs:
                        #   'OWN_SEQ_<NUMBER>' and write it in the beginning of FASTA record name.
                        if line.startswith('>'):
                            own_seq_counter += 1
                            own_acc = "OWN_SEQ_{}".format(own_seq_counter)
                            taxonomy.save_taxonomy_directly(
                                taxonomy_path, own_acc, line[1:])
                            line = ">" + own_acc + ' ' + remove_bad_chars(
                                line[1:])
                        # end if
                        fasta_db.write(line + '\n')
                    # end for
                # end with
            # end for
        # end with
    # end if

    # 'lcl|ACCESSION...' entries can be given with '.1'
    # (or '.2', whatever) terminus by blastn.
    # There is no '.1' terminus in taxonomy file.
    # Therefore we will prune accessions in advance.
    print()
    printn("{} - Formatting accessions...".format(getwt()))
    log_info("Formatting accessions...")
    corrected_path = os.path.join(db_dir, "corrected_seqs.fasta")
    with open(local_fasta, 'r') as source_file, open(corrected_path,
                                                     'w') as dest_file:
        for line in source_file:
            if line.startswith('>'):
                line = line.strip()
                acc, seq_name = (line.partition(' ')[0],
                                 line.partition(' ')[2])
                acc = acc.partition('.')[0]
                seq_name = remove_bad_chars(seq_name)
                seq_name = re.sub(r'[^\x00-\x7F]+', '_',
                                  seq_name)  # remove non-ascii chars
                line = ' '.join((acc, seq_name)) + '\n'
            # end if
            dest_file.write(line)
        # end for
    # end with
    # Replace the original fasta with the accession-corrected one.
    os.unlink(local_fasta)
    os.rename(corrected_path, local_fasta)
    sys.stdout.write("\r{} - Formatting accessions... ok".format(getwt()))
    log_info("Formatting accessions done.")

    # Configure command line
    make_db_cmd = "makeblastdb -in {} -parse_seqids -dbtype nucl".format(
        local_fasta)
    exit_code = os.system(make_db_cmd)  # make a blast-format database
    if exit_code != 0:
        printlog_error_time("Error occured while making the database")
        platf_depend_exit(exit_code)
    # end if

    print("\033[1A{} - Database is successfully created: `{}`\n".format(
        getwt(), local_fasta))
    log_info("Database is successfully created: `{}`".format(local_fasta))

    if use_index == "true":
        printlog_info_time("Database index creating started")
        # Configure command line
        make_index_cmd = "makembindex -input {} -iformat blastdb -verbosity verbose".format(
            local_fasta)
        exit_code = os.system(
            make_index_cmd)  # create an index for the database
        if exit_code != 0:
            printlog_info_time("Error occured while creating database index")
            platf_depend_exit(exit_code)
        # end if
        printlog_info_time("Database index has been successfully created")
    # end if

    # Gzip downloaded FASTA file
    printlog_info_time("Gzipping FASTA file: `{}`".format(local_fasta))
    if gzip_util_found:
        os.system("{} -v {}".format(gzip_util, local_fasta))
    else:
        # form .fasta.gz file 'by hand'
        with open(local_fasta,
                  'rb') as fasta_file, open_as_gzip(local_fasta + ".gz",
                                                    "wb") as fagz_file:
            shutil_copyfileobj(fasta_file, fagz_file)
        # end with
        os.unlink(local_fasta)  # remove source FASTA file, not the database
    # end if

    return local_fasta
if blast_algorithm == "megaBlast": blast_algorithm = "megablast" elif blast_algorithm == "discoMegablast": blast_algorithm = "dc-megablast" # end if import src.legacy_taxonomy_handling as legacy_taxonomy_handling # Form path to taxonomy file: taxonomy_dir = os.path.join(tax_annot_res_dir, "taxonomy") if not os.path.isdir(taxonomy_dir): try: os.makedirs(taxonomy_dir) except OSError as err: printlog_error_time( "Error: cannot create taxonomy directory `{}`".format( taxonomy_dir)) printlog_error_time(str(err)) platf_depend_exit(1) # end try # end if taxonomy_path = os.path.join(taxonomy_dir, "taxonomy.tsv") # Check if there is legacy taxonomy file and, if so, reformat it to new (TSV) format legacy_taxonomy_handling.check_deprecated_taxonomy(tax_annot_res_dir) from src.barapost_local_modules.build_local_db import build_local_db # Indexed discontiguous searches are not supported: # https://www.ncbi.nlm.nih.gov/books/NBK279668/#usermanual.Megablast_indexed_searches if use_index == "true" and blast_algorithm == "dc-megablast":
def look_around(outdir_path, new_dpath, infile_path, blast_algorithm, acc_dict,
                probing_batch_size):
    # Function looks around in order to check if there are results from previous run(s) of this script
    #   in order to resume the previous run.
    #
    # Returns None if there is no result from previous run
    #   (or if the user declines to resume / chooses to start over).
    # If there are results from previous run, returns a dict of the following structure:
    # {
    #     "RID": saved_RID <str>,
    #     "packet_size_save": saved packet size <int>,
    #     "packet_mode_save": saved packet mode <int>,
    #     "tsv_respath": path_to_tsv_file_from_previous_run <str>,
    #     "n_done_reads": number_of_successfull_requests_from_current_FASTA_file <int>,
    #     "tmp_fpath": path_to_temporary_file <str>,
    #     "decr_pb": value decreasing size of probing batch (see below, where this variable is defined) <int>
    # }
    #
    # :param outdir_path: path to output directory;
    # :type outdir_path: str;
    # :param new_dpath: path to current (corresponding to fq_fa_path file) result directory;
    # :type new_dpath: str;
    # :param infile_path: path to current (corresponding to fq_fa_path file) FASTA file;
    # :type infile_path: str;
    # :param blast_algorithm: BLASTn algorithm to use.
    #     This parameter is necessary because it is included in name of result files;
    # :type blast_algorithm: str;
    # :param acc_dict: dictionary of accession info of hits;
    #     NOTE(review): mutated in place -- hits from the previous run's accession
    #     file are merged into it;
    # :type acc_dict: dict<str: tuple<str, str, int>>;
    # :param probing_batch_size: amount of sequences meant to be processed in a single run;
    # :type probing_batch_size: str;

    # "hname" means human readable name (i.e. without file path and extention)
    fasta_hname = os.path.basename(infile_path)  # get rid of absolute path
    fasta_hname = re.search(r"(.*)\.(m)?f(ast)?a", fasta_hname).group(
        1)  # get rid of `.fasta` extention

    # Form path to temporary file
    tmp_fpath = "{}_{}_temp.txt".format(os.path.join(new_dpath, fasta_hname),
                                        blast_algorithm)
    # Form path to result file
    tsv_res_fpath = os.path.join(new_dpath, "classification.tsv")
    # Form path to file with hits to download
    acc_fpath = os.path.join(outdir_path, "hits_to_download.tsv")

    num_done_seqs = 0  # variable to keep number of successfully processed sequences

    resume = None
    # Check if there are results from previous run.
    if os.path.exists(tsv_res_fpath) or os.path.exists(tmp_fpath):
        print()
        printlog_info(
            "A result file from previous run is found in the directory:")
        printlog_info(" `{}`".format(new_dpath))
        # Allow politely to continue from last successfully sent packet.
        resume = ask_for_resumption()
    # end if

    if not resume:
        # Archive stale results instead of overwriting them silently.
        rename_file_verbosely(tsv_res_fpath)
        rename_file_verbosely(tmp_fpath)
        rename_file_verbosely(acc_fpath)
    else:
        printlog_info("Let's try to resume...")

        # Collect information from result file
        if os.path.exists(tsv_res_fpath):
            # There can be invalid information in this file
            try:
                with open(tsv_res_fpath, 'r') as res_file:
                    lines = res_file.readlines()
                    num_done_seqs = len(lines) - 1  # the first line is a head
                    last_line = lines[-1]
                    last_seq_id = last_line.split('\t')[0]
                # end with
                # There must be 10 columns in each row:
                if any(map(lambda l: l.count('\t') != 9, lines)):
                    raise ValueError(
                        "There must be 10 colums separated by tabs in file `classification.tsv`"
                    )
                # end if
            except Exception as err:
                printlog_error_time(
                    "\nData in classification file `{}` not found or broken. Reason:"
                    .format(tsv_res_fpath))
                printlog_error(' ' + str(err))
                # If the reason is known -- print erroneous lines
                if isinstance(err, ValueError):
                    printlog_error("Here are numbers of improper lines:")
                    for i, line in enumerate(lines):
                        if line.count('\t') != 9:
                            printlog_error(str(i + 1) + ": `{}`".format(line))
                        # end if
                    # end for
                # end if
                # Ask a user if he/she wants to start from the beginning or to quit
                error = True
                while error:
                    reply = input("""Press ENTER to start from the beginning or enter `q` to quit:>> """)
                    if reply == "":
                        error = False
                        printlog_info(
                            "You have chosen to start from the beginning.\n")
                        rename_file_verbosely(tsv_res_fpath)
                        rename_file_verbosely(tmp_fpath)
                        rename_file_verbosely(acc_fpath)
                        return None
                    elif reply == 'q':
                        platf_depend_exit(0)
                    else:
                        print("! - Invalid reply: `{}`\n".format(reply))
                    # end if
                # end while
            else:
                # Result file is sane -- report resume point.
                printlog_info("Last classified sequence: " + last_seq_id)
                printlog_info(
                    "{} sequences have been already processed".format(
                        num_done_seqs))
            # end try
        # end if

        # Collect information from accession file
        if os.path.exists(acc_fpath):
            # There can be invalid information in this file
            try:
                with open(acc_fpath, 'r') as acc_file:
                    lines = acc_file.readlines()[
                        9:]  # omit description and head of the table
                    local_files_filtered = list(
                        filter(lambda x: False if os.path.exists(x) else True,
                               lines))  # omit file paths
                    for line in local_files_filtered:
                        vals = line.split('\t')
                        acc = sys.intern(vals[0].strip())
                        # Tolerate rows with 1, 2, or 3 columns:
                        #   (accession [, definition [, hit count]])
                        if len(vals) == 1:
                            acc_dict[acc] = [
                                "No definition of the sequence provided", 1
                            ]
                        elif len(vals) == 2:
                            acc_dict[acc] = [vals[1].strip(), 1]
                        else:
                            acc_dict[acc] = [
                                vals[1].strip(),
                                int(vals[2].strip())
                            ]
                        # end if
                    # end for
                # end with
            except Exception as err:
                printlog_error_time(
                    "Data in accession file `{}` not found or broken. Reason:".
                    format(acc_fpath))
                printlog_error(' ' + str(err))
                printlog_error("Invalid line: `{}`".format(line))
                # Ask a user if he/she wants to start from the beginning or to quit
                error = True
                while error:
                    reply = input("""Press ENTER to start from the beginning or enter `q` to quit:>> """)
                    if reply == "":
                        error = False
                        printlog_info(
                            "You have chosen to start from the beginning.\n")
                        rename_file_verbosely(tsv_res_fpath)
                        rename_file_verbosely(tmp_fpath)
                        rename_file_verbosely(acc_fpath)
                        return None
                    elif reply == 'q':
                        platf_depend_exit(0)
                    else:
                        print("! - Invalid reply: `{}`\n".format(reply))
                    # end if
                # end while
            else:
                # Accession file is sane -- show the hits seen so far,
                #   most frequently hit records first.
                print()
                printlog_info(
                    "Here are Genbank records encountered during previous run(s):"
                )
                for acc, other_info in sorted(acc_dict.items(),
                                              key=lambda x: -x[1][1]):
                    s_letter = "s" if other_info[1] > 1 else ""
                    printlog_info(" {} hit{} - {}, `{}`".format(
                        other_info[1], s_letter, acc, other_info[0]))
                # end for
                print('-' * 20)
            # end try
        # end if

        # Get packet size, number of the last sent packet and RID from temp file.
        # There can be invalid information in tmp file or tmp file may not exist
        try:
            with open(tmp_fpath, 'r') as tmp_file:
                temp_lines = tmp_file.readlines()
            # end with
            RID_save = re.search(r"Request_ID: (.+)",
                                 temp_lines[0]).group(1).strip()
            packet_size_save = int(
                re.search(r"Packet_size: ([0-9]*)",
                          temp_lines[1]).group(1).strip())
            packet_mode_save = int(
                re.search(r"Packet_mode: ([0-9]{1})",
                          temp_lines[2]).group(1).strip())
        except (AttributeError, OSError):
            # There is no need to disturb a user, merely proceed.
            return {
                "RID": None,
                "packet_size_save": None,
                "packet_mode_save": None,
                "tsv_respath": tsv_res_fpath,
                "n_done_reads": num_done_seqs,
                "tmp_fpath": tmp_fpath,
                "decr_pb": 0
            }
        else:
            # Let's assume that a user won't modify his/her brobing_batch size between erroneous runs:
            #   subtract num_done_reads if probing_batch_size > num_done_reads.
            decr_pb = num_done_seqs if num_done_seqs < probing_batch_size else 0
            # Return data from previous run
            return {
                "RID": RID_save,
                "packet_size_save": packet_size_save,
                "packet_mode_save": packet_mode_save,
                "tsv_respath": tsv_res_fpath,
                "n_done_reads": num_done_seqs,
                "tmp_fpath": tmp_fpath,
                "decr_pb": decr_pb
            }
        # end try
    # end if

    # No previous results (fresh start).
    return None
def map_f5reads_2_taxann(f5_path, tsv_taxann_lst, tax_annot_res_dir):
    # Function perform mapping of all reads stored in input FAST5 files
    #   to existing TSV files containing taxonomic annotation info.
    #
    # It creates an DBM index file (via `open_shelve`) mapping the FAST5 path
    #   to a dict {tsv_path: [read IDs found in that TSV]}.
    # On failure it either returns early (broken FAST5) or terminates the
    #   whole program (missing annotation / index write error).
    #
    # :param f5_path: path to current FAST5 file;
    # :type f5_path: str;
    # :param tsv_taxann_lst: list of path to TSV files that contain taxonomic annotation;
    # :type tsv_taxann_lst: list<str>;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;

    index_dirpath = os.path.join(
        tax_annot_res_dir,
        index_name)  # name of directory that will contain indicies

    # File validation:
    #   RuntimeError will be raised if FAST5 file is broken.
    try:
        # File existance checking is performed while parsing CL arguments.
        # Therefore, this if-statement will trigger only if f5_path's file is not a valid HDF5 file.
        if not h5py.is_hdf5(f5_path):
            raise RuntimeError("file is not of HDF5 (i.e. not FAST5) format")
        # end if
        f5_file = h5py.File(f5_path, 'r')
        # Touch the file once: iterating forces h5py to actually read,
        #   surfacing corruption as RuntimeError.
        for _ in f5_file:
            break
        # end for
    except RuntimeError as runterr:
        printlog_error_time("FAST5 file is broken")
        printlog_error("Reading the file `{}` crashed.".format(
            os.path.basename(f5_path)))
        printlog_error("Reason: {}".format(str(runterr)))
        printlog_error("Omitting this file...")
        print()
        return
    # end try

    readids_to_seek = list(fast5_readids(f5_file))
    idx_dict = dict()  # dictionary for index

    # This saving is needed to compare with 'len(readids_to_seek)'
    #   after all TSV will be looked through in order to
    #   determine if some reads miss taxonomic annotation.
    len_before = len(readids_to_seek)

    # Iterate over TSV-taxann files
    for tsv_taxann_fpath in tsv_taxann_lst:
        with open(tsv_taxann_fpath, 'r') as taxann_file:
            # Get all read IDs in current TSV (first tab-separated column)
            readids_in_tsv = list(
                map(lambda l: l.split('\t')[0], taxann_file.readlines()))

            # Iterate over all other reads in current FAST5
            #   ('reversed' is necessary because we remove items from list in this loop)
            for readid in reversed(readids_to_seek):
                fmt_id = fmt_read_id(readid)[1:]
                if fmt_id in readids_in_tsv:
                    # If not first -- write data to dict (and to index later)
                    try:
                        idx_dict[tsv_taxann_fpath].append(
                            "read_" + fmt_id)  # append to existing list
                    except KeyError:
                        idx_dict[tsv_taxann_fpath] = ["read_" + fmt_id
                                                      ]  # create a new list
                    finally:
                        # Found or not-yet-listed: either way this read is resolved.
                        readids_to_seek.remove(readid)
                    # end try
                # end if
            # end for
        # end with
        if len(readids_to_seek) == 0:
            break
        # end if
    # end for

    # If after all TSV is checked but nothing have changed -- we miss taxonomic annotation
    #   for some reads! And we will write their IDs to 'missing_reads_lst.txt' file.
    if len(readids_to_seek) == len_before:
        printlog_error_time("reads from FAST5 file not found")
        printlog_error("FAST5 file: `{}`".format(f5_path))
        printlog_error("Some reads have not undergone taxonomic annotation.")
        missing_log = "missing_reads_lst.txt"
        printlog_error("List of missing reads are in following file:")
        printlog_error("{}".format(missing_log))
        with open(missing_log, 'w') as missing_logfile:
            missing_logfile.write(
                "Missing reads from file '{}':\n\n".format(f5_path))
            for readid in readids_to_seek:
                missing_logfile.write(fmt_read_id(readid) + '\n')
            # end for
        # Remove the (now inconsistent) index directory, then abort in any case.
        try:
            for path in glob(os.path.join(index_dirpath, '*')):
                os.unlink(path)
            # end for
            os.rmdir(index_dirpath)
        except OSError as oserr:
            printlog_error(
                "Error occured while removing index directory: {}".format(
                    oserr))
        finally:
            platf_depend_exit(3)
        # end try
    # end if

    try:
        # Open index files appending to existing data ('c' parameter)
        with open_shelve(os.path.join(index_dirpath, index_name),
                         'c') as index_f5_2_tsv:
            # Update index
            index_f5_2_tsv[f5_path] = idx_dict
        # end with
    except OSError as oserr:
        printlog_error_time("Error: cannot create index file `{}`"\
            .format(os.path.join(index_dirpath, index_name)))
        printlog_error(str(oserr))
        platf_depend_exit(1)
def send_request(request, pack_to_send, packet_size, packet_mode, filename,
                 tmp_fpath):
    # Function sends a request to "blast.ncbi.nlm.nih.gov/blast/Blast.cgi"
    #   and then waits for satisfaction of the request and retrieves response text.
    #
    # :param request: request_data (it is a dict that `configure_request()` function returns);
    # :type request: dict<dict>;
    # :param pack_to_send: current number (like id) of packet meant to be sent now
    #     (ordinal number of packet);
    # :type pack_to_send: int;
    # :param packet_size: number of sequences in the packet;
    # :type packet_size: int;
    # :param packet_mode: packet mode, saved to the temporary file for resumption;
    # :type packet_mode: int;
    # :param filename: name of the source file, passed through to `wait_for_align`;
    # :type filename: str;
    # :param tmp_fpath: path to temporary file where RID/packet info is saved;
    # :type tmp_fpath: str;
    #
    # Returns XML text of type 'str' with BLAST response
    #     (whatever `wait_for_align` returns).

    payload = request["payload"]
    headers = request["headers"]

    server = "blast.ncbi.nlm.nih.gov"
    url = "/blast/Blast.cgi"
    error = True

    # Retry indefinitely: NCBI may be temporarily unreachable.
    while error:
        try:
            conn = http.client.HTTPSConnection(server)  # create a connection
            conn.request("POST", url, payload, headers)  # send the request
            response = conn.getresponse()  # get the response
            response_text = str(response.read(), "utf-8")  # get response text
        except OSError as oserr:
            printlog_info_time(
                "`https://blast.ncbi.nlm.nih.gov` is not available.")
            printlog_info(str(oserr))
            printlog_info(
                "barapost will try to connect again in 30 seconds...\n")
            sleep(30)
        # if no exception occured
        else:
            error = False
        # end try
    # end while

    try:
        rid = re.search(r"RID = (.+)",
                        response_text).group(1)  # get Request ID
        rtoe = int(re.search(r"RTOE = ([0-9]+)", response_text).group(
            1))  # get time to wait provided by the NCBI server
    except AttributeError:
        # No RID/RTOE in the response -- NCBI rejected the submission.
        printlog_error_time("Seems, NCBI has denied your request.")
        printlog_error("Response is in file `request_denial_response.html`")
        with open("request_denial_response.html", 'w') as den_file:
            den_file.write(response_text)
        # end with
        platf_depend_exit(1)
    finally:
        conn.close()
    # end try

    # Save temporary data so an interrupted run can be resumed
    #   (parsed back by `look_around`).
    with open(tmp_fpath, 'w') as tmpfile:
        tmpfile.write("Request_ID: {}\n".format(rid))
        tmpfile.write("Packet_size: {}\n".format(packet_size))
        tmpfile.write("Packet_mode: {}".format(packet_mode))
    # end with

    # Wait for results of alignment
    return wait_for_align(rid, rtoe, pack_to_send, filename)
def configure_acc_dict(acc_fpath, your_own_fasta_lst, accs_to_download):
    # Function configures accession dictionary according to accession file
    #   generated by 'barapost-prober.py': keys are accessions, values are
    #   sequence definitions (names).
    #
    # :param acc_fpath: path to accession file generated by 'barapost-prober.py'
    #     (may be None if the database is built only from user's FASTA files);
    # :type acc_fpath: str;
    # :param your_own_fasta_lst: list of paths to user's fasta files;
    #     mutated in place: reference-file paths found in the accession file
    #     are appended to it;
    # :type your_own_fasta_lst: list<str>;
    # :param accs_to_download: accessions requested for download;
    # :type accs_to_download: list<str>;
    #
    # Returns accession dictionary described above.

    acc_dict = dict()

    # If database will be created only from 'your own' FASTA files --
    #   there is nothing to parse and an empty dict is returned.
    if acc_fpath is not None:
        with open(acc_fpath, 'r') as acc_file:
            all_lines = acc_file.readlines()

            for line_idx, raw_line in enumerate(all_lines):
                line = raw_line.strip()

                # Ignore empty lines, commented lines and head of the table.
                if line == "" or line.startswith('#') \
                        or line.startswith("ACCESSION"):
                    continue
                # end if

                line_splt = line.split('\t')
                # Accession version (after the dot) is dropped.
                acc = sys.intern(line_splt[0].partition('.')[0])

                if re.match(GB_ACC_PATTERN, acc) is not None:
                    # GenBank accession number encountered.
                    try:
                        name = ("No definition of the sequence provided"
                                if len(line_splt) == 1 else line_splt[1])
                        acc_dict[acc] = name
                    except IndexError as err:
                        printlog_error_time(
                            "Error: invalid data in file `{}`!".format(
                                acc_fpath))
                        printlog_error(
                            "Here is that invalid line:\n `{}`".format(line))
                        printlog_error(str(err))
                        platf_depend_exit(1)
                    # end try
                elif os.path.exists(line):
                    # Not a GenBank accession -- a path to an existing
                    #   reference file.
                    your_own_fasta_lst.append(line)
                else:
                    # Looks like a path, but no such file on disk.
                    printlog_error_time(
                        "Error in file `{}`.".format(acc_fpath))
                    printlog_error("Line #{} looks like path to reference file, but this file does not exist."\
                        .format(line_idx+1))
                    printlog_error(
                        "Here is this invalid line:\n `{}`".format(line))
                    platf_depend_exit(1)
                # end if
            # end for
        # end with
    # end if

    # Nothing at all to build a database from -- fatal.
    if not your_own_fasta_lst and not acc_dict and not accs_to_download:
        printlog_error_time(
            "Error: no accession information found in file `{}`".format(
                acc_fpath))
        platf_depend_exit(1)
    # end if

    return acc_dict
def retrieve_fastas_by_acc(acc_dict, db_dir, local_fasta):
    # Function downloads set of records from Genbank according to accessions passed to it.
    # Downloaded FASTA file will be placed in 'db_dir' directory and named 'local_seq_set.fasta'
    #
    # Records are fetched in chunks of 100 via NCBI E-utilities (efetch),
    #   preferring GNU wget when it is on PATH and falling back to
    #   urllib otherwise. Chunks are appended to `local_fasta`.
    #
    # :param acc_dict: dictionary comntaining accession data of hits;
    # :type acc_dict: dict<str: tuple<str, str, int>>;
    # :param db_dir: path to directory in which downloaded FASTA file will be placed;
    # :type db_dir: str;
    # :param local_fasta: path to file with reference sequences to be included in database;
    # :type local_fasta: str;

    # Path to file with current chunk (see below "100 accession numbers...")
    tmp_fasta = os.path.join(db_dir, "tmp.fasta")

    accessions = tuple(set(acc_dict.keys()))
    if len(accessions) == 0:  # just in case
        return
    # end if

    # 100 accession numbers in order not to make too long URL
    # Download genomes by chunks of 100 sequences.
    max_accnum = 100
    i = 0
    accnum = len(accessions)

    while i < accnum:
        curr_accessions = accessions[i:i + max_accnum]  # slice chunk

        accs_del_comma = ','.join(
            curr_accessions)  # accessions must be separated by comma in url

        # E-utilities provide a possibility to download records from Genbank by accessions.
        retrieve_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?\
db=nuccore&id={}&rettype=fasta&retmode=text".format(accs_del_comma)
        log_info("Retrieve URL: `{}`".format(retrieve_url))

        # GNU wget utility is safer, but there can be presence of absence of it :)
        wget_util = "wget"
        util_found = False
        for d in os.environ["PATH"].split(os.pathsep):
            if os.path.isdir(d) and wget_util in os.listdir(d):
                util_found = True
                break
            # end if
        # end for

        print()
        printlog_info("{} - Downloading {} reference sequences...".format(
            getwt(), len(curr_accessions)))

        if util_found:
            # If we have wget -- just use it
            wget_cmd = 'wget --no-check-certificate "{}" -O {}'.format(
                retrieve_url, tmp_fasta)
            pipe = sp_Popen(wget_cmd, shell=True)
            pipe.communicate()
            if pipe.returncode != 0:
                printlog_error_time(
                    "Error occured while downloading reference sequences")
                platf_depend_exit(pipe.returncode)
            # end if
        else:
            # If there are no wget -- we will download sequences with Python disposal
            stop_wait = Event(
            )  # a flag variable that will signal waiter-function to stop executing

            def download_waiter(stop_wait):
                """
                Function waits untill 'local_fasta' file is downloaded.
                It prints size of downloaded data to console during downloading.
                This function just waits -- it won't bring you the menu :).
                """
                # Wait untill downloading starts
                while not os.path.exists(tmp_fasta):
                    if not stop_wait.is_set():
                        return
                    # end if
                    sleep(1)
                # end while

                MB_size = 1024**2  # we will divide by it to get megabytes

                while stop_wait.is_set():
                    # Get size of downloaded data
                    fsize = round(os.path.getsize(tmp_fasta) / MB_size,
                                  1)  # get megabytes
                    printn("\r{} - {} MB downloaded ".format(getwt(), fsize))
                    sleep(1)  # instant updates are not necessary
                # end while

                # Print total size of downloaded file (it can be deleted by this time)
                try:
                    fsize = round(os.path.getsize(tmp_fasta) / MB_size, 1)
                except OSError:
                    # We can pass this ecxeption -- we do delete this file if downloading crushes
                    # And this function just waits :)
                    pass
                # end try
                printlog_info("\r{} - {} MB downloaded ".format(
                    getwt(), fsize))
            # end def download_waiter

            error = True
            while error:
                try:
                    waiter = Thread(target=download_waiter,
                                    args=(stop_wait, ))  # create thread
                    stop_wait.set()  # raise the flag
                    waiter.start()  # start waiting
                    urllib.request.urlretrieve(
                        retrieve_url, tmp_fasta)  # retrieve FASTA file
                except OSError as err:
                    printlog_error_time(
                        "Error occured while downloading fasta file.")
                    printlog_error(str(err))
                    printlog_error(
                        "`barapost-local.py` will try again in 30 seconds")
                    if os.path.exists(tmp_fasta):
                        os.unlink(tmp_fasta)
                    # end if
                    sleep(30)
                else:
                    error = False
                finally:
                    stop_wait.clear()  # lower the flag
                    waiter.join(
                    )  # main thread will wait until waiter function ends it's work
                # end try
            # end while
        # end if

        printlog_info_time("Downloading is completed")

        # Write chunk to result fasta file
        with open(tmp_fasta, 'r') as infile, open(local_fasta, 'a') as outfile:
            outfile.write(infile.read())
        # end with

        # Remove temp chunk file
        os.unlink(tmp_fasta)
        i += max_accnum  # go to next chunk
def download_taxonomy(hit_acc, hit_def, taxonomy_path):
    # Function retrieves taxonomy of a hit from NCBI.
    # Moreover, it saves this taxonomy in file `taxonomy_path` (TSV):
    #     <accession>\t<taxonomy_str>
    #
    # :param hit_acc: hit accession;
    # :type hit_acc: str;
    # :param hit_def: definition of reference record;
    # :type hit_def: str;
    # :param taxonomy_path: path to TSV file with taxonomy;
    # :type taxonomy_path: str;

    # Get TaxID of the organism from GenBank summary:
    gb_summary = lingering_https_get_request("www.ncbi.nlm.nih.gov",
                                             "/nuccore/{}".format(hit_acc),
                                             "GenBank summary", hit_acc)
    try:
        taxid = re.search(r"ORGANISM=([0-9]+)", gb_summary).group(1)
    except AttributeError:
        printlog_error_time(
            "Error: taxonomy parsing error 115-{}".format(hit_acc))
        printlog_error("Please, contact the developer.")
        platf_depend_exit(115)
    # end try

    # Get taxonomy page of the organism
    taxonomy_url = "/Taxonomy/Browser/wwwtax.cgi?mode=Info&id={}&lvl=3&lin=f&keep=1&srchmode=1&unlock".format(
        taxid)
    taxonomy_text = lingering_https_get_request("www.ncbi.nlm.nih.gov",
                                                taxonomy_url, "taxonomy",
                                                hit_acc)

    # This pattern will match taxonomic names along with their ranks
    tax_rank_pattern = r"TITLE=\"([a-z ]+)\"\>([A-Z].+?)\</a\>"

    # Get all taxonomic names of the organism
    taxonomy = re.findall(tax_rank_pattern, taxonomy_text)

    # We will convert ranks to lowercase just in case.
    # Firstly convert tuples to lists in order to change them:
    taxonomy = list(map(lambda x: list(x), taxonomy))

    # Remove odd information from beginnig of names:
    for i in range(len(taxonomy)):
        taxonomy[i][0] = taxonomy[i][0].lower()  # just in case
    # end for

    # We will leave only following taxonomic ranks: domain, phylum, class, order, family, genus.
    # Species name requires special handling, it will be added later.
    ranks_to_select = ranks[:-1]

    # Remove redundant ranks:
    taxonomy = filter(lambda x: x[0].lower() in ranks_to_select, taxonomy)

    # Convert back to tuples:
    taxonomy = list(map(lambda x: tuple(x), taxonomy))

    # E.g., this record has no appropriate ranks: CP034535
    # Merely save it's definition and return.
    if len(taxonomy) == 0:
        # Save taxonomy
        _tax_accs.append(hit_acc)
        with open(taxonomy_path, 'a') as tax_file:
            tax_file.write("{}\n".format('\t'.join((hit_acc, hit_def))))
        # end with
        # BUGFIX: without this `return` execution fell through, appended a
        #   species and rank padding, and wrote a second (bogus) taxonomy
        #   line for the same accession.
        return
    # end if

    # Check if species name is specified like other ranks:
    check_direct_species_patt = r"TITLE=\"(species)\"\>([A-Za-z0-9 \.]+)\</a\>"
    match_direct_species = re.search(check_direct_species_patt, taxonomy_text)

    if not match_direct_species is None:
        # If species name is specified like other ranks, merely add it to list:
        taxonomy.append((match_direct_species.group(1),
                         match_direct_species.group(2).partition(" ")[2]))
    else:
        # Otherwise we need to parse species name from title
        title = re.search(r"\<title\>Taxonomy browser \((.+)\)\</title\>",
                          taxonomy_text).group(1)

        # Get words
        title = title.split(' ')

        # We will take all this words as species name.
        # Viruses also often have unpredictable names.
        #   Example: MN908947
        try:
            if title[1] in second_words_not_species or taxonomy[0][1].lower(
            ) == "viruses":
                taxonomy.append(("species", '_'.join(title[1:])))
            else:
                taxonomy.append(("species", title[1]))
            # end if
        except IndexError:
            # Handle absence of species name, e.g., this: AC150248.3
            # Well, nothing to append in this case!
            pass
        # end try
    # end if

    # Fill in missing ranks with empty strings
    for i in range(len(ranks)):
        if len(taxonomy) < i + 1:  # for this (missing in the end): AC150248
            taxonomy.append((ranks[i], ""))
        elif taxonomy[i][0] != ranks[
                i]:  # for this (mising in the middle): MN908947
            taxonomy.insert(i, (ranks[i], ""))
        # end if
    # end for

    # It will be a bit faster
    taxonomy = tuple(taxonomy)

    # Save taxonomy
    _tax_accs.append(hit_acc)
    with open(taxonomy_path, 'a') as tax_file:
        tax_file.write("{}\n".format('\t'.join(
            (hit_acc, config_taxonomy_str(taxonomy)))))
def bin_fast5_file(f5_path, tax_annot_res_dir, sens, min_qual, min_qlen,
                   min_pident, min_coverage, no_trash):
    # Function bins FAST5 file with untwisting.
    #
    # Reads are routed to per-taxon `.fast5` files in the output directory,
    #   or to "trash" files if they fail the quality/length or
    #   identity/coverage filters.
    #
    # :param f5_path: path to FAST5 file meant to be processed;
    # :type f5_path: str;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :type sens: str;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;
    #
    # Returns tuple (seqs_pass, QL_seqs_fail, align_seqs_fail) <int, int, int>.

    # Output directory is derived from where the root logger writes its file.
    outdir_path = os.path.dirname(
        logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0  # counter for sequences, which pass filters
    QL_seqs_fail = 0  # counter for too short or too low-quality sequences
    align_seqs_fail = 0  # counter for sequences, which align to their best hit with too low identity or coverage

    srt_file_dict = dict()

    index_dirpath = os.path.join(
        tax_annot_res_dir,
        index_name)  # name of directory that will contain indicies

    # Make filter for quality and length
    QL_filter = get_QL_filter(f5_path, min_qual, min_qlen)
    # Configure path to trash file
    if not no_trash:
        QL_trash_fpath = get_QL_trash_fpath(
            f5_path,
            outdir_path,
            min_qual,
            min_qlen,
        )
    else:
        QL_trash_fpath = None
    # end if

    # Make filter for identity and coverage
    align_filter = get_align_filter(min_pident, min_coverage)
    # Configure path to this trash file
    if not no_trash:
        align_trash_fpath = get_align_trash_fpath(f5_path, outdir_path,
                                                  min_pident, min_coverage)
    else:
        align_trash_fpath = None
    # end if

    # File validation:
    #   RuntimeError will be raised if FAST5 file is broken.
    try:
        # File existance checking is performed while parsing CL arguments.
        # Therefore, this if-statement will trigger only if f5_path's file is not a valid HDF5 file.
        if not h5py.is_hdf5(f5_path):
            raise RuntimeError("file is not of HDF5 (i.e. not FAST5) format")
        # end if
        from_f5 = h5py.File(f5_path, 'r')
        # Touch the file once to surface corruption early.
        for _ in from_f5:
            break
        # end for
    except RuntimeError as runterr:
        printlog_error_time("FAST5 file is broken")
        printlog_error("Reading the file `{}` crashed.".format(
            os.path.basename(f5_path)))
        printlog_error("Reason: {}".format(str(runterr)))
        printlog_error("Omitting this file...")
        print()
        # Return zeroes -- inc_val won't be incremented and this file will be omitted
        return (0, 0, 0)
    # end try

    # singleFAST5 and multiFAST5 files should be processed in different ways
    # "Raw" group always in singleFAST5 root and never in multiFAST5 root
    if "Raw" in from_f5.keys():
        f5_cpy_func = copy_single_f5
    else:
        f5_cpy_func = copy_read_f5_2_f5
    # end if

    readids_to_seek = list(from_f5.keys())  # list of not-binned-yet read IDs

    # Fill the list 'readids_to_seek'
    for read_name in fast5_readids(from_f5):
        # Get rid of "read_"
        readids_to_seek.append(sys.intern(read_name))
    # end for

    # Walk through the index
    index_f5_2_tsv = open_shelve(os.path.join(index_dirpath, index_name), 'r')

    if not f5_path in index_f5_2_tsv.keys():
        printlog_error_time(
            "Source FAST5 file `{}` not found in index".format(f5_path))
        printlog_error("Try to rebuild index")
        platf_depend_exit(1)
    # end if

    for tsv_path in index_f5_2_tsv[f5_path].keys():
        read_names = index_f5_2_tsv[f5_path][tsv_path]
        taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy",
                                     "taxonomy.tsv")
        resfile_lines = configure_resfile_lines(tsv_path, sens, taxonomy_path)

        for read_name in read_names:
            try:
                hit_names, *vals_to_filter = resfile_lines[sys.intern(
                    fmt_read_id(read_name)[1:])]
            except KeyError:
                printlog_error_time("Error: missing taxonomic annotation info for read `{}`"\
                    .format(fmt_read_id(read_name)[1:]))
                printlog_error(
                    "It is stored in `{}` FAST5 file".format(f5_path))
                printlog_error(
                    "Try to make new index file (press ENTER on corresponding prompt)."
                )
                printlog_error(
                    "Or, if does not work for you, make sure that taxonomic annotation info \
for this read is present in one of TSV files generated by `barapost-prober.py` and `barapost-local.py`."
                )
                index_f5_2_tsv.close()
                platf_depend_exit(1)
            # end try

            if not QL_filter(vals_to_filter):
                # Quality/length filter failed -- route to QL trash file.
                # Get name of result FASTQ file to write this read in
                if QL_trash_fpath not in srt_file_dict.keys():
                    srt_file_dict = update_file_dict(srt_file_dict,
                                                     QL_trash_fpath)
                # end if
                f5_cpy_func(from_f5, read_name, srt_file_dict[QL_trash_fpath])
                QL_seqs_fail += 1
            elif not align_filter(vals_to_filter):
                # Identity/coverage filter failed -- route to align trash file.
                # Get name of result FASTQ file to write this read in
                if align_trash_fpath not in srt_file_dict.keys():
                    srt_file_dict = update_file_dict(srt_file_dict,
                                                     align_trash_fpath)
                # end if
                f5_cpy_func(from_f5, read_name,
                            srt_file_dict[align_trash_fpath])
                align_seqs_fail += 1
            else:
                for hit_name in hit_names.split(
                        "&&"
                ):  # there can be multiple hits for single query sequence
                    # Get name of result FASTQ file to write this read in
                    binned_file_path = os.path.join(
                        outdir_path, "{}.fast5".format(hit_name))
                    if binned_file_path not in srt_file_dict.keys():
                        srt_file_dict = update_file_dict(
                            srt_file_dict, binned_file_path)
                    # end if
                    f5_cpy_func(from_f5, read_name,
                                srt_file_dict[binned_file_path])
                # end for
                seqs_pass += 1
            # end if
        # end for
    # end for

    from_f5.close()
    index_f5_2_tsv.close()

    # Close all binned files
    for file_obj in filter(lambda x: not x is None, srt_file_dict.values()):
        file_obj.close()
    # end for

    sys.stdout.write('\r')
    printlog_info_time("File `{}` is binned.".format(
        os.path.basename(f5_path)))
    printn(" Working...")

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)
def _reformat_legacy_file(legacy_tax_path):
    # Function converts a legacy shelve-based taxonomy file to the new TSV format.
    #
    # The legacy file maps accession -> taxonomy (either a rank tuple or a
    #   plain string); each entry becomes one `<acc>\t<taxonomy_str>` TSV line.
    # The legacy file is then renamed with a `_deprecated` suffix
    #   (best effort -- a rename failure is not fatal).
    #
    # :param legacy_tax_path: path to the legacy shelve taxonomy file;
    # :type legacy_tax_path: str;

    import shelve

    # Check if this file is corrupted
    try:
        with shelve.open(legacy_tax_path, 'r') as tax_file:
            pass
        # end with
    except OSError as err:
        printlog_error("Legacy taxonomy file appears to be corrupted.")
        printlog_error("This error might be fatal.")
        str_err = str(err)
        # A missing gdbm backend is a known, user-fixable cause.
        if "dbm.gnu" in str_err and "module is not" in str_err:
            printlog_error(
                "Installing `python3-gdbm` might solve this problem.")
        else:
            printlog_error(
                "The program can't recover taxonomy from the broken file.")
            printlog_error("Seems, you have to annotate your sequences again.")
            printlog_error("Sorry for that :(")
        # end if
        platf_depend_exit(1)
    # end try

    new_tax_path = "{}.tsv".format(legacy_tax_path)
    taxonomy.init_tax_file(new_tax_path)

    printn("Reformatting: `{}` ->".format(legacy_tax_path))
    log_info("Reformatting: `{}` ->".format(legacy_tax_path))

    with shelve.open(legacy_tax_path, 'r') as old_tax_file, open(
            new_tax_path, 'w') as new_tax_file:
        for acc, taxonomy_from_file in old_tax_file.items():
            # Legacy values are either rank tuples (need serialization)
            #   or ready-made taxonomy strings.
            if isinstance(taxonomy_from_file, tuple):
                tax_str = taxonomy.config_taxonomy_str(taxonomy_from_file)
                new_tax_file.write("{}\n".format('\t'.join((acc, tax_str))))
            elif isinstance(taxonomy_from_file, str):
                new_tax_file.write("{}\n".format('\t'.join(
                    (acc, taxonomy_from_file))))
            else:
                # Execution must not reach here
                printlog_error_time("Fatal error 8755.")
                printlog_error("Please, contact the developer.")
                platf_depend_exit(8755)
            # end if
        # end for
    # end with

    printlog_info(" `<same_dir>/{}`".format(os.path.basename(new_tax_path)))

    try:
        renamed_legacy_file = "{}_deprecated".format(legacy_tax_path)
        os.rename(legacy_tax_path, renamed_legacy_file)
    except OSError as err:
        printlog_error_time(
            "Cannot rename legacy taxonomy file `{}`:".format(legacy_tax_path))
        printlog_error(str(err))
        printlog_error(
            "But it's not a problem -- we will proceed with our work.")
    else:
        printlog_info("Renamed: `{}` -> `<same_dir>/{}`".format(
            legacy_tax_path, os.path.basename(renamed_legacy_file)))
    # end try

    printlog_info("Legacy taxonomy file is reformatted to TSV format.")
def format_taxonomy_name(hit_acc, hit_def, sens, tax_dict):
    # Function formats taxonomy name according to chosen sensibility of binning.
    #
    # :param hit_acc: accession(s) of best hit(s), joined with '&&' if several;
    # :type hit_acc: str;
    # :param hit_def: annotation of best hit(s), joined with '&&' if several;
    # :type hit_def: str;
    # :param sens: sensibility returned by 'get_classif_sensibility()' function.
    #   sens[0] is the rank name ("genus" or "species"), sens[1] its index;
    # :param tax_dict: taxonomy dictionary returned by function 'src.taxonomy.get_tax_dict';
    # :type tax_dict: dict;
    #
    # Returns formatted hit name of 'str' type;

    # No hit at all -- the read is binned as "unknown"
    if hit_def == "No significant similarity found":
        return "unknown"
    # end if

    annot_names = list()  # names of binned files, one per best hit

    for acc, annot in zip(hit_acc.split('&&'), hit_def.split('&&')):
        # Taxonomy must be known for every hit accession
        try:
            tax_entry = tax_dict[acc]
        except KeyError:
            raise NoTaxonomyError()
        # end try

        if isinstance(tax_entry, tuple):
            # Tuple-formatted taxonomy: pick the rank name for the filename
            rank_name = find_rank_for_filename(sens, tax_entry)
            if sens[0] == "species":
                # Prefix species with its genus: "<genus>_<species>"
                genus_name = find_rank_for_filename(("genus", sens[1] - 1), tax_entry)
                rank_name = "{}_{}".format(genus_name, rank_name)
            # end if
            annot_names.append(rank_name)
        elif isinstance(tax_entry, str):
            # Plain-string taxonomy (sequence ID). If the hit is a contig from
            #   a SPAdes or a5 assembly, strip the contig suffix unless the user
            #   asked for species-level binning.
            if sens[0] != "species":
                for assembly_patt in (SPADES_PATT, A5_PATT):
                    contig_match = re.search(assembly_patt, annot)
                    if contig_match is not None:
                        tax_entry = tax_entry.replace('--' + contig_match.group(1), '')
                        break
                    # end if
                # end for
            # end if
            annot_names.append(tax_entry)
        else:
            # Execution must not reach here
            printlog_error_time("Fatal error 8754.")
            printlog_error("Please, contact the developer.")
            platf_depend_exit(8754)
        # end if
    # end for

    # Strip characters not allowed in filenames and deduplicate
    annot_names = map(remove_bad_chars, annot_names)
    return "&&".join(set(annot_names))
def bin_fastqa_file(fq_fa_lst, tax_annot_res_dir, sens, n_thr, min_qual,
                    min_qlen, min_pident, min_coverage, no_trash):
    # Function for parallel binning FASTQ and FASTA files.
    # Actually bins multiple files.
    #
    # :param fq_fa_lst: list of paths to FASTQ (or FASTA) files meant to be processed;
    # :type fq_fa_lst: list<str>;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :param n_thr: batch size -- number of records extracted before each locked write;
    # :type n_thr: int;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;
    #
    # Returns (seqs_pass, QL_seqs_fail, align_seqs_fail) counters of 'int' type.

    # Output directory is where the root log file lives
    outdir_path = os.path.dirname(
        logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0  # counter for sequences, which pass filters
    QL_seqs_fail = 0  # counter for too short or too low-quality sequences
    align_seqs_fail = 0  # counter for sequences, which align to their best hit with too low identity or coverage

    for fq_fa_path in fq_fa_lst:

        new_dpath = get_curr_res_dpath(fq_fa_path, tax_annot_res_dir)
        tsv_res_fpath = get_res_tsv_fpath(new_dpath)
        taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy",
                                     "taxonomy.tsv")
        resfile_lines = configure_resfile_lines(tsv_res_fpath, sens,
                                                taxonomy_path)

        # Pick record generator and write function matching the file format
        if is_fastq(fq_fa_path):
            seq_records_generator = fastq_records
            write_fun = write_fastq_record
        else:
            seq_records_generator = fasta_records
            write_fun = write_fasta_record
        # end if

        # Make filter for quality and length
        QL_filter = get_QL_filter(fq_fa_path, min_qual, min_qlen)
        # Configure path to trash file
        if not no_trash:
            QL_trash_fpath = get_QL_trash_fpath(
                fq_fa_path,
                outdir_path,
                min_qual,
                min_qlen,
            )
        else:
            QL_trash_fpath = None
        # end if

        # Make filter for identity and coverage
        align_filter = get_align_filter(min_pident, min_coverage)
        # Configure path to this trash file
        if not no_trash:
            align_trash_fpath = get_align_trash_fpath(fq_fa_path, outdir_path,
                                                      min_pident, min_coverage)
        else:
            align_trash_fpath = None
        # end if

        # Create an iterator that will yield records
        seq_records_iterator = iter(seq_records_generator(fq_fa_path))

        # Batch of (record, destination path) pairs meant to be written to
        #   output files under the lock.
        # BUGFIX: this used to be a dict keyed by read name, which silently
        #   dropped all but the last destination when a read had several
        #   '&&'-joined best hits. A list preserves every destination, matching
        #   the single-thread implementation.
        to_write = list()
        stop = False  # for outer while-loop

        while not stop:

            # Extract batch of records of 'n_thr' size and find their destination paths:
            for _ in range(n_thr):
                try:
                    fastqa_rec = next(seq_records_iterator)
                except StopIteration:
                    stop = True  # for outer while-loop
                    break
                # end try

                read_name = sys.intern(fmt_read_id(
                    fastqa_rec["seq_id"])[1:])  # get ID of the sequence

                try:
                    hit_names, *vals_to_filter = resfile_lines[
                        read_name]  # find hit corresponding to this sequence
                except KeyError:
                    printlog_error_time("Error: read `{}` not found in TSV file containing taxonomic annotation."\
                        .format(read_name))
                    printlog_error("This TSV file: `{}`".format(tsv_res_fpath))
                    printlog_error(
                        "Make sure that this read has been already processed by \
`barapost-prober.py` and `barapost-local.py`.")
                    platf_depend_exit(1)
                # end try

                # If read is found in TSV file:
                if not QL_filter(vals_to_filter):
                    # Place this sequence to QL trash file
                    to_write.append((fastqa_rec, QL_trash_fpath))
                    QL_seqs_fail += 1
                elif not align_filter(vals_to_filter):
                    # Place this sequence to align trash file
                    to_write.append((fastqa_rec, align_trash_fpath))
                    align_seqs_fail += 1
                else:
                    # A read may have several best hits -- write it to every
                    #   corresponding binned file
                    for hit_name in hit_names.split("&&"):
                        # Get name of result FASTQ file to write this read in
                        binned_file_path = os.path.join(
                            outdir_path, "{}.fast{}".format(
                                hit_name, 'q' if is_fastq(fq_fa_path) else 'a'))
                        to_write.append((fastqa_rec, binned_file_path))
                    # end for
                    seqs_pass += 1
                # end if
            # end for

            # Write batch of records to output files:
            with write_lock:
                for record, fpath in to_write:
                    write_fun(fpath, record)
                # end for
            # end with
            to_write.clear()
        # end while

        with write_lock:
            sys.stdout.write('\r')
            printlog_info_time("File `{}` is binned.".format(
                os.path.basename(fq_fa_path)))
            printn(" Working...")
        # end with
    # end for

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)
def bin_fast5_file(f5_path, tax_annot_res_dir, sens, min_qual, min_qlen,
        min_pident, min_coverage, no_trash):
    # Function bins FAST5 file without untwisting.
    #
    # :param f5_path: path to FAST5 file meant to be processed;
    # :type f5_path: str;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;
    #
    # Returns (seqs_pass, QL_seqs_fail, align_seqs_fail) counters of 'int' type.

    outdir_path = os.path.dirname(logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0  # counter for sequences, which pass filters
    QL_seqs_fail = 0  # counter for too short or too low-quality sequences
    align_seqs_fail = 0  # counter for sequences, which align to their best hit with too low identity or coverage

    srt_file_dict = dict()  # maps output file path to open FAST5 file object

    new_dpath = glob("{}{}*{}*".format(tax_annot_res_dir, os.sep, get_checkstr(f5_path)))[0]
    tsv_res_fpath = get_res_tsv_fpath(new_dpath)
    taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")
    resfile_lines = configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path)

    # Make filter for quality and length
    QL_filter = get_QL_filter(f5_path, min_qual, min_qlen)
    # Configure path to trash file
    if not no_trash:
        QL_trash_fpath = get_QL_trash_fpath(f5_path, outdir_path, min_qual, min_qlen,)
    else:
        QL_trash_fpath = None
    # end if

    # Make filter for identity and coverage
    align_filter = get_align_filter(min_pident, min_coverage)
    # Configure path to this trash file
    if not no_trash:
        align_trash_fpath = get_align_trash_fpath(f5_path, outdir_path, min_pident, min_coverage)
    else:
        align_trash_fpath = None
    # end if

    # File validation:
    #   RuntimeError will be raised if FAST5 file is broken.
    try:
        # File existance checking is performed while parsing CL arguments.
        # Therefore, this if-statement will trigger only if f5_path's file is not a valid HDF5 file.
        if not h5py.is_hdf5(f5_path):
            raise RuntimeError("file is not of HDF5 (i.e. not FAST5) format")
        # end if

        from_f5 = h5py.File(f5_path, 'r')

        # Iterating once actually touches the file and surfaces corruption
        for _ in from_f5:
            break
        # end for
    except RuntimeError as runterr:
        printlog_error_time("FAST5 file is broken")
        printlog_error("Reading the file `{}` crashed.".format(os.path.basename(f5_path)))
        printlog_error("Reason: {}".format( str(runterr) ))
        printlog_error("Omitting this file...")
        print()
        # Return zeroes -- inc_val won't be incremented and this file will be omitted
        return (0, 0, 0)
    # end try

    # singleFAST5 and multiFAST5 files should be processed in different ways
    # "Raw" group always in singleFAST5 root and never in multiFAST5 root
    if "Raw" in from_f5.keys():
        f5_cpy_func = copy_single_f5
    else:
        f5_cpy_func = copy_read_f5_2_f5
    # end if

    for read_name in fast5_readids(from_f5):

        try:
            # omit 'read_' in the beginning of FAST5 group's name
            hit_names, *vals_to_filter = resfile_lines[sys.intern(fmt_read_id(read_name)[1:])]
        except KeyError:
            printlog_error_time("Error: read `{}` not found in TSV file containing taxonomic annotation."\
                .format(fmt_read_id(read_name)))
            printlog_error("This TSV file: `{}`".format(tsv_res_fpath))
            printlog_error("Try running barapost-binning with `-u` (`--untwist-fast5`) flag.\n")
            platf_depend_exit(1)
        # end try

        # If read is found in TSV file:
        if not QL_filter(vals_to_filter):
            QL_seqs_fail += 1
            # Open the QL trash file on first use
            if QL_trash_fpath not in srt_file_dict.keys():
                srt_file_dict = update_file_dict(srt_file_dict, QL_trash_fpath)
            # end if
            f5_cpy_func(from_f5, read_name, srt_file_dict[QL_trash_fpath])
        elif not align_filter(vals_to_filter):
            align_seqs_fail += 1
            # BUGFIX: the membership test used to check `QL_trash_fpath`,
            #   so the align trash file was never opened once the QL trash
            #   file existed, raising KeyError on the next line.
            if align_trash_fpath not in srt_file_dict.keys():
                srt_file_dict = update_file_dict(srt_file_dict, align_trash_fpath)
            # end if
            f5_cpy_func(from_f5, read_name, srt_file_dict[align_trash_fpath])
        else:
            for hit_name in hit_names.split("&&"):  # there can be multiple hits for single query sequence
                # Get name of result FAST5 file to write this read in
                binned_file_path = os.path.join(outdir_path, "{}.fast5".format(hit_name))
                if binned_file_path not in srt_file_dict.keys():
                    srt_file_dict = update_file_dict(srt_file_dict, binned_file_path)
                # end if
                f5_cpy_func(from_f5, read_name, srt_file_dict[binned_file_path])
            # end for
            seqs_pass += 1
        # end if
    # end for

    from_f5.close()

    # Close all binned files
    for file_obj in filter(lambda x: not x is None, srt_file_dict.values()):
        file_obj.close()
    # end for

    sys.stdout.write('\r')
    printlog_info_time("File `{}` is binned.".format(os.path.basename(f5_path)))
    printn(" Working...")

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)
def wait_for_align(rid, rtoe, pack_to_send, filename):
    # Function waits until BLAST server accomplishes the request, polling its
    #   status and saving the human-readable result once it is ready.
    #
    # :param rid: Request ID to wait for;
    # :type rid: str;
    # :param rtoe: time in seconds estimated by BLAST server needed to accomplish the request;
    # :type rtoe: int;
    # :param pack_to_send: current packet (id) number to send;
    #   NOTE(review): accessed as `pack_to_send[0]`, so this is an indexable
    #   container (presumably a one-element mutable counter), not a plain int;
    # :param filename: basename of current FASTA file;
    # :type filename: str
    #
    # Returns a tuple (XML response 'str' or None, BlastError);
    #   None response means the packet must be resent.

    print()
    print("Requesting for current query status. Request ID: {}".format(rid))
    print(" `{}`; Submission #{}".format(filename, pack_to_send[0]))
    log_info("Requesting for current query status.")
    log_info("Request ID: {}; `{}`; Submission #{}".format(
        rid,
        filename,
        pack_to_send[0],
    ))
    # RTOE can be zero at the very beginning of resumption
    if rtoe > 0:
        printlog_info_time(
            "BLAST server estimates that alignment will be accomplished in {} seconds"
            .format(rtoe))
        printlog_info_time(
            "Waiting for {}+3 (+3 extra) seconds...".format(rtoe))
        # Server migth be wrong -- we will give it 3 extra seconds
        sleep(rtoe + 3)
        printlog_info_time(
            "{} seconds have passed. Checking if alignment is accomplished...".
            format(rtoe + 3))
    # end if

    server = "blast.ncbi.nlm.nih.gov"
    wait_url = "/blast/Blast.cgi?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=" + rid

    # Width of the text erased by the ANSI cursor-back escape below
    whtspc_len = 6 + len("(requesting)")

    while True:
        resp_content = lingering_https_get_request(server, wait_url,
                                                   "BLAST response")

        # if server asks to wait
        if "Status=WAITING" in resp_content:
            # "\033[%dD" moves the cursor back to overwrite the status line
            printn("\r{} - The request is being processed. Waiting{}{}".format(
                getwt(), ' ' * whtspc_len, "\033[%dD" % whtspc_len))
            # indicate each 10 seconds with a dot, 60 seconds between polls
            for i in range(1, 7):
                sleep(10)
                printn(
                    "\r{} - The request is being processed. Waiting{}".format(
                        getwt(), '.' * i))
            # end for
            printn("(requesting)")
            continue
        elif "Status=FAILED" in resp_content:
            # if job failed
            print()
            printlog_info_time("Job failed\a\n")
            printlog_info("Resending this packet.")
            return None, BlastError(2)
        elif "Status=UNKNOWN" in resp_content:
            # if job expired
            print()
            printlog_info_time("Job expired\a\n")
            printlog_info("Resending this packet.")
            return None, BlastError(1)
        # if results are ready
        elif "Status=READY" in resp_content:
            print()
            printlog_info("Result for query `{}` #{} is ready!".format(
                filename, pack_to_send[0]))
            # if there are hits
            if "ThereAreHits=yes" in resp_content:
                # Decorative countdown separator before retrieval
                for i in range(15, 0, -5):
                    print('-' * i)
                # end for
                print("-\nRetrieving results...")

                # Retrieve human-readable text and put it into result directory
                retrieve_text_url = "/Blast.cgi?CMD=Get&FORMAT_TYPE=Text&DESCRIPTIONS=1&ALIGNMENTS=1&RID=" + rid
                txt_align_res = lingering_https_get_request(
                    server, retrieve_text_url,
                    "text version of BLAST response")

                # Count already existing plain text files in outdir:
                is_txt_response = lambda f: not re.search(
                    r"prober_blast_response_[0-9]+\.txt", f) is None
                # Output dir is derived from the root log file location
                outdir_path = os.path.dirname(logging.getLoggerClass(
                ).root.handlers[0].baseFilename)  # tricky trick
                response_num = len(
                    tuple(filter(is_txt_response, os.listdir(outdir_path))))

                # Curent txt response file will have number `response_num+1`
                txt_hpath = os.path.join(
                    outdir_path,
                    "prober_blast_response_{}.txt".format(response_num + 1))
                # Write text result for a human to read
                with open(txt_hpath, 'w') as txt_file:
                    txt_file.write(txt_align_res)
                # end with
            elif "ThereAreHits=no" in resp_content:
                # if there are no hits
                printlog_info_time("There are no hits. It happens.\n")
            else:
                # probably, job is failed if execution reaches here
                print()
                printlog_info_time("Job failed\a\n")
                printlog_info("Resending this packet.")
                return None, BlastError(2)
            # end if
            break
        # end if
        # Execution should not reach here
        printlog_error_time(
            "Fatal error (-122). Please contact the developer.\a\n")
        platf_depend_exit(-122)
    # end while

    # Retrieve XML result
    retrieve_xml_url = "/Blast.cgi?CMD=Get&FORMAT_TYPE=XML&ALIGNMENTS=1&RID=" + rid
    xml_text = lingering_https_get_request(server, retrieve_xml_url,
                                           "XML BLAST response")

    # Classify transient server-side failures; None response triggers a resend
    if "Bad Gateway" in xml_text:
        print()
        printlog_info_time("Bad Gateway. Data from last packet has been lost.")
        printlog_info("Resending this packet.")
        return None, BlastError(1)
    elif "Status=FAILED" in xml_text:
        print()
        printlog_info_time("BLAST error: request failed")
        printlog_info("Resending this packet.")
        return None, BlastError(2)
    elif "to start it again" in xml_text:
        print()
        printlog_info_time("BLAST error")
        printlog_info("Resending this packet.")
        return None, BlastError(2)
    elif "[blastsrv4.REAL]" in xml_text:
        blastsrv4_match = re.search(r"(\[blastsrv4\.REAL\].*\))", xml_text)
        blastsrv4_str = "" if blastsrv4_match is None else ": {}".format(
            blastsrv4_match.group(1))
        printlog_info_time("BLAST server error{}".format(blastsrv4_str))
        # Error code 2 indicated that we need to split packet and resubmit
        return None, BlastError(2)
    # end if

    return xml_text, BlastError(0)
def bin_fastqa_file(fq_fa_path, tax_annot_res_dir, sens, min_qual, min_qlen,
                    min_pident, min_coverage, no_trash):
    # Function for single-thread binning FASTQ and FASTA files.
    #
    # :param fq_fa_path: path to FASTQ (of FASTA) file meant to be processed;
    # :type fq_fa_path: str;
    # :param tax_annot_res_dir: path to directory containing taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param sens: binning sensitivity;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;
    #
    # Returns (seqs_pass, QL_seqs_fail, align_seqs_fail) counters of 'int' type.

    # Output directory is where the root log file lives
    outdir_path = os.path.dirname(
        logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0        # sequences that pass all filters
    QL_seqs_fail = 0     # too short or too low-quality sequences
    align_seqs_fail = 0  # sequences aligned with too low identity or coverage

    out_files = dict()  # maps output file path to its open file object

    new_dpath = get_curr_res_dpath(fq_fa_path, tax_annot_res_dir)
    tsv_res_fpath = get_res_tsv_fpath(new_dpath)
    taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")
    resfile_lines = configure_resfile_lines(tsv_res_fpath, sens, taxonomy_path)

    # Pick record generator and write function matching the file format
    if is_fastq(fq_fa_path):
        seq_records_generator, write_fun = fastq_records, write_fastq_record
    else:
        seq_records_generator, write_fun = fasta_records, write_fasta_record
    # end if

    # Quality/length filter and its trash file (None disables trash output)
    QL_filter = get_QL_filter(fq_fa_path, min_qual, min_qlen)
    QL_trash_fpath = None if no_trash else get_QL_trash_fpath(
        fq_fa_path,
        outdir_path,
        min_qual,
        min_qlen,
    )

    # Identity/coverage filter and its trash file
    align_filter = get_align_filter(min_pident, min_coverage)
    align_trash_fpath = None if no_trash else get_align_trash_fpath(
        fq_fa_path, outdir_path, min_pident, min_coverage)

    for seq_record in seq_records_generator(fq_fa_path):

        # ID of the sequence (leading '>'/'@' dropped)
        read_name = sys.intern(fmt_read_id(seq_record["seq_id"])[1:])

        # Find the annotation line corresponding to this sequence
        try:
            hit_names, *vals_to_filter = resfile_lines[read_name]
        except KeyError:
            printlog_error_time("Error: read `{}` not found in TSV file containing taxonomic annotation."\
                .format(read_name))
            printlog_error("This TSV file: `{}`".format(tsv_res_fpath))
            printlog_error("Make sure that this read has been already \
processed by `barapost-prober.py` and `barapost-local.py`.")
            platf_depend_exit(1)
        # end try

        # Apply filters and collect destination path(s) for this record
        if not QL_filter(vals_to_filter):
            QL_seqs_fail += 1
            dest_paths = (QL_trash_fpath,)
        elif not align_filter(vals_to_filter):
            align_seqs_fail += 1
            dest_paths = (align_trash_fpath,)
        else:
            seqs_pass += 1
            # there can be multiple hits for single query sequence
            dest_paths = tuple(
                os.path.join(
                    outdir_path, "{}.fast{}".format(
                        hit_name, 'q' if is_fastq(fq_fa_path) else 'a'))
                for hit_name in hit_names.split("&&"))
        # end if

        for dest_path in dest_paths:
            # Open the destination file on first use
            if dest_path not in out_files:
                out_files = update_file_dict(out_files, dest_path)
            # end if
            write_fun(out_files[dest_path], seq_record)
        # end for
    # end for

    # Close all binned files (trash entries may be None when disabled)
    for file_obj in filter(lambda fobj: fobj is not None, out_files.values()):
        file_obj.close()
    # end for

    sys.stdout.write('\r')
    printlog_info_time("File `{}` is binned.".format(
        os.path.basename(fq_fa_path)))
    printn(" Working...")

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)