Beispiel #1
0
def create_result_directory(fq_fa_path, outdir_path):
    # Function creates a result directory named after the source
    #     FASTQ or FASTA file (file name without sequence extension).
    #
    # :param fq_fa_path: path to source FASTQ or FASTA file;
    # :type fq_fa_path: str;
    # :param outdir_path: path to directory in which result directory will be created;
    # :type outdir_path: str;
    #
    # Returns 'str' path to the result directory.
    # Terminates the program (via `platf_depend_exit`) if the file name has no
    #   recognizable FASTA/FASTQ extension or if the directory cannot be created.

    # dpath means "directory path".
    # Only the base name of the source file is used: the directory lives in `outdir_path`.
    new_dpath = os.path.join(outdir_path, os.path.basename(fq_fa_path))

    # Strip extension: .fa, .fasta, .fq, .fastq, .mfa, .mfasta, optionally followed by .gz
    ext_match = re.search(r"(.*)\.(m)?f(ast)?(a|q)(\.gz)?$", new_dpath)
    if ext_match is None:
        # Without this check `.group(1)` would crash with an obscure AttributeError
        printlog_error_time(
            "Error: can't create result directory for file `{}`".format(
                fq_fa_path))
        printlog_error("The file has no FASTA or FASTQ extension.")
        platf_depend_exit(1)
    # end if
    new_dpath = ext_match.group(1)

    try:
        # `exist_ok=True` removes the check-then-create race;
        #   a regular file occupying this path is reported as an error.
        os.makedirs(new_dpath, exist_ok=True)
    except OSError as oserr:
        printlog_error_time(
            "Error: can't create result directory: `{}`".format(new_dpath))
        printlog_error(str(oserr))
        platf_depend_exit(1)
    # end try
    return new_dpath
Beispiel #2
0
def provide_open_funcs(fpaths):
    # Function selects an opening function for every input file
    #   depending on how the file is stored: gzipped, bzipped2 or plain text.
    # All returned functions open files in text mode with UTF-8 encoding.
    #
    # :param fpaths: collection of paths to input files;
    # :type fpaths: list<str>;
    #
    # Returns list of opening functions in the same order as `fpaths`.
    # If some file is of neither kind, prints an error and calls `platf_depend_exit(1)`.

    openers = []

    try:
        for path in fpaths:
            if _is_gzipped(path):
                opener = functools.partial(
                    gzip.open, mode='rt', encoding='utf-8')
            elif _is_bzipped(path):
                opener = functools.partial(
                    bz2.open, mode='rt', encoding='utf-8')
            elif _is_plain_text(path):
                opener = functools.partial(
                    open, mode='r', encoding='utf-8')
            else:
                # None of the supported kinds -- report it through the handler below
                raise _InvalidFileError(
                    'Error: cannot read file `{}`: '
                    'it is neither plain text file, nor gzipped, nor bzipped2.'
                    .format(path))
            # end if
            openers.append(opener)
        # end for
    except _InvalidFileError as err:
        printlog_error(str(err))
        platf_depend_exit(1)
    # end try

    return openers
Beispiel #3
0
def get_res_tsv_fpath(new_dpath):
    # Function returns path to the classification TSV file (`classification.tsv`)
    #   located in the result directory. Binning will be performed according to this file.
    #
    # :param new_dpath: current result directory;
    # :type new_dpath: str;
    #
    # Returns 'str' path to `classification.tsv` inside `new_dpath`.
    # Terminates the program if the directory or the TSV file does not exist.

    if not os.path.exists(new_dpath):
        printlog_error_time(
            "Error: directory `{}` does not exist!".format(new_dpath))
        printlog_error(
            "Please make sure you have performed taxonomic "
            "annotation of the following file: `{}` "
            "with `barapost-prober.py` and/or `barapost-local.py`".format(
                os.path.basename(new_dpath)))
        printlog_error(
            "Also this error might occur if you forget to specify result directory "
            "generated by `barapost-prober.py` with `-r` option.")
        # NOTE(review): exit status 0 on an error looks unintended; kept for compatibility.
        platf_depend_exit(0)
    # end if

    tsv_res_fpath = os.path.join(new_dpath, "classification.tsv")

    # Previously a missing file caused a bare IndexError on an empty list
    if not os.path.exists(tsv_res_fpath):
        printlog_error_time(
            "Error: file `{}` does not exist!".format(tsv_res_fpath))
        printlog_error(
            "Please make sure you have performed taxonomic annotation "
            "with `barapost-prober.py` and/or `barapost-local.py`")
        platf_depend_exit(1)
    # end if

    return tsv_res_fpath
Beispiel #4
0
def gzip_outfiles(outdir):
    # Function gzips every fastq file (`.fq` or `.fastq`) found directly in `outdir`.
    # The compressing routine itself is obtained from `_get_gzip_func`.
    #
    # :param outdir: path to outdir;
    # :type outdir: str;

    compress = _get_gzip_func()
    print()
    printlog_info_time('Gzipping output files...')

    fastq_pattern = r'.+\.f(ast)?q$'

    # Walk through the directory lazily and skip everything that is not fastq
    for fpath in glob.iglob(os.path.join(outdir, '*')):
        if re.match(fastq_pattern, fpath) is None:
            continue
        # end if
        try:
            compress(fpath)
        except OSError as err:
            printlog_info('Error: cannot gzip file `{}`: {}.'.format(
                fpath, err))
            platf_depend_exit(1)
        # end try
    # end for

    printlog_info_time('Output files are gzipped.')
Beispiel #5
0
def recover_taxonomy(acc, hit_def, taxonomy_path):
    # Function recovers missing taxonomy by given accession.
    # Three cases are distinguished:
    #   1. "LAMBDA" -- taxonomy of lambda phage control is saved directly;
    #   2. "OWN_SEQ_*" -- sequence name is looked up in fasta file of local database;
    #   3. anything else -- taxonomy is downloaded with `download_taxonomy`.
    #
    # :param acc: accession of taxonomy entry to recover;
    # :type acc: str;
    # :param hit_def: name of this sequence;
    # :type hit_def: str;
    # :param taxonomy_path: path to TSV file with taxonomy;
    # :type taxonomy_path: str;

    if acc == "LAMBDA":
        save_taxonomy_directly(taxonomy_path, acc,
                               "Lambda-phage-nanopore-control")
        return
    # end if

    if not acc.startswith("OWN_SEQ_"):
        # Regular accession -- try to find taxonomy in NCBI
        download_taxonomy(acc, hit_def, taxonomy_path)
        return
    # end if

    # "Own" sequence: its title line is stored in fasta file of local database.
    # The database directory is a sibling of the directory containing taxonomy file.
    classif_dir = os.path.dirname(os.path.dirname(taxonomy_path))
    db_files = glob.glob(os.path.join(classif_dir, "local_database", "*"))

    # Take the first fasta file (it may be compressed)
    local_fasta = None
    for db_file in db_files:
        if is_fasta(db_file):
            local_fasta = db_file
            break
        # end if
    # end for

    if local_fasta is None:
        printlog_error_time(
            "Error: cannot recover taxonomy for following sequence:")
        printlog_error(" `{} - {}`.".format(acc, hit_def))
        printlog_error(
            "You can solve this problem by yourself (it's pretty simple).")
        printlog_error("Just add taxonomy line for {} to file `{}`".format(
            acc, taxonomy_path))
        printlog_error("  and run the program again.")
        platf_depend_exit(1)
    # end if

    # Opening and formatting functions are selected by "gzipped" flag.
    # Gzipped file is compared against a bytes prefix, plain one against str
    #   (presumably OPEN_FUNCS opens gzipped files in binary mode -- verify).
    gzipped = is_gzipped(local_fasta)
    how_to_open = OPEN_FUNCS[gzipped]
    fmt_func = FORMATTING_FUNCS[gzipped]

    title_start = ">{} ".format(acc)
    if gzipped:
        title_start = b">" + bytes(acc, 'ascii') + b" "
    # end if

    # Find the title line starting with `acc` and save everything after the first space
    with how_to_open(local_fasta) as fasta_file:
        for line in fasta_file:
            if not line.startswith(title_start):
                continue
            # end if
            seq_name = fmt_func(line).partition(' ')[2]
            save_taxonomy_directly(taxonomy_path, acc, seq_name)
            break
        # end for
    # end with
Beispiel #6
0
def _is_redundant(nc_acc, accs):
    # Function checks if "NC-or-NW"-record is redundant,
    #   i.e. whether accession of its identical GenBank record is already in `accs`.
    #
    # :param nc_acc: accession number of NC-record;
    # :type nc_acc: str;
    # :param accs: tuple of accession numbers;
    # :type accs: tuple<str>;
    #
    # Returns tuple (<accession of identical GenBank record>, <True if it is in `accs`>).

    ncbi_server = "www.ncbi.nlm.nih.gov"

    # NOTE(review): response of this request is not used below;
    #   the request is kept because removing it would change network behaviour.
    lingering_https_get_request(
        ncbi_server,
        "/nuccore/{}?report=genbank&log$=seqview".format(nc_acc), "summary",
        nc_acc)

    try:
        # Step 1: get GI number of the NC sequence
        gi_url = "/nuccore/{}?report=gilist&log$=seqview&format=text".format(
            nc_acc)
        gi_response = lingering_https_get_request(
            ncbi_server, gi_url, "GI of {}".format(nc_acc), nc_acc)
        gi_match = re.search(r"\<pre\>([0-9]+).*\</pre\>",
                             gi_response.replace('\n', ''))
        if gi_match is None:
            raise _NoIdentLabelError(
                "Error 771. Accession: {}. Please, contact the developer.".
                format(nc_acc))
        # end if

        # Step 2: request the link to identical GenBank record.
        # The helper name suggests it follows a 301 redirect (not visible here).
        ident_link = "/nuccore?LinkName=nuccore_nuccore_rsgb&from_uid={}".format(
            gi_match.group(1))
        ident_response = _ling_https_getreq_handl_301(
            ncbi_server, ident_link,
            "link to identical genbank sequence", nc_acc)

        # Step 3: extract accession (without version) from the response text
        ident_match = re.search(r"\<pre\>(.*).*\</pre\>",
                                ident_response.replace('\n', ''))
        if ident_match is None:
            raise _NoIdentLabelError(
                "Error 773. Accession: {}. Please, contact the developer.".
                format(nc_acc))
        # end if

        ident_acc = ident_match.group(1).partition('.')[0]

    except (_NoIdentLabelError, _NoLinkError, _NoAccError) as err:
        printlog_error_time("Error: {}".format(err))
        platf_depend_exit(1)
        return None
    # end try

    return ident_acc, ident_acc in accs
Beispiel #7
0
def create_or_emply_file(file_path):
    # Function creates an empty file `file_path`.
    # If the file already exists, it gets truncated to zero length.
    # On failure prints error message and calls `platf_depend_exit(1)`.
    try:
        # Opening for writing is enough to create/truncate the file
        open(file_path, 'wt').close()
    except OSError as err:
        print('\nError: cannot create file {}'.format(file_path))
        print(str(err))
        platf_depend_exit(1)
    # end try
Beispiel #8
0
def make_outdir(outdpath: str) -> None:
    # Function makes sure that output directory exists:
    #   creates it (with all parent directories) if the path does not exist yet.
    # On failure prints error message and calls `platf_depend_exit(1)`.
    if os.path.exists(outdpath):
        return  # nothing to do
    # end if

    try:
        os.makedirs(outdpath)
    except OSError as err:
        message = 'Error: cannot create output directory `{}`.'.format(outdpath)
        print(message)
        print(str(err))
        platf_depend_exit(1)
    # end try
Beispiel #9
0
def rename_file_verbosely(file):
    # Function verbosely renames file (as well as directory) given to it:
    #   suffix "_old_<number>" is added to its name (before extension for files).
    # The number is derived from count of entries with analogous names
    #   in parent directory of `file`.
    #
    # :param file: path to file (directory) meant to be renamed;
    # :type file: str;
    #
    # Returns new path ('str') or None if `file` does not exist.
    # If renaming fails, error is printed and `platf_depend_exit(1)` is called.

    if not os.path.exists(file):
        return None
    # end if

    # Path to "file's" parent directory
    pardir = os.path.abspath(os.path.dirname(file))

    # Function can rename directories
    if os.path.isdir(file):
        # An entry is "analogous" if the directory base name is found in its name.
        # NOTE(review): base name is inserted into the pattern without `re.escape`,
        #   so regex metacharacters in directory name are interpreted -- confirm intent.
        is_analog = lambda f: not re.search(r"{}.*(_old_[0-9]+)?$"\
            .format(os.path.basename(file)), f) is None
        word = "directory"
        name_itself = file
        ext = ""
    else:
        # An entry is "analogous" if it contains base name of `file` without extension.
        # NOTE(review): all three searches below return None for a path without a dot,
        #   and `.group(1)` then raises AttributeError -- extensionless files are not handled.
        is_analog = lambda f: re.search(r"(.*)\..*$", os.path.basename(file)
                                        ).group(1) in f
        word = "file"
        # Everything before the last dot of the whole path
        name_itself = re.search(r"(.*)\..*$", file).group(1)
        # The last dot of the whole path and everything after it
        ext = re.search(r".*(\..*)$", file).group(1)
    # end if

    # Count files in 'pardir' that have analogous names as 'file' has:
    num_analog_files = len(list(filter(is_analog, os.listdir(pardir))))

    if re.search(r"_old_[0-9]+", file) is None:
        # Append "_old_<number>"
        new_name = name_itself + "_old_" + str(num_analog_files) + ext
    else:
        # Merely substitute new number
        # NOTE(review): `str.replace` substitutes every occurrence of the digit string
        #   in the whole path, not only the one following "_old_" -- verify.
        new_name = file.replace(
            re.search(r"_old_([0-9]+)", file).group(1),
            str(num_analog_files + 1))
    # end if

    try:
        print()
        printlog_info(" - Renaming old {}:".format(word))
        printlog_info("  `{}` --> `{}`".format(file, new_name))
        os.rename(file, new_name)
    except OSError as err:
        printlog_error_time("Error: {} `{}` cannot be renamed:".format(
            word, str(file)))
        printlog_error(str(err))
        platf_depend_exit(1)
    # end try

    return new_name
Beispiel #10
0
def remove_tmp_files(*paths):
    # Function removes files passed to it. Paths that do not exist are skipped.
    #
    # :param paths: paths to files to be removed (passed as separate arguments);
    # :type paths: str;
    #
    # If some file cannot be removed, error is printed and `platf_depend_exit(1)` is called.

    for path in paths:
        if not os.path.exists(path):
            continue
        # end if
        try:
            os.unlink(path)
        except OSError as oserr:
            # The message must be formatted BEFORE it is passed to the logging function:
            #   formerly `.format` was applied to the value returned by `printlog_error_time`.
            printlog_error_time(
                "Error: cannot remove file `{}`".format(path))
            printlog_error(str(oserr))
            platf_depend_exit(1)
        # end try
    # end for
Beispiel #11
0
def update_file_dict(srt_file_dict, new_fpath):
    # Function registers a result file in dictionary of opened files.
    # The file is opened for appending and stored under its (interned) path.
    # If `new_fpath` is None, None is stored under None key.
    #
    # :param srt_file_dict: dictionary {<path>: <opened file>}, it is modified in place;
    # :param new_fpath: path to result file or None;
    #
    # Returns the same dictionary `srt_file_dict`.

    if new_fpath is None:
        srt_file_dict[new_fpath] = None  # handle no_trash
        return srt_file_dict
    # end if

    try:
        srt_file_dict[sys.intern(new_fpath)] = open(new_fpath, 'a')
    except OSError as oserr:
        printlog_error_time("Error occured while opening one of result files")
        printlog_error("Errorneous file: `{}`".format(new_fpath))
        printlog_error(str(oserr))
        platf_depend_exit(1)
    # end try

    return srt_file_dict
Beispiel #12
0
def _fasta_generator(infpath: str) -> Generator[Tuple[str, str], None, None]:
    # Generator yields "fasta-tuples": 0-th element of such a tuple is sequence name
    #   (title line without leading '>'), and 1-st element is the sequence itself (uppercase).
    # Files with names ending with `.gz` are read as gzipped ones.
    #
    # Blank lines are skipped: formerly a blank line was taken for end of file,
    #   and all records after it were silently lost.
    # Every record is validated with `_validate_fasta`; if it raises ValueError,
    #   `platf_depend_exit(1)` is called.

    def _validated_record(seq_name, seq_chunks):
        # Assemble sequence from its lines, validate it and make a "fasta-tuple".
        seq = ''.join(seq_chunks)
        try:
            _validate_fasta(seq_name, seq)
        except ValueError:
            platf_depend_exit(1)
        # end try
        return seq_name[1:], seq
    # end def

    open_func: Callable[[str],
                        ContextManager]  # function for opening input file

    # Choose `open_func`
    if infpath.endswith('.gz'):
        open_func = partial(gzip.open, mode='rt', encoding='utf-8')
    else:
        open_func = partial(open, mode='rt', encoding='utf-8')
    # end if

    infile: TextIO
    with open_func(infpath) as infile:

        # The first line is supposed to be the first sequence name
        curr_seq_name: str = infile.readline().strip()
        # Lines of current sequence; joined once per record instead of repeated `+=`
        curr_seq_chunks: list = []

        for raw_line in infile:
            line: str = raw_line.strip()

            if line == '':
                continue  # blank line is not the end of file
            # end if

            if line.startswith('>'):
                # We reached end of current sequence
                yield _validated_record(curr_seq_name, curr_seq_chunks)
                curr_seq_name = line  # next header
                curr_seq_chunks = []
            else:
                curr_seq_chunks.append(line.upper())
            # end if
        # end for

        # End of file: the last record is still pending
        yield _validated_record(curr_seq_name, curr_seq_chunks)
    # end with
Beispiel #13
0
def _create_outdir_from_outfile(outfpath: str) -> None:
    # Function creates output directory
    # :param outfpath: path to output file;

    outdpath = os.path.dirname(outfpath)

    # Create directory if it does not exist
    if not os.path.isdir(outdpath):
        try:
            os.makedirs(outdpath)
        except OSError as err:
            print(f'Error! Cannot create output directory `{outdpath}`.')
            print(str(err))
            platf_depend_exit(1)
Beispiel #14
0
def fastq_generator(fq_fpaths):
    # Generator yields fastq records.
    # It does not create new FastqRecord object each time.
    # Instead it just updates extant objects, therefore yielded list is
    #   the same object at every iteration.
    #
    # :param fq_fpaths: list of paths to input fastq files;
    # :type fq_fpaths: list<str>, tuple<str>;
    #
    # Yields list of FastqRecord-s, list<FastqRecord> (one record per input file).
    # Input files are closed even if the generator is not exhausted or an error occurs.

    # Get open functions for all files
    open_funcs = src.compression.provide_open_funcs(fq_fpaths)

    fq_files = list()
    fq_records = list()

    try:
        # Open input files and create one reusable FastqRecord per file.
        # Opening is done inside `try` so that already opened files get closed
        #   if opening of a later one fails.
        for fpath, open_func in zip(fq_fpaths, open_funcs):
            fq_files.append(open_func(fpath))
            fq_records.append(FastqRecord(None, None, None, None))
        # end for

        while True:

            # Every fastq record occupies four lines
            for fq_record, fq_file in zip(fq_records, fq_files):
                fq_record.update_record(fq_file.readline().strip(),
                                        fq_file.readline().strip(),
                                        fq_file.readline().strip(),
                                        fq_file.readline().strip())
            # end for

            # Empty read name in the first file means end of file
            if fq_records[0].read_name == '':
                break
            # end if

            # Validate fastq record(s)
            for fq_record in fq_records:
                error_response = fq_record.validate_fastq()
                if error_response is not None:
                    printlog_error('Fastq error: {}'.format(error_response))
                    platf_depend_exit(1)
                # end if
            # end for

            yield fq_records
        # end while
    finally:
        # Close input files.
        for fq_file in fq_files:
            fq_file.close()
        # end for
    # end try
Beispiel #15
0
def _bname_no_fasta_ext(fpath: str) -> str:
    # Function removes fasta extention (with `.gz` one, if it it present)

    # Find the extention
    ext_match_obj: re.Match = re.search(r'.+(\.f(ast)?a(\.gz)?)$',
                                        os.path.basename(fpath))

    # Remove it
    bname_no_ext: str
    if ext_match_obj is None:
        print('Error 12: please, contact the developer.')
        platf_depend_exit(12)
    else:
        bname_no_ext = os.path.basename(fpath).replace(ext_match_obj.group(1),
                                                       '')
    # end if

    return bname_no_ext
Beispiel #16
0
def verify_taxids(taxid_list):
    # Function verifies TaxIDs: for each of them it requests NCBI Taxonomy Browser
    #   and parses organism's name from the response text.
    # Verified names are collected into list of organisms.
    #
    # :param taxid_list: list of TaxIDs;
    # :type taxid_list: list<str>;
    #
    # Returns list of strings of the following format: "<tax_name> (taxid:<TaxID>)".
    # Empty list is returned for empty `taxid_list` (nothing is printed in this case).

    organisms = list()

    if len(taxid_list) == 0:
        return organisms
    # end if

    printlog_info("Verifying TaxIDs:")

    for taxid in taxid_list:
        printn("   {} - ".format(taxid))
        taxonomy_url = "/Taxonomy/Browser/wwwtax.cgi?mode=Info&id={}".format(
            taxid)
        try:
            tax_resp = lingering_https_get_request(
                "www.ncbi.nlm.nih.gov", taxonomy_url, "taxonomy")
            # `re.search` returns None if the name is absent,
            #   hence AttributeError is handled below
            tax_name = re.search(r"Taxonomy browser \((.+?)\)",
                                 tax_resp).group(1)
        except AttributeError:
            printlog_error("\aError: TaxID not found")
            printlog_error(
                "Please, check your TaxID: https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi"
            )
            platf_depend_exit(1)
        except OSError as oserr:
            printlog_error("Something is wrong with connection:")
            printlog_error(str(oserr))
            platf_depend_exit(-2)
        else:
            print(tax_name)
            log_info("{} - {}".format(taxid, tax_name))
            organisms.append("{} (taxid:{})".format(tax_name, taxid))
        # end try
    # end for

    print('-' * 30 + '\n')

    return organisms
Beispiel #17
0
def search_for_related_replicons(acc_dict):
    # Function searches for replicons related to those which are already in `acc_dict`
    #   and adds found replicons to `acc_dict` (the dictionary is modified in place).
    #
    # :param acc_dict: dictionary containing accession data of hits;
    # :type acc_dict: dict<str: tuple<str, str, int>>;

    print()
    printlog_info_time("Searching for related replicons...")

    # Snapshot of initial accessions: `acc_dict` grows while we iterate
    start_accs = tuple(acc_dict.keys())

    for i, acc in enumerate(start_accs):

        printlog_info("{}. {} ({}):".format(i + 1, acc, acc_dict[acc]))

        # Search for related replicons:
        try:
            related_repls = _get_related_replicons(acc, acc_dict)
        except AttributeError:
            # Fixed typo in function name (`printlog_errot_time`),
            #   which turned this error report into NameError
            printlog_error_time(
                "Parsing error: cannot find replicons related to {}.".format(
                    acc))
            printlog_error("Please, contact the developer")
            platf_depend_exit(1)
        else:
            related_repls = _deduplicate_replicons(related_repls, acc)
        # end try

        for rel_acc, rel_def in related_repls:
            acc_dict[rel_acc] = rel_def
        # end for
    # end for

    print()
    num_new_replicons = len(acc_dict) - len(start_accs)
    if num_new_replicons != 0:  # there are some new replicons
        printlog_info_time(
            "{} related replicons have been found.".format(num_new_replicons))
    else:
        printlog_info_time("No related replicons found.")
    # end if
    print()


# end def search_for_related_replicons
Beispiel #18
0
    def whether_to_build_index(index_dirpath):
        # Function checks if there are any files in index directory.
        # If there are any, it asks a user whether to create a new index
        #   (old index files are removed then) or to use old one.
        #
        # :param index_dirpath: path to index directory;
        # :type index_dirpath: str;
        #
        # Returns True if old index should be used, otherwise False.

        if len(os.listdir(index_dirpath)) == 0:
            return False  # no old index -- nothing to ask about
        # end if

        printlog_info(
            "Index file created by `-u` option already exists (left from previous run)."
        )

        use_old_index = False

        # Ask until a valid reply is given
        while True:
            reply = input("  Press ENTER to make new index file\n"
                          "  or enter 'u' to use old index file:>>")
            if reply == 'u':
                use_old_index = True
                break
            # end if
            if reply == "":
                # Remove old index files
                try:
                    for path in glob(os.path.join(index_dirpath, '*')):
                        os.unlink(path)
                    # end for
                except OSError as oserr:
                    printlog_error_time(
                        "Error: cannot remove old index files!")
                    printlog_error(str(oserr))
                    platf_depend_exit(1)
                # end try
                break
            # end if
            print("Invalid reply!\n")
        # end while

        choice = "use old" if use_old_index else "make new"
        printlog_info("You have chosen to {} index file.".format(choice))
        print()

        return use_old_index
Beispiel #19
0
def launch_blastn(packet, blast_algorithm, use_index, queries_tmp_dir,
                  db_path):
    """
    Function launches 'blastn' utility from "BLAST+" toolkit and returns its response.

    The command is passed to `subprocess.Popen` as a list of arguments without
    a shell, so paths containing spaces or shell metacharacters are handled correctly.
    NOTE(review): callers must pass plain (not pre-quoted) paths -- verify.

    :param packet: FASTA data meant to be processed by 'blastn';
    :type packet: str;
    :param blast_algorithm: blastn algorithm to use (value of `-task` option);
    :type blast_algorithm: str;
    :param use_index: value of `-use_index` option;
    :type use_index: bool or str;
    :param queries_tmp_dir: path to directory with query files;
    :type queries_tmp_dir: str;
    :param db_path: path to database;
    :type db_path: str;

    Returns 'str': standard output of 'blastn' (XML, `-outfmt 5`).
    If 'blastn' cannot be launched or fails, prints error and terminates the program.
    """

    # PID of current process won't change, so we can use it to mark query files.
    # Packets are too large to pass them to 'subprocess.Popen' as stdin,
    #    therefore we need to use these query files.
    query_path = os.path.join(queries_tmp_dir,
                              "query{}_tmp.fasta".format(os.getpid()))

    with open(query_path, 'w') as query_file:
        query_file.write(packet)
    # end with

    # Configure command line
    blast_cmd = [
        "blastn",
        "-query", query_path,
        "-db", db_path,
        "-outfmt", "5",
        "-task", str(blast_algorithm),
        "-max_target_seqs", "10",
        "-max_hsps", "1",
        "-use_index", str(use_index),
    ]

    try:
        pipe = sp.Popen(blast_cmd, stdout=sp.PIPE, stderr=sp.PIPE)
    except OSError as oserr:
        # Without a shell, missing executable is reported as an exception
        printlog_error_time(
            "Error occured while aligning a sequence against local database")
        printlog_error(str(oserr))
        platf_depend_exit(1)
    # end try
    stdout_data, stderr_data = pipe.communicate()

    if pipe.returncode != 0:
        printlog_error_time(
            "Error occured while aligning a sequence against local database")
        printlog_error(stderr_data.decode("utf-8"))
        platf_depend_exit(pipe.returncode)
    # end if

    return stdout_data.decode("utf-8")
Beispiel #20
0
def make_outdir(outdir):
    # Function makes output directory.
    # If outdir exists and is not empty, the function warns a user and asks
    #   whether to remove all files in it or to exit.
    #
    # :param outdir: path to outdir;
    # :type outdir: str;

    if not os.path.exists(outdir):
        # Create outdir if it doesn't exist.
        try:
            os.makedirs(outdir)
        except OSError as err:
            print('Cannot create output directory: {}'.format(err))
            platf_depend_exit(1)
        # end try
        return
    # end if

    if len(os.listdir(outdir)) == 0:
        return  # empty directory is ready for use
    # end if

    # Outdir is not empty -- warn user and ask if he/she wants to empty it now.
    print('\nOutput directory `{}` is not empty.'.format(outdir))

    while True:
        reply = input("Press ENTER to remove all files in it and proceed\n"
                      " or enter `q` to exit\n >> ")

        if reply.lower() == 'q':
            # Just exit
            sys.exit(0)
        # end if

        if reply != '':
            print('Invalid reply: `{}`'.format(reply))
            continue
        # end if

        # ENTER is pressed: remove the files; a failure is reported, but it is not fatal
        for fpath in glob.iglob(os.path.join(outdir, '*')):
            print('Removing `{}`'.format(fpath))
            try:
                os.unlink(fpath)
            except OSError as err:
                print('Error. Cannot remove file `{}`: {}'.format(fpath, err))
            # end try
        # end for
        break
    # end while
Beispiel #21
0
def _select_get_matches(
        term: str
) -> Callable[[MutableSequence[Overlap]], Collection[Overlap]]:
    # Function selects a function for obtaining matches depending on terminus:
    #   's' (start) -- `_get_start_matches`,
    #   'e' (end)   -- `_get_end_matches`.
    # Any other value is a fatal error: message is printed
    #   and `platf_depend_exit(1)` is called.
    #
    # :param term: string 's' (start) or 'e' (end);

    if term == 's':
        return _get_start_matches
    # end if

    if term == 'e':
        return _get_end_matches
    # end if

    print('Fatal error: invalid value passed to function '
          '`_get_overlaps_str_for_table` with argument `term`: `{}`'.format(term))
    print('Please, contact the developer.')
    platf_depend_exit(1)
Beispiel #22
0
def parse_args(version: str, last_update_date: str) -> Tuple[Sequence[str], Mapping[str, Any]]:
    # Function parses command line arguments.
    # Returns two values:
    #  1. Collection of paths to input files.
    #  2. Dictionary of parameters (see function _parse_options).
    #
    # If minimum k-mer length is greater than maximum one and only one of them
    #   is specified by user, the other one is adjusted to the specified value.
    # If both are specified, it is an error, and the program terminates.

    cl_args: List[str] = sys.argv[1:]

    # Print help message and exit if required
    if '-h' in cl_args or '--help' in cl_args:
        print_help(version, last_update_date)
        platf_depend_exit()
    # end if

    # Print version and exit if required
    if '-v' in cl_args or '--version' in cl_args:
        print(version)
        platf_depend_exit()
    # end if

    # Parse arguments with getopt
    opts: List[Tuple[str, str]]
    args: List[str]
    try:
        opts, args = getopt.gnu_getopt(cl_args, 'hvk:i:a:o:',
            ['help', 'version', 'k-mer=', 'mink=', 'maxk=', 'outdir='])
    except getopt.GetoptError as err:
        print(str(err))
        platf_depend_exit(2)
    # end try

    # Extract paths to input files from parsed arguments
    contigs_fpaths: Sequence[str] = _get_input_fpaths(args)
    # Extract optional parameters from parsed arguments
    params: Dict[str, Any] = _parse_options(opts)

    # Verify mink and maxk:
    if params['i'] > params['a']:
        # Look at parsed option names instead of raw `sys.argv`:
        #   raw check misses forms like `--mink=5` and `-i5`.
        specified_opts = {opt_name for opt_name, _ in opts}
        mink_specified: bool = '-i' in specified_opts or '--mink' in specified_opts
        maxk_specified: bool = '-a' in specified_opts or '--maxk' in specified_opts

        if not mink_specified:
            params['i'] = params['a']
        elif not maxk_specified:
            params['a'] = params['i']
        else:
            print('Error: minimum length of a k-mer is greater than maximum length of a k-mer.')
            print('Values specified by you:')
            print('Minimum length of a k-mer: {}.'.format(params['i']))
            print('Maximum length of a k-mer: {}.'.format(params['a']))
            platf_depend_exit(1)
        # end if
    # end if

    return contigs_fpaths, params
Beispiel #23
0
def add_lambda_phage(local_fasta, taxonomy_path):
    # Function writes control sequence of nanopore lambda phage DNA-CS
    #    to 'local_fasta' and saves its taxonomy.
    # NOTE(review): 'local_fasta' is opened in 'wb' mode, i.e. its previous content
    #    is overwritten, not appended to -- confirm that this is intended.
    #
    # :param local_fasta: path to file with reference sequences to be included in database;
    # :type local_fasta: str;
    # :param taxonomy_path: path to taxonomy file;
    # :type taxonomy_path: str;

    print()
    printlog_info_time("Adding lambda phage control sequence...")

    # sys.path[0] is directory containing the script that was used to invoke the Python interpreter.
    # File with lambda's sequence is looked for relative to parent of that directory.
    scripts_parent_dir = os.path.dirname(sys.path[0])
    lambda_fpath = os.path.join(scripts_parent_dir, "lambda_control",
                                "nanopore_lambda_DNA-CS_control.fasta.gz")

    # The file must exist
    lambda_file_exists = os.path.exists(lambda_fpath)
    if not lambda_file_exists:
        printlog_error_time(
            "Error: cannot find lambda phage control sequence: '{}'".format(
                lambda_fpath))
        platf_depend_exit(1)
    # end if

    # Read lambda's sequence completely before output file is touched
    with open_as_gzip(lambda_fpath, 'rb') as lambda_file:
        lambda_fasta = lambda_file.read()
    # end with

    # Write it to db fasta file
    with open(local_fasta, 'wb') as db_fasta_file:
        db_fasta_file.write(lambda_fasta)
    # end with

    # Save lambda's taxonomy
    taxonomy.save_taxonomy_directly(taxonomy_path, "LAMBDA",
                                    "Lambda-phage-nanopore-control")

    printlog_info_time(" ok")
def check_depencencies() -> None:
    # Function checks all necessary dependencies for the program.
    # It prints name and version of every dependency.
    # If some checks report errors, all the errors are printed
    #   and `platf_depend_exit(1)` is called.

    # Names of dependencies coupled with functions that check them.
    # Each function returns a pair: (version, error message or None).
    checks = (
        ('Biopython', _check_biopython),
        ('samtools', _check_samtools),
    )

    problems = []

    print('\nDependencies:')

    for dep_name, run_check in checks:
        print('{}:'.format(dep_name), end='')

        version, err_msg = run_check()
        if err_msg is not None:
            problems.append(err_msg)
        # end if

        print(' version {}'.format(version))
    # end for

    # Print errors, if they occured
    if problems:
        print('Dependencies errors:')
        for problem in problems:
            print('  - {}'.format(problem))
        # end for
        platf_depend_exit(1)
    # end if

    print('All dependencies are satisfied.\n')
Beispiel #25
0
def verify_cl_accessions(accs_to_download, acc_dict):
    # Function checks existance of GenBank records that correspond to accessions
    #   specified with '-s' option: title of each record is requested from NCBI.
    # Found titles are stored in 'acc_dict' (it is modified in place).
    #
    # :param accs_to_download: list of accessions from command line ('-s');
    # :type accs_to_download: list<str>;
    # :param acc_dict: dictionary {<ACCESSION>: <HIT_DEFINITION>};
    # :type acc_dict: dict<str: tuple<str>>;

    check_connection("https://www.ncbi.nlm.nih.gov/")

    printlog_info_time("Verifying `-s` accessions...")

    total = len(accs_to_download)
    sys.stdout.write("0/{}".format(total))

    server = "eutils.ncbi.nlm.nih.gov"
    title_pattern = r"\<Item Name=\"Title\" Type=\"String\"\>(.+)\</Item\>"

    for num_done, acc in enumerate(accs_to_download, start=1):

        url = "/entrez/eutils/esummary.fcgi?db=nuccore&id={}".format(acc)
        text = lingering_https_get_request(server, url, "record's name", acc)

        title_match = re.search(title_pattern, text)

        if title_match is None:
            printlog_info(
                "Cannot find GenBank record with accession '{}'".format(acc))
            platf_depend_exit(1)
        # end if

        acc_dict[acc] = None if title_match is None else title_match.group(1)

        # Progress counter is rewritten in place
        sys.stdout.write("\r{}/{}".format(num_done, total))
    # end for
    print()
    printlog_info_time("OK.")
def get_primers_seqs(primers_fpath):
    # Function for obtaining primer sequence(s).
    # If `primers_fpath` is None, it returns default primers:
    #   Illumina 16S V3-V4 primers.
    # Otherwise it parses primers from the provided fasta file.
    #
    # :param primers_fpath: path to fasta file of primers (1 or 2 records,
    #   each occupying exactly two lines: header and sequence), or None;
    # :type primers_fpath: str or None;
    #
    # Returns a tuple of two default primers if `primers_fpath` is None,
    #   otherwise a list of upper-cased primer sequences read from the file.
    # On unreadable file, wrong number of lines or invalid characters
    #   the error is reported and `platf_depend_exit(1)` is called.

    # Use Illumina V3-V4 primers by default
    if primers_fpath is None:
        return ('CCTACGGGNGGCWGCAG', 'GACTACHVGGGTATCTAATCC')
    # end if

    # Get lines
    try:
        with open(primers_fpath, 'r') as primers_file:
            lines = primers_file.readlines()
        # end with
    except OSError as oserror:
        printlog_error(
            'Error while reading file of primers: {}'.format(oserror))
        platf_depend_exit(1)
    # end try

    # Remove blank lines.
    # `readlines()` keeps line terminators, so a line must be stripped
    #   before it can be recognized as blank.
    lines = [line for line in map(str.strip, lines) if line != '']

    # There must be 1 or 2 primers in primers file.
    if len(lines) not in (2, 4):
        printlog_error(
            'Error: invalid format of primers file. '
            'It should be single (2 lines at all) or "double" (4 lines at all) fasta file. '
            'But there are {} lines in your file.'.format(len(lines)))
        platf_depend_exit(1)
    # end if

    # IUPAC nucleotide codes (including degenerate ones)
    bases = 'AGCTURYSWKMBDHVN'
    valid_seq_pattern = re.compile(r'[{}]+'.format(bases))

    primers = list()

    # Validate sequence(s): every odd line (0-based) is a sequence.
    for i in range(1, len(lines), 2):
        seq = lines[i].upper()
        # The whole sequence must consist of permitted characters,
        #   so `fullmatch` is used instead of prefix-only `match`.
        if valid_seq_pattern.fullmatch(seq) is None:
            printlog_error(
                'Error: invalid character in primer sequence. '
                'Here is invalid primer sequence: `{}`. Permitted characters: `{}`'
                .format(seq, bases))
            platf_depend_exit(1)
        # end if
        primers.append(seq)
    # end for

    return primers
Beispiel #27
0
def wait_for_align(rid, rtoe, pack_to_send, filename):
    # Function waits until BLAST server accomplishes the request
    #     and then retrieves the result.
    #
    # Workflow:
    #   1. Report the request being waited for; if `rtoe` is positive,
    #      sleep for `rtoe + 3` seconds.
    #   2. Poll the "SearchInfo" page until its status is no longer WAITING.
    #   3. If the job is READY and has hits, save a plain text version of
    #      the response to the directory of the log file.
    #   4. Retrieve the XML version of the response and check it for
    #      known error markers.
    #
    # :param rid: Request ID to wait for;
    # :type rid: str;
    # :param rtoe: time in seconds estimated by BLAST server needed to accomplish the request;
    # :type rtoe: int;
    # :param pack_to_send: indexable object; its element `[0]` is reported
    #     as the current submission number
    #     (presumably a one-element list -- TODO confirm against callers);
    # :param filename: basename of current FASTA file;
    # :type filename: str
    #
    # Returns a tuple of two elements:
    #   (XML response of type 'str', BlastError(0)) on success;
    #   (None, BlastError(1)) if the job expired or "Bad Gateway" was received;
    #   (None, BlastError(2)) if the job failed or server reported an error.
    # If the status page contains none of the expected statuses,
    #   `platf_depend_exit(-122)` is called.

    print()
    print("Requesting for current query status. Request ID: {}".format(rid))
    print(" `{}`; Submission #{}".format(filename, pack_to_send[0]))
    log_info("Requesting for current query status.")
    log_info("Request ID: {}; `{}`; Submission #{}".format(
        rid,
        filename,
        pack_to_send[0],
    ))
    # RTOE can be zero at the very beginning of resumption
    if rtoe > 0:

        printlog_info_time(
            "BLAST server estimates that alignment will be accomplished in {} seconds"
            .format(rtoe))
        printlog_info_time(
            "Waiting for {}+3 (+3 extra) seconds...".format(rtoe))
        # Server might be wrong -- we will give it 3 extra seconds
        sleep(rtoe + 3)
        printlog_info_time(
            "{} seconds have passed. Checking if alignment is accomplished...".
            format(rtoe + 3))
    # end if

    server = "blast.ncbi.nlm.nih.gov"
    wait_url = "/blast/Blast.cgi?CMD=Get&FORMAT_OBJECT=SearchInfo&RID=" + rid

    # Width of the area that will hold 6 dots and the "(requesting)" label;
    #   it is blanked out before each waiting cycle (see below)
    whtspc_len = 6 + len("(requesting)")

    # Polling loop: every iteration requests the status page once
    while True:
        resp_content = lingering_https_get_request(server, wait_url,
                                                   "BLAST response")

        # if server asks to wait
        if "Status=WAITING" in resp_content:
            # Overwrite leftovers of the previous cycle with spaces and move
            #   the cursor back by the same number of columns ("\033[<n>D")
            printn("\r{} - The request is being processed. Waiting{}{}".format(
                getwt(), ' ' * whtspc_len, "\033[%dD" % whtspc_len))
            # print one more dot every 10 seconds
            #   (6 dots, i.e. 60 seconds between two status requests)
            for i in range(1, 7):
                sleep(10)
                printn(
                    "\r{} - The request is being processed. Waiting{}".format(
                        getwt(), '.' * i))
            # end for
            printn("(requesting)")
            continue
        elif "Status=FAILED" in resp_content:
            # if job failed
            print()
            printlog_info_time("Job failed\a\n")
            printlog_info("Resending this packet.")
            return None, BlastError(2)
        elif "Status=UNKNOWN" in resp_content:
            # if job expired
            print()
            printlog_info_time("Job expired\a\n")
            printlog_info("Resending this packet.")
            return None, BlastError(1)
        # if results are ready
        elif "Status=READY" in resp_content:
            print()
            printlog_info("Result for query `{}` #{} is ready!".format(
                filename, pack_to_send[0]))
            # if there are hits
            if "ThereAreHits=yes" in resp_content:
                # Print a decorative "shrinking" separator: 15, 10 and 5 dashes
                for i in range(15, 0, -5):
                    print('-' * i)
                # end for
                print("-\nRetrieving results...")

                # Retrieve human-readable text and put it into result directory
                retrieve_text_url = "/Blast.cgi?CMD=Get&FORMAT_TYPE=Text&DESCRIPTIONS=1&ALIGNMENTS=1&RID=" + rid
                txt_align_res = lingering_https_get_request(
                    server, retrieve_text_url,
                    "text version of BLAST response")

                # Count already existing plain text files in outdir:
                is_txt_response = lambda f: not re.search(
                    r"prober_blast_response_[0-9]+\.txt", f) is None
                # Output directory is the directory of the log file: it is taken
                #   from the first handler of the root logger (assumes that this
                #   handler has `baseFilename` attribute, i.e. it writes to a file)
                outdir_path = os.path.dirname(logging.getLoggerClass(
                ).root.handlers[0].baseFilename)  # tricky trick
                response_num = len(
                    tuple(filter(is_txt_response, os.listdir(outdir_path))))

                # Current txt response file will have number `response_num+1`
                txt_hpath = os.path.join(
                    outdir_path,
                    "prober_blast_response_{}.txt".format(response_num + 1))
                # Write text result for a human to read
                with open(txt_hpath, 'w') as txt_file:
                    txt_file.write(txt_align_res)
                # end with
            elif "ThereAreHits=no" in resp_content:
                # if there are no hits
                printlog_info_time("There are no hits. It happens.\n")
            else:
                # probably, job is failed if execution reaches here
                print()
                printlog_info_time("Job failed\a\n")
                printlog_info("Resending this packet.")
                return None, BlastError(2)
            # end if
            # Job is done -- leave the polling loop and retrieve XML
            break
        # end if
        # Execution should not reach here: every recognized status either
        #   continues the loop, returns, or breaks out of it
        printlog_error_time(
            "Fatal error (-122). Please contact the developer.\a\n")
        platf_depend_exit(-122)
    # end while

    # Retrieve XML result
    retrieve_xml_url = "/Blast.cgi?CMD=Get&FORMAT_TYPE=XML&ALIGNMENTS=1&RID=" + rid
    xml_text = lingering_https_get_request(server, retrieve_xml_url,
                                           "XML BLAST response")

    # Check the retrieved text for known error markers
    if "Bad Gateway" in xml_text:
        print()
        printlog_info_time("Bad Gateway. Data from last packet has been lost.")
        printlog_info("Resending this packet.")
        return None, BlastError(1)

    elif "Status=FAILED" in xml_text:
        print()
        printlog_info_time("BLAST error: request failed")
        printlog_info("Resending this packet.")
        return None, BlastError(2)

    elif "to start it again" in xml_text:
        print()
        printlog_info_time("BLAST error")
        printlog_info("Resending this packet.")
        return None, BlastError(2)

    elif "[blastsrv4.REAL]" in xml_text:
        # Extract server's error message (if it can be found) for the log
        blastsrv4_match = re.search(r"(\[blastsrv4\.REAL\].*\))", xml_text)
        blastsrv4_str = "" if blastsrv4_match is None else ": {}".format(
            blastsrv4_match.group(1))
        printlog_info_time("BLAST server error{}".format(blastsrv4_str))
        # Error code 2 indicated that we need to split packet and resubmit
        return None, BlastError(2)
    # end if

    return xml_text, BlastError(0)
Beispiel #28
0
def send_request(request, pack_to_send, packet_size, packet_mode, filename,
                 tmp_fpath):
    # Function sends a request to "blast.ncbi.nlm.nih.gov/blast/Blast.cgi"
    #     and then waits for satisfaction of the request and retrieves response text.
    #
    # :param request: request_data (it is a dict that `configure_request()` function returns);
    #     its "payload" and "headers" items are sent;
    # :type request: dict<dict>;
    # :param pack_to_send: current number (like id) of packet meant to be sent now;
    #     it is passed to `wait_for_align()` unchanged;
    # :param packet_size: number of sequences in the packet;
    # :type packet_size: int;
    # :param packet_mode: packet mode; it is only saved to the temporary file;
    # :param filename: basename of current FASTA file; passed to `wait_for_align()`;
    # :type filename: str;
    # :param tmp_fpath: path to temporary file in which Request ID,
    #     packet size and packet mode are saved;
    # :type tmp_fpath: str;
    #
    # Returns whatever `wait_for_align()` returns for the submitted request.
    # If the response contains no Request ID or no RTOE, the response is saved
    #     to `request_denial_response.html` and `platf_depend_exit(1)` is called.

    payload = request["payload"]
    headers = request["headers"]

    server = "blast.ncbi.nlm.nih.gov"
    url = "/blast/Blast.cgi"
    response_text = None

    # Try to submit the request until a response is obtained
    while response_text is None:
        conn = None
        try:
            conn = http.client.HTTPSConnection(server)  # create a connection
            conn.request("POST", url, payload, headers)  # send the request
            response = conn.getresponse()  # get the response
            response_text = str(response.read(), "utf-8")  # get response text
        except OSError as oserr:
            printlog_info_time(
                "`https://blast.ncbi.nlm.nih.gov` is not available.")
            printlog_info(str(oserr))
            printlog_info(
                "barapost will try to connect again in 30 seconds...\n")
        finally:
            # Close the connection after every attempt (successful or not),
            #   so that failed attempts do not leak sockets
            if conn is not None:
                conn.close()
            # end if
        # end try

        if response_text is None:
            sleep(30)
        # end if
    # end while

    rid_match = re.search(r"RID = (.+)", response_text)  # Request ID
    rtoe_match = re.search(
        r"RTOE = ([0-9]+)",
        response_text)  # time to wait provided by the NCBI server

    if rid_match is None or rtoe_match is None:
        printlog_error_time("Seems, NCBI has denied your request.")
        printlog_error("Response is in file `request_denial_response.html`")
        with open("request_denial_response.html", 'w') as den_file:
            den_file.write(response_text)
        # end with
        platf_depend_exit(1)
    # end if

    rid = rid_match.group(1)
    rtoe = int(rtoe_match.group(1))

    # Save temporary data
    with open(tmp_fpath, 'w') as tmpfile:
        tmpfile.write("Request_ID: {}\n".format(rid))
        tmpfile.write("Packet_size: {}\n".format(packet_size))
        tmpfile.write("Packet_mode: {}".format(packet_mode))
    # end with

    # Wait for results of alignment
    return wait_for_align(rid, rtoe, pack_to_send, filename)
Beispiel #29
0
def bin_fastqa_file(fq_fa_lst, tax_annot_res_dir, sens, n_thr, min_qual,
                    min_qlen, min_pident, min_coverage, no_trash):
    # Function for parallel binning FASTQ and FASTA files.
    # Actually bins multiple files.
    #
    # Records are read in batches of `n_thr` records; each batch is written
    #   to output files while holding the global `write_lock`.
    # A read having several hits (hit names are separated with "&&")
    #   is written to the file of every hit.
    #
    # :param fq_fa_lst: list of paths to FASTQ (or FASTA) files meant to be processed;
    # :type fq_fa_lst: list<str>;
    # :param tax_annot_res_dir: path to directory with results of taxonomic annotation
    #   (it is expected to contain `taxonomy/taxonomy.tsv`);
    # :type tax_annot_res_dir: str;
    # :param sens: value passed to `configure_resfile_lines()` unchanged
    #   (presumably binning sensitivity -- TODO confirm);
    # :param n_thr: number of records in a single batch
    #   (presumably equals to number of threads -- TODO confirm);
    # :type n_thr: int;
    # :param min_qual: threshold for quality filter;
    # :type min_qual: float;
    # :param min_qlen: threshold for length filter;
    # :type min_qlen: int (or None, if this filter is disabled);
    # :param min_pident: threshold for alignment identity filter;
    # :type min_pident: float (or None, if this filter is disabled);
    # :param min_coverage: threshold for alignment coverage filter;
    # :type min_coverage: float (or None, if this filter is disabled);
    # :param no_trash: logical value. True if user does NOT want to output trash files;
    # :type no_trash: bool;
    #
    # Returns tuple of three counters:
    #   (<passed filters>, <failed quality/length filter>, <failed alignment filter>).

    # Output directory is the directory of the log file
    #   (taken from the first handler of the root logger)
    outdir_path = os.path.dirname(
        logging.getLoggerClass().root.handlers[0].baseFilename)

    seqs_pass = 0  # counter for sequences, which pass filters
    QL_seqs_fail = 0  # counter for too short or too low-quality sequences
    align_seqs_fail = 0  # counter for sequences, which align to their best hit with too low identity or coverage

    for fq_fa_path in fq_fa_lst:

        new_dpath = get_curr_res_dpath(fq_fa_path, tax_annot_res_dir)
        tsv_res_fpath = get_res_tsv_fpath(new_dpath)
        taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy",
                                     "taxonomy.tsv")
        resfile_lines = configure_resfile_lines(tsv_res_fpath, sens,
                                                taxonomy_path)

        # Choose functions for reading and writing records
        #   and the last letter of output files' extension
        if is_fastq(fq_fa_path):
            seq_records_generator = fastq_records
            write_fun = write_fastq_record
            ext_letter = 'q'
        else:
            seq_records_generator = fasta_records
            write_fun = write_fasta_record
            ext_letter = 'a'
        # end if

        # Make filter for quality and length
        QL_filter = get_QL_filter(fq_fa_path, min_qual, min_qlen)
        # Make filter for identity and coverage
        align_filter = get_align_filter(min_pident, min_coverage)

        # Configure paths to trash files
        if not no_trash:
            QL_trash_fpath = get_QL_trash_fpath(
                fq_fa_path,
                outdir_path,
                min_qual,
                min_qlen,
            )
            align_trash_fpath = get_align_trash_fpath(fq_fa_path, outdir_path,
                                                      min_pident, min_coverage)
        else:
            QL_trash_fpath = None
            align_trash_fpath = None
        # end if

        # Create an iterator that will yield records
        seq_records_iterator = iter(seq_records_generator(fq_fa_path))
        # List for storing batches of (record, destination path) pairs
        #   meant to be written to output files.
        # A list is used (not a dict keyed by read name), because one read
        #   can have several destinations.
        to_write = list()
        stop = False  # for outer while-loop

        while not stop:

            # Extract batch of records of 'n_thr' size and find their destination paths:
            for _ in range(n_thr):

                try:
                    fastqa_rec = next(seq_records_iterator)
                except StopIteration:
                    stop = True  # for outer while-loop
                    break
                # end try

                read_name = sys.intern(fmt_read_id(
                    fastqa_rec["seq_id"])[1:])  # get ID of the sequence

                try:
                    hit_names, *vals_to_filter = resfile_lines[
                        read_name]  # find hit corresponding to this sequence
                except KeyError:
                    printlog_error_time("Error: read `{}` not found in TSV file containing taxonomic annotation."\
                        .format(read_name))
                    printlog_error("This TSV file: `{}`".format(tsv_res_fpath))
                    printlog_error(
                        "Make sure that this read has been already processed by \
`barapost-prober.py` and `barapost-local.py`.")
                    platf_depend_exit(1)
                # end try

                # If read is found in TSV file:
                if not QL_filter(vals_to_filter):
                    # Place this sequence to QL trash file
                    to_write.append((fastqa_rec, QL_trash_fpath))
                    QL_seqs_fail += 1
                elif not align_filter(vals_to_filter):
                    # Place this sequence to alignment trash file
                    to_write.append((fastqa_rec, align_trash_fpath))
                    align_seqs_fail += 1
                else:
                    # There can be multiple hits for a single read:
                    #   the read goes to the file of each of them
                    for hit_name in hit_names.split("&&"):
                        # Get name of result file to write this read in
                        binned_file_path = os.path.join(
                            outdir_path, "{}.fast{}".format(
                                hit_name, ext_letter))
                        to_write.append((fastqa_rec, binned_file_path))
                    # end for
                    seqs_pass += 1
                # end if
            # end for

            # Write batch of records (including the last, incomplete one)
            #   to output files:
            with write_lock:
                for record, fpath in to_write:
                    write_fun(fpath, record)
                # end for
            # end with
            to_write.clear()
        # end while

        # Everything has been written by now: just report
        with write_lock:
            sys.stdout.write('\r')
            printlog_info_time("File `{}` is binned.".format(
                os.path.basename(fq_fa_path)))
            printn(" Working...")
        # end with
    # end for

    return (seqs_pass, QL_seqs_fail, align_seqs_fail)
Beispiel #30
0
def lingering_https_get_request(server, url, request_for=None, acc=None):
    # Function performs a "lingering" HTTPS request.
    # It means that the function tries to get the response
    #     again and again if the request fails.
    #
    # Connection errors are retried endlessly with 30 sec pauses.
    # Non-200 responses of NCBI servers are retried `max_attempts` times
    #     with 15 sec pauses; after that (or at once for other servers)
    #     the error is reported and `platf_depend_exit(1)` is called.
    #
    # :param server: server address;
    # :type server: str;
    # :param url: the rest of url;
    # :type url: str;
    # :param request_for: some comment for error message;
    # :type request_for: str;
    # :param acc: GenBank accession;
    # :type acc: str;
    #
    # Returns obtained response decoded as UTF-8 ('str').

    # We can get spurious 404 or sth due to instability of NCBI servers work.
    # Let's give it 3 attempts (with 15 sec spans in between),
    #   and if all them are unsuccessful -- terminate execution.
    attempt_i = 0
    max_attempts = 3

    resp_content = None

    while resp_content is None:
        conn = None
        delay = 0  # seconds to wait before the next attempt (0 -- no retry)
        try:
            conn = http.client.HTTPSConnection(server,
                                               timeout=30)  # create connection
            conn.request("GET", url)  # send the request
            response = conn.getresponse()  # get the response

            if response.status != 200:
                if attempt_i < max_attempts and "ncbi.nlm.nih.gov" in server:
                    printlog_error("Error {}: {}.".format(
                        response.status, response.reason))
                    printlog_error(
                        "It may be due to instable work of NCBI servers.")
                    printlog_error("{} attempts to connect left, waiting 15 sec..."\
                        .format(max_attempts - attempt_i))
                    attempt_i += 1
                    # Actually wait and retry instead of returning
                    #   the body of the error page as a valid response
                    delay = 15
                else:
                    printlog_error("Cannot find {} for {}.".format(
                        request_for, acc))
                    printlog_error("Request failed with status code {}: {}"\
                        .format(response.status, response.reason))
                    platf_depend_exit(1)
                # end if
            # end if

            if delay == 0:
                resp_content = str(response.read(), "utf-8")  # get response text
            # end if
        except (OSError,\
                http.client.RemoteDisconnected,\
                socket.gaierror,\
                http.client.CannotSendRequest) as err:
            comment_str = ""
            if request_for is not None:
                comment_str += " requesting for {}".format(request_for)
                if acc is not None:
                    comment_str += " (accession: `{}`)".format(acc)
                # end if
                comment_str += '.'
            # end if
            print()
            printlog_info("Can't connect to `{}`{}".format(
                server + url, comment_str))
            printlog_info(str(err))
            printlog_info(
                """the program will sleep for 30 seconds and try to connect again."""
            )
            delay = 30
        finally:
            # `conn` stays None if creation of the connection itself failed
            if conn is not None:
                conn.close()
            # end if
        # end try

        # Sleep with the connection already closed
        if delay != 0:
            sleep(delay)
        # end if
    # end while
    return resp_content