Example #1
def fasta_records(fa_path):
    # Generator yields records retrieved from a FASTA file.
    #
    # :param fa_path: path to the FASTA file (plain or gzipped) to retrieve sequences from;
    # :type fa_path: str;
    #
    # Yields dictionaries of the following structure:
    # {
    #     "seq_id": ID_of_sequence,
    #     "seq": sequence_itself
    # }

    how_to_open = OPEN_FUNCS[is_gzipped(fa_path)]
    fmt_func = FORMATTING_FUNCS[is_gzipped(fa_path)]

    with how_to_open(fa_path) as fa_file:

        line = fmt_func(fa_file.readline())
        seq_id = line
        seq = ""
        line = fmt_func(fa_file.readline())

        while line != "":

            seq += line
            line = fmt_func(fa_file.readline())

            if line.startswith('>') or line == "":

                yield {"seq_id": seq_id, "seq": seq}
                seq_id = line
                seq = ""
                line = fmt_func(fa_file.readline())
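All examples in this section rely on the helpers `is_gzipped`, `OPEN_FUNCS`, and `FORMATTING_FUNCS`, which are not shown. Below is a minimal sketch consistent with how they are used (gzipped files are opened in binary mode and yield bytes, as Example #3 implies); the actual implementations may differ:

import gzip

def is_gzipped(path):
    # Assumed helper: treat a `.gz` extension as gzipped.
    # The returned bool indexes the tuples below (False -> 0, True -> 1).
    return path.endswith(".gz")

# Index 0: plain text file; index 1: gzipped file (binary mode, yields bytes).
OPEN_FUNCS = (open, gzip.open)

FORMATTING_FUNCS = (
    lambda line: line.strip(),                  # plain text: strip the newline
    lambda line: line.decode("utf-8").strip(),  # gzip: decode bytes, then strip
)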
Example #2
def fastq_records(fq_path):
    # Generator yields records retrieved from a FASTQ file.
    #
    # :param fq_path: path to the FASTQ file (plain or gzipped) to retrieve sequences from;
    # :type fq_path: str;
    #
    # Yields dictionaries of the following structure:
    # {
    #     "seq_id": ID_of_sequence,
    #     "seq": sequence_itself,
    #     "opt_id": the_third_line,
    #     "qual_line": quality_line
    # }

    how_to_open = OPEN_FUNCS[is_gzipped(fq_path)]
    fmt_func = FORMATTING_FUNCS[is_gzipped(fq_path)]

    with how_to_open(fq_path) as fq_file:

        eof = False
        while not eof:

            seq_id = fmt_func(fq_file.readline())
            if seq_id != "":
                yield {
                    "seq_id": seq_id,
                    "seq": fmt_func(fq_file.readline()),
                    "opt_id": fmt_func(fq_file.readline()),
                    "qual_line": fmt_func(fq_file.readline())
                }
            else:
                eof = True
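A hypothetical usage sketch for the two generators above; the file names are placeholders:

# Count records and total bases in a FASTA file:
n_seqs, n_bases = 0, 0
for record in fasta_records("some_sequences.fasta.gz"):  # assumed input file
    n_seqs += 1
    n_bases += len(record["seq"])
print("{} sequences, {} bp total".format(n_seqs, n_bases))

# Print IDs of reads shorter than 100 bp in a FASTQ file:
for record in fastq_records("some_reads.fastq"):  # assumed input file
    if len(record["seq"]) < 100:
        print(record["seq_id"])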
Example #3
def recover_taxonomy(acc, hit_def, taxonomy_path):
    # Function recovers missing taxonomy for a given accession.
    #
    # :param acc: accession of the taxonomy entry to recover;
    # :type acc: str;
    # :param hit_def: name of this sequence;
    # :type hit_def: str;
    # :param taxonomy_path: path to TSV file with taxonomy;
    # :type taxonomy_path: str;

    if acc == "LAMBDA":
        # If we are missing lambda phage taxonomy -- just add it
        save_taxonomy_directly(taxonomy_path, acc,
                               "Lambda-phage-nanopore-control")
    elif acc.startswith("OWN_SEQ_"):
        # If sequence is an "own seq" -- check fasta file

        # Get necessary title line from `local_seq_set.fasta`
        # Firstly find fasta file (it may be compressed)
        classif_dir = os.path.dirname(os.path.dirname(taxonomy_path))
        db_dir = os.path.join(classif_dir, "local_database")
        db_files = glob.glob("{}{}*".format(db_dir, os.sep))
        try:
            local_fasta = next(iter(filter(is_fasta, db_files)))
        except StopIteration:
            printlog_error_time(
                "Error: cannot recover taxonomy for the following sequence:")
            printlog_error(" `{} - {}`.".format(acc, hit_def))
            printlog_error(
                "You can solve this problem by yourself (it's pretty simple).")
            printlog_error("Just add taxonomy line for {} to file `{}`".format(
                acc, taxonomy_path))
            printlog_error("  and run the program again.")
            platf_depend_exit(1)
        # end try

        # Find the line starting with `acc`
        how_to_open = OPEN_FUNCS[is_gzipped(local_fasta)]
        fmt_func = FORMATTING_FUNCS[is_gzipped(local_fasta)]
        if is_gzipped(local_fasta):
            search_for = b">" + bytes(acc, 'ascii') + b" "
        else:
            search_for = ">{} ".format(acc)
        # end if

        with how_to_open(local_fasta) as fasta_file:
            for line in fasta_file:
                if line.startswith(search_for):
                    seq_name = fmt_func(line).partition(' ')[
                        2]  # get name of the sequence
                    save_taxonomy_directly(taxonomy_path, acc, seq_name)
                    break
                # end if
            # end for
        # end with
    else:
        # Try to find taxonomy in NCBI
        download_taxonomy(acc, hit_def, taxonomy_path)
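The predicate `is_fasta` used above is also assumed. A plausible sketch, mirroring the extension pattern used in Examples #4 and #6:

import re

def is_fasta(path):
    # Assumed helper: match FASTA extensions, optionally gzipped
    # (.fa, .fasta, .mfa, .fa.gz, ...).
    return re.search(r"\.(m)?f(ast)?a(\.gz)?$", path) is not None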
Example #4
def process_paral(fq_fa_list, packet_size, tax_annot_res_dir, blast_algorithm,
                  use_index, db_path, nfiles):
    # Function performs 'many_files'-parallel mode of barapost-local.py.

    # :param fq_fa_list: list of paths to FASTA and FASTQ files meant to be processed;
    # :type fq_fa_list: list<str>;
    # :param packet_size: number of sequences processed by blast in a single launching;
    # :type packet_size: int;
    # :param tax_annot_res_dir: path to output directory that contains taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param blast_algorithm: blast algorithm to use;
    # :type blast_algorithm: str;
    # :param use_index: logical value indicating whether to use index;
    # :type use_index: bool;
    # :param db_path: path to database;
    # :type db_path: str;
    # :param nfiles: total number of files;
    # :type nfiles: int;

    queries_tmp_dir = os.path.join(tax_annot_res_dir, "queries-tmp")

    # Iterate over source FASTQ and FASTA files
    for fq_fa_path in fq_fa_list:

        # Create the result directory with the name of the FASTQ or FASTA file being processed:
        new_dpath = create_result_directory(fq_fa_path, tax_annot_res_dir)

        # "hname" means human readable name (i.e. without file path and extention)
        infile_hname = os.path.basename(fq_fa_path)
        infile_hname = re.search(r"(.+)\.(m)?f(ast)?(a|q)(\.gz)?$",
                                 infile_hname).group(1)

        # Look around and check if there are results of previous runs of this script.
        # If 'look_around' returns None, there is no data from a previous run.
        previous_data = look_around(new_dpath, fq_fa_path)

        if previous_data is None:  # If there is no data from previous run
            num_done_seqs = 0  # number of successfully processed sequences
            tsv_res_path = os.path.join(
                new_dpath, "classification.tsv")  # form result tsv file path
        else:  # if there is data from previous run
            num_done_seqs = previous_data[
                "n_done_reads"]  # get number of successfully processed sequences
            tsv_res_path = previous_data[
                "tsv_respath"]  # result tsv file sholud be the same as during previous run
        # end if

        how_to_open = OPEN_FUNCS[is_gzipped(fq_fa_path)]
        fmt_func = FORMATTING_FUNCS[is_gzipped(fq_fa_path)]

        if is_fastq(fq_fa_path):
            packet_generator = fastq_packets
            num_seqs = sum(
                1
                for line in how_to_open(fq_fa_path)) // 4  # 4 lines per record
        else:
            packet_generator = fasta_packets
            try:
                num_seqs = len(tuple(
                    filter(lambda l: l.startswith('>'),
                           map(fmt_func,
                               how_to_open(fq_fa_path).readlines()))))
            except UnicodeDecodeError as err:
                with print_lock:
                    print()
                    printlog_warning("Warning: current file is broken: {}."\
                        .format(str(err)))
                    printlog_warning("File: `{}`".format(
                        os.path.abspath(fq_fa_path)))
                    printlog_warning("This file will not be processed.")
                    continue
                # end with
            # end try
        # end if

        if num_seqs == num_done_seqs:
            with counter_lock:
                file_counter.value += 1
                i = file_counter.value  # save to local var and release lock
            # end with
            with print_lock:
                sys.stdout.write('\r')
                printlog_info_time("File #{}/{} (`{}`) has been already completely processed.".\
                    format(i, nfiles, fq_fa_path))
                printlog_info("Omitting it.")
                printn("Working...")
            # end with
            continue
        # end if

        for packet in packet_generator(fq_fa_path, packet_size, num_done_seqs):

            # Blast the packet
            align_xml_text = launch_blastn(packet["fasta"], blast_algorithm,
                                           use_index, queries_tmp_dir, db_path)

            # Configure result TSV lines
            result_tsv_lines = parse_align_results_xml(align_xml_text,
                                                       packet["qual"])

            # Write the result to tsv
            write_classification(result_tsv_lines, tsv_res_path)
        # end for

        with counter_lock:
            file_counter.value += 1
            i = file_counter.value  # save to local var and release lock
        # end with
        with print_lock:
            sys.stdout.write('\r')
            printlog_info_time("File #{}/{} (`{}`) is processed.".\
                format(i, nfiles, os.path.basename(fq_fa_path)))
            printn("Working...")
        # end with
    # end for

    query_fpath = os.path.join(queries_tmp_dir,
                               "query{}_tmp.fasta".format(os.getpid()))
    remove_tmp_files(query_fpath)
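Both parallel modes count sequences up front by reading the whole file into memory (readlines()). A hedged, streaming alternative that yields the same counts, assuming the helpers sketched after Example #1 and an `is_fastq` predicate analogous to `is_fasta`:

def count_seqs(fq_fa_path):
    # Stream the file once instead of calling readlines():
    # FASTQ has 4 lines per record; FASTA records are counted by '>' headers.
    how_to_open = OPEN_FUNCS[is_gzipped(fq_fa_path)]
    fmt_func = FORMATTING_FUNCS[is_gzipped(fq_fa_path)]
    with how_to_open(fq_fa_path) as infile:
        if is_fastq(fq_fa_path):
            return sum(1 for _ in infile) // 4
        return sum(1 for line in infile if fmt_func(line).startswith('>'))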
Example #5
def fasta_packets(fasta,
                  packet_size,
                  num_done_seqs,
                  packet_mode=0,
                  saved_packet_size=None,
                  saved_packet_mode=None,
                  max_seq_len=float("inf"),
                  probing_batch_size=float("inf")):
    # Generator yields fasta-formatted packets of records from a fasta file.
    # This function skips 'num_done_seqs' sequences (i.e. they will not be processed)
    #     using 'pass_processed_seqs'.
    #
    # :param fasta: path to fasta file;
    # :type fasta: str;
    # :param packet_size: number of sequences to align in one request ('blastn' launching);
    # :type packet_size: int;
    # :param num_done_seqs: number of sequences in the current file that have already been processed;
    # :type num_done_seqs: int;
    # :param packet_mode: packet mode (see -c option);
    # :type packet_mode: int;
    # :param saved_packet_size: size of last sent packet from tmp file. Necessary for resumption.
    #   It will be None, if no tmp file was in classification directory;
    # :type saved_packet_size: int;
    # :param saved_packet_mode: mode used whilst forming the last sent packet from tmp file.
    #   Necessary for resumption. It will be None, if no tmp file was in classification directory;
    # :type saved_packet_mode: int;
    # :param max_seq_len: maximum length of a sequence processed;
    # :type max_seq_len: int (float("inf") if pruning is disabled);
    # :param probing_batch_size: total number of sequences to process in "probing" mode;
    # :type probing_batch_size: int (float("inf") if the probing batch size is not limited);

    how_to_open = OPEN_FUNCS[is_gzipped(fasta)]
    fmt_func = FORMATTING_FUNCS[is_gzipped(fasta)]

    with how_to_open(fasta) as fasta_file:
        # Retrieving the next line is implemented as simple line-from-file reading.
        get_next_line = lambda: fmt_func(fasta_file.readline())

        # Variable that contains the ID of the next sequence in the current FASTA file.
        # If no sequences (or all of them) in the current FASTA file have been processed, this variable is None.
        # There is no way to count sequences in a multi-FASTA file except by counting sequence IDs.
        # Therefore 'next_id_line' should be saved in memory right after a packet is formed.
        next_id_line = pass_processed_seqs(fasta_file, num_done_seqs, fmt_func)

        if next_id_line == "":
            yield {"fasta": "", "qual": dict()}
        # end if

        packet = ""

        # If we are resuming, the nucleotide sequence will be saved in the 'line' variable here:
        try:
            line = get_next_line()
        except UnicodeDecodeError as err:
            print()
            printlog_warning("Warning: current file is broken: {}."\
                .format(str(err)))
            printlog_warning("File: `{}`".format(fasta))
            printlog_warning("Ceasing reading sequences from this file.")
            return
        # end try

        if line.startswith('>'):
            line = fmt_read_id(line)  # format sequence ID
        # end if

        # If some sequences have been passed, this if-statement will be executed.
        # New packet should start with sequence ID line.
        if next_id_line is not None:
            packet += next_id_line + '\n'
        # end if
        packet += line + '\n'  # add recently read line

        # Here goes check for saved packet size and mode:
        if saved_packet_size is not None:
            wrk_pack_size = saved_packet_size
        else:
            wrk_pack_size = packet_size
        # end if

        if saved_packet_mode is not None:
            wrk_pack_mode = saved_packet_mode
        else:
            wrk_pack_mode = packet_mode
        # end if

        eof = False
        while not eof:  # till the end of file

            counter = 0  # variable for counting sequences within packet
            seqlen = 0

            while counter < wrk_pack_size:

                try:
                    line = get_next_line()
                except UnicodeDecodeError as err:
                    print()
                    printlog_warning("Warning: current file is broken: {}."\
                        .format(str(err)))
                    printlog_warning("File: `{}`".format(fasta))
                    printlog_warning(
                        "Ceasing reading sequences from this file.")
                    line = ""
                    break
                # end try

                if line.startswith('>'):
                    line = fmt_read_id(line)
                    if packet_mode == 0:
                        counter += 1
                    else:
                        counter += min(seqlen, max_seq_len)
                        seqlen = 0
                    # end if
                # end if

                if line == "":  # if end of file (data) is reached
                    break
                # end if

                if not line.startswith('>'):
                    seqlen += len(line.strip())
                # end if

                packet += line + '\n'  # add line to packet
            # end while

            if line != "":
                next_id_line = packet.splitlines()[
                    -1]  # save sequence ID next packet will start with
                packet = '\n'.join(packet.splitlines()
                                   [:-1])  # exclude 'next_id_line' from packet
            else:
                eof = True
                next_id_line = None
            # end if

            # Get list of sequence IDs:
            names = filter(lambda l: l.startswith('>'), packet.splitlines())
            names = map(lambda l: l.replace('>', ''), names)

            # {<seq_id>: '-'}, since this is a fasta file
            qual_dict = {name: '-' for name in names}

            if max_seq_len < float("inf"):  # prune sequences
                packet = prune_seqs(packet, max_seq_len)
            # end if

            if packet != "":
                yield {"fasta": packet, "qual": qual_dict}

                if packet_mode == 0:
                    probing_batch_size -= wrk_pack_size
                    wrk_pack_size = min(packet_size, probing_batch_size)
                else:
                    probing_batch_size -= len(qual_dict)
                # end if

                # Switch back to standard packet size
                # As Vorotos said, repeated assignment is the best check:
                if wrk_pack_mode != packet_mode:
                    wrk_pack_mode = packet_mode
                # end if

                if next_id_line is not None:
                    packet = next_id_line + '\n'
                # end if
            else:
                return
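The helper `pass_processed_seqs` is not shown in these examples. Here is a plausible sketch, inferred from how `fasta_packets` uses its return value (None if nothing was skipped, the ID line of the first unprocessed sequence otherwise, "" at end of file); the real implementation may differ:

def pass_processed_seqs(fasta_file, num_done_seqs, fmt_func):
    # Assumed helper: skip `num_done_seqs` FASTA records that were processed
    # in a previous run and return the ID line of the first unprocessed one.
    if num_done_seqs == 0:
        return None  # nothing was processed before; file pointer stays at start
    passed = 0
    line = fmt_func(fasta_file.readline())
    while line != "":
        if line.startswith('>'):
            if passed == num_done_seqs:
                return line  # ID of the first sequence still to process
            passed += 1
        line = fmt_func(fasta_file.readline())
    return ""  # end of file: all sequences have been processed already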
Example #6
def process(fq_fa_list, n_thr, packet_size, tax_annot_res_dir,
            blast_algorithm, use_index, db_path):
    # Function performs "few_files"-parallel mode.
    #
    # :param fq_fa_list: list of paths to files meant to be processed;
    # :type fq_fa_list: list<str>;
    # :param n_thr: number of threads to launch;
    # :type n_thr: int;
    # :param packet_size: number of sequences processed by blast in a single launching;
    # :type packet_size: int;
    # :param tax_annot_res_dir: path to output directory that contains taxonomic annotation;
    # :type tax_annot_res_dir: str;
    # :param blast_algorithm: blast algorithm to use;
    # :type blast_algorithm: str;
    # :param use_index: logical value indicating whether to use index;
    # :type use_index: bool;
    # :param db_path: path to database;
    # :type db_path: str;

    nfiles = len(fq_fa_list)

    for i, fq_fa_path in enumerate(fq_fa_list):
        # Create the result directory with the name of the FASTQ or FASTA file being processed:
        new_dpath = create_result_directory(fq_fa_path, tax_annot_res_dir)

        # "hname" means human readable name (i.e. without file path and extention)
        infile_hname = os.path.basename(fq_fa_path)
        infile_hname = re.search(r"(.+)\.(m)?f(ast)?(a|q)(\.gz)?$", infile_hname).group(1)

        # Look around and check if there are results of previous runs of this script.
        # If 'look_around' returns None, there is no data from a previous run.
        previous_data = look_around(new_dpath, fq_fa_path)

        if previous_data is None: # If there is no data from previous run
            num_done_seqs = 0 # number of successfully processed sequences
            tsv_res_path = os.path.join(new_dpath, "classification.tsv") # form result tsv file path
        else: # if there is data from previous run
            num_done_seqs = previous_data["n_done_reads"] # get number of successfully processed sequences
            tsv_res_path = previous_data["tsv_respath"] # result tsv file sholud be the same as during previous run
        # end if

        how_to_open = OPEN_FUNCS[ is_gzipped(fq_fa_path) ]
        fmt_func = FORMATTING_FUNCS[ is_gzipped(fq_fa_path) ]

        if is_fastq(fq_fa_path):
            packet_generator = fastq_packets
            num_seqs = sum(1 for line in how_to_open(fq_fa_path)) // 4 # 4 lines per record
        else:
            packet_generator = fasta_packets
            try:
                num_seqs = len(tuple(filter(lambda l: l.startswith('>'),
                    map(fmt_func, how_to_open(fq_fa_path).readlines()))))
            except UnicodeDecodeError as err:
                print()
                printlog_warning("Warning: current file is broken: {}."\
                    .format(str(err)))
                printlog_warning("File: `{}`".format(os.path.abspath(fq_fa_path)))
                printlog_warning("This file will not be processed.")
                continue
            # end try
        # end if

        # Guard against a zero packet size when there are fewer sequences than threads:
        packet_size = max(1, min(packet_size, num_seqs // n_thr))

        if num_seqs == num_done_seqs:
            sys.stdout.write('\r')
            printlog_info_time("File #{}/{} (`{}`) has already been completely processed."\
                .format(i+1, nfiles, fq_fa_path))
            printlog_info("Omitting it.")
            printn("Working...")
            continue  # go on to the next file instead of aborting the whole loop
        # end if

        # Get number of sequences to pass to each thread
        file_part_size = num_seqs // n_thr
        if num_seqs % n_thr != 0:
            file_part_size += 1
        # end if

        pool = mp.Pool(n_thr, initializer=init_proc_single_file_in_paral,
            initargs=(mp.Lock(), mp.Lock(),))

        pool.starmap(process_part_of_file, [(file_part,
            tsv_res_path,
            packet_size,
            tax_annot_res_dir,
            blast_algorithm,
            use_index,
            db_path) for file_part in packet_generator(fq_fa_path,
                file_part_size,
                num_done_seqs)])

        # Reaping zombies
        pool.close()
        pool.join()

        sys.stdout.write('\r')
        printlog_info_time("File #{}/{} (`{}`) is processed.".\
            format(i+1, nfiles, os.path.basename(fq_fa_path)))
        printn("Working...")    # end for
Example #7
def fastq_packets(fastq, packet_size, num_done_seqs, packet_mode=0,
    saved_packet_size=None, saved_packet_mode=None,
    max_seq_len=float("inf"), probing_batch_size=float("inf")):
    # Generator yields fasta-formatted packets of records from a fastq file.
    # This function skips the first 'num_done_seqs' sequences
    #   (i.e. they will not be processed).

    # :param fastq: path to fastq file;
    # :type fastq: str;
    # :param packet_size: number of sequences to align in one request ('blastn' launching);
    # :type packet_size: int;
    # :param num_done_seqs: number of sequences in the current file that have already been processed;
    # :type num_done_seqs: int;
    # :param packet_mode: packet mode (see -c option);
    # :type packet_mode: int;
    # :param saved_packet_size: size of last sent packet from tmp file. Necessary for resumption.
    #   It will be None, if no tmp file was in classification directory;
    # :type saved_packet_size: int;
    # :param saved_packet_mode: mode used whilst forming the last sent packet from tmp file.
    #   Necessary for resumption. It will be None, if no tmp file was in classification directory;
    # :type saved_packet_mode: int;
    # :param max_seq_len: maximum length of a sequence processed;
    # :type max_seq_len: int (float("inf") if pruning is disabled);
    # :param probing_batch_size: total number of sequences to process in "probing" mode;
    # :type probing_batch_size: int (float("inf") if the probing batch size is not limited);

    how_to_open = OPEN_FUNCS[ is_gzipped(fastq) ]
    fmt_func = FORMATTING_FUNCS[ is_gzipped(fastq) ]

    with how_to_open(fastq) as fastq_file:

        # Pass reads, which have been already processed:
        for _ in range(int(num_done_seqs * FASTQ_LINES_PER_READ)):
            fastq_file.readline()
        # end for

        # End of file
        eof = False

        # Here goes check for saved packet size and mode:
        if saved_packet_size is not None:
            wrk_pack_size = saved_packet_size
        else:
            wrk_pack_size = packet_size
        # end if

        if saved_packet_mode is not None:
            wrk_pack_mode = saved_packet_mode
        else:
            wrk_pack_mode = packet_mode
        # end if

        if wrk_pack_mode == 0:
            form_packet = form_packet_numseqs
        else:
            form_packet = form_packet_totalbp
        # end if

        # Process all remaining sequences with standard packet size:
        while not eof:

            packet, eof = form_packet(fastq_file, wrk_pack_size, fmt_func, max_seq_len)

            if eof and packet["fasta"] == "":
                return
            # end if

            yield packet

            if packet_mode == 0:
                probing_batch_size -= wrk_pack_size
                wrk_pack_size = min(packet_size, probing_batch_size)
            else:
                probing_batch_size -= len(packet['qual'])
            # end if

            # Switch back to standard packet size
            # As Vorotos said, repeated assignment is the best check:
            if wrk_pack_mode != packet_mode:
                wrk_pack_mode = packet_mode
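`form_packet_numseqs` and `form_packet_totalbp` are not shown. Below is a hedged sketch of the numseqs variant, inferred from how `fastq_packets` consumes its return value (a packet dict plus an EOF flag); the real implementation may differ:

def form_packet_numseqs(fastq_file, packet_size, fmt_func, max_seq_len):
    # Assumed helper: read up to `packet_size` FASTQ records, convert them to
    # FASTA, collect quality lines, and report whether EOF was reached.
    packet, qual_dict, eof = "", {}, False
    for _ in range(packet_size):
        seq_id = fmt_func(fastq_file.readline())
        if seq_id == "":
            eof = True
            break
        seq = fmt_func(fastq_file.readline())
        fmt_func(fastq_file.readline())  # skip the '+' (optional ID) line
        qual_line = fmt_func(fastq_file.readline())
        if len(seq) > max_seq_len:
            seq = seq[:int(max_seq_len)]  # prune overlong sequences
        name = seq_id[1:]  # drop the leading '@'
        packet += ">{}\n{}\n".format(name, seq)
        qual_dict[name] = qual_line
    return {"fasta": packet, "qual": qual_dict}, eof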
Example #8
def build_local_db(tax_annot_res_dir, acc_fpath, your_own_fasta_lst,
                   accs_to_download, use_index):
    # Function creates a database with utilities from 'blast+' toolkit
    #     according to acc_dict and your_own_fasta_lst.
    #
    # :param tax_annot_res_dir: path to current result directory
    #   (each processed file has its own result directory);
    # :type tax_annot_res_dir: str;
    # :param acc_fpath: path to file "hits_to_download.tsv";
    # :type acc_fpath: str;
    # :param your_own_fasta_lst: list of user's fasta files to be included in database;
    # :type your_own_fasta_lst: list<str>;
    # :param accs_to_download: list of accessions from command line ('-s');
    # :type accs_to_download: list<str>;
    # :param use_index: whether to use index ("true"/"false");
    # :type use_index: str;

    # Returns path to created database.

    # Path to directory in which database will be placed
    db_dir = os.path.join(tax_annot_res_dir, "local_database")
    # Path to DBM taxonomy file
    taxonomy_path = os.path.join(tax_annot_res_dir, "taxonomy", "taxonomy.tsv")

    try:
        os.makedirs(db_dir)
    except OSError:
        # If this directory already exists

        while True:
            if len(os.listdir(db_dir)) == 0:
                # If db directory is empty -- break and build a database
                break
            else:
                print()
                printlog_info("Database directory is not empty:")
                printlog_info("  `{}`".format(os.path.abspath(db_dir)))
                printlog_info("Here is it's content:")
                for i, fname in enumerate(os.listdir(os.path.abspath(db_dir))):
                    printlog_info(" {}. `{}`".format(i + 1, fname))
                # end for
                reply = input(
                    """\nPress ENTER to start classification using existing database.
Enter 'r' to remove all files in this directory and create the database from the beginning:>>"""
                )

                if reply == "":
                    # Do not build a database, just return path to it.
                    printlog_info("You have chosen to use extant database.")

                    # Return path to DB located in this directory
                    dbpath = next(iter(os.listdir(db_dir)))
                    # Truncate everything after '.fasta':
                    dbpath = "".join(dbpath.partition(".fasta")[:2])

                    return os.path.join(db_dir, dbpath)

                elif reply == 'r':

                    printlog_info("You have chosen to rebuild the database.")
                    # Rename old classification files and write actual data to new one:
                    old_classif_dirs = filter(
                        lambda d: os.path.exists(
                            os.path.join(d, "classification.tsv")),
                        glob(os.path.join(tax_annot_res_dir, "*")))
                    old_classif_files = tuple(
                        map(lambda f: os.path.join(f, "classification.tsv"),
                            old_classif_dirs))

                    if len(old_classif_files) > 0:
                        print()
                        printlog_info("Renaming old classification files:")
                        for classif_file in old_classif_files:
                            rename_file_verbosely(classif_file)
                        # end for
                    # end if

                    # Empty database directory
                    for file in glob("{}{}*".format(db_dir, os.sep)):
                        os.unlink(file)
                    # end for

                    # Break from the loop in order to build a database
                    break
                else:
                    print("Invalid reply: `{}`\n".format(reply))
                    continue
                # end if
            # end if
        # end while
    # end try

    # It is a dictionary of accessions and record names.
    # Accessions are keys, record names are values.
    acc_dict = configure_acc_dict(acc_fpath, your_own_fasta_lst,
                                  accs_to_download)

    if len(accs_to_download) != 0:
        verify_cl_accessions(accs_to_download, acc_dict)
    # end if

    # Retrieve already existing taxonomy data from taxonomy file
    tax_exist_accs = taxonomy.get_tax_keys(taxonomy_path)

    # If accession file does not exist and execution has reached here -- everything is OK --
    #    we are building a database from user's files only.
    if len(acc_dict) != 0:
        print()

        print("""Following sequences (and all replicons related to them)
  will be downloaded from Genbank for further taxonomic classification
  on your local machine:\n""")
        printlog_info(
            "Following sequences (and all replicons related to them) \
will be downloaded from Genbank for further taxonomic classification \
on your local machine:")
        for i, acc in enumerate(acc_dict.keys()):
            printlog_info(" {}. {} - `{}`".format(i + 1, acc, acc_dict[acc]))
        # end for

        search_for_related_replicons(acc_dict)

        printlog_info_time("Completing taxonomy file...")
        for i, acc in enumerate(acc_dict.keys()):
            if acc not in tax_exist_accs:
                taxonomy.find_taxonomy(acc, acc_dict[acc][1], taxonomy_path)
            # end if
            # Accessions can be of different length
            printn("\r{} - {}: {}/{}".format(getwt(), acc, i +
                                             1, len(acc_dict)) + " " * 10 +
                   "\b" * 10)
        # end for
        print()
        printlog_info_time("Taxonomy file is consistent.")
    # end if

    local_fasta = os.path.join(
        db_dir, "local_seq_set.fasta")  # path to downloaded FASTA file

    add_lambda_phage(local_fasta,
                     taxonomy_path)  # add lambda phage control sequence

    retrieve_fastas_by_acc(
        acc_dict, db_dir, local_fasta)  # download main fasta data from GenBank

    # Add 'your own' fasta files to database
    if len(your_own_fasta_lst) != 0:

        # This variable counts sequences from local files.
        # It is necessary for not allowing duplicated accessions.
        own_seq_counter = 0

        # Check if these files are assemblies made by SPAdes or a5
        spades_patt = r">NODE_[0-9]+"  # this pattern will match sequence IDs generated by SPAdes
        a5_patt = r">scaffold_[0-9]+"  # this pattern will match sequence IDs generated by a5
        # This list will contain paths to assembly files (SPAdes or a5):
        assemblies = []

        for own_fasta_path in reversed(your_own_fasta_lst):

            how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)]
            fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)]

            with how_to_open(own_fasta_path) as fasta_file:
                first_seq_id = fmt_func(fasta_file.readline(
                ))  # get the first line in file (the first seq ID)
            # end with

            # if we've got SPAdes assembly
            if re.search(spades_patt, first_seq_id) is not None:
                assemblies.append(own_fasta_path)
                # Remove this file from the list -- it will be processed in a specific way
                your_own_fasta_lst.remove(own_fasta_path)
                continue
            # end if

            # if we've got a5 assembly
            if re.search(a5_patt, first_seq_id) is not None:
                assemblies.append(own_fasta_path)
                your_own_fasta_lst.remove(own_fasta_path)
                continue
            # end if
        # end for

        # Include assembly files in the multi-fasta file

        # Find common prefix of all assembly paths and remove it from assembly names
        if len(assemblies) > 1:
            assemblies_formatted = tuple(
                map(lambda f: os.path.abspath(f).replace(os.sep, '-'),
                    assemblies))
            common_prefix = find_common_prefix(assemblies_formatted)
            assemblies_formatted = tuple(
                map(lambda f: f.replace(common_prefix, ''),
                    assemblies_formatted))
        elif len(assemblies) > 0:
            common_prefix = ''
            assemblies_formatted = tuple(map(os.path.basename, assemblies))
        # end if

        # Add assembled sequences to database
        with open(local_fasta, 'a') as fasta_db:
            for i, assm_path in enumerate(assemblies):
                printlog_info("Adding `{}` to database...".format(
                    os.path.basename(assm_path)))
                assm_name_fmt = assemblies_formatted[i]

                how_to_open = OPEN_FUNCS[is_gzipped(assm_path)]
                fmt_func = FORMATTING_FUNCS[is_gzipped(assm_path)]
                with how_to_open(assm_path) as fasta_file:
                    for line in fasta_file:
                        line = fmt_func(line)
                        # You can find comments on "OWN_SEQ..." below.
                        # Paths will be written into seq IDs in the following way:
                        #   some-happy-path.fastq--
                        # in order to retrieve them reliably with a regex later.
                        if line.startswith('>'):
                            own_seq_counter += 1
                            own_acc = "OWN_SEQ_{}".format(own_seq_counter)
                            own_def = "{}--".format(
                                assm_name_fmt.replace(common_prefix,
                                                      '')) + line[1:]
                            own_def = remove_bad_chars(own_def)
                            taxonomy.save_taxonomy_directly(
                                taxonomy_path, own_acc, own_def)
                            line = ">" + "{} {}".format(own_acc, own_def)
                        # end if
                        fasta_db.write(line + '\n')
                    # end for
                # end with
            # end for
        # end with

        with open(local_fasta, 'a') as fasta_db:
            for own_fasta_path in your_own_fasta_lst:
                printlog_info("Adding `{}` to database...".format(
                    os.path.basename(own_fasta_path)))

                how_to_open = OPEN_FUNCS[is_gzipped(own_fasta_path)]
                fmt_func = FORMATTING_FUNCS[is_gzipped(own_fasta_path)]
                with how_to_open(own_fasta_path) as fasta_file:
                    for line in fasta_file:
                        line = fmt_func(line)
                        # 'makeblastdb' considers first word (sep. is space) as sequence ID
                        #   and throws an error if there are duplicated IDs.
                        # In order not to allow this duplication we'll create our own sequence IDs:
                        #   'OWN_SEQ_<NUMBER>' and write it in the beginning of FASTA record name.
                        if line.startswith('>'):
                            own_seq_counter += 1
                            own_acc = "OWN_SEQ_{}".format(own_seq_counter)
                            taxonomy.save_taxonomy_directly(
                                taxonomy_path, own_acc, line[1:])
                            line = ">" + own_acc + ' ' + remove_bad_chars(
                                line[1:])
                        # end if
                        fasta_db.write(line + '\n')
                    # end for
                # end with
            # end for
        # end with
    # end if

    # 'lcl|ACCESSION...' entries can be given with '.1'
    #   (or '.2', whatever) terminus by blastn.
    # There is no '.1' terminus in taxonomy file.
    # Therefore we will prune accessions in advance.
    print()
    printn("{} - Formatting accessions...".format(getwt()))
    log_info("Formatting accessions...")
    corrected_path = os.path.join(db_dir, "corrected_seqs.fasta")
    with open(local_fasta, 'r') as source_file, open(corrected_path,
                                                     'w') as dest_file:
        for line in source_file:
            if line.startswith('>'):
                line = line.strip()
                acc, seq_name = (line.partition(' ')[0],
                                 line.partition(' ')[2])
                acc = acc.partition('.')[0]
                seq_name = remove_bad_chars(seq_name)
                seq_name = re.sub(r'[^\x00-\x7F]+', '_',
                                  seq_name)  # remove non-ascii chars
                line = ' '.join((acc, seq_name)) + '\n'
            # end if
            dest_file.write(line)
        # end for
    # end with
    os.unlink(local_fasta)
    os.rename(corrected_path, local_fasta)
    sys.stdout.write("\r{} - Formatting accessions... ok".format(getwt()))
    log_info("Formatting accessions done.")

    # Configure command line
    make_db_cmd = "makeblastdb -in {} -parse_seqids -dbtype nucl".format(
        local_fasta)
    exit_code = os.system(make_db_cmd)  # make a blast-format database
    if exit_code != 0:
        printlog_error_time("Error occured while making the database")
        platf_depend_exit(exit_code)
    # end if

    print("\033[1A{} - Database is successfully created: `{}`\n".format(
        getwt(), local_fasta))
    log_info("Database is successfully created: `{}`".format(local_fasta))

    if use_index == "true":
        printlog_info_time("Database index creating started")
        # Configure command line
        make_index_cmd = "makembindex -input {} -iformat blastdb -verbosity verbose".format(
            local_fasta)
        exit_code = os.system(
            make_index_cmd)  # create an index for the database
        if exit_code != 0:
            printlog_info_time("Error occured while creating database index")
            platf_depend_exit(exit_code)
        # end if

        printlog_info_time("Database index has been successfully created")
    # end if

    # Gzip downloaded FASTA file
    printlog_info_time("Gzipping FASTA file: `{}`".format(local_fasta))

    if gzip_util_found:
        os.system("{} -v {}".format(gzip_util, local_fasta))
    else:
        # form .fasta.gz file 'by hand'
        with open(local_fasta,
                  'rb') as fasta_file, open_as_gzip(local_fasta + ".gz",
                                                    "wb") as fagz_file:
            shutil_copyfileobj(fasta_file, fagz_file)
        # end with
        os.unlink(local_fasta)  # remove source FASTA file, not the database
    # end if

    return local_fasta
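A design note on the `os.system` calls above: they pass `local_fasta` through the shell unquoted, so paths containing spaces would break. A sketch of an equivalent call using `subprocess`, which sidesteps shell quoting (shown for `makeblastdb`; the same applies to `makembindex` and `gzip`):

import subprocess

# Equivalent to the os.system("makeblastdb ...") call above, but the argument
# list is passed to the program directly, with no shell in between.
exit_code = subprocess.call(
    ["makeblastdb", "-in", local_fasta, "-parse_seqids", "-dbtype", "nucl"])
if exit_code != 0:
    printlog_error_time("Error occurred while making the database")
    platf_depend_exit(exit_code)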