Ejemplo n.º 1
0
def get_btax_blastdb(btax_dict,
                     db_dir,
                     btr_profiles=None,
                     num_threads=None,
                     config_path=None):
    """Create a BLAST database for every base taxon listed in ``btax_dict``.

    For each entry the taxon description is loaded into a ``BtaxInfo``, its
    genomes are passed to ``get_btax_fna``, the ``fna_path`` of every genome
    reported back by that call is recorded, and ``create_btax_blastdb`` is
    run on the resulting fna file. The entry in ``btax_dict`` is then
    replaced by the updated ``BtaxInfo.get_json()`` output.

    :param btax_dict: mapping of base taxon name to its dict description;
        it is updated in place and also returned
    :param db_dir: database directory handed to the helper functions
    :param btr_profiles: accepted for future use; currently has no effect
    :param num_threads: if truthy it is stored into ``conf_constants``,
        otherwise the value from ``conf_constants`` is taken
    :param config_path: optional config file used to refresh the constants
    :return: the same ``btax_dict`` object with refreshed entries
    """
    if config_path:
        conf_constants.update_by_config(config_path=config_path)
        conf_constants_db.update_by_config(config_path=config_path)
    # Keep the argument and the shared configuration in agreement.
    if num_threads:
        conf_constants.num_threads = num_threads
    else:
        num_threads = conf_constants.num_threads

    # Can be parallel
    for taxon_name in btax_dict:
        taxon = BtaxInfo.load_from_dict(btax_dict[taxon_name])
        taxon.btax_fna, taxon.fna_id, fna_by_genome_id = get_btax_fna(
            btax_genomes=taxon.genomes, btax_name=taxon_name, db_dir=db_dir)

        for idx, genome_dict in enumerate(taxon.genomes):
            genome = GenomeInfo.load_from_dict(genome_dict)
            if genome.genome_id not in fna_by_genome_id:
                continue
            genome.fna_path = fna_by_genome_id[genome.genome_id]
            taxon.genomes[idx] = genome.get_json()

        taxon.blastdb = create_btax_blastdb(
            btax_fna_path=taxon.btax_fna,
            btax_name=taxon_name,
            db_dir=db_dir,
            blast_inst_dir=conf_constants.blast_inst_dir,
            logger=eagle_logger)

        # TODO: create repr profile when btr_profiles is not None

        btax_dict[taxon_name] = taxon.get_json()
    return btax_dict
Ejemplo n.º 2
0
def _parse_cmd_args(*args):
    """Parse the command line of the database construction tool.

    :param args: command-line tokens (without the program name)
    :return: dict of parsed options keyed by argparse destination name

    If ``--config-path`` is given, the shared constants are refreshed from
    that file and ``num_threads`` in the result is replaced by the value
    held in ``conf_constants`` afterwards.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-dbt",
        "--dbtype",
        help="The type of database to create (bacteria or archea or eukaryota)",
        required=True)
    parser.add_argument(
        "-irefseq",
        "--input-table-refseq",
        help="Path to a table with organisms to download from NCBI refseq",
        required=False,
        default=None)
    parser.add_argument(
        "-igenbank",
        "--input-table-genbank",
        help="Path to a table with organisms to download from NCBI genbank",
        required=False,
        default=None)
    parser.add_argument(
        "-icustom",
        "--input-table-custom",
        help=
        "Path to a table with custom genomes and their taxonomy (not implemented yet). "
        "The table consists of two necessary columns (1 - genome path; 2 - genome taxonomy) "
        "and one optional column (3 - organism name)",
        required=False,
        default=None)
    # type=int: the level is used as a (negative) list index downstream, so a
    # value typed on the command line must not stay a string.
    parser.add_argument(
        "-btl",
        "--btax-level",
        help="The taxonomic level to split input genomes into base taxons "
        "(1 - species, 2 - genus, 3 - family, etc)",
        type=int,
        required=False,
        default=0)
    parser.add_argument(
        "-btcp",
        "--btax-class-profile",
        help=
        "The path to HMM profile of sequences that should be used for base taxons classification "
        "while the db construction (not implemented yet)",
        required=False,
        default=None)
    parser.add_argument(
        "-btrp",
        "--btax-rep-profile",
        help=
        "The path to HMM profile of sequences that should be used for a base taxon response "
        "while essential and advantageous genes exploration (not implemented yet)",
        required=False,
        default=None)
    parser.add_argument("-d",
                        "--db-dir",
                        help="Path to a directory to collect database files",
                        required=False,
                        default=None)
    parser.add_argument("-nt",
                        "--num-threads",
                        help="Threads number (can be set in config file)",
                        required=False,
                        default=conf_constants.num_threads)
    parser.add_argument(
        "-po",
        "--prepared-organisms",
        help='Path to a json with organisms not to prepare listed. '
        'Format as follows: {"org_name": true}',
        required=False,
        default=None)
    parser.add_argument(
        "-poinf",
        "--prepared-organisms-info",
        help='Path to a json with info for organisms just prepared',
        required=False,
        default=None)
    parser.add_argument("-c",
                        "--config-path",
                        help="Path to a config file",
                        required=False,
                        default=None)

    cmd_args = parser.parse_args(args)
    if cmd_args.config_path:
        conf_constants.update_by_config(config_path=cmd_args.config_path)
        conf_constants_db.update_by_config(config_path=cmd_args.config_path)
        # The config file takes precedence over -nt here.
        cmd_args.num_threads = conf_constants.num_threads
    return vars(cmd_args)
Ejemplo n.º 3
0
def create_bactdb(input_table_refseq=None,
                  input_table_genbank=None,
                  input_table_custom=None,
                  btax_level=int(),
                  btax_class_profile=None,
                  btax_rep_profile=None,
                  db_dir=DEFAULT_BACTDB_DIR,
                  num_threads=None,
                  prepared_genomes=PREPARED_BACTERIA_F_NAME,
                  prepared_genomes_info=None,
                  config_path=None,
                  **kwargs):
    """Run the whole bacterial database construction pipeline.

    Steps, in order: resolve settings against the shared constants, collect
    genomes with ``get_bacteria_from_ncbi`` (optionally joined with the json
    given by ``prepared_genomes_info``), split them into base taxons with
    ``get_btax_dict``, build the per-taxon BLAST databases, build the
    profiles database, and finally write the base taxons json and the
    database info json into ``db_dir``.

    :param input_table_refseq: path to the refseq organisms table
    :param input_table_genbank: path to the genbank organisms table
    :param input_table_custom: custom genomes table (only a warning is
        logged - not implemented)
    :param btax_level: taxonomic level for base taxons; a falsy value means
        "take it from ``conf_constants_db``"
    :param btax_class_profile: custom classification profile (only a warning
        is logged - the default 16S rRNA profile is always used)
    :param btax_rep_profile: custom representative profile (only a warning
        is logged)
    :param db_dir: output directory; a falsy value selects the default one
    :param num_threads: threads number; a falsy value means "take it from
        ``conf_constants``"
    :param prepared_genomes: path to the json of already prepared organisms
    :param prepared_genomes_info: optional path to a json list of genomes to
        join with the downloaded ones
    :param config_path: optional config file used to refresh the constants
    :param kwargs: accepted and ignored
    :return: the database info (``DBInfo.get_json()`` output)
    """
    if config_path:
        conf_constants.update_by_config(config_path=config_path)
        conf_constants_db.update_by_config(config_path=config_path)

    # Explicit arguments are pushed into the shared constants; missing ones
    # are pulled from there.
    if btax_level:
        conf_constants_db.btax_level = btax_level
    else:
        btax_level = conf_constants_db.btax_level
    db_dir = db_dir or DEFAULT_BACTDB_DIR
    if num_threads:
        num_threads = int(num_threads)
        conf_constants.num_threads = num_threads
    else:
        num_threads = conf_constants.num_threads
    prepared_genomes = prepared_genomes or PREPARED_BACTERIA_F_NAME

    if btax_class_profile is not None:
        # TODO: implement loading btc_profiles from custom profiles
        eagle_logger.warning(
            "custom btax classification profiles are not implemented currently - default will be used"
        )
    # TODO: use the default below only when no custom profile is supplied
    default_btc_profile = SeqProfileInfo(name="16S_rRNA", seq_type="nucl")
    btc_profiles = [default_btc_profile.get_json()]

    if btax_rep_profile is not None:
        # TODO: implement loading btr_profiles from custom profiles
        eagle_logger.warning(
            "custom btax representative profiles are not implemented currently - default will be used"
        )
    # TODO: use the default below only when no custom profile is supplied
    btr_profiles = None

    # TODO: this code should not get the btax classification sequence (16S rRNA)
    no_input_given = all(
        table is None
        for table in (input_table_custom, input_table_refseq,
                      input_table_genbank))
    if no_input_given:
        input_table_refseq = DEFAULT_REFSEQ_BACTERIA_TABLE
        input_table_genbank = DEFAULT_GENBANK_BACTERIA_TABLE

    bacteria_list = []
    if not (input_table_refseq is None and input_table_genbank is None):
        bacteria_list = get_bacteria_from_ncbi(
            refseq_bacteria_table=input_table_refseq,
            genbank_bacteria_table=input_table_genbank,
            bactdb_dir=db_dir,
            num_threads=num_threads,
            prepared_bacteria_f_path=prepared_genomes)
    if input_table_custom is not None:
        # TODO: implement custom genomes input
        eagle_logger.warning("custom genomes input is not implemented yet")
    if prepared_genomes_info:
        with open(prepared_genomes_info) as prep_genomes_info_f:
            extra_genomes = json.load(prep_genomes_info_f)
            bacteria_list = join_genomes_lists(genomes_list_1=bacteria_list,
                                               genomes_list_2=extra_genomes)

    # TODO: implement code to obtain btax classification sequence from fna with hmm profile
    # profiles input should be a list of SeqProfilesInfo objects
    # result - btc_seqs_path field of GenomeInfo objects in bacteria_list filled
    # currently it is filled during get_bacteria_from_ncbi run - not good

    btax_dict = get_btax_dict(genomes_list=bacteria_list,
                              btax_level=btax_level,
                              btc_profiles=btc_profiles,
                              db_dir=db_dir,
                              num_threads=num_threads,
                              build_tree=not bool(btr_profiles))
    btax_dict = get_btax_blastdb(btax_dict,
                                 db_dir=db_dir,
                                 btr_profiles=btr_profiles,
                                 num_threads=num_threads)

    repr_profiles_path = create_profiles_db(
        btax_dict,
        db_dir=db_dir,
        profiles_db_name=PROFILES_DB_NAME,
        method="hmmer",
        hmmer_inst_dir=conf_constants.hmmer_inst_dir,
        config_path=config_path,
        logger=eagle_logger)

    btax_json_path = os.path.join(db_dir, BTAX_JSON_NAME)
    with open(btax_json_path, "w") as btax_json_f:
        # maybe btax_dict will be dumped in get_btax_dict
        json.dump(btax_dict, btax_json_f, indent=2)

    db_info_obj = DBInfo(
        all_genomes=os.path.join(db_dir, BACTERIA_LIST_F_NAME),
        btax_json=btax_json_path,
        repr_profiles=repr_profiles_path,
        global_dist_matrix=os.path.join(db_dir, BACTERIA_GLOBAL_DIST_MATRIX),
        all_org_full_names=os.path.join(db_dir,
                                        BACTERIA_SHORT_TO_FULL_ORG_NAMES))
    db_info = db_info_obj.get_json()
    with open(os.path.join(db_dir, DB_INFO_NAME), "w") as db_info_f:
        json.dump(db_info, db_info_f, indent=2)
    return db_info
Ejemplo n.º 4
0
def get_families_dict(bacteria_list,
                      db_dir,
                      num_threads=None,
                      only_repr=False,
                      config_path=None):
    """Group bacteria into a family/genus/species/strain tree and prepare it.

    Entries that are empty or whose ``"16S_rRNA_file"`` does not exist on
    disk are skipped, as are non-representative entries when ``only_repr``
    is in effect. The resulting nested dict is passed to
    ``prepare_families`` together with the path ``db_dir/BACT_FAM_F_NAME``;
    the json found at that path afterwards is loaded and returned.

    :param bacteria_list: iterable of dicts with the keys
        ``download_prefix``, ``16S_rRNA_file``, ``source_db``, ``repr``,
        ``family``, ``genus``, ``species`` and ``strain``
    :param db_dir: database directory
    :param num_threads: if truthy it is stored into ``conf_constants``,
        otherwise the value from ``conf_constants`` is taken
    :param only_repr: if truthy it is stored into ``conf_constants_db``,
        otherwise the value from ``conf_constants_db`` is taken
    :param config_path: optional config file used to refresh the constants
    :return: content of the families json file
    """
    if config_path:
        conf_constants.update_by_config(config_path=config_path)
        conf_constants_db.update_by_config(config_path=config_path)
    if not num_threads:
        num_threads = conf_constants.num_threads
    else:
        conf_constants.num_threads = num_threads
    if not only_repr:
        only_repr = conf_constants_db.only_repr
    else:
        conf_constants_db.only_repr = only_repr

    families_dict = dict()
    for bacterium in bacteria_list:
        if not bacterium:
            continue
        if not os.path.exists(bacterium["16S_rRNA_file"]):
            continue
        bacterium_data = {
            "download_prefix": bacterium["download_prefix"],
            "16S_rRNA_file": bacterium["16S_rRNA_file"],
            "fna_file": None,
            "source_db": bacterium["source_db"],
            "repr": bacterium['repr']
        }
        if only_repr and not bacterium['repr']:
            continue
        # Read all four keys first so that a malformed entry fails before
        # the tree is touched.
        family = bacterium['family']
        genus = bacterium['genus']
        species = bacterium['species']
        strain = bacterium['strain']
        strains = families_dict.setdefault(family, {}) \
                               .setdefault(genus, {}) \
                               .setdefault(species, {})
        strains[strain] = bacterium_data

    bact_fam_f_path = os.path.join(db_dir, BACT_FAM_F_NAME)
    prepare_families(families_dict,
                     db_dir,
                     bact_fam_f_path,
                     num_threads=num_threads)

    # Use a context manager so the file handle is closed deterministically.
    with open(bact_fam_f_path) as bact_fam_f:
        return json.load(bact_fam_f)
Ejemplo n.º 5
0
def get_btax_dict(genomes_list,
                  btax_level,
                  btc_profiles,
                  db_dir,
                  num_threads=None,
                  build_tree=False,
                  config_path=None,
                  **kwargs):
    """Split genomes into base taxons and compute per-taxon reference data.

    Genomes are grouped by ``taxonomy[-btax_level]`` (falling back to
    ``taxonomy[0]`` when the taxonomy is too short). For every classification
    profile a multiple alignment of the collected sequences is built and
    turned into a distance matrix; the matrices are combined by
    ``get_global_dist`` and dumped in phylip format to ``db_dir`` together
    with a json mapping short names to full names. After
    ``standardize_btax`` each base taxon gets its ``ref_tree_full_names``
    and, if ``build_tree`` is set, its mean/median distances and - for more
    than two organisms - a reference tree and representative profiles.

    :param genomes_list: iterable of genome dicts (falsy items and genomes
        without ``btc_seqs_id`` are skipped)
    :param btax_level: taxonomic level counted from the end of the taxonomy
    :param btc_profiles: list of ``SeqProfileInfo`` dicts
    :param db_dir: directory where output files are written
    :param num_threads: if truthy it is stored into ``conf_constants``,
        otherwise the value from ``conf_constants`` is taken
    :param build_tree: whether to compute distances, trees and profiles for
        each base taxon
    :param config_path: optional config file used to refresh the constants
    :param kwargs: ``aln_tmp_dir`` and ``save_alignments`` are read here;
        all of kwargs is also forwarded to ``construct_mult_aln``
    :return: mapping of base taxon name to ``BtaxInfo.get_json()`` output
    """

    if config_path:
        conf_constants.update_by_config(config_path=config_path)
        conf_constants_db.update_by_config(config_path=config_path)
    if not num_threads:
        num_threads = conf_constants.num_threads
    else:
        conf_constants.num_threads = num_threads

    btax_dict = defaultdict(BtaxInfo)
    # profile name -> {sequence id -> sequence}
    btc_fasta_dict = defaultdict(dict)
    seq_ids_to_orgs = dict()
    for genome_dict in genomes_list:
        if not genome_dict:
            continue
        genome_info = GenomeInfo.load_from_dict(genome_dict)
        if not genome_info.btc_seqs_id:
            continue
        try:
            btax_name = genome_info.taxonomy[-btax_level]
        except IndexError:
            btax_name = genome_info.taxonomy[0]
        btax_dict[btax_name].genomes.append(genome_info.get_json())
        if btax_dict[btax_name].name is None:
            btax_dict[btax_name].name = btax_name
        btc_seqs_fasta_dict = load_fasta_to_dict(genome_info.btc_seqs_fasta)
        # btc_seqs_id maps a sequence id to the name of the profile it
        # belongs to.
        for btc_seq_id in genome_info.btc_seqs_id:
            seq_ids_to_orgs[btc_seq_id] = genome_info.org_name
            btc_fasta_dict[genome_info.btc_seqs_id[btc_seq_id]][
                btc_seq_id] = btc_seqs_fasta_dict[btc_seq_id]

    btc_profile_types = dict()
    for btc_profile_dict in btc_profiles:
        btc_profile_info = SeqProfileInfo.load_from_dict(btc_profile_dict)
        btc_profile_types[btc_profile_info.name] = btc_profile_info.seq_type
    btc_dist_dict = dict()
    btc_aln_dict = dict()
    short_to_full_seq_names = dict()
    for btc_profile_name in btc_fasta_dict:
        btc_mult_aln = construct_mult_aln(
            seq_dict=btc_fasta_dict[btc_profile_name],
            aln_type=btc_profile_types[btc_profile_name],
            aln_name=btc_profile_name + "_aln",
            tmp_dir=kwargs.get("aln_tmp_dir", "mult_aln_tmp"),
            method=conf_constants_db.btc_profile_aln_method,
            num_threads=num_threads,
            logger=eagle_logger,
            op=5.0,
            ep=0.5,
            **kwargs)  # low_memory can be set through kwargs

        # TODO: only the code from else block should be remained after moving 16S rRNA obtaining out from get_bacteria_from_ncbi
        if btc_profile_name == "16S_rRNA":
            # Raw string: "\|" is not a valid escape in a normal literal.
            btc_mult_aln.short_to_full_seq_names = \
                reduce_seq_names({re.sub(r"lcl\|(N(C|Z)_)?", "", seq_name): seq_name for seq_name in btc_mult_aln},
                                 num_letters=10, num_words=1)[0]
        else:
            btc_mult_aln.short_to_full_seq_names = short_to_full_seq_names.copy(
            )

        btc_mult_aln.remove_paralogs(seq_ids_to_orgs=seq_ids_to_orgs,
                                     inplace=True)
        btc_mult_aln.improve_aln(inplace=True)
        btc_dist_dict[btc_profile_name] = btc_mult_aln.get_distance_matrix(
        )  # TODO: implement specific positions method
        short_to_full_seq_names.update(btc_mult_aln.short_to_full_seq_names)
        if kwargs.get("save_alignments", False):
            btc_mult_aln.dump_alignment(
                aln_fasta_path=os.path.join(db_dir, btc_mult_aln.aln_name +
                                            ".fasta"))
        btc_mult_aln.rename_seqs(seq_ids_to_orgs)
        btc_aln_dict[btc_profile_name] = deepcopy(btc_mult_aln)

    global_dist_matr = get_global_dist(btc_dist_dict, btc_profiles,
                                       seq_ids_to_orgs)
    global_dist_matr_path = os.path.join(db_dir, BACTERIA_GLOBAL_DIST_MATRIX)
    short_to_full_seq_names_path = os.path.join(
        db_dir, BACTERIA_SHORT_TO_FULL_ORG_NAMES)
    # From here on the mapping is the one returned by the matrix dump, not
    # the one accumulated from the alignments above.
    short_to_full_seq_names = global_dist_matr.dump(
        matrix_path=global_dist_matr_path, matr_format="phylip")
    with open(short_to_full_seq_names_path, "w") as short_to_full_org_names_f:
        json.dump(short_to_full_seq_names, short_to_full_org_names_f, indent=2)

    eagle_logger.info("base taxons standardisation started")
    btax_dict = standardize_btax(btax_dict=btax_dict,
                                 global_dist_matr=global_dist_matr)
    eagle_logger.info("base taxons standardisation finished")

    full_to_short_seq_names = {
        v: k
        for k, v in short_to_full_seq_names.items()
    }
    for btax_name in btax_dict:
        btax_orgs = set(
            GenomeInfo.load_from_dict(genome).org_name
            for genome in btax_dict[btax_name].genomes)
        if build_tree:
            btax_dict[btax_name].mean_d = global_dist_matr[btax_orgs].mean_dist
            btax_dict[btax_name].median_d = global_dist_matr[
                btax_orgs].median_dist
            # A tree and profiles are built only for more than two organisms.
            if len(btax_orgs) > 2:
                btax_dict[btax_name].ref_tree_newick = build_tree_by_dist(
                    global_dist_matr[btax_orgs],
                    tree_name=btax_name + "_tree").newick
                btax_btc_aln_dict = dict()
                for btc_profile_name, btc_aln in btc_aln_dict.items():
                    btax_btc_aln = btc_aln[btax_orgs].improve_aln(
                        inplace=False)
                    btax_btc_aln.aln_name = btax_name + "_" + btc_profile_name
                    btax_btc_aln_dict[btc_profile_name] = deepcopy(
                        btax_btc_aln)
                btax_dict[btax_name].repr_profiles = generate_btax_profiles(
                    btax_btc_aln_dict,
                    db_dir=db_dir,
                    btax_name=btax_name,
                    method="hmmer")
        btax_dict[btax_name].ref_tree_full_names = \
            {full_to_short_seq_names[btax_org]: btax_org for btax_org in btax_orgs}
        # Replace the BtaxInfo object by its json representation.
        btax_dict[btax_name] = btax_dict[btax_name].get_json()
    return btax_dict
Ejemplo n.º 6
0
def _load_ncbi_table(table_path):
    """Read a tab-separated organisms table sorted by its ``ncbi_link`` column.

    When ``table_path`` is None an empty table with the columns used by
    ``get_bacteria_from_ncbi`` is returned, so a single source can be used.
    """
    if table_path is None:
        return pandas.DataFrame(columns=["ncbi_link", "org_name", "repr"],
                                dtype=str)
    return pandas.read_csv(table_path, sep="\t",
                           dtype=str).sort_values(by="ncbi_link")


def _bacterium_params(row, source_db, prepared_bacteria, db_dir):
    """Build the parameters dict passed to ``worker`` for one table row."""
    return {
        'function': get_bacterium,
        'prepared_bacteria': prepared_bacteria,
        'logger_name': eagle_logger.name,
        'ncbi_db_link': row["ncbi_link"],
        'bacterium_name': row["org_name"],
        'is_repr': bool_from_str(row["repr"]),
        'db_dir': db_dir,
        'source_db': source_db,
        'try_err_message': "%s is not prepared: " % row["org_name"],
    }


def get_bacteria_from_ncbi(refseq_bacteria_table=None,
                           genbank_bacteria_table=None,
                           bactdb_dir=DEFAULT_BACTDB_DIR,
                           num_threads=None,
                           first_bact=None,
                           last_bact=None,
                           prepared_bacteria_f_path=PREPARED_BACTERIA_F_NAME,
                           remove_bact_list_f=False,
                           config_path=None):
    """Prepare the bacteria listed in the refseq/genbank tables.

    Both tables are sorted by ``ncbi_link`` and merged into one ordered list
    of tasks; a genbank row is dropped when its link (with ``GCA`` replaced
    by ``GCF``) equals the link of the refseq row just taken. The tasks are
    run through a process pool with ``worker``. Afterwards the shared dict
    of prepared bacteria is dumped into ``bactdb_dir`` and the bacteria list
    json file in ``bactdb_dir`` is closed and loaded.

    :param refseq_bacteria_table: path to the refseq table (tab-separated,
        columns ``ncbi_link``, ``org_name``, ``repr``)
    :param genbank_bacteria_table: path to the genbank table (same format).
        If both tables are None the default tables are used; if only one is
        None that source is treated as empty.
    :param bactdb_dir: database directory (created if possible)
    :param num_threads: if truthy it is stored into ``conf_constants``,
        otherwise the value from ``conf_constants`` is taken
    :param first_bact: 1-based number of the first merged entry to process
    :param last_bact: 1-based number of the last merged entry to process
    :param prepared_bacteria_f_path: json with already prepared bacteria,
        loaded if it exists
    :param remove_bact_list_f: not used
    :param config_path: optional config file used to refresh the constants
    :return: content of the bacteria list json file
    """

    if config_path:
        conf_constants.update_by_config(config_path=config_path)
        conf_constants_db.update_by_config(config_path=config_path)
    if not num_threads:
        num_threads = conf_constants.num_threads
    else:
        conf_constants.num_threads = num_threads
    if refseq_bacteria_table is None and genbank_bacteria_table is None:
        refseq_bacteria_table = DEFAULT_REFSEQ_BACTERIA_TABLE
        genbank_bacteria_table = DEFAULT_GENBANK_BACTERIA_TABLE
    try:
        os.makedirs(bactdb_dir)
    except OSError:
        eagle_logger.warning("bactdb directory exists")
    prepared_bacteria = mp.Manager().dict()
    if os.path.exists(prepared_bacteria_f_path):
        eagle_logger.info("loading prepared bacteria from '%s'" %
                          prepared_bacteria_f_path)
        with open(prepared_bacteria_f_path) as prepared_bacteria_f:
            prepared_bacteria.update(json.load(prepared_bacteria_f))
        eagle_logger.info("prepared bacteria loaded")

    # The list file is opened as a json array here and closed after the pool
    # has finished (see the end of the function).
    bacteria_list_f_path = os.path.join(bactdb_dir, BACTERIA_LIST_F_NAME)
    with io.open(bacteria_list_f_path, 'w', newline="\n") as bacteria_list_f:
        bacteria_list_f.write(u"[\n")

    refseq_df = _load_ncbi_table(refseq_bacteria_table)
    genbank_df = _load_ncbi_table(genbank_bacteria_table)
    n_refseq = refseq_df.shape[0]
    n_genbank = genbank_df.shape[0]

    n = 1  # 1-based number of the merged entry being considered
    i = 0  # position in refseq_df
    j = 0  # position in genbank_df
    params_list = list()
    while i < n_refseq or j < n_genbank:
        if last_bact and n > last_bact:
            break
        both_available = i < n_refseq and j < n_genbank
        if both_available:
            take_genbank = genbank_df.iloc[j]["ncbi_link"].replace(
                "GCA", "GCF") < refseq_df.iloc[i]["ncbi_link"]
        else:
            take_genbank = i >= n_refseq
        if take_genbank:
            row, source_db = genbank_df.iloc[j], "genbank"
            j += 1
        else:
            row, source_db = refseq_df.iloc[i], "refseq"
            i += 1
        # Skip the genbank twin of the last refseq entry taken. The bounds
        # checks prevent indexing past the end of genbank_df (or wrapping
        # around to the last refseq row when none has been taken yet).
        if (both_available and i > 0 and j < n_genbank
                and genbank_df.iloc[j]["ncbi_link"].replace("GCA", "GCF") ==
                refseq_df.iloc[i - 1]["ncbi_link"]):
            j += 1
        # Entries numbered below first_bact are consumed but not scheduled.
        if not first_bact or n >= first_bact:
            params_list.append(
                _bacterium_params(row, source_db, prepared_bacteria,
                                  bactdb_dir))
        n += 1

    eagle_logger.info("got download links for %s bacteria" % len(params_list))
    pool = mp.Pool(num_threads)
    pool.map(worker, params_list)
    pool.close()
    pool.join()
    with open(os.path.join(bactdb_dir, PREPARED_BACTERIA_F_NAME),
              "w") as prepared_bacteria_f:
        json.dump(dict(prepared_bacteria), prepared_bacteria_f)
    # Terminate the json array with an empty object.
    with io.open(bacteria_list_f_path, 'a', newline="\n") as bacteria_list_f:
        bacteria_list_f.write(u"  {}\n]")
    with open(bacteria_list_f_path) as bacteria_list_f:
        return json.load(bacteria_list_f)
Ejemplo n.º 7
0
def explore_orfs(in_fasta,
                 db_json,
                 out_dir="",
                 min_orf_l=None,
                 btax_name=None,
                 num_threads=None,
                 btax_det_method="hmmer",
                 config_path=None,
                 **kwargs):
    """Find ORFs in ``in_fasta``, analyse them and write a gtf table.

    ORFs are extracted with ``get_orfs``. Unless the base taxon turns out to
    be ``"Unclassified"``, the ORFs are searched with tblastn against the
    base taxon BLAST database and the result is processed by
    ``analyze_tblastn_out``. The resulting records are written, sorted by
    ``start``, to ``<out_dir>/<basename of in_fasta>.gtf``.

    :param in_fasta: path to the input fasta file
    :param db_json: path to the database info json or the already loaded
        dict; any other type logs an error and returns None
    :param out_dir: output directory (created if missing); an empty string
        means the current working directory
    :param min_orf_l: if truthy it is stored into ``conf_constants``,
        otherwise the value from ``conf_constants`` is taken
    :param btax_name: base taxon name; detected with ``get_btax_name`` when
        None
    :param num_threads: if truthy it is converted to int and stored into
        ``conf_constants``, otherwise the value from there is taken
    :param btax_det_method: method passed to ``get_btax_name``
    :param config_path: optional config file used to refresh the constants
    :param kwargs: ``save_alignments``, ``save_trees`` and
        ``tblastn_result_path`` are read
    :return: None
    """

    if config_path:
        conf_constants.update_by_config(config_path)
    if num_threads:
        conf_constants.num_threads = int(num_threads)
    num_threads = conf_constants.num_threads
    if min_orf_l:
        conf_constants.min_orf_l = min_orf_l
    min_orf_l = conf_constants.min_orf_l

    save_alignments = kwargs.get("save_alignments", False)
    save_trees = kwargs.get("save_trees", False)

    # out_dir defaults to "" and os.makedirs("") raises, so only create a
    # directory when a non-empty path was actually given.
    if out_dir and not os.path.exists(out_dir):
        os.makedirs(out_dir)
    if save_alignments and not os.path.exists(
            os.path.join(out_dir, ORF_ALNS_DIR)):
        os.makedirs(os.path.join(out_dir, ORF_ALNS_DIR))
    if save_trees and not os.path.exists(
            os.path.join(out_dir, ORF_TREES_DIR)):
        os.makedirs(os.path.join(out_dir, ORF_TREES_DIR))

    if isinstance(db_json, str):
        with open(db_json) as db_json_f:
            db_info = DBInfo.load_from_dict(json.load(db_json_f))
    elif isinstance(db_json, dict):
        db_info = DBInfo.load_from_dict(db_json)
    else:
        eagle_logger.error("Unsupported type of value for 'db_json' argument")
        return
    with open(db_info.btax_json) as btax_dict_f:
        btax_dict = json.load(btax_dict_f)
    if btax_name is None:
        btax_name = get_btax_name(in_fasta,
                                  db_info.repr_profiles,
                                  btax_names=btax_dict.keys(),
                                  work_dir=out_dir,
                                  num_threads=conf_constants.num_threads,
                                  method=btax_det_method,
                                  hmmer_inst_dir=conf_constants.hmmer_inst_dir,
                                  config_path=config_path)

    in_fasta_name = os.path.basename(in_fasta)
    orfs_fasta_path = os.path.join(out_dir, in_fasta_name + ".orfs")
    res_gtf_json = get_orfs(in_fasta_path=in_fasta,
                            out_fasta_path=orfs_fasta_path,
                            minsize=min_orf_l)
    blast_handler = BlastHandler(inst_dir=conf_constants.blast_inst_dir,
                                 config_path=config_path,
                                 logger=eagle_logger)

    if btax_name == "Unclassified":
        eagle_logger.warning(
            "The family was not detected - cannot run further analysis")
    else:
        btax_info = BtaxInfo.load_from_dict(btax_dict[btax_name])
        eagle_logger.info("Family '%s' will be used for the sequence from %s" %
                          (btax_name, in_fasta))
        # A ready tblastn result may be supplied for debug and testing.
        tblastn_out_path = kwargs.get("tblastn_result_path", None)
        if tblastn_out_path is None:
            tblastn_out_path = os.path.join(out_dir, in_fasta_name + ".bl")
            blast_handler.run_blast_search(blast_type="tblastn",
                                           query=orfs_fasta_path,
                                           db=btax_info.blastdb,
                                           out=tblastn_out_path,
                                           num_threads=num_threads)
        res_gtf_json = analyze_tblastn_out(
            tblastn_out_path=tblastn_out_path,
            orfs_fasta_path=orfs_fasta_path,
            in_fasta=in_fasta,
            btax_data=btax_dict[btax_name],
            res_gtf_json=res_gtf_json,
            num_threads=conf_constants.num_threads,
            work_dir=out_dir,
            save_alignments=save_alignments,
            save_trees=save_trees)
    # Materialise the values view: not every pandas version accepts it.
    res_gtf_df = pd.DataFrame(list(res_gtf_json.values()))
    res_gtf_df.sort_values("start", inplace=True)
    res_gtf_df = res_gtf_df[[
        "seqid", "source", "type", "start", "end", "score", "strand", "frame",
        "attribute"
    ]]
    res_gtf_df.to_csv(os.path.join(out_dir, in_fasta_name + ".gtf"),
                      sep="\t",
                      index=False,
                      quotechar="'")
Ejemplo n.º 8
0
def _parse_cmd_args(*args):
    """Parse the command line of the ORFs exploration tool.

    :param args: command-line tokens (without the program name)
    :return: dict of parsed options keyed by argparse destination name

    If ``--config-path`` is given, ``conf_constants`` is refreshed from that
    file and ``num_threads`` in the result is replaced by the value held in
    ``conf_constants`` afterwards.
    """
    # (flags, keyword options) for every supported argument, in help order.
    arg_specs = (
        (("-i", "--in-fasta"),
         dict(help="Path to input fasta file", required=True)),
        (("-db", "--db-json"),
         dict(help="Path to json with eagledb to use description",
              required=True)),
        (("-o", "--out-dir"),
         dict(help="Path to the directory for output",
              required=False,
              default="")),
        (("-l", "--min-orf-l"),
         dict(help="Minimal length for ORF to analyze", required=False)),
        (("-btn", "--btax-name"),
         dict(help="The name of base taxon. If specified eagle will not scan the eagledb and "
              "will work straight with this base taxon. Applicable only with 'genome' mode",
              required=False,
              default=None)),
        (("-nt", "--num-threads"),
         dict(help="Number of threads",
              required=False,
              default=conf_constants.num_threads)),
        (("-btd", "--btax-det-method"),
         dict(help="Method name to detect base taxon for input sequence (default: 'hmmer')",
              required=False,
              default="hmmer")),
        (("-c", "--config-path"),
         dict(help="Path to a config file", required=False, default=None)),
        (("-tbnr", "--tblastn-result-path"),
         dict(help="Path to tblastn result (outfmt 7) if it exists",
              required=False,
              default=None)),
        (("-sa", "--save-alignments"),
         dict(help="Set it '1' if ORFs multiple alignments are needed to be saved",
              required=False,
              default=False)),
        (("-st", "--save-trees"),
         dict(help="Set it '1' if ORFs phylogenetic trees are needed to be saved",
              required=False,
              default=False)),
    )

    parser = argparse.ArgumentParser()
    for flags, options in arg_specs:
        parser.add_argument(*flags, **options)

    parsed = parser.parse_args(args)
    if parsed.config_path:
        conf_constants.update_by_config(config_path=parsed.config_path)
        parsed.num_threads = conf_constants.num_threads
    return vars(parsed)