Example #1
0
def analyze_tblastn_out(tblastn_out_path,
                        orfs_fasta_path,
                        in_fasta,
                        btax_data,
                        res_gtf_json,
                        num_threads=conf_constants.num_threads,
                        work_dir="",
                        save_alignments=False,
                        save_trees=False):
    """Compute per-ORF statistics from a tblastn run and attach them to the GTF json.

    For every ORF in orfs_fasta_path a parameter dict is prepared and handed to
    get_orf_stats via a multiprocessing pool; the resulting stats are written
    into res_gtf_json[orf_id]["attribute"] as a JSON string.

    Returns the updated res_gtf_json.
    """
    genome_fasta_dict = load_fasta_to_dict(fasta_path=in_fasta)
    btax_info = BtaxInfo.load_from_dict(btax_data)
    orfs_stats = mp.Manager().dict()  # shared between worker processes
    seq_ids_to_orgs = btax_info.fna_id
    blast_hits_dict = read_blast_out(blast_out_path=tblastn_out_path)
    orf_prot_dict = load_fasta_to_dict(fasta_path=orfs_fasta_path)

    params_list = list()
    for orf_id in orf_prot_dict:
        chr_id, start, end, strand = parse_orf_id(orf_id)
        orf_nucl_seq = genome_fasta_dict[chr_id][start - 1:end]
        if strand != "+":
            # ORF lies on the reverse strand: take the reverse complement
            orf_nucl_seq = str(Seq(orf_nucl_seq).reverse_complement())
        params_list.append({
            "function": get_orf_stats,
            "orf_id": orf_id,
            "orf_homologs_prot": {orf_id: orf_prot_dict[orf_id]},
            "orf_homologs_nucl": {orf_id: orf_nucl_seq},
            "homologs_list": blast_hits_dict[orf_id],
            "btax_data": btax_data,
            "seq_ids_to_orgs": seq_ids_to_orgs,
            "orfs_stats": orfs_stats,
            "work_dir": work_dir,
            "save_alignment": save_alignments,
            "save_tree": save_trees,
        })

    pool = mp.Pool(num_threads)
    pool.map(worker, params_list)
    # list(map(worker, params_list))  # for debug
    pool.close()
    pool.join()

    # drop references to the large per-sequence dicts before post-processing
    blast_hits_dict = None
    orf_prot_dict = None
    eagle_logger.info("ORFs stats calculated")
    for orf_id, orf_stats in orfs_stats.items():
        try:
            res_gtf_json[orf_id]["attribute"] = json.dumps(orf_stats)
        except KeyError:
            # ORF absent from the GTF json -- nothing to annotate
            pass
    orfs_stats = None
    return res_gtf_json
Example #2
0
    def nucl_by_prot_aln(self, nucl_fasta_dict=None, nucl_fasta_path=None):
        """Back-translate this protein alignment to a nucleotide alignment.

        For every aligned sequence the ungapped protein string is located in
        the translation of the matching nucleotide sequence, and the aligned
        nucleotide sequence is rebuilt codon-by-codon with nucl_accord_prot.

        :param nucl_fasta_dict: mapping seq_name -> nucleotide sequence;
            takes precedence over nucl_fasta_path
        :param nucl_fasta_path: path to a FASTA file with the nucleotide
            sequences (used only if nucl_fasta_dict is not given)
        :return: a new nucleotide alignment object, or None on input errors
        """
        if self.aln_type.lower() not in ("protein", "prot", "p"):
            if self.logger:
                self.logger.error("reference alignment type is not protein")
            else:
                print("ERROR: reference alignment type is not protein")
            return
        if not nucl_fasta_dict:
            if nucl_fasta_path:
                nucl_fasta_dict = load_fasta_to_dict(fasta_path=nucl_fasta_path)
            else:
                if self.logger:
                    self.logger.error("no nucleotide sequences are input")
                else:
                    print("ERROR: no nucleotide sequences are input")
                return

        nucl_aln_dict = dict()
        for seq_name in self.seq_names:
            # The ungapped protein sequence is used as a regex pattern;
            # '*' (stop codon) becomes the '.' wildcard.
            # Raw string r"[-.]" strips gap characters (was "[-\.]": an
            # invalid escape sequence emitting a SyntaxWarning on Python 3.12+).
            # NOTE(review): re.search may return None if the protein does not
            # occur in the translation -- that would raise AttributeError below;
            # confirm upstream guarantees a match.
            match = re.search(re.sub(r"[-.]", "", self[seq_name]).replace("*", "."),
                              str(Seq(nucl_fasta_dict[seq_name]).translate()))
            nucl_aln_dict[seq_name] = nucl_accord_prot(self[seq_name],
                                                       nucl_fasta_dict[seq_name][match.start()*3: match.end()*3])
        nucl_aln = self._sub_mult_aln(SeqsDict.load_from_dict(nucl_aln_dict),
                                      aln_type="nucl",
                                      aln_name="nucl_"+self.aln_name)
        return nucl_aln
Example #3
0
    def run_blast_search(self, blast_type, query, db, out, num_threads=1, outfmt=7, max_hsps=100, **kwargs):
        """Run a BLAST search, optionally splitting the query FASTA for parallelism.

        When num_threads > 1 (and kwargs 'split_input' is not False) the query
        file is split into num_threads chunks, one child process runs this
        method per chunk with num_threads=1, and the per-chunk outputs are
        joined into 'out'. Otherwise a single BLAST command is executed.

        :param blast_type: BLAST program name (e.g. 'blastn', 'tblastn')
        :param query: path to the query FASTA file
        :param db: path to the BLAST database
        :param out: path for the (joined) BLAST output
        :param kwargs: 'split_input' (bool), 'word_size' (int or str),
            'remove_tmp' (bool) and options forwarded to child processes
        """
        if num_threads > 1 and kwargs.get("split_input", True):
            if self.logger is not None:
                self.logger.info("splitting '%s' into %s parts" % (query, num_threads))
            else:
                print("INFO: splitting '%s' into %s parts" % (query, num_threads))
            if not os.path.exists(self.tmp_dir):
                os.makedirs(self.tmp_dir)
            query_dict = load_fasta_to_dict(fasta_path=query, dat_path=os.path.join(self.tmp_dir, ".%s.dat" % query))
            query_chunk_size = len(query_dict) // num_threads + 1
            p_list = list()
            query_seqs = list(query_dict.keys())
            query_chunks_list = list()
            for i in range(num_threads):
                query_chunk_path = os.path.join(self.tmp_dir,
                                                ("_%s" % i).join(os.path.splitext(os.path.basename(query))))
                query_chunks_list.append([query_chunk_path, query_chunk_path + ".bl"])
                if len(query_dict) == i*query_chunk_size:
                    # all sequences already distributed -- drop the empty chunk
                    del query_chunks_list[-1]
                    continue
                elif (i+1) * query_chunk_size > len(query_dict):
                    # last (possibly short) chunk takes the remaining sequences
                    query_dict.get_sample(query_seqs[i * query_chunk_size:]).dump(seqs_path=query_chunks_list[-1][0])
                else:
                    query_dict.get_sample(query_seqs[i*query_chunk_size:
                                                     (i+1)*query_chunk_size]).dump(seqs_path=query_chunks_list[-1][0])
                p = mp.Process(target=self.run_blast_search,
                               args=(blast_type, query_chunks_list[-1][0], db,
                                     query_chunks_list[-1][1], 1, outfmt, max_hsps),
                               kwargs=kwargs)
                p.start()
                p_list.append(p)
            for p in p_list:
                p.join()
            join_files(in_files_list=list(map(lambda p: p[1], query_chunks_list)), out_file_path=out)

            if kwargs.get("remove_tmp", True):
                shutil.rmtree(self.tmp_dir)
        else:
            # str() around word_size so an int value passed via kwargs works too
            # (previously only the default was stringified and an int raised TypeError)
            blast_search_cmd = os.path.join(self.inst_dir, blast_type) + \
                               " -query " + query + \
                               " -db " + db + \
                               " -out " + out + \
                               " -word_size " + str(kwargs.get("word_size", 3)) + \
                               " -num_threads " + str(num_threads) + \
                               " -outfmt " + str(outfmt) + \
                               " -max_hsps " + str(max_hsps)
            if self.logger is not None:
                self.logger.info("run '%s' command" % blast_search_cmd)
            else:
                print("INFO: run '%s' command" % blast_search_cmd)
            # NOTE(review): shell=True with string-concatenated paths is unsafe
            # if any path can contain shell metacharacters; kept for
            # compatibility with existing callers.
            subprocess.call(blast_search_cmd, shell=True)
Example #4
0
def detect_seqs_type(fasta_path=None, fasta_dict=None, nuc_freq_thr=0.75):
    """Guess whether a set of sequences is nucleotide or protein.

    Counts the fraction of a/c/g/t characters over all sequences (gaps
    removed, case-insensitive). If that fraction reaches nuc_freq_thr the
    set is classified as nucleotide.

    BUGFIX: previously the analysis only ran when fasta_path was given, so
    passing fasta_dict directly always returned None.

    :param fasta_path: path to a FASTA file (used if fasta_dict is not given)
    :param fasta_dict: mapping seq_id -> sequence string
    :param nuc_freq_thr: minimal a/c/g/t fraction to call the set nucleotide
    :return: "nucl", "prot", or None if no sequences are available
    """
    if fasta_dict is None and fasta_path:
        fasta_dict = load_fasta_to_dict(fasta_path)
    if not fasta_dict:
        return None
    seqs_list = [seq.lower().replace("-", "") for seq in fasta_dict.values()]
    summ_l = sum(len(seq) for seq in seqs_list)
    if summ_l == 0:
        # only empty/gap-only sequences -- nothing to classify
        return None
    let_counts = Counter("".join(seqs_list))
    nucl_letters = (let_counts.get("a", 0) + let_counts.get("c", 0) +
                    let_counts.get("g", 0) + let_counts.get("t", 0))
    if float(nucl_letters) / float(summ_l) >= nuc_freq_thr:
        return "nucl"
    else:
        return "prot"
Example #5
0
 def run_hmmscan(self, profiles_db, in_fasta, num_threads=4, out_path=None):
     """Run hmmscan of profiles_db against in_fasta (shredded into chunks).

     The input sequences are shredded into overlapping 50 kb parts, dumped
     to a temporary FASTA and scanned; results go to out_path (defaults to
     the input path with its extension replaced by '.hsr'). The temporary
     directory is removed afterwards.
     """
     if not os.path.exists(self.tmp_dir):
         os.makedirs(self.tmp_dir)
     shredded_fasta_path = os.path.join(self.tmp_dir, os.path.basename(in_fasta))
     if not out_path:
         # BUGFIX: use splitext instead of checking '"." in in_fasta' --
         # a dot in a directory name used to truncate the whole path
         # (e.g. 'run.v2/reads' became 'run.hsr').
         out_path = os.path.splitext(in_fasta)[0] + ".hsr"
     in_fasta_dict = load_fasta_to_dict(fasta_path=in_fasta)
     shredded_in_fasta = shred_seqs(fasta_dict=in_fasta_dict, part_l=50000, parts_ov=5000)
     fasta_to_scan_dict = OrderedDict()
     for seq_id in shredded_in_fasta:
         # number the shredded parts of each sequence from 1
         for i, seq in enumerate(shredded_in_fasta[seq_id], start=1):
             fasta_to_scan_dict[seq_id+"_"+str(i)] = seq
     dump_fasta_dict(fasta_dict=fasta_to_scan_dict, fasta_path=shredded_fasta_path)
     hmmscan_cmd = os.path.join(self.inst_dir, "hmmscan") + " --cpu " + str(num_threads) + " " + profiles_db + " " +\
                   shredded_fasta_path + " > " + out_path
     subprocess.call(hmmscan_cmd, shell=True)
     shutil.rmtree(self.tmp_dir, ignore_errors=True)
Example #6
0
def prepare_family(family_name, family_data, bact_fam_f_path, db_dir,
                   **kwargs):
    """Build reference data for one bacterial family and append it to a JSON file.

    Loads all 16S rRNA sequences of the family, aligns them (MUSCLE), removes
    paralogs, builds a distance tree, downloads genome files, creates a BLAST
    database and HMM profiles, then appends the family entry to the file at
    bact_fam_f_path.

    :param family_name: name of the family (used for file naming and logging)
    :param family_data: nested dict {genus: {species: {strain: {...}}}},
        mutated in place and serialized at the end
    :param bact_fam_f_path: path of the JSON-fragment file appended to
    :param db_dir: directory for generated database files
    """
    # TODO: refactor it
    special_keys = ("16S_rRNA_tree", "16S_rRNA_tsv", "16S_rRNA_fasta",
                    "blastdb", "repr_profile")
    rRNA_seqs_dict = dict()  # {seq_id: seq}
    ids_to_org_dict = dict()  # {seq_id: bacterium_name}
    for genus in family_data.keys():
        for species in family_data[genus].keys():
            for strain in family_data[genus][species].keys():
                bacterium_rRNA_dict = load_fasta_to_dict(
                    family_data[genus][species][strain]["16S_rRNA_file"])
                for rRNA_id in bacterium_rRNA_dict.keys():
                    # ids look like 'xxx|ACCESSION|...' -- keep the accession
                    new_rRNA_id = rRNA_id.split(" ")[0].split("|")[1]
                    ids_to_org_dict[new_rRNA_id] = strain
                    rRNA_seqs_dict[new_rRNA_id] = bacterium_rRNA_dict[rRNA_id]
    # first element (reduced fasta dict) of the result is not needed here
    _, reduced_orgs = reduce_seq_names(fasta_dict=dict(
        map(lambda x: (x, True), set(ids_to_org_dict.values()))),
                                       num_letters=7,
                                       num_words=2)
    rev_reduced_orgs = dict(map(lambda x: (x[1], x[0]), reduced_orgs.items()))
    comp_seq_id_dict = defaultdict(int)
    short_ids_dict = dict()
    # BUGFIX: iterate over a snapshot of the keys -- the loop body pops and
    # inserts keys in both dicts, which is unsafe on a live dict view.
    for seq_id in list(rRNA_seqs_dict.keys()):
        short_seq_id = rev_reduced_orgs[ids_to_org_dict[seq_id]]+"x"+\
                       str(get_un_fix(un_num=comp_seq_id_dict[rev_reduced_orgs[ids_to_org_dict[seq_id]]], fix_len=2))
        comp_seq_id_dict[rev_reduced_orgs[ids_to_org_dict[seq_id]]] += 1
        short_ids_dict[short_seq_id] = rev_reduced_orgs[
            ids_to_org_dict[seq_id]] + "x" + seq_id
        ids_to_org_dict[short_ids_dict[short_seq_id]] = {
            "organism_name": ids_to_org_dict.pop(seq_id)
        }
        rRNA_seqs_dict[short_seq_id] = rRNA_seqs_dict.pop(seq_id)
    eagle_logger.info("%s rRNA loaded" % family_name)
    # TODO: follows
    ### This section will be upgraded with my own alignment method but now MUSCLE and hmmer 3 are used
    tmp_fam_dir = os.path.join(db_dir, family_name + "_tmp")
    rRNA_aln = construct_mult_aln(
        seq_dict=rRNA_seqs_dict,
        method="MUSCLE",
        aln_type="nucl",
        aln_name=family_name + "_rRNA_aln",
        tmp_dir=tmp_fam_dir,
        muscle_exec_path=conf_constants.muscle_exec_path,
        emboss_inst_dir=conf_constants.emboss_inst_dir,
        hmmer_inst_dir=conf_constants.hmmer_inst_dir,
        logger=eagle_logger)
    eagle_logger.info("%s rRNA alignment constructed" % family_name)
    rRNA_aln.short_to_full_seq_names = short_ids_dict
    # May be errors: not tested
    rRNA_aln.remove_paralogs(
        ids_to_org_dict, method="min_dist",
        inplace=True)  # If I use my own alignment method: method="spec_pos"
    rRNA_tree = build_tree_by_dist(
        rRNA_aln.get_distance_matrix(),
        full_seq_names=rRNA_aln.full_to_short_seq_names,
        tmp_dir=tmp_fam_dir,
        logger=eagle_logger)
    # TODO: write it for not only repr bacteria usage
    # fam_tax = {family_name: get_tree_from_dict(family_data, stop_level=3, special_keys=special_keys)}
    # rRNA_tree, removed_seqs = rRNA_tree.according_to_taxonomy(taxonomy=fam_tax)
    # rRNA_aln.remove_seqs(seqs_list=removed_seqs)
    ###
    family_data["16S_rRNA_tree"] = {
        "newick": rRNA_tree.newick,
        "full_seq_names": rRNA_tree.full_seq_names
    }
    family_data["16S_rRNA_tsv"] = os.path.join(db_dir,
                                               family_name + "_16S_rRNA.tsv")
    family_data["16S_rRNA_fasta"] = os.path.join(
        db_dir, family_name + "_16S_rRNA.fasta")
    rRNA_aln.get_blocks_tsv(tsv_path=family_data["16S_rRNA_tsv"],
                            fasta_path=family_data["16S_rRNA_fasta"],
                            meta_dict=ids_to_org_dict)
    remained_orgs = list(
        map(lambda seq_id: ids_to_org_dict[seq_id]["organism_name"],
            rRNA_aln.seq_names()))
    family_data = clean_btax_data(family_data,
                                  remained_orgs,
                                  stop_level=3,
                                  special_keys=special_keys)
    family_data = download_btax_files(
        key_prefix_pairs={"fna_file": "_genomic.fna.gz"},
        btax_data=family_data,
        download_dir=db_dir,
        logger=eagle_logger)
    family_data["fam_fna"], family_data["chr_id"] = get_btax_fna(
        fna_key="fna_file",
        btax_genomes=family_data,
        btax_name=family_name,
        db_dir=db_dir)
    family_data["blastdb"] = create_btax_blastdb(
        btax_fna_path=family_data["fam_fna"],
        btax_name=family_name,
        db_dir=db_dir,
        blast_inst_dir=conf_constants.blast_inst_dir,
        logger=eagle_logger)
    # repr_alns = <function that builds alignments for set of representative genes (returns dict = {aln_name: MultAln object})>
    family_data["repr_profile"] = generate_btax_profiles(
        source={"16S_rRNA": rRNA_aln},
        db_dir=db_dir,
        btax_name=family_name,
        method="hmmer")  # TODO: the source should be repr_alns
    # family_data["codon_usage"] = get_btax_cu(family_data)
    # Context manager guarantees the file is closed even if json.dumps fails.
    with open(bact_fam_f_path, 'a') as bact_fam_json_f:
        bact_fam_json_f.write('  "' + family_name + '": ' +
                              json.dumps(family_data) + ",\n")
    eagle_logger.info("%s prepared" % family_name)
Example #7
0
def get_btax_dict(genomes_list,
                  btax_level,
                  btc_profiles,
                  db_dir,
                  num_threads=None,
                  build_tree=False,
                  config_path=None,
                  **kwargs):
    """Group genomes into base taxons (btax) and build per-taxon reference data.

    Genomes are binned by the taxonomy level btax_level, the base-taxon
    classification (btc) sequences are aligned per profile, a global distance
    matrix is computed and dumped, taxons are standardized, and (optionally)
    per-taxon reference trees and HMM profiles are generated.

    :param genomes_list: list of genome-info dicts (GenomeInfo json)
    :param btax_level: taxonomy level index counted from the end of the list
    :param btc_profiles: list of SeqProfileInfo dicts for the btc profiles
    :param db_dir: output directory for matrices, alignments and profiles
    :param num_threads: worker count; falls back to conf_constants.num_threads
    :param build_tree: also build per-taxon trees and repr profiles
    :param config_path: optional config file to update global constants from
    :return: dict {btax_name: BtaxInfo json dict}
    """
    if config_path:
        conf_constants.update_by_config(config_path=config_path)
        conf_constants_db.update_by_config(config_path=config_path)
    if not num_threads:
        num_threads = conf_constants.num_threads
    else:
        conf_constants.num_threads = num_threads

    btax_dict = defaultdict(BtaxInfo)
    btc_fasta_dict = defaultdict(dict)
    seq_ids_to_orgs = dict()
    for genome_dict in genomes_list:
        if not genome_dict:
            continue
        genome_info = GenomeInfo.load_from_dict(genome_dict)
        if not genome_info.btc_seqs_id:
            continue
        try:
            # btax_level counts from the end of the taxonomy list
            btax_name = genome_info.taxonomy[-btax_level]
        except IndexError:
            # taxonomy shorter than btax_level -- fall back to the root level
            btax_name = genome_info.taxonomy[0]
        btax_dict[btax_name].genomes.append(genome_info.get_json())
        if btax_dict[btax_name].name is None:
            btax_dict[btax_name].name = btax_name
        btc_seqs_fasta_dict = load_fasta_to_dict(genome_info.btc_seqs_fasta)
        for btc_seq_id in genome_info.btc_seqs_id:
            seq_ids_to_orgs[btc_seq_id] = genome_info.org_name
            btc_fasta_dict[genome_info.btc_seqs_id[btc_seq_id]][
                btc_seq_id] = btc_seqs_fasta_dict[btc_seq_id]

    btc_profile_types = dict()
    for btc_profile_dict in btc_profiles:
        btc_profile_info = SeqProfileInfo.load_from_dict(btc_profile_dict)
        btc_profile_types[btc_profile_info.name] = btc_profile_info.seq_type
    btc_dist_dict = dict()
    btc_aln_dict = dict()
    short_to_full_seq_names = dict()
    for btc_profile_name in btc_fasta_dict:
        btc_mult_aln = construct_mult_aln(
            seq_dict=btc_fasta_dict[btc_profile_name],
            aln_type=btc_profile_types[btc_profile_name],
            aln_name=btc_profile_name + "_aln",
            tmp_dir=kwargs.get("aln_tmp_dir", "mult_aln_tmp"),
            method=conf_constants_db.btc_profile_aln_method,
            num_threads=num_threads,
            logger=eagle_logger,
            op=5.0,
            ep=0.5,
            **kwargs)  # low_memory can be set through kwargs

        # TODO: only the code from else block should be remained after moving 16S rRNA obtaining out from get_bacteria_from_ncbi
        if btc_profile_name == "16S_rRNA":
            # raw string fixes the invalid '\|' escape sequence in the pattern
            btc_mult_aln.short_to_full_seq_names = \
                reduce_seq_names({re.sub(r"lcl\|(N(C|Z)_)?", "", seq_name): seq_name for seq_name in btc_mult_aln},
                                 num_letters=10, num_words=1)[0]
        else:
            btc_mult_aln.short_to_full_seq_names = short_to_full_seq_names.copy()

        btc_mult_aln.remove_paralogs(seq_ids_to_orgs=seq_ids_to_orgs,
                                     inplace=True)
        btc_mult_aln.improve_aln(inplace=True)
        btc_dist_dict[btc_profile_name] = btc_mult_aln.get_distance_matrix()  # TODO: implement specific positions method
        short_to_full_seq_names.update(btc_mult_aln.short_to_full_seq_names)
        if kwargs.get("save_alignments", False):
            btc_mult_aln.dump_alignment(
                aln_fasta_path=os.path.join(db_dir, btc_mult_aln.aln_name +
                                            ".fasta"))
        btc_mult_aln.rename_seqs(seq_ids_to_orgs)
        btc_aln_dict[btc_profile_name] = deepcopy(btc_mult_aln)

    global_dist_matr = get_global_dist(btc_dist_dict, btc_profiles,
                                       seq_ids_to_orgs)
    global_dist_matr_path = os.path.join(db_dir, BACTERIA_GLOBAL_DIST_MATRIX)
    short_to_full_seq_names_path = os.path.join(
        db_dir, BACTERIA_SHORT_TO_FULL_ORG_NAMES)
    # NOTE(review): dump() replaces the accumulated mapping with the one used
    # for the phylip matrix -- presumably intentional; confirm.
    short_to_full_seq_names = global_dist_matr.dump(
        matrix_path=global_dist_matr_path, matr_format="phylip")
    with open(short_to_full_seq_names_path, "w") as short_to_full_org_names_f:
        json.dump(short_to_full_seq_names, short_to_full_org_names_f, indent=2)

    eagle_logger.info("base taxons standardisation started")
    btax_dict = standardize_btax(btax_dict=btax_dict,
                                 global_dist_matr=global_dist_matr)
    eagle_logger.info("base taxons standardisation finished")

    full_to_short_seq_names = {
        v: k
        for k, v in short_to_full_seq_names.items()
    }
    for btax_name in btax_dict:
        btax_orgs = set(
            GenomeInfo.load_from_dict(genome).org_name
            for genome in btax_dict[btax_name].genomes)
        if build_tree:
            btax_dict[btax_name].mean_d = global_dist_matr[btax_orgs].mean_dist
            btax_dict[btax_name].median_d = global_dist_matr[
                btax_orgs].median_dist
            if len(btax_orgs) > 2:
                btax_dict[btax_name].ref_tree_newick = build_tree_by_dist(
                    global_dist_matr[btax_orgs],
                    tree_name=btax_name + "_tree").newick
                btax_btc_aln_dict = dict()
                for btc_profile_name, btc_aln in btc_aln_dict.items():
                    btax_btc_aln = btc_aln[btax_orgs].improve_aln(
                        inplace=False)
                    btax_btc_aln.aln_name = btax_name + "_" + btc_profile_name
                    btax_btc_aln_dict[btc_profile_name] = deepcopy(
                        btax_btc_aln)
                btax_dict[btax_name].repr_profiles = generate_btax_profiles(
                    btax_btc_aln_dict,
                    db_dir=db_dir,
                    btax_name=btax_name,
                    method="hmmer")
        btax_dict[btax_name].ref_tree_full_names = \
            {full_to_short_seq_names[btax_org]: btax_org for btax_org in btax_orgs}
        btax_dict[btax_name] = btax_dict[btax_name].get_json()
    return btax_dict
Example #8
0
def get_orf_stats(orf_id, orf_homologs_prot, orf_homologs_nucl, homologs_list,
                  btax_data, seq_ids_to_orgs, orfs_stats, work_dir, **kwargs):
    """Compute evolutionary statistics for one ORF and store them in orfs_stats.

    Extracts homolog sequences from the base-taxon FASTA, aligns them with the
    ORF (MUSCLE), and derives uniformity, distance, Ka/Ks and tree-comparison
    statistics. The result dict is written to orfs_stats[orf_id]; values stay
    at their sentinel (-1.0 / 0.0) when a statistic cannot be computed.

    :param orf_id: identifier of the ORF being analyzed
    :param orf_homologs_prot: dict {seq_id: protein seq}, extended in place
    :param orf_homologs_nucl: dict {seq_id: nucleotide seq}, extended in place
    :param homologs_list: list of tblastn hit dicts for this ORF
    :param btax_data: BtaxInfo json dict of the base taxon
    :param seq_ids_to_orgs: mapping seq_id -> organism name (mutated: the ORF
        itself is registered as "Input_Organism_X")
    :param orfs_stats: shared dict to receive {orf_id: stats}
    :param work_dir: directory for temporary and optional output files
    :param kwargs: 'save_alignment' and 'save_tree' flags
    """
    orf_stats = {
        "uniformity_std": -1.0,
        "phylo_diff": -1.0,
        "Ka/Ks": -1.0,
        "representation": 0.0,
        "relative_mean_btax_dist": -1.0,
        "relative_median_btax_dist": -1.0,
        "relative_mean_ORF_dist": -1.0,
        "relative_median_ORF_dist": -1.0,
        "stops_per_seq_median": -1.0,
        "seqs_with_stops_fract": -1.0,
    }

    if len(homologs_list) < 3:
        orfs_stats[orf_id] = orf_stats
        eagle_logger.warning("A few homologs number for ORF '%s'" % orf_id)
        return
    btax_info = BtaxInfo.load_from_dict(btax_data)
    seq_ids_to_orgs[orf_id] = "Input_Organism_X"
    btax_fna = load_fasta_to_dict(fasta_path=btax_info.btax_fna)
    for hom in homologs_list:
        if hom["subj_start"] <= hom["subj_end"]:
            nucl_seq = Seq(btax_fna[hom["subj_id"]][hom["subj_start"] -
                                                    1:hom["subj_end"]])
        else:
            # hit on the reverse strand: swap coordinates and reverse-complement
            nucl_seq = Seq(btax_fna[hom["subj_id"]]
                           [hom["subj_end"] -
                            1:hom["subj_start"]]).reverse_complement()
        try:
            orf_homologs_prot[hom["subj_id"]] = str(nucl_seq.translate())
        except TranslationError:
            # untranslatable hit -- skip it entirely
            continue
        orf_homologs_nucl[hom["subj_id"]] = str(nucl_seq)
    btax_fna = None  # release the large FASTA dict
    eagle_logger.info("got homologs sequences for ORF '%s'" % orf_id)
    orf_mult_aln = construct_mult_aln(
        seq_dict=orf_homologs_prot,
        aln_name=orf_id.replace("|:", "_") + "_aln",
        aln_type="prot",
        method="MUSCLE",
        tmp_dir=os.path.join(work_dir,
                             orf_id.replace("|:", "_") + "_aln_tmp"),
        logger=eagle_logger)
    eagle_logger.info("got multiple alignment for ORF '%s' homologs" % orf_id)
    orf_mult_aln.remove_paralogs(seq_ids_to_orgs=seq_ids_to_orgs,
                                 method="min_dist",
                                 inplace=True)
    # fraction of btax genomes represented among the homologs (excl. the ORF)
    orf_stats["representation"] = float(len(orf_mult_aln) - 1) / float(
        len(btax_info.genomes))
    if len(orf_mult_aln.seq_names) < 4:
        orfs_stats[orf_id] = orf_stats
        eagle_logger.warning("A few homologs number for ORF '%s'" % orf_id)
        return

    orf_mult_aln.improve_aln(inplace=True)
    dist_matrix = orf_mult_aln.get_distance_matrix().replace_negative(
        inplace=False)
    btax_dist_matrix = dist_matrix[list(
        filter(lambda seq_name: seq_name != orf_id, dist_matrix.seq_names))]
    # BUGFIX: this block divides by mean_d, so it must be guarded by
    # mean_d > 0.0 (previously it tested median_d, risking ZeroDivisionError
    # and skipping valid mean statistics).
    if btax_info.mean_d > 0.0:
        orf_stats[
            "relative_mean_btax_dist"] = btax_dist_matrix.mean_dist / btax_info.mean_d
        orf_stats["relative_mean_ORF_dist"] = dist_matrix[orf_id].mean(
        ) / btax_info.mean_d
    if btax_info.median_d > 0.0:
        orf_stats[
            "relative_median_btax_dist"] = btax_dist_matrix.median_dist / btax_info.median_d
        orf_stats["relative_median_ORF_dist"] = dist_matrix[orf_id].median(
        ) / btax_info.median_d
    stops_stats = orf_mult_aln.stop_codons_stats()
    orf_stats["stops_per_seq_median"] = stops_stats["stops_per_seq_median"]
    orf_stats["seqs_with_stops_fract"] = stops_stats["seqs_with_stops_fract"]

    # Uniformity
    orf_stats["uniformity_std"] = orf_mult_aln.estimate_uniformity(
        cons_thr=conf_constants.cons_thr,
        window_l=conf_constants.unif_window_l,
        windows_step=conf_constants.unif_windows_step)
    if np.isnan(orf_stats["uniformity_std"]):
        orf_stats["uniformity_std"] = -1.0
    eagle_logger.info("got uniformity_std for ORF '%s'" % orf_id)

    # Ka/Ks
    orf_kaks = orf_mult_aln.calculate_KaKs_windows(
        nucl_seqs_dict=orf_homologs_nucl)
    if not pd.isna(orf_kaks):
        orf_stats["Ka/Ks"] = orf_kaks
        eagle_logger.info("got Ka/Ks for ORF '%s'" % orf_id)

    # Phylo
    phylo_tmp_dir = os.path.join(work_dir,
                                 orf_id.replace("|:", "_") + "_phylo_tmp")
    try:
        # the ORF itself must not take part in the homolog tree
        del orf_mult_aln[orf_id]
    except KeyError:
        pass
    orf_homs_tree = build_tree_by_dist(
        dist_matrix=orf_mult_aln.get_distance_matrix(),
        method="FastME",
        full_seq_names=dict((short_id, seq_ids_to_orgs[full_id])
                            for short_id, full_id in
                            orf_mult_aln.short_to_full_seq_names.items()),
        tree_name=orf_id.replace("|:", "_") + "_tree",
        tmp_dir=phylo_tmp_dir,
        logger=eagle_logger)
    orf_homs_tree.set_full_names(inplace=True)
    btax_tree = PhyloTree.load_tree_from_str(
        tree_str=btax_info.ref_tree_newick,
        full_seq_names=btax_info.ref_tree_full_names,
        tree_name="btax_tree",
        tmp_dir=phylo_tmp_dir,
        logger=eagle_logger)
    btax_tree.set_full_names(inplace=True)
    phylo_diff = compare_trees(phylo_tree1=orf_homs_tree,
                               phylo_tree2=btax_tree,
                               method="Robinson-Foulds")
    if not pd.isna(phylo_diff):
        orf_stats["phylo_diff"] = phylo_diff
    eagle_logger.info("got phylo_diff for ORF '%s'" % orf_id)

    orfs_stats[orf_id] = orf_stats
    eagle_logger.info("got ORF '%s' stats" % orf_id)
    if kwargs.get("save_alignment", False):
        orf_mult_aln.dump_alignment(
            os.path.join(work_dir, ORF_ALNS_DIR,
                         orf_mult_aln.aln_name + ".fasta"))
    if kwargs.get("save_tree", False):
        orf_homs_tree.dump_tree(tree_path=os.path.join(
            work_dir, ORF_TREES_DIR, orf_homs_tree.tree_name + ".nwk"))
Example #9
0
 def load_alignment(cls, aln_fasta_path, aln_type=None, aln_name=None, config_path=None, logger=None, **kwargs):
     """Alternate constructor: build an alignment object from an aligned FASTA file.

     Extra kwargs are forwarded to load_fasta_to_dict.
     """
     aln_seqs_dict = load_fasta_to_dict(fasta_path=aln_fasta_path, **kwargs)
     return cls(mult_aln_dict=aln_seqs_dict,
                aln_type=aln_type,
                aln_name=aln_name,
                config_path=config_path,
                logger=logger)