Example #1
0
def load_data(organism, conf_parser: GenedescConfigParser):
    """Create and populate the data managers used to describe ``organism``.

    Up to three managers are produced:

    * the main ``WBDataManager`` for ``organism`` (always created and loaded
      through ``load_all_data_from_file``);
    * for ``"c_elegans"`` only, a plain ``DataManager`` loaded with the GO
      ontology and GO associations configured for the human orthologs;
    * when the organism's entry in the WormBase organisms info carries a
      truthy ``"main_sister_species"`` value, a ``WBDataManager`` for that
      species with its GO ontology and GO associations loaded.

    Args:
        organism: key of the species in the WormBase organisms info.
        conf_parser: configuration parser providing URLs and the cache dir.

    Returns:
        The tuple ``(df, sister_df, df_agr)``; the last two entries are
        ``None`` when they do not apply to ``organism``.
    """
    logger = logging.getLogger("WB Gene Description Pipeline - Data loader")
    organisms_info = conf_parser.get_wb_organisms_info()
    main_manager = WBDataManager(species=organism,
                                 go_relations=["subClassOf", "BFO:0000050"],
                                 do_relations=None,
                                 config=conf_parser)

    human_manager = None
    if organism == "c_elegans":
        human_manager = DataManager(do_relations=None,
                                    go_relations=["subClassOf", "BFO:0000050"])
        ontology_url = conf_parser.get_wb_human_orthologs_go_ontology()
        ontology_cache = os.path.join(conf_parser.get_cache_dir(),
                                      "wormbase_agr_human", "go_ontology.obo")
        human_manager.load_ontology_from_file(
            ontology_type=DataType.GO,
            ontology_url=ontology_url,
            ontology_cache_path=ontology_cache,
            config=conf_parser)
        associations_url = conf_parser.get_wb_human_orthologs_go_associations()
        associations_cache = os.path.join(conf_parser.get_cache_dir(),
                                          "wormbase_agr_human",
                                          "go_assoc.daf.gz")
        human_manager.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url=associations_url,
            associations_cache_path=associations_cache,
            config=conf_parser)

    sister_manager = None
    # A missing key and a falsy value both mean "no sister species".
    sister_species = organisms_info[organism].get("main_sister_species")
    if sister_species:
        sister_manager = WBDataManager(
            species=sister_species,
            go_relations=["subClassOf", "BFO:0000050"],
            do_relations=None,
            config=conf_parser)
        logger.info("Loading GO data for sister species")
        sister_manager.load_ontology_from_file(
            ontology_type=DataType.GO,
            ontology_url=sister_manager.go_ontology_url,
            ontology_cache_path=sister_manager.go_ontology_cache_path,
            config=conf_parser)
        sister_manager.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url=sister_manager.go_associations_url,
            associations_cache_path=sister_manager.go_associations_cache_path,
            config=conf_parser)

    logger.info("Loading all data for main species")
    main_manager.load_all_data_from_file()
    return main_manager, sister_manager, human_manager
 def test_load_protein_domain_data(self):
     """Smoke test: loading protein domain information must not raise."""
     manager = WBDataManager(species="c_elegans",
                             config=self.conf_parser,
                             go_relations=["subClassOf", "BFO:0000050"],
                             do_relations=None)
     manager.load_protein_domain_information()
     # Nothing is checked on the loaded data itself; getting here without
     # an exception is the success criterion.
     self.assertTrue(True)
 def test_load_orthology_data(self):
     """Loading orthology for ``c_remanei`` must yield at least one entry."""
     df = WBDataManager(do_relations=None,
                        go_relations=["subClassOf", "BFO:0000050"],
                        config=self.conf_parser,
                        species="c_remanei")
     df.load_orthology_from_file()
     # assertGreater reports the actual count on failure, unlike
     # assertTrue(len(...) > 0) which only prints "False is not true".
     self.assertGreater(len(df.orthologs), 0)
 def setUp(self):
     """Configure logging and build the fixtures shared by the tests.

     Stores the directory of this file in ``self.this_dir``, the parsed
     ``config_test_wb.yml`` in ``self.conf_parser`` and a ``c_elegans``
     ``WBDataManager`` in ``self.df``.
     """
     log_format = '%(asctime)s - %(name)s - %(levelname)s: %(message)s'
     logging.basicConfig(filename=None, level="ERROR", format=log_format)
     logger.info("Starting DataManager tests")
     self.this_dir = os.path.dirname(__file__)
     config_path = os.path.join(self.this_dir, "config_test_wb.yml")
     self.conf_parser = GenedescConfigParser(config_path)
     self.df = WBDataManager(species="c_elegans",
                             config=self.conf_parser,
                             go_relations=["subClassOf", "BFO:0000050"],
                             do_relations=None)
def get_best_orthologs_and_sentence(dm: WBDataManager,
                                    orth_fullnames: List[str],
                                    gene_desc: GeneDescription,
                                    human_genes_props, api_manager,
                                    config: GenedescConfigParser):
    """Pick the best orthologs of a gene and build the orthology sentence.

    The best orthologs are requested from ``dm`` for the species listed in
    ``orth_fullnames``. When some are found, their first fields are recorded
    in ``gene_desc.stats.set_best_orthologs`` and a sentence is generated:
    with the human-specific generator when the only requested species is
    "Homo sapiens", with the generic WormBase generator otherwise.

    Args:
        dm: data manager queried for the best orthologs.
        orth_fullnames: full names of the candidate ortholog species.
        gene_desc: description of the gene; its ``gene_id`` is looked up and
            its ``stats`` are updated.
        human_genes_props: passed through to the human sentence generator.
        api_manager: passed through to the non-human sentence generator.
        config: configuration forwarded to both sentence generators.

    Returns:
        A ``(selected_orthologs, orth_sent)`` tuple. ``selected_orthologs``
        is only filled on the human path (orthologs whose upper-cased second
        field is among the ones the generator selected); ``orth_sent`` is
        ``''`` when no ortholog was found.
    """
    best_orthologs, selected_orth_name = dm.get_best_orthologs_for_gene(
        gene_desc.gene_id, orth_species_full_name=orth_fullnames)
    if not best_orthologs:
        return [], ''

    gene_desc.stats.set_best_orthologs = [orth[0] for orth in best_orthologs]

    # The species name was garbled to "H**o sapiens" in this copy of the
    # code, which made the human branch unreachable for real species names.
    if len(orth_fullnames) == 1 and orth_fullnames[0] == "Homo sapiens":
        sel_orthologs, orth_sent = generate_ortholog_sentence_wormbase_human(
            best_orthologs, human_genes_props, config=config)
        selected_orthologs = [
            orth for orth in best_orthologs
            if orth[1].upper() in sel_orthologs
        ]
        return selected_orthologs, orth_sent

    orth_sent = generate_ortholog_sentence_wormbase_non_c_elegans(
        best_orthologs,
        selected_orth_name,
        api_manager=api_manager,
        config=config)
    return [], orth_sent
def set_sister_species_sentence(dm: WBDataManager,
                                conf_parser: GenedescConfigParser,
                                sister_sp_fullname, sister_df: WBDataManager,
                                species, organism, gene_desc: GeneDescription,
                                gene: Gene):
    """Add a sentence based on the best ortholog in the sister species.

    The best ortholog of the gene in ``sister_sp_fullname`` is requested from
    ``dm`` (restricted to the experimental evidence codes listed below). GO
    biological-process ("involved_in") sentences are then generated for that
    ortholog from ``sister_df`` and, when any exist, stored in ``gene_desc``
    under ``Module.SISTER_SP``, prefixed with the sister species name and the
    ortholog's second field.

    Args:
        dm: data manager of the main species, queried for orthologs.
        conf_parser: configuration forwarded to the sentence generator.
        sister_sp_fullname: full name of the sister species.
        sister_df: data manager holding the sister species GO data.
        species: mapping of organism key to its info; read for
            ``"main_sister_species"`` and ``"name"``.
        organism: key of the main species in ``species``.
        gene_desc: gene description to extend.
        gene: not used by this function; kept for interface compatibility.

    Returns:
        None. Nothing is added when no ortholog is found or when no sentence
        can be generated.
    """
    best_orthologs = dm.get_best_orthologs_for_gene(
        gene_desc.gene_id,
        orth_species_full_name=[sister_sp_fullname],
        sister_species_data_fetcher=sister_df,
        ecode_priority_list=[
            "EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "HTP", "HDA", "HMP",
            "HGI", "HEP"
        ])[0]
    # The ortholog list can be empty (other callers test it for truthiness);
    # indexing it blindly would raise instead of simply skipping the sentence.
    if not best_orthologs:
        return
    best_ortholog = best_orthologs[0]
    # Normalise the ortholog id so that it carries the "WB:" prefix.
    # NOTE(review): this updates the ortholog record in place, as before.
    if not best_ortholog[0].startswith("WB:"):
        best_ortholog[0] = "WB:" + best_ortholog[0]
    sister_sentences_generator = OntologySentenceGenerator(
        gene_id=best_ortholog[0],
        module=Module.GO,
        data_manager=sister_df,
        config=conf_parser,
        # The species name was garbled to "H**o sapiens" in this copy of the
        # code, so the flag could never be True for a real species name.
        humans=sister_sp_fullname == "Homo sapiens",
        limit_to_group="EXPERIMENTAL")
    sister_sp_module_sentences = sister_sentences_generator.get_module_sentences(
        aspect='P',
        qualifier="involved_in",
        merge_groups_with_same_prefix=True,
        keep_only_best_group=True)
    if sister_sp_module_sentences.contains_sentences():
        sister_species_name = species[
            species[organism]["main_sister_species"]]["name"]
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.SISTER_SP,
            description="in " + sister_species_name + ", " +
            best_ortholog[1] + " " +
            sister_sp_module_sentences.get_description())
Example #7
0
class TestGOModule(unittest.TestCase):
    """Tests for loading expression data through ``WBDataManager``."""

    def setUp(self):
        """Configure logging and build the ``c_elegans`` data manager."""
        logging.basicConfig(
            filename=None,
            level="ERROR",
            format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
        logger.info("Starting DataManager tests")
        self.this_dir = os.path.split(__file__)[0]
        self.conf_parser = GenedescConfigParser(
            os.path.join(self.this_dir, "config_test_wb.yml"))
        self.df = WBDataManager(do_relations=None,
                                go_relations=["subClassOf", "BFO:0000050"],
                                config=self.conf_parser,
                                species="c_elegans")

    def test_load_expression_data(self):
        """Load the test anatomy ontology and annotations and check them.

        Expects the ontology to be set, gene ``WB:WBGene00000001`` to have
        annotations, and every loaded annotation to carry evidence type
        ``IDA``.
        """
        self.df.load_ontology_from_file(
            ontology_type=DataType.EXPR,
            ontology_url="file://" +
            os.path.join(self.this_dir, "data", "anatomy_gd_test.obo"),
            ontology_cache_path=os.path.join(self.this_dir, "cache",
                                             "anatomy_gd_test.obo"),
            config=self.conf_parser)
        self.df.load_associations_from_file(
            associations_type=DataType.EXPR,
            associations_url="file://" +
            os.path.join(self.this_dir, "data", "anatomy_gd_test.wb"),
            associations_cache_path=os.path.join(self.this_dir, "cache",
                                                 "anatomy_gd_test.wb"),
            config=self.conf_parser)
        # Specific assertions report the offending values on failure instead
        # of the generic "False is not true".
        self.assertIsNotNone(self.df.expression_ontology)
        associations_by_subj = \
            self.df.expression_associations.associations_by_subj
        self.assertIn('WB:WBGene00000001', associations_by_subj)
        for annotations in associations_by_subj.values():
            for annotation in annotations:
                self.assertEqual(annotation["evidence"]["type"], "IDA")
Example #8
0
def set_orthology_sentence(dm: WBDataManager, orth_fullnames: List[str], gene_desc: GeneDescription,
                           human_genes_props, api_manager):
    """Generate the orthology sentence of a gene and store it in ``gene_desc``.

    The best orthologs are requested from ``dm`` for the species listed in
    ``orth_fullnames``. When some are found, their first fields are recorded
    in ``gene_desc.stats.set_best_orthologs`` and the generated sentence is
    stored under ``Module.ORTHOLOGY``. The human-specific generator is used
    when the only requested species is "Homo sapiens", the generic WormBase
    generator otherwise.

    Args:
        dm: data manager queried for the best orthologs.
        orth_fullnames: full names of the candidate ortholog species.
        gene_desc: gene description to update.
        human_genes_props: passed through to the human sentence generator.
        api_manager: passed through to the non-human sentence generator.

    Returns:
        The orthologs selected by the human sentence generator (those whose
        second field is among the generator's selection); an empty list on
        the non-human path or when no ortholog was found.
    """
    best_orthologs, selected_orth_name = dm.get_best_orthologs_for_gene(gene_desc.gene_id,
                                                                        orth_species_full_name=orth_fullnames)
    selected_orthologs = []
    if best_orthologs:
        gene_desc.stats.set_best_orthologs = [orth[0] for orth in best_orthologs]
        # The species name was garbled to "H**o sapiens" in this copy of the
        # code, which made the human branch unreachable for real species names.
        if len(orth_fullnames) == 1 and orth_fullnames[0] == "Homo sapiens":
            sel_orthologs, orth_sent = generate_ortholog_sentence_wormbase_human(best_orthologs, human_genes_props)
            selected_orthologs = [orth for orth in best_orthologs if orth[1] in sel_orthologs]
        else:
            orth_sent = generate_ortholog_sentence_wormbase_non_c_elegans(best_orthologs, selected_orth_name,
                                                                          api_manager=api_manager)
        gene_desc.set_or_extend_module_description_and_final_stats(module=Module.ORTHOLOGY, description=orth_sent)
    return selected_orthologs
def set_expression_cluster_sentence(dm: WBDataManager,
                                    conf_parser: GenedescConfigParser,
                                    gene_desc: GeneDescription, gene: Gene,
                                    api_manager: APIManager):
    """Add the expression-cluster sentences of a gene to ``gene_desc``.

    Up to three module descriptions are set, each followed by the list of
    supporting studies:

    * anatomy: built from the "Enriched" expression sentences when an
      expression ontology is loaded in ``dm``, otherwise from the raw
      anatomy terms of the expression cluster (if any);
    * gene regulation: "is affected by ..." the regulating genes, cut down
      to the three best-ranked ones when there are more than three;
    * molecule regulation: "is affected by ..." the first three chemicals,
      preceded by their spelled-out count when there are more than three.

    Args:
        dm: data manager holding expression ontology and cluster data.
        conf_parser: configuration (sentence generation, terms delimiter).
        gene_desc: gene description to extend.
        gene: gene whose ``id`` feeds the expression sentence generator.
        api_manager: source of the Textpresso popularity used for ranking.
    """

    def _add_with_studies(module, description, studies):
        # Every cluster sentence ends with the same "... studies" postfix.
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=module,
            description=description,
            additional_postfix_terms_list=studies,
            additional_postfix_final_word="studies",
            use_single_form=True)

    sentence_generator = OntologySentenceGenerator(
        gene_id=gene.id,
        module=Module.EXPRESSION,
        data_manager=dm,
        config=conf_parser)
    # Cluster data is looked up with the gene id minus its first three
    # characters (presumably the "WB:" prefix -- confirm against the data).
    cluster_gene_id = gene_desc.gene_id[3:]

    # --- anatomy -----------------------------------------------------------
    anatomy_studies = dm.get_expression_cluster_feature(
        gene_id=cluster_gene_id,
        expression_cluster_type=ExpressionClusterType.ANATOMY,
        feature=ExpressionClusterFeature.STUDIES)
    anatomy_terms = dm.get_expression_cluster_feature(
        gene_id=cluster_gene_id,
        feature=ExpressionClusterFeature.TERMS,
        expression_cluster_type=ExpressionClusterType.ANATOMY)
    if dm.expression_ontology is not None:
        enriched_sentences = sentence_generator.get_module_sentences(
            aspect='A',
            qualifier="Enriched",
            merge_groups_with_same_prefix=True,
            keep_only_best_group=False)
        _add_with_studies(Module.EXPRESSION_CLUSTER_ANATOMY,
                          enriched_sentences.get_description(),
                          anatomy_studies)
    elif anatomy_terms:
        anatomy_list = concatenate_words_with_oxford_comma(
            anatomy_terms, separator=conf_parser.get_terms_delimiter())
        _add_with_studies(Module.EXPRESSION_CLUSTER_ANATOMY,
                          "is enriched in " + anatomy_list + " based on",
                          anatomy_studies)

    # --- regulation data ---------------------------------------------------
    molreg_terms = dm.get_expression_cluster_feature(
        gene_id=cluster_gene_id,
        expression_cluster_type=ExpressionClusterType.MOLREG,
        feature=ExpressionClusterFeature.TERMS)
    molreg_studies = dm.get_expression_cluster_feature(
        gene_id=cluster_gene_id,
        feature=ExpressionClusterFeature.STUDIES,
        expression_cluster_type=ExpressionClusterType.MOLREG)
    genereg_terms = dm.get_expression_cluster_feature(
        gene_id=cluster_gene_id,
        expression_cluster_type=ExpressionClusterType.GENEREG,
        feature=ExpressionClusterFeature.TERMS)
    genereg_studies = dm.get_expression_cluster_feature(
        gene_id=cluster_gene_id,
        feature=ExpressionClusterFeature.STUDIES,
        expression_cluster_type=ExpressionClusterType.GENEREG)

    # --- gene regulation ---------------------------------------------------
    if genereg_terms:
        intro = ""
        if len(genereg_terms) > 3:
            # Rank by popularity, highest first.
            # NOTE(review): ties are broken on the SECOND CHARACTER of the
            # term (pair[0][1]); kept as-is, but it looks unintended and
            # would fail on a one-character term -- confirm.
            ranked = sorted(
                ([term, api_manager.get_textpresso_popularity(term)]
                 for term in genereg_terms),
                key=lambda pair: (pair[1], pair[0][1]),
                reverse=True)
            genereg_terms = [pair[0] for pair in ranked[:3]]
            intro = "several genes including "
        genes_list = concatenate_words_with_oxford_comma(
            genereg_terms, separator=conf_parser.get_terms_delimiter())
        _add_with_studies(Module.EXPRESSION_CLUSTER_GENE,
                          "is affected by " + intro + genes_list + " based on",
                          genereg_studies)

    # --- molecule regulation -----------------------------------------------
    if molreg_terms:
        intro = ""
        if len(molreg_terms) > 3:
            intro = num2words(len(molreg_terms)) + " chemicals including "
        chemicals_list = concatenate_words_with_oxford_comma(
            molreg_terms[:3], separator=conf_parser.get_terms_delimiter())
        _add_with_studies(
            Module.EXPRESSION_CLUSTER_MOLECULE,
            "is affected by " + intro + chemicals_list + " based on",
            molreg_studies)
class TestGOModule(unittest.TestCase):
    """Tests for ``WBDataManager`` data loading and expression sentences."""

    def setUp(self):
        """Configure logging and build the ``c_elegans`` data manager."""
        logging.basicConfig(
            filename=None,
            level="ERROR",
            format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
        logger.info("Starting DataManager tests")
        self.this_dir = os.path.split(__file__)[0]
        self.conf_parser = GenedescConfigParser(
            os.path.join(self.this_dir, "config_test_wb.yml"))
        self.df = WBDataManager(do_relations=None,
                                go_relations=["subClassOf", "BFO:0000050"],
                                config=self.conf_parser,
                                species="c_elegans")

    def test_load_expression_data(self):
        """Load the test anatomy ontology and annotations and check them.

        Expects the ontology to be set, gene ``WB:WBGene00000001`` to have
        annotations, and every loaded annotation to carry evidence type
        ``IDA``.
        """
        self.df.load_ontology_from_file(
            ontology_type=DataType.EXPR,
            ontology_url="file://" +
            os.path.join(self.this_dir, "data", "anatomy_gd_test.obo"),
            ontology_cache_path=os.path.join(self.this_dir, "cache",
                                             "anatomy_gd_test.obo"),
            config=self.conf_parser)
        self.df.load_associations_from_file(
            associations_type=DataType.EXPR,
            associations_url="file://" +
            os.path.join(self.this_dir, "data", "anatomy_gd_test.wb"),
            associations_cache_path=os.path.join(self.this_dir, "cache",
                                                 "anatomy_gd_test.wb"),
            config=self.conf_parser)
        self.assertIsNotNone(self.df.expression_ontology)
        associations_by_subj = \
            self.df.expression_associations.associations_by_subj
        self.assertIn('WB:WBGene00000001', associations_by_subj)
        for annotations in associations_by_subj.values():
            for annotation in annotations:
                self.assertEqual(annotation["evidence"]["type"], "IDA")

    def test_load_disease_data(self):
        """Load the disease ontology and annotations; expect an IMP one."""
        self.df.load_ontology_from_file(
            ontology_type=DataType.DO,
            ontology_url="file://" +
            os.path.join(self.this_dir, os.pardir, "data", "doid.obo"),
            ontology_cache_path=os.path.join(self.this_dir, "cache",
                                             "doid.obo"),
            config=self.conf_parser)
        self.df.load_associations_from_file(
            associations_type=DataType.DO,
            associations_url=self.df.do_associations_url,
            associations_cache_path=os.path.join(self.this_dir, "cache",
                                                 "do_ann.gaf"),
            association_additional_cache_path=os.path.join(
                self.this_dir, "cache", "do_ann.daf"),
            association_additional_url=self.df.do_associations_new_url,
            config=self.conf_parser)
        # This test loads DO annotations only, so the disease associations
        # are inspected; the previous check looked at the expression
        # associations, which are never loaded here.
        # NOTE(review): assumes the manager exposes them as
        # ``do_associations`` (mirroring ``expression_associations``).
        self.assertTrue(
            any(annotation["evidence"]["type"] == "IMP"
                for annotations in
                self.df.do_associations.associations_by_subj.values()
                for annotation in annotations))

    def test_load_orthology_data(self):
        """Loading orthology for ``c_remanei`` must yield some entries."""
        df = WBDataManager(do_relations=None,
                           go_relations=["subClassOf", "BFO:0000050"],
                           config=self.conf_parser,
                           species="c_remanei")
        df.load_orthology_from_file()
        self.assertGreater(len(df.orthologs), 0)

    def test_load_protein_domain_data(self):
        """Smoke test: loading protein domain information must not raise."""
        df = WBDataManager(do_relations=None,
                           go_relations=["subClassOf", "BFO:0000050"],
                           config=self.conf_parser,
                           species="c_elegans")
        df.load_protein_domain_information()
        self.assertTrue(True)

    def test_expression_the_cell_renaming_to_widely(self):
        """The description of cdc-48.1 must say it "is expressed widely".

        Loads the expression ontology and annotations from the manager's own
        URLs, generates the "Verified" expression sentences for
        ``WB:WBGene00007352`` and checks the resulting description text.
        """
        self.df.load_ontology_from_file(
            ontology_type=DataType.EXPR,
            ontology_url=self.df.expression_ontology_url,
            ontology_cache_path=self.df.expression_ontology_cache_path,
            config=self.conf_parser)
        self.df.load_associations_from_file(
            associations_type=DataType.EXPR,
            associations_url=self.df.expression_associations_url,
            associations_cache_path=self.df.expression_associations_cache_path,
            config=self.conf_parser)
        gene_desc = GeneDescription(gene_id="WB:WBGene00007352",
                                    gene_name="cdc-48.1",
                                    add_gene_name=False)
        expr_sentence_generator = OntologySentenceGenerator(
            gene_id=gene_desc.gene_id,
            module=Module.EXPRESSION,
            data_manager=self.df,
            config=self.conf_parser)
        expression_module_sentences = expr_sentence_generator.get_module_sentences(
            config=self.conf_parser,
            aspect='A',
            qualifier="Verified",
            merge_groups_with_same_prefix=True,
            keep_only_best_group=False)
        gene_desc.set_or_extend_module_description_and_final_stats(
            module_sentences=expression_module_sentences,
            module=Module.EXPRESSION)
        self.assertIn("is expressed widely", gene_desc.description)