def load_data(organism, conf_parser: GenedescConfigParser):
    """Create and populate the data managers used to describe one WormBase organism.

    Args:
        organism: key of the organism in the organisms info returned by
            ``conf_parser.get_wb_organisms_info()``
        conf_parser: pipeline configuration

    Returns:
        a ``(df, sister_df, df_agr)`` tuple, where ``df`` is the manager of the main
        species with all of its data loaded, ``sister_df`` is the manager of the main
        sister species with GO ontology and GO associations loaded (None when no
        sister species is configured), and ``df_agr`` is the manager holding the GO
        data of human orthologs (None unless ``organism`` is ``"c_elegans"``).
    """
    logger = logging.getLogger("WB Gene Description Pipeline - Data loader")
    organisms_info = conf_parser.get_wb_organisms_info()
    main_dm = WBDataManager(species=organism, do_relations=None,
                            go_relations=["subClassOf", "BFO:0000050"], config=conf_parser)

    # Human-ortholog GO data is only loaded for C. elegans.
    human_orth_dm = None
    if organism == "c_elegans":
        human_orth_dm = DataManager(go_relations=["subClassOf", "BFO:0000050"], do_relations=None)
        go_ontology_url = conf_parser.get_wb_human_orthologs_go_ontology()
        go_ontology_cache = os.path.join(conf_parser.get_cache_dir(), "wormbase_agr_human",
                                         "go_ontology.obo")
        human_orth_dm.load_ontology_from_file(ontology_type=DataType.GO,
                                              ontology_url=go_ontology_url,
                                              ontology_cache_path=go_ontology_cache,
                                              config=conf_parser)
        go_assoc_url = conf_parser.get_wb_human_orthologs_go_associations()
        go_assoc_cache = os.path.join(conf_parser.get_cache_dir(), "wormbase_agr_human",
                                      "go_assoc.daf.gz")
        human_orth_dm.load_associations_from_file(associations_type=DataType.GO,
                                                  associations_url=go_assoc_url,
                                                  associations_cache_path=go_assoc_cache,
                                                  config=conf_parser)

    # A missing or empty "main_sister_species" entry means there is no sister species.
    sister_dm = None
    sister_species = organisms_info[organism].get("main_sister_species")
    if sister_species:
        sister_dm = WBDataManager(species=sister_species, do_relations=None,
                                  go_relations=["subClassOf", "BFO:0000050"], config=conf_parser)
        logger.info("Loading GO data for sister species")
        sister_dm.load_ontology_from_file(ontology_type=DataType.GO,
                                          ontology_url=sister_dm.go_ontology_url,
                                          ontology_cache_path=sister_dm.go_ontology_cache_path,
                                          config=conf_parser)
        sister_dm.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url=sister_dm.go_associations_url,
            associations_cache_path=sister_dm.go_associations_cache_path,
            config=conf_parser)

    logger.info("Loading all data for main species")
    main_dm.load_all_data_from_file()
    return main_dm, sister_dm, human_orth_dm
def test_load_protein_domain_data(self):
    """Smoke test: loading protein domain information for c_elegans must not raise."""
    data_manager = WBDataManager(species="c_elegans",
                                 config=self.conf_parser,
                                 go_relations=["subClassOf", "BFO:0000050"],
                                 do_relations=None)
    data_manager.load_protein_domain_information()
    # Nothing is inspected: reaching this line without an exception is the pass condition.
    self.assertTrue(True)
def test_load_orthology_data(self):
    """Check that loading orthology data for c_remanei yields at least one entry."""
    data_manager = WBDataManager(species="c_remanei",
                                 config=self.conf_parser,
                                 go_relations=["subClassOf", "BFO:0000050"],
                                 do_relations=None)
    data_manager.load_orthology_from_file()
    loaded_count = len(data_manager.orthologs)
    self.assertTrue(loaded_count > 0)
def setUp(self):
    """Prepare the fixtures shared by the tests.

    Configures logging at ERROR level, then stores on the instance the directory of
    this file (``this_dir``), a config parser built from ``config_test_wb.yml`` in that
    directory (``conf_parser``) and a c_elegans data manager (``df``).
    """
    log_format = '%(asctime)s - %(name)s - %(levelname)s: %(message)s'
    logging.basicConfig(filename=None, level="ERROR", format=log_format)
    logger.info("Starting DataManager tests")
    self.this_dir = os.path.dirname(__file__)
    config_path = os.path.join(self.this_dir, "config_test_wb.yml")
    self.conf_parser = GenedescConfigParser(config_path)
    self.df = WBDataManager(species="c_elegans",
                            config=self.conf_parser,
                            go_relations=["subClassOf", "BFO:0000050"],
                            do_relations=None)
def get_best_orthologs_and_sentence(dm: WBDataManager, orth_fullnames: List[str],
                                    gene_desc: GeneDescription, human_genes_props, api_manager,
                                    config: GenedescConfigParser):
    """Pick the best orthologs of a gene and build the matching orthology sentence.

    The ids of the best orthologs are also recorded in
    ``gene_desc.stats.set_best_orthologs``.

    Args:
        dm: data manager queried for the best orthologs
        orth_fullnames: full names of the candidate ortholog species
        gene_desc: description of the gene being processed; its ``gene_id`` is looked up
        human_genes_props: properties of human genes, used only for the human sentence
        api_manager: API manager, used only for the non-human sentence
        config: pipeline configuration forwarded to the sentence generators

    Returns:
        a ``(selected_orthologs, orth_sent)`` tuple. ``selected_orthologs`` is only
        filled for human orthologs and is ``[]`` otherwise; ``orth_sent`` is ``''`` when
        no best ortholog is found.
    """
    best_orthologs, selected_orth_name = dm.get_best_orthologs_for_gene(
        gene_desc.gene_id, orth_species_full_name=orth_fullnames)
    selected_orthologs = []
    orth_sent = ''
    if best_orthologs:
        gene_desc.stats.set_best_orthologs = [orth[0] for orth in best_orthologs]
        # The human-specific sentence applies only when human is the sole requested
        # species; the comparison needs the exact binomial name.
        if len(orth_fullnames) == 1 and orth_fullnames[0] == "Homo sapiens":
            sel_orthologs, orth_sent = generate_ortholog_sentence_wormbase_human(
                best_orthologs, human_genes_props, config=config)
            # Matching is done on the upper-cased second field of each ortholog record.
            selected_orthologs = [orth for orth in best_orthologs
                                  if orth[1].upper() in sel_orthologs]
        else:
            orth_sent = generate_ortholog_sentence_wormbase_non_c_elegans(
                best_orthologs, selected_orth_name, api_manager=api_manager, config=config)
    return selected_orthologs, orth_sent
def set_sister_species_sentence(dm: WBDataManager, conf_parser: GenedescConfigParser,
                                sister_sp_fullname, sister_df: WBDataManager, species, organism,
                                gene_desc: GeneDescription, gene: Gene):
    """Add a sentence based on the GO biological process data of the sister-species ortholog.

    Does nothing when no best ortholog is found in the sister species, or when the
    ortholog yields no sentences.

    Args:
        dm: data manager of the main species, queried for the best ortholog
        conf_parser: pipeline configuration
        sister_sp_fullname: full name of the sister species
        sister_df: data manager of the sister species
        species: organisms info, indexed by organism key; each entry provides
            ``"main_sister_species"`` and ``"name"``
        organism: key of the main organism in ``species``
        gene_desc: description to extend with the ``Module.SISTER_SP`` sentence
        gene: the gene being processed (not used by this function)
    """
    best_orthologs = dm.get_best_orthologs_for_gene(
        gene_desc.gene_id, orth_species_full_name=[sister_sp_fullname],
        sister_species_data_fetcher=sister_df,
        ecode_priority_list=["EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "HTP", "HDA", "HMP",
                             "HGI", "HEP"])[0]
    # Guard like the other orthology helpers do: with no ortholog there is nothing to say.
    if not best_orthologs:
        return
    best_ortholog = best_orthologs[0]
    # Note: this normalizes the id in place on the record returned by the data manager.
    if not best_ortholog[0].startswith("WB:"):
        best_ortholog[0] = "WB:" + best_ortholog[0]
    sister_sentences_generator = OntologySentenceGenerator(
        gene_id=best_ortholog[0], module=Module.GO, data_manager=sister_df, config=conf_parser,
        # The comparison needs the exact binomial name.
        humans=sister_sp_fullname == "Homo sapiens",
        limit_to_group="EXPERIMENTAL")
    sister_sp_module_sentences = sister_sentences_generator.get_module_sentences(
        aspect='P', qualifier="involved_in", merge_groups_with_same_prefix=True,
        keep_only_best_group=True)
    if sister_sp_module_sentences.contains_sentences():
        sister_sp_name = species[species[organism]["main_sister_species"]]["name"]
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.SISTER_SP,
            description="in " + sister_sp_name + ", " + best_ortholog[1] + " " +
                        sister_sp_module_sentences.get_description())
class TestGOModule(unittest.TestCase):
    """Tests for loading WormBase data through ``WBDataManager``."""

    def setUp(self):
        """Configure logging and create the config parser and c_elegans data manager."""
        log_format = '%(asctime)s - %(name)s - %(levelname)s: %(message)s'
        logging.basicConfig(filename=None, level="ERROR", format=log_format)
        logger.info("Starting DataManager tests")
        self.this_dir = os.path.dirname(__file__)
        config_path = os.path.join(self.this_dir, "config_test_wb.yml")
        self.conf_parser = GenedescConfigParser(config_path)
        self.df = WBDataManager(species="c_elegans",
                                config=self.conf_parser,
                                go_relations=["subClassOf", "BFO:0000050"],
                                do_relations=None)

    def test_load_expression_data(self):
        """Load the anatomy test ontology and associations and check what was loaded."""
        ontology_url = "file://" + os.path.join(self.this_dir, "data", "anatomy_gd_test.obo")
        ontology_cache = os.path.join(self.this_dir, "cache", "anatomy_gd_test.obo")
        self.df.load_ontology_from_file(ontology_type=DataType.EXPR,
                                        ontology_url=ontology_url,
                                        ontology_cache_path=ontology_cache,
                                        config=self.conf_parser)

        associations_url = "file://" + os.path.join(self.this_dir, "data", "anatomy_gd_test.wb")
        associations_cache = os.path.join(self.this_dir, "cache", "anatomy_gd_test.wb")
        self.df.load_associations_from_file(associations_type=DataType.EXPR,
                                            associations_url=associations_url,
                                            associations_cache_path=associations_cache,
                                            config=self.conf_parser)

        self.assertTrue(self.df.expression_ontology is not None)
        by_subject = self.df.expression_associations.associations_by_subj
        self.assertTrue('WB:WBGene00000001' in by_subject)
        # Every loaded annotation is expected to carry the IDA evidence type.
        for subject_annotations in self.df.expression_associations.associations_by_subj.values():
            for entry in subject_annotations:
                self.assertTrue(entry["evidence"]["type"] == "IDA")
def set_orthology_sentence(dm: WBDataManager, orth_fullnames: List[str], gene_desc: GeneDescription,
                           human_genes_props, api_manager):
    """Add the orthology sentence to a gene description and return the orthologs it mentions.

    When best orthologs are found, their ids are recorded in
    ``gene_desc.stats.set_best_orthologs`` and the generated sentence is stored under
    ``Module.ORTHOLOGY``. Nothing is added otherwise.

    Args:
        dm: data manager queried for the best orthologs
        orth_fullnames: full names of the candidate ortholog species
        gene_desc: description to extend; its ``gene_id`` is looked up
        human_genes_props: properties of human genes, used only for the human sentence
        api_manager: API manager, used only for the non-human sentence

    Returns:
        the orthologs selected for the human sentence, or ``[]`` for non-human
        orthologs and when no best ortholog is found.
    """
    best_orthologs, selected_orth_name = dm.get_best_orthologs_for_gene(
        gene_desc.gene_id, orth_species_full_name=orth_fullnames)
    selected_orthologs = []
    if best_orthologs:
        gene_desc.stats.set_best_orthologs = [orth[0] for orth in best_orthologs]
        # The human-specific sentence applies only when human is the sole requested
        # species; the comparison needs the exact binomial name.
        if len(orth_fullnames) == 1 and orth_fullnames[0] == "Homo sapiens":
            sel_orthologs, orth_sent = generate_ortholog_sentence_wormbase_human(
                best_orthologs, human_genes_props)
            selected_orthologs = [orth for orth in best_orthologs if orth[1] in sel_orthologs]
        else:
            orth_sent = generate_ortholog_sentence_wormbase_non_c_elegans(
                best_orthologs, selected_orth_name, api_manager=api_manager)
        # orth_sent only exists on this path, so the description is set here.
        gene_desc.set_or_extend_module_description_and_final_stats(module=Module.ORTHOLOGY,
                                                                   description=orth_sent)
    return selected_orthologs
def set_expression_cluster_sentence(dm: WBDataManager, conf_parser: GenedescConfigParser,
                                    gene_desc: GeneDescription, gene: Gene,
                                    api_manager: APIManager):
    """Add the expression cluster sentences (anatomy, gene and molecule) to a description.

    Args:
        dm: data manager providing the expression cluster features and the expression
            ontology
        conf_parser: pipeline configuration; also provides the terms delimiter
        gene_desc: description to extend
        gene: the gene being processed; its ``id`` feeds the sentence generator
        api_manager: used to rank gene-regulation terms by Textpresso popularity
    """
    anatomy_generator = OntologySentenceGenerator(gene_id=gene.id, module=Module.EXPRESSION,
                                                  data_manager=dm, config=conf_parser)
    # Cluster features are keyed by the gene id without its first three characters
    # (presumably the "WB:" prefix -- confirm against the data manager).
    cluster_gene_id = gene_desc.gene_id[3:]
    # Every cluster sentence is followed by the list of supporting studies.
    studies_postfix = {"additional_postfix_final_word": "studies", "use_single_form": True}

    # --- anatomy ---
    anatomy_studies = dm.get_expression_cluster_feature(
        gene_id=cluster_gene_id, expression_cluster_type=ExpressionClusterType.ANATOMY,
        feature=ExpressionClusterFeature.STUDIES)
    anatomy_terms = dm.get_expression_cluster_feature(
        gene_id=cluster_gene_id, feature=ExpressionClusterFeature.TERMS,
        expression_cluster_type=ExpressionClusterType.ANATOMY)
    if dm.expression_ontology is not None:
        # With an ontology available the sentence comes from the "Enriched" annotations.
        enriched_sentences = anatomy_generator.get_module_sentences(
            aspect='A', qualifier="Enriched", merge_groups_with_same_prefix=True,
            keep_only_best_group=False)
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_ANATOMY,
            description=enriched_sentences.get_description(),
            additional_postfix_terms_list=anatomy_studies,
            **studies_postfix)
    elif anatomy_terms:
        # Without an ontology, fall back to listing the raw cluster terms.
        anatomy_description = "is enriched in " + concatenate_words_with_oxford_comma(
            anatomy_terms, separator=conf_parser.get_terms_delimiter()) + " based on"
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_ANATOMY,
            description=anatomy_description,
            additional_postfix_terms_list=anatomy_studies,
            **studies_postfix)

    molreg_terms = dm.get_expression_cluster_feature(
        gene_id=cluster_gene_id, expression_cluster_type=ExpressionClusterType.MOLREG,
        feature=ExpressionClusterFeature.TERMS)
    molreg_studies = dm.get_expression_cluster_feature(
        gene_id=cluster_gene_id, feature=ExpressionClusterFeature.STUDIES,
        expression_cluster_type=ExpressionClusterType.MOLREG)
    genereg_terms = dm.get_expression_cluster_feature(
        gene_id=cluster_gene_id, expression_cluster_type=ExpressionClusterType.GENEREG,
        feature=ExpressionClusterFeature.TERMS)
    genereg_studies = dm.get_expression_cluster_feature(
        gene_id=cluster_gene_id, feature=ExpressionClusterFeature.STUDIES,
        expression_cluster_type=ExpressionClusterType.GENEREG)

    # --- gene regulation ---
    if genereg_terms:
        genes_prefix = ""
        if len(genereg_terms) > 3:
            # Keep only the three highest ranked terms. Ranking is by popularity, with
            # ties broken by the second character of the term.
            # NOTE(review): the tie-breaker x[0][1] looks unusual -- confirm it is intended.
            ranked = sorted(
                [[term, api_manager.get_textpresso_popularity(term)] for term in genereg_terms],
                key=lambda pair: (pair[1], pair[0][1]),
                reverse=True)
            genereg_terms = [pair[0] for pair in ranked[:3]]
            genes_prefix = "several genes including "
        genereg_description = "is affected by " + genes_prefix + \
            concatenate_words_with_oxford_comma(
                genereg_terms, separator=conf_parser.get_terms_delimiter()) + " based on"
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_GENE,
            description=genereg_description,
            additional_postfix_terms_list=genereg_studies,
            **studies_postfix)

    # --- molecule regulation ---
    if molreg_terms:
        chemicals_prefix = ""
        if len(molreg_terms) > 3:
            # The total count is spelled out; only the first three terms are listed.
            chemicals_prefix = num2words(len(molreg_terms)) + " chemicals including "
        molreg_description = "is affected by " + chemicals_prefix + \
            concatenate_words_with_oxford_comma(
                molreg_terms[0:3], separator=conf_parser.get_terms_delimiter()) + " based on"
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_MOLECULE,
            description=molreg_description,
            additional_postfix_terms_list=molreg_studies,
            **studies_postfix)
class TestGOModule(unittest.TestCase):
    """Tests for loading WormBase data through ``WBDataManager``."""

    def setUp(self):
        """Configure logging and create the config parser and c_elegans data manager."""
        logging.basicConfig(filename=None, level="ERROR",
                            format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
        logger.info("Starting DataManager tests")
        self.this_dir = os.path.split(__file__)[0]
        self.conf_parser = GenedescConfigParser(os.path.join(self.this_dir, "config_test_wb.yml"))
        self.df = WBDataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"],
                                config=self.conf_parser, species="c_elegans")

    def test_load_expression_data(self):
        """Load the anatomy test ontology and associations and check what was loaded."""
        self.df.load_ontology_from_file(
            ontology_type=DataType.EXPR,
            ontology_url="file://" + os.path.join(self.this_dir, "data", "anatomy_gd_test.obo"),
            ontology_cache_path=os.path.join(self.this_dir, "cache", "anatomy_gd_test.obo"),
            config=self.conf_parser)
        self.df.load_associations_from_file(
            associations_type=DataType.EXPR,
            associations_url="file://" + os.path.join(self.this_dir, "data", "anatomy_gd_test.wb"),
            associations_cache_path=os.path.join(self.this_dir, "cache", "anatomy_gd_test.wb"),
            config=self.conf_parser)
        self.assertTrue(self.df.expression_ontology is not None)
        self.assertTrue(
            'WB:WBGene00000001' in self.df.expression_associations.associations_by_subj)
        # Every loaded annotation is expected to carry the IDA evidence type.
        for annotations in self.df.expression_associations.associations_by_subj.values():
            for annotation in annotations:
                self.assertTrue(annotation["evidence"]["type"] == "IDA")

    def test_load_disease_data(self):
        """Load the DO ontology and associations and check an IMP annotation is present."""
        self.df.load_ontology_from_file(
            ontology_type=DataType.DO,
            ontology_url="file://" + os.path.join(self.this_dir, os.pardir, "data", "doid.obo"),
            ontology_cache_path=os.path.join(self.this_dir, "cache", "doid.obo"),
            config=self.conf_parser)
        self.df.load_associations_from_file(
            associations_type=DataType.DO,
            associations_url=self.df.do_associations_url,
            associations_cache_path=os.path.join(self.this_dir, "cache", "do_ann.gaf"),
            association_additional_cache_path=os.path.join(self.this_dir, "cache", "do_ann.daf"),
            association_additional_url=self.df.do_associations_new_url,
            config=self.conf_parser)
        # Inspect the disease associations loaded above, not the expression ones,
        # which this test never loads.
        # NOTE(review): assumes the manager exposes them as ``do_associations``,
        # mirroring ``expression_associations`` -- confirm.
        self.assertTrue(any(
            annotation["evidence"]["type"] == "IMP"
            for annotations in self.df.do_associations.associations_by_subj.values()
            for annotation in annotations))

    def test_load_orthology_data(self):
        """Check that loading orthology data for c_remanei yields at least one entry."""
        df = WBDataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"],
                           config=self.conf_parser, species="c_remanei")
        df.load_orthology_from_file()
        self.assertTrue(len(df.orthologs) > 0)

    def test_load_protein_domain_data(self):
        """Smoke test: loading protein domain information for c_elegans must not raise."""
        df = WBDataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"],
                           config=self.conf_parser, species="c_elegans")
        df.load_protein_domain_information()
        # Nothing is inspected: reaching this line without an exception is the pass condition.
        self.assertTrue(True)

    def test_expression_the_cell_renaming_to_widely(self):
        """Check that the expression sentence of cdc-48.1 contains "is expressed widely"."""
        self.df.load_ontology_from_file(
            ontology_type=DataType.EXPR,
            ontology_url=self.df.expression_ontology_url,
            ontology_cache_path=self.df.expression_ontology_cache_path,
            config=self.conf_parser)
        self.df.load_associations_from_file(
            associations_type=DataType.EXPR,
            associations_url=self.df.expression_associations_url,
            associations_cache_path=self.df.expression_associations_cache_path,
            config=self.conf_parser)
        gene_desc = GeneDescription(gene_id="WB:WBGene00007352", gene_name="cdc-48.1",
                                    add_gene_name=False)
        expr_sentence_generator = OntologySentenceGenerator(
            gene_id=gene_desc.gene_id, module=Module.EXPRESSION, data_manager=self.df,
            config=self.conf_parser)
        expression_module_sentences = expr_sentence_generator.get_module_sentences(
            config=self.conf_parser, aspect='A', qualifier="Verified",
            merge_groups_with_same_prefix=True, keep_only_best_group=False)
        gene_desc.set_or_extend_module_description_and_final_stats(
            module_sentences=expression_module_sentences, module=Module.EXPRESSION)
        self.assertTrue("is expressed widely" in gene_desc.description)