def set_information_poor_sentence(orth_fullnames: List[str], selected_orthologs, ensembl_hgnc_ids_map,
                                  conf_parser: GenedescConfigParser, human_df_agr: DataManager,
                                  gene_desc: GeneDescription, dm: WBDataManager, gene: Gene):
    """Add fallback sentences to a gene description: human ortholog function and protein domains.

    Two independent sentences may be added to ``gene_desc``:

    * When the only ortholog species is human, the best human ortholog is looked up and, if it yields a
      molecular-function ('F' aspect) GO description restricted to the "EXPERIMENTAL" group, that
      description is stored under ``Module.INFO_POOR_HUMAN_FUNCTION``.
    * When ``dm.protein_domains`` has entries for the gene, a sentence listing them is stored under
      ``Module.PROTEIN_DOMAIN``.

    Args:
        orth_fullnames (List[str]): full species names of the gene's orthologs
        selected_orthologs: orthologs passed through to ``get_best_human_ortholog_for_info_poor``
        ensembl_hgnc_ids_map: id map passed through to ``get_best_human_ortholog_for_info_poor``
        conf_parser (GenedescConfigParser): configuration used for priorities and sentence generation
        human_df_agr (DataManager): data manager holding the human GO data
        gene_desc (GeneDescription): the description object that receives the new sentences
        dm (WBDataManager): data manager providing ``protein_domains``
        gene (Gene): not used by this function; kept for interface compatibility
    """
    # The species literal was previously the garbled "H**o sapiens", which can never equal a real
    # species name and therefore disabled this whole branch.
    if len(orth_fullnames) == 1 and orth_fullnames[0] == "Homo sapiens":
        best_orth = get_best_human_ortholog_for_info_poor(
            selected_orthologs, ensembl_hgnc_ids_map, conf_parser.get_annotations_priority(module=Module.GO),
            human_df_agr, config=conf_parser)
        if best_orth:
            # The human GO data is looked up with "RGD:"-prefixed identifiers.
            if not best_orth.startswith("RGD:"):
                best_orth = "RGD:" + best_orth
            human_go_sent_generator = OntologySentenceGenerator(
                gene_id=best_orth, module=Module.GO, data_manager=human_df_agr, config=conf_parser,
                humans=False, limit_to_group="EXPERIMENTAL")
            human_func_module_sentences = human_go_sent_generator.get_module_sentences(
                config=conf_parser, aspect='F', merge_groups_with_same_prefix=True, keep_only_best_group=True)
            human_func_sent = human_func_module_sentences.get_description()
            if human_func_sent:
                ortholog_label = human_df_agr.go_associations.subject_label_map[best_orth]
                gene_desc.set_or_extend_module_description_and_final_stats(
                    module=Module.INFO_POOR_HUMAN_FUNCTION,
                    description="human " + ortholog_label + " " + human_func_sent)

    # The first three characters of the gene id are dropped before the lookup
    # (presumably the "WB:" prefix - confirm against the data manager's keys).
    protein_domains = dm.protein_domains[gene_desc.gene_id[3:]]
    if protein_domains:
        dom_word = "domains" if len(protein_domains) > 1 else "domain"
        # Each entry is indexed as a pair; the second element is preferred and the first is the
        # fallback when the second is an empty string.
        domain_names = [ptdom[1] if ptdom[1] != "" else ptdom[0] for ptdom in protein_domains]
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.PROTEIN_DOMAIN,
            description="is predicted to encode a protein with the following " + dom_word + ": " +
                        concatenate_words_with_oxford_comma(domain_names))
class TestGOModule(unittest.TestCase):
    """Checks on the GO ontology and GO annotations loaded through a DataManager."""

    def setUp(self):
        """Read the test configuration and load the GO ontology and associations from local files."""
        logging.basicConfig(filename=None, level="ERROR",
                            format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
        logger.info("Starting DataManager tests")
        self.this_dir = os.path.split(__file__)[0]
        config_path = os.path.join(self.this_dir, os.path.pardir, "tests", "config_test.yml")
        self.conf_parser = GenedescConfigParser(config_path)
        self.df = DataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"])

        logger.info("Loading go ontology from file")
        ontology_source = os.path.join(self.this_dir, "data", "go_gd_test.obo")
        ontology_cache = os.path.join(self.this_dir, "cache", "go_gd_test.obo")
        self.df.load_ontology_from_file(ontology_type=DataType.GO,
                                        ontology_url="file://" + ontology_source,
                                        ontology_cache_path=ontology_cache,
                                        config=self.conf_parser)

        logger.info("Loading go associations from file")
        associations_source = os.path.join(self.this_dir, "data", "gene_association_1.7.wb.partial")
        associations_cache = os.path.join(self.this_dir, "cache", "gene_association_1.7.wb.partial")
        self.df.load_associations_from_file(associations_type=DataType.GO,
                                            associations_url="file://" + associations_source,
                                            associations_cache_path=associations_cache,
                                            config=self.conf_parser)

    def test_ontology_exists(self):
        """The ontology is loaded and GO:0009987 is among the parents of GO:0000075."""
        self.assertTrue(self.df.go_ontology is not None)
        parent_ids = self.df.go_ontology.parents("GO:0000075")
        self.assertTrue(any(parent_id == "GO:0009987" for parent_id in parent_ids))

    def test_annotations_exist(self):
        """The associations are loaded and WB:WBGene00000001 has at least one GO annotation."""
        self.assertTrue(self.df.go_associations is not None)
        go_priority = self.conf_parser.get_annotations_priority(module=Module.GO)
        gene_annotations = self.df.get_annotations_for_gene(gene_id="WB:WBGene00000001",
                                                            annot_type=DataType.GO,
                                                            include_obsolete=False,
                                                            include_negative_results=False,
                                                            priority_list=go_priority)
        self.assertTrue(len(gene_annotations) > 0)

    def test_rename_terms(self):
        """Searching the ontology for any key of the RENAME_TERMS property returns nothing."""
        rename_map = self.conf_parser.get_module_property(module=Module.GO,
                                                          prop=ConfigModuleProperty.RENAME_TERMS)
        renamed_terms = list(rename_map.keys())
        self.assertTrue(all(len(self.df.go_ontology.search(old_name)) == 0 for old_name in renamed_terms))

    def test_exclude_terms(self):
        """Placeholder: no assertions are made yet."""
        pass
def __init__(self, gene_id: str, module: Module, data_manager: DataManager, config: GenedescConfigParser,
             limit_to_group: str = None, humans: bool = False):
    """Set up a sentence generator for one gene and one module.

    Args:
        gene_id (str): id of the gene whose annotations are retrieved from the data manager
        module (Module): the module that selects the data type, priorities and module properties
        data_manager (DataManager): provider of the ontology, annotations, associations and slim set
        config (GenedescConfigParser): config object from which the options are read
        limit_to_group (str): limit the evidence codes to the specified group
        humans (bool): forwarded to the prefix/postfix sentence map lookup and to ``set_terms_groups``
    """
    data_type = get_data_type_from_module(module)

    self.ontology = data_manager.get_ontology(data_type)
    self.config = config
    self.module = module
    self.terms_already_covered = set()
    self.terms_groups = defaultdict(lambda: defaultdict(set))
    self.evidence_groups_priority_list = config.get_evidence_groups_priority_list(module=module)
    self.prepostfix_sentences_map = config.get_prepostfix_sentence_map(module=module, humans=humans)
    self.gene_annots = data_manager.get_annotations_for_gene(
        gene_id=gene_id, annot_type=data_type, priority_list=config.get_annotations_priority(module=module))

    # The trimming class is chosen by the configured trimming algorithm of the module.
    trimming_algorithm = config.get_module_property(module=module, prop=ConfigModuleProperty.TRIMMING_ALGORITHM)
    trimmer_class = CONF_TO_TRIMMING_CLASS[trimming_algorithm]
    all_associations = data_manager.get_associations(data_type)
    excluded_terms = config.get_module_property(module=module, prop=ConfigModuleProperty.EXCLUDE_TERMS)
    slim_bonus_perc = config.get_module_property(module=module, prop=ConfigModuleProperty.SLIM_BONUS_PERC)
    self.trimmer = trimmer_class(ontology=self.ontology,
                                 annotations=all_associations,
                                 nodeids_blacklist=excluded_terms,
                                 slim_terms_ic_bonus_perc=slim_bonus_perc,
                                 slim_set=data_manager.get_slim(module=module))

    self.set_terms_groups(module, config, limit_to_group, humans)
class TestGOModule(unittest.TestCase):
    """Tests for DataManager: GO ontology/association loading, gene data, and setters."""

    def setUp(self):
        """Read the test configuration and load the GO ontology and associations from local files."""
        logging.basicConfig(filename=None, level="ERROR",
                            format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
        logger.info("Starting DataManager tests")
        self.this_dir = os.path.split(__file__)[0]
        self.conf_parser = GenedescConfigParser(os.path.join(self.this_dir, os.path.pardir, "tests",
                                                             "config_test.yml"))
        self.df = DataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"])
        logger.info("Loading go ontology from file")
        self.df.load_ontology_from_file(
            ontology_type=DataType.GO,
            ontology_url="file://" + os.path.join(self.this_dir, "data", "go_gd_test.obo"),
            ontology_cache_path=os.path.join(self.this_dir, "cache", "go_gd_test.obo"),
            config=self.conf_parser)
        logger.info("Loading go associations from file")
        self.df.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url="file://" + os.path.join(self.this_dir, "data", "gene_association_1.7.wb.partial"),
            associations_cache_path=os.path.join(self.this_dir, "cache", "gene_association_1.7.wb.partial"),
            config=self.conf_parser)

    def test_ontology_exists(self):
        """The ontology is loaded and GO:0009987 is among the parents of GO:0000075."""
        self.assertIsNotNone(self.df.go_ontology)
        self.assertTrue(any(parent == "GO:0009987" for parent in self.df.go_ontology.parents("GO:0000075")))

    def test_annotations_exist(self):
        """The associations are loaded and WB:WBGene00000001 has at least one GO annotation."""
        self.assertIsNotNone(self.df.go_associations)
        annotations = self.df.get_annotations_for_gene(
            gene_id="WB:WBGene00000001", annot_type=DataType.GO, include_obsolete=False,
            include_negative_results=False,
            priority_list=self.conf_parser.get_annotations_priority(module=Module.GO))
        self.assertGreater(len(annotations), 0)

    def test_rename_terms(self):
        """Searching the ontology for any key of the RENAME_TERMS property returns nothing."""
        rename_map = self.conf_parser.get_module_property(module=Module.GO,
                                                          prop=ConfigModuleProperty.RENAME_TERMS)
        for term in list(rename_map.keys()):
            with self.subTest(term=term):
                self.assertEqual(len(self.df.go_ontology.search(term)), 0)

    def test_exclude_terms(self):
        """GO:0008286 does not appear among the annotations of WB:WBGene00000001."""
        test_annot = self.df.get_annotations_for_gene("WB:WBGene00000001", annot_type=DataType.GO)
        self.assertNotIn("GO:0008286", [annot["object"]["id"] for annot in test_annot])

    def test_download_gz_file(self):
        """Fetching a cached .gz file returns the cache path without the .gz suffix."""
        gz_name = "c_elegans.PRJNA13758.WS273.geneIDs.txt.gz"
        test_file = self.df._get_cached_file(
            cache_path=os.path.join(self.this_dir, "cache", gz_name),
            file_source_url="file://" + os.path.join(self.this_dir, "data", gz_name))
        self.assertEqual(test_file,
                         os.path.join(self.this_dir, "cache", "c_elegans.PRJNA13758.WS273.geneIDs.txt"))

    def test_gene_data_functions(self):
        """get_gene_data returns the expected number of genes for every flag combination."""
        self.df.set_gene_data(gene_data=[Gene("1", "gene1", True, False), Gene("2", "gene2", False, True),
                                         Gene("3", "gene3", False, False), Gene("4", "gene4", True, True)])
        # (include_dead_genes, include_pseudo_genes) -> expected number of genes returned
        expected_counts = {(False, False): 1, (True, False): 2, (False, True): 2, (True, True): 4}
        for (include_dead, include_pseudo), expected in expected_counts.items():
            with self.subTest(include_dead_genes=include_dead, include_pseudo_genes=include_pseudo):
                genes = list(self.df.get_gene_data(include_dead_genes=include_dead,
                                                   include_pseudo_genes=include_pseudo))
                self.assertEqual(len(genes), expected)

    def test_get_human_gene_props(self):
        """get_human_gene_props returns a non-empty collection."""
        human_gene_props = self.df.get_human_gene_props()
        self.assertGreater(len(human_gene_props), 0)

    def test_get_ensembl_hgnc_ids_map(self):
        """get_ensembl_hgnc_ids_map returns a non-empty collection."""
        ensembl_hgnc_ids_map = self.df.get_ensembl_hgnc_ids_map()
        self.assertGreater(len(ensembl_hgnc_ids_map), 0)

    def test_set_ontology(self):
        """An ontology passed to set_ontology is exposed with the same nodes as go_ontology."""
        ontology = OntologyFactory().create()
        for i in range(4):
            ontology.add_node(i, 'node' + str(i))
        ontology.add_parent(1, 0)
        ontology.add_parent(2, 0)
        ontology.add_parent(3, 0)
        self.df.set_ontology(ontology_type=DataType.GO, ontology=ontology, config=self.conf_parser)
        self.assertEqual(list(self.df.go_ontology.nodes()), list(ontology.nodes()))

    def test_set_associations(self):
        """Associations passed to set_associations leave go_associations truthy."""
        associations = [
            DataManager.create_annotation_record("", "1", "a", "protein_coding", "001", "GO:0019901", "", "F",
                                                 "EXP", None, "WB", ""),
            DataManager.create_annotation_record("", "2", "b", "protein_coding", "001", "GO:0005515", "", "F",
                                                 "EXP", None, "WB", ""),
        ]
        assocs = AssociationSetFactory().create_from_assocs(assocs=associations, ontology=self.df.go_ontology)
        self.df.set_associations(associations_type=DataType.GO, associations=assocs, config=self.conf_parser)
        self.assertTrue(self.df.go_associations)

    def test_remap_associations(self):
        """An annotation created with GO:0018996 is stored with object id GO:0042303."""
        associations = [
            DataManager.create_annotation_record("", "1", "a", "protein_coding", "001", "GO:0018996", "", "F",
                                                 "EXP", None, "WB", ""),
        ]
        assocs = AssociationSetFactory().create_from_assocs(assocs=associations, ontology=self.df.go_ontology)
        self.df.set_associations(associations_type=DataType.GO, associations=assocs, config=self.conf_parser)
        self.assertEqual(self.df.go_associations.associations_by_subj["1"][0]["object"]["id"], "GO:0042303")
def __init__(self, gene_id: str, module: Module, data_manager: DataManager, config: GenedescConfigParser,
             limit_to_group: str = None, humans: bool = False):
    """Set up a sentence generator and group the gene's annotated terms by evidence group.

    The gene's annotations are read from the data manager and each annotated term id is stored in
    ``self.terms_groups[(aspect, qualifier)][evidence_group]``. Annotations whose evidence code is not
    mapped to a group (or not mapped to ``limit_to_group`` when it is given) are skipped.

    Args:
        gene_id (str): id of the gene whose annotations are retrieved
        module (Module): the module that selects the ontology, the annotation type and the config options
        data_manager (DataManager): provider of the ontologies and of the gene annotations
        config (GenedescConfigParser): config object from which the options are read
        limit_to_group (str): limit the evidence codes to the specified group
        humans (bool): forwarded to the prefix/postfix sentence map lookups
    """
    annot_type = None
    # The third comparison used to read the member through the `module` argument
    # (`module.DO_BIOMARKER`); the enum class itself must be used instead.
    if module in (Module.DO_ORTHOLOGY, Module.DO_EXPERIMENTAL, Module.DO_BIOMARKER):
        self.ontology = data_manager.do_ontology
        annot_type = DataType.DO
    elif module == Module.GO:
        self.ontology = data_manager.go_ontology
        annot_type = DataType.GO
    elif module == Module.EXPRESSION:
        self.ontology = data_manager.expression_ontology
        annot_type = DataType.EXPR

    self.evidence_groups_priority_list = config.get_evidence_groups_priority_list(module=module)
    self.prepostfix_sentences_map = config.get_prepostfix_sentence_map(module=module, humans=humans)
    self.terms_groups = defaultdict(lambda: defaultdict(set))
    ev_codes_groups_maps = config.get_evidence_codes_groups_map(module=module)
    annotations = data_manager.get_annotations_for_gene(
        gene_id=gene_id, annot_type=annot_type, priority_list=config.get_annotations_priority(module=module))
    self.annotations = annotations
    self.module = module
    self.data_manager = data_manager
    self.annot_type = annot_type

    # Keep every evidence code when no group restriction is given, otherwise only the codes whose
    # group contains `limit_to_group`.
    evidence_codes_groups_map = {
        evcode: group for evcode, group in ev_codes_groups_maps.items()
        if limit_to_group is None or limit_to_group in group
    }
    prepostfix_special_cases_sent_map = config.get_prepostfix_sentence_map(
        module=module, special_cases_only=True, humans=humans)

    for annotation in annotations:
        evidence_code = annotation["evidence"]["type"]
        if evidence_code not in evidence_codes_groups_map:
            continue
        aspect = annotation["aspect"]
        base_group = evidence_codes_groups_map[evidence_code]
        ev_group = base_group
        qualifier = "_".join(sorted(annotation["qualifiers"])) if "qualifiers" in annotation else ""
        special_case_key = (aspect, base_group, qualifier)
        if prepostfix_special_cases_sent_map and special_case_key in prepostfix_special_cases_sent_map:
            term_label = self.ontology.label(annotation["object"]["id"], id_if_null=True)
            for special_case in prepostfix_special_cases_sent_map[special_case_key]:
                # The pattern is escaped, so this is a literal match anchored at the start of the label.
                if re.match(re.escape(special_case[1]), term_label):
                    # The term goes into a dedicated group named after the base group plus the
                    # special case id; only the first matching special case is applied.
                    ev_group = base_group + str(special_case[0])
                    if ev_group not in self.evidence_groups_priority_list:
                        # Place the new group right after its base group in the priority order.
                        # NOTE(review): this mutates the list returned by the config object -
                        # confirm that list is not shared between generators.
                        self.evidence_groups_priority_list.insert(
                            self.evidence_groups_priority_list.index(base_group) + 1, ev_group)
                    break
        self.terms_groups[(aspect, qualifier)][ev_group].add(annotation["object"]["id"])