コード例 #1
0
def set_information_poor_sentence(orth_fullnames: List[str], selected_orthologs, ensembl_hgnc_ids_map,
                                  conf_parser: GenedescConfigParser, human_df_agr: DataManager,
                                  gene_desc: GeneDescription, dm: WBDataManager, gene: Gene):
    """Add fallback description modules to a gene description.

    Two independent modules may be set on ``gene_desc``:

    * ``Module.INFO_POOR_HUMAN_FUNCTION`` - only when ``orth_fullnames`` contains exactly one species and that
      species is "Homo sapiens", a best human ortholog is found, and a GO sentence for aspect 'F' limited to the
      "EXPERIMENTAL" evidence group can be generated for it.
    * ``Module.PROTEIN_DOMAIN`` - whenever ``dm.protein_domains`` holds at least one entry for the gene.

    Args:
        orth_fullnames (List[str]): full species names of the orthologs selected for the gene
        selected_orthologs: orthologs forwarded to ``get_best_human_ortholog_for_info_poor``
        ensembl_hgnc_ids_map: id map forwarded to ``get_best_human_ortholog_for_info_poor``
        conf_parser (GenedescConfigParser): configuration used for annotation priorities and sentence generation
        human_df_agr (DataManager): data manager providing the human GO annotations and subject labels
        gene_desc (GeneDescription): gene description that is updated in place
        dm (WBDataManager): data manager providing the ``protein_domains`` lookup
        gene (Gene): not used by this function
    """
    if len(orth_fullnames) == 1 and orth_fullnames[0] == "Homo sapiens":
        best_orth = get_best_human_ortholog_for_info_poor(selected_orthologs, ensembl_hgnc_ids_map,
                                                          conf_parser.get_annotations_priority(module=Module.GO),
                                                          human_df_agr, config=conf_parser)
        if best_orth:
            # the human annotations are looked up with an "RGD:"-prefixed id, so add the prefix when missing
            if not best_orth.startswith("RGD:"):
                best_orth = "RGD:" + best_orth
            human_go_sent_generator = OntologySentenceGenerator(gene_id=best_orth, module=Module.GO,
                                                                data_manager=human_df_agr, config=conf_parser,
                                                                humans=False, limit_to_group="EXPERIMENTAL")
            human_func_module_sentences = human_go_sent_generator.get_module_sentences(
                config=conf_parser, aspect='F', merge_groups_with_same_prefix=True, keep_only_best_group=True)
            human_func_sent = human_func_module_sentences.get_description()
            if human_func_sent:
                orth_label = human_df_agr.go_associations.subject_label_map[best_orth]
                gene_desc.set_or_extend_module_description_and_final_stats(
                    module=Module.INFO_POOR_HUMAN_FUNCTION,
                    description="human " + orth_label + " " + human_func_sent)

    # the first three characters of the gene id are dropped for the lookup
    # (presumably the "WB:" prefix - confirm against the keys of dm.protein_domains)
    protein_domains = dm.protein_domains[gene_desc.gene_id[3:]]
    if protein_domains:
        dom_word = "domains" if len(protein_domains) > 1 else "domain"
        # each entry is indexable; element 1 is preferred and element 0 is the fallback when it is empty
        domain_names = [ptdom[1] if ptdom[1] != "" else ptdom[0] for ptdom in protein_domains]
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.PROTEIN_DOMAIN,
            description="is predicted to encode a protein with the following " + dom_word + ": " +
                        concatenate_words_with_oxford_comma(domain_names))
コード例 #2
0
class TestGOModule(unittest.TestCase):
    """Tests for loading the GO ontology and GO associations into a DataManager from local test files."""

    def setUp(self):
        """Create a DataManager and load the test config, GO ontology and GO associations from disk."""
        logging.basicConfig(
            filename=None,
            level="ERROR",
            format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
        logger.info("Starting DataManager tests")
        self.this_dir = os.path.split(__file__)[0]
        self.conf_parser = GenedescConfigParser(
            os.path.join(self.this_dir, os.path.pardir, "tests",
                         "config_test.yml"))
        self.df = DataManager(do_relations=None,
                              go_relations=["subClassOf", "BFO:0000050"])
        logger.info("Loading go ontology from file")
        self.df.load_ontology_from_file(
            ontology_type=DataType.GO,
            ontology_url="file://" +
            os.path.join(self.this_dir, "data", "go_gd_test.obo"),
            ontology_cache_path=os.path.join(self.this_dir, "cache",
                                             "go_gd_test.obo"),
            config=self.conf_parser)
        logger.info("Loading go associations from file")
        self.df.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url="file://" + os.path.join(
                self.this_dir, "data", "gene_association_1.7.wb.partial"),
            associations_cache_path=os.path.join(
                self.this_dir, "cache", "gene_association_1.7.wb.partial"),
            config=self.conf_parser)

    def test_ontology_exists(self):
        """The ontology is loaded and GO:0009987 is among the parents of GO:0000075."""
        self.assertIsNotNone(self.df.go_ontology)
        self.assertIn("GO:0009987",
                      self.df.go_ontology.parents("GO:0000075"))

    def test_annotations_exist(self):
        """The associations are loaded and the test gene has at least one GO annotation."""
        self.assertIsNotNone(self.df.go_associations)
        annotations = self.df.get_annotations_for_gene(
            gene_id="WB:WBGene00000001",
            annot_type=DataType.GO,
            include_obsolete=False,
            include_negative_results=False,
            priority_list=self.conf_parser.get_annotations_priority(
                module=Module.GO))
        self.assertGreater(len(annotations), 0)

    def test_rename_terms(self):
        """Searching the ontology for any key of the RENAME_TERMS property returns no result."""
        rename_map = self.conf_parser.get_module_property(
            module=Module.GO, prop=ConfigModuleProperty.RENAME_TERMS)
        for term in rename_map:
            # assert per term so that a failure reports which term is still found
            self.assertEqual(
                len(self.df.go_ontology.search(term)), 0,
                msg="renamed term still found in ontology: " + str(term))

    def test_exclude_terms(self):
        """Placeholder: contains no assertions yet, so it always passes."""
        pass
コード例 #3
0
    def __init__(self,
                 gene_id: str,
                 module: Module,
                 data_manager: DataManager,
                 config: GenedescConfigParser,
                 limit_to_group: str = None,
                 humans: bool = False):
        """Build a sentence generator for one gene and one module.

        The constructor reads the ontology, the gene annotations and the module settings, instantiates the
        trimming algorithm selected in the configuration, and finally groups the terms via ``set_terms_groups``.

        Args:
            gene_id (str): id of the gene whose annotations are retrieved from the data manager
            module (Module): module that determines the data type and the module-specific configuration
            data_manager (DataManager): source of the ontology, annotations, associations and slim set
            config (GenedescConfigParser): configuration from which the module options are read
            limit_to_group (str): evidence group forwarded to ``set_terms_groups``
            humans (bool): forwarded to the prefix/postfix sentence map lookup and to ``set_terms_groups``
        """
        self.ontology = data_manager.get_ontology(get_data_type_from_module(module))
        self.config = config
        self.module = module
        self.terms_already_covered = set()
        self.terms_groups = defaultdict(lambda: defaultdict(set))
        self.evidence_groups_priority_list = config.get_evidence_groups_priority_list(module=module)
        self.prepostfix_sentences_map = config.get_prepostfix_sentence_map(module=module, humans=humans)

        # annotations of the gene for the data type associated with the module
        annot_type = get_data_type_from_module(module)
        annot_priority = config.get_annotations_priority(module=module)
        self.gene_annots = data_manager.get_annotations_for_gene(
            gene_id=gene_id, annot_type=annot_type, priority_list=annot_priority)

        # the trimming class is chosen by the TRIMMING_ALGORITHM property of the module
        trimming_algorithm = config.get_module_property(
            module=module, prop=ConfigModuleProperty.TRIMMING_ALGORITHM)
        trimmer_class = CONF_TO_TRIMMING_CLASS[trimming_algorithm]
        ontology = self.ontology
        associations = data_manager.get_associations(get_data_type_from_module(module))
        excluded_terms = config.get_module_property(module=module, prop=ConfigModuleProperty.EXCLUDE_TERMS)
        slim_bonus_perc = config.get_module_property(module=module, prop=ConfigModuleProperty.SLIM_BONUS_PERC)
        slim_terms = data_manager.get_slim(module=module)
        self.trimmer = trimmer_class(ontology=ontology,
                                     annotations=associations,
                                     nodeids_blacklist=excluded_terms,
                                     slim_terms_ic_bonus_perc=slim_bonus_perc,
                                     slim_set=slim_terms)

        self.set_terms_groups(module, config, limit_to_group, humans)
コード例 #4
0
class TestGOModule(unittest.TestCase):
    """Tests for DataManager: loading GO data from local files and setting data programmatically."""

    def setUp(self):
        """Create a DataManager and load the test config, GO ontology and GO associations from disk."""
        logging.basicConfig(filename=None, level="ERROR", format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
        logger.info("Starting DataManager tests")
        self.this_dir = os.path.split(__file__)[0]
        self.conf_parser = GenedescConfigParser(os.path.join(self.this_dir, os.path.pardir, "tests", "config_test.yml"))
        self.df = DataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"])
        logger.info("Loading go ontology from file")
        self.df.load_ontology_from_file(
            ontology_type=DataType.GO,
            ontology_url="file://" + os.path.join(self.this_dir, "data", "go_gd_test.obo"),
            ontology_cache_path=os.path.join(self.this_dir, "cache", "go_gd_test.obo"),
            config=self.conf_parser)
        logger.info("Loading go associations from file")
        self.df.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url="file://" + os.path.join(self.this_dir, "data", "gene_association_1.7.wb.partial"),
            associations_cache_path=os.path.join(self.this_dir, "cache", "gene_association_1.7.wb.partial"),
            config=self.conf_parser)

    def test_ontology_exists(self):
        """The ontology is loaded and GO:0009987 is among the parents of GO:0000075."""
        self.assertIsNotNone(self.df.go_ontology)
        self.assertIn("GO:0009987", self.df.go_ontology.parents("GO:0000075"))

    def test_annotations_exist(self):
        """The associations are loaded and the test gene has at least one GO annotation."""
        self.assertIsNotNone(self.df.go_associations)
        annotations = self.df.get_annotations_for_gene(
            gene_id="WB:WBGene00000001", annot_type=DataType.GO,
            include_obsolete=False, include_negative_results=False,
            priority_list=self.conf_parser.get_annotations_priority(module=Module.GO))
        self.assertGreater(len(annotations), 0)

    def test_rename_terms(self):
        """Searching the ontology for any key of the RENAME_TERMS property returns no result."""
        rename_map = self.conf_parser.get_module_property(module=Module.GO, prop=ConfigModuleProperty.RENAME_TERMS)
        for term in rename_map:
            # assert per term so that a failure reports which term is still found
            self.assertEqual(len(self.df.go_ontology.search(term)), 0,
                             msg="renamed term still found in ontology: " + str(term))

    def test_exclude_terms(self):
        """No annotation returned for the test gene points to GO:0008286."""
        test_annot = self.df.get_annotations_for_gene("WB:WBGene00000001", annot_type=DataType.GO)
        annotated_term_ids = [annot["object"]["id"] for annot in test_annot]
        self.assertNotIn("GO:0008286", annotated_term_ids)

    def test_download_gz_file(self):
        """Fetching a .gz source returns the cache path without the .gz extension."""
        gz_file_name = "c_elegans.PRJNA13758.WS273.geneIDs.txt.gz"
        test_file = self.df._get_cached_file(
            cache_path=os.path.join(self.this_dir, "cache", gz_file_name),
            file_source_url="file://" + os.path.join(self.this_dir, "data", gz_file_name))
        self.assertEqual(test_file, os.path.join(self.this_dir, "cache", "c_elegans.PRJNA13758.WS273.geneIDs.txt"))

    def test_gene_data_functions(self):
        """get_gene_data returns the expected number of genes for each combination of include flags."""
        # NOTE(review): the third and fourth Gene arguments look like the dead / pseudo flags - confirm
        self.df.set_gene_data(gene_data=[Gene("1", "gene1", True, False), Gene("2", "gene2", False, True),
                                         Gene("3", "gene3", False, False), Gene("4", "gene4", True, True)])
        # (include_dead_genes, include_pseudo_genes, expected number of genes)
        expected_counts = (
            (False, False, 1),
            (True, False, 2),
            (False, True, 2),
            (True, True, 4),
        )
        for include_dead, include_pseudo, expected in expected_counts:
            genes = list(self.df.get_gene_data(include_dead_genes=include_dead,
                                               include_pseudo_genes=include_pseudo))
            self.assertEqual(len(genes), expected)

    def test_get_human_gene_props(self):
        """get_human_gene_props returns a non-empty collection."""
        human_gene_props = self.df.get_human_gene_props()
        self.assertGreater(len(human_gene_props), 0)

    def test_get_ensembl_hgnc_ids_map(self):
        """get_ensembl_hgnc_ids_map returns a non-empty collection."""
        ensembl_hgnc_ids_map = self.df.get_ensembl_hgnc_ids_map()
        self.assertGreater(len(ensembl_hgnc_ids_map), 0)

    def test_set_ontology(self):
        """An ontology built in memory is stored with the same nodes after set_ontology."""
        ontology = OntologyFactory().create()
        for i in range(4):
            ontology.add_node(i, 'node' + str(i))
        # nodes 1, 2 and 3 all get node 0 as parent
        for child in (1, 2, 3):
            ontology.add_parent(child, 0)
        self.df.set_ontology(ontology_type=DataType.GO, ontology=ontology, config=self.conf_parser)
        self.assertEqual(list(self.df.go_ontology.nodes()), list(ontology.nodes()))

    def test_set_associations(self):
        """Associations built in memory are stored (truthy) after set_associations."""
        associations = [
            DataManager.create_annotation_record("", "1", "a", "protein_coding", "001", "GO:0019901",
                                                 "", "F", "EXP", None, "WB", ""),
            DataManager.create_annotation_record("", "2", "b", "protein_coding", "001", "GO:0005515",
                                                 "", "F", "EXP", None, "WB", ""),
        ]
        assocs = AssociationSetFactory().create_from_assocs(assocs=associations, ontology=self.df.go_ontology)
        self.df.set_associations(associations_type=DataType.GO, associations=assocs, config=self.conf_parser)
        self.assertTrue(self.df.go_associations)

    def test_remap_associations(self):
        """An annotation to GO:0018996 is stored as an annotation to GO:0042303 after set_associations."""
        associations = [
            DataManager.create_annotation_record("", "1", "a", "protein_coding", "001", "GO:0018996",
                                                 "", "F", "EXP", None, "WB", ""),
        ]
        assocs = AssociationSetFactory().create_from_assocs(assocs=associations, ontology=self.df.go_ontology)
        self.df.set_associations(associations_type=DataType.GO, associations=assocs, config=self.conf_parser)
        self.assertEqual(self.df.go_associations.associations_by_subj["1"][0]["object"]["id"], "GO:0042303")
コード例 #5
0
    def __init__(self,
                 gene_id: str,
                 module: Module,
                 data_manager: DataManager,
                 config: GenedescConfigParser,
                 limit_to_group: str = None,
                 humans: bool = False):
        """Initialize the sentence generator and group the gene's annotated terms.

        The annotations of the gene are read from the data manager and each annotated term id is stored in
        ``self.terms_groups`` under ``(aspect, qualifier)`` and then under its evidence group. Annotations
        whose evidence code is not mapped to a group (or not selected by ``limit_to_group``) are skipped.

        Args:
            gene_id (str): id of the gene whose annotations are retrieved
            module (Module): module that selects the ontology and the annotation type
            data_manager (DataManager): source of the ontologies and of the gene annotations
            config (GenedescConfigParser): configuration from which the module options are read
            limit_to_group (str): limit the evidence codes to the specified group; None keeps all groups
            humans (bool): forwarded to the prefix/postfix sentence map lookups
        """
        # NOTE: for a module not listed below, self.ontology is left unset and annot_type stays None
        annot_type = None
        # compare against the Module class constants; the original read DO_BIOMARKER through the
        # ``module`` argument, which depends on the instance that happens to be passed in
        if module in (Module.DO_ORTHOLOGY, Module.DO_EXPERIMENTAL, Module.DO_BIOMARKER):
            self.ontology = data_manager.do_ontology
            annot_type = DataType.DO
        elif module == Module.GO:
            self.ontology = data_manager.go_ontology
            annot_type = DataType.GO
        elif module == Module.EXPRESSION:
            self.ontology = data_manager.expression_ontology
            annot_type = DataType.EXPR
        self.evidence_groups_priority_list = config.get_evidence_groups_priority_list(module=module)
        self.prepostfix_sentences_map = config.get_prepostfix_sentence_map(module=module, humans=humans)
        self.terms_groups = defaultdict(lambda: defaultdict(set))
        ev_codes_groups_maps = config.get_evidence_codes_groups_map(module=module)
        annotations = data_manager.get_annotations_for_gene(
            gene_id=gene_id,
            annot_type=annot_type,
            priority_list=config.get_annotations_priority(module=module))
        self.annotations = annotations
        self.module = module
        self.data_manager = data_manager
        self.annot_type = annot_type
        # keep only the evidence codes whose group matches limit_to_group (all of them when it is None)
        evidence_codes_groups_map = {
            evcode: group
            for evcode, group in ev_codes_groups_maps.items()
            if limit_to_group is None or limit_to_group in group
        }
        prepostfix_special_cases_sent_map = config.get_prepostfix_sentence_map(
            module=module, special_cases_only=True, humans=humans)
        for annotation in annotations:
            evidence_code = annotation["evidence"]["type"]
            if evidence_code not in evidence_codes_groups_map:
                continue
            aspect = annotation["aspect"]
            base_group = evidence_codes_groups_map[evidence_code]
            ev_group = base_group
            qualifier = "_".join(sorted(annotation["qualifiers"])) if "qualifiers" in annotation else ""
            special_cases_key = (aspect, base_group, qualifier)
            if prepostfix_special_cases_sent_map and special_cases_key in prepostfix_special_cases_sent_map:
                term_label = self.ontology.label(annotation["object"]["id"], id_if_null=True)
                for special_case in prepostfix_special_cases_sent_map[special_cases_key]:
                    # special_case[1] is matched literally against the beginning of the term label
                    if re.match(re.escape(special_case[1]), term_label):
                        # the first matching special case gets its own group, named after the base group
                        ev_group = base_group + str(special_case[0])
                        if ev_group not in self.evidence_groups_priority_list:
                            # NOTE(review): this inserts in place into the list returned by config -
                            # confirm that the config object does not share that list between callers
                            self.evidence_groups_priority_list.insert(
                                self.evidence_groups_priority_list.index(base_group) + 1, ev_group)
                        break
            self.terms_groups[(aspect, qualifier)][ev_group].add(annotation["object"]["id"])