Esempio n. 1
0
def load_data(organism, conf_parser: GenedescConfigParser):
    """Create the data managers for a WormBase organism and load their data.

    Args:
        organism: key of the organism in the mapping returned by
            ``conf_parser.get_wb_organisms_info()``; the value ``"c_elegans"``
            additionally triggers loading of the human-orthologs GO data.
        conf_parser: configuration parser providing the URLs, the cache
            directory and the organisms info.

    Returns:
        tuple: ``(df, sister_df, df_agr)``. ``df`` is the main-species manager,
        ``sister_df`` the sister-species manager (``None`` when no non-empty
        ``main_sister_species`` entry exists for the organism) and ``df_agr``
        the human-orthologs GO manager (``None`` unless ``organism`` is
        ``"c_elegans"``).
    """
    log = logging.getLogger("WB Gene Description Pipeline - Data loader")
    go_relation_ids = ("subClassOf", "BFO:0000050")

    def agr_cache_file(file_name):
        # Location of a human-orthologs file inside the configured cache dir.
        return os.path.join(conf_parser.get_cache_dir(), "wormbase_agr_human", file_name)

    organisms_info = conf_parser.get_wb_organisms_info()
    # Each manager receives its own list instance, as separate literals would.
    df = WBDataManager(species=organism, do_relations=None, go_relations=list(go_relation_ids),
                       config=conf_parser)

    if organism == "c_elegans":
        df_agr = DataManager(go_relations=list(go_relation_ids), do_relations=None)
        df_agr.load_ontology_from_file(
            ontology_type=DataType.GO,
            ontology_url=conf_parser.get_wb_human_orthologs_go_ontology(),
            ontology_cache_path=agr_cache_file("go_ontology.obo"),
            config=conf_parser)
        df_agr.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url=conf_parser.get_wb_human_orthologs_go_associations(),
            associations_cache_path=agr_cache_file("go_assoc.daf.gz"),
            config=conf_parser)
    else:
        df_agr = None

    organism_info = organisms_info[organism]
    if "main_sister_species" in organism_info:
        sister_species = organism_info["main_sister_species"]
    else:
        sister_species = None

    if sister_species:
        sister_df = WBDataManager(species=sister_species, do_relations=None,
                                  go_relations=list(go_relation_ids), config=conf_parser)
        log.info("Loading GO data for sister species")
        sister_df.load_ontology_from_file(
            ontology_type=DataType.GO,
            ontology_url=sister_df.go_ontology_url,
            ontology_cache_path=sister_df.go_ontology_cache_path,
            config=conf_parser)
        sister_df.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url=sister_df.go_associations_url,
            associations_cache_path=sister_df.go_associations_cache_path,
            config=conf_parser)
    else:
        sister_df = None

    log.info("Loading all data for main species")
    df.load_all_data_from_file()
    return df, sister_df, df_agr
class TestGOModule(unittest.TestCase):
    """Checks on the GO ontology and GO associations loaded by ``DataManager``."""

    def setUp(self):
        """Load the test GO ontology and the partial WB associations file.

        Sets ``self.this_dir``, ``self.conf_parser`` and ``self.df``.
        """
        logging.basicConfig(
            filename=None,
            level="ERROR",
            format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
        logger.info("Starting DataManager tests")
        self.this_dir = os.path.split(__file__)[0]
        self.conf_parser = GenedescConfigParser(
            os.path.join(self.this_dir, os.path.pardir, "tests",
                         "config_test.yml"))
        self.df = DataManager(do_relations=None,
                              go_relations=["subClassOf", "BFO:0000050"])
        logger.info("Loading go ontology from file")
        self.df.load_ontology_from_file(
            ontology_type=DataType.GO,
            ontology_url="file://" +
            os.path.join(self.this_dir, "data", "go_gd_test.obo"),
            ontology_cache_path=os.path.join(self.this_dir, "cache",
                                             "go_gd_test.obo"),
            config=self.conf_parser)
        logger.info("Loading go associations from file")
        self.df.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url="file://" + os.path.join(
                self.this_dir, "data", "gene_association_1.7.wb.partial"),
            associations_cache_path=os.path.join(
                self.this_dir, "cache", "gene_association_1.7.wb.partial"),
            config=self.conf_parser)

    def test_ontology_exists(self):
        """The ontology is loaded and GO:0009987 is a parent of GO:0000075."""
        self.assertIsNotNone(self.df.go_ontology)
        self.assertIn("GO:0009987",
                      self.df.go_ontology.parents("GO:0000075"))

    def test_annotations_exist(self):
        """Associations are loaded and the test gene has at least one."""
        self.assertIsNotNone(self.df.go_associations)
        annotations = self.df.get_annotations_for_gene(
            gene_id="WB:WBGene00000001",
            annot_type=DataType.GO,
            include_obsolete=False,
            include_negative_results=False,
            priority_list=self.conf_parser.get_annotations_priority(
                module=Module.GO))
        self.assertGreater(len(annotations), 0)

    def test_rename_terms(self):
        """Searching the ontology for any RENAME_TERMS key yields no hits."""
        rename_map = self.conf_parser.get_module_property(
            module=Module.GO, prop=ConfigModuleProperty.RENAME_TERMS)
        for term in list(rename_map.keys()):
            # subTest reports every offending key instead of a bare "False".
            with self.subTest(term=term):
                self.assertEqual(len(self.df.go_ontology.search(term)), 0)

    def test_exclude_terms(self):
        """Placeholder: no exclusion check is implemented here."""
        # Report the missing check as skipped rather than as a vacuous pass.
        self.skipTest("exclude_terms check is not implemented")
Esempio n. 3
0
class TestDescriptionsGenerator(unittest.TestCase):
    """Tests for building a gene description from GO module sentences."""

    def setUp(self):
        """Load the test GO ontology and the partial FB associations file.

        Sets ``self.this_dir``, ``self.conf_parser`` and ``self.df``.
        """
        log_format = '%(asctime)s - %(name)s - %(levelname)s: %(message)s'
        logger.info("Starting Ontology Tools tests")
        self.this_dir = os.path.split(__file__)[0]
        config_path = os.path.join(self.this_dir, os.path.pardir, "tests", "config_test.yml")
        self.conf_parser = GenedescConfigParser(config_path)
        self.df = DataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"])
        logger.info("Loading go ontology from file")
        logging.basicConfig(filename=None, level="ERROR", format=log_format)
        self.df.load_ontology_from_file(
            ontology_type=DataType.GO,
            ontology_url="file://" + os.path.join(self.this_dir, "data", "go_gd_test.obo"),
            ontology_cache_path=os.path.join(self.this_dir, "cache", "go_gd_test.obo"),
            config=self.conf_parser)
        logger.info("Loading go associations from file")
        self.df.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url="file://" + os.path.join(self.this_dir, "data", "gene_association_1.7.fb.partial"),
            associations_cache_path=os.path.join(self.this_dir, "cache", "gene_association_1.7.fb.partial"),
            config=self.conf_parser)
        # NOTE(review): basicConfig is documented to do nothing once the root
        # logger has handlers, so this second call probably has no effect -
        # confirm before relying on the INFO level.
        logging.basicConfig(filename=None, level="INFO", format=log_format)

    def test_set_or_extend_module_description_and_final_stats(self):
        """Set the GO process description with and without the gene name.

        NOTE(review): ``assertTrue`` takes the failure message as its second
        argument, so the calls below only check that the description is
        truthy; the expected sentences are never compared with it. Confirm
        the exact expected text before switching to ``assertEqual``.
        """
        gene_id = "FB:FBgn0027655"
        text_without_name = ("Is involved in several processes, including axo-dendritic transport, "
                             "establishment of mitotic spindle orientation, and positive regulation "
                             "of extent of heterochromatin assembly")
        text_with_name = ("Test gene is involved in several processes, including axo-dendritic "
                          "transport, establishment of mitotic spindle orientation, and positive "
                          "regulation of extent of heterochromatin assembly")

        unnamed_desc = GeneDescription(gene_id=gene_id, gene_name="Test gene", add_gene_name=False,
                                       config=self.conf_parser)
        sentence_generator = OntologySentenceGenerator(gene_id=gene_id, module=Module.GO,
                                                       data_manager=self.df, config=self.conf_parser)
        process_sentences = sentence_generator.get_module_sentences(
            aspect='P', qualifier='', merge_groups_with_same_prefix=True, keep_only_best_group=True)
        unnamed_desc.set_or_extend_module_description_and_final_stats(
            module=Module.GO_PROCESS, module_sentences=process_sentences)
        self.assertTrue(unnamed_desc.description, text_without_name)

        named_desc = GeneDescription(gene_id=gene_id, gene_name="Test gene", add_gene_name=True,
                                     config=self.conf_parser)
        named_desc.set_or_extend_module_description_and_final_stats(
            module=Module.GO_PROCESS, module_sentences=process_sentences)
        self.assertTrue(named_desc.description, text_with_name)
Esempio n. 4
0
class TestOntologyTools(unittest.TestCase):
    """Compares description coverage of the "lca" and "ic" trimming algorithms."""

    def setUp(self):
        """Load the test GO ontology and the partial WB associations file.

        Sets ``self.this_dir``, ``self.conf_parser`` and ``self.df``.
        """
        self.this_dir = os.path.split(__file__)[0]
        self.conf_parser = GenedescConfigParser(os.path.join(self.this_dir, "config_test.yml"))
        self.df = DataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"])
        logging.basicConfig(filename=None, level="ERROR", format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
        self.df.load_ontology_from_file(ontology_type=DataType.GO, ontology_url="file://" + os.path.join(
            self.this_dir, "data", "go_gd_test.obo"),
                                        ontology_cache_path=os.path.join(self.this_dir, "cache", "go_gd_test.obo"),
                                        config=self.conf_parser)
        logger.info("Loading go associations from file")
        self.df.load_associations_from_file(associations_type=DataType.GO, associations_url="file://" + os.path.join(
            self.this_dir, "data", "gene_association_1.7.wb.partial"),
                                            associations_cache_path=os.path.join(self.this_dir, "cache",
                                                                                 "gene_association_1.7.wb.partial"),
                                            config=self.conf_parser)

    @staticmethod
    def get_associations(gene_id, term_ids, qualifiers, aspect, ecode):
        """Return one annotation record per term id for the given gene.

        Args:
            gene_id: id of the annotated gene.
            term_ids: iterable of ontology term ids, one record is built for each.
            qualifiers: qualifiers passed unchanged to every record.
            aspect: aspect code passed unchanged to every record.
            ecode: evidence code passed unchanged to every record.

        Returns:
            list: records created by ``DataManager.create_annotation_record``.
        """
        return [DataManager.create_annotation_record(source_line="", gene_id=gene_id, gene_symbol="", gene_type="gene",
                                                     taxon_id="", object_id=term_id, qualifiers=qualifiers,
                                                     aspect=aspect, ecode=ecode, references="", prvdr="WB",
                                                     date="") for term_id in term_ids]

    def _set_trimming_algorithm(self, algorithm):
        """Select the trimming algorithm for both the GO and expression modules."""
        self.conf_parser.config["go_sentences_options"]["trimming_algorithm"] = algorithm
        self.conf_parser.config["expression_sentences_options"]["trimming_algorithm"] = algorithm

    def _set_expression_associations(self, gene, term_ids):
        """Replace the expression associations with records for ``term_ids``."""
        associations = self.get_associations(gene.id, term_ids, ["Verified"], "A", "IDA")
        self.df.expression_associations = AssociationSetFactory().create_from_assocs(
            assocs=associations, ontology=self.df.expression_ontology)

    def _describe_gene(self, gene, gene_name, algorithm):
        """Build a description of ``gene`` with ``algorithm`` and compute its stats."""
        self._set_trimming_algorithm(algorithm)
        gene_desc = GeneDescription(gene_id=gene.id, config=self.conf_parser, gene_name=gene_name,
                                    add_gene_name=False)
        set_gene_ontology_module(dm=self.df, conf_parser=self.conf_parser, gene_desc=gene_desc, gene=gene)
        set_expression_module(self.df, self.conf_parser, gene_desc, gene)
        gene_desc.stats.calculate_stats(data_manager=self.df)
        return gene_desc

    def test_trimming_lca(self):
        """For each test gene, "lca" coverage is at least the "ic" coverage."""
        # The algorithm is set to "ic" before the ontology is loaded with this config.
        self._set_trimming_algorithm("ic")
        self.df.load_ontology_from_file(ontology_type=DataType.EXPR, ontology_url="file://" + os.path.join(
            self.this_dir, "data", "anatomy_ontology.WS274.obo"),
                                        ontology_cache_path=os.path.join(self.this_dir, "cache",
                                                                         "anatomy_ontology.WS274.obo"),
                                        config=self.conf_parser)
        logger.info("Loading expression associations from file")
        expression_options = self.conf_parser.config["expression_sentences_options"]
        expression_options["max_num_terms"] = 5
        expression_options["trim_min_distance_from_root"]["A"] = 4
        expression_options["remove_children_if_parent_is_present"] = False

        # (gene id, gene name, expression term ids) for each comparison.
        cases = [
            ("WB:WBGene00000018", "abl-1",
             ["WBbt:0006796", "WBbt:0006759", "WBbt:0005300", "WBbt:0008598",
              "WBbt:0003681", "WBbt:0005829", "WBbt:0003927", "WBbt:0006751"]),
            ("WB:WBGene00000022", "aat-1",
             ["WBbt:0005828", "WBbt:0006751", "WBbt:0005439", "WBbt:0005788",
              "WBbt:0006749", "WBbt:0005300", "WBbt:0005735", "WBbt:0005747",
              "WBbt:0005772", "WBbt:0005776", "WBbt:0005812", "WBbt:0005741",
              "WBbt:0005799", "WBbt:0003681"]),
            ("WB:WBGene00000044", "acr-5",
             ['WBbt:0003679', 'WBbt:0006759', 'WBbt:0005336', 'WBbt:0006751',
              'WBbt:0005300', 'WBbt:0005274', 'WBbt:0005741', 'WBbt:0006749',
              'WBbt:0005735']),
        ]
        for case_number, (gene_id, gene_name, term_ids) in enumerate(cases, start=1):
            gene = Gene(id=gene_id, name=gene_name, dead=False, pseudo=False)
            self._set_expression_associations(gene, term_ids)
            gene_desc_lca = self._describe_gene(gene, gene_name, "lca")
            if case_number == 1:
                # Run once, only after the first "lca" description has been
                # computed and before the first "ic" one; the position is kept
                # because the first "lca" run may depend on the ontologies'
                # state before this call.
                set_ic_ontology_struct(ontology=self.df.go_ontology, relations=self.df.go_relations)
                set_ic_ontology_struct(ontology=self.df.expression_ontology, relations=self.df.expr_relations)
            gene_desc_ic = self._describe_gene(gene, gene_name, "ic")
            self.assertGreaterEqual(gene_desc_lca.stats.coverage_percentage,
                                    gene_desc_ic.stats.coverage_percentage, str(case_number))
class TestGOModule(unittest.TestCase):
    """Tests for ``DataManager`` using the GO test ontology and WB associations."""

    def setUp(self):
        """Load the test GO ontology and the partial WB associations file.

        Sets ``self.this_dir``, ``self.conf_parser`` and ``self.df``.
        """
        logging.basicConfig(filename=None, level="ERROR", format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
        logger.info("Starting DataManager tests")
        self.this_dir = os.path.split(__file__)[0]
        self.conf_parser = GenedescConfigParser(os.path.join(self.this_dir, os.path.pardir, "tests", "config_test.yml"))
        self.df = DataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"])
        logger.info("Loading go ontology from file")
        self.df.load_ontology_from_file(ontology_type=DataType.GO, ontology_url="file://" + os.path.join(
            self.this_dir, "data", "go_gd_test.obo"), ontology_cache_path=os.path.join(self.this_dir, "cache",
                                                                                       "go_gd_test.obo"),
                                        config=self.conf_parser)
        logger.info("Loading go associations from file")
        self.df.load_associations_from_file(associations_type=DataType.GO, associations_url="file://" + os.path.join(
            self.this_dir, "data", "gene_association_1.7.wb.partial"),
                                            associations_cache_path=os.path.join(self.this_dir, "cache",
                                                                                 "gene_association_1.7.wb.partial"),
                                            config=self.conf_parser)

    def test_ontology_exists(self):
        """The ontology is loaded and GO:0009987 is a parent of GO:0000075."""
        self.assertIsNotNone(self.df.go_ontology)
        self.assertIn("GO:0009987", self.df.go_ontology.parents("GO:0000075"))

    def test_annotations_exist(self):
        """Associations are loaded and the test gene has at least one."""
        self.assertIsNotNone(self.df.go_associations)
        annotations = self.df.get_annotations_for_gene(
            gene_id="WB:WBGene00000001", annot_type=DataType.GO,
            include_obsolete=False, include_negative_results=False,
            priority_list=self.conf_parser.get_annotations_priority(module=Module.GO))
        self.assertGreater(len(annotations), 0)

    def test_rename_terms(self):
        """Searching the ontology for any RENAME_TERMS key yields no hits."""
        rename_map = self.conf_parser.get_module_property(module=Module.GO, prop=ConfigModuleProperty.RENAME_TERMS)
        for term in list(rename_map.keys()):
            # subTest reports every offending key instead of a bare "False".
            with self.subTest(term=term):
                self.assertEqual(len(self.df.go_ontology.search(term)), 0)

    def test_exclude_terms(self):
        """GO:0008286 is not among the annotations returned for the test gene."""
        test_annot = self.df.get_annotations_for_gene("WB:WBGene00000001", annot_type=DataType.GO)
        self.assertNotIn("GO:0008286", [annot["object"]["id"] for annot in test_annot])

    def test_download_gz_file(self):
        """``_get_cached_file`` on a ``.gz`` source returns the cache path without ``.gz``."""
        gz_name = "c_elegans.PRJNA13758.WS273.geneIDs.txt.gz"
        test_file = self.df._get_cached_file(
            cache_path=os.path.join(self.this_dir, "cache", gz_name),
            file_source_url="file://" + os.path.join(self.this_dir, "data", gz_name))
        self.assertEqual(test_file, os.path.join(self.this_dir, "cache", "c_elegans.PRJNA13758.WS273.geneIDs.txt"))

    def test_gene_data_functions(self):
        """``get_gene_data`` honours the dead/pseudo inclusion flags."""
        self.df.set_gene_data(gene_data=[Gene("1", "gene1", True, False), Gene("2", "gene2", False, True),
                                         Gene("3", "gene3", False, False), Gene("4", "gene4", True, True)])
        # (include_dead_genes, include_pseudo_genes, expected number of genes)
        expectations = [(False, False, 1), (True, False, 2), (False, True, 2), (True, True, 4)]
        for include_dead, include_pseudo, expected_count in expectations:
            with self.subTest(include_dead_genes=include_dead, include_pseudo_genes=include_pseudo):
                genes = list(self.df.get_gene_data(include_dead_genes=include_dead,
                                                   include_pseudo_genes=include_pseudo))
                self.assertEqual(len(genes), expected_count)

    def test_get_human_gene_props(self):
        """``get_human_gene_props`` returns a non-empty collection."""
        human_gene_props = self.df.get_human_gene_props()
        self.assertGreater(len(human_gene_props), 0)

    def test_get_ensembl_hgnc_ids_map(self):
        """``get_ensembl_hgnc_ids_map`` returns a non-empty collection."""
        ensembl_hgnc_ids_map = self.df.get_ensembl_hgnc_ids_map()
        self.assertGreater(len(ensembl_hgnc_ids_map), 0)

    def test_set_ontology(self):
        """After ``set_ontology`` the manager exposes the same nodes as the input."""
        ontology = OntologyFactory().create()
        for i in range(4):
            ontology.add_node(i, 'node' + str(i))
        # Nodes 1-3 all get node 0 as parent.
        for child in (1, 2, 3):
            ontology.add_parent(child, 0)
        self.df.set_ontology(ontology_type=DataType.GO, ontology=ontology, config=self.conf_parser)
        self.assertEqual(list(self.df.go_ontology.nodes()), list(ontology.nodes()))

    def test_set_associations(self):
        """``set_associations`` leaves a truthy ``go_associations`` on the manager."""
        associations = [
            DataManager.create_annotation_record("", "1", "a", "protein_coding", "001", "GO:0019901",
                                                 "", "F", "EXP", None, "WB", ""),
            DataManager.create_annotation_record("", "2", "b", "protein_coding", "001", "GO:0005515",
                                                 "", "F", "EXP", None, "WB", ""),
        ]
        assocs = AssociationSetFactory().create_from_assocs(assocs=associations, ontology=self.df.go_ontology)
        self.df.set_associations(associations_type=DataType.GO, associations=assocs, config=self.conf_parser)
        self.assertTrue(self.df.go_associations)

    def test_remap_associations(self):
        """An annotation to GO:0018996 is stored under GO:0042303 after ``set_associations``."""
        associations = [
            DataManager.create_annotation_record("", "1", "a", "protein_coding", "001", "GO:0018996",
                                                 "", "F", "EXP", None, "WB", ""),
        ]
        assocs = AssociationSetFactory().create_from_assocs(assocs=associations, ontology=self.df.go_ontology)
        self.df.set_associations(associations_type=DataType.GO, associations=assocs, config=self.conf_parser)
        self.assertEqual(self.df.go_associations.associations_by_subj["1"][0]["object"]["id"], "GO:0042303")
class TestOntologyTools(unittest.TestCase):

    def load_go_ontology(self):
        """Set up ``self.df`` with the test GO ontology and WB GO associations.

        Also sets ``self.this_dir`` and ``self.conf_parser``.
        """
        logger.info("Starting Ontology Tools tests")
        self.this_dir = os.path.split(__file__)[0]
        config_path = os.path.join(self.this_dir, os.path.pardir, "tests", "config_test.yml")
        self.conf_parser = GenedescConfigParser(config_path)
        self.df = DataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"])
        logger.info("Loading go ontology from file")
        logging.basicConfig(filename=None, level="ERROR", format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
        self.df.load_ontology_from_file(
            ontology_type=DataType.GO,
            ontology_url="file://" + os.path.join(self.this_dir, "data", "go_gd_test.obo"),
            ontology_cache_path=os.path.join(self.this_dir, "cache", "go_gd_test.obo"),
            config=self.conf_parser)
        logger.info("Loading go associations from file")
        self.df.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url="file://" + os.path.join(self.this_dir, "data", "gene_association_1.7.wb.partial"),
            associations_cache_path=os.path.join(self.this_dir, "cache", "gene_association_1.7.wb.partial"),
            config=self.conf_parser)

    def load_do_ontology(self):
        """Set up ``self.df`` with the test DO ontology (no associations).

        Also sets ``self.this_dir`` and ``self.conf_parser``.
        """
        logger.info("Starting Ontology Tools tests")
        self.this_dir = os.path.split(__file__)[0]
        config_path = os.path.join(self.this_dir, os.path.pardir, "tests", "config_test.yml")
        self.conf_parser = GenedescConfigParser(config_path)
        self.df = DataManager(do_relations=None)
        logger.info("Loading do ontology from file")
        logging.basicConfig(filename=None, level="ERROR", format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
        self.df.load_ontology_from_file(
            ontology_type=DataType.DO,
            ontology_url="file://" + os.path.join(self.this_dir, "data", "doid.obo"),
            ontology_cache_path=os.path.join(self.this_dir, "cache", "doid.obo"),
            config=self.conf_parser)

    def test_get_common_ancestors(self):
        """Common ancestors are found, and an excluded term is not among them."""
        self.load_go_ontology()
        first_generator = OntologySentenceGenerator(gene_id="WB:WBGene00000912", module=Module.GO,
                                                    data_manager=self.df, config=self.conf_parser)
        experimental_terms = first_generator.terms_groups[('P', '')]["EXPERIMENTAL"]
        ancestors = get_all_common_ancestors(experimental_terms, first_generator.ontology)
        self.assertTrue(len(ancestors) > 0, "Common ancestors not found")

        # Start from every loaded association, then add four process
        # annotations for a second gene.
        all_associations = []
        for subject_associations in self.df.go_associations.associations_by_subj.values():
            for association in subject_associations:
                all_associations.append(association)
        for term_id in ("GO:0043055", "GO:0061065", "GO:0043054", "GO:0043053"):
            record = DataManager.create_annotation_record(
                source_line="", gene_id="WB:WBGene00003931", gene_symbol="", gene_type="gene", taxon_id="",
                object_id=term_id, qualifiers="", aspect="P", ecode="EXP", references="", prvdr="WB", date="")
            all_associations.append(record)
        self.df.go_associations = AssociationSetFactory().create_from_assocs(assocs=all_associations,
                                                                             ontology=self.df.go_ontology)

        self.conf_parser.config["go_sentences_options"]["exclude_terms"].append("GO:0040024")
        second_generator = OntologySentenceGenerator(gene_id="WB:WBGene00003931", module=Module.GO,
                                                     data_manager=self.df, config=self.conf_parser)
        experimental_terms = second_generator.terms_groups[('P', '')]["EXPERIMENTAL"]
        ancestors = get_all_common_ancestors(experimental_terms, second_generator.ontology)
        self.assertTrue("GO:0040024" not in ancestors, "Common ancestors contain blacklisted term")

    def test_information_content(self):
        """Every root of the GO ontology has an "IC" value equal to 0."""
        self.load_go_ontology()
        set_all_information_content_values(ontology=self.df.go_ontology)
        for root in self.df.go_ontology.get_roots():
            root_ic = self.df.go_ontology.node(root)["IC"]
            self.assertTrue(root_ic == 0, "Root IC not equal to 0")

    def test_find_set_covering(self):
        """Check ``find_set_covering`` with weights, without weights and with bad input."""
        subsets = [("1", "1", {"A", "B", "C"}), ("2", "2", {"A", "B"}), ("3", "3", {"C"}), ("4", "4", {"A"}),
                   ("5", "5", {"B"}), ("6", "6", {"C"})]
        values = [2, 12, 5, 20, 20, 20]

        # Weighted case: only subsets "2" and "6" are expected in the result.
        weighted_result = find_set_covering(subsets=subsets, value=values, max_num_subsets=3)
        chosen_ids = [chosen[0] for chosen in weighted_result]
        for expected_id in ("2", "6"):
            self.assertTrue(expected_id in chosen_ids)
        for unexpected_id in ("1", "3", "4", "5"):
            self.assertTrue(unexpected_id not in chosen_ids)

        # Unweighted case: subset "1" alone is expected.
        unweighted_result = find_set_covering(subsets=subsets, value=None, max_num_subsets=3)
        unweighted_ids = [chosen[0] for chosen in unweighted_result]
        self.assertTrue("1" in unweighted_ids and len(unweighted_ids) == 1)

        # Value vector whose length differs from the number of subsets.
        mismatched_values = [1, 3]
        mismatched_result = find_set_covering(subsets=subsets, value=mismatched_values, max_num_subsets=3)
        self.assertTrue(mismatched_result is None, "Cost vector with length different than subsets should return None")

        subsets = [("1", "1", {"7"}), ("2", "2", {"7", "12", "13"}),
                   ("3", "3", {"16", "17"}), ("4", "4", {"11"}), ("6", "6", {"12", "13"}), ("7", "7", {"7"}),
                   ("9", "9", {"16", "17"}), ("11", "11", {"11"}), ("12", "12", {"12"}), ("13", "13", {"13"}),
                   ("16", "16", {"16"}), ("17", "17", {"17"})]
        values = [1, 1, 0.875061263, 1.301029996, 1.301029996, 1.602059991, 1.301029996, 1.698970004, 1.698970004,
                  1.698970004, 1.698970004, 1.698970004]
        float_result = find_set_covering(subsets=subsets, value=values, max_num_subsets=3)
        chosen_ids = [chosen[0] for chosen in float_result]
        required_present = [required in chosen_ids for required in ["2", "9", "11"]]
        self.assertTrue(all(required_present))

    def test_set_covering_with_ontology(self):
        """Exercise DO-via-orthology sentence generation with the "ic" trimming algorithm.

        Loads the DO ontology, sets ``trimming_algorithm`` to "ic" and ``max_num_terms`` to 5,
        replaces the DO associations of the data manager with fifteen ISS disease annotations
        for gene MGI:88452, and prints the description of the generated module sentences.
        The test makes no assertion on the generated text.
        """
        self.load_do_ontology()
        do_orth_options = self.conf_parser.config["do_via_orth_sentences_options"]
        do_orth_options["trimming_algorithm"] = "ic"
        do_orth_options["max_num_terms"] = 5

        gene_id = "MGI:88452"
        # terms that are also passed to the generator as high priority
        priority_term_ids = ["DOID:0080028", "DOID:0080056", "DOID:14789",
                             "DOID:0080026", "DOID:14415", "DOID:0080045"]
        # remaining annotated terms, in the order the records are created
        other_term_ids = ["DOID:3371", "DOID:8886", "DOID:674", "DOID:5614", "DOID:11830",
                          "DOID:8398", "DOID:2256", "DOID:5327", "DOID:1123"]
        # one record per term; every field except the annotated term is the same
        associations = [
            DataManager.create_annotation_record(source_line="", gene_id=gene_id,
                                                 gene_symbol="", gene_type="gene", taxon_id="",
                                                 object_id=term_id, qualifiers="", aspect="D",
                                                 ecode="ISS", references="", prvdr="WB", date="")
            for term_id in priority_term_ids + other_term_ids
        ]
        self.df.do_associations = AssociationSetFactory().create_from_assocs(
            assocs=associations, ontology=self.df.do_ontology)

        sentence_generator = OntologySentenceGenerator(gene_id=gene_id, module=Module.DO_ORTHOLOGY,
                                                       data_manager=self.df, config=self.conf_parser)
        sentences = sentence_generator.get_module_sentences(
            config=self.conf_parser, aspect='D', qualifier='', merge_groups_with_same_prefix=True,
            keep_only_best_group=True, high_priority_term_ids=priority_term_ids)
        print(sentences.get_description())
    def _load_and_process_data(self):
        """Generate gene descriptions for each configured data provider.

        A single description data manager is created with the GO and DO ontologies and is
        reused for every provider. For each provider the GO annotations are loaded from the
        local ``tmp`` directory, disease (and, where available, expression) annotations are
        fetched through the ETL's own helpers, and the resulting generators are written to CSV,
        executed as a Neo4j query batch and saved as report files.
        """
        # configuration and lookups shared by all providers
        ctx_info = ContextInfo()
        file_manager = DataFileManager(ctx_info.config_file_location)
        gaf_config = file_manager.get_config('GAF')
        # NOTE(review): keys are the raw provider names while the lookup below uses the
        # upper-cased name -- presumably providers are already upper case; confirm.
        gaf_sub_type_by_provider = {}
        for gaf_sub_type in gaf_config.get_sub_type_objects():
            gaf_sub_type_by_provider[gaf_sub_type.get_data_provider()] = gaf_sub_type

        module_dir = os.path.dirname(__file__)
        base_config = GenedescConfigParser(
            os.path.join(module_dir, os.pardir, os.pardir, "gene_descriptions.yml"))
        desc_data_manager = DataManager(do_relations=None,
                                        go_relations=["subClassOf", "BFO:0000050"])
        for ontology_type in (DataType.GO, DataType.DO):
            desc_data_manager.set_ontology(ontology_type=ontology_type,
                                           ontology=self.get_ontology(data_type=ontology_type),
                                           config=base_config)

        # generate descriptions for each MOD
        providers = [sub_type.get_data_provider().upper()
                     for sub_type in self.data_type_config.get_sub_type_objects()]
        for provider in providers:
            # per-provider copy so that provider-specific options do not leak to the others
            provider_config = copy.deepcopy(base_config)
            if provider == "WB":
                expression_options = provider_config.config["expression_sentences_options"]
                expression_options["remove_children_if_parent_is_present"] = True
            self.logger.info("Generating gene descriptions for %s", provider)
            # annotations for HUMAN are requested under the RGD data provider
            annotation_provider = "RGD" if provider == "HUMAN" else provider
            descriptions_writer = DescriptionsWriter()

            working_dir = os.getcwd()
            gaf_file_name = gaf_sub_type_by_provider[provider].file_to_download
            gaf_url = "file://" + os.path.join(working_dir, "tmp", gaf_file_name)
            gaf_cache_path = os.path.join(working_dir, "tmp", "gd_cache",
                                          "go_annot_" + provider + ".gaf")
            desc_data_manager.load_associations_from_file(
                associations_type=DataType.GO,
                associations_url=gaf_url,
                associations_cache_path=gaf_cache_path,
                config=provider_config)

            disease_annotations = self.get_disease_annotations_from_db(
                data_provider=annotation_provider,
                gd_data_manager=desc_data_manager,
                logger=self.logger)
            desc_data_manager.set_associations(associations_type=DataType.DO,
                                               associations=disease_annotations,
                                               config=provider_config)

            # NOTE(review): for providers outside the map the expression data set for a
            # previous provider is not reset here -- verify this is intended.
            if provider in EXPRESSION_PRVD_SUBTYPE_MAP:
                expression_ontology = self.get_ontology(data_type=DataType.EXPR,
                                                        provider=provider)
                desc_data_manager.set_ontology(ontology_type=DataType.EXPR,
                                               ontology=expression_ontology,
                                               config=provider_config)
                expression_annotations = self.get_expression_annotations_from_db(
                    data_provider=annotation_provider,
                    gd_data_manager=desc_data_manager,
                    logger=self.logger)
                desc_data_manager.set_associations(associations_type=DataType.EXPR,
                                                   associations=expression_annotations,
                                                   config=provider_config)

            commit_size = self.data_type_config.get_neo4j_commit_size()
            generators = self.get_generators(provider,
                                             desc_data_manager,
                                             provider_config,
                                             descriptions_writer)
            csv_file_name = "genedescriptions_data_" + provider + ".csv"
            query_template_list = [
                [self.gene_descriptions_query_template, commit_size, csv_file_name],
            ]

            query_and_file_list = self.process_query_params(query_template_list)
            CSVTransactor.save_file_static(generators, query_and_file_list)
            Neo4jTransactor.execute_query_batch(query_and_file_list)
            self.save_descriptions_report_files(data_provider=provider,
                                                json_desc_writer=descriptions_writer,
                                                context_info=ctx_info,
                                                gd_data_manager=desc_data_manager)