class TestConfigParser(unittest.TestCase):
    """Tests for reading module properties from the gene descriptions config file."""

    def setUp(self):
        """Configure logging and load the shared test configuration file."""
        logging.basicConfig(filename=None, level="INFO",
                            format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
        logger.info("Starting DataManager tests")
        self.this_dir = os.path.split(__file__)[0]
        config_path = os.path.join(self.this_dir, os.path.pardir, "tests", "config_test.yml")
        self.conf_parser = GenedescConfigParser(config_path)

    def test_exclude_terms_list(self):
        """Exclusion term lists must load for both the GO and DO modules."""
        go_excluded = self.conf_parser.get_module_property(
            module=Module.GO, prop=ConfigModuleProperty.EXCLUDE_TERMS)
        self.assertTrue(len(go_excluded) > 0, "GO exclusion term list not loading")
        do_excluded = self.conf_parser.get_module_property(
            module=Module.DO_EXPERIMENTAL, prop=ConfigModuleProperty.EXCLUDE_TERMS)
        self.assertTrue(len(do_excluded) > 0, "DO terms exclusion not loading")

    def test_rename_terms(self):
        """GO rename list has exactly 7 entries; DO has no rename list configured."""
        go_renames = self.conf_parser.get_module_property(
            module=Module.GO, prop=ConfigModuleProperty.RENAME_TERMS)
        self.assertTrue(len(go_renames) == 7, "GO term renaming list not loading")
        do_renames = self.conf_parser.get_module_property(
            module=Module.DO_EXPERIMENTAL, prop=ConfigModuleProperty.RENAME_TERMS)
        self.assertTrue(do_renames is None, "DO term renaming list should be None")

    def test_evidence_codes(self):
        """The GO evidence-code groups map must include the EXP code."""
        evidence_map = self.conf_parser.get_evidence_codes_groups_map(module=Module.GO)
        self.assertTrue("EXP" in list(evidence_map.keys()))
Example #2
0
    def set_associations(self, associations_type: DataType,
                         associations: AssociationSet,
                         config: GenedescConfigParser):
        """Set the annotations of the given type, removing blacklisted annotations.

        The blacklist of term ids is read from the EXCLUDE_TERMS property of the
        module that corresponds to the association type.

        Args:
            associations_type (DataType): the type of associations to set
            associations (AssociationSet): an association object to set as go annotations
            config (GenedescConfigParser): configuration object where to read properties
        """
        if associations_type == DataType.GO:
            logger.info("Setting GO associations")
            self.go_associations = self.remove_blacklisted_annotations(
                association_set=associations,
                ontology=self.go_ontology,
                terms_blacklist=config.get_module_property(
                    module=Module.GO, prop=ConfigModuleProperty.EXCLUDE_TERMS))
        elif associations_type == DataType.DO:
            logger.info("Setting DO associations")
            self.do_associations = self.remove_blacklisted_annotations(
                association_set=associations,
                ontology=self.do_ontology,
                terms_blacklist=config.get_module_property(
                    module=Module.DO_EXPERIMENTAL,
                    prop=ConfigModuleProperty.EXCLUDE_TERMS))
        elif associations_type == DataType.EXPR:
            logger.info("Setting Expression associations")
            self.expression_associations = self.remove_blacklisted_annotations(
                association_set=associations,
                # Bug fix: was self.do_ontology — expression annotations must be
                # filtered against the expression ontology, matching the EXPR
                # branch of load_associations_from_file
                ontology=self.expression_ontology,
                terms_blacklist=config.get_module_property(
                    module=Module.EXPRESSION,
                    prop=ConfigModuleProperty.EXCLUDE_TERMS))
 def setUp(self):
     """Build a DataManager with the test GO ontology and GO annotations loaded.

     Configures logging, reads the shared test configuration, then loads both
     the GO ontology and the GO association file shipped with the tests
     (served via file:// URLs and cached under the local cache directory).
     """
     logging.basicConfig(
         filename=None,
         level="ERROR",
         format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
     logger.info("Starting DataManager tests")
     self.this_dir = os.path.split(__file__)[0]
     self.conf_parser = GenedescConfigParser(
         os.path.join(self.this_dir, os.path.pardir, "tests",
                      "config_test.yml"))
     # DO relations disabled; GO traversal restricted to these two relations
     self.df = DataManager(do_relations=None,
                           go_relations=["subClassOf", "BFO:0000050"])
     logger.info("Loading go ontology from file")
     self.df.load_ontology_from_file(
         ontology_type=DataType.GO,
         ontology_url="file://" +
         os.path.join(self.this_dir, "data", "go_gd_test.obo"),
         ontology_cache_path=os.path.join(self.this_dir, "cache",
                                          "go_gd_test.obo"),
         config=self.conf_parser)
     logger.info("Loading go associations from file")
     self.df.load_associations_from_file(
         associations_type=DataType.GO,
         associations_url="file://" + os.path.join(
             self.this_dir, "data", "gene_association_1.7.wb.partial"),
         associations_cache_path=os.path.join(
             self.this_dir, "cache", "gene_association_1.7.wb.partial"),
         config=self.conf_parser)
Example #4
0
def load_data(organism, conf_parser: GenedescConfigParser):
    """Load all data needed to generate descriptions for the given organism.

    Always loads the main species data; additionally loads AGR human-ortholog
    GO data when the organism is c_elegans, and GO data for the sister species
    when one is configured for the organism.

    Args:
        organism: WB organism key (e.g. "c_elegans") used to look up species info
        conf_parser (GenedescConfigParser): configuration object where to read properties

    Returns:
        tuple: (df, sister_df, df_agr) — the main WBDataManager, the
            sister-species WBDataManager or None, and the DataManager holding
            AGR human-ortholog GO data or None
    """
    logger = logging.getLogger("WB Gene Description Pipeline - Data loader")
    sister_df = None
    df_agr = None
    organisms_info = conf_parser.get_wb_organisms_info()
    df = WBDataManager(species=organism, do_relations=None, go_relations=["subClassOf", "BFO:0000050"],
                       config=conf_parser)
    # AGR human-ortholog GO data is only needed for c_elegans
    if organism == "c_elegans":
        df_agr = DataManager(go_relations=["subClassOf", "BFO:0000050"], do_relations=None)
        df_agr.load_ontology_from_file(ontology_type=DataType.GO,
                                       ontology_url=conf_parser.get_wb_human_orthologs_go_ontology(),
                                       ontology_cache_path=os.path.join(conf_parser.get_cache_dir(),
                                                                        "wormbase_agr_human", "go_ontology.obo"),
                                       config=conf_parser)
        df_agr.load_associations_from_file(associations_type=DataType.GO,
                                           associations_url=conf_parser.get_wb_human_orthologs_go_associations(),
                                           associations_cache_path=os.path.join(
                                               conf_parser.get_cache_dir(), "wormbase_agr_human", "go_assoc.daf.gz"),
                                           config=conf_parser)
    # Sister-species GO data is loaded only when configured and non-empty
    if "main_sister_species" in organisms_info[organism] and organisms_info[organism]["main_sister_species"]:
        sister_df = WBDataManager(species=organisms_info[organism]["main_sister_species"],
                                  do_relations=None, go_relations=["subClassOf", "BFO:0000050"], config=conf_parser)
        logger.info("Loading GO data for sister species")
        sister_df.load_ontology_from_file(ontology_type=DataType.GO, ontology_url=sister_df.go_ontology_url,
                                          ontology_cache_path=sister_df.go_ontology_cache_path,
                                          config=conf_parser)
        sister_df.load_associations_from_file(associations_type=DataType.GO,
                                              associations_url=sister_df.go_associations_url,
                                              associations_cache_path=sister_df.go_associations_cache_path,
                                              config=conf_parser)
    logger.info("Loading all data for main species")
    df.load_all_data_from_file()
    return df, sister_df, df_agr
class TestGOModule(unittest.TestCase):
    """Tests for loading the GO ontology and GO annotations through DataManager."""

    def setUp(self):
        """Load the test GO ontology and GO association file into a DataManager."""
        logging.basicConfig(
            filename=None,
            level="ERROR",
            format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
        logger.info("Starting DataManager tests")
        self.this_dir = os.path.split(__file__)[0]
        self.conf_parser = GenedescConfigParser(
            os.path.join(self.this_dir, os.path.pardir, "tests",
                         "config_test.yml"))
        self.df = DataManager(do_relations=None,
                              go_relations=["subClassOf", "BFO:0000050"])
        logger.info("Loading go ontology from file")
        self.df.load_ontology_from_file(
            ontology_type=DataType.GO,
            ontology_url="file://" +
            os.path.join(self.this_dir, "data", "go_gd_test.obo"),
            ontology_cache_path=os.path.join(self.this_dir, "cache",
                                             "go_gd_test.obo"),
            config=self.conf_parser)
        logger.info("Loading go associations from file")
        self.df.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url="file://" + os.path.join(
                self.this_dir, "data", "gene_association_1.7.wb.partial"),
            associations_cache_path=os.path.join(
                self.this_dir, "cache", "gene_association_1.7.wb.partial"),
            config=self.conf_parser)

    def test_ontology_exists(self):
        """The GO ontology loads and GO:0000075 has GO:0009987 among its parents."""
        self.assertTrue(self.df.go_ontology is not None)
        self.assertTrue(
            any(parent == "GO:0009987"
                for parent in self.df.go_ontology.parents("GO:0000075")))

    def test_annotations_exist(self):
        """Annotations load and at least one is returned for a known gene."""
        self.assertTrue(self.df.go_associations is not None)
        self.assertTrue(
            len(
                self.df.get_annotations_for_gene(
                    gene_id="WB:WBGene00000001",
                    annot_type=DataType.GO,
                    include_obsolete=False,
                    include_negative_results=False,
                    priority_list=self.conf_parser.get_annotations_priority(
                        module=Module.GO))) > 0)

    def test_rename_terms(self):
        """Terms listed in RENAME_TERMS must no longer be findable by name search."""
        self.assertTrue(
            all(
                len(self.df.go_ontology.search(term)) == 0 for term in list(
                    self.conf_parser.get_module_property(
                        module=Module.GO,
                        prop=ConfigModuleProperty.RENAME_TERMS).keys())))

    def test_exclude_terms(self):
        # TODO(review): no assertions here — term-exclusion behavior is untested
        pass
def set_information_poor_sentence(orth_fullnames: List[str],
                                  selected_orthologs,
                                  conf_parser: GenedescConfigParser,
                                  human_df_agr: DataManager,
                                  gene_desc: GeneDescription,
                                  dm: WBDataManager, gene: Gene):
    """Add fallback description modules for an information-poor gene.

    When the gene's only ortholog species is human, a GO molecular-function
    sentence is generated from the best human ortholog's experimental GO
    annotations (for both the "contributes_to" and "enables" qualifiers).
    A protein-domain sentence is always attempted afterwards.

    Args:
        orth_fullnames (List[str]): full names of the ortholog species
        selected_orthologs: the orthologs selected for this gene
        conf_parser (GenedescConfigParser): configuration object where to read properties
        human_df_agr (DataManager): data manager holding AGR human GO data
        gene_desc (GeneDescription): the gene description object to update
        dm (WBDataManager): data manager holding protein domain data
        gene (Gene): the gene being described (not used directly here)
    """
    # Bug fix: the species literal was corrupted to "H**o sapiens" (word-filter
    # artifact), which could never match and made this whole branch dead code.
    if len(orth_fullnames) == 1 and orth_fullnames[0] == "Homo sapiens":
        best_orth = get_best_human_ortholog_for_info_poor(
            selected_orthologs,
            conf_parser.get_annotations_priority(module=Module.GO),
            human_df_agr,
            config=conf_parser)
        if best_orth:
            if not best_orth.startswith("RGD:"):
                best_orth = "RGD:" + best_orth
            human_go_sent_generator = OntologySentenceGenerator(
                gene_id=best_orth,
                module=Module.GO,
                data_manager=human_df_agr,
                config=conf_parser,
                humans=False,
                limit_to_group="EXPERIMENTAL")
            # Identical sentence construction for both GO qualifiers; the
            # original duplicated this block verbatim for each qualifier.
            for qualifier in ("contributes_to", "enables"):
                human_func_module_sentences = human_go_sent_generator.get_module_sentences(
                    aspect='F',
                    qualifier=qualifier,
                    merge_groups_with_same_prefix=True,
                    keep_only_best_group=True)
                human_func_sent = human_func_module_sentences.get_description()
                if human_func_sent:
                    gene_desc.set_or_extend_module_description_and_final_stats(
                        module=Module.INFO_POOR_HUMAN_FUNCTION,
                        description="human " +
                        human_df_agr.go_associations.subject_label_map[best_orth] +
                        " " + human_func_sent)

    protein_domains = dm.protein_domains[gene_desc.gene_id[3:]]
    if protein_domains:
        # Only domains with a non-empty name take part in the sentence;
        # compute the filtered list once instead of twice as before
        named_domains = [ptdom[1] for ptdom in protein_domains if ptdom[1] != ""]
        dom_word = "domains" if len(named_domains) > 1 else "domain"
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.PROTEIN_DOMAIN,
            description="is predicted to encode a protein with the following "
            + dom_word + ": " + concatenate_words_with_oxford_comma(
                named_domains,
                separator=conf_parser.get_terms_delimiter()))
Example #7
0
    def load_associations_from_file(self, associations_type: DataType,
                                    associations_url: str,
                                    associations_cache_path: str,
                                    config: GenedescConfigParser) -> None:
        """Load gene associations of the given type from a (cached) GAF file.

        The file is fetched into the cache path if needed, parsed with
        GafParser, turned into an AssociationSet against the matching
        ontology, and finally filtered to drop annotations to blacklisted
        terms read from the configuration.

        Args:
            associations_type (DataType): the type of associations to set
            associations_url (str): url to the association file
            associations_cache_path (str): path to cache file for the associations
            config (GenedescConfigParser): configuration object where to read properties
        """
        # Parser options shared by all association types
        assoc_config = AssocParserConfig(remove_double_prefixes=True,
                                         paint=True)
        if associations_type == DataType.GO:
            logger.info("Loading GO associations from file")
            self.go_associations = AssociationSetFactory().create_from_assocs(
                assocs=GafParser(config=assoc_config).parse(
                    file=self._get_cached_file(
                        cache_path=associations_cache_path,
                        file_source_url=associations_url),
                    skipheader=True),
                ontology=self.go_ontology)
            self.go_associations = self.remove_blacklisted_annotations(
                association_set=self.go_associations,
                ontology=self.go_ontology,
                terms_blacklist=config.get_module_property(
                    module=Module.GO, prop=ConfigModuleProperty.EXCLUDE_TERMS))
        elif associations_type == DataType.DO:
            logger.info("Loading DO associations from file")
            self.do_associations = AssociationSetFactory().create_from_assocs(
                assocs=GafParser(config=assoc_config).parse(
                    file=self._get_cached_file(
                        cache_path=associations_cache_path,
                        file_source_url=associations_url),
                    skipheader=True),
                ontology=self.do_ontology)
            # NOTE: the DO blacklist is read from the DO_EXP_AND_BIO module,
            # not DO_EXPERIMENTAL
            self.do_associations = self.remove_blacklisted_annotations(
                association_set=self.do_associations,
                ontology=self.do_ontology,
                terms_blacklist=config.get_module_property(
                    module=Module.DO_EXP_AND_BIO,
                    prop=ConfigModuleProperty.EXCLUDE_TERMS))
        elif associations_type == DataType.EXPR:
            logger.info("Loading Expression associations from file")
            self.expression_associations = AssociationSetFactory(
            ).create_from_assocs(assocs=GafParser(config=assoc_config).parse(
                file=self._get_cached_file(cache_path=associations_cache_path,
                                           file_source_url=associations_url),
                skipheader=True),
                                 ontology=self.expression_ontology)
            self.expression_associations = self.remove_blacklisted_annotations(
                association_set=self.expression_associations,
                ontology=self.expression_ontology,
                terms_blacklist=config.get_module_property(
                    module=Module.EXPRESSION,
                    prop=ConfigModuleProperty.EXCLUDE_TERMS))
 def setUp(self):
     """Initialise logging and load the shared test configuration parser."""
     logging.basicConfig(filename=None, level="INFO",
                         format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
     logger.info("Starting DataManager tests")
     self.this_dir = os.path.split(__file__)[0]
     config_path = os.path.join(self.this_dir, os.path.pardir, "tests", "config_test.yml")
     self.conf_parser = GenedescConfigParser(config_path)
Example #9
0
    def set_ontology(self,
                     ontology_type: DataType,
                     ontology: Ontology,
                     config: GenedescConfigParser,
                     slim_cache_path: str = None) -> None:
        """Set the ontology of the given type and apply configured post-processing.

        The ontology is reduced to a subontology over the relations configured
        for that type (when any are set), then term-renaming rules from the
        config are applied, node depths are computed, and — when the trimming
        algorithm is "ic" — information-content values are precomputed. A slim
        set is loaded when a slim cache path is provided.

        Args:
            ontology_type (DataType): the type of ontology to set
            ontology (Ontology): an ontology object to set as go ontology
            config (GenedescConfigParser): configuration object where to read properties
            slim_cache_path (str): path to slim file to use
        """
        if ontology_type == DataType.GO:
            logger.info("Setting GO ontology")
            if self.go_relations:
                self.go_ontology = ontology.subontology(
                    relations=self.go_relations)
            else:
                self.go_ontology = ontology
        elif ontology_type == DataType.DO:
            logger.info("Setting DO ontology")
            if self.do_relations:
                self.do_ontology = ontology.subontology(
                    relations=self.do_relations)
            else:
                self.do_ontology = ontology
        elif ontology_type == DataType.EXPR:
            logger.info("Setting Expression ontology")
            if self.expr_relations:
                self.expression_ontology = ontology.subontology(
                    relations=self.expr_relations)
            else:
                self.expression_ontology = ontology
        # From here on, operate on the ontology object that was just stored
        # (which may be a subontology of the argument)
        module = get_module_from_data_type(ontology_type)
        ontology = self.get_ontology(data_type=ontology_type)
        terms_replacement_regex = config.get_module_property(
            module=module, prop=ConfigModuleProperty.RENAME_TERMS)
        if terms_replacement_regex:
            self.rename_ontology_terms(
                ontology=ontology,
                terms_replacement_regex=terms_replacement_regex)
        set_all_depths(ontology=ontology,
                       relations=self.get_relations(ontology_type))
        # IC-based trimming needs per-node information content precomputed
        if config.get_module_property(
                module=module,
                prop=ConfigModuleProperty.TRIMMING_ALGORITHM) == "ic":
            set_ic_ontology_struct(ontology=ontology,
                                   relations=self.get_relations(ontology_type))
        if slim_cache_path:
            slim_url = config.get_module_property(
                module=module, prop=ConfigModuleProperty.SLIM_URL)
            self.load_slim(module=module,
                           slim_url=slim_url,
                           slim_cache_path=slim_cache_path)
 def test_compose_sentence(self):
     """Check compose_sentence term handling.

     First call: with rename_cell and put_anatomy_male_at_end the word
     "cell" must not appear and "male" must be appended as "and in male".
     Second call: a lone "cell" term with rename_cell must produce the
     "expressed widely" phrasing.
     """
     this_dir = os.path.split(__file__)[0]
     conf_parser = GenedescConfigParser(
         os.path.join(this_dir, os.path.pardir, "tests", "config_test.yml"))
     sentence = compose_sentence(
         prefix="Is expressed in",
         additional_prefix="several processes, including",
         term_names=["cell", "tail", "head", "male"],
         postfix="based on experimental observation",
         config=conf_parser,
         ancestors_with_multiple_children={"head"},
         rename_cell=True,
         put_anatomy_male_at_end=True)
     self.assertTrue("cell" not in sentence)
     self.assertTrue("and in male" in sentence)
     sentence = compose_sentence(
         prefix="Is expressed in",
         additional_prefix="several processes, including",
         term_names=["cell"],
         postfix="based on experimental observation",
         config=conf_parser,
         rename_cell=True)
     self.assertTrue(
         sentence ==
         "Is expressed widely based on experimental observation")
 def load_do_ontology(self):
     """Load the DO ontology from the bundled test obo file into a DataManager."""
     logger.info("Starting Ontology Tools tests")
     self.this_dir = os.path.split(__file__)[0]
     config_path = os.path.join(self.this_dir, os.path.pardir, "tests", "config_test.yml")
     self.conf_parser = GenedescConfigParser(config_path)
     self.df = DataManager(do_relations=None)
     logger.info("Loading do ontology from file")
     logging.basicConfig(filename=None, level="ERROR",
                         format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
     ontology_url = "file://" + os.path.join(self.this_dir, "data", "doid.obo")
     cache_path = os.path.join(self.this_dir, "cache", "doid.obo")
     self.df.load_ontology_from_file(ontology_type=DataType.DO,
                                     ontology_url=ontology_url,
                                     ontology_cache_path=cache_path,
                                     config=self.conf_parser)
 def setUp(self):
     """Create a WBDataManager for c_elegans from the WB test configuration."""
     logging.basicConfig(filename=None, level="ERROR",
                         format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
     logger.info("Starting DataManager tests")
     self.this_dir = os.path.split(__file__)[0]
     self.conf_parser = GenedescConfigParser(os.path.join(self.this_dir, "config_test_wb.yml"))
     self.df = WBDataManager(species="c_elegans",
                             do_relations=None,
                             go_relations=["subClassOf", "BFO:0000050"],
                             config=self.conf_parser)
Example #13
0
    def set_associations(self, associations_type: DataType,
                         associations: AssociationSet,
                         config: GenedescConfigParser):
        """Set annotations of the given type after remapping and blacklist filtering.

        Associations are first remapped according to the module's REMAP_TERMS
        configuration, then annotations to blacklisted terms (EXCLUDE_TERMS)
        are removed before the set is stored. When the module's trimming
        algorithm is "icGO", annotation-frequency-based IC values are
        precomputed on the ontology.

        Args:
            associations_type (DataType): the type of associations to set
            associations (AssociationSet): an association object to set as go annotations
            config (GenedescConfigParser): configuration object where to read properties
        """
        assocs = self.remap_associations(
            associations=associations,
            ontology=self.get_ontology(associations_type),
            associations_map=config.get_module_property(
                module=get_module_from_data_type(associations_type),
                prop=ConfigModuleProperty.REMAP_TERMS))
        assocs = self.remove_blacklisted_annotations(
            association_set=assocs,
            ontology=self.get_ontology(associations_type),
            terms_blacklist=config.get_module_property(
                module=get_module_from_data_type(associations_type),
                prop=ConfigModuleProperty.EXCLUDE_TERMS))

        if associations_type == DataType.GO:
            logger.info("Setting GO associations")
            self.go_associations = assocs
        elif associations_type == DataType.DO:
            logger.info("Setting DO associations")
            self.do_associations = assocs
        elif associations_type == DataType.EXPR:
            logger.info("Setting Expression associations")
            self.expression_associations = assocs
        # Precompute annotation-frequency IC when the configured trimming
        # algorithm requires it
        if config.get_module_property(
                module=get_module_from_data_type(associations_type),
                prop=ConfigModuleProperty.TRIMMING_ALGORITHM) == "icGO":
            set_ic_annot_freq(self.get_ontology(associations_type),
                              self.get_associations(associations_type))
Example #14
0
def set_alliance_human_orthology_module(orthologs: List[List[str]], gene_desc: GeneDescription,
                                        config: GenedescConfigParser, excluded_orthologs: bool = False):
    """set orthology module for Alliance human orthologs

    Args:
        orthologs (List[List[str]]): list of human orthologs, containing gene_id, gene_symbol, and gene_name
        gene_desc (GeneDescription): the gene description object to update
        config (GenedescConfigParser): a gene descriptions configuration object
        excluded_orthologs (bool): whether some of the orthologs have been excluded from the final set. If true, the
            final sentence will include a prefix to specify that some orthologs have been omitted
    """
    # Nothing to add when there are no orthologs
    if not orthologs:
        return
    prefix = "human"
    # Display orthologs sorted by gene name; cap at three when truncating
    display_orthologs = sorted(orthologs, key=lambda orth: orth[2])
    if excluded_orthologs or len(orthologs) > 3:
        display_orthologs = display_orthologs[0:3]
        prefix = "several human genes including"
    formatted_names = [orth[1] + " (" + orth[2] + ")" if orth[2] else orth[1]
                       for orth in display_orthologs]
    joined_names = concatenate_words_with_oxford_comma(
        formatted_names, separator=config.get_terms_delimiter())
    sentence = "orthologous to " + prefix + " " + joined_names
    gene_desc.set_or_extend_module_description_and_final_stats(module=Module.ORTHOLOGY, description=sentence)
Example #15
0
def generate_ortholog_sentence_wormbase_human(orthologs: List[List[str]], human_genes_props: Dict[str, List[str]],
                                              config: GenedescConfigParser):
    """build orthology sentence for WormBase human orthologs

    Args:
        orthologs (List[List[str]]): list of human orthologs, containing gene_id, gene_symbol
        human_genes_props (Dict[str, List[str]]): dictionary containing human gene properties
        config (GenedescConfigParser): a gene description configuration object
    Returns:
        Tuple[list, str]: the orthologs and the sentence
    """
    prefix = "human "
    if len(orthologs) > 3:
        orthologs = orthologs[0:3]
        prefix = "several human genes including "
    # Single pass: collect a display name per ortholog (symbol + name when
    # properties are known, raw symbol otherwise) and the matched symbols
    display_names = []
    matched_symbols = []
    for orth in orthologs:
        props = human_genes_props.get(orth[0])
        if props:
            display_names.append(props[0] + " (" + props[1] + ")")
            matched_symbols.append(props[0])
        else:
            display_names.append(orth[1])
    display_names.sort()
    orth_sentence = "is an ortholog of " + prefix + concatenate_words_with_oxford_comma(
        display_names, separator=config.get_terms_delimiter())
    return matched_symbols, orth_sentence
Example #16
0
    def __init__(self,
                 gene_id: str,
                 module: Module,
                 data_manager: DataManager,
                 config: GenedescConfigParser,
                 limit_to_group: str = None,
                 humans: bool = False):
        """initialize sentence generator object

        Args:
            gene_id (str): id of the gene for which sentences will be generated
            module (Module): the description module to generate sentences for
            data_manager (DataManager): provides the ontology, annotations and slim set
            config (GenedescConfigParser): an optional config object from which to read the options
            limit_to_group (str): limit the evidence codes to the specified group
            humans (bool): passed through to the pre/postfix sentence map lookup
        """
        self.ontology = data_manager.get_ontology(
            get_data_type_from_module(module))
        self.config = config
        self.module = module
        # NOTE(review): presumably tracks terms already used across sentences
        # to avoid repetition — confirm against get_module_sentences
        self.terms_already_covered = set()
        self.terms_groups = defaultdict(lambda: defaultdict(set))
        self.evidence_groups_priority_list = config.get_evidence_groups_priority_list(
            module=module)
        self.prepostfix_sentences_map = config.get_prepostfix_sentence_map(
            module=module, humans=humans)
        self.gene_annots = data_manager.get_annotations_for_gene(
            gene_id=gene_id,
            annot_type=get_data_type_from_module(module),
            priority_list=config.get_annotations_priority(module=module))
        # Instantiate the trimming algorithm class mapped from the configured
        # TRIMMING_ALGORITHM name, wired to this module's ontology, annotations,
        # blacklist, slim bonus and slim set
        self.trimmer = CONF_TO_TRIMMING_CLASS[config.get_module_property(
            module=module, prop=ConfigModuleProperty.TRIMMING_ALGORITHM)](
                ontology=self.ontology,
                annotations=data_manager.get_associations(
                    get_data_type_from_module(module)),
                nodeids_blacklist=config.get_module_property(
                    module=module, prop=ConfigModuleProperty.EXCLUDE_TERMS),
                slim_terms_ic_bonus_perc=config.get_module_property(
                    module=module, prop=ConfigModuleProperty.SLIM_BONUS_PERC),
                slim_set=data_manager.get_slim(module=module))
        self.set_terms_groups(module, config, limit_to_group, humans)
class TestGOModule(unittest.TestCase):
    """Tests for GO ontology and association loading through DataManager."""

    def setUp(self):
        """Load the test GO ontology and a partial GAF file into a DataManager."""
        logging.basicConfig(filename=None, level="ERROR",
                            format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
        logger.info("Starting DataManager tests")
        self.this_dir = os.path.split(__file__)[0]
        self.conf_parser = GenedescConfigParser(os.path.join(self.this_dir, os.path.pardir, "tests",
                                                             "config_test.yml"))
        self.df = DataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"])
        logger.info("Loading go ontology from file")
        self.df.load_ontology_from_file(
            ontology_type=DataType.GO,
            ontology_url="file://" + os.path.join(self.this_dir, "data", "go_gd_test.obo"),
            ontology_cache_path=os.path.join(self.this_dir, "cache", "go_gd_test.obo"),
            config=self.conf_parser)
        logger.info("Loading go associations from file")
        self.df.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url="file://" + os.path.join(self.this_dir, "data", "gene_association_1.7.wb.partial"),
            associations_cache_path=os.path.join(self.this_dir, "cache", "gene_association_1.7.wb.partial"),
            config=self.conf_parser)

    def test_ontology_exists(self):
        """The GO ontology must load and contain the expected parent relation."""
        self.assertIsNotNone(self.df.go_ontology)
        self.assertTrue(any(parent == "GO:0009987" for parent in
                            self.df.go_ontology.parents("GO:0000075")))

    def test_annotations_exist(self):
        """The test gene must have at least one GO annotation after loading."""
        self.assertIsNotNone(self.df.go_associations)
        self.assertGreater(len(self.df.get_annotations_for_gene(
            gene_id="WB:WBGene00000001", annot_type=DataType.GO,
            include_obsolete=False, include_negative_results=False,
            priority_list=self.conf_parser.get_annotations_priority(module=Module.GO))), 0)

    def test_rename_terms(self):
        """Renamed terms must no longer be found under their old names."""
        self.assertTrue(all(len(self.df.go_ontology.search(term)) == 0 for term in
                            self.conf_parser.get_module_property(module=Module.GO,
                                                                 prop=ConfigModuleProperty.RENAME_TERMS)))

    def test_exclude_terms(self):
        """Blacklisted terms must be filtered out of gene annotations."""
        test_annot = self.df.get_annotations_for_gene("WB:WBGene00000001", annot_type=DataType.GO)
        self.assertTrue(all(annot["object"]["id"] != "GO:0008286" for annot in test_annot))

    def test_download_gz_file(self):
        """Cached .gz files are decompressed: returned path has no .gz suffix."""
        test_file = self.df._get_cached_file(
            cache_path=os.path.join(self.this_dir, "cache", "c_elegans.PRJNA13758.WS273.geneIDs.txt.gz"),
            file_source_url="file://" + os.path.join(self.this_dir, "data",
                                                     "c_elegans.PRJNA13758.WS273.geneIDs.txt.gz"))
        self.assertEqual(test_file, os.path.join(self.this_dir, "cache", "c_elegans.PRJNA13758.WS273.geneIDs.txt"))

    def test_gene_data_functions(self):
        """get_gene_data must honor the dead/pseudo gene inclusion flags."""
        self.df.set_gene_data(gene_data=[Gene("1", "gene1", True, False), Gene("2", "gene2", False, True),
                                         Gene("3", "gene3", False, False), Gene("4", "gene4", True, True)])
        self.assertEqual(sum(1 for _ in self.df.get_gene_data(include_dead_genes=False,
                                                              include_pseudo_genes=False)), 1)
        self.assertEqual(sum(1 for _ in self.df.get_gene_data(include_dead_genes=True,
                                                              include_pseudo_genes=False)), 2)
        self.assertEqual(sum(1 for _ in self.df.get_gene_data(include_dead_genes=False,
                                                              include_pseudo_genes=True)), 2)
        self.assertEqual(sum(1 for _ in self.df.get_gene_data(include_dead_genes=True,
                                                              include_pseudo_genes=True)), 4)

    def test_get_human_gene_props(self):
        human_gene_props = self.df.get_human_gene_props()
        self.assertGreater(len(human_gene_props), 0)

    def test_get_ensembl_hgnc_ids_map(self):
        ensembl_hgnc_ids_map = self.df.get_ensembl_hgnc_ids_map()
        self.assertGreater(len(ensembl_hgnc_ids_map), 0)

    def test_set_ontology(self):
        """A manually built ontology can replace the loaded GO ontology."""
        ontology = OntologyFactory().create()
        for i in range(4):
            ontology.add_node(i, 'node' + str(i))
        ontology.add_parent(1, 0)
        ontology.add_parent(2, 0)
        ontology.add_parent(3, 0)
        self.df.set_ontology(ontology_type=DataType.GO, ontology=ontology, config=self.conf_parser)
        self.assertEqual(list(self.df.go_ontology.nodes()), list(ontology.nodes()))

    def test_set_associations(self):
        associations = []
        associations.append(DataManager.create_annotation_record("", "1", "a", "protein_coding", "001", "GO:0019901",
                                                                 "", "F", "EXP", None, "WB", ""))
        associations.append(DataManager.create_annotation_record("", "2", "b", "protein_coding", "001", "GO:0005515",
                                                                 "", "F", "EXP", None, "WB", ""))
        assocs = AssociationSetFactory().create_from_assocs(assocs=associations, ontology=self.df.go_ontology)
        self.df.set_associations(associations_type=DataType.GO, associations=assocs, config=self.conf_parser)
        self.assertTrue(self.df.go_associations)

    def test_remap_associations(self):
        """Annotations to remapped terms must be rewritten to the new term id."""
        associations = []
        associations.append(DataManager.create_annotation_record("", "1", "a", "protein_coding", "001", "GO:0018996",
                                                                 "", "F", "EXP", None, "WB", ""))
        assocs = AssociationSetFactory().create_from_assocs(assocs=associations, ontology=self.df.go_ontology)
        self.df.set_associations(associations_type=DataType.GO, associations=assocs, config=self.conf_parser)
        self.assertEqual(self.df.go_associations.associations_by_subj["1"][0]["object"]["id"], "GO:0042303")
Example #18
0
    def get_module_sentences(self,
                             config: GenedescConfigParser,
                             aspect: str,
                             qualifier: str = '',
                             keep_only_best_group: bool = False,
                             merge_groups_with_same_prefix: bool = False,
                             high_priority_term_ids: List[str] = None):
        """generate description for a specific combination of aspect and qualifier

        Args:
            config (GenedescConfigParser): a configuration object from which to read properties
            aspect (str): a data type aspect
            qualifier (str): qualifier
            keep_only_best_group (bool): whether to get only the evidence group with highest priority and discard
                the other evidence groups
            merge_groups_with_same_prefix (bool): whether to merge the phrases for evidence groups with the same prefix
            high_priority_term_ids (List[str]): list of ids for terms that must always appear in the sentence with
                higher priority than the other terms. Trimming is not applied to these terms
        Returns:
            ModuleSentences: the module sentences
        """
        # read the per-module properties that control trimming and sentence wording
        cat_several_words = config.get_module_property(
            module=self.module,
            prop=ConfigModuleProperty.CUTOFF_SEVERAL_CATEGORY_WORD)
        del_overlap = config.get_module_property(
            module=self.module, prop=ConfigModuleProperty.REMOVE_OVERLAP)
        remove_parents = config.get_module_property(
            module=self.module, prop=ConfigModuleProperty.DEL_PARENTS_IF_CHILD)
        remove_child_terms = config.get_module_property(
            module=self.module,
            prop=ConfigModuleProperty.DEL_CHILDREN_IF_PARENT)
        max_terms = config.get_module_property(
            module=self.module,
            prop=ConfigModuleProperty.MAX_NUM_TERMS_IN_SENTENCE)
        exclude_terms = config.get_module_property(
            module=self.module, prop=ConfigModuleProperty.EXCLUDE_TERMS)
        cutoff_final_word = config.get_module_property(
            module=self.module, prop=ConfigModuleProperty.CUTOFF_SEVERAL_WORD)
        rename_cell = config.get_module_property(
            module=self.module, prop=ConfigModuleProperty.RENAME_CELL)
        # fall back to default per-aspect category words when none are configured
        if not cat_several_words:
            cat_several_words = {
                'F': 'functions',
                'P': 'processes',
                'C': 'components',
                'D': 'diseases',
                'A': 'tissues'
            }
        sentences = []
        terms_already_covered = set()
        # map each evidence group to its rank in the configured priority list
        evidence_group_priority = {
            eg: p
            for p, eg in enumerate(self.evidence_groups_priority_list)
        }
        # walk the term groups for this (aspect, qualifier) in evidence-priority order
        for terms, evidence_group, priority in sorted(
            [(t, eg, evidence_group_priority[eg])
             for eg, t in self.terms_groups[(aspect, qualifier)].items()],
                key=lambda x: x[2]):
            # trim and filter the term set; reduce_terms also updates
            # terms_already_covered in place so later groups skip covered terms
            terms, trimmed, add_others, ancestors_covering_multiple_children = self.reduce_terms(
                terms, max_terms, aspect, config, del_overlap,
                terms_already_covered, exclude_terms, remove_parents,
                remove_child_terms, high_priority_term_ids)
            # only emit a sentence when a template exists for this combination
            # and trimming left at least one term
            if (aspect, evidence_group, qualifier
                ) in self.prepostfix_sentences_map and len(terms) > 0:
                sentences.append(
                    _get_single_sentence(
                        node_ids=terms,
                        ontology=self.ontology,
                        aspect=aspect,
                        evidence_group=evidence_group,
                        qualifier=qualifier,
                        prepostfix_sentences_map=self.prepostfix_sentences_map,
                        terms_merged=False,
                        trimmed=trimmed,
                        add_others=add_others,
                        truncate_others_generic_word=cutoff_final_word,
                        truncate_others_aspect_words=cat_several_words,
                        ancestors_with_multiple_children=
                        ancestors_covering_multiple_children,
                        rename_cell=rename_cell))
                # the first sentence produced comes from the highest-priority group
                if keep_only_best_group:
                    return ModuleSentences(sentences)
        if merge_groups_with_same_prefix:
            sentences = self.merge_sentences_with_same_prefix(
                sentences=sentences,
                remove_parent_terms=remove_parents,
                rename_cell=rename_cell,
                high_priority_term_ids=high_priority_term_ids)
        return ModuleSentences(sentences)
Example #19
0
    def __init__(self,
                 config: GenedescConfigParser,
                 species: str,
                 go_relations: List[str] = None,
                 do_relations: List[str] = None,
                 use_cache: bool = False):
        """create a new data fetcher for WormBase. Files will be downloaded from WB ftp site. For convenience, file
        locations are automatically generated and stored in class variables ending in _url for remote filed and
        _cache_path for caching

        Args:
            config (GenedescConfigParser): config object providing file sources, cache dir, and release info
            species (str): WormBase species to fetch
            go_relations (List[str]): GO ontology relations to keep (forwarded to the parent class)
            do_relations (List[str]): DO ontology relations to keep (forwarded to the parent class)
            use_cache (bool): whether to use locally cached files instead of downloading them
        """
        self.config = config
        raw_files_source = config.get_wb_raw_file_sources()
        cache_location = config.get_cache_dir()
        release_version = config.get_wb_release()
        organisms_info = config.get_wb_organisms_info()
        project_id = organisms_info[species]["project_id"]
        # full name of the main sister species, when one is configured for this species
        self.sister_sp_fullname = ""
        if "main_sister_species" in organisms_info[species] and "full_name" in \
                organisms_info[organisms_info[species]["main_sister_species"]]:
            self.sister_sp_fullname = organisms_info[
                organisms_info[species]["main_sister_species"]]["full_name"]
        # full names of ortholog species, set only if every ortholog has a full name
        # NOTE(review): initialized to "" but assigned a list below — callers appear
        # to treat it as an iterable; confirm the empty-string sentinel is intended
        self.orth_fullnames = ""
        if "ortholog" in organisms_info[species] and all([
                "full_name" in organisms_info[ortholog_sp]
                for ortholog_sp in organisms_info[species]["ortholog"]
        ]):
            self.orth_fullnames = [
                organisms_info[ortholog_sp]["full_name"]
                for ortholog_sp in organisms_info[species]["ortholog"]
            ]
        # expression cluster file prefixes are optional per-species settings
        expression_cluster_anatomy_prefix = organisms_info[species]["ec_anatomy_prefix"] if \
            "ec_anatomy_prefix" in organisms_info[species] else None
        expression_cluster_molreg_prefix = organisms_info[species]["ec_molreg_prefix"] if \
            "ec_molreg_prefix" in organisms_info[species] else None
        expression_cluster_genereg_prefix = organisms_info[species]["ec_genereg_prefix"] if \
            "ec_genereg_prefix" in organisms_info[species] else None
        super().__init__(go_relations=go_relations,
                         do_relations=do_relations,
                         use_cache=use_cache)
        # gene data (gene id list) file
        self.gene_data_cache_path = os.path.join(
            cache_location, "wormbase", release_version, "species", species,
            project_id, "annotation", species + '.' + project_id + '.' +
            release_version + ".geneIDs.txt.gz")
        self.gene_data_url = raw_files_source + '/' + release_version + '/species/' + species + '/' + project_id + \
                             '/annotation/' + species + '.' + project_id + '.' + release_version + '.geneIDs.txt.gz'
        # GO ontology and GO annotations (gaf)
        self.go_ontology_cache_path = os.path.join(
            cache_location, "wormbase", release_version, "ONTOLOGY",
            "gene_ontology." + release_version + ".obo")
        self.go_ontology_url = raw_files_source + '/' + release_version + '/ONTOLOGY/gene_ontology.' + \
                               release_version + '.obo'
        self.go_associations_cache_path = os.path.join(
            cache_location, "wormbase", release_version, "species", species,
            project_id, "annotation", species + '.' + project_id + '.' +
            release_version + ".go_annotations.gaf.gz")
        self.go_associations_url = raw_files_source + '/' + release_version + '/species/' + species + '/' + \
                                   project_id + '/annotation/' + species + '.' + project_id + '.' + release_version + \
                                   '.go_annotations.gaf.gz'
        # disease ontology and DO annotations (wb format plus the newer daf format)
        self.do_ontology_url = raw_files_source + '/' + release_version + '/ONTOLOGY/disease_ontology.' + \
                               release_version + '.obo'
        self.do_ontology_cache_path = os.path.join(
            cache_location, "wormbase", release_version, "ONTOLOGY",
            "disease_ontology." + release_version + ".obo")
        self.do_associations_cache_path = os.path.join(
            cache_location, "wormbase", release_version, "species", species,
            project_id, "annotation", species + '.' + project_id + '.' +
            release_version + ".do_annotations.wb")
        self.do_associations_url = raw_files_source + '/' + release_version + '/ONTOLOGY/disease_association.' + \
                                   release_version + '.wb'
        self.do_associations_new_cache_path = os.path.join(
            cache_location, "wormbase", release_version, "species", species,
            project_id, "annotation", species + '.' + project_id + '.' +
            release_version + ".do_annotations.daf.txt")
        self.do_associations_new_url = raw_files_source + '/' + release_version + '/ONTOLOGY/disease_association.' + \
                                       release_version + '.daf.txt'
        # orthology data
        self.orthology_url = raw_files_source + '/' + release_version + '/species/' + species + '/' + project_id + \
                             '/annotation/' + species + '.' + project_id + '.' + release_version + '.orthologs.txt.gz'
        self.orthology_cache_path = os.path.join(
            cache_location, "wormbase", release_version, "species", species,
            project_id, "annotation", species + '.' + project_id + '.' +
            release_version + ".orthologs.txt.gz")
        self.orthologs = defaultdict(lambda: defaultdict(list))
        # protein domain data
        self.protein_domain_url = raw_files_source + '/' + release_version + '/species/' + species + '/' + \
                                  project_id + '/annotation/' + species + '.' + project_id + '.' + release_version + \
                                  '.protein_domains.csv.gz'
        self.protein_domain_cache_path = os.path.join(
            cache_location, "wormbase", release_version, "species", species,
            project_id, "annotation", species + '.' + project_id + '.' +
            release_version + ".protein_domains.csv.gz")
        self.protein_domains = defaultdict(list)
        # anatomy (expression) ontology and associations
        self.expression_ontology_cache_path = os.path.join(
            cache_location, "wormbase", release_version, "ONTOLOGY",
            "anatomy_ontology." + release_version + ".obo")
        self.expression_ontology_url = raw_files_source + '/' + release_version + '/ONTOLOGY/anatomy_ontology.' + \
                                       release_version + '.obo'
        self.expression_associations_cache_path = os.path.join(
            cache_location, "wormbase", release_version, "ONTOLOGY",
            "anatomy_association." + release_version + ".wb")
        self.expression_associations_url = raw_files_source + '/' + release_version + \
                                           '/ONTOLOGY/anatomy_association.' + release_version + '.wb'
        # expression cluster files (anatomy / molecule regulation / gene regulation);
        # urls are None when the species has no prefix configured, and the matching
        # data dict is then also None
        self.expression_cluster_anatomy_url = self._get_expression_cluster_url(
            prefix=expression_cluster_anatomy_prefix,
            ec_type="anatomy",
            release_version=release_version)
        self.expression_cluster_anatomy_cache_path = self._get_expression_cluster_cache_path(
            prefix=expression_cluster_anatomy_prefix,
            ec_type="anatomy",
            release_version=release_version,
            cache_location=cache_location)
        self.expression_cluster_anatomy_data = defaultdict(
            list) if self.expression_cluster_anatomy_url else None
        self.expression_cluster_molreg_url = self._get_expression_cluster_url(
            prefix=expression_cluster_molreg_prefix,
            ec_type="molReg",
            release_version=release_version)
        self.expression_cluster_molreg_cache_path = self._get_expression_cluster_cache_path(
            prefix=expression_cluster_molreg_prefix,
            ec_type="molReg",
            release_version=release_version,
            cache_location=cache_location)
        self.expression_cluster_molreg_data = defaultdict(
            list) if self.expression_cluster_molreg_url else None
        self.expression_cluster_genereg_url = self._get_expression_cluster_url(
            prefix=expression_cluster_genereg_prefix,
            ec_type="geneReg",
            release_version=release_version)
        self.expression_cluster_genereg_cache_path = self._get_expression_cluster_cache_path(
            prefix=expression_cluster_genereg_prefix,
            ec_type="geneReg",
            release_version=release_version,
            cache_location=cache_location)
        self.expression_cluster_genereg_data = defaultdict(
            list) if self.expression_cluster_genereg_url else None
Example #20
0
 def load_associations_from_file(
         self,
         associations_type: DataType,
         associations_url: str,
         associations_cache_path: str,
         config: GenedescConfigParser,
         association_additional_url: str = None,
         association_additional_cache_path: str = None) -> None:
     """Load gene associations of the given type from (possibly cached) files.

     Args:
         associations_type (DataType): type of associations to load (GO, EXPR, or DO)
         associations_url (str): url of the main associations file
         associations_cache_path (str): local cache path for the main file
         config (GenedescConfigParser): config used to read term blacklists
         association_additional_url (str): optional url of an additional DO
             association file (daf format) merged with the IEA annotations
         association_additional_cache_path (str): local cache path for the
             additional DO file
     """
     logger.info("Loading associations from file")
     if associations_type == DataType.GO:
         # GO files use the standard gaf format handled by the parent class
         super().load_associations_from_file(
             associations_type=associations_type,
             associations_url=associations_url,
             associations_cache_path=associations_cache_path,
             config=config)
     elif associations_type == DataType.EXPR:
         associations = []
         file_path = self._get_cached_file(
             cache_path=associations_cache_path,
             file_source_url=associations_url)
         # fix: use a context manager so the file handle is always closed
         with open(file_path) as assoc_file:
             for line in assoc_file:
                 if not line.strip().startswith("!"):
                     linearr = line.strip().split("\t")
                     # keep only annotations to terms present in the anatomy ontology
                     if self.expression_ontology.node(linearr[4]):
                         gene_id = linearr[0] + ":" + linearr[1]
                         qualifiers = linearr[3].split("|")
                         # NOTE(review): str.split never returns an empty list, so the
                         # len == 0 test can never fire; "Partial"/"Certain" qualifiers
                         # are normalized to "Verified" — confirm this is intended
                         if len(
                                 qualifiers
                         ) == 0 or "Partial" in qualifiers or "Certain" in qualifiers:
                             qualifiers = ["Verified"]
                         associations.append(
                             DataManager.create_annotation_record(
                                 line, gene_id, linearr[2], linearr[11],
                                 linearr[12], linearr[4], qualifiers,
                                 linearr[8], linearr[6], linearr[5].split("|"),
                                 linearr[14], linearr[13]))
         self.expression_associations = AssociationSetFactory(
         ).create_from_assocs(assocs=associations,
                              ontology=self.expression_ontology)
         # drop annotations to blacklisted expression terms
         self.expression_associations = self.remove_blacklisted_annotations(
             association_set=self.expression_associations,
             ontology=self.expression_ontology,
             terms_blacklist=config.get_module_property(
                 module=Module.EXPRESSION,
                 prop=ConfigModuleProperty.EXCLUDE_TERMS))
     elif associations_type == DataType.DO:
         self.do_associations = AssociationSetFactory().create_from_assocs(
             assocs=GafParser().parse(file=self._get_cached_file(
                 cache_path=associations_cache_path,
                 file_source_url=associations_url),
                                      skipheader=True),
             ontology=self.do_ontology)
         if association_additional_cache_path and association_additional_url:
             # keep only IEA annotations from the main file and extend them with
             # non-IEA annotations parsed from the additional daf file
             associations = []
             for subj_associations in self.do_associations.associations_by_subj.values():
                 for association in subj_associations:
                     if association["evidence"]["type"] == "IEA":
                         associations.append(association)
             file_path = self._get_cached_file(
                 cache_path=association_additional_cache_path,
                 file_source_url=association_additional_url)
             header = True
             # fix: use a context manager so the file handle is always closed
             with open(file_path) as additional_file:
                 for line in additional_file:
                     if not line.strip().startswith("!"):
                         # the first non-comment line is the column header row: skip it
                         if not header:
                             linearr = line.strip().split("\t")
                             if self.do_ontology.node(
                                     linearr[10]) and linearr[16] != "IEA":
                                 gene_ids = [linearr[2]]
                                 # allele annotations may list several affected genes
                                 if linearr[1] == "allele":
                                     gene_ids = linearr[4].split(",")
                                 for gene_id in gene_ids:
                                     associations.append(
                                         DataManager.create_annotation_record(
                                             line, gene_id, linearr[3],
                                             linearr[1], linearr[0],
                                             linearr[10], linearr[9].split("|"),
                                             "D", linearr[16],
                                             linearr[18].split("|"),
                                             linearr[20], linearr[19]))
                         else:
                             header = False
             self.do_associations = AssociationSetFactory(
             ).create_from_assocs(assocs=associations,
                                  ontology=self.do_ontology)
         # always apply the configured DO term blacklist
         self.do_associations = self.remove_blacklisted_annotations(
             association_set=self.do_associations,
             ontology=self.do_ontology,
             terms_blacklist=config.get_module_property(
                 module=Module.DO_EXPERIMENTAL,
                 prop=ConfigModuleProperty.EXCLUDE_TERMS))
Example #21
0
    def __init__(self,
                 gene_id: str,
                 module: Module,
                 data_manager: DataManager,
                 config: GenedescConfigParser,
                 limit_to_group: str = None,
                 humans: bool = False):
        """initialize sentence generator object

        Args:
            gene_id (str): id of the gene to generate sentences for
            module (Module): the description module to generate sentences for
            data_manager (DataManager): source of ontologies and annotations
            config (GenedescConfigParser): an optional config object from which to read the options
            limit_to_group (str): limit the evidence codes to the specified group
            humans (bool): whether to use the human-specific sentence templates
        """
        annot_type = None
        # fix: the original read DO_BIOMARKER off the enum *member* (module.DO_BIOMARKER);
        # accessing a member through another member is deprecated and removed in Python 3.12
        if module in (Module.DO_ORTHOLOGY, Module.DO_EXPERIMENTAL, Module.DO_BIOMARKER):
            self.ontology = data_manager.do_ontology
            annot_type = DataType.DO
        elif module == Module.GO:
            self.ontology = data_manager.go_ontology
            annot_type = DataType.GO
        elif module == Module.EXPRESSION:
            self.ontology = data_manager.expression_ontology
            annot_type = DataType.EXPR
        self.evidence_groups_priority_list = config.get_evidence_groups_priority_list(
            module=module)
        self.prepostfix_sentences_map = config.get_prepostfix_sentence_map(
            module=module, humans=humans)
        self.terms_groups = defaultdict(lambda: defaultdict(set))
        ev_codes_groups_maps = config.get_evidence_codes_groups_map(
            module=module)
        annotations = data_manager.get_annotations_for_gene(
            gene_id=gene_id,
            annot_type=annot_type,
            priority_list=config.get_annotations_priority(module=module))
        self.annotations = annotations
        self.module = module
        self.data_manager = data_manager
        self.annot_type = annot_type
        # keep only the evidence codes belonging to the requested group, if any
        evidence_codes_groups_map = {
            evcode: group
            for evcode, group in ev_codes_groups_maps.items()
            if limit_to_group is None
            or limit_to_group in ev_codes_groups_maps[evcode]
        }
        prepostfix_special_cases_sent_map = config.get_prepostfix_sentence_map(
            module=module, special_cases_only=True, humans=humans)
        # group annotated term ids by (aspect, qualifier) and evidence group
        for annotation in annotations:
            if annotation["evidence"]["type"] in evidence_codes_groups_map:
                aspect = annotation["aspect"]
                ev_group = evidence_codes_groups_map[annotation["evidence"]["type"]]
                qualifier = "_".join(sorted(annotation["qualifiers"])) if "qualifiers" in annotation else ""
                # special-case templates can redirect an annotation to a dedicated
                # evidence group when the term label matches the configured pattern
                if prepostfix_special_cases_sent_map and (aspect, ev_group, qualifier) in \
                        prepostfix_special_cases_sent_map:
                    for special_case in prepostfix_special_cases_sent_map[(aspect, ev_group, qualifier)]:
                        if re.match(re.escape(special_case[1]),
                                    self.ontology.label(annotation["object"]["id"], id_if_null=True)):
                            ev_group = evidence_codes_groups_map[annotation["evidence"]["type"]] + \
                                       str(special_case[0])
                            if ev_group not in self.evidence_groups_priority_list:
                                # insert the special group right after its base group
                                self.evidence_groups_priority_list.insert(
                                    self.evidence_groups_priority_list.index(
                                        evidence_codes_groups_map[annotation["evidence"]["type"]]) + 1,
                                    ev_group)
                            break
                self.terms_groups[(aspect, qualifier)][ev_group].add(
                    annotation["object"]["id"])
def main():
    parser = argparse.ArgumentParser(
        description="Generate gene descriptions for wormbase")
    parser.add_argument("-c",
                        "--config-file",
                        metavar="config_file",
                        dest="config_file",
                        type=str,
                        default="config.yml",
                        help="configuration file. Default ./config.yaml")
    parser.add_argument(
        "-C",
        "--use-cache",
        dest="use_cache",
        action="store_true",
        default=False,
        help=
        "Use cached source files from cache_location specified in config file. Download them from "
        "raw_file_source (configured in config file) if not yet cached")
    parser.add_argument(
        "-l",
        "--log-file",
        metavar="log_file",
        dest="log_file",
        type=str,
        default=None,
        help="path to the log file to generate. Default ./genedescriptions.log"
    )
    parser.add_argument(
        "-L",
        "--log-level",
        dest="log_level",
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
        help="set the logging level")
    parser.add_argument("-t",
                        "--textpressoapi-token",
                        metavar="textpresso_token",
                        dest="textpresso_token",
                        type=str,
                        help="Texpresso api token")
    parser.add_argument("-o",
                        "--output-formats",
                        metavar="output_formats",
                        dest="output_formats",
                        type=str,
                        nargs="+",
                        default=["ace", "txt", "json", "tsv"],
                        help="file formats to generate. Accepted values "
                        "are: ace, txt, json, tsv")
    args = parser.parse_args()
    conf_parser = GenedescConfigParser(args.config_file)
    logging.basicConfig(filename=args.log_file,
                        level=args.log_level,
                        format='%(asctime)s - %(name)s - %(levelname)s:'
                        '%(message)s',
                        force=True)
    logger = logging.getLogger("WB Gene Description Pipeline")
    organisms_list = conf_parser.get_wb_organisms_to_process()
    human_genes_props = DataManager.get_human_gene_props()
    api_manager = APIManager(textpresso_api_token=args.textpresso_token)
    for organism in organisms_list:
        logger.info("Processing organism " + organism)
        species = conf_parser.get_wb_organisms_info()
        dm, sister_df, df_agr = load_data(organism=organism,
                                          conf_parser=conf_parser)
        desc_writer = DescriptionsWriter()
        desc_writer.overall_properties.species = organism
        desc_writer.overall_properties.release_version = conf_parser.get_wb_release(
        )[0:-1] + str(int(conf_parser.get_wb_release()[-1]) + 1)
        desc_writer.overall_properties.date = datetime.date.today().strftime(
            "%B %d, %Y")
        for gene in dm.get_gene_data():
            logger.debug("Generating description for gene " + gene.name)
            gene_desc = GeneDescription(gene_id=gene.id,
                                        config=conf_parser,
                                        gene_name=gene.name,
                                        add_gene_name=False)
            selected_orthologs, orth_sent = get_best_orthologs_and_sentence(
                dm=dm,
                orth_fullnames=dm.orth_fullnames,
                human_genes_props=human_genes_props,
                gene_desc=gene_desc,
                api_manager=api_manager,
                config=conf_parser)
            set_gene_ontology_module(dm=dm,
                                     conf_parser=conf_parser,
                                     gene_desc=gene_desc,
                                     gene=gene)
            set_tissue_expression_sentence(dm=dm,
                                           gene=gene,
                                           conf_parser=conf_parser,
                                           gene_desc=gene_desc)
            if not gene_desc.description:
                set_expression_cluster_sentence(dm=dm,
                                                conf_parser=conf_parser,
                                                gene_desc=gene_desc,
                                                gene=gene,
                                                api_manager=api_manager)
            set_disease_module(df=dm,
                               conf_parser=conf_parser,
                               gene=gene,
                               gene_desc=gene_desc)
            if not gene_desc.go_description:
                set_information_poor_sentence(
                    orth_fullnames=dm.orth_fullnames,
                    selected_orthologs=selected_orthologs,
                    conf_parser=conf_parser,
                    human_df_agr=df_agr,
                    gene_desc=gene_desc,
                    dm=dm,
                    gene=gene)
            gene_desc.set_or_extend_module_description_and_final_stats(
                module=Module.ORTHOLOGY, description=orth_sent)
            if "main_sister_species" in species[organism] and species[organism]["main_sister_species"] and \
                    dm.get_best_orthologs_for_gene(gene.id, orth_species_full_name=[dm.sister_sp_fullname],
                                                   sister_species_data_fetcher=sister_df,
                                                   ecode_priority_list=["EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "HTP",
                                                                        "HDA", "HMP", "HGI", "HEP"])[0]:
                set_sister_species_sentence(
                    dm=dm,
                    sister_sp_fullname=dm.sister_sp_fullname,
                    sister_df=sister_df,
                    species=species,
                    organism=organism,
                    gene_desc=gene_desc,
                    conf_parser=conf_parser,
                    gene=gene)
            desc_writer.add_gene_desc(gene_desc)
        logger.info("All genes processed for " + organism)
        date_prefix = datetime.date.today().strftime("%Y%m%d")
        if "json" in args.output_formats:
            logger.info("Writing descriptions to json")
            desc_writer.write_json(os.path.join(
                conf_parser.get_out_dir(),
                date_prefix + "_" + organism + ".json"),
                                   include_single_gene_stats=True,
                                   data_manager=dm)
        if "txt" in args.output_formats:
            logger.info("Writing descriptions to txt")
            desc_writer.write_plain_text(
                os.path.join(conf_parser.get_out_dir(),
                             date_prefix + "_" + organism + ".txt"))
        if "tsv" in args.output_formats:
            logger.info("Writing descriptions to tsv")
            desc_writer.write_tsv(
                os.path.join(conf_parser.get_out_dir(),
                             date_prefix + "_" + organism + ".tsv"))
        if "ace" in args.output_formats:
            logger.info("Writing descriptions to ace")
            curators = ["WBPerson324", "WBPerson37462"]
            release_version = conf_parser.get_wb_release()
            desc_writer.write_ace(
                os.path.join(conf_parser.get_out_dir(),
                             date_prefix + "_" + organism + ".ace"), curators,
                release_version)
def set_expression_cluster_sentence(dm: WBDataManager,
                                    conf_parser: GenedescConfigParser,
                                    gene_desc: GeneDescription, gene: Gene,
                                    api_manager: APIManager):
    """Add expression cluster sentences to the gene description.

    Builds up to three sentences from WormBase expression cluster data —
    anatomy enrichment, gene regulation, and molecule (chemical)
    regulation — and appends each to the corresponding module of the
    gene description.

    Args:
        dm (WBDataManager): data manager holding expression cluster data
        conf_parser (GenedescConfigParser): configuration object
        gene_desc (GeneDescription): the gene description object to update
        gene (Gene): the gene to process
        api_manager (APIManager): api manager used to rank gene regulation
            terms by textpresso popularity
    """
    expr_sentence_generator = OntologySentenceGenerator(
        gene_id=gene.id,
        module=Module.EXPRESSION,
        data_manager=dm,
        config=conf_parser)
    # expression cluster data is keyed without the first 3 characters of the
    # gene id — presumably a "WB:" prefix; confirm against the data source
    ec_gene_id = gene_desc.gene_id[3:]
    ec_anatomy_studies = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id,
        expression_cluster_type=ExpressionClusterType.ANATOMY,
        feature=ExpressionClusterFeature.STUDIES)
    ec_anatomy_terms = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id,
        feature=ExpressionClusterFeature.TERMS,
        expression_cluster_type=ExpressionClusterType.ANATOMY)
    if dm.expression_ontology is not None:
        # an expression ontology is available: build the anatomy sentence
        # from enriched ontology terms
        expression_enriched_module_sentences = expr_sentence_generator.get_module_sentences(
            aspect='A',
            qualifier="Enriched",
            merge_groups_with_same_prefix=True,
            keep_only_best_group=False)
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_ANATOMY,
            description=expression_enriched_module_sentences.get_description(),
            additional_postfix_terms_list=ec_anatomy_studies,
            additional_postfix_final_word="studies",
            use_single_form=True)
    elif ec_anatomy_terms:
        # no ontology available: fall back to a flat list of anatomy terms
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_ANATOMY,
            description="is enriched in " +
            concatenate_words_with_oxford_comma(
                ec_anatomy_terms,
                separator=conf_parser.get_terms_delimiter()) + " based on",
            additional_postfix_terms_list=ec_anatomy_studies,
            additional_postfix_final_word="studies",
            use_single_form=True)
    ec_molreg_terms = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id,
        expression_cluster_type=ExpressionClusterType.MOLREG,
        feature=ExpressionClusterFeature.TERMS)
    ec_molreg_studies = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id,
        feature=ExpressionClusterFeature.STUDIES,
        expression_cluster_type=ExpressionClusterType.MOLREG)
    ec_genereg_terms = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id,
        expression_cluster_type=ExpressionClusterType.GENEREG,
        feature=ExpressionClusterFeature.TERMS)
    ec_genereg_studies = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id,
        feature=ExpressionClusterFeature.STUDIES,
        expression_cluster_type=ExpressionClusterType.GENEREG)
    if ec_genereg_terms:
        several_word = ""
        if len(ec_genereg_terms) > 3:
            # keep only the 3 most popular terms: sort by textpresso
            # popularity (descending), breaking ties on the term itself.
            # The previous key used x[0][1] — the term's *second character* —
            # presumably a copy-paste from the ortholog ranking where items
            # are [id, symbol] pairs.
            terms_by_popularity = sorted(
                [[term, api_manager.get_textpresso_popularity(term)]
                 for term in ec_genereg_terms],
                key=lambda x: (x[1], x[0]),
                reverse=True)
            ec_genereg_terms = [term for term, popularity in terms_by_popularity[0:3]]
            several_word = "several genes including "
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_GENE,
            description="is affected by " + several_word +
            concatenate_words_with_oxford_comma(
                ec_genereg_terms,
                separator=conf_parser.get_terms_delimiter()) + " based on",
            additional_postfix_terms_list=ec_genereg_studies,
            additional_postfix_final_word="studies",
            use_single_form=True)
    if ec_molreg_terms:
        several_word = ""
        if len(ec_molreg_terms) > 3:
            # spell out the total count while only the first 3 are listed below
            several_word = num2words(
                len(ec_molreg_terms)) + " chemicals including "
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_MOLECULE,
            description="is affected by " + several_word +
            concatenate_words_with_oxford_comma(
                ec_molreg_terms[0:3],
                separator=conf_parser.get_terms_delimiter()) + " based on",
            additional_postfix_terms_list=ec_molreg_studies,
            additional_postfix_final_word="studies",
            use_single_form=True)
# ---- Example #24 (score: 0) ----
def generate_ortholog_sentence_wormbase_non_c_elegans(orthologs: List[List[str]], orthologs_sp_fullname: str,
                                                      api_manager: APIManager, config: GenedescConfigParser):
    """build orthology sentence for WormBase non-C. elegans orthologs

        Args:
            orthologs (List[List[str]]): list of orthologs, each a [gene_id, gene_symbol] pair
            orthologs_sp_fullname (str): full name of species from which to extract orthologs
            api_manager (APIManager): api manager to send requests to wormbase and textpresso
            config (GenedescConfigParser): a gene description configuration object
        Returns:
            str: the orthology sentence, or None if no orthologs are provided
        """
    orth_sentence = None
    if len(orthologs) > 0:
        # abbreviate the genus to its initial, e.g. "Caenorhabditis elegans" -> "C. elegans"
        fullname_arr = orthologs_sp_fullname.split(" ")
        if len(fullname_arr[0]) > 2:
            fullname_arr[0] = fullname_arr[0][0] + "."
            orthologs_sp_fullname = " ".join(fullname_arr)
        if len(orthologs) > 3:
            # sort orthologs by textpresso popularity, breaking ties on the gene
            # symbol (reverse sort, so tied symbols end up in reverse
            # alphabetical order)
            orthologs_pop = sorted(
                [[ortholog, api_manager.get_textpresso_popularity(ortholog[1])] for ortholog in orthologs],
                key=lambda x: (x[1], x[0][1]), reverse=True)
            # split orthologs into those belonging to a WormBase gene class
            # and those without one
            classes_orth_pop = defaultdict(list)
            orthologs_pop_wo_class = []
            for o_p in orthologs_pop:
                gene_class = api_manager.get_gene_class(o_p[0][0])
                if gene_class:
                    classes_orth_pop[gene_class].append(o_p)
                else:
                    orthologs_pop_wo_class.append(o_p)
            if len(classes_orth_pop) == 1:
                # a single gene class adds no grouping information: treat its
                # members as class-less
                orthologs_pop_wo_class.extend(next(iter(classes_orth_pop.values())))
                classes_orth_pop = {}
            else:
                # classes with a single member are reported as plain genes
                for gene_class, orths_with_pop in classes_orth_pop.items():
                    if len(orths_with_pop) == 1:
                        orthologs_pop_wo_class.extend(orths_with_pop)
            # each surviving class (>1 member) is represented by its most popular ortholog
            classes_orth_pop = {gene_class: ops[0] for gene_class, ops in classes_orth_pop.items() if len(ops) > 1}
            # merge plain genes (tag 0) and class representatives (tag 1),
            # re-sort by popularity and keep the top 3 entries
            sorted_items = [[o_p, 0] for o_p in orthologs_pop_wo_class]
            sorted_items.extend([[o_p, 1, gene_class] for gene_class, o_p in classes_orth_pop.items()])
            sorted_items.sort(key=lambda x: x[0][1], reverse=True)
            if len(sorted_items) > 3:
                sorted_items = sorted_items[0:3]
            gene_symbols_wo_class = [item[0][0][1] for item in sorted_items if item[1] == 0]
            classes_symbols = [item[2] for item in sorted_items if item[1] == 1]
            genes_symbols_in_classes = [item[0][0][1] for item in sorted_items if item[1] == 1]
            sentences_arr = []
            if len(gene_symbols_wo_class) > 0:
                sentences_arr.append(orthologs_sp_fullname + " " + concatenate_words_with_oxford_comma(
                    gene_symbols_wo_class, separator=config.get_terms_delimiter()))
            if len(classes_symbols) > 0:
                genes_symbols_in_classes_sent = concatenate_words_with_oxford_comma(
                    genes_symbols_in_classes, separator=config.get_terms_delimiter())
                classes_symbols_sent = concatenate_words_with_oxford_comma(classes_symbols,
                                                                           separator=config.get_terms_delimiter())
                classes_word = "classes" if len(classes_symbols) > 1 else "class"
                sentences_arr.append("members of the " + orthologs_sp_fullname + " " + classes_symbols_sent +
                                     " gene " + classes_word + " including " + genes_symbols_in_classes_sent)
            orth_sentence = "is an ortholog of " + " and ".join(sentences_arr)
        else:
            # 3 or fewer orthologs: list them all, sorted alphabetically
            orthologs_symbols = sorted([orth[1] for orth in orthologs])
            orth_sentence = "is an ortholog of " + orthologs_sp_fullname + " " + \
                            concatenate_words_with_oxford_comma(orthologs_symbols,
                                                                separator=config.get_terms_delimiter())
    return orth_sentence
# ---- Example #25 (score: 0) ----
    def load_ontology_from_file(self, ontology_type: DataType,
                                ontology_url: str, ontology_cache_path: str,
                                config: GenedescConfigParser) -> None:
        """Load an ontology (GO, DO, or Expression) from a cached file.

        Fetches the ontology file (using the local cache when available),
        builds the subontology, applies any term renaming configured for
        the corresponding module, computes node depths for every root,
        and finally loads the module's slim.

        Args:
            ontology_type (DataType): the type of ontology to set
            ontology_url (str): url to the ontology file
            ontology_cache_path (str): path to cache file for the ontology
            config (GenedescConfigParser): configuration object where to read properties
        """
        loaded_ontology = None
        module = None
        slim_cache_path = ""
        # slim files are cached next to the main ontology cache file
        cache_dir = os.path.dirname(os.path.normpath(ontology_cache_path))
        if ontology_type == DataType.GO:
            logger.info("Loading GO ontology data from file")
            cached_file = self._get_cached_file(file_source_url=ontology_url,
                                                cache_path=ontology_cache_path)
            self.go_ontology = OntologyFactory().create(cached_file).subontology(
                relations=self.go_relations)
            loaded_ontology = self.go_ontology
            module = Module.GO
            slim_cache_path = os.path.join(cache_dir, "go_slim.obo")
        elif ontology_type == DataType.DO:
            logger.info("Loading DO ontology data from file")
            cached_file = self._get_cached_file(file_source_url=ontology_url,
                                                cache_path=ontology_cache_path)
            self.do_ontology = OntologyFactory().create(cached_file).subontology(
                relations=self.do_relations)
            loaded_ontology = self.do_ontology
            module = Module.DO_EXPERIMENTAL
            slim_cache_path = os.path.join(cache_dir, "do_slim.obo")
        elif ontology_type == DataType.EXPR:
            logger.info("Loading Expression ontology data from file")
            cached_file = self._get_cached_file(file_source_url=ontology_url,
                                                cache_path=ontology_cache_path)
            # expression subontology keeps all relations (no filter)
            self.expression_ontology = OntologyFactory().create(cached_file).subontology()
            loaded_ontology = self.expression_ontology
            module = Module.EXPRESSION
            slim_cache_path = os.path.join(cache_dir, "exp_slim.obo")
        rename_regex = config.get_module_property(
            module=module, prop=ConfigModuleProperty.RENAME_TERMS)
        if rename_regex:
            self.rename_ontology_terms(ontology=loaded_ontology,
                                       terms_replacement_regex=rename_regex)
        if ontology_type == DataType.EXPR:
            DataManager.add_article_to_expression_nodes(self.expression_ontology)
        # precompute node depths, required by the trimming algorithms
        for root_id in loaded_ontology.get_roots():
            set_all_depths_in_subgraph(ontology=loaded_ontology,
                                       root_id=root_id,
                                       relations=None)
        slim_url = config.get_module_property(
            module=module, prop=ConfigModuleProperty.SLIM_URL)
        self.load_slim(module=module,
                       slim_url=slim_url,
                       slim_cache_path=slim_cache_path)
    def _load_and_process_data(self):
        """Generate gene descriptions for every configured data provider.

        Builds a genedescriptions DataManager seeded with the GO and DO
        ontologies already held by this object, then, per MOD sub-type:
        loads provider-specific GO/DO (and optionally expression)
        annotations, runs the description generators, writes the results
        to Neo4j via CSV transactions, and saves the report files.
        """
        # create gene descriptions data manager and load common data
        context_info = ContextInfo()
        data_manager = DataFileManager(context_info.config_file_location)
        # go_onto_config = data_manager.get_config('GO')
        go_annot_config = data_manager.get_config('GAF')
        # do_onto_config = data_manager.get_config('DOID')
        # map each data provider (MOD) to its GAF sub-type descriptor
        go_annot_sub_dict = {sub.get_data_provider(): sub for sub in go_annot_config.get_sub_type_objects()}
        this_dir = os.path.split(__file__)[0]
        gd_config = GenedescConfigParser(os.path.join(this_dir,
                                                      os.pardir,
                                                      os.pardir,
                                                      "gene_descriptions.yml"))
        gd_data_manager = DataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"])
        gd_data_manager.set_ontology(ontology_type=DataType.GO,
                                     ontology=self.get_ontology(data_type=DataType.GO),
                                     config=gd_config)
        gd_data_manager.set_ontology(ontology_type=DataType.DO,
                                     ontology=self.get_ontology(data_type=DataType.DO),
                                     config=gd_config)
        # generate descriptions for each MOD
        for prvdr in [sub_type.get_data_provider().upper() \
                      for sub_type in self.data_type_config.get_sub_type_objects()]:
            # deep copy so per-MOD config tweaks don't leak into later iterations
            gd_config_mod_specific = copy.deepcopy(gd_config)
            if prvdr == "WB":
                # WB-specific option: drop child terms when a parent is present
                gd_config_mod_specific.config["expression_sentences_options"][
                    "remove_children_if_parent_is_present"] = True
            self.logger.info("Generating gene descriptions for %s", prvdr)
            # NOTE(review): HUMAN annotations appear to be fetched under the
            # "RGD" provider in the DB — confirm against the DB schema
            data_provider = prvdr if prvdr != "HUMAN" else "RGD"
            json_desc_writer = DescriptionsWriter()
            go_annot_path = "file://" + os.path.join(os.getcwd(),
                                                     "tmp",
                                                     go_annot_sub_dict[prvdr].file_to_download)
            gd_data_manager.load_associations_from_file(
                associations_type=DataType.GO, associations_url=go_annot_path,
                associations_cache_path=os.path.join(os.getcwd(),
                                                     "tmp",
                                                     "gd_cache",
                                                     "go_annot_" + prvdr + ".gaf"),
                config=gd_config_mod_specific)
            gd_data_manager.set_associations(associations_type=DataType.DO,
                                             associations=self.get_disease_annotations_from_db(
                                                 data_provider=data_provider,
                                                 gd_data_manager=gd_data_manager,
                                                 logger=self.logger),
                                             config=gd_config_mod_specific)
            if prvdr in EXPRESSION_PRVD_SUBTYPE_MAP:
                # expression data is only available for a subset of providers
                gd_data_manager.set_ontology(ontology_type=DataType.EXPR,
                                             ontology=self.get_ontology(data_type=DataType.EXPR,
                                                                        provider=prvdr),
                                             config=gd_config_mod_specific)
                gd_data_manager.set_associations(
                    associations_type=DataType.EXPR,
                    associations=self.get_expression_annotations_from_db(data_provider=data_provider,
                                                                         gd_data_manager=gd_data_manager,
                                                                         logger=self.logger),
                    config=gd_config_mod_specific)
            commit_size = self.data_type_config.get_neo4j_commit_size()
            generators = self.get_generators(prvdr,
                                             gd_data_manager,
                                             gd_config_mod_specific,
                                             json_desc_writer)
            query_template_list = [
                [self.gene_descriptions_query_template, commit_size,
                 "genedescriptions_data_" + prvdr + ".csv"]
            ]

            query_and_file_list = self.process_query_params(query_template_list)
            CSVTransactor.save_file_static(generators, query_and_file_list)
            Neo4jTransactor.execute_query_batch(query_and_file_list)
            self.save_descriptions_report_files(data_provider=prvdr,
                                                json_desc_writer=json_desc_writer,
                                                context_info=context_info,
                                                gd_data_manager=gd_data_manager)
# ---- Example #27 (score: 0) ----
    def get_trimmed_terms_by_common_ancestor(
            self,
            terms: Set[str],
            terms_already_covered,
            aspect: str,
            config: GenedescConfigParser,
            high_priority_terms: List[str] = None):
        """Trim a set of ontology terms, giving precedence to high priority terms.

        High priority terms are trimmed first when they alone exceed the
        configured maximum sentence length; any remaining slots are filled
        with trimmed low priority terms, after removing low priority terms
        that duplicate, or are children/parents of, high priority ones.

        Args:
            terms (Set[str]): the term ids to trim
            terms_already_covered: set of term ids already used in other
                sentences; updated in place
            aspect (str): ontology aspect key into the distance-from-root map
                (e.g. 'F', 'P', 'C', 'D', 'A')
            config (GenedescConfigParser): configuration object for the
                module's trimming properties
            high_priority_terms (List[str]): terms to keep with higher
                priority than the others
        Returns:
            the trimmed term list, a flag indicating whether any terms were
            dropped (so "others" must be mentioned in the sentence), and the
            set of ancestors that cover multiple children
        """
        dist_root = config.get_module_property(
            module=self.module, prop=ConfigModuleProperty.DISTANCE_FROM_ROOT)
        add_mul_common_anc = config.get_module_property(
            module=self.module,
            prop=ConfigModuleProperty.ADD_MULTIPLE_TO_COMMON_ANCEST)
        max_terms = config.get_module_property(
            module=self.module,
            prop=ConfigModuleProperty.MAX_NUM_TERMS_IN_SENTENCE)
        trimming_algorithm = config.get_module_property(
            module=self.module, prop=ConfigModuleProperty.TRIMMING_ALGORITHM)
        slim_set = self.data_manager.get_slim(module=self.module)
        slim_bonus_perc = config.get_module_property(
            module=self.module, prop=ConfigModuleProperty.SLIM_BONUS_PERC)
        add_others_highp = False
        add_others_lowp = False
        ancestors_covering_multiple_children = set()
        if not dist_root:
            # fall back to per-aspect defaults when not configured
            dist_root = {'F': 1, 'P': 1, 'C': 2, 'D': 3, 'A': 3}
        # set for O(1) membership tests; the empty set also covers the case
        # where no high priority terms were provided
        high_priority_set = set(high_priority_terms) if high_priority_terms else set()
        terms_high_priority = [term for term in terms if term in high_priority_set]
        if len(terms_high_priority) > max_terms:
            # first try to shrink the list by dropping redundant children
            terms_high_priority = self.remove_children_if_parents_present(
                terms_high_priority, self.ontology, terms_already_covered)
        if len(terms_high_priority) > max_terms:
            logger.debug(
                "Reached maximum number of terms. Applying trimming to high priority terms"
            )
            terms_high_priority, add_others_highp = get_best_nodes(
                terms_high_priority,
                trimming_algorithm,
                max_terms,
                self.ontology,
                terms_already_covered,
                ancestors_covering_multiple_children
                if add_mul_common_anc else None,
                slim_bonus_perc,
                dist_root[aspect],
                slim_set,
                nodeids_blacklist=config.get_module_property(
                    module=self.module,
                    prop=ConfigModuleProperty.EXCLUDE_TERMS))
        else:
            terms_already_covered.update(terms_high_priority)
        terms_low_priority = [term for term in terms if term not in high_priority_set]
        # slots left for low priority terms after high priority ones are placed
        trimming_threshold = max_terms - len(terms_high_priority)
        if 0 < trimming_threshold < len(terms_low_priority):
            terms_low_priority, add_others_lowp = get_best_nodes(
                terms_low_priority,
                trimming_algorithm,
                trimming_threshold,
                self.ontology,
                terms_already_covered,
                ancestors_covering_multiple_children
                if add_mul_common_anc else None,
                slim_bonus_perc,
                dist_root[aspect],
                slim_set,
                nodeids_blacklist=config.get_module_property(
                    module=self.module,
                    prop=ConfigModuleProperty.EXCLUDE_TERMS))

        elif trimming_threshold <= 0 < len(terms_low_priority):
            # no room left: all low priority terms are dropped
            add_others_lowp = True
        terms = terms_high_priority
        terms_low_priority_orig = terms_low_priority[:]
        # remove exact overlap with the high priority terms
        terms_low_priority = list(
            set(terms_low_priority) - set(terms_high_priority))
        # remove possible children of terms in the high priority list by
        # de-duplicating against the merged set, then subtracting again below
        terms_low_priority = list(
            set(terms_low_priority) | set(terms_high_priority))
        terms_low_priority = OntologySentenceGenerator.remove_children_if_parents_present(
            terms_low_priority, self.ontology)
        # remove possible parents of terms in the high priority list
        terms_low_priority = list(
            set(terms_low_priority) | set(terms_high_priority))
        terms_low_priority = OntologySentenceGenerator.remove_parents_if_child_present(
            terms_low_priority, self.ontology)
        terms_low_priority = list(
            set(terms_low_priority) - set(terms_high_priority))
        if len(terms_low_priority) < len(terms_low_priority_orig):
            # some low priority terms were dropped by the parent/child cleanup
            add_others_lowp = True
        terms.extend(terms_low_priority)
        # cutoff terms - if number of terms with high priority is higher than max_num_terms
        terms = terms[0:max_terms]
        return terms, add_others_highp or add_others_lowp, ancestors_covering_multiple_children