def setUp(self):
     """Build a DataManager loaded with the GO test ontology and associations.

     Stores the directory of this file in ``self.this_dir``, the parsed test
     configuration in ``self.conf_parser`` and the data manager in
     ``self.df``. Both fixtures are read through ``file://`` URLs pointing
     into the ``data`` directory next to this file, with cache paths inside
     the sibling ``cache`` directory.
     """
     log_format = '%(asctime)s - %(name)s - %(levelname)s: %(message)s'
     logging.basicConfig(filename=None, level="ERROR", format=log_format)
     logger.info("Starting DataManager tests")

     self.this_dir = os.path.dirname(__file__)
     config_path = os.path.join(self.this_dir, os.path.pardir, "tests",
                                "config_test.yml")
     self.conf_parser = GenedescConfigParser(config_path)
     self.df = DataManager(do_relations=None,
                           go_relations=["subClassOf", "BFO:0000050"])

     # Fixture file names are shared between the source URL and the cache path.
     ontology_file = "go_gd_test.obo"
     associations_file = "gene_association_1.7.wb.partial"

     logger.info("Loading go ontology from file")
     ontology_url = "file://" + os.path.join(self.this_dir, "data",
                                             ontology_file)
     ontology_cache = os.path.join(self.this_dir, "cache", ontology_file)
     self.df.load_ontology_from_file(ontology_type=DataType.GO,
                                     ontology_url=ontology_url,
                                     ontology_cache_path=ontology_cache,
                                     config=self.conf_parser)

     logger.info("Loading go associations from file")
     associations_url = "file://" + os.path.join(self.this_dir, "data",
                                                 associations_file)
     associations_cache = os.path.join(self.this_dir, "cache",
                                       associations_file)
     self.df.load_associations_from_file(
         associations_type=DataType.GO,
         associations_url=associations_url,
         associations_cache_path=associations_cache,
         config=self.conf_parser)
 def test_compose_sentence(self):
     """Check the cell-renaming and male-at-end options of compose_sentence.

     Two calls are made against the test configuration:

     * several anatomy terms with ``rename_cell`` and
       ``put_anatomy_male_at_end`` enabled: the result must not mention
       "cell" and must contain "and in male";
     * the single term "cell" with ``rename_cell`` enabled: the result must
       be exactly "Is expressed widely based on experimental observation".
     """
     this_dir = os.path.split(__file__)[0]
     conf_parser = GenedescConfigParser(
         os.path.join(this_dir, os.path.pardir, "tests", "config_test.yml"))
     sentence = compose_sentence(
         prefix="Is expressed in",
         additional_prefix="several processes, including",
         term_names=["cell", "tail", "head", "male"],
         postfix="based on experimental observation",
         config=conf_parser,
         ancestors_with_multiple_children={"head"},
         rename_cell=True,
         put_anatomy_male_at_end=True)
     # Dedicated assertions print the offending sentence when they fail,
     # which a bare assertTrue on a boolean expression cannot do.
     self.assertNotIn("cell", sentence)
     self.assertIn("and in male", sentence)
     sentence = compose_sentence(
         prefix="Is expressed in",
         additional_prefix="several processes, including",
         term_names=["cell"],
         postfix="based on experimental observation",
         config=conf_parser,
         rename_cell=True)
     self.assertEqual(
         sentence,
         "Is expressed widely based on experimental observation")
 def setUp(self):
     """Configure logging and load the shared test configuration.

     Stores the directory of this file in ``self.this_dir`` and the parsed
     ``config_test.yml`` in ``self.conf_parser``.
     """
     log_format = '%(asctime)s - %(name)s - %(levelname)s: %(message)s'
     logging.basicConfig(filename=None, level="INFO", format=log_format)
     logger.info("Starting DataManager tests")

     self.this_dir = os.path.dirname(__file__)
     config_path = os.path.join(self.this_dir, os.path.pardir, "tests",
                                "config_test.yml")
     self.conf_parser = GenedescConfigParser(config_path)
 def load_do_ontology(self):
     """Create a DataManager and load the DO ontology fixture into it.

     Re-initialises ``self.this_dir`` and ``self.conf_parser``, stores a new
     ``DataManager`` in ``self.df`` and loads ``doid.obo`` from the ``data``
     directory next to this file, using the sibling ``cache`` directory for
     the cache path.
     """
     logger.info("Starting Ontology Tools tests")
     self.this_dir = os.path.dirname(__file__)
     config_path = os.path.join(self.this_dir, os.path.pardir, "tests",
                                "config_test.yml")
     self.conf_parser = GenedescConfigParser(config_path)
     self.df = DataManager(do_relations=None)
     logger.info("Loading do ontology from file")
     # Logging is (re)configured only at this point, right before the load.
     log_format = '%(asctime)s - %(name)s - %(levelname)s: %(message)s'
     logging.basicConfig(filename=None, level="ERROR", format=log_format)

     ontology_file = "doid.obo"
     self.df.load_ontology_from_file(
         ontology_type=DataType.DO,
         ontology_url="file://" + os.path.join(self.this_dir, "data",
                                               ontology_file),
         ontology_cache_path=os.path.join(self.this_dir, "cache",
                                          ontology_file),
         config=self.conf_parser)
 def setUp(self):
     """Configure logging and create a WBDataManager for C. elegans.

     Stores the directory of this file in ``self.this_dir``, the parsed
     ``config_test_wb.yml`` (located next to this file) in
     ``self.conf_parser`` and the data manager in ``self.df``.
     """
     log_format = '%(asctime)s - %(name)s - %(levelname)s: %(message)s'
     logging.basicConfig(filename=None, level="ERROR", format=log_format)
     logger.info("Starting DataManager tests")

     self.this_dir = os.path.dirname(__file__)
     config_path = os.path.join(self.this_dir, "config_test_wb.yml")
     self.conf_parser = GenedescConfigParser(config_path)
     self.df = WBDataManager(
         do_relations=None,
         go_relations=["subClassOf", "BFO:0000050"],
         config=self.conf_parser,
         species="c_elegans")
def _next_release_version(release):
    """Return *release* with its trailing number incremented by one.

    The whole run of trailing digits is incremented (not just the last
    character), so ``"WS279"`` becomes ``"WS280"``. The original digit width
    is kept when the number has leading zeros.

    Raises:
        ValueError: if *release* does not end with a digit.
    """
    prefix = release.rstrip("0123456789")
    number = release[len(prefix):]
    if not number:
        raise ValueError(
            "release version does not end with a number: %r" % (release,))
    return prefix + str(int(number) + 1).zfill(len(number))


def main():
    """Generate WormBase gene descriptions for every configured organism.

    Parses the command-line options, configures logging, and then, for each
    organism listed by the configuration, loads its data, builds the
    description of every gene module by module, and writes the collected
    descriptions to the configured output directory in each requested
    format. Output files are named ``<YYYYMMDD>_<organism>.<format>``.
    """
    parser = argparse.ArgumentParser(
        description="Generate gene descriptions for wormbase")
    parser.add_argument("-c",
                        "--config-file",
                        metavar="config_file",
                        dest="config_file",
                        type=str,
                        default="config.yml",
                        help="configuration file. Default ./config.yml")
    parser.add_argument(
        "-C",
        "--use-cache",
        dest="use_cache",
        action="store_true",
        default=False,
        help=
        "Use cached source files from cache_location specified in config file. Download them from "
        "raw_file_source (configured in config file) if not yet cached")
    parser.add_argument(
        "-l",
        "--log-file",
        metavar="log_file",
        dest="log_file",
        type=str,
        default=None,
        help="path to the log file to generate. Default: log to stderr")
    parser.add_argument(
        "-L",
        "--log-level",
        dest="log_level",
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
        help="set the logging level")
    parser.add_argument("-t",
                        "--textpressoapi-token",
                        metavar="textpresso_token",
                        dest="textpresso_token",
                        type=str,
                        help="Textpresso api token")
    parser.add_argument("-o",
                        "--output-formats",
                        metavar="output_formats",
                        dest="output_formats",
                        type=str,
                        nargs="+",
                        default=["ace", "txt", "json", "tsv"],
                        help="file formats to generate. Accepted values "
                        "are: ace, txt, json, tsv")
    args = parser.parse_args()
    conf_parser = GenedescConfigParser(args.config_file)
    # force=True replaces any handlers installed before this call so that the
    # command-line logging options always take effect.
    logging.basicConfig(filename=args.log_file,
                        level=args.log_level,
                        format='%(asctime)s - %(name)s - %(levelname)s:'
                        '%(message)s',
                        force=True)
    logger = logging.getLogger("WB Gene Description Pipeline")
    organisms_list = conf_parser.get_wb_organisms_to_process()
    human_genes_props = DataManager.get_human_gene_props()
    api_manager = APIManager(textpresso_api_token=args.textpresso_token)
    for organism in organisms_list:
        logger.info("Processing organism %s", organism)
        species = conf_parser.get_wb_organisms_info()
        dm, sister_df, df_agr = load_data(organism=organism,
                                          conf_parser=conf_parser)
        desc_writer = DescriptionsWriter()
        desc_writer.overall_properties.species = organism
        # The descriptions are labelled with the release that follows the
        # configured one; the whole trailing number is incremented so that
        # e.g. WS279 yields WS280.
        desc_writer.overall_properties.release_version = _next_release_version(
            conf_parser.get_wb_release())
        desc_writer.overall_properties.date = datetime.date.today().strftime(
            "%B %d, %Y")
        for gene in dm.get_gene_data():
            logger.debug("Generating description for gene %s", gene.name)
            gene_desc = GeneDescription(gene_id=gene.id,
                                        config=conf_parser,
                                        gene_name=gene.name,
                                        add_gene_name=False)
            # The orthology sentence is computed first but only attached to
            # the description after the other modules have been set.
            selected_orthologs, orth_sent = get_best_orthologs_and_sentence(
                dm=dm,
                orth_fullnames=dm.orth_fullnames,
                human_genes_props=human_genes_props,
                gene_desc=gene_desc,
                api_manager=api_manager,
                config=conf_parser)
            set_gene_ontology_module(dm=dm,
                                     conf_parser=conf_parser,
                                     gene_desc=gene_desc,
                                     gene=gene)
            set_tissue_expression_sentence(dm=dm,
                                           gene=gene,
                                           conf_parser=conf_parser,
                                           gene_desc=gene_desc)
            # Expression clusters are used only when nothing has been
            # written for this gene so far.
            if not gene_desc.description:
                set_expression_cluster_sentence(dm=dm,
                                                conf_parser=conf_parser,
                                                gene_desc=gene_desc,
                                                gene=gene,
                                                api_manager=api_manager)
            set_disease_module(df=dm,
                               conf_parser=conf_parser,
                               gene=gene,
                               gene_desc=gene_desc)
            # Genes without a GO description get the information-poor
            # sentence instead.
            if not gene_desc.go_description:
                set_information_poor_sentence(
                    orth_fullnames=dm.orth_fullnames,
                    selected_orthologs=selected_orthologs,
                    conf_parser=conf_parser,
                    human_df_agr=df_agr,
                    gene_desc=gene_desc,
                    dm=dm,
                    gene=gene)
            gene_desc.set_or_extend_module_description_and_final_stats(
                module=Module.ORTHOLOGY, description=orth_sent)
            # The sister-species sentence requires both a configured main
            # sister species and at least one best ortholog for the gene.
            if "main_sister_species" in species[organism] and species[organism]["main_sister_species"] and \
                    dm.get_best_orthologs_for_gene(gene.id, orth_species_full_name=[dm.sister_sp_fullname],
                                                   sister_species_data_fetcher=sister_df,
                                                   ecode_priority_list=["EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "HTP",
                                                                        "HDA", "HMP", "HGI", "HEP"])[0]:
                set_sister_species_sentence(
                    dm=dm,
                    sister_sp_fullname=dm.sister_sp_fullname,
                    sister_df=sister_df,
                    species=species,
                    organism=organism,
                    gene_desc=gene_desc,
                    conf_parser=conf_parser,
                    gene=gene)
            desc_writer.add_gene_desc(gene_desc)
        logger.info("All genes processed for %s", organism)
        date_prefix = datetime.date.today().strftime("%Y%m%d")
        # Common path prefix of every output file; only the extension varies.
        out_path_prefix = os.path.join(conf_parser.get_out_dir(),
                                       date_prefix + "_" + organism)
        if "json" in args.output_formats:
            logger.info("Writing descriptions to json")
            desc_writer.write_json(out_path_prefix + ".json",
                                   include_single_gene_stats=True,
                                   data_manager=dm)
        if "txt" in args.output_formats:
            logger.info("Writing descriptions to txt")
            desc_writer.write_plain_text(out_path_prefix + ".txt")
        if "tsv" in args.output_formats:
            logger.info("Writing descriptions to tsv")
            desc_writer.write_tsv(out_path_prefix + ".tsv")
        if "ace" in args.output_formats:
            logger.info("Writing descriptions to ace")
            curators = ["WBPerson324", "WBPerson37462"]
            # The ace writer receives the release exactly as configured
            # (not incremented).
            release_version = conf_parser.get_wb_release()
            desc_writer.write_ace(out_path_prefix + ".ace", curators,
                                  release_version)
    def _load_and_process_data(self):
        """Generate gene descriptions for each data provider and load them.

        A single gene descriptions ``DataManager`` is created with the GO and
        DO ontologies and reused for all providers. For each provider of
        ``self.data_type_config`` the GO associations are loaded from the
        downloaded GAF file under ``tmp``, the disease (and, when the
        provider is in ``EXPRESSION_PRVD_SUBTYPE_MAP``, expression) data is
        fetched through the corresponding ``self.get_*`` methods, the
        generators are passed to ``CSVTransactor.save_file_static``, the
        queries are run with ``Neo4jTransactor.execute_query_batch`` and the
        report files are saved.
        """
        context_info = ContextInfo()
        data_manager = DataFileManager(context_info.config_file_location)
        go_annot_config = data_manager.get_config('GAF')
        # GAF sub type objects indexed by their data provider.
        go_annot_sub_dict = {}
        for sub in go_annot_config.get_sub_type_objects():
            go_annot_sub_dict[sub.get_data_provider()] = sub

        this_dir = os.path.dirname(__file__)
        gd_config_path = os.path.join(this_dir, os.pardir, os.pardir,
                                      "gene_descriptions.yml")
        gd_config = GenedescConfigParser(gd_config_path)

        # Data manager shared by all providers; ontologies common to every
        # provider are set once here.
        gd_data_manager = DataManager(do_relations=None,
                                      go_relations=["subClassOf", "BFO:0000050"])
        for common_type in (DataType.GO, DataType.DO):
            gd_data_manager.set_ontology(
                ontology_type=common_type,
                ontology=self.get_ontology(data_type=common_type),
                config=gd_config)

        # Provider names are collected (upper-cased) before processing starts.
        providers = [sub_type.get_data_provider().upper()
                     for sub_type in self.data_type_config.get_sub_type_objects()]
        for prvdr in providers:
            # Every provider works on its own copy of the configuration.
            gd_config_mod_specific = copy.deepcopy(gd_config)
            if prvdr == "WB":
                expr_options = gd_config_mod_specific.config["expression_sentences_options"]
                expr_options["remove_children_if_parent_is_present"] = True
            self.logger.info("Generating gene descriptions for %s", prvdr)
            # HUMAN data is requested from the database under the RGD provider.
            data_provider = "RGD" if prvdr == "HUMAN" else prvdr
            json_desc_writer = DescriptionsWriter()

            tmp_dir = os.path.join(os.getcwd(), "tmp")
            go_annot_path = "file://" + os.path.join(
                tmp_dir, go_annot_sub_dict[prvdr].file_to_download)
            go_annot_cache_path = os.path.join(tmp_dir, "gd_cache",
                                               "go_annot_" + prvdr + ".gaf")
            gd_data_manager.load_associations_from_file(
                associations_type=DataType.GO,
                associations_url=go_annot_path,
                associations_cache_path=go_annot_cache_path,
                config=gd_config_mod_specific)

            disease_annotations = self.get_disease_annotations_from_db(
                data_provider=data_provider,
                gd_data_manager=gd_data_manager,
                logger=self.logger)
            gd_data_manager.set_associations(associations_type=DataType.DO,
                                             associations=disease_annotations,
                                             config=gd_config_mod_specific)

            if prvdr in EXPRESSION_PRVD_SUBTYPE_MAP:
                expression_ontology = self.get_ontology(data_type=DataType.EXPR,
                                                        provider=prvdr)
                gd_data_manager.set_ontology(ontology_type=DataType.EXPR,
                                             ontology=expression_ontology,
                                             config=gd_config_mod_specific)
                expression_annotations = self.get_expression_annotations_from_db(
                    data_provider=data_provider,
                    gd_data_manager=gd_data_manager,
                    logger=self.logger)
                gd_data_manager.set_associations(
                    associations_type=DataType.EXPR,
                    associations=expression_annotations,
                    config=gd_config_mod_specific)

            commit_size = self.data_type_config.get_neo4j_commit_size()
            generators = self.get_generators(prvdr, gd_data_manager,
                                             gd_config_mod_specific,
                                             json_desc_writer)
            csv_file_name = "genedescriptions_data_" + prvdr + ".csv"
            query_template_list = [
                [self.gene_descriptions_query_template, commit_size, csv_file_name],
            ]

            query_and_file_list = self.process_query_params(query_template_list)
            CSVTransactor.save_file_static(generators, query_and_file_list)
            Neo4jTransactor.execute_query_batch(query_and_file_list)
            self.save_descriptions_report_files(data_provider=prvdr,
                                                json_desc_writer=json_desc_writer,
                                                context_info=context_info,
                                                gd_data_manager=gd_data_manager)