def get_generators(self, data_provider, gd_data_manager, gd_config, json_desc_writer):
    """Generate descriptions for every gene of a data provider and yield them as one batch.

    For each gene record returned by the database query, a ``GeneDescription``
    is populated by the gene ontology, expression, disease and (when best
    orthologs are known for the gene) human orthology modules. Every gene
    description is handed to ``json_desc_writer``; only genes that ended up
    with a non-empty description are included in the yielded batch.

    :param data_provider: provider whose genes are processed. ``"HUMAN"`` is
        special-cased: genes are queried with the ``"RGD"`` parameter and
        their gene ids are prefixed with ``"RGD:"``.
    :param gd_data_manager: data manager passed to the description modules.
    :param gd_config: configuration passed to ``GeneDescription`` and to the
        description modules.
    :param json_desc_writer: writer that receives every generated description
        through ``add_gene_desc``.
    :return: a generator yielding a single one-element list, whose element is
        the list of ``{"genePrimaryKey", "geneDescription"}`` dicts.
    """
    is_human = data_provider == "HUMAN"
    if is_human:
        gene_records = Neo4jHelper.run_single_parameter_query(
            self.get_all_genes_human_query, "RGD")
        id_prefix = "RGD:"
    else:
        gene_records = Neo4jHelper.run_single_parameter_query(
            self.get_all_genes_query, data_provider)
        id_prefix = ""

    descriptions = []
    best_orthologs = self.get_best_orthologs_from_db(data_provider=data_provider)

    for row in gene_records:
        primary_key = row["g.primaryKey"]
        # The Gene id carries the provider prefix, the description id does not.
        gene = Gene(id=id_prefix + primary_key, name=row["g.symbol"],
                    dead=False, pseudo=False)
        gene_desc = GeneDescription(gene_id=primary_key,
                                    gene_name=gene.name,
                                    add_gene_name=False,
                                    config=gd_config)

        module_args = {"conf_parser": gd_config, "gene_desc": gene_desc, "gene": gene}
        set_gene_ontology_module(dm=gd_data_manager, **module_args)
        set_expression_module(df=gd_data_manager, **module_args)
        set_disease_module(df=gd_data_manager, human=is_human, **module_args)

        if gene.id in best_orthologs:
            # Entry layout as used here: [0] selected orthologs, [1] excluded orthologs.
            ortholog_entry = best_orthologs[gene.id]
            gene_desc.stats.set_best_orthologs = ortholog_entry[0]
            set_alliance_human_orthology_module(orthologs=ortholog_entry[0],
                                                excluded_orthologs=ortholog_entry[1],
                                                gene_desc=gene_desc,
                                                config=gd_config)

        if gene_desc.description:
            descriptions.append({
                "genePrimaryKey": gene_desc.gene_id,
                "geneDescription": gene_desc.description,
            })
        # The writer receives every gene, including those with an empty description.
        json_desc_writer.add_gene_desc(gene_desc)

    yield [descriptions]
def test_trimming_lca(self):
    """Check that "lca" trimming covers at least as much as "ic" trimming.

    For three genes (abl-1, aat-1, acr-5) a set of anatomy expression
    annotations is loaded, a description is generated once with the "lca"
    trimming algorithm and once with "ic", and the test asserts that the
    coverage percentage of the "lca" description is greater than or equal
    to the one obtained with "ic".
    """

    def use_trimming_algorithm(algorithm):
        # Switch both the GO and the expression modules to the same algorithm.
        self.conf_parser.config["go_sentences_options"]["trimming_algorithm"] = algorithm
        self.conf_parser.config["expression_sentences_options"]["trimming_algorithm"] = algorithm

    def load_expression_annotations(gene_id, term_ids):
        # Replace the data manager's expression associations with the given terms.
        assocs = self.get_associations(gene_id, term_ids, ["Verified"], "A", "IDA")
        self.df.expression_associations = AssociationSetFactory().create_from_assocs(
            assocs=assocs, ontology=self.df.expression_ontology)

    def describe(target_gene, symbol):
        # Build a description with the currently configured algorithm and compute its stats.
        desc = GeneDescription(gene_id=target_gene.id, config=self.conf_parser,
                               gene_name=symbol, add_gene_name=False)
        set_gene_ontology_module(dm=self.df, conf_parser=self.conf_parser,
                                 gene_desc=desc, gene=target_gene)
        set_expression_module(self.df, self.conf_parser, desc, target_gene)
        desc.stats.calculate_stats(data_manager=self.df)
        return desc

    # --- case 1: abl-1 -------------------------------------------------
    # "ic" is configured before the ontology is loaded, since the loader
    # receives the config object.
    use_trimming_algorithm("ic")
    gene = Gene(id="WB:WBGene00000018", name="abl-1", dead=False, pseudo=False)
    anatomy_url = "file://" + os.path.join(self.this_dir, "data", "anatomy_ontology.WS274.obo")
    anatomy_cache = os.path.join(self.this_dir, "cache", "anatomy_ontology.WS274.obo")
    self.df.load_ontology_from_file(ontology_type=DataType.EXPR,
                                    ontology_url=anatomy_url,
                                    ontology_cache_path=anatomy_cache,
                                    config=self.conf_parser)
    logger.info("Loading expression associations from file")
    self.conf_parser.config["expression_sentences_options"]["max_num_terms"] = 5
    self.conf_parser.config["expression_sentences_options"]["trim_min_distance_from_root"]["A"] = 4
    self.conf_parser.config["expression_sentences_options"]["remove_children_if_parent_is_present"] = False
    load_expression_annotations(
        gene.id,
        ["WBbt:0006796", "WBbt:0006759", "WBbt:0005300", "WBbt:0008598",
         "WBbt:0003681", "WBbt:0005829", "WBbt:0003927", "WBbt:0006751"])

    use_trimming_algorithm("lca")
    gene_desc_lca = describe(gene, "abl-1")

    use_trimming_algorithm("ic")
    # The IC structures are (re)computed only here, before the first "ic" description.
    set_ic_ontology_struct(ontology=self.df.go_ontology, relations=self.df.go_relations)
    set_ic_ontology_struct(ontology=self.df.expression_ontology, relations=self.df.expr_relations)
    gene_desc_ic = describe(gene, "abl-1")
    self.assertTrue(gene_desc_lca.stats.coverage_percentage >= gene_desc_ic.stats.coverage_percentage, "1")

    # --- case 2: aat-1 -------------------------------------------------
    use_trimming_algorithm("lca")
    gene = Gene(id="WB:WBGene00000022", name="aat-1", dead=False, pseudo=False)
    load_expression_annotations(
        gene.id,
        ["WBbt:0005828", "WBbt:0006751", "WBbt:0005439", "WBbt:0005788",
         "WBbt:0006749", "WBbt:0005300", "WBbt:0005735", "WBbt:0005747",
         "WBbt:0005772", "WBbt:0005776", "WBbt:0005812", "WBbt:0005741",
         "WBbt:0005799", "WBbt:0003681"])
    gene_desc_lca = describe(gene, "aat-1")

    use_trimming_algorithm("ic")
    gene_desc_ic = describe(gene, "aat-1")
    self.assertTrue(gene_desc_lca.stats.coverage_percentage >= gene_desc_ic.stats.coverage_percentage, "2")

    # --- case 3: acr-5 -------------------------------------------------
    use_trimming_algorithm("lca")
    gene = Gene(id="WB:WBGene00000044", name="acr-5", dead=False, pseudo=False)
    load_expression_annotations(
        gene.id,
        ["WBbt:0003679", "WBbt:0006759", "WBbt:0005336", "WBbt:0006751",
         "WBbt:0005300", "WBbt:0005274", "WBbt:0005741", "WBbt:0006749",
         "WBbt:0005735"])
    gene_desc_lca = describe(gene, "acr-5")

    use_trimming_algorithm("ic")
    gene_desc_ic = describe(gene, "acr-5")
    self.assertTrue(gene_desc_lca.stats.coverage_percentage >= gene_desc_ic.stats.coverage_percentage, "3")
def _next_release_version(release):
    """Return ``release`` with its trailing number incremented by one.

    The whole trailing run of digits is incremented (keeping any zero
    padding), so ``"WS278"`` becomes ``"WS279"`` and ``"WS279"`` becomes
    ``"WS280"``. Incrementing only the last character would turn
    ``"WS279"`` into ``"WS2710"``.

    :param release: release label ending in one or more ASCII digits.
    :return: the label of the following release.
    :raises ValueError: if ``release`` does not end with a digit.
    """
    import re  # local import so this block does not depend on the file's import section

    trailing_number = re.search(r"[0-9]+\Z", release)
    if trailing_number is None:
        raise ValueError("release does not end with a number: " + repr(release))
    digits = trailing_number.group()
    bumped = str(int(digits) + 1).zfill(len(digits))
    return release[:trailing_number.start()] + bumped


def main():
    """Command-line entry point: generate gene descriptions for WormBase organisms.

    Parses the command-line options, configures logging, and then, for each
    organism returned by the configuration, loads its data, builds a
    ``GeneDescription`` for every gene (orthology, gene ontology, tissue
    expression, expression cluster, disease, information-poor and sister
    species sentences) and writes the results to the output directory in
    each requested format (``json``, ``txt``, ``tsv``, ``ace``).

    Output files are named ``<YYYYMMDD>_<organism>.<format>``.
    """
    parser = argparse.ArgumentParser(description="Generate gene descriptions for wormbase")
    # The help text states the real default value ("config.yml").
    parser.add_argument("-c", "--config-file", metavar="config_file", dest="config_file", type=str,
                        default="config.yml", help="configuration file. Default ./config.yml")
    parser.add_argument("-C", "--use-cache", dest="use_cache", action="store_true", default=False,
                        help="Use cached source files from cache_location specified in config file. "
                             "Download them from raw_file_source (configured in config file) if not yet "
                             "cached")
    # With no log file, logging.basicConfig(filename=None) logs to the console.
    parser.add_argument("-l", "--log-file", metavar="log_file", dest="log_file", type=str, default=None,
                        help="path to the log file to generate. Default: log to the console (stderr)")
    parser.add_argument("-L", "--log-level", dest="log_level",
                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
                        help="set the logging level")
    parser.add_argument("-t", "--textpressoapi-token", metavar="textpresso_token", dest="textpresso_token",
                        type=str, help="Texpresso api token")
    parser.add_argument("-o", "--output-formats", metavar="output_formats", dest="output_formats", type=str,
                        nargs="+", default=["ace", "txt", "json", "tsv"],
                        help="file formats to generate. Accepted values "
                             "are: ace, txt, json, tsv")
    args = parser.parse_args()

    conf_parser = GenedescConfigParser(args.config_file)
    logging.basicConfig(filename=args.log_file, level=args.log_level,
                        format='%(asctime)s - %(name)s - %(levelname)s:' '%(message)s', force=True)
    logger = logging.getLogger("WB Gene Description Pipeline")

    organisms_list = conf_parser.get_wb_organisms_to_process()
    human_genes_props = DataManager.get_human_gene_props()
    api_manager = APIManager(textpresso_api_token=args.textpresso_token)

    for organism in organisms_list:
        logger.info("Processing organism " + organism)
        species = conf_parser.get_wb_organisms_info()
        dm, sister_df, df_agr = load_data(organism=organism, conf_parser=conf_parser)

        desc_writer = DescriptionsWriter()
        desc_writer.overall_properties.species = organism
        # The descriptions are stamped with the release following the configured one.
        desc_writer.overall_properties.release_version = _next_release_version(conf_parser.get_wb_release())
        desc_writer.overall_properties.date = datetime.date.today().strftime("%B %d, %Y")

        for gene in dm.get_gene_data():
            logger.debug("Generating description for gene " + gene.name)
            gene_desc = GeneDescription(gene_id=gene.id, config=conf_parser, gene_name=gene.name,
                                        add_gene_name=False)
            # The orthology sentence is computed first but only attached further below.
            selected_orthologs, orth_sent = get_best_orthologs_and_sentence(
                dm=dm, orth_fullnames=dm.orth_fullnames, human_genes_props=human_genes_props,
                gene_desc=gene_desc, api_manager=api_manager, config=conf_parser)
            set_gene_ontology_module(dm=dm, conf_parser=conf_parser, gene_desc=gene_desc, gene=gene)
            set_tissue_expression_sentence(dm=dm, gene=gene, conf_parser=conf_parser, gene_desc=gene_desc)
            if not gene_desc.description:
                # Expression clusters are used only when nothing has been described so far.
                set_expression_cluster_sentence(dm=dm, conf_parser=conf_parser, gene_desc=gene_desc,
                                                gene=gene, api_manager=api_manager)
            set_disease_module(df=dm, conf_parser=conf_parser, gene=gene, gene_desc=gene_desc)
            if not gene_desc.go_description:
                set_information_poor_sentence(
                    orth_fullnames=dm.orth_fullnames, selected_orthologs=selected_orthologs,
                    conf_parser=conf_parser, human_df_agr=df_agr, gene_desc=gene_desc, dm=dm, gene=gene)
            gene_desc.set_or_extend_module_description_and_final_stats(module=Module.ORTHOLOGY,
                                                                       description=orth_sent)
            # Add the sister species sentence only when a main sister species is
            # configured and the gene has best orthologs in it.
            if "main_sister_species" in species[organism] and species[organism]["main_sister_species"] and \
                    dm.get_best_orthologs_for_gene(
                        gene.id, orth_species_full_name=[dm.sister_sp_fullname],
                        sister_species_data_fetcher=sister_df,
                        ecode_priority_list=["EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "HTP", "HDA", "HMP",
                                             "HGI", "HEP"])[0]:
                set_sister_species_sentence(
                    dm=dm, sister_sp_fullname=dm.sister_sp_fullname, sister_df=sister_df, species=species,
                    organism=organism, gene_desc=gene_desc, conf_parser=conf_parser, gene=gene)
            desc_writer.add_gene_desc(gene_desc)
        logger.info("All genes processed for " + organism)

        date_prefix = datetime.date.today().strftime("%Y%m%d")
        file_stem = date_prefix + "_" + organism
        if "json" in args.output_formats:
            logger.info("Writing descriptions to json")
            desc_writer.write_json(os.path.join(conf_parser.get_out_dir(), file_stem + ".json"),
                                   include_single_gene_stats=True, data_manager=dm)
        if "txt" in args.output_formats:
            logger.info("Writing descriptions to txt")
            desc_writer.write_plain_text(os.path.join(conf_parser.get_out_dir(), file_stem + ".txt"))
        if "tsv" in args.output_formats:
            logger.info("Writing descriptions to tsv")
            desc_writer.write_tsv(os.path.join(conf_parser.get_out_dir(), file_stem + ".tsv"))
        if "ace" in args.output_formats:
            logger.info("Writing descriptions to ace")
            curators = ["WBPerson324", "WBPerson37462"]
            # The ace file is written with the configured release, not the bumped one.
            release_version = conf_parser.get_wb_release()
            desc_writer.write_ace(os.path.join(conf_parser.get_out_dir(), file_stem + ".ace"), curators,
                                  release_version)