def __init__(self, config: GenedescConfigParser, species: str, go_relations: List[str] = None,
             do_relations: List[str] = None, use_cache: bool = False):
    """Create a new data fetcher for WormBase.

    Files will be downloaded from the WB ftp site. For convenience, file locations are
    automatically generated and stored in instance attributes ending in ``_url`` for remote
    files and ``_cache_path`` for their cached copies. The code in this method only computes
    those locations and initializes empty containers.

    Args:
        config (GenedescConfigParser): configuration object; it provides the raw file source,
            the cache directory, the WormBase release and the organisms info
        species (str): WormBase species to fetch; used as a key of the organisms info
            returned by ``config.get_wb_organisms_info()``
        go_relations (List[str]): forwarded unchanged to the parent constructor
        do_relations (List[str]): forwarded unchanged to the parent constructor
        use_cache (bool): forwarded unchanged to the parent constructor

    Raises:
        KeyError: if ``species`` is not a key of the organisms info or has no ``project_id``
            entry (assuming the organisms info is a plain mapping)
    """
    self.config = config
    raw_files_source = config.get_wb_raw_file_sources()
    cache_location = config.get_cache_dir()
    release_version = config.get_wb_release()
    organisms_info = config.get_wb_organisms_info()
    species_info = organisms_info[species]
    project_id = species_info["project_id"]

    # A "main_sister_species" entry that is present but empty means "no sister species"; it must
    # not be used as a key of organisms_info (that lookup used to raise KeyError).
    self.sister_sp_fullname = ""
    sister_species = species_info.get("main_sister_species")
    if sister_species and "full_name" in organisms_info[sister_species]:
        self.sister_sp_fullname = organisms_info[sister_species]["full_name"]

    # The fallback stays an empty string (not an empty list) for backward compatibility with
    # code that reads this attribute. The list is only built when every configured ortholog
    # species has a full name.
    self.orth_fullnames = ""
    ortholog_species = species_info.get("ortholog")
    if ortholog_species is not None and all(
            "full_name" in organisms_info[ortholog_sp] for ortholog_sp in ortholog_species):
        self.orth_fullnames = [organisms_info[ortholog_sp]["full_name"] for ortholog_sp in ortholog_species]

    # Expression cluster prefixes are optional; None is passed on when one is not configured.
    expression_cluster_anatomy_prefix = species_info.get("ec_anatomy_prefix")
    expression_cluster_molreg_prefix = species_info.get("ec_molreg_prefix")
    expression_cluster_genereg_prefix = species_info.get("ec_genereg_prefix")

    super().__init__(go_relations=go_relations, do_relations=do_relations, use_cache=use_cache)

    # Species-specific files are all named "<species>.<project_id>.<release><suffix>".
    species_file_prefix = ".".join([species, project_id, release_version])

    def species_cache_path(suffix):
        # Local cache location of a file in the species "annotation" directory.
        return os.path.join(cache_location, "wormbase", release_version, "species", species, project_id,
                            "annotation", species_file_prefix + suffix)

    def species_url(suffix):
        # Remote location of a file in the species "annotation" directory.
        return "/".join([raw_files_source, release_version, "species", species, project_id, "annotation",
                         species_file_prefix + suffix])

    def ontology_cache_path(file_name):
        # Local cache location of a file in the release-wide ONTOLOGY directory.
        return os.path.join(cache_location, "wormbase", release_version, "ONTOLOGY", file_name)

    def ontology_url(file_name):
        # Remote location of a file in the release-wide ONTOLOGY directory.
        return "/".join([raw_files_source, release_version, "ONTOLOGY", file_name])

    # gene data
    self.gene_data_cache_path = species_cache_path(".geneIDs.txt.gz")
    self.gene_data_url = species_url(".geneIDs.txt.gz")

    # gene ontology and its annotations
    go_ontology_file = "gene_ontology." + release_version + ".obo"
    self.go_ontology_cache_path = ontology_cache_path(go_ontology_file)
    self.go_ontology_url = ontology_url(go_ontology_file)
    self.go_associations_cache_path = species_cache_path(".go_annotations.gaf.gz")
    self.go_associations_url = species_url(".go_annotations.gaf.gz")

    # disease ontology and its annotations; the association files are read from the ONTOLOGY
    # directory of the remote source but cached under the species annotation directory
    do_ontology_file = "disease_ontology." + release_version + ".obo"
    self.do_ontology_url = ontology_url(do_ontology_file)
    self.do_ontology_cache_path = ontology_cache_path(do_ontology_file)
    self.do_associations_cache_path = species_cache_path(".do_annotations.wb")
    self.do_associations_url = ontology_url("disease_association." + release_version + ".wb")
    self.do_associations_new_cache_path = species_cache_path(".do_annotations.daf.txt")
    self.do_associations_new_url = ontology_url("disease_association." + release_version + ".daf.txt")

    # orthology
    self.orthology_url = species_url(".orthologs.txt.gz")
    self.orthology_cache_path = species_cache_path(".orthologs.txt.gz")
    self.orthologs = defaultdict(lambda: defaultdict(list))

    # protein domains
    self.protein_domain_url = species_url(".protein_domains.csv.gz")
    self.protein_domain_cache_path = species_cache_path(".protein_domains.csv.gz")
    self.protein_domains = defaultdict(list)

    # anatomy (expression) ontology and its annotations
    expression_ontology_file = "anatomy_ontology." + release_version + ".obo"
    self.expression_ontology_cache_path = ontology_cache_path(expression_ontology_file)
    self.expression_ontology_url = ontology_url(expression_ontology_file)
    expression_associations_file = "anatomy_association." + release_version + ".wb"
    self.expression_associations_cache_path = ontology_cache_path(expression_associations_file)
    self.expression_associations_url = ontology_url(expression_associations_file)

    # expression clusters: the data container is only created when a URL could be determined
    self.expression_cluster_anatomy_url = self._get_expression_cluster_url(
        prefix=expression_cluster_anatomy_prefix, ec_type="anatomy", release_version=release_version)
    self.expression_cluster_anatomy_cache_path = self._get_expression_cluster_cache_path(
        prefix=expression_cluster_anatomy_prefix, ec_type="anatomy", release_version=release_version,
        cache_location=cache_location)
    self.expression_cluster_anatomy_data = defaultdict(list) if self.expression_cluster_anatomy_url else None

    self.expression_cluster_molreg_url = self._get_expression_cluster_url(
        prefix=expression_cluster_molreg_prefix, ec_type="molReg", release_version=release_version)
    self.expression_cluster_molreg_cache_path = self._get_expression_cluster_cache_path(
        prefix=expression_cluster_molreg_prefix, ec_type="molReg", release_version=release_version,
        cache_location=cache_location)
    self.expression_cluster_molreg_data = defaultdict(list) if self.expression_cluster_molreg_url else None

    self.expression_cluster_genereg_url = self._get_expression_cluster_url(
        prefix=expression_cluster_genereg_prefix, ec_type="geneReg", release_version=release_version)
    self.expression_cluster_genereg_cache_path = self._get_expression_cluster_cache_path(
        prefix=expression_cluster_genereg_prefix, ec_type="geneReg", release_version=release_version,
        cache_location=cache_location)
    self.expression_cluster_genereg_data = defaultdict(list) if self.expression_cluster_genereg_url else None
def _next_release_version(release_version):
    """Return the release name that follows ``release_version``, e.g. "WS279" -> "WS280".

    The whole trailing number is incremented (not only its last digit) and its zero padding,
    if any, is preserved.

    Raises:
        ValueError: if ``release_version`` does not end with a decimal number
    """
    import re

    trailing_number = re.search(r"[0-9]+\Z", release_version)
    if trailing_number is None:
        raise ValueError("release version '" + release_version + "' does not end with a number")
    digits = trailing_number.group()
    return release_version[:trailing_number.start()] + str(int(digits) + 1).zfill(len(digits))


def main():
    """Command line entry point: generate gene descriptions for the configured WormBase organisms.

    Parses the command line, configures logging and then, for every organism returned by the
    configuration, loads its data, builds a description for each gene and writes the results
    to the output directory in each requested format (json, txt, tsv, ace).
    """
    parser = argparse.ArgumentParser(description="Generate gene descriptions for wormbase")
    parser.add_argument("-c", "--config-file", metavar="config_file", dest="config_file", type=str,
                        default="config.yml", help="configuration file. Default ./config.yml")
    parser.add_argument("-C", "--use-cache", dest="use_cache", action="store_true", default=False,
                        help="Use cached source files from cache_location specified in config file. "
                             "Download them from raw_file_source (configured in config file) if not yet "
                             "cached")
    parser.add_argument("-l", "--log-file", metavar="log_file", dest="log_file", type=str, default=None,
                        help="path to the log file to generate. Default: log to standard error")
    parser.add_argument("-L", "--log-level", dest="log_level",
                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
                        help="set the logging level")
    parser.add_argument("-t", "--textpressoapi-token", metavar="textpresso_token", dest="textpresso_token",
                        type=str, help="Textpresso api token")
    parser.add_argument("-o", "--output-formats", metavar="output_formats", dest="output_formats", type=str,
                        nargs="+", default=["ace", "txt", "json", "tsv"],
                        help="file formats to generate. Accepted values are: ace, txt, json, tsv")
    args = parser.parse_args()

    conf_parser = GenedescConfigParser(args.config_file)
    # filename=None makes basicConfig log to a stream instead of a file; level=None leaves the
    # root logger level untouched.
    logging.basicConfig(filename=args.log_file, level=args.log_level,
                        format='%(asctime)s - %(name)s - %(levelname)s:%(message)s', force=True)
    logger = logging.getLogger("WB Gene Description Pipeline")

    organisms_list = conf_parser.get_wb_organisms_to_process()
    human_genes_props = DataManager.get_human_gene_props()
    api_manager = APIManager(textpresso_api_token=args.textpresso_token)

    for organism in organisms_list:
        logger.info("Processing organism %s", organism)
        organisms_info = conf_parser.get_wb_organisms_info()
        dm, sister_df, df_agr = load_data(organism=organism, conf_parser=conf_parser)

        desc_writer = DescriptionsWriter()
        desc_writer.overall_properties.species = organism
        # The writer is labelled with the release that follows the configured one.
        desc_writer.overall_properties.release_version = _next_release_version(conf_parser.get_wb_release())
        desc_writer.overall_properties.date = datetime.date.today().strftime("%B %d, %Y")

        for gene in dm.get_gene_data():
            logger.debug("Generating description for gene %s", gene.name)
            gene_desc = GeneDescription(gene_id=gene.id, config=conf_parser, gene_name=gene.name,
                                        add_gene_name=False)
            selected_orthologs, orth_sent = get_best_orthologs_and_sentence(
                dm=dm, orth_fullnames=dm.orth_fullnames, human_genes_props=human_genes_props,
                gene_desc=gene_desc, api_manager=api_manager, config=conf_parser)
            set_gene_ontology_module(dm=dm, conf_parser=conf_parser, gene_desc=gene_desc, gene=gene)
            set_tissue_expression_sentence(dm=dm, gene=gene, conf_parser=conf_parser, gene_desc=gene_desc)
            # The expression cluster sentence is only requested while the description is still empty.
            if not gene_desc.description:
                set_expression_cluster_sentence(dm=dm, conf_parser=conf_parser, gene_desc=gene_desc,
                                                gene=gene, api_manager=api_manager)
            set_disease_module(df=dm, conf_parser=conf_parser, gene=gene, gene_desc=gene_desc)
            # The information-poor sentence is only requested when no GO description was produced.
            if not gene_desc.go_description:
                set_information_poor_sentence(
                    orth_fullnames=dm.orth_fullnames, selected_orthologs=selected_orthologs,
                    conf_parser=conf_parser, human_df_agr=df_agr, gene_desc=gene_desc, dm=dm, gene=gene)
            gene_desc.set_or_extend_module_description_and_final_stats(module=Module.ORTHOLOGY,
                                                                       description=orth_sent)
            # The sister species sentence needs a configured (non-empty) sister species and a
            # non-empty first element in the best-orthologs result for this gene.
            if organisms_info[organism].get("main_sister_species") and dm.get_best_orthologs_for_gene(
                    gene.id, orth_species_full_name=[dm.sister_sp_fullname],
                    sister_species_data_fetcher=sister_df,
                    ecode_priority_list=["EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "HTP", "HDA", "HMP",
                                         "HGI", "HEP"])[0]:
                set_sister_species_sentence(
                    dm=dm, sister_sp_fullname=dm.sister_sp_fullname, sister_df=sister_df,
                    species=organisms_info, organism=organism, gene_desc=gene_desc, conf_parser=conf_parser,
                    gene=gene)
            desc_writer.add_gene_desc(gene_desc)
        logger.info("All genes processed for %s", organism)

        # Output files are named "<YYYYMMDD>_<organism>.<format>" inside the configured out dir.
        date_prefix = datetime.date.today().strftime("%Y%m%d")
        output_base_path = os.path.join(conf_parser.get_out_dir(), date_prefix + "_" + organism)
        if "json" in args.output_formats:
            logger.info("Writing descriptions to json")
            desc_writer.write_json(output_base_path + ".json", include_single_gene_stats=True,
                                   data_manager=dm)
        if "txt" in args.output_formats:
            logger.info("Writing descriptions to txt")
            desc_writer.write_plain_text(output_base_path + ".txt")
        if "tsv" in args.output_formats:
            logger.info("Writing descriptions to tsv")
            desc_writer.write_tsv(output_base_path + ".tsv")
        if "ace" in args.output_formats:
            logger.info("Writing descriptions to ace")
            curators = ["WBPerson324", "WBPerson37462"]
            release_version = conf_parser.get_wb_release()
            desc_writer.write_ace(output_base_path + ".ace", curators, release_version)