Exemple #1
0
    def __init__(self,
                 config: GenedescConfigParser,
                 species: str,
                 go_relations: List[str] = None,
                 do_relations: List[str] = None,
                 use_cache: bool = False):
        """Set up a WormBase data fetcher for a single species.

        The raw file source, cache directory, release and organisms info are read from ``config``. From these, a
        remote location (attributes ending in ``_url``) and a local cache location (attributes ending in
        ``_cache_path``) are derived for each WormBase file handled by this class.

        Args:
            config (GenedescConfigParser): configuration providing the WB raw file source, the cache directory, the
                WB release and the WB organisms info
            species (str): WormBase species to fetch; used as key into the organisms info
            go_relations (List[str]): forwarded unchanged to the parent class initializer
            do_relations (List[str]): forwarded unchanged to the parent class initializer
            use_cache (bool): forwarded unchanged to the parent class initializer

        Raises:
            KeyError: if ``species``, its main sister species or one of its orthologs is looked up in the organisms
                info and is missing, or if the entry for ``species`` has no "project_id"
        """
        self.config = config
        raw_files_source = config.get_wb_raw_file_sources()
        cache_location = config.get_cache_dir()
        release_version = config.get_wb_release()
        organisms_info = config.get_wb_organisms_info()
        species_info = organisms_info[species]
        project_id = species_info["project_id"]

        # full name of the main sister species; stays an empty string when it is not configured
        self.sister_sp_fullname = ""
        if "main_sister_species" in species_info:
            sister_info = organisms_info[species_info["main_sister_species"]]
            if "full_name" in sister_info:
                self.sister_sp_fullname = sister_info["full_name"]

        # full names of the ortholog species; set only when every listed ortholog has a full name
        self.orth_fullnames = ""
        if "ortholog" in species_info:
            ortholog_infos = [organisms_info[ortholog_sp] for ortholog_sp in species_info["ortholog"]]
            if all("full_name" in ortholog_info for ortholog_info in ortholog_infos):
                self.orth_fullnames = [ortholog_info["full_name"] for ortholog_info in ortholog_infos]

        # optional expression cluster prefixes: None when the species does not define them
        anatomy_prefix = species_info.get("ec_anatomy_prefix")
        molreg_prefix = species_info.get("ec_molreg_prefix")
        genereg_prefix = species_info.get("ec_genereg_prefix")

        super().__init__(go_relations=go_relations, do_relations=do_relations, use_cache=use_cache)

        # building blocks shared by the remote and local locations defined below
        species_file_stem = species + '.' + project_id + '.' + release_version
        release_url = raw_files_source + '/' + release_version
        ontology_url = release_url + '/ONTOLOGY/'
        annotation_url = release_url + '/species/' + species + '/' + project_id + '/annotation/' + species_file_stem
        release_cache_dir = os.path.join(cache_location, "wormbase", release_version)
        ontology_cache_dir = os.path.join(release_cache_dir, "ONTOLOGY")
        annotation_cache_dir = os.path.join(release_cache_dir, "species", species, project_id, "annotation")

        # gene data
        self.gene_data_cache_path = os.path.join(annotation_cache_dir, species_file_stem + ".geneIDs.txt.gz")
        self.gene_data_url = annotation_url + '.geneIDs.txt.gz'

        # gene ontology and its annotations
        self.go_ontology_cache_path = os.path.join(ontology_cache_dir, "gene_ontology." + release_version + ".obo")
        self.go_ontology_url = ontology_url + 'gene_ontology.' + release_version + '.obo'
        self.go_associations_cache_path = os.path.join(annotation_cache_dir,
                                                       species_file_stem + ".go_annotations.gaf.gz")
        self.go_associations_url = annotation_url + '.go_annotations.gaf.gz'

        # disease ontology and its annotations (two association files: .wb and .daf.txt)
        self.do_ontology_url = ontology_url + 'disease_ontology.' + release_version + '.obo'
        self.do_ontology_cache_path = os.path.join(ontology_cache_dir,
                                                   "disease_ontology." + release_version + ".obo")
        self.do_associations_cache_path = os.path.join(annotation_cache_dir,
                                                       species_file_stem + ".do_annotations.wb")
        self.do_associations_url = ontology_url + 'disease_association.' + release_version + '.wb'
        self.do_associations_new_cache_path = os.path.join(annotation_cache_dir,
                                                           species_file_stem + ".do_annotations.daf.txt")
        self.do_associations_new_url = ontology_url + 'disease_association.' + release_version + '.daf.txt'

        # orthology
        self.orthology_url = annotation_url + '.orthologs.txt.gz'
        self.orthology_cache_path = os.path.join(annotation_cache_dir, species_file_stem + ".orthologs.txt.gz")
        self.orthologs = defaultdict(lambda: defaultdict(list))

        # protein domains
        self.protein_domain_url = annotation_url + '.protein_domains.csv.gz'
        self.protein_domain_cache_path = os.path.join(annotation_cache_dir,
                                                      species_file_stem + ".protein_domains.csv.gz")
        self.protein_domains = defaultdict(list)

        # anatomy (expression) ontology and its annotations
        self.expression_ontology_cache_path = os.path.join(ontology_cache_dir,
                                                           "anatomy_ontology." + release_version + ".obo")
        self.expression_ontology_url = ontology_url + 'anatomy_ontology.' + release_version + '.obo'
        self.expression_associations_cache_path = os.path.join(ontology_cache_dir,
                                                               "anatomy_association." + release_version + ".wb")
        self.expression_associations_url = ontology_url + 'anatomy_association.' + release_version + '.wb'

        # expression clusters: the data container is created only when a url could be determined
        self.expression_cluster_anatomy_url = self._get_expression_cluster_url(
            prefix=anatomy_prefix, ec_type="anatomy", release_version=release_version)
        self.expression_cluster_anatomy_cache_path = self._get_expression_cluster_cache_path(
            prefix=anatomy_prefix, ec_type="anatomy", release_version=release_version,
            cache_location=cache_location)
        if self.expression_cluster_anatomy_url:
            self.expression_cluster_anatomy_data = defaultdict(list)
        else:
            self.expression_cluster_anatomy_data = None

        self.expression_cluster_molreg_url = self._get_expression_cluster_url(
            prefix=molreg_prefix, ec_type="molReg", release_version=release_version)
        self.expression_cluster_molreg_cache_path = self._get_expression_cluster_cache_path(
            prefix=molreg_prefix, ec_type="molReg", release_version=release_version,
            cache_location=cache_location)
        if self.expression_cluster_molreg_url:
            self.expression_cluster_molreg_data = defaultdict(list)
        else:
            self.expression_cluster_molreg_data = None

        self.expression_cluster_genereg_url = self._get_expression_cluster_url(
            prefix=genereg_prefix, ec_type="geneReg", release_version=release_version)
        self.expression_cluster_genereg_cache_path = self._get_expression_cluster_cache_path(
            prefix=genereg_prefix, ec_type="geneReg", release_version=release_version,
            cache_location=cache_location)
        if self.expression_cluster_genereg_url:
            self.expression_cluster_genereg_data = defaultdict(list)
        else:
            self.expression_cluster_genereg_data = None
def _next_wb_release(release: str) -> str:
    """Return the WormBase release name that follows ``release``.

    The whole trailing run of digits is incremented, so that a carry is handled correctly ("WS279" becomes
    "WS280"). The width of the numeric part is preserved when it has leading zeros.

    Args:
        release (str): release name ending with a number, e.g. "WS273"

    Returns:
        str: the same name with its trailing number incremented by one

    Raises:
        ValueError: if ``release`` does not end with a digit
    """
    prefix = release.rstrip("0123456789")
    digits = release[len(prefix):]
    if not digits:
        raise ValueError("WormBase release does not end with a number: " + repr(release))
    return prefix + str(int(digits) + 1).zfill(len(digits))


def _build_wb_arg_parser() -> argparse.ArgumentParser:
    """Create the command line argument parser of the WormBase gene description pipeline."""
    parser = argparse.ArgumentParser(description="Generate gene descriptions for wormbase")
    parser.add_argument("-c", "--config-file", metavar="config_file", dest="config_file", type=str,
                        default="config.yml",
                        help="configuration file. Default ./config.yml")
    parser.add_argument("-C", "--use-cache", dest="use_cache", action="store_true", default=False,
                        help="Use cached source files from cache_location specified in config file. Download them "
                             "from raw_file_source (configured in config file) if not yet cached")
    # with no log file, logging.basicConfig receives filename=None and logs to stderr
    parser.add_argument("-l", "--log-file", metavar="log_file", dest="log_file", type=str, default=None,
                        help="path to the log file to generate. If omitted, messages are logged to stderr")
    parser.add_argument("-L", "--log-level", dest="log_level",
                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
                        help="set the logging level")
    parser.add_argument("-t", "--textpressoapi-token", metavar="textpresso_token", dest="textpresso_token",
                        type=str, help="Texpresso api token")
    parser.add_argument("-o", "--output-formats", metavar="output_formats", dest="output_formats", type=str,
                        nargs="+", default=["ace", "txt", "json", "tsv"],
                        help="file formats to generate. Accepted values are: ace, txt, json, tsv")
    return parser


def main():
    """Generate WormBase gene descriptions for the configured organisms and write them to the output directory.

    Command line arguments select the configuration file, the logging destination and level, the Textpresso API
    token and the output formats. For every organism returned by the configuration, a description is built for
    each gene and the results are written in the requested formats (json, txt, tsv, ace) to files named
    ``<YYYYMMDD>_<organism>.<format>`` in the configured output directory.
    """
    args = _build_wb_arg_parser().parse_args()
    conf_parser = GenedescConfigParser(args.config_file)
    logging.basicConfig(filename=args.log_file,
                        level=args.log_level,
                        format='%(asctime)s - %(name)s - %(levelname)s:%(message)s',
                        force=True)
    logger = logging.getLogger("WB Gene Description Pipeline")
    organisms_list = conf_parser.get_wb_organisms_to_process()
    human_genes_props = DataManager.get_human_gene_props()
    api_manager = APIManager(textpresso_api_token=args.textpresso_token)
    # evidence codes used to decide whether a gene has orthologs in the sister species
    sister_ecode_priority = ("EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "HTP", "HDA", "HMP", "HGI", "HEP")
    for organism in organisms_list:
        logger.info("Processing organism %s", organism)
        organisms_info = conf_parser.get_wb_organisms_info()
        dm, sister_df, df_agr = load_data(organism=organism, conf_parser=conf_parser)
        release_version = conf_parser.get_wb_release()
        desc_writer = DescriptionsWriter()
        desc_writer.overall_properties.species = organism
        # descriptions are labelled with the release that follows the configured one
        desc_writer.overall_properties.release_version = _next_wb_release(release_version)
        desc_writer.overall_properties.date = datetime.date.today().strftime("%B %d, %Y")
        # the same for every gene of this organism, hence evaluated once
        has_sister_species = bool(organisms_info[organism].get("main_sister_species"))
        for gene in dm.get_gene_data():
            logger.debug("Generating description for gene %s", gene.name)
            gene_desc = GeneDescription(gene_id=gene.id,
                                        config=conf_parser,
                                        gene_name=gene.name,
                                        add_gene_name=False)
            selected_orthologs, orth_sent = get_best_orthologs_and_sentence(
                dm=dm,
                orth_fullnames=dm.orth_fullnames,
                human_genes_props=human_genes_props,
                gene_desc=gene_desc,
                api_manager=api_manager,
                config=conf_parser)
            set_gene_ontology_module(dm=dm, conf_parser=conf_parser, gene_desc=gene_desc, gene=gene)
            set_tissue_expression_sentence(dm=dm, gene=gene, conf_parser=conf_parser, gene_desc=gene_desc)
            # expression clusters are added only when nothing has been written for the gene so far
            if not gene_desc.description:
                set_expression_cluster_sentence(dm=dm,
                                                conf_parser=conf_parser,
                                                gene_desc=gene_desc,
                                                gene=gene,
                                                api_manager=api_manager)
            set_disease_module(df=dm, conf_parser=conf_parser, gene=gene, gene_desc=gene_desc)
            # the information poor sentence is added only when there is no GO description
            if not gene_desc.go_description:
                set_information_poor_sentence(orth_fullnames=dm.orth_fullnames,
                                              selected_orthologs=selected_orthologs,
                                              conf_parser=conf_parser,
                                              human_df_agr=df_agr,
                                              gene_desc=gene_desc,
                                              dm=dm,
                                              gene=gene)
            gene_desc.set_or_extend_module_description_and_final_stats(module=Module.ORTHOLOGY,
                                                                       description=orth_sent)
            if has_sister_species and dm.get_best_orthologs_for_gene(
                    gene.id,
                    orth_species_full_name=[dm.sister_sp_fullname],
                    sister_species_data_fetcher=sister_df,
                    ecode_priority_list=list(sister_ecode_priority))[0]:
                set_sister_species_sentence(dm=dm,
                                            sister_sp_fullname=dm.sister_sp_fullname,
                                            sister_df=sister_df,
                                            species=organisms_info,
                                            organism=organism,
                                            gene_desc=gene_desc,
                                            conf_parser=conf_parser,
                                            gene=gene)
            desc_writer.add_gene_desc(gene_desc)
        logger.info("All genes processed for %s", organism)
        date_prefix = datetime.date.today().strftime("%Y%m%d")
        # common part of all output file paths: <out_dir>/<date>_<organism>
        out_path_stem = os.path.join(conf_parser.get_out_dir(), date_prefix + "_" + organism)
        if "json" in args.output_formats:
            logger.info("Writing descriptions to json")
            desc_writer.write_json(out_path_stem + ".json", include_single_gene_stats=True, data_manager=dm)
        if "txt" in args.output_formats:
            logger.info("Writing descriptions to txt")
            desc_writer.write_plain_text(out_path_stem + ".txt")
        if "tsv" in args.output_formats:
            logger.info("Writing descriptions to tsv")
            desc_writer.write_tsv(out_path_stem + ".tsv")
        if "ace" in args.output_formats:
            logger.info("Writing descriptions to ace")
            curators = ["WBPerson324", "WBPerson37462"]
            # the ace writer receives the configured release, not the incremented one
            desc_writer.write_ace(out_path_stem + ".ace", curators, release_version)