Beispiel #1
0
def load_data(organism, conf_parser: GenedescConfigParser):
    """Create and populate the data managers used to describe one organism.

    Three managers may be built:

    * the main ``WBDataManager`` for ``organism`` (always), fully loaded via
      ``load_all_data_from_file``;
    * a ``WBDataManager`` for the organism's main sister species, loaded with
      GO ontology and GO associations only, when the organism's entry in the
      WormBase organisms info has a non-empty ``main_sister_species``;
    * a plain ``DataManager`` holding the GO ontology and associations taken
      from the human-orthologs URLs in the configuration, only when
      ``organism`` is ``"c_elegans"``.

    Args:
        organism: key of the organism in the WormBase organisms info.
        conf_parser: configuration object providing URLs, cache directory and
            organisms info.

    Returns:
        tuple: ``(main manager, sister manager or None, human-orthologs
        manager or None)``.
    """
    log = logging.getLogger("WB Gene Description Pipeline - Data loader")
    go_relation_ids = ("subClassOf", "BFO:0000050")
    all_organisms = conf_parser.get_wb_organisms_info()

    main_manager = WBDataManager(species=organism, do_relations=None, go_relations=list(go_relation_ids),
                                 config=conf_parser)

    human_orth_manager = None
    if organism == "c_elegans":
        human_orth_manager = DataManager(go_relations=list(go_relation_ids), do_relations=None)

        # GO ontology used for the human orthologs, cached in its own sub-directory.
        human_go_ontology_url = conf_parser.get_wb_human_orthologs_go_ontology()
        human_go_ontology_cache = os.path.join(conf_parser.get_cache_dir(), "wormbase_agr_human",
                                               "go_ontology.obo")
        human_orth_manager.load_ontology_from_file(ontology_type=DataType.GO,
                                                   ontology_url=human_go_ontology_url,
                                                   ontology_cache_path=human_go_ontology_cache,
                                                   config=conf_parser)

        # GO associations for the human orthologs, cached next to the ontology.
        human_go_assoc_url = conf_parser.get_wb_human_orthologs_go_associations()
        human_go_assoc_cache = os.path.join(conf_parser.get_cache_dir(), "wormbase_agr_human",
                                            "go_assoc.daf.gz")
        human_orth_manager.load_associations_from_file(associations_type=DataType.GO,
                                                       associations_url=human_go_assoc_url,
                                                       associations_cache_path=human_go_assoc_cache,
                                                       config=conf_parser)

    sister_manager = None
    organism_entry = all_organisms[organism]
    # The key may be present but empty, in which case there is no sister species to load.
    sister_name = organism_entry["main_sister_species"] if "main_sister_species" in organism_entry else None
    if sister_name:
        sister_manager = WBDataManager(species=sister_name, do_relations=None,
                                       go_relations=list(go_relation_ids), config=conf_parser)
        log.info("Loading GO data for sister species")
        sister_manager.load_ontology_from_file(
            ontology_type=DataType.GO,
            ontology_url=sister_manager.go_ontology_url,
            ontology_cache_path=sister_manager.go_ontology_cache_path,
            config=conf_parser)
        sister_manager.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url=sister_manager.go_associations_url,
            associations_cache_path=sister_manager.go_associations_cache_path,
            config=conf_parser)

    log.info("Loading all data for main species")
    main_manager.load_all_data_from_file()
    return main_manager, sister_manager, human_orth_manager
Beispiel #2
0
    def __init__(self,
                 config: GenedescConfigParser,
                 species: str,
                 go_relations: List[str] = None,
                 do_relations: List[str] = None,
                 use_cache: bool = False):
        """Create a new data fetcher for WormBase.

        For convenience, the locations of all WormBase files for the requested species and release are generated
        here and stored in instance attributes ending in ``_url`` (remote files) and ``_cache_path`` (local cache
        copies). The code in this constructor only computes these locations and initializes empty containers.

        Args:
            config (GenedescConfigParser): configuration providing the raw file source, the cache directory, the
                WormBase release and the per-organism info
            species (str): WormBase species to fetch; must be a key of the organisms info in ``config``
            go_relations (List[str]): forwarded unchanged to the parent class constructor
            do_relations (List[str]): forwarded unchanged to the parent class constructor
            use_cache (bool): forwarded unchanged to the parent class constructor

        Raises:
            KeyError: if ``species`` is not in the organisms info or has no ``project_id``, or if a non-empty
                ``main_sister_species`` / an ``ortholog`` entry names an organism missing from the organisms info
        """
        self.config = config
        raw_files_source = config.get_wb_raw_file_sources()
        cache_location = config.get_cache_dir()
        release_version = config.get_wb_release()
        organisms_info = config.get_wb_organisms_info()
        species_info = organisms_info[species]
        project_id = species_info["project_id"]

        self.sister_sp_fullname = ""
        sister_species = species_info["main_sister_species"] if "main_sister_species" in species_info else None
        # The key can be present with an empty value (no sister species); only look the sister up when it is set,
        # otherwise the lookup in organisms_info would fail with a KeyError.
        if sister_species and "full_name" in organisms_info[sister_species]:
            self.sister_sp_fullname = organisms_info[sister_species]["full_name"]

        # NOTE(review): the default is an empty string while the populated value is a list; kept as-is because
        # callers may rely on it - confirm before unifying the type.
        self.orth_fullnames = ""
        ortholog_species = species_info["ortholog"] if "ortholog" in species_info else None
        # Full names are stored only if every ortholog species has one; an empty (None) entry is treated as
        # "no orthologs" instead of failing on iteration.
        if ortholog_species is not None and all(
                "full_name" in organisms_info[ortholog_sp] for ortholog_sp in ortholog_species):
            self.orth_fullnames = [organisms_info[ortholog_sp]["full_name"] for ortholog_sp in ortholog_species]

        # Optional per-species prefixes for the expression cluster files; None when not configured.
        expression_cluster_anatomy_prefix = species_info["ec_anatomy_prefix"] if \
            "ec_anatomy_prefix" in species_info else None
        expression_cluster_molreg_prefix = species_info["ec_molreg_prefix"] if \
            "ec_molreg_prefix" in species_info else None
        expression_cluster_genereg_prefix = species_info["ec_genereg_prefix"] if \
            "ec_genereg_prefix" in species_info else None

        super().__init__(go_relations=go_relations,
                         do_relations=do_relations,
                         use_cache=use_cache)

        # Shared building blocks of the remote and cached file locations.
        # Per-species annotation files are named <species>.<project_id>.<release><suffix>.
        annotation_file_prefix = species + '.' + project_id + '.' + release_version
        annotation_url_prefix = raw_files_source + '/' + release_version + '/species/' + species + '/' + \
            project_id + '/annotation/' + annotation_file_prefix
        annotation_cache_dir = os.path.join(cache_location, "wormbase", release_version, "species", species,
                                            project_id, "annotation")
        # Release-wide ontology files live in the ONTOLOGY directory.
        ontology_url_prefix = raw_files_source + '/' + release_version + '/ONTOLOGY/'
        ontology_cache_dir = os.path.join(cache_location, "wormbase", release_version, "ONTOLOGY")

        # Gene data
        self.gene_data_cache_path = os.path.join(annotation_cache_dir, annotation_file_prefix + ".geneIDs.txt.gz")
        self.gene_data_url = annotation_url_prefix + '.geneIDs.txt.gz'

        # Gene Ontology
        self.go_ontology_cache_path = os.path.join(ontology_cache_dir, "gene_ontology." + release_version + ".obo")
        self.go_ontology_url = ontology_url_prefix + 'gene_ontology.' + release_version + '.obo'
        self.go_associations_cache_path = os.path.join(annotation_cache_dir,
                                                       annotation_file_prefix + ".go_annotations.gaf.gz")
        self.go_associations_url = annotation_url_prefix + '.go_annotations.gaf.gz'

        # Disease Ontology. The association files are read from the release-wide ONTOLOGY directory but cached
        # under the species annotation directory.
        self.do_ontology_url = ontology_url_prefix + 'disease_ontology.' + release_version + '.obo'
        self.do_ontology_cache_path = os.path.join(ontology_cache_dir,
                                                   "disease_ontology." + release_version + ".obo")
        self.do_associations_cache_path = os.path.join(annotation_cache_dir,
                                                       annotation_file_prefix + ".do_annotations.wb")
        self.do_associations_url = ontology_url_prefix + 'disease_association.' + release_version + '.wb'
        self.do_associations_new_cache_path = os.path.join(annotation_cache_dir,
                                                           annotation_file_prefix + ".do_annotations.daf.txt")
        self.do_associations_new_url = ontology_url_prefix + 'disease_association.' + release_version + '.daf.txt'

        # Orthology
        self.orthology_url = annotation_url_prefix + '.orthologs.txt.gz'
        self.orthology_cache_path = os.path.join(annotation_cache_dir, annotation_file_prefix + ".orthologs.txt.gz")
        self.orthologs = defaultdict(lambda: defaultdict(list))

        # Protein domains
        self.protein_domain_url = annotation_url_prefix + '.protein_domains.csv.gz'
        self.protein_domain_cache_path = os.path.join(annotation_cache_dir,
                                                      annotation_file_prefix + ".protein_domains.csv.gz")
        self.protein_domains = defaultdict(list)

        # Expression (anatomy ontology and associations)
        self.expression_ontology_cache_path = os.path.join(ontology_cache_dir,
                                                           "anatomy_ontology." + release_version + ".obo")
        self.expression_ontology_url = ontology_url_prefix + 'anatomy_ontology.' + release_version + '.obo'
        self.expression_associations_cache_path = os.path.join(ontology_cache_dir,
                                                               "anatomy_association." + release_version + ".wb")
        self.expression_associations_url = ontology_url_prefix + 'anatomy_association.' + release_version + '.wb'

        # Expression clusters: the data container is created only when a URL could be determined for the type.
        self.expression_cluster_anatomy_url = self._get_expression_cluster_url(
            prefix=expression_cluster_anatomy_prefix,
            ec_type="anatomy",
            release_version=release_version)
        self.expression_cluster_anatomy_cache_path = self._get_expression_cluster_cache_path(
            prefix=expression_cluster_anatomy_prefix,
            ec_type="anatomy",
            release_version=release_version,
            cache_location=cache_location)
        self.expression_cluster_anatomy_data = defaultdict(
            list) if self.expression_cluster_anatomy_url else None
        self.expression_cluster_molreg_url = self._get_expression_cluster_url(
            prefix=expression_cluster_molreg_prefix,
            ec_type="molReg",
            release_version=release_version)
        self.expression_cluster_molreg_cache_path = self._get_expression_cluster_cache_path(
            prefix=expression_cluster_molreg_prefix,
            ec_type="molReg",
            release_version=release_version,
            cache_location=cache_location)
        self.expression_cluster_molreg_data = defaultdict(
            list) if self.expression_cluster_molreg_url else None
        self.expression_cluster_genereg_url = self._get_expression_cluster_url(
            prefix=expression_cluster_genereg_prefix,
            ec_type="geneReg",
            release_version=release_version)
        self.expression_cluster_genereg_cache_path = self._get_expression_cluster_cache_path(
            prefix=expression_cluster_genereg_prefix,
            ec_type="geneReg",
            release_version=release_version,
            cache_location=cache_location)
        self.expression_cluster_genereg_data = defaultdict(
            list) if self.expression_cluster_genereg_url else None