def __init__(self, args, logger, context_info):
    self.args = args
    self.logger = logger
    self.context_info = context_info
    self.start_time = time.time()

    # Re-read the environment to pick up the schema branch setting.
    context_info = ContextInfo()
    self.schema_branch = context_info.env["TEST_SCHEMA_BRANCH"]
    if self.schema_branch != 'master':
        self.logger.warning(
            "*******WARNING: Using branch {} for schema.".format(
                self.schema_branch))

    # Delete the old files and download new ones. They are small.
    for name in ['tmp/species.yaml', 'tmp/resourceDescriptors.yaml']:
        if os.path.exists(name):
            self.logger.warning(
                "*********WARNING: removing old {} file.".format(name))
            os.remove(name)

    self.logger.info("Getting files initially")
    url = 'https://raw.githubusercontent.com/alliance-genome/agr_schemas/SCHEMA_BRANCH/resourceDescriptors.yaml'
    url = url.replace('SCHEMA_BRANCH', self.schema_branch)
    Download('tmp', url, 'resourceDescriptors.yaml').get_downloaded_data()

    url = 'https://raw.githubusercontent.com/alliance-genome/agr_schemas/SCHEMA_BRANCH/ingest/species/species.yaml'
    url = url.replace('SCHEMA_BRANCH', self.schema_branch)
    Download('tmp', url, 'species.yaml').get_downloaded_data()
    self.logger.info("Finished getting files initially")
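# A minimal, runnable sketch of the branch-templated download above. The
# real code goes through the project's Download helper; this stand-in only
# shows the SCHEMA_BRANCH substitution, and the branch value is hypothetical.
def _example_schema_url(branch='master'):
    """Build the raw-GitHub schema URL for a given branch (illustrative)."""
    template = ('https://raw.githubusercontent.com/alliance-genome/'
                'agr_schemas/SCHEMA_BRANCH/resourceDescriptors.yaml')
    return template.replace('SCHEMA_BRANCH', branch)

# _example_schema_url('develop') returns
# 'https://raw.githubusercontent.com/alliance-genome/agr_schemas/develop/resourceDescriptors.yaml'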
def get_data(self):
    # Grab the data (TODO: validate).
    # Some of this algorithm is temporary;
    # e.g. files from the submission system will arrive without the need for unzipping, etc.
    path = 'tmp'
    context_info = ContextInfo()
    if "SAVE_PATH" in context_info.env:
        if context_info.env["SAVE_PATH"]:
            path = context_info.env["SAVE_PATH"]
            if not os.path.exists(path):
                logger.info("Making temp file storage: %s", path)
                os.makedirs(path)

    if self.filepath is not None:
        if not os.path.isfile(self.filepath):
            logger.debug("File to download: %s", self.file_to_download)
            if self.file_to_download.startswith('http'):
                download_filename = os.path.basename(self.filepath)
                logger.debug("Download Name: %s", download_filename)
                download_object = Download(path,
                                           self.file_to_download,
                                           download_filename)
                self.already_downloaded = download_object.get_downloaded_data_new()
            else:
                logger.debug("Downloading JSON File: %s", self.file_to_download)
                self.already_downloaded = S3File(self.file_to_download,
                                                 path).download_new()
            logger.debug("File already downloaded: %s", self.already_downloaded)

            if self.file_to_download.endswith('tar.gz'):
                logger.debug("Extracting all files: %s", self.file_to_download)
                tar_object = TARFile(path, self.file_to_download)
                tar_object.extract_all()

            # Check whether the file exists locally. os.path.isfile()
            # returns a bool and never raises, so test its return value
            # rather than wrapping it in try/except.
            if not os.path.isfile(self.filepath):
                logger.critical('No local copy of the specified file found!')
                logger.critical(
                    'Missing copy of %s for sub type: %s from data type: %s',
                    self.filepath, self.sub_data_type, self.data_type)
                logger.critical('Please check download functions or data source.')
                sys.exit(-1)
        else:
            logger.debug("File Path already downloaded: %s", self.filepath)
    else:
        logger.debug("File Path is None not downloading")
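# Sketch of the local-copy guard used above. os.path.isfile() returns a
# bool and does not raise for a missing path, so the guard has to test the
# return value; the filepath argument here is hypothetical.
import os

def _example_require_local_copy(filepath):
    """Raise if no local copy of a downloaded file exists (illustrative)."""
    if not os.path.isfile(filepath):
        raise FileNotFoundError('No local copy of %s found' % filepath)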
def get_generators(self, filepath):
    """Get generators."""
    species_file = Download('tmp', filepath, 'species.yaml').get_downloaded_data()
    yaml_list = yaml.load(species_file, Loader=yaml.SafeLoader)

    species_list = []
    for stanza in yaml_list:
        common_names = []
        for name in stanza.get("commonNames"):
            common_names.append(name)
        species_dataset = {
            "taxon_id": stanza.get("taxonId"),
            "name": stanza.get("fullName"),
            "short_name": stanza.get("shortName"),
            "common_names": common_names,
            "data_provider_full_name":
                stanza.get("primaryDataProvider").get("dataProviderFullName"),
            "data_provider_short_name":
                stanza.get("primaryDataProvider").get("dataProviderShortName"),
            "phylogenetic_order": stanza.get("phylogenicOrder")
        }
        species_list.append(species_dataset)
    yield [species_list]
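# Usage sketch for get_generators(): it yields a single batch wrapping
# species_list in a one-element list, so callers unpack one level. The etl
# instance and filepath are hypothetical stand-ins.
def _example_consume_species(etl, filepath='ingest/species/species.yaml'):
    """Flatten the one-batch generator into a plain record list (illustrative)."""
    records = []
    for batch in etl.get_generators(filepath):
        species_list = batch[0]  # the generator yields [species_list]
        records.extend(species_list)
    return records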
def __init__(self):
    # TODO This should eventually be tied to the schemas submodule.
    url = 'https://raw.githubusercontent.com/' \
          + 'alliance-genome/agr_schemas/master/resourceDescriptors.yaml'
    resource_descriptor_file = Download(
        'tmp', url, 'resourceDescriptors.yaml').get_downloaded_data()

    self.yaml_list = yaml.load(resource_descriptor_file, Loader=yaml.SafeLoader)

    # Convert the list into a more useful lookup dictionary keyed by db_prefix.
    self.resource_descriptor_dict = {}
    for item in self.yaml_list:
        name = item['db_prefix']
        self.resource_descriptor_dict[name] = item

    # Iterate through this new dictionary and convert page lists to
    # dictionaries keyed by the page name.
    for entry in self.resource_descriptor_dict:
        if 'pages' in self.resource_descriptor_dict[entry]:  # If we have a pages list.
            self.resource_descriptor_dict[entry]['pages_temp'] = dict()
            for page_item in self.resource_descriptor_dict[entry]['pages']:
                page_name = page_item['name']
                self.resource_descriptor_dict[entry]['pages_temp'][page_name] = page_item
            del self.resource_descriptor_dict[entry]['pages']  # Remove the old list.
            # Rename the new dict with the same name as the old list, for clarity.
            self.resource_descriptor_dict[entry]['pages'] = \
                self.resource_descriptor_dict[entry].pop('pages_temp')
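# Tiny worked example of the pages conversion above: re-keying a stanza's
# 'pages' list by page name makes page lookups direct. The sample page data
# is invented but follows the resourceDescriptors.yaml shape.
def _example_pages_to_dict(pages):
    """Convert a pages list into a dict keyed by page name (illustrative)."""
    return {page['name']: page for page in pages}

# _example_pages_to_dict([{'name': 'gene', 'url': 'https://example.org/[%s]'}])
# -> {'gene': {'name': 'gene', 'url': 'https://example.org/[%s]'}}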
def get_data(self):
    """Get data."""
    # Grab the data (TODO: validate).
    # Some of this algorithm is temporary;
    # e.g. files from the submission system will arrive without the need for unzipping, etc.
    download_dir = 'tmp'
    if self.filepath is not None:
        if not os.path.isfile(self.filepath):
            self.logger.debug("File to download: %s", self.file_to_download)
            if self.file_to_download.startswith('http'):
                download_filename = os.path.basename(self.filepath)
                self.logger.debug("Download Name: %s", download_filename)
                download_object = Download(download_dir,
                                           self.file_to_download,
                                           download_filename)
                self.already_downloaded = download_object.is_data_downloaded()
            else:
                self.logger.debug("Downloading JSON File: %s", self.file_to_download)
                self.already_downloaded = S3File(self.file_to_download,
                                                 download_dir).download_new()
            self.logger.debug("File already downloaded: %s", self.already_downloaded)

            if self.file_to_download.endswith('tar.gz'):
                self.logger.debug("Extracting all files: %s", self.file_to_download)
                tar_object = TARFile(download_dir, self.file_to_download)
                tar_object.extract_all()

            # Check whether the file exists locally. os.path.isfile()
            # returns a bool and never raises, so test its return value
            # rather than wrapping it in try/except.
            if not os.path.isfile(self.filepath):
                self.logger.critical('No local copy of the specified file found!')
                self.logger.critical(
                    'Missing copy of %s for sub type: %s from data type: %s',
                    self.filepath, self.sub_data_type, self.data_type)
                self.logger.critical('Please check download functions or data source.')
                sys.exit(-1)
        else:
            self.logger.debug("File Path already downloaded: %s", self.filepath)
    else:
        self.logger.debug("File Path is None not downloading")
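# A minimal stand-in for the TARFile wrapper used above, assuming its
# extract_all() simply unpacks the archive into the download directory;
# the real wrapper is defined elsewhere in this project.
import os
import tarfile

def _example_extract_all(download_dir, archive_name):
    """Unpack download_dir/archive_name into download_dir (illustrative)."""
    archive_path = os.path.join(download_dir, archive_name)
    with tarfile.open(archive_path, 'r:gz') as tar:
        tar.extractall(path=download_dir)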
def __init__(self):
    """Load the dict from file."""
    if self.resource_descriptor_dict:
        self.logger.critical("keys are: %s", self.resource_descriptor_dict.keys())
        return

    url = 'https://raw.githubusercontent.com/' \
          + 'alliance-genome/agr_schemas/master/resourceDescriptors.yaml'
    resource_descriptor_file = Download(
        'tmp', url, 'resourceDescriptors.yaml').get_downloaded_data()
    yaml_list = yaml.load(resource_descriptor_file, Loader=yaml.SafeLoader)

    # Convert the list into a more useful lookup dictionary keyed by db_prefix.
    resource_descriptor_dict = {}
    for item in yaml_list:
        main_key = item['db_prefix'].upper()
        resource_descriptor_dict[main_key] = item
        self.key_lookup[item['db_prefix']] = main_key
        self.key_lookup[main_key] = main_key
        self.key_lookup[item['name'].upper()] = main_key
        if 'aliases' in item:
            for alt_name in item['aliases']:
                self.key_lookup[alt_name.upper()] = main_key
        if 'ignore_url_generation' in item:
            self.no_url[main_key] = 1

    # Iterate through this new dictionary and convert page lists to
    # dictionaries keyed by the page name.
    for entry in resource_descriptor_dict:
        if 'pages' in resource_descriptor_dict[entry]:  # If we have a pages list.
            resource_descriptor_dict[entry]['pages_temp'] = dict()
            for page_item in resource_descriptor_dict[entry]['pages']:
                page_name = page_item['name']
                resource_descriptor_dict[entry]['pages_temp'][page_name] = page_item
            del resource_descriptor_dict[entry]['pages']  # Remove the old list.
            # Rename the new dict with the same name as the old list, for clarity.
            resource_descriptor_dict[entry]['pages'] = \
                resource_descriptor_dict[entry].pop('pages_temp')

    ResourceDescriptorHelper2.resource_descriptor_dict = resource_descriptor_dict
    self._get_alt_keys()
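# Hedged sketch of the alias normalisation above: every spelling of a
# resource (db_prefix, display name, aliases) maps to one upper-cased
# canonical key. The sample entry is invented.
def _example_build_key_lookup(items):
    """Build the alias -> canonical-key map (illustrative)."""
    key_lookup = {}
    for item in items:
        main_key = item['db_prefix'].upper()
        key_lookup[item['db_prefix']] = main_key
        key_lookup[main_key] = main_key
        key_lookup[item['name'].upper()] = main_key
        for alt_name in item.get('aliases', []):
            key_lookup[alt_name.upper()] = main_key
    return key_lookup

# _example_build_key_lookup([{'db_prefix': 'FB', 'name': 'FlyBase',
#                             'aliases': ['FLYBASE']}])['FLYBASE'] -> 'FB'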
def _get_alt_keys(self):
    """Get alternative keys for species.

    These come from the species.yaml file (common names, short names and
    taxon IDs); resourceDescriptors.yaml aliases are handled in __init__.
    The keys for this are not used/stored but are here for reference or
    may be used at a later point.
    """
    url = 'https://raw.githubusercontent.com/alliance-genome/agr_schemas/master/ingest/species/species.yaml'
    self.logger.debug("species url is %s", url)
    species_file = Download('tmp', url, 'species.yaml').get_downloaded_data()
    yaml_list = yaml.load(species_file, Loader=yaml.SafeLoader)
    for item in yaml_list:
        db_name = item['primaryDataProvider']['dataProviderShortName'].upper()
        # Hack: human data comes from RGD but we do not want to overwrite RGD,
        # so hardcode the key to HUMAN here instead.
        if db_name == 'RGD' and item['fullName'] == 'Homo sapiens':
            db_name = 'HUMAN'
            self.key_lookup['HUMAN'] = db_name
        self.key_lookup[db_name] = db_name
        self.key_lookup[item['fullName'].upper()] = db_name
        self.key_to_fullname[db_name] = item['fullName']
        for name in item['commonNames']:
            self.key_lookup[name.upper()] = db_name
        _, tax_id, _ = self.split_identifier(item['taxonId'])
        self.key_to_taxonid[db_name] = tax_id
        self.key_lookup[item['taxonId'].upper()] = db_name
        self.key_lookup[tax_id] = db_name
        # S. cerevisiae has two taxon IDs, so hardcode the second one,
        # which is not in the species file.
        if item['fullName'] == 'Saccharomyces cerevisiae':
            self.key_lookup['4932'] = db_name
            self.key_lookup['NCBITAXON:4932'] = db_name
        self.key_lookup[item['shortName'].upper()] = db_name
        self.key_to_order[db_name] = item['phylogenicOrder']
        self.key_to_shortname[db_name] = item['shortName']
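# A stand-in for the split_identifier() helper used above, assuming it
# splits a CURIE such as 'NCBITaxon:10090' into (prefix, local id,
# separator); the real helper is defined elsewhere in this class.
def _example_split_identifier(identifier, separator=':'):
    """Split 'NCBITaxon:10090' -> ('NCBITaxon', '10090', ':') (illustrative)."""
    prefix, _, local_id = identifier.partition(separator)
    return prefix, local_id, separator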
def get_data():
    """Get data."""
    ResourceDescriptorHelper.logger.info("got to resourcedescriptor")

    if len(ResourceDescriptorHelper.list_of_descriptor_maps_to_load) > 0:
        return ResourceDescriptorHelper.list_of_descriptor_maps_to_load

    url = 'https://raw.githubusercontent.com/'\
          + 'alliance-genome/agr_schemas/master/resourceDescriptors.yaml'
    resource_descriptor_file = Download(
        'tmp', url, 'resourceDescriptors.yaml').get_downloaded_data()

    yaml_list = yaml.load(resource_descriptor_file, Loader=yaml.SafeLoader)

    for stanza in yaml_list:
        stanza_map = {}
        resource = stanza.get("db_prefix")
        pages = stanza.get("pages")
        default_url = stanza.get("default_url")
        gid_pattern = stanza.get("gid_pattern")

        # Initialise both halves so they are defined even when the stanza
        # has no default_url.
        default_url_prefix = ""
        default_url_suffix = ""
        if default_url is not None:
            default_url_parts = default_url.split("[%s]")
            default_url_prefix = default_url_parts[0]
            if len(default_url_parts) > 1:
                default_url_suffix = default_url_parts[1]

        if pages is not None:
            for page in pages:
                page_url_prefix = ""
                page_url_suffix = ""
                page_name = page.get("name")
                page_url = page.get("url")
                if page_url is not None:
                    page_url_parts = page_url.split("[%s]")
                    page_url_prefix = page_url_parts[0]
                    if len(page_url_parts) > 1:
                        page_url_suffix = page_url_parts[1]
                stanza_map[resource + page_name] = {
                    "resource": resource,
                    "default_url": default_url,
                    "gid_pattern": gid_pattern,
                    "page_name": page_name,
                    "page_url": page_url,
                    "page_url_prefix": page_url_prefix,
                    "page_url_suffix": page_url_suffix,
                    "default_url_prefix": default_url_prefix,
                    "default_url_suffix": default_url_suffix,
                    "primaryKey": resource + page_name,
                    "uuid": str(uuid.uuid4())
                }
            ResourceDescriptorHelper.list_of_descriptor_maps_to_load.append(
                stanza_map)

        # TODO: fix special casing of NCBI links w/o pages in BGI.
        # The NCBI_Gene branch currently builds exactly the same entry as
        # the general case, so one default entry is emitted per resource.
        stanza_map[resource] = {
            "resource": resource,
            "default_url": default_url,
            "gid_pattern": gid_pattern,
            "default_url_prefix": default_url_prefix,
            "default_url_suffix": default_url_suffix,
            "page_url": "",
            "page_name": "",
            "page_url_prefix": default_url_prefix,
            "page_url_suffix": default_url_suffix,
            "primaryKey": resource,
            "uuid": str(uuid.uuid4())
        }
        ResourceDescriptorHelper.list_of_descriptor_maps_to_load.append(
            stanza_map)

    return ResourceDescriptorHelper.list_of_descriptor_maps_to_load
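# Worked sketch of how the prefix/suffix pairs built above get used: the
# '[%s]' placeholder splits a URL template so an identifier can be dropped
# in later. The template and identifier below are invented.
def _example_fill_url(url_template, identifier):
    """Insert an identifier into a '[%s]' URL template (illustrative)."""
    prefix, _, suffix = url_template.partition('[%s]')
    return prefix + identifier + suffix

# _example_fill_url('https://example.org/gene/[%s]/summary', 'FBgn0000490')
# -> 'https://example.org/gene/FBgn0000490/summary'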