Example 1
    def __init__(self, args, logger, context_info):
        self.args = args
        self.logger = logger
        self.context_info = context_info
        self.start_time = time.time()
        self.schema_branch = self.context_info.env["TEST_SCHEMA_BRANCH"]
        if self.schema_branch != 'master':
            self.logger.warning(
                "*******WARNING: Using branch {} for schema.".format(
                    self.schema_branch))

        # Let's delete the old files and download new ones. They are small.
        for name in ['tmp/species.yaml', 'tmp/resourceDescriptors.yaml']:
            if os.path.exists(name):
                self.logger.warning(
                    "*********WARNING: removing old {} file.".format(name))
                os.remove(name)
        self.logger.info("Getting files initially")
        url = 'https://raw.githubusercontent.com/alliance-genome/agr_schemas/SCHEMA_BRANCH/resourceDescriptors.yaml'
        url = url.replace('SCHEMA_BRANCH', self.schema_branch)
        Download('tmp', url, 'resourceDescriptors.yaml').get_downloaded_data()
        url = 'https://raw.githubusercontent.com/alliance-genome/agr_schemas/SCHEMA_BRANCH/ingest/species/species.yaml'
        url = url.replace('SCHEMA_BRANCH', self.schema_branch)
        Download('tmp', url, 'species.yaml').get_downloaded_data()
        self.logger.info("Finished getting files initially")

    def get_data(self):
        # Grab the data (TODO validate).
        # Some of this algorithm is temporary.
        # e.g. Files from the submission system will arrive without the need for unzipping, etc.
        path = 'tmp'
        context_info = ContextInfo()
        if "SAVE_PATH" in context_info.env:
            if context_info.env["SAVE_PATH"]:
                path = context_info.env["SAVE_PATH"]
                if not os.path.exists(path):
                    logger.info("Making temp file storage: %s" % (path))
                    os.makedirs(path)

        if self.filepath is not None:
            if not os.path.isfile(self.filepath):
                self.logger.debug("File to download: %s", self.file_to_download)
                if self.file_to_download.startswith('http'):
                    download_filename = os.path.basename(self.filepath)
                    self.logger.debug("Download Name: %s", download_filename)
                    download_object = Download(path, self.file_to_download,
                                               download_filename)
                    self.already_downloaded = download_object.get_downloaded_data_new()
                else:
                    self.logger.debug("Downloading JSON File: %s",
                                      self.file_to_download)
                    self.already_downloaded = S3File(self.file_to_download,
                                                     path).download_new()
                    self.logger.debug("File already downloaded: %s",
                                      self.already_downloaded)
                    if self.file_to_download.endswith('tar.gz'):
                        self.logger.debug("Extracting all files: %s",
                                          self.file_to_download)
                        tar_object = TARFile(path, self.file_to_download)
                        tar_object.extract_all()
                # Check whether the file now exists locally; os.path.isfile()
                # returns a bool and never raises, so test it directly.
                if not os.path.isfile(self.filepath):
                    self.logger.critical('No local copy of the specified file found!')
                    self.logger.critical(
                        'Missing copy of %s for sub type: %s from data type: %s',
                        self.filepath, self.sub_data_type, self.data_type)
                    self.logger.critical('Please check download functions or data source.')
                    sys.exit(-1)
            else:
                self.logger.debug("File Path already downloaded: %s", self.filepath)
        else:
            self.logger.debug("File Path is None; not downloading")
Example 3
    def get_generators(self, filepath):
        """Get Generators."""

        species_file = Download('tmp', filepath,
                                'species.yaml').get_downloaded_data()
        yaml_list = yaml.load(species_file, Loader=yaml.SafeLoader)
        species_list = []

        for stanza in yaml_list:
            common_names = list(stanza.get("commonNames"))
            species_dataset = {
                "taxon_id": stanza.get("taxonId"),
                "name": stanza.get("fullName"),
                "short_name": stanza.get("shortName"),
                "common_names": common_names,
                "data_provider_full_name":
                    stanza.get("primaryDataProvider").get("dataProviderFullName"),
                "data_provider_short_name":
                    stanza.get("primaryDataProvider").get("dataProviderShortName"),
                "phylogenetic_order": stanza.get("phylogenicOrder"),
            }
            species_list.append(species_dataset)
        yield [species_list]
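get_generators() yields a single batch wrapping the full species list. The stand-in below mimics that shape with one made-up stanza, to show how a caller would consume it (the values are illustrative, not real Alliance data):

def get_generators_demo():
    # Same yield shape as get_generators() above: one batch, one list.
    species_list = [{
        "taxon_id": "NCBITaxon:10090",
        "name": "Mus musculus",
        "short_name": "Mmu",
        "common_names": ["mouse"],
        "data_provider_full_name": "Mouse Genome Informatics",
        "data_provider_short_name": "MGI",
        "phylogenetic_order": 3,
    }]
    yield [species_list]

for batch in get_generators_demo():
    for species in batch[0]:
        print(species["taxon_id"], species["short_name"])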
Example 4
    def __init__(self):

        # TODO This should eventually be tied to the schemas submodule.
        url = 'https://raw.githubusercontent.com/' \
               + 'alliance-genome/agr_schemas/master/resourceDescriptors.yaml'

        resource_descriptor_file = Download(
            'tmp', url, 'resourceDescriptors.yaml').get_downloaded_data()

        self.yaml_list = yaml.load(resource_descriptor_file,
                                   Loader=yaml.SafeLoader)

        # Convert the list into a more useful lookup dictionary keyed by db_prefix.
        self.resource_descriptor_dict = {}
        for item in self.yaml_list:
            name = item['db_prefix']
            self.resource_descriptor_dict[name] = item

        # Iterate through this new dictionary and convert each page list into
        # a dictionary keyed by the page name.
        for entry, descriptor in self.resource_descriptor_dict.items():
            if 'pages' in descriptor:  # If we have a pages list.
                descriptor['pages'] = {
                    page_item['name']: page_item
                    for page_item in descriptor['pages']
                }
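The net effect of that conversion is to turn each 'pages' list into a dictionary keyed by page name. A minimal sketch with made-up page entries:

pages = [
    {"name": "gene", "url": "https://example.org/gene/[%s]"},
    {"name": "allele", "url": "https://example.org/allele/[%s]"},
]
pages_by_name = {page["name"]: page for page in pages}
print(pages_by_name["gene"]["url"])  # https://example.org/gene/[%s]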
Example 5
    def get_data(self):
        """get data"""

        # Grab the data (TODO validate).
        # Some of this algorithm is temporary.
        # e.g. Files from the submission system will arrive without the need for unzipping, etc.
        download_dir = 'tmp'

        if self.filepath is not None:
            if not os.path.isfile(self.filepath):
                self.logger.debug("File to download: %s", self.file_to_download)
                if self.file_to_download.startswith('http'):
                    download_filename = os.path.basename(self.filepath)
                    self.logger.debug("Download Name: %s", download_filename)
                    download_object = Download(download_dir,
                                               self.file_to_download,
                                               download_filename)
                    self.already_downloaded = download_object.is_data_downloaded()
                else:
                    self.logger.debug("Downloading JSON File: %s", self.file_to_download)
                    self.already_downloaded = S3File(self.file_to_download,
                                                     download_dir).download_new()
                    self.logger.debug("File already downloaded: %s", self.already_downloaded)
                    if self.file_to_download.endswith('tar.gz'):
                        self.logger.debug("Extracting all files: %s", self.file_to_download)
                        tar_object = TARFile(download_dir, self.file_to_download)
                        tar_object.extract_all()
                # Check whether the file now exists locally; os.path.isfile()
                # returns a bool and never raises, so test it directly.
                if self.filepath is not None and not os.path.isfile(self.filepath):
                    self.logger.critical('No local copy of the specified file found!')
                    self.logger.critical('Missing copy of %s for sub type: %s from data type: %s',
                                         self.filepath,
                                         self.sub_data_type,
                                         self.data_type)
                    self.logger.critical('Please check download functions or data source.')
                    sys.exit(-1)
            else:
                self.logger.debug("File Path already downloaded: %s", (self.filepath))
        else:
            self.logger.debug("File Path is None not downloading")
    def __init__(self):
        """Load the dict from file."""
        if self.resource_descriptor_dict:
            self.logger.critical("keys are:- %s", self.resource_descriptor_dict.keys())
            return

        url = 'https://raw.githubusercontent.com/' \
            + 'alliance-genome/agr_schemas/master/resourceDescriptors.yaml'

        resource_descriptor_file = Download('tmp',
                                            url,
                                            'resourceDescriptors.yaml').get_downloaded_data()

        yaml_list = yaml.load(resource_descriptor_file, Loader=yaml.SafeLoader)
        # Convert the list into a more useful lookup dictionary keyed by db_prefix.
        resource_descriptor_dict = {}
        for item in yaml_list:
            main_key = item['db_prefix'].upper()
            resource_descriptor_dict[main_key] = item
            self.key_lookup[item['db_prefix']] = main_key
            self.key_lookup[main_key] = main_key
            self.key_lookup[item['name'].upper()] = main_key
            if 'aliases' in item:
                for alt_name in item['aliases']:
                    self.key_lookup[alt_name.upper()] = main_key
            if 'ignore_url_generation' in item:
                self.no_url[main_key] = 1
        # Iterate through this new dictionary and convert page lists to dictionaries.
        # These are keyed by the page name.
        for entry in resource_descriptor_dict:
            if 'pages' in resource_descriptor_dict[entry]:  # If we have a pages list.
                resource_descriptor_dict[entry]['pages_temp'] = dict()
                for page_item in resource_descriptor_dict[entry]['pages']:
                    page_name = page_item['name']
                    resource_descriptor_dict[entry]['pages_temp'][page_name] = page_item
                del resource_descriptor_dict[entry]['pages']  # Remove the old list.
                # Rename the new dict with the same name as the old list. For clarity.
                resource_descriptor_dict[entry]['pages'] = \
                    resource_descriptor_dict[entry].pop('pages_temp')

        ResourceDescriptorHelper2.resource_descriptor_dict = resource_descriptor_dict
        self._get_alt_keys()

    def _get_alt_keys(self):
        """Get alternative keys for species.

        These are stored in the resourceDescriptor.yaml file under
        aliases. The keys for this are not used/stored but are here for reference
        or may be used at a later point.
        """
        url = 'https://raw.githubusercontent.com/alliance-genome/agr_schemas/master/ingest/species/species.yaml'
        self.logger.critical("species url is %s", url)

        resource_descriptor_file = Download('tmp',
                                            url,
                                            'species.yaml').get_downloaded_data()

        yaml_list = yaml.load(resource_descriptor_file, Loader=yaml.SafeLoader)
        for item in yaml_list:
            db_name = item['primaryDataProvider']['dataProviderShortName'].upper()
            # Hack: human data comes from RGD, but we do not want to overwrite
            # the RGD entry, so hard-code 'HUMAN' as the key instead.
            if db_name == 'RGD' and item['fullName'] == 'Homo sapiens':
                db_name = 'HUMAN'
                self.key_lookup['HUMAN'] = db_name
            self.key_lookup[db_name] = db_name
            self.key_lookup[item['fullName'].upper()] = db_name
            self.key_to_fullname[db_name] = item['fullName']
            for name in item['commonNames']:
                self.key_lookup[name.upper()] = db_name
            tax_word, tax_id, _ = self.split_identifier(item['taxonId'])
            self.key_to_taxonid[db_name] = tax_id
            self.key_lookup[item['taxonId'].upper()] = db_name
            self.key_lookup[tax_id] = db_name
            # S. cerevisiae has two taxon IDs, so hard-code the second one,
            # which is not in the species file.
            if item['fullName'] == 'Saccharomyces cerevisiae':
                self.key_lookup['4932'] = db_name
                self.key_lookup['NCBITAXON:4932'] = db_name
            self.key_lookup[item['shortName'].upper()] = db_name
            self.key_to_order[db_name] = item['phylogenicOrder']
            self.key_to_shortname[db_name] = item['shortName']
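_get_alt_keys() builds one flat lookup table so that any spelling of a species (full name, short name, common names, taxon ID) resolves to the same provider key. A condensed, standalone illustration with a single made-up stanza:

key_lookup = {}
stanza = {
    "primaryDataProvider": {"dataProviderShortName": "MGI"},
    "fullName": "Mus musculus",
    "shortName": "Mmu",
    "commonNames": ["mouse"],
    "taxonId": "NCBITaxon:10090",
}
db_name = stanza["primaryDataProvider"]["dataProviderShortName"].upper()
aliases = [stanza["fullName"], stanza["shortName"], stanza["taxonId"]]
aliases += stanza["commonNames"]
for alias in aliases:
    key_lookup[alias.upper()] = db_name
print(key_lookup["MOUSE"])  # MGI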
    @staticmethod
    def get_data():
        """Get data."""

        ResourceDescriptorHelper.logger.info("got to resourcedescriptor")
        if len(ResourceDescriptorHelper.list_of_descriptor_maps_to_load) > 0:
            return ResourceDescriptorHelper.list_of_descriptor_maps_to_load

        url = 'https://raw.githubusercontent.com/'\
                + 'alliance-genome/agr_schemas/master/resourceDescriptors.yaml'
        resource_descriptor_file = Download(
            'tmp', url, 'resourceDescriptors.yaml').get_downloaded_data()

        yaml_list = yaml.load(resource_descriptor_file, Loader=yaml.SafeLoader)
        for stanza in yaml_list:
            stanza_map = {}

            resource = stanza.get("db_prefix")
            pages = stanza.get("pages")
            default_url = stanza.get("default_url")
            gid_pattern = stanza.get("gid_pattern")
            default_url_suffix = ""

            if default_url is not None:
                default_url_parts = default_url.split("[%s]")
                default_url_prefix = default_url_parts[0]
                if len(default_url_parts) > 1:
                    default_url_suffix = default_url_parts[1]

            if pages is not None:
                for page in pages:
                    page_url_suffix = ""
                    page_name = page.get("name")
                    page_url = page.get("url")
                    if page_url is not None:
                        page_url_parts = page_url.split("[%s]")
                        page_url_prefix = page_url_parts[0]
                        if len(page_url_parts) > 1:
                            page_url_suffix = page_url_parts[1]

                        stanza_map[resource + page_name] = {
                            "resource": resource,
                            "default_url": default_url,
                            "gid_pattern": gid_pattern,
                            "page_name": page_name,
                            "page_url": page_url,
                            "page_url_prefix": page_url_prefix,
                            "page_url_suffix": page_url_suffix,
                            "default_url_prefix": default_url_prefix,
                            "default_url_suffix": default_url_suffix,
                            "primaryKey": resource + page_name,
                            "uuid": str(uuid.uuid4())
                        }
                        ResourceDescriptorHelper.list_of_descriptor_maps_to_load.append(
                            stanza_map)

                        # TODO: fix special casing of NCBI links w/o pages in BGI
                        if resource == 'NCBI_Gene':
                            stanza_map[resource] = {
                                "resource": resource,
                                "default_url": default_url,
                                "gid_pattern": gid_pattern,
                                "default_url_prefix": default_url_prefix,
                                "default_url_suffix": default_url_suffix,
                                "page_url": "",
                                "page_name": "",
                                "page_url_prefix": default_url_prefix,
                                "page_url_suffix": default_url_suffix,
                                "primaryKey": resource,
                                "uuid": str(uuid.uuid4())
                            }

            else:
                stanza_map[resource] = {
                    "resource": resource,
                    "default_url": default_url,
                    "gid_pattern": gid_pattern,
                    "default_url_prefix": default_url_prefix,
                    "default_url_suffix": default_url_suffix,
                    "page_url": "",
                    "page_name": "",
                    "page_url_prefix": default_url_prefix,
                    "page_url_suffix": default_url_suffix,
                    "primaryKey": resource,
                    "uuid": str(uuid.uuid4())
                }
                ResourceDescriptorHelper.list_of_descriptor_maps_to_load.append(
                    stanza_map)

        return ResourceDescriptorHelper.list_of_descriptor_maps_to_load
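The split on the '[%s]' placeholder above yields a URL prefix and suffix that callers can later wrap around a concrete identifier. A minimal sketch (the NCBI URL is illustrative):

default_url = "https://www.ncbi.nlm.nih.gov/gene/[%s]"
parts = default_url.split("[%s]")
prefix = parts[0]
suffix = parts[1] if len(parts) > 1 else ""
print(prefix + "12345" + suffix)  # https://www.ncbi.nlm.nih.gov/gene/12345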