def info(self, version='latest'):
    """
    Display version information (release note, etc...) for given version
    {
        "info": ...
        "release_note": ...
    }
    """
    meta_url = urljoin(self.base_url, "%s.json" % version)
    build_meta = self.load_remote_json(meta_url)
    if not build_meta:
        raise DumperException("Can't find version '%s'" % version)
    result = {"info": build_meta}
    changes = build_meta.get("changes")
    if changes:
        notes = {}
        for filtyp, change in changes.items():
            res = self.client.get(change["url"])
            if res.status_code != 200:
                raise DumperException(
                    "Error while downloading release note '%s (%s)': %s" %
                    (version, res, res.text))
            # "json" release notes are parsed, anything else kept as raw text
            notes[filtyp] = res.json() if filtyp == "json" else res.text
        result["release_note"] = notes
    return result
def auth_get(url, *args, **kwargs):
    """
    GET a private S3-hosted URL using AWS signature (AWS4Auth) authentication.

    NOTE(review): this is a closure — "self" is a free variable captured from
    the enclosing dumper method, providing AWS credentials (class attributes)
    and the underlying HTTP client (self._client).
    """
    # static-website S3 endpoints don't accept signed requests
    if ".s3-website-" in url:
        raise DumperException(
            "Can't access s3 static website using authentication")
    # extract region from URL (reliable ?)
    pat = re.compile(r"https?://(.*)\.(.*)\.amazonaws.com.*")
    m = pat.match(url)
    if m:
        bucket_name, frag = m.groups()
        # looks like "s3-us-west-2"
        # whether static website is activated or not
        region = frag.replace("s3-", "")
        if region == "s3":
            # url doesn't contain a region, we need to query the bucket
            s3client = boto3.client(
                "s3",
                aws_access_key_id=self.__class__.AWS_ACCESS_KEY_ID,
                aws_secret_access_key=self.__class__.AWS_SECRET_ACCESS_KEY)
            bucket_info = s3client.get_bucket_location(Bucket=bucket_name)
            region = bucket_info["LocationConstraint"]
        # sign the request for the resolved region
        auth = AWS4Auth(self.__class__.AWS_ACCESS_KEY_ID,
                        self.__class__.AWS_SECRET_ACCESS_KEY, region, 's3')
        return self._client.get(url, auth=auth, *args, **kwargs)
    else:
        raise DumperException(
            "Couldn't determine s3 region from url '%s'" % url)
def check_compat(self, build_meta):
    """
    Check that the remote build (build_meta) is compatible with the running
    app, comparing app/standalone/biothings version fields against btconfig.

    Raises DumperException when remote data is too old (version not defined),
    AssertionError when no compatible version overlaps.
    """
    if hasattr(btconfig, "SKIP_CHECK_COMPAT") and btconfig.SKIP_CHECK_COMPAT:
        return
    msg = []
    for version_field in [
            "app_version", "standalone_version", "biothings_version"
    ]:
        VERSION_FIELD = version_field.upper()
        version = build_meta.get(version_field)
        assert version is not None, "Version field '%s' is None" % VERSION_FIELD
        # some releases use dict (most recent) some use string
        if isinstance(version, dict):
            version = version["branch"]
        if not isinstance(version, list):
            version = [version]
        version = set(version)
        # check for None *before* stripping hashes: re.sub() would raise
        # TypeError on a None value and mask this friendlier error
        if version == {None}:
            raise DumperException(
                "Remote data is too old and can't be handled with current app (%s not defined)"
                % version_field)
        # remove hash from versions (only useful when version is a string,
        # not a dict, see above)
        version = {re.sub(r"( \[.*\])", "", v) for v in version if v is not None}
        versionfromconf = re.sub(
            r"( \[.*\])", "",
            getattr(btconfig, VERSION_FIELD).get("branch"))
        VERSION = {versionfromconf}
        found_compat_version = VERSION.intersection(version)
        assert found_compat_version, "Remote data requires %s to be %s, but current app is %s" % (
            version_field, version, VERSION)
        msg.append("%s=%s:OK" % (version_field, version))
    # emit the summary (it was built but never logged; matches the sibling
    # check_compat implementation)
    self.logger.debug("Compat: %s" % ", ".join(msg))
def post_dump(self, *args, **kwargs):
    """
    Post-process the downloaded release folder:
    - incremental releases: verify md5 checksums of all diff files listed in
      metadata.json, registering a failure and raising on mismatch
    - full releases: uncompress archives when the repository type is "fs"
    """
    if not self.release:
        # wasn't set before, means no need to post-process (ie. up-to-date, already done)
        return
    build_file = os.path.join(self.new_data_folder, "%s.json" % self.release)
    # context managers close the file handles (they were leaked before)
    with open(build_file) as f:
        build_meta = json.load(f)
    if build_meta["type"] == "incremental":
        self.logger.info("Checking md5sum for files in '%s'" %
                         self.new_data_folder)
        with open(os.path.join(self.new_data_folder, "metadata.json")) as f:
            metadata = json.load(f)
        for md5_fname in metadata["diff"]["files"]:
            spec_md5 = md5_fname["md5sum"]
            fname = md5_fname["name"]
            compute_md5 = md5sum(os.path.join(self.new_data_folder, fname))
            if compute_md5 != spec_md5:
                self.logger.error(
                    "md5 check failed for file '%s', it may be corrupted" %
                    fname)
                e = DumperException("Bad md5sum for file '%s'" % fname)
                self.register_status("failed", download={"err": repr(e)})
                raise e
            else:
                self.logger.debug("md5 check success for file '%s'" % fname)
    elif build_meta["type"] == "full":
        # if type=fs, check if archive must be uncompressed
        # TODO
        # repo_name = list(build_meta["metadata"]["repository"].keys())[0]
        if build_meta["metadata"]["repository"]["type"] == "fs":
            uncompressall(self.new_data_folder)
def get_release(self):
    """
    Set self.release to the most recent entry listed in the remote
    CWD_DIR folder (entries sorted lexicographically).
    Raises DumperException when the remote folder is empty.
    """
    self.client.cwd(self.__class__.CWD_DIR)
    releases = sorted(self.client.nlst())
    if not releases:
        # message fixed: was "Can't any release information"
        raise DumperException("Can't find any release information in '%s'" %
                              self.__class__.VERSION_DIR)
    self.release = releases[-1]
def post_download(self, remote, local):
    """
    Sanity-check a downloaded archive: the filename and the embedded readme
    must both carry the expected release, and the release must be the
    academic ("a"-suffixed) version. Raises DumperException otherwise.
    """
    filename = os.path.basename(local)
    if self.release not in filename:
        raise DumperException("Weird, filename is wrong ('%s')" % filename)
    # make sure we downloaded to correct one, and that it's the academic version
    # (use a context manager: the ZipFile handle was previously leaked)
    with zipfile.ZipFile(local) as zf:
        readme = None
        for f in zf.filelist:
            if "readme" in f.filename:
                readme = f
                break
        if not readme:
            raise DumperException("Can't find a readme in the archive (I was checking version/license)")
        if self.release not in readme.filename:
            raise DumperException("Version in readme filename ('%s') doesn't match expected version %s" % (readme.filename, self.release))
        assert self.release.endswith("a"), "Release '%s' isn't academic version (how possible ?)" % self.release
def info(self, version=LATEST):
    """Display version information (release note, etc...) for given version"""
    header = ">>> Current local version: '%s'\n" % self.target_backend.version
    header += ">>> Release note for remote version '%s':\n" % version
    file_url = self.__class__.SRC_URL % (self.__class__.BIOTHINGS_S3_FOLDER, version)
    build_meta = self.load_remote_json(file_url)
    if not build_meta:
        raise DumperException("Can't find version '%s'" % version)
    changes = build_meta.get("changes") or {}
    txt_change = changes.get("txt")
    if not txt_change:
        return header + "No information found for release '%s'" % version
    res = self.client.get(txt_change["url"])
    if res.status_code != 200:
        raise DumperException("Error while downloading release note '%s': %s" % (version, res))
    return header + res.text
def download(self, remotefile, localfile):  # pylint: disable=arguments-differ
    """
    Fetch data into localfile. Here "remotefile" is not a path but the name
    of a method on this dumper, which is looked up and invoked to do the
    actual download. A too-small result is treated as an error.
    """
    self.prepare_local_folders(localfile)
    self.logger.debug("Downloading '%s'", os.path.basename(localfile))
    # remote is a method name: dispatch to it
    fetcher = getattr(self, remotefile)
    fetcher(localfile)
    # rough sanity check against "empty" files w/ just headers
    min_size = 1024 * 1024  # 1 MiB
    if os.stat(localfile).st_size < min_size:
        raise DumperException("'%s' is too small, no data ?" % localfile)
def get_release(self):
    """
    Set self.release to the most recent date-named (YYYY-MM-DD) directory
    found in the remote listing. Raises DumperException when none exists.
    """
    # only dir with dates (raw string: "\d" was an invalid escape sequence)
    releases = sorted([
        d for d in self.client.nlst()
        if re.match(r"\d{4}-\d{2}-\d{2}", d)
    ])
    if not releases:
        # message fixed: was "Can't any release information"
        raise DumperException("Can't find any release information in '%s'" %
                              self.__class__.VERSION_DIR)
    self.release = releases[-1]
def versions(self):
    """Display all available versions"""
    remote = self.load_remote_json(self.__class__.VERSION_URL)
    if not remote:
        raise DumperException("Can't find any versions available...")
    # only one versions.json layout is understood
    fmt = remote["format"]
    assert fmt == "1.0", "versions.json format has changed: %s" % fmt
    return remote["versions"]
def get_drive_url(self, ftpname):
    # ok, so let's get the main page data. in this page there are links for both
    # FTP and Google Drive. We're assuming here that just after FTP link, there's
    # the corresponding one for Drive (parse will ensure we downloaded the correct
    # version, and also the correct licensed one - academic only)
    res = requests.get("https://sites.google.com/site/jpopgen/dbNSFP")
    page = BeautifulSoup(res.text, "html.parser")
    matches = page.findAll(attrs={"href": re.compile(ftpname)})
    if not matches:
        raise DumperException("Can't find a FTP link for '%s'" % ftpname)
    ftplink = matches.pop()
    # let's cross fingers here...
    drivelink = ftplink.findNextSibling()
    href = drivelink.get("href")
    if not href:
        raise DumperException("Can't find a href in drive link element: %s" % drivelink)
    return href
def get_latest_release(self):
    """
    Scrape the UNII homepage and return the latest release date as a
    "YYYY-MM-DD" string, parsed from the download link's label.
    Raises DumperException when the date can't be found or parsed.
    """
    res = self.client.get(self.__class__.HOMEPAGE_URL)
    html = bs4.BeautifulSoup(res.text, "lxml")
    # link containing the latest date version
    version = html.find(attrs={"href": "/srs/jsp/srs/uniiListDownload.jsp"}).text
    # raw string: "\(" was an invalid escape sequence in a plain string
    m = re.match(r"UNII List download \(updated (.*)\)", version)
    try:
        # m is None when the label didn't match; the AttributeError is caught
        # below and reported as a DumperException
        latest = datetime.date.strftime(dtparser.parse(m.groups()[0]), "%Y-%m-%d")
        return latest
    except Exception as e:
        raise DumperException("Can't find or parse date from URL '%s': %s" %
                              (self.__class__.HOMEPAGE_URL, e)) from e
def versions(self):
    """
    Display all available versions as a formatted, line-per-version string
    (build version, release date and type in padded columns).
    """
    versions_url = self.__class__.SRC_URL % (self.__class__.BIOTHINGS_S3_FOLDER, VERSIONS)
    avail_versions = self.load_remote_json(versions_url)
    if not avail_versions:
        # stray trailing quote removed; message now matches sibling versions()
        raise DumperException("Can't find any versions available...")
    assert avail_versions["format"] == "1.0", \
        "versions.json format has changed: %s" % avail_versions["format"]
    res = [
        "version=%s date=%s type=%s" % (
            '{0: <20}'.format(ver["build_version"]),
            '{0: <20}'.format(ver["release_date"]),
            '{0: <16}'.format(ver["type"]))
        for ver in avail_versions["versions"]
    ]
    return "\n".join(res)
def check_compat(self, build_meta):
    """
    Check that the remote build (build_meta) is compatible with the running
    app, comparing app/standalone/biothings version fields against btconfig.

    Raises DumperException when remote data is too old (version not defined),
    AssertionError when no compatible version overlaps.
    """
    if hasattr(btconfig, "SKIP_CHECK_COMPAT") and btconfig.SKIP_CHECK_COMPAT:
        return
    msg = []
    for version_field in ["app_version", "standalone_version", "biothings_version"]:
        VERSION_FIELD = version_field.upper()
        version = build_meta.get(version_field)
        # isinstance() instead of type() comparison
        if not isinstance(version, list):
            version = [version]
        version = set(version)
        if version == {None}:
            raise DumperException("Remote data is too old and can't be handled with current app (%s not defined)" % version_field)
        VERSION = {getattr(btconfig, VERSION_FIELD)}
        found_compat_version = VERSION.intersection(version)
        assert found_compat_version, "Remote data requires %s to be %s, but current app is %s" % (version_field, version, VERSION)
        msg.append("%s=%s:OK" % (version_field, version))
    self.logger.debug("Compat: %s" % ", ".join(msg))
def choose_best_version(self, versions):
    """
    Out of all compatible versions, choose the best:
    1. choose incremental vs. full according to preferences
    2. version must be the highest (most up-to-date)
    """
    if not versions:
        raise DumperException("No compatible version found")
    # 1st pass
    # TODO: implement inc/full preferences, for now prefer incremental.
    # Versions are dicts and an incremental build_version looks like
    # "old_version.new_version", so a dot marks incremental releases.
    # BUG FIX: the original tested `"." in v` on the dict itself (a key
    # lookup, never true) — test the build_version string instead.
    preferreds = [v for v in versions if "." in v["build_version"]]
    if preferreds:
        self.logger.info("Preferred versions (according to preferences): %s" % preferreds)
        versions = preferreds
    # we can directly take the max because:
    # - version is a string
    # - format if YYYYMMDD
    # - when incremental, it's always old_version.new_version
    return max(versions, key=lambda e: e["build_version"])
def download(self, remoteurl, localfile, headers=None):
    """
    Download remoteurl to localfile, choosing authenticated S3 download when
    AWS credentials are configured, anonymous download otherwise.

    :param headers: optional extra HTTP headers (defaults to none; the
        original mutable-default ``headers={}`` was replaced)
    """
    if headers is None:
        headers = {}
    self.prepare_local_folders(localfile)
    parsed = urlparse(remoteurl)
    if self.__class__.AWS_ACCESS_KEY_ID and self.__class__.AWS_SECRET_ACCESS_KEY:
        # accessing diffs controled by auth
        key = parsed.path.strip("/")  # s3 key are relative, not / at beginning
        # extract bucket name from URL (reliable?)
        pat = re.compile(r"^(.*?)\..*\.amazonaws.com")
        m = pat.match(parsed.netloc)
        if m:
            bucket_name = m.groups()[0]
        else:
            # BUG FIX: error path referenced undefined name "remote_url"
            # (NameError) instead of the actual parameter "remoteurl"
            raise DumperException(
                "Can't extract bucket name from URL '%s'" % remoteurl)
        return self.auth_download(bucket_name, key, localfile, headers)
    else:
        return self.anonymous_download(remoteurl, localfile, headers)
def versions(self):
    """
    Display all available versions.

    Example:
    [{
        'build_version': '20171003',
        'url': 'https://biothings-releases.s3.amazonaws.com:443/mygene.info/20171003.json',
        'release_date': '2017-10-06T11:58:39.749357',
        'require_version': None,
        'target_version': '20171003',
        'type': 'full'
    }, ...]
    """
    data = self.load_remote_json(self.__class__.VERSION_URL)
    if not data:
        raise DumperException("Can't find any versions available...")
    assert data["format"] == "1.0", \
        "versions.json format has changed: %s" % data["format"]
    return data["versions"]