def recreate_diskcache():
    if cache_options["CACHE_BACKEND"] != "redis":
        try:
            diskcache_cache = Cache(diskcache_location, disk_pickle_protocol=pickle_protocol)
        except DatabaseError:
            # The on-disk cache is corrupted: wipe the directory and start fresh.
            shutil.rmtree(diskcache_location, ignore_errors=True)
            os.mkdir(diskcache_location)
            diskcache_cache = Cache(diskcache_location, disk_pickle_protocol=pickle_protocol)
        diskcache_cache.clear()
        diskcache_cache.close()
def init_system(argv=None):
    services._init_db_connection()
    print('Init system collections')
    init_system_collections()
    print('Deleting old data')
    delete_processes()
    delete_bricks()
    delete_core()
    print('Uploading new data')
    upload_ontologies()
    upload_core()
    upload_bricks()
    upload_processes()
    cache = Cache(services._CACHE_DIR)
    cache.clear()
class Cache:
    def __init__(self):
        self.cache = DiskCache(CACHE_PATH, size_limit=CACHE_SIZE)

    def get(self, key):
        value = self.cache.get(key)
        if value:
            logging.debug('Hit cache key %s' % key)
        return value

    def clear(self):
        return self.cache.clear()

    def set(self, key, value):
        return self.cache.set(key, value)

    def get_or(self, key, _or):
        """Get a key's value, or use the function's return value to set it."""
        if key in self.cache:
            logging.debug('Hit cache key %s' % key)
            return self.cache[key]
        value = _or()
        self.cache.set(key, value)
        return value
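
# A minimal, hypothetical usage sketch for the Cache wrapper above (not part of
# the original module): get_or() acts as a simple memoizer, running the supplier
# function only on a cache miss. CACHE_PATH and CACHE_SIZE stand in for the
# module-level constants the class expects; the key and function are illustrative.
CACHE_PATH = "/tmp/example-cache"
CACHE_SIZE = 2 ** 30  # 1 GiB size limit

cache = Cache()

def slow_lookup():
    # Stand-in for an expensive computation or network call.
    return {"answer": 42}

# First call misses the cache and runs slow_lookup(); later calls for the same
# key return the stored value without invoking the function again.
result = cache.get_or("expensive-key", slow_lookup)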
class Cache(object):
    def __init__(self):
        try:
            self.cache = DC('./tmp')
        except Exception as ex:
            print('Got an exception while opening diskcache: {}'.format(ex))
            self.cache = None

    def __del__(self):
        try:
            self.cache.close()
        except Exception as ex:
            print('Got an exception while closing diskcache: {}'.format(ex))

    def set(self, key, value):
        if self.cache is not None:
            self.cache.set(key, BytesIO(value), read=True, tag=u'data')

    def get(self, key):
        if self.cache is not None:
            value = self.cache.get(key, default=b'', read=True, tag=True)
            if value is not None and value != b'':
                return value
        return None

    def pop(self, key):
        if self.cache is not None:
            value = self.cache.pop(key, default=b'', read=True, tag=True)
            if value is not None and value != b'':
                return value
        return None

    def delete(self, key):
        if self.cache is not None:
            self.cache.delete(key)

    def create_index(self):
        if self.cache is not None:
            self.cache.create_tag_index()
            return self.cache.tag_index
        return None

    def clear_all(self):
        if self.cache is not None:
            self.cache.clear()
class DiskCache:
    def __init__(self, cache_dir, ttl=None):
        self.ttl = ttl
        self.cache = Cache(cache_dir, eviction_policy='least-recently-used')

    def __getitem__(self, key):
        return self.cache[key]

    def __setitem__(self, key, value):
        return self.cache.set(key, value, expire=self.ttl)

    def get(self, key, default=None):
        return self.cache.get(key, default=default)

    def set(self, key, value):
        return self.cache.set(key, value, expire=self.ttl)

    def clear(self):
        self.cache.clear()
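
# A rough, hypothetical usage sketch for the DiskCache wrapper above (not part
# of the original module). Because every write goes through set(..., expire=ttl),
# entries expire automatically; the path and TTL value here are illustrative.
cache = DiskCache("/tmp/example-cache", ttl=60)  # entries live for 60 seconds

cache["token"] = "abc123"                 # stored with a 60 s expiry
print(cache.get("token"))                 # -> "abc123" while still fresh
# Once the TTL has elapsed, diskcache treats the entry as expired and
# get() falls back to the default:
print(cache.get("token", default=None))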
def main(source, clear_cache):
    comi_dir_path = '/tmp/comi'
    comi_dir_path_object = Path(comi_dir_path)
    comi_dir_path_object.mkdir(exist_ok=True)
    cache_dir_path = '/tmp/comi/cache'
    cache = Cache(cache_dir_path)

    if clear_cache:
        cache.clear()
        print('Cache cleared')
        return True

    if source:
        if 'github.com' in source and 'blob' in source:
            url = source.replace('blob', 'raw')
        else:
            url = source
    else:
        url = 'https://github.com/commmands/commands/raw/master/commands_1.commands'

    cache_value = cache.get(url)
    temp_commands_path = '/tmp/comi/temp_commands'
    temp_commands_path_object = Path(temp_commands_path)
    temp_commands_path_object.parent.mkdir(parents=True, exist_ok=True)

    if cache_value:
        temp_commands_path_object.write_text(cache_value)
    else:
        commands_response = requests.get(url)
        commands = commands_response.text
        temp_commands_path_object.write_text(commands)
        cache.set(url, commands)

    perl_part = "perl -e 'ioctl STDOUT, 0x5412, $_ for split //, do{ chomp($_ = <>); $_ }'"
    command = f"cat {temp_commands_path} | fzf --tac | {perl_part} ; echo"
    subprocess.call(command, shell=True)
def scrape_part_data(pool_size):
    supported_parts = {
        "cpu", "cpu-cooler", "motherboard", "memory", "internal-hard-drive",
        "video-card", "power-supply", "case", "case-fan", "fan-controller",
        "thermal-paste", "optical-drive", "sound-card", "wired-network-card",
        "wireless-network-card", "monitor", "external-hard-drive", "headphones",
        "keyboard", "mouse", "speakers", "ups"
    }

    supported_regions = {
        "au", "be", "ca", "de", "es", "fr", "se", "in", "ie", "it", "nz", "uk", "us"
    }

    cache = Cache("/tmp/pcpartpicker-cache/")

    if "timestamp" in cache:
        timestamp = cache["timestamp"]
        if datetime.now().month > timestamp.month:
            cache.clear()
            cache["timestamp"] = datetime.now()
            print("Clearing cache...")
    else:
        cache.clear()
        cache["timestamp"] = datetime.now()
        print("Clearing cache...")

    for region in supported_regions:
        if region not in cache:
            cache[region] = {}

    to_scrape = list(itertools.product(supported_parts, supported_regions))
    total_to_scrape = len(to_scrape)
    to_scrape = list(filter(lambda x: x[0] not in cache[x[1]], to_scrape))

    pool = Pool(pool_size)
    print(
        f"About to scrape {len(to_scrape)}/{total_to_scrape} part+region combos "
        f"that are not cached using {pool_size} concurrent requests"
    )
    pool.map(scrape_part_region_combo, to_scrape)
def check(self, dir, args):
    warnings = []
    log.info("Running URL checks (CheckURLs)")
    assert 'URLs' in __main__.remoteCheckList
    if 'URLs' in args.disableChecksRemote:
        return warnings

    cache_dir = 'data-check-cache/URLs'
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    global cache
    cache = Cache(cache_dir)
    if 'URLs' in args.purgeCaches:
        cache.clear()

    log.info("Testing biobank URLs")
    for biobank in dir.getBiobanks():
        if 'url' not in biobank or re.search(r'^\s*$', biobank['url']):
            warnings.append(
                DataCheckWarning(self.__class__.__name__, "", dir.getBiobankNN(biobank['id']),
                                 DataCheckWarningLevel.WARNING, biobank['id'],
                                 DataCheckEntityType.BIOBANK, "Missing URL"))
        else:
            URLwarnings = testURL(
                biobank['url'],
                DataCheckWarning(self.__class__.__name__, "", dir.getBiobankNN(biobank['id']),
                                 DataCheckWarningLevel.ERROR, biobank['id'],
                                 DataCheckEntityType.BIOBANK, "Biobank URL"))
            warnings += URLwarnings

    log.info("Testing collection URLs")
    for collection in dir.getCollections():
        # Non-existence of access URIs is tested in the access policy checks -
        # here we only check validity of the URL if it exists.
        if 'data_access_uri' in collection and not re.search(r'^\s*$', collection['data_access_uri']):
            URLwarnings = testURL(
                collection['data_access_uri'],
                DataCheckWarning(self.__class__.__name__, "", dir.getCollectionNN(collection['id']),
                                 DataCheckWarningLevel.ERROR, collection['id'],
                                 DataCheckEntityType.COLLECTION, "Data access URL for collection"))
            warnings += URLwarnings
        if 'sample_access_uri' in collection and not re.search(r'^\s*$', collection['sample_access_uri']):
            URLwarnings = testURL(
                collection['sample_access_uri'],
                DataCheckWarning(self.__class__.__name__, "", dir.getCollectionNN(collection['id']),
                                 DataCheckWarningLevel.ERROR, collection['id'],
                                 DataCheckEntityType.COLLECTION, "Sample access URL for collection"))
            warnings += URLwarnings
        if 'image_access_uri' in collection and not re.search(r'^\s*$', collection['image_access_uri']):
            URLwarnings = testURL(
                collection['image_access_uri'],
                DataCheckWarning(self.__class__.__name__, "", dir.getCollectionNN(collection['id']),
                                 DataCheckWarningLevel.ERROR, collection['id'],
                                 DataCheckEntityType.COLLECTION, "Image access URL for collection"))
            warnings += URLwarnings

    cache.close()
    return warnings
class CAgent:
    def __init__(self, name, oDir: CDirectoryConfig, oConfigByYaml: CConfigByYaml,
                 connectKnowlegeServer=False):
        self.name = name
        self.crawlerManager: CCrawlerManager = None
        self.storageManager: CStorage = None
        self.knowledgeManagerClient: CKnowledgeClient = None
        self.oDir: CDirectoryConfig = oDir
        self.oConf = oConfigByYaml
        self.oLog = CLog(oDir['Log'], self.name + '_log')
        self.dbWeb = ''
        self.cacheAgent = Cache(oDir['cacheAgentFolder'])
        self.cacheCrawler = Cache(oDir['cacheCrawlerFolder'])
        self.flagConnectKnowlegeServer = connectKnowlegeServer
        fKeyboardInterruptRegistrar(self._callbackKeyboardInterrupt)
        self.flagUserClose = False
        # fKeyboardInterruptRegistrar._register['test'] = self._callbackKeyboardInterrupt

    def _configStorage(self, mode='mongoDB'):
        oSubConfig = self.oConf['Storage']
        self.dbWeb = oSubConfig['dbWeb']
        if oSubConfig.get('mode') is not None:
            mode = oSubConfig['mode']
        path = self.dbWeb
        if mode == 'mongoDB':
            self.storageManager = CStorageMongoDB(self.name, path)

    def _configCrawler(self):
        self.crawlerManager = CCrawlerManager(self.name, self.oDir['crawlerCWD'], self.oLog,
                                              self.oDir['cacheCrawlerFolder'],
                                              self.oDir['cacheAgentFolder'])

    def _configKnowledgeManager(self):
        oSubConfig = self.oConf['KnowledgeManager']
        addressTuple = (oSubConfig['address'], oSubConfig['port'])
        key = oSubConfig['password']
        key = bytes(key, 'utf-8')
        print(key)
        self.knowledgeManagerClient = CKnowledgeClient(addressTuple, key, self.oLog)
        if self.flagConnectKnowlegeServer:
            err = self.knowledgeManagerClient.connect()
            if err == False:
                raise ValueError("KnowledgeManager connection failed")

    def configAll(self):
        self._configCrawler()
        self.oLog.safeRecordTime('CrawlerManager conf finished')
        self._configKnowledgeManager()
        self.oLog.safeRecordTime('KnowledgeManager conf finished')
        self._configStorage()
        self.oLog.safeRecordTime('StorageManager conf finished')

    def startCrawling(self, jobsList: list):
        return self.crawlerManager.engineStart(jobsList)

    def fetchResult(self, handler, subProcHandle, timeWaitStep=1, maxWaitTimes=5):
        # total continuous wait time will be (timeWaitStep * maxWaitTimes)
        result = ''
        cnt = 0
        global WRITE_TO_STORAGE_FLAG
        WRITE_TO_STORAGE_FLAG = True
        while True:
            _, result = self.cacheAgent.pull()
            if result is not None:
                result = json.loads(result)
                ans = handler(result['type'], result['content'])
                # print(ans)
                for temp in ans:
                    self.storageManager.storeData(temp[0], temp[1], temp[2])
                # break
                cnt = 0  # clear counter
            elif timeWaitStep * maxWaitTimes > 0:
                if cnt >= maxWaitTimes:  # continuous wait time reached maxWaitTimes
                    WRITE_TO_STORAGE_FLAG = False
                    return False
                elif subProcHandle.poll() is not None:  # the subprocess has finished
                    WRITE_TO_STORAGE_FLAG = False
                    return subProcHandle.poll()
                else:
                    time.sleep(timeWaitStep)
                    cnt += 1  # counter adds one
            else:
                WRITE_TO_STORAGE_FLAG = False
                raise ValueError("timeWaitStep * maxWaitTimes should be bigger than 0")

    def clearCache(self):
        self.cacheAgent.clear()
        self.cacheCrawler.clear()

    def closeCache(self):
        self.cacheAgent.close()
        self.cacheCrawler.close()
        self.crawlerManager.closeCache()

    def _callbackKeyboardInterrupt(self, *args, **kwargs):
        global WRITE_TO_STORAGE_FLAG
        self.flagUserClose = True
        if WRITE_TO_STORAGE_FLAG is True:
            numRemainedMsg = len(self.cacheAgent)
            MSG = "Agent is fetching the result to the Storage," + \
                  " number of remained items: " + str(numRemainedMsg) + \
                  ", will close later."
            return False, MSG
        else:
            return True, ''

    def test(self):
        # code for testing keyboard interruption handling
        global WRITE_TO_STORAGE_FLAG
        WRITE_TO_STORAGE_FLAG = True
        for i in range(1000):
            time.sleep(0.01)
        WRITE_TO_STORAGE_FLAG = False
        # print('Press Ctrl+C')
        # for x in range(1, 100):
        #     time.sleep(0.2)
        #     print(x)

    def close(self):
        self.knowledgeManagerClient.close()
        self.closeCache()
def check(self, dir, args): warnings = [] log.info("Running geographical location checks (BiobankGeo)") # This is to be enabled for real runs. assert 'geocoding' in __main__.remoteCheckList if 'geocoding' in args.disableChecksRemote: geoCodingEnabled = False else: geoCodingEnabled = True cache_dir = 'data-check-cache/geolocator' if not os.path.exists(cache_dir): os.makedirs(cache_dir) cache = Cache(cache_dir) if 'geocoding' in args.purgeCaches: cache.clear() geocoords_pattern = '^-?\d+\.\d+$' geolocator = Nominatim( user_agent= 'Mozilla/5.0 (X11; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0', timeout=15) for biobank in dir.getBiobanks(): if 'latitude' in biobank and not re.search( '^\s*$', biobank['latitude'] ) and 'longitude' in biobank and not re.search( '^\s*$', biobank['longitude']): # we check before doing any convenience substitutions if not re.search(geocoords_pattern, biobank['latitude']): warnings.append( DataCheckWarning( self.__class__.__name__, "", dir.getBiobankNN(biobank['id']), DataCheckWarningLevel.ERROR, biobank['id'], DataCheckEntityType.BIOBANK, "Invalid biobank latitude (should be a decimal number with period without any spaces or stray characters around - the surrounding quotes are added in this report): offending value '" + biobank['latitude'] + "'")) if not re.search(geocoords_pattern, biobank['longitude']): warnings.append( DataCheckWarning( self.__class__.__name__, "", dir.getBiobankNN(biobank['id']), DataCheckWarningLevel.ERROR, biobank['id'], DataCheckEntityType.BIOBANK, "Invalid biobank longitude (should be a decimal number with period without any spaces or stray characters around - the surrounding quotes are added in this report): offending value '" + biobank['longitude'] + "'")) # this is for convenience - if there are commas used instead of periods, we should still do the remaining checks biobank['latitude'] = re.sub(r',', r'.', biobank['latitude']) biobank['longitude'] = re.sub(r',', r'.', biobank['longitude']) if re.search(geocoords_pattern, biobank['latitude']) and re.search( geocoords_pattern, biobank['longitude']): if geoCodingEnabled: logMessage = "Checking reverse geocoding for " + biobank[ 'latitude'] + ", " + biobank['longitude'] try: loc_string = biobank['latitude'] + ", " + biobank[ 'longitude'] if loc_string in cache and cache[loc_string] != "": country_code = cache[loc_string] else: location = geolocator.reverse(loc_string, language='en') country_code = location.raw['address'][ 'country_code'] cache[loc_string] = country_code logMessage += " -> OK" if ((biobank['country']['id'] != "IARC" and biobank['country']['id'] != "EU") and country_code.upper() != biobank['country']['id'] and not (country_code.upper() == "GB" and biobank['country']['id'] == "UK")): warnings.append( DataCheckWarning( self.__class__.__name__, "", dir.getBiobankNN(biobank['id']), DataCheckWarningLevel.WARNING, biobank['id'], DataCheckEntityType.BIOBANK, "Geolocation of the biobank is likely outside of its country " + biobank['country']['id'] + "; biobank seems to be in " + country_code.upper() + f" based on geographical coordinates 'latitude'={biobank['latitude']} 'longitude'={biobank['longitude']}" )) except Exception as e: logMessage += " -> failed (" + str(e) + ")" warnings.append( DataCheckWarning( self.__class__.__name__, "", dir.getBiobankNN(biobank['id']), DataCheckWarningLevel.WARNING, biobank['id'], DataCheckEntityType.BIOBANK, "Reverse geocoding of the biobank location failed (" + str(e) + ")")) log.info(logMessage) else: warnings.append( DataCheckWarning( 
self.__class__.__name__, "", dir.getBiobankNN(biobank['id']), DataCheckWarningLevel.INFO, biobank['id'], DataCheckEntityType.BIOBANK, "Missing geographical coordinates ('latitude and/or 'longitude' attributes are empty)" )) for collection in dir.getCollections(): if 'latitude' in collection and not re.search( '^\s*$', collection['latitude'] ) and 'longitude' in collection and not re.search( '^\s*$', collection['longitude']): # we check before doing any convenience substitutions if not re.search(geocoords_pattern, collection['latitude']): warnings.append( DataCheckWarning( self.__class__.__name__, "", dir.getCollectionNN(collection['id']), DataCheckWarningLevel.ERROR, collection['id'], DataCheckEntityType.COLLECTION, "Invalid collection latitude (should be a decimal number with period without any spaces or stray characters around - the surrounding quotes are added in this report): offending value '" + collection['latitude'] + "'")) if not re.search(geocoords_pattern, collection['longitude']): warnings.append( DataCheckWarning( self.__class__.__name__, "", dir.getCollectionNN(collection['id']), DataCheckWarningLevel.ERROR, collection['id'], DataCheckEntityType.COLLECTION, "Invalid collection longitude (should be a decimal number with period without any spaces or stray characters around - the surrounding quotes are added in this report): offending value '" + collection['longitude'] + "'")) # this is for convenience - if there are commas used instead of periods, we should still do the remaining checks collection['latitude'] = re.sub(r',', r'.', collection['latitude']) collection['longitude'] = re.sub(r',', r'.', collection['longitude']) if re.search(geocoords_pattern, collection['latitude']) and re.search( geocoords_pattern, collection['longitude']): if geoCodingEnabled: logMessage = "Checking reverse geocoding for " + collection[ 'latitude'] + ", " + collection['longitude'] try: loc_string = collection[ 'latitude'] + ", " + collection['longitude'] if loc_string in cache and cache[loc_string] != "": country_code = cache[loc_string] else: location = geolocator.reverse(loc_string, language='en') country_code = location.raw['address'][ 'country_code'] cache[loc_string] = country_code logMessage += " -> OK" biobankId = dir.getCollectionBiobankId( collection['id']) biobank = dir.getBiobankById(biobankId) if ((biobank['country']['id'] != "IARC" and biobank['country']['id'] != "EU") and country_code.upper() != biobank['country']['id'] and not (country_code.upper() == "GB" and biobank['country']['id'] == "UK")): warnings.append( DataCheckWarning( self.__class__.__name__, "", dir.getCollectionNN(collection['id']), DataCheckWarningLevel.WARNING, collection['id'], DataCheckEntityType.COLLECTION, "Geolocation of the collection is likely outside of its country " + collection['country']['id'] + "; collection seems to be in " + country_code.upper() + f" based on geographical coordinates 'latitude'={collection['latitude']} 'longitude'={collection['longitude']}" )) except Exception as e: logMessage += " -> failed (" + str(e) + ")" warnings.append( DataCheckWarning( self.__class__.__name__, "", dir.getCollectionNN(collection['id']), DataCheckWarningLevel.WARNING, collection['id'], DataCheckEntityType.COLLECTION, "Reverse geocoding of the collection location failed (" + str(e) + ")")) log.info(logMessage) cache.close() return warnings
class PropertyManagerFetch(Fetch_Akamai_OPENAPI_Response): forceTempCache = False @staticmethod def UseTempCache(): PropertyManagerFetch.forceTempCache = True @staticmethod def DisableTempCache(): PropertyManagerFetch.forceTempCache = False def __init__(self, tempCache=False): cacheDir = os.environ.get('AKAMAI_CLI_CACHE_PATH') cacheDirCommand = os.environ.get('AKAMAI_CLI_COMMAND') if PropertyManagerFetch.forceTempCache: self.cache = Cache() self.cache.clear() elif not tempCache and cacheDir is not None and cacheDirCommand is not None: self.cache = Cache(directory="{}/{}/PropertyManagerFetch".format( cacheDir, cacheDirCommand)) elif not tempCache: self.cache = Cache(directory="cache/PropertyManagerFetch") else: self.cache = Cache() self.cache.clear() def buildBulkSearchUrl(self, context, *, contractId=None, groupId=None): url = self.buildUrl("https://{}/papi/v1/bulk/rules-search-requests", context) queryArgs = [("contractId", contractId), ("groupdId", groupId)] url = self.appendQueryStringTupple(url, queryArgs) return url def buildGetPropertyUrl(self, context, *, propertyId=None, propertyVersion=None): uri = "/papi/v1/properties/{}/versions/{}/rules".format( propertyId, propertyVersion) url = self.buildUrl("https://{}" + uri, context) return url def buildGetPropertyDigitalPropertyUrl(self, context, *, propertyId=None, propertyVersion=None): #/papi/v1/properties/{propertyId}/versions/{propertyVersion}/hostnames{?contractId,groupId,validateHostnames} uri = "/papi/v1/properties/{}/versions/{}/hostnames".format( propertyId, propertyVersion) url = self.buildUrl("https://{}" + uri, context) queryArgs = [("validateHostnames", "false")] url = self.appendQueryStringTupple(url, queryArgs) return url def buildGetPropertyVersionMetaInfoUrl(self, context, *, propertyId=None, propertyVersion=None): #https://developer.akamai.com/api/core_features/property_manager/v1.html#api1580152614326 uri = "/papi/v1/properties/{}/versions/{}".format( propertyId, propertyVersion) url = self.buildUrl("https://{}" + uri, context) return url def bulksearch(self, edgerc=None, section=None, account_key=None, contractId=None, groupId=None, postdata=None, network=None, debug=False): factory = CredentialFactory() context = factory.load(edgerc, section, account_key) url = self.buildBulkSearchUrl(context, contractId=contractId, groupId=groupId) headers = { "Content-Type": "application/json", "Accept": "application/json, */*" } result = context.session.post(url, json=postdata, headers=headers) code, headers, json = self.handleResponseWithHeaders( result, url, debug) if code in [200] and "results" in json: print(" ... Found {} properties".format(len(json["results"])), file=sys.stderr) json = self.getMatchLocationValues(json["results"], edgerc=edgerc, account_key=account_key, network=network, debug=debug) return (code, json) elif code in [202]: locationURL = headers["Location"] result = context.session.get(locationURL) code, headers, json = self.handleResponseWithHeaders( result, url, debug) status = json["searchTargetStatus"] attempts = 0 maxAttempts = 550 while status != "COMPLETE" and attempts < maxAttempts: if status == "ERROR": print(" ... Encountered error from bulksearch endpoint", file=sys.stderr) print( " ... fatalError message from API response: {}".format( json["fatalError"]), file=sys.stderr) print(" ... Error Bulksearch Request POST body:", file=sys.stderr) print(" ... {}".format(jsonlib.dumps(postdata)), file=sys.stderr) if debug: print(" ... Error bulksearch POST JSON response:", file=sys.stderr) print(" ... 
{}".format(jsonlib.dumps(json)), file=sys.stderr) if "bulkSearchId" in json and "fatalError" in json: raise ValueError( "Error bulksearch API response bulkSearchId: \"{}\" fatalError message: \"{}\"" .format(json["bulkSearchId"], json["fatalError"])) else: raise ValueError( "Error bulksearch API response. Unknown error. No bulkSearchId and fatalError json keys" ) attempts = attempts + 1 if attempts == 1: time.sleep(3) result = context.session.get(locationURL) code, headers, json = self.handleResponseWithHeaders( result, url, debug, retry=1, context=context) status = json["searchTargetStatus"] if debug: print( " ... Waiting for search results. {} attempt {} of {} for {}" .format(status, attempts, maxAttempts, locationURL), file=sys.stderr) print(" .... got HTTP code {} with headers: {}".format( code, jsonlib.dumps(dict(headers))), file=sys.stderr) print(" .... got json: {}".format(jsonlib.dumps(json)), file=sys.stderr) else: print( " ... Waiting for search results. {} attempt {} of {}". format(status, attempts, maxAttempts), file=sys.stderr) if status != "COMPLETE": time.sleep(7) print(" ... Found {} properties".format(len(json["results"])), file=sys.stderr) if status == "COMPLETE": json = self.getMatchLocationValues(json["results"], edgerc=edgerc, account_key=account_key, network=network, debug=debug) else: raise ValueError( "Search status never encountred COMPLETE. Last Status = {}" .format(status)) return (code, json) else: return (code, json) def getMatchLocationValues(self, json, edgerc=None, account_key=None, network=None, debug=False): count = 0 if network is not None and (network.startswith("p") or network.startswith("P")): json = list( filter(lambda x: x["productionStatus"] == "ACTIVE", json)) print( " ... Limiting to production network with {} ACTIVE properties" .format(len(json)), file=sys.stderr) elif network is not None and (network.startswith("s") or network.startswith("S")): json = list(filter(lambda x: x["stagingStatus"] == "ACTIVE", json)) print(" ... Limiting to staging network with {} ACTIVE properties". format(len(json)), file=sys.stderr) else: print( " ... Warning: searching non-cacheable properties. Limit to production or staging network for faster searching", file=sys.stderr) if debug == True: print(" ... filtered json:", file=sys.stderr) printjson = jsonlib.dumps(json, indent=2) print(printjson, file=sys.stderr) jobsize = len(json) def manipulateSearchResults(matchJson, edgerc=None, account_key=None, propertyId=None, propertyVersion=None, cacheResponses=False, debug=None): (code, propertyJson) = self.fetchPropertyVersion( edgerc=edgerc, propertyId=propertyId, propertyVersion=propertyVersion, account_key=account_key, cacheResponses=cacheResponses, debug=debug) if code in [200, 202]: self.mergeVersionPointerValues(matchJson, propertyJson) print( " ..... 
with hostnames, notes, formats, product_ids, etc..", file=sys.stderr) (code, digitalPropertyJson ) = self.fetchPropertyVersionDigitalProperty( edgerc=edgerc, account_key=account_key, propertyId=propertyId, propertyVersion=propertyVersion, cacheResponses=cacheResponses, debug=debug) if code in [200]: lastModifiedTime = matchJson[ "lastModifiedTime"] if "lastModifiedTime" in matchJson else None self.mergeDigitalPropertiesValues( matchJson, digitalPropertyJson, lastModifiedTime=lastModifiedTime) (code, versionMetaJson) = self.fetchPropertyVersionMetaInfo( edgerc=edgerc, account_key=account_key, propertyId=propertyId, propertyVersion=propertyVersion, cacheResponses=cacheResponses, debug=debug) if code in [200]: self.mergeDigitalPropertiesVersionMeta( matchJson, versionMetaJson) for match in json: count = count + 1 propertyId = match["propertyId"] propertyVersion = match["propertyVersion"] propertyName = match["propertyName"] productionStatus = match["productionStatus"] stagingStatus = match["stagingStatus"] if productionStatus in [ "ACTIVE", "DEACTIVATED" ] or stagingStatus in ["ACTIVE", "DEACTIVATED"]: cacheResponses = True print( " ... Getting Immutable Property {} of {}. {} v{} production={} staging={}" .format(count, jobsize, propertyName, propertyVersion, productionStatus, stagingStatus), file=sys.stderr) manipulateSearchResults(match, edgerc=edgerc, account_key=account_key, propertyId=propertyId, propertyVersion=propertyVersion, cacheResponses=cacheResponses, debug=debug) else: cacheResponses = False print( " ... Getting property {} of {}. {} v{} production={} staging={}" .format(count, jobsize, propertyName, propertyVersion, productionStatus, stagingStatus), file=sys.stderr) manipulateSearchResults(match, edgerc=edgerc, account_key=account_key, propertyId=propertyId, propertyVersion=propertyVersion, cacheResponses=cacheResponses, debug=debug) return json def mergeVersionPointerValues(self, match, propertyJson): matchLocations = match["matchLocations"] matchResults = [] for pointer in matchLocations: subjson = self.resolvepointer(pointer, propertyJson) matchResults.append(subjson) if len(matchResults) > 0: match["matchLocationResults"] = matchResults def mergeDigitalPropertiesValues(self, searchJson, hostnameJson, lastModifiedTime=None): if len(hostnameJson) > 0: searchJson["hostnames"] = hostnameJson if lastModifiedTime is not None: days = daysSince(lastModifiedTime) searchJson["daysSinceModified"] = days def mergeDigitalPropertiesVersionMeta(self, searchJson, versionMetaJson): if len(versionMetaJson) > 0: if "propertyVersion" in versionMetaJson: del versionMetaJson["propertyVersion"] if "stagingStatus" in versionMetaJson: del versionMetaJson["stagingStatus"] if "productionStatus" in versionMetaJson: del versionMetaJson["productionStatus"] if "etag" in versionMetaJson: del versionMetaJson["etag"] if "updatedDate" in versionMetaJson: del versionMetaJson["updatedDate"] searchJson["versionInfo"] = versionMetaJson def validateResponse( self, jsonObj, account_key=None, propertyId=None, propertyVersion=None, ): if propertyId != jsonObj["propertyId"]: raise ValueError( "Unexpected API response! Expecting propertyId={} but got {}". format(propertyId, jsonObj["propertyId"])) #doesn't support hyphenated account keys as the return back different values #elif account_key is not None and account_key not in jsonObj["accountId"]: # raise ValueError("Unexpected API response! 
Expecting accountId={} but got {}.".format(account_key,jsonObj["accountId"] )) elif "propertyVersion" in jsonObj and propertyVersion != jsonObj[ "propertyVersion"]: raise ValueError( "Unexpected API response! Expecting propertyVersion={} but got {}." .format(propertyVersion, jsonObj["propertyVersion"])) elif "versions" in jsonObj and "items" in jsonObj["versions"] and (len( jsonObj["versions"]["items"]) == 1): versionItem = jsonObj["versions"]["items"][0] if "propertyVersion" in versionItem and propertyVersion != versionItem[ "propertyVersion"]: pass else: pass def fetchPropertyVersionMetaInfo(self, edgerc=None, section=None, account_key=None, propertyId=None, propertyVersion=None, cacheResponses=False, debug=False): factory = CredentialFactory() context = factory.load(edgerc, section, account_key) url = self.buildGetPropertyVersionMetaInfoUrl( context, propertyId=propertyId, propertyVersion=propertyVersion) headers = { "Content-Type": "application/json", "Accept": "application/json, */*" } bypassCache = not cacheResponses cachedHandler = CachedContextHandler(context, self.cache, debug=debug) code, jsonObj = cachedHandler.get(url, requestHeaders=headers, bypassCache=bypassCache) if code in [ 200 ] and "versions" in jsonObj and "items" in jsonObj["versions"]: self.validateResponse(jsonObj, account_key=account_key, propertyId=propertyId, propertyVersion=propertyVersion) jsonObj = jsonObj["versions"]["items"][0] return (code, jsonObj) else: return (code, jsonObj) def fetchPropertyVersionDigitalProperty(self, edgerc=None, section=None, account_key=None, propertyId=None, propertyVersion=None, cacheResponses=False, debug=False): factory = CredentialFactory() context = factory.load(edgerc, section, account_key) url = self.buildGetPropertyDigitalPropertyUrl( context, propertyId=propertyId, propertyVersion=propertyVersion) headers = { "Content-Type": "application/json", "Accept": "application/json, */*" } bypassCache = not cacheResponses cachedHandler = CachedContextHandler(context, self.cache, debug=debug) code, jsonObj = cachedHandler.get(url, requestHeaders=headers, bypassCache=bypassCache) if code in [ 200 ] and "hostnames" in jsonObj and "items" in jsonObj["hostnames"]: self.validateResponse(jsonObj, account_key=account_key, propertyId=propertyId, propertyVersion=propertyVersion) jsonObj = jsonObj["hostnames"]["items"] return (code, jsonObj) else: return (code, jsonObj) def fetchPropertyVersion(self, edgerc=None, section=None, account_key=None, propertyId=None, propertyVersion=None, cacheResponses=False, debug=False): factory = CredentialFactory() context = factory.load(edgerc, section, account_key) url = self.buildGetPropertyUrl(context, propertyId=propertyId, propertyVersion=propertyVersion) headers = { "Content-Type": "application/json", "Accept": "application/json, */*" } bypassCache = not cacheResponses cachedHandler = CachedContextHandler(context, self.cache, debug=debug) code, jsonObj = cachedHandler.get(url, requestHeaders=headers, bypassCache=bypassCache) if code in [200, 201, 202] and "rules" in jsonObj: self.validateResponse(jsonObj, account_key=account_key, propertyId=propertyId, propertyVersion=propertyVersion) return (code, jsonObj) else: return (code, jsonObj) def resolvepointer(self, pointer, doc): doc = copy.deepcopy(doc) pointerJson = jsonpointer.resolve_pointer(doc, pointer) return pointerJson
def check(self, dir, args):
    warnings = []
    log.info("Running contact fields checks (ContactFields)")
    ValidateEmails = True
    assert 'emails' in __main__.remoteCheckList
    if 'emails' in args.disableChecksRemote:
        ValidateEmails = False
    else:
        ValidateEmails = True

    cache_dir = 'data-check-cache/emails'
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    cache = Cache(cache_dir)
    if 'emails' in args.purgeCaches:
        cache.clear()

    for contact in dir.getContacts():
        if 'first_name' not in contact or re.search(r'^\s*$', contact['first_name']):
            warnings.append(
                DataCheckWarning(self.__class__.__name__, "", dir.getContactNN(contact['id']),
                                 DataCheckWarningLevel.WARNING, contact['id'], DataCheckEntityType.CONTACT,
                                 "Missing first name for contact ('first_name' attribute is empty)"))
        if 'last_name' not in contact or re.search(r'^\s*$', contact['last_name']):
            warnings.append(
                DataCheckWarning(self.__class__.__name__, "", dir.getContactNN(contact['id']),
                                 DataCheckWarningLevel.WARNING, contact['id'], DataCheckEntityType.CONTACT,
                                 "Missing last name for contact ('last_name' attribute is empty)"))
        if 'email' not in contact or re.search(r'^\s*$', contact['email']):
            warnings.append(
                DataCheckWarning(self.__class__.__name__, "", dir.getContactNN(contact['id']),
                                 DataCheckWarningLevel.ERROR, contact['id'], DataCheckEntityType.CONTACT,
                                 "Missing email for contact ('email' attribute is empty)"))
        elif not validate_email(contact['email']):
            warnings.append(
                DataCheckWarning(self.__class__.__name__, "", dir.getContactNN(contact['id']),
                                 DataCheckWarningLevel.WARNING, contact['id'], DataCheckEntityType.CONTACT,
                                 "Email for contact is invalid - offending 'email' attribute value: " + contact['email']))
        else:
            # This is a pretty dramatic test and should be used sparingly
            if ValidateEmails:
                contact_email = contact['email']
                log_message = "Validating email " + contact_email
                # XXX: does not work in most cases
                # if not validate_email(contact['email'], verify=True):
                try:
                    if contact_email in cache:
                        cache_result = cache[contact_email]
                        if cache_result['valid']:
                            log_message += " -> OK"
                        else:
                            log_message += " -> failed"
                            warnings.append(cache_result['warning'])
                    else:
                        if not validate_email(contact_email, check_mx=True):
                            log_message += " -> failed"
                            warning = DataCheckWarning(
                                self.__class__.__name__, "", dir.getContactNN(contact['id']),
                                DataCheckWarningLevel.WARNING, contact['id'], DataCheckEntityType.CONTACT,
                                "Email for contact seems to be unreachable because of missing DNS MX record")
                            warnings.append(warning)
                            cache[contact_email] = {'valid': False, 'warning': warning}
                        else:
                            log_message += " -> OK"
                            cache[contact_email] = {'valid': True, 'warning': None}
                    log.info(log_message)
                except (DNS.Base.TimeoutError, DNS.Base.ServerError, DNS.Base.SocketError) as e:
                    log_message += " -> failed with exception (" + str(e) + ")"
                    log.error(log_message)
        if 'phone' not in contact or re.search(r'^\s*$', contact['phone']):
            warnings.append(
                DataCheckWarning(self.__class__.__name__, "", dir.getContactNN(contact['id']),
                                 DataCheckWarningLevel.WARNING, contact['id'], DataCheckEntityType.CONTACT,
                                 "Missing phone for contact ('phone' attribute is empty)"))
        elif not re.search(r'^\+(?:[0-9]??){6,14}[0-9]$', contact['phone']):
            warnings.append(
                DataCheckWarning(self.__class__.__name__, "", dir.getContactNN(contact['id']),
                                 DataCheckWarningLevel.ERROR, contact['id'], DataCheckEntityType.CONTACT,
                                 "Phone number for contact does not conform to the E.123 international standard "
                                 "(means starts with + sign, no spaces) - offending phone number in 'phone' attribute: "
                                 + contact['phone']))
    return warnings
#!/usr/bin/env python3
"""Clears the cache stored in the default tmp directory."""
import tempfile

from diskcache import Cache

cache = Cache(tempfile.gettempdir())
cache.clear()
class FileDirCache(MutableMapping):
    def __init__(
        self,
        use_listings_cache=True,
        listings_expiry_time=None,
        listings_cache_location=None,
        **kwargs,
    ):
        """
        Parameters
        ----------
        use_listings_cache: bool
            If False, this cache never returns items, but always reports KeyError,
            and setting items has no effect
        listings_expiry_time: int or float (optional)
            Time in seconds that a listing is considered valid. If None,
            listings do not expire.
        listings_cache_location: str (optional)
            Directory path at which the listings cache file is stored. If None,
            an autogenerated path at the user folder is created.
        """
        import appdirs
        from diskcache import Cache

        listings_expiry_time = listings_expiry_time and float(listings_expiry_time)

        if listings_cache_location:
            listings_cache_location = Path(listings_cache_location) / str(listings_expiry_time)
            listings_cache_location.mkdir(exist_ok=True, parents=True)
        else:
            listings_cache_location = Path(
                appdirs.user_cache_dir(appname="fsspec_dircache")
            ) / str(listings_expiry_time)

        try:
            listings_cache_location.mkdir(exist_ok=True, parents=True)
        except Exception:
            logger.error(f"folder for dircache could not be created at {listings_cache_location}")

        self.cache_location = listings_cache_location
        self._cache = Cache(directory=listings_cache_location)
        self.use_listings_cache = use_listings_cache
        self.listings_expiry_time = listings_expiry_time

    def __getitem__(self, item):
        """Draw item as fileobject from cache, retry if timeout occurs"""
        return self._cache.get(key=item, read=True, retry=True)

    def clear(self):
        self._cache.clear()

    def __len__(self):
        return len(list(self._cache.iterkeys()))

    def __contains__(self, item):
        value = self._cache.get(item, retry=True)  # None, if expired
        if value:
            return True
        return False

    def __setitem__(self, key, value):
        if not self.use_listings_cache:
            return
        self._cache.set(key=key, value=value, expire=self.listings_expiry_time, retry=True)

    def __delitem__(self, key):
        del self._cache[key]

    def __iter__(self):
        return (k for k in self._cache.iterkeys() if k in self)

    def __reduce__(self):
        return (
            FileDirCache,
            (self.use_listings_cache, self.listings_expiry_time, self.cache_location),
        )
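
# A hypothetical sketch of how FileDirCache behaves as a mapping (not part of
# the original module). The directory path, key, and listing values are
# illustrative; listings_expiry_time governs how long a stored listing stays visible.
dircache = FileDirCache(
    use_listings_cache=True,
    listings_expiry_time=300,                 # listings considered valid for 5 minutes
    listings_cache_location="/tmp/fsspec-listings",
)

dircache["s3://bucket/prefix"] = ["a.csv", "b.csv"]   # persisted via diskcache with expiry
print("s3://bucket/prefix" in dircache)               # True until the entry expires
print(len(dircache))                                  # number of currently live keys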
class VocabAPI: s = Session() BASE_URL = "https://www.vocabulary.com" API_BASE_URL = 'https://api.vocab.com/1.0' PLAY_URL = BASE_URL + "/play" START_URL = BASE_URL + "/challenge/start.json" NEXT_URL = BASE_URL + "/challenge/nextquestion.json" HINT_URL = BASE_URL + "/challenge/hint.json" SAVE_ANSWER_URL = BASE_URL + "/challenge/saveanswer.json" ME_URL = BASE_URL + "/auth/me.json" LOGIN_URL = BASE_URL + "/login/" SET_PRIORITY_URL = BASE_URL + "/progress/setpriority.json" AUTO_COMPLETE_URL = BASE_URL + "/dictionary/autocomplete?search=" APL_WORD_PROGRESS_URL = API_BASE_URL + "/progress/words" API_AUTH_TOKEN_URL = API_BASE_URL + "/auth/token" def __init__(self): super(VocabAPI, self).__init__() self._access_token = '' self._me_info = None # type: MeRsp self._is_logged_in = None self.session_pool = {} self.cache = Cache(CACHE_DIR) if self.cache.get("cookies"): self.s.cookies = self.cache['cookies'] self.s.headers.update({ "authority": "www.vocabulary.com", "accept": "application/json, text/javascript, */*; q=0.01", "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) " "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36", "content-type": "application/x-www-form-urlencoded; charset=UTF-8", "accept-encoding": "gzip, deflate, br", "accept-language": "zh-CN,zh;q=0.9", "origin": self.BASE_URL, "referer": self.BASE_URL, "keep-alive": "true" }) def clear_cache(self): self.cache.clear() @property def access_token(self): if not self._access_token: self._access_token = self.refresh_token().access_token return self._access_token @property def auth_header(self) -> dict: if not self.access_token: return {} return {'authorization': f'Bearer {self.access_token}'} @property def is_logged_in(self) -> bool: logging.debug("Check logging status ..") if self._is_logged_in is None: if not self.cache.get("cookies"): logging.debug(f"logged in: {False}") return False try: self.get( "https://www.vocabulary.com/account/activities.json?limit=1" ).json self._is_logged_in = True except json.JSONDecodeError: self._is_logged_in = False logging.debug(f"logged in: {self._is_logged_in}") return self._is_logged_in def login(self, user_name: str, password: str, auto_login: bool) -> str: error_msg = '' if not self.is_logged_in: # login procedure login_data = { 'username': user_name, 'password': password, '.cb-autoLogon': int(auto_login), 'autoLogon': auto_login } login_bs = self.post(self.LOGIN_URL, data=login_data).bs error_tag = login_bs.find(class_='errors') if error_tag: error_msg = error_tag.find(class_='msg').text else: self.cache.set('cookies', self.s.cookies) self._is_logged_in = bool(user_name and password and not error_msg) return error_msg def refresh_token(self) -> Box: """ This function refreshes access auth token important: will affect attribute `auth_header` and `access_token` :return: """ logging.debug("Refreshing access auth ... 
") auth_box = self.post(self.API_AUTH_TOKEN_URL, data={ "refresh_token": self.s.cookies.get("guid") }).box logging.debug("New access auth:", auth_box) self._access_token = auth_box.access_token return auth_box def get_autocomplete_list(self, word: str) -> List[AutoCompleteItem]: rsp_bs = self.get(self.AUTO_COMPLETE_URL + word).bs r_list = [] for li in rsp_bs.select("li"): freq = li.get('freq', 0) if freq == '∞': freq = 0 auto_item = AutoCompleteItem( word=li['word'], short_def=str(li.select(".definition")[0].string), freq=float(freq)) r_list.append(auto_item) return r_list def get_word_progress(self, word: str) -> Union[WordProgressRsp, None]: word_prg_box = self.get(self.APL_WORD_PROGRESS_URL + f"/{word}", ensure_auth=True).box progress = None if word_prg_box.get('progress'): progress = WordProgress( progress=word_prg_box.progress.progress, played_at=word_prg_box.progress.get('played_at', ''), scheduled_at=word_prg_box.progress.get('scheduled_at', ''), play_count=word_prg_box.progress.play_count, correct_count=word_prg_box.progress.correct_count, incorrect_count=word_prg_box.progress.incorrect_count, value=word_prg_box.progress.value, priority=word_prg_box.progress.priority) try: return WordProgressRsp( word=word_prg_box.word, sense=WordSense( id=word_prg_box.sense.id, part_of_speech=word_prg_box.sense.part_of_speech, audio=self.get_first_audio_url(word_prg_box.sense), definition=word_prg_box.sense.definition, ordinal=word_prg_box.sense.ordinal), progress=progress, pkv=word_prg_box.get('pkv', None), learnable=word_prg_box.learnable) except BoxKeyError: # fixme box.BoxKeyError: "'Box' object has no attribute 'word'" return None def set_word_priority(self, word: str, priority: EnumLearningPriority) -> bool: rsp = self.s.post(self.SET_PRIORITY_URL, { 'word': word, "priority": priority.value }) return rsp.status_code == 200 @staticmethod def get_first_audio_url(sense: Box) -> str: if sense.audio: return f"https://audio.vocab.com/1.0/us/{sense.audio[0]}.mp3" return '' @property def meInfo(self) -> MeRsp: if not self._me_info: rsp_box = self.get(self.ME_URL).box if rsp_box.auth.loggedin: self._me_info = MeRsp( validUser=rsp_box.validUser, guid=rsp_box.guid, auth=ChallengeAuth(loggedin=rsp_box.auth.loggedin, uid=rsp_box.auth.uid, nickname=rsp_box.auth.nickname, fullname=rsp_box.auth.fullname, email=rsp_box.auth.email), perms=dict(rsp_box.perms), points=rsp_box.points, level=ChallengeLevel(id=int( rsp_box.level.id.replace("L", "")), name=rsp_box.level.name), ima=rsp_box.ima, paid=rsp_box.paid) else: self._me_info = ChallengeAuth(loggedin=False) return self._me_info def _get_my_lists(self, my_list_type: str = ""): if not self.is_logged_in: return [] url = self.BASE_URL + "/account/lists/" + my_list_type bs = self.get(url).bs list_table_tag = bs.find(class_='list-list') if not list_table_tag: return [] lists = [] for tr in list_table_tag.select("tr"): t = [i for i in tr.contents if isinstance(i, Tag)][0].contents[1] list_id = int(re.search("\d+", t['href']).group(0)) name = t.contents[0].strip() created_string, total_words = t.span.text.split("(") created_string = created_string.strip() total_words = int( total_words.split(")")[0].strip().split("words")[0].strip()) created_date = datetime.strptime( created_string, '%B %d, %Y', ).date() lists.append( UserWordlist(listId=list_id, wordcount=total_words, name=name, created=created_date)) return lists def get_my_list_detail(self, listid: int) -> UserWordlistDetail: logging.info(f"Getting my word list details: {listid}") url = self.API_BASE_URL + 
f"/progress/lists/{listid}" rsp_box = self.get(url, ensure_auth=True).box uwld = UserWordlistDetail( starred=rsp_box.starred, word_count=rsp_box.word_count, learnable_word_count=rsp_box.learnable_word_count, learning_progress=UserWordlistLeaningProgress( active=rsp_box.learning_progress.active, progress=rsp_box.learning_progress.progress, mastered_word_count=rsp_box.learning_progress. mastered_word_count)) logging.info(f"Word list got: ", uwld) return uwld @property def my_lists_all(self) -> List[UserWordlist]: return self._get_my_lists() @property def my_lists_created(self) -> List[UserWordlist]: return self._get_my_lists("created") @property def my_lists_shared(self) -> List[UserWordlist]: return self._get_my_lists("shared") @property def my_lists_learning(self) -> List[UserWordlist]: return self._get_my_lists("learning") # pure # http://app.vocabulary.com/app/1.0/dictionary/search?word=sun def get_word_def(self, word: str) -> WordDef: cache_key = f"word_def: {word}" if not self.cache.get(cache_key): # def_url = self.BASE_URL + f"/dictionary/{word}" def_url = f"http://app.vocabulary.com/app/1.0/dictionary/search?word={word}" rsp_bs = self.get(def_url).bs # get response word word_ = '' word_tag = rsp_bs.find(class_='dynamictext') if word_tag: word_ = word_tag.contents[0].__str__() # get audio url audio_tag = rsp_bs.find(class_='audio') if audio_tag: audio_url = self.get_first_audio_url( Box(audio=[ audio_tag['data-audio'], ])) else: audio_url = '' # get word short/long blurb blurb_tag = rsp_bs.find(class_='blurb') if blurb_tag: short_tag = blurb_tag.find('p', class_='short') long_tag = blurb_tag.find('p', class_='long') short_blurb_txt = "".join( [i.__str__() for i in short_tag.contents]) long_blurb_txt = "".join( [i.__str__() for i in long_tag.contents]) else: short_blurb_txt, long_blurb_txt = '', '' def_groups = [] for group_tag in rsp_bs.select(".group"): ordinals = [] for ordinal_tag in group_tag.select(".ordinal"): senses = [] for sense_tag in ordinal_tag.select(".sense"): def_example = '' def_content_tag = sense_tag.find(class_='defContent') if def_content_tag: def_example_tag = sense_tag.find(class_='example') if def_example_tag: def_example = " ".join([ re.sub(r"\s+", " ", str(i)) for i in def_example_tag.contents ]) def_tag = ordinal_tag.find(class_="definition") pos_full = def_tag.a['title'] pos_short = def_tag.contents[1].text.strip() def_txt = def_tag.contents[2].strip() senses.append( WordDefSense(pos_short=pos_short, pos_long=pos_full, def_=def_txt, example=def_example)) ordinals.append(senses) def_groups.append(ordinals) self.cache.set( cache_key, WordDef(word_, audio_url, AnswerBlurb(short_blurb_txt, long_blurb_txt), def_groups)) return self.cache.get(cache_key) def post(self, url: str, ensure_auth: bool = False, **kwargs) -> Rsp: return Rsp(self._request("POST", url, ensure_auth, **kwargs)) def get(self, url: str, ensure_auth: bool = False, **kwargs) -> Rsp: return Rsp(self._request("GET", url, ensure_auth, **kwargs)) def get_session(self, val: str): domain = urlparse(val).netloc.lower().strip() if val in self.session_pool: s = Session() self.session_pool[domain] = s else: s = self.session_pool[domain] return s def _request(self, method: str, url, ensure_auth, **kwargs): param_headers = kwargs.get("headers", {}) if ensure_auth: param_headers.update(self.auth_header) rqst = Request(method, url, headers=param_headers, **kwargs) if ensure_auth: rqst.register_hook("response", partial(self._handle_401, method, url, kwargs)) rsp = self.s.send(self.s.prepare_request(rqst)) 
rsp.raise_for_status() if rsp.status_code != 200: raise APIGetError(url) return rsp.content.decode() def _handle_401(self, method: str, url: str, rqst_kwargs: Dict, rsp: Response, **kwargs): if rsp.status_code != 401: return rsp self.refresh_token() param_headers = rqst_kwargs.get("headers", {}) param_headers.update(self.auth_header) rqst = Request(method, url, headers=param_headers, **rqst_kwargs) return self.s.send(self.s.prepare_request(rqst))
def __init__(self, package='eu_bbmri_eric', purgeCaches=[], debug=False, pp=None, username=None, password=None): self.__pp = pp self.__package = package log.debug('Checking data in package: ' + package) cache_dir = 'data-check-cache/directory' if not os.path.exists(cache_dir): os.makedirs(cache_dir) cache = Cache(cache_dir) if 'directory' in purgeCaches: cache.clear() self.__directoryURL = "https://directory.bbmri-eric.eu/api/" log.info('Retrieving directory content from ' + self.__directoryURL) session = molgenis.client.Session(self.__directoryURL) if username is not None and password is not None: log.info("Logging in to MOLGENIS with a user account.") log.debug('username: '******'password: '******' ... retrieving biobanks') if 'biobanks' in cache: self.biobanks = cache['biobanks'] else: start_time = time.perf_counter() # TODO: remove exception handling once BBMRI.uk staging has been fixed try: self.biobanks = session.get( self.__package + "_biobanks", expand='contact,collections,country,covid19biobank') except: log.warning( "Using work-around for inconsistence in the database structure." ) self.biobanks = session.get( self.__package + "_biobanks", expand='contact,collections,country,COVID_19') cache['biobanks'] = self.biobanks end_time = time.perf_counter() log.info(' ... retrieved biobanks in ' + "%0.3f" % (end_time - start_time) + 's') log.info(' ... retrieving collections') if 'collections' in cache: self.collections = cache['collections'] else: start_time = time.perf_counter() self.collections = session.get( self.__package + "_collections", expand= 'biobank,contact,network,parent_collection,sub_collections,type,materials,order_of_magnitude,data_categories,diagnosis_available,imaging_modality,image_dataset_type' ) #self.collections = session.get(self.__package + "_collections", num=2000, expand=[]) cache['collections'] = self.collections end_time = time.perf_counter() if debug and self.__pp is not None: for c in self.collections: pp.pprint(c) log.info(' ... retrieved collections in ' + "%0.3f" % (end_time - start_time) + 's') log.info(' ... retrieving contacts') if 'contacts' in cache: self.contacts = cache['contacts'] else: start_time = time.perf_counter() self.contacts = session.get(self.__package + "_persons", num=2000, expand='biobanks,collections,country') cache['contacts'] = self.contacts end_time = time.perf_counter() log.info(' ... retrieved contacts in ' + "%0.3f" % (end_time - start_time) + 's') log.info(' ... retrieving networks') if 'networks' in cache: self.networks = cache['networks'] else: start_time = time.perf_counter() self.networks = session.get(self.__package + "_networks", num=2000, expand='contact') cache['networks'] = self.networks end_time = time.perf_counter() log.info(' ... retrieved networks in ' + "%0.3f" % (end_time - start_time) + 's') log.info(' ... 
all entities retrieved') self.contactHashmap = {} log.info('Processing directory data') # Graph containing only biobanks and collections self.directoryGraph = nx.DiGraph() # DAG containing only biobanks and collections self.directoryCollectionsDAG = nx.DiGraph() # Weighted graph linking contacts to biobanks/collections/networks self.contactGraph = nx.DiGraph() # Graph linking networks to biobanks/collections self.networkGraph = nx.DiGraph() for c in self.contacts: if self.contactGraph.has_node(c['id']): raise Exception( 'DirectoryStructure', 'Conflicting ID found in contactGraph: ' + c['id']) # XXX temporary hack -- adding contactID prefix #self.contactGraph.add_node(c['id'], data=c) self.contactGraph.add_node('contactID:' + c['id'], data=c) self.contactHashmap[c['id']] = c for b in self.biobanks: if self.directoryGraph.has_node(b['id']): raise Exception( 'DirectoryStructure', 'Conflicting ID found in directoryGraph: ' + b['id']) self.directoryGraph.add_node(b['id'], data=b) self.directoryCollectionsDAG.add_node(b['id'], data=b) if self.contactGraph.has_node(b['id']): raise Exception( 'DirectoryStructure', 'Conflicting ID found in contactGraph: ' + b['id']) self.contactGraph.add_node(b['id'], data=b) if self.networkGraph.has_node(b['id']): raise Exception( 'DirectoryStructure', 'Conflicting ID found in networkGraph: ' + b['id']) self.networkGraph.add_node(b['id'], data=b) for c in self.collections: if self.directoryGraph.has_node(c['id']): raise Exception('DirectoryStructure', 'Conflicting ID found: ' + c['id']) self.directoryGraph.add_node(c['id'], data=c) self.directoryCollectionsDAG.add_node(c['id'], data=c) if self.contactGraph.has_node(c['id']): raise Exception( 'DirectoryStructure', 'Conflicting ID found in contactGraph: ' + c['id']) self.contactGraph.add_node(c['id'], data=c) if self.networkGraph.has_node(c['id']): raise Exception( 'DirectoryStructure', 'Conflicting ID found in networkGraph: ' + c['id']) self.networkGraph.add_node(c['id'], data=c) for n in self.networks: if self.contactGraph.has_node(n['id']): raise Exception( 'DirectoryStructure', 'Conflicting ID found in contactGraph: ' + n['id']) self.contactGraph.add_node(n['id'], data=n) if self.networkGraph.has_node(n['id']): raise Exception( 'DirectoryStructure', 'Conflicting ID found in networkGraph: ' + n['id']) self.networkGraph.add_node(n['id'], data=n) # check forward pointers from biobanks for b in self.biobanks: for c in b['collections']: if not self.directoryGraph.has_node(c['id']): raise Exception( 'DirectoryStructure', 'Biobank refers non-existent collection ID: ' + c['id']) # add biobank contact and network edges for b in self.biobanks: if 'contact' in b: self.contactGraph.add_edge(b['id'], 'contactID:' + b['contact']['id']) if 'networks' in c: for n in c['networks']: self.networkGraph.add_edge(b['id'], n['id']) # now we have all the collections created and checked duplicates, so we create edges for c in self.collections: if 'parent_collection' in c: # some child collection self.directoryGraph.add_edge(c['id'], c['parent_collection']['id']) else: # some of root collections of a biobank # we add both edges as we can't extract this information from the biobank level (it contains pointers to all the child collections) self.directoryGraph.add_edge(c['id'], c['biobank']['id']) self.directoryGraph.add_edge(c['biobank']['id'], c['id']) self.directoryCollectionsDAG.add_edge(c['biobank']['id'], c['id']) if 'sub_collections' in c: # some of root collections of a biobank for sb in c['sub_collections']: 
self.directoryGraph.add_edge(c['id'], sb['id']) self.directoryCollectionsDAG.add_edge(c['id'], sb['id']) if 'contact' in c: self.contactGraph.add_edge(c['id'], 'contactID:' + c['contact']['id']) if 'networks' in c: for n in c['networks']: self.networkGraph.add_edge(c['id'], n['id']) # processing network edges for n in self.networks: if 'biobanks' in n: for b in n['biobanks']: self.networkGraph.add_edge(n['id'], b['id']) # TODO remove once the datamodel is fixed if 'contacts' in n: for c in n['contacts']: self.contactGraph.add_edge(n['id'], 'contactID:' + c['id']) if 'contact' in n: self.contactGraph.add_edge(n['id'], 'contactID:' + n['contact']['id']) if 'collections' in n: for c in n['collections']: self.networkGraph.add_edge(n['id'], c['id']) # processing edges from contacts for c in self.contacts: if 'biobanks' in c: for b in c['biobanks']: self.contactGraph.add_edge('contactID:' + c['id'], b['id']) if 'collections' in c: for coll in c['collections']: self.contactGraph.add_edge('contactID:' + c['id'], coll['id']) if 'networks' in c: for n in c['networks']: self.contactGraph.add_edge('contactID:' + c['id'], n['id']) # now make graphs immutable nx.freeze(self.directoryGraph) nx.freeze(self.directoryCollectionsDAG) nx.freeze(self.contactGraph) nx.freeze(self.networkGraph) log.info('Checks of directory data as graphs') # now we check if all the edges in the graph are in both directions for e in self.directoryGraph.edges(): if not self.directoryGraph.has_edge(e[1], e[0]): raise Exception( 'DirectoryStructure', 'directoryGraph: Missing edge: ' + e[1] + ' to ' + e[0]) for e in self.contactGraph.edges(): if not self.contactGraph.has_edge(e[1], e[0]): raise Exception( 'DirectoryStructure', 'contactGraph: Missing edge: ' + e[1] + ' to ' + e[0]) for e in self.networkGraph.edges(): if not self.networkGraph.has_edge(e[1], e[0]): raise Exception( 'DirectoryStructure', 'networkGraph: Missing edge: ' + e[1] + ' to ' + e[0]) # we check that DAG is indeed DAG :-) if not nx.algorithms.dag.is_directed_acyclic_graph( self.directoryCollectionsDAG): raise Exception('DirectoryStructure', 'Collection DAG is not DAG') log.info('Directory structure initialized') self.__orphacodesmapper = None
from diskcache import Cache

task_queue = Cache('/tmp/ad-poster')

# print(dir(task_queue))
print("Count:", task_queue.count)
print("Cleared:", task_queue.clear())
import gym
import numpy as np
from PIL import Image
from lshash.lshash import LSHash
from collections import deque
from random import random
from diskcache import FanoutCache, Cache

qtable = Cache('cache')
qtable.clear()

env = gym.make('Breakout-v0')
lshs = LSHash(500, 8192)

LEARNING_RATE = 0.15
DISCOUNT = 0.95
EPISODES = 25000


def preprocess(obs):
    # Downscale to 64x64, binarize, and flatten the observation.
    image = Image.fromarray(obs)
    image = image.resize((64, 64))
    image = image.convert(mode='1')
    array = np.array(image, dtype=np.uint8).flatten()
    return array


def get_action(obs_seq):
    query = lshs.query(obs_seq, num_results=1)
    if len(query) <= 0:
        lshs.index(obs_seq)
        actions = np.ones(env.action_space.n)
        qtable[obs_seq] = actions
    elif query[0][1] >= 10:
        lshs.index(obs_seq)