Example #1
def recreate_diskcache():
    if cache_options["CACHE_BACKEND"] != "redis":
        try:
            diskcache_cache = Cache(diskcache_location,
                                    disk_pickle_protocol=pickle_protocol)
        except DatabaseError:
            shutil.rmtree(diskcache_location, ignore_errors=True)
            os.mkdir(diskcache_location)
            diskcache_cache = Cache(diskcache_location,
                                    disk_pickle_protocol=pickle_protocol)
        diskcache_cache.clear()
        diskcache_cache.close()
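
A minimal driver for the helper above (a sketch, not part of the original: it
assumes diskcache is installed, that DatabaseError is sqlite3's, which
diskcache propagates when its backing SQLite file is corrupt, and uses
illustrative stand-ins for the module globals):

import os
import pickle
import shutil
from sqlite3 import DatabaseError

from diskcache import Cache

cache_options = {"CACHE_BACKEND": "diskcache"}
diskcache_location = "/tmp/example-diskcache"
pickle_protocol = pickle.HIGHEST_PROTOCOL

recreate_diskcache()  # leaves an empty, freshly initialized cache directory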
Example #2
def init_system(argv=None):
    services._init_db_connection()
    print('Init system collections')
    init_system_collections()
    print('Deleting old data')
    delete_processes()
    delete_bricks()
    delete_core()
    print('Uploading new data')
    upload_ontologies()
    upload_core()
    upload_bricks()
    upload_processes()
    cache = Cache(services._CACHE_DIR)
    cache.clear()
Example #3
class Cache:
    def __init__(self):
        self.cache = DiskCache(CACHE_PATH, size_limit=CACHE_SIZE)

    def get(self, key):
        value = self.cache.get(key)

        if value:
            logging.debug('Hit cache key %s', key)

        return value

    def clear(self):
        return self.cache.clear()

    def set(self, key, value):
        return self.cache.set(key, value)

    def get_or(self, key, _or):
        """Get a key's value, or use function's return value to set"""
        if key in self.cache:
            logging.debug('Hit cache key %s', key)
            return self.cache[key]

        value = _or()
        self.cache.set(key, value)
        return value
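
A quick usage sketch for the wrapper above, assuming diskcache is installed
and that CACHE_PATH and CACHE_SIZE are module constants (the values below are
illustrative):

import logging
from diskcache import Cache as DiskCache

CACHE_PATH = "/tmp/example-cache"
CACHE_SIZE = 2 ** 30  # 1 GiB size limit

cache = Cache()
value = cache.get_or("answer", lambda: 42)  # miss: computes and stores 42
value = cache.get_or("answer", lambda: 99)  # hit: still returns the cached 42
cache.clear()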
Example #4
class Cache(object):
    def __init__(self):
        try:
            self.cache = DC('./tmp')
        except Exception as ex:
            print('Got an exception while opening diskcache: {}'.format(ex))
            self.cache = None

    def __del__(self):
        try:
            self.cache.close()
        except Exception as ex:
            print('Got an exception while closing diskcache: {}'.format(ex))

    def set(self, key, value):
        if self.cache is not None:
            self.cache.set(key, BytesIO(value), read=True, tag=u'data')

    def get(self, key):
        if self.cache is not None:
            # with tag=True, diskcache returns a (value, tag) tuple even on
            # a miss, so unpack before testing against the default
            value, _tag = self.cache.get(key, default=None, read=True, tag=True)
            if value is not None:
                return value
        return None

    def pop(self, key):
        if self.cache is not None:
            # as with get(), tag=True yields a (value, tag) tuple
            value, _tag = self.cache.pop(key, default=None, read=True, tag=True)
            if value is not None:
                return value
        return None

    def delete(self, key):
        if self.cache is not None:
            self.cache.delete(key)

    def create_index(self):
        if self.cache is not None:
            self.cache.create_tag_index()
            return self.cache.tag_index
        return None

    def clear_all(self):
        if self.cache is not None:
            self.cache.clear()
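
A usage sketch for the tagged wrapper above (key and payload illustrative).
Because the wrapper stores and reads with read=True, get() hands back an open
file handle to the stored bytes rather than the bytes themselves:

c = Cache()
c.set("k", b"payload")  # stored from a BytesIO and tagged u'data'
handle = c.get("k")     # open file handle, courtesy of read=True
print(handle.read())    # b'payload'
c.create_index()        # the tag index enables evict(tag) on the inner cache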
Example #5
class DiskCache:
    def __init__(self, cache_dir, ttl=None):
        self.ttl = ttl
        self.cache = Cache(cache_dir, eviction_policy='least-recently-used')

    def __getitem__(self, key):
        return self.cache[key]

    def __setitem__(self, key, value):
        return self.cache.set(key, value, expire=self.ttl)

    def get(self, key, default=None):
        return self.cache.get(key, default=default)

    def set(self, key, value):
        return self.cache.set(key, value, expire=self.ttl)

    def clear(self):
        self.cache.clear()
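
A short usage sketch, assuming the Cache referenced above is
"from diskcache import Cache" (path and TTL illustrative):

dc = DiskCache("/tmp/example-ttl-cache", ttl=60)
dc["greeting"] = "hello"   # stored with expire=60 seconds
print(dc.get("greeting"))  # "hello" until the TTL lapses, then None
dc.clear()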
Example #6
def main(source, clear_cache):

    comi_dir_path = '/tmp/comi'
    comi_dir_path_object = Path(comi_dir_path)
    comi_dir_path_object.mkdir(exist_ok=True)

    cache_dir_path = '/tmp/comi/cache'
    cache = Cache(cache_dir_path)

    if clear_cache:
        cache.clear()
        print('Cache cleared')
        return True

    if source:
        if 'github.com' in source and 'blob' in source:
            url = source.replace('blob', 'raw')
        else:
            url = source
    else:
        url = 'https://github.com/commmands/commands/raw/master/commands_1.commands'

    cache_value = cache.get(url)

    temp_commands_path = '/tmp/comi/temp_commands'
    temp_commands_path_object = Path(temp_commands_path)
    temp_commands_path_object.parent.mkdir(parents=True, exist_ok=True)

    if cache_value:
        temp_commands_path_object.write_text(cache_value)
    else:
        commands_response = requests.get(url)
        commands = commands_response.text
        temp_commands_path_object.write_text(commands)
        cache.set(url, commands)

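    # 0x5412 is Linux's TIOCSTI ioctl: the perl one-liner pushes the line
    # selected in fzf into the terminal input buffer, as if it had been typed.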
    perl_part = "perl -e 'ioctl STDOUT, 0x5412, $_ for split //, do{ chomp($_ = <>); $_ }'"
    command = f"cat {temp_commands_path} | fzf --tac | {perl_part} ; echo"

    subprocess.call(command, shell=True)
Example #7
def scrape_part_data(pool_size):
    supported_parts = {
        "cpu", "cpu-cooler", "motherboard", "memory", "internal-hard-drive",
        "video-card", "power-supply", "case", "case-fan", "fan-controller",
        "thermal-paste", "optical-drive", "sound-card", "wired-network-card",
        "wireless-network-card", "monitor", "external-hard-drive",
        "headphones", "keyboard", "mouse", "speakers", "ups"
    }

    supported_regions = {
        "au", "be", "ca", "de", "es", "fr", "se", "in", "ie", "it", "nz", "uk",
        "us"
    }

    cache = Cache("/tmp/pcpartpicker-cache/")
    if "timestamp" in cache:
        timestamp = cache["timestamp"]
        if datetime.now().month > timestamp.month:
            cache.clear()
            cache["timestamp"] = datetime.now()
            print("Clearing cache...")
    else:
        cache.clear()
        cache["timestamp"] = datetime.now()
        print("Clearing cache...")

    for region in supported_regions:
        if region not in cache:
            cache[region] = {}

    to_scrape = list(itertools.product(supported_parts, supported_regions))
    total_to_scrape = len(to_scrape)
    to_scrape = list(filter(lambda x: x[0] not in cache[x[1]], to_scrape))
    pool = Pool(pool_size)
    print(
        f"About to scrape {len(to_scrape)}/{total_to_scrape} part+region combos that are not cached using {pool_size} concurrent requests"
    )
    pool.map(scrape_part_region_combo, to_scrape)
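
The month-rollover guard above as a standalone sketch (path illustrative);
comparing (year, month) tuples keeps the expiry correct across a
December-to-January boundary:

from datetime import datetime
from diskcache import Cache

cache = Cache("/tmp/example-monthly-cache")
now = datetime.now()
stamp = cache.get("timestamp")
if stamp is None or (now.year, now.month) > (stamp.year, stamp.month):
    cache.clear()
    cache["timestamp"] = now  # re-seed, since clear() also removed it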
Example #8
    def check(self, dir, args):
        warnings = []
        log.info("Running URL checks (CheckURLs)")
        assert 'URLs' in __main__.remoteCheckList
        if 'URLs' in args.disableChecksRemote:
            return warnings

        cache_dir = 'data-check-cache/URLs'
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        global cache
        cache = Cache(cache_dir)
        if 'URLs' in args.purgeCaches:
            cache.clear()

        log.info("Testing biobank URLs")
        for biobank in dir.getBiobanks():
            if 'url' not in biobank or re.search(r'^\s*$', biobank['url']):
                warnings.append(
                    DataCheckWarning(self.__class__.__name__, "",
                                     dir.getBiobankNN(biobank['id']),
                                     DataCheckWarningLevel.WARNING,
                                     biobank['id'],
                                     DataCheckEntityType.BIOBANK,
                                     "Missing URL"))
            else:
                URLwarnings = testURL(
                    biobank['url'],
                    DataCheckWarning(self.__class__.__name__, "",
                                     dir.getBiobankNN(biobank['id']),
                                     DataCheckWarningLevel.ERROR,
                                     biobank['id'],
                                     DataCheckEntityType.BIOBANK,
                                     "Biobank URL"))
                warnings += URLwarnings

        log.info("Testing collection URLs")
        for collection in dir.getCollections():
            # non-existence of access URIs is tested in the access policy checks - here we only check validity of the URL if it exists
            if 'data_access_uri' in collection and not re.search(
                    r'^\s*$', collection['data_access_uri']):
                URLwarnings = testURL(
                    collection['data_access_uri'],
                    DataCheckWarning(self.__class__.__name__, "",
                                     dir.getCollectionNN(collection['id']),
                                     DataCheckWarningLevel.ERROR,
                                     collection['id'],
                                     DataCheckEntityType.COLLECTION,
                                     "Data access URL for collection"))
                warnings += URLwarnings

            if 'sample_access_uri' in collection and not re.search(
                    r'^\s*$', collection['sample_access_uri']):
                URLwarnings = testURL(
                    collection['sample_access_uri'],
                    DataCheckWarning(self.__class__.__name__, "",
                                     dir.getCollectionNN(collection['id']),
                                     DataCheckWarningLevel.ERROR,
                                     collection['id'],
                                     DataCheckEntityType.COLLECTION,
                                     "Sample access URL for collection"))
                warnings += URLwarnings
            if 'image_access_uri' in collection and not re.search(
                    r'^\s*$', collection['image_access_uri']):
                URLwarnings = testURL(
                    collection['image_access_uri'],
                    DataCheckWarning(self.__class__.__name__, "",
                                     dir.getCollectionNN(collection['id']),
                                     DataCheckWarningLevel.ERROR,
                                     collection['id'],
                                     DataCheckEntityType.COLLECTION,
                                     "Image access URL for collection"))
                warnings += URLwarnings

        cache.close()
        return warnings
Example #9
class CAgent:
    def __init__(self,
                 name,
                 oDir: CDirectoryConfig,
                 oConfigByYaml: CConfigByYaml,
                 connectKnowlegeServer=False):
        self.name = name
        self.crawlerManager: CCrawlerManager = None
        self.storageManager: CStorage = None
        self.knowledgeManagerClient: CKnowledgeClient = None
        self.oDir: CDirectoryConfig = oDir
        self.oConf = oConfigByYaml
        self.oLog = CLog(oDir['Log'], self.name + '_log')
        self.dbWeb = ''
        self.cacheAgent = Cache(oDir['cacheAgentFolder'])
        self.cacheCrawler = Cache(oDir['cacheCrawlerFolder'])
        self.flagConnectKnowlegeServer = connectKnowlegeServer
        fKeyboardInterruptRegistrar(self._callbackKeyboardInterrupt)
        self.flagUserClose = False
#        fKeyboardInterruptRegistrar._register['test'] = self._callbackKeyboardInterrupt

    def _configStorage(self, mode='mongoDB'):
        oSubConfig = self.oConf['Storage']
        self.dbWeb = oSubConfig['dbWeb']
        if oSubConfig.get('mode') is not None:
            mode = oSubConfig['mode']
        path = self.dbWeb
        if mode == 'mongoDB':
            self.storageManager = CStorageMongoDB(self.name, path)

    def _configCrawler(self):
        self.crawlerManager = CCrawlerManager(self.name,
                                              self.oDir['crawlerCWD'],
                                              self.oLog,
                                              self.oDir['cacheCrawlerFolder'],
                                              self.oDir['cacheAgentFolder'])

    def _configKnowledgeManager(self):
        oSubConfig = self.oConf['KnowledgeManager']
        addressTuple = (oSubConfig['address'], oSubConfig['port'])
        key = oSubConfig['password']
        key = bytes(key, 'utf-8')
        print(key)
        self.knowledgeManagerClient = CKnowledgeClient(addressTuple, key,
                                                       self.oLog)
        if self.flagConnectKnowlegeServer:
            err = self.knowledgeManagerClient.connect()
            if err is False:
                raise ValueError("KnowledgeManager connection failed")

    def configAll(self):
        self._configCrawler()
        self.oLog.safeRecordTime('CrawlerManager conf finished')
        self._configKnowledgeManager()
        self.oLog.safeRecordTime('KnowledgeManager conf finished')
        self._configStorage()
        self.oLog.safeRecordTime('StorageManager conf finished')

    def startCrawling(self, jobsList: list):
        return self.crawlerManager.engineStart(jobsList)

    def fetchResult(
        self,
        handler,
        subProcHandle,
        timeWaitStep=1,
        maxWaitTimes=5
    ):  # total continuous wait time will be timeWaitStep * maxWaitTimes
        result = ''
        cnt = 0
        global WRITE_TO_STORAGE_FLAG
        WRITE_TO_STORAGE_FLAG = True
        while True:
            # diskcache's pull() pops the oldest (key, value) pair,
            # returning (None, None) when the queue is empty
            _, result = self.cacheAgent.pull()
            if result is not None:
                result = json.loads(result)
                ans = handler(result['type'], result['content'])
                for temp in ans:
                    self.storageManager.storeData(temp[0], temp[1], temp[2])
                cnt = 0  # reset the continuous-wait counter
            elif timeWaitStep * maxWaitTimes > 0:
                if cnt >= maxWaitTimes:  # continuous wait time hit the limit
                    WRITE_TO_STORAGE_FLAG = False
                    return False
                elif subProcHandle.poll() is not None:  # subprocess finished
                    WRITE_TO_STORAGE_FLAG = False
                    return subProcHandle.poll()
                else:
                    time.sleep(timeWaitStep)
                    cnt += 1
            else:
                WRITE_TO_STORAGE_FLAG = False
                raise ValueError(
                    "timeWaitStep * maxWaitTimes should be bigger than 0")

    def clearCache(self):
        self.cacheAgent.clear()
        self.cacheCrawler.clear()

    def closeCache(self):
        self.cacheAgent.close()
        self.cacheCrawler.close()
        self.crawlerManager.closeCache()

    def _callbackKeyboardInterrupt(self, *args, **kwargs):
        global WRITE_TO_STORAGE_FLAG
        self.flagUserClose = True
        if WRITE_TO_STORAGE_FLAG:
            numRemainedMsg = len(self.cacheAgent)
            MSG = ("Agent is fetching the result to the Storage, "
                   "number of remaining items: " + str(numRemainedMsg) +
                   ", will close later.")
            return False, MSG
        else:
            return True, ''

    def test(self):
        #code for testing keyboard interruption handle
        global WRITE_TO_STORAGE_FLAG
        WRITE_TO_STORAGE_FLAG = True
        for i in range(1000):
            time.sleep(0.01)
        WRITE_TO_STORAGE_FLAG = False
        #


#        print('Press Ctrl+C')
#        for x in range(1,100):
#            time.sleep(0.2)
#            print(x)

    def close(self):
        self.knowledgeManagerClient.close()
        self.closeCache()
Example #10
    def check(self, dir, args):
        warnings = []
        log.info("Running geographical location checks (BiobankGeo)")
        # This is to be enabled for real runs.
        assert 'geocoding' in __main__.remoteCheckList
        geoCodingEnabled = 'geocoding' not in args.disableChecksRemote

        cache_dir = 'data-check-cache/geolocator'
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        cache = Cache(cache_dir)
        if 'geocoding' in args.purgeCaches:
            cache.clear()

        geocoords_pattern = r'^-?\d+\.\d+$'
        geolocator = Nominatim(
            user_agent=
            'Mozilla/5.0 (X11; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0',
            timeout=15)

        for biobank in dir.getBiobanks():
            if 'latitude' in biobank and not re.search(
                    r'^\s*$', biobank['latitude']
            ) and 'longitude' in biobank and not re.search(
                    r'^\s*$', biobank['longitude']):
                # we check before doing any convenience substitutions
                if not re.search(geocoords_pattern, biobank['latitude']):
                    warnings.append(
                        DataCheckWarning(
                            self.__class__.__name__, "",
                            dir.getBiobankNN(biobank['id']),
                            DataCheckWarningLevel.ERROR, biobank['id'],
                            DataCheckEntityType.BIOBANK,
                            "Invalid biobank latitude (should be a decimal number with period without any spaces or stray characters around - the surrounding quotes are added in this report): offending value '"
                            + biobank['latitude'] + "'"))
                if not re.search(geocoords_pattern, biobank['longitude']):
                    warnings.append(
                        DataCheckWarning(
                            self.__class__.__name__, "",
                            dir.getBiobankNN(biobank['id']),
                            DataCheckWarningLevel.ERROR, biobank['id'],
                            DataCheckEntityType.BIOBANK,
                            "Invalid biobank longitude (should be a decimal number with period without any spaces or stray characters around - the surrounding quotes are added in this report): offending value '"
                            + biobank['longitude'] + "'"))
                # this is for convenience - if there are commas used instead of periods, we should still do the remaining checks
                biobank['latitude'] = re.sub(r',', r'.', biobank['latitude'])
                biobank['longitude'] = re.sub(r',', r'.', biobank['longitude'])
                if re.search(geocoords_pattern,
                             biobank['latitude']) and re.search(
                                 geocoords_pattern, biobank['longitude']):
                    if geoCodingEnabled:
                        logMessage = "Checking reverse geocoding for " + biobank[
                            'latitude'] + ", " + biobank['longitude']
                        try:
                            loc_string = biobank['latitude'] + ", " + biobank[
                                'longitude']
                            if loc_string in cache and cache[loc_string] != "":
                                country_code = cache[loc_string]
                            else:
                                location = geolocator.reverse(loc_string,
                                                              language='en')
                                country_code = location.raw['address'][
                                    'country_code']
                                cache[loc_string] = country_code
                            logMessage += " -> OK"
                            if ((biobank['country']['id'] != "IARC"
                                 and biobank['country']['id'] != "EU")
                                    and country_code.upper() !=
                                    biobank['country']['id'] and
                                    not (country_code.upper() == "GB" and
                                         biobank['country']['id'] == "UK")):
                                warnings.append(
                                    DataCheckWarning(
                                        self.__class__.__name__, "",
                                        dir.getBiobankNN(biobank['id']),
                                        DataCheckWarningLevel.WARNING,
                                        biobank['id'],
                                        DataCheckEntityType.BIOBANK,
                                        "Geolocation of the biobank is likely outside of its country "
                                        + biobank['country']['id'] +
                                        "; biobank seems to be in " +
                                        country_code.upper() +
                                        f" based on geographical coordinates 'latitude'={biobank['latitude']} 'longitude'={biobank['longitude']}"
                                    ))
                        except Exception as e:
                            logMessage += " -> failed (" + str(e) + ")"
                            warnings.append(
                                DataCheckWarning(
                                    self.__class__.__name__, "",
                                    dir.getBiobankNN(biobank['id']),
                                    DataCheckWarningLevel.WARNING,
                                    biobank['id'], DataCheckEntityType.BIOBANK,
                                    "Reverse geocoding of the biobank  location failed ("
                                    + str(e) + ")"))
                        log.info(logMessage)
            else:
                warnings.append(
                    DataCheckWarning(
                        self.__class__.__name__, "",
                        dir.getBiobankNN(biobank['id']),
                        DataCheckWarningLevel.INFO, biobank['id'],
                        DataCheckEntityType.BIOBANK,
                        "Missing geographical coordinates ('latitude and/or 'longitude' attributes are empty)"
                    ))

        for collection in dir.getCollections():
            if 'latitude' in collection and not re.search(
                    r'^\s*$', collection['latitude']
            ) and 'longitude' in collection and not re.search(
                    r'^\s*$', collection['longitude']):
                # we check before doing any convenience substitutions
                if not re.search(geocoords_pattern, collection['latitude']):
                    warnings.append(
                        DataCheckWarning(
                            self.__class__.__name__, "",
                            dir.getCollectionNN(collection['id']),
                            DataCheckWarningLevel.ERROR, collection['id'],
                            DataCheckEntityType.COLLECTION,
                            "Invalid collection latitude (should be a decimal number with period without any spaces or stray characters around - the surrounding quotes are added in this report): offending value '"
                            + collection['latitude'] + "'"))
                if not re.search(geocoords_pattern, collection['longitude']):
                    warnings.append(
                        DataCheckWarning(
                            self.__class__.__name__, "",
                            dir.getCollectionNN(collection['id']),
                            DataCheckWarningLevel.ERROR, collection['id'],
                            DataCheckEntityType.COLLECTION,
                            "Invalid collection longitude (should be a decimal number with period without any spaces or stray characters around - the surrounding quotes are added in this report): offending value '"
                            + collection['longitude'] + "'"))
                # this is for convenience - if there are commas used instead of periods, we should still do the remaining checks
                collection['latitude'] = re.sub(r',', r'.',
                                                collection['latitude'])
                collection['longitude'] = re.sub(r',', r'.',
                                                 collection['longitude'])
                if re.search(geocoords_pattern,
                             collection['latitude']) and re.search(
                                 geocoords_pattern, collection['longitude']):
                    if geoCodingEnabled:
                        logMessage = "Checking reverse geocoding for " + collection[
                            'latitude'] + ", " + collection['longitude']
                        try:
                            loc_string = collection[
                                'latitude'] + ", " + collection['longitude']
                            if loc_string in cache and cache[loc_string] != "":
                                country_code = cache[loc_string]
                            else:
                                location = geolocator.reverse(loc_string,
                                                              language='en')
                                country_code = location.raw['address'][
                                    'country_code']
                                cache[loc_string] = country_code
                            logMessage += " -> OK"
                            biobankId = dir.getCollectionBiobankId(
                                collection['id'])
                            biobank = dir.getBiobankById(biobankId)
                            if ((biobank['country']['id'] != "IARC"
                                 and biobank['country']['id'] != "EU")
                                    and country_code.upper() !=
                                    biobank['country']['id'] and
                                    not (country_code.upper() == "GB" and
                                         biobank['country']['id'] == "UK")):
                                warnings.append(
                                    DataCheckWarning(
                                        self.__class__.__name__, "",
                                        dir.getCollectionNN(collection['id']),
                                        DataCheckWarningLevel.WARNING,
                                        collection['id'],
                                        DataCheckEntityType.COLLECTION,
                                        "Geolocation of the collection is likely outside of its country "
                                        + collection['country']['id'] +
                                        "; collection seems to be in " +
                                        country_code.upper() +
                                        f" based on geographical coordinates 'latitude'={collection['latitude']} 'longitude'={collection['longitude']}"
                                    ))
                        except Exception as e:
                            logMessage += " -> failed (" + str(e) + ")"
                            warnings.append(
                                DataCheckWarning(
                                    self.__class__.__name__, "",
                                    dir.getCollectionNN(collection['id']),
                                    DataCheckWarningLevel.WARNING,
                                    collection['id'],
                                    DataCheckEntityType.COLLECTION,
                                    "Reverse geocoding of the collection  location failed ("
                                    + str(e) + ")"))
                        log.info(logMessage)

        cache.close()
        return warnings
Example #11
class PropertyManagerFetch(Fetch_Akamai_OPENAPI_Response):

    forceTempCache = False

    @staticmethod
    def UseTempCache():
        PropertyManagerFetch.forceTempCache = True

    @staticmethod
    def DisableTempCache():
        PropertyManagerFetch.forceTempCache = False

    def __init__(self, tempCache=False):

        cacheDir = os.environ.get('AKAMAI_CLI_CACHE_PATH')
        cacheDirCommand = os.environ.get('AKAMAI_CLI_COMMAND')

        if PropertyManagerFetch.forceTempCache:
            self.cache = Cache()
            self.cache.clear()

        elif not tempCache and cacheDir is not None and cacheDirCommand is not None:
            self.cache = Cache(directory="{}/{}/PropertyManagerFetch".format(
                cacheDir, cacheDirCommand))

        elif not tempCache:
            self.cache = Cache(directory="cache/PropertyManagerFetch")

        else:
            self.cache = Cache()
            self.cache.clear()
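        # Note: diskcache's Cache() with no directory argument creates its
        # store in a fresh temporary directory, so the clear() calls in the
        # temp-cache branches just guarantee an empty starting cache.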

    def buildBulkSearchUrl(self, context, *, contractId=None, groupId=None):

        url = self.buildUrl("https://{}/papi/v1/bulk/rules-search-requests",
                            context)

        queryArgs = [("contractId", contractId), ("groupdId", groupId)]

        url = self.appendQueryStringTupple(url, queryArgs)
        return url

    def buildGetPropertyUrl(self,
                            context,
                            *,
                            propertyId=None,
                            propertyVersion=None):

        uri = "/papi/v1/properties/{}/versions/{}/rules".format(
            propertyId, propertyVersion)
        url = self.buildUrl("https://{}" + uri, context)
        return url

    def buildGetPropertyDigitalPropertyUrl(self,
                                           context,
                                           *,
                                           propertyId=None,
                                           propertyVersion=None):

        #/papi/v1/properties/{propertyId}/versions/{propertyVersion}/hostnames{?contractId,groupId,validateHostnames}

        uri = "/papi/v1/properties/{}/versions/{}/hostnames".format(
            propertyId, propertyVersion)
        url = self.buildUrl("https://{}" + uri, context)

        queryArgs = [("validateHostnames", "false")]

        url = self.appendQueryStringTupple(url, queryArgs)

        return url

    def buildGetPropertyVersionMetaInfoUrl(self,
                                           context,
                                           *,
                                           propertyId=None,
                                           propertyVersion=None):

        #https://developer.akamai.com/api/core_features/property_manager/v1.html#api1580152614326

        uri = "/papi/v1/properties/{}/versions/{}".format(
            propertyId, propertyVersion)
        url = self.buildUrl("https://{}" + uri, context)
        return url

    def bulksearch(self,
                   edgerc=None,
                   section=None,
                   account_key=None,
                   contractId=None,
                   groupId=None,
                   postdata=None,
                   network=None,
                   debug=False):

        factory = CredentialFactory()
        context = factory.load(edgerc, section, account_key)
        url = self.buildBulkSearchUrl(context,
                                      contractId=contractId,
                                      groupId=groupId)

        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json, */*"
        }

        result = context.session.post(url, json=postdata, headers=headers)
        code, headers, json = self.handleResponseWithHeaders(
            result, url, debug)

        if code in [200] and "results" in json:

            print(" ... Found {} properties".format(len(json["results"])),
                  file=sys.stderr)

            json = self.getMatchLocationValues(json["results"],
                                               edgerc=edgerc,
                                               account_key=account_key,
                                               network=network,
                                               debug=debug)
            return (code, json)

        elif code in [202]:

            locationURL = headers["Location"]
            result = context.session.get(locationURL)
            code, headers, json = self.handleResponseWithHeaders(
                result, url, debug)
            status = json["searchTargetStatus"]

            attempts = 0
            maxAttempts = 550

            while status != "COMPLETE" and attempts < maxAttempts:

                if status == "ERROR":
                    print(" ... Encountered error from bulksearch endpoint",
                          file=sys.stderr)
                    print(
                        " ... fatalError message from API response: {}".format(
                            json["fatalError"]),
                        file=sys.stderr)
                    print(" ... Error Bulksearch Request POST body:",
                          file=sys.stderr)
                    print(" ... {}".format(jsonlib.dumps(postdata)),
                          file=sys.stderr)

                    if debug:
                        print(" ... Error bulksearch POST JSON response:",
                              file=sys.stderr)
                        print(" ... {}".format(jsonlib.dumps(json)),
                              file=sys.stderr)

                    if "bulkSearchId" in json and "fatalError" in json:
                        raise ValueError(
                            "Error bulksearch API response bulkSearchId: \"{}\" fatalError message: \"{}\""
                            .format(json["bulkSearchId"], json["fatalError"]))
                    else:
                        raise ValueError(
                            "Error bulksearch API response. Unknown error. No bulkSearchId and fatalError json keys"
                        )

                attempts = attempts + 1

                if attempts == 1:
                    time.sleep(3)

                result = context.session.get(locationURL)
                code, headers, json = self.handleResponseWithHeaders(
                    result, url, debug, retry=1, context=context)
                status = json["searchTargetStatus"]

                if debug:
                    print(
                        " ... Waiting for search results. {} attempt {} of {} for {}"
                        .format(status, attempts, maxAttempts, locationURL),
                        file=sys.stderr)
                    print(" .... got HTTP code {} with headers: {}".format(
                        code, jsonlib.dumps(dict(headers))),
                          file=sys.stderr)
                    print(" .... got json: {}".format(jsonlib.dumps(json)),
                          file=sys.stderr)
                else:
                    print(
                        " ... Waiting for search results. {} attempt {} of {}".
                        format(status, attempts, maxAttempts),
                        file=sys.stderr)

                if status != "COMPLETE":
                    time.sleep(7)

            print(" ... Found {} properties".format(len(json["results"])),
                  file=sys.stderr)

            if status == "COMPLETE":
                json = self.getMatchLocationValues(json["results"],
                                                   edgerc=edgerc,
                                                   account_key=account_key,
                                                   network=network,
                                                   debug=debug)
            else:
                raise ValueError(
                    "Search status never encountred COMPLETE. Last Status = {}"
                    .format(status))

            return (code, json)

        else:
            return (code, json)

    def getMatchLocationValues(self,
                               json,
                               edgerc=None,
                               account_key=None,
                               network=None,
                               debug=False):

        count = 0

        if network is not None and (network.startswith("p")
                                    or network.startswith("P")):
            json = list(
                filter(lambda x: x["productionStatus"] == "ACTIVE", json))
            print(
                " ... Limiting to production network with {} ACTIVE properties"
                .format(len(json)),
                file=sys.stderr)
        elif network is not None and (network.startswith("s")
                                      or network.startswith("S")):
            json = list(filter(lambda x: x["stagingStatus"] == "ACTIVE", json))
            print(" ... Limiting to staging network with {} ACTIVE properties".
                  format(len(json)),
                  file=sys.stderr)
        else:
            print(
                " ... Warning: searching non-cacheable properties. Limit to production or staging network for faster searching",
                file=sys.stderr)

        if debug:
            print(" ... filtered json:", file=sys.stderr)
            printjson = jsonlib.dumps(json, indent=2)
            print(printjson, file=sys.stderr)

        jobsize = len(json)

        def manipulateSearchResults(matchJson,
                                    edgerc=None,
                                    account_key=None,
                                    propertyId=None,
                                    propertyVersion=None,
                                    cacheResponses=False,
                                    debug=None):

            (code, propertyJson) = self.fetchPropertyVersion(
                edgerc=edgerc,
                propertyId=propertyId,
                propertyVersion=propertyVersion,
                account_key=account_key,
                cacheResponses=cacheResponses,
                debug=debug)

            if code in [200, 202]:
                self.mergeVersionPointerValues(matchJson, propertyJson)

                print(
                    " ..... with hostnames, notes, formats, product_ids, etc..",
                    file=sys.stderr)
                (code, digitalPropertyJson
                 ) = self.fetchPropertyVersionDigitalProperty(
                     edgerc=edgerc,
                     account_key=account_key,
                     propertyId=propertyId,
                     propertyVersion=propertyVersion,
                     cacheResponses=cacheResponses,
                     debug=debug)

                if code in [200]:
                    lastModifiedTime = matchJson[
                        "lastModifiedTime"] if "lastModifiedTime" in matchJson else None
                    self.mergeDigitalPropertiesValues(
                        matchJson,
                        digitalPropertyJson,
                        lastModifiedTime=lastModifiedTime)

                (code, versionMetaJson) = self.fetchPropertyVersionMetaInfo(
                    edgerc=edgerc,
                    account_key=account_key,
                    propertyId=propertyId,
                    propertyVersion=propertyVersion,
                    cacheResponses=cacheResponses,
                    debug=debug)

                if code in [200]:
                    self.mergeDigitalPropertiesVersionMeta(
                        matchJson, versionMetaJson)

        for match in json:
            count = count + 1
            propertyId = match["propertyId"]
            propertyVersion = match["propertyVersion"]
            propertyName = match["propertyName"]
            productionStatus = match["productionStatus"]
            stagingStatus = match["stagingStatus"]

            if productionStatus in [
                    "ACTIVE", "DEACTIVATED"
            ] or stagingStatus in ["ACTIVE", "DEACTIVATED"]:

                cacheResponses = True
                print(
                    " ... Getting Immutable Property {} of {}. {} v{} production={} staging={}"
                    .format(count, jobsize, propertyName, propertyVersion,
                            productionStatus, stagingStatus),
                    file=sys.stderr)
                manipulateSearchResults(match,
                                        edgerc=edgerc,
                                        account_key=account_key,
                                        propertyId=propertyId,
                                        propertyVersion=propertyVersion,
                                        cacheResponses=cacheResponses,
                                        debug=debug)

            else:
                cacheResponses = False
                print(
                    " ... Getting property {} of {}. {} v{} production={} staging={}"
                    .format(count, jobsize, propertyName, propertyVersion,
                            productionStatus, stagingStatus),
                    file=sys.stderr)
                manipulateSearchResults(match,
                                        edgerc=edgerc,
                                        account_key=account_key,
                                        propertyId=propertyId,
                                        propertyVersion=propertyVersion,
                                        cacheResponses=cacheResponses,
                                        debug=debug)

        return json

    def mergeVersionPointerValues(self, match, propertyJson):

        matchLocations = match["matchLocations"]
        matchResults = []
        for pointer in matchLocations:
            subjson = self.resolvepointer(pointer, propertyJson)
            matchResults.append(subjson)

        if len(matchResults) > 0:
            match["matchLocationResults"] = matchResults

    def mergeDigitalPropertiesValues(self,
                                     searchJson,
                                     hostnameJson,
                                     lastModifiedTime=None):

        if len(hostnameJson) > 0:
            searchJson["hostnames"] = hostnameJson

        if lastModifiedTime is not None:

            days = daysSince(lastModifiedTime)
            searchJson["daysSinceModified"] = days

    def mergeDigitalPropertiesVersionMeta(self, searchJson, versionMetaJson):

        if len(versionMetaJson) > 0:
            if "propertyVersion" in versionMetaJson:
                del versionMetaJson["propertyVersion"]

            if "stagingStatus" in versionMetaJson:
                del versionMetaJson["stagingStatus"]

            if "productionStatus" in versionMetaJson:
                del versionMetaJson["productionStatus"]

            if "etag" in versionMetaJson:
                del versionMetaJson["etag"]

            if "updatedDate" in versionMetaJson:
                del versionMetaJson["updatedDate"]

            searchJson["versionInfo"] = versionMetaJson

    def validateResponse(
        self,
        jsonObj,
        account_key=None,
        propertyId=None,
        propertyVersion=None,
    ):

        if propertyId != jsonObj["propertyId"]:
            raise ValueError(
                "Unexpected API response! Expecting propertyId={} but got {}".
                format(propertyId, jsonObj["propertyId"]))

        #doesn't support hyphenated account keys, as they return different values
        #elif account_key is not None and account_key not in jsonObj["accountId"]:
        #    raise ValueError("Unexpected API response! Expecting accountId={} but got {}.".format(account_key,jsonObj["accountId"] ))

        elif "propertyVersion" in jsonObj and propertyVersion != jsonObj[
                "propertyVersion"]:
            raise ValueError(
                "Unexpected API response! Expecting propertyVersion={} but got {}."
                .format(propertyVersion, jsonObj["propertyVersion"]))

        elif "versions" in jsonObj and "items" in jsonObj["versions"] and (len(
                jsonObj["versions"]["items"]) == 1):
            versionItem = jsonObj["versions"]["items"][0]

            if "propertyVersion" in versionItem and propertyVersion != versionItem[
                    "propertyVersion"]:
                pass

        else:
            pass

    def fetchPropertyVersionMetaInfo(self,
                                     edgerc=None,
                                     section=None,
                                     account_key=None,
                                     propertyId=None,
                                     propertyVersion=None,
                                     cacheResponses=False,
                                     debug=False):

        factory = CredentialFactory()
        context = factory.load(edgerc, section, account_key)
        url = self.buildGetPropertyVersionMetaInfoUrl(
            context, propertyId=propertyId, propertyVersion=propertyVersion)

        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json, */*"
        }
        bypassCache = not cacheResponses

        cachedHandler = CachedContextHandler(context, self.cache, debug=debug)
        code, jsonObj = cachedHandler.get(url,
                                          requestHeaders=headers,
                                          bypassCache=bypassCache)

        if code in [
                200
        ] and "versions" in jsonObj and "items" in jsonObj["versions"]:
            self.validateResponse(jsonObj,
                                  account_key=account_key,
                                  propertyId=propertyId,
                                  propertyVersion=propertyVersion)
            jsonObj = jsonObj["versions"]["items"][0]

            return (code, jsonObj)
        else:
            return (code, jsonObj)

    def fetchPropertyVersionDigitalProperty(self,
                                            edgerc=None,
                                            section=None,
                                            account_key=None,
                                            propertyId=None,
                                            propertyVersion=None,
                                            cacheResponses=False,
                                            debug=False):

        factory = CredentialFactory()
        context = factory.load(edgerc, section, account_key)
        url = self.buildGetPropertyDigitalPropertyUrl(
            context, propertyId=propertyId, propertyVersion=propertyVersion)

        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json, */*"
        }
        bypassCache = not cacheResponses

        cachedHandler = CachedContextHandler(context, self.cache, debug=debug)
        code, jsonObj = cachedHandler.get(url,
                                          requestHeaders=headers,
                                          bypassCache=bypassCache)

        if code in [
                200
        ] and "hostnames" in jsonObj and "items" in jsonObj["hostnames"]:
            self.validateResponse(jsonObj,
                                  account_key=account_key,
                                  propertyId=propertyId,
                                  propertyVersion=propertyVersion)
            jsonObj = jsonObj["hostnames"]["items"]

            return (code, jsonObj)
        else:
            return (code, jsonObj)

    def fetchPropertyVersion(self,
                             edgerc=None,
                             section=None,
                             account_key=None,
                             propertyId=None,
                             propertyVersion=None,
                             cacheResponses=False,
                             debug=False):

        factory = CredentialFactory()
        context = factory.load(edgerc, section, account_key)
        url = self.buildGetPropertyUrl(context,
                                       propertyId=propertyId,
                                       propertyVersion=propertyVersion)

        headers = {
            "Content-Type": "application/json",
            "Accept": "application/json, */*"
        }
        bypassCache = not cacheResponses
        cachedHandler = CachedContextHandler(context, self.cache, debug=debug)
        code, jsonObj = cachedHandler.get(url,
                                          requestHeaders=headers,
                                          bypassCache=bypassCache)

        if code in [200, 201, 202] and "rules" in jsonObj:

            self.validateResponse(jsonObj,
                                  account_key=account_key,
                                  propertyId=propertyId,
                                  propertyVersion=propertyVersion)
            return (code, jsonObj)

        else:
            return (code, jsonObj)

    def resolvepointer(self, pointer, doc):
        doc = copy.deepcopy(doc)
        pointerJson = jsonpointer.resolve_pointer(doc, pointer)
        return pointerJson
Example #12
    def check(self, dir, args):
        warnings = []
        log.info("Running contact fields checks (ContactFields)")
        assert 'emails' in __main__.remoteCheckList
        ValidateEmails = 'emails' not in args.disableChecksRemote

        cache_dir = 'data-check-cache/emails'
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        cache = Cache(cache_dir)
        if 'emails' in args.purgeCaches:
            cache.clear()

        for contact in dir.getContacts():
            if ('first_name' not in contact
                    or re.search(r'^\s*$', contact['first_name'])):
                warnings.append(
                    DataCheckWarning(
                        self.__class__.__name__, "",
                        dir.getContactNN(contact['id']),
                        DataCheckWarningLevel.WARNING, contact['id'],
                        DataCheckEntityType.CONTACT,
                        "Missing first name for contact ('first_name' attribute is empty)"
                    ))
            if ('last_name' not in contact
                    or re.search(r'^\s*$', contact['last_name'])):
                warnings.append(
                    DataCheckWarning(
                        self.__class__.__name__, "",
                        dir.getContactNN(contact['id']),
                        DataCheckWarningLevel.WARNING, contact['id'],
                        DataCheckEntityType.CONTACT,
                        "Missing last name for contact ('last_name' attribute is empty)"
                    ))
            if ('email' not in contact
                    or re.search(r'^\s*$', contact['email'])):
                warnings.append(
                    DataCheckWarning(
                        self.__class__.__name__, "",
                        dir.getContactNN(contact['id']),
                        DataCheckWarningLevel.ERROR, contact['id'],
                        DataCheckEntityType.CONTACT,
                        "Missing email for contact ('email' attribute is empty)"
                    ))
            elif (not validate_email(contact['email'])):
                warnings.append(
                    DataCheckWarning(
                        self.__class__.__name__, "",
                        dir.getContactNN(contact['id']),
                        DataCheckWarningLevel.WARNING, contact['id'],
                        DataCheckEntityType.CONTACT,
                        "Email for contact is invalid - offending  'email' attribute value: "
                        + contact['email']))
            else:
                # This is pretty dramatic test and should be used sparingly
                if ValidateEmails:
                    contact_email = contact['email']
                    log_message = "Validating email " + contact_email
                    # XXX: does not work in most cases
                    #if(not validate_email(contact['email'],verify=True)):
                    try:
                        if contact_email in cache:
                            cache_result = cache[contact_email]
                            if cache_result['valid']:
                                log_message += " -> OK"
                            else:
                                log_message += " -> failed"
                                warnings.append(cache_result['warning'])
                        else:
                            if (not validate_email(contact_email,
                                                   check_mx=True)):
                                log_message += " -> failed"
                                warning = DataCheckWarning(
                                    self.__class__.__name__, "",
                                    dir.getContactNN(contact['id']),
                                    DataCheckWarningLevel.WARNING,
                                    contact['id'], DataCheckEntityType.CONTACT,
                                    "Email for contact seems to be unreachable because of missing DNS MX record"
                                )
                                warnings.append(warning)
                                cache[contact_email] = {
                                    'valid': False,
                                    'warning': warning
                                }
                            else:
                                log_message += " -> OK"
                                cache[contact_email] = {
                                    'valid': True,
                                    'warning': None
                                }
                        log.info(log_message)
                    except (DNS.Base.TimeoutError, DNS.Base.ServerError,
                            DNS.Base.SocketError) as e:
                        log_message += " -> failed with exception (" + str(
                            e) + ")"
                        log.error(log_message)

            if ('phone' not in contact
                    or re.search(r'^\s*$', contact['phone'])):
                warnings.append(
                    DataCheckWarning(
                        self.__class__.__name__, "",
                        dir.getContactNN(contact['id']),
                        DataCheckWarningLevel.WARNING, contact['id'],
                        DataCheckEntityType.CONTACT,
                        "Missing phone for contact ('phone' attribute is empty'"
                    ))
            elif (not re.search(r'^\+(?:[0-9]??){6,14}[0-9]$',
                                contact['phone'])):
                warnings.append(
                    DataCheckWarning(
                        self.__class__.__name__, "",
                        dir.getContactNN(contact['id']),
                        DataCheckWarningLevel.ERROR, contact['id'],
                        DataCheckEntityType.CONTACT,
                        "Phone number for contact does not conform to the E.123 international standard (means starts with + sign, no spaces) - offending phone number in 'phone' attribute: "
                        + contact['phone']))
        return warnings
Example #13
#!/usr/bin/env python3
"""
Clears cache from the default tmp directory
"""
from diskcache import Cache
import tempfile

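# Pointing Cache at the system temp directory reuses it as the store's
# directory; clear() removes the cached items but keeps the directory itself.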
cache = Cache(tempfile.gettempdir())
cache.clear()
Example #14
class FileDirCache(MutableMapping):
    def __init__(
        self,
        use_listings_cache=True,
        listings_expiry_time=None,
        listings_cache_location=None,
        **kwargs,
    ):
        """

        Parameters
        ----------
        use_listings_cache: bool
            If False, this cache never returns items, but always reports KeyError,
            and setting items has no effect
        listings_expiry_time: int or float (optional)
            Time in seconds that a listing is considered valid. If None,
            listings do not expire.
        listings_cache_location: str (optional)
            Directory path at which the listings cache file is stored. If None,
            an autogenerated path at the user folder is created.

        """
        import appdirs
        from diskcache import Cache

        listings_expiry_time = listings_expiry_time and float(listings_expiry_time)

        if listings_cache_location:
            listings_cache_location = Path(listings_cache_location) / str(listings_expiry_time)
            listings_cache_location.mkdir(exist_ok=True, parents=True)
        else:
            listings_cache_location = Path(appdirs.user_cache_dir(appname="fsspec_dircache")) / str(
                listings_expiry_time
            )

        try:
            listings_cache_location.mkdir(exist_ok=True, parents=True)
        except Exception:
            logger.error(f"folder for dircache could not be created at {listings_cache_location}")

        self.cache_location = listings_cache_location

        self._cache = Cache(directory=listings_cache_location)
        self.use_listings_cache = use_listings_cache
        self.listings_expiry_time = listings_expiry_time

    def __getitem__(self, item):
        """Draw item as fileobject from cache, retry if timeout occurs"""
        return self._cache.get(key=item, read=True, retry=True)

    def clear(self):
        self._cache.clear()

    def __len__(self):
        return len(list(self._cache.iterkeys()))

    def __contains__(self, item):
        # get() returns None if the key is missing or expired; compare to
        # None so that falsy cached values (e.g. empty listings) still count.
        return self._cache.get(item, retry=True) is not None

    def __setitem__(self, key, value):
        if not self.use_listings_cache:
            return
        self._cache.set(key=key, value=value, expire=self.listings_expiry_time, retry=True)

    def __delitem__(self, key):
        del self._cache[key]

    def __iter__(self):
        return (k for k in self._cache.iterkeys() if k in self)

    def __reduce__(self):
        return (
            FileDirCache,
            (self.use_listings_cache, self.listings_expiry_time, self.cache_location),
        )
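A short usage sketch for the FileDirCache class above; the directory path, expiry, and listing contents are illustrative values, not part of the original:

# Hypothetical usage: listings persist under ./listings-cache and expire
# after 60 seconds (both values are examples).
dircache = FileDirCache(listings_expiry_time=60,
                        listings_cache_location="./listings-cache")
dircache["/data"] = [{"name": "/data/file.txt", "type": "file"}]
if "/data" in dircache:   # becomes False once the entry expires
    listing = dircache["/data"]
dircache.clear()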
Beispiel #15
0
class VocabAPI:
    s = Session()

    BASE_URL = "https://www.vocabulary.com"
    API_BASE_URL = 'https://api.vocab.com/1.0'
    PLAY_URL = BASE_URL + "/play"
    START_URL = BASE_URL + "/challenge/start.json"
    NEXT_URL = BASE_URL + "/challenge/nextquestion.json"
    HINT_URL = BASE_URL + "/challenge/hint.json"
    SAVE_ANSWER_URL = BASE_URL + "/challenge/saveanswer.json"
    ME_URL = BASE_URL + "/auth/me.json"
    LOGIN_URL = BASE_URL + "/login/"
    SET_PRIORITY_URL = BASE_URL + "/progress/setpriority.json"
    AUTO_COMPLETE_URL = BASE_URL + "/dictionary/autocomplete?search="

    APL_WORD_PROGRESS_URL = API_BASE_URL + "/progress/words"
    API_AUTH_TOKEN_URL = API_BASE_URL + "/auth/token"

    def __init__(self):
        super(VocabAPI, self).__init__()
        self._access_token = ''
        self._me_info = None  # type: MeRsp
        self._is_logged_in = None

        self.session_pool = {}
        self.cache = Cache(CACHE_DIR)
        if self.cache.get("cookies"):
            self.s.cookies = self.cache['cookies']
        self.s.headers.update({
            "authority": "www.vocabulary.com",
            "accept": "application/json, text/javascript, */*; q=0.01",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
            "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "origin": self.BASE_URL,
            "referer": self.BASE_URL,
            "keep-alive": "true"
        })

    def clear_cache(self):
        self.cache.clear()

    @property
    def access_token(self):
        if not self._access_token:
            self._access_token = self.refresh_token().access_token
        return self._access_token

    @property
    def auth_header(self) -> dict:
        if not self.access_token:
            return {}
        return {'authorization': f'Bearer {self.access_token}'}

    @property
    def is_logged_in(self) -> bool:
        logging.debug("Check logging status ..")

        if self._is_logged_in is None:
            if not self.cache.get("cookies"):
                logging.debug(f"logged in: {False}")
                return False

            try:
                self.get(
                    "https://www.vocabulary.com/account/activities.json?limit=1"
                ).json
                self._is_logged_in = True
            except json.JSONDecodeError:
                self._is_logged_in = False
            logging.debug(f"logged in: {self._is_logged_in}")
        return self._is_logged_in

    def login(self, user_name: str, password: str, auto_login: bool) -> str:
        error_msg = ''
        if not self.is_logged_in:
            # login procedure
            login_data = {
                'username': user_name,
                'password': password,
                '.cb-autoLogon': int(auto_login),
                'autoLogon': auto_login
            }
            login_bs = self.post(self.LOGIN_URL, data=login_data).bs
            error_tag = login_bs.find(class_='errors')
            if error_tag:
                error_msg = error_tag.find(class_='msg').text
            else:
                self.cache.set('cookies', self.s.cookies)
        self._is_logged_in = bool(user_name and password and not error_msg)
        return error_msg

    def refresh_token(self) -> Box:
        """
        This function refreshes access auth token

        important: will affect attribute `auth_header` and `access_token`

        :return:
        """
        logging.debug("Refreshing access auth ... ")
        auth_box = self.post(self.API_AUTH_TOKEN_URL,
                             data={
                                 "refresh_token": self.s.cookies.get("guid")
                             }).box
        logging.debug("New access auth:", auth_box)
        self._access_token = auth_box.access_token
        return auth_box

    def get_autocomplete_list(self, word: str) -> List[AutoCompleteItem]:
        rsp_bs = self.get(self.AUTO_COMPLETE_URL + word).bs
        r_list = []
        for li in rsp_bs.select("li"):
            freq = li.get('freq', 0)
            if freq == '∞':
                freq = 0
            auto_item = AutoCompleteItem(
                word=li['word'],
                short_def=str(li.select(".definition")[0].string),
                freq=float(freq))
            r_list.append(auto_item)
        return r_list

    def get_word_progress(self, word: str) -> Union[WordProgressRsp, None]:
        word_prg_box = self.get(self.APL_WORD_PROGRESS_URL + f"/{word}",
                                ensure_auth=True).box
        progress = None
        if word_prg_box.get('progress'):
            progress = WordProgress(
                progress=word_prg_box.progress.progress,
                played_at=word_prg_box.progress.get('played_at', ''),
                scheduled_at=word_prg_box.progress.get('scheduled_at', ''),
                play_count=word_prg_box.progress.play_count,
                correct_count=word_prg_box.progress.correct_count,
                incorrect_count=word_prg_box.progress.incorrect_count,
                value=word_prg_box.progress.value,
                priority=word_prg_box.progress.priority)
        try:
            return WordProgressRsp(
                word=word_prg_box.word,
                sense=WordSense(
                    id=word_prg_box.sense.id,
                    part_of_speech=word_prg_box.sense.part_of_speech,
                    audio=self.get_first_audio_url(word_prg_box.sense),
                    definition=word_prg_box.sense.definition,
                    ordinal=word_prg_box.sense.ordinal),
                progress=progress,
                pkv=word_prg_box.get('pkv', None),
                learnable=word_prg_box.learnable)
        except BoxKeyError:
            # fixme box.BoxKeyError: "'Box' object has no attribute 'word'"
            return None

    def set_word_priority(self, word: str,
                          priority: EnumLearningPriority) -> bool:
        rsp = self.s.post(self.SET_PRIORITY_URL, {
            'word': word,
            "priority": priority.value
        })
        return rsp.status_code == 200

    @staticmethod
    def get_first_audio_url(sense: Box) -> str:
        if sense.audio:
            return f"https://audio.vocab.com/1.0/us/{sense.audio[0]}.mp3"
        return ''

    @property
    def meInfo(self) -> MeRsp:
        if not self._me_info:
            rsp_box = self.get(self.ME_URL).box
            if rsp_box.auth.loggedin:
                self._me_info = MeRsp(
                    validUser=rsp_box.validUser,
                    guid=rsp_box.guid,
                    auth=ChallengeAuth(loggedin=rsp_box.auth.loggedin,
                                       uid=rsp_box.auth.uid,
                                       nickname=rsp_box.auth.nickname,
                                       fullname=rsp_box.auth.fullname,
                                       email=rsp_box.auth.email),
                    perms=dict(rsp_box.perms),
                    points=rsp_box.points,
                    level=ChallengeLevel(id=int(
                        rsp_box.level.id.replace("L", "")),
                                         name=rsp_box.level.name),
                    ima=rsp_box.ima,
                    paid=rsp_box.paid)
            else:
                self._me_info = ChallengeAuth(loggedin=False)
        return self._me_info

    def _get_my_lists(self, my_list_type: str = ""):
        if not self.is_logged_in:
            return []
        url = self.BASE_URL + "/account/lists/" + my_list_type
        bs = self.get(url).bs
        list_table_tag = bs.find(class_='list-list')
        if not list_table_tag:
            return []
        lists = []
        for tr in list_table_tag.select("tr"):
            t = [i for i in tr.contents if isinstance(i, Tag)][0].contents[1]
            list_id = int(re.search("\d+", t['href']).group(0))
            name = t.contents[0].strip()
            created_string, total_words = t.span.text.split("(")
            created_string = created_string.strip()
            total_words = int(
                total_words.split(")")[0].strip().split("words")[0].strip())
            created_date = datetime.strptime(
                created_string,
                '%B %d, %Y',
            ).date()

            lists.append(
                UserWordlist(listId=list_id,
                             wordcount=total_words,
                             name=name,
                             created=created_date))
        return lists

    def get_my_list_detail(self, listid: int) -> UserWordlistDetail:
        logging.info(f"Getting my word list details: {listid}")
        url = self.API_BASE_URL + f"/progress/lists/{listid}"
        rsp_box = self.get(url, ensure_auth=True).box
        uwld = UserWordlistDetail(
            starred=rsp_box.starred,
            word_count=rsp_box.word_count,
            learnable_word_count=rsp_box.learnable_word_count,
            learning_progress=UserWordlistLeaningProgress(
                active=rsp_box.learning_progress.active,
                progress=rsp_box.learning_progress.progress,
                mastered_word_count=rsp_box.learning_progress.
                mastered_word_count))
        logging.info(f"Word list got: ", uwld)
        return uwld

    @property
    def my_lists_all(self) -> List[UserWordlist]:
        return self._get_my_lists()

    @property
    def my_lists_created(self) -> List[UserWordlist]:
        return self._get_my_lists("created")

    @property
    def my_lists_shared(self) -> List[UserWordlist]:
        return self._get_my_lists("shared")

    @property
    def my_lists_learning(self) -> List[UserWordlist]:
        return self._get_my_lists("learning")

    # pure
    # http://app.vocabulary.com/app/1.0/dictionary/search?word=sun
    def get_word_def(self, word: str) -> WordDef:
        cache_key = f"word_def: {word}"
        if cache_key not in self.cache:  # membership test also covers falsy cached values
            # def_url = self.BASE_URL + f"/dictionary/{word}"
            def_url = f"http://app.vocabulary.com/app/1.0/dictionary/search?word={word}"
            rsp_bs = self.get(def_url).bs
            # get response word
            word_ = ''
            word_tag = rsp_bs.find(class_='dynamictext')
            if word_tag:
                word_ = word_tag.contents[0].__str__()

            # get audio url
            audio_tag = rsp_bs.find(class_='audio')
            if audio_tag:
                audio_url = self.get_first_audio_url(
                    Box(audio=[
                        audio_tag['data-audio'],
                    ]))
            else:
                audio_url = ''

            # get word short/long blurb
            blurb_tag = rsp_bs.find(class_='blurb')
            if blurb_tag:
                short_tag = blurb_tag.find('p', class_='short')
                long_tag = blurb_tag.find('p', class_='long')
                short_blurb_txt = "".join(
                    [i.__str__() for i in short_tag.contents])
                long_blurb_txt = "".join(
                    [i.__str__() for i in long_tag.contents])
            else:
                short_blurb_txt, long_blurb_txt = '', ''

            def_groups = []
            for group_tag in rsp_bs.select(".group"):
                ordinals = []
                for ordinal_tag in group_tag.select(".ordinal"):
                    senses = []
                    for sense_tag in ordinal_tag.select(".sense"):
                        def_example = ''
                        def_content_tag = sense_tag.find(class_='defContent')
                        if def_content_tag:
                            def_example_tag = sense_tag.find(class_='example')
                            if def_example_tag:
                                def_example = " ".join([
                                    re.sub(r"\s+", " ", str(i))
                                    for i in def_example_tag.contents
                                ])
                        def_tag = ordinal_tag.find(class_="definition")
                        pos_full = def_tag.a['title']
                        pos_short = def_tag.contents[1].text.strip()
                        def_txt = def_tag.contents[2].strip()
                        senses.append(
                            WordDefSense(pos_short=pos_short,
                                         pos_long=pos_full,
                                         def_=def_txt,
                                         example=def_example))
                    ordinals.append(senses)
                def_groups.append(ordinals)
            self.cache.set(
                cache_key,
                WordDef(word_, audio_url,
                        AnswerBlurb(short_blurb_txt, long_blurb_txt),
                        def_groups))
        return self.cache.get(cache_key)

    def post(self, url: str, ensure_auth: bool = False, **kwargs) -> Rsp:
        return Rsp(self._request("POST", url, ensure_auth, **kwargs))

    def get(self, url: str, ensure_auth: bool = False, **kwargs) -> Rsp:
        return Rsp(self._request("GET", url, ensure_auth, **kwargs))

    def get_session(self, val: str):
        """Return a per-domain Session, creating one on first use."""
        domain = urlparse(val).netloc.lower().strip()
        if domain not in self.session_pool:
            s = Session()
            self.session_pool[domain] = s
        else:
            s = self.session_pool[domain]
        return s

    def _request(self, method: str, url, ensure_auth, **kwargs):
        param_headers = kwargs.get("headers", {})
        if ensure_auth:
            param_headers.update(self.auth_header)
        rqst = Request(method, url, headers=param_headers, **kwargs)
        if ensure_auth:
            rqst.register_hook("response",
                               partial(self._handle_401, method, url, kwargs))
        rsp = self.s.send(self.s.prepare_request(rqst))

        rsp.raise_for_status()
        if rsp.status_code != 200:
            raise APIGetError(url)
        return rsp.content.decode()

    def _handle_401(self, method: str, url: str, rqst_kwargs: Dict,
                    rsp: Response, **kwargs):
        if rsp.status_code != 401:
            return rsp
        self.refresh_token()
        param_headers = rqst_kwargs.get("headers", {})
        param_headers.update(self.auth_header)
        rqst = Request(method, url, headers=param_headers, **rqst_kwargs)
        return self.s.send(self.s.prepare_request(rqst))
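_handle_401 above leans on requests' response hooks: a hook that returns a new Response replaces the original one seen by the caller. A self-contained sketch of that retry-once pattern; refresh_token and the URL are placeholders, not the original API:

from functools import partial

from requests import Request, Session

session = Session()

def refresh_token() -> str:
    # Placeholder: obtain a fresh bearer token however the API requires.
    return "new-token"

def retry_on_401(method, url, rqst_kwargs, rsp, **hook_kwargs):
    # requests calls response hooks as hook(response, **kwargs); returning
    # a value replaces the response.
    if rsp.status_code != 401:
        return rsp
    headers = rqst_kwargs.pop("headers", {})
    headers["authorization"] = f"Bearer {refresh_token()}"
    retry = Request(method, url, headers=headers, **rqst_kwargs)
    return session.send(session.prepare_request(retry))

req = Request("GET", "https://api.example.com/resource")  # example URL
req.register_hook("response", partial(retry_on_401, "GET", req.url, {}))
rsp = session.send(session.prepare_request(req))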
Beispiel #16
0
    def __init__(self,
                 package='eu_bbmri_eric',
                 purgeCaches=[],
                 debug=False,
                 pp=None,
                 username=None,
                 password=None):
        self.__pp = pp
        self.__package = package
        log.debug('Checking data in package: ' + package)

        cache_dir = 'data-check-cache/directory'
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        cache = Cache(cache_dir)
        if 'directory' in purgeCaches:
            cache.clear()

        self.__directoryURL = "https://directory.bbmri-eric.eu/api/"
        log.info('Retrieving directory content from ' + self.__directoryURL)
        session = molgenis.client.Session(self.__directoryURL)
        if username is not None and password is not None:
            log.info("Logging in to MOLGENIS with a user account.")
            log.debug('username: ' + username)
            session.login(username, password)
        log.info('   ... retrieving biobanks')
        if 'biobanks' in cache:
            self.biobanks = cache['biobanks']
        else:
            start_time = time.perf_counter()
            # TODO: remove exception handling once BBMRI.uk staging has been fixed
            try:
                self.biobanks = session.get(
                    self.__package + "_biobanks",
                    expand='contact,collections,country,covid19biobank')
            except Exception:
                log.warning(
                    "Using work-around for inconsistence in the database structure."
                )
                self.biobanks = session.get(
                    self.__package + "_biobanks",
                    expand='contact,collections,country,COVID_19')
            cache['biobanks'] = self.biobanks
            end_time = time.perf_counter()
            log.info('   ... retrieved biobanks in ' + "%0.3f" %
                     (end_time - start_time) + 's')
        log.info('   ... retrieving collections')
        if 'collections' in cache:
            self.collections = cache['collections']
        else:
            start_time = time.perf_counter()
            self.collections = session.get(
                self.__package + "_collections",
                expand=
                'biobank,contact,network,parent_collection,sub_collections,type,materials,order_of_magnitude,data_categories,diagnosis_available,imaging_modality,image_dataset_type'
            )
            #self.collections = session.get(self.__package + "_collections", num=2000, expand=[])
            cache['collections'] = self.collections
            end_time = time.perf_counter()
            if debug and self.__pp is not None:
                for c in self.collections:
                    pp.pprint(c)
            log.info('   ... retrieved collections in ' + "%0.3f" %
                     (end_time - start_time) + 's')
        log.info('   ... retrieving contacts')
        if 'contacts' in cache:
            self.contacts = cache['contacts']
        else:
            start_time = time.perf_counter()
            self.contacts = session.get(self.__package + "_persons",
                                        num=2000,
                                        expand='biobanks,collections,country')
            cache['contacts'] = self.contacts
            end_time = time.perf_counter()
            log.info('   ... retrieved contacts in ' + "%0.3f" %
                     (end_time - start_time) + 's')
        log.info('   ... retrieving networks')
        if 'networks' in cache:
            self.networks = cache['networks']
        else:
            start_time = time.perf_counter()
            self.networks = session.get(self.__package + "_networks",
                                        num=2000,
                                        expand='contact')
            cache['networks'] = self.networks
            end_time = time.perf_counter()
            log.info('   ... retrieved networks in ' + "%0.3f" %
                     (end_time - start_time) + 's')
        log.info('   ... all entities retrieved')
        self.contactHashmap = {}

        log.info('Processing directory data')
        # Graph containing only biobanks and collections
        self.directoryGraph = nx.DiGraph()
        # DAG containing only biobanks and collections
        self.directoryCollectionsDAG = nx.DiGraph()
        # Weighted graph linking contacts to biobanks/collections/networks
        self.contactGraph = nx.DiGraph()
        # Graph linking networks to biobanks/collections
        self.networkGraph = nx.DiGraph()
        for c in self.contacts:
            if self.contactGraph.has_node(c['id']):
                raise Exception(
                    'DirectoryStructure',
                    'Conflicting ID found in contactGraph: ' + c['id'])
            # XXX temporary hack -- adding contactID prefix
            #self.contactGraph.add_node(c['id'], data=c)
            self.contactGraph.add_node('contactID:' + c['id'], data=c)
            self.contactHashmap[c['id']] = c
        for b in self.biobanks:
            if self.directoryGraph.has_node(b['id']):
                raise Exception(
                    'DirectoryStructure',
                    'Conflicting ID found in directoryGraph: ' + b['id'])
            self.directoryGraph.add_node(b['id'], data=b)
            self.directoryCollectionsDAG.add_node(b['id'], data=b)
            if self.contactGraph.has_node(b['id']):
                raise Exception(
                    'DirectoryStructure',
                    'Conflicting ID found in contactGraph: ' + b['id'])
            self.contactGraph.add_node(b['id'], data=b)
            if self.networkGraph.has_node(b['id']):
                raise Exception(
                    'DirectoryStructure',
                    'Conflicting ID found in networkGraph: ' + b['id'])
            self.networkGraph.add_node(b['id'], data=b)
        for c in self.collections:
            if self.directoryGraph.has_node(c['id']):
                raise Exception('DirectoryStructure',
                                'Conflicting ID found: ' + c['id'])
            self.directoryGraph.add_node(c['id'], data=c)
            self.directoryCollectionsDAG.add_node(c['id'], data=c)
            if self.contactGraph.has_node(c['id']):
                raise Exception(
                    'DirectoryStructure',
                    'Conflicting ID found in contactGraph: ' + c['id'])
            self.contactGraph.add_node(c['id'], data=c)
            if self.networkGraph.has_node(c['id']):
                raise Exception(
                    'DirectoryStructure',
                    'Conflicting ID found in networkGraph: ' + c['id'])
            self.networkGraph.add_node(c['id'], data=c)
        for n in self.networks:
            if self.contactGraph.has_node(n['id']):
                raise Exception(
                    'DirectoryStructure',
                    'Conflicting ID found in contactGraph: ' + n['id'])
            self.contactGraph.add_node(n['id'], data=n)
            if self.networkGraph.has_node(n['id']):
                raise Exception(
                    'DirectoryStructure',
                    'Conflicting ID found in networkGraph: ' + n['id'])
            self.networkGraph.add_node(n['id'], data=n)

        # check forward pointers from biobanks
        for b in self.biobanks:
            for c in b['collections']:
                if not self.directoryGraph.has_node(c['id']):
                    raise Exception(
                        'DirectoryStructure',
                        'Biobank refers non-existent collection ID: ' +
                        c['id'])
        # add biobank contact and network edges
        for b in self.biobanks:
            if 'contact' in b:
                self.contactGraph.add_edge(b['id'],
                                           'contactID:' + b['contact']['id'])
            if 'networks' in b:
                for n in b['networks']:
                    self.networkGraph.add_edge(b['id'], n['id'])

        # now we have all the collections created and checked duplicates, so we create edges
        for c in self.collections:
            if 'parent_collection' in c:
                # some child collection
                self.directoryGraph.add_edge(c['id'],
                                             c['parent_collection']['id'])
            else:
                # some of root collections of a biobank
                # we add both edges as we can't extract this information from the biobank level (it contains pointers to all the child collections)
                self.directoryGraph.add_edge(c['id'], c['biobank']['id'])
                self.directoryGraph.add_edge(c['biobank']['id'], c['id'])
                self.directoryCollectionsDAG.add_edge(c['biobank']['id'],
                                                      c['id'])
            if 'sub_collections' in c:
                # some of root collections of a biobank
                for sb in c['sub_collections']:
                    self.directoryGraph.add_edge(c['id'], sb['id'])
                    self.directoryCollectionsDAG.add_edge(c['id'], sb['id'])
            if 'contact' in c:
                self.contactGraph.add_edge(c['id'],
                                           'contactID:' + c['contact']['id'])
            if 'networks' in c:
                for n in c['networks']:
                    self.networkGraph.add_edge(c['id'], n['id'])

        # processing network edges
        for n in self.networks:
            if 'biobanks' in n:
                for b in n['biobanks']:
                    self.networkGraph.add_edge(n['id'], b['id'])
            # TODO remove once the datamodel is fixed
            if 'contacts' in n:
                for c in n['contacts']:
                    self.contactGraph.add_edge(n['id'], 'contactID:' + c['id'])
            if 'contact' in n:
                self.contactGraph.add_edge(n['id'],
                                           'contactID:' + n['contact']['id'])
            if 'collections' in n:
                for c in n['collections']:
                    self.networkGraph.add_edge(n['id'], c['id'])

        # processing edges from contacts
        for c in self.contacts:
            if 'biobanks' in c:
                for b in c['biobanks']:
                    self.contactGraph.add_edge('contactID:' + c['id'], b['id'])
            if 'collections' in c:
                for coll in c['collections']:
                    self.contactGraph.add_edge('contactID:' + c['id'],
                                               coll['id'])
            if 'networks' in c:
                for n in c['networks']:
                    self.contactGraph.add_edge('contactID:' + c['id'], n['id'])

        # now make graphs immutable
        nx.freeze(self.directoryGraph)
        nx.freeze(self.directoryCollectionsDAG)
        nx.freeze(self.contactGraph)
        nx.freeze(self.networkGraph)

        log.info('Checks of directory data as graphs')
        # now we check if all the edges in the graph are in both directions
        for e in self.directoryGraph.edges():
            if not self.directoryGraph.has_edge(e[1], e[0]):
                raise Exception(
                    'DirectoryStructure',
                    'directoryGraph: Missing edge: ' + e[1] + ' to ' + e[0])
        for e in self.contactGraph.edges():
            if not self.contactGraph.has_edge(e[1], e[0]):
                raise Exception(
                    'DirectoryStructure',
                    'contactGraph: Missing edge: ' + e[1] + ' to ' + e[0])
        for e in self.networkGraph.edges():
            if not self.networkGraph.has_edge(e[1], e[0]):
                raise Exception(
                    'DirectoryStructure',
                    'networkGraph: Missing edge: ' + e[1] + ' to ' + e[0])
        # we check that DAG is indeed DAG :-)
        if not nx.algorithms.dag.is_directed_acyclic_graph(
                self.directoryCollectionsDAG):
            raise Exception('DirectoryStructure', 'Collection DAG is not DAG')

        log.info('Directory structure initialized')
        self.__orphacodesmapper = None
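Each retrieve-and-cache block above (biobanks, collections, contacts, networks), like get_word_def in Beispiel #15, repeats the same check/fetch/store steps. diskcache's Cache.memoize decorator can factor that pattern out; a hedged sketch with an illustrative fetcher standing in for the MOLGENIS call:

from diskcache import Cache

cache = Cache('data-check-cache/directory')

@cache.memoize(tag='directory')
def fetch_entities(entity_type):
    # Illustrative stand-in for session.get(package + '_' + entity_type, ...).
    return [{'id': entity_type + ':1'}]

biobanks = fetch_entities('biobanks')        # fetched once, cached afterwards
collections = fetch_entities('collections')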
Beispiel #17
0
from diskcache import Cache

task_queue = Cache('/tmp/ad-poster')
# print(dir(task_queue))
print("Count:", task_queue.count)
print("Cleared:", task_queue.clear())
Beispiel #18
0
import gym
import numpy as np
from PIL import Image
from lshash.lshash import LSHash
from collections import deque
from random import random
from diskcache import FanoutCache, Cache
qtable = Cache('cache')
qtable.clear()

env = gym.make('Breakout-v0')
lshs = LSHash(500, 8192)
LEARNING_RATE = 0.15
DISCOUNT = 0.95
EPISODES = 25000

def preprocess(obs):
    image = Image.fromarray(obs)
    image = image.resize((64, 64))
    image = image.convert(mode='1')
    array = np.array(image, dtype=np.uint8).flatten()
    return array

def get_action(obs_seq):
    query = lshs.query(obs_seq, num_results=1)
    if len(query) <= 0:
        lshs.index(obs_seq)
        actions = np.ones(env.action_space.n)
        qtable[obs_seq] = actions
    elif query[0][1] >= 10:
        lshs.index(obs_seq)