Example #1
class Storage:

    def __init__(self, mode=StorageModes.MEMORY):
        self.mode = mode
        if self.mode == StorageModes.PERSISTENT:
            self.cache = SqliteDict('../my_db.sqlite', autocommit=True)
        elif self.mode == StorageModes.MEMORY:
            self.cache = dict()

    def set(self, k, v):
        self.cache[k] = v
        if self.mode == StorageModes.PERSISTENT:
            # need to commit manually: autocommit only issues commit(blocking=False) and might not persist the data
            self.cache.commit()

    def dump(self, k):
        self.cache.pop(k)
        if self.mode == StorageModes.PERSISTENT:
            # need to commit manually: autocommit only issues commit(blocking=False) and might not persist the data
            self.cache.commit()

    def get(self, k):
        return self.cache.get(k)

    def append(self, k, v):
        current_data = self.cache.get(k)
        if not current_data:
            self.set(k, [v])
        else:
            if not isinstance(current_data, list):
                current_data = [current_data]
            current_data.append(v)
            self.set(k, current_data)
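A minimal usage sketch for the Storage wrapper above; StorageModes is not shown in the snippet, so it is assumed to be an enum with MEMORY and PERSISTENT members:

# assumes StorageModes (enum with MEMORY and PERSISTENT) and SqliteDict are importable
store = Storage(mode=StorageModes.MEMORY)
store.set('user:1', {'name': 'alice'})
store.append('events', 'login')      # first value becomes a one-element list
store.append('events', 'logout')
print(store.get('events'))           # ['login', 'logout']
store.dump('user:1')                 # removes the key (and commits in persistent mode)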
Example #2
def _import_sql_data(data_dir):
    file_path = os.path.join(data_dir, DATA_FILE)

    # Find out what format we have
    with sqlite3.connect(file_path) as conn:
        try:
            conn.execute('select count(*) from zipgun_info')
            zipgun_info = SqliteDict(file_path, tablename='zipgun_info')
            version = zipgun_info.get('version', 0)
        except sqlite3.OperationalError:
            version = 0

    if version == 0:
        country_postal_codes = SqliteDict(file_path)
    elif version == 1:
        country_postal_codes = {}
        for country_code in zipgun_info['country_codes']:
            if country_code in country_postal_codes:
                raise ValueError('Duplicate entry found for {}'.format(
                    country_code))
            country_postal_codes[country_code] = SqliteDict(
                file_path, tablename='zg_{}'.format(country_code),
                journal_mode='OFF')
        zipgun_info.close()
    else:
        raise ValueError('Unknown data file version {}'.format(version))
    return country_postal_codes
Example #3
def _import_sql_data(data_dir):
    import sqlite3
    from sqlitedict import SqliteDict

    file_path = os.path.join(data_dir, DATA_FILE)

    # Find out what format we have
    with sqlite3.connect(file_path) as conn:
        try:
            conn.execute('select count(*) from zipgun_info')
            zipgun_info = SqliteDict(file_path, tablename='zipgun_info')
            version = zipgun_info.get('version', 0)
        except sqlite3.OperationalError:
            version = 0

    if version == 0:
        country_postal_codes = SqliteDict(file_path)
    elif version == 1:
        country_postal_codes = {}
        for country_code in zipgun_info['country_codes']:
            if country_code in country_postal_codes:
                raise ValueError(
                    'Duplicate entry found for {}'.format(country_code))
            country_postal_codes[country_code] = SqliteDict(
                file_path,
                tablename='zg_{}'.format(country_code),
                journal_mode='OFF')
        zipgun_info.close()
    else:
        raise ValueError('Unknown data file version {}'.format(version))
    return country_postal_codes
Example #4
class db(object):
    def __init__(self, user):
        self.user = str(user)
        self.db = SqliteDict(self.getCfgPath(), autocommit=True)

    def get(self, key=''):
        return self.db.get(key) if key else self.db.iteritems()

    def set(self, key='', data=''):
        if not key:
            key = self.user
        if data:
            self.db[key] = data
        else:
            del self.db[key]

    def getCfgPath(self):
        if os.path.isdir('hoshino'):
            if not os.path.isdir('hoshino/modules/ASF_Plus/config'):
                os.mkdir('hoshino/modules/ASF_Plus/config')
            return os.path.join(
                os.path.abspath('hoshino/modules/ASF_Plus/config'),
                f'{self.user}.sqlite')
        else:
            if not os.path.isdir('../config'):
                os.mkdir('../config')
            return os.path.join(os.path.abspath('../config'),
                                f'{self.user}.sqlite')
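A hypothetical usage of the db wrapper above (the user id and data are made up; the config directories are created relative to the working directory, as coded in getCfgPath):

store = db(user=12345)
store.set(data={'coins': 3})        # stored under the user id key '12345'
print(store.get('12345'))           # -> {'coins': 3}
print(dict(store.get()))            # no key: iterate all (key, value) pairs
store.set(key='12345', data='')     # empty data deletes the key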
Example #5
async def post(self):
    request = self.request
    data = await request.post()
    try:
        mydict = SqliteDict('./my_db.sqlite', autocommit=True)
        if mydict.get(data["url"]) is not None:
            return web.Response(text=str(mydict.get(data["url"])))
        image = await fetch(session, data["url"])
        nsfw_prob = classify(image)
        text = nsfw_prob.astype(str)
        mydict[data["url"]] = text
        return web.Response(text=text)
    except KeyError:
        return HTTPBadRequest(text="Missing `url` POST parameter")
    except OSError as e:
        if "cannot identify" in str(e):
            raise HTTPUnsupportedMediaType(text="Invalid image")
        else:
            raise e
Example #6
class Cache(object):
    """ Cache -- Key-Value Store for Twizzle to reduce unnecessary recomputations
    """
    def __init__(self,
                 bPersistent=False,
                 sPathToPersistenceDB="twizzle_cache.db"):
        """Constructor of the Twizzle Cache

        Note:
            You can decide whether the cache should be persistent between multiple executions or
            just a runtime cache for one execution of a set of tests
        Args:
            bPersistent (bool): Flag whether the cache should be persistent or not (Note: a persistent cache
            is much slower because it has to write the data to the hard disk)

            sPathToPersistenceDB (str): Path to the cache DB where the cache should write its data to
        """
        self._cache = {}
        self._lock = Lock()
        self._persistent = bPersistent
        self._first_get = True

        if bPersistent:
            if not sPathToPersistenceDB:
                raise Exception(
                    "On persistent mode a path to the persistence database has to be defined"
                )
            self._db = SqliteDict(sPathToPersistenceDB)

    def set(self, sKey, oValue):
        """set cache element by key"""
        # debug
        print("ADDING CACHELINE: %s" % (sKey))
        self._lock.acquire()
        self._cache[sKey] = oValue
        if self._persistent:
            self._db[CACHE_KEY] = self._cache
            self._db.commit()
        self._lock.release()

    def get(self, sKey):
        """get cache element by key"""

        if self._persistent:
            self._lock.acquire()
            if self._first_get:
                self._first_get = False
                self._cache = self._db.get(CACHE_KEY, {})
            self._lock.release()
        return self._cache.get(sKey, None)

    def calc_unique_key(self, *params):
        """create a unique key based on parameters given by converting them
        to string and concatenating them"""
        return "".join([str(elem) for elem in params])
Example #7
def get_cached_embedding(s):
    global db
    if db is None:
        db = SqliteDict(default_sbert_db, autocommit=True)
        print(f'Initialized db using {default_sbert_db}')
    array_str = db.get(s)
    if array_str is not None:
        return t.from_numpy(__string_to_numpy(array_str))
    else:
        print(f'Not cached: {s}')
        tensor = get_embedding(s)
        db[s] = __numpy_to_string(tensor.numpy())
        return tensor
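The snippet above relies on helpers that round-trip numpy arrays through strings so they can be stored as SqliteDict values. They are not shown here; one possible sketch (not the original project's implementation) could look like this:

import base64
import io
import numpy as np

def __numpy_to_string(arr):
    # serialize the array to bytes, then base64-encode so it stores cleanly as text
    buf = io.BytesIO()
    np.save(buf, arr)
    return base64.b64encode(buf.getvalue()).decode('ascii')

def __string_to_numpy(s):
    # reverse of __numpy_to_string
    buf = io.BytesIO(base64.b64decode(s.encode('ascii')))
    return np.load(buf)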
Example #8
def get_annotations(dbpath, ids, options):
    # No context manager: close() can block and this is read-only
    db = SqliteDict(dbpath, flag='r', autocommit=False)
    for docid, annid in ids:
        so_key = docid + options.ann_suffix
        so = db.get(so_key)
        if so is None:
            warning('{} not found in {}, skipping'.format(so_key, dbpath))
            continue
        text_key = docid + options.text_suffix
        text = db.get(text_key)
        if text is None:
            warning('{} not found in {}, skipping'.format(text_key, dbpath))
            continue
        ann = get_annotation(so, annid)
        before = 'DOCSTART ' + text[:ann.start]
        after = text[ann.end:] + 'DOCEND'
        before = get_words(before, options.words, reverse=True)
        after = get_words(after, options.words, reverse=False)
        before = normalize_space(before)
        after = normalize_space(after)
        print('\t'.join([docid, annid, ann.type, before, ann.text, after]))
Example #9
class OutputNewOrChangedEntires(beam.DoFn):
    def __init__(self, cache_file: str):
        super().__init__()
        self._cache_file = cache_file
        self._cache = None

    def start_bundle(self):
        self._cache = SqliteDict(self._cache_file, autocommit=True)

    def finish_bundle(self):
        self._cache.close()

    def process(
            self, element: Tuple[EntryId, Dict[str, Any]], *args,
            **kwargs) -> Generator[Tuple[EntryId, Dict[str, Any]], None, None]:
        # Make the type checker happy
        assert isinstance(self._cache, SqliteDict)

        (wikidata_id, entry) = element
        cached_entry = self._cache.get(wikidata_id)
        if cached_entry is None or cached_entry != entry:
            self._cache[wikidata_id] = entry
            yield wikidata_id, entry
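A short pipeline sketch showing how the DoFn above might be wired up; the cache path and the sample element are made up for illustration, and EntryId is assumed to be a plain string alias:

import apache_beam as beam

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([('Q42', {'label': 'Douglas Adams'})])
        | beam.ParDo(OutputNewOrChangedEntires(cache_file='/tmp/entry_cache.sqlite'))
        | beam.Map(print)  # only new or changed entries reach this step
    )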
Example #10
class ToolDocumentCache:
    def __init__(self, cache_dir):
        self.cache_dir = cache_dir
        if not os.path.exists(self.cache_dir):
            os.makedirs(self.cache_dir)
        self.cache_file = os.path.join(self.cache_dir, 'cache.sqlite')
        self.writeable_cache_file = None
        self._cache = None
        self.disabled = False
        self._get_cache(create_if_necessary=True)

    def close(self):
        self._cache and self._cache.close()

    def _get_cache(self, flag='r', create_if_necessary=False):
        try:
            if create_if_necessary and not os.path.exists(self.cache_file):
                # Create database if necessary using 'c' flag
                self._cache = SqliteDict(self.cache_file,
                                         flag='c',
                                         encode=encoder,
                                         decode=decoder,
                                         autocommit=False)
                if flag == 'r':
                    self._cache.flag = flag
            else:
                cache_file = self.writeable_cache_file.name if self.writeable_cache_file else self.cache_file
                self._cache = SqliteDict(cache_file,
                                         flag=flag,
                                         encode=encoder,
                                         decode=decoder,
                                         autocommit=False)
        except sqlite3.OperationalError:
            log.warning('Tool document cache unavailable')
            self._cache = None
            self.disabled = True

    @property
    def cache_file_is_writeable(self):
        return os.access(self.cache_file, os.W_OK)

    def reopen_ro(self):
        self._get_cache(flag='r')
        self.writeable_cache_file = None

    def get(self, config_file):
        try:
            tool_document = self._cache.get(config_file)
        except sqlite3.OperationalError:
            log.debug("Tool document cache unavailable")
            return None
        if not tool_document:
            return None
        if tool_document.get(
                'tool_cache_version') != CURRENT_TOOL_CACHE_VERSION:
            return None
        if self.cache_file_is_writeable:
            for path, modtime in tool_document['paths_and_modtimes'].items():
                if os.path.getmtime(path) != modtime:
                    return None
        return tool_document

    def _make_writable(self):
        if not self.writeable_cache_file:
            self.writeable_cache_file = tempfile.NamedTemporaryFile(
                dir=self.cache_dir, suffix='cache.sqlite.tmp', delete=False)
            if os.path.exists(self.cache_file):
                shutil.copy(self.cache_file, self.writeable_cache_file.name)
            self._get_cache(flag='c')

    def persist(self):
        if self.writeable_cache_file:
            self._cache.commit()
            os.rename(self.writeable_cache_file.name, self.cache_file)
            self.reopen_ro()

    def set(self, config_file, tool_source):
        try:
            if self.cache_file_is_writeable:
                self._make_writable()
                to_persist = {
                    'document': tool_source.to_string(),
                    'macro_paths': tool_source.macro_paths,
                    'paths_and_modtimes': tool_source.paths_and_modtimes(),
                    'tool_cache_version': CURRENT_TOOL_CACHE_VERSION,
                }
                try:
                    self._cache[config_file] = to_persist
                except RuntimeError:
                    log.debug("Tool document cache not writeable")
        except sqlite3.OperationalError:
            log.debug("Tool document cache unavailable")

    def delete(self, config_file):
        if self.cache_file_is_writeable:
            self._make_writable()
            try:
                del self._cache[config_file]
            except (KeyError, RuntimeError):
                pass

    def __del__(self):
        if self.writeable_cache_file:
            try:
                os.unlink(self.writeable_cache_file.name)
            except Exception:
                pass
Example #11
class GoogleDrive:
    auth_url = 'https://accounts.google.com/o/oauth2/v2/auth'
    token_url = 'https://www.googleapis.com/oauth2/v4/token'
    api_url = 'https://www.googleapis.com/drive/'
    redirect_url = 'urn:ietf:wg:oauth:2.0:oob'
    scopes = ['https://www.googleapis.com/auth/drive.readonly']

    def __init__(self, config, client_id: str, client_secret: str, token_path: str, cache_path: str):
        self.cfg = config
        self.client_id = client_id
        self.client_secret = client_secret
        self.token_path = token_path
        self.cache_path = cache_path
        self.cache = SqliteDict(self.cache_path, tablename='cache', encode=json.dumps, decode=json.loads,
                                autocommit=False)
        self.transcodes_cache = ExpiringDict(max_len=5000, max_age_seconds=2 * (60 * 60))
        self.token = self._load_token()
        self.token_refresh_lock = Lock()
        self.http = self._new_http_object()

    ############################################################
    # CORE CLASS METHODS
    ############################################################

    def get_auth_link(self):
        auth_url, state = self.http.authorization_url(self.auth_url, access_type='offline', prompt='select_account')
        return auth_url

    def exchange_code(self, code: str):
        token = self.http.fetch_token(self.token_url, code=code, client_secret=self.client_secret)
        if 'access_token' in token:
            self._token_saver(token)
        return self.token

    def query(self, path: str, method: str = 'GET', fetch_all_pages: bool = False, callbacks={}, **kwargs):
        resp: Response = None
        pages: int = 1
        resp_json = {}
        request_url = self.api_url + path.lstrip('/') if not path.startswith('http') else path

        try:
            while True:
                resp = self._do_query(request_url, method, **kwargs)
                log.debug(f"Request URL: {resp.url}")
                log.debug(f"Request ARG: {kwargs}")
                log.debug(f'Response Status: {resp.status_code} {resp.reason}')

                if 'stream' in kwargs and kwargs['stream']:
                    return True, resp, None

                if 'Content-Type' in resp.headers and 'json' in resp.headers['Content-Type']:
                    if fetch_all_pages:
                        resp_json.pop('nextPageToken', None)
                    new_json = resp.json()
                    # does this page have changes
                    extended_changes = False
                    changes = []
                    if 'changes' in new_json:
                        if 'changes' in resp_json:
                            changes.extend(resp_json['changes'])
                        changes.extend(new_json['changes'])
                        extended_changes = True

                    resp_json.update(new_json)
                    if extended_changes:
                        resp_json['changes'] = changes
                else:
                    return False if resp.status_code != 200 else True, resp, resp.text

                # call page_token_callback to update cached page_token, if specified
                if 'page_token_callback' in callbacks:
                    if 'nextPageToken' in resp_json:
                        callbacks['page_token_callback'](resp_json['nextPageToken'])
                    elif 'newStartPageToken' in resp_json:
                        callbacks['page_token_callback'](resp_json['newStartPageToken'])

                # call data_callback, fetch_all_pages is true
                if fetch_all_pages and 'data_callback' in callbacks:
                    callbacks['data_callback'](resp.json(), callbacks)

                # handle nextPageToken
                if fetch_all_pages and 'nextPageToken' in resp_json and resp_json['nextPageToken']:
                    # there are more pages
                    pages += 1
                    log.info("Fetching extra results from page %d", pages)
                    if 'params' in kwargs:
                        kwargs['params'].update({'pageToken': resp_json['nextPageToken']})
                    elif 'json' in kwargs:
                        kwargs['json'].update({'pageToken': resp_json['nextPageToken']})
                    elif 'data' in kwargs:
                        kwargs['data'].update({'pageToken': resp_json['nextPageToken']})
                    continue

                break

            return True if resp_json and len(resp_json) else False, resp, resp_json if (
                    resp_json and len(resp_json)) else resp.text

        except Exception:
            log.exception(f"Exception sending request to {request_url} with kwargs={kwargs}: ")
            return False, resp, None

    ############################################################
    # DRIVE FUNCTIONS
    ############################################################

    def validate_access_token(self):
        success, resp, data = self.query('/v3/changes/startPageToken',
                                         params={'supportsTeamDrives': self.cfg.google.teamdrive})
        if success and resp.status_code == 200:
            if 'startPageToken' not in data:
                log.error("Failed validate up to date access_token:\n\n%s\n", data)
                return False
            return True
        else:
            log.error("Error validating access token, status_code = %d, data =\n\n%s\n",
                      resp.status_code if resp is not None else 0, data)
        return False

    def get_changes(self, new_items_callback=None, removed_items_callback=None):
        callbacks = {'page_token_callback': self._page_token_saver,
                     'data_callback': self._process_changes}

        if new_items_callback:
            callbacks['new_items_callback'] = new_items_callback
        if removed_items_callback:
            callbacks['removed_items_callback'] = removed_items_callback

        success, resp, data = self.query('/v3/changes', params={
            'pageToken': self.token['page_token'] if 'page_token' in self.token else '1', 'pageSize': 1000,
            'includeRemoved': True,
            'includeTeamDriveItems': self.cfg.google.teamdrive,
            'supportsTeamDrives': self.cfg.google.teamdrive,
            'fields': 'changes(file(md5Checksum,mimeType,modifiedTime,'
                      'name,parents,teamDriveId,trashed),'
                      'fileId,removed,teamDrive(id,name),'
                      'teamDriveId),newStartPageToken,nextPageToken'}, fetch_all_pages=True,
                                         callbacks=callbacks)
        return

    def get_file(self, file_id, stream=True, headers=None, timeout=30):
        req_url = '/v2/files/%s' % file_id if not file_id.startswith('http') else file_id
        success, resp, data = self.query(req_url, params={
            'includeTeamDriveItems': self.cfg.google.teamdrive,
            'supportsTeamDrives': self.cfg.google.teamdrive,
            'alt': 'media'
        }, stream=stream, headers=headers, timeout=timeout)
        return resp

    def get_stream_link(self, file_id):
        # validate / refresh the current access_token
        if not self.validate_access_token():
            return ''
        log.debug("Validated access_token is current")

        # generate url
        req = Request('GET', f'{self.api_url.rstrip("/")}/v2/files/{file_id}',
                      params={'includeTeamDriveItems': self.cfg.google.teamdrive,
                              'supportsTeamDrives': self.cfg.google.teamdrive,
                              'alt': 'media',
                              'access_token': self.token['access_token']}).prepare()
        log.debug(f'Direct Stream URL: {req.url}')
        return req.url

    def get_transcodes(self, file_id):
        # do we have the transcoded versions already cached within the last 5 minutes?
        cached_transcodes = self.transcodes_cache.get(file_id, None)
        if cached_transcodes is not None and len(cached_transcodes):
            log.debug(f"Loaded {len(cached_transcodes)} transcode streams from temporary cache for: {file_id}")
            return cached_transcodes

        # retrieve transcoded versions from google docs
        success, resp, data = self.query(f'https://docs.google.com/get_video_info?docid={file_id}')
        if not success or (not data or 'fmt_stream_map' not in data or 'fmt_list' not in data):
            log.error(f"Failed to find transcoded versions data for: {file_id}")
            return None

        # parse main response
        tmp = parse_qs(data)
        tmp_versions = tmp['fmt_list'][0]
        tmp_stream_map = tmp['fmt_stream_map'][0]
        drive_stream_cookie = resp.cookies.get('DRIVE_STREAM', '')

        # parse required variables
        transcode_versions = {}
        transcode_streams = {}

        # parse version list
        for version in tmp_versions.split(','):
            tmp_v = version.split('/')
            transcode_versions[tmp_v[0]] = tmp_v[1].split('x')[1]

        if not len(transcode_versions):
            log.error(f"Failed to parse transcoded versions (fmt_list) for: {file_id}")
            return None

        # parse transcode lists
        for stream in tmp_stream_map.split(','):
            tmp_s = stream.split('|')
            transcode_streams[transcode_versions[tmp_s[0]]] = tmp_s[1]

        if not len(transcode_streams):
            log.error(f"Failed to parse transcoded streams (fmt_stream_map) for: {file_id}")
            return None

        # cache the transcode streams for 5 minutes
        self.transcodes_cache[file_id] = transcode_streams
        log.debug(f"Added {len(transcode_streams)} transcode streams to temporary cache for: {file_id}")
        return transcode_streams

    ############################################################
    # CACHE
    ############################################################

    def get_id_metadata(self, item_id, teamdrive_id=None):
        # return cache from metadata if available
        cached_metadata = self._get_cached_metdata(item_id)
        if cached_metadata:
            return True, cached_metadata

        # does item_id match teamdrive_id?
        if teamdrive_id is not None and item_id == teamdrive_id:
            success, resp, data = self.query('v3/teamdrives/%s' % str(item_id))
            if success and resp.status_code == 200 and 'name' in data:
                # we successfully retrieved this teamdrive's info, so place a mimeType key in the result
                # to flag that it needs to be cached
                data['mimeType'] = 'application/vnd.google-apps.folder'
        else:
            # retrieve file metadata
            success, resp, data = self.query('v3/files/%s' % str(item_id),
                                             params={
                                                 'supportsTeamDrives': self.cfg.google.teamdrive,
                                                 'fields': 'id,md5Checksum,mimeType,modifiedTime,name,parents,'
                                                           'trashed,teamDriveId'})
        if success and resp.status_code == 200:
            return True, data
        else:
            log.error("Error retrieving metadata for item %r:\n\n%s\n", item_id, data)
            return False, data

    def get_id_file_paths(self, item_id, teamdrive_id=None):
        file_paths = []
        added_to_cache = 0

        try:
            def get_item_paths(obj_id, path, paths, new_cache_entries, teamdrive_id=None):
                success, obj = self.get_id_metadata(obj_id, teamdrive_id)
                if not success:
                    return new_cache_entries

                teamdrive_id = teamdrive_id if 'teamDriveId' not in obj else obj['teamDriveId']

                # add the item object to the cache if we know it's not from the cache
                if 'mimeType' in obj:
                    # we know this is a new item fetched from the api, because the cache does not store this field
                    self.add_item_to_cache(obj['id'], obj['name'], [] if 'parents' not in obj else obj['parents'])
                    new_cache_entries += 1

                if path.strip() == '':
                    path = obj['name']
                else:
                    path = os.path.join(obj['name'], path)

                if 'parents' in obj and obj['parents']:
                    for parent in obj['parents']:
                        new_cache_entries += get_item_paths(parent, path, paths, new_cache_entries, teamdrive_id)

                if (not obj or 'parents' not in obj or not obj['parents']) and len(path):
                    paths.append(path)
                    return new_cache_entries
                return new_cache_entries

            added_to_cache += get_item_paths(item_id, '', file_paths, added_to_cache, teamdrive_id)
            if added_to_cache:
                log.debug("Dumping cache due to new entries!")
                self._dump_cache()

            if len(file_paths):
                return True, file_paths
            else:
                return False, file_paths

        except Exception:
            log.exception("Exception retrieving filepaths for '%s': ", item_id)

        return False, []

    def add_item_to_cache(self, item_id, item_name, item_parents):
        if item_id not in self.cache:
            log.debug("Added '%s' to cache: %s", item_id, item_name)
        self.cache[item_id] = {'name': item_name, 'parents': item_parents}
        return

    def remove_item_from_cache(self, item_id):
        if self.cache.pop(item_id, None):
            return True
        return False

    def get_item_name_from_cache(self, item_id):
        try:
            item = self.cache.get(item_id)
            return item['name'] if isinstance(item, dict) else 'Unknown'
        except Exception:
            pass
        return 'Unknown'

    def get_item_from_cache(self, item_id):
        try:
            item = self.cache.get(item_id, None)
            return item
        except Exception:
            pass
        return None

    ############################################################
    # INTERNALS
    ############################################################

    def _do_query(self, request_url: str, method: str, **kwargs):
        tries: int = 0
        max_tries: int = 2
        lock_acquirer: bool = False
        resp: Response = None
        use_timeout: int = 30

        # override default timeout
        if 'timeout' in kwargs and isinstance(kwargs['timeout'], int):
            use_timeout = kwargs['timeout']
            kwargs.pop('timeout', None)

        # remove un-needed kwargs
        kwargs.pop('fetch_all_pages', None)
        kwargs.pop('page_token_callback', None)

        # do query
        while tries < max_tries:
            if self.token_refresh_lock.locked() and not lock_acquirer:
                log.debug("Token refresh lock is currently acquired... trying again in 500ms")
                time.sleep(0.5)
                continue

            if method == 'POST':
                resp = self.http.post(request_url, timeout=use_timeout, **kwargs)
            elif method == 'PATCH':
                resp = self.http.patch(request_url, timeout=use_timeout, **kwargs)
            elif method == 'DELETE':
                resp = self.http.delete(request_url, timeout=use_timeout, **kwargs)
            else:
                resp = self.http.get(request_url, timeout=use_timeout, **kwargs)
            tries += 1

            if resp.status_code == 401 and tries < max_tries:
                # unauthorized error, let's refresh the token and retry
                self.token_refresh_lock.acquire(False)
                lock_acquirer = True
                log.warning(f"Unauthorized Response (Attempts {tries}/{max_tries})")
                self.token['expires_at'] = time() - 10
                self.http = self._new_http_object()
            else:
                break

        return resp

    def _load_token(self):
        try:
            if not os.path.exists(self.token_path):
                return {}

            with open(self.token_path, 'r') as fp:
                return json.load(fp)
        except Exception:
            log.exception(f"Exception loading token from {self.token_path}: ")
        return {}

    def _dump_token(self):
        try:
            with open(self.token_path, 'w') as fp:
                json.dump(self.token, fp, indent=2)
            return True
        except Exception:
            log.exception(f"Exception dumping token to {self.token_path}: ")
        return False

    def _token_saver(self, token: dict):
        # update internal token dict
        self.token.update(token)
        try:
            if self.token_refresh_lock.locked():
                self.token_refresh_lock.release()
        except Exception:
            log.exception("Exception releasing token_refresh_lock: ")
        self._dump_token()
        log.info("Renewed access token!")
        return

    def _page_token_saver(self, page_token: str):
        # update internal token dict
        self.token['page_token'] = page_token
        self._dump_token()
        return

    def _new_http_object(self):
        return OAuth2Session(client_id=self.client_id, redirect_uri=self.redirect_url, scope=self.scopes,
                             auto_refresh_url=self.token_url, auto_refresh_kwargs={'client_id': self.client_id,
                                                                                   'client_secret': self.client_secret},
                             token_updater=self._token_saver, token=self.token)

    def _get_cached_metdata(self, item_id):
        if item_id in self.cache:
            return self.cache[item_id]
        return None

    def _dump_cache(self):
        self.cache.commit()
        return

    def _remove_unwanted_paths(self, paths_list: list, mime_type: str):
        # remove paths that were not allowed - this is always enabled
        for item_path in copy(paths_list):
            allowed_path = False
            for allowed_file_path in self.cfg.google.allowed.file_paths:
                if item_path.lower().startswith(allowed_file_path.lower()):
                    allowed_path = True
                    break
            if not allowed_path:
                log.debug("Ignoring %r because its not an allowed path", item_path)
                paths_list.remove(item_path)
                continue

        # remove unallowed extensions
        if self.cfg.google.allowed.file_extensions:
            for item_path in copy(paths_list):
                allowed_file = False
                for allowed_extension in self.cfg.google.allowed.file_extensions_list:
                    if item_path.lower().endswith(allowed_extension.lower()):
                        allowed_file = True
                        break
                if not allowed_file:
                    log.debug("Ignoring %r because it was not an allowed extension", item_path)
                    paths_list.remove(item_path)

        # remove unallowed mimes
        if self.cfg.google.allowed.mime_types:
            allowed_file = False
            for allowed_mime in self.cfg.google.allowed.mime_types_list:
                if allowed_mime.lower() in mime_type.lower():
                    if 'video' in mime_type.lower():
                        # we want to validate this is not a .sub file, which, for some reason, Google reports as video/MP2G
                        double_checked_allowed = True
                        for item_path in paths_list:
                            if item_path.lower().endswith('.sub'):
                                double_checked_allowed = False
                        if double_checked_allowed:
                            allowed_file = True
                            break
                    else:
                        allowed_file = True
                        break

            if not allowed_file:
                log.debug("Ignoring %s because it was not an allowed mime: %s", paths_list, mime_type)
                for item_path in copy(paths_list):
                    paths_list.remove(item_path)

    def _process_changes(self, data: dict, callbacks: dict = {}):
        removed_file_paths = {}
        added_file_paths = {}
        if not data or 'changes' not in data:
            log.error("There were no changes to process")
            return
        log.info("Processing %d changes", len(data['changes']))

        # process changes
        for change in data['changes']:
            if 'file' in change and 'fileId' in change:
                # don't consider trashed/removed events for processing
                if ('trashed' in change['file'] and change['file']['trashed']) or (
                        'removed' in change and change['removed']):
                    # store the removed file paths - only if we have this item cached, otherwise we are not interested
                    # as we would not have stored it anyway...
                    item_exists = self.get_item_from_cache(change['fileId'])
                    if item_exists is not None:
                        success, item_paths = self.get_id_file_paths(change['fileId'],
                                                                     change['file']['teamDriveId'] if 'teamDriveId'
                                                                                                      in
                                                                                                      change['file']
                                                                     else None)
                        self._remove_unwanted_paths(item_paths, change['file']['mimeType'] if 'mimeType' in change[
                            'file'] else 'Unknown')
                        if success and len(item_paths):
                            if change['fileId'] in removed_file_paths:
                                removed_file_paths[change['fileId']].extend(item_paths)
                            else:
                                removed_file_paths[change['fileId']] = item_paths

                    # remove item from cache
                    if self.remove_item_from_cache(change['fileId']):
                        log.debug("Removed '%s' from cache: %s", change['fileId'], change['file']['name'])

                    continue

                existing_cache_item = self.get_item_from_cache(change['fileId'])
                existing_success, existing_cache_item_paths = self.get_id_file_paths(change['fileId'],
                                                                                     change['file']['teamDriveId']
                                                                                     if 'teamDriveId' in change[
                                                                                         'file'] else None) if \
                    existing_cache_item is not None else (None, None)

                # we always want to add changes to the cache so renames etc can be reflected inside the cache
                self.add_item_to_cache(change['fileId'], change['file']['name'],
                                       [] if 'parents' not in change['file'] else change['file']['parents'])

                # don't process folder events
                if 'mimeType' in change['file'] and 'vnd.google-apps.folder' in change['file']['mimeType']:
                    # ignore this change as we don't want to scan folders
                    continue

                # get this files paths
                success, item_paths = self.get_id_file_paths(change['fileId'],
                                                             change['file']['teamDriveId'] if 'teamDriveId' in change[
                                                                 'file'] else None)

                # remove unwanted paths
                if existing_success and len(existing_cache_item_paths):
                    self._remove_unwanted_paths(existing_cache_item_paths,
                                                change['file']['mimeType'] if 'mimeType' in change[
                                                    'file'] else 'Unknown')
                if success and len(item_paths):
                    self._remove_unwanted_paths(item_paths, change['file']['mimeType'] if 'mimeType' in change[
                        'file'] else 'Unknown')

                # was this an existing item?
                if (existing_cache_item is not None and existing_success and len(existing_cache_item_paths)) and (
                        success and len(item_paths)):
                    # this was an existing item, and we are re-processing it again
                    # we need to find the differences between the before and after paths.
                    existing_path_set = set(existing_cache_item_paths)
                    new_path_set = set(item_paths)

                    removed_item_paths = existing_path_set.difference(new_path_set)
                    added_item_paths = new_path_set.difference(existing_path_set)

                    if len(removed_item_paths):
                        if change['fileId'] in removed_file_paths:
                            removed_file_paths[change['fileId']].extend(list(removed_item_paths))
                        else:
                            removed_file_paths[change['fileId']] = list(removed_item_paths)
                    if len(added_item_paths):
                        if change['fileId'] in added_file_paths:
                            added_file_paths[change['fileId']].extend(list(added_item_paths))
                        else:
                            added_file_paths[change['fileId']] = list(added_item_paths)

                elif success and len(item_paths):
                    # these are new paths/files that were not already in the cache
                    if change['fileId'] in added_file_paths:
                        added_file_paths[change['fileId']].extend(item_paths)
                    else:
                        added_file_paths[change['fileId']] = item_paths

            elif 'teamDrive' in change and 'teamDriveId' in change:
                # this is a teamdrive change
                # don't consider trashed/removed events for processing
                if 'removed' in change and change['removed']:
                    # remove item from cache
                    if self.remove_item_from_cache(change['teamDriveId']):
                        log.info("Removed teamDrive '%s' from cache: %s", change['teamDriveId'],
                                 change['teamDrive']['name'] if 'name' in change[
                                     'teamDrive'] else 'Unknown teamDrive')
                    continue

                if 'id' in change['teamDrive'] and 'name' in change['teamDrive']:
                    # we always want to add changes to the cache so renames etc can be reflected inside the cache
                    self.add_item_to_cache(change['teamDrive']['id'], change['teamDrive']['name'], [])
                    continue

        # always dump the cache after running changes
        self._dump_cache()
        log.info('%d added / %d removed', len(added_file_paths), len(removed_file_paths))

        # call further callbacks
        if len(removed_file_paths) and 'removed_items_callback' in callbacks:
            callbacks['removed_items_callback'](removed_file_paths)
        if len(added_file_paths) and 'new_items_callback' in callbacks:
            callbacks['new_items_callback'](added_file_paths)

        return
Example #12
class SqliteDb:

    KEY_PREFIX = "slb:"
    KEY_SUBSCRIBED_CHANNELS = KEY_PREFIX + "subscribed_channels"
    KEY_NOTIFICATION_TASK_STORE = KEY_PREFIX + "notification_task_store"
    KEY_GUILD = KEY_PREFIX + "guild:{}"

    def __init__(self, sqlite_location: str):
        self.sqlite = SqliteDict(sqlite_location, autocommit=False)

    def init_defaults(self) -> None:
        if self.KEY_NOTIFICATION_TASK_STORE not in self.sqlite:
            self.sqlite[self.KEY_NOTIFICATION_TASK_STORE] = [False, ""]

        if self.KEY_SUBSCRIBED_CHANNELS not in self.sqlite:
            self.sqlite[self.KEY_SUBSCRIBED_CHANNELS] = set()

        self.sqlite.commit()

    def get_notification_task_store(self) -> Tuple[bool, Dict]:
        return self.sqlite[self.KEY_NOTIFICATION_TASK_STORE]

    def set_notification_task_store(self, ls_notif_sent: bool,
                                    li_embed_dict: Dict) -> None:
        self.sqlite[self.KEY_NOTIFICATION_TASK_STORE] = [
            ls_notif_sent, li_embed_dict
        ]
        self.sqlite.commit()

    def set_guild_mentions(self, guild_id: int, to_mention: str) -> None:
        self.sqlite[self.KEY_GUILD.format(guild_id)] = to_mention
        self.sqlite.commit()

    def get_guild_mentions(self, guild_id: int) -> str:
        return self.sqlite.get(self.KEY_GUILD.format(guild_id), "")

    def delete_guild_mentions(self, guild_id: int) -> int:
        if self.KEY_GUILD.format(guild_id) in self.sqlite:
            del self.sqlite[self.KEY_GUILD.format(guild_id)]
            self.sqlite.commit()
            return 1
        return 0

    def get_subbed_channels(self) -> Set[int]:
        return self.sqlite[self.KEY_SUBSCRIBED_CHANNELS]

    def add_subbed_channel(self, channel_id: int) -> int:
        channels = self.sqlite[self.KEY_SUBSCRIBED_CHANNELS]
        if channel_id not in channels:
            channels.add(channel_id)
            self.sqlite[self.KEY_SUBSCRIBED_CHANNELS] = channels
            self.sqlite.commit()
            return 1
        return 0

    def remove_subbed_channel(self, channel_id: int) -> int:
        channels = self.sqlite[self.KEY_SUBSCRIBED_CHANNELS]
        if channel_id in channels:
            channels.remove(channel_id)
            self.sqlite[self.KEY_SUBSCRIBED_CHANNELS] = channels
            self.sqlite.commit()
            return 1
        return 0

    def remove_subbed_channels(self, channels_to_remove: Set[int]) -> None:
        self.sqlite[self.KEY_SUBSCRIBED_CHANNELS] = (
            self.sqlite[self.KEY_SUBSCRIBED_CHANNELS] - channels_to_remove)
        self.sqlite.commit()

    def subbed_channels_count(self) -> int:
        return len(self.sqlite[self.KEY_SUBSCRIBED_CHANNELS])

    def stop(self):
        self.sqlite.commit()
        self.sqlite.close()
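A minimal usage sketch for SqliteDb above; the database path and ids are hypothetical:

db = SqliteDb('slb.sqlite')
db.init_defaults()
db.add_subbed_channel(123456789)
db.set_guild_mentions(42, '@here')
print(db.get_subbed_channels())      # {123456789}
print(db.get_guild_mentions(42))     # '@here'
db.stop()                            # commit and close the underlying SqliteDict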
Example #13
class MockBinanceManager(BinanceAPIManager):
    def __init__(
        self,
        config: Config,
        db: Database,
        logger: Logger,
        start_date: datetime = None,
        start_balances: Dict[str, float] = None,
    ):
        super().__init__(config, db, logger)
        self.config = config
        self.datetime = start_date or datetime(2021, 1, 1)
        self.balances = start_balances or {config.BRIDGE.symbol: 100}
        self.cache = SqliteDict("data/backtest_cache.db")

    def setup_websockets(self):
        pass  # No websockets are needed for backtesting

    def increment(self, interval=1):
        self.datetime += timedelta(minutes=interval)

    def get_fee(self, origin_coin: Coin, target_coin: Coin, selling: bool):
        return 0.0075

    def get_ticker_price(self, ticker_symbol: str):
        """
        Get ticker price of a specific coin
        """
        target_date = self.datetime.replace(second=0, microsecond=0)
        target_date_str = self.datetime.isoformat(timespec="seconds")

        key = f"{ticker_symbol} - {target_date}"
        val = self.cache.get(key, None)

        if val is None:
            end_date = self.datetime + timedelta(minutes=1000)
            if end_date > datetime.now():
                end_date = datetime.now()
            end_date_str = end_date.isoformat(timespec="seconds")

            self.logger.info(
                f"Fetching prices for {ticker_symbol} between {target_date} and {end_date}"
            )

            # Use internal binance_client method because the public one doesn't
            # actually pass on limits.
            results = self.binance_client._historical_klines(
                ticker_symbol,
                "1m",
                start_str=target_date_str,
                end_str=end_date_str,
                limit=1000)

            prices = {}
            for result in results:
                result_date = datetime.utcfromtimestamp(result[0] / 1000)
                result_date = result_date.replace(second=0, microsecond=0)

                price = float(result[1])
                prices[f"{ticker_symbol} - {result_date}"] = price

            # Verify all intervals were returned; explicitly mark missing ones
            # so we can skip refetching them later.
            for verify_date in (target_date + timedelta(minutes=n)
                                for n in range(1000)):
                verify_key = f"{ticker_symbol} - {verify_date}"
                self.cache[verify_key] = prices.get(verify_key, "MISSING")

            self.cache.commit()
            val = self.cache.get(key, None)

        if val == "MISSING":
            return None

        return val

    def get_currency_balance(self, currency_symbol: str, force=False):
        """
        Get balance of a specific coin
        """
        return self.balances.get(currency_symbol, 0)

    def buy_alt(self, origin_coin: Coin, target_coin: Coin):
        origin_symbol = origin_coin.symbol
        target_symbol = target_coin.symbol

        target_balance = self.get_currency_balance(target_symbol)
        from_coin_price = self.get_ticker_price(origin_symbol + target_symbol)

        order_quantity = self._buy_quantity(origin_symbol, target_symbol,
                                            target_balance, from_coin_price)
        target_quantity = order_quantity * from_coin_price
        self.balances[target_symbol] -= target_quantity
        self.balances[origin_symbol] = self.balances.get(
            origin_symbol, 0) + order_quantity * (
                1 - self.get_fee(origin_coin, target_coin, False))
        self.logger.info(
            f"Bought {origin_symbol}, balance now: {self.balances[origin_symbol]} - bridge: "
            f"{self.balances[target_symbol]}")

        event = defaultdict(lambda: None,
                            order_price=from_coin_price,
                            cumulative_quote_asset_transacted_quantity=0)

        return BinanceOrder(event)

    def sell_alt(self, origin_coin: Coin, target_coin: Coin):
        origin_symbol = origin_coin.symbol
        target_symbol = target_coin.symbol

        origin_balance = self.get_currency_balance(origin_symbol)
        from_coin_price = self.get_ticker_price(origin_symbol + target_symbol)

        order_quantity = self._sell_quantity(origin_symbol, target_symbol,
                                             origin_balance)
        target_quantity = order_quantity * from_coin_price
        self.balances[target_symbol] = self.balances.get(
            target_symbol, 0) + target_quantity * (
                1 - self.get_fee(origin_coin, target_coin, True))
        self.balances[origin_symbol] -= order_quantity
        self.logger.info(
            f"Sold {origin_symbol}, balance now: {self.balances[origin_symbol]} - bridge: "
            f"{self.balances[target_symbol]}")
        return {"price": from_coin_price}

    def collate_coins(self, target_symbol: str):
        total = 0
        for coin, balance in self.balances.items():
            if coin == target_symbol:
                total += balance
                continue
            if coin == self.config.BRIDGE.symbol:
                price = self.get_ticker_price(target_symbol + coin)
                if price is None:
                    continue
                total += balance / price
            else:
                price = self.get_ticker_price(coin + target_symbol)
                if price is None:
                    continue
                total += price * balance
        return total

    def close(self):
        self.cache.close()
Example #14
import os
import pandas as pd
from math import inf
from sqlitedict import SqliteDict
from statsmodels.tsa.ar_model import AutoReg

cache = SqliteDict('changed.db', autocommit=True)
# A US manufacturer buys raw materials in multiple currencies
purchases = pd.read_excel('Purchases.xlsx')

# For each of those currencies, find the best model to forecast prices
best_model = {}
for currency in purchases.currency:
    print('Currency', currency)
    file_time = os.stat(f'{currency}.xlsx').st_mtime
    if cache.get(currency, (0, 0))[0] < file_time:
        data = pd.read_excel(f'{currency}.xlsx')
        data = data[data[currency] > 0]
        best_aic, best_fit = inf, None
        for lags in (3, 5, 7, 10, 14, 28, 60, 90, 120, 183, 365, 730, 1095):
            print('    Lag', lags)
            model = AutoReg(data[currency], lags=lags)
            fit = model.fit()
            if fit.aic < best_aic:
                best_aic, best_fit = fit.aic, fit
        cache[currency] = (file_time, best_fit)
    best_model[currency] = cache[currency]

# Estimate next month's price increase assuming the same volume as today
forecasted_value = 0
for index, row in purchases.iterrows():
Example #15
                "author": line["author"],
                "subreddit": line["subreddit"].lower() if k == "all" else k,
                "timestamp": int(line["created_utc"])
            }
            val = tmp_dict.get(subreddit, [])
            val.append(dict_val)
            tmp_dict[subreddit] = val

        if count % 1000000 == 0:
            print(datetime.datetime.now() - now, len(list(tmp_dict.keys())))
            now = datetime.datetime.now()

c = 0
for key, item in tmp_dict.items():
    c += 1
    try:
        val = dict_db.get(key, [])
        val += item
        dict_db[key] = val
    except:
        print("ERROR", key)
        continue

    if c % 1000000 == 0:
        print(datetime.datetime.now() - now)
        now = datetime.datetime.now()
        dict_db.commit()

dict_db.commit()
dict_db.close()
Example #16
import pandas as pd
from math import inf
from sqlitedict import SqliteDict
from statsmodels.tsa.ar_model import AutoReg

cache = SqliteDict('precompute.db', autocommit=True)
# A US manufacturer buys raw materials in multiple currencies
purchases = pd.read_excel('Purchases.xlsx')

# For each of those currencies, find the best model to forecast prices
best_model = {}
for currency in purchases.currency:
    print('Currency', currency)
    data = pd.read_excel(f'{currency}.xlsx')
    data = data[data[currency] > 0]
    best_aic, best_fit, best_lags = inf, None, None
    check_lags = cache.get(
        currency, (3, 5, 7, 10, 14, 28, 60, 90, 120, 183, 365, 730, 1095))
    for lags in check_lags:
        print('    Lags', lags)
        model = AutoReg(data[currency], lags=lags)
        fit = model.fit()
        if fit.aic < best_aic:
            best_aic, best_fit, best_lags = fit.aic, fit, lags
    cache[currency] = (best_lags, )
    best_model[currency] = best_fit

# Estimate next month's price increase assuming the same volume as today
forecasted_value = 0
for index, row in purchases.iterrows():
    fit = best_model[row.currency]
    prices = fit.predict(fit.model.nobs, fit.model.nobs + 30)
    change = prices.iloc[-1] / prices.iloc[0]
Example #17
class Twizzle(object):
    """Twizzle multi purpose benchmarking system -- base class
    """
    def __init__(self, sDBPath):
        """Constructor of the Twizzle class

        Note:
            Please pass the path of the SQLite database
            as a parameter
        Args:
            sDBPath (str): Path to the SQLite database.
        """
        if sDBPath is None:
            raise Exception("Path to SQL-Database has to be defined")
        self._db = SqliteDict(sDBPath)

    def add_challenge(self,
                      sName,
                      aOriginalObjects,
                      aComparativeObjects,
                      aTargetDecisions,
                      dicMetadata={}):
        """Adds a challenge under the given name to the database

        Note:
            The three lists describe a table of the following format:

            | Original object | Comparative object | target decision |
            |----------------|-------------------|-----------------|
            | Img1.png       | Img1_scaled.png   | True            |
            | Img2.png       | Img2_brighter.png | True            |
            | Img2.png       | Img9.png          | False           |


        Args:
            sName (str): the name of the challenge.
            aOriginalObjects (:obj:`list` of :obj:`str`): List of paths of the original objects
            aComparativeObjects (:obj:`list` of :obj:`str`): List of paths of the objects that should be compared to
                                                            the original objects at the same position in the list
            aTargetDecisions (:obj:`list` of :obj:`bool`): List of booleans defining whether the objects linked in aOriginalObjects
                                                            and aComparativeObjects at the same position in the list are
                                                            the same (True) or not (False)
            dicMetadata (:obj:`dict`): a dict defining metadata for the challenge, e.g. which printer was used or which kind of
                                attack was performed with which parameters

        Returns:
            None
        """

        # catch wrong parameters
        if (not sName) or (aOriginalObjects is None) or (
                aComparativeObjects is None) or (aTargetDecisions is None):
            raise Exception("Parameters can not be None.")
        if not (len(aOriginalObjects) == len(aComparativeObjects) ==
                len(aTargetDecisions)):
            raise Exception(
                "Objects sets and target decisions have to have the same amount of entries."
            )
        if not (all(isinstance(x, str) for x in aOriginalObjects)
                and all(isinstance(x, str) for x in aComparativeObjects)):
            raise Exception(
                "All objects have to be defined as path given as string.")
        # accept either a list of plain bools or a boolean numpy array
        bIsBoolList = all(isinstance(x, bool) for x in aTargetDecisions)
        bIsBoolArray = isinstance(aTargetDecisions, np.ndarray) and \
            aTargetDecisions.dtype == np.dtype("bool")
        if not (bIsBoolList or bIsBoolArray):
            raise Exception("The target decisions have to be boolean only.")

        # get current challenges from database
        aChallenges = self._db.get(DB_CHALLENGES_KEY, [])

        # test whether name was used before
        aChallengesSameName = [
            ch for ch in aChallenges if ch["challenge"] == sName
        ]
        if len(aChallengesSameName) != 0:
            raise Exception(
                "Challenge name %s is already in use. Define an other one. Aborting."
                % sName)

        # append new challenge
        dicChallenge = {
            "challenge": sName,
            "originalObjects": aOriginalObjects,
            "comparativeObjects": aComparativeObjects,
            "targetDecisions": aTargetDecisions
        }
        # adding additional information if given
        if dicMetadata:
            dicChallenge = {**dicMetadata, **dicChallenge}
        aChallenges.append(dicChallenge)
        self._db[DB_CHALLENGES_KEY] = aChallenges
        self._db.commit()

    def del_challenge(self, sName):
        """ deletes an existing challenge by its name

        Args:
            sName (str): the name of the challenge to be deleted

        Returns:
            None
        """

        # get current challenges from database
        aChallenges = self._db.get(DB_CHALLENGES_KEY, [])
        aMatches = [ch for ch in aChallenges if ch["challenge"] == sName]
        if len(aMatches) == 0:
            raise Exception("No challenge named %s found." % sName)

        # remove element
        aChallenges.remove(aMatches[0])

        # save new db
        self._db[DB_CHALLENGES_KEY] = aChallenges
        self._db.commit()

    def get_challenges(self):
        """ getting a list of all defined challenges

        Returns:
            :obj:`list` of :obj:`dict`: List of all defined challenges
        """
        return self._db.get(DB_CHALLENGES_KEY, [])

    def get_challenge(self, sChallengeName):
        """ getting a single challenge object

        Args:
            sChallengeName (str): the name of the challenge to get

        Returns:
            :obj:`dict`: Object defining the challenge with the name sChallengeName
        """
        aChallenges = self._db.get(DB_CHALLENGES_KEY, [])
        aMatches = [
            ch for ch in aChallenges if ch["challenge"] == sChallengeName
        ]
        if len(aMatches) == 0:
            raise Exception("No challenge with name %s found." %
                            sChallengeName)
        return aMatches[0]

    def clear_challenges(self):
        """ clears all challenge entries from the database """
        self._db[DB_CHALLENGES_KEY] = []
        self._db.commit()

    def run_test(self,
                 sChallengeName,
                 fnCallback,
                 dicCallbackParameters={},
                 autosave_to_db=False):
        """ run single challenge as test using given callback function and optional params

        Note:
            fnCallback has to fulfill the following specification

            Parameters:
            fnCallback(aOriginalObjects, aComparativeObjects, **dicCallbackParameters)
            - aOriginalObjects: list of strings describing paths to original objects
            - aComparativeObjects: list of strings describing paths to comparative objects
            ... arbitrary number of further parameters

            Returns:
            aDecisions, dicAdditionalInformation = fnCallback(...)
            - aDecisions: list of boolean decisions describing whether the algorithm has decided that the original object
                          and the comparative objects are the same (True) or not (False)
            - dicAdditionalInformation: the algorithm can supply additional information that can be used in the evaluation
                                        later on to compare different settings


        Args:
            sChallengeName (str): the challenge that should be executed
            fnCallback (function): Pointer to wrapper-function that tests a challenge on a specific algorithm
                                    and makes decisions whether the objects are the same or not depending on its decision algorithm
            dicCallbackParameters (:obj:`dict`): Dictionary defining parameters for the function in fnCallback
            autosave_to_db (bool): if True, the resulting test object is automatically saved to the database

        Returns:
            dicTest: dictionary of test results that can be saved to db
        """
        if not sChallengeName or not fnCallback:
            raise Exception("Parameters are not allowed to be None.")

        dicChallenge = self.get_challenge(sChallengeName)
        sChallengeName = dicChallenge["challenge"]
        aOriginalObjects = dicChallenge["originalObjects"]
        aComparativeObjects = dicChallenge["comparativeObjects"]
        aTargetDecisions = dicChallenge["targetDecisions"]

        # run challenge
        aDecisions, dicAdditionalInformation = fnCallback(
            aOriginalObjects, aComparativeObjects, **dicCallbackParameters)

        # check whether the number of decisions matches the number of object pairs
        if len(aDecisions) != len(aTargetDecisions):
            raise Exception(
                "The array of decisions is not the same size as the given set of objects. Aborting."
            )

        # calculate rates
        lTP = np.sum(np.logical_and(aDecisions, aTargetDecisions))
        lTN = np.sum(
            np.logical_and(np.logical_not(aDecisions),
                           np.logical_not(aTargetDecisions)))
        lFP = np.sum(
            np.logical_and(aDecisions, np.logical_not(aTargetDecisions)))
        lFN = np.sum(
            np.logical_and(np.logical_not(aDecisions), aTargetDecisions))

        # True positive rate / Recall -- robustness in PIH
        dTPR = lTP / (lTP + lFN) if ((lTP + lFN) > 0) else 0.
        # True negative rate -- Specificity
        dTNR = lTN / (lTN + lFP) if ((lTN + lFP) > 0) else 0.
        # False positive rate / FAR
        dFPR = 1 - dTNR
        # False negative rate / FRR
        dFNR = 1 - dTPR

        dAccuracy = (lTP + lTN) / (lTP + lTN + lFP + lFN)
        dPrecision = lTP / (lTP + lFP) if ((lTP + lFP) > 0.) else 0.
        dF1score = 2 * ((dPrecision * dTPR) / (dPrecision + dTPR)) if (
            (dPrecision + dTPR) > 0) else 0.

        # fill test object
        dicTest = dicAdditionalInformation
        dicTest["challenge"] = sChallengeName
        #dicTest["TP"] = lTP
        #dicTest["TN"] = lTN
        #dicTest["FP"] = lFP
        #dicTest["FN"] = lFN
        dicTest["TPR"] = dTPR  #Recall
        dicTest["TNR"] = dTNR
        dicTest["FPR"] = dFPR  # FAR
        dicTest["FNR"] = dFNR  #FRR

        dicTest["Accuracy"] = dAccuracy
        dicTest["Precision"] = dPrecision
        dicTest["F1_score"] = dF1score

        # save test in db
        if autosave_to_db:
            self.__save_test(dicTest)

        return dicTest

    def __save_test(self, dicTest):
        """ saves a test object to the database"""
        if not dicTest:
            raise Exception("Test object must not be None.")

        aTests = self._db.get(DB_TESTS_KEY, [])
        aTests.append(dicTest)
        self._db[DB_TESTS_KEY] = aTests
        self._db.commit()

    def save_test_threadsafe(self, dicTest, lock):
        """ saves a test object to the database in a thread-safe way"""
        with lock:
            self.__save_test(dicTest)

    def get_tests(self):
        """getting all tests

        Returns:
            :obj:`list` of :obj:`dict`:  List of all tests executed
        """
        return self._db.get(DB_TESTS_KEY, [])

    def clear_tests(self):
        """ delete all tests from the database """
        self._db[DB_TESTS_KEY] = []
        self._db.commit()
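
A minimal usage sketch for the challenge database above (hedged: the class name `Twizzle` is taken from its constructor docstring, and `my_hash_compare`, the image paths and the metadata values are made-up placeholders):

# toy callback fulfilling the fnCallback contract of run_test()
def my_hash_compare(aOriginalObjects, aComparativeObjects, threshold=0.2):
    # decide "same" whenever the original file name prefix appears in the comparative path
    aDecisions = [sOrig.split(".")[0] in sComp
                  for sOrig, sComp in zip(aOriginalObjects, aComparativeObjects)]
    return aDecisions, {"algorithm": "toy_prefix_match", "threshold": threshold}

tw = Twizzle("challenges.sqlite")
tw.add_challenge("scaling_attack",
                 ["Img1.png", "Img2.png", "Img2.png"],
                 ["Img1_scaled.png", "Img2_brighter.png", "Img9.png"],
                 [True, True, False],
                 dicMetadata={"attack": "scaling"})
dicResult = tw.run_test("scaling_attack", my_hash_compare,
                        {"threshold": 0.2}, autosave_to_db=True)
print(dicResult["Accuracy"], dicResult["F1_score"])
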
class SimServer(object):
    """
    Top-level functionality for similarity services. A similarity server takes
    care of::

    1. creating semantic models
    2. indexing documents using these models
    3. finding the most similar documents in an index.

    An object of this class can be shared across network via Pyro, to answer remote
    client requests. It is thread safe. Using a server concurrently from multiple
    processes is safe for reading = answering similarity queries. Modifying
    (training/indexing) is realized via locking = serialized internally.
    """
    def __init__(self, basename, use_locks=False):
        """
        All data will be stored under directory `basename`. If there is a server
        there already, it will be loaded (resumed).

        The server object is stateless in RAM -- its state is defined entirely by its location.
        There is therefore no need to store the server object.
        """
        if not os.path.isdir(basename):
            raise ValueError("%r must be a writable directory" % basename)
        self.basename = basename
        self.use_locks = use_locks

        self.lock_update = threading.RLock(
        ) if use_locks else gensim.utils.nocm
        #self.lock_update = threading._RLock if use_locks else gensim.utils.nocm
        #self.lock_update = RLock() if use_locks else gensim.utils.nocm

        try:
            self.fresh_index = SimIndex.load(self.location('index_fresh'))
        except:
            logger.debug("starting a new fresh index")
            self.fresh_index = None
        try:
            self.opt_index = SimIndex.load(self.location('index_opt'))
        except:
            logger.debug("starting a new optimized index")
            self.opt_index = None
        try:
            self.model = SimModel.load(self.location('model'))
        except:
            self.model = None
        self.payload = SqliteDict(self.location('payload'),
                                  autocommit=True,
                                  journal_mode=JOURNAL_MODE)
        self.flush(save_index=False, save_model=False, clear_buffer=True)
        logger.info("loaded %s" % self)

    def location(self, name):
        return os.path.join(self.basename, name)

    @gensim.utils.synchronous('lock_update')
    def flush(self, save_index=False, save_model=False, clear_buffer=False):
        """Commit all changes, clear all caches."""
        if save_index:
            if self.fresh_index is not None:
                self.fresh_index.save(self.location('index_fresh'))
            if self.opt_index is not None:
                self.opt_index.save(self.location('index_opt'))
        if save_model:
            if self.model is not None:
                self.model.save(self.location('model'))
        self.payload.commit()
        if clear_buffer:
            if hasattr(self, 'fresh_docs'):
                try:
                    self.fresh_docs.terminate(
                    )  # erase all buffered documents + file on disk
                except:
                    pass
            self.fresh_docs = SqliteDict(
                journal_mode=JOURNAL_MODE
            )  # buffer defaults to a random location in temp
        self.fresh_docs.sync()

    def close(self):
        """Explicitly close open file handles, databases etc."""
        try:
            self.payload.close()
        except:
            pass
        try:
            self.model.close()
        except:
            pass
        try:
            self.fresh_index.close()
        except:
            pass
        try:
            self.opt_index.close()
        except:
            pass
        try:
            self.fresh_docs.terminate()
        except:
            pass

    def __del__(self):
        """When the server went out of scope, make an effort to close its DBs."""
        self.close()

    @gensim.utils.synchronous('lock_update')
    def buffer(self, documents):
        """
        Add a sequence of documents to be processed (indexed or trained on).

        Here, the documents are simply collected; real processing is done later,
        during the `self.index` or `self.train` calls.

        `buffer` can be called repeatedly; the result is the same as if it was
        called once, with a concatenation of all the partial document batches.
        The point is to save memory when sending large corpora over network: the
        entire `documents` must be serialized into RAM. See `utils.upload_chunked()`.

        A call to `flush()` clears this documents-to-be-processed buffer (`flush`
        is also implicitly called when you call `index()` and `train()`).
        """
        logger.info("adding documents to temporary buffer of %s" % (self))
        for doc in documents:
            docid = doc['id']
            #            logger.debug("buffering document %r" % docid)
            if docid in self.fresh_docs:
                logger.warning("asked to re-add id %r; rewriting old value" %
                               docid)
            self.fresh_docs[docid] = doc
        self.fresh_docs.sync()

    @gensim.utils.synchronous('lock_update')
    def train(self,
              corpus=None,
              method='auto',
              clear_buffer=True,
              params=None):
        """
        Create an indexing model. Will overwrite the model if it already exists.
        All indexes become invalid, because documents in them use a now-obsolete
        representation.

        The model is trained on documents previously entered via `buffer`,
        or directly on `corpus`, if specified.
        """
        if corpus is not None:
            # use the supplied corpus only (erase existing buffer, if any)
            self.flush(clear_buffer=True)
            self.buffer(corpus)
        if not self.fresh_docs:
            msg = "train called but no training corpus specified for %s" % self
            logger.error(msg)
            raise ValueError(msg)
        if method == 'auto':
            numdocs = len(self.fresh_docs)
            if numdocs < 1000:
                logging.warning(
                    "too few training documents; using simple log-entropy model instead of latent semantic indexing"
                )
                method = 'logentropy'
            else:
                method = 'lsi'
        if params is None:
            params = {}
        self.model = SimModel(self.fresh_docs, method=method, params=params)
        self.flush(save_model=True, clear_buffer=clear_buffer)

    @gensim.utils.synchronous('lock_update')
    def index(self, corpus=None, clear_buffer=True):
        """
        Permanently index all documents previously added via `buffer`, or
        directly index documents from `corpus`, if specified.

        The indexing model must already exist (see `train`) before this function
        is called.
        """
        if not self.model:
            msg = 'must initialize model for %s before indexing documents' % self.basename
            logger.error(msg)
            raise AttributeError(msg)

        if corpus is not None:
            # use the supplied corpus only (erase existing buffer, if any)
            self.flush(clear_buffer=True)
            self.buffer(corpus)

        if not self.fresh_docs:
            msg = "index called but no indexing corpus specified for %s" % self
            logger.error(msg)
            raise ValueError(msg)

        if not self.fresh_index:
            logger.info("starting a new fresh index for %s" % self)
            self.fresh_index = SimIndex(self.location('index_fresh'),
                                        self.model.num_features)
        self.fresh_index.index_documents(self.fresh_docs, self.model)
        if self.opt_index is not None:
            self.opt_index.delete(self.fresh_docs.keys())
        logger.info("storing document payloads")
        for docid in self.fresh_docs:
            payload = self.fresh_docs[docid].get('payload', None)
            if payload is None:
                # HACK: exit on first doc without a payload (=assume all docs have payload, or none does)
                break
            self.payload[docid] = payload
        self.flush(save_index=True, clear_buffer=clear_buffer)

    @gensim.utils.synchronous('lock_update')
    def optimize(self):
        """
        Precompute top similarities for all indexed documents. This speeds up
        `find_similar` queries by id (but not queries by fulltext).

        Internally, documents are moved from a fresh index (=no precomputed similarities)
        to an optimized index (precomputed similarities). Similarity queries always
        query both indexes, so this split is transparent to clients.

        If you add documents later via `index`, they go to the fresh index again.
        To precompute top similarities for these new documents too, simply call
        `optimize` again.

        """
        if self.fresh_index is None:
            logger.warning("optimize called but there are no new documents")
            return  # nothing to do!

        if self.opt_index is None:
            logger.info("starting a new optimized index for %s" % self)
            self.opt_index = SimIndex(self.location('index_opt'),
                                      self.model.num_features)

        self.opt_index.merge(self.fresh_index)
        self.fresh_index.terminate()  # delete old files
        self.fresh_index = None
        self.flush(save_index=True)

    @gensim.utils.synchronous('lock_update')
    def drop_index(self, keep_model=True):
        """Drop all indexed documents. If `keep_model` is False, also dropped the model."""
        modelstr = "" if keep_model else "and model "
        logger.info("deleting similarity index " + modelstr +
                    "from %s" % self.basename)

        # delete indexes
        for index in [self.fresh_index, self.opt_index]:
            if index is not None:
                index.terminate()
        self.fresh_index, self.opt_index = None, None

        # delete payload
        if self.payload is not None:
            self.payload.close()

            fname = self.location('payload')
            try:
                if os.path.exists(fname):
                    os.remove(fname)
                    logger.info("deleted %s" % fname)
            except Exception as e:
                logger.warning("failed to delete %s: %s" % (fname, e))
        self.payload = SqliteDict(self.location('payload'),
                                  autocommit=True,
                                  journal_mode=JOURNAL_MODE)

        # optionally, delete the model as well
        if not keep_model and self.model is not None:
            self.model.close()
            fname = self.location('model')
            try:
                if os.path.exists(fname):
                    os.remove(fname)
                    logger.info("deleted %s" % fname)
            except Exception as e:
                logger.warning("failed to delete %s: %s" % (fname, e))
            self.model = None
        self.flush(save_index=True, save_model=True, clear_buffer=True)

    @gensim.utils.synchronous('lock_update')
    def delete(self, docids):
        """Delete specified documents from the index."""
        logger.info("asked to drop %i documents" % len(docids))
        for index in [self.opt_index, self.fresh_index]:
            if index is not None:
                index.delete(docids)
        self.flush(save_index=True)

    def is_locked(self):
        #return self.use_locks and self.lock_update._RLock__count > 0
        #return self.use_locks
        #return self.use_locks and self.lock_update._RLock._count > 0
        return self.use_locks and self.lock_update._count > 0

    def vec_by_id(self, docid):
        for index in [self.opt_index, self.fresh_index]:
            if index is not None and docid in index:
                return index.vec_by_id(docid)

    def find_similar(self, doc, min_score=0.0, max_results=100):
        """
        Find `max_results` most similar articles in the index, each having similarity
        score of at least `min_score`. The resulting list may be shorter than `max_results`,
        in case there are not enough matching documents.

        `doc` is either a string (=document id, previously indexed) or a
        dict containing a 'tokens' key. These tokens are processed to produce a
        vector, which is then used as a query against the index.

        The similar documents are returned in decreasing similarity order, as
        `(doc_id, similarity_score, doc_payload)` 3-tuples. The payload returned
        is identical to what was supplied for this document during indexing.

        """
        logger.debug("received query call with %r" % doc)
        if self.is_locked():
            msg = "cannot query while the server is being updated"
            logger.error(msg)
            raise RuntimeError(msg)
        sims_opt, sims_fresh = None, None
        for index in [self.fresh_index, self.opt_index]:
            if index is not None:
                index.topsims = max_results
        if isinstance(doc, str):
            # query by direct document id
            docid = doc
            if self.opt_index is not None and docid in self.opt_index:
                sims_opt = self.opt_index.sims_by_id(docid)
                if self.fresh_index is not None:
                    vec = self.opt_index.vec_by_id(docid)
                    sims_fresh = self.fresh_index.sims_by_vec(vec,
                                                              normalize=False)
            elif self.fresh_index is not None and docid in self.fresh_index:
                sims_fresh = self.fresh_index.sims_by_id(docid)
                if self.opt_index is not None:
                    vec = self.fresh_index.vec_by_id(docid)
                    sims_opt = self.opt_index.sims_by_vec(vec, normalize=False)
            else:
                raise ValueError("document %r not in index" % docid)
        else:
            if 'topics' in doc:
                # user supplied vector directly => use that
                vec = gensim.matutils.any2sparse(doc['topics'])
            else:
                # query by an arbitrary text (=tokens) inside doc['tokens']
                vec = self.model.doc2vec(
                    doc)  # convert document (text) to vector
            if self.opt_index is not None:
                sims_opt = self.opt_index.sims_by_vec(vec)
            if self.fresh_index is not None:
                sims_fresh = self.fresh_index.sims_by_vec(vec)

        merged = merge_sims(sims_opt, sims_fresh)
        logger.debug(
            "got %s raw similars, pruning with max_results=%s, min_score=%s" %
            (len(merged), max_results, min_score))
        result = []
        for docid, score in merged:
            if score < min_score or 0 < max_results <= len(result):
                break
            result.append((docid, float(score), self.payload.get(docid, None)))
        return result

    def find_dissimilar(self, doc, max_score=1.0, max_results=100):
        """
        Find `max_results` most dissimilar articles in the index, each having similarity
        score of at most `max_score`. The resulting list may be shorter than `max_results`,
        in case there are not enough matching documents.

        `doc` is either a string (=document id, previously indexed) or a
        dict containing a 'tokens' key. These tokens are processed to produce a
        vector, which is then used as a query against the index.

        The dissimilar documents are returned in increasing similarity order, as
        `(doc_id, similarity_score, doc_payload)` 3-tuples. The payload returned
        is identical to what was supplied for this document during indexing.

        """
        logger.debug("received query call with %r" % doc)
        if self.is_locked():
            msg = "cannot query while the server is being updated"
            logger.error(msg)
            raise RuntimeError(msg)
        sims_opt, sims_fresh = None, None
        for index in [self.fresh_index, self.opt_index]:
            if index is not None:
                # consider all candidates, not just the top `max_results`
                index.topsims = 10000000

        if isinstance(doc, str):
            # query by direct document id
            docid = doc
            if self.opt_index is not None and docid in self.opt_index:
                sims_opt = self.opt_index.sims_by_id(docid)
                if self.fresh_index is not None:
                    vec = self.opt_index.vec_by_id(docid)
                    sims_fresh = self.fresh_index.sims_by_vec(vec,
                                                              normalize=False)
            elif self.fresh_index is not None and docid in self.fresh_index:
                sims_fresh = self.fresh_index.sims_by_id(docid)
                if self.opt_index is not None:
                    vec = self.fresh_index.vec_by_id(docid)
                    sims_opt = self.opt_index.sims_by_vec(vec, normalize=False)
            else:
                raise ValueError("document %r not in index" % docid)
        else:
            if 'topics' in doc:
                # user supplied vector directly => use that
                vec = gensim.matutils.any2sparse(doc['topics'])
            else:
                # query by an arbitrary text (=tokens) inside doc['tokens']
                vec = self.model.doc2vec(
                    doc)  # convert document (text) to vector
            if self.opt_index is not None:
                sims_opt = self.opt_index.sims_by_vec(vec)
            if self.fresh_index is not None:
                sims_fresh = self.fresh_index.sims_by_vec(vec)

        merged = merge_sims(sims_opt, sims_fresh)
        # sort by ascending similarity, so the least similar documents come first
        merged.sort(key=lambda tup: tup[1], reverse=False)

        logger.debug(
            "got %s raw similars, pruning with max_results=%s, max_score=%s" %
            (len(merged), max_results, max_score))
        result = []
        for docid, score in merged:
            if score > max_score:
                if len(result) >= max_results:
                    break
            elif len(result) >= max_results:
                break
            result.append((docid, float(score), self.payload.get(docid, None)))
        return result

    def __str__(self):
        return ("SimServer(loc=%r, fresh=%s, opt=%s, model=%s, buffer=%s)" %
                (self.basename, self.fresh_index, self.opt_index, self.model,
                 self.fresh_docs))

    def __len__(self):
        return sum(
            len(index) for index in [self.opt_index, self.fresh_index]
            if index is not None)

    def __contains__(self, docid):
        """Is document with `docid` in the index?"""
        return any(index is not None and docid in index
                   for index in [self.opt_index, self.fresh_index])

    def get_tfidf(self, *args, **kwargs):
        return self.model.get_tfidf(*args, **kwargs)

    def status(self):
        return str(self)

    def keys(self):
        """Return ids of all indexed documents."""
        result = []
        if self.fresh_index is not None:
            result += self.fresh_index.keys()
        if self.opt_index is not None:
            result += self.opt_index.keys()
        return result

    def memdebug(self):
        from guppy import hpy
        return str(hpy().heap())
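
A minimal end-to-end sketch of the SimServer workflow above (hedged: the directory name and the toy documents are placeholders, and it assumes SimServer, SimModel and SimIndex from this listing are importable alongside gensim):

import os

os.makedirs('/tmp/simserver_demo', exist_ok=True)
server = SimServer('/tmp/simserver_demo')

corpus = [{'id': 'doc_%i' % i,
           'tokens': text.lower().split(),
           'payload': text}
          for i, text in enumerate(['pale blue dot of earth',
                                    'blue sky above the sea',
                                    'completely unrelated sentence about trains'])]

server.train(corpus, method='logentropy')  # tiny corpus => log-entropy model
server.index(corpus)                       # make the documents queryable
for docid, score, payload in server.find_similar('doc_0', min_score=0.0):
    print(docid, round(float(score), 3), payload)
server.close()
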
Example #19
0
class CharLMEmbeddings(TokenEmbeddings):
    """Contextual string embeddings of words, as proposed in Akbik et al., 2018."""

    def __init__(self, model, detach: bool = True, use_cache: bool = True, cache_directory: str = None):
        """
        initializes contextual string embeddings using a character-level language model.
        :param model: model string, one of 'news-forward', 'news-backward', 'mix-forward', 'mix-backward', 'german-forward',
                'german-backward' depending on which character language model is desired
        :param detach: if set to False, the gradient will propagate into the language model. this dramatically slows down
                training and often leads to worse results, so not recommended.
        :param use_cache: if set to False, will not write embeddings to file for later retrieval. this saves disk space but will
                not allow re-use of once computed embeddings that do not fit into memory
        :param cache_directory: if cache_directory is not set, the cache will be written to ~/.flair/embeddings. otherwise the cache
                is written to the provided directory.
        """
        super().__init__()

        # news-english-forward
        if model.lower() == 'news-forward':
            base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-v0.2rc.pt'
            model = cached_path(base_path, cache_dir='embeddings')

        # news-english-backward
        if model.lower() == 'news-backward':
            base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-v0.2rc.pt'
            model = cached_path(base_path, cache_dir='embeddings')

        # news-english-forward-fast
        if model.lower() == 'news-forward-fast':
            base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-1024-v0.2rc.pt'
            model = cached_path(base_path, cache_dir='embeddings')

        # news-english-backward-fast
        if model.lower() == 'news-backward-fast':
            base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-1024-v0.2rc.pt'
            model = cached_path(base_path, cache_dir='embeddings')

        # mix-english-forward
        if model.lower() == 'mix-forward':
            base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-english-forward-v0.2rc.pt'
            model = cached_path(base_path, cache_dir='embeddings')

        # mix-english-backward
        if model.lower() == 'mix-backward':
            base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-english-backward-v0.2rc.pt'
            model = cached_path(base_path, cache_dir='embeddings')

        # mix-german-forward
        if model.lower() == 'german-forward':
            base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-german-forward-v0.2rc.pt'
            model = cached_path(base_path, cache_dir='embeddings')

        # mix-german-backward
        if model.lower() == 'german-backward':
            base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-mix-german-backward-v0.2rc.pt'
            model = cached_path(base_path, cache_dir='embeddings')

        # common crawl Polish forward
        if model.lower() == 'polish-forward':
            base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-polish-forward-v0.2.pt'
            model = cached_path(base_path, cache_dir='embeddings')

        # common crawl Polish backward
        if model.lower() == 'polish-backward':
            base_path = 'https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-polish-backward-v0.2.pt'
            model = cached_path(base_path, cache_dir='embeddings')

        self.name = model
        self.static_embeddings = detach

        from flair.models import LanguageModel
        self.lm = LanguageModel.load_language_model(model)
        self.detach = detach

        self.is_forward_lm: bool = self.lm.is_forward_lm

        # caching variables
        self.use_cache: bool = use_cache
        self.cache = None
        self.cache_directory: str = cache_directory

        dummy_sentence: Sentence = Sentence()
        dummy_sentence.add_token(Token('hello'))
        embedded_dummy = self.embed(dummy_sentence)
        self.__embedding_length: int = len(embedded_dummy[0].get_token(1).get_embedding())

    def __getstate__(self):
        # Copy the object's state from self.__dict__ which contains
        # all our instance attributes. Always use the dict.copy()
        # method to avoid modifying the original state.
        state = self.__dict__.copy()
        # Remove the unpicklable entries.
        state['cache'] = None
        state['use_cache'] = False
        state['cache_directory'] = None
        return state

    @property
    def embedding_length(self) -> int:
        return self.__embedding_length

    def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]:

        # this whole block is for compatibility with older serialized models  TODO: remove in version 0.4
        if 'cache' not in self.__dict__ or 'cache_directory' not in self.__dict__:
            self.use_cache = False
            self.cache_directory = None
        else:
            cache_path = '{}-tmp-cache.sqllite'.format(self.name) if not self.cache_directory else os.path.join(
                self.cache_directory, '{}-tmp-cache.sqllite'.format(os.path.basename(self.name)))
            if not os.path.exists(cache_path):
                self.use_cache = False
                self.cache_directory = None

        # if cache is used, try setting embeddings from cache first
        if self.use_cache:

            # lazy initialization of cache
            if not self.cache:
                from sqlitedict import SqliteDict
                self.cache = SqliteDict(cache_path, autocommit=True)

            # try populating embeddings from cache
            all_embeddings_retrieved_from_cache: bool = True
            for sentence in sentences:
                key = sentence.to_tokenized_string()
                embeddings = self.cache.get(key)

                if not embeddings:
                    all_embeddings_retrieved_from_cache = False
                    break
                else:
                    for token, embedding in zip(sentence, embeddings):
                        token.set_embedding(self.name, torch.FloatTensor(embedding))

            if all_embeddings_retrieved_from_cache:
                return sentences

        # if this is not possible, use LM to generate embedding. First, get text sentences
        text_sentences = [sentence.to_tokenized_string() for sentence in sentences]

        longest_character_sequence_in_batch: int = len(max(text_sentences, key=len))

        # pad strings with whitespaces to longest sentence
        sentences_padded: List[str] = []
        append_padded_sentence = sentences_padded.append

        end_marker = ' '
        extra_offset = 1
        for sentence_text in text_sentences:
            pad_by = longest_character_sequence_in_batch - len(sentence_text)
            if self.is_forward_lm:
                padded = '\n{}{}{}'.format(sentence_text, end_marker, pad_by * ' ')
                append_padded_sentence(padded)
            else:
                padded = '\n{}{}{}'.format(sentence_text[::-1], end_marker, pad_by * ' ')
                append_padded_sentence(padded)

        # get hidden states from language model
        all_hidden_states_in_lm = self.lm.get_representation(sentences_padded, self.detach)

        # take first or last hidden states from language model as word representation
        for i, sentence in enumerate(sentences):
            sentence_text = sentence.to_tokenized_string()

            offset_forward: int = extra_offset
            offset_backward: int = len(sentence_text) + extra_offset

            for token in sentence.tokens:
                token: Token = token

                offset_forward += len(token.text)

                if self.is_forward_lm:
                    offset = offset_forward
                else:
                    offset = offset_backward

                embedding = all_hidden_states_in_lm[offset, i, :]

                # if self.tokenized_lm or token.whitespace_after:
                offset_forward += 1
                offset_backward -= 1

                offset_backward -= len(token.text)

                token.set_embedding(self.name, embedding)

        if self.use_cache:
            for sentence in sentences:
                self.cache[sentence.to_tokenized_string()] = [token._embeddings[self.name].tolist() for token in
                                                              sentence]

        return sentences
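
A short usage sketch for the embeddings class above (hedged: it assumes an older flair release that still ships CharLMEmbeddings together with flair.data.Sentence, plus network access to download the 'news-forward-fast' model; the cache directory is a placeholder and the SqliteDict cache is only consulted once its database file exists):

from flair.data import Sentence

embedder = CharLMEmbeddings('news-forward-fast',
                            use_cache=True,
                            cache_directory='/tmp/flair_cache')
sentence = Sentence('The grass is green .')
embedder.embed(sentence)
for token in sentence:
    print(token.text, token.get_embedding().size())
print('embedding length:', embedder.embedding_length)
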
class SimIndex(gensim.utils.SaveLoad):
    """
    An index of documents. Used internally by SimServer.

    It uses the Similarity class to persist all document vectors to disk (via mmap).
    """
    def __init__(self,
                 fname,
                 num_features,
                 shardsize=SHARD_SIZE,
                 topsims=TOP_SIMS):
        """
        Spill index shards to disk after every `shardsize` documents.
        In similarity queries, return only the `topsims` most similar documents.
        """
        self.fname = fname
        self.shardsize = int(shardsize)
        self.topsims = int(topsims)
        self.id2pos = {
        }  # map document id (string) to index position (integer)
        self.pos2id = {
        }  # reverse mapping for id2pos; redundant, for performance
        self.id2sims = SqliteDict(
            self.fname + '.id2sims', journal_mode=JOURNAL_MODE
        )  # precomputed top similar: document id -> [(doc_id, similarity)]
        self.qindex = gensim.similarities.Similarity(self.fname + '.idx',
                                                     corpus=None,
                                                     num_best=None,
                                                     num_features=num_features,
                                                     shardsize=shardsize)
        self.length = 0

    def save(self, fname):
        tmp, self.id2sims = self.id2sims, None
        super(SimIndex, self).save(fname)
        self.id2sims = tmp

    @staticmethod
    def load(fname):
        result = gensim.utils.SaveLoad.load(fname)
        result.fname = fname
        result.check_moved()
        result.id2sims = SqliteDict(fname + '.id2sims',
                                    journal_mode=JOURNAL_MODE)
        return result

    def check_moved(self):
        output_prefix = self.fname + '.idx'
        if self.qindex.output_prefix != output_prefix:
            logger.info(
                "index seems to have moved from %s to %s; updating locations" %
                (self.qindex.output_prefix, output_prefix))
            self.qindex.output_prefix = output_prefix
            self.qindex.check_moved()

    def close(self):
        "Explicitly release important resources (file handles, db, ...)"
        try:
            self.id2sims.close()
        except:
            pass
        try:
            del self.qindex
        except:
            pass

    def terminate(self):
        """Delete all files created by this index, invalidating `self`. Use with care."""
        try:
            self.id2sims.terminate()
        except:
            pass
        import glob
        for fname in glob.glob(self.fname + '*'):
            try:
                os.remove(fname)
                logger.info("deleted %s" % fname)
            except Exception as e:
                logger.warning("failed to delete %s: %s" % (fname, e))
        for val in self.__dict__.keys():
            try:
                delattr(self, val)
            except:
                pass

    def index_documents(self, fresh_docs, model):
        """
        Update fresh index with new documents (potentially replacing old ones with
        the same id). `fresh_docs` is a dictionary-like object (=dict, sqlitedict, shelve etc)
        that maps document_id->document.
        """
        docids = fresh_docs.keys()
        vectors = (model.docs2vecs(fresh_docs[docid] for docid in docids))
        logger.info("adding %i documents to %s" % (len(docids), self))
        self.qindex.add_documents(vectors)
        self.qindex.save()
        self.update_ids(docids)

    def update_ids(self, docids):
        """Update id->pos mapping with new document ids."""
        logger.info("updating %i id mappings" % len(docids))
        for docid in docids:
            if docid is not None:
                pos = self.id2pos.get(docid, None)
                if pos is not None:
                    logger.info("replacing existing document %r in %s" %
                                (docid, self))
                    del self.pos2id[pos]
                self.id2pos[docid] = self.length
                try:
                    del self.id2sims[docid]
                except:
                    pass
            self.length += 1
        self.id2sims.sync()
        self.update_mappings()

    def update_mappings(self):
        """Synchronize id<->position mappings."""
        self.pos2id = dict((v, k) for k, v in self.id2pos.items())
        assert len(self.pos2id) == len(
            self.id2pos), "duplicate ids or positions detected"

    def delete(self, docids):
        """Delete documents (specified by their ids) from the index."""
        logger.debug("deleting %i documents from %s" % (len(docids), self))
        deleted = 0
        for docid in docids:
            try:
                del self.id2pos[docid]
                deleted += 1
                del self.id2sims[docid]
            except:
                pass
        self.id2sims.sync()
        if deleted:
            logger.info("deleted %i documents from %s" % (deleted, self))
        self.update_mappings()

    def sims2scores(self, sims, eps=1e-7):
        """Convert raw similarity vector to a list of (docid, similarity) results."""
        result = []
        if isinstance(sims, numpy.ndarray):
            sims = abs(
                sims
            )  # TODO or maybe clip? are opposite vectors "similar" or "dissimilar"?!
            for pos in numpy.argsort(sims)[::-1]:
                if pos in self.pos2id and sims[
                        pos] > eps:  # ignore deleted/rewritten documents
                    # convert positions of resulting docs back to ids
                    result.append((self.pos2id[pos], sims[pos]))
                    if len(result) == self.topsims:
                        break
        else:
            for pos, score in sims:
                if pos in self.pos2id and abs(
                        score) > eps:  # ignore deleted/rewritten documents
                    # convert positions of resulting docs back to ids
                    result.append((self.pos2id[pos], abs(score)))
                    if len(result) == self.topsims:
                        break
        return result

    def vec_by_id(self, docid):
        """Return indexed vector corresponding to document `docid`."""
        pos = self.id2pos[docid]
        return self.qindex.vector_by_id(pos)

    def sims_by_id(self, docid):
        """Find the most similar documents to the (already indexed) document with `docid`."""
        result = self.id2sims.get(docid, None)
        if result is None:
            self.qindex.num_best = self.topsims
            sims = self.qindex.similarity_by_id(self.id2pos[docid])
            result = self.sims2scores(sims)
        return result

    def sims_by_vec(self, vec, normalize=None):
        """
        Find the most similar documents to a given vector (=already processed document).
        """
        if normalize is None:

            #normalize = self.qindex.normalize
            normalize = self.qindex.norm

        #norm, self.qindex.normalize = self.qindex.normalize, normalize # store old value
        norm, self.qindex.norm = self.qindex.norm, normalize  # store old value

        self.qindex.num_best = self.topsims
        sims = self.qindex[vec]

        #self.qindex.normalize = norm # restore old value of qindex.normalize
        self.qindex.norm = norm  # restore old value of qindex.norm

        return self.sims2scores(sims)

    def merge(self, other):
        """Merge documents from the other index. Update precomputed similarities
        in the process."""

        #other.qindex.normalize, other.qindex.num_best = False, self.topsims
        other.qindex.norm, other.qindex.num_best = False, self.topsims

        # update precomputed "most similar" for old documents (in case some of
        # the new docs make it to the top-N for some of the old documents)
        logger.info("updating old precomputed values")
        pos, lenself = 0, len(self.qindex)
        for chunk in self.qindex.iter_chunks():
            for sims in other.qindex[chunk]:
                if pos in self.pos2id:
                    # ignore masked entries (deleted, overwritten documents)
                    docid = self.pos2id[pos]
                    sims = self.sims2scores(sims)
                    self.id2sims[docid] = merge_sims(self.id2sims[docid], sims,
                                                     self.topsims)
                pos += 1
                if pos % 10000 == 0:
                    logger.info("PROGRESS: updated doc #%i/%i" %
                                (pos, lenself))
        self.id2sims.sync()

        logger.info("merging fresh index into optimized one")
        pos, docids = 0, []
        for chunk in other.qindex.iter_chunks():
            for vec in chunk:
                if pos in other.pos2id:  # don't copy deleted documents
                    self.qindex.add_documents([vec])
                    docids.append(other.pos2id[pos])
                pos += 1
        self.qindex.save()
        self.update_ids(docids)

        logger.info("precomputing most similar for the fresh index")
        pos, lenother = 0, len(other.qindex)

        #norm, self.qindex.normalize = self.qindex.normalize, False
        norm, self.qindex.norm = self.qindex.norm, False

        topsims, self.qindex.num_best = self.qindex.num_best, self.topsims
        for chunk in other.qindex.iter_chunks():
            for sims in self.qindex[chunk]:
                if pos in other.pos2id:
                    # ignore masked entries (deleted, overwritten documents)
                    docid = other.pos2id[pos]
                    self.id2sims[docid] = self.sims2scores(sims)
                pos += 1
                if pos % 10000 == 0:
                    logger.info("PROGRESS: precomputed doc #%i/%i" %
                                (pos, lenother))

        #self.qindex.normalize, self.qindex.num_best = norm, topsims
        self.qindex.norm, self.qindex.num_best = norm, topsims

        self.id2sims.sync()

    def __len__(self):
        return len(self.id2pos)

    def __contains__(self, docid):
        return docid in self.id2pos

    def keys(self):
        return self.id2pos.keys()

    def __str__(self):
        return "SimIndex(%i docs, %i real size)" % (len(self), self.length)
class SqliteSparseSequence(MutableSequence[Any]):
    def __init__(self, filename: Union[str, PathLike], read_only: bool = False):
        self.table = SqliteDict(filename, "sparse_sequence", flag="r" if read_only else "c")

    def __del__(self):
        self.close()

    def __getitem__(self, i: Union[int, slice]) -> Any:
        if isinstance(i, int):
            try:
                return self.table[str(i)]
            except KeyError:
                current_length = len(self)
                if i >= current_length or current_length <= 0:
                    raise IndexError("list index out of range")
                elif i < 0 < current_length:
                    return self.__getitem__(i % current_length)
                else:
                    return None
        elif isinstance(i, slice):
            return SlicedSequence(self, i)
        else:
            raise TypeError(f"list indices must be integers or slices, not {i.__class__.__name__}")

    def __setitem__(self, i: Union[int, slice], value: Any):
        if isinstance(i, int):
            current_length = len(self)
            if i < 0:
                i %= current_length
            self.table[str(i)] = value
            self.table["_len"] = max(i, current_length)
            self.table.commit()
        else:
            raise TypeError(f"list indices must be integers, not {i.__class__.__name__}")

    def __delitem__(self, i: Union[int, slice]):
        current_length = len(self)
        if isinstance(i, int):
            if i < 0:
                i %= current_length
            if i >= current_length:
                raise IndexError("list assignment index out of range")
            for index in range(i + 1, current_length):
                self.table[str(index - 1)] = self.table.get(str(index))
            del self.table[str(current_length - 1)]
            self.table["_len"] = current_length - 1
            self.table.commit()
        elif isinstance(i, slice):
            # This isn't very efficient for continuous slices.
            for index in reversed(range(*i.indices(current_length))):
                del self[index]
        else:
            raise TypeError(f"list indices must be integers or slices, not {i.__class__.__name__}")

    def extend(self, values: Iterable[Any]) -> None:
        current_length = len(self)
        index = -1
        for index, value in enumerate(values):
            self.table[str(index + current_length)] = value
        if index < 0:
            return
        self.table["_len"] = current_length + index + 1
        self.table.commit()

    def insert(self, i: int, value: Any) -> None:
        current_length = len(self)
        for index in reversed(range(i, current_length)):
            self.table[str(index + 1)] = self.table.get(str(index))
        self.table[str(i)] = value
        self.table["_len"] = current_length + 1
        self.table.commit()

    def __len__(self) -> int:
        try:
            return self.table["_len"]
        except KeyError:
            return 0

    def clear(self) -> None:
        self.table.clear()
        self.table.commit()

    def close(self) -> None:
        if self.table is not None:
            self.table.close()
            self.table = None

    def copy_to(self, target: Union[str, PathLike]):
        try:
            os.link(self.table.filename, target)
        except OSError as e:
            if e.errno == 18:  # Cross-device link
                shutil.copy(self.table.filename, target)
            else:
                raise
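
A minimal usage sketch for SqliteSparseSequence (hedged: the file name is a placeholder, and the length bookkeeping relies on __setitem__ extending "_len" past a sparse write as fixed above):

seq = SqliteSparseSequence('/tmp/sparse_sequence.sqlite')
seq.extend(['a', 'b', 'c'])
seq[10] = 'far away'          # sparse write: indices 3..9 stay unset
print(len(seq))               # 11
print(seq[1], seq[5])         # 'b' None  (unset positions read back as None)
del seq[0]                    # shifts all later entries down by one
print(seq[0], len(seq))       # 'b' 10
seq.close()
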
Example #22
0
class abcService(abc.ABC):

    name: str
    author: str
    version: str
    preferred_port: int | None = None
    host = 'localhost'

    def __init__(self):
        for att in ['name', 'author', 'version']:
            assert hasattr(self, att), f'Missing attribute {att}'

        self.dirs = AppDirs(
            appname=self.name,
            appauthor=self.author,
            version=self.version,
        )
        self.site_config_dir = Path(self.dirs.site_config_dir)
        os.makedirs(self.site_config_dir, exist_ok=True)

        self.db_service = SqliteDict(
            filename=self.site_config_dir / 'service.db',
            autocommit=True,
        )
        curr_pid = self.db_service.get('pid', -1)
        curr_port = self.db_service.get('port', self.preferred_port)
        self.db_service['pid'] = curr_pid
        self.db_service['port'] = curr_port

    @property
    def pid(self) -> int:
        return self.db_service['pid']

    @property
    def port(self) -> int | None:
        return self.db_service['port']

    def alive(self, check=True):
        alive = (self.pid > 0)
        if check and alive and not self._alive():
            self.db_service['pid'] = -1
            self.db_service['port'] = None
            alive = False
        return alive

    def _alive(self):
        return psutil.pid_exists(self.pid)

    def stop(self):
        if self.alive():
            stop_process(self.pid)
        else:
            print('(already stopped)')
        print(self)

    @abc.abstractmethod
    def start(self):
        raise NotImplementedError

    def cli_start(self):
        if not self.alive():
            setproctitle.setproctitle(self.name)  # just fancy
            self.db_service['pid'] = pid = os.getpid()
            self.db_service['port'] = find_free_port(self.port)
            try:
                self.start()
                procs = psutil.Process().children()
                psutil.wait_procs(procs)
            finally:
                self.db_service['pid'] = -1
                self.db_service['port'] = None
                stop_process(pid)
        else:
            print('(already active)')
        print(self)
        return

    def __repr__(self):
        out = [
            f'alive: {self.alive()}',
            f'  pid: {self.pid}',
            f' port: {self.port}',
            f'  url: http://{self.host}:{self.port}',
        ]
        return '\n'.join(out) if self.alive() else out[0]
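
A hedged sketch of a concrete service built on abcService (EchoService, its metadata and the port are made up; it assumes the helpers referenced above, i.e. AppDirs, psutil, setproctitle, find_free_port and stop_process, are importable in this module):

class EchoService(abcService):
    name = 'echo-service'
    author = 'example-author'
    version = '0.1'
    preferred_port = 8765

    def start(self):
        # a real service would launch a server here; idling keeps the
        # registered pid alive so that alive() reports True
        import time
        while True:
            time.sleep(60)

svc = EchoService()
print(svc)          # __repr__ shows alive/pid/port/url
# svc.cli_start()   # would block: registers pid/port in the SqliteDict, then runs start()
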
Example #23
0
class SessionState:
    def __init__(self, cache_file: Path, cache_key: str, redis: str, user_requested=False):
        self.user_requested = user_requested
        self._cache_file = cache_file
        self._cache_key = cache_key
        self._cache: Optional[SqliteDict] = None
        random.seed()
        self._session_key = random.randint(0, 999999)
        self._redis = Redis(host=redis)

        if not user_requested:
            self._open()
            if self._cache_key != self._cache.get("_cache_key_", None):
                self._cache.close()
                self._cache: Optional[SqliteDict] = None
                self._cache_file.unlink()
                self._open()
                self._cache["_cache_key_"] = self._cache_key

        self.session = Session()
        # noinspection PyTypeChecker
        self.session.mount(
            'https://',
            HTTPAdapter(max_retries=Retry(total=3, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])))
        self.sites = {}
        self.wikidata = Sparql()
        self.primary_site = self.get_site(primary_domain)

    def __enter__(self):
        return self

    def __exit__(self, typ, value, traceback):
        self.session.close()
        if self._cache is not None:
            self._cache.close()
            self._cache = None
            print(f'Closed SQL connection for {self._session_key} at {datetime.utcnow()}')

    def _open(self):
        if self._cache is None:
            print(f'Opening SQL connection for {self._session_key} at {datetime.utcnow()}')
            self._cache_file.parent.mkdir(parents=True, exist_ok=True)
            self._cache = SqliteDict(self._cache_file, autocommit=True)

    def get_site(self, domain: Domain) -> WikiSite:
        try:
            return self.sites[domain]
        except KeyError:
            # noinspection PyTypeChecker
            site = WikiSite(domain, self.session, domain == primary_domain)
            if self.user_requested:
                site.maxlag = None
            self.sites[domain] = site
            return site

    def delete_cached_items(self, prefix: str) -> None:
        self._open()
        for vv in {v for v in self._cache.keys() if v.startswith(prefix)}:
            del self._cache[vv]

    def del_obj(self, key: str) -> Any:
        self._redis.delete(self.redis_key(key))
        self._open()
        print(f"%% del {key}")
        return self._cache.pop(key, None)

    def load_obj(self, key: str, default: Any = None) -> Any:
        value = self._redis.get(self.redis_key(key))
        if value is not None:
            return loads(value)
        self._open()
        print(f"%% load {key}")
        value = self._cache.get(key, default)
        self._redis.set(self.redis_key(key), dumps(value))
        return value

    def save_obj(self, key: str, value: Any):
        self._open()
        print(f"%% save {key}")
        self._cache[key] = value
        self._redis.set(self.redis_key(key), dumps(value))

    def redis_key(self, key: str):
        return self._cache_key + key
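
A hedged usage sketch for SessionState (it assumes a Redis instance on localhost and the module-level helpers referenced above, such as primary_domain, WikiSite, Sparql and dumps/loads; the cache path and keys are placeholders):

from pathlib import Path

with SessionState(cache_file=Path('/tmp/session_cache.sqlite'),
                  cache_key='v1',
                  redis='localhost') as state:
    state.save_obj('config:example', {'answer': 42})    # write-through to SqliteDict and Redis
    print(state.load_obj('config:example'))             # served from Redis when present
    state.delete_cached_items('config:')                # drop matching SqliteDict entries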