Example 1
    def get_start_date(self, exchange: str, data_type: str,
                       pair: str) -> 'float | None':
        """
        Return the earliest stored timestamp (in seconds) for the given
        exchange / data type / pair, or None if nothing is stored yet.

        Checks both local parquet files (unless files are deleted after
        upload) and every configured remote store. All stores must agree
        on the first stored object, otherwise backfilling would produce
        inconsistent data.

        Parameters:
            exchange (str): Exchange name.
            data_type (str): Data type (e.g. 'trades').
            pair (str): Trading pair.

        Returns:
            float | None: Timestamp of the first stored entry, or None
            when no data has been stored anywhere yet.

        Raises:
            InconsistentStorage: If the configured stores disagree on the
            first stored object.
        """
        objs = []
        files = []

        # Local parquet files only remain if they are kept after upload.
        if not self.del_file:
            file_pattern = f'{exchange}-{data_type}-{pair}-[0-9]*.parquet'
            files = glob.glob(file_pattern)

        if self._read:
            # _list / bucket / prefix / kwargs are parallel per-store
            # configuration lists.
            for func, bucket, prefix, kwargs in zip(self._list, self.bucket,
                                                    self.prefix, self.kwargs):
                path = f'{exchange}/{data_type}/{pair}/'
                if prefix:
                    path = f"{prefix}/{path}"
                # Only the first (oldest) object of each store is needed.
                ret = func(bucket, path, limit=1, **kwargs)
                objs.append(ret)
        # Nothing stored locally or remotely: no start date yet.
        if not files and not any(objs):
            return None

        if files:
            files = sorted(files)
            start = files[0]
        else:
            start = objs[0][0]

        # Every store must report the same first object name.
        for entry in objs:
            if entry[0] != start:
                raise InconsistentStorage(
                    "Stored data differs, cannot backfill")

        if files:
            return float(
                pq.read_table(files[0],
                              columns=['timestamp']).to_pandas().timestamp[0])
        else:
            # Download the first remote object to a temporary file, read
            # its first timestamp, then clean up.
            tmp = f'{exchange}-{pair}-temp.parquet'
            self._read[0](self.bucket[0], objs[0][0], tmp, **self.kwargs[0])
            start = float(
                pq.read_table(tmp,
                              columns=['timestamp']).to_pandas().timestamp[0])
            os.remove(tmp)
            return start
Example 2
    def __init__(self, creds: str, exchanges: dict, prefix: str,
                 folder_name_sep: str, path: Callable[[str, str, str], str]):
        """
        Initialize a `drive` service, and create the list of folders' IDs,
        either retrieving them if already existing, or creating them if not
        existing.

        Parameters:
            creds (str):
                Path to credential file.
            exchanges (dict):
                List of exchanges with related data types and pairs that are
                retrieved by cryptostore.
            prefix (str):
                Base folder into which storing recorded data.
            folder_name_sep (str):
                Separator to be used between `exchange`, `data_type` and `pair`
                in Google Drive folder name.
            path (Callable[[str, str, str], str]):
                Function from which deriving folders' name.
        """

        httplib2 = StorageEngines['httplib2']

        self.folder_name_sep = folder_name_sep
        # Initialize a drive service, with an authorized caching-enabled
        # `http` object.
        if creds:
            google = StorageEngines['google.oauth2.service_account']
            self.creds = google.oauth2.service_account.Credentials.from_service_account_file(
                creds).with_scopes(['https://www.googleapis.com/auth/drive'])
        else:
            # Use environment variable GOOGLE_APPLICATION_CREDENTIALS
            google = StorageEngines['google.auth']
            self.creds, _ = google.auth.default(
                scopes=['https://www.googleapis.com/auth/drive'])
        googleapiclient = StorageEngines['googleapiclient._auth']
        auth_http = googleapiclient._auth.authorized_http(self.creds)
        auth_http.cache = httplib2.FileCache(self.cache_path)
        googleapiclient = StorageEngines['googleapiclient.discovery']
        self.drive = googleapiclient.discovery.build('drive',
                                                     'v3',
                                                     http=auth_http)

        files = self.drive.files()
        # Retrieve candidates for child and parent folders in Google Drive.
        # `pageSize` is by default to 100 and is limited to 1000.
        g_drive_folders = []
        request = files.list(
            q=
            "mimeType = 'application/vnd.google-apps.folder' and trashed = false",
            pageSize=800,
            fields='nextPageToken, files(id, name, parents)')
        while request is not None:
            res = request.execute()
            g_drive_folders.extend(res.get('files', []))
            request = files.list_next(request, res)

        # Retrieve parent folder ID (prefix).
        p_folders = [
            folder['id'] for folder in g_drive_folders
            if folder['name'] == prefix
        ]
        if len(p_folders) > 1:
            # If more than 2 folders with the same name, throw an error. We do not
            # know which one is the right one to record data.
            raise InconsistentStorage(
                "At least 2 parent folders identified with \
name {!s}. Please, make sure to provide a prefix corresponding to a unique \
folder name in your Google Drive space.".format(prefix))
        elif not p_folders:
            # If parent folder is not found, ask the user to create one.
            raise InconsistentStorage(
                "No existing folder found with name {!s}. \
Please, make sure to provide a prefix corresponding to an existing and \
accessible folder.".format(prefix))
        else:
            p_folder_id = p_folders[0]

        # Manage child folders. Build list of folders' name.
        c_folders = []
        for exchange in exchanges:
            for dtype in exchanges[exchange]:
                # Skip over the retries arg in the config if present.
                if dtype in {'retries', 'channel_timeouts'}:
                    continue
                for pair in exchanges[exchange][
                        dtype] if 'symbols' not in exchanges[exchange][
                            dtype] else exchanges[exchange][dtype]['symbols']:
                    c_folders.append(
                        folder_name_sep.join(
                            path(exchange, dtype, pair).split('/')))
        # Retrieve ID for existing ones.
        existing_childs = [
            (folder['name'], folder['id']) for folder in g_drive_folders
            if ((folder['name'] in c_folders) and ('parents' in folder) and (
                p_folder_id in folder['parents']))
        ]
        # If duplicates in folder names, throw an exception.
        existing_as_dict = dict(existing_childs)
        n = len(existing_childs) - len(existing_as_dict)
        if n != 0:
            raise InconsistentStorage(
                "{!s} existing folder(s) share(s) same name with another. Please, clean content of {!s} folder."
                .format(n, prefix))
        # Get missing ones and create corresponding child folders in batch.
        missing_childs = list(set(c_folders) - set(existing_as_dict))
        # Number of calls in batch is limited to 1000.
        call_limit = 800
        missing_in_chunks = [
            missing_childs[x:x + call_limit]
            for x in range(0, len(missing_childs), call_limit)
        ]

        # Setup & operate requests in batch.
        def _callback(request_id, response, exception, keep=existing_as_dict):
            keep[response['name']] = response['id']
            return

        for sub_list in missing_in_chunks:
            batch = self.drive.new_batch_http_request(_callback)
            for folder in sub_list:
                folder_metadata = {
                    'name': folder,
                    'mimeType': 'application/vnd.google-apps.folder',
                    'parents': [p_folder_id]
                }
                batch.add(files.create(body=folder_metadata,
                                       fields='id, name'))
            batch.execute()
        self.folders = existing_as_dict
Example 3
def _get_folder_in_parent(drive: gad.Resource, path: str) -> Tuple[str, str]:
    """
    Retrieve folder ID from given name and parent folder name.
    If not existing, it is created.

    Parameters:
        drive (gad.Resource):
            Service with which interacting with Google Drive.
        path (str):
            path = '{prefix}/{exchange}/{data_type}/{pair}/
                        {exchange}-{data_type}-{pair}-{int(timestamp)}.parquet'
            String from which is retrieved `prefix` (parent folder) and name of
            child folder '{exchange}-{data_type}-{pair}'.

    Returns:
        folder_id, folder_name (Tuple[str, str]):
            Id of child folder '{exchange}-{data_type}-{pair}'. Create it if
            not existing.

    """

    # Retrieve parent folder (prefix), and child folder.
    path_struct = path.split('/')
    folder_name = '-'.join(path_struct[1:4])
    if len(path_struct) > 5:
        # If larger than 5, it means prefix is more than a single folder.
        # This case is not supported.
        raise InconsistentStorage("Prefix {!s} appears to be a path. Only a single folder name is accepted.".format(folder_name))

    parent_name = path_struct[0]
    # Retrieve candidates for child and parent folders.
    res = drive.files().list(q="(name = '" + parent_name + "' or name = '"
                                           + folder_name + "') and mimeType = 'application/vnd.google-apps.folder' and trashed = false",
                             pageSize=20,
                             fields='files(id, name, parents)').execute()
    folders = res.get('files', [])

    # Manage parent folder.
    p_folders = [(folder['id'], folder['name']) for folder in folders
                 if folder['name'] == parent_name]
    if len(p_folders) > 1:
        # If more than 2 folders with the same name, throw an error. We do not
        # know which one is the right one to record data.
        raise InconsistentStorage("At least 2 parent folders identified with \
name {!s}. Please, make sure to provide a prefix corresponding to a unique \
folder name in your Google Drive space.".format(parent_name))
    elif not p_folders:
        # If parent folder is not found, ask the user to create one.
        raise InconsistentStorage("No existing folder found with name {!s}. \
Please, make sure to provide a prefix corresponding to an existing and \
accessible folder.".format(parent_name))
    else:
        p_folder_id = p_folders[0][0]

    # Manage child folder.
    c_folders = [(folder['id'], folder['name']) for folder in folders
                 if ((folder['name'] == folder_name) and ('parents' in folder)
                     and (p_folder_id in folder['parents']))]
    if len(c_folders) > 1:
        # If more than 2 folders with the same name, throw an error. We do not
        # know which one is the right one to record data.
        raise InconsistentStorage("At least 2 folders identified with name {!s}. Please, clean content of parent folder.".format(folder_name))
    elif not c_folders:
        # If folder not found, create it.
        folder_metadata = {'name': folder_name,
                           'mimeType': 'application/vnd.google-apps.folder',
                           'parents': [p_folder_id]}
        folder = drive.files().create(body=folder_metadata, fields='id')\
                              .execute()

        return folder.get('id'), folder_name
    else:
        # Single folder found.

        return folders[0]['id'], folder_name
Example 4
    def _worker(self, exchange):
        """
        Backfill historical trades for every configured pair on `exchange`.

        For each pair: wait until live-recorded data exists in storage,
        then walk backwards one day at a time from the earliest stored
        timestamp down to the configured backfill start date, writing each
        day's trades to storage. A failure for one pair is logged and does
        not stop the remaining pairs.
        """
        r = Rest()
        storage = Storage(self.config)
        for pair in self.config.backfill[exchange]:
            try:
                start = self.config.backfill[exchange][pair].start

                # Wait for the live feed to store its first data: backfill
                # must end exactly where stored data begins.
                while True:
                    # NOTE(review): `end` is treated here as a list with one
                    # entry per storage backend; an empty or None return
                    # would raise below and abort this pair — confirm the
                    # Storage.get_start_date contract.
                    end = storage.get_start_date(exchange, 'trades', pair)
                    if not all(e == end[0] for e in end):
                        raise InconsistentStorage(
                            "Stored data differs, cannot backfill")
                    end = end[0]
                    if end:
                        break
                    time.sleep(10)
                # Step back 1 microsecond so the backfilled range does not
                # overlap the first stored entry.
                end = Timestamp(end, unit='s')
                end -= Timedelta(microseconds=1)
                start = Timestamp(start)
                if end <= Timestamp(start):
                    LOG.info(
                        "Data in storage is earlier than backfill start date for %s - %s",
                        exchange, pair)
                    continue

                LOG.info("Backfill - Starting for %s - %s for range %s - %s",
                         exchange, pair, start, str(end))

                # Backfill from end date to start date, 1 day at a time, in reverse order (from end -> start)
                while start < end:
                    # Segment start: midnight of `end`'s day, clamped to the
                    # requested backfill start date.
                    seg_start = end.replace(hour=0,
                                            minute=0,
                                            second=0,
                                            microsecond=0,
                                            nanosecond=0)
                    if start > seg_start:
                        seg_start = start
                    LOG.info("Backfill - Reading %s to %s for %s - %s",
                             seg_start, end, exchange, pair)

                    trades = []
                    try:
                        for t in r[exchange].trades(pair, str(seg_start),
                                                    str(end)):
                            trades.extend(t)
                    except Exception:
                        # Transient REST errors: back off and retry the same
                        # segment instead of aborting the pair.
                        LOG.warning(
                            "Backfill - encountered error backfilling %s - %s, trying again...",
                            exchange,
                            pair,
                            exc_info=True)
                        time.sleep(300)
                        continue

                    if not trades:
                        # Empty segment: move to the previous day without
                        # writing anything.
                        end = seg_start - Timedelta(nanoseconds=1)
                        continue

                    storage.aggregate(trades)
                    storage.write(exchange, 'trades', pair, end.timestamp())
                    LOG.info("Backfill - Wrote %s to %s for %s - %s",
                             seg_start, end, exchange, pair)
                    # Move to just before midnight of the previous day.
                    end = seg_start - Timedelta(nanoseconds=1)
                LOG.info("Backfill for %s - %s completed", exchange, pair)
            except Exception:
                # Keep processing the remaining pairs even if one fails.
                LOG.error("Backfill failed for %s - %s",
                          exchange,
                          pair,
                          exc_info=True)