Beispiel #1
0
    def process_temp_bundle(self, ds_name, path):
        """
        Merge the temp bundle into the main bundle for the specified
        data source.

        Parameters
        ----------
        ds_name : str
            Name of the data source whose main bundle is updated.
        path : str
            Path to the downloaded temp bundle archive to merge in.

        Returns
        -------
        None

        """
        tmp_bundle = extract_bundle(path)
        bundle_folder = get_data_source_folder(ds_name)
        ensure_directory(bundle_folder)
        if os.listdir(bundle_folder):
            # Existing data: append the temp bundle's rows to the
            # on-disk target ctable (mode 'a' persists the append).
            zsource = bcolz.ctable(rootdir=tmp_bundle, mode='r')
            ztarget = bcolz.ctable(rootdir=bundle_folder, mode='a')
            ztarget.append(zsource)
            # Bug fix: the extracted temp directory was previously left
            # behind after its rows were appended, leaking disk space on
            # every ingestion. Remove it now that it has been merged.
            shutil.rmtree(tmp_bundle, ignore_errors=True)
        else:
            # No existing data: adopt the temp bundle wholesale. The
            # (empty) target created by ensure_directory must be removed
            # first, since os.rename fails when the destination exists
            # on Windows (and for non-empty directories on POSIX).
            shutil.rmtree(bundle_folder, ignore_errors=True)
            os.rename(tmp_bundle, bundle_folder)
Beispiel #2
0
    def process_temp_bundle(self, ds_name, path):
        """
        Merge the temp bundle into the main bundle for the specified
        data source.

        Parameters
        ----------
        ds_name : str
            Name of the data source whose main bundle is updated.
        path : str
            Path to the downloaded temp bundle archive to merge in.

        Returns
        -------
        None

        """
        tmp_bundle = extract_bundle(path)
        bundle_folder = get_data_source_folder(ds_name)
        ensure_directory(bundle_folder)
        if os.listdir(bundle_folder):
            # Existing data: merge the temp bundle's rows into the
            # main ctable via the project helper.
            zsource = bcolz.ctable(rootdir=tmp_bundle, mode='r')
            ztarget = bcolz.ctable(rootdir=bundle_folder, mode='r')
            merge_bundles(zsource, ztarget)
        else:
            # No existing data: adopt the temp bundle wholesale.
            # Bug fix: ensure_directory just created bundle_folder, and
            # os.rename raises on Windows (and for non-empty targets on
            # POSIX) when the destination exists — remove it first.
            shutil.rmtree(bundle_folder, ignore_errors=True)
            os.rename(tmp_bundle, bundle_folder)
Beispiel #3
0
    def clean(self, data_source_name, data_frequency=None):
        """
        Remove the ingested data for a data source from disk.

        Parameters
        ----------
        data_source_name : str
            Name of the data source to clean (case-insensitive).
        data_frequency : str, optional
            When given, only the bundle folder for this frequency is
            removed; otherwise the data source's entire folder is
            deleted.

        Returns
        -------
        None

        """
        data_source_name = data_source_name.lower()

        if data_frequency is None:
            # No frequency given: wipe the whole data source folder.
            folder = get_data_source_folder(data_source_name)
        else:
            folder = get_bundle_folder(data_source_name, data_frequency)

        shutil.rmtree(folder)
Beispiel #4
0
    def get_dataset(self, ds_name, start=None, end=None):
        """
        Load the ingested dataset into a DataFrame indexed by
        (date, symbol).

        Parameters
        ----------
        ds_name : str
            Name of the dataset (case-insensitive).
        start, end : optional
            Accepted for interface compatibility; date filtering is not
            implemented yet (see TODOs below).

        Returns
        -------
        pd.DataFrame

        """
        key = ds_name.lower()

        # TODO: filter ctable by start and end date
        folder = get_data_source_folder(key)
        table = bcolz.ctable(rootdir=folder, mode='r')

        frame = table.todataframe()  # type: pd.DataFrame
        frame.set_index(['date', 'symbol'], drop=True, inplace=True)

        # TODO: implement the filter more carefully
        # if start and end is None:
        #     df = df.xs(start, level=0)

        return frame
Beispiel #5
0
    def get_dataset(self, ds_name, start=None, end=None):
        """
        Load the ingested dataset into a DataFrame indexed by
        (date, symbol), optionally restricted to a date window.

        Parameters
        ----------
        ds_name : str
            Name of the dataset (case-insensitive).
        start : str, optional
            Inclusive lower bound on date.
        end : str, optional
            Exclusive upper bound on date.

        Returns
        -------
        pd.DataFrame

        """
        ds_name = ds_name.lower()

        bundle_folder = get_data_source_folder(ds_name)
        z = bcolz.ctable(rootdir=bundle_folder, mode='r')

        # Build one boolean expression from whichever bounds were
        # supplied instead of spelling out all three start/end combos
        # (the previous code triplicated the fetchwhere call).
        conditions = []
        user_dict = {}
        if start is not None:
            conditions.append('(date>=start_date)')
            # NOTE(review): assumes start/end are strings whose encoded
            # bytes compare correctly against the ctable's date column —
            # confirm against callers.
            user_dict['start_date'] = start.encode()
        if end is not None:
            conditions.append('(date<end_date)')
            user_dict['end_date'] = end.encode()
        if conditions:
            z = z.fetchwhere(' & '.join(conditions), user_dict=user_dict)

        df = z.todataframe()  # type: pd.DataFrame
        df.set_index(['date', 'symbol'], drop=True, inplace=True)

        return df
Beispiel #6
0
    def get_dataset(self, ds_name, start=None, end=None):
        """
        Load the ingested dataset into a DataFrame indexed by
        (date, symbol).

        Parameters
        ----------
        ds_name : str
            Name of the dataset (case-insensitive).
        start, end : optional
            Accepted for interface compatibility but currently ignored;
            date filtering is not implemented yet.

        Returns
        -------
        pd.DataFrame

        """
        ds_name = ds_name.lower()

        # TODO: filter the ctable by the start and end dates before
        # materializing it as a DataFrame (e.g. via ctable.fetchwhere),
        # then slice the (date, symbol) index more carefully.
        bundle_folder = get_data_source_folder(ds_name)
        z = bcolz.ctable(rootdir=bundle_folder, mode='r')

        df = z.todataframe()  # type: pd.DataFrame
        df.set_index(['date', 'symbol'], drop=True, inplace=True)

        return df
Beispiel #7
0
    def clean(self, ds_name=None, data_frequency=None):
        """
        Remove the ingested data for a data source from disk.

        Parameters
        ----------
        ds_name : str, optional
            Name of the data source to clean (case-insensitive). When
            omitted, the user picks one interactively from the folders
            under the marketplace root.
        data_frequency : str, optional
            When given, only the bundle folder for this frequency is
            removed; otherwise the data source's entire folder is
            deleted.

        Returns
        -------
        None

        """
        if ds_name is None:
            mktplace_root = get_marketplace_folder()
            folders = [
                os.path.basename(f.rstrip('/'))
                for f in glob.glob('{}/*/'.format(mktplace_root))
                if 'temp_bundles' not in f
            ]

            # Bug fix: with no ingested datasets the prompt previously
            # looped forever asking for a number in [0..-1].
            if not folders:
                print('There are no ingested datasets to clean.')
                return

            while True:
                for idx, folder_name in enumerate(folders):
                    print('{}\t{}'.format(idx, folder_name))
                choice = input('Choose the dataset you want to '
                               'clean [0..{}]: '.format(len(folders) - 1))
                try:
                    choice = int(choice)
                except ValueError:
                    choice = -1  # non-numeric input: force retry below
                if 0 <= choice < len(folders):
                    ds_name = folders[choice]
                    break
                print('Enter a number between 0 and {}'.format(
                    len(folders) - 1))

        ds_name = ds_name.lower()

        if data_frequency is None:
            # No frequency given: wipe the whole data source folder.
            folder = get_data_source_folder(ds_name)
        else:
            folder = get_bundle_folder(ds_name, data_frequency)

        shutil.rmtree(folder)
Beispiel #8
0
    def ingest(self, ds_name=None, start=None, end=None, force_download=False):
        """
        Download a marketplace dataset and ingest it into the local
        data bundle.

        Parameters
        ----------
        ds_name : str, optional
            Dataset name. When omitted, the user picks one
            interactively from the list of available datasets.
        start, end : optional
            Accepted for interface compatibility; not used by the
            download request below.
        force_download : bool, optional
            Accepted for interface compatibility; not used below.

        Raises
        ------
        MarketplaceHTTPRequest
            If the download fails or the response is not multipart.

        """
        if ds_name is None:
            df_sets = self._list()
            if df_sets.empty:
                print('There are no datasets available yet.')
                return

            set_print_settings()
            while True:
                print(df_sets)
                choice = input('Choose the dataset you want to '
                               'ingest [0..{}]: '.format(df_sets.size - 1))
                try:
                    choice = int(choice)
                except ValueError:
                    choice = -1  # non-numeric input: force retry below
                if 0 <= choice < df_sets.size:
                    ds_name = df_sets.iloc[choice]['dataset']
                    break
                print('Enter a number between 0 and {}'.format(
                    df_sets.size - 1))

        # TODO: catch error conditions
        provider_info = self.mkt_contract.functions.getDataProviderInfo(
            Web3.toHex(ds_name)).call()

        if not provider_info[4]:
            print('The requested "{}" dataset is not registered in '
                  'the Data Marketplace.'.format(ds_name))
            return

        address, address_i = self.choose_pubaddr()
        fns = self.mkt_contract.functions
        check_sub = fns.checkAddressSubscription(address,
                                                 Web3.toHex(ds_name)).call()

        if check_sub[0] != address or self.to_text(check_sub[1]) != ds_name:
            # Fixed user-facing typo: 'Plese' -> 'Please'.
            print('You are not subscribed to dataset "{}" with address {}. '
                  'Please subscribe first.'.format(ds_name, address))
            return

        if not check_sub[5]:
            # Fixed missing separator between 'UTC.' and 'Please'.
            print('Your subscription to dataset "{}" expired on {} UTC. '
                  'Please renew your subscription by running:\n'
                  'catalyst marketplace subscribe --dataset={}'.format(
                      ds_name, pd.to_datetime(check_sub[4], unit='s',
                                              utc=True), ds_name))
            # Bug fix: an expired subscription previously fell through
            # and attempted the download anyway.
            return

        if 'key' in self.addresses[address_i]:
            key = self.addresses[address_i]['key']
            secret = self.addresses[address_i]['secret']
        else:
            key, secret = get_key_secret(address,
                                         self.addresses[address_i]['wallet'])

        headers = get_signed_headers(ds_name, key, secret)
        log.info('Starting download of dataset for ingestion...')
        r = requests.post(
            '{}/marketplace/ingest'.format(AUTH_SERVER),
            headers=headers,
            stream=True,
        )
        if r.status_code != 200:
            raise MarketplaceHTTPRequest(
                request='ingest dataset',
                error=r.status_code,
            )

        log.info('Dataset downloaded successfully. Processing dataset...')
        bundle_folder = get_data_source_folder(ds_name)
        # Discard any previously ingested data before rebuilding.
        shutil.rmtree(bundle_folder, ignore_errors=True)
        target_path = get_temp_bundles_folder()
        try:
            decoder = MultipartDecoder.from_response(r)
            for counter, part in enumerate(decoder.parts, start=1):
                log.info("Processing file {} of {}".format(
                    counter, len(decoder.parts)))
                h = part.headers[b'Content-Disposition'].decode('utf-8')
                # Extract the filename from the Content-Disposition header.
                name = re.search(r'filename="(.*)"', h).group(1)

                filename = os.path.join(target_path, name)
                with open(filename, 'wb') as f:
                    f.write(part.content)

                self.process_temp_bundle(ds_name, filename)

        except NonMultipartContentTypeException:
            # The server answered 200 but with a plain (JSON) error body.
            response = r.json()
            raise MarketplaceHTTPRequest(
                request='ingest dataset',
                error=response,
            )

        log.info('{} ingested successfully'.format(ds_name))