def process_temp_bundle(self, ds_name, path):
    """Merge a freshly downloaded temp bundle into the main bundle
    for the given data source.

    Parameters
    ----------
    ds_name
        Name of the data source whose bundle is updated.
    path
        Path to the downloaded archive to extract and merge.
    """
    extracted = extract_bundle(path)
    target_folder = get_data_source_folder(ds_name)
    ensure_directory(target_folder)

    if not os.listdir(target_folder):
        # No data ingested yet: adopt the extracted bundle wholesale by
        # moving it into place (the empty folder is removed first).
        shutil.rmtree(target_folder, ignore_errors=True)
        os.rename(extracted, target_folder)
    else:
        # Data already present: append the new rows to the existing ctable.
        source_table = bcolz.ctable(rootdir=extracted, mode='r')
        target_table = bcolz.ctable(rootdir=target_folder, mode='a')
        target_table.append(source_table)
def process_temp_bundle(self, ds_name, path):
    """
    Merge the temp bundle into the main bundle for the specified
    data source.

    Parameters
    ----------
    ds_name
        Name of the data source whose bundle receives the data.
    path
        Path to the downloaded bundle archive to extract and merge.

    Returns
    -------
    None
    """
    tmp_bundle = extract_bundle(path)
    bundle_folder = get_data_source_folder(ds_name)
    ensure_directory(bundle_folder)
    if os.listdir(bundle_folder):
        # Data already present: merge the extracted table into it.
        zsource = bcolz.ctable(rootdir=tmp_bundle, mode='r')
        # NOTE(review): ztarget is opened read-only ('r'); confirm that
        # merge_bundles re-opens or copies before writing to it.
        ztarget = bcolz.ctable(rootdir=bundle_folder, mode='r')
        merge_bundles(zsource, ztarget)
    else:
        # Empty target: just move the extracted bundle into place.
        # NOTE(review): bundle_folder exists (ensure_directory above) —
        # confirm os.rename over an existing dir is safe on all platforms.
        os.rename(tmp_bundle, bundle_folder)
def clean(self, data_source_name, data_frequency=None):
    """
    Delete ingested data for a data source.

    Parameters
    ----------
    data_source_name
        Name of the data source to remove (matched case-insensitively).
    data_frequency : optional
        When given, only the bundle folder for that frequency is removed;
        otherwise the data source's entire folder is deleted.
    """
    data_source_name = data_source_name.lower()
    if data_frequency is None:
        folder = get_data_source_folder(data_source_name)
    else:
        folder = get_bundle_folder(data_source_name, data_frequency)
    shutil.rmtree(folder)
def get_dataset(self, ds_name, start=None, end=None):
    """Load an ingested dataset as a DataFrame indexed by (date, symbol).

    ``start`` and ``end`` are accepted but not applied yet.
    """
    # TODO: filter ctable by start and end date
    folder = get_data_source_folder(ds_name.lower())
    table = bcolz.ctable(rootdir=folder, mode='r')
    frame = table.todataframe()  # type: pd.DataFrame
    frame.set_index(['date', 'symbol'], drop=True, inplace=True)
    # TODO: implement the filter more carefully
    return frame
def get_dataset(self, ds_name, start=None, end=None):
    """
    Load an ingested dataset, optionally restricted to a date window.

    Parameters
    ----------
    ds_name
        Data source name (matched case-insensitively).
    start, end : optional
        Inclusive lower / exclusive upper date bounds. Each must support
        ``.encode()`` (i.e. be a str) — presumably an ISO date string;
        TODO confirm against callers.

    Returns
    -------
    pd.DataFrame
        Dataset indexed by (date, symbol).
    """
    ds_name = ds_name.lower()
    bundle_folder = get_data_source_folder(ds_name)
    z = bcolz.ctable(rootdir=bundle_folder, mode='r')

    # Build the bcolz query from whichever bounds were supplied, so all
    # start/end combinations share one code path instead of three
    # near-duplicate fetchwhere branches.
    conditions = []
    bounds = {}
    if start is not None:
        conditions.append('(date>=start_date)')
        bounds['start_date'] = start.encode()
    if end is not None:
        conditions.append('(date<end_date)')
        bounds['end_date'] = end.encode()
    if conditions:
        z = z.fetchwhere(' & '.join(conditions), user_dict=bounds)

    df = z.todataframe()  # type: pd.DataFrame
    df.set_index(['date', 'symbol'], drop=True, inplace=True)
    return df
def get_dataset(self, ds_name, start=None, end=None):
    """Read an ingested dataset into a DataFrame keyed by (date, symbol).

    The ``start``/``end`` bounds are accepted but not applied yet.
    """
    # TODO: filter the ctable by start and end date; a previous
    # fetchwhere()-based attempt using to_datetime64 bounds was disabled.
    folder = get_data_source_folder(ds_name.lower())
    table = bcolz.ctable(rootdir=folder, mode='r')
    frame = table.todataframe()  # type: pd.DataFrame
    frame.set_index(['date', 'symbol'], drop=True, inplace=True)
    # TODO: implement the filter more carefully
    return frame
def clean(self, ds_name=None, data_frequency=None):
    """
    Delete ingested data for a marketplace data source.

    Parameters
    ----------
    ds_name : optional
        Data source to remove (matched case-insensitively). When None,
        the user picks one interactively from the marketplace folder.
    data_frequency : optional
        When given, only the bundle folder for that frequency is removed;
        otherwise the data source's entire folder is deleted.
    """
    if ds_name is None:
        # List candidate dataset folders under the marketplace root,
        # skipping the temp_bundles working directory.
        mktplace_root = get_marketplace_folder()
        folders = [os.path.basename(f.rstrip('/')) for f in
                   glob.glob('{}/*/'.format(mktplace_root))
                   if 'temp_bundles' not in f]
        # Prompt until the user enters a valid index.
        while True:
            for idx, f in enumerate(folders):
                print('{}\t{}'.format(idx, f))
            dataset_num = input('Choose the dataset you want to '
                                'clean [0..{}]: '.format(len(folders) - 1))
            try:
                dataset_num = int(dataset_num)
            except ValueError:
                print('Enter a number between 0 and {}'.format(
                    len(folders) - 1))
            else:
                if dataset_num not in range(0, len(folders)):
                    print('Enter a number between 0 and {}'.format(
                        len(folders) - 1))
                else:
                    ds_name = folders[dataset_num]
                    break
    ds_name = ds_name.lower()
    if data_frequency is None:
        folder = get_data_source_folder(ds_name)
    else:
        folder = get_bundle_folder(ds_name, data_frequency)
    shutil.rmtree(folder)
def ingest(self, ds_name=None, start=None, end=None, force_download=False):
    """
    Download a subscribed dataset from the marketplace and ingest it.

    Verifies that the dataset is registered and that the chosen address
    holds a live subscription before downloading the multipart bundle
    from the auth server and merging each part via process_temp_bundle.

    Parameters
    ----------
    ds_name : optional
        Dataset to ingest; when None the user picks interactively.
    start, end : optional
        NOTE(review): accepted but not used by this body — confirm intent.
    force_download : bool, optional
        NOTE(review): currently unused in this body — confirm intent.

    Raises
    ------
    MarketplaceHTTPRequest
        If the auth server rejects the request or returns a
        non-multipart response.
    """
    if ds_name is None:
        df_sets = self._list()
        if df_sets.empty:
            print('There are no datasets available yet.')
            return

        set_print_settings()
        # Prompt until the user enters a valid dataset index.
        while True:
            print(df_sets)
            dataset_num = input('Choose the dataset you want to '
                                'ingest [0..{}]: '.format(
                                    df_sets.size - 1))
            try:
                dataset_num = int(dataset_num)
            except ValueError:
                print('Enter a number between 0 and {}'.format(
                    df_sets.size - 1))
            else:
                if dataset_num not in range(0, df_sets.size):
                    print('Enter a number between 0 and {}'.format(
                        df_sets.size - 1))
                else:
                    ds_name = df_sets.iloc[dataset_num]['dataset']
                    break

    # ds_name = ds_name.lower()
    # TODO: catch error conditions
    provider_info = self.mkt_contract.functions.getDataProviderInfo(
        Web3.toHex(ds_name)).call()

    if not provider_info[4]:
        print('The requested "{}" dataset is not registered in '
              'the Data Marketplace.'.format(ds_name))
        return

    address, address_i = self.choose_pubaddr()
    fns = self.mkt_contract.functions
    check_sub = fns.checkAddressSubscription(
        address, Web3.toHex(ds_name)).call()

    if check_sub[0] != address or self.to_text(check_sub[1]) != ds_name:
        # Fix: "Plese" -> "Please" in the user-facing message.
        print('You are not subscribed to dataset "{}" with address {}. '
              'Please subscribe first.'.format(ds_name, address))
        return

    if not check_sub[5]:
        # Fix: separator was missing between the adjacent literals
        # '...UTC.' and 'Please renew...'.
        print('Your subscription to dataset "{}" expired on {} UTC.\n'
              'Please renew your subscription by running:\n'
              'catalyst marketplace subscribe --dataset={}'.format(
                  ds_name,
                  pd.to_datetime(check_sub[4], unit='s', utc=True),
                  ds_name))
        # Fix: abort here — previously execution fell through and
        # downloaded data despite the expired subscription.
        return

    if 'key' in self.addresses[address_i]:
        key = self.addresses[address_i]['key']
        secret = self.addresses[address_i]['secret']
    else:
        key, secret = get_key_secret(address,
                                     self.addresses[address_i]['wallet'])

    headers = get_signed_headers(ds_name, key, secret)
    log.info('Starting download of dataset for ingestion...')
    r = requests.post(
        '{}/marketplace/ingest'.format(AUTH_SERVER),
        headers=headers,
        stream=True,
    )
    if r.status_code == 200:
        log.info('Dataset downloaded successfully.\n'
                 'Processing dataset...')
        # Drop any previously ingested copy before re-processing.
        bundle_folder = get_data_source_folder(ds_name)
        shutil.rmtree(bundle_folder, ignore_errors=True)
        target_path = get_temp_bundles_folder()
        try:
            decoder = MultipartDecoder.from_response(r)
            counter = 1
            for part in decoder.parts:
                log.info("Processing file {} of {}".format(
                    counter, len(decoder.parts)))
                h = part.headers[b'Content-Disposition'].decode('utf-8')
                # Extract the filename from the Content-Disposition header.
                name = re.search(r'filename="(.*)"', h).group(1)
                filename = os.path.join(target_path, name)
                with open(filename, 'wb') as f:
                    f.write(part.content)
                self.process_temp_bundle(ds_name, filename)
                counter += 1
        except NonMultipartContentTypeException:
            # The server answered with an error payload instead of a
            # multipart bundle; surface it to the caller.
            response = r.json()
            raise MarketplaceHTTPRequest(
                request='ingest dataset',
                error=response,
            )
    else:
        raise MarketplaceHTTPRequest(
            request='ingest dataset',
            error=r.status_code,
        )
    log.info('{} ingested successfully'.format(ds_name))