def filter_items(input, output, predicate):
    """Filter items from the ``input`` file into the ``output`` file.

    Args:
        input: path of the newline-delimited JSON input file.
            (NOTE(review): parameter shadows the ``input`` builtin; kept for
            backward compatibility with existing callers.)
        output: path the surviving items are written to.
        predicate: either a callable taking one item and returning a bool,
            or a string expression evaluated per item with the item bound
            as ``item``.
    """
    # BUG FIX: callers in this file pass plain lambdas; for those,
    # `'datetime' in predicate` raised TypeError (a function is not
    # iterable). Callables are forwarded directly.
    if callable(predicate):
        misc_utils.filter_items(input, output, predicate)
        return

    def evaluated_predicate(item):
        # SECURITY NOTE(review): eval() executes arbitrary code from the
        # predicate string - only acceptable for trusted predicate sources.
        eval_environment = globals()
        if 'datetime' in predicate:
            # Make the datetime module available to predicate expressions.
            import datetime
            eval_environment['datetime'] = datetime
        return eval(predicate, eval_environment, {'item': item})

    misc_utils.filter_items(input, output, evaluated_predicate)
def test_filter_items(tmpdir):
    """End-to-end check: filter_items keeps only the NDJSON lines that
    satisfy the predicate.

    Args:
        tmpdir: pytest fixture providing a temporary directory.
    """
    input_file = str(tmpdir.join('input.json'))
    # Use context managers so the files are closed and flushed before the
    # filtering step reads them (the original open(...).write(...) relied on
    # CPython refcounting to close the handle).
    with open(input_file, 'w') as f:
        f.write('''{"field1": "x1", "field2": "y1"}
{"field1": "x2", "field2": "y2"}
''')

    output_file = str(tmpdir.join('output.json'))
    filter_items(input_file, output_file, lambda item: item['field1'] == 'x1')

    expected_file = str(tmpdir.join('expected.json'))
    with open(expected_file, 'w') as f:
        f.write('''{"field1": "x1", "field2": "y1"}
''')

    compare_lines_ignore_order(
        read_file(expected_file), read_file(output_file)
    )
def export_all(chain, partitions, output_dir, provider_uri, max_workers, batch_size, enrich):
    """Export blocks and transactions for each block-range partition.

    For every partition, exports blocks and transactions to per-range JSON
    files under ``output_dir``, optionally enriches the transactions via RPC,
    and - when the partition carries a date - filters the exported items down
    to that date.

    Args:
        chain: chain identifier passed through to the export/enrich jobs.
        partitions: iterable of ``(start_block, end_block, partition_dir,
            [date])`` tuples; the optional trailing date triggers
            timestamp-based filtering of the exported files.
        output_dir: root directory for the exported files.
        provider_uri: URI of the Bitcoin RPC provider.
        max_workers: worker count handed to the jobs.
        batch_size: RPC batch size handed to the jobs.
        enrich: when truthy, run EnrichTransactionsJob on the exported
            transactions.
    """
    for batch_start_block, batch_end_block, partition_dir, *args in partitions:
        # # # start # # #

        start_time = time()

        padded_batch_start_block = str(batch_start_block).zfill(8)
        padded_batch_end_block = str(batch_end_block).zfill(8)
        block_range = '{padded_batch_start_block}-{padded_batch_end_block}'.format(
            padded_batch_start_block=padded_batch_start_block,
            padded_batch_end_block=padded_batch_end_block,
        )
        file_name_suffix = '{padded_batch_start_block}_{padded_batch_end_block}'.format(
            padded_batch_start_block=padded_batch_start_block,
            padded_batch_end_block=padded_batch_end_block,
        )

        # # # blocks_and_transactions # # #

        blocks_output_dir = '{output_dir}/blocks{partition_dir}'.format(
            output_dir=output_dir,
            partition_dir=partition_dir,
        )
        # NOTE(review): this creates the *parent* of blocks_output_dir; it
        # relies on partition_dir ending with a separator (or on the item
        # exporter creating the leaf directory) - confirm against callers.
        os.makedirs(os.path.dirname(blocks_output_dir), exist_ok=True)

        transactions_output_dir = '{output_dir}/transactions{partition_dir}'.format(
            output_dir=output_dir,
            partition_dir=partition_dir,
        )
        os.makedirs(os.path.dirname(transactions_output_dir), exist_ok=True)

        blocks_file = '{blocks_output_dir}/blocks_{file_name_suffix}.json'.format(
            blocks_output_dir=blocks_output_dir,
            file_name_suffix=file_name_suffix,
        )
        transactions_file = '{transactions_output_dir}/transactions_{file_name_suffix}.json'.format(
            transactions_output_dir=transactions_output_dir,
            file_name_suffix=file_name_suffix,
        )
        enriched_transactions_file = '{transactions_output_dir}/enriched_transactions_{file_name_suffix}.json'.format(
            transactions_output_dir=transactions_output_dir,
            file_name_suffix=file_name_suffix,
        )
        logger.info('Exporting blocks {block_range} to {blocks_file}'.format(
            block_range=block_range,
            blocks_file=blocks_file,
        ))
        logger.info(
            'Exporting transactions from blocks {block_range} to {transactions_file}'
            .format(
                block_range=block_range,
                transactions_file=transactions_file,
            ))

        job = ExportBlocksJob(
            chain=chain,
            start_block=batch_start_block,
            end_block=batch_end_block,
            batch_size=batch_size,
            bitcoin_rpc=ThreadLocalProxy(lambda: BitcoinRpc(provider_uri)),
            max_workers=max_workers,
            item_exporter=blocks_and_transactions_item_exporter(
                blocks_file, transactions_file),
            export_blocks=blocks_file is not None,
            export_transactions=transactions_file is not None)
        job.run()

        if enrich:
            # BUG FIX: the original `with ... as transactions_file:` rebound
            # the path name to the open file handle, so later uses (the
            # filtering log message and `transactions_file + '.filtered'`)
            # operated on a closed file object instead of the path.
            with smart_open(transactions_file, 'r') as transactions_fh:
                job = EnrichTransactionsJob(
                    transactions_iterable=(
                        json.loads(transaction) for transaction in transactions_fh),
                    batch_size=batch_size,
                    bitcoin_rpc=ThreadLocalProxy(
                        lambda: BitcoinRpc(provider_uri)),
                    max_workers=max_workers,
                    item_exporter=blocks_and_transactions_item_exporter(
                        None, enriched_transactions_file),
                    chain=chain)
                job.run()

        # `args` comes from star-unpacking, so it is always a list; a plain
        # truthiness test replaces the redundant `is not None and len(...)`.
        if args:
            date = args[0]
            logger.info('Filtering blocks {blocks_file} by date {date}'.format(
                blocks_file=blocks_file,
                date=date,
            ))

            def filter_by_date(item, field):
                # Build the UTC datetime directly from the unix timestamp;
                # equivalent to the original naive
                # fromtimestamp().astimezone(utc) round trip.
                item_day = datetime.datetime.fromtimestamp(
                    item[field], tz=datetime.timezone.utc).strftime('%Y-%m-%d')
                return item_day == date.strftime('%Y-%m-%d')

            filtered_blocks_file = blocks_file + '.filtered'
            filter_items(blocks_file, filtered_blocks_file,
                         lambda item: filter_by_date(item, 'timestamp'))
            shutil.move(filtered_blocks_file, blocks_file)

            logger.info(
                'Filtering transactions {transactions_file} by date {date}'.
                format(
                    transactions_file=transactions_file,
                    date=date,
                ))
            filtered_transactions_file = transactions_file + '.filtered'
            filter_items(transactions_file, filtered_transactions_file,
                         lambda item: filter_by_date(item, 'block_timestamp'))
            shutil.move(filtered_transactions_file, transactions_file)

        # # # finish # # #
        end_time = time()
        time_diff = round(end_time - start_time, 5)
        logger.info(
            'Exporting blocks {block_range} took {time_diff} seconds'.format(
                block_range=block_range,
                time_diff=time_diff,
            ))