def transform_single_file(file_path, output_path, servicex=None):
    print("Transforming a single path: " + str(file_path) + " into " + output_path)

    # os.system("voms-proxy-info --all")
    r = os.system('bash /generated/runner.sh -r -d ' + file_path +
                  ' -o ' + output_path + ' | tee log.txt')

    # Collect a failure reason: a non-zero exit code or a missing output file
    reason_bad = None
    if r != 0:
        reason_bad = "Error return from transformer: " + str(r)
    if (reason_bad is None) and not os.path.exists(output_path):
        reason_bad = "Output file " + output_path + " was not found"
    if reason_bad is not None:
        with open('log.txt', 'r') as f:
            errors = f.read()
        raise RuntimeError("Failed to transform input file " + file_path + ": " +
                           reason_bad + ' -- errors: \n' + errors)

    if not object_store:
        flat_file = uproot.open(output_path)
        flat_tree_name = flat_file.keys()[0]
        attr_name_list = flat_file[flat_tree_name].keys()

        arrow_writer = ArrowWriter(file_format=args.result_format,
                                   object_store=object_store,
                                   messaging=messaging)

        # NB: We're converting the *output* ROOT file to Arrow arrays
        # TODO: Implement configurable chunk_size
        event_iterator = UprootEvents(file_path=output_path,
                                      tree_name=flat_tree_name,
                                      attr_name_list=attr_name_list,
                                      chunk_size=1000)
        transformer = UprootTransformer(event_iterator)
        arrow_writer.write_branches_to_arrow(transformer=transformer,
                                             topic_name=args.request_id,
                                             file_id=None,
                                             request_id=args.request_id)
        print("Kafka Timings: " + str(arrow_writer.messaging_timings))

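# A minimal sketch (not part of the transformer above) of the same runner.sh
# invocation done with subprocess.run instead of os.system: it captures the
# combined stdout/stderr directly, so no tee'd log file is needed for error
# reporting. The runner path and flags are the ones used above.
import subprocess

def run_runner_sh(file_path, output_path):
    result = subprocess.run(
        ['bash', '/generated/runner.sh', '-r', '-d', file_path, '-o', output_path],
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    if result.returncode != 0:
        raise RuntimeError("Failed to transform input file " + file_path +
                           " -- errors:\n" + result.stdout.decode())
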
def test_transform_file_kafka(self, mocker):
    mock_kafka = mocker.MagicMock(KafkaMessaging)
    aw = ArrowWriter(file_format='parquet', object_store=None, messaging=mock_kafka)

    mock_transformer = mocker.MagicMock(UprootTransformer)
    mock_transformer.file_path = '/tmp/foo'
    mock_transformer.chunk_size = 100
    mock_transformer.attr_name_list = ['a', 'b']

    data = OrderedDict([('strs', [chr(c) for c in range(ord('a'), ord('n'))]),
                        ('ints', list(range(1, 14)))])
    table = pa.Table.from_pydict(data)
    table2 = pa.Table.from_pydict(data)
    mock_transformer.arrow_table = mocker.Mock(return_value=iter([table, table2]))

    aw.write_branches_to_arrow(transformer=mock_transformer, topic_name='servicex',
                               file_id=42, request_id="123-45")

    mock_transformer.arrow_table.assert_called_with()
    kafka_calls = mock_kafka.publish_message.call_args_list
    assert len(kafka_calls) == 2
    topic, key, py_arrow_buffer = kafka_calls[0][0]
    assert topic == 'servicex'
    assert key == b'/tmp/foo-0'
    reader = pa.RecordBatchStreamReader(py_arrow_buffer)
    table = reader.read_all()
    assert table.column_names == ['strs', 'ints']

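# A self-contained sketch of the Arrow IPC round trip that the assertions above
# rely on: write_branches_to_arrow is expected to publish each chunk as a
# record-batch stream, which RecordBatchStreamReader can then reassemble.
import pyarrow as pa

def roundtrip_table(table):
    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchStreamWriter(sink, table.schema)
    writer.write_table(table)
    writer.close()
    reader = pa.RecordBatchStreamReader(sink.getvalue())
    return reader.read_all()

# e.g. roundtrip_table(pa.Table.from_pydict({'ints': [1, 2, 3]})).column_names == ['ints']
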
def transform_single_file(file_path, output_path, servicex=None, tree_name='Events'):
    print("Transforming a single path: " + str(file_path))

    try:
        import generated_transformer
        start_transform = time.time()
        table = generated_transformer.run_query(file_path, tree_name)
        end_transform = time.time()
        print(f'generated_transformer.py: {round(end_transform - start_transform, 2)} sec')

        start_serialization = time.time()
        # Round-trip the table through awkward1 and back before converting to Arrow
        table_awk1 = awkward1.from_awkward0(table)
        new_table = awkward1.to_awkward0(table_awk1)
        arrow = awkward.toarrow(new_table)
        end_serialization = time.time()
        print(f'awkward Table -> Arrow: {round(end_serialization - start_serialization, 2)} sec')

        if output_path:
            writer = pq.ParquetWriter(output_path, arrow.schema)
            writer.write_table(table=arrow)
            writer.close()
    except Exception:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        traceback.print_tb(exc_traceback, limit=20, file=sys.stdout)
        print(exc_value)
        raise RuntimeError("Failed to transform input file " + file_path + ": " +
                           str(exc_value))

    if messaging:
        arrow_writer = ArrowWriter(file_format=args.result_format,
                                   object_store=None,
                                   messaging=messaging)
        # TODO: implement a configurable chunk_size parameter
        transformer = ArrowIterator(arrow, chunk_size=1000, file_path=file_path)
        arrow_writer.write_branches_to_arrow(transformer=transformer,
                                             topic_name=args.request_id,
                                             file_id=None,
                                             request_id=args.request_id)

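# Hypothetical sketch of the run_query entry point that generated_transformer.py
# is assumed to expose; the real module is emitted by the ServiceX code
# generator. Written against uproot3/awkward0-era APIs to match the
# awkward1.from_awkward0 call above; the branch names are purely illustrative.
import uproot
import awkward

def run_query(file_path, tree_name):
    tree = uproot.open(file_path)[tree_name]
    arrays = tree.arrays(['Electrons_pt', 'Electrons_eta'], namedecode='utf-8')
    return awkward.Table(arrays)
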
def transform_single_file(file_path, output_path, servicex=None):
    print("Transforming a single path: " + str(file_path) + " into " + output_path)

    # The code generator writes a single semicolon-separated line to config.txt:
    # the first field is passed straight to unitTest.py, the second becomes its
    # command= argument
    selection = ''
    with open('/generated/config.txt', 'r') as sel_file:
        selection = sel_file.readline()
    selection_split = selection.split(';')

    r = os.system(
        'cd CMSSW_10_2_21/src/TreeMaker/Production/test/ && '
        'source /opt/cms/cmsset_default.sh && '
        'eval `scramv1 runtime -sh` && '
        'python ${CMSSW_BASE}/src/TreeMaker/Production/test/unitTest.py ' +
        selection_split[0] +
        ' dataset=' + str(file_path) +
        ' name=' + str(output_path).replace("_RA2AnalysisTree.root", "") +
        ' command="' + selection_split[1] + '" 2> log.txt && '
        'pwd && ls -alh ./ && ls -alh /home/cmsusr/ && ls -alh /servicex/')

    # Collect a failure reason: a non-zero exit code or a missing output file
    reason_bad = None
    if r != 0:
        reason_bad = "Error return from transformer: " + str(r)
    if (reason_bad is None) and not os.path.exists(output_path):
        reason_bad = "Output file " + output_path + " was not found"
    if reason_bad is not None:
        with open('log.txt', 'r') as f:
            errors = f.read()
        raise RuntimeError("Failed to transform input file " + file_path + ": " +
                           reason_bad + ' -- errors: \n' + errors)

    if not object_store:
        flat_file = uproot.open(output_path)
        flat_tree_name = flat_file.keys()[0]
        attr_name_list = flat_file[flat_tree_name].keys()

        arrow_writer = ArrowWriter(file_format=args.result_format,
                                   object_store=object_store,
                                   messaging=messaging)

        # NB: We're converting the *output* ROOT file to Arrow arrays
        # TODO: Implement configurable chunk_size
        event_iterator = UprootEvents(file_path=output_path,
                                      tree_name=flat_tree_name,
                                      attr_name_list=attr_name_list,
                                      chunk_size=1000)
        transformer = UprootTransformer(event_iterator)
        arrow_writer.write_branches_to_arrow(transformer=transformer,
                                             topic_name=args.request_id,
                                             file_id=None,
                                             request_id=args.request_id)
        print("Kafka Timings: " + str(arrow_writer.messaging_timings))

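# A minimal sketch of the /generated/config.txt contract assumed above: one line
# holding the unitTest.py scenario and the TreeMaker command, separated by a
# semicolon. Splitting with maxsplit=1 keeps any semicolons inside the command.
def read_selection(config_path='/generated/config.txt'):
    with open(config_path, 'r') as sel_file:
        scenario, command = sel_file.readline().rstrip('\n').split(';', 1)
    return scenario, command
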
def test_init(self, mocker):
    mock_object_store = mocker.MagicMock(ObjectStoreManager)
    mock_messaging = mocker.MagicMock(KafkaMessaging)
    aw = ArrowWriter(file_format='hdf5',
                     object_store=mock_object_store,
                     messaging=mock_messaging)
    assert aw.object_store == mock_object_store
    assert aw.file_format == 'hdf5'
    assert aw.messaging == mock_messaging

def test_transform_file_object_store(self, mocker):
    from servicex.transformer.scratch_file_writer import ScratchFileWriter

    mock_object_store = mocker.MagicMock(ObjectStoreManager)
    mock_scratch_file = mocker.MagicMock(ScratchFileWriter)
    scratch_file_init = mocker.patch(
        "servicex.transformer.scratch_file_writer.ScratchFileWriter",
        return_value=mock_scratch_file)
    mock_scratch_file.file_path = "/tmp/foo"

    aw = ArrowWriter(file_format='parquet',
                     object_store=mock_object_store,
                     messaging=None)

    mock_transformer = mocker.MagicMock(UprootTransformer)
    mock_transformer.file_path = '/tmp/foo'
    mock_transformer.chunk_size = 100
    mock_transformer.attr_name_list = ['a', 'b']

    data = OrderedDict([('strs', [chr(c) for c in range(ord('a'), ord('n'))]),
                        ('ints', list(range(1, 14)))])
    table = pa.Table.from_pydict(data)
    table2 = pa.Table.from_pydict(data)
    mock_transformer.arrow_table = mocker.Mock(return_value=iter([table, table2]))

    aw.write_branches_to_arrow(transformer=mock_transformer, topic_name='servicex',
                               file_id=42, request_id="123-45")

    scratch_file_init.assert_called_with(file_format='parquet')
    mock_transformer.arrow_table.assert_called_with()
    mock_scratch_file.open_scratch_file.assert_called_once_with(table)
    mock_scratch_file.append_table_to_scratch.assert_has_calls(
        [call(table), call(table2)])
    mock_scratch_file.close_scratch_file.assert_called_once()
    mock_object_store.upload_file.assert_called_once_with("123-45", ":tmp:foo", "/tmp/foo")
    mock_scratch_file.remove_scratch_file.assert_called_once()

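# The ':tmp:foo' object name asserted above looks like the scratch file path
# with each '/' replaced by ':'; a one-line sketch of that assumed mapping:
def object_name_from_path(file_path):
    return file_path.replace('/', ':')

# object_name_from_path('/tmp/foo') == ':tmp:foo'
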
def transform_single_file(file_path, output_path, servicex=None, tree_name='Events'):
    print("Transforming a single path: " + str(file_path))

    try:
        import generated_transformer
        table = generated_transformer.run_query(file_path, tree_name)

        # Deal with messy, nested lazy arrays which cannot be converted to Arrow
        # directly by going through a pandas DataFrame first
        new_table = pd.DataFrame(table)
        arrow = pa.Table.from_pandas(new_table)

        if output_path:
            writer = pq.ParquetWriter(output_path, arrow.schema)
            writer.write_table(table=arrow)
            writer.close()
    except Exception:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        traceback.print_tb(exc_traceback, limit=20, file=sys.stdout)
        print(exc_value)
        raise RuntimeError("Failed to transform input file " + file_path + ": " +
                           str(exc_value))

    if messaging:
        arrow_writer = ArrowWriter(file_format=args.result_format,
                                   object_store=None,
                                   messaging=messaging)
        # TODO: implement a configurable chunk_size parameter
        transformer = ArrowIterator(arrow, chunk_size=1000, file_path=file_path)
        arrow_writer.write_branches_to_arrow(transformer=transformer,
                                             topic_name=args.request_id,
                                             file_id=None,
                                             request_id=args.request_id)

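# A self-contained sketch of the serialization path used above: a DataFrame is
# converted to an Arrow Table and written to Parquet with ParquetWriter.
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

def write_dataframe_to_parquet(df, output_path):
    arrow = pa.Table.from_pandas(df)
    writer = pq.ParquetWriter(output_path, arrow.schema)
    writer.write_table(table=arrow)
    writer.close()

# e.g. write_dataframe_to_parquet(pd.DataFrame({'pt': [10.5, 22.1]}), '/tmp/out.parquet')
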