Example #1
def transform_single_file(file_path, output_path, servicex=None):
    print("Transforming a single path: " + str(file_path) + " into " + output_path)
    # os.system("voms-proxy-info --all")
    r = os.system('bash /generated/runner.sh -r -d ' + file_path + ' -o ' + output_path + ' | tee log.txt')
    reason_bad = None
    if r != 0:
        reason_bad = "Error return from transformer: " + str(r)
    if (reason_bad is None) and not os.path.exists(output_path):
        reason_bad = "Output file " + output_path + " was not found"
    if reason_bad is not None:
        with open('log.txt', 'r') as f:
            errors = f.read()
            raise RuntimeError("Failed to transform input file " + file_path + ": " + reason_bad + ' -- errors: \n' + errors)

    if not object_store:
        flat_file = uproot.open(output_path)
        flat_tree_name = flat_file.keys()[0]
        attr_name_list = flat_file[flat_tree_name].keys()

        arrow_writer = ArrowWriter(file_format=args.result_format,
                                   object_store=object_store,
                                   messaging=messaging)
        # NB: We're converting the *output* ROOT file to Arrow arrays
        # TODO: Implement configurable chunk_size
        event_iterator = UprootEvents(file_path=output_path, tree_name=flat_tree_name,
                                      attr_name_list=attr_name_list, chunk_size=1000)
        transformer = UprootTransformer(event_iterator)
        arrow_writer.write_branches_to_arrow(transformer=transformer, topic_name=args.request_id,
                                             file_id=None, request_id=args.request_id)
        print("Kafka Timings: "+str(arrow_writer.messaging_timings))
Example #2
    def test_transform_file_kafka(self, mocker):
        mock_kafka = mocker.MagicMock(KafkaMessaging)

        aw = ArrowWriter(file_format='parquet',
                         object_store=None,
                         messaging=mock_kafka)

        mock_transformer = mocker.MagicMock(UprootTransformer)
        mock_transformer.file_path = '/tmp/foo'
        mock_transformer.chunk_size = 100
        mock_transformer.attr_name_list = ['a', 'b']

        data = OrderedDict([('strs', [chr(c) for c in range(ord('a'), ord('n'))]),
                            ('ints', list(range(1, 14)))])
        table = pa.Table.from_pydict(data)
        table2 = pa.Table.from_pydict(data)

        mock_transformer.arrow_table = mocker.Mock(return_value=iter([table, table2]))
        aw.write_branches_to_arrow(transformer=mock_transformer, topic_name='servicex',
                                   file_id=42, request_id="123-45")

        mock_transformer.arrow_table.assert_called_with()

        kafka_calls = mock_kafka.publish_message.call_args_list
        assert len(kafka_calls) == 2
        topic, key, py_arrow_buffer = kafka_calls[0][0]
        assert topic == 'servicex'
        assert key == b'/tmp/foo-0'

        reader = pa.RecordBatchStreamReader(py_arrow_buffer)
        table = reader.read_all()
        assert table.column_names == ['strs', 'ints']
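The key assertion above implies that the writer keys each Kafka message as <file_path>-<chunk index>. A tiny sketch of that scheme, inferred from the test expectations rather than from the ArrowWriter source:

def message_key(file_path, chunk_index):
    # Assumed keying scheme, inferred from key == b'/tmp/foo-0' above
    return '{}-{}'.format(file_path, chunk_index).encode()

assert message_key('/tmp/foo', 0) == b'/tmp/foo-0'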
Example #3
def transform_single_file(file_path,
                          output_path,
                          servicex=None,
                          tree_name='Events'):
    print("Transforming a single path: " + str(file_path))

    try:
        import generated_transformer
        start_transform = time.time()
        table = generated_transformer.run_query(file_path, tree_name)
        end_transform = time.time()
        print(
            f'generated_transformer.py: {round(end_transform - start_transform, 2)} sec'
        )

        start_serialization = time.time()
        # Round-trip through awkward1 to normalize the table layout
        # before converting with awkward0's toarrow
        table_awk1 = awkward1.from_awkward0(table)
        new_table = awkward1.to_awkward0(table_awk1)
        arrow = awkward.toarrow(new_table)
        end_serialization = time.time()
        print(
            f'awkward Table -> Arrow: {round(end_serialization - start_serialization, 2)} sec'
        )

        if output_path:
            writer = pq.ParquetWriter(output_path, arrow.schema)
            writer.write_table(table=arrow)
            writer.close()

    except Exception:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        traceback.print_tb(exc_traceback, limit=20, file=sys.stdout)
        print(exc_value)

        raise RuntimeError("Failed to transform input file " + file_path +
                           ": " + str(exc_value))

    if messaging:
        arrow_writer = ArrowWriter(file_format=args.result_format,
                                   object_store=None,
                                   messaging=messaging)

        # TODO: implement a configurable chunk_size parameter
        transformer = ArrowIterator(arrow,
                                    chunk_size=1000,
                                    file_path=file_path)
        arrow_writer.write_branches_to_arrow(transformer=transformer,
                                             topic_name=args.request_id,
                                             file_id=None,
                                             request_id=args.request_id)
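A quick way to sanity-check the Parquet file written above is to read it back with pyarrow.parquet (the pq already used by the example); a minimal sketch:

import pyarrow.parquet as pq

def verify_parquet(output_path):
    # Read back the file produced by ParquetWriter and report its shape
    table = pq.read_table(output_path)
    print('{}: {} rows, columns {}'.format(output_path, table.num_rows, table.column_names))
    return table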
Example #4
def transform_single_file(file_path, output_path, servicex=None):
    print("Transforming a single path: " + str(file_path) + " into " +
          output_path)

    with open('/generated/config.txt', 'r') as sel_file:
        selection = sel_file.readline()
    selection_split = selection.split(';')

    r = os.system(
        'cd CMSSW_10_2_21/src/TreeMaker/Production/test/ && source /opt/cms/cmsset_default.sh && eval `scramv1 runtime -sh` && python ${CMSSW_BASE}/src/TreeMaker/Production/test/unitTest.py '
        + selection_split[0] + ' dataset=' + str(file_path) + ' name=' +
        str(output_path).replace("_RA2AnalysisTree.root", "") + ' command="' +
        selection_split[1] +
        '" 2> log.txt && pwd && ls -alh ./ && ls -alh /home/cmsusr/ && ls -alh /servicex/'
    )
    reason_bad = None
    if r != 0:
        reason_bad = "Error return from transformer: " + str(r)
    if (reason_bad is None) and not os.path.exists(output_path):
        reason_bad = "Output file " + output_path + " was not found"
    if reason_bad is not None:
        with open('log.txt', 'r') as f:
            errors = f.read()
            raise RuntimeError("Failed to transform input file " + file_path +
                               ": " + reason_bad + ' -- errors: \n' + errors)

    if not object_store:
        flat_file = uproot.open(output_path)
        flat_tree_name = flat_file.keys()[0]
        attr_name_list = flat_file[flat_tree_name].keys()

        arrow_writer = ArrowWriter(file_format=args.result_format,
                                   object_store=object_store,
                                   messaging=messaging)
        # NB: We're converting the *output* ROOT file to Arrow arrays
        # TODO: Implement configurable chunk_size
        event_iterator = UprootEvents(file_path=output_path,
                                      tree_name=flat_tree_name,
                                      attr_name_list=attr_name_list,
                                      chunk_size=1000)
        transformer = UprootTransformer(event_iterator)
        arrow_writer.write_branches_to_arrow(transformer=transformer,
                                             topic_name=args.request_id,
                                             file_id=None,
                                             request_id=args.request_id)
        print("Kafka Timings: " + str(arrow_writer.messaging_timings))
Example #5
    def test_init(self, mocker):
        mock_object_store = mocker.MagicMock(ObjectStoreManager)
        mock_messaging = mocker.MagicMock(KafkaMessaging)

        aw = ArrowWriter(file_format='hdf5',
                         object_store=mock_object_store,
                         messaging=mock_messaging)

        assert aw.object_store == mock_object_store
        assert aw.file_format == 'hdf5'
        assert aw.messaging == mock_messaging
Example #6
    def test_transform_file_object_store(self, mocker):
        from servicex.transformer.scratch_file_writer import ScratchFileWriter

        mock_object_store = mocker.MagicMock(ObjectStoreManager)
        mock_scratch_file = mocker.MagicMock(ScratchFileWriter)
        scratch_file_init = mocker.patch(
            "servicex.transformer.scratch_file_writer.ScratchFileWriter",
            return_value=mock_scratch_file)

        mock_scratch_file.file_path = "/tmp/foo"

        aw = ArrowWriter(file_format='parquet',
                         object_store=mock_object_store,
                         messaging=None)

        mock_transformer = mocker.MagicMock(UprootTransformer)
        mock_transformer.file_path = '/tmp/foo'
        mock_transformer.chunk_size = 100
        mock_transformer.attr_name_list = ['a', 'b']

        data = OrderedDict([('strs', [chr(c) for c in range(ord('a'), ord('n'))]),
                            ('ints', list(range(1, 14)))])
        table = pa.Table.from_pydict(data)
        table2 = pa.Table.from_pydict(data)

        mock_transformer.arrow_table = mocker.Mock(return_value=iter([table, table2]))
        aw.write_branches_to_arrow(transformer=mock_transformer, topic_name='servicex',
                                   file_id=42, request_id="123-45")

        scratch_file_init.assert_called_with(file_format='parquet')
        mock_transformer.arrow_table.assert_called_with()
        mock_scratch_file.open_scratch_file.assert_called_once_with(table)
        mock_scratch_file.append_table_to_scratch.assert_has_calls(
            [call(table), call(table2)])

        mock_scratch_file.close_scratch_file.assert_called_once()

        mock_object_store.upload_file.assert_called_once_with("123-45", ":tmp:foo",
                                                              "/tmp/foo")
        mock_scratch_file.remove_scratch_file.assert_called_once()
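The upload_file expectation suggests the object-store key is the transformer's file path with '/' replaced by ':'. A one-line sketch of that transformation, inferred from the test rather than from the ArrowWriter source:

def object_store_key(file_path):
    # Inferred from upload_file('123-45', ':tmp:foo', '/tmp/foo') above
    return file_path.replace('/', ':')

assert object_store_key('/tmp/foo') == ':tmp:foo'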
Example #7
def transform_single_file(file_path,
                          output_path,
                          servicex=None,
                          tree_name='Events'):
    print("Transforming a single path: " + str(file_path))

    try:
        import generated_transformer
        table = generated_transformer.run_query(file_path, tree_name)

        # Nested, lazy awkward arrays cannot be handed to Arrow directly,
        # so go through a pandas DataFrame first
        new_table = pd.DataFrame(table)
        arrow = pa.Table.from_pandas(new_table)

        if output_path:
            writer = pq.ParquetWriter(output_path, arrow.schema)
            writer.write_table(table=arrow)
            writer.close()

    except Exception:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        traceback.print_tb(exc_traceback, limit=20, file=sys.stdout)
        print(exc_value)

        raise RuntimeError("Failed to transform input file " + file_path +
                           ": " + str(exc_value))

    if messaging:
        arrow_writer = ArrowWriter(file_format=args.result_format,
                                   object_store=None,
                                   messaging=messaging)

        # TODO: implement a configurable chunk_size parameter
        transformer = ArrowIterator(arrow,
                                    chunk_size=1000,
                                    file_path=file_path)
        arrow_writer.write_branches_to_arrow(transformer=transformer,
                                             topic_name=args.request_id,
                                             file_id=None,
                                             request_id=args.request_id)
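A toy illustration of the DataFrame round-trip used above: list-valued columns survive the pandas-to-Arrow conversion as Arrow list arrays, which is what makes this fallback work for jagged event data (the column names are made up):

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({'jet_pt': [[10.0, 20.0], [5.0]], 'n_jets': [2, 1]})
arrow = pa.Table.from_pandas(df)
print(arrow.schema)  # jet_pt: list<item: double>, n_jets: int64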