Example #1
    def process(self, element, load_job_name_prefix, *schema_side_inputs):
        # Each load job is assumed to have files respecting these constraints:
        # 1. Total size of all files < 15 TB (Max size for load jobs)
        # 2. Total no. of files in a single load job < 10,000
        # This assumption means that there will always be a single load job
        # triggered for each partition of files.
        destination = element[0]
        files = element[1]

        if callable(self.schema):
            schema = self.schema(destination, *schema_side_inputs)
        elif isinstance(self.schema, vp.ValueProvider):
            schema = self.schema.get()
        else:
            schema = self.schema

        if callable(self.additional_bq_parameters):
            additional_parameters = self.additional_bq_parameters(destination)
        elif isinstance(self.additional_bq_parameters, vp.ValueProvider):
            additional_parameters = self.additional_bq_parameters.get()
        else:
            additional_parameters = self.additional_bq_parameters

        table_reference = bigquery_tools.parse_table_reference(destination)
        if table_reference.projectId is None:
            table_reference.projectId = vp.RuntimeValueProvider.get_value(
                'project', str, '')
        # Load jobs for a single destination are always triggered from the same
        # worker. This means that we can generate a deterministic numbered job id
        # and not need to worry about collisions.
        destination_hash = _bq_uuid(
            '%s:%s.%s' % (table_reference.projectId, table_reference.datasetId,
                          table_reference.tableId))
        uid = _bq_uuid()
        job_name = '%s_%s_%s' % (load_job_name_prefix, destination_hash, uid)
        logging.debug('Load job has %s files. Job name is %s.', len(files),
                      job_name)

        if self.temporary_tables:
            # For temporary tables, we create a new table whose name includes the job ID.
            table_reference.tableId = job_name
            yield pvalue.TaggedOutput(TriggerLoadJobs.TEMP_TABLES,
                                      table_reference)

        logging.info(
            'Triggering job %s to load data to BigQuery table %s. '
            'Schema: %s. Additional parameters: %s', job_name, table_reference,
            schema, additional_parameters)
        job_reference = self.bq_wrapper.perform_load_job(
            table_reference,
            files,
            job_name,
            schema=schema,
            write_disposition=self.write_disposition,
            create_disposition=self.create_disposition,
            additional_load_parameters=additional_parameters)
        yield (destination, job_reference)
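
All of the snippets on this page follow the same contract: a DoFn (or a function used with FlatMap) yields pvalue.TaggedOutput values for its side outputs and plain values for its main output, and the calling pipeline splits the results with .with_outputs(). A minimal, self-contained sketch of that wiring (the function name, tag, and element values below are hypothetical, not taken from Example #1):

import apache_beam as beam
from apache_beam import pvalue

ODD_TAG = 'odd'


def split_even_odd(n):
    # Plain yields go to the main output; TaggedOutput values go to the named tag.
    if n % 2 == 0:
        yield n
    else:
        yield pvalue.TaggedOutput(ODD_TAG, n)


with beam.Pipeline() as p:
    results = (
        p
        | beam.Create([1, 2, 3, 4])
        | beam.FlatMap(split_even_odd).with_outputs(ODD_TAG, main='even'))
    _ = results.even | 'PrintEvens' >> beam.Map(print)
    _ = results[ODD_TAG] | 'PrintOdds' >> beam.Map(print)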
Example #2
  def process(self, element, load_job_name_prefix, *schema_side_inputs):
    destination = element[0]
    files = iter(element[1])

    if callable(self.schema):
      schema = self.schema(destination, *schema_side_inputs)
    elif isinstance(self.schema, vp.ValueProvider):
      schema = self.schema.get()
    else:
      schema = self.schema

    if callable(self.additional_bq_parameters):
      additional_parameters = self.additional_bq_parameters(destination)
    elif isinstance(self.additional_bq_parameters, vp.ValueProvider):
      additional_parameters = self.additional_bq_parameters.get()
    else:
      additional_parameters = self.additional_bq_parameters

    batch_of_files = list(itertools.islice(files, _MAXIMUM_SOURCE_URIS))
    while batch_of_files:

      table_reference = bigquery_tools.parse_table_reference(destination)
      if table_reference.projectId is None:
        table_reference.projectId = vp.RuntimeValueProvider.get_value(
            'project', str, '')
      # Load jobs for a single destination are always triggered from the same
      # worker. This means that we can generate a deterministic numbered job id
      # and not need to worry about collisions.
      destination_hash = _bq_uuid('%s:%s.%s' % (table_reference.projectId,
                                                table_reference.datasetId,
                                                table_reference.tableId))
      timestamp = int(time.time())
      job_name = '%s_%s_%s' % (
          load_job_name_prefix, destination_hash, timestamp)
      logging.debug('Batch of files has %s files. Job name is %s.',
                    len(batch_of_files), job_name)

      if self.temporary_tables:
        # For temporary tables, we create a new table whose name includes the job ID.
        table_reference.tableId = job_name
        yield pvalue.TaggedOutput(TriggerLoadJobs.TEMP_TABLES, table_reference)

      logging.info('Triggering job %s to load data to BigQuery table %s. '
                   'Schema: %s. Additional parameters: %s',
                   job_name, table_reference,
                   schema, additional_parameters)
      job_reference = self.bq_wrapper.perform_load_job(
          table_reference, batch_of_files, job_name,
          schema=schema,
          write_disposition=self.write_disposition,
          create_disposition=self.create_disposition,
          additional_load_parameters=additional_parameters)
      yield (destination, job_reference)

      # Prepare to trigger the next job
      batch_of_files = list(itertools.islice(files, _MAXIMUM_SOURCE_URIS))
Example #3
 def finish_bundle(self):
   for destination, file_path_writer in \
       self._destination_to_file_writer.items():
     (file_path, writer) = file_path_writer
     file_size = writer.tell()
     writer.close()
     yield pvalue.TaggedOutput(WriteRecordsToFile.WRITTEN_FILE_TAG,
                               GlobalWindows.windowed_value(
                                   (destination, (file_path, file_size))))
   self._destination_to_file_writer = {}
Example #4
 def process(self, element, *args, **kwargs):
     try:
         row = json.loads(element)  # the 'encoding' kwarg was removed in Python 3.9
         yield self.parse_row(row)
     except (TypeError, ValueError) as e:
         yield pvalue.TaggedOutput(self.TAG_BROKEN_DATA, {
             Field.Element: element,
             Field.Error: str(e)  # exceptions have no .message attribute in Python 3
         })
         self.broken_data_counter.inc()
Example #5
    def process(self, element: str):
        self.input_records_counter.inc()

        # We have two outputs: one for well formed input lines, and another one with potential parsing errors
        # (the parsing error output will be written to a different BigQuery table)
        try:
            # ignore header row
            if element != self._header_line:
                record: Record = data_classes.line2record(element)
                self.correct_records_counter.inc()
                yield pvalue.TaggedOutput(ParseCSVDoFn.CORRECT_OUTPUT_TAG,
                                          record)
        except TypeError as err:
            self.wrong_records_counter.inc()
            msg = str(err)
            yield pvalue.TaggedOutput(ParseCSVDoFn.WRONG_OUTPUT_TAG, {
                'error': msg,
                'line': element
            })
Example #6
    def process(self,
                element,
                publish_time=beam.DoFn.TimestampParam,
                table_dictionary=table_dictionary,
                *arg,
                **kwargs):
        if (element.data is not None and element.data != b''
                and element.data != "b''"):
            data = json.loads(element.data)
            data['publish_time'] = (
                datetime.datetime.utcfromtimestamp(float(publish_time)) +
                datetime.timedelta(hours=8)).strftime("%Y-%m-%d %H:%M:%S.%f")
            if list(data.keys()) == table_dictionary['columns_table_name']:
                data['timestamp'] = datetime.datetime(
                    data['timestamp']['DateTime']['year'],
                    data['timestamp']['DateTime']['month'],
                    data['timestamp']['DateTime']['day'],
                    data['timestamp']['DateTime']['hour'],
                    data['timestamp']['DateTime']['minute'],
                    data['timestamp']['DateTime']['second'],
                    data['timestamp']['DateTime']['micro'])
                data['timestamp'] = data['timestamp'].strftime(
                    '%Y-%m-%d %H:%M:%S.%f')
                yield pvalue.TaggedOutput('table_name', data)
                logging.info('this is table_name' + str(data))
        else:
            keys = {}
            for attr in element.attributes['key'][7:-1].split(','):
                key, val = attr.split('=')
                try:
                    keys[key] = int(val)
                except ValueError as ve:
                    keys[key] = val

            keys['publish_time'] = (
                datetime.datetime.utcfromtimestamp(float(publish_time)) +
                datetime.timedelta(hours=8)).strftime("%Y-%m-%d %H:%M:%S.%f")

            if list(keys.keys()) == table_dictionary['keys_table_name']:
                yield pvalue.TaggedOutput('table_name_dbactions', keys)
                logging.info('table_name_dbactions' + str(keys))
Example #7
    def process(self, element):
        events = element['events']
        monitors = element['monitors']
        info = element['info']
        testsuites = element['testsuites']
        useractions = element['useractions']
        campaign = element['campaign']

        for event in events:
            yield event

        for monitor in monitors:
            yield pvalue.TaggedOutput('monitorsPC', monitor)
        for testsuite in testsuites:
            yield pvalue.TaggedOutput('testsuitesPC', testsuite)
        for useraction in useractions:
            yield pvalue.TaggedOutput('useractionsPC', useraction)
        for campaignu in campaign:
            yield pvalue.TaggedOutput('campaignPC', campaignu)
        yield pvalue.TaggedOutput('infoPC', info)
Example #8
    def process(self, serialized_example):
        example = tf.train.Example()
        example.ParseFromString(serialized_example)

        thread_id, = example.features.feature['product_id'].bytes_list.value
        split_value = self._split_value(thread_id)

        split = (
            self.TRAIN_TAG if split_value < self._train_split else
            self.TEST_TAG)
        yield pvalue.TaggedOutput(split, serialized_example)
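
Example #8 routes every record to exactly one of two tags. A self-contained sketch of wiring such a splitter so that each tag lands in its own sink; the DoFn below is a simplified, hypothetical stand-in (stable hash split, local /tmp paths) rather than the class from the example:

import hashlib

import apache_beam as beam
from apache_beam import pvalue


class SplitTrainTest(beam.DoFn):
    TRAIN_TAG = 'train'
    TEST_TAG = 'test'

    def __init__(self, train_fraction=0.8):
        self._train_fraction = train_fraction

    def process(self, element):
        # Stable hash so a given element always lands in the same split.
        bucket = int(hashlib.md5(element).hexdigest(), 16) % 100
        tag = (self.TRAIN_TAG if bucket < self._train_fraction * 100
               else self.TEST_TAG)
        yield pvalue.TaggedOutput(tag, element)


with beam.Pipeline() as p:
    splits = (
        p
        | beam.Create([b'record-%d' % i for i in range(10)])
        | beam.ParDo(SplitTrainTest()).with_outputs(
            SplitTrainTest.TRAIN_TAG, SplitTrainTest.TEST_TAG))
    _ = (splits[SplitTrainTest.TRAIN_TAG]
         | 'WriteTrain' >> beam.io.WriteToTFRecord('/tmp/train'))
    _ = (splits[SplitTrainTest.TEST_TAG]
         | 'WriteTest' >> beam.io.WriteToTFRecord('/tmp/test'))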
Example #9
    def process(self, element):
      records = list(element[1])
      # Split off 2 crops and pre-generate the subgrid.
      # Select the crops with the highest number of possible greenhouses:
      # if two crops with only a single possible greenhouse each were selected,
      # the subgrid would consist of only 1 element.
      best_split = np.argsort([-len(r['transport_costs']) for r in records])[:2]
      rec1 = records[best_split[0]]
      rec2 = records[best_split[1]]

      # Generate & emit all combinations
      for a in rec1['transport_costs']:
        if a[1]:
          for b in rec2['transport_costs']:
            if b[1]:
              combination = [(rec1['crop'], a[0]), (rec2['crop'], b[0])]
              yield pvalue.TaggedOutput('splitted', combination)

      # Pass on remaining records
      remaining = [rec for i, rec in enumerate(records) if i not in best_split]
      yield pvalue.TaggedOutput('combine', remaining)
Example #10
  def process(self, element): # pylint: disable=no-self-use
    try:
      import code_search.utils as utils

      start_time = time.time()
      element['pairs'] = utils.get_function_docstring_pairs(element.pop('content'))
      self.tokenization_time_ms.inc(int((time.time() - start_time) * 1000.0))

      yield element
    except Exception as e: #pylint: disable=broad-except
      logging.warning('Tokenization failed, %s', e)
      yield pvalue.TaggedOutput('err_rows', element)
Example #11
  def process(self, element, *args, **kwargs): # pylint: disable=unused-argument,no-self-use
    try:
      from preprocess.tokenizer import get_function_docstring_pairs

      start_time = time.time()
      element['pairs'] = get_function_docstring_pairs(element.pop('content'))
      self.tokenization_time_ms.inc(int((time.time() - start_time) * 1000.0))

      yield element
    except Exception as e: #pylint: disable=broad-except
      logging.warning('Tokenization failed, %s', e)
      yield pvalue.TaggedOutput('err_rows', element)
Example #12
    def process(self, kmsg):
        tagged_state = _helpers.TaggedStates.DEFAULT
        item = kmsg.data.element.decode("utf-8")

        if self.ping(kmsg):
            self._klio.logger.info("Pass through '%s': Ping mode ON." % item)
            tagged_state = _helpers.TaggedStates.PASS_THRU

        else:
            self._klio.logger.debug("Process '%s': Ping mode OFF." % item)
            tagged_state = _helpers.TaggedStates.PROCESS

        yield pvalue.TaggedOutput(tagged_state.value, kmsg.SerializeToString())
Example #13
    def process(self, element):
        user_event = recommendationengine.UserEvent(element)
        request = recommendationengine.WriteUserEventRequest(
            parent=self.parent, user_event=user_event)

        try:
            created_user_event = self._client.write_user_event(request)
            self.counter.inc()
            yield recommendationengine.UserEvent.to_dict(created_user_event)
        except Exception:
            yield pvalue.TaggedOutput(
                self.FAILED_USER_EVENTS,
                recommendationengine.UserEvent.to_dict(user_event))
Example #14
 def process(self, element):
     '''
     Overriding process method of beam's DoFn class
     element - JSON string
     '''
     # Load the JSON string into a dictionary.
     json_data = json.loads(element)
     # Check whether event_name matches the pre-decided tag.
     if json_data["event_name"] == 'super duper event - 1':
         # Some pre-processing steps here, if needed. Make sure the contents of
         # json_data["payload"] match the row specification of the BigQuery table.
         # Return a generator object that produces tagged PValues.
         yield pvalue.TaggedOutput('super duper',
                                   ast.literal_eval(json_data["payload"]))
Example #15
 def process(self, element, *args, **kwargs):  # pylint: disable=unused-argument
     try:
         info_rows = [
             dict(zip(self.info_keys, pair))
             for pair in element.pop('pairs')
         ]
         info_rows = [
             self.merge_two_dicts(info_dict, element)
             for info_dict in info_rows
         ]
         info_rows = list(map(self.dict_to_unicode, info_rows))  # materialize for Python 3
         yield info_rows
     except:  #pylint: disable=bare-except
         yield pvalue.TaggedOutput('err_rows', element)
Example #16
    def process(self, element):
        user_event = recommendationengine.UserEvent(element)
        request = recommendationengine.PredictRequest(name=self.name,
                                                      user_event=user_event)

        try:
            prediction = self._client.predict(request)
            self.counter.inc()
            yield [
                recommendationengine.PredictResponse.to_dict(p)
                for p in prediction.pages
            ]
        except Exception:
            yield pvalue.TaggedOutput(self.FAILED_PREDICTIONS, user_event)
Example #17
def parse_and_move(path_and_meta):
    import xml.etree.ElementTree as ET
    import re
    import sys
    import apache_beam as beam
    from apache_beam import pvalue
    try:
        path, unprocessed_dir, processed_dir = path_and_meta

        open_file = beam.io.filesystems.FileSystems.open(path)
        content = open_file.read()
        root = ET.fromstring(content)
        root.findall(".")
        item_list = []
        for item in root.findall(".//channel/item"):
            link = item.find('link').text
            title = item.find('title').text
            pubdate = item.find('pubDate').text
            i = {
                "pubdate": pubdate,
                "link": link,
                "title": title
            }
            item_list.append(i)

        dest = re.sub(unprocessed_dir, processed_dir, path) 
        beam.io.filesystems.FileSystems.rename([path], [dest])

        yield pvalue.TaggedOutput('ok', item_list)
        yield item_list

    except Exception as e:
        error_pack = [{"filepath":path,"errormsg":str(e)}]
        yield pvalue.TaggedOutput('fail', error_pack)
        yield error_pack
Example #18
    def process(self, element, *_args, **_kwargs):
        """Get list of Function-Docstring tokens

    This processes each Python file's content
    and returns a list of metadata for each extracted
    pair. These contain the tokenized functions and
    docstrings. In cases where the tokenization fails,
    a side output is returned. All values are unicode
    for serialization.

    Args:
      element: A Python dict of the form,
        {
          "nwo": "STRING",
          "path": "STRING",
          "content": "STRING",
        }

    Yields:
      A Python list of the form,
      [
        {
          "nwo": "STRING",
          "path": "STRING",
          "function_name": "STRING",
          "lineno": "STRING",
          "original_function": "STRING",
          "function_tokens": "STRING",
          "docstring_tokens": "STRING",
        },
        ...
      ]
    """
        try:
            content_blob = element.pop(self.content_key)
            pairs = utils.get_function_docstring_pairs(content_blob)

            result = [
                dict(zip(self.info_keys, pair_tuple), **element)
                for pair_tuple in pairs
            ]

            yield result
        # TODO(jlewi): Can we narrow down the scope covered by swallowing
        # errors? It should really only be the AST parsing code so can
        # we move try/catch into get_function_docstring_pairs?
        except Exception as e:  # pylint: disable=broad-except
            logging.warning('Tokenization failed, %s', e)
            yield pvalue.TaggedOutput('err', element)
Example #19
    def process(self, element, file_prefix):
        destination = element[0]
        row = element[1]

        if destination in self._destination_to_file_writer:
            writer = self._destination_to_file_writer[destination]
        elif len(self._destination_to_file_writer) < self.max_files_per_bundle:
            (file_path,
             writer) = _make_new_file_writer(file_prefix, destination)
            self._destination_to_file_writer[destination] = writer
            yield pvalue.TaggedOutput(WriteRecordsToFile.WRITTEN_FILE_TAG,
                                      (destination, file_path))
        else:
            yield pvalue.TaggedOutput(WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
                                      element)
            return

        # TODO(pabloem): Is it possible for this to throw exception?
        writer.write(self.coder.encode(row))
        writer.write(b'\n')

        if writer.tell() > self.max_file_size:
            writer.close()
            self._destination_to_file_writer.pop(destination)
Example #20
    def process(self, element, *_args, **_kwargs):
        """Get list of Function-Docstring tokens

    This processes each Python file's content
    and returns a list of metadata for each extracted
    pair. These contain the tokenized functions and
    docstrings. In cases where the tokenization fails,
    a side output is returned. All values are unicode
    for serialization.

    Args:
      element: A Python dict of the form,
        {
          "nwo": "STRING",
          "path": "STRING",
          "content": "STRING",
        }

    Yields:
      A Python list of the form,
      [
        {
          "nwo": "STRING",
          "path": "STRING",
          "function_name": "STRING",
          "lineno": "STRING",
          "original_function": "STRING",
          "function_tokens": "STRING",
          "docstring_tokens": "STRING",
        },
        ...
      ]
    """
        try:
            import code_search.dataflow.utils as utils

            content_blob = element.pop(self.content_key)
            pairs = utils.get_function_docstring_pairs(content_blob)

            result = [
                dict(zip(self.info_keys, pair_tuple), **element)
                for pair_tuple in pairs
            ]

            yield result
        except Exception as e:  # pylint: disable=broad-except
            logging.warning('Tokenization failed, %s', e)
            yield pvalue.TaggedOutput('err', element)
Example #21
 def _pickle_dump(ctx, kmsg, ret):
     tagged, tag = False, None
     if isinstance(ret, pvalue.TaggedOutput):
         tagged = True
         tag = ret.tag
         ret = ret.value
     try:
         dumped = pickle.dumps(ret)
         if tagged:
             return pvalue.TaggedOutput(tag, dumped)
         return dumped
     except Exception as err:
         ctx.logger.error(
             "Exception occurred when pickling payload for '%s'.\nError: %s"
             % (kmsg.element, err))
         raise err
Example #22
    def _flush_batch(self, destination):

        # Flush the current batch of rows to BigQuery.
        rows = self._rows_buffer[destination]
        table_reference = bigquery_tools.parse_table_reference(destination)

        if table_reference.projectId is None:
            table_reference.projectId = vp.RuntimeValueProvider.get_value(
                'project', str, '')

        logging.debug('Flushing data to %s. Total %s rows.', destination,
                      len(rows))

        while True:
            # TODO: Figure out an insertId to make calls idempotent.
            passed, errors = self.bigquery_wrapper.insert_rows(
                project_id=table_reference.projectId,
                dataset_id=table_reference.datasetId,
                table_id=table_reference.tableId,
                rows=rows,
                skip_invalid_rows=True)

            logging.debug("Passed: %s. Errors are %s", passed, errors)
            failed_rows = [rows[entry.index] for entry in errors]
            should_retry = any(
                bigquery_tools.RetryStrategy.should_retry(
                    self._retry_strategy, entry.errors[0].reason)
                for entry in errors)
            rows = failed_rows

            if not should_retry:
                break
            else:
                retry_backoff = next(self._backoff_calculator)
                logging.info('Sleeping %s seconds before retrying insertion.',
                             retry_backoff)
                time.sleep(retry_backoff)

        self._total_buffered_rows -= len(self._rows_buffer[destination])
        del self._rows_buffer[destination]

        return [
            pvalue.TaggedOutput(
                BigQueryWriteFn.FAILED_ROWS,
                GlobalWindows.windowed_value((destination, row)))
            for row in failed_rows
        ]
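
The FAILED_ROWS output above is the usual dead-letter pattern: rows that exhaust their retry budget are emitted on a dedicated tag instead of being dropped or failing the bundle, so the pipeline can persist them for inspection or replay. A self-contained sketch of routing such a failure tag to its own sink; the parsing DoFn, tag name, and output path are hypothetical, and only the idea of a failure side output comes from the example:

import json

import apache_beam as beam
from apache_beam import pvalue

FAILED_TAG = 'failed_rows'


class ParseJsonLine(beam.DoFn):
    def process(self, line):
        try:
            yield json.loads(line)
        except ValueError:
            # Emit the raw line on the failure tag so it can be replayed later.
            yield pvalue.TaggedOutput(FAILED_TAG, line)


with beam.Pipeline() as p:
    parsed = (
        p
        | beam.Create(['{"a": 1}', 'not-json'])
        | beam.ParDo(ParseJsonLine()).with_outputs(FAILED_TAG, main='rows'))
    _ = parsed.rows | 'UseRows' >> beam.Map(print)
    _ = (parsed[FAILED_TAG]
         | 'WriteDeadLetter' >> beam.io.WriteToText('/tmp/failed_rows'))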
Example #23
    def process(self, element, load_job_name_prefix):
        destination = element[0]
        files = iter(element[1])

        job_count = 0
        batch_of_files = list(itertools.islice(files, _MAXIMUM_SOURCE_URIS))
        while batch_of_files:

            table_reference = bigquery_tools.parse_table_reference(destination)
            if table_reference.projectId is None:
                table_reference.projectId = vp.RuntimeValueProvider.get_value(
                    'project', str, '')

            # Load jobs for a single destination are always triggered from the same
            # worker. This means that we can generate a deterministic numbered job id
            # and not need to worry about collisions.
            job_name = '%s_%s_%s' % (
                load_job_name_prefix,
                _bq_uuid('%s:%s.%s' %
                         (table_reference.projectId, table_reference.datasetId,
                          table_reference.tableId)), job_count)
            logging.debug("Batch of files has %s files. Job name is %s",
                          len(batch_of_files), job_name)

            if self.temporary_tables:
                # For temporary tables, we create a new table whose name includes the job ID.
                table_reference.tableId = job_name
                yield pvalue.TaggedOutput(TriggerLoadJobs.TEMP_TABLES,
                                          table_reference)

            logging.info(
                "Triggering job %s to load data to BigQuery table %s.",
                job_name, table_reference)
            job_reference = self.bq_wrapper.perform_load_job(
                table_reference,
                batch_of_files,
                job_name,
                schema=self.schema,
                write_disposition=self.write_disposition,
                create_disposition=self.create_disposition)
            yield (destination, job_reference)

            # Prepare to trigger the next job
            job_count += 1
            batch_of_files = list(itertools.islice(files,
                                                   _MAXIMUM_SOURCE_URIS))
Example #24
    def process(self, kmsg):
        item = kmsg.data.element
        item_path = self._get_absolute_path(item)
        item_exists = self.exists(item_path)

        state = DataExistState.FOUND
        if not item_exists:
            state = DataExistState.NOT_FOUND

        self._klio.logger.info("%s %s at %s" % (
            self.DIRECTION_PFX.value.title(),
            DataExistState.to_str(state),
            item_path,
        ))

        # double tag for easier user interface, i.e. pcoll.found vs pcoll.true
        yield pvalue.TaggedOutput(state.value, kmsg.SerializeToString())
Example #25
    def process(self, element):

        user_events = [recommendationengine.UserEvent(e) for e in element[1]]
        user_event_inline_source = recommendationengine.UserEventInlineSource(
            {"user_events": user_events})
        input_config = recommendationengine.InputConfig(
            user_event_inline_source=user_event_inline_source)

        request = recommendationengine.ImportUserEventsRequest(
            parent=self.parent, input_config=input_config)

        try:
            operation = self._client.import_user_events(request)  # matches the ImportUserEventsRequest built above
            self.counter.inc(len(user_events))
            yield recommendationengine.PredictResponse.to_dict(
                operation.result())
        except Exception:
            yield pvalue.TaggedOutput(self.FAILED_USER_EVENTS, user_events)
Example #26
 def _numpy_dump(ctx, kmsg, ret):
     tagged, tag = False, None
     if isinstance(ret, pvalue.TaggedOutput):
         tagged = True
         tag = ret.tag
         ret = ret.value
     try:
         out = io.BytesIO()
         np.save(out, ret)
         dumped = out.getvalue()  # returns the data in `out` in bytes
         if tagged:
             return pvalue.TaggedOutput(tag, dumped)
         return dumped
     except Exception as err:
         ctx.logger.error(
             "Exception occurred when dumping numpy payload for '%s'.\n"
             "Error: %s" % (kmsg.element, err))
         raise err
Example #27
    def process(self, kmsg):
        tagged_state = _helpers.TaggedStates.DEFAULT
        item_path = self._get_absolute_path(kmsg.data.element)
        item = kmsg.data.element.decode("utf-8")

        if not self.force(kmsg):
            self._klio.logger.info(
                "Pass through '%s': Force mode OFF with output found at '%s'."
                % (item, item_path))
            tagged_state = _helpers.TaggedStates.PASS_THRU

        else:
            self._klio.logger.info(
                "Process '%s': Force mode ON with output found at '%s'." %
                (item, item_path))
            tagged_state = _helpers.TaggedStates.PROCESS

        yield pvalue.TaggedOutput(tagged_state.value, kmsg.SerializeToString())
Example #28
    def process(self, element):
        catalog_item = recommendationengine.CatalogItem(element)
        request = recommendationengine.CreateCatalogItemRequest(
            parent=self.parent, catalog_item=catalog_item)

        try:
            created_catalog_item = self._client.create_catalog_item(
                request=request,
                retry=self.retry,
                timeout=self.timeout,
                metadata=self.metadata)

            self.counter.inc()
            yield recommendationengine.CatalogItem.to_dict(
                created_catalog_item)
        except Exception:
            yield pvalue.TaggedOutput(
                FAILED_CATALOG_ITEMS,
                recommendationengine.CatalogItem.to_dict(catalog_item))
Example #29
def __from_klio_message_generator(self, kmsg, payload, orig_item):
    try:
        yield serializer.from_klio_message(kmsg, payload)

    except Exception as err:
        self._klio.logger.error(_ERROR_MSG_KMSG_TO_BYTES.format(kmsg, err),
                                exc_info=True)
        # Since the yielded value in the `try` clause may not be tagged, that
        # one will be used by default by whatever executed this function,
        # and anything that has a tagged output value (like this dropped one)
        # will just be ignored, which is fine for dropped values.
        # But if the calling function wanted to, it could access this via
        # pcoll.drop.
        # We won't try to serialize kmsg to bytes since something already
        # went wrong.
        yield pvalue.TaggedOutput("drop", orig_item)
        # Explicitly return so that Beam doesn't call `next` and execute the
        # next `yield`.
        return
Example #30
    def process(self, element):
        catalog_items = [
            recommendationengine.CatalogItem(e) for e in element[1]
        ]
        catalog_inline_source = recommendationengine.CatalogInlineSource(
            {"catalog_items": catalog_items})
        input_config = recommendationengine.InputConfig(
            catalog_inline_source=catalog_inline_source)

        request = recommendationengine.ImportCatalogItemsRequest(
            parent=self.parent, input_config=input_config)

        try:
            operation = self._client.import_catalog_items(
                request=request,
                retry=self.retry,
                timeout=self.timeout,
                metadata=self.metadata)
            self.counter.inc(len(catalog_items))
            yield operation.result()
        except Exception:
            yield pvalue.TaggedOutput(FAILED_CATALOG_ITEMS, catalog_items)