def process(self, element, load_job_name_prefix, *schema_side_inputs):
  # Each load job is assumed to have files respecting these constraints:
  # 1. Total size of all files < 15 TB (Max size for load jobs)
  # 2. Total no. of files in a single load job < 10,000
  # This assumption means that there will always be a single load job
  # triggered for each partition of files.
  destination = element[0]
  files = element[1]

  if callable(self.schema):
    schema = self.schema(destination, *schema_side_inputs)
  elif isinstance(self.schema, vp.ValueProvider):
    schema = self.schema.get()
  else:
    schema = self.schema

  if callable(self.additional_bq_parameters):
    additional_parameters = self.additional_bq_parameters(destination)
  elif isinstance(self.additional_bq_parameters, vp.ValueProvider):
    additional_parameters = self.additional_bq_parameters.get()
  else:
    additional_parameters = self.additional_bq_parameters

  table_reference = bigquery_tools.parse_table_reference(destination)
  if table_reference.projectId is None:
    table_reference.projectId = vp.RuntimeValueProvider.get_value(
        'project', str, '')
  # Load jobs for a single destination are always triggered from the same
  # worker. This means that we can generate a deterministic numbered job id,
  # and not need to worry.
  destination_hash = _bq_uuid(
      '%s:%s.%s' % (table_reference.projectId,
                    table_reference.datasetId,
                    table_reference.tableId))
  uid = _bq_uuid()
  job_name = '%s_%s_%s' % (load_job_name_prefix, destination_hash, uid)
  logging.debug('Load job has %s files. Job name is %s.',
                len(files), job_name)

  if self.temporary_tables:
    # For temporary tables, we create a new table with the name with JobId.
    table_reference.tableId = job_name
    yield pvalue.TaggedOutput(TriggerLoadJobs.TEMP_TABLES, table_reference)

  logging.info(
      'Triggering job %s to load data to BigQuery table %s.'
      'Schema: %s. Additional parameters: %s',
      job_name, table_reference, schema, additional_parameters)
  job_reference = self.bq_wrapper.perform_load_job(
      table_reference,
      files,
      job_name,
      schema=schema,
      write_disposition=self.write_disposition,
      create_disposition=self.create_disposition,
      additional_load_parameters=additional_parameters)
  yield (destination, job_reference)
def process(self, element, load_job_name_prefix, *schema_side_inputs):
  destination = element[0]
  files = iter(element[1])

  if callable(self.schema):
    schema = self.schema(destination, *schema_side_inputs)
  elif isinstance(self.schema, vp.ValueProvider):
    schema = self.schema.get()
  else:
    schema = self.schema

  if callable(self.additional_bq_parameters):
    additional_parameters = self.additional_bq_parameters(destination)
  elif isinstance(self.additional_bq_parameters, vp.ValueProvider):
    additional_parameters = self.additional_bq_parameters.get()
  else:
    additional_parameters = self.additional_bq_parameters

  batch_of_files = list(itertools.islice(files, _MAXIMUM_SOURCE_URIS))
  while batch_of_files:
    table_reference = bigquery_tools.parse_table_reference(destination)
    if table_reference.projectId is None:
      table_reference.projectId = vp.RuntimeValueProvider.get_value(
          'project', str, '')
    # Load jobs for a single destination are always triggered from the same
    # worker. This means that we can generate a deterministic numbered job id,
    # and not need to worry.
    destination_hash = _bq_uuid(
        '%s:%s.%s' % (table_reference.projectId,
                      table_reference.datasetId,
                      table_reference.tableId))
    timestamp = int(time.time())
    job_name = '%s_%s_%s' % (
        load_job_name_prefix, destination_hash, timestamp)
    logging.debug('Batch of files has %s files. Job name is %s.',
                  len(batch_of_files), job_name)

    if self.temporary_tables:
      # For temporary tables, we create a new table with the name with JobId.
      table_reference.tableId = job_name
      yield pvalue.TaggedOutput(TriggerLoadJobs.TEMP_TABLES, table_reference)

    logging.info('Triggering job %s to load data to BigQuery table %s.'
                 'Schema: %s. Additional parameters: %s',
                 job_name, table_reference, schema, additional_parameters)
    job_reference = self.bq_wrapper.perform_load_job(
        table_reference,
        batch_of_files,
        job_name,
        schema=schema,
        write_disposition=self.write_disposition,
        create_disposition=self.create_disposition,
        additional_load_parameters=additional_parameters)
    yield (destination, job_reference)

    # Prepare to trigger the next job
    batch_of_files = list(itertools.islice(files, _MAXIMUM_SOURCE_URIS))
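The snippets above and below only show the producing side of `pvalue.TaggedOutput`. For context, here is a minimal, self-contained sketch (not taken from any of these snippets; the `SplitEvenOdd` DoFn and its tags are made-up placeholders) of how a pipeline typically consumes a multi-output DoFn with `.with_outputs()`:

import apache_beam as beam
from apache_beam import pvalue


class SplitEvenOdd(beam.DoFn):
    """Illustrative DoFn: routes odd numbers to a tagged side output."""
    ODD_TAG = 'odd'

    def process(self, element):
        if element % 2:
            yield pvalue.TaggedOutput(self.ODD_TAG, element)
        else:
            yield element  # untagged values go to the main output


with beam.Pipeline() as pipeline:
    results = (
        pipeline
        | beam.Create([1, 2, 3, 4])
        | beam.ParDo(SplitEvenOdd()).with_outputs(
            SplitEvenOdd.ODD_TAG, main='even'))
    evens = results.even                  # main output, accessed by its declared name
    odds = results[SplitEvenOdd.ODD_TAG]  # side output, accessed by its tag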
def finish_bundle(self):
  for destination, file_path_writer in \
      iteritems(self._destination_to_file_writer):
    (file_path, writer) = file_path_writer
    file_size = writer.tell()
    writer.close()
    yield pvalue.TaggedOutput(WriteRecordsToFile.WRITTEN_FILE_TAG,
                              GlobalWindows.windowed_value(
                                  (destination, (file_path, file_size))))
  self._destination_to_file_writer = {}
def process(self, element, *args, **kwargs):
    try:
        row = json.loads(element)  # JSON input is decoded as UTF-8 by default
        yield self.parse_row(row)
    except (TypeError, ValueError) as e:
        # Route malformed records to a side output instead of failing the bundle.
        yield pvalue.TaggedOutput(self.TAG_BROKEN_DATA, {
            Field.Element: element,
            Field.Error: str(e)
        })
        self.broken_data_counter.inc()
def process(self, element: str):
    self.input_records_counter.inc()
    # We have two outputs: one for well-formed input lines, and another one
    # with potential parsing errors (the parsing error output will be written
    # to a different BigQuery table).
    try:
        # Ignore the header row.
        if element != self._header_line:
            record: Record = data_classes.line2record(element)
            self.correct_records_counter.inc()
            yield pvalue.TaggedOutput(ParseCSVDoFn.CORRECT_OUTPUT_TAG, record)
    except TypeError as err:
        self.wrong_records_counter.inc()
        msg = str(err)
        yield pvalue.TaggedOutput(ParseCSVDoFn.WRONG_OUTPUT_TAG, {
            'error': msg,
            'line': element
        })
def process(self, element, publish_time=beam.DoFn.TimestampParam,
            table_dictionary=table_dictionary, *arg, **kwargs):
    if (element.data is not None and element.data != b'' and
            element.data != "b''"):
        data = json.loads(element.data)
        data['publish_time'] = (
            datetime.datetime.utcfromtimestamp(float(publish_time)) +
            datetime.timedelta(hours=8)).strftime("%Y-%m-%d %H:%M:%S.%f")
        if list(data.keys()) == table_dictionary['columns_table_name']:
            data['timestamp'] = datetime.datetime(
                data['timestamp']['DateTime']['year'],
                data['timestamp']['DateTime']['month'],
                data['timestamp']['DateTime']['day'],
                data['timestamp']['DateTime']['hour'],
                data['timestamp']['DateTime']['minute'],
                data['timestamp']['DateTime']['second'],
                data['timestamp']['DateTime']['micro'])
            data['timestamp'] = data['timestamp'].strftime(
                '%Y-%m-%d %H:%M:%S.%f')
            yield pvalue.TaggedOutput('table_name', data)
            logging.info('this is table_name' + str(data))
    else:
        # Message has no data payload: rebuild the keys from the attributes.
        keys = {}
        for attr in element.attributes['key'][7:-1].split(','):
            key, val = attr.split('=')
            try:
                keys[key] = int(val)
            except ValueError:
                keys[key] = val
        keys['publish_time'] = (
            datetime.datetime.utcfromtimestamp(float(publish_time)) +
            datetime.timedelta(hours=8)).strftime("%Y-%m-%d %H:%M:%S.%f")
        if list(keys.keys()) == table_dictionary['keys_table_name']:
            yield pvalue.TaggedOutput('table_name_dbactions', keys)
            logging.info('table_name_dbactions' + str(keys))
def process(self, element):
    events = element['events']
    monitors = element['monitors']
    info = element['info']
    testsuites = element['testsuites']
    useractions = element['useractions']
    campaign = element['campaign']

    for event in events:
        yield event
    for monitor in monitors:
        yield pvalue.TaggedOutput('monitorsPC', monitor)
    for testsuite in testsuites:
        yield pvalue.TaggedOutput('testsuitesPC', testsuite)
    for useraction in useractions:
        yield pvalue.TaggedOutput('useractionsPC', useraction)
    for campaignu in campaign:
        yield pvalue.TaggedOutput('campaignPC', campaignu)
    yield pvalue.TaggedOutput('infoPC', info)
def process(self, serialized_example):
  example = tf.train.Example()
  example.ParseFromString(serialized_example)
  thread_id, = example.features.feature['product_id'].bytes_list.value
  split_value = self._split_value(thread_id)
  split = (
      self.TRAIN_TAG if split_value < self._train_split else self.TEST_TAG)
  yield pvalue.TaggedOutput(split, serialized_example)
def process(self, element):
    records = list(element[1])

    # Split off 2 crops and pre-generate the subgrid.
    # Select the crops with the highest number of possible greenhouses:
    # in case two crops with only a single possible greenhouse were selected,
    # the subgrid would consist of only 1 element.
    best_split = np.argsort([-len(r['transport_costs']) for r in records])[:2]
    rec1 = records[best_split[0]]
    rec2 = records[best_split[1]]

    # Generate & emit all combinations
    for a in rec1['transport_costs']:
        if a[1]:
            for b in rec2['transport_costs']:
                if b[1]:
                    combination = [(rec1['crop'], a[0]), (rec2['crop'], b[0])]
                    yield pvalue.TaggedOutput('splitted', combination)

    # Pass on remaining records
    remaining = [rec for i, rec in enumerate(records) if i not in best_split]
    yield pvalue.TaggedOutput('combine', remaining)
def process(self, element):  # pylint: disable=no-self-use
  try:
    import code_search.utils as utils

    start_time = time.time()
    element['pairs'] = utils.get_function_docstring_pairs(
        element.pop('content'))
    self.tokenization_time_ms.inc(int((time.time() - start_time) * 1000.0))
    yield element
  except Exception as e:  # pylint: disable=broad-except
    logging.warning('Tokenization failed, %s', str(e))
    yield pvalue.TaggedOutput('err_rows', element)
def process(self, element, *args, **kwargs):  # pylint: disable=unused-argument,no-self-use
  try:
    from preprocess.tokenizer import get_function_docstring_pairs

    start_time = time.time()
    element['pairs'] = get_function_docstring_pairs(element.pop('content'))
    self.tokenization_time_ms.inc(int((time.time() - start_time) * 1000.0))
    yield element
  except Exception as e:  # pylint: disable=broad-except
    logging.warning('Tokenization failed, %s', str(e))
    yield pvalue.TaggedOutput('err_rows', element)
def process(self, kmsg):
    tagged_state = _helpers.TaggedStates.DEFAULT
    item = kmsg.data.element.decode("utf-8")

    if self.ping(kmsg):
        self._klio.logger.info("Pass through '%s': Ping mode ON." % item)
        tagged_state = _helpers.TaggedStates.PASS_THRU
    else:
        self._klio.logger.debug("Process '%s': Ping mode OFF." % item)
        tagged_state = _helpers.TaggedStates.PROCESS

    yield pvalue.TaggedOutput(tagged_state.value, kmsg.SerializeToString())
def process(self, element):
  user_event = recommendationengine.UserEvent(element)
  request = recommendationengine.WriteUserEventRequest(
      parent=self.parent, user_event=user_event)

  try:
    created_user_event = self._client.write_user_event(request)
    self.counter.inc()
    yield recommendationengine.UserEvent.to_dict(created_user_event)
  except Exception:
    yield pvalue.TaggedOutput(
        self.FAILED_USER_EVENTS,
        recommendationengine.UserEvent.to_dict(user_event))
def process(self, element):
    '''
    Overriding process method of beam's DoFn class
    element - JSON string
    '''
    json_data = json.loads(element)  # loading JSON string into a dictionary

    # Check if event_name matches the pre-decided tag.
    if json_data["event_name"] == 'super duper event - 1':
        # Some pre-processing steps here, if needed; make sure the contents of
        # json_data["payload"] match the row specification of the BigQuery table.
        yield pvalue.TaggedOutput(
            'super duper', ast.literal_eval(json_data["payload"])
        )  # return a generator object that produces tagged PValues.
def process(self, element, *args, **kwargs):  # pylint: disable=unused-argument
  try:
    info_rows = [
        dict(zip(self.info_keys, pair)) for pair in element.pop('pairs')
    ]
    info_rows = [
        self.merge_two_dicts(info_dict, element) for info_dict in info_rows
    ]
    info_rows = map(self.dict_to_unicode, info_rows)
    yield info_rows
  except:  # pylint: disable=bare-except
    yield pvalue.TaggedOutput('err_rows', element)
def process(self, element):
  user_event = recommendationengine.UserEvent(element)
  request = recommendationengine.PredictRequest(
      name=self.name, user_event=user_event)

  try:
    prediction = self._client.predict(request)
    self.counter.inc()
    yield [
        recommendationengine.PredictResponse.to_dict(p)
        for p in prediction.pages
    ]
  except Exception:
    yield pvalue.TaggedOutput(self.FAILED_PREDICTIONS, user_event)
def parse_and_move(path_and_meta):
    import xml.etree.ElementTree as ET
    import re
    import apache_beam as beam
    from apache_beam import pvalue

    try:
        path, unprocessed_dir, processed_dir = path_and_meta

        open_file = beam.io.filesystems.FileSystems.open(path)
        content = open_file.read()
        root = ET.fromstring(content)
        root.findall(".")

        item_list = []
        for item in root.findall(".//channel/item"):
            link = item.find('link').text
            title = item.find('title').text
            pubdate = item.find('pubDate').text
            i = {"pubdate": pubdate, "link": link, "title": title}
            item_list.append(i)

        # Move the processed file out of the unprocessed directory.
        dest = re.sub(unprocessed_dir, processed_dir, path)
        beam.io.filesystems.FileSystems.rename([path], [dest])
        yield pvalue.TaggedOutput('ok', item_list)
        yield item_list
    except Exception as e:
        error_pack = [{"filepath": path, "errormsg": str(e)}]
        yield pvalue.TaggedOutput('fail', error_pack)
        yield error_pack
def process(self, element, *_args, **_kwargs):
  """Get list of Function-Docstring tokens.

  This processes each Python file's content and returns a list of metadata
  for each extracted pair. These contain the tokenized functions and
  docstrings. In cases where the tokenization fails, a side output is
  returned. All values are unicode for serialization.

  Args:
    element: A Python dict of the form,
      {
        "nwo": "STRING",
        "path": "STRING",
        "content": "STRING",
      }

  Yields:
    A Python list of the form,
      [
        {
          "nwo": "STRING",
          "path": "STRING",
          "function_name": "STRING",
          "lineno": "STRING",
          "original_function": "STRING",
          "function_tokens": "STRING",
          "docstring_tokens": "STRING",
        },
        ...
      ]
  """
  try:
    content_blob = element.pop(self.content_key)
    pairs = utils.get_function_docstring_pairs(content_blob)
    result = [
        dict(zip(self.info_keys, pair_tuple), **element)
        for pair_tuple in pairs
    ]
    yield result
  # TODO(jlewi): Can we narrow down the scope covered by swallowing
  # errors? It should really only be the AST parsing code so can
  # we move try/catch into get_function_docstring_pairs?
  except Exception as e:  # pylint: disable=broad-except
    logging.warning('Tokenization failed, %s', str(e))
    yield pvalue.TaggedOutput('err', element)
def process(self, element, file_prefix):
  destination = element[0]
  row = element[1]

  if destination in self._destination_to_file_writer:
    writer = self._destination_to_file_writer[destination]
  elif len(self._destination_to_file_writer) < self.max_files_per_bundle:
    (file_path, writer) = _make_new_file_writer(file_prefix, destination)
    self._destination_to_file_writer[destination] = writer
    yield pvalue.TaggedOutput(WriteRecordsToFile.WRITTEN_FILE_TAG,
                              (destination, file_path))
  else:
    yield pvalue.TaggedOutput(WriteRecordsToFile.UNWRITTEN_RECORD_TAG, element)
    return

  # TODO(pabloem): Is it possible for this to throw exception?
  writer.write(self.coder.encode(row))
  writer.write(b'\n')

  if writer.tell() > self.max_file_size:
    writer.close()
    self._destination_to_file_writer.pop(destination)
def process(self, element, *_args, **_kwargs):
  """Get list of Function-Docstring tokens.

  This processes each Python file's content and returns a list of metadata
  for each extracted pair. These contain the tokenized functions and
  docstrings. In cases where the tokenization fails, a side output is
  returned. All values are unicode for serialization.

  Args:
    element: A Python dict of the form,
      {
        "nwo": "STRING",
        "path": "STRING",
        "content": "STRING",
      }

  Yields:
    A Python list of the form,
      [
        {
          "nwo": "STRING",
          "path": "STRING",
          "function_name": "STRING",
          "lineno": "STRING",
          "original_function": "STRING",
          "function_tokens": "STRING",
          "docstring_tokens": "STRING",
        },
        ...
      ]
  """
  try:
    import code_search.dataflow.utils as utils

    content_blob = element.pop(self.content_key)
    pairs = utils.get_function_docstring_pairs(content_blob)
    result = [
        dict(zip(self.info_keys, pair_tuple), **element)
        for pair_tuple in pairs
    ]
    yield result
  except Exception as e:  # pylint: disable=broad-except
    logging.warning('Tokenization failed, %s', str(e))
    yield pvalue.TaggedOutput('err', element)
def _pickle_dump(ctx, kmsg, ret):
    tagged, tag = False, None
    if isinstance(ret, pvalue.TaggedOutput):
        tagged = True
        tag = ret.tag
        ret = ret.value

    try:
        dumped = pickle.dumps(ret)
        if tagged:
            return pvalue.TaggedOutput(tag, dumped)
        return dumped
    except Exception as err:
        ctx.logger.error(
            "Exception occurred when pickling payload for '%s'.\nError: %s"
            % (kmsg.element, err))
        raise err
def _flush_batch(self, destination):
  # Flush the current batch of rows to BigQuery.
  rows = self._rows_buffer[destination]
  table_reference = bigquery_tools.parse_table_reference(destination)

  if table_reference.projectId is None:
    table_reference.projectId = vp.RuntimeValueProvider.get_value(
        'project', str, '')

  logging.debug('Flushing data to %s. Total %s rows.',
                destination, len(rows))

  while True:
    # TODO: Figure out an insertId to make calls idempotent.
    passed, errors = self.bigquery_wrapper.insert_rows(
        project_id=table_reference.projectId,
        dataset_id=table_reference.datasetId,
        table_id=table_reference.tableId,
        rows=rows,
        skip_invalid_rows=True)

    logging.debug("Passed: %s. Errors are %s", passed, errors)
    failed_rows = [rows[entry.index] for entry in errors]
    should_retry = any(
        bigquery_tools.RetryStrategy.should_retry(
            self._retry_strategy, entry.errors[0].reason)
        for entry in errors)
    rows = failed_rows

    if not should_retry:
      break
    else:
      retry_backoff = next(self._backoff_calculator)
      logging.info('Sleeping %s seconds before retrying insertion.',
                   retry_backoff)
      time.sleep(retry_backoff)

  self._total_buffered_rows -= len(self._rows_buffer[destination])
  del self._rows_buffer[destination]

  return [
      pvalue.TaggedOutput(
          BigQueryWriteFn.FAILED_ROWS,
          GlobalWindows.windowed_value((destination, row)))
      for row in failed_rows
  ]
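The retry loop above pulls sleep intervals from `self._backoff_calculator`, which the snippet does not construct. A minimal sketch, assuming the iterator is built with Beam's `apache_beam.utils.retry.FuzzedExponentialIntervals` helper (the parameter values here are illustrative, not taken from the snippet):

from apache_beam.utils import retry

# Illustrative values; the real DoFn would typically set this up in start_bundle().
_backoff_calculator = iter(
    retry.FuzzedExponentialIntervals(
        initial_delay_secs=0.2,  # first sleep before the first retry
        num_retries=10000))      # effectively retry until should_retry is False

# Each next() call yields the next jittered, exponentially growing delay.
retry_backoff = next(_backoff_calculator)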
def process(self, element, load_job_name_prefix):
  destination = element[0]
  files = iter(element[1])

  job_count = 0
  batch_of_files = list(itertools.islice(files, _MAXIMUM_SOURCE_URIS))

  while batch_of_files:
    table_reference = bigquery_tools.parse_table_reference(destination)
    if table_reference.projectId is None:
      table_reference.projectId = vp.RuntimeValueProvider.get_value(
          'project', str, '')

    # Load jobs for a single destination are always triggered from the same
    # worker. This means that we can generate a deterministic numbered job id,
    # and not need to worry.
    job_name = '%s_%s_%s' % (
        load_job_name_prefix,
        _bq_uuid('%s:%s.%s' % (table_reference.projectId,
                               table_reference.datasetId,
                               table_reference.tableId)),
        job_count)
    logging.debug("Batch of files has %s files. Job name is %s",
                  len(batch_of_files), job_name)

    if self.temporary_tables:
      # For temporary tables, we create a new table with the name with JobId.
      table_reference.tableId = job_name
      yield pvalue.TaggedOutput(TriggerLoadJobs.TEMP_TABLES, table_reference)

    logging.info("Triggering job %s to load data to BigQuery table %s.",
                 job_name, table_reference)
    job_reference = self.bq_wrapper.perform_load_job(
        table_reference,
        batch_of_files,
        job_name,
        schema=self.schema,
        write_disposition=self.write_disposition,
        create_disposition=self.create_disposition)
    yield (destination, job_reference)

    # Prepare to trigger the next job
    job_count += 1
    batch_of_files = list(itertools.islice(files, _MAXIMUM_SOURCE_URIS))
def process(self, kmsg):
    item = kmsg.data.element
    item_path = self._get_absolute_path(item)
    item_exists = self.exists(item_path)

    state = DataExistState.FOUND
    if not item_exists:
        state = DataExistState.NOT_FOUND

    self._klio.logger.info("%s %s at %s" % (
        self.DIRECTION_PFX.value.title(),
        DataExistState.to_str(state),
        item_path,
    ))

    # double tag for easier user interface, i.e. pcoll.found vs pcoll.true
    yield pvalue.TaggedOutput(state.value, kmsg.SerializeToString())
def process(self, element):
  user_events = [recommendationengine.UserEvent(e) for e in element[1]]
  user_event_inline_source = recommendationengine.UserEventInlineSource(
      {"user_events": user_events})
  input_config = recommendationengine.InputConfig(
      user_event_inline_source=user_event_inline_source)

  request = recommendationengine.ImportUserEventsRequest(
      parent=self.parent, input_config=input_config)

  try:
    operation = self._client.write_user_event(request)
    self.counter.inc(len(user_events))
    yield recommendationengine.PredictResponse.to_dict(operation.result())
  except Exception:
    yield pvalue.TaggedOutput(self.FAILED_USER_EVENTS, user_events)
def _numpy_dump(ctx, kmsg, ret):
    tagged, tag = False, None
    if isinstance(ret, pvalue.TaggedOutput):
        tagged = True
        tag = ret.tag
        ret = ret.value

    try:
        out = io.BytesIO()
        np.save(out, ret)
        dumped = out.getvalue()  # returns the data in `out` in bytes

        if tagged:
            return pvalue.TaggedOutput(tag, dumped)
        return dumped
    except Exception as err:
        ctx.logger.error(
            "Exception occurred when dumping numpy payload for '%s'.\n"
            "Error: %s" % (kmsg.element, err))
        raise err
def process(self, kmsg):
    tagged_state = _helpers.TaggedStates.DEFAULT
    item_path = self._get_absolute_path(kmsg.data.element)
    item = kmsg.data.element.decode("utf-8")

    if not self.force(kmsg):
        self._klio.logger.info(
            "Pass through '%s': Force mode OFF with output found at '%s'."
            % (item, item_path))
        tagged_state = _helpers.TaggedStates.PASS_THRU
    else:
        self._klio.logger.info(
            "Process '%s': Force mode ON with output found at '%s'."
            % (item, item_path))
        tagged_state = _helpers.TaggedStates.PROCESS

    yield pvalue.TaggedOutput(tagged_state.value, kmsg.SerializeToString())
def process(self, element):
  catalog_item = recommendationengine.CatalogItem(element)
  request = recommendationengine.CreateCatalogItemRequest(
      parent=self.parent, catalog_item=catalog_item)

  try:
    created_catalog_item = self._client.create_catalog_item(
        request=request,
        retry=self.retry,
        timeout=self.timeout,
        metadata=self.metadata)
    self.counter.inc()
    yield recommendationengine.CatalogItem.to_dict(created_catalog_item)
  except Exception:
    yield pvalue.TaggedOutput(
        FAILED_CATALOG_ITEMS,
        recommendationengine.CatalogItem.to_dict(catalog_item))
def __from_klio_message_generator(self, kmsg, payload, orig_item):
    try:
        yield serializer.from_klio_message(kmsg, payload)
    except Exception as err:
        self._klio.logger.error(
            _ERROR_MSG_KMSG_TO_BYTES.format(kmsg, err), exc_info=True)
        # Since the yielded value in the `try` clause may not be tagged, that
        # one will be used by default by whatever executed this function,
        # and anything that has a tagged output value (like this dropped one)
        # will just be ignored, which is fine for dropped values.
        # But if the caller function wanted to, they could access this via
        # pcoll.drop.
        # We won't try to serialize kmsg to bytes since something already
        # went wrong.
        yield pvalue.TaggedOutput("drop", orig_item)
        # explicitly return so that Beam doesn't call `next` and
        # execute the next `yield`
        return
def process(self, element):
  catalog_items = [recommendationengine.CatalogItem(e) for e in element[1]]
  catalog_inline_source = recommendationengine.CatalogInlineSource(
      {"catalog_items": catalog_items})
  input_config = recommendationengine.InputConfig(
      catalog_inline_source=catalog_inline_source)

  request = recommendationengine.ImportCatalogItemsRequest(
      parent=self.parent, input_config=input_config)

  try:
    operation = self._client.import_catalog_items(
        request=request,
        retry=self.retry,
        timeout=self.timeout,
        metadata=self.metadata)
    self.counter.inc(len(catalog_items))
    yield operation.result()
  except Exception:
    yield pvalue.TaggedOutput(FAILED_CATALOG_ITEMS, catalog_items)