def delete_from_datastore(project, pipeline_options, run_locally):
    """Creates a pipeline that finds stale PRDebugAttendee/PRCityCategory entities
    in Cloud Datastore and deletes them."""
    p = beam.Pipeline(options=pipeline_options)
    # Create a query to read entities from datastore.
    client = datastore.Client()
    if run_locally:
        pass
        # q.add_filter('category', '=', 'BEBOP')

    q = client.query(kind='PRDebugAttendee')
    q.order = ['-created_date']
    results = list(q.fetch(1))
    if not results:
        logging.error('No PRDebugAttendee objects found')
        return
    newest_date = results[0]['created_date']
    logging.info('Deleting elements older than %s', newest_date)

    q1 = client.query(kind='PRDebugAttendee')
    q2 = client.query(kind='PRCityCategory')
    datastore_1 = p | 'read PRDebugAttendee from datastore' >> ReadFromDatastore(
        project, query._pb_from_query(q1), num_splits=400)
    datastore_2 = p | 'read PRCityCategory from datastore' >> ReadFromDatastore(
        project, query._pb_from_query(q2), num_splits=400)

    # Set up our map/reduce pipeline
    output = (
        (datastore_1, datastore_2) | beam.Flatten()
        | 'convert to entity' >> beam.Map(ConvertToEntity)
        # Find the events we want to count, and expand all the admins/attendees
        | 'find old rankings' >> beam.FlatMap(OldPRRecord, newest_date)
        # And save it all back to the database
    )
    if not run_locally:
        output | 'delete from datastore' >> beam.ParDo(DeleteFromDatastore())
    """
    (output
        | 'convert from entity' >> beam.Map(ConvertFromEntity)
        | 'write to datastore' >> WriteToDatastore(client.project)
    )
    """

    # Actually run the pipeline (all operations above are deferred).
    result = p.run()
    # Wait until completion, so the main thread can access post-completion job results.
    result.wait_until_finish()
    return result
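# ConvertToEntity, OldPRRecord, and the DeleteFromDatastore DoFn used above are
# project-specific helpers that are not shown here. A minimal sketch of what
# ConvertToEntity might do, assuming ReadFromDatastore yields entity protobufs and
# google-cloud-datastore's helpers are available (illustrative, not the original
# implementation):

from google.cloud.datastore import helpers as datastore_helpers

def ConvertToEntity(entity_pb):
    # Turn a Datastore protobuf entity into a google.cloud.datastore.Entity.
    return datastore_helpers.entity_from_protobuf(entity_pb)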
def check_estimated_size_bytes(self, entity_bytes, timestamp, namespace=None):
    """A helper method to test get_estimated_size_bytes."""
    timestamp_req = helper.make_request(
        self._PROJECT, namespace, helper.make_latest_timestamp_query(namespace))
    timestamp_resp = self.make_stats_response(
        {'timestamp': datastore_helper.from_timestamp(timestamp)})
    kind_stat_req = helper.make_request(
        self._PROJECT, namespace, helper.make_kind_stats_query(
            namespace, self._query.kind[0].name,
            datastore_helper.micros_from_timestamp(timestamp)))
    kind_stat_resp = self.make_stats_response(
        {'entity_bytes': entity_bytes})

    def fake_run_query(req):
        if req == timestamp_req:
            return timestamp_resp
        elif req == kind_stat_req:
            return kind_stat_resp
        else:
            raise ValueError("Unknown req: %s" % req)

    self._mock_datastore.run_query.side_effect = fake_run_query

    self.assertEqual(entity_bytes,
                     ReadFromDatastore.get_estimated_size_bytes(
                         self._PROJECT, namespace, self._query,
                         self._mock_datastore))
    self.assertEqual(self._mock_datastore.run_query.call_args_list,
                     [call(timestamp_req), call(kind_stat_req)])
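# make_stats_response is a fixture helper on this test class. A plausible sketch,
# assuming the v1 datastore_pb2 protos and the googledatastore helper module used
# elsewhere in these tests (an illustrative reconstruction, not necessarily the
# original):

def make_stats_response(self, property_map):
    # Wrap a property map in a RunQueryResponse, shaped like a __Stat_* query result.
    resp = datastore_pb2.RunQueryResponse()
    entity_result = resp.batch.entity_results.add()
    datastore_helper.add_properties(entity_result.entity, property_map)
    return resp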
def test_SplitQueryFn_without_num_splits(self):
    with patch.object(helper, 'get_datastore',
                      return_value=self._mock_datastore):
        # Force SplitQueryFn to compute the number of query splits
        num_splits = 0
        expected_num_splits = 23
        entity_bytes = (expected_num_splits *
                        ReadFromDatastore._DEFAULT_BUNDLE_SIZE_BYTES)
        with patch.object(ReadFromDatastore, 'get_estimated_size_bytes',
                          return_value=entity_bytes):

            def fake_get_splits(datastore, query, num_splits, partition=None):
                return self.split_query(query, num_splits)

            with patch.object(query_splitter, 'get_splits',
                              side_effect=fake_get_splits):
                split_query_fn = ReadFromDatastore.SplitQueryFn(
                    self._PROJECT, self._query, None, num_splits)
                split_query_fn.start_bundle()
                returned_split_queries = []
                for split_query in split_query_fn.process(self._query):
                    returned_split_queries.append(split_query)

                self.assertEqual(len(returned_split_queries), expected_num_splits)
                self.assertEqual(
                    0, len(self._mock_datastore.run_query.call_args_list))
                self.verify_unique_keys(returned_split_queries)
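# The SplitQueryFn tests rely on two helpers defined elsewhere on the test class
# (split_query and verify_unique_keys). A rough sketch of their likely shape, shown
# as test-class methods (an illustrative reconstruction, not the original code):

def split_query(self, query, num_splits):
    # Produce num_splits copies of the query to stand in for real query splits.
    split_queries = []
    for _ in range(num_splits):
        q = query_pb2.Query()
        q.CopyFrom(query)
        split_queries.append(q)
    return split_queries

def verify_unique_keys(self, queries):
    # SplitQueryFn emits (key, query) pairs; every pair should carry a distinct key.
    keys, _ = zip(*queries)
    self.assertEqual(len(set(keys)), len(queries))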
def test_SplitQueryFn_with_exception(self):
    """A test that verifies that no split is performed when failures occur."""
    with patch.object(helper, 'get_datastore',
                      return_value=self._mock_datastore):
        # Force SplitQueryFn to compute the number of query splits
        num_splits = 0
        expected_num_splits = 1
        entity_bytes = (expected_num_splits *
                        ReadFromDatastore._DEFAULT_BUNDLE_SIZE_BYTES)
        with patch.object(ReadFromDatastore, 'get_estimated_size_bytes',
                          return_value=entity_bytes):
            with patch.object(
                    query_splitter, 'get_splits',
                    side_effect=ValueError("Testing query split error")):
                split_query_fn = ReadFromDatastore.SplitQueryFn(
                    self._PROJECT, self._query, None, num_splits)
                split_query_fn.start_bundle()
                returned_split_queries = []
                for split_query in split_query_fn.process(self._query):
                    returned_split_queries.append(split_query)

                self.assertEqual(len(returned_split_queries), expected_num_splits)
                self.assertEqual(returned_split_queries[0][1], self._query)
                self.assertEqual(
                    0, len(self._mock_datastore.run_query.call_args_list))
                self.verify_unique_keys(returned_split_queries)
def read_from_datastore(project, user_options, pipeline_options):
    """Creates a pipeline that reads entities from Cloud Datastore."""
    p = beam.Pipeline(options=pipeline_options)
    # Create a query to read entities from datastore.
    query = make_ancestor_query(user_options.kind,
                                user_options.namespace,
                                user_options.ancestor)

    # Read entities from Cloud Datastore into a PCollection.
    lines = p | 'read from datastore' >> ReadFromDatastore(
        project, query, user_options.namespace)

    # Count the occurrences of each word.
    counts = (lines
              | 'split' >> (beam.ParDo(WordExtractingDoFn())
                            .with_output_types(unicode))
              | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
              | 'group' >> beam.GroupByKey()
              | 'count' >> beam.Map(lambda (word, ones): (word, sum(ones))))

    # Format the counts into a PCollection of strings.
    output = counts | 'format' >> beam.Map(lambda (word, c): '%s: %s' % (word, c))

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write' >> beam.io.WriteToText(
        file_path_prefix=user_options.output,
        num_shards=user_options.num_shards)

    result = p.run()
    # Wait until completion, so the main thread can access post-completion job results.
    result.wait_until_finish()
    return result
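# make_ancestor_query is defined elsewhere in this example. A sketch of the shape it
# likely has, modeled on the Beam datastore_wordcount sample and assuming the v1
# query/entity protos plus the googledatastore helper module are imported
# (illustrative, not guaranteed to match the original):

def make_ancestor_query(kind, namespace, ancestor):
    # Query for all entities of `kind` that share the given ancestor key.
    ancestor_key = entity_pb2.Key()
    datastore_helper.add_key_path(ancestor_key, kind, ancestor)
    if namespace is not None:
        ancestor_key.partition_id.namespace_id = namespace

    query = query_pb2.Query()
    query.kind.add().name = kind
    datastore_helper.set_property_filter(
        query.filter, '__key__', PropertyFilter.HAS_ANCESTOR, ancestor_key)
    return query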
def model_datastoreio():
    """Using a Read and Write transform to read/write to Cloud Datastore."""
    import uuid
    from google.cloud.proto.datastore.v1 import entity_pb2
    from google.cloud.proto.datastore.v1 import query_pb2
    import googledatastore
    import apache_beam as beam
    from apache_beam.options.pipeline_options import PipelineOptions
    from apache_beam.io.gcp.datastore.v1.datastoreio import ReadFromDatastore
    from apache_beam.io.gcp.datastore.v1.datastoreio import WriteToDatastore

    project = 'my_project'
    kind = 'my_kind'
    query = query_pb2.Query()
    query.kind.add().name = kind

    # [START model_datastoreio_read]
    p = beam.Pipeline(options=PipelineOptions())
    entities = p | 'Read From Datastore' >> ReadFromDatastore(project, query)
    # [END model_datastoreio_read]

    # [START model_datastoreio_write]
    p = beam.Pipeline(options=PipelineOptions())
    musicians = p | 'Musicians' >> beam.Create(
        ['Mozart', 'Chopin', 'Beethoven', 'Vivaldi'])

    def to_entity(content):
        entity = entity_pb2.Entity()
        googledatastore.helper.add_key_path(entity.key, kind, str(uuid.uuid4()))
        googledatastore.helper.add_properties(entity, {'content': unicode(content)})
        return entity

    entities = musicians | 'To Entity' >> beam.Map(to_entity)
    entities | 'Write To Datastore' >> WriteToDatastore(project)
    # [END model_datastoreio_write]
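# Note: model_datastoreio() only constructs these two pipelines to illustrate the read
# and write transforms; an actual job would typically finish each with something like
# result = p.run(); result.wait_until_finish() to execute it.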
def read_from_datastore(user_options, pipeline_options):
    """Creates a pipeline that reads entities from Cloud Datastore."""
    p = beam.Pipeline(options=pipeline_options)
    # Create a query to read entities from datastore.
    query = make_ancestor_query(user_options.inputKind,
                                user_options.namespace,
                                user_options.ancestor)

    # Read entities from Cloud Datastore into a PCollection.
    lines = p | 'read from datastore' >> ReadFromDatastore(
        user_options.project, query, user_options.namespace)

    # Count the occurrences of each word (helper is unused in this pipeline).
    def count_ones(word_ones):
        (word, ones) = word_ones
        return (word, sum(ones))

    # Process each tweet, wrap the result in an output entity, and write it to Datastore.
    processedTweets = (
        lines
        | 'split' >> (beam.ParDo(processTweet()))
        | 'create entity' >> beam.Map(
            EntityWrapper(user_options.namespace, user_options.outputKind,
                          user_options.ancestor).make_entity)
        | 'write to datastore' >> WriteToDatastore(user_options.project))

    result = p.run()
    # Wait until completion, so the main thread can access post-completion job results.
    result.wait_until_finish()
    return result
def test_SplitQueryFn_with_query_limit(self):
    """A test that verifies no split is performed when the query has a limit."""
    with patch.object(helper, 'get_datastore',
                      return_value=self._mock_datastore):
        self._query.limit.value = 3
        split_query_fn = ReadFromDatastore.SplitQueryFn(
            self._PROJECT, self._query, None, 4)
        split_query_fn.start_bundle()
        returned_split_queries = []
        for split_query in split_query_fn.process(self._query):
            returned_split_queries.append(split_query)

        self.assertEqual(1, len(returned_split_queries))
        self.assertEqual(0, len(self._mock_datastore.method_calls))
def expand(self, pcoll):
    query = query_pb2.Query()
    query.kind.add().name = 'Tweet'
    now = datetime.datetime.now()
    # The 'earlier' var will be set to a static value on template creation.
    # That is, because of the way that templates work, the value is defined
    # at template compile time, not runtime.
    # But defining a filter based on this value will still serve to make the
    # query more efficient than if we didn't filter at all.
    earlier = now - datetime.timedelta(days=self.days)
    datastore_helper.set_property_filter(query.filter, 'created_at',
                                         PropertyFilter.GREATER_THAN,
                                         earlier)
    return (pcoll
            | 'read from datastore' >> ReadFromDatastore(
                self.project, query, None))
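# For context, this expand() would sit on a composite PTransform that carries the
# project and the look-back window. A minimal sketch of how such a transform might be
# declared and applied (the class name and constructor are assumptions, not the
# original code):

class ReadRecentTweets(beam.PTransform):
    """Reads Tweet entities created within the last `days` days."""

    def __init__(self, project, days):
        super(ReadRecentTweets, self).__init__()
        self.project = project
        self.days = days

    # The expand(self, pcoll) method shown above completes this transform.

# Applied at the pipeline root it yields a PCollection of Tweet entities, e.g.:
#   tweets = p | 'recent tweets' >> ReadRecentTweets(project, days=7)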
def test_SplitQueryFn_with_num_splits(self):
    with patch.object(helper, 'get_datastore',
                      return_value=self._mock_datastore):
        num_splits = 23

        def fake_get_splits(datastore, query, num_splits, partition=None):
            return self.split_query(query, num_splits)

        with patch.object(query_splitter, 'get_splits',
                          side_effect=fake_get_splits):
            split_query_fn = ReadFromDatastore.SplitQueryFn(
                self._PROJECT, self._query, None, num_splits)
            split_query_fn.start_bundle()
            returned_split_queries = []
            for split_query in split_query_fn.process(self._query):
                returned_split_queries.append(split_query)

            self.assertEqual(len(returned_split_queries), num_splits)
            self.assertEqual(
                0, len(self._mock_datastore.run_query.call_args_list))
            self.verify_unique_keys(returned_split_queries)
def run(argv=None):
    """Main entry point."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--kind',
                        dest='kind',
                        default='writereadtest',
                        help='Datastore Kind')
    parser.add_argument('--num_entities',
                        dest='num_entities',
                        type=int,
                        required=True,
                        help='Number of entities to write')
    parser.add_argument('--limit',
                        dest='limit',
                        type=int,
                        help='Limit of number of entities to write')

    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
    job_name = gcloud_options.job_name
    kind = known_args.kind
    num_entities = known_args.num_entities
    project = gcloud_options.project
    # A random ancestor key.
    ancestor = str(uuid.uuid4())
    query = make_ancestor_query(kind, None, ancestor)

    # Pipeline 1: Create and write the specified number of Entities to the
    # Cloud Datastore.
    logging.info('Writing %s entities to %s', num_entities, project)
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-write')

    # pylint: disable=expression-not-assigned
    (p
     | 'Input' >> beam.Create(list(range(known_args.num_entities)))
     | 'To String' >> beam.Map(str)
     | 'To Entity' >> beam.Map(EntityWrapper(kind, None, ancestor).make_entity)
     | 'Write to Datastore' >> WriteToDatastore(project))
    p.run()

    # Optional Pipeline 2: If a read limit was provided, read it and confirm
    # that the expected entities were read.
    if known_args.limit is not None:
        logging.info('Querying a limited set of %s entities and verifying count.',
                     known_args.limit)
        p = new_pipeline_with_job_name(pipeline_options, job_name, '-verify-limit')
        query_with_limit = query_pb2.Query()
        query_with_limit.CopyFrom(query)
        query_with_limit.limit.value = known_args.limit
        entities = p | 'read from datastore' >> ReadFromDatastore(
            project, query_with_limit)
        assert_that(entities | beam.combiners.Count.Globally(),
                    equal_to([known_args.limit]))
        p.run()

    # Pipeline 3: Query the written Entities and verify result.
    logging.info('Querying entities, asserting they match.')
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-verify')
    entities = p | 'read from datastore' >> ReadFromDatastore(project, query)
    assert_that(entities | beam.combiners.Count.Globally(),
                equal_to([num_entities]))
    p.run()

    # Pipeline 4: Delete Entities.
    logging.info('Deleting entities.')
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-delete')
    entities = p | 'read from datastore' >> ReadFromDatastore(project, query)
    # pylint: disable=expression-not-assigned
    (entities
     | 'To Keys' >> beam.Map(lambda entity: entity.key)
     | 'Delete keys' >> DeleteFromDatastore(project))
    p.run()

    # Pipeline 5: Query the written Entities, verify no results.
    logging.info('Querying for the entities to make sure there are none present.')
    p = new_pipeline_with_job_name(pipeline_options, job_name, '-verify-deleted')
    entities = p | 'read from datastore' >> ReadFromDatastore(project, query)
    assert_that(entities | beam.combiners.Count.Globally(),
                equal_to([0]))
    p.run()
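# EntityWrapper and new_pipeline_with_job_name live elsewhere in this integration-test
# module. A rough sketch of their likely shape, assuming the v1 entity protos and the
# googledatastore helper used above (illustrative reconstructions, not the originals):

def new_pipeline_with_job_name(pipeline_options, job_name, suffix):
    # Give each stage of the test its own job name by appending a suffix.
    gcloud_options = pipeline_options.view_as(GoogleCloudOptions)
    if job_name:
        gcloud_options.job_name = job_name + suffix
    return beam.Pipeline(options=pipeline_options)

class EntityWrapper(object):
    """Creates a Cloud Datastore entity of `kind` under a common ancestor."""

    def __init__(self, kind, namespace, ancestor):
        self._kind = kind
        self._namespace = namespace
        self._ancestor = ancestor

    def make_entity(self, content):
        entity = entity_pb2.Entity()
        if self._namespace is not None:
            entity.key.partition_id.namespace_id = self._namespace
        # Every entity shares the same ancestor so the ancestor query above finds them all.
        datastore_helper.add_key_path(entity.key, self._kind, self._ancestor,
                                      self._kind, str(uuid.uuid4()))
        datastore_helper.add_properties(entity, {'content': unicode(content)})
        return entity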
def run():
    import pickle
    import sys
    import math
    import numpy as np

    reload(sys)
    sys.setdefaultencoding('utf8')

    from gensim.models import KeyedVectors
    import apache_beam as beam
    from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions, StandardOptions, SetupOptions
    from apache_beam.io.gcp.datastore.v1.datastoreio import ReadFromDatastore
    from google.cloud.proto.datastore.v1 import query_pb2
    from apache_beam.io.textio import WriteToText
    import nltk.data
    import re
    import uuid
    import perceptron

    _sentence_tokenizer = nltk.data.load("./tokenizer/punkt_turkish.pickle")
    abbreviations = set()
    with open("./tokenizer/abbreviations-long.txt") as f:
        for l in f:
            abbreviations.add(l.split(':')[0])
    _sentence_tokenizer._params.abbrev_types = abbreviations

    model_file = "perceptron_word2vec_stemmed_normalized.pickle"
    with open(model_file, 'rb') as model:
        w, b = pickle.load(model)

    def sentences_from_text(text):
        return _sentence_tokenizer.tokenize(text.strip())

    def tokens_from_sentence(sentence):
        return sentence.split(" ")  # nltk.word_tokenize(sentence)

    def ngrams(obj, n):
        tokens = []
        sentences = (
            sentences_from_text(obj["title"]) +
            sentences_from_text(obj["description"]) +
            sentences_from_text(obj["content"])
        )
        for sentence in sentences:
            tokens += tokens_from_sentence(sentence)
        pairs = nltk.ngrams(tokens, n)
        return [" ".join(pair) for pair in pairs]

    def convertToObject(jsonObj):
        x = jsonObj
        link = x.properties.get('link', None)
        link = link.string_value if link else ""
        title = x.properties.get('title', None)
        title = title.string_value if title else ""
        description = x.properties.get("description", None)
        description = description.string_value if description else ""
        content = x.properties.get("text", "")
        content = content.string_value if content else ""
        published = x.properties.get("published")
        published = published.string_value if published else ""

        obj = {
            "link": link,
            "title": title,
            "description": description,
            "content": content,
            "published": published
        }
        obj["key"] = obj["link"] if obj["link"] else str(uuid.uuid4())
        return obj

    # https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
    def cleanhtml(raw_html):
        cleanr = re.compile('<.*?>')
        cleantext = re.sub(cleanr, '', raw_html)
        return cleantext

    def removeHTMLFromStrings(obj):
        for key in obj.keys():
            obj[key] = cleanhtml(obj[key])
        return obj

    def tokenize_to_sentences(obj):
        obj["sentences"] = (
            sentences_from_text(obj["title"]) +
            sentences_from_text(obj["description"]) +
            sentences_from_text(obj["content"])
        )
        return obj

    def tokenize_to_words(obj):
        obj["tokens"] = []
        for sentence in obj["sentences"]:
            obj["tokens"] += tokens_from_sentence(sentence)
        for token in obj["tokens"]:
            yield (obj["key"], token)

    options = PipelineOptions()
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'news-197916'
    google_cloud_options.job_name = 'sentiment-analysis'
    google_cloud_options.staging_location = 'gs://news-197916.appspot.com/word_count/'
    google_cloud_options.temp_location = 'gs://news-197916.appspot.com/df_tmp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'
    setup_options = options.view_as(SetupOptions)
    setup_options.requirements_file = "requirements.txt"
    setup_options.save_main_session = True

    p = beam.Pipeline(options=options)

    query = query_pb2.Query()
    query.kind.add().name = "News_Entry"

    pairs = (p
             | 'Read From Datastore' >> ReadFromDatastore(
                 project=google_cloud_options.project, query=query)
             # | "Read From Text" >> ReadFromText("news.json",
             #                                    coder=beam.coders.coders.StrUtf8Coder())  # line by line
             # | "Convert to Json Object" >> beam.Map(convertToJsonObj)
             | "Convert to Python Object" >> beam.Map(convertToObject)
             | "Remove HTML Tags From Strings (Normalization 1)" >> beam.Map(removeHTMLFromStrings)
             )

    tokens_1gram = (pairs
                    | 'Sentence Tokenization' >> beam.Map(tokenize_to_sentences)
                    | 'Word Tokenization' >> beam.FlatMap(tokenize_to_words)  # also convert to key value pairs
                    )
    """
    tokens_2gram = (pairs
                    | "Create 2-grams" >> beam.FlatMap(
                        lambda obj: [(obj["key"], token) for token in ngrams(obj, 2)])
                    )
    """
    tokens = tokens_1gram
    """
    vocabulary = (tokens
                  | "Get words only" >> beam.Values()
                  | "Remove duplicate words" >> beam.RemoveDuplicates()
                  )
    vocabulary_size = (vocabulary
                       | "Count Vocabulary elements" >> beam.combiners.Count.Globally()
                       )
    doc_total_words = (tokens
                       | "Count Words of Doc" >> beam.combiners.Count.PerKey()
                       )
    """
    tokens_paired_with_1 = (tokens
                            | "Pair with 1" >> beam.Map(lambda (doc, token): ((doc, token), 1))
                            )
    """
    token_counts_per_doc = (tokens_paired_with_1
                            | "Group by Doc,Word" >> beam.GroupByKey()
                            | "Count ones" >> beam.Map(
                                lambda ((doc, token), counts): (doc, (token, sum(counts))))
                            | "Group by Doc" >> beam.GroupByKey()
                            )
    num_docs = (token_counts_per_doc
                | "Get Docs" >> beam.Keys()
                | "Count Docs" >> beam.combiners.Count.Globally()
                )
    word_tf_pre = (
        {'total_tokens': doc_total_words, 'token_counts_per_doc': token_counts_per_doc}
        | "CoGroup By Document" >> beam.CoGroupByKey()
    )

    def calc_tf((doc, count)):
        [token_count] = count['token_counts_per_doc']
        [tokens_total] = count['total_tokens']
        for token, cnt in token_count:
            yield token, (doc, float(cnt) / tokens_total)

    doc_word_tf = (word_tf_pre
                   | "Compute Term Frequencies" >> beam.FlatMap(calc_tf)
                   )
    word_occurrences = (tokens
                        | "Remove Multiple occurrences per doc" >> beam.RemoveDuplicates()
                        | "Pair with 1s" >> beam.Map(lambda (doc, word): (word, 1))
                        | "Group by Word" >> beam.GroupByKey()
                        | "Sum 1s" >> beam.Map(lambda (word, counts): (word, sum(counts)))
                        )
    token_df = (word_occurrences
                | "Compute Document Frequency" >> beam.Map(
                    lambda (token, count), total: (token, float(count) / total),
                    AsSingleton(num_docs)))
    token_tf_df = (
        {'term_frequency': doc_word_tf, 'document_frequency': token_df}
        | "CoGroup By Token" >> beam.CoGroupByKey())

    def calc_tfidf((token, tfdf)):
        [df] = tfdf['document_frequency']
        for doc, tf in tfdf['term_frequency']:
            yield (doc, token), tf * math.log(1.0 / df)

    token_tf_idf = (token_tf_df
                    | "Calculate TF-IDF Scores" >> beam.FlatMap(calc_tfidf)
                    )
    """
    word2vec = KeyedVectors.load_word2vec_format('tr_word2vec', binary=True)

    def get_vec(word2vec, token):
        if word2vec is None:
            word2vec = KeyedVectors.load_word2vec_format('tr_word2vec', binary=True)
        try:
            x = word2vec.get_vector(token)
            x = x.reshape(400)
        except:
            x = np.zeros(400)
        return x

    def analyze_sentiment(x):
        res = perceptron.f(x, w, b)
        return res

    doc_sentiment = (tokens_paired_with_1
                     | "Create Word2Vec Vector" >> beam.Map(
                         lambda ((doc, token), cnt): (doc, get_vec(word2vec, token)))
                     | "Group Word2Vec Vectors By Document" >> beam.GroupByKey()
                     | "Sum Word2Vec Vectors" >> beam.Map(
                         lambda (doc, vecs): (doc, analyze_sentiment(np.sum(vecs, axis=0))[0]))
                     )

    result = (doc_sentiment
              | "Format Results" >> beam.Map(lambda (doc, tokens): '%s %s' % (doc, tokens))
              )

    (result
     | "Write Results" >> WriteToText("sentiments")
     )

    p.run()
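# The module's entry point would typically be guarded in the usual way; an assumed
# invocation (not shown in the original) might be:

if __name__ == '__main__':
    run()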
def process_datastore_tweets(project, dataset, pipeline_options):
    """Creates a pipeline that reads tweets from Cloud Datastore from the last
    N days. The pipeline finds the top most-used words, the top most-tweeted
    URLs, and ranks word co-occurrences by an 'interestingness' metric
    (similar to tf*idf).
    """
    ts = str(datetime.datetime.utcnow())
    p = beam.Pipeline(options=pipeline_options)

    # Create a query to read entities from datastore.
    query = make_query('Tweet')

    # Read entities from Cloud Datastore into a PCollection.
    lines = (p
             | 'read from datastore' >> ReadFromDatastore(project, query, None))

    global_count = AsSingleton(
        lines
        | 'global count' >> beam.combiners.Count.Globally())

    # Count the occurrences of each word.
    percents = (lines
                | 'split' >> (beam.ParDo(WordExtractingDoFn())
                              .with_output_types(unicode))
                | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
                | 'group' >> beam.GroupByKey()
                | 'count' >> beam.Map(lambda (word, ones): (word, sum(ones)))
                | 'in tweets percent' >> beam.Map(
                    lambda (word, wsum), gc: (word, float(wsum) / gc), global_count))
    top_percents = (percents
                    | 'top 500' >> combiners.Top.Of(500, lambda x, y: x[1] < y[1]))

    # Count the occurrences of each expanded url in the tweets
    url_counts = (lines
                  | 'geturls' >> (beam.ParDo(URLExtractingDoFn())
                                  .with_output_types(unicode))
                  | 'urls_pair_with_one' >> beam.Map(lambda x: (x, 1))
                  | 'urls_group' >> beam.GroupByKey()
                  | 'urls_count' >> beam.Map(lambda (word, ones): (word, sum(ones)))
                  | 'urls top 300' >> combiners.Top.Of(300, lambda x, y: x[1] < y[1]))

    # Define some inline helper functions.

    def join_cinfo(cooccur, percents):
        """Calculate a co-occurence ranking."""
        import math

        word1 = cooccur[0][0]
        word2 = cooccur[0][1]
        try:
            word1_percent = percents[word1]
            weight1 = 1 / word1_percent
            word2_percent = percents[word2]
            weight2 = 1 / word2_percent
            return (cooccur[0], cooccur[1],
                    cooccur[1] * math.log(min(weight1, weight2)))
        except:
            return 0

    def generate_cooccur_schema():
        """BigQuery schema for the word co-occurrence table."""
        json_str = json.dumps({'fields': [
            {'name': 'w1', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'w2', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'count', 'type': 'INTEGER', 'mode': 'NULLABLE'},
            {'name': 'log_weight', 'type': 'FLOAT', 'mode': 'NULLABLE'},
            {'name': 'ts', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'}]})
        return parse_table_schema_from_json(json_str)

    def generate_url_schema():
        """BigQuery schema for the urls count table."""
        json_str = json.dumps({'fields': [
            {'name': 'url', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'count', 'type': 'INTEGER', 'mode': 'NULLABLE'},
            {'name': 'ts', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'}]})
        return parse_table_schema_from_json(json_str)

    def generate_wc_schema():
        """BigQuery schema for the word count table."""
        json_str = json.dumps({'fields': [
            {'name': 'word', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'percent', 'type': 'FLOAT', 'mode': 'NULLABLE'},
            {'name': 'ts', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'}]})
        return parse_table_schema_from_json(json_str)

    # Now build the rest of the pipeline.
    # Calculate the word co-occurence scores.
    cooccur_rankings = (lines
                        | 'getcooccur' >> (beam.ParDo(CoOccurExtractingDoFn()))
                        | 'co_pair_with_one' >> beam.Map(lambda x: (x, 1))
                        | 'co_group' >> beam.GroupByKey()
                        | 'co_count' >> beam.Map(lambda (wordts, ones): (wordts, sum(ones)))
                        | 'weights' >> beam.Map(join_cinfo, AsDict(percents))
                        | 'co top 300' >> combiners.Top.Of(300, lambda x, y: x[2] < y[2]))

    # Format the counts into a PCollection of strings.
    wc_records = top_percents | 'format' >> beam.FlatMap(
        lambda x: [{'word': xx[0], 'percent': xx[1], 'ts': ts} for xx in x])

    url_records = url_counts | 'urls_format' >> beam.FlatMap(
        lambda x: [{'url': xx[0], 'count': xx[1], 'ts': ts} for xx in x])

    co_records = cooccur_rankings | 'co_format' >> beam.FlatMap(
        lambda x: [{'w1': xx[0][0], 'w2': xx[0][1], 'count': xx[1],
                    'log_weight': xx[2], 'ts': ts} for xx in x])

    # Write the results to three BigQuery tables.
    wc_records | 'wc_write_bq' >> beam.io.Write(
        beam.io.BigQuerySink(
            '%s:%s.word_counts' % (project, dataset),
            schema=generate_wc_schema(),
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    url_records | 'urls_write_bq' >> beam.io.Write(
        beam.io.BigQuerySink(
            '%s:%s.urls' % (project, dataset),
            schema=generate_url_schema(),
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    co_records | 'co_write_bq' >> beam.io.Write(
        beam.io.BigQuerySink(
            '%s:%s.word_cooccur' % (project, dataset),
            schema=generate_cooccur_schema(),
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    # Actually run the pipeline.
    return p.run()
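# make_query is a small helper defined elsewhere in this example. A sketch of a likely
# implementation, assuming the v1 query_pb2 protos expected by ReadFromDatastore
# (illustrative reconstruction, not the original):

def make_query(kind):
    # Build a Datastore query over all entities of the given kind.
    query = query_pb2.Query()
    query.kind.add().name = kind
    return query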
def run_pipeline(project, pipeline_options, run_locally, debug_attendees):
    """Creates a pipeline that reads DBEvent entities from Cloud Datastore,
    builds top-attendee rankings, and writes the ranking records back."""
    p = beam.Pipeline(options=pipeline_options)
    # Create a query to read entities from datastore.
    client = datastore.Client()
    q = client.query(kind='DBEvent')

    if run_locally:
        q.key_filter(client.key('DBEvent', '999'), '>')
        q.key_filter(client.key('DBEvent', 'A'), '<')

    # Let's build a timestamp to save all our objects with
    timestamp = datetime.datetime.now()

    # Set up our map/reduce pipeline
    produce_attendees = (
        p
        | 'read from datastore' >> ReadFromDatastore(project,
                                                     query._pb_from_query(q),
                                                     num_splits=400)
        | 'convert to entity' >> beam.Map(ConvertToEntity)
        # Find the events we want to count, and expand all the admins/attendees
        | 'filter events' >> beam.FlatMap(CountableEvent)
        | 'load fb attending' >> beam.ParDo(GetEventAndAttending())
        | 'export attendees' >> beam.FlatMap(ExportPeople)
    )

    top_attendee_lists = (
        produce_attendees
        | 'map category -> person' >> beam.FlatMap(GroupPeopleByCategory)
        | 'group by category' >> beam.GroupByKey()
        | 'build top-people lists' >> beam.FlatMap(CountPeopleInfos)
    )

    if debug_attendees:
        attendee_event_debugging = (
            produce_attendees
            | 'map city-attendee -> event' >> beam.FlatMap(DebugExportEventPeopleForGrouping)
            | 'group by city-attendee' >> beam.GroupByKey()
            | 'within city-attendee, group event_ids by admin_hash' >> beam.FlatMap(DebugGroupEventIds)
        )

        exploded_top_attendees = (
            top_attendee_lists
            | 'explode the top attendees into a mapping: category-attendee -> YES' >> beam.FlatMap(DebugExplodeAttendeeList)
            # We don't deal with duplicates, since it requires the objects (ie our dicts) to be hashable
            # Instead, we rely on DebugFilterForTopAttendee to filter out duplicates created by the above
            # | 'remove duplicates from multiple overlapping attendee-lists' >> beam.RemoveDuplicates()
        )

        (
            # These both have the same keys:
            # key contains {person_type, city, category, person_id}
            (attendee_event_debugging, exploded_top_attendees) | beam.Flatten()
            # keys are {city, person_id}
            | 'group the attendee-debug info with the is-it-a-top-attendee info' >> beam.GroupByKey()
            | 'filter for TOP_ATTENDEE' >> beam.FlatMap(DebugFilterForTopAttendee)
            | 'build PRDebugAttendee' >> beam.ParDo(DebugBuildPRDebugAttendee(), timestamp)
            | 'write PRDebugAttendee to datastore (unbatched)' >> beam.ParDo(
                WriteToDatastoreSingle(), actually_save=not run_locally)
        )

    (
        top_attendee_lists
        | 'generate PRCityCategory database record' >> beam.ParDo(
            BuildPRCityCategory(), timestamp, 'PRCityCategory', TOP_ALL_N)
        | 'write PRCityCategory to datastore (unbatched)' >> beam.ParDo(
            WriteToDatastoreSingle(), actually_save=not run_locally)
    )
    """
    (output
        | 'convert from entity' >> beam.Map(ConvertFromEntity)
        | 'write to datastore' >> WriteToDatastore(client.project)
    )
    """

    # Actually run the pipeline (all operations above are deferred).
    result = p.run()
    # Wait until completion, so the main thread can access post-completion job results.
    result.wait_until_finish()
    return result
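# WriteToDatastoreSingle, ConvertToEntity, and the other DoFns above come from the
# surrounding project and are not shown here. A minimal sketch of what a
# WriteToDatastoreSingle DoFn could look like, assuming elements are
# google.cloud.datastore.Entity objects written one at a time (an illustrative
# assumption, not the original implementation):

class WriteToDatastoreSingle(beam.DoFn):
    def process(self, entity, actually_save=True):
        # Persist a single entity; skip the write entirely during dry runs.
        if actually_save:
            client = datastore.Client()
            client.put(entity)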