def run_pipeline(in_file):
    import csv

    import apache_beam as beam
    from apache_beam.io.textio import ReadFromText
    from apache_beam.io.textio import WriteToText

    # Simple Apache Beam pipeline.
    with beam.Pipeline(runner='DirectRunner') as p:
        #
        # Pipeline(0): Data ingestion
        #
        # "collections" will be a PCollection of the file's lines.
        # Options
        #   file_pattern: path to the input file
        #   skip_header_lines: number of header lines to skip; set to 1.
        # https://beam.apache.org/releases/pydoc/2.11.0/apache_beam.io.textio.html#apache_beam.io.textio.ReadFromText
        collections = p | 'ReadAirportInfo' >> ReadFromText(
            file_pattern=in_file[0], skip_header_lines=1)

        #
        # Pipeline(1): Create side input
        # The final PCollection is used as a side input for the date/time
        # conversion in the next transformation.
        #   1. Parse each line and return the fields as a list. Use the csv
        #      module to strip any double quotes inside fields.
        #   2. Filter out invalid fields.
        #   3. Keep only "AIRPORT_SEQ_ID"(0), "LATITUDE"(21), "LONGITUDE"(26),
        #      and add the timezone for the corresponding coordinates.
        #
        airports = (collections
                    | 'airports:Extract' >> beam.Map(
                        lambda x: next(csv.reader([x], delimiter=',')))
                    | 'airports:Filter' >> beam.Filter(lambda x: x[21] and x[26])
                    | 'airports:Timezone' >> beam.Map(
                        lambda x: (x[0], addtimezone(x[21], x[26]))))

        #
        # Pipeline(2): Correct timezone
        #   1. Read the flight data.
        #   2. Convert times into UTC.
        flights = (p
                   | 'flights:read' >> ReadFromText(
                       file_pattern=in_file[1], skip_header_lines=1)
                   | 'flights:tzcorr' >> beam.FlatMap(
                       tz_correct, beam.pvalue.AsDict(airports)))

        # Write results to a file.
        # https://beam.apache.org/releases/pydoc/2.11.0/apache_beam.io.textio.html#apache_beam.io.textio.WriteToText
        (flights | 'flights:out' >> WriteToText(file_path_prefix='flights'))

        # Pipeline(3): Generate departed and arrived events
        events = flights | 'flights:events' >> beam.FlatMap(get_next_event)

        #
        # Pipeline(Final)
        #
        # Write results to a file.
        # https://beam.apache.org/releases/pydoc/2.11.0/apache_beam.io.textio.html#apache_beam.io.textio.WriteToText
        (events | 'event:out' >> WriteToText(file_path_prefix='events'))
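# addtimezone, tz_correct and get_next_event are helpers that the pipeline
# above assumes but does not define. A minimal sketch of addtimezone using
# the timezonefinder package (an assumption, not the original code);
# tz_correct and get_next_event are likewise assumed to convert flight times
# to UTC and to emit one event per departure/arrival:
def addtimezone(lat, lon):
    try:
        from timezonefinder import TimezoneFinder
        tf = TimezoneFinder()
        # timezone_at returns an IANA name such as 'America/New_York'
        return (lat, lon, tf.timezone_at(lng=float(lon), lat=float(lat)))
    except ValueError:
        return (lat, lon, 'TIMEZONE')  # coordinates could not be parsed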
def run(p, args, aggregator_dict, cloud_logger=None):
    """Run the pipeline with the args and dataflow pipeline option."""
    # Create a PCollection for model directory.
    model_dir = p | "Create Model Directory" >> beam.Create([args.model_dir])

    input_file_format = args.input_file_format.lower()

    # Create one pcollection per input file or file pattern. And then flatten
    # them into one pcollection. The duplicated names need to be removed as
    # the file name is used to create unique labels for the PTransform.
    readers = []
    for pattern in list(
            set(args.input_file_patterns.split(FILE_LIST_SEPARATOR))):
        # Setup reader.
        #
        # TODO(user): Perhaps simplify the batch prediction code by using
        # CompressionTypes.AUTO.
        if input_file_format.startswith("tfrecord"):
            if input_file_format == "tfrecord_gzip":
                compression_type = CompressionTypes.GZIP
            else:
                assert input_file_format == "tfrecord"
                compression_type = CompressionTypes.UNCOMPRESSED
            reader = "READ_TFRECORD_FILES_%s" % pattern >> ReadFromTFRecord(
                pattern, compression_type=compression_type)
        else:
            assert input_file_format == "text"
            reader = "READ_TEXT_FILES_%s" % pattern >> ReadFromText(pattern)

        # Put the pcollections into a list and flatten later.
        readers.append(p | reader)

    # Setup the whole pipeline.
    results, errors = (readers
                       | beam.Flatten()
                       | "BATCH_PREDICTION" >> batch_prediction.BatchPredict(
                           beam.pvalue.AsSingleton(model_dir),
                           batch_size=args.batch_size,
                           aggregator_dict=aggregator_dict,
                           cloud_logger=cloud_logger))

    # Convert predictions to JSON and then write to output files.
    _ = (results
         | "TO_JSON" >> beam.Map(json.dumps)
         | "WRITE_PREDICTION_RESULTS" >> WriteToText(
             os.path.join(args.output_location,
                          OUTPUT_RESULTS_FILES_BASENAME_)))

    # Write prediction errors counts to output files.
    _ = (errors
         | "GROUP_BY_ERROR_TYPE" >> beam.combiners.Count.PerKey()
         | "WRITE_ERRORS" >> WriteToText(
             os.path.join(args.output_location,
                          OUTPUT_ERRORS_FILES_BASENAME_)))

    return p.run()
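# The module-level constants referenced above are not shown; plausible
# definitions (the exact values are assumptions based on how they are used):
FILE_LIST_SEPARATOR = ','
OUTPUT_RESULTS_FILES_BASENAME_ = 'prediction.results'
OUTPUT_ERRORS_FILES_BASENAME_ = 'prediction.errors_stats'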
def main():
    args, pipeline_args = get_args()

    # PipelineOptions configures the pipeline itself.
    # For example, you can set the pipeline runner to choose what will
    # execute the pipeline.
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        # Input data file -> TextIO.Read Transform -> PCollection(lines)
        lines = p | ReadFromText(args.input)

        counts = (
            lines
            | 'Split' >> (beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
                          .with_output_types(str))
            | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
            | 'GroupAndSum' >> beam.CombinePerKey(sum))

        # Format the counts into a PCollection of strings.
        def format_result(word_count):
            (word, count) = word_count
            return '%s: %s' % (word, count)

        output = counts | 'Format' >> beam.Map(format_result)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | WriteToText(args.output)
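# get_args() is not shown; a minimal sketch that splits this script's own
# flags from the options forwarded to Beam (the flag names are assumptions):
def get_args():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', required=True, help='Input file to process.')
    parser.add_argument('--output', required=True, help='Output file prefix.')
    # parse_known_args returns (known args, leftover args for PipelineOptions)
    return parser.parse_known_args()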
def run_pipeline(in_file, out_file):
    # Simple Apache Beam pipeline.
    with beam.Pipeline(runner='DirectRunner') as p:
        #
        # Pipeline(0): Data ingestion
        #
        # "collections" will be a PCollection of the file's lines.
        # Options
        #   file_pattern: path to the input file
        #   skip_header_lines: number of header lines to skip; set to 1.
        # https://beam.apache.org/releases/pydoc/2.11.0/apache_beam.io.textio.html#apache_beam.io.textio.ReadFromText
        collections = p | ReadFromText(file_pattern=in_file,
                                       skip_header_lines=1)

        #
        # Pipeline(n): Detailed transformation
        #   1. Parse each line and return the fields as a list. Use the csv
        #      module to strip any double quotes inside fields.
        #   2. Keep only "AIRPORT_SEQ_ID"(0), "LATITUDE"(21), "LONGITUDE"(26).
        #
        airports = (collections
                    | 'Extract_Into_Fields' >> beam.Map(
                        lambda x: next(csv.reader([x], delimiter=',')))
                    | 'Set_Fields' >> beam.Map(
                        lambda x: (x[0], (x[21], x[26]))))

        #
        # Pipeline(Final)
        #
        # Write results to a file. Written without Python 2 tuple-parameter
        # unpacking, which is a syntax error on Python 3.
        # https://beam.apache.org/releases/pydoc/2.11.0/apache_beam.io.textio.html#apache_beam.io.textio.WriteToText
        (airports
         | beam.Map(lambda kv: "{0},{1}".format(kv[0], ','.join(kv[1])))
         | WriteToText(file_path_prefix=out_file))
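# Example invocation (paths are illustrative, not from the original):
#   run_pipeline('airports.csv.gz', 'extracted_airports')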
def get(self):
    """Flask view that triggers the execution of the pipeline."""
    input_filename = 'data/input/titanic.txt'
    output_filename = 'data/output/titanic.txt'

    # project_id = os.environ['DATASTORE_PROJECT_ID']
    # credentials_file = os.environ['GOOGLE_APPLICATION_CREDENTIALS']
    # client = datastore.Client.from_service_account_json(credentials_file)

    options = PipelineOptions()
    gcloud_options = options.view_as(GoogleCloudOptions)
    # gcloud_options.project = project_id
    gcloud_options.job_name = 'test-job'

    # Dataflow runner
    runner = os.environ['DATAFLOW_RUNNER']
    options.view_as(StandardOptions).runner = runner

    with apache_beam.Pipeline(options=options) as p:
        rows = (p
                | ReadFromText(input_filename)
                | apache_beam.ParDo(Split()))

        survived = (rows
                    | apache_beam.ParDo(CollectSurvived())
                    | apache_beam.GroupByKey()
                    | apache_beam.ParDo(WriteToCSV())
                    | WriteToText(output_filename))

    return 'All Titanic survivors are written to data/output/titanic.txt-00000-of-00001'
def get(self):
    """Flask view that triggers the execution of the pipeline."""
    input_filename = 'input.txt'
    output_filename = 'output.txt'

    # project_id = os.environ['DATASTORE_PROJECT_ID']
    # credentials_file = os.environ['GOOGLE_APPLICATION_CREDENTIALS']
    # client = datastore.Client.from_service_account_json(credentials_file)

    options = PipelineOptions()
    gcloud_options = options.view_as(GoogleCloudOptions)
    # gcloud_options.project = project_id
    gcloud_options.job_name = 'test-job'

    # Dataflow runner
    runner = os.environ['DATAFLOW_RUNNER']
    options.view_as(StandardOptions).runner = runner

    with apache_beam.Pipeline(options=options) as p:
        rows = (
            p
            | ReadFromText(input_filename)
            | apache_beam.ParDo(Split())
        )

        timings = (
            rows
            | apache_beam.ParDo(CollectTimings())
            | "Grouping timings" >> apache_beam.GroupByKey()
            | "Calculating average" >> apache_beam.CombineValues(
                apache_beam.combiners.MeanCombineFn()
            )
        )

        users = (
            rows
            | apache_beam.ParDo(CollectUsers())
            | "Grouping users" >> apache_beam.GroupByKey()
            | "Counting users" >> apache_beam.CombineValues(
                apache_beam.combiners.CountCombineFn()
            )
        )

        to_be_joined = (
            {
                'timings': timings,
                'users': users
            }
            | apache_beam.CoGroupByKey()
            | apache_beam.ParDo(WriteToCSV())
            | WriteToText(output_filename)
        )

    return 'ok'
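# Split, CollectTimings and CollectUsers are user DoFns defined elsewhere;
# plausible sketches, assuming input rows of the form "date,user_id,timing"
# (the column layout is an assumption):
class Split(apache_beam.DoFn):
    def process(self, element):
        date, user_id, timing = element.split(',')
        yield {'date': date, 'user_id': user_id, 'timing': float(timing)}


class CollectTimings(apache_beam.DoFn):
    def process(self, element):
        # key by user so timings can be grouped and averaged per user
        yield (element['user_id'], element['timing'])


class CollectUsers(apache_beam.DoFn):
    def process(self, element):
        # one entry per row; counting these per key yields rows per user
        yield (element['user_id'], 1)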
def run(p, args, aggregator_dict):
    """Run the pipeline with the args and dataflow pipeline option."""
    # Create a PCollection for model directory.
    model_dir = p | "Create Model Directory" >> beam.Create([args.model_dir])

    input_file_format = args.input_file_format.lower()
    input_file_patterns = args.input_file_patterns

    # Setup reader.
    if input_file_format == "text":
        reader = p | "READ_TEXT_FILES" >> ReadFromMultiFilesText(
            input_file_patterns)
    elif input_file_format == "tfrecord":
        reader = p | "READ_TF_FILES" >> ReadFromMultiFilesTFRecord(
            input_file_patterns)
    elif input_file_format == "tfrecord_gzip":
        reader = p | "READ_TFGZIP_FILES" >> ReadFromMultiFilesTFRecordGZip(
            input_file_patterns)

    # Setup the whole pipeline.
    results, errors = (reader
                       | "BATCH_PREDICTION" >> batch_prediction.BatchPredict(
                           beam.pvalue.AsSingleton(model_dir),
                           tags=args.tags,
                           signature_name=args.signature_name,
                           batch_size=args.batch_size,
                           aggregator_dict=aggregator_dict,
                           user_project_id=args.user_project_id,
                           user_job_id=args.user_job_id,
                           framework=args.framework))

    # Convert predictions to JSON and then write to output files.
    _ = (results
         | "TO_JSON" >> beam.Map(json.dumps)
         | "WRITE_PREDICTION_RESULTS" >> WriteToText(
             args.output_result_prefix))

    # Write prediction errors counts to output files.
    _ = (errors
         | "GROUP_BY_ERROR_TYPE" >> beam.combiners.Count.PerKey()
         | "WRITE_ERRORS" >> WriteToText(args.output_error_prefix))

    return p.run()
def test_write_pipeline(self):
    with TestPipeline() as pipeline:
        pcoll = pipeline | beam.core.Create(self.lines)
        pcoll | 'Write' >> WriteToText(self.path)  # pylint: disable=expression-not-assigned

    read_result = []
    for file_name in glob.glob(self.path + '*'):
        with open(file_name, 'rb') as f:
            read_result.extend(f.read().splitlines())

    self.assertEqual(sorted(read_result), sorted(self.lines))
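# The WriteToText tests in this section assume a fixture along these lines
# (a sketch; the real setUp lives elsewhere in the test class):
def setUp(self):
    import os
    import tempfile
    # bytes elements, since the assertions read the files back in 'rb' mode
    self.lines = [b'Line %d' % d for d in range(5)]
    self.tempdir = tempfile.mkdtemp()
    self.path = os.path.join(self.tempdir, 'result')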
def test_write_dataflow_auto_compression(self):
    with TestPipeline() as pipeline:
        pcoll = pipeline | beam.core.Create(self.lines)
        pcoll | 'Write' >> WriteToText(  # pylint: disable=expression-not-assigned
            self.path, file_name_suffix='.gz')

    read_result = []
    for file_name in glob.glob(self.path + '*'):
        with gzip.GzipFile(file_name, 'rb') as f:
            read_result.extend(f.read().splitlines())

    self.assertEqual(sorted(read_result), sorted(self.lines))
def test_write_dataflow(self):
    pipeline = TestPipeline()
    pcoll = pipeline | beam.core.Create(self.lines)
    pcoll | 'Write' >> WriteToText(self.path)  # pylint: disable=expression-not-assigned
    pipeline.run()

    read_result = []
    for file_name in glob.glob(self.path + '*'):
        with open(file_name, 'rb') as f:
            read_result.extend(f.read().splitlines())

    self.assertEqual(read_result, self.lines)
def test_write_pipeline_non_globalwindow_input(self):
    with TestPipeline() as p:
        _ = (p
             | beam.core.Create(self.lines)
             | beam.WindowInto(beam.transforms.window.FixedWindows(1))
             | 'Write' >> WriteToText(self.path))

    read_result = []
    for file_name in glob.glob(self.path + '*'):
        with open(file_name, 'rb') as f:
            read_result.extend(f.read().splitlines())

    self.assertEqual(sorted(read_result), sorted(self.lines))
def test_write_pipeline_auto_compression_unsharded(self):
    with TestPipeline() as pipeline:
        pcoll = pipeline | 'Create' >> beam.core.Create(self.lines)
        pcoll | 'Write' >> WriteToText(  # pylint: disable=expression-not-assigned
            self.path + '.gz', shard_name_template='')

    read_result = []
    for file_name in glob.glob(self.path + '*'):
        with gzip.GzipFile(file_name, 'rb') as f:
            read_result.extend(f.read().splitlines())

    self.assertEqual(sorted(read_result), sorted(self.lines))
def test_write_dataflow_auto_compression_unsharded(self):
    pipeline = TestPipeline()
    pcoll = pipeline | beam.core.Create(self.lines)
    pcoll | 'Write' >> WriteToText(  # pylint: disable=expression-not-assigned
        self.path + '.gz', shard_name_template='')
    pipeline.run()

    read_result = []
    for file_name in glob.glob(self.path + '*'):
        with gzip.GzipFile(file_name, 'r') as f:
            read_result.extend(f.read().splitlines())

    self.assertEqual(read_result, self.lines)
def expand(self, pcoll):  # pylint: disable=arguments-differ
    return (
        pcoll
        | "ToList" >> beam.Map(DictToList(self.columns))
        | "Format" >> TransformAndLog(
            beam.Map(lambda x: format_csv_rows([x], delimiter=self.delimiter)),
            log_prefix='formatted csv: ',
            log_level='debug')
        | "Utf8Encode" >> beam.Map(lambda x: x.encode('utf-8'))
        | "Write" >> WriteToText(
            self.path,
            file_name_suffix=self.file_name_suffix,
            header=format_csv_rows(
                [self.columns], delimiter=self.delimiter).encode('utf-8')))
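# DictToList and format_csv_rows are assumed helpers; minimal sketches
# (names and behavior inferred from how they are called above):
def DictToList(columns):
    # returns a callable that orders a row dict by the given column list
    def _convert(row):
        return [row.get(column) for column in columns]
    return _convert


def format_csv_rows(rows, delimiter=','):
    import csv
    import io
    buf = io.StringIO()
    csv.writer(buf, delimiter=delimiter).writerows(rows)
    return buf.getvalue().rstrip('\r\n')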
def test_write_pipeline_footer(self):
    with TestPipeline() as pipeline:
        footer_text = 'footer'
        pcoll = pipeline | beam.core.Create(self.lines)
        pcoll | 'Write' >> WriteToText(  # pylint: disable=expression-not-assigned
            self.path, footer=footer_text)

    read_result = []
    for file_name in glob.glob(self.path + '*'):
        with open(file_name, 'rb') as f:
            read_result.extend(f.read().splitlines())

    self.assertEqual(sorted(read_result[:-1]), sorted(self.lines))
    self.assertEqual(read_result[-1], footer_text.encode('utf-8'))
def test_write_dataflow_header(self):
    pipeline = TestPipeline()
    pcoll = pipeline | 'Create' >> beam.core.Create(self.lines)
    header_text = b'foo'
    pcoll | 'Write' >> WriteToText(  # pylint: disable=expression-not-assigned
        self.path + '.gz', shard_name_template='', header=header_text)
    pipeline.run()

    read_result = []
    for file_name in glob.glob(self.path + '*'):
        with gzip.GzipFile(file_name, 'rb') as f:
            read_result.extend(f.read().splitlines())

    self.assertEqual(read_result, [header_text] + self.lines)
def test_write_dataflow_header(self):
    with TestPipeline() as pipeline:
        pcoll = pipeline | 'Create' >> beam.core.Create(self.lines)
        header_text = 'foo'
        pcoll | 'Write' >> WriteToText(  # pylint: disable=expression-not-assigned
            self.path + '.gz', shard_name_template='', header=header_text)

    read_result = []
    for file_name in glob.glob(self.path + '*'):
        with gzip.GzipFile(file_name, 'rb') as f:
            read_result.extend(f.read().splitlines())

    # header_text is automatically encoded in WriteToText
    self.assertEqual(read_result[0], header_text.encode('utf-8'))
    self.assertEqual(sorted(read_result[1:]), sorted(self.lines))
def BuildPipeline(pathToFiles, compute_table_name):
    raw_output = "output-raw"
    final_output = "output-final"
    options = PipelineOptions()
    # pathToFiles = "C:\\Users\\Abhijeet\\Documents\\GitHub\\dsba6155project\\dsba6155project\\data\\**"
    pipeline = beam.Pipeline(options=options)
    vectors = (
        pipeline
        | "Read Files" >> ReadFromTextWithFilename(pathToFiles)
        | "Group by File" >> beam.GroupByKey()
        | "Hashing Vectors" >> beam.ParDo(Hashing())
        # | "Write CSV to bigquery" >> beam.io.WriteToBigQuery(
        #     table=compute_table_name,
        #     schema=GetSchema()
        # )
        | "Write to file" >> WriteToText(raw_output)
        # | "Save" >> beam.ParDo(Save())
    )
    return pipeline
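# Hashing is not defined in this snippet; a placeholder sketch that emits an
# MD5 digest per file (the real vectorizing/hashing logic is an assumption):
import hashlib

class Hashing(beam.DoFn):
    def process(self, element):
        # element is (filename, lines) from ReadFromTextWithFilename + GroupByKey
        filename, lines = element
        digest = hashlib.md5(' '.join(lines).encode('utf-8')).hexdigest()
        yield '{},{}'.format(filename, digest)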
def run():
    options = PipelineOptions()
    options.view_as(StandardOptions).runner = 'DirectRunner'
    p = beam.Pipeline(options=options)

    sentiments = (
        p
        | "Read From Text" >> ReadFromText(
            "doc_sentiment.txt",
            coder=beam.coders.coders.StrUtf8Coder())  # line by line
        | "Convert to Doc, SentimentScore Tuple" >> beam.Map(
            lambda x: (x.split(" ")[0], x.split(" ")[1])))

    nes = (
        p
        | "Read Named Entites" >> ReadFromText(
            "doc_nes.txt",
            coder=beam.coders.coders.StrUtf8Coder())  # line by line
        | "Convert to Doc, Entities Tuple" >> beam.Map(lambda x: eval(x)))

    # Written without Python 2 tuple-parameter unpacking, which is a syntax
    # error on Python 3; the loop variable is also renamed so it no longer
    # shadows the `nes` PCollection above.
    def process_nes_sentiment(element):
        doc, nes_sentiment = element
        neslist = nes_sentiment["nes"]
        st = nes_sentiment["sentiment"][0]
        for entities in neslist:
            for ne in entities:
                yield (ne[0], ne[1], st)

    g = ({
        "nes": nes,
        "sentiment": sentiments
    }
        | beam.CoGroupByKey()
        | beam.FlatMap(process_nes_sentiment))

    (g | "Write Results" >> WriteToText("ne_sentiment.txt"))

    p.run()
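# Input formats inferred from the parsing above (values are illustrative):
#   doc_sentiment.txt : one "<doc_id> <score>" pair per line
#   doc_nes.txt       : one "('<doc_id>', {'nes': [...], 'sentiment': [...]})"
#                       Python literal per line, parsed with eval(), so this
#                       pipeline must only ever read trusted input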
def main():
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('dataflow_poc')

    input_file = INPUT_FILEPATH
    summary_file = SUMMARY_FILEPATH
    output_file = OUTPUT_FILEPATH

    options = PipelineOptions()
    gcloud_options = options.view_as(GoogleCloudOptions)
    worker_options = options.view_as(WorkerOptions)
    gcloud_options.project = PROJECT_ID
    gcloud_options.temp_location = OUTPUT_TEMP_FILEPATH
    worker_options.num_workers = START_WORKERS
    worker_options.max_num_workers = MAX_WORKERS
    gcloud_options.job_name = 'csv-transform'
    options.view_as(StandardOptions).runner = RUNNER

    logger.info('Ready to load the file')

    with apache_beam.Pipeline(options=options) as pipe:
        datarows = (pipe
                    | ReadFromText(input_file)
                    | apache_beam.ParDo(FilterHeader(header))
                    | apache_beam.ParDo(Parse()))
        (datarows
         | apache_beam.ParDo(Summary())
         | "WriteSummary" >> WriteToText(summary_file))
        (datarows
         | apache_beam.ParDo(FilterTrainingDays())
         | apache_beam.CombineGlobally(sum)
         | "WriteCount" >> WriteToText(output_file))
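# FilterHeader, Parse, Summary and FilterTrainingDays are user DoFns defined
# elsewhere; a minimal sketch of FilterHeader, assuming `header` is the exact
# header line to drop (the other DoFns are analogous):
class FilterHeader(apache_beam.DoFn):
    def __init__(self, header):
        self.header = header

    def process(self, element):
        # pass through every row except the header itself
        if element != self.header:
            yield element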
def main(source_path, destination_path, args):
    """Defines the whole pipeline."""
    p = beam.Pipeline(argv=args)

    values = (
        p
        | "ReadCSV" >> ReadFromText(source_path, skip_header_lines=1)
        | beam.ParDo(Split())
    )

    mean_item_id = (
        values
        | beam.ParDo(CollectOpen())
        | "Grouping Keys Open" >> beam.GroupByKey()
        | "Calculating Mean for item price" >> beam.CombineValues(
            beam.combiners.MeanCombineFn()
        )
    )

    output = (
        mean_item_id
        | "WriteCSV" >> WriteToText(destination_path, file_name_suffix=".csv")
    )

    p.run()
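# Split and CollectOpen are not shown; plausible sketches, assuming a CSV
# whose first two columns are an item id and a price (the column layout is
# an assumption):
class Split(beam.DoFn):
    def process(self, element):
        item_id, price = element.split(',')[:2]
        yield {'item_id': item_id, 'price': float(price)}


class CollectOpen(beam.DoFn):
    def process(self, element):
        # key by item id so prices can be grouped and averaged per item
        yield (element['item_id'], element['price'])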
def run():
    import pickle
    import sys
    import math

    import numpy as np
    import apache_beam as beam

    # (The original ran `reload(sys); sys.setdefaultencoding('utf8')`, a
    # Python 2-only workaround that is unnecessary and unavailable on Python 3.)

    import argparse
    import simplejson
    from gensim.models import KeyedVectors
    from apache_beam.options.pipeline_options import (
        PipelineOptions, GoogleCloudOptions, StandardOptions, SetupOptions)
    from apache_beam.io.textio import ReadFromText, WriteToText
    import nltk.data
    from nltk.tokenize import WordPunctTokenizer
    import re
    import uuid
    import perceptron

    # Sentences From Text
    _sentence_tokenizer = nltk.data.load("./tokenizer/punkt_turkish.pickle")
    word_tokenizer = WordPunctTokenizer()

    abbreviations = set()
    with open("./tokenizer/abbreviations-long.txt") as f:
        for l in f:
            abbreviations.add(l.split(':')[0])
    _sentence_tokenizer._params.abbrev_types = abbreviations

    model_file = "perceptron_word2vec_stemmed_normalized.pickle"
    with open(model_file, 'rb') as model:
        w, b = pickle.load(model)

    def sentences_from_text(text):
        return _sentence_tokenizer.tokenize(text.strip())

    def tokens_from_sentence(sentence):
        return nltk.word_tokenize(sentence)

    def ngrams(obj, n):
        tokens = []
        sentences = (sentences_from_text(obj["title"]) +
                     sentences_from_text(obj["description"]) +
                     sentences_from_text(obj["content"]))
        for sentence in sentences:
            tokens += tokens_from_sentence(sentence)
        pairs = nltk.ngrams(tokens, n)
        return [" ".join(pair) for pair in pairs]

    def convertToJsonObj(jsonText):
        return simplejson.loads(jsonText)

    def convertToObject(jsonObj):
        x = jsonObj
        obj = {
            "title": x.get("properties", {}).get("title", {}).get("stringValue", ""),
            "link": x.get("properties", {}).get("link", {}).get("stringValue", ""),
            "published": x.get("properties", {}).get("published", {}).get("stringValue", ""),
            "description": x.get("properties", {}).get("description", {}).get("stringValue", ""),
            "content": x.get("properties", {}).get("content", {}).get("stringValue", ""),
        }
        obj["key"] = obj["link"] if obj["link"] else str(uuid.uuid4())
        return obj

    # https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
    def cleanhtml(raw_html):
        cleanr = re.compile('<.*?>')
        cleantext = re.sub(cleanr, '', raw_html)
        return cleantext

    def removeHTMLFromStrings(obj):
        for key in obj.keys():
            obj[key] = cleanhtml(obj[key])
        return obj

    def tokenize_to_sentences(obj):
        obj["sentences"] = (sentences_from_text(obj["title"]) +
                            sentences_from_text(obj["description"]) +
                            sentences_from_text(obj["content"]))
        return obj

    def tokenize_to_words(obj):
        obj["tokens"] = []
        for sentence in obj["sentences"]:
            obj["tokens"] += tokens_from_sentence(sentence)
        for token in obj["tokens"]:
            yield (obj["key"], token)

    options = PipelineOptions()
    options.view_as(StandardOptions).runner = 'DirectRunner'
    p = beam.Pipeline(options=options)

    pairs = (
        p
        | "Read From Text" >> ReadFromText(
            "news.json",
            coder=beam.coders.coders.StrUtf8Coder())  # line by line
        | "Convert to Json Object" >> beam.Map(convertToJsonObj)
        | "Convert to Python Object" >> beam.Map(convertToObject)
        | "Remove HTML Tags From Strings (Normalization 1)" >> beam.Map(
            removeHTMLFromStrings))

    tokens_1gram = (
        pairs
        | 'Sentence Tokenization' >> beam.Map(tokenize_to_sentences)
        | 'Word Tokenization' >> beam.FlatMap(
            tokenize_to_words)  # also convert to key value pairs
    )
    """
    tokens_2gram = (pairs
                    | "Create 2-grams" >> beam.FlatMap(
                        lambda obj: [(obj["key"], token)
                                     for token in ngrams(obj, 2)]))
    """
    tokens = tokens_1gram
    """
    vocabulary = (tokens
                  | "Get words only" >> beam.Values()
                  | "Remove duplicate words" >> beam.RemoveDuplicates())
    vocabulary_size = (vocabulary
                       | "Count Vocabulary elements" >>
                       beam.combiners.Count.Globally())
    doc_total_words = (tokens
                       | "Count Words of Doc" >> beam.combiners.Count.PerKey())
    """
    # ((doc, token), 1) pairs; written without Python 2 tuple-parameter
    # unpacking, which is a syntax error on Python 3.
    tokens_paired_with_1 = (
        tokens
        | "Pair with 1" >> beam.Map(lambda kv: (kv, 1)))
    """
    token_counts_per_doc = (tokens_paired_with_1
                            | "Group by Doc,Word" >> beam.GroupByKey()
                            | "Count ones" >> beam.Map(
                                lambda ((doc, token), counts):
                                (doc, (token, sum(counts))))
                            | "Group by Doc" >> beam.GroupByKey())
    num_docs = (token_counts_per_doc
                | "Get Docs" >> beam.Keys()
                | "Count Docs" >> beam.combiners.Count.Globally())
    word_tf_pre = ({
        'total_tokens': doc_total_words,
        'token_counts_per_doc': token_counts_per_doc
    } | "CoGroup By Document" >> beam.CoGroupByKey())

    def calc_tf((doc, count)):
        [token_count] = count['token_counts_per_doc']
        [tokens_total] = count['total_tokens']
        for token, cnt in token_count:
            yield token, (doc, float(cnt) / tokens_total)

    doc_word_tf = (word_tf_pre
                   | "Compute Term Frequencies" >> beam.FlatMap(calc_tf))
    word_occurrences = (tokens
                        | "Remove Multiple occurrences per doc" >> beam.RemoveDuplicates()
                        | "Pair with 1s" >> beam.Map(lambda (doc, word): (word, 1))
                        | "Group by Word" >> beam.GroupByKey()
                        | "Sum 1s" >> beam.Map(lambda (word, counts): (word, sum(counts))))
    token_df = (word_occurrences
                | "Compute Document Frequency" >> beam.Map(
                    lambda (token, count), total: (token, float(count) / total),
                    AsSingleton(num_docs)))
    token_tf_df = ({
        'term_frequency': doc_word_tf,
        'document_frequency': token_df
    } | "CoGroup By Token" >> beam.CoGroupByKey())

    def calc_tfidf((token, tfdf)):
        [df] = tfdf['document_frequency']
        for doc, tf in tfdf['term_frequency']:
            yield (doc, token), tf * math.log(1.0 / df)

    token_tf_idf = (token_tf_df
                    | "Calculate TF-IDF Scores" >> beam.FlatMap(calc_tfidf))
    """
    word2vec = KeyedVectors.load_word2vec_format('tr_word2vec', binary=True)

    def get_vec(word2vec, token):
        try:
            x = word2vec.get_vector(token)
            x = x.reshape(400)
        except:
            x = np.zeros(400)
        return x

    def analyze_sentiment(x):
        res = perceptron.f(x, w, b)
        return res

    doc_sentiment = (
        tokens_paired_with_1
        | "Create Word2Vec Vector" >> beam.Map(
            lambda kv: (kv[0][0], get_vec(word2vec, kv[0][1])))
        | "Group Word2Vec Vectors By Document" >> beam.CombinePerKey(sum)
        | "Sum Word2Vec Vectors" >> beam.Map(
            lambda kv: (kv[0], analyze_sentiment(kv[1])[0])))

    result = (doc_sentiment
              | "Format Results" >> beam.Map(
                  lambda kv: '%s %s' % (kv[0], kv[1])))

    (result | "Write Results" >> WriteToText("sentiments"))

    p.run()
with apache_beam.Pipeline(options=options) as p:
    stamps = (
        p
        | "Reading CSV" >> ReadFromText(input_filename, skip_header_lines=1)
        | "Parsing CSV" >> apache_beam.ParDo(Split())
        | "Getting stamps" >> apache_beam.ParDo(CollectPixelVals()))

    psc_collection = (stamps | "Making PSCs" >> apache_beam.GroupByKey())

    # calculate the sum of pixel values for each stamp
    stamp_sums = (
        psc_collection
        | "Getting stamp sum" >> apache_beam.CombineValues(SumCombineFn()))

    # normalized = (
    #     {
    #         'stamps': stamps,
    #         'sums': stamp_sums,
    #     }
    #     | "Grouping together" >> apache_beam.CoGroupByKey()
    #     | "NormalizingFlux" >> apache_beam.ParDo(GetNormal())
    # )
    #
    # results = (
    #     normalized
    #     | "Getting normal PSC" >> apache_beam.GroupByKey()
    # )

    output = (stamp_sums
              | "Formatting CSV" >> apache_beam.ParDo(WriteToCSV())
              | "Writing CSV" >> WriteToText(output_filename))
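# SumCombineFn is assumed to be a simple CombineFn that totals the pixel
# values of each stamp; a minimal sketch equivalent to the built-in sum:
class SumCombineFn(apache_beam.CombineFn):
    def create_accumulator(self):
        return 0

    def add_input(self, accumulator, element):
        return accumulator + element

    def merge_accumulators(self, accumulators):
        return sum(accumulators)

    def extract_output(self, accumulator):
        return accumulator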
        skip_header_lines=1)
    | "From text to list (rain)" >> beam.Map(text_to_list, delimiter=',')
    | "Create key UF-YEAR-MONTH" >> beam.Map(key_uf_year_month)
    | "Sum of total rain by key" >> beam.CombinePerKey(sum)  # heavy operation
    | "Round rain results" >> beam.Map(arredonda)
    # | "Show results" >> beam.Map(print)
)

result = (
    # (chuvas, dengue)
    # | "Pile PCollections" >> beam.Flatten()
    # | "GroupByKey" >> beam.GroupByKey()
    ({
        'chuvas': chuvas,
        'dengue': dengue
    })
    | "Merge PCollections" >> beam.CoGroupByKey()
    | "Filter empty data" >> beam.Filter(filter_empty_fields)
    | "Unzip elements" >> beam.Map(unzip_elements)
    | "Prepare csv" >> beam.Map(prepare_csv)
    # | "Show union results" >> beam.Map(print)
)

header = 'state;year;month;rain;dengue'
result | 'Create CSV file' >> WriteToText(
    './data/result',
    file_name_suffix='.csv',
    num_shards=2,  # number of files used for output
    header=header)

pipeline.run()
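# The helpers referenced above are defined elsewhere; two plausible sketches
# (signatures inferred from how they are called; the others follow the same
# pattern):
def text_to_list(element, delimiter='|'):
    # split one delimited line into a list of fields
    return element.split(delimiter)


def arredonda(element):
    # round the accumulated rainfall to one decimal place, keeping the key
    key, mm = element
    return (key, round(mm, 1))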
def run():
    import pickle
    import sys
    import math
    import numpy as np

    from gensim.models import KeyedVectors
    import apache_beam as beam
    from apache_beam.options.pipeline_options import (
        PipelineOptions, GoogleCloudOptions, StandardOptions, SetupOptions)
    from apache_beam.io.gcp.datastore.v1.datastoreio import ReadFromDatastore
    from google.cloud.proto.datastore.v1 import query_pb2
    from apache_beam.io.textio import WriteToText
    import nltk.data
    import re
    import uuid
    import perceptron

    # (The original ran `reload(sys); sys.setdefaultencoding('utf8')`, a
    # Python 2-only workaround that is unnecessary and unavailable on Python 3.)

    _sentence_tokenizer = nltk.data.load("./tokenizer/punkt_turkish.pickle")

    abbreviations = set()
    with open("./tokenizer/abbreviations-long.txt") as f:
        for l in f:
            abbreviations.add(l.split(':')[0])
    _sentence_tokenizer._params.abbrev_types = abbreviations

    model_file = "perceptron_word2vec_stemmed_normalized.pickle"
    with open(model_file, 'rb') as model:
        w, b = pickle.load(model)

    def sentences_from_text(text):
        return _sentence_tokenizer.tokenize(text.strip())

    def tokens_from_sentence(sentence):
        return sentence.split(" ")  # nltk.word_tokenize(sentence)

    def ngrams(obj, n):
        tokens = []
        sentences = (sentences_from_text(obj["title"]) +
                     sentences_from_text(obj["description"]) +
                     sentences_from_text(obj["content"]))
        for sentence in sentences:
            tokens += tokens_from_sentence(sentence)
        pairs = nltk.ngrams(tokens, n)
        return [" ".join(pair) for pair in pairs]

    def convertToObject(jsonObj):
        x = jsonObj
        link = x.properties.get('link', None)
        link = link.string_value if link else ""
        title = x.properties.get('title', None)
        title = title.string_value if title else ""
        description = x.properties.get("description", None)
        description = description.string_value if description else ""
        content = x.properties.get("text", "")
        content = content.string_value if content else ""
        published = x.properties.get("published")
        published = published.string_value if published else ""

        obj = {
            "link": link,
            "title": title,
            "description": description,
            "content": content,
            "published": published
        }
        obj["key"] = obj["link"] if obj["link"] else str(uuid.uuid4())
        return obj

    # https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
    def cleanhtml(raw_html):
        cleanr = re.compile('<.*?>')
        cleantext = re.sub(cleanr, '', raw_html)
        return cleantext

    def removeHTMLFromStrings(obj):
        for key in obj.keys():
            obj[key] = cleanhtml(obj[key])
        return obj

    def tokenize_to_sentences(obj):
        obj["sentences"] = (sentences_from_text(obj["title"]) +
                            sentences_from_text(obj["description"]) +
                            sentences_from_text(obj["content"]))
        return obj

    def tokenize_to_words(obj):
        obj["tokens"] = []
        for sentence in obj["sentences"]:
            obj["tokens"] += tokens_from_sentence(sentence)
        for token in obj["tokens"]:
            yield (obj["key"], token)

    options = PipelineOptions()
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'news-197916'
    google_cloud_options.job_name = 'sentiment-analysis'
    google_cloud_options.staging_location = 'gs://news-197916.appspot.com/word_count/'
    google_cloud_options.temp_location = 'gs://news-197916.appspot.com/df_tmp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    setup_options = options.view_as(SetupOptions)
    setup_options.requirements_file = "requirements.txt"
    setup_options.save_main_session = True

    p = beam.Pipeline(options=options)

    query = query_pb2.Query()
    query.kind.add().name = "News_Entry"

    pairs = (
        p
        | 'Read From Datastore' >> ReadFromDatastore(
            project=google_cloud_options.project, query=query)
        # | "Read From Text" >> ReadFromText(
        #     "news.json", coder=beam.coders.coders.StrUtf8Coder())  # line by line
        # | "Convert to Json Object" >> beam.Map(convertToJsonObj)
        | "Convert to Python Object" >> beam.Map(convertToObject)
        | "Remove HTML Tags From Strings (Normalization 1)" >> beam.Map(
            removeHTMLFromStrings)
    )

    tokens_1gram = (
        pairs
        | 'Sentence Tokenization' >> beam.Map(tokenize_to_sentences)
        | 'Word Tokenization' >> beam.FlatMap(
            tokenize_to_words)  # also convert to key value pairs
    )
    """
    tokens_2gram = (pairs
                    | "Create 2-grams" >> beam.FlatMap(
                        lambda obj: [(obj["key"], token)
                                     for token in ngrams(obj, 2)]))
    """
    tokens = tokens_1gram
    """
    vocabulary = (tokens
                  | "Get words only" >> beam.Values()
                  | "Remove duplicate words" >> beam.RemoveDuplicates())
    vocabulary_size = (vocabulary
                       | "Count Vocabulary elements" >>
                       beam.combiners.Count.Globally())
    doc_total_words = (tokens
                       | "Count Words of Doc" >> beam.combiners.Count.PerKey())
    """
    # ((doc, token), 1) pairs; written without Python 2 tuple-parameter
    # unpacking, which is a syntax error on Python 3.
    tokens_paired_with_1 = (
        tokens
        | "Pair with 1" >> beam.Map(lambda kv: (kv, 1)))
    """
    token_counts_per_doc = (tokens_paired_with_1
                            | "Group by Doc,Word" >> beam.GroupByKey()
                            | "Count ones" >> beam.Map(
                                lambda ((doc, token), counts):
                                (doc, (token, sum(counts))))
                            | "Group by Doc" >> beam.GroupByKey())
    num_docs = (token_counts_per_doc
                | "Get Docs" >> beam.Keys()
                | "Count Docs" >> beam.combiners.Count.Globally())
    word_tf_pre = ({
        'total_tokens': doc_total_words,
        'token_counts_per_doc': token_counts_per_doc
    } | "CoGroup By Document" >> beam.CoGroupByKey())

    def calc_tf((doc, count)):
        [token_count] = count['token_counts_per_doc']
        [tokens_total] = count['total_tokens']
        for token, cnt in token_count:
            yield token, (doc, float(cnt) / tokens_total)

    doc_word_tf = (word_tf_pre
                   | "Compute Term Frequencies" >> beam.FlatMap(calc_tf))
    word_occurrences = (tokens
                        | "Remove Multiple occurrences per doc" >> beam.RemoveDuplicates()
                        | "Pair with 1s" >> beam.Map(lambda (doc, word): (word, 1))
                        | "Group by Word" >> beam.GroupByKey()
                        | "Sum 1s" >> beam.Map(lambda (word, counts): (word, sum(counts))))
    token_df = (word_occurrences
                | "Compute Document Frequency" >> beam.Map(
                    lambda (token, count), total: (token, float(count) / total),
                    AsSingleton(num_docs)))
    token_tf_df = ({
        'term_frequency': doc_word_tf,
        'document_frequency': token_df
    } | "CoGroup By Token" >> beam.CoGroupByKey())

    def calc_tfidf((token, tfdf)):
        [df] = tfdf['document_frequency']
        for doc, tf in tfdf['term_frequency']:
            yield (doc, token), tf * math.log(1.0 / df)

    token_tf_idf = (token_tf_df
                    | "Calculate TF-IDF Scores" >> beam.FlatMap(calc_tfidf))
    """
    word2vec = KeyedVectors.load_word2vec_format('tr_word2vec', binary=True)

    def get_vec(word2vec, token):
        if word2vec is None:
            word2vec = KeyedVectors.load_word2vec_format('tr_word2vec',
                                                         binary=True)
        try:
            x = word2vec.get_vector(token)
            x = x.reshape(400)
        except:
            x = np.zeros(400)
        return x

    def analyze_sentiment(x):
        res = perceptron.f(x, w, b)
        return res

    doc_sentiment = (
        tokens_paired_with_1
        | "Create Word2Vec Vector" >> beam.Map(
            lambda kv: (kv[0][0], get_vec(word2vec, kv[0][1])))
        | "Group Word2Vec Vectors By Document" >> beam.GroupByKey()
        | "Sum Word2Vec Vectors" >> beam.Map(
            lambda kv: (kv[0], analyze_sentiment(np.sum(kv[1], axis=0))[0])))

    result = (doc_sentiment
              | "Format Results" >> beam.Map(
                  lambda kv: '%s %s' % (kv[0], kv[1])))

    (result | "Write Results" >> WriteToText("sentiments"))

    p.run()
)

chuvas = (
    pipeline
    | "Read the rainfall dataset" >> ReadFromParquet('chuvas.parquet')
    | 'Rainfall - create the uf_ano_mes key' >> beam.Map(chuva_chave_uf_ano_mes_lista)
    | 'Rainfall - sum the mm per key' >> beam.CombinePerKey(sum)
    | 'Rainfall - round the results' >> beam.Map(arredonda)
    # | "Rainfall test - show the results" >> beam.Map(print)
)

final = (
    ({
        'chuvas': chuvas,
        'dengue': dengue
    })
    | "Final - merge the pcols" >> beam.CoGroupByKey()
    | "Final - filter out empty values" >> beam.Filter(filtra_campos_nao_vazios)
    | "Final - unpack the output" >> beam.Map(descompacta_elementos)
    | "Final - prepare the csv" >> beam.Map(preparar_csv)
    # | "Final - show the results" >> beam.Map(print)
)

header = 'UF;ANO;MES;CHUVA;DENGUE'
final | "Create csv file" >> WriteToText(
    'final', file_name_suffix='.csv', header=header)

pipeline.run()
def run(p, args, aggregator_dict):
    """Run the pipeline with the args and dataflow pipeline option."""
    # Create a PCollection for model directory.
    model_dir = p | "Create Model Directory" >> beam.Create([args.model_dir])

    input_file_format = args.input_file_format.lower()
    input_file_patterns = args.input_file_patterns

    # Setup reader.
    if input_file_format == "json":
        reader = p | "READ_TEXT_FILES" >> ReadFromMultiFilesText(
            input_file_patterns)
    elif input_file_format == "tfrecord":
        reader = p | "READ_TF_FILES" >> ReadFromMultiFilesTFRecord(
            input_file_patterns)
    elif input_file_format == "tfrecord_gzip":
        reader = p | "READ_TFGZIP_FILES" >> ReadFromMultiFilesTFRecordGZip(
            input_file_patterns)

    # Setup the whole pipeline.
    results, errors = (reader
                       | "BATCH_PREDICTION" >> batch_prediction.BatchPredict(
                           beam.pvalue.AsSingleton(model_dir),
                           tags=args.tags,
                           signature_name=args.signature_name,
                           batch_size=args.batch_size,
                           aggregator_dict=aggregator_dict,
                           user_project_id=args.user_project_id,
                           user_job_id=args.user_job_id,
                           framework=args.framework))

    output_file_format = args.output_file_format.lower()
    # Convert predictions to the target format and then write to output files.
    if output_file_format == "json":
        _ = (results
             | "TO_JSON" >> beam.Map(json.dumps)
             | "WRITE_PREDICTION_RESULTS" >> WriteToText(
                 args.output_result_prefix))
    elif output_file_format == "csv":
        fields = (
            results
            | "SAMPLE_SINGLE_ELEMENT" >> Sample.FixedSizeGlobally(1)
            | "GET_KEYS" >> beam.Map(
                # entry could be None if no inputs were valid
                lambda entry: entry[0].keys() if entry else []))
        _ = (fields
             | "KEYS_TO_CSV" >> beam.Map(keys_to_csv)
             | "WRITE_KEYS" >> WriteToText(
                 args.output_result_prefix,
                 file_name_suffix="_header.csv",
                 shard_name_template=""))
        _ = (results
             | "VALUES_TO_CSV" >> beam.Map(values_to_csv,
                                           beam.pvalue.AsSingleton(fields))
             | "WRITE_PREDICTION_RESULTS" >> WriteToText(
                 args.output_result_prefix,
                 file_name_suffix=".csv",
                 append_trailing_newlines=False))

    # Write prediction errors counts to output files.
    _ = (errors
         | "GROUP_BY_ERROR_TYPE" >> beam.combiners.Count.PerKey()
         | "WRITE_ERRORS" >> WriteToText(args.output_error_prefix))

    return p.run()
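# keys_to_csv and values_to_csv are assumed CSV helpers defined elsewhere in
# this module; minimal sketches consistent with how they are called above:
def keys_to_csv(keys):
    import csv
    import io
    buf = io.StringIO()
    csv.writer(buf).writerow(keys)
    return buf.getvalue().rstrip('\r\n')


def values_to_csv(entry, keys):
    # order each prediction dict by the sampled key list; the trailing
    # newline is kept because append_trailing_newlines=False above
    import csv
    import io
    buf = io.StringIO()
    csv.writer(buf).writerow([entry.get(k) for k in keys])
    return buf.getvalue()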
        skip_header_lines=1)
    | "From text to list (rainfall)" >> beam.Map(texto_para_lista, delimitador=',')
    | "Create the UF-ANO-MES key" >> beam.Map(chave_uf_ano_mes_de_lista)
    | "Sum of total rainfall per key" >> beam.CombinePerKey(sum)
    | "Round the rainfall results" >> beam.Map(arredonda)
    # | "Show results" >> beam.Map(print)
)

resultado = (
    # (chuvas, dengue)
    # | "Stack the pcols" >> beam.Flatten()
    # | "Group the pcols" >> beam.GroupByKey()
    ({
        'chuvas': chuvas,
        'dengue': dengue
    })
    | 'Merge pcols' >> beam.CoGroupByKey()
    | 'Filter out empty data' >> beam.Filter(filtra_campos_vazios)
    | 'Unpack elements' >> beam.Map(descompactar_elementos)
    | 'Prepare csv' >> beam.Map(preparar_csv)
    # | "Show the merged results" >> beam.Map(print)
)

# uf, ano, mes, str(chuva), str(dengue)
header = 'UF;ANO;MES;CHUVA;DENGUE'
resultado | 'Create CSV file' >> WriteToText(
    'resultado', file_name_suffix='.csv', header=header)

pipeline.run()
class WriteToCSV(beam.DoFn):
    def process(self, element):
        """Prepares each row to be written in the csv."""
        result = [
            "{},{},{}".format(
                element[0], element[1]['users'][0], element[1]['timings'][0])
        ]
        return result


with beam.Pipeline(options=options) as p:
    rows = (p
            | ReadFromText(input_filename)
            | beam.ParDo(Split()))

    timings = (rows
               | beam.ParDo(CollectTimings())
               | "Grouping timings" >> beam.GroupByKey()
               | "Calculating average" >> beam.CombineValues(
                   beam.combiners.MeanCombineFn()))

    users = (rows
             | beam.ParDo(CollectUsers())
             | "Grouping users" >> beam.GroupByKey()
             | "Counting users" >> beam.CombineValues(
                 beam.combiners.CountCombineFn()))

    to_be_joined = ({
        'timings': timings,
        'users': users
    }
        | beam.CoGroupByKey()
        | beam.ParDo(WriteToCSV())
        | WriteToText(output_filename))
def run():
    # Sentences From Text
    _sentence_tokenizer = nltk.data.load("./tokenizer/punkt_turkish.pickle")
    word_tokenizer = WordPunctTokenizer()

    abbreviations = set()
    with open("./tokenizer/abbreviations-long.txt") as f:
        for l in f:
            abbreviations.add(l.split(':')[0])
    _sentence_tokenizer._params.abbrev_types = abbreviations

    def sentences_from_text(text):
        return _sentence_tokenizer.tokenize(text.strip())

    def tokens_from_sentence(sentence):
        return sentence.split()  # nltk.word_tokenize(sentence)

    def ngrams(obj, n):
        tokens = []
        sentences = (sentences_from_text(obj["title"]) +
                     sentences_from_text(obj["description"]) +
                     sentences_from_text(obj["content"]))
        for sentence in sentences:
            tokens += tokens_from_sentence(sentence)
        pairs = nltk.ngrams(tokens, n)
        return [" ".join(pair) for pair in pairs]

    def convertToJsonObj(jsonText):
        return simplejson.loads(jsonText)

    def convertToObject(jsonObj):
        x = jsonObj
        obj = {
            "title": x.get("properties", {}).get("title", {}).get("stringValue", ""),
            "link": x.get("properties", {}).get("link", {}).get("stringValue", ""),
            "published": x.get("properties", {}).get("published", {}).get("stringValue", ""),
            "description": x.get("properties", {}).get("description", {}).get("stringValue", ""),
            "content": x.get("properties", {}).get("content", {}).get("stringValue", ""),
        }
        obj["key"] = obj["link"] if obj["link"] else str(uuid.uuid4())
        return obj

    # https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
    def cleanhtml(raw_html):
        cleanr = re.compile('<.*?>')
        cleantext = re.sub(cleanr, '', raw_html)
        return cleantext

    def removeHTMLFromStrings(obj):
        for key in obj.keys():
            obj[key] = cleanhtml(obj[key])
        return obj

    def tokenize_to_sentences(obj):
        obj["sentences"] = (sentences_from_text(obj["title"]) +
                            sentences_from_text(obj["description"]) +
                            sentences_from_text(obj["content"]))
        return obj

    def tokenize_to_words(obj):
        obj["tokens"] = []
        for sentence in obj["sentences"]:
            obj["tokens"] += tokens_from_sentence(sentence)
        for token in obj["tokens"]:
            yield (obj["key"], token)

    def get_named_entities(mdl, tokens):
        stemmer = TurkishStemmer()
        res = mdl.analyze(tokens)
        entities = []
        for entity in res["entities"]:
            for entity2 in entity["text"].split(", "):
                ne = stemmer.stem(entity2).split("'")[0]
                entities.append((entity["type"], ne, entity["score"]))
        return entities

    options = PipelineOptions()
    options.view_as(StandardOptions).runner = 'DirectRunner'
    p = beam.Pipeline(options=options)

    pairs = (
        p
        | "Read From Text" >> ReadFromText(
            "news.json",
            coder=beam.coders.coders.StrUtf8Coder())  # line by line
        | "Convert to Json Object" >> beam.Map(convertToJsonObj)
        | "Convert to Python Object" >> beam.Map(convertToObject)
        | "Remove HTML Tags From Strings (Normalization 1)" >> beam.Map(
            removeHTMLFromStrings))

    tokens_1gram = (
        pairs
        | 'Sentence Tokenization' >> beam.Map(tokenize_to_sentences)
        | 'Word Tokenization' >> beam.FlatMap(
            tokenize_to_words)  # also convert to key value pairs
    )
    tokens = tokens_1gram

    # takes the model explicitly, matching the commented-out call below
    def process_tokens_last(mdl, doc, tokens):
        return (doc, get_named_entities(mdl, tokens))

    doc_named_entities = (
        tokens
        | beam.GroupByKey()
        # | beam.Map(lambda kv: process_tokens_last(mdl, kv[0], kv[1]))
    )

    (doc_named_entities | "Write Results" >> WriteToText("doc_tokens"))

    p.run()
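# Note: `mdl` and TurkishStemmer are external dependencies that this snippet
# never constructs; mdl is assumed to expose
#   mdl.analyze(tokens) -> {"entities": [{"text": ..., "type": ..., "score": ...}, ...]}
# which is why the Map that would call process_tokens_last is left commented out.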