def run(): gcs_path = "gs://ct-felicidad_y_cultura" #Definicion de la raiz del bucket gcs_project = "contento-bi" mi_runer = ("DirectRunner", "DataflowRunner")[socket.gethostname() == "contentobi"] pipeline = beam.Pipeline( runner=mi_runer, argv=[ "--project", gcs_project, "--staging_location", ("%s/dataflow_files/staging_location" % gcs_path), "--temp_location", ("%s/dataflow_files/temp" % gcs_path), "--output", ("%s/dataflow_files/output" % gcs_path), "--setup_file", "./setup.py", "--max_num_workers", "5", "--subnetwork", "https://www.googleapis.com/compute/v1/projects/contento-bi/regions/us-central1/subnetworks/contento-subnet1" ]) lines = pipeline | 'Lectura de Archivo' >> ReadFromText( gcs_path + "/Clima/personal" + ".csv") transformed = (lines | 'Formatear Data' >> beam.ParDo(formatearData())) # transformed | 'Escribir en Archivo' >> WriteToText(gcs_path + "/Seguimiento/Avon_inf_seg_2",file_name_suffix='.csv',shard_name_template='') transformed | 'Escritura a BigQuery Felicidad_y_Cultura' >> beam.io.WriteToBigQuery( gcs_project + ":Felicidad_y_Cultura.Personal", schema=TABLE_SCHEMA, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND) jobObject = pipeline.run() jobObject.wait_until_finish() # jobID = jobObject.job_id() return ("Corrio sin problema")
def run(table, TABLE_DB):
    gcs_path = 'gs://ct-bridge'  # Root of the bucket
    gcs_project = "contento-bi"
    FECHA_CARGUE = str(datetime.date.today())
    mi_runner = ("DirectRunner", "DataflowRunner")[socket.gethostname() == "contentobi"]
    pipeline = beam.Pipeline(
        runner=mi_runner,
        argv=[
            "--project", gcs_project,
            "--staging_location", ("%s/dataflow_files/staging_location" % gcs_path),
            "--temp_location", ("%s/dataflow_files/temp" % gcs_path),
            "--output", ("%s/dataflow_files/output" % gcs_path),
            "--setup_file", "./setup.py",
            "--max_num_workers", "10",
            "--subnetwork", "https://www.googleapis.com/compute/v1/projects/contento-bi/regions/us-central1/subnetworks/contento-subnet1"
        ])

    lines = pipeline | 'Lectura de Archivo' >> ReadFromText(
        gcs_path + "/" + FECHA_CARGUE + "_" + TABLE_DB + ".csv")

    transformed = (lines | 'Formatear Data' >> beam.ParDo(formatearData()))

    # transformed | 'Escribir en Archivo' >> WriteToText(gcs_path + "/" + "REWORK", file_name_suffix='.csv', shard_name_template='')

    transformed | 'Escritura a BigQuery Bridge' >> beam.io.WriteToBigQuery(
        gcs_project + ":Contactabilidad." + table,
        schema=TABLE_SCHEMA,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

    jobObject = pipeline.run()

    return ("Proceso de transformacion y cargue, completado")

################################################################################
def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://justlikethat-294122/sample.txt',
        help='Input file to process.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

    with beam.Pipeline(options=pipeline_options) as p:
        utility = Utility()
        quotes = (
            p
            | 'Read' >> ReadFromText(known_args.input)
            | 'Parse Log' >> beam.Map(lambda line: utility.parse(line)))

        table_spec = bigquery.TableReference(
            projectId='justlikethat-294122',
            datasetId='mydataset',
            tableId='quotes')
        table_schema = 'source:STRING,quote:STRING'

        quotes | beam.io.gcp.bigquery.WriteToBigQuery(
            table_spec,
            schema=table_schema,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
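Utility is not shown in the source; a minimal sketch of a parse method whose output matches the 'source:STRING,quote:STRING' schema above, assuming a pipe-delimited log line (the delimiter and layout are assumptions):

class Utility:
    def parse(self, line):
        # assumed layout: "source|quote"
        source, quote = line.split('|', 1)
        return {'source': source.strip(), 'quote': quote.strip()}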
def run(archivo, mifecha): gcs_path = "gs://ct-metlife" #Definicion de la raiz del bucket gcs_project = "contento-bi" mi_runer = ("DirectRunner", "DirectRunner")[socket.gethostname() == "contentobi"] pipeline = beam.Pipeline( runner=mi_runer, argv=[ "--project", gcs_project, "--staging_location", ("%s/dataflow_files/staging_location" % gcs_path), "--temp_location", ("%s/dataflow_files/temp" % gcs_path), "--output", ("%s/dataflow_files/output" % gcs_path), "--setup_file", "./setup.py", "--max_num_workers", "10", "--subnetwork", "https://www.googleapis.com/compute/v1/projects/contento-bi/regions/us-central1/subnetworks/contento-subnet1" # "--num_workers", "30", # "--autoscaling_algorithm", "NONE" ]) lines = pipeline | 'Lectura de Archivo' >> ReadFromText( archivo, skip_header_lines=1) transformed = (lines | 'Formatear Data' >> beam.ParDo(formatearData(mifecha))) transformed | 'Escritura a BigQuery metlife bases inspector colsubsidio monetaria novedades' >> beam.io.WriteToBigQuery( gcs_project + ":MetLife.bases_iniciales", schema=TABLE_SCHEMA, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND) jobObject = pipeline.run() return ("Successful beam process")
def run(p, input_file, output_file):
    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(input_file)

    counts = (lines
              | 'split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(bytes))
              | 'count' >> beam.ExternalTransform(
                  'pytest:beam:transforms:count', None, EXPANSION_SERVICE_ADDR))

    # Format the counts into a PCollection of strings.
    def format_result(word_count):
        (word, count) = word_count
        return '%s: %d' % (word, count)

    output = counts | 'format' >> beam.Map(format_result)

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write' >> WriteToText(output_file)

    result = p.run()
    result.wait_until_finish()
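WordExtractingDoFn comes from the surrounding module (it also appears in the metrics-reporting wordcount further down). A minimal sketch assuming simple regex tokenization; note the snippet above declares bytes output, a Python 2 leftover, while this sketch yields str:

import re
import apache_beam as beam

class WordExtractingDoFn(beam.DoFn):
    """Emits each word found in a line of text."""
    def process(self, element):
        return re.findall(r"[\w']+", element)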
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', required=True,
                        help='Input file to process.')
    parser.add_argument('--output', dest='output', required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | 'ReadFromText' >> ReadFromText(known_args.input)
        results = (lines
                   | 'Split' >> beam.Map(split)
                   | 'Group' >> beam.GroupByKey()
                   | 'CPFirstIntervalSkew' >> beam.Map(skew)
                   | 'FormatResult' >> beam.Map(format_result))
        results | 'WriteToText' >> WriteToText(known_args.output)
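The split, skew, and format_result callables are assumed from the surrounding module; this sketch only illustrates the shapes the pipeline above requires, and the keyed "key,value" CSV layout is an assumption:

def split(line):
    key, value = line.split(',', 1)  # assumed "key,value" layout
    return (key, float(value))

def skew(group):
    key, values = group  # values: iterable produced by GroupByKey
    ordered = sorted(values)
    first_interval = ordered[1] - ordered[0] if len(ordered) > 1 else 0.0
    return (key, first_interval)

def format_result(kv):
    return '%s: %s' % kv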
def run(archivo, mifecha): gcs_path = "gs://ct-fanalca" #Definicion de la raiz del bucket gcs_project = "contento-bi" mi_runer = ("DirectRunner", "DataflowRunner")[socket.gethostname() == "contentobi"] pipeline = beam.Pipeline( runner=mi_runer, argv=[ "--project", gcs_project, "--staging_location", ("%s/dataflow_files/staging_location" % gcs_path), "--temp_location", ("%s/dataflow_files/temp" % gcs_path), "--output", ("%s/dataflow_files/output" % gcs_path), "--setup_file", "./setup.py", "--max_num_workers", "5", "--subnetwork", "https://www.googleapis.com/compute/v1/projects/contento-bi/regions/us-central1/subnetworks/contento-subnet1" ]) lines = pipeline | 'Lectura de Archivo HONDA-DIGITAL' >> ReadFromText( archivo, skip_header_lines=1) transformed = ( lines | 'Formatear Data HONDA-DIGITAL' >> beam.ParDo(formatearData(mifecha))) transformed | 'Escritura a BigQuery fanalca HONDA-DIGITAL' >> beam.io.WriteToBigQuery( gcs_project + ":fanalca.asignacion_digital", schema=TABLE_SCHEMA, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND) # transformed | 'Borrar Archivo' >> FileSystems.delete('gs://ct-avon/prejuridico/AVON_INF_PREJ_20181111.TXT') # 'Eliminar' >> FileSystems.delete (["archivos/Info_carga_avon.1.txt"]) jobObject = pipeline.run() # jobID = jobObject.job_id() return ("Corrio Full HD")
def run(argv=None): """Pipeline for reading data from a Cloud Storage bucket and writing the results to BigQuery""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', help='File to read in.') parser.add_argument( '--output', dest='output', help= 'BigQuery output dataset and table name in the format dataset.tablename' ) known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True with beam.Pipeline(options=pipeline_options) as p: # 1. Read in the file from Google Cloud Storage. Hint: remember there is a header line in the CSV. input_rows = p | 'ReadFile' >> ReadFromText() # 2. Convert the rows into Key, Value pairs. Hint: use tuples # 3. For each Key, sum up the values. Hint: CombinePerKey(sum) # 4. Format the as Python dictionaries for writing to BigQuery # 5. Write the output to BigQuery (input_rows | 'WriteToBigQuery' >> WriteToBigQuery( known_args.output, schema='department:STRING, value:FLOAT', create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)) p.run().wait_until_finish()
def run(argv=None): """Main entry point; defines and runs the PDkit_score pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', default='tremor_data_with_user.csv', help='Input file to process.') known_args, pipeline_args = parser.parse_known_args(argv) options = PipelineOptions(pipeline_args) options.view_as(SetupOptions).save_main_session = True with beam.Pipeline(options=options) as p: events = ( p | 'Read' >> ReadFromText(known_args.input) | 'AddEventTimestamps' >> beam.ParDo(TransformTimestampDoFn()) ) a = ( events | 'ParseAccEventFn' >> beam.ParDo(ParseAccEventFn()) | 'CalculateTeamScores' >> CalculateTeamScores( args.team_window_duration, args.allowed_lateness) ) a | beam.ParDo(lambda (x): print(x))
def simple_beam(text_input=parameter[PathStr], text_output=beam_output):
    class RequireSomeParameter(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument("--param", dest="param", required=True)

    dataflow_pipeline = current_task().build_pipeline(["--param", "2"])

    # Read the text file[pattern] into a PCollection.
    lines = dataflow_pipeline | "read" >> ReadFromText(text_input)

    x = dataflow_pipeline._options.view_as(RequireSomeParameter)
    assert x.param == "2"

    counts = (lines
              | "pair_with_one" >> beam.Map(lambda x: (x, 1))
              | "group" >> beam.GroupByKey())
    counts | "write" >> WriteToText(text_output)

    target(text_output).mkdir()
    result = dataflow_pipeline.run()
    result.wait_until_finish()
def run(argv=None): """Runs the workflow computing total points from a collection of matches.""" parser = argparse.ArgumentParser() parser.add_argument('--input', required=True, help='Input file to process.') parser.add_argument('--output', required=True, help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True p = beam.Pipeline(options=pipeline_options) p = beam.Pipeline(argv=pipeline_args) (p # pylint: disable=expression-not-assigned | 'read' >> ReadFromText(known_args.input, coder=JsonCoder()) | 'points' >> beam.FlatMap(compute_points) | beam.CombinePerKey(sum) | 'write' >> WriteToText(known_args.output, coder=JsonCoder())) p.run()
def runMyPipeline(argv=None):
    """Main entry point"""
    # Parse arguments, prepare pipeline options
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Define the dataflow pipeline
    p = beam.Pipeline(options=pipeline_options)

    # Import local or cloud data and write contents to local disk
    dataset1 = '/Users/jdoe/mydataset.txt'
    dataset2 = 'gs://apache-beam-samples/shakespeare/kinglear.txt'  # cloud alternative (unused here)

    dataout = (p
               | 'dataset_import' >> ReadFromText(dataset1)
               | 'write_to_disk' >> WriteToText('outputfile'))

    # Execute the pipeline
    result = p.run()
    result.wait_until_finish()
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        default='gs://BUCKET-NAME/output',
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # Pass the remaining flags through (they were parsed but unused before).
    options = PipelineOptions(pipeline_args)
    options.view_as(StandardOptions).runner = 'DataflowRunner'
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'PROJECT-NAME'
    google_cloud_options.staging_location = 'gs://BUCKET-NAME/staging'
    google_cloud_options.temp_location = 'gs://BUCKET-NAME/temp'
    google_cloud_options.job_name = 'JOBNAME-USERNAME-DATETIME'

    with beam.Pipeline(options=options) as p:
        counts = (
            p
            | ReadFromText(known_args.input)
            | beam.FlatMap(lambda x: re.findall(r"[A-Za-z']+", x))  # Find all word matches
            | beam.Map(lambda x: (x, 1))  # Create tuple (word, 1)
            | beam.CombinePerKey(sum)  # Reduce by key, i.e. the word
            | beam.ParDo(Split())
            | beam.io.Write(
                beam.io.BigQuerySink(
                    'PROJECT-NAME:DATASET-NAME.TABLE-NAME',
                    schema='word:STRING,freq:INTEGER',
                    write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
                    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)))
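Split is not shown in the source; it is assumed to reshape each (word, count) pair into the dict rows the BigQuery sink above expects. A minimal sketch:

import apache_beam as beam

class Split(beam.DoFn):
    """Maps a (word, count) pair to the sink's {'word', 'freq'} row shape."""
    def process(self, element):
        word, freq = element
        yield {'word': word, 'freq': freq}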
def run():
    pipeline = beam.Pipeline()

    output = 'wordcount-multiple-outputs'
    output_suffix = '.txt'

    lines = pipeline | 'read' >> ReadFromText('data/king_arthur.txt')

    # Split lines into several outputs.
    split_lines_result = (lines
                          | beam.ParDo(SplitLinesToWordsFn()).with_outputs(
                              SplitLinesToWordsFn.SIDE_OUTPUT_TAG_SHORT_WORDS,
                              SplitLinesToWordsFn.SIDE_OUTPUT_TAG_CHARACTER_COUNT,
                              main='words'))

    # Multiple ways to access result.
    words, _, _ = split_lines_result
    short_words = split_lines_result[
        SplitLinesToWordsFn.SIDE_OUTPUT_TAG_SHORT_WORDS]
    character_count = split_lines_result.tag_character_count

    # Write character count.
    (character_count
     | 'pair_with_key' >> beam.Map(lambda x: ('chars_temp_key', x))
     | beam.GroupByKey()
     | 'count_chars' >> beam.Map(lambda kv: sum(kv[1]))  # was a Python 2 tuple-parameter lambda
     | 'write_chars' >> WriteToText(output + '-chars', output_suffix))

    # Write short word counts.
    (short_words
     | 'count_short_words' >> CountWords()
     | 'write_short_words' >> WriteToText(output + '-short', output_suffix))

    # Write word counts.
    (words
     | 'count_words' >> CountWords()
     | 'write_words' >> WriteToText(output + '-words', output_suffix))

    pipeline.run().wait_until_finish()
def run(argv=None): """Build beam pipeline and run it.""" parser = argparse.ArgumentParser() parser.add_argument('--input', required=True, help='Input file to process.') parser.add_argument('--output', required=True, help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True with beam.Pipeline(options=pipeline_options) as p: (p | 'read_input' >> ReadFromText(known_args.input, coder=JsonCoder()) | 'apply_model' >> beam.ParDo( MLModelPredictOperation(module_name="iris_model.iris_predict", class_name="IrisModel")) | 'write_output' >> WriteToText(known_args.output, coder=JsonCoder())) result = p.run() result.wait_until_finish()
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', required=True, help='Input file to process.')
    parser.add_argument('--output', required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    (p  # pylint: disable=expression-not-assigned
     | 'read' >> ReadFromText(known_args.input)
     | 'split' >> beam.FlatMap(lambda x: re.findall(r"[A-Za-z']+", x))
     | 'TopPerPrefix' >> TopPerPrefix(5)
     | 'format' >> beam.Map(lambda kv: '%s: %s' % kv)  # was a Python 2 tuple-parameter lambda
     | 'write' >> WriteToText(known_args.output))
    p.run()
def run(argv=None):
    num_docs = 100
    num_hash = 100
    prime = 4294967311
    N = 2**32

    # generating random hash function parameters
    a = []
    b = []
    for it in range(num_hash):
        a.append(random.randint(1, int(N / 2)))
        b.append(random.randint(1, int(N / 2)))

    def apply(minsig, a, b, bands, num_hash):
        rows = num_hash / bands
        result = []
        for i in range(num_hash):
            res = 2**33
            for hShingle in minsig[1]:
                res = min(res, (a[i] * hShingle + b[i]) % prime)
            result.append(res)
        for j in range(bands):
            lower = int(j * rows)
            upper = int(j * rows + rows)
            # emit a tuple so the band signature is a hashable GroupByKey key
            yield (tuple(result[lower:upper]), minsig[0])

    def showCandidatePairs(buckets):
        if len(buckets[1]) > 1:
            for subset in itertools.combinations(buckets[1], 2):
                # emit sorted tuples so pairs are hashable, deduplicable keys
                if subset[0] > subset[1]:
                    yield ((subset[1], subset[0]), 1)
                else:
                    yield (tuple(subset), 1)

    def shing(tup, k):
        result = set()
        line, index = tup.split(',')
        words = line.split()
        for i in range(len(words) - k + 1):  # shingle generation: k-shingles
            shingle = ""
            for j in range(i, i + k):
                shingle += words[j] + " "
            hShingle = binascii.crc32(shingle.encode('utf-8')) & 0xffffffff
            if hShingle not in result:
                result.add(hShingle)
                yield (index, hShingle)

    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).runner = 'DirectRunner'

    inde = 0
    for filename in glob.glob('corpusdataset.txt'):
        with beam.Pipeline(options=pipeline_options) as p:
            lines = p | ReadFromText(filename)
            lsh = (lines
                   | "shingle" >> beam.FlatMap(lambda x: shing(x, 2))
                   | "grouping" >> beam.GroupByKey()
                   | "banding and hashing" >> beam.FlatMap(
                       lambda x: apply(x, a, b, 25, num_hash))
                   | "same buckets" >> beam.GroupByKey()
                   | "show candidate pairs" >> beam.FlatMap(
                       lambda x: showCandidatePairs(x))
                   | "final grouping" >> beam.GroupByKey())
            lsh | WriteToText("outcorpus")
def run(argv=None, save_main_session=True): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as( SetupOptions).save_main_session = save_main_session p = beam.Pipeline(options=pipeline_options) # Read the text file[pattern] into a PCollection. lines = p | 'read' >> ReadFromText(known_args.input) # Count the occurrences of each word. def count_ones(word_ones): (word, ones) = word_ones return (word, sum(ones)) counts = (lines | 'split' >> (beam.ParDo(WordExtractingDoFn()).with_output_types(unicode)) | 'pair_with_one' >> beam.Map(lambda x: (x, 1)) | 'group' >> beam.GroupByKey() | 'count' >> beam.Map(count_ones)) # Format the counts into a PCollection of strings. def format_result(word_count): (word, count) = word_count return '%s: %d' % (word, count) output = counts | 'format' >> beam.Map(format_result) # Write the output using a "Write" transform that has side effects. # pylint: disable=expression-not-assigned output | 'write' >> WriteToText(known_args.output) result = p.run() result.wait_until_finish() # Do not query metrics when creating a template which doesn't run if (not hasattr(result, 'has_job') # direct runner or result.has_job): # not just a template creation empty_lines_filter = MetricsFilter().with_name('empty_lines') query_result = result.metrics().query(empty_lines_filter) if query_result['counters']: empty_lines_counter = query_result['counters'][0] logging.info('number of empty lines: %d', empty_lines_counter.result) word_lengths_filter = MetricsFilter().with_name('word_len_dist') query_result = result.metrics().query(word_lengths_filter) if query_result['distributions']: word_lengths_dist = query_result['distributions'][0] logging.info('average word length: %d', word_lengths_dist.result.mean)
def main(argv=None, save_main_session=True):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument(
        '--output',
        dest='output',
        # CHANGE 1/6: The Google Cloud Storage path is required
        # for outputting the results.
        default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',
        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        # CHANGE 2/6: (OPTIONAL) Change this to DataflowRunner to
        # run your pipeline on the Google Cloud Dataflow Service.
        '--runner=DirectRunner',
        # CHANGE 3/6: (OPTIONAL) Your project ID is required in order to
        # run your pipeline on the Google Cloud Dataflow Service.
        '--project=SET_YOUR_PROJECT_ID_HERE',
        # CHANGE 4/6: (OPTIONAL) The Google Cloud region (e.g. us-central1)
        # is required in order to run your pipeline on the Google Cloud
        # Dataflow Service.
        '--region=SET_REGION_HERE',
        # CHANGE 5/6: Your Google Cloud Storage path is required for staging
        # local files.
        '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
        # CHANGE 6/6: Your Google Cloud Storage path is required for temporary
        # files.
        '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
        '--job_name=your-wordcount-job',
    ])

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input)

        # Count the occurrences of each word.
        counts = (lines
                  | 'Split' >> (beam.FlatMap(
                      lambda x: re.findall(r"[A-Za-z']+", x)).with_output_types(str))
                  | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
                  | 'GroupAndSum' >> beam.CombinePerKey(sum))

        # Format the counts into a PCollection of strings.
        def format_result(word_count):
            (word, count) = word_count
            return '%s: %s' % (word, count)

        output = counts | 'Format' >> beam.Map(format_result)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | WriteToText(known_args.output)
# (tail of ActorCountFn.process -- the preceding lines that parse `values`
# and `category` out of the TSV record are not part of this excerpt)
        winner = values[2]
        entity = values[3]
        if 'ACTOR' in category or 'ACTRESS' in category:
            return [(entity, 1)]

# PTransform: sum up nominations for a given actor/actress
class ActorSumFn(beam.DoFn):
    def process(self, element):
        actor, counts = element
        total_count = len(counts)
        return [(actor, total_count)]

# Create a Pipeline using a local runner for execution
with beam.Pipeline('DirectRunner') as p:
    # create a PCollection from the file contents.
    in_pcoll = p | 'Read File' >> ReadFromText('oscars_input.tsv')

    # apply ParDo to the PCollection
    actor_pcoll = in_pcoll | 'Extract Actor' >> beam.ParDo(ActorCountFn())

    # apply GroupByKey to the PCollection
    group_pcoll = actor_pcoll | 'Group by Actor' >> beam.GroupByKey()

    # apply ParDo to the PCollection
    out_pcoll = group_pcoll | 'Sum up Counts' >> beam.ParDo(ActorSumFn())

    # write PCollection to a file
    out_pcoll | 'Write File' >> WriteToText('oscars_output.txt')
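For context, a hedged reconstruction of the truncated ActorCountFn, inferring a tab-separated (year, category, winner, entity) layout from the fragment above; the column order is an assumption:

import apache_beam as beam

class ActorCountFn(beam.DoFn):
    """Emits (entity, 1) for acting-category nominations (sketch)."""
    def process(self, element):
        values = element.split('\t')   # assumed TSV record
        category = values[1]           # assumed column positions
        winner = values[2]
        entity = values[3]
        if 'ACTOR' in category or 'ACTRESS' in category:
            return [(entity, 1)]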
import sys
import os

import apache_beam as beam
from apache_beam.io import ReadFromText
from pprint import pprint

if __name__ == '__main__':
    file = sys.argv[1]
    if not os.path.isfile(file):
        print('File not found')
    else:
        with beam.Pipeline() as pipeline:
            # Reading
            lines = pipeline | ReadFromText(file)

            # Mapping
            measured_paths = (
                lines
                | "Measure" >> beam.Map(
                    lambda line: (line, len(line.split('/')))))

            # Reducing
            max_value = (
                measured_paths
                | "Max" >> beam.CombineGlobally(
                    lambda paths: max(paths, key=lambda p: p[1])))

            # The excerpt ended mid-expression here; printing the result via
            # the pprint import above is one plausible final step.
            max_value | "Print" >> beam.Map(pprint)
options = {
    'runner': 'DataflowRunner',
    'job_name': 'nomination-count-8',
    'project': PROJECT_ID,
    'temp_location': BUCKET + '/temp',
    'staging_location': BUCKET + '/staging',
    # machine types listed here: https://cloud.google.com/compute/docs/machine-types
    'machine_type': 'n1-standard-1',
    'num_workers': 1
}
opts = PipelineOptions(flags=[], **options)

with beam.Pipeline('DataflowRunner', options=opts) as p:
    # create PCollection from the file contents
    in_pcoll = p | 'Read File' >> ReadFromText(DIR_PATH_IN + 'oscars_data.tsv')

    # apply ParDo with tagged outputs
    out_pcoll = in_pcoll | 'Extract Actor and Actress' >> beam.ParDo(
        ActorActressCountFn()).with_outputs(
            ActorActressCountFn.OUTPUT_TAG_ACTOR_COUNT,
            ActorActressCountFn.OUTPUT_TAG_ACTRESS_COUNT)

    actor_pcoll = out_pcoll[ActorActressCountFn.OUTPUT_TAG_ACTOR_COUNT]
    actress_pcoll = out_pcoll[ActorActressCountFn.OUTPUT_TAG_ACTRESS_COUNT]

    # write PCollections to files
    actor_pcoll | 'Write Actor File 1' >> WriteToText(DIR_PATH_OUT + 'actor_output.txt')
    actress_pcoll | 'Write Actress File 1' >> WriteToText(DIR_PATH_OUT + 'actress_output.txt')
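ActorActressCountFn is defined elsewhere; a sketch of the tagged-output DoFn shape the with_outputs call above relies on (the tag constants are inferred from the pipeline, and the TSV column positions are assumptions):

import apache_beam as beam
from apache_beam import pvalue

class ActorActressCountFn(beam.DoFn):
    OUTPUT_TAG_ACTOR_COUNT = 'tag_actor_count'
    OUTPUT_TAG_ACTRESS_COUNT = 'tag_actress_count'

    def process(self, element):
        values = element.split('\t')
        category, entity = values[1], values[3]  # assumed columns
        if 'ACTRESS' in category:
            yield pvalue.TaggedOutput(self.OUTPUT_TAG_ACTRESS_COUNT, (entity, 1))
        elif 'ACTOR' in category:
            yield pvalue.TaggedOutput(self.OUTPUT_TAG_ACTOR_COUNT, (entity, 1))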
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', default='gs://big-data-pipe/country.csv', help='Country Data file to process.') parser.add_argument('--project', dest='project', help='Google Cloud Project ID') parser.add_argument('--bucket', dest='bucket', help='Google Cloud Storage Bucket (gs://...)') parser.add_argument('--run_target', dest='run_target', default='local', help='Where to run job (local,gcp)') parser.add_argument( '--schema', dest='schema', default='/Users/sachinholla/Documents/GCP/advertiser.schema', help='Schema file (from bq)') parser.add_argument('--table', dest='table', default='big-data-pipe:dblclick_data.advertiser', help='Table') parser.add_argument('--output_dest', dest='output_dest', default='file', help='Output location (file,bq)') known_args, pipeline_args = parser.parse_known_args(argv) print(known_args) extendList = list() if known_args.run_target == "gcp": runner = "DataflowRunner" else: runner = "DirectRunner" extendList.append("--runner=%s" % runner) if known_args.bucket and known_args.project and known_args.bucket.startswith( "gs:"): # build the output location known_args.output = known_args.bucket + "/" + OUTPUT_PREFIX extendList.append("--project=%s" % known_args.project) staging_location = known_args.bucket + "/" + STAGING_DIR temp_location = known_args.bucket + "/" + TEMP_DIR extendList.append("--staging_location=%s" % staging_location) extendList.append("--temp_location=%s" % temp_location) elif known_args.run_target == "gcp": print("Can't proceed with invalid Bucket and/or Project") exit(1) else: known_args.output = "/tmp/output" from datetime import datetime from time import gmtime, strftime curr_time = str(datetime.now().strftime('%Y-%m-%dt%H-%M-%S')) #print(curr_time) job_name = JOB_NAME_PREFIX + "--" + curr_time #+ "-jobname" #extendList.append("--job_name=%s" % (JOB_NAME_PREFIX+"--"+curr_time)) extendList.append("--job_name=%s" % (job_name)) #print(extendList) pipeline_args.extend(extendList) # setting the schema from the env. 
input vars #tsu.setSchema(known_args.schema) #tsu.setFixedSchema() print(known_args, pipeline_args) #exit() # headers for impression data imp_headers_list = [ "Time", "UserId", "AdvertiserId", "OrderId", "LineItemId", "CreativeId", "CreativeVersion", "CreativeSize", "AdUnitId", "CustomTargeting", "Domain", "CountryId", "Country", "RegionId", "Region", "MetroId", "Metro", "CityId", "City", "PostalCodeId", "PostalCode", "BrowserId", "Browser", "OSId", "OS", "OSVersion", "BandwidthId", "BandWidth", "TimeUsec", "AudienceSegmentIds", "Product", "RequestedAdUnitSizes", "BandwidthGroupId", "MobileDevice", "MobileCapability", "MobileCarrier", "IsCompanion", "TargetedCustomCriteria", "DeviceCategory", "IsInterstitial", "EventTimeUsec2", "YieldGroupNames", "YieldGroupCompanyId", "MobileAppId", "RequestLanguage", "DealId", "DealType", "AdxAccountId", "SellerReservePrice", "Buyer", "Advertiser", "Anonymous", "ImpressionId" ] cntry_headers_list = [ "CountryId", "CountryAbbrev", "CountryName", "CountryLocale", "CountryCurrency" ] advrtsr_headers_list = [ "AdvertiserId", "AdvertiserName", "AdvertiserType", "AdvertiserRegion" ] #dept_headers = [("dept_id","dept_name","dept_start_year")] #dept_headers_list = ["dept_id","dept_name","dept_start_year"] #emp_headers_list = ["emp_id","emp_name","emp_dept","emp_country","emp_gender","emp_birth_year","emp_salary"] def buildSchemaString(header_list, prefix="pfx"): schema_string = "" string_type = "STRING" sep = "" for header in header_list: if schema_string: sep = ", " schema_string += "%s%s_%s:%s" % (sep, prefix, header.lower(), string_type) return schema_string def getDBSchema(): # start building the schema with the assumption that all fields are string # 1. start with the impressions table_schema = buildSchemaString(imp_headers_list, prefix="impressions") table_schema += ", " + buildSchemaString(cntry_headers_list, prefix="country") table_schema += ", " + buildSchemaString(advrtsr_headers_list, prefix="advertiser") #print(pcollData) #keys_only = pcollData | "keys" >> beam.Map(lambda (k,v): k) return (table_schema) class GenericFormatDoFn(beam.DoFn): def process(self, element, headers_list=[], prefix="pfx"): #print(element) #print(headers_list) if not headers_list: print("Exception!!! No headers list provided!") return #print(len(element),len(headers_list)) formatted_data = {} for index, elem in enumerate(element): #print(index,elem,headers_list[index]) formatted_data["%s_%s" % (prefix, headers_list[index].lower())] = str(elem) return [formatted_data] pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True if known_args.run_target == "local": imp_file = "/Users/sachinholla/Documents/GCP/impressions_data.csv" cntry_file = "/Users/sachinholla/Documents/GCP/country_data.csv" advrtsr_file = "/Users/sachinholla/Documents/GCP/advertiser_data.csv" else: imp_file = known_args.bucket + "/input/" + "impressions_data.csv" cntry_file = known_args.bucket + "/input/" + "country_data.csv" advrtsr_file = known_args.bucket + "/input/" + "advertiser_data.csv" with beam.Pipeline(options=pipeline_options) as p: # step 1. 
        # step 1. retrieve the impression data
        imp_data = (
            p
            | "ImpressionsRead" >> ReadFromText(imp_file)
            | 'ImpressionsSplit' >> beam.Map(lambda x: x.split(","))
            #| 'ImpressionsSplit' >> beam.Map(lambda x: re.findall(r'[0-9A-Za-z\:\;\|\-\=\.\&\ \#\_\']?', x))
            | 'ImpressionsFormat' >> beam.ParDo(GenericFormatDoFn(),
                                                headers_list=imp_headers_list,
                                                prefix="impression")
            | 'ImpressionsMapping' >> beam.Map(
                lambda imp: (imp["impression_countryid"], imp)))

        if known_args.run_target == "local":
            imp_data | "ImpressionsWrite" >> WriteToText("impressions")
        else:
            imp_data | "ImpressionsWrite" >> WriteToText(
                known_args.output + "-impressions")

        # step 2. retrieve the Country data
        cntry_data = (
            p
            | "CountriesRead" >> ReadFromText(cntry_file)
            | 'CountriesSplit' >> beam.Map(lambda x: x.split(","))
            | 'CountriesFormat' >> beam.ParDo(GenericFormatDoFn(),
                                              headers_list=cntry_headers_list,
                                              prefix="country")
            | 'CountriesMapping' >> beam.Map(
                lambda cntry: (cntry["country_countryid"], cntry)))

        if known_args.run_target == "local":
            cntry_data | "CountriesWrite" >> WriteToText("countries")
        else:
            cntry_data | "CountriesWrite" >> WriteToText(
                known_args.output + "-countries")

        # step 3. retrieve the Advertiser data
        advrtsr_data = (
            p
            | "AdvertisersRead" >> ReadFromText(advrtsr_file)
            | 'AdvertisersSplit' >> beam.Map(lambda x: x.split(","))
            | 'AdvertisersFormat' >> beam.ParDo(GenericFormatDoFn(),
                                                headers_list=advrtsr_headers_list,
                                                prefix="advertiser")
            | 'AdvertisersMapping' >> beam.Map(
                lambda advrtsr: (advrtsr["advertiser_advertiserid"], advrtsr)))

        if known_args.run_target == "local":
            advrtsr_data | "AdvertisersWrite" >> WriteToText("advertisers")
        else:
            advrtsr_data | "AdvertisersWrite" >> WriteToText(
                known_args.output + "-advertisers")

        # step 4: now combine impressions and country data
        cntry_side_input = beam.pvalue.AsDict(cntry_data)

        def join_imp_cntry(impression, cntry_dict):
            imp = impression[1]
            if impression[0] in cntry_dict.keys():
                cntry = cntry_dict[impression[0]]
            else:
                print("No Country Found!")
                return (imp)
            imp.update(cntry)
            return (imp)

        joined_dicts = (imp_data | 'JoiningCountry' >> beam.Map(
            join_imp_cntry, cntry_dict=cntry_side_input))

        if known_args.run_target == "local":
            joined_dicts | "CountryJoined" >> WriteToText("CountryJoined")
        else:
            joined_dicts | "CountryJoined" >> WriteToText(
                known_args.output + "-CountryJoined")

        # step 5: now combine impressions and advertiser data
        # need to first re-key the impressions data to be on AdvertiserId to
        # enable the join
        rekeyed_imp_data = (
            joined_dicts
            | 'ImpressionsReMapping' >> beam.Map(
                lambda impression: (impression["impression_advertiserid"], impression)))

        advrtsr_side_input = beam.pvalue.AsDict(advrtsr_data)

        def join_imp_advrtsr(impression, advrtsr_dict):
            empty_advrtsr = {}
            empty_advrtsr['advertiser_advertiserid'] = ''
            empty_advrtsr['advertiser_advertisername'] = ''
            empty_advrtsr['advertiser_advertiserregion'] = ''
            empty_advrtsr['advertiser_advertisertype'] = ''
            imp = impression[1]
            if impression[0] in advrtsr_dict.keys():
                advrtsr = advrtsr_dict[impression[0]]
            else:
                #print("No Advertiser Found!")
                advrtsr = empty_advrtsr
            imp.update(advrtsr)
            return (imp)

        joined_dicts = (rekeyed_imp_data | 'JoiningAdvertiser' >> beam.Map(
            join_imp_advrtsr, advrtsr_dict=advrtsr_side_input))

        if known_args.run_target == "local":
            print("Writing out final dataset")
            joined_dicts | "AdvertiserJoined" >> WriteToText("AdvertiserJoined")
        else:
            joined_dicts | "AdvertiserJoined" >> WriteToText(
                known_args.output + "-AdvertiserJoined")

        if known_args.output_dest == "bq":
            print("updating bigquery table")
            #getDBSchema()
            joined_dicts | 'Write' >> beam.io.WriteToBigQuery(
                "big-data-pipe:dblclick_data.gcp_demo",
                schema=getDBSchema(),
                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
# (tail of a helper that flattens a joined (key, data) pair; its signature
# is not part of this excerpt)
    state, year, month = key.split('-')
    rain = data['chuvas'][0]
    dengue = data['dengue'][0]
    return (state, year, month, rain, dengue)

def prepare_csv(element, delimiter=';'):
    return f"{delimiter}".join([str(e) for e in element])

# dengue is a PCollection
dengue = (
    pipeline
    | "Leitura do dataset de dengue" >> ReadFromText(
        './data/sample_casos_dengue.txt', skip_header_lines=1)
    | "From text to list" >> beam.Map(text_to_list)
    | "From list to dict" >> beam.Map(list_to_dict, columns)
    | "Add year_month" >> beam.Map(treat_date)
    | "Create key by state" >> beam.Map(key_uf)
    | "GroupBy State" >> beam.GroupByKey()
    | "Unzip dengue cases" >> beam.FlatMap(dengue_cases)  # allows the use of yield
    | "Sum of cases by key" >> beam.CombinePerKey(sum)
    # | "Show results" >> beam.Map(print)
)

chuvas = (
    pipeline
    | "Read rain dataset" >> ReadFromText('./data/sample_chuvas.csv',
                                          skip_header_lines=1)
)  # (truncated in the excerpt; the remaining rain-processing steps are not shown)
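The helpers referenced in the dengue branch are defined earlier in the original script; a compact sketch of plausible shapes inferred from how each is piped (field names like 'data_iniSE', 'uf', and 'casos', and the '|' delimiter, are assumptions):

def text_to_list(element, delimiter='|'):
    return element.split(delimiter)

def list_to_dict(element, columns):
    return dict(zip(columns, element))

def treat_date(element):
    # derive 'ano_mes' (year-month) from an assumed 'data_iniSE' date column
    element['ano_mes'] = '-'.join(element['data_iniSE'].split('-')[:2])
    return element

def key_uf(element):
    return (element['uf'], element)  # key by state (UF)

def dengue_cases(element):
    uf, records = element
    for record in records:
        yield (f"{uf}-{record['ano_mes']}",
               float(record['casos']) if record['casos'] else 0.0)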
def run(argv=None): """Main entry point; defines and runs the wordcount pipeline.""" parser = argparse.ArgumentParser() parser.add_argument( '--input', dest='input', default='gs://dataflow-samples/shakespeare/kinglear.txt', help='Input file to process.') parser.add_argument('--output', dest='output', required=True, help='Output file to write results to.') known_args, pipeline_args = parser.parse_known_args(argv) ############################################### # (1) pipeline を作成する ############################################### # まず PipelineOptions オブジェクトを作成 # パイプラインを実行する pipeline runner や、選択した runner が必要とする固有の設定など、さまざまなオプションを設定できる pipeline_options = PipelineOptions(pipeline_args) # 作成した PipelineOptions オプジェクトを直接編集する例 # 今回は DoFn transform を使用するため、save_main_sessionオプションを有効にする pipeline_options.view_as(SetupOptions).save_main_session = True # オプションを元に pipeline (p) を作成 p = beam.Pipeline(options=pipeline_options) #in→text out→textのパイプライン p2 = beam.Pipeline(options=pipeline_options) #in→bigquery out→textのパイプライン ############################################## # (2) transformを設定 ############################################### #pにtransformを設定 lines = p | 'read' >> ReadFromText(known_args.input) #ファイル出力するためのサンプルメソッド def add(line): num = int(line.strip()) return num * 3 counts = lines | 'add' >> beam.Map(add) counts | 'write' >> WriteToText(known_args.output) #p2にtransformを設定 query = 'select * from babynames.names2012 limit 1000' p2 | 'read' >> Read(beam.io.BigQuerySource(project='gcp-project-210712', use_standard_sql=False, query=query)) \ | 'write' >> WriteToText('gs://gcp_dataflowsample/query_result.txt', num_shards=1) ############################################### # (3) Pipeline を実行 ############################################### #result = p.run() result2 = p2.run() # 終了を待つ # 記述しなければそのまま抜ける # →DataFlowRunnerの場合、Ctrl-Cでもパイプラインは停止しない。Gooleコンソールから停止する必要がある #ここで結果が終了するのを待ち合わせている。記載がなければ後続は処理されない。 #result.wait_until_finish() result2.wait_until_finish()
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_mode', default='file',
                        help='Streaming input or file based batch input')
    for ticker in TICKER_LIST:
        parser.add_argument(
            '--input_{}'.format(ticker),
            default='{}_hist.csv'.format(ticker),
            help='Cloud Pub/Sub topic of tick market data for a stock; falls back to a flat csv')
    parser.add_argument(
        '--output_topic',
        default='/tmp/trading_signals.txt',
        help='Topic of output trading signals in Google Cloud Pub/Sub')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    if known_args.input_mode == 'stream':
        pipeline_options.view_as(StandardOptions).streaming = True

    with beam.Pipeline(options=pipeline_options) as p:
        # Read input
        input_stage = {}
        for ticker in TICKER_LIST:
            # was compared against 'streaming', which never matched the
            # 'stream' value checked above
            if known_args.input_mode == 'stream':
                input_ticks = (p | beam.io.ReadFromPubSub(
                    topic=known_args.input_topic).with_output_types(
                        six.binary_type))
            else:
                input_ticks = (p | 'Read: %s' % ticker >> ReadFromText(
                    getattr(known_args, 'input_%s' % ticker)))
            input_stage[ticker] = (
                input_ticks
                | 'decode: %s' % ticker >> beam.Map(lambda x: x.decode('utf-8'))
                | 'Filter: %s' % ticker >> beam.Filter(
                    lambda row: row.split(',')[0] != 'date')
                | 'Add Timestamp: %s' % ticker >> beam.ParDo(AddTimestampDoFn())
                | 'Window: %s' % ticker >> beam.WindowInto(
                    window.SlidingWindows(size=SECONDS_IN_1_DAY * 10,
                                          period=SECONDS_IN_1_DAY))
                | 'Pair: %s' % ticker >> beam.ParDo(CorrelationPairDoFn(ticker)))

        # Group together all entries under the same ticker
        grouped = input_stage | 'group_by_name' >> beam.CoGroupByKey()

        correlations = (grouped
                        | 'Calculate pair correlation' >> beam.Map(
                            calculate_correlation_pair))

        if known_args.input_mode == 'stream':
            trading_signals = (
                correlations
                | 'Filter correlation threshold' >> beam.Filter(
                    lambda x: x[1] < CORRELATION_THRESHOLD).with_output_types(
                        six.binary_type))
            # pylint: disable=expression-not-assigned
            trading_signals | beam.io.WriteToPubSub(known_args.output_topic)
        else:
            trading_signals = (
                correlations
                | 'Filter correlation threshold' >> beam.Filter(
                    lambda x: x[1] < CORRELATION_THRESHOLD))
            # pylint: disable=expression-not-assigned
            trading_signals | 'WriteOutput' >> WriteToText(
                known_args.output_topic)
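AddTimestampDoFn is assumed from the surrounding module; a minimal sketch of a DoFn that stamps each CSV row with its trade date, so the sliding windows above have event times to work with (the first-column date and its format are assumptions):

import time
import apache_beam as beam
from apache_beam.transforms.window import TimestampedValue

class AddTimestampDoFn(beam.DoFn):
    def process(self, element):
        date_str = element.split(',')[0]                       # assumed 'date' column
        ts = time.mktime(time.strptime(date_str, '%Y-%m-%d'))  # assumed format
        yield TimestampedValue(element, ts)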
def create_pipeline(embedding_model,
                    files_input_list=None,
                    tfrecord_input=None,
                    embeddings_output=None,
                    stats_output=None,
                    feature_key=None,
                    name='all_train_embeddings',
                    batch_size=64):
    """Returns a pipeline that extracts stats from audio examples.

    Args:
      embedding_model: ModelConfig namedtuple; contains model ckpt, embedding
        dimension size and step size.
      files_input_list: List of files from where the audio is to be read.
      tfrecord_input: Path to a tfrecord containing audio.
      embeddings_output: location to where the embeddings should be written.
      stats_output: location to where the stats should be written.
      feature_key: tf.example feature that contains the samples that are to be
        processed.
      name: Identifier for the set of examples processed in this pipeline.
      batch_size: batch_size.

    Returns:
      The beam pipeline.
    """
    pipeline = beam.Pipeline()
    if files_input_list:
        examples = (
            pipeline
            | 'Read File List' >> ReadFromText(files_input_list)
            | 'Read Files' >> beam.ParDo(ReadWavFiles()))
    else:
        examples = (
            pipeline
            | 'Read Examples' >> ReadFromTFRecord(
                tfrecord_input,
                value_coder=beam.coders.ProtoCoder(tf.train.Example))
            | 'Add Keys' >> beam.ParDo(AddKey()))
    embeddings = (
        examples
        | 'Batched Inference' >> beam.ParDo(
            BatchedInference(
                batch_size=batch_size,
                model=embedding_model,
                feature_key=feature_key)).with_outputs('raw', main='examples'))
    if stats_output:
        _ = (
            embeddings.raw
            | 'Combine Embeddings' >> beam.CombineGlobally(
                ComputeMeanAndCovariance(key_name=name, embedding_dim=128))
            # labels made unique per branch; both branches may be active in
            # the same pipeline
            | 'DropKey Stats' >> beam.ParDo(DropKey())
            | 'Write Stats' >> WriteToTFRecord(
                stats_output,
                shard_name_template='',
                coder=beam.coders.ProtoCoder(tf.train.Example)))
    if embeddings_output:
        _ = (
            embeddings.examples
            | 'DropKey Examples' >> beam.ParDo(DropKey())
            | 'Write Examples' >> WriteToTFRecord(
                embeddings_output,
                shard_name_template='',
                # was tf.Example; the proto class lives under tf.train
                coder=beam.coders.ProtoCoder(tf.train.Example)))
    return pipeline
# (tail of a per-item aggregation helper; its signature is not part of this
# excerpt)
    # get the total transactions for one item
    return [(str(element[0]), sum(element[1]))]

# Print function, for printing output in command terminal -- useful for debugging
class Printer(beam.DoFn):
    def process(self, data_item):
        print(data_item)  # was a Python 2 print statement

# Pipeline
# Read the flights data
raw_flights = (
    p
    | "flights:read" >> ReadFromText(
        "C:/Users/mirel/Desktop/flights_small.csv", skip_header_lines=1)
    | beam.Map(lambda record: (record.split(','))))

# Turn it into a KV pair
flights_data = (raw_flights | beam.ParDo(FlightKeys()))

# Read the Weather Data, and turn it into KV pair
weather = (p
           | "readweather" >> ReadFromText(
               "C:/Users/mirel/Desktop/weather.csv", skip_header_lines=1)
           | beam.Map(lambda record: (record.split(',')))
           | beam.ParDo(SplitWeather()))

"""
Turn Flights and weather data into a dictionary,
Group them by their common key,
Filter out values that don't match,
and extract values which are a match
"""
results = (
    (weather, flights_data)
    # The excerpt ends mid-expression here; a CoGroupByKey over the two keyed
    # collections is the grouping step the docstring above describes.
    | beam.CoGroupByKey())
def get_recently_active_users(p, file):
    return (p
            | "Read recently active users" >> ReadFromText(file)
            | "Parse recently active users" >> beam.ParDo(
                ParseRecentlyActiveUsersCSV()))
def get_recent_questions(p, file):
    return (p
            | "Read recent questions" >> ReadFromText(file)
            | "Parse recent questions" >> beam.ParDo(ParseRecentQuestionsCSV()))
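Neither parser DoFn in these last two helpers is shown in the source; a minimal sketch of the shape both share, using csv.reader so quoted fields survive splitting (the column layout is an assumption):

import csv
import apache_beam as beam

class ParseRecentQuestionsCSV(beam.DoFn):
    def process(self, element):
        for row in csv.reader([element]):
            # assumed columns: question_id, user_id, title
            yield {'question_id': row[0], 'user_id': row[1], 'title': row[2]}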