Example #1
def run():

    gcs_path = "gs://ct-felicidad_y_cultura"  #Definicion de la raiz del bucket
    gcs_project = "contento-bi"

    mi_runer = ("DirectRunner",
                "DataflowRunner")[socket.gethostname() == "contentobi"]
    pipeline = beam.Pipeline(
        runner=mi_runner,
        argv=[
            "--project", gcs_project, "--staging_location",
            ("%s/dataflow_files/staging_location" % gcs_path),
            "--temp_location",
            ("%s/dataflow_files/temp" % gcs_path), "--output",
            ("%s/dataflow_files/output" % gcs_path), "--setup_file",
            "./setup.py", "--max_num_workers", "5", "--subnetwork",
            "https://www.googleapis.com/compute/v1/projects/contento-bi/regions/us-central1/subnetworks/contento-subnet1"
        ])

    lines = pipeline | 'Lectura de Archivo' >> ReadFromText(
        gcs_path + "/Clima/personal" + ".csv")
    transformed = (lines | 'Formatear Data' >> beam.ParDo(formatearData()))
    # transformed | 'Escribir en Archivo' >> WriteToText(gcs_path + "/Seguimiento/Avon_inf_seg_2",file_name_suffix='.csv',shard_name_template='')

    transformed | 'Escritura a BigQuery Felicidad_y_Cultura' >> beam.io.WriteToBigQuery(
        gcs_project + ":Felicidad_y_Cultura.Personal",
        schema=TABLE_SCHEMA,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

    jobObject = pipeline.run()
    jobObject.wait_until_finish()

    # jobID = jobObject.job_id()

    return ("Corrio sin problema")
Example #2
def run(table, TABLE_DB):

    gcs_path = 'gs://ct-bridge'  # Root path of the bucket
    gcs_project = "contento-bi"
    FECHA_CARGUE = str(datetime.date.today())

    mi_runner = ("DirectRunner",
                 "DataflowRunner")[socket.gethostname() == "contentobi"]
    pipeline = beam.Pipeline(
        runner=mi_runner,
        argv=[
            "--project", gcs_project, "--staging_location",
            ("%s/dataflow_files/staging_location" % gcs_path),
            "--temp_location",
            ("%s/dataflow_files/temp" % gcs_path), "--output",
            ("%s/dataflow_files/output" % gcs_path), "--setup_file",
            "./setup.py", "--max_num_workers", "10", "--subnetwork",
            "https://www.googleapis.com/compute/v1/projects/contento-bi/regions/us-central1/subnetworks/contento-subnet1"
        ])

    lines = pipeline | 'Lectura de Archivo' >> ReadFromText(
        gcs_path + "/" + FECHA_CARGUE + "_" + TABLE_DB + ".csv")
    transformed = (lines | 'Formatear Data' >> beam.ParDo(formatearData()))
    # transformed | 'Escribir en Archivo' >> WriteToText(gcs_path + "/" + "REWORK",file_name_suffix='.csv',shard_name_template='')

    transformed | 'Escritura a BigQuery Bridge' >> beam.io.WriteToBigQuery(
        gcs_project + ":Contactabilidad." + table,
        schema=TABLE_SCHEMA,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

    jobObject = pipeline.run()
    return ("Proceso de transformacion y cargue, completado")


################################################################################
Example #3
def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--input',
        dest='input',
        default='gs://justlikethat-294122/sample.txt',
        help='Input file to process.')


    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

    with beam.Pipeline(options=pipeline_options) as p:

        utility = Utility()
        quotes = (
                p
                | 'Read' >> ReadFromText(known_args.input)
                | 'Parse Log' >> beam.Map(lambda line: utility.parse(line))
        )
        table_spec = bigquery.TableReference(
            projectId='justlikethat-294122',
            datasetId='mydataset',
            tableId='quotes'
        )
        table_schema = 'source:STRING,quote:STRING'

        quotes | beam.io.gcp.bigquery.WriteToBigQuery(
            table_spec,
            schema=table_schema,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED
        )
Example #4
def run(archivo, mifecha):

    gcs_path = "gs://ct-metlife"  #Definicion de la raiz del bucket
    gcs_project = "contento-bi"

    mi_runer = ("DirectRunner",
                "DirectRunner")[socket.gethostname() == "contentobi"]
    pipeline = beam.Pipeline(
        runner=mi_runner,
        argv=[
            "--project", gcs_project, "--staging_location",
            ("%s/dataflow_files/staging_location" % gcs_path),
            "--temp_location",
            ("%s/dataflow_files/temp" % gcs_path), "--output",
            ("%s/dataflow_files/output" % gcs_path), "--setup_file",
            "./setup.py", "--max_num_workers", "10", "--subnetwork",
            "https://www.googleapis.com/compute/v1/projects/contento-bi/regions/us-central1/subnetworks/contento-subnet1"
            # "--num_workers", "30",
            # "--autoscaling_algorithm", "NONE"
        ])

    lines = pipeline | 'Lectura de Archivo' >> ReadFromText(
        archivo, skip_header_lines=1)

    transformed = (lines
                   | 'Formatear Data' >> beam.ParDo(formatearData(mifecha)))

    transformed | 'Escritura a BigQuery metlife bases inspector colsubsidio monetaria novedades' >> beam.io.WriteToBigQuery(
        gcs_project + ":MetLife.bases_iniciales",
        schema=TABLE_SCHEMA,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

    jobObject = pipeline.run()

    return ("Successful beam process")
Example #5
def run(p, input_file, output_file):
    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(input_file)

    counts = (lines
              | 'split' >>
              (beam.ParDo(WordExtractingDoFn()).with_output_types(bytes))
              |
              'count' >> beam.ExternalTransform('pytest:beam:transforms:count',
                                                None, EXPANSION_SERVICE_ADDR))

    # Format the counts into a PCollection of strings.
    def format_result(word_count):
        (word, count) = word_count
        return '%s: %d' % (word, count)

    output = counts | 'format' >> beam.Map(format_result)

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write' >> WriteToText(output_file)

    result = p.run()
    result.wait_until_finish()
Example #6
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        required=True,
                        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | 'ReadFromText' >> ReadFromText(known_args.input)
        results = (
            lines
            | 'Split' >> beam.Map(split)
            | 'Group' >> beam.GroupByKey()
            | 'CPFirstIntervalSkew' >> beam.Map(skew)
            | 'FormatResult' >> beam.Map(format_result)
        )
        results | 'WriteToText' >> WriteToText(known_args.output)
Example #7
def run(archivo, mifecha):

    gcs_path = "gs://ct-fanalca"  #Definicion de la raiz del bucket
    gcs_project = "contento-bi"

    mi_runer = ("DirectRunner",
                "DataflowRunner")[socket.gethostname() == "contentobi"]
    pipeline = beam.Pipeline(
        runner=mi_runner,
        argv=[
            "--project", gcs_project, "--staging_location",
            ("%s/dataflow_files/staging_location" % gcs_path),
            "--temp_location",
            ("%s/dataflow_files/temp" % gcs_path), "--output",
            ("%s/dataflow_files/output" % gcs_path), "--setup_file",
            "./setup.py", "--max_num_workers", "5", "--subnetwork",
            "https://www.googleapis.com/compute/v1/projects/contento-bi/regions/us-central1/subnetworks/contento-subnet1"
        ])
    lines = pipeline | 'Lectura de Archivo HONDA-DIGITAL' >> ReadFromText(
        archivo, skip_header_lines=1)
    transformed = (
        lines
        | 'Formatear Data HONDA-DIGITAL' >> beam.ParDo(formatearData(mifecha)))
    transformed | 'Escritura a BigQuery fanalca HONDA-DIGITAL' >> beam.io.WriteToBigQuery(
        gcs_project + ":fanalca.asignacion_digital",
        schema=TABLE_SCHEMA,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

    # transformed | 'Borrar Archivo' >> FileSystems.delete('gs://ct-avon/prejuridico/AVON_INF_PREJ_20181111.TXT')
    # 'Eliminar' >> FileSystems.delete (["archivos/Info_carga_avon.1.txt"])

    jobObject = pipeline.run()
    # jobID = jobObject.job_id()

    return ("Corrio Full HD")
Example #8
def run(argv=None):
    """Pipeline for reading data from a Cloud Storage bucket and writing the results to BigQuery"""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input', help='File to read in.')
    parser.add_argument(
        '--output',
        dest='output',
        help=
        'BigQuery output dataset and table name in the format dataset.tablename'
    )
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        # 1. Read in the file from Google Cloud Storage. Hint: remember there is a header line in the CSV.
        input_rows = p | 'ReadFile' >> ReadFromText(known_args.input,
                                                    skip_header_lines=1)

        # 2. Convert the rows into Key, Value pairs. Hint: use tuples

        # 3. For each Key, sum up the values. Hint: CombinePerKey(sum)

        # 4. Format the results as Python dictionaries for writing to BigQuery

        # 5. Write the output to BigQuery
        (input_rows
         | 'WriteToBigQuery' >> WriteToBigQuery(
             known_args.output,
             schema='department:STRING, value:FLOAT',
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

        p.run().wait_until_finish()
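
A minimal sketch of how the numbered hints above might be filled in, assuming each CSV row has the form department,value (matching the schema used in step 5); the intermediate names kv_pairs, summed and formatted are illustrative only, and step 5 would then write formatted instead of input_rows:

        # 2. Convert the rows into (department, value) key/value tuples
        #    (assumes a two-column comma-separated layout)
        kv_pairs = input_rows | 'ToKV' >> beam.Map(
            lambda row: (row.split(',')[0], float(row.split(',')[1])))

        # 3. For each key, sum up the values
        summed = kv_pairs | 'SumPerKey' >> beam.CombinePerKey(sum)

        # 4. Format the results as Python dictionaries for writing to BigQuery
        formatted = summed | 'ToDict' >> beam.Map(
            lambda kv: {'department': kv[0], 'value': kv[1]})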
Example #9
def run(argv=None):
    """Main entry point; defines and runs the PDkit_score pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        default='tremor_data_with_user.csv',
                        help='Input file to process.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        events = ( p
                   | 'Read' >> ReadFromText(known_args.input)
                   | 'AddEventTimestamps' >> beam.ParDo(TransformTimestampDoFn())
                   )

        a = ( events
              | 'ParseAccEventFn' >> beam.ParDo(ParseAccEventFn())
              | 'CalculateTeamScores' >> CalculateTeamScores(
                    args.team_window_duration, args.allowed_lateness)
              )

        a | beam.Map(print)
Example #10
def simple_beam(text_input=parameter[PathStr],
                text_output=beam_output):
    class RequireSomeParameter(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument("--param", dest="param", required=True)

    dataflow_pipeline = current_task().build_pipeline(["--param", "2"])

    # Read the text file[pattern] into a PCollection.
    lines = dataflow_pipeline | "read" >> ReadFromText(text_input)

    x = dataflow_pipeline._options.view_as(RequireSomeParameter)
    assert x.param == "2"

    counts = (lines
              | "pair_with_one" >> beam.Map(lambda x: (x, 1))
              | "group" >> beam.GroupByKey())

    counts | "write" >> WriteToText(text_output)
    target(text_output).mkdir()

    result = dataflow_pipeline.run()
    result.wait_until_finish()
Example #11
def run(argv=None):
  """Runs the workflow computing total points from a collection of matches."""

  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      required=True,
                      help='Input file to process.')
  parser.add_argument('--output',
                      required=True,
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  (p  # pylint: disable=expression-not-assigned
   | 'read' >> ReadFromText(known_args.input, coder=JsonCoder())
   | 'points' >> beam.FlatMap(compute_points)
   | beam.CombinePerKey(sum)
   | 'write' >> WriteToText(known_args.output, coder=JsonCoder()))
  p.run()
Example #12
def runMyPipeline(argv=None):
  """Main entry point"""

  # Parse arguments, prepare pipeline options
  parser = argparse.ArgumentParser()
  known_args, pipeline_args = parser.parse_known_args(argv)

  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True

  # Define the dataflow pipeline
  p = beam.Pipeline(options=pipeline_options)

  # Import local or cloud data and write contents to local disk
  dataset1 = '/Users/jdoe/mydataset.txt'
  dataset2 = 'gs://apache-beam-samples/shakespeare/kinglear.txt'

  dataout = (p
    | 'dataset_import' >> ReadFromText(dataset1)
    | 'write_to_disk' >> WriteToText('outputfile'))

  # Execute the pipeline
  result = p.run()
  result.wait_until_finish()
Example #13
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        default='gs://BUCKET-NAME/output',
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    options = PipelineOptions()
    options.view_as(StandardOptions).runner = 'DataflowRunner'
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'PROJECT-NAME'
    google_cloud_options.staging_location = 'gs://BUCKET-NAME/staging'
    google_cloud_options.temp_location = 'gs://BUCKET-NAME/temp'
    google_cloud_options.job_name = 'JOBNAME-USERNAME-DATETIME'
    with beam.Pipeline(options=options) as p:
        counts = (
            p
            | ReadFromText(known_args.input)
            | beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x)
                           )  # Find all word matches
            | beam.Map(lambda x: (x, 1))  # Create tuple (word,1)
            | beam.CombinePerKey(sum)  # Reduce by key i.e. the word
            | beam.ParDo(Split())
            | beam.io.Write(
                beam.io.BigQuerySink('PROJECT-NAME:DATASET-NAME.TABLE-NAME',
                                     schema='word:STRING,freq:INTEGER',
                                     write_disposition=beam.io.
                                     BigQueryDisposition.WRITE_TRUNCATE,
                                     create_disposition=beam.io.
                                     BigQueryDisposition.CREATE_IF_NEEDED)))
Example #14
def run():
    pipeline = beam.Pipeline()
    output = 'wordcount-multiple-outputs'
    output_suffix = '.txt'

    lines = pipeline | 'read' >> ReadFromText('data/king_arthur.txt')

    # Split lines into several outputs.
    split_lines_result = (lines | beam.ParDo(
        SplitLinesToWordsFn()).with_outputs(
            SplitLinesToWordsFn.SIDE_OUTPUT_TAG_SHORT_WORDS,
            SplitLinesToWordsFn.SIDE_OUTPUT_TAG_CHARACTER_COUNT,
            main='words'))

    # Multiple ways to access result.
    words, _, _ = split_lines_result
    short_words = split_lines_result[
        SplitLinesToWordsFn.SIDE_OUTPUT_TAG_SHORT_WORDS]
    character_count = split_lines_result.tag_character_count

    # Write character count.
    (character_count
     | 'pair_with_key' >> beam.Map(lambda x: ('chars_temp_key', x))
     | beam.GroupByKey()
     | 'count_chars' >> beam.Map(lambda kv: sum(kv[1]))
     | 'write_chars' >> WriteToText(output + '-chars', output_suffix))

    # Write short word counts.
    (short_words | 'count_short_words' >> CountWords()
     | 'write_short_words' >> WriteToText(output + '-short', output_suffix))

    # Write word counts.
    (words | 'count_words' >> CountWords()
     | 'write_words' >> WriteToText(output + '-words', output_suffix))

    pipeline.run().wait_until_finish()
Example #15
def run(argv=None):
    """Build beam pipeline and run it."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        required=True,
                        help='Input file to process.')
    parser.add_argument('--output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        (p
         | 'read_input' >> ReadFromText(known_args.input, coder=JsonCoder())
         | 'apply_model' >> beam.ParDo(
             MLModelPredictOperation(module_name="iris_model.iris_predict",
                                     class_name="IrisModel"))
         | 'write_output' >> WriteToText(known_args.output, coder=JsonCoder()))

        result = p.run()
        result.wait_until_finish()
Example #16
def run(argv=None):

    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        required=True,
                        help='Input file to process.')
    parser.add_argument('--output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    (p  # pylint: disable=expression-not-assigned
     | 'read' >> ReadFromText(known_args.input)
     | 'split' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
     | 'TopPerPrefix' >> TopPerPrefix(5)
     | 'format' >> beam.Map(lambda kv: '%s: %s' % kv)
     | 'write' >> WriteToText(known_args.output))
    p.run()
Example #17
def run(argv=None):

    num_docs = 100
    num_hash = 100
    prime = 4294967311

    N = 2**32
    #generating random hash function parameters
    a = []
    b = []
    for it in range(num_hash):
        a.append(random.randint(1, int(N / 2)))
        b.append(random.randint(1, int(N / 2)))

    def apply(minsig, a, b, bands, num_hash):
        rows = num_hash / bands

        result = []
        for i in range(num_hash):

            res = 2**33
            for hShingle in minsig[1]:
                res = min(res, (a[i] * hShingle + b[i]) % prime)

            result.append(res)

        for j in range(bands):

            lower = int(j * rows)
            upper = int(j * rows + rows)
            yield (result[lower:upper], minsig[0])

    def showCandidatePairs(buckets):
        if len(buckets[1]) > 1:
            for subset in itertools.combinations(buckets[1], 2):
                if subset[0] > subset[1]:
                    yield (list((subset[1], subset[0])), 1)
                else:
                    yield (list(subset), 1)

    def shing(tup, k):
        result = set()

        line, index = tup.split(',')
        words = line.split()
        for i in range(len(words) - k + 1):

            # generate k-shingles from consecutive words
            shingle = ""
            for j in range(i, i + k):
                shingle += words[j] + " "

            hShingle = binascii.crc32(shingle.encode('utf-8')) & 0xffffffff
            if hShingle not in result:

                result.add(hShingle)

                yield (index, hShingle)

    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).runner = 'DirectRunner'
    inde = 0
    for filename in glob.glob('corpusdataset.txt'):
        with beam.Pipeline(options=pipeline_options) as p:

            lines = p | ReadFromText(filename)
            lsh = (lines | "Read From Text" >>
                   (beam.FlatMap(lambda x: shing(x, 2)))
                   | "gruping" >> beam.GroupByKey() | "banding and hashing" >>
                   beam.FlatMap(lambda x: apply(x, a, b, 25, num_hash))
                   | "same buckets" >> beam.GroupByKey() | "shoe final result"
                   >> beam.FlatMap(lambda x: showCandidatePairs(x))
                   | "en son" >> beam.GroupByKey())

            lsh | WriteToText("outcorpus")
Example #18
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    p = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(known_args.input)

    # Count the occurrences of each word.
    def count_ones(word_ones):
        (word, ones) = word_ones
        return (word, sum(ones))

    counts = (lines
              | 'split' >>
              (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
              | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
              | 'group' >> beam.GroupByKey()
              | 'count' >> beam.Map(count_ones))

    # Format the counts into a PCollection of strings.
    def format_result(word_count):
        (word, count) = word_count
        return '%s: %d' % (word, count)

    output = counts | 'format' >> beam.Map(format_result)

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write' >> WriteToText(known_args.output)

    result = p.run()
    result.wait_until_finish()

    # Do not query metrics when creating a template which doesn't run
    if (not hasattr(result, 'has_job')  # direct runner
            or result.has_job):  # not just a template creation
        empty_lines_filter = MetricsFilter().with_name('empty_lines')
        query_result = result.metrics().query(empty_lines_filter)
        if query_result['counters']:
            empty_lines_counter = query_result['counters'][0]
            logging.info('number of empty lines: %d',
                         empty_lines_counter.result)

        word_lengths_filter = MetricsFilter().with_name('word_len_dist')
        query_result = result.metrics().query(word_lengths_filter)
        if query_result['distributions']:
            word_lengths_dist = query_result['distributions'][0]
            logging.info('average word length: %d',
                         word_lengths_dist.result.mean)
Example #19
def main(argv=None, save_main_session=True):
    """Main entry point; defines and runs the wordcount pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument(
        '--output',
        dest='output',
        # CHANGE 1/6: The Google Cloud Storage path is required
        # for outputting the results.
        default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',
        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        # CHANGE 2/6: (OPTIONAL) Change this to DataflowRunner to
        # run your pipeline on the Google Cloud Dataflow Service.
        '--runner=DirectRunner',
        # CHANGE 3/6: (OPTIONAL) Your project ID is required in order to
        # run your pipeline on the Google Cloud Dataflow Service.
        '--project=SET_YOUR_PROJECT_ID_HERE',
        # CHANGE 4/6: (OPTIONAL) The Google Cloud region (e.g. us-central1)
        # is required in order to run your pipeline on the Google Cloud
        # Dataflow Service.
        '--region=SET_REGION_HERE',
        # CHANGE 5/6: Your Google Cloud Storage path is required for staging local
        # files.
        '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
        # CHANGE 6/6: Your Google Cloud Storage path is required for temporary
        # files.
        '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
        '--job_name=your-wordcount-job',
    ])

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    with beam.Pipeline(options=pipeline_options) as p:

        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input)

        # Count the occurrences of each word.
        counts = (lines
                  | 'Split' >> (beam.FlatMap(lambda x: re.findall(
                      r'[A-Za-z\']+', x)).with_output_types(str))
                  | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
                  | 'GroupAndSum' >> beam.CombinePerKey(sum))

        # Format the counts into a PCollection of strings.
        def format_result(word_count):
            (word, count) = word_count
            return '%s: %s' % (word, count)

        output = counts | 'Format' >> beam.Map(format_result)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | WriteToText(known_args.output)
Example #20
    winner = values[2]
    entity = values[3]

    if 'ACTOR' in category or 'ACTRESS' in category:
        return [(entity, 1)]

# PTransform: sum up nominations for a given actor/actress
class ActorSumFn(beam.DoFn):
    def process(self, element):
        actor, counts = element
        total_count = len(counts)
        return [(actor, total_count)]

# Create a Pipeline using a local runner for execution
with beam.Pipeline('DirectRunner') as p:

    # create a PCollection from the file contents.
    in_pcoll = p | 'Read File' >> ReadFromText('oscars_input.tsv')

    # apply ParDo to the PCollection 
    actor_pcoll = in_pcoll | 'Extract Actor' >> beam.ParDo(ActorCountFn())

    # apply GroupByKey to the PCollection
    group_pcoll = actor_pcoll | 'Group by Actor' >> beam.GroupByKey()

    # apply ParDo to the PCollection
    out_pcoll = group_pcoll | 'Sum up Counts' >> beam.ParDo(ActorSumFn())

    # write PCollection to a file
    out_pcoll | 'Write File' >> WriteToText('oscars_output.txt')
Example #21
import sys
import os
import apache_beam as beam

from apache_beam.io import ReadFromText
from pprint import pprint

if __name__ == '__main__':
    file = sys.argv[1]
    if not os.path.isfile(file):
        print('File not found')
    else:
        with beam.Pipeline() as pipeline:
            # Reading
            lines = pipeline | ReadFromText(file)

            # Mapping
            measured_paths = (
                lines
                | "Measure" >> beam.Map(
                    lambda line: (line, len(line.split('/')))
                    )
            )

            # Reducing
            max_value = (
                measured_paths
                | "Max" >> beam.CombineGlobally(
                    lambda paths: max(paths, key=lambda p: p[1])
                    )
                | "Print" >> beam.Map(pprint)
            )
Example #22
options = {
    'runner': 'DataflowRunner',
    'job_name': 'nomination-count-8',
    'project': PROJECT_ID,
    'temp_location': BUCKET + '/temp',
    'staging_location': BUCKET + '/staging',
    'machine_type': 'n1-standard-1',  # machine types listed here: https://cloud.google.com/compute/docs/machine-types
    'num_workers': 1
}
opts = PipelineOptions(flags=[], **options)

with beam.Pipeline('DataflowRunner', options=opts) as p:

    # create PCollection from the file contents
    in_pcoll = p | 'Read File' >> ReadFromText(DIR_PATH_IN + 'oscars_data.tsv')

    # apply ParDo with tagged outputs
    out_pcoll = in_pcoll | 'Extract Actor and Actress' >> beam.ParDo(
        ActorActressCountFn()).with_outputs(
            ActorActressCountFn.OUTPUT_TAG_ACTOR_COUNT,
            ActorActressCountFn.OUTPUT_TAG_ACTRESS_COUNT)

    actor_pcoll = out_pcoll[ActorActressCountFn.OUTPUT_TAG_ACTOR_COUNT]
    actress_pcoll = out_pcoll[ActorActressCountFn.OUTPUT_TAG_ACTRESS_COUNT]

    # write PCollections to files
    actor_pcoll | 'Write Actor File 1' >> WriteToText(DIR_PATH_OUT +
                                                      'actor_output.txt')
    actress_pcoll | 'Write Actress File 1' >> WriteToText(DIR_PATH_OUT +
                                                          'actress_output.txt')
Example #23
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        default='gs://big-data-pipe/country.csv',
                        help='Country Data file to process.')

    parser.add_argument('--project',
                        dest='project',
                        help='Google Cloud Project ID')

    parser.add_argument('--bucket',
                        dest='bucket',
                        help='Google Cloud Storage Bucket (gs://...)')

    parser.add_argument('--run_target',
                        dest='run_target',
                        default='local',
                        help='Where to run job (local,gcp)')

    parser.add_argument(
        '--schema',
        dest='schema',
        default='/Users/sachinholla/Documents/GCP/advertiser.schema',
        help='Schema file (from bq)')

    parser.add_argument('--table',
                        dest='table',
                        default='big-data-pipe:dblclick_data.advertiser',
                        help='Table')

    parser.add_argument('--output_dest',
                        dest='output_dest',
                        default='file',
                        help='Output location (file,bq)')

    known_args, pipeline_args = parser.parse_known_args(argv)

    print(known_args)

    extendList = list()
    if known_args.run_target == "gcp":
        runner = "DataflowRunner"
    else:
        runner = "DirectRunner"
    extendList.append("--runner=%s" % runner)

    if known_args.bucket and known_args.project and known_args.bucket.startswith(
            "gs:"):
        # build the output location
        known_args.output = known_args.bucket + "/" + OUTPUT_PREFIX
        extendList.append("--project=%s" % known_args.project)
        staging_location = known_args.bucket + "/" + STAGING_DIR
        temp_location = known_args.bucket + "/" + TEMP_DIR
        extendList.append("--staging_location=%s" % staging_location)
        extendList.append("--temp_location=%s" % temp_location)
    elif known_args.run_target == "gcp":
        print("Can't proceed with invalid Bucket and/or Project")
        exit(1)
    else:
        known_args.output = "/tmp/output"

    from datetime import datetime
    from time import gmtime, strftime
    curr_time = str(datetime.now().strftime('%Y-%m-%dt%H-%M-%S'))
    #print(curr_time)
    job_name = JOB_NAME_PREFIX + "--" + curr_time  #+ "-jobname"

    #extendList.append("--job_name=%s" % (JOB_NAME_PREFIX+"--"+curr_time))
    extendList.append("--job_name=%s" % (job_name))

    #print(extendList)

    pipeline_args.extend(extendList)

    # setting the schema from the env. input vars
    #tsu.setSchema(known_args.schema)
    #tsu.setFixedSchema()

    print(known_args, pipeline_args)
    #exit()

    # headers for impression data
    imp_headers_list = [
        "Time", "UserId", "AdvertiserId", "OrderId", "LineItemId",
        "CreativeId", "CreativeVersion", "CreativeSize", "AdUnitId",
        "CustomTargeting", "Domain", "CountryId", "Country", "RegionId",
        "Region", "MetroId", "Metro", "CityId", "City", "PostalCodeId",
        "PostalCode", "BrowserId", "Browser", "OSId", "OS", "OSVersion",
        "BandwidthId", "BandWidth", "TimeUsec", "AudienceSegmentIds",
        "Product", "RequestedAdUnitSizes", "BandwidthGroupId", "MobileDevice",
        "MobileCapability", "MobileCarrier", "IsCompanion",
        "TargetedCustomCriteria", "DeviceCategory", "IsInterstitial",
        "EventTimeUsec2", "YieldGroupNames", "YieldGroupCompanyId",
        "MobileAppId", "RequestLanguage", "DealId", "DealType", "AdxAccountId",
        "SellerReservePrice", "Buyer", "Advertiser", "Anonymous",
        "ImpressionId"
    ]
    cntry_headers_list = [
        "CountryId", "CountryAbbrev", "CountryName", "CountryLocale",
        "CountryCurrency"
    ]
    advrtsr_headers_list = [
        "AdvertiserId", "AdvertiserName", "AdvertiserType", "AdvertiserRegion"
    ]

    #dept_headers = [("dept_id","dept_name","dept_start_year")]
    #dept_headers_list = ["dept_id","dept_name","dept_start_year"]
    #emp_headers_list = ["emp_id","emp_name","emp_dept","emp_country","emp_gender","emp_birth_year","emp_salary"]

    def buildSchemaString(header_list, prefix="pfx"):
        schema_string = ""
        string_type = "STRING"
        sep = ""
        for header in header_list:
            if schema_string:
                sep = ", "
            schema_string += "%s%s_%s:%s" % (sep, prefix, header.lower(),
                                             string_type)
        return schema_string

    def getDBSchema():
        # start building the schema with the assumption that all fields are string
        # 1. start with the impressions
        table_schema = buildSchemaString(imp_headers_list,
                                         prefix="impressions")
        table_schema += ", " + buildSchemaString(cntry_headers_list,
                                                 prefix="country")
        table_schema += ", " + buildSchemaString(advrtsr_headers_list,
                                                 prefix="advertiser")

        #print(pcollData)
        #keys_only = pcollData | "keys" >> beam.Map(lambda (k,v): k)
        return (table_schema)

    class GenericFormatDoFn(beam.DoFn):
        def process(self, element, headers_list=[], prefix="pfx"):
            #print(element)
            #print(headers_list)
            if not headers_list:
                print("Exception!!! No headers list provided!")
                return

            #print(len(element),len(headers_list))
            formatted_data = {}
            for index, elem in enumerate(element):
                #print(index,elem,headers_list[index])
                formatted_data["%s_%s" %
                               (prefix,
                                headers_list[index].lower())] = str(elem)
            return [formatted_data]

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    if known_args.run_target == "local":
        imp_file = "/Users/sachinholla/Documents/GCP/impressions_data.csv"
        cntry_file = "/Users/sachinholla/Documents/GCP/country_data.csv"
        advrtsr_file = "/Users/sachinholla/Documents/GCP/advertiser_data.csv"
    else:
        imp_file = known_args.bucket + "/input/" + "impressions_data.csv"
        cntry_file = known_args.bucket + "/input/" + "country_data.csv"
        advrtsr_file = known_args.bucket + "/input/" + "advertiser_data.csv"

    with beam.Pipeline(options=pipeline_options) as p:

        # step 1. retrieve the impression data
        imp_data = (
            p
            | "ImpressionsRead" >> ReadFromText(imp_file)
            | 'ImpressionsSplit' >> beam.Map(lambda x: x.split(","))
            #| 'ImpressionsSplit' >> beam.Map(lambda x: re.findall(r'[0-9A-Za-z\:\;\|\-\=\.\&\ \#\_\']?', x))
            | 'ImpressionsFormat' >> beam.ParDo(GenericFormatDoFn(),
                                                headers_list=imp_headers_list,
                                                prefix="impression")
            | 'ImpressionsMapping' >>
            beam.Map(lambda imp: (imp["impression_countryid"], imp)))

        if known_args.run_target == "local":
            imp_data | "ImpressionsWrite" >> WriteToText("impressions")
        else:
            imp_data | "ImpressionsWrite" >> WriteToText(known_args.output +
                                                         "-impressions")

        # step 2. retrieve the Country data
        cntry_data = (
            p
            | "CountriesRead" >> ReadFromText(cntry_file)
            | 'CountriesSplit' >> beam.Map(lambda x: x.split(","))
            #| 'ImpressionsSplit' >> beam.Map(lambda x: re.findall(r'[0-9A-Za-z\:\;\|\-\=\.\&\ \#\_\']?', x))
            | 'CountriesFormat' >> beam.ParDo(GenericFormatDoFn(),
                                              headers_list=cntry_headers_list,
                                              prefix="country")
            | 'CountriesMapping' >>
            beam.Map(lambda cntry: (cntry["country_countryid"], cntry)))

        if known_args.run_target == "local":
            cntry_data | "CountriesWrite" >> WriteToText("countries")
        else:
            cntry_data | "CountriesWrite" >> WriteToText(known_args.output +
                                                         "-countries")

        # step 3. retrieve the Advertiser data
        advrtsr_data = (
            p
            | "AdvertisersRead" >> ReadFromText(advrtsr_file)
            | 'AdvertisersSplit' >> beam.Map(lambda x: x.split(","))
            #| 'ImpressionsSplit' >> beam.Map(lambda x: re.findall(r'[0-9A-Za-z\:\;\|\-\=\.\&\ \#\_\']?', x))
            | 'AdvertisersFormat' >> beam.ParDo(
                GenericFormatDoFn(),
                headers_list=advrtsr_headers_list,
                prefix="advertiser")
            | 'AdvertisersMapping' >> beam.Map(
                lambda advrtsr: (advrtsr["advertiser_advertiserid"], advrtsr)))

        if known_args.run_target == "local":
            advrtsr_data | "AdvertisersWrite" >> WriteToText("advertisers")
        else:
            advrtsr_data | "AdvertisersWrite" >> WriteToText(
                known_args.output + "-advertisers")

        # step 4: now combine impressions and country data
        cntry_side_input = beam.pvalue.AsDict(cntry_data)

        def join_imp_cntry(impression, cntry_dict):
            imp = impression[1]

            if impression[0] in cntry_dict.keys():
                cntry = cntry_dict[impression[0]]
            else:
                print("No Country Found!")
                return (imp)

            imp.update(cntry)
            return (imp)

        joined_dicts = (imp_data
                        | 'JoiningCountry' >> beam.Map(
                            join_imp_cntry, cntry_dict=cntry_side_input))
        if known_args.run_target == "local":
            joined_dicts | "CountryJoined" >> WriteToText("CountryJoined")
        else:
            joined_dicts | "CountryJoined" >> WriteToText(known_args.output +
                                                          "-CountryJoined")

        # step 5: now combine impressions and advertiser data

        # need to first re-key the impressions data to be on AdvertiserId to enable the join
        rekeyed_imp_data = (
            joined_dicts
            | 'ImpressionsReMapping' >>
            beam.Map(lambda impression:
                     (impression["impression_advertiserid"], impression)))

        advrtsr_side_input = beam.pvalue.AsDict(advrtsr_data)

        def join_imp_advrtsr(impression, advrtsr_dict):
            empty_advrtsr = {}
            empty_advrtsr['advertiser_advertiserid'] = ''
            empty_advrtsr['advertiser_advertisername'] = ''
            empty_advrtsr['advertiser_advertiserregion'] = ''
            empty_advrtsr['advertiser_advertisertype'] = ''

            imp = impression[1]

            if impression[0] in advrtsr_dict.keys():
                advrtsr = advrtsr_dict[impression[0]]
            else:
                #print("No Advertiser Found!")
                advrtsr = empty_advrtsr

            imp.update(advrtsr)
            return (imp)

        joined_dicts = (rekeyed_imp_data
                        | 'JoiningAdvertiser' >> beam.Map(
                            join_imp_advrtsr, advrtsr_dict=advrtsr_side_input))

        if known_args.run_target == "local":
            print("Writing out final dataset")
            joined_dicts | "AdvertiserJoined" >> WriteToText(
                "AdvertiserJoined")
        else:
            joined_dicts | "AdvertiserJoined" >> WriteToText(
                known_args.output + "-AdvertiserJoined")

        if known_args.output_dest == "bq":
            print("updating bigquery table")
            #getDBSchema()
            joined_dicts | 'Write' >> beam.io.WriteToBigQuery(
                "big-data-pipe:dblclick_data.gcp_demo",
                schema=getDBSchema(),
                create_disposition=beam.io.BigQueryDisposition.
                CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)
Example #24
    state, year, month = key.split('-')
    rain = data['chuvas'][0]
    dengue = data['dengue'][0]

    return (state, year, month, rain, dengue)


def prepare_csv(element, delimiter=';'):
    return f"{delimiter}".join([str(e) for e in element])


# dengue is a PCollection
dengue = (
    pipeline
    | "Leitura do dataset de dengue" >> ReadFromText(
        './data/sample_casos_dengue.txt', skip_header_lines=1)
    | "From text to list" >> beam.Map(text_to_list)
    | "From list to dict" >> beam.Map(list_to_dict, columns)
    | "Add year_month" >> beam.Map(treat_date)
    | "Create key by state" >> beam.Map(key_uf)
    | "GroupBy State" >> beam.GroupByKey()
    | "Unzip dengue cases" >> beam.FlatMap(
        dengue_cases)  # allows the use of yield
    | "Sum of cases by key" >> beam.CombinePerKey(sum)
    # | "Show results" >> beam.Map(print)
)

chuvas = (
    pipeline
    | "Read rain dataset" >> ReadFromText('./data/sample_chuvas.csv',
                                          skip_header_lines=1)
Example #25
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    ###############################################
    # (1) Create the pipelines
    ###############################################

    # First create a PipelineOptions object.
    # It can hold various options, such as the pipeline runner that executes
    # the pipeline and any runner-specific settings.
    pipeline_options = PipelineOptions(pipeline_args)

    # Example of editing the created PipelineOptions object directly.
    # Enable the save_main_session option because a DoFn transform is used.
    pipeline_options.view_as(SetupOptions).save_main_session = True

    # Create the pipelines from the options
    p = beam.Pipeline(options=pipeline_options)  # pipeline: text in -> text out
    p2 = beam.Pipeline(options=pipeline_options)  # pipeline: BigQuery in -> text out

    ###############################################
    # (2) Set up the transforms
    ###############################################

    # Set up transforms on p
    lines = p | 'read' >> ReadFromText(known_args.input)

    # Sample method used to produce the file output
    def add(line):
        num = int(line.strip())
        return num * 3

    counts = lines | 'add' >> beam.Map(add)
    counts | 'write' >> WriteToText(known_args.output)

    # Set up transforms on p2
    query = 'select * from babynames.names2012 limit 1000'
    p2 | 'read' >> Read(beam.io.BigQuerySource(project='gcp-project-210712', use_standard_sql=False, query=query)) \
       | 'write' >> WriteToText('gs://gcp_dataflowsample/query_result.txt', num_shards=1)

    ###############################################
    # (3) Run the pipeline
    ###############################################

    #result = p.run()
    result2 = p2.run()

    # Wait for the run to finish.
    # Without this call, execution simply falls through.
    # With DataflowRunner, Ctrl-C does not stop the pipeline; it has to be
    # stopped from the Google Cloud console.
    # This line blocks until the result finishes; without it, nothing after
    # this point would wait for the pipeline.
    #result.wait_until_finish()
    result2.wait_until_finish()
Example #26
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_mode',
                        default='file',
                        help='Streaming input or file based batch input')

    for ticker in TICKER_LIST:
        parser.add_argument(
            '--input_{}'.format(ticker),
            default='{}_hist.csv'.format(ticker),
            help=
            'Cloud Pub/Sub topic of tick market data for a stock, fall back to flat csv'
        )

    parser.add_argument(
        '--output_topic',
        default='/tmp/trading_signals.txt',
        help='Topic of output trading signals in Google Cloud Pub/Sub')

    known_args, pipeline_args = parser.parse_known_args(argv)
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    if known_args.input_mode == 'stream':
        pipeline_options.view_as(StandardOptions).streaming = True

    with beam.Pipeline(options=pipeline_options) as p:

        # Read input
        input_stage = {}
        for ticker in TICKER_LIST:
            if known_args.input_mode == 'stream':
                input_ticks = (p | beam.io.ReadFromPubSub(
                    topic=known_args.input_topic).with_output_types(
                        six.binary_type))
            else:
                input_ticks = (p | 'Read: %s' % ticker >> ReadFromText(
                    getattr(known_args, 'input_%s' % ticker)))

            input_stage[ticker] = (
                input_ticks
                | 'decode: %s' % ticker >> beam.Map(
                    lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)
                | 'Filter: %s' % ticker >>
                beam.Filter(lambda row: row.split(',')[0] != 'date')
                |
                'Add Timestamp: %s' % ticker >> beam.ParDo(AddTimestampDoFn())
                | 'Window: %s' % ticker >> beam.WindowInto(
                    window.SlidingWindows(size=SECONDS_IN_1_DAY * 10,
                                          period=SECONDS_IN_1_DAY))
                |
                'Pair: %s' % ticker >> beam.ParDo(CorrelationPairDoFn(ticker)))

        # Group together all entries under the same ticker
        grouped = input_stage | 'group_by_name' >> beam.CoGroupByKey()

        correlations = (grouped
                        | 'Calculate pair correlation' >>
                        beam.Map(calculate_correlation_pair))

        if known_args.input_mode == 'stream':
            trading_signals = (
                correlations | 'Filter correlation threshold' >> beam.Filter(
                    lambda x: x[1] < CORRELATION_THRESHOLD).with_output_types(
                        six.binary_type))
            # pylint: disable=expression-not-assigned
            trading_signals | beam.io.WriteToPubSub(known_args.output_topic)
        else:
            trading_signals = (
                correlations | 'Filter correlation threshold' >>
                beam.Filter(lambda x: x[1] < CORRELATION_THRESHOLD))
            # pylint: disable=expression-not-assigned
            trading_signals | 'WriteOutput' >> WriteToText(
                known_args.output_topic)
Example #27
def create_pipeline(embedding_model,
                    files_input_list=None,
                    tfrecord_input=None,
                    embeddings_output=None,
                    stats_output=None,
                    feature_key=None,
                    name='all_train_embeddings',
                    batch_size=64):
  """Returns a pipeline that extracts stats from audio examples.

  Args:
    embedding_model: ModelConfig namedtuple; contains model ckpt, embedding
      dimension size and step size.
    files_input_list: List of files from where the audio is to be read.
    tfrecord_input: Path to a tfrecord containing audio.
    embeddings_output: location to where the embeddings should be written.
    stats_output: location to where the stats should be written.
    feature_key: tf.example feature that contains the samples that are to be
      processed.
    name: Identifier for the set of examples processed in this pipeline.
    batch_size: batch_size.

  Returns:
    The beam pipeline.
  """
  pipeline = beam.Pipeline()
  if files_input_list:
    examples = (
        pipeline
        | 'Read File List' >> ReadFromText(files_input_list)
        | 'Read Files' >> beam.ParDo(ReadWavFiles()))
  else:
    examples = (
        pipeline
        | 'Read Examples' >> ReadFromTFRecord(
            tfrecord_input,
            value_coder=beam.coders.ProtoCoder(tf.train.Example))
        | 'Add Keys' >> beam.ParDo(AddKey()))
  embeddings = (
      examples
      | 'Batched Inference' >> beam.ParDo(
          BatchedInference(
              batch_size=batch_size,
              model=embedding_model,
              feature_key=feature_key)).with_outputs('raw', main='examples'))
  if stats_output:
    _ = (
        embeddings.raw
        | 'Combine Embeddings' >> beam.CombineGlobally(
            ComputeMeanAndCovariance(key_name=name, embedding_dim=128))
        | 'DropKey' >> beam.ParDo(DropKey())
        | 'Write Stats' >> WriteToTFRecord(
            stats_output,
            shard_name_template='',
            coder=beam.coders.ProtoCoder(tf.train.Example)))
  if embeddings_output:
    _ = (
        embeddings.examples
        | 'DropKey' >> beam.ParDo(DropKey())
        | 'Write Examples' >> WriteToTFRecord(
            embeddings_output,
            shard_name_template='',
            coder=beam.coders.ProtoCoder(tf.train.Example)))
  return pipeline
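
A hypothetical usage sketch for create_pipeline; the model_config value and the gs:// paths below are placeholders rather than values from the original code:

# Assumed: model_config is a ModelConfig namedtuple (checkpoint, embedding dim, step size).
pipeline = create_pipeline(
    embedding_model=model_config,
    files_input_list='gs://my-bucket/wav_file_list.txt',      # placeholder path
    embeddings_output='gs://my-bucket/embeddings.tfrecord',   # placeholder path
    stats_output='gs://my-bucket/stats.tfrecord',             # placeholder path
    name='all_train_embeddings',
    batch_size=64)
pipeline.run().wait_until_finish()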
Example #28
        # get the total transactions for one item
        return [(str(element[0]), sum(element[1]))]


# Print function, for printing output in command terminal- useful for debugging
class Printer(beam.DoFn):
    def process(self, data_item):
        print(data_item)


# Pipeline

# Read the flights data
raw_flights = (
    p
    | "flights:read" >> ReadFromText(
        "C:/Users/mirel/Desktop/flights_small.csv", skip_header_lines=1)
    | beam.Map(lambda record: (record.split(','))))
# Turn it into a KV pair
flights_data = (raw_flights | beam.ParDo(FlightKeys()))
# Read the Weather Data, and turn it into KV pair
weather = (p
           | "readweather" >> ReadFromText(
               "C:/Users/mirel/Desktop/weather.csv", skip_header_lines=1)
           | beam.Map(lambda record: (record.split(',')))
           | beam.ParDo(SplitWeather()))
"""
Turn Flights and weather data into a dictionary, Group them by their common key, Filter out values that don't match,
and extract values which are a match
"""
results = (
    (weather, flights_data)
Example #29
def get_recently_active_users(p, file):
    return (p
            | "Read recently active users" >> ReadFromText(file)
            | "Parse recently active users" >> beam.ParDo(
                ParseRecentlyActiveUsersCSV()))
Example #30
def get_recent_questions(p, file):
    return (p
            | "Read recent questions" >> ReadFromText(file)
            |
            "Parse recent questions" >> beam.ParDo(ParseRecentQuestionsCSV()))