def run(argv=None):
    parser = argparse.ArgumentParser()

    parser.add_argument("--file_location", dest="file_location", required=True)
    parser.add_argument("--schema_location",
                        dest="schema_location",
                        required=True)
    parser.add_argument("--output_table", dest="output_table", required=True)

    known_args, pipeline_args = parser.parse_known_args(argv)

    file_location = known_args.file_location
    schema_location = known_args.schema_location
    output_table = known_args.output_table

    cloud_storage_to_bq = CloudStorageToBigQuery()

    with open("./schemas/customers.json") as f:
        schema_dict = json.load(f)
        table_schema = json_to_schema(schema_dict)

    p = beam.Pipeline(options=PipelineOptions(pipeline_args))
    (p
     | "Read CSV file from cloud storage" >> ReadFromText(
         file_pattern=file_location, skip_header_lines=True)
     | 'CSV Row To BigQuery Row' >>
     beam.Map(lambda s: cloud_storage_to_bq.process_row(s, schema_dict))
     | "Write to BigQuery" >> bigquery.WriteToBigQuery(
         table=f"precis-digital-case-interview:part_1.{output_table}",
         schema=table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    p.run().wait_until_finish()
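
The json_to_schema helper is not shown in this snippet. A minimal sketch, assuming the schema JSON is a list of {"name", "type", "mode"} field definitions (the project's real helper may differ):

from apache_beam.io.gcp.internal.clients import bigquery as bq_messages

def json_to_schema(schema_dict):
    # Hypothetical reconstruction of the helper used above: builds a
    # BigQuery TableSchema from a list of field-definition dicts.
    table_schema = bq_messages.TableSchema()
    for field in schema_dict:
        field_schema = bq_messages.TableFieldSchema()
        field_schema.name = field["name"]
        field_schema.type = field["type"]
        field_schema.mode = field.get("mode", "NULLABLE")
        table_schema.fields.append(field_schema)
    return table_schema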
Example #2
def run(argv=None):
    """Run the beam pipeline."""
    args, pipeline_args = _parse_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    lines = p | "read qa files" >> ReadFromText(args.file_pattern)

    # The lines are not JSON, but the string representation of python
    # dictionary objects. Parse them with ast.literal_eval.
    json_objects = lines | "parsing dictionaries" >> beam.Map(ast.literal_eval)
    qa_tuples = json_objects | "create tuples" >> beam.FlatMap(
        partial(
            _create_tuples,
            min_words=args.min_words, max_words=args.max_words)
    )

    # Remove duplicate examples.
    qa_tuples |= "key by QA" >> beam.Map(lambda v: (v[1:], v))
    qa_tuples |= "group duplicates" >> beam.GroupByKey()
    qa_tuples |= "remove duplicates" >> beam.Map(lambda v: sorted(v[1])[0])

    # Create the examples.
    examples = qa_tuples | "create examples" >> beam.Map(
        lambda args: _create_example(*args)
    )
    examples = _shuffle_examples(examples)

    examples |= "split train and test" >> beam.ParDo(
        _TrainTestSplitFn(args.train_split)
    ).with_outputs(_TrainTestSplitFn.TEST_TAG, _TrainTestSplitFn.TRAIN_TAG)

    if args.dataset_format == _JSON_FORMAT:
        write_sink = WriteToText
        file_name_suffix = ".json"
        serialize_fn = json.dumps
    else:
        assert args.dataset_format == _TF_FORMAT
        write_sink = WriteToTFRecord
        file_name_suffix = ".tfrecord"
        serialize_fn = _features_to_serialized_tf_example

    for name, tag in [("train", _TrainTestSplitFn.TRAIN_TAG),
                      ("test", _TrainTestSplitFn.TEST_TAG)]:
        num_shards = (args.num_shards_train if name == "train"
                      else args.num_shards_test)
        serialized_examples = examples[tag] | (
            "serialize {} examples".format(name) >> beam.Map(serialize_fn))
        (
            serialized_examples
            | ("write " + name) >> write_sink(
                os.path.join(args.output_dir, name),
                file_name_suffix=file_name_suffix,
                num_shards=num_shards,
            )
        )

    result = p.run()
    result.wait_until_finish()
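
_TrainTestSplitFn is referenced but not defined in this snippet. A minimal sketch of such a DoFn with tagged outputs (the constructor argument and tag constants are assumptions, not the project's actual code):

import random

import apache_beam as beam
from apache_beam import pvalue

class _TrainTestSplitFn(beam.DoFn):
    # Hypothetical reconstruction: routes each element to a "train" or
    # "test" tagged output according to the requested train fraction.
    TRAIN_TAG = "train"
    TEST_TAG = "test"

    def __init__(self, train_split):
        self._train_split = train_split

    def process(self, element):
        tag = (self.TRAIN_TAG if random.random() < self._train_split
               else self.TEST_TAG)
        yield pvalue.TaggedOutput(tag, element)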
Example #3
File: textio_test.py  Project: wikier/beam
 def test_read_gzip_empty_file(self):
     filename = tempfile.NamedTemporaryFile(delete=False,
                                            prefix=tempfile.template).name
     pipeline = TestPipeline()
     pcoll = pipeline | 'Read' >> ReadFromText(
         filename, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
     assert_that(pcoll, equal_to([]))
     pipeline.run()
Example #4
 def test_read_gzip_empty_file(self):
     with TempDir() as tempdir:
         file_name = tempdir.create_temp_file()
         with TestPipeline() as pipeline:
             pcoll = pipeline | 'Read' >> ReadFromText(
                 file_name, 0, CompressionTypes.GZIP, True,
                 coders.StrUtf8Coder())
             assert_that(pcoll, equal_to([]))
Example #5
 def process(self, element):
     file = {
         'name':
         element['name'],
         'content':
         ReadFromText(element['file']) | apache_beam.combiners.ToList()
     }
     return [file]
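
Note that applying ReadFromText inside process like this does not actually read the file: a PTransform must be applied to a pipeline, not used as a plain value inside a DoFn. One way to read the referenced file per element (a sketch, assuming element['file'] is a path FileSystems can open and the files fit in memory):

import apache_beam as beam
from apache_beam.io.filesystems import FileSystems

class ReadNamedFile(beam.DoFn):
    # Sketch only: reads the file eagerly inside the DoFn and attaches its
    # lines to the output element.
    def process(self, element):
        with FileSystems.open(element['file']) as f:
            content = f.read().decode('utf-8').splitlines()
        yield {'name': element['name'], 'content': content}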
Example #6
def execute():
    with beam.Pipeline('DirectRunner') as p:
        (p 
            | 'ReadFile'      >> ReadFromText(file_pattern='./data/SMSSpamCollection')
            | 'Deduplicate'   >> beam.RemoveDuplicates()
            | 'Parse'         >> beam.FlatMap(parse)
            | 'Write'         >> beam.io.WriteToText('./data/Output.jsonl') 
        )
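
The parse function is defined elsewhere in the project. A minimal sketch, assuming the SMSSpamCollection format of one '<label>\t<message>' pair per line and that the output file is meant to hold JSON lines:

import json

def parse(line):
    # Hypothetical parser: splits '<label>\t<message>' and emits a JSON line.
    parts = line.split('\t', 1)
    if len(parts) == 2:
        label, text = parts
        yield json.dumps({'label': label, 'text': text.strip()})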
Example #7
    def test_read_from_text_single_file_with_coder(self):

        file_name, expected_data = write_data(5)
        assert len(expected_data) == 5
        with TestPipeline() as pipeline:
            pcoll = pipeline | 'Read' >> ReadFromText(file_name,
                                                      coder=DummyCoder())
            assert_that(pcoll,
                        equal_to([record * 2 for record in expected_data]))
Example #8
 def test_read_gzip_empty_file(self):
   file_name = self._create_temp_file()
   pipeline = TestPipeline()
   pcoll = pipeline | 'Read' >> ReadFromText(
       file_name,
       0, CompressionTypes.GZIP,
       True, coders.StrUtf8Coder())
   assert_that(pcoll, equal_to([]))
   pipeline.run()
Example #9
def run(p, args, aggregator_dict, cloud_logger=None):
    """Run the pipeline with the args and dataflow pipeline option."""
    # Create a PCollection for model directory.
    model_dir = p | "Create Model Directory" >> beam.Create([args.model_dir])

    input_file_format = args.input_file_format.lower()

    # Create one pcollection per input file or file pattern. And then flatten
    # them into one pcollection. The duplicated names need to be removed as the
    # file name is used to create unique labels for the PTransform.
    readers = []
    for pattern in list(
            set(args.input_file_patterns.split(FILE_LIST_SEPARATOR))):
        # Setup reader.
        #
        # TODO(user): Perhaps simplify the batch prediction code by using
        # CompressionTypes.AUTO.
        if input_file_format.startswith("tfrecord"):
            if input_file_format == "tfrecord_gzip":
                compression_type = CompressionTypes.GZIP
            else:
                assert input_file_format == "tfrecord"
                compression_type = CompressionTypes.UNCOMPRESSED
            reader = "READ_TFRECORD_FILES_%s" % pattern >> ReadFromTFRecord(
                pattern, compression_type=compression_type)

        else:
            assert input_file_format == "text"
            reader = "READ_TEXT_FILES_%s" % pattern >> ReadFromText(pattern)

        # Put the pcollections into a list and flatten later.
        readers.append(p | reader)

    # Setup the whole pipeline.
    results, errors = (readers
                       | beam.Flatten()
                       | "BATCH_PREDICTION" >> batch_prediction.BatchPredict(
                           beam.pvalue.AsSingleton(model_dir),
                           batch_size=args.batch_size,
                           aggregator_dict=aggregator_dict,
                           cloud_logger=cloud_logger))

    # Convert predictions to JSON and then write to output files.
    _ = (results
         | "TO_JSON" >> beam.Map(json.dumps)
         | "WRITE_PREDICTION_RESULTS" >> WriteToText(
             os.path.join(args.output_location,
                          OUTPUT_RESULTS_FILES_BASENAME_)))
    # Write prediction errors counts to output files.
    _ = (
        errors
        | "GROUP_BY_ERROR_TYPE" >> beam.combiners.Count.PerKey()
        | "WRITE_ERRORS" >> WriteToText(
            os.path.join(args.output_location, OUTPUT_ERRORS_FILES_BASENAME_)))

    return p.run()
Example #10
    def get(self):
        """
        Flask view that triggers the execution of the pipeline
        """
        input_filename = 'input.txt'
        output_filename = 'output.txt'

        # project_id = os.environ['DATASTORE_PROJECT_ID']
        # credentials_file = os.environ['GOOGLE_APPLICATION_CREDENTIALS']
        # client = datastore.Client.from_service_account_json(credentials_file)

        options = PipelineOptions()
        gcloud_options = options.view_as(GoogleCloudOptions)
        # gcloud_options.project = project_id
        gcloud_options.job_name = 'test-job'

        # Dataflow runner
        runner = os.environ['DATAFLOW_RUNNER']
        options.view_as(StandardOptions).runner = runner

        with apache_beam.Pipeline(options=options) as p:
            rows = (
                p |
                ReadFromText(input_filename) |
                apache_beam.ParDo(Split())
            )

            timings = (
                rows |
                apache_beam.ParDo(CollectTimings()) |
                "Grouping timings" >> apache_beam.GroupByKey() |
                "Calculating average" >> apache_beam.CombineValues(
                    apache_beam.combiners.MeanCombineFn()
                )
            )

            users = (
                rows |
                apache_beam.ParDo(CollectUsers()) |
                "Grouping users" >> apache_beam.GroupByKey() |
                "Counting users" >> apache_beam.CombineValues(
                    apache_beam.combiners.CountCombineFn()
                )
            )

            to_be_joined = (
                {
                    'timings': timings,
                    'users': users
                } |
                apache_beam.CoGroupByKey() |
                apache_beam.ParDo(WriteToCSV()) |
                WriteToText(output_filename)
            )

        return 'ok'
Example #11
    def test_read_auto_bzip2(self):
        _, lines = write_data(15)
        file_name = self._create_temp_file(suffix='.bz2')
        with bz2.BZ2File(file_name, 'wb') as f:
            f.write('\n'.join(lines).encode('utf-8'))

        pipeline = TestPipeline()
        pcoll = pipeline | 'Read' >> ReadFromText(file_name)
        assert_that(pcoll, equal_to(lines))
        pipeline.run()
Example #12
  def test_read_auto_bzip2(self):
    _, lines = write_data(15)
    with TempDir() as tempdir:
      file_name = tempdir.create_temp_file(suffix='.bz2')
      with bz2.BZ2File(file_name, 'wb') as f:
        f.write('\n'.join(lines).encode('utf-8'))

      with TestPipeline() as pipeline:
        pcoll = pipeline | 'Read' >> ReadFromText(file_name)
        assert_that(pcoll, equal_to(lines))
Example #13
  def test_read_auto_deflate(self):
    _, lines = write_data(15)
    with TempDir() as tempdir:
      file_name = tempdir.create_temp_file(suffix='.deflate')
      with open(file_name, 'wb') as f:
        f.write(zlib.compress('\n'.join(lines).encode('utf-8')))

      with TestPipeline() as pipeline:
        pcoll = pipeline | 'Read' >> ReadFromText(file_name)
        assert_that(pcoll, equal_to(lines))
Example #14
def run(argv=None):
    """Run the beam pipeline."""
    args, pipeline_args = _parse_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    lines = p | "read qa files" >> ReadFromText(args.file_pattern)

    # The lines are not JSON, but the string representation of python
    # dictionary objects. Parse them with ast.literal_eval.
    json_objects = lines | "parsing dictionaries" >> beam.Map(ast.literal_eval)
    qa_tuples = json_objects | "create tuples" >> beam.FlatMap(
        partial(
            _create_tuples,
            min_words=args.min_words, max_words=args.max_words)
    )

    # Remove duplicate examples.
    qa_tuples |= "key by QA" >> beam.Map(lambda v: (v[1:], v))
    qa_tuples |= "group duplicates" >> beam.GroupByKey()
    qa_tuples |= "remove duplicates" >> beam.Map(lambda v: sorted(v[1])[0])

    # Create the examples.
    serialized_examples = qa_tuples | "create examples" >> beam.Map(
        lambda args: create_example(*args).SerializeToString()
    )
    serialized_examples = _shuffle_examples(serialized_examples)

    serialized_examples |= "split train and test" >> beam.ParDo(
        _TrainTestSplitFn(args.train_split)
    ).with_outputs(_TrainTestSplitFn.TEST_TAG, _TrainTestSplitFn.TRAIN_TAG)

    (
        serialized_examples[_TrainTestSplitFn.TRAIN_TAG]
        | "write train"
        >> WriteToTFRecord(
            os.path.join(args.output_dir, "train"),
            file_name_suffix=".tfrecords",
            num_shards=args.num_shards_train,
        )
    )
    (
        serialized_examples[_TrainTestSplitFn.TEST_TAG]
        | "write test"
        >> WriteToTFRecord(
            os.path.join(args.output_dir, "test"),
            file_name_suffix=".tfrecords",
            num_shards=args.num_shards_test,
        )
    )

    result = p.run()
    result.wait_until_finish()
Example #15
  def test_read_gzip(self):
    _, lines = write_data(15)
    with TempDir() as tempdir:
      file_name = tempdir.create_temp_file()
      with gzip.GzipFile(file_name, 'wb') as f:
        f.write('\n'.join(lines).encode('utf-8'))

      with TestPipeline() as pipeline:
        pcoll = pipeline | 'Read' >> ReadFromText(
            file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
        assert_that(pcoll, equal_to(lines))
Example #16
    def test_read_bzip2(self):
        _, lines = write_data(15)
        with TempDir() as tempdir:
            file_name = tempdir.create_temp_file()
            with bz2.BZ2File(file_name, 'wb') as f:
                f.write('\n'.join(lines).encode('utf-8'))

            with TestPipeline() as pipeline:
                pcoll = pipeline | 'Read' >> ReadFromText(
                    file_name, compression_type=CompressionTypes.BZIP2)
                assert_that(pcoll, equal_to(lines))
Example #17
    def test_read_gzip(self):
        _, lines = write_data(15)
        file_name = self._create_temp_file()
        with gzip.GzipFile(file_name, 'wb') as f:
            f.write('\n'.join(lines).encode('utf-8'))

        pipeline = TestPipeline()
        pcoll = pipeline | 'Read' >> ReadFromText(
            file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
        assert_that(pcoll, equal_to(lines))
        pipeline.run()
Example #18
    def test_read_bzip2(self):
        _, lines = write_data(15)
        file_name = self._create_temp_file()
        with bz2.BZ2File(file_name, 'wb') as f:
            f.write('\n'.join(lines).encode('utf-8'))

        pipeline = TestPipeline()
        pcoll = pipeline | 'Read' >> ReadFromText(
            file_name, compression_type=CompressionTypes.BZIP2)
        assert_that(pcoll, equal_to(lines))
        pipeline.run()
Example #19
  def test_read_auto_gzip(self):
    _, lines = write_data(15)
    with TempDir() as tempdir:
      file_name = tempdir.create_temp_file(suffix='.gz')

      with gzip.GzipFile(file_name, 'wb') as f:
        f.write('\n'.join(lines).encode('utf-8'))

      pipeline = TestPipeline()
      pcoll = pipeline | 'Read' >> ReadFromText(file_name)
      assert_that(pcoll, equal_to(lines))
      pipeline.run()
Example #20
File: textio_test.py  Project: wikier/beam
    def test_read_gzip_large(self):
        _, lines = write_data(10000)
        file_name = tempfile.NamedTemporaryFile(delete=False,
                                                prefix=tempfile.template).name
        with gzip.GzipFile(file_name, 'wb') as f:
            f.write('\n'.join(lines).encode('utf-8'))

        pipeline = TestPipeline()
        pcoll = pipeline | 'Read' >> ReadFromText(
            file_name, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
        assert_that(pcoll, equal_to(lines))
        pipeline.run()
Example #21
File: textio_test.py  Project: wikier/beam
    def test_read_bzip2(self):
        _, lines = write_data(15)
        file_name = tempfile.NamedTemporaryFile(delete=False,
                                                prefix=tempfile.template).name
        with bz2.BZ2File(file_name, 'wb') as f:
            f.write('\n'.join(lines).encode('utf-8'))

        pipeline = TestPipeline()
        pcoll = pipeline | 'Read' >> ReadFromText(
            file_name, compression_type=CompressionTypes.BZIP2)
        assert_that(pcoll, equal_to(lines))
        pipeline.run()
Example #22
File: textio_test.py  Project: wikier/beam
    def test_read_auto_gzip(self):
        _, lines = write_data(15)
        file_name = tempfile.NamedTemporaryFile(delete=False,
                                                prefix=tempfile.template,
                                                suffix='.gz').name
        with gzip.GzipFile(file_name, 'wb') as f:
            f.write('\n'.join(lines).encode('utf-8'))

        pipeline = TestPipeline()
        pcoll = pipeline | 'Read' >> ReadFromText(file_name)
        assert_that(pcoll, equal_to(lines))
        pipeline.run()
Example #23
  def test_read_from_text_single_file_with_coder(self):
    class DummyCoder(coders.Coder):
      def encode(self, x):
        raise ValueError

      def decode(self, x):
        return (x * 2).decode('utf-8')

    file_name, expected_data = write_data(5)
    assert len(expected_data) == 5
    with TestPipeline() as pipeline:
      pcoll = pipeline | 'Read' >> ReadFromText(file_name, coder=DummyCoder())
      assert_that(pcoll, equal_to([record * 2 for record in expected_data]))
Example #24
  def test_read_gzip_with_skip_lines(self):
    _, lines = write_data(15)
    with TempDir() as tempdir:
      file_name = tempdir.create_temp_file()
      with gzip.GzipFile(file_name, 'wb') as f:
        f.write('\n'.join(lines).encode('utf-8'))

      pipeline = TestPipeline()
      pcoll = pipeline | 'Read' >> ReadFromText(
          file_name, 0, CompressionTypes.GZIP,
          True, coders.StrUtf8Coder(), skip_header_lines=2)
      assert_that(pcoll, equal_to(lines[2:]))
      pipeline.run()
Example #25
def run():

    options = PipelineOptions()
    options.view_as(StandardOptions).runner = 'DirectRunner'

    p = beam.Pipeline(options=options)

    sentiments = (
        p
        | "Read From Text" >>
        ReadFromText("doc_sentiment.txt",
                     coder=beam.coders.coders.StrUtf8Coder())  # line by line
        | "Convert to Doc, SentimentScore Tuple" >>
        beam.Map(lambda x: (x.split(" ")[0], x.split(" ")[1])))

    nes = (
        p
        | "Read Named Entites" >>
        ReadFromText("doc_nes.txt",
                     coder=beam.coders.coders.StrUtf8Coder())  # line by line
        | "Convert to Doc, Entities Tuple" >> beam.Map(lambda x: eval(x)))

    def process_nes_sentiment(element):
        doc, nes_sentiment = element
        neslist = nes_sentiment["nes"]
        st = nes_sentiment["sentiment"][0]
        for nes in neslist:
            for ne in nes:
                yield (ne[0], ne[1], st)

    g = ({
        "nes": nes,
        "sentiment": sentiments
    }
         | beam.CoGroupByKey()
         | beam.FlatMap(process_nes_sentiment))

    (g | "Write Results" >> WriteToText("ne_sentiment.txt"))

    p.run()
Example #26
    def test_read_deflate(self):
        _, lines = write_data(15)
        with TempDir() as tempdir:
            file_name = tempdir.create_temp_file()
            with open(file_name, 'wb') as f:
                f.write(zlib.compress('\n'.join(lines).encode('utf-8')))

            pipeline = TestPipeline()
            pcoll = pipeline | 'Read' >> ReadFromText(
                file_name, 0, CompressionTypes.DEFLATE, True,
                coders.StrUtf8Coder())
            assert_that(pcoll, equal_to(lines))
            pipeline.run()
Example #27
  def test_dataflow_single_file_with_coder(self):
    class DummyCoder(coders.Coder):
      def encode(self, x):
        raise ValueError

      def decode(self, x):
        return x * 2

    file_name, expected_data = write_data(5)
    assert len(expected_data) == 5
    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> ReadFromText(file_name, coder=DummyCoder())
    assert_that(pcoll, equal_to([record * 2 for record in expected_data]))
    pipeline.run()
Example #28
    def test_read_corrupted_bzip2_fails(self):
        _, lines = write_data(15)
        file_name = self._create_temp_file()
        with bz2.BZ2File(file_name, 'wb') as f:
            f.write('\n'.join(lines).encode('utf-8'))

        with open(file_name, 'wb') as f:
            f.write(b'corrupt')

        pipeline = TestPipeline()
        pcoll = pipeline | 'Read' >> ReadFromText(
            file_name, compression_type=CompressionTypes.BZIP2)
        assert_that(pcoll, equal_to(lines))
        with self.assertRaises(Exception):
            pipeline.run()
Example #29
    def test_read_corrupted_deflate_fails(self):
        _, lines = write_data(15)
        with TempDir() as tempdir:
            file_name = tempdir.create_temp_file()
            with open(file_name, 'wb') as f:
                f.write(zlib.compress('\n'.join(lines).encode('utf-8')))

            with open(file_name, 'wb') as f:
                f.write(b'corrupt')

            with self.assertRaises(Exception):
                with TestPipeline() as pipeline:
                    pcoll = pipeline | 'Read' >> ReadFromText(
                        file_name, 0, CompressionTypes.DEFLATE, True,
                        coders.StrUtf8Coder())
                    assert_that(pcoll, equal_to(lines))
Example #30
    def test_read_corrupted_gzip_fails(self):
        _, lines = write_data(15)
        with TempDir() as tempdir:
            file_name = tempdir.create_temp_file()
            with gzip.GzipFile(file_name, 'wb') as f:
                f.write('\n'.join(lines).encode('utf-8'))

            with open(file_name, 'wb') as f:
                f.write(b'corrupt')

            pipeline = TestPipeline()
            pcoll = pipeline | 'Read' >> ReadFromText(
                file_name, 0, CompressionTypes.GZIP, True,
                coders.StrUtf8Coder())
            assert_that(pcoll, equal_to(lines))

            with self.assertRaises(Exception):
                pipeline.run()