Code Example #1
def run_pipeline(in_file):
    import csv

    import apache_beam as beam
    from apache_beam.io.textio import ReadFromText
    from apache_beam.io.textio import WriteToText

    # A simple Apache Beam pipeline run with the DirectRunner
    with beam.Pipeline(runner='DirectRunner') as p:
        #
        # Pipeline(0): Data ingestion
        #
        # "lines" will include pcollections of each line
        # Options
        # file_pattern: File path to file
        # skip_header_lines: First line will be skipped. Set to "1".

        # https://beam.apache.org/releases/pydoc/2.11.0/apache_beam.io.textio.html#apache_beam.io.textio.ReadFromText
        collections = p | 'ReadAiportInfo' >> ReadFromText(
            file_pattern=in_file[0], skip_header_lines=1)

        #
        # Pipeline(1): Create side input
        # The final PCollection will be used as a side input for the date-time conversion in the next transform
        # 1. Parse each line and return its fields as a list. Use the csv module to strip any double quotes inside a field
        # 2. Filter out rows with missing coordinates
        # 3. Keep only "AIRPORT_SEQ_ID"(0), "LATITUDE"(21), "LONGITUDE"(26), and add the timezone for the corresponding coordinates
        #
        airports = (collections
                    | 'airports:Extract' >>
                    beam.Map(lambda x: next(csv.reader([x], delimiter=',')))
                    |
                    'airports:Filter' >> beam.Filter(lambda x: x[21] and x[26])
                    | 'airports:Timezone' >>
                    beam.Map(lambda x: (x[0], addtimezone(x[21], x[26]))))

        #
        # Pipeline(2): Correct timezone
        # 1. Read flight data
        # 2. Convert times into UTC
        flights = (p | 'flights:read' >> ReadFromText(file_pattern=in_file[1],
                                                      skip_header_lines=1)
                   | 'flights:tzcorr' >> beam.FlatMap(
                       tz_correct, beam.pvalue.AsDict(airports)))

        # Write the flight results to text files.
        # https://beam.apache.org/releases/pydoc/2.11.0/apache_beam.io.textio.html#apache_beam.io.textio.WriteToText
        (flights | 'flights:out' >> WriteToText(file_path_prefix='flights'))

        # Pipeline(3): Generate departed and arrived events
        # 1. Emit an event for each departure and arrival in the flight records
        events = flights | 'flights:events' >> beam.FlatMap(get_next_event)

        #
        # Pipeline(Final)
        #
        # Write the event results to text files.
        # https://beam.apache.org/releases/pydoc/2.11.0/apache_beam.io.textio.html#apache_beam.io.textio.WriteToText
        (events | 'event:out' >> WriteToText(file_path_prefix='events'))
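
The helpers addtimezone, tz_correct, and get_next_event are defined elsewhere in the original project and are not shown on this page. As a rough sketch of the side-input preparation, an addtimezone along these lines would work (this assumes the timezonefinder package and is not necessarily the original implementation):

# Hypothetical sketch of the addtimezone helper used in Code Example #1.
# Assumes the timezonefinder package; the original helper may differ.
from timezonefinder import TimezoneFinder

_TZ_FINDER = TimezoneFinder()

def addtimezone(lat, lon):
    try:
        # Return the coordinates together with the IANA timezone name,
        # falling back to UTC when no timezone can be resolved.
        tz = _TZ_FINDER.timezone_at(lng=float(lon), lat=float(lat)) or 'UTC'
        return (lat, lon, tz)
    except ValueError:
        # Malformed coordinates: keep the row but mark the timezone as unknown.
        return (lat, lon, 'TIMEZONE')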
Code Example #2
def run(p, args, aggregator_dict, cloud_logger=None):
    """Run the pipeline with the args and dataflow pipeline option."""
    # Create a PCollection for model directory.
    model_dir = p | "Create Model Directory" >> beam.Create([args.model_dir])

    input_file_format = args.input_file_format.lower()

    # Create one PCollection per input file pattern, then flatten them into a
    # single PCollection. Duplicate patterns are removed because the pattern is
    # used to build a unique label for each PTransform.
    readers = []
    for pattern in list(
            set(args.input_file_patterns.split(FILE_LIST_SEPARATOR))):
        # Setup reader.
        #
        # TODO(user): Perhaps simplify the batch prediction code by using
        # CompressionTypes.AUTO.
        if input_file_format.startswith("tfrecord"):
            if input_file_format == "tfrecord_gzip":
                compression_type = CompressionTypes.GZIP
            else:
                assert input_file_format == "tfrecord"
                compression_type = CompressionTypes.UNCOMPRESSED
            reader = "READ_TFRECORD_FILES_%s" % pattern >> ReadFromTFRecord(
                pattern, compression_type=compression_type)

        else:
            assert input_file_format == "text"
            reader = "READ_TEXT_FILES_%s" % pattern >> ReadFromText(pattern)

        # Put the pcollections into a list and flatten later.
        readers.append(p | reader)

    # Setup the whole pipeline.
    results, errors = (readers
                       | beam.Flatten()
                       | "BATCH_PREDICTION" >> batch_prediction.BatchPredict(
                           beam.pvalue.AsSingleton(model_dir),
                           batch_size=args.batch_size,
                           aggregator_dict=aggregator_dict,
                           cloud_logger=cloud_logger))

    # Convert predictions to JSON and then write to output files.
    _ = (results
         | "TO_JSON" >> beam.Map(json.dumps)
         | "WRITE_PREDICTION_RESULTS" >> WriteToText(
             os.path.join(args.output_location,
                          OUTPUT_RESULTS_FILES_BASENAME_)))
    # Write prediction errors counts to output files.
    _ = (
        errors
        | "GROUP_BY_ERROR_TYPE" >> beam.combiners.Count.PerKey()
        | "WRITE_ERRORS" >> WriteToText(
            os.path.join(args.output_location, OUTPUT_ERRORS_FILES_BASENAME_)))

    return p.run()
Code Example #3
def main():
    args, pipeline_args = get_args()

    # PipelineOptions lets you configure the pipeline first.
    # For example, you can set the pipeline runner, which decides what executes the pipeline.

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        # Input data file -> TextIO.Read Transform -> PCollection(lines)
        lines = p | ReadFromText(args.input)

        counts = (
                lines
                | 'Split' >> (beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
                              .with_output_types(str))
                | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
                | 'GroupAndSum' >> beam.CombinePerKey(sum))

        # Format the counts into a PCollection of strings.
        def format_result(word_count):
            (word, count) = word_count
            return '%s: %s' % (word, count)

        output = counts | 'Format' >> beam.Map(format_result)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | WriteToText(args.output)
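
get_args() is not shown in this example. A common pattern for a Beam word-count script (a sketch, not the project's actual helper) is to separate the script's own flags from the pipeline flags with parse_known_args:

# Hypothetical get_args() helper for Code Example #3; the original is not shown.
import argparse

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', required=True, help='Input text file to read.')
    parser.add_argument('--output', required=True, help='Output file prefix.')
    # Anything argparse does not recognize (e.g. --runner) is passed on to PipelineOptions.
    args, pipeline_args = parser.parse_known_args()
    return args, pipeline_args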
Code Example #4
def run_pipeline(in_file, out_file):
    # A simple Apache Beam pipeline run with the DirectRunner
    with beam.Pipeline(runner='DirectRunner') as p:
        #
        # Pipeline(0): Data ingestion
        #
        # "lines" will include pcollections of each line
        # Options
        # file_pattern: File path to file
        # skip_header_lines: First line will be skipped. Set to "1".

        # https://beam.apache.org/releases/pydoc/2.11.0/apache_beam.io.textio.html#apache_beam.io.textio.ReadFromText
        collections = p | ReadFromText(file_pattern=in_file,
                                       skip_header_lines=1)

        #
        # Pipeline(n): Detailed Transformation
        # 1. Parse each line and return its fields as a list. Use the csv module to strip any double quotes inside a field
        # 2. Keep only "AIRPORT_SEQ_ID"(0), "LATITUDE"(21), "LONGITUDE"(26)
        #
        airports = (collections
                    | 'Extract_Into_Fields' >>
                    beam.Map(lambda x: next(csv.reader([x], delimiter=',')))
                    |
                    'Set_Fields' >> beam.Map(lambda x: (x[0], (x[21], x[26]))))

        #
        # Pipeline(Final)
        #
        # Format each (airport, data) pair as a CSV line and write the results to a file.
        # https://beam.apache.org/releases/pydoc/2.11.0/apache_beam.io.textio.html#apache_beam.io.textio.WriteToText
        (airports
         | beam.Map(lambda kv: "{0},{1}".format(kv[0], ','.join(kv[1])))
         | WriteToText(file_path_prefix=out_file))
Code Example #5
    def get(self):
        """
        Flask view that triggers the execution of the pipeline
        """
        input_filename = 'data/input/titanic.txt'
        output_filename = 'data/output/titanic.txt'

        # project_id = os.environ['DATASTORE_PROJECT_ID']
        # credentials_file = os.environ['GOOGLE_APPLICATION_CREDENTIALS']
        # client = datastore.Client.from_service_account_json(credentials_file)

        options = PipelineOptions()
        gcloud_options = options.view_as(GoogleCloudOptions)
        # gcloud_options.project = project_id
        gcloud_options.job_name = 'test-job'

        # Dataflow runner
        runner = os.environ['DATAFLOW_RUNNER']
        options.view_as(StandardOptions).runner = runner

        with apache_beam.Pipeline(options=options) as p:
            rows = (p | ReadFromText(input_filename)
                    | apache_beam.ParDo(Split()))

            survived = (rows | apache_beam.ParDo(CollectSurvived())
                        | apache_beam.GroupByKey()
                        | apache_beam.ParDo(WriteToCSV())
                        | WriteToText(output_filename))

        return 'All Titanic survivors are written to data/output/titanic.txt-00000-of-00001'
Code Example #6
File: views.py  Project: flybirdgroup/beam-example
    def get(self):
        """
        Flask view that triggers the execution of the pipeline
        """
        input_filename = 'input.txt'
        output_filename = 'output.txt'

        # project_id = os.environ['DATASTORE_PROJECT_ID']
        # credentials_file = os.environ['GOOGLE_APPLICATION_CREDENTIALS']
        # client = datastore.Client.from_service_account_json(credentials_file)

        options = PipelineOptions()
        gcloud_options = options.view_as(GoogleCloudOptions)
        # gcloud_options.project = project_id
        gcloud_options.job_name = 'test-job'

        # Dataflow runner
        runner = os.environ['DATAFLOW_RUNNER']
        options.view_as(StandardOptions).runner = runner

        with apache_beam.Pipeline(options=options) as p:
            rows = (
                p |
                ReadFromText(input_filename) |
                apache_beam.ParDo(Split())
            )

            timings = (
                rows |
                apache_beam.ParDo(CollectTimings()) |
                "Grouping timings" >> apache_beam.GroupByKey() |
                "Calculating average" >> apache_beam.CombineValues(
                    apache_beam.combiners.MeanCombineFn()
                )
            )

            users = (
                rows |
                apache_beam.ParDo(CollectUsers()) |
                "Grouping users" >> apache_beam.GroupByKey() |
                "Counting users" >> apache_beam.CombineValues(
                    apache_beam.combiners.CountCombineFn()
                )
            )

            to_be_joined = (
                {
                    'timings': timings,
                    'users': users
                } |
                apache_beam.CoGroupByKey() |
                apache_beam.ParDo(WriteToCSV()) |
                WriteToText(output_filename)
            )

        return 'ok'
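
The Split, CollectTimings, CollectUsers, and WriteToCSV DoFns used in Examples #5 and #6 live in the surrounding project and are not reproduced here. A minimal sketch of the first three, assuming each CSV row carries a user name and a timing value (the column layout is an assumption):

# Hypothetical DoFns for Code Examples #5/#6; the real column layout may differ.
import apache_beam


class Split(apache_beam.DoFn):
    def process(self, element):
        # Assume rows of the form "user,timing"; keep both fields in a dict.
        user, timing = element.split(',')[:2]
        yield {'user': user, 'timing': float(timing)}


class CollectTimings(apache_beam.DoFn):
    def process(self, element):
        # Key each timing by user so it can be grouped and averaged.
        yield (element['user'], element['timing'])


class CollectUsers(apache_beam.DoFn):
    def process(self, element):
        # Emit one marker per row so occurrences can be counted per user.
        yield (element['user'], 1)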
Code Example #7
def run(p, args, aggregator_dict):
    """Run the pipeline with the args and dataflow pipeline option."""
    # Create a PCollection for model directory.
    model_dir = p | "Create Model Directory" >> beam.Create([args.model_dir])

    input_file_format = args.input_file_format.lower()
    input_file_patterns = args.input_file_patterns

    # Setup reader.
    if input_file_format == "text":
        reader = p | "READ_TEXT_FILES" >> ReadFromMultiFilesText(
            input_file_patterns)
    elif input_file_format == "tfrecord":
        reader = p | "READ_TF_FILES" >> ReadFromMultiFilesTFRecord(
            input_file_patterns)
    elif input_file_format == "tfrecord_gzip":
        reader = p | "READ_TFGZIP_FILES" >> ReadFromMultiFilesTFRecordGZip(
            input_file_patterns)

    # Setup the whole pipeline.
    results, errors = (reader
                       | "BATCH_PREDICTION" >> batch_prediction.BatchPredict(
                           beam.pvalue.AsSingleton(model_dir),
                           tags=args.tags,
                           signature_name=args.signature_name,
                           batch_size=args.batch_size,
                           aggregator_dict=aggregator_dict,
                           user_project_id=args.user_project_id,
                           user_job_id=args.user_job_id,
                           framework=args.framework))

    # Convert predictions to JSON and then write to output files.
    _ = (results
         | "TO_JSON" >> beam.Map(json.dumps)
         |
         "WRITE_PREDICTION_RESULTS" >> WriteToText(args.output_result_prefix))
    # Write prediction errors counts to output files.
    _ = (errors
         | "GROUP_BY_ERROR_TYPE" >> beam.combiners.Count.PerKey()
         | "WRITE_ERRORS" >> WriteToText(args.output_error_prefix))

    return p.run()
Code Example #8
    def test_write_pipeline(self):
        with TestPipeline() as pipeline:
            pcoll = pipeline | beam.core.Create(self.lines)
            pcoll | 'Write' >> WriteToText(self.path)  # pylint: disable=expression-not-assigned

        read_result = []
        for file_name in glob.glob(self.path + '*'):
            with open(file_name, 'rb') as f:
                read_result.extend(f.read().splitlines())

        self.assertEqual(sorted(read_result), sorted(self.lines))
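
The WriteToText tests shown in this and the following examples assume a setUp fixture that defines self.lines and self.path. A minimal stand-in (an assumption for readability; Beam's actual test fixture differs in detail):

# Hypothetical fixture for the WriteToText tests; Beam's real setUp differs in detail.
import os
import tempfile
import unittest


class WriteToTextTestBase(unittest.TestCase):
    def setUp(self):
        # Bytes make the comparisons against the 'rb'-read output straightforward.
        self.lines = [b'Line %d' % d for d in range(5)]
        self.path = os.path.join(tempfile.mkdtemp(), 'text_file')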
Code Example #9
File: textio_test.py  Project: wanwanzhu/beam
  def test_write_dataflow_auto_compression(self):
    with TestPipeline() as pipeline:
      pcoll = pipeline | beam.core.Create(self.lines)
      pcoll | 'Write' >> WriteToText(self.path, file_name_suffix='.gz')  # pylint: disable=expression-not-assigned

    read_result = []
    for file_name in glob.glob(self.path + '*'):
      with gzip.GzipFile(file_name, 'rb') as f:
        read_result.extend(f.read().splitlines())

    self.assertEqual(sorted(read_result), sorted(self.lines))
Code Example #10
  def test_write_dataflow(self):
    pipeline = TestPipeline()
    pcoll = pipeline | beam.core.Create(self.lines)
    pcoll | 'Write' >> WriteToText(self.path)  # pylint: disable=expression-not-assigned
    pipeline.run()

    read_result = []
    for file_name in glob.glob(self.path + '*'):
      with open(file_name, 'rb') as f:
        read_result.extend(f.read().splitlines())

    self.assertEqual(read_result, self.lines)
Code Example #11
    def test_write_pipeline_non_globalwindow_input(self):
        with TestPipeline() as p:
            _ = (p
                 | beam.core.Create(self.lines)
                 | beam.WindowInto(beam.transforms.window.FixedWindows(1))
                 | 'Write' >> WriteToText(self.path))

        read_result = []
        for file_name in glob.glob(self.path + '*'):
            with open(file_name, 'rb') as f:
                read_result.extend(f.read().splitlines())

        self.assertEqual(sorted(read_result), sorted(self.lines))
Code Example #12
    def test_write_pipeline_auto_compression_unsharded(self):
        with TestPipeline() as pipeline:
            pcoll = pipeline | 'Create' >> beam.core.Create(self.lines)
            pcoll | 'Write' >> WriteToText(  # pylint: disable=expression-not-assigned
                self.path + '.gz',
                shard_name_template='')

        read_result = []
        for file_name in glob.glob(self.path + '*'):
            with gzip.GzipFile(file_name, 'rb') as f:
                read_result.extend(f.read().splitlines())

        self.assertEqual(sorted(read_result), sorted(self.lines))
Code Example #13
File: textio_test.py  Project: wikier/beam
    def test_write_dataflow_auto_compression_unsharded(self):
        pipeline = TestPipeline()
        pcoll = pipeline | beam.core.Create(self.lines)
        pcoll | 'Write' >> WriteToText(self.path + '.gz',
                                       shard_name_template='')  # pylint: disable=expression-not-assigned
        pipeline.run()

        read_result = []
        for file_name in glob.glob(self.path + '*'):
            with gzip.GzipFile(file_name, 'r') as f:
                read_result.extend(f.read().splitlines())

        self.assertEqual(read_result, self.lines)
Code Example #14
 def expand(self, pcoll):  # pylint: disable=arguments-differ
     return (
         pcoll | "ToList" >> beam.Map(DictToList(self.columns))
         | "Format" >> TransformAndLog(beam.Map(
             lambda x: format_csv_rows([x], delimiter=self.delimiter)),
                                       log_prefix='formatted csv: ',
                                       log_level='debug')
         | "Utf8Encode" >> beam.Map(lambda x: x.encode('utf-8'))
         | "Write" >> WriteToText(
             self.path,
             file_name_suffix=self.file_name_suffix,
             header=format_csv_rows(
                 [self.columns], delimiter=self.delimiter).encode('utf-8')))
Code Example #15
    def test_write_pipeline_footer(self):
        with TestPipeline() as pipeline:
            footer_text = 'footer'
            pcoll = pipeline | beam.core.Create(self.lines)
            pcoll | 'Write' >> WriteToText(  # pylint: disable=expression-not-assigned
                self.path,
                footer=footer_text)

        read_result = []
        for file_name in glob.glob(self.path + '*'):
            with open(file_name, 'rb') as f:
                read_result.extend(f.read().splitlines())

        self.assertEqual(sorted(read_result[:-1]), sorted(self.lines))
        self.assertEqual(read_result[-1], footer_text.encode('utf-8'))
Code Example #16
  def test_write_dataflow_header(self):
    pipeline = TestPipeline()
    pcoll = pipeline | 'Create' >> beam.core.Create(self.lines)
    header_text = b'foo'
    pcoll | 'Write' >> WriteToText(  # pylint: disable=expression-not-assigned
        self.path + '.gz',
        shard_name_template='',
        header=header_text)
    pipeline.run()

    read_result = []
    for file_name in glob.glob(self.path + '*'):
      with gzip.GzipFile(file_name, 'rb') as f:
        read_result.extend(f.read().splitlines())

    self.assertEqual(read_result, [header_text] + self.lines)
Code Example #17
File: textio_test.py  Project: wanwanzhu/beam
  def test_write_dataflow_header(self):
    with TestPipeline() as pipeline:
      pcoll = pipeline | 'Create' >> beam.core.Create(self.lines)
      header_text = 'foo'
      pcoll | 'Write' >> WriteToText(  # pylint: disable=expression-not-assigned
          self.path + '.gz',
          shard_name_template='',
          header=header_text)

    read_result = []
    for file_name in glob.glob(self.path + '*'):
      with gzip.GzipFile(file_name, 'rb') as f:
        read_result.extend(f.read().splitlines())
    # header_text is automatically encoded in WriteToText
    self.assertEqual(read_result[0], header_text.encode('utf-8'))
    self.assertEqual(sorted(read_result[1:]), sorted(self.lines))
Code Example #18
def BuildPipeline(pathToFiles, compute_table_name):
    raw_output = "output-raw"
    final_output = "output-final"
    options = PipelineOptions()
    #pathToFiles= "C:\\Users\\Abhijeet\\Documents\\GitHub\\dsba6155project\\dsba6155project\\data\\**"
    pipeline = beam.Pipeline(options=options)
    vectors = (
        pipeline
        | "Read Files" >> ReadFromTextWithFilename(pathToFiles)
        | "Group by File" >> beam.GroupByKey()
        | "Hashing Vectors" >> beam.ParDo(Hashing())
        # | "Write CSV to biqquery" >> beam.io.WriteToBigQuery(
        #     table=compute_table_name,
        #     schema=GetSchema()
        # )
        | "Write to file" >> WriteToText(raw_output)
        #|
        #| "Save" >> beam.ParDo(Save())
    )
    return pipeline
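
BuildPipeline only constructs the pipeline; nothing executes until the caller runs it. A possible call site (the path and table name below are placeholders, not values from the original project):

# Hypothetical caller for Code Example #18; the arguments are placeholders.
result = BuildPipeline("data/**", "project.dataset.compute_table").run()
result.wait_until_finish()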
Code Example #19
def run():

    options = PipelineOptions()
    options.view_as(StandardOptions).runner = 'DirectRunner'

    p = beam.Pipeline(options=options)

    sentiments = (
        p
        | "Read From Text" >>
        ReadFromText("doc_sentiment.txt",
                     coder=beam.coders.coders.StrUtf8Coder())  # line by line
        | "Convert to Doc, SentimentScore Tuple" >>
        beam.Map(lambda x: (x.split(" ")[0], x.split(" ")[1])))

    nes = (
        p
        | "Read Named Entites" >>
        ReadFromText("doc_nes.txt",
                     coder=beam.coders.coders.StrUtf8Coder())  # line by line
        | "Convert to Doc, Entities Tuple" >> beam.Map(lambda x: eval(x)))

    def process_nes_sentiment(element):
        doc, nes_sentiment = element
        neslist = nes_sentiment["nes"]
        st = nes_sentiment["sentiment"][0]
        for nes in neslist:
            for ne in nes:
                yield (ne[0], ne[1], st)

    g = ({
        "nes": nes,
        "sentiment": sentiments
    }
         | beam.CoGroupByKey()
         | beam.FlatMap(process_nes_sentiment))

    (g | "Write Results" >> WriteToText("ne_sentiment.txt"))

    p.run()
Code Example #20
def main():
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('dataflow_poc')

    input_file = INPUT_FILEPATH
    summary_file = SUMMARY_FILEPATH
    output_file = OUTPUT_FILEPATH

    options = PipelineOptions()
    gcloud_options = options.view_as(GoogleCloudOptions)
    worker_options = options.view_as(WorkerOptions)
    gcloud_options.project = PROJECT_ID
    gcloud_options.temp_location = OUTPUT_TEMP_FILEPATH
    worker_options.num_workers = START_WORKERS
    worker_options.max_num_workers = MAX_WORKERS
    gcloud_options.job_name = 'csv-transform'

    options.view_as(StandardOptions).runner = RUNNER
    logger.info('Ready to load the file')
    with apache_beam.Pipeline(options=options) as pipe:
        datarows = (pipe | ReadFromText(input_file)
                    | apache_beam.ParDo(FilterHeader(header))
                    | apache_beam.ParDo(Parse()))
        datarows | apache_beam.ParDo(Summary()) | "WriteSummary" >> WriteToText(summary_file)
        (datarows | apache_beam.ParDo(FilterTrainingDays())
         | apache_beam.CombineGlobally(sum) | "WriteCount" >> WriteToText(output_file))
Code Example #21
def main(source_path, destination_path, args):
    """
    defining the whole pipeline
    """
    p = beam.Pipeline(argv = args)

    values = (
        p | "ReadCSV" >> ReadFromText(source_path, skip_header_lines = True)
          | beam.ParDo(Split())
    )

    mean_item_id = (
        values | beam.ParDo(CollectOpen()) |
        "Grouping Keys Open" >> beam.GroupByKey() |
        "Calculating Mean for item price" >> beam.CombineValues(
            beam.combiners.MeanCombineFn()
        )
    )

    output = (
        mean_item_id | "WriteCSV" >> WriteToText(destination_path, file_name_suffix = ".csv")
    )

    p.run()
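
Split and CollectOpen are project-specific DoFns that are not shown. A sketch of CollectOpen consistent with the grouping-and-mean step above, assuming Split yields dicts with item_id and item_price keys (both names are assumptions):

# Hypothetical CollectOpen DoFn for Code Example #21; the field names are assumptions.
import apache_beam as beam


class CollectOpen(beam.DoFn):
    def process(self, element):
        # Key each record by item id and emit its price so that
        # CombineValues(MeanCombineFn()) can average prices per item.
        yield (element['item_id'], float(element['item_price']))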
Code Example #22
def run():
    import pickle
    import sys

    import math

    import numpy as np
    import apache_beam as beam

    reload(sys)
    sys.setdefaultencoding('utf8')

    import argparse
    import simplejson
    from gensim.models import KeyedVectors

    from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions, StandardOptions, SetupOptions
    from apache_beam.io.textio import ReadFromText, WriteToText
    import nltk.data
    from nltk.tokenize import WordPunctTokenizer
    import re
    import uuid
    import perceptron
    # Sentences From Text
    _sentence_tokenizer = nltk.data.load("./tokenizer/punkt_turkish.pickle")
    word_tokenizer = WordPunctTokenizer()
    abbreviations = set()
    with open("./tokenizer/abbreviations-long.txt") as f:
        for l in f:
            abbreviations.add(l.split(':')[0])

    _sentence_tokenizer._params.abbrev_types = abbreviations

    model_file = "perceptron_word2vec_stemmed_normalized.pickle"
    with open(model_file, 'rb') as model:
        w, b = pickle.load(model)

    def sentences_from_text(text):
        return _sentence_tokenizer.tokenize(text.strip())

    def tokens_from_sentence(sentence):
        return nltk.word_tokenize(sentence)

    def ngrams(obj, n):
        tokens = []
        sentences = (sentences_from_text(obj["title"]) +
                     sentences_from_text(obj["description"]) +
                     sentences_from_text(obj["content"]))

        for sentence in sentences:
            tokens += tokens_from_sentence(sentence)

        pairs = nltk.ngrams(tokens, n)
        return [" ".join(pair) for pair in pairs]

    def convertToJsonObj(jsonText):
        return simplejson.loads(jsonText)

    def convertToObject(jsonObj):
        x = jsonObj

        obj = {
            "title":
            x.get("properties", {}).get("title", {}).get("stringValue", ""),
            "link":
            x.get("properties", {}).get("link", {}).get("stringValue", ""),
            "published":
            x.get("properties", {}).get("published",
                                        {}).get("stringValue", ""),
            "description":
            x.get("properties", {}).get("description",
                                        {}).get("stringValue", ""),
            "content":
            x.get("properties", {}).get("content", {}).get("stringValue", ""),
        }

        obj["key"] = obj["link"] if obj["link"] else str(uuid.uuid4())

        return obj

    # https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
    def cleanhtml(raw_html):
        cleanr = re.compile('<.*?>')
        cleantext = re.sub(cleanr, '', raw_html)
        return cleantext

    def removeHTMLFromStrings(obj):
        for key in obj.keys():
            obj[key] = cleanhtml(obj[key])

        return obj

    def tokenize_to_sentences(obj):

        obj["sentences"] = (sentences_from_text(obj["title"]) +
                            sentences_from_text(obj["description"]) +
                            sentences_from_text(obj["content"]))

        return obj

    def tokenize_to_words(obj):

        obj["tokens"] = []

        for sentence in obj["sentences"]:
            obj["tokens"] += tokens_from_sentence(sentence)

        for token in obj["tokens"]:
            yield (obj["key"], token)

    options = PipelineOptions()
    options.view_as(StandardOptions).runner = 'DirectRunner'

    p = beam.Pipeline(options=options)

    pairs = (
        p
        | "Read From Text" >>
        ReadFromText("news.json",
                     coder=beam.coders.coders.StrUtf8Coder())  # line by line
        | "Convert to Json Object" >> beam.Map(convertToJsonObj)
        | "Convert to Python Object" >> beam.Map(convertToObject)
        | "Remove HTML Tags From Strings (Normalization 1)" >>
        beam.Map(removeHTMLFromStrings))

    tokens_1gram = (
        pairs
        | 'Sentence Tokenization' >> beam.Map(tokenize_to_sentences)
        | 'Word Tokenization' >> beam.FlatMap(
            tokenize_to_words)  # also convert to key value pairs
    )
    """
    tokens_2gram = (pairs
            | "Create 2-grams" >> beam.FlatMap(lambda obj: [(obj["key"], token) for token in ngrams(obj, 2)])
        )
    """

    tokens = tokens_1gram
    """
    vocabulary = (tokens
            | "Get words only" >> beam.Values()
            | "Remove duplicate words" >> beam.RemoveDuplicates()
        )
    vocabulary_size = (vocabulary
            | "Count Vocabulary elements" >> beam.combiners.Count.Globally()
        )

    doc_total_words = (tokens
            | "Count Words of Doc" >> beam.combiners.Count.PerKey()
    )
    """

    tokens_paired_with_1 = (
        tokens
        | "Pair with 1" >> beam.Map(lambda doc_token: (doc_token, 1)))
    """
    token_counts_per_doc = (tokens_paired_with_1
            | "Group by Doc,Word" >> beam.GroupByKey()
            | "Count ones" >> beam.Map(lambda ((doc, token), counts): (doc, (token, sum(counts))))
            | "Group by Doc" >> beam.GroupByKey()
        )



    num_docs = (token_counts_per_doc
            | "Get Docs" >> beam.Keys()
            | "Count Docs" >> beam.combiners.Count.Globally()
    )


    word_tf_pre = (
        { 'total_tokens': doc_total_words, 'token_counts_per_doc': token_counts_per_doc }
        | "CoGroup By Document" >> beam.CoGroupByKey()
    )

    def calc_tf((doc, count)):
        [token_count] = count['token_counts_per_doc']

        [tokens_total] = count['total_tokens']

        for token, cnt in token_count:
            yield token, (doc, float(cnt) / tokens_total)


    doc_word_tf = (word_tf_pre
        | "Compute Term Frequencies" >> beam.FlatMap(calc_tf)
        )

    word_occurrences = (tokens
        | "Remove Multiple occurrences per doc" >> beam.RemoveDuplicates()
        | "Pair with 1s" >> beam.Map(lambda (doc, word): (word, 1))
        | "Group by Word" >> beam.GroupByKey()
        | "Sum 1s" >> beam.Map(lambda (word, counts): (word, sum(counts)))
    )

    token_df = (
        word_occurrences
        | "Compute Document Frequency">> beam.Map(lambda (token, count), total: (token, float(count) / total), AsSingleton(num_docs)))

    token_tf_df = (
        { 'term_frequency': doc_word_tf, 'document_frequency': token_df}
        | "CoGroup By Token" >> beam.CoGroupByKey())

    def calc_tfidf((token, tfdf)):
      [df] = tfdf['document_frequency']
      for doc, tf in tfdf['term_frequency']:
        yield (doc, token), tf * math.log(1.0 / df)

    token_tf_idf = (token_tf_df
        | "Calculate TF-IDF Scores" >> beam.FlatMap(calc_tfidf)
    )
    """

    word2vec = KeyedVectors.load_word2vec_format('tr_word2vec', binary=True)

    def get_vec(word2vec, token):

        try:
            x = word2vec.get_vector(token)
            x = x.reshape(400)
        except:
            x = np.zeros(400)

        return x

    def analyze_sentiment(x):

        res = perceptron.f(x, w, b)

        return res

    doc_sentiment = (
        tokens_paired_with_1
        | "Create Word2Vec Vector" >> beam.Map(
            lambda kv: (kv[0][0], get_vec(word2vec, kv[0][1])))
        | "Group Word2Vec Vectors By Document" >> beam.CombinePerKey(sum)
        | "Sum Word2Vec Vectors" >> beam.Map(
            lambda kv: (kv[0], analyze_sentiment(kv[1])[0])))

    result = (doc_sentiment | "Format Results" >>
              beam.Map(lambda kv: '%s %s' % (kv[0], kv[1])))

    (result | "Write Results" >> WriteToText("sentiments"))

    p.run()
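
The perceptron module supplies the scoring function f(x, w, b) used by analyze_sentiment. A plausible single-layer form is sketched below (an assumption; the pickled model above defines the actual w and b, and the real module may differ):

# Hypothetical perceptron.f for Code Examples #22 and #25; the actual module may differ.
import numpy as np


def f(x, w, b):
    # Linear score followed by a sign threshold: returns +1/-1 sentiment labels.
    return np.sign(np.dot(w, x) + b)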
Code Example #23
with apache_beam.Pipeline(options=options) as p:
    stamps = (
        p | "Reading CSV" >> ReadFromText(input_filename, skip_header_lines=1)
        | "Parsing CSV" >> apache_beam.ParDo(Split())
        | "Getting stamps" >> apache_beam.ParDo(CollectPixelVals()))

    psc_collection = (stamps | "Making PSCs" >> apache_beam.GroupByKey())

    # Sum the pixel values within each PSC group
    stamp_sums = (
        psc_collection
        | "Getting stamp sum" >> apache_beam.CombineValues(SumCombineFn()))

    #normalized = (
    #    {
    #        'stamps': stamps,
    #        'sums': stamp_sums,
    #    } |
    #    "Grouping together" >> apache_beam.CoGroupByKey() |
    #    "NormalizingFlux" >> apache_beam.ParDo(GetNormal())
    #    )
    #
    #results = (
    #    normalized |
    #    "Getting normal PSC" >> apache_beam.GroupByKey()
    #)

    output = (stamp_sums | "Formatting CSV" >> apache_beam.ParDo(WriteToCSV())
              | "Writing CSV" >> WriteToText(output_filename))
Code Example #24
File: main.py  Project: guedes-joaofelipe/DataScience
                                          skip_header_lines=1)
    | "From text to list (rain)" >> beam.Map(text_to_list, delimiter=',')
    | "Create key UF-YEAR-MONTH" >> beam.Map(key_uf_year_month)
    | "Sum of total rain by key" >> beam.CombinePerKey(sum)  # operacao pesada
    | "Round rain results" >> beam.Map(arredonda)
    # | "Show results" >> beam.Map(print)
)

result = (
    # (chuvas, dengue)
    # | "Pile PCollections" >> beam.Flatten()
    # | "GroupByKey" >> beam.GroupByKey()
    ({
        'chuvas': chuvas,
        'dengue': dengue
    })
    | "Merge PCollections" >> beam.CoGroupByKey()
    | "Filter empty data" >> beam.Filter(filter_empty_fields)
    | "Unzip elements" >> beam.Map(unzip_elements)
    | "Prepare csv" >> beam.Map(prepare_csv)
    # | "Show union results" >> beam.Map(print)
)

header = 'state;year;month;rain;dengue'
result | 'Create CSV file' >> WriteToText(
    './data/result',
    file_name_suffix='.csv',
    num_shards=2,  # number of files used for output
    header=header)

pipeline.run()
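
Code Example #24 is a fragment of a larger script; text_to_list, key_uf_year_month, arredonda, and the other helpers are defined earlier in that file. A rough sketch of the first two, assuming rain records of the form [date, mm, uf] (the column order and delimiter are assumptions):

# Hypothetical helpers for Code Example #24; column order and delimiter are assumptions.
def text_to_list(element, delimiter='|'):
    """Split one text line into a list of fields."""
    return element.split(delimiter)


def key_uf_year_month(element):
    """Turn [date, mm, uf] into ('UF-YYYY-MM', rain_in_mm)."""
    date, mm, uf = element
    key = '{}-{}'.format(uf, '-'.join(date.split('-')[:2]))
    # Some stations use negative sentinel values for missing readings.
    return key, max(float(mm), 0.0)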
Code Example #25
def run():
    import pickle
    import sys

    import math

    import numpy as np

    reload(sys)
    sys.setdefaultencoding('utf8')

    from gensim.models import KeyedVectors
    import apache_beam as beam
    from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions, StandardOptions, SetupOptions
    from apache_beam.io.gcp.datastore.v1.datastoreio import ReadFromDatastore
    from google.cloud.proto.datastore.v1 import query_pb2
    from apache_beam.io.textio import WriteToText
    import nltk.data
    import re
    import uuid
    import perceptron

    _sentence_tokenizer = nltk.data.load("./tokenizer/punkt_turkish.pickle")
    abbreviations = set()
    with open("./tokenizer/abbreviations-long.txt") as f:
        for l in f:
            abbreviations.add(l.split(':')[0])

    _sentence_tokenizer._params.abbrev_types = abbreviations

    model_file = "perceptron_word2vec_stemmed_normalized.pickle"
    with open(model_file, 'rb') as model:
        w, b = pickle.load(model)

    def sentences_from_text(text):
        return _sentence_tokenizer.tokenize(text.strip())

    def tokens_from_sentence(sentence):
        return sentence.split(" ") # nltk.word_tokenize(sentence)

    def ngrams(obj, n):
        tokens = []
        sentences = (
            sentences_from_text(obj["title"]) +
            sentences_from_text(obj["description"]) +
            sentences_from_text(obj["content"])
        )

        for sentence in sentences:
            tokens += tokens_from_sentence(sentence)

        pairs = nltk.ngrams(tokens, n)
        return [" ".join(pair) for pair in pairs]


    def convertToObject(jsonObj):
        x = jsonObj

        link = x.properties.get('link', None)
        link = link.string_value if link else ""

        title = x.properties.get('title', None)
        title = title.string_value if title else ""

        description = x.properties.get("description", None)
        description = description.string_value if description else ""

        content = x.properties.get("text", "")
        content = content.string_value if content else ""

        published = x.properties.get("published")
        published = published.string_value if published else ""

        obj = {
            "link": link,
            "title": title,
            "description": description,
            "content": content,
            "published": published
        }

        obj["key"] = obj["link"] if obj["link"] else str(uuid.uuid4())

        return obj

    # https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
    def cleanhtml(raw_html):
        cleanr = re.compile('<.*?>')
        cleantext = re.sub(cleanr, '', raw_html)
        return cleantext

    def removeHTMLFromStrings(obj):
        for key in obj.keys():
            obj[key] = cleanhtml(obj[key])

        return obj

    def tokenize_to_sentences(obj):

        obj["sentences"] = (
            sentences_from_text(obj["title"]) +
            sentences_from_text(obj["description"]) +
            sentences_from_text(obj["content"])
        )

        return obj

    def tokenize_to_words(obj):

        obj["tokens"] = []

        for sentence in obj["sentences"]:
            obj["tokens"] += tokens_from_sentence(sentence)

        for token in obj["tokens"]:
            yield (obj["key"], token)

    options = PipelineOptions()
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'news-197916'
    google_cloud_options.job_name = 'sentiment-analysis'
    google_cloud_options.staging_location = 'gs://news-197916.appspot.com/word_count/'
    google_cloud_options.temp_location = 'gs://news-197916.appspot.com/df_tmp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    setup_options = options.view_as(SetupOptions)
    setup_options.requirements_file = "requirements.txt"
    setup_options.save_main_session = True

    p = beam.Pipeline(options=options)
    query = query_pb2.Query()
    query.kind.add().name = "News_Entry"

    pairs = (p
            | 'Read From Datastore' >> ReadFromDatastore(project = google_cloud_options.project, query=query)
        #     | "Read From Text" >> ReadFromText("news.json", coder=beam.coders.coders.StrUtf8Coder()) # line by line
        #     | "Convert to Json Object" >> beam.Map(convertToJsonObj)
             | "Convert to Python Object" >> beam.Map(convertToObject)
             | "Remove HTML Tags From Strings (Normalization 1)" >> beam.Map(removeHTMLFromStrings)
    )

    tokens_1gram = (pairs
                    | 'Sentence Tokenization' >> beam.Map(tokenize_to_sentences)
                    | 'Word Tokenization' >> beam.FlatMap(tokenize_to_words)  # also convert to key value pairs
                    )
    """
    tokens_2gram = (pairs
            | "Create 2-grams" >> beam.FlatMap(lambda obj: [(obj["key"], token) for token in ngrams(obj, 2)])
        )
    """

    tokens = tokens_1gram

    """
    vocabulary = (tokens
                  | "Get words only" >> beam.Values()
                  | "Remove duplicate words" >> beam.RemoveDuplicates()
                  )
    vocabulary_size = (vocabulary
            | "Count Vocabulary elements" >> beam.combiners.Count.Globally()
        )

    doc_total_words = (tokens
            | "Count Words of Doc" >> beam.combiners.Count.PerKey()
    )
    """

    tokens_paired_with_1 = (tokens
                            | "Pair with 1" >> beam.Map(lambda doc_token: (doc_token, 1))
                            )
    """
    token_counts_per_doc = (tokens_paired_with_1
            | "Group by Doc,Word" >> beam.GroupByKey()
            | "Count ones" >> beam.Map(lambda ((doc, token), counts): (doc, (token, sum(counts))))
            | "Group by Doc" >> beam.GroupByKey()
        )



    num_docs = (token_counts_per_doc
            | "Get Docs" >> beam.Keys()
            | "Count Docs" >> beam.combiners.Count.Globally()
    )


    word_tf_pre = (
        { 'total_tokens': doc_total_words, 'token_counts_per_doc': token_counts_per_doc }
        | "CoGroup By Document" >> beam.CoGroupByKey()
    )

    def calc_tf((doc, count)):
        [token_count] = count['token_counts_per_doc']

        [tokens_total] = count['total_tokens']

        for token, cnt in token_count:
            yield token, (doc, float(cnt) / tokens_total)


    doc_word_tf = (word_tf_pre
        | "Compute Term Frequencies" >> beam.FlatMap(calc_tf)
        )

    word_occurrences = (tokens
        | "Remove Multiple occurrences per doc" >> beam.RemoveDuplicates()
        | "Pair with 1s" >> beam.Map(lambda (doc, word): (word, 1))
        | "Group by Word" >> beam.GroupByKey()
        | "Sum 1s" >> beam.Map(lambda (word, counts): (word, sum(counts)))
    )

    token_df = (
        word_occurrences
        | "Compute Document Frequency">> beam.Map(lambda (token, count), total: (token, float(count) / total), AsSingleton(num_docs)))

    token_tf_df = (
        { 'term_frequency': doc_word_tf, 'document_frequency': token_df}
        | "CoGroup By Token" >> beam.CoGroupByKey())

    def calc_tfidf((token, tfdf)):
      [df] = tfdf['document_frequency']
      for doc, tf in tfdf['term_frequency']:
        yield (doc, token), tf * math.log(1.0 / df)

    token_tf_idf = (token_tf_df
        | "Calculate TF-IDF Scores" >> beam.FlatMap(calc_tfidf)
    )
    """

    word2vec = KeyedVectors.load_word2vec_format('tr_word2vec', binary=True)

    def get_vec(word2vec, token):
        if word2vec is None:
            word2vec = KeyedVectors.load_word2vec_format('tr_word2vec', binary=True)

        try:
            x = word2vec.get_vector(token)
            x = x.reshape(400)
        except:
            x = np.zeros(400)

        return x

    def analyze_sentiment(x):

        res = perceptron.f(x, w, b)

        return res

    doc_sentiment = (tokens_paired_with_1
                     | "Create Word2Vec Vector" >> beam.Map(
                         lambda kv: (kv[0][0], get_vec(word2vec, kv[0][1])))
                     | "Group Word2Vec Vectors By Document" >> beam.GroupByKey()
                     | "Sum Word2Vec Vectors" >> beam.Map(
                         lambda kv: (kv[0], analyze_sentiment(np.sum(list(kv[1]), axis=0))[0]))
                     )

    result = (doc_sentiment |
              "Format Results" >> beam.Map(lambda kv: '%s %s' % (kv[0], kv[1]))
              )

    (result
     | "Write Results" >> WriteToText("sentiments")
     )

    p.run()
Code Example #26
)

chuvas = (
    pipeline
    | "Leitura do dataset de chuvas" >> ReadFromParquet('chuvas.parquet')
    |
    'Chuvas - Criar chave uf_ano_mes' >> beam.Map(chuva_chave_uf_ano_mes_lista)
    | 'Chuvas - Soma dos mm pela chave' >> beam.CombinePerKey(sum)
    | 'Chuvas - Arredondar resultados' >> beam.Map(arredonda)
    #| "Chuvas teste - Mostrar resultados" >> beam.Map(print)
)

final = (
    ({
        'chuvas': chuvas,
        'dengue': dengue
    })
    | "Final - Mesclar pcols" >> beam.CoGroupByKey()
    | "Final - filtrar valores vazios" >> beam.Filter(filtra_campos_nao_vazios)
    | "Final - descompacta a saída" >> beam.Map(descompacta_elementos)
    | "Final - preparar csv" >> beam.Map(preparar_csv)

    #| "Final - Mostrar resultados" >> beam.Map(print)
)

header = 'UF;ANO;MES;CHUVA;DENGUE'

final | "Criar arquivo csv" >> WriteToText(
    'final', file_name_suffix='.csv', header=header)

pipeline.run()
Code Example #27
def run(p, args, aggregator_dict):
    """Run the pipeline with the args and dataflow pipeline option."""
    # Create a PCollection for model directory.
    model_dir = p | "Create Model Directory" >> beam.Create([args.model_dir])

    input_file_format = args.input_file_format.lower()
    input_file_patterns = args.input_file_patterns

    # Setup reader.
    if input_file_format == "json":
        reader = p | "READ_TEXT_FILES" >> ReadFromMultiFilesText(
            input_file_patterns)
    elif input_file_format == "tfrecord":
        reader = p | "READ_TF_FILES" >> ReadFromMultiFilesTFRecord(
            input_file_patterns)
    elif input_file_format == "tfrecord_gzip":
        reader = p | "READ_TFGZIP_FILES" >> ReadFromMultiFilesTFRecordGZip(
            input_file_patterns)

    # Setup the whole pipeline.
    results, errors = (reader
                       | "BATCH_PREDICTION" >> batch_prediction.BatchPredict(
                           beam.pvalue.AsSingleton(model_dir),
                           tags=args.tags,
                           signature_name=args.signature_name,
                           batch_size=args.batch_size,
                           aggregator_dict=aggregator_dict,
                           user_project_id=args.user_project_id,
                           user_job_id=args.user_job_id,
                           framework=args.framework))

    output_file_format = args.output_file_format.lower()
    # Convert predictions to target format and then write to output files.
    if output_file_format == "json":
        _ = (results
             | "TO_JSON" >> beam.Map(json.dumps)
             | "WRITE_PREDICTION_RESULTS" >> WriteToText(
                 args.output_result_prefix))
    elif output_file_format == "csv":
        fields = (
            results
            | "SAMPLE_SINGLE_ELEMENT" >> Sample.FixedSizeGlobally(1)
            | "GET_KEYS" >> beam.Map(
                # entry could be None if no inputs were valid
                lambda entry: entry[0].keys() if entry else []))
        _ = (fields
             | "KEYS_TO_CSV" >> beam.Map(keys_to_csv)
             | "WRITE_KEYS" >> WriteToText(args.output_result_prefix,
                                           file_name_suffix="_header.csv",
                                           shard_name_template=""))
        _ = (results
             | "VALUES_TO_CSV" >> beam.Map(values_to_csv,
                                           beam.pvalue.AsSingleton(fields))
             | "WRITE_PREDICTION_RESULTS" >> WriteToText(
                 args.output_result_prefix,
                 file_name_suffix=".csv",
                 append_trailing_newlines=False))
    # Write prediction errors counts to output files.
    _ = (errors
         | "GROUP_BY_ERROR_TYPE" >> beam.combiners.Count.PerKey()
         | "WRITE_ERRORS" >> WriteToText(args.output_error_prefix))

    return p.run()
Code Example #28
                                                     skip_header_lines=1)
    | "De texto para lista (chuvas)" >> beam.Map(texto_para_lista,
                                                 delimitador=',')
    | "Criando a chave UF-ANO-MES" >> beam.Map(chave_uf_ano_mes_de_lista)
    | "Soma do total de chuvas pela chave" >> beam.CombinePerKey(sum)
    | "Arrendondar resultados de chuvas" >> beam.Map(arredonda)
    # | "Mostrar resultados" >> beam.Map(print)
)

resultado = (
    # (chuvas, dengue)
    # | "Empilha as pcols" >> beam.Flatten()
    # | "Agrupa as pcols" >> beam.GroupByKey()
    ({
        'chuvas': chuvas,
        'dengue': dengue
    })
    | 'Mesclar pcols' >> beam.CoGroupByKey()
    | 'Filtrar dados vazios' >> beam.Filter(filtra_campos_vazios)
    | 'Descompactar elementos' >> beam.Map(descompactar_elementos)
    | 'Preparar csv' >> beam.Map(preparar_csv)
    # | "Mostrar resultados da união" >> beam.Map(print)
)

# uf, ano, mes, str(chuva), str(dengue)
header = 'UF;ANO;MES;CHUVA;DENGUE'

resultado | 'Criar arquivo CSV' >> WriteToText(
    'resultado', file_name_suffix='.csv', header=header)

pipeline.run()
Code Example #29
class WriteToCSV(beam.DoFn):
    def process(self, element):
        """
        Prepares each row to be written to the CSV
        """
        result = [
            "{},{},{}".format(element[0], element[1]['users'][0],
                              element[1]['timings'][0])
        ]
        return result


with beam.Pipeline(options=options) as p:
    rows = (p | ReadFromText(input_filename) | beam.ParDo(Split()))

    timings = (rows | beam.ParDo(CollectTimings())
               | "Grouping timings" >> beam.GroupByKey()
               | "Calculating average" >> beam.CombineValues(
                   beam.combiners.MeanCombineFn()))

    users = (rows | beam.ParDo(CollectUsers())
             | "Grouping users" >> beam.GroupByKey() | "Counting users" >>
             beam.CombineValues(beam.combiners.CountCombineFn()))

    to_be_joined = ({
        'timings': timings,
        'users': users
    } | beam.CoGroupByKey() | beam.ParDo(WriteToCSV())
                    | WriteToText(output_filename))
Code Example #30
def run():
    # Sentences From Text
    _sentence_tokenizer = nltk.data.load("./tokenizer/punkt_turkish.pickle")
    word_tokenizer = WordPunctTokenizer()
    abbreviations = set()
    with open("./tokenizer/abbreviations-long.txt") as f:
        for l in f:
            abbreviations.add(l.split(':')[0])

    _sentence_tokenizer._params.abbrev_types = abbreviations

    def sentences_from_text(text):
        return _sentence_tokenizer.tokenize(text.strip())

    def tokens_from_sentence(sentence):
        return sentence.split()  # nltk.word_tokenize(sentence)

    def ngrams(obj, n):
        tokens = []
        sentences = (sentences_from_text(obj["title"]) +
                     sentences_from_text(obj["description"]) +
                     sentences_from_text(obj["content"]))

        for sentence in sentences:
            tokens += tokens_from_sentence(sentence)

        pairs = nltk.ngrams(tokens, n)
        return [" ".join(pair) for pair in pairs]

    def convertToJsonObj(jsonText):
        return simplejson.loads(jsonText)

    def convertToObject(jsonObj):
        x = jsonObj

        obj = {
            "title":
            x.get("properties", {}).get("title", {}).get("stringValue", ""),
            "link":
            x.get("properties", {}).get("link", {}).get("stringValue", ""),
            "published":
            x.get("properties", {}).get("published",
                                        {}).get("stringValue", ""),
            "description":
            x.get("properties", {}).get("description",
                                        {}).get("stringValue", ""),
            "content":
            x.get("properties", {}).get("content", {}).get("stringValue", ""),
        }

        obj["key"] = obj["link"] if obj["link"] else str(uuid.uuid4())

        return obj

    # https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
    def cleanhtml(raw_html):
        cleanr = re.compile('<.*?>')
        cleantext = re.sub(cleanr, '', raw_html)
        return cleantext

    def removeHTMLFromStrings(obj):
        for key in obj.keys():
            obj[key] = cleanhtml(obj[key])

        return obj

    def tokenize_to_sentences(obj):

        obj["sentences"] = (sentences_from_text(obj["title"]) +
                            sentences_from_text(obj["description"]) +
                            sentences_from_text(obj["content"]))

        return obj

    def tokenize_to_words(obj):

        obj["tokens"] = []

        for sentence in obj["sentences"]:
            obj["tokens"] += tokens_from_sentence(sentence)

        for token in obj["tokens"]:
            yield (obj["key"], token)

    def get_named_entities(mdl, tokens):
        stemmer = TurkishStemmer()
        res = mdl.analyze(tokens)
        entities = []
        for entity in res["entities"]:
            for entity2 in entity["text"].split(", "):
                ne = stemmer.stem(entity2).split("'")[0]
                entities.append((entity["type"], ne, entity["score"]))
        return entities

    options = PipelineOptions()
    options.view_as(StandardOptions).runner = 'DirectRunner'

    p = beam.Pipeline(options=options)

    pairs = (
        p
        | "Read From Text" >>
        ReadFromText("news.json",
                     coder=beam.coders.coders.StrUtf8Coder())  # line by line
        | "Convert to Json Object" >> beam.Map(convertToJsonObj)
        | "Convert to Python Object" >> beam.Map(convertToObject)
        | "Remove HTML Tags From Strings (Normalization 1)" >>
        beam.Map(removeHTMLFromStrings))

    tokens_1gram = (
        pairs
        | 'Sentence Tokenization' >> beam.Map(tokenize_to_sentences)
        | 'Word Tokenization' >> beam.FlatMap(
            tokenize_to_words)  # also convert to key value pairs
    )

    tokens = tokens_1gram

    def process_tokens_last(doc, tokens):
        return (doc, get_named_entities(tokens))

    doc_named_entities = (
        tokens
        | beam.GroupByKey()
        #     | beam.Map(lambda (doc, tokens): process_tokens_last(mdl, tokens))
    )

    (doc_named_entities | "Write Results" >> WriteToText("doc_tokens"))

    p.run()