Example #1
    def preprocess(self, input_path, input_dict, output_path):
        """

    Args:
      input_path: Input specified as uri to CSV file. Each line of csv file
                  contains colon-separated GCS uri to an image and labels
      input_dict: Input dictionary. Specified as text file uri.
                  Each line of the file stores one label.
    """
        opt = self.pipeline_options.view_as(PrepareImagesOptions)
        p = df.Pipeline(options=self.pipeline_options)

        # Read input data.
        csv_data = df.io.TextFileSource(input_path,
                                        strip_trailing_newlines=True)
        dict_data = df.io.TextFileSource(input_dict,
                                         strip_trailing_newlines=True)
        labels = (p | df.Read(StageName.READ_DICTIONARY, dict_data))
        content = (p | df.Read(StageName.READ_CSV, csv_data)
                   | df.Map(StageName.PARSE_CSV,
                            lambda line: csv.reader([line]).next())
                   | df.ParDo(StageName.EXTRACT_LABEL_IDS,
                              ExtractLabelIdsDoFn(), df.pvalue.AsIter(labels))
                   | df.ParDo(StageName.READ_IMAGE, ExtractImageDoFn()))

        # Process input data using common transformations.
        image_graph_uri = os.path.join(opt.input_data_location,
                                       Default.IMAGE_GRAPH_FILENAME)
        examples = (
            content
            | df.ParDo(
                StageName.CONVERT_IMAGE,
                ResizeImageDoFn(Default.IMAGE_TYPE, opt.max_image_width,
                                opt.max_image_height))
            | df.ParDo(
                StageName.ENCODE_EXAMPLE,
                EncodeExampleDoFn(image_graph_uri,
                                  opt.image_graph_jpeg_input_tensor,
                                  opt.image_graph_output_tensor,
                                  opt.training_data_percentage)))

        # Write in JSON format to Text file.
        # Remove redundant whitespace for more compact representation.
        # Images/labels are base64 encoded so will not contain spaces.
        to_json = lambda x: re.sub(r'\s+', ' ',
                                   json_format.MessageToJson(x[0]))

        for dataset in Dataset.ALL:
            _ = (examples
                 | df.Filter(StageName.FILTER + dataset,
                             lambda x, dataset=dataset: x[1] == dataset)
                 | df.Map(StageName.TO_JSON + dataset, to_json)
                 | df.Write(
                     StageName.SAVE + dataset,
                     df.io.TextFileSink('{}.{}.json'.format(
                         output_path, dataset),
                                        num_shards=opt.output_shard_count)))

        # Execute the pipeline.
        p.run()
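
ExtractLabelIdsDoFn, ExtractImageDoFn, ResizeImageDoFn and EncodeExampleDoFn
are defined elsewhere in the sample. Purely as a rough sketch, the
label-extraction step could look like the following, assuming the old
google.cloud.dataflow DoFn interface (process(self, context, *side_inputs)
with the element on context.element) and a CSV layout whose first column is
the image URI followed by label columns:

class ExtractLabelIdsSketchDoFn(df.DoFn):
    """Illustrative only; not the sample's actual ExtractLabelIdsDoFn."""

    def process(self, context, all_labels):
        # all_labels is the side input provided via df.pvalue.AsIter(labels).
        label_to_id = dict((label.strip(), i)
                           for i, label in enumerate(all_labels))
        row = context.element  # parsed CSV row (list of fields)
        uri, labels = row[0], row[1:]
        # Keep only labels that appear in the dictionary file.
        yield uri, [label_to_id[l] for l in labels if l in label_to_id]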
Example #2
def run(argv=None):
  """Runs the Wikipedia top edits pipeline.

  Args:
    argv: Pipeline options as a list of arguments.
  """

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/wikipedia_edits/*.json',
      help='Input specified as a GCS path containing a BigQuery table exported '
      'as json.')
  parser.add_argument('--output',
                      required=True,
                      help='Output file to write results to.')
  parser.add_argument('--sampling_threshold',
                      type=float,
                      default=0.1,
                      help='Fraction of entries used for session tracking')
  known_args, pipeline_args = parser.parse_known_args(argv)

  p = df.Pipeline(argv=pipeline_args)

  (p  # pylint: disable=expression-not-assigned
   | df.Read('read', df.io.TextFileSource(known_args.input))
   | ComputeTopSessions(known_args.sampling_threshold)
   | df.io.Write('write', df.io.TextFileSink(known_args.output)))

  p.run()
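
As a usage illustration only (the output path and threshold are placeholders,
not values from the sample), the entry point could be invoked like this; any
extra pipeline options are passed through to df.Pipeline via pipeline_args:

# Hypothetical invocation of the Wikipedia top-edits workflow.
run([
    '--input=gs://dataflow-samples/wikipedia_edits/*.json',
    '--output=gs://my-bucket/wikipedia/top_sessions',  # placeholder path
    '--sampling_threshold=0.05',
])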
Example #3
  def test_run_direct(self):
    file_name = self._create_temp_file('aaaa\nbbbb\ncccc\ndddd')
    pipeline = df.Pipeline('DirectPipelineRunner')
    pcoll = pipeline | df.Read(LineSource(file_name))
    assert_that(pcoll, equal_to(['aaaa', 'bbbb', 'cccc', 'dddd']))

    pipeline.run()
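
For comparison, the same assertion could be written against the built-in text
source used by the other samples; this is a sketch that assumes
_create_temp_file behaves as above:

  def test_run_direct_textfilesource(self):
    file_name = self._create_temp_file('aaaa\nbbbb\ncccc\ndddd')
    pipeline = df.Pipeline('DirectPipelineRunner')
    pcoll = pipeline | df.Read(
        'read', df.io.TextFileSource(file_name, strip_trailing_newlines=True))
    assert_that(pcoll, equal_to(['aaaa', 'bbbb', 'cccc', 'dddd']))
    pipeline.run()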
Example #4
def run(argv=None):
    """Constructs and runs the example filtering pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        help='BigQuery table to read from.',
        default='clouddataflow-readonly:samples.weather_stations')
    parser.add_argument('--output',
                        required=True,
                        help='BigQuery table to write to.')
    parser.add_argument('--month_filter',
                        default=7,
                        help='Numeric value of month to filter on.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    input_data = p | df.Read('input', df.io.BigQuerySource(known_args.input))

    # pylint: disable=expression-not-assigned
    (filter_cold_days(input_data, known_args.month_filter)
     | df.io.Write(
         'save to BQ',
         df.io.BigQuerySink(
             known_args.output,
             schema='year:INTEGER,month:INTEGER,day:INTEGER,mean_temp:FLOAT',
             create_disposition=df.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=df.io.BigQueryDisposition.WRITE_TRUNCATE)))

    # Actually run the pipeline (all operations above are deferred).
    p.run()
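
filter_cold_days is defined elsewhere in the sample. Purely to illustrate how
rows from df.io.BigQuerySource can be shaped, a simplified stand-in (it only
filters on the month and omits the actual temperature criterion) might look
like this, with field names taken from the sink schema above:

def filter_cold_days_stub(input_data, month_filter):
    """Simplified stand-in; not the sample's actual filter_cold_days."""
    return (input_data
            | df.Filter('filter by month',
                        lambda row, m=month_filter: int(row['month']) == int(m))
            | df.Map('project fields',
                     lambda row: {'year': row['year'],
                                  'month': row['month'],
                                  'day': row['day'],
                                  'mean_temp': row['mean_temp']}))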
Example #5
def run(argv=None):
    """Runs the workflow counting the long words and short words separately."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        required=True,
                        help='Output prefix for files to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = df.Pipeline(argv=pipeline_args)

    lines = p | df.Read('read', df.io.TextFileSource(known_args.input))

    # with_outputs allows accessing the side outputs of a DoFn.
    split_lines_result = (
        lines
        | df.ParDo(SplitLinesToWordsFn()).with_outputs(
            SplitLinesToWordsFn.SIDE_OUTPUT_TAG_SHORT_WORDS,
            SplitLinesToWordsFn.SIDE_OUTPUT_TAG_CHARACTER_COUNT,
            main='words'))

    # split_lines_result is an object of type DoOutputsTuple. It supports
    # accessing result in alternative ways.
    words, _, _ = split_lines_result
    short_words = split_lines_result[
        SplitLinesToWordsFn.SIDE_OUTPUT_TAG_SHORT_WORDS]
    character_count = split_lines_result.tag_character_count

    # pylint: disable=expression-not-assigned
    (character_count
     | df.Map('pair_with_key', lambda x: ('chars_temp_key', x))
     | df.GroupByKey()
     | df.Map('count chars', lambda (_, counts): sum(counts))
     | df.Write('write chars',
                df.io.TextFileSink(known_args.output + '-chars')))

    # pylint: disable=expression-not-assigned
    (short_words
     | CountWords('count short words')
     | df.Write('write short words',
                df.io.TextFileSink(known_args.output + '-short-words')))

    # pylint: disable=expression-not-assigned
    (words
     | CountWords('count words')
     | df.Write('write words',
                df.io.TextFileSink(known_args.output + '-words')))

    p.run()
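
SplitLinesToWordsFn is defined elsewhere in the sample. As a condensed sketch
only, assuming the old SDK's context-based process method and that values are
routed to a side output by wrapping them in df.pvalue.SideOutputValue(tag,
value):

class SplitLinesToWordsSketchFn(df.DoFn):
    """Illustrative only; not the sample's actual SplitLinesToWordsFn."""
    SIDE_OUTPUT_TAG_SHORT_WORDS = 'tag_short_words'
    SIDE_OUTPUT_TAG_CHARACTER_COUNT = 'tag_character_count'

    def process(self, context):
        line = context.element
        # The per-line character count goes to one side output.
        yield df.pvalue.SideOutputValue(self.SIDE_OUTPUT_TAG_CHARACTER_COUNT,
                                        len(line))
        for word in line.split():
            if len(word) <= 3:
                # Short words go to the other side output.
                yield df.pvalue.SideOutputValue(
                    self.SIDE_OUTPUT_TAG_SHORT_WORDS, word)
            else:
                # Everything else is the main 'words' output.
                yield word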
Example #6
def run(argv=None):
    """Run the workflow."""

    parser = argparse.ArgumentParser()
    parser.add_argument('--output')
    parser.add_argument('--ignore_corpus', default='')
    parser.add_argument('--ignore_word', default='')
    parser.add_argument('--num_groups')

    known_args, pipeline_args = parser.parse_known_args(argv)
    p = df.Pipeline(argv=pipeline_args)

    group_ids = ['id' + str(i) for i in xrange(int(known_args.num_groups))]

    query_corpus = 'select UNIQUE(corpus) from publicdata:samples.shakespeare'
    query_word = 'select UNIQUE(word) from publicdata:samples.shakespeare'
    ignore_corpus = known_args.ignore_corpus
    ignore_word = known_args.ignore_word

    pcoll_corpus = p | df.Read('read corpus',
                               df.io.BigQuerySource(query=query_corpus))
    pcoll_word = p | df.Read('read words',
                             df.io.BigQuerySource(query=query_word))
    pcoll_ignore_corpus = p | df.Create('create_ignore_corpus',
                                        [ignore_corpus])
    pcoll_ignore_word = p | df.Create('create_ignore_word', [ignore_word])
    pcoll_group_ids = p | df.Create('create groups', group_ids)

    pcoll_groups = create_groups(pcoll_group_ids, pcoll_corpus, pcoll_word,
                                 pcoll_ignore_corpus, pcoll_ignore_word)

    # pylint:disable=expression-not-assigned
    pcoll_groups | df.io.Write('WriteToText',
                               df.io.TextFileSink(known_args.output))
    p.run()
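
create_groups is defined elsewhere in the sample. As with the other samples,
the entry point can be exercised directly; all values below are placeholders:

# Hypothetical invocation of the side-input workflow.
run([
    '--output=gs://my-bucket/bigquery_side_input/output',
    '--num_groups=3',
    '--ignore_corpus=kinglear',
    '--ignore_word=brother',
])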
Example #7
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        required=True,
                        help='BigQuery request input table.')
    parser.add_argument('--output',
                        dest='output',
                        help='BigQuery output table.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    output_table = '%s' % known_args.output
    input_query = """
    SELECT
      page, url,
      DOMAIN(page) as domain,
      IF (DOMAIN(page) == DOMAIN(url), false, true) AS third_party,
    FROM [%s]
  """ % known_args.input

    classifiers = {}
    for list_name in ['ad', 'tracker', 'social']:
        # Load each Easylist-style rules file and build a classifier for it.
        with open('local/' + list_name + '.txt') as rules_file:
            rules = [line.rstrip('\n') for line in rules_file]
        classifier = AdblockRules(rules,
                                  supported_options=['domain', 'third-party'],
                                  skip_unsupported_rules=False,
                                  use_re2=True)
        del rules
        classifiers[list_name] = classifier

    p = df.Pipeline(argv=pipeline_args)

    (p
     | df.Read('read', df.io.BigQuerySource(query=input_query))
     | df.ParDo('classify', EasylistClassifyDoFn(), classifiers)
     # | df.io.Write('write', df.io.TextFileSink('out')))
     | df.Write(
         'write',
         df.io.BigQuerySink(
             output_table,
             schema='page:STRING, url:STRING, type:STRING',
             create_disposition=df.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=df.io.BigQueryDisposition.WRITE_TRUNCATE)))

    p.run()
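
EasylistClassifyDoFn is defined elsewhere. To illustrate how the classifiers
dict could be applied per row, here is a rough sketch assuming the old SDK
DoFn interface and adblockparser's should_block(url, options) API, with the
'domain' and 'third_party' fields coming from the query above:

class EasylistClassifySketchDoFn(df.DoFn):
    """Illustrative only; not the sample's actual EasylistClassifyDoFn."""

    def process(self, context, classifiers):
        row = context.element
        options = {'domain': row['domain'], 'third-party': row['third_party']}
        for rule_type, rules in classifiers.iteritems():
            # should_block() evaluates the Easylist rules against the URL.
            if rules.should_block(row['url'], options):
                yield {'page': row['page'], 'url': row['url'],
                       'type': rule_type}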
Example #8
              Sagittarius=((11, 22), (12, 21)),
              Capricorn=((12, 21), (1, 19)),
              Aquarius=((1, 20), (2, 18)),
              Pisces=((2, 19), (3, 20)))


def get_zodiac_sign(line):
    name, day, month = line.split(',')

    d = int(day)
    m = int(month)

    # Special case: Capricorn spans the year boundary (Dec 21 - Jan 19), so it
    # cannot be matched by the month-range check below.
    if (m == 12 and d >= 21) or (m == 1 and d <= 19):
        return 'Capricorn'

    for sign, (s, e) in zodiac.iteritems():
        if s[0] <= m <= e[0]:
            if (m == s[0] and d >= s[1]) or (m == e[0] and d <= e[1]):
                return sign
    return


p = df.Pipeline('DirectPipelineRunner')
(p
 | df.Read('load messages', df.io.TextFileSource('./player_birth_dates.csv'))
 | df.Map('get zodiac sign', get_zodiac_sign)
 | df.combiners.Count.PerElement('count signs')
 | df.Write('save', df.io.TextFileSink('./results')))
p.run()
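
For a quick sanity check of get_zodiac_sign (assuming the 'name,day,month'
line layout implied by line.split(',') and the usual date ranges in the full
zodiac dict, which is truncated above):

print get_zodiac_sign('Lionel Messi,24,6')  # expected: 'Cancer'
print get_zodiac_sign('Some Player,10,1')   # expected: 'Capricorn'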