def preprocess(self, input_path, input_dict, output_path):
  """Builds and runs the image preprocessing pipeline.

  Args:
    input_path: Input specified as uri to CSV file. Each line of the csv
      file contains colon-separated GCS uri to an image and labels.
    input_dict: Input dictionary. Specified as text file uri. Each line of
      the file stores one label.
    output_path: Output path prefix; one JSON file is written per dataset.
  """
  opt = self.pipeline_options.view_as(PrepareImagesOptions)
  p = df.Pipeline(options=self.pipeline_options)

  # Read input data.
  csv_data = df.io.TextFileSource(input_path, strip_trailing_newlines=True)
  dict_data = df.io.TextFileSource(input_dict, strip_trailing_newlines=True)
  labels = (p | df.Read(StageName.READ_DICTIONARY, dict_data))
  content = (p
             | df.Read(StageName.READ_CSV, csv_data)
             | df.Map(StageName.PARSE_CSV,
                      lambda line: csv.reader([line]).next())
             | df.ParDo(StageName.EXTRACT_LABEL_IDS, ExtractLabelIdsDoFn(),
                        df.pvalue.AsIter(labels))
             | df.ParDo(StageName.READ_IMAGE, ExtractImageDoFn()))

  # Process input data using common transformations.
  image_graph_uri = os.path.join(opt.input_data_location,
                                 Default.IMAGE_GRAPH_FILENAME)
  examples = (content
              | df.ParDo(StageName.CONVERT_IMAGE,
                         ResizeImageDoFn(Default.IMAGE_TYPE,
                                         opt.max_image_width,
                                         opt.max_image_height))
              | df.ParDo(StageName.ENCODE_EXAMPLE,
                         EncodeExampleDoFn(image_graph_uri,
                                           opt.image_graph_jpeg_input_tensor,
                                           opt.image_graph_output_tensor,
                                           opt.training_data_percentage)))

  # Write in JSON format to text files.
  # Remove redundant whitespace for a more compact representation.
  # Images/labels are base64 encoded so will not contain spaces.
  to_json = lambda x: re.sub(r'\s+', ' ', json_format.MessageToJson(x[0]))

  for dataset in Dataset.ALL:
    _ = (examples
         | df.Filter(StageName.FILTER + dataset,
                     lambda x, dataset=dataset: x[1] == dataset)
         | df.Map(StageName.TO_JSON + dataset, to_json)
         | df.Write(StageName.SAVE + dataset,
                    df.io.TextFileSink(
                        '{}.{}.json'.format(output_path, dataset),
                        num_shards=opt.output_shard_count)))

  # Execute the pipeline.
  p.run()
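

# A hedged sketch of what ExtractLabelIdsDoFn might look like, assuming the
# old SDK's context-based DoFn.process signature and that the parsed CSV row
# is [image_uri, label, label, ...]. The dictionary side input (passed via
# df.pvalue.AsIter above) supplies one label per line; ids are assigned by
# line order. The field layout and id assignment are illustrative assumptions.
class ExtractLabelIdsDoFn(df.DoFn):

  def process(self, context, all_labels):
    # Build a label -> id lookup from the dictionary side input.
    label_to_id = dict((label.strip(), i)
                       for i, label in enumerate(all_labels))
    row = context.element
    uri, labels = row[0], row[1:]
    ids = [label_to_id[label] for label in labels if label in label_to_id]
    yield uri, ids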
def run(argv=None):
  """Runs the Wikipedia top edits pipeline.

  Args:
    argv: Pipeline options as a list of arguments.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/wikipedia_edits/*.json',
      help='Input specified as a GCS path containing a BigQuery table '
           'exported as json.')
  parser.add_argument('--output',
                      required=True,
                      help='Output file to write results to.')
  parser.add_argument('--sampling_threshold',
                      type=float,
                      default=0.1,
                      help='Fraction of entries used for session tracking.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  p = df.Pipeline(argv=pipeline_args)

  (p  # pylint: disable=expression-not-assigned
   | df.Read('read', df.io.TextFileSource(known_args.input))
   | ComputeTopSessions(known_args.sampling_threshold)
   | df.io.Write('write', df.io.TextFileSink(known_args.output)))

  p.run()
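

# A hedged sketch of the ComputeTopSessions composite transform used above,
# assuming the old SDK's PTransform with an apply() hook. The JSON field name
# 'contributor_username' and the hash-based sampling rule are illustrative
# assumptions; the real transform additionally sessionizes edits over time.
import json


class ComputeTopSessions(df.PTransform):

  def __init__(self, sampling_threshold):
    super(ComputeTopSessions, self).__init__()
    self.sampling_threshold = sampling_threshold

  def apply(self, pcoll):
    return (pcoll
            | df.Map('parse', json.loads)
            | df.Filter(
                'sample users',
                lambda edit, frac=self.sampling_threshold:
                abs(hash(edit.get('contributor_username'))) % 100 < 100 * frac)
            | df.Map('user', lambda edit: edit['contributor_username'])
            | df.combiners.Count.PerElement('count edits per user'))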
def test_run_direct(self):
  file_name = self._create_temp_file('aaaa\nbbbb\ncccc\ndddd')

  pipeline = df.Pipeline('DirectPipelineRunner')
  pcoll = pipeline | df.Read(LineSource(file_name))
  assert_that(pcoll, equal_to(['aaaa', 'bbbb', 'cccc', 'dddd']))

  pipeline.run()
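

# A minimal sketch of the _create_temp_file helper the test relies on (a
# method on the test case): it writes the given contents to a named temporary
# file and returns its path so LineSource can read it back.
import tempfile


def _create_temp_file(self, contents):
  with tempfile.NamedTemporaryFile(delete=False, suffix='.txt') as f:
    f.write(contents)
    return f.name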
def run(argv=None):
  """Constructs and runs the example filtering pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      help='BigQuery table to read from.',
      default='clouddataflow-readonly:samples.weather_stations')
  parser.add_argument('--output',
                      required=True,
                      help='BigQuery table to write to.')
  parser.add_argument('--month_filter',
                      default=7,
                      help='Numeric value of month to filter on.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  p = df.Pipeline(argv=pipeline_args)

  input_data = p | df.Read('input', df.io.BigQuerySource(known_args.input))

  # pylint: disable=expression-not-assigned
  (filter_cold_days(input_data, known_args.month_filter)
   | df.io.Write(
       'save to BQ',
       df.io.BigQuerySink(
           known_args.output,
           schema='year:INTEGER,month:INTEGER,day:INTEGER,mean_temp:FLOAT',
           create_disposition=df.io.BigQueryDisposition.CREATE_IF_NEEDED,
           write_disposition=df.io.BigQueryDisposition.WRITE_TRUNCATE)))

  # Actually run the pipeline (all operations above are deferred).
  p.run()
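

# A hedged sketch of the filter_cold_days transform applied above: it projects
# the fields written by the BigQuery sink, keeps only rows for the requested
# month, and drops "warm" readings. The 60.0 degree cutoff is an illustrative
# assumption; the real criterion may differ.
def filter_cold_days(input_data, month_filter):
  month = int(month_filter)
  return (input_data
          | df.Map('project fields',
                   lambda row: {'year': row['year'],
                                'month': row['month'],
                                'day': row['day'],
                                'mean_temp': row['mean_temp']})
          | df.Filter('filter by month', lambda row: row['month'] == month)
          | df.Filter('filter cold days',
                      lambda row: row['mean_temp'] < 60.0))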
def run(argv=None):
  """Runs the workflow counting the long words and short words separately."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      default='gs://dataflow-samples/shakespeare/kinglear.txt',
                      help='Input file to process.')
  parser.add_argument('--output',
                      required=True,
                      help='Output prefix for files to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  p = df.Pipeline(argv=pipeline_args)

  lines = p | df.Read('read', df.io.TextFileSource(known_args.input))

  # with_outputs allows accessing the side outputs of a DoFn.
  split_lines_result = (
      lines
      | df.ParDo(SplitLinesToWordsFn()).with_outputs(
          SplitLinesToWordsFn.SIDE_OUTPUT_TAG_SHORT_WORDS,
          SplitLinesToWordsFn.SIDE_OUTPUT_TAG_CHARACTER_COUNT,
          main='words'))

  # split_lines_result is an object of type DoOutputsTuple. It supports
  # accessing results in alternative ways.
  words, _, _ = split_lines_result
  short_words = split_lines_result[
      SplitLinesToWordsFn.SIDE_OUTPUT_TAG_SHORT_WORDS]
  character_count = split_lines_result.tag_character_count

  # pylint: disable=expression-not-assigned
  (character_count
   | df.Map('pair_with_key', lambda x: ('chars_temp_key', x))
   | df.GroupByKey()
   | df.Map('count chars', lambda (_, counts): sum(counts))
   | df.Write('write chars',
              df.io.TextFileSink(known_args.output + '-chars')))

  # pylint: disable=expression-not-assigned
  (short_words
   | CountWords('count short words')
   | df.Write('write short words',
              df.io.TextFileSink(known_args.output + '-short-words')))

  # pylint: disable=expression-not-assigned
  (words
   | CountWords('count words')
   | df.Write('write words',
              df.io.TextFileSink(known_args.output + '-words')))

  p.run()
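

# A hedged sketch of SplitLinesToWordsFn (assumes `import re`), using the old
# SDK's pvalue.SideOutputValue mechanism for tagged outputs: per-line
# character counts and short words go to the side outputs consumed above,
# longer words go to the main 'words' output. The length cutoff of 3 is an
# illustrative assumption.
class SplitLinesToWordsFn(df.DoFn):
  SIDE_OUTPUT_TAG_SHORT_WORDS = 'tag_short_words'
  SIDE_OUTPUT_TAG_CHARACTER_COUNT = 'tag_character_count'

  def process(self, context):
    line = context.element
    # Emit the length of the line to the character-count side output.
    yield df.pvalue.SideOutputValue(self.SIDE_OUTPUT_TAG_CHARACTER_COUNT,
                                    len(line))
    for word in re.findall(r"[A-Za-z']+", line):
      if len(word) <= 3:
        # Short words go to a dedicated side output.
        yield df.pvalue.SideOutputValue(self.SIDE_OUTPUT_TAG_SHORT_WORDS,
                                        word)
      else:
        # Everything else goes to the main output.
        yield word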
def run(argv=None):
  """Run the workflow."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--output')
  parser.add_argument('--ignore_corpus', default='')
  parser.add_argument('--ignore_word', default='')
  parser.add_argument('--num_groups')
  known_args, pipeline_args = parser.parse_known_args(argv)

  p = df.Pipeline(argv=pipeline_args)

  group_ids = []
  for i in xrange(0, int(known_args.num_groups)):
    group_ids.append('id' + str(i))

  query_corpus = 'select UNIQUE(corpus) from publicdata:samples.shakespeare'
  query_word = 'select UNIQUE(word) from publicdata:samples.shakespeare'
  ignore_corpus = known_args.ignore_corpus
  ignore_word = known_args.ignore_word

  pcoll_corpus = p | df.Read('read corpus',
                             df.io.BigQuerySource(query=query_corpus))
  pcoll_word = p | df.Read('read words',
                           df.io.BigQuerySource(query=query_word))
  pcoll_ignore_corpus = p | df.Create('create_ignore_corpus', [ignore_corpus])
  pcoll_ignore_word = p | df.Create('create_ignore_word', [ignore_word])
  pcoll_group_ids = p | df.Create('create groups', group_ids)

  pcoll_groups = create_groups(pcoll_group_ids, pcoll_corpus, pcoll_word,
                               pcoll_ignore_corpus, pcoll_ignore_word)

  # pylint:disable=expression-not-assigned
  pcoll_groups | df.io.Write('WriteToText',
                             df.io.TextFileSink(known_args.output))
  p.run()
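

# A hedged sketch of the create_groups helper used above: each group id gets
# paired with one corpus and one word drawn from the BigQuery side inputs,
# skipping the ignored values. Passing the ignore values as singleton side
# inputs and the "first non-ignored row" rule are illustrative assumptions.
def create_groups(group_ids, corpus_pcoll, word_pcoll,
                  ignore_corpus_pcoll, ignore_word_pcoll):

  def attach_corpus_fn(group, corpus_rows, ignore):
    chosen = next(c['corpus'] for c in corpus_rows if c['corpus'] != ignore)
    yield (group, chosen)

  def attach_word_fn((group, corpus), word_rows, ignore):
    chosen = next(w['word'] for w in word_rows if w['word'] != ignore)
    yield (group, corpus, chosen)

  return (group_ids
          | df.FlatMap('attach corpus', attach_corpus_fn,
                       df.pvalue.AsIter(corpus_pcoll),
                       df.pvalue.AsSingleton(ignore_corpus_pcoll))
          | df.FlatMap('attach word', attach_word_fn,
                       df.pvalue.AsIter(word_pcoll),
                       df.pvalue.AsSingleton(ignore_word_pcoll)))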
def run(argv=None):
  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      required=True,
                      help='BigQuery request input table.')
  parser.add_argument('--output',
                      dest='output',
                      help='BigQuery output table.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  output_table = '%s' % known_args.output
  input_query = """
    SELECT page, url, DOMAIN(page) as domain,
           IF (DOMAIN(page) == DOMAIN(url), false, true) AS third_party,
    FROM [%s]
  """ % known_args.input

  # Build one AdblockRules classifier per EasyList-style rules file.
  classifiers = {}
  for file in ['ad', 'tracker', 'social']:
    rules = [line.rstrip('\n') for line in open('local/' + file + '.txt')]
    classifier = AdblockRules(rules,
                              supported_options=['domain', 'third-party'],
                              skip_unsupported_rules=False,
                              use_re2=True)
    del rules
    classifiers[file] = classifier

  p = df.Pipeline(argv=pipeline_args)

  (p
   | df.Read('read', df.io.BigQuerySource(query=input_query))
   | df.ParDo('classify', EasylistClassifyDoFn(), classifiers)
   # | df.io.Write('write', df.io.TextFileSink('out')))
   | df.Write(
       'write',
       df.io.BigQuerySink(
           output_table,
           schema='page:STRING, url:STRING, type:STRING',
           create_disposition=df.io.BigQueryDisposition.CREATE_IF_NEEDED,
           write_disposition=df.io.BigQueryDisposition.WRITE_TRUNCATE)))
  p.run()
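

# A hedged sketch of EasylistClassifyDoFn: it runs each request row through
# the AdblockRules classifiers built above (passed to ParDo as an extra
# argument) and emits a row matching the BigQuery sink schema. The
# context-based process signature and the 'clean' fallback label are
# assumptions.
class EasylistClassifyDoFn(df.DoFn):

  def process(self, context, classifiers):
    row = context.element
    options = {'domain': row['domain'],
               'third-party': bool(row['third_party'])}
    row_type = 'clean'
    for list_name, rules in classifiers.iteritems():
      if rules.should_block(row['url'], options):
        row_type = list_name
        break
    yield {'page': row['page'], 'url': row['url'], 'type': row_type}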
zodiac = dict(
    # ... earlier signs elided in this excerpt ...
    Sagittarius=((11, 22), (12, 21)),
    Capricorn=((12, 21), (1, 19)),
    Aquarius=((1, 20), (2, 18)),
    Pisces=((2, 19), (3, 20)))


def get_zodiac_sign(line):
  name, day, month = line.split(',')
  d = int(day)
  m = int(month)
  # Special case for Capricorn, whose range wraps around the year boundary.
  if (m == 12 and d >= 21) or (m == 1 and d <= 19):
    return 'Capricorn'
  for sign, (s, e) in zodiac.iteritems():
    if s[0] <= m <= e[0]:
      if (m == s[0] and d >= s[1]) or (m == e[0] and d <= e[1]):
        return sign
  return None


p = df.Pipeline('DirectPipelineRunner')

(p
 | df.Read('load messages', df.io.TextFileSource('./player_birth_dates.csv'))
 | df.Map('get zodiac sign', get_zodiac_sign)
 | df.combiners.Count.PerElement('count signs')
 | df.Write('save', df.io.TextFileSink('./results')))
p.run()
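

# Illustrative contents for ./player_birth_dates.csv; each line is
# name,day,month in the order expected by get_zodiac_sign. Names and dates
# below are made up:
#
#   Alice,24,6
#   Bob,2,1
#   Carol,30,11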