def pipeline(root):
  """Assembles the example-generation Beam graph on `root`.

  Every comma-separated pattern in `FLAGS.input_patterns` gets its own branch:
  text lines are read, reshuffled and parsed by `beam_utils.ParseExampleFn`.
  For each branch the main output is written as TFRecords
  (`FLAGS.output_num_shards` shards), while the `text_examples` and
  `parse_failures` tagged outputs are each written to one unsharded `.jsonl`
  file. The global success/failure counts of all branches are merged and
  dumped as indented JSON to `example_gen_stats.txt`. Every file is placed
  directly under `FLAGS.output_dir`.

  Args:
    root: Beam pipeline root that the per-pattern transforms are applied to.
  """
  patterns = FLAGS.input_patterns.split(',')
  config = lib.EtcFeaturizationConfig(
      long_max_length=FLAGS.long_max_length,
      global_max_length=FLAGS.global_max_length,
      url_max_code_points=FLAGS.url_max_code_points,
      bert_vocab_path=FLAGS.bert_vocab_path,
      spm_model_path=FLAGS.spm_model_path,
      do_lower_case=FLAGS.do_lower_case,
      fixed_block_len=FLAGS.fixed_block_len)

  def out_path(file_name):
    # Every artifact is written directly into the configured output dir.
    return os.path.join(FLAGS.output_dir, file_name)

  stats_by_pattern = collections.OrderedDict()
  for index, pattern in enumerate(patterns):
    # Beam requires unique transform labels, so each branch gets its own
    # 'Pattern<i>' label prefix.
    label = f'Pattern{index}'

    lines = root | f'{label}Read' >> beam.io.textio.ReadFromText(pattern)
    shuffled = lines | f'{label}Reshuffle' >> beam.transforms.Reshuffle()
    parsed = shuffled | f'{label}Parse' >> beam.ParDo(
        beam_utils.ParseExampleFn(config)).with_outputs()

    # Output files are named after the pattern's base name with everything
    # from the last '.' onwards removed (if there is a '.').
    stem = os.path.basename(pattern)
    head, dot, _ = stem.rpartition('.')
    if dot:
      stem = head

    # TF Examples come from the undeclared main output (tag `None`).
    examples = parsed[None]
    _ = examples | f'{label}WriteTfExamples' >> beam.io.WriteToTFRecord(
        out_path(f'{stem}.tfrecord'),
        coder=beam.coders.ProtoCoder(tf.train.Example),
        num_shards=FLAGS.output_num_shards)

    # Text examples; the empty shard template forces a single output file.
    _ = (
        parsed.text_examples
        | f'{label}WriteTextExamples' >> beam.io.WriteToText(
            out_path(f'{stem}_text_examples.jsonl'),
            shard_name_template=''))

    # Parse failures, likewise written as a single unsharded file.
    failures = parsed.parse_failures
    _ = failures | f'{label}WriteFailures' >> beam.io.WriteToText(
        out_path(f'{stem}_parse_failures.jsonl'),
        shard_name_template='')

    # Per-pattern statistics: global element counts of successes/failures.
    counts = collections.OrderedDict([
        ('parse_success_count',
         examples
         | f'{label}SuccessCount' >> beam.combiners.Count.Globally()),
        ('parse_fail_count',
         failures
         | f'{label}FailureCount' >> beam.combiners.Count.Globally()),
    ])
    stats_by_pattern[pattern] = beam_utils.singletons_to_dict(
        beam_label=f'{label}Stats', **counts)

  # Merge the statistics of all patterns and emit them as one JSON document.
  merged_stats = beam_utils.singletons_to_dict(**stats_by_pattern)
  stats_json = merged_stats | 'StatsToJson' >> beam.Map(
      lambda stats_dict: json.dumps(stats_dict, indent=2))
  _ = stats_json | 'WriteStats' >> beam.io.WriteToText(
      out_path('example_gen_stats.txt'),
      shard_name_template='')  # Single unsharded stats file.
def test_etc_features_with_long_overflow(self):
  """Checks ETC features when a VDOM element exceeds the long input size.

  The example has two VDOM elements: a five-word heading and a second element
  whose text holds 99 words, far more than `long_max_length=16`. The expected
  features contain only the five tokens of the first element and a single
  active global token; everything else is padding.
  """
  url = ('http://0123putlocker.com/watch/'
         'qd7kBodK-star-trek-discovery-season-1.html')
  url_max_code_points = 80

  # Both elements (and their parents) share the same heading-style features.
  # The dict is expanded into a fresh `VdomFeatures` for every use.
  heading = dict(x_coord=44.0, width=728.0, y_coord=78.0, height=45.0,
                 is_block=True, is_inline=False, is_heading=True,
                 is_leaf=False, font_size=20, is_bold=False)

  title = 'Star Wars and not Trek'
  text = f'{title} ' + ' '.join(['star'] * 12)
  vdom = [
      lib.VdomElement(
          id=0,
          text=title,
          features=lib.VdomFeatures(**heading),
          parent_features=lib.VdomFeatures(**heading),
          start_idx=0,
          end_idx=5),
      lib.VdomElement(
          id=0,
          text=' '.join(['star'] * 99),
          features=lib.VdomFeatures(**heading),
          parent_features=lib.VdomFeatures(**heading),
          start_idx=5,
          end_idx=17),
  ]
  key_phrases = [
      lib.KeyPhrase(['Star', 'Wars']),
      lib.KeyPhrase(['Trek']),
  ]
  example = lib.OpenKpExample(
      url=url, text=text, vdom=vdom, key_phrases=key_phrases)

  config = lib.EtcFeaturizationConfig(
      long_max_length=16,
      global_max_length=4,
      url_max_code_points=url_max_code_points,
      bert_vocab_path=os.path.join(
          absltest.get_default_test_srcdir(), VOCAB_PATH),
      do_lower_case=True)
  tokenizer = tokenization.FullTokenizer(
      config.bert_vocab_path, do_lower_case=config.do_lower_case)

  etc_features = example.to_etc_features(tokenizer, config)

  # The 73 URL characters as code points, padded with -1 up to 80 entries.
  expected_url_code_points = (
      [ord(ch) for ch in url] + [-1] * (url_max_code_points - len(url)))
  expected = lib.OpenKpEtcFeatures(
      url_code_points=expected_url_code_points,
      label_start_idx=[0, 4, -1],
      label_phrase_len=[2, 1, -1],
      # Five real tokens followed by eleven padding positions.
      long_token_ids=[3, 14, 11, 15, 4] + [0] * 11,
      long_word_idx=[0, 1, 2, 3, 4] + [0] * 11,
      long_vdom_idx=[0] * 16,
      long_input_mask=[1] * 5 + [0] * 11,
      long_word_input_mask=[1] * 5 + [0] * 11,
      long_word_first_occurrence=LONG_WORD_FIRST_OCCURRENCE3,
      global_token_ids=[1, 1, 1, 1],
      global_input_mask=[1, 0, 0, 0],
      global_x_coords=[44.0, 0, 0, 0],
      global_y_coords=[78.0, 0, 0, 0],
      global_widths=[728.0, 0, 0, 0],
      global_heights=[45.0, 0, 0, 0],
      global_font_ids=[13, 0, 0, 0],
      global_block_indicator=[1, 0, 0, 0],
      global_inline_indicator=[0, 0, 0, 0],
      global_heading_indicator=[1, 0, 0, 0],
      global_leaf_indicator=[0, 0, 0, 0],
      global_bold_indicator=[0, 0, 0, 0],
      global_parent_x_coords=[44.0, 0, 0, 0],
      global_parent_y_coords=[78.0, 0, 0, 0],
      global_parent_widths=[728.0, 0, 0, 0],
      global_parent_heights=[45.0, 0, 0, 0],
      global_parent_font_ids=[13, 0, 0, 0],
      global_parent_heading_indicator=[1, 0, 0, 0],
      global_parent_leaf_indicator=[0, 0, 0, 0],
      global_parent_bold_indicator=[0, 0, 0, 0])

  self.assertEqual(expected, etc_features)
def test_etc_features_fixed_global_blocks(self):
  """Checks ETC features when `fixed_block_len` is set in the config.

  With `fixed_block_len=4` the expected `long_vdom_idx` groups the long
  tokens into consecutive runs of four, and all VDOM-derived global feature
  lists are expected to be empty.
  """
  url = ('http://0123putlocker.com/watch/'
         'qd7kBodK-star-trek-discovery-season-1.html')
  url_max_code_points = 80

  # Visual feature sets; each dict is expanded into a fresh `VdomFeatures`
  # instance wherever it is used.
  heading = dict(x_coord=44.0, width=728.0, y_coord=78.0, height=45.0,
                 is_block=True, is_inline=False, is_heading=True,
                 is_leaf=False, font_size=20, is_bold=False)
  bold_text = dict(x_coord=208.0, width=49.0, y_coord=138.0, height=15.0,
                   is_block=False, is_inline=False, is_heading=False,
                   is_leaf=False, font_size=12, is_bold=True)
  bold_text_parent = dict(x_coord=198.0, width=564.0, y_coord=138.0,
                          height=15.0, is_block=True, is_inline=False,
                          is_heading=False, is_leaf=False, font_size=12,
                          is_bold=True)

  vdom = [
      lib.VdomElement(
          id=0,
          text='Star Trek Discovery Season 1 Jason',
          features=lib.VdomFeatures(**heading),
          parent_features=lib.VdomFeatures(**heading),
          start_idx=0,
          end_idx=6),
      lib.VdomElement(
          id=0,
          text='Isaacs Jason Isaacs and Doug',
          features=lib.VdomFeatures(**bold_text),
          parent_features=lib.VdomFeatures(**bold_text_parent),
          start_idx=6,
          end_idx=11),
  ]
  key_phrases = [
      lib.KeyPhrase(['Star', 'Trek']),
      lib.KeyPhrase(['Jason', 'Isaacs']),
  ]
  example = lib.OpenKpExample(
      url=url,
      text='Star Trek Discovery Season 1 Jason Isaacs Jason Isaacs and Doug',
      vdom=vdom,
      key_phrases=key_phrases)

  config = lib.EtcFeaturizationConfig(
      long_max_length=16,
      global_max_length=4,
      url_max_code_points=url_max_code_points,
      bert_vocab_path=os.path.join(
          absltest.get_default_test_srcdir(), VOCAB_PATH),
      do_lower_case=True,
      fixed_block_len=4)
  tokenizer = tokenization.FullTokenizer(
      config.bert_vocab_path, do_lower_case=config.do_lower_case)

  etc_features = example.to_etc_features(tokenizer, config)

  # The 73 URL characters as code points, padded with -1 up to 80 entries.
  expected_url_code_points = (
      [ord(ch) for ch in url] + [-1] * (url_max_code_points - len(url)))
  expected = lib.OpenKpEtcFeatures(
      url_code_points=expected_url_code_points,
      label_start_idx=[5, 0, -1],
      label_phrase_len=[2, 2, -1],
      long_token_ids=[3, 4, 5, 6, 7, 8, 9, 10, 8, 9, 10, 11, 12, 0, 0, 0],
      long_word_idx=[0, 1, 2, 3, 4, 5, 6, 6, 7, 8, 8, 9, 10, 0, 0, 0],
      long_vdom_idx=[0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 0, 0, 0],
      # 13 tokens / 11 words are real; the remainder is padding.
      long_input_mask=[1] * 13 + [0] * 3,
      long_word_input_mask=[1] * 11 + [0] * 5,
      global_token_ids=[1, 1, 1, 1],
      global_input_mask=[1, 1, 1, 1],
      global_x_coords=[],
      global_y_coords=[],
      global_widths=[],
      global_heights=[],
      global_font_ids=[],
      global_block_indicator=[],
      global_inline_indicator=[],
      global_heading_indicator=[],
      global_leaf_indicator=[],
      global_bold_indicator=[],
      global_parent_x_coords=[],
      global_parent_y_coords=[],
      global_parent_widths=[],
      global_parent_heights=[],
      global_parent_font_ids=[],
      global_parent_heading_indicator=[],
      global_parent_leaf_indicator=[],
      global_parent_bold_indicator=[])

  self.assertEqual(expected, etc_features)
def test_etc_features_with_vdom_overflow(self):
  """Checks ETC features when there are more VDOM elements than global slots.

  The example has 14 VDOM elements (two multi-word elements followed by
  twelve single-word 'foo' elements) while `global_max_length=4`. The
  expected global features describe only the first four elements, and the
  expected long input stops after the tokens belonging to them.
  """
  url = ('http://0123putlocker.com/watch/'
         'qd7kBodK-star-trek-discovery-season-1.html')
  url_max_code_points = 80
  # The 'foo' filler words occupy word indices [first_foo_idx, num_words).
  first_foo_idx = 8
  num_words = 20

  # Visual feature sets; each dict is expanded into a fresh `VdomFeatures`
  # instance wherever it is used.
  heading = dict(x_coord=44.0, width=728.0, y_coord=78.0, height=45.0,
                 is_block=True, is_inline=False, is_heading=True,
                 is_leaf=False, font_size=20, is_bold=False)
  bold_text = dict(x_coord=208.0, width=49.0, y_coord=138.0, height=15.0,
                   is_block=False, is_inline=False, is_heading=False,
                   is_leaf=False, font_size=12, is_bold=True)
  bold_text_parent = dict(x_coord=198.0, width=564.0, y_coord=138.0,
                          height=15.0, is_block=True, is_inline=False,
                          is_heading=False, is_leaf=False, font_size=12,
                          is_bold=True)
  foo_leaf = dict(x_coord=208.0, width=49.0, y_coord=138.0, height=15.0,
                  is_block=False, is_inline=False, is_heading=False,
                  is_leaf=True, font_size=12, is_bold=True)
  foo_leaf_parent = dict(x_coord=3110.0, width=92.0, y_coord=123.0,
                         height=75.0, is_block=True, is_inline=False,
                         is_heading=False, is_leaf=True, font_size=13,
                         is_bold=True)

  text = 'Star Trek Discovery Season 1 Director Jason Isaacs'
  text += ' foo' * (num_words - first_foo_idx)

  leading_elements = [
      lib.VdomElement(
          id=0,
          text='Star Trek Discovery Season 1 Jason',
          features=lib.VdomFeatures(**heading),
          parent_features=lib.VdomFeatures(**heading),
          start_idx=0,
          end_idx=5),
      lib.VdomElement(
          id=0,
          text='Isaacs Jason Isaacs and Doug',
          features=lib.VdomFeatures(**bold_text),
          parent_features=lib.VdomFeatures(**bold_text_parent),
          start_idx=5,
          end_idx=8),
  ]
  # One single-word element per trailing 'foo'.
  foo_elements = [
      lib.VdomElement(
          id=0,
          text='foo',
          features=lib.VdomFeatures(**foo_leaf),
          parent_features=lib.VdomFeatures(**foo_leaf_parent),
          start_idx=word_idx,
          end_idx=word_idx + 1)
      for word_idx in range(first_foo_idx, num_words)
  ]
  vdom = leading_elements + foo_elements

  key_phrases = [
      lib.KeyPhrase(['Star', 'Trek']),
      lib.KeyPhrase(['Jason', 'Isaacs']),
  ]
  example = lib.OpenKpExample(
      url=url, text=text, vdom=vdom, key_phrases=key_phrases)

  config = lib.EtcFeaturizationConfig(
      long_max_length=16,
      global_max_length=4,
      url_max_code_points=url_max_code_points,
      bert_vocab_path=os.path.join(
          absltest.get_default_test_srcdir(), VOCAB_PATH),
      do_lower_case=True)
  tokenizer = tokenization.FullTokenizer(
      config.bert_vocab_path, do_lower_case=config.do_lower_case)

  etc_features = example.to_etc_features(tokenizer, config)

  # The 73 URL characters as code points, padded with -1 up to 80 entries.
  expected_url_code_points = (
      [ord(ch) for ch in url] + [-1] * (url_max_code_points - len(url)))
  expected = lib.OpenKpEtcFeatures(
      url_code_points=expected_url_code_points,
      label_start_idx=[0, 7, -1],
      label_phrase_len=[2, 2, -1],
      long_token_ids=[3, 4, 5, 6, 7, 8, 9, 10, 8, 9, 10, 11, 12, 13, 13, 0],
      long_word_idx=[0, 1, 2, 3, 4, 5, 6, 6, 7, 8, 8, 9, 10, 11, 12, 0],
      long_vdom_idx=[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 3, 0],
      # 15 tokens / 13 words are real; the remainder is padding.
      long_input_mask=[1] * 15 + [0],
      long_word_input_mask=[1] * 13 + [0] * 3,
      long_word_first_occurrence=LONG_WORD_FIRST_OCCURRENCE2,
      global_token_ids=[1, 1, 1, 1],
      global_input_mask=[1, 1, 1, 1],
      global_x_coords=[44.0, 208.0, 208.0, 208.0],
      global_y_coords=[78.0, 138.0, 138.0, 138.0],
      global_widths=[728.0, 49.0, 49.0, 49.0],
      global_heights=[45.0, 15.0, 15.0, 15.0],
      global_font_ids=[13, 5, 5, 5],
      global_block_indicator=[1, 0, 0, 0],
      global_inline_indicator=[0, 0, 0, 0],
      global_heading_indicator=[1, 0, 0, 0],
      global_leaf_indicator=[0, 0, 1, 1],
      global_bold_indicator=[0, 1, 1, 1],
      global_parent_x_coords=[44.0, 198.0, 3110.0, 3110.0],
      global_parent_y_coords=[78.0, 138.0, 123.0, 123.0],
      global_parent_widths=[728.0, 564.0, 92.0, 92.0],
      global_parent_heights=[45.0, 15.0, 75.0, 75.0],
      global_parent_font_ids=[13, 5, 6, 6],
      global_parent_heading_indicator=[1, 0, 0, 0],
      global_parent_leaf_indicator=[0, 0, 1, 1],
      global_parent_bold_indicator=[0, 1, 1, 1])

  self.assertEqual(expected, etc_features)