def pipeline(root):
    """Builds the example-generation Beam graph on `root`.

    Every comma-separated entry of `FLAGS.input_patterns` is read as text,
    reshuffled and parsed with `beam_utils.ParseExampleFn`. For each pattern
    the main output is written as TFRecords, while the `text_examples` and
    `parse_failures` outputs are written as unsharded text files; all files
    land in `FLAGS.output_dir` and are named after the pattern's base name
    with its last extension removed. Success and failure counts of all
    patterns are finally combined and written as JSON to
    `example_gen_stats.txt`.

    Args:
      root: The object the read transforms are applied to.
    """
    input_patterns = FLAGS.input_patterns.split(',')
    config = lib.EtcFeaturizationConfig(
        long_max_length=FLAGS.long_max_length,
        global_max_length=FLAGS.global_max_length,
        url_max_code_points=FLAGS.url_max_code_points,
        bert_vocab_path=FLAGS.bert_vocab_path,
        spm_model_path=FLAGS.spm_model_path,
        do_lower_case=FLAGS.do_lower_case,
        fixed_block_len=FLAGS.fixed_block_len)

    def in_output_dir(filename):
        """Returns `filename` placed inside the configured output dir."""
        return os.path.join(FLAGS.output_dir, filename)

    stats_by_pattern = collections.OrderedDict()
    for index, pattern in enumerate(input_patterns):
        # Every transform label is prefixed so it stays unique per pattern.
        label = f'Pattern{index}'

        lines = root | f'{label}Read' >> beam.io.textio.ReadFromText(pattern)
        shuffled = lines | f'{label}Reshuffle' >> beam.transforms.Reshuffle()
        parse_fn = beam_utils.ParseExampleFn(config)
        parsed = (shuffled
                  | f'{label}Parse' >> beam.ParDo(parse_fn).with_outputs())

        # Strip only the last extension from the pattern's base name.
        base_name = os.path.basename(pattern)
        stem, dot, _unused_ext = base_name.rpartition('.')
        if dot:
            base_name = stem

        # TF Examples come from the undeclared main output.
        _ = parsed[None] | f'{label}WriteTfExamples' >> beam.io.WriteToTFRecord(
            in_output_dir(f'{base_name}.tfrecord'),
            coder=beam.coders.ProtoCoder(tf.train.Example),
            num_shards=FLAGS.output_num_shards)

        # An empty shard name template forces a single unsharded file.
        _ = parsed.text_examples | f'{label}WriteTextExamples' >> (
            beam.io.WriteToText(
                in_output_dir(f'{base_name}_text_examples.jsonl'),
                shard_name_template=''))

        _ = parsed.parse_failures | f'{label}WriteFailures' >> (
            beam.io.WriteToText(
                in_output_dir(f'{base_name}_parse_failures.jsonl'),
                shard_name_template=''))

        # Per-pattern statistics: element counts of successes and failures.
        success_count = (
            parsed[None]
            | f'{label}SuccessCount' >> beam.combiners.Count.Globally())
        failure_count = (
            parsed.parse_failures
            | f'{label}FailureCount' >> beam.combiners.Count.Globally())
        stats_by_pattern[pattern] = beam_utils.singletons_to_dict(
            beam_label=f'{label}Stats',
            parse_success_count=success_count,
            parse_fail_count=failure_count)

    all_stats = beam_utils.singletons_to_dict(**stats_by_pattern)
    stats_json = all_stats | 'StatsToJson' >> beam.Map(
        lambda stats_dict: json.dumps(stats_dict, indent=2))
    _ = stats_json | 'WriteStats' >> beam.io.WriteToText(
        in_output_dir('example_gen_stats.txt'),
        shard_name_template='')  # To force unsharded output.
Example #2
0
    def test_etc_features_with_long_overflow(self):
        """Checks ETC features when the second VDOM element is very long.

        The second element holds 99 words while `long_max_length` is 16. The
        expected features keep only the five tokens of the first element and
        a single unmasked global token; everything else is padding.
        """
        # Every `VdomFeatures` in this test uses these values; a separate
        # instance is still constructed for each use.
        heading_block = dict(x_coord=44.0,
                             width=728.0,
                             y_coord=78.0,
                             height=45.0,
                             is_block=True,
                             is_inline=False,
                             is_heading=True,
                             is_leaf=False,
                             font_size=20,
                             is_bold=False)

        def make_element(text, start_idx, end_idx):
            return lib.VdomElement(
                id=0,
                text=text,
                features=lib.VdomFeatures(**heading_block),
                parent_features=lib.VdomFeatures(**heading_block),
                start_idx=start_idx,
                end_idx=end_idx)

        title = 'Star Wars and not Trek'
        example = lib.OpenKpExample(
            url=
            'http://0123putlocker.com/watch/qd7kBodK-star-trek-discovery-season-1.html',
            text=title + ' ' + ' '.join(['star'] * 12),
            vdom=[
                make_element(title, 0, 5),
                make_element(' '.join(['star'] * 99), 5, 17),
            ],
            key_phrases=[
                lib.KeyPhrase(['Star', 'Wars']),
                lib.KeyPhrase(['Trek']),
            ])

        vocab_path = os.path.join(absltest.get_default_test_srcdir(),
                                  VOCAB_PATH)
        config = lib.EtcFeaturizationConfig(long_max_length=16,
                                            global_max_length=4,
                                            url_max_code_points=80,
                                            bert_vocab_path=vocab_path,
                                            do_lower_case=True)
        tokenizer = tokenization.FullTokenizer(
            config.bert_vocab_path, do_lower_case=config.do_lower_case)
        actual = example.to_etc_features(tokenizer, config)

        # Code points of the URL, padded with -1 up to `url_max_code_points`.
        padded_url_code_points = [
            104, 116, 116, 112, 58, 47, 47, 48, 49, 50, 51, 112, 117, 116,
            108, 111, 99, 107, 101, 114, 46, 99, 111, 109, 47, 119, 97,
            116, 99, 104, 47, 113, 100, 55, 107, 66, 111, 100, 75, 45, 115,
            116, 97, 114, 45, 116, 114, 101, 107, 45, 100, 105, 115,
            99, 111, 118, 101, 114, 121, 45, 115, 101, 97, 115, 111, 110,
            45, 49, 46, 104, 116, 109, 108, -1, -1, -1, -1, -1, -1, -1
        ]
        expected = lib.OpenKpEtcFeatures(
            url_code_points=padded_url_code_points,
            label_start_idx=[0, 4, -1],
            label_phrase_len=[2, 1, -1],
            long_token_ids=[3, 14, 11, 15, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            long_word_idx=[0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            long_vdom_idx=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            long_input_mask=[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            long_word_input_mask=[
                1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
            ],
            long_word_first_occurrence=LONG_WORD_FIRST_OCCURRENCE3,
            global_token_ids=[1, 1, 1, 1],
            global_input_mask=[1, 0, 0, 0],
            global_x_coords=[44.0, 0, 0, 0],
            global_y_coords=[78.0, 0, 0, 0],
            global_widths=[728.0, 0, 0, 0],
            global_heights=[45.0, 0, 0, 0],
            global_font_ids=[13, 0, 0, 0],
            global_block_indicator=[1, 0, 0, 0],
            global_inline_indicator=[0, 0, 0, 0],
            global_heading_indicator=[1, 0, 0, 0],
            global_leaf_indicator=[0, 0, 0, 0],
            global_bold_indicator=[0, 0, 0, 0],
            global_parent_x_coords=[44.0, 0, 0, 0],
            global_parent_y_coords=[78.0, 0, 0, 0],
            global_parent_widths=[728.0, 0, 0, 0],
            global_parent_heights=[45.0, 0, 0, 0],
            global_parent_font_ids=[13, 0, 0, 0],
            global_parent_heading_indicator=[1, 0, 0, 0],
            global_parent_leaf_indicator=[0, 0, 0, 0],
            global_parent_bold_indicator=[0, 0, 0, 0])

        self.assertEqual(expected, actual)
Example #3
0
    def test_etc_features_fixed_global_blocks(self):
        """Checks ETC features when `fixed_block_len` is set to 4.

        The expected `long_vdom_idx` maps long tokens to global tokens in
        consecutive runs of four, all four global tokens are unmasked, and
        every global VDOM feature list is expected to be empty.
        """
        heading_block = dict(x_coord=44.0,
                             width=728.0,
                             y_coord=78.0,
                             height=45.0,
                             is_block=True,
                             is_inline=False,
                             is_heading=True,
                             is_leaf=False,
                             font_size=20,
                             is_bold=False)
        bold_span = dict(x_coord=208.0,
                         width=49.0,
                         y_coord=138.0,
                         height=15.0,
                         is_block=False,
                         is_inline=False,
                         is_heading=False,
                         is_leaf=False,
                         font_size=12,
                         is_bold=True)
        bold_span_parent = dict(x_coord=198.0,
                                width=564.0,
                                y_coord=138.0,
                                height=15.0,
                                is_block=True,
                                is_inline=False,
                                is_heading=False,
                                is_leaf=False,
                                font_size=12,
                                is_bold=True)

        def make_element(text, features, parent_features, start_idx, end_idx):
            # A new `VdomFeatures` is built for each field on every call.
            return lib.VdomElement(
                id=0,
                text=text,
                features=lib.VdomFeatures(**features),
                parent_features=lib.VdomFeatures(**parent_features),
                start_idx=start_idx,
                end_idx=end_idx)

        example = lib.OpenKpExample(
            url=
            'http://0123putlocker.com/watch/qd7kBodK-star-trek-discovery-season-1.html',
            text=
            'Star Trek Discovery Season 1 Jason Isaacs Jason Isaacs and Doug',
            vdom=[
                make_element('Star Trek Discovery Season 1 Jason',
                             heading_block, heading_block, 0, 6),
                make_element('Isaacs Jason Isaacs and Doug', bold_span,
                             bold_span_parent, 6, 11),
            ],
            key_phrases=[
                lib.KeyPhrase(['Star', 'Trek']),
                lib.KeyPhrase(['Jason', 'Isaacs'])
            ])

        vocab_path = os.path.join(absltest.get_default_test_srcdir(),
                                  VOCAB_PATH)
        config = lib.EtcFeaturizationConfig(long_max_length=16,
                                            global_max_length=4,
                                            url_max_code_points=80,
                                            bert_vocab_path=vocab_path,
                                            do_lower_case=True,
                                            fixed_block_len=4)
        tokenizer = tokenization.FullTokenizer(
            config.bert_vocab_path, do_lower_case=config.do_lower_case)
        actual = example.to_etc_features(tokenizer, config)

        # Code points of the URL, padded with -1 up to `url_max_code_points`.
        padded_url_code_points = [
            104, 116, 116, 112, 58, 47, 47, 48, 49, 50, 51, 112, 117, 116,
            108, 111, 99, 107, 101, 114, 46, 99, 111, 109, 47, 119, 97,
            116, 99, 104, 47, 113, 100, 55, 107, 66, 111, 100, 75, 45, 115,
            116, 97, 114, 45, 116, 114, 101, 107, 45, 100, 105, 115,
            99, 111, 118, 101, 114, 121, 45, 115, 101, 97, 115, 111, 110,
            45, 49, 46, 104, 116, 109, 108, -1, -1, -1, -1, -1, -1, -1
        ]
        # Global VDOM feature fields that are expected to be empty lists.
        empty_global_fields = (
            'global_x_coords',
            'global_y_coords',
            'global_widths',
            'global_heights',
            'global_font_ids',
            'global_block_indicator',
            'global_inline_indicator',
            'global_heading_indicator',
            'global_leaf_indicator',
            'global_bold_indicator',
            'global_parent_x_coords',
            'global_parent_y_coords',
            'global_parent_widths',
            'global_parent_heights',
            'global_parent_font_ids',
            'global_parent_heading_indicator',
            'global_parent_leaf_indicator',
            'global_parent_bold_indicator',
        )
        expected = lib.OpenKpEtcFeatures(
            url_code_points=padded_url_code_points,
            label_start_idx=[5, 0, -1],
            label_phrase_len=[2, 2, -1],
            long_token_ids=[
                3, 4, 5, 6, 7, 8, 9, 10, 8, 9, 10, 11, 12, 0, 0, 0
            ],
            long_word_idx=[0, 1, 2, 3, 4, 5, 6, 6, 7, 8, 8, 9, 10, 0, 0, 0],
            long_vdom_idx=[0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 0, 0, 0],
            long_input_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
            long_word_input_mask=[
                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0
            ],
            global_token_ids=[1, 1, 1, 1],
            global_input_mask=[1, 1, 1, 1],
            # Each field gets its own fresh empty list.
            **{field: [] for field in empty_global_fields})
        self.assertEqual(expected, actual)
Example #4
0
    def test_etc_features_with_vdom_overflow(self):
        """Checks ETC features when there are more VDOM elements than fit.

        The example carries 14 VDOM elements (two text spans followed by
        twelve single-word 'foo' leaves) while `global_max_length` is 4. The
        expected global features describe only the first four elements, and
        the expected long input holds 15 tokens plus one padding slot.
        """
        heading_block = dict(x_coord=44.0,
                             width=728.0,
                             y_coord=78.0,
                             height=45.0,
                             is_block=True,
                             is_inline=False,
                             is_heading=True,
                             is_leaf=False,
                             font_size=20,
                             is_bold=False)
        bold_span = dict(x_coord=208.0,
                         width=49.0,
                         y_coord=138.0,
                         height=15.0,
                         is_block=False,
                         is_inline=False,
                         is_heading=False,
                         is_leaf=False,
                         font_size=12,
                         is_bold=True)
        bold_span_parent = dict(x_coord=198.0,
                                width=564.0,
                                y_coord=138.0,
                                height=15.0,
                                is_block=True,
                                is_inline=False,
                                is_heading=False,
                                is_leaf=False,
                                font_size=12,
                                is_bold=True)
        foo_leaf = dict(x_coord=208.0,
                        width=49.0,
                        y_coord=138.0,
                        height=15.0,
                        is_block=False,
                        is_inline=False,
                        is_heading=False,
                        is_leaf=True,
                        font_size=12,
                        is_bold=True)
        foo_leaf_parent = dict(x_coord=3110.0,
                               width=92.0,
                               y_coord=123.0,
                               height=75.0,
                               is_block=True,
                               is_inline=False,
                               is_heading=False,
                               is_leaf=True,
                               font_size=13,
                               is_bold=True)

        def make_element(text, features, parent_features, start_idx, end_idx):
            # A new `VdomFeatures` is built for each field on every call.
            return lib.VdomElement(
                id=0,
                text=text,
                features=lib.VdomFeatures(**features),
                parent_features=lib.VdomFeatures(**parent_features),
                start_idx=start_idx,
                end_idx=end_idx)

        vdom = [
            make_element('Star Trek Discovery Season 1 Jason', heading_block,
                         heading_block, 0, 5),
            make_element('Isaacs Jason Isaacs and Doug', bold_span,
                         bold_span_parent, 5, 8),
        ]
        # One single-word leaf element for each of the word slots 8..19.
        vdom += [
            make_element('foo', foo_leaf, foo_leaf_parent, word_idx,
                         word_idx + 1) for word_idx in range(8, 20)
        ]

        text = 'Star Trek Discovery Season 1 Director Jason Isaacs'
        text += ' foo' * (20 - 8)
        example = lib.OpenKpExample(
            url=
            'http://0123putlocker.com/watch/qd7kBodK-star-trek-discovery-season-1.html',
            text=text,
            vdom=vdom,
            key_phrases=[
                lib.KeyPhrase(['Star', 'Trek']),
                lib.KeyPhrase(['Jason', 'Isaacs']),
            ])

        vocab_path = os.path.join(absltest.get_default_test_srcdir(),
                                  VOCAB_PATH)
        config = lib.EtcFeaturizationConfig(long_max_length=16,
                                            global_max_length=4,
                                            url_max_code_points=80,
                                            bert_vocab_path=vocab_path,
                                            do_lower_case=True)
        tokenizer = tokenization.FullTokenizer(
            config.bert_vocab_path, do_lower_case=config.do_lower_case)
        actual = example.to_etc_features(tokenizer, config)

        # Code points of the URL, padded with -1 up to `url_max_code_points`.
        padded_url_code_points = [
            104, 116, 116, 112, 58, 47, 47, 48, 49, 50, 51, 112, 117, 116,
            108, 111, 99, 107, 101, 114, 46, 99, 111, 109, 47, 119, 97,
            116, 99, 104, 47, 113, 100, 55, 107, 66, 111, 100, 75, 45, 115,
            116, 97, 114, 45, 116, 114, 101, 107, 45, 100, 105, 115,
            99, 111, 118, 101, 114, 121, 45, 115, 101, 97, 115, 111, 110,
            45, 49, 46, 104, 116, 109, 108, -1, -1, -1, -1, -1, -1, -1
        ]
        expected = lib.OpenKpEtcFeatures(
            url_code_points=padded_url_code_points,
            label_start_idx=[0, 7, -1],
            label_phrase_len=[2, 2, -1],
            long_token_ids=[
                3, 4, 5, 6, 7, 8, 9, 10, 8, 9, 10, 11, 12, 13, 13, 0
            ],
            long_word_idx=[0, 1, 2, 3, 4, 5, 6, 6, 7, 8, 8, 9, 10, 11, 12, 0],
            long_vdom_idx=[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 3, 0],
            long_input_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
            long_word_input_mask=[
                1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
            ],
            long_word_first_occurrence=LONG_WORD_FIRST_OCCURRENCE2,
            global_token_ids=[1, 1, 1, 1],
            global_input_mask=[1, 1, 1, 1],
            global_x_coords=[44.0, 208.0, 208.0, 208.0],
            global_y_coords=[78.0, 138.0, 138.0, 138.0],
            global_widths=[728.0, 49.0, 49.0, 49.0],
            global_heights=[45.0, 15.0, 15.0, 15.0],
            global_font_ids=[13, 5, 5, 5],
            global_block_indicator=[1, 0, 0, 0],
            global_inline_indicator=[0, 0, 0, 0],
            global_heading_indicator=[1, 0, 0, 0],
            global_leaf_indicator=[0, 0, 1, 1],
            global_bold_indicator=[0, 1, 1, 1],
            global_parent_x_coords=[44.0, 198.0, 3110.0, 3110.0],
            global_parent_y_coords=[78.0, 138.0, 123.0, 123.0],
            global_parent_widths=[728.0, 564.0, 92.0, 92.0],
            global_parent_heights=[45.0, 15.0, 75.0, 75.0],
            global_parent_font_ids=[13, 5, 6, 6],
            global_parent_heading_indicator=[1, 0, 0, 0],
            global_parent_leaf_indicator=[0, 0, 1, 1],
            global_parent_bold_indicator=[0, 1, 1, 1])

        self.assertEqual(expected, actual)