def get_segmenter_corpus(input_data_path, use_text_format):
  """Reads in a character corpus for segmenting."""
  # Read in the documents.
  tf.logging.info('Reading documents...')
  if use_text_format:
    char_corpus = sentence_io.FormatSentenceReader(input_data_path,
                                                   'untokenized-text').corpus()
  else:
    input_corpus = sentence_io.ConllSentenceReader(input_data_path).corpus()
    with tf.Session(graph=tf.Graph()) as tmp_session:
      char_input = gen_parser_ops.char_token_generator(input_corpus)
      char_corpus = tmp_session.run(char_input)
    check.Eq(len(input_corpus), len(char_corpus))

  return char_corpus
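
A minimal usage sketch (the corpus path here is illustrative, not from the original example): read a CoNLL file and get back one character-based document per sentence, ready for the segmenter.

# Hypothetical call; '/path/to/dev.conll' is a placeholder path.
char_docs = get_segmenter_corpus('/path/to/dev.conll', use_text_format=False)
tf.logging.info('Read %d character documents.', len(char_docs))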
Example 2
    def annotate_text(self, text):
        sentence = sentence_pb2.Sentence(
            text=text, token=[sentence_pb2.Token(word=text, start=-1, end=-1)])

        # preprocess
        with tf.Session(graph=tf.Graph()) as tmp_session:
            char_input = gen_parser_ops.char_token_generator(
                [sentence.SerializeToString()])
            preprocessed = tmp_session.run(char_input)[0]
        segmented, _ = self.segmenter_model(preprocessed)

        annotations, traces = self.parser_model(segmented[0])
        assert len(annotations) == 1
        assert len(traces) == 1
        return sentence_pb2.Sentence.FromString(annotations[0])
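
Usage sketch, assuming an instance of the enclosing class (not shown in this example) that has already loaded its segmenter_model and parser_model; the instance name and input string are illustrative.

# Hypothetical: `annotator` is an instance of the class that defines annotate_text.
parsed = annotator.annotate_text(u'时间就是金钱。')
for token in parsed.token:
    print(token.word, token.head, token.label)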
Example 3
def syntaxnet_tokenize(text):
    sentence = sentence_pb2.Sentence(
        text=text,
        token=[sentence_pb2.Token(word=text, start=-1, end=-1)]
    )

    # preprocess
    with tf.Session(graph=tf.Graph()) as tmp_session:
        char_input = gen_parser_ops.char_token_generator([sentence.SerializeToString()])
        preprocessed = tmp_session.run(char_input)[0]
    segmented, _ = segmenter_model(preprocessed)
    tokens = []

    for t in sentence_pb2.Sentence.FromString(segmented[0]).token:
        tokens.append(t.word)

    return tokens
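
Usage sketch, assuming segmenter_model is a loaded DRAGNN segmenter as in the example above; the input string is illustrative.

# Hypothetical: segment a raw string into a list of word strings.
words = syntaxnet_tokenize(u'我爱北京')
print(words)  # e.g. ['我', '爱', '北京']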
Example 4
def annotate_text(text):
    """
    Segment and parse input text using syntaxnet models.
    """
    sentence = sentence_pb2.Sentence(
        text=text, token=[sentence_pb2.Token(word=text, start=-1, end=-1)])

    # preprocess
    with tf.Session(graph=tf.Graph()) as tmp_session:
        char_input = gen_parser_ops.char_token_generator(
            [sentence.SerializeToString()])
        preprocessed = tmp_session.run(char_input)[0]
    segmented, _ = SEGMENTER_MODEL(preprocessed)

    annotations, traces = PARSER_MODEL(segmented[0])
    assert len(annotations) == 1
    assert len(traces) == 1
    return sentence_pb2.Sentence.FromString(annotations[0]), traces[0]
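
Usage sketch, assuming SEGMENTER_MODEL and PARSER_MODEL are loaded DRAGNN annotators defined elsewhere in the module; the input string is illustrative.

from google.protobuf import text_format

# Hypothetical: annotate a string and dump the parsed Sentence proto as text.
annotated, trace = annotate_text(u'这是一个测试。')
print(text_format.MessageToString(annotated))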
Example 5
def main(unused_argv):
    logging.set_verbosity(logging.INFO)

    if not gfile.IsDirectory(FLAGS.resource_path):
        gfile.MakeDirs(FLAGS.resource_path)

    # Constructs lexical resources for SyntaxNet in the given resource path, from
    # the training data.
    if FLAGS.compute_lexicon:
        logging.info('Computing lexicon...')
        lexicon.build_lexicon(FLAGS.resource_path, FLAGS.training_corpus_path)

    # Construct the "lookahead" ComponentSpec. This is a simple right-to-left RNN
    # sequence model, which encodes the context to the right of each token. It has
    # no loss except for the downstream components.
    lookahead = spec_builder.ComponentSpecBuilder('lookahead')
    lookahead.set_network_unit(name='wrapped_units.LayerNormBasicLSTMNetwork',
                               hidden_layer_sizes='256')
    lookahead.set_transition_system(name='shift-only', left_to_right='false')
    lookahead.add_fixed_feature(name='char',
                                fml='input(-1).char input.char input(1).char',
                                embedding_dim=32)
    lookahead.add_fixed_feature(name='char-bigram',
                                fml='input.char-bigram',
                                embedding_dim=32)
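    # Note: the 'char' feature's FML looks at the previous, current, and next
    # character ('input(-1).char input.char input(1).char'), and 'char-bigram'
    # looks at the current character bigram; each maps to a 32-dim embedding.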
    lookahead.fill_from_resources(FLAGS.resource_path, FLAGS.tf_master)

    # Construct the ComponentSpec for segmentation.
    segmenter = spec_builder.ComponentSpecBuilder('segmenter')
    segmenter.set_network_unit(name='wrapped_units.LayerNormBasicLSTMNetwork',
                               hidden_layer_sizes='128')
    segmenter.set_transition_system(name='binary-segment-transitions')
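    # 'binary-segment-transitions' makes a per-character binary decision: each
    # character either starts a new word or merges into the previous one,
    # which is how the character sequence is turned back into word tokens.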
    segmenter.add_token_link(source=lookahead,
                             fml='input.focus stack.focus',
                             embedding_dim=64)
    segmenter.fill_from_resources(FLAGS.resource_path, FLAGS.tf_master)

    # Build and write master_spec.
    master_spec = spec_pb2.MasterSpec()
    master_spec.component.extend([lookahead.spec, segmenter.spec])
    logging.info('Constructed master spec: %s', str(master_spec))
    with gfile.GFile(FLAGS.resource_path + '/master_spec', 'w') as f:
        f.write(str(master_spec).encode('utf-8'))

    hyperparam_config = spec_pb2.GridPoint()
    try:
        text_format.Parse(FLAGS.hyperparams, hyperparam_config)
    except text_format.ParseError:
        text_format.Parse(base64.b64decode(FLAGS.hyperparams),
                          hyperparam_config)

    # Build the TensorFlow graph.
    graph = tf.Graph()
    with graph.as_default():
        builder = graph_builder.MasterBuilder(master_spec, hyperparam_config)
        component_targets = spec_builder.default_targets_from_spec(master_spec)
        trainers = [
            builder.add_training_from_config(target)
            for target in component_targets
        ]
        assert len(trainers) == 1
        annotator = builder.add_annotation()
        builder.add_saver()

    # Read in serialized protos from training data.
    training_set = ConllSentenceReader(FLAGS.training_corpus_path,
                                       projectivize=False).corpus()
    dev_set = ConllSentenceReader(FLAGS.dev_corpus_path,
                                  projectivize=False).corpus()

    # Convert word-based docs to char-based documents for segmentation training
    # and evaluation.
    with tf.Session(graph=tf.Graph()) as tmp_session:
        char_training_set_op = gen_parser_ops.segmenter_training_data_constructor(
            training_set)
        char_dev_set_op = gen_parser_ops.char_token_generator(dev_set)
        char_training_set = tmp_session.run(char_training_set_op)
        char_dev_set = tmp_session.run(char_dev_set_op)

    # Ready to train!
    logging.info('Training on %d sentences.', len(training_set))
    logging.info('Tuning on %d sentences.', len(dev_set))

    pretrain_steps = [0]
    train_steps = [FLAGS.num_epochs * len(training_set)]

    tf.logging.info('Creating TensorFlow checkpoint dir...')
    gfile.MakeDirs(os.path.dirname(FLAGS.checkpoint_filename))
    summary_writer = trainer_lib.get_summary_writer(FLAGS.tensorboard_dir)

    with tf.Session(FLAGS.tf_master, graph=graph) as sess:
        # Make sure to re-initialize all underlying state.
        sess.run(tf.global_variables_initializer())
        trainer_lib.run_training(
            sess, trainers, annotator, evaluation.segmentation_summaries,
            pretrain_steps, train_steps, char_training_set, char_dev_set,
            dev_set, FLAGS.batch_size, summary_writer, FLAGS.report_every,
            builder.saver, FLAGS.checkpoint_filename)
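
The FLAGS referenced above are defined elsewhere in the script. A minimal sketch of plausible definitions, assuming tf.app.flags is used; the flag names match the references in the code, but the help strings and defaults are only illustrative:

flags = tf.app.flags
flags.DEFINE_string('resource_path', '', 'Where lexicon/spec resources are written.')
flags.DEFINE_string('training_corpus_path', '', 'CoNLL training corpus.')
flags.DEFINE_string('dev_corpus_path', '', 'CoNLL tuning corpus.')
flags.DEFINE_boolean('compute_lexicon', False, 'Whether to build the lexicon.')
flags.DEFINE_string('tf_master', '', 'TensorFlow master address.')
flags.DEFINE_string('hyperparams', '', 'GridPoint proto, text format or base64.')
flags.DEFINE_string('checkpoint_filename', '', 'Where checkpoints are written.')
flags.DEFINE_string('tensorboard_dir', '', 'Where TensorBoard summaries go.')
flags.DEFINE_integer('num_epochs', 10, 'Training epochs (illustrative default).')
flags.DEFINE_integer('batch_size', 4, 'Training batch size (illustrative default).')
flags.DEFINE_integer('report_every', 200, 'Report interval (illustrative default).')
FLAGS = flags.FLAGS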
Example 6
def main(unused_argv):

  # Parse the flags containing lists, using regular expressions.
  # This matches and extracts key=value pairs.
  component_beam_sizes = re.findall(r'([^=,]+)=(\d+)',
                                    FLAGS.inference_beam_size)
  # This matches strings separated by a comma. Does not return any empty
  # strings.
  components_to_locally_normalize = re.findall(r'[^,]+',
                                               FLAGS.locally_normalize)
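  # For example, --inference_beam_size='segmenter=8,parser=16' yields
  # [('segmenter', '8'), ('parser', '16')], and --locally_normalize='segmenter'
  # yields ['segmenter'].  (The flag values here are only illustrative.)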

  # Reads master spec.
  master_spec = spec_pb2.MasterSpec()
  with gfile.FastGFile(FLAGS.master_spec) as fin:
    text_format.Parse(fin.read(), master_spec)

  # Rewrite resource locations.
  if FLAGS.resource_dir:
    for component in master_spec.component:
      for resource in component.resource:
        for part in resource.part:
          part.file_pattern = os.path.join(FLAGS.resource_dir,
                                           part.file_pattern)

  if FLAGS.complete_master_spec:
    spec_builder.complete_master_spec(master_spec, None, FLAGS.resource_dir)

  # Graph building.
  tf.logging.info('Building the graph')
  g = tf.Graph()
  with g.as_default(), tf.device('/device:CPU:0'):
    hyperparam_config = spec_pb2.GridPoint()
    hyperparam_config.use_moving_average = True
    builder = graph_builder.MasterBuilder(master_spec, hyperparam_config)
    annotator = builder.add_annotation()
    builder.add_saver()

  tf.logging.info('Reading documents...')
  input_corpus = sentence_io.ConllSentenceReader(FLAGS.input_file).corpus()
  with tf.Session(graph=tf.Graph()) as tmp_session:
    char_input = gen_parser_ops.char_token_generator(input_corpus)
    char_corpus = tmp_session.run(char_input)
  check.Eq(len(input_corpus), len(char_corpus))

  session_config = tf.ConfigProto(
      log_device_placement=False,
      intra_op_parallelism_threads=FLAGS.threads,
      inter_op_parallelism_threads=FLAGS.threads)

  with tf.Session(graph=g, config=session_config) as sess:
    tf.logging.info('Initializing variables...')
    sess.run(tf.global_variables_initializer())

    tf.logging.info('Loading from checkpoint...')
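    # 'save/restore_all' and 'save/Const:0' are the default op/tensor names
    # created by tf.train.Saver: feeding the checkpoint path into save/Const
    # and running save/restore_all restores all saved variables.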
    sess.run('save/restore_all', {'save/Const:0': FLAGS.checkpoint_file})

    tf.logging.info('Processing sentences...')

    processed = []
    start_time = time.time()
    run_metadata = tf.RunMetadata()
    for start in range(0, len(char_corpus), FLAGS.max_batch_size):
      end = min(start + FLAGS.max_batch_size, len(char_corpus))
      feed_dict = {annotator['input_batch']: char_corpus[start:end]}
      for comp, beam_size in component_beam_sizes:
        feed_dict['%s/InferenceBeamSize:0' % comp] = beam_size
      for comp in components_to_locally_normalize:
        feed_dict['%s/LocallyNormalize:0' % comp] = True
      if FLAGS.timeline_output_file and end == len(char_corpus):
        serialized_annotations = sess.run(
            annotator['annotations'], feed_dict=feed_dict,
            options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
            run_metadata=run_metadata)
        trace = timeline.Timeline(step_stats=run_metadata.step_stats)
        with open(FLAGS.timeline_output_file, 'w') as trace_file:
          trace_file.write(trace.generate_chrome_trace_format())
      else:
        serialized_annotations = sess.run(
            annotator['annotations'], feed_dict=feed_dict)
      processed.extend(serialized_annotations)

    tf.logging.info('Processed %d documents in %.2f seconds.',
                    len(char_corpus), time.time() - start_time)
    evaluation.calculate_segmentation_metrics(input_corpus, processed)

    if FLAGS.output_file:
      with gfile.GFile(FLAGS.output_file, 'w') as f:
        for serialized_sentence in processed:
          sentence = sentence_pb2.Sentence()
          sentence.ParseFromString(serialized_sentence)
          f.write(text_format.MessageToString(sentence) + '\n\n')
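
When --output_file is set, the file written above contains text-format Sentence protos separated by blank lines. A sketch of reading them back, assuming the SyntaxNet import paths used elsewhere in these examples; the file path is illustrative:

from google.protobuf import text_format
from syntaxnet import sentence_pb2

# Hypothetical: re-read the text-format sentences written by the example above.
with open('/tmp/segmenter_output.txt') as f:
  blocks = [b for b in f.read().split('\n\n') if b.strip()]
sentences = []
for block in blocks:
  sentence = sentence_pb2.Sentence()
  text_format.Parse(block, sentence)
  sentences.append(sentence)
print('Re-read %d sentences.' % len(sentences))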
Example 7
def main(unused_argv):

    # Parse the flags containing lists, using regular expressions.
    # This matches and extracts key=value pairs.
    component_beam_sizes = re.findall(r'([^=,]+)=(\d+)',
                                      FLAGS.inference_beam_size)
    # This matches strings separated by a comma. Does not return any empty
    # strings.
    components_to_locally_normalize = re.findall(r'[^,]+',
                                                 FLAGS.locally_normalize)

    # Reads master spec.
    master_spec = spec_pb2.MasterSpec()
    with gfile.FastGFile(FLAGS.master_spec) as fin:
        text_format.Parse(fin.read(), master_spec)

    # Rewrite resource locations.
    if FLAGS.resource_dir:
        for component in master_spec.component:
            for resource in component.resource:
                for part in resource.part:
                    part.file_pattern = os.path.join(FLAGS.resource_dir,
                                                     part.file_pattern)

    if FLAGS.complete_master_spec:
        spec_builder.complete_master_spec(master_spec, None,
                                          FLAGS.resource_dir)

    # Graph building.
    tf.logging.info('Building the graph')
    g = tf.Graph()
    with g.as_default(), tf.device('/device:CPU:0'):
        hyperparam_config = spec_pb2.GridPoint()
        hyperparam_config.use_moving_average = True
        builder = graph_builder.MasterBuilder(master_spec, hyperparam_config)
        annotator = builder.add_annotation()
        builder.add_saver()

    tf.logging.info('Reading documents...')
    input_corpus = sentence_io.ConllSentenceReader(FLAGS.input_file).corpus()
    with tf.Session(graph=tf.Graph()) as tmp_session:
        char_input = gen_parser_ops.char_token_generator(input_corpus)
        char_corpus = tmp_session.run(char_input)
    check.Eq(len(input_corpus), len(char_corpus))

    session_config = tf.ConfigProto(log_device_placement=False,
                                    intra_op_parallelism_threads=FLAGS.threads,
                                    inter_op_parallelism_threads=FLAGS.threads)

    with tf.Session(graph=g, config=session_config) as sess:
        tf.logging.info('Initializing variables...')
        sess.run(tf.global_variables_initializer())

        tf.logging.info('Loading from checkpoint...')
        sess.run('save/restore_all', {'save/Const:0': FLAGS.checkpoint_file})

        tf.logging.info('Processing sentences...')

        processed = []
        start_time = time.time()
        run_metadata = tf.RunMetadata()
        for start in range(0, len(char_corpus), FLAGS.max_batch_size):
            end = min(start + FLAGS.max_batch_size, len(char_corpus))
            feed_dict = {annotator['input_batch']: char_corpus[start:end]}
            for comp, beam_size in component_beam_sizes:
                feed_dict['%s/InferenceBeamSize:0' % comp] = beam_size
            for comp in components_to_locally_normalize:
                feed_dict['%s/LocallyNormalize:0' % comp] = True
            if FLAGS.timeline_output_file and end == len(char_corpus):
                serialized_annotations = sess.run(
                    annotator['annotations'],
                    feed_dict=feed_dict,
                    options=tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE),
                    run_metadata=run_metadata)
                trace = timeline.Timeline(step_stats=run_metadata.step_stats)
                with open(FLAGS.timeline_output_file, 'w') as trace_file:
                    trace_file.write(trace.generate_chrome_trace_format())
            else:
                serialized_annotations = sess.run(annotator['annotations'],
                                                  feed_dict=feed_dict)
            processed.extend(serialized_annotations)

        tf.logging.info('Processed %d documents in %.2f seconds.',
                        len(char_corpus),
                        time.time() - start_time)
        evaluation.calculate_segmentation_metrics(input_corpus, processed)

        if FLAGS.output_file:
            with gfile.GFile(FLAGS.output_file, 'w') as f:
                for serialized_sentence in processed:
                    sentence = sentence_pb2.Sentence()
                    sentence.ParseFromString(serialized_sentence)
                    f.write(text_format.MessageToString(sentence) + '\n\n')
Example 8
def main(unused_argv):
  logging.set_verbosity(logging.INFO)

  if not gfile.IsDirectory(FLAGS.resource_path):
    gfile.MakeDirs(FLAGS.resource_path)

  # Constructs lexical resources for SyntaxNet in the given resource path, from
  # the training data.
  if FLAGS.compute_lexicon:
    logging.info('Computing lexicon...')
    lexicon.build_lexicon(FLAGS.resource_path, FLAGS.training_corpus_path)

  # Construct the "lookahead" ComponentSpec. This is a simple right-to-left RNN
  # sequence model, which encodes the context to the right of each token. It has
  # no loss except for the downstream components.
  lookahead = spec_builder.ComponentSpecBuilder('lookahead')
  lookahead.set_network_unit(
      name='wrapped_units.LayerNormBasicLSTMNetwork', hidden_layer_sizes='256')
  lookahead.set_transition_system(name='shift-only', left_to_right='false')
  lookahead.add_fixed_feature(name='char',
                              fml='input(-1).char input.char input(1).char',
                              embedding_dim=32)
  lookahead.add_fixed_feature(name='char-bigram',
                              fml='input.char-bigram',
                              embedding_dim=32)
  lookahead.fill_from_resources(FLAGS.resource_path, FLAGS.tf_master)

  # Construct the ComponentSpec for segmentation.
  segmenter = spec_builder.ComponentSpecBuilder('segmenter')
  segmenter.set_network_unit(
      name='wrapped_units.LayerNormBasicLSTMNetwork', hidden_layer_sizes='128')
  segmenter.set_transition_system(name='binary-segment-transitions')
  segmenter.add_token_link(
      source=lookahead, fml='input.focus stack.focus',
      embedding_dim=64)
  segmenter.fill_from_resources(FLAGS.resource_path, FLAGS.tf_master)

  # Build and write master_spec.
  master_spec = spec_pb2.MasterSpec()
  master_spec.component.extend([lookahead.spec, segmenter.spec])
  logging.info('Constructed master spec: %s', str(master_spec))
  with gfile.GFile(FLAGS.resource_path + '/master_spec', 'w') as f:
    f.write(str(master_spec).encode('utf-8'))

  hyperparam_config = spec_pb2.GridPoint()
  try:
    text_format.Parse(FLAGS.hyperparams, hyperparam_config)
  except text_format.ParseError:
    text_format.Parse(base64.b64decode(FLAGS.hyperparams), hyperparam_config)

  # Build the TensorFlow graph.
  graph = tf.Graph()
  with graph.as_default():
    builder = graph_builder.MasterBuilder(master_spec, hyperparam_config)
    component_targets = spec_builder.default_targets_from_spec(master_spec)
    trainers = [
        builder.add_training_from_config(target) for target in component_targets
    ]
    assert len(trainers) == 1
    annotator = builder.add_annotation()
    builder.add_saver()

  # Read in serialized protos from training data.
  training_set = ConllSentenceReader(
      FLAGS.training_corpus_path, projectivize=False).corpus()
  dev_set = ConllSentenceReader(
      FLAGS.dev_corpus_path, projectivize=False).corpus()

  # Convert word-based docs to char-based documents for segmentation training
  # and evaluation.
  with tf.Session(graph=tf.Graph()) as tmp_session:
    char_training_set_op = gen_parser_ops.segmenter_training_data_constructor(
        training_set)
    char_dev_set_op = gen_parser_ops.char_token_generator(dev_set)
    char_training_set = tmp_session.run(char_training_set_op)
    char_dev_set = tmp_session.run(char_dev_set_op)

  # Ready to train!
  logging.info('Training on %d sentences.', len(training_set))
  logging.info('Tuning on %d sentences.', len(dev_set))

  pretrain_steps = [0]
  train_steps = [FLAGS.num_epochs * len(training_set)]

  tf.logging.info('Creating TensorFlow checkpoint dir...')
  gfile.MakeDirs(os.path.dirname(FLAGS.checkpoint_filename))
  summary_writer = trainer_lib.get_summary_writer(FLAGS.tensorboard_dir)

  with tf.Session(FLAGS.tf_master, graph=graph) as sess:
    # Make sure to re-initialize all underlying state.
    sess.run(tf.global_variables_initializer())
    trainer_lib.run_training(
        sess, trainers, annotator, evaluation.segmentation_summaries,
        pretrain_steps, train_steps, char_training_set, char_dev_set, dev_set,
        FLAGS.batch_size, summary_writer, FLAGS.report_every, builder.saver,
        FLAGS.checkpoint_filename)