def complete_master_spec(master_spec, lexicon_corpus, output_path,
                         tf_master=''):
  """Finishes a MasterSpec that defines the network config.

  Given a MasterSpec that defines the DRAGNN architecture, completes the spec so
  that it can be used to build a DRAGNN graph and run training/inference.

  Args:
    master_spec: MasterSpec.
    lexicon_corpus: the corpus to be used with the LexiconBuilder.
    output_path: directory to save resources to.
    tf_master: TensorFlow master executor (string, defaults to '' to use the
      local instance).

  Returns:
    None, since the spec is changed in-place.
  """
  if lexicon_corpus:
    lexicon.build_lexicon(output_path, lexicon_corpus)

  # Use Syntaxnet builder to fill out specs.
  for i, spec in enumerate(master_spec.component):
    builder = ComponentSpecBuilder(spec.name)
    builder.spec = spec
    builder.fill_from_resources(output_path, tf_master=tf_master)
    master_spec.component[i].CopyFrom(builder.spec)
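A minimal usage sketch for complete_master_spec, assuming a MasterSpec
prototext already on disk (the paths below are hypothetical, not from the
original example):

from dragnn.protos import spec_pb2
from google.protobuf import text_format

master_spec = spec_pb2.MasterSpec()
with open('/tmp/master_spec.textproto') as f:
  text_format.Parse(f.read(), master_spec)
# Builds the lexicon from the corpus and fills in every ComponentSpec in-place.
complete_master_spec(master_spec, '/tmp/train.conll', '/tmp/resources')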
Example #2
def complete_master_spec(master_spec,
                         lexicon_corpus,
                         output_path,
                         tf_master=''):
    """Finishes a MasterSpec that defines the network config.

  Given a MasterSpec that defines the DRAGNN architecture, completes the spec so
  that it can be used to build a DRAGNN graph and run training/inference.

  Args:
    master_spec: MasterSpec.
    lexicon_corpus: the corpus to be used with the LexiconBuilder.
    output_path: directory to save resources to.
    tf_master: TensorFlow master executor (string, defaults to '' to use the
      local instance).

  Returns:
    None, since the spec is changed in-place.
  """
    if lexicon_corpus:
        lexicon.build_lexicon(output_path, lexicon_corpus)

    # Use Syntaxnet builder to fill out specs.
    for i, spec in enumerate(master_spec.component):
        builder = ComponentSpecBuilder(spec.name)
        builder.spec = spec
        builder.fill_from_resources(output_path, tf_master=tf_master)
        master_spec.component[i].CopyFrom(builder.spec)
Example #3
def main(unused_argv):
    if len(sys.argv) == 1:
        flags._global_parser.print_help()
        sys.exit(0)

    logging.set_verbosity(logging.INFO)
    check.IsTrue(FLAGS.training_corpus_path)
    check.IsTrue(FLAGS.tune_corpus_path)
    check.IsTrue(FLAGS.resource_path)
    check.IsTrue(FLAGS.checkpoint_filename)

    if not gfile.IsDirectory(FLAGS.resource_path):
        gfile.MakeDirs(FLAGS.resource_path)

    training_corpus_path = gfile.Glob(FLAGS.training_corpus_path)[0]
    tune_corpus_path = gfile.Glob(FLAGS.tune_corpus_path)[0]

    # SummaryWriter for TensorBoard
    tf.logging.info('TensorBoard directory: "%s"', FLAGS.tensorboard_dir)
    tf.logging.info('Deleting prior data if it exists...')

    stats_file = '%s.stats' % FLAGS.checkpoint_filename
    try:
        stats = gfile.GFile(stats_file, 'r').readlines()[0].split(',')
        stats = [int(x) for x in stats]
    except errors.OpError:
        stats = [-1, 0, 0]

    tf.logging.info('Read ckpt stats: %s', str(stats))
    do_restore = True
    if stats[0] < FLAGS.job_id:
        do_restore = False
        tf.logging.info('Deleting last job: %d', stats[0])
        try:
            gfile.DeleteRecursively(FLAGS.tensorboard_dir)
            gfile.Remove(FLAGS.checkpoint_filename)
        except errors.OpError as err:
            tf.logging.error('Unable to delete prior files: %s', err)
        stats = [FLAGS.job_id, 0, 0]

    tf.logging.info('Creating the directory again...')
    gfile.MakeDirs(FLAGS.tensorboard_dir)
    tf.logging.info('Created! Instantiating SummaryWriter...')
    summary_writer = trainer_lib.get_summary_writer(FLAGS.tensorboard_dir)
    tf.logging.info('Creating TensorFlow checkpoint dir...')
    gfile.MakeDirs(os.path.dirname(FLAGS.checkpoint_filename))

    # Constructs lexical resources for SyntaxNet in the given resource path, from
    # the training data.
    if FLAGS.compute_lexicon:
        logging.info('Computing lexicon...')
        lexicon.build_lexicon(FLAGS.resource_path,
                              training_corpus_path,
                              morph_to_pos=True)

    # Load master spec
    master_spec = model.load_master_spec(FLAGS.dragnn_spec, FLAGS.resource_path)
    # Build graph
    graph, builder, trainers, annotator = model.build_train_graph(master_spec)
    # Train
    train(graph, builder, trainers, annotator, summary_writer, do_restore, stats)
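For reference, the '<checkpoint>.stats' file read above is a single CSV line
of three integers, which the restore logic in the later examples interprets as
[job_id, tagger_steps, parser_steps]. A hypothetical sketch of seeding it by
hand:

with gfile.GFile('/tmp/model.ckpt.stats', 'w') as f:  # hypothetical path
    f.write('3,25000,40000')  # job_id=3, 25k tagger steps, 40k parser steps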
Example #4
  def testBuildLexicon(self):
    empty_input_path = os.path.join(test_flags.temp_dir(), 'empty-input')
    lexicon_output_path = os.path.join(test_flags.temp_dir(), 'lexicon-output')

    with open(empty_input_path, 'w'):
      pass

    # The directory may already exist when running locally multiple times.
    if not os.path.exists(lexicon_output_path):
      os.mkdir(lexicon_output_path)

    # Just make sure this doesn't crash; the lexicon builder op is already
    # exercised in its own unit test.
    lexicon.build_lexicon(lexicon_output_path, empty_input_path)
Example #5
  def testBuildLexicon(self):
    empty_input_path = os.path.join(FLAGS.test_tmpdir, 'empty-input')
    lexicon_output_path = os.path.join(FLAGS.test_tmpdir, 'lexicon-output')

    with open(empty_input_path, 'w'):
      pass

    # The directory may already exist when running locally multiple times.
    if not os.path.exists(lexicon_output_path):
      os.mkdir(lexicon_output_path)

    # Just make sure this doesn't crash; the lexicon builder op is already
    # exercised in its own unit test.
    lexicon.build_lexicon(lexicon_output_path, empty_input_path)
Example #6
def main(argv):
    del argv  # unused
    # Constructs lexical resources for SyntaxNet in the given resource path, from
    # the training data.
    lexicon.build_lexicon(lexicon_dir,
                          training_sentence,
                          training_corpus_format='sentence-prototext')

    # Construct the ComponentSpec for tagging. This is a simple left-to-right RNN
    # sequence tagger.
    tagger = spec_builder.ComponentSpecBuilder('tagger')
    tagger.set_network_unit(name='FeedForwardNetwork',
                            hidden_layer_sizes='256')
    tagger.set_transition_system(name='tagger')
    tagger.add_fixed_feature(name='words', fml='input.word', embedding_dim=64)
    tagger.add_rnn_link(embedding_dim=-1)
    tagger.fill_from_resources(lexicon_dir)

    master_spec = spec_pb2.MasterSpec()
    master_spec.component.extend([tagger.spec])

    hyperparam_config = spec_pb2.GridPoint()

    # Build the TensorFlow graph.
    graph = tf.Graph()
    with graph.as_default():
        builder = graph_builder.MasterBuilder(master_spec, hyperparam_config)

        target = spec_pb2.TrainTarget()
        target.name = 'all'
        target.unroll_using_oracle.extend([True])
        dry_run = builder.add_training_from_config(target, trace_only=True)

    # Read in serialized protos from training data.
    sentence = sentence_pb2.Sentence()
    with open(training_sentence) as f:
        text_format.Merge(f.read(), sentence)
    training_set = [sentence.SerializeToString()]

    with tf.Session(graph=graph) as sess:
        # Make sure to re-initialize all underlying state.
        sess.run(tf.global_variables_initializer())
        traces = sess.run(dry_run['traces'],
                          feed_dict={dry_run['input_batch']: training_set})

    with open('dragnn_tutorial_1.html', 'w') as f:
        f.write(
            visualization.trace_html(traces[0],
                                     height='300px').encode('utf-8'))
Example #7
def main(argv):
  del argv  # unused
  # Constructs lexical resources for SyntaxNet in the given resource path, from
  # the training data.
  lexicon.build_lexicon(
      lexicon_dir,
      training_sentence,
      training_corpus_format='sentence-prototext')

  # Construct the ComponentSpec for tagging. This is a simple left-to-right RNN
  # sequence tagger.
  tagger = spec_builder.ComponentSpecBuilder('tagger')
  tagger.set_network_unit(name='FeedForwardNetwork', hidden_layer_sizes='256')
  tagger.set_transition_system(name='tagger')
  tagger.add_fixed_feature(name='words', fml='input.word', embedding_dim=64)
  tagger.add_rnn_link(embedding_dim=-1)
  tagger.fill_from_resources(lexicon_dir)

  master_spec = spec_pb2.MasterSpec()
  master_spec.component.extend([tagger.spec])

  hyperparam_config = spec_pb2.GridPoint()

  # Build the TensorFlow graph.
  graph = tf.Graph()
  with graph.as_default():
    builder = graph_builder.MasterBuilder(master_spec, hyperparam_config)

    target = spec_pb2.TrainTarget()
    target.name = 'all'
    target.unroll_using_oracle.extend([True])
    dry_run = builder.add_training_from_config(target, trace_only=True)

  # Read in serialized protos from training data.
  sentence = sentence_pb2.Sentence()
  with open(training_sentence) as f:
    text_format.Merge(f.read(), sentence)
  training_set = [sentence.SerializeToString()]

  with tf.Session(graph=graph) as sess:
    # Make sure to re-initialize all underlying state.
    sess.run(tf.global_variables_initializer())
    traces = sess.run(
        dry_run['traces'], feed_dict={dry_run['input_batch']: training_set})

  with open('dragnn_tutorial_1.html', 'w') as f:
    f.write(visualization.trace_html(traces[0], height='300px').encode('utf-8'))
Example #8
def main(unused_argv):
    logging.set_verbosity(logging.INFO)
    check.IsTrue(FLAGS.checkpoint_filename)
    check.IsTrue(FLAGS.tensorboard_dir)
    check.IsTrue(FLAGS.resource_path)

    if not gfile.IsDirectory(FLAGS.resource_path):
        gfile.MakeDirs(FLAGS.resource_path)

    training_corpus_path = gfile.Glob(FLAGS.training_corpus_path)[0]
    tune_corpus_path = gfile.Glob(FLAGS.tune_corpus_path)[0]

    # SummaryWriter for TensorBoard
    tf.logging.info('TensorBoard directory: "%s"', FLAGS.tensorboard_dir)
    tf.logging.info('Deleting prior data if it exists...')

    stats_file = '%s.stats' % FLAGS.checkpoint_filename
    try:
        stats = gfile.GFile(stats_file, 'r').readlines()[0].split(',')
        stats = [int(x) for x in stats]
    except errors.OpError:
        stats = [-1, 0, 0]

    tf.logging.info('Read ckpt stats: %s', str(stats))
    do_restore = True
    if stats[0] < FLAGS.job_id:
        do_restore = False
        tf.logging.info('Deleting last job: %d', stats[0])
        try:
            gfile.DeleteRecursively(FLAGS.tensorboard_dir)
            gfile.Remove(FLAGS.checkpoint_filename)
        except errors.OpError as err:
            tf.logging.error('Unable to delete prior files: %s', err)
        stats = [FLAGS.job_id, 0, 0]

    tf.logging.info('Creating the directory again...')
    gfile.MakeDirs(FLAGS.tensorboard_dir)
    tf.logging.info('Created! Instantiating SummaryWriter...')
    summary_writer = trainer_lib.get_summary_writer(FLAGS.tensorboard_dir)
    tf.logging.info('Creating TensorFlow checkpoint dir...')
    gfile.MakeDirs(os.path.dirname(FLAGS.checkpoint_filename))

    # Constructs lexical resources for SyntaxNet in the given resource path, from
    # the training data.
    if FLAGS.compute_lexicon:
        logging.info('Computing lexicon...')
        lexicon.build_lexicon(FLAGS.resource_path,
                              training_corpus_path,
                              morph_to_pos=True)

    tf.logging.info('Loading MasterSpec...')
    master_spec = spec_pb2.MasterSpec()
    with gfile.FastGFile(FLAGS.dragnn_spec, 'r') as fin:
        text_format.Parse(fin.read(), master_spec)
    spec_builder.complete_master_spec(master_spec, None, FLAGS.resource_path)
    logging.info('Constructed master spec: %s', str(master_spec))

    # Build the TensorFlow graph.
    tf.logging.info('Building Graph...')
    hyperparam_config = spec_pb2.GridPoint()
    try:
        text_format.Parse(FLAGS.hyperparams, hyperparam_config)
    except text_format.ParseError:
        text_format.Parse(base64.b64decode(FLAGS.hyperparams),
                          hyperparam_config)
    g = tf.Graph()
    with g.as_default():
        builder = graph_builder.MasterBuilder(master_spec, hyperparam_config)
        component_targets = [
            spec_pb2.TrainTarget(name=component.name,
                                 max_index=idx + 1,
                                 unroll_using_oracle=[False] * idx + [True])
            for idx, component in enumerate(master_spec.component)
            if 'shift-only' not in component.transition_system.registered_name
        ]
        trainers = [
            builder.add_training_from_config(target)
            for target in component_targets
        ]
        annotator = builder.add_annotation()
        builder.add_saver()

    # Read in serialized protos from training data.
    training_set = ConllSentenceReader(
        training_corpus_path,
        projectivize=FLAGS.projectivize_training_set,
        morph_to_pos=True).corpus()
    tune_set = ConllSentenceReader(tune_corpus_path,
                                   projectivize=False,
                                   morph_to_pos=True).corpus()

    # Ready to train!
    logging.info('Training on %d sentences.', len(training_set))
    logging.info('Tuning on %d sentences.', len(tune_set))

    pretrain_steps = [10000, 0]
    tagger_steps = 100000
    train_steps = [tagger_steps, 8 * tagger_steps]

    with tf.Session(FLAGS.tf_master, graph=g) as sess:
        # Make sure to re-initialize all underlying state.
        sess.run(tf.global_variables_initializer())

        if do_restore:
            tf.logging.info('Restoring from checkpoint...')
            builder.saver.restore(sess, FLAGS.checkpoint_filename)

            prev_tagger_steps = stats[1]
            prev_parser_steps = stats[2]
            tf.logging.info('adjusting schedule from steps: %d, %d',
                            prev_tagger_steps, prev_parser_steps)
            pretrain_steps[0] = max(pretrain_steps[0] - prev_tagger_steps, 0)
            tf.logging.info('new pretrain steps: %d', pretrain_steps[0])

        trainer_lib.run_training(sess, trainers, annotator,
                                 evaluation.parser_summaries, pretrain_steps,
                                 train_steps, training_set, tune_set, tune_set,
                                 FLAGS.batch_size, summary_writer,
                                 FLAGS.report_every, builder.saver,
                                 FLAGS.checkpoint_filename, stats)
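To make the TrainTarget comprehension above concrete: for a hypothetical spec
whose components are ['lookahead' (shift-only), 'tagger', 'parser'], the
shift-only component is skipped and the resulting targets are equivalent to:

expected_targets = [
    spec_pb2.TrainTarget(name='tagger', max_index=2,
                         unroll_using_oracle=[False, True]),
    spec_pb2.TrainTarget(name='parser', max_index=3,
                         unroll_using_oracle=[False, False, True]),
]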
Example #9
def main(unused_argv):
    logging.set_verbosity(logging.INFO)

    if not gfile.IsDirectory(FLAGS.resource_path):
        gfile.MakeDirs(FLAGS.resource_path)

    # Constructs lexical resources for SyntaxNet in the given resource path, from
    # the training data.
    if FLAGS.compute_lexicon:
        logging.info('Computing lexicon...')
        lexicon.build_lexicon(FLAGS.resource_path, FLAGS.training_corpus_path)

    # Construct the "lookahead" ComponentSpec. This is a simple right-to-left RNN
    # sequence model, which encodes the context to the right of each token. It has
    # no loss except for the downstream components.
    lookahead = spec_builder.ComponentSpecBuilder('lookahead')
    lookahead.set_network_unit(name='wrapped_units.LayerNormBasicLSTMNetwork',
                               hidden_layer_sizes='256')
    lookahead.set_transition_system(name='shift-only', left_to_right='false')
    lookahead.add_fixed_feature(name='char',
                                fml='input(-1).char input.char input(1).char',
                                embedding_dim=32)
    lookahead.add_fixed_feature(name='char-bigram',
                                fml='input.char-bigram',
                                embedding_dim=32)
    lookahead.fill_from_resources(FLAGS.resource_path, FLAGS.tf_master)

    # Construct the ComponentSpec for segmentation.
    segmenter = spec_builder.ComponentSpecBuilder('segmenter')
    segmenter.set_network_unit(name='wrapped_units.LayerNormBasicLSTMNetwork',
                               hidden_layer_sizes='128')
    segmenter.set_transition_system(name='binary-segment-transitions')
    segmenter.add_token_link(source=lookahead,
                             fml='input.focus stack.focus',
                             embedding_dim=64)
    segmenter.fill_from_resources(FLAGS.resource_path, FLAGS.tf_master)

    # Build and write master_spec.
    master_spec = spec_pb2.MasterSpec()
    master_spec.component.extend([lookahead.spec, segmenter.spec])
    logging.info('Constructed master spec: %s', str(master_spec))
    with gfile.GFile(FLAGS.resource_path + '/master_spec', 'w') as f:
        f.write(str(master_spec).encode('utf-8'))

    hyperparam_config = spec_pb2.GridPoint()
    try:
        text_format.Parse(FLAGS.hyperparams, hyperparam_config)
    except text_format.ParseError:
        text_format.Parse(base64.b64decode(FLAGS.hyperparams),
                          hyperparam_config)

    # Build the TensorFlow graph.
    graph = tf.Graph()
    with graph.as_default():
        builder = graph_builder.MasterBuilder(master_spec, hyperparam_config)
        component_targets = spec_builder.default_targets_from_spec(master_spec)
        trainers = [
            builder.add_training_from_config(target)
            for target in component_targets
        ]
        assert len(trainers) == 1
        annotator = builder.add_annotation()
        builder.add_saver()

    # Read in serialized protos from training data.
    training_set = ConllSentenceReader(FLAGS.training_corpus_path,
                                       projectivize=False).corpus()
    dev_set = ConllSentenceReader(FLAGS.dev_corpus_path,
                                  projectivize=False).corpus()

    # Convert word-based docs to char-based documents for segmentation training
    # and evaluation.
    with tf.Session(graph=tf.Graph()) as tmp_session:
        char_training_set_op = gen_parser_ops.segmenter_training_data_constructor(
            training_set)
        char_dev_set_op = gen_parser_ops.char_token_generator(dev_set)
        char_training_set = tmp_session.run(char_training_set_op)
        char_dev_set = tmp_session.run(char_dev_set_op)

    # Ready to train!
    logging.info('Training on %d sentences.', len(training_set))
    logging.info('Tuning on %d sentences.', len(dev_set))

    pretrain_steps = [0]
    train_steps = [FLAGS.num_epochs * len(training_set)]

    tf.logging.info('Creating TensorFlow checkpoint dir...')
    gfile.MakeDirs(os.path.dirname(FLAGS.checkpoint_filename))
    summary_writer = trainer_lib.get_summary_writer(FLAGS.tensorboard_dir)

    with tf.Session(FLAGS.tf_master, graph=graph) as sess:
        # Make sure to re-initialize all underlying state.
        sess.run(tf.global_variables_initializer())
        trainer_lib.run_training(
            sess, trainers, annotator, evaluation.segmentation_summaries,
            pretrain_steps, train_steps, char_training_set, char_dev_set,
            dev_set, FLAGS.batch_size, summary_writer, FLAGS.report_every,
            builder.saver, FLAGS.checkpoint_filename)
Example #10
def main(unused_argv):
  logging.set_verbosity(logging.INFO)

  if not gfile.IsDirectory(FLAGS.resource_path):
    gfile.MakeDirs(FLAGS.resource_path)

  # Constructs lexical resources for SyntaxNet in the given resource path, from
  # the training data.
  if FLAGS.compute_lexicon:
    logging.info('Computing lexicon...')
    lexicon.build_lexicon(FLAGS.resource_path, FLAGS.training_corpus_path)

  # Construct the "lookahead" ComponentSpec. This is a simple right-to-left RNN
  # sequence model, which encodes the context to the right of each token. It has
  # no loss except for the downstream components.

  char2word = spec_builder.ComponentSpecBuilder('char_lstm')
  char2word.set_network_unit(
      name='wrapped_units.LayerNormBasicLSTMNetwork',
      hidden_layer_sizes='256')
  char2word.set_transition_system(name='char-shift-only', left_to_right='true')
  char2word.add_fixed_feature(name='chars', fml='char-input.text-char',
                              embedding_dim=16)
  char2word.fill_from_resources(FLAGS.resource_path, FLAGS.tf_master)

  lookahead = spec_builder.ComponentSpecBuilder('lookahead')
  lookahead.set_network_unit(
      name='wrapped_units.LayerNormBasicLSTMNetwork',
      hidden_layer_sizes='256')
  lookahead.set_transition_system(name='shift-only', left_to_right='false')
  lookahead.add_link(source=char2word, fml='input.last-char-focus',
                     embedding_dim=32)
  lookahead.fill_from_resources(FLAGS.resource_path, FLAGS.tf_master)

  # Construct the ComponentSpec for tagging. This is a simple left-to-right RNN
  # sequence tagger.
  tagger = spec_builder.ComponentSpecBuilder('tagger')
  tagger.set_network_unit(
      name='wrapped_units.LayerNormBasicLSTMNetwork',
      hidden_layer_sizes='256')
  tagger.set_transition_system(name='tagger')
  tagger.add_token_link(source=lookahead, fml='input.focus', embedding_dim=32)
  tagger.fill_from_resources(FLAGS.resource_path, FLAGS.tf_master)

  # Construct the ComponentSpec for parsing.
  parser = spec_builder.ComponentSpecBuilder('parser')
  parser.set_network_unit(name='FeedForwardNetwork', hidden_layer_sizes='256',
                          layer_norm_hidden='True')
  parser.set_transition_system(name='arc-standard')
  parser.add_token_link(source=lookahead, fml='input.focus', embedding_dim=32)
  parser.add_token_link(
      source=tagger,
      fml='input.focus stack.focus stack(1).focus',
      embedding_dim=32)

  # Recurrent connection for the arc-standard parser. For both tokens on the
  # stack, we connect to the last time step to either SHIFT or REDUCE that
  # token. This allows the parser to build up compositional representations of
  # phrases.
  parser.add_link(
      source=parser,  # recurrent connection
      name='rnn-stack',  # unique identifier
      fml='stack.focus stack(1).focus',  # look for both stack tokens
      source_translator='shift-reduce-step',  # maps token indices -> step
      embedding_dim=32)  # project down to 32 dims

  parser.fill_from_resources(FLAGS.resource_path, FLAGS.tf_master)

  master_spec = spec_pb2.MasterSpec()
  master_spec.component.extend([char2word.spec, lookahead.spec,
                                tagger.spec, parser.spec])
  logging.info('Constructed master spec: %s', str(master_spec))
  hyperparam_config = spec_pb2.GridPoint()
  hyperparam_config.decay_steps = 128000
  hyperparam_config.learning_rate = 0.001
  hyperparam_config.learning_method = 'adam'
  hyperparam_config.adam_beta1 = 0.9
  hyperparam_config.adam_beta2 = 0.9
  hyperparam_config.adam_eps = 0.0001
  hyperparam_config.gradient_clip_norm = 1
  hyperparam_config.self_norm_alpha = 1.0
  hyperparam_config.use_moving_average = True
  hyperparam_config.dropout_rate = 0.7
  hyperparam_config.seed = 1

  # Build the TensorFlow graph.
  graph = tf.Graph()
  with graph.as_default():
    builder = graph_builder.MasterBuilder(master_spec, hyperparam_config)
    component_targets = spec_builder.default_targets_from_spec(master_spec)
    trainers = [
        builder.add_training_from_config(target) for target in component_targets
    ]
    assert len(trainers) == 2
    annotator = builder.add_annotation()
    builder.add_saver()

  # Read in serialized protos from training data.
  training_set = sentence_io.ConllSentenceReader(
      FLAGS.training_corpus_path,
      projectivize=FLAGS.projectivize_training_set).corpus()
  dev_set = sentence_io.ConllSentenceReader(
      FLAGS.dev_corpus_path, projectivize=False).corpus()

  # Ready to train!
  logging.info('Training on %d sentences.', len(training_set))
  logging.info('Tuning on %d sentences.', len(dev_set))

  pretrain_steps = [100, 0]
  tagger_steps = 1000
  train_steps = [tagger_steps, 8 * tagger_steps]

  tf.logging.info('Creating TensorFlow checkpoint dir...')
  gfile.MakeDirs(os.path.dirname(FLAGS.checkpoint_filename))
  summary_writer = trainer_lib.get_summary_writer(FLAGS.tensorboard_dir)

  with tf.Session(FLAGS.tf_master, graph=graph) as sess:
    # Make sure to re-initialize all underlying state.
    sess.run(tf.global_variables_initializer())
    trainer_lib.run_training(
        sess, trainers, annotator, evaluation.parser_summaries, pretrain_steps,
        train_steps, training_set, dev_set, dev_set, FLAGS.batch_size,
        summary_writer, FLAGS.report_every, builder.saver,
        FLAGS.checkpoint_filename)
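The hard-coded GridPoint above could equally be supplied as prototext, which
is what the FLAGS.hyperparams parsing in the neighboring examples consumes. A
sketch using the same field names set above:

hp_text = """
learning_method: 'adam'
learning_rate: 0.001
decay_steps: 128000
adam_beta1: 0.9
adam_beta2: 0.9
adam_eps: 0.0001
gradient_clip_norm: 1
dropout_rate: 0.7
use_moving_average: true
seed: 1
"""
hp_config = spec_pb2.GridPoint()
text_format.Parse(hp_text, hp_config)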
Example #11
def main(unused_argv):
    logging.set_verbosity(logging.INFO)

    if not gfile.IsDirectory(FLAGS.resource_path):
        gfile.MakeDirs(FLAGS.resource_path)

    # Constructs lexical resources for SyntaxNet in the given resource path, from
    # the training data.
    if FLAGS.compute_lexicon:
        logging.info('Computing lexicon...')
        lexicon.build_lexicon(FLAGS.resource_path, FLAGS.training_corpus_path)

    # Construct the "lookahead" ComponentSpec. This is a simple right-to-left RNN
    # sequence model, which encodes the context to the right of each token. It has
    # no loss except for the downstream components.

    char2word = spec_builder.ComponentSpecBuilder('char_lstm')
    char2word.set_network_unit(name='wrapped_units.LayerNormBasicLSTMNetwork',
                               hidden_layer_sizes='256')
    char2word.set_transition_system(name='char-shift-only',
                                    left_to_right='true')
    char2word.add_fixed_feature(name='chars',
                                fml='char-input.text-char',
                                embedding_dim=16)
    char2word.fill_from_resources(FLAGS.resource_path, FLAGS.tf_master)

    lookahead = spec_builder.ComponentSpecBuilder('lookahead')
    lookahead.set_network_unit(name='wrapped_units.LayerNormBasicLSTMNetwork',
                               hidden_layer_sizes='256')
    lookahead.set_transition_system(name='shift-only', left_to_right='false')
    lookahead.add_link(source=char2word,
                       fml='input.last-char-focus',
                       embedding_dim=32)
    lookahead.fill_from_resources(FLAGS.resource_path, FLAGS.tf_master)

    # Construct the ComponentSpec for tagging. This is a simple left-to-right RNN
    # sequence tagger.
    tagger = spec_builder.ComponentSpecBuilder('tagger')
    tagger.set_network_unit(name='wrapped_units.LayerNormBasicLSTMNetwork',
                            hidden_layer_sizes='256')
    tagger.set_transition_system(name='tagger')
    tagger.add_token_link(source=lookahead,
                          fml='input.focus',
                          embedding_dim=32)
    tagger.fill_from_resources(FLAGS.resource_path, FLAGS.tf_master)

    # Construct the ComponentSpec for parsing.
    parser = spec_builder.ComponentSpecBuilder('parser')
    parser.set_network_unit(name='FeedForwardNetwork',
                            hidden_layer_sizes='256',
                            layer_norm_hidden='True')
    parser.set_transition_system(name='arc-standard')
    parser.add_token_link(source=lookahead,
                          fml='input.focus',
                          embedding_dim=32)
    parser.add_token_link(source=tagger,
                          fml='input.focus stack.focus stack(1).focus',
                          embedding_dim=32)

    # Recurrent connection for the arc-standard parser. For both tokens on the
    # stack, we connect to the last time step to either SHIFT or REDUCE that
    # token. This allows the parser to build up compositional representations of
    # phrases.
    parser.add_link(
        source=parser,  # recurrent connection
        name='rnn-stack',  # unique identifier
        fml='stack.focus stack(1).focus',  # look for both stack tokens
        source_translator='shift-reduce-step',  # maps token indices -> step
        embedding_dim=32)  # project down to 32 dims

    parser.fill_from_resources(FLAGS.resource_path, FLAGS.tf_master)

    master_spec = spec_pb2.MasterSpec()
    master_spec.component.extend(
        [char2word.spec, lookahead.spec, tagger.spec, parser.spec])
    logging.info('Constructed master spec: %s', str(master_spec))
    hyperparam_config = spec_pb2.GridPoint()
    hyperparam_config.decay_steps = 128000
    hyperparam_config.learning_rate = 0.001
    hyperparam_config.learning_method = 'adam'
    hyperparam_config.adam_beta1 = 0.9
    hyperparam_config.adam_beta2 = 0.9
    hyperparam_config.adam_eps = 0.0001
    hyperparam_config.gradient_clip_norm = 1
    hyperparam_config.self_norm_alpha = 1.0
    hyperparam_config.use_moving_average = True
    hyperparam_config.dropout_rate = 0.7
    hyperparam_config.seed = 1

    # Build the TensorFlow graph.
    graph = tf.Graph()
    with graph.as_default():
        builder = graph_builder.MasterBuilder(master_spec, hyperparam_config)
        component_targets = spec_builder.default_targets_from_spec(master_spec)
        trainers = [
            builder.add_training_from_config(target)
            for target in component_targets
        ]
        assert len(trainers) == 2
        annotator = builder.add_annotation()
        builder.add_saver()

    # Read in serialized protos from training data.
    training_set = sentence_io.ConllSentenceReader(
        FLAGS.training_corpus_path,
        projectivize=FLAGS.projectivize_training_set).corpus()
    dev_set = sentence_io.ConllSentenceReader(FLAGS.dev_corpus_path,
                                              projectivize=False).corpus()

    # Ready to train!
    logging.info('Training on %d sentences.', len(training_set))
    logging.info('Tuning on %d sentences.', len(dev_set))

    pretrain_steps = [100, 0]
    tagger_steps = 1000
    train_steps = [tagger_steps, 8 * tagger_steps]

    tf.logging.info('Creating TensorFlow checkpoint dir...')
    gfile.MakeDirs(os.path.dirname(FLAGS.checkpoint_filename))
    summary_writer = trainer_lib.get_summary_writer(FLAGS.tensorboard_dir)

    with tf.Session(FLAGS.tf_master, graph=graph) as sess:
        # Make sure to re-initialize all underlying state.
        sess.run(tf.global_variables_initializer())
        trainer_lib.run_training(sess, trainers, annotator,
                                 evaluation.parser_summaries, pretrain_steps,
                                 train_steps, training_set, dev_set, dev_set,
                                 FLAGS.batch_size, summary_writer,
                                 FLAGS.report_every, builder.saver,
                                 FLAGS.checkpoint_filename)
Example #12
def main(unused_argv):
  logging.set_verbosity(logging.INFO)
  check.IsTrue(FLAGS.checkpoint_filename)
  check.IsTrue(FLAGS.tensorboard_dir)
  check.IsTrue(FLAGS.resource_path)

  if not gfile.IsDirectory(FLAGS.resource_path):
    gfile.MakeDirs(FLAGS.resource_path)

  training_corpus_path = gfile.Glob(FLAGS.training_corpus_path)[0]
  tune_corpus_path = gfile.Glob(FLAGS.tune_corpus_path)[0]

  # SummaryWriter for TensorBoard
  tf.logging.info('TensorBoard directory: "%s"', FLAGS.tensorboard_dir)
  tf.logging.info('Deleting prior data if it exists...')

  stats_file = '%s.stats' % FLAGS.checkpoint_filename
  try:
    stats = gfile.GFile(stats_file, 'r').readlines()[0].split(',')
    stats = [int(x) for x in stats]
  except errors.OpError:
    stats = [-1, 0, 0]

  tf.logging.info('Read ckpt stats: %s', str(stats))
  do_restore = True
  if stats[0] < FLAGS.job_id:
    do_restore = False
    tf.logging.info('Deleting last job: %d', stats[0])
    try:
      gfile.DeleteRecursively(FLAGS.tensorboard_dir)
      gfile.Remove(FLAGS.checkpoint_filename)
    except errors.OpError as err:
      tf.logging.error('Unable to delete prior files: %s', err)
    stats = [FLAGS.job_id, 0, 0]

  tf.logging.info('Creating the directory again...')
  gfile.MakeDirs(FLAGS.tensorboard_dir)
  tf.logging.info('Created! Instantiating SummaryWriter...')
  summary_writer = trainer_lib.get_summary_writer(FLAGS.tensorboard_dir)
  tf.logging.info('Creating TensorFlow checkpoint dir...')
  gfile.MakeDirs(os.path.dirname(FLAGS.checkpoint_filename))

  # Constructs lexical resources for SyntaxNet in the given resource path, from
  # the training data.
  if FLAGS.compute_lexicon:
    logging.info('Computing lexicon...')
    lexicon.build_lexicon(
        FLAGS.resource_path, training_corpus_path, morph_to_pos=True)

  tf.logging.info('Loading MasterSpec...')
  master_spec = spec_pb2.MasterSpec()
  with gfile.FastGFile(FLAGS.dragnn_spec, 'r') as fin:
    text_format.Parse(fin.read(), master_spec)
  spec_builder.complete_master_spec(master_spec, None, FLAGS.resource_path)
  logging.info('Constructed master spec: %s', str(master_spec))

  # Build the TensorFlow graph.
  tf.logging.info('Building Graph...')
  hyperparam_config = spec_pb2.GridPoint()
  try:
    text_format.Parse(FLAGS.hyperparams, hyperparam_config)
  except text_format.ParseError:
    text_format.Parse(base64.b64decode(FLAGS.hyperparams), hyperparam_config)
  g = tf.Graph()
  with g.as_default():
    builder = graph_builder.MasterBuilder(master_spec, hyperparam_config)
    component_targets = [
        spec_pb2.TrainTarget(
            name=component.name,
            max_index=idx + 1,
            unroll_using_oracle=[False] * idx + [True])
        for idx, component in enumerate(master_spec.component)
        if 'shift-only' not in component.transition_system.registered_name
    ]
    trainers = [
        builder.add_training_from_config(target) for target in component_targets
    ]
    annotator = builder.add_annotation()
    builder.add_saver()

  # Read in serialized protos from training data.
  training_set = ConllSentenceReader(
      training_corpus_path,
      projectivize=FLAGS.projectivize_training_set,
      morph_to_pos=True).corpus()
  tune_set = ConllSentenceReader(
      tune_corpus_path, projectivize=False, morph_to_pos=True).corpus()

  # Ready to train!
  logging.info('Training on %d sentences.', len(training_set))
  logging.info('Tuning on %d sentences.', len(tune_set))

  pretrain_steps = [10000, 0]
  tagger_steps = 100000
  train_steps = [tagger_steps, 8 * tagger_steps]

  with tf.Session(FLAGS.tf_master, graph=g) as sess:
    # Make sure to re-initialize all underlying state.
    sess.run(tf.global_variables_initializer())

    if do_restore:
      tf.logging.info('Restoring from checkpoint...')
      builder.saver.restore(sess, FLAGS.checkpoint_filename)

      prev_tagger_steps = stats[1]
      prev_parser_steps = stats[2]
      tf.logging.info('adjusting schedule from steps: %d, %d',
                      prev_tagger_steps, prev_parser_steps)
      pretrain_steps[0] = max(pretrain_steps[0] - prev_tagger_steps, 0)
      tf.logging.info('new pretrain steps: %d', pretrain_steps[0])

    trainer_lib.run_training(
        sess, trainers, annotator, evaluation.parser_summaries, pretrain_steps,
        train_steps, training_set, tune_set, tune_set, FLAGS.batch_size,
        summary_writer, FLAGS.report_every, builder.saver,
        FLAGS.checkpoint_filename, stats)
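Note that the try/except above first parses FLAGS.hyperparams as plain
prototext and, on failure, retries after base64-decoding it. A sketch of
producing the encoded form with the standard library:

import base64

hp_text = "learning_method: 'adam' learning_rate: 0.001"
hp_b64 = base64.b64encode(hp_text.encode('utf-8'))
# Pass hp_b64 as the --hyperparams flag value.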
Example #13
def main(unused_argv):
  logging.set_verbosity(logging.INFO)

  if not gfile.IsDirectory(FLAGS.resource_path):
    gfile.MakeDirs(FLAGS.resource_path)

  # Constructs lexical resources for SyntaxNet in the given resource path, from
  # the training data.
  if FLAGS.compute_lexicon:
    logging.info('Computing lexicon...')
    lexicon.build_lexicon(FLAGS.resource_path, FLAGS.training_corpus_path)

  # Construct the "lookahead" ComponentSpec. This is a simple right-to-left RNN
  # sequence model, which encodes the context to the right of each token. It has
  # no loss except for the downstream components.
  lookahead = spec_builder.ComponentSpecBuilder('lookahead')
  lookahead.set_network_unit(
      name='wrapped_units.LayerNormBasicLSTMNetwork', hidden_layer_sizes='256')
  lookahead.set_transition_system(name='shift-only', left_to_right='false')
  lookahead.add_fixed_feature(name='char',
                              fml='input(-1).char input.char input(1).char',
                              embedding_dim=32)
  lookahead.add_fixed_feature(name='char-bigram',
                              fml='input.char-bigram',
                              embedding_dim=32)
  lookahead.fill_from_resources(FLAGS.resource_path, FLAGS.tf_master)

  # Construct the ComponentSpec for segmentation.
  segmenter = spec_builder.ComponentSpecBuilder('segmenter')
  segmenter.set_network_unit(
      name='wrapped_units.LayerNormBasicLSTMNetwork', hidden_layer_sizes='128')
  segmenter.set_transition_system(name='binary-segment-transitions')
  segmenter.add_token_link(
      source=lookahead, fml='input.focus stack.focus',
      embedding_dim=64)
  segmenter.fill_from_resources(FLAGS.resource_path, FLAGS.tf_master)

  # Build and write master_spec.
  master_spec = spec_pb2.MasterSpec()
  master_spec.component.extend([lookahead.spec, segmenter.spec])
  logging.info('Constructed master spec: %s', str(master_spec))
  with gfile.GFile(FLAGS.resource_path + '/master_spec', 'w') as f:
    f.write(str(master_spec).encode('utf-8'))

  hyperparam_config = spec_pb2.GridPoint()
  try:
    text_format.Parse(FLAGS.hyperparams, hyperparam_config)
  except text_format.ParseError:
    text_format.Parse(base64.b64decode(FLAGS.hyperparams), hyperparam_config)

  # Build the TensorFlow graph.
  graph = tf.Graph()
  with graph.as_default():
    builder = graph_builder.MasterBuilder(master_spec, hyperparam_config)
    component_targets = spec_builder.default_targets_from_spec(master_spec)
    trainers = [
        builder.add_training_from_config(target) for target in component_targets
    ]
    assert len(trainers) == 1
    annotator = builder.add_annotation()
    builder.add_saver()

  # Read in serialized protos from training data.
  training_set = ConllSentenceReader(
      FLAGS.training_corpus_path, projectivize=False).corpus()
  dev_set = ConllSentenceReader(
      FLAGS.dev_corpus_path, projectivize=False).corpus()

  # Convert word-based docs to char-based documents for segmentation training
  # and evaluation.
  with tf.Session(graph=tf.Graph()) as tmp_session:
    char_training_set_op = gen_parser_ops.segmenter_training_data_constructor(
        training_set)
    char_dev_set_op = gen_parser_ops.char_token_generator(dev_set)
    char_training_set = tmp_session.run(char_training_set_op)
    char_dev_set = tmp_session.run(char_dev_set_op)

  # Ready to train!
  logging.info('Training on %d sentences.', len(training_set))
  logging.info('Tuning on %d sentences.', len(dev_set))

  pretrain_steps = [0]
  train_steps = [FLAGS.num_epochs * len(training_set)]

  tf.logging.info('Creating TensorFlow checkpoint dir...')
  gfile.MakeDirs(os.path.dirname(FLAGS.checkpoint_filename))
  summary_writer = trainer_lib.get_summary_writer(FLAGS.tensorboard_dir)

  with tf.Session(FLAGS.tf_master, graph=graph) as sess:
    # Make sure to re-initialize all underlying state.
    sess.run(tf.global_variables_initializer())
    trainer_lib.run_training(
        sess, trainers, annotator, evaluation.segmentation_summaries,
        pretrain_steps, train_steps, char_training_set, char_dev_set, dev_set,
        FLAGS.batch_size, summary_writer, FLAGS.report_every, builder.saver,
        FLAGS.checkpoint_filename)
Example #14
def main(argv):
  del argv  # unused
  # Constructs lexical resources for SyntaxNet in the given resource path, from
  # the training data.
  lexicon.build_lexicon(
      lexicon_dir,
      training_sentence,
      training_corpus_format='sentence-prototext')

  # Construct the ComponentSpec for tagging. This is a simple left-to-right RNN
  # sequence tagger.
  tagger = spec_builder.ComponentSpecBuilder('tagger')
  tagger.set_network_unit(name='FeedForwardNetwork', hidden_layer_sizes='256')
  tagger.set_transition_system(name='tagger')
  tagger.add_fixed_feature(name='words', fml='input.word', embedding_dim=64)
  tagger.add_rnn_link(embedding_dim=-1)
  tagger.fill_from_resources(lexicon_dir)

  # Construct the ComponentSpec for parsing.
  parser = spec_builder.ComponentSpecBuilder('parser')
  parser.set_network_unit(
      name='FeedForwardNetwork',
      hidden_layer_sizes='256',
      layer_norm_hidden='True')
  parser.set_transition_system(name='arc-standard')
  parser.add_token_link(
      source=tagger,
      fml='input.focus stack.focus stack(1).focus',
      embedding_dim=32,
      source_layer='logits')

  # Recurrent connection for the arc-standard parser. For both tokens on the
  # stack, we connect to the last time step to either SHIFT or REDUCE that
  # token. This allows the parser to build up compositional representations of
  # phrases.
  parser.add_link(
      source=parser,  # recurrent connection
      name='rnn-stack',  # unique identifier
      fml='stack.focus stack(1).focus',  # look for both stack tokens
      source_translator='shift-reduce-step',  # maps token indices -> step
      embedding_dim=32)  # project down to 32 dims
  parser.fill_from_resources(lexicon_dir)

  master_spec = spec_pb2.MasterSpec()
  master_spec.component.extend([tagger.spec, parser.spec])

  hyperparam_config = spec_pb2.GridPoint()

  # Build the TensorFlow graph.
  graph = tf.Graph()
  with graph.as_default():
    builder = graph_builder.MasterBuilder(master_spec, hyperparam_config)

    target = spec_pb2.TrainTarget()
    target.name = 'all'
    target.unroll_using_oracle.extend([True, True])
    dry_run = builder.add_training_from_config(target, trace_only=True)

  # Read in serialized protos from training data.
  sentence = sentence_pb2.Sentence()
  with open(training_sentence) as f:
    text_format.Merge(f.read(), sentence)
  training_set = [sentence.SerializeToString()]

  with tf.Session(graph=graph) as sess:
    # Make sure to re-initialize all underlying state.
    sess.run(tf.global_variables_initializer())
    traces = sess.run(
        dry_run['traces'], feed_dict={dry_run['input_batch']: training_set})

  with open('dragnn_tutorial_2.html', 'w') as f:
    f.write(
        visualization.trace_html(
            traces[0], height='400px', master_spec=master_spec).encode('utf-8'))
Example #15
def main(argv):
  del argv  # unused
  # Constructs lexical resources for SyntaxNet in the given resource path, from
  # the training data.
  lexicon.build_lexicon(
      lexicon_dir,
      training_sentence,
      training_corpus_format='sentence-prototext')

  # Construct the ComponentSpec for tagging. This is a simple left-to-right RNN
  # sequence tagger.
  tagger = spec_builder.ComponentSpecBuilder('tagger')
  tagger.set_network_unit(name='FeedForwardNetwork', hidden_layer_sizes='256')
  tagger.set_transition_system(name='tagger')
  tagger.add_fixed_feature(name='words', fml='input.word', embedding_dim=64)
  tagger.add_rnn_link(embedding_dim=-1)
  tagger.fill_from_resources(lexicon_dir)

  # Construct the ComponentSpec for parsing.
  parser = spec_builder.ComponentSpecBuilder('parser')
  parser.set_network_unit(
      name='FeedForwardNetwork',
      hidden_layer_sizes='256',
      layer_norm_hidden='True')
  parser.set_transition_system(name='arc-standard')
  parser.add_token_link(
      source=tagger,
      fml='input.focus stack.focus stack(1).focus',
      embedding_dim=32,
      source_layer='logits')

  # Recurrent connection for the arc-standard parser. For both tokens on the
  # stack, we connect to the last time step to either SHIFT or REDUCE that
  # token. This allows the parser to build up compositional representations of
  # phrases.
  parser.add_link(
      source=parser,  # recurrent connection
      name='rnn-stack',  # unique identifier
      fml='stack.focus stack(1).focus',  # look for both stack tokens
      source_translator='shift-reduce-step',  # maps token indices -> step
      embedding_dim=32)  # project down to 32 dims
  parser.fill_from_resources(lexicon_dir)

  master_spec = spec_pb2.MasterSpec()
  master_spec.component.extend([tagger.spec, parser.spec])

  hyperparam_config = spec_pb2.GridPoint()

  # Build the TensorFlow graph.
  graph = tf.Graph()
  with graph.as_default():
    builder = graph_builder.MasterBuilder(master_spec, hyperparam_config)

    target = spec_pb2.TrainTarget()
    target.name = 'all'
    target.unroll_using_oracle.extend([True, True])
    dry_run = builder.add_training_from_config(target, trace_only=True)

  # Read in serialized protos from training data.
  sentence = sentence_pb2.Sentence()
  with open(training_sentence) as f:
    text_format.Merge(f.read(), sentence)
  training_set = [sentence.SerializeToString()]

  with tf.Session(graph=graph) as sess:
    # Make sure to re-initialize all underlying state.
    sess.run(tf.global_variables_initializer())
    traces = sess.run(
        dry_run['traces'], feed_dict={dry_run['input_batch']: training_set})

  with open('dragnn_tutorial_2.html', 'w') as f:
    f.write(
        visualization.trace_html(
            traces[0], height='400px', master_spec=master_spec).encode('utf-8'))
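The tutorial writes its trace to a local HTML file; a quick, illustrative way
to open the result in a browser using only the standard library:

import webbrowser

webbrowser.open('dragnn_tutorial_2.html')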