def _make_basic_master_spec():
  """Constructs a simple spec.

  Modified version of nlp/saft/opensource/dragnn/tools/parser_trainer.py

  Returns:
    spec_pb2.MasterSpec instance.
  """
  # Construct the "lookahead" ComponentSpec. This is a simple left-to-right
  # RNN sequence model, which encodes the context to the left of each token.
  # It has no loss except for the downstream components.
  lookahead = spec_builder.ComponentSpecBuilder('lookahead')
  lookahead.set_network_unit(name='FeedForwardNetwork',
                             hidden_layer_sizes='256')
  lookahead.set_transition_system(name='shift-only', left_to_right='true')
  lookahead.add_fixed_feature(name='words', fml='input.word', embedding_dim=64)
  lookahead.add_rnn_link(embedding_dim=-1)

  # Construct the ComponentSpec for parsing.
  parser = spec_builder.ComponentSpecBuilder('parser')
  parser.set_network_unit(name='FeedForwardNetwork', hidden_layer_sizes='256')
  parser.set_transition_system(name='arc-standard')
  parser.add_token_link(source=lookahead, fml='input.focus', embedding_dim=32)

  master_spec = spec_pb2.MasterSpec()
  master_spec.component.extend([lookahead.spec, parser.spec])
  return master_spec
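# A minimal usage sketch for the spec above; the inspection loop is
# illustrative, not part of the original module:
spec = _make_basic_master_spec()
for component in spec.component:
  print(component.name, component.transition_system.registered_name)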
def testFillsTaggerTransitions(self):
  lexicon_dir = tempfile.mkdtemp()

  def write_lines(filename, lines):
    with open(os.path.join(lexicon_dir, filename), 'w') as f:
      f.write(''.join('{}\n'.format(line) for line in lines))

  # Label map is required, even though it isn't used.
  write_lines('label-map', ['0'])
  write_lines('word-map', ['2', 'miranda 1', 'rights 1'])
  write_lines('tag-map', ['2', 'NN 1', 'NNP 1'])
  write_lines('tag-to-category', ['NN\tNOUN', 'NNP\tNOUN'])

  tagger = spec_builder.ComponentSpecBuilder('tagger')
  tagger.set_network_unit(name='FeedForwardNetwork', hidden_layer_sizes='256')
  tagger.set_transition_system(name='tagger')
  tagger.add_fixed_feature(name='words', fml='input.word', embedding_dim=64)
  tagger.add_rnn_link(embedding_dim=-1)
  tagger.fill_from_resources(lexicon_dir)

  fixed_feature, = tagger.spec.fixed_feature
  linked_feature, = tagger.spec.linked_feature
  self.assertEqual(fixed_feature.vocabulary_size, 5)
  self.assertEqual(fixed_feature.size, 1)
  self.assertEqual(linked_feature.size, 1)
  self.assertEqual(tagger.spec.num_actions, 2)
def testComponentSpecBuilderEmpty(self):
  builder = spec_builder.ComponentSpecBuilder('test')
  self.assertSpecEqual(r"""
      name: "test"
      backend { registered_name: "SyntaxNetComponent" }
      component_builder { registered_name: "DynamicComponentBuilder" }
      """, builder.spec)
def testComponentSpecBuilderLinkedFeature(self):
  builder1 = spec_builder.ComponentSpecBuilder('test1')
  builder1.set_transition_system('shift-only')
  builder1.add_fixed_feature(name='words', fml='input.word', embedding_dim=16)

  builder2 = spec_builder.ComponentSpecBuilder('test2')
  builder2.set_network_unit('IdentityNetwork')
  builder2.set_transition_system('tagger')
  builder2.add_token_link(
      source=builder1,
      source_layer='words',
      fml='input.focus',
      embedding_dim=-1)
  self.assertSpecEqual(r"""
      name: "test2"
      linked_feature {
        name: "test1"
        source_component: "test1"
        source_layer: "words"
        source_translator: "identity"
        fml: "input.focus"
        embedding_dim: -1
      }
      backend { registered_name: "SyntaxNetComponent" }
      component_builder { registered_name: "DynamicComponentBuilder" }
      network_unit { registered_name: "IdentityNetwork" }
      transition_system { registered_name: "tagger" }
      """, builder2.spec)
def testComponentSpecBuilderFixedFeature(self):
  builder = spec_builder.ComponentSpecBuilder('test')
  builder.set_network_unit('FeedForwardNetwork', hidden_layer_sizes='64,64')
  builder.set_transition_system('shift-only')
  builder.add_fixed_feature(name='words', fml='input.word', embedding_dim=16)
  self.assertSpecEqual(r"""
      name: "test"
      fixed_feature {
        name: "words"
        fml: "input.word"
        embedding_dim: 16
      }
      backend { registered_name: "SyntaxNetComponent" }
      component_builder { registered_name: "DynamicComponentBuilder" }
      network_unit {
        registered_name: "FeedForwardNetwork"
        parameters {
          key: "hidden_layer_sizes"
          value: "64,64"
        }
      }
      transition_system { registered_name: "shift-only" }
      """, builder.spec)
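# The tests above rely on an assertSpecEqual helper that is not shown in this
# excerpt. A minimal sketch of such a helper, assuming spec_pb2 and
# text_format are imported as elsewhere in the file (protobuf messages
# support equality comparison):
def assertSpecEqual(self, expected_spec_text, spec):
  expected_spec = spec_pb2.ComponentSpec()
  text_format.Parse(expected_spec_text, expected_spec)
  self.assertEqual(expected_spec, spec)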
def main(argv):
  del argv  # unused

  # Constructs lexical resources for SyntaxNet in the given resource path,
  # from the training data.
  lexicon.build_lexicon(lexicon_dir, training_sentence,
                        training_corpus_format='sentence-prototext')

  # Construct the ComponentSpec for tagging. This is a simple left-to-right
  # RNN sequence tagger.
  tagger = spec_builder.ComponentSpecBuilder('tagger')
  tagger.set_network_unit(name='FeedForwardNetwork', hidden_layer_sizes='256')
  tagger.set_transition_system(name='tagger')
  tagger.add_fixed_feature(name='words', fml='input.word', embedding_dim=64)
  tagger.add_rnn_link(embedding_dim=-1)
  tagger.fill_from_resources(lexicon_dir)

  master_spec = spec_pb2.MasterSpec()
  master_spec.component.extend([tagger.spec])
  hyperparam_config = spec_pb2.GridPoint()

  # Build the TensorFlow graph.
  graph = tf.Graph()
  with graph.as_default():
    builder = graph_builder.MasterBuilder(master_spec, hyperparam_config)
    target = spec_pb2.TrainTarget()
    target.name = 'all'
    target.unroll_using_oracle.extend([True])
    dry_run = builder.add_training_from_config(target, trace_only=True)

  # Read in serialized protos from training data.
  sentence = sentence_pb2.Sentence()
  text_format.Merge(open(training_sentence).read(), sentence)
  training_set = [sentence.SerializeToString()]

  with tf.Session(graph=graph) as sess:
    # Make sure to re-initialize all underlying state.
    sess.run(tf.global_variables_initializer())
    traces = sess.run(dry_run['traces'],
                      feed_dict={dry_run['input_batch']: training_set})

  with open('dragnn_tutorial_1.html', 'w') as f:
    f.write(
        visualization.trace_html(traces[0], height='300px').encode('utf-8'))
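# For reference, a minimal sketch of what the file at training_sentence might
# hold in 'sentence-prototext' format; the words, offsets, and annotations
# below are hypothetical, not the original tutorial data:
#
#   text: "saw her"
#   token { word: "saw" start: 0 end: 2 tag: "VBD" category: "VERB"
#           label: "ROOT" }
#   token { word: "her" start: 4 end: 6 tag: "PRP" category: "PRON"
#           label: "dobj" head: 0 }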
def main(unused_argv):
  logging.set_verbosity(logging.INFO)

  if not gfile.IsDirectory(FLAGS.resource_path):
    gfile.MakeDirs(FLAGS.resource_path)

  # Constructs lexical resources for SyntaxNet in the given resource path,
  # from the training data.
  if FLAGS.compute_lexicon:
    logging.info('Computing lexicon...')
    lexicon.build_lexicon(FLAGS.resource_path, FLAGS.training_corpus_path)

  # Construct the "lookahead" ComponentSpec. This is a simple right-to-left
  # RNN sequence model, which encodes the context to the right of each token.
  # It has no loss except for the downstream components.
  lookahead = spec_builder.ComponentSpecBuilder('lookahead')
  lookahead.set_network_unit(name='wrapped_units.LayerNormBasicLSTMNetwork',
                             hidden_layer_sizes='256')
  lookahead.set_transition_system(name='shift-only', left_to_right='false')
  lookahead.add_fixed_feature(name='char',
                              fml='input(-1).char input.char input(1).char',
                              embedding_dim=32)
  lookahead.add_fixed_feature(name='char-bigram',
                              fml='input.char-bigram',
                              embedding_dim=32)
  lookahead.fill_from_resources(FLAGS.resource_path, FLAGS.tf_master)

  # Construct the ComponentSpec for segmentation.
  segmenter = spec_builder.ComponentSpecBuilder('segmenter')
  segmenter.set_network_unit(name='wrapped_units.LayerNormBasicLSTMNetwork',
                             hidden_layer_sizes='128')
  segmenter.set_transition_system(name='binary-segment-transitions')
  segmenter.add_token_link(source=lookahead,
                           fml='input.focus stack.focus',
                           embedding_dim=64)
  segmenter.fill_from_resources(FLAGS.resource_path, FLAGS.tf_master)

  # Build and write master_spec.
  master_spec = spec_pb2.MasterSpec()
  master_spec.component.extend([lookahead.spec, segmenter.spec])
  logging.info('Constructed master spec: %s', str(master_spec))
  with gfile.GFile(FLAGS.resource_path + '/master_spec', 'w') as f:
    f.write(str(master_spec).encode('utf-8'))

  hyperparam_config = spec_pb2.GridPoint()
  try:
    text_format.Parse(FLAGS.hyperparams, hyperparam_config)
  except text_format.ParseError:
    text_format.Parse(base64.b64decode(FLAGS.hyperparams), hyperparam_config)

  # Build the TensorFlow graph.
  graph = tf.Graph()
  with graph.as_default():
    builder = graph_builder.MasterBuilder(master_spec, hyperparam_config)
    component_targets = spec_builder.default_targets_from_spec(master_spec)
    trainers = [
        builder.add_training_from_config(target)
        for target in component_targets
    ]
    assert len(trainers) == 1
    annotator = builder.add_annotation()
    builder.add_saver()

  # Read in serialized protos from training data.
  training_set = ConllSentenceReader(
      FLAGS.training_corpus_path, projectivize=False).corpus()
  dev_set = ConllSentenceReader(FLAGS.dev_corpus_path,
                                projectivize=False).corpus()

  # Convert word-based docs to char-based documents for segmentation training
  # and evaluation.
  with tf.Session(graph=tf.Graph()) as tmp_session:
    char_training_set_op = gen_parser_ops.segmenter_training_data_constructor(
        training_set)
    char_dev_set_op = gen_parser_ops.char_token_generator(dev_set)
    char_training_set = tmp_session.run(char_training_set_op)
    char_dev_set = tmp_session.run(char_dev_set_op)

  # Ready to train!
  logging.info('Training on %d sentences.', len(training_set))
  logging.info('Tuning on %d sentences.', len(dev_set))

  pretrain_steps = [0]
  train_steps = [FLAGS.num_epochs * len(training_set)]

  tf.logging.info('Creating TensorFlow checkpoint dir...')
  gfile.MakeDirs(os.path.dirname(FLAGS.checkpoint_filename))
  summary_writer = trainer_lib.get_summary_writer(FLAGS.tensorboard_dir)

  with tf.Session(FLAGS.tf_master, graph=graph) as sess:
    # Make sure to re-initialize all underlying state.
    sess.run(tf.global_variables_initializer())
    trainer_lib.run_training(sess, trainers, annotator,
                             evaluation.segmentation_summaries, pretrain_steps,
                             train_steps, char_training_set, char_dev_set,
                             dev_set, FLAGS.batch_size, summary_writer,
                             FLAGS.report_every, builder.saver,
                             FLAGS.checkpoint_filename)
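# Once training has written a checkpoint, the annotation graph built above can
# be reused for inference. A minimal sketch (inside main, after run_training),
# assuming the dict returned by builder.add_annotation() exposes 'input_batch'
# and 'annotations' keys:
#
#   with tf.Session(FLAGS.tf_master, graph=graph) as sess:
#     builder.saver.restore(sess, FLAGS.checkpoint_filename)
#     annotated = sess.run(annotator['annotations'],
#                          feed_dict={annotator['input_batch']: char_dev_set})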
def build_master_spec():
  '''
  # Left-to-right, character-based LSTM.
  char2word = spec_builder.ComponentSpecBuilder('char_lstm')
  char2word.set_network_unit(
      name='wrapped_units.LayerNormBasicLSTMNetwork', hidden_layer_sizes='256')
  char2word.set_transition_system(name='char-shift-only', left_to_right='true')
  char2word.add_fixed_feature(name='chars', fml='char-input.text-char',
                              embedding_dim=16)

  # Lookahead LSTM reads right-to-left to represent the rightmost context of
  # the words. It gets word embeddings from the char model.
  lookahead = spec_builder.ComponentSpecBuilder('lookahead')
  lookahead.set_network_unit(
      name='wrapped_units.LayerNormBasicLSTMNetwork', hidden_layer_sizes='256')
  lookahead.set_transition_system(name='shift-only', left_to_right='false')
  lookahead.add_link(source=char2word, fml='input.last-char-focus',
                     embedding_dim=64)
  '''
  # Construct the 'lookahead' ComponentSpec. This is a simple left-to-right
  # RNN sequence model, which encodes the context to the left of each token.
  # It has no loss except for the downstream components.
  lookahead = spec_builder.ComponentSpecBuilder('lookahead')
  lookahead.set_network_unit(name='FeedForwardNetwork',
                             hidden_layer_sizes='256')
  lookahead.set_transition_system(name='shift-only', left_to_right='true')
  lookahead.add_fixed_feature(name='words', fml='input.word', embedding_dim=64)
  lookahead.add_rnn_link(embedding_dim=-1)

  # Construct the tagger. This is a simple left-to-right LSTM sequence tagger.
  tagger = spec_builder.ComponentSpecBuilder('tagger')
  tagger.set_network_unit(name='wrapped_units.LayerNormBasicLSTMNetwork',
                          hidden_layer_sizes='256')
  tagger.set_transition_system(name='tagger')
  tagger.add_token_link(source=lookahead, fml='input.focus', embedding_dim=64)

  # Construct the parser.
  parser = spec_builder.ComponentSpecBuilder('parser')
  parser.set_network_unit(name='FeedForwardNetwork',
                          hidden_layer_sizes='256',
                          layer_norm_hidden='true')
  parser.set_transition_system(name='arc-standard')
  parser.add_token_link(source=lookahead, fml='input.focus', embedding_dim=64)
  parser.add_token_link(source=tagger,
                        fml='input.focus stack.focus stack(1).focus',
                        embedding_dim=64)

  # Add discrete features of the predicted parse tree so far, like in Parsey
  # McParseface.
  parser.add_fixed_feature(
      name='labels',
      embedding_dim=16,
      fml=' '.join([
          'stack.child(1).label',
          'stack.child(1).sibling(-1).label',
          'stack.child(-1).label',
          'stack.child(-1).sibling(1).label',
          'stack(1).child(1).label',
          'stack(1).child(1).sibling(-1).label',
          'stack(1).child(-1).label',
          'stack(1).child(-1).sibling(1).label',
          'stack.child(2).label',
          'stack.child(-2).label',
          'stack(1).child(2).label',
          'stack(1).child(-2).label',
      ]))

  # Recurrent connection for the arc-standard parser. For both tokens on the
  # stack, we connect to the last time step to either SHIFT or REDUCE that
  # token. This allows the parser to build up compositional representations
  # of phrases.
  parser.add_link(
      source=parser,  # recurrent connection
      name='rnn-stack',  # unique identifier
      fml='stack.focus stack(1).focus',  # look for both stack tokens
      source_translator='shift-reduce-step',  # maps token indices -> step
      embedding_dim=64)  # project down to 64 dims

  master_spec = spec_pb2.MasterSpec()
  '''
  master_spec.component.extend(
      [char2word.spec, lookahead.spec, tagger.spec, parser.spec])
  '''
  master_spec.component.extend([lookahead.spec, tagger.spec, parser.spec])
  return master_spec
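# A minimal usage sketch for build_master_spec; the output path below is
# hypothetical, not from the original script:
master_spec = build_master_spec()
with open('/tmp/master_spec.textproto', 'w') as f:
  f.write(text_format.MessageToString(master_spec))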
def main(unused_argv):
  logging.set_verbosity(logging.INFO)

  if not gfile.IsDirectory(FLAGS.resource_path):
    gfile.MakeDirs(FLAGS.resource_path)

  # Constructs lexical resources for SyntaxNet in the given resource path,
  # from the training data.
  if FLAGS.compute_lexicon:
    logging.info('Computing lexicon...')
    lexicon.build_lexicon(FLAGS.resource_path, FLAGS.training_corpus_path)

  # Construct the left-to-right, character-based LSTM, which builds up word
  # representations from characters.
  char2word = spec_builder.ComponentSpecBuilder('char_lstm')
  char2word.set_network_unit(name='wrapped_units.LayerNormBasicLSTMNetwork',
                             hidden_layer_sizes='256')
  char2word.set_transition_system(name='char-shift-only', left_to_right='true')
  char2word.add_fixed_feature(name='chars', fml='char-input.text-char',
                              embedding_dim=16)
  char2word.fill_from_resources(FLAGS.resource_path, FLAGS.tf_master)

  # Construct the "lookahead" ComponentSpec. This is a simple right-to-left
  # RNN sequence model, which encodes the context to the right of each token.
  # It has no loss except for the downstream components.
  lookahead = spec_builder.ComponentSpecBuilder('lookahead')
  lookahead.set_network_unit(name='wrapped_units.LayerNormBasicLSTMNetwork',
                             hidden_layer_sizes='256')
  lookahead.set_transition_system(name='shift-only', left_to_right='false')
  lookahead.add_link(source=char2word, fml='input.last-char-focus',
                     embedding_dim=32)
  lookahead.fill_from_resources(FLAGS.resource_path, FLAGS.tf_master)

  # Construct the ComponentSpec for tagging. This is a simple left-to-right
  # RNN sequence tagger.
  tagger = spec_builder.ComponentSpecBuilder('tagger')
  tagger.set_network_unit(name='wrapped_units.LayerNormBasicLSTMNetwork',
                          hidden_layer_sizes='256')
  tagger.set_transition_system(name='tagger')
  tagger.add_token_link(source=lookahead, fml='input.focus', embedding_dim=32)
  tagger.fill_from_resources(FLAGS.resource_path, FLAGS.tf_master)

  # Construct the ComponentSpec for parsing.
  parser = spec_builder.ComponentSpecBuilder('parser')
  parser.set_network_unit(name='FeedForwardNetwork',
                          hidden_layer_sizes='256',
                          layer_norm_hidden='True')
  parser.set_transition_system(name='arc-standard')
  parser.add_token_link(source=lookahead, fml='input.focus', embedding_dim=32)
  parser.add_token_link(source=tagger,
                        fml='input.focus stack.focus stack(1).focus',
                        embedding_dim=32)

  # Recurrent connection for the arc-standard parser. For both tokens on the
  # stack, we connect to the last time step to either SHIFT or REDUCE that
  # token. This allows the parser to build up compositional representations
  # of phrases.
  parser.add_link(
      source=parser,  # recurrent connection
      name='rnn-stack',  # unique identifier
      fml='stack.focus stack(1).focus',  # look for both stack tokens
      source_translator='shift-reduce-step',  # maps token indices -> step
      embedding_dim=32)  # project down to 32 dims
  parser.fill_from_resources(FLAGS.resource_path, FLAGS.tf_master)

  master_spec = spec_pb2.MasterSpec()
  master_spec.component.extend(
      [char2word.spec, lookahead.spec, tagger.spec, parser.spec])
  logging.info('Constructed master spec: %s', str(master_spec))

  hyperparam_config = spec_pb2.GridPoint()
  hyperparam_config.decay_steps = 128000
  hyperparam_config.learning_rate = 0.001
  hyperparam_config.learning_method = 'adam'
  hyperparam_config.adam_beta1 = 0.9
  hyperparam_config.adam_beta2 = 0.9
  hyperparam_config.adam_eps = 0.0001
  hyperparam_config.gradient_clip_norm = 1
  hyperparam_config.self_norm_alpha = 1.0
  hyperparam_config.use_moving_average = True
  hyperparam_config.dropout_rate = 0.7
  hyperparam_config.seed = 1

  # Build the TensorFlow graph.
  graph = tf.Graph()
  with graph.as_default():
    builder = graph_builder.MasterBuilder(master_spec, hyperparam_config)
    component_targets = spec_builder.default_targets_from_spec(master_spec)
    trainers = [
        builder.add_training_from_config(target)
        for target in component_targets
    ]
    assert len(trainers) == 2
    annotator = builder.add_annotation()
    builder.add_saver()

  # Read in serialized protos from training data.
  training_set = sentence_io.ConllSentenceReader(
      FLAGS.training_corpus_path,
      projectivize=FLAGS.projectivize_training_set).corpus()
  dev_set = sentence_io.ConllSentenceReader(FLAGS.dev_corpus_path,
                                            projectivize=False).corpus()

  # Ready to train!
  logging.info('Training on %d sentences.', len(training_set))
  logging.info('Tuning on %d sentences.', len(dev_set))

  pretrain_steps = [100, 0]
  tagger_steps = 1000
  train_steps = [tagger_steps, 8 * tagger_steps]

  tf.logging.info('Creating TensorFlow checkpoint dir...')
  gfile.MakeDirs(os.path.dirname(FLAGS.checkpoint_filename))
  summary_writer = trainer_lib.get_summary_writer(FLAGS.tensorboard_dir)

  with tf.Session(FLAGS.tf_master, graph=graph) as sess:
    # Make sure to re-initialize all underlying state.
    sess.run(tf.global_variables_initializer())
    trainer_lib.run_training(sess, trainers, annotator,
                             evaluation.parser_summaries, pretrain_steps,
                             train_steps, training_set, dev_set, dev_set,
                             FLAGS.batch_size, summary_writer,
                             FLAGS.report_every, builder.saver,
                             FLAGS.checkpoint_filename)
def main(argv):
  del argv  # unused

  # Constructs lexical resources for SyntaxNet in the given resource path,
  # from the training data.
  lexicon.build_lexicon(
      lexicon_dir, training_sentence,
      training_corpus_format='sentence-prototext')

  # Construct the ComponentSpec for tagging. This is a simple left-to-right
  # RNN sequence tagger.
  tagger = spec_builder.ComponentSpecBuilder('tagger')
  tagger.set_network_unit(name='FeedForwardNetwork', hidden_layer_sizes='256')
  tagger.set_transition_system(name='tagger')
  tagger.add_fixed_feature(name='words', fml='input.word', embedding_dim=64)
  tagger.add_rnn_link(embedding_dim=-1)
  tagger.fill_from_resources(lexicon_dir)

  # Construct the ComponentSpec for parsing.
  parser = spec_builder.ComponentSpecBuilder('parser')
  parser.set_network_unit(
      name='FeedForwardNetwork',
      hidden_layer_sizes='256',
      layer_norm_hidden='True')
  parser.set_transition_system(name='arc-standard')
  parser.add_token_link(
      source=tagger,
      fml='input.focus stack.focus stack(1).focus',
      embedding_dim=32,
      source_layer='logits')

  # Recurrent connection for the arc-standard parser. For both tokens on the
  # stack, we connect to the last time step to either SHIFT or REDUCE that
  # token. This allows the parser to build up compositional representations
  # of phrases.
  parser.add_link(
      source=parser,  # recurrent connection
      name='rnn-stack',  # unique identifier
      fml='stack.focus stack(1).focus',  # look for both stack tokens
      source_translator='shift-reduce-step',  # maps token indices -> step
      embedding_dim=32)  # project down to 32 dims
  parser.fill_from_resources(lexicon_dir)

  master_spec = spec_pb2.MasterSpec()
  master_spec.component.extend([tagger.spec, parser.spec])
  hyperparam_config = spec_pb2.GridPoint()

  # Build the TensorFlow graph.
  graph = tf.Graph()
  with graph.as_default():
    builder = graph_builder.MasterBuilder(master_spec, hyperparam_config)
    target = spec_pb2.TrainTarget()
    target.name = 'all'
    target.unroll_using_oracle.extend([True, True])
    dry_run = builder.add_training_from_config(target, trace_only=True)

  # Read in serialized protos from training data.
  sentence = sentence_pb2.Sentence()
  text_format.Merge(open(training_sentence).read(), sentence)
  training_set = [sentence.SerializeToString()]

  with tf.Session(graph=graph) as sess:
    # Make sure to re-initialize all underlying state.
    sess.run(tf.global_variables_initializer())
    traces = sess.run(
        dry_run['traces'], feed_dict={dry_run['input_batch']: training_set})

  with open('dragnn_tutorial_2.html', 'w') as f:
    f.write(
        visualization.trace_html(
            traces[0], height='400px',
            master_spec=master_spec).encode('utf-8'))
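# The written trace can be opened directly in a browser, or rendered inline
# in a notebook. A minimal sketch, assuming an IPython/Jupyter environment:
from IPython.display import HTML
HTML(open('dragnn_tutorial_2.html').read())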