def pretty_print():
  _write_input(_read_output().strip())
  logging.set_verbosity(logging.INFO)
  with tf.Session() as sess:
    src = gen_parser_ops.document_source(
        batch_size=32,
        corpus_name='input-from-file-conll',
        task_context=task_context_path)
    sentence = sentence_pb2.Sentence()
    while True:
      documents, finished = sess.run(src)
      logging.info('Read %d documents', len(documents))
      # for d in documents:
      #   sentence.ParseFromString(d)
      #   as_asciitree(sentence)
      for d in documents:
        sentence.ParseFromString(d)
        tr = asciitree.LeftAligned()
        d = to_dict(sentence)
        print('Input: %s' % sentence.text)
        print('Parse:')
        tr_str = tr(d)
        # Strip the trailing '@<token index>' markers from each tree line.
        pat = re.compile(r'\s*@\d+$')
        for tr_ln in tr_str.splitlines():
          print(pat.sub('', tr_ln))
      if finished:
        break

def testConllSentence(self):
  # This test sentence includes a multiword token and an empty node,
  # both of which are to be ignored. Note that CoNLL columns must be
  # tab-separated; the '\t' escapes below produce real tabs in the file.
  test_sentence = """
1-2\tWe've\t_
1\tWe\twe\tPRON\tPRP\tCase=Nom\t3\tnsubj\t_\tSpaceAfter=No
2\t've\thave\tAUX\tVBP\tMood=Ind\t3\taux\t_\t_
3\tmoved\tmove\tVERB\tVBN\tTense=Past\t0\troot\t_\t_
4\ton\ton\tADV\tRB\t_\t3\tadvmod\t_\tSpaceAfter=No
4.1\tignored\tignore\tVERB\tVBN\tTense=Past\t0\t_\t_\t_
5\t.\t.\tPUNCT\t.\t_\t3\tpunct\t_\t_
"""
  # Prepare test sentence.
  with open(self.corpus_file, 'w') as f:
    f.write(test_sentence)
  # Prepare context.
  self.WriteContext('conll-sentence')
  # Test converted sentence.
  sentence, _ = gen_parser_ops.document_source(
      task_context=self.context_file, batch_size=1)
  # Expected texts, words, and start/end offsets.
  expected_text = u'We\'ve moved on.'
  expected_words = [u'We', u'\'ve', u'moved', u'on', u'.']
  expected_starts = [0, 2, 6, 12, 14]
  expected_ends = [1, 4, 10, 13, 14]
  with self.test_session() as sess:
    sentence_doc = self.ReadNextDocument(sess, sentence)
    self.assertEqual(expected_text, sentence_doc.text)
    self.assertEqual(expected_words, [t.word for t in sentence_doc.token])
    self.assertEqual(expected_starts, [t.start for t in sentence_doc.token])
    self.assertEqual(expected_ends, [t.end for t in sentence_doc.token])

def __init__(self, filepath, batch_size=32,
             projectivize=False, morph_to_pos=False):
  self._graph = tf.Graph()
  self._session = tf.Session(graph=self._graph)
  task_context_str = """
    input {
      name: 'documents'
      record_format: 'conll-sentence'
      Part {
        file_pattern: '%s'
      }
    }""" % filepath
  if morph_to_pos:
    task_context_str += """
    Parameter { name: "join_category_to_pos" value: "true" }
    Parameter { name: "add_pos_as_attribute" value: "true" }
    Parameter { name: "serialize_morph_to_pos" value: "true" }
    """
  with self._graph.as_default():
    self._source, self._is_last = gen_parser_ops.document_source(
        task_context_str=task_context_str, batch_size=batch_size)
    self._source = gen_parser_ops.well_formed_filter(self._source)
    if projectivize:
      self._source = gen_parser_ops.projectivize_filter(self._source)

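# A minimal usage sketch for the reader class above. The method below is an
# assumption, not part of the original snippet: it only touches attributes
# the __init__ above actually creates (_session, _source, _is_last) and
# drains the document source batch by batch until the is_last flag fires.
def read_batches(self):
  """Yields batches of serialized Sentence protos until the corpus ends."""
  while True:
    documents, is_last = self._session.run([self._source, self._is_last])
    yield documents
    if is_last:
      break
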
def testConllSentence(self):
  # This test sentence includes a multiword token and an empty node,
  # both of which are to be ignored.
  test_sentence = """
1-2\tWe've\t_
1\tWe\twe\tPRON\tPRP\tCase=Nom\t3\tnsubj\t_\tSpaceAfter=No
2\t've\thave\tAUX\tVBP\tMood=Ind\t3\taux\t_\t_
3\tmoved\tmove\tVERB\tVBN\tTense=Past\t0\troot\t_\t_
4\ton\ton\tADV\tRB\t_\t3\tadvmod\t_\tSpaceAfter=No|foobar=baz
4.1\tignored\tignore\tVERB\tVBN\tTense=Past\t0\t_\t_\t_
5\t.\t.\tPUNCT\t.\t_\t3\tpunct\t_\t_
"""
  # Prepare test sentence.
  with open(self.corpus_file, 'w') as f:
    f.write(test_sentence)
  # Prepare context.
  self.WriteContext('conll-sentence')
  # Test converted sentence.
  sentence, _ = gen_parser_ops.document_source(
      task_context=self.context_file, batch_size=1)
  # Expected texts, words, and start/end offsets.
  expected_text = u'We\'ve moved on.'
  expected_words = [u'We', u'\'ve', u'moved', u'on', u'.']
  expected_starts = [0, 2, 6, 12, 14]
  expected_ends = [1, 4, 10, 13, 14]
  with self.test_session() as sess:
    sentence_doc = self.ReadNextDocument(sess, sentence)
    self.assertEqual(expected_text, sentence_doc.text)
    self.assertEqual(expected_words, [t.word for t in sentence_doc.token])
    self.assertEqual(expected_starts, [t.start for t in sentence_doc.token])
    self.assertEqual(expected_ends, [t.end for t in sentence_doc.token])

def main(unused_argv):
  logging.set_verbosity(logging.INFO)
  with tf.Session() as sess:
    src = gen_parser_ops.document_source(batch_size=32,
                                         corpus_name=FLAGS.corpus_name,
                                         task_context=FLAGS.task_context)
    sentence = sentence_pb2.Sentence()
    while True:
      documents, finished = sess.run(src)
      logging.info('Read %d documents', len(documents))
      for d in documents:
        sentence.ParseFromString(d)
        # print '...Sentence string before serialization: ', d
        tr = asciitree.LeftAligned()
        d = to_dict(sentence)
        print 'Input: %s' % sentence.text
        serializedStr = sentence.SerializeToString()
        # print '...Sentence string protobuf: ', serializedStr
        # Write the serialized sentence; 'with' closes the file even on
        # error, and avoids shadowing the builtin 'file'.
        with open('/Users/yihed/Documents/workspace/Other/src/thmp/data/'
                  'serializedSentence.txt', 'wb') as out_file:
          out_file.write(serializedStr)
        print 'Parse:'
        print tr(d)
      if finished:
        break

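# A hedged companion to the snippet above (not part of the original): reading
# the serialized sentence back is a plain protobuf round trip, which is a
# quick way to verify what was written. The helper name is hypothetical; the
# path argument would be the hard-coded path used above.
def _read_serialized_sentence(path):
  """Parses a serialized Sentence proto previously written to disk."""
  sentence = sentence_pb2.Sentence()
  with open(path, 'rb') as f:
    sentence.ParseFromString(f.read())
  return sentence
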
def CheckTokenization(self, sentence, tokenization):
  self.WriteContext('english-text')
  logging.info('Writing text file to: %s', self.corpus_file)
  with open(self.corpus_file, 'w') as f:
    f.write(sentence)
  sentence, _ = gen_parser_ops.document_source(self.context_file,
                                               batch_size=1)
  with self.test_session() as sess:
    sentence_doc = self.ReadNextDocument(sess, sentence)
    self.assertEqual(' '.join([t.word for t in sentence_doc.token]),
                     tokenization)

def CheckTokenization(self, sentence, tokenization):
  self.WriteContext('english-text')
  logging.info('Writing text file to: %s', self.corpus_file)
  with open(self.corpus_file, 'w') as f:
    f.write(sentence)
  sentence, _ = gen_parser_ops.document_source(
      task_context=self.context_file, batch_size=1)
  with self.test_session() as sess:
    sentence_doc = self.ReadNextDocument(sess, sentence)
    self.assertEqual(' '.join([t.word for t in sentence_doc.token]),
                     tokenization)

def CheckUntokenizedDoc(self, sentence, words, starts, ends):
  self.WriteContext('untokenized-text')
  logging.info('Writing text file to: %s', self.corpus_file)
  with open(self.corpus_file, 'w') as f:
    f.write(sentence)
  sentence, _ = gen_parser_ops.document_source(
      task_context=self.context_file, batch_size=1)
  with self.test_session() as sess:
    sentence_doc = self.ReadNextDocument(sess, sentence)
    self.assertEqual(len(sentence_doc.token), len(words))
    self.assertEqual(len(sentence_doc.token), len(starts))
    self.assertEqual(len(sentence_doc.token), len(ends))
    for i, token in enumerate(sentence_doc.token):
      self.assertEqual(token.word.encode('utf-8'), words[i])
      self.assertEqual(token.start, starts[i])
      self.assertEqual(token.end, ends[i])

def testSentencePrototext(self):
  # Note: lstrip() is to avoid an empty line at the beginning, which will
  # cause an empty record to be emitted. These empty records currently
  # aren't supported by the sentence prototext format (which is currently
  # mostly for debugging).
  test_sentence = """
text: "fair enough; you people have eaten me."
token {
  word: "fair"
  start: 0
  end: 3
  break_level: NO_BREAK
}
token {
  word: "enough"
  start: 5
  end: 10
  head: 0
  break_level: SPACE_BREAK
}
""".lstrip()
  # Prepare test sentence.
  with open(self.corpus_file, 'w') as f:
    f.write(test_sentence)
  # Prepare context.
  self.WriteContext('sentence-prototext')
  # Test converted sentence.
  sentence, _ = gen_parser_ops.document_source(
      task_context=self.context_file, batch_size=1)
  # Expected texts, words, and start/end offsets.
  expected_text = u'fair enough; you people have eaten me.'
  expected_words = [u'fair', u'enough']
  expected_starts = [0, 5]
  expected_ends = [3, 10]
  with self.test_session() as sess:
    sentence_doc = self.ReadNextDocument(sess, sentence)
    self.assertEqual(expected_text, sentence_doc.text)
    self.assertEqual(expected_words, [t.word for t in sentence_doc.token])
    self.assertEqual(expected_starts, [t.start for t in sentence_doc.token])
    self.assertEqual(expected_ends, [t.end for t in sentence_doc.token])

def ValidateDocuments(self):
  doc_source = gen_parser_ops.document_source(self.context_file,
                                              batch_size=1)
  with self.test_session() as sess:
    logging.info('Reading document1')
    doc, last = self.ReadNextDocument(sess, doc_source)
    self.assertEqual(len(doc.token), 12)
    self.assertEqual(u'लाजमी', doc.token[9].word)
    self.assertFalse(last)
    logging.info('Reading document2')
    doc, last = self.ReadNextDocument(sess, doc_source)
    self.assertEqual(len(doc.token), 13)
    self.assertEqual(u'भंग', doc.token[9].word)
    self.assertFalse(last)
    logging.info('Hitting end of the dataset')
    doc, last = self.ReadNextDocument(sess, doc_source)
    self.assertTrue(doc is None)
    self.assertTrue(last)

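# The tests in this collection all call a ReadNextDocument helper that is
# never shown. A minimal sketch consistent with how it is used here: run the
# source op once, parse the first serialized proto of the batch, and return
# (None, last) at end of input. The exact body in the original test harness
# may differ.
def ReadNextDocument(self, sess, doc_source):
  doc_str, last = sess.run(doc_source)
  if doc_str:
    doc = sentence_pb2.Sentence()
    doc.ParseFromString(doc_str[0])
  else:
    doc = None
  return doc, last
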
def main(unused_argv):
  logging.set_verbosity(logging.INFO)
  with tf.Session() as sess:
    src = gen_parser_ops.document_source(batch_size=32,
                                         corpus_name=FLAGS.corpus_name,
                                         task_context=FLAGS.task_context)
    sentence = sentence_pb2.Sentence()
    while True:
      documents, finished = sess.run(src)
      logging.info('Read %d documents', len(documents))
      for d in documents:
        sentence.ParseFromString(d)
        d = to_dict(sentence)
        print json.dumps(d)
      if finished:
        break

def main(unused_argv):
  logging.set_verbosity(logging.INFO)
  if not gfile.IsDirectory(OutputPath('')):
    gfile.MakeDirs(OutputPath(''))

  # Rewrite context.
  RewriteContext()

  # Creates necessary term maps.
  if FLAGS.compute_lexicon:
    logging.info('Computing lexicon...')
    with tf.Session(FLAGS.tf_master) as sess:
      gen_parser_ops.lexicon_builder(task_context=OutputPath('context'),
                                     corpus_name=FLAGS.training_corpus).run()
  with tf.Session(FLAGS.tf_master) as sess:
    feature_sizes, domain_sizes, embedding_dims, num_actions = sess.run(
        gen_parser_ops.feature_size(task_context=OutputPath('context'),
                                    arg_prefix=FLAGS.arg_prefix))

  # Well formed and projectivize.
  if FLAGS.projectivize_training_set:
    logging.info('Preprocessing...')
    with tf.Session(FLAGS.tf_master) as sess:
      source, last = gen_parser_ops.document_source(
          task_context=OutputPath('context'),
          batch_size=FLAGS.batch_size,
          corpus_name=FLAGS.training_corpus)
      sink = gen_parser_ops.document_sink(
          task_context=OutputPath('context'),
          corpus_name='projectivized-training-corpus',
          documents=gen_parser_ops.projectivize_filter(
              gen_parser_ops.well_formed_filter(
                  source, task_context=OutputPath('context')),
              task_context=OutputPath('context')))
      while True:
        tf_last, _ = sess.run([last, sink])
        if tf_last:
          break

  logging.info('Training...')
  with tf.Session(FLAGS.tf_master) as sess:
    Train(sess, num_actions, feature_sizes, domain_sizes, embedding_dims)

def CheckSegmentationTrainingData(self, doc_lines, doc_text, doc_words,
                                  break_levels):
  # Prepare context.
  self.WriteContext('segment-train-data')
  # Prepare test sentence.
  with open(self.corpus_file, 'w') as f:
    f.write(''.join(doc_lines))
  # Test converted sentence.
  sentence, _ = gen_parser_ops.document_source(
      task_context=self.context_file, batch_size=1)
  with self.test_session() as sess:
    sentence_doc = self.ReadNextDocument(sess, sentence)
    self.assertEqual(doc_text.decode('utf-8'), sentence_doc.text)
    self.assertEqual([t.decode('utf-8') for t in doc_words],
                     [t.word for t in sentence_doc.token])
    self.assertEqual(break_levels,
                     [t.break_level for t in sentence_doc.token])

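# The tests also prepare their task context via a WriteContext helper that is
# not shown. A plausible sketch, assuming the helper just writes a prototext
# that points the 'documents' input at self.corpus_file — the same shape as
# the task_context_str used by the reader __init__ snippets above. The real
# helper may set additional parameters.
def WriteContext(self, corpus_format):
  context_str = """
    input {
      name: 'documents'
      record_format: '%s'
      Part {
        file_pattern: '%s'
      }
    }""" % (corpus_format, self.corpus_file)
  with open(self.context_file, 'w') as f:
    f.write(context_str)
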
def main(unused_argv):
  logging.set_verbosity(logging.INFO)
  if not gfile.IsDirectory(OutputPath('')):
    gfile.MakeDirs(OutputPath(''))

  # Rewrite context.
  RewriteContext()

  # Creates necessary term maps.
  if FLAGS.compute_lexicon:
    logging.info('Computing lexicon...')
    with tf.Session(FLAGS.tf_master) as sess:
      gen_parser_ops.lexicon_builder(
          task_context=OutputPath('context'),
          corpus_name=FLAGS.training_corpus).run()
  with tf.Session(FLAGS.tf_master) as sess:
    feature_sizes, domain_sizes, embedding_dims, num_actions = sess.run(
        gen_parser_ops.feature_size(task_context=OutputPath('context'),
                                    arg_prefix=FLAGS.arg_prefix))

  # Well formed and projectivize.
  if FLAGS.projectivize_training_set:
    logging.info('Preprocessing...')
    with tf.Session(FLAGS.tf_master) as sess:
      source, last = gen_parser_ops.document_source(
          task_context=OutputPath('context'),
          batch_size=FLAGS.batch_size,
          corpus_name=FLAGS.training_corpus)
      sink = gen_parser_ops.document_sink(
          task_context=OutputPath('context'),
          corpus_name='projectivized-training-corpus',
          documents=gen_parser_ops.projectivize_filter(
              gen_parser_ops.well_formed_filter(
                  source, task_context=OutputPath('context')),
              task_context=OutputPath('context')))
      while True:
        tf_last, _ = sess.run([last, sink])
        if tf_last:
          break

  logging.info('Training...')
  with tf.Session(FLAGS.tf_master) as sess:
    Train(sess, num_actions, feature_sizes, domain_sizes, embedding_dims)

def _get_sentence_dict():
  logging.set_verbosity(logging.INFO)
  with tf.Session() as sess:
    src = gen_parser_ops.document_source(
        batch_size=32,
        corpus_name='input-from-file-conll',
        task_context=task_context_path)
    sentence = sentence_pb2.Sentence()
    result_dict = None
    while True:
      documents, finished = sess.run(src)
      for d in documents:
        sentence.ParseFromString(d)
        d = to_dict(sentence)
        result_dict = d
      if finished:
        break
    return result_dict

def main(unused_argv):
  logging.set_verbosity(logging.INFO)
  with tf.Session() as sess:
    src = gen_parser_ops.document_source(batch_size=32,
                                         corpus_name=FLAGS.corpus_name,
                                         task_context=FLAGS.task_context)
    sentence = sentence_pb2.Sentence()
    while True:
      documents, finished = sess.run(src)
      logging.info('Read %d documents', len(documents))
      for d in documents:
        sentence.ParseFromString(d)
        tr = asciitree.LeftAligned()
        d = to_dict(sentence)
        print('Input: %s' % sentence.text)
        print('Parse:')
        print(tr(d))
      if finished:
        break

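# The demo mains above all lean on a to_dict helper to turn a Sentence proto
# into the nested dict that asciitree renders. A sketch consistent with how
# it is called here (the helper in the original codebase may differ): each
# token is keyed by "word tag label", children are grouped under their head,
# and the token with head == -1 forms the root of the tree.
import collections

def to_dict(sentence):
  """Builds a dictionary representing the parse tree of a sentence."""
  token_str = ['%s %s %s' % (token.word, token.tag, token.label)
               for token in sentence.token]
  children = [[] for _ in sentence.token]
  root = -1
  for i, token in enumerate(sentence.token):
    if token.head == -1:
      root = i
    else:
      children[token.head].append(i)

  def _get_dict(i):
    d = collections.OrderedDict()
    for c in children[i]:
      d[token_str[c]] = _get_dict(c)
    return d

  d = collections.OrderedDict()
  d[token_str[root]] = _get_dict(root)
  return d
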
def __init__(self, filepath, record_format, batch_size=32,
             check_well_formed=False, projectivize=False, morph_to_pos=False):
  self._graph = tf.Graph()
  self._session = tf.Session(graph=self._graph)
  task_context_str = """
    input {
      name: 'documents'
      record_format: '%s'
      Part {
        file_pattern: '%s'
      }
    }""" % (record_format, filepath)
  if morph_to_pos:
    task_context_str += """
    Parameter { name: "join_category_to_pos" value: "true" }
    Parameter { name: "add_pos_as_attribute" value: "true" }
    Parameter { name: "serialize_morph_to_pos" value: "true" }
    """
  with self._graph.as_default():
    self._source, self._is_last = gen_parser_ops.document_source(
        task_context_str=task_context_str, batch_size=batch_size)
    if check_well_formed:
      self._source = gen_parser_ops.well_formed_filter(self._source)
    if projectivize:
      self._source = gen_parser_ops.projectivize_filter(self._source)

def main(unused_argv):
  logging.set_verbosity(logging.INFO)
  with tf.Session() as sess:
    src = gen_parser_ops.document_source(batch_size=32,
                                         corpus_name=FLAGS.corpus_name,
                                         task_context=FLAGS.task_context)
    sentence = sentence_pb2.Sentence()
    while True:
      documents, finished = sess.run(src)
      logging.info('Read %d documents', len(documents))
      for d in documents:
        sentence.ParseFromString(d)
        # tr = asciitree.LeftAligned()
        d = to_dict(sentence)
        print 'Input: %s' % sentence.text
        print 'Parse:'
        print json.dumps(d, indent=True)
        # dom = parseString(dicttoxml.dicttoxml(d, attr_type=False))
        # print dom.toprettyxml()
        # print dicttoxml.dicttoxml(d)
      if finished:
        break

def main(unused_argv):
  logging.set_verbosity(logging.INFO)
  with tf.Session() as sess:
    src = gen_parser_ops.document_source(batch_size=32,
                                         corpus_name=FLAGS.corpus_name,
                                         task_context=FLAGS.task_context)
    sentence = sentence_pb2.Sentence()
    while True:
      documents, finished = sess.run(src)
      logging.info('Read %d documents', len(documents))
      for d in documents:
        sentence.ParseFromString(d)
        tr = asciitree.LeftAligned()
        d = to_dict(sentence)
        print('Input: %s' % sentence.text)
        print('Parse:')
        tr_str = tr(d)
        pat = re.compile(r'\s*@\d+$')
        for tr_ln in tr_str.splitlines():
          print(pat.sub('', tr_ln))
      if finished:
        break

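# A quick, self-contained illustration (not from the original) of what the
# r'\s*@\d+$' pattern in the two mains above strips: the rendered tree lines
# carry a trailing '@<token index>' marker (apparently added so repeated
# words form unique dict keys), and the loop removes it before printing.
# The sample strings are made up for the demo.
import re

pat = re.compile(r'\s*@\d+$')
assert pat.sub('', 'moved VBN root @3') == 'moved VBN root'
assert pat.sub('', 'no marker here') == 'no marker here'
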
def main(unused_argv):
  logging.set_verbosity(logging.INFO)
  with tf.Session() as sess:
    src = gen_parser_ops.document_source(batch_size=32,
                                         corpus_name=FLAGS.corpus_name,
                                         task_context=FLAGS.task_context)
    sentence = sentence_pb2.Sentence()
    while True:
      documents, finished = sess.run(src)
      logging.info('Read %d documents', len(documents))
      for d in documents:
        sentence.ParseFromString(d)
        tr = asciitree.LeftAligned()
        d = to_dict(sentence)
        print 'Input: %s' % sentence.text
        print 'Parse:'
        tr_str = tr(d)
        pat = re.compile(r'\s*@\d+$')
        for tr_ln in tr_str.splitlines():
          print pat.sub('', tr_ln)
      if finished:
        break

def main(unused_argv):
  logging.set_verbosity(logging.DEBUG)
  with tf.Session() as sess:
    src = gen_parser_ops.document_source(batch_size=32,
                                         corpus_name=FLAGS.corpus_name,
                                         task_context=FLAGS.task_context)
    sentence = sentence_pb2.Sentence()
    while True:
      documents, finished = sess.run(src)
      logging.info('Read %d documents', len(documents))
      for d in documents:
        sentence.ParseFromString(d)
        print formatter.format(sentence)
        # d_raw = d
        # sentence.ParseFromString(d)
        # tr = asciitree.LeftAligned()
        # d = to_dict(sentence)
        # print 'Input: %s' % sentence.text
        # print 'Parse:'
        # print tr(d)
        # print d_raw
      if finished:
        break

def main(unused_argv):
  logging.set_verbosity(logging.INFO)
  model_dir = FLAGS.model_dir
  task_context = "%s/context.pbtxt" % model_dir

  common_params = {
      "task_context": task_context,
      "beam_size": 8,
      "max_steps": 1000,
      "graph_builder": "structured",
      "batch_size": 1024,
      "slim_model": True,
  }

  model = {
      "brain_parser": {
          "arg_prefix": "brain_parser",
          "hidden_layer_sizes": "512,512",
          # Input is taken from the input tensor, not from the corpus.
          "input": None,
          "model_path": "%s/parser-params" % model_dir,
      },
  }

  for prefix in ["brain_parser"]:
    model[prefix].update(common_params)
    feature_sizes, domain_sizes, embedding_dims, num_actions = GetFeatureSize(
        task_context, prefix)
    model[prefix].update({
        'feature_sizes': feature_sizes,
        'domain_sizes': domain_sizes,
        'embedding_dims': embedding_dims,
        'num_actions': num_actions
    })

  with tf.Session() as sess:
    if FLAGS.export_path is not None:
      text_input = tf.placeholder(tf.string, [None])
    else:
      text_input = tf.constant(["parsey is the greatest"], tf.string)

    # corpus_name must be specified and valid because it indirectly informs
    # the document format ("english-text" vs "conll-sentence") used to parse
    # the input text.
    document_source = gen_parser_ops.document_source(
        text=text_input,
        task_context=task_context,
        corpus_name="stdin-conll",
        batch_size=common_params['batch_size'],
        documents_from_input=True)

    for prefix in ["brain_parser"]:
      with tf.variable_scope(prefix):
        if True or prefix == "brain_tagger":
          # source = document_source.documents if prefix == "brain_tagger"
          #     else model["brain_tagger"]["documents"]
          source = document_source.documents
          model[prefix]["documents"] = Build(sess, source, model[prefix])

    if FLAGS.export_path is None:
      sink = gen_parser_ops.document_sink(
          model["brain_parser"]["documents"],
          task_context=task_context,
          corpus_name="stdout-conll")
      sess.run(sink)
    else:
      assets = []
      for model_file in os.listdir(model_dir):
        path = os.path.join(model_dir, model_file)
        if not os.path.isdir(path):
          assets.append(tf.constant(path))

      ExportModel(sess, FLAGS.export_path, text_input,
                  model["brain_parser"]["documents"], assets)