def pretty_print():
    _write_input(_read_output().strip())
    logging.set_verbosity(logging.INFO)
    with tf.Session() as sess:
        src = gen_parser_ops.document_source(
            batch_size=32,
            corpus_name='input-from-file-conll',
            task_context=task_context_path)
        sentence = sentence_pb2.Sentence()
        while True:
            documents, finished = sess.run(src)
            logging.info('Read %d documents', len(documents))
            # for d in documents:
            # 	sentence.ParseFromString(d)
            # 	as_asciitree(sentence)
            for d in documents:
                sentence.ParseFromString(d)
                tr = asciitree.LeftAligned()
                d = to_dict(sentence)
                print('Input: %s' % sentence.text)
                print('Parse:')
                tr_str = tr(d)
                pat = re.compile(r'\s*@\d+$')
                for tr_ln in tr_str.splitlines():
                    print(pat.sub('', tr_ln))
            if finished:
                break
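Several of these examples call a to_dict() helper that the snippets themselves never define. Below is a minimal sketch of what it could look like, loosely adapted from SyntaxNet's conll2tree demo; the exact label formatting (including the trailing '@<index>' that the regex above strips off) is an assumption, not code from this listing.

import collections

def to_dict(sentence):
    """Builds a nested dict describing the parse tree of a Sentence proto."""
    token_str = ['%s %s %s @%d' % (t.word, t.tag, t.label, i + 1)
                 for i, t in enumerate(sentence.token)]
    children = [[] for _ in sentence.token]
    root = -1
    for i, t in enumerate(sentence.token):
        if t.head == -1:
            root = i  # a token with no head is the root
        else:
            children[t.head].append(i)

    def _subtree(i):
        d = collections.OrderedDict()
        for c in children[i]:
            d[token_str[c]] = _subtree(c)
        return d

    # asciitree.LeftAligned expects a single-key dict rooted at the head word.
    return collections.OrderedDict([(token_str[root], _subtree(root))])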
Example #2
  def testConllSentence(self):
    # This test sentence includes a multiword token and an empty node,
    # both of which are to be ignored.
    test_sentence = """
1-2	We've	_
1	We	we	PRON	PRP	Case=Nom	3	nsubj	_	SpaceAfter=No
2	've	have	AUX	VBP	Mood=Ind	3	aux	_	_
3	moved	move	VERB	VBN	Tense=Past	0	root	_	_
4	on	on	ADV	RB	_	3	advmod	_	SpaceAfter=No
4.1	ignored	ignore	VERB	VBN	Tense=Past	0	_	_	_
5	.	.	PUNCT	.	_	3	punct	_	_
"""

    # Prepare test sentence.
    with open(self.corpus_file, 'w') as f:
      f.write(test_sentence)

    # Prepare context.
    self.WriteContext('conll-sentence')

    # Test converted sentence.
    sentence, _ = gen_parser_ops.document_source(
        task_context=self.context_file, batch_size=1)

    # Expected texts, words, and start/end offsets.
    expected_text = u'We\'ve moved on.'
    expected_words = [u'We', u'\'ve', u'moved', u'on', u'.']
    expected_starts = [0, 2, 6, 12, 14]
    expected_ends = [1, 4, 10, 13, 14]
    with self.test_session() as sess:
      sentence_doc = self.ReadNextDocument(sess, sentence)
      self.assertEqual(expected_text, sentence_doc.text)
      self.assertEqual(expected_words, [t.word for t in sentence_doc.token])
      self.assertEqual(expected_starts, [t.start for t in sentence_doc.token])
      self.assertEqual(expected_ends, [t.end for t in sentence_doc.token])
 def __init__(self, filepath, batch_size=32,
              projectivize=False, morph_to_pos=False):
   self._graph = tf.Graph()
   self._session = tf.Session(graph=self._graph)
   task_context_str = """
         input {
           name: 'documents'
           record_format: 'conll-sentence'
           Part {
            file_pattern: '%s'
           }
         }""" % filepath
   if morph_to_pos:
     task_context_str += """
         Parameter {
           name: "join_category_to_pos"
           value: "true"
         }
         Parameter {
           name: "add_pos_as_attribute"
           value: "true"
         }
         Parameter {
           name: "serialize_morph_to_pos"
           value: "true"
         }
         """
   with self._graph.as_default():
     self._source, self._is_last = gen_parser_ops.document_source(
         task_context_str=task_context_str, batch_size=batch_size)
     self._source = gen_parser_ops.well_formed_filter(self._source)
     if projectivize:
       self._source = gen_parser_ops.projectivize_filter(self._source)
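The constructor above only assembles the reading graph; the rest of the class is not shown. A hypothetical read() method (an illustration, not part of the original snippet, assuming sentence_pb2 is imported) could drain one batch at a time like this:

  def read(self):
    """Runs the document source once; returns Sentence protos plus a last-batch flag."""
    with self._graph.as_default():
      serialized, is_last = self._session.run([self._source, self._is_last])
    sentences = []
    for d in serialized:
      s = sentence_pb2.Sentence()
      s.ParseFromString(d)
      sentences.append(s)
    return sentences, is_last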
  def testConllSentence(self):
    # This test sentence includes a multiword token and an empty node,
    # both of which are to be ignored.
    test_sentence = """
1-2\tWe've\t_
1\tWe\twe\tPRON\tPRP\tCase=Nom\t3\tnsubj\t_\tSpaceAfter=No
2\t've\thave\tAUX\tVBP\tMood=Ind\t3\taux\t_\t_
3\tmoved\tmove\tVERB\tVBN\tTense=Past\t0\troot\t_\t_
4\ton\ton\tADV\tRB\t_\t3\tadvmod\t_\tSpaceAfter=No|foobar=baz
4.1\tignored\tignore\tVERB\tVBN\tTense=Past\t0\t_\t_\t_
5\t.\t.\tPUNCT\t.\t_\t3\tpunct\t_\t_
"""

    # Prepare test sentence.
    with open(self.corpus_file, 'w') as f:
      f.write(test_sentence)

    # Prepare context.
    self.WriteContext('conll-sentence')

    # Test converted sentence.
    sentence, _ = gen_parser_ops.document_source(
        task_context=self.context_file, batch_size=1)

    # Expected texts, words, and start/end offsets.
    expected_text = u'We\'ve moved on.'
    expected_words = [u'We', u'\'ve', u'moved', u'on', u'.']
    expected_starts = [0, 2, 6, 12, 14]
    expected_ends = [1, 4, 10, 13, 14]
    with self.test_session() as sess:
      sentence_doc = self.ReadNextDocument(sess, sentence)
      self.assertEqual(expected_text, sentence_doc.text)
      self.assertEqual(expected_words, [t.word for t in sentence_doc.token])
      self.assertEqual(expected_starts, [t.start for t in sentence_doc.token])
      self.assertEqual(expected_ends, [t.end for t in sentence_doc.token])
def main(unused_argv):
  logging.set_verbosity(logging.INFO)
  with tf.Session() as sess:
    src = gen_parser_ops.document_source(batch_size=32,
                                         corpus_name=FLAGS.corpus_name,
                                         task_context=FLAGS.task_context)
    sentence = sentence_pb2.Sentence()
    
    while True:
      documents, finished = sess.run(src)
      logging.info('Read %d documents', len(documents))
      for d in documents:
        sentence.ParseFromString(d)
        #print '...Sentence string before serialization: ', d
        tr = asciitree.LeftAligned()
        d = to_dict(sentence)
        print 'Input: %s' % sentence.text
        serialized_str = sentence.SerializeToString()
        #print '...Sentence string protobuf: ', serialized_str
        # Write the serialized Sentence proto to disk so other tools can load it.
        out_file = open("/Users/yihed/Documents/workspace/Other/src/thmp/data/serializedSentence.txt", "wb")
        #out_file = open("serializedSentence.txt", "wb")
        out_file.write(serialized_str)
        out_file.close()
        print 'Parse:'
        print tr(d)

      if finished:
        break
Example #7
 def CheckTokenization(self, sentence, tokenization):
     self.WriteContext('english-text')
     logging.info('Writing text file to: %s', self.corpus_file)
     with open(self.corpus_file, 'w') as f:
         f.write(sentence)
     sentence, _ = gen_parser_ops.document_source(self.context_file,
                                                  batch_size=1)
     with self.test_session() as sess:
         sentence_doc = self.ReadNextDocument(sess, sentence)
         self.assertEqual(' '.join([t.word for t in sentence_doc.token]),
                          tokenization)
 def CheckTokenization(self, sentence, tokenization):
   self.WriteContext('english-text')
   logging.info('Writing text file to: %s', self.corpus_file)
   with open(self.corpus_file, 'w') as f:
     f.write(sentence)
   sentence, _ = gen_parser_ops.document_source(
       task_context=self.context_file, batch_size=1)
   with self.test_session() as sess:
     sentence_doc = self.ReadNextDocument(sess, sentence)
     self.assertEqual(' '.join([t.word
                                for t in sentence_doc.token]), tokenization)
 def CheckUntokenizedDoc(self, sentence, words, starts, ends):
   self.WriteContext('untokenized-text')
   logging.info('Writing text file to: %s', self.corpus_file)
   with open(self.corpus_file, 'w') as f:
     f.write(sentence)
   sentence, _ = gen_parser_ops.document_source(
       task_context=self.context_file, batch_size=1)
   with self.test_session() as sess:
     sentence_doc = self.ReadNextDocument(sess, sentence)
     self.assertEqual(len(sentence_doc.token), len(words))
     self.assertEqual(len(sentence_doc.token), len(starts))
     self.assertEqual(len(sentence_doc.token), len(ends))
     for i, token in enumerate(sentence_doc.token):
       self.assertEqual(token.word.encode('utf-8'), words[i])
       self.assertEqual(token.start, starts[i])
       self.assertEqual(token.end, ends[i])
 def CheckUntokenizedDoc(self, sentence, words, starts, ends):
     self.WriteContext('untokenized-text')
     logging.info('Writing text file to: %s', self.corpus_file)
     with open(self.corpus_file, 'w') as f:
         f.write(sentence)
     sentence, _ = gen_parser_ops.document_source(
         task_context=self.context_file, batch_size=1)
     with self.test_session() as sess:
         sentence_doc = self.ReadNextDocument(sess, sentence)
         self.assertEqual(len(sentence_doc.token), len(words))
         self.assertEqual(len(sentence_doc.token), len(starts))
         self.assertEqual(len(sentence_doc.token), len(ends))
         for i, token in enumerate(sentence_doc.token):
             self.assertEqual(token.word.encode('utf-8'), words[i])
             self.assertEqual(token.start, starts[i])
             self.assertEqual(token.end, ends[i])
    def testSentencePrototext(self):
        # Note: lstrip() is to avoid an empty line at the beginning, which will
        # cause an empty record to be emitted. These empty records currently aren't
        # supported by the sentence prototext format (which is currently mostly for
        # debugging).
        test_sentence = """
text: "fair enough; you people have eaten me."
token {
  word: "fair"
  start: 0
  end: 3
  break_level: NO_BREAK
}
token {
  word: "enough"
  start: 5
  end: 10
  head: 0
  break_level: SPACE_BREAK
}
""".lstrip()

        # Prepare test sentence.
        with open(self.corpus_file, 'w') as f:
            f.write(test_sentence)

        # Prepare context.
        self.WriteContext('sentence-prototext')

        # Test converted sentence.
        sentence, _ = gen_parser_ops.document_source(
            task_context=self.context_file, batch_size=1)

        # Expected texts, words, and start/end offsets.
        expected_text = u'fair enough; you people have eaten me.'
        expected_words = [u'fair', u'enough']
        expected_starts = [0, 5]
        expected_ends = [3, 10]
        with self.test_session() as sess:
            sentence_doc = self.ReadNextDocument(sess, sentence)
            self.assertEqual(expected_text, sentence_doc.text)
            self.assertEqual(expected_words,
                             [t.word for t in sentence_doc.token])
            self.assertEqual(expected_starts,
                             [t.start for t in sentence_doc.token])
            self.assertEqual(expected_ends,
                             [t.end for t in sentence_doc.token])
 def ValidateDocuments(self):
   doc_source = gen_parser_ops.document_source(self.context_file, batch_size=1)
   with self.test_session() as sess:
     logging.info('Reading document1')
     doc, last = self.ReadNextDocument(sess, doc_source)
     self.assertEqual(len(doc.token), 12)
     self.assertEqual(u'लाजमी', doc.token[9].word)
     self.assertFalse(last)
     logging.info('Reading document2')
     doc, last = self.ReadNextDocument(sess, doc_source)
     self.assertEqual(len(doc.token), 13)
     self.assertEqual(u'भंग', doc.token[9].word)
     self.assertFalse(last)
     logging.info('Hitting end of the dataset')
     doc, last = self.ReadNextDocument(sess, doc_source)
     self.assertTrue(doc is None)
     self.assertTrue(last)
Example #13
def main(unused_argv):
  logging.set_verbosity(logging.INFO)
  with tf.Session() as sess:
    src = gen_parser_ops.document_source(batch_size=32,
                                         corpus_name=FLAGS.corpus_name,
                                         task_context=FLAGS.task_context)
    sentence = sentence_pb2.Sentence()
    while True:
      documents, finished = sess.run(src)
      logging.info('Read %d documents', len(documents))
      for d in documents:
        sentence.ParseFromString(d)
        d = to_dict(sentence)
        print json.dumps(d)

      if finished:
        break
  def testSentencePrototext(self):
    # Note: lstrip() is to avoid an empty line at the beginning, which will
    # cause an empty record to be emitted. These empty records currently aren't
    # supported by the sentence prototext format (which is currently mostly for
    # debugging).
    test_sentence = """
text: "fair enough; you people have eaten me."
token {
  word: "fair"
  start: 0
  end: 3
  break_level: NO_BREAK
}
token {
  word: "enough"
  start: 5
  end: 10
  head: 0
  break_level: SPACE_BREAK
}
""".lstrip()

    # Prepare test sentence.
    with open(self.corpus_file, 'w') as f:
      f.write(test_sentence)

    # Prepare context.
    self.WriteContext('sentence-prototext')

    # Test converted sentence.
    sentence, _ = gen_parser_ops.document_source(
        task_context=self.context_file, batch_size=1)

    # Expected texts, words, and start/end offsets.
    expected_text = u'fair enough; you people have eaten me.'
    expected_words = [u'fair', u'enough']
    expected_starts = [0, 5]
    expected_ends = [3, 10]
    with self.test_session() as sess:
      sentence_doc = self.ReadNextDocument(sess, sentence)
      self.assertEqual(expected_text, sentence_doc.text)
      self.assertEqual(expected_words, [t.word for t in sentence_doc.token])
      self.assertEqual(expected_starts, [t.start for t in sentence_doc.token])
      self.assertEqual(expected_ends, [t.end for t in sentence_doc.token])
Example #16
def main(unused_argv):
  logging.set_verbosity(logging.INFO)
  if not gfile.IsDirectory(OutputPath('')):
    gfile.MakeDirs(OutputPath(''))

  # Rewrite context.
  RewriteContext()

  # Creates necessary term maps.
  if FLAGS.compute_lexicon:
    logging.info('Computing lexicon...')
    with tf.Session(FLAGS.tf_master) as sess:
      gen_parser_ops.lexicon_builder(task_context=OutputPath('context'),
                                     corpus_name=FLAGS.training_corpus).run()
  with tf.Session(FLAGS.tf_master) as sess:
    feature_sizes, domain_sizes, embedding_dims, num_actions = sess.run(
        gen_parser_ops.feature_size(task_context=OutputPath('context'),
                                    arg_prefix=FLAGS.arg_prefix))

  # Filter out malformed sentences and projectivize the training set.
  if FLAGS.projectivize_training_set:
    logging.info('Preprocessing...')
    with tf.Session(FLAGS.tf_master) as sess:
      source, last = gen_parser_ops.document_source(
          task_context=OutputPath('context'),
          batch_size=FLAGS.batch_size,
          corpus_name=FLAGS.training_corpus)
      sink = gen_parser_ops.document_sink(
          task_context=OutputPath('context'),
          corpus_name='projectivized-training-corpus',
          documents=gen_parser_ops.projectivize_filter(
              gen_parser_ops.well_formed_filter(source,
                                                task_context=OutputPath(
                                                    'context')),
              task_context=OutputPath('context')))
      while True:
        tf_last, _ = sess.run([last, sink])
        if tf_last:
          break

  logging.info('Training...')
  with tf.Session(FLAGS.tf_master) as sess:
    Train(sess, num_actions, feature_sizes, domain_sizes, embedding_dims)
    def CheckSegmentationTrainingData(self, doc_lines, doc_text, doc_words,
                                      break_levels):
        # Prepare context.
        self.WriteContext('segment-train-data')

        # Prepare test sentence.
        with open(self.corpus_file, 'w') as f:
            f.write(''.join(doc_lines))

        # Test converted sentence.
        sentence, _ = gen_parser_ops.document_source(
            task_context=self.context_file, batch_size=1)
        with self.test_session() as sess:
            sentence_doc = self.ReadNextDocument(sess, sentence)
            self.assertEqual(doc_text.decode('utf-8'), sentence_doc.text)
            self.assertEqual([t.decode('utf-8') for t in doc_words],
                             [t.word for t in sentence_doc.token])
            self.assertEqual(break_levels,
                             [t.break_level for t in sentence_doc.token])
Example #18
def main(unused_argv):
    logging.set_verbosity(logging.INFO)
    if not gfile.IsDirectory(OutputPath('')):
        gfile.MakeDirs(OutputPath(''))

    # Rewrite context.
    RewriteContext()

    # Creates necessary term maps.
    if FLAGS.compute_lexicon:
        logging.info('Computing lexicon...')
        with tf.Session(FLAGS.tf_master) as sess:
            gen_parser_ops.lexicon_builder(
                task_context=OutputPath('context'),
                corpus_name=FLAGS.training_corpus).run()
    with tf.Session(FLAGS.tf_master) as sess:
        feature_sizes, domain_sizes, embedding_dims, num_actions = sess.run(
            gen_parser_ops.feature_size(task_context=OutputPath('context'),
                                        arg_prefix=FLAGS.arg_prefix))

    # Filter out malformed sentences and projectivize the training set.
    if FLAGS.projectivize_training_set:
        logging.info('Preprocessing...')
        with tf.Session(FLAGS.tf_master) as sess:
            source, last = gen_parser_ops.document_source(
                task_context=OutputPath('context'),
                batch_size=FLAGS.batch_size,
                corpus_name=FLAGS.training_corpus)
            sink = gen_parser_ops.document_sink(
                task_context=OutputPath('context'),
                corpus_name='projectivized-training-corpus',
                documents=gen_parser_ops.projectivize_filter(
                    gen_parser_ops.well_formed_filter(
                        source, task_context=OutputPath('context')),
                    task_context=OutputPath('context')))
            while True:
                tf_last, _ = sess.run([last, sink])
                if tf_last:
                    break

    logging.info('Training...')
    with tf.Session(FLAGS.tf_master) as sess:
        Train(sess, num_actions, feature_sizes, domain_sizes, embedding_dims)
  def CheckSegmentationTrainingData(self, doc_lines, doc_text, doc_words,
                                    break_levels):
    # Prepare context.
    self.WriteContext('segment-train-data')

    # Prepare test sentence.
    with open(self.corpus_file, 'w') as f:
      f.write(''.join(doc_lines))

    # Test converted sentence.
    sentence, _ = gen_parser_ops.document_source(
        task_context=self.context_file, batch_size=1)
    with self.test_session() as sess:
      sentence_doc = self.ReadNextDocument(sess, sentence)
      self.assertEqual(doc_text.decode('utf-8'), sentence_doc.text)
      self.assertEqual([t.decode('utf-8') for t in doc_words],
                       [t.word for t in sentence_doc.token])
      self.assertEqual(break_levels,
                       [t.break_level for t in sentence_doc.token])
Example #20
def _get_sentence_dict():
    logging.set_verbosity(logging.INFO)
    with tf.Session() as sess:
        src = gen_parser_ops.document_source(
            batch_size=32,
            corpus_name='input-from-file-conll',
            task_context=task_context_path)
        sentence = sentence_pb2.Sentence()
        result_dict = None
        while True:
            documents, finished = sess.run(src)
            for d in documents:
                sentence.ParseFromString(d)
                d = to_dict(sentence)
                result_dict = d

            if finished:
                break

    return result_dict
Example #21
def main(unused_argv):
    logging.set_verbosity(logging.INFO)
    with tf.Session() as sess:
        src = gen_parser_ops.document_source(batch_size=32,
                                             corpus_name=FLAGS.corpus_name,
                                             task_context=FLAGS.task_context)
        sentence = sentence_pb2.Sentence()
        while True:
            documents, finished = sess.run(src)
            logging.info('Read %d documents', len(documents))
            for d in documents:
                sentence.ParseFromString(d)
                tr = asciitree.LeftAligned()
                d = to_dict(sentence)
                print('Input: %s' % sentence.text)
                print('Parse:')
                print(tr(d))

            if finished:
                break
 def __init__(self,
              filepath,
              record_format,
              batch_size=32,
              check_well_formed=False,
              projectivize=False,
              morph_to_pos=False):
     self._graph = tf.Graph()
     self._session = tf.Session(graph=self._graph)
     task_context_str = """
       input {
         name: 'documents'
         record_format: '%s'
         Part {
          file_pattern: '%s'
         }
       }""" % (record_format, filepath)
     if morph_to_pos:
         task_context_str += """
       Parameter {
         name: "join_category_to_pos"
         value: "true"
       }
       Parameter {
         name: "add_pos_as_attribute"
         value: "true"
       }
       Parameter {
         name: "serialize_morph_to_pos"
         value: "true"
       }
       """
     with self._graph.as_default():
         self._source, self._is_last = gen_parser_ops.document_source(
             task_context_str=task_context_str, batch_size=batch_size)
         if check_well_formed:
             self._source = gen_parser_ops.well_formed_filter(self._source)
         if projectivize:
             self._source = gen_parser_ops.projectivize_filter(self._source)
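The class name is not visible in this excerpt; assuming it is something like DocumentReader, instantiation might look like the sketch below (the class name and corpus path are placeholders, not names from the original code).

# Hypothetical usage of the reader defined above.
reader = DocumentReader('/tmp/train.conll', record_format='conll-sentence',
                        batch_size=32, check_well_formed=True, projectivize=True)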
Example #23
    def testConllSentence(self):
        # This test sentence includes a multiword token and an empty node,
        # both of which are to be ignored.
        test_sentence = """
1-2	We've	_
1	We	we	PRON	PRP	Case=Nom	3	nsubj	_	SpaceAfter=No
2	've	have	AUX	VBP	Mood=Ind	3	aux	_	_
3	moved	move	VERB	VBN	Tense=Past	0	root	_	_
4	on	on	ADV	RB	_	3	advmod	_	SpaceAfter=No
4.1	ignored	ignore	VERB	VBN	Tense=Past	0	_	_	_
5	.	.	PUNCT	.	_	3	punct	_	_
"""

        # Prepare test sentence.
        with open(self.corpus_file, 'w') as f:
            f.write(test_sentence)

        # Prepare context.
        self.WriteContext('conll-sentence')

        # Test converted sentence.
        sentence, _ = gen_parser_ops.document_source(
            task_context=self.context_file, batch_size=1)

        # Expected texts, words, and start/end offsets.
        expected_text = u'We\'ve moved on.'
        expected_words = [u'We', u'\'ve', u'moved', u'on', u'.']
        expected_starts = [0, 2, 6, 12, 14]
        expected_ends = [1, 4, 10, 13, 14]
        with self.test_session() as sess:
            sentence_doc = self.ReadNextDocument(sess, sentence)
            self.assertEqual(expected_text, sentence_doc.text)
            self.assertEqual(expected_words,
                             [t.word for t in sentence_doc.token])
            self.assertEqual(expected_starts,
                             [t.start for t in sentence_doc.token])
            self.assertEqual(expected_ends,
                             [t.end for t in sentence_doc.token])
Example #24
def main(unused_argv):
  logging.set_verbosity(logging.INFO)
  with tf.Session() as sess:
    src = gen_parser_ops.document_source(batch_size=32,
                                         corpus_name=FLAGS.corpus_name,
                                         task_context=FLAGS.task_context)
    sentence = sentence_pb2.Sentence()
    while True:
      documents, finished = sess.run(src)
      logging.info('Read %d documents', len(documents))
      for d in documents:
        sentence.ParseFromString(d)
        # tr = asciitree.LeftAligned()
        d = to_dict(sentence)
        print 'Input: %s' % sentence.text
        print 'Parse:'
        print json.dumps(d, indent=True)
        #dom = parseString(dicttoxml.dicttoxml(d, attr_type=False))
        #print dom.toprettyxml()
        #print dicttoxml.dicttoxml(d)

      if finished:
        break
Example #25
def main(unused_argv):
  logging.set_verbosity(logging.INFO)
  with tf.Session() as sess:
    src = gen_parser_ops.document_source(batch_size=32,
                                         corpus_name=FLAGS.corpus_name,
                                         task_context=FLAGS.task_context)
    sentence = sentence_pb2.Sentence()
    while True:
      documents, finished = sess.run(src)
      logging.info('Read %d documents', len(documents))
      for d in documents:
        sentence.ParseFromString(d)
        tr = asciitree.LeftAligned()
        d = to_dict(sentence)
        print('Input: %s' % sentence.text)
        print('Parse:')
        tr_str = tr(d)
        pat = re.compile(r'\s*@\d+$')
        for tr_ln in tr_str.splitlines():
          print(pat.sub('', tr_ln))

      if finished:
        break
Example #26
def main(unused_argv):
    logging.set_verbosity(logging.INFO)
    with tf.Session() as sess:
        src = gen_parser_ops.document_source(batch_size=32,
                                             corpus_name=FLAGS.corpus_name,
                                             task_context=FLAGS.task_context)
        sentence = sentence_pb2.Sentence()
        while True:
            documents, finished = sess.run(src)
            logging.info('Read %d documents', len(documents))
            for d in documents:
                sentence.ParseFromString(d)
                tr = asciitree.LeftAligned()
                d = to_dict(sentence)
                print 'Input: %s' % sentence.text
                print 'Parse:'
                tr_str = tr(d)
                pat = re.compile(r'\s*@\d+$')
                for tr_ln in tr_str.splitlines():
                    print pat.sub('', tr_ln)

            if finished:
                break
def main(unused_argv):
  logging.set_verbosity(logging.DEBUG)
  with tf.Session() as sess:
    src = gen_parser_ops.document_source(batch_size=32,
                                         corpus_name=FLAGS.corpus_name,
                                         task_context=FLAGS.task_context)
    sentence = sentence_pb2.Sentence()
    while True:
      documents, finished = sess.run(src)
      logging.info('Read %d documents', len(documents))
      for d in documents:
        sentence.ParseFromString(d)
        print formatter.format(sentence)
        # d_raw = d
        # # sentence.ParseFromString(d)
        # # tr = asciitree.LeftAligned()
        # # d = to_dict(sentence)
        # # print 'Input: %s' % sentence.text
        # # print 'Parse:'
        # # print tr(d)
        # print d_raw

      if finished:
        break
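This snippet relies on a formatter object defined elsewhere in the script. One simple stand-in (an assumption, not the original helper) is a thin wrapper over the protobuf text formatter:

from google.protobuf import text_format

class SentenceTextFormatter(object):
    """Stand-in formatter: renders a Sentence proto as prototext."""
    def format(self, sentence):
        return text_format.MessageToString(sentence)

formatter = SentenceTextFormatter()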
Example #28
def main(unused_argv):
    logging.set_verbosity(logging.INFO)

    model_dir = FLAGS.model_dir
    task_context = "%s/context.pbtxt" % model_dir

    common_params = {
        "task_context": task_context,
        "beam_size": 8,
        "max_steps": 1000,
        "graph_builder": "structured",
        "batch_size": 1024,
        "slim_model": True,
    }

    model = {
        "brain_parser": {
            "arg_prefix": "brain_parser",
            "hidden_layer_sizes": "512,512",
            # input is taken from input tensor, not from corpus
            "input": None,
            "model_path": "%s/parser-params" % model_dir,
        },
    }

    for prefix in ["brain_parser"]:
        model[prefix].update(common_params)
        feature_sizes, domain_sizes, embedding_dims, num_actions = GetFeatureSize(
            task_context, prefix)
        model[prefix].update({
            'feature_sizes': feature_sizes,
            'domain_sizes': domain_sizes,
            'embedding_dims': embedding_dims,
            'num_actions': num_actions
        })

    with tf.Session() as sess:
        if FLAGS.export_path is not None:
            text_input = tf.placeholder(tf.string, [None])
        else:
            text_input = tf.constant(["parsey is the greatest"], tf.string)

        # corpus_name must be specified and valid because it indirectly informs
        # the document format ("english-text" vs "conll-sentence") used to parse
        # the input text
        document_source = gen_parser_ops.document_source(
            text=text_input,
            task_context=task_context,
            corpus_name="stdin-conll",
            batch_size=common_params['batch_size'],
            documents_from_input=True)

        for prefix in ["brain_parser"]:
            with tf.variable_scope(prefix):
                if True or prefix == "brain_tagger":
                    #source = document_source.documents if prefix == "brain_tagger" else model["brain_tagger"]["documents"]
                    source = document_source.documents
                    model[prefix]["documents"] = Build(sess, source,
                                                       model[prefix])

        if FLAGS.export_path is None:
            sink = gen_parser_ops.document_sink(
                model["brain_parser"]["documents"],
                task_context=task_context,
                corpus_name="stdout-conll")
            sess.run(sink)
        else:
            assets = []
            for model_file in os.listdir(model_dir):
                path = os.path.join(model_dir, model_file)
                if not os.path.isdir(path):
                    assets.append(tf.constant(path))
            ExportModel(sess, FLAGS.export_path, text_input,
                        model["brain_parser"]["documents"], assets)