Python html_to_document Examples

Programming Language: Python

Namespace/Package Name: yalign.input_conversion

Method/Function: html_to_document

Examples at hotexamples.com: 10

Python html_to_document - 10 examples found. These are the top-rated real-world Python examples of yalign.input_conversion.html_to_document, extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: test_input_conversion.py Project: wannaphong/yalign

 def test_extract(self):
     """Check that paragraph text is extracted as tokenized sentences.

     A one-sentence paragraph must yield a single token list; a
     two-sentence paragraph must yield one token list per sentence, with
     the closing period as its own token.
     """
     # assertEqual is used instead of the deprecated assertEquals alias,
     # which was removed from unittest in Python 3.12.
     html = "<html><head></head><body><p>Hello Peter</p></body></html>"
     d = [list(xs) for xs in html_to_document(html, "en")]
     self.assertEqual([u'Hello Peter'.split()], d)

     html = ("<html><head></head><body><p>Hello Peter. "
             "Go for gold.</p></body></html>")
     d = [list(xs) for xs in html_to_document(html, "en")]
     expected = [u'Hello Peter .'.split(), u'Go for gold .'.split()]
     self.assertEqual(expected, d)

Example #2

Show file

File: test_input_conversion.py Project: ahurriyetoglu/yalign

 def test_extract(self):
     """Verify html_to_document turns <p> content into token lists.

     First case: a paragraph without terminal punctuation gives exactly
     one sentence. Second case: two period-terminated sentences give two
     token lists, each ending in a separate '.' token.
     """
     html = "<html><head></head><body><p>Hello Peter</p></body></html>"
     d = [list(xs) for xs in html_to_document(html, "en")]
     # assertEquals is a deprecated alias (removed in Python 3.12);
     # assertEqual is the supported spelling.
     self.assertEqual([u'Hello Peter'.split()], d)
     html = ("<html><head></head><body><p>Hello Peter. "
             "Go for gold.</p></body></html>")
     d = [list(xs) for xs in html_to_document(html, "en")]
     self.assertEqual([u'Hello Peter .'.split(), u'Go for gold .'.split()],
                      d)

Example #3

Show file

File: test_input_conversion.py Project: ahurriyetoglu/yalign

 def test_sentence_splitting(self):
     """Check that '!' and '?' terminate sentences.

     The paragraph holds three sentences; the expected result keeps the
     doubled '!!' together as a single token and splits the single '!'
     and '?' off as their own tokens.
     """
     html = ("<html><head></head><body><p>Wow!! "
             "I did not know! Are you sure?</p></body></html>")
     d = [list(xs) for xs in html_to_document(html, "en")]
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual([u'Wow !!'.split(),
                       u'I did not know !'.split(),
                       u'Are you sure ?'.split()], d)

Example #4

Show file

File: test_input_conversion.py Project: ahurriyetoglu/yalign

 def test_generates_something(self):
     """Smoke-test html_to_document on the index.html fixture.

     The resulting document must contain more than one sentence, every
     sentence must be a Sentence instance, and every word a unicode
     string.
     """
     # Read the fixture inside a with-block so the file handle is closed
     # deterministically instead of being left to the garbage collector.
     with open(os.path.join(data_path, "index.html")) as html_file:
         text = html_file.read()
     document = html_to_document(text, "en")
     self.assertGreater(len(document), 1)
     for sentence in document:
         self.assertIsInstance(sentence, Sentence)
         for word in sentence:
             # NOTE(review): `unicode` is a Python 2 builtin; confirm the
             # target interpreter (or a compat alias) before porting.
             self.assertIsInstance(word, unicode)

Example #5

Show file

File: test_input_conversion.py Project: wannaphong/yalign

 def test_sentence_splitting(self):
     """Verify that a paragraph is split on '!!', '!' and '?'.

     Three sentences are expected, each as a list of tokens whose last
     element is the terminating punctuation ('!!', '!' and '?').
     """
     html = ("<html><head></head><body><p>Wow!! "
             "I did not know! Are you sure?</p></body></html>")
     d = [list(xs) for xs in html_to_document(html, "en")]
     expected = [
         u'Wow !!'.split(),
         u'I did not know !'.split(),
         u'Are you sure ?'.split(),
     ]
     # assertEquals is deprecated and gone in Python 3.12; use
     # assertEqual.
     self.assertEqual(expected, d)

Example #6

Show file

File: test_input_conversion.py Project: wannaphong/yalign

 def test_generates_something(self):
     """Check the shape of the document built from index.html.

     Asserts that the document has more than one sentence, that each
     sentence is a Sentence, and that each word in it is a unicode
     string.
     """
     fixture = os.path.join(data_path, "index.html")
     # The context manager closes the fixture file as soon as it has been
     # read; the bare open(...).read() form leaked the handle.
     with open(fixture) as handle:
         text = handle.read()
     document = html_to_document(text, "en")
     self.assertGreater(len(document), 1)
     for sentence in document:
         self.assertIsInstance(sentence, Sentence)
         for word in sentence:
             # NOTE(review): `unicode` only exists on Python 2 -- verify
             # how this name is provided if the suite runs on Python 3.
             self.assertIsInstance(word, unicode)

Example #7

Show file

File: test_input_conversion.py Project: ahurriyetoglu/yalign

 def test_remove_whitespacing(self):
     """Check that newlines and tabs inside a paragraph are discarded.

     The input mixes '\\n' and '\\t' between and after the words; the
     expected output is a single sentence containing only the word
     tokens and the final '?'.
     """
     html = ("<html><head></head><body><p>Wow\n\tWhat now?\t\t"
             "</p></body></html>")
     d = [list(xs) for xs in html_to_document(html, "en")]
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual([u'Wow What now ?'.split()], d)

Example #8

Show file

File: test_input_conversion.py Project: ahurriyetoglu/yalign

 def test_newlines(self):
     """Check that newlines around tags and sentences are ignored.

     Newlines appear between the HTML tags and between the two sentences
     of the paragraph; the result must still be exactly two tokenized
     sentences, each ending in a '.' token.
     """
     html = ("<html><head></head>\n\n<body><p>\nHello Peter."
             "\n\n\n Go for gold.\n</p>\n</body></html>")
     d = [list(xs) for xs in html_to_document(html, "en")]
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual([u'Hello Peter .'.split(), u'Go for gold .'.split()],
                      d)

Example #9

Show file

File: test_input_conversion.py Project: wannaphong/yalign

 def test_remove_whitespacing(self):
     """Verify that tab and newline characters produce no tokens.

     A paragraph with embedded '\\n' and '\\t' characters must come back
     as one sentence made of the tokens 'Wow', 'What', 'now' and '?'.
     """
     html = ("<html><head></head><body><p>Wow\n\tWhat now?\t\t"
             "</p></body></html>")
     d = [list(xs) for xs in html_to_document(html, "en")]
     expected = [u'Wow What now ?'.split()]
     # assertEquals is deprecated and gone in Python 3.12; use
     # assertEqual.
     self.assertEqual(expected, d)

Example #10

Show file

File: test_input_conversion.py Project: wannaphong/yalign

 def test_newlines(self):
     """Verify sentence extraction from HTML that contains newlines.

     The markup has newlines between tags and several blank lines
     between the two sentences; two token lists are expected, one per
     sentence, with the period as a separate trailing token.
     """
     html = ("<html><head></head>\n\n<body><p>\nHello Peter."
             "\n\n\n Go for gold.\n</p>\n</body></html>")
     d = [list(xs) for xs in html_to_document(html, "en")]
     expected = [u'Hello Peter .'.split(), u'Go for gold .'.split()]
     # assertEquals is deprecated and gone in Python 3.12; use
     # assertEqual.
     self.assertEqual(expected, d)