Example #1
0
def html():
    """Parse ``files/Test2.html`` and print its stems and links with counts."""
    parser = HTMLParser()
    parser.parse(r'files/Test2.html')

    # Each getter is deliberately called twice -- once for the value and once
    # for its length -- so the parser sees: value, length, then the print.
    stems = parser.get_processed_stems()
    stem_count = len(parser.get_processed_stems())
    print('html parser', stems, stem_count)

    links = parser.get_links()
    link_count = len(parser.get_links())
    print('html parser link result', links, link_count)
Example #2
0
 def test_parseHTMLParser(self):
     """Check the processed stems extracted from ``files/Test.html``."""
     parser = HTMLParser()
     parser.parse('files/Test.html')

     # Expected stems, in the exact order the parser must return them.
     expected_stems = [
         'page', 'margin', '2cm',
         'p', 'margin', '0', '25cm', 'direct', 'ltr', 'color', '00000a',
         'line', 'height', '115', 'text', 'align', 'left',
         'orphan', '2', 'widow', '2',
         'p', 'western', 'font', 'famili', 'liber', 'serif', 'serif',
         'font', 'size', '12pt', 'languag', 'ru', 'ru',
         'p', 'cjk', 'font', 'famili', 'noto', 'san', 'cjk', 'sc', 'regular',
         'font', 'size', '12pt', 'languag', 'zh', 'cn',
         'p', 'ctl', 'font', 'famili', 'lohit', 'devanagari',
         'font', 'size', '12pt', 'languag', 'hi', 'in',
         'link', 'languag', 'zxx',
         'i', 'test', 'poop', 'test', 'anim', 'test', 'anim', 'googl', 'link',
     ]

     assert parser.get_processed_stems() == expected_stems
Example #3
0
def html_test():
    """Parse the hard-coded Test2.html on drive D: and print stems, then links."""
    parser = HTMLParser()
    parser.parse(r'D:\Test2.html')

    stems = parser.get_processed_stems()
    print(stems)

    links = parser.get_links()
    print(links)
Example #4
0
 def test_get_linksHTMLParser(self):
     """Check that ``files/Test.html`` yields the same link tuple twice."""
     parser = HTMLParser()
     parser.parse('files/Test.html')

     # presumably (anchor text, href) -- confirm against HTMLParser.get_links
     google_link = ('google\nlink', 'http://google.com/')
     expected_links = [google_link, google_link]

     assert parser.get_links() == expected_links
Example #5
0
def html(link):
    """Parse ``link`` with a fresh HTMLParser and return its processed stems."""
    parser = HTMLParser()
    parser.parse(link)
    return parser.get_processed_stems()