Python HtmlFullText Examples, html2vect.string.attrib_text.HtmlFullText Python Examples

Example #1

0

Show file

File: test_words.py Project: dpritsos/html2vectors

 def setUp(self):
     """Prepare the fixtures used by the word term-frequency tests.

     Sets up a case-preserving and a lowercasing ``words.Html2TF``
     extractor (both given a ``(terms, freq)`` record dtype), an
     ``HtmlFullText`` and a ``String2WordList`` helper, an inline HTML
     sample together with the record arrays expected from it, and the
     locations of the on-disk test data.
     """
     def record_dtype():
         # Hand every consumer its own (terms, freq) dtype instance.
         return np.dtype([('terms', 'S128'), ('freq', 'float32')])

     self.html2tf = words.Html2TF(lowercase=False, valid_html=True, ndtype=record_dtype())
     self.html2tf_lowercase = words.Html2TF(lowercase=True, valid_html=True, ndtype=record_dtype())
     self.htmltext = HtmlFullText(valid_html=True)
     self.str2wl = String2WordList()

     # NOTE: the backslash continuations sit inside the string literal, so the
     # indentation of the following lines is part of the sample text.
     self.html_sample = "<html> \
                         <head> \
                         </head> \
                         <body>\
                          <p>This is a unit test (IT IS!) for <b>html2tfd.charngrams.BaseString2TF.</b> class, @package/module html2vectors</p>\
                         </body>\
                        </html>"

     # Every expected term carries a frequency of 1.0.
     case_sensitive_terms = ['(', ')', ',', '.', '@',
                             'IS!', 'IT', 'This', 'a', 'class',
                             'for', 'html2tfd.charngrams.BaseString2TF', 'html2vectors',
                             'is', 'package/module', 'test', 'unit']
     self.expected_words_arr = np.array([(term, 1.0) for term in case_sensitive_terms],
                                        record_dtype())

     lowercase_terms = ['(', ')', ',', '.', '@',
                        'a', 'class', 'for', 'html2tfd.charngrams.basestring2tf',
                        'html2vectors', 'is', 'is!', 'it', 'package/module',
                        'test', 'this', 'unit']
     self.expected_words_freq_arr_lowercase = np.array([(term, 1.0) for term in lowercase_terms],
                                                       record_dtype())

     # On-disk test data, relative to the directory the tests are run from.
     self.tables_filename = "../../../unit_test_data/hd5files/CorpusTable.h5"
     self.pathto_htmls = "../../../unit_test_data/html/"
     self.xhtml_file_l = ["../../../unit_test_data/html/test_01.html"]

Example #2

0

Show file

File: test_cngrams.py Project: dpritsos/html2vectors

 def setUp(self):
     """Prepare the fixtures used by the character 3-gram tests.

     Sets up a case-preserving and a lowercasing ``cngrams.Html2TF``
     extractor for n-grams of size ``self.n``, an ``HtmlFullText`` helper,
     an inline HTML sample with the 3-gram frequency dictionaries expected
     from it, and the locations of the on-disk test data.
     """
     self.n = 3
     self.html2tf = cngrams.Html2TF(self.n, lowercase=False, valid_html=True)
     self.html2tf_lowercase = cngrams.Html2TF(self.n, lowercase=True, valid_html=True)
     self.htmltext = HtmlFullText(valid_html=True)

     # NOTE: the backslash continuations sit inside the string literal, so the
     # indentation of the following lines is part of the sample text.
     self.html_sample = "<html> \
                         <head> \
                         </head> \
                         <body>\
                          <p>This is a unit test for <b>html2tfd.charngrams.BaseString2TF</b> class for html2vectors package/module</p>\
                         </body>\
                        </html>"

     # Expected 3-gram -> count mapping for the case-preserving extractor.
     # NOTICE: the final 'le ' entry is an extra 3-gram compared to
     # Test_BaseString2NgramList__3grams, because of the HTML clean-up process.
     self.expected_ngrams_freq = {
         's i': 1, 't t': 1, 'ase': 1, 's a': 1, 'htm': 2, 'ram': 1, 'rs ': 1, 'TF ': 1, 's f': 1,
         '.ch': 1, 't f': 1, ' un': 1, '2tf': 1, 'l2t': 1, 'l2v': 1, 's p': 1, 'eSt': 1, 'tes': 1,
         'ge/': 1, 'ams': 1, 'or ': 2, 'cha': 1, 'est': 1, 'st ': 1, 'Str': 1, 'for': 2, 'tor': 1,
         ' is': 1, 'ing': 1, 'cla': 1, 'e/m': 1, 'fd.': 1, 'ml2': 2, 'pac': 1, 'arn': 1, 'ngr': 1,
         'r h': 2, '2TF': 1, 'har': 1, 'is ': 2, 'tml': 2, 'F c': 1, 'ass': 1, 'tri': 1, 'g2T': 1,
         'his': 1, 'kag': 1, 'Bas': 1, '2ve': 1, 'tfd': 1, 'gra': 1, 'rng': 1, 'ors': 1, 'it ': 1,
         'odu': 1, 'mod': 1, ' pa': 1, 'ect': 1, 'ule': 1, 'Thi': 1, 's.B': 1, ' te': 1, '.Ba': 1,
         'nit': 1, 'las': 1, ' a ': 1, 'rin': 1, 'seS': 1, 'cka': 1, ' cl': 1, 'd.c': 1, 'dul': 1,
         'ack': 1, 'age': 1, ' ht': 2, 'ms.': 1, '/mo': 1, 'ng2': 1, 'ss ': 1, 'uni': 1, 'cto': 1,
         'vec': 1, ' fo': 2, 'a u': 1, 'le ': 1}

     # Expected 3-gram -> count mapping for the lowercasing extractor.
     self.expected_ngrams_freq_lowercase = {
         u'le ': 1, u's i': 1, u't t': 1, u's.b': 1, u'f c': 1, u's a': 1, u'htm': 2, u'ram': 1,
         u'rs ': 1, u'tf ': 1, u's f': 1, u'.ch': 1, u't f': 1, u' un': 1, u'2tf': 2, u'.ba': 1,
         u'l2t': 1, u'l2v': 1, u's p': 1, u'ses': 1, u'mod': 1, u'tes': 1, u'ge/': 1, u'ams': 1,
         u'or ': 2, u'cha': 1, u'est': 2, u'st ': 1, u'for': 2, u'tor': 1, u' is': 1, u'ing': 1,
         u'cla': 1, u'e/m': 1, u'fd.': 1, u'ml2': 2, u'pac': 1, u'arn': 1, u'ngr': 1, u'r h': 2,
         u'ule': 1, u'har': 1, u'is ': 2, u'tml': 2, u'ng2': 1, u' cl': 1, u'ass': 1, u'tri': 1,
         u'his': 1, u'kag': 1, u'str': 1, u'2ve': 1, u'tfd': 1, u'gra': 1, u'rng': 1, u'ors': 1,
         u'it ': 1, u'odu': 1, u' pa': 1, u'ect': 1, u'ase': 1, u'dul': 1, u' te': 1, u'nit': 1,
         u'las': 1, u' a ': 1, u'rin': 1, u'g2t': 1, u'cka': 1, u'bas': 1, u'd.c': 1, u'ack': 1,
         u'age': 1, u' ht': 2, u'ms.': 1, u'/mo': 1, u'thi': 1, u'ss ': 1, u'uni': 1, u'cto': 1,
         u'vec': 1, u' fo': 2, u'a u': 1}

     # On-disk test data, relative to the directory the tests are run from.
     self.pathto_htmls = "../../../unit_test_data/html/"
     self.xhtml_file_l = ["../../../unit_test_data/html/test_01.html"]
     self.txt_file_l = ["../../../unit_test_data/txt/test_01.txt"]

Example #3

0

Show file

File: test_words.py Project: dpritsos/html2vectors

 def setUp(self):
     """Prepare the fixtures used by the dictionary-based word tests.

     Sets up a case-preserving and a lowercasing ``words.Html2TF``
     extractor, an ``HtmlFullText`` and a ``String2WordList`` helper, an
     inline HTML sample with the word -> count dictionaries expected from
     it, and the locations of the on-disk test data.
     """
     self.html2tf = words.Html2TF(lowercase=False, valid_html=True)
     self.html2tf_lowercase = words.Html2TF(lowercase=True, valid_html=True)
     self.htmltext = HtmlFullText(valid_html=True)
     self.str2wl = String2WordList()

     # NOTE: the backslash continuations sit inside the string literal, so the
     # indentation of the following lines is part of the sample text.
     self.html_sample = "<html> \
                         <head> \
                         </head> \
                         <body>\
                          <p>This is a unit test (IT IS!) for <b>html2tfd.charngrams.BaseString2TF.</b> class, @package/module html2vectors</p>\
                         </body>\
                        </html>"

     # Every expected term is counted exactly once.
     case_sensitive_terms = [u'a', u'@', u',', u'html2vectors', u'for', u'This', u'(',
                             u'is', u'IT', u'.', u')', u'test', u'package/module', u'IS!',
                             u'class', u'html2tfd.charngrams.BaseString2TF', u'unit']
     self.expected_words = dict.fromkeys(case_sensitive_terms, 1)

     lowercase_terms = [u'a', u'@', u',', u'html2vectors', u'for', u'this', u'(',
                        u'is', u'it', u'.', u')', u'test', u'package/module', u'is!',
                        u'class', u'html2tfd.charngrams.basestring2tf', u'unit']
     self.expected_words_lowercase = dict.fromkeys(lowercase_terms, 1)

     # On-disk test data, relative to the directory the tests are run from.
     self.pathto_htmls = "../../../unit_test_data/html/"
     self.xhtml_file_l = ["../../../unit_test_data/html/test_01.html"]

Example #4

0

Show file

File: test_words.py Project: dpritsos/html2vectors

class Test_Html2TF(unittest.TestCase):
    """Tests for ``words.Html2TF`` returning word -> count dictionaries."""

    def setUp(self):
        """Build the extractors, helpers, HTML sample and expected results."""
        self.html2tf = words.Html2TF(lowercase=False, valid_html=True)
        self.html2tf_lowercase = words.Html2TF(lowercase=True, valid_html=True)
        self.htmltext = HtmlFullText(valid_html=True)
        self.str2wl = String2WordList()

        # NOTE: the backslash continuations sit inside the string literal, so
        # the indentation of the following lines is part of the sample text.
        self.html_sample = "<html> \
                            <head> \
                            </head> \
                            <body>\
                             <p>This is a unit test (IT IS!) for <b>html2tfd.charngrams.BaseString2TF.</b> class, @package/module html2vectors</p>\
                            </body>\
                           </html>"

        # Every expected term is counted exactly once.
        case_sensitive_terms = [u'a', u'@', u',', u'html2vectors', u'for', u'This', u'(',
                                u'is', u'IT', u'.', u')', u'test', u'package/module', u'IS!',
                                u'class', u'html2tfd.charngrams.BaseString2TF', u'unit']
        self.expected_words = dict.fromkeys(case_sensitive_terms, 1)

        lowercase_terms = [u'a', u'@', u',', u'html2vectors', u'for', u'this', u'(',
                           u'is', u'it', u'.', u')', u'test', u'package/module', u'is!',
                           u'class', u'html2tfd.charngrams.basestring2tf', u'unit']
        self.expected_words_lowercase = dict.fromkeys(lowercase_terms, 1)

        # On-disk test data, relative to the directory the tests are run from.
        self.pathto_htmls = "../../../unit_test_data/html/"
        self.xhtml_file_l = ["../../../unit_test_data/html/test_01.html"]

    def test_html2tf_from_src(self):
        """The case-preserving extractor yields the expected dictionary."""
        term_freqs = self.html2tf.from_src(self.html_sample)
        self.assertEqual(term_freqs, self.expected_words)

    def test_html2tf_from_src_lowercase(self):
        """The lowercasing extractor yields the expected dictionary."""
        term_freqs = self.html2tf_lowercase.from_src(self.html_sample)
        self.assertEqual(term_freqs, self.expected_words_lowercase)

    def test_html2tf_from_files(self):
        """Frequencies from a file add up to the number of its word tokens."""
        html_text = self.htmltext.from_files(self.xhtml_file_l, encoding='utf8', error_handling='strict')
        words_lst = self.str2wl.terms_lst(html_text[0])
        html_words = self.html2tf.from_files(self.xhtml_file_l, encoding='utf8', error_handling='strict')

        # Expected total: the length of the word list built from the text.
        tf_num_expected = len(words_lst)

        # Actual total: the sum of all frequencies in the returned mapping.
        tf_num_real = sum(float(tf) for tf in html_words[0].values())

        self.assertEqual(tf_num_real, tf_num_expected)

    def test_html2tf_from_paths(self):
        """Path-based extraction agrees on the term total and the filename."""
        html_text = self.htmltext.from_paths(None, self.pathto_htmls, encoding='utf8', error_handling='strict')
        # Each result entry is indexed as [0] = filename, [1] = payload below.
        words_lst = self.str2wl.terms_lst(html_text[0][1])
        html_words = self.html2tf.from_paths(None, self.pathto_htmls, encoding='utf8', error_handling='strict')

        # Expected total: the length of the word list built from the text.
        tf_num_expected = len(words_lst)

        # Actual total: the sum of all frequencies in the returned mapping.
        tf_num_real = sum(float(tf) for tf in html_words[0][1].values())

        # The amount of terms found must match.
        self.assertEqual(tf_num_real, tf_num_expected)

        # The file (i.e. filename) found in the path must match.
        self.assertEqual(html_words[0][0], self.xhtml_file_l[0])

Example #5

0

Show file

File: test_cngrams.py Project: dpritsos/html2vectors

class Test_Html2TF__3grams(unittest.TestCase):
    """Tests for ``cngrams.Html2TF`` configured for character 3-grams."""

    def setUp(self):
        """Build the extractors, helper, HTML sample and expected results."""
        self.n = 3
        self.html2tf = cngrams.Html2TF(self.n, lowercase=False, valid_html=True)
        self.html2tf_lowercase = cngrams.Html2TF(self.n, lowercase=True, valid_html=True)
        self.htmltext = HtmlFullText(valid_html=True)

        # NOTE: the backslash continuations sit inside the string literal, so
        # the indentation of the following lines is part of the sample text.
        self.html_sample = "<html> \
                            <head> \
                            </head> \
                            <body>\
                             <p>This is a unit test for <b>html2tfd.charngrams.BaseString2TF</b> class for html2vectors package/module</p>\
                            </body>\
                           </html>"

        # Expected 3-gram -> count mapping for the case-preserving extractor.
        # NOTICE: the final 'le ' entry is an extra 3-gram compared to
        # Test_BaseString2NgramList__3grams, because of the HTML clean-up process.
        self.expected_ngrams_freq = {
            's i': 1, 't t': 1, 'ase': 1, 's a': 1, 'htm': 2, 'ram': 1, 'rs ': 1, 'TF ': 1, 's f': 1,
            '.ch': 1, 't f': 1, ' un': 1, '2tf': 1, 'l2t': 1, 'l2v': 1, 's p': 1, 'eSt': 1, 'tes': 1,
            'ge/': 1, 'ams': 1, 'or ': 2, 'cha': 1, 'est': 1, 'st ': 1, 'Str': 1, 'for': 2, 'tor': 1,
            ' is': 1, 'ing': 1, 'cla': 1, 'e/m': 1, 'fd.': 1, 'ml2': 2, 'pac': 1, 'arn': 1, 'ngr': 1,
            'r h': 2, '2TF': 1, 'har': 1, 'is ': 2, 'tml': 2, 'F c': 1, 'ass': 1, 'tri': 1, 'g2T': 1,
            'his': 1, 'kag': 1, 'Bas': 1, '2ve': 1, 'tfd': 1, 'gra': 1, 'rng': 1, 'ors': 1, 'it ': 1,
            'odu': 1, 'mod': 1, ' pa': 1, 'ect': 1, 'ule': 1, 'Thi': 1, 's.B': 1, ' te': 1, '.Ba': 1,
            'nit': 1, 'las': 1, ' a ': 1, 'rin': 1, 'seS': 1, 'cka': 1, ' cl': 1, 'd.c': 1, 'dul': 1,
            'ack': 1, 'age': 1, ' ht': 2, 'ms.': 1, '/mo': 1, 'ng2': 1, 'ss ': 1, 'uni': 1, 'cto': 1,
            'vec': 1, ' fo': 2, 'a u': 1, 'le ': 1}

        # Expected 3-gram -> count mapping for the lowercasing extractor.
        self.expected_ngrams_freq_lowercase = {
            u'le ': 1, u's i': 1, u't t': 1, u's.b': 1, u'f c': 1, u's a': 1, u'htm': 2, u'ram': 1,
            u'rs ': 1, u'tf ': 1, u's f': 1, u'.ch': 1, u't f': 1, u' un': 1, u'2tf': 2, u'.ba': 1,
            u'l2t': 1, u'l2v': 1, u's p': 1, u'ses': 1, u'mod': 1, u'tes': 1, u'ge/': 1, u'ams': 1,
            u'or ': 2, u'cha': 1, u'est': 2, u'st ': 1, u'for': 2, u'tor': 1, u' is': 1, u'ing': 1,
            u'cla': 1, u'e/m': 1, u'fd.': 1, u'ml2': 2, u'pac': 1, u'arn': 1, u'ngr': 1, u'r h': 2,
            u'ule': 1, u'har': 1, u'is ': 2, u'tml': 2, u'ng2': 1, u' cl': 1, u'ass': 1, u'tri': 1,
            u'his': 1, u'kag': 1, u'str': 1, u'2ve': 1, u'tfd': 1, u'gra': 1, u'rng': 1, u'ors': 1,
            u'it ': 1, u'odu': 1, u' pa': 1, u'ect': 1, u'ase': 1, u'dul': 1, u' te': 1, u'nit': 1,
            u'las': 1, u' a ': 1, u'rin': 1, u'g2t': 1, u'cka': 1, u'bas': 1, u'd.c': 1, u'ack': 1,
            u'age': 1, u' ht': 2, u'ms.': 1, u'/mo': 1, u'thi': 1, u'ss ': 1, u'uni': 1, u'cto': 1,
            u'vec': 1, u' fo': 2, u'a u': 1}

        # On-disk test data, relative to the directory the tests are run from.
        self.pathto_htmls = "../../../unit_test_data/html/"
        self.xhtml_file_l = ["../../../unit_test_data/html/test_01.html"]
        self.txt_file_l = ["../../../unit_test_data/txt/test_01.txt"]

    def test_html2tf_from_src(self):
        """The case-preserving extractor yields the expected 3-gram counts."""
        ngram_freqs = self.html2tf.from_src(self.html_sample)
        self.assertEqual(ngram_freqs, self.expected_ngrams_freq)

    def test_html2tf_from_src_lowercase(self):
        """The lowercasing extractor yields the expected 3-gram counts."""
        ngram_freqs = self.html2tf_lowercase.from_src(self.html_sample)
        self.assertEqual(ngram_freqs, self.expected_ngrams_freq_lowercase)

    def test_html2tf_from_files(self):
        """Frequencies from a file add up to ``len(text) - n + 1``."""
        html_text = self.htmltext.from_files(self.xhtml_file_l, encoding='utf8', error_handling='strict')
        html_ngrams = self.html2tf.from_files(self.xhtml_file_l, encoding='utf8', error_handling='strict')

        # A string of length L contains L - n + 1 overlapping n-grams.
        ng_num_expected = len(html_text[0]) - self.n + 1
        ng_num_real = sum(float(nf) for nf in html_ngrams[0].values())
        self.assertEqual(ng_num_real, ng_num_expected)

    def test_html2tf_from_paths(self):
        """Path-based extraction yields ``len(text) - n + 1`` n-grams in total."""
        html_text_l = self.htmltext.from_paths(None, self.pathto_htmls, encoding='utf8', error_handling='strict')
        html_ngrams_l = self.html2tf.from_paths(None, self.pathto_htmls, encoding='utf8', error_handling='strict')

        # ng_num_expected: the expected number of n-grams given the length of
        # the text string stored at index [1] of the first result entry.
        ng_num_expected = len(html_text_l[0][1]) - self.n + 1
        ng_num_real = sum(float(nf) for nf in html_ngrams_l[0][1].values())
        self.assertEqual(ng_num_real, ng_num_expected)

Example #6

0

Show file

File: test_words.py Project: dpritsos/html2vectors

class Test_Html2TF(unittest.TestCase):
    """Tests for ``words.Html2TF`` writing term-frequency arrays into HDF5.

    Every test creates the HDF5 file afresh (mode ``'w'``) and closes it in a
    ``finally`` clause, so a failing assertion cannot leave the file open for
    the next test, which reopens the very same path.
    """

    def setUp(self):
        """Build the extractors, helpers, HTML sample and expected results."""
        self.html2tf = words.Html2TF(lowercase=False, valid_html=True, ndtype=np.dtype([('terms', 'S128'), ('freq', 'float32')]))
        self.html2tf_lowercase = words.Html2TF(lowercase=True, valid_html=True, ndtype=np.dtype([('terms', 'S128'), ('freq', 'float32')]))
        self.htmltext = HtmlFullText(valid_html=True)
        self.str2wl = String2WordList()

        # NOTE: the backslash continuations sit inside the string literal, so
        # the indentation of the following lines is part of the sample text.
        self.html_sample = "<html> \
                            <head> \
                            </head> \
                            <body>\
                             <p>This is a unit test (IT IS!) for <b>html2tfd.charngrams.BaseString2TF.</b> class, @package/module html2vectors</p>\
                            </body>\
                           </html>"

        # Expected (term, frequency) records for the case-preserving extractor.
        self.expected_words_arr = np.array(
            [('(', 1.0), (')', 1.0), (',', 1.0), ('.', 1.0), ('@', 1.0),
             ('IS!', 1.0), ('IT', 1.0), ('This', 1.0), ('a', 1.0), ('class', 1.0),
             ('for', 1.0), ('html2tfd.charngrams.BaseString2TF', 1.0), ('html2vectors', 1.0),
             ('is', 1.0), ('package/module', 1.0), ('test', 1.0), ('unit', 1.0)],
            np.dtype([('terms', 'S128'), ('freq', 'float32')]))

        # Expected (term, frequency) records for the lowercasing extractor.
        self.expected_words_freq_arr_lowercase = np.array(
            [('(', 1.0), (')', 1.0), (',', 1.0), ('.', 1.0), ('@', 1.0),
             ('a', 1.0), ('class', 1.0), ('for', 1.0), ('html2tfd.charngrams.basestring2tf', 1.0),
             ('html2vectors', 1.0), ('is', 1.0), ('is!', 1.0), ('it', 1.0), ('package/module', 1.0),
             ('test', 1.0), ('this', 1.0), ('unit', 1.0)],
            np.dtype([('terms', 'S128'), ('freq', 'float32')]))

        # On-disk test data, relative to the directory the tests are run from.
        self.tables_filename = "../../../unit_test_data/hd5files/CorpusTable.h5"
        self.pathto_htmls = "../../../unit_test_data/html/"
        self.xhtml_file_l = ["../../../unit_test_data/html/test_01.html"]

    def _open_h5_testgroup(self):
        """Create the HDF5 file afresh and return ``(h5file, group)``.

        NOTE: this runs inside each test (i.e. on the fly) rather than in
        setUp(), which is called again for every test_ method of this class.
        The caller is responsible for closing the returned file.
        """
        h5file = tb.openFile(self.tables_filename, 'w')
        try:
            group_h5 = h5file.createGroup(h5file.root, "testgroup")
        except Exception:
            # Do not leak the handle when the group cannot be created.
            h5file.close()
            raise
        return h5file, group_h5

    def _assert_rows_equal(self, rows, expected_rows):
        """Assert both row sequences have equal length and equal rows."""
        # zip() alone stops at the shorter sequence, so a truncated (or even
        # empty) result would pass unnoticed without this length check.
        self.assertEqual(len(rows), len(expected_rows))
        for val, exp_val in zip(rows, expected_rows):
            self.assertEqual(val, exp_val)

    def test_html2tf_from_src(self):
        """The case-preserving extractor stores the expected records."""
        h5file, group_h5 = self._open_h5_testgroup()
        try:
            words_arr = self.html2tf.from_src(h5file, group_h5, self.html_sample, tbname="tbarray1")
            self._assert_rows_equal(words_arr.read(), self.expected_words_arr)
        finally:
            # Close the file for the next test even when an assertion fails.
            h5file.close()

    def test_html2tf_from_src_lowercase(self):
        """The lowercasing extractor stores the expected records."""
        h5file, group_h5 = self._open_h5_testgroup()
        try:
            words_arr = self.html2tf_lowercase.from_src(h5file, group_h5, self.html_sample, tbname="tbarray1")
            self._assert_rows_equal(words_arr.read(), self.expected_words_freq_arr_lowercase)
        finally:
            # Close the file for the next test even when an assertion fails.
            h5file.close()

    def test_html2tf_from_files(self):
        """Stored frequencies and node attributes match the source file."""
        h5file, group_h5 = self._open_h5_testgroup()
        try:
            html_text = self.htmltext.from_files(self.xhtml_file_l, encoding='utf8', error_handling='strict')
            words_lst = self.str2wl.terms_lst(html_text[0])
            words_arrz_group = self.html2tf.from_files(h5file, group_h5, self.xhtml_file_l, encoding='utf8', error_handling='strict')

            # Expected total: the length of the word list built from the text.
            ng_num_expected = len(words_lst)

            # Actual total: the sum of the 'freq' column of the stored node.
            test_table = h5file.getNode(words_arrz_group, 'test_01_html')
            ng_num_real = np.sum(test_table.read()['freq'])

            self.assertEqual(ng_num_real, ng_num_expected)
            self.assertEqual(test_table._v_attrs.filepath, self.xhtml_file_l[0])
            self.assertEqual(test_table._v_attrs.terms_num, ng_num_expected)
        finally:
            # Close the file for the next test even when an assertion fails.
            h5file.close()

    def test_html2tf_from_paths(self):
        """Path-based extraction stores the filename list and the term total."""
        h5file, group_h5 = self._open_h5_testgroup()
        try:
            html_text = self.htmltext.from_files(self.xhtml_file_l, encoding='utf8', error_handling='strict')
            words_lst = self.str2wl.terms_lst(html_text[0])
            words_arrz_group = self.html2tf_lowercase.from_paths(h5file, group_h5, 'GenrePageListTable', None,
                                                                 self.pathto_htmls, encoding='utf8', error_handling='strict')

            # Assert on the filename list returned.
            # BE AWARE of the structure: the result is indexed, with [1] read
            # here for its 'filename' column and [0] used below as the parent
            # of the per-file node.
            self.assertEqual(words_arrz_group[1].read()['filename'], self.xhtml_file_l)

            # Assert on the amount of terms created.
            ng_num_expected = len(words_lst)
            test_table = h5file.getNode(words_arrz_group[0], 'test_01_html')
            ng_num_real = np.sum(test_table.read()['freq'])
            self.assertEqual(ng_num_real, ng_num_expected)
        finally:
            # Close the file for the next test even when an assertion fails.
            h5file.close()