Python HtmlRequirementProcessor Examples

Programming Language: Python

Namespace/Package Name: nltkAnalyzer

Examples at hotexamples.com: 3

Python HtmlRequirementProcessor – 3 examples found. These are the top-rated real-world Python examples of nltkAnalyzer.HtmlRequirementProcessor, extracted from open-source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

assign_temp_content_to_temp_files(2)

assign_temp_files_html_content(2)

create_categorized_corpus(2)

create_temp_files_named_by_wordtypes(2)

delete_temporary_files(2)

lemmatize_text_as_list(2)

list_raw_html_file(2)

list_wordtypes_from_html_list(2)

open_html_file(2)

remove_categories_directory(2)

split_html_entities(2)

split_stopwords(2)

plot_html_results(1)

Example #1

Show file

File: nltkAnalyzer_spec.py Project: EduardoCarvalho/nltkAnalyzer

class TestHtmlRequirementProcessor(TestCase):
    # Specs for HtmlRequirementProcessor. Each ``it_*`` method feeds fixture
    # values from ``Variables`` into one processor method and checks the
    # result with the ``|should| equal_to(...)`` matcher.

    def setUp(self):
        # Fresh fixture data and a fresh processor for every spec.
        fixtures = Variables()
        self.v = fixtures
        self.hrp = HtmlRequirementProcessor(fixtures.url, fixtures.n_cat)

    def it_receives_a_url(self):
        # The constructor's first argument is exposed as ``html_file``.
        stored_url = self.hrp.html_file
        stored_url |should| equal_to(self.v.url)

    def it_opens_and_cleans_html_file(self):
        cleaned_text = self.hrp.open_html_file(self.v.url)
        cleaned_text |should| equal_to(self.v.raw_text_and_space_spec)

    def it_splits_html_character_entities(self):
        entity_free_text = self.hrp.split_html_entities(
            self.v.raw_text_and_space_spec)
        entity_free_text |should| equal_to(
            self.v.raw_text_free_from_html_entity_spec)

    def it_lists_raw_html_file(self):
        # The method is expected to return a (text list, stopword list) pair.
        listed = self.hrp.list_raw_html_file(
            self.v.raw_text_free_from_html_entity_spec,
            self.v.concise_stopwords)
        listed |should| equal_to(
            (self.v.text_no_punct_list_spec, self.v.concise_stopwords_list))

    def it_splits_stopwords_from_html(self):
        without_stopwords = self.hrp.split_stopwords(
            self.v.text_no_punct_list_spec,
            self.v.concise_stopwords_list)
        without_stopwords |should| equal_to(
            self.v.text_alpha_no_punct_stopword_list_spec)

    def it_lists_lemmatized_verb_noun_and_adjective(self):
        lemmatized = self.hrp.lemmatize_text_as_list(
            self.v.text_alpha_no_punct_stopword_list_spec)
        lemmatized |should| equal_to(self.v.lemmatized_html_list)

    def it_creates_temporary_directory(self):
        # ``create_temp_directory`` is read as an attribute (no call); the
        # element at index 1 is the flag checked here.
        directory_flag = self.hrp.create_temp_directory[1]
        directory_flag |should| equal_to(True)

    def it_creates_wordtypes_from_html_lemmatized_list_of_words(self):
        wordtypes = self.hrp.list_wordtypes_from_html_list(
            self.v.lemmatized_html_list, self.v.n_cat)
        wordtypes |should| equal_to(self.v.wordtypes_html_list)

    def it_creates_temporary_files_named_by_wordtypes_inside_temporary_directory(self):
        # Only the second element of the returned sequence is checked.
        file_flag = self.hrp.create_temp_files_named_by_wordtypes(
            self.v.wordtypes_html_list, self.v.temporary_directory)[1]
        file_flag |should| equal_to(False)

    def it_assigns_temporary_file_content_by_wordtype_to_a_list(self):
        content_flag = self.hrp.assign_temp_files_html_content(
            self.v.wordtypes_html_list, self.v.lemmatized_html_list)[1]
        content_flag |should| equal_to(True)

    def it_assigns_temporary_content_to_temporary_files(self):
        # First element of the creation result is passed on as the file list.
        temp_files = self.hrp.create_temp_files_named_by_wordtypes(
            self.v.wordtypes_html_list, self.v.temporary_directory)[0]
        written_flag = self.hrp.assign_temp_content_to_temp_files(
            self.v.categories_html_content, temp_files)[1]
        written_flag |should| equal_to(True)

    def it_creates_categorized_plaintextcorpusreader(self):
        corpus_flag = self.hrp.create_categorized_corpus(
            self.hrp.create_temp_directory[0])[1]
        corpus_flag |should| equal_to(True)

    def it_deletes_all_wordtypes_temporary_files(self):
        temp_files = self.hrp.create_temp_files_named_by_wordtypes(
            self.v.wordtypes_html_list, self.v.temporary_directory)[0]
        deleted = self.hrp.delete_temporary_files(temp_files)
        deleted |should| equal_to(True)

    def it_checks_temporary_directory_was_removed_from_tmp_folder(self):
        # ``create_temp_directory`` is deliberately read twice, exactly as in
        # the original spec, since each access may do work of its own.
        directory = self.hrp.create_temp_directory[0]
        folder_id = self.hrp.create_temp_directory[3]
        removed = self.hrp.remove_categories_directory(
            directory, self.v.temporary_directory, folder_id)
        # NOTE(review): the original line carried a trailing "# True" remark,
        # presumably the value expected once removal succeeds -- confirm.
        removed |should| equal_to(False)

Example #2

Show file

File: nltkAnalyzer_spec.py Project: EduardoCarvalho/nltkAnalyzer

 def setUp(self):
     # Build the shared fixture object first, then hand its ``url`` and
     # ``n_cat`` values to the processor under test.
     fixtures = Variables()
     self.v = fixtures
     self.hrp = HtmlRequirementProcessor(fixtures.url, fixtures.n_cat)

Example #3

Show file

File: runalyzer.py Project: EduardoCarvalho/nltkAnalyzer

def html_analyzer(html_file, number_of_cat):
    """Run the full HtmlRequirementProcessor pipeline for one HTML source.

    The steps are executed in the same order as before: open and clean the
    HTML, split entities, list the text against ``STOPWORDS``, drop the
    stopwords, lemmatize, create the temporary directory and per-wordtype
    files, fill them, build the categorized corpus, then tabulate and plot.

    The temporary files and the temporary directory are now released in
    ``finally`` clauses, so they are cleaned up even when a later step
    (corpus creation, tabulation, plotting, ...) raises.

    Args:
        html_file: Passed to ``HtmlRequirementProcessor`` and to
            ``open_html_file``.
        number_of_cat: Passed to ``HtmlRequirementProcessor`` and to the
            wordtype listing, tabulation and plotting steps.

    Returns:
        None. The function is used for its side effects only.
    """
    hrp = HtmlRequirementProcessor(html_file, number_of_cat)

    # --- text preparation -------------------------------------------------
    raw_text_and_space = hrp.open_html_file(html_file)
    raw_text_free_from_html_entity = hrp.split_html_entities(raw_text_and_space)
    text_no_punct_list, stopwords_list = hrp.list_raw_html_file(
        raw_text_free_from_html_entity, STOPWORDS)
    text_alpha_no_punct_stopword_list = hrp.split_stopwords(
        text_no_punct_list, stopwords_list)
    lemmatized_list_by_verb_noun_adj_adv = hrp.lemmatize_text_as_list(
        text_alpha_no_punct_stopword_list)

    # ``create_temp_directory`` is read as an attribute (not called) and
    # yields four values; the second one is a status flag that this function
    # never used, so it is discarded here.
    categories_directory, _, tmp_root, tmp_folderid = hrp.create_temp_directory
    try:
        wordtype_categories = hrp.list_wordtypes_from_html_list(
            lemmatized_list_by_verb_noun_adj_adv, number_of_cat)
        # The second element of each pair below is a status flag that was
        # previously stored in an unused local; it is dropped.
        category_tmp_file_list, _ = hrp.create_temp_files_named_by_wordtypes(
            wordtype_categories, categories_directory)
        try:
            category_tmp_file_content, _ = hrp.assign_temp_files_html_content(
                wordtype_categories, lemmatized_list_by_verb_noun_adj_adv)
            category_tmp_file_list, _ = hrp.assign_temp_content_to_temp_files(
                category_tmp_file_content, category_tmp_file_list)
            reader, _ = hrp.create_categorized_corpus(categories_directory)
            hrp.tabulate_categorized_words(reader, number_of_cat)
            hrp.plot_html_results(lemmatized_list_by_verb_noun_adj_adv,
                                  number_of_cat)
        finally:
            # Files first, then the directory -- same order as the original
            # success path.
            hrp.delete_temporary_files(category_tmp_file_list)
    finally:
        hrp.remove_categories_directory(categories_directory,
                                        tmp_root,
                                        tmp_folderid)