/
runalyzer.py
82 lines (78 loc) · 4.27 KB
/
runalyzer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from nltkAnalyzer import TxtRequirementProcessor, HtmlRequirementProcessor
from stopwords import STOPWORDS
def txt_analyzer(txt_file, number_of_cat):
trp = TxtRequirementProcessor(txt_file, number_of_cat)
corpus_root, fileid = trp.split_path(txt_file)
raw_text = trp.search_fileid(corpus_root, fileid)
text_no_punct_list, \
stopwords_list = trp.list_raw_txt_file(raw_text,
STOPWORDS)
text_alpha_no_punct_stopword_list = \
trp.split_stopwords(text_no_punct_list,
stopwords_list)
lemmatized_list_by_verb_noun_adj_adv = \
trp.lemmatize_text_as_list(text_alpha_no_punct_stopword_list)
categories_directory, \
boolean_for_directory_test, \
tmp_root, tmp_folderid = trp.create_temp_directory
wordtype_categories = \
trp.list_wordtypes_from_lemmatized_list(lemmatized_list_by_verb_noun_adj_adv,
number_of_cat)
category_tmp_file_list, \
boolean_for_file_test = \
trp.create_temp_files_named_by_wordtypes(wordtype_categories,
categories_directory)
category_tmp_file_content, \
boolean_for_content_test = \
trp.assign_temp_files_txt_content(wordtype_categories,
lemmatized_list_by_verb_noun_adj_adv)
category_tmp_file_list, \
boolean_for_check_content_file_test = \
trp.assign_temp_content_to_temp_files(category_tmp_file_content,
category_tmp_file_list)
reader, \
boolean_for_categories_test = trp.create_categorized_corpus(categories_directory)
trp.tabulate_categorized_words(reader, number_of_cat)
trp.plot_txt_results(lemmatized_list_by_verb_noun_adj_adv,
number_of_cat)
boolean_for_file_test = trp.delete_temporary_files(category_tmp_file_list)
boolean_for_directory_test = trp.remove_categories_directory(categories_directory,
tmp_root,
tmp_folderid)
def html_analyzer(html_file, number_of_cat):
hrp = HtmlRequirementProcessor(html_file, number_of_cat)
raw_text_and_space = hrp.open_html_file(html_file)
raw_text_free_from_html_entity = hrp.split_html_entities(raw_text_and_space)
text_no_punct_list, \
stopwords_list = hrp.list_raw_html_file(raw_text_free_from_html_entity,
STOPWORDS)
text_alpha_no_punct_stopword_list = hrp.split_stopwords(text_no_punct_list,
stopwords_list)
lemmatized_list_by_verb_noun_adj_adv = \
hrp.lemmatize_text_as_list(text_alpha_no_punct_stopword_list)
categories_directory, \
boolean_for_directory_test, \
tmp_root, tmp_folderid = hrp.create_temp_directory
wordtype_categories = \
hrp.list_wordtypes_from_html_list(lemmatized_list_by_verb_noun_adj_adv,
number_of_cat)
category_tmp_file_list, \
boolean_for_file_test = \
hrp.create_temp_files_named_by_wordtypes(wordtype_categories,
categories_directory)
category_tmp_file_content, \
boolean_for_content_test = \
hrp.assign_temp_files_html_content(wordtype_categories,
lemmatized_list_by_verb_noun_adj_adv)
category_tmp_file_list, \
boolean_for_check_content_file_test = \
hrp.assign_temp_content_to_temp_files(category_tmp_file_content,
category_tmp_file_list)
reader, \
boolean_for_categories_test = hrp.create_categorized_corpus(categories_directory)
hrp.tabulate_categorized_words(reader, number_of_cat)
hrp.plot_html_results(lemmatized_list_by_verb_noun_adj_adv, number_of_cat)
boolean_for_file_test = hrp.delete_temporary_files(category_tmp_file_list)
boolean_for_categories_test = hrp.remove_categories_directory(categories_directory,
tmp_root,
tmp_folderid)