/
init.py
49 lines (36 loc) · 1.17 KB
/
init.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import gensim
from sklearn import preprocessing
import codecs
import helper_functions as hf
from tasks import Tasks
"""
# get paragraphs and stopwords
paragraphs = hf.get_paragraphs(codecs.open("./text", "r", "utf-8"))
stopwords = hf.get_stop_words(codecs.open("./stopWords.txt", "r", "utf-8"))
# Remove paragraphs with gutenberg
paragraphs = hf.remove("Gutenberg", paragraphs)
# Remove punctuation
paragraphs = hf.remove_punctuations(paragraphs)
copy = list(map(list, paragraphs))
# stemming the words
paragraphs = hf.stem(paragraphs)
dictionary = gensim.corpora.Dictionary(paragraphs)
# look up the stopword ids in the dictionary and filter them out
stopIds = hf.get_stop_wordids(stopwords, dictionary)
dictionary.filter_tokens(stopIds)
bags = []
for p in paragraphs:
bags.append(dictionary.doc2bow(p))
tfidf_model = gensim.models.TfidfModel(bags)
tfidf_corpus = tfidf_model[bags]
idf_matrix = gensim.similarities.MatrixSimilarity(tfidf_corpus)
lsi_model = gensim.models.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=100)
lsi_corpus = lsi_model[bags]
lsi_matrix = gensim.similarities.MatrixSimilarity(lsi_corpus)
print(lsi_model.show_topics())
"""
# Driver: build one Tasks instance and invoke its four task methods in
# their fixed order (one, two, three, four). Each method is looked up
# right before it is called, so the call sequence is unchanged.
t = Tasks()
for _step in ("task_one", "task_two", "task_three", "task_four"):
    getattr(t, _step)()