import glob
import multiprocessing
import os
import string

import gensim
import matplotlib.pyplot as plt
import networkx as nx
import nltk
from gensim.models import FastText
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.stem.snowball import PortugueseStemmer
from nltk.tokenize import word_tokenize
class CorpusHandler:
    def __init__(self, folder_path, language):
        self.folder_path = folder_path
        # create the output folder inside the current working directory
        self.output_path = os.path.join(os.getcwd(), "Outputs")
        os.makedirs(self.output_path, exist_ok=True)
        self.list_of_files = glob.glob(self.folder_path)
        # set the language for the stopwords
        self.stop_words = set(stopwords.words(language))
        # set the punctuation marks, including symbols not in string.punctuation
        symbols = {"º", "ª", "“", "”", "–", "¾", ',', '„', '⅞', '°', '\'', '£', '…', '’', '½'}
        self.punctuations = set(string.punctuation).union(symbols)
        # stemmer used to normalize words to their stems
        self.lemmas = PortugueseStemmer()
        # load and clean the texts
        self.texts = []
        for fil in self.list_of_files:
            with open(fil, "r", encoding='utf-8') as f:
                txt = f.read()
            self.texts.append(self._clean_text(txt))
        # tokenize each cleaned text into lowercase word tokens
        self.tokens = [[w.lower() for w in word_tokenize(text)] for text in self.texts]
        # map each unique token to an integer id
        self.dictionary = gensim.corpora.Dictionary(self.tokens)
        # represent each document as a bag-of-words: a list of (token_id, count) pairs
        self.bag_of_words = [self.dictionary.doc2bow(token) for token in self.tokens]
        # tf-idf model fitted on the bag-of-words corpus
        self.tf_idf = gensim.models.TfidfModel(self.bag_of_words)
    def _clean_text(self, text):
        # remove stopwords, strip punctuation, then stem every remaining word
        stop_free = " ".join(i for i in text.lower().split() if i not in self.stop_words)
        punc_free = ''.join(ch for ch in stop_free if ch not in self.punctuations)
        return " ".join(self.lemmas.stem(word) for word in punc_free.split())
    def get_LDA_modelling(self):
        # fit a 3-topic LDA model over the bag-of-words corpus
        return gensim.models.ldamodel.LdaModel(
            self.bag_of_words, num_topics=3, id2word=self.dictionary, passes=50)
    def get_corpus_frequent_terms(self, num_words):
        # join documents with a space so words do not fuse across file boundaries
        all_texts = " ".join(self.texts)
        fdist = FreqDist(all_texts.split())
        return fdist.most_common(num_words)
    def get_bigrams(self, doc_num):
        return nltk.bigrams(self.texts[doc_num].split())

    def get_trigrams(self, doc_num):
        return nltk.trigrams(self.texts[doc_num].split())
    def word_embedding(self):
        # train a skip-gram FastText model; gensim >= 4 renamed `size` to `vector_size`
        return FastText(sentences=self.tokens, vector_size=100, min_count=5,
                        workers=multiprocessing.cpu_count(), sg=1)
    def plot_word_embedding(self, word, model, nviz=15):
        g = nx.Graph()
        g.add_node(word, color='blue')
        # nearest neighbours of the query word; similarity queries live on model.wv
        viz1 = model.wv.most_similar(word, topn=nviz)
        g.add_weighted_edges_from([(word, v, w) for v, w in viz1 if w > 0.5])
        # expand the graph one level: the neighbours of each neighbour
        for v in viz1:
            g.add_weighted_edges_from([(v[0], v2, w2) for v2, w2 in model.wv.most_similar(v[0])])
        # colour the query word blue and every other node red
        cols = ['r'] * len(g.nodes())
        cols[list(g.nodes()).index(word)] = 'b'
        pos = nx.spring_layout(g, iterations=100)
        plt.figure(figsize=(12, 12))
        nx.draw_networkx(g, pos=pos, node_color=cols, node_size=500, alpha=0.5, font_size=8)
        plt.savefig(os.path.join(self.output_path, "Graph.png"), format="PNG")
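
# Minimal usage sketch (assumes a ./Corpus folder of UTF-8 .txt files in
# Portuguese; the glob pattern and the query word "cas" are illustrative only).
if __name__ == "__main__":
    # NLTK resources required by word_tokenize and stopwords
    nltk.download('punkt')
    nltk.download('stopwords')
    handler = CorpusHandler(os.path.join("Corpus", "*.txt"), "portuguese")
    # ten most frequent terms across the whole corpus
    print(handler.get_corpus_frequent_terms(10))
    # topics discovered by the 3-topic LDA model
    for topic in handler.get_LDA_modelling().print_topics():
        print(topic)
    # train the FastText embedding and plot the neighbourhood of a query word;
    # the corpus is stemmed, so pass a stem ("cas" is the stem of "casa")
    model = handler.word_embedding()
    handler.plot_word_embedding("cas", model)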