word2vec_script.py
import sys
sys.path.append('../lib/')  # make the local preprocessing and vectorizer modules importable

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from gensim.models import Word2Vec
from gensim.models.word2vec import FAST_VERSION  # informational only: >= 0 means the optimized C routines are available
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from preprocessing import split_file, clean_text, remove_stopwords, func_lemma, func_container, func_tokenize  # local preprocessing functions
from vectorizer import MeanEmbeddingVectorizer, TfidfEmbeddingVectorizer  # local vectorizers

# read the training corpus, lower-cased line by line
file = []
path = r'rvm.txt'  # path to the training text file
with open(path, 'r', encoding='cp1251') as f:
    for string in f:
        file.append(string.lower())
# preprocessing pipeline for the training corpus
file_split = split_file(file)
text = clean_text([row[0] for row in file_split])  # strip punctuation and other symbols
clear_text = remove_stopwords(text)                # remove stop-words
s = func_lemma(func_container(clear_text))         # lemmatization
w = func_tokenize(s)  # w is the tokenized training dataset: a list of token lists, the format Word2Vec expects
# load the test dataset (Lenta.ru news corpus)
path = r'lenta-ru-news.csv'
df = pd.read_csv(path, engine='python', delimiter=',', encoding='utf-8-sig')
# plot the distribution of news topics
performance = df['topic'].value_counts()
y_pos = np.arange(len(performance))
plt.figure(figsize=(8, 6))
plt.bar(y_pos, performance, align='center', alpha=0.5, color='g', width=0.8)
plt.xticks(y_pos, performance.index.tolist(), rotation=90, size=15)
plt.yticks(size=15)
plt.xlabel('Topics', size=15)
plt.ylabel('Number of news items', size=15)
plt.show()
# apply the same preprocessing to the first 100 news items of the test corpus
test_dataset = []
for text in df['text'][:100]:
    test_text = clean_text(text)
    test_text = [''.join(word for word in test_text if word != '')]  # rejoin non-empty fragments
    test_text = remove_stopwords(test_text)
    test_text = func_lemma(test_text)
    test_text = func_tokenize(test_text)
    test_dataset.append(*test_text)  # func_tokenize returns a one-element list; unpack it
# create and train the word2vec model (gensim < 4.0 API: 'size' was renamed
# 'vector_size' in gensim 4.x); sg=1 selects skip-gram, hs=0 keeps negative sampling
model = Word2Vec(w, min_count=2, workers=6, size=300, window=3, hs=0, sg=1)
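# Quick sanity check (illustrative, not part of the original script): print the nearest
# neighbours of an arbitrary vocabulary word; any token from the training corpus works.
sample_word = next(iter(model.wv.vocab))
print(sample_word, '->', model.wv.most_similar(sample_word, topn=5))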
# model_dict maps every test-corpus word that made it into the vocabulary to its vector
model_dict = {}
word_vectors = model.wv
for text in test_dataset:
    for word in text:
        if word in word_vectors.vocab:  # keep only in-vocabulary words
            model_dict[word] = word_vectors[word]
# represent each document as the list of vectors of its in-vocabulary words
transform_corpus = []
for text in test_dataset:
    transform_corpus.append([model.wv[word] for word in text if word in model.wv.vocab])
# list of classification models
clfs = [
    LogisticRegression(multi_class='multinomial', solver='sag', max_iter=1000),
    SVC(kernel='poly', gamma='scale', coef0=1, degree=3),
    RandomForestClassifier(n_estimators=400),
]
def classification_models(X, y):
    """Fit each classifier on a random 80/20 split and report its test accuracy."""
    ans = []
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)
    for number, method in enumerate(clfs):
        method.fit(X_train, y_train)
        ans.append('Classifier %d: accuracy score: %.3f' % (number, accuracy_score(y_test, method.predict(X_test))))
    return ans
# document vectors as the plain mean of word vectors in each document
X = MeanEmbeddingVectorizer(model_dict).transform(transform_corpus)
print(classification_models(X, df['topic'][:100]))
# document vectors as the TF-IDF-weighted mean of word vectors in each document
X1 = TfidfEmbeddingVectorizer(model_dict).fit(test_dataset, df['topic'][:100]).transform(test_dataset)
print(classification_models(X1, df['topic'][:100]))
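# Illustrative sketch only: the local vectorizer module is not shown in this file, but a
# mean-embedding vectorizer typically averages a document's word vectors and falls back
# to a zero vector for empty documents. The class below is a hypothetical stand-in (note
# the name) and is not used anywhere in this script:
class MeanEmbeddingVectorizerSketch:
    def __init__(self, word2vec):
        self.word2vec = word2vec                       # dict: word -> vector
        self.dim = len(next(iter(word2vec.values())))  # embedding dimensionality

    def transform(self, docs):
        # docs: list of documents, each a list of word vectors (like transform_corpus above)
        return np.array([np.mean(doc, axis=0) if len(doc) > 0 else np.zeros(self.dim)
                         for doc in docs])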
# dimensionality reduction: project the first 8 test-corpus words plus the topic word to 2-D
words = list(model_dict.keys())[:8]
topic_word = df.loc[0, 'topic'].lower()
X = model.wv[words]
y = model.wv[topic_word]
data = np.vstack((X, y))  # stack the word vectors and the topic vector
pca = PCA(n_components=2)
result = pca.fit_transform(data)  # fit PCA on all nine vectors, not on the words alone
colors = ['b'] * len(words) + ['r']  # words in blue, the topic word in red
# output results
plt.scatter(result[:, 0], result[:, 1], c=colors)
for i, word in enumerate(words + [topic_word]):
    plt.annotate(word, xy=(result[i, 0], result[i, 1]), size=12)
plt.xticks(size=15)
plt.yticks(size=15)
plt.xlabel('X1 component', size=15)
plt.ylabel('X2 component', size=15)
plt.show()
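# Optional check (illustrative): how much variance the two principal components retain;
# values well below 1.0 mean the 2-D plot is only a rough view of the 300-D space.
print('explained variance ratio:', pca.explained_variance_ratio_)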