/
get_tweet.py
80 lines (66 loc) · 2 KB
/
get_tweet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# -*- coding: utf-8 -*-
"""
Created on Thu Dec 10 16:28:20 2020
@author: minimilien
"""
from nltk.corpus import stopwords
from gensim.models import Doc2Vec
import pandas as pd
import numpy as np
import nltk
# Best-effort fetch of the NLTK stop-word corpus.  A failure here is tolerated
# on purpose (the corpus may already be available locally), but only ordinary
# errors are swallowed: Ctrl-C and SystemExit still propagate.
try:
    nltk.download('stopwords')
except Exception:
    pass

# NOTE: this rebinds ``stopwords`` from the imported NLTK corpus reader to a
# plain set of English stop words; traitement() relies on it being a set.
stopwords = set(stopwords.words('english'))

# Length of the zero vector that vec() returns for unknown tokens.
# Presumably this is the vector size of 'tweetmodel.model' -- TODO confirm.
s = 200

# Tweets to search; get_20_best() reads the 'text' column.
df = pd.read_csv("tweets.csv", index_col=0)
# Punctuation characters that are turned into a space before tokenising.
_SEPARATORS = str.maketrans({char: ' ' for char in ",'-.*\\\n/!?\"#"})


def traitement(phrase):
    """Normalise a piece of text for vector lookup.

    The text is lower-cased, the punctuation characters
    ``, ' - . * \\ newline / ! ? " #`` are replaced by spaces, the result is
    split on whitespace and English stop words are dropped.

    Args:
        phrase: Text to clean.  Non-string values are converted with
            ``str()`` first instead of raising ``AttributeError``.

    Returns:
        The remaining tokens joined by single spaces (``''`` when nothing
        is left).
    """
    # NOTE(review): the previous code tried to strip '#' with the literal
    # "\#" (backslash + '#'), which could never match because backslashes had
    # already been removed.  '#' is now really stripped; confirm this matches
    # the preprocessing used when the model was trained.
    texte = str(phrase).lower().translate(_SEPARATORS)
    # split() never yields empty tokens and the text is already lower-case,
    # so a plain membership test against the stop-word set is sufficient.
    return ' '.join(token for token in texte.split() if token not in stopwords)
# Load the Doc2Vec model saved in 'tweetmodel.model' when the module is
# imported; vec() looks tokens up in its word vectors (model.wv).
model=Doc2Vec.load('tweetmodel.model')
def norme(vec):
    """Return the Euclidean (L2) norm of ``vec``.

    Computed as the square root of the sum of the element-wise squares.
    """
    squares = vec * vec
    total = np.sum(squares)
    return np.sqrt(total)
def prediction(vec1, vec2):
    """Return the angle, in radians, between two vectors.

    The angle is the arccosine of the cosine similarity, so it lies in
    ``[0, pi]``: ``0`` for vectors pointing the same way, ``pi`` for opposite
    vectors.

    Args:
        vec1: First vector (NumPy array).
        vec2: Second vector, same shape as ``vec1``.

    Returns:
        The angle between ``vec1`` and ``vec2``.  When either vector has zero
        norm the cosine is undefined; it is treated as ``0`` (angle ``pi/2``)
        rather than producing NaN, which would break any later sorting.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        cosine = 0.0
    else:
        cosine = np.sum(vec1 * vec2) / denominator
    # Floating-point rounding can push the cosine marginally outside
    # [-1, 1] on either side; clamp it so arccos never returns NaN.
    return np.arccos(np.clip(cosine, -1.0, 1.0))
def taille(vec1, vec2):
    """Return the absolute difference between the norms of two vectors."""
    gap = norme(vec1) - norme(vec2)
    return abs(gap)
def vectorization(word):
    """Return the sum of the vectors of the tokens in ``word``.

    ``word`` is split on single spaces and every piece (including empty
    pieces produced by leading, trailing or repeated spaces) is looked up
    with ``vec``.
    """
    total = 0
    for token in word.split(' '):
        total = total + vec(token)
    return total
def vec(mot):
    """Return the model's vector for the token ``mot``.

    Args:
        mot: Token to look up in ``model.wv``.

    Returns:
        The vector stored in the model, or a zero ``float32`` vector of
        length ``s`` when the token is not in the model's vocabulary.
    """
    try:
        return model.wv.get_vector(mot)
    except KeyError:
        # Unknown token: contribute nothing to the sum in vectorization().
        # Only KeyError is handled so that real failures (wrong model API,
        # interrupts, ...) are no longer silently turned into zero vectors.
        return np.zeros(s, dtype=np.float32)
def similarity(word1, word2):
    """Return the angle between the vector representations of two texts.

    Both texts are cleaned with ``traitement`` and embedded with
    ``vectorization``; the result is whatever ``prediction`` returns for the
    two vectors, so a smaller value means the texts are closer.
    """
    cleaned_first = traitement(word1)
    cleaned_second = traitement(word2)
    first_vector = vectorization(cleaned_first)
    second_vector = vectorization(cleaned_second)
    return prediction(first_vector, second_vector)
def get_20_best(phrase):
    """Return the 20 tweets closest to ``phrase``.

    Every tweet in ``df['text']`` is scored by the angle between its vector
    and the vector of ``phrase`` (the same computation as ``similarity``),
    and the tweets are returned in ascending order of that angle.

    Args:
        phrase: Query text.

    Returns:
        A list of at most 20 distinct tweet texts, closest first.  Tweets
        whose angle is NaN are placed after all tweets with a valid angle.
    """
    # The query is cleaned and embedded once instead of once per tweet.
    query_vector = vectorization(traitement(phrase))
    angles = {
        tweet: prediction(query_vector, vectorization(traitement(tweet)))
        for tweet in df['text']
    }

    def sort_key(tweet):
        angle = angles[tweet]
        # NaN compares false with everything, which makes the sort order
        # undefined; flag NaN scores so they always sort last.
        return (bool(np.isnan(angle)), angle)

    return sorted(angles, key=sort_key)[:20]
#print('\n'.join(get_20_best("Build a wall !!!")))