# features.py
import math

import nltk
import nltk.sentiment.vader as vader
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from scipy.spatial.distance import canberra

# Optional dependencies: the vectorizer and embedding objects consumed below
# are constructed by the caller, so these imports stay commented out here.
# from gensim.models import KeyedVectors
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import TfidfVectorizer

# No-ops if the resources are already present locally.
nltk.download('vader_lexicon')
nltk.download('punkt')

def canberraDist(h, b, bigram_vectorizer):
    """Canberra distance between the n-gram count vectors of headline h and body b."""
    vecs = bigram_vectorizer.fit_transform([h, b]).toarray()
    vec1, vec2 = vecs[0, :], vecs[1, :]
    # Scaled by 1/1000 to keep the feature roughly in range with the others.
    return canberra(vec1, vec2) / 1000
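
# Usage sketch (assumes scikit-learn's CountVectorizer, matching the
# commented-out import above; the ngram_range is an assumption):
#
#   from sklearn.feature_extraction.text import CountVectorizer
#   bigram_vectorizer = CountVectorizer(ngram_range=(1, 2))
#   d = canberraDist(headline_text, body_text, bigram_vectorizer)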

def tfidf(mat, common, vocab, doc):
    """Sum of the TF-IDF weights, in document `doc` of `mat`, of the words shared by headline and body."""
    score = 0  # renamed from `tfidf`, which shadowed the function name
    for w in common:
        if w in vocab:
            score += mat[doc, vocab.index(w)]
    return score
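
# Usage sketch (assumes scikit-learn's TfidfVectorizer, matching the
# commented-out import above; `docs` and `common_words` are placeholders):
#
#   from sklearn.feature_extraction.text import TfidfVectorizer
#   vectorizer = TfidfVectorizer()
#   mat = vectorizer.fit_transform(docs).toarray()
#   vocab = list(vectorizer.get_feature_names_out())
#   score = tfidf(mat, common_words, vocab, 0)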

def overlap(common):
    """Number of words shared by headline and body."""
    return len(common)

def wmd(headline, body, wmd_model):
    """Word Mover's Distance between headline and body under wmd_model's embeddings."""
    dist = wmd_model.wmdistance(headline, body)
    # wmdistance returns inf when a document has no in-vocabulary words;
    # substitute a large-but-finite sentinel so downstream models stay usable.
    if math.isinf(dist) or math.isnan(dist):
        return 1.5
    return dist
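
# Usage sketch (assumes gensim's KeyedVectors, matching the commented-out
# import above; the embedding file path is hypothetical):
#
#   from gensim.models import KeyedVectors
#   wmd_model = KeyedVectors.load_word2vec_format("embeddings.bin", binary=True)
#   d = wmd(headline_tokens, body_tokens, wmd_model)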

def polarityScores(headline, body):
    """Absolute gaps between the VADER neg/neu/pos scores of headline and body."""
    sid = SIA()
    h = sid.polarity_scores(headline)
    b = sid.polarity_scores(body)
    return abs(h['neg'] - b['neg']), abs(h['neu'] - b['neu']), abs(h['pos'] - b['pos'])
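
# Usage sketch: both arguments are raw strings, since VADER scores whole texts:
#
#   neg_gap, neu_gap, pos_gap = polarityScores(headline_text, body_text)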

def negWords(headline, body):
    """Absolute gap between the negation-word rates of body and headline (token lists)."""
    # vader.negated expects a list of words, so each token is wrapped in one;
    # passing the bare string would iterate over its characters instead.
    h_count = sum(1 for h in headline if vader.negated([h]))
    b_count = sum(1 for b in body if vader.negated([b]))
    hlen, blen = len(headline), len(body)
    if hlen != 0 and blen != 0:
        return abs(b_count / blen - h_count / hlen)
    return 0
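
# A minimal end-to-end sketch for one headline/body pair; the example strings
# and the plain-whitespace tokenisation are assumptions, not part of the
# pipeline above.
if __name__ == "__main__":
    headline = "robert plant ripped up $800m led zeppelin reunion offer"
    body = "led zeppelin's robert plant did not rip up the reunion offer"
    print(polarityScores(headline, body))            # gaps in neg/neu/pos
    print(negWords(headline.split(), body.split()))  # gap in negation rate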