-
Notifications
You must be signed in to change notification settings - Fork 0
/
javadoc_features.py
80 lines (59 loc) · 2.4 KB
/
javadoc_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
from html.parser import HTMLParser
class MLStripper(HTMLParser):
    """HTML parser that throws away markup and keeps only the text content.

    Feed markup with ``feed()`` and read the accumulated text back with
    ``get_data()``, or use the one-shot ``strip_tags()`` helper.
    """

    def __init__(self):
        # HTMLParser.__init__ already calls reset(); character references
        # (e.g. ``&amp;``) are decoded before reaching handle_data().
        super().__init__(convert_charrefs=True)
        self.fed = []  # text chunks in document order

    def handle_data(self, d):
        """Record one chunk of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return ''.join(self.fed)

    @staticmethod
    def strip_tags(html):
        """Return ``html`` with all tags removed and entities decoded.

        Args:
            html: Markup to clean, as a string.

        Returns:
            The concatenated text content of ``html``.
        """
        s = MLStripper()
        s.feed(html)
        # Without close() the parser keeps trailing text buffered when it
        # ends in an unterminated '&' (e.g. "AT&T"), and that text is lost.
        s.close()
        return s.get_data()
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer
def ngrams(text, n):
    """Return the n-grams over the normalized tokens of ``text``.

    Args:
        text: Raw input text; it is passed through ``normalize`` first.
        n: Number of consecutive tokens per n-gram.

    Returns:
        A ``zip`` iterator of ``n``-tuples of consecutive tokens.
    """
    # normalize() already returns a list of tokens; the former .split() call
    # on that list raised AttributeError. Normalize once, not once per offset.
    tokens = normalize(text)
    return zip(*[tokens[i:] for i in range(n)])
# Fetch the Punkt tokenizer data only when it is not installed yet, so that
# importing this module does not contact the download server on every run.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Shared Porter stemmer instance used by stem_tokens().
stemmer = nltk.stem.porter.PorterStemmer()
# Table for str.translate(): every character of string.punctuation maps to
# None, which makes translate() delete it.
remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))
def stem_tokens(tokens):
    """Return a list holding the stem of each token, in input order.

    Stemming is delegated to the module-level ``stemmer``.
    """
    return list(map(stemmer.stem, tokens))
def normalize(text):
    """Strip HTML tags, lowercase, remove punctuation, tokenize and stem.

    Returns the list of stemmed tokens produced from ``text``.
    """
    plain = MLStripper.strip_tags(text)
    cleaned = plain.lower().translate(remove_punctuation_map)
    words = nltk.word_tokenize(cleaned)
    return stem_tokens(words)
# Shared TF-IDF vectorizer: tokens come from normalize() and English stop
# words are dropped. It is re-fitted on every cosine_sim() call.
vectorizer = TfidfVectorizer(tokenizer=normalize, stop_words='english')


def cosine_sim(text1, text2):
    """Return the TF-IDF similarity score between two texts.

    The module-level ``vectorizer`` is fitted on just these two texts and
    the score is the dot product of their two TF-IDF rows.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The similarity score as a scalar.
    """
    tfidf = vectorizer.fit_transform([text1, text2])
    # TfidfVectorizer L2-normalizes rows by default, so the row dot product
    # is the cosine similarity. The sparse ``.A`` shortcut no longer exists
    # in current SciPy, hence toarray(); ``@`` is always a matrix product.
    return (tfidf @ tfidf.T).toarray()[0, 1]
# Demo: compare one reference sentence with an identical, a longer and an
# unrelated sentence, printing each similarity score.
for _other in ('a little bird', 'a little bird chirps', 'a big dog barks'):
    print(cosine_sim('a little bird', _other))
# Example (disabled): strip the HTML markup from a Javadoc comment.
# print(MLStripper.strip_tags("""<p>Deep clone an {@code Object} using serialization.</p>
#
# <p>This is many times slower than writing clone methods by hand
# on all objects in your object graph. However, for complex object
# graphs, or for those that don't support deep cloning this can
# be a simple alternative implementation. Of course all the objects
# must be {@code Serializable}.</p>"""))
import spacy
# Demo: document similarity with spaCy.
# The 'en' shortcut link no longer exists in spaCy v3, so the model is
# loaded by its full package name.
# NOTE(review): assumes 'en' was linked to en_core_web_sm - confirm.
nlp = spacy.load('en_core_web_sm')
doc1 = nlp('Hello hi there!')
doc2 = nlp('Hello hi there!')
doc3 = nlp('Hey whatsup?')

# Trailing values are the outputs recorded in the original script; they may
# differ with another model or spaCy version.
print(doc1.similarity(doc2))  # 0.999999954642
print(doc2.similarity(doc3))  # 0.699032527716
print(doc1.similarity(doc3))  # 0.699032527716
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# Demo: pairwise TF-IDF similarity matrix for a small corpus, using the
# vectorizer's default tokenization.
vect = TfidfVectorizer(min_df=1)
tfidf = vect.fit_transform([
    "My name is Ankit",
    "Ankit name is very famous",
    "Ankit like his name",
    "India has a lot of beautiful cities",
])
# The sparse ``.A`` shortcut no longer exists in current SciPy; toarray()
# gives the same dense matrix, and ``@`` is always a matrix product.
print((tfidf @ tfidf.T).toarray())