-
Notifications
You must be signed in to change notification settings - Fork 1
/
find_text.py
62 lines (50 loc) · 2.16 KB
/
find_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from bs4 import BeautifulSoup, Tag
from soupselect import select
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
import nltk, string
import requests
import glob
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def find_similar(tfidf_matrix, index, top_n = 5):
    """Return the rows of `tfidf_matrix` most similar to row `index`.

    Similarity is the linear kernel (dot product) between the query row and
    every row of the matrix; for L2-normalised rows this equals cosine
    similarity. The query row itself is left out of the result.

    Returns a list of at most `top_n` `(row_index, score)` tuples, ordered
    from the highest score to the lowest.
    """
    query_row = tfidf_matrix[index:index+1]
    scores = linear_kernel(query_row, tfidf_matrix).flatten()

    # argsort is ascending, so reverse it to walk from best to worst match.
    ranked = scores.argsort()[::-1]

    neighbours = []
    for candidate in ranked:
        if candidate != index:
            neighbours.append((candidate, scores[candidate]))
    return neighbours[0:top_n]
# Load every saved page under data/ and keep the text of its post body.
corpus = []        # post-body text, one entry per parsed file
file_mapping = {}  # position in `corpus` -> path of the file it came from
for idx, path in enumerate(glob.glob("data/*")):
    # Parse inside a `with` block so each file handle is closed right away
    # instead of staying open until garbage collection.
    with open(path, "r") as handle:
        page = BeautifulSoup(handle, "html.parser")
    # NOTE(review): a page without any `div.post-body` element raises
    # IndexError here and aborts the whole run.
    content = select(page, "div.post-body")[0].text
    corpus.append(content)
    file_mapping[idx] = path
# NLTK helpers created once at module level and reused for every token.
lemmatizer = nltk.stem.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()
# Translation table for `translate`: every punctuation code point maps to
# None, which removes that character from the translated text.
punctuation_map = dict.fromkeys(map(ord, string.punctuation))
def stemmer_tokens(tokens):
    """Stem each token, then lemmatize the resulting stem.

    Returns a new list with one processed entry per input token, in the
    same order as `tokens`.
    """
    processed = []
    for token in tokens:
        stem = stemmer.stem(token)
        processed.append(lemmatizer.lemmatize(stem))
    return processed
def normalize(text):
    """Turn raw text into a list of normalised tokens.

    The text is lower-cased, stripped of punctuation, split into words with
    NLTK's word tokenizer and finally passed through `stemmer_tokens`.
    """
    lowered = text.lower()
    without_punctuation = lowered.translate(punctuation_map)
    words = nltk.word_tokenize(without_punctuation)
    return stemmer_tokens(words)
# Vectorise the corpus with TF-IDF, using `normalize` as the tokenizer.
# min_df=1 keeps every term that occurs in at least one document, which is
# the same vocabulary the previous min_df=0 produced; newer scikit-learn
# releases reject an integer min_df below 1.
tf = TfidfVectorizer(analyzer='word', min_df=1, stop_words='english', tokenizer=normalize)
tfidf_matrix = tf.fit_transform(corpus)

# Compress the sparse TF-IDF matrix to 100 dense components before t-SNE.
# fit_transform fits the reducer and projects the data in a single step.
reducer = TruncatedSVD(n_components=100)
svd_all = reducer.fit_transform(tfidf_matrix)

# Project the reduced vectors down to 2-D; random_state pins the t-SNE seed.
model = TSNE(n_components=2, random_state=0)
# Print floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)
new_all = model.fit_transform(svd_all)
# Collect the 2-D embedding into a table: one row per document, holding the
# source file path and the point's two coordinates.
page_names = [file_mapping[position] for position, _ in enumerate(new_all)]
x_values = [point[0] for point in new_all]
y_values = [point[1] for point in new_all]

df = pd.DataFrame(None)
df["page"] = page_names
df["X coordinate"] = x_values
df["Y coordinate"] = y_values
def onpick3(event):
    """Print the index, coordinates and source page of the picked point(s).

    Registered as the handler for matplotlib's 'pick_event'. `event.ind` is
    used to index the "X coordinate", "Y coordinate" and "page" columns of
    the module-level `df`.
    """
    ind = event.ind
    # print(...) with a single parenthesised argument behaves the same on
    # Python 2 and Python 3; the bare print statement only parses on Python 2.
    print("ind:{0}, x:{1}, y:{2}, id: {3}".format(ind, df["X coordinate"][ind], df["Y coordinate"][ind], df["page"][ind]))
# Draw the embedding as a scatter plot whose points can be picked; picking a
# point calls onpick3.
fig = plt.figure()
ax1 = fig.add_subplot(1, 1, 1)
x_column = df["X coordinate"]
y_column = df["Y coordinate"]
ax1.scatter(x_column, y_column, picker=True)
fig.canvas.mpl_connect('pick_event', onpick3)
plt.show()