forked from burakbayramli/kod
-
Notifications
You must be signed in to change notification settings - Fork 0
/
loogle.py
96 lines (81 loc) · 3.25 KB
/
loogle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# IN PROGRESS
# Local Google - indexes all pdf,djvu,txt,epub files under a given
# directory, saves the index, and allows search on these indexed
# documents.
# https://sourceforge.net/projects/djvu/files/
# http://kevinboone.net/README_epub2txt.html
#
# Requirements
# scipy, pandas, numpy, nltk
# pdftotext executable, under tetex for windows
import nltk, string, sys, scipy.io as io
import scipy.sparse as sps, pandas as pd
from sklearn.preprocessing import normalize
import rsync, os, numpy as np
# Hashing-trick dimensionality: every token is bucketed into one of
# `cols` columns via hash(token) % cols, so no vocabulary is stored.
cols = 10000
# One shared Porter stemmer: indexing and querying must stem the same
# way or query terms will not land in the same buckets as documents.
stemmer = nltk.stem.porter.PorterStemmer()
def stem_tokens(tokens):
    """Return the Porter stem of each token, preserving order."""
    return list(map(stemmer.stem, tokens))
def index(search_dir, index_dir):
    """Build a hashed tf-idf index over the PDF files under search_dir.

    Each PDF is converted to text with the external `pdftotext`
    executable (through a scratch file in %TEMP%), lowercased,
    tokenized and stemmed, and its stems are hash-bucketed into one row
    of an N x cols term-count matrix.  Three artifacts are written to
    index_dir for search() to load:

      loogle_idf.mtx    - 1 x cols inverse-document-frequency vector
      loogle_tfidf.mtx  - N x cols L2-normalized tf-idf matrix (csr)
      loogle_files.csv  - row id -> (file, size) mapping
    """
    # Quote BOTH paths: file names and %TEMP% may contain spaces
    # (the original only quoted the input path).
    cmd = 'pdftotext "%s" "%s/loog.txt"'
    dirs, files = rsync.ls(search_dir)
    # only PDFs are wired up so far; djvu/epub converters are listed in
    # the header comment as future work
    files = [(f, size) for (f, size) in files if '.pdf' in f]
    N = len(files)
    A = sps.lil_matrix((N, cols))
    print(A.shape)
    df_files = []
    for i, (f, size) in enumerate(files):
        # normalize to forward slashes for display/storage only; the
        # original (possibly backslashed) path is what pdftotext gets.
        # Renamed from `file` -- that shadowed the builtin.
        fname = f.replace("\\", "/")
        print(fname)
        os.system(cmd % (f, os.environ['TEMP']))
        # read-and-close instead of leaking the temp file handle on
        # every iteration
        fh = open("%s/loog.txt" % os.environ['TEMP'])
        try:
            lowers = fh.read().decode("ISO-8859-1").lower()
        finally:
            fh.close()
        tokens = stem_tokens(nltk.word_tokenize(lowers))
        print(tokens[:30])
        # hashing trick: collisions are accepted in exchange for a
        # fixed-width matrix and no stored vocabulary
        for token in tokens:
            A[i, hash(token) % cols] += 1
        df_files.append([fname, size])
    # document frequency: number of documents in which each bucket occurs
    df = A.copy()
    df[df > 0] = 1.
    df = np.array(df.sum(axis=0))
    # idf = log(N/df) where df > 0; buckets never seen keep idf == 0 so
    # they contribute nothing at query time
    idf = df.copy()
    idf[df.nonzero()] = np.log(N / df[df.nonzero()])
    io.mmwrite(index_dir + "/loogle_idf.mtx", idf)
    # sublinear term-frequency scaling: tf = 1 + log(count)
    tf = A.copy().tocoo()
    tf.data = 1 + np.log(tf.data)
    tfidf = sps.csr_matrix(tf.multiply(idf))
    # L2-normalize rows so a plain dot product is cosine similarity
    tfidf = normalize(tfidf, norm='l2', axis=1)
    io.mmwrite(index_dir + "/loogle_tfidf.mtx", tfidf)
    df_files = pd.DataFrame(df_files, columns=['file', 'size'])
    df_files.to_csv(index_dir + "/loogle_files.csv", index=None)
def search(s, index_dir):
    # Score query string `s` against the artifacts written by index()
    # into index_dir and print the documents ranked by cosine
    # similarity (dot product of L2-normalized tf-idf vectors).
    idf = io.mmread(index_dir + "/loogle_idf.mtx")
    tfidf = io.mmread(index_dir + "/loogle_tfidf.mtx").tocsr()
    docs = pd.read_csv(index_dir + "/loogle_files.csv")
    N = len(docs)
    docs = np.array(docs['file'])
    # build the query's hashed term-count vector exactly the way
    # index() builds a document row: same tokenizer, same stemmer,
    # same hash(token) % cols bucketing
    sm = sps.lil_matrix((1,cols))
    tokens = nltk.word_tokenize(s.lower())
    tokens = stem_tokens(tokens)
    print tokens[:20]
    for token in tokens: sm[0,hash(token) % cols] += 1
    tfidf_new = sm.multiply(idf)
    tfidf_new = sps.csr_matrix(tfidf_new)
    # previously unseen words come out as 1; zero them out
    # (NOTE(review): buckets unseen at index time have idf == 0, so
    # their product here should already be 0 -- confirm which case
    # this equality test actually guards against)
    tfidf_new[tfidf_new==1.0] = 0.0
    tfidf_new = normalize(tfidf_new, norm='l2', axis=1)
    # both sides are L2-normalized, so this dot product is the cosine
    # similarity of the query against every indexed document
    dist = tfidf.dot(tfidf_new.T)
    res = pd.DataFrame(dist.todense(),columns=['score'])
    print dist.todense()
    res['docid'] = range(N)
    # NOTE(review): DataFrame.sort_index(by=...) is long-removed
    # pandas API -- modern pandas needs sort_values('score', ...)
    res = res.sort_index(by='score',ascending=False)
    # map each row id back to its file path from loogle_files.csv
    res['doc'] = res.apply(lambda x: docs[int(x['docid'])],axis=1)
    print res
if __name__ == "__main__":
    # Hard-coded local paths: rebuild the index over the books
    # directory, then run one sample query against it.  The
    # commented-out calls are the author's smaller smoke-test setup.
    #index(search_dir="c:/Users/burak/Downloads/test",
    # index_dir="c:/Users/burak/Downloads/test")
    #search("aramak", index_dir="c:/Users/burak/Downloads/test")
    index(search_dir="c:/Users/burak/Documents/kitaplar",
          index_dir="c:/Users/burak/Documents/kitaplar")
    search("Lomb-Scargle", index_dir="c:/Users/burak/Documents/kitaplar")