def build_index(self, db_file, outfile):
    """Build an inverted index from a SQLite database and dump it as JSON.

    Rows are fetched page by page using ``self.DB_ITER_QUERY`` (formatted
    with ``id`` and ``limit``). Each document body is lower-cased,
    tokenized, stemmed and stop-word filtered through ``kwutils``; every
    remaining non-empty term is recorded as ``index[term][doc_id] = 1``.
    The index is also kept on ``self.index``.

    Args:
        db_file: Path of the SQLite database to read.
        outfile: Path of the JSON file to write. ``json`` serializes the
            integer document ids as string keys.
    """
    db = sqlite3.connect(db_file)
    # The inner mapping never needed a default factory; a plain dict
    # behaves the same as the former factory-less defaultdict.
    self.index = index = defaultdict(dict)
    start_id = 0
    try:
        while True:
            q = self.DB_ITER_QUERY.format(id=start_id, limit=self.ITER_STEP)
            # Debug trace of the paging query; the parenthesized form prints
            # the same text on Python 2 and Python 3.
            print(q)
            res_rows = db.execute(q).fetchall()
            if not res_rows:
                break
            for doc_id, doc_body in res_rows:
                # The last id seen becomes the cursor for the next page.
                # NOTE(review): presumably the query selects ids strictly
                # greater than ``id`` in ascending order - confirm against
                # DB_ITER_QUERY, otherwise paging may repeat or skip rows.
                start_id = doc_id = int(doc_id)
                words = kwutils.tokenize(doc_body.lower())
                w_stemmed = kwutils.stem(words, self.p)
                w_stopped = kwutils.filter_stopwords(w_stemmed, self.sw)
                for w in w_stopped:
                    w = w.strip()
                    if w:
                        index[w][doc_id] = 1
    finally:
        # Release the connection even when a query or tokenization fails.
        db.close()
    with open(outfile, 'w') as f_out:
        json.dump(index, f_out)
def analyze(self, sentence):
    """Return the set of document ids matching any term of *sentence*.

    The sentence is tokenized, stemmed with ``self.p`` and stop-word
    filtered with ``self.sw``; the entries stored in ``self.ndx`` for each
    remaining term are merged into one set. Terms missing from the index
    contribute nothing.
    """
    tokens = kwutils.tokenize(sentence)
    stems = kwutils.stem(tokens, self.p)
    terms = kwutils.filter_stopwords(stems, self.sw)
    matches = set()
    for term in terms:
        # Union in every document id listed for this term.
        matches.update(self.ndx.get(term, []))
    return matches
def search(self, phrase):
    """Look up every term of *phrase* and return the ranked result.

    The phrase is lower-cased, tokenized, stemmed with ``self.p`` and
    stop-word filtered with ``self.sw``. For each non-empty term the
    documents returned by ``self.indexer.find`` are marked in a
    ``doc_id -> term -> 1`` table, which is handed to ``self.rank_docs``
    together with the filtered term sequence.
    """
    tokens = kwutils.tokenize(phrase.lower())
    stems = kwutils.stem(tokens, self.p)
    terms = kwutils.filter_stopwords(stems, self.sw)
    # Unseen (doc_id, term) pairs read as 0.
    rank = defaultdict(lambda: defaultdict(int))
    for term in terms:
        if term:
            for doc_id in self.indexer.find(term):
                rank[doc_id][term] = 1
    return self.rank_docs(rank, terms)
def build(self, docpath, outfile):
    """Build an inverted index from the ``.txt`` files in *docpath*.

    Each file is normalized, tokenized, stemmed and stop-word filtered via
    ``kwutils``. The resulting mapping ``word -> [doc_id, ...]`` (each id
    listed once per word, in first-seen order) is written to *outfile* as
    JSON.

    Args:
        docpath: Directory whose direct entries ending in ``.txt`` are
            indexed (``os.listdir`` does not recurse).
        outfile: Path of the JSON file to write.
    """
    p = PorterStemmer()
    sw = stopwords.StopWords(self.stopword_file)
    ndx = defaultdict(list)
    # Mirrors ``ndx`` with sets so the duplicate check is O(1) instead of a
    # linear scan of an ever-growing posting list.
    seen = defaultdict(set)
    for filename in os.listdir(docpath):
        if not filename.endswith(".txt"):
            continue
        # NOTE(review): on Python 3 str hashes are salted per process unless
        # PYTHONHASHSEED is fixed, so these ids are not stable across runs.
        # Left unchanged because consumers of the index may rely on them.
        doc_id = hash(filename.replace(".txt", ""))
        with open(os.path.join(docpath, filename)) as f:
            # NOTE(review): the argument of normalize() was missing in the
            # source; reading the whole file is the reconstruction - confirm.
            f_content = kwutils.normalize(f.read())
        words = kwutils.tokenize(f_content)
        w_stemmed = kwutils.stem(words, p)
        w_stopped = kwutils.filter_stopwords(w_stemmed, sw)
        for word in w_stopped:
            if len(word) > 0 and doc_id not in seen[word]:
                seen[word].add(doc_id)
                ndx[word].append(doc_id)
    with open(outfile, 'w') as f:
        f.write(json.dumps(ndx))