def content_keyword(dbcon):
    """Extract keywords for every document and store them in ``kw_content``.

    Reads the ``docid``, ``title`` and ``content`` columns of the ``document``
    table, asks a ``WordWeightEvaluation(30)`` instance for the keywords of
    each row, and writes them back as a space-separated string of
    ``word/weight`` items (weight printed with 7 decimals). Progress is
    reported every 20 rows and the changes are committed once at the end.

    NOTE(review): the file contains a second, identical definition of this
    function right after this one; that later definition is the one in
    effect at import time.

    Args:
        dbcon: Open database connection handed to the ``dbutils`` helpers;
            it must provide ``commit()``.
    """
    print('extracting keyword from content...')
    doc_num = dbutils.countOfRecs(dbcon, 'document')
    # The returned word set is never used below; the call itself is kept in
    # case load_wordset has side effects -- TODO confirm and drop if not.
    load_wordset(dbcon)
    eluate = WordWeightEvaluation(30)
    try:
        rows = dbutils.iterRec(dbcon, 'document', 'docid title content')
        for cnt, r in enumerate(rows, 1):
            word_weight_list = eluate.extract_kw(r[1], r[2])
            # Each item is fed to '%s/%.7f', so it is presumably a
            # (word, weight) pair.
            wordwstr = ' '.join(['%s/%.7f' % idw for idw in word_weight_list])
            dbutils.updateByPK(dbcon, 'document',
                               {'kw_content': wordwstr}, {'docid': r[0]})
            if cnt % 20 == 0:
                utils.updateProgress(cnt, doc_num)
        print('')
    finally:
        # Release the evaluator even when a row fails part-way through.
        eluate.close()
    dbcon.commit()
def content_keyword(dbcon):
    """Compute per-document content keywords and persist them.

    For each row of the ``document`` table (columns ``docid``, ``title``,
    ``content``) a ``WordWeightEvaluation(30)`` evaluator produces a keyword
    list, which is stored in the row's ``kw_content`` column as
    space-separated ``word/weight`` items with the weight formatted to 7
    decimal places. ``utils.updateProgress`` is called every 20 rows and a
    single ``commit()`` is issued after the last row.

    NOTE(review): an identical definition of this function precedes this one
    in the file; this later definition is the one that stays bound.

    Args:
        dbcon: Open database connection used by the ``dbutils`` helpers and
            committed at the end.
    """
    print('extracting keyword from content...')
    total = dbutils.countOfRecs(dbcon, 'document')
    # Return value is unused here; the call is retained because it may have
    # side effects -- TODO confirm and remove if it has none.
    load_wordset(dbcon)
    evaluator = WordWeightEvaluation(30)
    try:
        records = dbutils.iterRec(dbcon, 'document', 'docid title content')
        for done, rec in enumerate(records, 1):
            docid, title, content = rec[0], rec[1], rec[2]
            weighted = evaluator.extract_kw(title, content)
            # Items go straight into '%s/%.7f', so they are presumably
            # (word, weight) pairs.
            kw_text = ' '.join(['%s/%.7f' % item for item in weighted])
            dbutils.updateByPK(dbcon, 'document',
                               {'kw_content': kw_text}, {'docid': docid})
            if done % 20 == 0:
                utils.updateProgress(done, total)
        print('')
    finally:
        # Always close the evaluator, including when a row raises.
        evaluator.close()
    dbcon.commit()
class DBKeywordExtractor:
    """Derive content, title and topic keywords for a SQLite document DB.

    Call ``init_db()`` before any of the ``*_keyword`` methods and
    ``close_db()`` when finished. ``title_keyword`` reads the ``kw_content``
    column written by ``content_keyword``, and ``topic_keyword`` reads the
    ``kw_title`` column written by ``title_keyword``.
    """

    def __init__(self, db_path, weight_evaluator=None):
        """Remember the database path and choose the weight evaluator.

        Args:
            db_path: Path later passed to ``sqlite3.connect`` by ``init_db``.
            weight_evaluator: Optional evaluator used by ``content_keyword``;
                it needs ``extract_kw(title, content)`` and ``close()``.
                When omitted (or falsy), a
                ``WordWeightEvaluation(30, '../data')`` is created and
                loaded from ``db_path``.
        """
        self.dbpath = db_path
        self.dbcon = None
        if weight_evaluator:
            self.evaluator = weight_evaluator
        else:
            self.evaluator = WordWeightEvaluation(30, '../data')
            # The original passed the undefined name ``dbpath`` here, which
            # raised NameError whenever the default evaluator was built.
            self.evaluator.add_docs_from_db(db_path)

    def init_db(self):
        """Open the SQLite connection for ``self.dbpath``."""
        self.dbcon = sqlite3.connect(self.dbpath)

    def close_db(self):
        """Close the connection (if one was opened) and the evaluator."""
        try:
            if self.dbcon is not None:
                self.dbcon.close()
                self.dbcon = None
        finally:
            # The evaluator is released even if closing the connection fails
            # or init_db() was never called.
            self.evaluator.close()

    @staticmethod
    def _bare_words(text, limit=0):
        """Return the part before ``/`` of each whitespace-separated token.

        ``None`` or empty ``text`` yields an empty list. A positive ``limit``
        keeps only the first ``limit`` tokens; zero or a negative value means
        no limit.
        """
        tokens = (text or '').split()
        if limit > 0:
            tokens = tokens[:limit]
        return [token.split('/')[0] for token in tokens]

    def content_keyword(self):
        """Store evaluator keywords for every document in ``kw_content``.

        Each row's keywords are written as space-separated ``word/weight``
        items (weight with 7 decimals). Progress is reported every 20 rows
        and the connection is committed once at the end. The evaluator is
        left open; ``close_db`` closes it.
        """
        print('extracting keyword from content...')
        doc_num = dbutils.countOfRecs(self.dbcon, 'document')
        rows = dbutils.iterRec(self.dbcon, 'document', 'docid title content')
        for cnt, r in enumerate(rows, 1):
            word_weight_list = self.evaluator.extract_kw(r[1], r[2])
            # Items are fed to '%s/%.7f', so each is presumably a
            # (word, weight) pair.
            wordwstr = ' '.join(['%s/%.7f' % idw for idw in word_weight_list])
            dbutils.updateByPK(self.dbcon, 'document',
                               {'kw_content': wordwstr}, {'docid': r[0]})
            if cnt % 20 == 0:
                utils.updateProgress(cnt, doc_num)
        print('')
        self.dbcon.commit()

    def title_keyword(self, maxn=5):
        """Fill ``kw_title`` from the title, or from ``kw_content`` if empty.

        When a document has a non-empty title, every whitespace-separated
        token of it is used, with anything from the first ``/`` onward
        stripped. Otherwise the first ``maxn`` words of ``kw_content`` are
        used; a NULL ``kw_content`` gives an empty keyword string.

        Args:
            maxn: Maximum number of fallback words taken from ``kw_content``.
                Zero or a negative value takes all of them, as the original
                counter loop only stopped when the count equalled ``maxn``.
        """
        rows = dbutils.iterRec(self.dbcon, 'document',
                               'docid title kw_content')
        for r in rows:
            if r[1]:
                wordlist = self._bare_words(r[1])
            else:
                # No title: fall back to the leading content keywords.
                wordlist = self._bare_words(r[2], maxn)
            kwstr = ' '.join(wordlist)
            dbutils.updateByPK(self.dbcon, 'document',
                               {'kw_title': kwstr}, {'docid': r[0]})
        self.dbcon.commit()

    def topic_keyword(self):
        """Aggregate title keywords per ``cats`` value into table ``topic``.

        Creates ``topic`` if it does not exist, then for every distinct
        ``cats`` value counts how often each ``kw_title`` word occurs across
        that group's documents. One row is inserted per group: the words
        ordered by descending count in ``keyword`` and, in the same order,
        each count divided by the group's total in ``weight``. Rows whose
        ``kw_title`` is NULL contribute no words.

        NOTE(review): a group whose ``cats`` is NULL matches no rows in the
        ``cats=?`` lookup and would be inserted with a NULL ``name``, which
        the table's ``not null`` constraint rejects -- confirm whether such
        rows can occur.
        """
        self.dbcon.execute(
            "create table if not exists topic (name text unique not null, "
            "doc_num integer default 0, keyword text, weight text)")
        cur = self.dbcon.execute(
            'select cats, count(docid) from document group by cats')
        for r in cur:
            kwset = dict()
            cur2 = self.dbcon.execute(
                'select kw_title from document where cats=?', (r[0],))
            for kr in cur2:
                for w in (kr[0] or '').split():
                    kwset[w] = kwset.get(w, 0) + 1
            sum_weight = float(sum(kwset.values()))
            # Stable sort: words with equal counts keep dict iteration order.
            items = sorted(kwset.items(), key=lambda x: x[1], reverse=True)
            kw_str = ' '.join([w for w, f in items])
            weight_str = ' '.join([str(f / sum_weight) for w, f in items])
            dbutils.insert(self.dbcon, 'topic',
                           {'name': r[0], 'doc_num': r[1],
                            'keyword': kw_str, 'weight': weight_str})
        self.dbcon.commit()