def title_keyword(self, maxn=5):
    """Fill the ``kw_title`` column for every row of the ``document`` table.

    When a document has a non-empty title, all of its whitespace-separated
    tokens are used.  Otherwise the first ``maxn`` tokens of ``kw_content``
    serve as a fallback.  In both cases only the text before the first '/'
    of each token is kept (presumably tokens look like ``word/suffix`` --
    confirm against the code that writes these columns), and the kept words
    are stored joined by single spaces.  The connection is committed once,
    after all rows have been updated.

    maxn -- upper bound on the number of fallback words taken from
            ``kw_content``.  Zero or a negative value yields no fallback
            words (previously such values disabled the limit altogether).
    """
    # Guard against a non-positive limit: a negative slice bound would
    # silently drop words from the end instead of limiting from the start.
    limit = max(maxn, 0)
    for r in dbutils.iterRec(self.dbcon, 'document', 'docid title kw_content'):
        docid, title, kw_content = r[0], r[1], r[2]
        if title:
            tokens = title.split()
        else:
            # from content keyword; a missing kw_content is treated as
            # empty so one incomplete row cannot abort the whole pass.
            tokens = (kw_content or '').split()[:limit]
        kwstr = ' '.join([tok.split('/')[0] for tok in tokens])
        dbutils.updateByPK(self.dbcon, 'document', {'kw_title': kwstr}, {'docid': docid})
    self.dbcon.commit()
def title_df(dbcon):
    """Add each document's distinct title keywords to the ``t_df`` counters.

    Reads ``kw_title`` from every row of the ``document`` table and splits it
    on whitespace.  For every distinct word, ``t_df`` of the matching row in
    the ``word`` table is incremented by one.  Progress is reported through
    ``utils.updateProgress`` every 50 documents and the connection is
    committed once at the end.

    Raises AssertionError when a title keyword has no row in ``word``.
    """
    print('statistic word document frequency...')
    doc_num = dbutils.countOfRecs(dbcon, 'document')
    cnt = 0
    # docid is selected only so that a failure can name the offending
    # document; the old message indexed a column that was never selected.
    for r in dbutils.iterRec(dbcon, 'document', 'docid kw_title'):
        docid, kw_title = r[0], r[1]
        for w in set(kw_title.split()):
            df_r = dbutils.queryOneRec(dbcon, 'word', 't_df', 'word=?', (w,))
            if df_r is None:
                # Raised explicitly (same exception type as before) so the
                # check is not stripped when Python runs with -O.
                raise AssertionError("'%s' in Document %s except" % (w, docid))
            dbutils.updateByPK(dbcon, 'word', {'t_df': df_r[0] + 1}, {'word': w})
        cnt += 1
        if cnt % 50 == 0:
            utils.updateProgress(cnt, doc_num)
    print('')
    dbcon.commit()
def content_keyword(self):
    """Compute and store the ``kw_content`` string of every document.

    The title and content of each ``document`` row are passed to
    ``self.evaluator.extract_kw``.  Every item it returns is rendered with
    ``'%s/%.7f'`` and the rendered items are stored, joined by single spaces,
    in the row's ``kw_content`` column.  Progress is reported every 20
    documents and the connection is committed once at the end.
    """
    print('extracting keyword from content...')
    total = dbutils.countOfRecs(self.dbcon, 'document')
    rows = dbutils.iterRec(self.dbcon, 'document', 'docid title content')
    for done, row in enumerate(rows, 1):
        weighted = self.evaluator.extract_kw(row[1], row[2])
        rendered = ' '.join('%s/%.7f' % item for item in weighted)
        dbutils.updateByPK(self.dbcon, 'document',
                           {'kw_content': rendered}, {'docid': row[0]})
        if done % 20 == 0:
            utils.updateProgress(done, total)
    # Terminate the progress output with a newline before committing.
    print('')
    self.dbcon.commit()
def title_df(dbcon):
    """Update the ``t_df`` column of ``word`` from the documents' title keywords.

    Every row of ``document`` contributes the distinct whitespace-separated
    words of its ``kw_title`` value; each such word has its ``t_df`` value in
    the ``word`` table incremented by one.  ``utils.updateProgress`` is
    called every 50 documents, and a single commit is issued after the last
    document.

    Raises AssertionError if a title keyword is not present in ``word``.
    """
    print('statistic word document frequency...')
    doc_num = dbutils.countOfRecs(dbcon, 'document')
    cnt = 0
    # Fetch docid next to kw_title: the failure message needs a document
    # identifier, and the previous code read a column index that the query
    # never returned (IndexError instead of the intended diagnostic).
    for r in dbutils.iterRec(dbcon, 'document', 'docid kw_title'):
        docid = r[0]
        title_set = set(r[1].split())
        for w in title_set:
            df_r = dbutils.queryOneRec(dbcon, 'word', 't_df', 'word=?', (w, ))
            if df_r is None:
                # Explicit raise keeps the check active under ``python -O``
                # while preserving the AssertionError type.
                raise AssertionError(
                    "'%s' in Document %s except" % (w, docid))
            dbutils.updateByPK(dbcon, 'word', {'t_df': df_r[0] + 1},
                               {'word': w})
        cnt += 1
        if cnt % 50 == 0:
            utils.updateProgress(cnt, doc_num)
    print('')
    dbcon.commit()
def title_keyword(dbcon):
    """Store, for every document, the title words found in the word set.

    Each ``title`` is split on single spaces and the text before the first
    '/' of every token is looked up in the collection returned by
    ``load_wordset(dbcon)``.  Matching words are written, de-duplicated and
    in order of first appearance, as a space-joined string to ``kw_title``.
    Progress is reported every 50 documents; one commit is issued at the end.
    """
    print('extrating keyword from title...')
    doc_num = dbutils.countOfRecs(dbcon, 'document')
    wordset = load_wordset(dbcon)
    cnt = 0
    for r in dbutils.iterRec(dbcon, 'document', 'docid title'):
        seen = set()
        twords = []
        # A missing title is treated as empty rather than raising on .split.
        for wt in (r[1] or '').split(' '):
            w = wt.split('/')[0]
            if w in wordset and w not in seen:
                seen.add(w)
                # A list keeps title order, so the stored string is
                # deterministic (joining a set gave an arbitrary order).
                twords.append(w)
        widstr = ' '.join(twords)
        dbutils.updateByPK(dbcon, 'document', {'kw_title': widstr}, {'docid': r[0]})
        cnt += 1
        if cnt % 50 == 0:
            utils.updateProgress(cnt, doc_num)
    print('')
    dbcon.commit()
def title_keyword(dbcon):
    """Write the known-word subset of each document title to ``kw_title``.

    For every row of ``document`` the title is split on single spaces, each
    token is cut at its first '/', and the remaining word is kept when it is
    contained in the result of ``load_wordset(dbcon)``.  The kept words are
    stored without duplicates, in their original title order, joined by
    single spaces.  ``utils.updateProgress`` is called every 50 documents and
    the connection is committed once after the loop.
    """
    print('extrating keyword from title...')
    doc_num = dbutils.countOfRecs(dbcon, 'document')
    wordset = load_wordset(dbcon)
    cnt = 0
    for r in dbutils.iterRec(dbcon, 'document', 'docid title'):
        # An absent title contributes no keywords instead of raising.
        title = r[1] or ''
        ordered = []
        already = set()
        for wt in title.split(' '):
            w = wt.split('/')[0]
            if w not in wordset or w in already:
                continue
            already.add(w)
            # First-occurrence order makes the stored value reproducible;
            # the former set join produced an arbitrary word order.
            ordered.append(w)
        widstr = ' '.join(ordered)
        dbutils.updateByPK(dbcon, 'document', {'kw_title': widstr},
                           {'docid': r[0]})
        cnt += 1
        if cnt % 50 == 0:
            utils.updateProgress(cnt, doc_num)
    print('')
    dbcon.commit()