import itertools
import math

import igraph

import dbutils  # project-local DB helpers (connect_db, iterRec, insert, ...)
import utils    # project-local helpers (updateProgress, ...)


def transfer_data_file(dbfile, outputfile):
    """Dump document keyword weights as a tab-separated matrix file."""
    con = dbutils.connect_db(dbfile)
    # First pass: collect the vocabulary from every document's keywords.
    words = dict()
    for d in dbutils.iterRec(con, 'document', 'kw_content'):
        for ww in d[0].split(' '):
            word, weight = ww.split('/')
            words[word] = (words[word] + 1) if word in words else 1
    # for w in words.keys():
    #     if words[w] <= 1:
    #         del words[w]
    # print 'count of words: %d' % len(words)
    out = open(outputfile, 'w')
    rang = range(0, len(words))
    # Header row: one column per word, then a type row of 'c' (continuous).
    out.write('\t'.join(words.iterkeys()).encode('utf8'))
    out.write('\r\n')
    out.write('\t'.join(['c' for i in rang]))
    out.write('\r\n\r\n')
    # Second pass: one row per document, reusing `words` as the weight vector.
    # Only values are reassigned, so the key order still matches the header row.
    for d in dbutils.iterRec(con, 'document', 'kw_content'):
        for w in words:
            words[w] = 0.0
        for ww in d[0].split(' '):
            word, weight = ww.split('/')
            if word in words:
                words[word] = float(weight)
        out.write('\t'.join([str(v) for v in words.itervalues()]))
        out.write('\r\n')
    out.close()
    con.close()

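# A minimal invocation sketch, assuming the default database path used by
# loadTitleWordnet below; the output filename here is illustrative only.
def _transfer_example():
    transfer_data_file('../data/cn-topic.db', '../data/doc_word_matrix.txt')
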
def title_wordpair(db_con):
    """Count word co-occurrence over title keywords and store weighted pairs."""
    print 'counting word co-occurrence from titles...'
    codict, dfdict = dict(), dict()
    cnt = 0
    for r in dbutils.iterRec(db_con, 'document', 'kw_title'):
        words = r[0].split(' ')
        # Order each pair canonically so (a, b) and (b, a) share one counter.
        for wp in itertools.combinations(words, 2):
            if wp[0] > wp[1]:
                wp = (wp[1], wp[0])
            codict[wp] = (codict[wp] + 1) if wp in codict else 1
        for w in words:
            dfdict[w] = (dfdict[w] + 1) if w in dfdict else 1
        cnt += 1
    print 'doc num: %d' % cnt
    print 'number of wordpairs in title_wordpair: %d' % len(codict)
    cnt = 0
    for wp, co in codict.iteritems():
        # Co-occurrence count normalized by the geometric mean of the two
        # words' document frequencies (the Ochiai coefficient).
        weight = co / math.sqrt(dfdict[wp[0]] * dfdict[wp[1]])
        if co >= 2 and weight > 1e-3:
            cnt += 1
            dbutils.insert(db_con, 't_wordpair',
                           {'word1': wp[0], 'word2': wp[1],
                            'coocur_num': co, 'weight': weight})
    print 'number of edges: ' + str(cnt)
    db_con.commit()

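# A toy sanity check for the co-occurrence weight above (illustrative numbers,
# not from the real data): two words seen together 3 times, with document
# frequencies 10 and 40, get weight 3 / sqrt(10 * 40) = 0.15 and therefore
# pass both the co >= 2 and weight > 1e-3 filters.
def _wordpair_weight_example():
    co, df1, df2 = 3, 10, 40
    return co / math.sqrt(df1 * df2)  # 0.15
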
def iter_document(self):
    # `Document` is assumed to be defined elsewhere in the project
    # (a docid plus a word -> weight mapping).
    dbcon = dbutils.connect_db(self.dbfile)
    for r in dbutils.iterRec(dbcon, 'document', 'docid kw_content'):
        word_dict = {}
        for ww in r[1].split():
            s = ww.split('/')
            word_dict[s[0]] = float(s[1])
        doc = Document(r[0], word_dict)
        yield doc
    dbcon.close()

def loadTitleWordnet(dbfile='../data/cn-topic.db', min_coocur=2, min_weight=1e-3):
    """Build an undirected igraph word network from the t_wordpair table."""
    dbcon = dbutils.connect_db(dbfile)
    # g = igraph.Graph(directed=False)
    # g.vs['name'] = None
    # edge_size = dbutils.countOfRecs(dbcon, 't_wordpair',
    #                                 'coocur_num>=? and weight>?',
    #                                 (min_coocur, min_weight))
    # cnt = 0
    edgelist = []
    for r in dbutils.iterRec(dbcon, 't_wordpair', ['word1', 'word2', 'weight'],
                             'coocur_num>=? and weight>?',
                             (min_coocur, min_weight)):
        edgelist.append({'source': r[0], 'target': r[1], 'weight': r[2]})
        # cnt += 1
        # if cnt % 100 == 0:
        #     utils.updateProgress(cnt, edge_size)
    # print ''
    dbutils.close_db(dbcon)
    return igraph.Graph.DictList(vertices=None, edges=edgelist)

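# A minimal usage sketch for loadTitleWordnet, assuming the default database
# path above exists; igraph derives vertex names from the 'source'/'target'
# keys of the edge dicts.
def _wordnet_example():
    g = loadTitleWordnet()
    print '%d words, %d edges' % (g.vcount(), g.ecount())
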
def title_keyword(self, maxn=5):
    for r in dbutils.iterRec(self.dbcon, 'document', 'docid title kw_content'):
        wordlist = []
        if r[1]:
            for wt in r[1].split():
                wordlist.append(wt.split('/')[0])
        else:
            # No title: fall back to the first maxn content keywords.
            i = 0
            for ww in r[2].split():
                wordlist.append(ww.split('/')[0])
                i += 1
                if i == maxn:
                    break
        kwstr = ' '.join(wordlist)
        dbutils.updateByPK(self.dbcon, 'document',
                           {'kw_title': kwstr}, {'docid': r[0]})
    self.dbcon.commit()

def content_keyword(self):
    print 'extracting keyword from content...'
    doc_num = dbutils.countOfRecs(self.dbcon, 'document')
    cnt = 0
    # eluate = WordWeightEvaluation(30)
    for r in dbutils.iterRec(self.dbcon, 'document', 'docid title content'):
        word_weight_list = self.evaluator.extract_kw(r[1], r[2])
        wordwstr = ' '.join(['%s/%.7f' % idw for idw in word_weight_list])
        dbutils.updateByPK(self.dbcon, 'document',
                           {'kw_content': wordwstr}, {'docid': r[0]})
        cnt += 1
        if cnt % 20 == 0:
            utils.updateProgress(cnt, doc_num)
    print ''
    # eluate.close()
    self.dbcon.commit()

def title_df(dbcon):
    print 'counting word document frequency...'
    doc_num = dbutils.countOfRecs(dbcon, 'document')
    cnt = 0
    # docid is fetched as well so the assertion below can name the document;
    # the original selected only kw_title, so its message indexed past the row.
    for r in dbutils.iterRec(dbcon, 'document', 'docid kw_title'):
        title_set = set(r[1].split())
        for w in title_set:
            df_r = dbutils.queryOneRec(dbcon, 'word', 't_df', 'word=?', (w,))
            assert df_r is not None, \
                "word '%s' from document %s is missing from the word table" % (w, r[0])
            dbutils.updateByPK(dbcon, 'word', {'t_df': df_r[0] + 1}, {'word': w})
        cnt += 1
        if cnt % 50 == 0:
            utils.updateProgress(cnt, doc_num)
    print ''
    dbcon.commit()

def word_preproc(dbcon, commonwordfile='../data/commonword'):
    """Count word document frequency and populate the word table; common
    words are skipped (removal of high-DF words is disabled below)."""
    noun_dict, verb_dict = dict(), dict()
    commonword_set = load_commonword(commonwordfile)
    for r in dbutils.iterRec(dbcon, 'document', 'title content'):
        wordset = set(' '.join((r[0], r[1])).split())
        for wt in wordset:
            w, t = wt.split('/')
            if w in commonword_set:
                continue
            if t == 'n':
                try:
                    noun_dict[w] += 1
                except KeyError:
                    noun_dict[w] = 1
            else:  # t == 'v'
                try:
                    verb_dict[w] += 1
                except KeyError:
                    verb_dict[w] = 1
    # A word tagged both noun and verb is kept as a noun with merged counts.
    for w in noun_dict.keys():
        if w in verb_dict:
            noun_dict[w] += verb_dict[w]
            del verb_dict[w]
    # Sort by df in descending order.
    noun_list = noun_dict.items()
    noun_list.sort(cmp=lambda l, r: r[1] - l[1])
    # Remove high-df words (disabled):
    # start = int(0.003*len(noun_list))
    # noun_list = noun_list[start:]
    dbcon.executemany("insert into word(word,c_df,pos) values(?,?,'n')", noun_list)
    verb_list = verb_dict.items()
    verb_list.sort(cmp=lambda l, r: r[1] - l[1])
    # start = int(0.005*len(verb_list))
    # verb_list = verb_list[start:]
    dbcon.executemany("insert into word(word,c_df,pos) values(?,?,'v')", verb_list)
    dbcon.commit()

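# A plausible sketch of the `word` table this module expects, inferred from
# the insert/update calls above (word, c_df, pos, plus the t_df counter used
# by title_df). The real schema is defined elsewhere; column types are guesses.
def _create_word_table(dbcon):
    dbcon.execute("""create table if not exists word(
        word text primary key,
        c_df integer,           -- document frequency over title+content
        t_df integer default 0, -- document frequency over title keywords
        pos  text               -- 'n' (noun) or 'v' (verb)
    )""")
    dbcon.commit()
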
def title_keyword(dbcon):
    print 'extracting keyword from title...'
    doc_num = dbutils.countOfRecs(dbcon, 'document')
    wordset = load_wordset(dbcon)
    cnt = 0
    for r in dbutils.iterRec(dbcon, 'document', 'docid title'):
        twords = set()
        # Keep only title words that survived word_preproc filtering.
        for wt in r[1].split(' '):
            w = wt.split('/')[0]
            if w in wordset:
                twords.add(w)
        widstr = ' '.join(twords)
        dbutils.updateByPK(dbcon, 'document', {'kw_title': widstr}, {'docid': r[0]})
        cnt += 1
        if cnt % 50 == 0:
            utils.updateProgress(cnt, doc_num)
    print ''
    dbcon.commit()

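# A hedged sketch of the preprocessing order implied by the functions above:
# build the word table, pick title keywords, count title DF, then store
# co-occurrence pairs. The actual driver lives elsewhere in the project.
def _preprocess_pipeline(dbfile='../data/cn-topic.db'):
    dbcon = dbutils.connect_db(dbfile)
    word_preproc(dbcon)
    title_keyword(dbcon)
    title_df(dbcon)
    title_wordpair(dbcon)
    dbutils.close_db(dbcon)
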
def iter_title_words(self):
    dbcon = dbutils.connect_db(self.dbfile)
    for r in dbutils.iterRec(dbcon, 'document', 'kw_title'):
        # Weights are irrelevant here, so every word gets a placeholder 0.
        yield Document(0, {w: 0 for w in r[0].split(' ')})
    dbcon.close()

def load_wordset(dbcon):
    wordset = set()
    for r in dbutils.iterRec(dbcon, 'word', 'word'):
        wordset.add(r[0])
    return frozenset(wordset)