Example #1
0
def transfer_data_file(dbfile, outputfile):
    """Export per-document keyword weights to a tab-separated matrix file.

    Row 1 lists every distinct keyword found in any document's kw_content
    field, row 2 marks each column as continuous ('c'), and each later row
    holds one document's weight per keyword (0.0 when the document does
    not contain that word).
    """
    con = dbutils.connect_db(dbfile)
    try:
        # First pass: collect the vocabulary (word -> occurrence count).
        words = dict()
        for rec in dbutils.iterRec(con, 'document', 'kw_content'):
            for token in rec[0].split(' '):
                word, _weight = token.split('/')
                words[word] = words.get(word, 0) + 1

        out = open(outputfile, 'w')
        try:
            # Header row (word names) followed by the column-type row.
            out.write('\t'.join(words.keys()).encode('utf8'))
            out.write('\r\n')
            out.write('\t'.join(['c'] * len(words)))
            out.write('\r\n\r\n')

            # Second pass: one weight row per document.  The dict keys are
            # never mutated between the passes, so iteration order matches
            # the header row's column order.
            for rec in dbutils.iterRec(con, 'document', 'kw_content'):
                for w in words:
                    words[w] = 0.0
                for token in rec[0].split(' '):
                    word, weight = token.split('/')
                    if word in words:
                        words[word] = float(weight)
                out.write('\t'.join([str(v) for v in words.values()]))
                out.write('\r\n')
        finally:
            # BUG FIX: the old code leaked the output file (and the DB
            # connection below) when any exception occurred mid-export.
            out.close()
    finally:
        con.close()
Example #2
0
def transfer_data_file(dbfile, outputfile):
    """Dump keyword weights from the 'document' table as a TSV matrix."""
    con = dbutils.connect_db(dbfile)

    # Build the vocabulary from every document's kw_content field.
    vocab = dict()
    for rec in dbutils.iterRec(con, 'document', 'kw_content'):
        for token in rec[0].split(' '):
            w, weight = token.split('/')
            vocab[w] = vocab.get(w, 0) + 1

    out = open(outputfile, 'w')
    # Header: word names, then one 'c' (continuous) marker per column.
    out.write('\t'.join(vocab.keys()).encode('utf8'))
    out.write('\r\n')
    out.write('\t'.join('c' for _ in vocab))
    out.write('\r\n\r\n')

    # One weight row per document, columns in the header's order.
    for rec in dbutils.iterRec(con, 'document', 'kw_content'):
        for w in vocab:
            vocab[w] = 0.0
        for token in rec[0].split(' '):
            w, weight = token.split('/')
            if w in vocab:
                vocab[w] = float(weight)
        out.write('\t'.join(str(v) for v in vocab.values()))
        out.write('\r\n')

    out.close()
    con.close()
Example #3
0
def title_wordpair(db_con):
    """Count word co-occurrence within titles and insert rows into t_wordpair."""
    print('statistic word co-ocurence from title...')

    codict = dict()  # (w1, w2) -> co-occurrence count, pair stored sorted
    dfdict = dict()  # word -> number of titles containing it
    doc_cnt = 0
    for rec in dbutils.iterRec(db_con, 'document', 'kw_title'):
        title_words = rec[0].split(' ')
        for pair in itertools.combinations(title_words, 2):
            if pair[0] > pair[1]:
                pair = (pair[1], pair[0])  # canonical ordering of the pair
            codict[pair] = codict.get(pair, 0) + 1
        for w in title_words:
            dfdict[w] = dfdict.get(w, 0) + 1

        doc_cnt += 1
    print('doc num: %d' % doc_cnt)
    print('number of wordpair in title_wordpair: %d' % len(codict))

    edge_cnt = 0
    for pair, co in codict.items():
        # Normalise the raw count by the geometric mean of the two words'
        # title document frequencies.
        weight = co / math.sqrt(dfdict[pair[0]] * dfdict[pair[1]])
        if co >= 2 and weight > 1e-3:
            edge_cnt += 1  # counted only; every pair is inserted regardless
        dbutils.insert(db_con, 't_wordpair', {
            'word1': pair[0],
            'word2': pair[1],
            'coocur_num': co,
            'weight': weight
        })
    print('number of edges ' + str(edge_cnt))
    db_con.commit()
    def iter_document(self):
        """Yield one Document per row of the 'document' table."""
        conn = dbutils.connect_db(self.dbfile)
        for rec in dbutils.iterRec(conn, 'document', 'docid kw_content'):
            # rec[1] holds space-separated 'word/weight' tokens.
            weights = {}
            for token in rec[1].split():
                parts = token.split('/')
                weights[parts[0]] = float(parts[1])
            yield Document(rec[0], weights)

        conn.close()
    def iter_document(self):
        """Generator over all documents, built from docid and kw_content."""
        conn = dbutils.connect_db(self.dbfile)
        for rec in dbutils.iterRec(conn, 'document', 'docid kw_content'):
            # Parse 'word/weight' tokens into a word -> weight mapping.
            word_weights = dict(
                (tok.split('/')[0], float(tok.split('/')[1]))
                for tok in rec[1].split())
            yield Document(rec[0], word_weights)

        conn.close()
Example #6
0
def loadTitleWordnet(dbfile='../data/cn-topic.db', min_coocur=2, min_weight=1e-3):
    """Load the t_wordpair table as an igraph graph.

    Only rows with coocur_num >= min_coocur and weight > min_weight become
    edges; vertices are created implicitly from the edge dictionaries.
    """
    dbcon = dbutils.connect_db(dbfile)

    edgelist = []
    for rec in dbutils.iterRec(dbcon, 't_wordpair',
                               ['word1', 'word2', 'weight'],
                               'coocur_num>=? and weight>?',
                               (min_coocur, min_weight)):
        edgelist.append({'source': rec[0], 'target': rec[1], 'weight': rec[2]})
    dbutils.close_db(dbcon)
    return igraph.Graph.DictList(vertices=None, edges=edgelist)
 def title_keyword(self, maxn=5):
     """Fill kw_title for every document.

     Uses the words of the (pre-tokenised) title when one exists,
     otherwise falls back to the first maxn content keywords.
     """
     for rec in dbutils.iterRec(self.dbcon, 'document', 'docid title kw_content'):
         if rec[1]:
             # title tokens look like 'word/tag'; keep only the word part
             keywords = [tok.split('/')[0] for tok in rec[1].split()]
         else:
             # no title: take up to maxn keywords from kw_content
             keywords = [tok.split('/')[0] for tok in rec[2].split()[:maxn]]
         dbutils.updateByPK(self.dbcon, 'document',
                            {'kw_title': ' '.join(keywords)},
                            {'docid': rec[0]})
     self.dbcon.commit()
    def content_keyword(self):
        """Extract keywords from each document and store them in kw_content."""
        print('extracting keyword from content...')
        total = dbutils.countOfRecs(self.dbcon, 'document')
        done = 0
        for rec in dbutils.iterRec(self.dbcon, 'document', 'docid title content'):
            # The evaluator yields (word, weight) pairs for this document;
            # they are serialised as space-separated 'word/weight' tokens.
            kw_pairs = self.evaluator.extract_kw(rec[1], rec[2])
            encoded = ' '.join(['%s/%.7f' % pair for pair in kw_pairs])
            dbutils.updateByPK(self.dbcon, 'document',
                               {'kw_content': encoded}, {'docid': rec[0]})

            done += 1
            if done % 20 == 0:
                utils.updateProgress(done, total)

        print('')
        self.dbcon.commit()
def title_df(dbcon):
    """Accumulate title document frequency into word.t_df.

    For every document, each distinct word of kw_title increments that
    word's t_df counter in the 'word' table.  Every title word is expected
    to already exist in the 'word' table.
    """
    print('statistic word document frequency...')
    doc_num = dbutils.countOfRecs(dbcon, 'document')
    cnt = 0

    for r in dbutils.iterRec(dbcon, 'document', 'kw_title'):
        # A set ensures a word repeated inside one title counts only once.
        title_set = set(r[0].split())
        for w in title_set:
            df_r = dbutils.queryOneRec(dbcon, 'word', 't_df', 'word=?', (w,))
            # BUG FIX: the old message indexed r[2], but only one column
            # (kw_title) is selected, so formatting the message itself
            # raised IndexError and masked the real failure.
            assert df_r is not None, "word '%s' missing from 'word' table" % w
            dbutils.updateByPK(dbcon, 'word', {'t_df': df_r[0] + 1}, {'word': w})

        cnt += 1
        if cnt % 50 == 0:
            utils.updateProgress(cnt, doc_num)
    print('')
    dbcon.commit()
Example #10
0
def title_df(dbcon):
    """Update word.t_df with each word's title document frequency.

    Iterates every document's kw_title and bumps the t_df counter of each
    distinct word; the word row must already exist in the 'word' table.
    """
    print('statistic word document frequency...')
    doc_num = dbutils.countOfRecs(dbcon, 'document')
    cnt = 0

    for r in dbutils.iterRec(dbcon, 'document', 'kw_title'):
        # Deduplicate words within a single title.
        title_set = set(r[0].split())
        for w in title_set:
            df_r = dbutils.queryOneRec(dbcon, 'word', 't_df', 'word=?', (w, ))
            # BUG FIX: the previous assert message referenced r[2] even
            # though the query selects a single column, so a failed assert
            # crashed with IndexError instead of reporting the word.
            assert df_r is not None, "word '%s' not found in 'word' table" % w
            dbutils.updateByPK(dbcon, 'word', {'t_df': df_r[0] + 1},
                               {'word': w})

        cnt += 1
        if cnt % 50 == 0:
            utils.updateProgress(cnt, doc_num)
    print('')
    dbcon.commit()
Example #11
0
def word_preproc(dbcon, commonwordfile='../data/commonword'):
    """Count content document frequency for nouns and verbs and store them.

    Words from the common-word list are skipped.  A word tagged both as
    noun and verb is merged into the noun table with the combined count.
    """
    noun_df = dict()
    verb_df = dict()
    commonword_set = load_commonword(commonwordfile)
    for rec in dbutils.iterRec(dbcon, 'document', 'title content'):
        # Distinct 'word/tag' tokens of title + content: one count per doc.
        for wt in set(' '.join((rec[0], rec[1])).split()):
            w, tag = wt.split('/')
            if w in commonword_set:
                continue
            target = noun_df if tag == 'n' else verb_df
            target[w] = target.get(w, 0) + 1

    # A word used as both noun and verb is counted once, as a noun.
    for w in noun_df.keys():
        if w in verb_df:
            noun_df[w] += verb_df.pop(w)

    # Insert in descending document-frequency order.
    noun_rows = noun_df.items()
    noun_rows.sort(key=lambda wr: wr[1], reverse=True)
    dbcon.executemany("insert into word(word,c_df,pos) values(?,?,'n')",
                      noun_rows)

    verb_rows = verb_df.items()
    verb_rows.sort(key=lambda wr: wr[1], reverse=True)
    dbcon.executemany("insert into word(word,c_df,pos) values(?,?,'v')",
                      verb_rows)

    dbcon.commit()
Example #12
0
def loadTitleWordnet(dbfile='../data/cn-topic.db',
                     min_coocur=2,
                     min_weight=1e-3):
    """Load the title word-pair table as an igraph graph.

    Rows failing the co-occurrence / weight thresholds are filtered out by
    the SQL where-clause before any edge is created.
    """
    conn = dbutils.connect_db(dbfile)

    rows = dbutils.iterRec(conn, 't_wordpair', ['word1', 'word2', 'weight'],
                           'coocur_num>=? and weight>?',
                           (min_coocur, min_weight))
    edgelist = [{'source': r[0], 'target': r[1], 'weight': r[2]} for r in rows]
    dbutils.close_db(conn)
    return igraph.Graph.DictList(vertices=None, edges=edgelist)
def word_preproc(dbcon, commonwordfile='../data/commonword'):
    """Build the 'word' table with content document frequencies (c_df).

    Nouns and verbs are tallied separately; a word appearing under both
    tags is folded into the noun entry.  Common words are ignored.
    """
    nouns, verbs = {}, {}
    stopwords = load_commonword(commonwordfile)
    for rec in dbutils.iterRec(dbcon, 'document', 'title content'):
        # One count per document: deduplicate the 'word/tag' tokens first.
        seen = set(' '.join((rec[0], rec[1])).split())
        for token in seen:
            word, tag = token.split('/')
            if word in stopwords:
                continue
            bucket = nouns if tag == 'n' else verbs
            try:
                bucket[word] += 1
            except KeyError:
                bucket[word] = 1

    # Merge dual-tagged words into the noun counts.
    for word in nouns.keys():
        if word in verbs:
            nouns[word] += verbs[word]
            del verbs[word]

    # Bulk-insert in descending-DF order.
    noun_items = sorted(nouns.items(), key=lambda it: it[1], reverse=True)
    dbcon.executemany("insert into word(word,c_df,pos) values(?,?,'n')",
                      noun_items)

    verb_items = sorted(verbs.items(), key=lambda it: it[1], reverse=True)
    dbcon.executemany("insert into word(word,c_df,pos) values(?,?,'v')",
                      verb_items)

    dbcon.commit()
def title_keyword(dbcon):
    """Store each document's in-vocabulary title words into kw_title."""
    print('extrating keyword from title...')
    doc_num = dbutils.countOfRecs(dbcon, 'document')

    vocab = load_wordset(dbcon)
    processed = 0
    for rec in dbutils.iterRec(dbcon, 'document', 'docid title'):
        # Keep only title words present in the word table; the set also
        # removes duplicates within one title.
        kept = set()
        for token in rec[1].split(' '):
            word = token.split('/')[0]
            if word in vocab:
                kept.add(word)

        dbutils.updateByPK(dbcon, 'document',
                           {'kw_title': ' '.join(kept)}, {'docid': rec[0]})

        processed += 1
        if processed % 50 == 0:
            utils.updateProgress(processed, doc_num)

    print('')
    dbcon.commit()
Example #15
0
def title_keyword(dbcon):
    """Write the kw_title column: title words filtered by the word table."""
    print('extrating keyword from title...')
    total = dbutils.countOfRecs(dbcon, 'document')

    known_words = load_wordset(dbcon)
    done = 0
    for rec in dbutils.iterRec(dbcon, 'document', 'docid title'):
        # Distinct title words that exist in the loaded vocabulary.
        kept = {tok.split('/')[0] for tok in rec[1].split(' ')
                if tok.split('/')[0] in known_words}

        dbutils.updateByPK(dbcon, 'document',
                           {'kw_title': ' '.join(kept)},
                           {'docid': rec[0]})

        done += 1
        if done % 50 == 0:
            utils.updateProgress(done, total)

    print('')
    dbcon.commit()
def title_wordpair(db_con):
    """Insert title word co-occurrence statistics into t_wordpair."""
    print('statistic word co-ocurence from title...')

    pair_count = dict()  # canonical (min, max) word pair -> count
    word_df = dict()     # word -> title document frequency
    ndocs = 0
    for rec in dbutils.iterRec(db_con, 'document', 'kw_title'):
        title_words = rec[0].split(' ')
        for a, b in itertools.combinations(title_words, 2):
            key = (a, b) if a <= b else (b, a)
            pair_count[key] = pair_count.get(key, 0) + 1
        for w in title_words:
            word_df[w] = word_df.get(w, 0) + 1

        ndocs += 1
    print('doc num: %d' % ndocs)
    print('number of wordpair in title_wordpair: %d' % len(pair_count))

    strong = 0
    for (w1, w2), co in pair_count.items():
        # Normalise by the geometric mean of the two document frequencies.
        weight = co / math.sqrt(word_df[w1] * word_df[w2])
        if co >= 2 and weight > 1e-3:
            strong += 1  # counted only; every pair is inserted regardless
        dbutils.insert(db_con, 't_wordpair',
                       {'word1': w1, 'word2': w2,
                        'coocur_num': co, 'weight': weight})
    print('number of edges ' + str(strong))
    db_con.commit()
    def iter_title_words(self):
        """Yield a dummy Document (docid 0) per row, holding its title words."""
        conn = dbutils.connect_db(self.dbfile)
        for rec in dbutils.iterRec(conn, 'document', 'kw_title'):
            words = rec[0].split(' ')
            yield Document(0, dict.fromkeys(words, 0))

        conn.close()
    def iter_title_words(self):
        """Generator over title-word Documents; every word weight is zero."""
        conn = dbutils.connect_db(self.dbfile)
        for rec in dbutils.iterRec(conn, 'document', 'kw_title'):
            zero_weights = {}
            for w in rec[0].split(' '):
                zero_weights[w] = 0
            yield Document(0, zero_weights)

        conn.close()
Example #19
0
def load_wordset(dbcon):
    """Return all words from the 'word' table as an immutable set."""
    return frozenset(r[0] for r in dbutils.iterRec(dbcon, 'word', 'word'))
def load_wordset(dbcon):
    """Fetch the vocabulary stored in the 'word' table as a frozenset."""
    collected = set()
    for rec in dbutils.iterRec(dbcon, 'word', 'word'):
        collected.add(rec[0])
    return frozenset(collected)