Example 1
def transform_docs():
    docs = readfile(corpus_path)
    outf = open(new_docs_path, 'w')
    voca = readvoca()
    outf.write("# docs num %d,voca num:%d\n" % (len(docs), len(voca)))
    for i, doc in enumerate(docs):
        doc = doc.split()[-1]   # input format has been changed
        doc_wn = {}             # term frequency of each word in this doc
        for ch in doc:
            if is_ch_char(ch) and ch in voca:
                v_id = voca.index(ch)
                if v_id in doc_wn:
                    doc_wn[v_id] += 1  # word seen again
                else:
                    doc_wn[v_id] = 1   # first occurrence of the word
        # write doc_wn as "doc_id<TAB>word_id:count,word_id:count,..."
        # print("write new doc:%d" % i)
        words_li = ["%d:%d" % (w, n) for w, n in doc_wn.items()]
        words_str = ','.join(words_li)
        outf.write("%d\t%s\n" % (i, words_str))
    outf.close()
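
A note on the lookup: transform_docs checks ch in voca and then calls voca.index(ch) for every Chinese character, which scans the whole vocabulary list twice per character. Assuming readvoca() returns a plain list of characters (which the index() call implies), a dictionary built once before the loop gives constant-time lookups. A minimal sketch of that variant; voca_id is a name introduced here, not part of the original:

    # Inside transform_docs, right after voca = readvoca():
    voca_id = {ch: idx for idx, ch in enumerate(voca)}   # char -> vocabulary id

    # ...and inside the per-character loop:
    for ch in doc:
        if is_ch_char(ch):
            v_id = voca_id.get(ch)       # None if ch is not in the vocabulary
            if v_id is not None:
                doc_wn[v_id] = doc_wn.get(v_id, 0) + 1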
Example 2
def collect_voca():
    voca = {}
    docs = readfile(corpus_path)
    for doc in docs:
        doc = doc.split()[-1]   # input format has been changed
        for char in doc:
            if is_ch_char(char):     # only take care of Chinese chars
                if char in voca:
                    voca[char] += 1  # count occurrences of the char
                else:
                    voca[char] = 1
    # write voca to file, most frequent char first
    outf = open(voca_path, 'w', encoding="gb2312")
    voca_li = sorted(voca.items(), key=lambda d: d[1], reverse=True)
    for i, (w, n) in enumerate(voca_li):
        outf.write("%d\t%s\t%d\n" % (i, w, n))
    outf.close()
    print("[finished] write voca to %s" % voca_path)