Esempio n. 1
0
def linesfilter(postName, commentName, postfix, maxlen=200):
    """Filter a parallel post/comment file pair and write the cleaned copies.

    Both files are read completely and must contain the same number of
    lines.  The text of every line is extracted with ``_getTextList``.  A
    pair is dropped when either side is longer than ``maxlen``; every
    surviving text is passed through ``Filter.urlFilter`` and then
    ``Filter.spaceFilter``.  The results are written, one per line, to
    ``postName + postfix`` and ``commentName + postfix``.

    Args:
        postName: path of the post file to read.
        commentName: path of the comment file to read.
        postfix: string appended to each input path to form the output path.
        maxlen: maximum allowed ``len()`` of a post or comment text; pairs
            exceeding it on either side are skipped (default 200).

    Raises:
        AssertionError: if the two input files differ in line count.
    """
    # Context managers guarantee the handles are closed (the original
    # leaked all four file objects).
    with open(postName) as fpost, open(commentName) as fcomment:
        postLine = fpost.readlines()
        commentLine = fcomment.readlines()

    # The files are expected to be line-aligned: line i of one belongs
    # with line i of the other.
    assert (len(postLine) == len(commentLine))

    postTextLines = _getTextList(postLine)
    commentTextLines = _getTextList(commentLine)

    postLineWrite = []
    commentLineWrite = []
    for i, postText in enumerate(postTextLines):
        commentText = commentTextLines[i]
        # Drop the whole pair when either side is too long, so the two
        # output files stay aligned.
        if len(postText) > maxlen or len(commentText) > maxlen:
            # Single-argument print calls produce the same output under
            # the Python 2 print statement and the Python 3 function.
            print('%s %s' % (postText, len(postText)))
            print('too long  %s' % i)
            continue

        templine = Filter.spaceFilter(Filter.urlFilter(postText))
        postLineWrite.append(templine + '\n')

        templine = Filter.spaceFilter(Filter.urlFilter(commentText))
        commentLineWrite.append(templine + '\n')

    filteredPostName = postName + postfix
    filteredCommentName = commentName + postfix
    print(filteredPostName)
    # Closing via ``with`` also flushes the buffered output to disk before
    # the function returns.
    with open(filteredPostName, 'w') as fpost, \
            open(filteredCommentName, 'w') as fcomment:
        fpost.writelines(postLineWrite)
        fcomment.writelines(commentLineWrite)
Esempio n. 2
0
    # NOTE(review): the enclosing ``def`` line is not visible in this
    # excerpt; the comments below describe only the statements shown.

    # Build the data directory and the post/comment file paths from cfg.
    stcpath = os.path.join(cfg.ROOT_DIR, cfg.DATAPATH)
    postFile = os.path.join(stcpath, cfg.POST_FILENAME)
    commentFile = os.path.join(stcpath, cfg.COMMENT_FILENAME)
    filterPostfix = cfg.FILTER_POSTFIX
    print stcpath
    print postFile
    print commentFile

    # Write filtered copies of both files next to the originals
    # (original path + filterPostfix).
    linesfilter(postFile, commentFile, filterPostfix)
    print 'filer ok'

    # Re-read the ORIGINAL comment file (not the filtered copy written
    # above) and map the first tab-separated field of each line to the
    # second.
    # NOTE(review): ``f`` is never closed in the visible code.
    #f = open(postFile, 'r')
    f = open(commentFile, 'r')
    lines = f.readlines()
    # NOTE(review): ``_data`` is not used in the lines visible here.
    _data = edict()
    dicts = {}
    for line in lines:
        #temp = line.decode('utf-8').strip().split("\r")
        # A line with no tab leaves ``temp`` with one element, so
        # ``temp[1]`` raises IndexError.
        temp = line.strip().split("\t")
        dicts[temp[0]] = temp[1]

    # Spot check on one hard-coded id: print its text before and after URL
    # filtering, then print each segment produced by jieba.cut on its own
    # line.  A missing id raises KeyError.
    #aa = dicts['repos-post-1000008610']
    aa = dicts['repos-cmnt-1000919120']
    print aa
    aa = Filter.urlFilter(aa)
    print aa
    segs = jieba.cut(aa)
    for w in segs:
        print w