def linesfilter(postName, commentName, postfix, maxlen=200):
    """Filter a line-aligned post/comment file pair and write cleaned copies.

    Both files are read completely and must contain the same number of
    lines. The lines are passed through ``_getTextList`` (defined elsewhere;
    presumably it extracts the text field of each line -- confirm against
    its definition). A post/comment pair is dropped when either text is
    longer than ``maxlen``. Each remaining text is passed through
    ``Filter.urlFilter`` and then ``Filter.spaceFilter`` and written with a
    trailing newline.

    Args:
        postName: Path of the post file to read.
        commentName: Path of the comment file to read.
        postfix: String appended to each input path to form the output path.
        maxlen: Maximum allowed ``len()`` of a post or comment text; pairs
            exceeding it are skipped. Defaults to 200.

    Returns:
        None. The results are written to ``postName + postfix`` and
        ``commentName + postfix``.

    Raises:
        AssertionError: If the two input files differ in line count.
    """
    # Context managers guarantee the input handles are closed even when
    # reading fails part-way through.
    with open(postName) as fpost, open(commentName) as fcomment:
        postLine = fpost.readlines()
        commentLine = fcomment.readlines()

    # The files are processed pairwise, so they must be line-aligned.
    assert len(postLine) == len(commentLine), \
        'post and comment files must have the same number of lines'

    postTextLines = _getTextList(postLine)
    commentTextLines = _getTextList(commentLine)

    def _clean(text):
        # Strip URLs first, then normalise whitespace, then restore the
        # line terminator expected by writelines().
        return Filter.spaceFilter(Filter.urlFilter(text)) + '\n'

    postLineWrite = []
    commentLineWrite = []
    for i, postText in enumerate(postTextLines):
        # Indexed lookup (rather than zip) keeps the IndexError if the two
        # text lists ever disagree in length instead of silently truncating.
        commentText = commentTextLines[i]
        if len(postText) > maxlen or len(commentText) > maxlen:
            # Over-long pairs are dropped; only the post side is reported.
            print('%s %s' % (postText, len(postText)))
            print('%s %s' % ('too long ', i))
            continue
        postLineWrite.append(_clean(postText))
        commentLineWrite.append(_clean(commentText))

    filteredPostName = postName + postfix
    filteredCommentName = commentName + postfix
    print(filteredPostName)

    # Closing the outputs explicitly ensures the data is flushed to disk
    # before the function returns.
    with open(filteredPostName, 'w') as fpost:
        with open(filteredCommentName, 'w') as fcomment:
            fpost.writelines(postLineWrite)
            fcomment.writelines(commentLineWrite)
# Script section: build the corpus paths from the configuration, run the
# line filter over the post/comment pair, then load the (unfiltered) comment
# file and print one sample entry before and after URL filtering, followed
# by its jieba segmentation.
stcpath = os.path.join(cfg.ROOT_DIR, cfg.DATAPATH)
postFile = os.path.join(stcpath, cfg.POST_FILENAME)
commentFile = os.path.join(stcpath, cfg.COMMENT_FILENAME)
filterPostfix = cfg.FILTER_POSTFIX

print(stcpath)
print(postFile)
print(commentFile)

linesfilter(postFile, commentFile, filterPostfix)
print('filer ok')

# Read the comment file; the context manager closes the handle once the
# lines are in memory. (To inspect posts instead, open postFile here.)
with open(commentFile, 'r') as f:
    lines = f.readlines()

_data = edict()

# Map the first tab-separated field of each line to the second field.
dicts = {}
for line in lines:
    temp = line.strip().split("\t")
    dicts[temp[0]] = temp[1]

# Sample lookup of one hard-coded comment id (for the post file the
# corresponding sample key was 'repos-post-1000008610').
aa = dicts['repos-cmnt-1000919120']
print(aa)
aa = Filter.urlFilter(aa)
print(aa)

# Print each segment produced by jieba on its own line.
segs = jieba.cut(aa)
for w in segs:
    print(w)