def LabelDicts(labelNum, fileNum, featureNum):
    """Build the dictionary of the highest-valued features for one label.

    The per-file results of ``featureSelected.featureDict(labelNum, i)`` for
    ``i`` in ``0 .. fileNum - 1`` are summed into a ``util.Counter``; the
    accumulated entries are ranked by value (descending) and the first
    ``featureNum`` of them are kept.

    Args:
        labelNum: label identifier forwarded to ``featureSelected.featureDict``.
        fileNum: number of files to accumulate (indices ``0 .. fileNum - 1``).
        featureNum: maximum number of features to keep.  When fewer distinct
            features exist, all of them are returned instead of raising
            ``IndexError``.

    Returns:
        A plain ``dict`` mapping each selected feature to its accumulated value.
    """
    total = util.Counter()
    for fileIndex in range(fileNum):
        total += featureSelected.featureDict(labelNum, fileIndex)
        print("%d.txt" % fileIndex)
    print("legth of dict label %d" % len(total))

    ranked = sorted(total.items(), key=lambda e: e[1], reverse=True)
    # Slicing (clamped at 0 so a negative count still yields an empty dict)
    # tolerates featureNum > len(ranked), where indexing used to fail.
    selected = dict(ranked[:max(featureNum, 0)])

    print("features are as follows:")
    print(json.dumps(selected, encoding='UTF-8', ensure_ascii=False))
    return selected
def fileToDict(labelNum=2, num=765):
    """Print one file's features next to the stored dictionary of its label.

    Fetches the features of file ``num`` under label ``labelNum`` from
    ``featureSelected.featureDict``, ranks them by value (descending) and
    prints them as JSON.  Then loads the pickled dictionary stored at
    ``../data/trainData/Dict/<labelNum>.txt`` and prints it as JSON too.

    Args:
        labelNum: label identifier; also selects the pickled dictionary file.
        num: file index forwarded to ``featureSelected.featureDict``.

    Returns:
        The file's ``(feature, value)`` pairs as a list sorted by value,
        highest first.
    """
    fileFeatures = featureSelected.featureDict(labelNum, num)
    ranked = sorted(fileFeatures.items(), key=lambda e: e[1], reverse=True)
    # Heading reads "words extracted from the text".
    print("文本单词提取")
    print(json.dumps(ranked, encoding='UTF-8', ensure_ascii=False))

    dictPath = '../data/trainData/Dict/%d.txt' % labelNum
    # ``with`` closes the file even if unpickling raises.
    # NOTE: pickle must only be used on trusted, locally produced files.
    with open(dictPath, 'rb') as dictFile:
        labelDict = pickle.load(dictFile)

    # Heading reads "dictionary:".
    print("字典:")
    print(json.dumps(labelDict, encoding='UTF-8', ensure_ascii=False))
    return ranked
def fileToVector(labelNum=2, num=765):
    """Turn one file into a vector over the stored dictionary of its label.

    The pickled dictionary at ``../data/trainData/Dict/<labelNum>.txt`` is
    loaded and its entries are ordered by value (descending).  For every
    dictionary key, the vector holds the value that
    ``featureSelected.featureDict(labelNum, num)`` reports for that key, or
    ``0`` when the file does not contain it.  The vector is printed.

    Args:
        labelNum: label identifier; also selects the pickled dictionary file.
        num: file index forwarded to ``featureSelected.featureDict``.

    Returns:
        The vector as a list, one element per dictionary entry.  (Earlier
        versions only printed it and returned ``None``.)
    """
    fileFeatures = featureSelected.featureDict(labelNum, num)

    dictPath = '../data/trainData/Dict/%d.txt' % labelNum
    # ``with`` closes the file even if unpickling raises.
    # NOTE: pickle must only be used on trusted, locally produced files.
    with open(dictPath, 'rb') as dictFile:
        labelDict = pickle.load(dictFile)

    ordered = sorted(labelDict.items(), key=lambda e: e[1], reverse=True)
    vector = [
        fileFeatures[key] if key in fileFeatures else 0
        for key, _ in ordered
    ]

    # Message reads "text vectorization has finished".
    print("文本向量化已经完成")
    print(vector)
    return vector
def saveWordTextDict(label, fileNum):
    """Count, per dictionary entry, the files that contain it, and pickle it.

    Loads the pickled dictionary at ``../data/trainData/Dict/<label>.txt``,
    creates a ``util.Counter`` with one zero-initialised slot per dictionary
    entry, increments a slot once for every file index in
    ``0 .. fileNum - 1`` whose ``featureSelected.featureDict(label, i)``
    result contains that entry, and writes the counter to
    ``../data/trainData/Dict/words<label>.txt``.

    Args:
        label: label identifier; selects both the input and the output file.
        fileNum: number of files to scan (indices ``0 .. fileNum - 1``).

    Returns:
        None.  The result is written to disk.
    """
    dictPath = '../data/trainData/Dict/%d.txt' % label
    outPath = '../data/trainData/Dict/words%d.txt' % label

    # ``with`` closes the file even if unpickling raises.
    # NOTE: pickle must only be used on trusted, locally produced files.
    with open(dictPath, 'rb') as dictFile:
        labelDict = pickle.load(dictFile)

    entries = labelDict.items()
    wordDict = util.Counter()
    for item in entries:
        wordDict[item] = 0

    # NOTE(review): ``item`` is a whole (key, value) pair, so the membership
    # test below looks the pair up among the keys of the per-file result.
    # fileToVector tests ``item[0]`` instead, which suggests the key alone was
    # intended here and that the counts may currently stay at 0.  Left
    # unchanged because fixing it alters the keys of the pickled output --
    # confirm against the readers of words<label>.txt.
    # NOTE(review): featureDict is re-evaluated for every (entry, file) pair;
    # caching it per file would be cheaper if it has no side effects.
    for item in entries:
        print('item %s' % (item,))
        for fileIndex in range(fileNum):
            fileFeatures = featureSelected.featureDict(label, fileIndex)
            if item in fileFeatures:
                wordDict[item] += 1

    print("saveWordTextDict has finished!")
    with open(outPath, 'wb') as outFile:
        pickle.dump(wordDict, outFile)
#-*- coding = utf-8 -*-
#HuangYao THU
#12-9-2013
import featureSelected
import json

# Demo: print the (at most) 200 highest-valued features of file 450, label 2.
labelNum = 2
num = 450
result = featureSelected.featureDict(labelNum, num)
sort = sorted(result.items(), key=lambda e: e[1], reverse=True)

# Never read past the end when the file has fewer than 200 features.
length = min(200, len(sort))
res = []
for i in range(length):
    res.append((sort[i][0], sort[i][1]))

print('txt features are as follows:')
print(json.dumps(res, encoding='UTF-8', ensure_ascii=False))