def CalcTFIDF(self):
    """Compute TF-IDF weights for the corpus and dump them as an inverted index.

    The documents come from ``self.loadDataFromCutFile(count)``, where
    ``count`` is whatever ``show().showcount()`` reports (presumably the
    number of stored documents -- confirm against ``tools.show``).

    Two kinds of files are written under ``Global.inverse_dir``:

    * ``id.txt`` -- a single JSON object mapping every vocabulary term to
      its 1-based column number in the TF-IDF matrix.
    * ``<k>.txt`` -- postings files. Column ``i`` (0-based) is appended to
      file ``k = i // Global.filesize`` as one JSON line that maps the
      1-based row number of each document with a nonzero weight to that
      weight.

    Postings files are opened in append mode, so lines left by an earlier
    run are kept and new lines are added after them.

    Returns:
        None. The results exist only as files on disk.
    """
    sh = show()
    count = sh.showcount()
    docArray = self.loadDataFromCutFile(count)

    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(docArray))
    print('done')

    # Term -> 1-based column number. json.dumps emits ASCII text without raw
    # newlines here, so plain text mode writes the same bytes as 'wb' did.
    feature_names = vectorizer.get_feature_names()
    indexdoc = {name: idx for idx, name in enumerate(feature_names, 1)}
    with open(Global.inverse_dir + 'id.txt', 'w') as f:
        f.write(json.dumps(indexdoc))

    # CSC keeps each column's row indices and values contiguous, so the
    # postings of column i are the slice indptr[i]:indptr[i + 1]. This avoids
    # slicing the sparse matrix once per nonzero entry.
    columns = tfidf.tocsc()
    indptr, indices, data = columns.indptr, columns.indices, columns.data
    colnum = columns.shape[1]

    # Assumes Global.filesize is a positive integer (columns per postings
    # file) -- TODO confirm in tools.Global.
    filesize = Global.filesize

    # Consecutive columns share a postings file, so open each file once per
    # chunk instead of once per column.
    for first in range(0, colnum, filesize):
        filename = Global.inverse_dir + str(first // filesize) + '.txt'
        last = min(first + filesize, colnum)
        with open(filename, 'a') as f:
            # i is the term (column) index; row is the row of a document
            # whose weight for that term is nonzero.
            for i in range(first, last):
                lo, hi = indptr[i], indptr[i + 1]
                # Cast NumPy scalars to native int/float so json can encode
                # them regardless of platform or Python version.
                item_weight_dict = {
                    int(row) + 1: float(weight)
                    for row, weight in zip(indices[lo:hi], data[lo:hi])
                    if weight != 0
                }
                f.write(json.dumps(item_weight_dict) + '\n')
                print('item  %s calculate done' % i)
def CalcTFIDF(self):
    """Compute TF-IDF weights for the corpus and dump them as an inverted index.

    The documents come from ``self.loadDataFromCutFile(count)``, where
    ``count`` is whatever ``show().showcount()`` reports (presumably the
    number of stored documents -- confirm against ``tools.show``).

    Two kinds of files are written under ``Global.inverse_dir``:

    * ``id.txt`` -- a single JSON object mapping every vocabulary term to
      its 1-based column number in the TF-IDF matrix.
    * ``<k>.txt`` -- postings files. Column ``i`` (0-based) is appended to
      file ``k = i // Global.filesize`` as one JSON line that maps the
      1-based row number of each document with a nonzero weight to that
      weight.

    Postings files are opened in append mode, so lines left by an earlier
    run are kept and new lines are added after them.

    Returns:
        None. The results exist only as files on disk.
    """
    sh = show()
    count = sh.showcount()
    docArray = self.loadDataFromCutFile(count)

    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(docArray))
    print('done')

    # Term -> 1-based column number. json.dumps emits ASCII text without raw
    # newlines here, so plain text mode writes the same bytes as 'wb' did.
    feature_names = vectorizer.get_feature_names()
    indexdoc = {name: idx for idx, name in enumerate(feature_names, 1)}
    with open(Global.inverse_dir + 'id.txt', 'w') as f:
        f.write(json.dumps(indexdoc))

    # CSC keeps each column's row indices and values contiguous, so the
    # postings of column i are the slice indptr[i]:indptr[i + 1]. This avoids
    # slicing the sparse matrix once per nonzero entry.
    columns = tfidf.tocsc()
    indptr, indices, data = columns.indptr, columns.indices, columns.data
    colnum = columns.shape[1]

    # Assumes Global.filesize is a positive integer (columns per postings
    # file) -- TODO confirm in tools.Global.
    filesize = Global.filesize

    # Consecutive columns share a postings file, so open each file once per
    # chunk instead of once per column.
    for first in range(0, colnum, filesize):
        filename = Global.inverse_dir + str(first // filesize) + '.txt'
        last = min(first + filesize, colnum)
        with open(filename, 'a') as f:
            # i is the term (column) index; row is the row of a document
            # whose weight for that term is nonzero.
            for i in range(first, last):
                lo, hi = indptr[i], indptr[i + 1]
                # Cast NumPy scalars to native int/float so json can encode
                # them regardless of platform or Python version.
                item_weight_dict = {
                    int(row) + 1: float(weight)
                    for row, weight in zip(indices[lo:hi], data[lo:hi])
                    if weight != 0
                }
                f.write(json.dumps(item_weight_dict) + '\n')
                print('item  %s calculate done' % i)
# Ad-hoc driver: prints the keyword view through the project's `show` helper.
import sys

# The project packages (tools, ml) are resolved from the parent of the
# current working directory, so this must run before the imports below.
sys.path.append("..")

from tools.show import show
import tools.Global as Global
from ml.Cut import Cut

viewer = show()
viewer.showKeyWord()

# Other calls that have been toggled here by hand:
#   viewer.showcount()
#   viewer.shownews(1)
#   viewer.showitem(2608)
#   viewer.showitem(Cut().getRow(50, Global.cutnews_origin_dir, Global.filesize))