Beispiel #1
0
	def CalcTFIDF(self):
		"""Compute TF-IDF weights for the corpus and dump them as JSON index files.

		Steps:
		  1. Load ``show().showcount()`` documents via ``self.loadDataFromCutFile``.
		  2. Build the TF-IDF matrix with ``CountVectorizer`` + ``TfidfTransformer``.
		  3. Write ``Global.inverse_dir + 'id.txt'``: one JSON object mapping each
		     feature name to its 1-based position in the vocabulary.
		  4. For every matrix column ``i`` (0-based), append one JSON line to
		     ``Global.inverse_dir + str(i // Global.filesize) + '.txt'`` mapping
		     the 1-based row number to the non-zero weight in that column.

		Progress is printed to stdout. Returns ``None``.

		NOTE(review): the per-column files are opened in append mode, so running
		this twice against the same directory appends a second copy of every
		line -- confirm callers clear the directory first.
		"""
		sh = show()
		count = sh.showcount()
		docArray = self.loadDataFromCutFile(count)
		vectorizer = CountVectorizer()
		transformer = TfidfTransformer()
		tfidf = transformer.fit_transform(vectorizer.fit_transform(docArray))
		print('done')

		# Feature name -> 1-based vocabulary position. Fetch the names once
		# instead of calling get_feature_names() twice.
		feature_names = vectorizer.get_feature_names()
		indexdoc = dict((name, idx) for idx, name in enumerate(feature_names, 1))
		# Text mode: json.dumps returns a str, which a binary-mode handle
		# rejects on Python 3. The context manager closes the file on error too.
		with open(Global.inverse_dir + 'id.txt', 'w') as f:
			f.write(json.dumps(indexdoc))

		colnum = tfidf.shape[1]
		# i is the term (column) number; row is the row of a document whose
		# weight for that term is non-zero.
		for i in range(colnum):
			# Floor division keeps the bucket number an integer on Python 3 as
			# well (assumes Global.filesize is an int -- TODO confirm).
			filename = Global.inverse_dir + str(i // Global.filesize) + '.txt'
			# COO form exposes aligned (row, data) arrays, so each stored entry
			# is read directly instead of slicing the sparse column per row.
			coldata = tfidf.getcol(i).tocoo()
			item_weight_dict = dict()
			for row, weight in zip(coldata.row, coldata.data):
				# Skip explicitly stored zeros, matching np.nonzero semantics.
				if weight != 0:
					# Plain int/float so json.dumps accepts them regardless of
					# the numpy scalar types in use.
					item_weight_dict[int(row) + 1] = float(weight)
			with open(filename, 'a') as f:
				f.write(json.dumps(item_weight_dict) + '\n')
			# Same text the old print statement produced (two spaces after "item").
			print('item  %s calculate done' % i)
Beispiel #2
0
    def CalcTFIDF(self):
        """Build the TF-IDF matrix for the corpus and persist it as JSON files.

        The documents come from ``self.loadDataFromCutFile(count)``, where
        ``count`` is ``show().showcount()``. Two kinds of output are written
        under ``Global.inverse_dir``:

        * ``id.txt`` -- a single JSON object mapping every feature name to its
          1-based position in the vectorizer vocabulary.
        * ``<i // Global.filesize>.txt`` -- for each 0-based column ``i`` one
          JSON line is appended, mapping 1-based row number to the non-zero
          TF-IDF weight of that column.

        Progress messages go to stdout. Returns ``None``.

        NOTE(review): column files are opened with mode ``'a'``; a second run
        into the same directory appends duplicate lines -- verify the caller
        starts from an empty directory.
        """
        sh = show()
        count = sh.showcount()
        docArray = self.loadDataFromCutFile(count)
        vectorizer = CountVectorizer()
        transformer = TfidfTransformer()
        tfidf = transformer.fit_transform(vectorizer.fit_transform(docArray))
        print('done')

        # Write the feature-name -> 1-based index table. The names are
        # fetched once; the previous unused duplicate lookup is gone.
        indexdoc = {}
        for position, name in enumerate(vectorizer.get_feature_names(), 1):
            indexdoc[name] = position
        # 'w' rather than 'wb': the payload is text. `with` guarantees the
        # handle is closed even if serialisation or the write fails.
        with open(Global.inverse_dir + 'id.txt', 'w') as f:
            f.write(json.dumps(indexdoc))

        colnum = tfidf.shape[1]
        # i is the term (column) number; row is the row of a document that
        # has a non-zero weight for that term.
        for i in range(colnum):
            # `//` gives the same integer bucket on Python 2 and Python 3
            # (assumes Global.filesize is an int -- TODO confirm).
            filename = Global.inverse_dir + str(i // Global.filesize) + '.txt'
            # Convert the column to COO once and walk its stored entries,
            # rather than indexing the sparse column again for every row.
            coldata = tfidf.getcol(i).tocoo()
            item_weight_dict = {}
            for row, weight in zip(coldata.row, coldata.data):
                if weight == 0:
                    # Explicitly stored zero: np.nonzero would have skipped it.
                    continue
                # Cast numpy scalars to builtins so json.dumps can encode them.
                item_weight_dict[int(row) + 1] = float(weight)
            with open(filename, 'a') as f:
                f.write(json.dumps(item_weight_dict) + '\n')
            # Output text is unchanged from the old print statement
            # (note the two spaces after "item").
            print('item  %s calculate done' % i)
Beispiel #3
0
# Ad-hoc driver script: creates a `show` object and calls one of its
# display methods. The commented-out calls below are alternative invocations
# kept for manual switching.
import sys
# Must run before the project imports below: it adds the parent of the
# *current working directory* (not of this file) to the module search path,
# so the script presumably has to be launched from its own directory -- verify.
sys.path.append("..")
from tools.show import show
import tools.Global as Global
from ml.Cut import Cut

s = show()
# Alternative calls on the same object (disabled):
#s.showcount()
#s.shownews(1)
# The only call currently enabled.
s.showKeyWord()
#s.showitem(2608)

# Disabled: fetch a row through Cut.getRow and display it with showitem.
# Global and Cut are only referenced from these commented-out lines.
#c = Cut()
#line = c.getRow(50,Global.cutnews_origin_dir,Global.filesize)
#s.showitem(line)