# Example 1
def valuate(mincount2,maxcount2,PredictAndAnalyze = PredictAndAnalyze2):
	vectorinfo = {}
	for ID in ["0000","0001","0002","0003","0004","0005","0006"]:
		vectorinfo[ID] = {}
		for j in textinfo[ID].keys():
			#print j
			try:
				vectorinfo[ID][j] = createvector(video_id = j, ID = ID,mincount = mincount2,maxcount = maxcount2)
			except:
				#vectorinfo[ID][j] = np.zeros(len(model[model.vocab.keys()[0]]))
				print ID,j
	target2 = createtargetarray(maxcount2,100000000,10760.0,34544)
	data2 = createtvectorMat(maxcount2,100000000,vectorinfo)
	(TfidfTextList, word2freqlist) = makeTfidfTextList(maxcount2,100000000,mincount2,maxcount2)
	tfidf = TfidfVectorizer(tokenizer=tokenize)
	tfs = tfidf.fit_transform(TfidfTextList.values())
	idlist = TfidfTextList.keys()
	feature_names = tfidf.get_feature_names()
	tfidfvectorinfo = {}
	sample = tfs.toarray().shape[0]
	print sample, len(feature_names), 
	for n in range(0,sample):
		#print n
		tfidfvectorinfo[idlist[n]] = maketfidfvec(n,feature_names = feature_names,tfs = tfs,idlist= idlist,word2freqlist = word2freqlist)
	l = {}
	for ID in ["0000","0001","0002","0003","0004","0005","0006"]:
		l[ID] = vectorinfo[ID].keys()
	tfidfdata = createtfidfvectorMat(maxcount2,100000000,tfidfvectorinfo)
	print mincount2,maxcount2
	print "logreg"
	k2 = PredictAndAnalyze(data2,target2,clf_cv =linear_model.LogisticRegression(C=1e1))
	print accuracy_score(k2[0],k2[1]),(1.0 - accuracy_score(k2[2],k2[3]))
	k22 = PredictAndAnalyze(data = tfidfdata,target = target2,clf_cv =linear_model.LogisticRegression(C=1e1))
	print accuracy_score(k22[0],k22[1]),(1.0 - accuracy_score(k22[2],k22[3]))
	print "svm"
	k0 = PredictAndAnalyze(data = data2,target = target2,clf_cv = svm.SVC(kernel='linear', probability=True))
	print accuracy_score(k0[0],k0[1]),(1.0 - accuracy_score(k0[2],k0[3]))
	print "LinearSVM"
	k1 = PredictAndAnalyze(data = data2,target = target2,clf_cv = svm.LinearSVC())
	print accuracy_score(k1[0],k1[1]),(1.0 - accuracy_score(k1[2],k1[3]))
	k00 = PredictAndAnalyze(data = tfidfdata,target = target2,clf_cv = svm.LinearSVC())
	print accuracy_score(k00[0],k00[1]),(1.0 - accuracy_score(k00[2],k00[3]))
	return k2,k0,k1
# Example 2
def valuate1(mincount2,maxcount2,PredictAndAnalyze = PredictAndAnalyze2):
	vectorinfo = {}
	for ID in ["0000","0001","0002","0003","0004","0005","0006"]:
		vectorinfo[ID] = {}
		for j in textinfo[ID].keys():
			#print j
			try:
				vectorinfo[ID][j] = createvector(video_id = j, ID = ID,mincount = mincount2,maxcount = maxcount2)
			except:
				#vectorinfo[ID][j] = np.zeros(len(model[model.vocab.keys()[0]]))
				print ID,j
	target2 = createtargetarray(maxcount2,100000000,10760.0,34544)
	data2 = createtvectorMat(maxcount2,100000000)
	print mincount2,maxcount2
	k2 = PredictAndAnalyze(data2,target2,clf_cv =linear_model.LogisticRegression(C=1e1))
	k1 = PredictAndAnalyze(data = data2,target = target2,clf_cv = svm.LinearSVC())
	#k2 = PredictAndAnalyze(data = tfidfdata,target = target2,clf_cv =linear_model.LogisticRegression(C=1e1))
	k0 = PredictAndAnalyze(data = data2,target = target2,clf_cv = svm.SVC(kernel='linear', probability=True))
	#k0 = PredictAndAnalyze(data = tfidfdata,target = target2,clf_cv = svm.SVC(kernel='linear', probability=True))
	return k2,k0,k1
# Example 3
# Example script: load a trained word2vec model over video comments and
# evaluate several classifiers (Python 2; helpers come from kaiseki0925).
from kaiseki0925 import wordvec,morphological_analysis,output,makevec,createvector,createtargetarray,createtvectorMat,PredictAndAnalyze,makewordlist,makeTfidfTextList
from kaiseki0925 import tokenize,maketfidfvec,createtfidfvectorMat
# Corpus file of all comments in Text8 (whitespace-separated) form.
data = word2vec.Text8Corpus('allcomment2kaiseiki.txt')
#modelnico = word2vec.Word2Vec(data, size=50)
#modelnico = word2vec.Word2Vec.load("allcomment2.model")
#modelnico = word2vec.Word2Vec.load("allcomment1.model")
modelnico = word2vec.Word2Vec.load("allcomment2kaiseiki.model") # comments joined with spaces; newline normalization applied
#modelnico = word2vec.Word2Vec.load("allcomment2.model") # comments joined with spaces; no newline normalization
#modelnico = word2vec.Word2Vec.load("allcomment_kai.model") # one comment at a time; newline normalization applied
model = modelnico



target2 = createtargetarray(100,100000000,10000,30000)
data2 = createtvectorMat(100,100000000)
# NOTE(review): the class_weight run below is immediately overwritten by the
# unweighted run — only the second k0 result survives.
k0 = PredictAndAnalyze(data2,target2,clf_cv = svm.SVC(kernel='linear', probability=True,class_weight={0:2,1:1}))
k0 = PredictAndAnalyze(data2,target2,clf_cv = svm.SVC(kernel='linear', probability=True))
k1 = PredictAndAnalyze(data2,target2,clf_cv = neighbors.KNeighborsClassifier(n_neighbors=10))
k2 = PredictAndAnalyze(data2,target2,clf_cv =linear_model.LogisticRegression(C=1e1))

# Collect the video ids that got a feature vector per category.
# NOTE(review): vectorinfo is not defined in this chunk — presumably built
# earlier in the full file; confirm before running this section standalone.
l = {}
for ID in ["0000","0001","0002","0003"]:
    l[ID] = vectorinfo[ID].keys()

(TfidfTextList, word2freqlist) = makeTfidfTextList(100,100000000)
# NOTE(review): tfidf is also not defined in this chunk — presumably a
# TfidfVectorizer created elsewhere in the file; confirm.
tfs = tfidf.fit_transform(TfidfTextList.values())
idlist = TfidfTextList.keys()
tfidfvectorinfo = {}
sample = tfs.toarray().shape[0]
for n in range(0,sample):