def quantify(X_train, X_test, y_train, y_test, method):
    """Fit a Quantification model on the training split and score one test split.

    Returns whatever ``Quantification.score`` reports for *method* (named KLD
    here — presumably a KL-divergence, defined by the project class; confirm).
    """
    model = Quantification(method=method)
    model.fit(X_train, y_train)
    # score() takes lists of test sets, hence the single-element wrappers.
    KLD = model.score([X_test], [y_test])
    return KLD
def test_gen_data():
    """One-trial benchmark of the quantification methods on synthetic data.

    Generates a dataset, applies a 0.75 prior drift to the test split, scores
    every method, counts how often 'Iter1' scores <= each rival (scores look
    like KLD, so lower would be better — confirm against Quantification.score),
    and prints tab-separated averages, variances and win counts.
    """
    methods = ('CC', 'PCC', 'ACC', 'PACC', 'EM1', 'EM', 'Iter', 'Iter1')
    scores = {m: [] for m in methods}
    wins = {m: 0 for m in ('CC', 'PCC', 'ACC', 'PACC', 'EM', 'EM1', 'Iter')}
    SVMp = []      # stays empty while the SVMperf comparison is disabled
    k_SVMp = 0     # never incremented while the SVMperf comparison is disabled
    for trial in range(1):
        X, y = generate_data(n_samples=10000)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.75)
        quantifier = Quantification(is_clean=True)
        X_drift, y_drift = Quantification.make_drift_list(X_test, y_test, 0.75)
        quantifier.fit(X_train, y_train)
        # Disabled experiment: compare against an external SVMperf run.
        # s = SVMperf(X_train, y_train, X_drift, y_drift)
        # SVMp.append(quantifier._kld(quantifier._classify_and_count([y_test]),
        #                             quantifier._classify_and_count([s.getPredictions()])))
        for m in methods:  # evaluation order matters if score() has side effects
            scores[m].append(quantifier.score(X_drift, y_drift, method=m))
        reference = scores['Iter1'][trial]
        for m in wins:
            if reference <= scores[m][trial]:
                wins[m] += 1
        print(trial, quantifier._classify_and_count(y_drift), '\n')
    print('CC\t', 'PCC\t', 'ACC\t', 'PACC\t', 'EM1\t', 'EM\t', 'SVMperf\t',
          'Iter\t', 'Iter1\t')
    # NOTE: np.average/np.var over the empty SVMp list yield nan (plus a
    # RuntimeWarning) while the SVMperf path is disabled.
    print(np.average(scores['CC']), '\t', np.average(scores['PCC']), '\t',
          np.average(scores['ACC']), '\t', np.average(scores['PACC']), '\t',
          np.average(scores['EM1']), '\t', np.average(scores['EM']), '\t',
          np.average(SVMp), '\t', np.average(scores['Iter']), '\t',
          np.average(scores['Iter1']), '\t\t\t',
          np.var(scores['CC']), '\t', np.var(scores['PCC']), '\t',
          np.var(scores['ACC']), '\t', np.var(scores['PACC']), '\t',
          np.var(scores['EM1']), '\t', np.var(scores['EM']), '\t',
          np.var(SVMp), '\t', np.var(scores['Iter']), '\t',
          np.var(scores['Iter1']), '\t\t\t',
          wins['CC'], '\t', wins['PCC'], '\t', wins['ACC'], '\t',
          wins['PACC'], '\t', wins['EM1'], '\t', wins['EM'], '\t',
          k_SVMp, '\t', wins['Iter'])
def quantify(X_train, X_test, y_train, y_test, method):
    """Train a quantifier with *method* and return its score on one test set.

    NOTE(review): this redefines ``quantify`` already declared earlier in this
    file; the later definition wins at import time — consider removing one.
    """
    quantifier = Quantification(method=method)
    quantifier.fit(X_train, y_train)
    # score() expects lists of test sets, so wrap the single split in lists.
    return quantifier.score([X_test], [y_test])
def semEval():
    """Run the SemEval subtask-CE quantification experiment.

    Builds word (4-gram) + character features from the train gold file,
    quantifies the devtest topics with 'Iter1', writes the predictions in
    SemEval format, then prints the score of every quantification method.
    """
    fname = '100_topics_100_tweets.topic-five-point.subtask-CE'
    train = read_semeval('texts/2download/gold/train/' + fname + '.train.gold.tsv')
    word_proc = text_processing(n=4)
    char_proc = char_processing()
    Xw_train = word_proc.fit_transform(train[0])
    Xc_train = char_proc.fit_transform(train[0])
    # Concatenate word and character features into a single CSR matrix.
    X_train = scipy.sparse.hstack((Xw_train, Xc_train), format='csr')
    y_train = np.asarray(train[1])
    test = read_semeval('texts/2download/gold/devtest/' + fname + '.devtest.gold.tsv')
    Xw_test = word_proc.transform(test[0])
    Xc_test = char_proc.transform(test[0])
    X_test = scipy.sparse.hstack((Xw_test, Xc_test), format='csr')
    y_test = np.asarray(test[1])
    print(X_test.shape)
    # Group test tweets by topic; test[2] presumably holds the topic labels.
    X_test_list, y_test_list, utopics = split_by_topic(X_test, y_test, test[2])
    quantifier = Quantification(method='Iter1', is_clean=True)
    quantifier.fit(X_train, y_train)
    print('train', quantifier._classify_and_count(y_train))
    prevs = quantifier.predict_set(X_test_list, method='Iter1')
    write_semeval(dict(zip(utopics, prevs)), fname=fname)
    for method in ('CC', 'PCC', 'EM', 'EM1', 'Iter', 'Iter1', 'ACC', 'PACC'):
        print(method, quantifier.score(X_test_list, y_test_list, method=method))
def after_semEval():
    """Post-competition run of the subtask-BD experiment on the test gold file.

    Uses word unigram features only (no character n-grams); writing the
    predictions is disabled — only the per-method scores are printed.
    """
    fname = '100_topics_XXX_tweets.topic-two-point.subtask-BD'
    train = read_semeval('texts/2download/gold/all_train/' + fname + '.all.gold.tsv')
    word_proc = text_processing(n=1)
    X_train = word_proc.fit_transform(train[0])
    y_train = np.asarray(train[1])
    test = read_semeval('texts/2download/gold/test/test_gold_2.csv')
    X_test = word_proc.transform(test[0])
    y_test = np.asarray(test[1])
    print(X_test.shape)
    # Group test tweets by topic; test[2] presumably holds the topic labels.
    X_test_list, y_test_list, utopics = split_by_topic(X_test, y_test, test[2])
    quantifier = Quantification(method='Iter1', is_clean=True)
    quantifier.fit(X_train, y_train)
    print('train', quantifier._classify_and_count(y_train))
    # Disabled: writing the 'Iter1' predictions in SemEval format.
    # prevs = quantifier.predict_set(X_test_list, method='Iter1')
    # write_semeval(dict(zip(utopics, prevs)), fname=fname)
    for method in ('CC', 'PCC', 'EM', 'EM1', 'Iter', 'Iter1', 'ACC', 'PACC'):
        print(method, quantifier.score(X_test_list, y_test_list, method=method))
def test_gen_data():
    """Single-run comparison of quantification methods under prior drift.

    NOTE(review): this redefines ``test_gen_data`` declared earlier in this
    file; at import time this later definition is the one that remains —
    consider removing the duplicate.
    """
    k_CC = k_PCC = k_ACC = k_PACC = k_EM = k_SVMp = k_Iter = k_EM1 = 0
    CC, PCC, ACC, PACC = [], [], [], []
    EM, EM1, SVMp, Iter, Iter1 = [], [], [], [], []
    for i in range(1):
        X, y = generate_data(n_samples=10000)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.75)
        q = Quantification(is_clean=True)
        X_test_d, y_test_d = Quantification.make_drift_list(X_test, y_test, 0.75)
        q.fit(X_train, y_train)
        # SVMperf comparison is disabled: SVMp stays empty, k_SVMp stays 0.
        # s = SVMperf(X_train, y_train, X_test_d, y_test_d)
        # SVMp.append(q._kld(q._classify_and_count([y_test]),
        #                    q._classify_and_count([s.getPredictions()])))
        for bucket, name in ((CC, 'CC'), (PCC, 'PCC'), (ACC, 'ACC'),
                             (PACC, 'PACC'), (EM1, 'EM1'), (EM, 'EM'),
                             (Iter, 'Iter'), (Iter1, 'Iter1')):
            bucket.append(q.score(X_test_d, y_test_d, method=name))
        # Count trials where Iter1's score is <= each rival's (the scores look
        # like KLD, so lower would be better — confirm in Quantification.score).
        # Booleans add as 0/1 to the integer counters.
        k_PCC += Iter1[i] <= PCC[i]
        k_CC += Iter1[i] <= CC[i]
        k_ACC += Iter1[i] <= ACC[i]
        k_PACC += Iter1[i] <= PACC[i]
        k_EM += Iter1[i] <= EM[i]
        k_EM1 += Iter1[i] <= EM1[i]
        k_Iter += Iter1[i] <= Iter[i]
        print(i, q._classify_and_count(y_test_d), '\n')
    print('CC\t', 'PCC\t', 'ACC\t', 'PACC\t', 'EM1\t', 'EM\t', 'SVMperf\t',
          'Iter\t', 'Iter1\t')
    # NOTE: np.average/np.var of the empty SVMp list evaluate to nan (plus a
    # RuntimeWarning) while the SVMperf path is disabled.
    print(np.average(CC), '\t', np.average(PCC), '\t', np.average(ACC), '\t',
          np.average(PACC), '\t', np.average(EM1), '\t', np.average(EM), '\t',
          np.average(SVMp), '\t', np.average(Iter), '\t', np.average(Iter1),
          '\t\t\t',
          np.var(CC), '\t', np.var(PCC), '\t', np.var(ACC), '\t',
          np.var(PACC), '\t', np.var(EM1), '\t', np.var(EM), '\t',
          np.var(SVMp), '\t', np.var(Iter), '\t', np.var(Iter1), '\t\t\t',
          k_CC, '\t', k_PCC, '\t', k_ACC, '\t', k_PACC, '\t', k_EM1, '\t',
          k_EM, '\t', k_SVMp, '\t', k_Iter)