def test_model(texts, classes, model, folds=5):
    '''
    Cross-validate a single model and return its averaged AUC.

    texts   -- sequence of input texts
    classes -- sequence of labels, parallel to texts
    model   -- pair (factory, params); the factory is invoked as
               factory(train_texts, train_classes, *params) and the object
               it returns must provide classify(texts)
    folds   -- number of folds handed to StratifiedKFold

    Returns the sum of the per-fold AUC values divided by folds.
    '''
    factory, extra_args = model
    labels = np.array(classes)
    documents = np.array(texts)

    fold_scores = []
    for train_idx, test_idx in cross_validation.StratifiedKFold(labels, folds):
        # Fit a fresh model on the training part of this split only.
        classifier = factory(documents[train_idx], labels[train_idx],
                             *extra_args)
        predicted = classifier.classify(documents[test_idx])
        fold_scores.append(calculate_auc(labels[test_idx], predicted))

    return sum(fold_scores) / folds
def test(texts, classes, models, nn_params, folds=4):
    '''
    Estimate the AUC of an Ensemble using stratified k-fold cross-validation.

    For every split produced by StratifiedKFold(classes, folds) a new
    Ensemble is built from the training part, the held-out part is
    classified, negative predictions are raised to 0, and the fold's AUC is
    printed.

    texts     -- sequence of input texts
    classes   -- sequence of labels, parallel to texts
    models    -- passed through unchanged to Ensemble
    nn_params -- passed through unchanged to Ensemble
    folds     -- number of folds handed to StratifiedKFold

    Returns the sum of the per-fold AUC values divided by folds.
    '''
    classes = np.array(classes)
    texts = np.array(texts)

    auc_sum = 0
    for train_idx, test_idx in cross_validation.StratifiedKFold(classes, folds):
        n = Ensemble(texts[train_idx], classes[train_idx], nn_params, models)
        predictions = n.classify(texts[test_idx])
        # Boolean-mask assignment: predictions is expected to be a numpy
        # array here (TODO confirm against Ensemble.classify).
        predictions[predictions < 0] = 0
        auc = calculate_auc(classes[test_idx], predictions)
        # Parenthesised single-argument form prints the same text under
        # Python 2 and is also valid Python 3 syntax.
        print(auc)
        auc_sum += auc

    return auc_sum / folds
# Evaluate the classifier on the verification dataset.
texts = []
classes = []
# The `with` block closes the CSV file as soon as all rows are read; opening
# it inline inside csv.reader(...) left the handle open.
with open('./dataset/test_with_solutions.csv', 'rb') as solutions_file:
    csvr = csv.reader(solutions_file, delimiter=',', quotechar='"')
    next(csvr)  # discard the first row (presumably a header -- confirm)
    for row in csvr:
        # Column 2 is used as the text, column 0 as the integer label.
        texts.append(row[2].decode('utf8'))
        classes.append(int(row[0]))

results = n.classify(texts)
# Clamp the scores into [0, 1] before computing the AUC.
results[results < 0] = 0
results[results > 1] = 1
# repr() replaces the backtick operator; the printed text is unchanged.
print(sys.argv[1] + " --- " + repr(calculate_auc(classes, results)))
end = time.time()

# Disabled diagnostics, kept for reference:
# print "classification time="
# print end-start
# writer = open('rez.csv', 'w')
# for r in results:
#     writer.write('%s\n' % r)
# writer.close()
# wrongs = []
# for i in range(len(texts)):
#     if abs(classes[i] - results[i]) > 0.5:
#         wrongs.append((classes[i], results[i], texts[i]))
# import csv
# writer = open('wrongs.csv', 'w')
m2 = Dictionary(texts, classes) texts = [] classes = [] csvr = csv.reader(open('test_with_solutions.csv', 'rb'), delimiter=',', quotechar='"') csvr.next() for row in csvr: texts.append(row[2].decode('utf8')) classes.append(int(row[0])) #results = n.classify(texts) #results[results<0] = 0 #print calculate_auc(classes, results) r1 = m1.classify(texts) print calculate_auc(classes, r1) r2 = np.array(m2.classify(texts)) print calculate_auc(classes, r2) r = (1.2 * r1 + 0.8 * r2) / 2 r[r > 1] = 1 r[r < 0] = 0 print calculate_auc(classes, r) #print TestSVM.test_model(texts, classes, models[-1]) #print TestSVM.test(texts, classes, models, nn_params) n = Ensemble(texts, classes, nn_params, models) texts = [] csvr = csv.reader(open('test.csv', 'rb'), delimiter=',', quotechar='"') csvr.next() for row in csvr:
# Build the two component models from the texts/classes currently in scope.
m1 = ChSVM(texts, classes)
m2 = Dictionary(texts, classes)

# Replace texts/classes with the rows of the labelled verification file.
texts = []
classes = []
# The `with` block closes the CSV file as soon as all rows are read; opening
# it inline inside csv.reader(...) left the handle open.
with open('test_with_solutions.csv', 'rb') as solutions_file:
    csvr = csv.reader(solutions_file, delimiter=',', quotechar='"')
    next(csvr)  # discard the first row (presumably a header -- confirm)
    for row in csvr:
        # Column 2 is used as the text, column 0 as the integer label.
        texts.append(row[2].decode('utf8'))
        classes.append(int(row[0]))

# Disabled: scoring through the ensemble `n`.
#results = n.classify(texts)
#results[results<0] = 0
#print calculate_auc(classes, results)

# Score each model on its own, then a weighted blend of the two.
r1 = m1.classify(texts)
print(calculate_auc(classes, r1))
r2 = np.array(m2.classify(texts))
print(calculate_auc(classes, r2))
# Weights 1.2 and 0.8 sum to 2, so dividing by 2 gives a weighted mean.
r = (1.2 * r1 + 0.8 * r2) / 2
# Clamp the blended scores into [0, 1].
r[r > 1] = 1
r[r < 0] = 0
print(calculate_auc(classes, r))

# Disabled: cross-validation helpers.
#print TestSVM.test_model(texts, classes, models[-1])
#print TestSVM.test(texts, classes, models, nn_params)

# NOTE(review): texts/classes now hold the verification rows loaded above,
# so the Ensemble is built from those rows -- confirm this is intended.
n = Ensemble(texts, classes, nn_params, models)

texts = []
# This reader is consumed by code that follows this fragment, so its file
# handle is deliberately left as the original opened it.
csvr = csv.reader(open('test.csv', 'rb'), delimiter=',', quotechar='"')
start = time.time()

# Evaluate the classifier on the verification dataset.
texts = []
classes = []
# The `with` block closes the CSV file as soon as all rows are read; opening
# it inline inside csv.reader(...) left the handle open.
with open('./dataset/test_with_solutions.csv', 'rb') as solutions_file:
    csvr = csv.reader(solutions_file, delimiter=',', quotechar='"')
    next(csvr)  # discard the first row (presumably a header -- confirm)
    for row in csvr:
        # Column 2 is used as the text, column 0 as the integer label.
        texts.append(row[2].decode('utf8'))
        classes.append(int(row[0]))

results = n.classify(texts)
# Clamp the scores into [0, 1] before computing the AUC.
results[results < 0] = 0
results[results > 1] = 1
# repr() replaces the backtick operator; the printed text is unchanged.
print(sys.argv[1] + " --- " + repr(calculate_auc(classes, results)))
end = time.time()

# Disabled diagnostics, kept for reference:
# print "classification time="
# print end-start
# writer = open('rez.csv', 'w')
# for r in results:
#     writer.write('%s\n' % r)
# writer.close()
# wrongs = []
# for i in range(len(texts)):
#     if abs(classes[i] - results[i]) > 0.5:
#         wrongs.append((classes[i], results[i], texts[i]))