def solve(train_X, train_Y, test_X, test_Y):
    """Run the experiments for problems 16-20 and print each answer.

    Relies on the module-level ``train`` helper (expected to return a pair
    ``(best_lambda, per-setting error list)``) and on ``split_data`` (expected
    to return ``train_x, val_x, train_y, val_y``) -- both are defined outside
    this block, so their exact contracts should be confirmed there.

    Args:
        train_X: training instances.
        train_Y: training labels.
        test_X: test instances.
        test_Y: test labels.
    """
    fixed_options = '-s 0 -c 50 -e 0.000001'

    # Problem 16: model selection scored on the training data itself.
    chosen, _ = train(train_X, train_Y, train_X, train_Y)
    print("Answer for problem 16 is {0}".format(chosen))

    # Problem 17: model selection scored on the test data.
    chosen, _ = train(train_X, train_Y, test_X, test_Y)
    print("Answer for problem 17 is {0}".format(chosen))

    # Problem 18: select on a train/validation split, then report the test
    # error of a model fitted on the split's training part.
    part_x, held_x, part_y, held_y = split_data(train_X, train_Y, 120, 200)
    chosen, _ = train(part_x, part_y, held_x, held_y)
    print("best lambda is: {0}".format(chosen))
    fitted = liblinearutil.train(part_y, part_x, fixed_options)
    _, stats, _ = liblinearutil.predict(test_Y, test_X, fitted)
    print("Answer for problem 18 is {0}".format((100 - stats[0]) / 100))

    # Problem 19: same options, fitted on the complete training set.
    fitted = liblinearutil.train(train_Y, train_X, fixed_options)
    _, stats, _ = liblinearutil.predict(test_Y, test_X, fitted)
    print("Answer for problem 19 is {0}".format((100 - stats[0]) / 100))

    # Problem 20: five folds of 40 instances; average the per-setting errors
    # over the folds and report the smallest average.
    per_fold = []
    for fold in range(5):
        part_x, held_x, part_y, held_y = split_data(
            train_X, train_Y, 40 * fold, 40 * (fold + 1))
        _, fold_errors = train(part_x, part_y, held_x, held_y)
        per_fold.append(fold_errors)
    averaged = np.mean(per_fold, axis=0)
    print("Answer for problem 20 is {0}".format(min(averaged)))
def predict(y,xs,xw):
    """Score instances with the shallow and neuralbrn models and average them.

    Each feature set is rescaled with its stored scaling parameters, passed
    to its own liblinear model with options '-q -b 1', and converted into a
    per-instance score list by ``score``.

    Args:
        y: labels handed to liblinear for both predictions.
        xs: instances for the shallow model.
        xw: instances for the neuralbrn model.

    Returns:
        ``(mean_scores, shallow_scores, neuralbrn_scores)``; element ``i`` of
        ``mean_scores`` is the arithmetic mean of the other two at ``i``.
    """
    shallow_in, _ = simpleScale(xs, scales_shallow)
    brn_in, _ = simpleScale(xw, scales_neuralbrn)

    labels, _, values = ll.predict(y, shallow_in, model_shallow, '-q -b 1')
    ls_s = score(labels, values)

    labels, _, values = ll.predict(y, brn_in, model_neuralbrn, '-q -b 1')
    ls_w = score(labels, values)

    averaged = [(a + b) / 2 for a, b in zip(ls_s, ls_w)]
    return averaged, ls_s, ls_w
def predict(y, xs, xw):
    """Combine the shallow and neuralbrn model scores for each instance.

    Args:
        y: labels passed through to both liblinear predictions.
        xs: unscaled instances for the shallow model.
        xw: unscaled instances for the neuralbrn model.

    Returns:
        A tuple ``(combined, shallow, neuralbrn)`` of per-instance score
        lists, where ``combined`` is the element-wise mean of the other two.
    """
    options = '-q -b 1'

    # Both inputs are rescaled before either model is queried.
    scaled_s, _ = simpleScale(xs, scales_shallow)
    scaled_w, _ = simpleScale(xw, scales_neuralbrn)

    result_s = ll.predict(y, scaled_s, model_shallow, options)
    shallow = score(result_s[0], result_s[2])

    result_w = ll.predict(y, scaled_w, model_neuralbrn, options)
    neuralbrn = score(result_w[0], result_w[2])

    combined = []
    for s_val, w_val in zip(shallow, neuralbrn):
        combined.append((s_val + w_val) / 2)
    return combined, shallow, neuralbrn
def get_edge_scores(self, edges, **kwargs):
    """Score every edge as the mean of its two directed predictions.

    For each edge ``(i, j)`` a feature vector is built for ``(i, j)`` and
    for ``(j, i)``; both are passed to the module-level ``predict`` together
    with ``self.svm`` and the third element of each result is averaged.

    Args:
        edges: iterable of ``(i, j)`` pairs.
        **kwargs: accepted but not used here.

    Returns:
        numpy array with one averaged score per edge.
    """
    forward = [self._get_feature_vecs((i, j)) for (i, j) in edges]
    backward = [self._get_feature_vecs((j, i)) for (i, j) in edges]

    # Labels are only placeholders; predict still requires one per instance.
    placeholder = np.zeros(len(forward))
    forward_out = predict(placeholder, forward, self.svm)[2]
    backward_out = predict(placeholder, backward, self.svm)[2]

    return 0.5 * (np.array(forward_out) + np.array(backward_out))
def classify(self, tweet):
    """Return the language whose numeric id the model predicts for ``tweet``.

    Args:
        tweet: the tweet to classify; passed to ``self._extract_features``.

    Returns:
        The key of ``self.languages`` whose value equals the predicted label.

    Raises:
        ValueError: if no entry of ``self.languages`` has the predicted value.
    """
    feature_vector = self._extract_features(tweet)
    predicted_labels = liblinearutil.predict([0], [feature_vector], self.model)[0]
    code = predicted_labels[0]

    candidates = [lang for lang, number in self.languages.items()
                  if number == code]
    if not candidates:
        raise ValueError
    return candidates[0]
def predict(test_data, features, model):
    """Predict a class and a signed score for every key of ``test_data``.

    Args:
        test_data: mapping whose keys identify the instances to score.
        features: mapping from the same keys to liblinear feature vectors.
        model: trained liblinear model.

    Returns:
        dict mapping each key to ``{'class': label, 'score': value}`` where
        ``value`` is the first decision value of that instance.  If any
        instance has a label and score of opposite sign, all scores are
        negated so that the sign of the score agrees with the label.
    """
    # Materialise the keys once: a dict view cannot be indexed on Python 3
    # and the same order must be used for building x and reading results.
    keys = list(test_data)
    # True classes are not used for prediction; liblinear only needs
    # one placeholder label per instance.
    y = [0] * len(keys)
    x = [features[key] for key in keys]

    p_label, p_acc, p_val = liblinearutil.predict(y, x, model, '-q')

    predictions = {}
    reverse = False
    for key, label, values in zip(keys, p_label, p_val):
        score = values[0]
        predictions[key] = {'class': label, 'score': score}
        # Label and decision value disagree in sign: the model's score axis
        # is inverted relative to the labels.
        if (label <= 0 and score > 0) or (label > 0 and score < 0):
            reverse = True

    if reverse:
        # Single-argument call form prints identically on Python 2 and 3.
        print('REVERSING SCORE!')
        for entry in predictions.values():
            entry['score'] *= -1
    return predictions
def LDA_SVM(matrix, test_matrix, n_authors, doc_authors, vocab, stopwords):
    """Attribute test documents to authors with LDA topics fed to an SVM.

    An LDA sampler is trained on ``matrix``; the resulting per-document
    topic mixtures (row-normalised) are the features of a liblinear model
    that is then applied to the topic mixtures of ``test_matrix``.

    Args:
        matrix: training document matrix passed to the LDA sampler.
        test_matrix: test document matrix; ``shape[0]`` is the number of
            test documents.
        n_authors: number of authors (width of the returned matrix).
        doc_authors: list of per-document author lists; flattened to obtain
            the training labels.
        vocab: unused here.
        stopwords: unused here.

    Returns:
        ``(num_test_docs, n_authors)`` array with a 1 in the column of the
        predicted author of each test document and 0 elsewhere.
    """
    # set parameters
    num_topics = 20
    burn_in = 1000  # 0
    alpha = 0.1
    beta = 0.1
    samples = 8
    spacing = 100
    num_test_docs = test_matrix.shape[0]

    sampler = lda.LDA(num_topics, alpha, beta)
    print('Starting!')
    theta, phi, likelihood = sampler.train(matrix, burn_in, samples, spacing)
    print('likelihood: ', likelihood)
    theta_test, likelihood = sampler.classify(test_matrix, phi, burn_in,
                                              samples, spacing)
    print('likelihood: ', likelihood)

    # Normalise every row to sum to one.
    theta = theta / np.sum(theta, 1)[:, None]
    theta_test = theta_test / np.sum(theta_test, 1)[:, None]

    svm_model = ll.train(sum(doc_authors, []), theta.tolist(), '-c 4')
    # Random values stand in for the unknown test labels; they only affect
    # the accuracy liblinear reports, not the predicted labels.
    p_label, p_acc, p_val = ll.predict(np.random.rand(num_test_docs),
                                       theta_test.tolist(), svm_model)

    # Fixed: the original used the undefined name ``n_test_docs`` here.
    author_probs = np.zeros((num_test_docs, n_authors))
    for doc, author in enumerate(p_label):
        author_probs[doc, int(author)] = 1
    return author_probs
def hash(features, num_train_samples=58000, L=8):
    """Compute L-bit hash codes for ``features`` and pickle them to disk.

    For every bit ``i`` the model file
    ``models/tr{num_train_samples:05d}-L{L:02d}-b{i:02d}.model`` is loaded
    and its predicted label for each row becomes that row's i-th bit.
    Zero bits are mapped to -1 and the resulting matrix (one row per
    instance, one column per bit) is written to
    ``hash/tr{num_train_samples:05d}-L{L:02d}``.

    Args:
        features: 2-D array of instances (``shape[0]`` rows, ``tolist()``).
        num_train_samples: number used in the model / output file names.
        L: number of bits, i.e. number of models to apply.

    Returns:
        None; the codes are only written to disk.
    """
    bits = []
    for i in range(L):
        start = timeit.default_timer()
        m = liblinearutil.load_model(
            'models/tr{0:05d}-L{1:02d}-b{2:02d}.model'.format(
                num_train_samples, L, i))
        p_label, p_acc, p_val = liblinearutil.predict(
            [0] * features.shape[0], features.tolist(), m, str('-q'))
        bits.append(p_label)
        end = timeit.default_timer()
        print('[HASH] {0:3d}th bit hashed. {1:.4f} seconds elapsed'.format(
            i, end - start))

    start = timeit.default_timer()
    # ``np.int`` was only an alias of the builtin and no longer exists in
    # current NumPy; the builtin gives the same dtype.
    bits = np.vstack(bits).transpose().astype(int)
    bits[np.nonzero(bits == 0)] = -1
    with open('hash/tr{0:05d}-L{1:02d}'.format(num_train_samples, L),
              'wb') as fo:
        cPickle.dump(bits, fo)
    end = timeit.default_timer()
    print('[HASH] Hash codes saved. {0:.4f} seconds elapsed'.format(end - start))
    return
def predict(self, data_test):
    """Predict tags for ``data_test`` and report timing / accuracy.

    Features are extracted for every instance, labeled or not, and sent to
    liblinear.  Unless ``self.quiet`` is set, the elapsed time is printed,
    followed by the accuracy when the data is fully labeled.

    Args:
        data_test: the data to tag; must expose ``is_partially_labeled``.

    Returns:
        ``(pred_labels, acc)``: the predicted labels converted to their
        string form, and the accuracy value returned by liblinear.
    """
    began = time.time()
    # The extractor must already have been trained.
    assert not self.__feature_extractor.is_training

    extracted = self.__feature_extractor.extract_features(data_test, True, [])
    [label_list, features_list, _] = extracted

    pred_labels, metrics, _ = liblinearutil.predict(
        label_list, features_list, self.__liblinear_model, "-q")
    (acc, _, _) = metrics

    if not self.quiet:
        elapsed = int(math.ceil(time.time() - began))
        print("Prediction time: {0}".format(
            str(datetime.timedelta(seconds=elapsed))))
        if data_test.is_partially_labeled:
            print("Not reporting accuracy: test data missing gold labels")
        else:
            print("Per-instance accuracy: {0:.3f}%".format(acc))

    # Replace the integer label ids by their string names, in place.
    pred_labels[:] = [self.__feature_extractor.get_label_string(label)
                      for label in pred_labels]
    return pred_labels, acc
def predict_from_SVR_liblinear(predictor, features):
    """Return the liblinear predictions of ``predictor`` for ``features``.

    Args:
        predictor: trained liblinear model.
        features: list of instances to predict.

    Returns:
        The first element of liblinear's result tuple (the predicted values).
    """
    # No true responses are available, hence the empty label list;
    # '-q' runs liblinear in quiet mode.
    result = liblinearutil.predict([], features, predictor, '-q')
    predicted, _, _ = result
    return predicted
def predictTraitValue(self, imagelist_file='image_list.txt', class_label=1, outfile='output', norm=-1, debug=False):
    """Predict the trait value of ``self.norm_image`` and store it.

    The normalised image is written to ``tmp.jpg``, listed in
    ``imagelist_file``, converted to features by ``hog_histogram`` (written
    to ``outfile``) and scored with the model registered for ``self.trait``.
    The first decision value of the first instance is stored in
    ``self.p_val``.

    Args:
        imagelist_file: path of the image list file to (over)write.
        class_label: label forwarded to ``hog_histogram``.
        outfile: path of the feature file produced by ``hog_histogram``.
        norm: forwarded to ``hog_histogram``.
        debug: forwarded to ``hog_histogram``.

    Returns:
        True once ``self.p_val`` has been updated.
    """
    with open(imagelist_file, 'w') as listing:
        listing.write('tmp.jpg\n')
    cv2.imwrite('tmp.jpg', self.norm_image)

    hog_histogram(imagelist_file, class_label, outfile, norm, debug)
    labels, instances = svm_read_problem(outfile)

    trait_model = load_model(models[self.trait])
    _, _, decision_values = predict(labels, instances, trait_model)
    self.p_val = decision_values[0][0]
    return True
def predict(self,x,y=None):
    """Predict labels for ``x`` with ``self.model``.

    Args:
        x: a list/tuple of l predicting instances. The feature vector of
            each instance is a list/tuple or a dictionary.
        y: a list/tuple of l true labels (int/double), used only for
            calculating the accuracy. Pass ``[]`` or omit it (``None``) if
            true labels are unavailable.

    Returns:
        ``(p_labels, p_acc, p_vals)`` as returned by LIBLINEAR:
        p_labels: the predicted labels.
        p_acc: a tuple including accuracy (for classification), mean
            squared error, and squared correlation coefficient (for
            regression).
        p_vals: a list of decision values. If k is the number of classes,
            each element includes results of predicting k binary-class
            SVMs. If k = 2 and solver is not MCSVM_CS, only one decision
            value is returned. The order of classes is the same as the
            'model.label' field in the model structure.
    """
    # A fresh list per call avoids sharing one mutable default object.
    if y is None:
        y = []
    p_labels, p_acc, p_vals = lu.predict(y, x, self.model)
    return p_labels, p_acc, p_vals
def filter_tweets(self, tweets, text_lookup_field_name='text'):
    """Return the tweets that the model labels 1.

    Tweets whose text has 7 or fewer whitespace-separated words are dropped
    first.  Inputs of more than 1,000,000 tweets are processed in chunks of
    1,000,000; chunks that yield 2 or fewer feature vectors are skipped.
    On the non-chunked path a tweet is additionally dropped when
    ``self.hard_reject`` rejects its text.

    Args:
        tweets: list of tweet dicts.
        text_lookup_field_name: key under which each tweet stores its text.

    Returns:
        list of the accepted tweet dicts (possibly empty).
    """
    # Keep only tweets with more than 7 words.
    tweets = [a for a in tweets
              if len(a[text_lookup_field_name].split()) > 7]
    if len(tweets) == 0:
        return []

    filtered_tweets = []
    if len(tweets) > 1000000:
        # Split into chunks if the input is too big.
        tweet_chunks = list(kgen.chunks(tweets, 1000000))
        for chnk in range(len(tweet_chunks)):
            # Single-argument call form prints identically on Python 2 and 3.
            print("processing chunk %d" % chnk)
            X, hash2originalID = extract_vectors(
                tweet_chunks[chnk], self.ktw, self._vocab,
                text_lookup_field_name)
            Y = []
            if len(X) <= 2:
                continue
            p_label, p_acc, p_val = predict(Y, X, self._model, '-q')
            if self._fd:
                p_label = self.hard_feature_rules(p_label, X)
            for i, pred in enumerate(p_label):
                if pred == 1:
                    filtered_tweets.append(
                        tweet_chunks[chnk][hash2originalID[i]])
    else:
        X, hash2originalID = extract_vectors(
            tweets, self.ktw, self._vocab, text_lookup_field_name)
        Y = []
        if len(X) == 0:
            return []
        p_label, p_acc, p_val = predict(Y, X, self._model, '-q')
        if self._fd:
            p_label = self.hard_feature_rules(p_label, X)
        for i, pred in enumerate(p_label):
            if pred == 1:
                candidate = tweets[hash2originalID[i]]
                # Fixed: read the text from the configured field instead of
                # the hard-coded 'text' key, which broke custom field names.
                if not self.hard_reject(candidate[text_lookup_field_name]):
                    filtered_tweets.append(candidate)
    return filtered_tweets
def predict_liblinear(features, labels, model):
    """Predict integer labels for ``features`` and print the time taken.

    Args:
        features: instances to classify.
        labels: true labels passed to liblinear.
        model: trained liblinear model.

    Returns:
        list of predicted labels, each converted with ``int``.
    """
    print('Loading predictions...')
    began = time.time()
    raw_labels = liblinearutil.predict(labels, features, model)[0]
    preds = [int(value) for value in raw_labels]
    # The reported time includes the integer conversion above.
    finished = time.time()
    print('Predictions made in ' + str(finished - began) + ' seconds')
    return preds
def predict_from_log_regr_liblinear(predictor, features):
    """Return the probability of class 1 for every instance.

    Args:
        predictor: trained liblinear model exposing ``label``.
        features: list of instances to score.

    Returns:
        list with one probability per instance, taken from the column that
        corresponds to label 1 (column 1 if ``predictor.label[1]`` is 1,
        otherwise column 0).
    """
    if int(predictor.label[1]) == 1:
        positive_column = 1
    else:
        positive_column = 0

    # '-b 1': return probability estimates; '-q': quiet mode.
    estimates = liblinearutil.predict([], features, predictor, '-b 1 -q')[2]

    positives = []
    for row in estimates:
        positives.append(row[positive_column])
    return positives
def propose_deterministic(model, cache, feats, x_, j):
    """Return the model's predicted label for position ``j``, memoised.

    The prediction for the feature combination ``feats`` is looked up in
    ``cache[j][1]``; on a miss the instance made of ``x_[j]`` plus ``feats``
    (every feature with value 1) is classified and the result stored.

    Args:
        model: trained liblinear model.
        cache: per-position cache; ``cache[j][1]`` maps ``feats`` to a label.
        feats: hashable collection of extra feature ids.
        x_: per-position lists of base feature ids.
        j: position index.

    Returns:
        The predicted label as an int.
    """
    memo = cache[j][1]
    if feats not in memo:
        active = x_[j] + list(feats)
        instance = [dict((k, 1) for k in active)]
        predicted = ll.predict([], instance, model, '-q')[0]
        memo[feats] = int(predicted[0])
    return memo[feats]
def eval_SVR_liblinear(predictor, examples, responses):
    """Predict ``examples`` and report fit statistics against ``responses``.

    The RMSE, R2 and Pearson correlation between ``responses`` and the
    predictions are written through ``echo``.

    Args:
        predictor: trained liblinear model.
        examples: instances to predict.
        responses: true target values.

    Returns:
        The predictions returned by liblinear.
    """
    echo('Evaluating predictor')
    predictions = liblinearutil.predict([], examples, predictor, '-q')[0]
    echo(
        'RMSE:', rmse(responses, predictions),
        '\tR2:', R2(responses, predictions),
        '\tPearson R:', pearsonr(responses, predictions),
    )
    return predictions
def predict(x, y, model, classify=True):
    """Run ``model`` on ``x`` and return labels or decision values.

    Args:
        x: instances; must provide ``tolist()``.
        y: labels; must provide ``tolist()``.
        model: trained liblinear model.
        classify: when true return the predicted labels, otherwise the
            third element of liblinear's result (the decision values).

    Returns:
        The predicted labels or the decision values, see ``classify``.
    """
    outcome = liblin.predict(y.tolist(), x.tolist(), model, "-q")
    labels, _, values = outcome
    return labels if classify else values
def predict(self , vec) :
    """Classify a single instance and return the name of its class.

    Args:
        vec: list holding exactly one feature dict ``{idx: val, ...}``.

    Returns:
        ``self.POSITIVE_NAME`` when the predicted label equals
        ``self.POSITIVE_LABEL``, otherwise ``self.NEGATIVE_NAME``.
    """
    # liblinear wants one label per instance; its value is irrelevant here.
    placeholder = [self.NEGATIVE_LABEL]
    predicted = liblinearutil.predict(placeholder, vec, self.model, "-q")[0]
    if int(predicted[0]) == self.POSITIVE_LABEL:
        return self.POSITIVE_NAME
    return self.NEGATIVE_NAME
def face_detect(self, img): """ Detect face bounding box given image img: input image """ # convert to gray if img.ndim > 2: img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # detect face bboxes = self.face_detector['detector'].detectMultiScale(img, minNeighbors=self.face_detector['minNeighbors'], minSize=self.face_detector['minSize']) if len(bboxes) == 0: #print('No face is detected') return np.zeros((0, 4)) # else, select appropriate face # exclude very small bounding box index_face_size = (-bboxes[:, 2]).argsort() # descending order bboxes = bboxes[index_face_size, :] for idx in np.arange(1, bboxes.shape[0]): if bboxes[idx, 2] <= np.round(bboxes[0, 2]*0.3): bboxes = bboxes[:idx, :] break # compute confidence for each remaining bbox final_bboxes = np.zeros((0, 4)) C = [] for idx in np.arange(bboxes.shape[0]): bbox = bboxes[idx, :] im_cut = img[bbox[1]:bbox[1]+bbox[3], bbox[0]:bbox[0]+bbox[2]] im_cut = cv2.resize(im_cut, (160, 160), interpolation=cv2.INTER_CUBIC) _, descriptor = lbp(im_cut) descriptor = descriptor.astype(float)/np.sum(descriptor) descriptor = list(descriptor) _, _, confidence = predict([0], [descriptor], self.face_detector['confidence_LBP']['model'], '-b 1 -q') if confidence[0][0] < self.face_detector['confidence_LBP']['thre']: continue C.append(confidence[0][0]) final_bboxes = np.concatenate((final_bboxes, bbox.reshape((1, -1)))) if final_bboxes.shape[0] == 0: return final_bboxes # choose largest and best one #index_face_size = (-final_bboxes[:, 2]).argsort() # descending order #final_bboxes = final_bboxes[index_face_size, :] #C = C[index_face_size] maxC = np.max(C) for idx in np.arange(final_bboxes.shape[0]): if C[idx] - maxC > -0.05: bbox = final_bboxes[idx, :].reshape((1, -1)) break return bbox
def saliency(self, img, normalize=True):
    """Computes eDN saliency map for single image or image sequence.

    Args:
        img: 3-D image array; the last axis is processed in groups of three
            channels (one group per frame of a sequence).
        normalize: when true, rescale the map to the 0..255 range and return
            it as uint8; otherwise return the raw upscaled prediction.

    Returns:
        2-D saliency map with the height and width of the input image.

    NOTE(review): this block relies on Python 2 semantics (``xrange`` and
    integer ``/``) and on ``misc.imresize``; it also passes non-integer
    sizes to ``np.zeros``. Confirm the target interpreter / library
    versions before reusing it.
    """
    descs = self.descriptions
    # rescale image to typical input size
    imgSize = img.shape[:2]
    rescale_factor = 0.5*EDN_INSIZE[0]/max(imgSize) + \
        0.5*EDN_INSIZE[1]/min(imgSize)
    # single image:
    # img = misc.imresize(image, rescale_factor, 'bicubic')
    # image sequence (or single image): resize each 3-channel group
    # separately into a pre-allocated buffer
    scaledImg = np.zeros([a * rescale_factor for a in imgSize] + [img.shape[2]])
    for j in xrange(img.shape[2] / 3):
        scaledImg[:, :, j * 3:j * 3 + 3] = misc.imresize(
            img[:, :, j * 3:j * 3 + 3], rescale_factor, 'bicubic')
    img = scaledImg
    # compute eDN features for description(s)
    t1 = time.time()
    fMapEDN, fMapSize = eDN_features(img, descs)
    t2 = time.time()
    logging.info("Feature computation took %0.3fs" % (t2 - t1))
    # optionally prepend distance-to-center features
    if self.biasToCntr:
        fMapCntr = dist_to_cntr_features(img, fMapSize)
        fMap = np.hstack((fMapCntr, fMapEDN))
    else:
        fMap = fMapEDN
    fMapW, fwParams = whiten_features(fMap, self.whitenParams)
    # SVM prediction (no true labels available, hence the empty list)
    t1 = time.time()
    bs, pAcc, pred = predict([], fMapW.tolist(), self.svm, options="-q")
    t2 = time.time()
    logging.info("Prediction took %0.3fs" % (t2 - t1))
    pred = np.array(pred)
    # reshaping (column-major) and upscaling back to the input image size
    pred = pred.reshape(fMapSize, order='F')
    predLarge = sp.ndimage.interpolation.zoom(
        pred, (imgSize[0] / float(pred.shape[0]),
               imgSize[1] / float(pred.shape[1])))
    # normalization: stretch linearly so min -> 0 and max -> 255
    if normalize:
        rescaled = (255.0 / (predLarge.max() - predLarge.min()) *
                    (predLarge - predLarge.min())).astype(np.uint8)
        return rescaled
    else:
        return predLarge
def liblinear_classifier(svm_input=None, y=None, x=None):
    """Run the trained liblinear classifier for spam filtering.

    The model is loaded from ``SVM_MODEL_FILE``.  Instances come either
    from a problem file or from the ``y`` / ``x`` arguments.

    Args:
        svm_input: optional path of a problem file in svm/liblinear format;
            when given it replaces ``y`` and ``x``.
        y: list of labels (defaults to an empty list).
        x: list of instances (defaults to an empty list).

    Returns:
        The list of predicted labels.
    """
    svm_model = load_model(SVM_MODEL_FILE)
    if svm_input:
        y, x = svm_read_problem(svm_input)
    # Fresh lists per call instead of shared mutable default arguments.
    if y is None:
        y = []
    if x is None:
        x = []
    p_label, p_acc, p_val = predict(y, x, svm_model, "-q")
    return p_label
def saliency(self, img, normalize=True):
    """Computes eDN saliency map for single image or image sequence.

    Args:
        img: 3-D image array; the last axis is processed in groups of three
            channels (one group per frame of a sequence).
        normalize: when true, rescale the map to the 0..255 range and return
            it as uint8; otherwise return the raw upscaled prediction.

    Returns:
        2-D saliency map with the height and width of the input image.

    NOTE(review): this block relies on Python 2 semantics (``xrange`` and
    integer ``/``) and on ``misc.imresize``; it also passes non-integer
    sizes to ``np.zeros``. Confirm the target interpreter / library
    versions before reusing it.
    """
    descs = self.descriptions
    # rescale image to typical input size
    imgSize = img.shape[:2]
    rescale_factor = 0.5*EDN_INSIZE[0]/max(imgSize) + \
        0.5*EDN_INSIZE[1]/min(imgSize)
    # single image:
    # img = misc.imresize(image, rescale_factor, 'bicubic')
    # image sequence (or single image): resize each 3-channel group
    # separately into a pre-allocated buffer
    scaledImg = np.zeros([a*rescale_factor for a in imgSize] + [img.shape[2]])
    for j in xrange(img.shape[2]/3):
        scaledImg[:,:,j*3:j*3+3] = misc.imresize(img[:,:,j*3:j*3+3],
                                                 rescale_factor, 'bicubic')
    img = scaledImg
    # compute eDN features for description(s)
    t1 = time.time()
    fMapEDN, fMapSize = eDN_features(img, descs)
    t2 = time.time()
    logging.info("Feature computation took %0.3fs" % (t2-t1))
    # optionally prepend distance-to-center features
    if self.biasToCntr:
        fMapCntr = dist_to_cntr_features(img, fMapSize)
        fMap = np.hstack((fMapCntr, fMapEDN))
    else:
        fMap = fMapEDN
    fMapW, fwParams = whiten_features(fMap, self.whitenParams)
    # SVM prediction (no true labels available, hence the empty list)
    t1 = time.time()
    bs, pAcc, pred = predict([], fMapW.tolist(), self.svm, options="-q")
    t2 = time.time()
    logging.info("Prediction took %0.3fs" % (t2-t1))
    pred = np.array(pred)
    # reshaping (column-major) and upscaling back to the input image size
    pred = pred.reshape(fMapSize, order='F')
    predLarge = sp.ndimage.interpolation.zoom(pred,
                                              (imgSize[0]/float(pred.shape[0]),
                                               imgSize[1]/float(pred.shape[1])))
    # normalization: stretch linearly so min -> 0 and max -> 255
    if normalize:
        rescaled = (255.0 / (predLarge.max()-predLarge.min()) *
                    (predLarge-predLarge.min())).astype(np.uint8)
        return rescaled
    else:
        return predLarge
def alternating_predict(x, y, proj_landmarks, model, classify=True):
    """Predict on the views of ``x`` reconstructed from the landmarks.

    Args:
        x: input passed to ``recontruct_views``.
        y: labels; must provide ``tolist()``.
        proj_landmarks: landmark projection passed to ``recontruct_views``.
        model: trained liblinear model.
        classify: when true return the predicted labels, otherwise the
            third element of liblinear's result (the decision values).

    Returns:
        The predicted labels or the decision values, see ``classify``.
    """
    rebuilt = recontruct_views(x, proj_landmarks)
    labels, _, values = liblin.predict(
        y.tolist(), rebuilt.tolist(), model, "-q")
    return labels if classify else values
def test( C, Y_test, X_test, x_lines ):
    """Load the saved model for ``C`` / ``x_lines`` and predict the test set.

    The model is read from
    ``model/lmods2_tamper<round(C, 2)>_<x_lines>l.model``.

    :param C : value of the parameter C used in the model file name
    :param Y_test : test labels
    :param X_test : test features
    :param x_lines : value used in the model file name
    :return the labels predicted for ``X_test``
    """
    model_path = "model/lmods2_tamper" + str(round(C,2)) + "_" + str(x_lines) + "l.model"
    saved_model = lu.load_model(model_path)
    predicted = lu.predict(Y_test, X_test, saved_model)[0]
    return predicted
def parallel_train_predict(args):
    """Train a liblinear model and return its decision values on the test set.

    Intended as a worker function: all inputs arrive in one tuple.  The
    time spent training and predicting is printed.

    Args:
        args: tuple ``(x_train, y_train, x_test, y_test)``.

    Returns:
        The third element of liblinear's predict result (decision values,
        since '-b 0' is passed).
    """
    # time.clock() no longer exists on Python >= 3.8; prefer perf_counter
    # and fall back to clock only on interpreters that lack it.
    timer = getattr(time, 'perf_counter', None) or time.clock

    print("A process begins.")
    x_train, y_train, x_test, y_test = args
    problem = liblinearutil.problem(y_train, x_train)
    parameter = liblinearutil.parameter('-s 0 -c 1')

    time_start = timer()
    model = liblinearutil.train(problem, parameter)
    print("A process training finished in %f." % (timer() - time_start))

    time_start = timer()
    p_label, p_acc, p_val = liblinearutil.predict(y_test, x_test, model, '-b 0')
    print("A process predicting finished in %f." % (timer() - time_start))
    return p_val
def validation(k, data_x, data_y, s, e, C):
    """Run k-fold cross validation and collect the accuracy of each fold.

    Args:
        k: number of folds.
        data_x: all instances.
        data_y: all labels.
        s: forwarded to ``get_params``.
        e: forwarded to ``get_params``.
        C: forwarded to ``get_params``.

    Returns:
        list with the first element of liblinear's accuracy tuple for each
        of the ``k`` folds, in fold order.
    """
    params = get_params(s, e, C)
    print('s = {}, e = {}, C = {}'.format(s, e, C))

    def fold_accuracy(fold):
        # Train on the fold's training part, evaluate on its held-out part.
        fit_x, held_x = get_k_fold(k, fold, data_x)
        fit_y, held_y = get_k_fold(k, fold, data_y)
        fitted = liblinearutil.train(fit_y, fit_x, params)
        metrics = liblinearutil.predict(held_y, held_x, fitted)[1]
        return metrics[0]

    return [fold_accuracy(fold) for fold in range(k)]
def predict(self, x):
    """Predict one output per sample of ``x``.

    Each sample is a mapping from feature name to value.  Features whose id
    (``self._features.getId``) is falsy are dropped.  In regression mode the
    raw prediction is returned; otherwise the predicted label is mapped
    back through ``self._labels.getVal`` (with two labels, any prediction
    other than 1 is treated as 2).

    Args:
        x: iterable of samples.

    Returns:
        list of predictions, in the order of ``x``.
    """
    outputs = []
    for sample in x:
        encoded = {self._features.getId(name): sample[name]
                   for name in sample if self._features.getId(name)}
        predicted = liblinear.predict([0], [encoded], self._model, "")[0]
        if self._regression:
            outputs.append(predicted[0])
            continue
        label_id = predicted[0]
        if self._labels.count() == 2:
            label_id = 1 if label_id == 1 else 2
        outputs.append(self._labels.getVal(label_id))
    return outputs
def predict_with_svm(model, predict_y, predict_x):
    """Predict ``predict_x`` and count false positives / false negatives.

    A pair counts as false positive when ``truth - prediction == -2`` and
    as false negative when it equals ``2``.

    Args:
        model: trained liblinear model.
        predict_y: true labels.
        predict_x: instances to classify.

    Returns:
        ``(total, num_false_pos, num_false_neg)`` where ``total`` is
        ``len(predict_y)``.
    """
    predicted = ll.predict(predict_y, predict_x, model, "-q")[0]
    differences = [truth - guess for truth, guess in zip(predict_y, predicted)]
    false_pos = sum(1 for delta in differences if delta == -2)
    false_neg = sum(1 for delta in differences if delta == 2)
    return (len(predict_y), false_pos, false_neg)
def prefict_from_weibos(weibos,verbose=False):
    """Return the mean logistic score of the analysable weibos.

    The text of every weibo is turned into a feature vector by ``analyze``;
    weibos yielding a falsy vector are ignored.  The first decision value
    of each remaining weibo is squashed with the logistic function and the
    values are averaged.

    Args:
        weibos: iterable of dict-like weibos with a ``'text'`` entry.
        verbose: unused here.

    Returns:
        The mean score, or None when no weibo produced a feature vector.
    """
    candidates = (analyze(weibo.get('text')) for weibo in weibos)
    fvecs = [fvec for fvec in candidates if fvec]
    if not fvecs:
        return None

    decision_values = svm.predict([], fvecs, model)[2]
    stresses = []
    for plist in decision_values:
        stresses.append(exp(plist[0]) / (1 + exp(plist[0])))
    return sum(stresses) / len(stresses)
def unimodalPredDev(gs, feats, nDim):
    """Pick the regressor with the best CCC on the dev partition.

    A liblinear model is trained for every complexity in ``v.C``; when
    ``v.fullMode`` is True every function of ``v.lFunc`` is additionally
    tried with every parameter of the matching ``v.parFunc`` entry.  The
    predictions with the highest value of ``cccCalc`` are kept.

    Args:
        gs: gold standard per partition, indexed as ``gs[part][nDim]``.
        feats: features per partition, ``feats[part]``.
        nDim: index of the dimension to predict.

    Returns:
        ``(cccs, preds, function, alpha)``: best CCC per partition, the
        matching predictions per partition, the name of the winning method
        and its parameter.  ``function`` and ``alpha`` are None if no
        candidate scored above -1.0.
    """
    parts = ['dev']
    preds = {}
    cccs = {s: -1.0 for s in parts}
    # Defined up front so the return statement cannot hit unbound names
    # when no candidate improves on the initial -1.0.
    function = None
    alpha = None
    warnings.filterwarnings('ignore', category=ConvergenceWarning)

    # Liblinear
    for comp in v.C:
        # Options for liblinear
        options = "-s "+str(v.sVal)+" -c "+str(comp)+" -B 1 -q"
        # We learn the model on train
        model = train(gs['train'][nDim], feats['train'], options)
        # We predict on data
        for s in parts:
            # Take the predicted labels directly; wrapping the whole
            # (labels, accuracy, values) tuple in np.array builds a ragged
            # array, which current NumPy refuses to create.
            pred = predict(gs[s][nDim], feats[s], model, "-q")[0]
            # We calculate the correlation and store it
            ccc = cccCalc(np.array(pred), gs[s][nDim])
            if ccc > cccs[s]:
                preds[s] = pred
                cccs[s] = ccc
                function = "SVR"
                alpha = comp

    if (v.fullMode == True):
        # We see if we can do better with sklearn
        for nbFunc in range(len(v.lFunc)):
            for c in v.parFunc[nbFunc]:
                func = v.lFunc[nbFunc]
                reg = func[0](alpha=c)
                if func[1] == 0:
                    # One task prediction
                    reg.fit(feats['train'], gs['train'][nDim])
                    for s in parts:
                        p = reg.predict(feats[s])
                        ccc = cccCalc(p, gs[s][nDim])
                        if ccc > cccs[s]:
                            preds[s] = p
                            cccs[s] = ccc
                            function = func[2]
                            alpha = c
                else:
                    # Multi task prediction
                    reg.fit(feats['train'], np.transpose(gs['train']))
                    for s in parts:
                        p = reg.predict(feats[s])[:, nDim]
                        ccc = cccCalc(p, gs[s][nDim])
                        if ccc > cccs[s]:
                            preds[s] = p
                            cccs[s] = ccc
                            function = func[2]
                            alpha = c
    return cccs, preds, function, alpha
def predict(self, x):
    """Return the model output for every sample in ``x``.

    Samples map feature names to values; a feature is kept only when
    ``self._features.getId`` returns a truthy id for it.  Regression models
    yield the raw prediction, classifiers yield the value registered for
    the predicted label (binary models map every non-1 prediction to 2).

    Args:
        x: iterable of samples.

    Returns:
        list with one prediction per sample.
    """
    def encode(sample):
        # Translate feature names into ids, skipping unknown features.
        return dict((self._features.getId(key), sample[key])
                    for key in sample if self._features.getId(key))

    results = []
    for sample in x:
        label, _, _ = liblinear.predict([0], [encode(sample)], self._model, '')
        value = label[0]
        if not self._regression:
            if self._labels.count() == 2 and value != 1:
                value = 2
            value = self._labels.getVal(value)
        results.append(value)
    return results
def predict(name, query, testGame, model, method='skip_1'):
    '''Predicts the answer to the query and returns an array of tuples
    (score, answers), as well as the correct answer. In the tuples, answers
    is all the possible right answers e.g. ['Ronaldo', 'Cristiano Ronaldo',
    'Cristiano'].

    Args:
        name: passed to ``dts.Dataset.from_columns`` to create the dataset.
        query: key looked up in ``testGame.query_dict``.
        testGame: object providing ``text`` (items with ``decode()``) and
            ``query_dict``.
        model: liblinear model used for prediction; with
            ``method='word2vec'`` it is also forwarded to the feature
            extractor.
        method: feature extraction method name.

    Returns:
        ``(scores, answer)`` where ``answer`` is 'N/A' when the query is not
        in ``testGame.query_dict``.

    NOTE(review): this block uses ``iterkeys`` / ``iteritems`` and therefore
    only runs on Python 2.
    '''
    entities = {}
    # create Dataset object
    testSet = dts.Dataset.from_columns(name)
    text = ' '.join([t.decode() for t in testGame.text])
    text, entities = txt.anonymize(text)
    #for i in range(len(text)):
        #text[i], entities = txt.anonymize(text[i], entities)
    # reverse mapping of the one returned by the anonymizer
    inv_entities = {v: k for k, v in entities.items()}
    # fetch answer
    try:
        answer = testGame.query_dict[query]
    except KeyError:
        answer = 'N/A'
    # create feature vector for each entity in text
    for ent_id in inv_entities.iterkeys():
        ent_name = 'ent' + str(ent_id)
        if method!='word2vec':
            feature_vector = ext.create_feature_vector(ent_name, text, method)
            # NOTE(review): inv_entities is keyed by the same ids that
            # ent_id iterates over, yet it is indexed with ``answer`` here;
            # presumably this usually raises KeyError and the substring
            # fallback decides the label -- verify the intended lookup.
            try:
                label = (ent_id == inv_entities[answer]) * 1.0
            except KeyError:
                label = (inv_entities[ent_id] in answer) * 1.0
            # add feature vector to dataset
            testSet.append((feature_vector, label), ent_name)
        else:
            feature_vector = ext.create_feature_vector(ent_name, text, method, model=model)
            try:
                label = (ent_id == inv_entities[answer]) * 1.0
            except KeyError:
                label = (inv_entities[ent_id] in answer) * 1.0
            # the vector is converted to an {index: value} dict before it is
            # added to the dataset
            testSet.append((dict(zip(range(len(feature_vector)), feature_vector)), label), ent_name)
    scores = []
    words = testSet.entities
    # '-b 1' is passed to liblinear; the second value of each result row is
    # used as the score
    _, _, probas = llb.predict(testSet.Y, testSet.X, model, '-b 1')
    for i, proba in enumerate(probas):
        # words[i][3:] strips the 'ent' prefix to recover the entity id
        scores.append((proba[1], [k for k,v in entities.iteritems() \
                                  if str(v) == words[i][3:]]))
    return scores, answer
def propose_probabilistic(model, cache, feats, x_, j):
    """Sample a label for position ``j`` from the model's distribution.

    The distribution for the feature combination ``feats`` is looked up in
    ``cache[j][1]``; on a miss the instance made of ``x_[j]`` plus ``feats``
    (every feature with value 1) is scored with '-q -b 1' and the first
    row of the result is stored.

    Args:
        model: trained liblinear model.
        cache: per-position cache; ``cache[j][1]`` maps ``feats`` to a
            distribution.
        feats: hashable collection of extra feature ids.
        x_: per-position lists of base feature ids.
        j: position index.

    Returns:
        The 1-based index of the sampled entry of the distribution; the last
        index if the cumulative sum never exceeds the random draw.
    """
    memo = cache[j][1]
    if feats not in memo:
        instance = [dict.fromkeys(x_[j] + list(feats), 1)]
        memo[feats] = ll.predict([], instance, model, '-q -b 1')[2][0]
    dist = memo[feats]

    # Inverse-CDF sampling: walk the entries until the draw is used up.
    remaining = random()
    for label, p in enumerate(dist, 1):
        remaining -= p
        if remaining < 0:
            return label
    return label
def classify_single_tweet(self, tweet, text_lookup_field_name='text'):
    """Tell whether a single tweet is accepted by the classifier.

    The model prediction may be overridden by ``self.hard_feature_rules``
    (only when both ``self._fd`` and ``self._featrule`` are set) and is
    vetoed when ``self.hard_reject`` rejects the tweet text.

    Args:
        tweet: tweet dict.
        text_lookup_field_name: key under which the tweet stores its text.

    Returns:
        True when the final label is 1, otherwise False.
    """
    instances = [extract_single_vector(tweet, self.ktw, self._vocab,
                                       text_lookup_field_name)]
    labels = predict([], instances, self._model, '-q')[0]
    if self._fd and self._featrule:
        labels = self.hard_feature_rules(labels, instances)
    if self.hard_reject(tweet[text_lookup_field_name]):
        return False
    return labels[0] == 1
def TOKEN_SVM(matrix, test_matrix, n_authors, doc_authors, vocab, stopwords):
    """Attribute test documents to authors with an SVM on token frequencies.

    Both matrices are row-normalised; a liblinear model trained on the
    training rows predicts one author per test row.

    Args:
        matrix: training document matrix.
        test_matrix: test document matrix.
        n_authors: number of authors (width of the returned matrix).
        doc_authors: list of per-document author lists; flattened to obtain
            the training labels.
        vocab: unused here.
        stopwords: unused here.

    Returns:
        ``(n_test_docs, n_authors)`` array with a 1 in the column of the
        predicted author of each test document and 0 elsewhere.
    """
    n_docs = matrix.shape[0]
    n_test_docs = test_matrix.shape[0]

    # Scale every row so that it sums to one.
    train_rows = matrix / np.sum(matrix, 1)[:, None]
    test_rows = test_matrix / np.sum(test_matrix, 1)[:, None]

    flat_authors = sum(doc_authors, [])
    svm_model = ll.train(flat_authors, train_rows.tolist(), '-c 4')
    # Random placeholders stand in for the unknown test labels.
    predicted = ll.predict(np.random.rand(n_test_docs),
                           test_rows.tolist(), svm_model)[0]

    author_probs = np.zeros((n_test_docs, n_authors))
    for row, author in enumerate(predicted):
        author_probs[row, int(author)] = 1
    return author_probs
def get_scores(self, items):
    """Train a model and return its decision values for ``items``.

    Args:
        items: iterable of dict-like items with a ``'title'`` entry; every
            title is converted with ``self.to_instance``.

    Returns:
        The third element of liblinear's predict result, one entry per item.
    """
    import os

    model = self.train()
    insts = [self.to_instance(item['title']) for item in items]

    # Hide the messages LIBLINEAR's Python wrapper prints during prediction.
    # The sink is opened in text mode (print() writes str, not bytes) and
    # stdout is restored / the handle closed even if prediction fails.
    old_out = sys.stdout
    sink = open(os.devnull, 'w')
    try:
        sys.stdout = sink
        dummy, dummy, decs = liblinearutil.predict([], insts, model)
    finally:
        sys.stdout = old_out
        sink.close()
    return decs
def train(train_X, train_Y, test_X, test_Y):
    """Fit five regularisation settings and pick the one with least error.

    One model is trained per cost value (5000, 50, 0.5, 0.005, 0.00005) and
    evaluated on the given evaluation set.  The error of a setting is
    ``(100 - accuracy) / 100``.

    Args:
        train_X: training instances.
        train_Y: training labels.
        test_X: evaluation instances.
        test_Y: evaluation labels.

    Returns:
        ``(best, errors)``: the entry of the module-level ``lambda_set`` at
        the position of the first smallest error, and the list of the five
        errors in the order of the cost values above.
    """
    option_strings = (
        '-s 0 -c 5000 -e 0.000001',
        '-s 0 -c 50 -e 0.000001',
        '-s 0 -c 0.5 -e 0.000001',
        '-s 0 -c 0.005 -e 0.000001',
        '-s 0 -c 0.00005 -e 0.000001',
    )

    test_accuracy = []
    for options in option_strings:
        fitted = liblinearutil.train(train_Y, train_X, options)
        metrics = liblinearutil.predict(test_Y, test_X, fitted)[1]
        test_accuracy.append((100 - metrics[0]) / 100)

    best_position = test_accuracy.index(min(test_accuracy))
    return lambda_set[best_position], test_accuracy
def eval_SVM(X, y, Xhat, yhat):
    """Train a liblinear model on (X, y) and return its accuracy on (Xhat, yhat).

    Args:
        X: training instances.
        y: training labels.
        Xhat: evaluation instances.
        yhat: evaluation labels.

    Returns:
        The first value returned by ``liblinearutil.evaluations`` for the
        evaluation labels and the predictions.
    """
    # Training uses the options '-s 3 -c 10 -q -B 1'.
    fitted = liblinearutil.train(
        liblinearutil.problem(y, X),
        liblinearutil.parameter('-s 3 -c 10 -q -B 1'))

    predicted = liblinearutil.predict(yhat, Xhat, fitted, '-q')[0]

    # evaluations yields three values; only the first one is needed.
    acc, _, _ = liblinearutil.evaluations(yhat, predicted)
    return acc
def r_obj_function(R, x, y, lands, svm, mask, c1, c2):
    """Objective minimised over the reconstruction coefficients ``R``.

    The cost is ``c2`` times the squared reconstruction error on the masked
    entries plus ``c1`` times the hinge loss of the SVM decision values of
    the reconstructed sample.

    Args:
        R: flat coefficient vector; reshaped to ``(len(x), len(lands))``.
        x: observed data.
        y: labels (also used to index the decision values).
        lands: landmark matrix.
        svm: trained liblinear model.
        mask: index / boolean mask selecting the entries of ``x`` that
            contribute to the reconstruction error.
        c1: weight of the hinge-loss term.
        c2: weight of the reconstruction term.

    Returns:
        The scalar cost.
    """
    m = len(x)
    l = len(lands)
    r_2d = R.reshape(m, l)
    reconstructed = np.dot(r_2d, lands)

    _, _, svm_pred = liblin.predict(y, reconstructed.tolist(), svm, "-q")

    cost = c2 * sum((x - reconstructed)[mask]**2)
    # Hinge loss needs the element-wise maximum with 0.  np.max(0, arr)
    # would interpret ``arr`` as the axis argument instead.
    # NOTE(review): the [0, y] indexing is kept from the original; confirm
    # it selects the intended decision values.
    cost += c1 * sum(np.maximum(0, 1 - np.asarray(svm_pred)[0, y]))
    return cost
def alternating_train(x, y, lands, c1, c2=1, params='-s 2 -B 1 -q'):
    """Alternate between fitting reconstruction coefficients and an SVM.

    The views of ``x`` and ``lands`` are stacked horizontally, initial
    coefficients come from ``missing_lstsq``, and an SVM is trained on the
    reconstructed sample.  One round of "optimise coefficients with
    ``r_obj_function``, then retrain the SVM" follows.

    Args:
        x: data with views along the third axis.
        y: labels; must provide ``tolist()``.
        lands: landmarks with views along the third axis.
        c1: SVM cost, also the hinge-loss weight of the objective.
        c2: reconstruction weight of the objective.
        params: additional liblinear training options.

    Returns:
        The liblinear model trained in the last round.
    """
    nb_views = lands.shape[2]
    L = np.hstack([lands[:, :, v] for v in range(nb_views)])
    M = np.hstack([x[:, :, v] for v in range(nb_views)])
    l = len(lands)
    m = len(x)

    r0, mask = missing_lstsq(L, M)
    R = r0.copy()
    sample = np.dot(r0, L)

    y_list = y.tolist()
    svm_options = '-c {} '.format(c1) + params
    svm = liblin.train(y_list, sample.tolist(), svm_options)

    it = 0
    while True:
        it += 1
        res = minimize(r_obj_function, R.flatten(),
                       args=(M, y, L, svm, mask, c1, c2),
                       options={'disp': True})
        # Fixed: the original printed ``r_i.fun``, a name that only exists
        # in removed per-row code, which raised NameError here.
        print(res.fun)
        R = res.x.reshape(m, l)
        sample = np.dot(R, L)
        svm = liblin.train(y_list, sample.tolist(), svm_options)
        # Labels are passed as a plain list, like in the train calls.
        _, p_acc, _ = liblin.predict(y_list, sample.tolist(), svm, "-q")
        print(p_acc)
        # A single alternation round is performed.
        if it == 1:
            break
    return svm
def AT_FA_SVM(matrix, test_matrix, n_authors, doc_authors, vocab, stopwords):
    """Attribute test documents with author-topic features fed to an SVM.

    An author-topic sampler extended with fictitious authors is trained on
    ``matrix``; the training features come from
    ``concatenate_fic_authors`` and the test features are the test topic
    mixtures concatenated with themselves.  Both are row-normalised and a
    liblinear model predicts one author per test document.

    Args:
        matrix: training document matrix.
        test_matrix: test document matrix; ``shape[0]`` is the number of
            test documents.
        n_authors: number of authors (width of the returned matrix).
        doc_authors: list of per-document author lists; flattened to obtain
            the training labels.
        vocab: unused here.
        stopwords: unused here.

    Returns:
        ``(num_test_docs, n_authors)`` array with a 1 in the column of the
        predicted author of each test document and 0 elsewhere.
    """
    # set parameters
    num_topics = 4
    burn_in = 1000  # 0
    alpha = 0.1
    beta = 0.1
    samples = 8
    spacing = 100
    num_test_docs = test_matrix.shape[0]

    doc_authors_new, n_authors_new = add_fic_authors(doc_authors, n_authors)
    sampler = at.AtSampler(num_topics, n_authors_new, alpha, beta)
    print('Starting!')
    theta, phi, likelihood = sampler.train(doc_authors_new, matrix, burn_in,
                                           samples, spacing)
    print('theta:', theta.shape)
    print('phi:', phi.shape)
    print('likelihood:', likelihood)

    # For classification the sampler is told there are as many authors as
    # test documents.
    sampler.n_authors = num_test_docs
    theta_test = sampler.classify(test_matrix, phi, burn_in, samples, spacing)
    print('theta test:', theta_test.shape)

    training_matrix = concatenate_fic_authors(doc_authors, num_topics)
    test_matrix = np.concatenate((theta_test, theta_test), axis=1)

    # Normalise every row to sum to one.
    training_matrix = training_matrix / np.sum(training_matrix, 1)[:, None]
    test_matrix = test_matrix / np.sum(test_matrix, 1)[:, None]

    svm_model = ll.train(sum(doc_authors, []), training_matrix.tolist(),
                         '-c 4')
    # Random values stand in for the unknown test labels.
    p_label, p_acc, p_val = ll.predict(np.random.rand(num_test_docs),
                                       test_matrix.tolist(), svm_model)

    # Fixed: the original used the undefined name ``n_test_docs`` here.
    author_probs = np.zeros((num_test_docs, n_authors))
    for doc, author in enumerate(p_label):
        author_probs[doc, int(author)] = 1
    return author_probs
def label(self, sentence):
    """Label every word of ``sentence`` using the previous labels as context.

    Each word is lower-cased and, together with the most recent
    ``(word, label)`` pairs (at most ``self.chain_len`` of them), turned
    into a feature space that the SVM model classifies.

    Args:
        sentence: iterable of word strings.

    Returns:
        list of ``(word, label)`` pairs with the original (not lower-cased)
        words.
    """
    history = []
    tagged = []
    for word in sentence:
        lowered = word.lower()
        space = self._construct_featurespace(lowered, history)
        predicted = svm.predict([0], [space.featureset], self._svm_model, '')[0]
        tag = predicted[0]
        history.append((lowered, tag))
        # Keep only the newest chain_len entries as context.
        if len(history) > self.chain_len:
            history.pop(0)
        tagged.append((word, tag))
    return tagged
def main():
    """Train on the first 14000 instances and evaluate on the rest.

    The problem is read from ``feature_file``; a model trained with the
    options '-c 1 -s 2' predicts the held-out part and the accuracy,
    precision and recall from ``metrics_result`` are printed.  Nothing
    happens unless the module is run as a script.
    """
    if __name__ == "__main__":
        y, x = svm_read_problem(feature_file, return_scipy=True)

        # train:test = 7:3
        train_X = x[:14000]
        train_y = y[:14000]
        test_X = x[14000:]
        test_y = y[14000:]

        prob = problem(train_y, train_X)
        param = parameter("-c 1 -s 2")
        model = train(prob, param)
        p_labs, p_acc, p_vals = predict(test_y, test_X, model)
        accuracy, precision, recall = metrics_result(test_y, p_labs)

        # The original used Python 2 print statements.  Each call below
        # takes a single string so the output is the same on Python 2 and
        # 3 (the statement form put a space after every comma).
        print("")
        print("accuracy:  %s" % (accuracy,))
        print("precision:  %s" % (precision,))
        print("recall:  %s" % (recall,))
def f_step(features, models, verbose=True):
    """Predict one code column per model and solve for the projection.

    Every model predicts a label for each row of ``features``; the labels
    form the matrix ``Z`` (one row per instance, one column per model).

    Args:
        features: 2-D array of instances.
        models: sequence of trained liblinear models.
        verbose: when true, print the time spent on every model.

    Returns:
        ``(pinv(Z).dot(features), Z)``.
    """
    columns = []
    for position, bit_model in enumerate(models):
        began = timeit.default_timer()
        predicted = liblinearutil.predict(
            [0] * features.shape[0], features.tolist(), bit_model,
            str('-q'))[0]
        columns.append(predicted)
        finished = timeit.default_timer()
        if verbose:
            print('[F] {:3d}th bit, {:.4f} seconds elapsed'.format(
                position, finished - began))

    Z = np.vstack(columns).transpose()
    projection = np.linalg.pinv(Z).dot(features)
    return (projection, Z)
def label(self, sentence):
    """Assign a label to each word, feeding earlier labels back as context.

    Args:
        sentence: iterable of word strings.

    Returns:
        list of ``(word, label)`` pairs, one per input word, keeping the
        original spelling of the word.
    """
    labeled = []
    context = []  # newest (lower-cased word, label) pairs, oldest first
    for token in sentence:
        normalized = token.lower()
        featurespace = self._construct_featurespace(normalized, context)
        outcome = svm.predict([0], [featurespace.featureset],
                              self._svm_model, '')
        predicted = outcome[0][0]
        context.append((normalized, predicted))
        # The context never grows beyond chain_len entries.
        if len(context) > self.chain_len:
            del context[0]
        labeled.append((token, predicted))
    return labeled
def classify(self, text):
    """Rank the model's outputs for ``text``.

    Args:
        text: the text to classify.

    Returns:
        list of ``(index, scaled_value)`` pairs sorted by descending raw
        value, where ``index`` is the position of the value in the model
        output and ``scaled_value`` is ``self._soft_max_scaling(value)``.
    """
    instance = NormalizedBinaryTextInstance(
        None, text, self.feature_indices)

    # Construct the feature vector from the features known to the model.
    feature_vector = {}
    for name, value in instance.feature_values():
        slot = self.feature_indices.get(name)
        if slot is None:
            continue
        feature_vector[slot] = value

    if self.output_probability:
        options = '-b 1'
    else:
        options = ''
    outputs = liblinearutil.predict([], [feature_vector],
                                    self.model, options)[2]

    vals = dict(enumerate(outputs[0]))
    ranking = sorted(vals, key=vals.get, reverse=True)
    return [(label, self._soft_max_scaling(vals[label])) for label in ranking]
def classify(self, text):
    """Return the model outputs for ``text``, best first.

    Args:
        text: the text to classify.

    Returns:
        list of ``(index, scaled_value)`` pairs in descending order of the
        raw model value; ``scaled_value`` is the raw value passed through
        ``self._soft_max_scaling``.
    """
    instance = NormalizedBinaryTextInstance(None, text, self.feature_indices)

    # Only features that have an index in the model are kept.
    indexed = ((self.feature_indices.get(f), v)
               for f, v in instance.feature_values())
    feature_vector = {}
    for feature_index, v in indexed:
        if feature_index is not None:
            feature_vector[feature_index] = v

    options = '-b 1' if self.output_probability else ''
    _, _, p_vals = liblinearutil.predict([], [feature_vector], self.model,
                                         options)

    # A stable sort on the raw value keeps ties in index order.
    ordered = sorted(enumerate(p_vals[0]), key=lambda pair: pair[1],
                     reverse=True)
    response = []
    for label, raw in ordered:
        response.append((label, self._soft_max_scaling(raw)))
    return response
def run_classifier(train_file, test_file):
    """Train a class-weighted model and return test predictions.

    The weight of class 1 is the fraction of training labels equal to -1
    and the weight of class -1 is the remainder.  Probability estimates
    ('-b 1') are requested for the test set.

    Args:
        train_file: path of the training problem file.
        test_file: path of the test problem file.

    Returns:
        ``(p_labels, y_test, prob_list)`` where ``prob_list`` holds, for
        each test instance, the estimate in the column of label 1 (column 0
        if neither of the first two model labels is 1).
    """
    y_train, x_train = svm_read_problem(train_file)

    negatives = sum(1 for value in y_train if value == -1)
    w1 = negatives / float(len(y_train))
    param = ''.join(['-s 0 -w1 ', str(w1), ' -w-1 ', str(1 - w1)])
    model = train(y_train, x_train, param)

    y_test, x_test = svm_read_problem(test_file)
    p_labels, p_acc, p_vals = predict(y_test, x_test, model, '-b 1')

    # Locate the probability column that belongs to label 1.
    column = 0
    if model.label[0] != 1 and model.label[1] == 1:
        column = 1

    prob_list = [row[column] for row in p_vals]
    return (p_labels, y_test, prob_list)
def liblinear_predict(problem_filepath, model_filepath):
    """
    Using LibLinear to predict result of a problem

    The first column of the problem file is read as instance ids rather
    than labels.

    Parameters
    ----------
    problem_filepath : path of the problem file to predict.
    model_filepath : path of the saved liblinear model.

    Returns
    -------
    (ids, labels)
    """
    # Reading a problem
    ids, x = liblinearutil.svm_read_problem(problem_filepath)
    # Single-string call: prints the same text as the former Python 2
    # print statement and is also valid on Python 3.
    print("len(x) =  %d" % len(x))

    # Preparing a model
    model = liblinearutil.load_model(model_filepath)

    # Predicting; -2 is only a placeholder label for every instance.
    y = [-2] * len(x)
    p_label, p_acc, p_val = liblinearutil.predict(y, x, model)
    return (ids, p_label)
def classify_text(text, lang, exclude=None):
    """Turn *text* into a feature vector and classify it.

    The text is lower-cased and cut at every occurrence of each exclusion
    string before the feature vector is built, so the comparison against
    ``exclude`` is case insensitive.  ``exclude`` is also forwarded
    unchanged to ``get_sparse_feature_vector``.

    Args:
        text: the text to classify.
        lang: language whose model and feature list are loaded.
        exclude: optional list of strings to exclude from the vectors.
            Empty strings are ignored when cutting the text.

    Returns:
        A dict with ``'label'`` (the predicted label) and ``'value'`` (the
        first decision value squashed into (-1, 1) as ``v / (1 + |v|)``).

    TODO: What if an unsupported language is requested? The files won't be
    on disk.
    """
    if exclude is None:
        exclude = []

    model = load_model(lang)
    features = load_features(lang)

    texts = [text.lower()]
    for phrase in exclude:
        # The text is already lower-cased, so the separator must be too.
        separator = phrase.lower()
        if not separator:
            # str.split('') raises ValueError; nothing to cut on anyway.
            continue
        pieces = []
        for chunk in texts:
            pieces.extend(chunk.split(separator))
        texts = pieces

    feature_vector = get_sparse_feature_vector(texts, features, exclude)
    p_label, _, p_val = linu.predict([0], [feature_vector], model)
    raw_value = p_val[0][0]
    return {'label': p_label[0], 'value': raw_value / (1 + abs(raw_value))}
def getLogTagProbsByPos(self, senFeats):
    """Return, for each position, a dict mapping tag to log-probability.

    ``senFeats`` is converted with ``self.getNumberedSenFeats``; every
    resulting feature becomes a key with value 1 in that position's context.
    The contexts are scored with ``predict`` using ``self.model`` and
    ``self.params``, and each value of a returned distribution is
    log-transformed and keyed by ``self.labelCounter.noToFeat[c + 1]``,
    where ``c`` is the value's zero-based position in the distribution.
    NOTE(review): assumes ``predict`` yields probabilities ordered by label
    number starting at 1 -- confirm against the model's label order.

    Raises:
        ValueError: from ``math.log`` if a probability is not positive.
    """
    numbered_feats = self.getNumberedSenFeats(senFeats)
    contexts = [dict.fromkeys(feats, 1) for feats in numbered_feats]

    # The true outcomes are unknown here; predict only needs placeholders.
    placeholder_outcomes = [1] * len(contexts)
    _, __, prob_dists = predict(placeholder_outcomes, contexts, self.model,
                                self.params)

    return [
        dict(
            (self.labelCounter.noToFeat[label_no], math.log(prob))
            for label_no, prob in enumerate(prob_dist, 1)
        )
        for prob_dist in prob_dists
    ]
def _lib_predict_liblinear(to_predict, num_pos, num_neg, modellog):
    """Run ``predict`` on *to_predict* with positives-first gold labels.

    The input is converted by ``_convert_to_sparse_matrix``, which also
    returns the positive and negative counts actually used.  Gold labels are
    then ``1`` repeated ``num_pos`` times followed by ``-1`` repeated
    ``num_neg`` times.

    Returns:
        ``(predicted_labels, accuracy, values, gold_labels)``.
    """
    converted = _convert_to_sparse_matrix(to_predict, num_pos, num_neg, False)
    sparse_rows, num_pos, num_neg = converted

    gold_labels = [1] * num_pos + [-1] * num_neg
    predicted, accuracy, values = predict(gold_labels, sparse_rows, modellog)
    return (predicted, accuracy, values, gold_labels)
def _neighbour_pos_features(mapping):
    """Build the lookup from neighbouring-POS context keys to features.

    Keys are ``(offset, pos)`` for a single neighbour and
    ``(offset_a, offset_b, pos_a, pos_b)`` for a pair of neighbours; values
    are whatever ``mapping.map_features`` returns for the template name.
    """
    p_f = {}
    for i in mapping.pos_dict_rev:  # bos and eos are also included
        pi = mapping.map_pos_rev(i)
        p_f[(-1, i)] = mapping.map_features('POS_P1:%s' % pi)
        p_f[(-2, i)] = mapping.map_features('POS_P2:%s' % pi)
        p_f[(+1, i)] = mapping.map_features('POS_N1:%s' % pi)
        p_f[(+2, i)] = mapping.map_features('POS_N2:%s' % pi)
        for j in mapping.pos_dict_rev:
            pj = mapping.map_pos_rev(j)
            p_f[(-1, -2, i, j)] = mapping.map_features(
                'POS_P1_P2:%s_%s' % (pi, pj))
            p_f[(+1, +2, i, j)] = mapping.map_features(
                'POS_N1_N2:%s_%s' % (pi, pj))
            p_f[(-1, +1, i, j)] = mapping.map_features(
                'POS_P1_N1:%s_%s' % (pi, pj))
    return p_f


def predict(input_file, model0_file, model1_file, mapping_file, output_file):
    """Tag the sentences of *input_file* and write them to *output_file*.

    Each sentence is first labelled with the model loaded from
    ``model0_file``; those labels seed ``inference`` with the model loaded
    from ``model1_file``, which returns two further label sequences.  The
    first of the two is written to the output, one ``word<TAB>pos`` line per
    token with a blank line after each sentence.  Accuracy against the gold
    POS of each token is printed for all three label sequences.

    Side effects:
        Sets the module globals ``bos`` and ``eos`` to the mapped ids of the
        'BOS' and 'EOS' tags.

    An input without tokens reports an accuracy of 0.0 instead of raising
    ZeroDivisionError.
    """
    global bos, eos
    # The context manager closes the output even if tagging fails midway.
    with open(output_file, 'w') as out:
        m0 = ll.load_model(model0_file)
        m1 = ll.load_model(model1_file)
        mapping = Mapping(mapping_file)
        bos = mapping.map_pos('BOS')
        eos = mapping.map_pos('EOS')
        print('# of features: %d' % len(mapping.feature_dict))

        # for easier mapping from neighbouring pos tags to features
        p_f = _neighbour_pos_features(mapping)

        hits = [0, 0, 0]  # correct tokens for y_0, y_1 and y_2
        total = 0
        for sent in read_sentence(input_file):
            x_ = []
            g_ = []
            for t in sent:
                x_.append(t.maxent_features(mapping.map_features))
                g_.append(mapping.map_pos(t.gold_pos))

            predicted = ll.predict([], [{k: 1 for k in f} for f in x_],
                                   m0, '-q')[0]
            # A real list (not a lazy map) so it can be copied and re-read.
            y_0 = [int(label) for label in predicted]
            y_1, y_2 = inference(m1, p_f, y_0[:], x_, propose_deterministic)

            for gold, y0, y1, y2, t in zip(g_, y_0, y_1, y_2, sent):
                for slot, guess in enumerate((y0, y1, y2)):
                    if gold == guess:
                        hits[slot] += 1
                total += 1
                out.write('%s\t%s\n' % (t.word, mapping.map_pos_rev(y1)))
            out.write('\n')

    for slot, correct in enumerate(hits):
        # float() forces true division; int / int floors on Python 2.
        ratio = correct / float(total) if total else 0.0
        print('acc %d: %d / %d = %.4f' % (slot, correct, total, ratio))
def train_actively(self, data_train, data_dev):
    """Does margin-based active learning on the given data.

    A seed set is built from one random occurrence of each of the
    ``self.active_seed_size`` most frequent word types.  Then, until every
    labeled instance of ``data_train`` has been selected, the
    ``self.active_step_size`` least confident predictions on the remaining
    examples are added and the model is retrained from scratch.

    ``self.active_output_path`` is recreated on every call.  It receives a
    ``log`` file plus an ``example<N>`` dump of the selected data whenever
    the number of selected labels is a multiple of
    ``self.active_output_interval``.

    Args:
        data_train: fully labeled training data (asserted).
        data_dev: development data evaluated at each report, or None.
    """
    # We will assume that we can label every example.
    assert not data_train.is_partially_labeled

    # Keep track of which examples can be still selected for labeling.
    skip_extraction = [[False for _ in label_sequence]
                       for _, label_sequence in data_train.sequence_pairs]

    def make_data_from_locations(locations):
        """
        Makes SequenceData out of a subset of data_train from given
        location=(sequence_num, position) pairs.
        """
        selected_positions = collections.defaultdict(list)
        for (sequence_num, position) in locations:
            selected_positions[sequence_num].append(position)

        sequence_list = []
        for sequence_num, positions in selected_positions.items():
            word_sequence, label_sequence = \
                data_train.sequence_pairs[sequence_num]
            selected_labels = [None] * len(word_sequence)
            for position in positions:
                selected_labels[position] = label_sequence[position]
                # This example will not be selected again.
                skip_extraction[sequence_num][position] = True
            sequence_list.append((word_sequence, selected_labels))
        return SequenceData(sequence_list)

    def train_silently(data_selected):
        """Trains on the argument data in silent mode."""
        self.__feature_extractor.is_training = True  # Reset for training.
        quiet_value = self.quiet
        self.quiet = True
        try:
            self.train(data_selected, None)  # No need for development here.
        finally:
            # Restore verbosity even if training fails.
            self.quiet = quiet_value

    def interval_report(data_selected, logfile):
        """Logs dev accuracy and dumps the selection at each interval."""
        # Only report at each interval.
        if data_selected.num_labeled_instances % \
           self.active_output_interval != 0:
            return

        # Test on the development data if we have it.
        if data_dev is not None:
            quiet_value = self.quiet
            self.quiet = True
            try:
                _, acc = self.predict(data_dev)
            finally:
                self.quiet = quiet_value
            message = "{0} labels: {1:.3f}%".format(
                data_selected.num_labeled_instances, acc)
            print(message)
            logfile.write(message + "\n")
            logfile.flush()

        # Output the selected labeled examples so far.
        file_name = os.path.join(
            self.active_output_path,
            "example" + str(data_selected.num_labeled_instances))
        with open(file_name, "w") as outfile:
            outfile.write(str(data_selected))

    # Create an output directory.
    # NOTE(review): shells out to `rm`, so this presumably targets POSIX.
    if os.path.exists(self.active_output_path):
        subprocess.check_output(["rm", "-rf", self.active_output_path])
    os.makedirs(self.active_output_path)

    # The context manager closes the log even if training raises.
    with open(os.path.join(self.active_output_path, "log"), "w") as logfile:
        # Compute the (active_seed_size) most frequent word types in
        # data_train.
        sorted_wordcount_pairs = sorted(
            data_train.observation_count.items(),
            key=lambda type_count: type_count[1], reverse=True)
        seed_wordtypes = [
            wordtype for wordtype, _ in
            sorted_wordcount_pairs[:self.active_seed_size]]
        # Set for O(1) membership tests; the list keeps the sampling order.
        seed_wordtype_set = set(seed_wordtypes)

        # Select a random occurrence of each selected type for a seed
        # example.
        occurring_locations = collections.defaultdict(list)
        for sequence_num, (observation_sequence, _) in \
                enumerate(data_train.sequence_pairs):
            for position, word in enumerate(observation_sequence):
                if word in seed_wordtype_set:
                    occurring_locations[word].append(
                        (sequence_num, position))
        locations = [random.sample(occurring_locations[wordtype], 1)[0]
                     for wordtype in seed_wordtypes]
        data_selected = make_data_from_locations(locations)
        train_silently(data_selected)  # Train for the first time.
        interval_report(data_selected, logfile)

        while len(locations) < data_train.num_labeled_instances:
            # Make predictions on the remaining (i.e., not on the skip list)
            # labeled examples.
            label_list, features_list, location_list = \
                self.__feature_extractor.extract_features(
                    data_train, False, skip_extraction)
            if not features_list:
                # Nothing left to select: without this the loop would
                # retrain on the same selection forever.
                break
            _, _, scores_list = liblinearutil.predict(
                label_list, features_list, self.__liblinear_model, "-q")

            # Compute "confidence" of each prediction:
            # max_{y} score(x,y) - max_{y'!=argmax_{y} score(x,y)} score(x,y')
            confidence_index_pairs = []
            for index, scores in enumerate(scores_list):
                sorted_scores = sorted(scores, reverse=True)
                # Handle the binary case: liblinear gives only 1 score whose
                # sign indicates the class (+ versus -).
                if len(scores) > 1:
                    confidence = sorted_scores[0] - sorted_scores[1]
                else:
                    confidence = abs(scores[0])
                confidence_index_pairs.append((confidence, index))

            # Select least confident examples for next labeling.
            confidence_index_pairs.sort()
            for _, index in confidence_index_pairs[:self.active_step_size]:
                locations.append(location_list[index])

            data_selected = make_data_from_locations(locations)
            train_silently(data_selected)  # Train from scratch.
            interval_report(data_selected, logfile)