Code Example #1
def solve(train_X, train_Y, test_X, test_Y):
    best_lambda, test_accuracy = train(train_X, train_Y, train_X, train_Y)
    print("Answer for problem 16 is {0}".format(best_lambda))
    best_lambda, test_accuracy = train(train_X, train_Y, test_X, test_Y)
    print("Answer for problem 17 is {0}".format(best_lambda))

    train_x, val_x, train_y, val_y = split_data(train_X, train_Y, 120, 200)
    best_lambda, test_accuracy = train(train_x, train_y, val_x, val_y)
    print("best lambda is: {0}".format(best_lambda))

    model = liblinearutil.train(train_y, train_x, '-s 0 -c 50 -e 0.000001')
    label, accuracy, value = liblinearutil.predict(test_Y, test_X, model)
    print("Answer for problem 18 is {0}".format((100 - accuracy[0]) / 100))

    model = liblinearutil.train(train_Y, train_X, '-s 0 -c 50 -e 0.000001')
    label, accuracy, value = liblinearutil.predict(test_Y, test_X, model)
    print("Answer for problem 19 is {0}".format((100 - accuracy[0]) / 100))

    accuracy = []
    for i in range(5):
        train_x, val_x, train_y, val_y = split_data(train_X, train_Y, 40 * i, 40 * (i + 1))
        best_lambda, test_accuracy = train(train_x, train_y, val_x, val_y)
        accuracy.append(test_accuracy)

    mean_accuracy = np.mean(accuracy, axis=0)
    print("Answer for problem 20 is {0}".format(min(mean_accuracy)))
Code Example #2
File: speciteller.py Project: jjessyli/speciteller
def predict(y,xs,xw):
    xs,_ = simpleScale(xs,scales_shallow)
    xw,_ = simpleScale(xw,scales_neuralbrn)
    p_label, p_acc, p_val = ll.predict(y,xs,model_shallow,'-q -b 1')
    ls_s = score(p_label,p_val)
    p_label, p_acc, p_val = ll.predict(y,xw,model_neuralbrn,'-q -b 1')
    ls_w = score(p_label,p_val)
    return [(x+y)/2 for x,y in zip(ls_s,ls_w)],ls_s,ls_w
Code Example #3
File: l2svm.py Project: HassounLab/ELP
 def get_edge_scores(self, edges, **kwargs):
     fv1 = [self._get_feature_vecs((i, j)) for (i, j) in edges]
     fv2 = [self._get_feature_vecs((j, i)) for (i, j) in edges]
     """
     ypred1 = self.svm.decision_function(fv1)
     ypred2 = self.svm.decision_function(fv2) 
     """
     fakey = np.zeros(len(fv1))
     _, _, ypred1 = predict(fakey, fv1, self.svm)
     _, _, ypred2 = predict(fakey, fv2, self.svm)
     ypred1, ypred2 = np.array(ypred1), np.array(ypred2)
     #print('SVM decision function sample', ypred1[:20])
     ypred = 0.5 * (ypred1 + ypred2)
     return ypred        
Code Example #4
 def classify(self, tweet):
     features = self._extract_features(tweet)
     value = liblinearutil.predict([0], [features], self.model)[0][0]
     for lang, number in self.languages.items():
         if number == value:
             return lang
     raise ValueError
Code Example #5
def predict(test_data, features, model):
    x = []
    y = []

    keys = test_data.keys()
    for key in keys:
        #y.append(test_data[key]['class'])
        y.append(0)
        x.append(features[key])

    p_label, p_acc, p_val = liblinearutil.predict(y, x, model, '-q')
    
    predictions = {}
    reverse = False
    for i in range(len(p_label)):
        predictions[keys[i]] = {
            'class' : p_label[i],
            'score' : p_val[i][0]
        }

        if (p_label[i] <= 0 and p_val[i][0] > 0):
            reverse = True

        if (p_label[i] > 0 and p_val[i][0] < 0):
            reverse = True
            
    if reverse:
        print 'REVERSING SCORE!'

        for key in predictions:
            predictions[key]['score'] *= -1

    return predictions
Code Example #6
def LDA_SVM(matrix, test_matrix, n_authors, doc_authors, vocab, stopwords):
    # set parameters
    num_topics = 20
    burn_in = 1000  # 0
    alpha = 0.1
    beta = 0.1
    samples = 8
    spacing = 100

    num_test_docs = test_matrix.shape[0]

    sampler = lda.LDA(num_topics, alpha, beta)

    print('Starting!')
    theta, phi, likelihood = sampler.train(matrix, burn_in, samples, spacing)
    print('likelihood: ', likelihood)

    theta_test, likelihood = sampler.classify(test_matrix, phi, burn_in,
                                              samples, spacing)
    print('likelihood: ', likelihood)

    theta = theta / np.sum(theta, 1)[:, None]
    theta_test = theta_test / np.sum(theta_test, 1)[:, None]

    svm_model = ll.train(sum(doc_authors, []), theta.tolist(), '-c 4')
    p_label, p_acc, p_val = ll.predict(np.random.rand(num_test_docs),
                                       theta_test.tolist(), svm_model)
    author_probs = np.zeros((num_test_docs, n_authors))
    for doc, author in enumerate(p_label):
        author_probs[doc, int(author)] = 1

    return author_probs
Code Example #7
File: bae.py Project: zhuhd15/binary-autoencoder
def hash(features, num_train_samples=58000, L=8):
    bits = []
    for i in range(L):
        start = timeit.default_timer()
        m = liblinearutil.load_model(
            'models/tr{0:05d}-L{1:02d}-b{2:02d}.model'.format(
                num_train_samples, L, i))
        p_label, p_acc, p_val = liblinearutil.predict([0] * features.shape[0],
                                                      features.tolist(), m,
                                                      str('-q'))
        bits.append(p_label)
        end = timeit.default_timer()
        print('[HASH] {0:3d}th bit hashed. {1:.4f} seconds elapsed'.format(
            i, end - start))

    start = timeit.default_timer()
    bits = np.vstack(bits).transpose().astype(np.int)
    bits[np.nonzero(bits == 0)] = -1

    with open('hash/tr{0:05d}-L{1:02d}'.format(num_train_samples, L),
              'wb') as fo:
        cPickle.dump(bits, fo)
    end = timeit.default_timer()
    print('[HASH] Hash codes saved. {0:.4f} seconds elapsed'.format(end -
                                                                    start))
    return
Code Example #8
File: minitagger.py Project: rasoolims/minitagger
    def predict(self, data_test):
        """
        Predicts tags in the given data. If the data is fully labeled, reports
        the accuracy.
        """
        start_time = time.time()
        assert not self.__feature_extractor.is_training  # Assert trained

        # Extract features (on all instances, labeled or unlabeled) and pass
        # them to liblinear for prediction.
        [label_list, features_list, _] = \
            self.__feature_extractor.extract_features(data_test, True, [])
        pred_labels, (acc, _, _), _ = \
            liblinearutil.predict(label_list, features_list,
                                  self.__liblinear_model, "-q")
        if not self.quiet:
            num_seconds = int(math.ceil(time.time() - start_time))
            print("Prediction time: {0}".format(
                    str(datetime.timedelta(seconds=num_seconds))))
            if not data_test.is_partially_labeled:
                print("Per-instance accuracy: {0:.3f}%".format(acc))
            else:
                print("Not reporting accuracy: test data missing gold labels")

        # Convert predicted labels from integer IDs to strings.
        for i, label in enumerate(pred_labels):
            pred_labels[i] = self.__feature_extractor.get_label_string(label)
        return pred_labels, acc
Code Example #9
File: cdiff.py Project: pfiziev/cdiff
def predict_from_SVR_liblinear(predictor, features):

    # -q: quiet mode (no probability estimates are requested here)
    p_label, p_acc, p_val = liblinearutil.predict([], features, predictor, '-q')

    return p_label
Code Example #10
File: faceUtils.py Project: CVRL/Live-Attribute-Demo
    def predictTraitValue(self,
                          imagelist_file='image_list.txt',
                          class_label=1,
                          outfile='output',
                          norm=-1,
                          debug=False):
        """
        Calls external functions to obtain predicted value between 0 and 1 for given image, updates class values
        Args:
            imagelist_file: file that contains the list of image/images to be processed
            class_label: label given images as either positive or negative for SVM classification
            outfile: filename of output for Dense SIFT analysis
        Return: True if successful
        """
        with open(imagelist_file, 'w') as f:
            f.write('tmp.jpg\n')
        cv2.imwrite('tmp.jpg', self.norm_image)

        hog_histogram(imagelist_file, class_label, outfile, norm, debug)

        prob_y, prob_x = svm_read_problem(outfile)
        model_file = models[self.trait]
        model = load_model(model_file)
        self.p_val = predict(prob_y, prob_x, model)[2][0][0]

        return True
Code Example #11
File: linear_mod.py Project: Ralf3/samt2
    def predict(self,x,y=[]):
        """
        
        x: a list/tuple of l predicting instances. The feature vector of
          each predicting instance is an instance of list/tuple or dictionary.

        y: a list/tuple of l true labels (type must be int/double). It is used
           for calculating the accuracy. Use [] if true labels are
           unavailable.
        predicting_options: a string of predicting options in the same
        format as that of LIBLINEAR.
        p_acc: a tuple including accuracy (for classification), mean
               squared error, and squared correlation coefficient (for
               regression).

        p_vals: a list of decision values or probability estimates (if '-b 1'
                is specified). If k is the number of classes, for decision
                values, each element includes results of predicting k
                binary-class SVMs. If k = 2 and solver is not MCSVM_CS, only
                one decision value is returned. For probabilities, each
                element contains k values indicating the probability that the
                testing instance is in each class.
                Note that the order of classes here is the same as
                'model.label' field in the model structure.
        """
        p_labels, p_acc, p_vals = lu.predict(y, x, self.model)
        # print  p_labels, p_acc, p_vals
        return  p_labels, p_acc, p_vals
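
The docstring above spells out liblinearutil's predict interface. As a minimal round-trip sketch (the toy labels, features, and options are invented for illustration; only the standard liblinearutil train/predict calls are assumed):

# Hypothetical toy data, not taken from any project listed here.
from liblinearutil import train, predict

y = [1, -1, 1, -1]                               # true labels
x = [{1: 0.9}, {1: -0.4}, {1: 0.7}, {1: -1.1}]   # sparse feature dicts
model = train(y, x, '-s 0 -q')                   # logistic regression, quiet
# '-b 1' requests probability estimates instead of decision values
p_labels, p_acc, p_vals = predict(y, x, model, '-b 1 -q')
print(p_labels, p_acc[0], p_vals[0])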
Code Example #12
File: minitagger.py Project: afcarl/minitagger
    def predict(self, data_test):
        """
        Predicts tags in the given data. If the data is fully labeled, reports
        the accuracy.
        """
        start_time = time.time()
        assert not self.__feature_extractor.is_training  # Assert trained

        # Extract features (on all instances, labeled or unlabeled) and pass
        # them to liblinear for prediction.
        [label_list, features_list, _] = \
            self.__feature_extractor.extract_features(data_test, True, [])
        pred_labels, (acc, _, _), _ = \
            liblinearutil.predict(label_list, features_list,
                                  self.__liblinear_model, "-q")
        if not self.quiet:
            num_seconds = int(math.ceil(time.time() - start_time))
            print("Prediction time: {0}".format(
                str(datetime.timedelta(seconds=num_seconds))))
            if not data_test.is_partially_labeled:
                print("Per-instance accuracy: {0:.3f}%".format(acc))
            else:
                print("Not reporting accuracy: test data missing gold labels")

        # Convert predicted labels from integer IDs to strings.
        for i, label in enumerate(pred_labels):
            pred_labels[i] = self.__feature_extractor.get_label_string(label)
        return pred_labels, acc
Code Example #13
File: svmfilter.py Project: klyc0k/EDSFilter
    def filter_tweets(self, tweets, text_lookup_field_name='text'):
        """
        filter loaded tweets
        """
        tweets = [a for a in tweets if len(a[text_lookup_field_name].split()) > 7] # filter tweets less than 7 words
        
        if(len(tweets) ==0):
            return []
        
        filtered_tweets = []
        if(len(tweets) > 1000000):  ## split into chunks if the size is too big
            tweet_chunks = list(kgen.chunks(tweets, 1000000))
            
            for chnk in range(len(tweet_chunks)):
                print "processing chunk %d" % chnk
                X, hash2originalID = extract_vectors(tweet_chunks[chnk], self.ktw, self._vocab, text_lookup_field_name)
                Y = []
                
                if(len(X) <= 2):
                    continue
                
                p_label, p_acc, p_val = predict(Y, X, self._model, '-q')
                         
                if(self._fd):
                    p_label = self.hard_feature_rules(p_label, X)
                
                for i, pred in enumerate(p_label):
                    if pred == 1:
                        filtered_tweets.append(tweet_chunks[chnk][hash2originalID[i]])        
        else:
            X, hash2originalID = extract_vectors(tweets, self.ktw, self._vocab, text_lookup_field_name)
            Y = []
            
            if(len(X) == 0):
                return []
            
            p_label, p_acc, p_val = predict(Y, X, self._model, '-q')

            if(self._fd):
                p_label = self.hard_feature_rules(p_label, X)                                        
            
            for i, pred in enumerate(p_label):
                if pred == 1:
                    if(not self.hard_reject(tweets[hash2originalID[i]]['text'])):
                        filtered_tweets.append(tweets[hash2originalID[i]])
                    
        return filtered_tweets
Code Example #14
File: l1k.py Project: joaoreis92/extremel1k
def predict_liblinear(features, labels, model):
    print('Loading predictions...')
    start = time.time()
    preds, p_acc, p_val = liblinearutil.predict(labels, features, model)
    preds = list(map(int, preds))
    end = time.time()
    print('Predictions made in ' + str(end - start) + ' seconds')
    return preds
Code Example #15
File: cdiff.py Project: pfiziev/cdiff
def predict_from_log_regr_liblinear(predictor, features):
    pos_class_idx = 1 if int(predictor.label[1]) == 1 else 0

    # -b 1: return probability estimates
    # -q: quiet mode
    p_label, p_acc, p_val = liblinearutil.predict([], features, predictor, '-b 1 -q')

    return [probs[pos_class_idx] for probs in p_val]
Code Example #16
File: maxent.py Project: EggplantElf/TeamLab
def propose_deterministic(model, cache, feats, x_, j):
    if feats in cache[j][1]:
        ny = cache[j][1][feats]
    else:
        x = [{k:1 for k in x_[j] + list(feats)}]
        ny = int(ll.predict([], x, model, '-q')[0][0])
        cache[j][1][feats] = ny
    return ny
Code Example #17
File: cdiff.py Project: pfiziev/cdiff
def eval_SVR_liblinear(predictor, examples, responses):
    echo('Evaluating predictor')
    predictions, p_acc, p_val = liblinearutil.predict([], examples, predictor, '-q')

    echo('RMSE:', rmse(responses, predictions),
         '\tR2:', R2(responses, predictions),
         '\tPearson R:', pearsonr(responses, predictions))

    return predictions
Code Example #18
File: lmvsvm.py Project: luisfredgs/multiviewLSVM
def predict(x, y, model, classify=True):

    p_label, p_acc, p_vals = liblin.predict(y.tolist(), x.tolist(), model,
                                            "-q")

    if classify:
        return p_label

    return p_vals
Code Example #19
 def predict(self , vec) :
     '''
     vec : [ {idx : val , ...} ] , list with only one element !
     return : str , POSITIVE_NAME / NEGATIVE_NAME
     '''
     y_fake = [ self.NEGATIVE_LABEL ,] # for liblinear
     p_labels , _ , _ = liblinearutil.predict(y_fake , vec , self.model , "-q")
     p_label = int(p_labels[0])
     p_name = self.POSITIVE_NAME if p_label == self.POSITIVE_LABEL else self.NEGATIVE_NAME
     return p_name
Code Example #20
 def face_detect(self, img):
     """
     Detect face bounding box given image
     img: input image
     """
     # convert to gray
     if img.ndim > 2:
         img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
     # detect face      
     bboxes = self.face_detector['detector'].detectMultiScale(img,
                                             minNeighbors=self.face_detector['minNeighbors'], 
                                             minSize=self.face_detector['minSize'])
     if len(bboxes) == 0:
         #print('No face is detected')
         return np.zeros((0, 4))
     # else, select appropriate face
     # exclude very small bounding box
     index_face_size = (-bboxes[:, 2]).argsort() # descending order
     bboxes = bboxes[index_face_size, :]
     for idx in np.arange(1, bboxes.shape[0]):
         if bboxes[idx, 2] <= np.round(bboxes[0, 2]*0.3):
             bboxes = bboxes[:idx, :]
             break
         
     # compute confidence for each remaining bbox
     final_bboxes = np.zeros((0, 4))
     C = []
     for idx in np.arange(bboxes.shape[0]):
         bbox = bboxes[idx, :]
         im_cut = img[bbox[1]:bbox[1]+bbox[3], bbox[0]:bbox[0]+bbox[2]]
         im_cut = cv2.resize(im_cut, (160, 160), interpolation=cv2.INTER_CUBIC)
         _, descriptor = lbp(im_cut)
         descriptor = descriptor.astype(float)/np.sum(descriptor)
         descriptor = list(descriptor)
         _, _, confidence = predict([0], [descriptor], self.face_detector['confidence_LBP']['model'], '-b 1 -q')
         if confidence[0][0] < self.face_detector['confidence_LBP']['thre']:
             continue
         C.append(confidence[0][0])
         final_bboxes = np.concatenate((final_bboxes, bbox.reshape((1, -1))))

     if final_bboxes.shape[0] == 0:
         return final_bboxes
     
     # choose largest and best one
     #index_face_size = (-final_bboxes[:, 2]).argsort() # descending order
     #final_bboxes = final_bboxes[index_face_size, :]
     #C = C[index_face_size]
     maxC = np.max(C)
     for idx in np.arange(final_bboxes.shape[0]):
         if C[idx] - maxC > -0.05:
             bbox = final_bboxes[idx, :].reshape((1, -1))
             break
     return bbox
Code Example #21
File: eDNSalModel.py Project: yenchih/edn-cvpr2014
    def saliency(self, img, normalize=True):
        """Computes eDN saliency map for single image or image sequence"""

        descs = self.descriptions

        # rescale image to typical input size
        imgSize = img.shape[:2]
        rescale_factor = 0.5*EDN_INSIZE[0]/max(imgSize) + \
                         0.5*EDN_INSIZE[1]/min(imgSize)

        # single image:
        # img = misc.imresize(image, rescale_factor, 'bicubic')

        # image sequence (or single image)
        scaledImg = np.zeros([a * rescale_factor
                              for a in imgSize] + [img.shape[2]])
        for j in xrange(img.shape[2] / 3):
            scaledImg[:, :, j * 3:j * 3 + 3] = misc.imresize(
                img[:, :, j * 3:j * 3 + 3], rescale_factor, 'bicubic')
        img = scaledImg

        # compute eDN features for description(s)
        t1 = time.time()
        fMapEDN, fMapSize = eDN_features(img, descs)
        t2 = time.time()
        logging.info("Feature computation took %0.3fs" % (t2 - t1))

        if self.biasToCntr:
            fMapCntr = dist_to_cntr_features(img, fMapSize)
            fMap = np.hstack((fMapCntr, fMapEDN))
        else:
            fMap = fMapEDN

        fMapW, fwParams = whiten_features(fMap, self.whitenParams)

        # SVM prediction
        t1 = time.time()
        bs, pAcc, pred = predict([], fMapW.tolist(), self.svm, options="-q")
        t2 = time.time()
        logging.info("Prediction took %0.3fs" % (t2 - t1))
        pred = np.array(pred)

        # reshaping and upscaling
        pred = pred.reshape(fMapSize, order='F')
        predLarge = sp.ndimage.interpolation.zoom(
            pred, (imgSize[0] / float(pred.shape[0]),
                   imgSize[1] / float(pred.shape[1])))

        # normalization
        if normalize:
            rescaled = (255.0 / (predLarge.max() - predLarge.min()) *
                        (predLarge - predLarge.min())).astype(np.uint8)
            return rescaled
        else:
            return predLarge
Code Example #22
def liblinear_classifier(svm_input=None, y=[], x=[]):
    """调用训练好的liblinear分类器做垃圾过滤
    """
    svm_model = load_model(SVM_MODEL_FILE)

    if svm_input:
        y, x = svm_read_problem(svm_input)

    p_label, p_acc, p_val = predict(y, x, svm_model, "-q")

    return p_label
Code Example #23
File: eDNSalModel.py Project: Oldog/edn-cvpr2014
    def saliency(self, img, normalize=True): 
        """Computes eDN saliency map for single image or image sequence""" 
    
        descs = self.descriptions 

        # rescale image to typical input size 
        imgSize = img.shape[:2]  
        rescale_factor = 0.5*EDN_INSIZE[0]/max(imgSize) + \
                         0.5*EDN_INSIZE[1]/min(imgSize) 

        # single image: 
        # img = misc.imresize(image, rescale_factor, 'bicubic') 
        
        # image sequence (or single image) 
        scaledImg = np.zeros([a*rescale_factor for a in imgSize] + [img.shape[2]])
        for j in xrange(img.shape[2]/3):
            scaledImg[:,:,j*3:j*3+3] = misc.imresize(img[:,:,j*3:j*3+3],
                                                     rescale_factor, 'bicubic')
        img = scaledImg

        # compute eDN features for description(s) 
        t1 = time.time()
        fMapEDN, fMapSize = eDN_features(img, descs) 
        t2 = time.time()
        logging.info("Feature computation took %0.3fs" % (t2-t1))

        if self.biasToCntr:  
            fMapCntr = dist_to_cntr_features(img, fMapSize) 
            fMap = np.hstack((fMapCntr, fMapEDN)) 
        else: 
            fMap = fMapEDN 

        fMapW, fwParams = whiten_features(fMap, self.whitenParams) 

        # SVM prediction 
        t1 = time.time()
        bs, pAcc, pred = predict([], fMapW.tolist(), self.svm, options="-q") 
        t2 = time.time()
        logging.info("Prediction took %0.3fs" % (t2-t1))
        pred = np.array(pred) 

        # reshaping and upscaling
        pred = pred.reshape(fMapSize, order='F')
        predLarge = sp.ndimage.interpolation.zoom(pred, 
            (imgSize[0]/float(pred.shape[0]),
            imgSize[1]/float(pred.shape[1])))

        # normalization
        if normalize: 
            rescaled = (255.0 / (predLarge.max()-predLarge.min()) * 
                       (predLarge-predLarge.min())).astype(np.uint8) 
            return rescaled
        else:
            return predLarge
Code Example #24
File: lmvsvm.py Project: luisfredgs/multiviewLSVM
def alternating_predict(x, y, proj_landmarks, model, classify=True):

    sample = recontruct_views(x, proj_landmarks)

    p_label, p_acc, p_vals = liblin.predict(y.tolist(), sample.tolist(), model,
                                            "-q")

    if classify:
        return p_label

    return p_vals
Code Example #25
def test( C, Y_test, X_test, x_lines ):
    """
    This function takes in the test labels and features and prints out the accuracy
    :param C      : list containing parameter C
    :param X_test : test features
    :param Y_test : test labels
    :return None
    """
    # for c in C:
    model = lu.load_model("model/lmods2_tamper" + str(round(C,2)) + "_" + str(x_lines) + "l.model")
    p_letters, p_acc, p_val = lu.predict(Y_test, X_test, model)
    return p_letters
Code Example #26
def parallel_train_predict(args):
    print("A process begins.")
    x_train,y_train,x_test,y_test=args
    problem = liblinearutil.problem(y_train, x_train)
    parameter = liblinearutil.parameter('-s 0 -c 1')
    time_start = time.clock()
    model = liblinearutil.train(problem, parameter)
    print("A process training finished in %f."%(time.clock()-time_start))
    time_start = time.clock()
    p_label, p_acc, p_val = liblinearutil.predict(y_test, x_test,model,'-b 0')
    print("A process predicting finished in %f."%(time.clock()-time_start))
    return p_val
Code Example #27
def validation(k, data_x, data_y, s, e, C):
    accuracies = []
    params = get_params(s, e, C)
    print('s = {}, e = {}, C = {}'.format(s, e, C))
    for fold in range(k):
        train_x, test_x = get_k_fold(k, fold, data_x)
        train_y, test_y = get_k_fold(k, fold, data_y)
        m = liblinearutil.train(train_y, train_x, params)
        _, p_acc, __ = liblinearutil.predict(test_y, test_x, m)
        accuracies.append(p_acc[0])

    return accuracies
Code Example #28
File: svm.py Project: Imperat/Pyrus
    def predict(self, x):
        y = []
        for sample in x:
            data = dict([(self._features.getId(d), sample[d]) for d in sample if self._features.getId(d)])
            label, _, _ = liblinear.predict([0], [data], self._model, "")
            if self._regression:
                y.append(label[0])
            else:
                if self._labels.count() == 2:
                    label[0] = 1 if label[0] == 1 else 2
                y.append(self._labels.getVal(label[0]))

        return y
Code Example #29
def predict_with_svm(model, predict_y, predict_x):
	# all_y, all_x = read_multiple_days(start_day, end_day)
	labels, acc, values = ll.predict(predict_y, predict_x, model, "-q")
	num_false_pos = 0
	num_false_neg = 0
	total = len(predict_y)
	for gt, pred in zip(predict_y, labels):
		diff = gt - pred
		if diff == -2:
			num_false_pos += 1
		if diff == 2:
			num_false_neg += 1
	return (total, num_false_pos, num_false_neg)
Code Example #30
File: get_feature.py Project: lispc/stressanalysis
def prefict_from_weibos(weibos,verbose=False):
	fvecs = []
	for weibo in weibos:
		text = weibo.get('text')
		fvec = analyze(text)
		if fvec:
			fvecs.append(fvec)
	if not fvecs:
		return None
	#print fvecs
	result,_,prob = svm.predict([],fvecs,model)
	stresses = [exp(plist[0])/(1+exp(plist[0])) for plist in prob]
	return sum(stresses)/len(stresses)
Code Example #31
def unimodalPredDev(gs, feats, nDim):
	parts = ['dev']
	[cccs, preds] = [{} for i in range(2)]
	for s in parts:
		cccs[s] = -1.0
	warnings.filterwarnings('ignore', category=ConvergenceWarning)
	#Liblinear
	for comp in v.C:
		#Options for liblinear
		options = "-s "+str(v.sVal)+" -c "+str(comp)+" -B 1 -q"
		#We learn the model on train
		model = train(gs['train'][nDim],feats['train'],options)
		#We predict on data
		for s in parts:
			pred = np.array(predict(gs[s][nDim],feats[s],model,"-q"))[0]
			#We calculate the correlation and store it
			ccc = cccCalc(np.array(pred),gs[s][nDim])
			if (ccc > cccs[s]):
				preds[s] = pred
				cccs[s] = ccc
				function = "SVR"
				alpha = comp
	if (v.fullMode == True):
		#We see if we can do better with sklearn
		for nbFunc in range(len(v.lFunc)):
			for c in v.parFunc[nbFunc]:
				func = v.lFunc[nbFunc]
				reg = func[0](alpha=c)
				#One task prediction
				if (func[1] == 0):
					reg.fit(feats['train'],gs['train'][nDim])
					for s in parts:
						p = reg.predict(feats['dev'])
						ccc = cccCalc(p,gs[s][nDim])
						if (ccc > cccs[s]) : 
							preds[s] = p
							cccs[s] = ccc
							function = func[2]
							alpha = c
				#Multi task prediction
				else :
					reg.fit(feats['train'],np.transpose(gs['train']))
					for s in parts:
						p = reg.predict(feats['dev'])[:,nDim]
						ccc = cccCalc(p,gs[s][nDim])
						if (ccc > cccs[s]) : 
							preds[s] = p
							cccs[s] = ccc
							function = func[2]
							alpha = c
	return cccs, preds, function, alpha
Code Example #32
File: svm.py Project: Satanbear/MorfAnalizer
    def predict(self, x):
        y = []
        for sample in x:
            data = dict([(self._features.getId(d), sample[d]) for d in sample
                         if self._features.getId(d)])
            label, _, _ = liblinear.predict([0], [data], self._model, '')
            if self._regression:
                y.append(label[0])
            else:
                if self._labels.count() == 2:
                    label[0] = 1 if label[0] == 1 else 2
                y.append(self._labels.getVal(label[0]))

        return y
Code Example #33
File: test.py Project: natoromano/sportsAI
def predict(name, query, testGame, model, method='skip_1'):
    '''Predicts the answer to the query and returns an array of tuples (score,
    answers), as well as the correct answer.
    
    In the tuples, answers is all the possible right answers e.g. ['Ronaldo',
    'Cristiano Ronaldo', 'Cristiano'].
    '''
    entities = {}
    # create Dataset object
    testSet = dts.Dataset.from_columns(name)    
    text = ' '.join([t.decode() for t in testGame.text])
    text, entities = txt.anonymize(text)
    #for i in range(len(text)):
        #text[i], entities = txt.anonymize(text[i], entities)
    inv_entities = {v: k for k, v in entities.items()}
    # fetch answer
    try:
        answer = testGame.query_dict[query]
    except KeyError:
        answer = 'N/A'
    # create feature vector for each entity in text
    for ent_id in inv_entities.iterkeys():
        ent_name = 'ent' + str(ent_id)
        if method!='word2vec':
            feature_vector = ext.create_feature_vector(ent_name, 
                                                       text, method)
            try:
                label = (ent_id == inv_entities[answer]) * 1.0
            except KeyError:
                label = (inv_entities[ent_id] in answer) * 1.0                
            # add feature vector to dataset
            testSet.append((feature_vector, label), ent_name)
        else:
            feature_vector = ext.create_feature_vector(ent_name, text,
                                                       method, model=model)
            try:
                label = (ent_id == inv_entities[answer]) * 1.0
            except KeyError:
                label = (inv_entities[ent_id] in answer) * 1.0 
            testSet.append((dict(zip(range(len(feature_vector)), 
                                     feature_vector)), label), ent_name)
    scores = []
    words = testSet.entities
    _, _, probas = llb.predict(testSet.Y, testSet.X, model, '-b 1')
    for i, proba in enumerate(probas):
        scores.append((proba[1], 
                       [k for k,v in entities.iteritems() \
                       if str(v) == words[i][3:]]))
    return scores, answer
Code Example #34
File: maxent.py Project: EggplantElf/TeamLab
def propose_probabilistic(model, cache, feats, x_, j):
    if feats in cache[j][1]:
        dist = cache[j][1][feats]
    else:
        x = [{k:1 for k in x_[j] + list(feats)}]
        # print ll.predict([], x, model, '-q -b 1')
        # exit(0)
        dist = ll.predict([], x, model, '-q -b 1')[2][0]
        cache[j][1][feats] = dist
    r = random()
    for (y, p) in enumerate(dist):
        r -= p
        if r < 0:
            return y + 1
    return y + 1
Code Example #35
File: svmfilter.py Project: klyc0k/EDSFilter
 def classify_single_tweet(self, tweet, text_lookup_field_name='text'):
     """
     classify a single tweet is cu-related or not
     """
     X = [extract_single_vector(tweet, self.ktw, self._vocab, text_lookup_field_name)]
     Y = []
     p_label, p_acc, p_val = predict(Y, X, self._model, '-q')
     
     if(self._fd and self._featrule):
         p_label = self.hard_feature_rules(p_label, X)
     
     if(self.hard_reject(tweet[text_lookup_field_name])):
         p_label = [0]
         
     return p_label[0] == 1
Code Example #36
def TOKEN_SVM(matrix, test_matrix, n_authors, doc_authors, vocab, stopwords):
    n_docs = matrix.shape[0]
    n_test_docs = test_matrix.shape[0]
    matrix = matrix / np.sum(matrix, 1)[:, None]
    test_matrix = test_matrix / np.sum(test_matrix, 1)[:, None]

    svm_model = ll.train(sum(doc_authors, []), matrix.tolist(), '-c 4')
    p_label, p_acc, p_val = ll.predict(np.random.rand(n_test_docs),
                                       test_matrix.tolist(), svm_model)

    author_probs = np.zeros((n_test_docs, n_authors))
    for doc, author in enumerate(p_label):
        author_probs[doc, int(author)] = 1

    return author_probs
Code Example #37
def train(train_X, train_Y, test_X, test_Y):
    test_accuracy = []

    model_1 = liblinearutil.train(train_Y, train_X, '-s 0 -c 5000 -e 0.000001')
    label, accuracy, value = liblinearutil.predict(test_Y, test_X, model_1)
    test_accuracy.append((100 - accuracy[0]) / 100)

    model_2 = liblinearutil.train(train_Y, train_X, '-s 0 -c 50 -e 0.000001')
    label, accuracy, value = liblinearutil.predict(test_Y, test_X, model_2)
    test_accuracy.append((100 - accuracy[0]) / 100)

    model_3 = liblinearutil.train(train_Y, train_X, '-s 0 -c 0.5 -e 0.000001')
    label, accuracy, value = liblinearutil.predict(test_Y, test_X, model_3)
    test_accuracy.append((100 - accuracy[0]) / 100)

    model_4 = liblinearutil.train(train_Y, train_X, '-s 0 -c 0.005 -e 0.000001')
    label, accuracy, value = liblinearutil.predict(test_Y, test_X, model_4)
    test_accuracy.append((100 - accuracy[0]) / 100)

    model_5 = liblinearutil.train(train_Y, train_X, '-s 0 -c 0.00005 -e 0.000001')
    label, accuracy, value = liblinearutil.predict(test_Y, test_X, model_5)
    test_accuracy.append((100 - accuracy[0]) / 100)

    return lambda_set[test_accuracy.index(min(test_accuracy))], test_accuracy
Code Example #38
File: rssreadercore.py Project: ho-chiahua/rssreader
	def get_scores(self, items):
		model = self.train()
	
		insts = []
		for item in items:
			inst = self.to_instance(item['title'])
			insts.append(inst)

		## XXX dirty hack to hide messages printed by LIBLINEAR
		old_out = sys.stdout
		sys.stdout = open('/dev/null', 'wb')
		dummy, dummy, decs = liblinearutil.predict([], insts, model)
		sys.stdout = old_out

		return decs
Code Example #39
File: SUNRGBD_few_shot.py Project: ybyangjing/AGA
def eval_SVM(X, y, Xhat, yhat):
    # create classification problem
    problem = liblinearutil.problem(y, X)

    # set SVM parameters
    svm_param = liblinearutil.parameter('-s 3 -c 10 -q -B 1')

    # train SVM
    model = liblinearutil.train(problem, svm_param)

    # predict and evaluate
    p_label, p_acc, p_val = liblinearutil.predict(yhat, Xhat, model, '-q')

    # compute accuracy
    acc, mse, scc = liblinearutil.evaluations(yhat, p_label)
    return acc
Code Example #40
File: lmvsvm.py Project: luisfredgs/multiviewLSVM
def r_obj_function(R, x, y, lands, svm, mask, c1, c2):
    m = len(x)
    l = len(lands)

    r_2d = R.reshape(m, l)
    _, _, svm_pred = liblin.predict(y, np.dot(r_2d, lands).tolist(), svm, "-q")

    cost = c2 * sum((x - np.dot(r_2d, lands))[mask]**2)
    cost += c1 * np.sum(np.maximum(0, 1 - np.asarray(svm_pred)[0, y]))

    # for i in range(len(x)):
    #     m_i = mask[i]
    #     cost += c2*sum((x[i, m_i] - np.dot(r_2d[i], lands[:, m_i]).T)**2)
    #     cost += c1*max(0, 1 - np.asarray(svm_pred)[0, y[i]])

    return cost
Code Example #41
File: lmvsvm.py Project: luisfredgs/multiviewLSVM
def alternating_train(x, y, lands, c1, c2=1, params='-s 2 -B 1 -q'):

    nb_views = lands.shape[2]

    L = np.hstack([lands[:, :, v] for v in range(nb_views)])
    M = np.hstack([x[:, :, v] for v in range(nb_views)])

    l = len(lands)
    m = len(x)

    r0, mask = missing_lstsq(L, M)
    s0 = np.dot(r0, L)

    sample = s0.copy()
    R = r0.copy()

    y_list = y.tolist()
    svm = liblin.train(y_list, sample.tolist(),
                       '-c {} '.format(c1) + params)

    it = 0
    while True:
        it += 1

        res = minimize(r_obj_function,
                       R.flatten(),
                       args=(M, y, L, svm, mask, c1, c2),
                       options={'disp': True})
        # for i in range(len(x)):
        #     r_i = minimize(r_obj_function, R[i, :], args=(i, M, y, L, svm, mask, c1, c2), options={'disp':  False})
        #     R[i] = r_i.x
        #     cost += r_i.fun

        print(res.fun)
        R = res.x.reshape(m, l)
        sample = np.dot(R, L)

        svm = liblin.train(y.tolist(), sample.tolist(),
                           '-c {} '.format(c1) + params)

        _, p_acc, _ = liblin.predict(y, sample.tolist(), svm, "-q")
        print(p_acc)
        if it == 1:
            break
    return svm
Code Example #42
def AT_FA_SVM(matrix, test_matrix, n_authors, doc_authors, vocab, stopwords):
    # set parameters
    num_topics = 4
    burn_in = 1000  # 0
    alpha = 0.1
    beta = 0.1
    samples = 8
    spacing = 100

    num_test_docs = test_matrix.shape[0]

    doc_authors_new, n_authors_new = add_fic_authors(doc_authors, n_authors)

    sampler = at.AtSampler(num_topics, n_authors_new, alpha, beta)

    print('Starting!')
    theta, phi, likelihood = sampler.train(doc_authors_new, matrix, burn_in,
                                           samples, spacing)
    print('theta:', theta.shape)
    print('phi:', phi.shape)
    print('likelihood:', likelihood)

    sampler.n_authors = num_test_docs

    theta_test = sampler.classify(test_matrix, phi, burn_in, samples, spacing)
    print('theta test:', theta_test.shape)

    training_matrix = concatenate_fic_authors(doc_authors, num_topics)

    num_test_docs = test_matrix.shape[0]
    test_matrix = np.concatenate((theta_test, theta_test), axis=1)

    training_matrix = training_matrix / np.sum(training_matrix, 1)[:, None]
    test_matrix = test_matrix / np.sum(test_matrix, 1)[:, None]

    svm_model = ll.train(sum(doc_authors, []), training_matrix.tolist(),
                         '-c 4')
    p_label, p_acc, p_val = ll.predict(np.random.rand(num_test_docs),
                                       test_matrix.tolist(), svm_model)

    author_probs = np.zeros((num_test_docs, n_authors))
    for doc, author in enumerate(p_label):
        author_probs[doc, int(author)] = 1

    return author_probs
Code Example #43
File: pos.py Project: Adderty/Pyrus
	def label(self, sentence):
		labeled = []
		prev = []
		for word in sentence:
			body = word.lower()
			
			featurespace = self._construct_featurespace(body, prev)
			
			p_label, _, _ = svm.predict([0], [featurespace.featureset], self._svm_model, '')
			label = p_label[0]
			
			prev.append((body, label))
			if len(prev) > self.chain_len:
				del(prev[0])
				
			labeled.append((word, label))
			
		return labeled
Code Example #44
File: predict.py Project: info-adk/baseline
def main():
        y, x = svm_read_problem(feature_file, return_scipy=True)
        # train:test = 7:3
        train_X = x[:14000]
        train_y = y[:14000]
        test_X = x[14000:]
        test_y = y[14000:]

        prob = problem(train_y, train_X)
        param = parameter("-c 1 -s 2")
        model = train(prob, param)
        p_labs, p_acc, p_vals = predict(test_y, test_X, model)
        accuracy, precision, recall = metrics_result(test_y, p_labs)
        print
        print "accuracy: ", accuracy
        print "precision: ", precision
        print "recall: ", recall


if __name__ == "__main__":
    main()
Code Example #45
File: bae.py Project: zhuhd15/binary-autoencoder
def f_step(features, models, verbose=True):
    # X = features
    Z = []
    for (m, i) in zip(models, range(len(models))):
        t_start = timeit.default_timer()
        p_label, p_acc, p_val = liblinearutil.predict([0] * features.shape[0],
                                                      features.tolist(), m,
                                                      str('-q'))
        Z.append(p_label)
        t_end = timeit.default_timer()
        if verbose:
            print('[F] {:3d}th bit, {:.4f} seconds elapsed'.format(
                i, t_end - t_start))

    Z = np.vstack(Z).transpose()

    # np.linalg.pinv(Z).dot(X).shape
    return (np.linalg.pinv(Z).dot(features), Z)
Code Example #46
    def classify(self, text):
        instance = NormalizedBinaryTextInstance(
            None, text, self.feature_indices)

        # Construct the feature vector.
        feature_vector = {}
        for f, v in instance.feature_values():
            feature_index = self.feature_indices.get(f)
            if feature_index != None:
                feature_vector[feature_index] = v

        options = '-b 1' if self.output_probability else ''
        p_labs, p_acc, p_vals = liblinearutil.predict([], [feature_vector], self.model, options)

        response = []
        vals = dict(enumerate(p_vals[0]))
        for label in sorted(vals, key=vals.get, reverse=True):
            response.append((label, self._soft_max_scaling(vals[label])))
        return response
Code Example #47
def run_classifier(train_file, test_file):

        count_one=0

        y_train, x_train = svm_read_problem(train_file)

        counter=0
        while counter<len(y_train):
                if y_train[counter]==-1:
                        count_one=count_one+1
                counter=counter+1

        w1=count_one/float(len(y_train))
        #w1=0.95 # Extra credit
        param='-s 0 -w1 '+str(w1)+' -w-1 '+str(1-w1)
        #param='-s 0'   # Extra Credit
        model = train(y_train, x_train, param)

        y_test, x_test = svm_read_problem(test_file)
        p_labels, p_acc, p_vals = predict(y_test, x_test, model, '-b 1')


        accuracy = p_acc[0]

        index=0
        if model.label[0]==1:
                index=0
        elif model.label[1]==1:
                index=1

        counter=0
        prob_list=[]
        while counter<len(p_vals):
                prob_list.append(p_vals[counter][index])
                counter=counter+1

        output_tup=(p_labels, y_test, prob_list)

        return output_tup
Code Example #48
File: predict.py Project: nicolay-r/tone-classifier
def liblinear_predict(problem_filepath, model_filepath):
    """
    Using LibLinear to predict result of a problem

    Returns
    -------
        (ids, labels)
    """

    # Reading a problem
    ids, x = liblinearutil.svm_read_problem(problem_filepath)

    print "len(x) = ", len(x)

    # Preparing a model
    model = liblinearutil.load_model(model_filepath)

    # Predicting
    y = [-2] * len(x)
    p_label, p_acc, p_val = liblinearutil.predict(y, x, model)

    return (ids, p_label)
Code Example #49
def classify_text(text, lang, exclude=[]):
    """ 
        makes the text into a feature vector, then classifies it. 
        exclude should be a list of strings to exclude from the vectors
        
        exclude excludes any phrase that contains or is equal to any of the
        string in the exclude list using a case insensitive comparison
        
        TODO: What if an unsupported language is requested?  The files won't be on disk
    """
    
    model = load_model(lang)
    features = load_features(lang)
    
    texts = [text.lower()]
    for e in exclude:
        # text is lowercased above, so splitting on the lowercased phrase
        # gives the case-insensitive exclusion promised in the docstring
        new_texts = []
        for t in texts:
            new_texts = new_texts + t.split(e.lower())
        texts = new_texts
    feature_vector = get_sparse_feature_vector(texts, features, exclude)
    p_label, p_acc, p_val = linu.predict([0], [feature_vector], model)
    p_val = p_val[0][0]/(1+abs(p_val[0][0]))
    return {'label':p_label[0],'value':p_val}
Code Example #50
File: tagger.py Project: gabor-recski/HunTag
    def getLogTagProbsByPos(self, senFeats):
        numberedSenFeats = self.getNumberedSenFeats(senFeats)
        contexts = [dict([(feat, 1) for feat in feats])
            for feats in numberedSenFeats]
        dummyOutcomes = [1 for c in contexts]
        _, __, probDistsByPos = predict(dummyOutcomes, contexts,
                                        self.model, self.params)

        """
        logTagProbsByPos = [ dict([(self.featCounter.noToFeat[i+1],
                                   math.log(prob))
                                   for i, prob in enumerate(probDist)])
                                   for probDist in probDistsByPos]
        """

        logTagProbsByPos = []
        for probDist in probDistsByPos:
            logProbsByTag = {}
            for c, prob in enumerate(probDist):
                tag = self.labelCounter.noToFeat[c+1]
                logProbsByTag[tag] = math.log(prob)
            logTagProbsByPos.append(logProbsByTag)

        return logTagProbsByPos
Code Example #51
def _lib_predict_liblinear(to_predict, num_pos, num_neg, modellog):
    sparse_to_predict, num_pos, num_neg = _convert_to_sparse_matrix(to_predict, num_pos, num_neg, False)
    labels_predict = ([1] * num_pos) + ([-1] * num_neg)
    p_labs, p_acc, p_vals = predict(labels_predict, sparse_to_predict, modellog)
    return (p_labs, p_acc, p_vals, labels_predict)
Code Example #52
File: maxent.py Project: EggplantElf/TeamLab
def predict(input_file, model0_file, model1_file, mapping_file, output_file):
    global bos, eos
    out = open(output_file, 'w')
    m0 = ll.load_model(model0_file)
    m1 = ll.load_model(model1_file)
    mapping = Mapping(mapping_file)
    bos = mapping.map_pos('BOS')
    eos = mapping.map_pos('EOS')

    print '# of features:', len(mapping.feature_dict)

    # for easier mapping from neighbouring pos tags to features
    p_f = {}
    for i in mapping.pos_dict_rev: #[0, 1, 2, 3, ...] bos and eos are also included
        pi = mapping.map_pos_rev(i)
        p_f[(-1, i)] = mapping.map_features('POS_P1:%s' % pi)
        p_f[(-2, i)] = mapping.map_features('POS_P2:%s' % pi)
        p_f[(+1, i)] = mapping.map_features('POS_N1:%s' % pi)
        p_f[(+2, i)] = mapping.map_features('POS_N2:%s' % pi)


        for j in mapping.pos_dict_rev:
            pj = mapping.map_pos_rev(j)
            p_f[(-1, -2, i, j)] = mapping.map_features('POS_P1_P2:%s_%s' % (pi, pj))
            p_f[(+1, +2, i, j)] = mapping.map_features('POS_N1_N2:%s_%s' % (pi, pj))
            p_f[(-1, +1, i, j)] = mapping.map_features('POS_P1_N1:%s_%s' % (pi, pj))

    s0, s1, s2 = 0, 0, 0
    total = 0
    for sent in read_sentence(input_file):
        x_ = []
        g_ = []
        for t in sent:
            feat = t.maxent_features(mapping.map_features)
            x_.append(feat)
            g_.append(mapping.map_pos(t.gold_pos))
        y_0 = map(int, ll.predict([], [{k : 1 for k in f} for f in x_], m0, '-q')[0])

        # y_2 = [choice(xrange(1, len(mapping.pos_dict))) for i in y_1]

        y_1, y_2 = inference(m1, p_f, y_0[:], x_, propose_deterministic)

        # y_2 = inference(m1, p_f, y_1[:], x_, propose_probabilistic)

        for y, y0, y1, y2, t in zip(g_, y_0, y_1, y_2, sent):
            if y == y0:
                s0 += 1
            if y == y1:
                s1 += 1
            if y == y2:
                s2 += 1
            total += 1

            p0 = mapping.map_pos_rev(y0)
            p1 = mapping.map_pos_rev(y1)
            p2 = mapping.map_pos_rev(y2)

            out.write('%s\t%s\n' % (t.word, p1))
        out.write('\n')

    out.close()
    print 'acc 0: %d / %d = %.4f' % (s0, total, float(s0) / total)
    print 'acc 1: %d / %d = %.4f' % (s1, total, float(s1) / total)
    print 'acc 2: %d / %d = %.4f' % (s2, total, float(s2) / total)
Code Example #53
File: minitagger.py Project: rasoolims/minitagger
    def train_actively(self, data_train, data_dev):
        """Does margin-based active learning on the given data."""

        # We will assume that we can label every example.
        assert not data_train.is_partially_labeled

        # Keep track of which examples can be still selected for labeling.
        __skip_extraction = []
        for _, label_sequence in data_train.sequence_pairs:
            __skip_extraction.append([False for _ in label_sequence])

        # Create an output directory.
        if os.path.exists(self.active_output_path):
            subprocess.check_output(["rm", "-rf", self.active_output_path])
        os.makedirs(self.active_output_path)
        logfile = open(os.path.join(self.active_output_path, "log"), "w")

        def __make_data_from_locations(locations):
            """
            Makes SequenceData out of a subset of data_train from given
            location=(sequence_num, position) pairs.
            """
            selected_positions = collections.defaultdict(list)
            for (sequence_num, position) in locations:
                selected_positions[sequence_num].append(position)

            sequence_list = []
            for sequence_num in selected_positions:
                word_sequence, label_sequence = \
                    data_train.sequence_pairs[sequence_num]
                selected_labels = [None for _ in range(len(word_sequence))]
                for position in selected_positions[sequence_num]:
                    selected_labels[position] = label_sequence[position]

                    # This example will not be selected again.
                    __skip_extraction[sequence_num][position] = True
                sequence_list.append((word_sequence, selected_labels))

            selected_data = SequenceData(sequence_list)
            return selected_data

        def __train_silently(data_selected):
            """Trains on the argument data in silent mode."""
            self.__feature_extractor.is_training = True  # Reset for training.
            quiet_value = self.quiet
            self.quiet = True
            self.train(data_selected, None)  # No need for development here.
            self.quiet = quiet_value

        def __interval_report(data_selected):
            # Only report at each interval.
            if data_selected.num_labeled_instances % \
                    self.active_output_interval != 0:
                return

            # Test on the development data if we have it.
            if data_dev is not None:
                quiet_value = self.quiet
                self.quiet = True
                _, acc = self.predict(data_dev)
                self.quiet = quiet_value
                message = "{0} labels: {1:.3f}%".format(
                    data_selected.num_labeled_instances, acc)
                print(message)
                logfile.write(message + "\n")
                logfile.flush()

            # Output the selected labeled examples so far.
            file_name = os.path.join(
                self.active_output_path,
                "example" + str(data_selected.num_labeled_instances))
            with open(file_name, "w") as outfile:
                outfile.write(data_selected.__str__())

        # Compute the (active_seed_size) most frequent word types in data_train.
        sorted_wordcount_pairs = sorted(data_train.observation_count.items(),
                                        key=lambda type_count: type_count[1],
                                        reverse=True)
        seed_wordtypes = [wordtype for wordtype, _ in
                          sorted_wordcount_pairs[:self.active_seed_size]]

        # Select a random occurrence of each selected type for a seed example.
        occurring_locations = collections.defaultdict(list)
        for sequence_num, (observation_sequence, _) in \
                enumerate(data_train.sequence_pairs):
            for position, word in enumerate(observation_sequence):
                if word in seed_wordtypes:
                    occurring_locations[word].append((sequence_num, position))
        locations = [random.sample(occurring_locations[wordtype], 1)[0] for
                     wordtype in seed_wordtypes]
        data_selected = __make_data_from_locations(locations)
        __train_silently(data_selected)  # Train for the first time.
        __interval_report(data_selected)

        while len(locations) < data_train.num_labeled_instances:
            # Make predictions on the remaining (i.e., not on the skip list)
            # labeled examples.
            [label_list, features_list, location_list] = \
                self.__feature_extractor.extract_features(\
                data_train, False, __skip_extraction)

            _, _, scores_list = \
                liblinearutil.predict(label_list, features_list,
                                      self.__liblinear_model, "-q")

            # Compute "confidence" of each prediction:
            #   max_{y} score(x,y) - max_{y'!=argmax_{y} score(x,y)} score(x,y')
            confidence_index_pairs = []
            for index, scores in enumerate(scores_list):
                sorted_scores = sorted(scores, reverse=True)

                # Handle the binary case: liblinear gives only 1 score whose
                # sign indicates the class (+ versus -).
                confidence = sorted_scores[0] - sorted_scores[1] \
                    if len(scores) > 1 else abs(scores[0])
                confidence_index_pairs.append((confidence, index))

            # Select least confident examples for next labeling.
            confidence_index_pairs.sort()
            for _, index in confidence_index_pairs[:self.active_step_size]:
                locations.append(location_list[index])
            data_selected = __make_data_from_locations(locations)
            __train_silently(data_selected)  # Train from scratch.
            __interval_report(data_selected)

        logfile.close()