Example #1
File: pos.py Project: Adderty/Pyrus
	def train(self, sentences, labels, cross_validation = False):
		x = []
		y = []
		
		for i in range(0, len(sentences)):
			sentence = sentences[i]
			prev = []
			
			j = 0
			for word in sentence:
				body = word.lower()
				
				featurespace = self._construct_featurespace(body, prev)
				
				prev.append((body, labels[i][j]))
				if len(prev) > self.chain_len:
				del prev[0]
					
				x.append(featurespace.featureset)
				j += 1

			y.extend(labels[i])

		prob = svm.problem(y, x)
		
		if cross_validation:
			param = svm.parameter('-c 1 -v 4 -s 4')
			svm.train(prob, param)
		else:
			param = svm.parameter('-c 1 -s 4')
			self._svm_model = svm.train(prob, param)
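A note on the two branches above: when the '-v' option is set, liblinearutil's train returns the cross-validation accuracy (a float; mean squared error for regression solvers) instead of a model, which is why only the else branch assigns self._svm_model. A minimal sketch of both call modes on toy data, assuming liblinearutil is installed:

import liblinearutil as lu

# Toy two-class data in LIBLINEAR's sparse dict format ({feature_index: value}).
y = [1, 1, 1, -1, -1, -1]
x = [{1: 1.0}, {1: 0.9}, {1: 1.2}, {1: -1.0}, {1: -0.8}, {1: -1.1}]

prob = lu.problem(y, x)

# With '-v k', train() runs k-fold cross-validation and returns the accuracy.
cv_accuracy = lu.train(prob, lu.parameter('-c 1 -s 4 -v 3'))

# Without '-v', train() returns a model object usable with lu.predict().
model = lu.train(prob, lu.parameter('-c 1 -s 4'))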
Example #2
def solve(train_X, train_Y, test_X, test_Y):
    best_lambda, test_accuracy = train(train_X, train_Y, train_X, train_Y)
    print("Answer for problem 16 is {0}".format(best_lambda))
    best_lambda, test_accuracy = train(train_X, train_Y, test_X, test_Y)
    print("Answer for problem 17 is {0}".format(best_lambda))

    train_x, val_x, train_y, val_y = split_data(train_X, train_Y, 120, 200)
    best_lambda, test_accuracy = train(train_x, train_y, val_x, val_y)
    print("best lambda is: {0}".format(best_lambda))

    model = liblinearutil.train(train_y, train_x, '-s 0 -c 50 -e 0.000001')
    label, accuracy, value = liblinearutil.predict(test_Y, test_X, model)
    print("Answer for problem 18 is {0}".format((100 - accuracy[0]) / 100))

    model = liblinearutil.train(train_Y, train_X, '-s 0 -c 50 -e 0.000001')
    label, accuracy, value = liblinearutil.predict(test_Y, test_X, model)
    print("Answer for problem 19 is {0}".format((100 - accuracy[0]) / 100))

    accuracy = []
    for i in range(5):
        train_x, val_x, train_y, val_y = split_data(train_X, train_Y, 40 * i, 40 * (i + 1))
        best_lambda, test_accuracy = train(train_x, train_y, val_x, val_y)
        accuracy.append(test_accuracy)

    mean_accuracy = np.mean(accuracy, axis=0)
    print("Answer for problem 20 is {0}".format(min(mean_accuracy)))
Example #3
    def train(self, sentences, labels, cross_validation=False):
        x = []
        y = []

        for i in range(0, len(sentences)):
            sentence = sentences[i]
            prev = []

            j = 0
            for word in sentence:
                body = word.lower()

                featurespace = self._construct_featurespace(body, prev)

                prev.append((body, labels[i][j]))
                if len(prev) > self.chain_len:
                    del prev[0]

                x.append(featurespace.featureset)
                j += 1

            y.extend(labels[i])

        prob = svm.problem(y, x)

        if cross_validation:
            param = svm.parameter('-c 1 -v 4 -s 4')
            svm.train(prob, param)
        else:
            param = svm.parameter('-c 1 -s 4')
            self._svm_model = svm.train(prob, param)
Example #4
    def _complete_training(self, debug=False):
        """ Forward data to external training and extract classifier information
        """
        if self.str_label_function is not None:
            self.label_function = eval(self.str_label_function)
            self.labels = self.label_function()

        options = "-c %.42f  -e %.42f -s %d -B %d" % \
             (self.complexity, self.tolerance, self.alg_num, self.offset)
        for i, w in enumerate(self.weight):
            options += " -w%d %.42f" % (i, w)
        if not self.debug:
            options += " -q"
            self._log("Liblinear is now quiet!")

        import liblinearutil

        param = liblinearutil.parameter(options)
        problem = liblinearutil.problem(self.labels, self.samples)
        model = liblinearutil.train(problem, param)

        self.calculate_classification_vector(model)
        if self.debug:
            print self.print_w
            print self.b
Example #5
def train_liblinear(args):
    model_name, gold_dir, dirs = args[0], args[1], args[2:]
    vectors, predicates = get_data(gold_dir, dirs)
    prob = problem(map(num_to_class, predicates), vectors)
    param = parameter('-s 0')
    model = train(prob, param)
    save_model(model_name, model)
Example #6
def train_liblinear(args):
    model_name, gold_dir, dirs = args[0], args[1], args[2:]
    vectors, predicates = get_data(gold_dir, dirs)
    prob = problem(map(num_to_class, predicates), vectors)
    param = parameter('-s 0')
    model = train(prob, param)
    save_model(model_name, model)
Example #7
def tuneParameters(weightString="-w-1 1.0 -w1 1.0 ", trainFile='models/type2_fc_10_11.train',
                   CList=[0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0], BList=[0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0], v=10):
  labels = []
  features = []
  bestAccuracy = 0
  bestParams = None
  with open(trainFile) as f:
    for line in f:
      if line == '':
        continue
      data = line.rstrip().split()
      labels.append(float(data[0]))
      data = [float(x.split(':')[1]) for x in data[1:]]
      features.append(data)
  for C in CList:
    for B in BList:
      myOptions =  weightString + "-s 0 -e 0.000001 -c " + str(C) + " -B " + str(B) + " -v " + str(v)
      accuracy = liblinearutil.train(labels, features, myOptions)
      
      print "C: " + str(C) + " B: " + str(B) + " " + str(accuracy)
      if accuracy > bestAccuracy:
        bestAccuracy = accuracy
        bestParams = (C,B)
  
  return bestAccuracy, bestParams[0], bestParams[1]
Example #8
    def _complete_training(self, debug=False):
        """ Forward data to external training and extract classifier information
        """
        if self.str_label_function is not None:
            self.label_function = eval(self.str_label_function)
            self.labels = self.label_function()

        options = "-c %.42f  -e %.42f -s %d -B %d" % \
             (self.complexity, self.tolerance, self.alg_num, self.offset)
        for i,w in enumerate(self.weight):
            options += " -w%d %.42f" % (i, w)
        if not self.debug:
            options += " -q"
            self._log("Liblinear is now quiet!")

        import liblinearutil

        param = liblinearutil.parameter(options)
        problem = liblinearutil.problem(self.labels, self.samples)
        model = liblinearutil.train(problem, param)

        self.calculate_classification_vector(model)
        if self.debug:
            print self.print_w
            print self.b
Example #9
    def _train(self, folder):
        # train classifier on all .txt files in <folder>
        entities = dict()
    
        files = filter(lambda x: x.endswith('.txt'), os.listdir(folder))
        for f in files:
            # assign number to current language
            print('LogR adds file ' + f)
            for tweet in LogR._read_from_file(os.path.join(folder, f)):
                # get features
                features = self._extract_features(tweet)
                # push out twit into training set
                ind = LogR._get_dict_and_update(self.languages, f[:-4])
                entities[ind] = entities.get(ind, []) + [features]

        target = []
        objects = []
        for lang in entities.keys():
            for obj in entities[lang]:
                target.append(lang)
                objects.append(obj)
        
        print('LogR started training...')
        self.model = liblinearutil.train(target, objects, '-s 0')
        print('Finished!')
Example #10
    def best_C(self, x, y):
        """
        Trains using y=list, x=dict.
        parameter = string of parameters
        Searches for the best C.
        """
        prob = lu.problem(y, x)
        para = "-s 2 -C -B %f -p %f -e %f" % (self.bias,
                                              self.p,
                                              self.eps)
        print para
        para1 = lu.parameter(para)
        self.model = lu.train(prob, para1)
        best_C, best_rate = lu.train(y, x, para)
        return best_C, best_rate
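For context on the '-C' option used above: when parameter search is requested, train does not return a model; in the LIBLINEAR releases these examples target it returns a (best_C, best_rate) pair (newer releases return a third value, the best '-p' for regression solvers), so the self.model assignment above actually stores that tuple, not a model. A minimal sketch on toy data, assuming liblinearutil is importable as lu:

import liblinearutil as lu

y = [1, 1, -1, -1]
x = [{1: 0.9}, {1: 1.1}, {1: -1.0}, {1: -0.7}]

# '-C' runs LIBLINEAR's built-in search over C; the return value is a tuple,
# not a model ('-s 2' here; the search also supports '-s 0').
best_C, best_rate = lu.train(y, x, '-s 2 -C')

# Train the actual model with the C found by the search.
model = lu.train(y, x, '-s 2 -c %f' % best_C)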
Example #11
    def build(self, images, targets, extra):
        shapes, mean_shape, i_stage = extra
        n_landmarks = mean_shape.n_points
        feature_extractor = self.feature_extractor_builder.build(
            images, shapes, targets, (mean_shape, i_stage))

        print("Extracting local binary features for each image.\n")
        features = [
            list(feature_extractor.apply(images[i], shapes[i]))
            for i in xrange(len(images))
        ]
        print("Features extracted.\n")
        w = np.zeros(shape=(2 * n_landmarks, len(features[0])))

        for lmark in xrange(2 * n_landmarks):
            print_dynamic(
                "Learning linear regression coefficients for landmark coordinate {}/{}.\n"
                .format(lmark, 2 * n_landmarks))
            linreg = liblinearutil.train(
                list(targets[:, lmark]), features,
                "-s 12 -p 0 -c {}".format(1 / float(len(features))))
            w_list = linreg.get_decfun()[0]
            w[lmark][0:len(w_list)] = w_list

        return GlobalRegression(feature_extractor, w, mean_shape)
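get_decfun() as used above returns a (w, b) pair, where w is a plain Python list of feature weights and b is the bias term, so get_decfun()[0] extracts the learned weight vector of each per-coordinate regressor. A minimal sketch with scalar targets and toy features:

import liblinearutil

# L2-regularized L2-loss support vector regression in the dual ('-s 12').
targets = [0.5, 1.0, 1.5, 2.0]
features = [{1: 1.0}, {1: 2.0}, {1: 3.0}, {1: 4.0}]
model = liblinearutil.train(targets, features, '-s 12 -p 0 -c 1 -q')

w, b = model.get_decfun()  # weight list and bias term
print(w, b)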
Example #12
    def train(self, data_train, data_dev):
        """Trains Minitagger on the given data."""
        start_time = time.time()
        assert self.__feature_extractor.is_training  # Assert untrained

        # Extract features (only labeled instances) and pass them to liblinear.
        [label_list, features_list, _] = \
            self.__feature_extractor.extract_features(data_train, False, [])
        if not self.quiet:
            print("{0} labeled instances (out of {1})".format(
                    len(label_list), data_train.num_instances))
            print("{0} label types".format(len(data_train.label_count)))
            print("{0} observation types".format(
                    len(data_train.observation_count)))
            print("\"{0}\" feature template".format(
                    self.__feature_extractor.feature_template))
            print("{0} feature types".format(
                    self.__feature_extractor.num_feature_types()))
        problem = liblinearutil.problem(label_list, features_list)
        self.__liblinear_model = \
            liblinearutil.train(problem, liblinearutil.parameter("-q"))
        self.__feature_extractor.is_training = False

        if not self.quiet:
            num_seconds = int(math.ceil(time.time() - start_time))
            print("Training time: {0}".format(
                    str(datetime.timedelta(seconds=num_seconds))))
            if data_dev is not None:
                quiet_value = self.quiet
                self.quiet = True
                _, acc = self.predict(data_dev)
                self.quiet = quiet_value
                print("Dev accuracy: {0:.3f}%".format(acc))
Example #13
def _lib_train_liblinear(user_tfidf, num_pos, num_neg, ignore):
    param = parameter("-s 0")
    sparse_user_tfidf, num_pos, num_neg = _convert_to_sparse_matrix(user_tfidf, num_pos, num_neg, ignore)
    labels = ([1] * num_pos) + ([-1] * num_neg)
    prob = problem(labels, sparse_user_tfidf)
    modellog = train(prob, param)
    return modellog
Example #14
def svm(name):
    '''Trains a linear SVM ('-s 1': L2-regularized L2-loss SVC, dual) on the feature extracted data.
    
    Name is the data set name, e.g. who_won_1031.
    '''
    data = load_dataset(name)
    return llb.train(data.Y, data.X, '-s 1')
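For reference when reading the various '-s' flags throughout these examples, the LIBLINEAR solver types that appear most often are summarized below (per the LIBLINEAR documentation):

# LIBLINEAR solver types (-s):
#   0  - L2-regularized logistic regression (primal)
#   1  - L2-regularized L2-loss SVC (dual)
#   2  - L2-regularized L2-loss SVC (primal)
#   3  - L2-regularized L1-loss SVC (dual)
#   4  - multi-class SVM by Crammer and Singer
#   11 - L2-regularized L2-loss SVR (primal)
#   12 - L2-regularized L2-loss SVR (dual)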
Example #15
    def train(self, data_train, data_dev):
        """Trains Minitagger on the given data."""
        start_time = time.time()
        assert self.__feature_extractor.is_training  # Assert untrained

        # Extract features (only labeled instances) and pass them to liblinear.
        [label_list, features_list, _] = \
            self.__feature_extractor.extract_features(data_train, False, [])
        if not self.quiet:
            print("{0} labeled instances (out of {1})".format(
                len(label_list), data_train.num_instances))
            print("{0} label types".format(len(data_train.label_count)))
            print("{0} observation types".format(
                len(data_train.observation_count)))
            print("\"{0}\" feature template".format(
                self.__feature_extractor.feature_template))
            print("{0} feature types".format(
                self.__feature_extractor.num_feature_types()))
        problem = liblinearutil.problem(label_list, features_list)
        self.__liblinear_model = \
            liblinearutil.train(problem, liblinearutil.parameter("-q"))
        self.__feature_extractor.is_training = False

        if not self.quiet:
            num_seconds = int(math.ceil(time.time() - start_time))
            print("Training time: {0}".format(
                str(datetime.timedelta(seconds=num_seconds))))
            if data_dev is not None:
                quiet_value = self.quiet
                self.quiet = True
                _, acc = self.predict(data_dev)
                self.quiet = quiet_value
                print("Dev accuracy: {0:.3f}%".format(acc))
Example #16
    def train(self):
        sys.stderr.write('creating training problem...')
        prob = problem(self.labels, self.contexts)
        sys.stderr.write('done\ntraining with option(s) "' + self.parameters +
                         '"...')
        self.model = train(prob, parameter(self.parameters))
        sys.stderr.write('done\n')
Example #17
def LDA_SVM(matrix, test_matrix, n_authors, doc_authors, vocab, stopwords):
    # set parameters
    num_topics = 20
    burn_in = 1000  # 0
    alpha = 0.1
    beta = 0.1
    samples = 8
    spacing = 100

    num_test_docs = test_matrix.shape[0]

    sampler = lda.LDA(num_topics, alpha, beta)

    print('Starting!')
    theta, phi, likelihood = sampler.train(matrix, burn_in, samples, spacing)
    print('likelihood: ', likelihood)

    theta_test, likelihood = sampler.classify(test_matrix, phi, burn_in,
                                              samples, spacing)
    print('likelihood: ', likelihood)

    theta = theta / np.sum(theta, 1)[:, None]
    theta_test = theta_test / np.sum(theta_test, 1)[:, None]

    svm_model = ll.train(sum(doc_authors, []), theta.tolist(), '-c 4')
    p_label, p_acc, p_val = ll.predict(np.random.rand(num_test_docs),
                                       theta_test.tolist(), svm_model)
    author_probs = np.zeros((num_test_docs, n_authors))
    for doc, author in enumerate(p_label):
        author_probs[doc, int(author)] = 1

    return author_probs
Example #18
    def train(self, data_train, data_test):
        """
        Trains Minitagger on the given train data. If test data is given, it reports the accuracy of the trained model
        and the F1_score (macro average of f1_score of each label).

        @type data_train: SequenceData
        @param data_train: the training data set
        @type data_test: SequenceData
        @param data_test: the test data set
        """

        # keep the training start timestamp
        start_time = time.time()
        assert (self.__feature_extractor.is_training
                ), "In order to train, is_training flag should be True"

        # Extract features only for labeled instances from data_train
        [label_list, features_list,
         _] = self.__feature_extractor.extract_features(data_train, False, [])
        # print some useful information about the data
        if not self.quiet:
            print("{0} labeled words (out of {1})".format(
                len(label_list), data_train.num_of_words))
            print("{0} label types".format(len(data_train.label_count)))
            print("{0} word types".format(len(data_train.word_count)))
            print("\"{0}\" feature template".format(
                self.__feature_extractor.feature_template))
            print("{0} feature types".format(
                self.__feature_extractor.num_feature_types()))
        # define problem to be trained using the parameters received from the feature_extractor
        problem = liblinearutil.problem(label_list, features_list)
        # train the model (-q stands for quiet = True in the liblinearutil)
        self.__liblinear_model = liblinearutil.train(
            problem, liblinearutil.parameter("-q"))
        # training is done, set is_training to False, so that prediction can be done
        self.__feature_extractor.is_training = False

        # print some useful information
        if not self.quiet:
            num_seconds = int(math.ceil(time.time() - start_time))
            # how much did the training last
            print("Training time: {0}".format(
                str(datetime.timedelta(seconds=num_seconds))))
            # perform prediction on the data_test and report accuracy
        if data_test is not None:
            quiet_value = self.quiet
            self.quiet = True
            pred_labels, acc = self.predict(data_test)
            self.quiet = quiet_value

            self.__save_prediction_to_file(data_test, pred_labels)
            f1score, precision, recall = report_fscore(self.prediction_path +
                                                       "/predictions.txt",
                                                       wikiner=self.wikiner)
            print("Accuracy: ", acc)
            # create some files useful for debugging
            if self.debug:
                self.__debug(data_test, pred_labels)
        return f1score, precision, recall
Example #19
def alternating_train(x, y, lands, c1, c2=1, params='-s 2 -B 1 -q'):

    nb_views = lands.shape[2]

    L = np.hstack([lands[:, :, v] for v in range(nb_views)])
    M = np.hstack([x[:, :, v] for v in range(nb_views)])

    l = len(lands)
    m = len(x)

    r0, mask = missing_lstsq(L, M)
    s0 = np.dot(r0, L)

    sample = s0.copy()
    R = r0.copy()

    y_list = y.tolist()
    svm = liblin.train(y_list, sample.tolist(),
                       '-c {} '.format(c1) + params)

    it = 0
    while True:
        it += 1

        res = minimize(r_obj_function,
                       R.flatten(),
                       args=(M, y, L, svm, mask, c1, c2),
                       options={'disp': True})
        # for i in range(len(x)):
        #     r_i = minimize(r_obj_function, R[i, :], args=(i, M, y, L, svm, mask, c1, c2), options={'disp':  False})
        #     R[i] = r_i.x
        #     cost += r_i.fun

        print(res.fun)
        R = res.x.reshape(m, l)
        sample = np.dot(R, L)

        svm = liblin.train(y.tolist(), sample.tolist(),
                           '-c {} '.format(c1) + params)

        _, p_acc, _ = liblin.predict(y, sample.tolist(), svm, "-q")
        print(p_acc)
        if it == 1:
            break
    return svm
Example #20
def train_liblinear(features, labels, C=1, s=1, folds=10, threads=4):
    print('Training model...')
    start = time.time()
    model = liblinearutil.train(
        labels, features,
        '-c {0} -s {1} -v {2} -n {3}'.format(C, s, folds, threads))
    end = time.time()
    print('Model trained in ' + str(end - start) + ' seconds')
    return model
Example #21
File: svm.py Project: Imperat/Pyrus
    def train_regression(self, x, y):
        data = []
        for sample in x:
            data.append(dict([(self._features.setId(d), sample[d]) for d in sample]))

        self._regression = True
        param = liblinear.parameter("-c 1 -s 0")
        prob = liblinear.problem(y, data)
        self._model = liblinear.train(prob, param)
Example #22
	def train(self):
		y = list(self.training[0])
		x = list(self.training[1])
		y.append(-1)
		x.append({1:1})

		if len(x) <= 1:
			message('unable to recommend because you have not read any feed.', 'Error', die=True)

		return liblinearutil.train(y, x, '-q')
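The two append calls above are a guard, not a typo: padding the training set with one artificial negative instance ({1: 1} in LIBLINEAR's 1-based {feature_index: value} dict format) guarantees at least two classes are present, so train does not fail when every real example shares a single label. A minimal standalone sketch of the same trick, with names chosen for illustration:

import liblinearutil

def train_with_dummy(y, x):
    # Copy, then pad with one artificial negative so both classes exist.
    y = list(y) + [-1]
    x = list(x) + [{1: 1}]
    return liblinearutil.train(y, x, '-q')

model = train_with_dummy([1, 1], [{1: 0.5, 2: 1.0}, {2: 0.3}])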
Example #23
    def train_regression(self, x, y):
        data = []
        for sample in x:
            data.append(
                dict([(self._features.setId(d), sample[d]) for d in sample]))

        self._regression = True
        param = liblinear.parameter('-c 1 -s 0')
        prob = liblinear.problem(y, data)
        self._model = liblinear.train(prob, param)
Example #24
def rank_pooling(data):
    #mean = time_varying_mean(data)
    #non_linear_mean = non_linearity(mean)
    normalized_data = normalize(data, axis=1, norm='l2')

    total_frames = normalized_data.shape[0]
    labels = list(range(1, total_frames + 1))
    data = normalized_data.tolist()
    model = train(labels, data, '-s 11 -q')
    return np.array(model.get_decfun()[0])
Example #25
def train_log_regr_liblinear(features, responses):
    echo('Training with Liblinear')
    prob = liblinearutil.problem(responses, features)

    # -s 0: L2-regularized logistic regression (primal)
    # -B 1: Fit a bias term
    # -q: quiet mode
    param = liblinearutil.parameter('-s 0 -B 1 -q')

    return liblinearutil.train(prob, param)
Example #26
def train_SVR_liblinear(features, responses):
    echo('Training with Liblinear')
    prob = liblinearutil.problem(responses, features)

    # -s 11: L2-regularized L2-loss support vector regression (primal)
    # -B 1: Fit a bias term
    # -q: quiet mode
    param = liblinearutil.parameter('-s 11 -B 1 -q')

    return liblinearutil.train(prob, param)
Example #27
def train(C, Y_train, X_train, x_lines):
    """
    This function takes in the training labels and features, trains a model, and saves it.
    :param C       : the regularization parameter C
    :param Y_train : training labels
    :param X_train : training features
    :param x_lines : number of training lines, used in the saved model's filename
    :return: None
    """
    # for c in C:
    param = '-s 2 -c ' + str(C)
    model = lu.train(Y_train, X_train, param)
    lu.save_model("model/lmods2_tamper" + str(round(C,2)) + "_" + str(x_lines) + "l.model", model)
Example #28
def parallel_train_predict(args):
    print("A process begins.")
    x_train,y_train,x_test,y_test=args
    problem = liblinearutil.problem(y_train, x_train)
    parameter = liblinearutil.parameter('-s 0 -c 1')
    time_start = time.clock()
    model = liblinearutil.train(problem, parameter)
    print("A process training finished in %f."%(time.clock()-time_start))
    time_start = time.clock()
    p_label, p_acc, p_val = liblinearutil.predict(y_test, x_test,model,'-b 0')
    print("A process predicting finished in %f."%(time.clock()-time_start))
    return p_val
Example #29
def validation(k, data_x, data_y, s, e, C):
    accuracies = []
    params = get_params(s, e, C)
    print('s = {}, e = {}, C = {}'.format(s, e, C))
    for fold in range(k):
        train_x, test_x = get_k_fold(k, fold, data_x)
        train_y, test_y = get_k_fold(k, fold, data_y)
        m = liblinearutil.train(train_y, train_x, params)
        _, p_acc, __ = liblinearutil.predict(test_y, test_x, m)
        accuracies.append(p_acc[0])

    return accuracies
Example #30
    def train(self):
        y = list(self.training[0])
        x = list(self.training[1])
        y.append(-1)
        x.append({1: 1})

        if len(x) <= 1:
            message('unable to recommend because you have not read any feed.',
                    'Error',
                    die=True)

        return liblinearutil.train(y, x, '-q')
Example #31
def train(c, Y_train, X_train):
    """
    This function takes in the training labels and features, trains a model, and saves it.
    :param c       : the regularization parameter C
    :param Y_train : training labels
    :param X_train : training features
    :return: None
    """
    #for c in C:
    param = '-s 2 -c ' + str(c)
    model = lu.train(Y_train, X_train, param)
    lu.save_model("model/lmods2_"+str(round(c,2))+".model", model)
Example #32
def h_step(features, codes, verbose=True):
    N, D = features.shape
    models = []
    for (y, i) in zip(codes.T, range(codes.shape[1])):
        t_start = timeit.default_timer()
        models.append(
            liblinearutil.train(y.tolist(), features.tolist(),
                                str('-s 0 -c 4 -q')))
        t_end = timeit.default_timer()
        if verbose:
            print('[H] {:3d}th bit, {:.4f} seconds elapsed'.format(
                i, t_end - t_start))
    return models
Example #33
def train(train_data, features, c):
    x = []
    y = []

    for key in train_data:
        y.append(train_data[key]['class'])
        x.append(features[key])

    prob = liblinearutil.problem(y, x)
    param = liblinearutil.parameter('-q -c ' + str(c))
    model = liblinearutil.train(prob, param)

    return model
Example #34
File: svm.py Project: Imperat/Pyrus
    def train(self, x, y, biased=False):
        data = []
        for sample in x:
            data.append(dict([(self._features.setId(d), sample[d]) for d in sample]))

        labels = [self._labels.setId(C) for C in y]
        if self._labels.count() == 2:
            labels = [1 if label == 1 else -1 for label in labels]
            param = liblinear.parameter("-c 1 -s 2 -q" + (" -B {0}".format(biased) if biased else ""))
        else:
            param = liblinear.parameter("-c 1 -s 4 -q" + (" -B {0}".format(biased) if biased else ""))
        prob = liblinear.problem(labels, data)
        self._model = liblinear.train(prob, param)
Example #35
def unimodalPredDev(gs, feats, nDim):
	parts = ['dev']
	[cccs, preds] = [{} for i in range(2)]
	for s in parts:
		cccs[s] = -1.0
	warnings.filterwarnings('ignore', category=ConvergenceWarning)
	#Liblinear
	for comp in v.C:
		#Options for liblinear
		options = "-s "+str(v.sVal)+" -c "+str(comp)+" -B 1 -q"
		#We learn the model on train
		model = train(gs['train'][nDim],feats['train'],options)
		#We predict on data
		for s in parts:
			pred = np.array(predict(gs[s][nDim],feats[s],model,"-q"))[0]
			#We calculate the correlation and store it
			ccc = cccCalc(np.array(pred),gs[s][nDim])
			if (ccc > cccs[s]):
				preds[s] = pred
				cccs[s] = ccc
				function = "SVR"
				alpha = comp
	if (v.fullMode == True):
		#We see if we can do better with sklearn
		for nbFunc in range(len(v.lFunc)):
			for c in v.parFunc[nbFunc]:
				func = v.lFunc[nbFunc]
				reg = func[0](alpha=c)
				#One task prediction
				if (func[1] == 0):
					reg.fit(feats['train'],gs['train'][nDim])
					for s in parts:
						p = reg.predict(feats['dev'])
						ccc = cccCalc(p,gs[s][nDim])
						if (ccc > cccs[s]) : 
							preds[s] = p
							cccs[s] = ccc
							function = func[2]
							alpha = c
				#Multi task prediction
				else :
					reg.fit(feats['train'],np.transpose(gs['train']))
					for s in parts:
						p = reg.predict(feats['dev'])[:,nDim]
						ccc = cccCalc(p,gs[s][nDim])
						if (ccc > cccs[s]) : 
							preds[s] = p
							cccs[s] = ccc
							function = func[2]
							alpha = c
	return cccs, preds, function, alpha
Example #36
def TOKEN_SVM(matrix, test_matrix, n_authors, doc_authors, vocab, stopwords):
    n_docs = matrix.shape[0]
    n_test_docs = test_matrix.shape[0]
    matrix = matrix / np.sum(matrix, 1)[:, None]
    test_matrix = test_matrix / np.sum(test_matrix, 1)[:, None]

    svm_model = ll.train(sum(doc_authors, []), matrix.tolist(), '-c 4')
    p_label, p_acc, p_val = ll.predict(np.random.rand(n_test_docs),
                                       test_matrix.tolist(), svm_model)

    author_probs = np.zeros((n_test_docs, n_authors))
    for doc, author in enumerate(p_label):
        author_probs[doc, int(author)] = 1

    return author_probs
Example #37
def train(train_X, train_Y, test_X, test_Y):
    test_accuracy = []

    model_1 = liblinearutil.train(train_Y, train_X, '-s 0 -c 5000 -e 0.000001')
    label, accuracy, value = liblinearutil.predict(test_Y, test_X, model_1)
    test_accuracy.append((100 - accuracy[0]) / 100)

    model_2 = liblinearutil.train(train_Y, train_X, '-s 0 -c 50 -e 0.000001')
    label, accuracy, value = liblinearutil.predict(test_Y, test_X, model_2)
    test_accuracy.append((100 - accuracy[0]) / 100)

    model_3 = liblinearutil.train(train_Y, train_X, '-s 0 -c 0.5 -e 0.000001')
    label, accuracy, value = liblinearutil.predict(test_Y, test_X, model_3)
    test_accuracy.append((100 - accuracy[0]) / 100)

    model_4 = liblinearutil.train(train_Y, train_X, '-s 0 -c 0.005 -e 0.000001')
    label, accuracy, value = liblinearutil.predict(test_Y, test_X, model_4)
    test_accuracy.append((100 - accuracy[0]) / 100)

    model_5 = liblinearutil.train(train_Y, train_X, '-s 0 -c 0.00005 -e 0.000001')
    label, accuracy, value = liblinearutil.predict(test_Y, test_X, model_5)
    test_accuracy.append((100 - accuracy[0]) / 100)

    return lambda_set[test_accuracy.index(min(test_accuracy))], test_accuracy
Example #38
    def train(self, x, y, biased=False):
        data = []
        for sample in x:
            data.append(
                dict([(self._features.setId(d), sample[d]) for d in sample]))

        labels = [self._labels.setId(C) for C in y]
        if self._labels.count() == 2:
            labels = [1 if label == 1 else -1 for label in labels]
            param = liblinear.parameter(
                '-c 1 -s 2 -q' + (' -B {0}'.format(biased) if biased else ''))
        else:
            param = liblinear.parameter(
                '-c 1 -s 4 -q' + (' -B {0}'.format(biased) if biased else ''))
        prob = liblinear.problem(labels, data)
        self._model = liblinear.train(prob, param)
Example #39
def eval_SVM(X, y, Xhat, yhat):
    # create classification problem
    problem = liblinearutil.problem(y, X)

    # set SVM parameters
    svm_param = liblinearutil.parameter('-s 3 -c 10 -q -B 1')

    # train SVM
    model = liblinearutil.train(problem, svm_param)

    # predict and evaluate
    p_label, p_acc, p_val = liblinearutil.predict(yhat, Xhat, model, '-q')

    # compute accuracy
    acc, mse, scc = liblinearutil.evaluations(yhat, p_label)
    return acc
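evaluations as used above takes true and predicted label lists and returns an (accuracy, mean squared error, squared correlation coefficient) triple; predict already returns the same triple as its second value, so the extra call here recomputes the metrics from the returned labels. A minimal sketch on toy data:

import liblinearutil

y = [1, -1, 1, -1]
x = [{1: 1.0}, {1: -1.0}, {1: 0.8}, {1: -0.6}]
model = liblinearutil.train(y, x, '-s 3 -c 10 -q')

p_label, p_acc, p_val = liblinearutil.predict(y, x, model, '-q')
acc, mse, scc = liblinearutil.evaluations(y, p_label)  # acc matches p_acc[0]
print(acc, mse, scc)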
Example #40
    def build(self, images, targets, extra):
        shapes, mean_shape, i_stage = extra
        n_landmarks = mean_shape.n_points
        feature_extractor = self.feature_extractor_builder.build(images, shapes, targets, (mean_shape, i_stage))

        print("Extracting local binary features for each image.\n")
        features = [ list(feature_extractor.apply(images[i], shapes[i])) for i in xrange(len(images)) ]
        print("Features extracted.\n")
        w = np.zeros(shape=(2*n_landmarks, len(features[0])))

        for lmark in xrange(2*n_landmarks):
            print_dynamic("Learning linear regression coefficients for landmark coordinate {}/{}.\n".format(lmark, 2*n_landmarks))
            linreg = liblinearutil.train(list(targets[:, lmark]), features, "-s 12 -p 0 -c {}".format(1/float(len(features))))
            w_list = linreg.get_decfun()[0]
            w[lmark][0:len(w_list)] = w_list

        return GlobalRegression(feature_extractor, w, mean_shape)
Example #41
def AT_FA_SVM(matrix, test_matrix, n_authors, doc_authors, vocab, stopwords):
    # set parameters
    num_topics = 4
    burn_in = 1000  # 0
    alpha = 0.1
    beta = 0.1
    samples = 8
    spacing = 100

    num_test_docs = test_matrix.shape[0]

    doc_authors_new, n_authors_new = add_fic_authors(doc_authors, n_authors)

    sampler = at.AtSampler(num_topics, n_authors_new, alpha, beta)

    print('Starting!')
    theta, phi, likelihood = sampler.train(doc_authors_new, matrix, burn_in,
                                           samples, spacing)
    print('theta:', theta.shape)
    print('phi:', phi.shape)
    print('likelihood:', likelihood)

    sampler.n_authors = num_test_docs

    theta_test = sampler.classify(test_matrix, phi, burn_in, samples, spacing)
    print('theta test:', theta_test.shape)

    training_matrix = concatenate_fic_authors(doc_authors, num_topics)

    num_test_docs = test_matrix.shape[0]
    test_matrix = np.concatenate((theta_test, theta_test), axis=1)

    training_matrix = training_matrix / np.sum(training_matrix, 1)[:, None]
    test_matrix = test_matrix / np.sum(test_matrix, 1)[:, None]

    svm_model = ll.train(sum(doc_authors, []), training_matrix.tolist(),
                         '-c 4')
    p_label, p_acc, p_val = ll.predict(np.random.rand(num_test_docs),
                                       test_matrix.tolist(), svm_model)

    author_probs = np.zeros((num_test_docs, n_authors))
    for doc, author in enumerate(p_label):
        author_probs[doc, int(author)] = 1

    return author_probs
Example #42
    def learn_embedding(self, G, **kwargs):
        print('l2svm G')
        print(nx.info(G))
        npairs = G.number_of_nodes() * (G.number_of_nodes() - 1)
        self.mapping = {}
        self.feature_vecs = np.zeros((npairs, len(G.nodes[0]['fingerprint']) * 3))
        labels = np.zeros(npairs) - 1

        k = 0
        nnodes = G.number_of_nodes()
        for i in range(nnodes - 1):
            fpi = G.nodes[i]['fingerprint']
            for j in range(i + 1, nnodes):
                fpj = G.nodes[j]['fingerprint']
                self.feature_vecs[k] = self._make_feature_vecs(fpi, fpj)
                labels[k] = labels[k + 1] = int(G.has_edge(i, j))
                self.mapping[(i, j)] = k
                k += 1
                self.feature_vecs[k] = self._make_feature_vecs(fpj, fpi)
                self.mapping[(j, i)] = k
                k += 1
        assert np.all(labels >= 0)
        print('%d training instances' % (len(labels)))
        """
        self.svm = LinearSVC(loss='hinge', C=self.C, tol=0.1,
                             random_state=self.random_seed, verbose=1)
        self.svm.fit(self.feature_vecs, labels)
        print('Training completed')
        print('Accuracy on training set', self.svm.score(self.feature_vecs[:10], labels[:10]))
        """
        # -s 3 is L2-regularized L1-loss SVC
        params = '-s 2 -C'  # parameter selection
        params = '-s %d -c %f' % (self.s, self.C)

        print('SVM param:', params)

        self.svm = train(labels, self.feature_vecs, params)
        print('SVM instance', self.svm)
        test_true_edges = list(G.edges())[:10]
        test_neg_edges = list(nx.complement(G).edges())[:10]
        yscore = self.get_edge_scores(test_true_edges + test_neg_edges)
        ylabel = np.concatenate((np.ones(10), np.zeros(10)))
        auc = roc_auc_score(ylabel, yscore)
        print('AUC on training set', auc)
        self.G = G
Example #43
def main():
    y, x = svm_read_problem(feature_file, return_scipy=True)
    # train:test = 7:3
    train_X = x[:14000]
    train_y = y[:14000]
    test_X = x[14000:]
    test_y = y[14000:]

    prob = problem(train_y, train_X)
    param = parameter("-c 1 -s 2")
    model = train(prob, param)
    p_labs, p_acc, p_vals = predict(test_y, test_X, model)
    accuracy, precision, recall = metrics_result(test_y, p_labs)
    print
    print "accuracy: ", accuracy
    print "precision: ", precision
    print "recall: ", recall

if __name__ == "__main__":
    main()
Example #44
    def train(self, x, y):
        """
        Trains using y=list, x=dict.
        parameter = string of parameters
        """
        prob = lu.problem(y, x)
        para = "-s %d -c %f -B %f -p %f -e %f" % (self.L,
                                                  self.c,
                                                  self.bias,
                                                  self.p,
                                                  self.eps)
        if self.v != 0:
            para += " -v %d" % self.v
        if self.q != 0:
            para += " -q"
        print para
        para1 = lu.parameter(para)
        self.model = lu.train(prob, para1)
        return True
Example #45
def run_classifier(train_file, test_file):

        count_one=0

        y_train, x_train = svm_read_problem(train_file)

        counter=0
        while counter<len(y_train):
                if y_train[counter]==-1:
                        count_one=count_one+1
                counter=counter+1

        w1=count_one/float(len(y_train))
        #w1=0.95 # Extra credit
        param='-s 0 -w1 '+str(w1)+' -w-1 '+str(1-w1)
        #param='-s 0'   # Extra Credit
        model = train(y_train, x_train, param)

        y_test, x_test = svm_read_problem(test_file)
        p_labels, p_acc, p_vals = predict(y_test, x_test, model, '-b 1')


        accuracy = p_acc[0]

        index=0
        if model.label[0]==1:
                index=0
        elif model.label[1]==1:
                index=1

        counter=0
        prob_list=[]
        while counter<len(p_vals):
                prob_list.append(p_vals[counter][index])
                counter=counter+1

        output_tup=(p_labels, y_test, prob_list)

        return output_tup
Example #46
def unimodalPredTest(gs, feats, nDim, func, c):
    [cccs, preds] = [{} for i in range(2)]
    for s in v.aPart:
        cccs[s] = -1.0
    warnings.filterwarnings('ignore', category=ConvergenceWarning)
    if (func == "SVR"):
        #Options for liblinear
        options = "-s " + str(v.sVal) + " -c " + str(c) + " -B 1 -q"
        #We learn the model on train
        model = train(gs['train'][nDim], feats['train'], options)
        #We predict on data
        for s in v.aPart:
            pred = np.array(predict(gs[s][nDim], feats[s], model, "-q"))[0]
            #We calculate the correlation and store it
            ccc = cccCalc(np.array(pred), gs[s][nDim])
            if (ccc > cccs[s]):
                preds[s] = pred
                cccs[s] = ccc
    else:
        for f in v.lFunc:
            if (f[2] == func):
                fun = f
        reg = fun[0](alpha=c)
        if (fun[1] == 0):
            reg.fit(feats['train'], gs['train'][nDim])
            for s in v.aPart:
                p = reg.predict(feats[s])
                ccc = cccCalc(p, gs[s][nDim])
                if (ccc > cccs[s]):
                    preds[s] = p
                    cccs[s] = ccc
        else:
            reg.fit(feats['train'], np.transpose(gs['train']))
            for s in v.aPart:
                p = reg.predict(feats[s])[:, nDim]
                ccc = cccCalc(p, gs[s][nDim])
                if (ccc > cccs[s]):
                    preds[s] = p
                    cccs[s] = ccc
    return cccs, preds, func, c
Example #47
def _svm_test_attr_unit(worker_idx, idx_attr_rng, feat_train, feat_test,
                        label_train, label_test, attr_entry, cache_dir):
    idx_list = range(idx_attr_rng[0], idx_attr_rng[1])
    c_list = [0.1, 1., 10.]
    pred = np.zeros((label_test.shape[0], len(idx_list)), dtype=np.float32)
    for i, idx in enumerate(idx_list):
        t = time.time()
        l_train = label_train[:, idx].astype(np.int)
        l_test = label_test[:, idx].astype(np.int)
        w1 = l_train.size / l_train.sum() - 1
        # w1 = 1.
        # if param_C_by_CV:
        #     c, _ = liblinear.train(l_train, feat_train, '-s 0 -B 1. -C -w1 %f -q' % w1)
        #     c = max(0.1, c)
        # else:
        #     c = 512.
        best_acc = -1.
        for c in c_list:
            svm_model = liblinear.train(l_train, feat_train,
                                        '-s 0 -B 1. -c %f -w1 %f -q' % (c, w1))
            svm_out = liblinear.predict(l_test, feat_test, svm_model,
                                        '-b 1 -q')
            acc = svm_out[1][0]
            if acc > best_acc:
                best_acc = acc
                best_c = c
                k = svm_model.get_labels().index(1)
                prob = np.array(svm_out[2])[:, k]

        pred[:, i] = prob
        print(
            'worker [%d]: "%s(%d)" [%d/%d], acc: %f, c: %f, time cost: %.2f sec'
            % (worker_idx, attr_entry[idx]['entry'], idx, i, len(idx_list),
               best_acc, best_c, time.time() - t))

    io.save_data(pred, os.path.join(cache_dir, '%d.pkl' % worker_idx))
Example #48
    def Train(self):
    
        # Check classifier type
        if(self.classifierType == "SVM"):
            
            if(self.classifierType == "SVM" and self.packageType == "liblinear"):
                from liblinearutil import train
                self.cParam = 4  # Best cross validation accuracy
                self.nFoldsParam = 10
                self.classifierModel = train(self.trainTargets, self.trainFeatures, '-c ' + str(self.cParam))
                train(self.trainTargets, self.trainFeatures, '-c ' + str(self.cParam) + ' -v ' + str(self.nFoldsParam))
            if(self.packageType == "libsvm"):
                from svmutil import svm_train
                self.cParam = 32  # Best cross validation accuracy
                self.nFoldsParam = 10
                self.classifierModel = svm_train(self.trainTargets, self.trainFeatures, '-c ' + str(self.cParam))
                svm_train(self.trainTargets, self.trainFeatures, '-c ' + str(self.cParam) + ' -v ' + str(self.nFoldsParam))
        elif(self.classifierType == "DecisionTree"):
            if(self.packageType == "nltk"):
                import nltk 
                train_set = []
                i = 0
                weights = []
                for fet in self.trainFeatures:
                    train_set.append((self.trainFeatures[i], self.trainTargets[i]))
                    weights.append(i * 0.5)
                    i += 1
                self.classifierModel = nltk.DecisionTreeClassifier.train(train_set,entropy_cutoff=.01,depth_cutoff=300,binary=True,verbose=True)
                '''
                self.classifierModel = AdaBoostClassifier(DecisionTreeClassifier(criterion="entropy", splitter="best", max_depth=1000),
                         algorithm="SAMME",
                         n_estimators=200)
                '''
                '''
                self.classifierModel = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1000),
                         algorithm="SAMME",
                         n_estimators=200)
                '''

                #self.classifierModel.fit(train_set)
                #sorted(self.classifierModel.labels())
                #print(self.classifierModel)
            elif(self.packageType == "sklearn"):
                import sklearn.tree
                self.classifierModel = sklearn.tree.DecisionTreeClassifier(criterion="entropy", splitter="best", max_depth=1000)
                
                # Convert into array not dictionary
                trainFeatures = []
                for feature in self.trainFeatures:
                    trainFeatures.append(list(feature.values()))
                    
                self.classifierModel.fit(trainFeatures, self.trainTargets)
        elif(self.classifierType == "AdaBoost"):                
                if(self.packageType == "sklearn"):
                    import sklearn.ensemble
                    if(self.baseClassifierType == "DecisionTree"):
                        import sklearn.tree
                        self.classifierModel =  sklearn.ensemble.AdaBoostClassifier(
                                                                                    sklearn.tree.DecisionTreeClassifier(criterion="gini", splitter="best", max_depth=1000),
                                                                                    algorithm="SAMME",
                                                                                    n_estimators=200)
                        # Convert into array not dictionary
                        trainFeatures = []
                        for feature in self.trainFeatures:
                            trainFeatures.append(list(feature.values()))
                            
                        self.classifierModel.fit(trainFeatures, self.trainTargets)
                    else:
                        print("Only DecisionTree is supported as base classifier")
                else:
                    print("Only sklearn is supported for AdaBoost")
        else:
            print("Not supported classifier type")
Example #49
def train_model():
    """训练模型
    """
    y, x = svm_read_problem(TRAIN_INPUT_FILE)
    m = train(y, x, "-c 4")
    save_model(SVM_MODEL_FILE, m)
Example #50
def svm_test_single_attr():
    # config
    tar_attr_idx = 1
    train_on_val_set = True
    reduced_dim = 512
    whiten = True
    num_attr = 1000

    opt = TestAttributeOptions().parse()

    # extract feature
    feat_data = extract_feature(opt)
    feat_train = feat_data['feat_train']
    feat_test = feat_data['feat_test']
    print('extract feature done!')

    # load attribute label
    attr_label = io.load_data('datasets/DeepFashion/Fashion_design/' +
                              opt.fn_label)
    attr_entry = io.load_json('datasets/DeepFashion/Fashion_design/' +
                              opt.fn_entry)
    label_train = np.array(
        [attr_label[s_id] for s_id in feat_data['id_list_train']])
    label_test = np.array(
        [attr_label[s_id] for s_id in feat_data['id_list_test']])
    label_train = label_train[:, 0:num_attr]
    label_test = label_test[:, 0:num_attr]

    # label_train = np.random.choice([0,1], size = (feat_train.shape[0], num_attr))
    # label_test = np.random.choice([0,1], size = (feat_test.shape[0], num_attr))

    # get validation feature and label
    id_list_val = io.load_json(
        'datasets/DeepFashion/Fashion_design/Split/ca_split.json')['val']
    id2idx = {s_id: idx for idx, s_id in enumerate(feat_data['id_list_train'])}
    idx_list_val = [id2idx[s_id] for s_id in id_list_val]
    feat_val = feat_train[idx_list_val, :]
    label_val = label_train[idx_list_val, :]
    if train_on_val_set:
        feat_train = feat_val
        label_train = label_val

    print('PCA reduction and whitening...')
    t = time.time()
    pca = PCA(n_components=reduced_dim, whiten=whiten)
    pca.fit(feat_train)
    feat_train = pca.transform(feat_train)
    feat_test = pca.transform(feat_test)
    print('PCA done! (%f sec)' % (time.time() - t))

    t = time.time()
    print(
        'selected attribute: %s(%d)' %
        (attr_entry[tar_attr_idx]['entry'], attr_entry[tar_attr_idx]['type']))
    label_train = label_train[:, tar_attr_idx].astype(np.int)
    label_test = label_test[:, tar_attr_idx].astype(np.int)

    # w1 = label_train.size / label_train.sum() - 1
    w1 = 1.
    print('w1: %f' % w1)

    # best_c , _= liblinear.train(label_train, feat_train, '-s 0 -B 1. -C -w1 %f -q' % w1)
    for c in [0.1, 1., 10.]:
        svm_model = liblinear.train(label_train, feat_train,
                                    '-s 0 -B 1. -c %f -w1 %f -q' % (c, w1))
        svm_out = liblinear.predict(label_test, feat_test, svm_model,
                                    '-b 1 -q')
        print('c = %f, acc = %f' % (c, svm_out[1][0]))
        k = svm_model.get_labels().index(1)
        prob = np.array(svm_out[2])[:, k]

    print('SVM training time: %f sec' % (time.time() - t))

    crit_ap = MeanAP()
    crit_ap.add(prob.reshape(-1, 1), label_test.reshape(-1, 1))
    ap, _ = crit_ap.compute_mean_ap()

    print('AP: %f' % ap)
Example #51
    def train(self):
        if os.path.isfile("svm.model") and self.useModel:
            self.model = llu.load_model("svm.model")
        else:
            self.model = llu.train(self.ys, self.xs, self.train_param)
            llu.save_model("svm.model", self.model)
Example #52
def train(word_dict):
    get_feature(word_dict, "data/train.dat", "data/train.format")
    get_feature(word_dict, "data/test.dat", "data/test.format")
    train_y, train_x = linear.svm_read_problem("data/train.format")
    model = linear.train(train_y, train_x) 
    linear.save_model("model.dat", model)
Example #53
def train(x, y, c, params='-s 2 -B 1 -q'):
    return liblin.train(y.tolist(), x.tolist(), '-c {} '.format(c) + params)
Example #54
def classify(ds_cur=None):
	from os import chdir, system
	chdir('./liblinear-2.1/python/')
	from liblinearutil import problem, parameter, train, predict
	chdir('../../')
	from pdb import set_trace
	from tqdm import tqdm
	from pymongo import MongoClient
	from json import dumps
	from bson.objectid import ObjectId

	set_trace()

	dont_include = {'_id' : 0}
	print 'List of variables:\n'
	for key in variable_lookup:
		print key[1]
	ch1 = raw_input('Input "s" to select custom fields (default selection - all fields):')
	if ch1 == 's':
		print 'Please input 0 for fields you would like to exclude, any other input would include it.'
		for key in variable_lookup:
			if key[0] == 'class':
				continue
			ch2 = raw_input(key[1] + ':')
			if ch2 == '0':
				dont_include[key[0]] = 0

	if ds_cur is None:
		conn = MongoClient('mongodb://localhost:27017')
		dataset = conn['rmpdb']['dataset_profs_ten_over']
		ds_cur = dataset.find(filter = {}, projection = dont_include)
		dataset2 = conn['rmpdb']['dataset_profs_five_over_less_ten']
		ds_cur2 = dataset2.find(filter = {}, projection = dont_include)

	X = [] # Variables
	Y = [] # Classes
	ids = [] # Keep track of professor IDs
	X2 = [] # Variables
	Y2 = [] # Classes
	ids2 = [] # Keep track of professor IDs
	

	print 'Building training set according to selection..'
	for row in tqdm(ds_cur):
		x_dict = dict()
		for key in row:
			if key == 'class':
				Y.append(int(row[key]))
			elif key == 'prof_id':
				ids.append(row[key])
			elif isNan(row[key]):
				continue
			else:
				x_dict[int(key)] = float(row[key])
		X.append(x_dict)

	for row in tqdm(ds_cur2):
		x_dict2 = dict()
		for key in row:
			if key == 'class':
				Y2.append(int(row[key]))
			elif key == 'prof_id':
				ids2.append(row[key])
			elif isNan(row[key]):
				continue
			else:
				x_dict2[int(key)] = float(row[key])
		X2.append(x_dict2)

	ch = raw_input('Include top words for males and females as features? (y/n) [n]: ')
	if ch == 'y':
		from glob import glob
		from json import loads

		vec_files = glob('../logs/*.vec')
		if not len(vec_files) == 0:
			print 'Word vector files found in ../logs: \n'
			print vec_files
			fch = raw_input('Enter name of file without extension. [../logs/trial0.vec] Enter 0 to skip. ../logs/')
			if fch == '0':
				male_vector, female_vector = build_vector()
			else:
				try:
					f = open('../logs/' + fch + '.vec', 'r')
					male_vector, female_vector = loads(f.read())
				except:
					f = open('../logs/trial0.vec', 'r')
					male_vector, female_vector = loads(f.read())
		else:
			male_vector, female_vector = build_vector()
		
		print 'Male vectors as (word, count)'
		print male_vector
		print "============================================="
		print 'Female vectors as (word, count)'
		print female_vector
		print "============================================="
		print 'Calculating word features for all professors in dataset. This shall take some time.'
		print 'Depending on your cutoff, this can take from 4 - 6 hours. Probably a good idea to get some other stuff done..'

		male_words = [tup[0] for tup in male_vector]
		female_words = [tup[0] for tup in female_vector]

		union_words = list(set(male_words).union(set(female_words)))
		final_words = list()
		print 'Select words you want to remove by entering "x".'
		for word in union_words:
			wch = raw_input(word + ':')
			if wch == 'x':
				continue
			else:
				final_words.append(word)



		from string import punctuation

		exclude = set(punctuation)
		rmpdb = MongoClient('mongodb://localhost:27017')['rmpdb']
		for i in tqdm(range(len(ids))):
			prof_id = ids[i]

			# male_dict = dict()
			# female_dict = dict()

			# for tup in male_vector:
			# 	male_dict[tup[0]] = 0
			# for tup in female_vector:
			# 	female_dict[tup[0]] = 0

			vec_dict = dict()
			for word in final_words:
				vec_dict[word] = 0

			prof_comments = rmpdb['profs'].find_one({'_id' : ObjectId(prof_id)}, {'_id' : 0, 'all comments.rComments' : 1})
			for comment in prof_comments['all comments']:
				text = comment['rComments']
				no_punc_text = ''.join(ch for ch in text if ch not in exclude)
				toks = no_punc_text.split()

				for tok in toks:
					# if tok.lower() in male_dict:
					# 	male_dict[tok.lower()] += 1
					# if tok.lower() in female_dict:
					# 	female_dict[tok.lower()] += 1
					if tok.lower() in vec_dict:
						vec_dict[tok.lower()] += 1

			feature_counter = 53 #starts right after variable_lookup['53']
			# for j in range(len(male_vector)):
			# 	feature_counter += 1
			# 	tup = male_vector[j]
			# 	if not male_dict[tup[0]] == 0:
			# 		X[i][feature_counter] = male_dict[tup[0]]
			# for j in range(len(female_vector)):
			# 	feature_counter += 1
			# 	tup = female_vector[j]
			# 	if not female_dict[tup[0]] == 0:
			# 		X[i][feature_counter] = female_dict[tup[0]]		
			for j in range(len(final_words)):
				feature_counter += 1
				word = final_words[j]
				if not vec_dict[word] == 0:
					X[i][feature_counter] = vec_dict[word]
				# if feature_counter == 97:
				# 	break

		print "Building test set.."
		for i in tqdm(range(len(ids2))):
			prof_id = ids2[i]

			# male_dict = dict()
			# female_dict = dict()

			# for tup in male_vector:
			# 	male_dict[tup[0]] = 0
			# for tup in female_vector:
			# 	female_dict[tup[0]] = 0

			vec_dict = dict()
			for word in final_words:
				vec_dict[word] = 0

			prof_comments = rmpdb['profs'].find_one({'_id' : ObjectId(prof_id)}, {'_id' : 0, 'all comments.rComments' : 1})
			for comment in prof_comments['all comments']:
				text = comment['rComments']
				no_punc_text = ''.join(ch for ch in text if ch not in exclude)
				toks = no_punc_text.split()

				for tok in toks:
					# if tok.lower() in male_dict:
					# 	male_dict[tok.lower()] += 1
					# if tok.lower() in female_dict:
					# 	female_dict[tok.lower()] += 1
					if tok.lower() in vec_dict:
						vec_dict[tok.lower()] += 1

			feature_counter = 53 #starts right after variable_lookup['53']
			# for j in range(len(male_vector)):
			# 	feature_counter += 1
			# 	tup = male_vector[j]
			# 	if not male_dict[tup[0]] == 0:
			# 		X[i][feature_counter] = male_dict[tup[0]]
			# for j in range(len(female_vector)):
			# 	feature_counter += 1
			# 	tup = female_vector[j]
			# 	if not female_dict[tup[0]] == 0:
			# 		X[i][feature_counter] = female_dict[tup[0]]		
			for j in range(len(final_words)):
				feature_counter += 1
				word = final_words[j]
				if not vec_dict[word] == 0:
					X2[i][feature_counter] = vec_dict[word]

		print 'Words used:'
		print final_words

	else:
		pass

	print 'Writing temp files for AUC calculation..'
	build_svm_file(X, Y)
	print 'Temp file written..'
	print 'Features used:'
	fstr = list()
	for key in variable_lookup:
		if key[0] in dont_include or key[0] == 'class':
			continue
		else:
			fstr.append(key[1])
	print dumps(fstr)
	print '======================================\n'
	prob = problem(Y, X)
	param = parameter('-s 6 -v 10')
	m = train(prob, param)
	print 'Evaluating..\n'
	system('liblinear-2.1/train -s 6 -v 10 liblinear-2.1/temp_ds')
	model = train(prob, parameter('-s 6 -q'))
	#system('rm liblinear-2.1/temp_ds')

	print 'Testing model on test set..'
	p_Y2, p_acc, p_vals = predict(Y2, X2, model)

	contingency_mat = [[0, 0], [0, 0]]
	for i in range(len(Y2)):
		if (Y2[i] == 0) and (p_Y2[i] == 0):
			contingency_mat[0][0] += 1
		elif (Y2[i] == 0) and (p_Y2[i] == 1):
			contingency_mat[0][1] += 1
		elif (Y2[i] == 1) and (p_Y2[i] == 0):
			contingency_mat[1][0] += 1
		else:
			contingency_mat[1][1] += 1


	return (model, p_acc, contingency_mat)
Example #55
def train(y, x, params):
    """ Trains on y,x and returns the model """
    print "Training on data of size: ", len(y)
    m = llu.train(y,x, params)
    return m
Example #56
#coding: utf-8
import liblinearutil
import outputLIBSVMformat

train_label, train_data = liblinearutil.svm_read_problem("./train_libsvmFormat.txt")
# the kernel function is linear
model = liblinearutil.train(train_label, train_data, "-s 3")

test_label, test_data = liblinearutil.svm_read_problem("./test_libsvmFormat.txt")
p_label, p_acc, p_val = liblinearutil.predict(test_label, test_data, model)

Example #57
def train(instance_file, model_file, param):
    y, x = ll.svm_read_problem(instance_file)
    prob = ll.problem(y, x)
    m = ll.train(prob, param)
    ll.save_model(model_file, m)
    print 'done training', model_file
Example #58
    def train(self):
        sys.stderr.write('creating training problem...')
        prob = problem(self.labels, self.contexts)
        sys.stderr.write('done\ntraining with option(s) "' + self.parameters + '"...')
        self.model = train(prob, parameter(self.parameters))
        sys.stderr.write('done\n')
Example #59
def simpleLibLinear(X_train, Y_train):
	prob = ll.problem(Y_train,X_train)
	param = ll.parameter('-c '+str(c))
	m = ll.train(prob, param)
	return m