def classification_linear_svm(tweets,
                              train_index,
                              test_index,
                              labels_train,
                              random_state=None):
    """Predict labels for the test tweets with a TF-IDF + linear SVM model.

    The TF-IDF vocabulary is fitted on the training tweets only and then
    reused to vectorize the test tweets.

    :param tweets: indexable collection of tweets
    :param train_index: indices into ``tweets`` used for training
    :param test_index: indices into ``tweets`` whose labels are predicted
    :param labels_train: labels passed to the classifier with the training tweets
    :param random_state: forwarded to ``LinearSVC``
    :return: the labels predicted for the test tweets
    """
    # Select the raw texts for both splits before any fitting happens.
    train_texts = [tweets[i] for i in train_index]
    test_texts = [tweets[i] for i in test_index]

    # Representation: tokenization is delegated to ``tokenize``; case is kept.
    vectorizer = TfidfVectorizer(tokenizer=tokenize,
                                 lowercase=False,
                                 analyzer='word')
    train_features = vectorizer.fit_transform(train_texts)
    test_features = vectorizer.transform(test_texts)

    svm = LinearSVC(multi_class="ovr", random_state=random_state)
    print("Start SVM training")
    svm.fit(train_features, labels_train)
    print("Finish SVM training")

    return svm.predict(test_features)
class Expander_LDA_multiclass(Expander_LDA_cossim):
    """
    Expander that feeds dense LDA vectors of labelled articles to a
    multi-class LinearSVC in order to decide which label the LDA vector
    of a test text belongs to.
    """

    def __init__(self, ldaModelAll, expander_type=AcronymExpanderEnum.LDA_multiclass):
        """Initialise the cosine-similarity base and create an untrained LinearSVC."""
        Expander_LDA_cossim.__init__(self, ldaModelAll, expander_type)
        self.classifier = LinearSVC()

    def transform(self, X):
        """Run the base-class transform and densify every resulting vector."""
        sparse_vectors = Expander_LDA_cossim.transform(self, X)
        return list(map(self._getDenseVector, sparse_vectors))

    def _getDenseVector(self, sparse_vec):
        """Expand ``sparse_vec`` to a full vector with one slot per LDA topic."""
        topic_count = self.ldaModel.num_topics
        return sparse2full(sparse_vec, topic_count)

    def fit(self, X_train, y_train):
        """Train the wrapped classifier; returns nothing."""
        self.classifier.fit(X_train, y_train)

    def predict(self, X_test, acronym):
        """Return ``(labels, confidences)`` for ``X_test``.

        ``acronym`` is accepted but not used by this implementation.
        """
        svm = self.classifier
        predicted = svm.predict(X_test)
        scores = svm.decision_function(X_test)
        return predicted, self._getConfidencesFromDecisionFunction(predicted, scores)
Esempio n. 3
0
    def fit(self, data, args):
        """Train a fresh LinearSVC on ``data`` and return the elapsed time.

        ``args`` is accepted but not used here. The value returned is the
        ``interval`` attribute of the ``Timer`` wrapped around the training call.
        """
        self.model = LinearSVC()

        # Only the training call itself is timed, not the model construction.
        with Timer() as stopwatch:
            self.model.fit(data.X_train, data.y_train)
        elapsed = stopwatch.interval

        return elapsed
def applyModelandfit(tweet_list, tweet_label_list,all_tweet,model_name,filename):
    X, y, tweet_id_list = buildMatrixTrainAndTest(tweet_list, tweet_label_list, all_tweet)
    X_train = X[:len(y),:]
    y_train = y
    X_test =  X[len(y):,:]
    tweet_id_list_test = tweet_id_list[len(y):]
    print "number of training tweets are ", X_train.shape, len(y_train)
    if model_name == 'SVM':
        clf = LinearSVC(penalty="l1", dual=False, tol=1e-7)
        clf.fit(X_train, y_train)
    elif model_name == 'NaiveBayes':
        clf = GaussianNB()
        clf.fit(X_train.todense(), y_train)
    elif model_name == 'LogisticRegression':
        clf = LogisticRegression(C=1.0, penalty='l1', tol=0.01)
        clf.fit(X_train.toarray(), y_train)
    else:
        raise Exception("The model name is incorrect!!!")

    y_pred = clf.predict(X_test)
    print 'length of predict data is ', len(y_pred)
    with open(RESULT_FOLDER+'/'+filename+'_c.csv','wb') as fp:
        writer = csv.writer(fp, delimiter =",",quoting=csv.QUOTE_MINIMAL)
        for i, tweetid in enumerate(tweet_id_list_test):
            writer.writerow([tweetid, all_tweet[tweetid], y_pred[i]])
Esempio n. 5
0
class LinearSVCImpl():
    """Thin wrapper that stores hyperparameters and builds an ``SKLModel`` on ``fit``."""

    def __init__(self, penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight='balanced', verbose=0, random_state=None, max_iter=1000):
        """Record every constructor argument under its own name in ``_hyperparams``."""
        self._hyperparams = dict(
            penalty=penalty,
            loss=loss,
            dual=dual,
            tol=tol,
            C=C,
            multi_class=multi_class,
            fit_intercept=fit_intercept,
            intercept_scaling=intercept_scaling,
            class_weight=class_weight,
            verbose=verbose,
            random_state=random_state,
            max_iter=max_iter,
        )

    def fit(self, X, y=None):
        """Create a new ``SKLModel`` from the stored hyperparameters and train it.

        ``y`` is only forwarded when it is not ``None``.  Returns ``self``.
        """
        model = SKLModel(**self._hyperparams)
        self._sklearn_model = model
        if y is None:
            model.fit(X)
        else:
            model.fit(X, y)
        return self

    def predict(self, X):
        """Delegate prediction to the model created by the last ``fit`` call."""
        fitted = self._sklearn_model
        return fitted.predict(X)
Esempio n. 6
0
def get_svm_score(w, b_h, dataset):
    """
    Score an RBM's hidden representation with a linear SVM: project the inputs
    through ``sigm(input.dot(w) + b_h)``, train on the training set and
    evaluate on the test set.
    :param w: Weights
    :param b_h: Hidden biases
    :param dataset: A Dataset object exposing ``training_set`` and ``test_set``
    :return: The result of ``percent_correct`` on the test-set predictions
    """
    def project(inputs):
        # Hidden-layer activations for a block of inputs.
        return sigm(inputs.dot(w)+b_h)

    train_features = project(dataset.training_set.input)
    svm = LinearSVC()
    svm.fit(train_features, dataset.training_set.target)
    test_features = project(dataset.test_set.input)
    guesses = svm.predict(test_features)
    return percent_correct(dataset.test_set.target, guesses)
Esempio n. 7
0
def get_svm_score(w, b_h, dataset):
    """
    Score an RBM's hidden representation with a linear SVM: project the inputs
    through ``sigm(input.dot(w) + b_h)``, train on the training set and
    evaluate on the test set.
    :param w: Weights
    :param b_h: Hidden biases
    :param dataset: A Dataset object exposing ``training_set`` and ``test_set``
    :return: The result of ``percent_correct`` on the test-set predictions
    """
    def project(inputs):
        # Hidden-layer activations for a block of inputs.
        return sigm(inputs.dot(w)+b_h)

    train_features = project(dataset.training_set.input)
    svm = LinearSVC()
    svm.fit(train_features, dataset.training_set.target)
    test_features = project(dataset.test_set.input)
    guesses = svm.predict(test_features)
    return percent_correct(dataset.test_set.target, guesses)
Esempio n. 8
0
class CreateLinearSVC(CreateModel):
    """Model wrapper that trains a default LinearSVC and reports timings."""

    def fit(self, data, args):
        """Train a fresh LinearSVC on ``data``; return the timer's ``interval``.

        ``args`` is accepted but not used here.
        """
        self.model = LinearSVC()

        with Timer() as stopwatch:
            self.model.fit(data.X_train, data.y_train)
        elapsed = stopwatch.interval

        return elapsed

    def predict(self, data):
        """Store ``self.test(data)`` in ``self.predictions``; return the timer's ``interval``.

        Asserts that a model has been set before predicting.
        """
        assert self.model is not None

        with Timer() as stopwatch:
            self.predictions = self.test(data)
        elapsed = stopwatch.interval

        return elapsed
Esempio n. 9
0
 def fit(self, X, y=None):
     """Build a new ``SKLModel`` from the stored hyperparameters and train it.

     ``y`` is only forwarded when it is not ``None``.  Returns ``self``.
     """
     model = SKLModel(**self._hyperparams)
     self._sklearn_model = model
     if y is None:
         model.fit(X)
     else:
         model.fit(X, y)
     return self
Esempio n. 10
0
    def test_model_within_optimizer(self):
        """A fitted GridSearchCV over a reduce-dim/LinearSVC pipeline must be accepted by Porter.

        The test fails only when constructing ``Porter`` raises ``ValueError``.
        """
        component_counts = [2, 4, 8]
        regularisation_values = [1, 10, 100, 1000]
        pipeline = Pipeline([('reduce_dim', PCA()), ('classify', LinearSVC())])

        # Two sub-grids: decomposition-based reducers and univariate selection.
        decomposition_grid = {
            'reduce_dim': [PCA(iterated_power=7),
                           NMF()],
            'reduce_dim__n_components': component_counts,
            'classify__C': regularisation_values
        }
        selection_grid = {
            'reduce_dim': [SelectKBest(chi2)],
            'reduce_dim__k': component_counts,
            'classify__C': regularisation_values
        }
        search = GridSearchCV(pipeline, cv=3, n_jobs=1,
                              param_grid=[decomposition_grid, selection_grid])

        dataset = load_digits()
        search.fit(dataset.data, dataset.target)

        try:
            Porter(search, language='java')
        except ValueError:
            self.assertTrue(False)
        else:
            self.assertTrue(True)
Esempio n. 11
0
 def train_with_svm(self):
     """Fit an RBM -> LinearSVC pipeline on ``self.X``/``self.Y``, keep it and dump it to ``rbm.pkl``."""
     # More components tend to give better prediction performance, but larger
     # fitting time
     feature_extractor = BernoulliRBM(n_components=100,
                                      learning_rate=0.05,
                                      n_iter=30,
                                      random_state=0,
                                      verbose=False)
     svm_step = LinearSVC(C=10.0, class_weight='balanced', max_iter=100)
     pipeline = Pipeline(steps=[('rbm', feature_extractor), ('svm', svm_step)])

     pipeline.fit(self.X, self.Y)
     self.classifier = pipeline
     joblib.dump(pipeline, "rbm.pkl")
def runandsaveModel(tweet_list, tweet_label_list,model_name):
    X, y, vectorizer= buildMatrixTrain(tweet_list, tweet_label_list)

    print "number of training tweets are ", X.shape, len(y)

    #trainning the model
    if model_name == 'SVM':
        clf = LinearSVC(penalty="l1", dual=False, tol=1e-7)
        clf.fit(X, y)
    elif model_name == 'NaiveBayes':
        clf = GaussianNB()
        clf.fit(X.todense(), y)
    elif model_name == 'LogisticRegression':
        clf = LogisticRegression(C=1.0, penalty='l1', tol=0.01)
        clf.fit(X.toarray(), y)
    else:
        raise Exception("The model name is incorrect!!!")

    #save the model
    model = Model(model_name, clf, vectorizer)
    with open(RESULT_FOLDER+"/"+model_name+"_model.m","wb") as pf:
        pickle.dump(model,pf)
    print model_name, "is saved at", RESULT_FOLDER+"/"+model_name+"_model.m"
Esempio n. 13
0
 def __init__(self, penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight='balanced', verbose=0, random_state=None, max_iter=1000):
     """Store the hyperparameters and eagerly build the wrapped ``SKLModel`` from them."""
     hyperparams = dict(
         penalty=penalty,
         loss=loss,
         dual=dual,
         tol=tol,
         C=C,
         multi_class=multi_class,
         fit_intercept=fit_intercept,
         intercept_scaling=intercept_scaling,
         class_weight=class_weight,
         verbose=verbose,
         random_state=random_state,
         max_iter=max_iter,
     )
     self._hyperparams = hyperparams
     self._wrapped_model = SKLModel(**hyperparams)
Esempio n. 14
0
def single_model_tuning(modelname, fold_nr):
    """
    The thread function that can be used for finding the best model hyperparameters, for a single, non-ensemble model,
    for a fixed preprocessor, this method requires the data to be split in folds first.

    parameters:
    :param str modelname: The name of the model to test.
    :param int fold_nr: The number of the fold.
    :return list<dict> results: A list of dictionaries containing the parameter setting and the mae.
    """
    # Init a best mae so far (for printing purposes)
    best = 10
    try:
        log('Fold: ' + str(fold_nr) + ': Loaded the cached preprocessed data.')
        X_train, X_val, y_train, y_val, rev_val = load_fold(fold_nr)
    except IOError:
        log('Fold: ' + str(fold_nr) + 'run "python kfold_prepr.py" first')
    results = []

    # Tune a model based on the command line argument
    if modelname == 'log':
        par = ParameterGrid({
            'logistic__C': np.logspace(-5.0, 5.0, num=11),
            'logistic__tol': np.logspace(-5.0, 5.0, num=11)
        })
        for a in list(par):
            logistic = LogisticRegression(solver='sag',
                                          n_jobs=NUM_THEADS,
                                          C=a['logistic__C'],
                                          tol=a['logistic__tol'])
            logistic.fit(X_train, y_train)
            predictions_val = logistic.predict(X_val)
            mae = mean_absolute_error(predictions_val, y_val)
            results.append({
                'logistic__C': a['logistic__C'],
                'logistic__tol': a['logistic__tol'],
                'mae': mae
            })
    elif modelname == 'ridge':
        par = ParameterGrid({'ridge__alpha': np.logspace(-5.0, 5.0, num=11)})
        for a in list(par):
            ridge = OrdinalRidge(a['ridge__alpha'])
            ridge.fit(X_train, y_train)
            predictions_val = ridge.predict(X_val)
            mae = mean_absolute_error(predictions_val, y_val)
            results.append({'ridge__alpha': a['ridge__alpha'], 'mae': mae})
    elif modelname == 'svc':
        par = ParameterGrid({
            'svc__C': np.logspace(-5.0, 5.0, num=11),
            'svc__tol': np.logspace(-5.0, 5.0, num=11)
        })
        for a in list(par):
            svc = LinearSVC(C=a['svc__C'], tol=a['svc__tol'])
            svc.fit(X_train, y_train)
            predictions_val = svc.predict(X_val)
            mae = mean_absolute_error(predictions_val, y_val)
            results.append({
                'svc__C': a['svc__C'],
                'svc__tol': a['svc__tol'],
                'mae': mae
            })
    elif modelname == 'lad':
        par = ParameterGrid({
            'lad__C': np.logspace(-5.0, 5.0, num=11),
            'lad__tol': np.logspace(-5.0, 5.0, num=11)
        })
        for a in list(par):
            svr_ = svm.LinearSVR(loss='squared_epsilon_insensitive')
            svr = LAD(svr_)  # use mord for rounding and clipping
            svr.fit(X_train, y_train)
            predictions_val = svr.predict(X_val)
            mae = mean_absolute_error(predictions_val, y_val)
            results.append({
                'lad__C': a['lad__C'],
                'lad__tol': a['lad__tol'],
                'mae': mae
            })
    elif modelname == 'final':
        # This is the tuning of the final ensemble, with fixing 0 rating predictions
        par = ParameterGrid({
            'logistic_lbfgs__C':
            np.logspace(-5.0, 5.0, num=11),
            'logistic_lbfgs__tol':
            np.logspace(-5.0, 5.0, num=11),
            'logistic_lbfgs_multinom__C':
            np.logspace(-5.0, 5.0, num=11),
            'logistic_lbfgs_multinom__tol':
            np.logspace(-5.0, 5.0, num=11),
            'logistic_sag_balanced__C':
            np.logspace(-5.0, 5.0, num=11),
            'logistic_sag_balanced__tol':
            np.logspace(-5.0, 5.0, num=11)
        })

        ensemble = VotingClassifier(estimators=[
            ('logistic_lbfgs',
             LogisticRegression(solver='lbfgs',
                                n_jobs=NUM_THEADS,
                                C=5,
                                tol=0.01)),
            ('logistic_lbfgs_multinom',
             LogisticRegression(solver='lbfgs',
                                n_jobs=NUM_THEADS,
                                C=5,
                                tol=0.01,
                                multi_class='multinomial')),
            ('logistic_sag_balanced',
             LogisticRegression(solver='sag',
                                n_jobs=NUM_THEADS,
                                C=5,
                                tol=0.01,
                                class_weight='balanced')),
        ],
                                    voting='soft',
                                    weights=[1, 1, 1])

        for a in list(par):
            ensemble.set_params(**a)
            ensemble.fit(X_train, y_train)
            predictions_val = ensemble.predict(X_val)
            predictions_val = fix_zero_predictions(predictions_val, rev_val)
            mae = mean_absolute_error(predictions_val, y_val)
            temp = a
            temp['mae'] = mae
            if mae < best:
                print temp
                best = mae
            results.append(temp)
    elif modelname == 'lbfgs_bal':
        clf = LogisticRegression(solver='lbfgs',
                                 n_jobs=NUM_THEADS,
                                 class_weight='balanced')
        par = ParameterGrid({
            'C': np.logspace(-1.0, 1.0, num=5),
            'tol': np.logspace(-3.0, -1.0, num=3)
        })
        for a in list(par):
            clf.set_params(**a)
            clf.fit(X_train, y_train)
            predictions_val = clf.predict(X_val)
            predictions_val = fix_zero_predictions(predictions_val, rev_val)
            mae = mean_absolute_error(predictions_val, y_val)
            temp = a
            temp['mae'] = mae
            if mae < best:
                print temp
                best = mae
            results.append(temp)
    elif modelname == 'lbfgs_multi':
        clf = LogisticRegression(solver='lbfgs',
                                 n_jobs=NUM_THEADS,
                                 multi_class='multinomial')
        par = ParameterGrid({
            'C': np.logspace(-5.0, 5.0, num=11),
            'tol': np.logspace(-5.0, 5.0, num=11)
        })
        for a in list(par):
            clf.set_params(**a)
            clf.fit(X_train, y_train)
            predictions_val = clf.predict(X_val)
            predictions_val = fix_zero_predictions(predictions_val, rev_val)
            mae = mean_absolute_error(predictions_val, y_val)
            temp = a
            temp['mae'] = mae
            if mae < best:
                print temp
                best = mae
            results.append(temp)
    elif modelname == 'sag_bal':
        clf = LogisticRegression(solver='sag',
                                 n_jobs=NUM_THEADS,
                                 class_weight='balanced')
        par = ParameterGrid({
            'C': np.logspace(-5.0, 5.0, num=11),
            'tol': np.logspace(-5.0, 5.0, num=11)
        })
        for a in list(par):
            clf.set_params(**a)
            clf.fit(X_train, y_train)
            predictions_val = clf.predict(X_val)
            predictions_val = fix_zero_predictions(predictions_val, rev_val)
            mae = mean_absolute_error(predictions_val, y_val)
            temp = a
            temp['mae'] = mae
            if mae < best:
                print temp
                best = mae
            results.append(temp)
    elif modelname == 'nb':
        clf = MultinomialNB()
        par = ParameterGrid(
            {'alpha': [0, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 1, 1.5]})
        for a in list(par):
            clf.set_params(**a)
            clf.fit(X_train, y_train)
            predictions_val = clf.predict(X_val)
            predictions_val = fix_zero_predictions(predictions_val, rev_val)
            mae = mean_absolute_error(predictions_val, y_val)
            temp = a
            temp['mae'] = mae
            if mae < best:
                print temp
                best = mae
            results.append(temp)
    else:
        print "model name not defined"
        return None
    return results
Esempio n. 15
0
def main_preprop():
    """
    The main method that can be used for finding the best preprocessor hyperparameters,
    for a fixed model.
    """
    # Initialize the result array
    results = []

    # Set the search range in a parameter grid, all possible combinations will be tested
    params_preprocessor = {
        'a_value': range(10, 15),
        'epsilon': [0.1, 0.01, 0.001],
        'reduction_level': [0.01, 0.02, 0.05, 0.1, 0.15, 0.2, 0.3, 0.5]
    }

    # Create a preprocessor object (tweaked for tuning hyperparameters)
    preprocessor_object = Preprocessor2()

    # Define the models
    logistic = LogisticRegression(solver='sag',
                                  n_jobs=NUM_THEADS,
                                  C=5,
                                  tol=0.01)
    ridge = OrdinalRidge(10)
    svc = LinearSVC(C=0.5)
    svr = svm.LinearSVR(loss='squared_epsilon_insensitive')
    lad = LAD(svr)
    ensemble = VotingClassifier(estimators=[('logistic', logistic),
                                            ('ridge', ridge), ('svc', svc),
                                            ('lad', lad)],
                                voting='hard',
                                weights=[1, 1, 1, 1])

    # load the data
    train_data, test_data = preprocessor_object.load()

    # Convert the data to tf-idf matrices
    X_train_full, X_val_full, y_train, y_val = preprocessor_object.fit_data(
        train_data)

    # Test the preprocessor for all possible hyperparameter combinations
    for x in params_preprocessor['a_value']:
        for y in params_preprocessor['epsilon']:
            sigma_values = preprocessor_object.compute_sigma_values(
                X_train_full, y_train, x, y)
            for z in params_preprocessor['reduction_level']:
                X_train, X_val, X_test = preprocessor_object.remove_features(
                    sigma_values, z, X_train_full, X_val_full, test_data)
                ensemble.fit(X_train, y_train)
                predictions_val = ensemble.predict(X_val)
                mae = mean_absolute_error(predictions_val, y_val)
                print strftime("%H:%M:%S") + ' -> a: ' + str(
                    x) + '; epsilon: ' + str(y) + '; red lvl: ' + str(
                        z) + '; MAE: ' + str(mae)
                results.append({
                    'a_value': x,
                    'epsilon': y,
                    'reduction_level': z,
                    'mae': mae
                })

    print results

    log('That\'s all folks!')
Esempio n. 16
0
def runModel(X, y, S_data, model_name):

    f = open('r_' + "_" + model_name + '.txt', 'w')
    auc_score_all = []
    fold = S_data[1]
    Index_gen = S_data[0]
    label = ['0.0', '1.0']
    # note that python does not copy the generator,
    # so when it's in the end of the for loop the generator for S_data is also extruded!
    print 'Running', model_name
    for exp in range(0, fold):
        print "=" * 80, "\n", "experiment =", exp
        # getting one fold of indices from the index generator
        train_index, test_index = Index_gen.next()
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # fitting the model
        # 1 fit a model
        # 2 prediction
        if model_name == 'SVM':
            # LinearSVC take care of the multi class response by using one vs others method
            clf = LinearSVC(random_state=0).fit(X_train, y_train)
            y_pred = clf.predict(X_test)
        elif model_name == 'NaiveBayes':
            clf = GaussianNB()
            clf.fit(X_train.to_dense(), y_train)
            y_pred = clf.predict(X_test.to_dense())
        elif model_name == 'LogisticRegression':
            clf = LogisticRegression(C=1.0, penalty='l1', tol=0.01)
            clf.fit(X_train.as_matrix(), y_train)
            y_pred = clf.predict(X_test.as_matrix())
        else:
            raise Exception("The model name is incorrect!!!")
        ### 2.4 eval
        auc_score = roc_auc_score(y_test, y_pred, average=None)
        auc_score_all.append(auc_score)

    auc_ave = mean(array(auc_score_all), 0)

    print >> f, model_name, '\n', "=" * 80
    print >> f, 'avg auc = ', auc_ave
Esempio n. 17
0
#path = '/media/robbis/DATA/fmri/monks'
# Root directory of the dataset on this machine.
path = '/home/carlos/mount/megmri03/monks'
# Keep only directory entries whose name contains neither '.' nor '_'.
subjects = os.listdir(path)
subjects = [s for s in subjects if s.find('.') == -1 and s.find('_') == -1]

# Load monk data in the form of n_samples x n_voxels x n_time
# Only the first entry of ``subjects`` is loaded.
# NOTE(review): ``atlas_dict`` is not defined in the visible code - confirm it exists upstream.
ds, _, _ = load_subject_ds(
    path,
    subjects[:1],
    #os.path.join(path, 'subjects.csv'),
    'meditation_permut1.conf',
    'fmri',
    prepro=MonksPreprocessingPipeline(),
    roi_labels=atlas_dict)

# Standardize features, then classify with a linear SVM; the pipeline is
# wrapped in a GeneralizingEstimator scored by accuracy using 20 jobs.
clf = make_pipeline(StandardScaler(), LinearSVC(C=1))
time_gen = GeneralizingEstimator(clf, scoring='accuracy', n_jobs=20)

# Restrict the dataset to samples whose 'group' attribute is 'E'.
ds = SampleSlicer({'group': ['E']}).transform(ds)

scores_dict = {}
# Generalization of time
# NOTE(review): ``path_templates`` is not defined in the visible code - confirm.
for network in os.listdir(path_templates):

    # Drop the last 21 characters of the file name
    # (presumably a fixed-length suffix - TODO confirm).
    network = network[:-21]
    ds_network = FeatureSlicer({network: ['!0']}).transform(ds)

    n_samples, n_voxels = ds_network.shape
    # 135 is hard-coded (presumably the number of time points per sample - confirm).
    data = ds_network.samples.reshape(-1, 135, n_voxels)
    # Move axis 1 to the end: (n, 135, n_voxels) -> (n, n_voxels, 135).
    X = np.rollaxis(data, 1, 3)
    # Labels alternate 0, 1, 0, 1, ... along the first axis.
    y = np.arange(data.shape[0]) % 2
Esempio n. 18
0
 def setUp(self):
     """Run the parent fixture setup, then create the LinearSVC estimator under test."""
     super(LinearSVCJavaTest, self).setUp()
     svc = LinearSVC(C=1., random_state=0)
     self.estimator = svc
 def __init__(self, ldaModelAll, expander_type=AcronymExpanderEnum.LDA_multiclass):
     """Initialise the cosine-similarity base class and attach an untrained LinearSVC."""
     Expander_LDA_cossim.__init__(self, ldaModelAll, expander_type)
     untrained_svm = LinearSVC()
     self.classifier = untrained_svm
Esempio n. 20
0
			'KernelRidge':KernelRidge(),
			'LSHForest':LSHForest(),
			'LabelPropagation':LabelPropagation(),
			'LabelSpreading':LabelSpreading(),
			'Lars':Lars(),
			'LarsCV':LarsCV(),
			'Lasso':Lasso(),
			'LassoCV':LassoCV(),
			'LassoLars':LassoLars(),
			'LassoLarsCV':LassoLarsCV(),
			'LassoLarsIC':LassoLarsIC(),
			'LatentDirichletAllocation':LatentDirichletAllocation(),
			'LedoitWolf':LedoitWolf(),
			'LinearDiscriminantAnalysis':LinearDiscriminantAnalysis(),
			'LinearRegression':LinearRegression(),
			'LinearSVC':LinearSVC(),
			'LinearSVR':LinearSVR(),
			'LocallyLinearEmbedding':LocallyLinearEmbedding(),
			'LogisticRegression':LogisticRegression(),
			'LogisticRegressionCV':LogisticRegressionCV(),
			'MDS':MDS(),
			'MLPClassifier':MLPClassifier(),
			'MLPRegressor':MLPRegressor(),
			'MaxAbsScaler':MaxAbsScaler(),
			'MeanShift':MeanShift(),
			'MinCovDet':MinCovDet(),
			'MinMaxScaler':MinMaxScaler(),
			'MiniBatchDictionaryLearning':MiniBatchDictionaryLearning(),
			'MiniBatchKMeans':MiniBatchKMeans(),
			'MiniBatchSparsePCA':MiniBatchSparsePCA(),
			'MultiTaskElasticNet':MultiTaskElasticNet(),
Esempio n. 21
0
 def setUp(self):
     """Run the parent fixture setup, then port a LinearSVC(C=1, random_state=0) model."""
     super(LinearSVCTest, self).setUp()
     self._port_model(LinearSVC(C=1., random_state=0))
def runModel(X, y, model_name):
    nFolders = 5
    accs = []
    precs = []
    recalls = []
    F1s = []

    n = X.shape[0]
    for exp in range(0, nFolders):
        print '\n\n============================================================================================\nexperiment' , exp
        ### 2.1 split training and testing data
        start = (int)((1-(exp+1) * 1.0/nFolders)*n)
        end = (int)((1-exp * 1.0/nFolders)*n)
        #print n, start, end
        X_train, y_train, X_test, y_test = splitTrainTest(X, y, start, end)
        print 'Running', model_name
        if model_name == 'SVM':
            ### 2.2 build classifier
            clf = LinearSVC(penalty="l1", dual=False, tol=1e-7)
            clf.fit(X_train, y_train)
            ### 2.3 predict
            y_pred = clf.predict(X_test)
        elif model_name == 'NaiveBayes':
            clf = GaussianNB()
            clf.fit(X_train.todense(), y_train)
            y_pred = clf.predict(X_test.todense())
        elif model_name == 'LogisticRegression':
            clf = LogisticRegression(C=1.0, penalty='l1', tol=0.01)
            clf.fit(X_train.toarray(), y_train)
            y_pred = clf.predict(X_test.toarray())
        else:
            raise Exception("The model name is incorrect!!!")
        ### 2.4 eval
        acc, prec, recall, F1 = eval(y_test, y_pred)
        print 'Acc = ', acc;
        print 'Precision =', prec;
        print 'Recall=', recall;
        print 'F1 =',  F1
        accs.append(acc)
        precs.append(prec)
        recalls.append(recall)
        F1s.append(F1)

    print '\n\n\n'
    print 'avg Acc = ', sum(accs)/len(accs)
    print 'avg Precision = ', sum(precs)/len(precs)
    print 'avg Recall = ', sum(recalls)/len(recalls)
    print 'avg F1 = ', sum(F1s)/len(F1s)
    return sum(accs)/len(accs), sum(precs)/len(precs),  sum(recalls)/len(recalls), sum(F1s)/len(F1s)
Esempio n. 23
0
# Experiment configuration constants.
# NOTE(review): how each value is consumed is not visible here; the notes
# below are inferred from the names only - confirm against the using code.
n_classes = 2

FEAT_F33 = "F33"
ROLL_LEN = 10
PANTHEON_SIZE = 10

n_users = 1000
max_runs = None  #10000
percTest = 0.1  # presumably the fraction of data held out for testing - confirm

# Estimators to evaluate; commented-out entries are disabled alternatives.
# Three entries are active: LinearSVC, MLPClassifier, RandomForestClassifier.
predictors = [
    # DummyClassifier(strategy="stratified"),
    # DummyClassifier(strategy="uniform"),
    # BernoulliNB(),
    # SVC(kernel='rbf', max_iter=10000, class_weight="balanced", verbose=1),
    LinearSVC(max_iter=50),
    MLPClassifier(max_iter=50, nesterovs_momentum=True,
                  early_stopping=True),  #, activation="logistic"),
    # LogisticRegression(class_weight='balanced'),
    RandomForestClassifier(class_weight="balanced"),
    # ExtraTreeClassifier(),
    # AdaBoostClassifier(),
    # DecisionTreeClassifier(),
]

predictor_params = [
    # None,
    # None,
    #{'n_iter':50, 'alpha': numpy.logspace(-3, 2) },
    # {'name':'RBFSVC', 'n_iter':50,'C': numpy.logspace(-2, 6), 'gamma': numpy.logspace(-9, 3)},
    {
Esempio n. 24
0
# build the dictionary
#dict = []
#cont = 0
#for t in dataset.letra[:]:
#    for w in t.split():
#        dict.append(w)

#dict = set(dict) # dict now contains every word present in the dataset
#dict_array = list(dict) # must be converted because the label encoder's fit does not accept a set

# split the dataset into one part for fit and another for predict
# (30% of the rows are held out; the split is reproducible via random_state=42)
letras_train, letras_test, label_train, label_test = train_test_split(
    dataset.letra, dataset.label, test_size=0.30, random_state=42)

# use LinearSVC
# NOTE(review): ``loss``, ``penalty`` and ``dual`` are LinearSVC parameters;
# sklearn.svm.SVC does not accept them. This only works if ``SVC`` is an
# alias for LinearSVC in the (not visible) imports - confirm.
clf = SVC(C=1, loss='squared_hinge', penalty='l1', dual=False)

#print(dict_array)

# routines to feed the LabelEncoder
# NOTE(review): the encoder is re-fitted on the test split (fit_transform is
# called twice), so the integer codes of the two splits come from different
# fits and are not comparable - confirm this is intended.
label_encoder = LabelEncoder()
int_encoded_fit = label_encoder.fit_transform(letras_train)
int_encoded_pred = label_encoder.fit_transform(letras_test)

#dict_encode = label_encoder.fit(dict_array)

#test_encode = []
#cont = 0
#while cont < 10:
#    test_encode.append(dataset.letra[cont].split())
#    cont += 1
from sklearn.datasets import make_blobs
import mglearn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm.classes import LinearSVC
from Lib.idlelib.colorizer import color_config

# Generate a reproducible blob dataset and show it as a scatter plot
# (first feature on the x axis, second on the y axis).
X,y = make_blobs(random_state=42)
mglearn.discrete_scatter(X[:,0],X[:,1],y)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.legend(["Class 0","Class 1","Class 2"])
plt.show()

## Train a LinearSVC classifier on this dataset
linear_svm = LinearSVC().fit(X,y)
print("Coefficient shape: ",linear_svm.coef_.shape)
print("Intercept shape: ",linear_svm.intercept_.shape)
#Coefficient shape:  (3, 2)
#Intercept shape:  (3,)
# Each row of coef_ holds the coefficient vector of one of the three classes;
# each column holds the coefficient of one feature (there are two features)
# intercept_ is a one-dimensional array storing the intercept of each class

# Visualize the lines given by the three classifiers

mglearn.discrete_scatter(X[:,0],X[:,1],y)
line = np.linspace(-15,15)
# One line per class: solve coef[0]*x + coef[1]*y + intercept = 0 for y.
for coef,intercept,color in zip(linear_svm.coef_,linear_svm.intercept_,['b','r','g']):
    plt.plot(line,-(line*coef[0]+intercept)/coef[1],c=color)
plt.ylim(-10,15)
plt.xlim((-10,8))
Esempio n. 26
0
 def setUp(self):
     """Run the parent fixture setup, create a Ruby porter and port a LinearSVC model."""
     super(LinearSVCTest, self).setUp()
     self.porter = Porter(language='ruby')
     estimator = LinearSVC(C=1., random_state=0)
     self._port_model(estimator)
# Register the remaining command-line options on the parser ``op``
# (``op`` is created before the visible part of this script).
op.add_option('--use_available_classifiers', action='store_true', dest='available_classifiers',
              help='Uses previously generated classifiers. If does not exists, new classifier models are \
              generated.')
op.add_option("--random_state", action='store', type='int', dest='random_state',
              help='Use random value of type int to reproduce the results.')
(opts, args) = op.parse_args()

# Positional arguments are rejected.
# NOTE(review): if ``op`` is an optparse.OptionParser, ``error()`` already
# terminates the process, so the ``sys.exit(1)`` below would be unreachable - confirm.
if len(args) > 0:
    op.error("this script takes no arguments.")
    sys.exit(1)
if __doc__:
    print(__doc__)
# Not nested under the ``if`` above: the help text is printed on every run.
op.print_help()

# Classifiers to evaluate, as (estimator, display name) pairs.
clfs = (
            (LinearSVC(), "Linear SVC"),
        )

# total results from classifier
def calculateScore(result, n_folds):
    print 'x' * 70
    n_classifier = len(clfs)
    for x in xrange(n_classifier):
        a=0.0
        p=0.0
        r=0.0
        f1=0.0
        for i in xrange(x * n_folds, (x * n_folds) + n_folds):
            a += result[i][0]
            p += result[i][1]
            r += result[i][2]
Esempio n. 28
0
# Experiment configuration constants.
# NOTE(review): how each value is consumed is not visible here; the notes
# below are inferred from the names only - confirm against the using code.
QENC_QUAL=False
QENC_DIFF=False
qenc_width = 33
n_classes = 3

FEAT_F33 = "F33"

n_users = 1000
max_runs = None #10000
percTest = 0.20  # presumably the fraction of data held out for testing - confirm

# Estimators to evaluate; commented-out entries are disabled alternatives.
# Three entries are active: LinearSVC, MLPClassifier, LogisticRegression.
predictors = [
    # DummyClassifier(strategy="stratified"),
    # DummyClassifier(strategy="uniform"),
    # BernoulliNB(),
    LinearSVC(max_iter=100, class_weight="balanced"),
    MLPClassifier(max_iter=100, nesterovs_momentum=True, early_stopping=True), #, activation="logistic"),
    LogisticRegression(class_weight='balanced'),
    # GaussianNB(),
]

# One search specification per active predictor, in the same order as
# ``predictors`` (three active entries in each list).
# NOTE(review): 'n_iter' is presumably the search budget rather than an
# estimator parameter - confirm against the code that consumes this list.
predictor_params = [
    # None,
    # None,
    # {'n_iter':50, 'alpha': numpy.logspace(-3, 2) },
    {'n_iter':50,'C': numpy.logspace(-3, 2)},
    {'n_iter':125,'hidden_layer_sizes':[(100,), (66,10)], 'learning_rate_init':[0.001, 0.01, 0.1], 'alpha': numpy.logspace(-6,2) },
    {'n_iter':50,'C': numpy.logspace(-3, 2)},
    # None,
]
Esempio n. 29
0
        
    return docs, t_docs, t_docsCategories


# Read the corpus. Element 1 of the result is used as training documents,
# element 2 as training labels and element 0 as the documents to classify.
data = readData('hackerrank/documentClassification.txt')
X_train = np.array(data[1])
y_train = np.array(data[2])
X_test = np.array(data[0])
print("Extracting features from the training dataset using a sparse vectorizer")
#vectorizer = HashingVectorizer(stop_words='english', non_negative=True)
# TF-IDF over unigrams and bigrams that occur in at least 2 documents, with
# English stop words removed, accents stripped and L2-normalised rows.
vectorizer = TfidfVectorizer(min_df=2, 
 ngram_range=(1, 2), 
 stop_words='english', 
 strip_accents='unicode', 
 norm='l2')
# The vocabulary is learned from the training documents only ...
X_train = vectorizer.fit_transform(X_train)
#vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
#                                 stop_words='english')
#X2_train = vectorizer.fit_transform(data_train.data)
# ... and reused unchanged for the documents to classify.
X_test = vectorizer.transform(X_test)

# Train three classifiers on the same features.
nb_classifier = MultinomialNB().fit(X_train, y_train)
svm_classifier = LinearSVC().fit(X_train, y_train)
maxent_classifier = LogisticRegression().fit(X_train, y_train)

# Print the predictions of each classifier in turn. The variable name
# ``y_nb_predicted`` is reused for all three, so after the first print it
# holds the SVM and then the logistic-regression predictions.
y_nb_predicted = nb_classifier.predict(X_test)
print(y_nb_predicted)
y_nb_predicted = svm_classifier.predict(X_test)
print(y_nb_predicted)
y_nb_predicted = maxent_classifier.predict(X_test)
print(y_nb_predicted)
Esempio n. 30
0
 def setUp(self):
     """Run the parent setUp, then install a C porter and a seeded LinearSVC."""
     super(LinearSVCTest, self).setUp()
     self.porter = Porter(language='c')
     # Fixed random_state so the fitted estimator is reproducible across runs.
     estimator = LinearSVC(C=1., random_state=0)
     self.set_classifier(estimator)
Esempio n. 31
0
train_r1 = time()

# prediction or testing
test_r0 = time()
predict = clf_rbf.predict(features_test)
test_r1 = time()

print "accuracy: ", clf_rbf.score(features_test, labels_test)
print "#################################"
print "tain time: ", round(train_r1 - train_r0, 3), "s"
print "prediction time: ", round(test_r1 - test_r0, 3), "s"
print "#################################"
'''
# Linear SVM (LinearSVC): time the fit and the prediction, then report the
# accuracy on the test split.
print("lib_linear")
clf_lib = LinearSVC()

# training
train_l0 = time()
clf_lib.fit(features_train, labels_train)
train_l1 = time()

# prediction or testing
test_l0 = time()
predict = clf_lib.predict(features_test)
test_l1 = time()

# Single-argument print calls behave identically on Python 2 and Python 3.
# The double space after each colon reproduces the spacing of the original
# output; "tain time" was a typo for "train time".
print("accuracy:  %s" % clf_lib.score(features_test, labels_test))
print("#################################")
print("train time:  %s s" % round(train_l1 - train_l0, 3))
print("prediction time:  %s s" % round(test_l1 - test_l0, 3))
Esempio n. 32
0
 def setUp(self):
     """Run the parent setUp, then store a seeded LinearSVC as ``self.mdl``."""
     super(LinearSVCPHPTest, self).setUp()
     # Fixed random_state so the fitted model is reproducible across runs.
     estimator = LinearSVC(C=1., random_state=0)
     self.mdl = estimator
# Train a one-vs-rest linear SVM on the training CSV and write one predicted
# label per line for the test CSV.
train_file = '/home/jvujjini/Kaggle/ForestCoverTypePrediction/train.csv'
test_file = '/home/jvujjini/Kaggle/ForestCoverTypePrediction/test.csv'

# NOTE(review): np.loadtxt needs purely numeric rows; if the CSV files carry
# a header line, pass skiprows=1 -- confirm against the actual files.
train_data = np.loadtxt(train_file, np.float32, delimiter=',')
test_data = np.loadtxt(test_file, np.float32, delimiter=',')

# The last training column is the label; all preceding columns are features.
train_X = train_data[:, :-1]
train_y = train_data[:, -1]
test_X = test_data

# Single-argument print calls behave identically on Python 2 and Python 3.
print("starting...")

predict_label = OneVsRestClassifier(LinearSVC(random_state=0)).fit(
    train_X, train_y).predict(test_X)

output_file = '/home/jvujjini/Kaggle/ForestCoverTypePrediction/output.csv'

with open(output_file, 'w') as thefile:
    print("File Opened...")
    # One prediction per line, in test-row order.
    thefile.writelines("%s\n" % item for item in predict_label)

print("Success!")
def runModel(X, y, model_name):
    """Evaluate one classifier over 5 contiguous folds and average the metrics.

    For fold ``exp`` the row range ``[start, end)`` is computed from the end
    of the data towards the beginning and handed to ``splitTrainTest``, which
    returns the train/test partition (presumably holding that range out as
    the test part -- confirm against ``splitTrainTest``).  The classifier
    chosen by ``model_name`` is fitted on the training part and its
    predictions are scored with the module-level ``eval`` helper.

    Args:
        X: feature matrix exposing ``shape[0]``.  The 'NaiveBayes' and
            'LogisticRegression' branches call ``todense()`` / ``toarray()``
            on the splits, so those expect a sparse matrix.
        y: labels aligned with the rows of ``X``.
        model_name: one of 'SVM', 'SVM_new', 'NaiveBayes',
            'LogisticRegression'.

    Returns:
        Tuple ``(accuracy, precision, recall, F1)`` averaged over the folds.

    Raises:
        Exception: if ``model_name`` is not one of the supported names.
    """
    nFolders = 5
    accs = []
    precs = []
    recalls = []
    F1s = []

    n = X.shape[0]
    for exp in range(nFolders):
        # Single-argument print calls behave identically on Python 2 and 3;
        # the spacing inside the messages reproduces the original output.
        print('\n\n============================================================================================\nexperiment %s' % exp)
        ### 2.1 split training and testing data
        start = int((1 - (exp + 1) * 1.0 / nFolders) * n)
        end = int((1 - exp * 1.0 / nFolders) * n)
        X_train, y_train, X_test, y_test = splitTrainTest(X, y, start, end)
        print('Running %s' % (model_name,))
        ### 2.2 build classifier / 2.3 predict
        # This must be a single if/elif chain: with a separate ``if`` for
        # 'SVM_new', a successful 'SVM' run fell through to the final
        # ``else`` and raised "The model name is incorrect!!!".
        if model_name == 'SVM':
            clf = LinearSVC(penalty="l1", dual=False, tol=1e-7)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
        elif model_name == 'SVM_new':
            # NOTE(review): class_weight='auto' is rejected by newer
            # scikit-learn releases; 'balanced' is the replacement but uses
            # a different weighting heuristic, so it is left untouched here.
            clf = svm.SVC(C=1.0, gamma=1.0, class_weight='auto')
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
        elif model_name == 'NaiveBayes':
            clf = GaussianNB()
            clf.fit(X_train.todense(), y_train)
            y_pred = clf.predict(X_test.todense())
        elif model_name == 'LogisticRegression':
            clf = LogisticRegression(C=1.0, penalty='l1', tol=0.01)
            clf.fit(X_train.toarray(), y_train)
            y_pred = clf.predict(X_test.toarray())
        else:
            raise Exception("The model name is incorrect!!!")
        ### 2.4 eval
        acc, prec, recall, F1 = eval(y_test, y_pred)
        print('Acc =  %s' % (acc,))
        print('Precision = %s' % (prec,))
        print('Recall= %s' % (recall,))
        print('F1 = %s' % (F1,))
        accs.append(acc)
        precs.append(prec)
        recalls.append(recall)
        F1s.append(F1)

    # Average once and reuse the values for both the report and the result.
    avg_acc = sum(accs) / len(accs)
    avg_prec = sum(precs) / len(precs)
    avg_recall = sum(recalls) / len(recalls)
    avg_F1 = sum(F1s) / len(F1s)

    print('\n\n\n')
    print('avg Acc =  %s' % (avg_acc,))
    print('avg Precision =  %s' % (avg_prec,))
    print('avg Recall =  %s' % (avg_recall,))
    print('avg F1 =  %s' % (avg_F1,))
    return avg_acc, avg_prec, avg_recall, avg_F1
Esempio n. 35
0
    train_acc = metrics.accuracy_score(y_train, model.predict(x_train))
    test_acc = metrics.accuracy_score(y_test, y_hat)
    print u'训练集准确率:%.2f%%' % (100 * train_acc)
    print u'测试集准确率:%.2f%%' % (100 * test_acc)

    return t_train, t_test, 1 - train_acc, 1 - test_acc, name


# Start passing parameters: each entry pairs an estimator instance with the
# display name that is handed to benchmark() in the loop below.
clfs = [[RidgeClassifier(), 'Ridge'], [KNeighborsClassifier(), 'KNN'],
        [MultinomialNB(), 'MultinomialNB'], [BernoulliNB(), 'BernoulliNB'],
        [RandomForestClassifier(n_estimators=200), 'RandomForest'],
        [SVC(), 'SVM'],
        [
            LinearSVC(loss='squared_hinge', penalty='l1', dual=False,
                      tol=1e-4), 'LinearSVC-l1'
        ],
        [
            LinearSVC(loss='squared_hinge', penalty='l2', dual=False,
                      tol=1e-4), 'LinearSVC-l2'
        ]]

# Start training: benchmark every classifier in order and collect whatever
# benchmark() returns for it; a blank line separates the console output of
# consecutive classifiers.
result = []
for clf, name in clfs:
    a = benchmark(clf, name)
    result.append(a)
    print '\n'

# Convert the collected per-classifier results into a NumPy array.
result = np.array(result)
Esempio n. 36
0
# Every column other than the title and the plot text is treated as a genre
# label column; compute that frame once and reuse it.
genre_df = data_df.drop(['title', 'plot'], axis=1)
genres = list(genre_df.columns.values)
# ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values`` yields the
# same ndarray on both old and new pandas releases.
data_x = data_df[['plot']].values
data_y = genre_df.values
stratified_split = StratifiedShuffleSplit(n_splits=2, test_size=0.33)

x_train, x_test, y_train, y_test = train_test_split(data_x,
                                                    data_y,
                                                    test_size=0.33,
                                                    random_state=42)

# transform matrix of plots into lists to pass to a TfidfVectorizer
train_x = [x[0].strip() for x in x_train.tolist()]
test_x = [x[0].strip() for x in x_test.tolist()]

# Deduplicated and passed as a list: newer scikit-learn releases validate
# ``stop_words`` and accept only 'english', a list or None (a set is
# rejected).
stop_words = sorted(set(stopwords.words('english')))

## http://michelleful.github.io/code-blog/2015/06/20/pipelines/
## learn feature union to add more features (time, region)

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
])
# Grid explored by grid_search(); keys follow the Pipeline
# ``<step>__<param>`` convention, with ``estimator__`` reaching the LinearSVC
# wrapped by OneVsRestClassifier.
parameters = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    "clf__estimator__C": [0.01, 0.1, 1],
    "clf__estimator__class_weight": ['balanced', None],
}
grid_search(train_x, y_train, test_x, y_test, genres, parameters, pipeline)
Esempio n. 37
0
train_r1 = time()

# prediction or testing
test_r0 = time()
predict = clf_rbf.predict(features_test)
test_r1 = time()

print "accuracy: ", clf_rbf.score(features_test, labels_test)
print "#################################"
print "tain time: ", round(train_r1 - train_r0, 3), "s"
print "prediction time: ", round(test_r1 - test_r0, 3), "s"
print "#################################"
'''
# Linear SVM (LinearSVC): time the fit and the prediction, then report the
# accuracy on the test split.
print("lib_linear")
clf_lib = LinearSVC()

# training
train_l0 = time()
clf_lib.fit(features_train, labels_train)
train_l1 = time()

# prediction or testing
test_l0 = time()
predict = clf_lib.predict(features_test)
test_l1 = time()

# Single-argument print calls behave identically on Python 2 and Python 3.
# The double space after each colon reproduces the spacing of the original
# output; "tain time" was a typo for "train time".
print("accuracy:  %s" % clf_lib.score(features_test, labels_test))
print("#################################")
print("train time:  %s s" % round(train_l1 - train_l0, 3))
print("prediction time:  %s s" % round(test_l1 - test_l0, 3))