Code Example #1
def main(argv):
    (X, Y) = read_Data(datafile)
    ### implement cross validation ###
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, Y, test_size=0.2, random_state=0)
    
#     num_class_one = sum(y_train)
    sample_weight_class_one = np.ones(len(y_train))
    for i in range(len(y_train)):
        if y_train[i,0] == 1:
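            # note: assigning 1 here is a no-op, since the weights are already
            # initialized to 1; presumably a different weight for the positive
            # class was intended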
            sample_weight_class_one[i] = 1
    ### generate poly SVM model ###
    # clf = svm.SVC(kernel = 'poly', degree = 5)
    # clf.fit(X_train, y_train[:,0], sample_weight_class_one)
    # mkdir(outputfile)
    # joblib.dump(clf, outputfile + 'svm_poly_4.pkl') # save model to disc

    ### load existing model ###
    clf = joblib.load(outputfile + 'svm_poly_4.pkl') 

    y_pred = clf.predict(X_test)    # calculate prediction result
    cm = confusion_matrix(y_test, y_pred)

    # plot_confusion_matrix(cm)
    print '\n', 'Confusion matrix:', '\n', cm

    err_FP, err_FN, err_ALL = err_rate(cm)
    print '\n', 'Overall accuracy: ', clf.score(X_test, y_test)*100, '%'
    print '\n', 'False positive rate: ', err_FP*100, '%'
    print 'False negative rate: ', err_FN*100, '%'
    print 'Overall error rate: ', err_ALL*100, '%'
    return 0
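The err_rate helper called above is not shown; a plausible sketch, assuming a binary 2x2 confusion matrix with rows as true labels and class 1 as the positive class:

def err_rate(cm):
    # hypothetical helper matching the calls above
    tn, fp, fn, tp = cm.ravel()
    err_FP = fp / float(fp + tn)           # false positives among true negatives
    err_FN = fn / float(fn + tp)           # false negatives among true positives
    err_ALL = (fp + fn) / float(cm.sum())  # overall error rate
    return err_FP, err_FN, err_ALL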
Code Example #2
def runSGDPipeline(entries, langs):
	t0 = time()
	sgd_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=(1,1), max_features=n_features)),
                      ('tfidf', TfidfTransformer(use_idf=True)),
                      ('clf', SGDClassifier(loss='squared_hinge', penalty='l2',
                                            alpha=0.001, n_iter=5, random_state=42))])

	vect = CountVectorizer(ngram_range=(1,1), max_features=n_features)
	X_train_counts = vect.fit_transform(entries)
	tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
	X_train_tfidf = tfidf.transform(X_train_counts)  # tfidf is already fitted above

	clf = SGDClassifier(loss='squared_hinge', penalty='l2', alpha=0.001, n_iter=5, random_state=42)
	clf.fit(X_train_tfidf, langs)

	X_new_counts = vect.transform(entries)
	X_new_tfidf = tfidf.transform(X_new_counts)
	predicted = clf.predict(X_new_tfidf.toarray())

	print(np.mean(predicted == langs))
	print(metrics.classification_report(langs, predicted, target_names=langs))
	print(metrics.confusion_matrix(langs, predicted))
	print("Took %s seconds." % (time()-t0))
	print("n_samples: %d, n_features: %d" % X_train_tfidf.shape)
	return sgd_pipeline
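Note that sgd_pipeline is returned without ever being fitted; the manual vect/tfidf/clf steps above just duplicate its stages. A sketch of the equivalent one-call form (same caveat that it predicts on its own training data), which applies equally to the other run*Pipeline examples below:

sgd_pipeline.fit(entries, langs)           # fits CountVectorizer, TfidfTransformer and SGDClassifier in sequence
predicted = sgd_pipeline.predict(entries)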
Code Example #3
def runRFPipeline(entries, langs):
    t0 = time()
    rf_pipeline = Pipeline([('vect',
                             CountVectorizer(ngram_range=(1, 1),
                                             max_features=n_features)),
                            ('tfidf', TfidfTransformer(use_idf=True)),
                            ('clf', RandomForestClassifier(n_estimators=10))])

    vect = CountVectorizer(ngram_range=(1, 1), max_features=n_features)
    X_train_counts = vect.fit_transform(entries)
    tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tfidf = tfidf.transform(X_train_counts)  # tfidf is already fitted above

    clf = RandomForestClassifier(n_estimators=10)
    clf.fit(X_train_tfidf, langs)

    X_new_counts = vect.transform(entries)
    X_new_tfidf = tfidf.transform(X_new_counts)
    predicted = clf.predict(X_new_tfidf.toarray())

    print(np.mean(predicted == langs))
    print(metrics.classification_report(langs, predicted, target_names=langs))
    print(metrics.confusion_matrix(langs, predicted))
    print("Took %s seconds." % (time() - t0))
    print("n_samples: %d, n_features: %d" % X_train_tfidf.shape)
    return rf_pipeline
Code Example #4
def runSGDPipeline(entries, langs):
    t0 = time()
    sgd_pipeline = Pipeline([('vect',
                              CountVectorizer(ngram_range=(1, 1),
                                              max_features=n_features)),
                             ('tfidf', TfidfTransformer(use_idf=True)),
                             ('clf',
                              SGDClassifier(loss='squared_hinge',
                                            penalty='l2',
                                            alpha=0.001,
                                            n_iter=5,
                                            random_state=42))])

    vect = CountVectorizer(ngram_range=(1, 1), max_features=n_features)
    X_train_counts = vect.fit_transform(entries)
    tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tfidf = tfidf.transform(X_train_counts)  # tfidf is already fitted above

    clf = SGDClassifier(loss='squared_hinge',
                        penalty='l2',
                        alpha=0.001,
                        n_iter=5,
                        random_state=42)
    clf.fit(X_train_tfidf, langs)

    X_new_counts = vect.transform(entries)
    X_new_tfidf = tfidf.transform(X_new_counts)
    predicted = clf.predict(X_new_tfidf.toarray())

    print(np.mean(predicted == langs))
    print(metrics.classification_report(langs, predicted, target_names=langs))
    print(metrics.confusion_matrix(langs, predicted))
    print("Took %s seconds." % (time() - t0))
    print("n_samples: %d, n_features: %d" % X_train_tfidf.shape)
    return sgd_pipeline
Code Example #5
def runSVCPipeline(entries, langs):
	t0 = time()
	svc_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=(1,1), max_features=n_features)),
                      ('tfidf', TfidfTransformer(use_idf=True)),
                      ('clf', LinearSVC(dual=False, loss='squared_hinge', max_iter=100, random_state=42))])

	vect = CountVectorizer(ngram_range=(1,1), max_features=n_features)
	X_train_counts = vect.fit_transform(entries)
	tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
	X_train_tfidf = tfidf.transform(X_train_counts)

	clf = LinearSVC(dual=False, loss='squared_hinge', max_iter=100, random_state=42)
	clf.fit(X_train_tfidf, langs)

	X_new_counts = vect.transform(entries)
	X_new_tfidf = tfidf.transform(X_new_counts)
	#dec = clf.decision_function([[1]])
	predicted = clf.predict(X_new_tfidf.toarray())

	print(np.mean(predicted == langs))
	print(metrics.classification_report(langs, predicted, target_names=langs))
	print(metrics.confusion_matrix(langs, predicted))
	print("Took %s seconds." % (time()-t0))
	print("n_samples: %d, n_features: %d" % X_train_tfidf.shape)
	return svc_pipeline
Code Example #6
def print_confusion_matrix(y_test_report, y_predicted_report, algo_name):
    # y_predicted_report is a single array of length 100 holding the predicted values from all folds
    cm = confusion_matrix(y_test_report, y_predicted_report)
    np.set_printoptions(precision=2)
    print ("Confusion matrix for: " + algo_name)
    print (cm)
    return None
Code Example #7
File: classify_mode.py Project: herbertchen1/SciTail
def main(argv=None):

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading feature and label data')
    labels = np.asarray(map(int, list(file_line_generator(args.label_file))))

    log.info('performing cross validation')
    single_predictions, classification_result = do_cross_validation(labels)

    log.info('storing results')
    header = 'fold_no;instance_index;true_label;pred_label'
    np.savetxt(os.path.join(args.output_dir, 'predictions.csv'),
            single_predictions, '%d', ';', '\n', header=header)

    all_true_labels = single_predictions[:, 2]
    all_pred_labels = single_predictions[:, 3]
    confusion = confusion_matrix(all_true_labels, all_pred_labels)

    np.savetxt(os.path.join(args.output_dir, 'confusion_matrix.csv'),
            confusion, '%d', ';', '\n')

    classification_result[NO_OF_FOLDS, :] = get_classification_result(-1,
                all_true_labels, all_pred_labels)

    header = 'fold_no;accuracy;precision;recall;f1'
    np.savetxt(os.path.join(args.output_dir, 'metrics.csv'),
            classification_result, '%f', ';', '\n', header=header)

    log.info(classification_result)
    log.info('finished')
Code Example #8
File: classify_mode.py Project: joluychen/THUCC
def main(argv=None):

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading feature and label data')
    labels = np.asarray(map(int, list(file_line_generator(args.label_file))))

    log.info('performing cross validation')
    single_predictions, classification_result = do_cross_validation(labels)

    log.info('storing results')
    header = 'fold_no;instance_index;true_label;pred_label'
    np.savetxt(os.path.join(args.output_dir, 'predictions.csv'),
            single_predictions, '%d', ';', '\n', header=header)

    all_true_labels = single_predictions[:, 2]
    all_pred_labels = single_predictions[:, 3]
    confusion = confusion_matrix(all_true_labels, all_pred_labels)

    np.savetxt(os.path.join(args.output_dir, 'confusion_matrix.csv'),
            confusion, '%d', ';', '\n')

    classification_result[NO_OF_FOLDS, :] = get_classification_result(-1,
                all_true_labels, all_pred_labels)

    header = 'fold_no;accuracy;precision;recall;f1'
    np.savetxt(os.path.join(args.output_dir, 'metrics.csv'),
            classification_result, '%f', ';', '\n', header=header)

    log.info(classification_result)
    log.info('finished')
Code Example #9
def print_confusion_matrix(y_test_report, y_predicted_report,algo_name):
    # y_predicted_report is a single array of length 100 holding the predicted values from all folds
    cm = confusion_matrix(y_test_report, y_predicted_report)
    np.set_printoptions(precision=2)
    print('Confusion matrix for: ' +algo_name)
    print(cm)
    return None
Code Example #10
def main():
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(stop_words='english')),
        ('clf', LogisticRegression())
    ])
    parameters = {
        'vect__max_df': (0.25, 0.5),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__use_idf': (True, False),
        'clf__C': (0.1, 1, 10),
    }
    df = pd.read_csv('data/train.tsv', header=0, delimiter='\t')
    X, y = df['Phrase'], df['Sentiment'].as_matrix()
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print 'Best score: %0.3f' % grid_search.best_score_
    print 'Best parameters set:'
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print '\t%s: %r' % (param_name, best_parameters[param_name])
    predictions = grid_search.predict(X_test)
    print 'Accuracy:', accuracy_score(y_test, predictions)
    print 'Confusion Matrix:', confusion_matrix(y_test, predictions)
    print 'Classification Report:', classification_report(y_test, predictions)
Code Example #11
def _run_cv_iter((ensemble, selection_strategy, inp, y,
                  train_indices, test_indices, seed, it)):
    Logger.get().write("!Running", (it+1), "iteration...")

    train_inp, train_y = inp[train_indices], y[train_indices]
    test_inp, test_y = inp[test_indices], y[test_indices]

    ensemble.set_params(
        random_state=seed, selection_strategy=selection_strategy
    )
    ensemble.fit(train_inp, train_y)

    threshold_range = selection_strategy.get_threshold_range(
        ensemble.n_estimators
    )
    confusion_matrices = numpy.zeros(
        (len(threshold_range), ensemble.n_classes_, ensemble.n_classes_)
    )
    for i, threshold in enumerate(threshold_range):
        ensemble.selection_strategy.threshold = threshold
        Logger.get().write("!Testing using threshold: {:.3f}".format(threshold))
        confusion_matrices[i] = confusion_matrix(
            test_y, ensemble.predict(test_inp), labels=ensemble.classes_
        )
    return confusion_matrices
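One possible downstream use of the returned stack (an assumption, not shown here): per-threshold accuracy falls out of the diagonal of each matrix:

accuracies = confusion_matrices.trace(axis1=1, axis2=2) / confusion_matrices.sum(axis=(1, 2))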
Code Example #12
def testDataset(model, test, test_labels, n_slice=1):
    test_sample = test[::n_slice]
    test_labels_sample = test_labels[::n_slice]
    # print test_sample[1]

    # extracted_test_sample = extractFeatures(test_sample, [74,745,361,445,164,681,258,230,277,719,509,637,738,709,529,557,473,175,664,133,305,75,333,585,501,222,105,612,342,250,286,746,202,481,68,621,94,638,257,314,165,639,278,147,229,692,120,453,720,285,680,650,76,737,708,174,564,194,613,134,536,592,425,640,747,201,67,313,135,397,558,256,665,284,693,106,93,530,77,649,508,736,721,146,341,586,369,136,306,251,173,228,707,666,502,667,620,679,641,748,119,279,223,166,107,66,92,772,771,312,668,480,735,474,694,78,145,614,770,137,200,722,446,678,283,749,648,255,773,418,42,340,195,108,227,311,334,362,41,669,307,563,774,390,40,706,591,65,91,769,768,43,452,750,695,44,535,723,172,677,775,642,79,619,396,167,339,118,39,507,751,254,503,559,531,199,696,335,734,587,766,767,64,45,109,776,479,282,226,80,47,11,13,16,8,12,14,9,15,10,116,81,7,90,88,6,5,4,3,2,89,87,114,18,115,86,117,82,83,84,85,17,111,19,51,50,30,52,53,54,55,56,49,48,46,36,38,37,35,32,34,33,31,29,20,62,61,28,63,110,112,113,21,60,59,58,25,27,26,24,57,23,22,784,392,138,676,697,675,699,674,698,700,139,704,705,703,701,702,673,672,671,615,616,590,670,589,617,618,643,644,647,646,645,724,725,726,763,764,762,759,761,765,777,778,779,782,781,780,760,758,727,730,731,729,757,728,732,733,752,753,756,755,754,588,562,561,281,308,280,225,253,309,310,336,337,364,363,338,252,224,366,142,143,141,198,140,144,168,169,170,197,196,171,365,367,560,476,477,475,449,451,478,504,505,506,534,533,532,450,448,368,393,394,783,447,391,395,419,420,421,424,423,422,1])
    extracted_test_sample = test_sample

    preds = model.predict(extracted_test_sample)

    accuracy = model.score(extracted_test_sample, test_labels_sample)

    cm = metrics.confusion_matrix(test_labels_sample, preds)

    print("Confusion matrix:\n%s" % cm)

    plot_confusion_matrix(cm)

    digitsDict = digitDict.initData()
    print 'true \t\t predicted'
    errors = 0
    for i in xrange(0, len(test_sample)):
        if preds[i] != test_labels_sample[i]:
            errors += 1
            trueClass = str(int(test_labels_sample[i]))
            prediction = str(int(preds[i]))
            digitDict.updateErrors(digitsDict, trueClass, prediction)
            print test_labels_sample[i], '\t\t', preds[i]
    digitDict.save(digitsDict)

    print 'Accuracy: ', accuracy, '\nErrors: ', errors
Code Example #13
def runSVCPipeline(entries, langs):
    t0 = time()
    svc_pipeline = Pipeline([('vect',
                              CountVectorizer(ngram_range=(1, 1),
                                              max_features=n_features)),
                             ('tfidf', TfidfTransformer(use_idf=True)),
                             ('clf',
                              LinearSVC(dual=False,
                                        loss='squared_hinge',
                                        max_iter=100,
                                        random_state=42))])

    vect = CountVectorizer(ngram_range=(1, 1), max_features=n_features)
    X_train_counts = vect.fit_transform(entries)
    tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tfidf = tfidf.transform(X_train_counts)

    clf = LinearSVC(dual=False,
                    loss='squared_hinge',
                    max_iter=100,
                    random_state=42)
    clf.fit(X_train_tfidf, langs)

    X_new_counts = vect.transform(entries)
    X_new_tfidf = tfidf.transform(X_new_counts)
    #dec = clf.decision_function([[1]])
    predicted = clf.predict(X_new_tfidf.toarray())

    print(np.mean(predicted == langs))
    print(metrics.classification_report(langs, predicted, target_names=langs))
    print(metrics.confusion_matrix(langs, predicted))
    print("Took %s seconds." % (time() - t0))
    print("n_samples: %d, n_features: %d" % X_train_tfidf.shape)
    return svc_pipeline
Code Example #14
def main(argv=None):

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading feature and label data')
    train_labels = np.asarray(
        map(int, list(file_line_generator(args.train_labels))))
    train_features = np.loadtxt(args.train_data)

    if train_features.ndim == 1:
        train_features = train_features.reshape((train_features.shape[0], 1))

    test_labels = np.asarray(
        map(int, list(file_line_generator(args.test_labels))))
    test_features = np.loadtxt(args.test_data)

    if test_features.ndim == 1:
        test_features = test_features.reshape((test_features.shape[0], 1))

    log.info('performing classification')
    single_predictions, classification_result, weight_vectors, model = \
            calc_results(train_features, train_labels, test_features,
            test_labels, args.normalize, args.mode == True)

    log.info('storing results')
    save_object_to_file(model, os.path.join(args.output_dir, 'svm'))

    np.savetxt(os.path.join(args.output_dir, 'weights.csv'), weight_vectors,
               '%f', ';', '\n')

    header = 'instance_index;true_label;pred_label'
    np.savetxt(os.path.join(args.output_dir, 'predictions.csv'),
               single_predictions,
               '%d',
               ';',
               '\n',
               header=header)

    all_true_labels = single_predictions[:, 1]
    all_pred_labels = single_predictions[:, 2]
    confusion = confusion_matrix(all_true_labels, all_pred_labels)

    np.savetxt(os.path.join(args.output_dir, 'confusion_matrix.csv'),
               confusion, '%d', ';', '\n')

    header = 'accuracy;precision;recall;f1'
    np.savetxt(os.path.join(args.output_dir, 'metrics.csv'),
               classification_result,
               '%f',
               ';',
               '\n',
               header=header)

    log.info(classification_result)
    log.info('finished')
Code Example #15
    def show_report(self, y_predicted):
        y_true = []
        y_predicted_new = []

        for i in range(len(self.__labels)):
            if self.__labels[i] == 'P':
                y_true.append(1)
            if y_predicted[i] == 'positivo':
                y_predicted_new.append(1)
            if self.__labels[i] == 'N':
                y_true.append(-1)
            if y_predicted[i] == 'negativo':
                y_predicted_new.append(-1)
            if self.__labels[i] == 'NEU':
                y_true.append(0)
            if y_predicted[i] == 'neutral':
                y_predicted_new.append(0)

        print classification_report(y_true, y_predicted_new)
        print confusion_matrix(y_true, y_predicted_new)
Code Example #16
def show_confusion_matrix(y_true, y_pred, title=''):
    """
	Plot (and print) a confusion matrix from y_true and y_predicted
	"""
    # TODO: show confusion matrix plot
    cm = confusion_matrix(y_true, y_pred)
    pl.matshow(cm)
    pl.title(title)
    pl.colorbar()
    pl.ylabel('True label')
    pl.xlabel('Predicted label')
    pl.show()
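A minimal call with toy labels, for illustration:

show_confusion_matrix([0, 1, 1, 0], [0, 1, 0, 0], title='toy example')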
Code Example #17
File: classify_imdb_docs.py Project: joluychen/THUCC
def main(argv=None):

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading embeddings')
    vocab = read_vocabulary_id_file(args.vocabulary)
    embs = np.loadtxt(args.embeddings)

    log.info('loading documents')
    features, labels = load_data(args.corpus_dir, vocab, embs)

    log.info('performing cross validation')
    single_predictions, classification_result, weight_vectors = \
            do_cross_validation(features, labels)

    log.info('storing results')
    np.savetxt(os.path.join(args.output_dir, 'svm-weights.csv'),
               weight_vectors, '%f', ';', '\n')

    with utf8_file_open(os.path.join(args.output_dir, 'predictions.csv'), 'w') \
            as pred_file:
        pred_file.write(u'fold_no;doc;true_label;pred_label\n')

        for sp in single_predictions:
            pred_file.write(u';'.join(map(unicode, sp)) + u'\n')

    all_true_labels = [sp[2] for sp in single_predictions]
    all_pred_labels = [sp[3] for sp in single_predictions]
    confusion = confusion_matrix(all_true_labels, all_pred_labels)

    np.savetxt(os.path.join(args.output_dir, 'confusion_matrix.csv'),
               confusion, '%d', ';', '\n')

    classification_result[NO_OF_FOLDS, :] = get_classification_result(
        -1, all_true_labels, all_pred_labels)

    header = u'fold_no;accuracy;precision;recall;f1'
    np.savetxt(os.path.join(args.output_dir, 'metrics.csv'),
               classification_result,
               '%f',
               u';',
               u'\n',
               header=header)

    log.info(classification_result)
    log.info('finished')
Code Example #18
File: metrics.py Project: ozwin/mpri_labs
def show_confusion_matrix(y_true, y_predicted, title=''):

    # compute confusion matrix
    cm = confusion_matrix(y_true, y_predicted)
    print cm
    # configure window
    pl.matshow(cm)
    pl.title(title)
    pl.colorbar()
    pl.ylabel('True label')
    pl.xlabel('Predicted label')
    pl.jet()
    # show confusion matrix plot
    pl.show()
Code Example #19
File: classify.py Project: rgtjf/DeepLearning
def main(argv=None):

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading feature and label data')
    train_labels = np.asarray(list(map(int, list(file_line_generator(args.train_labels)))))
    train_features = np.loadtxt(args.train_data)

    if train_features.ndim == 1:
        train_features = train_features.reshape((train_features.shape[0], 1))

    test_labels = np.asarray(list(map(int, list(file_line_generator(args.test_labels)))))
    test_features = np.loadtxt(args.test_data)

    if test_features.ndim == 1:
        test_features = test_features.reshape((test_features.shape[0], 1))

    log.info('performing classification')
    single_predictions, classification_result, weight_vectors, model = \
            calc_results(train_features, train_labels, test_features,
            test_labels, args.normalize, args.mode == True)

    log.info('storing results')
    save_object_to_file(model, os.path.join(args.output_dir, 'svm'))

    np.savetxt(os.path.join(args.output_dir, 'weights.csv'),
            weight_vectors, '%f', ';', '\n')

    header = 'instance_index;true_label;pred_label'
    np.savetxt(os.path.join(args.output_dir, 'predictions.csv'),
            single_predictions, '%d', ';', '\n', header=header)

    all_true_labels = single_predictions[:, 1]
    all_pred_labels = single_predictions[:, 2]
    confusion = confusion_matrix(all_true_labels, all_pred_labels)

    np.savetxt(os.path.join(args.output_dir, 'confusion_matrix.csv'),
            confusion, '%d', ';', '\n')

    header = 'accuracy;precision;recall;f1'
    np.savetxt(os.path.join(args.output_dir, 'metrics.csv'),
            classification_result, '%f', ';', '\n', header=header)

    log.info(classification_result)
    log.info('finished')
Code Example #20
def main():
    pipeline = Pipeline([('vect', TfidfVectorizer()),
                         ('clf', LogisticRegression())])
    parameters = {
        # 'vect__max_df': (0.25, 0.5, 0.75),
        'vect__stop_words': ('english', None),
        # 'vect__max_features': (5000, 10000, None),
        # 'vect__ngram_range': ((1, 1), (1, 2)),
        # 'vect__use_idf': (True, False),
        # 'vect__norm': ('l1', 'l2'),
        # 'clf__penalty': ('l1', 'l2'),
        # 'clf__C': (0.1, 1, 10),
    }
    df = pd.read_csv('movie-reviews/train.tsv', header=0, delimiter='\t')
    X, y = df['Phrase'], df['Sentiment'].as_matrix()
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)
    grid_search = GridSearchCV(pipeline,
                               parameters,
                               n_jobs=-1,
                               verbose=1,
                               scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print 'Best score: %0.3f' % grid_search.best_score_
    print 'Best parameters set:'
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print '\t%s: %r' % (param_name, best_parameters[param_name])

    predictions = grid_search.predict(X_test)
    print 'Accuracy:', accuracy_score(y_test, predictions)
    print 'Classification Report:', classification_report(y_test, predictions)

    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_test, predictions)
    print cm

    plt.matshow(cm)
    plt.title('Confusion matrix')
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
    predictions = np.ones(len(predictions)) * 2
    print 'Accuracy:', accuracy_score(y_test, predictions)
    print 'Degenerate Classification Report:', classification_report(
        y_test, predictions)
Code Example #21
def main(argv=None):

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading embeddings')
    vocab = read_vocabulary_id_file(args.vocabulary)
    embs = np.loadtxt(args.embeddings)

    log.info('loading documents')
    features, labels = load_data(args.corpus_dir, vocab, embs)

    log.info('performing cross validation')
    single_predictions, classification_result, weight_vectors = \
            do_cross_validation(features, labels)

    log.info('storing results')
    np.savetxt(os.path.join(args.output_dir, 'svm-weights.csv'),
            weight_vectors, '%f', ';', '\n')

    with utf8_file_open(os.path.join(args.output_dir, 'predictions.csv'), 'w') \
            as pred_file:
        pred_file.write(u'fold_no;doc;true_label;pred_label\n')

        for sp in single_predictions:
            pred_file.write(u';'.join(map(unicode, sp)) + u'\n')

    all_true_labels = [sp[2] for sp in single_predictions]
    all_pred_labels = [sp[3] for sp in single_predictions]
    confusion = confusion_matrix(all_true_labels, all_pred_labels)

    np.savetxt(os.path.join(args.output_dir, 'confusion_matrix.csv'),
            confusion, '%d', ';', '\n')

    classification_result[NO_OF_FOLDS, :] = get_classification_result(-1,
                all_true_labels, all_pred_labels)

    header = u'fold_no;accuracy;precision;recall;f1'
    np.savetxt(os.path.join(args.output_dir, 'metrics.csv'),
            classification_result, '%f', u';', u'\n', header=header)

    log.info(classification_result)
    log.info('finished')
Code Example #22
File: movies.py Project: moonbury/pythonanywhere
def main():
    pipeline = Pipeline([
        ('vect', TfidfVectorizer()),
        ('clf', LogisticRegression())
    ])
    parameters = {
        # 'vect__max_df': (0.25, 0.5, 0.75),
        'vect__stop_words': ('english', None),
        # 'vect__max_features': (5000, 10000, None),
        # 'vect__ngram_range': ((1, 1), (1, 2)),
        # 'vect__use_idf': (True, False),
        # 'vect__norm': ('l1', 'l2'),
        # 'clf__penalty': ('l1', 'l2'),
        # 'clf__C': (0.1, 1, 10),
    }
    df = pd.read_csv('movie-reviews/train.tsv', header=0, delimiter='\t')
    X, y = df['Phrase'], df['Sentiment'].as_matrix()
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print 'Best score: %0.3f' % grid_search.best_score_
    print 'Best parameters set:'
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print '\t%s: %r' % (param_name, best_parameters[param_name])

    predictions = grid_search.predict(X_test)
    print 'Accuracy:', accuracy_score(y_test, predictions)
    print 'Classification Report:', classification_report(y_test, predictions)

    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_test, predictions)
    print cm

    plt.matshow(cm)
    plt.title('Confusion matrix')
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
    predictions = np.ones(len(predictions)) * 2
    print 'Accuracy:', accuracy_score(y_test, predictions)
    print 'Degenerate Classification Report:', classification_report(y_test, predictions)
Code Example #23
def runTreePipeline(entries, langs):
	t0 = time()
	tree_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=(1,1), max_features=n_features)),
                      ('tfidf', TfidfTransformer(use_idf=True)),
                      ('clf', DecisionTreeClassifier(max_features=n_features))])

	vect = CountVectorizer(ngram_range=(1,1), max_features=n_features)
	X_train_counts = vect.fit_transform(entries)
	tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
	X_train_tfidf = tfidf.transform(X_train_counts)

	clf = DecisionTreeClassifier(max_features=n_features)
	clf.fit(X_train_tfidf, langs)

	X_new_counts = vect.transform(entries)
	X_new_tfidf = tfidf.transform(X_new_counts)
	predicted = clf.predict(X_new_tfidf.toarray())

	print(np.mean(predicted == langs))
	print(metrics.classification_report(langs, predicted, target_names=langs))
	print(metrics.confusion_matrix(langs, predicted))
	print("Took %s seconds." % (time()-t0))
	print("n_samples: %d, n_features: %d" % X_train_tfidf.shape)
	return tree_pipeline
Code Example #24
		lc.pop(0)
		lc = [float(i) for i in lc]
		x.append(lc)
	f.close()

	pipeline = Pipeline([
		('clf', LogisticRegression())
	])

	parameters = {
		'clf__C': (0.1, 1, 10),
	}	

	X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=0.5)
	
	grid_search = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1, scoring='accuracy')

	grid_search.fit(X_train, y_train)
	print 'Best score: %0.3f' % grid_search.best_score_
	print 'Best parameters set:'
	best_parameters = grid_search.best_estimator_.get_params()
	for param_name in sorted(parameters.keys()):
		print '\t%s: %r' % (param_name, best_parameters[param_name])
	
	predictions = grid_search.predict(X_test)
	print 'Accuracy:', accuracy_score(y_test, predictions)
	print 'Confusion Matrix:'
	print confusion_matrix(y_test, predictions)
	print 'Classification Report:'
	print classification_report(y_test, predictions)
Code Example #25
indices = np.arange(n_total_samples)

unlabeled_set = indices[n_labeled_points:]

# mark the unlabeled points with -1
y_train = np.copy(y)
y_train[unlabeled_set] = -1

###############################################################################
# Learn with LabelSpreading
lp_model = label_propagation.LabelSpreading(gamma=0.25, max_iter=5)
lp_model.fit(X, y_train)
predicted_labels = lp_model.transduction_[unlabeled_set]
true_labels = y[unlabeled_set]

cm = confusion_matrix(true_labels, predicted_labels,
        labels=lp_model.classes_)

print "Label Spreading model: %d labeled & %d unlabeled points (%d total)" % \
        (n_labeled_points, n_total_samples - n_labeled_points, n_total_samples)

print metrics.classification_report(true_labels, predicted_labels)

print "Confusion matrix"
print cm

# calculate uncertainty values for each transduced distribution
pred_entropies = stats.distributions.entropy(lp_model.label_distributions_.T)

# pick the top 10 most uncertain labels
uncertainty_index = np.argsort(pred_entropies)[-10:]
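In the original active-learning loop these indices feed the next round of labeling; a sketch of inspecting them (an assumption about what follows):

for idx in uncertainty_index:
    print "index %d: true=%d, predicted=%d" % (idx, y[idx], lp_model.transduction_[idx])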
Code Example #26
File: develop.py Project: yangwf1199/AuthorProfiling
            y_.extend(y_test)
            prediction_.extend(prediction)

    verbose('----------\n')
    verbose("Evaluation")

    if opts.mode in ['age', 'gender']:
        from sklearn.metrics.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
        # Computing performance metrics
        print('Accuracy              :', accuracy_score(y_, prediction_))
        print('Precision             :', precision_score(y_, prediction_))
        print('Recall                :', recall_score(y_, prediction_))
        print('F-score               :', f1_score(y_, prediction_))
        print('\nClassification report:\n',
              classification_report(y_, prediction_))
        print('\nConfusion matrix    :\n', confusion_matrix(y_, prediction_))
    else:
        from sklearn.metrics.metrics import mean_absolute_error, mean_squared_error, r2_score
        print('Mean Abs Error        :', mean_absolute_error(y_, prediction_))
        print('Mean Sqr Error        :', mean_squared_error(y_, prediction_))
        print('R2 Error              :', r2_score(y_, prediction_))

    #plots:
    #import matplotlib.pyplot as plt
    #confusion_matrix_plot = confusion_matrix(y_test, prediction)
    #plt.title('confusion matrix')
    #plt.colorbar()
    #plt.xlabel('true category')
    #plt.ylabel('predicted category')
    #plt.show()
Code Example #27
File: utils.py Project: 1987hasit/BoVW_Action
def plotConfusionMatrix(trueLabels, testPredLabels, 
                        saveFilename, normalization = False):
    """ Plot confusion matrix using true labels and prediction labels 
        normalization: True, accuracy of each class
                       False, number of results
    """
    
    # Calculate confusion matrix
    cm = confusion_matrix(trueLabels, testPredLabels)
    if normalization:
        # if normalizing, convert counts to per-class (row) rates
        cm = cm.astype(float) / cm.sum(axis=1)[:, np.newaxis]
    
    # confusion_matrix sorts the classes, so sort the tick labels to match
    labels = sorted(set(trueLabels))
    
    
    # Plot confusion matrix
    font = {'size' : 12}
    mplib.rc('font', **font)
    
    fig, ax = plt.subplots()
    fig.subplots_adjust(bottom=0.2)
    ax.set_aspect('equal', adjustable='box')
    
    
    height, width = cm.shape
    
    for x in xrange(width):
        for y in xrange(height):
            
            if normalization:
                # If normalization
                floatNum = cm[x, y]
                annotation = "%.2f" % floatNum
                
                ax.annotate(annotation, xy=(y, x), 
                            horizontalalignment='center',
                            verticalalignment='center')
            else:
                intNum = cm[x, y]
                annotation = str(intNum)
                
                ax.annotate(annotation, xy=(y, x), 
                            horizontalalignment='center',
                            verticalalignment='center')
    
    heatmap = ax.imshow(np.array(cm), cmap=plt.cm.jet, 
                    interpolation='nearest')       
    fig.colorbar(heatmap)
    ax.set_xticks(np.arange(height), minor=False)
    ax.set_yticks(np.arange(width), minor=False)
    ax.set_xticklabels(labels, minor=False, rotation=45)
    ax.set_yticklabels(labels, minor=False)
    ax.set_xlabel('Predicted Labels', fontsize=18)
    ax.set_ylabel('True Labels', fontsize=18)
    
    plt.show()
    fig.savefig(saveFilename)
Code Example #28
#Task 6 
# Map five to 1 and 1 to 0 
y_test[y_test ==1]  = 0
y_test[y_test == 5 ] = 1


y_pred_prob = nb.predict_proba(test_dtm)[:,1]
print metrics.roc_auc_score(y_test, y_pred_prob)
#Task 7
import matplotlib.pyplot as plt
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')

#Task 8
print metrics.confusion_matrix(y_test, y_pred)
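# sensitivity = TP / (TP + FN) and specificity = TN / (TN + FP);
# the constants below are read off the printed confusion matrix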
sensitivity = 126 / float(25 + 126)
specificity = 813/ float(813 + 58)
#Task 9 
false_positives = X_test[y_test < y_pred] # false positives

false_negatives = X_test[y_test > y_pred] # false negatives 

#One theory I have for false positives is that the more descriptive language you use, the more the model thinks that it will be rated a 5.

#Task 10
#From the ROC Curve I would say a threshold of .18 would maximize true positive rate
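That read-off can be checked directly; a sketch reusing fpr, tpr and thresholds from Task 7 and maximizing Youden's J (TPR - FPR):

import numpy as np
j = tpr - fpr
best = np.argmax(j)
print 'threshold %.2f gives TPR %.2f at FPR %.2f' % (thresholds[best], tpr[best], fpr[best])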
Code Example #29
File: utils.py Project: jizhihang/BoVW_Action
def plotConfusionMatrix(trueLabels,
                        testPredLabels,
                        saveFilename,
                        normalization=False):
    """ Plot confusion matrix using true labels and prediction labels 
        normalization: True, accuracy of each class
                       False, number of results
    """

    # Calculate confusion matrix
    cm = confusion_matrix(trueLabels, testPredLabels)
    if normalization:
        # if normalizing, convert counts to per-class (row) rates
        cm = cm.astype(float) / cm.sum(axis=1)[:, np.newaxis]

    # confusion_matrix sorts the classes, so sort the tick labels to match
    labels = sorted(set(trueLabels))

    # Plot confusion matrix
    font = {'size': 12}
    mplib.rc('font', **font)

    fig, ax = plt.subplots()
    fig.subplots_adjust(bottom=0.2)
    ax.set_aspect('equal', adjustable='box')

    height, width = cm.shape

    for x in xrange(width):
        for y in xrange(height):

            if normalization:
                # If normalization
                floatNum = cm[x, y]
                annotation = "%.2f" % floatNum

                ax.annotate(annotation,
                            xy=(y, x),
                            horizontalalignment='center',
                            verticalalignment='center')
            else:
                intNum = cm[x, y]
                annotation = str(intNum)

                ax.annotate(annotation,
                            xy=(y, x),
                            horizontalalignment='center',
                            verticalalignment='center')

    heatmap = ax.imshow(np.array(cm), cmap=plt.cm.jet, interpolation='nearest')
    fig.colorbar(heatmap)
    ax.set_xticks(np.arange(height), minor=False)
    ax.set_yticks(np.arange(width), minor=False)
    ax.set_xticklabels(labels, minor=False, rotation=45)
    ax.set_yticklabels(labels, minor=False)
    ax.set_xlabel('Predicted Labels', fontsize=18)
    ax.set_ylabel('True Labels', fontsize=18)

    plt.show()
    fig.savefig(saveFilename)
Code Example #30
    def run_nested_cross_validation(self,
                                    data,
                                    labels,
                                    k=3,
                                    columns=None,
                                    draw_roc=True,
                                    draw_decision_boundaries=True,
                                    classifier_type='logistic',
                                    title_suffix='',
                                    save_path_prefix='',
                                    balance=True,
                                    Cs=None,
                                    error_f=None,
                                    higher_is_better=False,
                                    threshold_p=.5):
        '''
        Run nested CV on data with passed k folds.
        
        If columns is passed, it should be a list of columns to subset from data.
        Otherwise, all of data is used.
        
        Data is assumed to be a pandas dataframe or similar.
        Cs can also be passed as a list of C parameter values to use.
        
        Error_f is a function for measuring error that takes predicted probabilities
        and true labels. Defaults to MSE.
        
        Uses the scikit-learn LogisticRegression library.
        '''
        if columns is not None:
            data_chosen = data[columns]
        else:
            # otherwise, use all of the data (per the docstring)
            data_chosen = data
        Cs = Cs or self.Cs
        error_f = error_f or self.mse

        best_c = 0

        try:
            cv_outer = StratifiedKFold(labels.values, k=k)
        except AttributeError:
            # Labels needs to be a pandas series
            labels = Series(labels)
            cv_outer = StratifiedKFold(labels.values, k=k)

        outer_metric, for_roc = [], []
        for train_outer, test_outer in cv_outer:
            mod = self.get_model(classifier_type=classifier_type)
            c_metric = []
            for c in Cs:
                cv_inner = StratifiedKFold(labels.ix[train_outer].values, k=k)
                mod.set_params(C=c)
                inner_metric = []
                for train_inner, test_inner in cv_inner:
                    # Balance rare classes if necessary:
                    if balance:
                        data_balanced, labels_balanced = self.balance_classes(
                            data_chosen.ix[train_inner],
                            labels.ix[train_inner])
                    else:
                        data_balanced, labels_balanced = data_chosen.ix[
                            train_inner], labels.ix[train_inner]

                    fitted = mod.fit(data_balanced, labels_balanced)

                    # Predict probabilities
                    predicted_probs = fitted.predict_proba(
                        data_chosen.ix[test_inner])

                    err = error_f(predicted_probs,
                                  labels.ix[test_inner].values)
                    inner_metric.append(err)
                error_for_this_c = sum(inner_metric) / len(inner_metric)
                print "Average Error: ", error_for_this_c, ", for C: ", c
                c_metric.append(error_for_this_c)

            best_c = self.get_best_c(Cs,
                                     c_metric,
                                     higher_is_better=higher_is_better)

            # Now that we have selected the best parameter, apply to outer set.
            mod.set_params(C=best_c)

            # Balance rare classes if necessary:
            if balance:
                data_balanced, labels_balanced = self.balance_classes(
                    data_chosen.ix[train_outer], labels.ix[train_outer])
            else:
                data_balanced, labels_balanced = data_chosen.ix[
                    train_outer], labels.ix[train_outer]

            # refit with the selected C on the (balanced) outer training set
            fitted = mod.fit(data_balanced, labels_balanced)
            predicted_probs = fitted.predict_proba(data_chosen.ix[test_outer])
            err = error_f(predicted_probs, labels.ix[test_outer].values)

            print confusion_matrix(labels[test_outer].values,
                                   predicted_probs[:, 1] > threshold_p)

            for_roc.append((labels[test_outer].values, predicted_probs))

            outer_metric.append(err)

        mean_metric = sum(outer_metric) / len(outer_metric)

        print 'Mean Nested CV Error for best c: ', mean_metric, ', C: ', best_c
        print 'Final intercept: ', fitted.intercept_[0]
        try:
            print 'Final columns, coefficients: '
            print zip(columns, fitted.coef_[0])
        except NotImplementedError:
            pass

        num_features = len(data_chosen.columns)
        if draw_decision_boundaries:
            self.draw_decision_boundaries(
                mod,
                data_chosen.columns,
                data_chosen.ix[train_outer].as_matrix(),
                labels.ix[train_outer].values,
                title='Decision Boundaries: ' + title_suffix,
                save_path=save_path_prefix +
                '_{0}_features_decision_boundaries.png'.format(num_features))

        if draw_roc:
            self.draw_roc(for_roc,
                          title='ROC for {1} features, c = {2}: {0}'.format(
                              title_suffix, num_features, best_c),
                          save_path=save_path_prefix +
                          '_{0}_features_roc.png'.format(num_features))
        return mean_metric, best_c, mod
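The default error function self.mse is referenced above but not shown; a plausible definition, assuming the positive class is column 1 of predict_proba:

    def mse(self, predicted_probs, true_labels):
        # hypothetical sketch: mean squared error between P(class 1) and the 0/1 labels
        return ((predicted_probs[:, 1] - true_labels) ** 2).mean()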
Code Example #31
File: develop.py Project: ivanvladimir/sonidero
    ids_ = np.load(opts.IDS)

    le = preprocessing.LabelEncoder()
    le.fit(ids_)
    verbose("Total classes", le.classes_.shape[0])
    ids = le.transform(ids_)

    X_train, X_test, y_train, y_test=\
        train_test_split(feats, ids, test_size=0.20, random_state=42)

    verbose("Training")
    classifier = RandomForestClassifier(n_estimators=opts.estimators,
                                        n_jobs=opts.nprocessors,
                                        max_depth=20,
                                        verbose=True)

    # Training
    classifier.fit(X_train, y_train)

    # Predicting
    verbose("Prediction")
    prediction = classifier.predict(X_test)

    print('Accuracy              :', accuracy_score(y_test, prediction))
    print('Precision             :', precision_score(y_test, prediction))
    print('Recall                :', recall_score(y_test, prediction))
    print('F-score               :', f1_score(y_test, prediction))
    print('\nClassification report:\n',
          classification_report(y_test, prediction))
    print('\nConfusion matrix    :\n', confusion_matrix(y_test, prediction))
Code Example #32
# it stalled with 1000000
# try with more parameters
classifier = RandomForestClassifier(n_estimators=100)
classifier.fit(X_train, y_train)
prediction = classifier.predict(X_test)

#print X_train.shape

from sklearn.metrics.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score

print '\nAccuracy:', accuracy_score(y_test, prediction)
print '\nscore:', classifier.score(X_train, y_train)
print '\nrecall:', recall_score(y_test, prediction)
print '\nprecision:', precision_score(y_test, prediction)
print '\n classification report:\n', classification_report(y_test, prediction)
print '\n confusion matrix:\n', confusion_matrix(y_test, prediction)

#plots:

import matplotlib.pyplot as plt
confusion_matrix_plot = confusion_matrix(y_test, prediction)
plt.matshow(confusion_matrix_plot)  # draw the matrix first so colorbar has an image
plt.title('confusion matrix')
plt.colorbar()
plt.xlabel('true category')
plt.ylabel('predicted category')
plt.show()

# as a fix
# import numpy as np
# scores = cross_val_score(classifier, X_train, y_train, cv=5)
Code Example #33
classifier.fit(X_train, y_train)
prediction = classifier.predict(X_test)


#print X_train.shape

from sklearn.metrics.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score



print '\nAccuracy:', accuracy_score(y_test, prediction)
print '\nscore:', classifier.score(X_train, y_train)
print '\nrecall:', recall_score(y_test, prediction)
print '\nprecision:', precision_score(y_test, prediction)
print '\n classification report:\n', classification_report(y_test, prediction)
print '\n confusion matrix:\n', confusion_matrix(y_test, prediction)

#plots:

import matplotlib.pyplot as plt
confusion_matrix_plot = confusion_matrix(y_test, prediction)
plt.matshow(confusion_matrix_plot)  # draw the matrix first so colorbar has an image
plt.title('confusion matrix')
plt.colorbar()
plt.xlabel('true category')
plt.ylabel('predicted category')
plt.show()

# as a fix
# import numpy as np
# scores = cross_val_score(classifier, X_train, y_train, cv=5)
Code Example #34
File: feature_classifier.py Project: karmel/glasslab
                    best_chosen = chosen
                    best_mod = mod
                if force_choice: break

            print "Best number of features: ", len(best_chosen)
            print "Best features: ", best_chosen
            print "Best C, MSE: ", best_c, best_err

            if force_choice:
                #mod = learner.get_model(classifier_type=classifier_type, C=best_c)
                #fitted = mod.fit(training_data, training_labels)
                test_vectors = test_vectors[best_chosen]
                predicted_probs = best_mod.predict_proba(test_vectors)
                err = learner.mse(predicted_probs, test_labels.values)
                print err
                print confusion_matrix(test_labels.values,
                                       predicted_probs[:, 1] > .5)

                learner.draw_roc(
                    label_sets=[(test_labels.values, predicted_probs)],
                    save_path=learner.get_filename(
                        subdir, 'check_nontrivial_{0}group'.format(rep_str)))


                learner.draw_decision_boundaries(
                    best_mod, best_chosen,
                    test_vectors.as_matrix(),
                    test_labels.values,
                    title='Decision Boundaries: ' + (replicate_id and 'Group {0}'.format(replicate_id) or 'Overall'),
                    force_lim=[-3, 3, -3, 3],
                    save_path=learner.get_filename(subdir, 'plot_{0}group'.format(rep_str))
                    + '_check_non_trivial_decision_boundaries.png')
Code Example #35
File: randomforest.py Project: melanoma/tesis
	for tweet in reader[0:2*(numironicos/3)]:
		tweets_train.append(tweet["text"])
		labels_train.append("noironia")
	for tweet in reader[2*(numironicos/3):]:
		tweets_test.append(tweet["text"])
		labels_test.append("noironia")

stop_words = []
f = open("spanish.txt") 
for line in f:
	stop_words.append(line.strip())

f.close()

y_train = np.array(labels_train, dtype=object) 
y_test = np.array(labels_test, dtype=object) 

vectorizer = TfidfVectorizer(input='content', max_df=0.5, stop_words = stop_words)
X_train = vectorizer.fit_transform(np.array(tweets_train, dtype=object))
X_test = vectorizer.transform(np.array(tweets_test, dtype=object))
classifier = RandomForestClassifier(n_estimators = 10)
classifier.fit(X_train.toarray(), y_train)
prediction = classifier.predict(X_test.toarray())

print '\nAccuracy :', accuracy_score(y_test, prediction)
print '\nPrecision :', precision_score(y_test, prediction)
print '\nRecall :', recall_score(y_test, prediction)
print '\nF-score :', f1_score(y_test, prediction)
print '\nClassification report:\n', classification_report(y_test, prediction)
print '\nConfusion matrix :\n', confusion_matrix(y_test, prediction)
Code Example #36


    verbose('----------\n')
    verbose("Evaluation")

    if opts.mode in ['age','gender']:
        from sklearn.metrics.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
        # Computing performance metrics
        print( 'Accuracy              :', accuracy_score(y_, prediction_))
        print( 'Precision             :', precision_score(y_, prediction_))
        print( 'Recall                :', recall_score(y_, prediction_))
        print( 'F-score               :', f1_score(y_, prediction_))
        print( '\nClassification report:\n', classification_report(y_,
                prediction_))
        print( '\nConfusion matrix    :\n', confusion_matrix(y_, prediction_))
    else:
        from sklearn.metrics.metrics import mean_absolute_error, mean_squared_error,r2_score
        print( 'Mean Abs Error        :', mean_absolute_error(y_, prediction_))
        print( 'Mean Sqr Error        :', mean_squared_error(y_, prediction_))
        print( 'R2 Error              :', r2_score(y_, prediction_))


    #plots:
    #import matplotlib.pyplot as plt
    #confusion_matrix_plot = confusion_matrix(y_test, prediction)
    #plt.title('confusion matrix')
    #plt.colorbar()
    #plt.xlabel('true category')
    #plt.ylabel('predicted category')
Code Example #37
def load_data(dataset):
    f = gzip.open(dataset, 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()

    train_set_x, train_set_y = train_set
    valid_set_x, valid_set_y = valid_set
    test_set_x, test_set_y = test_set

    rval = [(train_set_x, train_set_y),
            (valid_set_x, valid_set_y),
            (test_set_x, test_set_y)]
    return rval

if __name__ == "__main__":
    datasets = load_data('mnist.pkl.gz')

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    print train_set_x.shape
    print train_set_y.shape

    logreg = linear_model.LogisticRegression()
    logreg.fit(train_set_x, train_set_y)
    predictions = logreg.predict(test_set_x)
    print confusion_matrix(test_set_y, predictions)
    print classification_report(test_set_y, predictions)
Code Example #38
	vect__norm: 'l2'
	vect__use_idf: True
"""
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics.metrics import precision_score, recall_score, confusion_matrix

__author__ = 'gavin'
import pandas as pd

df = pd.read_csv('sms/sms.csv')

X_train_r, X_test_r, y_train, y_test = train_test_split(
    df['message'], df['label'])

vectorizer = TfidfVectorizer(max_df=0.5,
                             max_features=None,
                             ngram_range=(1, 1),
                             norm='l2',
                             use_idf=True)
X_train = vectorizer.fit_transform(X_train_r)
X_test = vectorizer.transform(X_test_r)
classifier = LogisticRegression(penalty='l2', C=7)
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
print 'score', classifier.score(X_test, y_test)
print 'precision', precision_score(y_test, predictions)
print 'recall', recall_score(y_test, predictions)
print confusion_matrix(y_test, predictions)
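One caveat not in the original: if the label column holds strings (e.g. 'ham'/'spam') rather than 0/1, precision_score and recall_score need an explicit positive class:

print precision_score(y_test, predictions, pos_label='spam')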