def runSGDPipeline(entries, langs):
	t0 = time()
	sgd_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=(1,1), max_features=n_features)),
                      ('tfidf', TfidfTransformer(use_idf=True)),
                      ('clf', SGDClassifier(loss='squared_hinge', penalty='l2',
                                            alpha=0.001, n_iter=5, random_state=42))])

	vect = CountVectorizer(ngram_range=(1,1), max_features=n_features)
	X_train_counts = vect.fit_transform(entries)
	tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
	X_train_tfidf = tfidf.transform(X_train_counts)

	clf = SGDClassifier(loss='squared_hinge', penalty='l2', alpha=0.001, n_iter=5, random_state=42)
	clf.fit(X_train_tfidf, langs)

	X_new_counts = vect.transform(entries)
	X_new_tfidf = tfidf.transform(X_new_counts)
	predicted = clf.predict(X_new_tfidf.toarray())

	print(np.mean(predicted == langs))
	print(metrics.classification_report(langs, predicted, target_names=sorted(set(langs))))
	print(metrics.confusion_matrix(langs, predicted))
	print("Took %s seconds." % (time()-t0))
	print("n_samples: %d, n_features: %d" % X_train_tfidf.shape)
	return sgd_pipeline
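# Note (added): runSGDPipeline builds sgd_pipeline but never fits it; the manual
# vect/tfidf/clf steps above are equivalent to sgd_pipeline.fit(entries, langs)
# followed by sgd_pipeline.predict(entries). The same applies to the other
# run*Pipeline helpers below.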
def runSVCPipeline(entries, langs):
	t0 = time()
	svc_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=(1,1), max_features=n_features)),
                      ('tfidf', TfidfTransformer(use_idf=True)),
                      ('clf', LinearSVC(dual=False, loss='squared_hinge', max_iter=100, random_state=42))])

	vect = CountVectorizer(ngram_range=(1,1), max_features=n_features)
	X_train_counts = vect.fit_transform(entries)
	tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
	X_train_tfidf = tfidf.transform(X_train_counts)

	clf = LinearSVC(dual=False, loss='squared_hinge', max_iter=100, random_state=42)
	clf.fit(X_train_tfidf, langs)

	X_new_counts = vect.transform(entries)
	X_new_tfidf = tfidf.transform(X_new_counts)
	predicted = clf.predict(X_new_tfidf.toarray())

	print(np.mean(predicted == langs))
	print(metrics.classification_report(langs, predicted, target_names=sorted(set(langs))))
	print(metrics.confusion_matrix(langs, predicted))
	print("Took %s seconds." % (time()-t0))
	print("n_samples: %d, n_features: %d" % X_train_tfidf.shape)
	return svc_pipeline
Example #3
def main():
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(stop_words='english')),
        ('clf', LogisticRegression())
    ])
    parameters = {
        'vect__max_df': (0.25, 0.5),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__use_idf': (True, False),
        'clf__C': (0.1, 1, 10),
    }
    df = pd.read_csv('data/train.tsv', header=0, delimiter='\t')
    X, y = df['Phrase'], df['Sentiment'].as_matrix()
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print 'Best score: %0.3f' % grid_search.best_score_
    print 'Best parameters set:'
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print '\t%s: %r' % (param_name, best_parameters[param_name])
    predictions = grid_search.predict(X_test)
    print 'Accuracy:', accuracy_score(y_test, predictions)
    print 'Confusion Matrix:', confusion_matrix(y_test, predictions)
    print 'Classification Report:', classification_report(y_test, predictions)
Example #4
def bagOfWords(dataset):
    sp = SetProcessing()
    datalist = sp.convertDataToList(dataset)
    japanese, korean, mandarin = sp.organizeEasternLanguages(datalist)
    datalist = datalist[870:970]
    pairs = sp.buildSpeakingLearningPairs(datalist)
    print(pairs)
    entries = []
    langs = []
    korean = korean[:10]
    japanese = japanese[:10]

    for s in korean:
        datalist.append(s)
    for fr in japanese:
        datalist.append(fr)

    for data in datalist:
        entries.append(data[sp.ENTRY])
        langs.append(data[sp.SPEAKING])

    print(langs)

    vect = CountVectorizer()
    X_train_counts = vect.fit_transform(entries)
    tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tfidf = tfidf.transform(X_train_counts)
    X_train_tfidf = X_train_tfidf.toarray()

    tree = SGDClassifier()
    tree.fit(X_train_tfidf, langs)
    result = tree.predict(X_train_tfidf)
    print(np.mean(result == langs))
    print(metrics.classification_report(langs, result, target_names=sorted(set(langs))))
def vectorize(training_data, test_data,approach):
    """Function that creates vectors from the training and test data for the SVM,
    trains the SVM and makes predictions on the test data.
    Uses the svm module from scikit.

    Parameters
    ----------
    training_data : A list of lists of the format
                    [named_pair, relationship, list of features].
    test_data : A list of lists of the format [named_pair, list of features].
    """
    global corpus, classes
    test_corpus = []
    for data in training_data:
        named_pair = data[0]
        rel_class = data[1]
        tokens = data[2]
        classes.append(rel_class)
        corpus.append(' '.join(tokens).decode("UTF-8",errors="ignore").encode("UTF-8"))
    u_classes = set(classes)
    class_list = list(u_classes)
    vectorizer = TfidfVectorizer(min_df=3, sublinear_tf=True, use_idf=True)
    X = vectorizer.fit_transform(corpus)
    svm = SVC(C=10, gamma=0.0, kernel='linear')
    svm.fit(X, classes)
    if approach==1:
        ind=1
    else:
        ind=2
    for data in test_data:
        named_pair = data[0]
        tokens = data[ind]
        test_corpus.append(' '.join(tokens).decode("UTF-8",errors="ignore").encode("UTF-8"))
    Xtest = vectorizer.transform(test_corpus)
    prediction = svm.predict(Xtest)
    corr1 = 0
    corr2 = 0
    total = 0
    actual=[]
    predict=[]
    for i in range(len(prediction)):
        if test_data[i][0][0]+"|"+test_data[i][0][1] in relation_dict:
            v=relation_dict[test_data[i][0][0]+"|"+test_data[i][0][1]]
        else:
            v="NA"
        if v=="NA":
            continue
        if prediction[i] == v and prediction[i] in class_list:
            corr1 += 1
        elif prediction[i] == v:
            corr2 += 1
        total += 1
        actual.append(v)
        predict.append(prediction[i])
        print str(i)+":"+test_data[i][0][0]+":"+test_data[i][0][1]+":"+prediction[i]+":"+v
    res=open("Result_approach"+str(approach)+".txt","w")
    res.write("Technique Approach "+str(approach)+"\n")
    res.write(metrics.classification_report(actual, predict))
    res.write("Accuracy:"+str(float(corr1+corr2)/total)+"\n\n")
Example #6
def print_classification_report(y_true, y_pred, title=''):
    """
    Print a classification report.
    """
    print(classification_report(y_true, y_pred))
Example #7
def main():
    pipeline = Pipeline([('vect', TfidfVectorizer()),
                         ('clf', LogisticRegression())])
    parameters = {
        # 'vect__max_df': (0.25, 0.5, 0.75),
        'vect__stop_words': ('english', None),
        # 'vect__max_features': (5000, 10000, None),
        # 'vect__ngram_range': ((1, 1), (1, 2)),
        # 'vect__use_idf': (True, False),
        # 'vect__norm': ('l1', 'l2'),
        # 'clf__penalty': ('l1', 'l2'),
        # 'clf__C': (0.1, 1, 10),
    }
    df = pd.read_csv('movie-reviews/train.tsv', header=0, delimiter='\t')
    X, y = df['Phrase'], df['Sentiment'].as_matrix()
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)
    grid_search = GridSearchCV(pipeline,
                               parameters,
                               n_jobs=-1,
                               verbose=1,
                               scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print 'Best score: %0.3f' % grid_search.best_score_
    print 'Best parameters set:'
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print '\t%s: %r' % (param_name, best_parameters[param_name])

    predictions = grid_search.predict(X_test)
    print 'Accuracy:', accuracy_score(y_test, predictions)
    print 'Classification Report:', classification_report(y_test, predictions)

    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_test, predictions)
    print cm

    plt.matshow(cm)
    plt.title('Confusion matrix')
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
    predictions = np.ones(len(predictions)) * 2
    print 'Accuracy:', accuracy_score(y_test, predictions)
    print 'Degenerate Classification Report:', classification_report(
        y_test, predictions)
def main():
    #read in  data, parse into training and target sets
    data = csv_io.read_data("./filtered_classes_musiconly.csv")
    target = np.array( [x[0] for x in data] )
    train = np.array( [x[1:] for x in data] )
    train_scaled = preprocessing.scale(train)

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(train_scaled, target, test_size = 0.5, random_state = 0)
    
    tuned_parameters = [{'kernel': ['rbf'],'gamma': [ 1e-3, 1e-4 ],
                        'C':[1, 10, 100, 1000]},
                       {'kernel': ['linear'], 'C':[1, 10,100, 1000]}]

    scores = [
           ('precision', metrics.precision_score),
           ('recall', metrics.recall_score),
    ]
  
    for score_name, score_func in scores:
        print "Tuning hyper-parameters for %s" % score_name
        print
        clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5, score_func=score_func)
        clf.fit(X_train, y_train)

        print "Best Parameters set found on development set:"
        print 
        print clf.best_estimator_
        print
        print "Grid scores on development set:"
        print
        for params, mean_score, scores in clf.grid_scores_:
            print "%0.3f (+/-%0.03f) for %r" % (
            mean_score, scores.std() / 2, params)
        print
        print "Detailed classification report:"
        print
        print "The model is trained on the full development set."
        print "The scores are computed on the full evaluation set."
        print
        y_true, y_pred = y_test, clf.predict(X_test)
        print metrics.classification_report(y_true, y_pred)
        print
Example #9
 def run_model(self, train_path, test_path):
     trainx, trainy = self.load_data(train_path)
     self.train_model(trainx, trainy)
     testx, testy = self.load_data(test_path)
     predy = self.predict_res(testx)
     accuracy = accuracy_score(testy, predy) 
     label = [1, 0]
     classifier = ['interested', 'nointerested']
     result = classification_report(testy, predy, labels=label, target_names = classifier) + '\naccuracy\t' + str(accuracy)
     print result
def perceptronClassification():
  from sklearn.datasets import fetch_20newsgroups
  from sklearn.metrics.metrics import f1_score, classification_report
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.linear_model import Perceptron

  categories = ['rec.sport.hockey','rec.sport.baseball','rec.autos']
  newsgroups_train = fetch_20newsgroups(subset='train',categories=categories,remove=('headers','footers','quotes'))
  newsgroups_test = fetch_20newsgroups(subset='test',categories=categories,remove=('headers','footers','quotes'))


  vectorizer = TfidfVectorizer()
  X_train = vectorizer.fit_transform(newsgroups_train.data)
  X_test = vectorizer.transform(newsgroups_test.data)

  classifier = Perceptron(n_iter=100,eta0=0.1)
  classifier.fit(X_train,newsgroups_train.target)
  predictions = classifier.predict(X_test)
  print classification_report(newsgroups_test.target,predictions)
Example #11
def main():
    pipeline = Pipeline([
        ('vect', TfidfVectorizer()),
        ('clf', LogisticRegression())
    ])
    parameters = {
        # 'vect__max_df': (0.25, 0.5, 0.75),
        'vect__stop_words': ('english', None),
        # 'vect__max_features': (5000, 10000, None),
        # 'vect__ngram_range': ((1, 1), (1, 2)),
        # 'vect__use_idf': (True, False),
        # 'vect__norm': ('l1', 'l2'),
        # 'clf__penalty': ('l1', 'l2'),
        # 'clf__C': (0.1, 1, 10),
    }
    df = pd.read_csv('movie-reviews/train.tsv', header=0, delimiter='\t')
    X, y = df['Phrase'], df['Sentiment'].as_matrix()
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print 'Best score: %0.3f' % grid_search.best_score_
    print 'Best parameters set:'
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print '\t%s: %r' % (param_name, best_parameters[param_name])

    predictions = grid_search.predict(X_test)
    print 'Accuracy:', accuracy_score(y_test, predictions)
    print 'Classification Report:', classification_report(y_test, predictions)

    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_test, predictions)
    print cm

    plt.matshow(cm)
    plt.title('Confusion matrix')
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
    predictions = np.ones(len(predictions)) * 2
    print 'Accuracy:', accuracy_score(y_test, predictions)
    print 'Degenerate Classification Report:', classification_report(y_test, predictions)
 def Run(self, trainFileDir, testFileDir):
     XTrain, yTrain = self.loadData(trainFileDir)
     self.trainModel(XTrain, yTrain)
     XTest, yTest = self.loadData(testFileDir)
     yPred = self.predict(XTest)
     accuracy = accuracy_score(yTest, yPred)
     #precision, recall, fScore, _ = precision_recall_fscore_support(y, yPred) 
     labels = [1, 0]
     classNames = ['interested', 'notInterested']
     report = classification_report(yTest, yPred, labels=labels, target_names=classNames) + '\naccuracy\t' + str(accuracy)
     print report
Example #13
 def show_report(self, y_predicted):
     y_true = []
     y_predicted_new = []
     
     for i in range(len(self.__labels)):
         if self.__labels[i]=='P':
             y_true.append(1)
         if y_predicted[i]=='positivo':
             y_predicted_new.append(1)
         if self.__labels[i]=='N':
             y_true.append(-1)
         if y_predicted[i]=='negativo':
             y_predicted_new.append(-1)
         if self.__labels[i]=='NEU':
             y_true.append(0)
         if y_predicted[i]=='neutral':
             y_predicted_new.append(0)
     
     print classification_report(y_true, y_predicted_new)
     print confusion_matrix(y_true, y_predicted_new)
 def Run(self, trainFileDir, testFileDir):
     XTrain, yTrain = self.loadData(trainFileDir)
     self.trainModel(XTrain, yTrain)
     XTest, yTest = self.loadData(testFileDir)
     yPred = self.predict(XTest)
     accuracy = accuracy_score(yTest, yPred)
     #precision, recall, fScore, _ = precision_recall_fscore_support(y, yPred)
     labels = [1, 0]
     classNames = ['interested', 'notInterested']
     report = classification_report(
         yTest, yPred, labels=labels,
         target_names=classNames) + '\naccuracy\t' + str(accuracy)
     print report
def main(argv):

    try:
        opts, args = getopt.getopt(argv, "d:c:")

    except getopt.GetoptError:

        sys.exit(2)
    for opt, arg in opts:
        if opt == '-d':
            data_file = arg
        elif opt == '-c':
            label_col = int(arg)

    y_true = np.genfromtxt(data_file,
                           usecols=label_col,
                           delimiter="\t",
                           skip_header=1)
    for lab in range(2, 9):
        print "lab", lab
        y_pred = np.genfromtxt(data_file,
                               usecols=lab,
                               delimiter="\t",
                               skip_header=1)
        print "The classification report for Algorithm", lab, "is \n"
        #Make classification report
        print metrics.classification_report(y_true, y_pred)
        print "Accuracy: %.6f" % metrics.accuracy_score(y_true, y_pred)
        #Compute specificity from confusion matrix
        cm = confusion_matrix(y_true, y_pred)
        print "Confusion matrix as \n", cm
        tn = int(cm[0, 0])
        fp = int(cm[0, 1])
        print "tn", tn
        print "fp", fp
        s = float(tn) / (tn + fp)
        print "Specificity is", s, "\n"
        print "Matthews correlation coefficient: %.6f" % matthews_corrcoef(
            y_true, y_pred)
Example #16
def perceptronClassification():
    from sklearn.datasets import fetch_20newsgroups
    from sklearn.metrics.metrics import f1_score, classification_report
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import Perceptron

    categories = ['rec.sport.hockey', 'rec.sport.baseball', 'rec.autos']
    newsgroups_train = fetch_20newsgroups(subset='train',
                                          categories=categories,
                                          remove=('headers', 'footers',
                                                  'quotes'))
    newsgroups_test = fetch_20newsgroups(subset='test',
                                         categories=categories,
                                         remove=('headers', 'footers',
                                                 'quotes'))

    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(newsgroups_train.data)
    X_test = vectorizer.transform(newsgroups_test.data)

    classifier = Perceptron(n_iter=100, eta0=0.1)
    classifier.fit(X_train, newsgroups_train.target)
    predictions = classifier.predict(X_test)
    print classification_report(newsgroups_test.target, predictions)
def main():
    #read in  data, parse into training and target sets
    data = csv_io.read_data("./filtered_classes.csv")
    o_target = np.array( [x[0] for x in data] )
    o_train = np.array( [x[1:] for x in data] )

    #Split the data randomly into 80% training and 20% test
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(o_train,o_target, test_size = 0.20)

    print str(len(o_target))
    print str(len(y_test))
    #Compute the most frequent class in the training set
    H = histogram(y_train)
    mc = max(H.iteritems(), key=operator.itemgetter(1))[0]
    print str(H)
    print str(mc)

    y_predict = np.empty(len(y_test))
    y_predict[:] = mc

    #print str(y_predict)

    print metrics.classification_report(y_test, y_predict)
    print str(metrics.accuracy_score(y_test, y_predict))
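
    # A hedged aside (my addition, assuming sklearn.dummy is available in this
    # version): DummyClassifier reproduces the same most-frequent-class baseline
    # without the manual histogram.
    from sklearn.dummy import DummyClassifier
    baseline = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
    print metrics.classification_report(y_test, baseline.predict(X_test))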
def movieReviewsMultiClassClassification():
    import pandas as pd
    df = pd.read_csv('./data/trainMovieSentiment.tsv',
                     header=0,
                     delimiter='\t')
    print df.count()
    print df['Phrase'].head(10)
    print df['Sentiment'].describe()
    print df['Sentiment'].value_counts()
    print df['Sentiment'].value_counts() / df['Sentiment'].count()

    #training with sckit classifier
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model.logistic import LogisticRegression
    from sklearn.cross_validation import train_test_split
    from sklearn.pipeline import Pipeline
    from sklearn.metrics.metrics import classification_report, accuracy_score, confusion_matrix
    from sklearn.grid_search import GridSearchCV

    pipeline = Pipeline([('vect', TfidfVectorizer(stop_words='english')),
                         ('clf', LogisticRegression())])

    parameters = {
        'vect__max_df': (0.25, 0.5),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__use_idf': (True, False),
        'clf__C': (0.01, 1, 10)
    }

    X, y = df['Phrase'], df['Sentiment'].as_matrix()
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)
    grid_search = GridSearchCV(pipeline,
                               parameters,
                               n_jobs=3,
                               verbose=1,
                               scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print 'Best score %0.3f' % grid_search.best_score_
    print 'Best params set:'
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print '\t%s: %r' % (param_name, best_parameters[param_name])

    predictions = grid_search.predict(X_test)
    print 'accuracy:', accuracy_score(y_test, predictions)
    print 'confusion matrix:', confusion_matrix(y_test, predictions)
    print 'classification report:', classification_report(y_test, predictions)
def evaluate(df):
    X = df.ix[:,0:7]
    y = df["seed"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    print len(X_train)

    y_test = np.array(y_test)
    clf = LogisticRegression()
    clf.fit(X_train,y_train)

    print "------------",clf.predict_proba(X_test)
    print clf.get_params()

    pipeline = Pipeline([('clf', LogisticRegression())])

    parameters = {}
    grid_search = GridSearchCV(pipeline,parameters,n_jobs=1,verbose=1)

    grid_search.fit(X_train,y_train)


    print "Best score:",grid_search.best_score_
    print "Best parameters set:"
    best_parameters = grid_search.best_estimator_.get_params()

    for param_name in sorted(parameters.keys()):
        print (param_name,best_parameters[param_name])

    prediction = grid_search.predict(X_test)
    for i,pred in enumerate(prediction):
        print "original:",y_test[i],"predicted",pred
    print grid_search.score(X_test,y_test)
    print accuracy_score(y_test,prediction)
    print "classification_report",classification_report(y_test,prediction)
    clf_pred = clf.predict(X_test)
    for i,pred in enumerate(clf_pred):
        print "original:",y_test[i],"predicted",pred
    print accuracy_score(y_test,clf_pred)
    print  clf.score(X_test,y_test)
Example #20
def Classify(txtList, txtLabels, fileName, labelList):
    x_train = np.array(txtList[0:300])
    y_train = np.array(txtLabels[0:300])
    x_test = np.array(txtList[301:])
    y_test = np.array(txtLabels[301:])
    classifier = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(LinearSVC()))])
    classifier.fit(x_train, y_train)
    predicted = classifier.predict(x_test)
    f=open(fileName,'w')
    f.writelines(metrics.classification_report(y_test, predicted,target_names=labelList))
    f.write('\nNumber of Labels:'+str(len(labelList)))
    f.write('\nhamming loss : '+str(metrics.hamming_loss(y_test,predicted)))
    f.write('\nf-beta(beta=0.5 - biased towards Precision) : '+str(metrics.fbeta_score(y_test,predicted,0.5)))
    f.write('\nzero-loss:'+str(zero_one_loss(y_test,predicted)))
    f.write('\nAccuracy score:'+str(metrics.accuracy_score(y_test,predicted)))
    f.close()
def evaluate(df):
    X = df.ix[:, 0:7]
    y = df["seed"]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=42)
    print len(X_train)

    y_test = np.array(y_test)
    clf = LogisticRegression()
    clf.fit(X_train, y_train)

    print "------------", clf.predict_proba(X_test)
    print clf.get_params()

    pipeline = Pipeline([('clf', LogisticRegression())])

    parameters = {}
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=1, verbose=1)

    grid_search.fit(X_train, y_train)

    print "Best score:", grid_search.best_score_
    print "Best parameters set:"
    best_parameters = grid_search.best_estimator_.get_params()

    for param_name in sorted(parameters.keys()):
        print(param_name, best_parameters[param_name])

    prediction = grid_search.predict(X_test)
    for i, pred in enumerate(prediction):
        print "original:", y_test[i], "predicted", pred
    print grid_search.score(X_test, y_test)
    print accuracy_score(y_test, prediction)
    print "classification_report", classification_report(y_test, prediction)
    clf_pred = clf.predict(X_test)
    for i, pred in enumerate(clf_pred):
        print "original:", y_test[i], "predicted", pred
    print accuracy_score(y_test, clf_pred)
    print clf.score(X_test, y_test)
def baseline(test_data,approach): # baseline method implementation
    correct=0
    total=0
    actual=[]
    pred=[]
    v = ""
    if approach==1:
        ind=1
    else:
        ind=2
    for pt in test_data:
        syn_list=[]
        if pt[0][0]+"|"+pt[0][1] in relation_dict:
            v=relation_dict[pt[0][0]+"|"+pt[0][1]]
            if v in relation_synonyms:
                    syn_list=relation_synonyms[v]
            else:
                    syn_list=[v]
        else:
            v="NA"
        made=False
        if v=="NA":
            continue
        for x in syn_list:
            if x in pt[ind]:
                pred.append(v)
                correct+=1
                made=True
        if made==False:
            pred.append("NA")
        actual.append(v)
        total+=1
    res=open("Result_approach"+str(approach)+".txt","a")
    res.write("Technique:Baseline\n")
    res.write(metrics.classification_report(actual, pred))
    res.write("Accuracy: "+str(float(correct)/total)+"\n")
def runTreePipeline(entries, langs):
	t0 = time()
	tree_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=(1,1), max_features=n_features)),
                      ('tfidf', TfidfTransformer(use_idf=True)),
                      ('clf', DecisionTreeClassifier(max_features=n_features))])

	vect = CountVectorizer(ngram_range=(1,1), max_features=n_features)
	X_train_counts = vect.fit_transform(entries)
	tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
	X_train_tfidf = tfidf.transform(X_train_counts)

	clf = DecisionTreeClassifier(max_features=n_features)
	clf.fit(X_train_tfidf, langs)

	X_new_counts = vect.transform(entries)
	X_new_tfidf = tfidf.transform(X_new_counts)
	predicted = clf.predict(X_new_tfidf.toarray())

	print(np.mean(predicted == langs))
	print(metrics.classification_report(langs, predicted, target_names=sorted(set(langs))))
	print(metrics.confusion_matrix(langs, predicted))
	print("Took %s seconds." % (time()-t0))
	print("n_samples: %d, n_features: %d" % X_train_tfidf.shape)
	return tree_pipeline
Example #25
# it stalled with 1000000
# try with more parameters
classifier = RandomForestClassifier(n_estimators=100)
classifier.fit(X_train, y_train)
prediction = classifier.predict(X_test)

#print X_train.shape

from sklearn.metrics.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score

print '\nAccuracy:', accuracy_score(y_test, prediction)
print '\nscore:', classifier.score(X_train, y_train)
print '\nrecall:', recall_score(y_test, prediction)
print '\nprecision:', precision_score(y_test, prediction)
print '\n classification report:\n', classification_report(y_test, prediction)
print '\n confusion matrix:\n', confusion_matrix(y_test, prediction)

#plots:

import matplotlib.pyplot as plt
confusion_matrix_plot = confusion_matrix(y_test, prediction)
plt.matshow(confusion_matrix_plot)
plt.title('Confusion matrix')
plt.colorbar()
plt.xlabel('True category')
plt.ylabel('Predicted category')
plt.show()

# as an array
# import numpy as np
Example #26
 [    5   409  9024  6693   440]
 [    1    88  1112  2529   853]]
Classification Report:              precision    recall  f1-score   support

          0       0.59      0.13      0.21      3483
          1       0.51      0.28      0.36     13711
          2       0.64      0.90      0.75     39682
          3       0.56      0.40      0.47     16571
          4       0.62      0.19      0.29      4583

avg / total       0.59      0.61      0.57     78030
"""
predictions = grid_search.predict(X_test)
print 'Accuracy:', accuracy_score(y_test, predictions)
print 'Confusion Matrix:', confusion_matrix(y_test, predictions)
print 'Classification Report:', classification_report(y_test, predictions)


################# Sample 12 #################
# Applying Multi-label Classification
"""

"""


################# Sample 13 #################
# Multi-Label Classification Performance Metrics
"""
>>> import numpy as np
>>> from sklearn.metrics import hamming_loss
>>> print hamming_loss(np.array([[0.0, 1.0], [1.0, 1.0]]), np.array([[0.0, 1.0], [1.0, 1.0]]))
Example #27
# Split the data set into two subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

param_grid = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]

scores = ['precision', 'recall']

for score in scores:
    print '\nTuning hyper parameters for %s\n' % score

    # Define a classifier
    clf = GridSearchCV(svm.SVC(), param_grid, cv=5, n_jobs=-1, scoring=score)
    clf.fit(X_train, y_train)

    print 'Best parameters set found on development set:\n'
    print clf.best_estimator_

    print 'Grid scores on development set:\n'
    for params, mean_score, scores in clf.grid_scores_:
        print ("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params))

    print '\nDetailed classification report\n'
    print 'The model is trained on the full development set.'
    print 'The scores are computed on the full evaluation set\n'

    y_true, y_pred = y_test, clf.predict(X_test)
    print classification_report(y_true, y_pred)
def print_classification_report(y_test_report, y_predicted_report,target_names):
    #target_names = ['class 0', 'class 1']
    print ("overall accuracy score of the classifier is")
    print accuracy_score(y_test_report, y_predicted_report)
    print(classification_report(np.array(y_test_report), np.array(y_predicted_report), target_names=target_names))
    return None
plt.title('Kittens and Adult Cats')
plt.show()

#Perceptron 
categories = ['rec.sport.hockey', 'rec.sport.baseball', 'rec.autos']
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, remove=('headers', 'footers', 'quotes'))

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(newsgroups_train.data)
X_test = vectorizer.transform(newsgroups_test.data)

classifier = Perceptron(n_iter=100, eta0=0.1)
classifier.fit(X_train, newsgroups_train.target)
predictions = classifier.predict(X_test)
print classification_report(newsgroups_test.target, predictions)
"""
Output seen
             precision    recall  f1-score   support

          0       0.89      0.87      0.88       396
          1       0.87      0.78      0.82       397
          2       0.79      0.88      0.83       399

avg / total       0.85      0.85      0.85      1192
"""

#plot the output
import matplotlib
matplotlib.use('Qt4Agg')
Example #30
def main(argv):
    
    
    # get options passed at command line
    
    try:
        opts, args = getopt.getopt(argv, "d:o:c:C:t:m:")

    except getopt.GetoptError:
        
        #print helpString
        
        sys.exit(2)
    # print opts
    for opt, arg in opts:
    
        if opt == '-d':
        
            data_file = arg
        
        elif opt == '-o':
            
            out_folder = arg
    
        elif opt == '-c':
            
            label_col = int(arg)
        
        elif opt == '-C':
            
            data_cols = arg
        
        elif opt == '-t':
            
            test_file = arg  #Whole genome prediction file

        elif opt == '-m':
            model_file = arg

    model_filename = os.path.abspath(model_file)
    data_file = os.path.abspath(data_file)
    test_file = os.path.abspath(test_file)
    print model_file, "\n"
    data_cols = [int(x) for x in data_cols.split(",")]
    x_data = np.loadtxt(data_file, usecols=data_cols, delimiter = "\t", skiprows=1)
    y_data = np.genfromtxt(data_file,  usecols = label_col, delimiter = "\t", skip_header=1)
    test_x_data = np.loadtxt(test_file, usecols=data_cols, delimiter = "\t", skiprows=1)
    test_y_data = np.genfromtxt(test_file,  usecols = label_col, delimiter = "\t", skip_header=1)
    
    #Load the model file#
    estimator = joblib.load(model_filename)

    #perform same scaling on training and testing data
    x_data, test_x_data = scaling_training_testing_data(x_data, test_x_data)
    np.random.seed(0)
    indices = np.random.permutation(len(test_x_data))
    test_x_data = test_x_data[indices]
    test_y_data = test_y_data[indices]
    cols = 0
    with open (test_file,"r") as temp:
        a =  '\n'.join(line.strip("\n") for line in temp)
        b = np.genfromtxt(StringIO(a), usecols = cols, delimiter="\t", dtype=None, skip_header=1)
        enhancer_names_test = b[indices]
    y_FAN_pred = estimator.predict(test_x_data)
    y_score_test = estimator.predict_proba(test_x_data)
    print metrics.classification_report(test_y_data,y_FAN_pred)
    combined_test = zip(enhancer_names_test, test_y_data, y_FAN_pred, y_score_test[:,0], y_score_test[:,1])
    #f = open(out_folder + "/subroutine_RF_FANTOM_FeatureSelected_pred.txt", 'w')
    f = open(out_folder + "/GM12878_FANTOM_RF_FeatureSelected_ROC.txt", 'w')
    f.write("Enhancer_name\tY_true_labels\tY_predicted_labels\tProb_Class0\tProb_class1\n")
    for i in combined_test:
        line = '\t'.join(str(x) for x in i)
        f.write(line + '\n')
    f.close()
    print "Random Forests: On FANTOM, Final Generalization Accuracy: %.6f" %metrics.accuracy_score(test_y_data,y_FAN_pred)
    print metrics.classification_report(test_y_data,y_FAN_pred)
    print "Number of mislabeled points : %d" % (test_y_data != y_FAN_pred).sum()
    print metrics.classification_report(test_y_data,y_FAN_pred)
    print "Random Forests: Final Generalization Accuracy: %.6f" %metrics.accuracy_score(test_y_data,y_FAN_pred)
    # Before we move on, let's look at a key parameter that RF returns, namely
    # feature_importances_. This tells us which features in our dataset seemed to
    # matter the most (although it won't matter much here with only 2 features).
    print estimator.feature_importances_
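
    # A hedged aside (my addition): rank the importances so the most informative
    # feature columns print first.
    for rank, idx in enumerate(np.argsort(estimator.feature_importances_)[::-1]):
        print "rank %d: feature %d, importance %.4f" % (rank, idx, estimator.feature_importances_[idx])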

    # Plot ROC
    roc_plt = plot_roc(estimator, test_x_data, test_y_data, y_FAN_pred)
    #pl.savefig(out_folder + "/subroutine_RF_FeatureSelected_split_test_train_Kfold.svg", transparent=True, bbox_inches='tight', pad_inches=0.2)
    pl.savefig(out_folder + "/GM12878_FANTOM_RF_FeatureSelected_ROC.svg", transparent=True, bbox_inches='tight', pad_inches=0.2)
    roc_plt.show()
Example #31
            y_.extend(y_test)
            prediction_.extend(prediction)

    verbose('----------\n')
    verbose("Evaluation")

    if opts.mode in ['age', 'gender']:
        from sklearn.metrics.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
        # Computing performance
        print('Accuracy              :', accuracy_score(y_, prediction_))
        print('Precision             :', precision_score(y_, prediction_))
        print('Recall                :', recall_score(y_, prediction_))
        print('F-score               :', f1_score(y_, prediction_))
        print('\nClassification report:\n',
              classification_report(y_, prediction_))
        print('\nConfusion matrix    :\n', confusion_matrix(y_, prediction_))
    else:
        from sklearn.metrics.metrics import mean_absolute_error, mean_squared_error, r2_score
        print('Mean Abs Error        :', mean_absolute_error(y_, prediction_))
        print('Mean Sqr Error        :', mean_squared_error(y_, prediction_))
        print('R2 Error              :', r2_score(y_, prediction_))

    #plots:
    #import matplotlib.pyplot as plt
    #confusion_matrix_plot = confusion_matrix(y_test, prediction)
    #plt.title('confusion matrix')
    #plt.colorbar()
    #plt.xlabel('true category')
    #plt.ylabel('predicted category')
            y_.extend(y_test)
            prediction_.extend(prediction)



    verbose('----------\n')
    verbose("Evaluation")

    if opts.mode in ['age','gender']:
        from sklearn.metrics.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score
        # Computing performance
        print( 'Accuracy              :', accuracy_score(y_, prediction_))
        print( 'Precision             :', precision_score(y_, prediction_))
        print( 'Recall                :', recall_score(y_, prediction_))
        print( 'F-score               :', f1_score(y_, prediction_))
        print( '\nClassification report:\n', classification_report(y_,
                prediction_))
        print( '\nConfusion matrix    :\n',confusion_matrix(y_, prediction_))
    else:
        from sklearn.metrics.metrics import mean_absolute_error, mean_squared_error,r2_score
        print( 'Mean Abs Error        :', mean_absolute_error(y_, prediction_))
        print( 'Mean Sqr Error        :', mean_squared_error(y_, prediction_))
        print( 'R2 Error              :', r2_score(y_, prediction_))


    #plots:
    #import matplotlib.pyplot as plt
    #confusion_matrix_plot = confusion_matrix(y_test, prediction)
    #plt.title('confusion matrix')
    #plt.colorbar()
    #plt.xlabel('true category')
Example #33
def print_classification_report(y_true, y_pred, title=''):

    cr = classification_report(y_true, y_pred)
    print cr
Example #34
            for m in [1, 2]:
                print "STARTING CLASSIFICATION"
                clf = runClassificationTest(X_train, y_train, m, featureV, datatype)
                
                predicted= clf.predict(X_test)
                print "Accuracy: %0.3f " % (accuracy_score(y_test,predicted ))
                
                '''
                print "precision ", (precision_score(y_test, clf.predict(X_test), average=None))
                print "recall  ", (recall_score(y_test, clf.predict(X_test), average=None))
                print "F1 Score ", (f1_score(y_test, clf.predict(X_test), average=None))
                '''
                
        
                if datatype == 3:
                    print classification_report(y_test, predicted, target_names=['0','1', '2'], digits=3)
                    print draw_confusion_matrix(y_test, predicted, [0,1,2])
                else:
                    print classification_report(y_test, predicted, target_names=['0','1'], digits=3)
                    print draw_confusion_matrix(y_test, predicted, [0,1])
Example #35
    processed_comment_list = []
    for art in commentList.items():
        for comm in art[1]:
            processed_comment_list.append(comm.body.decode('ascii', 'ignore'))
    features = vectorizer.transform(processed_comment_list)

    y_train = load_numpy_matrix(feature_set_path + r'valueVector' + tag +
                                '_train.npy')
    y_test = load_numpy_matrix(feature_set_path + r'valueVector' + tag +
                               '_test.npy')

    print features.shape
    print y_train.shape
    print y_test.shape

    valueVector = np.concatenate([y_train, y_test])
    print
    print valueVector.shape

    # train_list = [' '.join(sent) for sent in train_list]
    # test_list = [' '.join(sent) for sent in test_list]
    predicted = [float(v) for v in clf.predict(features)]

    print "Accuracy: %0.3f " % (accuracy_score(valueVector, predicted))

    print classification_report(valueVector,
                                predicted,
                                target_names=['0', '1'])
    print draw_confusion_matrix(valueVector, predicted, ['ham', 'spam'])
Example #36
data_to_predict = []
for (i, img) in enumerate(X_to_predict):
    features = image_to_feature_vector(img)
    data_to_predict.append(features)
data_to_predict = np.array(data_to_predict) / 255.0

pred = model_CNN.predict(data_to_predict, batch_size=BS, verbose=1)
pred_cat = np.zeros((len(pred), 1))
for i in range(len(pred)):
    temp = pred[i, :]
    pred_cat[i] = np.argmax(temp)

y_pred = pred_cat
y_true = y_to_predict
from sklearn import metrics
print(metrics.classification_report(y_true, y_pred))
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true, y_pred)
plt.matshow(cm)

# ======= Predict on Test Set based on above Model
testX_reshape = np.reshape(testX, [len(testX), 64, 64, 3])
testX_data = []
for (i, img) in enumerate(testX_reshape):
    features = image_to_feature_vector(img)
    testX_data.append(features)
testX_data = np.array(testX_data) / 255.0

pred = model_CNN.predict(testX_data, batch_size=BS, verbose=1)

pred_cat = np.zeros((len(pred), 1))
		lc.pop(0)
		lc = [float(i) for i in lc]
		x.append(lc)
	f.close()

	pipeline = Pipeline([
		('clf', LogisticRegression())
	])

	parameters = {
		'clf__C': (0.1, 1, 10),
	}	

	X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=0.5)
	
	grid_search = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1, scoring='accuracy')

	grid_search.fit(X_train, y_train)
	print 'Best score: %0.3f' % grid_search.best_score_
	print 'Best parameters set:'
	best_parameters = grid_search.best_estimator_.get_params()
	for param_name in sorted(parameters.keys()):
		print '\t%s: %r' % (param_name, best_parameters[param_name])
	
	predictions = grid_search.predict(X_test)
	print 'Accuracy:', accuracy_score(y_test, predictions)
	print 'Confusion Matrix:'
	print confusion_matrix(y_test, predictions)
	print 'Classification Report:'
	print classification_report(y_test, predictions)
Example #38
                           random_state=0)

#Great, the dataset has 4 classes that we'll try to predict. It's got fairly interesting separation as we can see below.

#Let's visualize the data with a scatter plot
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.BuGn)
plt.show()

# In[3]:

#Great, let's now fit this dataset to the Decision Tree Classifier and see how well it does.
dtree = DecisionTreeClassifier(max_depth=10).fit(
    X, y)  #this parameter defines the maximum depth of the tree
y_pred = dtree.predict(X)

print metrics.classification_report(y, y_pred)

#The report tells us that the overall accuracy of the predicted labels is about 94%. Looking at the data, we can be
#almost certain that this is definitely overfitting. To predict 94% of this dataset correctly, the tree would need to be
#extremely well tuned to the dataset we trained on (for now, the entire X dataset). This will mean that when you expose
#new data to the model, it will not be able to predict so well.

#We can confirm our understanding by doing a train/cv split on the data. Let's define a couple of functions next
#that will help us run this multiple times. We'll begin by doing a 80/20 split on the data below.
X_train, X_test, y_train, y_test = train_test_split(X, y)
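
# A hedged sketch (my addition): refit the same tree on the training split and
# score the held-out split; a large train/test gap confirms the overfitting
# suspected above.
dtree_cv = DecisionTreeClassifier(max_depth=10).fit(X_train, y_train)
print 'train accuracy:', metrics.accuracy_score(y_train, dtree_cv.predict(X_train))
print 'test accuracy :', metrics.accuracy_score(y_test, dtree_cv.predict(X_test))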

# In[4]:

#All right let's do this the right way. We'll use a cross-validation generator to select train and CV datasets to finetune
#parameters such as C (Regularization parameter we saw earlier). These hyperparameters are extremely critical to the model.
#Now, if we tune parameters against the Test dataset, we will end up biasing towards the test set and will once again
Example #39
    ids_ = np.load(opts.IDS)

    le = preprocessing.LabelEncoder()
    le.fit(ids_)
    verbose("Total classes", le.classes_.shape[0])
    ids = le.transform(ids_)

    X_train, X_test, y_train, y_test=\
        train_test_split(feats, ids, test_size=0.20, random_state=42)

    verbose("Training")
    classifier = RandomForestClassifier(n_estimators=opts.estimators,
                                        n_jobs=opts.nprocessors,
                                        max_depth=20,
                                        verbose=True)

    # Training
    classifier.fit(X_train, y_train)

    # Predicting
    verbose("Prediction")
    prediction = classifier.predict(X_test)

    print('Accuracy              :', accuracy_score(y_test, prediction))
    print('Precision             :', precision_score(y_test, prediction))
    print('Recall                :', recall_score(y_test, prediction))
    print('F-score               :', f1_score(y_test, prediction))
    print('\nClasification report:\n',
          classification_report(y_test, prediction))
    print('\nConfussion matrix   :\n', confusion_matrix(y_test, prediction))
np.set_printoptions(threshold=np.nan)
# Configure the UTF-8 output environment
reload(sys)
sys.setdefaultencoding('utf-8')
# Load the stop-word list from a file
stpwrdlst = process_tool.read_stopword("extra_dict/stop_words.txt")
# Load the training set
train_set = joblib.load("wordbag/word_bag1124.data")
print train_set.target_name
# print "fenci"
# process_tool.chinesefenci("test_corpus", "test_token")
# print "train_bag"
# process_tool.train_bags("test_token","test_set.data", "test_wordbag")
# print "test tfidf"
# test_data = process_tool.testset_tfidf("test_wordbag/test_set.data", "extra_dict/stop_words.txt", train_set.vocabulary)
test_data = joblib.load("test_wordbag/test_word_bag.data")
# print "MultinomialNB train"
# clf = MultinomialNB(alpha = 0.001).fit(train_set.tdm, train_set.label)
# joblib.dump(clf,"model/MultinomialNB.model",compress=3)

clf = joblib.load("model/MultinomialNB.model")
print (test_data.tdm).shape
print len(test_data.label)

# print clf.predict(test_data.tdm)
print test_data.target_name
print classification_report(np.array(test_data.label), clf.predict(test_data.tdm),target_names=train_set.target_name)

cm = confusion_matrix(np.array(test_data.label), clf.predict(test_data.tdm))
print cm
categories = ['rec.sport.hockey', 'rec.sport.baseball', 'rec.autos']
newsgroups_train = fetch_20newsgroups(subset='train',
                                      categories=categories,
                                      remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test',
                                     categories=categories,
                                     remove=('headers', 'footers', 'quotes'))

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(newsgroups_train.data)
X_test = vectorizer.transform(newsgroups_test.data)

classifier = Perceptron(n_iter=100, eta0=0.1)
classifier.fit(X_train, newsgroups_train.target)
predictions = classifier.predict(X_test)
print classification_report(newsgroups_test.target, predictions)

################# Example #################
"""

"""
"""
sudo apt-get remove libopenblas-base
openblas (required for video contextualization)
is incompatible with scipy.
"""
import numpy as np
import matplotlib
matplotlib.use('Qt4Agg')
import matplotlib.pyplot as plt
from sklearn.linear_model import Perceptron
Example #42
    temp = line.strip().split(',')
    train_feature.append(map(int,temp[0:-1]))
    train_target.extend(map(int,temp[-1]))
train_data.close()
##test data
test_feature=[]
test_target=[]
for line in test_data:
    temp = line.strip().split(',')
    test_feature.append(map(int,temp[0:-1]))
    test_target.extend(map(int,temp[-1]))
test_data.close()

train_feature = np.array(train_feature)
test_feature = np.array(test_feature)


##OneHotEncoder used
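# (added note) categorical_features lists the column indices to one-hot encode,
# and n_values gives the number of distinct values expected in each such column.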
enc = OneHotEncoder(categorical_features=np.array([1,2,4,5,6,7,8,9,10,11,14,15,16,17,18,21]),n_values=[13,13,9,5,5,13,5,2,13,13,9,31,10,5,2,9])
enc.fit(train_feature)

train_feature = enc.transform(train_feature).toarray()
test_feature = enc.transform(test_feature).toarray()
clf = RandomForestClassifier(n_estimators=10)
clf = clf.fit(train_feature,train_target)

##result
print (clf.predict(test_feature))
target_names = ['losing', 'active']
print (classification_report(test_target, clf.predict(test_feature),target_names=target_names))
def print_classification_report(y_test_report, y_predicted_report, target_names):
    # target_names = ['class 0', 'class 1']
    print ("overall accuracy score of the classifier is")
    print accuracy_score(y_test_report, y_predicted_report)
    print (classification_report(np.array(y_test_report), np.array(y_predicted_report), target_names=target_names))
    return None
Example #44
def main():
    print("Getting features for deleted papers from the database")
    if (os.path.exists("features_deleted.obj")):
        with open("features_deleted.obj", 'r') as loadfile:
            features_deleted = cPickle.load(loadfile)
    else:
        features_deleted = data_io.get_features_db("TrainDeleted")
        with open("features_deleted.obj", 'w') as dumpfile:
            cPickle.dump(features_deleted,
                         dumpfile,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    print("Getting features for confirmed papers from the database")
    if (os.path.exists("features_confirmed.obj")):
        with open("features_confirmed.obj", 'r') as loadfile:
            features_conf = cPickle.load(loadfile)
    else:
        features_conf = data_io.get_features_db("TrainConfirmed")
        with open("features_confirmed.obj", 'w') as dumpfile:
            cPickle.dump(features_conf,
                         dumpfile,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    features = [x[2:] for x in features_deleted + features_conf]
    target = [0 for x in range(len(features_deleted))
              ] + [1 for x in range(len(features_conf))]

    #code for including keywords match feature
    print "adding addtional features..."
    import additional_features as af
    all_features = af.get_keywords_feature()
    kw_deleted, kw_confirmed, _ = all_features
    kw_features = kw_deleted + kw_confirmed
    for i in range(len(features)):
        _, _, ckw = kw_features[i]
        features[i] += (ckw, )

    featuresnp = np.array(features, dtype='float32')
    targetnp = np.array(target, dtype='int32')

    featuresnp -= np.mean(featuresnp, axis=0)
    featuresnp /= np.std(featuresnp, axis=0)

    # Set the parameters by cross-validation
    # Split the dataset in two equal parts
    X_train, X_test, y_train, y_test = train_test_split(featuresnp,
                                                        targetnp,
                                                        test_size=0.3,
                                                        random_state=0)

    tuned_parameters = [{
        'kernel': ['rbf'],
        'gamma': [1e-3, 1e-4],
        'C': [1, 10, 100, 1000]
    }, {
        'kernel': ['linear'],
        'C': [1, 10, 100, 1000]
    }]

    scores = ['precision', 'recall']

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()

        clf = GridSearchCV(SVC(C=1),
                           tuned_parameters,
                           cv=4,
                           scoring=score,
                           n_jobs=4,
                           verbose=2)
        clf.fit(X_train, y_train)

        print("Best parameters set found on development set:")
        print()
        print(clf.best_estimator_)
        print()
        print("Grid scores on development set:")
        print()
        for params, mean_score, scores in clf.grid_scores_:
            print("%0.3f (+/-%0.03f) for %r" %
                  (mean_score, scores.std() / 2, params))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()

        y_true, y_pred = y_test, clf.predict(X_test)
        print(classification_report(y_true, y_pred))
        print()
Example #45
d = {
    d[0]: d[1:]
    for d in [
        l.strip()[9:].split(' ') for l in open('reuters/cats.txt', 'rb')
        if l.startswith('training')
    ]
}
for f in glob.glob(
        '/home/gavin/PycharmProjects/mastering-machine-learning/ch4-logistic_regression/reuters/training/*'
):
    training_id = f[f.rfind('/') + 1:]
    articles.append(' '.join([label.strip() for label in open(f, 'rb')]))
    labels.append(d[training_id])

vectorizer = TfidfVectorizer()
train_len = int(len(articles) * .7)
X_train = vectorizer.fit_transform(articles[:train_len])
X_test = vectorizer.transform(articles[train_len:])

for label in set([label for instance in labels for label in instance][:3]):
    y = [1 if label in instance else 0 for instance in labels]
    print y
    y_train = y[:train_len]
    y_test = y[train_len:]
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    print y_test
    print predictions
    print classification_report(y_test, predictions)
Example #46
y_train[unlabeled_set] = -1
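# (added note) scikit-learn's semi-supervised models treat the label -1 as
# "unlabeled"; the assignment above hides those points' true labels from the fit.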

###############################################################################
# Learn with LabelSpreading
lp_model = label_propagation.LabelSpreading(gamma=0.25, max_iter=5)
lp_model.fit(X, y_train)
predicted_labels = lp_model.transduction_[unlabeled_set]
true_labels = y[unlabeled_set]

cm = confusion_matrix(true_labels, predicted_labels,
        labels=lp_model.classes_)

print "Label Spreading model: %d labeled & %d unlabeled points (%d total)" % \
        (n_labeled_points, n_total_samples - n_labeled_points, n_total_samples)

print metrics.classification_report(true_labels, predicted_labels)

print "Confusion matrix"
print cm

# calculate uncertainty values for each transduced distribution
pred_entropies = stats.distributions.entropy(lp_model.label_distributions_.T)

# pick the top 10 most uncertain labels
uncertainty_index = np.argsort(pred_entropies)[-10:]

###############################################################################
# plot
f = pl.figure(figsize=(7, 5))
for index, image_index in enumerate(uncertainty_index):
    image = images[image_index]
                          class_sep=1.0, random_state=0)

#Great, the dataset has 4 classes that we'll try to predict. It's got fairly interesting separation as we can see below.

#Let's visualize the data with a scatter plot
plt.scatter(X[:,0], X[:,1], c=y, cmap=plt.cm.BuGn)
plt.show()


# In[3]:

#Great, let's now fit this dataset to the Decision Tree Classifier and see how well it does.
dtree = DecisionTreeClassifier(max_depth=10).fit(X,y) #this parameter defines the maximum depth of the tree
y_pred=dtree.predict(X)

print metrics.classification_report(y, y_pred)

#The report tells us that the overall accuracy of the predicted labels is about 94%. Looking at the data, we can be
#almost certain that this is definitely overfitting. To predict 94% of this dataset correctly, the tree would need to be
#extremely well tuned to the dataset we trained on (for now, the entire X dataset). This will mean that when you expose
#new data to the model, it will not be able to predict so well.

#We can confirm our understanding by doing a train/cv split on the data. Let's define a couple of functions next
#that will help us run this multiple times. We'll begin by doing a 80/20 split on the data below.
X_train, X_test, y_train, y_test = train_test_split(X,y)
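
# A hedged sketch (my addition): a cross-validation generator, as discussed
# below, scores the same tree on several train/CV splits instead of a single one.
from sklearn.cross_validation import cross_val_score
print cross_val_score(DecisionTreeClassifier(max_depth=10), X, y, cv=5)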


# In[4]:

#All right let's do this the right way. We'll use a cross-validation generator to select train and CV datasets to finetune
#parameters such as C (Regularization parameter we saw earlier). These hyperparameters are extremely critical to the model.
Example #48
	for tweet in reader[0:2*(numironicos/3)]:
		tweets_train.append(tweet["text"])
		labels_train.append("noironia")
	for tweet in reader[2*(numironicos/3):]:
		tweets_test.append(tweet["text"])
		labels_test.append("noironia")

stop_words = []
f = open("spanish.txt") 
for line in f:
	stop_words.append(line.strip())

f.close()

y_train = np.array(labels_train, dtype=object) 
y_test = np.array(labels_test, dtype=object) 

vectorizer = TfidfVectorizer(input='content', max_df=0.5, stop_words = stop_words)
X_train = vectorizer.fit_transform(np.array(tweets_train, dtype=object))
X_test = vectorizer.transform(np.array(tweets_test, dtype=object))
classifier = RandomForestClassifier(n_estimators = 10)
classifier.fit(X_train.toarray(), y_train)
prediction = classifier.predict(X_test.toarray())

print '\nAccuracy :', accuracy_score(y_test, prediction)
print '\nPrecision :', precision_score(y_test, prediction)
print '\nRecall :', recall_score(y_test, prediction)
print '\nF-score :', f1_score(y_test, prediction)
print '\nClassification report:\n', classification_report(y_test,prediction)
print '\nConfusion matrix :\n',confusion_matrix(y_test, prediction)
Example #49
	#Split the data into training and test sets
	Features_train, Features_test, Labels_train, Labels_test = train_test_split(Features, Labels)

	#Create a pipeline and an instance of DecisionTreeClassifier for grid search.
	#Set 'criterion' to 'entropy' to build the tree using the information gain heuristic.
	# pipeline = Pipeline([('clf', DecisionTreeClassifier(criterion='entropy'))])
	#Replace Decision Tree with Random Forest
	pipeline = Pipeline([('clf', RandomForestClassifier(criterion='entropy'))])

	#Specify the hyperparameter space for grid search
	parameters = {
	'clf__n_estimators' : (5, 10, 20, 50),
	'clf__max_depth' : (50, 150, 250),
	'clf__min_samples_split' : (1, 2, 3),
	'clf__min_samples_leaf' : (1, 2, 3)
	}

	#Set GridSearchCV() to maximize the model's F1 score
	grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='f1')
	grid_search.fit(Features_train, Labels_train)
	print 'Best score: %0.3f' %grid_search.best_score_
	print 'Best parameters set:'
	best_parameters = grid_search.best_estimator_.get_params()
	for param_name in sorted(parameters.keys()):
		print '\t%s %r' %(param_name, best_parameters[param_name])

	predictions = grid_search.predict(Features_test)
	print classification_report(Labels_test, predictions)

def load_data(dataset):
    f = gzip.open(dataset, 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()

    train_set_x, train_set_y = train_set
    valid_set_x, valid_set_y = valid_set
    test_set_x, test_set_y = test_set

    rval = [(train_set_x, train_set_y),
            (valid_set_x, valid_set_y),
            (test_set_x, test_set_y)]
    return rval

if __name__ == "__main__":
    datasets = load_data('mnist.pkl.gz')

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    print train_set_x.shape
    print train_set_y.shape

    logreg = linear_model.LogisticRegression()
    logreg.fit(train_set_x, train_set_y)
    predictions = logreg.predict(test_set_x)
    print confusion_matrix(test_set_y, predictions)
    print classification_report(test_set_y, predictions)
def whole_dataset_train_test(X, y):
    rfpred = RandomForestClassifier().fit(X,y)
    pred = rfpred.predict(X)
    print "When fitted on the whole dataset with selected features, then the classification report is found to be:\n";
    print "Random Forests: Accuracy: %.6f" %metrics.accuracy_score(y,pred)
    print metrics.classification_report(y, pred)