Example #1
0
def train_classifier(texts, y):
    '''
    Fit a vectorizer and a classifier on labelled texts with one pype call.

    Here is a perfect example of the "feel it ... func it" philosophy:

    The pype call uses the function arguments and function body to specify
    three variables: texts, a list of strings; y, a list of floats; and
    vectorizer, a scikit-learn object that vectorizes text.  This reiterates
    the advice that you should use the function body and function arguments
    to declare your scope whenever you can.

    Line-by-line, here we go:

    {'vectorizer':vectorizer.fit,
     'X':vectorizer.transform},

    We build a dict whose first element is the fit vectorizer.  Luckily, the
    'fit' function returns an instance of the trained vectorizer, so we do not
    need to use _do; the result is assigned to the 'vectorizer' key.  Because
    iterating through dictionaries in Python 3.6 preserves the order in which
    the keys were declared, 'fit' is applied to the texts before 'transform'
    is.  We need this instance of the vectorizer to run the classifier on
    unknown texts.

    After this, we apply 'transform' to convert the texts into a training
    matrix keyed by 'X', whose rows are texts and whose columns are words.

    _a('classifier',(Classifier().fit,_['X'],y)),

    Finally, we can build a classifier.  _a, or _assoc, means we are adding a
    key-value pair to the previous dictionary.  The value is a new instance of
    our Classifier, trained through its fit function on the text-word matrix
    'X' and the labels vector y.

    _d('X'),

    Since we don't need the X matrix anymore, we delete it from the returned
    JSON, which now only contains 'vectorizer' and 'classifier', the two
    things we will need to classify unknown texts.

    Args:
        texts: the training texts (a list of strings).
        y: the labels vector (a list of floats).

    Returns:
        The resulting dictionary, holding only the 'vectorizer' and
        'classifier' entries.
    '''
    # Created up front so both dict entries below use the same instance.
    vectorizer = Vectorizer()

    return p(
        texts,
        {
            'vectorizer': vectorizer.fit,
            'X': vectorizer.transform
        },
        _a('classifier', (Classifier().fit, _['X'], y)),
        _d('X'),
    )
Example #2
0
# from sklearn.datasets import fetch_mldata
# from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier as Classifier
from sklearn.metrics import confusion_matrix, accuracy_score
import time

# To make comparisons easier, load a train/test split that was prepared in
# advance.
# NOTE(review): joblib.load unpickles the file, so "mnist" must come from a
# trusted source.  `sklearn.externals.joblib` is gone from recent scikit-learn
# releases -- confirm the pinned version before upgrading.
from sklearn.externals import joblib
data_train, data_test, label_train, label_test = joblib.load("mnist")

# Kept for reference: presumably how the split stored in "mnist" was produced.
# mnist = fetch_mldata("MNIST original", data_home=".")
# data = np.asarray(mnist.data, np.float32)
# data_train, data_test, label_train, label_test = train_test_split(data, mnist.target, test_size=0.2)

classifier = Classifier()
start = time.time()  # start timing the training step
classifier.fit(data_train, label_train)
training_time = time.time() - start

start = time.time()  # start timing the prediction step
result = classifier.predict(data_test)
predict_time = time.time() - start

# Report the elapsed wall-clock seconds for training and prediction.
print(training_time, predict_time)
# Compute the confusion matrix and the accuracy on the test split.
cmat = confusion_matrix(label_test, result)
acc = accuracy_score(label_test, result)
print(cmat)
print(acc)
# Convert train_y to a 1d array to silence sklearn conversion warnings.
# `.values` replaces `.as_matrix()`, which pandas removed in 1.0; with no
# arguments both return the same underlying ndarray.
train_y = train_y.values.reshape((train_y.shape[0],))

test_x = pd.read_csv('test_x.csv')
test_y = pd.read_csv('test_y.csv')

# Using CV, discovered n=526 to be best (rng is seeded, results will not change)
n_estimators = [526]
best_error = float('inf')
best_model = None
best_n = None

# Fit one model per candidate size and keep the one with the lowest
# validation error.
for n in n_estimators:
    clf = Classifier(n_estimators=n,
                     criterion='entropy',
                     n_jobs=-1,
                     random_state=123)
    clf.fit(train_x, train_y)

    predictions = clf.predict(valid_x)
    error = loss(valid_y, predictions)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Validation error: " + str(error))

    if error < best_error:
        best_error = error
        best_model = clf
        best_n = n

print("Best n: " + str(best_n))

# Final predictions on the held-out test features with the selected model.
predictions = best_model.predict(test_x)
Example #4
0
def _stack_word_vecs(samples, features, dimension):
    """Concatenate the ``getWordVecs`` output of every (text, weight) sample."""
    return np.concatenate([
        getWordVecs(text, weight, features, dimension)
        for text, weight in samples
    ])


def train_model():
    """Train, evaluate and persist the TIL classifier.

    Steps, in order:

    1. Count the corpora through ``feat`` and derive the skip factors used to
       subsample the false-positive and decoy sets.
    2. Build the sample list (training corpus, false positives, decoys) and a
       label vector whose first ``TIL_n`` entries are 1.0.
    3. Split 80/20, turn every (text, weight) sample into word vectors, fit a
       ``StandardScaler`` on the training vectors and dump it to
       ``f_norm_scale``.
    4. Fit an ``ExtraTreesClassifier``, print accuracy on the whole test
       split and on its positive / non-positive parts, and dump the
       classifier to ``f_clf``.
    5. Plot the ROC curve of the test split (blocks on ``plt.show()``).

    All progress messages use the single-argument ``print(...)`` form, which
    produces the same output on Python 2 and Python 3.

    Returns:
        None.  Results are printed, written via ``joblib.dump`` and plotted.
    """
    TIL_n = feat.count_TIL_corpus()
    decoy_n = TIL_n * _DECOY_PROPORTION
    FP_n = feat.count_TIL_false_pos()

    wiki_n = feat.count_WIKI_corpus()
    skip_wiki_n = wiki_n // decoy_n

    # Keep the number of false positives in about the same Order-of-Mag
    skip_FP = FP_n // TIL_n
    print("Skipping every {} value in FP".format(skip_FP))

    if FLAG_BUILD_DECOY_LIST:
        build_skip_query(skip_wiki_n)

    print("Loading features")
    features = Word2Vec.load(feat.f_features)
    dimension = 100  # default dimension

    ITR_decoy = query_skip_decoys()

    print("Building training set")
    ITR_train = list(feat.TIL_full_corpus_iter())

    print("Building the false positive set")
    ITR_FP = list(feat.TIL_false_pos_iter(skip_FP))

    print("Building corpus iter")
    ITR = list(feat.chainer(ITR_train, ITR_FP, ITR_decoy))

    # Only the first TIL_n samples are labelled positive.
    # NOTE(review): this assumes the chained sequence starts with the TIL
    # corpus and that it holds exactly TIL_n items -- confirm against `feat`.
    Y = np.zeros(len(ITR))
    Y[:TIL_n] = 1.0

    x_train, x_test, y_train, y_test = train_test_split(
        ITR, Y, test_size=0.2)

    print("Proportion of answers {}/{}".format(y_train.sum(), y_test.sum()))

    print("Calculating the wordVecs for train")
    vec_train = _stack_word_vecs(x_train, features, dimension)

    print("Building the scalar")
    scaler = preprocessing.StandardScaler().fit(vec_train)

    print("Saving the scaler")
    joblib.dump(scaler, f_norm_scale)

    print("Scaling train vectors")
    vec_train = scaler.transform(vec_train)

    print("Calculating the wordVecs for test")
    vec_test = _stack_word_vecs(x_test, features, dimension)

    # The test vectors are scaled with the statistics of the training split.
    print("Scaling test vectors")
    vec_test = scaler.transform(vec_test)

    print("Train size/TP in sample {} {}".format(
        vec_train.shape, (y_train == 1).sum()))
    print("Test  size/TP in sample {} {}".format(
        vec_test.shape, (y_test == 1).sum()))
    print("Training classifer")

    # ExtraTrees is the classifier currently in use.  Alternatives tried
    # earlier: SGDClassifier(loss='log', penalty='l1'),
    # LogisticRegression(C=2500), BayesianRidge, GaussianNB,
    # RandomForestClassifier and BernoulliNB ("seems to be the best... but
    # high FP rate").
    from sklearn.ensemble import ExtraTreesClassifier as Classifier

    clf = Classifier(n_estimators=200, n_jobs=8)  # ExtraTrees
    clf.fit(vec_train, y_train)

    print('Test Accuracy: %.3f' % clf.score(vec_test, y_test))

    # Score the positive and the remaining test samples separately.
    is_positive = y_test > 0
    print('Test Accuracy on TP: %.3f' % clf.score(
        vec_test[is_positive], y_test[is_positive]))
    print('Test Accuracy on FP: %.3f' % clf.score(
        vec_test[~is_positive], y_test[~is_positive]))

    print("Saving the classifer")
    joblib.dump(clf, f_clf)

    # Create ROC curve
    from sklearn.metrics import roc_curve, auc
    import matplotlib.pyplot as plt

    # Column 1 of predict_proba is the probability of the second class,
    # i.e. the positive label 1.0 here.
    pred_probas = clf.predict_proba(vec_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, pred_probas)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='area = %.2f' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.legend(loc='lower right')
    plt.show()
Example #5
0
# Split the (vector, label) pairs produced by TRAINING_ITR into two tuples
# and stack the vectors into a single matrix.
X, Y = zip(*TRAINING_ITR())
X = np.concatenate(X)

TTS = train_test_split
x_train, x_test, y_train, y_test = TTS(X, Y, test_size=0.17)

# Every message below uses the single-argument print(...) form, which prints
# the same text on Python 2 and Python 3.
print("Scaling train vectors")
x_train = scalar.transform(x_train)

print("Scaling text vectors")
x_test = scalar.transform(x_test)

print("Training classifer")
from sklearn.ensemble import ExtraTreesClassifier as Classifier

clf = Classifier(n_estimators=200, n_jobs=8)  # ExtraTrees
clf.fit(x_train, y_train)

print('Test Accuracy: %.3f' % clf.score(x_test, y_test))

# Convert to an ndarray so that `y_test == n` yields an element-wise mask.
y_test = np.array(y_test)
for n, status_name in _INV_STATUS_MAP.items():
    idx = y_test == n
    try:
        score = clf.score(x_test[idx], y_test[idx])
    except Exception:
        # Best effort: a status that cannot be scored is reported as -1.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        score = -1
    print('Test Accuracy on {}: {:0.3f}'.format(status_name, score))

print("")
print("Suggesting some new entries")
    ret, img = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    x.append(np.asarray(img, dtype=np.float64))
    y.append(name.split("_")[0])

# Stack the collected images and labels, then flatten every 150x150 image
# into one feature row.  The explicit (n, 150 * 150) target shape keeps the
# size check of the former (n, 150, 150, 1) reshape: any other image size
# still raises ValueError here.
x, y = np.array(x), np.array(y)
x_data = x.reshape((len(x), 150 * 150))

# Standardise the features; `scaler` stays at module level so the same
# fitted statistics are available after training.
scaler = StandardScaler()
scaler.fit(x_data)
x_train = scaler.transform(x_data)

model = Classifier(n_estimators=100, max_depth=30, random_state=0)
model.fit(x_train, y)


def predict_gesture(img):
    """Predict the gesture label of one image with the module-level ``model``.

    The image is binarised with Otsu's threshold, flattened into a single
    150 * 150 feature row and standardised with the module-level ``scaler``
    that was fitted on the training data.  ``model`` was trained on features
    transformed by that scaler, so the same transform has to be applied
    here; the earlier ``/ 255`` scaling produced features on a different
    scale from the ones the model was fitted on.

    Args:
        img: image passed to ``cv2.threshold`` with the Otsu flag; it must
            contain exactly 150 * 150 values.
            NOTE(review): presumably an 8-bit single-channel image -- confirm
            against the caller.

    Returns:
        The first (and only) element of ``model.predict`` for this image.

    Raises:
        ValueError: if the thresholded image does not hold 150 * 150 values.
    """
    _, binary = cv2.threshold(img, 0, 255,
                              cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    # One sample with 150 * 150 features; reshape raises on any other size.
    features = np.asarray(binary, dtype=np.float64).reshape((1, 150 * 150))
    # Same preprocessing as at training time.
    features = scaler.transform(features)
    return model.predict(features)[0]