Code example #1
File: classifiers.py Project: scsherm/Congress_work
# Assumed imports for the classifiers.py snippets (code examples #1 and #2):
# the project targets the legacy sklearn.cross_validation API, a pre-1.0
# Keras, and the old UnbalancedDataset package (predecessor of
# imbalanced-learn), which exposed SMOTE and UnderSampler with fit_transform.
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import (roc_curve, roc_auc_score, average_precision_score,
                             recall_score, precision_score)
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils
from unbalanced_dataset import SMOTE, UnderSampler


def run_neural_net(X, y):
    print('running neural network...')
    model = Sequential()
    
    # split 80/20 train/test (the legacy splitter object is itself iterable)
    sss = StratifiedShuffleSplit(y, n_iter=1, test_size=0.2, random_state=42)
    for train_index, test_index in sss:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

    # Oversample the unbalanced training set with SMOTE
    sm = SMOTE(kind='regular', verbose=True)
    X_train, y_train = sm.fit_transform(X_train, y_train)

    # One-hot encode the labels for the 2-unit softmax output
    y_train = y_train.reshape(y_train.shape[0], 1)
    y_test = y_test.reshape(y_test.shape[0], 1)
    y_train, y_test = [np_utils.to_categorical(x) for x in (y_train, y_test)]
    # Dense(1000) is a fully-connected layer with 1000 hidden units.
    # In the first layer the expected input shape must be given:
    # here, X.shape[1]-dimensional vectors.
    #tr = ThresholdedReLU(theta=0.3)
    model.add(Dense(input_dim=X.shape[1], output_dim=1000, init='uniform', activation='relu'))
    #model.add(Activation('relu'))
    model.add(Dropout(0.2))
    model.add(Dense(output_dim=1000, init='uniform'))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))
    model.add(Dense(output_dim=1000, init='uniform'))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))
    model.add(Dense(output_dim=2, init='uniform'))
    model.add(Activation('softmax'))
    #sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='categorical_crossentropy', optimizer='adam')  # idiomatic loss for a 2-way softmax output
    model.fit(X_train, y_train, nb_epoch=10, batch_size=200)
    score = model.evaluate(X_test, y_test, show_accuracy=True)
    pred = model.predict_proba(X_test) #get back probabilities
    pred2 = model.predict_classes(X_test) #get back predictions
    fpr, tpr, thresholds = roc_curve(y_test[:,1], pred[:,1])
    
    #get the AUC
    AUC = roc_auc_score(y_test[:,1], pred[:,1])
    
    #get the AUC for precision and recall curve
    AUC2 = average_precision_score(y_test[:,1], pred[:,1])
    recall = recall_score(y_test[:,1], pred2)
    precision = precision_score(y_test[:,1], pred2)
    print(score)
    return model, X_train, y_train, X_test, y_test, score
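
A minimal, hypothetical call site for run_neural_net (the synthetic data below is illustrative, not from the original project): X must be a NumPy feature matrix and y a 0/1 label vector so that the fancy indexing in the split works.

import numpy as np

# Illustrative synthetic data: 1,000 samples, 20 features, ~10% positives
rng = np.random.RandomState(0)
X = rng.rand(1000, 20)
y = (rng.rand(1000) < 0.1).astype(int)
model, X_train, y_train, X_test, y_test, score = run_neural_net(X, y)
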
Code example #2
File: classifiers.py Project: scsherm/Congress_work
# (imports as in code example #1, plus:)
from sklearn.ensemble import RandomForestClassifier


def clf_model(X, y, m_label,
              model=RandomForestClassifier(n_estimators=5000, n_jobs=-1, oob_score=True)):
    '''runs a classifier model for the given model (with parameters)'''
    print('running {}...'.format(model))

    # split 80/20 train/test
    sss = StratifiedShuffleSplit(y, n_iter=1, test_size=0.2, random_state=42)
    for train_index, test_index in sss:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

    # Oversample the unbalanced training set with SMOTE...
    sm = SMOTE(kind='regular', verbose=True)
    X_train, y_train = sm.fit_transform(X_train, y_train)

    # ...then randomly undersample the majority class
    u = UnderSampler()
    X_train, y_train = u.fit_transform(X_train, y_train)

    #fit model
    clf = model
    clf.fit(X_train, y_train)
    pred = clf.predict_proba(X_test) #get back probabilities
    pred2 = clf.predict(X_test) #get back predictions
    fpr, tpr, thresholds = roc_curve(y_test, pred[:,1])

    #get the AUC
    AUC = roc_auc_score(y_test, pred[:,1])

    #get the AUC for precision and recall curve
    AUC2 = average_precision_score(y_test, pred[:,1])
    recall = recall_score(y_test, pred2)
    precision = precision_score(y_test, pred2)

    #plot AUC
    #plt.plot(fpr, tpr, label = '{} AUC = {}'.format(m_label,round(AUC,3)))
    return clf, recall, AUC, precision, AUC2
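
A sketch of how clf_model might be called to compare several classifiers on the same split-and-resample pipeline; the model list and labels are illustrative assumptions, with X and y as in the sketch for example #1.

from sklearn.linear_model import LogisticRegression

# Hypothetical comparison loop over two classifiers
for m_label, m in [('Random Forest',
                    RandomForestClassifier(n_estimators=5000, n_jobs=-1, oob_score=True)),
                   ('Logistic Regression', LogisticRegression())]:
    clf, recall, AUC, precision, AUC2 = clf_model(X, y, m_label, model=m)
    print('{}: ROC-AUC={:.3f}, PR-AUC={:.3f}, recall={:.3f}, precision={:.3f}'.format(
        m_label, AUC, AUC2, recall, precision))
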
Code example #3
# Assumed imports for this snippet; USE_HASHING, USE_CHI2, N_FEATURES,
# training_data, testing_data and y_train are defined earlier in the script.
import numpy as np
from scipy import sparse
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from unbalanced_dataset import SMOTE

# Extract features using a sparse vectorizer
if USE_HASHING:
    vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                   n_features=N_FEATURES, ngram_range=(1, 2))
    X_train = vectorizer.transform(training_data)
else:
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english', ngram_range=(1, 2))
    X_train = vectorizer.fit_transform(training_data)

X_test = vectorizer.transform(testing_data)

# Oversampling: SMOTE needs a numeric target, so map the string labels to ints
y_train_new = [0 if x == "definition" else 1 for x in y_train]
# print y_train_new
sm = SMOTE(kind='regular', verbose=True, ratio=10)
X_train, y_train = sm.fit_transform(X_train.toarray(), np.asarray(y_train_new))
# OS = OverSampler(verbose=True, ratio=10)
# X_train, y_train = OS.fit_transform(X_train.toarray(), np.asarray(y_train_new))
X_train = sparse.csr_matrix(X_train)  # SMOTE returns dense; convert back to sparse
y_train = y_train.tolist()
y_train = ["definition" if x == 0 else "none" for x in y_train]

# mapping from integer feature name to original token string
if USE_HASHING:
    feature_names = None
else:
    feature_names = vectorizer.get_feature_names()

# Extracting best features with chi-squared test
if USE_CHI2:
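    # (The original snippet is truncated at this point. The lines below sketch
    # a typical chi-squared feature-selection step for reference; they are not
    # necessarily the project's own continuation, and N_CHI2_FEATURES is a
    # hypothetical parameter, not from the source.)
    from sklearn.feature_selection import SelectKBest, chi2

    ch2 = SelectKBest(chi2, k=N_CHI2_FEATURES)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    if feature_names is not None:
        feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]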