import numpy as np
from scipy import sparse
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils
from sklearn.cross_validation import StratifiedShuffleSplit      # pre-0.18 sklearn API
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer
from sklearn.metrics import (roc_curve, roc_auc_score, average_precision_score,
                             recall_score, precision_score)
from unbalanced_dataset import SMOTE, UnderSampler               # predecessor of imbalanced-learn


def run_neural_net(X, y):
    '''Trains a feed-forward neural network on X, y and prints test-set metrics.'''
    print 'running neural network...'
    model = Sequential()

    # Split 80/20 train/test, stratified on the class labels
    sss = StratifiedShuffleSplit(y, n_iter=1, test_size=0.2, random_state=42)
    for train_index, test_index in sss:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

    # Oversample the minority class of the unbalanced training set
    sm = SMOTE(kind='regular', verbose=True)
    X_train, y_train = sm.fit_transform(X_train, y_train)

    # One-hot encode the labels for the two-unit softmax output layer
    y_train = y_train.reshape(y_train.shape[0], 1)
    y_test = y_test.reshape(y_test.shape[0], 1)
    y_train, y_test = [np_utils.to_categorical(x) for x in (y_train, y_test)]

    # Three fully connected hidden layers of 1000 units with ReLU activations
    # and 20% dropout; the first layer must declare the expected input shape,
    # i.e. the number of feature columns in X.
    #tr = ThresholdedReLU(theta = 0.3)
    model.add(Dense(input_dim=X.shape[1], output_dim=1000, init='uniform', activation='relu'))
    #model.add(Activation('relu'))
    model.add(Dropout(0.2))
    model.add(Dense(output_dim=1000, init='uniform'))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))
    model.add(Dense(output_dim=1000, init='uniform'))
    model.add(Activation('relu'))
    model.add(Dropout(0.2))
    model.add(Dense(output_dim=2, init='uniform'))
    model.add(Activation('softmax'))

    # Categorical cross-entropy matches the one-hot targets and softmax output
    #sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    model.fit(X_train, y_train, nb_epoch=10, batch_size=200)

    score = model.evaluate(X_test, y_test, show_accuracy=True)
    pred = model.predict_proba(X_test)     # class probabilities
    pred2 = model.predict_classes(X_test)  # hard class predictions

    # ROC and precision-recall metrics on the positive class (column 1)
    fpr, tpr, thresholds = roc_curve(y_test[:, 1], pred[:, 1])
    AUC = roc_auc_score(y_test[:, 1], pred[:, 1])
    AUC2 = average_precision_score(y_test[:, 1], pred[:, 1])  # area under the PR curve
    recall = recall_score(y_test[:, 1], pred2)
    precision = precision_score(y_test[:, 1], pred2)

    print score
    return model, X_train, y_train, X_test, y_test, score
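# A minimal, self-contained usage sketch for run_neural_net (an illustration,
# not part of the original pipeline): synthetic unbalanced data exercises the
# call signature. Runs only when this file is executed directly, and assumes
# the same old Keras / UnbalancedDataset environment as the code above.
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    X_demo = rng.rand(1000, 20)                  # 1000 samples, 20 features
    y_demo = (rng.rand(1000) < 0.1).astype(int)  # ~10% positives: unbalanced
    model, X_tr, y_tr, X_te, y_te, score = run_neural_net(X_demo, y_demo)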
def clf_model(X, y, m_label, model=None):
    '''Runs a classifier model for the given estimator (with its parameters) and
    returns the fitted model plus recall, ROC AUC, precision, and PR AUC.'''
    # Build the default estimator per call to avoid sharing a mutable default argument
    if model is None:
        model = RandomForestClassifier(n_estimators=5000, n_jobs=-1, oob_score=True)
    print 'running {}...'.format(model)

    # Split 80/20 train/test, stratified on the class labels
    sss = StratifiedShuffleSplit(y, n_iter=1, test_size=0.2, random_state=42)
    for train_index, test_index in sss:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

    # Rebalance the training set: SMOTE oversampling of the minority class,
    # then random undersampling of the majority class
    sm = SMOTE(kind='regular', verbose=True)
    X_train, y_train = sm.fit_transform(X_train, y_train)
    u = UnderSampler()
    X_train, y_train = u.fit_transform(X_train, y_train)

    # Fit the model and score the held-out test set
    clf = model
    clf.fit(X_train, y_train)
    pred = clf.predict_proba(X_test)  # class probabilities
    pred2 = clf.predict(X_test)       # hard class predictions

    fpr, tpr, thresholds = roc_curve(y_test, pred[:, 1])
    AUC = roc_auc_score(y_test, pred[:, 1])
    AUC2 = average_precision_score(y_test, pred[:, 1])  # area under the PR curve
    recall = recall_score(y_test, pred2)
    precision = precision_score(y_test, pred2)

    # plot the ROC curve for this model, labelled with m_label
    #plt.plot(fpr, tpr, label = '{} AUC = {}'.format(m_label, round(AUC, 3)))
    return clf, recall, AUC, precision, AUC2
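# A sketch (not from the original script) of how clf_model supports comparing
# several estimators on the same split: any estimator exposing fit / predict /
# predict_proba works as the `model` argument. compare_models is a new helper
# added here for illustration only.
def compare_models(X, y):
    from sklearn.linear_model import LogisticRegression
    candidates = [('random forest', RandomForestClassifier(n_estimators=500, n_jobs=-1)),
                  ('logistic regression', LogisticRegression())]
    for label, est in candidates:
        clf, recall, auc, precision, pr_auc = clf_model(X, y, label, model=est)
        print '{}: recall={:.3f} precision={:.3f} ROC AUC={:.3f} PR AUC={:.3f}'.format(
            label, recall, precision, auc, pr_auc)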
# Extract features from the text using a sparse vectorizer
if USE_HASHING:
    # HashingVectorizer is stateless, so transform() needs no fitting
    vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                   n_features=N_FEATURES, ngram_range=(1, 2))
    X_train = vectorizer.transform(training_data)
else:
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english', ngram_range=(1, 2))
    X_train = vectorizer.fit_transform(training_data)
X_test = vectorizer.transform(testing_data)

# Oversampling: SMOTE needs numeric labels and a dense array, so map the
# string labels to integers and densify, resample, then map back
y_train_new = [0 if x == "definition" else 1 for x in y_train]
# print y_train_new
sm = SMOTE(kind='regular', verbose=True, ratio=10)
X_train, y_train = sm.fit_transform(X_train.toarray(), np.asarray(y_train_new))
# OS = OverSampler(verbose=True, ratio=10)
# X_train, y_train = OS.fit_transform(X_train.toarray(), np.asarray(y_train_new))
X_train = sparse.csr_matrix(X_train)  # back to a sparse matrix
y_train = y_train.tolist()
y_train = ["definition" if x == 0 else "none" for x in y_train]

# Mapping from integer feature index to original token string
# (a HashingVectorizer keeps no vocabulary to map back to)
if USE_HASHING:
    feature_names = None
else:
    feature_names = vectorizer.get_feature_names()

# Extracting best features with a chi-squared test
if USE_CHI2:
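    # The original section is truncated here. A minimal sketch of the
    # chi-squared branch, following the comment above; K_BEST is a
    # hypothetical constant (not in the original) for how many features
    # to keep.
    from sklearn.feature_selection import SelectKBest, chi2
    ch2 = SelectKBest(chi2, k=K_BEST)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    if feature_names is not None:
        feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]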