def version2():  # Data cleaning in NLP model
    corpus = []
    stop_words = set(sw.words('english'))  # build once, not per word
    stemmer = ps()
    for i in range(0, 527383):  # one pass per review in the dataset
        # Remove everything except letters, lowercase, drop stop words, stem
        review = re.sub('[^a-zA-Z]', ' ', df.iloc[i, 1])
        review = review.lower().split()
        review = [word for word in review if word not in stop_words]
        review = [stemmer.stem(word) for word in review]
        corpus.append(" ".join(review))
    labels = df.iloc[:, -1]
    features_train, features_test, labels_train, labels_test = train_test_split(
        corpus, labels, random_state=100)
    # Fit the vectorizer on the training split only, then reuse it on the test split
    vectorizer = cv()
    features_train_vectorized = vectorizer.fit_transform(features_train)
    features_test_vectorized = vectorizer.transform(features_test)
    model = lr().fit(features_train_vectorized, labels_train)
    predictions = model.predict(features_test_vectorized)
    ras(labels_test, predictions)
    cm(labels_test, predictions)
    return model
def version1():  # Logistic regression model
    features_train, features_test, labels_train, labels_test = train_test_split(
        df["reviewText"], df["Positivity"], random_state=100)
    # One vectorizer instance: fit on the training split, reuse on the test split
    vectorizer = cv()
    features_train_vectorized = vectorizer.fit_transform(features_train)
    features_test_vectorized = vectorizer.transform(features_test)
    # Model creation for logistic regression
    model = lr().fit(features_train_vectorized, labels_train)
    predictions = model.predict(features_test_vectorized)
    ras(labels_test, predictions)  # Generating prediction score
    cm(labels_test, predictions)
    return model
def version3():  # TF-IDF model
    global vect
    features_train, features_test, labels_train, labels_test = train_test_split(
        df["reviewText"], df["Positivity"], random_state=100)
    # Ignore terms that appear in fewer than 5 documents
    vect = TfidfVectorizer(min_df=5)
    features_train_vectorized = vect.fit_transform(features_train)
    features_test_vectorized = vect.transform(features_test)
    model = lr().fit(features_train_vectorized, labels_train)
    predictions = model.predict(features_test_vectorized)
    ras(labels_test, predictions)
    cm(labels_test, predictions)
    return model
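# A minimal setup sketch (not from the original) showing imports that would make
# version1/version2/version3 runnable, assuming the short aliases they rely on
# (cv, lr, ras, cm, ps, sw) map to the sklearn/NLTK names below and that df
# holds "reviewText" and "Positivity" columns; the input file name is hypothetical.
import re
import pandas as pd
from nltk.corpus import stopwords as sw
from nltk.stem.porter import PorterStemmer as ps
from sklearn.feature_extraction.text import CountVectorizer as cv, TfidfVectorizer
from sklearn.linear_model import LogisticRegression as lr
from sklearn.metrics import roc_auc_score as ras, confusion_matrix as cm
from sklearn.model_selection import train_test_split

df = pd.read_csv('reviews.csv')  # hypothetical input file
model = version3()  # train and score the TF-IDF variant, for example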
def roc_scores(data):
    y = data
    rocs = pandas.Series(dtype=float)
    # The first column holds the true labels; every other column is a set of
    # predicted scores, so compute one ROC AUC per prediction column.
    for c in y.columns.values[1:]:
        rocs[c] = ras(y['true'], y[c])
    return rocs
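# Hedged usage sketch for roc_scores on toy data (not from the original): a
# frame whose first column 'true' holds labels and whose remaining columns
# hold per-model scores.
scores_df = pandas.DataFrame({
    'true': [0, 1, 1, 0, 1],
    'model_a': [0.1, 0.8, 0.7, 0.3, 0.9],
    'model_b': [0.4, 0.6, 0.5, 0.2, 0.7],
})
print(roc_scores(scores_df))  # one ROC AUC per score column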
def dfauc(x):
    # Blend the 'target' columns of the frames in df with weights x; the last
    # frame gets the leftover weight so the weights sum to 1.
    weight_used = 0
    for i in range(len(df)):
        if i == 0:
            average = np.multiply(x[i], df[i].target.values)
            weight_used = weight_used + x[i]
        elif i < (len(df) - 1):
            average = average + np.multiply(x[i], df[i].target.values)
            weight_used = weight_used + x[i]
        else:
            average = average + np.multiply(1 - weight_used, df[i].target.values)
    # Score the blend against the reference labels of the last frame
    auc_score = ras(df[i].ref.values, average)
    # Negated so the blend weights can be found with a minimizer
    return -1 * auc_score
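# A sketch (an assumption, not from the original) of how dfauc could be driven:
# since it returns the negative AUC, scipy's minimizer can search for the blend
# weights of the first len(df) - 1 prediction frames.
from scipy.optimize import minimize

x0 = np.full(len(df) - 1, 1.0 / len(df))  # equal starting weights
res = minimize(dfauc, x0, method='Nelder-Mead')
print('best AUC:', -res.fun, 'weights:', res.x)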
def auc_roc(y_orig, x_orig, w):
    y_predicted = np.dot(x_orig, w.reshape(w.size, 1))
    return ras(y_orig, y_predicted)
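# Hedged example for auc_roc on synthetic data (all names here are
# illustrative): score a fixed weight vector's linear predictions with ROC AUC.
rng = np.random.default_rng(0)
X_demo = rng.normal(size=(100, 3))
w_demo = rng.normal(size=3)
y_demo = (X_demo @ w_demo + rng.normal(scale=0.5, size=100) > 0).astype(int)
print('AUC:', auc_roc(y_demo, X_demo, w_demo))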
        sess.run(cross_entropy,
                 feed_dict={prob: 1.0,
                            x: xtr[i - 100:i],
                            y_: ytr[i - 100:i]}))

feed_dict = {x: xte, y_: yte, prob: 1.0}
ac = sess.run(acc, feed_dict=feed_dict)
print("Acc:", ac)
'''print('Test Acc: %g' % acc.eval())
print('Train Acc: %g' % acc.eval(feed_dict={x: xtr, y_: ytr, prob: 1.0}))'''
# y_true = sess.run(tf.argmax(y_, 1), feed_dict={y_: yte, prob: 1.0})
# y_score = sess.run(tf.argmax(y_conv, 1), feed_dict={x: xte, prob: 1.0})

# Evaluate the true labels and the network outputs, then score with ROC AUC
a = sess.run(y_, feed_dict={y_: yte, prob: 1.0})
b = sess.run(y_conv, feed_dict={x: xte, prob: 1.0})
rass = ras(a, b)
print('AUC(ROC):', rass)
plot.append([ac, rass])  # store accuracy and AUC for this run
# ka.append(sess.run(acc, feed_dict={x: xtr, y_: ytr, prob: 1.0}))
# kb.append(sess.run(acc, feed_dict={x: xte, y_: yte, prob: 1.0}))
# kc.append(rass)

writer.add_graph(sess.graph)
saver.save(sess, 'Model1/cnn')
print('Done.')
# Use the file data_adjsutments.py first to set up the variables.
# Now let's optimize the model.
from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.metrics import roc_auc_score as ras

# First, the number of estimators
results = []
n_estimator_option = [100, 250, 300, 700]
for trees in n_estimator_option:
    model = rfr(n_estimators=trees, oob_score=True, random_state=42)
    model.fit(X_train, Y_train)
    print(trees, 'trees')
    roc = ras(Y_train, model.oob_prediction_)
    print('c-stat:', roc)
    results.append(roc)
    print("")

pd.Series(results, n_estimator_option).plot()

# n_estimators = 300 scores highest, so use it
#%%
results = []
max_feature_option = ['auto', None, "sqrt", "log2", 0.9, 0.2]
for max_feature in max_feature_option:
    model = rfr(n_estimators=300, oob_score=True, random_state=42,
                max_features=max_feature)
            sum1 = sum1 + y[i] * x1[i] * (
                1 - (1 / (1 + np.exp(-y[i] * (w1 * x1[i] + w2 * x2[i])))))
            sum2 = sum2 + y[i] * x2[i] * (
                1 - (1 / (1 + np.exp(-y[i] * (w1 * x1[i] + w2 * x2[i])))))
        # Regularized gradient-ascent step, averaged over the 205 samples
        w1 = w1 + k / 205 * sum1 - k * c * w1
        w2 = w2 + k / 205 * sum2 - k * c * w2
        itter += 1
    print(w1, w2, itter)
    return w1, w2


def sigma(w1, w2, x1_i, x2_i):
    # Logistic sigmoid of the linear score for a single sample
    return 1 / (1 + np.exp(-w1 * x1_i - w2 * x2_i))


pvect = []
w1, w2 = weights(0, 0, 0.1, 10)
print(w1, w2)
for i in range(205):
    pvect.append(sigma(w1, w2, x1[i], x2[i]))
    # print(sigma(w1, w2, x1[i], x2[i]))

y_true = np.array(y)
y_scores = np.array(pvect)
print(ras(y_true, y_scores))
# print(func(w1, w2, y, x1, x2))
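# Hedged cross-check (not from the original): an L2-regularized sklearn
# LogisticRegression without intercept, fit on the same two features, should
# give weights in the same ballpark; the mapping between c and C is an
# assumption, not an exact equivalence.
from sklearn.linear_model import LogisticRegression

X_lr = np.column_stack([x1, x2])
ref = LogisticRegression(fit_intercept=False, C=1.0).fit(X_lr, y)
print('sklearn weights:', ref.coef_,
      'AUC:', ras(y, ref.decision_function(X_lr)))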
def create_and_predict(data, **kwargs):
    """
    kwargs:
        neurons=32
        epochs=50
        learning_rate=0.01
        batch_size=32
        plot=False
    """
    #
    # 1) Initialize
    act = 'relu'
    architecture = [
        Dense(kwargs.get('neurons', 32), input_shape=(2, ), activation=act),
        Dense(kwargs.get('neurons', 32), activation=act),
        # Dense(kwargs.get('neurons', 32), activation=act),
        Dense(1, activation='sigmoid'),
    ]
    model = Sequential(architecture)
    model.compile(
        optimizer=SGD(learning_rate=kwargs.get('learning_rate', .01)),
        loss='mean_squared_error',
        metrics=['accuracy'],
    )
    #
    # 2) Fit
    results = model.fit(
        *data['train'],
        batch_size=kwargs.get('batch_size', 32),
        epochs=kwargs.get('epochs', 50),
        verbose=1,
        callbacks=[EarlyStopping()],
        validation_data=data['val'],
    )
    #
    # 3) Return results
    results = results.history
    results['ytrue_val'] = data['val'][1]
    results['ytrue_test'] = data['test'][1]
    results['ypred_val'] = model.predict(data['val'][0])
    results['ypred_test'] = model.predict(data['test'][0])
    results['specs'] = kwargs
    #
    if kwargs.get('plot', False):
        case = 'test'
        from sklearn.linear_model import LogisticRegression as lr
        f, ax = plt.subplots(1, 3, figsize=(20, 7))
        fpr, tpr, threshold = roc_curve(results['ytrue_' + case],
                                        results['ypred_' + case])
        ax[0].plot(
            tuple(fpr), tuple(tpr),
            label='NN AUC ' + str(
                round(ras(results['ytrue_' + case],
                          results['ypred_' + case]), 2)))
        if False:  # Logistic regression baseline, disabled
            newytrue = data[case][1]
            newypred = lr(max_iter=5000).fit(
                *data['train']).predict_proba(data[case][0])[:, 1]
            fpr2, tpr2, threshold = roc_curve(newytrue, newypred)
            ax[0].plot(tuple(fpr2), tuple(tpr2),
                       label='Logistic AUC ' +
                       str(round(ras(newytrue, newypred), 2)))
        ax[0].set_title('ROC curve')
        ax[0].legend()
        weights = {0: [], 1: []}
        for i, x in enumerate(results['ypred_' + case]):
            weights[data[case][1][i][0]] += [x[0]]
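# Hedged usage sketch for create_and_predict (not from the original), assuming
# data maps 'train'/'val'/'test' to (X, y) tuples with (n, 2) features and
# (n, 1) binary labels, matching how the function indexes them above, and that
# the Keras names it uses (Dense, Sequential, SGD, EarlyStopping) are in scope.
import numpy as np

def make_split(n, rng=np.random.default_rng(0)):
    X = rng.normal(size=(n, 2))
    y = (X.sum(axis=1) > 0).astype('float32').reshape(-1, 1)
    return X, y

data = {'train': make_split(800), 'val': make_split(200), 'test': make_split(200)}
results = create_and_predict(data, neurons=16, epochs=20)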
series = range(1, 9)
# Score slots: one per event column, plus the macro average in slot 7
cols = ['HandStart', 'FirstDigitTouch', 'BothStartLoadPhase',
        'LiftOff', 'Replace', 'BothReleased']
totalScores = np.zeros(7)
for subject in subjects:
    scores = np.zeros(7)
    print('calculating scores for subject: ' + str(subject))
    for serie in series:
        data = pd.read_csv('SVM_results_binary_allCSP/subj%d_series%d_results.csv'
                           % (subject, serie))
        data = np.array(data[data.columns[1:]])
        truth = pd.read_csv('input/train/subj%d_series%d_events.csv'
                            % (subject, serie))
        truth = np.array(truth[cols])
        for i in range(0, 6):
            scores[i] += ras(truth[:, i], data[:, i])
        scores[6] += ras(truth, data, average='macro')
    scores = np.true_divide(scores, len(series))
    totalScores += scores
    print('Writing scores for subject: ' + str(subject))
    f = open('SVM_scores/subj%d_mean_scores.txt' % subject, 'w')
    f.write('Average AUC score: {}\n'.format(scores[6]))
    f.write('Scores by Event:\n')
    f.write('{0:>20} {1:>20} {2:>20} {3:>20} {4:>20} {5:>20}'.format(
        cols[0], cols[1], cols[2], cols[3], cols[4], cols[5]))
    f.write('\n')
    f.write('{0:20.5} {1:20.5} {2:20.5} {3:20.5} {4:20.5} {5:20.5}'.format(
        scores[0], scores[1], scores[2], scores[3], scores[4], scores[5]))
    f.close()
print('Calculating overall mean scores')
# Calculates scores for a given prediction file against the given truth file;
# the third argument is the output file name.
import sys

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score as ras

results_file = sys.argv[1]
truth_file = sys.argv[2]
out = sys.argv[3]

results = pd.read_csv(results_file)
truth = pd.read_csv(truth_file)
cols = np.array(truth.columns[1:])
scores = np.empty(6)
for i in range(0, 6):
    scores[i] = ras(np.array(truth[cols[i]]), np.array(results[cols[i]]))
avg_score = ras(truth[cols], results[cols], average='macro')

f = open(out, 'w')
f.write('Average AUC score: ' + str(avg_score) + '\n')
f.write('Scores by Event:\n')
f.write('{0:>20} {1:>20} {2:>20} {3:>20} {4:>20} {5:>20}'.format(
    cols[0], cols[1], cols[2], cols[3], cols[4], cols[5]))
f.write('\n')
f.write('{0:20.5} {1:20.5} {2:20.5} {3:20.5} {4:20.5} {5:20.5}'.format(
    scores[0], scores[1], scores[2], scores[3], scores[4], scores[5]))
f.close()
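# Example invocation (script and file names are illustrative):
#   python score_predictions.py subj1_results.csv input/train/subj1_series1_events.csv subj1_scores.txt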
i = 0
for train_index, validation_index in kf.split(X_train_norm):
    # kf.split yields positional indices, so index with .iloc
    X_tra = X_train_norm.iloc[train_index, :]
    X_val = X_train_norm.iloc[validation_index, :]
    y_tra = y_train.iloc[train_index]
    y_val = y_train.iloc[validation_index]
    clf = clf.fit(X_tra, y_tra['Gravedad'])
    prediction = clf.predict(X_val)
    Acc.iloc[i] = np.mean(np.array(y_val).T == prediction)
    feat_imp.iloc[:, i] = clf.feature_importances_
    i += 1

Acc_final_rf = np.mean(Acc)
feat_imp = np.mean(feat_imp, axis=1)
lista1, ordered_feat = zip(*sorted(zip(feat_imp, features), reverse=True))

final = clf.predict(X_test_norm)
Acc_test[j] = np.mean(final == y_test)
AUC[j] = ras(y_test, final, average='macro')
del clf, kf, Acc

# Final results:
auc = AUC.copy()
Acc_test.mean()
Acc_test.std()
plt.figure()
plt.plot(range(1, 101), Acc_test)
plt.plot(range(1, 101), np.tile(Acc_test.mean(), len(Acc_test)), c='r')
plt.plot(range(1, 101),
         np.tile(Acc_test.mean() + Acc_test.std(), len(Acc_test)),
         c='r', ls='--')
plt.plot(range(1, 101),