def predict_seg(model, sent, vocab, tags):
    """Predict segmentation tags for *sent* and convert them back to words.

    The sentence is encoded via the project's ``utilities.process_data``,
    run through *model*, and the per-position argmax over the tag scores
    is mapped through *tags* before ``utilities.tags2words`` rebuilds the
    segmented output.
    """
    encoded, length = utilities.process_data(sent, vocab)
    # Keep only the last `length` positions: the input is (presumably)
    # left-padded to a fixed size — TODO confirm against process_data.
    scores = model.predict(encoded)[0][-length:]
    best_indices = [np.argmax(row) for row in scores]
    predicted_tags = [tags[idx] for idx in best_indices]
    return utilities.tags2words(sent, predicted_tags)
def main():
    """Grid-search an SVM with k-fold cross-validation over the data sets.

    For each (C, kernel, gamma) combination the data is split into
    ``groups`` folds; each fold serves once as the validation set while
    the remaining folds are concatenated into the training set, and the
    F1 score of every run is printed.
    """
    # data_sets = ["clinica_train_synth_dengue.csv",
    #              "laboratorio_train_synth_dengue.csv",
    #              "completo_train_synth_dengue.csv"]
    data_sets = ["clinica_train_synth_dengue.csv"]
    # Full grid kept for reference (was commented out in the original):
    # C = [0.5, 1, 1.5]
    # kernels = ['linear', 'rbf', 'sigmoid']
    # gamma = ['scale', 'auto']
    C = [1.5]
    kernels = ['rbf']
    gamma = ['scale']
    groups = 5

    for a in data_sets:
        X, Y = util.process_data(a)
        X = util.normalizacion(X)
        for c in C:
            for kernel in kernels:
                for g in gamma:
                    model = model_svm(C=c, kernel=kernel, gamma=g)
                    groups_X, groups_Y = util.split_data(groups, X, Y)
                    groups_Y = groups_Y.astype('int')
                    for i in range(groups):
                        validation_X = groups_X[i]
                        validation_Y = groups_Y[i]
                        # BUG FIX: the original concatenated groups_X[i]
                        # (the validation fold itself) on every pass of the
                        # inner loop instead of groups_X[j], and when i == 0
                        # it seeded the training set with fold 1 and then
                        # concatenated fold 1 again.  Build the training set
                        # from every fold except the validation fold.
                        train_folds = [j for j in range(groups) if j != i]
                        training_X = np.concatenate(
                            [groups_X[j] for j in train_folds])
                        training_Y = np.concatenate(
                            [groups_Y[j] for j in train_folds])
                        model.fit(training_X, training_Y)
                        F1_val = util.F1(model, validation_X, validation_Y)
                        print(a, c, kernel, g, F1_val)
def predict_ner(model, sent, vocab, tags):
    """Run NER over *sent* and collect person / location / organization spans.

    The sentence is encoded with ``utilities.process_data``, the model's
    per-position tag scores are argmax-decoded through *tags*, and tokens
    tagged b-/i-per, b-/i-org, b-/i-loc are accumulated into the three
    entity strings (a ``b-`` tag starts a new space-separated span).

    Returns a list of three labelled strings:
    ``['person:...', 'location:...', 'organization:...']``.
    """
    encoded, length = utilities.process_data(sent, vocab)
    raw = model.predict(encoded)[0][-length:]
    results = [np.argmax(row) for row in raw]
    result_tags = [tags[i] for i in results]
    print(result_tags)  # debug output, kept from the original
    per, loc, org = '', '', ''
    # BUG FIX: the loop variable was named `s`, shadowing the encoded
    # input `s` above; renamed to `tok` for clarity.
    for tok, t in zip(sent, result_tags):
        if t in ('b-per', 'i-per'):
            per += ' ' + tok if (t == 'b-per') else tok
        if t in ('b-org', 'i-org'):
            org += ' ' + tok if (t == 'b-org') else tok
        if t in ('b-loc', 'i-loc'):
            loc += ' ' + tok if (t == 'b-loc') else tok
    # BUG FIX: output label typo "organzation" corrected to "organization".
    return ['person:' + per, 'location:' + loc, 'organization:' + org]
def main(argv):
    """Train an AutoML regressor on the ENEM data and write its answers.

    Loads train/test CSVs, preprocesses them with ``process_data``,
    fits ``rg.Regressor`` via auto-sklearn with a time budget taken from
    the command line, then saves the model and the prediction file.

    NOTE(review): the *argv* parameter is never used — the time budget is
    read from ``sys.argv[1]`` directly.  Confirm callers pass ``sys.argv``.
    """
    train_df = pd.read_csv('data/train.csv')
    test_df = pd.read_csv('data/test.csv')
    train_df, test_df, answer_df = process_data(train_df, test_df,
                                                pd.DataFrame())

    # Separate the target column before training.
    target = train_df['NU_NOTA_MT']
    train_df.drop(['NU_NOTA_MT'], axis=1, inplace=True)

    regressor = rg.Regressor(train_df, test_df, answer_df, target)
    regressor.auto_sklearn(time=int(sys.argv[1]))
    regressor.prediction()
    regressor.save_model('my_model')
    regressor.save_answer('automl_answer')
    convert_to_zero('automl_answer')
] gyroscope_files = [ "my_data/gyroscope_sitting_hand.csv", "my_data/gyroscope_sitting_pocket.csv", "my_data/gyroscope_standing_hand.csv", "my_data/gyroscope_standing_pocket.csv", "my_data/gyroscope_up_down_stairs.csv", "my_data/gyroscope_walking_hand.csv", "my_data/gyroscope_walking_pocket.csv" ] for q in xrange(len(acceleration_files)): t, a, x_acc, y_acc, z_acc, x_gyo, y_gyo, z_gyo = \ process_data(acceleration_files[q], gyroscope_files[q]) for i in xrange(len(t)): j = 0 l = 0 while len(t[i]) - j > 128: for k in xrange(j, j + 128): if k != j + 127: if l % 3 == 2: f_acceleration_test_x.write("%f," % x_acc[i][k]) f_acceleration_test_y.write("%f," % y_acc[i][k]) f_acceleration_test_z.write("%f," % z_acc[i][k]) f_gyroscope_test_x.write("%f," % x_acc[i][k]) f_gyroscope_test_y.write("%f," % y_acc[i][k])
print '>>>> ranking of columns:' pprint(np.array(columns)[rfecv.ranking_-1]) # Plot number of features VS. cross-validation scores plt.figure() plt.xlabel("Number of features selected") plt.ylabel("Cross validation score (nb of correct classifications)") plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_) plt.show() return rfecv if __name__=='__main__': x_train, y_regress, _, columns_train, weights, y_class = \ process_data('/home/jj/code/Kaggle/Fire/Data/train.csv', impute=True, imputeDataDir='/home/jj/code/Kaggle/Fire/intermediateOutput', imputeStrategy='median', # fieldsToUse=FIELDS_CORR_ORDERED_TOP99[:20]) fieldsToUse=FIELDS_CLASS_GBC_TOP100[:20]) print 'CLASSIFICATION RFE' clf = GradientBoostingClassifier(learning_rate=0.1, loss='deviance') # rank_features(clf, x_train, y_class, columns_train, numFeatures=20, step=0.1) select_features(clf, x_train, y_class, columns_train, num_folds=5, step=1, random_state=0) # print 'REGRESSION RFE' # clf = Ridge(alpha = 0.1) # posYInd = y_regress > 0 # select_features(clf, x_train[posYInd, :], y_regress[posYInd], columns_train, num_folds=5, step=1, random_state=0)
# try: # numNans = np.isnan(df[col]).sum() # print "Num NaN's:", numNans, 100. * numNans / numRows # except: # pass # # try: # numZs = (df[col]=='Z').sum() # print "Num Zs:", numZs, 100. * numZs / numRows # except: # pass x_train, y_train, _, columns_train, weights, y_class = \ process_data(fname, impute=True, imputeDataDir='/home/jj/code/Kaggle/Fire/intermediateOutput', imputeStrategy='median') clf = Ridge(alpha=1, normalize=False) clf.fit(x_train, y_train, sample_weight=weights) pprint(dict(zip(columns_train, clf.coef_*1e5))) colInd = list(columns_train).index('var13') # ---------- by category # for val in np.unique(x_train[:, colInd]): # for curInd in [x_train[:, colInd]==0, np.logical_not(x_train[:, colInd]==0)]: # # print curInd.sum() # # curInd = x_train[:,colInd]==val # curX = x_train[curInd, :]
""" convert field names to indices in the list of all column names :param allColumnNames: :param selectNames: :return: numpy array """ return np.array([list(allColumnNames).index(v) for v in set(selectNames) & set(allColumnNames)]) if __name__ == '__main__': x_train, y_train, _, columns_train, weights, y_class = \ process_data('/home/jj/code/Kaggle/Fire/Data/train.csv', impute=True, imputeDataDir='/home/jj/code/Kaggle/Fire/intermediateOutput', imputeStrategy='median', fieldsToUse=['var11', 'var8', 'var13']) # fieldsToUse=FIELDS_CLASS_GBC_TOP100[:30]) # fieldsToUse=FIELDS_CDF_CORR_TOP99[:19]) # y_cdfs = np.array(pandas.read_csv('/home/jj/code/Kaggle/Fire/Data/y_pcdfs.csv')).reshape(NUM_TRAIN_SAMPLES,)[:len(y_train)] # in case smallTrain is used # clf = GradientBoostingRegressor(loss='quantile', learning_rate=0.02, n_estimators=100, subsample=0.9) # clf = LogisticRegression() plot_feature_importances(x_train, np.array(y_train), columns_train, numTopFeatures=3, numEstimators=50) classifier = SVR(kernel='rbf') regressor = Ridge(alpha=1) classFields = fieldNamesToInd(columns_train, FIELDS_CLASS_GBC_TOP100[:20]) regFields = fieldNamesToInd(columns_train, FIELDS_CORR_ORDERED_TOP99[:20]) # clf = GroupThenRegress(list(columns_train).index('var8'),
"my_data/acceleration_up_down_stairs.csv", "my_data/acceleration_walking_hand.csv", "my_data/acceleration_walking_pocket.csv"] gyroscope_files = ["my_data/gyroscope_sitting_hand.csv", "my_data/gyroscope_sitting_pocket.csv", "my_data/gyroscope_standing_hand.csv", "my_data/gyroscope_standing_pocket.csv", "my_data/gyroscope_up_down_stairs.csv", "my_data/gyroscope_walking_hand.csv", "my_data/gyroscope_walking_pocket.csv"] for q in xrange(len(acceleration_files)): t, a, x_acc, y_acc, z_acc, x_gyo, y_gyo, z_gyo = \ process_data(acceleration_files[q], gyroscope_files[q]) for i in xrange(len(t)): j = 0 l = 0 while len(t[i]) - j > 128: for k in xrange(j, j + 128): if k != j + 127: if l % 3 == 2: f_acceleration_test_x.write("%f," % x_acc[i][k]) f_acceleration_test_y.write("%f," % y_acc[i][k]) f_acceleration_test_z.write("%f," % z_acc[i][k]) f_gyroscope_test_x.write("%f," % x_acc[i][k]) f_gyroscope_test_y.write("%f," % y_acc[i][k])
for i in range(numFeatures): for j in range(i): print i, j # res = np.hstack((res, (xs[:,i] + xs[:,j]).reshape(numRows, 1))) temp.append(xs[:,i] * xs[:,j]) temp.append(xs[:,i] + xs[:,j]) if columns is not None: res_columns.append(columns[i] + '_' + columns[j]) res = np.hstack((res, np.array(temp).transpose())) return res, res_columns if __name__ == '__main__': x_train, y_train, _, columns, weights, y_class = \ process_data('/home/jj/code/Kaggle/Fire/Data/tinyTrain.csv', impute=True, imputeDataDir='/home/jj/code/Kaggle/Fire/intermediateOutput', imputeStrategy='median') # fieldsToUse=FIELDS_CLASS_GBC_TOP100[:15]) # y_cdfs = np.array(pandas.read_csv('/home/jj/code/Kaggle/Fire/Data/y_pcdfs.csv')).reshape(len(y_train),) # ---------- correlations between top x variables and y class_corrs = calculate_y_corrs(x_train, y_class)[0] ord = class_corrs.argsort()[::-1] topInd = ord[:25] plot_correlations(make_column_2D(class_corrs[topInd]), '') # # ----------------- correlations between variables # # plot correlations between ALL variables all_corrs = pandas.read_csv('/home/jj/code/Kaggle/Fire/intermediateOutput/corrs.csv') plot_correlations(all_corrs, 'Correlations Between All X Variables')