from copy import copy

from sklearn.base import BaseEstimator
from keras.wrappers.scikit_learn import KerasClassifier


class Keras(BaseEstimator):
    """scikit-learn compatible wrapper around a Keras model-building function."""

    def __init__(self, build_function, multi_class=False, keras_params=None):
        if not callable(build_function):
            raise ValueError('Model construction function must be callable.')
        self.multi_class = multi_class
        self.build_function = build_function
        if keras_params is None:
            keras_params = {}
        self.keras_params = keras_params

    def fit(self, X, y):
        if self.multi_class:
            self.n_classes_ = len(set(y))
        else:
            self.n_classes_ = 1
        build_callable = lambda: self.build_function(X.shape[1], self.n_classes_)
        keras_params = copy(self.keras_params)
        keras_params['build_fn'] = build_callable
        self.classifier_ = KerasClassifier(**keras_params)
        self.classifier_.fit(X, y)
        return self  # scikit-learn estimators are expected to return self from fit()

    def predict(self, X):
        return self.classifier_.predict(X)
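# A minimal usage sketch for the wrapper above. The build function `build_mlp`
# is hypothetical (not part of the original snippet); it only has to accept
# (n_features, n_classes) and return a compiled Keras model, matching how
# Keras.fit() invokes build_function.
from keras.models import Sequential
from keras.layers import Dense


def build_mlp(n_features, n_classes):
    model = Sequential()
    model.add(Dense(32, input_dim=n_features, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# clf = Keras(build_mlp, keras_params={'epochs': 10, 'batch_size': 32})
# clf.fit(X_train, y_train); preds = clf.predict(X_test)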
def test_keras_classifier():
    model = Sequential()
    model.add(Dense(input_dim, input_shape=(input_dim,)))
    model.add(Activation('relu'))
    model.add(Dense(nb_class))
    model.add(Activation('softmax'))

    sklearn_clf = KerasClassifier(model,
                                  optimizer=optim,
                                  loss=loss,
                                  train_batch_size=batch_size,
                                  test_batch_size=batch_size,
                                  nb_epoch=nb_epoch)
    sklearn_clf.fit(X_train, y_train)
    sklearn_clf.score(X_test, y_test)
def NN(self, report=False):
    """Neural Network.

    Args:
        report: whether to print out the model analysis report.
    Returns:
        One-layer neural network model.
    """
    from keras.models import Sequential
    from keras.layers import Dense
    from keras.wrappers.scikit_learn import KerasClassifier

    def baseline_model():
        model = Sequential()
        model.add(Dense(8, input_dim=len(self.features), activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    self.nn = KerasClassifier(build_fn=baseline_model, epochs=5, batch_size=5, verbose=1)
    self.nn.fit(self.train[self.features], self.train[self.target])
    if report:
        from Report import Report
        rpt = Report(self.nn, self.train, self.valid, self.target, self.features)
        rpt.ALL()
    return self.nn
def init_model(self):
    '''init model'''
    train_params = {"nb_epoch": 10, "batch_size": 10}
    self.dic_params.update(train_params)
    self.model = KerasClassifier(build_fn=self.create_model_func,
                                 **self.kargs["create_model"]["params"])
    # self.model = KerasClassifier(build_fn=self.create_model_func)
    self.model.set_params(**self.dic_params)
def main():
    code_dir = '/home/john/git/kaggle/OttoGroup/'
    data_dir = '/home/john/data/otto/'
    training_file = 'train.csv'

    os.chdir(code_dir)
    np.random.seed(1337)

    print('Starting script...')
    print('Loading data...')
    X, labels = load_training_data(data_dir, training_file)

    print('Pre-processing...')
    scaler = create_scaler(X)
    X = apply_scaler(X, scaler)
    y, y_onehot, encoder = preprocess_labels(labels)
    num_features = X.shape[1]
    num_classes = y_onehot.shape[1]
    print('Features = ' + str(num_features))
    print('Classes = ' + str(num_classes))

    print('Building model...')
    model = define_model(num_features, num_classes)
    print('Complete.')

    print('Training model...')
    wrapper = KerasClassifier(model)
    wrapper.fit(X, y_onehot, nb_epoch=20)
    print('Complete.')

    print('Training score = ' + str(wrapper.score(X, y_onehot)))
    preds = wrapper.predict(X)
    print('Predictions shape = ' + str(preds.shape))
    proba = wrapper.predict_proba(X)
    print('Probabilities shape = ' + str(proba.shape))

    print('Building ensemble...')
    ensemble = BaggingClassifier(wrapper, n_estimators=3, max_samples=1.0, max_features=1.0)
    print('Complete.')

    print('Training ensemble...')
    ensemble.fit(X, y)
    print('Complete.')

    print('Ensemble score = ' + str(ensemble.score(X, y)))
    print('Script complete.')
seed = 7
np.random.seed(seed)


# Function to create model, required for KerasClassifier
def create_model():
    # create model
    model = Sequential()
    model.add(Dense(12, input_dim=34, init='uniform', activation='relu'))
    model.add(Dense(8, init='uniform', activation='relu'))
    model.add(Dense(1, init='uniform', activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# create model
model = KerasClassifier(build_fn=create_model, nb_epoch=20, batch_size=32)

# evaluate using 10-fold cross validation
# kfold = KFold(n=len(features_train), n_folds=10, shuffle=True, random_state=seed)
# results = cross_val_score(model, features_train.values, labels_train.values, cv=kfold)
# print("Cross validation results:", (results.mean() * 100), (results.std() * 100))

model.fit(features_train.values, labels_train.values)
print("Model building complete:", round((time() - t0) / 60, 3), "m")

# print(len(np.unique(train.user_id)), len(np.unique(test.user_id)))
# features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features_train, labels_train, test_size=0.60)
#
# neigh = neighbors.KNeighborsClassifier(weights='distance', n_jobs=-1).fit(train[features], train['hotel_cluster'])
# forest = ensemble.RandomForestClassifier(n_estimators=10, n_jobs=-1).fit(train[features], train['hotel_cluster'])
#
# bayes = naive_bayes.GaussianNB().fit(train[features], train['hotel_cluster'])
        Dense(units=12, kernel_initializer='uniform', activation='relu', input_dim=11))
    classifier.add(
        Dense(units=12, kernel_initializer='uniform', activation='relu'))
    classifier.add(
        Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
    classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return classifier


classifieres = KerasClassifier(build_fn=build_classifieres, batch_size=10, epochs=2)
accuracies = cross_val_score(estimator=classifieres, X=X_train, y=y_train, cv=10, n_jobs=-1)
mean = accuracies.mean()
variance = accuracies.std()
print(mean)
print(variance)

# Dropout regularization to reduce overfitting, if necessary
from keras.wrappers.scikit_learn import KerasClassifier
        Dense(
            units=neurons,
            activation=activation,
            kernel_initializer=kernel_initializer,
        ))
    rede_neural.add(Dropout(0.2))
    rede_neural.add(Dense(units=1, activation='sigmoid'))
    rede_neural.compile(optimizer=optimizer, loss=loss, metrics=['binary_accuracy'])
    return rede_neural


rede_neural = KerasClassifier(build_fn=criar_rede)
parametros = {
    'batch_size': [5, 10, 20, 30, 40],
    'epochs': [50, 100, 500, 1000],
    'optimizer': ['adam', 'sgd'],
    'loss': ['binary_crossentropy', 'hinge'],
    'kernel_initializer': ['random_uniform', 'normal'],
    'activation': ['relu', 'tanh'],
    'neurons': [8, 16, 24, 32, 64]
}
grid_search = GridSearchCV(estimator=rede_neural,
                           param_grid=parametros,
                           scoring='accuracy',
                           cv=10)
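# A minimal follow-up sketch (not part of the original snippet): fitting the
# grid search above and reading out the winning combination; X_train and
# y_train are assumed to exist with shapes matching the network's input.
# grid_search = grid_search.fit(X_train, y_train)
# print(grid_search.best_params_)  # best batch size, epochs, optimizer, ...
# print(grid_search.best_score_)   # mean cross-validated accuracy for that setup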
    # create model
    model = Sequential()
    model.add(Dropout(0.4, input_shape=(60,)))
    model.add(
        Dense(60, init='normal', activation='relu', W_constraint=maxnorm(3)))
    model.add(Dense(1, init='normal', activation='sigmoid'))
    # Compile model
    sgd = SGD(lr=0.1, momentum=0.9, decay=0.0, nesterov=False)
    model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])
    return model


numpy.random.seed(seed)
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasClassifier(build_fn=create_model, nb_epoch=100,
                                          batch_size=10, verbose=0)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(y=encoded_Y, n_folds=10, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)
print("Nom")
print("Visible: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


seed = 2016
earlyStopping = callbacks.EarlyStopping(monitor='val_loss', patience=1, verbose=1, mode='auto')
model = KerasClassifier(build_fn=create_model, epochs=40, batch_size=1024, verbose=1)

from sklearn import model_selection
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

# x_train = np.array(X)
# y_train = np.array(y)
train_stacker = [[0.0 for s in range(1)] for k in range(0, (x_train.shape[0]))]
cv_scores = []
oof_preds = []
a = [0 for x in range(2345796)]
def processData(df):
    unique_degrees = df.degree.unique()
    degree_level, filtered = Filter_degree(df)
    stream_filtered, unsettled_streams = Filter_streams(filtered)
    df['stream_processed'] = stream_filtered
    df['degree_processed'] = degree_level

    ###################################################################################################################
    majors_classification_df = pd.read_csv("College_Majors_Classification.csv")
    print(majors_classification_df.shape)
    majors_classification_df = majors_classification_df.dropna()
    majors_classification_df = majors_classification_df.applymap(lambda s: s.lower().strip())
    majors_mapping = {}
    idx_majors_mapping = {}
    for major in majors_classification_df.Major_Category.unique():
        majors_mapping.update({major: []})
    for index, row in majors_classification_df.iterrows():
        majors_mapping[row['Major_Category']].append(row['Major'])
        idx_majors_mapping.update({row['Major']: row['Major_Category']})
    df['major'] = df['major'].fillna('None')

    ###################################################################################################################
    unsettled_titles = []
    settled_titles = []
    known_majors_vectors = []
    for key, item in idx_majors_mapping.items():
        nlp_token = nlp(key)
        if (nlp_token.vector_norm):
            known_majors_vectors.append((nlp_token.vector, item))
    created_mapping = {}
    for val in df.major.unique():
        nlp_token = nlp(val)
        if not nlp_token.vector_norm:
            unsettled_titles.append(val)
            continue
        closest_major = find_closest(nlp_token.vector, known_majors_vectors)
        created_mapping.update({val: closest_major})
    prettify_create_mapping = {}
    for major in majors_classification_df.Major_Category.unique():
        prettify_create_mapping.update({major: []})
    for key, val in created_mapping.items():
        prettify_create_mapping[val].append(key)
    # pd.DataFrame.from_dict(prettify_create_mapping, orient='index').to_csv('majors_mapping.csv')

    ###################################################################################################################
    r = pd.read_csv('majors_mapping.csv')
    r = r.set_index('Unnamed: 0')
    prettify_create_mapping = {}
    for i in r.index:
        prettify_create_mapping[i] = []
        for enum, j in enumerate(r.columns):
            if enum == 1:
                continue
            val = r.loc[str(i), j]
            if not pd.isnull(val):
                prettify_create_mapping[i].append(val)
    majors_mapping = prettify_create_mapping

    ###################################################################################################################
    majors_processed = []
    for major_raw in df.major:
        major_ = "None"
        if major_raw == "None":
            majors_processed.append("None")
            continue
        for major_group, val in majors_mapping.items():
            if major_raw in val:
                major_ = major_group
                break
        majors_processed.append(major_)
    df['majors_processed'] = majors_processed

    ###################################################################################################################
    a = pd.read_csv("industries_classification.csv")
    a = a.drop(["Sector", "Industry Group", "Industry", "Sub-Industry"], axis=1)
    a = a.rename({"Unnamed: 1": "Sector",
                  "Unnamed: 3": "Industry Group",
                  "Unnamed: 5": "Industry",
                  "Unnamed: 7": "Sub-Industry"}, axis=1)
    a = a.dropna(how='all')
    a = a.fillna("None")
    SP_classification = []
    column_used = "Industry Group"
    sector = a.iloc[0][column_used]
    for index, row in a.iterrows():
        subindustry = row['Sub-Industry']
        if (row[column_used] != "None"):
            sector = row[column_used]
        SP_classification.append([sector, subindustry])
    for tuple_ in SP_classification:
        if tuple_[0] == "(cont’d)":
            tuple_[0] = "Consumer Discretionary"
        if tuple_[0] == "Discretionary":
            tuple_[0] = "Consumer Discretionary"
        if tuple_[0] == "Consumer":
            tuple_[0] = "Consumer Discretionary"
        tuple_[0] = tuple_[0].replace(" (cont’d)", "")
        tuple_[0] = tuple_[0].replace("\n", " ")
        tuple_[0] = tuple_[0].split(" (")[0]
        tuple_[1] = tuple_[1].replace("\n", " ")
        tuple_[1] = tuple_[1].replace(" & ", " and ")
        tuple_[1] = tuple_[1].split(" -- ")[0]
        tuple_[1] = tuple_[1].split("(")[0]
        tuple_[1] = tuple_[1].replace("REITs", "Real Estate Investment Trusts")
        tuple_[1] = tuple_[1].replace("-", " ")

    ###################################################################################################################
    sectors = []
    for tuple_ in SP_classification:
        sectors.append(tuple_[0])
    unique_sectors = np.unique(sectors)
    sector_mapping = {}
    for i, sector in enumerate(unique_sectors):
        sector_mapping.update({sector: i + 1})

    ###################################################################################################################
    SP_vectors = []
    for tuple_ in SP_classification:
        idx = sector_mapping[tuple_[0]]
        SP_vectors.append(np.append(nlp(tuple_[1]).vector, idx))

    ###################################################################################################################
    data = pd.DataFrame(SP_vectors)
    X = data.loc[:, :299]
    y = data.loc[:, 300]
    from sklearn.preprocessing import LabelEncoder
    from keras.utils import np_utils
    encoder = LabelEncoder()
    encoder.fit(y)
    encoded_Y = encoder.transform(y)
    dummy_y = np_utils.to_categorical(encoded_Y, num_classes=len(unique_sectors))
    data.head()
    from keras.wrappers.scikit_learn import KerasClassifier
    estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=0)
    from keras.models import Sequential
    from sklearn.model_selection import train_test_split
    from keras.layers import Dense
    data = np.concatenate([X, dummy_y], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(data[:, :300],
                                                        data[:, 300:],
                                                        test_size=0.33,
                                                        random_state=42)
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)
    y_pred = np_utils.to_categorical(y_pred, num_classes=len(unique_sectors))
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, y_pred.shape)

    ###################################################################################################################
    from sklearn.metrics import classification_report
    df['industry'] = df['industry'].fillna('None')
    vectors_unique = []
    words_unique = []
    industries_unique = df.industry.unique()
    industries_filtered = []
    for val in industries_unique:
        val = val.replace("&", "")
        val = val.replace("/", " ")
        if val != "None" and val.find(':') == -1 and val.find("]"):
            industries_filtered.append(val)
    for val in industries_filtered:
        nlp_token = nlp(val)
        if (nlp_token.has_vector):
            vectors_unique.append(nlp_token.vector)
            words_unique.append(val)
    vectors_unique = np.array(vectors_unique)
    y_pred = estimator.predict(vectors_unique)
    predicted_sectors = {}
    for key, value in sector_mapping.items():
        predicted_sectors[key] = []
    for a, b in zip(words_unique, y_pred):
        predicted_sectors[unique_sectors[b]].append(a)
    # pd.DataFrame.from_dict(predicted_sectors, orient='index').to_csv("/content/predicted_industries_classification.csv")
    industry_processed = []
    for industry_raw in df.industry:
        industry_ = "None"
        industry_raw = clean_string(industry_raw)
        for key, value in predicted_sectors.items():
            if industry_raw in value:
                industry_ = key
                break
        if (industry_ == "None" and industry_raw != "None"):
            print(industry_raw)
        industry_processed.append(industry_)
    df['industry_processed'] = industry_processed

    ###################################################################################################################
    workers_classification = pd.read_csv(
        'https://docs.google.com/spreadsheets/d/e/2PACX-1vSXxWGKiCFw6QRXV09znbdHmd5HqCGgzl8o8qGndrft2U9I9fyNz94rblr69YLQkqhDiTkGrwGH6M4R/pub?gid=2085110439&single=true&output=csv')
    workers_classification.columns = ['none', 'level', 'title']
    workers_classification = workers_classification.drop(['none'], axis=1)
    workers_classification = workers_classification.fillna(method='ffill')
    workers_classification = workers_classification.applymap(lambda s: s.lower().strip())
    workers_mapping = {}
    for x in workers_classification.level.unique():
        workers_mapping[x] = []
    for index, row in workers_classification.iterrows():
        workers_mapping[row['level']].append((row['title']).lower())
    levels = list(workers_mapping.keys())
    data = []
    counter = 0
    for level, titles in workers_mapping.items():
        for title in titles:
            if nlp(title).has_vector:
                data.append(np.append(nlp(title).vector, counter))
        counter += 1
    data = pd.DataFrame(data)
    workers_mapping = {}
    for x in workers_classification.level.unique():
        workers_mapping[x] = []
    for index, row in workers_classification.iterrows():
        workers_mapping[row['level']].append((row['title']).lower())
    idx_workers_mapping = {}
    idx_vectors_mapping = {}
    for key, value in workers_mapping.items():
        for item in value:
            idx_workers_mapping[item] = key
            idx_vectors_mapping[item] = (nlp(item))

    ###################################################################################################################
    import re
    from tqdm import tqdm_notebook
    switches = {
        "sr": "senior",
        "asst": "assistant",
        " & ": " ",
        "-": "",
        "engg": "engineer",
        "ceo": "chief executive officer",
        "cto": "chief technical officer",
        "@": " ",
        "dy": "deputy",
        "/": " ",
        '"': "",
    }
    levels_split = []
    unsettled_titles = []
    specialisation = []
    already_done_ = {}
    df = df.sort_values(by=['job_title'])
    df['job_title'] = df['job_title'].fillna('None')
    df['level_raw'] = df['level_raw'].fillna('None')
    for title_raw, level_raw in tqdm_notebook(zip(df.job_title, df.level_raw),
                                              total=len(df.job_title)):
        selected_word = ""
        if title_raw == "None" and level_raw == "None":
            level_ = "None"
        else:
            if title_raw == "None":
                level_ = level_raw
            else:
                try:
                    level_, selected_word = already_done_[(title_raw, level_raw)]
                except:
                    title = title_raw
                    for original, new in switches.items():
                        title = title.replace(original, new)
                    title = title.strip()
                    level_ = "None"
                    for level in ['tm', 'mm', 'lm', 'fm', 'worker', 'others']:
                        for level_granular in workers_mapping[level]:
                            if title.find(level_granular) > -1:
                                level_ = level
                                selected_word = level_granular
                                break
                        if selected_word != "":
                            break
                    title_token = nlp(title)
                    if level_ == "None" and title_token.vector_norm:
                        for level_granular, level in idx_workers_mapping.items():
                            token = idx_vectors_mapping[level_granular]
                            if token.vector_norm and title_token.similarity(token) > 0.9:
                                level_ = level
                                selected_word = level_granular
                                break
                    if level_ == "None" and title != "None":
                        unsettled_titles.append(title)
                        level_ = "undefined"
                    already_done_.update({(title_raw, level_raw): (level_, selected_word)})
        left_part = title_raw.replace(selected_word, "")
        if left_part == "":
            left_part = "None"
        levels_split.append(level_)
        specialisation.append(left_part)
        # print(title_raw, "|", level_raw, "|", level_, "|", left_part)
    # pd.DataFrame(unsettled_titles).to_csv('unsettled_titles.csv')
    print(len(levels_split))
    print(len(unsettled_titles))
    print(len(specialisation))
    df['levels_processed'] = levels_split
    df['specialisation'] = specialisation

    ###################################################################################################################
    # spec_classification = pd.read_csv("/content/drive/My Drive/Revamp/Data/specialisations_classification.csv")
    spec_classification = pd.read_csv(
        'https://docs.google.com/spreadsheets/d/e/2PACX-1vSXxWGKiCFw6QRXV09znbdHmd5HqCGgzl8o8qGndrft2U9I9fyNz94rblr69YLQkqhDiTkGrwGH6M4R/pub?gid=0&single=true&output=csv')
    spec_classification.columns = ['specialisation', 'categories']
    spec_classification['specialisation'] = spec_classification['specialisation'].fillna(method='ffill')
    spec_classification = spec_classification.applymap(lambda s: s.lower().strip())
    spec_classification.head()
    specialisation_mapping = {}
    spec_classification = spec_classification.applymap(lambda s: s.lower() if type(s) == str else s)
    for spec in spec_classification['specialisation'].unique():
        specialisation_mapping[spec] = []
    for index, row in spec_classification.iterrows():
        specialisation_mapping[row['specialisation']].append(row['categories'])
    idx_specialisation_mapping = {}
    idx_tokens_mapping = {}
    idx_vectors_mapping = {}
    for key, value in specialisation_mapping.items():
        for item in value:
            idx_specialisation_mapping[item] = key
            idx_tokens_mapping[item] = nlp(item)
            idx_vectors_mapping[item] = idx_tokens_mapping[item].vector
    processed_specialisations = []
    specialiasation_cleaned = []
    granular_specialisations = []
    already_done_ = {}
    count = 0
    df['job_title'] = df['job_title'].fillna('None')
    choices = []
    for key, value in idx_specialisation_mapping.items():
        choices.append(key)

    from numpy import dot
    from numpy.linalg import norm

    def cos_sim(a, b):
        return dot(a, b) / (norm(a) * norm(b))

    from tqdm import tqdm_notebook

    # for specialisation_raw in tqdm_notebook(df.job_title.unique(), total=len(df.job_title.unique())):
    for specialisation_raw in tqdm_notebook(df.job_title, total=len(df.job_title)):
        count += 1
        # print(specialisation_raw)
        specialisation = specialisation_raw
        specialisation = specialisation.replace('"', "")
        specialisation = specialisation.replace("'", "")
        specialisation = specialisation.replace('-', " ")
        specialisation = specialisation.replace('*', " ")
        specialisation = specialisation.replace('(', "")
        specialisation = specialisation.replace('/', "")
        specialisation = specialisation.replace('|', "")
        specialisation = specialisation.replace('\\', "")
        specialisation = specialisation.replace(')', "")
        specialisation = specialisation.strip()
        if specialisation_raw == "None":
            processed_specialisation = "None"
            specialisation = "None"
            granular_specialisation = "None"
        else:
            if specialisation_raw in already_done_.keys():
                granular_specialisation = already_done_[specialisation_raw][0]
                processed_specialisation = already_done_[specialisation_raw][1]
            else:
                processed_specialisation = "undefined"
                granular_specialisation = "undefined"
                spec_vector = nlp(specialisation).vector
                best_token = "None"
                max_sim = 0
                max_len = 0
                for spec_ in idx_specialisation_mapping.keys():
                    if (len(spec_) > 7) and specialisation.find(spec_) > -1:
                        if len(spec_) > max_len:
                            best_token = spec_
                            max_len = max(max_len, len(best_token))
                    elif len(spec_) > 3 and specialisation.find(spec_) > -1:
                        if (cos_sim(spec_vector, idx_vectors_mapping[spec_]) > 0.5):
                            best_token = spec_
                            break
                        else:
                            # print("Close touch here ->", specialisation, "->", spec_, "-- failed")
                            pass
                if best_token == "None":
                    for spec_ in idx_specialisation_mapping.keys():
                        current_sim = cos_sim(spec_vector, idx_vectors_mapping[spec_])
                        if (current_sim > max_sim):
                            max_sim = current_sim
                            best_token = spec_
                    # print(max_sim, best_token)
                if (max_sim < 0.50 and best_token == "None"):
                    processed_specialisation = "undefined"
                    granular_specialisation = "undefined"
                else:
                    processed_specialisation = idx_specialisation_mapping[best_token]
                    granular_specialisation = best_token
                already_done_.update({specialisation_raw: (granular_specialisation, processed_specialisation)})
        if (count % 1 == 0):
            # print(specialisation, "|\t|", granular_specialisation, "|\t|", processed_specialisation, "|\t|", max_sim)
            pass
        processed_specialisations.append(processed_specialisation)
        granular_specialisations.append(granular_specialisation)
        specialiasation_cleaned.append(specialisation)
    df['specialisations_processed'] = processed_specialisations
    df['granular_specialisations'] = granular_specialisations
    df['job_title_cleaned'] = specialiasation_cleaned

    level_classification = pd.read_csv(
        "https://docs.google.com/spreadsheets/d/e/2PACX-1vSXxWGKiCFw6QRXV09znbdHmd5HqCGgzl8o8qGndrft2U9I9fyNz94rblr69YLQkqhDiTkGrwGH6M4R/pub?gid=2085110439&single=true&output=csv")
    level_classification['Categories'] = level_classification['Categories'].fillna(method='ffill')
    level_classification = level_classification.applymap(lambda s: s.lower() if type(s) == str else s)
    level_classification = level_classification.drop('Levels', axis=1)
    level_classification.columns = ['level', 'granular_specialisation']
    level_mapping = {}
    for index, row in level_classification.iterrows():
        level_mapping.update({row['granular_specialisation']: row['level']})
    print(level_mapping)
    levels_processed = []
    unsettled = []
    for spec in df.granular_specialisations:
        if spec == "None":
            levels_processed.append("None")
        elif spec in level_mapping.keys():
            levels_processed.append(level_mapping[(spec)])
        else:
            levels_processed.append("undefined")
            unsettled.append(spec)
    df['levels_processed'] = levels_processed
    # df.to_csv('features_created_.csv')
    return df
def grid_search(X_train_, X_test_, y_train_, y_test_):
    from sklearn.model_selection import GridSearchCV
    from sklearn.preprocessing import StandardScaler
    from keras.wrappers.scikit_learn import KerasClassifier

    scaler = StandardScaler()
    X_transform = scaler.fit_transform(X_train_)

    # parameters_svm = {'kernel': ('linear', 'poly', 'rbf'), 'C': [1, 10, 100, 1e5]}
    parameters_svm = {'kernel': ['linear'], 'C': [1]}
    # parameters_logistic = {'solver': ('liblinear', 'saga'), 'C': [1, 10, 100, 1e5], 'max_iter': [1000, 2000, 3000]}
    parameters_logistic = {
        'solver': ['liblinear'],
        'C': [1],
        'max_iter': [1000]
    }
    # parameters_decisiontree = {'criterion': ('entropy', 'gini'), 'max_depth': [10, 21, 42]}
    parameters_decisiontree = {'criterion': ['entropy'], 'max_depth': [10]}
    # parameters_kneighbors = {'n_neighbors': (10, 15, 21, 27), 'p': (1, 2, 3)}
    parameters_kneighbors = {'n_neighbors': [10], 'p': [1]}
    # parameters_randomforest = {'n_estimators': (25, 50, 100, 150), 'criterion': ('entropy', 'gini'), 'max_depth': [10, 21, 42]}
    parameters_randomforest = {
        'n_estimators': [25],
        'criterion': ['entropy'],
        'max_depth': [10]
    }
    # parameters_nn = {'dropout_rate': (0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9), 'neurons': (88, 128, 168, 208, 248),
    #                  'batch_size': [20, 40, 60, 80], 'epochs': [10, 50, 100]}
    parameters_nn = {
        'dropout_rate': [0.2],
        'neurons': [88],
        'batch_size': [20],
        'epochs': [10]
    }
    parameters_list = [
        parameters_nn, parameters_decisiontree, parameters_svm,
        parameters_kneighbors, parameters_randomforest, parameters_logistic
    ]

    model5 = LogisticRegression(random_state=0, multi_class='auto')
    model1 = DecisionTreeClassifier(random_state=0)
    model2 = SVC(tol=1e-3, random_state=0, gamma="scale", verbose=True)
    # model2 = SVC(kernel='linear')
    model3 = KNeighborsClassifier(metric='minkowski', algorithm='auto')
    model4 = RandomForestClassifier(random_state=1, n_jobs=2)
    model = KerasClassifier(build_fn=nn_model, input_dim=np.shape(X_train_)[1], verbose=0)
    model_list = [model, model1, model2, model3, model4, model5]
    model_name_list = [
        'NeuralNetwork', 'DecisionTree', 'SVM', 'KNeighbors', 'RandomForest',
        'LogisticRegression'
    ]

    for name, mod, parameter in zip(model_name_list, model_list, parameters_list):
        clf = GridSearchCV(mod, parameter, cv=5, scoring='balanced_accuracy')
        clf.fit(X_transform, y_train_)
        print(clf.cv_results_.keys())
        with open(name + 'parameter.csv', 'w') as f:
            w = csv.writer(f)
            w.writerow(clf.cv_results_.keys())
            for i in range(len(clf.cv_results_['mean_fit_time'])):
                row = []
                for ele_key in clf.cv_results_.keys():
                    print(type(clf.cv_results_[ele_key]))
                    if (isinstance(clf.cv_results_[ele_key], np.ma.core.MaskedArray)):
                        row.append((clf.cv_results_[ele_key].data)[i])
                    else:
                        row.append(clf.cv_results_[ele_key][i])
                w.writerow(row)
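# Design note (an assumption, not from the original code): since cv_results_ is
# a plain dict of equal-length arrays, pandas can serialize it in one line,
# which would avoid the manual MaskedArray handling in the loop above.
# import pandas as pd
# pd.DataFrame(clf.cv_results_).to_csv(name + 'parameter.csv', index=False)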
    model.add(Dense(512, init='normal', activation='relu'))
    model.add(Dense(9, init='normal', activation="softmax"))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(train_y)
encoded_y = encoder.transform(train_y)
dummy_y = np_utils.to_categorical(encoded_y)
print(dummy_y.shape)

estimator = KerasClassifier(build_fn=baseline_model, nb_epoch=10, batch_size=64)  # fixed typo: was nb_epochs
estimator.fit(sentence_vectors[0:3321], dummy_y, validation_split=0.05)
y_pred = estimator.predict_proba(sentence_vectors[3321:])

"""
Submission
"""
submission = pd.DataFrame(y_pred)
submission['id'] = test_index
submission.columns = ['class1', 'class2', 'class3', 'class4', 'class5',
                      'class6', 'class7', 'class8', 'class9', 'id']
submission.to_csv("submission_keras_classify.csv", index=False)
# baseline
def create_baseline():
    # create model
    model = Sequential()
    model.add(Dense(60, input_dim=60, init='normal', activation='relu'))
    model.add(Dense(30, init='normal', activation='relu'))
    model.add(Dense(1, init='normal', activation='sigmoid'))
    # Compile model
    sgd = SGD(lr=0.01, momentum=0.8, decay=0.0, nesterov=False)
    model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])
    return model


numpy.random.seed(seed)
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasClassifier(build_fn=create_baseline, nb_epoch=300,
                                          batch_size=16, verbose=0)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(y=encoded_Y, n_folds=10, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100))
def train_wrapper(filename):  # filename='measurements.mat'
    global dataset
    global seed
    global model
    global ss
    global mX
    global nX
    global mY
    global nY

    data = loadmat(filename)  # this is a dict.
    keys = list(data.keys())[3:]  # skip the first three columns
    values = list(data.values())[3:]
    dataset = pd.DataFrame()
    dataset = dataset.reindex(columns=keys)  # create an empty dataframe
    for ii in np.arange(len(values)):
        v_ = np.array(values[ii])
        dataset[keys[ii]] = pd.Series(v_.flatten())  # cannot add the data to this empty df.
    dataset['y'] = 1 * (dataset['BLER'] <= 0.19)  # H-ARQ target.
    dataset = dataset[['RSRP', 'TBSINR_1', 'rank', 'y']]
    dataset.dropna(inplace=True, axis=0)
    if os.path.exists('dataset.csv'):
        dataset.to_csv('dataset.csv', index=False, mode='a', header=False)  # append
    else:
        dataset.to_csv('dataset.csv', index=False)
    # print(dataset.head())

    # Perform a split 30-70
    train, test = train_test_split(dataset, test_size=0.30, random_state=seed)
    X_train = train.drop('y', axis=1)
    X_test = test.drop('y', axis=1)
    y_train = train['y'].values
    y_test = test['y'].values
    mX, nX = X_train.shape
    mY = y_train.shape
    nY = 1

    ss = MinMaxScaler(feature_range=(0, 1))
    # Scale the variables
    X_train_sc = ss.fit_transform(X_train)
    X_test_sc = ss.transform(X_test)

    model = KerasClassifier(build_fn=create_mlp, verbose=0, epochs=10, batch_size=8)

    # The hyperparameters
    width_dims = [3, 5, 10]
    n_hiddens = [3, 5]  # the depth of hidden layers
    hyperparameters = dict(width=width_dims, depth=n_hiddens)
    class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
    class_weights = dict(enumerate(class_weights))  # Keras expects a {class_index: weight} dict
    grid = GridSearchCV(estimator=model, param_grid=hyperparameters, n_jobs=1, cv=3)

    gpu_available = tf.test.is_gpu_available()
    if (gpu_available == False):
        print('WARNING: No GPU available. Will continue with CPU.')
    with tf.device('/gpu:0'):
        grid_result = grid.fit(X_train_sc, y_train, class_weight=class_weights)

    # This is the best model
    best_model_mlp = grid_result.best_params_
    print(best_model_mlp)
    model = grid_result.best_estimator_
    mlp = model

    y_pred = mlp.predict(X_test_sc)
    y_score = mlp.predict_proba(X_test_sc)
    mu = accuracy_score(y_test, y_pred)

    # Compute ROC curve and ROC area
    try:
        roc_auc = roc_auc_score(y_test, y_score[:, 1])
    except:
        print('WARNING: ROC was not computed. Returning NaN')
        roc_auc = np.nan
    print('ROC for training is: {}'.format(roc_auc))
    print('Misclassification error for training is: {:.3f}'.format(1 - mu))

    return [roc_auc, 1 - mu]  # model is valid
embedding_matrix_frWac = embd_bin(
    200, 'embeddings/frWac_non_lem_no_postag_no_phrase_200_cbow_cut0.bin')

# https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.fr.vec
wiki_dir = 'embeddings/wiki.fr/wiki.fr.vec'
embedding_matrix_wiki = embd(embedding_dim2, wiki_dir)

num_epochs = 50
batch_size = 102

##
ppl1 = Pipeline([("MODEL_WV",
                  KerasClassifier(MODEL_WV,
                                  epochs=num_epochs + 50,
                                  batch_size=batch_size,
                                  verbose=1,
                                  shuffle=False))])
ppl2 = Pipeline([("MODEL_wiki",
                  KerasClassifier(MODEL_wiki,
                                  epochs=num_epochs + 50,
                                  batch_size=batch_size,
                                  verbose=1,
                                  shuffle=False))])
ppl3 = Pipeline([("model_cv_wv",
                  KerasClassifier(model_cv_wv,
                                  epochs=num_epochs + 30,
                                  batch_size=batch_size,
                                  verbose=1,
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold

# ****Basic split of data****
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True, stratify=y)
# kfold = KFold(n_splits=5, shuffle=True)

# ***Stratified split of the data
skf = StratifiedKFold(n_splits=5, random_state=None, shuffle=True)
for train_index, test_index in skf.split(X, y):
    print("Train:", train_index, "Validation:", test_index)  # fixed: was undefined val_index
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

estimator = KerasClassifier(build_fn=rock_classifier, epochs=20, batch_size=1, verbose=1)
results = cross_val_score(estimator, X_train, y_train, cv=skf)  # fixed typo: was y_yrain
print("Baseline: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100))

filename = 'results_400_skf.sav'
joblib.dump(results, filename)
kernel_initializer="random_uniform")) classifier.add(PReLU(input_shape=(6, ))) #Parametric RELU classifier.add(Dropout(rate=0.1)) classifier.add( Dense(units=1, activation="sigmoid", kernel_initializer="random_uniform")) classifier.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=['accuracy']) return classifier # Grid Search for finding the best hyper params model = KerasClassifier(build_fn=classifier_fn, epochs=10, batch_size=25) params = { 'epochs': [10, 25, 100], 'batch_size': [10, 100], 'optimizer': ['adam', 'rmsprop', 'SGD'], 'dp1': [0.12, 0.25] } gridSearch = GridSearchCV(estimator=model, param_grid=params, scoring='accuracy', cv=3) gs = gridSearch.fit(X_train, y_train) print(gs.best_score_) print(gs.best_params_) ##################################### # Set callback functions to early stop training and save the best model so far
# create model
def baseline_model():
    model = Sequential()
    model.add(Dense(100, input_shape=(10249,)))
    model.add(Activation('relu'))
    model.add(Dropout(0.7))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# In[43]:
estimator = KerasClassifier(build_fn=baseline_model, nb_epoch=nb_epoch,
                            batch_size=batch_size, verbose=1)
estimator.fit(X_train_mat, y_train_cat)

# In[44]:
predictions = estimator.predict(X_test_mat)
print(set(predictions))
print(encoder.inverse_transform(predictions))

# In[45]:
print('macro f1:', f1_score(encoded_Y_test, predictions, average='macro'))
        pool_length=model.output_shape[1]))
    model.add(Flatten())
    model.add(Dense(hidden_dims))
    model.add(Dropout(0.5))
    model.add(Activation('relu'))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    # note: categorical_crossentropy is the usual loss for a multi-class softmax output
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    return model


# Generate the model
model = KerasClassifier(build_fn=build_model, nb_epoch=nb_epoch, batch_size=batch_size)

# Load the test data
data = json.load(open("./newstext/data-mini.json"))
X = data["X"]
Y = data["Y"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)
Y_train = np_utils.to_categorical(Y_train, nb_classes)
print(len(X_train), len(Y_train))

# Train
model.fit(X_train, Y_train, verbose=1)
y = model.predict(X_test)
print(y)
def train_and_evaluate_model(model, features, labels):
    # Test, train, valid split
    X_train, X_test, y_train, y_test = train_test_split(features, labels,
                                                        test_size=0.10, random_state=832289)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train,
                                                          test_size=0.25, random_state=832289)
    # Fit the model
    history = model.fit(X_train, y_train, batch_size=batch_size,
                        nb_epoch=nb_epoch, verbose=1,
                        validation_data=(X_valid, y_valid))
    # Evaluate the model
    scores = model.evaluate(X_test, y_test, verbose=0)
    print("Test Accuracy : %s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))


# Find best dropout rate using grid search
model = KerasClassifier(build_fn=create_model, nb_epoch=10, batch_size=5, verbose=1)
dropout_rate = [0.0, 0.2, 0.5]
param_grid = dict(dropout_rate=dropout_rate)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_result = grid.fit(features, labels)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
for params, mean_score, scores in grid_result.grid_scores_:
    print("%f (%f) with: %r" % (scores.mean(), scores.std(), params))

# Training the model using the dropout rate discovered by the grid search
best_params = grid_result.best_params_
dropout_keep_prob = best_params['dropout_rate']
metric = 'accuracy'
tuned_parameters = {
    'epochs': [50],
    'batch_size': [16, 32, 64],
    'conv_layers': [2, 3],
    'filters': [16, 32],
    'kernel_size': [3, 5],
    'units': [256, 512, 1024],
    'dropout_rate': [0.3, 0.5],
    'optimizer': ['adam'],
    'init_mode': ['glorot_uniform'],
}

print('> Grid search:')
print(' - Tuning hyper-parameters for \'{}\' metric\n'.format(metric))
grid_search = GridSearchCV(KerasClassifier(build_fn=build_model, verbose=0),
                           tuned_parameters, cv=3, n_jobs=-1, verbose=2)
print(' - ', end='')
grid_search.fit(X, y, callbacks=[
    tf.keras.callbacks.EarlyStopping(monitor='loss',
                                     patience=5,
                                     min_delta=0.01,
                                     restore_best_weights=True)
])
ch1 = np.append(buf, zo, axis=1)
print('ch: ', str(1))
print(ch1)
print(ch1.shape)

# Separate to Input | Output
X = ch1[:, 0:32].astype(float)
Y = ch1[:, 32]


# Larger model
def create_larger():
    # Create model (neural network)
    model = Sequential()
    model.add(Dense(32, input_dim=32, activation='relu'))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


print('Estimating...')
# KerasClassifier
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasClassifier(build_fn=create_larger, epochs=100,
                                          batch_size=5, verbose=1)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True)
results = cross_val_score(pipeline, X, Y, cv=kfold)
print("Result: acc:%.2f%% stdev:(%.2f%%)" % (results.mean() * 100, results.std() * 100))
def get_model_from_name(model_name, training_params=None):
    # For Keras
    epochs = 250
    if os.environ.get('is_test_suite', 0) == 'True' and model_name[:12] == 'DeepLearning':
        print('Heard that this is the test suite. Limiting number of epochs, which will '
              'increase training speed dramatically at the expense of model accuracy')
        epochs = 30

    all_model_params = {
        'LogisticRegression': {'n_jobs': -2},
        'RandomForestClassifier': {'n_jobs': -2},
        'ExtraTreesClassifier': {'n_jobs': -1},
        'AdaBoostClassifier': {'n_estimators': 10},
        'SGDClassifier': {'n_jobs': -1},
        'Perceptron': {'n_jobs': -1},
        'LinearSVC': {'dual': False},
        'LinearRegression': {'n_jobs': -2},
        'RandomForestRegressor': {'n_jobs': -2},
        'LinearSVR': {'dual': False, 'loss': 'squared_epsilon_insensitive'},
        'ExtraTreesRegressor': {'n_jobs': -1},
        'MiniBatchKMeans': {'n_clusters': 8},
        'GradientBoostingRegressor': {'presort': False, 'learning_rate': 0.05, 'warm_start': True},
        'GradientBoostingClassifier': {'presort': False, 'learning_rate': 0.05, 'warm_start': True},
        'SGDRegressor': {'shuffle': False},
        'PassiveAggressiveRegressor': {'shuffle': False},
        'AdaBoostRegressor': {'n_estimators': 10},
        'XGBRegressor': {'nthread': -1, 'n_estimators': 200},
        'XGBClassifier': {'nthread': -1, 'n_estimators': 200},
        'LGBMRegressor': {'n_estimators': 2000, 'learning_rate': 0.05, 'num_leaves': 8, 'lambda_l2': 0.001},
        'LGBMClassifier': {'n_estimators': 2000, 'learning_rate': 0.05, 'num_leaves': 8, 'lambda_l2': 0.001},
        'DeepLearningRegressor': {'epochs': epochs, 'batch_size': 50, 'verbose': 2},
        'DeepLearningClassifier': {'epochs': epochs, 'batch_size': 50, 'verbose': 2},
        'CatBoostRegressor': {},
        'CatBoostClassifier': {}
    }

    model_params = all_model_params.get(model_name, None)
    if model_params is None:
        model_params = {}

    if training_params is not None:
        print('Now using the model training_params that you passed in:')
        print(training_params)
        # Overwrite our stock params with what the user passes in
        # (i.e., if the user wants 10,000 trees, we will let them do it)
        model_params.update(training_params)
        print('After overwriting our defaults with your values, here are the final params '
              'that will be used to initialize the model:')
        print(model_params)

    model_map = {
        # Classifiers
        'LogisticRegression': LogisticRegression(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RidgeClassifier': RidgeClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'SGDClassifier': SGDClassifier(),
        'Perceptron': Perceptron(),
        'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),
        'LinearSVC': LinearSVC(),
        # Regressors
        'LinearRegression': LinearRegression(),
        'RandomForestRegressor': RandomForestRegressor(),
        'Ridge': Ridge(),
        'LinearSVR': LinearSVR(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'RANSACRegressor': RANSACRegressor(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'Lasso': Lasso(),
        'ElasticNet': ElasticNet(),
        'LassoLars': LassoLars(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'BayesianRidge': BayesianRidge(),
        'ARDRegression': ARDRegression(),
        'SGDRegressor': SGDRegressor(),
        'PassiveAggressiveRegressor': PassiveAggressiveRegressor(),
        # Clustering
        'MiniBatchKMeans': MiniBatchKMeans()
    }

    if xgb_installed:
        model_map['XGBClassifier'] = XGBClassifier()
        model_map['XGBRegressor'] = XGBRegressor()
    if lgb_installed:
        model_map['LGBMRegressor'] = LGBMRegressor()
        model_map['LGBMClassifier'] = LGBMClassifier()
    if catboost_installed:
        model_map['CatBoostRegressor'] = CatBoostRegressor(calc_feature_importance=True)
        model_map['CatBoostClassifier'] = CatBoostClassifier(calc_feature_importance=True)
    if keras_installed:
        model_map['DeepLearningClassifier'] = KerasClassifier(build_fn=make_deep_learning_classifier)
        model_map['DeepLearningRegressor'] = KerasRegressor(build_fn=make_deep_learning_model)

    try:
        model_without_params = model_map[model_name]
    except KeyError as e:
        print('It appears you are trying to use a library that is not available when we try '
              'to import it, or using a value for model_names that we do not recognize')
        raise (e)

    model_with_params = model_without_params.set_params(**model_params)
    return model_with_params
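# A hypothetical usage sketch (names mirror the factory above; the override
# value is illustrative): fetch a configured estimator and tweak one stock param.
# clf = get_model_from_name('DeepLearningClassifier', training_params={'epochs': 50})
# clf.fit(X_train, y_train)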
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


seed = 7
earlyStopping = callbacks.EarlyStopping(monitor='loss', patience=1, verbose=0, mode='auto')
model = KerasClassifier(build_fn=create_model, epochs=90, batch_size=50, verbose=1)

# evaluate using 10-fold cross validation
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = model_selection.cross_val_score(
    model, x_train, y_train, cv=kfold,
    scoring='neg_log_loss',
    fit_params={'callbacks': [earlyStopping]})
print(results.mean())

model.fit(x_train, y_train, callbacks=[earlyStopping])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    print("done remodel")
    return model


print("")
print("====================================")
print("START")
print("====================================")
print("")

# train model and create model as estimator
estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=0)
# the estimator must be fit before it can be saved as a model
estimator.fit(X, Y)

print("")
print("====================================")
print("TESTING")
print("====================================")
print("")

# testing with k-fold
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator, X, dummy_y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100))
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)

# predict a new observation
newObserv = np.array([[0.0, 0, 600, 1, 40, 3, 60000, 2, 1, 1, 50000]])
newObserv = sc_x.transform(newObserv)
newPredection = classifier.predict(newObserv)
newPredection = (newPredection > 0.5)

# using k-fold cross validation
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score

classifier = KerasClassifier(build_fn=build_classifier, batch_size=10, epochs=100)
accuracies = cross_val_score(estimator=classifier, X=x_train, y=y_train, cv=10, n_jobs=-1)
mean = accuracies.mean()
variance = accuracies.std()

# ANN tuning
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV  # fixed typo: was GridSearchCv

classifier = KerasClassifier(build_fn=build_classifier, batch_size=10, epochs=100)
accuracies = cross_val_score(estimator=classifier, X=x_train, y=y_train, cv=10, n_jobs=-1)
mean = accuracies.mean()
    model.add(Dropout(0.5))
    model.add(
        Dense(len(df_train_new_arr[0]) * 3, init='uniform', activation='tanh'))
    model.add(Dropout(0.6))
    model.add(
        Dense(len(df_train_new_arr[0]) * 1, init='uniform', activation='tanh'))
    model.add(Dropout(0.5))
    model.add(Dense(len(target[0]), init='uniform', activation='sigmoid'))
    model.compile(loss='categorical_crossentropy', optimizer=rms, metrics=['accuracy'])
    return model


estimator = KerasClassifier(build_fn=keras_model, nb_epoch=49, batch_size=500, verbose=1)
np.random.seed(123)
estimator.fit(df_train_new_arr_normed, target, verbose=1,
              validation_split=0.3, show_accuracy=True)
predictions = estimator.predict_proba(df_sub_arr_normed)
pred = pd.DataFrame(data=predictions,
                    columns=[x for x in products if x not in drop_targets])
pred['ncodpers'] = df_sub['ncodpers']

# Removing items already present in May-16
pred_T = pd.melt(pred, id_vars="ncodpers", var_name="Var", value_name="Val")
# sklearn.grid_search: use this if your sklearn version does not have model_selection for GridSearchCV
def build_classifier(optimizer):
    classifier = Sequential()
    classifier.add(
        Dense(output_dim=6, init='uniform', activation='relu', input_dim=11))
    classifier.add(Dense(output_dim=6, init='uniform', activation='relu'))
    classifier.add(Dense(output_dim=1, init='uniform', activation='sigmoid'))
    classifier.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return classifier


# no batch_size or nb_epoch here, because those are tuned by GridSearchCV
classifier = KerasClassifier(build_fn=build_classifier)

# use GridSearchCV to find the batch size that gives the best accuracy
parameters = {
    'batch_size': [10, 20, 50, 100, 250, 500],
    'nb_epoch': [100, 500],
    'optimizer': ['adam', 'rmsprop']
}
grid_search = GridSearchCV(estimator=classifier,
                           param_grid=parameters,
                           scoring='neg_mean_squared_error',
                           cv=10)
grid_search = grid_search.fit(X_train, Y_train)
best_parameters = grid_search.best_params_
class Baseline(object):
    """Provide general machine learning models as baseline."""

    def __init__(self, train, valid, target, features, impute=True):
        super(Baseline, self).__init__()
        self.target = target
        self.features = features
        self.train = train
        self.valid = valid
        if impute:
            import pandas as pd
            from sklearn.preprocessing import Imputer
            self.train_prep = pd.DataFrame(Imputer(strategy='mean').fit_transform(self.train),
                                           columns=self.train.columns)
            self.valid_prep = pd.DataFrame(Imputer(strategy='mean').fit_transform(self.valid),
                                           columns=self.valid.columns)
        else:
            self.train_prep = self.train
            self.valid_prep = self.valid

    def LR(self, report=False):
        """Logistic Regression.

        Args:
            report: whether to print out the model analysis report.
        Returns:
            Logistic regression model.
        """
        from sklearn.linear_model import LogisticRegression
        self.lr = LogisticRegression(n_jobs=-1)
        self.lr.fit(self.train_prep[self.features], self.train_prep[self.target])
        if report:
            from Report import Report
            rpt = Report(self.lr, self.train_prep, self.valid_prep, self.target, self.features)
            rpt.ALL()
        return self.lr

    def RF(self, report=False):
        """Random Forest.

        Args:
            report: whether to print out the model analysis report.
        Returns:
            Decision tree model generated from Random Forest.
        """
        from sklearn.ensemble import RandomForestClassifier
        self.rf = RandomForestClassifier(n_estimators=1000, max_features='sqrt',
                                         max_depth=10, random_state=0, n_jobs=-1)
        self.rf.fit(self.train_prep[self.features], self.train_prep[self.target])
        if report:
            from Report import Report
            rpt = Report(self.rf, self.train_prep, self.valid_prep, self.target, self.features)
            rpt.ALL()
        return self.rf

    def GBDT(self, report=False):
        """Gradient Boosting Decision Tree.

        Args:
            report: whether to print out the model analysis report.
        Returns:
            Decision tree model generated from Gradient Boosting Decision Tree.
        """
        from xgboost.sklearn import XGBClassifier
        self.gbdt = XGBClassifier(objective='binary:logistic', booster='gbtree',
                                  learning_rate=0.01, n_estimators=5000, max_depth=3,
                                  subsample=0.75, colsample_bytree=0.75,
                                  n_jobs=4, random_state=2018)
        self.gbdt.fit(self.train_prep[self.features], self.train_prep[self.target])
        if report:
            from Report import Report
            rpt = Report(self.gbdt, self.train, self.valid, self.target, self.features)
            rpt.ALL()
        return self.gbdt

    def NN(self, report=False):
        """Neural Network.

        Args:
            report: whether to print out the model analysis report.
        Returns:
            One-layer neural network model.
        """
        from keras.models import Sequential
        from keras.layers import Dense
        from keras.wrappers.scikit_learn import KerasClassifier

        def baseline_model():
            model = Sequential()
            model.add(Dense(8, input_dim=len(self.features), activation='relu'))
            model.add(Dense(1, activation='sigmoid'))
            model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
            return model

        self.nn = KerasClassifier(build_fn=baseline_model, epochs=5, batch_size=5, verbose=1)
        self.nn.fit(self.train[self.features], self.train[self.target])
        if report:
            from Report import Report
            rpt = Report(self.nn, self.train, self.valid, self.target, self.features)
            rpt.ALL()
        return self.nn
class Baseline(object):
    """Provide general machine learning models as baseline."""

    def __init__(self, train, valid, target, features, impute=True):
        super(Baseline, self).__init__()
        self.target = target
        self.features = features
        self.train = train
        self.valid = valid
        if impute:
            import pandas as pd
            from sklearn.preprocessing import Imputer
            self.train_prep = pd.DataFrame(Imputer(strategy='mean').fit_transform(self.train),
                                           columns=self.train.columns)
            self.valid_prep = pd.DataFrame(Imputer(strategy='mean').fit_transform(self.valid),
                                           columns=self.valid.columns)
        else:
            self.train_prep = self.train
            self.valid_prep = self.valid

    def LR(self, report=False):
        """Logistic Regression.

        Args:
            report: whether to print out the model analysis report.
        Returns:
            Logistic regression model.
        """
        from sklearn.linear_model import LogisticRegression
        self.lr = LogisticRegression(n_jobs=-1)
        self.lr.fit(self.train_prep[self.features], self.train_prep[self.target])
        if report:
            from Report import Report
            rpt = Report(self.lr, self.train_prep, self.valid_prep, self.target, self.features)
            rpt.ALL()
        return self.lr

    def RF(self, report=False):
        """Random Forest.

        Args:
            report: whether to print out the model analysis report.
        Returns:
            Decision tree model generated from Random Forest.
        """
        from sklearn.ensemble import RandomForestClassifier
        self.rf = RandomForestClassifier(n_estimators=1000, max_features='sqrt',
                                         max_depth=10, random_state=0, n_jobs=-1)
        self.rf.fit(self.train_prep[self.features], self.train_prep[self.target])
        if report:
            from Report import Report
            rpt = Report(self.rf, self.train_prep, self.valid_prep, self.target, self.features)
            rpt.ALL()
        return self.rf

    def GBDT(self, report=False):
        """Gradient Boosting Decision Tree.

        Args:
            report: whether to print out the model analysis report.
        Returns:
            Decision tree model generated from Gradient Boosting Decision Tree.
        """
        import lightgbm as lgb
        from sklearn.model_selection import train_test_split
        train, test = train_test_split(self.train, test_size=0.2, random_state=0)
        lgb_train = lgb.Dataset(train[self.features], train[self.target], free_raw_data=False)
        lgb_valid = lgb.Dataset(test[self.features], test[self.target],
                                reference=lgb_train, free_raw_data=False)
        params = {
            'boosting_type': 'gbdt',
            'objective': 'binary',  # fixed typo: was 'bianry'
            'metric': 'auc',
            'num_leaves': 64,
            'learning_rate': 0.01,
            'feature_fraction': 0.75,
            'bagging_fraction': 0.75,
            'bagging_freq': 5,
            'verbose': 0
        }
        self.gbdt = lgb.train(params, lgb_train,
                              num_boost_round=10000,
                              valid_sets=lgb_valid,        # fixed: was valid_set
                              early_stopping_rounds=200,   # fixed: was early_stopping_round
                              verbose_eval=100)
        if report:
            from Report import Report
            rpt = Report(self.gbdt, self.train, self.valid, self.target, self.features)
            rpt.ALL()
        return self.gbdt

    def NN(self, report=False):
        """Neural Network.

        Args:
            report: whether to print out the model analysis report.
        Returns:
            One-layer neural network model.
        """
        from keras.models import Sequential
        from keras.layers import Dense
        from keras.wrappers.scikit_learn import KerasClassifier

        def baseline_model():
            model = Sequential()
            model.add(Dense(8, input_dim=len(self.features), activation='relu'))
            model.add(Dense(1, activation='sigmoid'))
            model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
            return model

        self.nn = KerasClassifier(build_fn=baseline_model, epochs=5, batch_size=5, verbose=1)
        self.nn.fit(self.train[self.features], self.train[self.target])
        if report:
            from Report import Report
            rpt = Report(self.nn, self.train, self.valid, self.target, self.features)
            rpt.ALL()
        return self.nn
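# A hypothetical usage sketch for the Baseline class above; train_df, valid_df,
# 'label', and feature_cols are illustrative names, not from the original code.
# base = Baseline(train_df, valid_df, 'label', feature_cols)
# nn = base.NN(report=False)      # KerasClassifier baseline
# gbdt = base.GBDT(report=False)  # LightGBM baseline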
""" Now it is time to evaluate this model using stratified cross validation in the scikit-learn framework. To use Keras models with scikit-learn, we must use the KerasClassifier wrapper. This class takes a function that creates and returns our neural network model. It also takes arguments that it will pass along to the call to fit() such as the number of epochs and the batch size. We pass the number of training epochs to the KerasClassifier, again using reasonable default values. Verbose output is also turned off given that the model will be created 10 times for the 10-fold cross validation being performed. """ # Rescale our data # evaluate baseline model with standardized dataset estimator = KerasClassifier(build_fn=create_baseline, epochs=100, batch_size=5, verbose=1) """ We are going to use scikit-learn to evaluate the model using stratified k-fold cross validation. This is a resampling technique that will provide an estimate of the performance of the model. It does this by splitting the data into k-parts, training the model on all parts except one which is held out as a test set to evaluate the performance of the model. This process is repeated k-times and the average score across all constructed models is used as a robust estimate of performance. It is stratified, meaning that it will look at the output values and attempt to balance the number of instances that belong to each class in the k-splits of the data. """ kfold = StratifiedKFold(n_splits=1000, shuffle=True, random_state=seed) results = cross_val_score(estimator, X, encoded_Y, cv=kfold)
le_target = LabelEncoder().fit(train[target])
y = le_target.transform(train[target])
train = train.drop([target, 'image', 'filename'], axis=1)

combined_features = Pipeline([
    ('pca', Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=input_dim)),
    ])),
])
X = combined_features.fit_transform(train.as_matrix())

model = KerasClassifier(build_fn=create_mlp)
splitter = ShuffleSplit(n_splits=5, test_size=0.1, random_state=0)
cv_splits = cv_split_generator(X=X, y=y, splitter=splitter)

scores = []
hist = {}
for i, X_train, X_val, y_train, y_val in cv_splits:
    X = combined_features.fit_transform(train.as_matrix())
    results = model.fit(X_train, y_train, nb_epoch=250, batch_size=128,
                        validation_split=0.1, verbose=1)
base_nns = load_all_nns(n_nns)
print('Loaded %d models' % len(base_nns))

# fit stacked model using the ensemble
model = fit_stacked_model(base_nns, X_val, y_val)
# evaluate model on test set
ypred = stacked_prediction(base_nns, model, X_val)
acc = accuracy_score(y_val, ypred)


def get_model():
    return load_model('base_nn.h5')


stk_nn = KerasClassifier(build_fn=get_model)
classif = [stk_nn]
kf = model_selection.StratifiedKFold(n_splits=5)
for i, ensem in enumerate(classif):
    cvscore = model_selection.cross_val_score(ensem, X_train, y_train, cv=kf, scoring='accuracy')
    print("Stacked Ensemble Model %0.0f" % i)
    print("Train (CV) Acc: %0.2f (+/- %0.2f)" % (cvscore.mean(), cvscore.std()))
    ensem.fit(X_train, y_train)
# return the best three results
def top_n(matrix_prob, label_map):
    ans = []
    for line in matrix_prob:
        rank = [label_map[item[0]]
                for item in sorted(enumerate(line), key=lambda v: v[1], reverse=True)]
        ans.append(rank[:3])
    return ans


# basic neural network model
def basic_model():
    model = Sequential()
    model.add(Dense(output_dim=500, input_dim=100, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(output_dim=42, input_dim=500, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


if __name__ == '__main__':
    X = pd.read_csv('./data/triple_train_x_mean.txt', header=None, encoding='utf-8')
    Y = pd.read_csv('./data/triple_train_y.txt', header=None, encoding='utf-8')
    X_test = pd.read_csv('./data/triple_test_x_mean.txt', header=None, encoding='utf-8')
    matrix_y = np_utils.to_categorical(Y, 42)

    # KerasClassifier analysis
    classifier = KerasClassifier(build_fn=basic_model, nb_epoch=10, batch_size=500)
    classifier.fit(X, Y)
    pred_prob = classifier.predict_proba(X_test)
    with open('./model/task2_label_space.txt', encoding='utf-8') as flabel:
        label_map = flabel.read().split()
    pd.DataFrame(top_n(pred_prob, label_map)).to_csv('./data/task2_ans_int_index.txt',
                                                     index=None, header=None, encoding='utf-8')
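# A tiny illustrative check of top_n (toy numbers, not from the original data):
# one sample's probabilities over three classes, mapped through a label list.
# top_n([[0.1, 0.7, 0.2]], ['cat', 'dog', 'bird'])  ->  [['dog', 'bird', 'cat']]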
model.add(BatchNormalization(axis=-1, input_shape=(X.shape[1], X.shape[2]))) model.add(CuDNNLSTM(256, return_sequences=True)) model.add(CuDNNLSTM(256, return_sequences=True)) model.add(CuDNNLSTM(256, return_sequences=True)) model.add(Flatten()) model.add(Dropout(0.4)) model.add(Dense(8, activation='softmax')) # units must match the number of classes # model compilation model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy']) return model # create the model and inspect it model = create_model() print(model.summary()) # wrap the model for scikit-learn model = KerasClassifier(build_fn=create_model, epochs=200, batch_size=16, verbose=1) # evaluate using 10-fold cross validation kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=123) results = cross_val_score(model, X, y, cv=kfold) print(results.mean())
classifier.add(Dense(output_dim=1, init='uniform', activation='sigmoid')) # Compiling the ANN classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) # Root mean square propagation alternative: # classifier.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy']) # Fitting the ANN to the training set directly: # classifier.fit(X_train, y_train, batch_size=10, nb_epoch=100) return classifier neural_network = KerasClassifier(build_fn=create_network, epochs=10, batch_size=5, verbose=0) # Create k-fold cross-validation from sklearn.model_selection import KFold, cross_val_score kf = KFold(n_splits=10, shuffle=True, random_state=1) score = cross_val_score(neural_network, X_train, y_train, cv=kf).mean() print(score) # Alternative workflow without cross-validation, kept as comments (it needs a fitted classifier in scope): # Predict test set results # y_pred = classifier.predict(X_test) # y_pred = [1 if x > 0.5 else 0 for x in y_pred] # Making the confusion matrix
# Add fully connected layer with a ReLU activation function network.add(layers.Dense(units=16, activation="relu")) # Add fully connected layer with a sigmoid activation function network.add(layers.Dense(units=1, activation="sigmoid")) # Compile neural network network.compile(loss="binary_crossentropy", # Cross-entropy optimizer=optimizer, # Optimizer metrics=["accuracy"]) # Accuracy performance metric # Return compiled network return network # Wrap Keras model so it can be used by scikit-learn neural_network = KerasClassifier(build_fn=create_network, verbose=0) # Create hyperparameter space epochs = [5, 10] batches = [5, 10, 100] optimizers = ["rmsprop", "adam"] # Create hyperparameter options hyperparameters = dict(optimizer=optimizers, epochs=epochs, batch_size=batches) # Create grid search grid = GridSearchCV(estimator=neural_network, param_grid=hyperparameters) # Fit grid search grid_result = grid.fit(features, target)
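Once the grid search has run, the winning configuration is available through the standard scikit-learn attributes on the fitted grid object:

print(grid_result.best_score_)   # mean cross-validated accuracy of the best combination
print(grid_result.best_params_)  # a dict with keys 'optimizer', 'epochs', 'batch_size'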
LRscore[j] = np.mean(cross_val_score(logisticModel, train_data, label_data, cv=5)) j = j + 1 print(c) print(LRscore) # plt.plot(c,LRscore,'bx-') # plt.xlabel('penalty') # plt.ylabel('validation score') # plt.title('LR Model selection') # plt.show() # #logisticModel = LogisticRegression(penalty='l2') # #scores[1] = cross_val_score(logisticModel,train_data,label_data,cv=5) # #test model 3 : Neural network #NNModel = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(5000,100), random_state=1, max_iter=500) tbCallback = TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True, write_images=True) NNModel = KerasClassifier(build_fn=create_model, epochs=1200, batch_size=150, verbose=0) cv = ShuffleSplit(n_splits=1, test_size=0.3, random_state=0) #NNscore = cross_val_score(NNModel, train_data, label_data, fit_params={'callbacks': [tbCallback]}, cv=cv) NNModel.fit(train_data, label_data) prediction = np.array(NNModel.predict(test_data)) print(prediction) np.savetxt("prediction.csv", prediction, delimiter=",") #print('MLPClassifier validation score : ', NNscore) #test model 4 : SVM # c = [1] # SVMscore = np.zeros(len(c)) # j = 0 # for i in c:
y, test_size=0.33, random_state=42) print(X_train.shape, Y_train.shape) print(X_test.shape, Y_test.shape) tbCallBack = keras.callbacks.TensorBoard(log_dir='./Graph', histogram_freq=0, write_graph=True, write_images=True) batch_size = 32 model.fit(X_train, Y_train, epochs=7, batch_size=batch_size, verbose=2, callbacks=[tbCallBack]) score, acc = model.evaluate(X_test, Y_test, verbose=2, batch_size=batch_size) print(score) print(acc) model = KerasClassifier(build_fn=createmodel, verbose=0) batch_size = [32, 64] epochs = [10, 2] param_grid = dict(batch_size=batch_size, epochs=epochs) from sklearn.model_selection import GridSearchCV grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1) grid_result = grid.fit(X_train, Y_train) # summarize results print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
""" Now it is time to evaluate this model using stratified cross validation in the scikit-learn framework. To use Keras models with scikit-learn, we must use the KerasClassifier wrapper. This class takes a function that creates and returns our neural network model. It also takes arguments that it will pass along to the call to fit() such as the number of epochs and the batch size. We pass the number of training epochs to the KerasClassifier, again using reasonable default values. Verbose output is also turned off given that the model will be created 10 times for the 10-fold cross validation being performed. """ # Rescale our data # evaluate baseline model with standardized dataset estimator = KerasClassifier(build_fn=create_baseline, epochs=100, batch_size=5, verbose=1) """ We are going to use scikit-learn to evaluate the model using stratified k-fold cross validation. This is a resampling technique that will provide an estimate of the performance of the model. It does this by splitting the data into k-parts, training the model on all parts except one which is held out as a test set to evaluate the performance of the model. This process is repeated k-times and the average score across all constructed models is used as a robust estimate of performance. It is stratified, meaning that it will look at the output values and attempt to balance the number of instances that belong to each class in the k-splits of the data. """ kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed) results = cross_val_score(estimator, X, encoded_Y, cv=kfold)
def train_KerasBinaryClassifier(self, X_train, y_train, noOfepochs): # Use TensorFlow backend sess = tf.Session() K.set_session(sess) def custom_activation(x): return (1 / np.sqrt(self.h_size)) * tf.cos(x / 0.02) get_custom_objects().update( {'custom_activation': Activation(custom_activation)}) def model(): # Optionally load a pretrained supervised NN model (kept for reference): # print("Loading the Pretrained Supervised NN Model..... ") from keras.models import load_model from keras.models import model_from_json # # Model reconstruction from JSON file # with open('../models/supervisedBC/model_architecture.json', 'r') as f: # best_model = model_from_json(f.read()) # # Load weights into the new model # best_model.load_weights('../models/supervisedBC/model_weights.h5') # best_model.compile( # optimizer='rmsprop', # loss='binary_crossentropy', # metrics=['accuracy']) model = Sequential() model.add(Dense(128, input_dim=X_train.shape[1])) model.add(Activation(custom_activation)) model.add(Dense(64, activation='linear')) model.add(Dense(1, activation='sigmoid')) # sigmoid output pairs with binary_crossentropy model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy']) # ## Copy the weights from one model to another model # model.set_weights(best_model.get_weights()) return model # early_stopping = callbacks.EarlyStopping( # monitor='val_loss', patience=1, verbose=0, mode='auto') # print("Removed Early stopping......") pipe = pipeline.Pipeline([('rescale', preprocessing.StandardScaler()), ('nn', KerasClassifier(build_fn=model, epochs=noOfepochs, batch_size=128, verbose=0, validation_split=0.2))]) # callbacks=[early_stopping] pipe.fit(X_train, y_train) model_step = pipe.steps.pop(-1)[1] joblib.dump(pipe, os.path.join(self.directory, 'pipeline.pkl')) # print("Trained Model is Saved at relative path inside PROJECT_DIR ", # self.directory) models.save_model(model_step.model, os.path.join(self.directory, 'model.h5')) return
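Since the Keras step is popped off the pipeline and persisted separately (the underlying network is not picklable), scoring later means reassembling both pieces. A minimal reloading sketch, run in the same class context so self.directory resolves; X_new is a hypothetical new feature matrix, and custom_activation must be registered again before load_model can deserialize the network:

import os
import joblib
from keras.models import load_model

pipe = joblib.load(os.path.join(self.directory, 'pipeline.pkl'))  # scaler-only pipeline
net = load_model(os.path.join(self.directory, 'model.h5'))        # needs custom_activation registered
X_scaled = pipe.transform(X_new)                                  # X_new: hypothetical input
probs = net.predict(X_scaled)                                     # sigmoid outputs in [0, 1]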
def create_model(): model = Sequential() model.add(SimpleRNN(X_train.shape[1], input_dim=X_train.shape[1], return_sequences=True)) # return_sequences=True so the next recurrent layer receives 3D input model.add(Activation('relu')) model.add(SimpleRNN(20000, return_sequences=True)) model.add(Activation('relu')) model.add(Dropout(0.5)) model.add(SimpleRNN(nb_classes)) model.add(Activation('softmax')) model.compile(loss=loss, optimizer=optim, metrics=['accuracy']) return model classifier = KerasClassifier(build_fn=create_model, nb_epoch=nb_epoch, batch_size=batch_size) history = classifier.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch) Y_pred = classifier.predict(X_test, batch_size=batch_size) print(classification_report(y_true=Y_test, y_pred=Y_pred)) plt.figure() plt.plot(history.history['acc']) plt.title('Accuracy') plt.ylabel('Accuracy') plt.xlabel('Epoch') plt.legend(['Training'], loc='upper left') plt.savefig("data/acc.png") # summarize history for loss plt.figure()
model.add(Dropout(rate=0.1)) model.add( Dense(units=5, activation='softmax', kernel_initializer='glorot_uniform')) # softmax (not sigmoid) pairs with categorical_crossentropy for multi-class output model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) return model # Cross Validation from keras.wrappers.scikit_learn import KerasClassifier from sklearn.model_selection import cross_val_score cv_classifier = KerasClassifier(build_fn=build_model, batch_size=25, nb_epoch=1000) accuracies = cross_val_score(estimator=cv_classifier, X=X_train, y=y_train, cv=10) accuracySum = 0 for accuracy in accuracies: accuracySum += accuracy print(accuracySum / accuracies.size) # -- End of cross-validation -- # classifier = build_model()
class BaseKerasSklearnModel(base_model.BaseModel): ''' base Keras model exposed to scikit-learn through the KerasClassifier wrapper ''' ## def __init__(self, data_file, delimiter, lst_x_keys, lst_y_keys, log_filename=DEFAULT_LOG_FILENAME, model_path=DEFAULT_MODEL_PATH, create_model_func=create_model_demo): ## ''' ## init ## ''' ## import framework.tools.log as log ## logger = log.init_log(log_filename) ## self.load_data(data_file, delimiter, lst_x_keys, lst_y_keys) ## self.model_path = model_path ## self.create_model_func = create_model_func def __init__(self, **kargs): ''' init ''' import framework.tools.log as log self.kargs = kargs log_filename = self.kargs["basic_params"]["log_filename"] model_path = self.kargs["basic_params"]["model_path"] self.load_data_func = self.kargs["load_data"]["method"] self.create_model_func = self.kargs["create_model"]["method"] logger = log.init_log(log_filename) (self.dataset, self.X, self.Y, self.X_evaluation, self.Y_evaluation) = self.load_data_func(**self.kargs["load_data"]["params"]) self.model_path = model_path self.dic_params = {} def load_data(self, data_file, delimiter, lst_x_keys, lst_y_keys): ''' load data ''' # Load the dataset self.dataset = numpy.loadtxt(data_file, delimiter=",") self.X = self.dataset[:, lst_x_keys] self.Y = self.dataset[:, lst_y_keys] def init_callbacks(self): ''' init all callbacks ''' os.system("mkdir -p %s" % (self.model_path)) checkpoint_callback = ModelCheckpoint(self.model_path + '/weights.{epoch:02d}-{acc:.2f}.hdf5', \ monitor='acc', save_best_only=False) history_callback = LossHistory() callbacks_list = [checkpoint_callback, history_callback] self.dic_params["callbacks"] = callbacks_list def init_model(self): ''' init model ''' train_params = {"nb_epoch": 10, "batch_size": 10} self.dic_params.update(train_params) self.model = KerasClassifier(build_fn=self.create_model_func, **self.kargs["create_model"]["params"]) self.model.set_params(**self.dic_params) def train_model(self): ''' train model ''' X = self.X Y = self.Y X_evaluation = self.X_evaluation Y_evaluation = self.Y_evaluation seed = 7 numpy.random.seed(seed) # data was already loaded in __init__ history = self.model.fit(X, Y) scores = self.model.score(X, Y) #history_callback = self.dic_params["callbacks"][1] # logging.info(str(history_callback.losses)) logging.info("final : %.2f%%" % (scores * 100)) logging.info(str(history.history)) def process(self): ''' process ''' self.init_callbacks() self.init_model() self.train_model()
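The keyword contract of BaseKerasSklearnModel is implicit in __init__; a minimal configuration sketch (load_my_data is a stand-in for a real loader returning the five-tuple __init__ unpacks, and the params dicts are illustrative):

kargs = {
    "basic_params": {"log_filename": "train.log", "model_path": "./models"},
    "load_data": {"method": load_my_data, "params": {"data_file": "data.csv"}},
    "create_model": {"method": create_model_demo, "params": {"verbose": 0}},
}
model = BaseKerasSklearnModel(**kargs)
model.process()  # init_callbacks -> init_model -> train_model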
# one hot encoding encoder = LabelEncoder() encoder.fit(Y) encoded_Y = encoder.transform(Y) # baseline model def create_baseline(): # create model: start with the same number of neurons in the hidden layer as there are inputs model = Sequential() model.add(Dense(60, input_dim=60, init='normal', activation='relu')) model.add(Dense(1, init='normal', activation='sigmoid')) # compile model model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) return model estimator = KerasClassifier(build_fn=create_baseline, nb_epoch=100, batch_size=5, verbose=0) kfold = StratifiedKFold(y=encoded_Y, n_folds=10, shuffle=True, random_state=seed) results = cross_val_score(estimator, X, encoded_Y, cv=kfold) print("Results: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100))
Y_test = np_utils.to_categorical(y_test, nb_classes)[:max_test_samples] ############################# # scikit-learn wrapper test # ############################# print('Beginning scikit-learn wrapper test') print('Defining model') model = Sequential() model.add(Dense(784, 50)) model.add(Activation('relu')) model.add(Dense(50, 10)) model.add(Activation('softmax')) print('Creating wrapper') classifier = KerasClassifier(model) print('Fitting model') classifier.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch) print('Testing score function') score = classifier.score(X_train, Y_train) print('Score: ', score) print('Testing predict function') preds = classifier.predict(X_test) print('Preds.shape: ', preds.shape) print('Testing predict proba function') proba = classifier.predict_proba(X_test) print('Proba.shape: ', proba.shape)
def modeling(conn, sentences, lib, dz): #def modeling(conn, df, lib, dz): #pts = pd.read_sql("SELECT DISTINCT SUBJECT_ID from UFM", conn) #pts =list(set(pts.SUBJECT_ID)) #pool = [] #for d in dz: # pool += d.pos + d.neg np.random.seed(7) decay_rate = .0002 #time-decay rate; the decay() helper used below is a separate feature transform data = []; train = []; test = [] keys = [k[1] for k in lib] admits = pd.read_sql("SELECT * from admissions", conn) for itr in range(0,5): print ("Sess: {0}".format(itr)) for d in dz: neg = random.sample(d[1], len(d[0])) temp = d[0] + neg random.shuffle(temp) t1, t2 = cross_validation.train_test_split(temp, test_size = .2) train +=t1; test +=t2 #X stands for raw indexes of feature input; V stands for raw feature input #W stands for word vectors from feature input trained by Word2Vec X_train = []; t_train = []; W_train = []; Y_train = [] X_test = []; t_test = []; W_test = []; Y_test = [] V_train = []; V_test = [] count=0 for t in train: print (count) count+=1 corpus = [[s[2], s[3]] for s in sentences if (s[0] == t[0]) and (pd.to_datetime(admits[admits['HADM_ID']==s[1]].ADMITTIME.values[0]) <= t[1])] #order subject by time of entry for each sentence (admission) corpus = sorted(corpus, key = lambda x: x[1]) #transpose into nx2xd from 2xnxd #this way, corpus[0] refers to words and corpus[1] refers to times corpus = list(map(list, zip(*corpus))) x_train = list(chain.from_iterable(corpus[0])) t_stamps = list(chain.from_iterable(corpus[1])) x = np.array(list(map(lambda x: keys.index(x), x_train))) #configure each timestamp to reflect time elapsed from first time entry #calculate time decay from initial event temp = t_stamps[0] t_stamps = [ii-temp for ii in t_stamps] #append X_train.append(x) V_train.append(np.array(x_train)) t_train.append(np.array(t_stamps)) Y_train.append(t[3]) print ("X_train made.") count = 0 for t in test: print (count) count+=1 corpus = [[s[2], s[3]] for s in sentences if (s[0] == t[0]) and (pd.to_datetime(admits[admits['HADM_ID']==s[1]].ADMITTIME.values[0]) <= t[1])] corpus = sorted(corpus, key = lambda x: x[1]) corpus = list(map(list, zip(*corpus))) x_test = list(chain.from_iterable(corpus[0])) t_stamps = list(chain.from_iterable(corpus[1])) temp = t_stamps[0] t_stamps = [ii-temp for ii in t_stamps] x = np.array(list(map(lambda x: keys.index(x), x_test))) X_test.append(x) V_test.append(np.array(x_test)) t_test.append(np.array(t_stamps)) Y_test.append(t[3]) #training normal LSTM and CNN-LSTM top_words = [9444] max_review_length = [1000] embedding_length = [300] X_train = sequence.pad_sequences(X_train, maxlen=max_review_length[0]) X_test = sequence.pad_sequences(X_test, maxlen=max_review_length[0]) #build model using KerasClassifier and Gridsearch cnn = KerasClassifier(build_fn=cnn_train, verbose=1) lstm = KerasClassifier(build_fn=lstm_train, verbose=1) d_cnn = KerasClassifier(build_fn=d_cnn_train, verbose = 1) d_lstm = KerasClassifier(build_fn=d_lstm_train, verbose = 1) # define the grid search parameters batch_size = [32, 64, 128] epochs = [20, 50, 100, 200] optimizer = ['SGD', 'RMSprop', 'Adam'] learn_rate = (10.0**np.arange(-4,-1)).tolist() momentum = np.arange(.5,.9,.1).tolist() neurons = [50, 100, 200] dropout_W = [.1, .2, .5] dropout_U = [.1, .2, .5] W_regularizer = [l1(.0001), l1(.001), l1(.01), l2(.0001), l2(.001), l2(.01), None] U_regularizer = [l1(.0001), l1(.001), l1(.01), l2(.0001), l2(.001), l2(.01), None] init_mode = ['uniform', 'normal', 'zero'] #activation = ['softmax', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear'] param_grid = dict(top_words=top_words, max_length = 
max_review_length, embedding_length = embedding_length, batch_size=batch_size, nb_epoch=epochs, optimizer = optimizer, learn_rate = learn_rate, momentum = momentum, neurons = neurons, dropout_W = dropout_W, dropout_U = dropout_U, W_regularizer = W_regularizer, U_regularizer = U_regularizer, init_mode = init_mode) d_param_grid = dict(input_shape = [(max_review_length[0], embedding_length[0])], batch_size=batch_size, nb_epoch=epochs, optimizer = optimizer, learn_rate = learn_rate, momentum = momentum, neurons = neurons, dropout_W = dropout_W, dropout_U = dropout_U, W_regularizer = W_regularizer, U_regularizer = U_regularizer, init_mode = init_mode) lr_params = {'C':(10.0**np.arange(-4,4)).tolist(), 'penalty':('l1','l2')} sv_params = {'C':(10.0**np.arange(-4,4)).tolist(), 'kernel':('linear', 'poly', 'rbf', 'sigmoid')} rf_params = {'criterion': ['gini', 'entropy']} #setup GridSearch w/ cross validation cnn_grid = GridSearchCV(estimator=cnn, param_grid=param_grid, scoring = 'roc_auc', cv = 5, n_jobs=-1) lstm_grid = GridSearchCV(estimator=lstm, param_grid=param_grid, scoring = 'roc_auc', cv = 5, n_jobs=-1) d_cnn_grid = GridSearchCV(estimator=d_cnn, param_grid=d_param_grid, scoring = 'roc_auc', cv = 5, n_jobs=-1) d_lstm_grid = GridSearchCV(estimator=d_lstm, param_grid=d_param_grid, scoring = 'roc_auc', cv = 5, n_jobs=-1) #GridSearchCV takes a single estimator, so each classic model gets its own grid lr_grid = GridSearchCV(estimator=LR, param_grid=lr_params, scoring='roc_auc', cv=5, n_jobs=-1) sv_grid = GridSearchCV(estimator=SVM, param_grid=sv_params, scoring='roc_auc', cv=5, n_jobs=-1) rf_grid = GridSearchCV(estimator=RF, param_grid=rf_params, scoring='roc_auc', cv=5, n_jobs=-1) # Fit the model cnn_result = cnn_grid.fit(X_train, Y_train) lstm_result = lstm_grid.fit(X_train, Y_train) d_cnn_result = d_cnn_grid.fit(decay(x=np.array(V_train), t_stamps =t_train, embedding_length=embedding_length[0], max_review_length=max_review_length[0])[0], Y_train) d_lstm_result = d_lstm_grid.fit(decay(x=np.array(V_train), t_stamps =t_train, embedding_length=embedding_length[0], max_review_length=max_review_length[0])[0], Y_train) classic_features = decay(x=V_train, t_stamps =t_train, embedding_length=embedding_length[0], max_review_length=max_review_length[0])[1] classics_results = [grid.fit(classic_features, Y_train) for grid in (lr_grid, sv_grid, rf_grid)] #grid_search results: print("CNN Best: %f using %s" % (cnn_result.best_score_, cnn_result.best_params_)) means = cnn_result.cv_results_['mean_test_score'] stds = cnn_result.cv_results_['std_test_score'] params = cnn_result.cv_results_['params'] for mean, stdev, param in zip(means, stds, params): print("%f (%f) with: %r" % (mean, stdev, param)) print("LSTM Best: %f using %s" % (lstm_result.best_score_, lstm_result.best_params_)) means = lstm_result.cv_results_['mean_test_score'] stds = lstm_result.cv_results_['std_test_score'] params = lstm_result.cv_results_['params'] for mean, stdev, param in zip(means, stds, params): print("%f (%f) with: %r" % (mean, stdev, param)) print("Decay CNN Best: 
%f using %s" % (d_cnn_result.best_score_, d_cnn_result.best_params_)) means = d_cnn_result.cv_results_['mean_test_score'] stds = d_cnn_result.cv_results_['std_test_score'] params = d_cnn_result.cv_results_['params'] for mean, stdev, param in zip(means, stds, params): print("%f (%f) with: %r" % (mean, stdev, params)) print("Decay LSTM Best: %f using %s" % (d_lstm_result.best_score_, d_lstm_result.best_params_)) means = d_lstm_result.cv_results_['mean_test_score'] stds = d_lstm_result.cv_results_['std_test_score'] params = d_lstm_result.cv_results_['params'] for mean, stdev, param in zip(means, stds, params): print("%f (%f) with: %r" % (mean, stdev, params)) print("Best of Classics: %f using %s, %s" % (classics_result.best_score_, classics_result.best_estimator_, classics_result.best_params_)) means = classics_result.cv_results_['mean_test_score'] stds = classics_result.cv_results_['std_test_score'] params = classics_result.cv_results_['params'] for mean, stdev, param in zip(means, stds, params): print("%f (%f) with: %r" % (mean, stdev, params)) #KFold = 5 #kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7) #cvscores = [] #for training, testing in kfold.split(X_train, Y_train): # Fit the model #model.fit(X[training], Y[training], nb_epoch=150, batch_size=10, verbose=0) # evaluate the model #scores = model.evaluate(X[testing], Y[testing], verbose=0) #print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100)) #cvscores.append(scores[1] * 100) #print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores))) ######TESTING####### cnn = cnn_train(top_words = top_words, max_length = max_review_length, embedding_length=embedding_length) lstm = lstm_train(top_words = top_words, max_length = max_review_length, embedding_length=embedding_length) cnn.fit(X_train, Y_train, validation_split = .2, nb_epoch=100, batch_size=128, shuffle = True, verbose=1) lstm.fit(X_train, Y_train, validation_split = .2, nb_epoch=100, batch_size=128, shuffle = True, verbose=1) #testing predictions_lstm = lstm.predict_classes(X_test) predictions_cnn = cnn.predict_classes(X_test) acc = accuracy_score(Y_test, predictions_lstm) f1 = f1_score (Y_test, predictions_lstm) auc = roc_auc_score (Y_test, predictions_lstm) scores_lstm = [("Accuracy", acc) , ("F1 Score", f1) , ("AUC Score",auc)] acc = accuracy_score(Y_test, predictions_cnn) f1 = f1_score (Y_test, predictions_cnn) auc = roc_auc_score (Y_test, predictions_cnn) scores_cnn = [("Accuracy", acc) , ("F1 Score", f1) , ("AUC Score",auc)] print ("LSTM DATA: ") for s in scores_lstm: print("%s: %.2f" %(s[0], s[1]), end = " ") print ("") print ("CNN DATA: ") for s in scores_cnn: print("%s: %.2f" %(s[0], s[1]), end = " ") data.append(data) return (Data)
optimizer='adam', metrics=['accuracy']) return model # Load the data --- (*2) data = json.load(open("./newstext/data-mini.json")) #data = json.load(open("./newstext/data.json")) X = data["X"] # data representing the texts Y = data["Y"] # category labels # set the maximum number of words max_words = len(X[0]) # Training --- (*3) X_train, X_test, Y_train, Y_test = train_test_split(X, Y) Y_train = np_utils.to_categorical(Y_train, nb_classes) print(len(X_train), len(Y_train)) model = KerasClassifier( build_fn=build_model, nb_epoch=nb_epoch, batch_size=batch_size) model.fit(X_train, Y_train) # Prediction --- (*4) y = model.predict(X_test) ac_score = metrics.accuracy_score(Y_test, y) cl_report = metrics.classification_report(Y_test, y) print("Accuracy =", ac_score) print("Report =\n", cl_report)
# output layer model.add(Dense(1)) model.add(BatchNormalization()) model.add(Dropout(0.5)) model.add(Activation('sigmoid')) model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy']) return model #thank god for wrappers def nn_model(): return KerasClassifier(build_fn=create_baseline, nb_epoch=20, batch_size=50, verbose = 1) model = KerasClassifier(build_fn=create_baseline, nb_epoch=10, batch_size=80, verbose = 0) #kwargs passed to fit() override the constructor defaults; accuracy is already tracked via metrics model.fit(X_train, y_train, nb_epoch=7, batch_size=300, validation_split=0.1) #cross_val_score clones the estimator, so the fit above does not carry over into the folds scores = cross_validation.cross_val_score(model, X, y, cv = 5, scoring = "accuracy", n_jobs = -1, verbose = 1) model.fit(X_train, y_train, verbose=2) y_pred = model.predict(X_test) ''' print y_pred print y_test print mean_squared_error(y_test, y_pred) ''' #scores = roc_auc_score(y_test,y_pred)