def main(arg=None):
    # Load the npz file of pretrained weights.
    weights = np.load('./vgg16_weights.npz')
    # Weights_Tranined stores the data of vgg16_weights.npz downloaded from the internet.
    # For example, its member 'conv11_w' corresponds to the weights of the first
    # conv3-64 layer, and 'conv11_b' corresponds to the bias of that layer.
    w_trained = Weights_Tranined(weights)

    # Build training and validation sets.
    data_path = os.path.join(os.getcwd(), 'fmnist')
    print(data_path)
    imgs, labels = pre.processing(data_path, CLASS_NUM)
    train_img, train_label, validation_img, validation_label = pre.split(imgs, labels)
    print(train_img.shape)
    print(train_label.shape)

    train_model(t_x=train_img, t_y=train_label, weights=w_trained,
                dprate=DROPOUT_RATE, imgsize=IMG_SIZE, imgchannel=IMG_CHANNEL,
                batchsize=BATCH_SIZE, train_step=TRAINING_STEP,
                learningrate=LEARNING_RATE_BASE, learningdecay=LEARNING_RATE_DECAY,
                regurate=REGULARIZATION_RATE)
def split_sample(file_name):
    # Split the dataset into train and test sets.
    import preprocess as sp
    train_df, test_df = sp.split(file_name)
    # Derive file names for the (randomly sampled) bagged train and test data.
    dataset_name, file_type = file_name.split('.')
    dataset_name_train = dataset_name + "_train.csv"
    dataset_name_test = dataset_name + "_test.csv"
    return dataset_name_train, dataset_name_test, train_df, test_df
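# Example usage (an assumption, not shown in the original; 'dataset.csv' is a
# hypothetical input file): persist the two splits to the derived file names so
# downstream steps can read them back.
train_name, test_name, train_df, test_df = split_sample('dataset.csv')
train_df.to_csv(train_name, index=False)
test_df.to_csv(test_name, index=False)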
def score(filename, disp=True):
    cleaned_df = clean('oasis_longitudinal.csv')
    _, X_test, _, Y_test = split(cleaned_df)
    model = pickle.load(open(filename, 'rb'))
    Y_pred = model.predict(X_test)
    recall = recall_score(Y_test, Y_pred)
    accuracy = accuracy_score(Y_test, Y_pred)
    if disp:
        print(model)
        print(f"Accuracy = {accuracy}")
        print(f"Recall = {recall}")
    return model
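# Example usage (assumption): score each of the pickled classifiers that the
# training script elsewhere in this code saves under model_files/.
for path in ['model_files/logistic.sav', 'model_files/forest.sav',
             'model_files/tree.sav', 'model_files/adaboost.sav']:
    score(path)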
def generate_data(self, seq2seq=False):
    import generator as gen
    series = gen.complete(self.time, pg.baseline, pg.slope, pg.period,
                          pg.amplitude, pg.noise_level)
    train, valid = pp.split(pg.time, series, pg.split_time)
    train_set = pp.window_dataset(train[1], pm.window_size, pm.batch_size,
                                  seq2seq=seq2seq)
    valid_set = pp.window_dataset(valid[1], pm.window_size, pm.batch_size,
                                  seq2seq=seq2seq)
    self.main_series = series
    self.set_data(train_set, valid_set)
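# pp.window_dataset is not shown here. A minimal sketch of what it might do,
# assuming the common tf.data windowing pattern (the real helper may differ):
import tensorflow as tf

def window_dataset(series, window_size, batch_size, shuffle_buffer=1000, seq2seq=False):
    ds = tf.data.Dataset.from_tensor_slices(series)
    # Slide a window of window_size + 1 values over the series.
    ds = ds.window(window_size + 1, shift=1, drop_remainder=True)
    ds = ds.flat_map(lambda w: w.batch(window_size + 1))
    ds = ds.shuffle(shuffle_buffer)
    if seq2seq:
        # Input sequence -> sequence of next values, shifted by one step.
        ds = ds.map(lambda w: (w[:-1], w[1:]))
    else:
        # Input sequence -> single next value.
        ds = ds.map(lambda w: (w[:-1], w[-1:]))
    return ds.batch(batch_size).prefetch(1)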
def id3(examples, attributes):
    root = dt.TreeNode()
    one_count = sum([int(y) for X, y in examples])
    # All examples positive: leaf labelled 1.
    if one_count == len(examples):
        root.label = 1
        return root
    # All examples negative: leaf labelled 0.
    if one_count == 0:
        root.label = 0
        return root
    # No attributes left: leaf with the majority label.
    if not attributes:
        if one_count >= len(examples) / 2.0:
            root.label = 1
        else:
            root.label = 0
        return root
    best_attribute, best_value, info_gain = pp.split(examples, attributes)
    # No attribute yields any information gain: leaf with the majority label.
    if info_gain == 0:
        if one_count >= len(examples) / 2.0:
            root.label = 1
        else:
            root.label = 0
        return root
    root.attribute = best_attribute
    root.split_value = best_value
    # Partition the examples on the chosen attribute and threshold.
    left_exs = []
    right_exs = []
    for ex in examples:
        if ex[0][root.attribute] <= root.split_value:
            left_exs.append(ex)
        else:
            right_exs.append(ex)
    new_attributes = attributes.copy()
    new_attributes.remove(best_attribute)
    root.left_child = id3(left_exs, new_attributes)
    root.right_child = id3(right_exs, new_attributes)
    return root
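# pp.split above is expected to pick the best (attribute, threshold) pair by
# information gain. A minimal self-contained sketch of that computation,
# assuming examples are (feature_dict, 0/1 label) pairs (the real pp.split may
# differ):
import math

def entropy(labels):
    # Shannon entropy of a list of 0/1 labels.
    if not labels:
        return 0.0
    p = sum(labels) / len(labels)
    if p in (0.0, 1.0):
        return 0.0
    return -p * math.log2(p) - (1 - p) * math.log2(1 - p)

def best_split(examples, attributes):
    base = entropy([y for _, y in examples])
    best = (None, None, 0.0)
    for attr in attributes:
        for threshold in sorted({x[attr] for x, _ in examples}):
            left = [y for x, y in examples if x[attr] <= threshold]
            right = [y for x, y in examples if x[attr] > threshold]
            if not left or not right:
                continue
            remainder = (len(left) * entropy(left) + len(right) * entropy(right)) / len(examples)
            gain = base - remainder
            if gain > best[2]:
                best = (attr, threshold, gain)
    return best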
def run():
    for gap in xrange(7):
        expansion = False
        data, comm = preprocess.loadData(gap, expansion)
        train_data, test_data = preprocess.split(data, comm)
        train = np.array(train_data[0])
        test = np.array(test_data[0])
        test_com = test_data[1]
        train_x = train[:, 1:]
        train_y = train[:, 0]
        print train_x.shape
        reg = gcv(train_x, train_y)
        if len(sys.argv) > 1 and sys.argv[1] == 'output':
            test_x = test[:, 1:]
            pred = reg.predict(test_x)
            output_test(pred, test_com, gap)
def id3_depth_limited(examples, attributes, depth):
    root = dt.TreeNode()
    # All examples positive: leaf labelled 1.
    if sum([y for X, y in examples]) == len(examples):
        root.label = 1
        return root
    # All examples negative: leaf labelled 0.
    if sum([y for X, y in examples]) == 0:
        root.label = 0
        return root
    # No attributes left or depth limit reached: leaf with the majority label.
    if not attributes or depth == 0:
        if sum([y for X, y in examples]) >= len(examples) / 2.0:
            root.label = 1
        else:
            root.label = 0
        return root
    best_attribute, best_value, info_gain = pp.split(examples, attributes)
    # No attribute yields any information gain: leaf with the majority label.
    if info_gain == 0:
        if sum([y for X, y in examples]) >= len(examples) / 2.0:
            root.label = 1
        else:
            root.label = 0
        return root
    root.attribute = best_attribute
    root.split_value = best_value
    # Partition the examples on the chosen attribute and threshold.
    left_exs = []
    right_exs = []
    for ex in examples:
        if ex[0][root.attribute] <= root.split_value:
            left_exs.append(ex)
        else:
            right_exs.append(ex)
    new_attributes = attributes.copy()
    new_attributes.remove(best_attribute)
    root.left_child = id3_depth_limited(left_exs, new_attributes, depth - 1)
    root.right_child = id3_depth_limited(right_exs, new_attributes, depth - 1)
    return root
if mode == "train": print "Training the bayes model" bayes_model(mode="train") elif mode == "test": if os.path.exists(bayes_model_file): print "Model is already trained! Reading the file ..." trained_model = pickle.load(open(bayes_model_file, "rb")) bayes_model(mode="test", trained_model=trained_model) else: print "Model file not found :(" elif mode == "score": bayes_model(mode="score") elif mode == "join": pre.join_per_business() elif mode == "split": pre.split() elif sys.argv[1] == 'neural-net': mode = str(sys.argv[2]) if mode in ["train", "test"]: neural_net(mode) # if os.path.exists(bayes_model_file): # print "Model is already trained! Reading the file ..." # trained_model = pickle.load(open(bayes_model_file, "rb")) # bayes_model(mode="test", trained_model=trained_model) # else: # print "Model file not found :(" elif mode == "score": bayes_model(mode="score") elif mode in ["join", "split"]: pre.join_and_split(mode=mode) elif sys.argv[1] == 'scikit-classifier':
mode = sys.argv[1]
input = pd.read_csv(sys.argv[2])
params = json.load(open(sys.argv[3]))
feature = params[FEATURES]
label = params[LABELS]
print('FEATURE: {}\nLABEL: {}'.format(feature, label))

data = pd.DataFrame(columns={label, feature})
data[feature] = filter(input[feature])

if mode == TRAIN:
    # preprocess labels
    data[label] = filter(input[label])
    data[label] = clean(data[label], 'label')
    y = data.pop(label)
    # train-test ratio 70:30
    X_train, y_train, X_test, y_test = split(data, y, 0.3)
    X_train[label] = y_train
    X_test[label] = y_test
    X_test.to_csv(PATH + 'test_file.csv', sep=',')
    print('Begin training -- TRAIN: {} TEST: {}'.format(len(X_train), len(X_test)))

    # create the datasets for training
    test_file, dev_file, train_file = format_data(X_train, label, 0.3)

    # train model
    model = train(PATH, test_file, dev_file, train_file)

    # evaluate
import numpy as np
import pandas as pd

import algo
import preprocess
import recommend

# raw data
movies_raw = pd.read_table('./ml-1m/movies.dat', sep='::',
                           names=['MovieID', 'Title', 'Genres'], engine='python')
ratings_raw = pd.read_table('./ml-1m/ratings.dat', sep='::',
                            names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python')
users_raw = pd.read_table('./ml-1m/users.dat', sep='::',
                          names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], engine='python')

##################################### Part 1: Refine the dataset and create the test sets

# Separate the test sets, i.e. new users and new movies.
ratings_raw, user_test, movie_test = preprocess.split(ratings_raw, fraction=0.1)

# Refine ratings_raw into a UserID x MovieID rating matrix (NaN where unrated).
ratings_nan = ratings_raw.iloc[:, 0:3]
ratings_nan = ratings_nan.pivot(index='UserID', columns='MovieID', values='Rating')

# Process info of users who have rated something.
users = preprocess.user_info(users_raw, ratings_raw)

# Process info of movies that have been rated.
movies, genres = preprocess.movie_info(movies_raw, ratings_raw)

# Genre mapping: helps to convert a new movie into a feature vector.
lst = []
for i in range(0, len(genres)):
    lst.append((genres[i], i))
genre_mapping = dict(lst)
import embedding
import model
import preprocess
from sklearn.model_selection import KFold

input_dataset = './Augmented_Feat.csv'
embedmodel = embedding.train_word2vec('./glove.6B.300d.txt')
question = './questions.csv'

df = preprocess.cleaning_dataset(input_dataset)
df = preprocess.question_demoting(df, question)
X, y = preprocess.scale(df)
X_train, X_test, y_train, y_test = preprocess.split(X, y, 0.2)

split = 5
index = 0
train_model = [None] * split
tokenizer = [None] * split
acc = [None] * split

kfold = KFold(n_splits=split, shuffle=True, random_state=101)
for train, test in kfold.split(X_train, y_train):
    train_model[index], tokenizer[index] = model.train(X_train.iloc[train],
                                                       y_train[train], embedmodel)
    test_results = model.predict(X_train.iloc[test], train_model[index],
                                 tokenizer[index])
    test_results, y_true = model.processresult(test_results, y_train[test])
    acc[index], _ = model.evaluate(test_results, y_true)
    index += 1
index = 0
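# A possible follow-up (an assumption, not part of the original): summarize the
# per-fold accuracies collected above and keep the best fold's model/tokenizer.
import numpy as np

best_fold = int(np.argmax(acc))
print('mean CV accuracy: {:.3f}, best fold: {}'.format(float(np.mean(acc)), best_fold))
final_model, final_tokenizer = train_model[best_fold], tokenizer[best_fold]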
def main(args):

    # Create output folder
    util.mkdir(args['output'], args['clean'])

    # Tensorflow logging
    tf.logging.set_verbosity(tf.logging.WARN)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'

    # Logging to DWF server
    dwf_logging = None

    # Logging
    logger = logging.getLogger('DeepBugHunter')

    if 'dwf_client_info' in args:
        client_info = args['dwf_client_info']
        sys.path.insert(0, client_info['util_path'])
        dwf_logging = __import__('dwf_logging')

    if not logger.handlers:
        formatter = logging.Formatter(fmt='%(asctime)s %(levelname)-8s %(message)s',
                                      datefmt='%Y-%m-%d %H:%M:%S')
        logger.setLevel(logging.DEBUG)

        fh = logging.FileHandler(os.path.join(args['output'], 'dbh.log'), mode='a')
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(formatter)
        logger.addHandler(fh)

        if 'dwf_client_info' in args:
            http_handler = dwf_logging.LogHandler()
            http_handler.setLevel(logging.INFO)
            logger.addHandler(http_handler)
        else:
            ch = logging.StreamHandler()
            ch.setLevel(logging.INFO)
            ch.setFormatter(formatter)
            logger.addHandler(ch)

    extra_log_data = {}
    if dwf_logging is not None:
        extra_log_data = {'progress': 0, 'hash': client_info['client_id']}

    logger.info(msg='DBH started...', extra=extra_log_data)
    logger.info('Input csv is ' + args['csv'])

    # Seeding global random states, just in case...
    tf.set_random_seed(args['seed'])
    # This is used for sklearn algorithms under the hood so we don't have to
    # manually set the random seed separately every time
    np.random.seed(args['seed'])

    # Load the whole input
    data = csv2pandas.load_data(args['csv'], args['label'], args['seed'])

    # Apply optional preprocessing
    for (what, how) in args['preprocess']:
        # TODO: use <what> and generalize preprocessors
        data = getattr(preprocess, how)(*data)

    table = []
    strategy_i = 0
    strategy_cnt = len(args['strategy'])
    for (strategy, sargs) in args['strategy']:

        strategy_i += 1
        logger.info('(%d/%d) Strategy "%s" started with args: <%s>',
                    strategy_i, strategy_cnt, strategy, sargs)

        # Aggregate confusion matrices
        cv_train = ConfMatrix()
        cv_dev = ConfMatrix()
        cv_test = ConfMatrix()

        # For each fold
        fold_generator = preprocess.split(data, folds=FOLDS, seed=args['seed'])
        fold_i = 0
        for remainder, test in fold_generator():
            fold_i += 1

            # A single dev split
            # Not fully fair, but fairer...
            train, dev = next(preprocess.split(remainder, folds=FOLDS, seed=args['seed'])())

            # Resample the training set
            if args['resample'] != 'none':
                train = preprocess.resample(*train, mode=args['resample'],
                                            amount=args['resample_amount'],
                                            seed=args['seed'])

            # Evaluate according to the current strategy
            train_res, dev_res, test_res, cl = getattr(strategies, strategy).learn(train, dev, test, args, sargs)

            # Aggregate metrics for cross-validation F-Measure
            cv_train.add(train_res)
            cv_dev.add(dev_res)
            cv_test.add(test_res)

            if args['calc_completeness']:
                preds = getattr(strategies, strategy).predict(cl, dev, args, sargs)
                issues = preprocess.get_orig_labels(dev[1])
                cv_dev.calc_completeness(preds, issues)

                preds = getattr(strategies, strategy).predict(cl, test, args, sargs)
                issues = preprocess.get_orig_labels(test[1])
                cv_test.calc_completeness(preds, issues)

            if dwf_logging is not None:
                extra_log_data = {'progress': fold_i / FOLDS, 'hash': client_info['client_id']}
            logger.info('Fold %d/10 done', fold_i, extra=extra_log_data)

        train_stats = cv_train.stats(False)
        dev_stats = cv_dev.stats(args['calc_completeness'])
        test_stats = cv_test.stats(args['calc_completeness'])

        logger.info('%s[%s] results:', strategy, sargs)
        logger.info('train: %s', train_stats)
        logger.info('dev: %s', dev_stats)
        logger.info('test: %s', test_stats)

        if dwf_logging is not None:
            result = dwf_logging.pack_results(train_stats, dev_stats, test_stats)
            dwf_logging.report_result(result, client_info['client_id'])

        table.append([
            args['resample'],
            args['resample_amount'],
            args['preprocess'],
            strategy,
            sargs,
            train_stats['fmes'],
            dev_stats['fmes'],
            test_stats['fmes'],
            train_stats,
            dev_stats,
            test_stats,
        ])

    with open(os.path.join(args['output'], 'dbh.csv'), 'a') as f:
        for line in table:
            f.write(';'.join([str(item) for item in line]) + '\n')
def get_data(type, oneyear):
    # Without sequences.
    data = read_data()
    proc_data, y = prepare_data(oneyear, data)
    x_train, x_test, y_train, y_test = split(proc_data, y, type)
    return x_train, x_test, y_train, y_test
def plot_forecast(self):
    series = self.main_series[pg.split_time - pm.window_size:-1]
    fc = self.forecast(series)[:, 0]
    _, valid = pp.split(pg.time, self.main_series, pg.split_time)
    from support import plot_series
    plot_series(valid[0], [valid[1], fc], labels=["Real", "Forecast"])
import pickle

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

from preprocess import clean, split

cleaned_df = clean('oasis_longitudinal.csv')
X_train, X_test, Y_train, Y_test = split(cleaned_df)

logistic_model = LogisticRegression(C=10).fit(X_train, Y_train)
forest_model = RandomForestClassifier(n_estimators=3, max_features=4, n_jobs=4,
                                      max_depth=5, random_state=0).fit(X_train, Y_train)
tree_model = DecisionTreeClassifier(random_state=0, max_depth=1,
                                    criterion='gini').fit(X_train, Y_train)
adaboost_model = AdaBoostClassifier(n_estimators=3, learning_rate=0.0001,
                                    random_state=0).fit(X_train, Y_train)

pickle.dump(logistic_model, open('model_files/logistic.sav', 'wb'))
pickle.dump(forest_model, open('model_files/forest.sav', 'wb'))
pickle.dump(tree_model, open('model_files/tree.sav', 'wb'))
pickle.dump(adaboost_model, open('model_files/adaboost.sav', 'wb'))
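# The preprocess module's clean and split helpers are not shown. A minimal
# sketch of what split might look like, assuming the cleaned DataFrame keeps
# the binary target in a column named 'Group' (hypothetical column name) and
# an 80/20 hold-out (the real helper may differ):
from sklearn.model_selection import train_test_split

def split(cleaned_df, target_col='Group', test_size=0.2, random_state=0):
    X = cleaned_df.drop(columns=[target_col])
    y = cleaned_df[target_col]
    # Returns X_train, X_test, Y_train, Y_test, matching the unpacking above.
    return train_test_split(X, y, test_size=test_size,
                            random_state=random_state, stratify=y)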
def naive(series, split_time):
    return series[split_time - 1:-1]

def moving_average(series, window_size):
    mov = np.cumsum(series)
    mov[window_size:] = mov[window_size:] - mov[:-window_size]
    return mov[window_size - 1:-1] / window_size

series = gen.complete(p.time, p.baseline, p.slope, p.period, p.amplitude,
                      p.noise_level)
time2, series2 = pp.remove_season(time, series)
train, valid = pp.split(time, series, split_time)
train2, valid2 = pp.split(time, series2, split_time - period)

naive_prediction = naive(series, split_time)
plot_series(valid[0], [valid[1], naive_prediction],
            labels=["Series", "Naive Forecast"])
print(mae(naive_prediction, valid[1]))

window = 30
moving_avg = moving_average(series, window)[split_time - window:]
plot_series(valid[0], [valid[1], moving_avg],
            labels=["Series", "Moving average (30 days)"])
print(mae(moving_avg, valid[1]))

window = 50
diff_moving_avg = moving_average(series2,
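# A quick sanity check (not in the original) of the cumulative-sum trick used
# in moving_average above, on a toy series:
import numpy as np

toy = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
print(moving_average(toy, 3))
# -> [2. 3. 4.], i.e. the means of (1,2,3), (2,3,4) and (3,4,5); each value is
#    the trailing average that serves as the forecast for the next time step.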
def main():
    args = sys.argv
    ARG_NUM = 5
    if len(sys.argv) < ARG_NUM:
        print("Error: not enough command-line arguments")
        sys.exit(0)

    df = pd.read_csv('COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
    df = df[df['Country/Region'] == 'Japan']
    df = df.iloc[:, 4:].copy()
    data_at_japan = df.iloc[0, :]
    data_at_japan.index = pd.to_datetime(data_at_japan.index)
    # print(data_at_japan)

    plt.figure(figsize=(10, 5))
    plt.plot(data_at_japan)
    plt.title('COVID-19 confirmed in Japan', y=-0.2)
    plt.xlabel("Date")
    plt.ylabel("Persons infected (people)")
    plt.grid(True)
    # plt.show()
    # Save the figure.
    fname_1 = 'original.png'
    plt.savefig(fname_1)
    plt.close()

    # Build the first-difference (daily new cases) series.
    data_at_japan_diff = data_at_japan - data_at_japan.shift(1)
    data_at_japan_diff = data_at_japan_diff.dropna()
    data_at_japan_diff = data_at_japan_diff['2020-01-23':'2020-10-28']  # 10-28
    # print(data_at_japan_diff)

    plt.figure(figsize=(10, 5))
    plt.plot(data_at_japan_diff)
    plt.title('COVID-19 confirmed in Japan', y=-0.2)
    plt.xlabel("Date")
    plt.ylabel("Persons infected (people)")
    plt.grid(True)
    # plt.show()
    # Save the figure.
    fname_2 = 'diff.png'
    plt.savefig(fname_2)
    plt.close()

    # Decompose the series.
    res = sm.tsa.seasonal_decompose(data_at_japan_diff)
    original = data_at_japan_diff     # original data
    trend_original = res.trend        # trend component
    seasonal_original = res.seasonal  # seasonal component
    residual = res.resid              # residual component

    plt.figure(figsize=(10, 20))  # create the figure and set its size
    # Original data: panel 1 of a 4x1 grid (top).
    plt.subplot(411)
    plt.plot(original)
    plt.title('COVID-19 confirmed (Original) in Japan', y=-0.17)
    plt.xlabel("Date")
    plt.ylabel("Persons infected (people)")
    plt.grid(True)
    # Trend component: panel 2 of the 4x1 grid.
    plt.subplot(412)
    plt.plot(trend_original)
    plt.title('COVID-19 confirmed (Trend) in Japan', y=-0.17)
    plt.xlabel("Date")
    plt.ylabel("Persons infected (people)")
    plt.grid(True)
    # Seasonal component: panel 3 of the 4x1 grid.
    plt.subplot(413)
    plt.plot(seasonal_original)
    plt.title('COVID-19 confirmed (Seasonality) in Japan', y=-0.17)
    plt.xlabel("Date")
    plt.ylabel("Persons infected (people)")
    plt.grid(True)
    # Residual component: panel 4 of the 4x1 grid (bottom).
    plt.subplot(414)
    plt.plot(residual)
    plt.title('COVID-19 confirmed (Residuals) in Japan', y=-0.17)
    plt.xlabel("Date")
    plt.ylabel("Persons infected (people)")
    plt.grid(True)
    plt.tight_layout()  # automatically adjust subplot spacing
    fname_3 = 'decompose.png'
    plt.savefig(fname_3)

    y = data_at_japan_diff.values.astype(float)
    test_size = 7  # test_size
    train_original_data, test_original_data = split(y)
    train_normalized = normalized(train_original_data)
    window = 7  # window size used for training
    study_data, correct_data = sequence_creator(train_normalized, window)

    n_in_out = 1
    n_hidden = int(args[1])
    drop_out = float(args[2])
    tf.random.set_seed(0)

    # parameters = {
    #     'n_hidden': [16, 32, 64, 128, 256, 512, 1024],
    #     'dropout': [0, 0.2, 0.4, 0.5, 0.6],
    # }
    # model = KerasClassifier(build_fn=gru, verbose=0)
    # gridsearch = GridSearchCV(estimator=model, param_grid=parameters)
    # gridsearch.fit(study_data, correct_data)
    # print('Best params are: {}'.format(gridsearch.best_params_))

    # Build the GRU model; use a separate name so the gru() builder is not shadowed.
    gru_model = gru(n_in_out, n_hidden, drop_out)
    print(gru_model.summary())
    filename = 'gru_' + str(n_hidden) + '_' + str(drop_out) + '.png'
    plot_model(gru_model, show_shapes=True, show_layer_names=True, to_file=filename)
    Image(retina=False, filename=filename)

    epochs = 1
    start_time = time.time()
    history = gru_model.fit(study_data, correct_data, batch_size=1, epochs=epochs,
                            validation_split=0.1, verbose=1, callbacks=[])  # lr_decay,
    print("Training time:", time.time() - start_time)

    # === Visualize the training history ===
    # mse
    train_loss = history.history['loss']
    val_loss = history.history['val_loss']
    plt.plot(np.arange(len(train_loss)), train_loss, label="train_loss")
    plt.plot(np.arange(len(val_loss)), val_loss, label="val_loss")
    plt.title('Training and Validation loss')
    plt.ylim((0, 0.04))  # add
    plt.legend()
    # plt.show()

    # === Visualize the training history ===
    # mae
    train_mae = history.history['mae']
    val_mae = history.history['val_mae']
    plt.plot(np.arange(len(train_mae)), train_mae, label="train_mae")
    plt.plot(np.arange(len(val_mae)), val_mae, label="val_mae")
    plt.title('Training and Validation mae')
    plt.ylim((0, 0.2))  # add
    plt.legend()
    # plt.show()

    train_inverse = past_predict(study_data)
    upcoming_future = 7
    predictions_infected_pepole = test_predict(upcoming_future)

    x_all = np.arange('2020-01-23', '2020-10-29', dtype='datetime64[D]').astype('datetime64[D]')
    x_past_predict = np.arange('2020-01-30', '2020-10-22', dtype='datetime64[D]').astype('datetime64[D]')  # 23-26
    x_train = np.arange('2020-01-23', '2020-10-22', dtype='datetime64[D]').astype('datetime64[D]')
    x_test = np.arange('2020-10-22', '2020-10-29', dtype='datetime64[D]').astype('datetime64[D]')

    sns.set()
    COVID = plt.figure(figsize=(20, 8))
    plt.title("COVID-19 in Japan", y=-0.15)
    plt.grid(True)
    plt.xlabel("Date")
    plt.ylabel("Number of persons infected with coronavirus (people)")
    plt.plot(x_all, data_at_japan_diff, 'g', lw=3, label='daily_at_japan')
    # plt.plot(x_train, train_original_data, label='train_data')
    # plt.plot(x_test, test_original_data, label='test_data')
    plt.plot(x_past_predict, train_inverse, color='b', ls='-', lw=3, alpha=0.7,
             label='past_predict')  # maybe +8
    plt.plot(x_test, predictions_infected_pepole, 'r', lw=3, alpha=0.7,
             label='upcoming_future')
    plt.legend(loc='upper left')
    # plt.show()

    sns.set()
    COVID = plt.figure(figsize=(20, 8))
    plt.title("COVID-19 in Japan", y=-0.15)
    plt.grid(True)
    plt.xlabel("Date")
    plt.ylabel("Number of persons infected with coronavirus (people)")
    plt.plot(x_test, test_original_data, color='b', ls='-', lw=3, alpha=0.7,
             label='past_predict')
    plt.plot(x_test, predictions_infected_pepole, 'r', lw=3, alpha=0.7,
             label='upcoming_future')
    # plt.show()

    train_mae, train_mse, train_rmse, train_r2, test_mae, test_mse, test_rmse, test_r2 = eval_func(
        train_inverse, test_original_data, predictions_infected_pepole)
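# sequence_creator, used in main() above, is not shown. A minimal sketch of
# what it might do, assuming it turns the normalized 1-D training series into
# (window, 1)-shaped input sequences and their next-step targets (the real
# helper may differ):
import numpy as np

def sequence_creator(series, window):
    xs, ys = [], []
    for i in range(len(series) - window):
        xs.append(series[i:i + window])   # the last "window" values...
        ys.append(series[i + window])     # ...predict the next value
    return np.array(xs).reshape(-1, window, 1), np.array(ys).reshape(-1, 1)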