def train(file_data=None, file_path="data/data.csv", train_type='lr', y_label='PE', steps=200):
    """Build the model with the high-level Estimator API recommended by the TensorFlow docs."""
    # args = parser.parse_args()
    if file_data:
        (train_x, train_y), (test_x, test_y) = get_data.load_data(file_data=file_data)
    elif file_path:
        (train_x, train_y), (test_x, test_y) = get_data.load_data(file_path=file_path,
                                                                  y_name=y_label)
    else:
        raise Exception("Either a dataset file or a file path must be specified")

    # Avoid shadowing the function name `train` with the dataset variable.
    train_set = (get_data.make_dataset(train_x, train_y).shuffle(10000).batch(100).repeat())
    test_set = get_data.make_dataset(test_x, test_y).batch(100)

    feature_columns = []
    for column in train_x.columns.values:
        feature_columns.append(tf.feature_column.numeric_column(key=column))

    myhook = WriteLog()
    model = tf.estimator.LinearRegressor(feature_columns=feature_columns,
                                         model_dir='lr_model')
    if train_type == 'classifier':
        model = tf.estimator.DNNClassifier(hidden_units=[3, 2],
                                           feature_columns=feature_columns,
                                           model_dir='cl_model')
    model.train(input_fn=from_dataset(train_set), steps=steps, hooks=[myhook])
    eval_result = model.evaluate(input_fn=from_dataset(test_set))
    return eval_result
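# The function above relies on two helpers that are not shown: `from_dataset` and the
# `WriteLog` hook. A minimal sketch of what they might look like, assuming `from_dataset`
# simply wraps a tf.data.Dataset in an Estimator-style input_fn and `WriteLog` logs the
# global step through the TF 1.x SessionRunHook interface (both behaviors are assumptions,
# not taken from the original source):
import tensorflow as tf


def from_dataset(dataset):
    # Estimators expect a zero-argument callable that returns the dataset.
    return lambda: dataset


class WriteLog(tf.train.SessionRunHook):
    def begin(self):
        # Capture the global-step tensor once the graph is finalized.
        self._step_tensor = tf.train.get_global_step()

    def before_run(self, run_context):
        # Ask the session to also fetch the global step on every run() call.
        return tf.train.SessionRunArgs(self._step_tensor)

    def after_run(self, run_context, run_values):
        step = run_values.results
        if step is not None and step % 100 == 0:
            print('global step: {}'.format(step))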
def main(argv):
    (train_x, train_y), (test_x, test_y) = load_data()
    feature_columns = [
        tf.feature_column.categorical_column_with_identity(key='Pclass', num_buckets=4),
        tf.feature_column.categorical_column_with_vocabulary_list(
            key='Sex', vocabulary_list=['male', 'female']),
        tf.feature_column.numeric_column(key='Age'),
        tf.feature_column.numeric_column(key='Fare'),
        tf.feature_column.categorical_column_with_vocabulary_list(
            key='Embarked', vocabulary_list=['C', 'Q', 'S']),
        tf.feature_column.numeric_column(key='SibSp'),
        tf.feature_column.numeric_column(key='Parch')
    ]
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
    model = tf.estimator.LinearClassifier(feature_columns=feature_columns,
                                          optimizer=optimizer)
    # Train on the training split; evaluate on the held-out test split.
    model.train(input_fn=lambda: inp(train_x, train_y), steps=10000)
    eval_result = model.evaluate(input_fn=lambda: inp(test_x, test_y))

    average_loss = eval_result['average_loss']
    print('\n' + 80 * '*')
    print('RMS error: {:.4f}'.format(average_loss ** 0.5))
    print()
def main(argv):
    (train_x, train_y), (test_x, test_y) = load_data()
    feature_columns = [
        tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_identity(
                key='Pclass', num_buckets=4),
            dimension=3),
        tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                key='Sex', vocabulary_list=['male', 'female']),
            dimension=3),
        tf.feature_column.numeric_column(key='Age'),
        tf.feature_column.numeric_column(key='Fare'),
        tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                key='Embarked', vocabulary_list=['C', 'Q', 'S']),
            dimension=3),
        tf.feature_column.numeric_column(key='SibSp'),
        tf.feature_column.numeric_column(key='Parch')
    ]
    model = tf.estimator.Estimator(
        model_fn=dnn_model_fn,
        params={
            'feature_columns': feature_columns,
            'learning_rate': 0.001,
            'optimizer': tf.train.GradientDescentOptimizer,
            'hidden_units': [20, 20]
        })
    model.train(input_fn=lambda: inp(train_x, train_y), steps=100)
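# `dnn_model_fn` is not shown in this snippet. A minimal sketch of a compatible custom
# model_fn, assuming binary classification and the params dict used above (the exact
# architecture of the original is unknown; this follows the TF 1.x custom-Estimator
# pattern):
import tensorflow as tf


def dnn_model_fn(features, labels, mode, params):
    # Build the input layer from the feature columns, then the hidden stack.
    net = tf.feature_column.input_layer(features, params['feature_columns'])
    for units in params['hidden_units']:
        net = tf.layers.dense(net, units=units, activation=tf.nn.relu)
    logits = tf.layers.dense(net, units=2, activation=None)

    predicted_classes = tf.argmax(logits, 1)
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'class_ids': predicted_classes[:, tf.newaxis],
            'probabilities': tf.nn.softmax(logits),
            'logits': logits,
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    if mode == tf.estimator.ModeKeys.EVAL:
        accuracy = tf.metrics.accuracy(labels=labels, predictions=predicted_classes)
        return tf.estimator.EstimatorSpec(mode, loss=loss,
                                          eval_metric_ops={'accuracy': accuracy})

    optimizer = params['optimizer'](learning_rate=params['learning_rate'])
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)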
def __init__(self, use_test_data, asset_channels, time_range, train_period, test_period):
    self.train_period = train_period
    self.test_period = test_period
    if use_test_data:
        quotes_set = create_data(asset_channels)
    else:
        quotes_set = load_data(get_file=True)
    test_size = int(0.15 * quotes_set.shape[1])
    test_quotes = np.array(quotes_set[:, -test_size:, :])
    train_quotes = np.array(quotes_set[:, :-test_size, :])
    self.env = StockMarketEnv(train_quotes, 10, time_range, asset_channels,
                              sub_title='training result',
                              testing_=use_test_data)
    n_x = time_range
    n_y = self.env.action_space
    n_pos = self.env.portfolio_len
    model_file_path = None  # 'output/crypto_trader_test4.ckpt'
    nr_assets = quotes_set.shape[0]
    # The TestBot assignment was dead code (immediately overwritten by A2C below).
    # self.bot = TestBot(input_sz=asset_channels, act_sz=n_y, nr_assets=nr_assets,
    #                    save_path=model_file_path)
    self.bot = A2C(input_sz=asset_channels, act_sz=n_y, nr_assets=nr_assets,
                   save_path=model_file_path)
def main(argv):
    (train_x, train_y), (test_x, test_y) = load_data()
    feature_columns = [
        tf.feature_column.categorical_column_with_identity(key='Pclass', num_buckets=5),
        tf.feature_column.categorical_column_with_vocabulary_list(
            key='Sex', vocabulary_list=['male', 'female']),
        tf.feature_column.numeric_column(key='Age'),
        tf.feature_column.numeric_column(key='Fare'),
        tf.feature_column.categorical_column_with_vocabulary_list(
            key='Embarked', vocabulary_list=['C', 'Q', 'S']),
        tf.feature_column.numeric_column(key='SibSp'),
        tf.feature_column.numeric_column(key='Parch')
    ]
    optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
    model = tf.estimator.LinearClassifier(feature_columns=feature_columns,
                                          optimizer=optimizer)
    model.train(input_fn=lambda: inp(train_x, train_y, 'TRAIN'), steps=30000)
    eval_result = model.evaluate(input_fn=lambda: inp(test_x, test_y, 'EVAL'))
    average_loss = eval_result['average_loss']
    print('Average loss: ' + str(average_loss))
    brute_results = model.predict(
        input_fn=lambda: inp(load_submit(), (), 'PREDICT'))
    net_results = []
    for line in brute_results:
        net_results.append(line['class_ids'][0])
    write_to_file(net_results)
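# The `inp` input function used throughout these snippets is not shown. A minimal
# sketch, assuming it builds a tf.data.Dataset from pandas features/labels and switches
# shuffling/repetition on the mode string; the default batch size and buffer size are
# assumptions, not values from the original source:
import tensorflow as tf


def inp(features, labels, mode='TRAIN', rep=None, batch_size=32):
    if mode == 'PREDICT':
        # No labels at prediction time.
        dataset = tf.data.Dataset.from_tensor_slices(dict(features))
    else:
        dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    if mode == 'TRAIN':
        # `rep=None` repeats indefinitely; `rep=n` repeats n times.
        dataset = dataset.shuffle(1000).repeat(rep)
    return dataset.batch(batch_size)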
def main(argv):
    feature_columns = [
        tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_identity(
                key='Pclass', num_buckets=5)),
        tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                key='Sex', vocabulary_list=['male', 'female'])),
        tf.feature_column.numeric_column(key='Age'),
        tf.feature_column.numeric_column(key='Fare'),
        tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                key='Embarked', vocabulary_list=['C', 'Q', 'S'])),
        tf.feature_column.numeric_column(key='SibSp'),
        tf.feature_column.numeric_column(key='Parch'),
        tf.feature_column.numeric_column(key='agcl'),
        tf.feature_column.numeric_column(key='fsize'),
        tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                key='title', vocabulary_list=['Mr', 'Mrs', 'Miss']),
            dimension=3),
        tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                key='deck', vocabulary_list=cabin_list),
            dimension=3),
        tf.feature_column.numeric_column(key='Fare_Per_Person')
    ]
    (train_x, train_y), (test_x, test_y) = load_data(ratio=0.7)
    units = 2 * [30]
    optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
    model = tf.estimator.DNNClassifier(hidden_units=units,
                                       feature_columns=feature_columns,
                                       optimizer=optimizer,
                                       activation_fn=tf.nn.sigmoid)
    model.train(input_fn=lambda: inp(train_x, train_y, 'TRAIN', rep=3000),
                steps=600000)
    eval_result = model.evaluate(input_fn=lambda: inp(test_x, test_y, 'EVAL', rep=1))
    average_loss = eval_result['average_loss']
    print('Average loss: ' + str(average_loss))
    brute_results = model.predict(input_fn=lambda: inp(load_submit(), (), 'PREDICT'))
    net_results = []
    for line in brute_results:
        net_results.append(line['class_ids'][0])
    write_to_file(net_results)
def load_data_for_solve(file_name, for_train=True):
    label = ""
    if for_train:
        SAVE_DIR = "resultData/"
        # For training, load the data from the train directory.
        original_features, original_data, original = load_data(file_name)
        original_features, original_data, train_label = extract_target(
            original_features, original_data)
        label = train_label.copy()
        save_result(label, "train_label_original.csv", dir_name=SAVE_DIR)
    else:
        SAVE_DIR = "resultData/test/"
        # For testing (or anything else), load the data from the test set.
        original_features, original_data, original = load_data(
            file_name, data_style="Test Set")
    # print(deleted_features)
    data = original_data.copy()
    features = original_features.copy()
    save_result(data, "withoutLabel_originalData.csv", features, dir_name=SAVE_DIR)
    return data, features, label
def portfolio_optimization():
    optimal_path = "optimal_weights.txt"
    if not os.path.exists(optimal_path):
        data_path = os.path.join("tickers_data", "all_data.csv")
        if not os.path.exists(data_path):
            data = get_data.load_data()  # [["Ticker", "close"]].groupby(["Ticker"]).T
        else:
            data = pd.read_csv(data_path)
        data_series = pd.pivot_table(data, index="datetime", columns="Ticker",
                                     values="close")
        mu = expected_returns.ema_historical_return(data_series)
        cov = risk_models.exp_cov(data_series)
        # plotting.plot_covariance(cov, plot_correlation=True)
        ef = efficient_frontier.EfficientFrontier(mu, cov, weight_bounds=(0, 1))
        ef.add_objective(objective_functions.L2_reg, gamma=1)
        ef.max_sharpe(0.02)
        weights_portfolio = ef.weights
        # ef.max_sharpe(risk_free_rate=0.002)
        # ef.max_sharpe()
        dict_keys = data_series.columns.values.tolist()
        weights = {}
        for key, value in zip(dict_keys, weights_portfolio):
            weights[key] = value
        sorted_weights = dict(
            sorted(weights.items(), key=lambda item: item[1], reverse=True))
        # Drop near-zero allocations before caching.
        cleaned_weights = {k: v for k, v in sorted_weights.items() if v > 1e-3}
        with open(optimal_path, "w") as file:
            file.write(json.dumps(cleaned_weights))
        # plt.pie(cleaned_weights.values(), labels=cleaned_weights.keys())
        # plt.show()
    else:
        with open(optimal_path, "r") as file:
            cleaned_weights = json.loads(file.read())
    return cleaned_weights
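# Usage sketch (not from the original source): fetch the cached or freshly computed
# weights and print the allocation, largest positions first.
if __name__ == "__main__":
    weights = portfolio_optimization()
    for ticker, weight in weights.items():
        print(f"{ticker}: {weight:.4f}")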
def main(): print("Please select question to display output for:\n" + "\t8 9") question = input() if question == '8': grid_search() elif question == '9': act = ['gilpin', 'bracco', 'harmon', 'baldwin', 'hader', 'carell'] model = plot_performance(act, 64, 20, 200, 100, torch.nn.LeakyReLU(), 0.001) # Model up to the hidden layer hidden = torch.nn.Sequential(model[0], model[1]) # Images specific to each actor harmon = Variable( torch.from_numpy(load_data('harmon', 100, 'bw64')).type(dtype_float)) baldwin = Variable( torch.from_numpy(load_data('baldwin', 100, 'bw64')).type(dtype_float)) # Get the activations and signifigance of each hidden layer harmon_activ = np.sum(hidden(harmon).data.cpu().numpy(), 0) baldwin_activ = np.sum(hidden(baldwin).data.cpu().numpy(), 0) harmon_last = model[2].weight.data[2].cpu().numpy() baldwin_last = model[2].weight.data[3].cpu().numpy() harmon_signif = harmon_activ * harmon_last baldwin_signif = baldwin_activ * baldwin_last # Find differences diffs = np.abs(harmon_signif - baldwin_signif) most_diff = np.argpartition(diffs, -4)[-4:][::-1] plt.figure(figsize=(5, 5)) for i, idx in enumerate(most_diff): plt.subplot(2, 2, i + 1) weights = model[0].weight.data[i].cpu().numpy() plt.imshow(weights.reshape((64, 64)), cmap=plt.cm.coolwarm) if harmon_signif[i] > baldwin_signif[i]: plt.title('Angie Harmon') else: plt.title('Alec Baldwin') plt.tight_layout() plt.savefig('q9.png') plt.show()
def load_data_for_solve(file_name, for_train=True, is_round_two=False):
    label = ""
    if for_train:
        if is_round_two:
            SAVE_DIR = "resultData_two/"
            data_dir = "PPD-Second-Round-Data/"
            data_style = "Rematch Train/"
            # For training, load the data from the train directory.
            original_features, original_data, original = load_data(
                file_name, data_dir, data_style)
            original_features, original_data, train_label = extract_target(
                original_features, original_data)
            label = train_label.copy()
            save_result(label, "train_label_original_round_two.csv",
                        dir_name=SAVE_DIR)
        else:
            SAVE_DIR = "resultData/"
            # For training, load the data from the train directory.
            original_features, original_data, original = load_data(file_name)
            original_features, original_data, train_label = extract_target(
                original_features, original_data)
            label = train_label.copy()
            save_result(label, "train_label_original.csv", dir_name=SAVE_DIR)
    else:
        if is_round_two:
            SAVE_DIR = "resultData_two/test/"
            data_dir = "PPD-Second-Round-Data/"
            data_style = "Rematch Test/"
            # For testing, load the data from the rematch test directory.
            original_features, original_data, original = load_data(
                file_name, data_dir, data_style)
        else:
            SAVE_DIR = "resultData/test/"
            # For testing (or anything else), load the data from the test set.
            original_features, original_data, original = load_data(
                file_name, data_style="Test Set")
    data = original_data.copy()
    features = original_features.copy()
    save_result(data, "withoutLabel_originalData.csv", features, dir_name=SAVE_DIR)
    return data, features, label
def generate(label_mode='fine'):
    # Load the images with both fine and coarse labels.
    (x_train, y_train_fine_label, y_train_coarse_label), \
        (x_test, y_test_fine_label, y_test_coarse_label) = get_data.load_data(
            label_mode='both', path='./Data/cifar-100/')
    datagen = ImageDataGenerator(rotation_range=0.2,
                                 width_shift_range=0.2,
                                 height_shift_range=0.2,
                                 shear_range=0.2,
                                 zoom_range=0.2,
                                 horizontal_flip=True,
                                 fill_mode='nearest')
    # Reshape both label vectors into columns so they can be stacked side by side.
    y_train_fine_label = np.array([y_train_fine_label])
    y_train_fine_label = y_train_fine_label.reshape(
        y_train_fine_label.shape[1], y_train_fine_label.shape[0])
    y_train_coarse_label = np.array([y_train_coarse_label])
    y_train_coarse_label = y_train_coarse_label.reshape(
        y_train_coarse_label.shape[1], y_train_coarse_label.shape[0])
    y_train = np.concatenate((y_train_fine_label, y_train_coarse_label), axis=1)
    # Keep appending augmented batches until the training set has doubled.
    for x_batch, y_batch in datagen.flow(x_train, y_train, batch_size=200):
        x_train = np.concatenate((x_train, x_batch), axis=0)
        y_train = np.concatenate((y_train, y_batch), axis=0)
        print('x_train shape', x_train.shape)
        if x_train.shape[0] >= 50000 * 2:
            break
    (y_train_fine, y_train_coarse) = np.split(y_train, 2, axis=1)
    y_train_fine = y_train_fine.reshape(y_train_fine.shape[0])
    y_train_coarse = y_train_coarse.reshape(y_train_coarse.shape[0])
    if label_mode == 'fine':
        return (x_train, y_train_fine), (x_test, y_test_fine_label)
    elif label_mode == 'both':
        return (x_train, y_train_fine, y_train_coarse), \
            (x_test, y_test_fine_label, y_test_coarse_label)
def main():
    file = 'input/marvel-comics-character-dataset - marvel-wikia-data.csv'
    mydict = get_data.load_data(file)
    comic_ = get_random_id(mydict)
    orientation = get_key_from_dict('orientation', mydict[comic_])
    year_introduced = get_key_from_dict('introduced', mydict[comic_])
    num_appearances = get_key_from_dict('appearances', mydict[comic_])
    good_evil_no = get_key_from_dict('alignment', mydict[comic_])
    dead_alive = get_key_from_dict('status', mydict[comic_])
    age = get_age(mydict[comic_])
    print_test_lines(mydict[comic_], age, unit='years old', verb='is')
    print_test_lines(mydict[comic_], year_introduced, verb='was created in')
    print_test_lines(mydict[comic_], num_appearances, unit='times', verb='appeared')
    print_test_lines(mydict[comic_], good_evil_no, verb='is')
    print_test_lines(mydict[comic_], dead_alive, verb='is')
def select_data(actors, imagesize=32):
    # Per-actor (train, validation, test) split sizes.
    sizes = [(54, 16, 16)] + [(64, 20, 20)] * 5
    training = []
    validation = []
    testing = []
    train_labels = []
    valid_labels = []
    test_labels = []
    for i in range(6):
        d = load_data(actors[i], sum(sizes[i]), 'bw' + str(imagesize))
        training.append(d[:sizes[i][0]])
        validation.append(d[sizes[i][0]:-sizes[i][2]])
        testing.append(d[-sizes[i][2]:])
        train_labels.append(np.ones(sizes[i][0]) * i)
        valid_labels.append(np.ones(sizes[i][1]) * i)
        test_labels.append(np.ones(sizes[i][2]) * i)
    training = Variable(torch.from_numpy(np.vstack(training)).type(dtype_float),
                        requires_grad=False)
    validation = Variable(torch.from_numpy(np.vstack(validation)).type(dtype_float),
                          requires_grad=False)
    testing = Variable(torch.from_numpy(np.vstack(testing)).type(dtype_float),
                       requires_grad=False)
    train_labels = Variable(torch.from_numpy(np.hstack(train_labels)).type(dtype_long),
                            requires_grad=False)
    valid_labels = Variable(torch.from_numpy(np.hstack(valid_labels)).type(dtype_long),
                            requires_grad=False)
    test_labels = Variable(torch.from_numpy(np.hstack(test_labels)).type(dtype_long),
                           requires_grad=False)
    return (training, train_labels), (validation, valid_labels), \
        (testing, test_labels)
    plt.show()
    return stats


if __name__ == "__main__":
    # TODO: Determine why CPU is faster than GPU
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # -1: Defaults to CPU, 0: GPU

    imb_rate = 0.01  # Imbalance rate
    min_class = [1]  # Minority classes, must be same as trained model
    maj_class = [0]  # Majority classes, must be same as trained model
    datasource = "credit"  # The dataset to be selected
    fp_model = "./models/20200928_FN20_FP91.h5"  # Filepath to the .h5-model

    # Remove classes ∉ {min_class, maj_class}, imbalance the dataset.
    # Make sure the same seed is used as during training to avoid data contamination.
    X_train, y_train, X_test, y_test, X_val, y_val = load_data(
        datasource, imb_rate, min_class, maj_class)  # Load all data

    model = load_model(fp_model)
    y_pred = make_predictions(model, X_test)
    plot_conf_matrix(y_test, y_pred)  # Plot confusion matrix based on test dataset

    # y_baseline = np.ones(len(y_pred), dtype=int)  # Baseline: everything is Majority
    # plot_conf_matrix(y_val, y_baseline)

    # plt.imshow(X_test[0], cmap="Greys")  # Show first image
    # plt.show()
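# `make_predictions` is defined elsewhere in this project. A minimal sketch, assuming
# the Keras model outputs one probability per class and the predicted label is its
# argmax (this behavior is an assumption, not taken from the original source):
import numpy as np


def make_predictions(model, X):
    # Row-wise argmax over the class probabilities.
    return np.argmax(model.predict(X), axis=1)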
choices=["image", "text", "structured"], default="image") parser.add_argument("--imb-rate", type=float, default=0.04) parser.add_argument("--min-class", type=str, default="2") parser.add_argument("--maj-class", type=str, default="3") parser.add_argument("--training-steps", type=int, default=10_000) args = parser.parse_args() data_source = args.data imb_rate = args.imb_rate training_steps = args.training_steps min_class = list(map(int, args.min_class)) # String to list of integers maj_class = list(map(int, args.maj_class)) # String to list of integers X_train, y_train, X_test, y_test, X_val, y_val = load_data( data_source, imb_rate, min_class, maj_class, normalization=NORMALIZATION) print(f"X_train: {X_train.shape}, y_train: {y_train.shape}") print(f"Minority: {min_class}, Majority: {maj_class}") input_shape = X_train.shape[1:] env = ClassifyEnv(MODE, imb_rate, X_train, y_train) if args.model == "image": model = get_image_model(input_shape) elif args.model == "text": input_shape = (5_000, 500) model = get_text_model(input_shape) else: model = get_structured_model(input_shape)
def main():
    random.seed(318)
    ray.init(num_cpus=POOL_SIZE,
             ignore_reinit_error=True,
             local_mode=False,
             log_to_driver=True)
    logbook = tools.Logbook()

    # Make a new folder for each experiment; it holds the qualified factors found,
    # plus the highest-scoring factor (in case no qualified factor is found before
    # the iterations run out).
    test_number = datetime.now().strftime("%Y%m%d_%H_%M_%S")
    this_test_path = os.path.join(FACTOR_PATH, test_number)
    os.makedirs(this_test_path, exist_ok=True)
    os.makedirs(os.path.join(this_test_path, "best_factors"), exist_ok=True)
    os.makedirs(os.path.join(this_test_path, "found_factors"), exist_ok=True)

    # Save the config alongside the results.
    with open(os.path.join(this_test_path, 'config.json'), 'w') as outfile:
        json.dump(config, outfile)

    # Set up the logger.
    loggerFolder = os.path.join(this_test_path, 'log')
    os.makedirs(loggerFolder, exist_ok=True)
    logger = Logger(loggerFolder=loggerFolder, exeFileName='log')
    globalVars.initialize(logger)

    # Load data into globalVars.
    load_data("barra", os.path.join(os.path.join(PROJECT_ROOT, "data"), "h5"))
    # load_data("materialData",
    #           os.path.join(os.path.join(PROJECT_ROOT, "data"), "h5")
    #           )
    load_data(
        "materialData",
        os.path.join(os.path.join(os.path.join(PROJECT_ROOT, "data"), "h5"),
                     "materialData_newData.h5"))
    globalVars.logger.info('load all......done')

    # Prepare data: pull out only the datasets named above.
    materialDataDict = {
        k: globalVars.materialData[k] for k in materialDataNames
    }  # only take the data specified in materialDataNames
    barraDict = {k: globalVars.barra[k] for k in barraNames
                 }  # only take the data specified in barraNames
    toRegFactorDict = {}

    # Define the return series fed into the evaluation function: the pct change
    # from tomorrow's open to the day-after-tomorrow's open.
    open_ = globalVars.materialData['open']
    shiftedPctChange_df = open_.to_DataFrame().pct_change().shift(-2)
    open_shift_df = globalVars.materialData['open'].to_DataFrame().shift(-1)

    # Align all datasets with the return series.
    periodShiftedPctChange_df = shiftedPctChange_df.loc[PERIOD_START:PERIOD_END]
    periodShiftedPctChange = GeneralData('periodShiftedPctChange_df',
                                         periodShiftedPctChange_df)
    periodOpen_shift_df = open_shift_df.loc[PERIOD_START:PERIOD_END]
    periodOpen_shift = GeneralData('periodOpen_shift_df', periodOpen_shift_df)
    periodMaterialDataDict = align_all_to(materialDataDict, periodShiftedPctChange)
    periodBarraDict = align_all_to(barraDict, periodShiftedPctChange)
    del shiftedPctChange_df, periodShiftedPctChange_df

    # Stack the data used for regression into 3-D numpy arrays ahead of time.
    barraStack = None
    toRegFactorStack = None
    if len(barraDict) > 0:
        barraStack = np.stack(
            [aB.generalData for aB in periodBarraDict.values()], axis=2)
    if len(toRegFactorDict) > 0:
        toRegFactorStack = np.stack(
            [aB.generalData for aB in toRegFactorDict.values()], axis=2)

    # Put the data into ray's object store for later use.
    materialDataDictID = ray.put(periodMaterialDataDict)
    barraStackID = ray.put(barraStack)
    toRegFactorStackID = ray.put(toRegFactorStack)

    # Bind the evaluation function to the data, so the resulting partial only
    # needs the factor individual as input.
    evaluate = partial(preprocess_eval_single_period,
                       materialDataDictID=materialDataDictID,
                       barraStackID=barraStackID,
                       toRegFactorStackID=toRegFactorStackID,
                       factorEvalFunc=partial(EVALUATE_FUNC,
                                              layerNum=10,
                                              price=periodOpen_shift),
                       pset=pset)

    for i in range(ITERTIMES):
        logger.info("starting eaSimple, iteration {}".format(i + 1))
        # Run eaSimple: breed with the genetic algorithm, using the parameters
        # above and the prepared evaluate function for fitness testing. Two
        # possible outcomes:
        #   1. a qualified factor is found (findFactor == True)
        #   2. none is found after N_GEN generations (findFactor == False)
        findFactor, returnIndividual, logbook = easimple(
            toolbox=toolbox,
            stats=mstats,
            logbook=logbook,
            evaluate=evaluate,
            logger=globalVars.logger,
            N_POP=N_POP,
            N_GEN=N_GEN,
            CXPB=CXPB,
            MUTPB=MUTPB)

        if findFactor:
            # A qualified factor was found: save it under found_factors.
            func, factor_data = compileFactor(
                individual=returnIndividual,
                materialDataDict=periodMaterialDataDict,
                pset=pset)
            factor = Factor(name=str(returnIndividual),
                            generalData=factor_data,
                            functionName=str(returnIndividual),
                            reliedDatasetNames={
                                "materialData": list(materialDataDict.keys())
                            },
                            parameters_dict={},
                            **config)
            factor.save(
                os.path.join(this_test_path, "found_factors",
                             "{}.pickle".format(factor.name)))
            # Every factor found later must be regressed against the earlier
            # ones, so rebuild the regression stack and redefine the evaluate
            # function.
            toRegFactorDict.update({str(returnIndividual): factor})
            if len(toRegFactorDict) > 0:
                toRegFactorStack = np.stack(
                    [aB.generalData for aB in toRegFactorDict.values()], axis=2)
                toRegFactorStackID = ray.put(toRegFactorStack)
            evaluate = partial(preprocess_eval_single_period,
                               materialDataDictID=materialDataDictID,
                               barraStackID=barraStackID,
                               toRegFactorStackID=toRegFactorStackID,
                               factorEvalFunc=partial(EVALUATE_FUNC,
                                                      layerNum=10,
                                                      price=periodOpen_shift),
                               pset=pset)
            continue
        else:
            # No qualified factor was found: save the best individual under
            # best_factors.
            func, factor_data = compileFactor(
                individual=returnIndividual,
                materialDataDict=periodMaterialDataDict,
                pset=pset)
            factor = Factor(name=str(returnIndividual),
                            generalData=factor_data,
                            functionName=str(returnIndividual),
                            reliedDatasetNames={
                                "materialData": list(materialDataDict.keys())
                            },
                            parameters_dict={},
                            **config)
            factor.save(
                os.path.join(this_test_path, "best_factors",
                             "{}.pickle".format(factor.name)))
        logger.info("finished eaSimple, iteration {}".format(i + 1))
import sys
sys.path.insert(0, './resnet')

import tensorflow as tf
import numpy as np

from resnet152 import get_resnet
# from convert2 import load_image
from generator import combine_embeddings, generator_me
from discriminator import discriminator
from rnn_module import rnn_module
from get_data import load_data, get_training_batch

sess = tf.InteractiveSession()

print('loading data')
qa_data = load_data()
(questions_in_true, answer_in_true, image_in_true) = get_training_batch(0, 50, qa_data)

print('starting')
im_feat_true = get_resnet(sess, image_in_true)
print(im_feat_true)
np.save('features_test', im_feat_true)

# im_feat_true = np.random.normal(size=[batch_size, 2048])
# im_feat_false = np.random.normal(size=[batch_size, 2048])

# Get embedding of true input data:
# features_true = sess.run(features, feed_dict={images: image_true, questions: questions_true})
# Get embedding of false input data:
# features_false = sess.run(features, feed_dict={images: image_false, questions: questions_false})
processor = ClassifyProcessor()

with open(f"./logs_alt/DQN_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
          'w', newline='') as f:
    print(f"Writing files to: {f.name}")
    writer = csv.DictWriter(f, fieldnames=columns)
    writer.writeheader()
    f.flush()

    for i in tqdm(range(N_REPETITIONS)):
        X_train, y_train, X_test, y_test, X_val, y_val = load_data(
            "credit", imb_rate, min_class, maj_class,
            normalization=NORMALIZATION, print_stats=False)
        env = ClassifyEnv(MODE, imb_rate, X_train, y_train)
        memory = SequentialMemory(limit=MEMORY_SIZE, window_length=1)
        policy = LinearAnnealedPolicy(EpsGreedyQPolicy(),
                                      attr="eps",
                                      value_max=EPS_MAX,
                                      value_min=EPS_MIN,
                                      value_test=0.05,
                                      nb_steps=EPS_STEPS)
        dqn = DQNAgent(model=model,
                       policy=policy,
                       nb_actions=2,
def check_performance(name, labels, parameters):
    images = get_data.load_data(name, sizes=[50])
    a = classifier.labelling_accuracy(images, labels, parameters)
    print('\t', name, 'classified with', a, 'accuracy')
def main(): print("Please select question to display output for:\n" + "\t1 2 3 4 5 6 7 8") question = raw_input() if question == '1': print('The downloading and cropping of images is done by the ' 'get_data function in the get_data.py file.') get_data.get_data() # For some strange reason, when performing downloading all the images # the program hangs after completion, never exiting. 'Done' is printed # however. The program does not close even if I forcibly throw an error. # On small subsets however, it does close. I suspect it has something to # with the automatic garbage collection running into some issue, but I # do not have any clue as to what. If you know what could be the cause, # please let me know! Otherwise, take this as a notice that when 'Done' # is printed the program has finished. print('Done') elif question == '2': print('The loading of images is done by the load_data function in the ' 'get_data.py file. Seperating the images into three sets is done ' 'as part of the classify function in the classifier.py file.') images = get_data.load_data('baldwin') # for example # Note that by default, the seed is fixed so that repeated identical # calls will produce an identical selection of images. This is how we # ensure reproducibility. For true randomness, just set the parameter: rand_imgs = get_data.load_data('baldwin', is_random=True) elif question == '3': p, cost, accuracy = classifier.classify(['baldwin', 'carell']) print("Training Error:", cost[0]) print("Validation Error:", cost[1]) print("Validation Accuracy:", accuracy[1]) print("Testing Accuracy:", accuracy[2]) elif question == '4': def visualize(p, name): vis = p[1:].reshape((32, 32)) io.imshow(vis) plt.savefig(name) plt.show() print('Full training set') p, _, _ = classifier.classify(['baldwin', 'carell']) visualize(p, '4a-1.png') print('Two image training set') p, _, _ = classifier.classify(['baldwin', 'carell'], set_sizes=(2, 10, 10)) visualize(p, '4a-2.png') print('Stop too early vs stop too late') p, _, _ = classifier.classify(['baldwin', 'carell'], max_iter=100) visualize(p, '4b-1.png') p, _, _ = classifier.classify(['baldwin', 'carell'], epsilon=1e-7) visualize(p, '4b-2.png') elif question == '5': train_act = [['bracco', 'gilpin', 'harmon'], ['baldwin', 'carell', 'hader']] other_act = [['chenoweth', 'ferrera', 'drescher'], ['butler', 'vartan', 'radcliffe']] # Compare performance against other actors def check_performance(name, labels, parameters): images = get_data.load_data(name, sizes=[50]) a = classifier.labelling_accuracy(images, labels, parameters) print('\t', name, 'classified with', a, 'accuracy') p, _, _ = classifier.classify([train_act[0], train_act[1]], set_sizes=(66, 10, 10)) print('Training Actors') for act in train_act[0]: check_performance(act, np.ones(50), p) for act in train_act[1]: check_performance(act, np.zeros(50), p) print('Other Actors') for act in other_act[0]: check_performance(act, np.ones(50), p) for act in other_act[1]: check_performance(act, np.zeros(50), p) # Visualize performance vs set size plt.axis((0, 70, 0.5, 1)) plt.xlabel('Size of Training Set') plt.ylabel('Classification Accuracy') vald = mpatches.Patch(color='blue', label='Validation Set') test = mpatches.Patch(color='green', label='Testing Set') plt.legend(handles=[vald, test], loc=4) plt.ion() for i in range(1, 67): p, e, a = classifier.classify([train_act[0], train_act[1]], set_sizes=(i, 10, 10), is_random=True) plt.scatter(i, a[1], c='b') plt.scatter(i, a[2], c='g') plt.pause(0.01) plt.savefig('5.png') elif question == '6': # 
        # Initialize some data to be used and find the gradient
        images = get_data.load_data('bracco', [40])
        param = np.random.random((1025, 6)) * 1e-2
        labels = np.array([[1, 0, 0, 0, 0, 0]] * 40)
        grad = classifier.cost_gradient(images, labels, param)
        h = 1e-6

        # Compare against finite differences
        np.random.seed(17)
        for _ in range(5):
            x = np.random.randint(0, 1025)
            y = np.random.randint(0, 6)
            param_mod = param.copy()
            param_mod[x, y] += h
            estimate = (classifier.cost_function(images, labels, param_mod) -
                        classifier.cost_function(images, labels, param)) / h
            print('(p,q) =', (x, y), '-> function:',
                  '{:f}'.format(grad[x, y]), '\t',
                  'estimate:', '{:f}'.format(estimate))
    elif question == '7':
        act = ['bracco', 'gilpin', 'harmon', 'baldwin', 'carell', 'hader']
        p, cost, accuracy = classifier.classify(act, set_sizes=(66, 10, 10))
        print("Validation Accuracy:", accuracy[1])
        print("Testing Accuracy:", accuracy[2])
    elif question == '8':
        act = ['bracco', 'gilpin', 'harmon', 'baldwin', 'carell', 'hader']
        p, e, a = classifier.classify(act, set_sizes=(66, 10, 10))
        for i, vis in enumerate(p[1:].T):
            vis = vis.reshape((32, 32))
            io.imshow(vis)
            plt.savefig('8-' + str(i) + '.png')
            plt.show()
def serving_input_receiver_fn():
    serialized_tf_example = tf.placeholder(dtype=tf.string,
                                           shape=[batch_size],
                                           name='input_example_tensor')
    receiver_tensors = {'examples': serialized_tf_example}
    features = tf.parse_example(serialized_tf_example, feature_spec)
    return tf.estimator.export.ServingInputReceiver(features, receiver_tensors)


graph = tf.Graph()
with graph.as_default():
    sess = tf.Session()
    (train_x, train_y), (test_x, test_y) = get_data.load_data()
    my_feature_columns = []
    for key in train_x.keys():
        my_feature_columns.append(tf.feature_column.numeric_column(key=key))
    print(my_feature_columns)
    classifier = tf.estimator.Estimator(model_fn=my_model,
                                        params={
                                            'feature_columns': my_feature_columns,
                                            'hidden_units': [100, 100, 100, 100],
                                            'n_classes': 2,
                                        },
                                        model_dir='/models/angela')
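# Usage sketch (an assumption, not shown in the original): the `feature_spec` used by
# the receiver above can be derived from the feature columns, and once the Estimator
# has been trained (so a checkpoint exists in model_dir) it can be exported as a
# SavedModel for serving. `batch_size` is assumed to be defined elsewhere.
feature_spec = tf.feature_column.make_parse_example_spec(my_feature_columns)
# classifier.export_savedmodel('./export', serving_input_receiver_fn)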
import numpy as np
import keras
import random
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.utils import to_categorical
from keras.optimizers import SGD
from get_data import load_data
from sklearn.model_selection import train_test_split

X, y = load_data()
X, y = np.asarray(X), np.asarray(y)
y = to_categorical(y)
X = X.reshape(X.shape[0], X.shape[1], X.shape[2], 1)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                    random_state=42)
print("Shape of training data : ", x_train.shape, y_train.shape)
print("Shape of testing data : ", x_test.shape, y_test.shape)

model = Sequential()
# Input: 32x32 single-channel images -> (32, 32, 1) tensors.
# This applies 32 convolution filters of size 3x3 each.
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 1)))
model.add(Conv2D(32, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
    train = region_average_high(train)
    if type == "train":
        train = lat_lon(train, "asset/train_lat_lon.csv")
    else:
        train = lat_lon(train, "asset/test_lat_lon.csv")
    return train


def lat_lon(data, file):
    """Load latitude and longitude from asset/train_lat_lon.csv or
    asset/test_lat_lon.csv and merge them into the given data."""
    import pandas as pd
    train = pd.read_csv(file)
    train.drop(['key', 'tolerance_m'], axis=1, inplace=True)
    result = data.merge(train, on='id')
    return result


if __name__ == "__main__":
    data = get_data.load_data()
    train = data['train']
    train = lat_lon(train, "asset/train_lat_lon.csv")
    print(train.head())
def classify(names, set_sizes=(100, 10, 10), learning_rate=1e-3, epsilon=4e-5,
             max_iter=1e5, is_random=False):
    # Ensure that each name set is formatted as a list (allow for single entries).
    names = map(lambda x: [x] if isinstance(x, basestring) else x, names)

    # Initialize the different sets.
    total_entries = len(np.array(names).flatten())
    training = np.empty((total_entries * set_sizes[0], 1024), dtype='float64')
    validation = np.empty((total_entries * set_sizes[1], 1024), dtype='float64')
    testing = np.empty((total_entries * set_sizes[2], 1024), dtype='float64')

    # Load images into the three different sets.
    for i, name in enumerate(np.array(names).flatten()):
        images = load_data(name, sizes=set_sizes, is_random=is_random)
        training[i * set_sizes[0]:i * set_sizes[0] + set_sizes[0]] = \
            images[:set_sizes[0]]
        validation[i * set_sizes[1]:i * set_sizes[1] + set_sizes[1]] = \
            images[set_sizes[0]:-set_sizes[2]]
        testing[i * set_sizes[2]:i * set_sizes[2] + set_sizes[2]] = \
            images[-set_sizes[2]:]

    # Create the proper labels for the sets. Note that since we do batch
    # updating, the order of the different actors does not matter, so all of
    # one actor is first, then all of another, etc.
    if len(names) == 2:
        training_labels = np.hstack((np.ones(len(names[0]) * set_sizes[0]),
                                     np.zeros(len(names[1]) * set_sizes[0])))
        validation_labels = np.hstack((np.ones(len(names[0]) * set_sizes[1]),
                                       np.zeros(len(names[1]) * set_sizes[1])))
        testing_labels = np.hstack((np.ones(len(names[0]) * set_sizes[2]),
                                    np.zeros(len(names[1]) * set_sizes[2])))
        parameters = np.ones(1025) * 1e-4
    else:
        training_labels = np.zeros((total_entries * set_sizes[0], len(names)))
        validation_labels = np.zeros((total_entries * set_sizes[1], len(names)))
        testing_labels = np.zeros((total_entries * set_sizes[2], len(names)))
        i = 0
        for j, name_set in enumerate(names):
            training_labels[i * set_sizes[0]:(i + len(name_set)) * set_sizes[0], j] = 1
            validation_labels[i * set_sizes[1]:(i + len(name_set)) * set_sizes[1], j] = 1
            testing_labels[i * set_sizes[2]:(i + len(name_set)) * set_sizes[2], j] = 1
            i += len(name_set)
        parameters = np.ones((1025, len(names))) * 1e-4

    # Perform gradient descent and evaluate the results.
    parameters = gradient_descent(training, training_labels, parameters,
                                  learning_rate, epsilon, max_iter)
    cost = [
        cost_function(training, training_labels, parameters),
        cost_function(validation, validation_labels, parameters),
        cost_function(testing, testing_labels, parameters)
    ]
    accuracy = [
        labelling_accuracy(training, training_labels, parameters),
        labelling_accuracy(validation, validation_labels, parameters),
        labelling_accuracy(testing, testing_labels, parameters)
    ]
    return parameters, cost, accuracy
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import plot_model
import matplotlib.pyplot as plt
import numpy as np
import get_data

num_class = 2     # number of classes
epochs = 60       # number of training epochs
batch_size = 128  # batch size

# Load the dataset and split it into train and test sets.
(x_train, y_train), (x_test, y_test) = get_data.load_data(
    file_path='./data/train_data.csv', y_name='6')

# Convert the DataFrame objects into plain arrays.
x_train = x_train.values
y_train = y_train.values
x_test = x_test.values
y_test = y_test.values
print(x_train.shape)
y_train = keras.utils.to_categorical(y_train, 2)
y_test = keras.utils.to_categorical(y_test, 2)

# Define the layer structure of the neural network model.
model = Sequential()
model.add(Dense(5, input_shape=(5,), activation='relu'))
# model.add(Dropout(0.5))
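# The snippet ends mid-model. A hedged sketch of how such a model is typically
# finished, assuming a softmax head over `num_class` classes; the original output
# layer, optimizer, and training call are not shown, so all of this is an assumption:
model.add(Dense(num_class, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
          validation_data=(x_test, y_test))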
import get_data
import network

train_data, validation_data, test_data = get_data.load_data('mnist.pkl')
clf = network.Network(train_data[0].shape[1], 20, 10, 1, 'mini_batch')
clf.fit(train_data[0], train_data[1])
accurate_rate = clf.predict(test_data[0], test_data[1])
import numpy as np
import argparse

from get_data import load_data
from structures import C45Tree
from pruning import entropy_with_numnodes, error_with_numnodes, pep

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--prune', required=True, type=str)
    parser.add_argument('--alpha', default=1.0, type=float)
    parser.add_argument('--log', default=None, type=str)
    args = parser.parse_args()
    args.prune = args.prune.lower()
    assert args.prune in ('entropy', 'error', 'pep')

    train_feats, train_labels, test_feats, test_labels, attr_dim = load_data()
    if args.prune == 'entropy':
        prune_func = entropy_with_numnodes(args.alpha)
    elif args.prune == 'error':
        prune_func = error_with_numnodes(args.alpha)
    else:
        prune_func, prune_at_last = pep()

    tree = C45Tree(train_feats, train_labels, attr_dim, prune_func)
    if args.prune == 'pep':
        tree = prune_at_last(tree)
    acc = tree.get_acc(test_feats, test_labels)
    log_str = 'c45 tree with prune %s, acc: %s' % (args.prune, acc)
    if args.log is None:
        print(log_str)
def test_mlp(learning_rate: float = 0.01,
             l1_reg: float = 0.00,
             l2_reg: float = 0.0001,
             n_epochs: int = 1000,
             dataset: str = 'mnist.pkl.gz',
             batch_size: int = 20,
             n_hidden: int = 500):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron. This is demonstrated on MNIST.

    :param learning_rate: learning rate used (factor for the stochastic gradient)
    :param l1_reg: L1-norm's weight when added to the cost (see regularization)
    :param l2_reg: L2-norm's weight when added to the cost (see regularization)
    :param n_epochs: maximal number of epochs to run the optimizer
    :param dataset: the path of the MNIST dataset file from
        http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
    :param batch_size: the size of a mini-batch
    :param n_hidden: the number of hidden units
    """
    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # Compute the number of mini-batches for training, validation and testing.
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # Allocate symbolic variables for the data.
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.ivector('y')   # the labels are presented as a 1D vector of [int] labels

    rng = RandomState(1234)

    # Construct the MLP class.
    classifier = MultiLayerPerceptron(rng=rng, input=x, n_in=28 * 28,
                                      n_hidden=n_hidden, n_out=10)

    # The cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); the cost is
    # expressed here symbolically.
    cost = (classifier.negative_log_likelihood(y)
            + l1_reg * classifier.L1
            + l2_reg * classifier.L2_sqr)

    # Compile a Theano function that computes the mistakes that are made by
    # the model on a mini-batch.
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # Compute the gradient of the cost with respect to theta (stored in
    # params); the resulting gradients are stored in the list g_params.
    g_params = [T.grad(cost, param) for param in classifier.params]

    # Specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs. Given two lists of the same length,
    # A = [a1, a2, a3, a4] and B = [b1, b2, b3, b4], zip generates a list C of
    # the same size, where each element is a pair formed from the two lists:
    # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    updates = [(param, param - learning_rate * g_param)
               for param, g_param in zip(classifier.params, g_params)]

    # Compile a Theano function `train_model` that returns the cost and, at
    # the same time, updates the parameters of the model based on the rules
    # defined in `updates`.
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    # Early-stopping parameters.
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience // 2)
    # go through this many mini-batches before checking the network on the
    # validation set; in this case we check every epoch

    best_validation_loss = inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False
    while (epoch < n_epochs) and (not done_looping):
        epoch += 1
        for mini_batch_index in range(n_train_batches):
            mini_batch_avg_cost = train_model(mini_batch_index)
            # iteration number
            iteration = (epoch - 1) * n_train_batches + mini_batch_index
            if (iteration + 1) % validation_frequency == 0:
                # Compute zero-one loss on the validation set.
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                this_validation_loss = mean(validation_losses)
                print('epoch %i, mini-batch %i/%i, validation error %f %%' %
                      (epoch, mini_batch_index + 1, n_train_batches,
                       this_validation_loss * 100.))
                # If we got the best validation score so far...
                if this_validation_loss < best_validation_loss:
                    # ...improve patience if the loss improvement is good enough.
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        patience = max(patience, iteration * patience_increase)
                    best_validation_loss = this_validation_loss
                    best_iter = iteration
                    # Test it on the test set.
                    test_losses = [
                        test_model(i) for i in range(n_test_batches)
                    ]
                    test_score = mean(test_losses)
                    print(('     epoch %i, mini-batch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, mini_batch_index + 1, n_train_batches,
                           test_score * 100.))
            if patience <= iteration:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print(('The code for file %s ran for %.2fm' %
           (os.path.split(__file__)[1], (end_time - start_time) / 60.)),
          file=sys.stderr)