def start_app(self):
    data_origin = str(input('#----------------- > Hello, first, where is your file? (type 1 for Web, 2 for Local): '))
    print('Given origin (1 Web, 2 Local): {}'.format(data_origin))
    # Uses the web to find a file and return the data.
    if data_origin[0] == '1':
        data = Connect_web().start()
        if data == False:
            return False
        else:
            self.p_class = process_data(data)
            if self.p_class.group_month():
                return True
            else:
                return False
    # Uses a local file address to find a file and return the data.
    elif data_origin[0] == '2':
        data = Open_local().start()
        if data == False:
            return False
        else:
            self.p_class = process_data(data)
            if self.p_class.group_month():
                return True
            else:
                return False
    else:
        return True
def main(cmd_args):
    if len(cmd_args) < 3:
        print "Usage: processing path/to/nodes/file path/to/edge/file path/to/output/file [deletion_names...]"
    elif not os.path.isfile(cmd_args[0]):
        print "Node file is not a file"
    elif not os.path.isfile(cmd_args[1]):
        print "Edge file is not a file"
    else:
        process_data(*cmd_args)
    sys.exit(0)
def get_data(rinv, N):
    df = h5py.File(path.parent / "data" / "jet_images" / f"LL-{rinv}.h5", "r")
    y = df["targets"][:N]
    X = df["features"][:N]
    X = process_data(X)
    return X, y
def main():
    batch_gen = process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW)
    centers, targets = next(batch_gen)
    word2vec(batch_gen)
from Plotting.plot_all_third_parameters import plot_all_sensitivities_per_alg
from Plotting.plot_learning_curve import plot_learning_curve
from Plotting.plot_learning_for_two_lambdas import plot_learning_curve_for_lambdas
from Plotting.plot_sensitivity import plot_sensitivity_curve
from Plotting.plot_waterfall import plot_waterfall_scatter
from process_data import process_data

process_data()
plot_learning_curve()
plot_sensitivity_curve()
# plot_waterfall_scatter()
# plot_all_sensitivities_per_alg()
# plot_learning_curve_for_lambdas()
def data():
    success = process_data()
    return {"success": success}
WINDOW = 30
STEP = 1
FORECAST = 1
ROLLING = 30
EMB_SIZE = 7

random.seed(42)

# Load stock data
eod_data = load_stock_data()

# Loop through all stocks returned
for key, data in eod_data.items():
    # Process data
    processed_data = process_data(data)

    X, Y = [], []
    for idx in range(0, len(processed_data) - WINDOW - FORECAST, STEP):
        # Get data from window
        hl = remap(np.array(processed_data['H-L'][idx:idx + WINDOW]), -1, 1)
        co = remap(np.array(processed_data['C-O'][idx:idx + WINDOW]), -1, 1)
        sma_3 = remap(np.array(processed_data['3day SMA'][idx:idx + WINDOW]), -1, 1)
        sma_10 = remap(np.array(processed_data['10day SMA'][idx:idx + WINDOW]), -1, 1)
        sma_30 = remap(np.array(processed_data['30day SMA'][idx:idx + WINDOW]), -1, 1)
        std_dev = remap(np.array(processed_data['Std_dev'][idx:idx + WINDOW]), -1, 1)
        rsi = remap(np.array(processed_data['RSI'][idx:idx + WINDOW]), -1, 1)

        # Stack in array
        x_i = np.column_stack((hl, co, sma_3, sma_10, sma_30, std_dev, rsi))
    reader = csv.DictReader(csvfile)
    for row in reader:
        train_id = ast.literal_eval(row['train'])
        test_id = ast.literal_eval(row['test'])
        val_id = ast.literal_eval(row['val'])
    return train_id, test_id, val_id


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()
    # training
    sents, W, word_index, vocab, labels, max_l, U, user_idx = process_data(
        args.input, False, args.vectors, args.user_vectors,
        args.tagField, args.textField, args.userField, idField=args.idField)
    # set_trace()
    model = args.model
    # if args.static:
    #     print "model architecture: CNN-static"
    #     non_static = False
    # else:
    #     print "model architecture: CNN-non-static"
    #     non_static = True
    non_static = True
    if args.vectors:
        print "using: word2vec vectors"
    else:
        print "using: random vectors"
    classes = set(x["y"] for x in sents)
#!/usr/bin/python
from __future__ import print_function
from argparse import ArgumentParser

from get_data import get_data
from setup_data import setup_data
from process_data import process_data
from run_walsh_alg import run

if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument('-s', '--settings', help='Settings file')
    parser.add_argument('-r', '--remove', help='Remove', action='store_true')
    args = parser.parse_args()

    if not args.settings:
        import settings.default as settings
    else:
        raise Exception('not impl')

    if settings.GET_DATA:
        get_data(settings)
    if settings.SETUP_DATA:
        setup_data(settings)
    if settings.PROCESS_DATA:
        process_data(settings)
    if settings.RUN_WALSH:
        run()
def semisupervised_selection(data_dir, dest_dir, initial_pos_filename, initial_neg_filename,
                             initial_pool_filename, w2v_file, word_vectors="-rand",
                             src_lan='en', trg_lan='de', non_static=True, n_iter=10,
                             max_l=50, k=300, test_batch=7000, instances_to_add=50000,
                             debug=False):
    """
    Performs a semisupervised text selection over a pool of sentences based on initial
    positive/negative files. The steps it takes are:
        1. Classify the pool according to the positive/negative samples through a CNN.
        2. Take the most positive and most negative sentences from the pool and include
           them in the positive/negative training samples.
        3. With these extended positive/negative sets, train another CNN and go back to 1.
    :param data_dir: Directory where the data files are
    :param initial_pos_filename: Initial "in-domain" corpus
    :param initial_neg_filename: Initial "out-of-domain" corpus
    :param initial_pool_filename: Pool of sentences where to perform the selection
    :param w2v_file: Word2vec file (for the CNN input)
    :param word_vectors: Whether to use word vectors from word2vec or random word vectors
    :param non_static: Non-static CNNs
    :param n_iter: Number of iterations carried out by the process
    :param test_batch: Classify the pool with this batch size
    :param instances_to_add: Number of instances to add at each iteration
    :return:
    """
    pos_filename_src = data_dir + '/' + initial_pos_filename + '.' + src_lan
    in_domain_file = open(pos_filename_src, 'r')
    in_domain = in_domain_file.readlines()
    in_domain_file.close()
    pos_filename_trg = data_dir + '/' + initial_pos_filename + '.' + trg_lan
    neg_filename_src = data_dir + '/' + initial_neg_filename + '.' + src_lan
    pool_filename_src = data_dir + '/' + initial_pool_filename + '.' + src_lan
    pool_filename_trg = data_dir + '/' + initial_pool_filename + '.' + trg_lan

    for i in range(n_iter):
        print "------------------ Starting iteration", i, "------------------"
        new_pos_filename_src = dest_dir + '/' + initial_pos_filename + '_' + str(i) + '.' + src_lan
        new_pos_filename_trg = dest_dir + '/' + initial_pos_filename + '_' + str(i) + '.' + trg_lan
        new_pos_filename_src_tmp = dest_dir + '/' + initial_pos_filename + 'tmp' + '.' + src_lan
        if debug:
            new_neg_filename_src_tmp = dest_dir + '/' + initial_neg_filename + 'tmp' + '_' + str(i) + '.' + src_lan
        new_neg_filename_src = dest_dir + '/' + initial_neg_filename + '_' + str(i) + '.' + src_lan
        new_pool_filename_src = dest_dir + '/' + initial_pool_filename + '_' + str(i) + '.' + src_lan
        new_pool_filename_trg = dest_dir + '/' + initial_pool_filename + '_' + str(i) + '.' + trg_lan
        if i > 0:
            copyfile(pos_filename_src, new_pos_filename_src_tmp)
        copyfile(pos_filename_src, new_pos_filename_src)
        copyfile(pos_filename_trg, new_pos_filename_trg)
        with open(new_pos_filename_src_tmp, "a") as f:
            for line in in_domain:
                f.write(line)
        copyfile(neg_filename_src, new_neg_filename_src)
        copyfile(pool_filename_src, new_pool_filename_src)
        copyfile(pool_filename_trg, new_pool_filename_trg)

        x = process_data(w2v_file, new_pos_filename_src_tmp, new_neg_filename_src,
                         new_pool_filename_src, k=k)
        revs, W, W2, word_idx_map, vocab = x[0], x[1], x[2], x[3], x[4]
        if word_vectors == "-rand":
            print "using: random vectors"
            U = W2
        elif word_vectors == "-word2vec":
            print "using: word2vec vectors"
            U = W
        else:
            raise NotImplementedError, "Choose between -rand or -word2vec options"
        results = []
        datasets = make_idx_data_holdout(revs, word_idx_map, max_l=max_l, k=k, filter_h=5)
        perf, predictions, prediction_probs = train_conv_net(
            datasets, U, img_w=k, lr_decay=0.95, filter_hs=[3, 4, 5],
            conv_non_linear="relu", hidden_units=[200, 100, 2], shuffle_batch=True,
            n_epochs=14, sqr_norm_lim=9, non_static=non_static, batch_size=128,
            dropout_rate=[0.5], test_batch=test_batch,
            savename="predictions_" + str(i), savetofile=False)
        positive_lines_src, positive_lines_trg, negative_lines, neutral_lines_src, neutral_lines_trg = \
            process_prediction_probs(prediction_probs, instances_to_add,
                                     pool_filename_src, pool_filename_trg)
        print "Adding", len(positive_lines_src), "positive lines"
        print "Positive sample:", positive_lines_src[0], "---", positive_lines_trg[0]
        print "Adding", len(negative_lines), "negative lines"
        print "Negative sample:", negative_lines[0]
        print "Adding", len(neutral_lines_src), "neutral lines"
        print "Neutral sample:", neutral_lines_src[0], "---", neutral_lines_trg[0]

        new_pos_file_src = open(new_pos_filename_src, 'a')
        new_pos_file_trg = open(new_pos_filename_trg, 'a')
        new_neg_file = open(new_neg_filename_src, 'a')
        if debug:
            new_neg_file_tmp = open(new_neg_filename_src_tmp, 'a')
        new_pool_file_src = open(new_pool_filename_src, 'w')
        new_pool_file_trg = open(new_pool_filename_trg, 'w')
        for line in positive_lines_src:
            new_pos_file_src.write(line)
        for line in positive_lines_trg:
            new_pos_file_trg.write(line)
        for line in negative_lines:
            new_neg_file.write(line)
            if debug:
                new_neg_file_tmp.write(line)
        for line in neutral_lines_src:
            new_pool_file_src.write(line)
        for line in neutral_lines_trg:
            new_pool_file_trg.write(line)
        new_pos_file_src.close()
        new_pos_file_trg.close()
        new_neg_file.close()
        new_pool_file_src.close()
        new_pool_file_trg.close()
        if debug:
            new_neg_file_tmp.close()

        pos_filename_src = new_pos_filename_src
        pos_filename_trg = new_pos_filename_trg
        neg_filename_src = new_neg_filename_src
        pool_filename_src = new_pool_filename_src
        pool_filename_trg = new_pool_filename_trg
        print "perf: " + str(perf)
        results.append(perf)
        print str(np.mean(results))
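# A minimal, hypothetical driver for semisupervised_selection above -- a sketch only.
# The directory layout ('data/', 'selection/'), the corpus base names ('europarl',
# 'news', 'pool') and the word2vec path are illustrative assumptions, not part of the
# original code; only keyword arguments from the function signature above are used.
if __name__ == "__main__":
    semisupervised_selection(data_dir='data',                   # assumed input directory
                             dest_dir='selection',              # assumed output directory
                             initial_pos_filename='europarl',   # hypothetical in-domain corpus
                             initial_neg_filename='news',       # hypothetical out-of-domain corpus
                             initial_pool_filename='pool',      # hypothetical selection pool
                             w2v_file='data/word2vec.bin',      # assumed word2vec binary
                             word_vectors='-word2vec',
                             src_lan='en', trg_lan='de',
                             n_iter=3,                          # fewer iterations for a quick run
                             instances_to_add=10000)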
def _import_data(self):
    return process_data(self.VOCAB_SIZE, self.BATCH_SIZE, self.SKIP_WINDOW)
if __name__ == '__main__':
    global_start_time = time.time()
    seq_len = 10      # length of the training sequences
    split_rate = 0.1  # ratio used to split the data into training and test sets
    # df = pd.read_csv(r'E:\data\data\test_env_12_1m_deal\data_proc1_sort_slot1.txt')
    # data = list(df['number'].values)
    df1 = pd.read_csv(r'E:\data\data\bsg_nova_1030_sort.csv')
    # df1 = df1.sort_values(by='time')
    df1 = df1.head(3000000)  # my machine hits a memory error, so only the leading rows are used
    data = list(df1['event'].values)
    # Format the data and split it into training and test sets
    X_train, y_train, X_test, y_test, row = process_data.process_data(
        data, seq_len, split_rate)
    y_train = np_utils.to_categorical(y_train)
    params = {
        'lstm_output_dim': 50,
        'activation_lstm': 'relu',
        'activation_dense': 'relu',
        'activation_last': 'softmax',
        'dense_layer': 1,
        'lstm_layer': 2,
        'nb_epoch': 1
    }
    obj_lstm = lstm_model.RNN_network(**params)
    obj_lstm.model(X_train,
def run():
    sentences, pos, tag, enc_pos, enc_tag = process_data(DF_PATH)

    meta_data = {
        'enc_pos': enc_pos,
        'enc_tag': enc_tag
    }
    joblib.dump(meta_data, META_PATH)

    num_pos = len(list(enc_pos.classes_))
    num_tag = len(list(enc_tag.classes_))

    (
        train_sentences,
        valid_sentences,
        train_pos,
        valid_pos,
        train_tag,
        valid_tag,
    ) = model_selection.train_test_split(sentences, pos, tag, random_state=2020, test_size=0.1)

    tokenizer = transformers.BertTokenizer.from_pretrained(TOKENIZER_PATH, do_lower_case=True)

    train_dataset = EntityDataset(
        words=train_sentences, pos=train_pos, tags=train_tag, tokenizer=tokenizer, max_len=MAX_LEN
    )
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=TRAIN_BATCH_SIZE, num_workers=4
    )

    valid_dataset = EntityDataset(
        words=valid_sentences, pos=valid_pos, tags=valid_tag, tokenizer=tokenizer, max_len=MAX_LEN
    )
    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=VALIDATION_BATCH_SIZE, num_workers=4
    )

    model = MODEL_DISPATCHER[BASE_MODEL](bert_path=BERT_PATH, num_tag=num_tag, num_pos=num_pos)
    model.to(DEVICE)

    # parameters_optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    parameters_optimizer = [
        {
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.001,
        },
        {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
        }
    ]
    optimizer = AdamW(parameters_optimizer, lr=LR)

    num_training_steps = int(len(train_dataset) / TRAIN_BATCH_SIZE * EPOCHS)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
    )

    best_loss = np.inf
    for epoch in range(EPOCHS):
        train_loss = train_loop_fn(train_dataloader, model, optimizer, DEVICE, scheduler)
        valid_loss = eval_loop_fn(valid_dataloader, model, DEVICE)
        print(f'Train_loss = {train_loss}, Valid_loss = {valid_loss}')
        if valid_loss < best_loss:
            torch.save(model.state_dict(), MODEL_PATH)
            best_loss = valid_loss
import pandas as pd
import json
import os

from process_data import process_data
from daily_readings import daily_readings
from readings_by_title import readings_by_title

if __name__ == '__main__':
    df = pd.read_csv('../source_data/daily-log.csv')
    df['date'] = pd.to_datetime(df['date'])

    # Process raw readings into daily tracking by titles
    readings = process_data(df)

    # Filter logs for 2018 readings
    CY18 = df[df['date'] <= pd.to_datetime('2018-12-31')]

    # Combine titles and create daily readings frame
    CY18_daily_readings = daily_readings(readings, min(CY18['date']), max(CY18['date']))

    # List of dictionaries containing daily readings by title
    by_title = readings_by_title(readings)
    class2_vectors = [(j[0], j[1]) for j in class2]
    class1_X = [i[0] for i in class1_vectors]
    class1_Y = [i[1] for i in class1_vectors]
    class2_X = [j[0] for j in class2_vectors]
    class2_Y = [j[1] for j in class2_vectors]
    sigmas = [get_sigma(class1_X, class1_Y, 0, mu1),
              get_sigma(class2_X, class2_Y, 1, mu2)]
    s1 = sigmas[0]
    s2 = sigmas[1]
    pi_vector = [pi1, pi2]
    mu_vector = [mu1, mu2]
    return (pi_vector, mu_vector, sigmas)


pre_data = process_data.get_data()
data = process_data.process_data(pre_data)
features = data[0]
labels = data[1]
train_features = np.array(features[:80])
train_labels = np.array(labels[:80]).reshape(80, 1)
test_features = np.array(features[80:])
test_labels = np.array(labels[80:]).reshape(20, 1)
X = train_features
y = train_labels
X_ = test_features
y_ = test_labels
q_fit = QDA(X, y)
sigmas = q_fit[2]
qda_correct = 0
for i in range(len(X_)):
def main():
    date = time.localtime()
    log_name = './data/logs/log_'
    log_name += str(date.tm_mon) + '_'
    log_name += str(date.tm_mday) + '_'
    log_name += str(date.tm_hour) + '_'
    log_name += str(date.tm_min) + '_'
    log_name += str(date.tm_sec) + '.txt'
    log_file = open(log_name, 'w')

    current_message = open('./data/starting_message.txt', 'r').read()
    log_file.write(current_message)
    print(current_message)

    beginnings_file = open('./data/starting_sentences.txt', 'r')
    beginnings = [
        word_tokenize(beginning) for beginning in beginnings_file.readlines()
    ]

    if DEBUG:
        print('RUNNING IN DEBUGGING MODE!')
        for look_back in [4, 8, 20]:
            current_message = '-' * 60 + '\n'
            current_message += 'Preparing data for look_back of %d' % look_back + '\n'
            log_file.write(current_message)
            print(current_message)
            start_time = time.time()
            data_train, data_val, data_test, emb_matrix, w2t, t2w, emb_model = process_data(
                log_file=log_file, look_back=look_back, debug=DEBUG)
            current_message = "Data took %.2f seconds to prepare." % (time.time() - start_time) + '\n'
            current_message += '\n' + '-' * 60 + '\n'
            log_file.write(current_message)
            print(current_message)

            # LSTM Euclid loss
            for nb_layers in [1, 2, 4]:
                start_time = time.time()
                tf.reset_default_graph()
                model = deep_LSTM_euclid.LSTMmodel(emb_matrix=emb_matrix, look_back=look_back,
                                                   nb_layers=nb_layers)
                model.build_graph()
                model.train(data_train, data_val, nb_train_steps=1,
                            folder_to_save='results/LSTM_euclid_layers_' + str(nb_layers) +
                            '_look_back_' + str(look_back))
                current_message = "Model Euclid with %d layers took %.2f for building and training." % (
                    nb_layers, time.time() - start_time)
                current_message += '\n' + '-' * 40 + '\n'
                log_file.write(current_message)
                print(current_message)
                for beginning in beginnings:
                    model.create_story(emb_model=emb_model, w2t=w2t, t2w=t2w, beginning=beginning)

            # LSTM Cross Entropy loss
            for nb_layers in [1, 2, 4]:
                tf.reset_default_graph()
                model = deep_LSTM_cross_entropy.LSTMmodel(emb_matrix=emb_matrix, look_back=look_back,
                                                          nb_layers=nb_layers)
                model.build_graph()
                model.train(data_train, data_val, nb_train_steps=1,
                            folder_to_save='results/LSTM_cross_entropy_layers_' + str(nb_layers) +
                            ' look_back_' + str(look_back))
                current_message = "Model Entropy with %d layers took %.2f for building and training." % (
                    nb_layers, time.time() - start_time)
                current_message += '\n' + '-' * 40 + '\n'
                log_file.write(current_message)
                print(current_message)
                for beginning in beginnings:
                    model.create_story(w2t=w2t, t2w=t2w, beginning=beginning)

            # LSTM NCE loss
            for nb_layers in [1, 2, 4]:
                tf.reset_default_graph()
                model = deep_LSTM_nce.LSTMmodel(emb_matrix=emb_matrix, look_back=look_back,
                                                nb_layers=nb_layers)
                model.build_graph()
                model.train(data_train, data_val, nb_train_steps=1,
                            folder_to_save='results/LSTM_nce_layers_' + str(nb_layers) +
                            ' look_back_' + str(look_back))
                current_message = "Model NCE with %d layers took %.2f for building and training." % (
                    nb_layers, time.time() - start_time)
                current_message += '\n' + '-' * 40 + '\n'
                log_file.write(current_message)
                print(current_message)
                for beginning in beginnings:
                    model.create_story(w2t=w2t, t2w=t2w, beginning=beginning)
    else:
        for look_back in [4, 8, 20]:
            current_message = '-' * 60 + '\n'
            current_message += 'Preparing data for look_back of %d' % look_back
            log_file.write(current_message)
            print(current_message)
            start_time = time.time()
            data_train, data_val, data_test, emb_matrix, w2t, t2w = process_data(
                look_back=look_back, debug=DEBUG)
            current_message = "Data took %.2f seconds to prepare." % (time.time() - start_time) + '\n'
            current_message += '\n' + '-' * 60 + '\n'
            log_file.write(current_message)
            print(current_message)

            # LSTM Euclid loss
            for nb_layers in [1, 2, 4]:
                start_time = time.time()
                tf.reset_default_graph()
                model = deep_LSTM_euclid.LSTMmodel(emb_matrix=emb_matrix, look_back=look_back,
                                                   nb_layers=nb_layers, log_file=log_file)
                model.build_graph()
                model.train(data_train, data_val, nb_train_steps=5,
                            folder_to_save='results/LSTM_euclid_layers_' + str(nb_layers) +
                            '_look_back_' + str(look_back))
                current_message = "Model Euclid with %d layers took %.2f for building and training." % (
                    nb_layers, time.time() - start_time)
                current_message += '\n' + '-' * 40 + '\n'
                log_file.write(current_message)
                print(current_message)
                for beginning in beginnings:
                    model.create_story(emb_model=emb_model, w2t=w2t, t2w=t2w, beginning=beginning)

            # LSTM Cross Entropy loss
            for nb_layers in [1, 2, 4]:
                tf.reset_default_graph()
                model = deep_LSTM_cross_entropy.LSTMmodel(emb_matrix=emb_matrix, look_back=look_back,
                                                          nb_layers=nb_layers)
                model.build_graph()
                model.train(data_train, data_val, nb_train_steps=5,
                            folder_to_save='results/LSTM_cross_entropy_layers_' + str(nb_layers) +
                            ' look_back_' + str(look_back))
                current_message = "Model Entropy with %d layers took %.2f for building and training." % (
                    nb_layers, time.time() - start_time)
                current_message += '\n' + '-' * 40 + '\n'
                log_file.write(current_message)
                print(current_message)
                for beginning in beginnings:
                    model.create_story(w2t=w2t, t2w=t2w, beginning=beginning)

            # LSTM NCE loss
            for nb_layers in [1, 2, 4]:
                tf.reset_default_graph()
                model = deep_LSTM_nce.LSTMmodel(emb_matrix=emb_matrix, look_back=look_back,
                                                nb_layers=nb_layers)
                model.build_graph()
                model.train(data_train, data_val, nb_train_steps=5,
                            folder_to_save='results/LSTM_nce_layers_' + str(nb_layers) +
                            ' look_back_' + str(look_back))
                current_message = "Model NCE with %d layers took %.2f for building and training." % (
                    nb_layers, time.time() - start_time)
                current_message += '\n' + '-' * 40 + '\n'
                log_file.write(current_message)
                print(current_message)
                for beginning in beginnings:
                    model.create_story(w2t=w2t, t2w=t2w, beginning=beginning)
def main():
    model = SkipGramModel(VOCAB_SIZE, EMBED_SIZE, BATCH_SIZE, NUM_SAMPLED, LEARNING_RATE)
    model.build_graph()
    batch_gen = process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW)
    train_model(model, batch_gen, NUM_TRAIN_STEPS, WEIGHTS_FLD)
    result = pd.Series(window_steps)
    return result


# Test the function
if __name__ == "__main__":
    from process_data import process_data

    ped_data_file = r"C:\Users\dwubu\Documents\mhc\pedometer_walk_dir\ee621e22-c7c2-45dc-b22d-3a9c59fe6e78\2648577.0"
    walk_data_file = r"C:\Users\dwubu\Desktop\accel_walk_dir\ee621e22-c7c2-45dc-b22d-3a9c59fe6e78\2648617.0"

    ped_data = pd.read_json(ped_data_file)
    walk_data = pd.read_json(walk_data_file)
    walk_data_windows = process_data(500, 99, walk_data_file)

    def compare_ped_predictions(df, idx):
        '''
        A plotting function for comparing the pedometer predictions for a window
        and the accelerometry of the window
        '''
        import matplotlib.pyplot as plt
        plt.figure()
        plt.plot(df['xwindows'].iloc[idx], label='x')
        plt.plot(df['ywindows'].iloc[idx], label='y')
        plt.plot(df['zwindows'].iloc[idx], label='z')
        plt.legend(loc='upper left')
        plt.title('Prediction : {}'.format(df['steps'].iloc[idx]))

    compare_ped_predictions(walk_data_windows, 3)
import pandas as pd
import numpy as np
from sklearn import preprocessing
import xgboost as xgb

from process_data import process_data

df, df_features = process_data()

le = preprocessing.LabelEncoder()
target = le.fit_transform(df[df['train']]['OutcomeType'].values)
features = df_features[df['train']].values

dtrain = xgb.DMatrix(features, label=target)
params = {'objective': 'multi:softprob', 'eta': 0.75, 'bst:max_depth': 2,
          'num_class': len(np.unique(target))}
grad = xgb.train(params, dtrain)

dtest = xgb.DMatrix(df_features[df['train'] == False].values)
result = pd.DataFrame(grad.predict(dtest), columns=le.classes_)
result.index += 1
result.to_csv('xg_boost_out_params1.csv', index_label='ID')
        'hidden_layer_sizes': [(500,), (1000,)],
        'max_iter': [400]
    }
}

##############################################
#               Training data                #
##############################################

## Loading and processing data
# Merge to process
data_train = pd.read_csv('../data/train.csv', header=0)
data_test = pd.read_csv('../data/test.csv', header=0)
data_merge = pd.concat([data_train, data_test], keys=['train', 'test'])
dataset = process_data(data_merge)

# Extracting train set
X = dataset.loc['train'].drop(columns=['PassengerId', 'Survived'])
y = dataset.loc['train'].Survived

# Extracting test set
id = dataset.loc['test'].PassengerId
X_test = dataset.loc['test'].drop(columns=['PassengerId', 'Survived'])

# # Dividing in training and cross validation set
# X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=0.3)

# Table of estimators
helper = EstimatorSelectionHelper(models, params)
helper.fit(X, y)
import torch
import torch.optim as optim
# %matplotlib inline

# getting the data
from load_data import load_data
train_loader, batch_size, num_workers = load_data()

# visualize the data
from load_data import visualize_data
images, labels = visualize_data(train_loader)

# pre-processing the data
from process_data import process_data
scaled_img = process_data(images)

# define the model
from model import Discriminator, Generator

# define hyperparameters
conv_dim = 32
z_size = 100

# define discriminator and generator
D = Discriminator(conv_dim)
G = Generator(z_size=z_size, conv_dim=conv_dim)

print(D)
print()
print(G)
def main():
    batch_gen = process_data(vocabulary_size, batch_size, skip_window)
    word2vec(batch_gen)
        centers, targets = next(batch_generator)
        batch = [centers, targets]
        # Update one step
        loss_batch, summary = model.step(batch, sess)
        # Summary this step
        writer.add_summary(summary, train_step)
        # Update loss
        total_loss += loss_batch
        # Print out the loss every few steps
        if (train_step + 1) % FLAGS.skip_every == 0:
            print("Average loss at step {}: {:5.1f}".format(
                train_step + 1, total_loss / FLAGS.skip_every))
            total_loss = 0.0
    # Save the session
    saver.save(sess, "checkpoints/step{}".format(initial_step + FLAGS.num_train_steps),
               global_step=initial_step + FLAGS.num_train_steps)


if __name__ == '__main__':
    FLAGS = train_flags()
    model = SkipGramModel(FLAGS)
    batch_generator = process_data(FLAGS.vocab_size, FLAGS.batch_size, FLAGS.skip_window)
    with tf.Session() as sess:
        train(sess, model, batch_generator, FLAGS.num_train_steps)
    sess.close()
def helper(proc_info):
    all_proc, proc_num = proc_info
    cities = []
    #cities.append("Bitola")
    #cities.append("Skopje-Petrovec")
    #cities.append("New York")
    ##cities.append("Anchorage")
    ##cities.append("Sidney")
    cities.append("Buffalo")
    ##cities.append("Nairobi")
    ##cities.append("Singapore")
    #cities.append("Seattle")
    countries = []
    ##countries.append("GR")
    countries.append("NO")
    ##countries.append("CA")
    ar = 1
    if ar == 0:
        print("Removing images, Please wait .. . . . . .")
        for cc in countries:
            cities, country = weather_codes.get_cities(cc)
            for city in cities:
                rm_img(city[0], country)
    elif ar == 1:
        c_c_r = 0
        if c_c_r == 0:
            cities = weather_codes.get_cities(countries)
        elif c_c_r == 1:
            cities = weather_codes.get_cities(cities)
        elif c_c_r == 2:
            cities = weather_codes.get_cities(40)
        for c in cities:
            c[0] = c[0].replace('/', ' ')
            #print(c)
        #print(cities)
        get_data_multithread.fetch_data_multithread(cities, 1900, 2016)
        dt = datetime.now()
        start = (dt.minute*60 + dt.second)*1000000 + dt.microsecond
        paralel_time = 0
        main_time = 0
        qlen = 0
        q = queue.Queue()
        print("Number of cities: ", len(cities))
        for city in cities:
            print(city)
        #for city in cities:
        bot = int(len(cities)/all_proc*(proc_num-1))
        top = int(len(cities)/all_proc*(proc_num))
        for xxx in range(bot, top):
            city = cities[xxx]
            dt = datetime.now()
            ct = (dt.minute*60 + dt.second)*1000000 + dt.microsecond
            country = city[2]
            #print(city[0])
            table = []
            flag = True
            count_to_flag = 0
            year = 2015
            while flag:
                url = "http://www.wunderground.com/history/airport/" + city[1] + "/" + str(year) + "/1/1/CustomHistory.html?dayend=31&monthend=12&yearend=" + str(year) + "&req_city=&req_state=&req_statename=&reqdb.zip=&reqdb.magic=&reqdb.wmo=&format=1"
                temp = process_data.process_data(url, year, city[0], country)
                table.append(temp)
                #print(table[0][0])
                #print(len(temp))
                if len(temp) < 1:
                    count_to_flag += 1
                else:
                    count_to_flag = 0
                if count_to_flag > 20:
                    flag = False
                year -= 1
            #print("TABLE: ",len(table))
            ilo = [[None for j in range(0, 367)] for k in range(0, len(table)-21)]
            ihi = [[None for j in range(0, 367)] for k in range(0, len(table)-21)]
            ipr = [[None for j in range(0, 367)] for k in range(0, len(table)-21)]
            icc = [[None for j in range(0, 367)] for k in range(0, len(table)-21)]
            for l in range(0, len(table)):
                for m in range(0, len(table[l])):
                    #for n in range(0,len(table[l][m])):
                    dy = int(table[l][m][len(table[l][m])-1])
                    if len(table[l][m][5]) >= 1:
                        ilo[l][dy] = int(table[l][m][5])
                    if len(table[l][m][3]) >= 1:
                        ihi[l][dy] = int(table[l][m][3])
                    if len(table[l][m][21]) >= 1:
                        try:
                            pr = float(table[l][m][21])
                            pr = float(math.log10((pr+1))*76.4)
                            ipr[l][dy] = int(pr)
                        except:
                            excflg = True
                    if len(table[l][m][22]) >= 1:
                        icc[l][dy] = int(table[l][m][22])
            dt = datetime.now()
            ct = ((dt.minute*60 + dt.second)*1000000 + dt.microsecond - ct)/1000
            print("Main time: ", ct)
            main_time += ct
            ############################################
            ############################################
            qlen += 3
            ##
            t = threading.Thread(target=draw_env, args=(q, ihi, 1, -25, 55, "hi", city[0], country, 3))
            t.daemon = True
            t.start()
            ##
            t = threading.Thread(target=draw_env, args=(q, ilo, 1, -40, 40, "lo", city[0], country, 3))
            t.daemon = True
            t.start()
            ##
            min_val, max_val = minmax.min_max2(ipr, "pr")
            if min_val != max_val:
                t = threading.Thread(target=draw_env, args=(q, ipr, 1, min_val, max_val, "pr", city[0], country, 3))
                t.daemon = True
                t.start()
                qlen += 1
            ##
            t = threading.Thread(target=draw_env, args=(q, icc, 1, 0, 8, "cc", city[0], country, 4))
            t.daemon = True
            t.start()
            ##
            #print(qlen)
            ############################################
            ###########################################
        #print("::::::::::::::::::::::::::::::::::",qlen)
        for i in range(0, qlen):
            s = q.get()
            paralel_time += s
            #print(i,". ",s)
        ############################################
        ############################################
        dt = datetime.now()
        fin = float((dt.minute*60 + dt.second)*1000000 + dt.microsecond - start)/1000000
        print("Real Time: ", fin, " sec")
        print("Paralel Time: ", paralel_time/1000, " sec")
        print("Main Time: ", main_time/1000, " sec")
        ##
        ## for city in cities:
        ##     print(city)
        ##     for x in ("HIGH","LOW","PRECIPITATION","CLOUD COVER"):
        ##         file_in = "C:/Python34/Scripts/WeatherData/ALL_IMG/"+x+"/"+str(city[2])+"_"+str(city[0])+".bmp"
        ##         if os.path.isfile(file_in):
        ##             im = Image.open(file_in)
        ##             #print(im.size)
        ##             directory = "C:/Python34/Scripts/WeatherData/ALL_IMG/PNG/"+x+"/"
        ##             file_out = "C:/Python34/Scripts/WeatherData/ALL_IMG/PNG/"+x+"/"+str(city[2])+"_"+str(city[0])+".png"
        ##             #if not os.path.isfile(file_out):
        ##             if not os.path.exists(directory):
        ##                 os.makedirs(directory)
        ##             del_f = True
        ##             try:
        ##                 im.save(file_out, "png")
        ##                 #im.close()
        ##             except Exception as e:
        ##                 print(file_in, " failed to convert to png.", e)
        ##                 #print(file_in," failed to convert to png.")
        ##                 del_f = False
        ##             if del_f:
        ##                 try:
        ##                     os.remove(file_in)
        ##                 except:
        ##                     print(file_in, " failed to delete.")
        ##
        return fin
# set up connection to the database
# edit this when working on the server
db.set_up_connection(db.db, 'bence_test', create_tables=True)

# insert stations
stations_df = station_names.get_stations_dataframe()
db.insert_into_table(stations_df, 'Station')

# get daily measurement data
#userpath = os.path.dirname(os.path.realpath(__file__))
userpath = '/local/data_dwd/october2018'
#get_data.get_data(userpath, historical=True, recent=True,
#                  hourly=False, verbose=True)

# insert measurement data
with db.porm.db_session:
    print('inserting measurement data into the database...')
    for i, s_id in enumerate(stations_df.index):
        try:
            mes = process_data.process_data(userpath, s_id, 'daily')
        except BaseException as e:
            print('something went wrong processing station: {}'.format(s_id))
            print(e)
        else:
            if not mes.empty:
                db.insert_into_table(mes, 'DailyMeasurement', overwrite=True)
                print('{}: {}'.format(i, s_id))
            else:
                print('{}: {} was empty'.format(i, s_id))
predictlist = []
# remove stop words
for p in predictlist1:
    predictlist.append(p)

word2idx = dict((w, i) for i, w in enumerate(vocab))
print(word2idx)
print("predict_txt:"),
print(predict_text)
print(predictlist)
# x = [word2idx.get(w[0].lower(), 1) for w in predictlist]
# print("x:"), print(x)
# length = len(x)
# x = pad_sequences([x], maxlen)  # left padding
str, length = process_data.process_data(predictlist, vocab)
model1.load_weights('without_crf.h5')
raw = model1.predict(str)[0][-length:]
print("raw:"),
print(raw)
result = [np.argmax(row) for row in raw]
result_tags = [chunk_tags[i] for i in result]
# print(result_tags)
for s, t in zip(predictlist, result_tags):
    print("(" + s + "," + t + ")")
def main():
    batch_gen = process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW)
    print('Start Word2Vec')
    word2vec(batch_gen)
import os

# Instantiating our Streamlit dashboard
st.title('Robot Dance')
st.header('Simulador COVID-19')

# File upload interface
st.write("Faça o upload dos arquivos para realizar a simulação.")
day_14 = st.file_uploader("14DayWindow", type="csv")
reduced_mobility = st.file_uploader("reduced_mobility", type="csv")
# file3 = st.file_uploader("file3", type="csv")
# file4 = st.file_uploader("file4", type="csv")

if (day_14 is not None):
    # Simulation running
    if (st.button('Rodar simulação!')):
        with st.spinner('Processando dados'):
            process_data(day_14, reduced_mobility)
        st.success("Concluído")
        my_bar = st.progress(0)
        imgs = [i for i in os.listdir() if (".png" in i)]
        aux = []
        for idx, path in enumerate(imgs):
            aux.append(mpimg.imread(path))
            my_bar.progress((idx + 1) / len(imgs))
        st.image(aux, use_column_width=True)
def gen_stats(args):
    """
    Perform k-fold validation.

    :param args: args for gen_stats
    :type args: Namespace
    """
    # pull args out
    training_dir = args.training_dir
    train_file = args.train_file
    splits = args.splits
    epochs = args.epochs
    test_file = args.test_file
    hyp = args.hyp
    vocab = args.vocab

    # do some checks
    try:
        assert os.path.exists(training_dir), "training_dir must exist"
        assert os.path.exists(train_file), "eval_file must exist"
        if test_file is not None:
            assert os.path.exists(test_file), "test_file must exist"
        assert hyp >= 0 and hyp <= 1, "hyp must be between 0 and 1"
        assert epochs > 0, 'epochs must be positive'
        assert os.path.exists(vocab), "vocab file must exist"
        assert splits > 0, "splits must be positive"
    except AssertionError as err:
        logger.error("Failed check: {}".format(err))
        return

    # get the data together
    train_data_process, train_labels_process = process_data.process_data(train_file, vocab)
    data_split, data_lables = process_data.gen_splits(splits, train_data_process, train_labels_process)
    if test_file is not None:
        test_data_process, test_labels_process = process_data.process_data(test_file, vocab)
        test_data_split, test_data_lables = process_data.gen_splits(
            splits, test_data_process, test_labels_process)

    # delete all the data in this directory
    common.clean_dir_dir(training_dir)

    accuracy_per_session = []
    # run the tests
    for split in range(0, splits):
        # pull out the test data for this session
        eval_data = data_split[split]
        start = 0
        end = len(eval_data)
        # just so we aren't testing and validating on the same data
        test_data = eval_data[int(end / 2):end]
        eval_data = eval_data[start:int(end / 2)]

        # the rest is now training
        indicies = list(range(0, splits))
        # remove the test data
        del indicies[split]
        train_data = data_split[indicies[0]]
        train_labels = data_lables[indicies[0]]
        # remove the first one
        del indicies[0]
        for i in indicies:
            train_data = np.append(train_data, data_split[i], axis=0)
            train_labels = np.append(train_labels, data_lables[i], axis=0)
        # logger.debug(train_data)

        # make the checkpoint directory
        checkpoint_dir = common.grab_next_session(training_dir)

        # train the model
        logger.debug("TRAINING")
        history, model_summary = rnn.train_and_validate(
            train_data, train_labels, eval_data, eval_labels, epochs, checkpoint_dir)

        # get the result
        logger.debug("EVAL")
        metrics = rnn.eval(test_data, test_labels, checkpoint_dir, show_results=False)
        common.write_file(str(model_summary), checkpoint_dir + "/MODEL_SUMMARY")
        accuracy_per_session.append(metrics[1])
        logger.debug("accuracy so far: {}".format(accuracy_per_session))
        common.plot_graphs_val(history, 'categorical_accuracy', checkpoint_dir)
        common.plot_graphs_val(history, 'loss', checkpoint_dir)

    t, s, avg = process_data.get_stats(accuracy_per_session, hyp)
    data = "list of accs: {}".format(accuracy_per_session)
    data += "\n t-value:{}, std:{}, avg:{}".format(t, s, avg)
    data += "\n hyp 0: {}".format(hyp)
    common.write_file(data, training_dir + "/SESSION_INFO")
def worker(data, i):
    if len(np.where(data['segment_label'] == 2)[0]) < 1000:
        return None

    true_energy_fitted_voxels = []
    charge_fitted_voxels = []
    fitted_energy_fitted_voxels = []
    fitted_energy_recorded_voxels = []
    charge_recorded_voxels = []
    true_energy_recorded_voxels = []
    true_energy_all_voxels = []
    pi0_cos = []              # cos theta of gamma pair
    gamma_sep = np.array([])  # minimum backward separation of gammas
    chosen_particles = []     # selected particle indices

    d, true_shower_hits = process_data(data)
    energies = model((torch.Tensor(d[:, :4]).cuda(),
                      torch.Tensor(d[:, 4:-1]).cuda())).detach().cpu().numpy().flatten()
    em_primaries = data['em_primaries']
    input_true = data['input_true']
    group_label = data['group_label']

    # assemble group labels from primaries
    distances = distance_matrix(em_primaries[:, :3], d[:, :3])
    min_indices = np.argmin(distances, axis=1)
    primary_groups = true_shower_hits[min_indices]

    # determine shower directions
    gamma_dir, gamma_pca_data, gamma_pca_nhits = gamma_direction.do_calculation(
        d[:, :3], em_primaries, radius=16.0, eps=7.6, min_samples=5.8)
    if not len(gamma_dir) or not any(gamma_dir[:, 4]):
        print('gamma_dir failure')
        return None

    # pair up gamma candidates
    selected_showers, sep_matrix = gamma2_selection.do_iterative_selection(gamma_dir, maximum_sep=10.)
    if not len(selected_showers):
        print('gamma2 failure')
        return None
    gamma_sep = sep_matrix[np.triu_indices(len(selected_showers), k=1)]

    # calculate pi0 parameters for gamma pairs
    paired_gammas_mask = selected_showers[:, -1] != 0
    gamma_pairs = np.unique(selected_showers[paired_gammas_mask, -1])
    if not len(gamma_pairs) == 1:
        print('pairlen failure')
        return None
    vtx_data = np.empty((len(gamma_pairs), 5))
    particles = em_primaries[:, -1]

    # find shower hits
    # fitted_shower_hits = []
    # fitted_shower_primary_labels = []
    fitted_shower_hits = cone_clusterer.cluster(
        d[:, :3], em_primaries[:, :3],
        params=[50.0, 32.538603965969806, 4.920066409426372, 9.34588243103269],
        inclusive=True)
    # labels, hits = cone_clusterer.cluster(d[:, :3], em_primaries[:, :3], params=[50.0, 32.538603965969806, 4.920066409426372, 9.34588243103269])
    # print('EM primary labels', labels)
    # print('EM primary groups', primary_groups)
    # fitted_shower_hits.append(hits)
    # fitted_shower_primary_labels.append(labels)
    # if not len(fitted_shower_hits[-1]):
    #     print('cone failure')
    #     return None
    # fitted_shower_hits.append(spectral_clusterer.cluster(d[:, :3], em_primaries[:, :3], params=[46.37086851922889, -1.5574991699405842, 0.7537768189993856, 0.9695745937212652]))
    # fitted_shower_primary_labels.append(fitted_shower_hits[-1][min_indices])
    # if not len(fitted_shower_hits[-1]):
    #     print('spectral failure')
    #     return None

    for idx, label in enumerate(gamma_pairs):
        gamma_label_mask = selected_showers[:, -1] == label
        gamma_pair = gamma_dir[gamma_label_mask]
        gamma0_idx = int(np.argwhere(gamma_label_mask)[0])
        true_gamma0_hits = np.where(true_shower_hits == primary_groups[gamma0_idx])
        gamma1_idx = int(np.argwhere(gamma_label_mask)[1])
        true_gamma1_hits = np.where(true_shower_hits == primary_groups[gamma1_idx])
        print('gamma indices', gamma0_idx, gamma1_idx)
        if len(fitted_shower_hits[gamma0_idx]) == 0 or len(fitted_shower_hits[gamma1_idx]) == 0:
            continue
        cos_val = np.dot(gamma_pair[0, -3:], gamma_pair[1, -3:]) / np.linalg.norm(
            gamma_pair[0, -3:]) / np.linalg.norm(gamma_pair[1, -3:])
        if cos_val > 1:
            cos_val = 1.0
        pi0_cos += [cos_val]
        true_energy_fitted_voxels += [[
            np.sum(d[fitted_shower_hits[gamma0_idx], -1]),
            np.sum(d[fitted_shower_hits[gamma1_idx], -1])
        ]]
        charge_fitted_voxels += [[
            np.sum(d[:, 4][fitted_shower_hits[gamma0_idx]]),
            np.sum(d[:, 4][fitted_shower_hits[gamma1_idx]])
        ]]
        fitted_energy_fitted_voxels += [[
            np.sum(energies[fitted_shower_hits[gamma0_idx]]),
            np.sum(energies[fitted_shower_hits[gamma1_idx]])
        ]]
        fitted_energy_recorded_voxels += [[
            np.sum(energies[true_gamma0_hits]),
            np.sum(energies[true_gamma1_hits])
        ]]
        charge_recorded_voxels += [[
            np.sum(d[true_gamma0_hits, 4]),
            np.sum(d[true_gamma1_hits, 4])
        ]]
        true_energy_recorded_voxels += [[
            np.sum(d[true_gamma0_hits, -1]),
            np.sum(d[true_gamma1_hits, -1])
        ]]
        true_energy_all_voxels += [[
            np.sum(input_true[np.where(group_label[:, -1] == primary_groups[gamma0_idx]), -1]),
            np.sum(input_true[np.where(group_label[:, -1] == primary_groups[gamma1_idx]), -1])
        ]]
        chosen_particles.append([particles[gamma0_idx], particles[gamma1_idx]])

    if len(true_energy_fitted_voxels) == 0:
        return None

    all_energies = [
        true_energy_fitted_voxels, charge_fitted_voxels, fitted_energy_fitted_voxels,
        fitted_energy_recorded_voxels, charge_recorded_voxels, true_energy_recorded_voxels,
        true_energy_all_voxels
    ]
    return (all_energies, np.array(pi0_cos), np.array(chosen_particles).astype(int),
            gamma_sep, gamma_dir[paired_gammas_mask, -3:],
            gamma_pca_data[paired_gammas_mask, -1], gamma_pca_nhits[paired_gammas_mask, -1])
def main():
    batch_gen = process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW)
    word2vec(batch_gen)
SKIP_WINDOW = 5   # the context window
NUM_SAMPLED = 10  # number of negative examples to sample
LEARNING_RATE = 0.1
NPY_FILENAME = 'text17.npy'
MAX_NPY_WORDS = 188333610 - 100  # max words in textNN.npy
VOCAB_FILENAME = 'vocab_50k.tsv'
VOCAB_SEP = '|'
NUM_TRAIN_STEPS = MAX_NPY_WORDS * 5
SKIP_STEP = 10  # how many steps to skip before reporting the loss

from process_data import process_data

make_batch_gen = lambda: process_data(NPY_FILENAME, VOCAB_SIZE, BATCH_SIZE,
                                      SKIP_WINDOW, MAX_NPY_WORDS)


class SkipGramModel:
    """ Build the graph for word2vec model """

    def __init__(self, vocab_size, embed_size, batch_size, num_sampled, learning_rate):
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.batch_size = batch_size
        self.num_sampled = num_sampled
        self.lr = learning_rate
textField = args.textField
test_set_x = read_corpus(args.input, word_index, max_l, pad, textField=textField)
test_set_y_pred = cnn.predict(test_set_x)
test_model = theano.function([cnn.x], test_set_y_pred, allow_input_downcast=True)
results = test_model(test_set_x)
# invert indices (from process_data.py)
labels = ['negative', 'positive', 'neutral']
for line, y in zip(open(args.input), results):
    tokens = line.split("\t")
    tokens[tagField] = labels[y]
    print "\t".join(tokens),
sys.exit()

# training
print "loading data...",
sents, U, word_index, vocab = process_data(args.input, args.clean, args.vectors,
                                           args.tagField, args.textField)
# sents is a list of entries, where each entry is a dict:
# {"y": 0/1, "text": , "num_words": , "split": cv fold}
# vocab: dict of word doc freq
print "data loaded!"
filter_hs = [int(x) for x in args.filters.split(',')]
model = args.model
if args.static:
    print "model architecture: CNN-static"
    non_static = False
else:
    print "model architecture: CNN-non-static"
    non_static = True
if args.vectors:
    print "using: word2vec vectors"
    parser.add_argument('-tagField', type=int, default=1,
                        help='label field in files (default %(default)s)')
    parser.add_argument('-textField', type=int, default=2,
                        help='text field in files (default %(default)s)')
    return parser


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()
    userField = 0
    # training
    sents, W, word_index, vocab, labels, max_l, U, user_idx = process_data(
        args.input, args.clean, args.vectors, None,
        args.tagField, args.textField, userField)
    model = args.model
    if args.static:
        print "model architecture: CNN-static"
        non_static = False
    else:
        print "model architecture: CNN-non-static"
        non_static = True
    if args.vectors:
        print "using: word2vec vectors"
    else:
        print "using: random vectors"
    classes = set(x["y"] for x in sents)
    width = W.shape[1]
import bilsm_crf_model
import process_data
import numpy as np
import re

model, (vocab, chunk_tags) = bilsm_crf_model.create_model(train=False)
predict_text = '针对一些在生活中孩子遇到的不常见字,为了方便阅读,我们都加以拼音标注,这样就克服了小朋友自助阅读的障碍,有利于他们快速正确的阅读'
# strip everything but Chinese characters (i.e. all punctuation) from the input text
predict_text = re.sub("[^\u4e00-\u9fa5]+", "", predict_text)
print(predict_text)
str, length = process_data.process_data(predict_text, vocab)
model.load_weights('model/crf.h5')
raw = model.predict(str)[0][-length:]
result = [np.argmax(row) for row in raw]
result_tags = [chunk_tags[i] for i in result]
per, loc, org = '', '', ''
for s, t in zip(predict_text, result_tags):
    if t == 1:
        per = per + s + ','
    elif t == 2:
        per = per + s + '。'
    elif t == 3:
        per = per + s + '?'
    else:
        per = per + s
    # # per += ' ' + s if (t == 0) else s
    # if t in (2, 1):
    #     org += ' ' + s if (t == 2) else s
    current_time = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    submission.to_csv("output/{}_{}.csv".format(current_time, extension),
                      float_format='%.4f', index=None)


train_data = process_data.read_train()
test_data_raw = process_data.read_test()
train_label = train_data["TARGET"]
train_data.drop("TARGET", axis=1, inplace=True)

process_start_time = time.time()
train_data, test_data, cate_feats = process_data.process_data(
    train_data, test_data_raw, always_label_encode=True)
util.print_time(time.time() - process_start_time)

# pp = pprint.PrettyPrinter(width=200, compact=True)
# pp.pprint(list(train_data))
# scores = cross_val_score(create_classifier(), train_data, train_label, cv=5, scoring='roc_auc')
# print(scores)
# print(train_data.isnull().sum())

print(train_data.shape)

fold_importance_df = pd.DataFrame()
fold_importance_df["feature"] = list(train_data)
skf = StratifiedKFold(n_splits=5, shuffle=True)
def refresh_data():
    # call shell script function
    run(['./get_data.sh'], stdout=PIPE, stderr=PIPE)
    process_data()
    build_simple_model_df()
    return render_template('index.html')