import csv


def convert_test_data(filename, alarm, ticket):
    '''
    Function converts the encoded hex values to numpy arrays.

    Returns:
        converted data in the form of a numpy array.

    Parameters:
        filename: prediction filename
        alarm: alarm file required by DataProcessor object
        ticket: ticket file required by DataProcessor object
    '''
    dp = DataProcessor(alarm, ticket)
    test_alarm_values = []
    with open(filename, encoding='utf8') as csv_file:
        reader = csv.reader(csv_file)
        next(reader)
        for row in reader:
            test_alarm_values.append(row[3])
    result_list = []
    for item in test_alarm_values:
        encoded_alarm = dp.encode_hex_values(item)
        result_list.append(encoded_alarm)
    return dp.convert_array_to_np_array(result_list)
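# Hedged usage sketch for convert_test_data(): the CSV file names below are
# hypothetical placeholders, and DataProcessor must be importable from the
# surrounding project for this call to work.
if __name__ == '__main__':
    test_inputs = convert_test_data('predictions.csv', 'alarm.csv', 'ticket.csv')
    print(test_inputs.shape)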
def train(raw, flags):
    data_processor = DataProcessor(flags.forecast_length, flags.batch_size,
                                   flags.window)
    train_loader, val_loader = data_processor.get_train_test_data(
        raw, flags.validation_ratio)
    model = DeepAR(cov_dim=data_processor.num_features,
                   hidden_dim=flags.num_units,
                   num_layers=flags.num_layers,
                   num_class=len(raw['type'].unique()),
                   embedding_dim=flags.embedding_size,
                   batch_first=True,
                   dropout=flags.dropout)
    opt = torch.optim.Adam(model.parameters(), lr=flags.learning_rate)
    teacher_ratio = flags.teacher_ratio
    loss_history = []
    loss_fn = gaussian_likelihood_loss
    model, opt, start_epoch = load_checkpoint(flags.checkpoint_path, model, opt)
    if start_epoch >= flags.num_epochs:
        print('start_epoch is larger than num_epochs!')
    epoch = start_epoch
    # TODO: add early stop
    for epoch in range(start_epoch, flags.num_epochs):
        for step, data in enumerate(train_loader):
            avg_loss, _ = _forward(data, model, loss_fn, flags.window,
                                   flags.forecast_length, teacher_ratio)
            loss_history.append(avg_loss)
            opt.zero_grad()
            avg_loss.backward()
            opt.step()
        teacher_ratio *= flags.teacher_ratio_decay
        validation_loss = evaluate(val_loader, model, loss_fn, flags.window,
                                   flags.forecast_length)
        print('Epoch: %d' % epoch)
        print("Training Loss:%.3f" % avg_loss)
        print("Validation Loss:%.3f" % validation_loss)
        print('Teacher_ratio: %.3f' % teacher_ratio)
        print()
    print('Model training completed and saved at %s' % flags.checkpoint_path)
    state = {
        'epoch': epoch + 1,
        'state_dict': model.state_dict(),
        'optimizer': opt.state_dict()
    }
    if not os.path.exists(flags.checkpoint_path):
        os.mkdir(flags.checkpoint_path)
    torch.save(state, flags.checkpoint_path + '/model.pt')
    data_processor.save(flags.checkpoint_path)
    return model, loss_history
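# Hedged usage sketch: a minimal `flags` object carrying only the fields the
# train() above actually reads. The field values are illustrative placeholders,
# not the project's real defaults.
from types import SimpleNamespace

flags = SimpleNamespace(
    forecast_length=24, batch_size=64, window=168, validation_ratio=0.1,
    num_units=40, num_layers=2, embedding_size=20, dropout=0.1,
    learning_rate=1e-3, teacher_ratio=1.0, teacher_ratio_decay=0.99,
    num_epochs=10, checkpoint_path='./checkpoints')
# model, losses = train(raw_dataframe, flags)  # `raw_dataframe` needs a 'type' column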
def train(raw, flags):
    data_processor = DataProcessor(flags.forecast_length, flags.batch_size,
                                   flags.window)
    train_loader, val_loader = data_processor.get_train_test_data(
        raw, flags.validation_ratio)
    encoder = EncoderRNN(data_processor.num_features + 1, flags.num_units)
    decoder = DecoderRNN(data_processor.num_features + 1, flags.num_units,
                         output_size=flags.output_dim, batch_first=True)

    def init_weights(m):
        for name, param in m.named_parameters():
            torch.nn.init.uniform_(param.data, -0.1, 0.1)

    encoder.apply(init_weights)
    decoder.apply(init_weights)
    loss_fn = torch.nn.MSELoss()
    # loss_fn = SMAPE()
    model_params = list(encoder.parameters()) + list(decoder.parameters())
    opt = torch.optim.Adam(model_params, lr=flags.learning_rate)
    teacher_ratio = flags.teacher_ratio
    loss_history = []
    # model, opt, start_epoch = load_checkpoint(flags.checkpoint_path, model, opt)
    # if start_epoch >= flags.num_epochs:
    #     print('start_epoch is larger than num_epochs!')
    start_epoch = 0
    epoch = start_epoch
    # TODO: add early stop
    for epoch in range(start_epoch, flags.num_epochs):
        for step, data in enumerate(train_loader):
            avg_loss, _, acc = _forward(data, [encoder, decoder], loss_fn,
                                        flags.window, flags.forecast_length,
                                        True, teacher_ratio=teacher_ratio)
            loss_history.append(avg_loss)
            opt.zero_grad()
            avg_loss.backward()
            torch.nn.utils.clip_grad_norm_(model_params, 5.0)
            opt.step()
        teacher_ratio *= flags.teacher_ratio_decay
        val_loss, val_acc = evaluate(val_loader, [encoder, decoder], loss_fn,
                                     flags.window, flags.forecast_length)
        print('Epoch: %d' % epoch)
        print("Training Loss:%.3f" % avg_loss)
        print('Training Avg Accuracy:%.3f' % acc)
        print("Validation Loss:%.3f" % val_loss)
        print("Validation Accuracy:%.3f" % val_acc)
        print('Teacher_ratio: %.3f' % teacher_ratio)
        # print('Model training completed and save at %s' % flags.checkpoint_path)
        # state = {'epoch': epoch + 1, 'state_dict': model.state_dict(), 'optimizer': opt.state_dict()}
        # if not os.path.exists(flags.checkpoint_path):
        #     os.mkdir(flags.checkpoint_path)
        # torch.save(state, flags.checkpoint_path + '/model.pt')
        # data_processor.save(flags.checkpoint_path)
        # return model, loss_history
        print('Gradients:%.3f' % torch.mean(
            (torch.stack([torch.mean(torch.abs(p.grad)) for p in model_params], 0))))
        print()
def main_execute():
    advisor = TradingAdvisor()
    intervals = [20, 60, 120]
    processor = DataProcessor(intervals)
    subset_df = processor.get_dataframe_subset(500)
    advisor.calc_buy_sell(intervals, subset_df)
    processor.plot(subset_df)
    processor.db.close()
def main_execute():
    advisor = Advisor()
    intervals = [60, 120, 720]
    processor = DataProcessor(intervals)
    subset_df = processor.get_dataframe_subset(3500)
    advisor.calc_buy_sell(intervals, subset_df)
    processor.plot(subset_df)
    processor.db.close()
def infer(raw, flags):
    """Run inference with a trained LSTM forecaster.

    :param raw: raw input time-series data to forecast from
    :param flags: parsed flags (checkpoint_path, num_units, output_dim,
        num_layers, dropout, window, forecast_length)
    :return: dict mapping each series type to its forecast values
    """
    data_processor = DataProcessor.load(flags.checkpoint_path)
    model = LSTM(data_processor.num_features,
                 flags.num_units,
                 output_dim=flags.output_dim,
                 num_layers=flags.num_layers,
                 batch_first=True,
                 dropout=flags.dropout)
    model, _, _ = load_checkpoint(flags.checkpoint_path, model, None)
    model.eval()
    # predict
    results = {}
    loader, ts_types = data_processor.get_forecast_data(raw)
    with torch.no_grad():
        for type, data in zip(ts_types, loader):
            scale = data[4]
            _, outputs = _infer(data, model, flags.window, flags.forecast_length)
            results[type] = [(output * scale).detach().numpy()[0]
                             for output in outputs]
    print(results)
    return results
def run_k_folds(self, n_runs=5, n_folds=2):
    dp = DataProcessor()
    dp.load('data/SQuAD/squad-v7.file')
    model = LogRegModel()
    model.load_vectors(dp.articles, n_folds=n_folds)
    baseline_results = []
    sentiment_results = []
    for run in range(n_runs):
        print("k-fold run:", run)
        baseline_results.append(model.run_k_fold(with_sentiment=False))
        sentiment_results.append(model.run_k_fold())
        model.create_new_folds(n_folds=n_folds)
    self.save_results(baseline_results, "results/5x2_baseline")
    self.save_results(sentiment_results, "results/5x2_sentiment")
def get_data(model_args,
             training_args,
             tokenizer,
             text_data_path="../data/test_dataset"):  # path changed to ../data/test_dataset
    """ get data

    Args:
        model_args: model arguments
        training_args: training arguments
        tokenizer: tokenizer
        text_data_path: Defaults to "../data/test_dataset"

    Returns:
        text_data, val_iter, val_dataset, scores
    """
    text_data = load_from_disk(text_data_path)
    # run elasticsearch
    if "elastic" in model_args.retrieval_type:
        is_sentence_trainformer = False
        if "sentence_trainformer" in model_args.retrieval_type:
            is_sentence_trainformer = True
        # number of texts to concat
        concat_num = model_args.retrieval_elastic_num
        text_data, scores = run_elasticsearch(text_data, concat_num,
                                              model_args,
                                              is_sentence_trainformer)
    elif model_args.retrieval_type == "dense":
        concat_num = model_args.retrieval_elastic_num
        text_data, scores = run_concat_dense_retrival(text_data, concat_num)
    column_names = text_data["validation"].column_names
    data_collator = (DataCollatorWithPadding(
        tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None))
    # tokenize the data (so it can be fed into the MRC model)
    data_processor = DataProcessor(tokenizer)
    val_text = text_data["validation"]
    val_dataset = data_processor.val_tokenzier(val_text, column_names)
    val_iter = DataLoader(val_dataset, collate_fn=data_collator, batch_size=1)
    return text_data, val_iter, val_dataset, scores
def load_rf_data(cur_path):
    data_folder = "data\\titanic"
    processed_data_folder = os.path.join(cur_path, data_folder)
    # Note: Not using test.csv as it does not provide whether or not the passenger
    # survived; therefore we cannot assess how well the model performed.
    data_file_path = os.path.join(processed_data_folder, "train.csv")
    data = DataProcessor(data_file_path, processed_data_folder)
    try:
        # Try to load data
        data.load_processed_data()
    except FileNotFoundError:
        # No data found, so process it
        # 10% test, 10% validation, 80% training samples from data
        splits = (0.1, 0.1, 0.8)
        # Only use certain columns
        use_cols = (
            # 0,   # PassengerID
            1,   # Survived
            2,   # Pclass
            # 3,   # Name
            4,   # Sex
            5,   # Age
            6,   # SibSp
            7,   # Parch
            # 8,   # Ticket
            9,   # Fare
            # 10,  # Cabin
            11,  # Embarked
        )
        # Mark features as categorical (so we can one-hot-encode them later)
        # categorical_cols = ()
        categorical_cols = (
            2,   # Pclass
            4,   # Sex
            11,  # Embarked
        )
        # Convert certain columns to float values (so we can use numpy arrays)
        converters = {4: lambda sex: {'male': 0.0, 'female': 1.0}[sex],
                      11: lambda embarked: {'S': 0.0, 'C': 1.0, 'Q': 2.0}[embarked]}
        data.process_data(splits=splits,
                          use_cols=use_cols,
                          categorical_cols=categorical_cols,
                          converters=converters,
                          filter_missing=True)
    return data
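# Standalone sanity check of the column converters defined above: they map the
# raw CSV strings for Sex (column 4) and Embarked (column 11) to floats before
# the data is turned into numpy arrays.
_converters = {4: lambda sex: {'male': 0.0, 'female': 1.0}[sex],
               11: lambda embarked: {'S': 0.0, 'C': 1.0, 'Q': 2.0}[embarked]}
assert _converters[4]('female') == 1.0
assert _converters[11]('Q') == 2.0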
def run_gru(s):
    x_dims = len(x_cols[s.dataSet]) if s.dataSet in x_cols else s.lookback
    random.seed(6)
    np.random.seed(6)
    rnn = Sequential()
    rnn.add(GRU(s.nodes, input_shape=(None, x_dims),
                kernel_initializer='he_uniform', stateful=False))
    #rnn.add(Dropout(0.15))
    rnn.add(Dense(1, kernel_initializer='he_uniform'))
    opt = adam(lr=s.lr, decay=0.0)  #1e-3)
    rnn.compile(loss='mae', optimizer=opt)

    # prepare dataset as pyBrain sequential dataset
    sequence = readDataSet(s.dataSet, s.dataSetDetailed, s)
    if s.limit_to:
        sequence = sequence[:s.limit_to]
    dp = DataProcessor()
    # standardize data by subtracting mean and dividing by std
    #(meanSeq, stdSeq) = dp.normalize('data', sequence)
    dp.windowed_normalize(sequence)
    for key in sequence.keys():
        if key != "data":
            dp.normalize(key, sequence)

    predictedInput = np.zeros((len(sequence), ))
    targetInput = np.zeros((len(sequence), ))
    trueData = np.zeros((len(sequence), ))
    if s.dataSet in differenceSets:
        predictedInputNodiff = np.zeros((len(sequence), ))
        targetInputNodiff = np.zeros((len(sequence), ))
    if s.dataSet in differenceSets:
        backup_sequence = sequence
        sequence = dp.difference(sequence, s.lookback)

    allX = getX(sequence, s)
    allY = np.array(sequence['data'])
    allX = allX[48:]
    allY = allY[48:]
    #if s.dataSet not in x_cols:
    #    allY = allY[s.lookback:]
    trainX = allX[0:s.nTrain]
    trainY = allY[s.predictionStep:s.nTrain + s.predictionStep]
    trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
    curBatch = 1.0
    callback = LossCallback()
    temp_set = np.array(sequence['data'])[:48 + s.nTrain + 5]
    configure_batches(48, s.batch_size,
                      np.reshape(temp_set, (temp_set.shape[0], 1, 1)))
    rnn.fit(trainX, trainY, epochs=s.epochs, batch_size=s.batch_size,
            verbose=min(s.max_verbosity, 2), callbacks=[callback])
    for i in xrange(0, s.nTrain):
        targetInput[i] = allY[i + s.predictionStep]
    for i in tqdm(xrange(s.nTrain + s.predictionStep, len(allX)),
                  disable=s.max_verbosity == 0):
        if i % s.retrain_interval == 0 and i > s.numLags + s.nTrain and s.online:
            trainX = allX[i - s.nTrain - s.predictionStep:i - s.predictionStep]
            trainY = allY[i - s.nTrain:i]
            trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
            temp_set = np.array(sequence['data'])[i - s.nTrain - s.predictionStep - 48:i]
            configure_batches(48, s.batch_size,
                              np.reshape(temp_set, (temp_set.shape[0], 1, 1)))
            rnn.fit(trainX, trainY, epochs=s.epochs, batch_size=s.batch_size,
                    verbose=2, callbacks=[callback])
        targetInput[i] = allY[i + s.predictionStep]
        predictedInput[i] = rnn.predict(np.reshape(allX[i], (1, 1, x_dims)))
        if s.dataSet in differenceSets:
            predictedInputNodiff[i] = predictedInput[i]
            targetInputNodiff[i] = targetInput[i]
            predictedInput[i] = dp.inverse_difference(backup_sequence['data'],
                                                      predictedInput[i], i - 1)
            targetInput[i] = dp.inverse_difference(backup_sequence['data'],
                                                   targetInput[i], i - 1)
        predictedInput[0] = 0
        trueData[i] = sequence['data'][i]

    #predictedInput = dp.denormalize(predictedInput, meanSeq, stdSeq)
    #targetInput = dp.denormalize(targetInput, meanSeq, stdSeq)
    dp.windowed_denormalize(predictedInput, targetInput)
    if s.dataSet in differenceSets:
        # predictedInputNodiff = dp.denormalize(predictedInputNodiff)
        # targetInputNodiff = dp.denormalize(targetInputNodiff)
        pass
    #trueData = (trueData * stdSeq) + meanSeq
    dp.saveResultToFile(s.dataSet, predictedInput, targetInput, 'gru',
                        s.predictionStep, s.max_verbosity)

    skipTrain = error_ignore_first[s.dataSet]
    from plot import computeSquareDeviation
    squareDeviation = computeSquareDeviation(predictedInput, targetInput)
    squareDeviation[:skipTrain] = None
    nrmse = np.sqrt(np.nanmean(squareDeviation)) / np.nanstd(targetInput)
    if s.max_verbosity > 0:
        print "", s.nodes, "NRMSE {}".format(nrmse)
    mae = np.nanmean(np.abs(targetInput - predictedInput))
    if s.max_verbosity > 0:
        print "MAE {}".format(mae)
    if s.dataSet in differenceSets:
        dp.saveResultToFile(s.dataSet, predictedInputNodiff, targetInputNodiff,
                            'gru_nodiff', s.predictionStep, s.max_verbosity)
        squareDeviation = computeSquareDeviation(predictedInputNodiff,
                                                 targetInputNodiff)
        squareDeviation[:skipTrain] = None
        nrmse = np.sqrt(np.nanmean(squareDeviation)) / np.nanstd(targetInputNodiff)
        if s.max_verbosity > 0:
            print "", s.nodes, "NRMSE {}".format(nrmse)
        mae = np.nanmean(np.abs(targetInputNodiff - predictedInputNodiff))
        if s.max_verbosity > 0:
            print "MAE {}".format(mae)
    mase = errors.get_mase(predictedInput, targetInput,
                           np.roll(targetInput, 24))
    if s.max_verbosity > 0:
        print "MASE {}".format(mase)
    return nrmse
def train(raw, flags):
    data_processor = DataProcessor(flags.forecast_length, flags.batch_size,
                                   flags.window)
    train_loader, val_loader = data_processor.get_train_test_data(
        raw, if_scale=True, val_ratio=flags.validation_ratio)
    model = LSTM(data_processor.num_features,
                 flags.num_units,
                 output_dim=flags.output_dim,
                 num_layers=flags.num_layers,
                 batch_first=True,
                 dropout=flags.dropout)
    if flags.loss == 'mse':
        loss_fn = torch.nn.MSELoss()
    else:
        loss_fn = SMAPE()
    opt = torch.optim.Adam(model.parameters(), lr=flags.learning_rate)
    teacher_ratio = flags.teacher_ratio
    loss_history = []
    model, opt, start_epoch = load_checkpoint(flags.checkpoint_path, model, opt)
    if start_epoch >= flags.num_epochs:
        print('start_epoch is larger than num_epochs!')
    epoch = start_epoch
    # TODO: add early stop
    for epoch in range(start_epoch, flags.num_epochs):
        for step, data in enumerate(train_loader):
            avg_loss, _, acc = _forward(data, model, loss_fn, flags.window,
                                        flags.forecast_length, True,
                                        teacher_ratio)
            loss_history.append(avg_loss)
            opt.zero_grad()
            avg_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
            opt.step()
        teacher_ratio *= flags.teacher_ratio_decay
        val_loss, val_acc = evaluate(val_loader, model, loss_fn, flags.window,
                                     flags.forecast_length)
        print('Epoch: %d' % epoch)
        print("Training Loss:%.3f" % avg_loss)
        print('Training Avg Accuracy:%.3f' % acc)
        print("Validation Loss:%.3f" % val_loss)
        print("Validation Accuracy:%.3f" % val_acc)
        print('Teacher_ratio: %.3f' % teacher_ratio)
        print('Gradients:%.3f' % torch.mean((torch.stack(
            [torch.mean(torch.abs(p.grad)) for p in model.parameters()], 0))))
        print()
    print('Model training completed and saved at %s' % flags.checkpoint_path)
    state = {
        'epoch': epoch + 1,
        'state_dict': model.state_dict(),
        'optimizer': opt.state_dict()
    }
    if not os.path.exists(flags.checkpoint_path):
        os.mkdir(flags.checkpoint_path)
    torch.save(state, flags.checkpoint_path + '/model.pt')
    data_processor.save(flags.checkpoint_path)
    return model, loss_history
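# Hedged sketch of a SMAPE-style loss like the one selected above when
# flags.loss != 'mse'. This is not the project's actual SMAPE class, just the
# standard symmetric MAPE: mean(2 * |y_hat - y| / (|y_hat| + |y| + eps)).
import torch


class SMAPELoss(torch.nn.Module):
    def forward(self, y_hat, y, eps=1e-8):
        # eps keeps the denominator away from zero when both values are 0
        return torch.mean(2.0 * (y_hat - y).abs() / (y_hat.abs() + y.abs() + eps))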
def run_gru(s):
    s.print_settings()
    prob = tf.placeholder_with_default(1.0, shape=())  #Retain probability for TF dropout
    start_time = timeit.default_timer()
    if s.implementation == "keras":
        if s.use_binary:
            raise Exception("Binary Keras not implemented")
        input = Input(shape=(1, s.x_dims))
        dense1 = Dense(s.nodes, activation='sigmoid')(input)
        dense2 = Dense(s.nodes, activation='sigmoid')(input)
        dense3 = Dense(s.nodes, activation='tanh')(input)
        mult1 = Multiply()([dense2, dense3])
        act1 = Activation('tanh')(mult1)
        mult2 = Multiply()([dense1, act1])
        reshape = Reshape((s.nodes, ))(mult2)
        dropout = Dropout(0.5)(reshape)
        dense_out = Dense(1)(dropout)
        rnn = Model(inputs=[input], outputs=[dense_out])
        opt = adam(lr=s.lr, decay=0.0, epsilon=s.adam_eps)  #, clipvalue=1.)#1e-3)
        #opt = rmsprop(lr=s.lr)
        rnn.compile(loss=s.loss, optimizer=opt)
        if s.max_verbosity > 0:
            print(rnn.summary())
    else:
        raise Exception("Unknown implementation " + s.implementation)

    sequence = readDataSet(s.dataSet, s.dataSetDetailed, s).values
    if s.limit_to:
        sequence = sequence[:s.limit_to]
    #Get rid of unneeded columns
    sequence = sequence[:, 0:s.feature_count]
    #sequence[-1000,0] = 666
    #print "Changed -1000 to 666"
    """
    We need to leave some values unpredicted in front so that
      - We can fill the lookback window for each prediction
      - We can get the value from 1 season earlier for MASE
    --> Don't use the first `front_buffer` values as prediction
    --> Independent from `prediction_step`, so the first actual value predicted is `front_buffer`
        plus however many steps the `prediction_step` is higher than 1
    In other words, the most recent X-value for the first prediction will be the final value in the `front_buffer`
    """
    first_prediction_index = s.front_buffer + s.predictionStep - 1
    targetInput = sequence[first_prediction_index:, 0].copy()  #grab this now to avoid having to denormalize

    dp = DataProcessor()
    if s.normalization_type == 'default':
        (meanSeq, stdSeq) = dp.normalize(
            sequence, s.nTrain if s.cutoff_normalize else len(sequence))
    elif s.normalization_type == 'windowed':
        dp.windowed_normalize(sequence, columns=[0])
        if s.feature_count > 1:
            dp.normalize(sequence, s.nTrain, columns=range(1, s.feature_count))
    elif s.normalization_type == 'AN':
        an = AdaptiveNormalizer(s.lookback, s.lookback + s.predictionStep)
        an.set_pruning(False)
        an.set_source_data(sequence, s.nTrain)
        an.do_ma('s')
        an.do_stationary()
        an.remove_outliers()
        seq_norm = an.do_adaptive_normalize()
        print seq_norm.shape
        if s.feature_count > 1:
            dp.normalize(sequence, s.nTrain, columns=range(1, s.feature_count))
            start = sequence.shape[0] - seq_norm.shape[0] - s.lookback - s.predictionStep + 1
            for i in range(seq_norm.shape[0]):
                seq_norm[i, :, 1:s.feature_count] = sequence[
                    start + i:start + i + seq_norm.shape[1], 1:s.feature_count]
        #an.do_ma('s')
        #an.do_stationary()
        #an.remove_outliers()
        #seq_norm = an.do_adaptive_normalize()
        #print seq_norm[15000,0,0]
        #exit(1)
    else:
        raise Exception("Unsupported normalization type: " + s.normalization_type)

    #seq_actual = sequence[s.front_buffer:]  #Leave enough headroom for MASE calculation and lookback
    #seq_full_norm = np.reshape(sequence[:,0], (sequence.shape[0],))
    #seq_actual_norm = seq_full_norm[s.front_buffer:]
    if s.normalization_type != "AN":
        #Default and windowed change the seq itself but still require creating lookback frames
        allX = getX(sequence, s)
        allY = sequence[first_prediction_index:, 0]
    else:
        #AN creates a new array but takes care of lookback internally
        allX = seq_norm[:, 0:-s.predictionStep]
        allY = np.reshape(seq_norm[:, -1, 0], (-1, ))

    predictedInput = np.full((len(allY), ), np.nan)  #Initialize all predictions to NaN
    #print "TESTT", allX[15000,0,1:]
    print "FIRST", allX[875]
    trainX = allX[:s.nTrain]
    trainY = allY[:s.nTrain]
    #print "FIRST", trainX[0], trainY[0]
    trainX = np.reshape(trainX, s.actual_input_shape_train)
    trainY = np.reshape(trainY, s.actual_output_shape_train)
    #print "FIRST", trainX[0], trainY[0]
    if s.implementation == "keras":
        #for _ in tqdm(range(s.epochs)):
        for _ in range(1):
            rnn.fit(trainX, trainY, epochs=s.epochs, batch_size=s.batch_size,
                    verbose=min(s.max_verbosity, 2),
                    shuffle=not s.stateful)  #, validation_data=(trainX, trainY), callbacks=[TensorBoard(log_dir='./logs', histogram_freq=1, write_grads=True)])
            if s.stateful:
                rnn_layer.reset_states()
    # for layer in rnn.layers:
    #     print layer.get_weights()
    #for i in xrange(0, s.nTrain + s.predictionStep):
    #    rnn.predict(np.reshape(allX[i], (1, 1, x_dims)))
    #predictedInput[s.nTrain + s.predictionStep : len(allX)] = rnn.predict(np.reshape(allX[s.nTrain + s.predictionStep : len(allX)], (1, 12510, x_dims)))

    latestStart = None
    do_non_lookback = True
    latest_onego = 0
    #buffer = s.retrain_interval / 2
    buffer = 0
    for i in tqdm(xrange(s.nTrain + s.predictionStep, len(allX)),
                  disable=s.max_verbosity == 0):
        if i % s.retrain_interval == 0 and s.online and i > s.nTrain + s.predictionStep + buffer:
            do_non_lookback = True
            if s.normalization_type == 'AN':
                #print "TEST", seq_norm[15000,0,1]
                predictedInput = np.array(
                    an.do_adaptive_denormalize(predictedInput,
                                               therange=(i - s.retrain_interval, i)))
                latestStart = i
                an.set_ignore_first_n(i - s.nTrain - s.predictionStep)
                an.do_ma('s')
                an.do_stationary()
                an.remove_outliers()
                seq_norm = an.do_adaptive_normalize()
                print seq_norm[15000, 0, 0]
                print seq_norm.shape
                #exit(1)
                #print "FIRST", seq_norm[i-s.nTrain -s.predictionStep,0]#, trainY[0]
                #print sequence[start+i-s.nTrain-s.predictionStep:start+
                if s.feature_count > 1:
                    #dp.normalize(sequence, s.nTrain, columns=range(1,s.feature_count))
                    start = sequence.shape[0] - seq_norm.shape[0] - s.lookback - s.predictionStep + 1
                    for j in range(seq_norm.shape[0]):
                        seq_norm[j, :, 1:s.feature_count] = sequence[
                            start + j:start + j + seq_norm.shape[1], 1:s.feature_count]
                #print "FIRST", seq_norm[i-s.nTrain -s.predictionStep,0]#, trainY[0]
                allX = seq_norm[:, 0:-s.predictionStep]
                allY = np.reshape(seq_norm[:, -1, 0], (-1, ))
            if s.lookback:
                trainX = allX[i - s.nTrain - s.predictionStep:i - s.predictionStep]
                trainY = allY[i - s.nTrain - s.predictionStep:i - s.predictionStep]
            else:
                trainX = allX[i - s.nTrain - s.predictionStep:i - s.predictionStep]
                trainY = allY[i - s.nTrain - s.predictionStep:i - s.predictionStep]
            #print "TESTT", allX[15000,0,:]
            print "at", i - s.nTrain - s.predictionStep
            print "FIRST", allX[875]  #, trainY[0]
            #exit(1)
            trainX = np.reshape(trainX, s.actual_input_shape_train)
            trainY = np.reshape(trainY, s.actual_output_shape_train)
            #print "FIRST", trainX[0], trainY[0]
            #exit(1)
            if s.implementation == "keras":
                if s.reset_on_retrain:
                    input = Input(shape=(1, s.x_dims))
                    dense1 = Dense(s.nodes, activation='sigmoid')(input)
                    dense2 = Dense(s.nodes, activation='sigmoid')(input)
                    dense3 = Dense(s.nodes, activation='tanh')(input)
                    mult1 = Multiply()([dense2, dense3])
                    act1 = Activation('tanh')(mult1)
                    mult2 = Multiply()([dense1, act1])
                    reshape = Reshape((s.nodes, ))(mult2)
                    dropout = Dropout(0.5)(reshape)
                    dense_out = Dense(1)(dropout)
                    rnn = Model(inputs=[input], outputs=[dense_out])
                    opt = adam(lr=s.lr, decay=0.0, epsilon=s.adam_eps)  # , clipvalue=1.)#1e-3)
                    #opt = rmsprop(lr=s.lr)
                    rnn.compile(loss=s.loss, optimizer=opt)
                for _ in range(1):
                    rnn.fit(trainX, trainY,
                            epochs=s.epochs_retrain if s.epochs_retrain else s.epochs,
                            batch_size=s.batch_size, verbose=2,
                            shuffle=not s.stateful)
                    if s.stateful:
                        rnn_layer.reset_states()
        if s.lookback:
            if s.implementation == "keras":
                predictedInput[i] = rnn.predict(
                    np.reshape(allX[i], s.predict_input_shape))
        elif do_non_lookback:
            do_non_lookback = False
            up_to = min(allX.shape[0], i - (i % s.retrain_interval) +
                        s.retrain_interval) if s.online else allX.shape[0]
            start_time = time.time()
            #print allX[0]
            start = 0 if s.refeed_on_retrain else latest_onego
            new_predicts = rnn.predict(
                np.reshape(allX[start:up_to], (1, -1, s.x_dims)))
            new_predicts = np.reshape(new_predicts, (new_predicts.shape[1], ))
            predictedInput[i:up_to] = new_predicts[-(up_to - i):]
            latest_onego = up_to

    for i in range(s.nTrain + s.predictionStep):
        predictedInput[i] = np.nan

    if s.normalization_type == 'default':
        predictedInput = dp.denormalize(predictedInput, meanSeq[0], stdSeq[0])
    elif s.normalization_type == 'windowed':
        dp.windowed_denormalize(predictedInput, pred_step=s.predictionStep)
    elif s.normalization_type == 'AN':
        if latestStart:
            predictedInput = np.array(
                an.do_adaptive_denormalize(predictedInput,
                                           therange=(latestStart, len(predictedInput))))
        else:
            predictedInput = np.array(an.do_adaptive_denormalize(predictedInput))
        if an.pruning:
            targetInput = np.delete(targetInput, an.deletes)

    print "Final time", (timeit.default_timer() - start_time)
    #print "Last not to change:", predictedInput[-996], targetInput[-996]
    #print "First to change:", predictedInput[-995], targetInput[-995]
    dp.saveResultToFile(s.dataSet, predictedInput, targetInput, 'gru',
                        s.predictionStep, s.max_verbosity)

    for ignore in s.ignore_for_error:
        skipTrain = ignore
        from plot import computeSquareDeviation
        squareDeviation = computeSquareDeviation(predictedInput, targetInput)
        squareDeviation[:skipTrain] = None
        nrmse = np.sqrt(np.nanmean(squareDeviation)) / np.nanstd(targetInput)
        if s.max_verbosity > 0:
            print "", s.nodes, "NRMSE {}".format(nrmse)
        mae = np.nanmean(np.abs(targetInput - predictedInput))
        if s.max_verbosity > 0:
            print "MAE {}".format(mae)
        mape = errors.get_mape(predictedInput, targetInput, skipTrain)
        if s.max_verbosity > 0:
            print "MAPE {}".format(mape)
        mase = errors.get_mase(predictedInput, targetInput,
                               np.roll(targetInput, s.season), skipTrain)
        if s.max_verbosity > 0:
            print "MASE {}".format(mase)
    return mase
def data_preprocess():
    with DataProcessor() as dp:
        line_file_path = os.path.join(DATA_PATH, LINE_FILE)
        conv_file_path = os.path.join(DATA_PATH, CONV_FILE)
        dp.prepare_text_data(line_file_path, conv_file_path, PROCESSED_PATH,
                             TESTSET_SIZE)
        dp.process_data(PROCESSED_PATH, THRESHOLD, UNK_SYM, SOS, EOS)
import json

from flask import Flask, abort, request

from db_engine import Engine
from data_processing import DataProcessor

app = Flask(__name__)
engine = Engine()
processor = DataProcessor()


@app.after_request
def build_response(resp):
    resp.headers['Access-Control-Allow-Origin'] = '*'
    resp.headers['Access-Control-Allow-Headers'] = 'Origin, X-Requested-With, Content-Type, Accept'
    resp.headers['Access-Control-Allow-Methods'] = 'GET, POST, DELETE'
    return resp


# Here begins the routing and corresponding calls
@app.route('/scheduleforseason=<int:season>')
def get_schedule(season):
    """ send the league's schedule for a season to front end """
    # validation: converter <int:x> forces all values to ints, rejects otherwise
    # season must be a 4-character year, i.e. 2017
    schedule = []
    try:
        schedule = engine.get_league_schedule_for_season(season)
class MetaLearningDataset(Dataset):

    def __init__(self, split, args):
        """
        Initialise the MetaLearningDataset class

        Parameters
        ----------
        split : str
            'train' or 'test'
        args : parsed args
            args passed in to the model
        """
        self.data_dir = "../../../datascience-projects/internal/multitask_learning/processed_data/Streetbees_Mood/Streetbees_Mood_all.csv"
        data = pd.read_csv(self.data_dir, index_col=0)
        self.data_processor = DataProcessor(data_dir=None, data_name='Streetbees_Mood', labels=['0', '1'])
        self.K = args.K
        self.num_classes = args.num_classes
        self.baseLM_name = args.model_name.split("-")[0]
        do_lower_case = False if args.model_name.split("-")[-1] == "cased" else True
        self.tokenizer = TOKENIZERS[self.baseLM_name].from_pretrained(args.model_name, do_lower_case=do_lower_case)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Pick categories for dev and test set
        ALL_CATEGORIES = np.unique(data['category'])
        TEST_CATEGORIES = ['Healthy', 'Positive', 'Unwell', 'Fine']
        self.tasks = {}
        for category in ALL_CATEGORIES:
            if (split == 'train' and category in TEST_CATEGORIES) or (split == 'test' and category not in TEST_CATEGORIES):
                continue
            # Each category will become a separate task. Explicitly redefine variable below for clarity
            task = category
            pos_examples, neg_examples = self.get_positive_and_negative_examples(data, category=task)
            if task not in self.tasks:
                self.tasks[task] = (pos_examples, neg_examples)

        task_list = []
        task_names = []
        for task in self.tasks:
            pos_examples = self.tasks[task][0]
            neg_examples = self.tasks[task][1]
            if len(pos_examples) < self.K or len(neg_examples) < self.K:
                print('not enough examples', task)
                continue  # skip for now if not enough examples
            task_list.append((pos_examples, neg_examples))
            task_names.append(task)
        self.tasks = task_list
        self.task_names = task_names
        self.num_tasks = len(self.tasks)

    @staticmethod
    def get_positive_and_negative_examples(data, category):
        positive_examples = data[(data['category'] == category) & (data['label'] == 1)]
        negative_examples = data[(data['category'] == category) & (data['label'] == 0)]
        return positive_examples.drop(columns='category'), negative_examples.drop(columns='category')

    def __getitem__(self, task_index):
        # choose the task indicated by index
        pos_examples, neg_examples = self.tasks[task_index]

        # for now just choose randomly among examples
        pos_indices = np.random.choice(range(len(pos_examples)), size=self.K)
        neg_indices = np.random.choice(range(len(neg_examples)), size=self.K)

        # # interleave randomly - DIFFERENT FROM RANDOMLY SHUFFLING
        # examples = np.empty((self.K*2, 2), dtype=pos.dtype)
        # if np.random.uniform() > .5:
        #     examples[0::2, :] = pos
        #     examples[1::2, :] = neg
        # else:
        #     examples[0::2, :] = neg
        #     examples[1::2, :] = pos

        # Randomly shuffle positive and negative examples for now
        all_examples = pd.concat([pos_examples.iloc[pos_indices, :],
                                  neg_examples.iloc[neg_indices, :]]).sample(frac=1)
        train_examples = self.data_processor.get_examples(input_df=all_examples.iloc[:self.K, :], set_type='train')
        test_examples = self.data_processor.get_examples(input_df=all_examples.iloc[self.K:, :], set_type='test')

        train_features = convert_examples_to_features(train_examples, label_list=['0', '1'], max_seq_length=128,
                                                      tokenizer=self.tokenizer, task_name='Streetbees_Mood',
                                                      model_name=self.baseLM_name, do_logging=False)
        all_input_ids = torch.tensor([feature.input_ids for feature in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([feature.segment_ids for feature in train_features], dtype=torch.long)
        all_input_masks = torch.tensor([feature.input_mask for feature in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([feature.label_id for feature in train_features], dtype=torch.long)
        task_train_data = {'input_ids': all_input_ids, 'segment_ids': all_segment_ids,
                           'input_masks': all_input_masks, 'label_ids': all_label_ids}

        test_features = convert_examples_to_features(test_examples, label_list=['0', '1'], max_seq_length=128,
                                                     tokenizer=self.tokenizer, task_name='Streetbees_Mood',
                                                     model_name=self.baseLM_name, do_logging=False)
        all_input_ids = torch.tensor([feature.input_ids for feature in test_features], dtype=torch.long)
        all_segment_ids = torch.tensor([feature.segment_ids for feature in test_features], dtype=torch.long)
        all_input_masks = torch.tensor([feature.input_mask for feature in test_features], dtype=torch.long)
        all_label_ids = torch.tensor([feature.label_id for feature in test_features], dtype=torch.long)
        task_test_data = {'input_ids': all_input_ids, 'segment_ids': all_segment_ids,
                          'input_masks': all_input_masks, 'label_ids': all_label_ids}

        return task_train_data, task_test_data

    def __len__(self):
        return len(self.tasks)
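# Hedged usage sketch for MetaLearningDataset: the `args` fields below mirror
# what __init__ reads (K, num_classes, model_name), but the values and the
# model name are illustrative placeholders, and the hard-coded CSV path plus
# the TOKENIZERS mapping must exist for this to actually run.
from types import SimpleNamespace

args = SimpleNamespace(K=5, num_classes=2, model_name='bert-base-uncased')
train_tasks = MetaLearningDataset('train', args)
support, query = train_tasks[0]  # one K-shot episode for the first task
print(train_tasks.task_names[0], support['input_ids'].shape)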
def get_data(data_args, training_args, tokenizer):
    '''Return the dataloaders and datasets for train and validation.'''
    if data_args.dataset_name == 'basic':
        if os.path.isdir("../data/train_dataset"):
            dataset = load_from_disk("../data/train_dataset")
        else:
            raise Exception("Set the data path to 'p3-mrc-team-ikyo/data/.'")
    elif data_args.dataset_name == 'preprocessed':
        if os.path.isfile("../data/preprocess_train.pkl"):
            dataset = get_pickle("../data/preprocess_train.pkl")
        else:
            dataset = make_custom_dataset("../data/preprocess_train.pkl")
    elif data_args.dataset_name == 'concat':
        if os.path.isfile("../data/concat_train.pkl"):
            dataset = get_pickle("../data/concat_train.pkl")
        else:
            dataset = make_custom_dataset("../data/concat_train.pkl")
    elif data_args.dataset_name == 'korquad':
        if os.path.isfile("../data/korquad_train.pkl"):
            dataset = get_pickle("../data/korquad_train.pkl")
        else:
            dataset = make_custom_dataset("../data/korquad_train.pkl")
    elif data_args.dataset_name == "question_type":
        if os.path.isfile("../data/question_type.pkl"):
            dataset = get_pickle("../data/question_type.pkl")
        else:
            dataset = make_custom_dataset("../data/question_type.pkl")
    elif data_args.dataset_name == "ai_hub":
        if os.path.isfile("../data/ai_hub_dataset.pkl"):
            dataset = get_pickle("../data/ai_hub_dataset.pkl")
        else:
            dataset = make_custom_dataset("../data/ai_hub_dataset.pkl")
    elif data_args.dataset_name == "only_korquad":
        dataset = load_dataset("squad_kor_v1")
    elif data_args.dataset_name == "random_masking":
        if os.path.isfile("../data/random_mask_train.pkl"):
            dataset = get_pickle("../data/random_mask_train.pkl")
        else:
            dataset = make_custom_dataset("../data/random_mask_train.pkl")
    elif data_args.dataset_name == "token_masking":
        if os.path.isfile("../data/concat_token_mask_top_3.pkl"):
            dataset = get_pickle("../data/concat_token_mask_top_3.pkl")
        else:
            dataset = make_mask_dataset("../data/concat_token_mask_top_3.pkl", tokenizer)
        train_dataset = dataset['train']
        val_dataset = dataset['validation']
    else:
        raise Exception(
            "dataset_name have to be one of ['basic', 'preprocessed', 'concat', 'korquad', 'only_korquad', 'question_type', 'ai_hub', 'random_masking', 'token_masking']"
        )

    if data_args.dataset_name != "token_masking":
        train_dataset = dataset['train']
        val_dataset = dataset['validation']
    train_column_names = train_dataset.column_names
    val_column_names = val_dataset.column_names

    data_processor = DataProcessor(tokenizer, data_args.max_seq_length,
                                   data_args.doc_stride)
    train_dataset = data_processor.train_tokenizer(train_dataset, train_column_names)
    val_dataset = data_processor.val_tokenzier(val_dataset, val_column_names)

    data_collator = (DataCollatorWithPadding(
        tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None))
    train_iter = DataLoader(train_dataset,
                            collate_fn=data_collator,
                            batch_size=training_args.per_device_train_batch_size)
    val_iter = DataLoader(val_dataset,
                          collate_fn=data_collator,
                          batch_size=training_args.per_device_eval_batch_size)
    return dataset, train_iter, val_iter, train_dataset, val_dataset
def run_gru(s):
    global global_step
    global increment_global_step_op
    global reset_global_step_op
    global batches
    global images_placeholder
    global batches_op
    global_step = tf.Variable(0, name='global_step', trainable=False, dtype=tf.int32)
    increment_global_step_op = tf.assign(global_step, global_step + 1)
    reset_global_step_op = tf.assign(global_step, 0)
    batches = tf.get_variable(
        "batches", [s.nTrain / int(s.batch_size), s.batch_size, 1, 1],
        dtype=tf.float32, initializer=tf.zeros_initializer)
    images_placeholder = tf.placeholder(
        tf.float32, shape=(s.nTrain / int(s.batch_size), s.batch_size, 1, 1))
    batches_op = tf.assign(batches, images_placeholder)

    x_dims = len(x_cols[s.dataSet]) if s.dataSet in x_cols else s.lookback
    random.seed(6)
    np.random.seed(6)
    rnn = Sequential()
    rnn.add(GRU(s.nodes, input_shape=(None, x_dims),
                kernel_initializer='he_uniform', stateful=False))
    #rnn.add(Dropout(0.15))
    rnn.add(Dense(1, kernel_initializer='he_uniform'))
    opt = adam(lr=s.lr, decay=0.0)  #1e-3)
    rnn.compile(loss='mae', optimizer=opt)

    # prepare dataset as pyBrain sequential dataset
    sequence = readDataSet(s.dataSet, s.dataSetDetailed, s)
    if s.limit_to:
        sequence = sequence[:s.limit_to]
    dp = DataProcessor()
    # standardize data by subtracting mean and dividing by std
    (meanSeq, stdSeq) = dp.normalize('data', sequence, s.nTrain)
    #dp.windowed_normalize(sequence)
    for key in sequence.keys():
        if key != "data":
            dp.normalize(key, sequence, s.nTrain)

    if s.dataSet in differenceSets:
        predictedInputNodiff = np.zeros((len(sequence), ))
        targetInputNodiff = np.zeros((len(sequence), ))
    if s.dataSet in differenceSets:
        backup_sequence = sequence
        sequence = dp.difference(sequence, s.lookback)

    seq_full = sequence['data'].values
    seq_actual = seq_full[s.front_buffer:]
    allX = getX(seq_full, s)
    allY = seq_actual[s.predictionStep - 1:]
    predictedInput = np.full((len(allY), ), np.nan)
    #if s.dataSet not in x_cols:
    #    allY = allY[s.lookback:]
    trainX = allX[:s.nTrain]
    trainY = allY[:s.nTrain]
    trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
    rnn.fit(trainX, trainY, epochs=s.epochs, batch_size=s.batch_size,
            verbose=min(s.max_verbosity, 2))
    #for i in xrange(0,s.nTrain):
    #    targetInput[i] = allY[i+s.predictionStep]
    targetInput = allY
    pred_diffs = []
    pred_closer_to_actual = []
    isFirst = True
    for i in tqdm(xrange(s.nTrain + s.predictionStep, len(allX)),
                  disable=s.max_verbosity == 0):
        #for i in tqdm(xrange(0, len(allX)), disable=s.max_verbosity == 0):
        if i % s.retrain_interval == 0 and i > s.numLags + s.nTrain and s.online:
            trainX = allX[i - s.nTrain - s.predictionStep:i - s.predictionStep]
            trainY = allY[i - s.nTrain - s.predictionStep:i - s.predictionStep]
            trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
            rnn.fit(trainX, trainY, epochs=s.epochs, batch_size=s.batch_size, verbose=0)
        #targetInput[i] = allY[i]
        predictedInput[i] = rnn.predict(np.reshape(allX[i], (1, 1, x_dims)))
        if isFirst:
            print predictedInput[i]
            isFirst = False
        #predictedInput[i] = targetInput[i-1440]
        pred_diffs.append(abs(predictedInput[i] - allX[i][-1]))
        pred_closer_to_actual.append(
            abs(predictedInput[i] - targetInput[i]) < abs(predictedInput[i] - allX[i][-1]))
        if s.dataSet in differenceSets:
            predictedInputNodiff[i] = predictedInput[i]
            targetInputNodiff[i] = targetInput[i]
            predictedInput[i] = dp.inverse_difference(backup_sequence['data'],
                                                      predictedInput[i], i - 1)
            targetInput[i] = dp.inverse_difference(backup_sequence['data'],
                                                   targetInput[i], i - 1)

    for i in range(s.nTrain + s.predictionStep):
        predictedInput[i] = np.nan
    predictedInput = dp.denormalize(predictedInput, meanSeq, stdSeq)
    targetInput = dp.denormalize(targetInput, meanSeq, stdSeq)
    #dp.windowed_denormalize(predictedInput, targetInput)
    print "FINAL", predictedInput[-1], targetInput[-1], len(predictedInput), len(targetInput)
    if s.dataSet in differenceSets:
        # predictedInputNodiff = dp.denormalize(predictedInputNodiff)
        # targetInputNodiff = dp.denormalize(targetInputNodiff)
        pass
    dp.saveResultToFile(s.dataSet, predictedInput, targetInput, 'gru',
                        s.predictionStep, s.max_verbosity)

    skipTrain = error_ignore_first[s.dataSet]
    from plot import computeSquareDeviation
    squareDeviation = computeSquareDeviation(predictedInput, targetInput)
    squareDeviation[:skipTrain] = None
    nrmse = np.sqrt(np.nanmean(squareDeviation)) / np.nanstd(targetInput)
    if s.max_verbosity > 0:
        print "", s.nodes, "NRMSE {}".format(nrmse)
    mae = np.nanmean(np.abs(targetInput - predictedInput))
    if s.max_verbosity > 0:
        print "MAE {}".format(mae)
    mase = errors.get_mase(predictedInput, targetInput, np.roll(targetInput, s.season))
    if s.max_verbosity > 0:
        print "MASE {}".format(mase)
    if s.dataSet in differenceSets:
        dp.saveResultToFile(s.dataSet, predictedInputNodiff, targetInputNodiff,
                            'gru_nodiff', s.predictionStep, s.max_verbosity)
        squareDeviation = computeSquareDeviation(predictedInputNodiff, targetInputNodiff)
        squareDeviation[:skipTrain] = None
        nrmse = np.sqrt(np.nanmean(squareDeviation)) / np.nanstd(targetInputNodiff)
        if s.max_verbosity > 0:
            print "", s.nodes, "NRMSE {}".format(nrmse)
        mae = np.nanmean(np.abs(targetInputNodiff - predictedInputNodiff))
        if s.max_verbosity > 0:
            print "MAE {}".format(mae)
    closer_rate = pred_closer_to_actual.count(True) / float(len(pred_closer_to_actual))
    if s.max_verbosity > 0:
        pred_diffs.sort()
        print pred_diffs[0], pred_diffs[-1], pred_diffs[int(0.9 * len(pred_diffs))]
        print "Good results:", closer_rate
    return mase, closer_rate
def __init__(self):
    self.engine = Engine()
    self.processor = DataProcessor()
sensor = model._getSensorRegion()
encoderList = sensor.getSelf().encoder.getEncoderList()
if sensor.getSelf().disabledEncoder is not None:
    classifier_encoder = sensor.getSelf().disabledEncoder.getEncoderList()
    classifier_encoder = classifier_encoder[0]
else:
    classifier_encoder = None

print "Load dataset: ", dataSet
skips = data_skips[dataSet]
df = pd.read_csv(inputData, header=0, skiprows=skips)
df = preprocess(df)
if limit_to:
    df = df[:limit_to]

dp = DataProcessor()
#dp.windowed_normalize(df, field_name=predictedField, is_data_field=True)

print " run SP through the first %i samples %i passes " % (nMultiplePass, nTrain)
model = runMultiplePassSPonly(df, model, nMultiplePass, nTrain)
model._spLearningEnabled = False

maxBucket = classifier_encoder.n - classifier_encoder.w + 1
likelihoodsVecAll = np.zeros((maxBucket, len(df)))

prediction_nstep = None
time_step = []
actual_data = []
patternNZ_track = []
predict_data = np.zeros((_options.stepsAhead, 0))
class LightClient(object):

    def __init__(self, ip):
        self.log = Logger(MAIN_CLIENT_LOG_FILE, D_VERB)
        self.log.info('[MAIN THREAD] Instantiated client')
        self.receiving = False
        self.define_headers()
        self.targets = {}
        self.transmit = Queue.Queue()
        self.data_client = DataClient(self.transmit, ip)
        self.data_processor = DataProcessor(self.transmit, self.headers, self.targets)
        self.connect(ip)

    def connect(self, ip):
        self.soc_ctrl = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.soc_ctrl.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        my_ip = socket.gethostbyname('')
        self.log.debug('[MAIN THREAD] connecting...')
        self.soc_ctrl.connect((ip, SOC_PORT_CTRL))
        self.log.info('[MAIN THREAD] Client connected to server')

    def disconnect(self):
        ### data processor should not be here
        self.data_processor.stop()
        self.soc_ctrl.close()

    def define_headers(self):
        head = {}
        head['process'] = PROC_CPU_DATA + PROC_MEM_DATA + TIMESTAMPS
        head['system'] = SYS_CPU_OTHER + LOAD_AVG + SYS_CPU_DATA + SYS_MEM_DATA + TIMESTAMPS
        self.headers = head

    def add_target(self, target, name):
        if target in self.targets:
            self.targets[target].append(name)
        else:
            self.targets[target] = [name]

    def remove_target(self, target, name):
        if target in self.targets:
            if name in self.targets[target]:
                self.targets[target].remove(name)
                self.log.info('[MAIN THREAD] Removed {} named {}'.format(target, name))
            else:
                self.log.error('[MAIN THREAD] Asked to remove {} named {} while not recorded'.format(target, name))
        else:
            self.log.error('[MAIN THREAD] Asked to remove {} named {} while not recorded'.format(target, name))

    def start_record(self, target, name):
        self.log.debug('[MAIN THREAD] Asking server to start recording')
        msg = MSG_SEP.join([START_RECORD, target, name])
        answer = send_data(self.soc_ctrl, msg)
        self.log.info('[MAIN THREAD] Server asked to start recording')
        if answer == SYNC:
            self.add_target(target, name)
            self.log.info('[MAIN THREAD] Added {} named {}'.format(target, name))
        else:
            self.log.warn('[MAIN THREAD] Could not add {} named {} because of server answer'.format(target, name))

    def stop_record(self, target, name):
        self.log.debug('[MAIN THREAD] Asking server to stop recording')
        msg = MSG_SEP.join([STOP_RECORD, target, name])
        answer = send_data(self.soc_ctrl, msg)
        self.log.info('[MAIN THREAD] Server asked to stop recording {}'.format(name))
        if answer == SYNC:
            self.remove_target(target, name)
        else:
            self.log.warn('[MAIN THREAD] Could not remove {} named {} because of server answer'.format(target, name))

    def start_receive(self):
        if not self.receiving:
            self.receiving = True
            self.log.debug('[MAIN THREAD] Asking server to start sending')
            status = send_data(self.soc_ctrl, START_SEND)
            self.log.info('[MAIN THREAD] Server asked to start sending')
            if status == FAIL:
                self.log.error('[MAIN THREAD] Client tried to receive but server denied it')
            else:
                print status
                self.data_client.start()
                self.log.info('[MAIN THREAD] Client is receiving')
                self.log.debug("[MAIN THREAD] DATA THREAD started")
        else:
            self.log.warn("[MAIN THREAD] Asked to start receiving while already receiving")

    def stop_receive(self):
        if self.receiving:
            self.log.debug('[MAIN THREAD] Closing data channel. Exiting data client thread')
            self.data_client.stop()
            self.log.info("[MAIN THREAD] Asked server to stop receiving")
            self.receiving = False
            send_data(self.soc_ctrl, STOP_SEND)
        else:
            self.log.warn("[MAIN THREAD] Asked to stop receiving while already receiving")

    def start_store(self, dirname='easy_client'):
        return self.data_processor.start_store(dirname)

    def stop_store(self):
        self.data_processor.stop_store()

    def start_print(self):
        self.data_processor.start_print()

    def stop_print(self):
        self.printing = self.data_processor.stop_print()

    def stop_process(self):
        self.stop_print()
        self.stop_store()
        self.data_processor.stop()
        self.stop_receive()
        self.soc_ctrl.close()

    def stop_all(self):
        self.stop_process()
        send_data(self.soc_ctrl, STOP_ALL)
        result["conf_matrices"].append(
            confusion_matrix(self.model.predict(test_vectors), test_targets))
        result["roc_curves"].append(
            roc_curve(test_targets, self.model.predict_proba(test_vectors)[:, 1]))
        result["roc_auc_scores"].append(
            roc_auc_score(test_targets, self.model.predict(test_vectors)))
        result["coefficients"].append(self.model.coef_[0])
        return result


if __name__ == "__main__":
    dp = DataProcessor()
    dp.load('data/SQuAD/squad-v7.file')
    model = LogRegModel()
    model.load_vectors(dp.articles)
    model.run_k_fold()
def train_and_validate(alarm, ticket):
    '''
    Function will perform the following actions:
        1. Data will be converted to numpy arrays to be accepted as input to the learning model
        2. Construct learning model and train on the data provided
        3. Generate data predictions using a subset of the ticket data
        4. Validate the predictions generated with the test data against the
           manually predicted values in the ticket file

    Parameters:
        alarm: alarm file needed to initialize DataProcessor
        ticket: ticket file needed to initialize DataProcessor
    '''
    # Initialize DataProcessor object
    dp = DataProcessor(alarm, ticket)
    # Create thread to run progress bar
    thread = threading.Thread(target=run_progress_bar, args=(670, ))
    # Start thread to run progress bar
    thread.start()

    # Converting lists to numpy arrays; the number specifies the index in the
    # array of associated label values
    encoded_hex_codes = dp.convert_array_to_np_array(dp.encode_ticket_hex_codes())
    event_cause_options = dp.convert_array_to_np_array(
        dp.get_encoded_label_value(dp.event_cause_vals, 0))
    detection_method_options = dp.convert_array_to_np_array(
        dp.get_encoded_label_value(dp.detection_method, 1))
    restore_method_options = dp.convert_array_to_np_array(
        dp.get_encoded_label_value(dp.restore_method, 2))
    fix_classification_options = dp.convert_array_to_np_array(
        dp.get_encoded_label_value(dp.fix_classification, 3))
    subsystem_options = dp.convert_array_to_np_array(
        dp.get_encoded_label_value(dp.subsystem, 4))
    relevance_options = dp.convert_array_to_np_array(
        dp.get_encoded_label_value(dp.relevance, 5))

    # Train on data: training & saving the model for each of the labels
    classify_data(encoded_hex_codes, event_cause_options, 'event_cause.hdf5',
                  101, 110, 0.80, len(dp.event_cause_vals))
    classify_data(encoded_hex_codes, detection_method_options,
                  'detection_method.hdf5', 101, 110, 0.80,
                  len(dp.detection_method))
    classify_data(encoded_hex_codes, restore_method_options,
                  'restore_method.hdf5', 101, 110, 0.80,
                  len(dp.restore_method))
    classify_data(encoded_hex_codes, fix_classification_options,
                  'fix_classification.hdf5', 101, 110, 0.80,
                  len(dp.fix_classification))
    classify_data(encoded_hex_codes, subsystem_options, 'subsystem.hdf5', 101,
                  110, 0.80, len(dp.subsystem))
    classify_data(encoded_hex_codes, relevance_options, 'relevance.hdf5', 101,
                  110, 0.80, len(dp.relevance))

    # Generate training-data predictions on the 20 percent that needs to be
    # tested, for the alarm hex and all labels. Calculate the starting index
    # from the length of encoded_hex_codes, which is the input for the model.
    start_index = int(len(encoded_hex_codes) * 0.8)
    predict_input_hex = encoded_hex_codes[start_index:]
    predict_event_cause = event_cause_options[start_index:]
    predict_detection_method = detection_method_options[start_index:]
    predict_restore_method = restore_method_options[start_index:]
    predict_fix_classification = fix_classification_options[start_index:]
    predict_subsystem = subsystem_options[start_index:]
    predict_relevance = relevance_options[start_index:]

    # Calling prediction from predict.py returns an array of confidence values
    event_cause_prediction = prediction(predict_input_hex, 'event_cause.hdf5')
    detection_method_prediction = prediction(predict_input_hex, 'detection_method.hdf5')
    restore_method_prediction = prediction(predict_input_hex, 'restore_method.hdf5')
    fix_classification_prediction = prediction(predict_input_hex, 'fix_classification.hdf5')
    subsystem_prediction = prediction(predict_input_hex, 'subsystem.hdf5')
    relevance_prediction = prediction(predict_input_hex, 'relevance.hdf5')

    # Validate training-data predictions: validation calls for all labels
    # using the values returned by the prediction function
    validation(predict_event_cause, event_cause_prediction, predict_input_hex,
               'event_cause_predictions.txt')
    validation(predict_detection_method, detection_method_prediction,
               predict_input_hex, 'detection_method_predictions.txt')
    validation(predict_restore_method, restore_method_prediction,
               predict_input_hex, 'restore_method_predictions.txt')
    validation(predict_fix_classification, fix_classification_prediction,
               predict_input_hex, 'fix_classication_predictions.txt')
    validation(predict_subsystem, subsystem_prediction, predict_input_hex,
               'subsystem_predictions.txt')
    validation(predict_relevance, relevance_prediction, predict_input_hex,
               'relevance_predictions.txt')

    # Join thread back to main process
    thread.join()
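# Hedged usage sketch: the alarm/ticket CSV names below are hypothetical
# placeholders; calling this runs the full train, predict and validate
# pipeline defined above.
if __name__ == '__main__':
    train_and_validate('alarm.csv', 'ticket.csv')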
def run_gru(s): prob = tf.placeholder_with_default(1.0, shape=()) global global_step global increment_global_step_op global reset_global_step_op global batches global images_placeholder global batches_op global_step = tf.Variable(0, name='global_step', trainable=False, dtype=tf.int32) increment_global_step_op = tf.assign(global_step, global_step + 1) reset_global_step_op = tf.assign(global_step, 0) batches = tf.get_variable("batches", [s.nTrain / int(s.batch_size), s.batch_size, 1, 1], dtype=tf.float32, initializer=tf.zeros_initializer) images_placeholder = tf.placeholder(tf.float32, shape=(s.nTrain / int(s.batch_size), s.batch_size, 1, 1)) batches_op = tf.assign(batches, images_placeholder) x_dims = s.lookback random.seed(6) np.random.seed(6) tf.set_random_seed(6) if s.implementation == "keras": if s.use_binary: raise Exception("Binary Keras not implemented") rnn = Sequential() if s.rnn_type == "lstm": rnn.add(LSTM(s.nodes, input_shape=(None,x_dims), kernel_initializer='he_uniform')) elif s.rnn_type == "gru": rnn.add(GRU(s.nodes, input_shape=(None, x_dims), kernel_initializer='he_uniform')) rnn.add(Dropout(0.5)) rnn.add(Dense(1, kernel_initializer='he_uniform')) opt = rmsprop(lr=s.lr)#1e-3) rnn.compile(loss='mae', optimizer=opt) input = Input(shape=(1, x_dims)) dense1 = Dense(s.nodes, activation='sigmoid')(input) dense2 = Dense(s.nodes, activation='sigmoid')(input) dense3 = Dense(s.nodes, activation='tanh')(input) mult1 = Multiply()([dense2, dense3]) act1 = Activation('tanh')(mult1) mult2 = Multiply()([dense1, act1]) reshape = Reshape((s.nodes,))(mult2) dropout = Dropout(0.5)(reshape) dense_out = Dense(1)(dropout) rnn = Model(inputs=[input], outputs=[dense_out]) opt = adam(lr=s.lr) # 1e-3) rnn.compile(loss='mae', optimizer=opt) print rnn.summary() elif s.implementation == "tf": data = tf.placeholder(tf.float32, [None, s.lookback, 1]) # Number of examples, number of input, dimension of each input target = tf.placeholder(tf.float32, [None, 1]) if s.rnn_type == "lstm" and s.use_binary: cell = rnn_tf.LSTMCell(s.nodes) elif s.rnn_type == "lstm" and not s.use_binary: cell = tf.nn.rnn_cell.LSTMCell(s.nodes) elif s.rnn_type == "gru" and s.use_binary: cell = rnn_tf.GRUCell(s.nodes) elif s.rnn_type == "gru" and not s.use_binary: cell = tf.nn.rnn_cell.GRUCell(s.nodes) val, _ = tf.nn.dynamic_rnn(cell, data, dtype=tf.float32) with tf.name_scope('rnn_summaries'): var = val mean = tf.reduce_mean(var) tf.summary.scalar('mean', mean) with tf.name_scope('stddev'): stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean))) tf.summary.scalar('stddev', stddev) tf.summary.scalar('max', tf.reduce_max(var)) tf.summary.scalar('min', tf.reduce_min(var)) tf.summary.histogram('histogram', var) val = tf.nn.dropout(val, prob) if not s.use_binary: dense = tf.layers.dense(val, 1) else: dense = core_discretize.dense(val, 1) with tf.name_scope('dense_summaries'): var = dense mean = tf.reduce_mean(var) tf.summary.scalar('mean', mean) with tf.name_scope('stddev'): stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean))) tf.summary.scalar('stddev', stddev) tf.summary.scalar('max', tf.reduce_max(var)) tf.summary.scalar('min', tf.reduce_min(var)) tf.summary.histogram('histogram', var) pred = tf.reshape(dense, (tf.shape(dense)[0], 1)) summary = tf.summary.merge_all() optimizer = tf.train.AdamOptimizer(learning_rate=s.lr) #cost = tf.losses.mean_squared_error(target, pred) cost = tf.reduce_mean(tf.abs(target - pred)) minimize = optimizer.minimize(cost) else: raise Exception("Unknown implementation " + s.implementation) sequence = 
readDataSet(s.dataSet, s.dataSetDetailed, s) if s.limit_to: sequence = sequence[:s.limit_to] #TEMP SANITY CHECK # sequence['data'][7001] = 0 # sequence['data'][7002] = 0 # sequence['data'][7003] = 0 # sequence['data'][7004] = 0 # sequence['data'][7005] = 0 seq_full = sequence['data'].values #use .values to copy targetInput = seq_full[s.front_buffer + s.predictionStep - 1:].copy() #grab this now to avoid having to denormalize dp = DataProcessor() if s.normalization_type == 'default': (meanSeq, stdSeq) = dp.normalize('data', sequence, s.nTrain) elif s.normalization_type == 'windowed': dp.windowed_normalize(sequence) elif s.normalization_type == 'AN': an = AdaptiveNormalizer(s.lookback, s.lookback + s.predictionStep) an.set_pruning(False) an.set_source_data(seq_full, s.nTrain) an.do_ma('s') an.do_stationary() an.remove_outliers() seq_norm = an.do_adaptive_normalize() else: raise Exception("Unsupported normalization type: " + s.normalization_type) seq_actual = seq_full[s.front_buffer:] #Leave enough headroom for MASE calculation and lookback seq_full_norm = sequence['data'].values seq_actual_norm = seq_full_norm[s.front_buffer:] if s.normalization_type != "AN": #Default and windowed change the seq itself but still require creating lookback frames allX = getX(seq_full_norm, s) allY = seq_actual_norm[s.predictionStep-1:] else: #AN creates a new array but takes care of lookback internally allX= seq_norm[:,0:-s.predictionStep] allY = np.reshape(seq_norm[:,-1], (-1,)) # TODO FIX PROPERLY (now rolled too far) too_long = len(allX) - (len(seq_full) - s.front_buffer - s.predictionStep + 1) if too_long > 0: allX = allX[too_long:] allY = allY[too_long:] print len(allX), len(allY), s.front_buffer predictedInput = np.full((len(allY),), np.nan) #Initialize all predictions to NaN trainX = allX[:s.nTrain] trainY = allY[:s.nTrain] trainX = np.reshape(trainX, (trainX.shape[0],1, trainX.shape[1])) trainY = np.reshape(trainY, ( trainY.shape[0],)) if s.implementation == "keras": rnn.fit(trainX, trainY, epochs=s.epochs, batch_size=s.batch_size, verbose=min(s.max_verbosity, 2)) elif s.implementation == "tf": sess = tf.Session() writer = tf.summary.FileWriter("results/", graph=sess.graph) init = tf.global_variables_initializer() sess.run(init) for v in tf.trainable_variables(): print v.name for epoch in tqdm(range(s.epochs)): the_cost, _, summ = sess.run([cost, minimize, summary], feed_dict={data: trainX, target: trainY, prob: 0.5}) writer.add_summary(summ, epoch) if epoch % 10 == 0: print the_cost #print(psutil.Process(os.getpid()).memory_percent()) var = [v for v in tf.trainable_variables() if v.name == "rnn/gru_cell/gates/kernel:0"][0] print sess.run(tf.reduce_min(var)) print sess.run(tf.reduce_max(var)) # var = [v for v in tf.trainable_variables() if v.name == "rnn/gru_cell/gates/bias:0"][0] # print sess.run(tf.reduce_min(var)) # print sess.run(tf.reduce_max(var)) # var = [v for v in tf.trainable_variables() if v.name == "rnn/gru_cell/candidate/kernel:0"][0] # print sess.run(tf.reduce_min(var)) # print sess.run(tf.reduce_max(var)) # var = [v for v in tf.trainable_variables() if v.name == "rnn/gru_cell/candidate/bias:0"][0] # print sess.run(tf.reduce_min(var)) # print sess.run(tf.reduce_max(var)) # print "loop" var = [v for v in tf.trainable_variables() if v.name == "dense/bias:0"] print sess.run(var) minval = 10 latestStart = None for i in tqdm(xrange(s.nTrain + s.predictionStep, len(allX)), disable=s.max_verbosity == 0): #for i in tqdm(xrange(0, len(allX)), disable=s.max_verbosity == 0): #for i in 
tqdm(xrange(10475, len(allX)), disable=s.max_verbosity == 0): if i % s.retrain_interval == 0 and i > s.numLags+s.nTrain and s.online: if s.normalization_type == 'AN': predictedInput = np.array(an.do_adaptive_denormalize(predictedInput, therange=(i-s.retrain_interval, i))) latestStart = i an.set_ignore_first_n(i-s.nTrain-s.predictionStep) an.do_ma('s') an.do_stationary() an.remove_outliers() seq_norm = an.do_adaptive_normalize() allX = seq_norm[:, 0:-s.predictionStep] allY = np.reshape(seq_norm[:, -1], (-1,)) trainX = allX[i-s.nTrain-s.predictionStep:i-s.predictionStep] trainY = allY[i-s.nTrain-s.predictionStep:i-s.predictionStep] trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1])) trainY = np.reshape(trainY, (trainY.shape[0], 1)) if s.implementation == "keras": rnn.fit(trainX, trainY, epochs=s.epochs, batch_size=s.batch_size, verbose=0) elif s.implementation == "tf": for epoch in range(s.epochs): sess.run(minimize, feed_dict={data: trainX, target: trainY, prob: 0.5}) if s.implementation == "keras": predictedInput[i] = rnn.predict(np.reshape(allX[i], (1,1,x_dims))) elif s.implementation == "tf": predictedInput[i] = sess.run(dense, feed_dict={data: np.reshape(allX[i], (1, x_dims, 1))}) #if len(allX) > i+5: # predictedInput[i] = allY[i-3000] # if i == 10000: # print allX[i] # print "should be ", (targetInput[i] - meanSeq) / stdSeq # print "predicted as ", predictedInput[i] # for i in range(s.nTrain + s.predictionStep): # predictedInput[i] = np.nan print "SMALLEST", minval # np.set_printoptions(threshold=np.nan, suppress=True) # print "ALLY START" # for val in allY: # print val # print "ALLY STOP" if s.normalization_type == 'default': predictedInput = dp.denormalize(predictedInput, meanSeq, stdSeq) #targetInput = dp.denormalize(targetInput, meanSeq, stdSeq) elif s.normalization_type == 'windowed': dp.windowed_denormalize(predictedInput, targetInput, pred_step=s.predictionStep) elif s.normalization_type == 'AN': if latestStart: predictedInput = np.array(an.do_adaptive_denormalize(predictedInput, therange=(latestStart, len(predictedInput)))) else: predictedInput = np.array(an.do_adaptive_denormalize(predictedInput)) if an.pruning: targetInput = np.delete(targetInput, an.deletes) print len(predictedInput), len(targetInput), "LENS" #TEMP SANITY CHECK #print predictedInput[7005 - s.front_buffer - s.predictionStep +1] #print predictedInput[7006 - s.front_buffer - s.predictionStep + 1] dp.saveResultToFile(s.dataSet, predictedInput, targetInput, 'gru', s.predictionStep, s.max_verbosity) skipTrain = s.ignore_for_error from plot import computeSquareDeviation squareDeviation = computeSquareDeviation(predictedInput, targetInput) squareDeviation[:skipTrain] = None nrmse = np.sqrt(np.nanmean(squareDeviation)) / np.nanstd(targetInput) if s.max_verbosity > 0: print "", s.nodes, "NRMSE {}".format(nrmse) mae = np.nanmean(np.abs(targetInput-predictedInput)) if s.max_verbosity > 0: print "MAE {}".format(mae) mape = errors.get_mape(predictedInput,targetInput, s.ignore_for_error) if s.max_verbosity > 0: print "MAPE {}".format(mape) mase = errors.get_mase(predictedInput, targetInput, np.roll(targetInput, s.season), s.ignore_for_error) if s.max_verbosity > 0: print "MASE {}".format(mase) if s.implementation == "tf": sess.close() return mase
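run_gru reports NRMSE, MAE, MAPE, and MASE, with the last two delegated to an errors module that is not included here. The sketch below shows what those two helpers could look like, with signatures inferred from the call sites above (get_mape(predicted, target, skip) and get_mase(predicted, target, naive_forecast, skip), where the naive forecast passed in is the target series rolled by one season). This is an assumed implementation for illustration, not the module's actual code:

import numpy as np

def get_mape(predicted, target, skip=0):
    """Mean absolute percentage error, ignoring the first `skip` points and
    any NaN predictions or zero targets."""
    p = np.asarray(predicted[skip:], dtype=float)
    t = np.asarray(target[skip:], dtype=float)
    mask = ~np.isnan(p) & (t != 0)
    return float(np.mean(np.abs((t[mask] - p[mask]) / t[mask])))

def get_mase(predicted, target, naive_forecast, skip=0):
    """Mean absolute scaled error: the forecast MAE divided by the MAE of a
    reference (here seasonal-naive) forecast over the same points."""
    p = np.asarray(predicted[skip:], dtype=float)
    t = np.asarray(target[skip:], dtype=float)
    n = np.asarray(naive_forecast[skip:], dtype=float)
    mask = ~np.isnan(p)
    mae_model = np.mean(np.abs(t[mask] - p[mask]))
    mae_naive = np.mean(np.abs(t[mask] - n[mask]))
    return float(mae_model / mae_naive)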
from tensorflow.keras.models import Sequential
# from keras.models import Sequential  # Note: included for debug source access
from tensorflow.keras.layers import Dense
# from keras.layers import Dense  # Note: included for debug source access
import os
import numpy as np
import sys
from data_processing import DataProcessor
from matplotlib import pyplot

cur_path = os.path.dirname(__file__)
data_folder = "data\\titanic"
processed_data_folder = os.path.join(cur_path, data_folder)

# Note: Not using test.csv as it does not provide whether or not the passenger
# survived; therefore we cannot assess how well the model performed.
data_file_path = os.path.join(processed_data_folder, "train.csv")
data_processor = DataProcessor(data_file_path, processed_data_folder, "ffnn_processed.npz")

# Load data
try:
    # Try to load data
    data_processor.load_processed_data()
except FileNotFoundError:
    # No data found, so process it
    # 20% test, 20% validation, 60% training samples from data
    splits = (0.2, 0.2, 0.6)
    # Only use certain columns
    use_cols = (
        # 0,  # PassengerID
        1,  # Survived
        2,  # Pclass
        # 3,  # Name
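The imports above point at a small dense feed-forward classifier over the selected Titanic columns (the column list is cut off at this point in the document). As a rough sketch of the kind of model those imports support, assuming the processed .npz has already been split into x_train / y_train NumPy arrays (names chosen here for illustration only, not the attributes DataProcessor actually exposes):

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

def build_ffnn(input_dim):
    # Small binary classifier over the selected passenger features
    model = Sequential([
        Dense(16, activation='relu', input_shape=(input_dim,)),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Usage sketch (x_train / y_train assumed to come from the processed .npz):
# model = build_ffnn(x_train.shape[1])
# history = model.fit(x_train, y_train, epochs=50, batch_size=32, validation_split=0.2)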
class DataCollector:
    """ For all data collection. """

    def __init__(self):
        self.engine = Engine()
        self.processor = DataProcessor()

    def get_single_box_score(self, game_id):
        """
        Gathers a box score from basketball-reference.com and stores it to the database.

        Keyword arguments:
        game_id: 12-character string in the form YYYYMMDD0XXX, where
            YYYY is the year
            MM is a 2-digit numeric representation of the month, zero-padded if necessary
            DD is a 2-digit numeric representation of the day, zero-padded if necessary
            XXX is the 3-character abbreviation of the home team,
                e.g. 'BOS' for Boston Celtics or 'NYK' for New York Knicks
        """
        url = BK_REF_URL + game_id + HTML_SUFFIX
        page_response = requests.get(url, headers=REQUEST_HEADERS)
        page_tree = html.fromstring(page_response.content)
        home_stats, away_stats = [], []
        for stat in ['pts', 'fg', 'fga', 'fg3', 'fg3a', 'ft', 'fta',
                     'orb', 'drb', 'ast', 'stl', 'blk', 'tov']:
            away, home = page_tree.xpath(BK_REF_XPATH % stat)
            away_stats.append(int(away.text.strip()))
            home_stats.append(int(home.text.strip()))
        # The minutes listed on the page cover all five positions combined
        minutes = int(page_tree.xpath(BK_REF_XPATH % 'mp')[0].text.strip()) // 5
        data_values = tuple([None, None, game_id] + away_stats + home_stats + [minutes])
        self.engine.insert_box_score(data_values)

    def get_season_schedule(self, year):
        """
        Gathers a full season's game schedule by traversing basketball-reference.com.
        These games will eventually be used by the get_single_box_score method to gather box scores.

        Keyword arguments:
        year: int representing the year that a season concludes in,
            i.e. the 1986-1987 season is represented by 1987
        """
        schedule = []
        for month in ['october', 'november', 'december', 'january',
                      'february', 'march', 'april']:
            url = BK_REF_SCHEDULE_URL % (str(year), month)
            page_response = requests.get(url, headers=REQUEST_HEADERS)
            if page_response.status_code == 404:
                continue
            page_tree = html.fromstring(page_response.content)
            game_headers = page_tree.xpath(BK_REF_SCHEDULE_XPATH + 'th')
            away_xpath = 'td[1]/a' if int(year) <= 2000 else 'td[2]/a'
            away_teams = page_tree.xpath(BK_REF_SCHEDULE_XPATH + away_xpath)
            # Handle the special case for April, where playoff games are displayed on the page
            if month == 'april':
                header_list = page_tree.xpath(BK_REF_SCHEDULE_XPATH + 'th')
                try:
                    end_index = next(index for index, val in enumerate(header_list)
                                     if not val.get('class', False))
                except StopIteration:
                    end_index = len(game_headers)
            else:
                end_index = len(game_headers)
            for index, game in enumerate(game_headers):
                if index == end_index:
                    break
                game_code = game.attrib['csk']
                away_url = away_teams[index].attrib['href']
                away_team = away_url.split('/')[2]
                home_team = game_code[-3:]
                game_date = '{}-{}-{}'.format(game_code[:4], game_code[4:6], game_code[6:8])
                schedule.append((game_code, game_date, year, away_team, home_team))
        self.engine.insert_scheduled_games(schedule)
        self.engine.commit_changes()

    def gather_all_scheduled_games(self):
        """ Simple loop to gather all season schedules from 1986 to the present. """
        print("Loading each season schedule and saving to database:")
        for season in SEASON_LIST:
            print(season)
            self.get_season_schedule(season)

    def gather_all_box_scores(self):
        """
        Gather all games on the schedule.
        Save after each one because this will likely be interrupted at some point.
        """
        games = self.engine.get_game_ids_to_gather()
        for game_id in games:
            print(game_id)
            self.get_single_box_score(game_id)
            self.engine.commit_changes()

    def fill_database_from_scratch(self):
        """ Starting with the data model but no records, fill in the database. """
        # Start by loading the teams table
        self.engine.insert_all_team_data()
        # Gather and save all scheduled games into the db
        self.gather_all_scheduled_games()
        # Gather and save all box scores into the db
        self.gather_all_box_scores()
        # Calculate all team stats from the box scores
        self.processor.complete_database_setup()

    def in_season_update(self):
        self.gather_all_box_scores()
        self.processor.process_all_stats_for_year(CURRENT_SEASON)
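The game_id format documented in get_single_box_score can also be produced locally from a date and a home-team abbreviation. A small helper along those lines (not part of the class above, shown purely to illustrate the documented format):

from datetime import date

def make_game_id(game_date, home_team):
    """Build the 12-character game id described above: YYYYMMDD0XXX.
    game_date: a datetime.date; home_team: 3-letter team abbreviation."""
    return '%04d%02d%02d0%s' % (game_date.year, game_date.month, game_date.day, home_team)

# Example:
# make_game_id(date(2017, 10, 17), 'CLE')  ->  '201710170CLE'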