def test_unseen(self):
    test_datasets = ['twitter', 'hotel']
    laptop_texts_raw_indices, laptop_texts_raw_without_aspects_indices, laptop_texts_left_indices, laptop_texts_left_with_aspects_indices, \
    laptop_aspects_indices, laptop_texts_right_indices, laptop_texts_right_with_aspects_indices, laptop_dataset_index, \
    laptop_polarities_matrix, laptop_polarities = \
        read_dataset(types=[test_datasets[0]], mode='test',
                     embedding_dim=self.EMBEDDING_DIM,
                     max_seq_len=self.MAX_SENTENCE_LENGTH,
                     max_aspect_len=self.MAX_ASPECT_LENGTH)
    hotel_texts_raw_indices, hotel_texts_raw_without_aspects_indices, hotel_texts_left_indices, hotel_texts_left_with_aspects_indices, \
    hotel_aspects_indices, hotel_texts_right_indices, hotel_texts_right_with_aspects_indices, hotel_dataset_index, \
    hotel_polarities_matrix, hotel_polarities = \
        read_dataset(types=[test_datasets[1]], mode='test',
                     embedding_dim=self.EMBEDDING_DIM,
                     max_seq_len=self.MAX_SENTENCE_LENGTH,
                     max_aspect_len=self.MAX_ASPECT_LENGTH)

    polarities = [laptop_polarities, hotel_polarities]
    for j in range(len(polarities)):
        print("Dataset id : {}".format(test_datasets[j]))
        test_unique_elements, test_counts_elements = np.unique(polarities[j], return_counts=True)
        for i in range(len(test_unique_elements)):
            print("Sentiment : {0} , Count : {1}".format(test_unique_elements[i], test_counts_elements[i]))

    self.model.evaluate(
        [laptop_texts_left_indices, laptop_texts_right_indices, laptop_dataset_index, laptop_aspects_indices],
        [laptop_polarities_matrix, laptop_polarities_matrix, laptop_polarities_matrix],
        steps=1)
    self.model.evaluate(
        [hotel_texts_left_indices, hotel_texts_right_indices, hotel_dataset_index, hotel_aspects_indices],
        [hotel_polarities_matrix, hotel_polarities_matrix, hotel_polarities_matrix],
        steps=1)
def main():
    # set mode
    try:
        mode = sys.argv[1]
        assert (mode == 'dnn' or mode == 'cnn')
    except:
        print('Error: Model mode not found')
        exit()

    read_dataset()  # load data

    try:
        img_name = sys.argv[2]
        img = mpimg.imread(img_name)
    except:
        print('Error: Img not found')
        exit()

    img = np.array([img])
    o_shape = img.shape
    img = np.reshape(img, (1, -1))
    # img = preprocessing.scale(img)
    img = (img - 0.5) * 4
    img = np.reshape(img, (1, 48, 48, 1))

    classes = ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']

    # load model
    emotion_classifier = model.build_model(mode)
    emotion_classifier.load_weights(mode + '.h5')

    # predict
    predictions = emotion_classifier.predict_classes(img)
    print(classes[predictions[0]])
def train_and_evaluate(output_dir, hparams):
    EVAL_INTERVAL = 30
    run_config = tf.estimator.RunConfig(save_checkpoints_secs=EVAL_INTERVAL,
                                        keep_checkpoint_max=3,
                                        save_summary_steps=25)
    estimator = tf.estimator.Estimator(model_fn=sequence_regressor,
                                       params=hparams,
                                       model_dir=output_dir,
                                       config=run_config)
    train_spec = tf.estimator.TrainSpec(
        input_fn=utils.read_dataset(hparams['train_set'],
                                    mode=tf.estimator.ModeKeys.TRAIN,
                                    batch_size=hparams['batch_size'],
                                    timeserie_column=TIMESERIES_COL),
        max_steps=hparams['training_steps'])
    exporter = tf.estimator.LatestExporter('exporter', serving_input_fn)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=utils.read_dataset(hparams['eval_set'],
                                    mode=tf.estimator.ModeKeys.EVAL,
                                    batch_size=hparams['batch_size'],
                                    timeserie_column=TIMESERIES_COL),
        steps=hparams['eval_steps'],
        start_delay_secs=60,          # start evaluating after N seconds
        throttle_secs=EVAL_INTERVAL,  # evaluate every N seconds
        exporters=exporter)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
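# The LatestExporter above references a serving_input_fn that is not part of
# this snippet. A minimal sketch of what it could look like for a single float
# time-series feature, assuming TF 1.x (`import tensorflow as tf`) and that the
# model reads its input from TIMESERIES_COL; the actual placeholder shape
# depends on the window length expected by sequence_regressor:
def serving_input_fn():
    feature_placeholders = {
        TIMESERIES_COL: tf.placeholder(tf.float32, [None, None])
    }
    features = dict(feature_placeholders)
    return tf.estimator.export.ServingInputReceiver(features, feature_placeholders)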
def get_dataset(args, fold):
    tr_X, tr_y = read_dataset(os.path.join(args.dataset, '%03d' % fold), "train.csv")
    ts_X, ts_y = read_dataset(os.path.join(args.dataset, '%03d' % fold), "test.csv")
    return np.asarray(tr_X), np.asarray(tr_y), np.asarray(ts_X), \
        np.asarray(ts_y)
def run_network(window, model=None, save_model=False, show_plot=False):
    start_time = time.time()

    print('loading and preparing data set...')
    data = read_dataset('../datasets/internet-traffic-data-5minutes.csv')
    X_train, y_train, X_test, y_test, mean, std = split_dataset(
        data, window, ratio=0.90, standardize=True)
    print('number of training samples ', len(y_train))
    print('number of test samples ', len(y_test))

    if not model:
        print('initialize model...')
        model = compile_model(
            hidden_neurons=25,
            loss_fn='mse',
            input_dim=sum(1 for x in window if x),
            activation_fn='tanh')
    print('model ', model.summary())

    print('train model...')
    early_stopping = EarlyStopping(monitor='val_loss', patience=2)
    model.fit(X_train, y_train, nb_epoch=500, validation_split=0.1,
              callbacks=[early_stopping])

    print('make predictions...')
    prediction = model.predict(X_test).flatten()

    if show_plot:
        plot_result(prediction, y_test, mean, std)

    print('mase = ', mase(y_train, y_test, prediction))

    if save_model:
        store_model(model)

    print('total duration: {:.2f} seconds'.format(time.time() - start_time))
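# The mase() helper used above is defined elsewhere in the project. A minimal
# sketch of the standard Mean Absolute Scaled Error, assuming the in-sample
# one-step naive forecast is used as the scaling term (the project's helper may
# differ in details):
import numpy as np

def mase(y_train, y_test, prediction):
    y_train = np.asarray(y_train, dtype=float)
    y_test = np.asarray(y_test, dtype=float)
    prediction = np.asarray(prediction, dtype=float)
    # scale: mean absolute error of the naive (previous value) forecast on the training data
    scale = np.mean(np.abs(np.diff(y_train)))
    return np.mean(np.abs(y_test - prediction)) / scale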
def main(proc_num: int, no_limits: bool) -> None:
    df = read_dataset(no_limits)
    chunks = split_dataset_into_chunks(df, proc_num)
    with Pool(proc_num) as pool:
        pool.map(worker, chunks)
    for chunk in chunks:
        print(len(chunk.head()))
def forecasting_different_horizons():
    print('load data set...')
    data_df = read_dataset('../datasets/internet-traffic-data-5minutes.csv')
    mean = data_df.mean()
    std = data_df.std()
    data_df -= mean
    data_df /= std
    data = data_df['data (in bytes)']
    start_forecast_idx = int(len(data) * 0.90)
    test_set = data[start_forecast_idx:]

    print('calculate forecasts using an MLP neural network...')
    mlp_model = load_model('Saturday_192115', 'mse')
    window = create_window_array(139, 288)
    window_size = sum(1 for x in window if x)

    plt.ylabel('data (normalized)')
    plt.xlabel('time')
    plt.plot(test_set, 'r-', label='test set')

    for steps, style in [(1, 'g-'), (24, 'b--')]:
        forecast = []
        for t in range(start_forecast_idx, len(data) - steps):
            forecast.append(iterative_prediction(
                mlp_model, data[:t], (1, window_size), window, steps + 1))
        series = pd.Series([np.nan] * steps + forecast, index=test_set.index)
        plt.plot(series, style, label='h={}'.format(steps))

    plt.legend(loc='upper left')
    plt.show()
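# iterative_prediction() is project code not shown here. A minimal sketch of
# the underlying idea (recursive multi-step forecasting: predict one step,
# append the prediction to the history, repeat), assuming `window` is a boolean
# lag mask counted back from the most recent observation; the real helper may
# order or select the lags differently:
import numpy as np

def iterative_prediction(model, history, input_shape, window, steps):
    history = list(history)
    prediction = None
    for _ in range(steps):
        lags = np.array(history[-len(window):])        # the most recent len(window) values
        features = lags[::-1][np.array(window)][::-1]  # keep only the masked lags, in time order
        prediction = float(model.predict(features.reshape(input_shape)))
        history.append(prediction)                     # feed the forecast back in
    return prediction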
def train(self):
    tbCallBack = TensorBoard(log_dir='./lstm_logs', histogram_freq=0,
                             write_graph=True, write_images=True)

    def modelSave(epoch, logs):
        if (epoch + 1) % 5 == 0:
            self.model.save('lstm_saved_model.h5')
    msCallBack = LambdaCallback(on_epoch_end=modelSave)

    texts_raw_indices, texts_raw_without_aspects_indices, texts_left_indices, texts_left_with_aspects_indices, \
    aspects_indices, texts_right_indices, texts_right_with_aspects_indices, \
    polarities_matrix = \
        read_dataset(type=self.DATASET, mode='test',
                     embedding_dim=self.EMBEDDING_DIM,
                     max_seq_len=self.MAX_SEQUENCE_LENGTH,
                     max_aspect_len=self.MAX_ASPECT_LENGTH)

    self.model.fit(self.texts_raw_indices, self.polarities_matrix,
                   validation_data=(texts_raw_indices, polarities_matrix),
                   epochs=self.EPOCHS, batch_size=self.BATCH_SIZE,
                   callbacks=[tbCallBack])
def train(self):
    tbCallBack = TensorBoard(log_dir='./ram_logs', histogram_freq=0,
                             write_graph=True, write_images=True)
    texts_raw_indices, texts_left_indices, aspects_indices, texts_right_indices, polarities_matrix = \
        read_dataset(type=self.DATASET, mode='test',
                     embedding_dim=self.EMBEDDING_DIM,
                     max_seq_len=self.MAX_SEQUENCE_LENGTH,
                     max_aspect_len=self.MAX_ASPECT_LENGTH)

    for i in range(1, self.ITERATION):
        print()
        print('-' * 50)
        print('Iteration', i)
        self.model.fit([self.texts_raw_indices, self.aspects_indices],
                       self.polarities_matrix,
                       validation_data=([texts_raw_indices, aspects_indices], polarities_matrix),
                       batch_size=self.BATCH_SIZE,
                       callbacks=[tbCallBack])
        if i % 5 == 0:
            self.model.save('ram_saved_model.h5')
            print('model saved')
def __init__(self):
    self.DATASET = 'restaurant'  # 'twitter', 'restaurant', 'laptop'
    self.POLARITIES_DIM = 3
    self.EMBEDDING_DIM = 100
    self.LEARNING_RATE = 0.01
    self.LSTM_PARAMS = {
        'units': 200,
        'activation': 'tanh',
        'recurrent_activation': 'sigmoid',
        'kernel_initializer': initializers.RandomUniform(minval=-0.003, maxval=0.003),
        'recurrent_initializer': initializers.RandomUniform(minval=-0.003, maxval=0.003),
        'bias_initializer': initializers.RandomUniform(minval=-0.003, maxval=0.003),
        'kernel_regularizer': regularizers.l2(0.001),
        'recurrent_regularizer': regularizers.l2(0.001),
        'bias_regularizer': regularizers.l2(0.001),
        'dropout': 0,
        'recurrent_dropout': 0,
    }
    self.MAX_SEQUENCE_LENGTH = 80
    self.MAX_ASPECT_LENGTH = 2
    self.BATCH_SIZE = 200
    self.ITERATION = 500

    self.texts_raw_indices, self.texts_left_indices, self.aspects_indices, self.texts_right_indices, \
    self.polarities_matrix, \
    self.embedding_matrix, \
    self.tokenizer = \
        read_dataset(type=self.DATASET, mode='train',
                     embedding_dim=self.EMBEDDING_DIM,
                     max_seq_len=self.MAX_SEQUENCE_LENGTH,
                     max_aspect_len=self.MAX_ASPECT_LENGTH)
    self.left_input = np.concatenate((self.texts_left_indices, self.aspects_indices), axis=1)
    self.right_input = np.concatenate((self.texts_right_indices, self.aspects_indices), axis=1)

    if os.path.exists('td_lstm_saved_model.h5'):
        print('loading saved model...')
        self.model = load_model('td_lstm_saved_model.h5')
    else:
        print('Build model...')
        inputs_l = Input(shape=(self.MAX_SEQUENCE_LENGTH + self.MAX_ASPECT_LENGTH,))
        inputs_r = Input(shape=(self.MAX_SEQUENCE_LENGTH + self.MAX_ASPECT_LENGTH,))
        Embedding_Layer = Embedding(input_dim=len(self.tokenizer.word_index) + 1,
                                    output_dim=self.EMBEDDING_DIM,
                                    input_length=self.MAX_SEQUENCE_LENGTH + self.MAX_ASPECT_LENGTH,
                                    weights=[self.embedding_matrix],
                                    trainable=False)
        x_l = Embedding_Layer(inputs_l)
        x_r = Embedding_Layer(inputs_r)
        x_l = LSTM(**self.LSTM_PARAMS)(x_l)
        x_r = LSTM(**self.LSTM_PARAMS, go_backwards=True)(x_r)
        x = Concatenate()([x_l, x_r])
        x = Dense(self.POLARITIES_DIM)(x)
        predictions = Activation('softmax')(x)
        model = Model(inputs=[inputs_l, inputs_r], outputs=predictions)
        model.summary()
        model.compile(loss='categorical_crossentropy',
                      optimizer=optimizers.Adam(lr=self.LEARNING_RATE),
                      metrics=['acc'])
        # plot_model(model, to_file='model.png')
        self.model = model
def train_dataset(dataset_name, initial_design, mode, runs=1):
    data = read_dataset('../datasets/', dataset_name + '.csv')
    X_train, X_val, y_train, y_val = train_test_split(
        data.iloc[:, :-1].values, data[data.columns.to_list()[-1]])
    training_lower = np.min(X_train, axis=0)
    training_upper = np.max(X_train, axis=0)

    for i in range(runs):
        print('run', i + 1, 'of', runs)
        # TODO make universal for any model
        objective_function = ML(X_train=X_train, y_train=y_train,
                                X_val=X_val, y_val=y_val)
        # size: number of hyperparameters
        opt_lower = np.array([1, 0.00001, 0.0001, 50, 0.01, 0.09, 0.0999, 5])
        opt_upper = np.array([150, 0.01, 0.1, 300, 0.9, 0.9, 0.999, 15])
        n_init = 3  # number of points for the initial design
        init_design = init_latin_hypercube_sampling
        n_iterations = 100
        X_init = None  # mvp
        Y_init = None  # mvp
        maximizer = 'random'
        acquisition_func = 'log_ei'
        model_type = 'gp'

        result_path = ('../optimization_results/' + mode + '/f-score/' +
                       maximizer + '-' + acquisition_func + '-' + model_type +
                       '/' + dataset_name + '/run-' + str(i))
        if not os.path.exists(result_path):
            os.makedirs(result_path)

        d_name = dataset_name
        neighbor = get_nearest_names(1, d_name)[0]

        results = bayesian_optimization(objective_function, d_name, neighbor,
                                        opt_lower, opt_upper,
                                        num_iterations=n_iterations,
                                        initial_design=initial_design,
                                        X_init=X_init, Y_init=Y_init,
                                        maximizer=maximizer,
                                        acquisition_func=acquisition_func,
                                        model_type=model_type,
                                        n_init=3, rng=None,
                                        output_path=result_path)
        json.dump(results, open(os.path.join(result_path, 'RESULTS.json'), 'w'))
def main():
    ts_5minutes = read_dataset('../datasets/internet-traffic-data-5minutes.csv')
    ts_hourly = read_dataset('../datasets/internet-traffic-data-hourly.csv')
    ts_daily = read_dataset('../datasets/internet-traffic-data-daily.csv')

    plot_all_full_timeseries(ts_5minutes, ts_hourly, ts_daily)
    plot_full_timeseries(ts_5minutes)
    plot_full_timeseries(ts_hourly)
    plot_full_timeseries(ts_daily)
    plot_interval_of_timeseries(ts_5minutes, '2005-06-22', '2005-06-22')
    plot_interval_of_timeseries(ts_5minutes, '2005-07-04', '2005-07-10')
    plot_daily_means(ts_5minutes)
    plot_acf(ts_hourly, 200, 24, 'ACF hourly data (one week)')
    plot_acf(ts_5minutes, 300, 50, 'ACF 5 min, data (one day)')
def main(train, test, ngram, we):
    '''
    Given train and test dataset paths (train, test),
    an ngram string rule like "123" (ngram),
    and a word embedding file path (we).
    '''
    global times
    times = 0
    print 'loading data...'
    train_df = read_dataset(train)
    test_df = read_dataset(test)

    print 'cleaning data and adding lex indicators...'
    train_df[['text_lex', 'lex_ws']] = train_df.apply(add_lex_indicator, axis=1)
    test_df[['text_lex', 'lex_ws']] = test_df.apply(add_lex_indicator, axis=1)

    print "using train to build pos and neg dic..."
    pos_dic, neg_dic = build_dict(train_df, ngram)

    print "computing log-count ratio r..."
    dic, r, v = compute_ratio(pos_dic, neg_dic)

    print 'loading word embedding...'
    word2vec = load_bin_vec(we)

    print "building train and test features --- ngram part..."
    train_df.sort_index(inplace=True)
    test_df.sort_index(inplace=True)
    X_train_ngram, y_train = process_files_ngram(train_df, dic, r, v, ngram)
    X_test_ngram, y_test = process_files_ngram(test_df, dic, r, v, ngram)

    print "building train and test features --- pos embedding part..."
    X_train_embed = process_files_wemb(train_df, word2vec)
    X_test_embed = process_files_wemb(test_df, word2vec)

    print "combining log-count ratio and pos embedding features..."
    train_f = sp.hstack((X_train_ngram, X_train_embed), format='csr')
    test_f = sp.hstack((X_test_ngram, X_test_embed), format='csr')

    print "running model..."
    basemodel = LogisticRegression()
    f_score = model_run(basemodel, train_f, test_f, y_train, y_test)
    print '##############f_score is: ', f_score
    print 'model ended.'
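# compute_ratio() above is project code not shown in this snippet. A minimal
# sketch of the usual log-count ratio from NBSVM (Wang & Manning, 2012),
# assuming pos_dic/neg_dic map ngram -> count and using add-one smoothing; the
# project's actual (dic, r, v) return values may be organized differently:
import numpy as np

def compute_ratio(pos_dic, neg_dic, alpha=1.0):
    vocab = sorted(set(pos_dic) | set(neg_dic))
    dic = {ngram: idx for idx, ngram in enumerate(vocab)}  # ngram -> feature index
    p = np.array([pos_dic.get(ngram, 0) + alpha for ngram in vocab], dtype=float)
    q = np.array([neg_dic.get(ngram, 0) + alpha for ngram in vocab], dtype=float)
    r = np.log((p / p.sum()) / (q / q.sum()))              # log-count ratio per ngram
    return dic, r, len(vocab)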
def input_fn(dataset_filename, vocab_filename, num_channels=39, batch_size=8, num_epochs=1):
    dataset = utils.read_dataset(dataset_filename, num_channels)
    vocab_table = utils.create_vocab_table(vocab_filename)
    dataset = utils.process_dataset(
        dataset, vocab_table, utils.SOS, utils.EOS, batch_size, num_epochs)
    return dataset
def load_data():
    """
    Load data for training.
    :return: features and labels, cleaned.
    """
    print("Loading train dataset...")
    features = utils.read_dataset(const.SEMCOR_TRAIN)
    labels = utils.read_dataset(const.SEMCOR_LABEL)
    # clean sentences
    features = preprocess.clean_sentences(features)
    labels = preprocess.clean_sentences(labels)

    print("Loading dev dataset...")
    features_dev = utils.read_dataset(const.SE07_FEATURE)
    labels_dev = utils.read_dataset(const.SE07_LABEL)
    # clean sentences
    features_dev = preprocess.clean_sentences(features_dev)
    labels_dev = preprocess.clean_sentences(labels_dev)

    return features, features_dev, labels, labels_dev
def __init__(self):
    self.DATASET = 'twitter'  # 'twitter', 'restaurant', 'laptop'
    self.POLARITIES_DIM = 3
    self.EMBEDDING_DIM = 100
    self.LEARNING_RATE = 0.01
    self.INITIALIZER = initializers.RandomUniform(minval=-0.003, maxval=0.003)
    self.REGULARIZER = regularizers.l2(0.001)
    self.LSTM_PARAMS = {
        'units': 200,
        'activation': 'tanh',
        'recurrent_activation': 'sigmoid',
        'kernel_initializer': self.INITIALIZER,
        'recurrent_initializer': self.INITIALIZER,
        'bias_initializer': self.INITIALIZER,
        'kernel_regularizer': self.REGULARIZER,
        'recurrent_regularizer': self.REGULARIZER,
        'bias_regularizer': self.REGULARIZER,
        'dropout': 0,
        'recurrent_dropout': 0,
    }
    self.MAX_SEQUENCE_LENGTH = 80
    self.MAX_ASPECT_LENGTH = 10
    self.BATCH_SIZE = 200
    self.EPOCHS = 100

    self.texts_raw_indices, self.texts_raw_without_aspects_indices, self.texts_left_indices, self.texts_left_with_aspects_indices, \
    self.aspects_indices, self.texts_right_indices, self.texts_right_with_aspects_indices, \
    self.polarities_matrix, \
    self.embedding_matrix, \
    self.tokenizer = \
        read_dataset(type=self.DATASET, mode='train',
                     embedding_dim=self.EMBEDDING_DIM,
                     max_seq_len=self.MAX_SEQUENCE_LENGTH,
                     max_aspect_len=self.MAX_ASPECT_LENGTH)

    if os.path.exists('lstm_saved_model.h5'):
        print('loading saved model...')
        self.model = load_model('lstm_saved_model.h5')
    else:
        print('Build model...')
        inputs = Input(shape=(self.MAX_SEQUENCE_LENGTH,))
        x = Embedding(input_dim=len(self.tokenizer.word_index) + 1,
                      output_dim=self.EMBEDDING_DIM,
                      input_length=self.MAX_SEQUENCE_LENGTH,
                      weights=[self.embedding_matrix],
                      trainable=False)(inputs)
        x = LSTM(**self.LSTM_PARAMS)(x)
        x = Dense(self.POLARITIES_DIM)(x)
        predictions = Activation('softmax')(x)
        model = Model(inputs, predictions)
        model.summary()
        model.compile(loss='categorical_crossentropy',
                      optimizer=optimizers.Adam(lr=self.LEARNING_RATE),
                      metrics=['acc', f1])
        # plot_model(model, to_file='model.png')
        self.model = model
def test_unseen(self):
    laptop_texts_raw_indices, laptop_texts_raw_without_aspects_indices, laptop_texts_left_indices, laptop_texts_left_with_aspects_indices, \
    laptop_aspects_indices, laptop_texts_right_indices, laptop_texts_right_with_aspects_indices, laptop_dataset_index, \
    laptop_polarities_matrix, laptop_polarities = \
        read_dataset(types=['twitter'], mode='test',
                     embedding_dim=self.EMBEDDING_DIM,
                     max_seq_len=self.MAX_SENTENCE_LENGTH,
                     max_aspect_len=self.MAX_ASPECT_LENGTH)
    self.model.evaluate(
        [laptop_texts_left_indices, laptop_texts_right_indices, laptop_dataset_index],
        [laptop_polarities_matrix, laptop_polarities_matrix, laptop_polarities_matrix],
        steps=1)

    hotel_texts_raw_indices, hotel_texts_raw_without_aspects_indices, hotel_texts_left_indices, hotel_texts_left_with_aspects_indices, \
    hotel_aspects_indices, hotel_texts_right_indices, hotel_texts_right_with_aspects_indices, hotel_dataset_index, \
    hotel_polarities_matrix, hotel_polarities = \
        read_dataset(types=['hotel'], mode='test',
                     embedding_dim=self.EMBEDDING_DIM,
                     max_seq_len=self.MAX_SENTENCE_LENGTH,
                     max_aspect_len=self.MAX_ASPECT_LENGTH)
    self.model.evaluate(
        [hotel_texts_left_indices, hotel_texts_right_indices, hotel_dataset_index],
        [hotel_polarities_matrix, hotel_polarities_matrix, hotel_polarities_matrix],
        steps=1)
def test(self):
    tw_texts_raw_indices, tw_texts_raw_without_aspects_indices, tw_texts_left_indices, tw_texts_left_with_aspects_indices, \
    tw_aspects_indices, tw_texts_right_indices, tw_texts_right_with_aspects_indices, tw_dataset_index, \
    tw_polarities_matrix, tw_polarities = \
        read_dataset(types=self.DATASET, mode='test',
                     embedding_dim=self.EMBEDDING_DIM,
                     max_seq_len=self.MAX_SENTENCE_LENGTH,
                     max_aspect_len=self.MAX_ASPECT_LENGTH)
    self.model.evaluate([tw_texts_left_indices, tw_texts_right_indices],
                        tw_polarities, steps=1)
def main():
    '''
    Main function.
    '''
    args = parse_args()
    dataset = read_dataset(args.file, args.separator, args.colobs,
                           args.varnames, args.indices)
    discretized = discretize(dataset)
    write_dataset(discretized, args.out, args.separator, args.colobs,
                  args.varnames, args.indices)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--top_n', type=int, required=False,
                        default=TOP_N, help='Number of documents to use.')
    FLAGS, _ = parser.parse_known_args()
    top_n = FLAGS.top_n

    train_dataset = utils.read_dataset(TRAIN_DATA_PATH, top_n)
    train_dataset = utils.augment_with_permutations(train_dataset)
    train_data, train_labels = utils.to_numpy(train_dataset, top_n)
    del train_dataset

    val_dataset = utils.read_dataset(VAL_DATA_PATH, top_n)
    val_data, val_labels = utils.to_numpy(val_dataset, top_n)
    del val_dataset

    model = arch.get_model(top_n)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    model.summary()

    filepath = os.path.join(
        MODELS_DIR,
        "model-" + str(uuid.uuid4()) + "-{epoch:04d}-{val_loss:.4f}-{val_acc:.4f}.hdf5")
    save_model = ModelCheckpoint(filepath, monitor='val_acc', verbose=0,
                                 save_best_only=False, mode='max')

    model.fit(train_data, train_labels,
              validation_data=(val_data, val_labels),
              batch_size=128, epochs=75, verbose=1,
              callbacks=[save_model])
def prepare_dataset(dataset_name):
    data = read_dataset('../datasets/', dataset_name + '.csv')
    for i in range(len(data.columns) - 1):
        column = data.columns[i]
        data[column] = pd.factorize(data[column])[0] + 1
    # shuffle = data.sample(frac=1)  # shuffle rows
    # if len(shuffle) > 5000:
    #     shuffle = shuffle[:5000]  # cut objects
    # shuffle.to_csv('../datasets/' + dataset_name + '.csv', index=False)
    data.to_csv('../datasets/' + dataset_name + '.csv', index=False)
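# For reference, pd.factorize encodes each distinct value of a column as an
# integer code (with -1 for missing values), so the `+ 1` above shifts the
# codes to start at 1 and maps missing values to 0. A small self-contained
# illustration with example data (not from the project):
import pandas as pd

codes, uniques = pd.factorize(pd.Series(['low', 'high', 'low', None]))
print(codes)          # [ 0  1  0 -1]
print(codes + 1)      # [ 1  2  1  0]  -> missing values become 0
print(list(uniques))  # ['low', 'high']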
def train(model, ckpt, manager, dataset_dir, n_epoch):
    for n in range(n_epoch):
        dataset = utils.read_dataset(dataset_dir)
        print(f"Epoch: {n}")
        for image, labels, image_id in dataset:
            current_loss = train_step(model, image, labels)
            ckpt.step.assign_add(1)
            if int(ckpt.step) % 10 == 0:
                save_path = manager.save()
                print("Checkpoint stored at step {}: {}".format(
                    int(ckpt.step), save_path))
                print("loss {:1.2f}".format(current_loss.numpy()))
def read_model():
    start_time = time.time()
    print('loading and preparing data set...')
    data_5m = read_dataset('../datasets/internet-traffic-data-5minutes.csv')
    data_hoursly = read_dataset('../datasets/internet-traffic-data-hourly.csv')
    data_daily = read_dataset('../datasets/internet-traffic-data-daily.csv')

    # TODO: change this when merging...
    data_5m = data_5m['data (in bytes)'].tolist()
    data_daily = data_daily['data (in bytes)'].tolist()
    data_hoursly = data_hoursly['data (in bytes)'].tolist()

    rest_5m = int(len(data_5m) * 0.80) % 288
    start_forecast_idx_5m = int(len(data_5m) * 0.90) - rest_5m
    train_5m = data_5m[:start_forecast_idx_5m]
    test_5m = data_5m[start_forecast_idx_5m:]

    rest_hoursly = int(len(data_hoursly) * 0.80) % 168
    start_forecast_idx_hoursly = int(len(data_hoursly) * 0.90) - rest_hoursly
    train_hoursly = data_hoursly[:start_forecast_idx_hoursly]
    test_hoursly = data_hoursly[start_forecast_idx_hoursly:]

    rest_daily = int(len(data_daily) * 0.80) % 7
    start_forecast_idx_daily = int(len(data_daily) * 0.90) - rest_daily
    train_daily = data_daily[:start_forecast_idx_daily]
    test_daily = data_daily[start_forecast_idx_daily:]

    print('5min: ', len(train_5m), " ", len(test_5m))
    print('hourly: ', len(train_hoursly), " ", len(test_hoursly))
    print('daily: ', len(train_daily), " ", len(test_daily), "\n")

    plot_daily(train_daily, test_daily)
    plot_hoursly(train_hoursly, test_hoursly)
    plot_5min(train_5m, test_5m)

    # errors
    plot_errors(train_5m, test_5m, train_hoursly, test_hoursly,
                train_daily, test_daily)
def main():
    args = parser.parse_args()
    with open(args.classifier, 'r') as f:
        serialized_classifier = f.read()
    processor = TextProcessor()
    classifier = Classifier.load(serialized_classifier, processor)
    for example in read_dataset(args.data):
        text = example['content']
        predicted_tag = classifier.classify(text)
        print predicted_tag
def main():
    args = parser.parse_args()
    data_json = read_dataset(args.data)
    processor = TextProcessor()
    classifier = Classifier(processor)
    classifier.train(data_json)
    serialized_classifier = classifier.dump()
    ensure_directory(args.output)
    with open(args.output, 'w') as f:
        f.write(serialized_classifier)
        f.write(os.linesep)
def get_weak_scaling_datasets(basedir, datasets, weak, scratch):
    if len(datasets) > 1:
        raise RuntimeError('Weak scaling with more than one data set is not yet supported')
    dname = datasets[0]
    dataset = all_datasets[dname]
    read = read_dataset(join(basedir, dataset[0]), dataset[3], dataset[4],
                        dataset[5], dataset[6])
    weak_datasets = []
    for n in weak:
        dataset_n = list(dataset)
        dataset_n[0] = join(scratch, basename(dataset[0]) + '.' + str(n))
        dataset_n[1] = n
        write_dataset(read.iloc[:, :n], dataset_n[0], dataset_n[3],
                      dataset_n[4], dataset_n[5], dataset_n[6])
        all_datasets.update([(dname + '.' + str(n), tuple(dataset_n))])
        weak_datasets.append(dname + '.' + str(n))
    return weak_datasets
def main():
    args = parser.parse_args()
    data_json = read_dataset(args.data)
    random.shuffle(data_json)
    training_set_ratio = 0.7
    training_set_size = int(training_set_ratio * len(data_json) + 0.5)
    training_set = data_json[:training_set_size]
    test_set = data_json[training_set_size:]
    processor = TextProcessor()
    classifier = Classifier(processor)
    classifier.train(training_set)
    print classifier.dump() == Classifier.load(classifier.dump(), processor).dump()
def test_model(dataset_dir, model: model.RCNN, output_dir, category_colors):
    total_accuracy = 0
    class_correct_counts = np.zeros(model.num_classes)
    class_total_counts = np.zeros(model.num_classes)
    i = 0
    for image, labels, img_id in utils.read_dataset(dataset_dir):
        i += 1
        start_time = time.time()
        accuracy = 0.0
        h, w = labels.shape
        input_image = np.append(
            image,
            np.zeros(shape=[h, w, model.num_classes], dtype=np.float32),
            axis=2)
        model([input_image])
        logits1, logits2 = model.logits
        logits = logits1 if model.num_layers == 1 else logits2
        stride = 16 if model.model_v == 1 else 4

        predicted_labels = np.argmax(logits, axis=3)
        true_labels = labels[::stride, ::stride]
        correct_labels = np.equal(predicted_labels, true_labels)
        accuracy = np.mean(correct_labels)
        total_accuracy += accuracy
        for c in range(model.num_classes):
            current_class_labels = np.equal(true_labels, c)
            class_total_counts[c] += np.sum(current_class_labels)
            class_correct_counts[c] += np.sum(
                np.equal(true_labels, c) * correct_labels)
        print("Image #%d: %s: Accuracy: %f (time: %.1fs)" % (
            i, img_id, accuracy, time.time() - start_time))

        for layer_num in [1, 2]:
            output_filename = os.path.join(
                output_dir, img_id + '_test_%d.png' % layer_num)
            utils.save_labels_array(predicted_labels.astype(np.uint8),
                                    output_filename, colors=category_colors)

    print("%d Images, Total Accuracy: %f" % (i, total_accuracy / i))
    print("Per Class correct counts:", class_correct_counts)
    print("Per Class totals:", class_total_counts)
    print("Per Class accuracy:", class_correct_counts / class_total_counts)
def read_data(self, max_train_size, max_dev_size, read_ahead=10,
              batch_mode='standard', shuffle=True, crash_test=False, **kwargs):
    utils.debug('reading training data')
    self.batch_iterator, self.train_size = utils.get_batch_iterator(
        self.filenames.train, self.extensions, self.vocabs, self.batch_size,
        max_size=max_train_size, character_level=self.character_level,
        max_seq_len=self.max_len, read_ahead=read_ahead, mode=batch_mode,
        shuffle=shuffle, binary=self.binary, crash_test=crash_test)

    utils.debug('reading development data')
    dev_sets = [
        utils.read_dataset(dev, self.extensions, self.vocabs,
                           max_size=max_dev_size,
                           character_level=self.character_level,
                           binary=self.binary)[0]
        for dev in self.filenames.dev
    ]
    # subset of the dev set whose loss is periodically evaluated
    self.dev_batches = [utils.get_batches(dev_set, batch_size=self.batch_size)
                        for dev_set in dev_sets]
def train(self):
    # tbCallBack = TensorBoard(log_dir='./ram_logs', histogram_freq=0, write_graph=True, write_images=True)
    def modelSave(epoch, logs):
        if (epoch + 1) % 5 == 0:
            self.model.save('lstm_saved_model.h5')
    msCallBack = LambdaCallback(on_epoch_end=modelSave)

    texts_raw_indices, texts_raw_without_aspects_indices, texts_left_indices, texts_left_with_aspects_indices, \
    aspects_indices, texts_right_indices, texts_right_with_aspects_indices, \
    polarities_matrix = \
        read_dataset(type=self.DATASET, mode='test',
                     embedding_dim=self.EMBEDDING_DIM,
                     max_seq_len=self.MAX_SEQUENCE_LENGTH,
                     max_aspect_len=self.MAX_ASPECT_LENGTH)

    self.model.fit([self.texts_raw_indices, self.aspects_indices],
                   self.polarities_matrix,
                   epochs=self.EPOCHS, batch_size=self.BATCH_SIZE,
                   callbacks=[msCallBack])
    scores = self.model.evaluate([texts_raw_indices, aspects_indices],
                                 polarities_matrix, verbose=0)
    print("Loss :", scores[0], "Accuracy", scores[1] * 100)
def input_fn(dataset_filename, vocab_filename, norm_filename=None, num_channels=39,
             batch_size=8, num_epochs=1, binf2phone=None, num_parallel_calls=32,
             max_frames=-1, max_symbols=-1):
    binary_targets = binf2phone is not None
    labels_shape = [] if not binary_targets else len(binf2phone.index)
    labels_dtype = tf.string if not binary_targets else tf.float32
    dataset = utils.read_dataset(dataset_filename, num_channels,
                                 labels_shape=labels_shape,
                                 labels_dtype=labels_dtype)
    vocab_table = utils.create_vocab_table(vocab_filename)

    if norm_filename is not None:
        means, stds = utils.load_normalization(norm_filename)
    else:
        means = stds = None

    sos = binf2phone[utils.SOS].values if binary_targets else utils.SOS
    eos = binf2phone[utils.EOS].values if binary_targets else utils.EOS

    dataset = utils.process_dataset(dataset, vocab_table, sos, eos, means, stds,
                                    batch_size, num_epochs,
                                    binary_targets=binary_targets,
                                    labels_shape=labels_shape,
                                    num_parallel_calls=num_parallel_calls,
                                    max_frames=max_frames,
                                    max_symbols=max_symbols)
    return dataset
def main():
    args = parser.parse_args()
    full_data_json = read_dataset(args.data)
    # for n in xrange(30, len(full_data_json), 30):
    for n in [len(full_data_json)]:
        corrects = 0
        total = 0
        for _ in xrange(SAMPLES):
            random.shuffle(full_data_json)
            data_json = full_data_json[:n]
            training_set_ratio = 0.7
            training_set_size = int(training_set_ratio * len(data_json) + 0.5)
            training_set = data_json[:training_set_size]
            test_set = data_json[training_set_size:]
            processor = TextProcessor()
            classifier = Classifier(processor)
            classifier.train(training_set)
            for example in test_set:
                text = example["content"]
                predicted_tag = classifier.classify(text)
                expected_tag = classifier.normalize_tag_label(example["tag"])
                if expected_tag in Classifier.IGNORE_TAGS:
                    continue
                if predicted_tag == expected_tag:
                    corrects += 1
                else:
                    # print 'expected = {}, predicted = {}'.format(expected_tag, predicted_tag)
                    pass
                total += 1
        print "{} {}".format(len(data_json), float(corrects) / total)
def input_fn(dataset_filename, vocab_filename, norm_filename=None, num_channels=39,
             batch_size=8, take=0, binf2phone=None):
    binary_targets = binf2phone is not None
    labels_shape = [] if not binary_targets else len(binf2phone.index)
    labels_dtype = tf.string if not binary_targets else tf.float32
    dataset = utils.read_dataset(dataset_filename, num_channels,
                                 labels_shape=labels_shape,
                                 labels_dtype=labels_dtype)
    vocab_table = utils.create_vocab_table(vocab_filename)

    if norm_filename is not None:
        means, stds = utils.load_normalization(norm_filename)
    else:
        means = stds = None

    sos = binf2phone[utils.SOS].values if binary_targets else utils.SOS
    eos = binf2phone[utils.EOS].values if binary_targets else utils.EOS

    dataset = utils.process_dataset(dataset, vocab_table, sos, eos, means, stds,
                                    batch_size, 1,
                                    binary_targets=binary_targets,
                                    labels_shape=labels_shape, is_infer=True)

    if take > 0:
        dataset = dataset.take(take)
    return dataset
def main():
    args = parser.parse_args()
    full_data_json = read_dataset(args.data)
    # for n in xrange(30, len(full_data_json), 30):
    for n in [len(full_data_json)]:
        corrects = 0
        total = 0
        for _ in xrange(SAMPLES):
            random.shuffle(full_data_json)
            data_json = full_data_json[:n]
            training_set_ratio = 0.7
            training_set_size = int(training_set_ratio * len(data_json) + 0.5)
            training_set = data_json[:training_set_size]
            test_set = data_json[training_set_size:]
            processor = TextProcessor()
            classifier = Classifier(processor)
            classifier.train(training_set)
            for example in test_set:
                text = example['content']
                predicted_tag = classifier.classify(text)
                expected_tag = classifier.normalize_tag_label(example['tag'])
                if expected_tag in Classifier.IGNORE_TAGS:
                    continue
                if predicted_tag == expected_tag:
                    corrects += 1
                else:
                    # print 'expected = {}, predicted = {}'.format(expected_tag, predicted_tag)
                    pass
                total += 1
        print '{} {}'.format(len(data_json), float(corrects) / total)
def hyper_parameter_search(max_evals=100):
    from hyperopt import fmin, tpe, hp, STATUS_OK, STATUS_FAIL
    data = read_dataset('../datasets/internet-traffic-data-5minutes.csv')
    space = {
        'nneurons': hp.randint('nneurons', 41),
        'window': hp.randint('window', 2048),
        'season': hp.choice('season', ['no_season', 'half_day', 'full_day']),
        'activation_function': hp.choice('func', ['sigmoid', 'tanh', 'relu'])
    }

    def objective(params):
        nneurons = params['nneurons']
        if params['season'] == 'full_day':
            window = create_window_array(params['window'], season_lag=288)
        elif params['season'] == 'half_day':
            window = create_window_array(params['window'], season_lag=168)
        else:
            window = create_window_array(params['window'])
        if not any(window) or nneurons < 2:
            return {'status': STATUS_FAIL}
        X_train, y_train, *_ = split_dataset(
            data, window, ratio=0.90, standardize=True)
        model = compile_model(
            nneurons,
            input_dim=sum(1 for x in window if x),
            loss_fn='mse',
            activation_fn=params['activation_function'])
        hist = model.fit(
            X_train, y_train, nb_epoch=50, validation_split=0.1,
            callbacks=[EarlyStopping(monitor='val_loss', patience=2)],
            verbose=0)
        return {'loss': hist.history['val_loss'][-1], 'status': STATUS_OK}

    return fmin(objective, space=space, algo=tpe.suggest, max_evals=max_evals)
def parse_datasets(args):
    '''
    Get datasets to be used for the experiments.
    '''
    from os.path import splitext
    experiment_datasets = []
    if args.dataset is None:
        args.dataset = list(big_datasets.keys())
    for d in args.dataset:
        if d in all_datasets:
            experiment_datasets.append(d)
        elif d in dataset_groups:
            experiment_datasets.extend(list(dataset_groups[d].keys()))
        else:
            try:
                rd = read_dataset(d, args.separator, args.colobs,
                                  args.varnames, args.indices)
                m, n = rd.shape
                name = splitext(basename(d))[0]
                all_datasets[name] = (d, n, m, args.separator, args.colobs,
                                      args.varnames, args.indices)
                experiment_datasets.append(name)
            except:
                raise RuntimeError('Dataset %s is not recognized' % d)
    args.dataset = experiment_datasets
def forecasting_error_experiment():
    print('load data set...')
    data_df = read_dataset('../datasets/internet-traffic-data-5minutes.csv')
    mean = data_df.mean()
    std = data_df.std()
    data_df -= mean
    data_df /= std
    data = data_df['data (in bytes)']
    start_forecast_idx = int(len(data) * 0.90)
    training_data = data[:start_forecast_idx]
    test_data = data[start_forecast_idx:]

    print('calculate forecasts using the naive method...')
    naive_forecast = [data[t - 1] for t in range(start_forecast_idx, len(data))]
    naive_forecast_errors = []
    for steps in trange(24):
        forecast = naive_forecast[:len(naive_forecast) - steps]
        error = mase(training_data, test_data[steps:], forecast)
        naive_forecast_errors.append(error)

    print('calculate forecasts using an MLP neural network...')
    mlp_model = load_model('Saturday_192115', 'mse')
    window = create_window_array(139, 288)
    window_size = sum(1 for x in window if x)
    mlp_forecast_errors = []
    for steps in trange(24):
        mlp_forecast = []
        for t in range(start_forecast_idx, len(data) - steps):
            mlp_forecast.append(iterative_prediction(
                mlp_model, data[:t], (1, window_size), window, steps + 1))
        mlp_forecast_errors.append(
            mase(training_data, test_data[steps:], mlp_forecast))

    print('calculate forecasts using an LSTM neural network...')
    lstm_model = load_model('Saturday_181721', 'mse')
    window = [True] * 19
    lstm_forecast_errors = []
    for steps in trange(24):
        lstm_forecast = []
        for t in range(start_forecast_idx, len(data) - steps):
            lstm_forecast.append(iterative_prediction(
                lstm_model, data[:t], (1, len(window), 1), window, steps + 1))
        lstm_forecast_errors.append(
            mase(training_data, test_data[steps:], lstm_forecast))

    print('calculate forecasts using a deep LSTM neural network...')
    lstm_model = load_model('Saturday_171936', 'mse')
    window = [True] * 14
    deep_lstm_forecast_errors = []
    for steps in trange(24):
        lstm_forecast = []
        for t in range(start_forecast_idx, len(data) - steps):
            lstm_forecast.append(iterative_prediction(
                lstm_model, data[:t], (1, len(window), 1), window, steps + 1))
        deep_lstm_forecast_errors.append(
            mase(training_data, test_data[steps:], lstm_forecast))

    plt.ylabel('Error')
    plt.xlabel('Steps')
    plt.plot(naive_forecast_errors, label='naïve')
    plt.plot(mlp_forecast_errors, label='MLP')
    plt.plot(lstm_forecast_errors, label='1 layer LSTM')
    plt.plot(deep_lstm_forecast_errors, label='2 layer LSTM')
    plt.legend(loc='upper left')
    plt.show()
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor

import utils

trainf, trainl, testf, test_ids, feature_names = utils.read_dataset('data/factorized1.npz')

extra_trees = ExtraTreesRegressor(n_estimators=100, max_features=50, max_depth=35,
                                  min_samples_leaf=4, n_jobs=4)

y, trainy = utils.cross_val_model(extra_trees, trainf, trainl, testf)
y = y.reshape((len(y), 1))
trainy = trainy.reshape((len(trainy), 1))

utils.save_dataset("data/extra_trees_factorized.npz",
                   train_features=trainy, train_labels=trainl,
                   test_features=y, ids=test_ids,
                   feature_names=['extra_trees'])
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor

import utils

trainf, trainl, testf, test_ids, feature_names = utils.read_dataset('data/no_big_cats1.npz')

extra_trees = ExtraTreesRegressor(n_estimators=100, max_features=.3, max_depth=30,
                                  min_samples_leaf=4, n_jobs=4)

y, trainy = utils.cross_val_model(extra_trees, trainf, trainl, testf)
y = y.reshape((len(y), 1))
trainy = trainy.reshape((len(trainy), 1))

utils.save_dataset("data/random_forest2.npz",
                   train_features=trainy, train_labels=trainl,
                   test_features=y, ids=test_ids,
                   feature_names=['random_forest'])