def get_split_entropy(dataset, feature, split):
    '''
    Calculate the entropy of the two datasets formed by splitting `dataset`
    according to `split` and `feature`. Note that we don't need to normalise
    the total entropy (normalisation is arbitrary).
    '''
    left_dataset, right_dataset = helpers.split_dataset(dataset, split, feature)
    return (get_entropy(left_dataset) * len(left_dataset)
            + get_entropy(right_dataset) * len(right_dataset))
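
# For reference, a minimal sketch of what `get_entropy` above might compute,
# assuming Shannon entropy over class labels stored in the last column of a
# NumPy array; the actual helper may differ.
import numpy as np

def get_entropy(dataset):
    # count occurrences of each label (assumed to sit in the last column)
    _, counts = np.unique(dataset[:, -1], return_counts=True)
    probs = counts / counts.sum()
    # Shannon entropy in bits
    return -np.sum(probs * np.log2(probs))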
def make_algorithms(list_algorithms, patient_x, patient_y, set_split=0.66):
    split_index = int(len(patient_x) * set_split)
    train_x, train_y, test_x, test_y = split_dataset(patient_x, patient_y,
                                                     split_index)
    trained_models = train_algorithms(list_algorithms, train_x, train_y)
    tested_models = test_algorithms(trained_models, test_x)
    eval_models = eval_algorithms(tested_models, test_y)
    return eval_models
def train(): print "Training started ...." #metadata = data_utils.ourmodel.data_util.load_metadata() metadata, idx_q, idx_a = data_utils.ourmodel.data_util.load_data() (trainX, trainY), (testX, testY), (validX, validY) = helpers.split_dataset(idx_q, idx_a) model = create_model(metadata, trainX.shape[-1], trainY.shape[-1]) if FLAGS.celltype == 'GRU': if FLAGS.attention == False: ckpt_paths = 'ckpt/checkpoint/GRU/noAttention/' else: ckpt_paths = 'ckpt/checkpoint/GRU/Attention' else: if FLAGS.attention == False: ckpt_paths = 'ckpt/checkpoint/LSTM/noAttention/' else: ckpt_paths = 'ckpt/checkpoint/LSTM/Attention' print "Check if model exist already to retrieve" ckpt = tf.train.get_checkpoint_state(ckpt_paths) if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path): print("Reading model parameters from %s" % ckpt.model_checkpoint_path) sess = model.restore_last_session() else: print("Created model with fresh parameters.") batch_size = FLAGS.batch_size #val_batch_gen = helpers.rand_batch_gen(validX, validY, batch_size) #train_batch_gen = helpers.rand_batch_gen(trainX, trainY, batch_size) #sess = model.train(train_batch_gen, val_batch_gen) sess = model.train_batch_file(batch_size=batch_size) print "Training Complete"
def decode(): print "This is for interactive Version....." metadata, idx_q, idx_a = data_utils.ourmodel.data_util.load_data() (trainX, trainY), (testX, testY), (validX, validY) = helpers.split_dataset(idx_q, idx_a) model = create_model(metadata, trainX.shape[-1], trainY.shape[-1]) sess = model.restore_last_session() sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: #process input strings now" inputs = data_utils.ourmodel.data_util.get_tokens(sentence) fqtokens = [w for w in inputs if not w in stopwords.words('english')] processed_input = data_utils.ourmodel.data_util.zero_pad_single( fqtokens, metadata['w2idx']) #sess = model.restore_last_session() output = model.predict(sess, processed_input.T) #replies = [] for ii, ot in zip(processed_input, output.T): q = helpers.decode(sequence=ii, lookup=metadata['idx2w'], separator=' ') decoded = helpers.decode(sequence=ot, lookup=metadata['idx2w'], separator=' ').split(' ') #if decoded.count('unk') == 0: # if decoded not in replies: print('Review : [{0}]; Summary : [{1}]'.format( q, ' '.join(decoded))) sys.stdout.flush() sentence = sys.stdin.readline()
def decision_tree_learning(training_dataset, depth, right_moves=None):
    '''
    Return a node representing the optimal decision (w.r.t. information gain)
    for this dataset, together with the maximum depth reached below it.
    '''
    X, y = extract_X_y(training_dataset)
    # if all labels are the same, make this a leaf node
    if np.all(y == y[0]):
        return Node(depth, value=y[0], right_moves=right_moves), depth
    # find the optimal split
    split, feature_to_split = find_split(training_dataset)
    # if no split is defined, make the current node a leaf with the modal value
    if feature_to_split is None:
        return Node(depth, value=get_mode(y), right_moves=right_moves), depth
    # define the current node, then use recursion to define its children
    node = Node(depth, feature=feature_to_split, threshold=split,
                right_moves=right_moves)
    l_dataset, r_dataset = split_dataset(training_dataset, split,
                                         feature_to_split)
    l_branch, l_depth = decision_tree_learning(l_dataset, depth + 1,
                                               right_moves)
    if right_moves is not None:
        right_moves += 1
    r_branch, r_depth = decision_tree_learning(r_dataset, depth + 1,
                                               right_moves)
    node.left_child = l_branch
    node.right_child = r_branch
    # return the current node and the depth of its deepest subtree
    return node, max(l_depth, r_depth)
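
# A minimal sketch of a `find_split` consistent with `get_split_entropy`
# above: try the midpoints between consecutive sorted feature values and keep
# the split with the lowest weighted entropy. Names, the label-in-last-column
# layout, and the midpoint candidate scheme are assumptions.
def find_split(dataset):
    best_entropy, best_split, best_feature = float('inf'), None, None
    n_features = dataset.shape[1] - 1  # assume the label is the last column
    for feature in range(n_features):
        values = np.unique(dataset[:, feature])
        for lo, hi in zip(values[:-1], values[1:]):
            split = (lo + hi) / 2
            entropy = get_split_entropy(dataset, feature, split)
            if entropy < best_entropy:
                best_entropy, best_split, best_feature = entropy, split, feature
    # (None, None) if no candidate split exists, which the caller treats
    # as a signal to create a leaf node
    return best_split, best_feature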
def run():
    logging.basicConfig(format='%(message)s', level=logging.INFO,
                        filename='results.log', filemode='w')
    # load the pre-processed file
    df = read_csv('pre-processed-in-24-hours.csv', index_col=0,
                  parse_dates=True)
    for cell in [108 * 2 + 1]:
        for epoch in [1000]:  # [1000, 2000, 3000, 4000, 5000]
            for batch_size in [500]:  # [500, 1000, 1500]
                for n_input in [1, 2, 4, 8, 12, 16]:
                    for n_out in range(1, 9):
                        logging.info(
                            "Starting... cell {0}, epoch {1}, batch_size {2}, "
                            "input {3} and output {4}".format(
                                cell, epoch, batch_size, n_input, n_out))
                        try:
                            logging.info("Training {} {}".format(n_input, n_out))
                            # transform data
                            scaler, data_scaled = scale(df.values)
                            train, test = split_dataset(df.values, n_out)
                            train_scaled, test_scaled = split_dataset(
                                data_scaled, n_out)
                            # restructure into window size
                            train_scaled, test_scaled = restructure_data_by_window(
                                train_scaled, test_scaled, n_out)
                            train, test = restructure_data_by_window(
                                train, test, n_out)
                            # fit model
                            model = build_model(train_scaled, n_input, n_out,
                                                cell, epoch, batch_size)
                            # history is a list by window size
                            history_scaled = [x for x in train_scaled[:n_input, :, :]]
                            history = [x for x in train[:n_input, :, :]]
                            train_walk_foward_validation(
                                history, history_scaled, model, n_input,
                                scaler, train, train_scaled)
                            predictions_inverted = test_walk_foward_validation(
                                model, n_input, scaler, test, test_scaled,
                                train, train_scaled)
                            logging.info("predictions_inverted: {}".format(
                                predictions_inverted.shape))
                            logging.info("test {}".format(test.shape))
                            data = {
                                'predict': predictions_inverted.reshape(
                                    predictions_inverted.shape[0]
                                    * predictions_inverted.shape[1]),
                                'real': test[:, :, 0].reshape(
                                    test[:, :, 0].shape[0]
                                    * test[:, :, 0].shape[1]),
                            }
                            data['time'] = df.index[-data['predict'].shape[0]:]
                            df_plot = pandas.DataFrame.from_dict(data)
                            df_plot.to_csv('plot_results_{0}_{1}.csv'.format(
                                n_input, n_out))
                            plot_results(df_plot)
                            plot_scatter(df_plot)
                        except Exception as e:
                            logging.info(e)
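
# A hedged sketch of the time-series `split_dataset(values, n_out)` assumed
# above: hold out the tail for testing, trimming both parts to a multiple of
# the output window `n_out` so they reshape cleanly into windows. The 20%
# test fraction is a hypothetical default.
def split_dataset(values, n_out, test_frac=0.2):
    n_test = int(len(values) * test_frac) // n_out * n_out
    n_train = (len(values) - n_test) // n_out * n_out
    return values[:n_train], values[len(values) - n_test:]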
def self_test(): print " In Test Mode" metadata, idx_q, idx_a = data_utils.ourmodel.data_util.load_data() (trainX, trainY), (testX, testY), (validX, validY) = helpers.split_dataset(idx_q, idx_a) model = create_model(metadata, trainX.shape[-1], trainY.shape[-1]) if FLAGS.celltype == 'GRU': if FLAGS.attention == False: ckpt_paths = 'ckpt/checkpoint/GRU/noAttention/' else: ckpt_paths = 'ckpt/checkpoint/GRU/Attention/' else: if FLAGS.attention == False: ckpt_paths = 'ckpt/checkpoint/LSTM/noAttention/' else: ckpt_paths = 'ckpt/checkpoint/LSTM/Attention/' print "Retrieving Last Model State" XX = np.load('datasets/test_review.npy', mmap_mode='r') YY = np.load('datasets/test_summary.npy', mmap_mode='r') result = [[0 for x in range(6)] for y in range(XX.shape[0])] sess = model.restore_last_session() batch_size = 16 if sess: for i in range(0, XX.shape[0], batch_size): if (i + 1) + batch_size < XX.shape[0]: output = model.predict(sess, XX[i:(i + 1) + batch_size].T) nn = XX[i:(i + 1) + batch_size] for j in range(nn.shape[0]): result[i + j][0] = helpers.decode(sequence=XX[i + j], lookup=metadata['idx2w'], separator=' ') result[i + j][1] = helpers.decode(sequence=YY[i + j], lookup=metadata['idx2w'], separator=' ') result[i + j][2] = helpers.decode(sequence=output.T[j], lookup=metadata['idx2w'], separator=' ') if len(result[i + j][2]) == 0: result[i + j][2] = ['UNK'] if len(result[i + j][1]) != 0: result[i + j][3] = score.rouge_n( result[i + j][2], result[i + j][1], 1) result[i + j][4] = score.bleu(result[i + j][2], result[i + j][1], 1) result[i + j][5] = score.f1(result[i + j][3], result[i + j][4]) else: result[i + j][3] = result[i + j][4] = result[i + j][5] = 0 df = pd.DataFrame(result) df.columns = [ "Review", "Actual Summary", "Generated Summary", "Rogue1", "Bleu1", "F1" ] df = df[:-batch_size] print("Average Rogue-1 = %.3f, Max Rouge-1 =%.3f,Min Rogue-1 = %.3f" % (df["Rogue1"].mean(), df["Rogue1"].max(), df["Rogue1"].min())) print("Average Bleu1 = %.3f, Max Bleu1=%.3f,Min Bleu1 = %.3f" % (df["Bleu1"].mean(), df["Bleu1"].max(), df["Bleu1"].min())) print("Average F1 = %.3f, Max F1=%.3f,Min F1 = %.3f" % (df["F1"].mean(), df["F1"].max(), df["F1"].min())) result_file = 'results/default.csv' if FLAGS.celltype == 'GRU': if FLAGS.attention == False: result_file = 'results/GRU_noAttention.csv' else: result_file = 'results/GRU_Attention.csv' else: if FLAGS.attention == False: result_file = 'results/LSTM_noAttention.csv' else: result_file = 'results/LSTM_Attention' df.to_csv(result_file)
def test_split_data(self):
    data = load_csv(self.path)
    trainset, testset = split_dataset(data, test_ratio=0.4)
    self.assertEqual(trainset.shape, (6,))
    self.assertEqual(testset.shape, (4,))
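
# A minimal sketch of a `split_dataset(data, test_ratio)` consistent with the
# test above (10 rows -> 6 train / 4 test). Whether the real helper shuffles
# before splitting is an assumption; this version keeps the original order.
def split_dataset(data, test_ratio):
    n_test = int(len(data) * test_ratio)
    return data[:len(data) - n_test], data[len(data) - n_test:]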
# IMPORTS
import math
import torch
from torch import FloatTensor, LongTensor, Tensor

# our own written code
import helpers as HL

### Welcoming
print('Linear, ReLU, Linear, ReLU, Linear, Tanh, Linear, Tanh')
print('300 epochs')

### Generate data
inputs, targets = HL.generate_disc_data(n=1000)

### Split the dataset into train, validation and test sets
(train_inputs, train_targets, validation_inputs, validation_targets,
 test_inputs, test_targets) = HL.split_dataset(
    inputs, targets, train_perc=0.7, val_perc=0.1, test_perc=0.2)

### Normalize data
mu, std = inputs.mean(), inputs.std()
train_inputs.sub_(mu).div_(std)
validation_inputs.sub_(mu).div_(std)
test_inputs.sub_(mu).div_(std)

### Create model
input_dim = 2
hidden_width = 25
output_dim = 2
# the remaining layers are completed to match the architecture announced
# above (Linear, ReLU, Linear, ReLU, Linear, Tanh, Linear, Tanh); HL.Tanh
# is assumed to exist alongside HL.ReLu
model = HL.Sequential([
    HL.Linear(input_dim, hidden_width),
    HL.ReLu(),
    HL.Linear(hidden_width, hidden_width),
    HL.ReLu(),
    HL.Linear(hidden_width, hidden_width),
    HL.Tanh(),
    HL.Linear(hidden_width, output_dim),
    HL.Tanh(),
])
df = df[df['week'] > df['week'].max() - PAST_TIMESTEPS]
df['week'] = df['week'].max() - df['week']
df = df.groupby('Client').apply(lambda x: merge(x, PAST_TIMESTEPS))
print(df)

#####

scaled = scale_data(df_train)
trainset = scaled  # series_to_supervised(scaled, PAST_TIMESTEPS, FUTURE_TIMESTEPS)
scaled = scale_data(df_test)
testset = scaled  # series_to_supervised(scaled, PAST_TIMESTEPS, FUTURE_TIMESTEPS)

x_train, y_train, x_test, y_test = split_dataset(trainset, testset,
                                                 PAST_TIMESTEPS,
                                                 FUTURE_TIMESTEPS)
predictor = build_predictor((x_train.shape[1], x_train.shape[2]))
fit_predictor(predictor, x_train, y_train, x_test, y_test, EPOCHS, BATCH_SIZE)

#####

# TODO: OneHotEncoding
from sklearn.preprocessing import LabelEncoder

df.Client = LabelEncoder().fit_transform(df.Client)
df.Poblacio = LabelEncoder().fit_transform(df.Poblacio)
df.Tipus_Client = LabelEncoder().fit_transform(df.Tipus_Client)
df.Article = LabelEncoder().fit_transform(df.Article)
df.Congelat = LabelEncoder().fit_transform(df.Congelat)
df.Lloc_Descarrega = LabelEncoder().fit_transform(df.Lloc_Descarrega)
df.Bonarea = LabelEncoder().fit_transform(df.Bonarea)
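
# A hedged sketch of the windowed `split_dataset` assumed above: slice each
# set into (past, future) windows so the predictor sees PAST_TIMESTEPS input
# steps and learns to emit FUTURE_TIMESTEPS output steps. The sliding-window
# layout is an assumption about the helper.
import numpy as np

def split_dataset(trainset, testset, past_steps, future_steps):
    def to_windows(data):
        xs, ys = [], []
        for i in range(len(data) - past_steps - future_steps + 1):
            xs.append(data[i:i + past_steps])
            ys.append(data[i + past_steps:i + past_steps + future_steps])
        return np.array(xs), np.array(ys)
    x_train, y_train = to_windows(trainset)
    x_test, y_test = to_windows(testset)
    return x_train, y_train, x_test, y_test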