def get_split_entropy(dataset, feature, split):
    '''
    Calculate the combined (size-weighted) entropy of the two datasets formed by
    splitting 'dataset' on 'feature' at the threshold 'split'. We skip dividing
    by the total size: that normalisation is the same for every candidate split,
    so it cannot change which split is best.
    '''
    left_dataset, right_dataset = helpers.split_dataset(
        dataset, split, feature)
    return get_entropy(left_dataset) * len(left_dataset) + get_entropy(
        right_dataset) * len(right_dataset)
Example #2
def make_algorithms(list_algorithms, patient_x, patient_y, set_split=0.66):
    split_index = int(len(patient_x) * set_split)

    train_x, train_y, test_x, test_y = split_dataset(patient_x, patient_y,
                                                     split_index)
    trained_models = train_algorithms(list_algorithms, train_x, train_y)
    tested_models = test_algorithms(trained_models, test_x)
    eval_models = eval_algorithms(tested_models, test_y)

    return eval_models
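
A minimal usage sketch follows; the estimator list is a placeholder, and it is only an assumption that train_algorithms/test_algorithms/eval_algorithms accept scikit-learn-style models.

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

patient_x = np.random.rand(150, 10)              # hypothetical feature matrix
patient_y = np.random.randint(0, 2, size=150)    # hypothetical binary labels

algorithms = [LogisticRegression(max_iter=1000), RandomForestClassifier()]
eval_models = make_algorithms(algorithms, patient_x, patient_y, set_split=0.66)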
Example #3
def train():
    print "Training started ...."
    #metadata = data_utils.ourmodel.data_util.load_metadata()
    metadata, idx_q, idx_a = data_utils.ourmodel.data_util.load_data()
    (trainX, trainY), (testX,
                       testY), (validX,
                                validY) = helpers.split_dataset(idx_q, idx_a)

    model = create_model(metadata, trainX.shape[-1], trainY.shape[-1])

    if FLAGS.celltype == 'GRU':
        if not FLAGS.attention:
            ckpt_paths = 'ckpt/checkpoint/GRU/noAttention/'
        else:
            ckpt_paths = 'ckpt/checkpoint/GRU/Attention/'

    else:
        if not FLAGS.attention:
            ckpt_paths = 'ckpt/checkpoint/LSTM/noAttention/'
        else:
            ckpt_paths = 'ckpt/checkpoint/LSTM/Attention/'

    print("Check if a model already exists to restore")

    ckpt = tf.train.get_checkpoint_state(ckpt_paths)
    if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
        print("Reading model parameters from %s" % ckpt.model_checkpoint_path)

        sess = model.restore_last_session()
    else:

        print("Created model with fresh parameters.")
        batch_size = FLAGS.batch_size
        #val_batch_gen = helpers.rand_batch_gen(validX, validY, batch_size)
        #train_batch_gen = helpers.rand_batch_gen(trainX, trainY, batch_size)
        #sess =  model.train(train_batch_gen, val_batch_gen)
        sess = model.train_batch_file(batch_size=batch_size)

    print "Training Complete"
Example #4
def decode():
    print "This is for interactive Version....."

    metadata, idx_q, idx_a = data_utils.ourmodel.data_util.load_data()
    (trainX, trainY), (testX,
                       testY), (validX,
                                validY) = helpers.split_dataset(idx_q, idx_a)

    model = create_model(metadata, trainX.shape[-1], trainY.shape[-1])

    sess = model.restore_last_session()
    sys.stdout.write("> ")
    sys.stdout.flush()
    sentence = sys.stdin.readline()
    while sentence:
        #process input strings now"
        inputs = data_utils.ourmodel.data_util.get_tokens(sentence)
        fqtokens = [w for w in inputs if w not in stopwords.words('english')]
        processed_input = data_utils.ourmodel.data_util.zero_pad_single(
            fqtokens, metadata['w2idx'])
        #sess = model.restore_last_session()
        output = model.predict(sess, processed_input.T)
        #replies = []

        for ii, ot in zip(processed_input, output.T):
            q = helpers.decode(sequence=ii,
                               lookup=metadata['idx2w'],
                               separator=' ')
            decoded = helpers.decode(sequence=ot,
                                     lookup=metadata['idx2w'],
                                     separator=' ').split(' ')

            #if decoded.count('unk') == 0:
            #   if decoded not in replies:
            print('Review : [{0}]; Summary : [{1}]'.format(
                q, ' '.join(decoded)))

        sys.stdout.flush()
        sentence = sys.stdin.readline()
Example #5
def decision_tree_learning(training_dataset, depth, right_moves=None):
    '''
    Recursively build a decision tree for 'training_dataset'. Return the node
    representing the optimal split (w.r.t. information gain) at this level,
    together with the maximum depth of the subtree rooted at it.
    '''
    X, y = extract_X_y(training_dataset)

    # if all labels are the same, make this a leaf node
    if np.all(y == y[0]):
        return Node(depth, value=y[0], right_moves=right_moves), depth

    else:
        # find the optimal split
        split, feature_to_split = find_split(training_dataset)

        # if split is undefined, make the current node a leaf node with modal value
        if feature_to_split is None:
            return Node(depth, value=get_mode(y),
                        right_moves=right_moves), depth

        # define the current node, use recursion to define its children
        node = Node(depth,
                    feature=feature_to_split,
                    threshold=split,
                    right_moves=right_moves)
        l_dataset, r_dataset = split_dataset(training_dataset, split,
                                             feature_to_split)
        l_branch, l_depth = decision_tree_learning(l_dataset, depth + 1,
                                                   right_moves)
        if right_moves is not None:
            right_moves += 1
        r_branch, r_depth = decision_tree_learning(r_dataset, depth + 1,
                                                   right_moves)
        node.left_child = l_branch
        node.right_child = r_branch

        # return the current node
        return node, max(l_depth, r_depth)
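
A minimal usage sketch, assuming the data layout that extract_X_y and split_dataset imply: each row holds the feature values followed by the class label in the last column. The synthetic data below is illustrative only.

import numpy as np

rng = np.random.default_rng(0)
features = rng.normal(size=(200, 4))
labels = (features[:, 0] > 0).astype(float).reshape(-1, 1)
training_dataset = np.hstack([features, labels])  # features + label column

root, max_depth = decision_tree_learning(training_dataset, depth=0)
print('learned tree depth:', max_depth)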
Example #6
def run():
    logging.basicConfig(format='%(message)s',
                        level=logging.INFO,
                        filename='results.log',
                        filemode='w')

    # load the new file
    df = read_csv('pre-processed-in-24-hours.csv',
                  index_col=0,
                  parse_dates=True)

    for cell in [108 * 2 + 1]:
        for epoch in [1000]:  # [1000, 2000, 3000, 4000, 5000]
            for batch_size in [500]:  # [500, 1000, 1500]
                for n_input in [1, 2, 4, 8, 12, 16]:
                    for n_out in range(1, 9):
                        logging.info(
                            "Starting... cell {0}, epoch {1}, batch_size {2}, "
                            "input {3} and output {4}".format(
                                cell, epoch, batch_size, n_input, n_out))
                        try:
                            logging.info("Training {} {}".format(
                                n_input, n_out))
                            # transform data
                            scaler, data_scaled = scale(df.values)

                            train, test = split_dataset(df.values, n_out)
                            train_scaled, test_scaled = split_dataset(
                                data_scaled, n_out)

                            # restructure into window size
                            train_scaled, test_scaled = restructure_data_by_window(
                                train_scaled, test_scaled, n_out)
                            train, test = restructure_data_by_window(
                                train, test, n_out)

                            # fit model
                            model = build_model(train_scaled, n_input, n_out,
                                                cell, epoch, batch_size)

                            # history is a list by window size
                            history_scaled = [
                                x for x in train_scaled[:n_input, :, :]
                            ]
                            history = [x for x in train[:n_input, :, :]]

                            train_walk_foward_validation(
                                history, history_scaled, model, n_input,
                                scaler, train, train_scaled)

                            predictions_inverted = test_walk_foward_validation(
                                model, n_input, scaler, test, test_scaled,
                                train, train_scaled)

                            logging.info("predictions_inverted: {}".format(
                                predictions_inverted.shape))
                            logging.info("test {}".format(test.shape))

                            data = {
                                'predict':
                                predictions_inverted.reshape(
                                    predictions_inverted.shape[0] *
                                    predictions_inverted.shape[1]),
                                'real':
                                test[:, :, 0].reshape(test[:, :, 0].shape[0] *
                                                      test[:, :, 0].shape[1])
                            }

                            data['time'] = df.index[-data["predict"].shape[0]:]

                            df_plot = pandas.DataFrame.from_dict(data)
                            df_plot.to_csv('plot_results_{0}_{1}.csv'.format(
                                n_input, n_out))
                            plot_results(df_plot)
                            plot_scatter(df_plot)

                        except Exception as e:
                            logging.info(e)
Example #7
def self_test():
    print " In Test Mode"
    metadata, idx_q, idx_a = data_utils.ourmodel.data_util.load_data()
    (trainX, trainY), (testX,
                       testY), (validX,
                                validY) = helpers.split_dataset(idx_q, idx_a)

    model = create_model(metadata, trainX.shape[-1], trainY.shape[-1])

    if FLAGS.celltype == 'GRU':
        if not FLAGS.attention:
            ckpt_paths = 'ckpt/checkpoint/GRU/noAttention/'
        else:
            ckpt_paths = 'ckpt/checkpoint/GRU/Attention/'

    else:
        if not FLAGS.attention:
            ckpt_paths = 'ckpt/checkpoint/LSTM/noAttention/'
        else:
            ckpt_paths = 'ckpt/checkpoint/LSTM/Attention/'

    print("Retrieving Last Model State")
    XX = np.load('datasets/test_review.npy', mmap_mode='r')
    YY = np.load('datasets/test_summary.npy', mmap_mode='r')
    result = [[0 for x in range(6)] for y in range(XX.shape[0])]

    sess = model.restore_last_session()
    batch_size = 16
    if sess:
        for i in range(0, XX.shape[0], batch_size):
            if (i + 1) + batch_size < XX.shape[0]:
                output = model.predict(sess, XX[i:(i + 1) + batch_size].T)
                nn = XX[i:(i + 1) + batch_size]
                for j in range(nn.shape[0]):
                    result[i + j][0] = helpers.decode(sequence=XX[i + j],
                                                      lookup=metadata['idx2w'],
                                                      separator=' ')
                    result[i + j][1] = helpers.decode(sequence=YY[i + j],
                                                      lookup=metadata['idx2w'],
                                                      separator=' ')

                    result[i + j][2] = helpers.decode(sequence=output.T[j],
                                                      lookup=metadata['idx2w'],
                                                      separator=' ')

                    if len(result[i + j][2]) == 0:
                        result[i + j][2] = ['UNK']
                    if len(result[i + j][1]) != 0:
                        result[i + j][3] = score.rouge_n(
                            result[i + j][2], result[i + j][1], 1)
                        result[i + j][4] = score.bleu(result[i + j][2],
                                                      result[i + j][1], 1)
                        result[i + j][5] = score.f1(result[i + j][3],
                                                    result[i + j][4])
                    else:
                        result[i + j][3] = result[i + j][4] = result[i + j][5] = 0
    df = pd.DataFrame(result)
    df.columns = [
        "Review", "Actual Summary", "Generated Summary", "Rouge1", "Bleu1",
        "F1"
    ]
    df = df[:-batch_size]
    print("Average Rouge-1 = %.3f, Max Rouge-1 = %.3f, Min Rouge-1 = %.3f" %
          (df["Rouge1"].mean(), df["Rouge1"].max(), df["Rouge1"].min()))
    print("Average Bleu-1 = %.3f, Max Bleu-1 = %.3f, Min Bleu-1 = %.3f" %
          (df["Bleu1"].mean(), df["Bleu1"].max(), df["Bleu1"].min()))
    print("Average F1 = %.3f, Max F1 = %.3f, Min F1 = %.3f" %
          (df["F1"].mean(), df["F1"].max(), df["F1"].min()))
    result_file = 'results/default.csv'
    if FLAGS.celltype == 'GRU':
        if FLAGS.attention == False:
            result_file = 'results/GRU_noAttention.csv'
        else:
            result_file = 'results/GRU_Attention.csv'

    else:
        if FLAGS.attention == False:
            result_file = 'results/LSTM_noAttention.csv'
        else:
            result_file = 'results/LSTM_Attention.csv'
    df.to_csv(result_file)
Example #8
    def test_split_data(self):
        data = load_csv(self.path)

        trainset, testset = split_dataset(data, test_ratio=0.4)
        self.assertEqual(trainset.shape, (6, ))
        self.assertEqual(testset.shape, (4, ))
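
The test implies that split_dataset takes a 1-D dataset of 10 records plus a test_ratio and returns (trainset, testset) split 60/40. Below is a minimal sketch consistent with that behaviour; the real helper may shuffle or stratify, which this assumption does not.

import numpy as np

def split_dataset(data, test_ratio=0.4):
    # Sketch: keep the first (1 - test_ratio) fraction for training and the
    # remainder for testing, matching the shapes asserted in the test above.
    data = np.asarray(data)
    n_test = int(round(len(data) * test_ratio))
    return data[:len(data) - n_test], data[len(data) - n_test:]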
Example #9
# IMPORTS
import math
import torch
from torch import FloatTensor, LongTensor, Tensor
# our own written code
import helpers as HL

### Welcoming
print('Linear, ReLU, Linear, ReLU, Linear, Tanh, Linear, Tanh')
print('300 epochs')

### Generate data
inputs, targets = HL.generate_disc_data(n=1000)

### Split the dataset into train, validation and test set
train_inputs, train_targets, validation_inputs, validation_targets, test_inputs, test_targets = HL.split_dataset(
    inputs, targets, train_perc=0.7, val_perc=0.1, test_perc=0.2)

### Normalize data
mu, std = inputs.mean(), inputs.std()
train_inputs.sub_(mu).div_(std)
validation_inputs.sub_(mu).div_(std)
test_inputs.sub_(mu).div_(std)

### Create model
input_dim = 2
hidden_width = 25
output_dim = 2

model = HL.Sequential([
    HL.Linear(input_dim, hidden_width),
    HL.ReLu(),
Example #10
df = df[df['week'] > df['week'].max() - PAST_TIMESTEPS]
df['week'] = df['week'].max() - df['week']  # count weeks back from the most recent
df = df.groupby('Client').apply(lambda x: merge(x, PAST_TIMESTEPS))
print(df)
#####

scaled = scale_data(df_train)
trainset = scaled
# series_to_supervised(scaled, PAST_TIMESTEPS, FUTURE_TIMESTEPS)

scaled = scale_data(df_test)
testset = scaled
# series_to_supervised(scaled, PAST_TIMESTEPS, FUTURE_TIMESTEPS)

x_train, y_train, x_test, y_test = split_dataset(trainset, testset, PAST_TIMESTEPS, FUTURE_TIMESTEPS)
predictor = build_predictor((x_train.shape[1], x_train.shape[2]))
fit_predictor(predictor, x_train, y_train, x_test, y_test, EPOCHS, BATCH_SIZE)

#####


# TODO: OneHotEncoding
from sklearn.preprocessing import LabelEncoder
df.Client = LabelEncoder().fit_transform(df.Client)
df.Poblacio = LabelEncoder().fit_transform(df.Poblacio)
df.Tipus_Client = LabelEncoder().fit_transform(df.Tipus_Client)
df.Article = LabelEncoder().fit_transform(df.Article)
df.Congelat = LabelEncoder().fit_transform(df.Congelat)
df.Lloc_Descarrega = LabelEncoder().fit_transform(df.Lloc_Descarrega)
df.Bonarea = LabelEncoder().fit_transform(df.Bonarea)
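
The TODO above flags that these categorical columns should eventually be one-hot encoded rather than label-encoded. Below is a minimal sketch of that swap using pandas.get_dummies; whether the downstream model expects this wider layout is an assumption.

import pandas as pd

# One-hot encode the categorical columns in place of the LabelEncoder pass.
categorical_cols = ['Client', 'Poblacio', 'Tipus_Client', 'Article',
                    'Congelat', 'Lloc_Descarrega', 'Bonarea']
df = pd.get_dummies(df, columns=categorical_cols, prefix=categorical_cols)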