Example #1
    def load_raw_data(self, nrows=None, save_to_hdf=False):
        """Load the data from the csv or hdf (if it exists)
        
        Parameters
        ----------
        nrows : int or None
            The number of rows to load from the training set.
        save_to_hdf : bool
            Whether to save the data to HDF after loading.
        """
        train = load_train(nrows=nrows, save_to_hdf=save_to_hdf)
        test = load_test(save_to_hdf=save_to_hdf)

        train.drop(self.drop_features, axis=1, inplace=True)
        test.drop(self.drop_features, axis=1, inplace=True)

        correct_dates(train)
        to_datetime(train, keep_dates=True)
        correct_dates(test)
        to_datetime(test, keep_dates=True)
        test.loc[test.listen_type == 0, 'listen_type'] = 1  # map listen_type 0 to 1 in the test set

        train['diff_days'] = (train.dt_listen - train.dt_media).dt.days
        test['diff_days'] = (test.dt_listen - test.dt_media).dt.days
        train.drop([
            'dt_listen',
            'dt_media',
        ], axis=1, inplace=True)
        test.drop([
            'dt_listen',
            'dt_media',
        ], axis=1, inplace=True)

        train.to_pickle('../input/train_clean.pkl')
        test.to_pickle('../input/test_clean.pkl')

        self.train = train
        self.test = test
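Since the method pickles the cleaned frames, later runs can restore them without touching the CSV/HDF sources; a minimal sketch of reading them back, assuming the same relative paths:

import pandas as pd

# Restore the frames written by load_raw_data()
train = pd.read_pickle('../input/train_clean.pkl')
test = pd.read_pickle('../input/test_clean.pkl')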
Example #2
max_features = 10000  # number of words to consider as features
maxlen = 500  # cut texts after this number of words (among top max_features most common words)
batch_size = 32
#print('Loading data...')
#(input_train, y_train), (input_test, y_test) = imdb.load_data(num_words=max_features)
#print(len(input_train), 'train sequences')
#print(len(input_test), 'test sequences')
#print('Pad sequences (samples x time)')
#input_train = sequence.pad_sequences(input_train, maxlen=maxlen)
#input_test = sequence.pad_sequences(input_test, maxlen=maxlen)
#print('input_train shape:', input_train.shape)
#print('input_test shape:', input_test.shape)

input_train, y_train, x_val, y_val = loading.load_train()
input_test, y_test = loading.load_test()
print('input_train shape:', input_train.shape)
print('input_test shape:', input_test.shape)

from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, Embedding

model = Sequential()
model.add(Embedding(max_features, 32))
model.add(SimpleRNN(32))
model.add(Dense(3, activation='softmax'))  # softmax (not sigmoid) to match categorical_crossentropy below
model.summary()
#model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])
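The snippet compiles the model but stops before training; a minimal sketch of the missing fit and evaluation steps, assuming the loaded arrays are already padded integer sequences, the labels are one-hot encoded, and the epoch count is arbitrary:

# Train with the validation split returned by load_train()
history = model.fit(input_train, y_train,
                    epochs=10,  # arbitrary choice
                    batch_size=batch_size,
                    validation_data=(x_val, y_val))

# Evaluate on the held-out test set
loss, acc = model.evaluate(input_test, y_test, batch_size=batch_size)
print('test accuracy:', acc)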
Example #3
"""
@author: xabuka
Using RNN, LSTM, GRU, and bidirectional layers.
"""

import loading

max_features = 10000
max_len = 100
training_samples = 1000  # train on 1,000 samples
validation_samples = 2000  # validate on 2,000 samples
data_dir = '../data/SplitedPalSent'
# alternative: '/Users/xabuka/PycharmProjects/measuring_acceptability/python-files/aclImdb'

input_train, y_train = loading.load_train(data_dir, max_len, training_samples,
                                           validation_samples, max_features,
                                           Validation=False)
input_test, y_test = loading.load_test(data_dir, max_len, max_features)


print('input_train shape:', input_train.shape)
print('input_test shape:', input_test.shape)


from keras.models import Sequential
from keras.layers import Dense, Embedding, GRU, LSTM

model = Sequential()
model.add(Embedding(max_features, 64, input_length=max_len))
model.add(LSTM(32))  # or GRU(32)
# Bidirectional variant (requires: from keras import layers):
# model.add(layers.Bidirectional(layers.LSTM(32)))
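The example is cut off before the output layer, compilation, and training; a minimal sketch of a plausible completion, assuming a 3-class softmax output and the adam/categorical_crossentropy setup used in Example #2 (epochs, batch size, and validation split are arbitrary):

model.add(Dense(3, activation='softmax'))  # assumed: 3 classes, as in the other examples
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])
model.summary()

# Validation=False above returns no separate validation set, so split here
history = model.fit(input_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.2)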
Example #4
def main():

    eh = ExperimentHandler(args, REPORTS_DIR)
    signal_handler = eh.signal_handler
    ''' GET RELATIVE PATHS TO DATA AND MODELS '''
    '''----------------------------------------------------------------------- '''
    with open(args.model_list_filename, "r") as f:
        model_paths = [l.strip('\n') for l in f.readlines() if l[0] != '#']

    with open(args.data_list_filename, "r") as f:
        data_paths = [l.strip('\n') for l in f.readlines() if l[0] != '#']

    logging.info("DATA PATHS\n{}".format("\n".join(data_paths)))
    logging.info("MODEL PATHS\n{}".format("\n".join(model_paths)))
    ''' BUILD ROCS '''
    '''----------------------------------------------------------------------- '''
    if args.load_rocs is None:
        for data_path in data_paths:

            logging.info(
                'Building ROCs for models trained on {}'.format(data_path))
            tf = load_tf(DATA_DIR, "{}-train.pickle".format(data_path))
            if args.set == 'test':
                data = load_test(tf, DATA_DIR,
                                 "{}-test.pickle".format(data_path),
                                 args.n_test)
            elif args.set == 'valid':
                data = load_test(tf, DATA_DIR,
                                 "{}-valid.pickle".format(data_path),
                                 args.n_test)
            elif args.set == 'train':
                data = load_test(tf, DATA_DIR,
                                 "{}-train.pickle".format(data_path),
                                 args.n_test)

            for model_path in model_paths:
                logging.info(
                    '\tBuilding ROCs for instances of {}'.format(model_path))
                r, f, t = build_rocs(data,
                                     os.path.join(MODELS_DIR,
                                                  model_path), args.batch_size)

                absolute_roc_path = os.path.join(
                    eh.exp_dir,
                    "rocs-{}-{}.pickle".format("-".join(model_path.split('/')),
                                               data_path))
                with open(absolute_roc_path, "wb") as fd:
                    pickle.dump((r, f, t), fd)
    else:
        for data_path in data_paths:
            for model_path in model_paths:

                previous_absolute_roc_path = os.path.join(
                    REPORTS_DIR, args.load_rocs,
                    "rocs-{}-{}.pickle".format("-".join(model_path.split('/')),
                                               data_path))
                with open(previous_absolute_roc_path, "rb") as fd:
                    r, f, t = pickle.load(fd)

                absolute_roc_path = os.path.join(
                    eh.exp_dir,
                    "rocs-{}-{}.pickle".format("-".join(model_path.split('/')),
                                               data_path))
                with open(absolute_roc_path, "wb") as fd:
                    pickle.dump((r, f, t), fd)
    ''' PLOT ROCS '''
    '''----------------------------------------------------------------------- '''

    labels = model_paths
    colors = ['c', 'm', 'y', 'k']

    for data_path in data_paths:
        for model_path, label, color in zip(model_paths, labels, colors):
            absolute_roc_path = os.path.join(
                eh.exp_dir,
                "rocs-{}-{}.pickle".format("-".join(model_path.split('/')),
                                           data_path))
            with open(absolute_roc_path, "rb") as fd:
                r, f, t = pickle.load(fd)

            if args.remove_outliers:
                r, f, t = remove_outliers(r, f, t)

            report_score(r, f, t, label=label)
            plot_rocs(r, f, t, label=label, color=color)

    figure_filename = os.path.join(eh.exp_dir, 'rocs.png')
    plot_save(figure_filename)
    if args.plot:
        plot_show()

    signal_handler.job_completed()
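Each output file holds the (r, f, t) triple returned by build_rocs; a minimal sketch of reading one back outside the script, where the directory and the model/data names are hypothetical placeholders:

import os
import pickle

exp_dir = 'reports/my-experiment'  # hypothetical experiment directory
roc_path = os.path.join(exp_dir, 'rocs-my-model-my-data.pickle')  # hypothetical names
with open(roc_path, 'rb') as fd:
    r, f, t = pickle.load(fd)  # the triple pickled by main()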
Example #5
"""
@author: xabuka
Using a CNN.
"""

import loading

max_features = 10000  # number of words to consider as features
max_len = 500
training_samples = 700  # train on 700 samples
validation_samples = 200  # validate on 200 samples

x_train, y_train, x_val, y_val = loading.load_train(max_len, training_samples,
                                                    validation_samples,
                                                    max_features)
x_test, y_test = loading.load_test(max_len, max_features)
print('input_train shape:', x_train.shape)
print('input_test shape:', x_test.shape)

from keras.models import Sequential
from keras import layers
from keras.optimizers import RMSprop
#from keras.layers import Embedding, Conv1D,MaxPooling1D, GlobalMaxPooling1D, Dense
model = Sequential()
model.add(layers.Embedding(max_features, 128, input_length=max_len))
model.add(layers.Conv1D(32, 7, activation='relu'))
model.add(layers.MaxPooling1D(5))
model.add(layers.Conv1D(32, 7, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(3))
model.summary()
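This first snippet stops at model.summary(), and its Dense(3) output has no activation; a minimal sketch of the missing steps, assuming a softmax output and that the RMSprop import above was meant to be used (the learning rate is arbitrary):

model.add(layers.Activation('softmax'))  # assumed: normalize the raw Dense(3) outputs
model.compile(optimizer=RMSprop(lr=1e-4),  # arbitrary learning rate
              loss='categorical_crossentropy',
              metrics=['acc'])

A second snippet, reloading the data with different settings, follows.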
import loading

max_features = 10000
maxlen = 100
training_samples = 1000  # train on 1,000 samples
validation_samples = 2000  # validate on 2,000 samples
data_dir = '../data/SplitedPalSent'

x_train, y_train = loading.load_train(data_dir,
                                      maxlen,
                                      training_samples,
                                      validation_samples,
                                      max_features,
                                      Validation=False)
x_test, y_test = loading.load_test(data_dir, maxlen, max_features)
print('input_train shape:', x_train.shape)
print('input_test shape:', x_test.shape)

from keras.models import Sequential
from keras import layers
from keras.optimizers import RMSprop
from keras.layers import Embedding, Conv1D, MaxPooling1D, Dense, Dropout, LSTM

# Embedding

embedding_size = 300

# Convolution
kernel_size = 5
filters = 64
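The snippet ends after the hyperparameters; a minimal sketch of the CNN + LSTM stack that these settings and the imports above suggest, where everything beyond the stated hyperparameters (pool size, LSTM width, output layer, optimizer) is an assumption:

pool_size = 4  # assumed pooling factor
lstm_units = 70  # assumed LSTM width

model = Sequential()
model.add(Embedding(max_features, embedding_size, input_length=maxlen))
model.add(Dropout(0.25))
model.add(Conv1D(filters, kernel_size, activation='relu'))
model.add(MaxPooling1D(pool_size=pool_size))
model.add(LSTM(lstm_units))
model.add(Dense(3, activation='softmax'))  # assumed: 3 classes, as in the other examples
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])
model.summary()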