def load_raw_data(self, nrows=None, save_to_hdf=False):
    """Load the data from the csv or hdf (if it exists).

    Parameters
    ----------
    nrows : int or None
        The number of lines to load in the train.
    save_to_hdf : bool
        If True, the file is saved in hdf after being loaded.
    """
    train = load_train(nrows=nrows, save_to_hdf=save_to_hdf)
    test = load_test(save_to_hdf=save_to_hdf)

    # Drop unused columns and normalise the date fields
    train.drop(self.drop_features, axis=1, inplace=True)
    test.drop(self.drop_features, axis=1, inplace=True)

    correct_dates(train)
    to_datetime(train, keep_dates=True)
    correct_dates(test)
    to_datetime(test, keep_dates=True)

    test.loc[test.listen_type == 0, 'listen_type'] = 1

    # Number of days between the listening event and the media release date
    train['diff_days'] = (train.dt_listen - train.dt_media).dt.days
    test['diff_days'] = (test.dt_listen - test.dt_media).dt.days

    train.drop(['dt_listen', 'dt_media'], axis=1, inplace=True)
    test.drop(['dt_listen', 'dt_media'], axis=1, inplace=True)

    train.to_pickle('../input/train_clean.pkl')
    test.to_pickle('../input/test_clean.pkl')

    self.train = train
    self.test = test
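# Minimal usage sketch for load_raw_data (assumption: the method lives on a
# data-handling class, here called DataHandler, whose constructor sets
# self.drop_features; the class name, constructor, and column names below are
# illustrative only and not taken from the original code).
handler = DataHandler(drop_features=['sample_id'])
handler.load_raw_data(nrows=100000, save_to_hdf=True)

# The cleaned frames are both attached to the handler and pickled to ../input/
print(handler.train.shape, handler.test.shape)
print(handler.train['diff_days'].head())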
import loading

max_features = 10000  # number of words to consider as features
maxlen = 500  # cut texts after this number of words (among the top max_features most common words)
batch_size = 32

# print('Loading data...')
# (input_train, y_train), (input_test, y_test) = imdb.load_data(num_words=max_features)
# print(len(input_train), 'train sequences')
# print(len(input_test), 'test sequences')

# print('Pad sequences (samples x time)')
# input_train = sequence.pad_sequences(input_train, maxlen=maxlen)
# input_test = sequence.pad_sequences(input_test, maxlen=maxlen)
# print('input_train shape:', input_train.shape)
# print('input_test shape:', input_test.shape)

input_train, y_train, x_val, y_val = loading.load_train()
input_test, y_test = loading.load_test()
print('input_train shape:', input_train.shape)
print('input_test shape:', input_test.shape)

from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, Embedding

model = Sequential()
model.add(Embedding(max_features, 32))
model.add(SimpleRNN(32))
# softmax (rather than sigmoid) so the 3-class output matches categorical_crossentropy
model.add(Dense(3, activation='softmax'))
model.summary()

# model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
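# A minimal training/evaluation sketch for the compiled model above
# (assumption: loading.load_train() / load_test() already return padded
# integer sequences and one-hot labels compatible with categorical_crossentropy).
history = model.fit(input_train, y_train,
                    epochs=10,
                    batch_size=batch_size,
                    validation_data=(x_val, y_val))

test_loss, test_acc = model.evaluate(input_test, y_test, batch_size=batch_size)
print('test accuracy:', test_acc)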
@author: xabuka

using RNN, LSTM, GRU, BI
"""

import loading

max_features = 10000
max_len = 100
training_samples = 1000    # we will be training on 1000 samples
validation_samples = 2000  # we will be validating on 2000 samples
data_dir = '../data/SplitedPalSent'
# '/Users/xabuka/PycharmProjects/measuring_acceptability/python-files/aclImdb'

input_train, y_train = loading.load_train(data_dir, max_len, training_samples,
                                          validation_samples, max_features,
                                          Validation=False)
input_test, y_test = loading.load_test(data_dir, max_len, max_features)
print('input_train shape:', input_train.shape)
print('input_test shape:', input_test.shape)

from keras.models import Sequential
from keras.layers import Dense, Embedding, GRU, LSTM

model = Sequential()
model.add(Embedding(max_features, 64, input_length=max_len))
model.add(LSTM(32))  # swap in GRU(32) for the GRU variant
# bidirectional variants:
# model.add(layers.Bidirectional(layers.LSTM(32)))
# model.add(layers.Bidirectional(
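# The script above stops before the output layer and compilation; a minimal
# sketch of how it could continue (assumptions: a 3-class softmax head with
# one-hot labels, as in the other snippets in this repo, and rmsprop with
# default settings — none of this is confirmed by the original fragment).
model.add(Dense(3, activation='softmax'))
model.summary()

model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
history = model.fit(input_train, y_train,
                    epochs=10, batch_size=32,
                    validation_split=0.2)
print(model.evaluate(input_test, y_test))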
def main():
    eh = ExperimentHandler(args, REPORTS_DIR)
    signal_handler = eh.signal_handler

    ''' GET RELATIVE PATHS TO DATA AND MODELS '''
    '''----------------------------------------------------------------------- '''
    with open(args.model_list_filename, "r") as f:
        model_paths = [l.strip('\n') for l in f.readlines() if l[0] != '#']

    with open(args.data_list_filename, "r") as f:
        data_paths = [l.strip('\n') for l in f.readlines() if l[0] != '#']

    logging.info("DATA PATHS\n{}".format("\n".join(data_paths)))
    logging.info("MODEL PATHS\n{}".format("\n".join(model_paths)))

    ''' BUILD ROCS '''
    '''----------------------------------------------------------------------- '''
    if args.load_rocs is None:
        for data_path in data_paths:
            logging.info(
                'Building ROCs for models trained on {}'.format(data_path))
            tf = load_tf(DATA_DIR, "{}-train.pickle".format(data_path))
            if args.set == 'test':
                data = load_test(tf, DATA_DIR, "{}-test.pickle".format(data_path), args.n_test)
            elif args.set == 'valid':
                data = load_test(tf, DATA_DIR, "{}-valid.pickle".format(data_path), args.n_test)
            elif args.set == 'train':
                data = load_test(tf, DATA_DIR, "{}-train.pickle".format(data_path), args.n_test)

            for model_path in model_paths:
                logging.info(
                    '\tBuilding ROCs for instances of {}'.format(model_path))
                r, f, t = build_rocs(data, os.path.join(MODELS_DIR, model_path),
                                     args.batch_size)

                absolute_roc_path = os.path.join(
                    eh.exp_dir,
                    "rocs-{}-{}.pickle".format("-".join(model_path.split('/')), data_path))
                with open(absolute_roc_path, "wb") as fd:
                    pickle.dump((r, f, t), fd)
    else:
        for data_path in data_paths:
            for model_path in model_paths:
                previous_absolute_roc_path = os.path.join(
                    REPORTS_DIR, args.load_rocs,
                    "rocs-{}-{}.pickle".format("-".join(model_path.split('/')), data_path))
                with open(previous_absolute_roc_path, "rb") as fd:
                    r, f, t = pickle.load(fd)

                absolute_roc_path = os.path.join(
                    eh.exp_dir,
                    "rocs-{}-{}.pickle".format("-".join(model_path.split('/')), data_path))
                with open(absolute_roc_path, "wb") as fd:
                    pickle.dump((r, f, t), fd)

    ''' PLOT ROCS '''
    '''----------------------------------------------------------------------- '''
    labels = model_paths
    colors = ['c', 'm', 'y', 'k']
    for data_path in data_paths:
        for model_path, label, color in zip(model_paths, labels, colors):
            absolute_roc_path = os.path.join(
                eh.exp_dir,
                "rocs-{}-{}.pickle".format("-".join(model_path.split('/')), data_path))
            with open(absolute_roc_path, "rb") as fd:
                r, f, t = pickle.load(fd)

            if args.remove_outliers:
                r, f, t = remove_outliers(r, f, t)

            report_score(r, f, t, label=label)
            plot_rocs(r, f, t, label=label, color=color)

    figure_filename = os.path.join(eh.exp_dir, 'rocs.png')
    plot_save(figure_filename)
    if args.plot:
        plot_show()

    signal_handler.job_completed()
@author: xabuka

using CNN
"""

import loading

max_features = 10000  # number of words to consider as features
max_len = 500
training_samples = 700    # we will be training on 700 samples
validation_samples = 200  # we will be validating on 200 samples

x_train, y_train, x_val, y_val = loading.load_train(max_len, training_samples,
                                                    validation_samples, max_features)
x_test, y_test = loading.load_test(max_len, max_features)
print('input_train shape:', x_train.shape)
print('input_test shape:', x_test.shape)

from keras.models import Sequential
from keras import layers
from keras.optimizers import RMSprop
# from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense

model = Sequential()
model.add(layers.Embedding(max_features, 128, input_length=max_len))
model.add(layers.Conv1D(32, 7, activation='relu'))
model.add(layers.MaxPooling1D(5))
model.add(layers.Conv1D(32, 7, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(3))
model.summary()
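# The snippet stops after model.summary(); a minimal sketch of how it could be
# compiled and trained (assumptions: 3-class one-hot labels and an RMSprop
# learning rate of 1e-4 — these are illustrative, not from the original).
model.add(layers.Activation('softmax'))  # the Dense(3) above has no activation yet

model.compile(optimizer=RMSprop(lr=1e-4),
              loss='categorical_crossentropy',
              metrics=['acc'])
history = model.fit(x_train, y_train,
                    epochs=10, batch_size=128,
                    validation_data=(x_val, y_val))
print(model.evaluate(x_test, y_test))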
import loading

max_features = 10000
maxlen = 100
training_samples = 1000    # we will be training on 1000 samples
validation_samples = 2000  # we will be validating on 2000 samples
data_dir = '../data/SplitedPalSent'

x_train, y_train = loading.load_train(data_dir, maxlen, training_samples,
                                      validation_samples, max_features,
                                      Validation=False)
x_test, y_test = loading.load_test(data_dir, maxlen, max_features)
print('input_train shape:', x_train.shape)
print('input_test shape:', x_test.shape)

from keras.models import Sequential
from keras import layers
from keras.optimizers import RMSprop
from keras.layers import Embedding, Conv1D, MaxPooling1D, Dense, Dropout, LSTM

# Embedding
embedding_size = 300

# Convolution
kernel_size = 5
filters = 64
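# The snippet ends after the hyperparameters; a minimal sketch of the
# CNN + LSTM stack those imports and parameters suggest (assumptions:
# pool_size, lstm_output_size, and the 3-class softmax head are illustrative
# and not taken from the original).
pool_size = 4
lstm_output_size = 70

model = Sequential()
model.add(Embedding(max_features, embedding_size, input_length=maxlen))
model.add(Dropout(0.25))
model.add(Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1))
model.add(MaxPooling1D(pool_size=pool_size))
model.add(LSTM(lstm_output_size))
model.add(Dense(3, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
model.summary()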