Example #1
    def test_unseen(self):
        test_datasets = ['twitter', 'hotel']
        laptop_texts_raw_indices, laptop_texts_raw_without_aspects_indices, laptop_texts_left_indices, laptop_texts_left_with_aspects_indices, \
        laptop_aspects_indices, laptop_texts_right_indices, laptop_texts_right_with_aspects_indices, laptop_dataset_index, \
        laptop_polarities_matrix, laptop_polarities = \
            read_dataset(types=[test_datasets[0]],
                         mode='test',
                         embedding_dim=self.EMBEDDING_DIM,
                         max_seq_len=self.MAX_SENTENCE_LENGTH, max_aspect_len=self.MAX_ASPECT_LENGTH)

        hotel_texts_raw_indices, hotel_texts_raw_without_aspects_indices, hotel_texts_left_indices, hotel_texts_left_with_aspects_indices, \
        hotel_aspects_indices, hotel_texts_right_indices, hotel_texts_right_with_aspects_indices, hotel_dataset_index, \
        hotel_polarities_matrix, hotel_polarities = \
            read_dataset(types=[test_datasets[1]],
                         mode='test',
                         embedding_dim=self.EMBEDDING_DIM,
                         max_seq_len=self.MAX_SENTENCE_LENGTH, max_aspect_len=self.MAX_ASPECT_LENGTH)

        polarities = [laptop_polarities, hotel_polarities]

        for j in range(len(polarities)):
            print("Dataset id : {}".format(test_datasets[j]))
            test_unique_elements, test_counts_elements = np.unique(polarities[j], return_counts=True)

            for i in range(len(test_unique_elements)):
                print("Sentiment : {0} , Count : {1}".format(test_unique_elements[i], test_counts_elements[i]))

        self.model.evaluate(
            [laptop_texts_left_indices, laptop_texts_right_indices, laptop_dataset_index, laptop_aspects_indices], \
            [laptop_polarities_matrix, laptop_polarities_matrix, laptop_polarities_matrix], steps=1)
        self.model.evaluate(
            [hotel_texts_left_indices, hotel_texts_right_indices, hotel_dataset_index, hotel_aspects_indices], \
            [hotel_polarities_matrix, hotel_polarities_matrix, hotel_polarities_matrix], steps=1)
Example #2
def main():
    # set mode
    try:
        mode = sys.argv[1]
        assert mode in ('dnn', 'cnn')
    except (IndexError, AssertionError):
        print('Error: Model mode not found')
        sys.exit(1)
    read_dataset()
    # load data
    try:
        img_name = sys.argv[2]
        img = mpimg.imread(img_name)
    except (IndexError, OSError):
        print('Error: Img not found')
        sys.exit(1)
    img = np.array([img])
    o_shape = img.shape
    img = np.reshape(img, (1, -1))
    #img = preprocessing.scale(img)
    img = (img - 0.5) * 4
    img = np.reshape(img, (1, 48, 48, 1))
    classes = [
        'Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral'
    ]

    # load model
    emotion_classifier = model.build_model(mode)
    emotion_classifier.load_weights(mode + '.h5')

    # predict
    predictions = emotion_classifier.predict_classes(img)
    print(classes[predictions[0]])
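Note that `predict_classes` only exists on older Keras `Sequential` models (it was removed from tf.keras in TensorFlow 2.6). If this script is run against a newer Keras, an equivalent for a softmax output is the usual argmax over the prediction, for example:

# replacement for predict_classes on newer Keras: argmax over the class axis of a softmax output
predictions = np.argmax(emotion_classifier.predict(img), axis=-1)
print(classes[predictions[0]])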
Example #3
def train_and_evaluate(output_dir, hparams):
    EVAL_INTERVAL = 30
    run_config = tf.estimator.RunConfig(save_checkpoints_secs=EVAL_INTERVAL,
                                        keep_checkpoint_max=3,
                                        save_summary_steps=25)
    estimator = tf.estimator.Estimator(model_fn=sequence_regressor,
                                       params=hparams,
                                       model_dir=output_dir,
                                       config=run_config)
    train_spec = tf.estimator.TrainSpec(input_fn=utils.read_dataset(
        hparams['train_set'],
        mode=tf.estimator.ModeKeys.TRAIN,
        batch_size=hparams['batch_size'],
        timeserie_column=TIMESERIES_COL),
                                        max_steps=hparams['training_steps'])
    exporter = tf.estimator.LatestExporter('exporter', serving_input_fn)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=utils.read_dataset(hparams['eval_set'],
                                    mode=tf.estimator.ModeKeys.EVAL,
                                    batch_size=hparams['batch_size'],
                                    timeserie_column=TIMESERIES_COL),
        steps=hparams['eval_steps'],
        start_delay_secs=60,  # start evaluating after N seconds
        throttle_secs=EVAL_INTERVAL,  # evaluate every N seconds
        exporters=exporter)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
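For reference, a hypothetical invocation might look as follows; the dictionary keys mirror the lookups in the function body above, while the paths and step counts are placeholders rather than values from the original project. This relies on the TensorFlow 1.x `tf.estimator` API.

# hypothetical hparams; keys match the hparams[...] lookups used above
hparams = {
    'train_set': 'data/train.csv',   # placeholder path
    'eval_set': 'data/eval.csv',     # placeholder path
    'batch_size': 32,
    'training_steps': 5000,
    'eval_steps': 100,
}
train_and_evaluate('trained_model', hparams)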
Example #4
def get_dataset(args, fold):
    tr_X, tr_y = read_dataset(os.path.join(args.dataset, '%03d' % fold),
                              "train.csv")
    ts_X, ts_y = read_dataset(os.path.join(args.dataset, '%03d' % fold),
                              "test.csv")

    return np.asarray(tr_X), np.asarray(tr_y), np.asarray(ts_X), \
        np.asarray(ts_y)
Example #5
def run_network(window, model=None, save_model=False, show_plot=False):
    start_time = time.time()

    print('loading and preparing data set...')
    data = read_dataset('../datasets/internet-traffic-data-5minutes.csv')
    X_train, y_train, X_test, y_test, mean, std = split_dataset(
        data, window, ratio=0.90, standardize=True)

    print('number of training samples ', len(y_train))
    print('number of test samples     ', len(y_test))

    if not model:
        print('initialize model...')
        model = compile_model(
            hidden_neurons=25, loss_fn='mse',
            input_dim=sum(1 for x in window if x), activation_fn='tanh')

        model.summary()  # summary() prints the architecture itself and returns None, so don't wrap it in print()

        print('train model...')
        early_stopping = EarlyStopping(monitor='val_loss', patience=2)
        model.fit(X_train, y_train, nb_epoch=500, validation_split=0.1,
                  callbacks=[early_stopping])

    print('make predictions...')
    prediction = model.predict(X_test).flatten()

    if show_plot:
        plot_result(prediction, y_test, mean, std)
        print('mase = ', mase(y_train, y_test, prediction))

    if save_model:
        store_model(model)

    print('total duration: {:.2f} seconds'.format(time.time() - start_time))
Example #6
def main(proc_num: int, no_limits: bool) -> None:
    df = read_dataset(no_limits)
    chunks = split_dataset_into_chunks(df, proc_num)
    with Pool(proc_num) as pool:
        pool.map(worker, chunks)
    for chunk in chunks:
        print(len(chunk.head()))
Example #7
def forecasting_different_horizons():
    print('load data set...')
    data_df = read_dataset('../datasets/internet-traffic-data-5minutes.csv')
    mean = data_df.mean()
    std = data_df.std()
    data_df -= mean
    data_df /= std
    data = data_df['data (in bytes)']

    start_forecast_idx = int(len(data) * 0.90)
    test_set = data[start_forecast_idx:]

    print('calculate forecasts using a MLP neural network...')
    mlp_model = load_model('Saturday_192115', 'mse')
    window = create_window_array(139, 288)
    window_size = sum(1 for x in window if x)

    plt.ylabel('data (normalized)')
    plt.xlabel('time')
    plt.plot(test_set, 'r-', label='test set')

    for steps, style in [(1, 'g-'), (24, 'b--')]:
        forecast = []
        for t in range(start_forecast_idx, len(data) - steps):
            forecast.append(iterative_prediction(
                mlp_model, data[:t], (1, window_size), window, steps+1))
        series = pd.Series([np.nan] * steps + forecast, index=test_set.index)
        plt.plot(series, style, label='h={}'.format(steps))

    plt.legend(loc='upper left')
    plt.show()
Example #8
    def train(self):
        tbCallBack = TensorBoard(log_dir='./lstm_logs',
                                 histogram_freq=0,
                                 write_graph=True,
                                 write_images=True)

        def modelSave(epoch, logs):
            if (epoch + 1) % 5 == 0:
                self.model.save('lstm_saved_model.h5')

        msCallBack = LambdaCallback(on_epoch_end=modelSave)

        texts_raw_indices, texts_raw_without_aspects_indices, texts_left_indices, texts_left_with_aspects_indices, \
        aspects_indices, texts_right_indices, texts_right_with_aspects_indices, \
        polarities_matrix = \
            read_dataset(type=self.DATASET,
                         mode='test',
                         embedding_dim=self.EMBEDDING_DIM,
                         max_seq_len=self.MAX_SEQUENCE_LENGTH, max_aspect_len=self.MAX_ASPECT_LENGTH)

        self.model.fit(self.texts_raw_indices,
                       self.polarities_matrix,
                       validation_data=(texts_raw_indices, polarities_matrix),
                       epochs=self.EPOCHS,
                       batch_size=self.BATCH_SIZE,
                       callbacks=[tbCallBack, msCallBack])
Example #9
    def train(self):
        tbCallBack = TensorBoard(log_dir='./ram_logs',
                                 histogram_freq=0,
                                 write_graph=True,
                                 write_images=True)

        texts_raw_indices, texts_left_indices, aspects_indices, texts_right_indices, polarities_matrix = \
            read_dataset(type=self.DATASET,
                         mode='test',
                         embedding_dim=self.EMBEDDING_DIM,
                         max_seq_len=self.MAX_SEQUENCE_LENGTH, max_aspect_len=self.MAX_ASPECT_LENGTH)

        for i in range(1, self.ITERATION):
            print()
            print('-' * 50)
            print('Iteration', i)
            self.model.fit(
                [self.texts_raw_indices, self.aspects_indices],
                self.polarities_matrix,
                validation_data=([texts_raw_indices,
                                  aspects_indices], polarities_matrix),
                batch_size=self.BATCH_SIZE,
                callbacks=[tbCallBack])
            if i % 5 == 0:
                self.model.save('ram_saved_model.h5')
                print('model saved')
Example #10
    def __init__(self):
        self.DATASET = 'restaurant'  # 'twitter', 'restaurant', 'laptop'
        self.POLARITIES_DIM = 3
        self.EMBEDDING_DIM = 100
        self.LEARNING_RATE = 0.01
        self.LSTM_PARAMS = {
            'units': 200,
            'activation': 'tanh',
            'recurrent_activation': 'sigmoid',
            'kernel_initializer': initializers.RandomUniform(minval=-0.003, maxval=0.003),
            'recurrent_initializer': initializers.RandomUniform(minval=-0.003, maxval=0.003),
            'bias_initializer': initializers.RandomUniform(minval=-0.003, maxval=0.003),
            'kernel_regularizer': regularizers.l2(0.001),
            'recurrent_regularizer': regularizers.l2(0.001),
            'bias_regularizer': regularizers.l2(0.001),
            'dropout': 0,
            'recurrent_dropout': 0,
        }
        self.MAX_SEQUENCE_LENGTH = 80
        self.MAX_ASPECT_LENGTH = 2
        self.BATCH_SIZE = 200
        self.ITERATION = 500

        self.texts_raw_indices, self.texts_left_indices, self.aspects_indices, self.texts_right_indices, \
        self.polarities_matrix, \
        self.embedding_matrix, \
        self.tokenizer = \
            read_dataset(type=self.DATASET,
                         mode='train',
                         embedding_dim=self.EMBEDDING_DIM,
                         max_seq_len=self.MAX_SEQUENCE_LENGTH, max_aspect_len=self.MAX_ASPECT_LENGTH)

        self.left_input = np.concatenate((self.texts_left_indices, self.aspects_indices), axis=1)
        self.right_input = np.concatenate((self.texts_right_indices, self.aspects_indices), axis=1)

        if os.path.exists('td_lstm_saved_model.h5'):
            print('loading saved model...')
            self.model = load_model('td_lstm_saved_model.h5')
        else:
            print('Build model...')
            inputs_l = Input(shape=(self.MAX_SEQUENCE_LENGTH + self.MAX_ASPECT_LENGTH,))
            inputs_r = Input(shape=(self.MAX_SEQUENCE_LENGTH + self.MAX_ASPECT_LENGTH,))
            Embedding_Layer = Embedding(input_dim=len(self.tokenizer.word_index) + 1,
                                        output_dim=self.EMBEDDING_DIM,
                                        input_length=self.MAX_SEQUENCE_LENGTH + self.MAX_ASPECT_LENGTH,
                                        weights=[self.embedding_matrix],
                                        trainable=False)
            x_l = Embedding_Layer(inputs_l)
            x_r = Embedding_Layer(inputs_r)
            x_l = LSTM(**self.LSTM_PARAMS)(x_l)
            x_r = LSTM(**self.LSTM_PARAMS, go_backwards=True)(x_r)
            x = Concatenate()([x_l, x_r])
            x = Dense(self.POLARITIES_DIM)(x)
            predictions = Activation('softmax')(x)
            model = Model(inputs=[inputs_l, inputs_r], outputs=predictions)
            model.summary()
            model.compile(loss='categorical_crossentropy', optimizer=optimizers.Adam(lr=self.LEARNING_RATE), metrics=['acc'])
            # plot_model(model, to_file='model.png')
            self.model = model
Example #11
def train_dataset(dataset_name, initial_design, mode, runs=1):
    data = read_dataset('../datasets/', dataset_name + '.csv')

    X_train, X_val, y_train, y_val = train_test_split(
        data.iloc[:, :-1].values, data[data.columns.to_list()[-1]])

    training_lower = np.min(X_train, axis=0)
    training_upper = np.max(X_train, axis=0)

    for i in range(runs):
        print('run', i + 1, 'of', runs)
        # TODO make universal for any model
        objective_function = ML(X_train=X_train,
                                y_train=y_train,
                                X_val=X_val,
                                y_val=y_val)

        # size: number of hyperparameters
        opt_lower = np.array([1, 0.00001, 0.0001, 50, 0.01, 0.09, 0.0999, 5])
        opt_upper = np.array([150, 0.01, 0.1, 300, 0.9, 0.9, 0.999, 15])

        n_init = 3  # number of points for the initial design.
        init_design = init_latin_hypercube_sampling

        n_iterations = 100

        X_init = None  # mvp
        Y_init = None  # mvp

        maximizer = 'random'
        acquisition_func = 'log_ei'
        model_type = 'gp'
        result_path = ('../optimization_results/' + mode + '/f-score/' +
                       maximizer + '-' + acquisition_func + '-' + model_type +
                       '/' + dataset_name + '/run-' + str(i))
        if not os.path.exists(result_path):
            os.makedirs(result_path)

        d_name = dataset_name
        neighbor = get_nearest_names(1, d_name)[0]

        results = bayesian_optimization(objective_function,
                                        d_name,
                                        neighbor,
                                        opt_lower,
                                        opt_upper,
                                        num_iterations=n_iterations,
                                        initial_design=initial_design,
                                        X_init=X_init,
                                        Y_init=Y_init,
                                        maximizer=maximizer,
                                        acquisition_func=acquisition_func,
                                        model_type=model_type,
                                        n_init=3,
                                        rng=None,
                                        output_path=result_path)

        json.dump(results, open(os.path.join(result_path, 'RESULTS.json'),
                                'w'))
Example #12
def main():
    ts_5minutes = read_dataset('../datasets/internet-traffic-data-5minutes.csv')
    ts_hourly = read_dataset('../datasets/internet-traffic-data-hourly.csv')
    ts_daily = read_dataset('../datasets/internet-traffic-data-daily.csv')

    plot_all_full_timeseries(ts_5minutes, ts_hourly, ts_daily)

    plot_full_timeseries(ts_5minutes)
    plot_full_timeseries(ts_hourly)
    plot_full_timeseries(ts_daily)

    plot_interval_of_timeseries(ts_5minutes, '2005-06-22', '2005-06-22')
    plot_interval_of_timeseries(ts_5minutes, '2005-07-04', '2005-07-10')

    plot_daily_means(ts_5minutes)
    plot_acf(ts_hourly, 200, 24, 'ACF hourly data (one week)')
    plot_acf(ts_5minutes, 300, 50, 'ACF 5 min, data (one day)')
Example #13
def main(train, test, ngram, we):
    '''
    Given train and test dataset paths (train, test),
    an ngram string rule like "123" (ngram),
    and a word embedding file path (we).
    '''
    global times
    times = 0

    print 'loading data...'
    train_df = read_dataset(train)
    test_df = read_dataset(test)

    print 'cleaning data and add lex indicators...'
    train_df[['text_lex', 'lex_ws']] = train_df.apply(add_lex_indicator,
                                                      axis=1)
    test_df[['text_lex', 'lex_ws']] = test_df.apply(add_lex_indicator, axis=1)

    print "using train to build pos and neg dic..."
    pos_dic, neg_dic = build_dict(train_df, ngram)

    print "computing log-count ratio r..."
    dic, r, v = compute_ratio(pos_dic, neg_dic)

    print 'loading word embedding...'
    word2vec = load_bin_vec(we)

    print "building train and test features --- ngram part..."
    train_df.sort_index(inplace=True)
    test_df.sort_index(inplace=True)
    X_train_ngram, y_train = process_files_ngram(train_df, dic, r, v, ngram)
    X_test_ngram, y_test = process_files_ngram(test_df, dic, r, v, ngram)

    print "building train and test features --- pos embedding part..."
    X_train_embed = process_files_wemb(train_df, word2vec)
    X_test_embed = process_files_wemb(test_df, word2vec)

    print "combining log-count ratio and pos embedding features..."
    train_f = sp.hstack((X_train_ngram, X_train_embed), format='csr')
    test_f = sp.hstack((X_test_ngram, X_test_embed), format='csr')

    print "running model..."
    basemodel = LogisticRegression()
    f_score = model_run(basemodel, train_f, test_f, y_train, y_test)
    print '##############f_score is: ', f_score

    print 'model ended.'
Example #14
def input_fn(dataset_filename, vocab_filename, num_channels=39, batch_size=8, num_epochs=1):
    dataset = utils.read_dataset(dataset_filename, num_channels)
    vocab_table = utils.create_vocab_table(vocab_filename)

    dataset = utils.process_dataset(
        dataset, vocab_table, utils.SOS, utils.EOS, batch_size, num_epochs)

    return dataset
Example #15
def load_data():
    """
    Load data for train.
    :return: features and labels, cleaned.
    """
    print("Loading train dataset...")
    features = utils.read_dataset(const.SEMCOR_TRAIN)
    labels = utils.read_dataset(const.SEMCOR_LABEL)
    # clean sentences
    features = preprocess.clean_sentences(features)
    labels = preprocess.clean_sentences(labels)
    print("Loading dev dataset...")
    features_dev = utils.read_dataset(const.SE07_FEATURE)
    labels_dev = utils.read_dataset(const.SE07_LABEL)
    # clean sentences
    features_dev = preprocess.clean_sentences(features_dev)
    labels_dev = preprocess.clean_sentences(labels_dev)
    return features, features_dev, labels, labels_dev
Example #16
    def __init__(self):
        self.DATASET = 'twitter'  # 'twitter', 'restaurant', 'laptop'
        self.POLARITIES_DIM = 3
        self.EMBEDDING_DIM = 100
        self.LEARNING_RATE = 0.01
        self.INITIALIZER = initializers.RandomUniform(minval=-0.003,
                                                      maxval=0.003)
        self.REGULARIZER = regularizers.l2(0.001)
        self.LSTM_PARAMS = {
            'units': 200,
            'activation': 'tanh',
            'recurrent_activation': 'sigmoid',
            'kernel_initializer': self.INITIALIZER,
            'recurrent_initializer': self.INITIALIZER,
            'bias_initializer': self.INITIALIZER,
            'kernel_regularizer': self.REGULARIZER,
            'recurrent_regularizer': self.REGULARIZER,
            'bias_regularizer': self.REGULARIZER,
            'dropout': 0,
            'recurrent_dropout': 0,
        }
        self.MAX_SEQUENCE_LENGTH = 80
        self.MAX_ASPECT_LENGTH = 10
        self.BATCH_SIZE = 200
        self.EPOCHS = 100

        self.texts_raw_indices, self.texts_raw_without_aspects_indices, self.texts_left_indices, self.texts_left_with_aspects_indices, \
        self.aspects_indices, self.texts_right_indices, self.texts_right_with_aspects_indices, \
        self.polarities_matrix, \
        self.embedding_matrix, \
        self.tokenizer = \
            read_dataset(type=self.DATASET,
                         mode='train',
                         embedding_dim=self.EMBEDDING_DIM,
                         max_seq_len=self.MAX_SEQUENCE_LENGTH, max_aspect_len=self.MAX_ASPECT_LENGTH)

        if os.path.exists('lstm_saved_model.h5'):
            print('loading saved model...')
            self.model = load_model('lstm_saved_model.h5')
        else:
            print('Build model...')
            inputs = Input(shape=(self.MAX_SEQUENCE_LENGTH, ))
            x = Embedding(input_dim=len(self.tokenizer.word_index) + 1,
                          output_dim=self.EMBEDDING_DIM,
                          input_length=self.MAX_SEQUENCE_LENGTH,
                          weights=[self.embedding_matrix],
                          trainable=False)(inputs)
            x = LSTM(**self.LSTM_PARAMS)(x)
            x = Dense(self.POLARITIES_DIM)(x)
            predictions = Activation('softmax')(x)
            model = Model(inputs, predictions)
            model.summary()
            model.compile(loss='categorical_crossentropy',
                          optimizer=optimizers.Adam(lr=self.LEARNING_RATE),
                          metrics=['acc', f1])
            # plot_model(model, to_file='model.png')
            self.model = model
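The example above passes a custom `f1` metric to `model.compile`; its definition is not shown here. A minimal sketch of such a batch-wise F1 metric for one-hot targets, assuming the standard Keras backend API, might look like the following (the repository's own `f1` may differ):

# assumption: a simple batch-wise F1 computed with Keras backend ops
from keras import backend as K

def f1(y_true, y_pred):
    # binarize predictions and compute precision/recall over the batch
    y_pred = K.round(K.clip(y_pred, 0, 1))
    tp = K.sum(K.cast(y_true * y_pred, 'float32'))
    predicted_positives = K.sum(K.cast(y_pred, 'float32'))
    possible_positives = K.sum(K.cast(y_true, 'float32'))
    precision = tp / (predicted_positives + K.epsilon())
    recall = tp / (possible_positives + K.epsilon())
    return 2 * precision * recall / (precision + recall + K.epsilon())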
Example #17
    def test_unseen(self):
        laptop_texts_raw_indices, laptop_texts_raw_without_aspects_indices, laptop_texts_left_indices, laptop_texts_left_with_aspects_indices, \
        laptop_aspects_indices, laptop_texts_right_indices, laptop_texts_right_with_aspects_indices, laptop_dataset_index, \
        laptop_polarities_matrix, laptop_polarities = \
            read_dataset(types=['twitter'],
                         mode='test',
                         embedding_dim=self.EMBEDDING_DIM,
                         max_seq_len=self.MAX_SENTENCE_LENGTH, max_aspect_len=self.MAX_ASPECT_LENGTH)
        self.model.evaluate([laptop_texts_left_indices, laptop_texts_right_indices, laptop_dataset_index], \
                            [laptop_polarities_matrix, laptop_polarities_matrix, laptop_polarities_matrix], steps=1)

        hotel_texts_raw_indices, hotel_texts_raw_without_aspects_indices, hotel_texts_left_indices, hotel_texts_left_with_aspects_indices, \
        hotel_aspects_indices, hotel_texts_right_indices, hotel_texts_right_with_aspects_indices, hotel_dataset_index, \
        hotel_polarities_matrix, hotel_polarities = \
            read_dataset(types=['hotel'],
                         mode='test',
                         embedding_dim=self.EMBEDDING_DIM,
                         max_seq_len=self.MAX_SENTENCE_LENGTH, max_aspect_len=self.MAX_ASPECT_LENGTH)
        self.model.evaluate([hotel_texts_left_indices, hotel_texts_right_indices, hotel_dataset_index], \
                            [hotel_polarities_matrix, hotel_polarities_matrix, hotel_polarities_matrix], steps=1)
Example #18
    def test(self):
        tw_texts_raw_indices, tw_texts_raw_without_aspects_indices, tw_texts_left_indices, tw_texts_left_with_aspects_indices, \
        tw_aspects_indices, tw_texts_right_indices, tw_texts_right_with_aspects_indices, tw_dataset_index, \
        tw_polarities_matrix, tw_polarities = \
            read_dataset(types=self.DATASET,
                         mode='test',
                         embedding_dim=self.EMBEDDING_DIM,
                         max_seq_len=self.MAX_SENTENCE_LENGTH, max_aspect_len=self.MAX_ASPECT_LENGTH)

        self.model.evaluate([tw_texts_left_indices, tw_texts_right_indices],
                            tw_polarities, steps=1)
Example #19
def main():
    '''
    Main function.
    '''
    args = parse_args()

    dataset = read_dataset(args.file, args.separator, args.colobs,
                           args.varnames, args.indices)
    discretized = discretize(dataset)
    write_dataset(discretized, args.out, args.separator, args.colobs,
                  args.varnames, args.indices)
Example #20
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-n',
                        '--top_n',
                        type=int,
                        required=False,
                        default=TOP_N,
                        help='Number of documents to use.')
    FLAGS, _ = parser.parse_known_args()
    top_n = FLAGS.top_n

    train_dataset = utils.read_dataset(TRAIN_DATA_PATH, top_n)
    train_dataset = utils.augment_with_permutations(train_dataset)
    train_data, train_labels = utils.to_numpy(train_dataset, top_n)
    del train_dataset

    val_dataset = utils.read_dataset(VAL_DATA_PATH, top_n)
    val_data, val_labels = utils.to_numpy(val_dataset, top_n)
    del val_dataset

    model = arch.get_model(top_n)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()

    filepath = os.path.join(
        MODELS_DIR, "model-" + str(uuid.uuid4()) +
        "-{epoch:04d}-{val_loss:.4f}-{val_acc:.4f}.hdf5")
    save_model = ModelCheckpoint(filepath,
                                 monitor='val_acc',
                                 verbose=0,
                                 save_best_only=False,
                                 mode='max')
    model.fit(train_data,
              train_labels,
              validation_data=(val_data, val_labels),
              batch_size=128,
              epochs=75,
              verbose=1,
              callbacks=[save_model])
Example #21
def prepare_dataset(dataset_name):
    data = read_dataset('../datasets/', dataset_name + '.csv')

    for i in range(len(data.columns) - 1):
        column = data.columns[i]
        data[column] = pd.factorize(data[column])[0] + 1

    # shuffle = data.sample(frac=1)  # shuffle rows
    # if len(shuffle) > 5000:
    #     shuffle = shuffle[:5000]  # cut objects
    # shuffle.to_csv('../datasets/' + dataset_name + '.csv', index=False)
    data.to_csv('../datasets/' + dataset_name + '.csv', index=False)
Example #22
def train(model, ckpt, manager, dataset_dir, n_epoch):

    for n in range(n_epoch):
        dataset = utils.read_dataset(dataset_dir)
        print(f"Epoch: {n}")
        for image, labels, image_id in dataset:
            current_loss = train_step(model, image, labels)
            ckpt.step.assign_add(1)
            if int(ckpt.step) % 10 == 0:
                save_path = manager.save()
                print("Checkpoint stored at step {}: {}".format(
                    int(ckpt.step), save_path))
                print("loss {:1.2f}".format(current_loss.numpy()))
Example #23
def read_model():
    start_time = time.time()

    print('loading and preparing data set...')
    data_5m = read_dataset('../datasets/internet-traffic-data-5minutes.csv')
    data_hoursly = read_dataset('../datasets/internet-traffic-data-hourly.csv')
    data_daily = read_dataset('../datasets/internet-traffic-data-daily.csv')

    # TODO: change this when merging...
    data_5m = data_5m['data (in bytes)'].tolist()
    data_daily = data_daily['data (in bytes)'].tolist()
    data_hoursly = data_hoursly['data (in bytes)'].tolist()

    rest_5m = int(len(data_5m) * 0.80) % 288
    start_forecast_idx_5m = int(len(data_5m) * 0.90) - rest_5m
    train_5m = data_5m[:start_forecast_idx_5m]
    test_5m = data_5m[start_forecast_idx_5m:]

    rest_hoursly = int(len(data_hoursly) * 0.80) % 168
    start_forecast_idx_hoursly = int(len(data_hoursly) * 0.90) - rest_hoursly
    train_hoursly = data_hoursly[:start_forecast_idx_hoursly]
    test_hoursly = data_hoursly[start_forecast_idx_hoursly:]

    rest_daily = int(len(data_daily) * 0.80) % 7
    start_forecast_idx_daily = int(len(data_daily) * 0.90) - rest_daily
    train_daily = data_daily[:start_forecast_idx_daily]
    test_daily = data_daily[start_forecast_idx_daily:]


    print('5min: ', len(train_5m), " ", len(test_5m))
    print('hoursly: ', len(train_hoursly), " ", len(test_hoursly))
    print('daily: ', len(train_daily), " ", len(test_daily), "\n")

    plot_daily(train_daily, test_daily)
    plot_hoursly(train_hoursly, test_hoursly)
    plot_5min(train_5m, test_5m)

    #errors
    plot_errors(train_5m, test_5m, train_hoursly, test_hoursly, train_daily, test_daily)
Example #24
def main():
    args = parser.parse_args()

    with open(args.classifier, 'r') as f:
        serialized_classifier = f.read()

    processor = TextProcessor()
    classifier = Classifier.load(serialized_classifier, processor)

    for example in read_dataset(args.data):
        text = example['content']
        predicted_tag = classifier.classify(text)
        print predicted_tag
Example #25
def main():
    args = parser.parse_args()
    data_json = read_dataset(args.data)

    processor = TextProcessor()
    classifier = Classifier(processor)
    classifier.train(data_json)

    serialized_classifier = classifier.dump()

    ensure_directory(args.output)
    with open(args.output, 'w') as f:
        f.write(serialized_classifier)
        f.write(os.linesep)
Example #26
def main():
    args = parser.parse_args()
    data_json = read_dataset(args.data)

    processor = TextProcessor()
    classifier = Classifier(processor)
    classifier.train(data_json)

    serialized_classifier = classifier.dump()

    ensure_directory(args.output)
    with open(args.output, 'w') as f:
        f.write(serialized_classifier)
        f.write(os.linesep)
Example #27
def get_weak_scaling_datasets(basedir, datasets, weak, scratch):
    if len(datasets) > 1:
        raise RuntimeError('Weak scaling with more than one data set is not yet supported')
    dname = datasets[0]
    dataset = all_datasets[dname]
    read = read_dataset(join(basedir, dataset[0]), dataset[3], dataset[4], dataset[5], dataset[6])
    weak_datasets = []
    for n in weak:
        dataset_n = list(dataset)
        dataset_n[0] = join(scratch, basename(dataset[0]) + '.' + str(n))
        dataset_n[1] = n
        write_dataset(read.iloc[:,:n], dataset_n[0], dataset_n[3], dataset_n[4], dataset_n[5], dataset_n[6])
        all_datasets.update([(dname + '.' + str(n), tuple(dataset_n))])
        weak_datasets.append(dname + '.' + str(n))
    return weak_datasets
Example #28
def main():
    args = parser.parse_args()
    data_json = read_dataset(args.data)
    random.shuffle(data_json)

    training_set_ratio = 0.7
    training_set_size = int(training_set_ratio * len(data_json) + 0.5)

    training_set = data_json[:training_set_size]
    test_set = data_json[training_set_size:]

    processor = TextProcessor()
    classifier = Classifier(processor)
    classifier.train(training_set)

    print classifier.dump() == Classifier.load(classifier.dump(), processor).dump()
Example #29
def test_model(dataset_dir, model: model.RCNN, output_dir, category_colors):
    total_accuracy = 0
    class_correct_counts = np.zeros(model.num_classes)
    class_total_counts = np.zeros(model.num_classes)
    i = 0

    for image, labels, img_id in utils.read_dataset(dataset_dir):
        i += 1
        start_time = time.time()
        accuracy = 0.0
        h, w = labels.shape
        input_image = np.append(image, np.zeros(
            shape=[h, w, model.num_classes], dtype=np.float32), axis=2)

        model([input_image])
        logits1, logits2 = model.logits

        logits = logits1 if model.num_layers == 1 else logits2
        stride = 16 if model.model_v == 1 else 4

        predicted_labels = np.argmax(logits, axis=3)

        true_labels = labels[::stride, ::stride]

        correct_labels = np.equal(predicted_labels, true_labels)
        accuracy = np.mean(correct_labels)
        total_accuracy += accuracy

        for c in range(model.num_classes):
            current_class_labels = np.equal(true_labels, c)
            class_total_counts[c] += np.sum(current_class_labels)
            class_correct_counts[c] += np.sum(
                np.equal(true_labels, c) * correct_labels)

        print("Image #%d: %s: Accuracy: %f (time: %.1fs)" % (
            i, img_id, accuracy, time.time() - start_time))

        for layer_num in [1, 2]:
            output_filename = os.path.join(
                output_dir, img_id + '_test_%d.png' % layer_num)
            utils.save_labels_array(predicted_labels.astype(
                np.uint8), output_filename, colors=category_colors)

    print("%d Images, Total Accuracy: %f" % (i, total_accuracy / i))
    print("Per Class correct counts:", class_correct_counts)
    print("Per Class totals:", class_total_counts)
    print("Per Class accuracy:", class_correct_counts / class_total_counts)
Example #30
def main():
    args = parser.parse_args()
    data_json = read_dataset(args.data)
    random.shuffle(data_json)

    training_set_ratio = 0.7
    training_set_size = int(training_set_ratio * len(data_json) + 0.5)

    training_set = data_json[:training_set_size]
    test_set = data_json[training_set_size:]

    processor = TextProcessor()
    classifier = Classifier(processor)
    classifier.train(training_set)

    print classifier.dump() == Classifier.load(classifier.dump(),
                                               processor).dump()
Example #31
    def read_data(self, max_train_size, max_dev_size, read_ahead=10, batch_mode='standard', shuffle=True,
                  crash_test=False, **kwargs):
        utils.debug('reading training data')
        self.batch_iterator, self.train_size = utils.get_batch_iterator(
            self.filenames.train, self.extensions, self.vocabs, self.batch_size,
            max_size=max_train_size, character_level=self.character_level, max_seq_len=self.max_len,
            read_ahead=read_ahead, mode=batch_mode, shuffle=shuffle, binary=self.binary, crash_test=crash_test
        )

        utils.debug('reading development data')

        dev_sets = [
            utils.read_dataset(dev, self.extensions, self.vocabs, max_size=max_dev_size,
                               character_level=self.character_level, binary=self.binary)[0]
            for dev in self.filenames.dev
            ]
        # subset of the dev set whose loss is periodically evaluated
        self.dev_batches = [utils.get_batches(dev_set, batch_size=self.batch_size) for dev_set in dev_sets]
Example #32
    def train(self):
        # tbCallBack = TensorBoard(log_dir='./ram_logs', histogram_freq=0, write_graph=True, write_images=True)
        def modelSave(epoch, logs):
            if (epoch + 1) % 5 == 0:
                self.model.save('lstm_saved_model.h5')
        msCallBack = LambdaCallback(on_epoch_end=modelSave)

        texts_raw_indices, texts_raw_without_aspects_indices, texts_left_indices, texts_left_with_aspects_indices, \
        aspects_indices, texts_right_indices, texts_right_with_aspects_indices, \
        polarities_matrix = \
            read_dataset(type=self.DATASET,
                         mode='test',
                         embedding_dim=self.EMBEDDING_DIM,
                         max_seq_len=self.MAX_SEQUENCE_LENGTH, max_aspect_len=self.MAX_ASPECT_LENGTH)

        self.model.fit([self.texts_raw_indices, self.aspects_indices], self.polarities_matrix,
                       epochs=self.EPOCHS, batch_size=self.BATCH_SIZE, callbacks=[msCallBack])
        scores = self.model.evaluate([texts_raw_indices, aspects_indices], polarities_matrix, verbose=0)
        print("Loss:", scores[0], "Accuracy:", scores[1] * 100)
Example #33
def input_fn(dataset_filename,
             vocab_filename,
             norm_filename=None,
             num_channels=39,
             batch_size=8,
             num_epochs=1,
             binf2phone=None,
             num_parallel_calls=32,
             max_frames=-1,
             max_symbols=-1):
    binary_targets = binf2phone is not None
    labels_shape = [] if not binary_targets else len(binf2phone.index)
    labels_dtype = tf.string if not binary_targets else tf.float32
    dataset = utils.read_dataset(dataset_filename,
                                 num_channels,
                                 labels_shape=labels_shape,
                                 labels_dtype=labels_dtype)
    vocab_table = utils.create_vocab_table(vocab_filename)

    if norm_filename is not None:
        means, stds = utils.load_normalization(norm_filename)
    else:
        means = stds = None

    sos = binf2phone[utils.SOS].values if binary_targets else utils.SOS
    eos = binf2phone[utils.EOS].values if binary_targets else utils.EOS

    dataset = utils.process_dataset(dataset,
                                    vocab_table,
                                    sos,
                                    eos,
                                    means,
                                    stds,
                                    batch_size,
                                    num_epochs,
                                    binary_targets=binary_targets,
                                    labels_shape=labels_shape,
                                    num_parallel_calls=num_parallel_calls,
                                    max_frames=max_frames,
                                    max_symbols=max_symbols)

    return dataset
Example #34
def main():
    args = parser.parse_args()
    full_data_json = read_dataset(args.data)

    # for n in xrange(30, len(full_data_json), 30):
    for n in [len(full_data_json)]:

        corrects = 0
        total = 0

        for _ in xrange(SAMPLES):

            random.shuffle(full_data_json)
            data_json = full_data_json[:n]

            training_set_ratio = 0.7
            training_set_size = int(training_set_ratio * len(data_json) + 0.5)

            training_set = data_json[:training_set_size]
            test_set = data_json[training_set_size:]

            processor = TextProcessor()
            classifier = Classifier(processor)
            classifier.train(training_set)

            for example in test_set:
                text = example["content"]
                predicted_tag = classifier.classify(text)
                expected_tag = classifier.normalize_tag_label(example["tag"])
                if expected_tag in Classifier.IGNORE_TAGS:
                    continue
                if predicted_tag == expected_tag:
                    corrects += 1
                else:
                    # print 'expected = {}, predicted = {}'.format(expected_tag, predicted_tag)
                    pass
                total += 1

        print "{} {}".format(len(data_json), float(corrects) / total)
Example #35
def input_fn(dataset_filename,
             vocab_filename,
             norm_filename=None,
             num_channels=39,
             batch_size=8,
             take=0,
             binf2phone=None):
    binary_targets = binf2phone is not None
    labels_shape = [] if not binary_targets else len(binf2phone.index)
    labels_dtype = tf.string if not binary_targets else tf.float32
    dataset = utils.read_dataset(dataset_filename,
                                 num_channels,
                                 labels_shape=labels_shape,
                                 labels_dtype=labels_dtype)
    vocab_table = utils.create_vocab_table(vocab_filename)

    if norm_filename is not None:
        means, stds = utils.load_normalization(norm_filename)
    else:
        means = stds = None

    sos = binf2phone[utils.SOS].values if binary_targets else utils.SOS
    eos = binf2phone[utils.EOS].values if binary_targets else utils.EOS

    dataset = utils.process_dataset(dataset,
                                    vocab_table,
                                    sos,
                                    eos,
                                    means,
                                    stds,
                                    batch_size,
                                    1,
                                    binary_targets=binary_targets,
                                    labels_shape=labels_shape,
                                    is_infer=True)

    if take > 0:
        dataset = dataset.take(take)
    return dataset
Example #36
def main():
    args = parser.parse_args()
    full_data_json = read_dataset(args.data)

    # for n in xrange(30, len(full_data_json), 30):
    for n in [len(full_data_json)]:

        corrects = 0
        total = 0

        for _ in xrange(SAMPLES):

            random.shuffle(full_data_json)
            data_json = full_data_json[:n]

            training_set_ratio = 0.7
            training_set_size = int(training_set_ratio * len(data_json) + 0.5)

            training_set = data_json[:training_set_size]
            test_set = data_json[training_set_size:]

            processor = TextProcessor()
            classifier = Classifier(processor)
            classifier.train(training_set)

            for example in test_set:
                text = example['content']
                predicted_tag = classifier.classify(text)
                expected_tag = classifier.normalize_tag_label(example['tag'])
                if expected_tag in Classifier.IGNORE_TAGS:
                    continue
                if predicted_tag == expected_tag:
                    corrects += 1
                else:
                    # print 'expected = {}, predicted = {}'.format(expected_tag, predicted_tag)
                    pass
                total += 1

        print '{} {}'.format(len(data_json), float(corrects) / total)
Example #37
def hyper_parameter_search(max_evals=100):
    from hyperopt import fmin, tpe, hp, STATUS_OK, STATUS_FAIL

    data = read_dataset('../datasets/internet-traffic-data-5minutes.csv')
    space = {
        'nneurons': hp.randint('nneurons', 41),
        'window': hp.randint('window', 2048),
        'season': hp.choice('season', ['no_season', 'half_day', 'full_day']),
        'activation_function': hp.choice('func', ['sigmoid', 'tanh', 'relu'])
    }

    def objective(params):
        nneurons = params['nneurons']

        if params['season'] == 'full_day':
            window = create_window_array(params['window'], season_lag=288)
        elif params['season'] == 'half_day':
            window = create_window_array(params['window'], season_lag=168)
        else:
            window = create_window_array(params['window'])

        if not any(window) or nneurons < 2:
            return {'status': STATUS_FAIL}

        X_train, y_train, *_ = split_dataset(
            data, window, ratio=0.90, standardize=True)
        model = compile_model(
            nneurons, input_dim=sum(1 for x in window if x), loss_fn='mse',
            activation_fn=params['activation_function'])
        hist = model.fit(
            X_train, y_train, nb_epoch=50, validation_split=0.1,
            callbacks=[EarlyStopping(monitor='val_loss', patience=2)],
            verbose=0)

        return {'loss': hist.history['val_loss'][-1], 'status': STATUS_OK}

    return fmin(objective, space=space, algo=tpe.suggest, max_evals=max_evals)
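A hypothetical call to the search above; note that `fmin` reports `hp.choice` parameters as indices into their option lists rather than as the labels themselves.

best = hyper_parameter_search(max_evals=100)
# choice entries come back as indices, e.g. {'nneurons': 23, 'window': 512, 'season': 2, 'func': 1}
print(best)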
Example #38
def parse_datasets(args):
    '''
    Get datasets to be used for the experiments.
    '''
    from os.path import splitext

    experiment_datasets = []
    if args.dataset is None:
        args.dataset = list(big_datasets.keys())
    for d in args.dataset:
        if d in all_datasets:
            experiment_datasets.append(d)
        elif d in dataset_groups:
            experiment_datasets.extend(list(dataset_groups[d].keys()))
        else:
            try:
                rd = read_dataset(d, args.separator, args.colobs, args.varnames, args.indices)
                m, n = rd.shape
                name = splitext(basename(d))[0]
                all_datasets[name] = (d, n, m, args.separator, args.colobs, args.varnames, args.indices)
                experiment_datasets.append(name)
            except:
                raise RuntimeError('Dataset %s is not recognized' % d)
    args.dataset = experiment_datasets
Example #39
def forecasting_error_experiment():
    print('load data set...')
    data_df = read_dataset('../datasets/internet-traffic-data-5minutes.csv')
    mean = data_df.mean()
    std = data_df.std()
    data_df -= mean
    data_df /= std
    data = data_df['data (in bytes)']

    start_forecast_idx = int(len(data) * 0.90)
    training_data = data[:start_forecast_idx]
    test_data = data[start_forecast_idx:]

    print('calculate forecasts using the naive method...')
    naive_forecast = [data[t-1] for t in range(start_forecast_idx, len(data))]
    naive_forecast_errors = []
    for steps in trange(24):
        forecast = naive_forecast[:len(naive_forecast)-steps]
        error = mase(training_data, test_data[steps:], forecast)
        naive_forecast_errors.append(error)

    print('calculate forecasts using a MLP neural network...')
    mlp_model = load_model('Saturday_192115', 'mse')
    window = create_window_array(139, 288)
    window_size = sum(1 for x in window if x)

    mlp_forecast_errors = []
    for steps in trange(24):
        mlp_forecast = []
        for t in range(start_forecast_idx, len(data) - steps):
            mlp_forecast.append(iterative_prediction(
                mlp_model, data[:t], (1, window_size), window, steps+1))
        mlp_forecast_errors.append(
            mase(training_data, test_data[steps:], mlp_forecast))

    print('calculate forecasts using a LSTM neural network...')
    lstm_model = load_model('Saturday_181721', 'mse')
    window = [True] * 19

    lstm_forecast_errors = []
    for steps in trange(24):
        lstm_forecast = []
        for t in range(start_forecast_idx, len(data) - steps):
            lstm_forecast.append(iterative_prediction(
                lstm_model, data[:t], (1, len(window), 1), window, steps+1))
        lstm_forecast_errors.append(
            mase(training_data, test_data[steps:], lstm_forecast))

    print('calculate forecasts using a deep LSTM neural network...')
    lstm_model = load_model('Saturday_171936', 'mse')
    window = [True] * 14

    deep_lstm_forecast_errors = []
    for steps in trange(24):
        lstm_forecast = []
        for t in range(start_forecast_idx, len(data) - steps):
            lstm_forecast.append(iterative_prediction(
                lstm_model, data[:t], (1, len(window), 1), window, steps+1))
        deep_lstm_forecast_errors.append(
            mase(training_data, test_data[steps:], lstm_forecast))

    plt.ylabel('Error')
    plt.xlabel('Steps')
    plt.plot(naive_forecast_errors, label='naïve')
    plt.plot(mlp_forecast_errors, label='MLP')
    plt.plot(lstm_forecast_errors, label='1 layer LSTM')
    plt.plot(deep_lstm_forecast_errors, label='2 layer LSTM')
    plt.legend(loc='upper left')
    plt.show()
Example #40
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
import utils
trainf, trainl, testf, test_ids, feature_names = utils.read_dataset('data/factorized1.npz')
extra_trees = ExtraTreesRegressor(n_estimators=100, max_features=50,
                                  max_depth=35, min_samples_leaf=4, n_jobs=4)
y, trainy = utils.cross_val_model(extra_trees, trainf, trainl, testf)
y = y.reshape((len(y), 1))
trainy = trainy.reshape((len(trainy), 1))
utils.save_dataset("data/extra_trees_factorized.npz", train_features=trainy, train_labels=trainl, test_features=y, ids=test_ids, feature_names=['extra_trees'])
Example #41
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
import utils
trainf, trainl, testf, test_ids, feature_names = utils.read_dataset('data/no_big_cats1.npz')
extra_trees = ExtraTreesRegressor(n_estimators=100, max_features=.3,
                                  max_depth=30, min_samples_leaf=4, n_jobs=4)
y, trainy = utils.cross_val_model(extra_trees, trainf, trainl, testf)
y = y.reshape((len(y), 1))
trainy = trainy.reshape((len(trainy), 1))
utils.save_dataset("data/random_forest2.npz", train_features=trainy, train_labels=trainl, test_features=y, ids=test_ids, feature_names=['random_forest'])