Esempio n. 1
0
    # Excerpt of a larger routine (its header is outside this view): builds
    # labelled feature tables from positive/negative text directories.
    # `pre`, `TRAIN_PATH_POS`, `TRAIN_PATH_NEG` and `n_words` come from
    # outside this excerpt.

    # Load both classes of training texts into `pre` under separate names.
    pre.load_all_texts_from_directory(path=TRAIN_PATH_POS, name='raw_pos')
    pre.load_all_texts_from_directory(path=TRAIN_PATH_NEG, name='raw_neg')

    # Positives first, then negatives; the target vector built below relies
    # on exactly this ordering.
    texts_list = pd.concat([pre.data['raw_pos'], pre.data['raw_neg']])

    print('fit & transform training data')

    text_transformer = TextTransformer(num_words=n_words)
    train_features = text_transformer.fit_and_transform(texts_list)
    # Labels: 1.0 for every 'raw_pos' entry followed by 0.0 for every
    # 'raw_neg' entry.
    # NOTE(review): assumes fit_and_transform returns one row per input text,
    # in input order -- confirm against TextTransformer.
    train_features['target'] = np.append(np.ones(len(pre.data['raw_pos'])),
                                         np.zeros(len(pre.data['raw_neg'])))

    # Shuffle the rows. This is not in-place: the reordered result is
    # assigned back to the name, and the index is renumbered from scratch
    # (presumably a pandas DataFrame -- confirm).
    train_features = train_features.sample(frac=1).reset_index(drop=True)

    # Store and save the training table under a name that includes n_words.
    pre.set('training_data' + '_' + str(n_words), train_features)
    pre.save('training_data' + '_' + str(n_words))

    # Transform the test data with the same transformer object.

    TEST_PATH_POS = 'test/pos/'
    TEST_PATH_NEG = 'test/neg/'

    pre.load_all_texts_from_directory(path=TEST_PATH_POS, name='raw_pos_test')
    pre.load_all_texts_from_directory(path=TEST_PATH_NEG, name='raw_neg_test')

    print('transform test data')

    # Same positives-then-negatives ordering as for the training texts.
    texts_list_test = pd.concat(
        [pre.data['raw_pos_test'], pre.data['raw_neg_test']])
    # transform() is called here rather than fit_and_transform(); presumably
    # this reuses what was fitted on the training texts -- confirm.
    test_features = text_transformer.transform(texts_list_test)
    # Excerpt of a larger routine: `trained_model`, `X_train`, `X_test`,
    # `y_train_df`, `y_test_df`, `n_features_encoded` and `pre` are defined
    # outside this view.

    # Encode both splits with the trained model's encoder, and decode the
    # encoded test split again.
    X_train_encoded = trained_model.encoder(X_train)
    X_test_encoded = trained_model.encoder(X_test)
    X_test_decoded = trained_model.decoder(X_test_encoded)

    # Convert the encoded outputs to NumPy arrays and wrap them in
    # DataFrames (the .detach() calls suggest torch tensors -- confirm).
    X_train_encoded_df = pd.DataFrame(X_train_encoded.detach().numpy())
    X_test_encoded_df = pd.DataFrame(X_test_encoded.detach().numpy())

    # Column labels run from 1 to n_features_encoded inclusive.
    cols = list(range(1, n_features_encoded + 1))

    X_train_encoded_df.columns = cols
    X_test_encoded_df.columns = cols

    # Attach the label frames to the encoded features.
    # NOTE(review): DataFrame.join aligns on the index; this assumes
    # y_train_df / y_test_df share the encoded frames' index -- confirm.
    train_encoded_data = X_train_encoded_df.join(y_train_df)
    test_encoded_data = X_test_encoded_df.join(y_test_df)

    # Store and save both encoded tables.
    pre.set('train_encoded', train_encoded_data)
    pre.set('test_encoded', test_encoded_data)

    pre.save('train_encoded')
    pre.save('test_encoded')

    # Plot the first 30 test samples.
    plt.figure(1, figsize=(20, 10))
    for idx in range(30):
        # Original and decoded samples are reshaped to 16x16.
        image = X_test[idx].detach().numpy().reshape(16, 16)
        # The encoded sample is reshaped to a square of side
        # int(sqrt(n_features_encoded)).
        # NOTE(review): assumes n_features_encoded is a perfect square;
        # otherwise the reshape sizes would not match -- confirm.
        image2 = X_test_encoded[idx].detach().numpy().reshape(
            int(math.sqrt(n_features_encoded)),
            int(math.sqrt(n_features_encoded)))
        image3 = X_test_decoded[idx].detach().numpy().reshape(16, 16)
        # Call signature: subplot(nrows, ncols, index, **kwargs)
        # 10x9 grid; index 1 + idx * 3 advances by three slots per sample.
        # image2 and image3 are presumably drawn in the next two slots by
        # code that follows this excerpt.
        plt.subplot(10, 9, 1 + idx * 3)
        plt.imshow(image, cmap='hot', interpolation='none')
Esempio n. 3
0
    # Excerpt of a larger routine: `pre_train` and the first `kwarg` dict are
    # created outside this view.

    # Load the training file into `pre_train` under the name 'raw'.
    pre_train.load_data(filename='zip.train', name='raw', **kwarg)

    # Clean the 'raw' data with duplicate dropping enabled and the given
    # dropna settings. The result is read back as 'clean' below
    # (presumably cleanup() stores it under that name -- confirm).
    pre_train.cleanup(name='raw',
                      drop_duplicates=True,
                      dropna={
                          'axis': 1,
                          'thresh': 2
                      })

    print(pre_train.get('clean').head())

    #classes = ['0_0.0', '0_1.0', '0_2.0', '0_3.0', '0_4.0', '0_5.0', '0_6.0', '0_7.0', '0_8.0', '0_9.0']
    # Column 0 is split off as y; every other column goes into X.
    # Neither X nor y is used within this excerpt.
    X = pre_train.get('clean').drop(columns=[0])
    y = pre_train.get('clean')[0]

    # Store the cleaned table under the name 'train' and save it.
    pre_train.set(name='train', value=pre_train.get('clean'))

    pre_train.save(name='train')

    # Repeat the load / cleanup steps for the test file with a separate
    # Preprocessing instance.
    pre_test = Preprocessing('digits')
    # Extra keyword arguments forwarded to load_data (presumably passed on to
    # a pandas reader: no header row, space-separated -- confirm).
    kwarg = {'header': None, 'sep': ' '}
    pre_test.load_data(filename='zip.test', name='raw', **kwarg)

    pre_test.cleanup(name='raw',
                     drop_duplicates=True,
                     dropna={
                         'axis': 1,
                         'thresh': 2
                     })

    print(pre_test.get('clean').head())