pre.load_all_texts_from_directory(path=TRAIN_PATH_NEG, name='raw_neg') texts_list = pd.concat([pre.data['raw_pos'], pre.data['raw_neg']]) print('fit & transform training data') text_transformer = TextTransformer(num_words=n_words) train_features = text_transformer.fit_and_transform(texts_list) train_features['target'] = np.append(np.ones(len(pre.data['raw_pos'])), np.zeros(len(pre.data['raw_neg']))) #shuffle datafram in-place train_features = train_features.sample(frac=1).reset_index(drop=True) pre.set('training_data' + '_' + str(n_words), train_features) pre.save('training_data' + '_' + str(n_words)) #transform the test data TEST_PATH_POS = 'test/pos/' TEST_PATH_NEG = 'test/neg/' pre.load_all_texts_from_directory(path=TEST_PATH_POS, name='raw_pos_test') pre.load_all_texts_from_directory(path=TEST_PATH_NEG, name='raw_neg_test') print('transform test data') texts_list_test = pd.concat( [pre.data['raw_pos_test'], pre.data['raw_neg_test']]) test_features = text_transformer.transform(texts_list_test) test_features['target'] = np.append(
# Wrap the encoded tensors in DataFrames whose columns are labelled
# 1..n_features_encoded.
cols = list(range(1, n_features_encoded + 1))
X_train_encoded_df = pd.DataFrame(X_train_encoded.detach().numpy(),
                                  columns=cols)
X_test_encoded_df = pd.DataFrame(X_test_encoded.detach().numpy(),
                                 columns=cols)

# Attach the target frames (index-aligned join).
train_encoded_data = X_train_encoded_df.join(y_train_df)
test_encoded_data = X_test_encoded_df.join(y_test_df)

# Register both frames first, then persist both (same order as before).
encoded_frames = (
    ('train_encoded', train_encoded_data),
    ('test_encoded', test_encoded_data),
)
for frame_name, frame in encoded_frames:
    pre.set(frame_name, frame)
for frame_name, _ in encoded_frames:
    pre.save(frame_name)

# Plot the first 30 test samples, three panels per sample, on a 10 x 9
# grid: the input, its encoding, and a third panel for the reconstruction.
plt.figure(1, figsize=(20, 10))
# The encoding is shown as a square image; the side length does not change
# between iterations, so compute it once.
encoded_side = int(math.sqrt(n_features_encoded))
for idx in range(30):
    image = X_test[idx].detach().numpy().reshape(16, 16)
    image2 = X_test_encoded[idx].detach().numpy().reshape(
        encoded_side, encoded_side)
    image3 = X_test_decoded[idx].detach().numpy().reshape(16, 16)

    # subplot(nrows, ncols, index) uses a 1-based index; each sample
    # occupies three consecutive slots.
    first_panel = 1 + idx * 3
    plt.subplot(10, 9, first_panel)
    plt.imshow(image, cmap='hot', interpolation='none')
    plt.subplot(10, 9, first_panel + 1)
    plt.imshow(image2, cmap='winter', interpolation='none')
    # NOTE(review): the drawing call for image3 is not visible in this
    # chunk; presumably it follows this subplot selection.
    plt.subplot(10, 9, first_panel + 2)
# Clean the raw training data: drop duplicate rows and apply dropna with
# axis=1, thresh=2.
pre_train.cleanup(
    name='raw',
    drop_duplicates=True,
    dropna={
        'axis': 1,
        'thresh': 2
    },
)
print(pre_train.get('clean').head())

#classes = ['0_0.0', '0_1.0', '0_2.0', '0_3.0', '0_4.0', '0_5.0', '0_6.0', '0_7.0', '0_8.0', '0_9.0']

# Column 0 is split off as y (presumably the digit label -- confirm);
# every other column goes into X.
X = pre_train.get('clean').drop(columns=[0])
y = pre_train.get('clean')[0]

# Register the cleaned frame under 'train' and persist it.
pre_train.set(name='train', value=pre_train.get('clean'))
pre_train.save(name='train')

# Repeat the load/cleanup steps for the test file, which is read as
# space-separated values without a header row.
pre_test = Preprocessing('digits')
kwarg = {'header': None, 'sep': ' '}
pre_test.load_data(filename='zip.test', name='raw', **kwarg)
pre_test.cleanup(
    name='raw',
    drop_duplicates=True,
    dropna={
        'axis': 1,
        'thresh': 2
    },
)
print(pre_test.get('clean').head())

#classes = ['0_0.0', '0_1.0', '0_2.0', '0_3.0', '0_4.0', '0_5.0', '0_6.0', '0_7.0', '0_8.0', '0_9.0']