import os

import pandas as pd
import tables
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.preprocessing.image import img_to_array, load_img

# `cnt` (project constants) and `shutils` (shared pickle/NLP helpers) are
# project-local modules assumed to be importable from this repo.


def create_image_data():
    # Open the HDF5 file before the try block so the finally clause never
    # references an unbound variable if open_file() itself fails.
    img_arr_file = tables.open_file(cnt.IMAGE_ARRAY_PATH, mode='w')
    try:
        atom = tables.Float32Atom()
        # Extendable array: first dimension 0 so image rows can be
        # appended chunk by chunk without holding everything in memory.
        img_arr = img_arr_file.create_earray(
            img_arr_file.root, 'data', atom,
            (0, cnt.IMAGE_SIZE, cnt.IMAGE_SIZE, 3))

        chunk_size, labels = 5000, []
        for df_chunk in pd.read_csv(cnt.OUTPUT_FILE_PATH, chunksize=chunk_size):
            # Keep only rows whose image file actually exists on disk.
            df = df_chunk[df_chunk['image_path'].apply(os.path.exists)]
            print(df.shape)
            labels += list(df['age_group'])
            file_paths = list(df['image_path'])
            # Load, resize, and scale each image to [0, 1] before appending.
            img_arr.append([
                img_to_array(
                    load_img(image).convert('RGB').resize(
                        (cnt.IMAGE_SIZE, cnt.IMAGE_SIZE))) / 255.0
                for image in file_paths
            ])

        shutils.save_data_pkl(labels, cnt.LABELS_PATH)
    finally:
        img_arr_file.close()
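
def _verify_image_data():
    # Illustrative sanity check, not part of the original pipeline: after
    # create_image_data() has run, confirm that the number of stored image
    # rows matches the number of saved labels. Assumes the shutils pickle
    # helpers round-trip the label list unchanged.
    with tables.open_file(cnt.IMAGE_ARRAY_PATH, mode='r') as f:
        n_images = f.root.data.shape[0]
    labels = shutils.load_data_pkl(cnt.LABELS_PATH)
    assert n_images == len(labels), (n_images, len(labels))
    print(f"{n_images} images and {len(labels)} labels - aligned")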
def create_train_test():
    # Open before the try block for the same reason as in create_image_data().
    img_arr_file = tables.open_file(cnt.IMAGE_ARRAY_PATH, mode='r')
    try:
        img_arr = img_arr_file.root.data
        # Split row indices rather than the array itself, so the on-disk
        # HDF5 data is never materialized in memory here.
        train_indices, test_indices = train_test_split(
            list(range(img_arr.shape[0])), test_size=0.2)

        # Each label string holds one or more age groups joined by '__',
        # hence the multi-label binarization.
        encoder = MultiLabelBinarizer()
        labels = shutils.load_data_pkl(cnt.LABELS_PATH)
        labels = [x.strip().split('__') for x in labels]
        transformed_labels = encoder.fit_transform(labels)

        shutils.save_data_pkl(transformed_labels, cnt.TRANSFORMED_LABELS_PATH)
        shutils.save_data_pkl(encoder, cnt.ENCODER_PATH)

        print(len(train_indices), len(test_indices))
        shutils.save_data_pkl(train_indices, cnt.TRAIN_INDICES_PATH)
        shutils.save_data_pkl(test_indices, cnt.TEST_INDICES_PATH)
    finally:
        img_arr_file.close()
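
def _train_batches(batch_size=64):
    # Illustrative sketch, not part of the original section: stream
    # (images, labels) batches from the on-disk EArray using the saved
    # train indices, so the full image array never has to fit in memory.
    # Assumes create_image_data() and create_train_test() have already run.
    import numpy as np
    train_indices = shutils.load_data_pkl(cnt.TRAIN_INDICES_PATH)
    labels = shutils.load_data_pkl(cnt.TRANSFORMED_LABELS_PATH)
    with tables.open_file(cnt.IMAGE_ARRAY_PATH, mode='r') as f:
        img_arr = f.root.data
        for start in range(0, len(train_indices), batch_size):
            batch_idx = train_indices[start:start + batch_size]
            # Per-row integer reads keep the PyTables usage simple and
            # portable; both arrays are indexed by the same batch_idx,
            # so image/label pairing is preserved.
            images = np.stack([img_arr[i] for i in batch_idx])
            yield images, labels[batch_idx]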
def create_text_data():
    df = pd.read_csv(cnt.OUTPUT_FILE_PATH)
    # Same existence filter as create_image_data(), so text rows stay
    # aligned with the stored images.
    df = df[df['image_path'].apply(os.path.exists)]

    titles = list(df['title'])
    short_desc = list(df['short_description'])
    long_desc = list(df['long_description'])
    class_labels = [x.strip().split('__') for x in list(df['age_group'])]

    # Concatenate title and both descriptions into one document per row.
    text_corpus = [
        titles[i] + " " + short_desc[i] + " " + long_desc[i]
        for i in range(len(titles))
    ]

    # Keep the MAX_FEATURES terms with the highest mutual information
    # against the class labels, then tokenize and pad to MAX_WORDS.
    features = set(
        shutils.get_features_mi(text_corpus, class_labels, cnt.MAX_FEATURES))
    input_tensor, nn_tokenizer = shutils.get_preprocessed_data(
        text_corpus, feature_set=features, max_length=cnt.MAX_WORDS)

    shutils.save_data_pkl(input_tensor, cnt.INPUT_TENSOR_PATH)
    shutils.save_data_pkl(nn_tokenizer, cnt.TENSOR_TOKENIZER_PATH)

    # +1 leaves index 0 free for the padding token in an embedding layer.
    vocab_size = len(nn_tokenizer.word_index) + 1
    shutils.save_data_pkl(vocab_size, cnt.VOCAB_SIZE_PATH)
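
if __name__ == '__main__':
    # Illustrative driver, not in the original section: the steps are
    # order-dependent - images and labels must be written before the
    # train/test split can index them; the text step is independent but
    # relies on the same CSV and existence filter.
    create_image_data()
    create_train_test()
    create_text_data()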