Example #1
0
def create_image_data():
    """Stream images listed in the output CSV into an HDF5 EArray on disk.

    Reads the CSV at ``cnt.OUTPUT_FILE_PATH`` in chunks, keeps only rows
    whose ``image_path`` exists on disk, loads each image as RGB, resizes
    it to ``cnt.IMAGE_SIZE`` square, scales pixels to [0, 1], and appends
    the batch to an extendable array at ``cnt.IMAGE_ARRAY_PATH``. The
    matching ``age_group`` labels are pickled to ``cnt.LABELS_PATH``.
    """
    # Open the file *before* the try block: if open_file() itself raises,
    # the original code's finally clause would hit a NameError on the
    # unbound img_arr_file, masking the real exception.
    img_arr_file = tables.open_file(cnt.IMAGE_ARRAY_PATH, mode='w')
    try:
        atom = tables.Float32Atom()
        # First dimension 0 => extendable axis for appended image batches.
        img_arr = img_arr_file.create_earray(
            img_arr_file.root, 'data', atom,
            (0, cnt.IMAGE_SIZE, cnt.IMAGE_SIZE, 3))

        chunk_size, labels = 5000, []
        for df_chunk in pd.read_csv(cnt.OUTPUT_FILE_PATH,
                                    chunksize=chunk_size):
            # Boolean-mask rows whose image file actually exists; the
            # original wrapped this in list(...lambda...) redundantly.
            df = df_chunk[df_chunk['image_path'].apply(os.path.exists)]
            print(df.shape)
            labels += list(df['age_group'])
            file_paths = list(df['image_path'])
            # Normalize to [0, 1] float32 RGB tensors of fixed size.
            img_arr.append([
                img_to_array(
                    load_img(image).convert('RGB').resize(
                        (cnt.IMAGE_SIZE, cnt.IMAGE_SIZE))) / 255.0
                for image in file_paths
            ])

        shutils.save_data_pkl(labels, cnt.LABELS_PATH)
    finally:
        img_arr_file.close()
Example #2
0
def create_train_test():
    """Split the stored image array into train/test index sets and
    binarize the multi-label targets.

    Loads the EArray from ``cnt.IMAGE_ARRAY_PATH`` (read-only) to learn
    the sample count, splits indices 80/20, fits a MultiLabelBinarizer
    on the ``'__'``-delimited labels pickled at ``cnt.LABELS_PATH``, and
    pickles the transformed labels, the fitted encoder, and both index
    lists to their respective ``cnt.*_PATH`` locations.
    """
    # Open before the try block so the finally-close only runs on a
    # successfully opened handle (the original risked a NameError in
    # finally if open_file() raised).
    img_arr_file = tables.open_file(cnt.IMAGE_ARRAY_PATH, mode='r')
    try:
        img_arr = img_arr_file.root.data

        # 80/20 split over row indices; indices (not data) are persisted
        # so the HDF5 array can be sliced lazily later.
        train_indices, test_indices = train_test_split(range(img_arr.shape[0]),
                                                       test_size=0.2)

        encoder = MultiLabelBinarizer()

        labels = shutils.load_data_pkl(cnt.LABELS_PATH)
        # Each label string packs multiple classes separated by '__'.
        labels = [x.strip().split('__') for x in labels]

        transformed_labels = encoder.fit_transform(labels)

        shutils.save_data_pkl(transformed_labels, cnt.TRANSFORMED_LABELS_PATH)
        shutils.save_data_pkl(encoder, cnt.ENCODER_PATH)

        print(len(train_indices), len(test_indices))

        shutils.save_data_pkl(train_indices, cnt.TRAIN_INDICES_PATH)
        shutils.save_data_pkl(test_indices, cnt.TEST_INDICES_PATH)
    finally:
        img_arr_file.close()
Example #3
0
def create_text_data():
    """Build and persist the text-side model inputs.

    Filters the output CSV to rows whose image file exists, concatenates
    title + short + long description per row into a corpus, selects the
    top mutual-information features against the ``'__'``-split
    ``age_group`` labels, tokenizes/pads the corpus, and pickles the
    input tensor, tokenizer, and vocabulary size to their ``cnt.*_PATH``
    locations.
    """
    frame = pd.read_csv(cnt.OUTPUT_FILE_PATH)
    # Keep only rows whose referenced image file is present on disk.
    frame = frame[list(frame['image_path'].apply(lambda p: os.path.exists(p)))]

    # One space-joined document per row: title + short + long description.
    text_corpus = [
        " ".join(parts)
        for parts in zip(frame['title'],
                         frame['short_description'],
                         frame['long_description'])
    ]

    # Each age_group cell packs multiple classes separated by '__'.
    class_labels = [label.strip().split('__')
                    for label in list(frame['age_group'])]

    # Restrict the vocabulary to the top-MI features w.r.t. the labels.
    feature_set = set(
        shutils.get_features_mi(text_corpus, class_labels, cnt.MAX_FEATURES))
    input_tensor, nn_tokenizer = shutils.get_preprocessed_data(
        text_corpus, feature_set=feature_set, max_length=cnt.MAX_WORDS)

    shutils.save_data_pkl(input_tensor, cnt.INPUT_TENSOR_PATH)
    shutils.save_data_pkl(nn_tokenizer, cnt.TENSOR_TOKENIZER_PATH)

    # +1 reserves index 0 (padding) in the embedding vocabulary.
    vocab_size = len(nn_tokenizer.word_index) + 1
    shutils.save_data_pkl(vocab_size, cnt.VOCAB_SIZE_PATH)