from utils import load_set, load_clean_descriptions, load_photo_features, create_tokenizer, create_sequences, max_length, define_model
from keras.callbacks import ModelCheckpoint

filename = 'dataset/Flickr_8k.trainImages.txt'
train = load_set(filename)
print('Dataset: %d' % len(train))
train_descriptions = load_clean_descriptions('descriptions.txt', train)
print('Descriptions: train=%d' % len(train_descriptions))
# photo features
train_features = load_photo_features('features.pkl', train)
print('Photos: train=%d' % len(train_features))
# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

max_length = max_length(train_descriptions)
print('Description Length: %d' % max_length)
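
# A minimal sketch (an assumption, not the utils implementation) of what the
# two helpers above are expected to do: fit a Keras Tokenizer on every
# training caption and measure the longest caption in words.
def create_tokenizer_sketch(descriptions):
    from keras.preprocessing.text import Tokenizer
    lines = [d for desc_list in descriptions.values() for d in desc_list]
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

def max_length_sketch(descriptions):
    lines = [d for desc_list in descriptions.values() for d in desc_list]
    return max(len(d.split()) for d in lines)
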
# prepare sequences
X1train, X2train, ytrain = create_sequences(tokenizer, max_length,
                                            train_descriptions, train_features)
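
# A hedged sketch (the name and the extra vocab_size argument are assumptions)
# of how a create_sequences helper like the one imported from utils typically
# builds its samples: each encoded caption is split word by word, the partial
# caption is padded to max_length, and the next word becomes a one-hot target.
from numpy import array
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

def create_sequences_sketch(tokenizer, max_length, descriptions, photos, vocab_size):
    X1, X2, y = list(), list(), list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            seq = tokenizer.texts_to_sequences([desc])[0]
            for i in range(1, len(seq)):
                in_seq, out_word = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                out_word = to_categorical([out_word], num_classes=vocab_size)[0]
                X1.append(photos[key][0])  # assumes each feature vector is stored as a 1 x N array
                X2.append(in_seq)
                y.append(out_word)
    return array(X1), array(X2), array(y)
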

# dev dataset

# load test set
filename = 'dataset/Flickr_8k.devImages.txt'
test = load_set(filename)
print('Dataset: %d' % len(test))
test_descriptions = load_clean_descriptions('descriptions.txt', test)
print('Descriptions: test=%d' % len(test_descriptions))
test_features = load_photo_features('features.pkl', test)
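
# A hedged sketch of how this truncated listing presumably continues, based on
# the define_model and ModelCheckpoint imports at the top: prepare the dev
# sequences, build the model, checkpoint on validation loss, then fit. The
# define_model signature, checkpoint filename and epoch count are assumptions.
print('Photos: test=%d' % len(test_features))
X1test, X2test, ytest = create_sequences(tokenizer, max_length,
                                         test_descriptions, test_features)

model = define_model(vocab_size, max_length)
filepath = 'model-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1,
                             save_best_only=True, mode='min')
model.fit([X1train, X2train], ytrain, epochs=20, verbose=2,
          callbacks=[checkpoint],
          validation_data=([X1test, X2test], ytest))
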
Example 2
import os
import pickle

from tqdm import tqdm

from utils import get_base_dir, load_clean_descriptions, load_image_names
from utils import max_length, data_generator, build_model

base_dir = get_base_dir()

train_features_path = os.path.join(
    base_dir, "data/captioning/pickle/encoded_train_images.pkl")
train_features = pickle.load(open(train_features_path, "rb"))

print('Photos: train=%d' % len(train_features))

text_file_base_dir = os.path.join(base_dir, "data/captioning/TextFiles/")
filename = os.path.join(text_file_base_dir, 'Flickr_8k.trainImages.txt')
train_image_names = load_image_names(filename)
# descriptions
train_descriptions = load_clean_descriptions(
    'data/captioning/descriptions.txt', train_image_names)
print('Descriptions: train=%d' % len(train_descriptions))

# Create a list of all the training captions
all_train_captions = []
for key, val in train_descriptions.items():
    for cap in val:
        all_train_captions.append(cap)
print(len(all_train_captions))

# Consider only words which occur at least 10 times in the corpus
word_count_threshold = 10
word_counts = {}
nsents = 0
for sent in tqdm(all_train_captions):
    nsents += 1
    for w in sent.split(' '):
        word_counts[w] = word_counts.get(w, 0) + 1
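
# A hedged sketch of how this example presumably continues, given the
# build_model and data_generator imports above: keep only words that clear the
# count threshold, then train from a generator so the (photo, partial caption)
# pairs never all sit in memory at once. The helper signatures and the epoch
# count below are assumptions, not taken from the project.
vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
print('preprocessed words %d -> %d' % (len(word_counts), len(vocab)))

max_len = max_length(train_descriptions)
model = build_model(len(vocab) + 1, max_len)
generator = data_generator(train_descriptions, train_features, max_len)
model.fit_generator(generator, epochs=20,
                    steps_per_epoch=len(train_descriptions), verbose=1)
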
Example 3
from pickle import dump

from constants import TRAIN_SET, DESCRIPTION, MAX_LENGTH, TOKEN
# load_set, load_clean_descriptions, create_tokenizer and max_length are
# assumed to come from the project's utils module, as in the examples above
from utils import load_set, load_clean_descriptions, create_tokenizer, max_length


# save max_length to file
def save_max_length(max_length, filename):
    with open(filename, 'w') as file:
        file.write(max_length)


# load training dataset (6K)
train = load_set(TRAIN_SET)
print('Dataset: %d' % len(train))

# descriptions
train_descriptions = load_clean_descriptions(DESCRIPTION, train)
print('Descriptions: train=%d' % len(train_descriptions))

# prepare tokenizer
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# determine the maximum sequence length
max_length = max_length(train_descriptions)
print('Description Length: %d' % max_length)

# save max_length
save_max_length(str(max_length), MAX_LENGTH)

# save the tokenizer so the same word indices can be reused at test time
dump(tokenizer, open(TOKEN, 'wb'))
Example 4
File: evaluate.py Project: shin7/ML
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
    # generate a caption for each photo (generate_desc is assumed to be the
    # project's decoding helper), collect the reference captions, then report
    # corpus BLEU at four n-gram weightings
    actual, predicted = list(), list()
    for key, desc_list in descriptions.items():
        yhat = generate_desc(model, tokenizer, photos[key], max_length)
        actual.append([d.split() for d in desc_list])
        predicted.append(yhat.split())
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))
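
# A minimal sketch (an assumption, not the project's code) of the kind of
# caption-generation helper evaluate_model relies on: greedy decoding that
# starts from a 'startseq' token and appends the most probable next word
# until 'endseq' or max_length is reached. The token names are assumptions.
from numpy import argmax
from keras.preprocessing.sequence import pad_sequences

def generate_desc_sketch(model, tokenizer, photo, max_length):
    index_word = {i: w for w, i in tokenizer.word_index.items()}
    in_text = 'startseq'
    for _ in range(max_length):
        seq = tokenizer.texts_to_sequences([in_text])[0]
        seq = pad_sequences([seq], maxlen=max_length)
        yhat = model.predict([photo, seq], verbose=0)
        word = index_word.get(argmax(yhat))
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    return in_text
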


# load the tokenizer
tokenizer = load(open(TOKEN, 'rb'))
# pre-define the max sequence length (from training)
max_length = int(load_doc(MAX_LENGTH))
print('Description Length: %d' % max_length)

# prepare test set

# load test set
test = load_set(TEST_SET)
print('Dataset: %d' % len(test))
# descriptions
test_descriptions = load_clean_descriptions(DESCRIPTION, test)
print('Descriptions: test=%d' % len(test_descriptions))
# photo features
test_features = load_photo_features(FEATURE, test)
print('Photos: test=%d' % len(test_features))

# load the model
model = load_model(MODEL_FILE)
# evaluate model
evaluate_model(model, test_descriptions, test_features, tokenizer, max_length)