# Example #1
def download_im_with_text(search_query,
                          start,
                          end,
                          dataset_dir='data',
                          subdir='photos'):
    """Download images using the urls in the dataframe specified by the search query.

    Parameters:
        search_query: A string giving the sentiment to load the corresponding dataframe.
        start: A start index for the loaded dataframe.
        end: An end index for the loaded dataframe. -1 corresponds to the last row.
        dataset_dir: A directory where the dataframes are stored.
        subdir: A subdirectory to store the photos.

    Returns:
        Images downloaded in the directory dataset_dir/subdir/search_query, having
        the posts ids as names.
    """
    # Load the embedding needed by preprocess_one_df. Note: a distinct name is
    # used for the embedding file so it is not shadowed by the image filename
    # below.
    emb_name = 'glove'
    text_dir = 'text_model'
    emb_dir = 'embedding_weights'
    emb_filename = 'glove.6B.50d.txt'
    if emb_name == 'word2vec':
        vocabulary, embedding = _load_embedding_weights_word2vec(
            text_dir, emb_dir, emb_filename)
    else:
        vocabulary, embedding = _load_embedding_weights_glove(
            text_dir, emb_dir, emb_filename)

    df = preprocess_one_df(vocabulary, embedding, search_query, _POST_SIZE)
    if end == -1:
        end = df.shape[0]

    links = df['photo']
    # Create the search_query folder if it doesn't exist. MakeDirs creates
    # intermediate directories, so one call covers dataset_dir/subdir too.
    photos_dir = os.path.join(dataset_dir, subdir, search_query)
    if not tf.gfile.Exists(photos_dir):
        tf.gfile.MakeDirs(photos_dir)
    for i in range(start, end):
        # NaN != NaN, so this skips rows with a missing photo url.
        if links[i] != links[i]:
            continue
        # Open url; skip rows whose url cannot be fetched (best-effort).
        try:
            f = urlopen(links[i])
        except Exception:
            continue
        try:
            image_file = io.BytesIO(f.read())
        finally:
            # Close the HTTP response instead of leaking the socket.
            f.close()
        # Convert to JPEG; skip undecodable images instead of aborting the
        # whole download loop.
        try:
            im = Image.open(image_file)
            # The filename is the index of the image in the dataframe.
            im.convert('RGB').save(
                os.path.join(photos_dir, str(i) + '.jpg'), 'JPEG')
        except Exception:
            continue
def convert_images_with_text(dataset_dir, num_valid, photos_subdir='photos', tfrecords_subdir='tfrecords'):
    """Convert the downloaded photos and their text to TFRecords files.

    Parameters:
        dataset_dir: The data directory.
        num_valid: Number of photos held out for the validation split.
        photos_subdir: The subdirectory where the photos are stored.
        tfrecords_subdir: The subdirectory to store the TFRecords files.
    """
    # Create the tfrecords_subdir if it doesn't exist
    tfrecords_dir = os.path.join(dataset_dir, tfrecords_subdir)
    if not tf.gfile.Exists(tfrecords_dir):
        tf.gfile.MakeDirs(tfrecords_dir)

    if _dataset_exists(dataset_dir, photos_subdir):
        print('Dataset files already exist. Exiting without re-creating them.')
        return

    photo_filenames, class_names = _get_filenames_and_classes(
        dataset_dir, photos_subdir, full_list='shuffled_test.csv')
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    # Divide into train and test: shuffle deterministically, then hold out
    # the first num_valid filenames for validation.
    random.seed(_RANDOM_SEED)
    random.shuffle(photo_filenames)

    training_filenames = photo_filenames[num_valid:]
    validation_filenames = photo_filenames[:num_valid]

    # Load the embedding, then one preprocessed dataframe per class.
    emb_name = 'glove'
    text_dir = 'text_model'
    emb_dir = 'embedding_weights'
    emb_filename = 'glove.6B.50d.txt'
    if emb_name == 'word2vec':
        vocabulary, embedding = _load_embedding_weights_word2vec(
            text_dir, emb_dir, emb_filename)
    else:
        vocabulary, embedding = _load_embedding_weights_glove(
            text_dir, emb_dir, emb_filename)

    df_dict = {
        emotion: preprocess_one_df(vocabulary, embedding, emotion, _POST_SIZE)
        for emotion in class_names
    }

    # First, convert the training and validation sets.
    _convert_dataset_with_text('train', training_filenames, class_names_to_ids,
                               dataset_dir, df_dict, tfrecords_subdir)
    _convert_dataset_with_text('validation', validation_filenames, class_names_to_ids,
                               dataset_dir, df_dict, tfrecords_subdir)

    # Write the train/validation split sizes so readers of the TFRecords
    # know how many examples each split contains.
    train_valid_split = {
        'train': len(photo_filenames) - num_valid,
        'validation': num_valid,
    }
    train_valid_filename = os.path.join(
        dataset_dir, photos_subdir, _TRAIN_VALID_FILENAME)
    with tf.gfile.Open(train_valid_filename, 'w') as f:
        for split_name, size in train_valid_split.items():
            f.write('%s:%d\n' % (split_name, size))

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    dataset_utils.write_label_file(
        labels_to_class_names, dataset_dir, photos_subdir)

    print('\nFinished converting the dataset!')
    'happy', 'sad', 'scared', 'angry', 'surprised', 'disgusted', 'annoyed',
    'bored', 'love', 'calm', 'amazed', 'optimistic', 'pensive', 'ashamed',
    'excited'
]  #removed interested

# Load the 50-d GloVe embedding once, then preprocess one dataframe
# per emotion.
text_dir = 'text_model'
emb_dir = 'embedding_weights'
filename = 'glove.6B.50d.txt'
vocabulary, embedding = _load_embedding_weights_glove(text_dir, emb_dir,
                                                      filename)
# Posts are padded/truncated to this many tokens.
_POST_SIZE = 50

df_dict = {
    emotion: preprocess_one_df(vocabulary, embedding, emotion, _POST_SIZE)
    for emotion in emotions
}

# Expected column schema shared by every per-emotion dataframe.
columns = [
    u'id', u'post_url', u'type', u'timestamp', u'date', u'tags', u'liked',
    u'note_count', u'photo', u'text', u'search_query', u'text_list',
    u'text_len'
]
# Concatenate all per-emotion dataframes in a single pass (repeated
# pairwise concat in a loop is quadratic). The empty seed frame uses the
# schema above — the original passed `columns=[]`, leaving `columns`
# unused, which was clearly unintended.
df_all = pd.concat(
    [pd.DataFrame(columns=columns)]
    + [df_dict[emotion] for emotion in emotions]).reset_index(drop=True)

# Get top 1000 most occurring words
# Start every vocabulary word at an int32 zero count.
vocabulary_count = dict.fromkeys(vocabulary, np.int32(0))