def download_im_with_text(search_query, start, end, dataset_dir='data',
                          subdir='photos'):
    """Download images using the urls in the dataframe specified by the search query.

    Parameters:
        search_query: A string giving the sentiment to load the corresponding
            dataframe.
        start: A start index for the loaded dataframe.
        end: An end index for the loaded dataframe. -1 corresponds to the last
            row.
        dataset_dir: A directory where the dataframes are stored.
        subdir: A subdirectory to store the photos.

    Returns:
        Images downloaded in the directory dataset_dir/subdir/search_query,
        named by their row index in the loaded dataframe.
    """
    # Load data
    emb_name = 'glove'
    text_dir = 'text_model'
    emb_dir = 'embedding_weights'
    filename = 'glove.6B.50d.txt'
    if emb_name == 'word2vec':
        vocabulary, embedding = _load_embedding_weights_word2vec(
            text_dir, emb_dir, filename)
    else:
        vocabulary, embedding = _load_embedding_weights_glove(
            text_dir, emb_dir, filename)
    df = preprocess_one_df(vocabulary, embedding, search_query, _POST_SIZE)
    if end == -1:
        end = df.shape[0]
    links = df['photo']

    # Create subdir if it doesn't exist
    if not tf.gfile.Exists(os.path.join(dataset_dir, subdir)):
        tf.gfile.MakeDirs(os.path.join(dataset_dir, subdir))

    # Create search_query folder if it doesn't exist
    photos_dir = os.path.join(dataset_dir, subdir, search_query)
    if not tf.gfile.Exists(photos_dir):
        tf.gfile.MakeDirs(photos_dir)

    for i in range(start, end):
        # Check for NaNs (NaN != NaN, so missing urls are skipped)
        if links[i] == links[i]:
            # Open url and convert to JPEG image
            try:
                f = urlopen(links[i])
            except Exception:
                continue
            image_file = io.BytesIO(f.read())
            im = Image.open(image_file)
            # The filename is the index of the image in the dataframe
            filename = str(i) + '.jpg'
            im.convert('RGB').save(os.path.join(photos_dir, filename), 'JPEG')
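
# Example usage (a minimal sketch, assuming the 'happy' dataframe and the GloVe
# weights under text_model/embedding_weights/ are available locally):
#
#     download_im_with_text('happy', start=0, end=100,
#                           dataset_dir='data', subdir='photos')
#
# This saves up to 100 JPEGs under data/photos/happy/, skipping rows whose
# 'photo' url is NaN or cannot be opened.
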
def convert_images_with_text(dataset_dir, num_valid, photos_subdir='photos',
                             tfrecords_subdir='tfrecords'):
    """Converts the downloaded photos and their text to TFRecords.

    Parameters:
        dataset_dir: The data directory.
        num_valid: The number of images to hold out for the validation set.
        photos_subdir: The subdirectory where the photos are stored.
        tfrecords_subdir: The subdirectory to store the TFRecords files.
    """
    # Create the tfrecords_subdir if it doesn't exist
    if not tf.gfile.Exists(os.path.join(dataset_dir, tfrecords_subdir)):
        tf.gfile.MakeDirs(os.path.join(dataset_dir, tfrecords_subdir))

    if _dataset_exists(dataset_dir, photos_subdir):
        print('Dataset files already exist. Exiting without re-creating them.')
        return

    photo_filenames, class_names = _get_filenames_and_classes(
        dataset_dir, photos_subdir, full_list='shuffled_test.csv')
    class_names_to_ids = dict(zip(class_names, range(len(class_names))))

    # Divide into train and test:
    random.seed(_RANDOM_SEED)
    random.shuffle(photo_filenames)
    training_filenames = photo_filenames[num_valid:]
    validation_filenames = photo_filenames[:num_valid]

    # Load dataframes
    df_dict = dict()
    emb_name = 'glove'
    text_dir = 'text_model'
    emb_dir = 'embedding_weights'
    filename = 'glove.6B.50d.txt'
    if emb_name == 'word2vec':
        vocabulary, embedding = _load_embedding_weights_word2vec(
            text_dir, emb_dir, filename)
    else:
        vocabulary, embedding = _load_embedding_weights_glove(
            text_dir, emb_dir, filename)
    for emotion in class_names:
        df_dict[emotion] = preprocess_one_df(
            vocabulary, embedding, emotion, _POST_SIZE)

    # First, convert the training and validation sets.
    _convert_dataset_with_text('train', training_filenames, class_names_to_ids,
                               dataset_dir, df_dict, tfrecords_subdir)
    _convert_dataset_with_text('validation', validation_filenames,
                               class_names_to_ids, dataset_dir, df_dict,
                               tfrecords_subdir)

    # Write the train/validation split sizes
    train_valid_split = dict(
        zip(['train', 'validation'],
            [len(photo_filenames) - num_valid, num_valid]))
    train_valid_filename = os.path.join(
        dataset_dir, photos_subdir, _TRAIN_VALID_FILENAME)
    with tf.gfile.Open(train_valid_filename, 'w') as f:
        for split_name in train_valid_split:
            size = train_valid_split[split_name]
            f.write('%s:%d\n' % (split_name, size))

    # Finally, write the labels file:
    labels_to_class_names = dict(zip(range(len(class_names)), class_names))
    dataset_utils.write_label_file(
        labels_to_class_names, dataset_dir, photos_subdir)

    # _clean_up_temporary_files(dataset_dir)
    print('\nFinished converting the dataset!')
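
# Example usage (a sketch, assuming the photos were already fetched with
# download_im_with_text and num_valid images should be held out):
#
#     convert_images_with_text('data', num_valid=500,
#                              photos_subdir='photos',
#                              tfrecords_subdir='tfrecords')
#
# The filenames are shuffled with _RANDOM_SEED, the first num_valid go to the
# validation split, the rest to the training split, and the labels file plus
# the train/validation sizes are written under data/photos/.
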
    'happy', 'sad', 'scared', 'angry', 'surprised', 'disgusted', 'annoyed',
    'bored', 'love', 'calm', 'amazed', 'optimistic', 'pensive', 'ashamed',
    'excited'
]  # removed interested

df_dict = dict()
text_dir = 'text_model'
emb_dir = 'embedding_weights'
filename = 'glove.6B.50d.txt'
vocabulary, embedding = _load_embedding_weights_glove(text_dir, emb_dir,
                                                      filename)
_POST_SIZE = 50
for emotion in emotions:
    df = preprocess_one_df(vocabulary, embedding, emotion, _POST_SIZE)
    df_dict[emotion] = df

columns = [
    u'id', u'post_url', u'type', u'timestamp', u'date', u'tags', u'liked',
    u'note_count', u'photo', u'text', u'search_query', u'text_list',
    u'text_len'
]
# Start from the expected columns so the concatenated frame keeps their order
df_all = pd.DataFrame(columns=columns)
for emotion in emotions:
    df_all = pd.concat([df_all, df_dict[emotion]]).reset_index(drop=True)

# Get top 1000 most occurring words
vocabulary_count = dict(
    zip(vocabulary, np.zeros(len(vocabulary), dtype=np.int32)))
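
# Illustrative sketch (an assumption, not the original continuation): one way
# to fill vocabulary_count and keep the 1000 most frequent words would be
#
#     for text_list in df_all['text_list']:
#         for word in text_list:
#             if word in vocabulary_count:
#                 vocabulary_count[word] += 1
#     top_words = sorted(vocabulary_count, key=vocabulary_count.get,
#                        reverse=True)[:1000]
#
# where 'text_list' holds the tokenised post text produced by preprocess_one_df.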