def get_embedding_complex(text, word_weights):
    """Embed a multi-word text query into the topic/embedding space.

    Relies on module-level globals: embedding, model, num_topics,
    tfidf_model, tfidf_dictionary and the text2topics helpers.

    text: whitespace-separated query string.
    word_weights: per-word weight string forwarded to the embeddings
        that use them (word2vec_mean, glove).
    Returns a numpy vector of length num_topics.
    Raises SystemExit when `embedding` names no known method. (Previously
    the all-zero vector was returned silently, which made a misconfigured
    embedding indistinguishable from a valid empty query.)
    """
    words = text.split(' ')
    topics = np.zeros(num_topics)
    if embedding == 'LDA':
        # LDA embeds word-by-word; average the per-word topic vectors.
        for w in words:
            topics = topics + text2topics.LDA(w, model, num_topics)
        topics = topics / len(words)
    elif embedding == 'word2vec_mean':
        topics = text2topics.word2vec_mean(text, word_weights, model, num_topics)
    elif embedding == 'word2vec_tfidf':
        topics = text2topics.word2vec_tfidf(text, model, num_topics, tfidf_model, tfidf_dictionary)
    elif embedding == 'doc2vec':
        topics = text2topics.doc2vec(text, model, num_topics)
    elif embedding == 'glove':
        topics = text2topics.glove(text, word_weights, model, num_topics)
    elif embedding == 'glove_tfidf':
        topics = text2topics.glove_tfidf(text, model, num_topics)
    else:
        # Same error handling the query-evaluation code in this project uses
        # for an unknown embedding, instead of silently returning zeros.
        print("Select a correct embedding")
        raise SystemExit(0)
    return topics
def get_results_complex(database, text, word_weights, num_results, results_path):
    """Embed a multi-word text query, rank every database entry by
    dot-product similarity to the query embedding, and copy the top
    `num_results` result images into `results_path`.

    Relies on module-level globals: embedding, model, num_topics,
    tfidf_model, tfidf_dictionary, test_dataset and text2topics.

    database: dict mapping image id -> embedding vector.
    text: whitespace-separated query string.
    word_weights: per-word weight string for embeddings that use them.
    num_results: number of top-ranked images to copy.
    results_path: destination directory (assumed to exist).
    """
    words = text.split(' ')
    topics = np.zeros(num_topics)
    if embedding == 'LDA':
        # LDA embeds word-by-word; average the per-word topic vectors.
        for w in words:
            topics = topics + text2topics.LDA(w, model, num_topics)
        topics = topics / len(words)
    elif embedding == 'word2vec_mean':
        topics = text2topics.word2vec_mean(text, word_weights, model, num_topics)
    elif embedding == 'word2vec_tfidf':
        topics = text2topics.word2vec_tfidf(text, model, num_topics, tfidf_model, tfidf_dictionary)
        # NOTE(review): only this branch re-normalises by word count; the
        # sibling helper get_embedding_complex does not -- confirm intended.
        topics = topics / len(words)
    elif embedding == 'doc2vec':
        topics = text2topics.doc2vec(text, model, num_topics)
    elif embedding == 'glove':
        topics = text2topics.glove(text, word_weights, model, num_topics)
    elif embedding == 'glove_tfidf':
        topics = text2topics.glove_tfidf(text, model, num_topics)

    # Parenthesised single-argument print is valid in both Python 2 and 3
    # (was a Python-2-only print statement).
    # NOTE(review): both messages printed max(topics) in the original; the
    # "total" line probably meant a different vector -- behaviour kept as-is.
    print("Max text query: " + str(max(topics)))
    print("Max total query: " + str(max(topics)))

    # Higher dot product == more similar. `img_id` avoids shadowing the
    # builtin `id`.
    distances = {}
    for img_id in database:
        distances[img_id] = np.dot(database[img_id], topics)

    # Sort by similarity, best first.
    ranked = sorted(distances.items(), key=operator.itemgetter(1), reverse=True)

    # Copy the top num_results images into the results directory.
    for idx, entry in enumerate(ranked):
        if test_dataset == 'webvision':
            copyfile('../../../datasets/WebVision/test_images_256/' + entry[0], results_path + entry[0].replace('/', '_'))
        else:
            copyfile('../../../datasets/SocialMedia/img_resized_1M/cities_instagram/' + entry[0] + '.jpg', results_path + entry[0].replace('/', '_') + '.jpg')
        if idx == num_results - 1:
            break
used_words += 1 topics = topics / used_words elif embedding == 'word2vec_mean': word_weights = 0 topics = text2topics.word2vec_mean(text_query, word_weights, model, num_topics) elif embedding == 'word2vec_tfidf': topics = text2topics.word2vec_tfidf(text_query, model, num_topics, tfidf_model, tfidf_dictionary) elif embedding == 'doc2vec': topics = text2topics.doc2vec(text_query, model, num_topics) elif embedding == 'glove': topics = text2topics.glove(text_query, model, num_topics) elif embedding == 'glove_tfidf': topics = text2topics.glove_tfidf(text_query, model, num_topics) else: print("Select a correct embedding") raise SystemExit(0) distances = {} for id in database: distances[id] = np.linalg.norm(database[id]-topics) correct = 0 precisions = []
def get_results_complex(database, text, word_weights, im_id, num_results):
    """Embed `text`, rank COCO validation images by dot-product similarity,
    copy the top `num_results` hits into the rr/ folder, and report whether
    the ground-truth image `im_id` is among them.

    Relies on module-level globals: embedding, model, num_topics,
    tfidf_model, tfidf_dictionary, FC, text_NN, text2topics and
    get_NN_txt_embedding.

    database: dict mapping image file name -> embedding vector.
    im_id: integer COCO image id of the ground-truth image.
    Returns True when im_id appears in the top num_results, else False.
    """
    words = text.split(' ')
    topics = np.zeros(num_topics)
    if embedding == 'LDA':
        # LDA embeds word-by-word; average the per-word topic vectors.
        for w in words:
            topics = topics + text2topics.LDA(w, model, num_topics)
        topics = topics / len(words)
    elif embedding == 'word2vec_mean':
        topics = text2topics.word2vec_mean(text, word_weights, model, num_topics)
    elif embedding == 'word2vec_tfidf':
        topics = text2topics.word2vec_tfidf(text, model, num_topics, tfidf_model, tfidf_dictionary)
    elif embedding == 'doc2vec':
        topics = text2topics.doc2vec(text, model, num_topics)
    elif embedding == 'glove':
        topics = text2topics.glove(text, word_weights, model, num_topics)
    elif embedding == 'glove_tfidf':
        topics = text2topics.glove_tfidf(text, model, num_topics)

    if FC:
        # Min-max normalise, run the query through the text branch of the
        # trained net, then normalise again so scores match the database.
        topics = topics - min(topics)
        if max(topics) > 0:
            topics = topics / max(topics)
        topics = get_NN_txt_embedding.get_NN_txt_embedding(text_NN, topics)
        topics = topics - min(topics)
        # Guard the second division too: an all-equal net output previously
        # triggered an unconditional divide by zero here.
        if max(topics) > 0:
            topics = topics / max(topics)

    # Higher dot product == more similar. `img_id` avoids shadowing the
    # builtin `id`.
    distances = {}
    for img_id in database:
        distances[img_id] = np.dot(database[img_id], topics)

    # Sort by similarity, best first.
    ranked = sorted(distances.items(), key=operator.itemgetter(1), reverse=True)

    # Copy the top hits and collect their numeric COCO ids.
    results = []
    for idx, entry in enumerate(ranked):
        copyfile('../../../datasets/COCO/val2014/' + entry[0], '../../../datasets/COCO/rr/' + entry[0].replace('/', '_'))
        # Assumes names like 'COCO_val2014_000000391895.jpg': chars [13:-4]
        # are the zero-padded numeric id -- TODO confirm naming convention.
        results.append(int(entry[0][13:-4]))
        if idx == num_results - 1:
            break

    return im_id in results
# Embed every query in `q` and append its vector as one CSV row of `f`,
# keyed by 'word_images/<query_with_underscores>'.
for cur_q in q:
    # Uniform two-word weights; single-word embeddings that take weights
    # receive the same string.
    cur_w = '1 1'
    if len(cur_q.split(' ')) == 1:
        # Single-word query: embed directly with the configured method.
        if embedding == 'LDA':
            topics = text2topics.LDA(cur_q, model, num_topics)
        elif embedding == 'word2vec_mean':
            topics = text2topics.word2vec_mean(cur_q, cur_w, model, num_topics)
        elif embedding == 'doc2vec':
            topics = text2topics.doc2vec(cur_q, model, num_topics)
        elif embedding == 'word2vec_tfidf':
            topics = text2topics.word2vec_tfidf(cur_q, model, num_topics, tfidf_model, tfidf_dictionary)
        elif embedding == 'glove':
            topics = text2topics.glove(cur_q, cur_w, model, num_topics)
        elif embedding == 'glove_tfidf':
            topics = text2topics.glove_tfidf(cur_q, model, num_topics)
    else:
        # Multi-word query: delegate to the complex-query helper.
        topics = get_embedding_complex(cur_q, cur_w)
    # Normalize by max to compare with net
    # NOTE(review): divides by max(topics) unguarded -- an all-equal vector
    # would divide by zero; confirm embeddings can never be constant.
    topics = topics - min(topics)
    topics = topics / max(topics)
    # One CSV row: label, then the embedding components.
    f.write('word_images/' + cur_q.replace(' ', '_'))
    for t in topics:
        f.write(',' + str(t))
    f.write('\n')
# Hand-picked two-word test queries appended to the query list `q`.
q.append('snow ski')
q.append('rain umbrella')
q.append('icecream beach')
q.append('chocolate cake')
q.append('pizza wine')
q.append('woman bag')
q.append('man boat')
q.append('kid dog')
# NOTE(review): `file` shadows the Python 2 builtin -- rename when safe to
# touch callers; left as-is here.
file = open(out_file_path,'w')
for e,cur_q in enumerate(q):
    # Uniform word weights: '1' for one-word queries, '0.5 0.5' for two-word.
    if len(cur_q.split(' ')) == 1:
        topics = text2topics.glove(cur_q, '1', model, num_topics)
    if len(cur_q.split(' ')) == 2:
        topics = text2topics.glove(cur_q, '0.5 0.5', model, num_topics)
    # One CSV row per query: the query text, then its embedding components.
    file.write(cur_q)
    for t in topics:
        file.write(',' + str(t))
    # NOTE(review): no '\n' is written after the row here -- presumably the
    # newline (and file.close()) follow outside this chunk; verify output.
word_weights = 0 topics = text2topics.word2vec_mean(text_query, word_weights, model, num_topics) elif embedding == 'word2vec_tfidf': topics = text2topics.word2vec_tfidf(text_query, model, num_topics, tfidf_model, tfidf_dictionary) # topics = topics + w_topics # topics = topics / len(words) elif embedding == 'doc2vec': topics = text2topics.doc2vec(text_query, model, num_topics) elif embedding == 'glove': word_weights = 0 topics = text2topics.glove(text_query, word_weights, model, num_topics) elif embedding == 'glove_tfidf': topics = text2topics.glove_tfidf(text_query, model, num_topics) # Create empty dict for ditances distances = {} topics = topics - min(topics) topics = topics / max(topics) # Compute distances) for id in database: distances[id] = np.dot(database[id], topics) # Get elements with min distances
# Loop body for one retrieval query: `cur_q` (query text), `e` (its index)
# and `w` (per-query weight strings) come from the enclosing for loop, which
# is outside this chunk.
print(cur_q)
cur_w = w[e]
# Results directory encodes dataset, data split, query and weights, e.g.
# .../rr/<data>/<query_words>__<weights>/.
if test_dataset == 'webvision':
    results_path = "../../../datasets/WebVision/rr/" + data + "/" + cur_q.replace(' ', '_') + '__' + cur_w.replace(' ', '_') + '/'
else:
    results_path = "../../../datasets/SocialMedia/retrieval_results/" + data + "/" + cur_q.replace(' ', '_') + '__' + cur_w.replace(' ', '_') + '/'
if not os.path.exists(results_path):
    print("Creating dir: " + results_path)
    os.makedirs(results_path)
if len(cur_q.split(' ')) == 1:
    # Single-word query: embed with the configured method, then retrieve.
    if embedding == 'LDA':
        topics = text2topics.LDA(cur_q, model, num_topics)
    elif embedding == 'word2vec_mean':
        topics = text2topics.word2vec_mean(cur_q, cur_w, model, num_topics)
    elif embedding == 'doc2vec':
        topics = text2topics.doc2vec(cur_q, model, num_topics)
    elif embedding == 'word2vec_tfidf':
        topics = text2topics.word2vec_tfidf(cur_q, model, num_topics, tfidf_model, tfidf_dictionary)
    elif embedding == 'glove':
        topics = text2topics.glove(cur_q, cur_w, model, num_topics)
    elif embedding == 'glove_tfidf':
        topics = text2topics.glove_tfidf(cur_q, model, num_topics)
    get_results(database, topics, num_results,results_path)
else:
    # Multi-word query: the complex helper embeds and retrieves in one call.
    get_results_complex(database, cur_q, cur_w, num_results, results_path)