コード例 #1
0
def get_embedding_complex(text, word_weights):
    """Embed a multi-word query with the globally selected ``embedding``.

    Args:
        text: Query string. It is split on single spaces for the 'LDA'
            branch; the other branches pass it through unchanged.
        word_weights: Forwarded to the 'word2vec_mean' and 'glove'
            embedders; unused by the other branches.

    Returns:
        The vector produced by the matching ``text2topics`` function. For
        'LDA' this is the sum of the per-token vectors divided by the
        number of tokens. When ``embedding`` matches none of the handled
        names, a zero vector of length ``num_topics`` is returned.
    """
    tokens = text.split(' ')
    embedded = np.zeros(num_topics)

    if embedding == 'LDA':
        # LDA is applied token by token and then averaged.
        for token in tokens:
            embedded = embedded + text2topics.LDA(token, model, num_topics)
        return embedded / len(tokens)

    if embedding == 'word2vec_mean':
        return text2topics.word2vec_mean(text, word_weights, model,
                                         num_topics)

    if embedding == 'word2vec_tfidf':
        return text2topics.word2vec_tfidf(text, model, num_topics,
                                          tfidf_model, tfidf_dictionary)

    if embedding == 'doc2vec':
        return text2topics.doc2vec(text, model, num_topics)

    if embedding == 'glove':
        return text2topics.glove(text, word_weights, model, num_topics)

    if embedding == 'glove_tfidf':
        return text2topics.glove_tfidf(text, model, num_topics)

    # Unknown embedding name: fall back to the all-zero vector.
    return embedded
コード例 #2
0
def get_results_complex(database, text, word_weights, num_results, results_path):
    """Retrieve the images that best match a multi-word text query.

    The query is embedded with the globally selected ``embedding``, every
    entry of ``database`` is scored by its dot product with the query
    vector, and the ``num_results`` highest-scoring images are copied into
    ``results_path``.

    Args:
        database: Mapping from image key to its embedding vector.
        text: Query string; split on single spaces for per-word handling.
        word_weights: Forwarded to the 'word2vec_mean' and 'glove'
            embedders.
        num_results: Maximum number of images to copy. Values <= 0 copy
            nothing.
        results_path: Destination prefix; the image key (with '/' replaced
            by '_') is appended to it directly.

    Returns:
        None. The function's effect is the files it copies and the two
        diagnostic lines it prints.
    """
    words = text.split(' ')
    topics = np.zeros(num_topics)

    if embedding == 'LDA':
        # LDA is applied word by word and then averaged.
        for w in words:
            w_topics = text2topics.LDA(w, model, num_topics)
            topics = topics + w_topics
        topics = topics / len(words)

    elif embedding == 'word2vec_mean':
        topics = text2topics.word2vec_mean(text, word_weights, model, num_topics)

    elif embedding == 'word2vec_tfidf':
        topics = text2topics.word2vec_tfidf(text, model, num_topics, tfidf_model, tfidf_dictionary)
        # NOTE(review): this extra division by the word count is kept as-is;
        # confirm it is intended, since text2topics.word2vec_tfidf may
        # already return an averaged vector.
        topics = topics / len(words)

    elif embedding == 'doc2vec':
        topics = text2topics.doc2vec(text, model, num_topics)

    elif embedding == 'glove':
        topics = text2topics.glove(text, word_weights, model, num_topics)

    elif embedding == 'glove_tfidf':
        topics = text2topics.glove_tfidf(text, model, num_topics)

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("Max text query: " + str(max(topics)))
    print("Max total query: " + str(max(topics)))

    # Score every database entry by dot-product similarity with the query.
    scores = {}
    for image_key in database:
        scores[image_key] = np.dot(database[image_key], topics)

    # Highest similarity first.
    ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)

    # Copy the top-ranked images into the results location.
    for rank, (image_key, _score) in enumerate(ranked):
        # Checking before the copy makes num_results <= 0 copy nothing
        # instead of falling through and copying the whole database.
        if rank >= num_results:
            break
        if test_dataset == 'webvision':
            copyfile('../../../datasets/WebVision/test_images_256/' + image_key,
                     results_path + image_key.replace('/', '_'))
        else:
            copyfile('../../../datasets/SocialMedia/img_resized_1M/cities_instagram/' + image_key + '.jpg',
                     results_path + image_key.replace('/', '_') + '.jpg')
コード例 #3
0
    # Build the text query by appending every entry of `topics` to
    # `text_query`, renaming 'plant_life' to 'plant'.
    # NOTE(review): the outer loop variable `l` is never used, so the inner
    # loop runs once per element of `topics` and each entry is appended
    # repeatedly -- confirm whether the nesting is intentional.
    for l in topics:
        for cat in topics:
            if cat == 'plant_life':
                cat = 'plant'
            text_query = text_query + ' ' + cat

    # text_query = 'car'
    words = text_query.split(' ')
    # From here on `topics` is rebound to the query embedding vector.
    topics = np.zeros(num_topics)


    if embedding == 'LDA':
        # Average the LDA vectors of the words that actually contribute.
        used_words = 0
        for w in words:
            # Skip the empty tokens produced by leading/double spaces.
            if w == '' or w == []: continue
            w_topics = text2topics.LDA(w, model, num_topics)
            # Only words whose vector has a positive sum are counted.
            if sum(w_topics) > 0:
                topics = topics + w_topics
                used_words += 1
        # NOTE(review): if no word contributed, used_words is 0 here and
        # this divides by zero -- confirm callers guarantee a usable word.
        topics = topics / used_words

    elif embedding == 'word2vec_mean':
        # A literal 0 is passed as the word weights for this embedder.
        word_weights = 0
        topics = text2topics.word2vec_mean(text_query, word_weights, model, num_topics)

    elif embedding == 'word2vec_tfidf':
        topics = text2topics.word2vec_tfidf(text_query, model, num_topics, tfidf_model, tfidf_dictionary)

    elif embedding == 'doc2vec':
        topics = text2topics.doc2vec(text_query, model, num_topics)
コード例 #4
0
def get_results_complex(database, text, word_weights, im_id, num_results):
    """Check whether ``im_id`` is among the top results for a text query.

    The query is embedded with the globally selected ``embedding``,
    optionally passed through the text network when ``FC`` is set, and
    min-max normalized. Every entry of ``database`` is scored by its dot
    product with the query vector, and the ``num_results`` best images are
    copied to the COCO results directory.

    Args:
        database: Mapping from image file name to its embedding vector.
        text: Query string; split on single spaces for per-word handling.
        word_weights: Forwarded to the 'word2vec_mean' and 'glove'
            embedders.
        im_id: Integer image id to look for among the retrieved results.
        num_results: Maximum number of images to retrieve. Values <= 0
            retrieve nothing.

    Returns:
        True if ``im_id`` is one of the retrieved image ids, else False.
    """
    words = text.split(' ')
    topics = np.zeros(num_topics)

    if embedding == 'LDA':
        # LDA is applied word by word and then averaged.
        for w in words:
            w_topics = text2topics.LDA(w, model, num_topics)
            topics = topics + w_topics
        topics = topics / len(words)

    elif embedding == 'word2vec_mean':
        topics = text2topics.word2vec_mean(text, word_weights, model,
                                           num_topics)

    elif embedding == 'word2vec_tfidf':
        topics = text2topics.word2vec_tfidf(text, model, num_topics,
                                            tfidf_model, tfidf_dictionary)

    elif embedding == 'doc2vec':
        topics = text2topics.doc2vec(text, model, num_topics)

    elif embedding == 'glove':
        topics = text2topics.glove(text, word_weights, model, num_topics)

    elif embedding == 'glove_tfidf':
        topics = text2topics.glove_tfidf(text, model, num_topics)

    if FC:
        # Normalize to [0, 1] before feeding the text network; a constant
        # vector (max == 0 after shifting) is left untouched and skipped.
        topics = topics - min(topics)
        if max(topics) > 0:
            topics = topics / max(topics)
            topics = get_NN_txt_embedding.get_NN_txt_embedding(text_NN, topics)

    # Final min-max normalization. The peak is checked first so that a
    # constant vector stays all-zero instead of becoming NaN through 0/0,
    # matching the guard used in the FC branch above.
    topics = topics - min(topics)
    peak = max(topics)
    if peak > 0:
        topics = topics / peak

    # Score every database entry by dot-product similarity with the query.
    scores = {}
    for image_name in database:
        scores[image_name] = np.dot(database[image_name], topics)

    # Highest similarity first.
    ranked = sorted(scores.items(),
                    key=operator.itemgetter(1),
                    reverse=True)

    # Copy the top-ranked images and collect their numeric ids.
    results = []
    for rank, (image_name, _score) in enumerate(ranked):
        # Checking before the copy makes num_results <= 0 retrieve nothing
        # instead of falling through and copying the whole database.
        if rank >= num_results:
            break
        copyfile('../../../datasets/COCO/val2014/' + image_name,
                 '../../../datasets/COCO/rr/' + image_name.replace('/', '_'))
        # Presumably drops a 13-character file-name prefix and a 4-character
        # extension to leave the numeric image id -- verify against the
        # dataset's naming scheme.
        results.append(int(image_name[13:-4]))

    return im_id in results
コード例 #5
0
#
# q.append('icecream beach')
# q.append('chocolate cake')
# q.append('pizza wine')
#
# q.append('woman bag')
# q.append('man boat')
# q.append('kid dog')

for cur_q in q:
    # Every query uses the same fixed weight string.
    cur_w = '1 1'

    if ' ' in cur_q:
        # Multi-word query: delegate to the per-word aware embedding helper.
        topics = get_embedding_complex(cur_q, cur_w)

    # Single-word query: call the selected embedder directly. An
    # unrecognised embedding name leaves `topics` unchanged.
    elif embedding == 'LDA':
        topics = text2topics.LDA(cur_q, model, num_topics)

    elif embedding == 'word2vec_mean':
        topics = text2topics.word2vec_mean(cur_q, cur_w, model, num_topics)

    elif embedding == 'doc2vec':
        topics = text2topics.doc2vec(cur_q, model, num_topics)

    elif embedding == 'word2vec_tfidf':
        topics = text2topics.word2vec_tfidf(cur_q, model, num_topics,
                                            tfidf_model, tfidf_dictionary)

    elif embedding == 'glove':
        topics = text2topics.glove(cur_q, cur_w, model, num_topics)

    elif embedding == 'glove_tfidf':
        topics = text2topics.glove_tfidf(cur_q, model, num_topics)
コード例 #6
0


for e, cur_q in enumerate(q):
    print(cur_q)
    cur_w = w[e]

    # Results go to a per-query directory named "<query>__<weights>" under
    # the dataset-specific root.
    if test_dataset == 'webvision':
        results_root = "../../../datasets/WebVision/rr/"
    else:
        results_root = "../../../datasets/SocialMedia/retrieval_results/"
    query_dir = cur_q.replace(' ', '_') + '__' + cur_w.replace(' ', '_')
    results_path = results_root + data + "/" + query_dir + '/'

    if not os.path.exists(results_path):
        print("Creating dir: " + results_path)
        os.makedirs(results_path)

    if ' ' in cur_q:
        # Multi-word query: embedding and retrieval are handled together.
        get_results_complex(database, cur_q, cur_w, num_results, results_path)

    else:
        # Single-word query: embed with the selected model, then retrieve.
        # An unrecognised embedding name leaves `topics` unchanged.
        if embedding == 'LDA':
            topics = text2topics.LDA(cur_q, model, num_topics)
        elif embedding == 'word2vec_mean':
            topics = text2topics.word2vec_mean(cur_q, cur_w, model, num_topics)
        elif embedding == 'doc2vec':
            topics = text2topics.doc2vec(cur_q, model, num_topics)
        elif embedding == 'word2vec_tfidf':
            topics = text2topics.word2vec_tfidf(cur_q, model, num_topics,
                                                tfidf_model, tfidf_dictionary)
        elif embedding == 'glove':
            topics = text2topics.glove(cur_q, cur_w, model, num_topics)
        elif embedding == 'glove_tfidf':
            topics = text2topics.glove_tfidf(cur_q, model, num_topics)

        get_results(database, topics, num_results, results_path)