Esempio n. 1
0
def close_words_of_entities():
    word_dict_file_name = 'e:/dc/20ng_bydate/words_dict.txt'
    word_vec_file_name = 'e:/dc/20ng_bydate/vecs/word_vecs_joint_oml_100.bin'
    entity_dict_file_name = 'e:/dc/20ng_bydate/entity_names.txt'
    entity_vecs_file_name = 'e:/dc/20ng_bydate/vecs/entity_vecs_joint_oml_100.bin'

    word_vecs = ioutils.load_vec_list_file(word_vec_file_name)
    words = ioutils.load_words_dict_to_list(word_dict_file_name)
    entity_vecs = ioutils.load_vec_list_file(entity_vecs_file_name)
    entities = ioutils.load_entity_dict(entity_dict_file_name)
    print len(entity_vecs)
    print len(entities)

    def show_close_words(entity_idx):
        print 'entity: ', entities[entity_idx]
        entity_vec = entity_vecs[entity_idx]
        dist_list = list()
        for word_idx in xrange(len(word_vecs)):
            # dist = np.dot(entity_vec, word_vecs[word_idx])
            dist = scipy.spatial.distance.cosine(entity_vec,
                                                 word_vecs[word_idx])
            dist_list.append((dist, word_idx))
        # dist_list.sort(key=lambda tup: tup[0])
        closest_words = heapq.nsmallest(100, dist_list, key=lambda tup: tup[0])
        for dist, idx in closest_words:
            print dist, words[idx], idx

    show_close_words(25304)
Esempio n. 2
0
def close_words_of_entities():
    word_dict_file_name = 'e:/dc/20ng_bydate/words_dict.txt'
    word_vec_file_name = 'e:/dc/20ng_bydate/vecs/word_vecs_joint_oml_100.bin'
    entity_dict_file_name = 'e:/dc/20ng_bydate/entity_names.txt'
    entity_vecs_file_name = 'e:/dc/20ng_bydate/vecs/entity_vecs_joint_oml_100.bin'

    word_vecs = ioutils.load_vec_list_file(word_vec_file_name)
    words = ioutils.load_words_dict_to_list(word_dict_file_name)
    entity_vecs = ioutils.load_vec_list_file(entity_vecs_file_name)
    entities = ioutils.load_entity_dict(entity_dict_file_name)
    print len(entity_vecs)
    print len(entities)

    def show_close_words(entity_idx):
        print 'entity: ', entities[entity_idx]
        entity_vec = entity_vecs[entity_idx]
        dist_list = list()
        for word_idx in xrange(len(word_vecs)):
            # dist = np.dot(entity_vec, word_vecs[word_idx])
            dist = scipy.spatial.distance.cosine(entity_vec, word_vecs[word_idx])
            dist_list.append((dist, word_idx))
        # dist_list.sort(key=lambda tup: tup[0])
        closest_words = heapq.nsmallest(100, dist_list, key=lambda tup: tup[0])
        for dist, idx in closest_words:
            print dist, words[idx], idx

    show_close_words(25304)
Esempio n. 3
0
def test():
    words = ioutils.load_words_dict_to_list('e:/dc/20ng_bydate/words_dict.txt')
    fin = open('e:/dc/20ng_bydate/word_cnts.bin', 'rb')
    num_words = np.fromfile(fin, np.int32, 1)
    print num_words, 'words'
    cnts = np.fromfile(fin, np.int32, num_words)
    for idx, cnt in enumerate(cnts[:100]):
        print words[idx], cnt
    fin.close()
Esempio n. 4
0
def test():
    words = ioutils.load_words_dict_to_list('e:/dc/20ng_bydate/words_dict.txt')
    fin = open('e:/dc/20ng_bydate/word_cnts.bin', 'rb')
    num_words = np.fromfile(fin, np.int32, 1)
    print num_words, 'words'
    cnts = np.fromfile(fin, np.int32, num_words)
    for idx, cnt in enumerate(cnts[:100]):
        print words[idx], cnt
    fin.close()
Esempio n. 5
0
def close_words():
    words_dict_file = 'e:/dc/el/words_dict_proper.txt'
    word_vecs_file = 'e:/dc/el/vecs/word_vecs.bin'

    words = ioutils.load_words_dict_to_list(words_dict_file, False)
    print len(words)
    idx = 774318
    print words[idx]

    word_vecs = ioutils.load_vec_list_file(word_vecs_file)
    print len(word_vecs)
    close_list = close_vecs(word_vecs, word_vecs[idx])
    for idx, dist in close_list:
        print words[idx], dist
Esempio n. 6
0
def close_words():
    words_dict_file = 'e:/dc/el/words_dict_proper.txt'
    word_vecs_file = 'e:/dc/el/vecs/word_vecs.bin'

    words = ioutils.load_words_dict_to_list(words_dict_file, False)
    print len(words)
    idx = 774318
    print words[idx]

    word_vecs = ioutils.load_vec_list_file(word_vecs_file)
    print len(word_vecs)
    close_list = close_vecs(word_vecs, word_vecs[idx])
    for idx, dist in close_list:
        print words[idx], dist
Esempio n. 7
0
def close_words_of_docs():
    word_dict_file_name = 'e:/dc/20ng_bydate/words_dict.txt'
    words = ioutils.load_words_dict_to_list(word_dict_file_name)

    bow_docs_file_name = 'e:/dc/20ng_bydate/all_docs_dw_net.bin'
    word_indices_list, word_cnts_list, num_words = ioutils.load_bow_file(bow_docs_file_name)
    print num_words, 'words'

    doc_vec_file_name = 'e:/dc/20ng_bydate/vecs/doc_vec_cpp_100.bin'
    word_vec_file_name = 'e:/dc/20ng_bydate/vecs/word_vecs_cpp_100.bin'
    doc_vecs = ioutils.load_vec_list_file(doc_vec_file_name)
    word_vecs = ioutils.load_vec_list_file(word_vec_file_name)

    def show_close_words(doc_vec, word_indices):
        dist_list = list()
        for word_idx in word_indices:
            dist_list.append((np.dot(doc_vec, word_vecs[word_idx]), word_idx))
        dist_list.sort(key=lambda tup: tup[0])
        # closest_words = heapq.nlargest(10, dist_list, key=lambda tup: tup[0])
        for dist, idx in dist_list:
            print dist, words[idx]

    show_close_words(doc_vecs[0], word_indices_list[0])
Esempio n. 8
0
def close_words_of_docs():
    word_dict_file_name = 'e:/dc/20ng_bydate/words_dict.txt'
    words = ioutils.load_words_dict_to_list(word_dict_file_name)

    bow_docs_file_name = 'e:/dc/20ng_bydate/all_docs_dw_net.bin'
    word_indices_list, word_cnts_list, num_words = ioutils.load_bow_file(
        bow_docs_file_name)
    print num_words, 'words'

    doc_vec_file_name = 'e:/dc/20ng_bydate/vecs/doc_vec_cpp_100.bin'
    word_vec_file_name = 'e:/dc/20ng_bydate/vecs/word_vecs_cpp_100.bin'
    doc_vecs = ioutils.load_vec_list_file(doc_vec_file_name)
    word_vecs = ioutils.load_vec_list_file(word_vec_file_name)

    def show_close_words(doc_vec, word_indices):
        dist_list = list()
        for word_idx in word_indices:
            dist_list.append((np.dot(doc_vec, word_vecs[word_idx]), word_idx))
        dist_list.sort(key=lambda tup: tup[0])
        # closest_words = heapq.nlargest(10, dist_list, key=lambda tup: tup[0])
        for dist, idx in dist_list:
            print dist, words[idx]

    show_close_words(doc_vecs[0], word_indices_list[0])