import numpy as np
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer

# loadWord2Vec and clean_str are project-local helpers (the utils module used throughout these examples)
from utils import loadWord2Vec, clean_str


def load_or_build_embedding(ds, vocab):
    # One-hot embedding
    # embd = eye(len(vocab))
    # return embd

    # Read Word Vectors
    # word_vector_file = 'data/glove.6B/glove.6B.300d.txt'
    # word_vector_file = 'data/corpus/' + dataset + '_word_vectors.txt'
    #_, embd, word_vector_map = loadWord2Vec(word_vector_file)
    # word_embeddings_dim = len(embd[0])
    try:
        word_vector_file = 'data/corpus/' + ds + '_word_vectors.txt'
        word_vec_vocab, embd, word_vec_id_map = loadWord2Vec(word_vector_file)
        word_embeddings_dim = len(embd[0])

        # word embedding matrix
        wm = np.matrix(embd)
        return word_vec_vocab, wm, word_vec_id_map
    except Exception:
        # No pre-built vectors for this dataset yet, so build them from WordNet definitions
        print('Building embedding...')
        definitions = []
        for word in vocab:
            word = word.strip()
            synsets = wn.synsets(clean_str(word))
            word_defs = []
            for synset in synsets:
                syn_def = synset.definition()
                word_defs.append(syn_def)
            word_des = ' '.join(word_defs)
            if word_des == '':
                word_des = '<PAD>'
            definitions.append(word_des)

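        # Represent each word by the TF-IDF vector of its concatenated WordNet definitions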
        tfidf_vec = TfidfVectorizer(max_features=1000)
        tfidf_matrix = tfidf_vec.fit_transform(definitions)
        tfidf_matrix_array = tfidf_matrix.toarray()

        word_vectors = []

        for i in range(len(vocab)):
            word = vocab[i]
            vector = tfidf_matrix_array[i]
            str_vector = []
            for j in range(len(vector)):
                str_vector.append(str(vector[j]))
            temp = ' '.join(str_vector)
            word_vector = word + ' ' + temp
            word_vectors.append(word_vector)

        string = '\n'.join(word_vectors)
        f = open('data/corpus/' + ds + '_word_vectors.txt', 'w')
        f.write(string)
        f.close()

        return load_or_build_embedding(ds, vocab)
Example #2
import sys, logging
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from utils import loadWord2Vec

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
logger = logging.getLogger('food2vec')

filename = 'food2vec.model.txt'

food2vec = loadWord2Vec(filename)

vectors = food2vec.syn0
clustersNo = 10

logger.info("Preparing clusters...")

kmeans = KMeans(n_clusters=clustersNo)
idx = kmeans.fit_predict(vectors)

logger.info("Clusters are ready!")

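# Map each vocabulary word to the id of the k-means cluster it was assigned to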
wordMap = dict(zip(food2vec.index2word, idx))

for cluster in range(clustersNo):
    print("\nCluster %d" % cluster)
    words = [word for word, assigned in wordMap.items() if assigned == cluster]
    print(words)
Example #3
from math import log
import sys

from utils import loadWord2Vec

if len(sys.argv) != 2:
    sys.exit("Use: python build_graph.py <dataset>")

datasets = ['i2b2', 'mimic']
# build corpus
dataset = sys.argv[1]

# Read Word Vectors
model_dir = 'PATH/TO/WORD2VEC/MODEL/'  # directory containing the pretrained vectors (note the trailing slash)
word_vector_file = model_dir + 'glove.6B.100d.txt'

# word_vector_file = 'data/corpus/' + dataset + '_word_vectors.txt'
_, embd, word_vector_map = loadWord2Vec(word_vector_file)
word_embeddings_dim = len(embd[0])

# word_embeddings_dim = 30
# word_vector_map = {}

# shuffling
doc_name_list = []
doc_train_list = []
doc_test_list = []

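# Each line of data/<dataset>.txt is expected to hold the document name, its train/test split, and its label, separated by tabs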
f = open('data/' + dataset + '.txt', 'r')
lines = f.readlines()
for line in lines:
    doc_name_list.append(line.strip())
    temp = line.split("\t")
Example #4
import random
import pickle as pkl
from math import log

import numpy as np
import scipy.sparse as sp
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer

# loadWord2Vec and clean_str are project-local helpers
from utils import loadWord2Vec, clean_str


def build_graph():
    dataset = 'own_wms_all'
    word_embeddings_dim = 300
    word_vector_map = {}

    # shuffling
    doc_name_list = []
    doc_train_list = []
    doc_test_list = []

    f = open('data/' + dataset + '.txt', 'r')
    lines = f.readlines()
    for line in lines:
        doc_name_list.append(line.strip())
        temp = line.split("\t")
        if temp[1].find('test') != -1:
            doc_test_list.append(line.strip())
        elif temp[1].find('train') != -1:
            doc_train_list.append(line.strip())
    f.close()
    # print(doc_train_list)
    # print(doc_test_list)

    doc_content_list = []
    f = open('data/corpus/' + dataset + '.clean.txt', 'r')
    lines = f.readlines()
    for line in lines:
        doc_content_list.append(line.strip())
    f.close()
    # print(doc_content_list)

    train_ids = []
    for train_name in doc_train_list:
        train_id = doc_name_list.index(train_name)
        train_ids.append(train_id)
    print(train_ids)
    random.shuffle(train_ids)

    # partial labeled data
    # train_ids = train_ids[:int(0.2 * len(train_ids))]

    train_ids_str = '\n'.join(str(index) for index in train_ids)
    f = open('data/' + dataset + '.train.index', 'w')
    f.write(train_ids_str)
    f.close()

    test_ids = []
    for test_name in doc_test_list:
        test_id = doc_name_list.index(test_name)
        test_ids.append(test_id)
    print(test_ids)
    random.shuffle(test_ids)

    test_ids_str = '\n'.join(str(index) for index in test_ids)
    f = open('data/' + dataset + '.test.index', 'w')
    f.write(test_ids_str)
    f.close()

    ids = train_ids + test_ids
    #print(ids)
    print(len(ids))

    shuffle_doc_name_list = []
    shuffle_doc_words_list = []
    for id in ids:
        shuffle_doc_name_list.append(doc_name_list[int(id)])
        shuffle_doc_words_list.append(doc_content_list[int(id)])
    shuffle_doc_name_str = '\n'.join(shuffle_doc_name_list)
    shuffle_doc_words_str = '\n'.join(shuffle_doc_words_list)

    f = open('data/' + dataset + '_shuffle.txt', 'w')
    f.write(shuffle_doc_name_str)
    f.close()

    f = open('data/corpus/' + dataset + '_shuffle.txt', 'w')
    f.write(shuffle_doc_words_str)
    f.close()

    # build vocab
    word_freq = {}
    word_set = set()
    for doc_words in shuffle_doc_words_list:
        words = doc_words.split()
        for word in words:
            word_set.add(word)
            if word in word_freq:
                word_freq[word] += 1
            else:
                word_freq[word] = 1

    vocab = list(word_set)
    vocab_size = len(vocab)

    word_doc_list = {}

    for i in range(len(shuffle_doc_words_list)):
        doc_words = shuffle_doc_words_list[i]
        words = doc_words.split()
        appeared = set()
        for word in words:
            if word in appeared:
                continue
            if word in word_doc_list:
                doc_list = word_doc_list[word]
                doc_list.append(i)
                word_doc_list[word] = doc_list
            else:
                word_doc_list[word] = [i]
            appeared.add(word)

    word_doc_freq = {}
    for word, doc_list in word_doc_list.items():
        word_doc_freq[word] = len(doc_list)

    word_id_map = {}
    for i in range(vocab_size):
        word_id_map[vocab[i]] = i

    vocab_str = '\n'.join(vocab)

    f = open('data/corpus/' + dataset + '_vocab.txt', 'w')
    f.write(vocab_str)
    f.close()

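    # Build one pseudo-document per word from its WordNet synset definitions ('<PAD>' when a word has no synsets)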
    definitions = []

    for word in vocab:
        word = word.strip()
        synsets = wn.synsets(clean_str(word))
        word_defs = []
        for synset in synsets:
            syn_def = synset.definition()
            word_defs.append(syn_def)
        word_des = ' '.join(word_defs)
        if word_des == '':
            word_des = '<PAD>'
        definitions.append(word_des)

    string = '\n'.join(definitions)

    f = open('data/corpus/' + dataset + '_vocab_def.txt', 'w')
    f.write(string)
    f.close()

    tfidf_vec = TfidfVectorizer(max_features=1000)
    tfidf_matrix = tfidf_vec.fit_transform(definitions)
    tfidf_matrix_array = tfidf_matrix.toarray()
    #print(tfidf_matrix_array[0], len(tfidf_matrix_array[0]))

    word_vectors = []

    for i in range(len(vocab)):
        word = vocab[i]
        vector = tfidf_matrix_array[i]
        str_vector = []
        for j in range(len(vector)):
            str_vector.append(str(vector[j]))
        temp = ' '.join(str_vector)
        word_vector = word + ' ' + temp
        word_vectors.append(word_vector)

    string = '\n'.join(word_vectors)

    f = open('data/corpus/' + dataset + '_word_vectors.txt', 'w')
    f.write(string)
    f.close()

    word_vector_file = 'data/corpus/' + dataset + '_word_vectors.txt'
    _, embd, word_vector_map = loadWord2Vec(word_vector_file)
    word_embeddings_dim = len(embd[0])

    # label list
    label_set = set()  # unique labels
    for doc_meta in shuffle_doc_name_list:
        temp = doc_meta.split('\t')
        label_set.add(temp[2])
    label_list = list(label_set)

    label_list_str = '\n'.join(label_list)
    f = open('data/corpus/' + dataset + '_labels.txt', 'w')
    f.write(label_list_str)
    f.close()

    # x: feature vectors of training docs, no initial features
    # keep 90% of the training docs; hold out the rest for validation
    train_size = len(train_ids)
    val_size = int(0.1 * train_size)
    real_train_size = train_size - val_size  # - int(0.5 * train_size)
    # different training rates

    real_train_doc_names = shuffle_doc_name_list[:real_train_size]
    real_train_doc_names_str = '\n'.join(real_train_doc_names)

    f = open('data/' + dataset + '.real_train.name', 'w')
    f.write(real_train_doc_names_str)
    f.close()

    row_x = []
    col_x = []
    data_x = []
    for i in range(real_train_size):
        doc_vec = np.array([0.0 for k in range(word_embeddings_dim)])
        doc_words = shuffle_doc_words_list[i]
        words = doc_words.split()
        doc_len = len(words)
        for word in words:
            if word in word_vector_map:
                word_vector = word_vector_map[word]
                # print(doc_vec)
                # print(np.array(word_vector))
                doc_vec = doc_vec + np.array(word_vector)

        for j in range(word_embeddings_dim):
            row_x.append(i)
            col_x.append(j)
            # np.random.uniform(-0.25, 0.25)
            data_x.append(doc_vec[j] / doc_len)  # doc_vec[j]/ doc_len

    # x = sp.csr_matrix((real_train_size, word_embeddings_dim), dtype=np.float32)
    x = sp.csr_matrix((data_x, (row_x, col_x)),
                      shape=(real_train_size, word_embeddings_dim))

    y = []
    for i in range(real_train_size):
        doc_meta = shuffle_doc_name_list[i]
        temp = doc_meta.split('\t')
        temp2 = temp[2].split(',')
        one_hot = []
        for j in range(len(temp2)):
            if temp2[j] == '0':
                one_hot.append(0)
            elif temp2[j] == '1':
                one_hot.append(1)
        y.append(one_hot)
    y = np.array(y)
    print(y)

    # tx: feature vectors of test docs, no initial features
    test_size = len(test_ids)

    row_tx = []
    col_tx = []
    data_tx = []
    for i in range(test_size):
        doc_vec = np.array([0.0 for k in range(word_embeddings_dim)])
        doc_words = shuffle_doc_words_list[i + train_size]
        words = doc_words.split()
        doc_len = len(words)
        for word in words:
            if word in word_vector_map:
                word_vector = word_vector_map[word]
                doc_vec = doc_vec + np.array(word_vector)

        for j in range(word_embeddings_dim):
            row_tx.append(i)
            col_tx.append(j)
            # np.random.uniform(-0.25, 0.25)
            data_tx.append(doc_vec[j] / doc_len)  # doc_vec[j] / doc_len

    # tx = sp.csr_matrix((test_size, word_embeddings_dim), dtype=np.float32)
    tx = sp.csr_matrix((data_tx, (row_tx, col_tx)),
                       shape=(test_size, word_embeddings_dim))

    ty = []
    for i in range(test_size):
        doc_meta = shuffle_doc_name_list[i + train_size]
        temp = doc_meta.split('\t')
        temp2 = temp[2].split(',')
        one_hot = []
        for j in range(len(temp2)):
            if temp2[j] == '0':
                one_hot.append(0)
            elif temp2[j] == '1':
                one_hot.append(1)
        #for i in range(len(temp)):
        #   if i>1:
        #      if temp[i]=='0':
        #         one_hot.append(0)
        #    elif temp[i]=='1':
        #       one_hot.append(1)
        ty.append(one_hot)
    ty = np.array(ty)
    print(ty)

    # allx: the feature vectors of both labeled and unlabeled training instances
    # (a superset of x)
    # unlabeled training instances -> words

    word_vectors = np.random.uniform(
        -0.01, 0.01,
        (vocab_size, word_embeddings_dim
         ))  # vocab_size = len(vocab) word_embeddings_dim = len(embd[0])

    for i in range(len(vocab)):
        word = vocab[i]
        if word in word_vector_map:
            vector = word_vector_map[word]
            word_vectors[i] = vector

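    # allx stacks the averaged document vectors for all training docs, followed by one row per vocabulary word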
    row_allx = []
    col_allx = []
    data_allx = []

    for i in range(train_size):
        doc_vec = np.array([0.0 for k in range(word_embeddings_dim)])
        doc_words = shuffle_doc_words_list[i]
        words = doc_words.split()
        doc_len = len(words)
        for word in words:
            if word in word_vector_map:
                word_vector = word_vector_map[word]
                doc_vec = doc_vec + np.array(word_vector)

        for j in range(word_embeddings_dim):
            row_allx.append(int(i))
            col_allx.append(j)
            # np.random.uniform(-0.25, 0.25)
            data_allx.append(doc_vec[j] / doc_len)  # doc_vec[j]/doc_len
    for i in range(vocab_size):
        for j in range(word_embeddings_dim):
            row_allx.append(int(i + train_size))
            col_allx.append(j)
            data_allx.append(word_vectors.item((i, j)))

    row_allx = np.array(row_allx)
    col_allx = np.array(col_allx)
    data_allx = np.array(data_allx)

    allx = sp.csr_matrix((data_allx, (row_allx, col_allx)),
                         shape=(train_size + vocab_size, word_embeddings_dim))

    ally = []
    for i in range(train_size):
        doc_meta = shuffle_doc_name_list[i]
        temp = doc_meta.split('\t')
        temp2 = temp[2].split(',')
        one_hot = []
        for j in range(len(temp2)):
            if temp2[j] == '0':
                one_hot.append(0)
            elif temp2[j] == '1':
                one_hot.append(1)
        ally.append(one_hot)

    for i in range(vocab_size):
        one_hot = [0 for l in range(len(label_list))]  # word nodes get an all-zero label vector
        ally.append(one_hot)

    ally = np.array(ally)
    print(ally)

    print(x.shape, y.shape, tx.shape, ty.shape, allx.shape, ally.shape)
    '''
    Doc word heterogeneous graph
    '''

    # word co-occurence with context windows
    window_size = 20
    windows = []

    for doc_words in shuffle_doc_words_list:
        words = doc_words.split()
        length = len(words)
        if length <= window_size:
            windows.append(words)
        else:
            # print(length, length - window_size + 1)
            for j in range(length - window_size + 1):
                window = words[j:j + window_size]
                windows.append(window)
                # print(window)

    word_window_freq = {}
    for window in windows:
        appeared = set()
        for i in range(len(window)):
            if window[i] in appeared:
                continue
            if window[i] in word_window_freq:
                word_window_freq[window[i]] += 1
            else:
                word_window_freq[window[i]] = 1
            appeared.add(window[i])

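    # Count co-occurrences of word pairs inside each sliding window (both orders are stored)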
    word_pair_count = {}
    for window in windows:
        for i in range(1, len(window)):
            for j in range(0, i):
                word_i = window[i]
                word_i_id = word_id_map[word_i]
                word_j = window[j]
                word_j_id = word_id_map[word_j]
                if word_i_id == word_j_id:
                    continue
                word_pair_str = str(word_i_id) + ',' + str(word_j_id)
                if word_pair_str in word_pair_count:
                    word_pair_count[word_pair_str] += 1
                else:
                    word_pair_count[word_pair_str] = 1
                # two orders
                word_pair_str = str(word_j_id) + ',' + str(word_i_id)
                if word_pair_str in word_pair_count:
                    word_pair_count[word_pair_str] += 1
                else:
                    word_pair_count[word_pair_str] = 1

    row = []
    col = []
    weight = []

    # pmi as weights

    num_window = len(windows)

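    # PMI(i, j) = log( p(i, j) / (p(i) * p(j)) ), with probabilities estimated from window counts; only positive-PMI word-word edges are kept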
    for key in word_pair_count:
        temp = key.split(',')
        i = int(temp[0])
        j = int(temp[1])
        count = word_pair_count[key]
        word_freq_i = word_window_freq[vocab[i]]
        word_freq_j = word_window_freq[vocab[j]]
        pmi = log(
            (1.0 * count / num_window) / (1.0 * word_freq_i * word_freq_j /
                                          (num_window * num_window)))
        if pmi <= 0:
            continue
        row.append(train_size + i)
        col.append(train_size + j)
        weight.append(pmi)

    # word vector cosine similarity as weights
    '''
   for i in range(vocab_size):
       for j in range(vocab_size):
           if vocab[i] in word_vector_map and vocab[j] in word_vector_map:
               vector_i = np.array(word_vector_map[vocab[i]])
               vector_j = np.array(word_vector_map[vocab[j]])
               similarity = 1.0 - cosine(vector_i, vector_j)
               if similarity > 0.9:
                   print(vocab[i], vocab[j], similarity)
                   row.append(train_size + i)
                   col.append(train_size + j)
                   weight.append(similarity)
    '''
    # doc word frequency
    doc_word_freq = {}

    for doc_id in range(len(shuffle_doc_words_list)):
        doc_words = shuffle_doc_words_list[doc_id]
        words = doc_words.split()
        for word in words:
            word_id = word_id_map[word]
            doc_word_str = str(doc_id) + ',' + str(word_id)
            if doc_word_str in doc_word_freq:
                doc_word_freq[doc_word_str] += 1
            else:
                doc_word_freq[doc_word_str] = 1

    for i in range(len(shuffle_doc_words_list)):
        doc_words = shuffle_doc_words_list[i]
        words = doc_words.split()
        doc_word_set = set()
        for word in words:
            if word in doc_word_set:
                continue
            j = word_id_map[word]
            key = str(i) + ',' + str(j)
            freq = doc_word_freq[key]
            if i < train_size:
                row.append(i)
            else:
                row.append(i + vocab_size)
            col.append(train_size + j)
            idf = log(1.0 * len(shuffle_doc_words_list) /
                      word_doc_freq[vocab[j]])
            weight.append(freq * idf)  # TF-IDF weight for the doc-word edge
            doc_word_set.add(word)

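    # Node layout in the adjacency matrix: training docs first, then the vocabulary words, then the test docs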
    node_size = train_size + vocab_size + test_size
    adj = sp.csr_matrix((weight, (row, col)), shape=(node_size, node_size))

    # dump objects
    f = open("data/ind.{}.x".format(dataset), 'wb')
    pkl.dump(x, f)
    f.close()

    f = open("data/ind.{}.y".format(dataset), 'wb')
    pkl.dump(y, f)
    f.close()

    f = open("data/ind.{}.tx".format(dataset), 'wb')
    pkl.dump(tx, f)
    f.close()

    f = open("data/ind.{}.ty".format(dataset), 'wb')
    pkl.dump(ty, f)
    f.close()

    f = open("data/ind.{}.allx".format(dataset), 'wb')
    pkl.dump(allx, f)
    f.close()

    f = open("data/ind.{}.ally".format(dataset), 'wb')
    pkl.dump(ally, f)
    f.close()

    f = open("data/ind.{}.adj".format(dataset), 'wb')
    pkl.dump(adj, f)
    f.close()
Example #5
        jsonstr = ''.join(f.readlines())
        ent_dic = json.loads(jsonstr)
else:
    cand_dic, ent_dic = GenerateCand('kb.json')

# Generate the train / dev / test text data
if not os.path.exists('../data/generated/train_data.txt'):
    GeneratePairwaiseSample('train.json', cand_dic, ent_dic, is_train=True)
if not os.path.exists('../data/generated/dev_data.txt'):
    GeneratePairwaiseSample('dev.json', cand_dic, ent_dic, is_train=False)
if not os.path.exists('../data/generated/test_data.txt'):
    GeneratePairwaiseSample('test.json', cand_dic, ent_dic, is_train=False)

# matrix: array of word vectors; vocab contains vocab["w2i"] (word -> idx) and vocab["i2w"] (idx -> word); also returns the vector dimension and vocabulary size
if not os.path.exists('../data/pretrain_data/matrix.npy'):
    matrix, vocab, vec_dim, vocab_size = utils.loadWord2Vec(
        "../data/pretrain_data/word2vec.iter5")
else:
    matrix = np.load('../data/pretrain_data/matrix.npy')
    with open('../data/pretrain_data/vocab.json', 'r', encoding='utf8') as f:
        jsonstr = ''.join(f.readlines())
        vocab = json.loads(jsonstr)

# type-to-label dictionary
type2label = utils.type2label

# Encode the data
data_encoder = DataEncoder(vocab["w2i"], type2label)
if not os.path.exists('../data/generated/train.csv'):
    data_encoder.data_encode("../data/generated/train_data.txt", is_train=True)
if not os.path.exists('../data/generated/dev.csv'):
    data_encoder.data_encode("../data/generated/dev_data.txt", is_train=False)
Example #6
import os
import sys
from warnings import warn
import string
import numpy as np
from itertools import cycle
from itertools import repeat

# Try to load the word2vec model and the multilabelbinarizer
w2vfile = './models/w2v'
mlbfile = './models/mlb.pickle'
w2v = False

# Loading pickle files is faster, so check that one first
if os.path.exists(w2vfile + '.pickle'):
    w2v = loadPickle(w2vfile + '.pickle')
elif os.path.exists(w2vfile + '.bin'):
    w2v = loadWord2Vec(w2vfile + '.bin')
else:
    warn(
        "{} not found, will not be able to sub or create word matrices".format(
            w2vfile))

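# layer1_size is the embedding dimensionality reported by the loaded word2vec model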
if w2v:
    word_d = w2v.layer1_size

prepare_mode = '-p' in sys.argv or '--prepare' in sys.argv or '-m' in sys.argv or '--make' in sys.argv

if os.path.exists(mlbfile) and not prepare_mode:
    mlb = loadPickle(mlbfile)
    valid_hashtags = set(mlb.classes_)
else:
    valid_hashtags = set()
Example #7
import sys, logging
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from utils import loadWord2Vec

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger('food2vec')

filename = 'food2vec.model.txt'

food2vec = loadWord2Vec(filename)

labels = food2vec.index2word  # keep the original order so labels[i] lines up with vectors2d[i]
vectors = food2vec.syn0

logger.info("Preparing tsne transformation...")

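# Project the word vectors into 2-D with t-SNE so each word can be plotted and annotated below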
tsne = TSNE(perplexity=15, n_components=2, init='pca', n_iter=4000, early_exaggeration=8.0)
vectors2d = tsne.fit_transform(vectors)

logger.info('Trying to plot food2vec results...')

plt.figure(figsize=(15, 15))
for i, label in enumerate(labels):
    x, y = vectors2d[i,:]
    plt.scatter(x, y)
    plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
plt.savefig('tsne.png')