Example #1
 def __init__(self,
              corpus,
              ptype,
              test=False,
              modelname="crfre_classifier"):
     super(CrfSuiteRE, self).__init__()
     self.data = []
     self.labels = []
     self.scores = []
     self.predicted = []
     self.entities = []
     self.pairtype = ptype
     self.modelname = ptype + "_" + modelname
     self.gold_relations = set()
     self.tair_pairs = load_tair_relations()
     self.vecmodel = word2vec.load("corpora/Thaliana/documents-processed.bin")
     with codecs.open("seedev_relation.txt", 'r', 'utf-8') as relfile:
         for r in relfile:
             self.gold_relations.add(r.strip())
     self.clusters = word2vec.load_clusters(
         "corpora/Thaliana/documents-processed-clusters.txt")
     #with codecs.open("corpora/Thaliana/documents-clusters.txt", "r", "utf-8") as clusterfile:
     #    for l in clusterfile:
     #        values = l.strip().split(" ")
     #        self.clusters[values[0]] = values[1]
     self.generate_data(corpus, self.modelname, ptype, test)
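Both files loaded by the constructor come from a word2vec training run over the Thaliana corpus: the .bin file holds the vectors and the clusters file maps each vocabulary word to a cluster id. A minimal stand-alone sketch of what the two loaded objects expose, assuming those files exist and that the word 'protein' occurs in the corpus vocabulary (both are assumptions, not taken from the original):

import word2vec

vecmodel = word2vec.load("corpora/Thaliana/documents-processed.bin")
clusters = word2vec.load_clusters("corpora/Thaliana/documents-processed-clusters.txt")
if "protein" in vecmodel:
    print(vecmodel["protein"].shape)  # the embedding vector for the word
    print(clusters["protein"])        # the integer cluster id assigned to the word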
Example #2
def create_voabulary_labelO():
    model = word2vec.load('zhihu-word2vec-multilabel.bin-100', kind='bin') #zhihu-word2vec.bin-100
    count=0
    vocabulary_word2index_label={}
    vocabulary_index2word_label={}
    label_unique={}
    for i,vocab in enumerate(model.vocab):
        if '__label__' in vocab:  #'__label__-2051131023989903826
            label=vocab[vocab.index('__label__')+len('__label__'):]
            if label_unique.get(label,None) is None: # if the label has not been seen yet, add it to both dictionaries
                vocabulary_word2index_label[label]=count
                vocabulary_index2word_label[count]=label #ADD
                count=count+1
                label_unique[label]=label
    return vocabulary_word2index_label,vocabulary_index2word_label
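The function above assumes that label tokens were embedded into the same word2vec vocabulary with a '__label__' prefix; the two dictionaries it returns are mutual inverses. A quick sanity-check sketch, assuming the zhihu-word2vec-multilabel.bin-100 file referenced above is available locally:

word2idx, idx2word = create_voabulary_labelO()
print("distinct labels found:", len(word2idx))
# the two mappings invert each other
assert all(idx2word[index] == label for label, index in word2idx.items())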
def assign_pretrained_word_embedding(sess, vocabulary_index2word, vocab_size,
                                     textCNN, word2vec_model_path):
    print("using pre-trained word emebedding.started.word2vec_model_path:",
          word2vec_model_path)
    word2vec_model = word2vec.load(word2vec_model_path, kind='bin')
    word2vec_dict = {}
    for word, vector in zip(word2vec_model.vocab, word2vec_model.vectors):
        word2vec_dict[word] = vector
    word_embedding_2dlist = [
        []
    ] * vocab_size  # create an empty word_embedding list.
    word_embedding_2dlist[0] = np.zeros(
        FLAGS.embed_size)  # assign empty for first word:'PAD'
    bound = np.sqrt(6.0) / np.sqrt(vocab_size)  # bound for random variables.
    count_exist = 0
    count_not_exist = 0
    for i in range(1, vocab_size):  # loop each word
        word = vocabulary_index2word[i]  # get a word
        embedding = None
        try:
            embedding = word2vec_dict[word]  # try to get the pre-trained vector (a numpy array)
        except Exception:
            embedding = None
        if embedding is not None:  # the word has a pre-trained embedding
            word_embedding_2dlist[i] = embedding  # assign the pre-trained vector to this word
            count_exist = count_exist + 1
        else:  # no embedding for this word
            word_embedding_2dlist[i] = np.random.uniform(
                -bound, bound, FLAGS.embed_size)
            count_not_exist = count_not_exist + 1  # init a random value for the word.
    word_embedding_final = np.array(
        word_embedding_2dlist)  # convert to a 2-D array.
    word_embedding = tf.constant(word_embedding_final,
                                 dtype=tf.float32)  # convert to tensor
    t_assign_embedding = tf.assign(
        textCNN.Embedding, word_embedding
    )  # assign this value to our embedding variables of our model.
    sess.run(t_assign_embedding)
    print("word. exists embedding:", count_exist,
          " ;word not exist embedding:", count_not_exist)
    print("using pre-trained word emebedding.ended...")
Example #4
def match_relations(reltype,
                    docfile_root="corpora/Thaliana/documents-processed"):

    model = word2vec.load(docfile_root + '.bin')
    gold_relations = []
    with open("seedev_relation.txt") as f:
        gold_relations = f.readlines()
    unmatched1, unmatched2 = 0, 0
    for r in gold_relations:
        values = r.strip().split("\t")  # strip the trailing newline before splitting
        if values[1] == reltype:
            entity1 = values[0].split("#")[1]
            entity2 = values[2].split("#")[1]
            #print entity1,
            if entity1 in model:
                indexes, metrics = model.cosine(entity1, n=1)
                #print model.generate_response(indexes, metrics).tolist()
            else:
                entity1 = entity1.split(" ")[0]
                if entity1 in model:
                    indexes, metrics = model.cosine(entity1, n=1)
                    #print model.generate_response(indexes, metrics).tolist()
                else:
                    unmatched1 += 1
                    #print
            #print entity2,
            if entity2 in model:
                indexes, metrics = model.cosine(entity2, n=5)
                #print model.generate_response(indexes, metrics).tolist()
            else:
                entity2 = entity2.split(" ")[0]
                if entity2 in model:
                    indexes, metrics = model.cosine(entity2, n=5)
                    #print model.generate_response(indexes, metrics).tolist()
                else:
                    unmatched2 += 1
                    #print
            #print "========================================"
    print(unmatched1, unmatched2)
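The cosine/generate_response calls above belong to the word2vec package: cosine returns parallel arrays of vocabulary indexes and similarity scores, and generate_response pairs them back with the corresponding words. A small stand-alone sketch, assuming the same pre-trained binary exists on disk and that 'protein' occurs in its vocabulary (both are assumptions):

import word2vec

model = word2vec.load("corpora/Thaliana/documents-processed.bin")
if "protein" in model:                               # vocabulary membership test
    indexes, metrics = model.cosine("protein", n=5)  # top-5 nearest neighbours
    for word, score in model.generate_response(indexes, metrics).tolist():
        print(word, score)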
Example #5
import nltk
import pandas as pd
import word2vec

miniBatch_size = 64

f = pd.read_csv("testEchantillonnage1.csv").values[:, 1:]
print("\n*** La taille du dataSet est de : ", len(f[:, 0]), " lignes ***")
token = []

for i in range(0, len(f[:, 3])):
    st = str(f[i, 3]).lower()  # Python 3 strings are already unicode, no decode needed
    token.append(nltk.word_tokenize(st))  # one list of tokens per row
print("*** finished tokenizing ***")

print("*** Start Word2Vec with ", len(token), " elements *** ")

if LOAD_VOCAB:
    print("** Starting Word2Vec loading.")
    w2v = word2vec.load('weights/FR.vocab')
    print("** Word2Vec loading ended.")
else:
    print("** Starting Word2Vec saving.")
    w2v = word2vec.save(token, 'weights/FR.vocab')
    print("** Word2Vec saving ended.")

print("\n*** Word2Vec processed ***\n")

keys = list(w2v.wv.vocab.keys())  # materialize the keys so they can be indexed below
vocabIndex = {}
for i in range(len(keys)):
    w = keys[i]
    vocabIndex[w] = w2v.wv.vocab[w].index
    if i % 500 == 0: