from nltk import pos_tag
from gensim.similarities import WmdSimilarity
import gensim
import datetime
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch
import json
import os

from bert_serving.client import BertClient

with open(os.path.join(os.pardir, "outnew", "cate", "bert_category.json"),
          'r') as file:
    cates = json.load(file)

bc = BertClient()

doc_vecs = np.load('../outnew/bert_cate_v4/algorithm_vec.npy')

while True:
    query = input('your question: ')
    query_vec = bc.encode([query])[0]
    print(query_vec)
    print(bc.encode([query]))
    # compute normalized dot product as score
    score = np.sum(query_vec * doc_vecs, axis=1) / np.linalg.norm(doc_vecs,
                                                                  axis=1)
    topk_idx = np.argsort(score)[::-1][:10]
    for idx in topk_idx:
        print('> %s\t%s' % (score[idx], cates['algorithm'][idx]))
Example #2
class PICOSpanRobot:
    def __init__(self):
        """
        This bot tags sequences of words from abstracts as describing
        P,I, or O elements.
        """
        logging.debug("Loading PICO LSTM-CRF")
        config = Config()
        # build model
        self.model = NERModel(config)
        self.model.build()

        self.model.restore_session(
            os.path.join(robotreviewer.DATA_ROOT, "pico_spans/model.weights/"))
        logging.debug("PICO classifiers loaded")
        self.bert = BertClient(ip=BERT_IP,
                               port=BERT_PORT,
                               port_out=BERT_PORT_OUT)

    def api_annotate(self, articles, get_berts=True, get_meshes=True):

        if not all(((('parsed_ab' in article) and ('parsed_ti' in article)) or
                    (article.get('skip_annotation')) for article in articles)):
            raise Exception(
                'PICO span model requires a title and abstract to be able to complete annotation'
            )
        annotations = []
        for article in articles:
            if article.get('skip_annotation'):
                annotations.append([])
            else:
                annotations.append(
                    self.annotate(
                        {
                            "title": article['parsed_ti'],
                            "abstract": article['parsed_ab']
                        },
                        get_berts=get_berts,
                        get_meshes=get_meshes))
        return annotations

    def pdf_annotate(self, data):
        if data.get("abstract") is not None and data.get("title") is not None:
            ti = tokenizer.nlp(data["title"])
            ab = tokenizer.nlp(data["abstract"])
        elif data.get("parsed_text") is not None:
            # then just use the start of the document
            TI_LEN = 30
            AB_LEN = 500
            # best guesses based on sample of RCT abstracts + aiming for 95% centile
            ti = tokenizer.nlp(data['parsed_text'][:TI_LEN].string)
            ab = tokenizer.nlp(data['parsed_text'][:AB_LEN].string)
        else:
            # else can't proceed
            return data

        data.ml["pico_span"] = self.annotate({"title": ti, "abstract": ab})

        return data

    def annotate(self, article, get_berts=True, get_meshes=True):
        """
        Annotate abstract of clinical trial report
        """

        label_dict = {
            "1_p": "population",
            "1_i": "interventions",
            "1_o": "outcomes"
        }

        out = {"population": [], "interventions": [], "outcomes": []}

        for sent in chain(article['title'].sents, article['abstract'].sents):
            words = [w.text for w in sent]
            preds = self.model.predict(words)

            last_label = "N"
            start_idx = 0

            for i, p in enumerate(preds):

                if p != last_label and last_label != "N":
                    out[label_dict[last_label]].append(
                        sent[start_idx:i].text.strip())
                    start_idx = i

                if p != last_label and last_label == "N":
                    start_idx = i

                last_label = p

            if last_label != "N":
                out[label_dict[last_label]].append(
                    sent[start_idx:].text.strip())

        for e in out:
            out[e] = cleanup(out[e])

        if get_berts:
            for k in ['population', 'interventions', 'outcomes']:
                bert_out_key = "{}_berts".format(k)

                bert_q = []
                for r in out[k]:
                    if r.strip() and len(r) > 5:
                        bert_q.append(r.strip())

                if len(bert_q) == 0:
                    out[bert_out_key] = []
                else:
                    out[bert_out_key] = [
                        r.tolist() for r in self.bert.encode(bert_q)
                    ]

        if get_meshes:
            abbrev_dict = schwartz_hearst.extract_abbreviation_definition_pairs(
                doc_text=article['abstract'].text)
            for k in ['population', 'interventions', 'outcomes']:
                out[f"{k}_mesh"] = minimap.get_unique_terms(
                    out[k], abbrevs=abbrev_dict)

        return out

    @staticmethod
    def get_marginalia(data):
        """
        Get marginalia formatted for Spa from structured data
        """
        marginalia = [{
            "type": "PICO text from abstracts",
            "title": "PICO characteristics",
            "annotations": [],
            "description": data["ml"]["pico_span"]
        }]
        return marginalia
Example #3
class DataLoader(object):
    """
    Load data from json files, preprocess and prepare batches.
    """
    def __init__(self,
                 filename,
                 batch_size,
                 opt,
                 vocab,
                 life,
                 evaluation=False):
        self.batch_size = batch_size
        self.opt = opt
        self.vocab = vocab
        self.eval = evaluation
        self.bc = BertClient()
        self.life = life

        with open(filename) as infile:
            data = json.load(infile)
        if opt['bert']:
            data = self.preprocess_bert(data, vocab, opt)
        else:
            data = self.preprocess(data, vocab, opt)
        # shuffle for training
        if not evaluation:
            indices = list(range(len(data)))
            random.shuffle(indices)
            data = [data[i] for i in indices]
        if self.life:
            id2label = dict([(v, k)
                             for k, v in constant.LIFE_LABEL_TO_ID.items()])
        else:
            id2label = dict([(v, k) for k, v in constant.LABEL_TO_ID.items()])
        self.labels = [id2label[d[-1]] for d in data]
        self.num_examples = len(data)

        # chunk into batches
        data = [
            data[i:i + batch_size] for i in range(0, len(data), batch_size)
        ]
        self.data = data
        print("{} batches created for {}".format(len(data), filename))

    def preprocess(self, data, vocab, opt):
        """ Preprocess the data and convert to ids. """
        processed = []
        for d in data:
            tokens = d['token']
            if opt['lower']:
                tokens = [t.lower() for t in tokens]
            # anonymize tokens
            ss, se = d['subj_start'], d['subj_end']
            os, oe = d['obj_start'], d['obj_end']
            tokens[ss:se + 1] = ['SUBJ-' + d['subj_type']] * (se - ss + 1)
            tokens[os:oe + 1] = ['OBJ-' + d['obj_type']] * (oe - os + 1)
            tokens = map_to_ids(tokens, vocab.word2id)
            pos = map_to_ids(d['stanford_pos'], constant.POS_TO_ID)
            ner = map_to_ids(d['stanford_ner'], constant.NER_TO_ID)
            deprel = map_to_ids(d['stanford_deprel'], constant.DEPREL_TO_ID)
            l = len(tokens)
            subj_positions = get_positions(d['subj_start'], d['subj_end'], l)
            obj_positions = get_positions(d['obj_start'], d['obj_end'], l)
            if self.life:
                relation = constant.LIFE_LABEL_TO_ID[d['relation']]
            else:
                relation = constant.LABEL_TO_ID[d['relation']]
            processed += [(tokens, pos, ner, deprel, subj_positions,
                           obj_positions, relation)]
        return processed

    def preprocess_bert(self, data, vocab, opt):
        """ Preprocess the data and convert to ids. """
        processed = []
        for d in data:
            tokens = d['token']
            for i, token in enumerate(tokens):
                if not isinstance(token, str):
                    tokens[i] = str(token)
            # anonymize tokens [skip, doesn't prevent overfitting?]
            # ss, se = d['subj_start'], d['subj_end']
            # os, oe = d['obj_start'], d['obj_end']
            # TODO check indexing
            # tokens[ss:se] = ['SUBJ-'+d['subj_type']] * (se-ss)
            # tokens[os:oe] = ['OBJ-'+d['obj_type']] * (oe-os)

            # tokens = map_to_ids(tokens, vocab.word2id)

            # pos = map_to_ids(d['stanford_pos'], constant.POS_TO_ID)
            # ner = map_to_ids(d['stanford_ner'], constant.NER_TO_ID)
            # deprel = map_to_ids(d['stanford_deprel'], constant.DEPREL_TO_ID)

            l = len(tokens)
            subj_positions = get_positions(d['subj_start'], d['subj_end'], l)
            obj_positions = get_positions(d['obj_start'], d['obj_end'], l)
            if self.life:
                relation = constant.LIFE_LABEL_TO_ID[d['relation']]
            else:
                relation = constant.LABEL_TO_ID[d['relation']]
            processed += [(tokens, None, None, None, subj_positions,
                           obj_positions, relation)]
        return processed

    def gold(self):
        """ Return gold labels as a list. """
        return self.labels

    def __len__(self):
        #return 50
        return len(self.data)

    def __getitem__(self, key):
        """ Get a batch with index. """
        if not isinstance(key, int):
            raise TypeError
        if key < 0 or key >= len(self.data):
            raise IndexError
        batch = self.data[key]
        batch_size = len(batch)
        batch = list(zip(*batch))
        assert len(batch) == 7

        # sort all fields by lens for easy RNN operations
        lens = [len(x) for x in batch[0]]
        batch, orig_idx = sort_all(batch, lens)

        # word dropout
        if not self.eval:
            words = [
                word_dropout(sent, self.opt['word_dropout'])
                for sent in batch[0]
            ]
        else:
            words = batch[0]

        if self.opt['bert']:

            w = self.bc.encode(padded(words), is_tokenized=True)
            # for i, word in enumerate(words):
            #     if word[:4] == "SUBJ":

            words = torch.FloatTensor(w)
            pos, ner, deprel = None, None, None
        else:
            # convert to tensors
            words = get_long_tensor(words, batch_size)
            pos = get_long_tensor(batch[1], batch_size)
            ner = get_long_tensor(batch[2], batch_size)
            deprel = get_long_tensor(batch[3], batch_size)
        masks = torch.eq(words, 0)
        subj_positions = get_long_tensor(batch[4], batch_size)
        obj_positions = get_long_tensor(batch[5], batch_size)

        rels = torch.LongTensor(batch[6])

        return (words, masks, pos, ner, deprel, subj_positions, obj_positions,
                rels, orig_idx)

    def __iter__(self):
        for i in range(self.__len__()):
            yield self.__getitem__(i)
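Note: the loader above relies on helper utilities (padded, map_to_ids, get_positions, sort_all, word_dropout, get_long_tensor) that are not part of this excerpt. As a rough sketch only, a padded helper compatible with the bc.encode(..., is_tokenized=True) call in __getitem__ might look like this; the real implementation may differ:

# Hypothetical sketch of the `padded` helper used in __getitem__ above:
# right-pad every tokenized sentence so the encoded batch has a uniform width.
def padded(tokenized_sents, pad_token='[PAD]'):
    max_len = max(len(sent) for sent in tokenized_sents)
    return [sent + [pad_token] * (max_len - len(sent)) for sent in tokenized_sents]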
Example #4
import json
import os
from tqdm import tqdm
from copy import deepcopy
from bert_serving.client import BertClient

bc = BertClient(output_fmt='list')

# input: cord-19 dataset in directory ./data/
# output: each paper as its own json, trimmed and ready to upload to elasticsearch in ./trimmed_papers/


def get_bert_encoding(text: str) -> list:
    return bc.encode([text])[0]


def write_json(output_file: str, data: dict, counter: int) -> int:
    with open(output_file, 'w') as f:
        json.dump(data, f)
    return counter + 1


def handle_file(data: dict, p_num_offset: int):

    keep = {}

    keys = data.keys()

    p_num = p_num_offset

    if 'abstract' in keys:
Example #5
from bert_serving.client import BertClient
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
#import gensim

client = BertClient(check_length=False)

# Load Google's pre-trained Word2Vec model.
# model =  gensim.models.KeyedVectors.load_word2vec_format('crisisNLP_word_vector.bin', binary=True)
#model = gensim.models.KeyedVectors.load_word2vec_format('/home/nilesh/SciBert/scibert_scivocab_uncased.tar.gz', binary=True)

#text_data=[]
# filepath = 'srilanka_preprocessed.txt'
# filepath = 'distinct_imp_senetences.txt'
# filepath = 'test1.txt'
filepath1 = 'file1.csv'
filepath2 = 'output3.csv'

f5 = open('finalSent.txt', 'w')


def maxn(list1, N):
    final_list = []

    for i in range(0, N):
        max1 = (0, 0)

        for j in range(len(list1)):
            if list1[j][0] > max1[0]:
                max1 = list1[j]
Example #6
import numpy as np

# load_file, path, A and B come from earlier in the original script (not shown here)
child, parent = load_file(path, A, B)

test_a = np.array(child)
test_b = np.array(parent)
test_all = np.append(test_a,test_b)
test_all = np.unique(test_all)
np.save(name,test_all)

split_data_a = []

for i in test_all:
    split_data_a.append(i.split(' '))

from bert_serving.client import BertClient

bc = BertClient()

data_a_vectors = []
num = len(split_data_a)
i = 0
for j in split_data_a:
    if i%1000 == 0:
        print(i/num)

    try:
        data_a_vectors.append(bc.encode(j))
    except ValueError:
        j.remove('')
        data_a_vectors.append(bc.encode(j))
    i += 1
Example #7
#!/usr/bin/env python3
""" Generating sentence embeddings using pre-trained BERT models. 
This script uses https://github.com/hanxiao/bert-as-service for a 
simpler sentence vectorisation
"""

# Initialise server from terminal (can specify model)
#bert-serving-start -model_dir data/bert_models/uncased_L-12_H-768_A-12 -num_worker=1 -max_seq_len=128

import numpy as np
import pandas as pd

from bert_serving.client import BertClient

lan = 'es'

# Load sentences
wpd = pd.read_csv("data/un-timed-sentences/en-" + lan + ".processed",
                  sep='\t').drop_duplicates()
sentences = list(wpd['Segment'])

bc = BertClient()
df = pd.DataFrame(bc.encode(sentences))

df.to_csv("bert-embeddings-timed-sentences-" + lan + ".csv",
          header=None,
          index=False)
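For downstream use, the embeddings written above can be read back with pandas and compared with scikit-learn; a minimal sketch, assuming the CSV produced by the Spanish run of this script:

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

emb = pd.read_csv("bert-embeddings-timed-sentences-es.csv", header=None).values
sims = cosine_similarity(emb[:1], emb)   # similarity of the first sentence to all others
print(sims.argsort()[0][::-1][:5])       # indices of the five most similar sentences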
Example #8
import numpy as np
from bert_serving.client import BertClient
from termcolor import colored

prefix_q = '##### **Q:** '
topk = 5

with open('../README.md') as fp:
    questions = [
        v.replace(prefix_q, '').strip() for v in fp
        if v.strip() and v.startswith(prefix_q)
    ]
    print('%d questions loaded, avg. len of %d' %
          (len(questions), np.mean([len(d.split()) for d in questions])))

with BertClient(port=5555, port_out=5556) as bc:
    doc_vecs = bc.encode(questions)

    while True:
        query = input(colored('your question: ', 'green'))
        query_vec = bc.encode([query])[0]
        # compute normalized dot product as score
        score = np.sum(query_vec * doc_vecs, axis=1) / np.linalg.norm(doc_vecs,
                                                                      axis=1)
        topk_idx = np.argsort(score)[::-1][:topk]
        print('top %d questions similar to "%s"' %
              (topk, colored(query, 'green')))
        for idx in topk_idx:
            print('> %s\t%s' % (colored('%.1f' % score[idx], 'cyan'),
                                colored(questions[idx], 'yellow')))
Example #9
    gamma = 0.15
    rel = np.sum(rank_list[top_k_index], axis=0) / K
    irrel = np.sum(rank_list[bottom_k_index], axis=0) / K
    query = (alpha * query + beta * rel - gamma * irrel) / (alpha + beta -
                                                            gamma)
    return query


if __name__ == '__main__':
    #bert-serving-start -model_dir chinese_L-12_H-768_A-12/
    embedding_dict = load_obj('embedding/embedding_bert')
    K = 10
    prf_iteration = 10

    query = input()
    bc = BertClient()
    vector = bc.encode([query])

    rank_list = []
    for key in embedding_dict:
        rank_list.append([embedding_dict[key]])
    rank_list = np.array(rank_list).squeeze()

    top_k_index, bottom_k_index = topK(vector, rank_list, K)

    top_result, bottom_result = topK_result(top_k_index, bottom_k_index, K)

    for i in range(prf_iteration):
        vector = PRF(vector, rank_list, top_k_index, bottom_k_index, K)

        top_k_index, bottom_k_index = topK(vector, rank_list, K)
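The snippet above calls topK and topK_result, which are not shown. A hypothetical topK, assuming documents are ranked by cosine similarity against the query vector, could look like:

import numpy as np

def topK(query_vec, rank_list, K):
    q = np.asarray(query_vec).ravel()      # query as a flat (dim,) vector
    scores = rank_list @ q                 # one raw score per document
    scores /= np.linalg.norm(rank_list, axis=1) * np.linalg.norm(q) + 1e-12
    order = np.argsort(scores)[::-1]       # best first
    return order[:K], order[-K:]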
Example #10
import numpy as np
import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Collect BERT embeddings into a dictionary via bert-serving
from bert_serving.client import BertClient

client = BertClient()

bert_embeddings = {}
glove_embeddings = {}

with open("./top_50000.txt", 'r', encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        tag = nltk.pos_tag([word])[0][1]
        if len(word) > 1 and word.isalpha() and tag in ['NN', 'NNP']:
            # add to bert_embeddings
            vector_bert = client.encode([word])
            bert_embeddings[word] = vector_bert[0]

            # add to glove_embeddings
            vector_glove = np.asarray(values[1:], "float32")
            glove_embeddings[word] = vector_glove

# Save dictionaries
np.save('bert.npy', bert_embeddings)
np.save('glove.npy', glove_embeddings)
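Since np.save pickles the dictionaries, reloading them later requires allow_pickle=True and .item(), for example:

import numpy as np

bert_embeddings = np.load('bert.npy', allow_pickle=True).item()
glove_embeddings = np.load('glove.npy', allow_pickle=True).item()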
Example #11
class MultiSenDetect(object):
    def __init__(self):
        # cur = '/'.join(os.path.abspath(__file__).split('/')[:-1])
        # self.embedding_size = 60#300
        # self.embedding_path = os.path.join(cur,'Word60.model')# 'word_vec_300.bin')
        # self.embdding_dict = self.load_embedding(self.embedding_path)
        # Set the entity-similarity threshold to sim_limit; two entities scoring above it are treated as the same entity
        # self.sim_limit = 0.8
        self.bc=BertClient(ip='192.168.1.101',ignore_all_checks=True)
        self.word_dict=dict()
        self.kg_dict=dict()
        # Use Redis instead of files
        self.redis = None
        try:
            # Cache for Baidu Baike pages
            pool = redis.ConnectionPool(host='192.168.1.101', port=6379, db=1, decode_responses=True)
            # Cache for word vectors
            pool1 = redis.ConnectionPool(host='192.168.1.101', port=6379, db=2)
            # Cache for keywords
            pool2 = redis.ConnectionPool(host='192.168.1.101', port=6379, db=3)
            self.redis = redis.Redis(connection_pool=pool)
            self.redis_1=redis.StrictRedis(connection_pool=pool1)
            self.redis_2 = redis.StrictRedis(connection_pool=pool2)
            logging.info('baidu cache in redis is connected ,count %d' % (self.redis.dbsize()))
            logging.info('word vector in redis is connected ,count %d' % (self.redis_1.dbsize()))
            logging.info('keyword in redis is connected ,count %d' % (self.redis_2.dbsize()))
            load_file = open('./mod/place_dict.bin', 'rb')
            self.place_dict = pickle.load(load_file)
            logging.info('place_dict count %d' % (len(self.place_dict)))
        except:
            # If Redis is unavailable, fall back to files
            try:
                load_file = open('./mod/baidu_cache.bin', 'rb')
                self.baidu_cache = pickle.load(load_file)
                logging.info('baidu cache count %d' % (len(self.baidu_cache)))
                load_file = open('./mod/word_dict.bin', 'rb')
                self.word_dict = pickle.load(load_file)
                logging.info('word vector dict count %d' % (len(self.word_dict)))
            except:
                self.baidu_cache = dict()
                self.word_dict = dict()



    '''Fetch the main page'''
    def get_html(self, url):
        if self.redis:
            if self.redis.exists(url):
                return self.redis.get(url)
            else:
                html=request.urlopen(url,timeout=600).read().decode('utf-8').replace('&nbsp;', '')
                self.redis.set(url,html)
                return html
        if self.baidu_cache:
            if self.baidu_cache.get(url):
                return self.baidu_cache.get(url)
            else:
                # request.
                content=request.urlopen(url,timeout=600).read().decode('utf-8').replace('&nbsp;', '')
                self.baidu_cache[url]=content
                return content
        # return request.urlopen(url).read().decode('utf-8').replace('&nbsp;', '')

    '''Collect the multiple senses of a word'''
    '''Here this specifically refers to person names'''
    def collect_mutilsens(self, word):
        # if self.baidu_person.get(word):
        #     html=self.baidu_person.get(word)
        # else:
        url = "http://baike.baidu.com/item/%s?force=1" % parse.quote(word)
        html = self.get_html(url)
        # self.baidu_person[word]=html

        selector = etree.HTML(html)
        # This check is sometimes inaccurate
        sens = [''.join(i.split(':')[1:]) for i in selector.xpath('//li[@class="list-dot list-dot-paddingleft"]/div/a/text()')]
        sens_link = ['http://baike.baidu.com' + i for i in selector.xpath('//li[@class="list-dot list-dot-paddingleft"]/div/a/@href')]
        sens_dict = {sens[i]:sens_link[i] for i in range(len(sens)) if sens[i].strip()!=''}
        # There may be only a single sense rather than multiple ones
        if len(sens_dict)==0:
            sens_dict={word:url}
        return sens_dict

    '''Concept extraction'''
    def extract_concept(self, desc):
        # In practice only the last candidate is kept; this could be extended to match n with n and nr with nr
        desc_seg = [[i.word, i.flag] for i in pseg.cut(desc)]
        concepts_candi = [i[0] for i in desc_seg if i[1][0] in ['n','b','v','d']]
        return concepts_candi[-1]

    def extract_baidu(self, selector):
        info_data = {}
        if selector.xpath('//h2/text()'):
            info_data['current_semantic'] = selector.xpath('//h2/text()')[0].replace('    ', '').replace('(','').replace(')','')
        else:
            info_data['current_semantic'] = ''
        if info_data['current_semantic'] == '目录':
            info_data['current_semantic'] = ''

        info_data['tags'] = [item.replace('\n', '') for item in selector.xpath('//span[@class="taglist"]/text()')]
        if selector.xpath("//div[starts-with(@class,'basic-info')]"):
            for li_result in selector.xpath("//div[starts-with(@class,'basic-info')]")[0].xpath('./dl'):
                attributes = [attribute.xpath('string(.)').replace('\n', '') for attribute in li_result.xpath('./dt')]
                values = [value.xpath('string(.)').replace('\n', '') for value in li_result.xpath('./dd')]
                for item in zip(attributes, values):
                    info_data[item[0].replace('    ', '')] = item[1].replace('    ', '')
        # Supplementary metadata
        try:
            info_data['desc'] = selector.xpath('//meta[@name="description"]/@content')[0]
        except:
            info_data['desc'] =[]
        paras=[]
        para_text=''
        pattern = re.compile('“(.*?)”')
        if selector.xpath("//div[starts-with(@class,'para')]"):
            for para in selector.xpath("//div[starts-with(@class,'para')]"):
                # paras.append(para.text)
                if para.text:
                    para_text=para_text+para.text
            paras=pattern.findall(para_text)
            info_data['keywords']=self.extract_keywords(info_data['desc']+para_text)+paras# anse.extract_tags(para_text, topK=20, withWeight=False)
        else:
            info_data['keywords'] =self.extract_keywords(info_data['desc'])#[]
        # Later compute values such as location, position and the most frequent words

        # info_data['keywords']=selector.xpath('//meta[@name="keywords"]/@content')
        return info_data

    '''Main routine for polysemous words'''
    def collect_concepts(self, wd):
        # If we had built our own knowledge graph we should ask it first and only query the web when it has no answer.
        sens_dict = self.collect_mutilsens(wd)
        if not sens_dict:
            return {}
        concept_dict = {}
        concepts_dict = {}
        for sen, link in sens_dict.items():
            #     concept_dict[sen]=[link]
            #     concept = self.extract_concept(sen)
            #     if concept not in concept_dict:
            #         concept_dict[concept] = [link]
            #     else:
            #         concept_dict[concept].append(link)
            # cluster_concept_dict = self.concept_cluster(concept_dict)
            #
            # for concept, links in cluster_concept_dict.items():
            #     link = links[0]
            concept = sen
            if self.redis_2:
                if self.redis_2.exists(link):
                    concept_data=pickle.loads(self.redis_2.get(link))
                else:

                    selector = etree.HTML(self.get_html(link))
                    concept_data=self.extract_baidu(selector)
                    self.redis_2.set(link,pickle.dumps(concept_data))
                    # desc, keywords = self.extract_desc(link,wd)
                desc =concept_data['desc']
                concept_data['link']=link
                # keywords=' '.join(concept_data['keywords'])
                # context = ' '.join(desc + [' '] + concept_data['keywords'])
                context = concept_data['keywords']
                concepts_dict[concept] = context
                self.kg_dict[concept] = concept_data

        # pprint.pprint(concepts_dict)
        return concepts_dict
    def getConcept(self,concept):
        return self.kg_dict.get(concept)
    '''Clustering of word senses'''
    # def concept_cluster(self, sens_dict):
    #     sens_list = []
    #     cluster_sens_dict = {}
    #     for sen1 in sens_dict:
    #         sen1_list = [sen1]
    #         for sen2 in sens_dict:
    #             if sen1 == sen2:
    #                 continue
    #             sim_score = self.similarity_cosine(self.get_wordvector(sen1), self.get_wordvector(sen2))
    #             if sim_score >= self.sim_limit:
    #                 sen1_list.append(sen2)
    #         sens_list.append(sen1_list)
    #     sens_clusters = self.entity_clusters(sens_list)
    #     for sens in sens_clusters:
    #         symbol_sen = list(sens)[0]
    #         cluster_sens_dict[symbol_sen] = sens_dict[symbol_sen]
    #
    #     return cluster_sens_dict

    '''Cluster entities connected by shared edges'''
    # def entity_clusters(self, s):
    #     clusters = []
    #     for i in range(len(s)):
    #         cluster = s[i]
    #         for j in range(len(s)):
    #             if set(s[i]).intersection(set(s[j])) and set(s[i]).intersection(set(cluster)) and set(
    #                     s[j]).intersection(set(cluster)):
    #                 cluster += s[i]
    #                 cluster += s[j]
    #         if set(cluster) not in clusters:
    #             clusters.append(set(cluster))
    #
    #     return clusters

    '''Fetch the concept description to serve as the meaning of this sense'''
    # def extract_desc(self, link):
    #     html = self.get_html(link)
    #     selector = etree.HTML(html)
    #     # This selector.xpath call loses content and truncates it to '...'
    #     keywords = selector.xpath('//meta[@name="keywords"]/@content')
    #     desc = selector.xpath('//meta[@name="description"]/@content')
    #     print(desc)
    #     return desc, keywords

    '''Extract keywords from the concept description as a structured representation of the whole concept'''
    def extract_keywords(self, sent):
        # keywords = [i for i in anse.extract_tags(sent, topK=20, withWeight=False, allowPOS=('n', 'v', 'ns', 'nh', 'nr', 'm', 'q', 'b', 'i', 'j')) if i !=wd]
        keywords = [i for i in anse.extract_tags(sent, topK=20, withWeight=False, allowPOS=('n', 'v', 'ns', 'nh', 'nr', 'q', 'b', 'i', 'j'))]
        return keywords

    '''Load word vectors'''
    # def load_embedding(self, embedding_path):
        # embedding_dict = {}
        # count = 0
        # for line in open(embedding_path):
        #     line = line.strip().split(' ')
        #     if len(line) < 300:
        #         continue
        #     wd = line[0]
        #     vector = np.array([float(i) for i in line[1:]])
        #     embedding_dict[wd] = vector
        #     count += 1
        #     if count%10000 == 0:
        #         print(count, 'loaded')
        # print('loaded %s word embedding, finished'%count)
        # w2v=word2vec.Word2Vec.load(embedding_path)

        # return w2v#embedding_dict
        # return None
    '''Build a sentence representation from word vectors via a lookup table'''
    '''Now replaced everywhere by BERT vectors'''
    def rep_sentencevector(self, sentence):

        return self.bc.encode([sentence])[0]
        # word_list = self.extract_keywords(sentence)
        # # Sum the keyword vectors and take the average
        # embedding = np.zeros(self.embedding_size)
        # sent_len = 0
        # for index, wd in enumerate(word_list):
        #     if wd in self.embdding_dict:
        #         embedding += self.embdding_dict.wv.get_vector(wd) #self.embdding_dict.get(wd)
        #         sent_len += 1
        #     else:
        #         continue
        # return embedding/sent_len
    '''Use BERT to obtain the vectors of a group of keywords'''
    def get_wordsvectors(self,words):
        if len(words)==0:return []
        key=''.join(words)
        # Try mget so all keys are fetched in a single round trip
        if self.redis_1:
            word_v_dumps = self.redis_1.mget(words)
            encodes=[]
            needencodes_word=[]
            needencodes_index=[]
            for i,dump in enumerate(word_v_dumps):
                if dump is None:
                    needencodes_index.append(i)
                    needencodes_word.append(words[i])
                else:
                    encodes.insert(i,pickle.loads(dump))
            if len(needencodes_word)>0:
                vecs=self.bc.encode(needencodes_word)
                for vec in zip(needencodes_index,vecs,needencodes_word):
                    self.redis_1.set(vec[2], pickle.dumps(vec[1]))
                    encodes.insert(vec[0],vec[1])
            return encodes
            # if self.redis_1.exists(key):
            #     return pickle.loads(self.redis_1.get(key))
            # else:
            #     vecs=self.bc.encode(words)
            #     self.redis_1.set(key, pickle.dumps(vecs))
            #     return vecs
        return self.bc.encode(words)

    '''Get the vector of a single word'''
    '''Uses BERT character vectors in place of word vectors'''
    def get_wordvector(self, word):
        if self.redis_1:
            if self.redis_1.exists(word):
                return pickle.loads(self.redis_1.get(word))
            else:
                vec = self.bc.encode([word])[0]

                self.redis_1.set(word, pickle.dumps(vec))
                return vec
        if self.word_dict:
            if self.word_dict.get(word) is None:
                vec=self.bc.encode([word])[0]
                self.word_dict[word]=vec
            else:
                vec=self.word_dict.get(word)
            return vec#self.bc.encode([word])[0]
        # try:
        #     v=self.embdding_dict.wv.get_vector(word)
        # except:
        #     v=np.array([0]*self.embedding_size)
        # return v
        # return np.array(self.embdding_dict.get(word, [0]*self.embedding_size))
    '''Pairwise (cosine) similarity between two vectors'''
    def similarity_cosine(self, vector1, vector2):
        cos1 = np.sum(vector1 * vector2)
        cos21 = linalg.norm(vector1)
        cos22 = linalg.norm(vector2)
        # cos21 = np.sqrt(sum(vector1 ** 2))
        # cos22 = np.sqrt(sum(vector2 ** 2))
        similarity = cos1 / float(cos21 * cos22)
        if str(similarity) == 'nan':
            return 0.0
        else:
            return similarity
    '''Cosine similarity between two sets of vectors; replaces repeated pairwise computation and is much faster'''
    def similarity_cosine_matrix(self, vectors1, vectors2):
        cos1 = np.tensordot(vectors1, vectors2, axes=(1, 1))
        cos2 = np.tensordot(vectors2, vectors1, axes=(1, 1))
        # 2-norm: ||x||_2 = (|x1|^2 + |x2|^2 + ... + |xn|^2)^(1/2)
        cos21 = linalg.norm(vectors1, axis=1)
        # cos21 = np.sqrt(sum(vector1**2))
        cos22 = linalg.norm(vectors2, axis=1)
        # cos22 = np.sqrt(sum(vector2**2))
        score_wds1= np.divide(cos1, (np.outer(cos21, cos22)))
        score_wds2 = np.divide(cos2, (np.outer(cos22, cos21)))
        # Weighted average
        similarity1 = np.average(np.max(score_wds1, axis=1), axis=0)
        similarity2 = np.average(np.max(score_wds2, axis=1), axis=0)
        similarity=(similarity1+similarity2)/2

        if str(similarity) == 'nan':
            return 0.0
        else:
            return similarity
    # Avoid re-encoding the target sentence by using a vector dict directly; sent1 is the knowledge text, sent2 the target text, vecs the keyword vectors of the target
    def distance_words_vecs(self, sent1_keywords, sent2, vecs, word='', concept='',weightkeys=[],att=[],geo=[]):
        # TODO: could also analyse modifier-head phrases here and compare the modifier against the concept
        # Simplification: if the sentence contains the concept we can conclude directly
        concept_keys = []
        if concept != '':
            concept_data=self.kg_dict.get(concept)
            # Geographic weighting (simplified); works well for international figures, domestic figures are not handled
            if concept_data.get('国籍') and len(geo)>0:
                concept_keys.append(concept_data.get('国籍'))
            # Tag weighting
            # if concept_data.get('tags'):

            pattern = r',|\.|/|;|\'|`|\[|\]|<|>|\?|:|"|\{|\}|\~|!|@|#|\$|%|\^|&|\(|\)|-|=|\_|\+|,|。|、|;|‘|’|【|】|·|!| |…|(|)'
            concept_list = re.split(pattern, concept)
            for concept_sub in concept_list:
                if concept_sub.strip() == '': continue
                # score=self.similarity_cosine(self.get_wordvector(concept_sub), self.get_wordvector(word2))
                sm = difflib.SequenceMatcher(None, concept_sub, sent2)
                maxlen = sm.find_longest_match(0, len(concept_sub), 0, len(sent2)).size
                # if maxlen/ len(concept_sub) ==1:
                #     # exact match
                #     return 1
                if maxlen / len(concept_sub) > 0.75:
                    concept_keys.append(concept_sub)#maxlen / len(concept_sub))
                    concept_keys.append(concept_sub)
                    concept_keys.append(concept_sub)

                if len(att)>0:
                    for att_ in att:
                        sm_ = difflib.SequenceMatcher(None, concept_sub, att_)
                        blocks= sm_.get_matching_blocks()
                        maxlen_=sum([b.size for b in blocks])
                                # maxlen_   (0, len(concept_sub), 0, len(att_)).size
                        if maxlen_/len(concept_sub)==1:
                            return 1
                        if maxlen / len(concept_sub) > 0.75 or maxlen / len(att_)==1:
                            concept_keys.append(att_)
                            concept_keys.append(att_)
                            concept_keys.append(att_)

                    concept_keys.append(concept_sub)
        # modi
        from scipy.spatial.distance import cosine
        # sent1 = sent1.replace('...', '').replace(word, '')
        # if sent1.strip() == '': return 0
        # wds1 = self.extract_keywords(sent1 + ' ' + concept)
        # Possible improvement: weight the title if the input sentence has one, and weight place names if present
        wds1 = sent1_keywords
        wds1 = wds1+concept_keys+concept_keys
        # wds2 = self.extract_keywords(sent2)
        # pprint.pprint(wds1)
        # pprint.pprint(wds2)
        score_wds1 = []
        score_wds2 = []
        sim_score = 0
        # t = time.time()
        sent1_vectors=self.get_wordsvectors(wds1)
        # Additional weights
        weightkeys_vector=[]
        for key in weightkeys:
            if key in wds1:
                i=wds1.index(key)
                weightkeys_vector.append(sent1_vectors[i])
        if len(weightkeys_vector)>0:
            sent1_vectors = np.append(sent1_vectors ,weightkeys_vector,axis=0)
            vecs =  np.append(vecs , weightkeys_vector,axis=0)
        # If the Baike sentence has too few keywords the ratio is inflated; what if the lengths matched?
        # logging.info('%s vecs length:%d,sentv length:%d' % (concept,len(vecs),len(sent1_vectors)))
        try:
            # Replacement algorithm (matrix-based similarity)
            sim_score=self.similarity_cosine_matrix(sent1_vectors,vecs)

            # for word1 in wds1:  # This mode finds, for each word, the closest word in the other set, averages those maxima, then repeats in reverse. Not a great algorithm.
            #     score = max(
            #         [self.similarity_cosine(self.get_wordvector(word1),vec) for vec in vecs.values()])
            #     score_wds1.append(score)
            # for word2 in vecs:
            #     score = max(
            #         [self.similarity_cosine(vecs.get(word2), self.get_wordvector(word1)) for word1 in wds1])
            #     score_wds2.append(score)
            # sum/len is used here; it really should be smoothed, and exact (max) matches should get extra weight
            # With too few keywords the results may be rather poor
            # Modifier-head matches outrank plain vector similarity; the corner case where everything ties is handled later
            # score_wds1=score_wds1+[s for s in score_wds1 if s>=1]
            # score_wds2 = score_wds2 + [s for s in score_wds2 if s >= 1]
            # sim_score = max(sum(score_wds1) / len(wds1), sum(score_wds2) / len(vecs))
        except:
            sim_score = 0

        # logging.info(concept + 'distance_words cost:' + str(time.time() - t))
        # t = time.time()

        # if len(scores)>0:
        #     sim_score=len(scores)

        return sim_score

    # def _get_maybe_error_index(self, scores, ratio=0.6745, threshold=1.4):
    #     """
    #     Locate likely typo positions via median absolute deviation (MAD)
    #     :param scores: np.array
    #     :param threshold: the smaller the threshold, the more suspected typos are returned
    #     :return:
    #     """
    #     scores = np.array(scores)
    #     if len(scores.shape) == 1:
    #         scores = scores[:, None]
    #     median = np.median(scores, axis=0)  # get median of all scores
    #     margin_median = np.sqrt(np.sum((scores - median) ** 2, axis=-1))  # deviation from the median
    #     # median absolute deviation value
    #     med_abs_deviation = np.median(margin_median)
    #     if med_abs_deviation == 0:
    #         return []
    #     y_score = ratio * margin_median / med_abs_deviation
    #     # flatten
    #     scores = scores.flatten()
    #     maybe_error_indices = np.where((y_score > threshold) & (scores < median))
    #     # collect the indices of all suspected typos
    #     return list(maybe_error_indices[0])

    '''Compute sentence similarity from word-level similarity'''
    def distance_words(self, sent1, sent2, word='', concept=''):
        # TODO: could also analyse modifier-head phrases here and compare the modifier against the concept
        # Simplification: if the sentence contains the concept we can conclude directly
        concept_list=[]
        if concept!='':

            pattern = r',|\.|/|;|\'|`|\[|\]|<|>|\?|:|"|\{|\}|\~|!|@|#|\$|%|\^|&|\(|\)|-|=|\_|\+|,|。|、|;|‘|’|【|】|·|!| |…|(|)'
            concept_list = re.split(pattern, concept)
            for concept_sub in concept_list:
                if concept_sub.strip()=='':continue
                # score=self.similarity_cosine(self.get_wordvector(concept_sub), self.get_wordvector(word2))
                sm = difflib.SequenceMatcher(None, concept, sent2)
                maxlen = sm.find_longest_match(0, len(concept), 0, len(sent2)).size
                if maxlen/len(concept_sub)>0.75:
                    return maxlen/len(concept_sub)
        # modi
        sent1 = sent1.replace('...', '').replace(word, '')
        if sent1.strip()=='': return 0
        wds1 = self.extract_keywords(sent1+' '+concept)
        wds2 = self.extract_keywords(sent2)
        # pprint.pprint(wds1)
        # pprint.pprint(wds2)
        score_wds1 = []
        score_wds2 = []
        sim_score = 0
        try:
            for word1 in wds1:  # For each word, find the closest word in the other set, average those maxima, then repeat in reverse. Not a great algorithm.
                score = max([self.similarity_cosine(self.get_wordvector(word1), self.get_wordvector(word2)) for word2 in wds2])
                score_wds1.append(score)
            for word2 in wds2:
                score = max([self.similarity_cosine(self.get_wordvector(word2), self.get_wordvector(word1)) for word1 in wds1])
                score_wds2.append(score)

            sim_score = max(sum(score_wds1)/len(wds1), sum(score_wds2)/len(wds2))
        except:
            sim_score=0
        return sim_score

    'Align the user input sentence to one of the concepts'
    def detect_main(self, sent, word,att=[],geo=[]):
        pprint.pprint(word)
        pprint.pprint(att)
        pprint.pprint(geo)
        if att==[''] :
            att=[]
        sent = sent.replace(word, '')
        concept_dict = self.collect_concepts(word)
        # sent_vector = self.rep_sentencevector(sent)  # This way of building a sentence vector is too crude and does not help clustering
        concept_scores_sent = {}
        concept_scores_wds = {}

        keys = self.extract_keywords(sent)

        # keysdict=dict()
        # for key in keys:
        #     vec=self.get_wordvector(key)
        #     keysdict[key]=vec

        # for att_ in att:
        #     if att_!='':
        #         vec = self.get_wordvector(att_)
        #         keysdict[att_] = vec
        # for geo_ in geo:
        #     if geo_!='':
        #         vec = self.get_wordvector(geo_)
        #         keysdict[geo_] = vec
        # w=[]
        # if len(att) > 0:
        #     w=w+att
        # if len(geo) >0:
        #    w=w+list(geo)
        # if len(w)>0:
        #     keys=w
        keys = keys + att + list(geo)+ att + list(geo)
        pprint.pprint(keys)
        while '' in keys:
            keys.remove('')
        keys_vectors = self.get_wordsvectors(keys)

        for concept, keywords in concept_dict.items():
            if len(concept_dict) == 1:
                concept_scores_sent[concept]=1
                concept_scores_wds[concept]=1
                break
            # The sentence-vector mode does not work that well; clustering from a single sentence is too biased
            # try:
            #     concept_vector = self.rep_sentencevector(self.kg_dict[concept]['desc'][0])  # vectorize the concept description and compare it with the input sentence vector
            #     similarity_sent = self.similarity_cosine(sent_vector, concept_vector)
            #     concept_scores_sent[concept] = similarity_sent
            # except:
            concept_scores_sent[concept]=0
            # The word-vector mode works reasonably well
            # similarity_wds = self.distance_words(desc, sent, word, concept)
            # Weighting of shared items was moved here; it may actually slow things down
            # keywords=keywords+list(set(keywords).intersection(set(keys)))
            # ATT (attribute) analysis
            geo_=[]
            for att_ in att:
                desc_seg = [[i.word, i.flag] for i in pseg.cut(att_)]
                concepts_candi = [i[0] for i in desc_seg if i[1][0] in ['ns']]
                if len(concepts_candi)>0:
                    geo_=geo_+concepts_candi
            if len(geo_)>0:
                geo=geo_
            similarity_wds = self.distance_words_vecs(keywords, sent, keys_vectors, word, concept,list(set(keywords).intersection(set(keys))),att,geo)

            concept_scores_wds[concept] = similarity_wds

        concept_scores_sent = sorted(concept_scores_sent.items(), key=lambda asd:asd[1],reverse=True)
        concept_scores_wds = sorted(concept_scores_wds.items(), key=lambda asd:asd[1],reverse=True)
        pprint.pprint(concept_scores_wds)

        # Only the top three are returned; if a single answer is enough, taking the max would do
        return concept_scores_sent[:3], concept_scores_wds[:3]
    def save_cache(self):
        '''
        After processing finishes, persist the current state to speed up future responses.
        Currently the state lives in variables; Redis could take over later.
        '''
        # Word vectors (generated by BERT)
        if self.redis:
            pass
        else:
            fou = open('./mod/word_dict.bin', 'wb')
            pickle.dump(self.word_dict, fou)
            fou.close()
            # Cached Baidu person page contents
            fou = open('./mod/baidu_cache.bin', 'wb')
            pickle.dump(self.baidu_cache, fou)
            fou.close()
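The class above omits its imports. Judging from the names it uses (np, linalg, redis, pickle, etree, pseg, anse, request, parse, difflib, re, pprint, logging, BertClient), a plausible header would be the following; the jieba aliases in particular are assumptions:

import re
import difflib
import logging
import pickle
import pprint

import numpy as np
from numpy import linalg
import redis
from urllib import request, parse
from lxml import etree
import jieba.posseg as pseg
import jieba.analyse as anse  # assumed alias for the `anse.extract_tags` calls above
from bert_serving.client import BertClient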
Example #12
Y = train_df['is_duplicate']
X_train, X_validation, Y_train, Y_validation = train_test_split(X,
                                                                Y,
                                                                test_size=0.2)
# print(X_train.head())

X = X_train[['question1', 'question2']]
X_v = X_validation[['question1', 'question2']]

print("loaded training and validation data")
Y_train = Y_train

Y_validation = Y_validation

#train_question1
bc = BertClient()
BERT_train_question1 = []
k = 0

for x in X['question1']:

    f = bc.encode([x])

    f = tf.convert_to_tensor(f)
    # print("after",f)

    BERT_train_question1.append(f[0])
    k = k + 1
    print("BERT_train_question1: point ", k)

BERT_train_question1 = tf.stack(BERT_train_question1)
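Encoding one question per call means one round trip to the BERT server per row. Since bc.encode accepts a list of sentences, the whole column could be encoded at once; a minimal sketch reusing the bc, X and tf objects from the snippet above:

# Encode all of question1 in a single call and convert to a tensor of shape (n_rows, 768).
texts = [str(q) for q in X['question1']]
BERT_train_question1 = tf.convert_to_tensor(bc.encode(texts))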
Example #13
from bert_serving.client import BertClient
from load_data import test_df, train_df, train_labels, train_texts, test_labels, test_texts
import numpy as np
from keras.utils import to_categorical
from keras.models import Model, Sequential
from keras.optimizers import Adam
from keras.layers import Input, BatchNormalization, Dense
import pandas as pd
import os
import tensorflow as tf


# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
""" create vector """
bc = BertClient()
x_test = np.array(bc.encode(test_texts))
x_train = np.array(bc.encode(train_texts))

np.savetxt("x_test_vec_fine_5.txt", x_test, delimiter=',')
print(x_test)

np.savetxt("x_train_vec_fine_5.txt", x_train, delimiter=',')
print(x_train)

y_train = np.array([vec for vec in train_df['label']])
np.savetxt("y_train.txt", y_train, delimiter=',')
print("y_train complete")


""" balance labels """
# count = np.zeros(3)
Example #14
 print(year+month+day)
 if str(year+month+day) in y:
     print('we have '+year+month+day)
     continue
 try:
     locate = './../../webScrapying/CNA/DATA/'+year+'/'+year+'_'+month+'/'+year+month+day+'.csv'
     datalist = getParagragh(locate)
 except:
     continue
 
 chunhua = int(single_date.strftime("%Y")) - 1911
 context = pythex( datalist , str(chunhua)+month+day ) 
 # Regular expression part
 # BERT encoding
 filelist = []
 bc = BertClient()
 i = 0
 
 for article in context:
     if i == 0 :
         i += 1
         continue
     try:
         context_bert = bc.encode(article[2])
     except:
         context_bert = []
     topic_bert = bc.encode([article[0]])
     if len(context_bert) == 0:
         bert_news = topic_bert
     else:
         bert_news = np.append(topic_bert,context_bert,axis=0)
Example #15
 def __init__(self):
     self.bc = BertClient()
Example #16
X_train = pad_sequences(X_train, maxlen=maxlen, padding='post')
print("完成!")
print(X_train)

import copy

small_word_index = copy.deepcopy(word_index)  # keep the original dict from being modified
x = list(t.word_counts.items())
s = sorted(x, key=lambda p:p[1], reverse=True)
print("移除word_index字典中的低频词...")
for item in s[10000:]:
    small_word_index.pop(item[0])  # pop the low-frequency word from the dict
print("完成!")

from bert_serving.client import BertClient
bc = BertClient()
# Initialize a random embedding matrix

embedding_matrix = np.random.uniform(size=(vocab_size+1,768))
print("构建embedding_matrix...")
for word, index in small_word_index.items():
    try:
        word_vector = bc.encode([word])
        embedding_matrix[index] = word_vector
        # print("Word: [", index, "]")
    except:
        print("Word: [",word,"] not in wvmodel! Use random embedding instead.")
print("完成!")
print("Embedding matrix shape:\n",embedding_matrix.shape)

from keras.models import Sequential, Model
Example #17
import pandas as pd
import numpy as np
import os
import json
from bert_serving.client import BertClient

save_dir = r"R:\custom"

bert = BertClient(check_length=False)
embed_size = len(bert.encode(["test"])[0])

bert_str = "_BERT_encoded_cased_" + str(embed_size)

train_set_path = r"R:\custom\custom_training_set.csv"

test_sets_dir = r"R:\custom\custom_test_set"
dataset_paths = [
    os.path.join(test_sets_dir, fname) for fname in os.listdir(test_sets_dir)
]


def embed_dataset(path):
    with open(path, mode="r", encoding="utf-8") as f:
        df = pd.read_csv(f)
        X = bert.encode(df["input"].tolist()).tolist()
        y = df["label"].tolist()

    return [{"input": i, "label": l} for i, l in zip(X, y)]


# Embed training set
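A hypothetical continuation of this script (not shown in the excerpt) could embed each dataset defined above and write the result next to the original file:

# Hypothetical: encode the training set and every test set, then dump to JSON.
for path in [train_set_path] + dataset_paths:
    encoded = embed_dataset(path)
    out_path = os.path.splitext(path)[0] + bert_str + ".json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(encoded, f)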
Example #18
def run(args):
    """""" """""" """""" """""" """""" """
    
    Set up
    
    """ """""" """""" """""" """""" """"""

    # topic = "Georgetown_University"
    topic = args.topic
    actor_lr = args.actor_LR
    critic_lr = args.critic_LR
    episodes = args.eps

    topic = "Georgetown_University"
    actor_lr = 0.0001
    critic_lr = 0.001
    episodes = 10

    buffer = classes.ReplayBuffer()
    wiki_wiki = wikipediaapi.Wikipedia('en')
    rs = relation_standardizer.Relation_Standardizer("Bob")

    # TODO: Future add code to let a person choose another topic

    G_augment, cleaned_tuples, topic_graph, encoded_tuples = load_GU_Data()
    page_list = GU_pages()
    buffer.relations = encoded_tuples
    individual_pages, Topic_Dict = get_pages(page_list, wiki_wiki)

    n_features = 768  # Default used in BERT
    n_output = 25  # Num possible relations. Currently set to 25
    actor_H1 = 768  # Num of hidden units in first layer of actor
    actor_H2 = 768  # Num of hidden units in second layer of actor
    critic_H1 = 768  # Num of hidden units in first layer of critic
    critic_H2 = 768  # Num of hidden units in second layer of critic

    # TensorFlow Setup and Initialization
    tf.reset_default_graph()

    actor = classes.Actor(n_features, n_output, actor_lr, actor_H1, actor_H2)
    critic = classes.Critic(n_features, critic_lr, critic_H1, critic_H2)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # BERT Setup
    print(
        "In a terminal window Python Environment can access, run the following:\n"
        +
        "bert-serving-start -model_dir ~/Final_Proj/BERT-Data/ -num_worker=2\n\nPress Enter when done."
    )
    x = input()

    from bert_serving.client import BertClient
    bc = BertClient()

    # Core-NLP Setup
    print(
        "In a terminal window run the following:\ncd ~/Final_Proj/Stan-Core-NLP; java -mx6g -cp \"*\" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000\n\nPress Enter when done."
    )
    x = input()

    nlp = StanfordCoreNLP('http://localhost:9000')

    current_node = topic
    """""" """""" """""" """""" """""" """
    
    Running Episodes
    
    """ """""" """""" """""" """""" """"""

    for episode in range(episodes):

        relations = []
        probs = []
        chosen = []
        rewards = []
        states = []
        pred_rewards = []
        td_error = []

        # Run the Training Routine for the Critic
        training_sample = buffer.sample(20)

        # sample = training_sample[0]

        # Use the Actor to determine the predicted relation for a state
        for sample in training_sample:
            relations.append(rs.relation_to_int(sample[0]))
            states.append(sample[1])
            probs.append(
                sess.run(actor.layer3,
                         feed_dict={actor.observation: sample[1]}))

        # Formatting the probabilities to make them easier to use
        for prob in probs:
            prob_list = prob[0].tolist()
            chosen.append(prob_list.index(max(prob_list)))

        # Determine reward from the environment
        for actual, pred in zip(relations, chosen):
            if actual == pred:
                rewards.append(1.0)
            else:
                rewards.append(0.0)

        # Training the Critic
        loss, _ = sess.run(
            [critic.loss, critic.train],
            feed_dict={
                critic.observation: np.reshape(states, (-1, 768)),
                critic.reward: np.reshape(rewards, (-1, 1))
            })
        print("Training loss for critic is: " + str(loss))

        ######
        # Exploration code
        ######

        # Run the links available for the current node through the critic to get lowest mean
        # The std is also included if we want to include a LCB version later.
        node_predictions = determine_node_knowledge(current_node, G_augment,
                                                    sess, critic)

        # Filter out nan entries & sort
        filtered = [x for x in node_predictions if not math.isnan(x[1])]
        filtered.sort(key=lambda x: x[1])

        # Determine the next node to go to
        for node in filtered:
            if node[0] not in individual_pages:
                current_node = node[0]
                individual_pages.append(current_node)

                break

        # Explore page
        clean_tuples, encodes = explore_new_page(current_node, bc, nlp,
                                                 wiki_wiki, G_augment)

        # Add encoded tuples to the replay buffer
        buffer.relations += encodes

        relations = []
        states = []
        chosen = []
        probs = []
        questions = []

        # Gather info for training the Actor
        for encode in encodes:
            relations.append(rs.relation_to_int(encode[0]))
            states.append(encode[1])
            probs.append(
                sess.run(actor.layer3,
                         feed_dict={actor.observation: encode[1]}))

        # Predict the rewards for the new relations from the page.
        # Runs it through the critic and then flattens it.
        pred_rewards = sess.run(
            critic.layer3,
            feed_dict={critic.observation: np.reshape(states, (-1, 768))})
        pred_rewards = [item for sublist in pred_rewards for item in sublist]
        '''
        # MOVE CRITIC PRED HERE and DETERMINE
        pred_rewards = sess.run(critic.layer3, feed_dict = {critic.observation: np.reshape(states,(-1, 768))})
        pred_rewards = [item for sublist in pred_rewards for item in sublist]
        '''
        '''
        # Determine reward from the environment       
        for actual, pred in zip(relations, chosen):
            if actual == pred:
                rewards.append(1.0)
            else:
                rewards.append(0.0)
        '''

        # td_error = [(p - r) for p, r in zip(pred_rewards, rewards)]

        s = states[1]
        r = relations[1]
        p = pred_rewards[1]

        # Train the Actor on the downloaded items.
        for s, r, p, clean in zip(states, relations, pred_rewards,
                                  clean_tuples):

            # print(str(s) + " " + str(r) + " " + str(p))

            actor_prob = sess.run(actor.layer3,
                                  feed_dict={actor.observation: s})
            # print (actor_prob)
            actor_prob = actor_prob[0].tolist()
            chosen = actor_prob.index(max(actor_prob))

            # I need to come back and make sure this is correct
            reward = -p
            if chosen == r:
                reward = 1 + reward
            else:
                questions.append("Actual:    " + clean[0] + " | " + clean[1] +
                                 " | " + clean[2] + "\nPredicted: " +
                                 clean[0] + " | " +
                                 str(rs.int_to_relation(chosen)) + " | " +
                                 clean[2])

            loss, log_prob, _ = sess.run(
                [actor.loss, actor.log_probability, actor.layer3],
                feed_dict={
                    actor.observation: s,
                    actor.td_error: reward,
                    actor.relation: r
                })
Example #19
def create_query_vector(query):
    return BertClient().encode([query])[0]
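Constructing a new BertClient on every call opens a fresh connection each time; reusing a single client is cheaper. A minimal variant:

from bert_serving.client import BertClient

_bc = BertClient()  # one shared connection instead of one per query


def create_query_vector(query):
    return _bc.encode([query])[0]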
Example #20
class Baseline:
    def __init__(self, flags):
        self.lr = flags.lr
        self.sen_len = flags.sen_len
        self.pre_embed = flags.pre_embed
        self.pos_limit = flags.pos_limit
        self.pos_dim = flags.pos_dim
        self.window = flags.window
        self.word_dim = flags.word_dim
        self.hidden_dim = flags.hidden_dim
        self.postagger_dim = flags.postagger_dim
        self.batch_size = flags.batch_size
        self.data_path = flags.data_path
        self.model_path = flags.model_path
        self.encoder = flags.encoder
        self.mode = flags.mode
        self.epochs = flags.epochs
        self.dropout = flags.dropout
        self.bag_threshold = flags.bag_threshold
        self.word_frequency = flags.word_frequency
        self.postaggerMap = {}
        # any level other than 'sent' falls back to bag-level training
        self.bag = flags.level != 'sent'

        if flags.embed_bert:
            self.embed_bert = True
            self.word_dim = 768
            self.bert = BertClient(ip='localhost',
                                   check_version=False,
                                   check_length=False)
        else:
            self.embed_bert = False

        self.relation2id = self.load_relation()
        self.num_classes = len(self.relation2id)

        self.pos_num = 2 * self.pos_limit + 3
        if self.pre_embed:
            self.wordMap, word_embed = self.load_wordVec()
            self.word_embedding = tf.compat.v1.get_variable(
                initializer=word_embed, name='word_embedding', trainable=False)

        elif self.embed_bert and self.mode == 'train':
            self.wordMap, word_embed = self.bert_wordMap()
            self.word_embedding = tf.compat.v1.get_variable(
                initializer=word_embed, name='word_embedding', trainable=False)
        elif self.embed_bert and self.mode == 'test':
            self.wordMap, word_embed = self.load_bert_word2vec()
            self.word_embedding = tf.compat.v1.get_variable(
                initializer=word_embed, name='word_embedding', trainable=False)
        else:
            self.wordMap = self.load_wordMap()
            self.word_embedding = tf.compat.v1.get_variable(
                shape=[len(self.wordMap), self.word_dim],
                name='word_embedding',
                trainable=True)

        self.pos_e1_embedding = tf.compat.v1.get_variable(
            name='pos_e1_embedding', shape=[self.pos_num, self.pos_dim])
        self.pos_e2_embedding = tf.compat.v1.get_variable(
            name='pos_e2_embedding', shape=[self.pos_num, self.pos_dim])
        self.pos_postagger_embedding = tf.compat.v1.get_variable(
            name='pos_postagger_embedding', shape=[29, self.postagger_dim])

        if self.encoder == 'pcnn':
            self.relation_embedding = tf.compat.v1.get_variable(
                name='relation_embedding',
                shape=[self.hidden_dim * 3, self.num_classes])
        elif self.encoder == 'birnn':

            self.e1_w = tf.compat.v1.get_variable(
                name='e1_conv', shape=[self.window, self.hidden_dim * 2, 1])
            self.e2_w = tf.compat.v1.get_variable(
                name='e2_conv', shape=[self.window, self.hidden_dim * 2, 1])
            self.text_w = tf.compat.v1.get_variable(
                name='text_conv', shape=[self.window, self.hidden_dim * 2, 1])
            self.relation_embedding = tf.compat.v1.get_variable(
                name='relation_embedding',
                shape=[self.hidden_dim * 2, self.num_classes])
            self.att_weight = tf.compat.v1.get_variable(
                name='att_weight',
                shape=[self.batch_size, 1, self.hidden_dim * 2])
        else:
            self.relation_embedding = tf.compat.v1.get_variable(
                name='relation_embedding',
                shape=[self.hidden_dim, self.num_classes])
        self.relation_embedding_b = tf.compat.v1.get_variable(
            name='relation_embedding_b', shape=[self.num_classes])
        if self.encoder == 'cnn':
            self.sentence_reps = self.cnn()
        elif self.encoder == 'pcnn':
            self.sentence_reps = self.pcnn()
        elif self.encoder == 'rnn':
            self.sentence_reps = self.rnn()
        elif self.encoder == 'birnn':
            self.sentence_reps = self.bi_rnn()
        else:
            self.sentence_reps = self.Transformer()

        if self.bag:
            self.bag_level()
        else:
            self.sentence_level()
        self._classifier_train_op = tf.compat.v1.train.AdamOptimizer(
            self.lr).minimize(self.classifier_loss)

    def pos_index(self, x):
        if x < -self.pos_limit:
            return 0
        if x >= -self.pos_limit and x <= self.pos_limit:
            return x + self.pos_limit + 1
        if x > self.pos_limit:
            return 2 * self.pos_limit + 2
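    # Worked example (hypothetical pos_limit = 2): relative positions
    #   -4 -3 -2 -1  0  1  2  3  4
    # are bucketed as
    #    0  0  1  2  3  4  5  6  6
    # i.e. offsets beyond +/- pos_limit are clipped into the two boundary buckets,
    # giving 2 * pos_limit + 3 distinct position ids (matching self.pos_num).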

    def load_wordVec(self):
        wordMap = {}
        wordMap['PAD'] = len(wordMap)
        wordMap['UNK'] = len(wordMap)
        word_embed = []
        for line in open(os.path.join(self.data_path, 'word2vec.txt')):
            content = line.strip().split()
            if len(content) != self.word_dim + 1:
                continue
            wordMap[content[0]] = len(wordMap)
            word_embed.append(np.asarray(content[1:], dtype=np.float32))

        word_embed = np.stack(word_embed)
        embed_mean, embed_std = word_embed.mean(), word_embed.std()

        pad_embed = np.random.normal(embed_mean, embed_std, (2, self.word_dim))
        word_embed = np.concatenate((pad_embed, word_embed), axis=0)
        word_embed = word_embed.astype(np.float32)
        return wordMap, word_embed

    def load_bert_word2vec(self):
        time_str = datetime.datetime.now().isoformat()
        tempstr = "{}:{}".format(time_str, str('加载bert sentence2vec'))
        print(tempstr)
        ori_word_vec = json.load(
            open(os.path.join(self.data_path, 'bert_word2vec.json'), "r"))
        word_embed = np.zeros((len(ori_word_vec), self.word_dim),
                              dtype=np.float32)
        wordMap = {}
        for cur_id, word in enumerate(ori_word_vec):
            w = word['word']
            wordMap[w] = cur_id
            word_embed[cur_id, :] = word['vec']
        time_str = datetime.datetime.now().isoformat()
        tempstr = "{}:{}".format(time_str, str('加载bert sentence2vec完成'))
        print(tempstr)
        return wordMap, word_embed

    def load_wordMap(self):
        wordMap = {}
        wordMap['PAD'] = len(wordMap)
        wordMap['UNK'] = len(wordMap)
        all_content = []
        for line in open(os.path.join(self.data_path, 'sent_train.txt')):
            all_content += line.strip().split('\t')[3].split()
        for item in Counter(all_content).most_common():
            if item[1] > self.word_frequency:
                wordMap[item[0]] = len(wordMap)
            else:
                break
        return wordMap

    def bert_wordMap(self):
        wordMap = {}
        all_content = []
        all_content.append('PAD')
        time_str = datetime.datetime.now().isoformat()
        tempstr = "{}:{}".format(time_str, str('加载语料库'))
        print(tempstr)
        for line in open(os.path.join(self.data_path, 'sent_train.txt')):
            all_content += line.strip().split('\t')[3].split()
        all_content = list(set(all_content))
        for line in open(os.path.join(self.data_path, 'sent_test.txt')):
            all_content += line.strip().split('\t')[3].split()
        all_content = list(set(all_content))
        for line in open(os.path.join(self.data_path, 'sent_dev.txt')):
            all_content += line.strip().split('\t')[3].split()
        all_content = list(set(all_content))
        wordMap = dict(zip(all_content, range(len(all_content))))
        time_str = datetime.datetime.now().isoformat()
        tempstr = "{}:{}".format(
            time_str,
            str('语料库加载完成,提取词向量中,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,'
                ))
        print(tempstr)
        word_embed = self.bert.encode(all_content)
        time_str = datetime.datetime.now().isoformat()
        tempstr = "{}:{}".format(time_str, str('提取词向量完成'))
        print(tempstr)

        # save the word2vec extracted from the BERT model
        # format: [{"word": "我的", "vec": [1, 2, 3]}, {"word": "中国", "vec": [4, 5, 6]}, {"word": "使得", "vec": [2, 4, 5]}]
        print('saving bert word2vec to a json file')
        word2vec_list = []
        for word, vec in zip(all_content, word_embed):
            word2vec_dict = {}
            word2vec_dict['word'] = word
            word2vec_dict['vec'] = vec
            word2vec_list.append(word2vec_dict)
        filew = open(os.path.join(self.data_path, 'bert_word2vec.json'),
                     'w',
                     encoding='utf-8')
        json.dump(word2vec_list, filew, cls=NpEncoder, ensure_ascii=False)
        filew.close()
        time_str = datetime.datetime.now().isoformat()
        tempstr = "{}:{}".format(time_str, str('保存完成'))
        print(tempstr)

        # reuse the vectors encoded above, cast to float32
        word_embed = np.array(word_embed, np.float32)
        return wordMap, word_embed

    def load_relation(self):
        relation2id = {}
        for line in open(os.path.join(self.data_path, 'relation2id.txt')):
            relation, id_ = line.strip().split()
            relation2id[relation] = int(id_)
        return relation2id

    def load_sent(self, filename):
        sentence_dict = {}
        with open(os.path.join(self.data_path, filename), 'r') as fr:
            for line in fr:
                id_, en1, en2, sentence = line.strip().split('\t')
                sentence = sentence.split()
                en1_pos = 0
                en2_pos = 0
                for i in range(len(sentence)):
                    if sentence[i] == en1:
                        en1_pos = i
                        sentence[i] = 'Mask'
                    if sentence[i] == en2:
                        en2_pos = i
                        sentence[i] = 'Mask'
                words = []
                postagger_id = []
                pos1 = []
                pos2 = []
                segment = []
                mask = []
                pos_min = min(en1_pos, en2_pos)
                pos_max = max(en1_pos, en2_pos)
                length = min(self.sen_len, len(sentence))
                postags = postagger.postag(sentence)
                for i in range(length):
                    if self.embed_bert:
                        words.append(self.wordMap.get(sentence[i]))
                    else:
                        words.append(
                            self.wordMap.get(sentence[i], self.wordMap['UNK']))
                    if postags[i] in self.postaggerMap:
                        postagger_id.append(self.postaggerMap[postags[i]])
                    else:
                        self.postaggerMap[postags[i]] = len(self.postaggerMap)
                        postagger_id.append(self.postaggerMap[postags[i]])
                    if i == en1_pos:
                        segment.append(1)
                    elif i == en2_pos:
                        segment.append(-1)
                    else:
                        segment.append(0)
                    pos1.append(self.pos_index(i - en1_pos))
                    pos2.append(self.pos_index(i - en2_pos))
                    if i <= pos_min:
                        mask.append(1)
                    elif i <= pos_max:
                        mask.append(2)
                    else:
                        mask.append(3)

                if length < self.sen_len:
                    for i in range(length, self.sen_len):
                        words.append(self.wordMap['PAD'])
                        pos1.append(self.pos_index(i - en1_pos))
                        pos2.append(self.pos_index(i - en2_pos))
                        mask.append(0)
                        postagger_id.append(28)
                        segment.append(0)
                sentence_dict[id_] = np.reshape(
                    np.asarray(
                        [words, pos1, pos2, mask, postagger_id, segment],
                        dtype=np.int32), (1, 6, self.sen_len))
        return sentence_dict
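    # Channel layout of each sentence_dict entry (shape 1 x 6 x sen_len), matching the
    # feed slices used in run_train / run_dev / run_test below:
    #   [:, 0, :] word ids              -> input_word
    #   [:, 1, :] offsets to entity 1   -> input_pos_e1
    #   [:, 2, :] offsets to entity 2   -> input_pos_e2
    #   [:, 3, :] pcnn segment mask     -> mask (fed only when encoder == 'pcnn')
    #   [:, 4, :] POS-tag ids           -> input_postagger
    #   [:, 5, :] entity flags 1/-1/0   -> input_segment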

    def data_batcher(self,
                     sentence_dict,
                     filename,
                     padding=False,
                     shuffle=True):
        if self.bag:
            all_bags = []
            all_sents = []
            all_labels = []
            with open(os.path.join(self.data_path, filename), 'r') as fr:
                for line in fr:
                    rel = [0] * self.num_classes
                    try:
                        bag_id, _, _, sents, types = line.strip().split('\t')
                        type_list = types.split()
                        for tp in type_list:
                            # if a bag has multiple relations, we only consider non-NA relations
                            if len(type_list) > 1 and tp == '0':
                                continue
                            rel[int(tp)] = 1
                    except:
                        bag_id, _, _, sents = line.strip().split('\t')

                    sent_list = []
                    for sent in sents.split():
                        sent_list.append(sentence_dict[sent])

                    all_bags.append(bag_id)
                    all_sents.append(np.concatenate(sent_list, axis=0))
                    all_labels.append(np.asarray(rel, dtype=np.float32))

            self.data_size = len(all_bags)
            self.datas = all_bags
            data_order = list(range(self.data_size))

            if shuffle:
                np.random.shuffle(data_order)
            if padding:
                if self.data_size % self.batch_size != 0:
                    data_order += [data_order[-1]] * (
                        self.batch_size - self.data_size % self.batch_size)
            for i in range(len(data_order) // self.batch_size):
                total_sens = 0
                out_sents = []
                out_sent_nums = []
                out_labels = []
                for k in data_order[i * self.batch_size:(i + 1) *
                                    self.batch_size]:
                    out_sents.append(all_sents[k])
                    out_sent_nums.append(total_sens)
                    total_sens += all_sents[k].shape[0]
                    out_labels.append(all_labels[k])

                out_sents = np.concatenate(out_sents, axis=0)
                out_sent_nums.append(total_sens)
                out_sent_nums = np.asarray(out_sent_nums, dtype=np.int32)
                out_labels = np.stack(out_labels)

                yield out_sents, out_labels, out_sent_nums
        else:
            all_sent_ids = []
            all_sents = []
            all_labels = []
            with open(os.path.join(self.data_path, filename), 'r') as fr:
                for line in fr:
                    rel = [0] * self.num_classes
                    try:
                        sent_id, types = line.strip().split('\t')
                        type_list = types.split()
                        for tp in type_list:
                            # if a sentence has multiple relations, we only consider non-NA relations
                            if len(type_list) > 1 and tp == '0':
                                continue
                            rel[int(tp)] = 1
                    except:
                        sent_id = line.strip()

                    all_sent_ids.append(sent_id)
                    all_sents.append(sentence_dict[sent_id])

                    all_labels.append(
                        np.reshape(np.asarray(rel, dtype=np.float32),
                                   (-1, self.num_classes)))

            self.data_size = len(all_sent_ids)
            self.datas = all_sent_ids

            all_sents = np.concatenate(all_sents, axis=0)
            all_labels = np.concatenate(all_labels, axis=0)

            data_order = list(range(self.data_size))
            if shuffle:
                np.random.shuffle(data_order)
            if padding:
                if self.data_size % self.batch_size != 0:
                    data_order += [data_order[-1]] * (
                        self.batch_size - self.data_size % self.batch_size)

            for i in range(len(data_order) // self.batch_size):
                idx = data_order[i * self.batch_size:(i + 1) * self.batch_size]
                yield all_sents[idx], all_labels[idx], None

    def embedding(self):
        self.keep_prob = tf.compat.v1.placeholder(dtype=tf.float32,
                                                  name='keep_prob')
        self.input_word = tf.compat.v1.placeholder(dtype=tf.int32,
                                                   shape=[None, self.sen_len],
                                                   name='input_word')
        self.input_pos_e1 = tf.compat.v1.placeholder(
            dtype=tf.int32, shape=[None, self.sen_len], name='input_pos_e1')
        self.input_pos_e2 = tf.compat.v1.placeholder(
            dtype=tf.int32, shape=[None, self.sen_len], name='input_pos_e2')
        self.input_postagger = tf.compat.v1.placeholder(
            dtype=tf.int32, shape=[None, self.sen_len], name='input_postagger')
        self.input_segment = tf.compat.v1.placeholder(
            dtype=tf.int32, shape=[None, self.sen_len], name='input_segment')
        if self.encoder == 'pcnn':
            self.mask = tf.compat.v1.placeholder(dtype=tf.int32,
                                                 shape=[None, self.sen_len],
                                                 name='mask')
        self.input_label = tf.compat.v1.placeholder(
            dtype=tf.float32,
            shape=[None, self.num_classes],
            name='input_label')
        inputs_forward = tf.concat(axis=2, values=[tf.nn.embedding_lookup(self.word_embedding, self.input_word), \
                                                   tf.nn.embedding_lookup(self.pos_postagger_embedding, self.input_postagger), \
                                                   tf.nn.embedding_lookup(self.pos_e1_embedding, self.input_pos_e1), \
                                                   tf.nn.embedding_lookup(self.pos_e2_embedding, self.input_pos_e2)])
        #[batch_size,max_len,word_dim+postagger_dim+2*pos_dim]:[batch_size,60,410]
        inputs_forward = tf.add(inputs_forward,
                                tf.to_float(
                                    tf.expand_dims(self.input_segment, -1)),
                                name='inputs_forward')
        return inputs_forward

    def cnn(self):
        # [batch_size, self.sen_len, self.word_dim + 2*self.pos_dim + postagger_dim]; in bag mode: [377, 60, 410]
        inputs_forward = self.embedding()
        #[batch_size,self.sen_len,self.word_dim+2*self.pos_dim+postagger_dim,1]
        inputs_forward = tf.expand_dims(inputs_forward, -1)
        with tf.compat.v1.name_scope('conv-maxpool'):
            w = tf.compat.v1.get_variable(
                name='w',
                shape=[
                    self.window,
                    self.word_dim + 2 * self.pos_dim + self.postagger_dim, 1,
                    self.hidden_dim
                ])
            b = tf.compat.v1.get_variable(name='b', shape=[self.hidden_dim])
            conv = tf.nn.conv2d(inputs_forward,
                                w,
                                strides=[1, 1, 1, 1],
                                padding='VALID',
                                name='conv')
            #[batch_size,self.sen_len-self.window+1,1,self.hidden_dim]
            h = tf.nn.bias_add(conv, b)
            pooled = tf.nn.max_pool(
                h,
                ksize=[1, self.sen_len - self.window + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID',
                name='pool')
        sen_reps = tf.tanh(tf.reshape(pooled, [-1, self.hidden_dim]))
        sen_reps = tf.nn.dropout(sen_reps, self.keep_prob)
        return sen_reps

    def pcnn(self):
        mask_embedding = tf.constant(
            [[0, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]], dtype=np.float32)
        # [batch_size, self.sen_len, self.word_dim + 2*self.pos_dim]; in bag mode: [377, 60, 310]
        inputs_forward = self.embedding()
        #[batch_size,self.sen_len,self.hidden_dim]
        conv = tf.layers.conv1d(
            inputs=inputs_forward,
            filters=self.hidden_dim,
            kernel_size=3,
            strides=1,
            padding='same',
            kernel_initializer=tf.contrib.layers.xavier_initializer())

        #self.mask:[batch_size,self.sen_len] mask:[batch_size,self.sen_len,3]
        mask = tf.nn.embedding_lookup(mask_embedding, self.mask)
        #tf.expand_dims(mask * 100, 2):[batch_size,self.sen_len,1,3] tf.expand_dims(conv, 3):[batch_size,self.sen_len,self.hidden_dim,1]
        #sen_reps:[batch_size,self.hidden_dim,3]
        sen_reps = tf.reduce_max(
            tf.expand_dims(mask * 100, 2) + tf.expand_dims(conv, 3),
            axis=1) - 100
        sen_reps = tf.tanh(tf.reshape(sen_reps, [-1, self.hidden_dim * 3]))
        sen_reps = tf.nn.dropout(sen_reps, self.keep_prob)
        return sen_reps
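    # Toy check of the piecewise max pooling above (hypothetical numbers):
    #   one sentence of 5 tokens, hidden_dim = 2, conv = [[1,0],[2,1],[3,5],[0,2],[4,1]],
    #   mask ids [1, 1, 2, 3, 3] mark the segments before / between / after the two entities.
    #   Adding mask * 100 and taking the max over the length picks, per hidden unit, the
    #   maximum inside each segment: unit 0 -> [2, 3, 4], unit 1 -> [1, 5, 2];
    #   the reshape then yields a [batch_size, hidden_dim * 3] sentence representation.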

    def rnn_cell(self, cell_name='lstm'):
        if isinstance(cell_name, list) or isinstance(cell_name, tuple):
            if len(cell_name) == 1:
                return self.rnn_cell(cell_name[0])
            cells = [self.rnn_cell(c) for c in cell_name]
            return tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
        if cell_name.lower() == 'lstm':
            return tf.contrib.rnn.BasicLSTMCell(self.hidden_dim,
                                                state_is_tuple=True)
        elif cell_name.lower() == 'gru':
            return tf.contrib.rnn.GRUCell(self.hidden_dim)
        raise NotImplementedError

    def rnn(self, cell_name='lstm'):
        inputs_forward = self.embedding()
        inputs_forward = tf.nn.dropout(inputs_forward, self.keep_prob)
        cell = self.rnn_cell(cell_name)
        _, states = tf.nn.dynamic_rnn(cell,
                                      inputs_forward,
                                      sequence_length=[self.sen_len] *
                                      self.batch_size,
                                      dtype=tf.float32,
                                      scope='dynamic-rnn')
        if isinstance(states, tuple):
            states = states[0]
        return states

    def bi_rnn(self, cell_name='lstm'):
        #[batch_size,max_len,word_dim+postagger_dim+2*pos_dim]:[batch_size,60,410]
        inputs_forward = self.embedding()
        inputs_forward = tf.nn.dropout(inputs_forward, keep_prob=self.dropout)

        # first BiRNN layer
        fw_cell_1 = self.rnn_cell('gru')
        bw_cell_1 = self.rnn_cell('gru')
        #_, states = tf.nn.bidirectional_dynamic_rnn(fw_cell, bw_cell,inputs_forward, dtype=tf.float32, scope='dynamic-bi-rnn')
        #fw_states, bw_states = states
        #if isinstance(fw_states, tuple):
        #fw_states = fw_states[0]
        #bw_states = bw_states[0]
        outputs_1, _ = tf.nn.bidirectional_dynamic_rnn(
            fw_cell_1,
            bw_cell_1,
            inputs_forward,
            dtype=tf.float32,
            scope='dynamic-bi-rnn_1')
        output_fw_1, output_bw_1 = outputs_1
        #sen_reps:[batch_size,max_len,2*hidden]
        sen_reps_1 = tf.tanh(tf.concat([output_fw_1, output_bw_1], axis=2))
        sen_reps_1 = tf.nn.dropout(sen_reps_1, self.keep_prob)

        # second BiRNN layer
        fw_cell_2 = self.rnn_cell('gru')
        bw_cell_2 = self.rnn_cell('gru')
        #_, states = tf.nn.bidirectional_dynamic_rnn(fw_cell, bw_cell,inputs_forward, dtype=tf.float32, scope='dynamic-bi-rnn')
        #fw_states, bw_states = states
        #if isinstance(fw_states, tuple):
        #fw_states = fw_states[0]
        #bw_states = bw_states[0]
        outputs_2, _ = tf.nn.bidirectional_dynamic_rnn(
            fw_cell_2,
            bw_cell_2,
            sen_reps_1,
            dtype=tf.float32,
            scope='dynamic-bi-rnn_2')
        output_fw_2, output_bw_2 = outputs_2
        #sen_reps:[batch_size,max_len,2*hidden]
        sen_reps_2 = tf.tanh(tf.concat([output_fw_2, output_bw_2], axis=2))
        sen_reps_2 = tf.nn.dropout(sen_reps_2, self.keep_prob)

        sen_reps = tf.add(sen_reps_1, sen_reps_2)
        return sen_reps

    def MultiHeadAttention(self,
                           inputs_forward,
                           hidden_size,
                           multihead_num=5,
                           activation=None,
                           name='MultiHeadAttention_1',
                           score_mask=None,
                           output_mask=None):
        # MultiHeadAttention takes [batch, max_len, feature_size] and outputs [batch, max_len, hidden_size]
        with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
            # compute Q, K, V
            V = tf.layers.dense(inputs_forward,
                                units=hidden_size,
                                activation=activation,
                                use_bias=False)
            K = tf.layers.dense(inputs_forward,
                                units=hidden_size,
                                activation=activation,
                                use_bias=False)
            Q = tf.layers.dense(inputs_forward,
                                units=hidden_size,
                                activation=activation,
                                use_bias=False)

            # split Q, K, V into multi-head form
            V = tf.concat(tf.split(V, multihead_num, axis=-1), axis=0)
            K = tf.concat(tf.split(K, multihead_num, axis=-1), axis=0)
            Q = tf.concat(tf.split(Q, multihead_num, axis=-1), axis=0)

            # dot product of Q and K, then scale
            score = tf.matmul(Q, tf.transpose(K, [0, 2, 1])) / tf.sqrt(
                hidden_size / multihead_num)

            # mask
            if score_mask is not None:
                score *= score_mask
                score += ((score_mask - 1) * 1e+9)

            # softmax
            softmax = tf.nn.softmax(score, dim=2)

            # dropout
            softmax = tf.nn.dropout(softmax, keep_prob=self.keep_prob)

            # attention
            attention = tf.matmul(softmax, V)

            # concatenate the outputs of the heads
            concat = tf.concat(tf.split(attention, multihead_num, axis=0),
                               axis=-1)

            # Linear
            Multihead = tf.layers.dense(concat,
                                        units=hidden_size,
                                        activation=activation,
                                        use_bias=False)

            # output mask
            if output_mask is not None:
                Multihead *= output_mask

            # dropout before the residual connection
            Multihead = tf.nn.dropout(Multihead, keep_prob=self.keep_prob)

            # residual connection
            Multihead += inputs_forward

            # Layer Norm
            Multihead = tf.contrib.layers.layer_norm(Multihead,
                                                     begin_norm_axis=2)
            return Multihead
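    # Shape walk-through for the block above (hypothetical batch=2, max_len=60,
    # hidden_size=410, multihead_num=5):
    #   Q / K / V dense outputs          [2, 60, 410]
    #   split heads + concat on axis 0   [10, 60, 82]
    #   scores Q K^T / sqrt(82)          [10, 60, 60]
    #   softmax(scores) V                [10, 60, 82]
    #   merge heads + final dense        [2, 60, 410]  (then residual + layer norm)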

    def Transformer(self):
        num = 3

        feature_size = self.word_dim + self.pos_dim * 2 + self.postagger_dim
        hidden_size = feature_size
        inputs_forward = self.embedding()
        for i in range(num):
            inputs_forward = self.MultiHeadAttention(
                inputs_forward,
                feature_size,
                name='MultiHeadAttention_' + str(i + 1))
            inputs_forward = tf.layers.dense(inputs_forward,
                                             units=hidden_size,
                                             activation=tf.nn.relu,
                                             use_bias=False)
            inputs_forward = tf.layers.dense(inputs_forward,
                                             units=hidden_size,
                                             activation=None,
                                             use_bias=False)
            inputs_forward = tf.nn.dropout(inputs_forward, self.keep_prob)
            inputs_forward = tf.contrib.layers.layer_norm(inputs_forward,
                                                          begin_norm_axis=2)

        inputs_forward = tf.contrib.layers.layer_norm(inputs_forward,
                                                      begin_norm_axis=2)
        inputs_forward = tf.layers.dense(inputs_forward,
                                         units=self.num_classes,
                                         activation=None,
                                         use_bias=False)
        # inputs_forward: [batch, max_len, num_classes]
        return inputs_forward

    #https://github.com/maozezhong/focal_loss_multi_class/blob/master/focal_loss.py
    def focal_loss_fixed_1(self,
                           target_tensor,
                           prediction_tensor,
                           classes_num=[
                               248850, 8135, 218, 183, 5513, 245, 69, 291, 40,
                               9, 6870, 1383, 2627, 830, 46, 19, 1673, 637,
                               532, 805, 77, 77, 22, 158, 30, 13, 119, 67, 24,
                               165, 1610, 1301, 1266, 2900, 547
                           ],
                           gamma=2.,
                           alpha=.25,
                           e=0.1):
        '''
        prediction_tensor is the output tensor with shape [None, 100], where 100 is the number of classes
        target_tensor is the label tensor, same shape as prediction_tensor
        '''

        #1# get the focal loss without the balancing weight, as presented in Eq. (4) of the paper
        zeros = array_ops.zeros_like(prediction_tensor,
                                     dtype=prediction_tensor.dtype)
        one_minus_p = array_ops.where(tf.greater(target_tensor, zeros),
                                      target_tensor - prediction_tensor, zeros)
        FT = -1 * (one_minus_p**gamma) * tf.log(
            tf.clip_by_value(prediction_tensor, 1e-8, 1.0))

        #2# get balanced weight alpha
        classes_weight = array_ops.zeros_like(prediction_tensor,
                                              dtype=prediction_tensor.dtype)

        total_num = float(sum(classes_num))
        classes_w_t1 = [total_num / ff for ff in classes_num]
        sum_ = sum(classes_w_t1)
        classes_w_t2 = [ff / sum_ for ff in classes_w_t1]  #scale
        classes_w_tensor = tf.convert_to_tensor(classes_w_t2,
                                                dtype=prediction_tensor.dtype)
        classes_weight += classes_w_tensor

        alpha = array_ops.where(tf.greater(target_tensor, zeros),
                                classes_weight, zeros)

        #3# get balanced focal loss
        balanced_fl = alpha * FT
        balanced_fl = tf.reduce_mean(balanced_fl)

        #4# add another term to prevent overfitting
        # reference : https://spaces.ac.cn/archives/4493
        nb_classes = len(classes_num)
        final_loss = (1 - e) * balanced_fl + e * K.categorical_crossentropy(
            K.ones_like(prediction_tensor) / nb_classes, prediction_tensor)
        final_loss = tf.reduce_sum(final_loss)
        return final_loss

    #https://github.com/fudannlp16/focal-loss/blob/master/focal_loss.py
    def focal_loss_fixed_2(self, target_tensor, prediction_tensor, gamma=2):
        alpha = tf.constant(value=[1] + [1.5 for _ in range(34)],
                            dtype=tf.float32)
        prediction_tensor = tf.clip_by_value(prediction_tensor, 1.0e-8, 1.0)
        final_loss = -target_tensor * (
            (1 - prediction_tensor)**gamma) * tf.log(prediction_tensor) * alpha
        final_loss = tf.reduce_sum(final_loss, axis=1)
        final_loss = tf.reduce_mean(final_loss)
        return final_loss
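    # Toy check of the formula above (gamma = 2, hypothetical values): with prediction
    # [0.7, 0.2, 0.1], one-hot target [0, 1, 0] and alpha = 1.5 for the true class,
    # loss = 1.5 * (1 - 0.2) ** 2 * -log(0.2) ≈ 1.55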

    def bag_level(self):
        self.classifier_loss = 0.0
        self.probability = []

        if self.encoder == 'pcnn':
            hidden_dim_cur = self.hidden_dim * 3
        elif self.encoder == 'birnn':
            hidden_dim_cur = self.hidden_dim * 2
        else:
            hidden_dim_cur = self.hidden_dim

        self.bag_sens = tf.compat.v1.placeholder(dtype=tf.int32,
                                                 shape=[self.batch_size + 1],
                                                 name='bag_sens')
        self.att_A = tf.compat.v1.get_variable(name='att_A',
                                               shape=[hidden_dim_cur])
        self.rel = tf.reshape(tf.transpose(self.relation_embedding),
                              [self.num_classes, hidden_dim_cur])
        for i in range(self.batch_size):
            sen_reps = tf.reshape(
                self.sentence_reps[self.bag_sens[i]:self.bag_sens[i + 1]],
                [-1, hidden_dim_cur])

            att_sen = tf.reshape(tf.multiply(sen_reps, self.att_A),
                                 [-1, hidden_dim_cur])
            score = tf.matmul(self.rel, tf.transpose(att_sen))
            alpha = tf.nn.softmax(score, 1)
            bag_rep = tf.matmul(alpha, sen_reps)
            out = tf.matmul(
                bag_rep, self.relation_embedding) + self.relation_embedding_b

            prob = tf.reshape(
                tf.reduce_sum(
                    tf.nn.softmax(out, 1) *
                    tf.reshape(self.input_label[i], [-1, 1]), 0),
                [self.num_classes])

            self.probability.append(
                tf.reshape(
                    tf.reduce_sum(
                        tf.nn.softmax(out, 1) *
                        tf.linalg.tensor_diag([1.0] * (self.num_classes)), 1),
                    [-1, self.num_classes]))
            self.classifier_loss += tf.reduce_sum(
                -tf.log(tf.clip_by_value(prob, 1.0e-10, 1.0)) *
                tf.reshape(self.input_label[i], [-1]))

        self.probability = tf.concat(axis=0, values=self.probability)
        self.classifier_loss = self.classifier_loss / tf.cast(
            self.batch_size, tf.float32)

    def attention_1(self, M):
        # attention scoring computed over the full text
        # https://zhuanlan.zhihu.com/p/53682800
        # after transposing: [batch_size, 2*hidden, max_len]
        M = tf.transpose(M, [0, 2, 1])
        #a:[batch_size,1,max_len]
        a = tf.nn.softmax(tf.matmul(self.att_weight, M), 2)
        #a:[batch_size,max_len,1]
        a = tf.transpose(a, [0, 2, 1])
        # returns: [batch_size, 2*hidden, 1]
        return tf.matmul(M, a)

    def attention_2(self, M):
        # attention scoring computed over the parts of the text other than the entities
        #https://zhuanlan.zhihu.com/p/53682800
        #M:[batch_size,max_len,2*hidden],
        e = tf.to_float(tf.expand_dims(self.input_segment, -1))
        # M: [batch_size, max_len, 2*hidden], e1: [batch_size, max_len, 1]; after multiplication: [batch_size, max_len, 2*hidden]
        e1 = tf.where(tf.equal(e, 1), e, e - e)
        e1 = tf.multiply(M, e1)
        e2 = tf.where(tf.equal(e, -1), 0 - e, e - e)
        e2 = tf.multiply(M, e2)

        #e1,e2,text:[batch_size,max_len,1]
        e1 = tf.tanh(tf.nn.conv1d(e1, self.e1_w, 1, 'SAME'))
        e2 = tf.tanh(tf.nn.conv1d(e2, self.e2_w, 1, 'SAME'))
        text = tf.tanh(tf.nn.conv1d(M, self.text_w, 1, 'SAME'))
        score = tf.nn.softmax(tf.subtract(text, tf.add(e1, e2)), 1)
        M = tf.transpose(M, [0, 2, 1])
        return tf.matmul(M, score)

    def sentence_level(self):
        if self.encoder == 'birnn':
            #att_out = tf.tanh(self.attention_1(self.sentence_reps))
            #out = tf.matmul(tf.reshape(att_out,[self.batch_size,-1]), self.relation_embedding) + self.relation_embedding_b
            hidden_size = self.hidden_dim * 2
            out = self.MultiHeadAttention(self.sentence_reps,
                                          hidden_size,
                                          multihead_num=5,
                                          activation=None,
                                          name='MultiHeadAttention_1',
                                          score_mask=None,
                                          output_mask=None)
            out = out[:, 0]
            out = tf.matmul(
                out, self.relation_embedding) + self.relation_embedding_b

        elif self.encoder == 'transformer':
            # self.sentence_reps:[batch,max_len,feature_size]  out:[batch,feature_size]
            out = self.sentence_reps[:, 0]
        else:
            out = tf.matmul(
                self.sentence_reps,
                self.relation_embedding) + self.relation_embedding_b
        self.probability = tf.nn.softmax(out, 1)

        # self.classifier_loss = self.focal_loss_fixed_2(self.input_label,self.probability)

        self.classifier_loss = tf.reduce_mean(
            tf.reduce_sum(
                -tf.log(tf.clip_by_value(self.probability, 1.0e-10, 1.0)) *
                self.input_label, 1))

    def run_train(self, sess, batch):

        sent_batch, label_batch, sen_num_batch = batch

        feed_dict = {}
        feed_dict[self.keep_prob] = 1 - self.dropout
        feed_dict[self.input_word] = sent_batch[:, 0, :]
        feed_dict[self.input_pos_e1] = sent_batch[:, 1, :]
        feed_dict[self.input_pos_e2] = sent_batch[:, 2, :]
        feed_dict[self.input_postagger] = sent_batch[:, 4, :]
        feed_dict[self.input_segment] = sent_batch[:, 5, :]
        if self.encoder == 'pcnn':
            feed_dict[self.mask] = sent_batch[:, 3, :]
        feed_dict[self.input_label] = label_batch
        if self.bag:
            feed_dict[self.bag_sens] = sen_num_batch
        _, classifier_loss = sess.run(
            [self._classifier_train_op, self.classifier_loss], feed_dict)
        return classifier_loss

    def run_dev(self, sess, dev_batchers):
        all_labels = []
        all_probs = []
        for batch in dev_batchers:
            sent_batch, label_batch, sen_num_batch = batch
            all_labels.append(label_batch)

            feed_dict = {}
            feed_dict[self.keep_prob] = 1.0
            feed_dict[self.input_word] = sent_batch[:, 0, :]
            feed_dict[self.input_pos_e1] = sent_batch[:, 1, :]
            feed_dict[self.input_pos_e2] = sent_batch[:, 2, :]
            feed_dict[self.input_postagger] = sent_batch[:, 4, :]
            feed_dict[self.input_segment] = sent_batch[:, 5, :]
            if self.encoder == 'pcnn':
                feed_dict[self.mask] = sent_batch[:, 3, :]
            if self.bag:
                feed_dict[self.bag_sens] = sen_num_batch
            prob = sess.run([self.probability], feed_dict)
            all_probs.append(np.reshape(prob, (-1, self.num_classes)))

        all_labels = np.concatenate(all_labels, axis=0)[:self.data_size]
        all_probs = np.concatenate(all_probs, axis=0)[:self.data_size]
        if self.bag:
            all_preds = all_probs
            all_preds[all_probs > self.bag_threshold] = 1
            all_preds[all_probs <= self.bag_threshold] = 0
        else:
            all_preds = np.eye(self.num_classes)[np.reshape(
                np.argmax(all_probs, 1), (-1))]

        return all_preds, all_labels

    def run_test(self, sess, test_batchers):
        all_probs = []
        for batch in test_batchers:
            sent_batch, _, sen_num_batch = batch

            feed_dict = {}
            feed_dict[self.keep_prob] = 1.0
            feed_dict[self.input_word] = sent_batch[:, 0, :]
            feed_dict[self.input_pos_e1] = sent_batch[:, 1, :]
            feed_dict[self.input_pos_e2] = sent_batch[:, 2, :]
            feed_dict[self.input_postagger] = sent_batch[:, 4, :]
            feed_dict[self.input_segment] = sent_batch[:, 5, :]
            if self.encoder == 'pcnn':
                feed_dict[self.mask] = sent_batch[:, 3, :]
            if self.bag:
                feed_dict[self.bag_sens] = sen_num_batch
            prob = sess.run([self.probability], feed_dict)
            all_probs.append(np.reshape(prob, (-1, self.num_classes)))

        all_probs = np.concatenate(all_probs, axis=0)[:self.data_size]
        if self.bag:
            all_preds = all_probs
            all_preds[all_probs > self.bag_threshold] = 1
            all_preds[all_probs <= self.bag_threshold] = 0
        else:
            all_preds = np.eye(self.num_classes)[np.reshape(
                np.argmax(all_probs, 1), (-1))]

        if self.bag:
            with open('result_bag.txt', 'w') as fw:
                for i in range(self.data_size):
                    rel_one_hot = [int(num) for num in all_preds[i].tolist()]
                    rel_list = []
                    for j in range(0, self.num_classes):
                        if rel_one_hot[j] == 1:
                            rel_list.append(str(j))
                    # if a bag has no relation, it is considered as having the relation NA
                    if len(rel_list) == 0:
                        rel_list.append('0')
                    fw.write(self.datas[i] + '\t' + ' '.join(rel_list) + '\n')
        else:
            with open('result_sent.txt', 'w') as fw:
                for i in range(self.data_size):
                    rel_one_hot = [int(num) for num in all_preds[i].tolist()]
                    rel_list = []
                    for j in range(0, self.num_classes):
                        if rel_one_hot[j] == 1:
                            rel_list.append(str(j))
                    fw.write(self.datas[i] + '\t' + ' '.join(rel_list) + '\n')

    def run_model(self, sess, saver):
        if self.mode == 'train':
            global_step = 0
            sent_train = self.load_sent('sent_train.txt')
            sent_dev = self.load_sent('sent_dev.txt')

            max_f1 = 0.0

            if not os.path.isdir(self.model_path):
                os.mkdir(self.model_path)

            for epoch in range(self.epochs):
                if self.bag:
                    train_batchers = self.data_batcher(
                        sent_train,
                        'bag_relation_train.txt',
                        padding=False,
                        shuffle=True)
                else:
                    train_batchers = self.data_batcher(
                        sent_train,
                        'sent_relation_train.txt',
                        padding=False,
                        shuffle=True)
                for batch in train_batchers:

                    losses = self.run_train(sess, batch)
                    global_step += 1
                    if global_step % 50 == 0:
                        time_str = datetime.datetime.now().isoformat()
                        tempstr = "{}: step {}, classifier_loss {:g}".format(
                            time_str, global_step, losses)
                        print(tempstr)
                    if global_step % 100 == 0:
                        if self.bag:
                            dev_batchers = self.data_batcher(
                                sent_dev,
                                'bag_relation_dev.txt',
                                padding=True,
                                shuffle=False)
                        else:
                            dev_batchers = self.data_batcher(
                                sent_dev,
                                'sent_relation_dev.txt',
                                padding=True,
                                shuffle=False)
                        all_preds, all_labels = self.run_dev(
                            sess, dev_batchers)

                        # when calculating the F1 score, we do not consider whether NA results are predicted or not
                        # the number of non-NA answers in the gold data is counted as n_std
                        # the number of non-NA answers among the predictions is counted as n_sys
                        # the intersection of the two is counted as n_r
                        n_r = int(np.sum(all_preds[:, 1:] * all_labels[:, 1:]))
                        n_std = int(np.sum(all_labels[:, 1:]))
                        n_sys = int(np.sum(all_preds[:, 1:]))
                        try:
                            precision = n_r / n_sys
                            recall = n_r / n_std
                            f1 = 2 * precision * recall / (precision + recall)
                        except ZeroDivisionError:
                            f1 = 0.0

                        if f1 > max_f1:
                            max_f1 = f1
                            print('f1: %f' % f1)
                            print('saving model')
                            path = saver.save(sess,
                                              os.path.join(
                                                  self.model_path,
                                                  'ipre_bag_%d' % (self.bag)),
                                              global_step=0)
                            tempstr = 'have saved model to ' + path
                            print(tempstr)

        else:
            path = os.path.join(self.model_path,
                                'ipre_bag_%d' % self.bag) + '-0'
            tempstr = 'load model: ' + path
            print(tempstr)
            try:
                saver.restore(sess, path)
            except:
                raise ValueError('Invalid model name')

            sent_test = self.load_sent('sent_test.txt')
            if self.bag:
                test_batchers = self.data_batcher(sent_test,
                                                  'bag_relation_test.txt',
                                                  padding=True,
                                                  shuffle=False)
            else:
                test_batchers = self.data_batcher(sent_test,
                                                  'sent_relation_test.txt',
                                                  padding=True,
                                                  shuffle=False)

            self.run_test(sess, test_batchers)
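
A toy NumPy illustration of the micro-F1 counting used in run_model above (column 0, the NA class, is excluded):

import numpy as np

all_preds = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]], dtype=np.float32)
all_labels = np.array([[0, 1, 0], [0, 0, 1], [0, 0, 1]], dtype=np.float32)
n_r = int(np.sum(all_preds[:, 1:] * all_labels[:, 1:]))   # correctly predicted non-NA relations: 2
n_std = int(np.sum(all_labels[:, 1:]))                    # gold non-NA relations: 3
n_sys = int(np.sum(all_preds[:, 1:]))                     # predicted non-NA relations: 2
precision, recall = n_r / n_sys, n_r / n_std
print(2 * precision * recall / (precision + recall))      # 0.8
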
    def cosine_similarity(self, c1, c2):
        #bc = BertClient(check_length=False)
        bc = BertClient()
        vectors = bc.encode([c1.text, c2.text])
        cosine = 1.0 - scipy.spatial.distance.cosine(vectors[0], vectors[1])
        return cosine
Example #22
    def __init__(self, flags):
        self.lr = flags.lr
        self.sen_len = flags.sen_len
        self.pre_embed = flags.pre_embed
        self.pos_limit = flags.pos_limit
        self.pos_dim = flags.pos_dim
        self.window = flags.window
        self.word_dim = flags.word_dim
        self.hidden_dim = flags.hidden_dim
        self.postagger_dim = flags.postagger_dim
        self.batch_size = flags.batch_size
        self.data_path = flags.data_path
        self.model_path = flags.model_path
        self.encoder = flags.encoder
        self.mode = flags.mode
        self.epochs = flags.epochs
        self.dropout = flags.dropout
        self.bag_threshold = flags.bag_threshold
        self.word_frequency = flags.word_frequency
        self.postaggerMap = {}
        if flags.level == 'sent':
            self.bag = False
        elif flags.level == 'bag':
            self.bag = True
        else:
            self.bag = True

        if flags.embed_bert:
            self.embed_bert = True
            self.word_dim = 768
            self.bert = BertClient(ip='localhost',
                                   check_version=False,
                                   check_length=False)
        else:
            self.embed_bert = False

        self.relation2id = self.load_relation()
        self.num_classes = len(self.relation2id)

        self.pos_num = 2 * self.pos_limit + 3
        if self.pre_embed:
            self.wordMap, word_embed = self.load_wordVec()
            self.word_embedding = tf.compat.v1.get_variable(
                initializer=word_embed, name='word_embedding', trainable=False)

        elif self.embed_bert and self.mode == 'train':
            self.wordMap, word_embed = self.bert_wordMap()
            self.word_embedding = tf.compat.v1.get_variable(
                initializer=word_embed, name='word_embedding', trainable=False)
        elif self.embed_bert and self.mode == 'test':
            self.wordMap, word_embed = self.load_bert_word2vec()
            self.word_embedding = tf.compat.v1.get_variable(
                initializer=word_embed, name='word_embedding', trainable=False)
        else:
            self.wordMap = self.load_wordMap()
            self.word_embedding = tf.compat.v1.get_variable(
                shape=[len(self.wordMap), self.word_dim],
                name='word_embedding',
                trainable=True)

        self.pos_e1_embedding = tf.compat.v1.get_variable(
            name='pos_e1_embedding', shape=[self.pos_num, self.pos_dim])
        self.pos_e2_embedding = tf.compat.v1.get_variable(
            name='pos_e2_embedding', shape=[self.pos_num, self.pos_dim])
        self.pos_postagger_embedding = tf.compat.v1.get_variable(
            name='pos_postagger_embedding', shape=[29, self.postagger_dim])

        if self.encoder == 'pcnn':
            self.relation_embedding = tf.compat.v1.get_variable(
                name='relation_embedding',
                shape=[self.hidden_dim * 3, self.num_classes])
        elif self.encoder == 'birnn':

            self.e1_w = tf.compat.v1.get_variable(
                name='e1_conv', shape=[self.window, self.hidden_dim * 2, 1])
            self.e2_w = tf.compat.v1.get_variable(
                name='e2_conv', shape=[self.window, self.hidden_dim * 2, 1])
            self.text_w = tf.compat.v1.get_variable(
                name='text_conv', shape=[self.window, self.hidden_dim * 2, 1])
            self.relation_embedding = tf.compat.v1.get_variable(
                name='relation_embedding',
                shape=[self.hidden_dim * 2, self.num_classes])
            self.att_weight = tf.compat.v1.get_variable(
                name='att_weight',
                shape=[self.batch_size, 1, self.hidden_dim * 2])
        else:
            self.relation_embedding = tf.compat.v1.get_variable(
                name='relation_embedding',
                shape=[self.hidden_dim, self.num_classes])
        self.relation_embedding_b = tf.compat.v1.get_variable(
            name='relation_embedding_b', shape=[self.num_classes])
        if self.encoder == 'cnn':
            self.sentence_reps = self.cnn()
        elif self.encoder == 'pcnn':
            self.sentence_reps = self.pcnn()
        elif self.encoder == 'rnn':
            self.sentence_reps = self.rnn()
        elif self.encoder == 'birnn':
            self.sentence_reps = self.bi_rnn()
        else:
            self.sentence_reps = self.Transformer()

        if self.bag:
            self.bag_level()
        else:
            self.sentence_level()
        self._classifier_train_op = tf.compat.v1.train.AdamOptimizer(
            self.lr).minimize(self.classifier_loss)
Example #23
from bert_serving.client import BertClient
from read_questions_csv import questions, answers
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

bc = BertClient(ip='10.51.101.101', check_length=False)

q_embs = bc.encode(questions)
topk = 5


def answer_query(query):
    emb = bc.encode([query, "asfg"])[0].reshape((1, -1))
    similarities = [cosine_similarity(emb, x.reshape((1, -1))) for x in q_embs]
    most_similar = np.argmax(similarities)
    results = []
    score = np.sum(emb * q_embs, axis=1) / np.linalg.norm(q_embs, axis=1)
    topk_idx = np.argsort(score)[::-1][:topk]
    for idx in topk_idx:
        # print('> %s\t%s' % (score[idx], questions[idx]))
        results.append([questions[idx], answers[idx]])

    return results
Example #24
def bert_lu_annotation_embeddings(fn, vocab_path):
	vocab = load_vocab(vocab_path)
	basic_tokenizer = BasicTokenizer(do_lower_case=False)
	wordpiece_tokenizer = WordpieceTokenizer(vocab=vocab)
	regex = re.compile(r'([ :]{4,}| \.\.\. )')

	sentences = []

	for frame in fn.frames:
		for lu in frame.lus:
			if len(lu.anno_sents) == 0:
				tokens = wordpiece_tokenizer.tokenize(lu.clean_name)
				
				if '[UNK]' not in tokens:
					sentences.append({
						"tokens": tokens,
						"lu_pos": len(tokens) - 1,
						"lu_id": lu.id,
					})

			for anno in lu.anno_sents:
				sentence = anno["sentence"]
				start = max(anno["lu_pos"][0]-1, 0)
				end = min(anno["lu_pos"][1]+1, len(sentence))

				a = basic_tokenizer.tokenize(re.sub(regex, ' ', sentence[:start]))
				w = basic_tokenizer.tokenize(sentence[start:end])
				b = basic_tokenizer.tokenize(re.sub(regex, ' ', sentence[end:]))

				tokenized = [wordpiece_tokenizer.tokenize(t) for t in a+w+b]
				shift = list(itertools.accumulate([len(t) for t in tokenized]))

				tokens = [s for subtokens in tokenized for s in subtokens]
				pos = shift[len(a)+len(w)-1]-1

				if tokens[pos] != '[UNK]':
					sentences.append({
						"tokens": tokens,
						"lu_pos": pos,
						"lu_id": lu.id
					})

	# sentences = sentences[:100]

	vecs = None
	bc = BertClient()

	for chunk in chunks(sentences, 1024):
		tokens = [s["tokens"] for s in chunk]
		res = bc.encode(tokens, is_tokenized=True)
		sel = res[np.arange(len(res)), [s["lu_pos"] + 1 for s in chunk]]

		if vecs is not None:
			vecs = np.concatenate((vecs, sel), axis=0)
		else: 
			vecs = sel

	vecs = vecs.reshape((len(sentences), 768))

	embs = defaultdict(list)
	for lu, vec in zip(sentences, vecs):
		embs[lu["lu_id"]].append(vec)

	return embs
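
A hedged usage sketch: fn is assumed to be a loaded FrameNet resource exposing .frames as above, and 'vocab.txt' is a hypothetical vocab path; the per-annotation vectors are averaged into one embedding per lexical unit.

import numpy as np

embs = bert_lu_annotation_embeddings(fn, 'vocab.txt')
lu_mean_vecs = {lu_id: np.mean(vecs, axis=0) for lu_id, vecs in embs.items()}
print(len(lu_mean_vecs), next(iter(lu_mean_vecs.values())).shape)   # number of LUs, (768,)
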
    'pooling_layer': [-2],
    'gpu_memory_fraction': 0.5
}
args = namedtuple('args_namedtuple', ','.join(common.keys()))
for k, v in common.items():
    setattr(args, k, v)

for pool_layer in range(1, 13):
    setattr(args, 'pooling_layer', [-pool_layer])
    server = BertServer(args)
    server.start()
    print('wait until server is ready...')
    time.sleep(15)
    print('encoding...')
    bc = BertClient(port=common['port'],
                    port_out=common['port_out'],
                    show_server_config=True)
    subset_vec_all_layers.append(bc.encode(subset_text))
    bc.close()
    server.close()
    print('done at layer -%d' % pool_layer)


def vis(embed, vis_alg='PCA', pool_alg='REDUCE_MEAN'):
    plt.close()
    fig = plt.figure()
    plt.rcParams['figure.figsize'] = [21, 7]
    for idx, ebd in enumerate(embed):
        ax = plt.subplot(2, 6, idx + 1)
        vis_x = ebd[:, 0]
        vis_y = ebd[:, 1]
Example #26
def load_data(data_path):

    return pd.read_csv(data_path)


train = load_data("../question-pairs-dataset/test_set.csv")

# train, test = train_test_split(data, test_size=0.3, random_state=42)

claims = train["question1"].tolist()
sents = train["question2"].tolist()

sampled_claims = claims[:20000]
sampled_sents = sents[:20000]

bc = BertClient(check_length=False, check_version=False)

# print (type(claims[0:10]))
sents_pair = [[str(claim) + ' ||| ' + str(sent)]
              for claim, sent in zip(sampled_claims, sampled_sents)]
# print (sents_pair[30])
print(len(sents_pair))
vec = np.empty((len(sents_pair), 768))
print(vec.shape)

count = 0
for sent in sents_pair:

    if count == 0:
        vec = bc.encode(sent)
    else:
Example #27
BASEDIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(BASEDIR)
from text_cnn_con_wy_1 import TextCNN
import data_helper2
import utils
from configure import FLAGS

from sklearn.metrics import f1_score
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore",
                        category=sklearn.exceptions.UndefinedMetricWarning)
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
from bert_serving.client import BertClient

bc = BertClient()


def train():
    with tf.device('/cpu:0'):
        train_text, train_y, train_text_pos, train_e1, train_e2, train_pos1, train_pos2, train_sentence_len = data_helper2.load_data_and_labels(
            FLAGS.train_path)
    with tf.device('/cpu:0'):
        test_text, test_y, test_text_pos, test_e1, test_e2, test_pos1, test_pos2, test_sentence_len = data_helper2.load_data_and_labels(
            FLAGS.test_path)

    # Build vocabulary
    # Example: x_text[3] = "A misty <e1>ridge</e1> uprises from the <e2>surge</e2>."
    # ['a misty ridge uprises from the surge <UNK> <UNK> ... <UNK>']
    # =>
    # [27 39 40 41 42  1 43  0  0 ... 0]
from SemanticModel import SemanticModel
from stimulus_utils import load_grids_for_stories
from stimulus_utils import load_generic_trfiles
from dsutils import make_word_ds, make_phoneme_ds
from dsutils import make_semantic_model
from npp import zscore
from util import make_delayed
from helper import getTimestampDict, listToString, numUniqueWords
from transformers import BertTokenizer, BertModel
import pandas as pd
import torch
import tables
from bert_serving.client import BertClient
logging.basicConfig(level=logging.DEBUG)

bc = BertClient()

print("Pre-loading model")

eng1000 = SemanticModel.load("data/english1000sm.hf5")

print("Post-loading model")
# These are lists of the stories
# Rstories are the names of the training (or Regression) stories, which we will use to fit our models
Rstories = ['alternateithicatom', 'avatar', 'howtodraw', 'legacy', 
            'life', 'myfirstdaywiththeyankees', 'naked', 
            'odetostepfather', 'souls', 'undertheinfluence']

# Pstories are the test (or Prediction) stories (well, story), which we will use to test our models
Pstories = ['wheretheressmoke']
Example #29
"""

This script creates documents in the required format for
indexing.

"""

import json
from pandas import read_csv
from argparse import ArgumentParser
from bert_serving.client import BertClient
bc = BertClient(output_fmt='list', check_length=False)


def create_document(doc, emb, index_name):
    return {
        '_op_type': 'index',
        '_index': index_name,
        'title': doc['title'],
        'abstract': doc['abstract'],
        'abstract_vector': emb
    }


def load_dataset(path):
    docs = []
    df = read_csv(path)
    for row in df.iterrows():
        series = row[1]
        doc = {
            'title': series.title,
Example #30
    def __init__(self):
        self.server_ip = "localhost"
        self.bert_client = BertClient(ip=self.server_ip)