Example #1
from bert_serving.client import BertClient


def create_query_vector(query):
    return BertClient().encode([query])[0]
Example #2
'''
After installing the bert-serving server and client from pip, start the server:

bert-serving-start -model_dir chinese_L-12_H-768_A-12/ (pre-trained model) -num_worker=2

'''
from bert_serving.client import BertClient
from tqdm import tqdm
import pickle

bc = BertClient(ip='localhost')

data = []
file = 'baidu_95.csv'  # corpus
with open(file, 'r', encoding='utf8') as f:
    for line in tqdm(f.readlines()):
        words = line.split(' ')
        data.append(bc.encode(words))

save_path = 'bert_serving_vector.pkl'
with open(save_path, 'wb') as f:
    pickle.dump(data, f)

print('successfully saved vectors to {}'.format(save_path))
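
A minimal sketch (not part of the original example) of loading the pickled vectors back, assuming the same save_path:

with open('bert_serving_vector.pkl', 'rb') as f:
    vectors = pickle.load(f)  # one array of word vectors per corpus line
print('loaded vectors for {} lines'.format(len(vectors)))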
Example #3
def embed(lst):
    from bert_serving.client import BertClient
    bc = BertClient()
    vec = bc.encode(lst)
    print('vec.shape: ', vec.shape)
    return vec
Example #4
    "-port_out",
    "5556",
    "-max_seq_len",
    "NONE",
    "-pooling_strategy",
    "NONE",
    "-mask_cls_sep",
    "-cpu",
]
SHUT_ARGS = ["-ip", "localhost", "-port", "5555", "-timeout", "5000"]


class BertWordEmbedding:
    def __init__(self):
        self.start_args = get_args_parser().parse_args(START_ARGS)
        self.shut_args = get_shutdown_parser().parse_args(SHUT_ARGS)

    def vectorize(self, client, tokens):
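        # With "-pooling_strategy NONE" and is_tokenized=True the server returns one
        # vector per token, including [CLS] and [SEP]; the [1:-1] slice drops those two.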
        vecs = np.squeeze(client.encode(tokens, is_tokenized=True))[1:-1]
        return vecs


if __name__ == "__main__":
    bert = BertWordEmbedding()
    tokens = [["hello", "world", "!"]]
    with BertServer(bert.start_args):
        with BertClient() as client:
            vecs = bert.vectorize(client, tokens)
    print(vecs)
    print(vecs.shape)
Example #5
    return 1 / (1 + math.exp(-100 * (cosine - 0.95)))
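# The steep logistic above maps cosine similarity into (0, 1) around a 0.95 threshold:
# roughly, cosine 0.90 -> 0.007, 0.95 -> 0.5, 0.98 -> 0.95.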


# construct the argument parse and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-n", "--news", required=True, help="text of the news")
args = vars(ap.parse_args())

#query = "Covid19 is deadly and spreads through 5G" ## change or input query here
query = args["news"]

json_path = "who_scrap.json"  ## data corpus
indices = []
scores = []

with BertClient(port=5555, port_out=5556, check_version=False) as bc:
    Pairs, dataframe = testing(query, json_path)
    print("Start testing")
    for i, p in enumerate(Pairs):
        try:
            score = scoring(p)
            #print("Similarity of Pair {}: ".format(i+1),score )
            if score > 0.7:
                indices.append(i)
                scores.append(score)
        except Exception:
            print("no text found for entry {}".format(i + 1))
    result_df = dataframe.iloc[indices]

weight = 0
sentiments = 0
Example #6
def create_kui_data_for(path_patch_kui, path_defects4f_c, path_supply_data,
                        path_FSE_defects4j, model):
    print('model: {}'.format(model))

    with open('../data/experiment3/kui_data_for_' + model + '.pickle',
              'wb') as f:
        if model == 'doc':
            m = Doc2Vec.load('../data/model/doc_frag.model')
        elif model == 'bert':
            # max_seq_len=360
            m = BertClient(check_length=False)
        else:
            print('error')
        # buggy_array = np.array([])
        # patched_array = np.array([])
        # label_array = np.array([])

        sets = set()

        cnt = 0
        label_array, buggy_array, patched_array = list(), list(), list()

        # xiong's 139 data
        path_patch_supply = path_supply_data
        path_jsons = os.path.join(path_patch_supply, 'INFO')
        json_files = os.listdir(path_jsons)
        for j in json_files:
            with open(os.path.join(path_jsons, j), 'r') as f1:
                info_dict = json.load(f1)
            if j.split('.')[0] not in data_139:
                continue
            if info_dict['project'] == 'Mockito':
                continue
            if info_dict['correctness'] == 'Correct':
                label = 1
            elif info_dict['correctness'] == 'Incorrect':
                label = 0
            else:
                continue
            path_patch = os.path.join(path_patch_supply, j.split('.')[0])

            bug_vec, patched_vec = get_sample_supply2(model, path_patch, m,
                                                      sets)
            # filter duplication
            if type(bug_vec) is not np.ndarray:
                continue
            if cnt == 0:
                buggy_array = bug_vec.reshape((1, -1))
                patched_array = patched_vec.reshape((1, -1))
                label_array = [label]
            else:
                buggy_array = np.concatenate(
                    (buggy_array, bug_vec.reshape((1, -1))), axis=0)
                patched_array = np.concatenate(
                    (patched_array, patched_vec.reshape((1, -1))), axis=0)
                label_array.append(label)
            cnt += 1
            print('cnt: {}'.format(cnt))

        # kui's dataset
        for root, dirs, files in os.walk(path_patch_kui):
            if files == ['.DS_Store']:
                continue
            # files = sorted(files,key=lambda x:int(x.split('-')[1].split('.')[0]))
            if files == []:
                continue
            if root.split('/')[-1].startswith('Mockito'):
                continue
            label, bug_vec, patched_vec = get_sample(model, files, root, m,
                                                     sets)
            # filter duplication
            if type(bug_vec) is not np.ndarray:
                continue
            if cnt == 0:
                buggy_array = bug_vec.reshape((1, -1))
                patched_array = patched_vec.reshape((1, -1))
                label_array = [label]
            else:
                buggy_array = np.concatenate(
                    (buggy_array, bug_vec.reshape((1, -1))), axis=0)
                patched_array = np.concatenate(
                    (patched_array, patched_vec.reshape((1, -1))), axis=0)
                label_array.append(label)
            cnt += 1
            print('cnt: {}'.format(cnt))

        # label=1 developer's correct patches
        for bug in bug_folder:
            bug_path = os.path.join(path_defects4f_c, bug)
            correct_patches = os.path.join(bug_path, 'patches')
            for patch in os.listdir(correct_patches):
                if not patch.endswith('src.patch'):
                    continue
                path_patch = os.path.join(correct_patches, patch)
                try:
                    label, bug_vec, patched_vec = get_sample_supply(
                        model, path_patch, m, sets)
                    # filter duplication
                    if type(bug_vec) is not np.ndarray:
                        continue
                    if cnt == 0:
                        buggy_array = bug_vec.reshape((1, -1))
                        patched_array = patched_vec.reshape((1, -1))
                        label_array = [label]
                    else:
                        buggy_array = np.concatenate(
                            (buggy_array, bug_vec.reshape((1, -1))), axis=0)
                        patched_array = np.concatenate(
                            (patched_array, patched_vec.reshape((1, -1))),
                            axis=0)
                        label_array.append(label)
                except Exception as e:
                    print(e)
                    continue
                cnt += 1
                print('cnt: {}'.format(cnt))

        # big dataset
        # # FSE correct
        # cor = path_FSE_defects4j+'Correct'
        # patchName = os.listdir(cor)
        # for pn in patchName:
        #     pf = os.path.join(cor,pn)
        #     try:
        #         label, bug_vec, patched_vec = get_sample_supply(model, pf, m, sets)
        #         if type(bug_vec) is not np.ndarray:
        #             continue
        #         if cnt == 0:
        #             buggy_array = bug_vec.reshape((1, -1))
        #             patched_array = patched_vec.reshape((1, -1))
        #             label_array = [label]
        #         else:
        #             buggy_array = np.concatenate((buggy_array, bug_vec.reshape((1, -1))), axis=0)
        #             patched_array = np.concatenate((patched_array, patched_vec.reshape((1, -1))), axis=0)
        #             label_array.append(label)
        #     except Exception as e:
        #         print(e)
        #         continue
        #     cnt += 1
        #     print('cnt: {}'.format(cnt))
        #
        # # FSE incorrect
        # cor = path_FSE_defects4j + 'Incorrect'
        # patchName = os.listdir(cor)
        # for pn in patchName:
        #     pf = os.path.join(cor, pn)
        #     try:
        #         label, bug_vec, patched_vec = get_sample_supply_fseincorrect(model, pf, m, sets)
        #         if type(bug_vec) is not np.ndarray:
        #             continue
        #         if cnt == 0:
        #             buggy_array = bug_vec.reshape((1, -1))
        #             patched_array = patched_vec.reshape((1, -1))
        #             label_array = [label]
        #         else:
        #             buggy_array = np.concatenate((buggy_array, bug_vec.reshape((1, -1))), axis=0)
        #             patched_array = np.concatenate((patched_array, patched_vec.reshape((1, -1))), axis=0)
        #             label_array.append(label)
        #     except Exception as e:
        #         print(e)
        #         continue
        #     cnt += 1
        #     print('cnt: {}'.format(cnt))

        label_array = np.array(label_array)
        data = label_array, buggy_array, patched_array
        pickle.dump(data, f)
Example #7
# coding: utf-8

# In[1]:


import matplotlib.pyplot as plt
from bert_serving.client import BertClient
bc = BertClient(ip='127.0.0.1', port=8701, port_out=8702, show_server_config=True)


# In[27]:


vec = bc.encode(
    ['First do it',  # [CLS] First do it [SEP] [word embedding for padding symbol]
     'then do it right', 
     'then do it better',
     'In the middle of nowhere, you will find that you are nobody, nobody in the middle of nowhere.'],
    show_tokens=True)

print(vec[0].shape, vec[1])
for idx_sentence in range(len(vec[1])):
    print('\n',vec[1][idx_sentence])
    for idx_token in range(len(vec[1][idx_sentence])):
        print(vec[1][idx_sentence][idx_token],'\t', vec[0][idx_sentence][idx_token][0:5])

vec = vec[0]

plt.subplot(2, 1, 1)
plt.plot(vec[0][0:5].T)
Example #8
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 28 12:45:30 2018

@author: lihuixian
"""

#bert-serving-start -model_dir /model/chinese_L-12_H-768_A-12/ -num_worker=4 -port 5555 -port_out 5556

from bert_serving.client import BertClient

bc = BertClient(ip='192.168.13.19', port=5555, port_out=5556)
import os
import numpy
import glob
readPath = '/Users/lihuixian/Documents/2018analysis/bert3/first10.2'
savePath = '/Users/lihuixian/Documents/2018analysis/bert3/vector_first10.2'

files = glob.glob('%s/*.txt' % readPath)
for path in files:
    filename = os.path.basename(path)[:-4]
    print(filename)
    f = open(path)
    lines = f.readlines()
    avector = bc.encode(lines)
    print(avector)
    numpy.savetxt(r'%s/%s.csv' % (savePath, filename), avector)
    print('written successfully')
Example #9
 def embed_sentences(self):
     bc = BertClient()
     embedding = bc.encode(self.sentences)
     return embedding
Example #10
 def _get_BaaS():
     assert spu.is_port_in_use(
         cfg.bert_port
     ), f'Bert As Service port not in use ({cfg.bert_port}).'
     return BertClient(ignore_all_checks=True)
Example #11
app = Flask(__name__)

prefix_q = 'Q: '
prefix_a = 'A: '
topk = 5

with open('./QA_TravelAgancy.txt') as fp:
    questions = [v.replace(prefix_q, '').strip() for v in fp if v.strip() and v.startswith(prefix_q)]
    print('%d questions loaded, avg. len of %d' % (len(questions), np.mean([len(d.split()) for d in questions])))

with open('./QA_TravelAgancy.txt') as fp:
    answers = [v.replace(prefix_a, '').strip() for v in fp if v.strip() and v.startswith(prefix_a)]
    print('%d answers loaded, avg. len of %d' % (len(answers), np.mean([len(d.split()) for d in answers])))


bc = BertClient(ip='195.246.57.106', port=5555, port_out=5556, check_length=False)

doc_vecs = bc.encode(questions)


@app.route('/')
@as_json
def hello_world():
    return {'message':'Hello World'}

@app.route('/gsug', methods=['POST'])
@as_json
def sendAnswers():
    data = request.form
    query = data['query']
    query_vec = bc.encode([query])[0]
Example #12
    'pooling_layer': [-2],
    'gpu_memory_fraction': 0.5
}
args = namedtuple('args_namedtuple', ','.join(common.keys()))
for k, v in common.items():
    setattr(args, k, v)

for pool_layer in range(1, 13):
    setattr(args, 'pooling_layer', [-pool_layer])
    server = BertServer(args)
    server.start()
    print('wait until server is ready...')
    time.sleep(15)
    print('encoding...')
    bc = BertClient(port=common['port'],
                    port_out=common['port_out'],
                    show_server_config=True)
    subset_vec_all_layers.append(bc.encode(subset_text))
    bc.close()
    server.close()
    print('done at layer -%d' % pool_layer)


def vis(embed, vis_alg='PCA', pool_alg='REDUCE_MEAN'):
    plt.close()
    fig = plt.figure()
    plt.rcParams['figure.figsize'] = [21, 7]
    for idx, ebd in enumerate(embed):
        ax = plt.subplot(2, 6, idx + 1)
        vis_x = ebd[:, 0]
        vis_y = ebd[:, 1]
Example #13
import json
import os
from tqdm import tqdm
from copy import deepcopy
from bert_serving.client import BertClient

bc = BertClient(output_fmt='list')

# input: cord-19 dataset in directory ./data/
# output: each paper as its own json, trimmed and ready to upload to elasticsearch in ./trimmed_papers/


def get_bert_encoding(text: str) -> list:
    return bc.encode([text])[0]


def write_json(output_file: str, data: dict, counter: int) -> int:
    with open(output_file, 'w') as f:
        json.dump(data, f)
    return counter + 1


def handle_file(data: dict, p_num_offset: int):

    keep = {}

    keys = data.keys()

    p_num = p_num_offset

    if 'abstract' in keys:
Example #14
 def cosine_similarity(self, c1, c2):
     #bc = BertClient(check_length=False)
     bc = BertClient()
     vectors = bc.encode([c1.text, c2.text])
     cosine = 1.0 - scipy.spatial.distance.cosine(vectors[0], vectors[1])
     return cosine
Example #15
 def __init__(self, ip='localhost', port=5555, port_out=5556):
     print('Initializing ranker...')
     self.bc = BertClient(ip, port, port_out)
     print('Ranker established.')
Example #16
"""
Example script to create elasticsearch documents.
"""
import argparse
import json

from pandas import read_csv
from bert_serving.client import BertClient
bc = BertClient(output_fmt='list', check_length=False)


def create_document(doc, emb, index_name):
    return {
        '_op_type': 'index',
        '_index': index_name,
        'title': doc['title'],
        'purpose': doc['purpose'],
        'documents_submission_date_start':
        doc['documents_submission_date_start'],
        'documents_submission_date_end': doc['documents_submission_date_end'],
        'documents_submission_time_end': doc['documents_submission_time_end'],
        'is_urgent': doc['is_urgent'],
        'fund_name': doc['fund_name'],
        'country': doc['country'],
        'allowed_participant_countries': doc['allowed_participant_countries'],
        'id': doc['id'],
        'field_of_knoweledge': doc['field_of_knoweledge'],
        # 'specific_objectives': doc['specific_objectives'], #!
        # 'expected_impact': doc['expected_impact'], #!
        'topic_description': doc['topic_description'],  #!
        'allowed_participants': doc['allowed_participants'],
Example #17
def main():
    args = docopt(
        '''Compute BERT embeddings for target words and calculate change metrics.

    Usage:
        diasense.py <language> <corpus1> <corpus2> <targets> <result_path> <sent_limit>

        <language> = english, german, swedish or latin
        <corpus1> = path to corpus1 (txt-file)
        <corpus2> = path to corpus2 (txt-file)
        <targets> = path to target words (txt-file)
        <result_path> = path to directory for results
        <sent_limit> = number of sentences considered in calculation of BERT embeddings in each corpus
        
    ''')

    language = args['<language>']
    corpus1 = args['<corpus1>']
    corpus2 = args['<corpus2>']
    targets = args['<targets>']
    result_path = args['<result_path>']
    sent_limit = int(args['<sent_limit>'])

    with open(targets, 'r') as target_in:
        target_list = [line.rstrip() for line in target_in]

    metrics_path = result_path + '/metrics'

    if not os.path.exists(metrics_path):
        os.mkdir(metrics_path)

    metrics = metrics_path + '/metrics.txt'
    delta_later_out = metrics_path + '/delta_later.txt'
    delta_compare_out = metrics_path + '/delta_compare.txt'

    metrics = open(metrics, 'w')
    delta_later_out = open(delta_later_out, 'w')
    delta_compare_out = open(delta_compare_out, 'w')

    metrics.write(
        'TARGET\tEARLIER\tEARLIER_STD\tLATER\tLATER_STD\tCOMPARE\tCOMPARE_MIXED\tDELTA_LATER\tDELTA_COMPARE\n'
    )

    with open(corpus1, 'r') as corpus:
        corpus1 = [line.rstrip() for line in corpus]

    with open(corpus2, 'r') as corpus:
        corpus2 = [line.rstrip() for line in corpus]

    #get BERT sentence encoder; bert-as-service should be started beforehand (separately in your terminal)
    #recommendation: bert-serving-start -pooling_strategy NONE -show_tokens_to_client -model_dir multi_cased_L-12_H-768_A-12 -max_seq_len=128
    bc = BertClient(check_length=False)

    for target in target_list:

        #get sentences in which target occurs
        sentences_c1 = get_sentences(corpus1, target)
        sentences_c2 = get_sentences(corpus2, target)
        if len(sentences_c1) > sent_limit:
            sentences_c1 = sentences_c1[0:sent_limit]
        if len(sentences_c2) > sent_limit:
            sentences_c2 = sentences_c2[0:sent_limit]

        #corpus 1

        #get sentence embeddings (embeddings and tokenization) for sentences which contain target word in corpus1
        embed_target_c1, tokens_target_c1 = bc.encode(sentences_c1,
                                                      show_tokens=True)
        #get word embeddings for target words
        target_embeddings_c1 = word_embeddings(embed_target_c1,
                                               tokens_target_c1, target)

        #earlier
        earlier_dist = []

        #get all distances between target word embeddings in corpus1
        for i, embed in enumerate(target_embeddings_c1):
            j = 1
            while i + j in range(len(target_embeddings_c1)):
                dist = cosine(target_embeddings_c1[i],
                              target_embeddings_c1[i + j])
                earlier_dist.append(dist)
                j += 1

        #mean of all distances in corpus1  (=earlier)
        earlier = np.mean(np.array(earlier_dist))
        #standard deviation in earlier
        earlier_std = np.std(np.array(earlier_dist), axis=0)

        #corpus2

        #get sentence embeddings (embeddings and tokenization) for sentences which contain target word in corpus2
        embed_target_c2, tokens_target_c2 = bc.encode(sentences_c2,
                                                      show_tokens=True)
        #get word embeddings for target words
        target_embeddings_c2 = word_embeddings(embed_target_c2,
                                               tokens_target_c2, target)

        #later
        later_dist = []

        #get all distances between target word embeddings in corpus2
        for i, embed in enumerate(target_embeddings_c2):
            j = 1
            while i + j in range(len(target_embeddings_c2)):
                dist = cosine(target_embeddings_c2[i],
                              target_embeddings_c2[i + j])
                later_dist.append(dist)
                j += 1

        #mean of all distances in corpus2  (=later)
        later = np.mean(np.array(later_dist))
        #standard deviation in later
        later_std = np.std(np.array(later_dist), axis=0)

        #delta_later
        delta_later = later - earlier

        #compare
        compare_dist = []

        #get all distances between pairs of target word embeddings, where one embedding is from corpus1 and the other from corpus2
        for embed in target_embeddings_c1:
            for embed2 in target_embeddings_c2:
                dist = cosine(embed, embed2)
                compare_dist.append(dist)

        #mean of distances between pairs (=compare)
        compare = np.mean(np.array(compare_dist))

        #compare_mixed
        all_embeddings = np.concatenate(
            (target_embeddings_c1, target_embeddings_c2), axis=0)

        mixed_dist = []

        #get all distances between all target word embeddings in corpus1 and corpus2
        for i, embed in enumerate(all_embeddings):
            j = 1
            while i + j in range(len(all_embeddings)):
                dist = cosine(all_embeddings[i], all_embeddings[i + j])
                mixed_dist.append(dist)
                j += 1

        #mean of distances between all target word embeddings
        compare_mixed = np.mean(np.array(mixed_dist))

        #delta_compare (here redefined as compare - compare_mixed)
        delta_compare = abs(compare - compare_mixed)

        metrics.write(target + '\t' + str(earlier) + '\t' + str(earlier_std) +
                      '\t' + str(later) + '\t' + str(later_std) + '\t' +
                      str(compare) + '\t' + str(compare_mixed) + '\t' +
                      str(delta_later) + '\t' + str(delta_compare) + '\n')
        delta_later_out.write(target + '\t' + str(delta_later) + '\n')
        delta_compare_out.write(target + '\t' + str(delta_compare) + '\n')
Example #18
from elasticsearch import Elasticsearch
from bert_serving.client import BertClient
from elasticsearch.exceptions import ConnectionError, NotFoundError

# total number of responses
SEARCH_SIZE = 1

# establishing connections
bc = BertClient(ip='localhost', output_fmt='list', check_length=False)
client = Elasticsearch('localhost:9200')

# this query is used as the search term, feel free to change
query = 'machine learning'
query_vector = bc.encode([query])[0]

script_query = {
    "script_score": {
        "query": {
            "match_all": {}
        },
        "script": {
            "source":
            "cosineSimilarity(params.query_vector, doc['abstract_vector']) + 1.0",
            "params": {
                "query_vector": query_vector
            }
        }
    }
}

try:
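    # The body of the try block is not shown in the snippet; a minimal sketch of how
    # the script_score query could be submitted (the index name 'papers' is an
    # assumption, elasticsearch-py 7.x style):
    response = client.search(index='papers',
                             body={'size': SEARCH_SIZE, 'query': script_query})
    for hit in response['hits']['hits']:
        print(hit['_score'], hit['_source'].get('title'))
except ConnectionError:
    print('Could not connect to Elasticsearch at localhost:9200')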
Example #19
    args = sys.argv

    if args[1] == 'w2v':
        print('loading w2v model...')
        model = api.load('word2vec-google-news-300')
        model_abbr = 'w2v'

        embed_html = word2vec_avg

        main()

    elif args[1] == 'bert':

        if args[2] == 'server':

            bc = BertClient(ip='iccluster037.iccluster.epfl.ch',
                            check_length=False)

            print('connection with server established')

            def bert_avg_server(body):
                return bert_avg(body, server=True, bc=bc)

            embed_html = bert_avg_server

        else:

            print('loading bert model...')
            tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
            bert_model = BertModel.from_pretrained('bert-base-cased',
                                                   output_hidden_states=True)
            bert_model.eval()
Example #20
    no_improve = 0

    valid_slot = 0
    test_slot = 0
    valid_intent = 0
    test_intent = 0
    valid_err = 0
    test_err = 0
    best_epoch_num = 0
    eval_loss = 0.0

    result_records = []

    if arg.use_bert:
        from bert_serving.client import BertClient
        bc = BertClient(ip=arg.bert_ip)

    while True:
        if data_processor == None:

            # For unk purpose
            if arg.use_unk == True:
                unker = UNKer(os.path.join(full_train_path, arg.input_file),
                              os.path.join(
                                  full_train_path,
                                  arg.input_file + ".unk." + arg.unk_priority),
                              os.path.join(full_train_path, arg.slot_file),
                              ratio=arg.unk_ratio,
                              threshold=arg.unk_threshold,
                              priority=arg.unk_priority)
                data_processor = DataProcessor(
Example #21
'''Install bert-serving from pip:
pip install bert-serving-server
pip install bert-serving-client
Then place the downloaded Chinese pre-trained model chinese_L-12_H-768_A-12 where the server can load it and start it.
In cmd run: bert-serving-start -model_dir (path to the Chinese model) -num_worker=1
Once the server has started, the client can be called to convert text into word vectors.'''

from bert_serving.client import BertClient
import numpy as np
import pandas as pd
import time
import tensorflow as tf


bc = BertClient(port=5555, port_out=5556)

def ner_test():
    with BertClient(show_server_config=False, check_version=False, check_length=False) as bc:
        start_t = time.perf_counter()
        str1 = '1月24日,新华社对外发布了中央对雄安新区的指导意见,洋洋洒洒1.2万多字,17次提到北京,4次提到天津,信息量很大,其实也回答了人们关心的很多问题。'
        str1 = list(str1)
        rst = bc.encode([str1], is_tokenized=True)
        print('rst:', rst)
        print(len(rst[0]))
        print(time.perf_counter() - start_t)
# file=pd.read_csv("dd.txt")
# data=file.to_string()
# for line in data:
#     result.append(line.strip("\n"))
# class_test()
ner_test()
Example #22
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
from bert_serving.client import BertClient

bc = BertClient(check_length=True)

torch.manual_seed(1)

raw_data = [("How are you? I am well".lower(), [0, 0, 0, 0, 0, 1]),
            ("Who are you? I am me".lower(), [0, 0, 0, 0, 0, 1]),
            ("What are you? I am me".lower(), [0, 0, 0, 1, 1, 0])]

training_data = []

#### CURRENTLY TAKES FIRST WORD VECTOR FOR TESTING———NEEDS UPDATE #####
for dataPoint in raw_data:
    sentence = dataPoint[0]
    sentenceVec = [num for num in bc.encode([sentence])[0][1]]
    training_data.append((torch.tensor(sentenceVec), dataPoint[1]))

tag_to_ix = {0: 0, 1: 1}

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 30

Example #23
from elasticsearch import Elasticsearch
from bert_serving.client import BertClient
import itertools

bc = BertClient(ip='10.51.101.101', check_length=False)
es = Elasticsearch(['https://cypher.es.eu-central-1.aws.cloud.es.io:9243'],
                   http_auth=('elastic', 'UVrF6kyW58KrBzxoffp2YRKH'))


def remove_duplicates_from_list(combined):
    ret = []
    checked = []
    for c in combined:
        if c[0] not in checked:
            ret.append(c)
            checked.append(c[0])
    return ret


def findRelevantHits(in_query):
    in_query_vector = bc.encode([in_query])[0].tolist()
    queries = {
        'bert': {
            "script_score": {
                "query": {
                    "match_all": {}
                },
                "script": {
                    "source":
                    "cosineSimilarity(params.in_query_vector, doc['vector']) + 1.0",
                    "params": {
Example #24
    def build_nn_graph(self, instance):
        word_vec = self.word_embed(instance.word_seq.unsqueeze(0))
        # generate bert word embedding, not finetune
        if self.bert_emb > 0:
            from bert_serving.client import BertClient
            bc = BertClient(port=8880)
            tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

            tokens = []
            orig_to_tok_index = []  # orig_to_tok_index[i] = index of word i's first word-piece in tokens
            for i, word in enumerate(instance.input):
                orig_to_tok_index.append(len(tokens))
                word_tokens = tokenizer.tokenize(word)
                for sub_token in word_tokens:
                    tokens.append(sub_token)
            vec = bc.encode([tokens], show_tokens=True, is_tokenized=True)
            vec = vec[0][:, 1:, :][:, orig_to_tok_index, :]
            bert_vec = torch.tensor(vec).to(word_vec.device)

        word_rep = [word_vec]
        if self.char_emb_size > 0:
            char_seq_tensor = instance.char_seq_tensor.unsqueeze(0)
            char_seq_len = instance.char_seq_len.unsqueeze(0)
            char_features = self.char_bilstm.get_last_hiddens(
                char_seq_tensor, char_seq_len)
            word_rep.append(char_features)
        word_rep = torch.cat(word_rep, 2)

        # concatenate the BERT word embedding
        if self.bert_emb > 0:
            word_rep = torch.cat((word_rep, bert_vec), 2)

        word_rep = self.word_drop(word_rep)
        lstm_out, (hn, cn) = self.rnn(word_rep, None)
        lstm_out = self.lstm_drop(lstm_out)
        lstm_out = lstm_out.squeeze(0)
        linear_output = self.linear(lstm_out).squeeze(0)
        #score of each node
        instance_len = instance.size()
        lstm_hidden_size = self.lstm_hidden_size

        seg_embs = {}
        for i in range(instance_len):
            for j in range(i, instance_len):
                if i == 0 and j + 1 == instance_len:
                    segment_emb = torch.cat([
                        lstm_out[j][:lstm_hidden_size],
                        lstm_out[i][lstm_hidden_size:]
                    ], 0)
                elif i == 0 and j + 1 < instance_len:
                    segment_emb = torch.cat([
                        lstm_out[j][:lstm_hidden_size],
                        lstm_out[i][lstm_hidden_size:] -
                        lstm_out[j + 1][lstm_hidden_size:]
                    ], 0)
                elif i > 0 and j + 1 == instance_len:
                    segment_emb = torch.cat([
                        lstm_out[j][:lstm_hidden_size] -
                        lstm_out[i - 1][:lstm_hidden_size],
                        lstm_out[i][lstm_hidden_size:]
                    ], 0)
                else:
                    segment_emb = torch.cat([
                        lstm_out[j][:lstm_hidden_size] -
                        lstm_out[i - 1][:lstm_hidden_size],
                        lstm_out[i][lstm_hidden_size:] -
                        lstm_out[j + 1][lstm_hidden_size:]
                    ], 0)
                seg_embs[i, j] = segment_emb

        span_score = {}
        polar_score = {}

        for i in range(instance_len):
            for j in range(i, instance_len):
                span_score[i, j] = self.linear_span(seg_embs[i, j])

        offset = [i for i in range(self.pos_embed_range_max)]
        offset = torch.LongTensor(offset)
        offset_score = self.pos_embed(offset)
        offset_score = self.pos_embed_linear(offset_score)

        zero_col = torch.zeros(1, self.label_size).to(NetworkConfig.DEVICE)
        return torch.cat(
            [linear_output, zero_col],
            0), span_score, polar_score, offset_score, lstm_out, seg_embs
Example #25
from bert_serving.client import BertClient
import tensorflow as tf
import numpy as np
import os
import pandas as pd
import time
import progressbar

bc = BertClient()  # pass ip='...' here if the bert-serving server runs on a remote GPU machine

dir_name = "/Users/jaswanttummala/downloads/questions.csv"
question1 = np.array(pd.read_csv(dir_name, usecols=["question1"]))
question2 = np.array(pd.read_csv(dir_name, usecols=["question2"]))
question1 = question1.tolist()
question2 = question2.tolist()
temp1 = []
temp2 = []
for i in range(len(question1)):
    temp1.append(str(question1[i][0]))
    temp2.append(str(question2[i][0]))

temp1.append("")
temp2.append("")


def divide_chunks(l, n):

    # looping till length l
    for i in range(0, len(l), n):
        yield l[i:i + n]
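
A possible follow-up (not in the original snippet): encode the questions in fixed-size batches so the whole list is not sent to the server in one call. The chunk size of 256 and the filtering of empty strings are assumptions.

question_texts = [q for q in temp1 if q.strip()]  # bert-as-service rejects empty strings
vectors = [bc.encode(chunk) for chunk in divide_chunks(question_texts, 256)]
question1_vecs = np.concatenate(vectors, axis=0)
print(question1_vecs.shape)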
Example #26
import sys

from bert_serving.client import BertClient


def send_without_block(bc, data, repeat=10):
    # encoding without blocking:
    print('sending all data without blocking...')
    for _ in range(repeat):
        bc.encode(data, blocking=False)
    print('all sent!')


if __name__ == '__main__':
    bc = BertClient(port=int(sys.argv[1]), port_out=int(sys.argv[2]))
    num_repeat = 20

    with open('../README.md') as fp:
        data = [v for v in fp if v.strip()]

    send_without_block(bc, data, num_repeat)

    num_expect_vecs = len(data) * num_repeat

    # then fetch all
    print('now waiting until all results are available...')
    vecs = bc.fetch_all(concat=True)
    print('received %s, expected: %d' % (vecs.shape, num_expect_vecs))

    # now send it again
Example #27
import sys
try:
    import numpy as np
    from sklearn.cluster import KMeans
    from bert_serving.client import BertClient
    from sklearn.metrics import pairwise_distances_argmin_min
    from flask import Flask, jsonify, request
    from flask_cors import CORS
    from nltk import sent_tokenize
except ImportError:
    sys.exit('Error importing modules')

app = Flask(__name__)
CORS(app)

bc = BertClient(check_length=False)


@app.route('/summary', methods=['POST'])
def summary():
    req = request.get_json(force=True)
    text = req['text']
    sent_list = sent_tokenize(text)
    sent_list = [sent for sent in sent_list if len(sent) > 20]
    encoded = bc.encode(sent_list).tolist()

    n_clusters = int(np.ceil(len(encoded)**0.5))

    kmeans = KMeans(n_clusters=n_clusters)
    kmeans = kmeans.fit(encoded)
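    # (continuation not shown in the snippet) A minimal sketch of the usual next step:
    # pick the sentence closest to each cluster centre with the already-imported
    # pairwise_distances_argmin_min and return the joined result; the response format
    # below is an assumption.
    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, np.array(encoded))
    summary_text = ' '.join(sent_list[idx] for idx in sorted(closest))
    return jsonify({'summary': summary_text})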
Example #28
def validate(model, dataloader):
    """
    Compute the loss and accuracy of a model on some validation dataset.

    Args:
        model: A torch module for which the loss and accuracy must be
            computed.
        dataloader: A DataLoader object to iterate over the validation data.
        criterion: A loss criterion to use for computing the loss.
        epoch: The number of the epoch for which validation is performed.
        device: The device on which the model is located.

    Returns:
        epoch_time: The total time to compute the loss and accuracy on the
            entire validation set.
        epoch_loss: The loss computed on the entire validation set.
        epoch_accuracy: The accuracy computed on the entire validation set.
    """
    criterion = nn.CrossEntropyLoss(reduction='none')
    criterion_all = nn.CrossEntropyLoss()
    l2dist = PairwiseDistance(2)

    # Note: the model is kept in train mode here (see model.train() below), not eval mode.
    running_loss_entailment, running_loss_neutral, running_loss_contradiction = 0.0, 0.0, 0.0
    adv_loss_entailment, adv_loss_neutral, adv_loss_contradiction = None, None, None

    model.train()
    device = model.device

    epoch_start = time.time()
    running_loss = 0.0
    running_accuracy = 0.0
    total_num = 0

    bc = BertClient(check_length=False)
    batch = dataloader
    # Deactivate autograd for evaluation.
    for batch_index in range(len(dataloader['labels'])):
        # Move input and output data to the GPU if one is used.
        premises = torch.tensor(bc.encode(
            batch["premises"][batch_index])).to(device)
        hypotheses = torch.tensor(bc.encode(
            batch["hypotheses"][batch_index])).to(device)
        labels = torch.tensor(batch["labels"][batch_index]).to(device)

        logits, probs, _ = model(premises, hypotheses)
        pred = torch.argmax(logits, dim=1)

        loss = criterion(logits, labels)

        running_loss += loss.sum().item()
        # running_accuracy += correct_predictions(probs, labels)
        total_num += len(labels)

        np_labels = labels.cpu().numpy()
        np_loss = loss.detach().cpu().numpy()
        np_pred = pred.detach().cpu().numpy()

        # 'entailment': 0, 'neutral': 1, 'contradiction': 2
        running_loss_entailment += np_loss[(
            np_labels == 0)].sum()  # &(np_pred==1)
        running_loss_neutral += np_loss[(
            np_labels == 1)].sum()  # &(np_pred==0)
        running_loss_contradiction += np_loss[(
            np_labels == 2)].sum()  # &(np_pred==1)

        # adv
        premises_adv, hypotheses_adv = fgsm(
            premises, hypotheses, pred, model,
            criterion_all)  # eps=0.05, if_infnity=True
        logits_adv, probs_adv, _ = model(premises_adv, hypotheses_adv)

        running_accuracy += correct_predictions(probs, labels)

        adv_loss = ShannonEntropy(logits_adv, probs)
        # adv_loss = criterion(logits_adv, pred)
        np_adv_loss = adv_loss.detach().cpu().numpy()
        # np_probs_adv = torch.max(probs_adv, dim=1)[0].detach().cpu().numpy()

        if batch_index == 0:
            adv_loss_entailment = np_adv_loss[np_labels == 0]
            adv_loss_neutral = np_adv_loss[np_labels == 1]
            adv_loss_contradiction = np_adv_loss[np_labels == 2]
        else:
            adv_loss_entailment = np.concatenate(
                (adv_loss_entailment, np_adv_loss[(np_labels == 0)]), axis=0)
            adv_loss_neutral = np.concatenate(
                (adv_loss_neutral, np_adv_loss[(np_labels == 1)]), axis=0)
            adv_loss_contradiction = np.concatenate(
                (adv_loss_contradiction, np_adv_loss[(np_labels == 2)]),
                axis=0)

        # if batch_index == 10:
        #     break

    epoch_time = time.time() - epoch_start
    epoch_accuracy = running_accuracy / total_num
    print(running_loss_entailment, running_loss_neutral,
          running_loss_contradiction)

    # losses = np.concatenate((adv_loss_pos, adv_loss_neg), axis=0)
    # labels = np.concatenate((np.ones_like(adv_loss_pos), np.zeros_like(adv_loss_neg)), axis=0)
    # auc_score = roc_auc(labels, losses)
    adv_loss_entailment = adv_loss_entailment[adv_loss_entailment < 1.5]
    adv_loss_neutral = adv_loss_neutral[adv_loss_neutral < 1.5]
    adv_loss_contradiction = adv_loss_contradiction[
        adv_loss_contradiction < 1.5]
    creterion_func(adv_loss_entailment, adv_loss_neutral,
                   adv_loss_contradiction)
    # print('[ROC_AUC] score: %.2f%%' % (100. * auc_score))

    return epoch_time, epoch_accuracy
Example #29
from bert_serving.client import BertClient
import datetime

bc = BertClient()
m = 2

#def trigger_func(m):
file_dir = './why_merged_' + str(m) + '_set.tsv'
trigger = []
with open(file_dir, 'r') as f:
    line = f.readline()
    while line:
        trigger.append(line[:-1])
        line = f.readline()


print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))  

node_feat_vec_H0 = bc.encode(trigger)

print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

node_feat_vec_H0.tofile('./node_feat_vec_H0_cutoff_' + str(m) + '.txt')

print(node_feat_vec_H0.shape)
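
Note that ndarray.tofile writes raw bytes with no shape or dtype header, so reading the file back requires both to be known. A minimal sketch, assuming the server's default float32 output and a 768-dimensional base model:

import numpy as np

loaded = np.fromfile('./node_feat_vec_H0_cutoff_' + str(m) + '.txt', dtype=np.float32)
loaded = loaded.reshape(-1, 768)
print(loaded.shape)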
Example #30
 def __init__(self):
     self.bc = BertClient()