class ActionGetFAQAnswer(Action):

    def __init__(self):
        super(ActionGetFAQAnswer, self).__init__()
        self.faq_data = json.load(
            open("./data/nlu/faq.json", "rt", encoding="utf-8"))
        self.sentence_embedding_choose(sentence_transformer_select,
                                       pretrained_model)
        self.standard_questions_encoder = np.load(
            "./data/standard_questions.npy")
        self.standard_questions_encoder_len = np.load(
            "./data/standard_questions_len.npy")
        print(self.standard_questions_encoder.shape)

    def sentence_embedding_choose(self,
                                  sentence_transformer_select=True,
                                  pretrained_model='bert-base-nli-mean-tokens'):
        self.sentence_transformer_select = sentence_transformer_select
        if sentence_transformer_select:
            self.bc = SentenceTransformer(pretrained_model)
        else:
            self.bc = BertClient(check_version=False)

    def get_most_similar_standard_question_id(self, query_question):
        if self.sentence_transformer_select:
            query_vector = torch.tensor(
                self.bc.encode([query_question])[0]).numpy()
        else:
            query_vector = self.bc.encode([query_question])[0]
        print("Question received at action engineer")
        # Cosine similarity between the query vector and every standard question.
        score = np.sum((self.standard_questions_encoder * query_vector),
                       axis=1) / (self.standard_questions_encoder_len *
                                  (np.sum(query_vector * query_vector)**0.5))
        top_id = np.argsort(score)[::-1][0]
        return top_id, score[top_id]

    def name(self) -> Text:
        return "action_get_answer"

    def run(self, dispatcher: CollectingDispatcher, tracker: Tracker,
            domain: Dict[Text, Any]) -> List[Dict[Text, Any]]:
        query = tracker.latest_message['text']
        print(query)
        most_similar_id, score = self.get_most_similar_standard_question_id(
            query)
        print("The question is matched with id:{} with score: {}".format(
            most_similar_id, score))
        # The confidence threshold can be adjusted to suit your needs.
        if float(score) > score_threshold:
            response = self.faq_data[most_similar_id]['a']
            dispatcher.utter_message(response)
            dispatcher.utter_message("Problem solved?")
        else:
            response = "Sorry, this question is beyond my ability..."
            dispatcher.utter_message(response)
            dispatcher.utter_message(
                "Sorry, I can't answer your question. You can dial the manual service..."
            )
        return []
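# A hedged sketch (not from the original project) of how the precomputed
# matrices loaded above might be generated: encode every standard question in
# faq.json and save the embedding matrix plus the per-row L2 norms that the
# cosine-similarity scoring in the action expects. The 'q' key is an
# assumption, inferred from the action reading faq_data[...]['a'] for answers.
import json
import numpy as np
from sentence_transformers import SentenceTransformer

faq_data = json.load(open("./data/nlu/faq.json", "rt", encoding="utf-8"))
standard_questions = [item['q'] for item in faq_data]
encoder = SentenceTransformer('bert-base-nli-mean-tokens')
standard_questions_encoder = np.array(encoder.encode(standard_questions))
standard_questions_encoder_len = np.sqrt(
    np.sum(standard_questions_encoder * standard_questions_encoder, axis=1))
np.save("./data/standard_questions.npy", standard_questions_encoder)
np.save("./data/standard_questions_len.npy", standard_questions_encoder_len)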
def main(captions_file: str, output: str, embedding_size: int = 768,
         train: bool = True):
    df = pd.read_json(captions_file)
    bc = BertClient()
    if train:
        captions = df.caption.values
        bert_sentence_embeddings = np.zeros((len(captions), embedding_size))
        for i in tqdm(range(len(captions))):
            caption = captions[i]
            bert_sentence_embeddings[i] = bc.encode([caption])
    else:
        bert_sentence_embeddings = {}
        for i in tqdm(range(len(df))):
            sub_df = df.iloc[i]
            key = sub_df['num']
            caption = sub_df.caption
            value = bc.encode([caption])
            if key not in bert_sentence_embeddings.keys():
                bert_sentence_embeddings[key] = [value]
            else:
                bert_sentence_embeddings[key].append(value)
    with open(output, 'wb') as f:
        pickle.dump(bert_sentence_embeddings, f)
class PhraseEmbedding:

    def __init__(self):
        self.bc = BertClient()
        print('phrase embedding...')

    def get_embedding(self, phrase):
        encoded_phrase = self.bc.encode([phrase])
        return encoded_phrase

    def compare_phrases(self, phrase1, phrase2):
        phrase1_encode = self.bc.encode([phrase1])
        phrase2_encode = self.bc.encode([phrase2])
        cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        output = cos(torch.tensor(phrase1_encode),
                     torch.tensor(phrase2_encode))
        print('comparison score : ', output)
        return output
def getData():
    datasize = data_num * 2
    X = [[] for i in range(datasize)]
    Y = [0 for i in range(datasize)]
    data = get_atecQuestAns()
    bc = BertClient()
    for index in range(0, datasize, 2):
        tmp = data[int(index / 2)]
        # print(tmp[0], tmp[1], tmp[2])
        v1 = bc.encode([tmp[0]])
        v2 = bc.encode([tmp[1]])
        v3 = bc.encode([tmp[2]])
        # Positive pair (question, matching question) and negative pair.
        qq1_vec = np.append(v1, v2)
        qq2_vec = np.append(v1, v3)
        print(qq2_vec)
        X[index] = qq1_vec.tolist()
        X[index + 1] = qq2_vec.tolist()
        Y[index] = 1
        Y[index + 1] = 0
        if index % 100 == 0:
            print(index, 'is finished')
    X_train = np.array(X)
    Y_train = np.array(Y)
    np.save(path + '/data/Y_qa_all_data.npy', Y_train)
    np.save(path + '/data/X_qa_all_data.npy', X_train)
    print(X_train.shape)
    print(Y_train.shape)
    print('saved X train')
def get_bert(des):
    des['len'] = des['intro'].str.len()
    des.set_index('company', inplace=True)
    # BERT's maximum sequence length is 512, so longer intros are split in two.
    short = des[des['intro'].str.len() <= 512]
    long = des[(des['intro'].str.len() > 512)
               & (des['intro'].str.len() < 1024)]
    long_first_part = long['intro'].str[:512]
    long_second_part = long['intro'].str[512:]
    long_second_part = long_second_part[long_second_part.str.len() > 100]
    short_intro = short['intro'].values.tolist()
    long_first_part_intro = long_first_part.values.tolist()
    long_second_part_intro = long_second_part.values.tolist()
    bc = BertClient()
    short_embedding = bc.encode(short_intro)
    long_first_part_embedding = bc.encode(long_first_part_intro)
    long_second_part_embedding = bc.encode(long_second_part_intro)
    short_embedding = pd.DataFrame(short_embedding, index=short.index)
    long_first_part_embedding = pd.DataFrame(long_first_part_embedding,
                                             index=long_first_part.index)
    long_second_part_embedding = pd.DataFrame(long_second_part_embedding,
                                              index=long_second_part.index)
    # Average the two halves of long intros wherever a second half exists.
    temp = long_first_part_embedding.reindex(long_second_part_embedding.index)
    temp = (temp + long_second_part_embedding) / 2
    long_first_part_embedding.loc[temp.index, :] = temp
    return pd.concat([short_embedding, long_first_part_embedding])
def generate_features(QR_QA_path, read_pattern):
    data_csv = csv.reader(open(QR_QA_path, "r"))
    # bc = BertClient(ip='222.25.172.41')
    bc = BertClient(check_length=False)
    # Print progress, since encoding can be slow.
    print('fetching features for ', QR_QA_path)
    QR_words = []
    questions = []
    reviews = []
    labels = []
    for item in data_csv:
        if len(item) >= 3:
            question = item[1].strip()
            review = item[2].strip()
            if read_pattern == "QR":
                if item[4].strip() != '':
                    label = int(item[4].strip())
                else:
                    label = 0
            else:
                if item[3].strip() != '':
                    label = int(item[3].strip())
                else:
                    label = 0
            QR_words.append((question, review, label))
            questions.append(question)
            reviews.append(review)
            labels.append(label)
    questions = bc.encode(questions)
    reviews = bc.encode(reviews)
    y = np.asarray(labels)
    return questions, reviews, y
def analyzeResponses():
    if request.is_json:
        data_dict = request.get_json()
        text = data_dict["text"]
        # ip address of the BERT server (e.g. a GPU machine)
        bc = BertClient(ip='localhost')
        with open('./server/data/answer-corpus.csv') as readFile:
            answers = [i.strip() for i in readFile.readlines()]
        # encode the corpus as an array of strings
        doc_vecs = bc.encode(answers)
        query_vec = bc.encode([text])[0]
        # convert to torch tensors
        tensor_query_vec = torch.from_numpy(query_vec)
        tensor_doc_vecs = torch.from_numpy(doc_vecs)
        # compute normalized dot product as score
        tensor_input = tensor_query_vec * tensor_doc_vecs
        score = torch.sum(tensor_input, 1) / \
            torch.norm(tensor_doc_vecs, dim=1)
        # take the indices of the highest-scoring answers
        topk_idx = torch.topk(score, 1).indices
        scores = []
        for idx in topk_idx:
            print('> %s\t%s' % (score[idx], answers[idx]))
            scores.append(answers[idx])
        print(f'Scores: {scores}')
        return jsonify(scores[-1]), 201
def chatbot_sentence_vec_by_bert_bertasserver():
    """BERT encoding using bert-as-service."""
    from conf.path_config import chicken_and_gossip_path
    from bert_serving.client import BertClient
    from utils.text_tools import txtRead
    import numpy as np

    topk = 5
    matrix_ques_save_path = "doc_vecs_chicken_and_gossip"
    questions = txtRead(chicken_and_gossip_path, encodeType='utf-8')
    ques = [ques.split('\t')[0] for ques in questions][0:100]

    bc = BertClient(ip='localhost')
    doc_vecs = bc.encode(ques)
    np.savetxt(matrix_ques_save_path, doc_vecs)
    # matrix_ques = np.loadtxt(matrix_ques_save_path)

    while True:
        query = input('You ask: ')
        query_vec = bc.encode([query])[0]
        # compute normalized dot product as score
        score = np.sum(query_vec * doc_vecs, axis=1) / np.linalg.norm(
            doc_vecs, axis=1)
        topk_idx = np.argsort(score)[::-1][:topk]
        for idx in topk_idx:
            print('Xiaojiang bot answers: %s\t%s' % (score[idx], questions[idx]))
def encode(X_correct):
    bc = BertClient()
    if isinstance(X_correct, list):
        X_enc = bc.encode(X_correct)
    else:
        X_enc = bc.encode(list(X_correct))
    return X_enc
def papreData():
    # Load the ATEC NLP data (18,668 rows in total).
    file_dir = 'G:/tf-start/Implementation-of-Question-Answering-System/data/atec_nlp1.csv'
    bc = BertClient()
    setDataNum = 2000
    with open(file_dir, 'r', encoding='utf-8') as csvfile:
        read = csv.reader(csvfile)
        X = [[[] for i in range(2)] for j in range(setDataNum + 1)]
        index = 0
        for i in read:
            # print(i[0], i[1], i[2])
            tmp0 = bc.encode([i[0]])
            tmp1 = bc.encode([i[1]])
            tmp2 = bc.encode([i[2]])
            # print(tmp0, tmp1, tmp2)
            qq1_vec = np.append(tmp0, tmp1)
            qq2_vec = np.append(tmp0, tmp2)
            # print(qq1_vec == qq2_vec)
            X[index][0] = qq1_vec.tolist()
            X[index][1] = qq2_vec.tolist()
            index += 1
            if index % 100 == 0:
                print(index)
            if index > setDataNum:
                break
    X1 = np.array(X)
    np.save("x1_2000.npy", X1)
    print('Data import and preprocessing complete------------------')
def parse_symptoms(user_text):
    '''Parse symptoms and run them through BERT.'''
    bc = BertClient(check_length=False)
    with open(SYMPTOMS_FILE) as f:
        symptoms = json.load(f)
    # Strip punctuation, lemmatize the user text, and drop pronouns.
    user_text = user_text.translate(str.maketrans('', '', string.punctuation))
    new_user_text = ""
    for token in nlp(user_text):
        if token.lemma_ != '-PRON-':
            new_user_text += token.lemma_ + " "
    word_tokens = word_tokenize(new_user_text.strip())
    filtered_sentence = " ".join(
        [w for w in word_tokens if not w in stop_words])
    symptoms = list(symptoms.keys())
    # Build stop-word-filtered versions of the symptom names.
    symptom_sentences = list()
    for symptom in symptoms:
        word_tokens = word_tokenize(symptom.strip())
        sentence = " ".join([w for w in word_tokens if not w in stop_words])
        symptom_sentences.append(sentence)
    encodings = bc.encode(symptoms)
    user_text_new = bc.encode([filtered_sentence.strip()])
    length = len(encodings)
    return symptoms, user_text_new, encodings, length
def get_word_embeddings(data):
    bc = BertClient()
    # bc.encode(['First do it', 'then do it right', 'then do it better'])
    embeddings = []
    sentiment_embeddings = []
    bar = ChargingBar('Calculating tweet embeddings\t\t\t', max=len(data))
    for instance in data:
        # Encoding the joined tokens is preferred; falling back to the raw
        # tweet is a workaround for an empty tokens array.
        if len(instance['tokens']) == 0:
            embedding = bc.encode([instance['tweet']])
        else:
            embedding = bc.encode([' '.join(instance['tokens'])])
        embeddings.append(embedding)
        sentiment_embeddings.append({
            "embedding": embedding[0],
            "sentiment": instance['sentiment']
        })
        bar.next()
    bar.finish()
    # print(embeddings)
    # print(len(embeddings), len(embeddings[0]), len(embeddings[0][0]))
    return embeddings, sentiment_embeddings
def create_1000_case_test():
    li = []
    bc = BertClient(ip='222.19.197.230', port=5555, port_out=5556,
                    check_version=False)
    test_text = pre_deal.get_test_textVector()
    zero_vector = np.zeros((500, 768))
    for i in range(0, len(test_text)):
        x = tokenize.word_tokenize(test_text[i])
        if len(x) > 502:
            # Texts longer than BERT's limit are split around the 500th token.
            index = KMP.KMP_algorithm(test_text[i], x[500] + " " + x[501])
            if index != -1:
                sentences = [test_text[i][0:index], test_text[i][index:]]
                vector = bc.encode(sentences)
                ve = np.concatenate((vector[0], vector[1]), axis=0)
                li.append(ve.tolist())
            else:
                vector = bc.encode([test_text[i]])
                ve = np.concatenate((vector[0], zero_vector), axis=0)
                li.append(ve.tolist())
        else:
            vector = bc.encode([test_text[i]])
            ve = np.concatenate((vector[0], zero_vector), axis=0)
            li.append(ve.tolist())
    li_vector = np.array(li)
    np.save("test_case_1000.npy", li_vector)
def main(sentences_file, queries_file, output_file):
    start = time.time()
    bc = BertClient(check_length=False)

    logger.info("Loading sentences and queries...")
    with open(sentences_file, "r") as f:
        corpus = list(set([line.strip() for line in f.readlines()]))
    with open(queries_file, "r") as f:
        queries = [line.strip() for line in f.readlines()]

    logger.info("Encoding sentences...")
    doc_vecs = bc.encode(corpus)

    n = 10
    top_k = 5
    logger.info("Computing top {} similar sentences to each of {} queries...".format(
        top_k, len(queries)))
    data = []
    for query in queries:
        query_vec = bc.encode([query])[0]
        top_k_list = get_query_top_k(query, query_vec, corpus, doc_vecs,
                                     max_n=n, top_k=top_k)
        data.extend(top_k_list)

    df = pd.DataFrame(data)
    df.to_csv(output_file, index=False, sep="\t")

    end = time.time()
    e = int(end - start)
    logger.info('Time elapsed is: {:02d}:{:02d}:{:02d}'.format(
        e // 3600, (e % 3600 // 60), e % 60))
def validate(model, dataloader, criterion):
    """
    Compute the loss and accuracy of a model on some validation dataset.

    Args:
        model: A torch module for which the loss and accuracy must be computed.
        dataloader: A DataLoader object to iterate over the validation data.
        criterion: A loss criterion to use for computing the loss.

    Returns:
        epoch_time: The total time to compute the loss and accuracy on the
            entire validation set.
        epoch_loss: The loss computed on the entire validation set.
        epoch_accuracy: The accuracy computed on the entire validation set.
    """
    # Switch to evaluate mode.
    model.eval()
    device = model.device

    epoch_start = time.time()
    running_loss = 0.0
    running_accuracy = 0.0
    total_num = 0
    # sub_len counts batches skipped on encoding errors (error handling is
    # currently disabled, so it stays at zero).
    sub_len = 0
    bc = BertClient(check_length=False)
    batch = dataloader

    # Deactivate autograd for evaluation.
    with torch.no_grad():
        for batch_index in range(len(dataloader['labels'])):
            # Move input and output data to the GPU if one is used.
            premises = torch.tensor(
                bc.encode(batch["premises"][batch_index])).to(device)
            hypotheses = torch.tensor(
                bc.encode(batch["hypotheses"][batch_index])).to(device)
            labels = torch.tensor(batch["labels"][batch_index]).to(device)

            logits, probs, adv_logits = model(premises, hypotheses)
            loss = criterion(logits, labels)

            running_loss += loss.item()
            running_accuracy += correct_predictions(probs, labels)
            total_num += len(labels)

    epoch_time = time.time() - epoch_start
    epoch_loss = running_loss / (len(dataloader['labels']) - sub_len)
    epoch_accuracy = running_accuracy / total_num

    return epoch_time, epoch_loss, epoch_accuracy
class TextSummarizer(object):

    def __init__(self, payload):
        self.payload = payload
        self.categories = ['i need a doctor', 'has the following symptoms',
                           'needs to show something']
        print("Attempting to connect to Bert instance.")
        self.bc = BertClient(check_length=False)  # ip="52.249.61.86"
        print("Connected to Bert instance.")
        print("Server status:", self.bc.status)

    def get_paragraphs(self):
        text = self.payload  # self.payload["data"]
        self.paragraphs = text.split("\n\n")
        self.sentences = []
        for paragraph in self.paragraphs:
            for sent in sent_tokenize(paragraph):
                self.sentences.append(sent)
        # Drop very short sentences (two words or fewer).
        cleaned_sentences = []
        for sentence in self.sentences:
            if len(sentence.split()) > 2:
                cleaned_sentences.append(sentence)
        self.sentences = cleaned_sentences

    def lowercase_text(self):
        # Lower-case all words in the sentences.
        for idx, sentence in enumerate(self.sentences):
            lower_cased_tokens = [word.lower() for word in sentence.split()]
            lower_cased_sentence = " ".join(lower_cased_tokens)
            if lower_cased_sentence:
                self.sentences[idx] = lower_cased_sentence.strip()

    def get_embeddings(self):
        self.question_embedding = self.bc.encode(self.sentences)
        self.category_embeddings = self.bc.encode(self.categories)
        self.length = len(self.categories)

    def build_similarity_matrix(self):
        self.similarity_matrix = np.zeros([self.length])
        for i in range(self.length):
            self.similarity_matrix[i] = cosine_similarity(
                [self.question_embedding[0]], [self.category_embeddings[i]])

    def get_important_category(self):
        reversed_sorted = np.argsort(self.similarity_matrix)[::-1]
        top_similarity = reversed_sorted[0]
        return self.categories[top_similarity]

    def get_summary(self):
        '''Returns the category that best matches the input text.'''
        self.get_paragraphs()
        print("Cleaning text input")
        self.lowercase_text()
        print("Generating embeddings")
        self.get_embeddings()
        print("Building similarity matrix")
        self.build_similarity_matrix()
        print(self.similarity_matrix)
        summary = self.get_important_category()
        return summary
def find_nearest(self, search_query, return_size=DEFAULT_RETURN_SIZE,
                 taggings=DEFAULT_TAGGING_TXT, output_file=DEFAULT_OUTPUT_FILE):
    # Encode the input query.
    client = BertClient()
    search_query = str(search_query)
    v1 = client.encode([search_query])

    # Read in all cluster taggings and encode each tag.
    f = open(taggings, "r")
    d = {}
    for line in f:
        try:
            line = line.rstrip("\n")
            line = line.rstrip(" ")
            line_split = line.split(", ")
            index = line_split[0].split(" ")[0]
            start = len(index) + 1
            index = int(index)
            line_split[0] = line_split[0][start:]
            for tag in line_split:
                encoding = client.encode([tag])
                d[tag] = [index, encoding]
        except:
            # Skip malformed lines.
            pass
    f.close()

    # Calculate cosine similarity between the input query and all taggings.
    similarity = {}  # tagging: score
    topn_score = []  # top n scores
    for key, value in d.items():
        score = cosine_similarity(d[key][1], v1)[0][0]
        similarity[key] = score
        if len(topn_score) < return_size or topn_score is None:
            topn_score.append(score)
        else:
            if self.if_larger(topn_score, score):
                topn_score[0] = score
                topn_score.sort()

    # Keep the best score among the top-n taggings for each cluster.
    result_clusters = {}
    for key, value in similarity.items():
        if similarity[key] in topn_score:
            if d[key][0] not in result_clusters.keys():
                result_clusters[d[key][0]] = 0
            if result_clusters[d[key][0]] < similarity[key]:
                result_clusters[d[key][0]] = topn_score[topn_score.index(
                    similarity[key])]

    output = open(output_file, "w")
    output.write("cluster number, probability\n")
    for i in result_clusters.keys():
        output.write("{},{}\n".format(i, result_clusters[i]))
    output.close()
    return result_clusters
def average_word_embeddings_with_without_emojis(data, emojisInData):
    bc = BertClient()
    sentiment_embeddings = []
    sentiment_embeddings_with_emojis = []
    bar = ChargingBar('Calculating word average embeddings with emojis\t\t\t',
                      max=len(data))
    for instance in data:
        if len(instance['tokens']) == 0:
            # Fall back to encoding the whole tweet when no tokens are available.
            embedding = bc.encode([instance['tweet']])
        else:
            word_embeddings = []
            for word in instance['tokens']:
                word_embeddings.append(bc.encode([word]))
            # For each feature of the embedding, average that feature over all
            # words; all word embeddings have the same size.
            # TODO add a check to make sure the first element exists.
            word_embedding_sum = [0] * len(word_embeddings[0])
            for word_embedding in word_embeddings:
                for i in range(len(word_embedding)):
                    word_embedding_sum[i] += word_embedding[i]
            # Compute the average of each feature of the word embedding.
            embedding = [
                feature / len(word_embeddings)
                for feature in word_embedding_sum
            ]
            embedding = np.array(embedding)
        # The hidden BERT layer has 768 neurons (hence 768 features).
        embedding = embedding.reshape(768)
        sentiment_embeddings.append({
            "embedding": embedding,
            "sentiment": instance['sentiment']
        })
        # get_emojis_of_tweet returns the frequency of each emoji in the tweet,
        # with the same emojis in the same order for every tweet.
        emojiFreqList = metrics.get_emojis_of_tweet(instance['tweet'],
                                                    emojisInData)
        combinedEmbedding = np.concatenate((embedding, emojiFreqList))
        combinedEmbedding = combinedEmbedding.reshape(1, -1)
        sentiment_embeddings_with_emojis.append({
            "embedding": combinedEmbedding[0],
            "sentiment": instance['sentiment']
        })
        bar.next()
    bar.finish()
    # print("word average embedding: " + str(sentiment_embeddings_with_emojis[0]))
    return sentiment_embeddings, sentiment_embeddings_with_emojis
class RawBERTEncoder(Encoder):

    def __init__(self):
        self.client = BertClient(check_length=False)

    def encode(self, data):
        return self.client.encode([data]).tolist()[0]

    def encode_multiple(self, data):
        return self.client.encode(data).tolist()
def run(self):
    time_all = []
    bc = BertClient(port=PORT, port_out=PORT_OUT, show_server_config=False)
    for _ in range(self.num_repeat):
        start_t = time.perf_counter()
        bc.encode(self.batch)
        time_all.append(time.perf_counter() - start_t)
    print(time_all)
    self.avg_time = mean(time_all)
def call_BERT_server_async(X, start_index, end_index, vocab_we, setting,
                           BERT_SERVER_IP):

    def handle_word_embeddings(encoded):
        # Remove the [CLS] and [SEP] vectors.
        encoded = np.delete(encoded, 0, axis=0)
        encoded = np.delete(encoded, len(sent_tokenized), axis=0)
        # Return to pre-padding zeros (BERT does post-padding).
        final = encoded[:len(sent_tokenized)]
        if len(X[i]) - final.shape[0] > 0:
            last = np.zeros((len(X[i]) - final.shape[0], final.shape[1]))
            final = np.vstack((last, final))
        return final

    X_new = []
    try:
        bc = BertClient(ip=BERT_SERVER_IP)  # ip address of the GPU machine
        for i in tqdm(range(start_index, end_index)):
            if setting[0] == "BERT" or setting[0] == "BERT_SENT":  # sentences
                # Map ids back to token strings.
                sent_tokenized = [vocab_we[s] for s in X[i] if s != 0]
                # Encode with BERT embeddings.
                encoded = bc.encode([sent_tokenized], is_tokenized=True)[0]
                if setting[0] == "BERT":
                    # Padding & delete CLS/SEP.
                    final = handle_word_embeddings(encoded)
                elif setting[0] == "BERT_SENT":
                    final = encoded
            else:  # knowledge
                final = []
                for concept_list in X[i]:
                    if max(concept_list) == 0:
                        # TODO: get the 4 from params.
                        final.append(np.zeros((4, setting[2])))
                    else:
                        encoded = bc.encode([
                            vocab_we[s][:-3] for s in concept_list if s != 0
                        ])[0]
                        if len(encoded) == setting[2]:
                            encoded = np.expand_dims(encoded, 0)
                        missing_concepts = 4 - len(encoded)
                        encoded = np.vstack((np.zeros(
                            (missing_concepts, setting[2])), encoded))
                        final.append(encoded)
            # Add the processed embedding sequence for this sentence.
            X_new.append(final)
        return np.array(X_new)
    except Exception as e:
        with print_lock:
            print(str(e))
        return np.array(X_new)
def get_bert_vector(vocab: list) -> np.ndarray:
    from bert_serving.client import BertClient
    bc = BertClient()
    vectors = np.zeros((len(vocab), 768))
    # Rows 0, 1, and 1066 are left as zero vectors.
    vectors[2:1066] = bc.encode(vocab[2:1066])
    vectors[1067:] = bc.encode(vocab[1067:])
    scio.savemat('{}/bert.mat'.format(DATA_DIR), {'vectors': vectors})
    return vectors
def add_bert_embeddings_to_df(df):
    from bert_serving.client import BertClient
    bc = BertClient()
    df["Question1_embedding"] = df["Question1"].apply(
        lambda row: bc.encode([row]))
    df["Question2_embedding"] = df["Question2"].apply(
        lambda row: bc.encode([row]))
    return df
def deal_words_cos():
    # Compare the embeddings of '学生' and its English translation 'student'.
    model = BertClient()
    word_en = model.encode(['学生'])
    word_my = model.encode(['student'])
    # Euclidean distance between the two embeddings.
    distance = la.norm(word_en - word_my)
    # Cosine similarity between the two embeddings.
    dot = float(np.sum(word_en * word_my))
    norm_en = math.sqrt(float(np.sum(word_en * word_en)))
    norm_my = math.sqrt(float(np.sum(word_my * word_my)))
    ret_cos = dot / (norm_en * norm_my)
    print(distance, ret_cos)
def get_encoding():
    # dataset_path = "/home/kkuma12s/thesis/Proof_Extraction/data/fever-full/complete_pipeline/sent_ret/fever_full_binary_dev_sent_ret.jsonl"
    dataset_path = "/home/kkuma12s/thesis/Proof_Extraction/data/fever-full/complete_pipeline/sent_ret/fever_full_binary_dev_bert.jsonl"
    claims = []
    sents = []
    labels = []
    with jsonlines.open(dataset_path, mode='r') as f:
        for example in f:
            claims.append(example["claim"])
            sents.append(example["sentence"])
            labels.append(example["label"])
        tmp_dict = {'claim': claims, 'sentence': sents, 'label': labels}

    train_data = pd.DataFrame(data=tmp_dict)
    print(train_data.shape)

    bc = BertClient()
    claims = train_data["claim"].tolist()
    sents = train_data["sentence"].tolist()
    print("claims length ", len(claims))
    # Encode each claim/sentence pair using bert-as-service's ||| separator.
    sents_pair = [[claim + ' ||| ' + sent]
                  for claim, sent in zip(claims, sents)]
    print("sent pair length ", len(sents_pair))

    vec = np.empty((len(sents_pair), 768))
    count = 0
    for sent in sents_pair:
        if count == 0:
            vec = bc.encode(sent)
        else:
            vec = np.vstack((vec, bc.encode(sent)))
        if count % 300 == 0:
            print("count ", count)
        count += 1

    print("saving vector into zip")
    file_name = "/scratch/kkuma12s/new_embeddings/fever_full_dev_claim_cls_bert"
    save_dataset_and_compress(vec, file_name)
class Encoding(object):

    def __init__(self):
        self.server_ip = "127.0.0.1"
        self.bert_client = BertClient(ip=self.server_ip)

    def encode(self, query):
        tensor = self.bert_client.encode([query])
        return tensor

    def query_similarity(self, query_list):
        tensors = self.bert_client.encode(query_list)
        return cosine_similarity(tensors)[0][1]
class Encoding(object):

    def __init__(self):
        self.server_ip = "localhost"
        self.bert_client = BertClient(ip=self.server_ip)

    def encode(self, query):
        tensor = self.bert_client.encode([query])
        return tensor

    def query_similarity(self, query_list):
        tensors = self.bert_client.encode(query_list)
        # dist = np.linalg.norm(tensors[0] - tensors[1])
        # prea = stats.pearsonr(tensors[0], tensors[1])[0]
        return cosine_similarity(tensors)[0][1]
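# A minimal usage sketch (not part of the original snippets): query_similarity
# compares the first two entries of the list it receives, because
# cosine_similarity(tensors) returns a square similarity matrix and [0][1]
# selects the similarity between item 0 and item 1. A running
# bert-as-service server is assumed.
if __name__ == "__main__":
    enc = Encoding()
    score = enc.query_similarity(["How do I reset my password?",
                                  "I forgot my password, what should I do?"])
    print("cosine similarity:", score)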
def main(vocab_file: str, output: str, server_hostname: str):
    client = BertClient(ip=server_hostname)
    vocabulary = torch.load(vocab_file)
    vocab_size = len(vocabulary)

    # Encode a dummy word to discover the embedding size the server produces.
    fake_embedding = client.encode(["test"]).reshape(-1)
    embed_size = fake_embedding.shape[0]
    print("Encoding words into embeddings with size: ", embed_size)

    embeddings = np.empty((vocab_size, embed_size))
    for i in tqdm(range(len(embeddings)), ascii=True):
        embeddings[i] = client.encode([vocabulary.idx2word[i]])
    np.save(output, embeddings)
def Bert_embedding(sentences: list):
    # sentences: a list of strings to encode.
    bc = BertClient()
    tickets_vec = bc.encode(sentences)
    print(tickets_vec.shape)
    with open('models/BERT/Bert_representation.pickle', 'wb') as handle:
        pickle.dump(tickets_vec, handle)
    print("Embeddings generated at models/BERT/Bert_representation.pickle")
def analyzer():
    bc = BertClient(ip='bertserving', output_fmt='list')
    client = Elasticsearch('elasticsearch:9200')
    query = request.args.get('q')
    query_vector = bc.encode([query])[0]
    # Script-score query: cosine similarity between the query vector and the
    # stored 'topic_description_vector' field (+1.0 keeps scores non-negative).
    script_query = {
        "script_score": {
            "query": {
                "match_all": {}
            },
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'topic_description_vector') + 1.0",
                "params": {
                    "query_vector": query_vector
                }
            }
        }
    }
    response = client.search(
        index='grants',
        body={
            "size": SEARCH_SIZE,
            "query": script_query,
            "_source": {
                "includes": ["title", "topic_description"]
            }
        })
    print(query)
    pprint(response)
    return jsonify(response)
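# A hedged sketch (an assumption, not part of the original code) of the index
# mapping the script-score query above relies on: the 'grants' index must store
# 'topic_description_vector' as a dense_vector whose dims match the BERT
# encoding size (768 for bert-base).
from elasticsearch import Elasticsearch

es = Elasticsearch('elasticsearch:9200')
es.indices.create(
    index='grants',
    body={
        "mappings": {
            "properties": {
                "title": {"type": "text"},
                "topic_description": {"type": "text"},
                "topic_description_vector": {"type": "dense_vector",
                                             "dims": 768},
            }
        }
    },
)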