Example #1
class ActionGetFAQAnswer(Action):
    def __init__(self):
        super(ActionGetFAQAnswer, self).__init__()
        self.faq_data = json.load(
            open("./data/nlu/faq.json", "rt", encoding="utf-8"))
        self.sentence_embedding_choose(sentence_transformer_select,
                                       pretrained_model)
        self.standard_questions_encoder = np.load(
            "./data/standard_questions.npy")
        self.standard_questions_encoder_len = np.load(
            "./data/standard_questions_len.npy")
        print(self.standard_questions_encoder.shape)

    def sentence_embedding_choose(
            self,
            sentence_transformer_select=True,
            pretrained_model='bert-base-nli-mean-tokens'):
        self.sentence_transformer_select = sentence_transformer_select
        if sentence_transformer_select:
            self.bc = SentenceTransformer(pretrained_model)
        else:
            self.bc = BertClient(check_version=False)

    def get_most_similar_standard_question_id(self, query_question):
        if self.sentence_transformer_select:
            query_vector = torch.tensor(self.bc.encode([query_question
                                                        ])[0]).numpy()
        else:
            query_vector = self.bc.encode([query_question])[0]
        print("Question received at action engineer")
        score = np.sum((self.standard_questions_encoder * query_vector),
                       axis=1) / (self.standard_questions_encoder_len *
                                  (np.sum(query_vector * query_vector)**0.5))
        top_id = np.argsort(score)[::-1][0]
        return top_id, score[top_id]

    def name(self) -> Text:
        return "action_get_answer"

    def run(self, dispatcher: CollectingDispatcher, tracker: Tracker,
            domain: Dict[Text, Any]) -> List[Dict[Text, Any]]:
        query = tracker.latest_message['text']
        print(query)
        most_similar_id, score = self.get_most_similar_standard_question_id(
            query)
        print("The question is matched with id:{} with score: {}".format(
            most_similar_id, score))
        # This confidence threshold can be adjusted to suit your needs.
        if float(score) > score_threshold:
            response = self.faq_data[most_similar_id]['a']
            dispatcher.utter_message(response)
            dispatcher.utter_message("Problem solved?")
        else:
            response = "Sorry, this question is beyond my ability..."
            dispatcher.utter_message(response)
            dispatcher.utter_message(
                "Sorry, I can't answer your question. You can dial the manual service..."
            )
        return []
Example #2
def main(captions_file: str,
         output: str,
         embedding_size: int = 768,
         train: bool = True):
    df = pd.read_json(captions_file)
    bc = BertClient()

    if train:
        captions = df.caption.values
        bert_sentence_embeddings = np.zeros((len(captions), embedding_size))
        for i in tqdm(range(len(captions))):
            caption = captions[i]
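            # bc.encode returns an array of shape (1, embedding_size); NumPy broadcasts it into row i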
            bert_sentence_embeddings[i] = bc.encode([caption])

    else:
        bert_sentence_embeddings = {}

        for i in tqdm(range(len(df))):
            sub_df = df.iloc[i]
            key = sub_df['num']
            caption = sub_df.caption
            value = bc.encode([caption])

            if key not in bert_sentence_embeddings.keys():
                bert_sentence_embeddings[key] = [value]
            else:
                bert_sentence_embeddings[key].append(value)

    with open(output, 'wb') as f:
        pickle.dump(bert_sentence_embeddings, f)
Example #3
class PhraseEmbedding:

    def __init__(self):
        self.bc = BertClient()
        print('phrase embedding...')


    def get_embedding(self, phrase):
        encoded_phrase = self.bc.encode([phrase])
        return encoded_phrase


    def compare_phrases(self, phrase1, phrase2):
        phrase1_encode = self.bc.encode([phrase1])
        phrase2_encode = self.bc.encode([phrase2])

        cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        output = cos(torch.tensor(phrase1_encode), torch.tensor(phrase2_encode))
        print('comparison score : ', output)
        return output
Example #4
def getData():
    datasize = data_num * 2
    X = [[] for i in range(datasize)]
    Y = [0 for i in range(datasize)]
    data = get_atecQuestAns()
    bc = BertClient()
    for index in range(0, datasize, 2):
        tmp = data[int(index / 2)]
        # print(tmp[0], tmp[1], tmp[2])
        v1 = bc.encode([tmp[0]])
        v2 = bc.encode([tmp[1]])
        v3 = bc.encode([tmp[2]])
        qq1_vec = np.append(v1, v2)
        qq2_vec = np.append(v1, v3)
        print(qq2_vec)
        X[index] = qq1_vec.tolist()
        X[index + 1] = qq2_vec.tolist()
        Y[index] = 1
        Y[index + 1] = 0
        if index % 100 == 0:
            print(index, 'is finished')
    X_train = np.array(X)
    Y_train = np.array(Y)
    np.save(path + '/data/Y_qa_all_data.npy', Y_train)
    np.save(path + '/data/X_qa_all_data.npy', X_train)
    print(X_train.shape)
    print(Y_train.shape)
    print('save x train')
Example #5
def get_bert(des):
    des['len'] = des['intro'].str.len()
    des.set_index('company', inplace=True)

    short = des[des['intro'].str.len() <= 512]
    long = des[(des['intro'].str.len() > 512)
               & (des['intro'].str.len() < 1024)]
    # max length of bert is 512

    long_first_part = long['intro'].str[:512]
    long_second_part = long['intro'].str[512:]
    long_second_part = long_second_part[long_second_part.str.len() > 100]

    short_intro = short['intro'].values.tolist()
    long_first_part_intro = long_first_part.values.tolist()
    long_second_part_intro = long_second_part.values.tolist()

    bc = BertClient()
    short_embadding = bc.encode(short_intro)
    long_first_part_embadding = bc.encode(long_first_part_intro)
    long_second_part_embadding = bc.encode(long_second_part_intro)

    short_embadding = pd.DataFrame(short_embadding, index=short.index)
    long_first_part_embadding = pd.DataFrame(long_first_part_embadding,
                                             index=long_first_part.index)
    long_second_part_embadding = pd.DataFrame(long_second_part_embadding,
                                              index=long_second_part.index)

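    # for the long intros, average the embeddings of the first and second halves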
    temp = long_first_part_embadding.reindex(long_second_part_embadding.index)
    temp = (temp + long_second_part_embadding) / 2

    long_first_part_embadding.loc[temp.index, :] = temp

    return pd.concat([short_embadding, long_first_part_embadding])
Example #6
def generate_features(QR_QA_path, read_pattern):
    data_csv = csv.reader(open(QR_QA_path, "r"))
    # bc = BertClient(ip='222.25.172.41')
    bc = BertClient(check_length=False)
    print('fetch features for ', QR_QA_path)  # log progress, since encoding is slow
    QR_words = []
    questions = []
    reviews = []
    labels = []
    for item in data_csv:
        if len(item) >= 3:
            question = item[1].strip()
            review = item[2].strip()
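            # the label column depends on the read pattern: index 4 for "QR", index 3 otherwise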
            if read_pattern == "QR":
                if item[4].strip() != '':
                    label = int(item[4].strip())
                else:
                    label = 0
            else:
                if item[3].strip() != '':
                    label = int(item[3].strip())
                else:
                    label = 0
            QR_words.append((question, review, label))
            questions.append(question)
            reviews.append(review)
            labels.append(label)
    questions = bc.encode(questions)
    reviews = bc.encode(reviews)
    y = np.asarray(labels)
    return questions, reviews, y
Example #7
def analyzeResponses():
    if request.is_json:
      data_dict = request.get_json()
      text = data_dict["text"]
      # ip address of the GPU machine
      bc = BertClient(ip='localhost')
      with open('./server/data/answer-corpus.csv') as readFile:
          line_count = 0
          answers = [i.strip() for i in readFile.readlines()]
      # encode corpus as array of strings
      doc_vecs = bc.encode(answers)  # if tokenized: is_tokenized=True

      while True:
          # query = input('Find matching answer: ')
          query_vec = bc.encode([text])[0]
          # convert to torch input
          tensor_query_vec = torch.from_numpy(query_vec)
          tensor_doc_vecs = torch.from_numpy(doc_vecs)
          # compute normalized dot product as score
          tensor_input = tensor_query_vec * tensor_doc_vecs
          score = torch.sum(tensor_input, 1) / \
              torch.norm(tensor_doc_vecs, dim=1)
          # index of the highest-scoring answer
          topk_idx = torch.topk(score, 1).indices
          scores = []
          for idx in topk_idx:
              print('> %s\t%s' % (score[idx], answers[idx]))
              scores.append(answers[idx])
          print(f'Scores: {scores}')
          return jsonify(scores[-1]), 201
Example #8
def chatbot_sentence_vec_by_bert_bertasserver():
    """bert encode is used bert as server"""
    from conf.path_config import chicken_and_gossip_path
    from bert_serving.client import BertClient
    from utils.text_tools import txtRead
    import numpy as np

    topk = 5
    matrix_ques_save_path = "doc_vecs_chicken_and_gossip"
    questions = txtRead(chicken_and_gossip_path, encodeType='utf-8')
    ques = [ques.split('\t')[0] for ques in questions][0:100]

    bc = BertClient(ip = 'localhost')
    doc_vecs = bc.encode(ques)
    np.savetxt(matrix_ques_save_path, doc_vecs)
    # matrix_ques = np.loadtxt(matrix_ques_save_path)

    while True:
        query = input('You ask: ')
        query_vec = np.array(bc.encode([query])[0])
        # compute normalized dot product as score
        score = np.sum(query_vec * doc_vecs, axis=1) / np.linalg.norm(doc_vecs, axis=1)
        topk_idx = np.argsort(score)[::-1][:topk]
        for idx in topk_idx:
            print('Xiaojiang bot answers: %s\t%s' % (score[idx], questions[idx]))
Example #9
def encode(X_correct):
    bc = BertClient()
    if isinstance(X_correct, list):
        X_enc = bc.encode(X_correct)
    else:
        X_enc = bc.encode(list(X_correct))
    return X_enc
Example #10
def papreData():
    file_dir = 'G:/tf-start/Implementation-of-Question-Answering-System/data/atec_nlp1.csv'
    # load the 18,668 records
    bc = BertClient()
    setDataNum = 2000
    with open(file_dir, 'r', encoding='utf-8') as csvfile:
        read = csv.reader(csvfile)
        X = [[[] for i in range(2)] for j in range(setDataNum + 1)]
        index = 0
        for i in read:
            # print(i[0], i[1], i[2])
            tmp0 = bc.encode([i[0]])
            tmp1 = bc.encode([i[1]])
            tmp2 = bc.encode([i[2]])
            # print(tmp0, tmp1, tmp2)
            qq1_vec = np.append(tmp0, tmp1)
            qq2_vec = np.append(tmp0, tmp2)
            # print(qq1_vec == qq2_vec)
            X[index][0] = qq1_vec.tolist()
            X[index][1] = qq2_vec.tolist()
            index += 1
            if index % 100 == 0:
                print(index)
            if index > setDataNum:
                break
        X1 = np.array(X)
        np.save("x1_2000.npy", X1)
    print('Imported 10,000 records and finished preprocessing ------------------')
Example #11
def parse_symptoms(user_text):
    ''' Parse symptoms and run it through bert '''
    bc = BertClient(check_length=False)

    with open(SYMPTOMS_FILE) as f:
        symptoms = json.load(f)

    user_text = user_text.translate(str.maketrans('', '', string.punctuation))
    new_user_text = ""
    for token in nlp(user_text):
        if token.lemma_ != '-PRON-':
            new_user_text += token.lemma_ + " "
    word_tokens = word_tokenize(new_user_text.strip())
    filtered_sentence = " ".join(
        [w for w in word_tokens if not w in stop_words])

    symptoms = list(symptoms.keys())
    symptom_sentences = list()
    for symptom in symptoms:
        word_tokens = word_tokenize(symptom.strip())
        sentence = " ".join([w for w in word_tokens if not w in stop_words])
        symptom_sentences.append(sentence)

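    # note: symptom_sentences (stop words removed) is built above, but the raw symptom strings are what get encoded here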
    encodings = bc.encode(symptoms)
    user_text_new = bc.encode([filtered_sentence.strip()])
    length = len(encodings)

    return symptoms, user_text_new, encodings, length
Example #12
def get_word_embeddings(data):
    bc = BertClient()
    # bc.encode(['First do it', 'then do it right', 'then do it better'])

    embeddings = []
    sentiment_embeddings = []
    bar = ChargingBar('Calculating tweet embeddings\t\t\t', max=len(data))
    for instance in data:
        # encode the joined tokens when available; falling back to the raw
        # tweet text is a hacky fix for an empty tokens array
        if len(instance['tokens']) == 0:
            embedding = bc.encode([instance['tweet']])
        else:
            embedding = bc.encode([' '.join(instance['tokens'])])
        embeddings.append(embedding)
        sentiment_embeddings.append({
            "embedding": embedding[0],
            "sentiment": instance['sentiment']
        })
        bar.next()

    bar.finish()
    # print(embeddings)
    # print(len(embeddings), len(embeddings[0]),len(embeddings[0][0]))
    return embeddings, sentiment_embeddings
Example #13
def create_1000_case_test():
    li = []
    bc = BertClient(ip='222.19.197.230', port=5555, port_out=5556, check_version=False)
    test_text = pre_deal.get_test_textVector()
    # zero padding so that every sample ends up as two 768-dim vectors (assumes 768-dim BERT output)
    zero_vector = np.zeros(768)
    for i in range(0, len(test_text)):
        x = tokenize.word_tokenize(test_text[i])
        if len(x) > 502:
            # split overly long texts into two parts at the 500th/501st token boundary
            index = KMP.KMP_algorithm(test_text[i], x[500] + " " + x[501])
            if index != -1:
                sentences = [test_text[i][0:index], test_text[i][index:]]
                vector = bc.encode(sentences)
                ve = np.concatenate((vector[0], vector[1]), axis=0)
                li.append(ve.tolist())
            else:
                vector = bc.encode([test_text[i]])
                ve = np.concatenate((vector[0], zero_vector), axis=0)
                li.append(ve.tolist())
        else:
            vector = bc.encode([test_text[i]])
            ve = np.concatenate((vector[0], zero_vector), axis=0)
            li.append(ve.tolist())
    li_vector = np.array(li)
    np.save("test_case_1000.npy", li_vector)
Example #14
def main(sentences_file, queries_file, output_file):
  start = time.time()

  bc = BertClient(check_length=False)


  logger.info("Loading sentences and queries...")
  with open(sentences_file,"r") as f:
    corpus = list(set([line.strip() for line in f.readlines()]))

  with open(queries_file,"r") as f:
    queries = [line.strip() for line in f.readlines()]


  logger.info("Encoding sentences...")
  doc_vecs = bc.encode(corpus)

  n = 10
  top_k = 5

  logger.info("Computing top {} similar sentences to each of {} queries...".format(top_k,len(queries)))


  data = []
  for query in queries:
      query_vec = bc.encode([query])[0]
      top_k_list = get_query_top_k(query, query_vec, corpus, doc_vecs, max_n = n, top_k = top_k)
      data.extend(top_k_list)
  df = pd.DataFrame(data)
  df.to_csv(output_file,index=False,sep="\t")
  end = time.time()

  e = int(end - start)
  logger.info('Time elapsed is: {:02d}:{:02d}:{:02d}'.format(e // 3600, (e % 3600 // 60), e % 60))
Example #15
def validate(model, dataloader, criterion):
    """
    Compute the loss and accuracy of a model on some validation dataset.

    Args:
        model: A torch module for which the loss and accuracy must be
            computed.
        dataloader: A DataLoader object to iterate over the validation data.
        criterion: A loss criterion to use for computing the loss.

    Returns:
        epoch_time: The total time to compute the loss and accuracy on the
            entire validation set.
        epoch_loss: The loss computed on the entire validation set.
        epoch_accuracy: The accuracy computed on the entire validation set.
    """

    # Switch to evaluate mode.
    model.eval()
    device = model.device

    epoch_start = time.time()
    running_loss = 0.0
    running_accuracy = 0.0
    total_num = 0
    sub_len = 0

    bc = BertClient(check_length=False)
    batch = dataloader
    # Deactivate autograd for evaluation.
    with torch.no_grad():
        for batch_index in range(len(dataloader['labels'])):
            # Move input and output data to the GPU if one is used.
            # try:
            premises = torch.tensor(bc.encode(
                batch["premises"][batch_index])).to(device)
            hypotheses = torch.tensor(
                bc.encode(batch["hypotheses"][batch_index])).to(device)
            labels = torch.tensor(batch["labels"][batch_index]).to(device)

            logits, probs, adv_logits = model(premises, hypotheses)

            # print(logits.size())
            loss = criterion(logits, labels)

            running_loss += loss.item()
            running_accuracy += correct_predictions(probs, labels)
            total_num += len(labels)
            # except:
            #     sub_len += 1
            #     print('encoding error!')

    epoch_time = time.time() - epoch_start
    epoch_loss = running_loss / (len(dataloader['labels']) - sub_len)
    epoch_accuracy = running_accuracy / total_num

    return epoch_time, epoch_loss, epoch_accuracy
Example #16
class TextSummarizer(object):
    def __init__(self, payload):
        self.payload = payload
        self.categories = ['i need a doctor', 'has the following symptoms', 'needs to show something']
        print("Attempting to connect to Bert instance.")
        self.bc = BertClient(check_length=False) #ip="52.249.61.86"
        print("Connected to Bert instance.")
        print("Server status:", self.bc.status)

    def get_paragraphs(self):
        text = self.payload #self.payload["data"]
        self.paragraphs = text.split("\n\n")
        self.sentences = []
        for paragraph in self.paragraphs:
            for sent in sent_tokenize(paragraph):
                self.sentences.append(sent)
        cleaned_sentences = []
        for sentence in self.sentences:
            if len(sentence.split()) > 2:
                cleaned_sentences.append(sentence)
        self.sentences = cleaned_sentences

    def lowercase_text(self):
        # lower case all words in sentences 
        for idx,sentence in enumerate(self.sentences):
            lower_cased_tokens = [word.lower() for word in sentence.split()]
            lower_cased_sentence = " ".join([word for word in lower_cased_tokens])
            if lower_cased_sentence:
                self.sentences[idx] = lower_cased_sentence.strip()
    
    def get_embeddings(self):
        self.question_embedding = self.bc.encode(self.sentences)
        self.category_embeddings = self.bc.encode(self.categories)
        self.length = len(self.categories)

    def build_similarity_matrix(self):
        self.similarity_matrix = np.zeros([self.length])
        for i in range(self.length):
            self.similarity_matrix[i] = cosine_similarity([self.question_embedding[0]], 
                                                                [self.category_embeddings[i]])

    def get_important_category(self):
        reversed_sorted = np.argsort(self.similarity_matrix)[::-1]
        top_similarity = reversed_sorted[0]
        return self.categories[top_similarity]

    def get_summary(self):
        ''' Returns a list of the most important sentences in the legal document. '''
        self.get_paragraphs()
        print("Cleaning text input")
        self.lowercase_text()
        print("Generating embeddings")
        self.get_embeddings()
        print("Building similarity matrix")
        self.build_similarity_matrix()
        print(self.similarity_matrix)
        summary = self.get_important_category()
        return summary
Example #17
    def find_nearest(self,
                     search_query,
                     return_size=DEFAULT_RETURN_SIZE,
                     taggings=DEFAULT_TAGGING_TXT,
                     output_file=DEFAULT_OUTPUT_FILE):
        # Get an input query
        client = BertClient()
        search_query = str(search_query)
        v1 = client.encode([search_query])

        # Read in all cluster taggings
        f = open(taggings, "r")

        d = {}
        for line in f:
            try:
                line = line.rstrip("\n")
                line = line.rstrip(" ")
                line_split = line.split(", ")
                index = line_split[0].split(" ")[0]
                start = len(index) + 1
                index = int(index)
                line_split[0] = line_split[0][start:]
                for tag in line_split:
                    encoding = client.encode([tag])
                    d[tag] = [index, encoding]
            except:
                pass
        f.close()

        # Calculate cosine similarity between input query and all taggings
        similarity = {}  # tagging: score
        topn_score = []  # top n scores
        for key, value in d.items():
            score = cosine_similarity(d[key][1], v1)[0][0]
            similarity[key] = score
            if len(topn_score) < return_size or topn_score is None:
                topn_score.append(score)
            else:
                if self.if_larger(topn_score, score):
                    topn_score[0] = score
            topn_score.sort()

        result_clusters = {}
        for key, value in similarity.items():
            if similarity[key] in topn_score:
                if d[key][0] not in result_clusters.keys():
                    result_clusters[d[key][0]] = 0
                if result_clusters[d[key][0]] < similarity[key]:
                    result_clusters[d[key][0]] = topn_score[topn_score.index(
                        similarity[key])]

        output = open(output_file, "w")
        output.write("cluster number, probability\n")
        for i in result_clusters.keys():
            output.write("{},{}\n".format(i, result_clusters[i]))
        output.close()
        return result_clusters
Example #18
def average_word_embeddings_with_without_emojis(data, emojisInData):
    bc = BertClient()

    sentiment_embeddings = []
    sentiment_embeddings_with_emojis = []

    bar = ChargingBar('Calculating word average embeddings with emojis\t\t\t',
                      max=len(data))
    for instance in data:
        if len(instance['tokens']) == 0:
            embedding = bc.encode([instance['tweet']])
        else:
            word_embeddings = []
            for word in instance['tokens']:
                wordList = [word]
                word_embeddings.append(bc.encode(wordList))

            # for each feature in the embedding, calculate the average of that feature across the words
            # all word embeddings have the same length, so element-wise summation is valid
            # TODO add a check, probably
            word_embedding_sum = [0] * len(
                word_embeddings[0]
            )  # TODO add a check to make sure this first elem exists
            for word_embedding in word_embeddings:
                for i in range(len(word_embedding)):
                    word_embedding_sum[i] += word_embedding[i]
            embedding = [
                feature / len(word_embeddings)
                for feature in word_embedding_sum
            ]  # compute the average of each feature of the word embedding
            embedding = np.array(embedding)
            embedding = embedding.reshape(
                768
            )  # The hidden bert layer has 768 neurons (hence 768 features)

        sentiment_embeddings.append({
            "embedding": embedding,
            "sentiment": instance['sentiment']
        })

        # gets the freq of each emoji in a given tweet, returned in a list. This list has the same number of emojis and order of them for each tweet
        emojiFreqList = metrics.get_emojis_of_tweet(instance['tweet'],
                                                    emojisInData)
        combinedEmbedding = np.concatenate((embedding, emojiFreqList))
        combinedEmbedding = combinedEmbedding.reshape(1, -1)

        sentiment_embeddings_with_emojis.append({
            "embedding":
            combinedEmbedding[0],
            "sentiment":
            instance['sentiment']
        })
        bar.next()

    bar.finish()
    # print("word average embedding: " + str(sentiment_embeddings_with_emojis[0]))
    return sentiment_embeddings, sentiment_embeddings_with_emojis
Example #19
class RawBERTEncoder(Encoder):
    def __init__(self):
        self.client = BertClient(check_length=False)

    def encode(self, data):
        return self.client.encode([data]).tolist()[0]

    def encode_multiple(self, data):
        return self.client.encode(data).tolist()
Example #20
    def run(self):
        time_all = []
        bc = BertClient(port=PORT, port_out=PORT_OUT, show_server_config=False)
        for _ in range(self.num_repeat):
            start_t = time.perf_counter()
            bc.encode(self.batch)
            time_all.append(time.perf_counter() - start_t)
        print(time_all)
        self.avg_time = mean(time_all)
Example #21
def call_BERT_server_async(X, start_index, end_index, vocab_we, setting,
                           BERT_SERVER_IP):
    def handle_word_embeddings(encoded):
        # remove [CLS] and [SEP] part
        encoded = np.delete(encoded, 0, axis=0)
        encoded = np.delete(encoded, len(sent_tokenized), axis=0)

        # return to pre-padding zeros (BERT does post-padding)
        final = encoded[:len(sent_tokenized)]
        if len(X[i]) - final.shape[0] > 0:
            last = np.zeros((len(X[i]) - final.shape[0], final.shape[1]))
            final = np.vstack((last, final))
        return final

    X_new = []
    try:
        bc = BertClient(ip=BERT_SERVER_IP)  # ip address of the GPU machine
        for i in tqdm(range(start_index, end_index)):
            # return back to token strings
            if setting[0] == "BERT" or setting[0] == "BERT_SENT":  # sentences
                sent_tokenized = [vocab_we[s] for s in X[i] if s != 0]

                # encode with BERT embeddings
                encoded = bc.encode([sent_tokenized], is_tokenized=True)[0]

                if setting[0] == "BERT":
                    final = handle_word_embeddings(
                        encoded)  # padding & delete CLS/SEP
                elif setting[0] == "BERT_SENT":
                    final = encoded
            else:  # knowledge
                final = []
                for concept_list in X[i]:
                    if max(concept_list) == 0:
                        final.append(np.zeros(
                            (4, setting[2])))  #todo get the 4 from params
                    else:
                        encoded = bc.encode([
                            vocab_we[s][:-3] for s in concept_list if s != 0
                        ])[0]

                        if len(encoded) == setting[2]:
                            encoded = np.expand_dims(encoded, 0)
                        missing_concepts = 4 - len(encoded)
                        encoded = np.vstack((np.zeros(
                            (missing_concepts, setting[2])), encoded))

                        final.append(encoded)

            # add processed embedding sequence for sentence
            X_new.append(final)
        return np.array(X_new)
    except Exception as e:
        with print_lock:
            print(str(e))
        return np.array(X_new)
Example #22
def get_bert_vector(vocab: list) -> np.ndarray:

    from bert_serving.client import BertClient
    bc = BertClient()

    vectors = np.zeros((len(vocab), 768))
    vectors[2:1066] = bc.encode(vocab[2:1066])
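    # rows 0, 1 and 1066 are never assigned and stay all-zero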
    vectors[1067:] = bc.encode(vocab[1067:])
    scio.savemat('{}/bert.mat'.format(DATA_DIR), {'vectors': vectors})
    return vectors
Example #23
def add_bert_embeddings_to_df(df):
    from bert_serving.client import BertClient
    bc = BertClient()
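    # note: the apply below issues one bc.encode request per row; encoding the whole column at once (e.g. bc.encode(df["Question1"].tolist())) would batch the requests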

    df["Question1_embedding"] = df["Question1"].apply(
        lambda row: bc.encode([row]))
    df["Question2_embedding"] = df["Question2"].apply(
        lambda row: bc.encode([row]))

    return df
Example #24
def deal_words_cos():
    model = BertClient()
    word_en = model.encode(['学生'])    # '学生' means "student"
    word_my = model.encode(['student'])
    # Euclidean distance between the two sentence vectors
    distance = la.norm(word_en - word_my)
    # cosine similarity: dot product divided by the product of the two norms
    ret_cos = float((word_en * word_my).sum()) / (la.norm(word_en) * la.norm(word_my))

    print(distance, ret_cos)
Example #25
def get_encoding():

    #dataset_path = "/home/kkuma12s/thesis/Proof_Extraction/data/fever-full/complete_pipeline/sent_ret/fever_full_binary_dev_sent_ret.jsonl"
    dataset_path = "/home/kkuma12s/thesis/Proof_Extraction/data/fever-full/complete_pipeline/sent_ret/fever_full_binary_dev_bert.jsonl"

    claims = []
    sents = []
    labels = []

    with jsonlines.open(dataset_path, mode='r') as f:
        tmp_dict = {}
        for example in f:
            claims.append(example["claim"])
            sents.append(example["sentence"])
            labels.append(example["label"])

        tmp_dict = {'claim': claims, 'sentence': sents, 'label': labels}
        train_data = pd.DataFrame(data=tmp_dict)

    print(train_data.shape)

    # len(train_df["sentence"])
    bc = BertClient()

    claims = train_data["claim"].tolist()
    sents = train_data["sentence"].tolist()

    print("claims length ", len(claims))

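    # " ||| " is the bert-as-service convention for encoding two texts as a sentence pair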
    sents_pair = [[claim + ' ||| ' + sent]
                  for claim, sent in zip(claims, sents)]

    print("sent pair length ", len(sents_pair))

    vec = np.empty((len(sents_pair), 768))

    count = 0
    for sent in sents_pair:

        if count == 0:
            vec = bc.encode(sent)
        else:
            vec = np.vstack((vec, bc.encode(sent)))

        if count % 300 == 0:
            print("count ", count)
        count += 1

    print("saving vector into zip")

    file_name = "/scratch/kkuma12s/new_embeddings/fever_full_dev_claim_cls_bert"

    save_dataset_and_compress(vec, file_name)
Example #26
class Encoding(object):
    def __init__(self):
        self.server_ip = "127.0.0.1"
        self.bert_client = BertClient(ip=self.server_ip)

    def encode(self, query):
        tensor = self.bert_client.encode([query])
        return tensor

    def query_similarity(self, query_list):
        tensors = self.bert_client.encode(query_list)
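        # cosine_similarity returns a pairwise matrix; [0][1] is the similarity between the first two queries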
        return cosine_similarity(tensors)[0][1]
Example #27
class Encoding(object):
    def __init__(self):
        self.server_ip = "localhost"
        self.bert_client = BertClient(ip=self.server_ip)

    def encode(self, query):
        tensor = self.bert_client.encode([query])
        return tensor

    def query_similarity(self, query_list):
        tensors = self.bert_client.encode(query_list)
        # dist = np.linalg.norm(tensors[0]-tensors[1])
        # prea = stats.pearsonr(tensors[0],tensors[1])[0]
        return cosine_similarity(tensors)[0][1]
Example #28
def main(vocab_file: str, output: str, server_hostname: str):
    client = BertClient(ip=server_hostname)
    vocabulary = torch.load(vocab_file)
    vocab_size = len(vocabulary)

    fake_embedding = client.encode(["test"]).reshape(-1)
    embed_size = fake_embedding.shape[0]

    print("Encoding words into embeddings with size: ", embed_size)

    embeddings = np.empty((vocab_size, embed_size))
    for i in tqdm(range(len(embeddings)), ascii=True):
        embeddings[i] = client.encode([vocabulary.idx2word[i]])
    np.save(output, embeddings)
Example #29
def Bert_embedding(sentences: [str]):
    bc = BertClient()
    tickets_vec = bc.encode(sentences)
    print(tickets_vec.shape)
    with open('models/BERT/Bert_representation.pickle', 'wb') as handle:
        pickle.dump(tickets_vec, handle)
    print("Embeddings Generated At models/BERT/Bert_representation.pickle")
Example #30
def analyzer():
    bc = BertClient(ip='bertserving', output_fmt='list')
    client = Elasticsearch('elasticsearch:9200')

    query = request.args.get('q')
    query_vector = bc.encode([query])[0]

    script_query = {
        "script_score": {
            "query": {
                "match_all": {}
            },
            "script": {
                "source":
                "cosineSimilarity(params.query_vector, 'topic_description_vector') + 1.0",
                "params": {
                    "query_vector": query_vector
                }
            }
        }
    }

    response = client.search(index='grants',
                             body={
                                 "size": SEARCH_SIZE,
                                 "query": script_query,
                                 "_source": {
                                     "includes":
                                     ["title", "topic_description"]
                                 }
                             })
    print(query)
    pprint(response)
    return jsonify(response)