from nltk.tokenize import word_tokenize
import numpy as np
from sklearn.metrics.pairwise import linear_kernel

# `clean`, `extract_fact_set`, `get_cosine_similarity_embs_all`, `model` and
# `vectorizer` are helpers/globals assumed to be defined elsewhere in this
# codebase.


def extract_fact_set_mapped(factsets):
    """Map each cleaned knowledge sentence to its original (uncleaned) form."""
    original_sentences = dict()
    for idx, data in factsets.items():
        fun_facts = data.get("fun_facts")
        if fun_facts:
            for fact in fun_facts:
                original_sentences[clean(fact)] = fact

        short_wiki = data.get("shortened_wiki_lead_section")
        if short_wiki:
            original_sentences[clean(short_wiki)] = short_wiki

        summarized_wiki = data.get("summarized_wiki_lead_section")
        if summarized_wiki:
            original_sentences[clean(summarized_wiki)] = summarized_wiki
    return original_sentences
def prepare_reading_set_for_conversation(conv_id, reading_set):
    """Build per-agent knowledge sets and cleaned->original fact mappings."""
    conv_reading_set = reading_set[conv_id]
    fact_mapping_1 = extract_fact_set_mapped(conv_reading_set["agent_1"])
    fact_mapping_2 = extract_fact_set_mapped(conv_reading_set["agent_2"])
    fact_set_1 = set(fact_mapping_1.keys())
    fact_set_2 = set(fact_mapping_2.keys())

    article_data = conv_reading_set["article"]
    article_indices = ['AS1', 'AS2', 'AS3', 'AS4']

    common_knowledge_mapping = dict()
    if "AS1" in article_data:
        for idx in article_indices:
            sentence = article_data[idx]
            # Skip very short article sentences (fewer than 5 tokens).
            if len(word_tokenize(sentence)) < 5:
                continue
            common_knowledge_mapping[clean(sentence)] = sentence
    common_knowledge_set = set(common_knowledge_mapping.keys())

    # Article sentences are common knowledge shared by both agents.
    fact_set_1.update(common_knowledge_set)
    fact_set_2.update(common_knowledge_set)
    fact_mapping_1.update(common_knowledge_mapping)
    fact_mapping_2.update(common_knowledge_mapping)

    agent_knowledge = {
        "agent_1": list(fact_set_1),
        "agent_2": list(fact_set_2)
    }
    agent_mapping = {"agent_1": fact_mapping_1, "agent_2": fact_mapping_2}
    return agent_knowledge, agent_mapping
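def _demo_prepare_reading_set(reading_set_path="reading_set.json"):
    # Illustrative sketch only (not part of the original pipeline): one way
    # the helper above might be driven for a Topical-Chat-style reading set.
    # The file path and the assumption that the JSON is keyed by conversation
    # id are hypothetical.
    import json

    with open(reading_set_path) as f:
        reading_set = json.load(f)
    conv_id = next(iter(reading_set))  # take the first conversation id
    agent_knowledge, agent_mapping = prepare_reading_set_for_conversation(
        conv_id, reading_set)
    return agent_knowledge, agent_mapping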
def knowledge_selection_strategy(text, available_knowledge):
    # BERT-embedding variant: rank every available knowledge sentence by
    # cosine similarity to the cleaned input text and return the
    # (sentence, similarity) pairs sorted from most to least similar.
    fact_sims = get_cosine_similarity_embs_all(clean(text),
                                               available_knowledge,
                                               model,
                                               knowledge_policy="bert")
    fact_sims.sort(key=lambda x: x[1], reverse=True)
    return fact_sims
def prepare_sentence_knowledge_data(agent_mapping, conv_id, dialog_act,
                                    tokenizer, turn, sentence, ranker,
                                    da_index):
    # Pick the top-ranked fact for this sentence, then recover its original
    # (uncleaned) wording; fall back to "" if the fact is not in the mapping.
    knowledge_sentence = ranker.get_top_fact(clean(sentence), conv_id,
                                             threshold=True)
    original_knowledge_sentence = agent_mapping[turn["agent"]].get(
        knowledge_sentence, "")
    return (tokenizer.encode(sentence),
            [turn[dialog_act][da_index]],
            tokenizer.encode(original_knowledge_sentence))
def knowledge_selection_strategy(text, available_knowledge):
    # TF-IDF variant: score every knowledge sentence against the cleaned
    # input text and return the top-3 (sentence, similarity) pairs.
    text_tfidf = vectorizer.transform([clean(text)])
    knowledge_tfidf = vectorizer.transform(available_knowledge)
    similarity = np.squeeze(
        np.asarray(text_tfidf.dot(knowledge_tfidf.transpose()).todense()))
    top_n_indices = similarity.argsort()[-3:][::-1].tolist()
    top_similarities = [similarity[i] for i in top_n_indices]
    top_n_knowledges = [available_knowledge[i] for i in top_n_indices]
    return list(zip(top_n_knowledges, top_similarities))
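def build_tfidf_vectorizer(knowledge_sentences):
    # Illustrative sketch only: the TF-IDF variant above relies on a
    # module-level `vectorizer` fitted beforehand. Fitting on the cleaned
    # knowledge sentences of the split is one plausible choice; the corpus
    # actually used by the original code is not shown in this file.
    from sklearn.feature_extraction.text import TfidfVectorizer

    vec = TfidfVectorizer()
    vec.fit(knowledge_sentences)
    return vec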
def get_tfidf_conv_knowledge(conv_id, test_freq_reading_set):
    conv_reading_set = test_freq_reading_set[conv_id]
    fact_set_1 = set(extract_fact_set(conv_reading_set["agent_1"]))
    fact_set_2 = set(extract_fact_set(conv_reading_set["agent_2"]))

    article_data = conv_reading_set["article"]
    article_indices = ['AS1', 'AS2', 'AS3', 'AS4']

    common_knowledge_set = set()
    if "AS1" in article_data:
        for idx in article_indices:
            sentence = article_data[idx]
            if len(word_tokenize(sentence)) < 5:
                continue
            common_knowledge_set.add(clean(sentence))

    fact_set_1.update(common_knowledge_set)
    fact_set_2.update(common_knowledge_set)
    agent_knowledge = {
        "agent_1": list(fact_set_1),
        "agent_2": list(fact_set_2)
    }
    return agent_knowledge
def prepare_turn_data(agent_mapping, available_knowledge, conv_id, dialog_act,
                      knowledge_policy, response, tokenizer, turn, vec,
                      sentiment=None, ranker=None):
    knowledge_sentence = ""
    for segment in turn["segments"]:
        sentence = segment["text"]
        if knowledge_policy == "none":
            # Always return an empty knowledge sentence.
            break
        if knowledge_policy == "tf_idf":
            # With regard to knowledge selection, this is a highly approximate
            # heuristic. Both Gopalakrishnan et al. (2019) and Hedayatnia et
            # al. (2020) acknowledge they don't have anything better for this
            # issue.
            #
            # Here we find the knowledge sentence that is closest to the
            # ground-truth response expected from the model, so that the model
            # learns to appropriately condition on the knowledge.
            text_tfidf = vec.transform([clean(sentence)])
            knowledge_tfidf = vec.transform(available_knowledge)
            similarities = linear_kernel(knowledge_tfidf, text_tfidf).flatten()
            closest_knowledge_index = similarities.argsort()[-1]
            if similarities[closest_knowledge_index] > 0.3:
                knowledge_sentence = available_knowledge[
                    closest_knowledge_index]
                break
        else:
            knowledge_sentence = ranker.get_top_fact(clean(sentence), conv_id,
                                                     threshold=True)
            if knowledge_sentence != "":
                break
    else:
        # No segment yielded a knowledge sentence; for the TF-IDF policy,
        # fall back to matching against the full response.
        if knowledge_policy == "tf_idf":
            text_tfidf = vec.transform([clean(response)])
            knowledge_tfidf = vec.transform(available_knowledge)
            similarities = linear_kernel(knowledge_tfidf, text_tfidf).flatten()
            closest_knowledge_index = similarities.argsort()[-1]
            knowledge_sentence = available_knowledge[closest_knowledge_index] \
                if similarities[closest_knowledge_index] > 0.3 else ""

    original_knowledge_sentence = agent_mapping[turn["agent"]].get(
        knowledge_sentence, "")
    if sentiment:
        current_turn_data = (tokenizer.encode(response),
                             turn["sentiment_vader"],
                             tokenizer.encode(original_knowledge_sentence))
    else:
        current_turn_data = (tokenizer.encode(response),
                             turn[dialog_act],
                             tokenizer.encode(original_knowledge_sentence))
    return current_turn_data
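def _demo_prepare_turn_data(agent_knowledge, agent_mapping, conv_id, turn,
                            tokenizer, vec):
    # Illustrative sketch only: one plausible way to invoke prepare_turn_data
    # with the TF-IDF policy. The dialog-act key name ("dialog_act"), the way
    # the response is assembled from segments, and the tokenizer (e.g. a
    # Hugging Face GPT-2 tokenizer) are assumptions, not part of the original
    # code.
    response = " ".join(segment["text"] for segment in turn["segments"])
    return prepare_turn_data(agent_mapping,
                             agent_knowledge[turn["agent"]],
                             conv_id,
                             dialog_act="dialog_act",  # hypothetical key name
                             knowledge_policy="tf_idf",
                             response=response,
                             tokenizer=tokenizer,
                             turn=turn,
                             vec=vec)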