def test_partial_load_invalid_end_index(self):
    speaker_byte_arr1 = bytearray([120, 3, 255, 0, 100])
    speaker_byte_arr2 = bytearray([110, 3, 255, 90])
    utt_byte_arr1 = bytearray([99, 44, 33])
    utt_byte_arr2 = bytearray([110, 200, 220, 28])

    corpus1 = Corpus(utterances=[
        Utterance(id="0", text="hello world",
                  speaker=Speaker(id="alice", meta={'speaker_binary_data': speaker_byte_arr1}),
                  meta={'utt_binary_data': utt_byte_arr1}),
        Utterance(id="1", text="my name is bob",
                  speaker=Speaker(id="bob", meta={'speaker_binary_data': speaker_byte_arr2}),
                  meta={'utt_binary_data': utt_byte_arr2}),
        Utterance(id="2", text="this is a test", speaker=Speaker(id="charlie")),
    ])

    corpus1.dump('test_corpus', './')

    corpus2 = Corpus(filename="test_corpus", utterance_end_index=-1)
    self.assertEqual(len(list(corpus2.iter_utterances())), 0)
def test_partial_load_start_idx_specified_only(self):
    user_byte_arr1 = bytearray([120, 3, 255, 0, 100])
    user_byte_arr2 = bytearray([110, 3, 255, 90])
    utt_byte_arr1 = bytearray([99, 44, 33])
    utt_byte_arr2 = bytearray([110, 200, 220, 28])

    corpus1 = Corpus(utterances=[
        Utterance(id="0", text="hello world",
                  user=User(name="alice", meta={'user_binary_data': user_byte_arr1}),
                  meta={'utt_binary_data': utt_byte_arr1}),
        Utterance(id="1", text="my name is bob",
                  user=User(name="bob", meta={'user_binary_data': user_byte_arr2}),
                  meta={'utt_binary_data': utt_byte_arr2}),
        Utterance(id="2", text="this is a test", user=User(name="charlie")),
    ])

    corpus1.dump('test_corpus', './')

    corpus2 = Corpus(filename="test_corpus", utterance_start_index=1)
    self.assertEqual(len(list(corpus2.iter_utterances())), 2)
    self.assertEqual(corpus1.get_utterance("1"), corpus2.get_utterance("1"))
    self.assertEqual(corpus1.get_utterance("2"), corpus2.get_utterance("2"))
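# Taken together, the two tests above pin down the partial-load semantics
# (a reading of the tests, not a documented contract): utterance_start_index
# and utterance_end_index select an inclusive range of lines from the dumped
# utterances file, with the start defaulting to 0 and the end to the last
# utterance. So:
#
#   Corpus(filename="test_corpus", utterance_start_index=1)  # loads "1" and "2"
#   Corpus(filename="test_corpus", utterance_end_index=-1)   # empty range, 0 utterances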
def transform(self, corpus: Corpus) -> Corpus:
    for utt in corpus.iter_utterances():
        if self.utt_selector(utt):
            utt.add_meta(self.perplexity_feat_name,
                         self.model.str_perplexity(self.utt_text_func(utt)))
        else:
            utt.add_meta(self.perplexity_feat_name, None)
    return corpus
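# A minimal usage sketch for the transform() above. The enclosing class and
# its constructor are not shown in this excerpt, so the names below
# (Perplexity, my_language_model, and the keyword arguments) are assumptions
# for illustration, not a confirmed API:
#
#   perp = Perplexity(model=my_language_model,
#                     utt_selector=lambda utt: utt.text is not None,
#                     utt_text_func=lambda utt: utt.text,
#                     perplexity_feat_name="perplexity")
#   corpus = perp.transform(corpus)
#   # selected utterances now carry utt.meta["perplexity"]; all others get None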
def get_corpus_leaf_ids(c: Corpus) -> set:
    """Return the ids of utterances that no other utterance replies to,
    i.e. the leaves of the corpus's reply trees."""
    leaves = set()
    not_leaves = set()
    for utt in c.iter_utterances():
        if utt.id not in not_leaves:
            leaves.add(utt.id)
        # utt replies to utt.reply_to, so that parent cannot be a leaf
        if utt.reply_to in leaves:
            leaves.remove(utt.reply_to)
        not_leaves.add(utt.reply_to)
    return leaves
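# A self-contained check of get_corpus_leaf_ids() on a toy reply tree. This is
# a sketch: the constructors follow the tests above, and conversation_id
# handling may differ slightly across ConvoKit versions.
from convokit import Corpus, Speaker, Utterance

def _demo_leaf_ids():
    s = Speaker(id="s")
    toy = Corpus(utterances=[
        Utterance(id="root", text="root", speaker=s, conversation_id="root", reply_to=None),
        Utterance(id="a", text="reply to root", speaker=s, conversation_id="root", reply_to="root"),
        Utterance(id="b", text="reply to root", speaker=s, conversation_id="root", reply_to="root"),
        Utterance(id="c", text="reply to a", speaker=s, conversation_id="root", reply_to="a"),
    ])
    # "root" and "a" each have replies, so only "b" and "c" are leaves.
    assert get_corpus_leaf_ids(toy) == {"b", "c"}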
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 27 23:20:11 2020

@author: kach
"""
from convokit import Corpus, download

corpus = Corpus(filename=download("conversations-gone-awry-corpus"))
corpus.print_summary_stats()

# Write one utterance text per line, with a parallel file of binary
# personal-attack labels.
with open("data/reviews.txt", "w", encoding="utf-8") as reviews, \
        open("data/labels.txt", "w") as labels:
    for utt in corpus.iter_utterances():
        txt = str(utt.text).replace('\n', ' ')
        reviews.write(txt + '\n')
        labels.write(('1' if utt.meta['comment_has_personal_attack'] else '0') + '\n')
# This example extracts politeness strategies from the Conversations Gone Awry dataset,
# one of the steps in the Conversations Gone Awry paper
# (http://www.cs.cornell.edu/~cristian/Conversations_gone_awry.html).
# For code reproducing the full results of the paper, see the example notebook in the
# `conversations-gone-awry` example subdirectory.
import pandas as pd
from convokit import PolitenessStrategies, Corpus, download

print("Loading awry corpus...")
corpus = Corpus(filename=download('conversations-gone-awry-corpus'))

# Extract the politeness strategies.
# Note: politeness strategies are a hand-engineered feature set, so no fitting is needed.
ps = PolitenessStrategies(verbose=100)
print("Extracting politeness strategies...")
corpus = ps.transform(corpus)

values = []
idx = []
for utterance in corpus.iter_utterances():
    values.append(utterance.meta["politeness_strategies"])
    idx.append(utterance.id)
pd.DataFrame(values, index=idx).to_csv("awry_strategy_df_v2.csv")
print("Done, results written to awry_strategy_df_v2.csv")
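# Each utterance's "politeness_strategies" meta entry is a dict of binary
# indicators, one per strategy, so the DataFrame above gets one column per
# strategy. A quick way to inspect the output (exact column names depend on
# the ConvoKit version, e.g. "feature_politeness_==Please=="):
#
#   df = pd.DataFrame(values, index=idx)
#   print(df.columns.tolist())
#   print(df.sum().sort_values(ascending=False).head())  # most frequent strategies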
def get_examples(self, filename, ns_name, anserini_folder, sent_bert_model, loss, output_dir,
                 input_pair=True, eval_data=False, denoise_negatives=False,
                 num_ns_for_denoising=100, generative_model='facebook/blenderbot-3B',
                 remove_cand_subsets=True, last_utterance_only=False, use_external_corpus=False):
    """
    filename specifies which data split to use (train.csv, dev.csv, test.csv).
    """
    filepath = os.path.join(self.dataset_folder, filename)
    self.data = pd.read_csv(filepath, sep="\t")

    if denoise_negatives:
        num_ns = num_ns_for_denoising
    else:
        num_ns = 10

    candidates = list(self.data["response"].values)
    if use_external_corpus:
        external_datasets = [
            'movie-corpus', 'wiki-corpus',
            'subreddit-Ubuntu', 'subreddit-microsoft', 'subreddit-apple',
            'subreddit-Database', 'subreddit-DIY', 'subreddit-electronics',
            'subreddit-ENGLISH', 'subreddit-gis', 'subreddit-Physics',
            'subreddit-scifi', 'subreddit-statistics', 'subreddit-travel',
            'subreddit-worldbuilding'
        ]
        for ds_name in external_datasets:
            corpus = Corpus(download(ds_name))
            corpus.print_summary_stats()
            for utt in corpus.iter_utterances():
                if utt.text != "":
                    candidates.append(utt.text)

    if ns_name == "random" or eval_data:
        self.negative_sampler = negative_sampling.RandomNegativeSampler(candidates, num_ns)
    elif ns_name == "bm25":
        index_folder = "/anserini_train_-1/"
        if use_external_corpus:
            index_folder = index_folder.replace("train", "train_expanded_")
        self.negative_sampler = negative_sampling.BM25NegativeSamplerPyserini(
            candidates, num_ns, self.dataset_folder + index_folder, -1, anserini_folder)
    elif ns_name == "sentence_transformer":
        self.negative_sampler = negative_sampling.SentenceBERTNegativeSampler(
            candidates, num_ns, self.dataset_folder + "/train_sentenceBERTembeds", -1,
            sent_bert_model, large_index=use_external_corpus)
    elif ns_name == "generative":
        self.negative_sampler = negative_sampling.GenerativeNegativeSamplerForDialogue(
            num_ns, generative_model)

    if loss == 'MarginMSELoss':
        self.negative_sampler.score_relevant_docs = True
    if loss == "ContrastiveLoss" and not eval_data:
        input_pair = False
    if loss == "OnlineContrastiveLoss" and not eval_data:
        input_pair = False

    examples = []
    scores_df = []
    # Code used to annotate some samples
    # samples_to_annotate = []
    # self.data = self.data.sample(200, random_state=42)
    # self.negative_sampler.score_relevant_docs = True
    count_ns_part_of_context = 0
    for idx, row in enumerate(tqdm(self.data.itertuples(index=False), total=len(self.data))):
        context = row[0]
        if last_utterance_only:
            if 'msdialog' in self.dataset_folder:
                context = context.split("[TURN_SEP]")[-1].split("[UTTERANCE_SEP]")[0].strip()
            else:
                context = context.split("[TURN_SEP]")[-1].split("[UTTERANCE_SEP]")[-2].strip()
        relevant_response = row[1]
        if not input_pair:
            examples.append(InputExample(guid=filename + str(idx) + "_pos",
                                         texts=[context, relevant_response], label=1.0))

        if ns_name == "bm25" and not eval_data:
            ns_candidates, ns_scores, _, _, rel_scores = self.negative_sampler.sample(
                context, [relevant_response], max_query_len=512,
                normalize_scores=False, rel_doc_id=str(idx))
        else:
            ns_candidates, ns_scores, _, _, rel_scores = self.negative_sampler.sample(
                context, [relevant_response])
        rel_score = rel_scores[0]

        if denoise_negatives:
            zipped = zip(ns_candidates[-10:], ns_scores[-10:])
        else:
            zipped = zip(ns_candidates, ns_scores)

        for ns, score_ns in zipped:
            if remove_cand_subsets and ns.replace("<<<AGENT>>>: ", "") in context:
                count_ns_part_of_context += 1
            else:
                if input_pair:
                    examples.append(InputExample(texts=[context, relevant_response, ns],
                                                 label=float(rel_score - score_ns)))
                    scores_df.append(rel_score - score_ns)
                    # samples_to_annotate.append([self.dataset_folder.split("/")[-1], ns_name,
                    #                             context, relevant_response, ns, rel_score, score_ns])
                else:
                    examples.append(InputExample(guid=filename + str(idx) + "_neg",
                                                 texts=[context, ns], label=0.0))

    logging.info("{} {} count of ns which are part of the context: {} out of {}.".format(
        self.dataset_folder.split("/")[-1], ns_name, count_ns_part_of_context, len(examples)))
    # print(pd.DataFrame(scores_df).describe())
    # pd.DataFrame(samples_to_annotate, columns=['task', 'ns', 'context', 'rel_response',
    #                                            'negative_sample', 'rel_score', 'score_negative']).\
    #     to_csv(output_dir + "neg_samples_{}_{}.csv".format(ns_name, self.dataset_folder.split("/")[-1]),
    #            index=False)
    if loss == 'MarginMSELoss':
        pd.DataFrame(scores_df).to_csv(
            output_dir + "MarginScores_{}_{}.csv".format(ns_name, self.dataset_folder.split("/")[-1]))
    return examples
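# A hedged usage sketch for get_examples(). The enclosing class is not shown
# in this excerpt, so CRRDataLoader and its constructor below are assumptions
# for illustration only:
#
#   loader = CRRDataLoader(dataset_folder="data/msdialog")   # hypothetical class
#   train_examples = loader.get_examples(
#       "train.csv", ns_name="bm25", anserini_folder="anserini/",
#       sent_bert_model="all-MiniLM-L6-v2", loss="MarginMSELoss",
#       output_dir="out/")
#   # With MarginMSELoss and the default input_pair=True, each InputExample
#   # holds [context, relevant_response, negative_sample] labeled with the
#   # relevance-score margin rel_score - score_ns.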