def test_key_insertion_deletion(self):
    """
    Check that metadata keys are recorded in the corpus meta index when set,
    that deleting a key on a single utterance leaves it in place, and that
    ``Corpus.delete_metadata`` removes the key from the index and the objects.
    """
    corpus1 = Corpus(utterances=[
        Utterance(id="0", text="hello world", speaker=Speaker(id="alice")),
        Utterance(id="1", text="my name is bob", speaker=Speaker(id="bob")),
        Utterance(id="2", text="this is a test", speaker=Speaker(id="charlie")),
    ])

    corpus1.get_utterance("0").meta['foo'] = 'bar'
    corpus1.get_utterance("1").meta['foo'] = 'bar2'
    corpus1.get_utterance("2").meta['hey'] = 'jude'

    corpus1.get_conversation(None).meta['convo_meta'] = 1

    corpus1.get_speaker("alice").meta['surname'] = 1.0

    # the index stores, per key, a list of the stringified value types seen
    self.assertEqual(corpus1.meta_index.utterances_index['foo'], [str(type('bar'))])
    self.assertEqual(corpus1.meta_index.conversations_index['convo_meta'], [str(type(1))])
    self.assertEqual(corpus1.meta_index.speakers_index['surname'], [str(type(1.0))])

    # test that deleting an attribute from an individual utterance fails to remove it
    del corpus1.get_utterance("2").meta['hey']
    # Explicit assertion instead of a bare lookup: still errors with KeyError
    # if the key was removed, and additionally pins the surviving value.
    self.assertEqual(corpus1.get_utterance("2").meta['hey'], 'jude')

    # test that delete_metadata works
    corpus1.delete_metadata('utterance', 'foo')
    with self.assertRaises(KeyError):
        corpus1.meta_index.utterances_index['foo']
    with self.assertRaises(KeyError):
        corpus1.get_utterance("0").meta["foo"]
def test_key_insertion_deletion(self):
    """
    Check that metadata keys appear in the meta index once set, and that
    deleting a key from one utterance removes it from the index and from
    the other utterances as well.
    """
    rows = (
        ("0", "hello world", "alice"),
        ("1", "my name is bob", "bob"),
        ("2", "this is a test", "charlie"),
    )
    corpus1 = Corpus(utterances=[
        Utterance(id=utt_id, text=text, speaker=Speaker(id=speaker_id))
        for utt_id, text, speaker_id in rows
    ])

    utterance_meta = (("0", 'foo', 'bar'), ("1", 'foo', 'bar2'), ("2", 'hey', 'jude'))
    for utt_id, key, value in utterance_meta:
        corpus1.get_utterance(utt_id).meta[key] = value
    corpus1.get_conversation(None).meta['convo_meta'] = 1
    corpus1.get_speaker("alice").meta['surname'] = 1.0

    # each index maps a key to the stringified type of its value
    self.assertEqual(corpus1.meta_index.utterances_index['foo'], str(type('bar')))
    self.assertEqual(corpus1.meta_index.conversations_index['convo_meta'], str(type(1)))
    self.assertEqual(corpus1.meta_index.speakers_index['surname'], str(type(1.0)))

    # test that deleting a key from an utterance removes it from the index
    del corpus1.get_utterance("2").meta['hey']
    with self.assertRaises(KeyError):
        corpus1.meta_index.utterances_index['hey']

    # test that deleting a key from an utterance removes it from the index
    # and from all other objects of same type
    del corpus1.get_utterance("1").meta['foo']
    with self.assertRaises(KeyError):
        corpus1.meta_index.utterances_index['foo']
    with self.assertRaises(KeyError):
        corpus1.get_utterance("0").meta["foo"]
def test_corpus_dump(self):
    """
    Check that the meta index survives a dump / reload round trip.

    The corpus is dumped into a temporary directory so the test neither
    depends on the current working directory nor leaves files behind.
    """
    import os
    import tempfile

    corpus1 = Corpus(utterances=[
        Utterance(id="0", text="hello world", speaker=Speaker(id="alice")),
        Utterance(id="1", text="my name is bob", speaker=Speaker(id="bob")),
        Utterance(id="2", text="this is a test", speaker=Speaker(id="charlie")),
    ])

    corpus1.get_utterance("0").meta['foo'] = 'bar'
    corpus1.get_utterance("1").meta['foo'] = 'bar2'
    corpus1.get_utterance("2").meta['hey'] = 'jude'

    corpus1.get_conversation(None).meta['convo_meta'] = 1

    corpus1.get_speaker("alice").meta['surname'] = 1.0

    with tempfile.TemporaryDirectory() as tmp_dir:
        # the dump lands in <base_path>/<name>; reload from that same path
        corpus1.dump('test_index_meta_corpus', base_path=tmp_dir)
        corpus2 = Corpus(filename=os.path.join(tmp_dir, 'test_index_meta_corpus'))

        # compare while the files still exist, in case anything is read lazily
        self.assertEqual(corpus1.meta_index.utterances_index, corpus2.meta_index.utterances_index)
        self.assertEqual(corpus1.meta_index.speakers_index, corpus2.meta_index.speakers_index)
        self.assertEqual(corpus1.meta_index.conversations_index, corpus2.meta_index.conversations_index)
        self.assertEqual(corpus1.meta_index.overall_index, corpus2.meta_index.overall_index)
def test_basic_functions(self):
    """
    Test basic meta functions
    """
    rows = (
        ("0", "hello world", "alice"),
        ("1", "my name is bob", "bob"),
        ("2", "this is a test", "charlie"),
    )
    corpus1 = Corpus(utterances=[
        Utterance(id=utt_id, text=text, speaker=Speaker(id=speaker_id))
        for utt_id, text, speaker_id in rows
    ])

    first_utt = corpus1.get_utterance("0")
    first_utt.meta['hey'] = 9

    # correct class type stored
    expected_type = repr(type(9))
    self.assertEqual(corpus1.meta_index.utterances_index['hey'], expected_type)

    # looking up a missing key raises KeyError
    with self.assertRaises(KeyError):
        first_utt.meta['nonexistent key']

    # test that setting a custom get still works
    fallback = first_utt.meta.get('nonexistent_key', {})
    self.assertEqual(fallback, {})
def burr_sir_corpus():
    """
    Build a two-utterance corpus: utterance '0' (BURR_SIR_TEXT_1) spoken by
    'hamilton' and utterance '1' (BURR_SIR_TEXT_2) spoken by 'burr'.

    :return: the Corpus containing both utterances
    """
    speaker_of = {name: Speaker(id=name) for name in ('hamilton', 'burr')}
    return Corpus(utterances=[
        Utterance(id='0', text=BURR_SIR_TEXT_1, speaker=speaker_of['hamilton']),
        Utterance(id='1', text=BURR_SIR_TEXT_2, speaker=speaker_of['burr']),
    ])
def politeness_test_zh_corpus():
    """
    Build a single reply chain out of the Chinese politeness sample texts.

    Utterance '0' is spoken by 'bob' and has no parent; every later
    utterance replies to the one before it, with the speakers alternating
    starting from 'alice'.

    :return: the Corpus containing the chain
    """
    alice, bob = Speaker(id='alice'), Speaker(id='bob')
    opening, *followups = [
        GRATITUDE_ZH, DEFERENCE_ZH, GREETING_ZH, APOLOGY_ZH, PLEASE_ZH,
        PLEASE_START_ZH, BTW_ZH, DIRECT_QN_ZH, HEDGES_ZH, FACTUALITY_ZH,
    ]

    utterances = [Utterance(id='0', text=opening, speaker=bob, reply_to=None)]
    utterances.extend(
        Utterance(
            id=str(position),
            text=text,
            # position 1 -> alice, position 2 -> bob, ...
            speaker=(alice, bob)[(position - 1) % 2],
            reply_to=str(position - 1),
        )
        for position, text in enumerate(followups, start=1)
    )
    return Corpus(utterances=utterances)
def politeness_test_corpus():
    """
    Build a single reply chain out of the politeness sample texts.

    Utterance '0' is spoken by 'bob' and has no parent; every later
    utterance replies to the one before it, with the speakers alternating
    starting from 'alice'.

    :return: the Corpus containing the chain
    """
    alice, bob = Speaker(id='alice'), Speaker(id='bob')
    opening, *followups = [
        GRATITUDE, DEFERENCE, GREETING, APOLOGY, PLEASE, PLEASE_START, BTW,
        DIRECT_QN, DIRECT_START, SUBJUNCTIVE, INDICATIVE, HEDGES, FACTUALITY,
    ]

    utterances = [Utterance(id='0', text=opening, speaker=bob, reply_to=None)]
    utterances.extend(
        Utterance(
            id=str(position),
            text=text,
            # position 1 -> alice, position 2 -> bob, ...
            speaker=(alice, bob)[(position - 1) % 2],
            reply_to=str(position - 1),
        )
        for position, text in enumerate(followups, start=1)
    )
    return Corpus(utterances=utterances)
def test_broken_convos(self):
    """
    Test that traversing a malformed Conversation raises ValueError
    """
    def build_corpus(last_reply_to):
        # The two corpora differ only in what utterance "3" replies to.
        return Corpus(utterances=[
            Utterance(id="0", text="hello world", reply_to=None,
                      speaker=Speaker(id="alice"), timestamp=0),
            Utterance(id="1", text="my name is bob", reply_to="0",
                      speaker=Speaker(id="bob"), timestamp=2),
            Utterance(id="2", text="this is a test", reply_to="1",
                      speaker=Speaker(id="charlie"), timestamp=1),
            Utterance(id="3", text="hello world 2", reply_to=last_reply_to,
                      speaker=Speaker(id="alice2"), timestamp=0),
        ])

    corpus1 = build_corpus(None)
    corpus2 = build_corpus("9")

    # test broken convo where there are multiple roots
    convo = corpus1.get_conversation(None)
    with self.assertRaises(ValueError):
        list(convo.traverse("dfs", as_utterance=True))

    # test broken convo where utterance replies to something not in Conversation
    convo = corpus2.get_conversation(None)
    with self.assertRaises(ValueError):
        list(convo.traverse("dfs", as_utterance=True))
def transform_utterance(self, utterance, spacy_nlp=None, markers=False):
    """
    Extract politeness strategies for raw string inputs (or a single utterance).

    The text is always (re)parsed and the parse is stored under the
    'parsed' metadata key, with every token lower-cased in place.

    :param utterance: the utterance (or plain string) to be annotated with
        politeness strategies; a string is wrapped in a new Utterance.
    :param spacy_nlp: if provided, will use this SpaCy object to do parsing;
        otherwise loads `en_core_web_sm` with the `ner` component disabled.
    :param markers: if True, also store the markers returned by the
        extractor under `self.marker_attribute_name`.
    :return: the utterance with politeness annotations.
    """
    if isinstance(utterance, str):
        utterance = Utterance(text=utterance, speaker=Speaker(id='speaker'))

    nlp = spacy_nlp
    if nlp is None:
        nlp = spacy.load('en_core_web_sm', disable=['ner'])

    utterance.meta['parsed'] = process_text(utterance.text, spacy_nlp=nlp)

    # normalise token case in the stored parse
    for sentence in utterance.meta["parsed"]:
        for token in sentence["toks"]:
            token["tok"] = token['tok'].lower()

    extractor = self.__extractor_lookup[self.strategy_collection]
    strategies, marks = extractor(utterance)
    utterance.meta[self.strategy_attribute_name] = strategies

    if markers:
        utterance.meta[self.marker_attribute_name] = marks

    return utterance
def transform_utterance(self, utt, override_input_filter=False):
    """
    Computes per-utterance attributes of an individual utterance or string.

    A string is first wrapped in a new Utterance. When `input_field` is
    `None` the utterance text is the input; otherwise the input is read
    from the utterance metadata (a single value for a string
    `input_field`, a dict of values for a list). The utterance is returned
    without annotation when `input_filter` rejects it (unless overridden),
    or when the input or any requested field is `None`.

    :param utt: utterance or a string
    :param override_input_filter: ignore `input_filter` and compute attribute for all utterances
    :return: the utterance
    """
    if isinstance(utt, str):
        utt = Utterance(text=utt, speaker=Speaker(id="speaker"))

    if self.input_field is None:
        text_entry = utt.text
    else:
        # the filter is only consulted when reading from a metadata field
        rejected = (not override_input_filter
                    and not self.input_filter(utt, self.aux_input))
        if rejected:
            return utt

        if isinstance(self.input_field, str):
            text_entry = utt.retrieve_meta(self.input_field)
        elif isinstance(self.input_field, list):
            text_entry = {
                field: utt.retrieve_meta(field) for field in self.input_field
            }
            if any(value is None for value in text_entry.values()):
                return utt

    if text_entry is None:
        return utt

    if len(self.aux_input) > 0:
        result = self.proc_fn(text_entry, self.aux_input)
    else:
        result = self.proc_fn(text_entry)

    if not self.multi_outputs:
        utt.add_meta(self.output_field, result)
        return utt

    # one result per output field, paired positionally
    for res, out in zip(result, self.output_field):
        utt.add_meta(out, res)
    return utt
def transform_utterance(self, utt: Utterance, spacy_nlp: Callable[[str], Doc] = None, markers: bool = False):
    """
    Extract politeness strategies for raw string inputs (or individual utterances)

    :param utt: the utterance (or plain string) to be annotated with
        politeness strategies; a string is wrapped in a new Utterance.
    :param spacy_nlp: SpaCy object used to parse the text; only needed when
        the utterance has no parse stored under `self.parse_attribute_name`.
    :param markers: if True, also store the markers returned by the
        extractor under `self.marker_attribute_name`.
    :return: the utterance with politeness annotations.
    :raises ValueError: if the utterance has no stored parse and
        `spacy_nlp` is None.
    """
    if isinstance(utt, str):
        utt = Utterance(text=utt, speaker=Speaker(id='speaker'))

    needs_parse = self.parse_attribute_name not in utt.meta
    if needs_parse:
        if spacy_nlp is None:
            raise ValueError('spacy object required')
        utt.add_meta(self.parse_attribute_name,
                     process_text(utt.text, spacy_nlp=spacy_nlp))

    parsed = utt.retrieve_meta(self.parse_attribute_name)

    # normalise token case in the stored parse
    for sentence in parsed:
        for token in sentence["toks"]:
            token["tok"] = token['tok'].lower()

    token_lists = [sentence["toks"] for sentence in parsed]
    extractor = self._extractor_lookup[self.strategy_collection]
    strategies, marks = extractor(token_lists)
    utt.meta[self.strategy_attribute_name] = strategies

    if markers:
        utt.meta[self.marker_attribute_name] = marks

    return utt
def test_overlap_diff_data(self):
    """
    Merge with overlap in utterance id and utterance has diff data but same metadata

    Warning should be printed. Original utterance data should be preserved.
    """
    def build_corpus(rows):
        return Corpus(utterances=[
            Utterance(id=utt_id, text=text, speaker=Speaker(id=speaker_id))
            for utt_id, text, speaker_id in rows
        ])

    corpus1 = build_corpus([
        ("0", "hello world", "alice"),
        ("1", "my name is bob", "bob"),
        ("2", "this is a test", "charlie"),
    ])
    corpus2 = build_corpus([
        ("2", "this is a test2", "candace"),
        ("4", "this is a sentence", "echo"),
        ("5", "goodbye", "foxtrot"),
    ])

    merged = corpus1.merge(corpus2)

    merged_utts = list(merged.iter_utterances())
    merged_speakers = list(merged.iter_speakers())
    self.assertEqual(len(merged_utts), 5)
    self.assertEqual(len(merged_speakers), 5)

    # the inputs themselves must be left untouched
    self.assertEqual(len(list(corpus1.iter_utterances())), 3)
    self.assertEqual(len(list(corpus2.iter_utterances())), 3)

    # the first corpus wins for the conflicting utterance
    self.assertEqual(merged.get_utterance("2").text, "this is a test")
    self.assertEqual(merged.get_utterance("2").speaker, Speaker(id="charlie"))
def test_multiple_types(self):
    """
    Check how the utterance index tracks one key whose values have
    different types: a None value registers nothing, and each new value
    type is appended to the key's type list.
    """
    rows = (
        ("0", "hello world", "alice"),
        ("1", "my name is bob", "bob"),
        ("2", "this is a test", "charlie"),
    )
    corpus1 = Corpus(utterances=[
        Utterance(id=utt_id, text=text, speaker=Speaker(id=speaker_id))
        for utt_id, text, speaker_id in rows
    ])
    int_type, str_type = str(type(5)), str(type('five'))

    corpus1.get_utterance('2').meta['hey'] = None
    self.assertEqual(corpus1.meta_index.utterances_index.get('hey', None), None)

    corpus1.get_utterance('0').meta['hey'] = 5
    self.assertEqual(corpus1.meta_index.utterances_index['hey'], [int_type])

    corpus1.get_utterance('1').meta['hey'] = 'five'
    self.assertEqual(corpus1.meta_index.utterances_index['hey'], [int_type, str_type])
def test_with_overlap(self):
    """
    Basic merge: with overlap in utterance id (but utterance has same data & metadata)
    """
    def build_corpus(rows):
        return Corpus(utterances=[
            Utterance(id=utt_id, text=text, speaker=Speaker(id=speaker_id))
            for utt_id, text, speaker_id in rows
        ])

    corpus1 = build_corpus([
        ("0", "hello world", "alice"),
        ("1", "my name is bob", "bob"),
        ("2", "this is a test", "charlie"),
    ])
    corpus2 = build_corpus([
        ("2", "this is a test", "charlie"),
        ("4", "this is a sentence", "echo"),
        ("5", "goodbye", "foxtrot"),
    ])

    merged = corpus1.merge(corpus2)

    # the shared utterance / speaker is counted once
    self.assertEqual(len(list(merged.iter_utterances())), 5)
    self.assertEqual(len(list(merged.iter_speakers())), 5)

    # the inputs themselves must be left untouched
    for source in (corpus1, corpus2):
        self.assertEqual(len(list(source.iter_utterances())), 3)
def test_add_utterance(self):
    """
    Add utterances whose ids partly collide with existing ones and check
    the resulting utterance count and the merged metadata of utterance "2"
    (the incoming value is expected for the shared key 'hello').
    """
    existing_meta = {'hey': 'jude', 'hello': 'world'}
    incoming_meta = {'hello': 'food', 'what': 'a mood'}

    corpus1 = Corpus(utterances=[
        Utterance(id="0", text="hello world", speaker=Speaker(id="alice")),
        Utterance(id="1", text="my name is bob", speaker=Speaker(id="bob")),
        Utterance(id="2", text="this is a test",
                  speaker=Speaker(id="charlie"), meta=existing_meta),
    ])

    utts = [
        Utterance(id="1", text="i like pie", speaker=Speaker(id="delta")),
        Utterance(id="2", text="this is a test",
                  speaker=Speaker(id="charlie"), meta=incoming_meta),
        Utterance(id="5", text="goodbye", speaker=Speaker(id="foxtrot")),
    ]
    added = corpus1.add_utterances(utts)

    self.assertEqual(len(list(added.iter_utterances())), 4)

    merged_meta = added.get_utterance("2").meta
    self.assertEqual(len(merged_meta), 3)
    self.assertEqual(added.get_utterance("2").meta['hello'], 'food')
def test_corpus_metadata(self):
    """
    Merge with overlap in corpus metadata

    Expect second corpus metadata to override if keys are the same
    """
    def build_corpus(rows):
        return Corpus(utterances=[
            Utterance(id=utt_id, text=text, speaker=Speaker(id=speaker_id))
            for utt_id, text, speaker_id in rows
        ])

    corpus1 = build_corpus([
        ("0", "hello world", "alice"),
        ("1", "my name is bob", "bob"),
        ("2", "this is a test", "charlie"),
    ])
    corpus2 = build_corpus([
        ("3", "i like pie", "delta"),
        ("4", "this is a sentence", "echo"),
        ("5", "goodbye", "foxtrot"),
    ])

    # 'toxicity' is the only key present on both sides
    for key, value in (('politeness', 0.95), ('toxicity', 0.8)):
        corpus1.add_meta(key, value)
    for key, value in (('toxicity', 0.9), ('paggro', 1.0)):
        corpus2.add_meta(key, value)

    merged = corpus1.merge(corpus2)

    self.assertEqual(len(merged.meta), 3)
    self.assertEqual(merged.meta['toxicity'], 0.9)
def test_corpus_merge_add(self):
    """
    Check that metadata carried by a separately constructed utterance (and
    by its speaker) is registered in the meta index once it is added.
    """
    rows = (
        ("0", "hello world", "alice"),
        ("1", "my name is bob", "bob"),
        ("2", "this is a test", "charlie"),
    )
    corpus1 = Corpus(utterances=[
        Utterance(id=utt_id, text=text, speaker=Speaker(id=speaker_id))
        for utt_id, text, speaker_id in rows
    ])

    utterance_meta = (("0", 'foo', 'bar'), ("1", 'foo', 'bar2'), ("2", 'hey', 'jude'))
    for utt_id, key, value in utterance_meta:
        corpus1.get_utterance(utt_id).meta[key] = value

    # test that adding separately initialized utterances with new metadata updates Index
    new_speaker = Speaker(id="alice", meta={'donkey': 'kong'})
    new_utt = Utterance(id="4", text="hello world", speaker=new_speaker,
                        meta={'new': 'meta'})

    new_corpus = corpus1.add_utterances([new_utt])

    self.assertIn('new', new_corpus.meta_index.utterances_index)
    self.assertIn('donkey', new_corpus.meta_index.speakers_index)
def transform_utterance(self, utt, **params):
    """
    Computes attributes of an individual string or utterance using all of the transformers in the pipeline.

    Keyword arguments are split per step by `_parse_param_steps`; each
    step's `transform_utterance` receives only the arguments parsed for
    its name (or none at all).

    :param utt: the utterance to compute attributes for; a string is
        wrapped in a new Utterance first.
    :return: the utterance, with new attributes.
    """
    params_steps = self._parse_param_steps(params)

    if isinstance(utt, str):
        utt = Utterance(text=utt, speaker=Speaker(id="speaker"))

    for name, transform in self.steps:
        step_kwargs = params_steps[name] if name in params_steps else {}
        # each step receives the output of the previous one
        utt = transform.transform_utterance(utt, **step_kwargs)

    return utt
def test_overlap_convo_metadata(self):
    """
    Merge with overlap in conversation with metadata differences.

    Expect second corpus convo metadata to override if keys are the same
    """
    def build_corpus(rows):
        # every utterance belongs to the same conversation, 'convo1'
        return Corpus(utterances=[
            Utterance(id=utt_id, conversation_id='convo1', text=text,
                      speaker=Speaker(id=speaker_id))
            for utt_id, text, speaker_id in rows
        ])

    corpus1 = build_corpus([
        ("0", "hello world", "alice"),
        ("1", "my name is bob", "bob"),
        ("2", "this is a test", "charlie"),
    ])
    corpus2 = build_corpus([
        ("2", "this is a test", "charlie"),
        ("4", "this is a sentence", "echo"),
        ("5", "goodbye", "foxtrot"),
    ])

    for key, value in (('hey', 'jude'), ('hello', 'world')):
        corpus1.get_conversation('convo1').add_meta(key, value)

    for key, value in (('hey', 'jude'), ('hello', 'food'), ('what', 'a mood')):
        corpus2.get_conversation('convo1').add_meta(key, value)

    merged = corpus1.merge(corpus2)

    merged_convo_meta = merged.get_conversation('convo1').meta
    self.assertEqual(len(merged_convo_meta), 3)
    self.assertEqual(merged.get_conversation('convo1').meta['hello'], 'food')
def test_overlap_diff_metadata(self):
    """
    Merge with overlap in utterance id and utterance has same data but diff metadata

    Second corpus utterance metadata should override if the keys are the same.
    """
    first_meta = {'hey': 'jude', 'the': 'beatles'}
    second_meta = {'hey': 'jude', 'the': 'ringo', 'let it': 'be'}

    corpus1 = Corpus(utterances=[
        Utterance(id="0", text="hello world", speaker=Speaker(id="alice")),
        Utterance(id="1", text="my name is bob", speaker=Speaker(id="bob")),
        Utterance(id="2", text="this is a test",
                  speaker=Speaker(id="charlie"), meta=first_meta),
    ])
    corpus2 = Corpus(utterances=[
        Utterance(id="2", text="this is a test",
                  speaker=Speaker(id="charlie"), meta=second_meta),
        Utterance(id="4", text="this is a sentence", speaker=Speaker(id="echo")),
        Utterance(id="5", text="goodbye", speaker=Speaker(id="foxtrot")),
    ])

    merged = corpus1.merge(corpus2)

    self.assertEqual(len(list(merged.iter_utterances())), 5)
    self.assertEqual(len(list(merged.iter_speakers())), 5)

    # union of the keys, with the second corpus winning on 'the'
    shared_utt_meta = merged.get_utterance("2").meta
    self.assertEqual(len(shared_utt_meta), 3)
    self.assertEqual(merged.get_utterance("2").meta['the'], 'ringo')
def test_no_overlap(self):
    """
    Basic merge: no overlap in utterance id
    """
    def build_corpus(rows):
        return Corpus(utterances=[
            Utterance(id=utt_id, text=text, speaker=Speaker(id=speaker_id))
            for utt_id, text, speaker_id in rows
        ])

    corpus1 = build_corpus([
        ("0", "hello world", "alice"),
        ("1", "my name is bob", "bob"),
        ("2", "this is a test", "charlie"),
    ])
    corpus2 = build_corpus([
        ("3", "i like pie", "delta"),
        ("4", "this is a sentence", "echo"),
        ("5", "goodbye", "foxtrot"),
    ])

    merged = corpus1.merge(corpus2)

    # everything from both sides ends up in the merged corpus
    self.assertEqual(len(list(merged.iter_utterances())), 6)
    self.assertEqual(len(list(merged.iter_speakers())), 6)

    # the inputs themselves must be left untouched
    for source in (corpus1, corpus2):
        self.assertEqual(len(list(source.iter_utterances())), 3)
def setUp(self) -> None:
    """
    Basic Conversation tree (left to right within subtree => earliest to latest)

                   0
          1        2        3
        4 5 6     7 8       9
       10                   11

    (1, 2, 3 reply to 0; 4, 5, 6 reply to 1; 7, 8 reply to 2; 9 replies
    to 3; 10 replies to 4; 11 replies to 9.) A separate single-utterance
    conversation "other" is added as well.
    """
    # (utterance id, reply_to, timestamp) in insertion order; note that
    # "2" is listed before "1" even though its timestamp is later
    tree_rows = [
        ("0", None, 0),
        ("2", "0", 2),
        ("1", "0", 1),
        ("3", "0", 3),
        ("4", "1", 4),
        ("5", "1", 5),
        ("6", "1", 6),
        ("7", "2", 4),
        ("8", "2", 5),
        ("9", "3", 4),
        ("10", "4", 5),
        ("11", "9", 10),
    ]
    utterances = [
        Utterance(id=utt_id, reply_to=parent_id, root="0",
                  speaker=Speaker(id="alice"), timestamp=timestamp)
        for utt_id, parent_id, timestamp in tree_rows
    ]
    utterances.append(
        Utterance(id="other", reply_to=None, root="other",
                  speaker=Speaker(id="alice"), timestamp=99)
    )

    self.corpus = Corpus(utterances=utterances)
    self.corpus.get_conversation("0").meta['hey'] = 'jude'
    self.corpus.meta['foo'] = 'bar'