def get_scores(self, corpus: Corpus, selector: Optional[Callable[[], bool]] = None):
    """Calculate the average number of marker occurrences per utterance.

    Used in summarize().

    :param corpus: the corpus used to compute averages
    :param selector: optional function which takes in an utterance's metadata
        and returns a boolean; only utterances for which it returns True are
        included in the averages
    :return: dict mapping marker name to its average count per utterance
    :raises ValueError: if the corpus has no utterances or no utterance
        matches the selector
    """
    utt_ids = corpus.get_utterance_ids()
    utts = [corpus.get_utterance(x) for x in utt_ids]
    if not utts:
        raise ValueError("Corpus contains no utterances")

    # Make sure marker annotations exist; re-fetch the utterances afterwards
    # in case transform() returned a corpus distinct from the one passed in
    # (the original kept references into the pre-transform corpus).
    if self.MRKR_NAME not in utts[0].meta:
        corpus = self.transform(corpus, markers=True)
        utts = [corpus.get_utterance(x) for x in utt_ids]

    if selector is not None:
        utts = [utt for utt in utts if selector(utt.meta)]
        if not utts:
            raise ValueError("No query matches")

    # Marker keys carry a fixed 21-character prefix and 2-character suffix
    # (presumably "feature_politeness_==NAME==" style — TODO confirm);
    # strip both to recover the bare marker name.
    counts = {k[21:len(k) - 2]: 0 for k in utts[0].meta[self.MRKR_NAME].keys()}
    for utt in utts:
        for k, v in utt.meta[self.MRKR_NAME].items():
            counts[k[21:len(k) - 2]] += len(v)
    return {k: v / len(utts) for k, v in counts.items()}
def transform(self, corpus: Corpus) -> Corpus:
    """Computes the average number of questions asked in a conversation.

    Stores the per-conversation average under self.ATTR_NAME in each
    conversation's metadata.

    :param corpus: the corpus to compute features for.
    :type corpus: Corpus
    :return: the corpus, with conversation metadata updated
    """
    if self.verbose:
        print("Finding questions per utterance")
    # Count questions per utterance, keyed by utterance id. The original
    # indexed a positional list with the raw ids, which is only correct when
    # ids happen to be the integers 0..n-1; a dict works for any id type.
    questions = {}
    for utter_id in corpus.get_utterance_ids():
        text = corpus.get_utterance(utter_id).text
        # a run of consecutive '?'s counts as a single question
        questions[utter_id] = len(re.findall(r'\?+', text))

    if self.verbose:
        print("Finding questions per conversation")
    for convo_id in corpus.get_conversation_ids():
        convo = corpus.get_conversation(convo_id)
        avgquestion = np.mean([questions[uid] for uid in convo._utterance_ids])
        # adds average questions per conversation to conversation metadata
        convo._meta[self.ATTR_NAME] = avgquestion
    return corpus
def transform(self, corpus: Corpus) -> Corpus:
    """Computes the count of pause and hesitancy words for each utterance,
    then aggregates them for each conversation.

    Per-utterance counts go in utterance metadata under self.NAME1 (pause)
    and self.NAME2 (hesitancy); per-conversation averages go in conversation
    metadata under self.NAME3 and self.NAME4.

    :param corpus: the corpus to compute features for.
    :type corpus: Corpus
    :return: the corpus, with utterance and conversation metadata updated
    """
    if self.verbose:
        print("Finding counts of pause and hesitancy words...")
    # Sets give O(1) membership tests (the originals were lists).
    # NOTE(review): multi-token entries like 'uh huh' can never match a
    # single split token — kept for parity with the original word list.
    pause_words = {
        'um', 'umm', 'ummm', 'uh', 'uhh', 'uhhh', 'hm', 'hmm', 'hmmm',
        'er', 'err', 'uh huh', 'huh', 'mhm', 'mhmm', 'erm', '...', 'ah',
        'ahh', 'ahem', 'eh', 'ehh', 'ehhh', 'meh'
    }
    hesitant_words = {
        'maybe', 'not', 'sure', 'unsure', 'probably', 'well', 'okay',
        'like', 'actually', 'basically', 'seriously', 'totally',
        'literally', 'know', 'mean', 'guess', 'suppose', 'but',
        'something', 'so', 'wow', 'just', 'really', 'later', 'wait',
        'future', 'almost', 'slightly', 'perhaps', 'somehow', 'sort',
        'kind', 'little', 'somewhat', 'hey', 'alas', 'see', 'sounds',
        'ok', 'roughly', 'why', 'how', 'yep', 'yup', 'may', 'possibly',
        'might', 'could', 'doubt', 'skeptical', 'don\'t', 'won\'t', 'nah'
    }
    # One-pass punctuation removal (same character set as the original).
    strip_punct = str.maketrans('', '', "!.:?',\"@#$%^&*()-~`_+=><[]{}")

    # Counts keyed by utterance id. The original indexed positional lists
    # with the raw ids, which is only correct when ids happen to be the
    # integers 0..n-1; dicts work for any id type.
    pause = {}
    hesitancy = {}
    for utter_id in corpus.get_utterance_ids():
        utt = corpus.get_utterance(utter_id)
        textlist = utt.text.translate(strip_punct).split()
        npause = sum(1 for w in textlist if w in pause_words)
        nhesitant = sum(1 for w in textlist if w in hesitant_words)
        pause[utter_id] = npause
        hesitancy[utter_id] = nhesitant
        utt.meta[self.NAME1] = npause
        utt.meta[self.NAME2] = nhesitant

    for convo_id in corpus.get_conversation_ids():
        convo = corpus.get_conversation(convo_id)
        uids = convo._utterance_ids
        # adds average pause/hesitancy counts to conversation metadata
        convo._meta[self.NAME3] = np.mean([pause[u] for u in uids])
        convo._meta[self.NAME4] = np.mean([hesitancy[u] for u in uids])
    return corpus
def transform(self, corpus: Corpus) -> Corpus:
    """Adds the ARI score to the metadata table of each utterance in the corpus.

    :return: corpus, modified with ARI and Flesch-Kincaid grade level scores
        assigned to each utterance
    """
    # Annotate every utterance with both readability scores.
    for utt_id in corpus.get_utterance_ids():
        utterance = corpus.get_utterance(utt_id)
        utterance.meta['ARI'] = ARI(utterance.text)
        utterance.meta['Flesch-Kincaid'] = Flesch_Kincaid(utterance.text)
    return corpus
def transform(self, corpus: Corpus) -> Corpus:
    """Runs the SpaCy parser on each utterance in the corpus, and adds the
    parses to the utterance metadata table.

    :return: corpus, modified with parses assigned to each utterance
    """
    utt_ids = corpus.get_utterance_ids()
    texts = (corpus.get_utterance(utt_id).text for utt_id in utt_ids)
    # If the user specifies multithreading, enable parallelized parsing via
    # spacy's pipe(); otherwise parse each text sequentially.
    if self.n_threads == 1:
        parses = map(self.spacy_nlp, texts)
    else:
        parses = self.spacy_nlp.pipe(texts, n_threads=self.n_threads)
    # attach each (tensor-stripped) parse to its utterance's metadata
    for utt_id, parse in zip(utt_ids, parses):
        corpus.get_utterance(utt_id).meta[self.ATTR_NAME] = _remove_tensor(parse)
    return corpus