コード例 #1
0
    def transform(self, corpus: Corpus) -> Corpus:
        """Computes the average number of questions asked per utterance in each conversation.

        A "question" is a maximal run of one or more ``?`` characters in an
        utterance's text, so ``"Really??"`` counts as one question. The mean
        count over a conversation's utterances is written to that
        conversation's metadata under ``self.ATTR_NAME``.

        :param corpus: the corpus to compute features for.
        :type corpus: Corpus
        :return: the same corpus object, with conversation metadata updated in place.
        """

        if self.verbose: print("Finding questions per utterance")

        # Key the counts by utterance id rather than by list position:
        # utterance ids are not guaranteed to be the integers 0..n-1, so they
        # must not be used as array indices when aggregating below.
        questions_by_utt = {
            utter_id: len(re.findall(r'\?+', corpus.get_utterance(utter_id).text))
            for utter_id in corpus.get_utterance_ids()
        }

        if self.verbose: print("Finding questions per conversation")
        for convo_id in corpus.get_conversation_ids():
            convo = corpus.get_conversation(convo_id)
            counts = [questions_by_utt[utter_id]
                      for utter_id in convo._utterance_ids]
            # average questions per utterance, stored on the conversation
            convo._meta[self.ATTR_NAME] = np.mean(counts)

        return corpus
コード例 #2
0
    def transform(self, corpus: Corpus) -> Corpus:
        """Computes the count of pause and hesitancy words for each utterance, then aggregates them for each conversation

        Each utterance's text is stripped of a fixed set of punctuation
        characters, split on whitespace, and every token is compared
        (case-sensitively) against the pause and hesitancy word lists. The
        per-utterance counts are stored in the utterance metadata under
        ``self.NAME1`` (pause) and ``self.NAME2`` (hesitancy); their means over
        each conversation's utterances are stored in the conversation metadata
        under ``self.NAME3`` and ``self.NAME4``.

        :param corpus: the corpus to compute features for.
        :type corpus: Corpus
        :return: the same corpus object, with metadata updated in place.
        """

        if self.verbose:
            print("Finding counts of pause and hesitancy words...")

        pause_words = [
            'um', 'umm', 'ummm', 'uh', 'uhh', 'uhhh', 'hm', 'hmm', 'hmmm',
            'er', 'err', 'uh huh', 'huh', 'mhm', 'mhmm', 'erm', '...', 'ah',
            'ahh', 'ahem', 'eh', 'ehh', 'ehhh', 'meh'
        ]
        hesitant_words = [
            'maybe', 'not', 'sure', 'unsure', 'probably', 'well', 'okay',
            'like', 'actually', 'basically', 'seriously', 'totally',
            'literally', 'know', 'mean', 'guess', 'suppose', 'but',
            'something', 'so', 'wow', 'just', 'really', 'later', 'wait',
            'future', 'almost', 'slightly', 'perhaps', 'somehow', 'sort',
            'kind', 'little', 'somewhat', 'hey', 'alas', 'see', 'sounds', 'ok',
            'roughly', 'why', 'how', 'yep', 'yup', 'may', 'possibly', 'might',
            'could', 'doubt', 'skeptical', 'don\'t', 'won\'t', 'nah'
        ]

        # Characters removed from the text before it is split into tokens.
        stripped_chars = ('!', '.', ':', '?', '\'', ',', '\"', '@', '#',
                          '$', '%', '^', '&', '*', '(', ')', '-', '~', '`',
                          '_', '+', '=', '>', '<', '[', ']', '{', '}')
        strip_table = str.maketrans('', '', ''.join(stripped_chars))

        # Normalise the lexicons exactly like the text. Without this, entries
        # containing an apostrophe ("don't", "won't") could never match,
        # because the apostrophe has already been removed from the tokens.
        # NOTE: '...' and 'uh huh' still cannot equal a single token (periods
        # are stripped and tokens never contain whitespace); 'uh' and 'huh'
        # are counted individually.
        pause_lexicon = {word.translate(strip_table) for word in pause_words}
        hesitant_lexicon = {
            word.translate(strip_table) for word in hesitant_words
        }

        # Counts are keyed by utterance id rather than by list position:
        # utterance ids are not guaranteed to be the integers 0..n-1, so they
        # must not be used as array indices when aggregating below.
        pause_by_utt = {}
        hesitancy_by_utt = {}
        for utter_id in corpus.get_utterance_ids():
            utterance = corpus.get_utterance(utter_id)
            tokens = utterance.text.translate(strip_table).split()
            npause = sum(1 for token in tokens if token in pause_lexicon)
            nhesitant = sum(1 for token in tokens if token in hesitant_lexicon)
            pause_by_utt[utter_id] = npause
            hesitancy_by_utt[utter_id] = nhesitant
            utterance.meta[self.NAME1] = npause
            utterance.meta[self.NAME2] = nhesitant

        for convo_id in corpus.get_conversation_ids():
            convo = corpus.get_conversation(convo_id)
            utter_ids = convo._utterance_ids
            convo._meta[self.NAME3] = np.mean(
                [pause_by_utt[utter_id] for utter_id in utter_ids])
            convo._meta[self.NAME4] = np.mean(
                [hesitancy_by_utt[utter_id] for utter_id in utter_ids])

        return corpus