Code Example #1
    def get_scores(self,
                   corpus: Corpus,
                   selector: Optional[Callable[[dict], bool]] = None):
        """
        Calculates the average number of marker occurrences per utterance. Used in summarize().

        :param corpus: the corpus used to compute averages
        :param selector: function that takes utterance metadata and returns a
            boolean, used to filter which utterances are scored.
        """

        utts = [corpus.get_utterance(x) for x in corpus.get_utterance_ids()]

        # make sure the marker annotations exist before scoring
        if self.MRKR_NAME not in utts[0].meta:
            corpus = self.transform(corpus, markers=True)

        if selector is not None:
            utts = [x for x in utts if selector(x.meta)]
            if len(utts) == 0:
                raise ValueError("No utterances match the selector")

        # marker keys are wrapped (presumably "feature_politeness_==Name=="),
        # so slice off the 21-character prefix and the trailing "=="
        counts = {
            k[21:-2]: 0
            for k in utts[0].meta[self.MRKR_NAME].keys()
        }

        for utt in utts:
            for k, v in utt.meta[self.MRKR_NAME].items():
                counts[k[21:-2]] += len(v)
        scores = {k: v / len(utts) for k, v in counts.items()}
        return scores
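
A brief usage sketch for get_scores. The instance name `ps` and the "is_question" metadata key are assumptions for illustration; the source shows only the method itself:

    # hypothetical usage: average marker counts over question utterances only;
    # `ps` is an instance of whatever class defines get_scores above, and
    # "is_question" is an assumed metadata key
    scores = ps.get_scores(
        corpus, selector=lambda meta: meta.get("is_question", False))
    for marker, avg in scores.items():
        print(f"{marker}: {avg:.3f} occurrences per utterance")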
Code Example #2
    def transform(self, corpus: Corpus) -> Corpus:
        """Computes the average number of questions asked in each conversation.

        :param corpus: the corpus to compute features for.
        :type corpus: Corpus
        """

        if self.verbose: print("Finding questions per utterance")

        # map utterance ids to question counts so conversations can look
        # counts up by id (ids need not be integer positions in a list)
        questions = {}
        for utter_id in corpus.get_utterance_ids():
            text = corpus.get_utterance(utter_id).text
            # a run of one or more '?' counts as a single question
            questions[utter_id] = len(re.findall(r'\?+', text))

        if self.verbose: print("Finding questions per conversation")
        for convo_id in corpus.get_conversation_ids():
            convo = corpus.get_conversation(convo_id)
            # average questions per utterance, stored in conversation metadata
            convo.meta[self.ATTR_NAME] = np.mean(
                [questions[uid] for uid in convo.get_utterance_ids()])

        return corpus
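
A usage sketch; the instantiation is an assumption (the source shows only transform, and ATTR_NAME is not defined in this snippet):

    # hypothetical usage; `qt` is an instance of whatever class defines the
    # transform above, with ATTR_NAME assumed to be "avg-questions"
    corpus = qt.transform(corpus)
    for convo_id in corpus.get_conversation_ids():
        convo = corpus.get_conversation(convo_id)
        print(convo_id, convo.meta["avg-questions"])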
Code Example #3
    def transform(self, corpus: Corpus) -> Corpus:
        """Computes the count of pause and hesitancy words for each utterance,
        then aggregates the counts for each conversation.

        :param corpus: the corpus to compute features for.
        :type corpus: Corpus
        """

        if self.verbose:
            print("Finding counts of pause and hesitancy words...")

        # note: entries like 'uh huh' and '...' cannot match the single-token,
        # punctuation-stripped comparison below
        pause_words = [
            'um', 'umm', 'ummm', 'uh', 'uhh', 'uhhh', 'hm', 'hmm', 'hmmm',
            'er', 'err', 'uh huh', 'huh', 'mhm', 'mhmm', 'erm', '...', 'ah',
            'ahh', 'ahem', 'eh', 'ehh', 'ehhh', 'meh'
        ]
        hesitant_words = [
            'maybe', 'not', 'sure', 'unsure', 'probably', 'well', 'okay',
            'like', 'actually', 'basically', 'seriously', 'totally',
            'literally', 'know', 'mean', 'guess', 'suppose', 'but',
            'something', 'so', 'wow', 'just', 'really', 'later', 'wait',
            'future', 'almost', 'slightly', 'perhaps', 'somehow', 'sort',
            'kind', 'little', 'somewhat', 'hey', 'alas', 'see', 'sounds', 'ok',
            'roughly', 'why', 'how', 'yep', 'yup', 'may', 'possibly', 'might',
            'could', 'doubt', 'skeptical', 'don\'t', 'won\'t', 'nah'
        ]

        # strip punctuation before tokenizing; apostrophes are kept so that
        # contractions like "don't" can still match the word lists
        strip_punct = str.maketrans("", "", "!.:?,\"@#$%^&*()-~`_+=><[]{}")

        # map utterance ids to counts so conversations can aggregate by id
        pause = {}
        hesitancy = {}
        for utter_id in corpus.get_utterance_ids():
            utt = corpus.get_utterance(utter_id)
            # lowercase so capitalized forms such as "Um" are still counted
            textlist = utt.text.lower().translate(strip_punct).split()
            npause = sum(1 for w in textlist if w in pause_words)
            nhesitant = sum(1 for w in textlist if w in hesitant_words)
            pause[utter_id] = npause  # pause words in this utterance
            hesitancy[utter_id] = nhesitant  # hesitant words in this utterance
            utt.meta[self.NAME1] = npause
            utt.meta[self.NAME2] = nhesitant

        for convo_id in corpus.get_conversation_ids():
            convo = corpus.get_conversation(convo_id)
            convo_utters = convo.get_utterance_ids()
            convo.meta[self.NAME3] = np.mean(
                [pause[uid] for uid in convo_utters])
            convo.meta[self.NAME4] = np.mean(
                [hesitancy[uid] for uid in convo_utters])

        return corpus
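
A usage sketch; the instantiation is an assumption, and NAME1 through NAME4 are not defined in this snippet:

    # hypothetical usage; `ph` is an instance of whatever class defines the
    # transform above
    corpus = ph.transform(corpus)
    utt = corpus.get_utterance(corpus.get_utterance_ids()[0])
    print(utt.meta)  # includes the per-utterance pause and hesitancy counts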
Code Example #4
    def transform(self, corpus: Corpus) -> Corpus:
        """Adds ARI and Flesch-Kincaid grade level scores to the metadata
        table of each utterance in the corpus.

        :return: corpus, modified with ARI and Flesch-Kincaid grade level scores assigned to each utterance
        """
        for utt_id in corpus.get_utterance_ids():
            # add both readability scores to each utterance's metadata
            utt = corpus.get_utterance(utt_id)
            utt.meta['ARI'] = ARI(utt.text)
            utt.meta['Flesch-Kincaid'] = Flesch_Kincaid(utt.text)
        return corpus
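
The helpers ARI and Flesch_Kincaid are not shown in the source. A minimal sketch of the standard formulas, assuming whitespace tokenization, a naive sentence split, and a crude vowel-group syllable heuristic:

    import re

    def ARI(text: str) -> float:
        # Automated Readability Index:
        # 4.71*(chars/words) + 0.5*(words/sentences) - 21.43
        words = text.split()
        sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()]
        if not words or not sentences:
            return 0.0
        chars = sum(len(w) for w in words)
        return (4.71 * (chars / len(words))
                + 0.5 * (len(words) / len(sentences)) - 21.43)

    def Flesch_Kincaid(text: str) -> float:
        # Flesch-Kincaid grade level:
        # 0.39*(words/sentences) + 11.8*(syllables/words) - 15.59
        words = text.split()
        sentences = [s for s in re.split(r'[.!?]+', text) if s.strip()]
        if not words or not sentences:
            return 0.0
        # rough syllable estimate: runs of consecutive vowels per word
        syllables = sum(max(1, len(re.findall(r'[aeiouy]+', w.lower())))
                        for w in words)
        return (0.39 * (len(words) / len(sentences))
                + 11.8 * (syllables / len(words)) - 15.59)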
Code Example #5
File: parser.py  Project: calebchiam/cs6742-fork
    def transform(self, corpus: Corpus) -> Corpus:
        """Runs the SpaCy parser on each utterance in the corpus, and adds the 
        parses to the utterance metadata table.

        :return: corpus, modified with parses assigned to each utterance
        """
        utt_ids = corpus.get_utterance_ids()
        # if the user specifies multithreading, we will enable parallelized parsing
        # using spacy.pipe. Otherwise we will operate sequentially.
        if self.n_threads == 1:
            spacy_iter = (self.spacy_nlp(corpus.get_utterance(utt_id).text)
                          for utt_id in utt_ids)
        else:
            spacy_iter = self.spacy_nlp.pipe(
                (corpus.get_utterance(utt_id).text for utt_id in utt_ids),
                n_threads=self.n_threads)
        # add the spacy parses to the utterance metadata
        for utt_id, parsed in zip(utt_ids, spacy_iter):
            corpus.get_utterance(utt_id).meta[self.ATTR_NAME] = _remove_tensor(parsed)
        return corpus
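
The helper _remove_tensor is not defined in this snippet. Presumably it strips the spaCy Doc's token-vector tensor so the stored parse stays compact; a minimal sketch under that assumption:

    import numpy as np

    def _remove_tensor(doc):
        # assumption about the unshown helper: drop the (large) tensor of
        # token vectors before the parse is stored in utterance metadata
        doc.tensor = np.array([], dtype="float32")
        return doc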