コード例 #1
0
    def _get_context_reply_label_dict(self,
                                      corpus: Corpus,
                                      convo_selector,
                                      utt_excluder,
                                      include_label=True):
        """
        Returns a dict mapping reply id to (context, reply, label).

        Dialogs are gathered according to ``self.convo_structure``: every
        root-to-leaf path of a conversation for "branched", the chronological
        utterance list for "linear". Utterances for which ``utt_excluder``
        returns a truthy value are removed first, and dialogs left with fewer
        than two utterances are skipped because they cannot supply both a
        context and a reply.

        With ``self.use_last_only`` only the final utterance of each dialog
        gets an entry; otherwise every utterance after the first one does.

        :param corpus: Corpus whose conversations are read
        :param convo_selector: selector passed to ``corpus.iter_conversations``
        :param utt_excluder: function taking an utterance; a truthy result
            drops that utterance from the dialog
        :param include_label: if True the label is ``self.label_func`` applied
            to the reply utterance, otherwise it is None
        :return: dict from utterance id to ``(context, reply, label)`` where
            ``context`` is the list of ``self.text_func`` outputs for the
            preceding utterances and ``reply`` is ``self.text_func`` of the
            utterance itself
        :raises ValueError: propagated from ``get_root_to_leaf_paths`` unless
            ``self.skip_broken_convos`` is truthy
        """
        dialogs = []
        if self.convo_structure == "branched":
            for convo in corpus.iter_conversations(convo_selector):
                try:
                    for path in convo.get_root_to_leaf_paths():
                        path = [utt for utt in path if not utt_excluder(utt)]
                        # "< 2" (not "== 1"): a path emptied by the excluder
                        # must be skipped too, or dialog[-1] below raises.
                        if len(path) < 2:
                            continue
                        dialogs.append(path)
                except ValueError:
                    if not self.skip_broken_convos:
                        raise

        elif self.convo_structure == "linear":
            for convo in corpus.iter_conversations(convo_selector):
                utts = convo.get_chronological_utterance_list(
                    selector=lambda x: not utt_excluder(x))
                if len(utts) < 2:
                    continue
                dialogs.append(utts)

        id_to_context_reply_label = dict()

        if self.forecast_mode == 'future':
            # NOTE(review): every key seeded here (the leaf id of each dialog)
            # is rewritten by the loop below, so these raw-utterance tuples
            # never reach the caller; only the key insertion order survives.
            # Kept as-is pending a decision on the intended 'future' output.
            for dialog in dialogs:
                id_to_context_reply_label[dialog[-1].id] = (dialog, dialog[-1],
                                                            None)

        for dialog in dialogs:
            if self.use_last_only:
                reply = self.text_func(dialog[-1])
                context = [self.text_func(utt) for utt in dialog[:-1]]
                label = self.label_func(dialog[-1]) if include_label else None
                id_to_context_reply_label[dialog[-1].id] = (context, reply,
                                                            label)
            else:
                for idx in range(1, len(dialog)):
                    reply = self.text_func(dialog[idx])
                    label = self.label_func(
                        dialog[idx]) if include_label else None
                    reply_id = dialog[idx].id
                    context = [self.text_func(utt) for utt in dialog[:idx]]
                    # label is already None when include_label is False
                    id_to_context_reply_label[reply_id] = (context, reply,
                                                           label)

        return id_to_context_reply_label
コード例 #2
0
 def summarize(self, corpus: Corpus, use_selector=True, exclude_na=True):
     """
     Collect per-utterance forecasts into a DataFrame.

     Each row holds an utterance id together with the utterance meta values
     stored under ``self.forecast_feat_name`` and
     ``self.forecast_prob_feat_name``. The frame is indexed by 'utt_id' and
     sorted by forecast probability, highest first.

     :param corpus: target Corpus
     :param use_selector: if True, restrict to conversations / utterances accepted by
         ``self.convo_selector_func`` / ``self.utt_selector_func``; otherwise use every utterance
     :param exclude_na: whether to drop rows containing NaN results
     :return: a pandas DataFrame
     """
     if use_selector:
         utterances = (
             utt
             for convo in corpus.iter_conversations(self.convo_selector_func)
             for utt in convo.iter_utterances(self.utt_selector_func))
     else:
         utterances = corpus.iter_utterances()

     rows = [(utt.id, utt.meta[self.forecast_feat_name],
              utt.meta[self.forecast_prob_feat_name]) for utt in utterances]

     column_names = [
         "utt_id", self.forecast_feat_name, self.forecast_prob_feat_name
     ]
     forecast_df = pd.DataFrame(rows, columns=column_names)
     forecast_df = forecast_df.set_index('utt_id')
     forecast_df = forecast_df.sort_values(self.forecast_prob_feat_name,
                                           ascending=False)
     return forecast_df.dropna() if exclude_na else forecast_df
コード例 #3
0
    def retrieve_feats(self, corpus: Corpus) -> Dict[str, Dict]:
        """
        Compute hypergraph features for every sufficiently long conversation.

        Conversations with fewer than ``self.min_thread_len`` utterances are
        skipped. For the rest, the first ``self.prefix_len`` utterances (in
        chronological order) are turned into two hypergraphs: one over the
        whole prefix and one over the prefix without its first utterance.
        Degree and motif features of both graphs are merged into one dict.

        :param corpus: Corpus whose conversations are processed
        :return: dict from conversation id to its feature dict (feature name
            to feature value)
        """
        threads_stats = {}

        for convo in corpus.iter_conversations():
            chronological = convo.get_chronological_utterance_list()
            if len(chronological) < self.min_thread_len:
                continue

            prefix = chronological[:self.prefix_len]
            full_graph = Hypergraph.init_from_utterances(utterances=prefix)
            # dropping the first utterance excludes the root
            mid_graph = Hypergraph.init_from_utterances(utterances=prefix[1:])

            # Later updates win on name clashes, same as sequential assignment.
            stats = {}
            stats.update(HyperConvo._degree_feats(graph=full_graph))
            stats.update(HyperConvo._motif_feats(graph=full_graph))
            stats.update(
                HyperConvo._degree_feats(graph=mid_graph,
                                         name_ext="mid-thread "))
            stats.update(
                HyperConvo._motif_feats(graph=mid_graph,
                                        name_ext=" over mid-thread"))

            threads_stats[convo.id] = stats
        return threads_stats
コード例 #4
0
    def transform(
        self,
        corpus: Corpus,
        selector: Optional[Callable[[Conversation], bool]] = lambda convo: True
    ) -> Corpus:
        """
        Retrieves features from the Corpus Conversations using retrieve_feats() and stores them as a
        Corpus vector matrix named ``self.vector_name``, registering that vector on each selected Conversation.

        :param corpus: Corpus object to retrieve feature information from
        :param selector: a (lambda) function that takes a Conversation and returns True / False; function selects
            conversations to be annotated with hypergraph features. By default (and when None is passed),
            all conversations will be annotated.
        :return: the same corpus, with the vector matrix set and ``self.vector_name`` added to the selected conversations.
        """
        # The annotation admits None; treat it as "select everything" instead
        # of forwarding None to retrieve_feats / iter_conversations.
        if selector is None:
            selector = lambda convo: True

        convo_id_to_feats = self.retrieve_feats(corpus, selector)
        # Rows are conversation ids, columns are feature names.
        df = pd.DataFrame(convo_id_to_feats).T
        corpus.set_vector_matrix(name=self.vector_name,
                                 ids=list(df.index),
                                 columns=list(df.columns),
                                 matrix=csr_matrix(
                                     df.values.astype('float64')))

        for convo in corpus.iter_conversations(selector):
            convo.add_vector(self.vector_name)
        return corpus
コード例 #5
0
    def summarize(self,
                  corpus: Corpus,
                  selector: Callable[[Conversation],
                                     bool] = lambda convo: True,
                  ignore_utterances: Callable[[Utterance],
                                              bool] = lambda utt: False,
                  exclude_na=True):
        """
        Collect per-utterance forecasts (and forecast probabilities) into a DataFrame.

        The frame is indexed by 'utt_id' and sorted by forecast probability, highest first.

        :param corpus: target Corpus
        :param exclude_na: whether to drop NaN results
        :param selector: a (lambda) function that takes a Conversation and returns a bool: True if the Conversation is to be included in the summary step. By default, includes all Conversations.
        :param ignore_utterances: a (lambda) function that takes an Utterance and returns a bool: True if the Utterance should be excluded from the Conversation in the summary step. By default, all Utterances are included.
        :return: a pandas DataFrame
        """

        def is_kept(utt):
            return not ignore_utterances(utt)

        rows = [(utt.id, utt.meta[self.forecast_feat_name],
                 utt.meta[self.forecast_prob_feat_name])
                for convo in corpus.iter_conversations(selector)
                for utt in convo.iter_utterances(is_kept)]

        column_names = [
            "utt_id", self.forecast_feat_name, self.forecast_prob_feat_name
        ]
        forecast_df = pd.DataFrame(rows, columns=column_names)
        forecast_df = forecast_df.set_index('utt_id')
        forecast_df = forecast_df.sort_values(self.forecast_prob_feat_name,
                                              ascending=False)
        return forecast_df.dropna() if exclude_na else forecast_df
コード例 #6
0
    def transform(self, corpus: Corpus):
        """
        Annotate every conversation with a 'reciprocity' meta value.

        Within one conversation, a pair of users is "reciprocal" when each
        has replied to the other and "one-sided" when only one of them has.
        The stored value is reciprocal / (reciprocal + one-sided), or 0 when
        no pair of users interacted at all.

        :param corpus: the Corpus to annotate
        :return: the same corpus, modified in place
        """
        for convo in corpus.iter_conversations():
            participants = list(convo.iter_users())

            # Build reply targets from this conversation's utterances only.
            # Going through user.iter_utterances() would also pick up replies
            # the user made elsewhere (assumes it is corpus-wide, as in
            # ConvoKit - TODO confirm for this version).
            user_to_targets = {user.name: set() for user in participants}
            for utt in convo.iter_utterances():
                if utt.reply_to is None:
                    continue
                target_name = corpus.get_utterance(utt.reply_to).user.name
                user_to_targets.setdefault(utt.user.name,
                                           set()).add(target_name)

            reciprocal = 0
            onesided = 0
            for user1, user2 in combinations(participants, 2):
                user1_to_user2 = user2.name in user_to_targets[user1.name]
                user2_to_user1 = user1.name in user_to_targets[user2.name]

                if user1_to_user2 and user2_to_user1:
                    reciprocal += 1
                elif user1_to_user2 or user2_to_user1:
                    onesided += 1

            interacting_pairs = reciprocal + onesided
            if interacting_pairs == 0:
                reciprocity_pct = 0
            else:
                reciprocity_pct = reciprocal / interacting_pairs

            convo.add_meta('reciprocity', reciprocity_pct)
        return corpus
	def transform(self, corpus: Corpus):
		"""
		Annotate utterances and conversations with 'vocabulary_overlap'.

		Per conversation, a token-count table is kept for each username
		(English stop words removed). After each utterance is counted, the
		overlap of the tables so far is stored on that utterance; once all
		utterances are counted, the final overlap is stored on the
		conversation. Each stored value is {'vocab': ..., 'ratio': ...} as
		returned by self._compute_overlap.

		:param corpus: the Corpus to annotate
		:return: the same corpus, modified in place
		"""
		english_stop_words = set(stopwords.words('english'))

		# overlap of vocabulary is a conversation-level metric
		for convo in corpus.iter_conversations():
			vocabs = {name: defaultdict(int) for name in convo.get_usernames()}

			for utt in convo.iter_utterances():
				# Tokenize via NLTK tokenizer, then count non-stop-word tokens
				kept_tokens = [
					tok for tok in self._tokenize_utt(utt.text)
					if tok not in english_stop_words
				]
				for tok in kept_tokens:
					vocabs[utt.user.name][tok] += 1

				# Running overlap, as of this utterance
				running_vocab, running_ratio = self._compute_overlap(vocabs)
				utt.add_meta('vocabulary_overlap', {'vocab': running_vocab, 'ratio': running_ratio})

			# Overlap over the whole conversation
			final_vocab, final_ratio = self._compute_overlap(vocabs)
			convo.add_meta('vocabulary_overlap', {'vocab': final_vocab, 'ratio': final_ratio})

		return corpus
コード例 #8
0
    def fit_transform(self, corpus: Corpus) -> Corpus:
        """
        Embed per-thread "hyperconvo" statistics and store the result as corpus meta.

        :param corpus: the Corpus to use; its conversations must carry a
             "hyperconvo" meta entry mapping thread root ids to feature dicts

        :return: Modifies and returns corpus with new meta key: "threadEmbedder",
             value: Dict, containing "X": an array with rows corresponding
             to embedded threads, "roots": an array whose ith entry is the
             thread root id of the ith row of X. If return_components is True
             and the fitted embedder exposes ``components_`` (the SVD case),
             then the Dict contains a third key "components": the SVD components array
        :raises RuntimeError: if the corpus has no conversations or the
             "hyperconvo" meta entry is missing
        :raises ValueError: if ``self.norm_method`` or ``self.method`` is not recognised
        """
        # Materialise once: the conversations are inspected and then iterated
        # in full. Peeking with next() on a one-shot iterator would silently
        # drop the first conversation from the statistics.
        convos = list(corpus.iter_conversations())
        if not convos or "hyperconvo" not in convos[0].meta:
            raise RuntimeError(
                "Missing thread statistics: HyperConvo.fit_transform() must be run on the Corpus first"
            )

        thread_stats = dict()
        for convo in convos:
            thread_stats.update(convo.meta["hyperconvo"])

        X = []
        roots = []
        for root, feats in thread_stats.items():
            roots.append(root)
            # Features are ordered by name so columns line up across threads;
            # NaN / infinite values are replaced by 0.
            row = np.array([
                value if np.isfinite(value) else 0
                for _, value in sorted(feats.items())
            ])
            X.append(row)
        X = np.array(X)

        norm_method = self.norm_method.lower()
        if norm_method == "standard":
            X = StandardScaler().fit_transform(X)
        elif norm_method != "none":
            raise ValueError("Invalid embed_feats normalization method")

        method = self.method.lower()
        if method == "svd":
            f = TruncatedSVD
        elif method == "tsne":
            f = TSNE
        else:
            raise ValueError("Invalid embed_feats embedding method")

        emb = f(n_components=self.n_components)
        X_mid = emb.fit_transform(X)

        # Only the SVD estimator provides singular values / components;
        # reading them unconditionally made the "tsne" method unusable.
        singular_values = getattr(emb, "singular_values_", None)
        if singular_values is not None:
            X_mid = X_mid / singular_values

        retval = {"X": X_mid, "roots": roots}
        if self.return_components:
            components = getattr(emb, "components_", None)
            if components is not None:
                retval["components"] = components

        corpus.add_meta("threadEmbedder", retval)
        return corpus
コード例 #9
0
    def transform(self, corpus: Corpus) -> Corpus:
        """Annotate each conversation with one randomly chosen reply chain.

        For every conversation, one utterance whose ``post_depth`` meta equals
        2 is picked at random. Starting from it, ``reply_to`` links are
        followed upwards while the current utterance has ``post_depth`` > 0
        and its parent id is present in ``corpus.utterances``.

        Two conversation meta fields are written:

        * ``'chain'``: the visited utterance ids, topmost ancestor first and
          the chosen utterance last;
        * ``'chain_tox'``: the ``toxicity`` meta values of those utterances,
          in the same order.

        Conversations without any depth-2 utterance get None for both fields.

        :param corpus: the Corpus to transform

        :return: the same Corpus, modified in place
        """
        for convo in corpus.iter_conversations():
            depth_two_ids = [
                utt.id for utt in convo.iter_utterances()
                if utt.meta['post_depth'] == 2
            ]

            if not depth_two_ids:
                convo.add_meta('chain', None)
                # Keep the schema uniform so readers of 'chain_tox' do not
                # hit a missing key on chain-less conversations.
                convo.add_meta('chain_tox', None)
                continue

            # A single random draw selects the chain's last utterance.
            uttid = random.choice(depth_two_ids)
            utt = convo.get_utterance(uttid)
            chosen_chain = [uttid]
            chosen_chain_tox = [utt.meta['toxicity']]

            # Walk towards the root; stop early on a dangling reply_to.
            while utt.meta['post_depth'] > 0:
                if utt.reply_to not in corpus.utterances:
                    break
                utt = convo.get_utterance(utt.reply_to)
                chosen_chain.append(utt.id)
                chosen_chain_tox.append(utt.meta['toxicity'])

            # Collected leaf-to-root; store root-to-leaf.
            chosen_chain.reverse()
            chosen_chain_tox.reverse()
            convo.add_meta('chain', chosen_chain)
            convo.add_meta('chain_tox', chosen_chain_tox)

        return corpus
コード例 #10
0
    def _get_context_reply_label_dict(self,
                                      corpus: Corpus,
                                      include_label=True):
        """
        Build a dict mapping each reply utterance id to (context, reply, label).

        Dialogs come from conversations accepted by ``self.convo_selector_func``:
        every root-to-leaf path for the "branched" structure, the chronological
        utterance list for "linear". Only utterances accepted by
        ``self.utt_selector_func`` are kept, and dialogs of exactly one
        utterance are skipped. Every utterance after the first in a dialog
        yields one entry.

        :param corpus: Corpus whose conversations are read
        :param include_label: if True the label is the reply's
            ``meta[self.label_feat]``, otherwise None
        :return: dict from utterance id to (context, reply, label), with
            context and reply produced by ``self.text_func``
        """
        structure = self.convo_structure
        threads = []

        if structure == "branched":
            for conversation in corpus.iter_conversations(
                    self.convo_selector_func):
                try:
                    for raw_path in conversation.get_root_to_leaf_paths():
                        kept = [
                            member for member in raw_path
                            if self.utt_selector_func(member)
                        ]
                        if len(kept) != 1:
                            threads.append(kept)
                except ValueError as err:
                    # broken conversations are tolerated only on request
                    if not self.skip_broken_convos:
                        raise err
        elif structure == "linear":
            for conversation in corpus.iter_conversations(
                    self.convo_selector_func):
                ordered = conversation.get_chronological_utterance_list(
                    selector=self.utt_selector_func)
                if len(ordered) != 1:
                    threads.append(ordered)

        result = {}
        for thread in threads:
            for position, current in enumerate(thread[1:], start=1):
                reply_text = self.text_func(current)
                if include_label:
                    label = current.meta[self.label_feat]
                else:
                    label = None
                preceding = [
                    self.text_func(earlier) for earlier in thread[:position]
                ]
                result[current.id] = (preceding, reply_text, label)

        return result
コード例 #11
0
    def transform(self, corpus: Corpus) -> Corpus:
        """Annotate utterances with 'toxicity' and conversations with 'averagetoxicity'.

        Scores come from ``self._get_toxicity`` when ``self.api_key`` is set;
        otherwise they are looked up by utterance id in the JSON file at
        ``self.toxicity_json_path``. Fetching through the API is rate limited
        and slow on large corpora, which is why a pre-fetched file is
        supported. Scores fetched through the API are written to
        ``self.toxicity_path_to_save`` when that path is set.

        :param corpus: the Corpus to transform

        :return: the same Corpus, modified in place
        :raises ValueError: if a score is needed but neither ``self.api_key``
            nor ``self.toxicity_json_path`` is configured
        """
        toxicity_scores_dict = None
        if self.toxicity_json_path:
            with open(self.toxicity_json_path, 'r', encoding='utf-8') as f:
                toxicity_scores_dict = json.load(f)

        scores_to_save = {}
        # list() lets tqdm know the total for its progress bar
        for convo in tqdm(list(corpus.iter_conversations())):

            convo_scores = 0
            count = 0
            for utt in convo.iter_utterances():
                if self.api_key:
                    utt_score = self._get_toxicity(utt.text)
                    scores_to_save[utt.id] = utt_score
                else:
                    if toxicity_scores_dict is None:
                        raise ValueError(
                            "No toxicity source configured: set api_key or toxicity_json_path"
                        )
                    utt_score = toxicity_scores_dict[utt.id]

                convo_scores += utt_score
                count += 1

                utt.add_meta('toxicity', utt_score)

            # A conversation without utterances has no average.
            average = convo_scores / count if count else None
            convo.add_meta('averagetoxicity', average)

        # Only write when something was fetched, so an existing score file
        # is not replaced by an empty dict when scores were merely loaded.
        if self.toxicity_path_to_save and scores_to_save:
            with open(self.toxicity_path_to_save, 'w', encoding='utf-8') as f:
                json.dump(scores_to_save, f)

        return corpus
コード例 #12
0
    def transform(self, corpus: Corpus):
        """
        Annotate every conversation with a 'damsl_score' meta value.

        The score is the mean (``np.mean``) of ``self.rubric`` values for all
        tags found in the conversation's utterances, where each utterance's
        ``meta['tag']`` is a sequence of pairs whose second element is the
        tag. Tags missing from the rubric are ignored.

        :param corpus: the Corpus to annotate
        :return: the same corpus, modified in place
        """
        for convo in corpus.iter_conversations():
            rubric_scores = []

            for utt in convo.iter_utterances():
                utt_tags = [tagged[1] for tagged in utt.meta['tag']]
                for utt_tag in utt_tags:
                    try:
                        tag_score = self.rubric[utt_tag]
                    except KeyError:
                        # tag not covered by the rubric
                        continue
                    rubric_scores.append(tag_score)

            convo.add_meta('damsl_score', np.mean(rubric_scores))

        return corpus
コード例 #13
0
    def transform(
        self,
        corpus: Corpus,
        selector: Optional[Callable[[Conversation], bool]] = lambda convo: True
    ) -> Corpus:
        """
        Retrieves features from the Corpus Conversations using retrieve_feats() and annotates Conversations with this feature set

        Features are computed by ``retrieve_feats(corpus)`` for the corpus as a whole; ``selector`` only
        controls which conversations receive the annotation.

        :param corpus: Corpus object to retrieve feature information from
        :param selector: a (lambda) function that takes a Conversation and returns True / False; function selects
            conversations to be annotated with hypergraph features. By default (and when None is passed),
            all conversations will be annotated.
        :return: corpus with the selected conversations having a new meta field ``self.feat_name`` containing the stats
            generated by retrieve_feats(), or None for a selected conversation that has no stats.
        """
        # The annotation admits None; treat it as "select everything".
        if selector is None:
            selector = lambda convo: True

        convo_id_to_feats = self.retrieve_feats(corpus)

        # Honour the selector: it was previously accepted but never applied.
        for convo in corpus.iter_conversations(selector):
            convo.add_meta(self.feat_name, convo_id_to_feats.get(convo.id))
        return corpus
コード例 #14
0
    def transform(self, corpus: Corpus):
        """
        Annotate utterances and conversations with token-balance measures.

        A "statement" is a maximal run of consecutive utterances, in the
        order of the conversation's ``_utterance_ids``, made by the same
        user. For every conversation:

        * the first utterance of each statement gets meta 'statement_len',
          the number of tokens in the whole statement;
        * the first utterance of every statement except the last gets meta
          'statement_balance', comparing its length with the next statement;
        * the conversation's ``_meta['conversation_balance']`` becomes a
          square numpy array comparing total token counts of each pair of
          users, ordered by first appearance;
        * the conversation's ``_usernames`` is set to that user order.

        The balance of two lengths a and b is 2 * min(a, b) / (a + b), and 1
        when both are zero.

        :param corpus: the Corpus to annotate
        :return: the same corpus, modified in place
        """

        def balance_ratio(len_a, len_b):
            # Two empty sides count as perfectly balanced; this also avoids
            # dividing by zero, which the user-level matrix used to do for
            # any user with no tokens (including on its own diagonal).
            total = len_a + len_b
            if total == 0:
                return 1
            return 2 * min(len_a, len_b) / total

        # This measure is calculated on a conversation-level and
        # statement-pair level
        for c in corpus.iter_conversations():

            user_tokens = {}  # user name -> total tokens in this conversation
            user_order = []  # user names by first appearance
            cur_user = None
            statement_root_id = None
            statement_roots = []
            for u in c._utterance_ids:
                utt = corpus.utterances[u]

                prev_user = cur_user
                cur_user = utt.user.name

                # The very first utterance always opens a statement, whatever
                # the user's name is; afterwards a change of user does.
                if statement_root_id is None or cur_user != prev_user:
                    statement_root_id = u
                    statement_roots.append(u)
                    utt.meta['statement_len'] = 0

                utt_tokens = self._tokenize_utt(utt.text)

                if cur_user not in user_tokens:  # New user
                    user_tokens[cur_user] = 0
                    user_order.append(cur_user)
                user_tokens[cur_user] += len(utt_tokens)
                corpus.utterances[statement_root_id].meta[
                    'statement_len'] += len(utt_tokens)

            # Statement-level balance between each statement and the next one
            for utt_id_cur, utt_id_next in zip(statement_roots,
                                               statement_roots[1:]):
                cur_len = corpus.utterances[utt_id_cur].meta['statement_len']
                next_len = corpus.utterances[utt_id_next].meta['statement_len']
                corpus.utterances[utt_id_cur].meta[
                    'statement_balance'] = balance_ratio(cur_len, next_len)

            # Conversation-level balance between every pair of users
            convo_balance = np.zeros((len(user_order), len(user_order)))
            for i, A in enumerate(user_order):
                for j, B in enumerate(user_order):
                    convo_balance[i, j] = balance_ratio(
                        user_tokens[A], user_tokens[B])
            c._meta['conversation_balance'] = convo_balance

            # Add the usernames to the conversation metadata
            c._usernames = user_order

        return (corpus)
コード例 #15
0
	def transform(self, corpus: Corpus):
		"""
		Annotate utterances with 'polarity' and conversations with 'initial_sentiment'.

		A "statement" is a run of consecutive utterances by the same user.

		* Every utterance gets meta 'polarity': the polarity scores of the
		  statement it belongs to (all tokens of the statement joined by spaces).
		* Every conversation gets meta 'initial_sentiment': a dict from
		  username to the polarity scores of that user's statements near the
		  start of the conversation. Statements are accumulated until a
		  speaker change occurs after more than 10% of the conversation's
		  tokens have been covered, then averaged per user.

		:param corpus: the Corpus to annotate
		:return: the same corpus, modified in place
		"""
		sid = SentimentIntensityAnalyzer()

		for convo in corpus.iter_conversations():

			users = convo.get_usernames()

			# Tokenize each utterance once (via NLTK tokenizer); both passes
			# below reuse these lists, so they are never mutated in place.
			utt_tokens = [
				(utt, self._tokenize_utt(utt.text))
				for utt in convo.iter_utterances()
			]

			# Pass 1: polarity scores of each statement, stored on all of
			# the statement's utterances
			curr_user = users[0] # Current user
			curr_statement = [] # Tokens of the current statement
			curr_utts = [] # Utterances of the current statement

			for utt, tokens in utt_tokens:
				if utt.user.name == curr_user: # Utterance belongs to current user
					curr_utts.append(utt)
					curr_statement.extend(tokens)
				else: # Utterance belongs to another user

					# Compute polarity scores for the finished statement
					scores = sid.polarity_scores(' '.join(curr_statement))
					for u in curr_utts:
						u.add_meta('polarity', scores)

					# Move on to next user
					curr_user = utt.user.name
					curr_statement = list(tokens)
					curr_utts = [utt]

			# flush out remaining scores
			scores = sid.polarity_scores(' '.join(curr_statement))
			for u in curr_utts:
				u.add_meta('polarity', scores)

			total_length = sum(len(tokens) for _, tokens in utt_tokens)

			# Pass 2: polarity score for the first 10% of the conversation
			initial_sentiment = {u:defaultdict(float) for u in users}
			curr_user = users[0] # Current user
			curr_statement = [] # Tokens of the current statement
			curr_length = 0 # How many tokens are covered so far

			for utt, tokens in utt_tokens:
				if utt.user.name == curr_user: # Utterance belongs to current user
					curr_statement.extend(tokens)
				else: # Utterance belongs to another user

					# Compute polarity scores for the finished statement
					scores = sid.polarity_scores(' '.join(curr_statement))
					for k, v in scores.items():
						initial_sentiment[curr_user][k] += v

					# Move on to next user. The new statement starts with this
					# utterance's tokens, as in pass 1; starting from an empty
					# list dropped the first utterance of every statement.
					curr_user = utt.user.name
					curr_statement = list(tokens)

					# stop loop if we covered 10% of total conversation
					if curr_length > total_length/10:
						break

				curr_length += len(tokens)

			# Take the average if multiple statements are counted.
			# NOTE(review): the statement count is recovered as neg + neu + pos,
			# which presumably relies on those three scores summing to about 1
			# per scored statement - confirm against the analyzer's output.
			for user_sentiment in initial_sentiment.values():
				num_statements = user_sentiment['neg'] + \
					user_sentiment['neu'] + user_sentiment['pos']
				if num_statements != 0:
					for k in user_sentiment.keys():
						user_sentiment[k] /= num_statements

			convo.add_meta('initial_sentiment', initial_sentiment)

		return corpus
コード例 #16
0
ファイル: forecaster.py プロジェクト: Nuri22/csDetector
    def _get_context_reply_label_dict(self,
                                      corpus: Corpus,
                                      convo_selector,
                                      utt_excluder,
                                      include_label=True):
        """
        Returns a dict mapping reply id to (context, reply, label).

        Dialogs are gathered according to ``self.convo_structure``: every
        root-to-leaf path of a conversation for "branched", the chronological
        utterance list for "linear". Utterances for which ``utt_excluder``
        returns a truthy value are removed first, and dialogs left with fewer
        than two utterances are skipped.

        If ``self.forecast_mode == 'future'`` and ``include_label`` is False,
        the context of each entry also contains the entry's own utterance, and
        the first utterance of each dialog gets an entry as well. With
        ``self.use_last_only`` only the final utterance of each dialog gets an
        entry.

        :param corpus: Corpus whose conversations are read
        :param convo_selector: selector passed to ``corpus.iter_conversations``
        :param utt_excluder: function taking an utterance; a truthy result
            drops that utterance from the dialog
        :param include_label: if True the label is ``self.label_func`` applied
            to the reply utterance, otherwise it is None
        :return: dict from utterance id to ``(context, reply, label)``, with
            context items and reply produced by ``self.text_func``
        :raises ValueError: propagated from ``get_root_to_leaf_paths`` unless
            ``self.skip_broken_convos`` is truthy
        """
        dialogs = []
        if self.convo_structure == "branched":
            for convo in corpus.iter_conversations(convo_selector):
                try:
                    for path in convo.get_root_to_leaf_paths():
                        path = [utt for utt in path if not utt_excluder(utt)]
                        # "< 2" (not "== 1"): a path emptied by the excluder
                        # must be skipped too, or dialog[-1] below raises.
                        if len(path) < 2:
                            continue
                        dialogs.append(path)
                except ValueError:
                    if not self.skip_broken_convos:
                        raise

        elif self.convo_structure == "linear":
            for convo in corpus.iter_conversations(convo_selector):
                utts = convo.get_chronological_utterance_list(
                    selector=lambda x: not utt_excluder(x))
                if len(utts) < 2:
                    continue
                dialogs.append(utts)

        id_to_context_reply_label = dict()

        # this flag determines whether the dictionary entry for each utterance ID should include that
        # utterance in the context (True corresponds to "future" behavior). This needs to be always
        # False when include_label = True, since include_label assumes that the label comes from the
        # utterance after the last utterance in the context. This override logic won't affect
        # forecast_mode however, since that argument only applies to transform() while include_label
        # is only True when called from fit()
        include_current = (self.forecast_mode
                           == 'future') and (not include_label)

        for dialog in dialogs:
            if self.use_last_only:
                reply = self.text_func(dialog[-1])
                context = [
                    self.text_func(utt)
                    for utt in (dialog if include_current else dialog[:-1])
                ]
                label = self.label_func(dialog[-1]) if include_label else None
                id_to_context_reply_label[dialog[-1].id] = (context, reply,
                                                            label)
            else:
                first_idx = 0 if include_current else 1
                for idx in range(first_idx, len(dialog)):
                    reply = self.text_func(dialog[idx])
                    label = self.label_func(
                        dialog[idx]) if include_label else None
                    reply_id = dialog[idx].id
                    # the context runs up to and including idx in "future" mode
                    context_end = idx + 1 if include_current else idx
                    context = [
                        self.text_func(utt) for utt in dialog[:context_end]
                    ]
                    # label is already None when include_label is False
                    id_to_context_reply_label[reply_id] = (context, reply,
                                                           label)

        return id_to_context_reply_label