def _get_context_reply_label_dict(self, corpus: Corpus, convo_selector, utt_excluder, include_label=True):
    """
    Build a dict mapping a reply utterance id to a ``(context, reply, label)`` tuple.

    Dialogs are collected according to ``self.convo_structure``: every
    root-to-leaf path for ``"branched"``, or the chronological utterance list
    for ``"linear"``. Utterances for which ``utt_excluder`` returns True are
    dropped, and dialogs left with fewer than two utterances are skipped (a
    reply needs at least one utterance of context).

    :param corpus: the Corpus to draw conversations from
    :param convo_selector: function passed to ``corpus.iter_conversations`` to
        choose which conversations are used
    :param utt_excluder: function taking an utterance and returning True if the
        utterance should be left out of the dialog
    :param include_label: whether to compute the label with ``self.label_func``;
        when False every label is None
    :return: dict from reply utterance id to ``(context, reply, label)``, where
        context is the list of ``self.text_func`` outputs for the preceding
        utterances and reply is ``self.text_func`` of the reply itself. With
        ``self.use_last_only`` only the last utterance of each dialog gets an
        entry; otherwise every utterance after the first does.
    """
    dialogs = []
    if self.convo_structure == "branched":
        for convo in corpus.iter_conversations(convo_selector):
            try:
                for path in convo.get_root_to_leaf_paths():
                    path = [utt for utt in path if not utt_excluder(utt)]
                    # "< 2" rather than "== 1": a path whose utterances were all
                    # excluded is empty and would break the dialog[-1] lookups below.
                    if len(path) < 2:
                        continue
                    dialogs.append(path)
            except ValueError:
                if not self.skip_broken_convos:
                    raise
    elif self.convo_structure == "linear":
        for convo in corpus.iter_conversations(convo_selector):
            utts = convo.get_chronological_utterance_list(
                selector=lambda x: not utt_excluder(x))
            if len(utts) < 2:
                continue
            dialogs.append(utts)

    id_to_context_reply_label = dict()

    if self.forecast_mode == 'future':
        # NOTE(review): every value written here is replaced by the loop below
        # (the last utterance of a dialog always gets a regular entry), so this
        # pass only determines the insertion order of the leaf ids. It is kept
        # so the key order stays as before -- confirm whether 'future' mode was
        # meant to return these (path, leaf, None) entries instead.
        for dialog in dialogs:
            id_to_context_reply_label[dialog[-1].id] = (dialog, dialog[-1], None)

    for dialog in dialogs:
        if self.use_last_only:
            reply = self.text_func(dialog[-1])
            context = [self.text_func(utt) for utt in dialog[:-1]]
            label = self.label_func(dialog[-1]) if include_label else None
            id_to_context_reply_label[dialog[-1].id] = (context, reply, label)
        else:
            for idx in range(1, len(dialog)):
                reply = self.text_func(dialog[idx])
                label = self.label_func(dialog[idx]) if include_label else None
                reply_id = dialog[idx].id
                context = [self.text_func(utt) for utt in dialog[:idx]]
                # label is already None when include_label is False
                id_to_context_reply_label[reply_id] = (context, reply, label)

    return id_to_context_reply_label
def summarize(self, corpus: Corpus, use_selector=True, exclude_na=True):
    """
    Collect the stored forecasts of utterances into a DataFrame.

    :param corpus: target Corpus
    :param use_selector: if True, only visit conversations and utterances
        accepted by the Forecaster's convo and utterance selector functions;
        if False, visit every utterance of the corpus
    :param exclude_na: whether to drop rows containing NaN results
    :return: a pandas DataFrame indexed by utterance id, holding the forecast
        and forecast-probability columns, sorted by descending probability
    """
    if use_selector:
        utterances = (
            utt
            for convo in corpus.iter_conversations(self.convo_selector_func)
            for utt in convo.iter_utterances(self.utt_selector_func)
        )
    else:
        utterances = corpus.iter_utterances()

    rows = [
        (utt.id,
         utt.meta[self.forecast_feat_name],
         utt.meta[self.forecast_prob_feat_name])
        for utt in utterances
    ]

    column_names = ["utt_id", self.forecast_feat_name, self.forecast_prob_feat_name]
    table = pd.DataFrame(rows, columns=column_names)
    table = table.set_index('utt_id')
    table = table.sort_values(self.forecast_prob_feat_name, ascending=False)

    if not exclude_na:
        return table
    return table.dropna()
def retrieve_feats(self, corpus: Corpus) -> Dict[str, Dict]:
    """
    Compute hypergraph features for every sufficiently long conversation.

    Conversations with fewer than ``self.min_thread_len`` utterances are
    skipped. For the others, the first ``self.prefix_len`` utterances (in
    chronological order) are turned into two hypergraphs: one over the whole
    prefix and one over the prefix without its first utterance ("mid-thread").
    Degree and motif features of both graphs are merged into one dictionary.

    :param corpus: the Corpus whose conversations are analysed
    :return: a dictionary from conversation id to its stats dictionary, which
        maps feature names to feature values
    """
    feats_by_convo = {}
    for convo in corpus.iter_conversations():
        chronological = convo.get_chronological_utterance_list()
        if len(chronological) < self.min_thread_len:
            continue

        prefix = chronological[:self.prefix_len]
        whole_graph = Hypergraph.init_from_utterances(utterances=prefix)
        # the mid-thread graph leaves out the root utterance
        mid_graph = Hypergraph.init_from_utterances(utterances=prefix[1:])

        convo_feats = {}
        convo_feats.update(HyperConvo._degree_feats(graph=whole_graph).items())
        convo_feats.update(HyperConvo._motif_feats(graph=whole_graph).items())
        convo_feats.update(
            HyperConvo._degree_feats(graph=mid_graph, name_ext="mid-thread ").items())
        convo_feats.update(
            HyperConvo._motif_feats(graph=mid_graph, name_ext=" over mid-thread").items())

        feats_by_convo[convo.id] = convo_feats
    return feats_by_convo
def transform(
    self,
    corpus: Corpus,
    selector: Optional[Callable[[Conversation], bool]] = lambda convo: True
) -> Corpus:
    """
    Compute features with retrieve_feats() and store them on the Corpus as a
    vector matrix, then attach that vector to the selected Conversations.

    :param corpus: Corpus object to retrieve feature information from
    :param selector: a (lambda) function that takes a Conversation and returns
        True / False; it is forwarded to retrieve_feats() and decides which
        conversations get the vector added. By default, all conversations
        are selected.
    :return: the same corpus, now holding a vector matrix named
        ``self.vector_name`` (rows are conversation ids, columns are feature
        names) with the vector added to each selected conversation
    """
    feats_by_convo = self.retrieve_feats(corpus, selector)

    # one row per conversation id, one column per feature name
    feat_table = pd.DataFrame(feats_by_convo).T
    row_ids = list(feat_table.index)
    feat_names = list(feat_table.columns)
    sparse_feats = csr_matrix(feat_table.values.astype('float64'))

    corpus.set_vector_matrix(
        name=self.vector_name,
        ids=row_ids,
        columns=feat_names,
        matrix=sparse_feats,
    )

    for convo in corpus.iter_conversations(selector):
        convo.add_vector(self.vector_name)
    return corpus
def summarize(self, corpus: Corpus,
              selector: Callable[[Conversation], bool] = lambda convo: True,
              ignore_utterances: Callable[[Utterance], bool] = lambda utt: False,
              exclude_na=True):
    """
    Collect the stored forecasts of utterances into a DataFrame.

    :param corpus: target Corpus
    :param selector: a (lambda) function that takes a Conversation and returns
        a bool: True if the Conversation is to be included in the summary.
        By default, includes all Conversations.
    :param ignore_utterances: a (lambda) function that takes an Utterance and
        returns a bool: True if the Utterance should be left out of the
        summary. By default, all Utterances are included.
    :param exclude_na: whether to drop rows containing NaN results
    :return: a pandas DataFrame indexed by utterance id, holding the forecast
        and forecast-probability columns, sorted by descending probability
    """
    def is_kept(utt):
        return not ignore_utterances(utt)

    rows = [
        (utt.id,
         utt.meta[self.forecast_feat_name],
         utt.meta[self.forecast_prob_feat_name])
        for convo in corpus.iter_conversations(selector)
        for utt in convo.iter_utterances(is_kept)
    ]

    column_names = ["utt_id", self.forecast_feat_name, self.forecast_prob_feat_name]
    table = pd.DataFrame(rows, columns=column_names)
    table = table.set_index('utt_id')
    table = table.sort_values(self.forecast_prob_feat_name, ascending=False)

    if not exclude_na:
        return table
    return table.dropna()
def transform(self, corpus: Corpus):
    """
    Annotate every conversation with a 'reciprocity' meta value.

    For each user of the conversation, the names of the users they replied to
    are gathered from the utterances returned by ``user.iter_utterances()``.
    Each pair of users is then classified as reciprocal (replies in both
    directions) or one-sided (replies in one direction only). The stored value
    is reciprocal / (reciprocal + one-sided), or 0 when no pair is linked.

    :param corpus: the Corpus to annotate
    :return: the same corpus, with 'reciprocity' added to each conversation
    """
    for convo in corpus.iter_conversations():
        # name of each user -> names of the users they replied to
        replied_to = {}
        for user in convo.iter_users():
            targets = set()
            for utt in user.iter_utterances():
                if utt.reply_to is None:
                    continue
                targets.add(corpus.get_utterance(utt.reply_to).user.name)
            replied_to[user.name] = targets

        mutual_pairs = 0
        one_way_pairs = 0
        for first, second in combinations(convo.iter_users(), 2):
            forward = second.name in replied_to[first.name]
            backward = first.name in replied_to[second.name]
            directions = forward + backward  # number of reply directions present
            if directions == 2:
                mutual_pairs += 1
            elif directions == 1:
                one_way_pairs += 1

        linked_pairs = mutual_pairs + one_way_pairs
        score = mutual_pairs / linked_pairs if linked_pairs != 0 else 0
        convo.add_meta('reciprocity', score)
    return corpus
def transform(self, corpus: Corpus):
    """
    Annotate utterances and conversations with vocabulary-overlap statistics.

    A per-user token-frequency table (English stop words removed) is built up
    utterance by utterance. After each utterance the running tables are passed
    to ``self._compute_overlap`` and the result is stored on that utterance;
    once all utterances are processed the final result is stored on the
    conversation. Both use the meta key 'vocabulary_overlap' with a dict of
    the form ``{'vocab': ..., 'ratio': ...}``.

    :param corpus: the Corpus to annotate
    :return: the same corpus with the new meta fields
    """
    english_stop_words = set(stopwords.words('english'))

    # vocabulary overlap is a conversation-level metric
    for convo in corpus.iter_conversations():
        # one token-frequency table per participant
        vocabs = {}
        for username in convo.get_usernames():
            vocabs[username] = defaultdict(int)

        for utt in convo.iter_utterances():
            # tokenize, then count every token that is not a stop word
            for token in self._tokenize_utt(utt.text):
                if token in english_stop_words:
                    continue
                vocabs[utt.user.name][token] += 1

            # overlap of the vocabularies seen up to and including this utterance
            running_vocab, running_ratio = self._compute_overlap(vocabs)
            utt.add_meta('vocabulary_overlap',
                         {'vocab': running_vocab, 'ratio': running_ratio})

        # overlap over the whole conversation
        final_vocab, final_ratio = self._compute_overlap(vocabs)
        convo.add_meta('vocabulary_overlap',
                       {'vocab': final_vocab, 'ratio': final_ratio})
    return corpus
def fit_transform(self, corpus: Corpus) -> Corpus:
    """
    Embed the per-thread statistics stored under the "hyperconvo" meta key of
    the conversations and store the result on the corpus.

    :param corpus: the Corpus to use; each conversation's ``meta["hyperconvo"]``
        must be a dict from thread root id to a dict of feature values
    :return: Modifies and returns corpus with new meta key: "threadEmbedder",
        value: Dict, containing "X": an array with rows corresponding to
        embedded threads, "roots": an array whose ith entry is the thread
        root id of the ith row of X. If return_components is True and the
        fitted model exposes ``components_`` (the SVD case), then the Dict
        contains a third key "components": the components array.
    :raises ValueError: if the corpus has no conversations
    :raises RuntimeError: if the thread statistics are missing
    :raises Exception: if ``self.norm_method`` or ``self.method`` is invalid
    """
    # Materialise the conversations first: peeking at a one-shot iterator with
    # next() would otherwise drop the first conversation from the loop below.
    convos = list(corpus.iter_conversations())
    if not convos:
        raise ValueError("Corpus contains no conversations to embed")

    # The statistics live in the conversation's meta, not on the object itself.
    if "hyperconvo" not in convos[0].meta:
        raise RuntimeError(
            "Missing thread statistics: HyperConvo.fit_transform() must be run on the Corpus first"
        )

    thread_stats = dict()
    for convo in convos:
        thread_stats.update(convo.meta["hyperconvo"])

    X = []
    roots = []
    for root, feats in thread_stats.items():
        roots.append(root)
        # features are ordered by name; NaN and infinite values become 0
        row = np.array([
            value if np.isfinite(value) else 0
            for _, value in sorted(feats.items())
        ])
        X.append(row)
    X = np.array(X)

    norm_method = self.norm_method.lower()
    if norm_method == "standard":
        X = StandardScaler().fit_transform(X)
    elif norm_method != "none":
        raise Exception("Invalid embed_feats normalization method")

    method = self.method.lower()
    if method == "svd":
        f = TruncatedSVD
    elif method == "tsne":
        f = TSNE
    else:
        raise Exception("Invalid embed_feats embedding method")

    emb = f(n_components=self.n_components)
    X_mid = emb.fit_transform(X)
    # Only models that expose singular values (SVD) are rescaled by them;
    # reading the attribute unconditionally broke the "tsne" method.
    singular_values = getattr(emb, "singular_values_", None)
    if singular_values is not None:
        X_mid = X_mid / singular_values

    retval = {"X": X_mid, "roots": roots}
    if self.return_components:
        components = getattr(emb, "components_", None)
        if components is not None:
            retval["components"] = components

    corpus.add_meta("threadEmbedder", retval)
    return corpus
def transform(self, corpus: Corpus) -> Corpus:
    """Annotate each conversation with one randomly chosen reply chain.

    A chain starts at a randomly picked utterance whose ``post_depth`` meta
    value is 2 and follows ``reply_to`` links upwards until an utterance with
    ``post_depth`` 0 is reached or the parent is not in ``corpus.utterances``.

    Two meta fields are written on every conversation:

    * ``chain``: the utterance ids of the chain, ordered from the top-most
      ancestor down to the picked utterance
    * ``chain_tox``: the ``toxicity`` meta values of those utterances, in the
      same order

    Both are None when the conversation has no utterance at depth 2.

    :param corpus: the Corpus to transform; utterances must carry the
        ``post_depth`` and ``toxicity`` meta fields
    :return: the input Corpus, modified in place
    """
    for convo in corpus.iter_conversations():
        depth_two_ids = [
            utt.id for utt in convo.iter_utterances()
            if utt.meta['post_depth'] == 2
        ]

        if not depth_two_ids:
            # keep both fields present so consumers can rely on the keys
            convo.add_meta('chain', None)
            convo.add_meta('chain_tox', None)
            continue

        start_id = random.choice(depth_two_ids)
        utt = convo.get_utterance(start_id)
        chosen_chain = [start_id]
        chosen_chain_tox = [utt.meta['toxicity']]

        # climb towards the root while the parent utterance is available
        while utt.meta['post_depth'] > 0 and utt.reply_to in corpus.utterances:
            utt = convo.get_utterance(utt.reply_to)
            chosen_chain.append(utt.id)
            chosen_chain_tox.append(utt.meta['toxicity'])

        # collected bottom-up; store top-down
        chosen_chain.reverse()
        chosen_chain_tox.reverse()
        convo.add_meta('chain', chosen_chain)
        convo.add_meta('chain_tox', chosen_chain_tox)
    return corpus
def _get_context_reply_label_dict(self, corpus: Corpus, include_label=True):
    """
    Build a dict mapping each reply utterance id to ``(context, reply, label)``.

    Dialogs are the root-to-leaf paths (``self.convo_structure == "branched"``)
    or the chronological utterance lists (``"linear"``) of the conversations
    accepted by ``self.convo_selector_func``, restricted to utterances accepted
    by ``self.utt_selector_func``. Dialogs of exactly one utterance are skipped.
    Every utterance after the first in a dialog produces one entry.

    :param corpus: the Corpus to draw conversations from
    :param include_label: whether to read the label from the reply's meta field
        ``self.label_feat``; when False every label is None
    :return: dict from reply id to (list of context texts, reply text, label)
    """
    dialogs = []
    structure = self.convo_structure
    if structure == "branched":
        for convo in corpus.iter_conversations(self.convo_selector_func):
            try:
                for full_path in convo.get_root_to_leaf_paths():
                    kept = [utt for utt in full_path if self.utt_selector_func(utt)]
                    if len(kept) != 1:
                        dialogs.append(kept)
            except ValueError as e:
                if self.skip_broken_convos:
                    continue
                raise e
    elif structure == "linear":
        for convo in corpus.iter_conversations(self.convo_selector_func):
            kept = convo.get_chronological_utterance_list(
                selector=self.utt_selector_func)
            if len(kept) != 1:
                dialogs.append(kept)

    entries = dict()
    for dialog in dialogs:
        for position, current in enumerate(dialog[1:], start=1):
            reply_text = self.text_func(current)
            if include_label:
                label = current.meta[self.label_feat]
            else:
                label = None
            preceding_texts = [self.text_func(earlier) for earlier in dialog[:position]]
            entries[current.id] = (preceding_texts, reply_text, label)
    return entries
def transform(self, corpus: Corpus) -> Corpus:
    """Annotate utterances with toxicity scores and conversations with their mean.

    Each utterance gets a ``toxicity`` meta value. When ``self.api_key`` is set
    the score comes from ``self._get_toxicity(utt.text)``; otherwise it is
    looked up by utterance id in the JSON file at ``self.toxicity_json_path``.
    Each conversation gets ``averagetoxicity``, the mean score of its
    utterances (None for a conversation without utterances).

    Scores fetched through the API during this call are written as JSON to
    ``self.toxicity_path_to_save`` when that path is set. Nothing is written
    when no score was fetched, so an existing file of pre-fetched scores is
    never replaced by an empty one.

    :param corpus: the Corpus to transform
    :return: the input Corpus, modified in place
    :raises ValueError: if neither ``self.api_key`` nor
        ``self.toxicity_json_path`` is set, i.e. there is no score source
    """
    if not self.api_key and not self.toxicity_json_path:
        raise ValueError(
            "Either api_key or toxicity_json_path must be set to obtain toxicity scores")

    toxicity_scores_dict = {}
    if self.toxicity_json_path:
        with open(self.toxicity_json_path, 'r') as f:
            toxicity_scores_dict = json.load(f)

    scores_to_save = {}
    for convo in tqdm(list(corpus.iter_conversations())):
        convo_scores = 0
        count = 0
        for utt in convo.iter_utterances():
            # Fetching scores uses an API with a limited query rate (rerunning
            # it took over a day for 110k+ comments), so pre-fetched scores are
            # loaded from the JSON file unless an API key is provided.
            if self.api_key:
                utt_score = self._get_toxicity(utt.text)
                scores_to_save[utt.id] = utt_score
            else:
                utt_score = toxicity_scores_dict[utt.id]

            convo_scores += utt_score
            count += 1
            utt.add_meta('toxicity', utt_score)

        # avoid dividing by zero for a conversation without utterances
        average = convo_scores / count if count else None
        convo.add_meta('averagetoxicity', average)

    if self.toxicity_path_to_save and scores_to_save:
        with open(self.toxicity_path_to_save, 'w') as f:
            json.dump(scores_to_save, f)

    return corpus
def transform(self, corpus: Corpus):
    """
    Annotate every conversation with a 'damsl_score' meta value.

    The second element of each entry in an utterance's 'tag' meta field is
    treated as a tag and looked up in ``self.rubric``; tags missing from the
    rubric are ignored. The score is ``np.mean`` of all values found in the
    conversation.

    :param corpus: the Corpus to annotate
    :return: the same corpus, with 'damsl_score' added to each conversation
    """
    for convo in corpus.iter_conversations():
        rubric_values = []
        for utt in convo.iter_utterances():
            for tag in [entry[1] for entry in utt.meta['tag']]:
                try:
                    value = self.rubric[tag]
                except KeyError:
                    # tag has no rubric entry
                    continue
                rubric_values.append(value)
        convo.add_meta('damsl_score', np.mean(rubric_values))
    return corpus
def transform(
    self,
    corpus: Corpus,
    selector: Optional[Callable[[Conversation], bool]] = lambda convo: True
) -> Corpus:
    """
    Retrieves features from the Corpus Conversations using retrieve_feats()
    and annotates the selected Conversations with this feature set.

    :param corpus: Corpus object to retrieve feature information from
    :param selector: a (lambda) function that takes a Conversation and returns
        True / False; function selects conversations to be annotated with
        hypergraph features. By default (and when None is passed), all
        conversations will be annotated.
    :return: corpus with the selected conversations having a new meta field,
        named ``self.feat_name``, containing the stats generated by
        retrieve_feats() for that conversation, or None when retrieve_feats()
        produced no entry for it.
    """
    if selector is None:
        selector = lambda convo: True

    convo_id_to_feats = self.retrieve_feats(corpus)

    # Honour the selector: only the chosen conversations are annotated.
    for convo in corpus.iter_conversations(selector):
        convo.add_meta(self.feat_name, convo_id_to_feats.get(convo.id))
    return corpus
def transform(self, corpus: Corpus):
    """
    Annotate a corpus with token-balance measures on the statement-pair level
    and on the conversation level.

    A "statement" is a maximal run of consecutive utterances by the same user,
    in the order of the conversation's utterance ids. For every conversation:

    * the first utterance of each statement gets the meta field
      'statement_len' (number of tokens in the statement);
    * the first utterance of each statement except the last gets
      'statement_balance', the balance between its statement and the next;
    * the conversation meta gets 'conversation_balance', a users x users
      numpy array of pairwise balances of the users' total token counts;
    * the conversation's user-name list is set to the users in order of
      first appearance.

    The balance of two lengths a and b is 2 * min(a, b) / (a + b), and 1 when
    both are zero.

    :param corpus: the Corpus to annotate
    :return: the same corpus, modified in place
    """
    def balance(a, b):
        # Two empty sides count as perfectly balanced; this also keeps the
        # conversation-level matrix from dividing by zero.
        total = a + b
        return 1 if total == 0 else 2 * min(a, b) / total

    for c in corpus.iter_conversations():
        user_tokens = {}          # user name -> tokens over the conversation
        user_order = []           # user names in order of first appearance
        prev_user = None          # user of the previous utterance
        statement_root_id = None  # first utterance id of the current statement
        statement_roots = []

        for u in c._utterance_ids:
            utt = corpus.utterances[u]
            cur_user = utt.user.name

            # The first utterance always opens a statement, as does any change
            # of speaker (None sentinel, so any user name -- even '' -- works).
            if statement_root_id is None or cur_user != prev_user:
                statement_root_id = u
                statement_roots.append(u)
                utt.meta['statement_len'] = 0
            prev_user = cur_user

            n_tokens = len(self._tokenize_utt(utt.text))
            if cur_user not in user_tokens:
                user_tokens[cur_user] = 0
                user_order.append(cur_user)
            user_tokens[cur_user] += n_tokens
            corpus.utterances[statement_root_id].meta['statement_len'] += n_tokens

        # statement-pair level: balance between each statement and the next
        for utt_id_cur, utt_id_next in zip(statement_roots, statement_roots[1:]):
            cur_len = corpus.utterances[utt_id_cur].meta['statement_len']
            next_len = corpus.utterances[utt_id_next].meta['statement_len']
            corpus.utterances[utt_id_cur].meta['statement_balance'] = balance(
                cur_len, next_len)

        # conversation level: pairwise balance of total tokens per user
        convo_balance = np.zeros((len(user_order), len(user_order)))
        for i, user_a in enumerate(user_order):
            for j, user_b in enumerate(user_order):
                convo_balance[i, j] = balance(user_tokens[user_a],
                                              user_tokens[user_b])
        c._meta['conversation_balance'] = convo_balance

        # Add the usernames to the conversation metadata
        c._usernames = user_order
    return corpus
def transform(self, corpus: Corpus):
    """
    Annotate utterances with statement-level polarity scores and conversations
    with per-user "initial sentiment".

    A statement is a run of consecutive utterances by the same user. Every
    utterance gets the meta field 'polarity' holding the
    ``sid.polarity_scores`` result for the space-joined tokens of its
    statement. Every conversation gets 'initial_sentiment', a dict from user
    name to a defaultdict of accumulated (then averaged) polarity scores
    gathered from the statements at the start of the conversation.

    :param corpus: the Corpus to annotate
    :return: the same corpus, modified in place
    """
    # presumably NLTK's VADER analyzer -- the import is outside this block
    sid = SentimentIntensityAnalyzer()
    for convo in corpus.iter_conversations():
        # Compute polarity scores of each statement (set of utterances)
        users = convo.get_usernames()
        curr_user = users[0] # Current user
        curr_statement = [] # Current statement
        curr_utts = [] # Current list of utterances
        # First pass: score every statement and copy the scores onto each of
        # its utterances
        for utt in convo.iter_utterances():
            # Tokenize via NLTK tokenizer
            tokens = self._tokenize_utt(utt.text)
            if utt.user.name == curr_user: # Utterance belongs to current user
                curr_utts.append(utt)
                curr_statement += tokens
            else: # Utterance belongs to another user
                # Compute polarity scores for current statement
                scores = sid.polarity_scores(' '.join(curr_statement))
                for u in curr_utts:
                    u.add_meta('polarity', scores)
                # Move on to next user; the new statement starts with this
                # utterance's tokens
                curr_user = utt.user.name
                curr_statement = tokens
                curr_utts = [utt]
        # flush out remaining scores
        scores = sid.polarity_scores(' '.join(curr_statement))
        for u in curr_utts:
            u.add_meta('polarity', scores)
        # Total number of tokens in the conversation, used for the 10% cut-off
        total_length = 0
        for utt in convo.iter_utterances():
            total_length += len(self._tokenize_utt(utt.text))
        initial_sentiment = {u:defaultdict(float) for u in users}
        curr_user = users[0] # Current user
        curr_statement = [] # Current statement
        curr_length = 0 # How many tokens are covered so far
        # Compute polarity score for the first 10% of the conversation
        for utt in convo.iter_utterances():
            # Tokenize via NLTK tokenizer
            tokens = self._tokenize_utt(utt.text)
            if utt.user.name == curr_user: # Utterance belongs to current user
                curr_statement += tokens
            else: # Utterance belongs to another user
                # Compute polarity scores for current statement
                scores = sid.polarity_scores(' '.join(curr_statement))
                for k, v in scores.items():
                    initial_sentiment[curr_user][k] += v
                # Move on to next user
                # NOTE(review): unlike the first pass, the statement restarts
                # empty, so the tokens of this utterance are not part of the
                # next scored statement -- confirm this is intended
                curr_user = utt.user.name
                curr_statement = []
                # stop loop if we covered 10% of total conversation
                # NOTE(review): the check only runs on a change of speaker, and
                # a statement still open when the loop ends is never scored --
                # nesting reconstructed from a flattened source, TODO confirm
                if curr_length > total_length/10:
                    break
            curr_length += len(tokens)
        # Take the average if multiple statements are counted
        for user in initial_sentiment.keys():
            # The sum of the accumulated 'neg', 'neu' and 'pos' values is used
            # as the statement count; presumably this relies on the three
            # summing to about 1 per scored statement -- TODO confirm
            num_statements = initial_sentiment[user]['neg'] + \
                initial_sentiment[user]['neu'] + initial_sentiment[user]['pos']
            if num_statements != 0:
                for k in initial_sentiment[user].keys():
                    initial_sentiment[user][k] /= num_statements
        convo.add_meta('initial_sentiment', initial_sentiment)
    return corpus
def _get_context_reply_label_dict(self, corpus: Corpus, convo_selector, utt_excluder, include_label=True):
    """
    Build a dict mapping an utterance id to a ``(context, reply, label)`` tuple.

    Dialogs are collected according to ``self.convo_structure``: every
    root-to-leaf path for ``"branched"``, or the chronological utterance list
    for ``"linear"``. Utterances for which ``utt_excluder`` returns True are
    dropped, and dialogs left with fewer than two utterances are skipped.

    When ``self.forecast_mode == 'future'`` and ``include_label`` is False,
    the context of an utterance also contains that utterance itself and every
    utterance of a dialog (including the first) gets an entry. Otherwise the
    context holds only the preceding utterances and the first utterance of a
    dialog gets no entry.

    :param corpus: the Corpus to draw conversations from
    :param convo_selector: function passed to ``corpus.iter_conversations`` to
        choose which conversations are used
    :param utt_excluder: function taking an utterance and returning True if the
        utterance should be left out of the dialog
    :param include_label: whether to compute the label with ``self.label_func``;
        when False every label is None
    :return: dict from utterance id to ``(context, reply, label)``, where
        context is a list of ``self.text_func`` outputs and reply is
        ``self.text_func`` of the utterance itself. With ``self.use_last_only``
        only the last utterance of each dialog gets an entry.
    """
    dialogs = []
    if self.convo_structure == "branched":
        for convo in corpus.iter_conversations(convo_selector):
            try:
                for path in convo.get_root_to_leaf_paths():
                    path = [utt for utt in path if not utt_excluder(utt)]
                    # "< 2" rather than "== 1": a path whose utterances were all
                    # excluded is empty and would break the dialog[-1] lookups below.
                    if len(path) < 2:
                        continue
                    dialogs.append(path)
            except ValueError:
                if not self.skip_broken_convos:
                    raise
    elif self.convo_structure == "linear":
        for convo in corpus.iter_conversations(convo_selector):
            utts = convo.get_chronological_utterance_list(
                selector=lambda x: not utt_excluder(x))
            if len(utts) < 2:
                continue
            dialogs.append(utts)

    id_to_context_reply_label = dict()

    # This flag determines whether the entry for an utterance includes that
    # utterance in its own context ("future" behavior). It is forced to False
    # when include_label is True, because a label is taken from the utterance
    # that follows the last utterance of the context.
    include_current = (self.forecast_mode == 'future') and (not include_label)

    for dialog in dialogs:
        if self.use_last_only:
            reply = self.text_func(dialog[-1])
            context_utts = dialog if include_current else dialog[:-1]
            context = [self.text_func(utt) for utt in context_utts]
            label = self.label_func(dialog[-1]) if include_label else None
            id_to_context_reply_label[dialog[-1].id] = (context, reply, label)
        else:
            first_idx = 0 if include_current else 1
            for idx in range(first_idx, len(dialog)):
                reply = self.text_func(dialog[idx])
                label = self.label_func(dialog[idx]) if include_label else None
                reply_id = dialog[idx].id
                context_utts = dialog[:(idx + 1)] if include_current else dialog[:idx]
                context = [self.text_func(utt) for utt in context_utts]
                # label is already None when include_label is False
                id_to_context_reply_label[reply_id] = (context, reply, label)

    return id_to_context_reply_label