Example #1
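The pipeline helpers below are excerpted without their imports. A plausible import block, assuming recent ConvoKit releases that re-export these classes at the package top level (exact module paths may vary across versions):

# Assumed imports for the examples on this page; module paths are assumptions
# based on ConvoKit's top-level API and may differ between releases.
import os

from convokit import (CensorNouns, ConvokitPipeline, PhrasingMotifs,
                      PromptTypes, QuestionSentences, Speaker, TextParser,
                      TextProcessor, TextToArcs, Transformer, Utterance)
from convokit.expected_context_framework import (
    ColNormedTfidfTransformer, DualContextWrapper,
    ExpectedContextModelTransformer)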
def parliament_arc_pipeline():
    return ConvokitPipeline([
        # to avoid most computations, we'll only run the pipeline if the desired attributes don't exist
        ('parser',
         TextParser(input_filter=lambda utt, aux: utt.get_info('arcs') is None)
         ),
        ('censor_nouns',
         CensorNouns(
             'parsed_censored',
             input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('arcs',
         TextToArcs(
             'arc_arr',
             input_field='parsed_censored',
             root_only=True,
             input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('question_sentence_filter',
         QuestionSentences(
             'q_arc_arr',
             input_field='arc_arr',
             input_filter=lambda utt, aux: utt.get_info('q_arcs') is None)),
        ('join_arcs',
         TextProcessor(
             output_field='arcs',
             input_field='arc_arr',
             proc_fn=lambda x: '\n'.join(x),
             input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('join_q_arcs',
         TextProcessor(
             output_field='q_arcs',
             input_field='q_arc_arr',
             proc_fn=lambda x: '\n'.join(x),
             input_filter=lambda utt, aux: utt.get_info('q_arcs') is None))
    ])
def switchboard_text_pipeline():
    # here we don't want to overwrite alpha_text fields that already exist
    return ConvokitPipeline([
        ('text',
         TextProcessor(
             proc_fn=lambda x: x,
             output_field='alpha_text',
             input_filter=lambda utt, aux: utt.get_info('alpha_text') is None))
    ])
def scotus_arc_pipeline():
    return ConvokitPipeline([
        ('parser',
         TextParser(
             input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('arcs',
         TextToArcs(
             'arc_arr',
             input_field='parsed',
             root_only=False,
             input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('join_arcs',
         TextProcessor(
             output_field='arcs',
             input_field='arc_arr',
             proc_fn=lambda x: '\n'.join(x),
             input_filter=lambda utt, aux: utt.get_info('arcs') is None))
    ])
def text_prep_pipe():
    return ConvokitPipeline([
        ('arcs_per_sent', TextToArcs(output_field='arcs_per_sent')),
        ('arcs',
         TextProcessor(input_field='arcs_per_sent',
                       output_field='arcs',
                       proc_fn=lambda sents: '\n'.join(sents))),
        ('wordcount',
         TextProcessor(input_field='parsed',
                       output_field='wordcount',
                       proc_fn=lambda sents: sum(
                           sum(x['tag'] != '_SP' for x in sent['toks'])
                           for sent in sents))),
        ('tokens',
         TextProcessor(input_field='parsed',
                       output_field='tokens',
                       proc_fn=lambda sents: '\n'.join(
                           (' '.join(x['tok'] for x in sent['toks']).strip())
                           for sent in sents)))
    ])
def wiki_arc_pipeline():
    return ConvokitPipeline([
        ('parser',
         TextParser(input_filter=lambda utt, aux: (utt.get_info(
             'arcs') is None) and (utt.get_info('parsed') is None))),
        ('censor_nouns',
         CensorNouns(
             'parsed_censored',
             input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('arcs',
         TextToArcs(
             'arc_arr',
             input_field='parsed_censored',
             root_only=False,
             input_filter=lambda utt, aux: utt.get_info('arcs') is None)),
        ('join_arcs',
         TextProcessor(
             output_field='arcs',
             input_field='arc_arr',
             proc_fn=lambda x: '\n'.join(x),
             input_filter=lambda utt, aux: utt.get_info('arcs') is None))
    ])
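A minimal usage sketch for these pipelines. The dataset name 'supreme-corpus' and the presence of utterance text are assumptions, not part of the original code:

# Hypothetical usage: annotate a corpus with newline-joined dependency arcs.
from convokit import Corpus, download

corpus = Corpus(filename=download('supreme-corpus'))  # assumed dataset name
pipe = scotus_arc_pipeline()
corpus = pipe.transform(corpus)
utt = next(corpus.iter_utterances())
print(utt.get_info('arcs')[:200])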
Example #7
class PromptTypeWrapper(Transformer):
	"""	
	This is a wrapper class implementing a pipeline that infers types of rhetorical intentions encapsulated by utterances in a corpus, in terms of their anticipated responses.

	The pipeline involves:
		* parsing input text via `TextParser`
		* representing input text as dependency tree arcs, with nouns censored out, via `CensorNouns`, `TextToArcs` and `QuestionSentences`
		* extracting a set of "phrasings" from the corpus, using a `PhrasingMotifs` model
		* inferring prompt types and type assignments per-utterance, using a `PromptTypes` model.

	While the pipeline computes many attributes of an utterance along the way, the overall goal is to assign each utterance to a prompt type.
	By default, the pipeline will focus on learning types of *questions*, in terms of how the questions are phrased. However, other options are possible (see parameters below).
	For further details, see the respective classes listed above.

	:param output_field:  the name of the attribute to write to in the transform step. the transformer outputs several fields, corresponding to both vector representations and discrete type assignments.
	:param n_types: the number of prompt types to infer.
	:param use_prompt_motifs: whether to represent prompts in terms of how they are phrased. defaults to `True`. if `False`, will use individual dependency arcs as input (this might be better for noisier text)
	:param root_only: whether to only use dependency arcs attached to the root of the parse. defaults to `True`. if `False` will also consider arcs beyond the root (may be better for noisier text)
	:param questions_only: whether to only learn representations of questions (i.e., utterances containing sentences that end in question marks); defaults to `True`.
	:param enforce_caps: whether to only fit and transform on sentences that start with capital letters. defaults to `True`, which is appropriate for formal settings like transcripts of institutional proceedings, where this is a check on how well-formed the input is. in less formal settings like social media, setting to `False` may be more appropriately permissive.
	:param min_support: the minimum frequency of phrasings to extract.
	:param min_df: the minimum frequency of prompt and response terms to consider when inferring types.
	:param max_df: the maximum frequency of prompt and response terms to use. defaults to 0.1 (i.e., occurs in at most 10% of prompt-response pairs). Setting higher is more permissive, but may result in many stopword-like terms adding noise to the model.
	:param svd__n_components: the number of SVD dimensions to use when inferring types, defaults to 25. higher values result in richer vector representations, perhaps at the cost of the model learning overly-specific types.
	:param max_dist: the maximum distance between a vector representation of an utterance and the cluster centroid; an utterance whose distance to all centroids is above this cutoff will get assigned to a null type, denoted by -1. defaults to 0.9.
	:param recompute_all: if `False` (the default), checks utterances to see if they already have an attribute computed, skipping over that utterance in the relevant step of the pipeline. if `True`, recomputes all attributes.
	:param random_state: the random seed to use.
	:param verbosity: frequency of status messages.

	"""
	
	def __init__(self, output_field='prompt_types', n_types=8, use_prompt_motifs=True,
			root_only=True, questions_only=True, enforce_caps=True, recompute_all=False,
			min_support=100, min_df=100, svd__n_components=25, max_df=.1, max_dist=.9,
			random_state=None, verbosity=10000):
		self.use_motifs = use_prompt_motifs
		self.random_state = random_state
		pipe = [
			('parser', TextParser(verbosity=verbosity,
				input_filter=lambda utt, aux: recompute_all or (utt.get_info('parsed') is None))),
			('censor_nouns', CensorNouns('parsed_censored',
				input_filter=lambda utt, aux: recompute_all or (utt.get_info('parsed_censored') is None),
				verbosity=verbosity)),
			('shallow_arcs', TextToArcs('arcs', input_field='parsed_censored',
				input_filter=lambda utt, aux: recompute_all or (utt.get_info('arcs') is None),
				root_only=root_only, verbosity=verbosity))
		]
		
		if questions_only:
			pipe.append(
				('question_sentence_filter', QuestionSentences('question_arcs',
					input_field='arcs',
					input_filter=lambda utt, aux: recompute_all or utt.meta['is_question'],
					use_caps=enforce_caps, verbosity=verbosity))
			)
			prompt_input_field = 'question_arcs'
			self.prompt_selector = lambda utt: utt.meta['is_question']
			self.reference_selector = lambda utt: (not utt.meta['is_question']) and (utt.reply_to is not None)
		else:
			prompt_input_field = 'arcs'
			self.prompt_selector = lambda utt: True
			self.reference_selector = lambda utt: True
		if use_prompt_motifs:
			pipe.append(
				('pm_model', PhrasingMotifs('motifs', prompt_input_field, min_support=min_support,
						fit_filter=self.prompt_selector, verbosity=verbosity))
			)
			prompt_field = 'motifs'
			prompt_transform_field = 'motifs__sink'
		else:
			prompt_field = 'arcs'
			prompt_transform_field = 'arcs'
		pipe.append(
			('pt_model', PromptTypes(prompt_field=prompt_field, reference_field='arcs', 
									 prompt_transform_field=prompt_transform_field,
									 output_field=output_field, n_types=n_types,
									 svd__n_components=svd__n_components,
									 prompt__tfidf_min_df=min_df,
									 prompt__tfidf_max_df=max_df,
									 reference__tfidf_min_df=min_df,
									 reference__tfidf_max_df=max_df,
									 max_dist=max_dist,
									 random_state=random_state, verbosity=verbosity
			))
		)
		self.pipe = ConvokitPipeline(pipe)
		
	def fit(self, corpus, y=None):
		"""
			Fits the model for a corpus -- that is, computes all necessary utterance attributes, and fits the underlying `PhrasingMotifs` and `PromptTypes` models.

			:param corpus: Corpus
			:return: None
		"""

		self.pipe.fit(corpus, 
				pt_model__prompt_selector=self.prompt_selector, pt_model__reference_selector=self.reference_selector)
	
	def transform(self, corpus):
		"""
			Computes prompt type assignments for utterances in a corpus.

			:param corpus: Corpus
			:return: the corpus, with per-utterance representations and type assignments.
		"""

		return self.pipe.transform(corpus)
	
	def transform_utterance(self, utterance):
		"""
			Computes prompt type assignments for individual utterances. can take as input ConvoKit Utterances or raw strings. will return assignments for *all* string input, even if the input is not a question.

			:param utterance: the utterance, as an Utterance or string.
			:return: the utterance, annotated with type assignments.
		"""

		if isinstance(utterance, str):
			utterance = Utterance(text=utterance)
			utterance.meta['is_question'] = True
		return self.pipe.transform_utterance(utterance)        
	
	def dump_model(self, model_dir, type_keys='default'):
		"""
			Writes the `PhrasingMotifs` (if applicable) and `PromptTypes` models to disk. 

			:param model_dir: directory to write to.
			:return: None
		"""
		try:
			os.mkdir(model_dir)
		except FileExistsError:
			pass
		if self.use_motifs:
			self.pipe.named_steps['pm_model'].dump_model(os.path.join(model_dir, 'pm_model'))
		self.pipe.named_steps['pt_model'].dump_model(os.path.join(model_dir, 'pt_model'), type_keys=type_keys)
	
	def load_model(self, model_dir, type_keys='default'):
		"""
			Reads the `PhrasingMotifs` (if applicable) and `PromptTypes` models from disk. 

			:param model_dir: directory to read from.
			:return: None
		"""

		if self.use_motifs:
			self.pipe.named_steps['pm_model'].load_model(os.path.join(model_dir, 'pm_model'))
		self.pipe.named_steps['pt_model'].load_model(os.path.join(model_dir, 'pt_model'), type_keys=type_keys)

	def get_model(self, type_keys='default'):
		'''
		Returns the model:
			* pm_model: PhrasingMotifs model (if applicable, i.e., use_motifs=True)
			* pt_model: PromptTypes model

	:param type_keys: which number(s) of prompt types to return the corresponding `PromptTypes` model for
		:return: model
		'''
		to_return = {}
		if self.use_motifs:
			to_return['pm_model'] = self.pipe.named_steps['pm_model'].get_model()
		to_return['pt_model'] = self.pipe.named_steps['pt_model'].get_model(type_keys=type_keys)
		return to_return
	
	def print_top_phrasings(self, k):
		"""
			prints the k most frequent phrasings from the `PhrasingMotifs` component of the pipeline, if phrasings are used.

			:param k: number of phrasings to print
			:return: None
		"""

		if self.use_motifs:
			self.pipe.named_steps['pm_model'].print_top_phrasings(k)
		else:
			print('phrasing motifs unavailable')
	
	def display_type(self, type_id, corpus=None, type_key=None, k=10):
		"""
			for a particular prompt type, displays the representative prompt and response terms. can also display representative prompt and response utterances.

			:param type_id: ID of the prompt type to display.
			:param corpus: pass in the training corpus to also display representative utterances.
			:param type_key: the name of the prompt type clustering model to use. defaults to the `n_types` the model was initialized with, but if `refit_types` is called with a different number of types, can be set to display the updated model as well.
			:param k: the number of sample terms (or utterances) to display.
			:return: None

		"""
		self.pipe.named_steps['pt_model'].display_type(type_id, corpus=corpus, type_key=type_key, k=k)

	def summarize(self, corpus, type_ids=None, type_key=None, k=10):
		'''
		Displays representative prompt and response terms and utterances for each type learned. 

		:param corpus: corpus to display utterances for (must have `transform()` called on it)
		:param type_ids: ID of the prompt type to display. if None, will display all types.
		:param type_key: the name of the prompt type clustering model to use. defaults to the `n_types` the model was initialized with, but if `refit_types` is called with a different number of types, can be set to display the updated model as well.
		:param k: the number of sample terms (or utterances) to display.
		:return: None
		'''
		self.pipe.named_steps['pt_model'].summarize(corpus=corpus, type_ids=type_ids, type_key=type_key, k=k)
	
	def refit_types(self, n_types, random_state=None, name=None):
		"""
			infers a different number of prompt types than the model was originally fit with.

			:param n_types: number of types to learn
			:param random_state: random seed
			:param name: the name of the new type model. defaults to n_types.
			:return: None
		"""
		self.pipe.named_steps['pt_model'].refit_types(n_types, random_state, name)
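An end-to-end sketch of the wrapper. The dataset name and the presence of an `is_question` metadata flag on utterances (which the pipeline's filters require) are assumptions:

# Hypothetical usage of PromptTypeWrapper on a question-heavy corpus.
from convokit import Corpus, download

corpus = Corpus(filename=download('parliament-corpus'))  # assumed dataset name
pt = PromptTypeWrapper(n_types=8, random_state=42)
pt.fit(corpus)
corpus = pt.transform(corpus)
pt.summarize(corpus, k=5)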
    def __init__(self,
                 context_field,
                 output_prefix,
                 text_field,
                 context_text_field=None,
                 text_pipe=None,
                 context_text_pipe=None,
                 tfidf_params={},
                 context_tfidf_params=None,
                 share_tfidf_models=True,
                 min_terms=0,
                 context_min_terms=None,
                 n_svd_dims=25,
                 snip_first_dim=True,
                 n_clusters=8,
                 cluster_on='utts',
                 ec_model=None,
                 random_state=None,
                 cluster_random_state=None):

        self.context_field = context_field
        self.output_prefix = output_prefix

        self.vect_field = 'col_normed_tfidf'
        self.share_tfidf_models = share_tfidf_models

        if share_tfidf_models:
            self.context_vect_field = self.vect_field
        else:
            self.context_vect_field = 'context_col_normed_tfidf'

        self.text_field = text_field
        if context_text_field is None:
            self.context_text_field = text_field
        else:
            self.context_text_field = context_text_field

        if text_pipe is None:
            self.text_pipe = ConvokitPipeline([
                ('text_pipe',
                 TextProcessor(output_field=self.text_field,
                               proc_fn=lambda x: x))
            ])
        else:
            self.text_pipe = text_pipe

        if context_text_pipe is None:
            self.context_text_pipe = self.text_pipe
        else:
            self.context_text_pipe = context_text_pipe

        self.tfidf_params = tfidf_params
        if context_tfidf_params is None:
            self.context_tfidf_params = tfidf_params
        else:
            self.context_tfidf_params = context_tfidf_params

        self.min_terms = min_terms
        if context_min_terms is None:
            self.context_min_terms = min_terms
        else:
            self.context_min_terms = context_min_terms

        if ec_model is not None:
            in_model = ec_model.ec_model
        else:
            in_model = None
        self.ec_model = ExpectedContextModelTransformer(
            context_field=context_field,
            output_prefix=output_prefix,
            vect_field=self.vect_field,
            context_vect_field=self.context_vect_field,
            model=in_model,
            n_svd_dims=n_svd_dims,
            snip_first_dim=snip_first_dim,
            n_clusters=n_clusters,
            cluster_on=cluster_on,
            random_state=random_state,
            cluster_random_state=cluster_random_state)

        self.tfidf_model = ColNormedTfidfTransformer(
            input_field=self.text_field,
            output_field=self.vect_field,
            **self.tfidf_params)
        if not share_tfidf_models:
            self.context_tfidf_model = ColNormedTfidfTransformer(
                input_field=self.context_text_field,
                output_field=self.context_vect_field,
                **self.context_tfidf_params)
        else:
            self.context_tfidf_model = self.tfidf_model
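The constructor above matches ConvoKit's `ExpectedContextModelPipeline`; assuming that class name (and a hypothetical dataset), a minimal usage sketch:

# Hypothetical usage, assuming the constructor above belongs to
# ExpectedContextModelPipeline; the module path is also an assumption.
from convokit import Corpus, download
from convokit.expected_context_framework import ExpectedContextModelPipeline

corpus = Corpus(filename=download('subreddit-Cornell'))  # assumed dataset name
ec_pipe = ExpectedContextModelPipeline(
    context_field='reply_to',  # characterize utterances by their predecessors
    output_prefix='ec',
    text_field='alpha_text',
    random_state=42)
ec_pipe.fit(corpus)
ec_pipe.summarize(k=5, corpus=corpus)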
class DualContextPipeline(Transformer):
    """
    Wrapper class implementing a pipeline that derives characterizations of terms and utterances in terms of two choices of conversational context. The pipeline handles the following steps:

    * processing input text (via a pipeline supplied by the user in the `text_pipe` argument);
    * transforming text to input representation (via `ColNormedTfidfTransformer`);
    * deriving characterizations (via `DualContextWrapper`)

    The `ColNormedTfidfTransformer` components are stored as the `tfidf_model` and `context_tfidf_model` attributes of the class; the `DualContextWrapper` is stored as the `dualmodel` attribute.
    
    For further details, see the `ColNormedTfidfTransformer` and `DualContextWrapper` classes.

    :param context_fields: list of the names of the utterance-level attributes containing the IDs of the corresponding context-utterances, one per choice of context. for instance, to use immediate predecessors as one context, include `'reply_to'`; to use immediate replies, provided that utterances contain an attribute `next_id` containing the ID of their reply, include `'next_id'`.
    :param output_prefixes: list containing the names of the attributes and vectors that the `DualContextWrapper` component will write to in the transform step.
    :param text_field: the name of the utterance-level attribute containing the text to use as input.
    :param context_text_field: the name of the utterance-level attribute containing the text to use as input for context-utterances. by default, is equivalent to `text_field`.
    :param wrapper_output_prefix: the metadata fields where the utterance-level orientation and shift statistics are stored. By default, these attributes are stored as `orn` and `shift` in the metadata; if `wrapper_output_prefix` is specified, then they are stored as `<wrapper_output_prefix>_orn` (orientation) and `<wrapper_output_prefix>_shift` (shift).
    :param text_pipe: a `ConvokitPipeline` object used to compute the contents of `text_field`. defaults to populating the `text_field` attribute of each utterance with `utt.text`.
    :param context_text_pipe: a `ConvokitPipeline` object used to compute the contents of `context_text_field`; by default equivalent to `text_pipe`.
    :param tfidf_params: a dictionary specifying parameters to be passed to the `ColNormedTfidfTransformer` object to compute input representations of utterances.
    :param context_tfidf_params: a dictionary specifying parameters to be passed to the `ColNormedTfidfTransformer` object to compute input representations of context-utterances. equivalent to `tfidf_params` by default.
    :param share_tfidf_models: whether or not to use the same `ColNormedTfidfTransformer` for both utterances and context-utterances. defaults to `True`.
    :param min_terms: the minimum number of terms in the vocabulary, derived by `ColNormedTfidfTransformer`, that an utterance must contain for it to be considered in fitting and transforming the underlying `ExpectedContextModelTransformer` object. defaults to 0, meaning the transformer will consider all utterances.
    :param context_min_terms: minimum number of terms in the vocabulary for a context-utterance to be considered in fitting and transforming the underlying `ExpectedContextModelTransformer` object. equivalent to `min_terms` by default.
    :param n_svd_dims: the dimensionality of the representations to derive (via LSA/SVD).
    :param snip_first_dim: whether or not to remove the first dimension of the derived representations. by default this is set to `True`, since we've found that the first dimension tends to reflect term frequency, making the output less informative. Note that if `snip_first_dim=True` then in practice, we output `n_svd_dims-1`-dimensional representations.
    :param n_clusters: the number of clusters to infer.
    :param cluster_on: whether to cluster on utterance or term representations (corresponding to values `'utts'` or `'terms'`). By default, we infer clusters based on representations of the utterances from the training data, and then assign term and context-utterance representations to the resultant clusters. In some cases (e.g., if utterances are highly unstructured and lengthy) it might be better to cluster term representations first.
    :param random_state: the random seed to use in the LSA step (which calls a randomized implementation of SVD)
    :param cluster_random_state: the random seed to use to infer clusters.

    """
    def __init__(self,
                 context_fields,
                 output_prefixes,
                 text_field,
                 context_text_field=None,
                 wrapper_output_prefix='',
                 text_pipe=None,
                 context_text_pipe=None,
                 tfidf_params={},
                 context_tfidf_params=None,
                 share_tfidf_models=True,
                 min_terms=0,
                 context_min_terms=None,
                 n_svd_dims=25,
                 snip_first_dim=True,
                 n_clusters=8,
                 cluster_on='utts',
                 random_state=None,
                 cluster_random_state=None):

        self.vect_field = 'col_normed_tfidf'
        self.share_tfidf_models = share_tfidf_models

        if share_tfidf_models:
            self.context_vect_field = self.vect_field
        else:
            self.context_vect_field = 'context_col_normed_tfidf'

        self.text_field = text_field
        if context_text_field is None:
            self.context_text_field = text_field
        else:
            self.context_text_field = context_text_field

        if text_pipe is None:
            self.text_pipe = ConvokitPipeline([
                ('text_pipe',
                 TextProcessor(output_field=self.text_field,
                               proc_fn=lambda x: x))
            ])
        else:
            self.text_pipe = text_pipe
        self.text_pipe.steps[-1][1].output_field = self.text_field

        if context_text_pipe is None:
            self.context_text_pipe = self.text_pipe
        else:
            self.context_text_pipe = context_text_pipe
            self.context_text_pipe.steps[-1][1].output_field = self.context_text_field

        self.tfidf_params = tfidf_params
        if context_tfidf_params is None:
            self.context_tfidf_params = tfidf_params
        else:
            self.context_tfidf_params = context_tfidf_params

        self.min_terms = min_terms
        if context_min_terms is None:
            self.context_min_terms = min_terms
        else:
            self.context_min_terms = context_min_terms

        self.dualmodel = DualContextWrapper(
            context_fields=context_fields,
            output_prefixes=output_prefixes,
            vect_field=self.vect_field,
            context_vect_field=self.context_vect_field,
            wrapper_output_prefix=wrapper_output_prefix,
            n_svd_dims=n_svd_dims,
            snip_first_dim=snip_first_dim,
            n_clusters=n_clusters,
            cluster_on=cluster_on,
            random_state=random_state,
            cluster_random_state=cluster_random_state)

        self.tfidf_model = ColNormedTfidfTransformer(
            input_field=self.text_field,
            output_field=self.vect_field,
            **self.tfidf_params)
        if not share_tfidf_models:
            self.context_tfidf_model = ColNormedTfidfTransformer(
                input_field=self.context_text_field,
                output_field=self.context_vect_field,
                **self.context_tfidf_params)
        else:
            self.context_tfidf_model = self.tfidf_model

    def fit(self,
            corpus,
            y=None,
            selector=lambda x: True,
            context_selector=lambda x: True):
        """
        Fits the model over training data.

        :param corpus: Corpus containing training data
        :param selector: a boolean function of signature `filter(utterance)` that determines which utterances will be considered in the fit step. defaults to using all utterances, subject to `min_terms` parameter passed at initialization.
        :param context_selector: a boolean function of signature `filter(utterance)` that determines which context-utterances will be considered in the fit step. defaults to using all utterances, subject to `context_min_terms` parameter passed at initialization.
        :return: None
        """
        self.text_pipe.fit_transform(corpus)
        if not self.share_tfidf_models:
            self.context_text_pipe.fit_transform(corpus)
        self.tfidf_model.fit_transform(corpus, selector=selector)
        if not self.share_tfidf_models:
            self.context_tfidf_model.fit_transform(corpus,
                                                   selector=context_selector)
        self.dualmodel.fit(
            corpus,
            selector=lambda x: (selector(x) and x.meta.get(
                self.vect_field + '__n_feats', 0) >= self.min_terms),
            context_selector=lambda x: (context_selector(x) and x.meta.get(
                self.context_vect_field + '__n_feats', 0) >= self.context_min_terms))

    def transform(self, corpus, y=None, selector=lambda x: True):
        """
        Computes vector representations and statistics for utterances in a corpus, using the `DualContextWrapper` component.

        :param corpus: Corpus
        :param selector: a boolean function of signature `filter(utterance)` that determines which utterances to transform. defaults to all utterances.
        :return: the Corpus, with per-utterance attributes.
        """
        _ = self.text_pipe.transform(corpus)
        _ = self.tfidf_model.transform(corpus, selector=selector)
        _ = self.dualmodel.transform(
            corpus,
            selector=lambda x: selector(x) and
            (x.meta.get(self.vect_field + '__n_feats', 0) >= self.min_terms))
        return corpus

    def transform_utterance(self, utt):
        """
        Computes representations and statistics for a single utterance, which can be a ConvoKit Utterance or a string. 
        Will return an Utterance object and write all of these characterizations (including vectors) to the utterance's metadata; attribute names are prefixed with the `output_prefix` constructor argument.

        :param utt: Utterance or string
        :return: the utterance, with per-utterance representation, range and cluster assignments.
        """
        if isinstance(utt, str):
            utt = Utterance(text=utt, speaker=Speaker())
        self.text_pipe.transform_utterance(utt)
        self.tfidf_model.transform_utterance(utt)
        return self.dualmodel.transform_utterance(utt)

    def summarize(self, k=10, max_chars=1000, corpus=None):
        """
        Prints inferred clusters and statistics about their sizes, for each component in the underlying `DualContextWrapper`.

        :param k: number of examples to print out.
        :param max_chars: maximum number of characters per utterance/context-utterance to print. Can be adjusted to control the size of the output.
        :param corpus: optional, the corpus that the transformer was trained on. if set, will print example utterances and context-utterances as well as terms.

        :return: None
        """
        self.dualmodel.summarize(k, max_chars, corpus)

    def get_terms(self):
        """
        Gets the names of the terms for which the transformer has computed representations.

        :return: list of terms
        """
        return self.dualmodel.get_terms()

    def get_term_df(self):
        """
        Gets a Pandas dataframe containing term-level statistics computed by the transformer (shift, orientation, ranges)

        :return: dataframe of term-level statistics
        """
        return self.dualmodel.get_term_df()

    def load(self, dirname, model_dirs=None):
        """
        Loads a model from disk.

        :param dirname: directory to read model from
        :param model_dirs: optional list containing the directories (relative to `dirname`) in which each component is stored. the order of the list is as follows: [the `DualContextWrapper` components, the utterance `ColNormedTfidfTransformer`, the context-utterance `ColNormedTfidfTransformer` (if `share_tfidf_models` is set to `False` at initialization)]. defaults to `[output_prefixes[0], output_prefixes[1], 'tfidf_model', 'context_tfidf_model']` where `output_prefixes` is passed at initialization.
        :return: None
        """
        if model_dirs is None:
            model_dirs = self.dualmodel.output_prefixes + [
                'tfidf_model', 'context_tfidf_model'
            ]

        self.tfidf_model.load(os.path.join(dirname, model_dirs[2]))
        if not self.share_tfidf_models:
            self.context_tfidf_model.load(os.path.join(dirname, model_dirs[3]))
        else:
            self.context_tfidf_model = self.tfidf_model
        self.dualmodel.load(dirname, model_dirs[:2])

    def dump(self, dirname):
        """
        Writes a model to disk.

        :param dirname: directory to write model to.
        :return: None
        """
        self.dualmodel.dump(dirname)
        try:
            os.mkdir(os.path.join(dirname, 'tfidf_model'))
        except FileExistsError:
            pass
        self.tfidf_model.dump(os.path.join(dirname, 'tfidf_model'))
        if not self.share_tfidf_models:
            self.context_tfidf_model.dump(
                os.path.join(dirname, 'context_tfidf_model'))
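Finally, a usage sketch for `DualContextPipeline`. The context fields, output prefixes, and dataset name are assumptions; in particular, a `next_id` attribute pointing to each utterance's reply must be precomputed:

# Hypothetical usage: characterize utterances by both their predecessors
# ('reply_to') and their replies (an assumed, precomputed 'next_id' field).
from convokit import Corpus, download

corpus = Corpus(filename=download('parliament-corpus'))  # assumed dataset name
dual_pipe = DualContextPipeline(
    context_fields=['reply_to', 'next_id'],
    output_prefixes=['fwd', 'bk'],
    text_field='alpha_text',
    random_state=42, cluster_random_state=42)
dual_pipe.fit(corpus)
corpus = dual_pipe.transform(corpus)
print(dual_pipe.get_term_df().head())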