def __init__(self,
                 data_frame,
                 text_col,
                 clean_function=lambda x: x,
                 nlp=None,
                 feats_from_spacy_doc=None,
                 verbose=False):
        '''Set up a factory whose documents come from a pandas data frame.

        Parameters
        ----------
        data_frame : pd.DataFrame
            Frame holding one document per row, with a column for the
            category of interest and a column for the document text.
        text_col : str
            Name of the column containing each document's raw text.
        clean_function : function, optional
            Callable taking the document text and returning a cleaned
            string. Defaults to the identity function.
        nlp : function, optional
            Forwarded unchanged to ``TermDocMatrixFactory.__init__``.
        feats_from_spacy_doc : FeatsFromSpacyDoc or None
            Forwarded unchanged to ``TermDocMatrixFactory.__init__``.
        verbose : boolean, optional
            Stored as ``self._verbose``. Presumably enables a progress
            message every 100 documents while building; that logic is not
            part of this method.

        See Also
        --------
        TermDocMatrixFactory
        '''
        # Everything the parent factory understands is passed straight through.
        parent_options = dict(clean_function=clean_function,
                              nlp=nlp,
                              feats_from_spacy_doc=feats_from_spacy_doc)
        TermDocMatrixFactory.__init__(self, **parent_options)

        # reset_index() hands back a new frame with a fresh 0..n-1 index, so
        # the frame kept here is not the object the caller passed in.
        reindexed_frame = data_frame.reset_index()
        self.data_frame = reindexed_frame
        self._text_col = text_col
        self._verbose = verbose
# Example #2
# 0
def build_term_doc_matrix():
    '''Build a term-document matrix from the party speech pairs.

    A ``TermDocMatrixFactory`` is configured with the iterator returned by
    ``iter_party_speech_pairs()``, the cleaner returned by
    ``clean_function_factory()`` and ``whitespace_nlp`` as the parser.

    Returns
    -------
    The object produced by the factory's ``build()`` method.
    '''
    factory = TermDocMatrixFactory(
        category_text_iter=iter_party_speech_pairs(),
        clean_function=clean_function_factory(),
        nlp=whitespace_nlp,
    )
    return factory.build()