Code Example #1
    def use_external_metadata_lists(self, metadata_lists):
        '''
        Takes a list of string lists. Each list contains the metadata to associate with its corresponding document.
        :param metadata_lists: List[List[str]]
        :return: new TermDocMatrix
        '''
        metadata_index_store = IndexStore()
        metadata_csr_factory = CSRMatrixFactory()
        assert len(metadata_lists) == self.get_num_docs()
        print("STARTING")
        for doc_i, metadata_list in enumerate(metadata_lists):
            print("L", metadata_list)
            for metadatum in metadata_list:
                print("METADATUM", metadatum)
                # raise Exception(str(metadatum)
                #                + " " + str(type(metadatum)) + " " + str(len(metadatum)) + str(metadata_list)
                #                + " " + str(type(metadata_list)) + " " + str(len(metadata_list)) + str(metadata_lists))
                # raise Exception(f"METADATUM {metadatum} " + metadatum + ":::" + metadata_list)
                metadata_csr_factory[
                    doc_i, metadata_index_store.getidx(metadatum)] = 1

        return self._make_new_term_doc_matrix(
            new_mX=metadata_csr_factory.get_csr_matrix(dtype=int),
            new_metadata_idx_store=metadata_index_store,
            new_y_mask=self._y == self._y)
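A minimal usage sketch for the method above. The corpus variable and the metadata strings are assumed for illustration; the only requirement shown in the code is one list of metadata strings per document.

# Hypothetical usage: corpus is an existing TermDocMatrix/Corpus with two documents.
metadata_lists = [['speaker:SMITH', 'year:2008'],   # metadata for document 0
                  ['speaker:JONES', 'year:2012']]   # metadata for document 1
meta_corpus = corpus.use_external_metadata_lists(metadata_lists)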
Code Example #2
 def _get_term_indices_to_compact_from_term_freqs(self, term_freqs):
     fact = CSRMatrixFactory()
     idx = IndexStore()
     tdf_vals = term_freqs.values
     valid_terms_mask = tdf_vals.sum(axis=1) >= self.minimum_term_count
     tdf_vals = term_freqs[valid_terms_mask].values
     terms = np.array(term_freqs.index)[valid_terms_mask]
     lengths = []
     for i, t in enumerate(terms):
         for tok in t.split():
             fact[i, idx.getidx(tok)] = 1
         lengths.append(len(t.split()))
     lengths = np.array(lengths)
     mat = fact.get_csr_matrix()
     coocs = lengths - (mat * mat.T)
     pairs = np.argwhere(coocs == 0).T
     pairs = pairs.T[(pairs[0] != pairs[1])]
     pairs = pairs[np.array([terms[i[1]] in terms[i[0]] for i in pairs])]
     pairs = pairs[np.all(tdf_vals[pairs[:, 1]] <= tdf_vals[pairs[:, 0]],
                          axis=1)]
     idx_store = self.term_doc_matrix._term_idx_store
     redundant_terms = idx_store.getidxstrictbatch(terms[np.unique(
         pairs[:, 1])])
     infrequent_terms = np.argwhere(~valid_terms_mask).T[0]
     terms_to_remove = np.concatenate([redundant_terms, infrequent_terms])
     return terms_to_remove
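The key step above is coocs = lengths - (mat * mat.T): with a binary term-by-token matrix, coocs[i, j] == 0 means every token of term j also occurs in term i, which is how unigrams redundant with a containing bigram are detected. A self-contained toy illustration (NumPy/SciPy only; the terms are invented):

import numpy as np
from scipy.sparse import csr_matrix

# Two terms over two tokens: 'jobs' -> {jobs}, 'create jobs' -> {create, jobs}
terms = np.array(['jobs', 'create jobs'])
rows, cols = [0, 1, 1], [0, 0, 1]
mat = csr_matrix((np.ones(3), (rows, cols)), shape=(2, 2))
lengths = np.array([1, 2])  # number of tokens in each term

# coocs[i, j] == 0 when every token of terms[j] appears in terms[i]
coocs = lengths - (mat @ mat.T).toarray()
pairs = np.argwhere(coocs == 0).T
pairs = pairs.T[(pairs[0] != pairs[1])]
print(pairs)  # [[1 0]]: the unigram 'jobs' is redundant given the bigram 'create jobs'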
Code Example #3
    def _get_term_indices_to_compact_from_term_freqs(self, term_freqs,
                                                     term_doc_matrix,
                                                     non_text):
        idx = IndexStore()
        tdf_vals = term_freqs.values
        valid_terms_mask = tdf_vals.sum(axis=1) >= self.minimum_term_count
        tdf_vals = term_freqs[valid_terms_mask].values
        terms = np.array(term_freqs.index)[valid_terms_mask]

        lengths = []
        fact = CSRMatrixFactory()
        for i, t in enumerate(terms):
            for tok in t.split():
                fact[i, idx.getidx(tok)] = 1
            lengths.append(len(t.split()))
        lengths = np.array(lengths)
        mat = fact.get_csr_matrix()

        coocs = lengths - (mat * mat.T)
        pairs = np.argwhere(coocs == 0).T
        pairs = self._limit_to_non_identical_terms(pairs)
        pairs = self._limit_to_pairs_of_bigrams_and_a_constituent_unigram(
            pairs, terms)
        pairs = self._limit_to_redundant_unigrams(pairs, tdf_vals)
        idx_store = term_doc_matrix._get_relevant_idx_store(non_text)
        redundant_terms = idx_store.getidxstrictbatch(terms[np.unique(
            pairs[:, 1])])
        infrequent_terms = np.argwhere(~valid_terms_mask).T[0]
        terms_to_remove = np.concatenate([redundant_terms, infrequent_terms])
        return terms_to_remove
Code Example #4
    def build(self):
        '''Constructs the term doc matrix.

        Returns
        -------
        TermDocMatrix
        '''

        X_factory = CSRMatrixFactory()
        mX_factory = CSRMatrixFactory()
        term_idx_store = IndexStore()
        metadata_idx_store = IndexStore()

        parse_pipeline = ParsePipelineFactoryWithoutCategories(
            self.get_nlp(), X_factory, mX_factory,
            term_idx_store, metadata_idx_store, self)
        df = self._clean_and_filter_nulls_and_empties_from_dataframe()
        tdm = self._apply_pipeline_and_get_build_instance(X_factory,
                                                          mX_factory,
                                                          df,
                                                          parse_pipeline,
                                                          term_idx_store,
                                                          metadata_idx_store)
        return tdm
Code Example #5
	def _get_term_indices_to_compact_from_term_freqs(self, term_freqs, term_doc_matrix):
		idx = IndexStore()
		tdf_vals = term_freqs.values
		valid_terms_mask = tdf_vals.sum(axis=1) >= self.minimum_term_count
		tdf_vals = term_freqs[valid_terms_mask].values
		terms = np.array(term_freqs.index)[valid_terms_mask]

		lengths = []
		fact = CSRMatrixFactory()
		for i, t in enumerate(terms):
			for tok in t.split():
				fact[i, idx.getidx(tok)] = 1
			lengths.append(len(t.split()))
		lengths = np.array(lengths)
		mat = fact.get_csr_matrix()

		coocs = lengths - (mat * mat.T)
		pairs = np.argwhere(coocs == 0).T
		pairs = self._limit_to_non_identical_terms(pairs)
		pairs = self._limit_to_pairs_of_bigrams_and_a_constituent_unigram(pairs, terms)
		pairs = self._limit_to_redundant_unigrams(pairs, tdf_vals)
		idx_store = term_doc_matrix._term_idx_store
		redundant_terms = idx_store.getidxstrictbatch(terms[np.unique(pairs[:, 1])])
		infrequent_terms = np.argwhere(~valid_terms_mask).T[0]
		terms_to_remove = np.concatenate([redundant_terms, infrequent_terms])
		return terms_to_remove
Code Example #6
	def __init__(self,
	             df,
	             category_col,
	             parsed_col,
	             feats_from_spacy_doc=FeatsFromSpacyDoc()):

		'''
		Parameters
		----------
		df : pd.DataFrame
		 contains category_col and parsed_col, where parsed_col is entirely spaCy docs
		category_col : str
			name of category column in convention_df
		parsed_col : str
			name of spacy parsed column in convention_df
		feats_from_spacy_doc : FeatsFromSpacyDoc
		'''
		self._df = df.reset_index()
		self._category_col = category_col
		self._parsed_col = parsed_col
		self._category_idx_store = IndexStore()
		self._X_factory = CSRMatrixFactory()
		self._mX_factory = CSRMatrixFactory()
		self._term_idx_store = IndexStore()
		self._metadata_idx_store = IndexStore()
		self._feats_from_spacy_doc = feats_from_spacy_doc
Code Example #7
    def add_doc_names_as_metadata(self, doc_names):
        '''
        :param doc_names: array-like[str], document name of each document
        :return: Corpus-like object with doc names as metadata. If two documents share the same name,
        a sequence number will be appended to their names.
        '''
        if len(doc_names) != self.get_num_docs():
            raise Exception("The parameter doc_names contains %s elements. "
                            "It should have %s elements, one per document." %
                            (len(doc_names), self.get_num_docs()))

        doc_names_counter = collections.Counter(np.array(doc_names))
        metafact = CSRMatrixFactory()
        metaidxstore = IndexStore()
        doc_id_uses = collections.Counter()
        for i in range(self.get_num_docs()):
            doc_id = doc_names[i]
            if doc_names_counter[doc_id] > 1:
                doc_id_uses[doc_id] += 1
                doc_name_idx = metaidxstore.getidx(
                    '%s (%s)' % (doc_id, doc_id_uses[doc_id]))
            else:
                doc_name_idx = metaidxstore.getidx(doc_id)
            metafact[i, i] = doc_name_idx
        return self.add_metadata(metafact.get_csr_matrix(), metaidxstore)
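A hypothetical call for the method above; the corpus variable and document names are assumed. Duplicate names get a running number appended, e.g. 'speech (1)' and 'speech (2)'.

named_corpus = corpus.add_doc_names_as_metadata(['speech', 'speech', 'debate'])  # corpus assumed to have 3 docs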
Code Example #8
    def __init__(self,
                 df,
                 category_col,
                 parsed_col,
                 feats_from_spacy_doc=FeatsFromSpacyDoc()):
        '''
		Parameters
		----------
		df : pd.DataFrame
		 contains category_col and parsed_col, where parsed_col is entirely spaCy docs
		category_col : str
			name of category column in convention_df
		parsed_col : str
			name of spacy parsed column in convention_df
		feats_from_spacy_doc : FeatsFromSpacyDoc
		'''
        self._df = df.reset_index()
        self._category_col = category_col
        self._parsed_col = parsed_col
        self._category_idx_store = IndexStore()
        self._X_factory = CSRMatrixFactory()
        self._mX_factory = CSRMatrixFactory()
        self._term_idx_store = IndexStore()
        self._metadata_idx_store = IndexStore()
        self._feats_from_spacy_doc = feats_from_spacy_doc
Code Example #9
 def __init__(self,
              df,
              parsed_col,
              feat_and_offset_getter,
              category_col=None):
     '''
     Parameters
     ----------
     df : pd.DataFrame
      contains category_col and parsed_col, where parsed_col is entirely spaCy docs
     parsed_col : str
         name of spacy parsed column in convention_df
     feat_and_offset_getter : object
         extracts (feature, (count, offsets)) pairs from each parsed document
     category_col : str, Optional
         name of category column in df; if None, all category names will be '_'
     '''
     self._df = df.reset_index()
     self._category_col = category_col
     self._parsed_col = parsed_col
     self._category_idx_store = IndexStore()
     self._X_factory = CSRMatrixFactory()
     self._mX_factory = CSRMatrixFactory()
     self._term_idx_store = IndexStore()
     self._metadata_idx_store = IndexStore()
     self._feat_and_offset_getter = feat_and_offset_getter
     self._term_offsets = {}
     self._metadata_offsets = {}
Code Example #10
	def __init__(self,
	             df,
	             category_col,
	             text_col,
	             feature_col,
	             metadata_col=None,
	             parsed_col=None):

		'''
		Parameters
		----------
		df : pd.DataFrame
		 contains category_col and parsed_col, where parsed_col is entirely spaCy docs
		category_col : str
				name of category column in convention_df
		text_col : str
				The name of the column which contains each document's raw text.
		feature_col : str
				name of column in convention_df with a feature dictionary
		metadata_col : str, optional
				name of column in convention_df with a metadata dictionary
		parsed_col : str, optional
				name of column in convention_df with parsed strings
		'''
		self._df = df.reset_index()
		self._category_col = category_col
		self._text_col = text_col
		self._feature_col = feature_col
		self._parsed_col = parsed_col
		self._metadata_col = metadata_col
		self._category_idx_store = IndexStore()
		self._X_factory = CSRMatrixFactory()
		self._mX_factory = CSRMatrixFactory()
		self._term_idx_store = IndexStore()
		self._metadata_idx_store = IndexStore()
Code Example #11
class CorpusFromParsedDocuments(object):
	def __init__(self,
	             df,
	             category_col,
	             parsed_col,
	             feats_from_spacy_doc=FeatsFromSpacyDoc()):

		'''
		Parameters
		----------
		df : pd.DataFrame
		 contains category_col and parsed_col, where parsed_col is entirely spaCy docs
		category_col : str
			name of category column in df
		parsed_col : str
			name of spacy parsed column in df
		feats_from_spacy_doc : FeatsFromSpacyDoc
		'''
		self._df = df.reset_index()
		self._category_col = category_col
		self._parsed_col = parsed_col
		self._category_idx_store = IndexStore()
		self._X_factory = CSRMatrixFactory()
		self._mX_factory = CSRMatrixFactory()
		self._term_idx_store = IndexStore()
		self._metadata_idx_store = IndexStore()
		self._feats_from_spacy_doc = feats_from_spacy_doc

	def build(self):
		'''Constructs the term doc matrix.

		Returns
		-------
		scattertext.ParsedCorpus.ParsedCorpus
		'''
		self._y = self._get_y_and_populate_category_idx_store()
		self._df.apply(self._add_to_x_factory, axis=1)
		self._X = self._X_factory.set_last_row_idx(len(self._y)-1).get_csr_matrix()
		self._mX = self._mX_factory.set_last_row_idx(len(self._y)-1).get_csr_matrix()
		return ParsedCorpus(self._df,
		                    self._X,
		                    self._mX,
		                    self._y,
		                    self._term_idx_store,
		                    self._category_idx_store,
		                    self._metadata_idx_store,
		                    self._parsed_col,
		                    self._category_col)

	def _get_y_and_populate_category_idx_store(self):
		return np.array(self._df[self._category_col].apply(self._category_idx_store.getidx))

	def _add_to_x_factory(self, row):
		parsed_text = row[self._parsed_col]
		for term, count in self._feats_from_spacy_doc.get_feats(parsed_text).items():
			term_idx = self._term_idx_store.getidx(term)
			self._X_factory[row.name, term_idx] = count
		for meta, val in self._feats_from_spacy_doc.get_doc_metadata(parsed_text).items():
			meta_idx = self._metadata_idx_store.getidx(meta)
			self._mX_factory[row.name, meta_idx] = val
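A minimal build sketch for the class above, assuming a DataFrame convention_df with a 'party' category column and a spaCy-parsed 'parse' column (the column names follow the docstring's convention and are not taken from this source):

corpus = CorpusFromParsedDocuments(convention_df,          # assumed DataFrame of parsed documents
                                   category_col='party',   # assumed category column name
                                   parsed_col='parse').build()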
Code Example #12
 def test_main(self):
     mat_factory = CSRMatrixFactory()
     mat_factory[0, 0] = 4
     mat_factory[1, 5] = 3
     mat = mat_factory.get_csr_matrix()
     self.assertEqual(type(mat), csr_matrix)
     np.testing.assert_array_almost_equal(
         np.array([[4, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 3]]), mat.todense())
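The pattern this test exercises, shown in isolation (assuming CSRMatrixFactory and IndexStore are imported as in the snippets above): assign cell values by (row, column), let IndexStore map string features to column indices, and materialize a scipy csr_matrix at the end.

factory = CSRMatrixFactory()
idx = IndexStore()
for doc_i, tokens in enumerate([['apple', 'banana'], ['banana']]):
    for tok in tokens:
        factory[doc_i, idx.getidx(tok)] = 1  # getidx assigns a fresh column index to unseen tokens
mat = factory.get_csr_matrix()
print(mat.todense())  # dense view: [[1 1], [0 1]]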
Code Example #13
    def init_term_doc_matrix_variables():
        y = []
        X_factory = CSRMatrixFactory()
        mX_factory = CSRMatrixFactory()
        category_idx_store = IndexStore()
        term_idx_store = IndexStore()
        metadata_idx_store = IndexStore()

        return X_factory, mX_factory, category_idx_store, \
               term_idx_store, metadata_idx_store, y
Code Example #14
    def use_categories_as_metadata_and_replace_terms(self):
        '''
        Returns a TermDocMatrix identical to self, except that the metadata values are now the categories
        present and the term-document matrix is replaced by the metadata matrix.

        :return: TermDocMatrix
        '''
        new_metadata_factory = CSRMatrixFactory()
        for i, category_idx in enumerate(self.get_category_ids()):
            new_metadata_factory[i, category_idx] = 1
        new_metadata = new_metadata_factory.get_csr_matrix()
        new_tdm = self._make_new_term_doc_matrix(
            self._mX, new_metadata, self._y,
            self._metadata_idx_store, self._category_idx_store,
            copy(self._category_idx_store), self._y == self._y)
        return new_tdm
Code Example #15
    def feats_from_doc(self, raw_text):
        '''
        Parameters
        ----------
        raw_text : str, uncleaned text for parsing out features

        Returns
        -------
        csr_matrix, feature matrix
        '''
        parsed_text = self._nlp(self._clean_function(raw_text))
        X_factory = CSRMatrixFactory()
        X_factory.set_last_col_idx(self._term_idx_store.getnumvals() - 1)
        term_freq = self._get_features_from_parsed_text(
            parsed_text, self._term_idx_store)
        self._register_document_features_with_X_factory(
            X_factory, 0, term_freq)
        return X_factory.get_csr_matrix()
Code Example #16
	def test_typing(self):
		mat_factory = CSRMatrixFactory()
		mat_factory[0, 0] = 4
		mat_factory[1, 5] = 3.1
		mat = mat_factory.get_csr_matrix()
		self.assertEqual(type(mat), csr_matrix)
		np.testing.assert_array_almost_equal(
			np.array([[4, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 3.1]]),
			mat.todense())

		mat = mat_factory.get_csr_matrix(dtype=bool)  # np.bool was removed from NumPy; the builtin bool is equivalent here
		self.assertEqual(type(mat), csr_matrix)
		np.testing.assert_array_almost_equal(
			np.array([[1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1]]),
			mat.todense())

		mat = mat_factory.get_csr_matrix(dtype=np.int32)
		self.assertEqual(type(mat), csr_matrix)
		np.testing.assert_array_almost_equal(
			np.array([[4, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 3]]),
			mat.todense())
Code Example #17
def build_from_category_whitespace_delimited_text(category_text_iter):
    '''

    Parameters
    ----------
    category_text_iter : iterator of (category name, text) pairs; the text is whitespace-delimited, one sentence per line

    Returns
    -------
    TermDocMatrix
    '''
    y = []
    X_factory = CSRMatrixFactory()
    term_idx_store = IndexStore()
    category_idx_store = IndexStore()
    mX_factory = CSRMatrixFactory()
    for doci, (category, text) in enumerate(category_text_iter):
        y.append(category_idx_store.getidx(category))
        term_freq = Counter()
        for sent in text.strip(string.punctuation).lower().split('\n'):
            unigrams = []
            for tok in sent.strip().split():
                unigrams.append(tok)
            bigrams = list(map(' '.join, zip(unigrams[:-1], unigrams[1:])))
            for term in unigrams + bigrams:
                term_freq[term_idx_store.getidx(term)] += 1
        for word_idx, freq in term_freq.items():
            X_factory[doci, word_idx] = freq
    metadata_idx_store = IndexStore()
    return TermDocMatrix(X=X_factory.get_csr_matrix(),
                         mX=mX_factory.get_csr_matrix(),
                         y=np.array(y),
                         term_idx_store=term_idx_store,
                         metadata_idx_store=metadata_idx_store,
                         category_idx_store=category_idx_store)
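A usage sketch based on the docstring above; the categories and text are invented for illustration.

term_doc_matrix = build_from_category_whitespace_delimited_text([
    ('democrat', 'we will create jobs\njobs for everyone'),
    ('republican', 'we will cut taxes\ntaxes will be cut'),
])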
Code Example #18
    def add_doc_names_as_metadata(self, doc_names):
        '''
        :param doc_names: array-like[str], document name of each document
        :return: Corpus-like object with doc names as metadata. If two documents share the same name,
        a sequence number will be appended to their names.
        '''
        if len(doc_names) != self.get_num_docs():
            raise Exception("The parameter doc_names contains %s elements. "
                            "It should have %s elements, one per document." % (len(doc_names), self.get_num_docs()))

        doc_names_counter = collections.Counter(np.array(doc_names))
        metafact = CSRMatrixFactory()
        metaidxstore = IndexStore()
        doc_id_uses = collections.Counter()
        for i in range(self.get_num_docs()):
            doc_id = doc_names[i]
            if doc_names_counter[doc_id] > 1:
                doc_id_uses[doc_id] += 1
                doc_name_idx = metaidxstore.getidx('%s (%s)' % (doc_id, doc_id_uses[doc_id]))
            else:
                doc_name_idx = metaidxstore.getidx(doc_id)
            metafact[i, i] = doc_name_idx
        return self.add_metadata(metafact.get_csr_matrix(), metaidxstore)
Code Example #19
 def _get_features_and_labels_from_documents_and_indexes(
         self, category_doc_iter, category_idx_store, term_idx_store,
         metadata_idx_store):
     y = []
     X_factory = CSRMatrixFactory()
     mX_factory = CSRMatrixFactory()
     for document_index, (category,
                          parsed_text) in enumerate(category_doc_iter):
         self._register_doc_and_category(X_factory, mX_factory, category,
                                         category_idx_store, document_index,
                                         parsed_text, term_idx_store,
                                         metadata_idx_store, y)
     X = X_factory.get_csr_matrix()
     mX = mX_factory.get_csr_matrix()
     y = np.array(y)
     return X, mX, y
Code Example #20
class CorpusFromFeatureDict(object):
	def __init__(self,
	             df,
	             category_col,
	             text_col,
	             feature_col,
	             metadata_col=None,
	             parsed_col=None):

		'''
		Parameters
		----------
		df : pd.DataFrame
		 contains category_col and parsed_col, where parsed_col is entirely spaCy docs
		category_col : str
				name of category column in convention_df
		text_col : str
				The name of the column which contains each document's raw text.
		feature_col : str
				name of column in convention_df with a feature dictionary
		metadata_col : str, optional
				name of column in convention_df with a metadata dictionary
		parsed_col : str, optional
				name of column in convention_df with parsed strings
		'''
		self._df = df.reset_index()
		self._category_col = category_col
		self._text_col = text_col
		self._feature_col = feature_col
		self._parsed_col = parsed_col
		self._metadata_col = metadata_col
		self._category_idx_store = IndexStore()
		self._X_factory = CSRMatrixFactory()
		self._mX_factory = CSRMatrixFactory()
		self._term_idx_store = IndexStore()
		self._metadata_idx_store = IndexStore()

	def build(self):
		'''Constructs the term doc matrix.

		Returns
		-------
		scattertext.ParsedCorpus.ParsedCorpus
		'''
		self._y = self._get_y_and_populate_category_idx_store()
		self._df.apply(self._add_to_x_factory, axis=1)
		self._X = self._X_factory.set_last_row_idx(len(self._y) - 1).get_csr_matrix()
		self._mX = self._mX_factory.set_last_row_idx(len(self._y) - 1).get_csr_matrix()
		if self._parsed_col is not None and self._parsed_col in self._df:
			return ParsedCorpus(self._df,
			                    self._X,
			                    self._mX,
			                    self._y,
			                    self._term_idx_store,
			                    self._category_idx_store,
			                    self._metadata_idx_store,
			                    self._parsed_col,
			                    self._category_col)
		else:
			return CorpusDF(self._df,
			                self._X,
			                self._mX,
			                self._y,
			                self._text_col,
			                self._term_idx_store,
			                self._category_idx_store,
			                self._metadata_idx_store)

	def _get_y_and_populate_category_idx_store(self):
		return np.array(self._df[self._category_col].apply(self._category_idx_store.getidx))

	def _add_to_x_factory(self, row):
		for feat, count in row[self._feature_col].items():
			feat_idx = self._term_idx_store.getidx(feat)
			self._X_factory[row.name, feat_idx] = count
		if self._metadata_col in self._df:
			for meta, count in row[self._metadata_col].items():
				meta_idx = self._metadata_idx_store.getidx(meta)
				self._mX_factory[row.name, meta_idx] = count

	def _make_new_term_doc_matrix(self,
	                              new_X,
	                              new_mX,
	                              new_y,
	                              new_term_idx_store,
	                              new_category_idx_store,
	                              new_metadata_idx_store,
	                              new_y_mask):
		if self._parsed_col is not None and self._parsed_col in self._df:
			return ParsedCorpus(self._df[new_y_mask],
			                    new_X,
			                    new_mX,
			                    new_y,
			                    new_term_idx_store,
			                    new_category_idx_store,
			                    new_metadata_idx_store,
			                    self._parsed_col,
			                    self._category_col)
		else:
			return CorpusDF(self._df[new_y_mask],
			                new_X,
			                new_mX,
			                new_y,
			                self._text_col,
			                new_term_idx_store,
			                new_category_idx_store,
			                new_metadata_idx_store,
			                self._df[self._text_col][new_y_mask])
Code Example #21
class CorpusFromParsedDocuments(object):
	def __init__(self,
	             df,
	             category_col,
	             parsed_col,
	             feats_from_spacy_doc=FeatsFromSpacyDoc()):

		'''
		Parameters
		----------
		df : pd.DataFrame
		 contains category_col and parsed_col, where parsed_col is entirely spaCy docs
		category_col : str
			name of category column in convention_df
		parsed_col : str
			name of spacy parsed column in convention_df
		feats_from_spacy_doc : FeatsFromSpacyDoc
		'''
		self._df = df.reset_index()
		self._category_col = category_col
		self._parsed_col = parsed_col
		self._category_idx_store = IndexStore()
		self._X_factory = CSRMatrixFactory()
		self._mX_factory = CSRMatrixFactory()
		self._term_idx_store = IndexStore()
		self._metadata_idx_store = IndexStore()
		self._feats_from_spacy_doc = feats_from_spacy_doc

	def build(self):
		'''Constructs the term doc matrix.

		Returns
		-------
		scattertext.ParsedCorpus.ParsedCorpus
		'''
		self._y = self._get_y_and_populate_category_idx_store()
		self._df.apply(self._add_to_x_factory, axis=1)
		self._X = self._X_factory.set_last_row_idx(len(self._y)-1).get_csr_matrix()
		self._mX = self._mX_factory.set_last_row_idx(len(self._y)-1).get_csr_matrix()
		return ParsedCorpus(self._df,
		                    self._X,
		                    self._mX,
		                    self._y,
		                    self._term_idx_store,
		                    self._category_idx_store,
		                    self._metadata_idx_store,
		                    self._parsed_col,
		                    self._category_col)

	def _get_y_and_populate_category_idx_store(self):
		return np.array(self._df[self._category_col].apply(self._category_idx_store.getidx))

	def _add_to_x_factory(self, row):
		parsed_text = row[self._parsed_col]
		for term, count in self._feats_from_spacy_doc.get_feats(parsed_text).items():
			term_idx = self._term_idx_store.getidx(term)
			self._X_factory[row.name, term_idx] = count
		for meta, val in self._feats_from_spacy_doc.get_doc_metadata(parsed_text).items():
			meta_idx = self._metadata_idx_store.getidx(meta)
			self._mX_factory[row.name, meta_idx] = val

	def _make_new_term_doc_matrix(self,
	                              new_X,
	                              new_mX,
	                              new_y,
	                              new_term_idx_store,
	                              new_category_idx_store,
	                              new_metadata_idx_store,
	                              new_y_mask):
		return ParsedCorpus(self._df[new_y_mask],
		                    new_X,
		                    new_mX,
		                    new_y,
		                    new_term_idx_store,
		                    new_category_idx_store,
		                    new_metadata_idx_store,
		                    self._parsed_col,
		                    self._category_col)
Code Example #22
    def use_doc_labeled_terms_as_metadata(self, doc_labels, separator='_', replace_metadata=True):
        '''
        Makes the metadata of a new TermDocMatrix a copy of the term-document matrix, except each term is prefixed
        by its document's label followed by the separator.

        :param doc_labels: list[str], should be the same size as the number of documents in the TermDocMatrix.
        :param separator: str, default is '_'
        :return: new TermDocMatrix
        '''

        assert len(doc_labels) == self.get_num_docs()

        doc_labels = np.array(doc_labels)

        terms_in_corpus = np.array(self._term_idx_store.values())
        new_metadata_list = []
        new_meta_X = None

        ordered_doc_labels = list(sorted(set(doc_labels)))
        X = self._X
        if replace_metadata:
            #X = self._mX
            X = self._X

        for doc_label in ordered_doc_labels:
            label_doc_mask = doc_labels == doc_label
            label_X = X[label_doc_mask, :]
            label_term_mask = (X.sum(axis=0) > 0).A1
            label_X = label_X[:, label_term_mask]
            cols_to_pad = len(new_metadata_list)

            new_metadata_list += [doc_label + separator + term
                                  for term in terms_in_corpus[label_term_mask]]
            if new_meta_X is None:
                new_meta_X = label_X
            else:
                label_X_pad = (CSRMatrixFactory()
                               .set_last_col_idx(cols_to_pad - 1)
                               .set_last_row_idx(sum(label_doc_mask) - 1)
                               .get_csr_matrix())
                padded_label_X = scipy.sparse.hstack([label_X_pad, label_X])
                new_meta_X.resize(new_meta_X.shape[0], padded_label_X.shape[1])
                new_meta_X = scipy.sparse.vstack([new_meta_X,
                                                  padded_label_X])

        new_metadata_idx_store = IndexStoreFromList.build(new_metadata_list)
        new_meta_X = new_meta_X.tocsr()
        new_mX = (CSRMatrixFactory()
                  .set_last_col_idx(new_meta_X.shape[1] - 1)
                  .set_last_row_idx(new_meta_X.shape[0] - 1)
                  .get_csr_matrix().tolil())
        start_row = 0
        for doc_label in ordered_doc_labels:
            label_doc_mask = doc_labels == doc_label
            num_rows = sum(label_doc_mask)
            new_mX[label_doc_mask, :] = new_meta_X[start_row:start_row + num_rows, :]
            start_row += num_rows

        new_mX = new_mX.tocsr()
        new_tdm = self._make_new_term_doc_matrix(self._X,
                                                 new_mX,
                                                 self._y,
                                                 self._term_idx_store,
                                                 self._category_idx_store,
                                                 new_metadata_idx_store,
                                                 self._y == self._y)
        return new_tdm
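A hypothetical call for the method above; doc_labels is assumed to hold one label (for example a speaker name) per document.

labeled_corpus = corpus.use_doc_labeled_terms_as_metadata(
    doc_labels=df['speaker'].values,  # hypothetical column; one label per document
    separator='_')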
Code Example #23
class OffsetCorpusFactory(object):
    def __init__(self,
                 df,
                 parsed_col,
                 feat_and_offset_getter,
                 category_col=None):
        '''
        Parameters
        ----------
        df : pd.DataFrame
         contains category_col and parsed_col, where parsed_col is entirely spaCy docs
        parsed_col : str
            name of spacy parsed column in convention_df
        feat_and_offset_getter : object
            extracts (feature, (count, offsets)) pairs from each parsed document
        category_col : str, Optional
            name of category column in df; if None, all category names will be '_'
        '''
        self._df = df.reset_index()
        self._category_col = category_col
        self._parsed_col = parsed_col
        self._category_idx_store = IndexStore()
        self._X_factory = CSRMatrixFactory()
        self._mX_factory = CSRMatrixFactory()
        self._term_idx_store = IndexStore()
        self._metadata_idx_store = IndexStore()
        self._feat_and_offset_getter = feat_and_offset_getter
        self._term_offsets = {}
        self._metadata_offsets = {}

    def build(self):
        '''Constructs the term doc matrix.

        Returns
        -------
        scattertext.ParsedCorpus.ParsedCorpus
        '''
        self._ensure_category_col_is_in_df()

        y = self._get_y_and_populate_category_idx_store(
            self._df[self._category_col])
        self._df.apply(self._add_to_x_factory, axis=1)
        self._mX = self._mX_factory.set_last_row_idx(len(y) -
                                                     1).get_csr_matrix()
        return OffsetCorpus(
            df=self._df,
            X=self._X_factory.set_last_row_idx(len(y) - 1).get_csr_matrix(),
            mX=self._mX_factory.set_last_row_idx(len(y) - 1).get_csr_matrix(),
            y=self._get_y_and_populate_category_idx_store(
                self._df[self._category_col]),
            term_idx_store=self._term_idx_store,
            category_idx_store=self._category_idx_store,
            metadata_idx_store=self._metadata_idx_store,
            parsed_col=self._parsed_col,
            category_col=self._category_col,
            term_offsets=self._term_offsets,
            metadata_offsets=self._metadata_offsets)

    def _ensure_category_col_is_in_df(self):
        if self._category_col not in self._df:
            self._category_col = 'Category'
            while self._category_col in self._df:
                self._category_col = 'Category_' + ''.join(
                    np.random.choice(string.ascii_letters) for _ in range(5))

    def _get_y_and_populate_category_idx_store(self, categories):
        return np.array(categories.apply(self._category_idx_store.getidx))

    def _add_to_x_factory(self, row):
        parsed_text = row[self._parsed_col]
        for term, (count,
                   offsets) in self._feat_and_offset_getter.get_term_offsets(
                       parsed_text):
            term_idx = self._term_idx_store.getidx(term)
            self._X_factory[row.name, term_idx] = count
            if offsets is not None:
                self._term_offsets.setdefault(term, {}).setdefault(
                    row.name, []).extend(offsets)

        for meta, (
                val, offsets
        ) in self._feat_and_offset_getter.get_metadata_offsets(parsed_text):
            meta_idx = self._metadata_idx_store.getidx(meta)
            self._mX_factory[row.name, meta_idx] = val
            if offsets is not None:
                self._metadata_offsets.setdefault(meta, {}).setdefault(
                    row.name, []).extend(offsets)
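A build sketch for the factory above, assuming a parsed DataFrame and any getter object exposing the get_term_offsets / get_metadata_offsets methods the code relies on.

offset_corpus = OffsetCorpusFactory(
    df=convention_df,                         # assumed: has a spaCy-parsed 'parse' column
    parsed_col='parse',
    feat_and_offset_getter=my_offset_getter,  # hypothetical: exposes get_term_offsets / get_metadata_offsets
    category_col='party'
).build()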