Example #1
    def _get_term_indices_to_compact_from_term_freqs(self, term_freqs,
                                                     term_doc_matrix,
                                                     non_text):
        idx = IndexStore()
        tdf_vals = term_freqs.values
        valid_terms_mask = tdf_vals.sum(axis=1) >= self.minimum_term_count
        tdf_vals = term_freqs[valid_terms_mask].values
        terms = np.array(term_freqs.index)[valid_terms_mask]

        lengths = []
        fact = CSRMatrixFactory()
        for i, t in enumerate(terms):
            for tok in t.split():
                fact[i, idx.getidx(tok)] = 1
            lengths.append(len(t.split()))
        lengths = np.array(lengths)
        mat = fact.get_csr_matrix()

        coocs = lengths - (mat * mat.T)
        pairs = np.argwhere(coocs == 0).T
        pairs = self._limit_to_non_identical_terms(pairs)
        pairs = self._limit_to_pairs_of_bigrams_and_a_constituent_unigram(
            pairs, terms)
        pairs = self._limit_to_redundant_unigrams(pairs, tdf_vals)
        idx_store = term_doc_matrix._get_relevant_idx_store(non_text)
        redundant_terms = idx_store.getidxstrictbatch(terms[np.unique(
            pairs[:, 1])])
        infrequent_terms = np.argwhere(~valid_terms_mask).T[0]
        terms_to_remove = np.concatenate([redundant_terms, infrequent_terms])
        return terms_to_remove

    def use_external_metadata_lists(self, metadata_lists):
        '''
        Takes a list of lists of strings. Each inner list contains the metadata
        to associate with the corresponding document.
        :param metadata_lists: List[List[str]]
        :return: new TermDocMatrix
        '''
        metadata_index_store = IndexStore()
        metadata_csr_factory = CSRMatrixFactory()
        assert len(metadata_lists) == self.get_num_docs()
        print("STARTING")
        for doc_i, metadata_list in enumerate(metadata_lists):
            print("L", metadata_list)
            for metadatum in metadata_list:
                print("METADATUM", metadatum)
                # raise Exception(str(metadatum)
                #                + " " + str(type(metadatum)) + " " + str(len(metadatum)) + str(metadata_list)
                #                + " " + str(type(metadata_list)) + " " + str(len(metadata_list)) + str(metadata_lists))
                # raise Exception(f"METADATUM {metadatum} " + metadatum + ":::" + metadata_list)
                metadata_csr_factory[
                    doc_i, metadata_index_store.getidx(metadatum)] = 1

        return self._make_new_term_doc_matrix(
            new_mX=metadata_csr_factory.get_csr_matrix(dtype=int),
            new_metadata_idx_store=metadata_index_store,
            new_y_mask=self._y == self._y)
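
As a usage sketch (hedged: term_doc_matrix and its three documents are assumed here, not taken from the examples above), each inner list supplies the metadata strings for one document:

    # Hypothetical usage: one metadata list per document of an existing
    # three-document TermDocMatrix.
    metadata_lists = [['author:shakespeare'],
                      ['author:marlowe', 'genre:tragedy'],
                      ['genre:comedy']]
    augmented_matrix = term_doc_matrix.use_external_metadata_lists(metadata_lists)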
	def _get_term_indices_to_compact_from_term_freqs(self, term_freqs, term_doc_matrix):
		idx = IndexStore()
		tdf_vals = term_freqs.values
		valid_terms_mask = tdf_vals.sum(axis=1) >= self.minimum_term_count
		tdf_vals = term_freqs[valid_terms_mask].values
		terms = np.array(term_freqs.index)[valid_terms_mask]

		lengths = []
		fact = CSRMatrixFactory()
		for i, t in enumerate(terms):
			for tok in t.split():
				fact[i, idx.getidx(tok)] = 1
			lengths.append(len(t.split()))
		lengths = np.array(lengths)
		mat = fact.get_csr_matrix()

		coocs = lengths - (mat * mat.T)
		pairs = np.argwhere(coocs == 0).T
		pairs = self._limit_to_non_identical_terms(pairs)
		pairs = self._limit_to_pairs_of_bigrams_and_a_constituent_unigram(pairs, terms)
		pairs = self._limit_to_redundant_unigrams(pairs, tdf_vals)
		idx_store = term_doc_matrix._term_idx_store
		redundant_terms = idx_store.getidxstrictbatch(terms[np.unique(pairs[:, 1])])
		infrequent_terms = np.argwhere(~valid_terms_mask).T[0]
		terms_to_remove = np.concatenate([redundant_terms, infrequent_terms])
		return terms_to_remove
Example #4
 def _get_term_indices_to_compact_from_term_freqs(self, term_freqs):
     fact = CSRMatrixFactory()
     idx = IndexStore()
     tdf_vals = term_freqs.values
     valid_terms_mask = tdf_vals.sum(axis=1) >= self.minimum_term_count
     tdf_vals = term_freqs[valid_terms_mask].values
     terms = np.array(term_freqs.index)[valid_terms_mask]
     lengths = []
     for i, t in enumerate(terms):
         for tok in t.split():
             fact[i, idx.getidx(tok)] = 1
         lengths.append(len(t.split()))
     lengths = np.array(lengths)
     mat = fact.get_csr_matrix()
     coocs = lengths - (mat * mat.T)
     pairs = np.argwhere(coocs == 0).T
     pairs = pairs.T[(pairs[0] != pairs[1])]
     pairs = pairs[np.array([terms[i[1]] in terms[i[0]] for i in pairs])]
     pairs = pairs[np.all(tdf_vals[pairs[:, 1]] <= tdf_vals[pairs[:, 0]],
                          axis=1)]
     idx_store = self.term_doc_matrix._term_idx_store
     redundant_terms = idx_store.getidxstrictbatch(terms[np.unique(
         pairs[:, 1])])
     infrequent_terms = np.argwhere(~valid_terms_mask).T[0]
     terms_to_remove = np.concatenate([redundant_terms, infrequent_terms])
     return terms_to_remove
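
To make the coocs = lengths - (mat * mat.T) trick concrete, here is a minimal dense-numpy sketch (the example data is invented): entry (i, j) of mat * mat.T counts tokens shared by terms i and j, so subtracting it from term j's token count yields zero exactly when every token of term j also occurs in term i.

    import numpy as np

    terms = np.array(['the', 'crown', 'the crown'])
    vocab = {'the': 0, 'crown': 1}
    lengths = np.array([len(t.split()) for t in terms])
    # Binary token-incidence matrix: one row per term, one column per token.
    mat = np.zeros((len(terms), len(vocab)), dtype=int)
    for i, t in enumerate(terms):
        for tok in t.split():
            mat[i, vocab[tok]] = 1

    coocs = lengths - mat.dot(mat.T)
    # Zeros appear at (2, 0) and (2, 1), since 'the' and 'crown' are each
    # contained in 'the crown', plus the trivial diagonal where i == j.
    pairs = np.argwhere(coocs == 0)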
Example #5
    def __init__(self,
                 X: csr_matrix,
                 term_vocabulary: List[str],
                 mX: Optional[csr_matrix] = None,
                 y: Optional[np.array] = None,
                 category_names: Optional[List[str]] = None,
                 metadata_vocabulary: Optional[List[str]] = None,
                 text_df: Optional[pd.DataFrame] = None,
                 text_col: Optional[str] = None,
                 parsed_col: Optional[str] = None,
                 category_col: Optional[str] = None,
                 unigram_frequency_path: Optional[str] = None):
        '''
        Parameters
        ----------
        X: csr_matrix; term-document frequency matrix; columns represent terms and rows documents
        term_vocabulary: List[str]; each entry corresponds to a term
        mX: Optional[csr_matrix]; metadata csr matrix
        y: Optional[np.array[int]]; indices of category names for each document
        category_names: Optional[List[str]]; names of categories for y
        metadata_vocabulary: Optional[List[str]]; each entry corresponds to a metadata term
        text_df: Optional[pd.DataFrame]; contains a column with the raw document text
        text_col: Optional[str]; name of the column containing the text of each document
        parsed_col: Optional[str]; name of the column containing the parsed text of each document
        category_col: Optional[str]; name of the column containing each document's category
        unigram_frequency_path: Optional[str] (see TermDocMatrix)
        '''
        self.X = X
        self.term_idx_store = IndexStoreFromList.build(term_vocabulary)
        assert self.X.shape[1] == len(term_vocabulary)
        if y is None:
            self.y = np.zeros(self.X.shape[0], dtype=int)
            self.category_idx_store = IndexStoreFromList.build(['_'])
            assert category_names is None
        else:
            self.y = y
            assert len(category_names) == len(set(y))
            self.category_idx_store = IndexStoreFromList.build(category_names)
        if metadata_vocabulary is not None:
            assert mX.shape[1] == len(metadata_vocabulary)
            self.mX = mX
            self.metadata_idx_store = IndexStoreFromList.build(
                metadata_vocabulary)
        else:
            assert mX is None
            self.mX = csr_matrix((0, 0))
            self.metadata_idx_store = IndexStore()
        self.text_df = text_df
        if parsed_col is not None:
            assert parsed_col in text_df
        if text_col is not None:
            assert text_col in text_df
        if category_col is not None:
            assert category_col in text_df
        self.category_col = category_col
        self.text_col = text_col
        self.parsed_col = parsed_col
        self.unigram_frequency_path = unigram_frequency_path
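
A hedged construction sketch follows; the class name is not visible in this snippet, so SomeTermDocMatrixClass is a placeholder for whatever class defines this __init__:

    from scipy.sparse import csr_matrix
    import numpy as np

    # Two documents over a three-term vocabulary, one category each.
    X = csr_matrix(np.array([[1, 0, 2],
                             [0, 1, 1]]))
    tdm = SomeTermDocMatrixClass(  # placeholder class name
        X=X,
        term_vocabulary=['crown', 'king', 'the'],
        y=np.array([0, 1]),
        category_names=['hamlet', 'macbeth'])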
 def test_add_metadata(self):
     hamlet = get_hamlet_term_doc_matrix()
     meta_index_store = IndexStore()
     meta_fact = CSRMatrixFactory()
     for i in range(hamlet.get_num_docs()):
         meta_fact[i, i] = meta_index_store.getidx(str(i))
     other_hamlet = hamlet.add_metadata(meta_fact.get_csr_matrix(),
                                        meta_index_store)
     assert other_hamlet != hamlet
     meta_index_store = IndexStore()
     meta_fact = CSRMatrixFactory()
     for i in range(hamlet.get_num_docs() - 5):
         meta_fact[i, i] = meta_index_store.getidx(str(i))
     with self.assertRaises(AssertionError):
         hamlet.add_metadata(meta_fact.get_csr_matrix(),
                             meta_index_store)
	def build(term_to_index_dict):
		'''
		Parameters
		----------
		term_to_index_dict: term -> idx dictionary

		Returns
		-------
		IndexStore
		'''
		idxstore = IndexStore()
		idxstore._val2i = term_to_index_dict
		idxstore._next_i = len(term_to_index_dict)
		idxstore._i2val = [None for _ in range(idxstore._next_i)]
		for term, idx in idxstore._val2i.items():
			idxstore._i2val[idx] = term
		return idxstore
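
A brief usage sketch (the enclosing class is not shown above; judging by the _get_build_kwargs example further down, it is presumably IndexStoreFromDict):

    # Hypothetical vocabulary, e.g. scikit-learn's CountVectorizer.vocabulary_.
    vocab = {'the': 0, 'crown': 1, 'king': 2}
    idx_store = IndexStoreFromDict.build(vocab)
    # The store now maps both ways: index 1 <-> 'crown', and so on.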
	def test_build(self):
		from sklearn.feature_extraction.text import CountVectorizer
		categories, docs = get_docs_categories_semiotic()
		idx_store = IndexStore()
		y = np.array([idx_store.getidx(c) for c in categories])
		count_vectorizer = CountVectorizer()
		X_counts = count_vectorizer.fit_transform(docs)
		term_doc_mat = TermDocMatrixFromScikit(
			X=X_counts,
			y=y,
			feature_vocabulary=count_vectorizer.vocabulary_,
			category_names=idx_store.values()).build()
		self.assertEqual(term_doc_mat.get_categories()[:2], ['hamlet', 'jay-z/r. kelly'])
		self.assertEqual(term_doc_mat
		                 .get_term_freq_df()
		                 .assign(score=term_doc_mat.get_scaled_f_scores('hamlet'))
		                 .sort_values(by='score', ascending=False).index.tolist()[:5],
		                 ['that', 'march', 'did', 'majesty', 'sometimes'])
	def _get_build_kwargs(self):
		constructor_kwargs = {
			'X': csr_matrix(self.term_freq_df.values.T),
			'mX': csr_matrix((0, 0)),
			'y': np.array(range(len(self.term_freq_df.columns))),
			'term_idx_store': IndexStoreFromList.build(self.term_freq_df.index.values),
			'metadata_idx_store': IndexStore(),
			'category_idx_store': IndexStoreFromList.build(self.term_freq_df.columns),
			'unigram_frequency_path': self.unigram_frequency_path
		}
		return constructor_kwargs
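
The keys here match the TermDocMatrix constructor arguments used throughout these examples, so presumably the dict is unpacked straight into it (the actual call site is not shown in this snippet):

    # Presumed call site, not shown above:
    term_doc_matrix = TermDocMatrix(**self._get_build_kwargs())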
 def _get_build_kwargs(self):
     constructor_kwargs = {
         'X': self.X,
         'mX': csr_matrix((0, 0)),
         'y': self.y,
         'term_idx_store':
         IndexStoreFromDict.build(self.feature_vocabulary),
         'metadata_idx_store': IndexStore(),
         'category_idx_store':
         IndexStoreFromList.build(self.category_names),
         'unigram_frequency_path': self.unigram_frequency_path
     }
     return constructor_kwargs