def __init__(self, df, category_col, text_col, feature_col, metadata_col=None, parsed_col=None):
    '''
    Parameters
    ----------
    df : pd.DataFrame
        Must contain category_col, text_col, and feature_col; parsed_col,
        if given, should hold parsed (spaCy) documents.
    category_col : str
        Name of the category column in df.
    text_col : str
        Name of the column which contains each document's raw text.
    feature_col : str
        Name of the column in df holding a feature dictionary per document.
    metadata_col : str, optional
        Name of the column in df holding a metadata dictionary per document.
    parsed_col : str, optional
        Name of the column in df holding parsed documents.
    '''
    # Reset the index so documents are addressable by position 0..n-1.
    self._df = df.reset_index()
    self._category_col = category_col
    self._text_col = text_col
    self._feature_col = feature_col
    self._parsed_col = parsed_col
    self._metadata_col = metadata_col
    # Index stores map categories/terms/metadata strings to integer ids;
    # the factories accumulate sparse term and metadata counts.
    self._category_idx_store = IndexStore()
    self._term_idx_store = IndexStore()
    self._metadata_idx_store = IndexStore()
    self._X_factory = CSRMatrixFactory()
    self._mX_factory = CSRMatrixFactory()
def __init__(self, df, category_col, parsed_col, feats_from_spacy_doc=FeatsFromSpacyDoc()):
    '''
    Parameters
    ----------
    df : pd.DataFrame
        Must contain category_col and parsed_col, where parsed_col holds
        spaCy-parsed documents.
    category_col : str
        Name of the category column in df.
    parsed_col : str
        Name of the spaCy-parsed column in df.
    feats_from_spacy_doc : FeatsFromSpacyDoc
        Feature extractor applied to each parsed document.
        NOTE(review): the default instance is shared across calls; confirm
        FeatsFromSpacyDoc is stateless.
    '''
    # Reset the index so documents are addressable by position 0..n-1.
    self._df = df.reset_index()
    self._category_col = category_col
    self._parsed_col = parsed_col
    self._feats_from_spacy_doc = feats_from_spacy_doc
    # Index stores map strings to integer ids; factories accumulate
    # sparse term (X) and metadata (mX) counts.
    self._category_idx_store = IndexStore()
    self._term_idx_store = IndexStore()
    self._metadata_idx_store = IndexStore()
    self._X_factory = CSRMatrixFactory()
    self._mX_factory = CSRMatrixFactory()
def build_from_category_whitespace_delimited_text(category_text_iter):
    '''
    Build a TermDocMatrix from raw whitespace-delimited text.

    Parameters
    ----------
    category_text_iter : iterator
        Iterator of (string category name, one-line-per-sentence,
        whitespace-delimited text) pairs.

    Returns
    -------
    TermDocMatrix
    '''
    category_ids = []
    X_factory = CSRMatrixFactory()
    mX_factory = CSRMatrixFactory()
    term_idx_store = IndexStore()
    category_idx_store = IndexStore()
    for doc_i, (category, text) in enumerate(category_text_iter):
        category_ids.append(category_idx_store.getidx(category))
        term_freq = Counter()
        # NOTE: punctuation is stripped only from the ends of the whole
        # document (original behavior), not from each token.
        for sent in text.strip(string.punctuation).lower().split('\n'):
            unigrams = sent.strip().split()
            bigrams = [' '.join(pair) for pair in zip(unigrams[:-1], unigrams[1:])]
            for term in unigrams + bigrams:
                term_freq[term_idx_store.getidx(term)] += 1
        for term_id, freq in term_freq.items():
            X_factory[doc_i, term_id] = freq
    # No metadata is extracted from raw text; the store stays empty.
    metadata_idx_store = IndexStore()
    return TermDocMatrix(X=X_factory.get_csr_matrix(),
                         mX=mX_factory.get_csr_matrix(),
                         y=np.array(category_ids),
                         term_idx_store=term_idx_store,
                         metadata_idx_store=metadata_idx_store,
                         category_idx_store=category_idx_store)
def build(self):
    '''Construct the term-document matrix.

    Returns
    -------
    TermDocMatrix
    '''
    term_idx_store = IndexStore()
    metadata_idx_store = IndexStore()
    X_factory = CSRMatrixFactory()
    mX_factory = CSRMatrixFactory()
    # The parse pipeline registers each document's terms and metadata
    # into the factories and index stores as it runs.
    parse_pipeline = ParsePipelineFactoryWithoutCategories(
        self.get_nlp(), X_factory, mX_factory, term_idx_store,
        metadata_idx_store, self)
    clean_df = self._clean_and_filter_nulls_and_empties_from_dataframe()
    return self._apply_pipeline_and_get_build_instance(
        X_factory, mX_factory, clean_df, parse_pipeline,
        term_idx_store, metadata_idx_store)
def __init__(self, df, parsed_col, feat_and_offset_getter, category_col=None):
    '''
    Parameters
    ----------
    df : pd.DataFrame
        Must contain parsed_col (and category_col, if given), where
        parsed_col holds spaCy-parsed documents.
    parsed_col : str
        Name of the spaCy-parsed column in df.
    feat_and_offset_getter : object
        Extractor yielding features and their character offsets for each
        parsed document.
    category_col : str, optional
        Name of the category column in df; if None, all category names
        will be '_'.
    '''
    # Reset the index so documents are addressable by position 0..n-1.
    self._df = df.reset_index()
    self._category_col = category_col
    self._parsed_col = parsed_col
    self._feat_and_offset_getter = feat_and_offset_getter
    # Index stores map strings to integer ids; factories accumulate
    # sparse term (X) and metadata (mX) counts.
    self._category_idx_store = IndexStore()
    self._term_idx_store = IndexStore()
    self._metadata_idx_store = IndexStore()
    self._X_factory = CSRMatrixFactory()
    self._mX_factory = CSRMatrixFactory()
    # Character-offset records keyed by term/metadata string.
    self._term_offsets = {}
    self._metadata_offsets = {}
def init_term_doc_matrix_variables():
    '''Create the empty factories and index stores used to build a
    term-document matrix.

    Returns
    -------
    tuple
        (X factory, metadata factory, category index store,
        term index store, metadata index store, empty label list)
    '''
    labels = []
    return (CSRMatrixFactory(),
            CSRMatrixFactory(),
            IndexStore(),
            IndexStore(),
            IndexStore(),
            labels)
def add_doc_names_as_metadata(self, doc_names):
    '''
    :param doc_names: array-like[str], one document name per document
    :return: Corpus-like object with doc names as metadata. If two
        documents share the same name, a running count is appended to
        their names to disambiguate them.
    '''
    num_docs = self.get_num_docs()
    if len(doc_names) != num_docs:
        raise Exception("The parameter doc_names contains %s elements. "
                        "It should have %s elements, one per document."
                        % (len(doc_names), num_docs))
    name_counts = collections.Counter(np.array(doc_names))
    meta_factory = CSRMatrixFactory()
    meta_idx_store = IndexStore()
    uses_so_far = collections.Counter()
    for doc_i in range(num_docs):
        name = doc_names[doc_i]
        if name_counts[name] > 1:
            # Duplicate name: append its occurrence number, e.g. "foo (2)".
            uses_so_far[name] += 1
            meta_idx = meta_idx_store.getidx('%s (%s)' % (name, uses_so_far[name]))
        else:
            meta_idx = meta_idx_store.getidx(name)
        # NOTE(review): stores the name's index as the value at the
        # diagonal cell (doc, doc) — original behavior, preserved as-is.
        meta_factory[doc_i, doc_i] = meta_idx
    return self.add_metadata(meta_factory.get_csr_matrix(), meta_idx_store)
def _get_term_indices_to_compact_from_term_freqs(self, term_freqs, term_doc_matrix, non_text):
    '''Find indices of terms to remove: infrequent terms plus unigrams
    made redundant by a bigram that contains them.

    Parameters
    ----------
    term_freqs : pd.DataFrame
        Term-by-category frequency table; its index holds the term strings.
    term_doc_matrix : TermDocMatrix
        Matrix whose index store maps the surviving terms to ids.
    non_text : bool
        Passed to _get_relevant_idx_store to pick the term or metadata store.

    Returns
    -------
    np.array
        Indices (in term_doc_matrix's relevant index store) of terms to remove.
    '''
    idx = IndexStore()
    tdf_vals = term_freqs.values
    # Terms whose total frequency across categories meets the minimum count.
    valid_terms_mask = tdf_vals.sum(axis=1) >= self.minimum_term_count
    tdf_vals = term_freqs[valid_terms_mask].values
    terms = np.array(term_freqs.index)[valid_terms_mask]
    lengths = []
    fact = CSRMatrixFactory()
    # Build a term-by-token indicator matrix; lengths[i] is term i's token count.
    for i, t in enumerate(terms):
        for tok in t.split():
            fact[i, idx.getidx(tok)] = 1
        lengths.append(len(t.split()))
    lengths = np.array(lengths)
    mat = fact.get_csr_matrix()
    # (mat * mat.T)[i, j] counts tokens shared by terms i and j; an entry of
    # coocs hits zero where one term's tokens are fully shared with another's.
    coocs = lengths - (mat * mat.T)
    pairs = np.argwhere(coocs == 0).T
    # Narrow the candidate pairs via the helper filters: drop self-pairs,
    # keep only (bigram, constituent unigram) pairs, then keep only pairs
    # where the unigram is actually redundant given the frequencies.
    pairs = self._limit_to_non_identical_terms(pairs)
    pairs = self._limit_to_pairs_of_bigrams_and_a_constituent_unigram(pairs, terms)
    pairs = self._limit_to_redundant_unigrams(pairs, tdf_vals)
    idx_store = term_doc_matrix._get_relevant_idx_store(non_text)
    # Column 1 of each pair holds the redundant (contained) term.
    redundant_terms = idx_store.getidxstrictbatch(terms[np.unique(pairs[:, 1])])
    infrequent_terms = np.argwhere(~valid_terms_mask).T[0]
    terms_to_remove = np.concatenate([redundant_terms, infrequent_terms])
    return terms_to_remove
def use_external_metadata_lists(self, metadata_lists):
    '''
    Takes a list of string lists. Each inner list holds the metadata to
    associate with its corresponding document.

    :param metadata_lists: List[List[str]]
    :return: new TermDocMatrix
    '''
    # One metadata list per document is required.
    assert len(metadata_lists) == self.get_num_docs()
    metadata_index_store = IndexStore()
    metadata_csr_factory = CSRMatrixFactory()
    # (Removed leftover debug prints and commented-out exception-raising
    # scaffolding from the original implementation.)
    for doc_i, metadata_list in enumerate(metadata_lists):
        for metadatum in metadata_list:
            # Mark the (document, metadatum) cell as present.
            metadata_csr_factory[doc_i, metadata_index_store.getidx(metadatum)] = 1
    return self._make_new_term_doc_matrix(
        new_mX=metadata_csr_factory.get_csr_matrix(dtype=int),
        new_metadata_idx_store=metadata_index_store,
        # All-True mask: keep every document.
        new_y_mask=self._y == self._y)
def _get_features_and_labels_from_documents_and_indexes(self,
                                                        category_doc_iter,
                                                        category_idx_store,
                                                        term_idx_store,
                                                        metadata_idx_store):
    '''Register every (category, parsed document) pair and assemble the
    sparse matrices.

    Returns
    -------
    tuple
        (X term csr_matrix, mX metadata csr_matrix, y category-id array)
    '''
    labels = []
    X_factory = CSRMatrixFactory()
    mX_factory = CSRMatrixFactory()
    for doc_i, (category, parsed_text) in enumerate(category_doc_iter):
        # Each call records the doc's terms/metadata into the factories
        # and appends its category id to labels.
        self._register_doc_and_category(X_factory, mX_factory, category,
                                        category_idx_store, doc_i,
                                        parsed_text, term_idx_store,
                                        metadata_idx_store, labels)
    return (X_factory.get_csr_matrix(),
            mX_factory.get_csr_matrix(),
            np.array(labels))
def _get_term_indices_to_compact_from_term_freqs(self, term_freqs):
    '''Find indices of terms to remove: infrequent terms plus unigrams made
    redundant by a longer term that contains them with at least equal
    per-category frequencies.

    Parameters
    ----------
    term_freqs : pd.DataFrame
        Term-by-category frequency table; its index holds the term strings.

    Returns
    -------
    np.array
        Indices (in self.term_doc_matrix's term index store) of terms to remove.
    '''
    fact = CSRMatrixFactory()
    idx = IndexStore()
    tdf_vals = term_freqs.values
    # Terms whose total frequency across categories meets the minimum count.
    valid_terms_mask = tdf_vals.sum(axis=1) >= self.minimum_term_count
    tdf_vals = term_freqs[valid_terms_mask].values
    terms = np.array(term_freqs.index)[valid_terms_mask]
    lengths = []
    # Build a term-by-token indicator matrix; lengths[i] is term i's token count.
    for i, t in enumerate(terms):
        for tok in t.split():
            fact[i, idx.getidx(tok)] = 1
        lengths.append(len(t.split()))
    lengths = np.array(lengths)
    mat = fact.get_csr_matrix()
    # (mat * mat.T)[i, j] counts tokens shared by terms i and j; an entry of
    # coocs hits zero where one term's tokens are fully shared with another's.
    coocs = lengths - (mat * mat.T)
    pairs = np.argwhere(coocs == 0).T
    # Drop self-pairs, then require the second term to be a substring of the first.
    pairs = pairs.T[(pairs[0] != pairs[1])]
    pairs = pairs[np.array([terms[i[1]] in terms[i[0]] for i in pairs])]
    # The contained term is redundant only if it is never more frequent than
    # its container in any category.
    pairs = pairs[np.all(tdf_vals[pairs[:, 1]] <= tdf_vals[pairs[:, 0]], axis=1)]
    idx_store = self.term_doc_matrix._term_idx_store
    # Column 1 of each pair holds the redundant (contained) term.
    redundant_terms = idx_store.getidxstrictbatch(terms[np.unique(pairs[:, 1])])
    infrequent_terms = np.argwhere(~valid_terms_mask).T[0]
    terms_to_remove = np.concatenate([redundant_terms, infrequent_terms])
    return terms_to_remove
def test_main(self):
    # Two explicit entries should yield a dense 2x6 matrix with zeros elsewhere.
    factory = CSRMatrixFactory()
    factory[0, 0] = 4
    factory[1, 5] = 3
    matrix = factory.get_csr_matrix()
    self.assertEqual(type(matrix), csr_matrix)
    expected = np.array([[4, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, 3]])
    np.testing.assert_array_almost_equal(expected, matrix.todense())
def use_categories_as_metadata_and_replace_terms(self):
    '''
    Returns a TermDocMatrix which is identical to self except the metadata
    values are now identical to the categories present and the
    term-document matrix is now the metadata matrix.

    :return: TermDocMatrix
    '''
    meta_factory = CSRMatrixFactory()
    # One-hot encode each document's category.
    for doc_i, category_idx in enumerate(self.get_category_ids()):
        meta_factory[doc_i, category_idx] = 1
    category_metadata = meta_factory.get_csr_matrix()
    # Terms are replaced by the old metadata matrix/store; metadata becomes
    # the category one-hot matrix backed by a copy of the category store.
    return self._make_new_term_doc_matrix(self._mX,
                                          category_metadata,
                                          self._y,
                                          self._metadata_idx_store,
                                          self._category_idx_store,
                                          copy(self._category_idx_store),
                                          self._y == self._y)
def feats_from_doc(self, raw_text):
    '''
    Parameters
    ----------
    raw_text : str
        Uncleaned text for parsing out features.

    Returns
    -------
    csr_matrix
        Single-row feature matrix for the document.
    '''
    parsed = self._nlp(self._clean_function(raw_text))
    factory = CSRMatrixFactory()
    # Pad the matrix width to the full known vocabulary so the row aligns
    # with the training matrix's columns.
    factory.set_last_col_idx(self._term_idx_store.getnumvals() - 1)
    term_freq = self._get_features_from_parsed_text(parsed, self._term_idx_store)
    self._register_document_features_with_X_factory(factory, 0, term_freq)
    return factory.get_csr_matrix()
def test_typing(self):
    '''get_csr_matrix should honor the requested dtype.'''
    mat_factory = CSRMatrixFactory()
    mat_factory[0, 0] = 4
    mat_factory[1, 5] = 3.1
    # Default dtype preserves the float value.
    mat = mat_factory.get_csr_matrix()
    self.assertEqual(type(mat), csr_matrix)
    np.testing.assert_array_almost_equal(
        np.array([[4, 0, 0, 0, 0, 0],
                  [0, 0, 0, 0, 0, 3.1]]),
        mat.todense())
    # FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin bool is the supported spelling and behaves identically here.
    mat = mat_factory.get_csr_matrix(dtype=bool)
    self.assertEqual(type(mat), csr_matrix)
    np.testing.assert_array_almost_equal(
        np.array([[1, 0, 0, 0, 0, 0],
                  [0, 0, 0, 0, 0, 1]]),
        mat.todense())
    mat = mat_factory.get_csr_matrix(dtype=np.int32)
    self.assertEqual(type(mat), csr_matrix)
    np.testing.assert_array_almost_equal(
        np.array([[4, 0, 0, 0, 0, 0],
                  [0, 0, 0, 0, 0, 3]]),
        mat.todense())
def use_doc_labeled_terms_as_metadata(self, doc_labels, separator='_', replace_metadata=True):
    '''
    Makes the metadata of a new TermDocMatrix a copy of the term-document
    matrix, except each term is prefixed by its document's label followed
    by the separator.

    :param doc_labels: list[str], should be the same size as the number of
        documents in the TermDocMatrix.
    :param separator: str, default is '_'
    :param replace_metadata: bool, default True.
        NOTE(review): as originally written this flag had no effect (both
        branches selected the term matrix); behavior kept as-is — confirm
        whether False was meant to label the existing metadata matrix.
    :return: TermDocMatrix
    '''
    assert len(doc_labels) == self.get_num_docs()
    doc_labels = np.array(doc_labels)
    terms_in_corpus = np.array(self._term_idx_store.values())
    new_metadata_list = []
    new_meta_X = None
    ordered_doc_labels = list(sorted(set(doc_labels)))
    X = self._X
    if replace_metadata:
        X = self._X
    for doc_label in ordered_doc_labels:
        label_doc_mask = doc_labels == doc_label
        label_X = X[label_doc_mask, :]
        # BUG FIX: restrict to terms that appear in *this label's* documents.
        # The original computed the mask from the full matrix X, so every
        # label received the same term set (including terms absent from its
        # own documents), defeating the per-label restriction.
        label_term_mask = (label_X.sum(axis=0) > 0).A1
        label_X = label_X[:, label_term_mask]
        cols_to_pad = len(new_metadata_list)
        new_metadata_list += [doc_label + separator + term
                              for term in terms_in_corpus[label_term_mask]]
        if new_meta_X is None:
            new_meta_X = label_X
        else:
            # Left-pad with empty columns so this label's terms occupy
            # fresh columns after those of earlier labels.
            label_X_pad = (CSRMatrixFactory()
                           .set_last_col_idx(cols_to_pad - 1)
                           .set_last_row_idx(sum(label_doc_mask) - 1)
                           .get_csr_matrix())
            padded_label_X = scipy.sparse.hstack([label_X_pad, label_X])
            new_meta_X.resize(new_meta_X.shape[0], padded_label_X.shape[1])
            new_meta_X = scipy.sparse.vstack([new_meta_X, padded_label_X])
    new_metadata_idx_store = IndexStoreFromList.build(new_metadata_list)
    new_meta_X = new_meta_X.tocsr()
    # Scatter the label-grouped rows back into original document order.
    new_mX = (CSRMatrixFactory()
              .set_last_col_idx(new_meta_X.shape[1] - 1)
              .set_last_row_idx(new_meta_X.shape[0] - 1)
              .get_csr_matrix().tolil())
    start_row = 0
    for doc_label in ordered_doc_labels:
        label_doc_mask = doc_labels == doc_label
        num_rows = sum(label_doc_mask)
        new_mX[label_doc_mask, :] = new_meta_X[start_row:start_row + num_rows, :]
        start_row += num_rows
    new_mX = new_mX.tocsr()
    new_tdm = self._make_new_term_doc_matrix(self._X, new_mX, self._y,
                                             self._term_idx_store,
                                             self._category_idx_store,
                                             new_metadata_idx_store,
                                             self._y == self._y)
    return new_tdm