def test_update_counts():
    """Properties pushed via ``update_document_index_properties`` are added as
    new columns on first update and overwritten (not accumulated) thereafter.

    Fix: the original test ended with a third, duplicated update call that
    asserted nothing — a copy-paste leftover. The call is kept but now pins
    that re-applying the same property bag is idempotent.
    """
    index = load_document_index(filename=StringIO(TEST_FAULTY_DOCUMENT_INDEX), sep=';')

    statistics = {'extra_1': 1, 'extra_2': 2}
    assert 'extra_1' not in index.columns

    update_document_index_properties(index, document_name='tran_2020_01_test', property_bag=statistics)

    # First update: columns are created and hold the supplied values.
    assert 'extra_1' in index.columns
    assert 'extra_2' in index.columns
    assert int(index.loc['tran_2020_01_test'].extra_1) == 1
    assert int(index.loc['tran_2020_01_test'].extra_2) == 2
    assert index.extra_1.sum() == 1
    assert index.extra_2.sum() == 2

    # Second update: existing values are replaced, not summed.
    statistics = {'extra_1': 10, 'extra_2': 22}
    update_document_index_properties(index, document_name='tran_2020_01_test', property_bag=statistics)
    assert int(index.loc['tran_2020_01_test'].extra_1) == 10
    assert int(index.loc['tran_2020_01_test'].extra_2) == 22
    assert index.extra_1.sum() == 10
    assert index.extra_2.sum() == 22

    # Third update with identical values: must be a no-op (idempotent).
    statistics = {'extra_1': 10, 'extra_2': 22}
    update_document_index_properties(index, document_name='tran_2020_01_test', property_bag=statistics)
    assert index.extra_1.sum() == 10
    assert index.extra_2.sum() == 22
def test_load_primary_document_index():
    """The primary five-document index loads from disk with its id columns intact."""
    source: str = './tests/test_data/legal_instrument_five_docs_test.csv'
    df = load_document_index(source, sep=';')
    assert df is not None
    for expected_column in ('unesco_id', 'document_id'):
        assert expected_column in df.columns
def test_group_by_category():
    """Grouping on the 'year' column yields one row per year with matching category."""
    index: pd.DataFrame = load_document_index(filename=StringIO(TEST_DOCUMENT_INDEX), sep=';')
    helper = DocumentIndexHelper(index)
    grouped = helper.group_by_column(pivot_column_name='year', transformer=None, index_values=None)
    result: pd.DataFrame = grouped.document_index
    assert result.category.tolist() == [2019, 2020]
    assert result.year.tolist() == [2019, 2020]
def simple_index() -> pd.DataFrame:
    """Build a small five-document index: three documents in 2019, two in 2020."""
    csv_text = """
;filename;year;document_name;document_id
a;a.txt;2019;a;0
b;b.txt;2019;b;1
c;c.txt;2019;c;2
d;d.txt;2020;d;3
e;e.txt;2020;e;4
"""
    return load_document_index(filename=StringIO(csv_text), sep=';')
def load(
    *,
    folder: str,
    filename_fields: pu.FilenameFieldSpecs = None,
    slim: bool = False,
    verbose: bool = False,
):
    """Loads previously stored aggregate

    Reads an `InferredTopicsData` aggregate from *folder*, preferring feather
    files when present and falling back to zipped CSV via `smart_load` /
    `pc.load_document_index`.

    Args:
        folder: directory holding the stored aggregate files.
        filename_fields: field specs forwarded to `pc.load_document_index`
            when the CSV fallback path is taken.
        slim: when True, calls `data.slimmer()` after loading.
        verbose: when True, logs memory usage via `data.log_usage`.
    """
    # NOTE(review): when topic_token_weights.zip is absent this returns the
    # result of PickleUtility.explode() immediately and never builds the
    # InferredTopicsData below — confirm this early-return is intended.
    if not isfile(jj(folder, "topic_token_weights.zip")):
        return PickleUtility.explode(source=folder, target_folder=folder)
    # Prefer the feather copy of the document index (faster); otherwise load
    # the zipped CSV and index it by document_id.
    document_index: pd.DataFrame = (
        pd.read_feather(jj(folder, "documents.feather")).rename_axis('document_id')
        if isfile(jj(folder, "documents.feather"))
        else pc.load_document_index(
            jj(folder, 'documents.zip'), filename_fields=filename_fields, **CSV_OPTS
        ).set_index('document_id', drop=True)
    )
    data: InferredTopicsData = InferredTopicsData(
        dictionary=smart_load(jj(folder, 'dictionary.zip'), feather_pipe=pu.set_index, columns='token_id'),
        document_index=document_index,
        topic_token_weights=smart_load(jj(folder, 'topic_token_weights.zip')),
        document_topic_weights=smart_load(jj(folder, 'document_topic_weights.zip')),
        topic_token_overview=smart_load(
            jj(folder, 'topic_token_overview.zip'), feather_pipe=pu.set_index, columns='topic_id'
        ),
        # Diagnostics files are optional (missing_ok=True): older aggregates
        # may not include them.
        topic_diagnostics=smart_load(
            jj(folder, 'topic_diagnostics.zip'), missing_ok=True, feather_pipe=pu.set_index, columns='topic_id'
        ),
        token_diagnostics=smart_load(jj(folder, 'token_diagnostics.zip'), missing_ok=True),
    )

    # HACK: Handle renamed column:
    data.document_index = fix_renamed_columns(data.document_index)

    assert "year" in data.document_index.columns

    data.topic_token_overview = data.load_topic_labels(folder, **CSV_OPTS)

    data.slim_types()
    if slim:
        data.slimmer()
    if verbose:
        data.log_usage(total=True)

    return data
def _document_index(self) -> Optional[DocumentIndex]:
    """Read the document index entry from the archive; None when no such entry exists."""
    if self.document_index_name not in self.namelist():
        return None
    raw_csv = zip_utils.read_file_content(
        zip_or_filename=self, filename=self.document_index_name, as_binary=False
    )
    return load_document_index(StringIO(raw_csv), sep=self.checkpoint_opts.document_index_sep)
def test_load():
    """Both CSV variants and both string loaders produce the same five-row index."""
    expected_columns = [
        'filename',
        'year',
        'year_id',
        'document_name',
        'document_id',
        'title',
        'n_raw_tokens',
    ]

    index = load_document_index(filename=StringIO(TEST_FAULTY_DOCUMENT_INDEX), sep=';')
    index2 = load_document_index(filename=StringIO(TEST_DOCUMENT_INDEX), sep=';')

    # The faulty and the clean CSV must normalise to identical frames.
    for frame in (index, index2):
        assert isinstance(frame, pd.DataFrame)
        assert len(frame) == 5
        assert frame.columns.tolist() == expected_columns
        assert frame.document_id.tolist() == [0, 1, 2, 3, 4]
        assert frame.index.name == ''

    assert ((index == index2).all()).all()  # pylint: disable=no-member

    # The from-string loaders must agree with the StringIO-based loader.
    index3 = load_document_index_from_str(TEST_DOCUMENT_INDEX, sep=';')
    assert ((index == index3).all()).all()  # pylint: disable=no-member

    index4 = load_document_index_from_str(TEST_DOCUMENT_INDEX2, sep=';')
    assert ((index == index4).all()).all()  # pylint: disable=no-member
def test_group_by_time_period_aggregates_n_documents():
    """Grouping by year and then by decade aggregates `n_documents` correctly.

    Fix: the original asserted each `time_period` list twice in a row
    (verbatim copy-paste duplicates); the redundant assertions are removed.
    """
    index: pd.DataFrame = load_document_index(filename=StringIO(TEST_DOCUMENT_INDEX3), sep=';')

    yearly_document_index, _ = DocumentIndexHelper(index).group_by_time_period(
        time_period_specifier='year', source_column_name='year'
    )
    assert yearly_document_index.time_period.tolist() == [2009, 2019, 2024, 2029]
    assert yearly_document_index.n_documents.tolist() == [2, 3, 2, 2]

    # Regroup the yearly result into decades; counts must aggregate.
    decade_document_index, _ = DocumentIndexHelper(yearly_document_index).group_by_time_period(
        time_period_specifier='decade', source_column_name='time_period'
    )
    assert decade_document_index.time_period.tolist() == [2000, 2010, 2020]
    assert decade_document_index.n_documents.tolist() == [2, 3, 4]
def load_test_index(data_str: str) -> DocumentIndexHelper:
    """Parse a semicolon-separated CSV string and wrap it in a DocumentIndexHelper."""
    frame = load_document_index(filename=StringIO(data_str), sep=';')
    return DocumentIndexHelper(frame)