Ejemplo n.º 1
0
def test_update_counts():
    """Check that `update_document_index_properties` adds and overwrites values.

    A property bag is applied three times to the same document of the index
    parsed from `TEST_FAULTY_DOCUMENT_INDEX`: once to create the property
    columns, once to overwrite them with new values, and once more with
    identical values to verify that re-applying a bag leaves the index
    unchanged.
    """
    document_name = 'tran_2020_01_test'
    index = load_document_index(filename=StringIO(TEST_FAULTY_DOCUMENT_INDEX),
                                sep=';')

    # The property columns must not exist before the first update.
    assert 'extra_1' not in index.columns

    statistics = {'extra_1': 1, 'extra_2': 2}
    update_document_index_properties(index,
                                     document_name=document_name,
                                     property_bag=statistics)

    assert 'extra_1' in index.columns
    assert 'extra_2' in index.columns

    assert int(index.loc[document_name].extra_1) == 1
    assert int(index.loc[document_name].extra_2) == 2

    # The column totals equal the updated document's values, i.e. the other
    # rows contribute nothing to the sum.
    assert index.extra_1.sum() == 1
    assert index.extra_2.sum() == 2

    # A second update with different values must replace the previous ones.
    statistics = {'extra_1': 10, 'extra_2': 22}
    update_document_index_properties(index,
                                     document_name=document_name,
                                     property_bag=statistics)

    assert int(index.loc[document_name].extra_1) == 10
    assert int(index.loc[document_name].extra_2) == 22

    assert index.extra_1.sum() == 10
    assert index.extra_2.sum() == 22

    # Re-applying the very same bag must leave the stored values untouched.
    statistics = {'extra_1': 10, 'extra_2': 22}
    update_document_index_properties(index,
                                     document_name=document_name,
                                     property_bag=statistics)

    assert int(index.loc[document_name].extra_1) == 10
    assert int(index.loc[document_name].extra_2) == 22

    assert index.extra_1.sum() == 10
    assert index.extra_2.sum() == 22
Ejemplo n.º 2
0
def test_load_primary_document_index():
    """Load the CSV fixture from disk and check that the id columns exist."""
    document_index = load_document_index(
        './tests/test_data/legal_instrument_five_docs_test.csv', sep=';')

    assert document_index is not None
    for expected_column in ('unesco_id', 'document_id'):
        assert expected_column in document_index.columns
Ejemplo n.º 3
0
def test_group_by_category():
    """Group the test index by its 'year' column and check the result.

    Both the `category` and the `year` column of the grouped index are
    expected to list the two years 2019 and 2020.
    """
    document_index: pd.DataFrame = load_document_index(
        filename=StringIO(TEST_DOCUMENT_INDEX), sep=';')

    helper = DocumentIndexHelper(document_index)
    grouped = helper.group_by_column(pivot_column_name='year',
                                     transformer=None,
                                     index_values=None)
    result: pd.DataFrame = grouped.document_index

    expected_years = [2019, 2020]
    assert result.category.tolist() == expected_years
    assert result.year.tolist() == expected_years
Ejemplo n.º 4
0
def simple_index() -> pd.DataFrame:
    """Return a document index parsed from an inline ';'-separated table.

    The table holds five documents (a-e): three dated 2019 and two dated 2020.
    """
    csv_text = """
;filename;year;document_name;document_id
a;a.txt;2019;a;0
b;b.txt;2019;b;1
c;c.txt;2019;c;2
d;d.txt;2020;d;3
e;e.txt;2020;e;4
"""
    return load_document_index(filename=StringIO(csv_text), sep=';')
Ejemplo n.º 5
0
    def load(
        *,
        folder: str,
        filename_fields: pu.FilenameFieldSpecs = None,
        slim: bool = False,
        verbose: bool = False,
    ):
        """Load a previously stored aggregate from `folder`.

        When `topic_token_weights.zip` is not present in `folder`, loading is
        delegated to `PickleUtility.explode` and its result is returned.

        Args:
            folder: Directory that holds the stored files.
            filename_fields: Forwarded to `pc.load_document_index`; only used
                when the document index is read from `documents.zip`.
            slim: If true, `slimmer()` is called on the loaded data.
            verbose: If true, `log_usage(total=True)` is called on the
                loaded data.

        Returns:
            The populated `InferredTopicsData`, or whatever
            `PickleUtility.explode` returns on the fallback path.
        """
        if not isfile(jj(folder, "topic_token_weights.zip")):
            return PickleUtility.explode(source=folder, target_folder=folder)

        # The document index is read from the feather file when one exists,
        # otherwise from the zipped CSV.
        document_index: pd.DataFrame
        feather_path = jj(folder, "documents.feather")
        if isfile(feather_path):
            document_index = pd.read_feather(feather_path).rename_axis(
                'document_id')
        else:
            document_index = pc.load_document_index(
                jj(folder, 'documents.zip'),
                filename_fields=filename_fields,
                **CSV_OPTS,
            ).set_index('document_id', drop=True)

        # Load each stored table in turn; the two diagnostics files are
        # loaded with `missing_ok=True`.
        dictionary = smart_load(jj(folder, 'dictionary.zip'),
                                feather_pipe=pu.set_index,
                                columns='token_id')
        topic_token_weights = smart_load(jj(folder, 'topic_token_weights.zip'))
        document_topic_weights = smart_load(
            jj(folder, 'document_topic_weights.zip'))
        topic_token_overview = smart_load(
            jj(folder, 'topic_token_overview.zip'),
            feather_pipe=pu.set_index,
            columns='topic_id')
        topic_diagnostics = smart_load(jj(folder, 'topic_diagnostics.zip'),
                                       missing_ok=True,
                                       feather_pipe=pu.set_index,
                                       columns='topic_id')
        token_diagnostics = smart_load(jj(folder, 'token_diagnostics.zip'),
                                       missing_ok=True)

        inferred: InferredTopicsData = InferredTopicsData(
            dictionary=dictionary,
            document_index=document_index,
            topic_token_weights=topic_token_weights,
            document_topic_weights=document_topic_weights,
            topic_token_overview=topic_token_overview,
            topic_diagnostics=topic_diagnostics,
            token_diagnostics=token_diagnostics,
        )

        # HACK: Handle renamed column:
        inferred.document_index = fix_renamed_columns(inferred.document_index)
        assert "year" in inferred.document_index.columns

        # The overview loaded above is replaced by the result of
        # `load_topic_labels`.
        inferred.topic_token_overview = inferred.load_topic_labels(
            folder, **CSV_OPTS)

        inferred.slim_types()
        if slim:
            inferred.slimmer()
        if verbose:
            inferred.log_usage(total=True)
        return inferred
Ejemplo n.º 6
0
    def _document_index(self) -> Optional[DocumentIndex]:
        """Return the document index stored in the archive, or None if absent.

        The entry named `self.document_index_name` is read as text and parsed
        with the separator configured in `self.checkpoint_opts`.
        """
        if self.document_index_name in self.namelist():
            index_text = zip_utils.read_file_content(
                zip_or_filename=self,
                filename=self.document_index_name,
                as_binary=False,
            )
            return load_document_index(
                StringIO(index_text),
                sep=self.checkpoint_opts.document_index_sep,
            )
        return None
Ejemplo n.º 7
0
def test_load():
    """Check that the test index variants all load to the same data frame.

    The faulty and the regular index text are loaded from string buffers and
    validated for type, length, columns, document ids and index name. The
    regular index and `TEST_DOCUMENT_INDEX2` are then loaded with
    `load_document_index_from_str` and compared against the first frame.
    """
    expected_columns = [
        'filename',
        'year',
        'year_id',
        'document_name',
        'document_id',
        'title',
        'n_raw_tokens',
    ]

    def assert_well_formed(document_index):
        # Shared structural checks for a freshly loaded index.
        assert isinstance(document_index, pd.DataFrame)
        assert len(document_index) == 5
        assert document_index.columns.tolist() == expected_columns
        assert document_index.document_id.tolist() == [0, 1, 2, 3, 4]
        assert document_index.index.name == ''

    index = load_document_index(filename=StringIO(TEST_FAULTY_DOCUMENT_INDEX),
                                sep=';')
    assert_well_formed(index)

    index2 = load_document_index(filename=StringIO(TEST_DOCUMENT_INDEX),
                                 sep=';')
    assert_well_formed(index2)
    assert ((index == index2).all()).all()  # pylint: disable=no-member

    # Loading straight from a string must yield the same frame as well.
    for index_text in (TEST_DOCUMENT_INDEX, TEST_DOCUMENT_INDEX2):
        from_str = load_document_index_from_str(index_text, sep=';')
        assert ((index == from_str).all()).all()  # pylint: disable=no-member
Ejemplo n.º 8
0
def test_group_by_time_period_aggregates_n_documents():
    """Check that grouping by time period aggregates `n_documents`.

    The index is grouped by year first. The yearly result is then grouped
    again by decade, where the counts of the two years falling in the 2020s
    (2 + 2) are expected to add up to 4.
    """
    index: pd.DataFrame = load_document_index(
        filename=StringIO(TEST_DOCUMENT_INDEX3), sep=';')
    yearly_document_index, _ = DocumentIndexHelper(index).group_by_time_period(
        time_period_specifier='year', source_column_name='year')

    assert yearly_document_index.time_period.tolist() == [
        2009, 2019, 2024, 2029
    ]
    assert yearly_document_index.n_documents.tolist() == [2, 3, 2, 2]

    # Re-group the already aggregated yearly index, using its `time_period`
    # column as the source of the decade.
    decade_document_index, _ = DocumentIndexHelper(
        yearly_document_index).group_by_time_period(
            time_period_specifier='decade', source_column_name='time_period')

    assert decade_document_index.time_period.tolist() == [2000, 2010, 2020]
    assert decade_document_index.n_documents.tolist() == [2, 3, 4]
Ejemplo n.º 9
0
def load_test_index(data_str: str) -> DocumentIndexHelper:
    """Parse ';'-separated index text and wrap it in a `DocumentIndexHelper`."""
    document_index = load_document_index(filename=StringIO(data_str), sep=';')
    return DocumentIndexHelper(document_index)