Ejemplo n.º 1
0
def test_tagged_tokens_filter_apply_when_list_attribute_succeeds():
    """A scalar or list value for `pos` keeps only rows whose `pos` matches."""
    frame = pd.DataFrame(data=dict(text=['a', 'b', 'c'], pos=['X', 'X', 'Y']))

    filtered = PropertyValueMaskingOpts(pos='X').apply(frame)
    assert len(filtered) == 2
    assert filtered['text'].to_list() == ['a', 'b']

    filtered = PropertyValueMaskingOpts(pos=['X', 'Y']).apply(frame)
    assert len(filtered) == 3
    assert filtered['text'].to_list() == ['a', 'b', 'c']
Ejemplo n.º 2
0
def test_tagged_tokens_filter_mask_when_boolean_attribute_succeeds():
    """Boolean opts mask matching rows; `None` disables filtering for a key."""
    frame = pd.DataFrame(data=dict(
        text=['a', 'b', 'c', 'd'],
        is_stop=[True, False, True, np.nan],
        is_punct=[False, False, True, False],
    ))

    def surviving_texts(**opts):
        """Build a mask from `opts`, apply it, and return the remaining texts."""
        selected = frame[PropertyValueMaskingOpts(**opts).mask(frame)]
        return selected['text'].to_list()

    texts = surviving_texts(is_stop=True)
    assert len(texts) == 2
    assert texts == ['a', 'c']

    # None means "do not filter on this attribute at all".
    texts = surviving_texts(is_stop=None)
    assert len(texts) == 4
    assert texts == ['a', 'b', 'c', 'd']

    # Multiple keys combine conjunctively.
    texts = surviving_texts(is_stop=True, is_punct=True)
    assert len(texts) == 1
    assert texts == ['c']

    texts = surviving_texts(is_stop=True, is_punct=False)
    assert len(texts) == 1
    assert texts == ['a']

    texts = surviving_texts(is_stop=False)
    assert len(texts) == 1
    assert texts == ['b']

    # A single-element list behaves like the scalar value.
    texts = surviving_texts(is_stop=[False])
    assert len(texts) == 1
    assert texts == ['b']
Ejemplo n.º 3
0
def test_tagged_tokens_filter_apply_unknown_attribute_is_ignored():
    """Opts naming a column absent from the frame leave the frame untouched."""
    frame = pd.DataFrame(data=dict(text=['a', 'b', 'c'], pos=['X', 'X', 'Y']))

    result = PropertyValueMaskingOpts(kallekula='kurt').apply(frame)

    assert len(result) == 3
    assert result['text'].to_list() == ['a', 'b', 'c']
Ejemplo n.º 4
0
def test_tagged_tokens_filter_apply_when_boolean_attribute_succeeds():
    """True/False keep matching rows; None disables filtering on that key."""
    frame = pd.DataFrame(
        data=dict(text=['a', 'b', 'c'], is_stop=[True, False, True]))

    expectations = [
        (True, ['a', 'c']),
        (None, ['a', 'b', 'c']),
        (False, ['b']),
    ]
    for value, expected in expectations:
        result = PropertyValueMaskingOpts(is_stop=value).apply(frame)
        assert len(result) == len(expected)
        assert result['text'].to_list() == expected
def test_mask_punct_space_when_no_space():
    """Masking on `is_punct` works for a frame round-tripped through TSV."""
    # data = doc.head().to_csv(sep='\t',header=True))
    data = """
\ttext\tlemma_\tpos_\tis_space\tis_punct
0\tConstitution\tconstitution\tNOUN\tFalse\tFalse
1\tof\tof\tADP\tFalse\tFalse
2\tthe\tthe\tDET\tFalse\tFalse
3\tUnited\tUnited\tPROPN\tFalse\tFalse
4\tNations\tNations\tPROPN\tFalse\tFalse
"""
    frame: pd.DataFrame = pd.read_csv(io.StringIO(data), sep='\t', index_col=0)

    punct_mask = PropertyValueMaskingOpts(is_punct=False).mask(frame)

    assert punct_mask is not None
Ejemplo n.º 6
0
def test_tagged_tokens_filter_apply_when_unary_sign_operator_attribute_succeeds(
):
    """A `(sign, values)` tuple includes (True) or excludes (False) matches."""
    frame = pd.DataFrame(data=dict(text=['a', 'b', 'c'], pos=['X', 'X', 'Y']))

    cases = [
        ((True, ['X']), ['a', 'b']),
        ((False, ['X']), ['c']),
        ((False, ['Y']), ['a', 'b']),
        ((False, ['X', 'Y']), []),
    ]
    for spec, expected in cases:
        result = PropertyValueMaskingOpts(pos=spec).apply(frame)
        assert len(result) == len(expected)
        assert result['text'].to_list() == expected

    # A sign that is neither True nor False is rejected.
    with pytest.raises(ValueError):
        PropertyValueMaskingOpts(pos=(None, ['X', 'Y'])).apply(frame)

    # Scalar payloads are accepted; a non-matching scalar keeps nothing.
    assert len(PropertyValueMaskingOpts(pos=(True, 'X')).apply(frame)) == 2
    assert len(PropertyValueMaskingOpts(pos=(True, 0)).apply(frame)) == 0
Ejemplo n.º 7
0
def test_hot_attributes():
    """`hot_attributes` yields only opts keys that are actual frame columns."""
    frame = pd.DataFrame(data=dict(text=['a', 'b', 'c'],
                                   pos=['X', 'X', 'Y'],
                                   lemma=['a', 'b', 'c'],
                                   is_stop=[True, False, True]))

    def hot_count(**opts):
        """Number of opts attributes that match a column of `frame`."""
        return len(PropertyValueMaskingOpts(**opts).hot_attributes(frame))

    assert hot_count(pos=(True, 1)) == 1
    assert hot_count(pos='A', lemma='a') == 2
    # An underscore-prefixed key does not add to the count.
    assert hot_count(pos='A', lemma='a', _lemma='c') == 2
    assert hot_count() == 0
    assert hot_count(kalle=1, kula=2, kurt=2) == 0
def test_mask_when_empty_document_succeeds():
    """Masking an empty frame yields a mask that selects nothing."""
    empty_doc = pd.DataFrame(
        index=[],
        columns=['text', 'lemma_', 'pos_', 'is_punct'],
        data=[],
    )

    assert PropertyValueMaskingOpts(is_punct=False).mask(empty_doc).sum() == 0
Ejemplo n.º 9
0
 def extract(self, indices: Sequence[int], filter_opts: pu.PropertyValueMaskingOpts = None) -> pd.DataFrame:
     """Compile tabular data for `indices`, optionally filtered by `filter_opts`."""
     compiled: pd.DataFrame = self.tabular_compiler.compile(
         corpus=self.transformed_corpus,
         temporal_key=self.compute_opts.temporal_key,
         pivot_keys_id_names=self.compute_opts.pivot_keys_id_names,
         indices=indices,
     )
     # No (or empty) filter: return the compiled frame unchanged.
     if not filter_opts or len(filter_opts) == 0:
         return compiled
     return compiled[filter_opts.mask(compiled)]
def test_mask_punct_space_when_no_space_or_punct():
    """An opts key missing from the frame (`is_stop`) is ignored by `mask`."""
    rows = [
        ['Mamma', 'mamma', 'NN', False, False],
        ['pappa', 'pappa', 'NN', False, False],
        ['varför', 'varför', 'HA', False, False],
        ['är', 'vara', 'VB', False, False],
        ['det', 'den', 'PN', False, False],
        ['så', 'så', 'AB', False, False],
        ['kallt', 'kall', 'JJ', False, False],
        ['?', '?', 'MAD', False, True],
    ]
    frame = pd.DataFrame(
        index=[0, 1, 2, 3, 4, 5, 6, 7],
        columns=['text', 'lemma_', 'pos_', 'is_space', 'is_punct'],
        data=rows,
    )

    # Only is_punct applies (7 non-punctuation rows); is_stop has no column.
    opts = PropertyValueMaskingOpts(is_punct=False, is_stop=True)
    assert opts.mask(frame).sum() == 7
Ejemplo n.º 11
0
    def group_by_pivot_keys(  # pylint: disable=too-many-arguments
        self: IVectorizedCorpusProtocol | GroupByMixIn,
        temporal_key: Literal['year', 'decade', 'lustrum'],
        pivot_keys: List[str],
        filter_opts: pu.PropertyValueMaskingOpts,
        document_namer: Callable[[pd.DataFrame], pd.Series],
        aggregate: str = 'sum',
        fill_gaps: bool = False,
        drop_group_ids: bool = True,
        dtype: np.dtype = None,
    ):
        """Groups corpus by a temporal key and zero to many pivot keys.

        Args:
            self (IVectorizedCorpusProtocol): corpus-like object exposing `document_index`
                and `group_by_indices_mapping`.
            temporal_key (Literal['year', 'decade', 'lustrum']): Temporal grouping key value (year, lustrum, decade)
            pivot_keys (List[str]): Grouping key value, must be discrete categorical values.
            filter_opts (PropertyValueMaskingOpts): Filters that should be applied to documents index.
            document_namer (Callable[[pd.DataFrame], pd.Series]): Function that computes a document name for each result group.
                If None, a default namer joining the grouping key values is used.
            aggregate (str, optional): Aggregate function for DTM and document index (n_tokens). Defaults to 'sum'.
            fill_gaps (bool, optional): If True, add dummy rows for missing temporal key values. Defaults to False.
            drop_group_ids (bool, optional): If True, drop the per-group `document_ids` list column from the result. Defaults to True.
            dtype (np.dtype, optional): Value type of target DTM matrix. Defaults to None.
        """
        def default_document_namer(df: pd.DataFrame) -> pd.Series:
            """Default name that just joins the grouping key values to a single string"""
            return df[[temporal_key] + pivot_keys].apply(
                lambda x: '_'.join([str(t) for t in x]), axis=1)

        def _document_index_aggregates(df: pd.DataFrame,
                                       grouping_keys: List[str]) -> dict:
            """Creates an aggregate dict to be used in groupby."""
            """Add for group's document ids"""
            aggs: dict = dict(document_ids=('document_id', list))
            """Sum up all available count columns"""
            for count_column in {'n_tokens', 'n_raw_tokens',
                                 'tokens'}.intersection(set(df.columns)):
                aggs.update({count_column: (count_column, 'sum')})
            """Set year to min year for each group"""
            if 'year' in df.columns and 'year' not in grouping_keys:
                aggs.update(year=(
                    'year',
                    min))  # , year_from=('year', min), year_to=('year', max))
            """Add counter for number of documents in each group"""
            if 'n_documents' not in df.columns:
                aggs.update(n_documents=('document_id', 'nunique'))
            else:
                # Re-grouping an already grouped index: sum existing counters.
                aggs.update(n_documents=('n_documents', 'sum'))

            return aggs

        if document_namer is None:
            document_namer = default_document_namer

        di: pd.DataFrame = self.document_index
        # NOTE(review): filtering is skipped whenever pivot_keys is empty, even
        # if filter_opts is non-empty — confirm this is intended and not a bug
        # (filter_opts alone would suggest filtering regardless of pivot keys).
        fdi: pd.DataFrame = di if not pivot_keys or len(
            filter_opts or []) == 0 else di[filter_opts.mask(di)]

        # Derive the temporal column from 'year' when it is not already present.
        # NOTE(review): when no filtering occurred, fdi is `di` itself, so this
        # writes into self.document_index — confirm the mutation is acceptable.
        if temporal_key not in fdi.columns:
            fdi[temporal_key] = fdi['year'].apply(
                create_time_period_categorizer(temporal_key))

        aggs: dict = _document_index_aggregates(fdi,
                                                [temporal_key] + pivot_keys)

        gdi: pd.DataFrame = fdi.groupby([temporal_key] + pivot_keys,
                                        as_index=False).agg(**aggs)
        gdi['document_name'] = document_namer(gdi)
        gdi['filename'] = gdi.document_name

        if fill_gaps:
            """Add a dummy document for each missing temporal key value"""
            gdi = fill_temporal_gaps_in_group_document_index(
                gdi, temporal_key, pivot_keys, aggs)

        # Re-number documents: each group becomes one "document".
        gdi['document_id'] = gdi.index.astype(np.int32)

        # Downcast counter and year columns to compact integer types.
        gdi = pu.as_slim_types(
            gdi, {'n_documents', 'n_tokens', 'n_raw_tokens', 'tokens'},
            np.int32)
        gdi = pu.as_slim_types(gdi, {'year', temporal_key}, np.int16)
        """Set a fixed name for temporal key as well"""
        gdi['time_period'] = gdi[temporal_key]

        # Map each new group id to the original document ids it contains.
        category_indices: Mapping[int,
                                  List[int]] = gdi['document_ids'].to_dict()

        if drop_group_ids:
            gdi.drop(columns='document_ids', inplace=True, errors='ignore')

        return self.group_by_indices_mapping(
            document_index=gdi,
            category_indices=category_indices,
            aggregate=aggregate,
            dtype=dtype,
        )
Ejemplo n.º 12
0
 def __post_init__(self):
     """Normalize `filter_opts` to a PropertyValueMaskingOpts instance.

     A dict is expanded into keyword arguments; a falsy value becomes an
     empty opts object; an existing opts object is kept as-is.
     """
     if isinstance(self.filter_opts, dict):
         self.filter_opts = PropertyValueMaskingOpts(**self.filter_opts)
     else:
         self.filter_opts = self.filter_opts or PropertyValueMaskingOpts()
Ejemplo n.º 13
0
def test_tagged_tokens_filter_props_is_as_expected():
    """`props` reflects every attribute assigned after construction."""
    opts = PropertyValueMaskingOpts()
    opts.is_stop = 1
    opts.pos_includes = ['NOUN', 'VERB']
    assert opts.props == dict(is_stop=1, pos_includes=['NOUN', 'VERB'])
Ejemplo n.º 14
0
def test_tagged_tokens_filter_opts_get_of_unknown_field_succeeds():
    """Reading a never-assigned attribute returns None instead of raising."""
    opts = PropertyValueMaskingOpts()
    assert opts.is_stop is None
Ejemplo n.º 15
0
def test_tagged_tokens_filter_opts_set_of_new_field_succeeds():
    """Assigning a brand-new attribute stores and returns its value."""
    opts = PropertyValueMaskingOpts()
    opts.is_stop = 1
    assert opts.is_stop == 1