Example #1
0
def map_irregular_forms(word_str_ser, can_replace_mask):
    """Map known irregular word forms straight to their stems.

    Each listed form is replaced by its stem wholesale, and every word
    matched here is removed from ``can_replace_mask`` so no later
    stemming rule touches it.
    """
    irregular_forms = {
        "sky": ["sky", "skies"],
        "die": ["dying"],
        "lie": ["lying"],
        "tie": ["tying"],
        "news": ["news"],
        "inning": ["innings", "inning"],
        "outing": ["outings", "outing"],
        "canning": ["cannings", "canning"],
        "howe": ["howe"],
        "proceed": ["proceed"],
        "exceed": ["exceed"],
        "succeed": ["succeed"],
    }
    for replacement, forms in irregular_forms.items():
        for form in forms:
            matches = word_str_ser == form
            apply_here = can_replace_mask & matches

            stem_ser = get_stem_series(word_str_ser, len(form), apply_here)
            replacement_ser = get_str_replacement_series(replacement,
                                                         apply_here)
            word_str_ser = stem_ser.str.cat(replacement_ser)

            # matched words are final: exclude them from all later rules
            can_replace_mask = can_replace_mask & cudf.logical_not(matches)

    return word_str_ser, can_replace_mask
Example #2
0
def apply_rule(word_str_ser, rule, w_in_c_flag):
    """Apply one suffix-removal rule to every still-active word.

    ``rule`` is a 3-tuple ``(suffix, replacement, condition)``: the
    suffix to strip, the string to put in its place, and a condition
    callable evaluated on the stem (or ``None`` for an unconditional
    rule).  ``w_in_c_flag`` marks which words are still eligible;
    any word whose suffix matches is marked ineligible afterwards,
    whether or not the condition let the replacement happen.
    """
    suffix, replacement, condition = rule

    if suffix != "*d":
        matches_suffix = ends_with_suffix(word_str_ser, suffix)
        stem_ser = replace_suffix(word_str_ser, suffix, "",
                                  matches_suffix & w_in_c_flag)

        # the rule's condition is checked against the stripped stem
        passes_condition = get_condition_flag(stem_ser, condition)

        word_str_ser = replace_suffix(
            word_str_ser, suffix, replacement,
            passes_condition & matches_suffix & w_in_c_flag)

        # a matched suffix blocks all later rules for that word
        w_in_c_flag = w_in_c_flag & cudf.logical_not(matches_suffix)
    else:
        # "*d" rule: word ends in a doubled consonant.  Matching NLTK,
        # the condition here is evaluated on word[:-1] (e.g.
        # lambda stem: intermediate_stem[-1] not in ('l', 's', 'z')),
        # not on the suffix-stripped stem.
        doubled = ends_with_double_constant(word_str_ser)
        trimmed = word_str_ser.str.slice(stop=-1)
        passes_condition = get_condition_flag(trimmed, condition)

        word_str_ser = replace_suffix(
            word_str_ser, suffix, replacement,
            doubled & passes_condition & w_in_c_flag)

        w_in_c_flag = w_in_c_flag & cudf.logical_not(doubled)

    return word_str_ser, w_in_c_flag
Example #3
0
def test_series_not(dtype):
    import pandas as pd

    # Random booleans, scaled into the target dtype for non-bool dtypes
    # so logical_not sees a realistic mix of zero / non-zero values.
    host_data = pd.Series(np.random.choice([True, False], 1000)).astype(dtype)
    if dtype is not np.bool_:
        host_data = host_data * (np.random.random(1000) * 100).astype(dtype)
    sr = Series(host_data)

    got = cudf.logical_not(sr).to_array()
    expect = np.logical_not(host_data)
    np.testing.assert_equal(got, expect)
    # the ~ operator must agree between cudf and the host backend
    np.testing.assert_equal((~sr).to_array(), ~host_data)
Example #4
0
def ends_cvc(string_ser, mode="NLTK_EXTENSIONS"):
    """Implements condition *o from the paper

    From the paper:

        *o  - the stem ends cvc, where the second c is not W, X or Y
              (e.g. -WIL, -HOP).

    Parameters
    ----------
    string_ser : string Series of stems to test
    mode : str
        Only "NLTK_EXTENSIONS" is implemented.

    Returns
    -------
    Boolean Series, True where the stem satisfies *o.

    Raises
    ------
    NotImplementedError
        For any mode other than "NLTK_EXTENSIONS".
    """
    if mode != "NLTK_EXTENSIONS":
        # BUGFIX: this used to be `assert NotImplementedError`, which is
        # always truthy — unsupported modes silently fell through and
        # returned None instead of raising.
        raise NotImplementedError(
            "ends_cvc mode {!r} is not implemented".format(mode))

    # rule_1, equivalent to the NLTK check:
    #   len(word) >= 3
    #   and is_consonant(word, len(word) - 3)
    #   and not is_consonant(word, len(word) - 2)
    #   and is_consonant(word, len(word) - 1)
    #   and word[-1] not in ("w", "x", "y")
    len_flag = len_gt_n(string_ser, 2)

    first_consonant = is_consonant(string_ser, -3)
    middle_vowel = is_vowel(string_ser, -2)
    last_consonant = is_consonant(string_ser, -1)

    # wrap str.get output in a Series so elementwise != works
    last_char_ser = cudf.Series(string_ser.str.get(-1))
    last_char_flag = ((last_char_ser != "w")
                      & (last_char_ser != "x")
                      & (last_char_ser != "y"))

    rule_1 = (len_flag
              & first_consonant
              & middle_vowel
              & last_consonant
              & last_char_flag)

    # rule_2 (NLTK extension): two-letter stems of the form
    # vowel + consonant also count, e.g. "ab"
    len_flag = len_eq_n(string_ser, 2)
    first_char = cudf.logical_not(is_consonant(string_ser, 0))
    second_char = is_consonant(string_ser, 1)
    rule_2 = len_flag & first_char & second_char

    return rule_1 | rule_2
Example #5
0
    def _step1a(self, word_str_ser, can_replace_mask=None):
        """Implements Step 1a from "An algorithm for suffix stripping"

        Rules applied (from the paper):

            SSES -> SS      caresses -> caress
            IES  -> I       ponies   -> poni, ties -> ti (original impl)
            SS   -> SS      caress   -> caress
            S    ->         cats     -> cat

        Under NLTK_EXTENSIONS, four-letter '-ies' words keep the 'ie'
        stem instead, so 'flies' -> 'fli' but 'ties' -> 'tie'.
        """
        can_replace_mask = build_can_replace_mask(len_mask=len(word_str_ser),
                                                  mask=can_replace_mask)

        if self.mode == "NLTK_EXTENSIONS":
            # equivalent to: word.endswith('ies') and len(word) == 4
            is_short_ies = (ends_with_suffix(word_str_ser, "ies")
                            & len_eq_n(word_str_ser, 4))

            word_str_ser = replace_suffix(word_str_ser, "ies", "ie",
                                          can_replace_mask & is_short_ies)

            # words handled here skip the generic rule list below
            can_replace_mask = can_replace_mask & cudf.logical_not(
                is_short_ies)

        rules = [
            ("sses", "ss", None),  # SSES -> SS
            ("ies", "i", None),  # IES  -> I
            ("ss", "ss", None),  # SS   -> SS
            ("s", "", None),  # S    ->
        ]
        return apply_rule_list(word_str_ser, rules, can_replace_mask)[0]
Example #6
0
    def _step5a(self, word_str_ser, can_replace_mask=None):
        """Implements Step 5a from "An algorithm for suffix stripping"

        Rules (from the paper):

            (m>1) E            ->       probate -> probat
                                        rate    -> rate
            (m=1 and not *o) E ->       cease   -> ceas

        Both rules strip the same '-e' suffix, and — unlike the EED/ED
        pair in step 1b — the second rule must still be tried when the
        first one's condition fails (otherwise it would be redundant).
        Martin's paper never states this inconsistency explicitly; it
        has to be inferred from the examples.  Because of it we cannot
        use apply_rule_list here: instead both conditions are computed
        on the same stem and OR-ed into a single replacement mask.

        Equivalent scalar logic:

            if word.endswith('e'):
                stem = self._replace_suffix(word, 'e', '')
                if self._measure(stem) > 1:
                    return stem                       # rule 1
                if self._measure(stem) == 1 and not self._ends_cvc(stem):
                    return stem                       # rule 2
        """
        can_replace_mask = build_can_replace_mask(len_mask=len(word_str_ser),
                                                  mask=can_replace_mask)

        ends_in_e = ends_with_suffix(word_str_ser, "e")
        stem = replace_suffix(word_str_ser, "e", "",
                              ends_in_e & can_replace_mask)

        # rule 1: measure(stem) > 1
        rule_1_flag = measure_gt_n(stem, 1)

        # rule 2: measure(stem) == 1 and not ends_cvc(stem)
        rule_2_flag = (measure_eq_n(stem, 1)
                       & cudf.logical_not(ends_cvc(stem)))

        drop_e_mask = ((rule_1_flag | rule_2_flag) & ends_in_e
                       & can_replace_mask)

        return replace_suffix(word_str_ser, "e", "", drop_e_mask)
Example #7
0
    def _step1b(self, word_str_ser, can_replace_mask=None):
        """Implements Step 1b from "An algorithm for suffix stripping"

        From the paper:

            (m>0) EED -> EE                 feed      ->  feed
                                            agreed    ->  agree
            (*v*) ED  ->                    plastered ->  plaster
                                            bled      ->  bled
            (*v*) ING ->                    motoring  ->  motor
                                            sing      ->  sing

        If the second or third of the rules in Step 1b is successful,
        the following is done:

            AT -> ATE                       conflat(ed)  ->  conflate
            BL -> BLE                       troubl(ed)   ->  trouble
            IZ -> IZE                       siz(ed)      ->  size
            (*d and not (*L or *S or *Z))
            -> single letter
                                            hopp(ing)    ->  hop
                                            tann(ed)     ->  tan
                                            fall(ing)    ->  fall
                                            hiss(ing)    ->  hiss
                                            fizz(ed)     ->  fizz
            (m=1 and *o) -> E               fail(ing)    ->  fail
                                            fil(ing)     ->  file

        The rule to map to a single letter causes the removal of one of
        the double letter pair. The -E is put back on -AT, -BL and -IZ,
        so that the suffixes -ATE, -BLE and -IZE can be recognised
        later. This E may be removed in step 4.
        """

        can_replace_mask = build_can_replace_mask(len_mask=len(word_str_ser),
                                                  mask=can_replace_mask)

        # this NLTK-only block extends the original algorithm, so that
        # 'spied'->'spi' but 'died'->'die' etc
        if self.mode == "NLTK_EXTENSIONS":
            # special case first: word.endswith('ied') and len(word) == 4
            # maps 'ied' -> 'ie' (e.g. died -> die)
            suffix_mask = ends_with_suffix(word_str_ser, "ied")
            len_mask = len_eq_n(word_str_ser, 4)

            condition_mask = suffix_mask & len_mask

            valid_mask = can_replace_mask & condition_mask
            word_str_ser = replace_suffix(word_str_ser, "ied", "ie",
                                          valid_mask)

            # update can replace mask
            can_replace_mask = can_replace_mask & cudf.logical_not(
                condition_mask)

            # remaining (longer) 'ied' words map 'ied' -> 'i'
            # (e.g. spied -> spi); 4-letter words were excluded above
            condition_mask = suffix_mask
            valid_mask = can_replace_mask & condition_mask
            word_str_ser = replace_suffix(word_str_ser, "ied", "i", valid_mask)

            # update can replace mask: any 'ied' word is now final
            can_replace_mask = can_replace_mask & cudf.logical_not(
                condition_mask)

        # (m>0) EED -> EE
        # if suffix ==eed we stop processing
        # to be consistent with nltk
        suffix_mask = ends_with_suffix(word_str_ser, "eed")
        valid_mask = suffix_mask & can_replace_mask

        # strip 'eed' only to measure the stem; the measure gate
        # decides whether the replacement actually happens
        stem = replace_suffix(word_str_ser, "eed", "", valid_mask)
        measure_mask = measure_gt_n(stem, 0)

        valid_mask = measure_mask & suffix_mask & can_replace_mask
        # adding ee series to stem
        word_str_ser = replace_suffix(word_str_ser, "eed", "ee", valid_mask)

        # to be consistent with nltk we dont replace
        # if word.endswith('eed') we stop proceesing
        # (so e.g. 'feed' stays 'feed' even though (*v*) holds for 'fe')
        can_replace_mask = can_replace_mask & cudf.logical_not(suffix_mask)

        # rule 2
        #    (*v*) ED  ->   plastered ->  plaster
        #                   bled      ->  bled

        ed_suffix_mask = ends_with_suffix(word_str_ser, "ed")
        intermediate_stem = replace_suffix(word_str_ser, "ed", "",
                                           ed_suffix_mask & can_replace_mask)
        # (*v*): the stem left after stripping 'ed' must contain a vowel
        vowel_mask = contains_vowel(intermediate_stem)

        rule_2_mask = vowel_mask & ed_suffix_mask & can_replace_mask

        # rule 3

        #    (*v*) ING ->  motoring  ->  motor
        #                   sing      ->  sing
        ing_suffix_mask = ends_with_suffix(word_str_ser, "ing")
        intermediate_stem = replace_suffix(word_str_ser, "ing", "",
                                           ing_suffix_mask & can_replace_mask)
        vowel_mask = contains_vowel(intermediate_stem)
        rule_3_mask = vowel_mask & ing_suffix_mask & can_replace_mask

        rule_2_or_rule_3_mask = rule_2_mask | rule_3_mask

        # replace masks only if rule_2_or_rule_3_mask
        intermediate_stem_1 = replace_suffix(word_str_ser, "ed", "",
                                             rule_2_mask)
        intermediate_stem_2 = replace_suffix(intermediate_stem_1, "ing", "",
                                             rule_3_mask)

        # the follow-up rule list below applies only to words where
        # rule 2 or rule 3 actually fired (per the paper)
        can_replace_mask = can_replace_mask & rule_2_or_rule_3_mask
        return apply_rule_list(
            intermediate_stem_2,
            [
                ("at", "ate", None),  # AT -> ATE
                ("bl", "ble", None),  # BL -> BLE
                ("iz", "ize", None),  # IZ -> IZE
                # (*d and not (*L or *S or *Z))
                # -> single letter
                (
                    "*d",
                    -1,  # intermediate_stem[-1],
                    lambda stem: last_char_not_in(stem,
                                                  characters=["l", "s", "z"]),
                ),
                # (m=1 and *o) -> E
                (
                    "",
                    "e",
                    lambda stem: measure_eq_n(stem, n=1) & ends_cvc(stem),
                ),
            ],
            can_replace_mask,
        )[0]