Example #1
    def tokenize(self):
        """
        Tokenize the text, skipping stopwords and indexing term offsets.
        """

        self.tokens = []
        self.terms = OrderedDict()

        # Load stopwords.
        stopwords = self.stopwords()

        # Generate tokens.
        for token in utils.tokenize(self.text):

            # Ignore stopwords, appending None as a placeholder so that
            # positions in self.tokens stay aligned with token offsets.
            if token['unstemmed'] in stopwords:
                self.tokens.append(None)

            else:

                # Keep the token itself.
                self.tokens.append(token)

                # Index the offset under the token's stemmed form.
                offsets = self.terms.setdefault(token['stemmed'], [])
                offsets.append(token['offset'])
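
For context, this method consumes utils.tokenize as a generator of dicts with 'stemmed', 'unstemmed', and 'offset' keys. That implementation is not shown in this example; below is a minimal sketch of a compatible generator, assuming regex word matching and NLTK's PorterStemmer:

import re

from nltk.stem import PorterStemmer


def tokenize(text):

    """
    Yield a dict per word token. (A sketch only; the real
    utils.tokenize may differ in details.)
    """

    stem = PorterStemmer().stem

    # Lowercase once, then match runs of letters. The offset counts
    # words, not characters, as the tests below expect.
    for offset, match in enumerate(re.finditer('[a-z]+', text.lower())):

        unstemmed = match.group(0)

        yield {
            'stemmed': stem(unstemmed),
            'unstemmed': unstemmed,
            'offset': offset,
        }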
Example #2
def assert_abc(text):

    """
    Assert tokens aa/bb/cc.

    Args:
        text (str): A raw text string.
    """

    tokens = tokenize(text)

    aa = next(tokens)
    assert aa['stemmed']    == 'aa'
    assert aa['unstemmed']  == 'aa'
    assert aa['offset']     == 0

    bb = next(tokens)
    assert bb['stemmed']    == 'bb'
    assert bb['unstemmed']  == 'bb'
    assert bb['offset']     == 1

    cc = next(tokens)
    assert cc['stemmed']    == 'cc'
    assert cc['unstemmed']  == 'cc'
    assert cc['offset']     == 2
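
A helper like this is meant to be shared by tests that feed in different raw strings expected to reduce to the same three tokens. For illustration, with hypothetical inputs:

assert_abc('aa bb cc')
assert_abc('aa, bb, cc!')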
Example #3
def test_ignore_case():

    """
    Tokens should be downcased.
    """

    text = 'One TWO ThReE'

    tokens = tokenize(text)

    t1 = next(tokens)
    assert t1['stemmed']    == 'one'
    assert t1['unstemmed']  == 'one'
    assert t1['offset']     == 0

    t2 = next(tokens)
    assert t2['stemmed']    == 'two'
    assert t2['unstemmed']  == 'two'
    assert t2['offset']     == 1

    t3 = next(tokens)
    assert t3['stemmed']    == 'three'
    assert t3['unstemmed']  == 'three'
    assert t3['offset']     == 2
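
Under the sketch after Example #1, this behavior falls out of lowercasing the whole text before matching, so the stored 'unstemmed' form is already downcased:

import re

print([m.group(0) for m in re.finditer('[a-z]+', 'One TWO ThReE'.lower())])
# ['one', 'two', 'three']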
Example #4
def test_stem():

    """
    Stemmable tokens should be stemmed.
    """

    text = 'happy lovely days'

    tokens = tokenize(text)

    t1 = next(tokens)
    assert t1['stemmed']    == 'happi'
    assert t1['unstemmed']  == 'happy'
    assert t1['offset']     == 0

    t2 = next(tokens)
    assert t2['stemmed']    == 'love'
    assert t2['unstemmed']  == 'lovely'
    assert t2['offset']     == 1

    t3 = next(tokens)
    assert t3['stemmed']    == 'day'
    assert t3['unstemmed']  == 'days'
    assert t3['offset']     == 2
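
The expected stem/surface pairs here match NLTK's Porter stemmer (an assumption; the tests never name the stemmer), which can be checked directly:

from nltk.stem import PorterStemmer

stem = PorterStemmer().stem

assert stem('happy') == 'happi'    # final 'y' -> 'i' after a consonant
assert stem('lovely') == 'love'    # 'eli' -> 'e' in Porter step 2
assert stem('days') == 'day'       # plural 's' stripped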