Code Example #1
    def tokenize(self, input_string):
        """Tokenizes input string into alphanumeric tokens.

        Args:
            input_string (str): The string to be tokenized.

        Returns:
            A Python list, which represents a set of tokens if the flag return_set is True, and a bag of tokens otherwise.

        Raises:
            TypeError : If the input is not a string.

        Examples:
            >>> alnum_tok = AlphanumericTokenizer()
            >>> alnum_tok.tokenize('data9,(science), data9#.(integration).88')
            ['data9', 'science', 'data9', 'integration', '88']
            >>> alnum_tok.tokenize('#.&')
            []
            >>> alnum_tok = AlphanumericTokenizer(return_set=True) 
            >>> alnum_tok.tokenize('data9,(science), data9#.(integration).88')
            ['data9', 'science', 'integration', '88']
                      
        """
        utils.tok_check_for_none(input_string)
        utils.tok_check_for_string_input(input_string)

        token_list = list(filter(None,
                                 self.__alnum_regex.findall(input_string)))

        if self.return_set:
            return utils.convert_bag_to_set(token_list)

        return token_list
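
The compiled pattern self.__alnum_regex and the helper utils.convert_bag_to_set are defined elsewhere in the class and are not shown in this excerpt. Below is a minimal self-contained sketch of the same behaviour, assuming the pattern matches runs of ASCII letters and digits and that the set conversion de-duplicates while keeping first-occurrence order, as the doctest output above suggests; all names introduced here are hypothetical.

import re

_ALNUM_RE = re.compile('[a-zA-Z0-9]+')  # assumed stand-in for self.__alnum_regex

def alnum_tokenize(input_string, return_set=False):
    # Raise the error the utils checks above are responsible for.
    if not isinstance(input_string, str):
        raise TypeError('Input is expected to be a string')
    token_list = list(filter(None, _ALNUM_RE.findall(input_string)))
    if return_set:
        # De-duplicate while preserving first-occurrence order, mirroring
        # what the doctest output implies for utils.convert_bag_to_set.
        seen = set()
        return [tok for tok in token_list if not (tok in seen or seen.add(tok))]
    return token_list

# alnum_tokenize('data9,(science), data9#.(integration).88')
#   -> ['data9', 'science', 'data9', 'integration', '88']
# alnum_tokenize('data9,(science), data9#.(integration).88', return_set=True)
#   -> ['data9', 'science', 'integration', '88']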
Code Example #2
def delimiter(input_string, delim_str=' '):
    """
    Tokenizes input string based on the given delimiter.

    Args:
        input_string (str): Input string

        delim_str (str): Delimiter string


    Returns:
        Token list (list)

    Raises:
        TypeError : If the input is not a string

    Examples:
        >>> delimiter('data science')
        ['data', 'science']
        >>> delimiter('data$#$science', '$#$')
        ['data', 'science']
        >>> delimiter('data science', ',')
        ['data science']

    """
    utils.tok_check_for_none(input_string)
    utils.tok_check_for_string_input(input_string)

    return input_string.split(delim_str)
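
Note that str.split(delim_str) keeps the empty tokens produced by consecutive delimiters, which is the main behavioural difference from the whitespace tokenizer in the next example. A quick illustration using only the standard library:

# Consecutive single-space delimiters produce empty tokens with str.split(' '):
'data  science'.split(' ')       # -> ['data', '', 'science']
# Multi-character delimiters behave as in the doctest above:
'data$#$science'.split('$#$')    # -> ['data', 'science']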
Code Example #3
def whitespace(input_string):
    """
    Tokenizes input string based on white space.

    Args:
        input_string (str): Input string

    Returns:
        Token list (list)

    Raises:
        TypeError : If the input is not a string

    Examples:
        >>> whitespace('data science')
        ['data', 'science']
        >>> whitespace('data        science')
        ['data', 'science']
        >>> whitespace('data\tscience')
        ['data', 'science']

    """
    utils.tok_check_for_none(input_string)
    utils.tok_check_for_string_input(input_string)

    return input_string.split()
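
Calling str.split() with no argument collapses any run of spaces, tabs, and newlines and drops leading and trailing whitespace, which is why the outputs above never contain empty tokens:

'  data \t science\n'.split()    # -> ['data', 'science']
' data  science '.split(' ')     # -> ['', 'data', '', 'science', ''] (contrast with split())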
Code Example #4
    def tokenize(self, input_string):
        """Tokenizes input string into alphabetical tokens.
        
        Args:
            input_string (str): The string to be tokenized.

        Returns:
            A Python list, which represents a set of tokens if the flag return_set is True, and a bag of tokens otherwise. 

        Raises:
            TypeError : If the input is not a string.

        Examples:
            >>> al_tok = AlphabeticTokenizer()
            >>> al_tok.tokenize('data99science, data#integration.')
            ['data', 'science', 'data', 'integration']
            >>> al_tok.tokenize('99')
            []
            >>> al_tok = AlphabeticTokenizer(return_set=True) 
            >>> al_tok.tokenize('data99science, data#integration.')
            ['data', 'science', 'integration']
        """
        utils.tok_check_for_none(input_string)
        utils.tok_check_for_string_input(input_string)

        token_list = list(filter(None, self.__al_regex.findall(input_string)))

        if self.return_set:
            return utils.convert_bag_to_set(token_list)

        return token_list
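
As in the alphanumeric tokenizer above, the compiled pattern self.__al_regex is not shown in this excerpt. The doctest output is consistent with a pattern that matches runs of letters only; the stand-in below is an assumption, not the library's definition.

import re

_AL_RE = re.compile('[a-zA-Z]+')  # assumed stand-in for self.__al_regex
list(filter(None, _AL_RE.findall('data99science, data#integration.')))
# -> ['data', 'science', 'data', 'integration']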
Code Example #5
    def tokenize(self, input_string):
        """Tokenizes input string into qgrams.

        Args:
            input_string (str): The string to be tokenized. 

        Returns:
            A Python list, which is a set or a bag of qgrams, depending on whether the return_set flag is True or False.

        Raises:
            TypeError : If the input is not a string

        Examples:
            >>> qg2_tok = QgramTokenizer()
            >>> qg2_tok.tokenize('database')
            ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']
            >>> qg2_tok.tokenize('a')
            ['#a', 'a$']
            >>> qg3_tok = QgramTokenizer(qval=3)
            >>> qg3_tok.tokenize('database')
            ['##d', '#da', 'dat', 'ata', 'tab', 'aba', 'bas', 'ase', 'se$', 'e$$']
            >>> qg2_nopad = QgramTokenizer(padding=False)
            >>> qg2_nopad.tokenize('database')
            ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']
            >>> qg2_diffpads = QgramTokenizer(prefix_pad='^', suffix_pad='!')
            >>> qg2_diffpads.tokenize('database')
            ['^d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e!']
                      
        """
        utils.tok_check_for_none(input_string)
        utils.tok_check_for_string_input(input_string)

        qgram_list = []

        # If the padding flag is set to True, add q-1 "prefix_pad" characters
        # in front of the input string and q-1 "suffix_pad" characters at
        # the end of the input string.
        if self.padding:
            input_string = (self.prefix_pad * (self.qval - 1)) + input_string \
                           + (self.suffix_pad * (self.qval - 1))

        if len(input_string) < self.qval:
            return qgram_list

        qgram_list = [
            input_string[i:i + self.qval]
            for i in xrange(len(input_string) - (self.qval - 1))
        ]
        qgram_list = list(filter(None, qgram_list))

        if self.return_set:
            return utils.convert_bag_to_set(qgram_list)

        return qgram_list
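
In this excerpt, xrange is assumed to come from a Python 2/3 compatibility import that is not shown, and utils.convert_bag_to_set behaves as described in the earlier examples. The sketch below reproduces the padding logic with the default settings (qval=2, prefix_pad='#', suffix_pad='$'); it is an illustration, not the library code.

def qgram_sketch(s, qval=2, padding=True, prefix_pad='#', suffix_pad='$'):
    if padding:
        # With the defaults, 'database' becomes '#database$' before slicing.
        s = prefix_pad * (qval - 1) + s + suffix_pad * (qval - 1)
    if len(s) < qval:
        return []
    # One q-gram starts at every position that still leaves qval characters.
    return [s[i:i + qval] for i in range(len(s) - (qval - 1))]

# qgram_sketch('database')  -> ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']
# qgram_sketch('a')         -> ['#a', 'a$']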
Code Example #6
    def tokenize(self, input_string):
        """Tokenizes input string into qgrams.

        Args:
            input_string (str): The string to be tokenized. 

        Returns:
            A Python list, which is a set or a bag of qgrams, depending on whether the return_set flag is True or False.

        Raises:
            TypeError : If the input is not a string

        Examples:
            >>> qg2_tok = QgramTokenizer()
            >>> qg2_tok.tokenize('database')
            ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']
            >>> qg2_tok.tokenize('a')
            ['#a', 'a$']
            >>> qg3_tok = QgramTokenizer(qval=3)
            >>> qg3_tok.tokenize('database')
            ['##d', '#da', 'dat', 'ata', 'tab', 'aba', 'bas', 'ase', 'se$', 'e$$']
            >>> qg2_nopad = QgramTokenizer(padding=False)
            >>> qg2_nopad.tokenize('database')
            ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']
            >>> qg2_diffpads = QgramTokenizer(prefix_pad='^', suffix_pad='!')
            >>> qg2_diffpads.tokenize('database')
            ['^d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e!']
                      
        """
        utils.tok_check_for_none(input_string)
        utils.tok_check_for_string_input(input_string)

        qgram_list = []

        # If the padding flag is set to True, add q-1 "prefix_pad" characters
        # in front of the input string and q-1 "suffix_pad" characters at
        # the end of the input string.
        if self.padding:
            input_string = (self.prefix_pad * (self.qval - 1)) + input_string \
                           + (self.suffix_pad * (self.qval - 1))

        if len(input_string) < self.qval:
            return qgram_list

        qgram_list = [input_string[i:i + self.qval] for i in
                      xrange(len(input_string) - (self.qval - 1))]
        qgram_list = list(filter(None, qgram_list))

        if self.return_set:
            return utils.convert_bag_to_set(qgram_list)

        return qgram_list
Code Example #7
    def tokenize(self, input_string):
        """Tokenizes input string based on the set of delimiters.

        Args:
            input_string (str): The string to be tokenized. 

        Returns:
            A Python list, which is a set or a bag of tokens, depending on whether the return_set flag is set to True or False.

        Raises:
            TypeError : If the input is not a string.

        Examples:
            >>> delim_tok = DelimiterTokenizer() 
            >>> delim_tok.tokenize('data science')
            ['data', 'science']
            >>> delim_tok = DelimiterTokenizer(['$#$']) 
            >>> delim_tok.tokenize('data$#$science')
            ['data', 'science']
            >>> delim_tok = DelimiterTokenizer([',', '.']) 
            >>> delim_tok.tokenize('data,science.data,integration.')
            ['data', 'science', 'data', 'integration']
            >>> delim_tok = DelimiterTokenizer([',', '.'], return_set=True) 
            >>> delim_tok.tokenize('data,science.data,integration.')
            ['data', 'science', 'integration']

        """
        utils.tok_check_for_none(input_string)
        utils.tok_check_for_string_input(input_string)

        if self.__use_split:
            token_list = list(
                filter(None, input_string.split(self.__delim_str)))
        else:
            token_list = list(
                filter(None, self.__delim_regex.split(input_string)))

        if self.return_set:
            return utils.convert_bag_to_set(token_list)

        return token_list
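
The single-delimiter fast path (self.__use_split) relies on plain str.split, while the multi-delimiter path splits on a compiled regular expression (self.__delim_regex, not shown). One plausible construction of that regex, offered only as an assumption consistent with the doctest output:

import re

delim_set = [',', '.']
# Escape each delimiter and join with '|' so a match on any of them splits the string.
delim_regex = re.compile('|'.join(map(re.escape, delim_set)))
list(filter(None, delim_regex.split('data,science.data,integration.')))
# -> ['data', 'science', 'data', 'integration']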
Code Example #8
def qgram(input_string, qval=2):
    """
    Tokenizes input string into q-grams.

    A q-gram is a contiguous sequence of q characters within the input string. Q-grams are also
    known as n-grams and k-grams.

    Args:
        input_string (str): Input string

        qval (int): Q-gram length (defaults to 2)

    Returns:
        Token list (list)

    Raises:
        TypeError : If the input is not a string

    Examples:
        >>> qgram('database')
        ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']
        >>> qgram('a')
        []
        >>> qgram('database', 3)
        ['dat', 'ata', 'tab', 'aba', 'bas', 'ase']


    """
    utils.tok_check_for_none(input_string)
    utils.tok_check_for_string_input(input_string)

    qgram_list = []

    if len(input_string) < qval or qval < 1:
        return qgram_list

    qgram_list = [
        input_string[i:i + qval]
        for i in _range(len(input_string) - (qval - 1))
    ]
    return qgram_list
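
Unlike the class-based QgramTokenizer shown earlier, this function performs no padding, so a string shorter than qval yields an empty list and a string of length n yields exactly n - (qval - 1) q-grams:

# len('database') == 8, so qval=2 yields 8 - 1 = 7 q-grams and qval=3 yields 6:
['database'[i:i + 2] for i in range(8 - 1)]
# -> ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']
['database'[i:i + 3] for i in range(8 - 2)]
# -> ['dat', 'ata', 'tab', 'aba', 'bas', 'ase']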
Code Example #9
    def tokenize(self, input_string):
        """Tokenizes input string based on the set of delimiters.

        Args:
            input_string (str): The string to be tokenized. 

        Returns:
            A Python list, which is a set or a bag of tokens, depending on whether the return_set flag is set to True or False.

        Raises:
            TypeError : If the input is not a string.

        Examples:
            >>> delim_tok = DelimiterTokenizer() 
            >>> delim_tok.tokenize('data science')
            ['data', 'science']
            >>> delim_tok = DelimiterTokenizer(['$#$']) 
            >>> delim_tok.tokenize('data$#$science')
            ['data', 'science']
            >>> delim_tok = DelimiterTokenizer([',', '.']) 
            >>> delim_tok.tokenize('data,science.data,integration.')
            ['data', 'science', 'data', 'integration']
            >>> delim_tok = DelimiterTokenizer([',', '.'], return_set=True) 
            >>> delim_tok.tokenize('data,science.data,integration.')
            ['data', 'science', 'integration']

        """
        utils.tok_check_for_none(input_string)
        utils.tok_check_for_string_input(input_string)
    
        if self.__use_split:
            token_list = list(filter(None,
                                     input_string.split(self.__delim_str)))
        else:
            token_list = list(filter(None,
                                     self.__delim_regex.split(input_string)))

        if self.return_set:
            return utils.convert_bag_to_set(token_list)

        return token_list
Code Example #10
    def tokenize(self, input_string):
        """Tokenizes input string into numeric tokens.

        Args:
            input_string (str): The string to be tokenized.

        Returns:
            A Python list, which represents a set of tokens if the flag return_set is True, and a bag of tokens otherwise.

        Raises:
            TypeError : If the input is not a string.
        """
        utils.tok_check_for_none(input_string)
        utils.tok_check_for_string_input(input_string)

        token_list = list(filter(None, self.__num_regex.findall(input_string)))

        if self.return_set:
            return utils.convert_bag_to_set(token_list)

        return token_list
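
This docstring carries no doctest examples and the compiled pattern self.__num_regex is not shown. A hedged sketch of the expected behaviour, assuming a simple digit-run pattern (the library's actual pattern may also handle signs or decimals):

import re

_NUM_RE = re.compile('[0-9]+')  # assumed stand-in for self.__num_regex
list(filter(None, _NUM_RE.findall('data9,(science), data9#.(integration).88')))
# -> ['9', '9', '88']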
Code Example #11
def qgram(input_string, qval=2):
    """
    Tokenizes input string into q-grams.

    A q-gram is a contiguous sequence of q characters within the input string. Q-grams are also
    known as n-grams and k-grams.

    Args:
        input_string (str): Input string

        qval (int): Q-gram length (defaults to 2)

    Returns:
        Token list (list)

    Raises:
        TypeError : If the input is not a string

    Examples:
        >>> qgram('database')
        ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']
        >>> qgram('a')
        []
        >>> qgram('database', 3)
        ['dat', 'ata', 'tab', 'aba', 'bas', 'ase']


    """
    utils.tok_check_for_none(input_string)
    utils.tok_check_for_string_input(input_string)

    qgram_list = []

    if len(input_string) < qval or qval < 1:
        return qgram_list

    qgram_list = [input_string[i:i + qval] for i in _range(len(input_string) - (qval - 1))]
    return qgram_list