def tokenize(self, input_string):
    """Tokenizes input string into alphanumeric tokens.

    Args:
        input_string (str): The string to be tokenized.

    Returns:
        A Python list, which represents a set of tokens if the flag
        return_set is true, and a bag of tokens otherwise.

    Raises:
        TypeError : If the input is not a string.

    Examples:
        >>> alnum_tok = AlphanumericTokenizer()
        >>> alnum_tok.tokenize('data9,(science), data9#.(integration).88')
        ['data9', 'science', 'data9', 'integration', '88']
        >>> alnum_tok.tokenize('#.&')
        []
        >>> alnum_tok = AlphanumericTokenizer(return_set=True)
        >>> alnum_tok.tokenize('data9,(science), data9#.(integration).88')
        ['data9', 'science', 'integration', '88']
    """
    # Validate the input before tokenizing.
    utils.tok_check_for_none(input_string)
    utils.tok_check_for_string_input(input_string)

    # Collect regex matches, dropping any empty strings.
    tokens = [match for match in self.__alnum_regex.findall(input_string)
              if match]

    # Deduplicate only when the tokenizer was configured to return a set.
    return utils.convert_bag_to_set(tokens) if self.return_set else tokens
def delimiter(input_string, delim_str=' '):
    """Tokenizes input string based on the given delimiter.

    Args:
        input_string (str): Input string
        delim_str (str): Delimiter string

    Returns:
        Token list (list)

    Raises:
        TypeError : If the input is not a string

    Examples:
        >>> delimiter('data science')
        ['data', 'science']
        >>> delimiter('data$#$science', '$#$')
        ['data', 'science']
        >>> delimiter('data science', ',')
        ['data science']
    """
    # Validate the input before splitting.
    utils.tok_check_for_none(input_string)
    utils.tok_check_for_string_input(input_string)

    token_list = input_string.split(delim_str)
    return token_list
def whitespace(input_string):
    """Tokenizes input string based on white space.

    Args:
        input_string (str): Input string

    Returns:
        Token list (list)

    Raises:
        TypeError : If the input is not a string

    Examples:
        >>> whitespace('data science')
        ['data', 'science']
        >>> whitespace('data        science')
        ['data', 'science']
        >>> whitespace('data\tscience')
        ['data', 'science']
    """
    # Validate the input before splitting.
    utils.tok_check_for_none(input_string)
    utils.tok_check_for_string_input(input_string)

    # str.split() with no argument splits on any run of whitespace
    # (spaces, tabs, newlines) and discards empty strings.
    token_list = input_string.split()
    return token_list
def tokenize(self, input_string):
    """Tokenizes input string into alphabetical tokens.

    Args:
        input_string (str): The string to be tokenized.

    Returns:
        A Python list, which represents a set of tokens if the flag
        return_set is True, and a bag of tokens otherwise.

    Raises:
        TypeError : If the input is not a string.

    Examples:
        >>> al_tok = AlphabeticTokenizer()
        >>> al_tok.tokenize('data99science, data#integration.')
        ['data', 'science', 'data', 'integration']
        >>> al_tok.tokenize('99')
        []
        >>> al_tok = AlphabeticTokenizer(return_set=True)
        >>> al_tok.tokenize('data99science, data#integration.')
        ['data', 'science', 'integration']
    """
    # Validate the input before tokenizing.
    utils.tok_check_for_none(input_string)
    utils.tok_check_for_string_input(input_string)

    # Keep only the non-empty regex matches.
    tokens = [tok for tok in self.__al_regex.findall(input_string) if tok]

    if self.return_set:
        return utils.convert_bag_to_set(tokens)
    return tokens
def tokenize(self, input_string):
    """Tokenizes input string into qgrams.

    Args:
        input_string (str): The string to be tokenized.

    Returns:
        A Python list, which is a set or a bag of qgrams, depending on
        whether return_set flag is True or False.

    Raises:
        TypeError : If the input is not a string

    Examples:
        >>> qg2_tok = QgramTokenizer()
        >>> qg2_tok.tokenize('database')
        ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']
        >>> qg2_tok.tokenize('a')
        ['#a', 'a$']
        >>> qg3_tok = QgramTokenizer(qval=3)
        >>> qg3_tok.tokenize('database')
        ['##d', '#da', 'dat', 'ata', 'tab', 'aba', 'bas', 'ase', 'se$', 'e$$']
        >>> qg_nopad = QgramTokenizer(padding=False)
        >>> qg_nopad.tokenize('database')
        ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']
        >>> qg_diffpads = QgramTokenizer(prefix_pad='^', suffix_pad='!')
        >>> qg_diffpads.tokenize('database')
        ['^d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e!']
    """
    utils.tok_check_for_none(input_string)
    utils.tok_check_for_string_input(input_string)

    qgram_list = []

    # If the padding flag is set to true, add q-1 "prefix_pad" characters
    # in front of the input string and add q-1 "suffix_pad" characters at
    # the end of the input string, so boundary characters appear in as
    # many qgrams as interior characters.
    if self.padding:
        input_string = (self.prefix_pad * (self.qval - 1)) + input_string \
            + (self.suffix_pad * (self.qval - 1))

    # Strings shorter than q produce no qgrams.
    if len(input_string) < self.qval:
        return qgram_list

    # Use the builtin range (not the Python-2-only xrange) so this method
    # works on both Python 2 and Python 3.
    qgram_list = [input_string[i:i + self.qval]
                  for i in range(len(input_string) - (self.qval - 1))]
    qgram_list = list(filter(None, qgram_list))

    if self.return_set:
        return utils.convert_bag_to_set(qgram_list)

    return qgram_list
def tokenize(self, input_string):
    """Tokenizes input string into qgrams.

    Args:
        input_string (str): The string to be tokenized.

    Returns:
        A Python list, which is a set or a bag of qgrams, depending on
        whether return_set flag is True or False.

    Raises:
        TypeError : If the input is not a string

    Examples:
        >>> qg2_tok = QgramTokenizer()
        >>> qg2_tok.tokenize('database')
        ['#d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e$']
        >>> qg2_tok.tokenize('a')
        ['#a', 'a$']
        >>> qg3_tok = QgramTokenizer(qval=3)
        >>> qg3_tok.tokenize('database')
        ['##d', '#da', 'dat', 'ata', 'tab', 'aba', 'bas', 'ase', 'se$', 'e$$']
        >>> qg_nopad = QgramTokenizer(padding=False)
        >>> qg_nopad.tokenize('database')
        ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']
        >>> qg_diffpads = QgramTokenizer(prefix_pad='^', suffix_pad='!')
        >>> qg_diffpads.tokenize('database')
        ['^d', 'da', 'at', 'ta', 'ab', 'ba', 'as', 'se', 'e!']
    """
    utils.tok_check_for_none(input_string)
    utils.tok_check_for_string_input(input_string)

    qgram_list = []

    # If the padding flag is set to true, add q-1 "prefix_pad" characters
    # in front of the input string and add q-1 "suffix_pad" characters at
    # the end of the input string, so boundary characters appear in as
    # many qgrams as interior characters.
    if self.padding:
        input_string = (self.prefix_pad * (self.qval - 1)) + input_string \
            + (self.suffix_pad * (self.qval - 1))

    # Strings shorter than q produce no qgrams.
    if len(input_string) < self.qval:
        return qgram_list

    # Use the builtin range (not the Python-2-only xrange) so this method
    # works on both Python 2 and Python 3.
    qgram_list = [input_string[i:i + self.qval]
                  for i in range(len(input_string) - (self.qval - 1))]
    qgram_list = list(filter(None, qgram_list))

    if self.return_set:
        return utils.convert_bag_to_set(qgram_list)

    return qgram_list
def tokenize(self, input_string):
    """Tokenizes input string based on the set of delimiters.

    Args:
        input_string (str): The string to be tokenized.

    Returns:
        A Python list which is a set or a bag of tokens, depending on
        whether return_set flag is set to True or False.

    Raises:
        TypeError : If the input is not a string.

    Examples:
        >>> delim_tok = DelimiterTokenizer()
        >>> delim_tok.tokenize('data science')
        ['data', 'science']
        >>> delim_tok = DelimiterTokenizer(['$#$'])
        >>> delim_tok.tokenize('data$#$science')
        ['data', 'science']
        >>> delim_tok = DelimiterTokenizer([',', '.'])
        >>> delim_tok.tokenize('data,science.data,integration.')
        ['data', 'science', 'data', 'integration']
        >>> delim_tok = DelimiterTokenizer([',', '.'], return_set=True)
        >>> delim_tok.tokenize('data,science.data,integration.')
        ['data', 'science', 'integration']
    """
    # Validate the input before tokenizing.
    utils.tok_check_for_none(input_string)
    utils.tok_check_for_string_input(input_string)

    # A single delimiter can use the fast str.split path; multiple
    # delimiters go through the precompiled regex.
    if self.__use_split:
        pieces = input_string.split(self.__delim_str)
    else:
        pieces = self.__delim_regex.split(input_string)

    # Drop the empty strings produced by leading/trailing/adjacent
    # delimiters.
    token_list = [piece for piece in pieces if piece]

    if self.return_set:
        return utils.convert_bag_to_set(token_list)
    return token_list
def qgram(input_string, qval=2):
    """Tokenizes input string into q-grams.

    A q-gram is defined as all sequences of q characters. Q-grams are also
    known as n-grams and k-grams.

    Args:
        input_string (str): Input string
        qval (int): Q-gram length (defaults to 2)

    Returns:
        Token list (list)

    Raises:
        TypeError : If the input is not a string

    Examples:
        >>> qgram('database')
        ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']
        >>> qgram('a')
        []
        >>> qgram('database', 3)
        ['dat', 'ata', 'tab', 'aba', 'bas', 'ase']
    """
    utils.tok_check_for_none(input_string)
    utils.tok_check_for_string_input(input_string)

    qgram_list = []

    # No qgrams exist when the string is shorter than q or q is invalid.
    if len(input_string) < qval or qval < 1:
        return qgram_list

    # There are len(input_string) - qval + 1 qgrams in total.
    qgram_list = [input_string[i:i + qval]
                  for i in _range(len(input_string) - (qval - 1))]
    return qgram_list
def tokenize(self, input_string):
    """Tokenizes input string based on the set of delimiters.

    Args:
        input_string (str): The string to be tokenized.

    Returns:
        A Python list which is a set or a bag of tokens, depending on
        whether return_set flag is set to True or False.

    Raises:
        TypeError : If the input is not a string.

    Examples:
        >>> delim_tok = DelimiterTokenizer()
        >>> delim_tok.tokenize('data science')
        ['data', 'science']
        >>> delim_tok = DelimiterTokenizer(['$#$'])
        >>> delim_tok.tokenize('data$#$science')
        ['data', 'science']
        >>> delim_tok = DelimiterTokenizer([',', '.'])
        >>> delim_tok.tokenize('data,science.data,integration.')
        ['data', 'science', 'data', 'integration']
        >>> delim_tok = DelimiterTokenizer([',', '.'], return_set=True)
        >>> delim_tok.tokenize('data,science.data,integration.')
        ['data', 'science', 'integration']
    """
    # Validate the input before tokenizing.
    utils.tok_check_for_none(input_string)
    utils.tok_check_for_string_input(input_string)

    # Choose the splitter: plain str.split for a single delimiter,
    # precompiled regex for several.
    if self.__use_split:
        raw_tokens = input_string.split(self.__delim_str)
    else:
        raw_tokens = self.__delim_regex.split(input_string)

    # Remove empty strings left by consecutive or edge delimiters.
    token_list = list(filter(None, raw_tokens))

    return (utils.convert_bag_to_set(token_list)
            if self.return_set else token_list)
def tokenize(self, input_string):
    """Tokenizes input string into numeric tokens.

    Args:
        input_string (str): The string to be tokenized.

    Returns:
        A Python list, which represents a set of tokens if the flag
        return_set is true, and a bag of tokens otherwise.

    Raises:
        TypeError : If the input is not a string.
    """
    # Validate the input before tokenizing.
    utils.tok_check_for_none(input_string)
    utils.tok_check_for_string_input(input_string)

    # Keep only the non-empty matches of the numeric regex.
    tokens = [tok for tok in self.__num_regex.findall(input_string) if tok]

    if self.return_set:
        return utils.convert_bag_to_set(tokens)
    return tokens
def qgram(input_string, qval=2):
    """Tokenizes input string into q-grams.

    A q-gram is defined as all sequences of q characters. Q-grams are also
    known as n-grams and k-grams.

    Args:
        input_string (str): Input string
        qval (int): Q-gram length (defaults to 2)

    Returns:
        Token list (list)

    Raises:
        TypeError : If the input is not a string

    Examples:
        >>> qgram('database')
        ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']
        >>> qgram('a')
        []
        >>> qgram('database', 3)
        ['dat', 'ata', 'tab', 'aba', 'bas', 'ase']
    """
    utils.tok_check_for_none(input_string)
    utils.tok_check_for_string_input(input_string)

    qgram_list = []

    # No qgrams exist when the string is shorter than q or q is invalid.
    if len(input_string) < qval or qval < 1:
        return qgram_list

    # There are len(input_string) - qval + 1 qgrams in total.
    qgram_list = [input_string[i:i + qval]
                  for i in _range(len(input_string) - (qval - 1))]
    return qgram_list