Example #1
File: word.py Project: ManviG/cltk
 def tokenize(self, string):
     """Tokenize incoming string."""
     punkt = PunktLanguageVars()
     generic_tokens = punkt.word_tokenize(string)
     generic_tokens = [x for item in generic_tokens for x in ([item] if item != 'nec' else ['c', 'ne'])] # Handle 'nec' as a special case.
     specific_tokens = []
     for generic_token in generic_tokens:
         is_enclitic = False
         if generic_token not in self.exceptions:
             for enclitic in self.enclitics:
                 if generic_token.endswith(enclitic):
                     if enclitic == 'mst':
                         specific_tokens += [generic_token[:-len(enclitic)+1]] + ['e'+ generic_token[-len(enclitic)+1:]]
                     elif enclitic == 'cum':
                         if generic_token in self.inclusions:
                             specific_tokens += [enclitic] + [generic_token[:-len(enclitic)]]
                         else:
                             specific_tokens += [generic_token]                                                     
                     else:
                         specific_tokens += [enclitic] + [generic_token[:-len(enclitic)]]
                     is_enclitic = True
                     break
         if not is_enclitic:
             specific_tokens.append(generic_token)
     return specific_tokens
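A minimal, self-contained sketch of the enclitic split used above; the ENCLITICS and EXCEPTIONS lists here are hypothetical stand-ins for the class attributes self.enclitics and self.exceptions:

from nltk.tokenize.punkt import PunktLanguageVars

# Hypothetical stand-ins for self.enclitics and self.exceptions.
ENCLITICS = ['que', 'ue', 've']
EXCEPTIONS = {'quoque', 'itaque', 'atque'}

def split_enclitics(text):
    """Prepend each detected enclitic and keep the stem, as in tokenize() above."""
    punkt = PunktLanguageVars()
    tokens = []
    for tok in punkt.word_tokenize(text):
        if tok.lower() not in EXCEPTIONS:
            for enclitic in ENCLITICS:
                if tok.endswith(enclitic) and len(tok) > len(enclitic):
                    tokens += [enclitic, tok[:-len(enclitic)]]
                    break
            else:
                tokens.append(tok)
        else:
            tokens.append(tok)
    return tokens

print(split_enclitics('Arma virumque cano.'))
# Expected roughly: ['Arma', 'que', 'virum', 'cano.'] -- PunktLanguageVars keeps the final period attached.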
Example #2
def nltk_tokenize_words(string, attached_period=False, language=None):
    """Wrap NLTK's tokenizer PunktLanguageVars(), but make final period
    its own token.
    >>> nltk_tokenize_words("Sentence 1. Sentence 2.")
    ['Sentence', '1', '.', 'Sentence', '2', '.']

    Optionally keep NLTK's original output, with the final period attached:
    >>> nltk_tokenize_words("Sentence 1. Sentence 2.", attached_period=True)
    ['Sentence', '1.', 'Sentence', '2.']

    TODO: Run some tests to determine whether there is a large penalty for
    re-calling PunktLanguageVars() for each use of this function. If so, this
    will need to become a class, perhaps inheriting from the PunktLanguageVars
    object. Maybe integrate with WordTokenizer.
    """
    assert isinstance(string, str), "Incoming string must be type str."
    if language == 'sanskrit':
        periods = ['.', '।', '॥']
    else:
        periods = ['.']
    punkt = PunktLanguageVars()
    tokens = punkt.word_tokenize(string)
    if attached_period:
        return tokens
    new_tokens = []
    for word in tokens:
        for char in periods:
            if word.endswith(char):
                new_tokens.append(word[:-1])
                new_tokens.append(char)
                break
        else:
            new_tokens.append(word)
    return new_tokens
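A hedged usage sketch of the Sanskrit branch above; the Devanagari sentence is only illustrative, and the expected output assumes PunktLanguageVars leaves the danda attached to the preceding word:

tokens = nltk_tokenize_words('रामः वनं गच्छति।', language='sanskrit')
print(tokens)
# Expected (under that assumption): ['रामः', 'वनं', 'गच्छति', '।']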
Example #3
 def tokenize(self, string):
     """Tokenize incoming string."""
     #punkt = WhitespaceTokenizer()
     punkt = PunktLanguageVars()
     generic_tokens = punkt.word_tokenize(string)
     generic_tokens = [x for item in generic_tokens for x in ([item] if item != 'nec' else ['c', 'ne'])] # Handle 'nec' as a special case.
     specific_tokens = []
     for generic_token in generic_tokens:
         is_enclitic = False
         if generic_token not in self.exceptions:
             for enclitic in self.enclitics:
                 if generic_token.endswith(enclitic):
                     if enclitic == 'cum':
                         if generic_token in self.inclusions:
                             specific_tokens += [enclitic] + [generic_token[:-len(enclitic)]]
                         else:
                             specific_tokens += [generic_token]                                                                         
                     elif enclitic == 'st':
                         if generic_token.endswith('ust'):
                             specific_tokens += [generic_token[:-len(enclitic)+1]] + ['est']
                         else:
                             # Does not handle 'similist', 'qualist', etc. correctly
                             specific_tokens += [generic_token[:-len(enclitic)]] + ['est']
                     else:
                         specific_tokens += [enclitic] + [generic_token[:-len(enclitic)]]
                     is_enclitic = True
                     break
         if not is_enclitic:
             specific_tokens.append(generic_token)
     # return iter(specific_tokens)  # yields tokens with offsets instead of returning a list
     start_point = 0  # accumulates the character offset of each token
     for item in specific_tokens:
         item_length = len(item)
         yield item, start_point, start_point + item_length
         start_point = start_point + item_length + 1  # +1 assumes a single-space separator
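A consumption sketch for the offset-yielding variant above; `tokenizer` is a hypothetical instance of the surrounding class, and the spans only line up with the source string when tokens are single-space separated and unchanged by the enclitic handling:

text = 'arma virum cano'
for token, start, end in tokenizer.tokenize(text):  # `tokenizer` is a hypothetical instance
    # Holds only under the single-space, no-split assumption stated above.
    assert text[start:end] == token
    print(token, start, end)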
Example #4
 def tokenize(self, string):
     """Tokenize incoming string."""
     punkt = PunktLanguageVars()
     generic_tokens = punkt.word_tokenize(string)
     # Rewrite as an if-else block for exceptions rather than separate list comprehensions
     generic_tokens = [x for item in generic_tokens for x in ([item] if item != 'nec' else ['c', 'ne'])] # Handle 'nec' as a special case.
     generic_tokens = [x for item in generic_tokens for x in ([item] if item != 'sodes' else ['si', 'audes'])] # Handle 'sodes' as a special case.
     generic_tokens = [x for item in generic_tokens for x in ([item] if item != 'sultis' else ['si', 'vultis'])] # Handle 'sultis' as a special case.        
     specific_tokens = []
     for generic_token in generic_tokens:
         is_enclitic = False
         if generic_token not in self.exceptions:
             for enclitic in self.enclitics:
                 if generic_token.endswith(enclitic):
                     if enclitic == 'cum':
                         if generic_token in self.inclusions:
                             specific_tokens += [enclitic] + [generic_token[:-len(enclitic)]]
                         else:
                             specific_tokens += [generic_token]                                                                         
                     elif enclitic == 'st':
                         if generic_token.endswith('ust'):
                             specific_tokens += [generic_token[:-len(enclitic)+1]] + ['est']
                         else:
                             # Does not handle 'similist', 'qualist', etc. correctly
                             specific_tokens += [generic_token[:-len(enclitic)]] + ['est']
                     else:
                         specific_tokens += [enclitic] + [generic_token[:-len(enclitic)]]
                     is_enclitic = True
                     break
         if not is_enclitic:
             specific_tokens.append(generic_token)
     return specific_tokens
Example #5
    def _tokenize(self, text):
        """
        Use NLTK's standard tokenizer, rm punctuation.
        :param text: pre-processed text
        :return: tokenized text
        :rtype : list
        """
        sentence_tokenizer = TokenizeSentence('latin')
        sentences = sentence_tokenizer.tokenize_sentences(text.lower())

        sent_words = []
        punkt = PunktLanguageVars()
        for sentence in sentences:
            words = punkt.word_tokenize(sentence)

            assert isinstance(words, list)
            words_new = []
            for word in words:
                if word not in self.punctuation \
                        and word not in self.abbreviations \
                        and word not in self.numbers:
                    words_new.append(word)

            # TODO: rm all numbers here, e.g. with re.sub(r'[0-9]', '', word)
            sent_words.append(words_new)

        return sent_words
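The word filter above has to chain explicit membership tests: a bare `or` between a membership test and a list is simply truthy whenever the list is non-empty, so nothing would ever be filtered. A two-line check makes the precedence pitfall visible:

punctuation = [',', '.']
abbreviations = ['cf.', 'etc.']
word = ','
print(word not in punctuation or abbreviations)               # ['cf.', 'etc.'] -- truthy, so ',' would be kept
print(word not in punctuation and word not in abbreviations)  # False -- ',' correctly filtered out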
Example #6
def tokenize(doc):
    '''
    INPUT: Document
    OUTPUT: Tokenized and stemmed list of words from the document 
    '''
    plv      = PunktLanguageVars()
    snowball = SnowballStemmer('english')
    return [snowball.stem(word) for word in plv.word_tokenize(doc.lower())]
Example #7
def tokenize(desc):
	'''
	INPUT: List of cleaned descriptions
	OUTPUT: Tokenized and stemmed list of words from the descriptions 
	'''
	plv = PunktLanguageVars()
	snowball = SnowballStemmer('english')
	return [snowball.stem(word) for word in plv.word_tokenize(desc.lower())]
Example #8
File: ner.py Project: cltk/cltk
def tag_ner(lang, input_text, output_type=list):
    """Run NER for chosen language.
    """

    _check_latest_data(lang)

    assert lang in NER_DICT.keys(), \
        'Invalid language. Choose from: {}'.format(', '.join(NER_DICT.keys()))
    types = [str, list]
    assert type(input_text) in types, 'Input must be: {}.'.format(
        ', '.join(t.__name__ for t in types))
    assert output_type in types, 'Output must be a {}.'.format(
        ', '.join(t.__name__ for t in types))

    if type(input_text) == str:
        punkt = PunktLanguageVars()
        tokens = punkt.word_tokenize(input_text)
        new_tokens = []
        for word in tokens:
            if word.endswith('.'):
                new_tokens.append(word[:-1])
                new_tokens.append('.')
            else:
                new_tokens.append(word)
        input_text = new_tokens

    ner_file_path = os.path.expanduser(NER_DICT[lang])
    with open(ner_file_path) as file_open:
        ner_str = file_open.read()
    ner_list = ner_str.split('\n')

    ner_tuple_list = []
    for count, word_token in enumerate(input_text):
        match = False
        for ner_word in ner_list:
            # the replacer slows things down, but is necessary
            if word_token == ner_word:
                ner_tuple = (word_token, 'Entity')
                ner_tuple_list.append(ner_tuple)
                match = True
                break
        if not match:
            ner_tuple_list.append((word_token,))

    if output_type is str:
        string = ''
        for tup in ner_tuple_list:
            start_space = ' '
            final_space = ''
            # this is some mediocre string reconstitution
            # maybe not worth the effort
            if tup[0] in [',', '.', ';', ':', '?', '!']:
                start_space = ''
            if len(tup) == 2:
                string += start_space + tup[0] + '/' + tup[1] + final_space
            else:
                string += start_space + tup[0] + final_space
        return string

    return ner_tuple_list
Example #9
 def test_french_stopwords(self):
     """Test filtering French stopwords."""
     sentence = "En pensé ai e en talant que d ’ Yonec vus die avant dunt il fu nez, e de sun pere cum il vint primes a sa mere ."
     lowered = sentence.lower()
     punkt = PunktLanguageVars()
     tokens = punkt.word_tokenize(lowered)
     no_stops = [w for w in tokens if w not in FRENCH_STOPS]
     target_list = ['pensé', 'talant', 'd', '’', 'yonec', 'die', 'avant', 'dunt', 'nez', ',', 'pere', 'cum', 'primes',
                    'mere','.']
     self.assertEqual(no_stops, target_list)
Example #10
 def test_latin_stopwords(self):
     """Test filtering Latin stopwords."""
     sentence = 'Quo usque tandem abutere, Catilina, patientia nostra?'
     lowered = sentence.lower()
     punkt = PunktLanguageVars()
     tokens = punkt.word_tokenize(lowered)
     no_stops = [w for w in tokens if w not in LATIN_STOPS]
     target_list = ['usque', 'tandem', 'abutere', ',', 'catilina', ',',
                    'patientia', 'nostra', '?']
     self.assertEqual(no_stops, target_list)
Example #11
 def test_old_norse_stopwords(self):
     """
     Test filtering Old Norse stopwords
     Sentence extracted from Eiríks saga rauða (http://www.heimskringla.no/wiki/Eir%C3%ADks_saga_rau%C3%B0a)
     """
     sentence = 'Þat var einn morgin, er þeir Karlsefni sá fyrir ofan rjóðrit flekk nökkurn, sem glitraði við þeim'
     lowered = sentence.lower()
     punkt = PunktLanguageVars()
     tokens = punkt.word_tokenize(lowered)
     no_stops = [w for w in tokens if w not in OLD_NORSE_STOPS]
     target_list = ['var', 'einn', 'morgin', ',', 'karlsefni', 'rjóðrit', 'flekk', 'nökkurn', ',', 'glitraði']
     self.assertEqual(no_stops, target_list)
Example #12
 def test_greek_stopwords(self):
     """Test filtering Greek stopwords."""
     sentence = 'Ἅρπαγος δὲ καταστρεψάμενος Ἰωνίην ἐποιέετο στρατηίην \
     ἐπὶ Κᾶρας καὶ Καυνίους καὶ Λυκίους, ἅμα ἀγόμενος καὶ Ἴωνας καὶ \
     Αἰολέας.'
     lowered = sentence.lower()
     punkt = PunktLanguageVars()
     tokens = punkt.word_tokenize(lowered)
     no_stops = [w for w in tokens if w not in GREEK_STOPS]
     target_list = ['ἅρπαγος', 'καταστρεψάμενος', 'ἰωνίην', 'ἐποιέετο',
                    'στρατηίην', 'κᾶρας', 'καυνίους', 'λυκίους', ',',
                    'ἅμα', 'ἀγόμενος', 'ἴωνας', 'αἰολέας.']
     self.assertEqual(no_stops, target_list)
Example #13
 def test_akkadian_stopwords(self):
     """
     Test filtering Akkadian stopwords
     Sentence extracted from the law code of Hammurabi, law 3 (Martha Roth 2nd Edition 1997, Law Collections from
     Mesopotamia and Asia Minor).
     """
     sentence = "šumma awīlum ina dīnim ana šībūt sarrātim ūṣiamma awat iqbû la uktīn šumma dīnum šû dīn napištim awīlum šû iddâk"
     lowered = sentence.lower()
     punkt = PunktLanguageVars()
     tokens = punkt.word_tokenize(lowered)
     no_stops = [w for w in tokens if w not in AKKADIAN_STOPS]
     target_list = ['awīlum', 'dīnim', 'šībūt', 'sarrātim', 'ūṣiamma', 'awat', 'iqbû', 'uktīn', 'dīnum',
                    'dīn', 'napištim', 'awīlum', 'iddâk']
     self.assertEqual(no_stops, target_list)
Example #14
    def lemmatize(self, input_text, return_raw=False, return_string=False):
        """Take incoming string or list of tokens. Lookup done against a
        key-value list of lemmata-headword. If a string, tokenize with
        ``PunktLanguageVars()``. If a final period appears on a token, remove
        it, then re-add once replacement done.
        TODO: rm check for final period, change PunktLanguageVars() to nltk_tokenize_words()
        """
        assert type(input_text) in [list, str], \
            logger.error('Input must be a list or string.')
        if type(input_text) is str:
            punkt = PunktLanguageVars()
            tokens = punkt.word_tokenize(input_text)
        else:
            tokens = input_text

        lemmatized_tokens = []
        for token in tokens:
            # check for final period
            final_period = False
            if token[-1] == '.':
                final_period = True
                token = token[:-1]

            # look for token in lemma dict keys
            if token in self.lemmata.keys():
                headword = self.lemmata[token.lower()]

                # re-add final period if rm'd
                if final_period:
                    headword += '.'

                # append to return list
                if not return_raw:
                    lemmatized_tokens.append(headword)
                else:
                    lemmatized_tokens.append(token + '/' + headword)
            # if token not found in lemma-headword list
            else:
                # re-add final period if rm'd
                if final_period:
                    token += '.'

                if not return_raw:
                    lemmatized_tokens.append(token)
                else:
                    lemmatized_tokens.append(token + '/' + token)
        if not return_string:
            return lemmatized_tokens
        elif return_string:
            return ' '.join(lemmatized_tokens)
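A hedged usage sketch of the method above; in older CLTK releases it lives on a lemmatizer class, here assumed to be cltk.stem.lemma.LemmaReplacer, which loads a Latin lemmata model installed separately:

from cltk.stem.lemma import LemmaReplacer  # assumed class/location for this method

lemmatizer = LemmaReplacer('latin')
print(lemmatizer.lemmatize('arma virumque cano.'))                 # list of headwords
print(lemmatizer.lemmatize('arma virumque cano.', return_raw=True,
                           return_string=True))                    # 'token/headword ...' string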
Example #15
    def _build_concordance(self, text_str):
        """
        Inherit or mimic the logic of ConcordanceIndex() at http://www.nltk.org/_modules/nltk/text.html
        and/or ConcordanceSearchView() & SearchCorpus() at https://github.com/nltk/nltk/blob/develop/nltk/app/concordance_app.py
        :param text_string: Text to be turned into a concordance
        :type text_string: str
        :return: list
        """
        p = PunktLanguageVars()
        orig_tokens = p.word_tokenize(text_str)
        c = ConcordanceIndex(orig_tokens)

        #! rm dupes after index, before loop
        tokens = set(orig_tokens)
        tokens = [x for x in tokens if x not in [',', '.', ';', ':', '"', "'", '[', ']']]  # this needs to be changed or rm'ed

        return c.return_concordance_all(tokens)
Example #16
    def tokenize(self, string):
        """Tokenize incoming string."""
        punkt = PunktLanguageVars()
        generic_tokens = punkt.word_tokenize(string)
        specific_tokens = []
        for generic_token in generic_tokens:
            is_enclitic = False
            if generic_token not in self.exceptions:
                for enclitic in self.enclitics:
                    if generic_token.endswith(enclitic):
                        new_tokens = [generic_token[:-len(enclitic)]] + ['-' + enclitic]
                        specific_tokens += new_tokens
                        is_enclitic = True
                        break
            if not is_enclitic:
                specific_tokens.append(generic_token)

        return specific_tokens
Example #17
    def tokenize(self, string):
        """Tokenize incoming string."""
        punkt = PunktLanguageVars()
        generic_tokens = punkt.word_tokenize(string)
        specific_tokens = []
        for generic_token in generic_tokens:
            is_enclitic = False
            if generic_token not in self.exceptions:
                for enclitic in self.enclitics:
                    if generic_token.endswith(enclitic):
                        new_tokens = [generic_token[:-len(enclitic)]
                                      ] + ['-' + enclitic]
                        specific_tokens += new_tokens
                        is_enclitic = True
                        break
            if not is_enclitic:
                specific_tokens.append(generic_token)

        return specific_tokens
Example #18
class Frequency:
    """Methods for making word frequency lists."""

    def __init__(self):
        """Language taken as argument, necessary used when saving word frequencies to
        ``cltk_data/user_data``."""
        self.punkt = PunktLanguageVars()
        self.punctuation = [',', '.', ';', ':', '"', "'", '?', '-', '!', '*', '[', ']', '{', '}']

    def counter_from_str(self, string):
        """Build word frequency list from incoming string."""
        string_list = [chars for chars in string if chars not in self.punctuation]
        string_joined = ''.join(string_list)
        tokens = self.punkt.word_tokenize(string_joined)
        return Counter(tokens)


    def counter_from_corpus(self, corpus):
        """Build word frequency list from one of several available corpora.
        TODO: Make this count iteratively, not all at once
        """
        assert corpus in ['phi5', 'tlg'], \
            "Corpus '{0}' not available. Choose from 'phi5' or 'tlg'.".format(corpus)

        all_strings = self._assemble_corpus_string(corpus=corpus)
        return self.counter_from_str(all_strings)

    def _assemble_corpus_string(self, corpus):
        """Takes a list of filepaths, returns a string containing contents of
        all files."""

        if corpus == 'phi5':
            filepaths = assemble_phi5_author_filepaths()
            file_cleaner = phi5_plaintext_cleanup
        elif corpus == 'tlg':
            filepaths = assemble_tlg_author_filepaths()
            file_cleaner = tlg_plaintext_cleanup

        for filepath in filepaths:
            with open(filepath) as file_open:
                file_read = file_open.read().lower()
            file_clean = file_cleaner(file_read)
            yield file_clean
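A hedged usage sketch of counter_from_str() above, which needs no corpus on disk; it assumes the Frequency class and its imports (Counter, PunktLanguageVars) are in scope:

freq = Frequency()
counts = freq.counter_from_str('Gallia est omnis divisa in partes tres, quarum unam incolunt Belgae.')
print(counts.most_common(3))
# Punctuation characters are stripped from the string before tokenizing, so only word tokens are counted.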
Example #19
def build_concordance(text_str: str) -> List[List[str]]:
    """
    Inherit or mimic the logic of ConcordanceIndex() at:
     http://www.nltk.org/_modules/nltk/text.html
    and/or ConcordanceSearchView() & SearchCorpus() at:
     https://github.com/nltk/nltk/blob/develop/nltk/app/concordance_app.py
    :param text_string: Text to be turned into a concordance
    :type text_string: str
    :return: list
    """
    punkt_vars = PunktLanguageVars()  # type: PunktLanguageVars
    orig_tokens = punkt_vars.word_tokenize(text_str)  # type: List[str]
    concordance_index = ConcordanceIndex(orig_tokens)  # type: Any
    #! rm dupes after index, before loop
    tokens_set = set(orig_tokens)  # type: Set[str]
    punct_list = [',', '.', ';', ':', '"', "'", '[', ']']  # type: List[str]
    # this needs to be changed or rm'ed
    tokens = [x for x in tokens_set if x not in punct_list]  # type: List[str]
    index = concordance_index.return_concordance_all(tokens)  # type: List[List[str]]
    return index
Example #20
    def _build_concordance(self, text_str):
        """
        Inherit or mimic the logic of ConcordanceIndex() at http://www.nltk.org/_modules/nltk/text.html
        and/or ConcordanceSearchView() & SearchCorpus() at https://github.com/nltk/nltk/blob/develop/nltk/app/concordance_app.py
        :param text_string: Text to be turned into a concordance
        :type text_string: str
        :return: list
        """
        p = PunktLanguageVars()
        orig_tokens = p.word_tokenize(text_str)
        c = ConcordanceIndex(orig_tokens)

        #! rm dupes after index, before loop
        tokens = set(orig_tokens)
        tokens = [
            x for x in tokens
            if x not in [',', '.', ';', ':', '"', "'", '[', ']']
        ]  # this needs to be changed or rm'ed

        return c.return_concordance_all(tokens)
Example #21
 def tokenize(self, string):
     """Tokenize incoming string."""
     punkt = PunktLanguageVars()
     generic_tokens = punkt.word_tokenize(string)
     # Rewrite as an if-else block for exceptions rather than separate list comprehensions
     generic_tokens = [x for item in generic_tokens for x in ([item] if item.lower() != 'nec' else ['c', item[:-1]])] # Handle 'nec' as a special case.
     generic_tokens = [x for item in generic_tokens for x in ([item] if item.lower() != 'sodes' else [item[0]+'i', 'audes'])] # Handle 'sodes' as a special case.
     generic_tokens = [x for item in generic_tokens for x in ([item] if item.lower() != 'sultis' else [item[0]+'i', 'vultis'])] # Handle 'sultis' as a special case.
     generic_tokens = [x for item in generic_tokens for x in ([item] if item.lower() != 'satin' else [item[:-1] + 's', 'ne'])] # Handle 'satin' as a special case.
     generic_tokens = [x for item in generic_tokens for x in ([item] if item.lower() != 'scin' else [item[:-1] + 's', 'ne'])] # Handle 'scin' as a special case.      
     specific_tokens = []
     for generic_token in generic_tokens:
         is_enclitic = False
         if generic_token not in self.exceptions:
             for enclitic in self.enclitics:
                 if generic_token.endswith(enclitic):
                     if enclitic == 'cum':
                         if generic_token.lower() in self.inclusions:
                             specific_tokens += [enclitic] + [generic_token[:-len(enclitic)]]
                         else:
                             specific_tokens += [generic_token]
                     elif enclitic == 'n':
                             specific_tokens += [generic_token[:-len(enclitic)]] + ['ne']                                                                                                    
                     elif enclitic == 'st':
                         if generic_token.endswith('ust'):
                             specific_tokens += [generic_token[:-len(enclitic)+1]] + ['est']
                         else:
                             # Does not handle 'similist', 'qualist', etc. correctly
                             specific_tokens += [generic_token[:-len(enclitic)]] + ['est']
                     else:
                         specific_tokens += [enclitic] + [generic_token[:-len(enclitic)]]
                     is_enclitic = True
                     break
         if not is_enclitic:
             specific_tokens.append(generic_token)
     return specific_tokens
Example #22
def nltk_tokenize_words(string, attached_period=False, language=None):
    """Wrap NLTK's tokenizer PunktLanguageVars(), but make final period
    its own token.

    >>> nltk_tokenize_words("Sentence 1. Sentence 2.")
    ['Sentence', '1', '.', 'Sentence', '2', '.']

    >>> #Optionally keep the NLTK's output:

    >>> nltk_tokenize_words("Sentence 1. Sentence 2.", attached_period=True)
    ['Sentence', '1.', 'Sentence', '2.']

    TODO: Run some tests to determine whether there is a large penalty for
    re-calling PunktLanguageVars() for each use of this function. If so, this
    will need to become a class, perhaps inheriting from the PunktLanguageVars
    object. Maybe integrate with WordTokenizer.
    """
    assert isinstance(string, str), "Incoming string must be type str."
    if language == 'sanskrit':
        periods = ['.', '।', '॥']
    else:
        periods = ['.']
    punkt = PunktLanguageVars()
    tokens = punkt.word_tokenize(string)
    if attached_period:
        return tokens
    new_tokens = []
    for word in tokens:
        for char in periods:
            if word.endswith(char):
                new_tokens.append(word[:-1])
                new_tokens.append(char)
                break
        else:
            new_tokens.append(word)
    return new_tokens
Example #23
def tokenize_latin_words(string):
    from cltk.tokenize.latin_exceptions import latin_exceptions

    assert isinstance(string, str), "Incoming string must be type str."

    def matchcase(word):
        # From Python Cookbook
        def replace(m):
            text = m.group()
            if text.isupper():
                return word.upper()
            elif text.islower():
                return word.lower()
            elif text[0].isupper():
                return word.capitalize()
            else:
                return word

        return replace

    replacements = [(r'mecum', 'cum me'),
                    (r'tecum', 'cum te'),
                    (r'secum', 'cum se'),
                    (r'nobiscum', 'cum nobis'),
                    (r'vobiscum', 'cum vobis'),
                    (r'quocum', 'cum quo'),
                    (r'quacum', 'cum qua'),
                    (r'quicum', 'cum qui'),
                    (r'quibuscum', 'cum quibus'),
                    (r'sodes', 'si audes'),
                    (r'satin', 'satis ne'),
                    (r'scin', 'scis ne'),
                    (r'sultis', 'si vultis'),
                    (r'similist', 'similis est'),
                    (r'qualist', 'qualis est')
                    ]

    for replacement in replacements:
        string = re.sub(replacement[0], matchcase(replacement[1]), string, flags=re.IGNORECASE)


    punkt_param = PunktParameters()
    abbreviations = ['c', 'l', 'm', 'p', 'q', 't', 'ti', 'sex', 'a', 'd', 'cn', 'sp', "m'", 'ser', 'ap', 'n', 'v', 'k', 'mam', 'post', 'f', 'oct', 'opet', 'paul', 'pro', 'sert', 'st', 'sta', 'v', 'vol', 'vop']
    punkt_param.abbrev_types = set(abbreviations)
    sent_tokenizer = PunktSentenceTokenizer(punkt_param)

    word_tokenizer = PunktLanguageVars()
    sents = sent_tokenizer.tokenize(string)

    enclitics = ['que', 'n', 'ue', 've', 'st']
    exceptions = enclitics
    exceptions = list(set(exceptions + latin_exceptions))

    tokens = []

    for sent in sents:
        temp_tokens = word_tokenizer.word_tokenize(sent)
        # Need to check that tokens exist before handling them; needed to make stream.readlines work in PlaintextCorpusReader
        
        if temp_tokens:
            if temp_tokens[0].endswith('ne'):
                if temp_tokens[0].lower() not in exceptions:
                    temp = [temp_tokens[0][:-2], '-ne']
                    temp_tokens = temp + temp_tokens[1:]

            if temp_tokens[-1].endswith('.'):
                final_word = temp_tokens[-1][:-1]
                del temp_tokens[-1]
                temp_tokens += [final_word, '.']

            for token in temp_tokens:
                tokens.append(token)

    # Break enclitic handling into own function?
    specific_tokens = []

    for token in tokens:
        is_enclitic = False
        if token.lower() not in exceptions:
            for enclitic in enclitics:
                if token.endswith(enclitic):
                    if enclitic == 'n':
                        specific_tokens += [token[:-len(enclitic)]] + ['-ne']
                    elif enclitic == 'st':
                        if token.endswith('ust'):
                            specific_tokens += [token[:-len(enclitic) + 1]] + ['est']
                        else:
                            specific_tokens += [token[:-len(enclitic)]] + ['est']
                    else:
                        specific_tokens += [token[:-len(enclitic)]] + ['-' + enclitic]
                    is_enclitic = True
                    break
        if not is_enclitic:
            specific_tokens.append(token)

    return specific_tokens
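A hedged illustration of what the function above should return: the -cum forms are rewritten before tokenizing, and the enclitic pass hyphenates -que/-ne/-ue/-ve and expands -st to 'est' (exact results can vary with the installed latin_exceptions list):

print(tokenize_latin_words('Arma virumque cano.'))
# ['Arma', 'virum', '-que', 'cano', '.']
print(tokenize_latin_words('Mecum venit.'))
# ['Cum', 'me', 'venit', '.']  -- 'Mecum' -> 'Cum me' via the replacement table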
Example #24
def tag_ner(lang, input_text, output_type=list):
    """Run NER for chosen language.

    Choosing output_type=list, returns a list of tuples:
    >>> tag_ner('latin', input_text=text_str, output_type=list)
    >>> [('ut',), ('Venus', 'Entity'), (',',), ('ut',), ('Sirius', 'Entity'),
    (',',), ('ut',), ('Spica', 'Entity')]
    """

    _check_latest_data(lang)

    assert lang in NER_DICT.keys(), \
        'Invalid language. Choose from: {}'.format(', '.join(NER_DICT.keys()))
    types = [str, list]
    assert type(input_text) in types, 'Input must be: {}.'.format(
        ', '.join(t.__name__ for t in types))
    assert output_type in types, 'Output must be a {}.'.format(
        ', '.join(t.__name__ for t in types))

    if type(input_text) == str:
        punkt = PunktLanguageVars()
        tokens = punkt.word_tokenize(input_text)
        new_tokens = []
        for word in tokens:
            if word.endswith('.'):
                new_tokens.append(word[:-1])
                new_tokens.append('.')
            else:
                new_tokens.append(word)
        input_text = new_tokens

    ner_file_path = os.path.expanduser(NER_DICT[lang])
    with open(ner_file_path, encoding='utf-8') as file_open:
        ner_str = file_open.read()
    ner_list = ner_str.split('\n')

    ner_tuple_list = []
    for count, word_token in enumerate(input_text):
        match = False
        for ner_word in ner_list:
            # the replacer slows things down, but is necessary
            if word_token == ner_word:
                ner_tuple = (word_token, 'Entity')
                ner_tuple_list.append(ner_tuple)
                match = True
                break
        if not match:
            ner_tuple_list.append((word_token, ))

    if output_type is str:
        string = ''
        for tup in ner_tuple_list:
            start_space = ' '
            final_space = ''
            # this is some mediocre string reconstitution
            # maybe not worth the effort
            if tup[0] in [',', '.', ';', ':', '?', '!']:
                start_space = ''
            if len(tup) == 2:
                string += start_space + tup[0] + '/' + tup[1] + final_space
            else:
                string += start_space + tup[0] + final_space
        return string

    return ner_tuple_list
Example #25
        # read each document
        exec("file = codecs.open(datapath+'{0}.txt','r','utf-8')".format(f))
        content = file.read()
        file.close()

        # convert to lowercase
        content = content.lower()
        # strip numbers and punctuation for the bag-of-words, bigram and trigram representations
        toker = RegexpTokenizer(r'\W+|(,.;)+|[0-9]+', gaps=True)
        nc = toker.tokenize(content)
        # keep only punctuation for the punctuation-mark representation
        tokerPunct = RegexpTokenizer(r'[^,.;!?]+', gaps=True)
        ncPunct = tokerPunct.tokenize(content)

        p = PunktLanguageVars()
        ncGreek = p.word_tokenize(content)

        # remove function words (stopwords)
        if language == 'english':
            filtered_words = [w for w in nc if w not in stopwords.words('english')]
        elif language == 'greek':
            filtered_words = [w for w in ncGreek if w not in STOPS_LIST]

        # build a frequency dictionary and count the most common items for bag of words, bigrams and trigrams
        contador = Counter(filtered_words)

        # get the most common words
        exec("{0}_mc = contador.most_common(num_common)".format(f))
        exec("{0}_str_bow = []".format(f))
        exec("{0}_num_bow = []".format(f))
        exec("for w, n in {0}_mc:\n {0}_str_bow.append(w)\n {0}_num_bow.append(n)".format(f))
Example #26
def tokenize_latin_words(string):
    """
    Tokenizer divides the string into a list of substrings

    >>> from cltk.corpus.utils.formatter import remove_non_ascii
    >>> text =  'Dices ἐστιν ἐμός pulchrum esse inimicos ulcisci.'
    >>> tokenize_latin_words(text)
    ['Dices', 'ἐστιν', 'ἐμός', 'pulchrum', 'esse', 'inimicos', 'ulcisci', '.']

    :param string: This accepts the string value that needs to be tokenized
    :returns: A list of substrings extracted from the string
    """
    from cltk.tokenize.latin_exceptions import latin_exceptions

    assert isinstance(string, str), "Incoming string must be type str."

    def matchcase(word):
        # From Python Cookbook
        def replace(m):
            text = m.group()
            if text.isupper():
                return word.upper()
            elif text.islower():
                return word.lower()
            elif text[0].isupper():
                return word.capitalize()
            else:
                return word

        return replace

    replacements = [(r'\bmecum\b', 'cum me'), (r'\btecum\b', 'cum te'),
                    (r'\bsecum\b', 'cum se'), (r'\bnobiscum\b', 'cum nobis'),
                    (r'\bvobiscum\b', 'cum vobis'), (r'\bquocum\b', 'cum quo'),
                    (r'\bquacum\b', 'cum qua'), (r'\bquicum\b', 'cum qui'),
                    (r'\bquibuscum\b', 'cum quibus'),
                    (r'\bsodes\b', 'si audes'), (r'\bsatin\b', 'satis ne'),
                    (r'\bscin\b', 'scis ne'), (r'\bsultis\b', 'si vultis'),
                    (r'\bsimilist\b', 'similis est'),
                    (r'\bqualist\b', 'qualis est')]

    for replacement in replacements:
        string = re.sub(replacement[0],
                        matchcase(replacement[1]),
                        string,
                        flags=re.IGNORECASE)

    punkt_param = PunktParameters()
    abbreviations = [
        'c', 'l', 'm', 'p', 'q', 't', 'ti', 'sex', 'a', 'd', 'cn', 'sp', "m'",
        'ser', 'ap', 'n', 'v', 'k', 'mam', 'post', 'f', 'oct', 'opet', 'paul',
        'pro', 'sert', 'st', 'sta', 'v', 'vol', 'vop'
    ]
    punkt_param.abbrev_types = set(abbreviations)
    sent_tokenizer = PunktSentenceTokenizer(punkt_param)

    word_tokenizer = PunktLanguageVars()
    sents = sent_tokenizer.tokenize(string)

    enclitics = ['que', 'n', 'ue', 've', 'st']
    exceptions = enclitics
    exceptions = list(set(exceptions + latin_exceptions))

    tokens = []

    for sent in sents:
        temp_tokens = word_tokenizer.word_tokenize(sent)
        # Need to check that tokens exist before handling them;
        # needed to make stream.readlines work in PlaintextCorpusReader

        if temp_tokens:
            if temp_tokens[0].endswith('ne') and len(temp_tokens[0]) > 2:
                if temp_tokens[0].lower() not in exceptions:
                    temp = [temp_tokens[0][:-2], '-ne']
                    temp_tokens = temp + temp_tokens[1:]

            if temp_tokens[-1].endswith('.'):
                final_word = temp_tokens[-1][:-1]
                del temp_tokens[-1]
                temp_tokens += [final_word, '.']

            for token in temp_tokens:
                tokens.append(token)

    # Break enclitic handling into own function?
    specific_tokens = []

    for token in tokens:
        is_enclitic = False
        if token.lower() not in exceptions:
            for enclitic in enclitics:
                if token.endswith(enclitic):
                    if enclitic == 'n':
                        specific_tokens += [token[:-len(enclitic)]] + ['-ne']
                    elif enclitic == 'st':
                        if token.endswith('ust'):
                            specific_tokens += [token[:-len(enclitic) + 1]
                                                ] + ['est']
                        else:
                            specific_tokens += [token[:-len(enclitic)]
                                                ] + ['est']
                    else:
                        specific_tokens += [token[:-len(enclitic)]
                                            ] + ['-' + enclitic]
                    is_enclitic = True
                    break
        if not is_enclitic:
            specific_tokens.append(token)

    return specific_tokens
Example #27
def tokenize_latin_words(string):
    from cltk.tokenize.latin_exceptions import latin_exceptions

    assert isinstance(string, str), "Incoming string must be type str."

    def matchcase(word):
        # From Python Cookbook
        def replace(m):
            text = m.group()
            if text.isupper():
                return word.upper()
            elif text.islower():
                return word.lower()
            elif text[0].isupper():
                return word.capitalize()
            else:
                return word

        return replace

    replacements = [(r'mecum', 'cum me'),
                    (r'tecum', 'cum te'),
                    (r'secum', 'cum se'),
                    (r'nobiscum', 'cum nobis'),
                    (r'vobiscum', 'cum vobis'),
                    (r'quocum', 'cum quo'),
                    (r'quacum', 'cum qua'),
                    (r'quicum', 'cum qui'),
                    (r'quibuscum', 'cum quibus'),
                    (r'sodes', 'si audes'),
                    (r'satin', 'satis ne'),
                    (r'scin', 'scis ne'),
                    (r'sultis', 'si vultis'),
                    (r'similist', 'similis est'),
                    (r'qualist', 'qualis est')
                    ]

    for replacement in replacements:
        string = re.sub(replacement[0], matchcase(replacement[1]), string, flags=re.IGNORECASE)


    punkt_param = PunktParameters()
    abbreviations = ['c', 'l', 'm', 'p', 'q', 't', 'ti', 'sex', 'a', 'd', 'cn', 'sp', "m'", 'ser', 'ap', 'n', 'v', 'k', 'mam', 'post', 'f', 'oct', 'opet', 'paul', 'pro', 'sert', 'st', 'sta', 'v', 'vol', 'vop']
    punkt_param.abbrev_types = set(abbreviations)
    sent_tokenizer = PunktSentenceTokenizer(punkt_param)

    word_tokenizer = PunktLanguageVars()
    sents = sent_tokenizer.tokenize(string)

    enclitics = ['que', 'n', 'ue', 've', 'st']
    exceptions = enclitics
    exceptions = list(set(exceptions + latin_exceptions))

    tokens = []

    for sent in sents:
        temp_tokens = word_tokenizer.word_tokenize(sent)
        if temp_tokens[0].endswith('ne'):
            if temp_tokens[0].lower() not in exceptions:
                temp = [temp_tokens[0][:-2], '-ne']
                temp_tokens = temp + temp_tokens[1:]

        if temp_tokens[-1].endswith('.'):
            final_word = temp_tokens[-1][:-1]
            del temp_tokens[-1]
            temp_tokens += [final_word, '.']

        for token in temp_tokens:
            tokens.append(token)

    # Break enclitic handling into own function?
    specific_tokens = []

    for token in tokens:
        is_enclitic = False
        if token.lower() not in exceptions:
            for enclitic in enclitics:
                if token.endswith(enclitic):
                    if enclitic == 'n':
                        specific_tokens += [token[:-len(enclitic)]] + ['-ne']
                    elif enclitic == 'st':
                        if token.endswith('ust'):
                            specific_tokens += [token[:-len(enclitic) + 1]] + ['est']
                        else:
                            specific_tokens += [token[:-len(enclitic)]] + ['est']
                    else:
                        specific_tokens += [token[:-len(enclitic)]] + ['-' + enclitic]
                    is_enclitic = True
                    break
        if not is_enclitic:
            specific_tokens.append(token)

    return specific_tokens
Example #28
def featurize(tweet):
    tweet = tweet.lower()
    tweet = url_away(tweet)
    tokens = PunktLanguageVars().word_tokenize(tweet)
    tokens = filter(lambda x: len(x) > 2, tokens)
    return tokens
Example #29
from textual_feature import sentence_tokenizers
from nltk.tokenize.punkt import PunktLanguageVars, PunktSentenceTokenizer
import re

p = PunktLanguageVars()

p._re_word_tokenizer = re.compile(
    PunktLanguageVars._word_tokenize_fmt % {
        'NonWord': r"(?:[0-9\.?!\-))\"“”‘’`··~,«»;;}\]\*\#:@&\'\(({\[])",
        'MultiChar': PunktLanguageVars._re_multi_char_punct,
        'WordStart': r"[^0-9\.?!\-))\"“”‘’`··~,«»;;}\]\*\#:@&\'\(({\[]",
    }, re.UNICODE | re.VERBOSE)

s = 'Σιδὼν ἐπὶ θαλάττῃ πόλις, ̓Ασσυρίων ἡ θάλασσα. test ""test test? test test. test test! test test. test. test test; test: test; test; test23 test45 89test. test test” test test “test test“. "test test". test test... test‘ test’. 00000test. test.test. .....test. test-test test- test. test) test test( test test. «test test» test.'
print(p.word_tokenize(s))
print()

s = '"ss ss". "ss ss." «s s. abc 123409 abc. 5ff5g s. ~ab cd~ ab~cd s.'
print(p.word_tokenize(s))
print()

# From polybius.histories.tess
s = '1συμμάχοις. ἀποδοῦναι Καρχηδονίους ̔Ρωμαίοις "1χωρὶς λύτρων ἅπαντας τοὺς αἰχμαλώτους. ἀργυ"1ρίου κατενεγκεῖν Καρχηδονίους ̔Ρωμαίοις ἐν ἔτεσιν "1εἴκοσι δισχίλια καὶ διακόσια τάλαντα Εὐβοϊκά."2'
print(p.word_tokenize(s))
print()

# From polybius.histories.tess
s = "διόπερ οὐχ ὁρῶν ποίαν ἄν τις ὀξυτέραν ἢ μείζονα λάβοι μεταβολὴν τῶν καθ' ἡμᾶς τῆς γε ̔Ρωμαίοις συμβάσης, εἰς τοῦτον ἀπεθέμην τὸν καιρὸν τὸν ὑπὲρ τῶν προειρημένων ἀπολογισμόν: γνοίη δ' ἄν τις τὸ μέγεθος τῆς μεταβολῆς ἐκ τούτων. ζήτει ἐν τῷ περὶ στρατηγίας. [εχξ. Vατ. π. 369 μαι. 24, 4 ηεψς.]"
print(p.word_tokenize(s))
print()
Example #30
from nltk.tokenize.punkt import PunktLanguageVars
from nltk.util import bigrams
from nltk.util import trigrams
from nltk.util import ngrams

#declaring a sentence and tokenizing it
s = 'Ut primum nocte discussa sol novus diem fecit, et somno simul emersus et lectulo, anxius alioquin et nimis cupidus cognoscendi quae rara miraque sunt, reputansque me media Thessaliae loca tenere qua artis magicae nativa cantamina totius orbis consono orbe celebrentur fabulamque illam optimi comitis Aristomenis de situ civitatis huius exortam, suspensus alioquin et voto simul et studio, curiose singula considerabam. Nec fuit in illa civitate quod aspiciens id esse crederem quod esset, sed omnia prorsus ferali murmure in aliam effigiem translata, ut et lapides quos offenderem de homine duratos et aves quas audirem indidem plumatas et arbores quae pomerium ambirent similiter foliatas et fontanos latices de corporibus humanis fluxos crederem; iam statuas et imagines incessuras, parietes locuturos, boves et id genus pecua dicturas praesagium, de ipso vero caelo et iubaris orbe subito venturum oraculum.'.lower(
)
p = PunktLanguageVars()
tokens = p.word_tokenize(s)
print(tokens)

#using bigrams(2 words at a time)
b = bigrams(tokens)
print([x for x in b])

#using trigrams (three words at a time)
t = trigrams(tokens)
print([x for x in t])

#using ngrams (n words at a time)
five_gram = ngrams(tokens, 5)
print([x for x in five_gram])
Example #31
File: word.py Project: j-duff/cltk
def tokenize_latin_words(string):
    """
    Tokenizer divides the string into a list of substrings
  
    >>> from cltk.corpus.utils.formatter import remove_non_ascii
    >>> text = 'Dices ἐστιν ἐμός pulchrum esse inimicos ulcisci.'
    >>> remove_non_ascii(text)
    'Dices   pulchrum esse inimicos ulcisci.'
  
    :param string: This accepts the string value that needs to be tokenized
    :returns: A list of substrings extracted from the string
    """
    from cltk.tokenize.latin_exceptions import latin_exceptions

    assert isinstance(string, str), "Incoming string must be type str."

    def matchcase(word):
        # From Python Cookbook
        def replace(m):
            text = m.group()
            if text.isupper():
                return word.upper()
            elif text.islower():
                return word.lower()
            elif text[0].isupper():
                return word.capitalize()
            else:
                return word

        return replace

    replacements = [(r'mecum', 'cum me'),
                    (r'tecum', 'cum te'),
                    (r'secum', 'cum se'),
                    (r'nobiscum', 'cum nobis'),
                    (r'vobiscum', 'cum vobis'),
                    (r'quocum', 'cum quo'),
                    (r'quacum', 'cum qua'),
                    (r'quicum', 'cum qui'),
                    (r'quibuscum', 'cum quibus'),
                    (r'sodes', 'si audes'),
                    (r'satin', 'satis ne'),
                    (r'scin', 'scis ne'),
                    (r'sultis', 'si vultis'),
                    (r'similist', 'similis est'),
                    (r'qualist', 'qualis est')
                    ]

    for replacement in replacements:
        string = re.sub(replacement[0], matchcase(replacement[1]), string, flags=re.IGNORECASE)


    punkt_param = PunktParameters()
    abbreviations = ['c', 'l', 'm', 'p', 'q', 't', 'ti', 'sex', 'a', 'd', 'cn', 'sp', "m'", 'ser', 'ap', 'n', 'v', 'k', 'mam', 'post', 'f', 'oct', 'opet', 'paul', 'pro', 'sert', 'st', 'sta', 'v', 'vol', 'vop']
    punkt_param.abbrev_types = set(abbreviations)
    sent_tokenizer = PunktSentenceTokenizer(punkt_param)

    word_tokenizer = PunktLanguageVars()
    sents = sent_tokenizer.tokenize(string)

    enclitics = ['que', 'n', 'ue', 've', 'st']
    exceptions = enclitics
    exceptions = list(set(exceptions + latin_exceptions))

    tokens = []

    for sent in sents:
        temp_tokens = word_tokenizer.word_tokenize(sent)
        # Need to check that tokens exist before handling them; needed to make stream.readlines work in PlaintextCorpusReader
        
        if temp_tokens:
            if temp_tokens[0].endswith('ne'):
                if temp_tokens[0].lower() not in exceptions:
                    temp = [temp_tokens[0][:-2], '-ne']
                    temp_tokens = temp + temp_tokens[1:]

            if temp_tokens[-1].endswith('.'):
                final_word = temp_tokens[-1][:-1]
                del temp_tokens[-1]
                temp_tokens += [final_word, '.']

            for token in temp_tokens:
                tokens.append(token)

    # Break enclitic handling into own function?
    specific_tokens = []

    for token in tokens:
        is_enclitic = False
        if token.lower() not in exceptions:
            for enclitic in enclitics:
                if token.endswith(enclitic):
                    if enclitic == 'n':
                        specific_tokens += [token[:-len(enclitic)]] + ['-ne']
                    elif enclitic == 'st':
                        if token.endswith('ust'):
                            specific_tokens += [token[:-len(enclitic) + 1]] + ['est']
                        else:
                            specific_tokens += [token[:-len(enclitic)]] + ['est']
                    else:
                        specific_tokens += [token[:-len(enclitic)]] + ['-' + enclitic]
                    is_enclitic = True
                    break
        if not is_enclitic:
            specific_tokens.append(token)

    return specific_tokens
Example #32
 def tokenize(self, string):
     """Tokenize incoming string."""
     
     def matchcase(word):
         # From Python Cookbook
         def replace(m):
             text = m.group()
             if text.isupper():
                 return word.upper()
             elif text.islower():
                 return word.lower()
             elif text[0].isupper():
                 return word.capitalize()
             else:
                 return word
         return replace
     
     replacements = [(r'mecum', 'cum me'),
             (r'tecum', 'cum te'),
             (r'secum', 'cum se'),
             (r'nobiscum', 'cum nobis'),
             (r'vobiscum', 'cum vobis'),
             (r'quocum', 'cum quo'),
             (r'quacum', 'cum qua'), 
             (r'quicum', 'cum qui'),
             (r'quibuscum', 'cum quibus'),
             (r'sodes', 'si audes'),
             (r'satin', 'satis ne'),
             (r'scin', 'scis ne'),
             (r'sultis', 'si vultis'),
             (r'similist', 'similis est'),
             (r'qualist', 'qualis est')
             ]
             
     for replacement in replacements:
         string = re.sub(replacement[0], matchcase(replacement[1]), string, flags=re.IGNORECASE)
         
     print(string)
     
     punkt = PunktLanguageVars()
     generic_tokens = punkt.word_tokenize(string)
                 
     specific_tokens = []
     for generic_token in generic_tokens:
         is_enclitic = False
         if generic_token.lower() not in self.exceptions:
             for enclitic in self.enclitics:
                 if generic_token.endswith(enclitic):
                     if enclitic == 'n':
                             specific_tokens += [generic_token[:-len(enclitic)]] + ['-ne']                                                                                                    
                     elif enclitic == 'st':
                         if generic_token.endswith('ust'):
                             specific_tokens += [generic_token[:-len(enclitic)+1]] + ['est']
                         else:
                             specific_tokens += [generic_token[:-len(enclitic)]] + ['est']
                     else:
                         specific_tokens += [generic_token[:-len(enclitic)]] + ['-' + enclitic]
                     is_enclitic = True
                     break
         if not is_enclitic:
             specific_tokens.append(generic_token)
     return specific_tokens
from nltk.tokenize.punkt import PunktLanguageVars
from cltk.tokenize.word import WordTokenizer
import re

p = PunktLanguageVars()
w = WordTokenizer('greek')

p._re_word_tokenizer = re.compile(PunktLanguageVars._word_tokenize_fmt % {
    'NonWord': r"(?:[?!.)\";;}\]\*:@\'\({\[])", #Incorporates period and greek question mark to exclude from word tokens (PunktLanguageVars._re_non_word_chars includes these in word tokens)
    'MultiChar': PunktLanguageVars._re_multi_char_punct,
    'WordStart': PunktLanguageVars._re_word_start,
}, re.UNICODE | re.VERBOSE)


s = 'test test test. test test; test test? test test test test test. test. test test. test? test test.'
assert p.word_tokenize(s) == w.tokenize(s)

d = [('tesserae/texts/grc/achilles_tatius.leucippe_et_clitophon.tess', 'feature_data/a.txt', 'feature_data/b.txt'), 
	('tesserae/texts/grc/bacchylides.epinicians.tess', 'feature_data/m.txt', 'feature_data/n.txt'), 
	('tesserae/texts/grc/polybius.histories.tess', 'feature_data/x.txt', 'feature_data/y.txt')]

for t in d:
	with open(t[0], mode='r', encoding='utf-8') as f:
		from io import StringIO
		file_text = StringIO()
		for line in f:
			#Ignore lines without tess tags, or parse the tag out and strip whitespace
			if not line.startswith('<'):
				continue
			assert '>' in line
			file_text.write(line[line.index('>') + 1:].strip())