Example #1
def compute_feature(self, tokens, token_index):
    # Stem or normalize the token, then look it up in the word-cluster
    # mapping for the current language; returns the cluster id or None.
    if self.use_stemming:
        value = stem_token(tokens[token_index], self.language)
    else:
        value = normalize_token(tokens[token_index])
    cluster = get_word_clusters(self.language)[self.cluster_name]
    return cluster.get(value, None)
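This feature relies on snips-nlu helpers (stem_token, normalize_token, get_word_clusters). Below is a minimal, self-contained sketch of the same lookup idea; the WORD_CLUSTERS mapping, the normalize helper and word_cluster_feature are hypothetical stand-ins, not the library's API.

# Hypothetical, simplified word-cluster feature: a token is normalized
# and mapped to a cluster id, unknown words yield None.
WORD_CLUSTERS = {
    "paris": "cluster_42",
    "london": "cluster_42",
    "pizza": "cluster_7",
}

def normalize(token_value):
    # Stand-in for normalize_token: plain lowercasing.
    return token_value.lower()

def word_cluster_feature(token_values, token_index):
    value = normalize(token_values[token_index])
    # Same fallback as cluster.get(value, None) above.
    return WORD_CLUSTERS.get(value)

print(word_cluster_feature(["Book", "a", "table", "in", "Paris"], 4))  # cluster_42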
Example #2
def compute_feature(self, tokens, token_index):
    # Build the n-gram of length self.n starting at token_index.
    max_len = len(tokens)
    end = token_index + self.n
    if 0 <= token_index < max_len and end <= max_len:
        if self.gazetteer is None:
            # No gazetteer: join the stemmed or normalized tokens directly.
            if self.use_stemming:
                stems = (stem_token(t, self.language)
                         for t in tokens[token_index:end])
                return get_default_sep(self.language).join(stems)
            normalized_values = (normalize_token(t)
                                 for t in tokens[token_index:end])
            return get_default_sep(self.language).join(normalized_values)
        # With a gazetteer: replace out-of-vocabulary tokens by "rare_word".
        words = []
        for t in tokens[token_index:end]:
            if self.use_stemming:
                value = stem_token(t, self.language)
            else:
                value = normalize_token(t)
            words.append(value if value in self.gazetteer else "rare_word")
        return get_default_sep(self.language).join(words)
    # The n-gram window falls outside the utterance.
    return None
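A self-contained sketch of the same n-gram logic, with a plain space separator and a hypothetical gazetteer and normalizer standing in for get_default_sep, self.gazetteer and normalize_token:

# Simplified n-gram feature: join n normalized tokens, replacing words
# outside the gazetteer by "rare_word" (as in Example #2).
GAZETTEER = {"book", "table", "restaurant"}

def ngram_feature(token_values, token_index, n, gazetteer=None):
    end = token_index + n
    if not (0 <= token_index < len(token_values) and end <= len(token_values)):
        return None  # window falls outside the utterance
    values = [t.lower() for t in token_values[token_index:end]]
    if gazetteer is not None:
        values = [v if v in gazetteer else "rare_word" for v in values]
    return " ".join(values)

print(ngram_feature(["Book", "a", "table", "for", "two"], 0, 3))
# "book a table"
print(ngram_feature(["Book", "a", "table", "for", "two"], 0, 3, GAZETTEER))
# "book rare_word table"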
Example #3
def _transform(self, tokens):
    # Stem or normalize each token, then rebuild Token objects with
    # start/end offsets recomputed as if the values were space-separated.
    if self.use_stemming:
        light_tokens = (stem_token(t, self.language) for t in tokens)
    else:
        light_tokens = (normalize_token(t) for t in tokens)
    current_index = 0
    transformed_tokens = []
    for light_token in light_tokens:
        transformed_token = Token(value=light_token,
                                  start=current_index,
                                  end=current_index + len(light_token))
        transformed_tokens.append(transformed_token)
        current_index = transformed_token.end + 1
    return transformed_tokens

def _preprocess_text(self, string):
    """Replaces stop words and characters that are tokenized out with
    whitespace, preserving the original character offsets."""
    tokens = tokenize(string, self.language)
    current_idx = 0
    cleaned_string = ""
    for token in tokens:
        # Blank out stop words with spaces of the same length.
        if self.stop_words and normalize_token(token) in self.stop_words:
            token.value = " " * len(token.value)
        # Re-insert spaces for any characters dropped by the tokenizer.
        prefix_length = token.start - current_idx
        cleaned_string += " " * prefix_length
        cleaned_string += token.value
        current_idx = token.end
    # Pad the end so the result has the same length as the input.
    suffix_length = len(string) - current_idx
    cleaned_string += " " * suffix_length
    return cleaned_string
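The key property of _preprocess_text is that the cleaned string keeps the same length and character offsets as the input. Here is a small self-contained sketch of that idea, using a trivial regex tokenizer and a hypothetical stop-word set instead of the library's tokenize and resources:

import re

STOP_WORDS = {"the", "a", "an"}   # hypothetical stop-word set

def preprocess(text):
    # Start from an all-whitespace string of the same length, then copy
    # back every token that is not a stop word at its original offsets.
    cleaned = list(" " * len(text))
    for match in re.finditer(r"\w+", text):
        if match.group().lower() not in STOP_WORDS:
            cleaned[match.start():match.end()] = match.group()
    return "".join(cleaned)

text = "Book a table at the bistro!"
print(repr(preprocess(text)))
print(len(preprocess(text)) == len(text))  # True: offsets are preserved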
Example #5
def compute_feature(self, tokens, token_index):
    # Suffix feature: the last self.suffix_size characters of the
    # normalized token.
    return get_word_chunk(normalize_token(tokens[token_index]),
                          self.suffix_size,
                          len(tokens[token_index].value),
                          reverse=True)
Example #6
def compute_feature(self, tokens, token_index):
    # Prefix feature: the first self.prefix_size characters of the
    # normalized token.
    return get_word_chunk(normalize_token(tokens[token_index]),
                          self.prefix_size, 0)
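Examples #5 and #6 both slice a chunk out of the normalized token with get_word_chunk. A hypothetical stand-in with the same spirit (an assumption, not the library's exact behavior) could look like this:

def word_chunk(word, chunk_size, chunk_start, reverse=False):
    # Hypothetical stand-in for get_word_chunk: take chunk_size characters
    # ending at chunk_start when reverse=True (a suffix), or starting at
    # chunk_start otherwise (a prefix). Returns None if the word is too short.
    if chunk_size > len(word):
        return None
    if reverse:
        return word[chunk_start - chunk_size:chunk_start]
    return word[chunk_start:chunk_start + chunk_size]

print(word_chunk("bistro", 3, 0))                             # "bis" (prefix, Example #6)
print(word_chunk("bistro", 3, len("bistro"), reverse=True))   # "tro" (suffix, Example #5)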
Example #7
def compute_feature(self, tokens, token_index):
    # Same cluster lookup as Example #1, but the cluster mapping is held
    # directly on the feature and stemming uses shared resources.
    if self.use_stemming:
        value = stem_token(tokens[token_index], self.resources)
    else:
        value = normalize_token(tokens[token_index])
    return self.cluster.get(value, None)
Example #8
def _transform(self, token):
    # Reduce a single token to its stem or to its normalized value.
    if self.use_stemming:
        return stem_token(token, self.language)
    return normalize_token(token)
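stem_token and normalize_token appear throughout the examples above. A rough illustration of the difference, with a naive suffix-stripping stemmer standing in for the library's language-specific stemmer:

def normalize(value):
    # Stand-in for normalize_token: lowercase the surface form.
    return value.lower()

def stem(value):
    # Very rough stand-in for stem_token: lowercase, then strip a few
    # common English suffixes (real stemmers are language-specific).
    value = value.lower()
    for suffix in ("ing", "ed", "s"):
        if value.endswith(suffix) and len(value) > len(suffix) + 2:
            return value[:-len(suffix)]
    return value

print(normalize("Booking"))  # "booking"
print(stem("Booking"))       # "book"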