Example #1
    def compute_features(self, tokens, drop_out=False):
        """Compute features on the provided tokens

        The *drop_out* parameter allows activating drop-out on features that
        have a positive drop-out ratio. This should only be used during
        training.
        """

        if resource_exists(self.language, STEMS):
            tokens = [
                Token(t.value, t.start, t.end,
                      stem=stem(t.normalized_value, self.language))
                for t in tokens]
        else:
            tokens = [Token(t.value, t.start, t.end, stem=t.normalized_value)
                      for t in tokens]
        cache = [{TOKEN_NAME: token} for token in tokens]
        features = []
        random_state = check_random_state(self.config.random_seed)
        for i in range(len(tokens)):
            token_features = UnupdatableDict()
            for feature in self.features:
                f_drop_out = feature.drop_out
                if drop_out and random_state.rand() < f_drop_out:
                    continue
                value = feature.compute(i, cache)
                if value is not None:
                    token_features[feature.name] = value
            features.append(token_features)
        return features
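In the loop above, a feature whose drop_out ratio is positive is randomly skipped for a given token when drop_out=True, which the docstring restricts to training. A minimal, self-contained sketch of that pattern, using a hypothetical StubFeature class and a plain numpy RandomState in place of snips-nlu's Feature objects and check_random_state helper:

import numpy as np


class StubFeature:
    """Hypothetical stand-in for a snips-nlu CRF feature."""

    def __init__(self, name, drop_out):
        self.name = name
        self.drop_out = drop_out

    def compute(self, token_index, cache):
        return "value_for_token_%d" % token_index


features = [StubFeature("is_digit", 0.0), StubFeature("word_cluster", 0.5)]
random_state = np.random.RandomState(42)  # seeded, hence reproducible

token_features = {}
for feature in features:
    # A feature with a positive drop-out ratio is randomly skipped for this
    # token; features with a ratio of 0.0 are always kept.
    if random_state.rand() < feature.drop_out:
        continue
    value = feature.compute(0, cache=[])
    if value is not None:
        token_features[feature.name] = value
print(token_features)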
Example #2
def _normalize_stem(text, language):
    normalized_stemmed = normalize(text)
    try:
        normalized_stemmed = stem(normalized_stemmed, language)
    except MissingResource:
        pass
    return normalized_stemmed
Example #3
def _normalize_stem(text, language):
    normalized_stemmed = normalize(text)
    try:
        normalized_stemmed = stem(normalized_stemmed, language)
    except UnknownResource:
        pass
    return normalized_stemmed
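The two variants above differ only in the exception they catch; both fall back to the normalized text when no stemming resource is available for the language. A small self-contained sketch of that fallback, with hypothetical stand-ins for normalize, stem and the missing-resource exception:

class MissingResource(LookupError):
    """Stand-in for the exception raised when no stems exist for a language."""


def normalize(text):
    # Simplified stand-in for snips_nlu_utils.normalize
    return text.strip().lower()


def stem(text, language):
    stems = {"en": {"beautiful": "beauti", "days": "day"}}
    if language not in stems:
        raise MissingResource(language)
    return " ".join(stems[language].get(w, w) for w in text.split())


def _normalize_stem(text, language):
    normalized_stemmed = normalize(text)
    try:
        normalized_stemmed = stem(normalized_stemmed, language)
    except MissingResource:
        pass  # keep the normalized text when stemming is unavailable
    return normalized_stemmed


print(_normalize_stem("Beautiful days", "en"))  # beauti day
print(_normalize_stem("Beautiful days", "fr"))  # beautiful days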
Example #4
def _stem_entity_utterances(entity_utterances, language, resources):
    values = dict()
    # Sort by resolved value, so that values conflict in a deterministic way
    for raw_value, resolved_value in sorted(iteritems(entity_utterances),
                                            key=operator.itemgetter(1)):
        stemmed_value = stem(raw_value, language, resources)
        if stemmed_value not in values:
            values[stemmed_value] = resolved_value
    return values
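The sorting above matters when two raw values stem to the same string but resolve to different values: iterating in sorted order of the resolved value makes the surviving mapping deterministic, regardless of the input dict's iteration order. A small sketch with a hypothetical stemmer illustrating such a collision:

import operator


def stem(raw_value, language, resources):
    # Hypothetical stemmer collapsing both spellings to the same stem
    collisions = {"favourite colour": "favorit color",
                  "favorite color": "favorit color"}
    return collisions.get(raw_value, raw_value)


entity_utterances = {"favorite color": "color_us", "favourite colour": "color_uk"}

values = dict()
for raw_value, resolved_value in sorted(entity_utterances.items(),
                                        key=operator.itemgetter(1)):
    stemmed_value = stem(raw_value, "en", resources=None)
    if stemmed_value not in values:
        values[stemmed_value] = resolved_value

print(values)  # {'favorit color': 'color_uk'}: the smallest resolved value wins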
Example #5
 def language(self, value):
     if value is not None:
         self._language = value
         self.args["language_code"] = self.language
         if self.common_words_gazetteer_name is not None:
             gazetteer = get_gazetteer(self.language,
                                       self.common_words_gazetteer_name)
             if self.use_stemming:
                 gazetteer = set(stem(w, self.language) for w in gazetteer)
             self.gazetteer = gazetteer
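Stemming the gazetteer at the time the language is set keeps it consistent with later lookups: if tokens are stemmed when features are computed, membership tests against the gazetteer can only match if its entries were stemmed with the same function. A tiny sketch with a hypothetical stem helper:

def stem(word, language):
    # Hypothetical stemmer
    return {"running": "run", "cats": "cat"}.get(word, word)


gazetteer = {"running", "cats", "the"}
use_stemming = True
if use_stemming:
    gazetteer = set(stem(w, "en") for w in gazetteer)

token = "cats"
print(stem(token, "en") in gazetteer)  # True: both sides use the same stemmer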
Example #6
def _normalize_stem(text, language, use_stemming):
    if use_stemming:
        return stem(text, language)
    return normalize(text)
Example #7
def _normalize_stem(text, language, resources, use_stemming):
    if use_stemming:
        return stem(text, language, resources)
    return normalize(text)
Example #8
 def preprocess(string):
     normalized = normalize(string)
     if resource_exists(self.language, STEMS) and self.use_stemming:
         return stem(normalized, self.language)
     return normalized
Example #9
 def preprocess(string):
     normalized = normalize(string)
     return stem(normalized, self.language) if self.use_stemming \
         else normalized
Example #10
def _stem_entity_utterances(entity_utterances, language):
    return {
        stem(raw_value, language): resolved_value
        for raw_value, resolved_value in iteritems(entity_utterances)
    }
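Unlike the sorted loop in Example #4, this dict comprehension has no tie-breaking: when two raw values share a stem, whichever one the dict yields last silently wins. A short sketch of that behavior with the same hypothetical collision:

def stem(raw_value, language):
    # Hypothetical stemmer collapsing both spellings to the same stem
    return {"favourite colour": "favorit color",
            "favorite color": "favorit color"}.get(raw_value, raw_value)


entity_utterances = {"favourite colour": "color_uk", "favorite color": "color_us"}
stemmed = {
    stem(raw_value, "en"): resolved_value
    for raw_value, resolved_value in entity_utterances.items()
}
print(stemmed)  # {'favorit color': 'color_us'}: the last colliding entry wins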
Example #11
def _normalize_stem(text, language, resources, use_stemming):
    from snips_nlu_utils import normalize

    if use_stemming:
        return stem(text, language, resources)
    return normalize(text)
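The function-local import in the variant above means snips_nlu_utils is only loaded when the helper is actually called, a common Python pattern for optional or heavyweight dependencies. A generic sketch of the same pattern using only the standard library:

def _normalize(text):
    # Deferred import: the module is only imported when the function runs
    from unicodedata import normalize as unicode_normalize
    return unicode_normalize("NFKC", text)


print(_normalize("ﬁle"))  # "file": the ligature is expanded by NFKC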
Example #12
 def preprocess(string):
     normalized = normalize(string)
     if self.use_stemming:
         return stem(normalized, self.language)
     return normalized