def compute_feature(self, tokens, token_index):
    # Joins the shapes of the n tokens starting at token_index into a
    # single n-gram shape feature.
    max_len = len(tokens)
    end = token_index + self.n
    if 0 <= token_index < max_len and end <= max_len:
        return get_default_sep(self.language).join(
            get_shape(t.value) for t in tokens[token_index:end])
    return None

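# Minimal sketch of the shape n-gram feature on a toy input. The Token
# type and the "Xxx"/"xxx"/"XXX"/"xX" shape convention below are
# assumptions made for illustration, not taken from the snippet above.
from collections import namedtuple

Token = namedtuple("Token", ["value"])

def get_shape(value):
    # Hypothetical shape function: lowercase -> "xxx", uppercase -> "XXX",
    # title case -> "Xxx", anything else -> "xX"
    if value.islower():
        return "xxx"
    if value.isupper():
        return "XXX"
    if value.istitle():
        return "Xxx"
    return "xX"

tokens = [Token("Hello"), Token("NLU"), Token("world")]
# A 2-gram starting at index 0 would give "Xxx XXX", assuming the default
# separator for the language is a single space.
print(" ".join(get_shape(t.value) for t in tokens[0:2]))  # Xxx XXX
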
def _enrich_utterance(self, utterance, builtin_entities, custom_entities,
                      word_clusters):
    custom_entities_features = [
        _entity_name_to_feature(e[ENTITY_KIND], self.language)
        for e in custom_entities]
    builtin_entities_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], self.language)
        for ent in builtin_entities
    ]

    # We remove values of builtin slots from the utterance to avoid
    # learning specific samples such as '42' or 'tomorrow'
    filtered_tokens = [
        chunk[TEXT] for chunk in utterance[DATA]
        if ENTITY not in chunk or not is_builtin_entity(chunk[ENTITY])
    ]

    features = get_default_sep(self.language).join(filtered_tokens)

    if builtin_entities_features:
        features += " " + " ".join(sorted(builtin_entities_features))
    if custom_entities_features:
        features += " " + " ".join(sorted(custom_entities_features))
    if word_clusters:
        features += " " + " ".join(sorted(word_clusters))

    return features

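# Sketch of the enrichment on a toy utterance; the chunk layout, entity
# labels and the feature-name helper below are illustrative assumptions,
# not values taken from the snippet above.
def _entity_name_to_feature(name):  # hypothetical stand-in
    return "entityfeature" + name.replace("/", "")

utterance_data = [
    {"text": "book a table for "},
    {"text": "2", "entity": "snips/number"},        # builtin slot value
    {"text": " people at "},
    {"text": "Le Bistro", "entity": "restaurant"},  # custom slot value
]

def is_builtin(entity):
    return entity.startswith("snips/")

# Builtin slot values are dropped so the classifier does not memorize
# samples like "2"; custom slot text is kept.
filtered = [c["text"] for c in utterance_data
            if "entity" not in c or not is_builtin(c["entity"])]
features = " ".join("".join(filtered).split())
features += " " + _entity_name_to_feature("snips/number")
print(features)
# book a table for people at Le Bistro entityfeaturesnipsnumber
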
def compute_feature(self, tokens, token_index):
    # n-gram feature over stemmed or normalized token values, with
    # out-of-gazetteer words collapsed to a "rare_word" placeholder.
    max_len = len(tokens)
    end = token_index + self.n
    if 0 <= token_index < max_len and end <= max_len:
        if self.gazetteer is None:
            if self.use_stemming:
                return get_default_sep(self.language).join(
                    t.stem for t in tokens[token_index:end])
            return get_default_sep(self.language).join(
                t.normalized_value for t in tokens[token_index:end])
        words = []
        for t in tokens[token_index:end]:
            normalized = (t.stem if self.use_stemming
                          else t.normalized_value)
            words.append(normalized if normalized in self.gazetteer
                         else "rare_word")
        return get_default_sep(self.language).join(words)
    return None

def compute_feature(self, tokens, token_index):
    # Variant of the n-gram feature that stems/normalizes tokens on the
    # fly instead of reading precomputed token attributes.
    max_len = len(tokens)
    end = token_index + self.n
    if 0 <= token_index < max_len and end <= max_len:
        if self.gazetteer is None:
            if self.use_stemming:
                stems = (stem_token(t, self.language)
                         for t in tokens[token_index:end])
                return get_default_sep(self.language).join(stems)
            normalized_values = (normalize_token(t)
                                 for t in tokens[token_index:end])
            return get_default_sep(self.language).join(normalized_values)
        words = []
        for t in tokens[token_index:end]:
            if self.use_stemming:
                value = stem_token(t, self.language)
            else:
                value = normalize_token(t)
            words.append(value if value in self.gazetteer else "rare_word")
        return get_default_sep(self.language).join(words)
    return None

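# Toy illustration of the gazetteer filtering used in both variants above;
# the gazetteer contents and tokens are made up for the example.
gazetteer = {"table", "book"}
normalized = ["book", "a", "table"]
words = [w if w in gazetteer else "rare_word" for w in normalized]
print(" ".join(words))  # book rare_word table
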
def _preprocess_query(query, language, entity_utterances_to_features_names):
    # Builds the classifier input string: normalized/stemmed tokens
    # followed by sorted entity and word-cluster features.
    query_tokens = tokenize_light(query, language)
    word_clusters_features = _get_word_cluster_features(query_tokens, language)
    normalized_stemmed_tokens = [_normalize_stem(t, language)
                                 for t in query_tokens]
    entities_features = _get_dataset_entities_features(
        normalized_stemmed_tokens, entity_utterances_to_features_names)

    features = get_default_sep(language).join(normalized_stemmed_tokens)
    if entities_features:
        features += " " + " ".join(sorted(entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))
    return features

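# Note on the sorted() calls above: the extra features are sorted before
# being appended so that the same *set* of entity/cluster features always
# yields the same feature string. A sketch with made-up feature names:
extras = {"clusterfeature_brown_110", "entityfeature_city"}
print(" " + " ".join(sorted(extras)))
#  clusterfeature_brown_110 entityfeature_city
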
def _load_gazetteers(gazetteers_path, language):
    # Loads each gazetteer file into a set of normalized, re-tokenized
    # entries, keyed by file name.
    if not gazetteers_path.is_dir():
        return dict()

    gazetteers = dict()
    for filepath in gazetteers_path.iterdir():
        gazetteer_name = filepath.stem
        with filepath.open(encoding="utf8") as f:
            gazetteers[gazetteer_name] = set()
            for line in f:
                normalized = normalize(line.strip())
                if normalized:
                    token_values = (t.value
                                    for t in tokenize(normalized, language))
                    normalized = get_default_sep(language).join(token_values)
                    gazetteers[gazetteer_name].add(normalized)
    return gazetteers

def _load_gazetteers(language):
    # Older variant resolving gazetteer files through the resource index
    # rather than taking a directory path.
    gazetteers_paths = {
        os.path.splitext(name)[0]: os.path.join(
            get_resources_path(language), name)
        for name in RESOURCE_INDEX[language].get(GAZETTEERS, [])
    }
    gazetteers = dict()
    for name, path in iteritems(gazetteers_paths):
        with io.open(path, encoding="utf8") as f:
            gazetteers[name] = set()
            for l in f:
                normalized = normalize(l.strip())
                if normalized:
                    token_values = (t.value
                                    for t in tokenize(normalized, language))
                    normalized = get_default_sep(language).join(token_values)
                    gazetteers[name].add(normalized)
    return gazetteers

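# Sketch of the normalize-then-retokenize round trip applied to each
# gazetteer line in both loaders above; normalize() and the tokenizer are
# stubbed out here and are assumptions for this example.
def normalize(s):  # stand-in for the real helper
    return s.lower()

line = "  New-York \n"
normalized = normalize(line.strip())  # "new-york"
# Re-joining the token values with the default separator canonicalizes
# spacing/punctuation, e.g. "new-york" -> "new york" if the tokenizer
# splits on hyphens (an assumption for this sketch).
tokens = normalized.replace("-", " ").split()
print(" ".join(tokens))  # new york
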
def _preprocess_utterance(utterance, language, builtin_entity_parser,
                          custom_entity_parser, word_clusters_name,
                          use_stemming, unknownword_replacement_string):
    utterance_text = get_text_from_chunks(utterance[DATA])
    utterance_tokens = tokenize_light(utterance_text, language)
    word_clusters_features = _get_word_cluster_features(
        utterance_tokens, word_clusters_name, language)
    normalized_stemmed_tokens = [
        _normalize_stem(t, language, use_stemming)
        for t in utterance_tokens]

    custom_entities = custom_entity_parser.parse(
        " ".join(normalized_stemmed_tokens))
    custom_entities = [e for e in custom_entities
                       if e["value"] != unknownword_replacement_string]
    custom_entities_features = [
        _entity_name_to_feature(e[ENTITY_KIND], language)
        for e in custom_entities]

    builtin_entities = builtin_entity_parser.parse(
        utterance_text, use_cache=True)
    builtin_entities_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], language)
        for ent in builtin_entities
    ]

    # We remove values of builtin slots from the utterance to avoid learning
    # specific samples such as '42' or 'tomorrow'
    filtered_normalized_stemmed_tokens = [
        _normalize_stem(chunk[TEXT], language, use_stemming)
        for chunk in utterance[DATA]
        if ENTITY not in chunk or not is_builtin_entity(chunk[ENTITY])
    ]

    features = get_default_sep(language).join(
        filtered_normalized_stemmed_tokens)
    if builtin_entities_features:
        features += " " + " ".join(sorted(builtin_entities_features))
    if custom_entities_features:
        features += " " + " ".join(sorted(custom_entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))
    return features

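# Sketch of the unknown-word filtering step above: custom entity matches
# whose value equals the replacement placeholder are discarded so the
# placeholder itself never becomes an entity feature. All names here are
# made up for the example.
unknownword_replacement_string = "unknownword"
custom_entities = [
    {"value": "le bistro", "entity_kind": "restaurant"},
    {"value": "unknownword", "entity_kind": "restaurant"},  # dropped
]
custom_entities = [e for e in custom_entities
                   if e["value"] != unknownword_replacement_string]
print([e["value"] for e in custom_entities])  # ['le bistro']
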
def get_string_variations(string, language):
    variations = {string}
    variations.update(flatten(case_variations(v) for v in variations))
    variations.update(
        flatten(normalization_variations(v) for v in variations))
    # We re-generate case variations as normalization can produce new
    # variations
    variations.update(flatten(case_variations(v) for v in variations))
    variations.update(flatten(and_variations(v, language) for v in variations))
    variations.update(
        flatten(punctuation_variations(v, language) for v in variations))
    variations.update(
        flatten(numbers_variations(v, language) for v in variations))
    # Add single space variations
    single_space_variations = set(" ".join(v.split()) for v in variations)
    variations.update(single_space_variations)
    # Add tokenized variations
    tokenized_variations = set(
        get_default_sep(language).join(tokenize_light(v, language))
        for v in variations)
    variations.update(tokenized_variations)
    return variations

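# Minimal sketch of the accumulate-and-expand pattern used above: each
# pass feeds on all variations produced so far. flatten() and the
# variation function below are stand-ins for the real helpers.
from itertools import chain

def flatten(iterable_of_iterables):
    return chain.from_iterable(iterable_of_iterables)

def case_variations(s):  # hypothetical variation function
    return {s.lower(), s.upper(), s.title()}

variations = {"New York"}
variations.update(flatten(case_variations(v) for v in variations))
print(sorted(variations))
# ['NEW YORK', 'New York', 'new york']
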
def _preprocess_utterance(utterance, language,
                          entity_utterances_to_features_names,
                          word_clusters_name):
    utterance_tokens = tokenize_light(utterance, language)
    word_clusters_features = _get_word_cluster_features(
        utterance_tokens, word_clusters_name, language)
    normalized_stemmed_tokens = [_normalize_stem(t, language)
                                 for t in utterance_tokens]
    entities_features = _get_dataset_entities_features(
        normalized_stemmed_tokens, entity_utterances_to_features_names)

    builtin_entities = get_builtin_entities(utterance, language)
    entities_ranges = (
        e[RES_MATCH_RANGE] for e in
        sorted(builtin_entities, key=lambda e: e[RES_MATCH_RANGE][START]))
    builtin_entities_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], language)
        for ent in builtin_entities
    ]

    # We remove builtin entities from the utterance to avoid learning
    # specific examples such as '42'
    filtered_utterance = _remove_ranges(utterance, entities_ranges)
    filtered_utterance_tokens = tokenize_light(filtered_utterance, language)
    filtered_normalized_stemmed_tokens = [
        _normalize_stem(t, language) for t in filtered_utterance_tokens]

    features = get_default_sep(language).join(
        filtered_normalized_stemmed_tokens)
    if builtin_entities_features:
        features += " " + " ".join(sorted(builtin_entities_features))
    if entities_features:
        features += " " + " ".join(sorted(entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))
    return features

def _preprocess_utterance(utterance, language,
                          entity_utterances_to_features_names,
                          word_clusters_name):
    utterance_text = get_text_from_chunks(utterance[DATA])
    utterance_tokens = tokenize_light(utterance_text, language)
    word_clusters_features = _get_word_cluster_features(
        utterance_tokens, word_clusters_name, language)
    normalized_stemmed_tokens = [_normalize_stem(t, language)
                                 for t in utterance_tokens]
    entities_features = _get_dataset_entities_features(
        normalized_stemmed_tokens, entity_utterances_to_features_names)

    builtin_entities = get_builtin_entities(utterance_text, language,
                                            use_cache=True)
    builtin_entities_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], language)
        for ent in builtin_entities
    ]

    # We remove values of builtin slots from the utterance to avoid learning
    # specific samples such as '42' or 'tomorrow'
    filtered_normalized_stemmed_tokens = [
        _normalize_stem(chunk[TEXT], language)
        for chunk in utterance[DATA]
        if ENTITY not in chunk or not is_builtin_entity(chunk[ENTITY])
    ]

    features = get_default_sep(language).join(
        filtered_normalized_stemmed_tokens)
    if builtin_entities_features:
        features += " " + " ".join(sorted(builtin_entities_features))
    if entities_features:
        features += " " + " ".join(sorted(entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))
    return features

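# Sketch contrasting the two builtin-slot removal strategies above: the
# older variant cuts character ranges out of the raw string, the newer one
# simply skips builtin chunks. _remove_ranges is stubbed here with tuple
# ranges purely for illustration.
def _remove_ranges(text, ranges):
    # Remove [start, end) spans, iterating right-to-left so earlier
    # offsets stay valid.
    for start, end in sorted(ranges, reverse=True):
        text = text[:start] + text[end:]
    return text

print(repr(_remove_ranges("meet me at 9 tomorrow", [(11, 13), (13, 21)])))
# 'meet me at '
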
def stem_function(text, language):
    # Stems each token and re-joins with the language's default separator.
    return get_default_sep(language).join(
        [_stem(t) for t in tokenize_light(text, language)])

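# Toy run of the tokenize-stem-rejoin pipeline; the stemmer here is a
# crude stand-in for the real _stem helper.
def _stem(word):
    return word[:-1] if word.endswith("s") else word

print(" ".join(_stem(t) for t in "lights cameras action".split()))
# light camera action
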
def capitalize(text, language):
    # Title-cases every token except stop words, which stay lowercase.
    tokens = tokenize_light(text, language)
    stop_words = get_stop_words(language)
    return get_default_sep(language).join(
        t.title() if t.lower() not in stop_words else t.lower()
        for t in tokens)

# NB: this snippet starts mid-function; the signature and the first
# case-variations pass below are reconstructed from the flags the body
# uses and may differ from the original source.
def get_string_variations(string, language, builtin_entity_parser,
                          numbers=True, case=True, and_=True,
                          punctuation=True):
    variations = {string}
    if case:
        variations.update(flatten(case_variations(v) for v in variations))
    variations.update(
        flatten(normalization_variations(v) for v in variations))
    # We re-generate case variations as normalization can produce new
    # variations
    if case:
        variations.update(flatten(case_variations(v) for v in variations))
    if and_:
        variations.update(
            flatten(and_variations(v, language) for v in variations))
    if punctuation:
        variations.update(
            flatten(punctuation_variations(v, language) for v in variations))
    # Special case for number variations, which are slow to generate
    # because the BuiltinEntityParser runs on each variation
    if numbers:
        variations.update(
            flatten(
                numbers_variations(v, language, builtin_entity_parser)
                for v in variations))
    # Add single space variations
    single_space_variations = set(" ".join(v.split()) for v in variations)
    variations.update(single_space_variations)
    # Add tokenized variations
    tokenized_variations = set(
        get_default_sep(language).join(tokenize_light(v, language))
        for v in variations)
    variations.update(tokenized_variations)
    return variations

def capitalize(text, language, resources):
    # Newer variant reading stop words from a resources object instead of
    # resolving them from the language alone.
    tokens = tokenize_light(text, language)
    stop_words = get_stop_words(resources)
    return get_default_sep(language).join(
        t.title() if t.lower() not in stop_words else t.lower()
        for t in tokens)
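
# Toy run of the stop-word-aware capitalization; the tokenizer and the
# stop-word set are stand-ins for the real helpers.
stop_words = {"the", "of", "at"}
text = "the wizard of oz"
print(" ".join(t.title() if t.lower() not in stop_words else t.lower()
               for t in text.split()))
# the Wizard of Oz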