def compute_feature(self, tokens, token_index):
    """Look up the word cluster of the token at ``token_index``.

    The token is first turned into a lookup key: it is stemmed with
    ``stem_token`` when ``self.use_stemming`` is truthy, and normalized
    with ``normalize_token`` otherwise.

    Args:
        tokens: indexable collection of tokens.
        token_index: position in ``tokens`` of the token to look up.

    Returns:
        The entry stored for the key in the ``self.cluster_name`` mapping
        of ``get_word_clusters(self.language)``, or None when the key has
        no entry.
    """
    token = tokens[token_index]
    key = (stem_token(token, self.language) if self.use_stemming
           else normalize_token(token))
    clusters_by_name = get_word_clusters(self.language)
    return clusters_by_name[self.cluster_name].get(key)
def _get_word_cluster_features(query_tokens, clusters_name, language):
    """Collect the word-cluster entries matching the n-grams of a query.

    Args:
        query_tokens: tokens of the query, passed as-is to
            ``get_all_ngrams``.
        clusters_name: key selecting one mapping among those returned by
            ``get_word_clusters(language)``. A falsy value disables the
            feature and yields an empty list.
        language: language passed to ``get_word_clusters``.

    Returns:
        list: the entries found for the lowercased n-grams, in n-gram
        order. N-grams without an entry are skipped.
    """
    if not clusters_name:
        return []

    ngrams = get_all_ngrams(query_tokens)
    if not ngrams:
        # No n-grams means no lookups, so the cluster resources are not
        # needed at all.
        return []

    # The selected mapping does not depend on the n-gram: resolve it once
    # instead of on every iteration.
    clusters = get_word_clusters(language)[clusters_name]
    lookups = (clusters.get(ngram[NGRAM].lower()) for ngram in ngrams)
    return [cluster for cluster in lookups if cluster is not None]
def _get_word_cluster_features(query_tokens, language):
    """Collect the word-cluster entries matching the n-grams of a query.

    The mapping to use is the one registered for ``language`` in
    ``CLUSTER_USED_PER_LANGUAGES``; when there is none (or it is falsy),
    the feature is disabled and an empty list is returned.

    Args:
        query_tokens: tokens of the query, passed as-is to
            ``get_all_ngrams``.
        language: language used both to pick the cluster name and to fetch
            the mappings with ``get_word_clusters``.

    Returns:
        list: the entries found for the lowercased n-grams, in n-gram
        order. N-grams without an entry are skipped.
    """
    cluster_name = CLUSTER_USED_PER_LANGUAGES.get(language)
    if not cluster_name:
        return []

    ngrams = get_all_ngrams(query_tokens)
    if not ngrams:
        # No n-grams means no lookups, so the cluster resources are not
        # needed at all.
        return []

    # The selected mapping does not depend on the n-gram: resolve it once
    # instead of on every iteration.
    clusters = get_word_clusters(language)[cluster_name]
    lookups = (clusters.get(ngram[NGRAM].lower()) for ngram in ngrams)
    return [cluster for cluster in lookups if cluster is not None]
def language(self, value):
    """Set the language and refresh the state that depends on it.

    A ``value`` of None is ignored and leaves the object untouched.
    Otherwise ``self._language`` is updated, ``self.cluster`` is reloaded
    from ``get_word_clusters`` for the ``self.cluster_name`` mapping, and
    ``self.args["language_code"]`` is set to the new language.

    Args:
        value: the new language, or None to keep the current state.
    """
    # NOTE(review): presumably the setter of a ``language`` property backed
    # by ``_language`` (no decorator is visible here) -- confirm.
    if value is None:
        return

    self._language = value
    # Read back through ``self.language`` rather than reusing ``value``.
    word_clusters = get_word_clusters(self.language)
    self.cluster = word_clusters[self.cluster_name]
    self.args["language_code"] = self.language
def compute_feature(self, tokens, token_index):
    """Look up the word cluster of the token at ``token_index``.

    The lookup key is the token's ``stem`` attribute when
    ``self.use_stemming`` is truthy, and its ``normalized_value``
    attribute otherwise.

    Args:
        tokens: indexable collection of token objects exposing ``stem``
            and ``normalized_value``.
        token_index: position in ``tokens`` of the token to look up.

    Returns:
        The entry stored for the key in the ``self.cluster_name`` mapping
        of ``get_word_clusters(self.language)``, or None when the key has
        no entry.
    """
    token = tokens[token_index]
    if self.use_stemming:
        key = token.stem
    else:
        key = token.normalized_value
    word_clusters = get_word_clusters(self.language)
    return word_clusters[self.cluster_name].get(key)
def _get_tokens_clusters(tokens, language, cluster_name):
    """Return the cluster entries of the tokens that have one.

    Args:
        tokens: iterable of keys to look up in the cluster mapping.
        language: language passed to ``get_word_clusters``.
        cluster_name: key selecting one mapping among those returned by
            ``get_word_clusters(language)``.

    Returns:
        list: the entry of every token present in the mapping, in token
        order. Tokens absent from the mapping are skipped.
    """
    clusters = get_word_clusters(language)[cluster_name]
    matched = []
    for token in tokens:
        # Membership test (not ``.get``) so that any stored value,
        # including a falsy one, is kept.
        if token in clusters:
            matched.append(clusters[token])
    return matched