def handle(self, *args, **options):
    """Debug command: run each supplied word through the German text
    analyzer on a scratch index ("mst_debug") and print the tokens
    the analyzer produces.
    """
    german_analyzer = get_text_analyzer("german")

    debug_index = Index("mst_debug")
    if not debug_index.exists():
        debug_index.create()

    # The index has to be closed before its analysis settings can be
    # changed, then reopened so the analyze API can be used against it.
    debug_index.close()
    debug_index.analyzer(german_analyzer)
    debug_index.save()
    debug_index.open()
    debug_index.flush()

    for word in options["words"]:
        response = debug_index.analyze(body={
            "analyzer": "text_analyzer",
            "text": word,
        })
        token_values = [entry["token"] for entry in response["tokens"]]
        self.stdout.write("{} {}\n".format(word, token_values))
def gen_suggests(index, info_tuple):
    """Build a completion-suggester payload from (text, weight) pairs.

    Each string is tokenized by Elasticsearch's analyze API (with a
    lowercase filter); tokens longer than one character that have not
    appeared in an earlier, higher-weight entry are emitted as one
    suggestion group.

    Args:
        index: name of the Elasticsearch index whose analyzer is used.
        info_tuple: iterable of (text, weight) pairs, ordered from the
            most to the least important field.

    Returns:
        list of {"input": [...tokens...], "weight": weight} dicts
        suitable for an ES completion suggest field.
    """
    used_words = set()
    suggests = []
    for text, weight in info_tuple:
        if text:
            # Let ES analyze the string; keep tokens longer than one char.
            # (Removed dead code: unused local analyzers and an unused
            # Index(...).analyze(...) call that duplicated this request.)
            words = es.indices.analyze(
                index=index, params={'filter': ["lowercase"]}, body=text)
            analyzed_words = {
                r["token"] for r in words["tokens"] if len(r["token"]) > 1
            }
            new_words = analyzed_words - used_words
            # Fix: record the emitted tokens so later (lower-weight)
            # entries do not repeat them — previously used_words was
            # never updated, making the subtraction above a no-op.
            used_words.update(new_words)
        else:
            new_words = set()
        if new_words:
            suggests.append({"input": list(new_words), "weight": weight})
    return suggests
def analyze(
    url: str,
    text: str,
    analyzer: str,
):
    """Run *text* through *analyzer* on the aliased index and pretty-print
    the tokens the analyzer produces.
    """
    # We can confidently use a single host here because we're not searching a cluster.
    connections.create_connection(hosts=[url])

    result = Index(INDEX_ALIAS_NAME).analyze(
        body={"text": text, "analyzer": analyzer}
    )
    print(f"For text: {text!r}")

    if "tokens" in result:
        # Column width is taken from the first token's keys; every token
        # is printed as an aligned key/value table.
        keys = None
        for token in result["tokens"]:
            if keys is None:
                keys = token.keys()
            longest_key = max(len(k) for k in keys)
            for key in keys:
                print(f"{key:{longest_key + 1}} {token[key]!r}")
            print()
    elif not result:
        print("No tokens found!")
    else:
        # Fallback: response shape was unexpected, dump it raw.
        print(json.dumps(result, indent=2))