def handle(self, *args, **options):
        """Attach the German text analyzer to the ``mst_debug`` index and
        print how it tokenizes each word supplied in ``options["words"]``.
        """
        # NOTE(review): the (self, *args, **options) signature and
        # self.stdout.write suggest a Django management command's entry
        # point -- confirm against the enclosing class, which is not visible here.
        text_analyzer = get_text_analyzer("german")
        elastic_index = Index("mst_debug")
        if not elastic_index.exists():
            elastic_index.create()
        # Elasticsearch only accepts analysis-settings updates on a closed
        # index, hence the close -> register analyzer -> save -> open dance.
        elastic_index.close()
        elastic_index.analyzer(text_analyzer)
        elastic_index.save()
        elastic_index.open()
        elastic_index.flush()

        for word in options["words"]:
            # NOTE(review): assumes get_text_analyzer("german") returns an
            # analyzer registered under the name "text_analyzer" -- verify,
            # otherwise this lookup fails on the ES side.
            analysis = elastic_index.analyze(body={
                "analyzer": "text_analyzer",
                "text": word
            })
            # One output line per word: the word followed by its token list.
            tokens = [i["token"] for i in analysis["tokens"]]
            self.stdout.write("{} {}\n".format(word, tokens))
# Example #2
# 0
def gen_suggests(index, info_tuple):
    """Build a completion-suggester payload from ``(text, weight)`` pairs.

    Each text is analyzed by Elasticsearch (with a lowercase filter); the
    resulting tokens longer than one character become suggestion inputs at
    the given weight.  Tokens already claimed by an earlier (typically
    higher-weight) entry are skipped so each token appears only once.

    :param index: name of the Elasticsearch index whose analyzer is used
    :param info_tuple: iterable of (text, weight) pairs
    :return: list of {"input": [...], "weight": weight} dicts
    """
    used_words = set()
    suggests = []
    for text, weight in info_tuple:
        if text:
            # Ask Elasticsearch to analyze the string.  (The previous
            # version also built two unused analyzers and issued a second,
            # redundant analyze call per iteration -- removed.)
            words = es.indices.analyze(index=index,
                                       params={'filter': ["lowercase"]},
                                       body=text)
            # Single-character tokens are too noisy to suggest.
            analyzed_words = {r["token"] for r in words["tokens"]
                              if len(r["token"]) > 1}
            new_words = analyzed_words - used_words
            # Bug fix: record these tokens so later entries don't repeat
            # them; previously used_words was never updated, making the
            # dedupe above a no-op.
            used_words.update(new_words)
        else:
            new_words = set()

        if new_words:
            suggests.append({"input": list(new_words), "weight": weight})

    return suggests
# Example #3
# 0
def analyze(
    url: str,
    text: str,
    analyzer: str,
):
    """Run *text* through *analyzer* on the index behind ``INDEX_ALIAS_NAME``
    and pretty-print the resulting tokens (or the raw response).

    :param url: Elasticsearch host URL to connect to
    :param text: the text to analyze
    :param analyzer: name of the analyzer to apply
    """
    # We can confidently use a single host here because we're not searching a cluster.
    connections.create_connection(hosts=[url])
    index = Index(INDEX_ALIAS_NAME)
    analysis = index.analyze(body={"text": text, "analyzer": analyzer})
    print(f"For text: {text!r}")
    if "tokens" in analysis:
        keys = None
        longest_key = 0
        for token in analysis["tokens"]:
            if keys is None:
                # NOTE(review): assumes every token dict shares the first
                # token's keys -- confirm against the ES analyze response.
                keys = token.keys()
                # Hoisted out of the per-token loop: the key set (and thus
                # the column width) is fixed after the first token.
                longest_key = max(len(x) for x in keys)
            for key in keys:
                print(f"{key:{longest_key + 1}} {token[key]!r}")
            print()
    elif not analysis:
        print("No tokens found!")
    else:
        # Desperate if it's not a list of tokens
        print(json.dumps(analysis, indent=2))