Code Example #1
File: index.py  Project: iaga84/yalse-core
def index_document(file_hash, file_path):
    if not ES.exists(index=DOCUMENTS_INDEX, id=file_hash):
        file_name, file_extension = get_file_name_and_extension(file_path)
        doc = {
            'name': file_name,
            'extension': file_extension,
        }
        ES.index(index=DOCUMENTS_INDEX, body=doc, id=file_hash)
Code Example #2
def document_exist(path):
    body = {"query": {"match": {"path": path}}}
    r = ES.search(body=body, index=DOCUMENTS_INDEX)
    exists = r['hits']['total']['value'] > 0

    # side effect: mark every matching document as still present on disk
    for doc in r['hits']['hits']:
        ES.update(index=DOCUMENTS_INDEX,
                  id=doc['_id'],
                  body={"doc": {
                      "exists": True
                  }})

    return exists
Code Example #3
def index_document(path):
    if not document_exist(path):
        file_hash = SHA256.hash_file(path)
        file_name, extension = get_file_name_and_extension(path)

        doc = {
            'path': path,
            'name': file_name,
            'extension': extension,
            'hash': file_hash,
            'size': os.stat(path).st_size,
            'timestamp': datetime.now(),
            'exists': True
        }
        ES.index(index=DOCUMENTS_INDEX, body=doc)
Code Example #4
def remove_document_from_index(path):
    body = {
        "query": {
            "term": {
                "path": {
                    "value": path,
                    "boost": 1.0
                }
            }
        }
    }
    try:
        doc = ES.search(body=body, index=DOCUMENTS_INDEX)['hits']['hits'][0]
        ES.delete(id=doc['_id'], index=DOCUMENTS_INDEX)
    except:
        logging.error("Can't remove {}".format(path))
Code Example #5
def delete_document_copies(hash):
    body = {
        "query": {
            "term": {
                "hash": {
                    "value": hash,
                    "boost": 1.0
                }
            }
        }, "sort": [
            {"timestamp": {"order": "asc"}}
        ]
    }
    all_copies = ES.search(body=body, index=DOCUMENTS_INDEX)['hits']['hits']
    # sanity check: abort (AssertionError) if any copy is missing from disk
    for copy in all_copies:
        assert Path(copy['_source']['path']).exists()
    result = []
    if len(all_copies) > 1:
        # keep the first (earliest-indexed) copy and delete the rest
        for copy in all_copies[1:]:
            path = copy['_source']['path']
            try:
                os.remove(path)
                remove_document_from_index(path)
                result.append(path)
            except Exception:
                logging.error("Can't delete file {}".format(path))

    return result
Code Example #6
def search_documents(query):
    body = {
        "query": {
            "multi_match": {
                "query": query,
                "fields": ["name^5", "content", "path"]
            }
        },
        "size": 1000
    }
    return ES.search(body=body, index=DOCUMENTS_INDEX)
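A minimal usage sketch for the search helper above, assuming ES and DOCUMENTS_INDEX are configured as in the other examples; print_search_results is a hypothetical helper, not part of the project.
def print_search_results(query):
    # print the relevance score and stored name of each matching document
    response = search_documents(query)
    for hit in response['hits']['hits']:
        print(hit['_score'], hit['_source']['name'])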
Code Example #7
def get_all_missing_documents(index=DOCUMENTS_INDEX,
                              scroll_timeout="2m",
                              **kwargs):
    is_first = True
    body = {"query": {"term": {"exists": {"value": False, "boost": 1.0}}}}
    while True:
        if is_first:
            result = ES.search(index=index,
                               scroll=scroll_timeout,
                               **kwargs,
                               body=body)
            is_first = False
        else:
            result = ES.scroll(body={
                "scroll_id": scroll_id,
                "scroll": scroll_timeout
            })
        scroll_id = result["_scroll_id"]
        hits = result["hits"]["hits"]
        if not hits:
            break
        yield from hits
Code Example #8
def get_all_documents(pagesize=250,
                      index=DOCUMENTS_INDEX,
                      scroll_timeout="2m",
                      **kwargs):
    is_first = True
    while True:
        if is_first:  # Initialize scroll
            result = ES.search(index=index,
                               scroll=scroll_timeout,
                               **kwargs,
                               body={"size": pagesize})
            is_first = False
        else:
            result = ES.scroll(body={
                "scroll_id": scroll_id,
                "scroll": scroll_timeout
            })
        scroll_id = result["_scroll_id"]
        hits = result["hits"]["hits"]
        if not hits:
            break
        yield from hits
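Both scroll generators can be consumed lazily, so the full index never has to fit in memory. A small usage sketch; all_indexed_paths is a hypothetical helper.
def all_indexed_paths():
    # collect the stored path of every document, page by page via the scroll API
    return [hit['_source'].get('path') for hit in get_all_documents(pagesize=500)]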
Code Example #9
def get_stats_extensions():
    body = {
        "aggs": {
            "extensions": {
                "terms": {
                    "field": "extension",
                    "size": 1000
                }
            }
        },
        "size": 0
    }
    return ES.search(body=body, index=DOCUMENTS_INDEX)
Code Example #10
def get_similar_documents(file_hash):
    body = {
        "query": {
            "term": {
                "hash": {
                    "value": file_hash,
                    "boost": 1.0
                }
            }
        }
    }
    doc = ES.search(body=body, index=DOCUMENTS_INDEX)['hits']['hits'][0]
    original_content = doc['_source']['content']
    original_name = doc['_source']['name']
    original_hash = doc['_source']['hash']
    if len(original_content) > 0:
        body = {
            "query": {
                "match": {
                    "content": {
                        "query": " ".join(sorted(original_content.split()[:500]))
                    }
                }
            },
            "sort": [
                "_score"
            ]
        }
        match = ES.search(body=body, index=DOCUMENTS_INDEX)

        score_max = match['hits']['max_score']
        # keep only hits scoring within 5% of the best match
        score_threshold = score_max - (score_max / 100 * 5)
        results = {original_hash: original_name}
        for r in match['hits']['hits']:
            if r['_score'] > score_threshold:
                results[r['_source']['hash']] = r['_source']['name']
        if len(results) > 1:
            ES.index(index=DUPLICATES_INDEX, body=results)
Code Example #11
def library_size():
    body = {
        "query": {
            "match_all": {}
        },
        "size": 0,
        "aggs": {
            "library_size": {
                "sum": {
                    "field": "size"
                }
            }
        }
    }
    return ES.search(body=body, index=DOCUMENTS_INDEX)
Code Example #12
def get_duplicate_files():
    body = {
        "size": 0,
        "aggs": {
            "duplicate_hashes": {
                "terms": {
                    "field": "hash",
                    "size": 5000,
                    "min_doc_count": 2
                }
            }
        }
    }
    response = ES.search(body=body, index=DOCUMENTS_INDEX)
    buckets = response['aggregations']['duplicate_hashes']['buckets']
    # each bucket key is a hash shared by at least two documents
    return [bucket['key'] for bucket in buckets]
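A sketch of how this could be combined with delete_document_copies from Code Example #5 to prune exact duplicates; deduplicate_library is a hypothetical helper and assumes both functions live in the same module.
def deduplicate_library():
    # for every hash seen on two or more documents, keep the oldest copy and delete the rest
    removed = []
    for file_hash in get_duplicate_files():
        removed.extend(delete_document_copies(file_hash))
    return removed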
Code Example #13
def index_document_metadata(id, path):
    try:
        file_meta = get_tika_meta(path)
    except Exception:
        # fall back to empty metadata if extraction fails
        file_meta = {}
    ES.update(index=DOCUMENTS_INDEX, id=id, body={"doc": {"meta": file_meta}})
Code Example #14
def reset_exists():
    for doc in get_all_documents():
        ES.update(index=DOCUMENTS_INDEX, id=doc['_id'], body={"doc": {"exists": False}})
Code Example #15
def index_document_content(id, path):
    try:
        file_content = get_tika_content(path)
    except Exception:
        # fall back to empty content if extraction fails
        file_content = ""
    ES.update(index=DOCUMENTS_INDEX, id=id, body={"doc": {"content": file_content}})
Code Example #16
def get_document(id):
    return ES.get(index=DOCUMENTS_INDEX, id=id)
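Taken together, the helpers suggest one possible maintenance pass over a library directory. The sketch below is an assumption about how they fit together, not code from the project; sync_library and library_root are hypothetical.
import os

def sync_library(library_root):
    # flag every indexed document as missing (Code Example #14)
    reset_exists()
    # walk the library; document_exist() flips 'exists' back for files still on disk,
    # and index_document() adds anything new (Code Examples #2 and #3)
    for root, _, files in os.walk(library_root):
        for name in files:
            index_document(os.path.join(root, name))
    # whatever is still flagged missing is gone from disk, so drop it from the index
    for doc in get_all_missing_documents():
        remove_document_from_index(doc['_source']['path'])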