Code Example #1
File: index.py  Project: iaga84/yalse-core
def index_document(file_hash, file_path):
    if not ES.exists(index=DOCUMENTS_INDEX, id=file_hash):
        file_name, file_extension = get_file_name_and_extension(file_path)
        doc = {
            'name': file_name,
            'extension': file_extension,
        }
        ES.index(index=DOCUMENTS_INDEX, body=doc, id=file_hash)
Code Example #2
def document_exist(path):
    body = {"query": {"match": {"path": path}}}
    r = ES.search(body=body, index=DOCUMENTS_INDEX)
    exists = r['hits']['total']['value'] > 0

    # side effect: mark every matching document as still present on disk
    for doc in r['hits']['hits']:
        ES.update(index=DOCUMENTS_INDEX,
                  id=doc['_id'],
                  body={"doc": {
                      "exists": True
                  }})

    return exists
Code Example #3
def index_document(path):
    if not document_exist(path):
        file_hash = SHA256.hash_file(path)
        file_name, extension = get_file_name_and_extension(path)

        doc = {
            'path': path,
            'name': file_name,
            'extension': extension,
            'hash': file_hash,
            'size': os.stat(path).st_size,
            'timestamp': datetime.now(),
            'exists': True
        }
        ES.index(index=DOCUMENTS_INDEX, body=doc)
Code Example #4
def remove_document_from_index(path):
    body = {
        "query": {
            "term": {
                "path": {
                    "value": path,
                    "boost": 1.0
                }
            }
        }
    }
    try:
        doc = ES.search(body=body, index=DOCUMENTS_INDEX)['hits']['hits'][0]
        ES.delete(id=doc['_id'], index=DOCUMENTS_INDEX)
    except:
        logging.error("Can't remove {}".format(path))
Code Example #5
def delete_document_copies(hash):
    body = {
        "query": {
            "term": {
                "hash": {
                    "value": hash,
                    "boost": 1.0
                }
            }
        }, "sort": [
            {"timestamp": {"order": "asc"}}
        ]
    }
    all_copies = ES.search(body=body, index=DOCUMENTS_INDEX)['hits']['hits']
    # sanity check: abort (AssertionError) if any copy is missing from disk
    for copy in all_copies:
        assert Path(copy['_source']['path']).exists()
    result = []
    if len(all_copies) > 1:
        # keep the first (earliest-indexed) copy and delete the rest
        for copy in all_copies[1:]:
            path = copy['_source']['path']
            try:
                os.remove(path)
                remove_document_from_index(path)
                result.append(path)
            except Exception:
                logging.error("Can't delete file {}".format(path))

    return result
Code Example #6
def search_documents(query):
    body = {
        "query": {
            "multi_match": {
                "query": query,
                "fields": ["name^5", "content", "path"]
            }
        },
        "size": 1000
    }
    return ES.search(body=body, index=DOCUMENTS_INDEX)
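A minimal usage sketch for the search helper above, assuming ES and DOCUMENTS_INDEX are configured as in the other examples; print_search_results is a hypothetical helper, not part of the project.
def print_search_results(query):
    # print the relevance score and stored name of each matching document
    response = search_documents(query)
    for hit in response['hits']['hits']:
        print(hit['_score'], hit['_source']['name'])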
Code Example #7
def get_all_missing_documents(index=DOCUMENTS_INDEX,
                              scroll_timeout="2m",
                              **kwargs):
    is_first = True
    body = {"query": {"term": {"exists": {"value": False, "boost": 1.0}}}}
    while True:
        if is_first:
            result = ES.search(index=index,
                               scroll=scroll_timeout,
                               **kwargs,
                               body=body)
            is_first = False
        else:
            result = ES.scroll(body={
                "scroll_id": scroll_id,
                "scroll": scroll_timeout
            })
        scroll_id = result["_scroll_id"]
        hits = result["hits"]["hits"]
        if not hits:
            break
        yield from hits
Code Example #8
def get_all_documents(pagesize=250,
                      index=DOCUMENTS_INDEX,
                      scroll_timeout="2m",
                      **kwargs):
    is_first = True
    while True:
        if is_first:  # Initialize scroll
            result = ES.search(index=index,
                               scroll=scroll_timeout,
                               **kwargs,
                               body={"size": pagesize})
            is_first = False
        else:
            result = ES.scroll(body={
                "scroll_id": scroll_id,
                "scroll": scroll_timeout
            })
        scroll_id = result["_scroll_id"]
        hits = result["hits"]["hits"]
        if not hits:
            break
        yield from hits
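Both scroll generators can be consumed lazily, so the full index never has to fit in memory. A small usage sketch; all_indexed_paths is a hypothetical helper.
def all_indexed_paths():
    # collect the stored path of every document, page by page via the scroll API
    return [hit['_source'].get('path') for hit in get_all_documents(pagesize=500)]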
Code Example #9
def get_stats_extensions():
    body = {
        "aggs": {
            "extensions": {
                "terms": {
                    "field": "extension",
                    "size": 1000
                }
            }
        },
        "size": 0
    }
    return ES.search(body=body, index=DOCUMENTS_INDEX)
Code Example #10
def get_similar_documents(file_hash):
    body = {
        "query": {
            "term": {
                "hash": {
                    "value": file_hash,
                    "boost": 1.0
                }
            }
        }
    }
    doc = ES.search(body=body, index=DOCUMENTS_INDEX)['hits']['hits'][0]
    original_content = doc['_source']['content']
    original_name = doc['_source']['name']
    original_hash = doc['_source']['hash']
    if len(original_content) > 0:
        body = {
            "query": {
                "match": {
                    "content": {
                        "query": " ".join(sorted(original_content.split()[:500]))
                    }
                }
            },
            "sort": [
                "_score"
            ]
        }
        match = ES.search(body=body, index=DOCUMENTS_INDEX)

        score_max = match['hits']['max_score']
        # keep only hits scoring within 5% of the best match
        score_threshold = score_max - (score_max / 100 * 5)
        results = {original_hash: original_name}
        for r in match['hits']['hits']:
            if r['_score'] > score_threshold:
                results[r['_source']['hash']] = r['_source']['name']
        if len(results) > 1:
            ES.index(index=DUPLICATES_INDEX, body=results)
Code Example #11
def library_size():
    body = {
        "query": {
            "match_all": {}
        },
        "size": 0,
        "aggs": {
            "library_size": {
                "sum": {
                    "field": "size"
                }
            }
        }
    }
    return ES.search(body=body, index=DOCUMENTS_INDEX)
Code Example #12
def get_duplicate_files():
    body = {
        "size": 0,
        "aggs": {
            "duplicate_hashes": {
                "terms": {
                    "field": "hash",
                    "size": 5000,
                    "min_doc_count": 2
                }
            }
        }
    }
    response = ES.search(body=body, index=DOCUMENTS_INDEX)
    buckets = response['aggregations']['duplicate_hashes']['buckets']
    # each bucket key is a hash shared by at least two documents
    return [bucket['key'] for bucket in buckets]
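A sketch of how this could be combined with delete_document_copies from Code Example #5 to prune exact duplicates; deduplicate_library is a hypothetical helper and assumes both functions live in the same module.
def deduplicate_library():
    # for every hash seen on two or more documents, keep the oldest copy and delete the rest
    removed = []
    for file_hash in get_duplicate_files():
        removed.extend(delete_document_copies(file_hash))
    return removed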
Code Example #13
def index_document_metadata(id, path):
    try:
        file_meta = get_tika_meta(path)
    except Exception:
        # fall back to empty metadata if extraction fails
        file_meta = {}
    ES.update(index=DOCUMENTS_INDEX, id=id, body={"doc": {"meta": file_meta}})
Code Example #14
def reset_exists():
    for doc in get_all_documents():
        ES.update(index=DOCUMENTS_INDEX, id=doc['_id'], body={"doc": {"exists": False}})
Code Example #15
def index_document_content(id, path):
    try:
        file_content = get_tika_content(path)
    except Exception:
        # fall back to empty content if extraction fails
        file_content = ""
    ES.update(index=DOCUMENTS_INDEX, id=id, body={"doc": {"content": file_content}})
Code Example #16
def get_document(id):
    return ES.get(index=DOCUMENTS_INDEX, id=id)
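Taken together, the helpers suggest one possible maintenance pass over a library directory. The sketch below is an assumption about how they fit together, not code from the project; sync_library and library_root are hypothetical.
import os

def sync_library(library_root):
    # flag every indexed document as missing (Code Example #14)
    reset_exists()
    # walk the library; document_exist() flips 'exists' back for files still on disk,
    # and index_document() adds anything new (Code Examples #2 and #3)
    for root, _, files in os.walk(library_root):
        for name in files:
            index_document(os.path.join(root, name))
    # whatever is still flagged missing is gone from disk, so drop it from the index
    for doc in get_all_missing_documents():
        remove_document_from_index(doc['_source']['path'])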