Exemple #1
0
def index_document(file_hash, file_path):
    """Index a file's name and extension under its hash, unless already indexed.

    The document id is ``file_hash``. When ``DOCUMENTS_INDEX`` already holds a
    document with that id, nothing is written.
    """
    already_indexed = ES.exists(index=DOCUMENTS_INDEX, id=file_hash)
    if already_indexed:
        return

    # The helper yields the two values stored as 'name' and 'extension'.
    name, ext = get_file_name_and_extension(file_path)
    payload = dict(name=name, extension=ext)
    ES.index(index=DOCUMENTS_INDEX, body=payload, id=file_hash)
Exemple #2
0
def index_document(path):
    """Index metadata for the file at ``path`` unless ``document_exist`` reports it.

    The stored document carries the path, the name and extension returned by
    ``get_file_name_and_extension``, the value of ``SHA256.hash_file``, the
    size from ``os.stat``, the current ``datetime.now()`` and ``exists=True``.
    No explicit id is passed to ``ES.index``.
    """
    if document_exist(path):
        return

    digest = SHA256.hash_file(path)
    name, ext = get_file_name_and_extension(path)
    size_in_bytes = os.stat(path).st_size
    indexed_at = datetime.now()

    ES.index(
        index=DOCUMENTS_INDEX,
        body=dict(
            path=path,
            name=name,
            extension=ext,
            hash=digest,
            size=size_in_bytes,
            timestamp=indexed_at,
            exists=True,
        ),
    )
Exemple #3
0
def get_similar_documents(file_hash):
    """Record documents whose content closely matches the one with ``file_hash``.

    The document is looked up with a ``term`` query on its ``hash`` field.
    Its first 500 whitespace-separated words are sorted and used as a
    ``match`` query on ``content``. Every hit scoring within 5% of the best
    score is collected into a ``{hash: name}`` mapping, seeded with the
    original document. When the mapping holds more than one entry it is
    indexed into ``DUPLICATES_INDEX``.

    Returns ``None``. Nothing is written when the hash is not found, the
    document has no usable content, or the content query yields no scored
    hits.
    """
    # Hits scoring within this percentage of the top score count as similar.
    tolerance_percent = 5

    lookup = {
        "query": {
            "term": {
                "hash": {
                    "value": file_hash,
                    "boost": 1.0
                }
            }
        }
    }
    found = ES.search(body=lookup, index=DOCUMENTS_INDEX)['hits']['hits']
    if not found:
        # Unknown hash: there is no document to compare against.
        return

    source = found[0]['_source']
    # A document may lack a 'content' field or hold an empty/None value.
    original_content = source.get('content') or ''
    words = original_content.split()[:500]
    if not words:
        # An empty query matches nothing, so there is no score to compare.
        return

    similarity_query = {
        "query": {
            "match": {
                "content": {
                    "query": " ".join(sorted(words))
                }
            }
        },
        "sort": [
            "_score"
        ]
    }
    match = ES.search(body=similarity_query, index=DOCUMENTS_INDEX)

    score_max = match['hits']['max_score']
    if score_max is None:
        # The search reports a null max_score when nothing was scored.
        return

    score_threshold = score_max - (score_max / 100 * tolerance_percent)
    results = {source['hash']: source['name']}
    results.update({
        hit['_source']['hash']: hit['_source']['name']
        for hit in match['hits']['hits']
        if hit['_score'] > score_threshold
    })
    # A single entry means only the original document itself matched.
    if len(results) > 1:
        ES.index(index=DUPLICATES_INDEX, body=results)