def index_document(file_hash, file_path):
    """Store a document's name and extension in the documents index.

    The document is written under the id ``file_hash``. If the index already
    holds an entry with that id, nothing is written.

    Args:
        file_hash: Value used as the Elasticsearch document id.
        file_path: Path handed to ``get_file_name_and_extension`` to obtain
            the name and extension that get stored.
    """
    already_indexed = ES.exists(index=DOCUMENTS_INDEX, id=file_hash)
    if already_indexed:
        # Same id is already present; leave the existing entry untouched.
        return

    name_and_extension = get_file_name_and_extension(file_path)
    file_name, file_extension = name_and_extension
    payload = dict(name=file_name, extension=file_extension)
    ES.index(index=DOCUMENTS_INDEX, body=payload, id=file_hash)
def index_document(path):
    """Index metadata for the file at ``path`` unless it is already known.

    When ``document_exist(path)`` is truthy the function does nothing.
    Otherwise it writes one entry to the documents index containing the path,
    the name and extension from ``get_file_name_and_extension``, the value
    returned by ``SHA256.hash_file``, the file size from ``os.stat``, the
    current ``datetime.now()`` timestamp, and an ``exists`` flag set to True.

    Args:
        path: Filesystem path of the file to index.
    """
    if document_exist(path):
        return

    digest = SHA256.hash_file(path)
    stem, suffix = get_file_name_and_extension(path)
    size_in_bytes = os.stat(path).st_size
    # datetime.now() without a tz argument yields a naive local-time value.
    indexed_at = datetime.now()

    record = dict(
        path=path,
        name=stem,
        extension=suffix,
        hash=digest,
        size=size_in_bytes,
        timestamp=indexed_at,
        exists=True,
    )
    ES.index(index=DOCUMENTS_INDEX, body=record)
def get_similar_documents(file_hash):
    """Find documents whose content resembles the one with ``file_hash``.

    The document is looked up by an exact ``term`` query on its ``hash``
    field. Up to the first 500 whitespace-separated words of its ``content``
    (sorted, joined by spaces) are then used as a ``match`` query against the
    documents index. Every hit scoring above 95% of the best score is
    collected into a ``{hash: name}`` mapping that always contains the
    original document. If the mapping ends up with more than one entry it is
    written to the duplicates index.

    The function returns early, writing nothing, when:
      * no document with ``file_hash`` is indexed,
      * the document has no ``content`` field or its content is empty,
      * the similarity query reports no maximum score (no hits).

    Args:
        file_hash: Value matched against the ``hash`` field of indexed
            documents.

    Returns:
        None. The only result is the optional write to the duplicates index.
    """
    lookup = {
        "query": {
            "term": {
                "hash": {
                    "value": file_hash,
                    "boost": 1.0
                }
            }
        }
    }
    found = ES.search(body=lookup, index=DOCUMENTS_INDEX)['hits']['hits']
    if not found:
        # Unknown hash: there is no source document to compare against.
        return

    source = found[0]['_source']
    # Documents may have been indexed without a 'content' field, so read it
    # defensively instead of assuming the key is present.
    original_content = source.get('content')
    if not original_content:
        return

    original_name = source['name']
    original_hash = source['hash']

    similarity_query = {
        "query": {
            "match": {
                "content": {
                    "query": " ".join(sorted(original_content.split()[:500]))
                }
            }
        },
        "sort": [
            "_score"
        ]
    }
    match = ES.search(body=similarity_query, index=DOCUMENTS_INDEX)

    score_max = match['hits']['max_score']
    if score_max is None:
        # No hits were scored, so there is nothing to derive a threshold from.
        return

    # Keep only hits within 5% of the best score.
    score_threshold = score_max - (score_max / 100 * 5)

    results = {original_hash: original_name}
    for hit in match['hits']['hits']:
        if hit['_score'] > score_threshold:
            results[hit['_source']['hash']] = hit['_source']['name']

    # A single entry means only the original document itself qualified.
    if len(results) > 1:
        ES.index(index=DUPLICATES_INDEX, body=results)