def do_folder_sync(self, node_id):
    """Synchronize the ES index entries for one folder with its on-disk state.

    Pass a folder_id, read it from disk and the ES index, compare the
    indexed children against the folder's actual folders and files, then
    delete stale documents and index new ones.

    We do two queries: first to find total hits, then use that total as
    the ``size`` of the second query. A huge hard-coded ``size`` value
    decreases Elasticsearch query performance by a huge margin.

    :param node_id: identifier accepted by ``Folder.get_instance``.
    :returns: None. Logs an error and returns early when no folder is
        found for ``node_id``.
    """
    folder = Folder.get_instance(node_id, decode=True)
    if not folder:
        self.app.logger.error(
            'No folder found by passing node id: {node_id}'.format(
                node_id=node_id
            ))
        return

    index_id = uenc(folder.path.encode('utf-8'))
    # Both the count and the search match every document whose parent
    # is this folder; build the query body once instead of twice.
    parent_query = {
        "query": {
            "bool": {
                "must": [{
                    "term": {
                        "parent": index_id
                    }
                }]
            }
        }
    }

    # First query: how many children does ES currently hold?
    max_size = int(
        self.es_service.conn.get(
            self.count_url, data=parent_query)['hits']['total'])

    # Second query: fetch exactly that many ids (fields=[] -> ids only).
    search_url = u'{idx_name}/node/_search'.format(
        idx_name=self.idx_name)
    search_body = dict(parent_query)
    search_body.update({"from": 0, "size": max_size, "fields": []})
    results = self.es_service.conn.get(search_url, data=search_body)

    es_node_ids = {doc['_id'] for doc in results['hits']['hits']}
    disk_nodes = {node.index_id: node for node in (
        folder.folders + folder.files)}
    disk_node_ids = set(disk_nodes)

    # In ES but gone from disk -> delete; on disk but not in ES -> index.
    for doc_to_delete in es_node_ids - disk_node_ids:
        self.delete_document_by_id(doc_to_delete)
    for new_document in disk_node_ids - es_node_ids:
        self.index_document_by_node(disk_nodes[new_document])
    self.flush_index()
def index_files(self, folder):
    """Bulk-index every file contained in *folder* into Elasticsearch.

    :param folder: mapping with a ``'path'`` key locating the folder on
        disk (resolved via ``Folder.get_instance``).
    :returns: the number of entries in the bulk payload — two per file
        (one action line plus one document line), so twice the file
        count; 0 when the folder holds no files, in which case no bulk
        request is sent at all.
    """
    folder_instance = Folder.get_instance(folder['path'], user=self.user)
    parent_id = folder_instance.index_id

    payload = []
    for f in folder_instance.files:
        # Bulk API format: an action line followed by the document body.
        payload.append({'index': {'_id': f.index_id}})
        payload.append({
            'name': f.name,
            'parent': parent_id,
            'date_modified': f.date_modified,
            'size': f.get_size(),
            'type': f.type
        })

    if payload:
        self.es_service.bulk_insert(self.bulk_insert_url, payload)
    return len(payload)