def do_folder_sync(self, node_id):
    """
    Given a folder's node_id, read the folder from disk and from the ES
    index, compare their contents and files, and bring the index in line
    with what is stored on disk.

    We run two queries: the first finds the total number of hits, and the
    second uses that total as its size. The reason for this is that a huge
    size value decreases Elasticsearch query performance by a large margin.
    """
    folder = Folder.get_instance(node_id, decode=True)
    if folder:
        index_id = uenc(folder.path.encode('utf-8'))
        # First query: count the children of this folder.
        max_size = int(self.es_service.conn.get(self.count_url, data={
            "query": {
                "bool": {
                    "must": [{
                        "term": {
                            "parent": index_id
                        }
                    }]
                }
            }
        })['hits']['total'])
        search_url = u'{idx_name}/node/_search'.format(
            idx_name=self.idx_name)
        # Second query: fetch exactly that many document ids.
        results = self.es_service.conn.get(search_url, data={
            "from": 0,
            "size": max_size,
            "fields": [],
            "query": {
                "bool": {
                    "must": [{
                        "term": {
                            "parent": index_id
                        }
                    }]
                }
            }
        })
        es_node_ids = set(doc['_id'] for doc in results['hits']['hits'])
        disk_nodes = {node.index_id: node for node in (
            folder.folders + folder.files)}
        disk_node_ids = set(disk_nodes.keys())
        # Documents present in ES but missing on disk are stale;
        # nodes present on disk but missing in ES need indexing.
        deleted_docs = es_node_ids - disk_node_ids
        new_docs = disk_node_ids - es_node_ids
        for doc_to_delete in deleted_docs:
            self.delete_document_by_id(doc_to_delete)
        for new_document in new_docs:
            self.index_document_by_node(disk_nodes[new_document])
        self.flush_index()
    else:
        self.app.logger.error(
            'No folder found by passing node id: {node_id}'.format(
                node_id=node_id))
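
# A minimal standalone sketch of the two-step "count then fetch" pattern used
# in do_folder_sync above. `es_get` stands in for whatever performs the GET
# against Elasticsearch (self.es_service.conn.get in the method); the endpoint
# names and the response shape ({'hits': {'total': ..., 'hits': [...]}}) are
# assumptions taken from the code above, not a definitive client API.
def fetch_child_ids(es_get, count_url, search_url, parent_id):
    query = {"query": {"bool": {"must": [{"term": {"parent": parent_id}}]}}}
    # First request: only the total number of matching documents is needed.
    total = int(es_get(count_url, data=query)['hits']['total'])
    # Second request: ask for exactly `total` hits instead of a huge
    # hard-coded size, which would slow the query down.
    body = dict(query, **{"from": 0, "size": total, "fields": []})
    hits = es_get(search_url, data=body)['hits']['hits']
    return set(doc['_id'] for doc in hits)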
def index_files(self, folder):
    """ Index the files contained in the given folder. """
    folder_instance = Folder.get_instance(folder['path'], user=self.user)
    folder_files = folder_instance.files
    file_bulk = []
    for file_obj in folder_files:
        # Each file contributes two bulk entries: the action line
        # and the document source.
        file_bulk.append({'index': {'_id': file_obj.index_id}})
        fdata = {
            'name': file_obj.name,
            'parent': folder_instance.index_id,
            'date_modified': file_obj.date_modified,
            'size': file_obj.get_size(),
            'type': file_obj.type
        }
        file_bulk.append(fdata)
    if len(file_bulk) > 0:
        self.es_service.bulk_insert(self.bulk_insert_url, file_bulk)
    return len(file_bulk)
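
# Illustrative only: what the bulk body built in index_files looks like for a
# single file. The field values below are made up; the real values come from
# the File objects read from disk.
_EXAMPLE_FILE_BULK = [
    {'index': {'_id': 'file-node-id'}},   # action line: index under this _id
    {                                     # source line: the document itself
        'name': 'report.pdf',
        'parent': 'parent-folder-id',
        'date_modified': '2014-01-01T00:00:00',
        'size': 2048,
        'type': 'file',
    },
]
# Note that each file adds two entries, so the value returned by index_files
# (len(file_bulk)) is twice the number of files indexed.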
def do_full_sync(self):
    """
    Do a full sync of the user's filesystem.

    Find all folders in the user's filesystem and compare them one by one.
    Then fetch all folders from ES and compare them for a cleanup, in case
    some folders have been renamed.
    """
    self.disable_realtime_indexing()
    disk_folders = Folder.find_all_folders(self.user)
    es_data = {"docs": []}
    for folder in disk_folders:
        es_data["docs"].append({
            "_index": self.idx_name,
            "_type": "node",
            "_id": folder.index_id,
            "fields": ["_id"]
        })
    results = self.es_service.conn.get('_mget', data=es_data)
    if results['status'] == 200:
        es_results = [d_id['_id'] for d_id in results['docs']
                      if d_id['exists']]
        for folder in disk_folders:
            if folder.index_id in es_results:
                self.app.logger.debug(u'Syncing folder: {f}'.format(
                    f=folder.sys_path))
                self.do_folder_sync(folder.index_id)
            else:
                self.app.logger.debug(u'Created folder: {f}'.format(
                    f=folder.sys_path))
                self.index_folders_and_files(folder=folder)
    else:
        self.app.logger.error(
            u'Couldn\'t fetch documents. Full sync stopped.')
        return
    # Count the folder documents first, then fetch exactly that many,
    # to avoid passing an arbitrarily large size to Elasticsearch.
    max_size = int(self.es_service.conn.get(self.count_url, data={
        "query": {
            "bool": {
                "must": [{
                    "term": {
                        "type": Node.FOLDER_TYPE
                    }
                }]
            }
        }
    })['hits']['total'])
    search_url = '{idx_name}/_search'.format(idx_name=self.idx_name)
    es_docs = self.es_service.conn.get(search_url, data={
        "from": 0,
        "size": max_size,
        "fields": [],
        "query": {
            "bool": {
                "must": [{
                    "term": {
                        "type": Node.FOLDER_TYPE
                    }
                }]
            }
        }
    })['hits']['hits']
    es_docs = {doc['_id']: doc for doc in es_docs}
    folder_nodes = {folder.index_id: folder for folder in disk_folders}
    es_folders = set(es_docs.keys())
    folders = set(folder_nodes.keys())
    # Folders that exist in ES but no longer on disk must be removed,
    # together with all of their children.
    deleted_docs = es_folders - folders
    deleted_ids = []
    for doc_to_delete in deleted_docs:
        deleted_ids += self.delete_document_by_parent_id(doc_to_delete)
        if doc_to_delete not in deleted_ids:
            deleted_ids.append(
                self.delete_document_by_id(doc_to_delete))
    self.app.logger.debug(
        u'Deleted nodes with id like:\n {name}'.format(name=deleted_ids))
    self.enable_realtime_indexing()
    self.optimize_index()
    self.flush_index()
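
# A minimal sketch of the reconciliation step performed by do_full_sync and
# do_folder_sync: plain set differences between the ids known to Elasticsearch
# and the ids found on disk. The function name and inputs are illustrative,
# not part of the real API.
def diff_ids(es_ids, disk_ids):
    """Return (stale_ids_to_delete, new_ids_to_index)."""
    es_ids, disk_ids = set(es_ids), set(disk_ids)
    return es_ids - disk_ids, disk_ids - es_ids

# Example: diff_ids(['a', 'b'], ['b', 'c']) -> ({'a'}, {'c'})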