Code example #1
File: user_data_service.py  Project: olavgg/py-sth
 def do_folder_sync(self, node_id):
     """
     Pass a folder_id, read it from disk and the ES index. Compare its
     content and files and return the correct results based what is stored
     on disk.
     
     We do two queries, first to find total hits and then use total hits
     to do the second query. The reason for this is that a huge size value
     decrease Elasticsearch query performance by a huge margin.
     """
     folder = Folder.get_instance(node_id, decode=True)
     if folder:
         index_id = uenc(folder.path.encode('utf-8'))
         max_size = int(self.es_service.conn.get(self.count_url, data={
             "query": {
                 "bool": {
                     "must": [{
                                  "term": {
                                      "parent": index_id
                                  }
                              }]
                 }
             }
         })['hits']['total'])
         search_url = u'{idx_name}/node/_search'.format(
             idx_name=self.idx_name)
         results = self.es_service.conn.get(search_url, data={
             "from": 0,
             "size": max_size,
             "fields": [],
             "query": {
                 "bool": {
                     "must": [{
                                  "term": {
                                      "parent": index_id
                                  }
                              }]
                 }
             }
         })
         es_node_ids = {doc['_id'] for doc in results['hits']['hits']}
         disk_nodes = {node.index_id: node for node in (
             folder.folders + folder.files)}
         disk_node_ids = set(disk_nodes.keys())
         deleted_docs = es_node_ids - disk_node_ids
         new_docs = disk_node_ids - es_node_ids
         for doc_to_delete in deleted_docs:
             self.delete_document_by_id(doc_to_delete)
         for new_document in new_docs:
             self.index_document_by_node(disk_nodes[new_document])
         self.flush_index()
     else:
         self.app.logger.error(
             'No folder found by passing node id: {node_id}'.format(
                 node_id=node_id
             ))
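
The docstring above explains the two-step pattern: a first query that only
learns the total hit count, then a second query sized exactly to that count.
A minimal sketch of the pattern as a standalone helper, assuming a
hypothetical es_get(url, body) function (not part of this project) and the
pre-7.x Elasticsearch response shape where hits.total is a plain integer:

 def fetch_all_ids(es_get, search_url, query):
     # First query: ask for zero documents; we only need the hit count.
     total = int(es_get(search_url, {"size": 0, "query": query})
                 ['hits']['total'])
     # Second query: request exactly `total` hits instead of a huge
     # hardcoded size, which would degrade query performance.
     results = es_get(search_url, {"from": 0, "size": total, "fields": [],
                                   "query": query})
     return {doc['_id'] for doc in results['hits']['hits']}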
Code example #2
File: user_data_service.py  Project: olavgg/py-sth
 def index_files(self, folder):
     """ Index the files in folder """
     folder_instance = Folder.get_instance(folder['path'], user=self.user)
     folder_files = folder_instance.files
     file_bulk = []
     for file_obj in folder_files:
         file_bulk.append({'index': {'_id': file_obj.index_id}})
         fdata = {
             'name': file_obj.name,
             'parent': folder_instance.index_id,
             'date_modified': file_obj.date_modified,
             'size': file_obj.get_size(),
             'type': file_obj.type
         }
         file_bulk.append(fdata)
     if file_bulk:
         self.es_service.bulk_insert(self.bulk_insert_url, file_bulk)
     # Two bulk lines per file (action + source), so this is 2 * file count.
     return len(file_bulk)
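
For context, Elasticsearch's _bulk endpoint takes newline-delimited JSON:
one action line followed by one source line per document, which is exactly
the interleaving file_bulk builds above. A sketch of how such a list would
serialize to a bulk body (the document fields are made up; bulk_insert is
assumed to send something equivalent):

 import json

 file_bulk = [
     {'index': {'_id': 'abc123'}},                  # action line
     {'name': 'report.pdf', 'parent': 'root-id',    # source line
      'date_modified': '2013-01-01T00:00:00', 'size': 2048, 'type': 'file'},
 ]
 # One JSON object per line, and the body must end with a newline.
 body = '\n'.join(json.dumps(line) for line in file_bulk) + '\n'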
Code example #3
File: user_data_service.py  Project: olavgg/py-sth
 def do_full_sync(self):
     """
     Do a full sync of the user filesystem. Find all folders from the
     user's filesystem compare them one by one. Then fetch all folders in
     ES and compare them for a cleanup. In case some folders have been 
     renamed.
     """
     self.disable_realtime_indexing()
     disk_folders = Folder.find_all_folders(self.user)
     es_data = {"docs": []}
     for folder in disk_folders:
         es_data["docs"].append({
             "_index": self.idx_name,
             "_type": "node",
             "_id": folder.index_id,
             "fields": ["_id"]
         })
     results = self.es_service.conn.get('_mget', data=es_data)
     if results['status'] == 200:
         es_results = [d_id['_id'] for d_id in results['docs']
                       if d_id['exists']]
         for folder in disk_folders:
             if folder.index_id in es_results:
                 self.app.logger.debug(u'Syncing folder: {f}'.format(
                     f=folder.sys_path))
                 self.do_folder_sync(folder.index_id)
             else:
                 self.app.logger.debug(u'Created folder: {f}'.format(
                     f=folder.sys_path))
                 self.index_folders_and_files(folder=folder)
     else:
         self.app.logger.error(u"Couldn't fetch documents. Full sync stopped.")
         return
     max_size = int(self.es_service.conn.get(self.count_url, data={
         "query": {
             "bool": {
                 "must": [{
                              "term": {
                                  "type": Node.FOLDER_TYPE
                              }
                          }]
             }
         }
     })['hits']['total'])
     search_url = '{idx_name}/_search'.format(idx_name=self.idx_name)
     es_docs = self.es_service.conn.get(search_url, data={
         "from": 0,
         "size": max_size,
         "fields": [],
         "query": {
             "bool": {
                 "must": [{
                              "term": {
                                  "type": Node.FOLDER_TYPE
                              }
                          }]
             }
         }
     })['hits']['hits']
     es_docs = {doc['_id']: doc for doc in es_docs}
     folder_nodes = {folder.index_id: folder for folder in disk_folders}
     es_folders = set(es_docs.keys())
     folders = set(folder_nodes.keys())
     deleted_docs = es_folders - folders
     deleted_ids = []
     for doc_to_delete in deleted_docs:
         # Delete the folder's children first; if the folder's own document
         # was not among them, delete it by id as well.
         deleted_ids += self.delete_document_by_parent_id(doc_to_delete)
         if doc_to_delete not in deleted_ids:
             deleted_ids.append(
                 self.delete_document_by_id(doc_to_delete))
     self.app.logger.debug(
         u'Deleted nodes with id like:\n {name}'.format(name=deleted_ids))
     self.enable_realtime_indexing()
     self.optimize_index()
     self.flush_index()
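
do_full_sync leans on two ideas: an _mget round trip to learn which disk
folders already exist in the index, and the same set-difference cleanup used
in do_folder_sync. A self-contained sketch of both steps; the response dict
is illustrative and uses the old-style exists flag this code expects (newer
Elasticsearch versions call it found):

 results = {
     'status': 200,
     'docs': [
         {'_id': 'folder-a', 'exists': True},   # already indexed -> sync
         {'_id': 'folder-b', 'exists': False},  # missing from ES -> index
     ],
 }
 es_results = [doc['_id'] for doc in results['docs'] if doc['exists']]
 assert es_results == ['folder-a']

 # Cleanup: anything indexed as a folder but no longer on disk is deleted.
 es_folders = {'folder-a', 'folder-stale'}   # folder _ids found in ES
 folders = {'folder-a', 'folder-b'}          # index_ids found on disk
 deleted_docs = es_folders - folders
 assert deleted_docs == {'folder-stale'}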