import gzip
import json
import os
import tempfile
from copy import deepcopy


def _get_export_documents_from_file(dump_path, doc_count):
    """Mimic the results of an ES scroll query, but read results from a gzipped jsonlines file"""
    def _doc_iter():
        with gzip.open(dump_path) as file:
            for line in file:
                yield json.loads(line.decode())
        os.remove(dump_path)  # remove the dump only once it has been fully read

    return ScanResult(doc_count, _doc_iter())
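# ScanResult is not defined in these snippets. Judging by how it is built
# throughout (a hit count plus a lazy document iterator), a minimal sketch of
# the assumed shape looks like this:
class ScanResult(object):

    def __init__(self, count, iterator):
        self.count = count
        self.iterator = iterator

    def __iter__(self):
        return iter(self.iterator)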
def scroll(self):
    """
    Run the query against the scroll API.
    Returns an iterator yielding each document that matches the query.
    """
    result = scroll_query(self.index, self.raw_query)
    # note: this deepcopies the query once per result row
    return ScanResult(
        result.count,
        (ESQuerySet.normalize_result(deepcopy(self), r) for r in result)
    )
def scroll(self):
    """
    Run the query against the scroll API.
    Returns an iterator yielding each document that matches the query.
    """
    query = deepcopy(self)  # copy the query once, up front
    if query._size is None:
        query._size = SCROLL_PAGE_SIZE_LIMIT
    result = scroll_query(query.index, query.raw_query)
    return ScanResult(
        result.count,
        (ESQuerySet.normalize_result(query, r) for r in result)
    )
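# scroll_query is not shown here. Both scroll() variants assume it returns an
# object exposing a .count plus an iterator of raw hits. A rough sketch of
# that contract using the elasticsearch-py 7.x scan helper (the client wiring
# and query extraction are assumptions, not the original implementation):
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan


def scroll_query(index, raw_query, es=None):
    es = es or Elasticsearch()
    count_body = {'query': raw_query.get('query', {'match_all': {}})}
    count = es.count(index=index, body=count_body)['count']
    return ScanResult(count, scan(es, query=raw_query, index=index))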
def get_export_documents(export_instance, filters):
    # Pull doc ids from elasticsearch and stream to disk
    query = _get_export_query(export_instance, filters)
    _, temp_path = tempfile.mkstemp()
    with open(temp_path, 'w') as f:
        scroll_result = query.scroll_ids()
        for doc_id in scroll_result:
            f.write(doc_id + '\n')

    def iter_export_docs():
        # Stream doc ids from disk and fetch documents from ES in chunks
        with open(temp_path) as f:
            doc_ids = (doc_id.strip() for doc_id in f)
            for doc in iter_es_docs(query.index, doc_ids):
                yield doc
        # note: cleanup only runs if the generator is fully consumed
        os.remove(temp_path)

    return ScanResult(scroll_result.count, iter_export_docs())
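# iter_es_docs is not defined in these snippets. It is assumed to batch the
# streamed doc ids and multi-get each batch from ES. A sketch under that
# assumption (the chunk size, client wiring, and mget call are illustrative,
# not the original code):
from itertools import islice


def iter_es_docs(index, doc_ids, chunk_size=100, es=None):
    es = es or Elasticsearch()
    doc_ids = iter(doc_ids)
    while True:
        chunk = list(islice(doc_ids, chunk_size))
        if not chunk:
            break
        for hit in es.mget(index=index, body={'ids': chunk})['docs']:
            if hit.get('found'):
                yield hit['_source']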
def get_export_documents(export_instance, filters):
    # Pull doc ids from elasticsearch and stream to disk
    query = _get_export_query(export_instance, filters)
    scroll_result = query.scroll_ids()

    def iter_export_docs():
        with TransientTempfile() as temp_path:
            with open(temp_path, 'w', encoding='utf-8') as f:
                for doc_id in scroll_result:
                    f.write(doc_id + '\n')
            # Stream doc ids from disk and fetch documents from ES in chunks
            with open(temp_path, 'r', encoding='utf-8') as f:
                doc_ids = (doc_id.strip() for doc_id in f)
                for doc in iter_es_docs(query.index, doc_ids):
                    yield doc

    return ScanResult(scroll_result.count, iter_export_docs())
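# TransientTempfile is assumed to be a context manager that creates a temp
# file on entry and always deletes it on exit, which is what lets the second
# get_export_documents clean up even when iteration stops early (the `with`
# unwinds when the generator is closed). A minimal sketch under that
# assumption:
from contextlib import contextmanager


@contextmanager
def TransientTempfile():
    fd, path = tempfile.mkstemp()
    os.close(fd)
    try:
        yield path
    finally:
        os.remove(path)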
def scroll(self):
    result_docs = list(self._result_docs)
    total = len(result_docs)
    if self._sort_field:
        result_docs.sort(key=lambda doc: doc[self._sort_field], reverse=self._sort_desc)
    if self._size is not None:
        result_docs = result_docs[self._start:self._start + self._size]
    else:
        result_docs = result_docs[self._start:]

    def _get_doc(doc):
        if self._source_fields:
            return {key: doc[key] for key in self._source_fields if key in doc}
        return doc

    es_query_set = (
        ESQuerySet.normalize_result(self, {'_source': _get_doc(r)})
        for r in result_docs
    )
    return ScanResult(total, es_query_set)
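# A standalone demonstration of the sort/slice semantics of the in-memory
# scroll() above (plain Python; the sample docs are illustrative):
docs = [{'id': 3}, {'id': 1}, {'id': 2}]
docs.sort(key=lambda doc: doc['id'], reverse=False)
start, size = 1, 2
page = docs[start:start + size]
assert page == [{'id': 2}, {'id': 3}]
# the returned total stays len(docs): the count reflects all matching
# documents, not just the requested page
assert len(docs) == 3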