Example #1
0
def _get_export_documents_from_file(dump_path, doc_count):
    """Mimic the results of an ES scroll query but get results from jsonlines file"""
    def _stream_docs():
        # Lazily parse one JSON document per line from the gzipped dump.
        with gzip.open(dump_path) as dump_file:
            for raw_line in dump_file:
                yield json.loads(raw_line.decode())
        # The dump is single-use: delete it once fully consumed.
        os.remove(dump_path)

    return ScanResult(doc_count, _stream_docs())
Example #2
0
 def scroll(self):
     """
     Run the query against the scroll api. Returns an iterator yielding each
     document that matches the query.
     """
     # Hoist deepcopy out of the generator expression: the original copied
     # `self` once per result document, which is loop-invariant work — one
     # copy serves every normalize_result call.
     query = deepcopy(self)
     result = scroll_query(self.index, self.raw_query)
     return ScanResult(result.count,
                       (ESQuerySet.normalize_result(query, r)
                        for r in result))
Example #3
0
 def scroll(self):
     """
     Run the query against the scroll api. Returns an iterator yielding each
     document that matches the query.
     """
     scroll_q = deepcopy(self)
     # Apply the scroll page-size cap when the caller did not set a size.
     if scroll_q._size is None:
         scroll_q._size = SCROLL_PAGE_SIZE_LIMIT
     raw_result = scroll_query(scroll_q.index, scroll_q.raw_query)
     normalized = (ESQuerySet.normalize_result(scroll_q, hit)
                   for hit in raw_result)
     return ScanResult(raw_result.count, normalized)
Example #4
0
def get_export_documents(export_instance, filters):
    """Spool export doc ids to disk, then fetch the documents from ES lazily.

    Returns a ScanResult whose iterator streams doc ids back from a temp file
    and fetches the matching documents in chunks, so the full result set is
    never held in memory at once.
    """
    # Pull doc ids from elasticsearch and stream to disk
    query = _get_export_query(export_instance, filters)
    fd, temp_path = tempfile.mkstemp()
    # mkstemp returns an already-open file descriptor; the original discarded
    # it (`_, temp_path = ...`), leaking an fd for the life of the process.
    # Wrapping it with os.fdopen reuses and then closes that descriptor.
    with os.fdopen(fd, 'w') as f:
        scroll_result = query.scroll_ids()
        for doc_id in scroll_result:
            f.write(doc_id + '\n')

    def iter_export_docs():
        # Stream doc ids from disk and fetch documents from ES in chunks
        with open(temp_path) as f:
            doc_ids = (doc_id.strip() for doc_id in f)
            for doc in iter_es_docs(query.index, doc_ids):
                yield doc
        # NOTE(review): the temp file is only removed when the iterator is
        # fully consumed — an abandoned iterator leaves it on disk.
        os.remove(temp_path)

    return ScanResult(scroll_result.count, iter_export_docs())
Example #5
0
def get_export_documents(export_instance, filters):
    """Spool export doc ids to a transient temp file and stream docs from ES."""
    # Pull doc ids from elasticsearch and stream to disk
    query = _get_export_query(export_instance, filters)
    scroll_result = query.scroll_ids()

    def iter_export_docs():
        with TransientTempfile() as temp_path:
            # First pass: write every doc id, one per line.
            with open(temp_path, 'w', encoding='utf-8') as id_file:
                id_file.writelines(doc_id + '\n' for doc_id in scroll_result)

            # Stream doc ids from disk and fetch documents from ES in chunks
            with open(temp_path, 'r', encoding='utf-8') as id_file:
                stripped_ids = (line.strip() for line in id_file)
                yield from iter_es_docs(query.index, stripped_ids)

    return ScanResult(scroll_result.count, iter_export_docs())
Example #6
0
    def scroll(self):
        # In-memory stand-in for the ES scroll API over self._result_docs.
        docs = list(self._result_docs)
        total = len(docs)  # count is taken before start/size windowing

        if self._sort_field:
            docs = sorted(docs, key=lambda d: d[self._sort_field],
                          reverse=self._sort_desc)

        # Apply the start/size window; an unset size means "to the end".
        stop = None if self._size is None else self._start + self._size
        docs = docs[self._start:stop]

        def _project(doc):
            # Restrict to the requested _source fields, when any were set.
            if not self._source_fields:
                return doc
            return {field: doc[field]
                    for field in self._source_fields if field in doc}

        hits = (ESQuerySet.normalize_result(self, {'_source': _project(d)})
                for d in docs)
        return ScanResult(total, hits)