def foreign_id_exists(self, source, foreign_id):
    """Return True if a document with this foreign ID is already in the source.

    Logs a note when a duplicate is detected so imports can be audited.
    """
    query = Document.all_ids()
    # Both criteria passed to a single filter() are AND-ed together.
    query = query.filter(Document.source_id == source.id,
                         Document.foreign_id == foreign_id)
    found = query.first() is not None
    if found:
        log.info("Foreign ID exists (%s): %s", source, foreign_id)
    return found
def load_fixtures(self, file_name, process_documents=True):
    """Load a fixture file from FIXTURES into the database.

    When ``process_documents`` is True, every document is analyzed and the
    search index is optimized afterwards.
    """
    path = os.path.abspath(os.path.join(FIXTURES, file_name))
    load_fixtures(db, loaders.load(path))
    db.session.commit()
    if not process_documents:
        return
    for (doc_id,) in Document.all_ids():
        analyze_document(doc_id)
    optimize_search()
def load_fixtures(self, file_name, process_documents=True):
    """Load the named fixture file into the database.

    When ``process_documents`` is True, every document is analyzed and the
    search index is optimized afterwards.
    """
    fixture_path = self.get_fixture_path(file_name)
    load_fixtures(db, loaders.load(fixture_path))
    db.session.commit()
    if not process_documents:
        return
    for (doc_id,) in Document.all_ids():
        analyze_document(doc_id)
    optimize_search()
def retry():
    """Retry importing documents which were not successfully parsed."""
    failed = Document.all_ids()
    failed = failed.filter(Document.status != Document.STATUS_SUCCESS)
    log.info("Retry: %s documents", failed.count())
    for count, (doc_id,) in enumerate(failed.all(), 1):
        ingest.delay(doc_id)
        # Emit a progress marker every thousand documents.
        if count % 1000 == 0:
            log.info("Process: %s documents...", count)
def index(foreign_id=None):
    """Index documents in the given source (or throughout)."""
    query = Document.all_ids()
    if foreign_id:
        source = Source.by_foreign_id(foreign_id)
        if source is None:
            raise ValueError("No such source: %r" % foreign_id)
        query = query.filter(Document.source_id == source.id)
    for (doc_id,) in query:
        index_document.delay(doc_id)
    # A full (unscoped) run also rebuilds the entity index.
    if foreign_id is None:
        reindex_entities()
def index(foreign_id=None):
    """Index documents in the given collection (or throughout)."""
    query = Document.all_ids()
    if foreign_id:
        collection = Collection.by_foreign_id(foreign_id)
        if collection is None:
            raise ValueError("No such collection: %r" % foreign_id)
        # Documents relate to collections many-to-many; match membership.
        query = query.filter(
            Document.collections.any(Collection.id == collection.id))
    for (doc_id,) in query:
        index_document_id.delay(doc_id)
    # A full (unscoped) run also rebuilds the entity index.
    if foreign_id is None:
        reindex_entities()
def index(foreign_id=None):
    """Index documents in the given source (or throughout)."""
    query = Document.all_ids()
    if foreign_id:
        source = Source.by_foreign_id(foreign_id)
        if source is None:
            raise ValueError("No such source: %r" % foreign_id)
        query = query.filter(Document.source_id == source.id)
    else:
        # Unscoped run: wipe and recreate the search index first.
        delete_index()
        init_search()
    for (doc_id,) in query:
        index_document.delay(doc_id)
def retry(foreign_id=None):
    """Retry importing documents which were not successfully parsed.

    :param foreign_id: optionally restrict the retry to one collection.
    :raises ValueError: if ``foreign_id`` does not match any collection.
    """
    q = Document.all_ids()
    q = q.filter(Document.status != Document.STATUS_SUCCESS)
    if foreign_id is not None:
        collection = Collection.by_foreign_id(foreign_id)
        # Fix: guard against an unknown foreign ID — previously this fell
        # through to ``collection.id`` and raised AttributeError on None.
        if collection is None:
            raise ValueError("No such collection: %r" % foreign_id)
        q = q.filter(Document.collection_id == collection.id)
    log.info("Retry: %s documents", q.count())
    for idx, (doc_id,) in enumerate(q.all(), 1):
        # Low priority so retries don't starve fresh ingests.
        ingest.apply_async([doc_id], priority=1)
        if idx % 1000 == 0:
            log.info("Process: %s documents...", idx)
def index(foreign_id=None):
    """Index documents in the given collection (or throughout)."""
    query = Document.all_ids()
    # Re-index newest document first.
    query = query.order_by(Document.id.desc())
    if foreign_id:
        collection = Collection.by_foreign_id(foreign_id)
        if collection is None:
            raise ValueError("No such collection: %r" % foreign_id)
        query = query.filter(Document.collection_id == collection.id)
    # Stream IDs from the database in batches rather than all at once.
    for (doc_id,) in query.yield_per(10000):
        index_document_id.delay(doc_id)
    # A full (unscoped) run also rebuilds the entity index.
    if foreign_id is None:
        reindex_entities()
def index(foreign_id=None):
    """Index documents in the given collection (or throughout)."""
    query = Document.all_ids()
    # Re-index newest document first.
    query = query.order_by(Document.id.desc())
    if foreign_id:
        collection = Collection.by_foreign_id(foreign_id)
        if collection is None:
            raise ValueError("No such collection: %r" % foreign_id)
        query = query.filter(Document.collection_id == collection.id)
    # Stream IDs in DB batches; log a progress marker every 1000 docs.
    for count, (doc_id,) in enumerate(query.yield_per(5000), 1):
        index_document_id.delay(doc_id)
        if count % 1000 == 0:
            log.info("Index: %s documents...", count)
    # A full (unscoped) run also rebuilds the entity index.
    if foreign_id is None:
        reindex_entities()
def index(foreign_id=None, immediate=False):
    """Index documents in the given source (or throughout).

    :param foreign_id: optionally restrict indexing to a single source.
    :param immediate: if True, index synchronously in-process instead of
        dispatching to the task queue.
    :raises ValueError: if ``foreign_id`` does not match any source.
    """
    q = Document.all_ids()
    if foreign_id:
        source = Source.by_foreign_id(foreign_id)
        if source is None:
            raise ValueError("No such source: %r" % foreign_id)
        q = q.filter(Document.source_id == source.id)
    for doc_id, in q:
        # Fix: removed commented-out sleep debris and routed the per-document
        # progress message through the module logger instead of print().
        log.info("indexing %s", doc_id)
        if immediate:
            # Bypass the queue and run the indexing task in-process.
            index_document(doc_id)
        else:
            index_document.delay(doc_id)
    # A full (unscoped) run also rebuilds the entity index.
    if foreign_id is None:
        reindex_entities()