Ejemplo n.º 1
0
Archivo: crawler.py Proyecto: 01-/aleph
 def foreign_id_exists(self, source, foreign_id):
     """Return True if a document with this foreign ID already exists in source."""
     query = (Document.all_ids()
              .filter(Document.source_id == source.id)
              .filter(Document.foreign_id == foreign_id))
     found = query.first() is not None
     if found:
         log.info("Foreign ID exists (%s): %s", source, foreign_id)
     return found
Ejemplo n.º 2
0
 def foreign_id_exists(self, source, foreign_id):
     """Check whether the given foreign ID is already present within source."""
     candidates = Document.all_ids()
     candidates = candidates.filter(Document.source_id == source.id)
     candidates = candidates.filter(Document.foreign_id == foreign_id)
     if candidates.first() is None:
         return False
     log.info("Foreign ID exists (%s): %s", source, foreign_id)
     return True
Ejemplo n.º 3
0
Archivo: util.py Proyecto: 01-/aleph
 def load_fixtures(self, file_name, process_documents=True):
     """Load the named fixture file into the DB, optionally analyzing docs."""
     fixture_path = os.path.abspath(os.path.join(FIXTURES, file_name))
     # Calls the module-level load_fixtures helper, not this method.
     load_fixtures(db, loaders.load(fixture_path))
     db.session.commit()
     if not process_documents:
         return
     for (doc_id,) in Document.all_ids():
         analyze_document(doc_id)
     optimize_search()
Ejemplo n.º 4
0
 def load_fixtures(self, file_name, process_documents=True):
     """Import fixtures from file_name; analyze each document unless disabled."""
     path = self.get_fixture_path(file_name)
     # Calls the module-level load_fixtures helper, not this method.
     load_fixtures(db, loaders.load(path))
     db.session.commit()
     if process_documents:
         for (doc_id,) in Document.all_ids():
             analyze_document(doc_id)
         optimize_search()
Ejemplo n.º 5
0
 def load_fixtures(self, file_name, process_documents=True):
     """Load a fixture file and, by default, run document analysis afterwards."""
     load_fixtures(db, loaders.load(self.get_fixture_path(file_name)))
     db.session.commit()
     if not process_documents:
         return
     for row in Document.all_ids():
         analyze_document(row[0])
     optimize_search()
Ejemplo n.º 6
0
def retry():
    """Retry importing documents which were not successfully parsed."""
    pending = Document.all_ids()
    pending = pending.filter(Document.status != Document.STATUS_SUCCESS)
    log.info("Retry: %s documents", pending.count())
    for count, (doc_id,) in enumerate(pending.all(), 1):
        ingest.delay(doc_id)
        # Emit periodic progress so long runs stay observable.
        if count % 1000 == 0:
            log.info("Process: %s documents...", count)
Ejemplo n.º 7
0
def index(foreign_id=None):
    """Queue indexing for one source's documents, or for every document.

    When no foreign_id is given, entities are re-indexed afterwards.
    """
    docs = Document.all_ids()
    if foreign_id:
        source = Source.by_foreign_id(foreign_id)
        if source is None:
            raise ValueError("No such source: %r" % foreign_id)
        docs = docs.filter(Document.source_id == source.id)
    for (doc_id,) in docs:
        index_document.delay(doc_id)
    if foreign_id is None:
        reindex_entities()
Ejemplo n.º 8
0
def index(foreign_id=None):
    """Queue indexing for a single collection's documents, or for all.

    Entities are re-indexed afterwards when no foreign_id is given.
    """
    docs = Document.all_ids()
    if foreign_id:
        collection = Collection.by_foreign_id(foreign_id)
        if collection is None:
            raise ValueError("No such collection: %r" % foreign_id)
        docs = docs.filter(
            Document.collections.any(Collection.id == collection.id))
    for (doc_id,) in docs:
        index_document_id.delay(doc_id)
    if foreign_id is None:
        reindex_entities()
Ejemplo n.º 9
0
def index(foreign_id=None):
    """Queue indexing for one source, or rebuild the whole search index.

    Without a foreign_id the index is dropped and re-initialized first.
    """
    docs = Document.all_ids()
    if not foreign_id:
        # Full rebuild: wipe and re-create the search index up front.
        delete_index()
        init_search()
    else:
        source = Source.by_foreign_id(foreign_id)
        if source is None:
            raise ValueError("No such source: %r" % foreign_id)
        docs = docs.filter(Document.source_id == source.id)
    for (doc_id,) in docs:
        index_document.delay(doc_id)
Ejemplo n.º 10
0
def retry(foreign_id=None):
    """Retry importing documents which were not successfully parsed.

    :param foreign_id: restrict the retry to a single collection's
        documents; retries across all collections when omitted.
    :raises ValueError: if no collection matches ``foreign_id``.
    """
    q = Document.all_ids()
    q = q.filter(Document.status != Document.STATUS_SUCCESS)
    if foreign_id is not None:
        collection = Collection.by_foreign_id(foreign_id)
        # Fail with a clear error instead of an AttributeError below;
        # matches how the index() commands report unknown collections.
        if collection is None:
            raise ValueError("No such collection: %r" % foreign_id)
        q = q.filter(Document.collection_id == collection.id)

    log.info("Retry: %s documents", q.count())
    for idx, (doc_id, ) in enumerate(q.all(), 1):
        # Low priority so retries don't starve fresh ingests.
        ingest.apply_async([doc_id], priority=1)
        if idx % 1000 == 0:
            log.info("Process: %s documents...", idx)
Ejemplo n.º 11
0
def index(foreign_id=None):
    """Queue indexing for a collection's documents, or for every document.

    Documents are processed newest-first; entities are re-indexed
    afterwards when no foreign_id is given.
    """
    docs = Document.all_ids()
    # Re-index newest document first.
    docs = docs.order_by(Document.id.desc())
    if foreign_id:
        collection = Collection.by_foreign_id(foreign_id)
        if collection is None:
            raise ValueError("No such collection: %r" % foreign_id)
        docs = docs.filter(Document.collection_id == collection.id)
    for (doc_id,) in docs.yield_per(10000):
        index_document_id.delay(doc_id)
    if foreign_id is None:
        reindex_entities()
Ejemplo n.º 12
0
def index(foreign_id=None):
    """Queue indexing for a collection's documents, or for every document.

    Processes newest documents first and logs progress every 1000 items;
    re-indexes entities afterwards when no foreign_id is given.
    """
    docs = Document.all_ids()
    # Re-index newest document first.
    docs = docs.order_by(Document.id.desc())
    if foreign_id:
        collection = Collection.by_foreign_id(foreign_id)
        if collection is None:
            raise ValueError("No such collection: %r" % foreign_id)
        docs = docs.filter(Document.collection_id == collection.id)
    for count, (doc_id,) in enumerate(docs.yield_per(5000), 1):
        index_document_id.delay(doc_id)
        if count % 1000 == 0:
            log.info("Index: %s documents...", count)
    if foreign_id is None:
        reindex_entities()
Ejemplo n.º 13
0
def index(foreign_id=None, immediate=False):
    """Index documents in the given source (or throughout).

    :param foreign_id: restrict indexing to a single source.
    :param immediate: index synchronously in-process instead of
        queueing a task per document.
    :raises ValueError: if no source matches ``foreign_id``.
    """
    q = Document.all_ids()
    if foreign_id:
        source = Source.by_foreign_id(foreign_id)
        if source is None:
            raise ValueError("No such source: %r" % foreign_id)
        q = q.filter(Document.source_id == source.id)
    for doc_id, in q:
        print('indexing %s' % doc_id)
        if immediate:
            # Bypass the task queue and index in the current process.
            index_document(doc_id)
        else:
            index_document.delay(doc_id)
    if foreign_id is None:
        reindex_entities()