Exemple #1
0
def init():
    """Create or upgrade the search index and database."""
    # The steps run strictly in order: the database must exist before the
    # search index is initialised and upgraded on top of it.
    for step in (upgrade_db, init_search, upgrade_search, install_analyzers):
        step()
    get_archive().upgrade()
Exemple #2
0
    def dispatch(cls, source_id, meta):
        """Select the best ingestor for the file described by ``meta`` and run it.

        NOTE(review): appears to be a classmethod on an ingestor base class
        (``cls.auction_file`` ranks candidate subclasses) -- confirm against
        the enclosing class, which is outside this view.
        """
        local_path = get_archive().load_file(meta)
        best_cls = cls.auction_file(meta, local_path)
        if best_cls is None:
            # No ingestor volunteered for this file type: record the failure
            # in the process log and bail out without raising.
            message = "No ingestor found: %r" % meta.file_name
            process.log(process.INGEST,
                        component=cls.__name__,
                        meta=meta,
                        source_id=source_id,
                        error_type='NoIngestorFound',
                        error_message=message)
            return

        log.debug("Dispatching %r to %r", meta.file_name, best_cls.__name__)
        try:
            best_cls(source_id).ingest(meta, local_path)
        except Exception as ex:
            # Ingestion failures are logged and reported, never propagated.
            log.exception(ex)
            process.exception(process.INGEST,
                              component=best_cls.__name__,
                              exception=ex,
                              meta=meta,
                              source_id=source_id)
        finally:
            # Always drop the locally materialised copy of the file.
            get_archive().cleanup_file(meta)
Exemple #3
0
def init():
    """Create or upgrade the search index and database.

    Bootstrap steps run in dependency order: database first, then the
    search index, its upgrades and analyzers, and finally the file archive.
    """
    upgrade_db()
    init_search()
    upgrade_search()
    install_analyzers()
    get_archive().upgrade()
Exemple #4
0
 def dispatch(cls, source_id, meta):
     """Run the best-matching ingestor for ``meta`` and record the outcome.

     NOTE(review): indentation suggests this is a method of a class outside
     this view; ``auction_file`` / ``handle_exception`` live on that class.
     """
     local_path = get_archive().load_file(meta)
     try:
         best_cls = cls.auction_file(meta, local_path)
         log.debug("Dispatching %r to %r", meta.file_name, best_cls)
         best_cls(source_id).ingest(meta, local_path)
         # Mark the crawl state as successful and persist it.
         CrawlerState.store_ok(meta, source_id)
         db.session.commit()
     except Exception as exception:
         # All failures funnel into the class-level exception handler.
         cls.handle_exception(meta, source_id, exception)
     finally:
         # Always remove the local working copy of the file.
         get_archive().cleanup_file(meta)
Exemple #5
0
 def dispatch(cls, collection_id, meta):
     """Run the best-matching ingestor for ``meta`` within a collection.

     NOTE(review): indentation suggests this is a method of a class outside
     this view; ``auction_file`` / ``handle_exception`` live on that class.
     """
     local_path = get_archive().load_file(meta)
     try:
         best_cls = cls.auction_file(meta, local_path)
         log.debug("Dispatching %r to %r", meta.file_name, best_cls)
         best_cls(collection_id).ingest(meta, local_path)
         # Record the successful crawl and persist that state.
         CrawlerState.store_ok(meta, collection_id)
         db.session.commit()
     except Exception as exc:
         # All failures funnel into the class-level exception handler.
         cls.handle_exception(meta, collection_id, exc)
     finally:
         # Always remove the local working copy of the file.
         get_archive().cleanup_file(meta)
Exemple #6
0
def file(document_id):
    """Serve a document's source file, redirecting to a hosted URL if any."""
    document = get_document(document_id)
    enable_cache(server_side=True)
    url = get_archive().generate_url(document.meta)
    if url is not None:
        return redirect(url)
    # No externally hosted copy; stream the file from local archive storage.
    local_path = get_archive().load_file(document.meta)
    return send_file(open(local_path, 'rb'),
                     as_attachment=True,
                     attachment_filename=document.meta.file_name,
                     mimetype=document.meta.mime_type)
Exemple #7
0
def pdf(document_id):
    """Serve the PDF rendition of a text document."""
    document = get_document(document_id)
    enable_cache(server_side=True)
    if document.type != Document.TYPE_TEXT:
        raise BadRequest("PDF is only available for text documents")
    pdf = document.meta.pdf
    url = get_archive().generate_url(pdf)
    if url is not None:
        return redirect(url)
    # No hosted copy available; stream the PDF from local archive storage.
    local_path = get_archive().load_file(pdf)
    return send_file(open(local_path, 'rb'), mimetype=pdf.mime_type)
Exemple #8
0
def file(document_id):
    """Serve a document's source file as a download.

    Prefers a direct archive URL when one can be generated; otherwise
    streams the file from local storage.
    """
    document = get_document(document_id)
    enable_cache(server_side=True)
    url = get_archive().generate_url(document.meta)
    if url is not None:
        return redirect(url)

    local_path = get_archive().load_file(document.meta)
    # Handle is presumably closed by send_file once streamed -- verify.
    fh = open(local_path, 'rb')
    return send_file(fh,
                     as_attachment=True,
                     attachment_filename=document.meta.file_name,
                     mimetype=document.meta.mime_type)
Exemple #9
0
def pdf(document_id):
    """Serve the PDF rendition of a text document.

    Prefers a direct archive URL; otherwise streams the PDF from local
    storage. Non-text documents have no PDF rendition.
    """
    document = get_document(document_id)
    enable_cache(server_side=True)
    if document.type != Document.TYPE_TEXT:
        raise BadRequest("PDF is only available for text documents")
    pdf = document.meta.pdf
    url = get_archive().generate_url(pdf)
    if url is not None:
        return redirect(url)

    local_path = get_archive().load_file(pdf)
    # Handle is presumably closed by send_file once streamed -- verify.
    fh = open(local_path, 'rb')
    return send_file(fh, mimetype=pdf.mime_type)
Exemple #10
0
def ingest_url(self, collection_id, metadata, url):
    """Fetch ``url`` into a temp file, archive it and dispatch ingestion.

    Bound as a (Celery-style) task method: ``self.retry`` re-queues the
    task on transient HTTP or I/O failures. A 404 is treated as permanent
    and silently skipped.
    """
    meta = Metadata.from_data(metadata)
    tmp_path = make_tempfile(meta.file_name, suffix=meta.extension)
    try:
        log.info("Ingesting URL: %s", url)
        res = requests.get(url, stream=True)
        if res.status_code == 404:
            # A missing remote document is not retryable.
            log.info("HTTP not found: %s", url)
            return
        if res.status_code >= 400:
            # BUG FIX: was ``>= 399`` (not an error-class boundary) with
            # ``3600 ** retries`` -- a 1-second delay on the first retry,
            # then ~150 days on the second. Use a linear hourly backoff.
            self.retry(countdown=3600 * (self.request.retries + 1))
        # BUG FIX: response chunks are bytes; the file must be opened in
        # binary mode ('w' raises TypeError on Python 3).
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = get_archive().archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(collection_id, meta)
    except IOError as ioe:
        log.info("IO Failure: %r", ioe)
        self.retry(countdown=3600 * (self.request.retries + 1))
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)
Exemple #11
0
def ingest_url(source_id, metadata, url):
    """Download ``url`` and queue it for ingestion under ``source_id``."""
    clear_session()
    meta = Metadata(data=metadata)
    try:
        with NamedTemporaryFile() as fh:
            log.info("Ingesting URL: %r", url)
            res = requests.get(url, stream=True)
            if res.status_code >= 400:
                # NOTE(review): HTTP errors are logged but the (error) body
                # is still downloaded and archived below -- confirm this
                # best-effort behaviour is intended.
                log.error("Error ingesting %r: %r", url, res.status_code)
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
            fh.flush()
            if not meta.has('source_url'):
                meta.source_url = res.url
            meta.headers = res.headers
            # move=True: the temporary file is consumed by the archive.
            meta = get_archive().archive_file(fh.name, meta, move=True)
    except Exception as ex:
        # Report the failure to the process log and give up on this URL.
        log.exception(ex)
        process.exception(process.INGEST,
                          component='ingest_url',
                          source_id=source_id,
                          meta=meta,
                          exception=ex)
        return
    ingest.delay(source_id, meta.data)
Exemple #12
0
def pdf(document_id):
    """Serve the PDF rendition of a text document."""
    document = get_document(document_id)
    enable_cache(server_side=True)
    log_event(request, document_id=document.id)
    if document.type != Document.TYPE_TEXT:
        raise BadRequest("PDF is only available for text documents")
    pdf = document.meta.pdf
    url = get_archive().generate_url(pdf)
    if url is not None:
        return redirect(url)
    # No hosted copy; load from local storage, mapping any failure to a 404.
    try:
        fh = open(get_archive().load_file(pdf), 'rb')
    except Exception as ex:
        raise NotFound("Missing PDF file: %r" % ex)
    return send_file(fh, mimetype=pdf.mime_type)
Exemple #13
0
def pdf(document_id):
    """Serve the PDF rendition of a text document.

    Prefers a direct archive URL; otherwise streams the PDF from local
    storage, turning any load failure into a 404.
    """
    document = get_document(document_id)
    enable_cache(server_side=True)
    log_event(request, document_id=document.id)
    if document.type != Document.TYPE_TEXT:
        raise BadRequest("PDF is only available for text documents")
    pdf = document.meta.pdf
    url = get_archive().generate_url(pdf)
    if url is not None:
        return redirect(url)

    try:
        local_path = get_archive().load_file(pdf)
        fh = open(local_path, 'rb')
    except Exception as ex:
        # A missing or unreadable PDF rendition is reported as 404.
        raise NotFound("Missing PDF file: %r" % ex)
    return send_file(fh, mimetype=pdf.mime_type)
Exemple #14
0
def view(document_id):
    """Return document metadata as JSON, including data and PDF URLs."""
    doc = get_document(document_id)
    enable_cache()
    data = doc.to_dict()
    # Prefer a direct archive URL; fall back to the API file endpoint.
    data_url = get_archive().generate_url(doc.meta)
    if data_url is None:
        data_url = url_for('documents_api.file', document_id=document_id)
    data['data_url'] = data_url
    if doc.meta.is_pdf:
        # The source file already is a PDF; reuse its URL.
        data['pdf_url'] = data_url
    else:
        pdf_url = get_archive().generate_url(doc.meta.pdf)
        if pdf_url is None:
            pdf_url = url_for('documents_api.pdf', document_id=document_id)
        data['pdf_url'] = pdf_url
    data['source'] = doc.source
    return jsonify(data)
Exemple #15
0
def view(document_id):
    """Return the document's metadata as JSON, with data/PDF URLs filled in."""
    doc = get_document(document_id)
    enable_cache()
    data = doc.to_dict()
    data['data_url'] = get_archive().generate_url(doc.meta)
    if data['data_url'] is None:
        # No direct archive URL; point at the API's file endpoint instead.
        data['data_url'] = url_for('documents_api.file',
                                   document_id=document_id)
    if doc.meta.is_pdf:
        # The source file already is a PDF; reuse its URL.
        data['pdf_url'] = data['data_url']
    else:
        data['pdf_url'] = get_archive().generate_url(doc.meta.pdf)
        if data['pdf_url'] is None:
            data['pdf_url'] = url_for('documents_api.pdf',
                                      document_id=document_id)
    data['source'] = doc.source
    return jsonify(data)
Exemple #16
0
    def dispatch(cls, source_id, meta):
        """Select the best ingestor for ``meta`` and run it against the file.

        NOTE(review): appears to be a classmethod on an ingestor base class
        (``cls.auction_file`` ranks candidate subclasses) -- confirm against
        the enclosing class, which is outside this view.
        """
        local_path = get_archive().load_file(meta)
        best_cls = cls.auction_file(meta, local_path)
        if best_cls is None:
            # No ingestor volunteered for this file type: record the failure
            # and bail out without raising.
            message = "No ingestor found: %r" % meta.file_name
            process.log(process.INGEST, component=cls.__name__, meta=meta,
                        source_id=source_id, error_type='NoIngestorFound',
                        error_message=message)
            return

        log.debug("Dispatching %r to %r", meta.file_name, best_cls.__name__)
        try:
            best_cls(source_id).ingest(meta, local_path)
        except Exception as ex:
            # Ingestion failures are logged and reported, never propagated.
            log.exception(ex)
            process.exception(process.INGEST, component=best_cls.__name__,
                              exception=ex, meta=meta, source_id=source_id)
        finally:
            # Always drop the locally materialised copy of the file.
            get_archive().cleanup_file(meta)
Exemple #17
0
def ingest_file(source_id, meta, file_path, move=False):
    """Archive ``file_path`` and queue it for asynchronous ingestion.

    ``move=True`` hands the file over to the archive instead of copying.
    Failures are delegated to ``Ingestor.handle_exception``, never raised.
    """
    try:
        if not os.path.isfile(file_path):
            # BUG FIX: the path was passed as a spare constructor argument
            # ("No such file: %r", path) and never interpolated -- exception
            # constructors do not %-format args the way logging calls do.
            raise IngestorException("No such file: %r" % file_path)
        if not meta.has('source_path'):
            meta.source_path = file_path
        meta = get_archive().archive_file(file_path, meta, move=move)
        ingest.delay(source_id, meta.data)
    except Exception as ex:
        Ingestor.handle_exception(meta, source_id, ex)
Exemple #18
0
def view(document_id):
    """Return the document's metadata as JSON, with data/PDF URLs filled in."""
    doc = get_document(document_id)
    enable_cache()
    data = doc.to_dict()
    data['data_url'] = get_archive().generate_url(doc.meta)
    if data['data_url'] is None:
        # No direct archive URL; point at the API's file endpoint instead.
        data['data_url'] = url_for('documents_api.file',
                                   document_id=document_id)
    if doc.meta.is_pdf:
        # The source file already is a PDF; reuse its URL.
        data['pdf_url'] = data['data_url']
    else:
        try:
            data['pdf_url'] = get_archive().generate_url(doc.meta.pdf)
        except Exception as ex:
            # Best effort: a failed URL generation falls through to the
            # API endpoint fallback below.
            log.info('Could not generate PDF url: %r', ex)
        if data.get('pdf_url') is None:
            data['pdf_url'] = url_for('documents_api.pdf',
                                      document_id=document_id)
    data['source'] = doc.source
    return jsonify(data)
Exemple #19
0
def view(document_id):
    """Return document metadata as JSON, including data and PDF URLs."""
    doc = get_document(document_id)
    enable_cache()
    data = doc.to_dict()
    log_event(request, document_id=doc.id)
    # Prefer direct archive URLs, falling back to the API endpoints.
    data_url = get_archive().generate_url(doc.meta)
    if data_url is None:
        data_url = url_for('documents_api.file', document_id=document_id)
    data['data_url'] = data_url
    if doc.meta.is_pdf:
        # The source file already is a PDF; reuse its URL.
        data['pdf_url'] = data_url
    else:
        try:
            data['pdf_url'] = get_archive().generate_url(doc.meta.pdf)
        except Exception as ex:
            log.info('Could not generate PDF url: %r', ex)
        if data.get('pdf_url') is None:
            data['pdf_url'] = url_for('documents_api.pdf',
                                      document_id=document_id)
    return jsonify(data)
Exemple #20
0
def view(document_id):
    """Return document metadata as JSON, including data and PDF URLs."""
    doc = get_document(document_id)
    enable_cache()
    data = doc.to_dict()
    # Prefer direct archive URLs, falling back to the API endpoints.
    data_url = get_archive().generate_url(doc.meta)
    if data_url is None:
        data_url = url_for('documents_api.file', document_id=document_id)
    data['data_url'] = data_url
    if doc.meta.is_pdf:
        # The source file already is a PDF; reuse its URL.
        data['pdf_url'] = data_url
    else:
        try:
            data['pdf_url'] = get_archive().generate_url(doc.meta.pdf)
        except Exception as ex:
            log.info('Could not generate PDF url: %r', ex)
        if data.get('pdf_url') is None:
            data['pdf_url'] = url_for('documents_api.pdf',
                                      document_id=document_id)
    data['source'] = doc.source
    # TODO: restrict data['metadata'] to ALLOWED_METADATA keys?
    return jsonify(data)
Exemple #21
0
def ingest_file(collection_id, meta, file_path, move=False):
    """Archive ``file_path`` and queue it for asynchronous ingestion.

    ``move=True`` hands the file over to the archive instead of copying.
    Failures are delegated to ``Ingestor.handle_exception``, never raised.
    """
    try:
        if not os.path.isfile(file_path):
            # BUG FIX: interpolate the path into the message; exception
            # constructors do not %-format extra args like logging does.
            raise IngestorException("No such file: %r" % file_path)
        if not meta.has('source_path'):
            meta.source_path = file_path
        meta = get_archive().archive_file(file_path, meta, move=move)
        ingest.delay(collection_id, meta.to_attr_dict())
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        # Drop the scoped SQLAlchemy session owned by this task.
        db.session.remove()
Exemple #22
0
def ingest_file(source_id, meta, file_name, move=False):
    """Archive ``file_name`` and queue it for asynchronous ingestion.

    On failure, the error is recorded in the process log and the file is
    not queued; nothing is raised to the caller.
    """
    try:
        if not os.path.isfile(file_name):
            # BUG FIX: interpolate the path into the message; exception
            # constructors do not %-format extra args like logging does.
            raise ValueError("No such file: %r" % file_name)
        if not meta.has('source_path'):
            meta.source_path = file_name
        meta = get_archive().archive_file(file_name, meta, move=move)
    except Exception as ex:
        log.exception(ex)
        process.exception(process.INGEST, component='ingest_url',
                          source_id=source_id, meta=meta, exception=ex)
        return
    ingest.delay(source_id, meta.data)
Exemple #23
0
def ingest_file(source_id, meta, file_name, move=False):
    """Archive ``file_name`` and queue it for asynchronous ingestion.

    On failure, the error is recorded in the process log and the file is
    not queued; nothing is raised to the caller.
    """
    try:
        if not os.path.isfile(file_name):
            # BUG FIX: interpolate the path into the message; exception
            # constructors do not %-format extra args like logging does.
            raise ValueError("No such file: %r" % file_name)
        if not meta.has('source_path'):
            meta.source_path = file_name
        meta = get_archive().archive_file(file_name, meta, move=move)
    except Exception as ex:
        log.exception(ex)
        process.exception(process.INGEST,
                          component='ingest_url',
                          source_id=source_id,
                          meta=meta,
                          exception=ex)
        return
    ingest.delay(source_id, meta.data)
Exemple #24
0
def ingest_file(collection_id, meta, file_path, move=False,
                queue=WORKER_QUEUE, routing_key=WORKER_ROUTING_KEY):
    """Archive ``file_path`` and queue it for asynchronous ingestion.

    The queue and routing key arguments are a workaround to expedite user
    uploads over long-running batch imports.
    """
    try:
        if not os.path.isfile(file_path):
            # BUG FIX: interpolate the path into the message; exception
            # constructors do not %-format extra args like logging does.
            raise IngestorException("No such file: %r" % file_path)
        if not meta.has('source_path'):
            meta.source_path = file_path
        meta = get_archive().archive_file(file_path, meta, move=move)
        ingest.apply_async([collection_id, meta.to_attr_dict()],
                           queue=queue, routing_key=routing_key)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        # Drop the scoped SQLAlchemy session owned by this task.
        db.session.remove()
Exemple #25
0
def ingest_url(source_id, metadata, url):
    """Download ``url`` to a temp file, archive it and dispatch ingestion.

    Failures are delegated to ``Ingestor.handle_exception``, never raised.
    """
    meta = Metadata(data=metadata)
    tmp_path = None
    try:
        fd, tmp_path = mkstemp()
        os.close(fd)
        log.info("Ingesting URL: %r", url)
        res = requests.get(url, stream=True)
        if res.status_code >= 400:
            raise Exception("HTTP Error %r: %r" % (url, res.status_code))
        # BUG FIX: response chunks are bytes; the file must be opened in
        # binary mode ('w' raises TypeError on Python 3).
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = get_archive().archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(source_id, meta)
    except Exception as ex:
        Ingestor.handle_exception(meta, source_id, ex)
    finally:
        # BUG FIX: the mkstemp() file leaked on every path; remove it
        # unless archive_file(move=True) already moved it away.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.remove(tmp_path)
Exemple #26
0
def ingest_file(collection_id,
                meta,
                file_path,
                move=False,
                queue=WORKER_QUEUE,
                routing_key=WORKER_ROUTING_KEY):
    """Archive ``file_path`` and queue it for asynchronous ingestion.

    The queue and routing key arguments are a workaround to expedite user
    uploads over long-running batch imports.
    """
    try:
        if not os.path.isfile(file_path):
            # BUG FIX: interpolate the path into the message; exception
            # constructors do not %-format extra args like logging does.
            raise IngestorException("No such file: %r" % file_path)
        if not meta.has('source_path'):
            meta.source_path = file_path
        meta = get_archive().archive_file(file_path, meta, move=move)
        ingest.apply_async([collection_id, meta.to_attr_dict()],
                           queue=queue,
                           routing_key=routing_key)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        # Drop the scoped SQLAlchemy session owned by this task.
        db.session.remove()
Exemple #27
0
def ingest_url(source_id, metadata, url):
    """Download ``url`` and queue it for ingestion under ``source_id``."""
    clear_session()
    meta = Metadata(data=metadata)
    try:
        with NamedTemporaryFile() as fh:
            log.info("Ingesting URL: %r", url)
            res = requests.get(url, stream=True)
            if res.status_code >= 400:
                # NOTE(review): HTTP errors are logged but the (error) body
                # is still downloaded and archived below -- confirm this
                # best-effort behaviour is intended.
                log.error("Error ingesting %r: %r", url, res.status_code)
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
            fh.flush()
            if not meta.has('source_url'):
                meta.source_url = res.url
            meta.headers = res.headers
            # move=True: the temporary file is consumed by the archive.
            meta = get_archive().archive_file(fh.name, meta, move=True)
    except Exception as ex:
        # Report the failure to the process log and give up on this URL.
        log.exception(ex)
        process.exception(process.INGEST, component='ingest_url',
                          source_id=source_id, meta=meta, exception=ex)
        return
    ingest.delay(source_id, meta.data)
Exemple #28
0
def ingest_url(collection_id, metadata, url):
    """Download ``url`` into a temp file, archive it and dispatch ingestion.

    HTTP errors (status >= 400) and all other failures are delegated to
    ``Ingestor.handle_exception``; the temp file is always cleaned up.
    """
    meta = Metadata.from_data(metadata)
    tmp_path = make_tempfile(meta.file_name, suffix=meta.extension)
    try:
        log.info("Ingesting URL: %r", url)
        res = requests.get(url, stream=True, timeout=120)
        if res.status_code >= 400:
            msg = "HTTP Error %r: %r" % (url, res.status_code)
            raise IngestorException(msg)
        # BUG FIX: response chunks are bytes; the file must be opened in
        # binary mode ('w' raises TypeError on Python 3).
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = get_archive().archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(collection_id, meta)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)
Exemple #29
0
def ingest_url(collection_id, metadata, url):
    """Download ``url`` into a temp file, archive it and dispatch ingestion.

    HTTP errors (status >= 400) and all other failures are delegated to
    ``Ingestor.handle_exception``; the temp file is always cleaned up.
    """
    meta = Metadata.from_data(metadata)
    tmp_path = make_tempfile(meta.file_name, suffix=meta.extension)
    try:
        log.info("Ingesting URL: %r", url)
        res = requests.get(url, stream=True, timeout=120)
        if res.status_code >= 400:
            msg = "HTTP Error %r: %r" % (url, res.status_code)
            raise IngestorException(msg)
        # BUG FIX: response chunks are bytes; the file must be opened in
        # binary mode ('w' raises TypeError on Python 3).
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = get_archive().archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(collection_id, meta)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)
Exemple #30
0
 def store_pdf(self, meta, pdf_path, move=True):
     """Archive the PDF rendition of a document under ``meta.pdf``."""
     archive = get_archive()
     archive.archive_file(pdf_path, meta.pdf, move=move)
Exemple #31
0
 def store_pdf(self, meta, pdf_path):
     """Archive the PDF rendition under ``meta.pdf`` (copies, never moves)."""
     get_archive().archive_file(pdf_path, meta.pdf, move=False)
Exemple #32
0
def upgrade():
    """Create or upgrade the search index and database."""
    # Upgrade steps run in dependency order; the file archive goes last.
    for step in (upgrade_db, upgrade_search, upgrade_graph):
        step()
    get_archive().upgrade()
Exemple #33
0
 def store_pdf(self, meta, pdf_path):
     """Archive the PDF rendition under ``meta.pdf`` (copies, never moves)."""
     get_archive().archive_file(pdf_path, meta.pdf, move=False)