Ejemplo n.º 1
0
def index_document(document_id):
    """Index a document, plus its pages or records, in ElasticSearch.

    The document is looked up by ``document_id``; if it does not exist the
    miss is logged and nothing else happens. Otherwise its dict form,
    extended with generated entities and latinized title/summary, is
    written to the index, ``clear_children`` is called for it, and its
    pages (text documents) or records (tabular documents) are sent through
    ``bulk``. Exceptions raised during the bulk step are logged and not
    re-raised.
    """
    clear_session()
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return

    log.info("Index document: %r", document)
    body = document.to_dict()
    body['entities'] = generate_entities(document)
    latin_fields = (('title', 'title_latin'), ('summary', 'summary_latin'))
    for source_key, latin_key in latin_fields:
        body[latin_key] = latinize_text(body.get(source_key))
    es.index(
        index=es_index,
        doc_type=TYPE_DOCUMENT,
        body=body,
        id=document.id,
    )
    clear_children(document)

    # Both bulk calls share the same tuning parameters.
    bulk_options = {
        'stats_only': True,
        'chunk_size': 2000,
        'request_timeout': 60.0,
    }
    try:
        if document.type == Document.TYPE_TEXT:
            bulk(es, generate_pages(document), **bulk_options)

        if document.type == Document.TYPE_TABULAR:
            bulk(es, generate_records(document), **bulk_options)
    except Exception as ex:
        log.exception(ex)
Ejemplo n.º 2
0
def ingest_url(source_id, metadata, url):
    """Download ``url`` into the archive and queue the result for ingestion.

    The response body is streamed into a temporary file, which is handed to
    the archive together with the metadata. The metadata returned by the
    archive is then passed on to ``ingest.delay``.

    :param source_id: identifier forwarded to ``ingest`` and to the failure
        report.
    :param metadata: data used to build the ``Metadata`` object.
    :param url: address to fetch.

    Any exception raised while fetching or archiving, including an HTTP
    error status, is logged and reported through ``process.exception``; in
    that case nothing is queued.
    """
    clear_session()
    meta = Metadata(data=metadata)
    try:
        with NamedTemporaryFile() as fh:
            log.info("Ingesting URL: %r", url)
            res = requests.get(url, stream=True)
            if res.status_code >= 400:
                log.error("Error ingesting %r: %r", url, res.status_code)
                # Abort here: carrying on would archive the server's error
                # page as if it were the requested document. The HTTPError
                # raised for 4xx/5xx responses is reported by the handler
                # below.
                res.raise_for_status()
            for chunk in res.iter_content(chunk_size=1024):
                # Skip empty chunks.
                if chunk:
                    fh.write(chunk)
            # Make sure everything is on disk before the archive reads it.
            fh.flush()
            if not meta.has('source_url'):
                # res.url is the response's URL, used when the caller gave
                # no source URL of its own.
                meta.source_url = res.url
            meta.headers = res.headers
            meta = get_archive().archive_file(fh.name, meta, move=True)
    except Exception as ex:
        log.exception(ex)
        process.exception(process.INGEST,
                          component='ingest_url',
                          source_id=source_id,
                          meta=meta,
                          exception=ex)
        return
    ingest.delay(source_id, meta.data)
Ejemplo n.º 3
0
def analyze_document(document_id):
    """Run every analyzer over a document, then index it.

    The document is looked up by ``document_id``; if it does not exist the
    miss is logged and nothing else happens. Otherwise each class returned
    by ``get_analyzers`` is instantiated and its ``analyze`` method is
    called with the document and its metadata, after which
    ``index_document`` is invoked for the same id.
    """
    clear_session()
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return

    log.info("Analyze document: %r", document)
    for analyzer_cls in get_analyzers():
        # A fresh analyzer instance is created for each class.
        analyzer = analyzer_cls()
        analyzer.analyze(document, document.meta)
    index_document(document_id)
Ejemplo n.º 4
0
def ingest_url(source_id, metadata, url):
    """Download ``url`` into the archive and queue the result for ingestion.

    The response body is streamed into a temporary file, which is handed to
    the archive together with the metadata. The metadata returned by the
    archive is then passed on to ``ingest.delay``.

    :param source_id: identifier forwarded to ``ingest``.
    :param metadata: data used to build the ``Metadata`` object.
    :param url: address to fetch.

    When the server answers with a status of 400 or above, the error is
    logged and the function returns without archiving or queuing anything.
    Other failures (e.g. connection errors) propagate to the caller.
    """
    clear_session()
    meta = Metadata(data=metadata)
    with NamedTemporaryFile() as fh:
        log.info("Ingesting URL: %r", url)
        res = requests.get(url, stream=True)
        if res.status_code >= 400:
            log.error("Error ingesting %r: %r", url, res.status_code)
            # Stop here: carrying on would archive the server's error page
            # as if it were the requested document.
            return
        for chunk in res.iter_content(chunk_size=1024):
            # Skip empty chunks.
            if chunk:
                fh.write(chunk)
        # Make sure everything is on disk before the archive reads it.
        fh.flush()
        if not meta.has("source_url"):
            # res.url is the response's URL, used when the caller gave no
            # source URL of its own.
            meta.source_url = res.url
        meta.headers = res.headers
        meta = archive.archive_file(fh.name, meta, move=True)
        ingest.delay(source_id, meta.data)
Ejemplo n.º 5
0
def ingest_url(source_id, metadata, url):
    """Download ``url`` into the archive and queue the result for ingestion.

    The response body is streamed into a temporary file, which is handed to
    the archive together with the metadata. The metadata returned by the
    archive is then passed on to ``ingest.delay``.

    :param source_id: identifier forwarded to ``ingest`` and to the failure
        report.
    :param metadata: data used to build the ``Metadata`` object.
    :param url: address to fetch.

    Any exception raised while fetching or archiving, including an HTTP
    error status, is logged and reported through ``process.exception``; in
    that case nothing is queued.
    """
    clear_session()
    meta = Metadata(data=metadata)
    try:
        with NamedTemporaryFile() as fh:
            log.info("Ingesting URL: %r", url)
            res = requests.get(url, stream=True)
            if res.status_code >= 400:
                log.error("Error ingesting %r: %r", url, res.status_code)
                # Abort here: carrying on would archive the server's error
                # page as if it were the requested document. The HTTPError
                # raised for 4xx/5xx responses is reported by the handler
                # below.
                res.raise_for_status()
            for chunk in res.iter_content(chunk_size=1024):
                # Skip empty chunks.
                if chunk:
                    fh.write(chunk)
            # Make sure everything is on disk before the archive reads it.
            fh.flush()
            if not meta.has('source_url'):
                # res.url is the response's URL, used when the caller gave
                # no source URL of its own.
                meta.source_url = res.url
            meta.headers = res.headers
            meta = get_archive().archive_file(fh.name, meta, move=True)
    except Exception as ex:
        log.exception(ex)
        process.exception(process.INGEST, component='ingest_url',
                          source_id=source_id, meta=meta, exception=ex)
        return
    ingest.delay(source_id, meta.data)
Ejemplo n.º 6
0
def ingest(source_id, metadata):
    """Build a ``Metadata`` object from ``metadata`` and dispatch it.

    The session is cleared first; the wrapped metadata is then handed to
    ``Ingestor.dispatch`` together with ``source_id``.
    """
    clear_session()
    wrapped_meta = Metadata(data=metadata)
    Ingestor.dispatch(source_id, wrapped_meta)