Example #1
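A crawler's execute() entry point: any failure in crawl() or finalize() is logged and reported via process.exception under the CRAWL stage.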
    def execute(self, **kwargs):
        try:
            self.crawl(**kwargs)
            self.finalize()
        except Exception as ex:
            log.exception(ex)
            process.exception(process.CRAWL, component=self.name, exception=ex)
Example #2
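Dispatching an archived file to the best-matching ingestor class: a missing ingestor is recorded with process.log, ingest failures go through process.exception, and the local file is cleaned up either way.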
    @classmethod
    def dispatch(cls, source_id, meta):
        local_path = get_archive().load_file(meta)
        best_cls = cls.auction_file(meta, local_path)
        if best_cls is None:
            message = "No ingestor found: %r" % meta.file_name
            process.log(process.INGEST,
                        component=cls.__name__,
                        meta=meta,
                        source_id=source_id,
                        error_type='NoIngestorFound',
                        error_message=message)
            return

        log.debug("Dispatching %r to %r", meta.file_name, best_cls.__name__)
        try:
            best_cls(source_id).ingest(meta, local_path)
        except Exception as ex:
            log.exception(ex)
            process.exception(process.INGEST,
                              component=best_cls.__name__,
                              exception=ex,
                              meta=meta,
                              source_id=source_id)
        finally:
            get_archive().cleanup_file(meta)
Example #3
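Indexing a document into Elasticsearch, with latinized title and summary fields and bulk indexing of pages or records; any failure is reported under the INDEX stage.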
def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    try:
        log.info("Index document: %r", document)
        data = document.to_index_dict()
        data['entities'] = generate_entities(document)
        data['title_latin'] = latinize_text(data.get('title'))
        data['summary_latin'] = latinize_text(data.get('summary'))
        get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT, body=data,
                       id=document.id)

        clear_children(document)
        if document.type == Document.TYPE_TEXT:
            bulk(get_es(), generate_pages(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)

        if document.type == Document.TYPE_TABULAR:
            bulk(get_es(), generate_records(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
    except Exception as ex:
        log.exception(ex)
        process.exception(process.INDEX, component=__name__,
                          document_id=document.id, meta=document.meta,
                          source_id=document.source_id, exception=ex)
Example #4
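Downloading a URL into a temporary file, archiving it, and queueing ingestion; a failure during download or archiving is reported and the task aborts before queueing.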
def ingest_url(source_id, metadata, url):
    clear_session()
    meta = Metadata(data=metadata)
    try:
        with NamedTemporaryFile() as fh:
            log.info("Ingesting URL: %r", url)
            res = requests.get(url, stream=True)
            if res.status_code >= 400:
                log.error("Error ingesting %r: %r", url, res.status_code)
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
            fh.flush()
            if not meta.has('source_url'):
                meta.source_url = res.url
            meta.headers = res.headers
            meta = get_archive().archive_file(fh.name, meta, move=True)
    except Exception as ex:
        log.exception(ex)
        process.exception(process.INGEST,
                          component='ingest_url',
                          source_id=source_id,
                          meta=meta,
                          exception=ex)
        return
    ingest.delay(source_id, meta.data)
Example #5
File: mf.py Project: 01-/aleph
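Crawling a metafolder and reporting per-item failures without stopping the loop.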
    def crawl(self, folder, source=None):
        mf = metafolder.open(folder)
        sources = {}
        for item in mf:
            try:
                self.crawl_item(item, sources, source)
            except Exception as ex:
                process.exception(process.INDEX, component=self.name,
                                  foreign_id=item.identifier,
                                  source_location=folder, meta=item.meta,
                                  exception=ex)
Example #6
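Archiving a local file and queueing it for ingestion; a missing file raises a ValueError that is reported like any other ingest failure.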
def ingest_file(source_id, meta, file_name, move=False):
    try:
        if not os.path.isfile(file_name):
            raise ValueError("No such file: %r" % file_name)
        if not meta.has('source_path'):
            meta.source_path = file_name
        meta = get_archive().archive_file(file_name, meta, move=move)
    except Exception as ex:
        log.exception(ex)
        process.exception(process.INGEST, component='ingest_file',
                          source_id=source_id, meta=meta, exception=ex)
        return
    ingest.delay(source_id, meta.data)
Example #7
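Running every registered analyzer over a document: one analyzer's failure is reported but does not stop the others, and the document is re-indexed afterwards.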
def analyze_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    for cls in get_analyzers():
        try:
            cls().analyze(document, document.meta)
        except Exception as ex:
            log.exception(ex)
            process.exception(process.ANALYZE, component=cls.__name__,
                              document_id=document.id, meta=document.meta,
                              source_id=document.source_id, exception=ex)
    index_document(document_id)
Example #8
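Recursively crawling a directory, decoding non-unicode file names with chardet, and reporting per-file failures without aborting the walk.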
    def crawl(self, directory=None, source=None):
        source = source or directory
        source = Source.create({
            'foreign_id': 'directory:%s' % slugify(source),
            'label': source
        })

        if os.path.isfile(directory):
            meta = self.metadata()
            meta.file_name = directory
            self.emit_file(source, meta, directory)

        directory = directory or os.getcwd()
        directory = directory.encode('utf-8')
        for (dirname, dirs, files) in os.walk(directory):
            log.info("Descending: %r", dirname)
            for file_name in files:
                if file_name in SKIP_FILES:
                    continue
                file_path = os.path.join(dirname, file_name)
                if not os.path.isfile(file_path):
                    continue
                try:
                    meta = self.metadata()
                    if isinstance(file_name, six.text_type):
                        meta.source_path = file_path
                    else:
                        enc = chardet.detect(file_name)
                        enc = enc.get('encoding')
                        try:
                            meta.source_path = file_path.decode(enc)
                        except Exception:
                            meta.source_path = file_path.decode('ascii', 'ignore')

                    self.emit_file(source, meta, file_path)
                except Exception as ex:
                    log.exception(ex)
                    process.exception(process.INDEX, component=self.name,
                                      source_location=directory,
                                      source_id=source.id, exception=ex)
Example #9
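A helper that ingestors use to report their own failures under the INGEST stage.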
    def log_exception(self, meta, exception):
        process.exception(process.INGEST, component=type(self).__name__,
                          source_id=self.source_id, meta=meta,
                          exception=exception)
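Taken together, these call sites imply a consistent shape for the process helper: a stage constant (process.CRAWL, process.INGEST, process.INDEX, process.ANALYZE), a component name, the caught exception, and arbitrary context keywords. The following is a minimal sketch of the interface they imply; the constants' values and both function bodies are assumptions reconstructed from usage, not aleph's actual code.

# process.py -- hypothetical sketch, inferred solely from the call sites
# above; not the actual aleph implementation.
import logging

logger = logging.getLogger(__name__)

# Pipeline stage constants, referenced as process.CRAWL, process.INGEST, etc.
# The string values here are assumptions.
CRAWL = 'crawl'
INGEST = 'ingest'
INDEX = 'index'
ANALYZE = 'analyze'


def log(operation, component=None, error_type=None, error_message=None,
        **context):
    # Record a non-fatal pipeline event, e.g. 'NoIngestorFound'.
    logger.warning("[%s] %s: %s - %s (context: %r)", operation, component,
                   error_type, error_message, context)


def exception(operation, component=None, exception=None, **context):
    # Record a failed pipeline stage without re-raising, so callers can
    # swallow the error and continue -- as every example above does.
    log(operation, component=component,
        error_type=type(exception).__name__,
        error_message=str(exception), **context)

The common design across all nine examples is that process.exception is always called inside an except block that does not re-raise: a failure in one crawler item, analyzer, or ingest task is recorded for later inspection while the surrounding pipeline keeps running.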