def execute(self, **kwargs):
    try:
        self.crawl(**kwargs)
        self.finalize()
    except Exception as ex:
        log.exception(ex)
        process.exception(process.CRAWL, component=self.name,
                          exception=ex)
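# A hedged sketch of how execute() might be driven: a concrete crawler
# implements only crawl(), and callers go through execute() so that
# failures get recorded via process.exception() instead of propagating.
# The Crawler base class (providing finalize() and self.name) and the
# call site below are assumptions, not shown in this excerpt.
class ExampleCrawler(Crawler):
    name = 'example'

    def crawl(self, **kwargs):
        log.info("Crawling with %r", kwargs)

ExampleCrawler().execute(limit=10)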
@classmethod
def dispatch(cls, source_id, meta):
    local_path = get_archive().load_file(meta)
    best_cls = cls.auction_file(meta, local_path)
    if best_cls is None:
        message = "No ingestor found: %r" % meta.file_name
        process.log(process.INGEST, component=cls.__name__,
                    meta=meta, source_id=source_id,
                    error_type='NoIngestorFound',
                    error_message=message)
        return
    log.debug("Dispatching %r to %r", meta.file_name, best_cls.__name__)
    try:
        best_cls(source_id).ingest(meta, local_path)
    except Exception as ex:
        log.exception(ex)
        process.exception(process.INGEST, component=best_cls.__name__,
                          exception=ex, meta=meta, source_id=source_id)
    finally:
        # Drop the locally cached copy whether or not ingest succeeded.
        get_archive().cleanup_file(meta)
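# A minimal sketch of an ingestor that could take part in the dispatch
# above, assuming auction_file() asks each registered ingestor class to
# score a candidate file via a match() classmethod and picks the best
# scorer; that contract, the MIME check, the score values and the
# emit() helper are all assumptions for illustration.
class PlainTextIngestor(Ingestor):

    @classmethod
    def match(cls, meta, local_path):
        if meta.mime_type == 'text/plain':
            return 5
        return -1

    def ingest(self, meta, local_path):
        with open(local_path, 'rb') as fh:
            text = fh.read().decode('utf-8', 'replace')
        self.emit(meta, text)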
def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    try:
        log.info("Index document: %r", document)
        data = document.to_index_dict()
        data['entities'] = generate_entities(document)
        data['title_latin'] = latinize_text(data.get('title'))
        data['summary_latin'] = latinize_text(data.get('summary'))
        get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT,
                       body=data, id=document.id)
        clear_children(document)
        if document.type == Document.TYPE_TEXT:
            bulk(get_es(), generate_pages(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
        if document.type == Document.TYPE_TABULAR:
            bulk(get_es(), generate_records(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
    except Exception as ex:
        log.exception(ex)
        process.exception(process.INDEX, component=__name__,
                          document_id=document.id, meta=document.meta,
                          source_id=document.source_id, exception=ex)
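# generate_pages() and generate_records() are generators of actions for
# elasticsearch.helpers.bulk(); a hedged sketch of the shape such a
# generator might yield (the document.pages attribute, the 'page'
# doc_type and the field names are assumptions):
def generate_pages(document):
    for page in document.pages:
        yield {
            '_index': get_es_index(),
            '_type': 'page',
            '_id': '%s:%s' % (document.id, page.number),
            '_source': {
                'document_id': document.id,
                'page': page.number,
                'text': page.text,
            },
        }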
def ingest_url(source_id, metadata, url):
    clear_session()
    meta = Metadata(data=metadata)
    try:
        with NamedTemporaryFile() as fh:
            log.info("Ingesting URL: %r", url)
            res = requests.get(url, stream=True)
            if res.status_code >= 400:
                log.error("Error ingesting %r: %r", url, res.status_code)
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
            fh.flush()
            if not meta.has('source_url'):
                meta.source_url = res.url
            meta.headers = res.headers
            meta = get_archive().archive_file(fh.name, meta, move=True)
    except Exception as ex:
        log.exception(ex)
        process.exception(process.INGEST, component='ingest_url',
                          source_id=source_id, meta=meta, exception=ex)
        return
    ingest.delay(source_id, meta.data)
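# Hypothetical usage: ingest_url appears to be a queued task (it hands
# off via ingest.delay() itself), so a crawler would enqueue it rather
# than call it inline; the source_id, metadata keys and URL below are
# made up for illustration.
ingest_url.delay(1, {'file_name': 'report.pdf'},
                 'https://example.com/report.pdf')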
def crawl(self, folder, source=None):
    mf = metafolder.open(folder)
    sources = {}
    for item in mf:
        try:
            self.crawl_item(item, sources, source)
        except Exception as ex:
            process.exception(process.INDEX, component=self.name,
                              foreign_id=item.identifier,
                              source_location=folder, meta=item.meta,
                              exception=ex)
def ingest_file(source_id, meta, file_name, move=False):
    try:
        if not os.path.isfile(file_name):
            raise ValueError("No such file: %r" % file_name)
        if not meta.has('source_path'):
            meta.source_path = file_name
        meta = get_archive().archive_file(file_name, meta, move=move)
    except Exception as ex:
        log.exception(ex)
        process.exception(process.INGEST, component='ingest_file',
                          source_id=source_id, meta=meta, exception=ex)
        return
    ingest.delay(source_id, meta.data)
def analyze_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    for cls in get_analyzers():
        try:
            cls().analyze(document, document.meta)
        except Exception as ex:
            log.exception(ex)
            process.exception(process.ANALYZE, component=cls.__name__,
                              document_id=document.id, meta=document.meta,
                              source_id=document.source_id, exception=ex)
    index_document(document_id)
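# A minimal sketch of an analyzer that get_analyzers() could return; the
# only interface assumed is the analyze(document, meta) call used above,
# and the document.pages attribute is a guess for illustration.
class WordCountAnalyzer(object):

    def analyze(self, document, meta):
        words = 0
        for page in document.pages:
            words += len((page.text or '').split())
        log.info("Document %r has %s words", document.id, words)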
def crawl(self, directory=None, source=None):
    source = source or directory
    source = Source.create({
        'foreign_id': 'directory:%s' % slugify(source),
        'label': source
    })
    if os.path.isfile(directory):
        meta = self.metadata()
        meta.file_name = directory
        self.emit_file(source, meta, directory)
    directory = directory or os.getcwd()
    directory = directory.encode('utf-8')
    for (dirname, dirs, files) in os.walk(directory):
        log.info("Descending: %r", dirname)
        for file_name in files:
            if file_name in SKIP_FILES:
                continue
            file_path = os.path.join(dirname, file_name)
            if not os.path.isfile(file_path):
                continue
            try:
                meta = self.metadata()
                if isinstance(file_name, six.text_type):
                    meta.source_path = file_path
                else:
                    # Filename is bytes: guess its encoding, falling back
                    # to an ASCII-safe decode if the guess fails.
                    enc = chardet.detect(file_name)
                    enc = enc.get('encoding')
                    try:
                        meta.source_path = file_path.decode(enc)
                    except Exception:
                        meta.source_path = file_path.decode('ascii',
                                                            'ignore')
                self.emit_file(source, meta, file_path)
            except Exception as ex:
                log.exception(ex)
                process.exception(process.INDEX, component=self.name,
                                  source_location=directory,
                                  source_id=source.id, exception=ex)
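# SKIP_FILES is referenced by the crawler above but not defined in this
# excerpt; a plausible definition (an assumption, not the project's
# actual list) would exclude common filesystem noise:
SKIP_FILES = set(['.DS_Store', 'Thumbs.db', '.gitignore'])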
def log_exception(self, meta, exception):
    process.exception(process.INGEST, component=type(self).__name__,
                      source_id=self.source_id, meta=meta,
                      exception=exception)