def ingest_url(self, collection_id, metadata, url):
    meta = Metadata.from_data(metadata)
    if meta.foreign_id is None:
        meta.foreign_id = url
    tmp_path = make_tempfile(meta.file_name, suffix=meta.extension)
    try:
        log.info("Ingesting URL: %s", url)
        res = requests.get(url, stream=True)
        if res.status_code == 404:
            log.info("HTTP not found: %s", url)
            return
        if res.status_code >= 400:
            # Other HTTP errors: retry later with exponential backoff.
            countdown = 3600 ** self.request.retries
            self.retry(countdown=countdown)
        # Stream the response body to the temp file in binary mode.
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = archive.archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(collection_id, meta)
    except IOError as ioe:
        log.info("IO Failure: %r", ioe)
        countdown = 3600 ** self.request.retries
        self.retry(countdown=countdown)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)

def ingest_url(self, collection_id, metadata, url):
    meta = Metadata.from_data(metadata)
    tmp_path = make_tempfile(meta.file_name, suffix=meta.extension)
    try:
        log.info("Ingesting URL: %s", url)
        res = requests.get(url, stream=True)
        if res.status_code == 404:
            log.info("HTTP not found: %s", url)
            return
        if res.status_code >= 400:
            countdown = 3600 ** self.request.retries
            self.retry(countdown=countdown)
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = get_archive().archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(collection_id, meta)
    except IOError as ioe:
        log.info("IO Failure: %r", ioe)
        countdown = 3600 ** self.request.retries
        self.retry(countdown=countdown)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)

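# The `self` argument and the self.retry()/self.request.retries calls in the
# two variants above suggest ingest_url runs as a bound Celery task. A minimal
# sketch of the presumed registration, assuming a Celery app object named
# `celery` (a hypothetical name, not shown in these snippets):
from celery import Celery

celery = Celery('ingest')  # broker configuration assumed elsewhere

@celery.task(bind=True, max_retries=3)  # bind=True injects the task as `self`
def ingest_url(self, collection_id, metadata, url):
    ...  # body as above; self.retry(countdown=...) re-queues the task
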
def ingest(source_id, metadata):
    meta = Metadata(data=metadata)
    try:
        process.log(process.INGEST, component='ingest',
                    meta=meta, source_id=source_id)
    except Exception as ex:
        log.exception(ex)
    Ingestor.dispatch(source_id, meta)

def ingest_file(source_id, meta, file_path, move=False):
    try:
        if not os.path.isfile(file_path):
            raise IngestorException("No such file: %r" % file_path)
        if not meta.has('source_path'):
            meta.source_path = file_path
        meta = get_archive().archive_file(file_path, meta, move=move)
        ingest.delay(source_id, meta.data)
    except Exception as ex:
        Ingestor.handle_exception(meta, source_id, ex)

def ingest_file(collection_id, meta, file_path, move=False):
    try:
        if not os.path.isfile(file_path):
            raise IngestorException("No such file: %r" % file_path)
        if not meta.has('source_path'):
            meta.source_path = file_path
        meta = get_archive().archive_file(file_path, meta, move=move)
        ingest.delay(collection_id, meta.to_attr_dict())
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()

def ingest_file(collection_id, meta, file_path, move=False,
                queue=WORKER_QUEUE, routing_key=WORKER_ROUTING_KEY):
    # The queue and routing key arguments are a workaround to
    # expedite user uploads over long-running batch imports.
    try:
        if not os.path.isfile(file_path):
            raise IngestorException("No such file: %r" % file_path)
        if not meta.has('source_path'):
            meta.source_path = file_path
        meta = get_archive().archive_file(file_path, meta, move=move)
        ingest.apply_async([collection_id, meta.to_attr_dict()],
                           queue=queue, routing_key=routing_key)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()

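# Usage sketch for the queue/routing-key workaround above: route a small
# interactive upload to a dedicated queue so it is not stuck behind a bulk
# import. The queue and routing key names here are hypothetical and assume a
# worker is subscribed to them in addition to the default WORKER_QUEUE:
ingest_file(collection_id, meta, '/uploads/statement.pdf', move=True,
            queue='user_queue', routing_key='user_ingest')
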
def ingest_url(source_id, metadata, url):
    meta = Metadata(data=metadata)
    try:
        fh, tmp_path = mkstemp()
        os.close(fh)
        log.info("Ingesting URL: %r", url)
        res = requests.get(url, stream=True)
        if res.status_code >= 400:
            raise Exception("HTTP Error %r: %r" % (url, res.status_code))
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = get_archive().archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(source_id, meta)
    except Exception as ex:
        Ingestor.handle_exception(meta, source_id, ex)

def ingest_url(collection_id, metadata, url):
    meta = Metadata.from_data(metadata)
    tmp_path = make_tempfile(meta.file_name, suffix=meta.extension)
    try:
        log.info("Ingesting URL: %r", url)
        res = requests.get(url, stream=True, timeout=120)
        if res.status_code >= 400:
            msg = "HTTP Error %r: %r" % (url, res.status_code)
            raise IngestorException(msg)
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = get_archive().archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(collection_id, meta)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)

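# A hedged usage sketch: assuming the ingest_url variant above is registered
# as a Celery task, a crawler would enqueue a download like this (the URL is
# illustrative only):
ingest_url.delay(collection_id, meta.to_attr_dict(),
                 'https://example.com/report.pdf')
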
def ingest(collection_id, metadata):
    meta = Metadata.from_data(metadata)
    Ingestor.dispatch(collection_id, meta)

def ingest(source_id, metadata):
    meta = Metadata(data=metadata)
    Ingestor.dispatch(source_id, meta)

def ingest(collection_id, metadata):
    meta = Metadata(data=metadata)
    Ingestor.dispatch(collection_id, meta)

def ingest(source_id, metadata):
    clear_session()
    meta = Metadata(data=metadata)
    Ingestor.dispatch(source_id, meta)