def ingest_url(self, collection_id, metadata, url):
    meta = Metadata.from_data(metadata)
    tmp_path = make_tempfile(meta.file_name, suffix=meta.extension)
    try:
        log.info("Ingesting URL: %s", url)
        res = requests.get(url, stream=True)
        if res.status_code == 404:
            log.info("HTTP not found: %s", url)
            return
        if res.status_code >= 399:
            # Retry with an exponentially growing countdown.
            countdown = 3600 ** self.request.retries
            self.retry(countdown=countdown)
        # Stream the response body to a temporary file.
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = get_archive().archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(collection_id, meta)
    except IOError as ioe:
        log.info("IO Failure: %r", ioe)
        countdown = 3600 ** self.request.retries
        self.retry(countdown=countdown)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)
def write_temp(self, body, suffix=None):
    out_path = make_tempfile(suffix=suffix)
    with open(out_path, 'w') as fh:
        if isinstance(body, unicode):
            body = body.encode('utf-8')
        fh.write(body)
    return out_path
def crawl_query(self, engine, collection, meta_base, name, query):
    meta_ = meta_base.copy()
    meta_.update(query.get('meta', {}))
    meta = self.make_meta(meta_)
    meta.mime_type = 'text/csv'
    meta.foreign_id = '%s:%s' % (collection.foreign_id, name)
    query = SQLQuery(engine, query)
    file_path = make_tempfile(name=name, suffix='.csv')
    try:
        with open(file_path, 'w') as fh:
            headers = [query.alias(c) for c in query.columns]
            writer = unicodecsv.writer(fh, quoting=unicodecsv.QUOTE_ALL)
            writer.writerow(headers)
            log.info('Query: %s', query.query)
            rp = engine.execute(query.query)
            # Fetch in batches to keep memory use bounded on large results.
            while True:
                rows = rp.fetchmany(10000)
                if not rows:
                    break
                for row in rows:
                    writer.writerow(row[h] for h in headers)
        # Ingest only after the file handle is closed and flushed.
        ingest_file(collection.id, meta, file_path, move=True)
    finally:
        remove_tempfile(file_path)
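# A minimal usage sketch for crawl_query. The spec shape below is an
# assumption: only the 'meta' key is visible in the code above, and the
# rest of the dict is whatever SQLQuery consumes. `crawler`, `collection`
# and the DSN are hypothetical stand-ins.
from sqlalchemy import create_engine

engine = create_engine('postgresql://localhost/registry')  # hypothetical DSN
query_spec = {
    'meta': {'title': 'Company register'},
    'query': 'SELECT id, name FROM companies',  # assumed SQLQuery key
}
crawler.crawl_query(engine, collection, {'source': 'registry'},
                    'companies', query_spec)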
def ingest_url(self, collection_id, metadata, url):
    meta = Metadata.from_data(metadata)
    if meta.foreign_id is None:
        meta.foreign_id = url
    tmp_path = make_tempfile(meta.file_name, suffix=meta.extension)
    try:
        log.info("Ingesting URL: %s", url)
        res = requests.get(url, stream=True)
        if res.status_code == 404:
            log.info("HTTP not found: %s", url)
            return
        if res.status_code >= 399:
            countdown = 3600 ** self.request.retries
            self.retry(countdown=countdown)
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = archive.archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(collection_id, meta)
    except IOError as ioe:
        log.info("IO Failure: %r", ioe)
        countdown = 3600 ** self.request.retries
        self.retry(countdown=countdown)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)
def ingest(self, meta, local_path):
    with open(local_path, 'rb') as fh:
        data = fh.read()
    if meta.encoding:
        data = data.decode(meta.encoding)
    doc = html.fromstring(data)
    if not meta.has('title'):
        title = doc.findtext('.//title')
        if title is not None:
            meta.title = title
    if not meta.has('summary'):
        summary = doc.find('.//meta[@name="description"]')
        if summary is not None and summary.get('content'):
            meta.summary = summary.get('content')
    for field in ['keywords', 'news_keywords']:
        value = doc.find('.//meta[@name="%s"]' % field)
        if value is not None:
            value = value.get('content') or ''
            for keyword in value.split(','):
                meta.add_keyword(keyword)
    # Strip scripts, styles etc. before rendering to PDF.
    self.cleaner(doc)
    out_path = make_tempfile(name=meta.file_name, suffix='htm')
    try:
        with open(out_path, 'wb') as fh:
            fh.write(etree.tostring(doc))
        self.handle_html(meta, out_path)
    finally:
        remove_tempfile(out_path)
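# A sketch of how self.cleaner might be set up, assuming lxml's Cleaner
# (the ingestor above only shows that it is called on the parsed
# document); the exact options chosen here are assumptions.
from lxml.html.clean import Cleaner

cleaner = Cleaner(
    scripts=True,          # drop <script> elements
    javascript=True,       # strip javascript: links and on* attributes
    style=True,            # drop <style> tags and style attributes
    page_structure=False,  # keep html/head/body so .//title still resolves
)
# Cleaner instances are callable and clean the document in place:
# cleaner(doc)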
def ingest(self, meta, local_path):
    message = Message(local_path)
    if message.header is not None:
        meta = self.parse_headers(message.header, meta)
    for attachment in message.attachments:
        self.ingest_attachment(attachment, meta)
    if message.body is not None:
        out_path = make_tempfile(suffix="txt")
        try:
            with open(out_path, "w") as fh:
                body = message.body
                # TODO: figure out if this is really IMAP UTF-7
                if not isinstance(body, unicode):
                    enc = chardet.detect(message.body).get("encoding")
                    if enc is None:
                        body = body.decode("utf-8", "replace")
                        log.warning("Cannot detect encoding of MSG: %r", meta)
                    else:
                        body = body.decode(enc, "replace")
                fh.write(body.encode("utf-8"))
            ing = DocumentIngestor(self.collection_id)
            ing.ingest(meta, out_path)
        finally:
            remove_tempfile(out_path)
def generate_pdf_alternative(self, meta, local_path):
    """Convert DjVu book to PDF."""
    out_path = make_tempfile(meta.file_name, suffix='pdf')
    ddjvu = get_config('DDJVU_BIN')
    args = [ddjvu, '-format=pdf', '-quality=85', '-skip',
            local_path, out_path]
    log.debug('Converting DJVU book: %r', ' '.join(args))
    subprocess.call(args, stderr=subprocess.STDOUT)
    return out_path
def write_temp(self, body, suffix=None):
    if suffix is not None and not suffix.startswith('.'):
        suffix = '.' + suffix
    out_path = make_tempfile(suffix=suffix)
    with open(out_path, 'w') as fh:
        if isinstance(body, unicode):
            body = body.encode('utf-8')
        fh.write(body)
    return out_path
def save_data(self, data):
    """Store a lump object of data to a temporary file."""
    file_path = make_tempfile()
    try:
        with open(file_path, 'w') as fh:
            fh.write(data or '')
        return file_path
    except Exception:
        remove_tempfile(file_path)
        raise
def handle_html(self, meta, html_path):
    """OK, this is weirder. Converting HTML to PDF via WebKit."""
    out_path = make_tempfile(name=meta.file_name, suffix='pdf')
    try:
        wkhtmltopdf = get_config('WKHTMLTOPDF_BIN')
        args = [wkhtmltopdf, '--disable-javascript', '--no-outline',
                '--no-images', '--quiet', html_path, out_path]
        subprocess.call(args)
        if not os.path.isfile(out_path):
            raise IngestorException("Could not convert document: %r" % meta)
        self.extract_pdf_alternative(meta, out_path)
    finally:
        remove_tempfile(out_path)
def ingest_attachment(self, attachment, meta):
    try:
        if attachment.data is None:
            log.warning("Attachment is empty [%r]: %s",
                        meta, attachment.longFilename)
            return
        out_path = make_tempfile()
        with open(out_path, "wb") as fh:
            fh.write(attachment.data)
        child = meta.make_child()
        child.file_name = attachment.longFilename
        ingest_file(self.collection_id, child, out_path, move=True)
        remove_tempfile(out_path)
    except Exception as ex:
        log.exception(ex)
def ingest_attachment(self, attachment, meta):
    try:
        if attachment.data is None:
            log.warning("Attachment is empty [%r]: %s",
                        meta, attachment.longFilename)
            return
        out_path = make_tempfile()
        with open(out_path, 'wb') as fh:
            fh.write(attachment.data)
        child = meta.make_child()
        child.file_name = string_value(attachment.longFilename)
        ingest_file(self.collection_id, child, out_path, move=True)
        remove_tempfile(out_path)
    except Exception as ex:
        log.exception(ex)
def save_response(self, res, suffix=None):
    """Store the return data from a requests response to a file."""
    # This must be a streaming response.
    if res.status_code >= 400:
        message = "Error ingesting %r: %r" % (res.url, res.status_code)
        raise CrawlerException(message)
    file_path = make_tempfile(suffix=suffix)
    try:
        with open(file_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        return file_path
    except Exception:
        remove_tempfile(file_path)
        raise
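# Minimal usage sketch for save_response; `crawler` and `process` are
# hypothetical stand-ins, and the URL is illustrative.
import requests

res = requests.get('https://example.org/data.csv', stream=True)
file_path = crawler.save_response(res, suffix='.csv')
try:
    process(file_path)  # hypothetical downstream handler
finally:
    remove_tempfile(file_path)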
def ingest(self, meta, local_path):
    message = Message(local_path)
    if message.header is not None:
        meta = self.parse_headers(message.header, meta)
    for attachment in message.attachments:
        self.ingest_attachment(attachment, meta)
    if message.body is not None:
        out_path = make_tempfile(suffix='txt')
        try:
            with open(out_path, 'w') as fh:
                # TODO: figure out if this is really IMAP UTF-7
                body = string_value(message.body)
                fh.write(body.encode('utf-8'))
            ing = DocumentIngestor(self.collection_id)
            ing.ingest(meta, out_path)
        finally:
            remove_tempfile(out_path)
def ingest(self, meta, local_path):
    pdf_path = make_tempfile(name=meta.file_name, suffix='pdf')
    try:
        meta.title = meta.file_name
        if not self.check_image_size(meta, local_path):
            return
        convert = get_config('CONVERT_BIN')
        args = [convert, local_path, '-density', '300', '-define',
                'pdf:fit-page=A4', pdf_path]
        subprocess.call(args)
        if not os.path.isfile(pdf_path):
            msg = "Could not convert image: %r" % meta
            raise ImageIngestorException(msg)
        self.store_pdf(meta, pdf_path)
        self.extract_pdf(meta, pdf_path)
    finally:
        remove_tempfile(pdf_path)
def ingest_url(self, document_id, url):
    """Load the given URL into the document specified by document_id."""
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return
    tmp_path = make_tempfile(document.file_name, suffix=document.extension)
    try:
        log.info("Ingesting URL: %s", url)
        res = requests.get(url, stream=True)
        if res.status_code >= 500:
            # Server-side failure: retry later.
            countdown = 3600 ** self.request.retries
            self.retry(countdown=countdown)
            return
        if res.status_code >= 400:
            # Client-side failure: mark the document as failed.
            document.status = Document.STATUS_FAIL
            document.error_message = "HTTP %s: %s" % (res.status_code, url)
            db.session.commit()
            return
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not document.has_meta('source_url'):
            document.source_url = res.url
        if not document.foreign_id:
            document.foreign_id = res.url
        document.headers = res.headers
        document.content_hash = archive.archive_file(tmp_path)
        db.session.commit()
        get_manager().ingest_document(document)
    except IOError as ioe:
        log.info("IO Failure: %r", ioe)
        countdown = 3600 ** self.request.retries
        self.retry(countdown=countdown)
    except Exception as ex:
        document.status = Document.STATUS_FAIL
        document.error_type = type(ex).__name__
        document.error_message = six.text_type(ex)
        db.session.commit()
        log.exception(ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)
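# Worked values for the retry countdown used above (3600 ** retries):
# retries=0 -> 1 second, retries=1 -> 3600 seconds (one hour),
# retries=2 -> 12,960,000 seconds (~150 days), so in practice the task
# only retries usefully a couple of times before the delay becomes
# effectively unbounded.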
def ingest(self, meta, local_path):
    pdf_path = make_tempfile(name=meta.file_name, suffix='pdf')
    try:
        meta.title = meta.file_name
        if not self.check_image_size(meta, local_path):
            return
        convert = get_config('CONVERT_BIN')
        args = [convert, local_path, '-density', '450', '-define',
                'pdf:fit-page=A4', pdf_path]
        subprocess.call(args)
        if not os.path.isfile(pdf_path):
            msg = "Could not convert image: %r" % meta
            raise ImageIngestorException(msg)
        self.store_pdf(meta, pdf_path)
        self.extract_pdf(meta, pdf_path)
    finally:
        remove_tempfile(pdf_path)
def ingest_url(collection_id, metadata, url):
    meta = Metadata.from_data(metadata)
    tmp_path = make_tempfile(meta.file_name, suffix=meta.extension)
    try:
        log.info("Ingesting URL: %r", url)
        res = requests.get(url, stream=True, timeout=120)
        if res.status_code >= 400:
            msg = "HTTP Error %r: %r" % (url, res.status_code)
            raise IngestorException(msg)
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = get_archive().archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(collection_id, meta)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)