Esempio n. 1
0
def ingest_url(self, collection_id, metadata, url):
    """Download ``url`` to a temporary file, archive it and dispatch it.

    A 404 response is logged and skipped. Any other status of 399 or above,
    and any ``IOError``, triggers ``self.retry``. All remaining exceptions
    are handed to ``Ingestor.handle_exception``. The database session and
    the temporary file are released on every exit path.
    """
    meta = Metadata.from_data(metadata)
    tmp_path = make_tempfile(meta.file_name, suffix=meta.extension)
    try:
        log.info("Ingesting URL: %s", url)
        res = requests.get(url, stream=True)
        if res.status_code == 404:
            log.info("HTTP not found: %s", url)
            return
        if res.status_code >= 399:
            # Delay is 3600 raised to the number of retries made so far.
            countdown = 3600 ** self.request.retries
            self.retry(countdown=countdown)
        # iter_content() yields byte chunks; binary mode keeps them intact
        # (no newline translation, no text-mode type errors).
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            # res.url is the final URL after any redirects were followed.
            meta.source_url = res.url
        meta.headers = res.headers
        meta = get_archive().archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(collection_id, meta)
    except IOError as ioe:
        log.info("IO Failure: %r", ioe)
        countdown = 3600 ** self.request.retries
        self.retry(countdown=countdown)
    except Exception as ex:
        # NOTE(review): if self.retry() signals by raising, the call in the
        # try body above would land here as well -- confirm intended.
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)
Esempio n. 2
0
 def write_temp(self, body, suffix=None):
     """Write ``body`` to a new temporary file and return the file's path.

     Unicode bodies are encoded as UTF-8 before being written.
     """
     out_path = make_tempfile(suffix=suffix)
     # The payload is written as encoded bytes, so use binary mode to keep
     # it free of platform newline translation.
     with open(out_path, 'wb') as fh:
         if isinstance(body, unicode):
             body = body.encode('utf-8')
         fh.write(body)
     return out_path
Esempio n. 3
0
    def crawl_query(self, engine, collection, meta_base, name, query):
        """Run one configured query and ingest its result set as a CSV file.

        The metadata is built from ``meta_base`` overlaid with the query's
        own ``meta`` section. Rows are fetched in batches and written to a
        temporary CSV, which is then passed to ``ingest_file``. The temporary
        file is removed on every exit path.
        """
        meta_ = meta_base.copy()
        meta_.update(query.get('meta', {}))
        meta = self.make_meta(meta_)
        meta.mime_type = 'text/csv'
        meta.foreign_id = '%s:%s' % (collection.foreign_id, name)

        query = SQLQuery(engine, query)

        file_path = make_tempfile(name=name, suffix='.csv')
        try:
            # unicodecsv emits encoded bytes, so the file has to be opened
            # in binary mode.
            with open(file_path, 'wb') as fh:
                headers = [query.alias(c) for c in query.columns]
                writer = unicodecsv.writer(fh, quoting=unicodecsv.QUOTE_ALL)
                writer.writerow(headers)
                log.info('Query: %s', query.query)
                rp = engine.execute(query.query)
                # Fetch in batches of 10000 rows to bound memory use.
                while True:
                    rows = rp.fetchmany(10000)
                    if not rows:
                        break
                    for row in rows:
                        writer.writerow(row[h] for h in headers)
            ingest_file(collection.id, meta, file_path, move=True)
        finally:
            remove_tempfile(file_path)
Esempio n. 4
0
def ingest_url(self, collection_id, metadata, url):
    """Fetch ``url``, archive the downloaded file and dispatch it.

    ``url`` is used as the foreign ID when the metadata carries none. A 404
    is logged and skipped. Other statuses of 399 or above, and ``IOError``,
    call ``self.retry``. Any remaining exception goes to
    ``Ingestor.handle_exception``. The database session and the temporary
    file are always cleaned up.
    """
    meta = Metadata.from_data(metadata)
    if meta.foreign_id is None:
        meta.foreign_id = url
    tmp_path = make_tempfile(meta.file_name, suffix=meta.extension)
    try:
        log.info("Ingesting URL: %s", url)
        res = requests.get(url, stream=True)
        if res.status_code == 404:
            log.info("HTTP not found: %s", url)
            return
        if res.status_code >= 399:
            # Delay is 3600 raised to the number of retries made so far.
            countdown = 3600**self.request.retries
            self.retry(countdown=countdown)
        # The response body arrives as byte chunks: write them in binary
        # mode so they reach disk unmodified.
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = archive.archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(collection_id, meta)
    except IOError as ioe:
        log.info("IO Failure: %r", ioe)
        countdown = 3600**self.request.retries
        self.retry(countdown=countdown)
    except Exception as ex:
        # NOTE(review): if self.retry() signals by raising, the call in the
        # try body above would also be routed here -- confirm intended.
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)
Esempio n. 5
0
    def ingest(self, meta, local_path):
        """Parse an HTML file, harvest metadata and process a cleaned copy.

        Title, summary and keywords are taken from the document when the
        metadata does not already provide them. The tree is then passed
        through ``self.cleaner``, serialised to a temporary file and handed
        to ``self.handle_html``. The temporary file is always removed.
        """
        with open(local_path, 'rb') as fh:
            data = fh.read()
        if meta.encoding:
            data = data.decode(meta.encoding)
        doc = html.fromstring(data)
        if not meta.has('title'):
            title = doc.findtext('.//title')
            if title is not None:
                meta.title = title

        if not meta.has('summary'):
            summary = doc.find('.//meta[@name="description"]')
            if summary is not None and summary.get('content'):
                meta.summary = summary.get('content')

        for field in ['keywords', 'news_keywords']:
            value = doc.find('.//meta[@name="%s"]' % field)
            if value is not None:
                # A missing content attribute counts as an empty list.
                value = value.get('content') or ''
                for keyword in value.split(','):
                    meta.add_keyword(keyword)

        self.cleaner(doc)
        out_path = make_tempfile(name=meta.file_name, suffix='htm')
        try:
            # etree.tostring() produces serialised bytes, so write them in
            # binary mode.
            with open(out_path, 'wb') as fh:
                fh.write(etree.tostring(doc))
            self.handle_html(meta, out_path)
        finally:
            remove_tempfile(out_path)
Esempio n. 6
0
    def ingest(self, meta, local_path):
        """Read an HTML document, fill in missing metadata and convert it.

        The file is decoded with ``meta.encoding`` when one is set. Title,
        description and keyword tags populate fields the metadata lacks. The
        cleaned document is written to a temporary file for
        ``self.handle_html``, and that file is deleted afterwards.
        """
        with open(local_path, 'rb') as fh:
            data = fh.read()
        if meta.encoding:
            data = data.decode(meta.encoding)
        doc = html.fromstring(data)
        if not meta.has('title'):
            title = doc.findtext('.//title')
            if title is not None:
                meta.title = title

        if not meta.has('summary'):
            summary = doc.find('.//meta[@name="description"]')
            if summary is not None and summary.get('content'):
                meta.summary = summary.get('content')

        for field in ['keywords', 'news_keywords']:
            value = doc.find('.//meta[@name="%s"]' % field)
            if value is not None:
                # Fall back to '' so split() is safe without a content value.
                value = value.get('content') or ''
                for keyword in value.split(','):
                    meta.add_keyword(keyword)

        self.cleaner(doc)
        out_path = make_tempfile(name=meta.file_name, suffix='htm')
        try:
            # The serialised tree is a byte string: binary mode avoids
            # newline translation and text-mode type errors.
            with open(out_path, 'wb') as fh:
                fh.write(etree.tostring(doc))
            self.handle_html(meta, out_path)
        finally:
            remove_tempfile(out_path)
Esempio n. 7
0
    def ingest(self, meta, local_path):
        """Ingest a message file: headers, attachments, then the body.

        Headers update the metadata and each attachment is passed to
        ``self.ingest_attachment``. A body, when present, is written as
        UTF-8 to a temporary text file and ingested with a
        ``DocumentIngestor``.
        """
        message = Message(local_path)
        if message.header is not None:
            meta = self.parse_headers(message.header, meta)

        for attachment in message.attachments:
            self.ingest_attachment(attachment, meta)

        if message.body is not None:
            out_path = make_tempfile(suffix="txt")
            # try/finally removes the temporary file even when decoding,
            # writing or the nested ingest raises.
            try:
                # The body is written as UTF-8 bytes, hence binary mode.
                with open(out_path, "wb") as fh:
                    body = message.body
                    # TODO: figure out if this is really IMAP UTF-7
                    if not isinstance(body, unicode):
                        enc = chardet.detect(message.body).get("encoding")
                        if enc is None:
                            body = body.decode("utf-8", "replace")
                            log.warning("Cannot detect encoding of MSG: %r", meta)
                        else:
                            body = body.decode(enc, "replace")

                    fh.write(body.encode("utf-8"))
                ing = DocumentIngestor(self.collection_id)
                ing.ingest(meta, out_path)
            finally:
                remove_tempfile(out_path)
Esempio n. 8
0
 def generate_pdf_alternative(self, meta, local_path):
     """Render a DjVu file as PDF with the configured ``ddjvu`` binary.

     Returns the temporary path given to the converter. The converter's
     exit status is not inspected.
     """
     pdf_path = make_tempfile(meta.file_name, suffix='pdf')
     command = [get_config('DDJVU_BIN')]
     command.extend(['-format=pdf', '-quality=85', '-skip'])
     command.extend([local_path, pdf_path])
     log.debug('Converting DJVU book: %r', ' '.join(command))
     # stderr is merged into stdout for the child process.
     subprocess.call(command, stderr=subprocess.STDOUT)
     return pdf_path
Esempio n. 9
0
 def write_temp(self, body, suffix=None):
     """Store ``body`` in a new temporary file and return the file's path.

     A ``suffix`` given without a leading dot gets one prepended. Unicode
     bodies are encoded as UTF-8 before being written.
     """
     if suffix is not None and not suffix.startswith('.'):
         suffix = '.' + suffix
     out_path = make_tempfile(suffix=suffix)
     # Encoded bytes are written, so open in binary mode to avoid newline
     # translation of the payload.
     with open(out_path, 'wb') as fh:
         if isinstance(body, unicode):
             body = body.encode('utf-8')
         fh.write(body)
     return out_path
Esempio n. 10
0
 def save_data(self, data):
     """Write ``data`` (or an empty string if it is falsy) to a temp file.

     Returns the path of the written file. If writing fails, the file is
     removed and the exception is re-raised.
     """
     tmp_path = make_tempfile()
     try:
         with open(tmp_path, 'w') as handle:
             handle.write(data if data else '')
     except Exception:
         # Do not leave a partial file behind on failure.
         remove_tempfile(tmp_path)
         raise
     return tmp_path
Esempio n. 11
0
 def save_data(self, data):
     """Dump a blob of data into a fresh temporary file.

     Falsy ``data`` is written as an empty string. The file path is returned
     on success. On error the temporary file is deleted before the
     exception propagates.
     """
     target = make_tempfile()
     try:
         with open(target, 'w') as out:
             out.write(data if data else '')
     except Exception:
         # Clean up the temporary file, then let the caller see the error.
         remove_tempfile(target)
         raise
     else:
         return target
Esempio n. 12
0
 def generate_pdf_alternative(self, meta, local_path):
     """Produce a PDF rendition of a DjVu book via the ``ddjvu`` tool.

     The output path is returned whether or not the tool succeeded; its
     exit status is ignored.
     """
     target = make_tempfile(meta.file_name, suffix='pdf')
     binary = get_config('DDJVU_BIN')
     options = ['-format=pdf', '-quality=85', '-skip']
     cmdline = [binary] + options + [local_path, target]
     log.debug('Converting DJVU book: %r', ' '.join(cmdline))
     # The child's stderr is redirected into its stdout.
     subprocess.call(cmdline, stderr=subprocess.STDOUT)
     return target
Esempio n. 13
0
 def handle_html(self, meta, html_path):
     """Convert an HTML file to PDF with wkhtmltopdf and extract the PDF.

     Raises ``IngestorException`` when no output file exists after the
     converter has run. The temporary PDF is removed on every exit path.
     """
     out_path = make_tempfile(name=meta.file_name, suffix='pdf')
     try:
         wkhtmltopdf = get_config('WKHTMLTOPDF_BIN')
         args = [wkhtmltopdf, '--disable-javascript', '--no-outline',
                 '--no-images', '--quiet', html_path, out_path]
         subprocess.call(args)
         if not os.path.isfile(out_path):
             # Exceptions do not interpolate their arguments the way the
             # logging calls do, so format the message explicitly.
             msg = "Could not convert document: %r" % (meta,)
             raise IngestorException(msg)
         self.extract_pdf_alternative(meta, out_path)
     finally:
         remove_tempfile(out_path)
Esempio n. 14
0
 def ingest_attachment(self, attachment, meta):
     """Write one message attachment to disk and ingest it as a child.

     Empty attachments are logged and skipped. Any failure is logged rather
     than propagated, so one attachment cannot abort the caller.
     """
     try:
         if attachment.data is None:
             log.warning("Attachment is empty [%r]: %s", meta,
                         attachment.longFilename)
             return
         out_path = make_tempfile()
         # try/finally removes the temporary file even when writing or
         # ingesting fails.
         try:
             # Attachment payloads are raw data, so write in binary mode.
             with open(out_path, "wb") as fh:
                 fh.write(attachment.data)
             child = meta.make_child()
             child.file_name = attachment.longFilename
             ingest_file(self.collection_id, child, out_path, move=True)
         finally:
             remove_tempfile(out_path)
     except Exception as ex:
         log.exception(ex)
Esempio n. 15
0
 def handle_html(self, meta, html_path):
     """Render ``html_path`` to PDF via wkhtmltopdf, then extract the PDF.

     Raises ``IngestorException`` if the converter left no output file.
     The temporary PDF is deleted whether or not extraction succeeds.
     """
     out_path = make_tempfile(name=meta.file_name, suffix='pdf')
     try:
         wkhtmltopdf = get_config('WKHTMLTOPDF_BIN')
         args = [
             wkhtmltopdf, '--disable-javascript', '--no-outline',
             '--no-images', '--quiet', html_path, out_path
         ]
         subprocess.call(args)
         if not os.path.isfile(out_path):
             # Build the message explicitly: passing the format string and
             # its argument separately leaves the placeholder unfilled.
             msg = "Could not convert document: %r" % (meta,)
             raise IngestorException(msg)
         self.extract_pdf_alternative(meta, out_path)
     finally:
         remove_tempfile(out_path)
Esempio n. 16
0
 def ingest_attachment(self, attachment, meta):
     """Save an attachment to a temporary file and ingest it as a child.

     Attachments without data are logged and ignored. Exceptions are logged
     and swallowed so that the remaining attachments can still be handled.
     """
     try:
         if attachment.data is None:
             log.warning("Attachment is empty [%r]: %s", meta,
                         attachment.longFilename)
             return
         out_path = make_tempfile()
         # Cleanup lives in a finally block so a failed write or ingest
         # does not leave the temporary file behind.
         try:
             # Binary mode: the attachment payload is raw data.
             with open(out_path, 'wb') as fh:
                 fh.write(attachment.data)
             child = meta.make_child()
             child.file_name = string_value(attachment.longFilename)
             ingest_file(self.collection_id, child, out_path, move=True)
         finally:
             remove_tempfile(out_path)
     except Exception as ex:
         log.exception(ex)
Esempio n. 17
0
 def save_response(self, res, suffix=None):
     """Store the return data from a requests response to a file.

     Raises ``CrawlerException`` for status codes of 400 and above.
     Otherwise the body is streamed into a new temporary file and the path
     is returned. If writing fails, the file is removed and the error is
     re-raised.
     """
     # This must be a streaming response.
     if res.status_code >= 400:
         message = "Error ingesting %r: %r" % (res.url, res.status_code)
         raise CrawlerException(message)
     file_path = make_tempfile(suffix=suffix)
     try:
         # iter_content() yields byte chunks, so the file is opened in
         # binary mode.
         with open(file_path, 'wb') as fh:
             for chunk in res.iter_content(chunk_size=1024):
                 if chunk:
                     fh.write(chunk)
         return file_path
     except Exception:
         remove_tempfile(file_path)
         raise
Esempio n. 18
0
 def save_response(self, res, suffix=None):
     """Store the return data from a requests response to a file.

     A status of 400 or above raises ``CrawlerException``. On success the
     path of the temporary file holding the body is returned. On a write
     error the file is deleted before the exception propagates.
     """
     # This must be a streaming response.
     if res.status_code >= 400:
         message = "Error ingesting %r: %r" % (res.url, res.status_code)
         raise CrawlerException(message)
     file_path = make_tempfile(suffix=suffix)
     try:
         # Binary mode keeps the downloaded bytes unmodified (no newline
         # translation, no text-mode type errors).
         with open(file_path, 'wb') as fh:
             for chunk in res.iter_content(chunk_size=1024):
                 if chunk:
                     fh.write(chunk)
         return file_path
     except Exception:
         remove_tempfile(file_path)
         raise
Esempio n. 19
0
    def ingest(self, meta, local_path):
        """Ingest a message file: headers, attachments and the text body.

        Headers are merged into ``meta`` and each attachment goes through
        ``self.ingest_attachment``. A body, when present, is written as
        UTF-8 to a temporary file and ingested with a ``DocumentIngestor``.
        """
        message = Message(local_path)
        if message.header is not None:
            meta = self.parse_headers(message.header, meta)

        for attachment in message.attachments:
            self.ingest_attachment(attachment, meta)

        if message.body is not None:
            out_path = make_tempfile(suffix='txt')
            # The finally block removes the temporary file even if
            # conversion, writing or the nested ingest raises.
            try:
                # UTF-8 bytes are written, hence binary mode.
                with open(out_path, 'wb') as fh:
                    # TODO: figure out if this is really IMAP UTF-7
                    body = string_value(message.body)
                    fh.write(body.encode('utf-8'))
                ing = DocumentIngestor(self.collection_id)
                ing.ingest(meta, out_path)
            finally:
                remove_tempfile(out_path)
Esempio n. 20
0
 def ingest(self, meta, local_path):
     """Convert an image to PDF with the configured tool and process it.

     The file name is used as the title. Nothing is converted when
     ``self.check_image_size`` rejects the image. Raises
     ``ImageIngestorException`` if no PDF exists after conversion. The
     temporary PDF is always removed.
     """
     target = make_tempfile(name=meta.file_name, suffix='pdf')
     try:
         meta.title = meta.file_name
         size_ok = self.check_image_size(meta, local_path)
         if not size_ok:
             return
         command = [get_config('CONVERT_BIN'), local_path]
         command.extend(['-density', '300'])
         command.extend(['-define', 'pdf:fit-page=A4'])
         command.append(target)
         subprocess.call(command)
         if os.path.isfile(target):
             self.store_pdf(meta, target)
             self.extract_pdf(meta, target)
         else:
             raise ImageIngestorException(
                 "Could not convert image: %r" % meta)
     finally:
         remove_tempfile(target)
Esempio n. 21
0
def ingest_url(self, document_id, url):
    """Load the given URL into the document specified by document_id.

    Statuses of 500 and above call ``self.retry``. Other statuses of 400
    and above mark the document as failed. On success the body is saved,
    archived and passed to the ingest manager. ``IOError`` triggers a
    retry. Any other exception is recorded on the document and logged. The
    database session and the temporary file are always released.
    """
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return

    tmp_path = make_tempfile(document.file_name, suffix=document.extension)
    try:
        log.info("Ingesting URL: %s", url)
        res = requests.get(url, stream=True)
        if res.status_code >= 500:
            # Delay is 3600 raised to the number of retries made so far.
            countdown = 3600 ** self.request.retries
            self.retry(countdown=countdown)
            return
        if res.status_code >= 400:
            document.status = Document.STATUS_FAIL
            document.error_message = "HTTP %s: %s" % (res.status_code, url)
            db.session.commit()
            return
        # iter_content() yields byte chunks; binary mode writes them
        # unmodified.
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not document.has_meta('source_url'):
            document.source_url = res.url
        if not document.foreign_id:
            document.foreign_id = res.url
        document.headers = res.headers
        document.content_hash = archive.archive_file(tmp_path)
        db.session.commit()
        get_manager().ingest_document(document)
    except IOError as ioe:
        log.info("IO Failure: %r", ioe)
        countdown = 3600 ** self.request.retries
        self.retry(countdown=countdown)
    except Exception as ex:
        # Record the failure on the document so it is visible later.
        document.status = Document.STATUS_FAIL
        document.error_type = type(ex).__name__
        document.error_message = six.text_type(ex)
        db.session.commit()
        log.exception(ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)
Esempio n. 22
0
 def ingest(self, meta, local_path):
     """Turn an image into a PDF, then store it and extract its content.

     The title defaults to the file name. If ``self.check_image_size``
     returns a falsy value, the method returns without converting. A
     missing output file after conversion raises
     ``ImageIngestorException``. The temporary PDF is removed on every exit
     path.
     """
     converted = make_tempfile(name=meta.file_name, suffix='pdf')
     try:
         meta.title = meta.file_name
         if self.check_image_size(meta, local_path):
             binary = get_config('CONVERT_BIN')
             options = ['-density', '450', '-define', 'pdf:fit-page=A4']
             subprocess.call([binary, local_path] + options + [converted])
             produced = os.path.isfile(converted)
             if not produced:
                 raise ImageIngestorException(
                     "Could not convert image: %r" % meta)
             self.store_pdf(meta, converted)
             self.extract_pdf(meta, converted)
     finally:
         remove_tempfile(converted)
Esempio n. 23
0
def ingest_url(collection_id, metadata, url):
    """Download ``url``, archive the file and dispatch it for ingestion.

    A status of 400 or above is turned into an ``IngestorException``. Like
    every other error, it is reported through
    ``Ingestor.handle_exception``. The database session and the temporary
    file are released on every exit path.
    """
    meta = Metadata.from_data(metadata)
    tmp_path = make_tempfile(meta.file_name, suffix=meta.extension)
    try:
        log.info("Ingesting URL: %r", url)
        res = requests.get(url, stream=True, timeout=120)
        if res.status_code >= 400:
            msg = "HTTP Error %r: %r" % (url, res.status_code)
            raise IngestorException(msg)
        # iter_content() yields byte chunks, so the target file is opened
        # in binary mode.
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = get_archive().archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(collection_id, meta)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)
Esempio n. 24
0
def ingest_url(collection_id, metadata, url):
    """Fetch ``url`` into a temporary file, archive it and dispatch it.

    HTTP statuses of 400 and above raise ``IngestorException``. All
    exceptions raised during the download or ingest are passed to
    ``Ingestor.handle_exception``. Cleanup of the database session and the
    temporary file always runs.
    """
    meta = Metadata.from_data(metadata)
    tmp_path = make_tempfile(meta.file_name, suffix=meta.extension)
    try:
        log.info("Ingesting URL: %r", url)
        res = requests.get(url, stream=True, timeout=120)
        if res.status_code >= 400:
            msg = "HTTP Error %r: %r" % (url, res.status_code)
            raise IngestorException(msg)
        # The body is streamed as bytes: binary mode avoids newline
        # translation and text-mode type errors.
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            # res.url is the final URL after any redirects were followed.
            meta.source_url = res.url
        meta.headers = res.headers
        meta = get_archive().archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(collection_id, meta)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)