Example #1
    def crawl_query(self, engine, collection, meta_base, name, query):
        meta_ = meta_base.copy()
        meta_.update(query.get('meta', {}))
        meta = self.make_meta(meta_)
        meta.mime_type = 'text/csv'
        meta.foreign_id = '%s:%s' % (collection.foreign_id, name)

        query = SQLQuery(engine, query)

        file_path = make_tempfile(name=name, suffix='.csv')
        try:
            with open(file_path, 'w') as fh:
                headers = [query.alias(c) for c in query.columns]
                writer = unicodecsv.writer(fh, quoting=unicodecsv.QUOTE_ALL)
                writer.writerow(headers)
                log.info('Query: %s', query.query)
                rp = engine.execute(query.query)
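                # Stream the result set in batches of 10,000 rows to keep memory use bounded.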
                while True:
                    rows = rp.fetchmany(10000)
                    if not rows:
                        break
                    for row in rows:
                        writer.writerow(row[h] for h in headers)
            ingest_file(collection.id, meta, file_path, move=True)
        finally:
            remove_tempfile(file_path)
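The make_tempfile and remove_tempfile helpers are used throughout these examples but never shown. A minimal sketch of what they might look like, assuming they simply wrap the standard tempfile module (the real helpers in the source project may differ):

import os
import shutil
import tempfile


def make_tempfile(name=None, suffix=None):
    # Hypothetical helper: create an empty file with a predictable name
    # inside a fresh temporary directory; both arguments are optional.
    if suffix:
        suffix = '.' + suffix.lstrip('.')
    else:
        suffix = ''
    directory = tempfile.mkdtemp()
    path = os.path.join(directory, (name or 'data') + suffix)
    open(path, 'a').close()
    return path


def remove_tempfile(path):
    # Hypothetical helper: delete the file together with its containing
    # temporary directory, tolerating empty or already-moved paths.
    if not path:
        return
    shutil.rmtree(os.path.dirname(path), ignore_errors=True)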
Example #2
    def ingest(self, meta, local_path):
        with open(local_path, 'rb') as fh:
            data = fh.read()
        if meta.encoding:
            data = data.decode(meta.encoding)
        doc = html.fromstring(data)
        if not meta.has('title'):
            title = doc.findtext('.//title')
            if title is not None:
                meta.title = title

        if not meta.has('summary'):
            summary = doc.find('.//meta[@name="description"]')
            if summary is not None and summary.get('content'):
                meta.summary = summary.get('content')

        for field in ['keywords', 'news_keywords']:
            value = doc.find('.//meta[@name="%s"]' % field)
            if value is not None:
                value = value.get('content') or ''
                for keyword in value.split(','):
                    meta.add_keyword(keyword)

        self.cleaner(doc)
        out_path = make_tempfile(name=meta.file_name, suffix='htm')
        try:
            with open(out_path, 'w') as fh:
                fh.write(etree.tostring(doc))
            self.handle_html(meta, out_path)
        finally:
            remove_tempfile(out_path)
Example #3
def ingest_url(self, collection_id, metadata, url):
    meta = Metadata.from_data(metadata)
    if meta.foreign_id is None:
        meta.foreign_id = url
    tmp_path = make_tempfile(meta.file_name, suffix=meta.extension)
    try:
        log.info("Ingesting URL: %s", url)
        res = requests.get(url, stream=True)
        if res.status_code == 404:
            log.info("HTTP not found: %s", url)
            return
        if res.status_code >= 399:
            countdown = 3600**self.request.retries
            self.retry(countdown=countdown)
        with open(tmp_path, 'w') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = archive.archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(collection_id, meta)
    except IOError as ioe:
        log.info("IO Failure: %r", ioe)
        countdown = 3600**self.request.retries
        self.retry(countdown=countdown)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)
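The self.retry() and self.request.retries calls above imply that ingest_url is registered as a bound Celery task. A minimal sketch of such a registration, assuming Celery; the app name and retry limit are illustrative, not taken from the source:

from celery import Celery

celery = Celery('ingest')


@celery.task(bind=True, max_retries=3)
def ingest_url(self, collection_id, metadata, url):
    # bind=True exposes the task instance as `self`, which provides
    # self.request.retries (the current retry count) and self.retry()
    # used for the exponential backoff in the example above.
    pass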
Example #4
    def ingest(self, meta, local_path):
        with open(local_path, 'rb') as emlfh:
            data = emlfh.read()
        msg = mime.from_string(data)
        meta = self.parse_headers(msg, meta)

        body_type = 'text/plain'
        body_part = msg.body

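        # Ingest non-body parts as attachments; for the body, keep the first
        # HTML part encountered, else the last non-empty plain-text part.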
        for part in msg.walk():
            if not part.is_body():
                self.ingest_attachment(part, meta)
                continue

            body = part.body
            if 'html' not in body_type and \
                    body is not None and len(body.strip()):
                body_type = unicode(part.detected_content_type)
                body_part = body

        out_path = ''
        if body_part is None:
            raise IngestorException("No body in E-Mail: %r" % meta)
        try:
            if 'html' in body_type:
                out_path = self.write_temp(body_part, 'htm')
                ing = HtmlIngestor(self.source_id)
            else:
                out_path = self.write_temp(body_part, 'txt')
                ing = DocumentIngestor(self.source_id)
            ing.ingest(meta, out_path)
        finally:
            remove_tempfile(out_path)
Example #6
    def ingest(self, meta, local_path):
        message = Message(local_path)
        if message.header is not None:
            meta = self.parse_headers(message.header, meta)

        for attachment in message.attachments:
            self.ingest_attachment(attachment, meta)

        if message.body is not None:
            out_path = make_tempfile(suffix="txt")
            with open(out_path, "w") as fh:
                body = message.body
                # TODO: figure out if this is really IMAP UTF-7
                if not isinstance(body, unicode):
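                    # Guess the byte encoding with chardet, falling back to UTF-8.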
                    enc = chardet.detect(message.body).get("encoding")
                    if enc is None:
                        body = body.decode("utf-8", "replace")
                        log.warning("Cannot detect encoding of MSG: %r", meta)
                    else:
                        body = body.decode(enc, "replace")

                fh.write(body.encode("utf-8"))
            ing = DocumentIngestor(self.collection_id)
            ing.ingest(meta, out_path)
            remove_tempfile(out_path)
Example #7
def ingest_url(self, collection_id, metadata, url):
    meta = Metadata.from_data(metadata)
    tmp_path = make_tempfile(meta.file_name, suffix=meta.extension)
    try:
        log.info("Ingesting URL: %s", url)
        res = requests.get(url, stream=True)
        if res.status_code == 404:
            log.info("HTTP not found: %s", url)
            return
        if res.status_code >= 399:
            countdown = 3600 ** self.request.retries
            self.retry(countdown=countdown)
        with open(tmp_path, 'w') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = get_archive().archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(collection_id, meta)
    except IOError as ioe:
        log.info("IO Failure: %r", ioe)
        countdown = 3600 ** self.request.retries
        self.retry(countdown=countdown)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)
Example #8
 def save_data(self, data):
     """Store a lump object of data to a temporary file."""
     file_path = make_tempfile()
     try:
         with open(file_path, 'w') as fh:
             fh.write(data or '')
         return file_path
     except Exception:
         remove_tempfile(file_path)
         raise
Example #10
 def handle_html(self, meta, html_path):
     """OK, this is weirder. Converting HTML to PDF via WebKit."""
     out_path = make_tempfile(name=meta.file_name, suffix='pdf')
     try:
         wkhtmltopdf = get_config('WKHTMLTOPDF_BIN')
         args = [wkhtmltopdf, '--disable-javascript', '--no-outline',
                 '--no-images', '--quiet', html_path, out_path]
         subprocess.call(args)
         if not os.path.isfile(out_path):
             raise IngestorException("Could not convert document: %r", meta)
         self.extract_pdf_alternative(meta, out_path)
     finally:
         remove_tempfile(out_path)
Example #11
    def ingest_attachment(self, part, meta):
        if part.body is None:
            log.warning("Empty attachment [%r]: %s", meta, part)
            return

        child = meta.make_child()
        child.mime_type = six.text_type(part.detected_content_type)
        child.file_name = string_value(part.detected_file_name)
        out_path = self.write_temp(part.body, child.extension)

        try:
            ingest_file(self.collection_id, child, out_path, move=True)
        finally:
            remove_tempfile(out_path)
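The string_value helper (used here and in several e-mail examples below) normalises arbitrary values to text. A rough sketch under the assumption that it only coerces bytes and other objects to unicode; the real helper may do more, such as encoding detection:

import six


def string_value(value, encoding='utf-8'):
    # Hypothetical helper: coerce a value to a unicode string, returning
    # None for empty input.
    if value is None:
        return None
    if isinstance(value, six.binary_type):
        value = value.decode(encoding, 'replace')
    elif not isinstance(value, six.text_type):
        value = six.text_type(value)
    value = value.strip()
    return value if len(value) else None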
Example #12
 def ingest_attachment(self, attachment, meta):
     try:
         if attachment.data is None:
             log.warning("Attachment is empty [%r]: %s", meta, attachment.longFilename)
             return
         out_path = make_tempfile()
         with open(out_path, "w") as fh:
             fh.write(attachment.data)
         child = meta.make_child()
         child.file_name = attachment.longFilename
         ingest_file(self.collection_id, child, out_path, move=True)
         remove_tempfile(out_path)
     except Exception as ex:
         log.exception(ex)
Example #14
 def ingest_attachment(self, attachment, meta):
     try:
         if attachment.data is None:
             log.warning("Attachment is empty [%r]: %s", meta,
                         attachment.longFilename)
             return
         out_path = make_tempfile()
         with open(out_path, 'w') as fh:
             fh.write(attachment.data)
         child = meta.make_child()
         child.file_name = string_value(attachment.longFilename)
         ingest_file(self.collection_id, child, out_path, move=True)
         remove_tempfile(out_path)
     except Exception as ex:
         log.exception(ex)
Example #15
 def save_response(self, res, suffix=None):
     """Store the return data from a requests response to a file."""
     # This must be a streaming response.
     if res.status_code >= 400:
         message = "Error ingesting %r: %r" % (res.url, res.status_code)
         raise CrawlerException(message)
     file_path = make_tempfile(suffix=suffix)
     try:
         with open(file_path, 'w') as fh:
             for chunk in res.iter_content(chunk_size=1024):
                 if chunk:
                     fh.write(chunk)
         return file_path
     except Exception:
         remove_tempfile(file_path)
         raise
Example #17
    def ingest(self, meta, local_path):
        message = Message(local_path)
        if message.header is not None:
            meta = self.parse_headers(message.header, meta)

        for attachment in message.attachments:
            self.ingest_attachment(attachment, meta)

        if message.body is not None:
            out_path = make_tempfile(suffix='txt')
            with open(out_path, 'w') as fh:
                # TODO: figure out if this is really IMAP UTF-7
                body = string_value(message.body)
                fh.write(body.encode('utf-8'))
            ing = DocumentIngestor(self.collection_id)
            ing.ingest(meta, out_path)
            remove_tempfile(out_path)
Example #18
 def ingest(self, meta, local_path):
     pdf_path = make_tempfile(name=meta.file_name, suffix='pdf')
     try:
         meta.title = meta.file_name
         if not self.check_image_size(meta, local_path):
             return
         convert = get_config('CONVERT_BIN')
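         # Render the image onto an A4 page at 300 DPI using ImageMagick's convert.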
         args = [convert, local_path, '-density', '300', '-define',
                 'pdf:fit-page=A4', pdf_path]
         subprocess.call(args)
         if not os.path.isfile(pdf_path):
             msg = "Could not convert image: %r" % meta
             raise ImageIngestorException(msg)
         self.store_pdf(meta, pdf_path)
         self.extract_pdf(meta, pdf_path)
     finally:
         remove_tempfile(pdf_path)
Example #19
def ingest_url(self, document_id, url):
    """Load the given URL into the document specified by document_id."""
    document = Document.by_id(document_id)
    if document is None:
        log.error("Could not find document: %s", document_id)
        return

    tmp_path = make_tempfile(document.file_name, suffix=document.extension)
    try:
        log.info("Ingesting URL: %s", url)
        res = requests.get(url, stream=True)
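        # Server errors are retried with a backoff; client errors mark the document as failed.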
        if res.status_code >= 500:
            countdown = 3600 ** self.request.retries
            self.retry(countdown=countdown)
            return
        if res.status_code >= 400:
            document.status = Document.STATUS_FAIL
            document.error_message = "HTTP %s: %s" % (res.status_code, url)
            db.session.commit()
            return
        with open(tmp_path, 'w') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not document.has_meta('source_url'):
            document.source_url = res.url
        if not document.foreign_id:
            document.foreign_id = res.url
        document.headers = res.headers
        document.content_hash = archive.archive_file(tmp_path)
        db.session.commit()
        get_manager().ingest_document(document)
    except IOError as ioe:
        log.info("IO Failure: %r", ioe)
        countdown = 3600 ** self.request.retries
        self.retry(countdown=countdown)
    except Exception as ex:
        document.status = Document.STATUS_FAIL
        document.error_type = type(ex).__name__
        document.error_message = six.text_type(ex)
        db.session.commit()
        log.exception(ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)
Example #20
    def ingest_attachment(self, part, meta):
        name, ext = os.path.splitext(part.detected_file_name)
        if len(ext):
            ext = ext.strip().lower()
        body = part.body
        if body is None:
            return
        child = meta.make_child()
        child.file_name = part.detected_file_name
        child.mime_type = part.detected_content_type

        # Weird outlook RTF representations -- do we want them?
        if child.file_name == 'rtf-body.rtf':
            return

        # Only write the attachment body to a temporary file once we know it
        # will actually be ingested, and make sure it is always cleaned up.
        out_path = self.write_temp(body, ext)
        try:
            ingest_file(self.collection_id, child, out_path, move=True)
        finally:
            remove_tempfile(out_path)
Example #21
 def ingest(self, meta, local_path):
     pdf_path = make_tempfile(name=meta.file_name, suffix='pdf')
     try:
         meta.title = meta.file_name
         if not self.check_image_size(meta, local_path):
             return
         convert = get_config('CONVERT_BIN')
         args = [
             convert, local_path, '-density', '450', '-define',
             'pdf:fit-page=A4', pdf_path
         ]
         subprocess.call(args)
         if not os.path.isfile(pdf_path):
             msg = "Could not convert image: %r" % meta
             raise ImageIngestorException(msg)
         self.store_pdf(meta, pdf_path)
         self.extract_pdf(meta, pdf_path)
     finally:
         remove_tempfile(pdf_path)
Example #22
    def ingest_message_data(self, meta, data):
        msg = mime.from_string(data)
        meta = self.parse_headers(msg, meta)
        bodies = {'text/plain': msg.body}

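        # Collect each body variant keyed by content type; non-body parts
        # are ingested as attachments.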
        for part in msg.walk():
            if part.is_body():
                content_type = six.text_type(part.content_type)
                bodies[content_type] = part.body
            else:
                self.ingest_attachment(part, meta)

        out_path = None
        try:
            if 'text/html' in bodies:
                out_path = self.write_text(bodies['text/html'], 'htm')
                HtmlIngestor(self.collection_id).ingest(meta, out_path)
            elif 'text/plain' in bodies:
                out_path = self.write_text(bodies['text/plain'], 'txt')
                DocumentIngestor(self.collection_id).ingest(meta, out_path)
        finally:
            # out_path is only set once a body was written to disk.
            if out_path is not None:
                remove_tempfile(out_path)
Example #23
def ingest_url(collection_id, metadata, url):
    meta = Metadata.from_data(metadata)
    tmp_path = make_tempfile(meta.file_name, suffix=meta.extension)
    try:
        log.info("Ingesting URL: %r", url)
        res = requests.get(url, stream=True, timeout=120)
        if res.status_code >= 400:
            msg = "HTTP Error %r: %r" % (url, res.status_code)
            raise IngestorException(msg)
        with open(tmp_path, 'w') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            meta.source_url = res.url
        meta.headers = res.headers
        meta = get_archive().archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(collection_id, meta)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()
        remove_tempfile(tmp_path)