Esempio n. 1
0
    def extract_headers_metadata(self, headers):
        """Store the raw headers on the result and map known ones to metadata.

        Header names are compared case-insensitively. ``subject`` feeds the
        title, ``message-id`` the message id, ``in-reply-to`` and
        ``references`` the in-reply-to list, ``date`` the creation time,
        ``from`` the author, and ``to``/``cc``/``bcc`` are run through
        ``parse_emails``.

        :param headers: iterable of ``(name, value)`` pairs. It is iterated
            twice (once by ``dict()`` and once by the loop below), so it must
            be re-iterable.
        """
        self.result.headers = safe_dict(dict(headers))
        headers = [(safe_string(k), safe_string(v)) for k, v in headers]
        for field, value in headers:
            # Skip incomplete pairs *before* normalising the name: calling
            # ``.lower()`` on a ``None`` name would raise AttributeError.
            if field is None or value is None:
                continue
            field = field.lower()

            if field == 'subject':
                self.update('title', value)

            if field == 'message-id':
                self.update('message_id', value)

            if field == 'in-reply-to':
                self.result.emit_in_reply_to(value)
            if field == 'references':
                # The header is split on whitespace; every token is recorded.
                for reference in value.split():
                    self.result.emit_in_reply_to(reference)

            if field == 'date':
                try:
                    # ``parsedate`` returns None for unparseable input, which
                    # makes ``mktime`` raise; that lands in the handler below.
                    parsed = email.utils.parsedate(value)
                    created_at = datetime.fromtimestamp(mktime(parsed))
                    self.update('created_at', created_at)
                except Exception as ex:
                    # Log the raw header text rather than a half-parsed value.
                    log.warning("Failed to parse [%s]: %s", value, ex)

            if field == 'from':
                for (name, _) in self.parse_emails(value):
                    self.update('author', name)

            if field in ['to', 'cc', 'bcc']:
                # Called for its side effects only; the result is not used.
                self.parse_emails(value)
Esempio n. 2
0
    def ingest(self, file_path):
        """Parse the message at ``file_path`` and copy it onto the result.

        Headers, plain-text body, message id, names/addresses, title, author
        and attachments are all read from the parsed ``Message``.
        """
        msg = Message(file_path)
        self._parse_headers(msg)
        self.extract_plain_text_content(msg.getField('1000'))
        self.update('message_id', msg.getField('1035'))

        # Fields with associated person names (sender, recipient etc.),
        # followed by the fields that are handled as e-mail addresses.
        person_fields = ['0C1A', '0E04', '0040', '004D']
        address_fields = [
            '0C1F', '0076', '0078', '1046', '3003', '0065', '3FFC', '403E'
        ]
        for field_id in person_fields + address_fields:
            self.parse_emails(msg.getField(field_id))

        # ``update`` only fills values that are still unset, so the order of
        # these pairs decides which field wins.
        for target, field_id in (('title', '0037'),
                                 ('title', '0070'),
                                 ('author', '0C1A')):
            self.update(target, msg.getField(field_id))

        self.extract_olefileio_metadata(msg)
        self.result.flag(self.result.FLAG_EMAIL)
        self.result.flag(self.result.FLAG_PLAINTEXT)

        for part in msg.attachments:
            # Prefer the long file name and fall back to the short one.
            label = (safe_string(part.longFilename) or
                     safe_string(part.shortFilename))
            self.ingest_attachment(label, part.mimeType, part.data)
Esempio n. 3
0
    def ingest_attachment(self, name, mime_type, body):
        """Write an attachment into the work path and pass it on as a child.

        Attachments with neither a usable name nor any body are ignored.
        """
        body_missing = not (body is not None and len(body))
        if safe_string(name) is None and body_missing:
            # Nothing to store: no name and no content.
            return

        file_name = safe_filename(name, default='attachment')
        display_name = safe_string(name) or file_name
        foreign_id = join_path(self.result.id, display_name)
        file_path = join_path(self.work_path, file_name)

        with open(file_path, 'wb') as out:
            # Text bodies are encoded as UTF-8; a ``None`` body leaves the
            # file empty.
            payload = body.encode('utf-8') if isinstance(body, str) else body
            if payload is not None:
                out.write(payload)

        child_mime = mime_type
        if isinstance(child_mime, bytes):
            child_mime = child_mime.decode('utf-8')

        self.manager.handle_child(
            self.result,
            file_path,
            id=foreign_id,
            file_name=display_name,
            mime_type=child_mime,
        )
Esempio n. 4
0
 def generate_rows(self, table):
     """Yield each row of ``table`` as an ordered mapping of field to text.

     Rows that raise while being converted are logged and skipped.
     """
     columns = [safe_string(name) for name in table.field_names]
     for record in table:
         try:
             yield OrderedDict(
                 (column, safe_string(cell))
                 for column, cell in zip(columns, record)
             )
         except Exception as ex:
             log.warning("Cannot decode DBF row: %s", ex)
Esempio n. 5
0
 def generate_rows(self, reader, has_header=False):
     """Yield each row of ``reader`` as an ordered mapping of header to text.

     :param reader: iterator over rows (sequences of cell values).
     :param has_header: when true, the first row supplies the column names.

     Columns without a header name are labelled ``Column N`` (1-based).
     """
     # Use a default with ``next``: a bare ``next(reader)`` on an empty
     # reader raises StopIteration, which inside a generator surfaces as
     # RuntimeError on Python 3.7+ (PEP 479) instead of yielding no rows.
     headers = next(reader, []) if has_header else []
     headers = [safe_string(h) for h in headers]
     for row in reader:
         # Grow the header list whenever a row is wider than anything seen.
         while len(headers) < len(row):
             headers.append('Column %s' % (len(headers) + 1))
         data = OrderedDict()
         for header, value in zip(headers, row):
             data[header] = safe_string(value)
         yield data
Esempio n. 6
0
 def get_email_addresses(self, doc, tag):
     """Yield ``(name, email)`` pairs for the ``emailAddress`` nodes of a tag.

     Every address is passed to ``emit_email`` on the result. Values that
     fail ``check_email`` are dropped as addresses, and names that pass it
     are dropped as names. Pairs with neither part are not yielded.
     """
     for node in doc.findall('./%s/emailAddress' % tag):
         addr = safe_string(node.get('OPFContactEmailAddressAddress'))
         addr = addr if self.check_email(addr) else None
         self.result.emit_email(addr)

         label = safe_string(node.get('OPFContactEmailAddressName'))
         label = None if self.check_email(label) else label

         if label or addr:
             yield (label, addr)
Esempio n. 7
0
    def update(self):
        """Copy the state of this result onto its document and tag it."""
        doc = self.document
        if self.status == self.STATUS_SUCCESS:
            doc.status, doc.error_message = Document.STATUS_SUCCESS, None
        else:
            doc.status = Document.STATUS_FAIL
            doc.error_message = stringify(self.error_message)

        # Start from the generic schema; the last matching flag wins.
        schema = model['Document']
        for flag, schema_name in self.SCHEMATA:
            if flag in self.flags:
                schema = model[schema_name]

        doc.schema = schema.name
        doc.foreign_id = safe_string(self.id)
        doc.content_hash = self.checksum or doc.content_hash
        doc.pdf_version = self.pdf_checksum

        # (document attribute and meta key, result attribute): values from
        # the result take precedence, the document meta is the fallback.
        fallbacks = (
            ('title', 'title'),
            ('file_name', 'file_name'),
            ('file_size', 'size'),
            ('summary', 'summary'),
            ('author', 'author'),
            ('generator', 'generator'),
            ('mime_type', 'mime_type'),
            ('encoding', 'encoding'),
            ('date', 'date'),
            ('authored_at', 'created_at'),
            ('modified_at', 'modified_at'),
            ('published_at', 'published_at'),
            ('message_id', 'message_id'),
        )
        for doc_field, result_field in fallbacks:
            value = getattr(self, result_field) or doc.meta.get(doc_field)
            setattr(doc, doc_field, value)

        doc.in_reply_to = ensure_list(self.in_reply_to)
        doc.columns = list(self.columns.keys())
        doc.body_raw = self.body_html
        doc.body_text = self.body_text
        doc.headers = self.headers

        for keyword in self.keywords:
            doc.add_keyword(safe_string(keyword))
        for language in self.languages:
            doc.add_language(safe_string(language))

        db.session.flush()

        tags = DocumentTagCollector(doc, 'ingestors')
        for person in self.entities:
            tags.emit(person, DocumentTag.TYPE_PERSON)
        for address in self.emails:
            tags.emit(address, DocumentTag.TYPE_EMAIL)
        tags.save()
Esempio n. 8
0
    def ingest(self, file_path):
        """Ingest an OPF XML file that holds exactly one ``email`` element.

        Builds a header mapping from the element's properties and contacts,
        then fills title, author, summary, timestamps and the body (HTML when
        flagged and present, plain text otherwise) on the result.

        :param file_path: path handed to ``parse_xml``.
        :raises ProcessingException: if the file cannot be parsed or does not
            contain exactly one ``email`` element.
        """
        self.result.flag(self.result.FLAG_EMAIL)
        try:
            doc = self.parse_xml(file_path)
        except TypeError:
            raise ProcessingException("Cannot parse OPF XML file.")

        # Look the elements up once, and tell "none" apart from "several" so
        # the error message matches the actual problem.
        emails = doc.findall('//email')
        if len(emails) == 0:
            raise ProcessingException("No email in file.")
        if len(emails) > 1:
            raise ProcessingException("More than one email in file.")

        email = emails[0]
        # Iterating an element yields its children. ``getchildren()`` is
        # deprecated in lxml and was removed from xml.etree in Python 3.9.
        props = {c.tag: safe_string(c.text) for c in email if c.text}
        headers = {
            'Subject': props.get('OPFMessageCopySubject'),
            'Message-ID': props.pop('OPFMessageCopyMessageID', None),
            'From': self.get_contacts(email, 'OPFMessageCopyFromAddresses'),
            'Sender': self.get_contacts(email, 'OPFMessageCopySenderAddress'),
            'To': self.get_contacts(email, 'OPFMessageCopyToAddresses'),
            'CC': self.get_contacts(email, 'OPFMessageCopyCCAddresses'),
            'BCC': self.get_contacts(email, 'OPFMessageCopyBCCAddresses'),
        }
        # ``get`` (not ``pop``): the raw value is needed again further down
        # for ``created_at``.
        date = props.get('OPFMessageCopySentTime')
        if date is not None:
            date = datetime.strptime(date, '%Y-%m-%dT%H:%M:%S')
            date = time.mktime(date.timetuple())
            headers['Date'] = utils.formatdate(date)

        self.result.headers = safe_dict(headers)

        # ``update`` keeps the first value set, so the subject takes
        # precedence over the thread topic.
        self.update('title', props.pop('OPFMessageCopySubject', None))
        self.update('title', props.pop('OPFMessageCopyThreadTopic', None))
        for tag in ('OPFMessageCopyFromAddresses',
                    'OPFMessageCopySenderAddress'):
            self.update('author', self.get_contact_name(email, tag))

        self.update('summary', props.pop('OPFMessageCopyPreview', None))
        self.update('created_at', props.pop('OPFMessageCopySentTime', None))
        self.update('modified_at', props.pop('OPFMessageCopyModDate', None))

        body = props.pop('OPFMessageCopyBody', None)
        html = props.pop('OPFMessageCopyHTMLBody', None)

        # The HTML flag is compared against the literal text '1E0'.
        has_html = '1E0' == props.pop('OPFMessageGetHasHTML', None)
        if has_html and safe_string(html):
            self.extract_html_content(html)
            self.result.flag(self.result.FLAG_HTML)
        else:
            self.extract_plain_text_content(body)
            self.result.flag(self.result.FLAG_PLAINTEXT)
Esempio n. 9
0
 def emit_page(self, index, text):
     """Add a plain-text page record for this document to the session."""
     page = DocumentRecord()
     page.index = index
     page.text = safe_string(text)
     page.document_id = self.document.id
     db.session.add(page)
Esempio n. 10
0
    def ingest(self, file_path, result=None, work_path=None):
        """Run the ingest pipeline for one file and return its result.

        A result object is created when none is passed in. The ingestor
        picked by ``auction`` is run via ``delegate``. A
        ``ProcessingException`` marks the result as failed; a result that is
        still pending when the attempt ends is marked as stopped.
        """
        if result is None:
            base_name = None
            if file_path:
                base_name = os.path.basename(file_path)
            result = self.RESULT_CLASS(file_path=file_path,
                                       file_name=base_name)

        self.checksum_file(result, file_path)
        self.before(result)
        result.status = Result.STATUS_PENDING
        try:
            chosen = self.auction(file_path, result)
            log.debug("Ingestor [%s]: %s", result, chosen.__name__)
            self.delegate(chosen, result, file_path, work_path=work_path)
            result.status = Result.STATUS_SUCCESS
        except ProcessingException as exc:
            result.error_message = safe_string(exc)
            result.status = Result.STATUS_FAILURE
            log.warning("Failed [%s]: %s", result, result.error_message)
        finally:
            # Still pending here means the attempt ended without reaching
            # either the success or the failure assignment above.
            still_pending = result.status == Result.STATUS_PENDING
            if still_pending:
                result.status = Result.STATUS_STOPPED
            self.after(result)

        return result
Esempio n. 11
0
File: result.py Progetto: pudo/aleph
 def emit_page(self, index, text):
     """Create a text record for one page and add it to the session."""
     entry = DocumentRecord()
     entry.index = index
     entry.text = safe_string(text)
     entry.document_id = self.document.id
     db.session.add(entry)
Esempio n. 12
0
    def parse_emails(self, text):
        """Parse an address list into ``(name, email)`` pairs.

        Each parsed name and address is also emitted on the result. Input
        that yields no parsed entries is returned unchanged as a single
        ``(text, None)`` pair.
        """
        entries = address.parse_list(safe_string(text))
        if len(entries) == 0:
            # Nothing parsed: assume the snippet is just a name.
            return [(text, None)]

        pairs = []
        for entry in entries:
            display = stringify(entry.display_name)
            mailbox = stringify(entry.address)

            if not self.check_email(mailbox):
                mailbox = None

            if self.check_email(display):
                # The display name is itself an address: use it as the
                # address if there is none, and drop it as a name.
                mailbox = mailbox or display
                display = None

            self.result.emit_email(mailbox)
            self.result.emit_name(display)
            pairs.append((display, mailbox))
        return pairs
Esempio n. 13
0
 def update(self, name, value):
     """Set ``name`` on the result unless it already holds a truthy value.

     Dates and datetimes are stored as given; anything else goes through
     ``safe_string`` first and is skipped when that yields ``None``.
     """
     if getattr(self.result, name):
         return
     if isinstance(value, (date, datetime)):
         cleaned = value
     else:
         cleaned = safe_string(value)
     if cleaned is not None:
         setattr(self.result, name, cleaned)
Esempio n. 14
0
 def generate_csv(self, table):
     """Yield the cell texts of each table row as a list.

     A cell carrying a ``numbercolumnsrepeated`` attribute contributes its
     value that many times.
     """
     for table_row in table.getElementsByType(TableRow):
         cells = []
         for cell in table_row.getElementsByType(TableCell):
             repeat = cell.getAttribute("numbercolumnsrepeated") or 1
             text = safe_string(self.convert_cell(cell))
             cells.extend([text] * int(repeat))
         yield cells
Esempio n. 15
0
 def convert_cell(self, cell, sheet):
     """Return the cell value as text, formatting type-3 cells as ISO dates.

     Cells with ``ctype == 3`` are converted through
     ``xlrd.xldate_as_tuple``: a zero value gives ``None``, a tuple without
     a date part gives an ISO time, anything else an ISO datetime. If the
     conversion raises, the raw value is stringified instead.
     """
     raw = cell.value
     try:
         if cell.ctype == 3:
             if raw == 0:
                 return None
             year, month, day, hour, minute, second = \
                 xlrd.xldate_as_tuple(raw, sheet.book.datemode)
             if (year, month, day) != (0, 0, 0):
                 stamp = datetime(year, month, day, hour, minute, second)
                 return stamp.isoformat()
             return time(hour, minute, second).isoformat()
     except Exception:
         # Best effort: fall through to the plain string form below.
         pass
     return safe_string(raw)
Esempio n. 16
0
 def _emit_iterator_rows(self, iterator):
     for data in iterator:
         for column in data.keys():
             column = safe_string(column)
             self.columns[column] = None
         yield data
Esempio n. 17
0
 def emit_name(self, text):
     """Append a name to ``entities`` unless ``safe_string`` gives ``None``."""
     name = safe_string(text)
     if name is not None:
         self.entities.append(name)
Esempio n. 18
0
 def emit_in_reply_to(self, text):
     """Add a value to ``in_reply_to`` once, ignoring ``None`` values."""
     reference = safe_string(text)
     if reference is None or reference in self.in_reply_to:
         return
     self.in_reply_to.append(reference)
Esempio n. 19
0
 def emit_pdf_alternative(self, file_path):
     """Store the given path, passed through ``safe_string``, as ``pdf_path``."""
     cleaned_path = safe_string(file_path)
     self.pdf_path = cleaned_path
Esempio n. 20
0
 def emit_page(self, index, text):
     """Append a page entry with its text and index to ``pages``."""
     pages = self.pages
     entry = {'text': safe_string(text), 'index': index}
     pages.append(entry)
Esempio n. 21
0
 def label(self):
     """Return the file name, else the checksum, else ``'<result>'``."""
     file_name = safe_string(self.file_name)
     if file_name:
         return file_name
     return self.checksum or '<result>'
Esempio n. 22
0
 def emit_language(self, text):
     """Add a language to ``languages`` once, ignoring ``None`` values.

     :param text: language value; it is passed through ``safe_string``.
     """
     text = safe_string(text)
     if text is None:
         return
     # Check for duplicates in ``languages`` itself. The earlier check
     # against ``keywords`` let repeated languages pile up and dropped any
     # language that happened to match a keyword.
     if text not in self.languages:
         self.languages.append(text)
Esempio n. 23
0
 def emit_email(self, text):
     """Append an address to ``emails`` unless ``safe_string`` gives ``None``."""
     address_text = safe_string(text)
     if address_text is not None:
         self.emails.append(address_text)
Esempio n. 24
0
 def emit_html_body(self, html, text):
     """Store the HTML body, then hand the text to ``emit_text_body``."""
     cleaned_html = safe_string(html)
     self.body_html = cleaned_html
     self.emit_text_body(text)
Esempio n. 25
0
File: result.py Progetto: pudo/aleph
 def _emit_iterator_rows(self, iterator):
     for data in iterator:
         for column in data.keys():
             column = safe_string(column)
             self.columns[column] = None
         yield data
Esempio n. 26
0
 def emit_text_body(self, text):
     """Store the given text, passed through ``safe_string``, as ``body_text``."""
     cleaned_text = safe_string(text)
     self.body_text = cleaned_text
Esempio n. 27
0
 def generate_csv(self, sheet):
     """Yield the stringified cell values of each sheet row as a list.

     Rows that raise one of the handled errors are logged and skipped.
     """
     for cells in sheet.rows:
         try:
             values = [safe_string(cell.value) for cell in cells]
             yield values
         except (ValueError, OverflowError, ParseError) as exc:
             log.warning("Failed to read Excel row: %s", exc)