コード例 #1
0
    def parse(self, response):
        """Yield a Meeting for each PDF attachment with a commission section.

        The response body is parsed as an email message. Each
        ``application/pdf`` attachment is converted to text with
        ``self._parse_pdf_text`` and searched for a "Senior Citizens
        Commission" section, which runs up to the next blank line. Every
        matching section is turned into one Meeting.

        Args:
            response: object exposing the raw email bytes as ``body`` and the
                source address as ``url``.

        Yields:
            Meeting items with ``status`` and ``id`` filled in.

        Raises:
            ValueError: if none of the PDF attachments contains a matching
                section (raised when the generator is first advanced).
        """
        msg = BytesParser(policy=default).parsebytes(response.body)
        pdf_list = [
            a
            for a in msg.iter_attachments()
            if a.get_content_type() == "application/pdf"
        ]
        # List of tuples of filename, match string
        match_list = []

        for pdf_obj in pdf_list:
            pdf_text = self._parse_pdf_text(pdf_obj.get_payload(decode=True))
            meeting_match = re.search(
                r"Senior Citizens\s+Commission\n.*?(?=\n\n)",
                pdf_text,
                flags=re.I | re.M | re.DOTALL,
            )
            if meeting_match:
                match_list.append(
                    (pdf_obj.get_filename(), meeting_match.group()))

        if not match_list:
            raise ValueError("Meeting not found in {} PDFs".format(
                len(pdf_list)))

        for pdf_name, meeting_str in match_list:
            # Read the year from the filename of the PDF this section came
            # from, not from the first PDF in the message. get_filename()
            # returns None when the attachment carries no filename, so fall
            # back to an empty string to keep re.search from raising.
            year_match = re.search(r"\d{4}", pdf_name or "")
            year_str = year_match.group() if year_match else None
            start, end = self._parse_times(meeting_str, year_str)
            if not start:
                # Skip only this section; later matches may still be usable.
                continue
            meeting = Meeting(
                title="Senior Citizens Commission",
                description="",
                classification=COMMISSION,
                start=start,
                end=end,
                all_day=False,
                time_notes="",
                location=self._parse_location(meeting_str),
                links=[],
                source=response.url,
            )

            meeting["status"] = self._get_status(meeting, text=meeting_str)
            meeting["id"] = self._get_id(meeting)

            yield meeting
コード例 #2
0
    def parse(self, response):
        """Yield the meetings described by an emailed notice.

        The response body is parsed as an email message. Items are collected
        from the first attachment whose filename contains ".docx" (via
        ``self._parse_docx``), if there is one, followed by the items from
        ``self._parse_email_text``. The combined list is handed to
        ``self._parse_meetings`` and whatever it produces is yielded.

        Args:
            response: object exposing the raw email bytes as ``body``.

        Yields:
            The items produced by ``self._parse_meetings``.
        """
        msg = BytesParser(policy=default).parsebytes(response.body)
        # get_filename() returns None for attachments that carry no filename;
        # treat those as non-matching instead of raising TypeError on `in`.
        docx_list = [
            a
            for a in msg.iter_attachments()
            if ".docx" in (a.get_filename() or "")
        ]
        items = []
        if docx_list:
            # Only the first .docx attachment is parsed.
            items.extend(
                self._parse_docx(docx_list[0].get_payload(decode=True)))
        items.extend(self._parse_email_text(msg))
        yield from self._parse_meetings(items)
コード例 #3
0
    def parse(self, response):
        """
        Yield the single item built by `_parse_detail` for an emailed notice.

        The response body is read as an email message. If it carries at least
        one `application/pdf` attachment, the decoded payload of the first one
        is passed to `_parse_pdf_text`; otherwise the message itself is passed
        to `_parse_email_text`. The resulting value is given to
        `_parse_detail`.
        """
        message = BytesParser(policy=default).parsebytes(response.body)
        pdf_parts = [
            part
            for part in message.iter_attachments()
            if part.get_content_type() == "application/pdf"
        ]

        if not pdf_parts:
            # No PDF attached: fall back to the email text.
            text = self._parse_email_text(message)
        else:
            first_pdf_bytes = pdf_parts[0].get_payload(decode=True)
            text = self._parse_pdf_text(first_pdf_bytes)

        yield self._parse_detail(text)
コード例 #4
0
from email import policy
from email.parser import BytesParser

myfiles = [
    '20140217-0121.eml.1c2dffd0', '20140217-0314.eml.14bac63d',
    '20140218-0722.eml.00fe7528', '20140219-0541.eml.74741be1',
    '20140219-0543.eml.1c20938f', '20140219-0608.eml.02af7d91',
    '20140219-0612.eml.0d9a2c0b', '20140224-2004.eml.6f36a877',
    '20140225-1702.eml.39a4225b'
]

for filename in myfiles:
    msg = BytesParser(policy=policy.default).parse(open(filename, 'rb'))
    print('Processing %s' % (filename, ))

    for attachment in msg.iter_attachments():
        fn = attachment.get_filename()
        print('Attachment filename is %s' % (fn, ))
        if fn:
            extension = os.path.splitext(attachment.get_filename())[1]
        else:
            extension = mimetypes.guess_extension(
                attachment.get_content_type())
        f = io.BytesIO()
        data = attachment.get_content()
        with open(fn, 'wb') as f:
            if isinstance(data, str):
                # data is a string
                f.write(data.encode('utf-8'))
            else:
                # data is bytes