Python pdf_to_lxml 예제들, billy.scrape.utils.pdf_to_lxml Python 예제들

예제 #1

0

파일 보기

파일: bills.py 프로젝트: drewstaylor/open13

    def scrape(self, session, chambers):
        """Scrape bills listed in the "Progress of Bills" PDF and save them.

        The bills index page is fetched to locate the "Progress of Bills"
        link, that PDF is downloaded and converted with ``pdf_to_lxml``, and
        one ``Bill`` is built and saved per link pointing at ``legdocs/Bills``.

        Args:
            session: Session identifier handed to each ``Bill``.
            chambers: Not used by this method.
        """
        index_url = 'http://www.legassembly.sk.ca/legislative-business/bills/'
        index_doc = lxml.html.fromstring(self.urlopen(index_url))
        index_doc.make_links_absolute(index_url)

        pdf_url = index_doc.xpath(
            '//a[text() = "Progress of Bills"]/@href').pop()
        filename, _resp = self.urlretrieve(pdf_url)

        pdf_doc = pdf_to_lxml(filename)

        # Column order of the date cells; paired positionally with the
        # dates found for each bill.
        stages = (
            'First Reading',
            'Crown recommendation',
            'Committee',
            'Second Reading',
            'Committee',
            'Amend Date',
            'Third Reading',
            'Royal Assent',
            'In Effect',
        )

        for link in pdf_doc.xpath('//a[contains(@href, "legdocs/Bills")]'):
            bill_id = link.text_content().strip()

            # Gather the unbroken run of <br> siblings right after the link;
            # their tail text carries the bill's fields.
            breaks = []
            for sibling in link.itersiblings():
                if sibling.tag != 'br':
                    break
                breaks.append(sibling)

            # When the leading '*' cell is absent, pad with a stand-in so the
            # fixed positions below line up either way.
            if breaks[0].tail.strip() != '*':
                breaks.insert(0, DummyBR('br', None, '*'))

            title_parts = [breaks[1].tail.strip()]
            sponsor = breaks[2].tail.strip()
            date_strings = breaks[3].tail.split(u'\xa0')
            # Anything after the dates is treated as continuation of the title.
            for extra in breaks[4:]:
                title_parts.append((extra.tail or '').strip())
            title = ' '.join(title_parts).strip()

            bill = Bill(session, 'lower', bill_id, title, type='bill')
            bill.add_sponsor(name=sponsor, type='primary')

            for stage, raw_date in zip(stages, date_strings):
                action_kwargs = {
                    'action': stage,
                    'date': datetime.datetime.strptime(
                        raw_date.strip(), '%Y-%m-%d'),
                    'actor': 'lower',
                }
                action_kwargs.update(self.categorizer.categorize(stage))
                bill.add_action(**action_kwargs)

            bill.add_source(pdf_url)
            bill.add_version('Introduced', link.attrib['href'],
                             mimetype='application/pdf')
            self.save_bill(bill)

예제 #2

0

파일 보기

파일: votes.py 프로젝트: PamelaM/openstates

 def scrape_roll_call(self, chamber, session, idx):
     """Download one roll-call PDF, print its parsed tree, report success.

     The URL comes from ``self.roll_call_url_format`` filled in with this
     method's local names (``chamber``, ``session``, ``idx``, ``self``).

     Returns:
         False when the download raises ``scrapelib.HTTPError``;
         True once the PDF has been converted and printed.
     """
     roll_call_url = self.roll_call_url_format % locals()

     try:
         pdf_path, _response = self.urlretrieve(roll_call_url)
     except scrapelib.HTTPError:
         return False

     # The downloaded file is deleted whether or not conversion succeeds.
     try:
         parsed = pdf_to_lxml(pdf_path)
     finally:
         os.remove(pdf_path)

     print(lxml.etree.tostring(parsed, pretty_print=True))
     return True

예제 #3

0

파일 보기

파일: bills.py 프로젝트: annerajb/openstates

    def scrape_vote(self, bill, name, url):
        """Parse a roll-call vote PDF and attach the vote to *bill*.

        Args:
            bill: Object receiving the finished vote through ``add_vote``.
            name: Description of the vote; it must look like
                ``"<Senate|House> Vote on <...>,<motion>"``.
            url: Location of the vote PDF, also recorded as the vote source.

        Returns:
            None. Nothing is added to *bill* when *name* does not match the
            expected pattern, or when the PDF text has no ``Date: m/d/yyyy``
            stamp (a warning is logged in the latter case).
        """
        match = re.match(r'^(Senate|House) Vote on [^,]*,(.*)$', name)

        if not match:
            return

        chamber = {'Senate': 'upper', 'House': 'lower'}[match.group(1)]
        motion = match.group(2).strip()

        if motion.startswith('FINAL PASSAGE'):
            vote_kind = 'passage'
        elif motion.startswith('AMENDMENT'):
            vote_kind = 'amendment'
        elif 'ON 3RD READING' in motion:
            vote_kind = 'reading:3'
        else:
            vote_kind = 'other'

        vote = Vote(chamber, None, motion, None, None, None, None)
        vote['type'] = vote_kind
        vote.add_source(url)

        fd, temp_path = tempfile.mkstemp()
        # Only the path is used below, so release the descriptor immediately
        # rather than holding it open across the download.
        os.close(fd)
        try:
            self.urlretrieve(url, temp_path)
            html = pdf_to_lxml(temp_path)
        finally:
            # Clean up the temp file even if the download or conversion fails.
            os.remove(temp_path)

        vote_type = None
        total_re = re.compile(r'^Total--(\d+)$')
        body = html.xpath('string(/html/body)')

        date_match = re.search(r'Date: (\d{1,2}/\d{1,2}/\d{4})', body)
        if date_match is None:
            self.warning("BAD VOTE: date error")
            return

        vote['date'] = datetime.datetime.strptime(date_match.group(1),
                                                  '%m/%d/%Y')

        # Non-breaking spaces are treated as line separators in the PDF text.
        for line in body.replace(u'\xa0', '\n').split('\n'):
            line = line.replace('&nbsp;', '').strip()
            if not line:
                continue

            if line in ('YEAS', 'NAYS', 'ABSENT'):
                # A section heading: the lines that follow are voter names.
                vote_type = {
                    'YEAS': 'yes',
                    'NAYS': 'no',
                    'ABSENT': 'other'
                }[line]
            elif line in ('Total', '--'):
                # Start of the totals footer ends the current section.
                vote_type = None
            elif vote_type:
                total_match = total_re.match(line)
                if total_match:
                    vote['%s_count' % vote_type] = int(total_match.group(1))
                elif vote_type == 'yes':
                    vote.yes(line)
                elif vote_type == 'no':
                    vote.no(line)
                elif vote_type == 'other':
                    vote.other(line)

        # tally counts (overrides any totals parsed above)
        vote['yes_count'] = len(vote['yes_votes'])
        vote['no_count'] = len(vote['no_votes'])
        vote['other_count'] = len(vote['other_votes'])

        # The PDFs oddly don't say whether a vote passed or failed.
        # Hopefully passage just requires yes_votes > not_yes_votes
        vote['passed'] = (
            vote['yes_count'] > (vote['no_count'] + vote['other_count']))

        bill.add_vote(vote)

예제 #4

0

파일 보기

파일: bills.py 프로젝트: ipzero/openstates

    def scrape_vote(self, bill, name, url):
        """Parse a roll-call vote PDF and attach the vote to *bill*.

        Args:
            bill: Object receiving the finished vote through ``add_vote``.
            name: Description of the vote; it must look like
                ``"<Senate|House> Vote on <...>,<motion>"``.
            url: Location of the vote PDF, also recorded as the vote source.

        Returns:
            None. Nothing is added to *bill* when *name* does not match the
            expected pattern, or when the PDF text has no ``Date: m/d/yyyy``
            stamp (a warning is logged in the latter case).
        """
        match = re.match(r'^(Senate|House) Vote on [^,]*,(.*)$', name)

        if not match:
            return

        chamber = {'Senate': 'upper', 'House': 'lower'}[match.group(1)]
        motion = match.group(2).strip()

        if motion.startswith('FINAL PASSAGE'):
            vote_kind = 'passage'
        elif motion.startswith('AMENDMENT'):
            vote_kind = 'amendment'
        elif 'ON 3RD READING' in motion:
            vote_kind = 'reading:3'
        else:
            vote_kind = 'other'

        vote = Vote(chamber, None, motion, None,
                    None, None, None)
        vote['type'] = vote_kind
        vote.add_source(url)

        with self.urlopen(url) as text:
            fd, temp_path = tempfile.mkstemp()
            try:
                # fdopen takes ownership of fd and closes it with the block.
                with os.fdopen(fd, 'wb') as w:
                    w.write(text)
                html = pdf_to_lxml(temp_path)
            finally:
                # Clean up the temp file even if writing or conversion fails.
                os.remove(temp_path)

            vote_type = None
            total_re = re.compile(r'^Total--(\d+)$')
            body = html.xpath('string(/html/body)')

            date_match = re.search(r'Date: (\d{1,2}/\d{1,2}/\d{4})', body)
            if date_match is None:
                self.warning("BAD VOTE: date error")
                return

            vote['date'] = datetime.datetime.strptime(date_match.group(1),
                                                      '%m/%d/%Y')

            # Non-breaking spaces are treated as line separators.
            for line in body.replace(u'\xa0', '\n').split('\n'):
                line = line.replace('&nbsp;', '').strip()
                if not line:
                    continue

                if line in ('YEAS', 'NAYS', 'ABSENT'):
                    # A section heading: following lines are voter names.
                    vote_type = {'YEAS': 'yes', 'NAYS': 'no',
                                 'ABSENT': 'other'}[line]
                elif line in ('Total', '--'):
                    # Start of the totals footer ends the current section.
                    vote_type = None
                elif vote_type:
                    total_match = total_re.match(line)
                    if total_match:
                        vote['%s_count' % vote_type] = int(
                            total_match.group(1))
                    elif vote_type == 'yes':
                        vote.yes(line)
                    elif vote_type == 'no':
                        vote.no(line)
                    elif vote_type == 'other':
                        vote.other(line)

        # tally counts (overrides any totals parsed above)
        vote['yes_count'] = len(vote['yes_votes'])
        vote['no_count'] = len(vote['no_votes'])
        vote['other_count'] = len(vote['other_votes'])

        # The PDFs oddly don't say whether a vote passed or failed.
        # Hopefully passage just requires yes_votes > not_yes_votes
        vote['passed'] = (
            vote['yes_count'] > (vote['no_count'] + vote['other_count']))

        bill.add_vote(vote)

예제 #5

0

파일 보기

    def scrape_vote(self, bill, name, url):
        """Parse a roll-call vote PDF and attach the vote to *bill*.

        The vote date is taken from four digits that follow the bill id in
        the PDF text, read as month then day, with ``bill['session']``
        supplying the year.

        Args:
            bill: Mapping-like bill providing ``'bill_id'`` and ``'session'``
                and receiving the finished vote through ``add_vote``.
            name: Description of the vote; it must look like
                ``"<Senate|House> Vote on <...>,<motion>"``.
            url: Location of the vote PDF, also recorded as the vote source.

        Returns:
            None. Nothing is added to *bill* when *name* does not match the
            expected pattern, or when no date can be located in the PDF text
            (``BAD VOTE`` is printed in the latter case).
        """
        match = re.match(r'^(Senate|House) Vote on [^,]*,(.*)$', name)

        if not match:
            return

        chamber = {'Senate': 'upper', 'House': 'lower'}[match.group(1)]
        motion = match.group(2).strip()

        if motion.startswith('FINAL PASSAGE'):
            vote_kind = 'passage'
        elif motion.startswith('AMENDMENT'):
            vote_kind = 'amendment'
        elif 'ON 3RD READING' in motion:
            # Previously misspelled as 'READINT', so this branch never matched.
            vote_kind = 'reading:3'
        else:
            vote_kind = 'other'

        vote = Vote(chamber, None, motion, None, None, None, None)
        vote['type'] = vote_kind
        vote.add_source(url)

        with self.urlopen(url) as text:
            fd, temp_path = tempfile.mkstemp()
            try:
                # fdopen takes ownership of fd and closes it with the block.
                with os.fdopen(fd, 'wb') as w:
                    w.write(text)
                html = pdf_to_lxml(temp_path)
            finally:
                # Clean up the temp file even if writing or conversion fails.
                os.remove(temp_path)

            vote_type = None
            total_re = re.compile(r'^Total--(\d+)$')
            body = html.xpath('string(/html/body)')

            # Escape the bill id so it is matched literally, not as a pattern.
            date_match = re.search(
                r'%s (\d{4})' % re.escape(bill['bill_id']), body)
            if date_match is None:
                # Parenthesised single argument prints the same text whether
                # print is a statement or a function.
                print("BAD VOTE")
                return
            date = date_match.group(1)
            month = int(date[0:2])
            day = int(date[2:4])
            vote['date'] = datetime.date(int(bill['session']), month, day)

            # Non-breaking spaces are treated as line separators.
            for line in body.replace(u'\xa0', '\n').split('\n'):
                line = line.replace('&nbsp;', '').strip()
                if not line:
                    continue

                if line in ('YEAS', 'NAYS', 'ABSENT'):
                    # A section heading: following lines are voter names.
                    vote_type = {
                        'YEAS': 'yes',
                        'NAYS': 'no',
                        'ABSENT': 'other'
                    }[line]
                elif vote_type:
                    total_match = total_re.match(line)
                    if total_match:
                        vote['%s_count' % vote_type] = int(
                            total_match.group(1))
                    elif vote_type == 'yes':
                        vote.yes(line)
                    elif vote_type == 'no':
                        vote.no(line)
                    elif vote_type == 'other':
                        vote.other(line)

        # NOTE(review): the counts are only assigned when a "Total--N" line
        # is matched above; confirm what Vote holds for them otherwise.

        # The PDFs oddly don't say whether a vote passed or failed.
        # Hopefully passage just requires yes_votes > not_yes_votes
        vote['passed'] = (
            vote['yes_count'] > (vote['no_count'] + vote['other_count']))

        bill.add_vote(vote)

예제 #6

0

파일 보기

파일: bills.py 프로젝트: QuorumUS/openstates

    def scrape_vote(self, bill, name, url):
        """Parse a roll-call vote PDF and attach the vote to *bill*.

        Args:
            bill: Object receiving the finished vote through ``add_vote``.
            name: Description of the vote; it must look like
                ``"<Senate|House> Vote on <...>,<motion>"``.
            url: Location of the vote PDF, also recorded as the vote source.

        Returns:
            None. Nothing is added to *bill* when *name* does not match the
            expected pattern, or when the PDF text has no ``Date: m/d/yyyy``
            stamp (a warning is logged in the latter case).
        """
        match = re.match(r"^(Senate|House) Vote on [^,]*,(.*)$", name)

        if not match:
            return

        chamber = {"Senate": "upper", "House": "lower"}[match.group(1)]
        motion = match.group(2).strip()

        if motion.startswith("FINAL PASSAGE"):
            vote_kind = "passage"
        elif motion.startswith("AMENDMENT"):
            vote_kind = "amendment"
        elif "ON 3RD READING" in motion:
            vote_kind = "reading:3"
        else:
            vote_kind = "other"

        vote = Vote(chamber, None, motion, None, None, None, None)
        vote["type"] = vote_kind
        vote.add_source(url)

        fd, temp_path = tempfile.mkstemp()
        # Only the path is used below, so release the descriptor immediately
        # rather than holding it open across the download.
        os.close(fd)
        try:
            self.urlretrieve(url, temp_path)
            html = pdf_to_lxml(temp_path)
        finally:
            # Clean up the temp file even if the download or conversion fails.
            os.remove(temp_path)

        vote_type = None
        total_re = re.compile(r"^Total--(\d+)$")
        body = html.xpath("string(/html/body)")

        date_match = re.search(r"Date: (\d{1,2}/\d{1,2}/\d{4})", body)
        if date_match is None:
            self.warning("BAD VOTE: date error")
            return

        vote["date"] = dt.datetime.strptime(date_match.group(1), "%m/%d/%Y")

        # Non-breaking spaces are treated as line separators in the PDF text.
        for line in body.replace(u"\xa0", "\n").split("\n"):
            line = line.replace("&nbsp;", "").strip()
            if not line:
                continue

            if line in ("YEAS", "NAYS", "ABSENT"):
                # A section heading: the lines that follow are voter names.
                vote_type = {"YEAS": "yes", "NAYS": "no", "ABSENT": "other"}[line]
            elif line in ("Total", "--"):
                # Start of the totals footer ends the current section.
                vote_type = None
            elif vote_type:
                total_match = total_re.match(line)
                if total_match:
                    vote["%s_count" % vote_type] = int(total_match.group(1))
                elif vote_type == "yes":
                    vote.yes(line)
                elif vote_type == "no":
                    vote.no(line)
                elif vote_type == "other":
                    vote.other(line)

        # tally counts (overrides any totals parsed above)
        vote["yes_count"] = len(vote["yes_votes"])
        vote["no_count"] = len(vote["no_votes"])
        vote["other_count"] = len(vote["other_votes"])

        # The PDFs oddly don't say whether a vote passed or failed.
        # Hopefully passage just requires yes_votes > not_yes_votes
        vote["passed"] = (
            vote["yes_count"] > (vote["no_count"] + vote["other_count"]))

        bill.add_vote(vote)

예제 #7

0

파일 보기

파일: committees.py 프로젝트: drewstaylor/open13

    def scrape(self, term, chambers):
        """Download the committee membership PDF and convert it.

        Neither ``term`` nor ``chambers`` is used in the visible code, and
        nothing is saved or returned.

        NOTE(review): the body ends at a ``pdb.set_trace()`` breakpoint, so
        this looks like unfinished work in progress -- confirm before use.
        """
        url = ('http://www.gov.mb.ca/legislature/committees/membership.pdf')
        filename, resp = self.urlretrieve(url)
        # type='xml' presumably selects XML rather than HTML output from the
        # converter -- TODO confirm against pdf_to_lxml.
        doc = pdf_to_lxml(filename, type='xml')

        # Drops into the interactive debugger every time this method runs.
        import pdb;pdb.set_trace()