Example #1
0
    def parse_knesset_laws_page(self,soup):
        """Collect law records from every PDF link found on a laws page.

        For each ``<a>`` tag in ``soup`` whose ``href`` contains ``.pdf``, the
        booklet number is taken from the first ``/<digits>/`` segment of the
        href, the PDF at ``self.pdf_url + href`` is parsed with
        ``parse_knesset_bill_pdf.parse``, and one dict per parsed entry is
        appended to ``self.laws_data``.

        Each entry's title is split on its parenthesised parts: the last one
        becomes ``comment``, the one before it becomes ``correction`` (passed
        through ``fix_dash``), and the text in front of them becomes ``law``.
        A title without parenthesised parts is used as ``law`` with its commas
        removed.

        Returns:
            False as soon as a link whose booklet number is not greater than
            ``self.min_booklet`` is reached (the remaining links are not
            processed); True once every link has been processed.
        """
        # Raw string: same pattern as before, without relying on Python
        # passing unknown escapes such as "\(" through unchanged.
        paren_part = re.compile(r'[^\(\)]*\((.*?)\)[^\(\)]')
        # Equal to the UTF-8 bytes previously decoded at runtime (a Hebrew
        # word followed by a space); written with escapes so it does not
        # depend on the source encoding or on str.decode being available.
        title_prefix = u"\u05d4\u05e6\u05e2\u05ea "

        def clean(text):
            # Normalisation applied to every extracted piece of a title.
            return text.strip().replace('\n','').replace('&nbsp;',' ')

        pdf_tags = soup.findAll(lambda tag: tag.name == 'a' and tag.has_key('href') and tag['href'].find(".pdf")>=0)
        for tag in pdf_tags:
            pdf_link = self.pdf_url + tag['href']
            booklet_match = re.search(r"/(\d+)/",tag['href'])
            if booklet_match is None:
                # A PDF link without a booklet number in its path cannot be
                # compared against min_booklet; skip it instead of crashing.
                continue
            booklet = booklet_match.group(1)
            if int(booklet) <= self.min_booklet:
                return False
            pdf_data = parse_knesset_bill_pdf.parse(pdf_link)
            for entry in pdf_data: # sometimes there is more than 1 law in a pdf
                title = entry['title']
                parts = list(paren_part.finditer(title))

                comment = clean(parts[-1].group(1)) if parts else None
                correction = clean(parts[-2].group(1)) if len(parts) >= 2 else None

                # Cut the title just before the "(" that opens the earliest
                # part we extracted.  The match position is used rather than
                # searching for the cleaned text, which may no longer occur
                # in the title once whitespace/&nbsp; has been normalised.
                if len(parts) >= 2:
                    law = title[:parts[-2].start(1) - 1]
                elif parts:
                    law = title[:parts[-1].start(1) - 1]
                else:
                    law = title.replace(',','')

                correction = fix_dash(correction)
                law = clean(law)
                if law.startswith(title_prefix):
                    law = law[len(title_prefix):]

                self.laws_data.append({'booklet':booklet,'link':pdf_link, 'law':law, 'correction':correction,
                                       'comment':comment, 'original_ids':entry['original_ids'],'date':entry['date']})
        return True
Example #2
0
 def parse_pdf(self,pdf_url):
     """Parse the PDF at ``pdf_url`` via ``parse_knesset_bill_pdf.parse``.

     The result of that call is returned unchanged.
     """
     parsed = parse_knesset_bill_pdf.parse(pdf_url)
     return parsed
Example #3
0
 def parse_pdf(self,pdf_url):
     """Hand ``pdf_url`` to ``parse_knesset_bill_pdf.parse`` and return its result as-is."""
     result = parse_knesset_bill_pdf.parse(pdf_url)
     return result