def parse_knesset_laws_page(self, soup):
    """Collect law-proposal records from one parsed laws index page.

    Every ``<a>`` tag in ``soup`` whose ``href`` contains ".pdf" is treated
    as a booklet link.  The booklet number is the first ``/<digits>/``
    segment of the href.  Each linked PDF is handed to
    ``parse_knesset_bill_pdf.parse`` and one dict per returned entry is
    appended to ``self.laws_data`` with the keys ``booklet``, ``link``,
    ``law``, ``correction``, ``comment``, ``original_ids`` and ``date``.

    The entry title is split on its parenthesised groups: the last group is
    the comment, the one before it (if any) is the correction, and the text
    preceding the earliest of the two is the law name.

    Returns:
        False as soon as a link with a booklet number <= ``self.min_booklet``
        is met (records appended before that point are kept), True once
        every link on the page has been processed.
    """
    def _tidy(text):
        # Normalise text extracted from the PDF: trim the ends, drop line
        # breaks and squeeze doubled spaces.
        return text.strip().replace('\n', '').replace('  ', ' ')

    pdf_anchors = soup.findAll(
        lambda tag: tag.name == 'a'
        and tag.has_key('href')
        and tag['href'].find(".pdf") >= 0)

    for tag in pdf_anchors:
        href = tag['href']
        pdf_link = self.pdf_url + href

        booklet_match = re.search(r"/(\d+)/", href)
        if booklet_match is None:
            # A PDF link without a booklet number cannot be compared with
            # min_booklet; skip it rather than abort the whole page.
            continue
        booklet = booklet_match.group(1)
        if int(booklet) <= self.min_booklet:
            return False

        # sometimes there is more than one law in a pdf
        for entry in parse_knesset_bill_pdf.parse(pdf_link):
            title = entry['title']
            paren_groups = list(
                re.finditer(r'[^\(\)]*\((.*?)\)[^\(\)]', title))

            if paren_groups:
                last = paren_groups[-1]
                comment = _tidy(last.group(1))
                # start(1) - 1 is the index of the opening parenthesis, so
                # the law name is everything before it.  Using the match
                # offset (not a search for the cleaned text) stays correct
                # when the text was altered by _tidy or repeats in the title.
                law = title[:last.start(1) - 1]
            else:
                comment = None
                law = title.replace(',', '')

            if len(paren_groups) >= 2:
                before_last = paren_groups[-2]
                correction = _tidy(before_last.group(1))
                law = title[:before_last.start(1) - 1]
            else:
                correction = None
            # fix_dash is defined elsewhere; it also receives None when the
            # title carries no correction.
            correction = fix_dash(correction)

            law = _tidy(law)
            # Strip the leading Hebrew word (and its trailing space) when
            # the law name starts with it.
            prefix = "הצעת ".decode("utf8")
            if law.startswith(prefix):
                law = law[len(prefix):]

            self.laws_data.append({
                'booklet': booklet,
                'link': pdf_link,
                'law': law,
                'correction': correction,
                'comment': comment,
                'original_ids': entry['original_ids'],
                'date': entry['date'],
            })
    return True
def parse_pdf(self, pdf_url):
    """Parse the bill PDF at ``pdf_url`` and return the parser's result.

    Delegates to ``parse_knesset_bill_pdf.parse`` and does no processing of
    its own.
    """
    parsed = parse_knesset_bill_pdf.parse(pdf_url)
    return parsed