def parse_laws_page(self, soup):
    # Collect every link on the page that points to a PDF booklet.
    name_tags = soup.findAll(lambda tag: tag.name == 'a' and
                             tag.has_key('href') and
                             tag['href'].find(".pdf") >= 0)
    for tag in name_tags:
        pdf_link = self.pdf_url + tag['href']
        booklet = re.search(r"/(\d+)/", tag['href']).group(1)
        if int(booklet) <= self.min_booklet:
            # stop once we reach booklets at or below the minimum we care about
            return False
        pdf_data = self.parse_pdf(pdf_link) or []
        for j in range(len(pdf_data)):  # sometimes there is more than 1 law in a pdf
            title = pdf_data[j]['title']
            # Parenthesised parts of the title: the last one is the comment,
            # the one before it is the correction.
            m = re.findall('[^\(\)]*\((.*?)\)[^\(\)]', title)
            try:
                comment = m[-1].strip().replace('\n', '').replace(' ', ' ')
                law = title[:title.find(comment) - 1]
            except IndexError:  # no parenthesised parts at all
                comment = None
                law = title.replace(',', '')
            try:
                correction = m[-2].strip().replace('\n', '').replace(' ', ' ')
                law = title[:title.find(correction) - 1]
            except IndexError:  # only one parenthesised part
                correction = None
            correction = normalize_correction_title_dashes(correction)
            law = law.strip().replace('\n', '').replace(' ', ' ')
            if law.startswith(u"הצעת "):  # strip the "bill of" prefix
                law = law[5:]
            law_data = {'booklet': booklet, 'link': pdf_link, 'law': law,
                        'correction': correction, 'comment': comment,
                        'date': pdf_data[j]['date']}
            if 'original_ids' in pdf_data[j]:
                law_data['original_ids'] = pdf_data[j]['original_ids']
            if 'bill' in pdf_data[j]:
                law_data['bill'] = pdf_data[j]['bill']
            self.laws_data.append(law_data)
    return True
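# A minimal sketch (not part of the parser) of how the parenthesised-group
# regex above decomposes a title: the last group becomes the comment, the one
# before it the correction, and everything before the correction is the law
# name. The helper name and the sample title are hypothetical; it reuses the
# module-level `re` import.
def _demo_title_split():
    title = u'law name (Amendment No. 3) (temporary provision), 2014'
    m = re.findall('[^\(\)]*\((.*?)\)[^\(\)]', title)
    # m == [u'Amendment No. 3', u'temporary provision']
    comment = m[-1]                           # last parenthesised part
    correction = m[-2]                        # the one before it
    law = title[:title.find(correction) - 1]  # everything before the correction
    # the real parser then strips whitespace and the leading u"הצעת " prefix
    return law, correction, comment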
def update_single_bill(self, pdf_link, booklet=None, alt_title=None):
    gp = None
    if booklet is None:
        # get the booklet from an existing bill with the same source url
        gps = GovProposal.objects.filter(source_url=pdf_link)
        if gps.count() < 1:
            logger.error('no existing object with given pdf link and no '
                         'booklet given. pdf_link = %s' % pdf_link)
            return
        gp = gps[0]
        booklet = gp.booklet_number
    pdf_data = self.parse_pdf(pdf_link)
    if pdf_data is None:
        return
    for j in range(len(pdf_data)):  # sometimes there is more than 1 gov bill in a pdf
        if alt_title:
            # just use the given title
            title = alt_title
        else:
            # get the title from the PDF file itself. doesn't work so well
            title = pdf_data[j]['title']
        m = re.findall('[^\(\)]*\((.*?)\)[^\(\)]', title)
        try:
            comment = m[-1].strip().replace('\n', '').replace(' ', ' ')
            law = title[:title.find(comment) - 1]
        except IndexError:  # no parenthesised parts at all
            comment = None
            law = title.replace(',', '')
        try:
            correction = m[-2].strip().replace('\n', '').replace(' ', ' ')
            law = title[:title.find(correction) - 1]
        except IndexError:  # only one parenthesised part
            correction = None
        correction = normalize_correction_title_dashes(correction)
        law = law.strip().replace('\n', '').replace(' ', ' ')
        if law.startswith(u"הצעת "):  # strip the "bill of" prefix
            law = law[5:]
        law_data = {'booklet': booklet, 'link': pdf_link, 'law': law,
                    'correction': correction, 'comment': comment,
                    'date': pdf_data[j]['date']}
        if 'original_ids' in pdf_data[j]:
            law_data['original_ids'] = pdf_data[j]['original_ids']
        if 'bill' in pdf_data[j]:
            law_data['bill'] = pdf_data[j]['bill']
        self.laws_data.append(law_data)
        self.create_or_update_single_bill(
            data=law_data,
            pdf_link=pdf_link,
            link_file=pdf_data[j]['link_file'],
            gp=gp)
def parse_private_laws_page(self, soup):
    # Each proposal is a table row with valign="Top".
    name_tag = soup.findAll(lambda tag: tag.name == 'tr' and
                            tag.has_key('valign') and tag['valign'] == 'Top')
    for tag in name_tag:
        tds = tag.findAll(lambda td: td.name == 'td')
        law_data = {}
        law_data['knesset_id'] = int(tds[0].string.strip())
        law_data['law_id'] = int(tds[1].string.strip())
        if tds[2].findAll('a')[0].has_key('href'):
            law_data['text_link'] = self.rtf_url + r"/" + tds[2].findAll('a')[0]['href']
        law_data['law_full_title'] = tds[3].string.strip()
        # group 1: law name, groups 3/5: parenthesised parts, group 7: year part
        m = re.match(u'הצעת ([^\(,]*)(.*?\((.*?)\))?(.*?\((.*?)\))?(.*?,(.*))?',
                     law_data['law_full_title'])
        if not m:
            logger.warn("can't parse proposal title: %s" %
                        law_data['law_full_title'])
            continue
        law_data['law_name'] = clean_line(m.group(1))
        comment1 = m.group(3)
        comment2 = m.group(5)
        if comment2:
            # two parenthesised parts: the second is the correction,
            # the first is a comment
            law_data['correction'] = clean_line(comment2)
            law_data['comment'] = comment1
        else:
            law_data['comment'] = None
            if comment1:
                law_data['correction'] = clean_line(comment1)
            else:
                law_data['correction'] = None
        law_data['correction'] = normalize_correction_title_dashes(
            law_data['correction'])
        law_data['law_year'] = m.group(7)
        law_data['proposal_date'] = datetime.datetime.strptime(
            tds[4].string.strip(), '%d/%m/%Y').date()
        names_string = ''.join(
            [unicode(y) for y in tds[5].findAll('font')[0].contents])
        names_string = clean_line(names_string)
        proposers = []
        joiners = []
        if re.search('ONMOUSEOUT', names_string):
            # the cell contains an ONMOUSEOUT handler: joiners come before
            # the marker, proposers after it
            splitted_names = names_string.split('ONMOUSEOUT')
            joiners = [name for name in
                       re.match('(.*?)\',\'', splitted_names[0]).group(1).split('<br />')
                       if len(name) > 0]
            proposers = splitted_names[1][10:].split('<br />')
        else:
            proposers = names_string.split('<br />')
        law_data['proposers'] = proposers
        law_data['joiners'] = joiners
        self.laws_data.append(law_data)
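# A minimal sketch (not part of the parser) of how the private-law title regex
# above maps onto the law_data fields when both parenthesised parts are
# present. The helper name and the sample title are hypothetical; it reuses
# the module-level `re` import.
def _demo_private_title_match():
    title = u'הצעת חוק הדוגמה (הערה) (תיקון - סעיף 2), התשע"ד-2014'
    m = re.match(u'הצעת ([^\(,]*)(.*?\((.*?)\))?(.*?\((.*?)\))?(.*?,(.*))?', title)
    return {
        'law_name': m.group(1),    # u'חוק הדוגמה ' (cleaned by clean_line)
        'comment': m.group(3),     # u'הערה' (first parenthesised part)
        'correction': m.group(5),  # u'תיקון - סעיף 2' (second parenthesised part)
        'law_year': m.group(7),    # u' התשע"ד-2014'
    }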
def parse_private_laws_page(self, soup):
    # Each proposal is a table row with valign="Top".
    name_tag = soup.findAll(lambda tag: tag.name == 'tr' and
                            tag.has_key('valign') and tag['valign'] == 'Top')
    for tag in name_tag:
        tds = tag.findAll(lambda td: td.name == 'td')
        law_data = {}
        law_data['knesset_id'] = int(tds[0].string.strip())
        law_data['law_id'] = int(tds[1].string.strip())
        if tds[2].findAll('a')[0].has_key('href'):
            law_data['text_link'] = self.rtf_url + r"/" + tds[2].findAll('a')[0]['href']
        law_data['law_full_title'] = tds[3].string.strip()
        parsed_law_title = laws_parser_utils.parse_title(law_data['law_full_title'])
        if not parsed_law_title:
            logger.warn("can't parse proposal title: %s" %
                        law_data['law_full_title'])
            continue
        law_data['law_name'] = clean_line(parsed_law_title.group(1))
        comment1 = parsed_law_title.group(3)
        comment2 = parsed_law_title.group(5)
        if comment2:
            # two parenthesised parts: the second is the correction,
            # the first is a comment
            law_data['correction'] = clean_line(comment2)
            law_data['comment'] = comment1
        else:
            law_data['comment'] = None
            if comment1:
                law_data['correction'] = clean_line(comment1)
            else:
                law_data['correction'] = None
        law_data['correction'] = normalize_correction_title_dashes(
            law_data['correction'])
        law_data['law_year'] = parsed_law_title.group(7)
        law_data['proposal_date'] = datetime.datetime.strptime(
            tds[4].string.strip(), '%d/%m/%Y').date()
        names_string = ''.join(
            [unicode(y) for y in tds[5].findAll('font')[0].contents])
        names_string = clean_line(names_string)
        proposers = []
        joiners = []
        # Old deprecated way to search for joiners
        if re.search('ONMOUSEOUT', names_string):
            splitted_names = names_string.split('ONMOUSEOUT')
            joiners = [name for name in
                       re.match('(.*?)\',\'', splitted_names[0]).group(1).split('<br />')
                       if len(name) > 0]
            proposers = splitted_names[1][10:].split('<br />')
        else:
            proposers = names_string.split('<br />')
        # Fall back to the dedicated joiners column when the old parsing
        # found none, skipping the column's header strings.
        more_joiners = [name for name in tds[6].findAll(
                            text=lambda text: isinstance(text, NavigableString))
                        if name.strip() not in [u'מצטרפים לחוק:', u'אין מצטרפים לחוק']]
        if len(more_joiners) and not joiners:
            joiners = more_joiners
        law_data['proposers'] = proposers
        law_data['joiners'] = joiners
        self.laws_data.append(law_data)