Example #1
0
    def parse_laws_page(self, soup):
        """Scrape law entries from a laws index page into self.laws_data.

        Finds every anchor pointing at a .pdf, parses each PDF (a single
        PDF may contain several laws) and appends one dict per law.

        :param soup: BeautifulSoup of the index page.
        :return: False as soon as a booklet number at or below
            ``self.min_booklet`` is seen (signals the caller to stop
            paging backwards), True otherwise.
        """
        # has_key() kept deliberately: in BeautifulSoup 3, ``in`` tests tag
        # *contents*, not attributes.
        name_tags = soup.findAll(lambda tag: tag.name == 'a' and tag.has_key('href') and tag['href'].find(".pdf") >= 0)
        for tag in name_tags:
            pdf_link = self.pdf_url + tag['href']
            # The booklet number is embedded in the href path, e.g. ".../123/...".
            # (original used .groups(1)[0]; .group(1) is the intended call)
            booklet = re.search(r"/(\d+)/", tag['href']).group(1)
            if int(booklet) <= self.min_booklet:
                return False
            pdf_data = self.parse_pdf(pdf_link) or []
            for entry in pdf_data:  # sometimes there is more than 1 law in a pdf
                title = entry['title']
                # Parenthesized segments: the last one is the comment, the
                # one before it (if any) is the correction.
                m = re.findall('[^\(\)]*\((.*?)\)[^\(\)]', title)
                try:
                    comment = m[-1].strip().replace('\n', '').replace('&nbsp;', ' ')
                    law = title[:title.find(comment) - 1]
                except IndexError:  # no parenthesized segment in the title
                    comment = None
                    law = title.replace(',', '')
                try:
                    correction = m[-2].strip().replace('\n', '').replace('&nbsp;', ' ')
                    law = title[:title.find(correction) - 1]
                except IndexError:  # fewer than two parenthesized segments
                    correction = None
                correction = normalize_correction_title_dashes(correction)
                law = law.strip().replace('\n', '').replace('&nbsp;', ' ')
                # Strip the leading Hebrew "bill of" prefix, if present.
                if law.find("הצעת ".decode("utf8")) == 0:
                    law = law[5:]

                law_data = {'booklet': booklet, 'link': pdf_link, 'law': law, 'correction': correction,
                            'comment': comment, 'date': entry['date']}
                if 'original_ids' in entry:
                    law_data['original_ids'] = entry['original_ids']
                if 'bill' in entry:
                    law_data['bill'] = entry['bill']
                self.laws_data.append(law_data)
        return True
Example #2
0
    def update_single_bill(self, pdf_link, booklet=None, alt_title=None):
        """Parse one government-bill PDF and create/update its bill objects.

        :param pdf_link: URL of the bill PDF.
        :param booklet: booklet number; when None it is looked up from an
            existing GovProposal with the same source_url (returns early
            with an error log if none exists).
        :param alt_title: optional title to use instead of the one
            extracted from the PDF.
        """
        gp = None
        if booklet is None:
            # get booklet from existing bill
            gps = GovProposal.objects.filter(source_url=pdf_link)
            if gps.count() < 1:
                logger.error('no existing object with given pdf link and no '
                             'booklet given. pdf_link = %s' % pdf_link)
                return
            gp = gps[0]
            booklet = gp.booklet_number
        pdf_data = self.parse_pdf(pdf_link)
        if pdf_data is None:
            return
        # Sometimes there is more than 1 gov bill in a pdf.
        for entry in pdf_data:
            if alt_title:  # just use the given title
                title = alt_title
            else:
                # Get the title from the PDF file itself
                # (doesn't work so well).
                title = entry['title']
            # Parenthesized segments: the last one is the comment, the
            # one before it (if any) is the correction.
            m = re.findall('[^\(\)]*\((.*?)\)[^\(\)]', title)
            try:
                comment = m[-1].strip().replace('\n',
                                                '').replace('&nbsp;', ' ')
                law = title[:title.find(comment) - 1]
            except IndexError:  # no parenthesized segment in the title
                comment = None
                law = title.replace(',', '')
            try:
                correction = m[-2].strip().replace('\n',
                                                   '').replace('&nbsp;', ' ')
                law = title[:title.find(correction) - 1]
            except IndexError:  # fewer than two parenthesized segments
                correction = None
            correction = normalize_correction_title_dashes(correction)
            law = law.strip().replace('\n', '').replace('&nbsp;', ' ')
            # Strip the leading Hebrew "bill of" prefix, if present.
            if law.find("הצעת ".decode("utf8")) == 0:
                law = law[5:]

            law_data = {
                'booklet': booklet,
                'link': pdf_link,
                'law': law,
                'correction': correction,
                'comment': comment,
                'date': entry['date']
            }
            if 'original_ids' in entry:
                law_data['original_ids'] = entry['original_ids']
            if 'bill' in entry:
                law_data['bill'] = entry['bill']
            self.laws_data.append(law_data)
            self.create_or_update_single_bill(
                data=law_data,
                pdf_link=pdf_link,
                link_file=entry['link_file'],
                gp=gp)
Example #3
0
    def update_single_bill(self, pdf_link, booklet=None, alt_title=None):
        """Parse one government-bill PDF and create/update its bill objects.

        :param pdf_link: URL of the bill PDF.
        :param booklet: booklet number; when None it is looked up from an
            existing GovProposal with the same source_url (returns early
            with an error log if none exists).
        :param alt_title: optional title to use instead of the one
            extracted from the PDF.
        """
        gp = None
        if booklet is None:
            # get booklet from existing bill
            gps = GovProposal.objects.filter(source_url=pdf_link)
            if gps.count() < 1:
                logger.error('no existing object with given pdf link and no '
                             'booklet given. pdf_link = %s' % pdf_link)
                return
            gp = gps[0]
            booklet = gp.booklet_number
        pdf_data = self.parse_pdf(pdf_link)
        if pdf_data is None:
            return
        # Sometimes there is more than 1 gov bill in a pdf.
        for entry in pdf_data:
            if alt_title:  # just use the given title
                title = alt_title
            else:
                # Get the title from the PDF file itself
                # (doesn't work so well).
                title = entry['title']
            # Parenthesized segments: the last one is the comment, the
            # one before it (if any) is the correction.
            m = re.findall('[^\(\)]*\((.*?)\)[^\(\)]', title)
            try:
                comment = m[-1].strip().replace('\n', '').replace(
                    '&nbsp;', ' ')
                law = title[:title.find(comment) - 1]
            except IndexError:  # no parenthesized segment in the title
                comment = None
                law = title.replace(',', '')
            try:
                correction = m[-2].strip().replace('\n', '').replace(
                    '&nbsp;', ' ')
                law = title[:title.find(correction) - 1]
            except IndexError:  # fewer than two parenthesized segments
                correction = None
            correction = normalize_correction_title_dashes(correction)
            law = law.strip().replace('\n', '').replace('&nbsp;', ' ')
            # Strip the leading Hebrew "bill of" prefix, if present.
            if law.find("הצעת ".decode("utf8")) == 0:
                law = law[5:]

            law_data = {'booklet': booklet, 'link': pdf_link,
                        'law': law, 'correction': correction,
                        'comment': comment, 'date': entry['date']}
            if 'original_ids' in entry:
                law_data['original_ids'] = entry['original_ids']
            if 'bill' in entry:
                law_data['bill'] = entry['bill']
            self.laws_data.append(law_data)
            self.create_or_update_single_bill(
                data=law_data,
                pdf_link=pdf_link,
                link_file=entry['link_file'],
                gp=gp)
Example #4
0
 def parse_private_laws_page(self, soup):
     """Extract private-law proposals from a table page into self.laws_data.

     Each 'Top'-valigned table row yields one law_data dict with ids,
     title parts (name/comment/correction/year), proposal date,
     proposers and joiners.

     :param soup: BeautifulSoup of the private-laws table page.
     """
     # has_key() kept deliberately: in BeautifulSoup 3, ``in`` tests tag
     # *contents*, not attributes.
     name_tag = soup.findAll(lambda tag: tag.name == 'tr' and tag.has_key(
         'valign') and tag['valign'] == 'Top')
     for tag in name_tag:
         tds = tag.findAll(lambda td: td.name == 'td')
         law_data = {}
         law_data['knesset_id'] = int(tds[0].string.strip())
         law_data['law_id'] = int(tds[1].string.strip())
         if tds[2].findAll('a')[0].has_key('href'):
             law_data['text_link'] = self.rtf_url + r"/" + tds[2].findAll(
                 'a')[0]['href']
         law_data['law_full_title'] = tds[3].string.strip()
         # Title shape: '<Hebrew prefix> <name>(<part1>)?(<part2>)?, <year>?'
         m = re.match(
             u'הצעת ([^\(,]*)(.*?\((.*?)\))?(.*?\((.*?)\))?(.*?,(.*))?',
             law_data['law_full_title'])
         if not m:
             logger.warn("can't parse proposal title: %s" %
                         law_data['law_full_title'])
             continue
         law_data['law_name'] = clean_line(m.group(1))
         comment1 = m.group(3)
         comment2 = m.group(5)
         # With two parenthesized parts the second is the correction and
         # the first the comment; with only one, it is the correction.
         if comment2:
             law_data['correction'] = clean_line(comment2)
             law_data['comment'] = comment1
         else:
             law_data['comment'] = None
             if comment1:
                 law_data['correction'] = clean_line(comment1)
             else:
                 law_data['correction'] = None
         law_data['correction'] = normalize_correction_title_dashes(
             law_data['correction'])
         law_data['law_year'] = m.group(7)
         law_data['proposal_date'] = datetime.datetime.strptime(
             tds[4].string.strip(), '%d/%m/%Y').date()
         names_string = ''.join(
             [unicode(y) for y in tds[5].findAll('font')[0].contents])
         names_string = clean_line(names_string)
         proposers = []
         joiners = []
         # Fix: original compared re.search(...) > 0 — a MatchObject/None
         # vs int comparison. A plain substring test is the intent.
         if 'ONMOUSEOUT' in names_string:
             splitted_names = names_string.split('ONMOUSEOUT')
             joiners = [
                 name for name in re.match('(.*?)\',\'', splitted_names[0]).
                 group(1).split('<br />') if len(name) > 0
             ]
             proposers = splitted_names[1][10:].split('<br />')
         else:
             proposers = names_string.split('<br />')
         law_data['proposers'] = proposers
         law_data['joiners'] = joiners
         self.laws_data.append(law_data)
Example #5
0
    def parse_private_laws_page(self, soup):
        """Extract private-law proposals from a table page into self.laws_data.

        Each 'Top'-valigned table row yields one law_data dict with ids,
        title parts (name/comment/correction/year), proposal date,
        proposers and joiners (joiners come from the names cell or, when
        absent there, from the dedicated joiners cell).

        :param soup: BeautifulSoup of the private-laws table page.
        """
        # has_key() kept deliberately: in BeautifulSoup 3, ``in`` tests tag
        # *contents*, not attributes.
        name_tag = soup.findAll(lambda tag: tag.name == 'tr' and tag.has_key('valign') and tag['valign'] == 'Top')
        for tag in name_tag:
            tds = tag.findAll(lambda td: td.name == 'td')
            law_data = {}
            law_data['knesset_id'] = int(tds[0].string.strip())
            law_data['law_id'] = int(tds[1].string.strip())
            if tds[2].findAll('a')[0].has_key('href'):
                law_data['text_link'] = self.rtf_url + r"/" + tds[2].findAll('a')[0]['href']
            law_data['law_full_title'] = tds[3].string.strip()
            parsed_law_title = laws_parser_utils.parse_title(law_data['law_full_title'])
            if not parsed_law_title:
                logger.warn("can't parse proposal title: %s" % law_data['law_full_title'])
                continue
            law_data['law_name'] = clean_line(parsed_law_title.group(1))
            comment1 = parsed_law_title.group(3)
            comment2 = parsed_law_title.group(5)
            # With two parenthesized parts the second is the correction and
            # the first the comment; with only one, it is the correction.
            if comment2:
                law_data['correction'] = clean_line(comment2)
                law_data['comment'] = comment1
            else:
                law_data['comment'] = None
                if comment1:
                    law_data['correction'] = clean_line(comment1)
                else:
                    law_data['correction'] = None
            law_data['correction'] = normalize_correction_title_dashes(law_data['correction'])
            law_data['law_year'] = parsed_law_title.group(7)
            law_data['proposal_date'] = datetime.datetime.strptime(tds[4].string.strip(), '%d/%m/%Y').date()
            names_string = ''.join([unicode(y) for y in tds[5].findAll('font')[0].contents])
            names_string = clean_line(names_string)
            proposers = []
            joiners = []
            # Old deprecated way to search for joiners.
            # Fix: original compared re.search(...) > 0 — a MatchObject/None
            # vs int comparison. A plain substring test is the intent.
            if 'ONMOUSEOUT' in names_string:
                splitted_names = names_string.split('ONMOUSEOUT')
                joiners = [name for name in re.match('(.*?)\',\'', splitted_names[0]).group(1).split('<br />') if
                           len(name) > 0]
                proposers = splitted_names[1][10:].split('<br />')
            else:
                proposers = names_string.split('<br />')

            # Joiners cell: keep every text node that is not one of the
            # "joiners:" / "no joiners" header strings.
            more_joiners = [name for name in tds[6].findAll(text=lambda text: isinstance(text, NavigableString)) if
                            name.strip() not in [u'מצטרפים לחוק:', u'אין מצטרפים לחוק']]
            if len(more_joiners) and not joiners:
                joiners = more_joiners
            law_data['proposers'] = proposers
            law_data['joiners'] = joiners
            self.laws_data.append(law_data)
Example #6
0
    def parse_laws_page(self, soup):
        """Scrape law entries from a laws index page into self.laws_data.

        Finds every anchor pointing at a .pdf, parses each PDF (a single
        PDF may contain several laws) and appends one dict per law.

        :param soup: BeautifulSoup of the index page.
        :return: False as soon as a booklet number at or below
            ``self.min_booklet`` is seen (signals the caller to stop
            paging backwards), True otherwise.
        """
        # has_key() kept deliberately: in BeautifulSoup 3, ``in`` tests tag
        # *contents*, not attributes.
        name_tags = soup.findAll(lambda tag: tag.name == 'a' and tag.has_key(
            'href') and tag['href'].find(".pdf") >= 0)
        for tag in name_tags:
            pdf_link = self.pdf_url + tag['href']
            # The booklet number is embedded in the href path, e.g. ".../123/...".
            # (original used .groups(1)[0]; .group(1) is the intended call)
            booklet = re.search(r"/(\d+)/", tag['href']).group(1)
            if int(booklet) <= self.min_booklet:
                return False
            pdf_data = self.parse_pdf(pdf_link) or []
            # Sometimes there is more than 1 law in a pdf.
            for entry in pdf_data:
                title = entry['title']
                # Parenthesized segments: the last one is the comment, the
                # one before it (if any) is the correction.
                m = re.findall('[^\(\)]*\((.*?)\)[^\(\)]', title)
                try:
                    comment = m[-1].strip().replace('\n',
                                                    '').replace('&nbsp;', ' ')
                    law = title[:title.find(comment) - 1]
                except IndexError:  # no parenthesized segment in the title
                    comment = None
                    law = title.replace(',', '')
                try:
                    correction = m[-2].strip().replace('\n', '').replace(
                        '&nbsp;', ' ')
                    law = title[:title.find(correction) - 1]
                except IndexError:  # fewer than two parenthesized segments
                    correction = None
                correction = normalize_correction_title_dashes(correction)
                law = law.strip().replace('\n', '').replace('&nbsp;', ' ')
                # Strip the leading Hebrew "bill of" prefix, if present.
                if law.find("הצעת ".decode("utf8")) == 0:
                    law = law[5:]

                law_data = {
                    'booklet': booklet,
                    'link': pdf_link,
                    'law': law,
                    'correction': correction,
                    'comment': comment,
                    'date': entry['date']
                }
                if 'original_ids' in entry:
                    law_data['original_ids'] = entry['original_ids']
                if 'bill' in entry:
                    law_data['bill'] = entry['bill']
                self.laws_data.append(law_data)
        return True
Example #7
0
 def parse_private_laws_page(self, soup):
     """Extract private-law proposals from a table page into self.laws_data.

     Each 'Top'-valigned table row yields one law_data dict with ids,
     title parts (name/comment/correction/year), proposal date,
     proposers and joiners.

     :param soup: BeautifulSoup of the private-laws table page.
     """
     # has_key() kept deliberately: in BeautifulSoup 3, ``in`` tests tag
     # *contents*, not attributes.
     name_tag = soup.findAll(lambda tag: tag.name == 'tr' and tag.has_key('valign') and tag['valign'] == 'Top')
     for tag in name_tag:
         tds = tag.findAll(lambda td: td.name == 'td')
         law_data = {}
         law_data['knesset_id'] = int(tds[0].string.strip())
         law_data['law_id'] = int(tds[1].string.strip())
         if tds[2].findAll('a')[0].has_key('href'):
             law_data['text_link'] = self.rtf_url + r"/" + tds[2].findAll('a')[0]['href']
         law_data['law_full_title'] = tds[3].string.strip()
         # Title shape: '<Hebrew prefix> <name>(<part1>)?(<part2>)?, <year>?'
         m = re.match(u'הצעת ([^\(,]*)(.*?\((.*?)\))?(.*?\((.*?)\))?(.*?,(.*))?', law_data['law_full_title'])
         if not m:
             logger.warn("can't parse proposal title: %s" % law_data['law_full_title'])
             continue
         law_data['law_name'] = clean_line(m.group(1))
         comment1 = m.group(3)
         comment2 = m.group(5)
         # With two parenthesized parts the second is the correction and
         # the first the comment; with only one, it is the correction.
         if comment2:
             law_data['correction'] = clean_line(comment2)
             law_data['comment'] = comment1
         else:
             law_data['comment'] = None
             if comment1:
                 law_data['correction'] = clean_line(comment1)
             else:
                 law_data['correction'] = None
         law_data['correction'] = normalize_correction_title_dashes(law_data['correction'])
         law_data['law_year'] = m.group(7)
         law_data['proposal_date'] = datetime.datetime.strptime(tds[4].string.strip(), '%d/%m/%Y').date()
         names_string = ''.join([unicode(y) for y in tds[5].findAll('font')[0].contents])
         names_string = clean_line(names_string)
         proposers = []
         joiners = []
         # Fix: original compared re.search(...) > 0 — a MatchObject/None
         # vs int comparison. A plain substring test is the intent.
         if 'ONMOUSEOUT' in names_string:
             splitted_names = names_string.split('ONMOUSEOUT')
             joiners = [name for name in re.match('(.*?)\',\'', splitted_names[0]).group(1).split('<br />') if
                        len(name) > 0]
             proposers = splitted_names[1][10:].split('<br />')
         else:
             proposers = names_string.split('<br />')
         law_data['proposers'] = proposers
         law_data['joiners'] = joiners
         self.laws_data.append(law_data)
Example #8
0
    def parse_private_laws_page(self, soup):
        """Extract private-law proposals from a table page into self.laws_data.

        Each 'Top'-valigned table row yields one law_data dict with ids,
        title parts (name/comment/correction/year), proposal date,
        proposers and joiners (joiners come from the names cell or, when
        absent there, from the dedicated joiners cell).

        :param soup: BeautifulSoup of the private-laws table page.
        """
        # has_key() kept deliberately: in BeautifulSoup 3, ``in`` tests tag
        # *contents*, not attributes.
        name_tag = soup.findAll(lambda tag: tag.name == 'tr' and tag.has_key(
            'valign') and tag['valign'] == 'Top')
        for tag in name_tag:
            tds = tag.findAll(lambda td: td.name == 'td')
            law_data = {}
            law_data['knesset_id'] = int(tds[0].string.strip())
            law_data['law_id'] = int(tds[1].string.strip())
            if tds[2].findAll('a')[0].has_key('href'):
                law_data['text_link'] = self.rtf_url + r"/" + tds[2].findAll(
                    'a')[0]['href']
            law_data['law_full_title'] = tds[3].string.strip()
            parsed_law_title = laws_parser_utils.parse_title(
                law_data['law_full_title'])
            if not parsed_law_title:
                logger.warn("can't parse proposal title: %s" %
                            law_data['law_full_title'])
                continue
            law_data['law_name'] = clean_line(parsed_law_title.group(1))
            comment1 = parsed_law_title.group(3)
            comment2 = parsed_law_title.group(5)
            # With two parenthesized parts the second is the correction and
            # the first the comment; with only one, it is the correction.
            if comment2:
                law_data['correction'] = clean_line(comment2)
                law_data['comment'] = comment1
            else:
                law_data['comment'] = None
                if comment1:
                    law_data['correction'] = clean_line(comment1)
                else:
                    law_data['correction'] = None
            law_data['correction'] = normalize_correction_title_dashes(
                law_data['correction'])
            law_data['law_year'] = parsed_law_title.group(7)
            law_data['proposal_date'] = datetime.datetime.strptime(
                tds[4].string.strip(), '%d/%m/%Y').date()
            names_string = ''.join(
                [unicode(y) for y in tds[5].findAll('font')[0].contents])
            names_string = clean_line(names_string)
            proposers = []
            joiners = []
            # Old deprecated way to search for joiners.
            # Fix: original compared re.search(...) > 0 — a MatchObject/None
            # vs int comparison. A plain substring test is the intent.
            if 'ONMOUSEOUT' in names_string:
                splitted_names = names_string.split('ONMOUSEOUT')
                joiners = [
                    name for name in re.match('(.*?)\',\'', splitted_names[0]).
                    group(1).split('<br />') if len(name) > 0
                ]
                proposers = splitted_names[1][10:].split('<br />')
            else:
                proposers = names_string.split('<br />')

            # Joiners cell: keep every text node that is not one of the
            # "joiners:" / "no joiners" header strings.
            more_joiners = [
                name for name in tds[6].findAll(
                    text=lambda text: isinstance(text, NavigableString))
                if name.strip() not in [u'מצטרפים לחוק:', u'אין מצטרפים לחוק']
            ]
            if len(more_joiners) and not joiners:
                joiners = more_joiners
            law_data['proposers'] = proposers
            law_data['joiners'] = joiners
            self.laws_data.append(law_data)