Exemple #1
0
        def xt_admin_date(cls, raw_person):
            """Extract the administration period from a raw person row.

            The datestring is read from the ``title`` attribute of the span in
            the row's first cell. Everything after the first ``;`` or ``,`` is
            dropped; what remains is either one ``DD.MM.YYYY`` date or two such
            dates joined by ``" - "``.

            Returns a ``(start_date, end_date)`` tuple of ``datetime.date``
            values. ``end_date`` is ``None`` when only one date is present.
            Both are ``None`` (and an error is logged) when no datestring is
            found or it cannot be parsed.
            """
            date_format = "%d.%m.%Y"
            titles = Selector(text=raw_person).xpath(
                '//td[1]/span/@title').extract()
            if not titles:
                logger.error("Couldn't find an administration datestring")
                return (None, None)

            # Keep only the first entry of a ';'- or ','-separated list.
            admin_datestring = titles[0].split(';')[0].split(',')[0]

            try:
                if " - " in admin_datestring:
                    parts = admin_datestring.split(' - ')
                    start_date = datetime.datetime.strptime(
                        _clean(parts[0]), date_format).date()
                    end_date = datetime.datetime.strptime(
                        _clean(parts[1]), date_format).date()
                else:
                    start_date = datetime.datetime.strptime(
                        _clean(admin_datestring), date_format).date()
                    end_date = None
            except (ValueError, TypeError):
                # Never hand back a half-parsed pair: a failure on either
                # date invalidates the whole period.
                logger.error(
                    "Couldn't extract date from datestring {}".format(
                        admin_datestring))
                return (None, None)

            return (start_date, end_date)
Exemple #2
0
        def xt(cls, response):
            """Extract a list of entries from the rows matched by ``cls.XPATH``.

            The first matched row is skipped. For every other row a dict with
            the keys ``date``, ``url``, ``email``, ``title`` and ``parl_id`` is
            built:

            * ``date``: first cell parsed as ``DD.MM.YYYY``, or ``None`` when
              the cell is empty or not a valid date.
            * ``url`` / ``parl_id``: href and text of the link in the second
              cell (the text is wrapped in parentheses).
            * ``title``: cleaned text of the third cell with ``*`` replaced
              by ``", "``; when the third cell holds a link, the link text is
              used instead.
            * ``email``: the link target of the third cell with ``mailto:``
              removed, or ``None`` when there is no link.
            """
            ops = []
            raw_ops = response.xpath(cls.XPATH).extract()
            for raw_op in raw_ops[1:]:
                op_sel = Selector(text=raw_op)

                raw_date = op_sel.xpath("//td[1]/text()").extract()

                url = op_sel.xpath("//td[2]/a/@href").extract()[0]
                parl_id = u"({})".format(op_sel.xpath("//td[2]/a/text()").extract()[0])

                # The third cell may contain nothing but a link, in which
                # case there is no direct text node to index into.
                raw_title = op_sel.xpath("//td[3]/text()").extract()
                if raw_title and raw_title[0]:
                    title = _clean(raw_title[0]).replace("*", ", ")
                else:
                    title = None

                raw_email = op_sel.xpath("//td[3]/a/@href").extract()
                if raw_email:
                    email = raw_email[0].replace("mailto:", "")
                    title = op_sel.xpath("//td[3]/a/text()").extract()[0]
                else:
                    email = None

                try:
                    date = datetime.datetime.strptime(_clean(raw_date[0]), "%d.%m.%Y").date()
                except (IndexError, ValueError, TypeError):
                    # Missing or malformed date cell: keep the row, drop the date.
                    date = None

                ops.append({"date": date, "url": url, "email": email, "title": title, "parl_id": parl_id})

            return ops
Exemple #3
0
        def xt(cls, response):
            """Extract a list of entries from the rows matched by ``cls.XPATH``.

            The first matched row is skipped. Each remaining row yields a dict
            with the keys ``date``, ``url``, ``email``, ``title`` and
            ``parl_id``:

            * ``date``: whitespace-normalized first cell parsed as
              ``DD.MM.YYYY``, or ``None`` when it is not a valid date.
            * ``url``: href of the link in the second cell.
            * ``parl_id``: whitespace-normalized link text of the third cell,
              wrapped in parentheses.
            * ``title``: cleaned text of the second cell with ``*`` replaced
              by ``", "``, or ``None`` when the cell is empty.
            * ``email``: always ``None`` for this table layout.
            """
            ops = []
            raw_ops = response.xpath(cls.XPATH).extract()
            for raw_op in raw_ops[1:]:
                op_sel = Selector(text=raw_op)

                raw_date = op_sel.xpath('//td[1]').xpath(
                    "normalize-space()").extract()[0]

                url = op_sel.xpath('//td[2]/a/@href').extract()[0]
                parl_id = u"({})".format(
                    op_sel.xpath('//td[3]/a').xpath('normalize-space()').extract()[0])

                title = op_sel.xpath('//td[2]').xpath('normalize-space()').extract()[0]
                if title:
                    title = _clean(title).replace("*", ", ")
                else:
                    title = None

                try:
                    date = datetime.datetime.strptime(
                        _clean(raw_date), "%d.%m.%Y").date()
                except (ValueError, TypeError):
                    # Only parsing failures are tolerated; interrupts and
                    # unrelated errors must propagate.
                    date = None

                ops.append({
                    'date': date,
                    'url': url,
                    'email': None,
                    'title': title,
                    'parl_id': parl_id
                })

            return ops
Exemple #4
0
        def xt(cls, response):
            """Extract signature entries from the rows matched by ``cls.XPATH``.

            The direct text nodes of a row's cells are read positionally as
            full name, postal code, location and date (``DD.MM.YYYY``). Rows
            without any cell text are skipped. Missing postal code or location
            default to an empty string; a missing date defaults to
            ``datetime.date.fromtimestamp(0)``.

            Returns a list of dicts with the keys ``full_name``,
            ``postal_code``, ``location`` and ``date``.

            Raises ``ValueError`` when a date cell is present but malformed.
            """
            raw_signatures = response.xpath(cls.XPATH).extract()

            signatures = []
            for raw_signature in raw_signatures:
                cells = Selector(text=raw_signature).xpath('//td/text()').extract()
                if not cells:
                    continue

                full_name = _clean(cells[0])
                postal_code = _clean(cells[1]) if len(cells) > 1 else u''
                location = _clean(cells[2]) if len(cells) > 2 else u''

                if len(cells) > 3:
                    # Parse straight to a date. Going through a local-time
                    # timestamp depends on the timezone and can fail for
                    # dates the platform cannot express as a timestamp.
                    date = datetime.datetime.strptime(
                        _clean(cells[3]), '%d.%m.%Y').date()
                else:
                    date = datetime.date.fromtimestamp(0)

                signatures.append({
                    'full_name': full_name,
                    'postal_code': postal_code,
                    'location': location,
                    'date': date
                })

            return signatures
Exemple #5
0
                def xt(cls, step_selector):
                    """Extract the title of a step, including debate statements.

                    When a table is found, every table row is turned into a
                    statement dict (``index``, ``person_source_link``,
                    ``person_name``, ``statement_type``, ``protocol_link``,
                    ``protocol_text``) and the result is
                    ``{'text': u'Wortmeldungen in der Debatte', 'statements': [...]}``.
                    Rows that fail to extract are logged and skipped.

                    Otherwise the result is ``{'text': ...}`` holding the cleaned
                    cell markup matched by ``cls.XPATH`` with ``BASE_HOST``
                    prepended to every ``<a href="`` target.
                    """
                    title_selector = step_selector.xpath('//td[2]')[0]

                    # NOTE(review): '//table' is an absolute XPath, so it is
                    # evaluated against the whole document rather than only
                    # the second cell - confirm this is intended.
                    tables = title_selector.xpath('//table')
                    if not tables:
                        text = _clean(
                            remove_tags(
                                step_selector.xpath(cls.XPATH).extract()[0],
                                'td')).replace('<a href="',
                                               '<a href="{}'.format(BASE_HOST))
                        return {'text': text}

                    # we have wortmeldungen!
                    raw_rows = [
                        Selector(text=raw_row) for raw_row in
                        tables[0].xpath('//tbody//tr').extract()
                    ]
                    statements = []
                    for index, row_selector in enumerate(raw_rows):
                        try:
                            person_source_link = row_selector.xpath(
                                cls.XP_P_LINK).extract()[0]
                            person_name = row_selector.xpath(
                                cls.XP_P_NAME).extract()
                            statement_type = _clean(
                                row_selector.xpath(
                                    cls.XP_T_TYPE).extract()[0])
                            protocol_link = row_selector.xpath(
                                cls.XP_PROT_LINK).extract()
                            protocol_text = _clean(
                                remove_tags(
                                    row_selector.xpath(
                                        cls.XP_PROT_TEXT).extract()[0],
                                    'td a'))
                        except Exception:
                            # Best effort per row, but do not swallow
                            # KeyboardInterrupt/SystemExit like a bare except.
                            logger.error(
                                "Skipping statement '{}' due to extraction error"
                                .format(row_selector.extract()))
                            continue
                        statements.append({
                            'index': index,
                            'person_source_link': person_source_link,
                            'person_name': person_name,
                            'statement_type': statement_type,
                            'protocol_link': protocol_link,
                            'protocol_text': protocol_text,
                        })

                    return {
                        'text': u'Wortmeldungen in der Debatte',
                        'statements': statements
                    }
Exemple #6
0
            def xt(cls, response):
                """Extract biographical data from the markup matched by ``cls.XPATH``.

                The first match is split on ``<br>`` and each fragment is
                searched for an ``<em>`` label containing ``Geb.`` (birth),
                ``Verst.`` (death) or ``Beruf`` (occupation). The text next to
                a label is split on commas: the first part is the date (or the
                occupation), the second part, if any, the place.

                Returns a dict with the keys ``birthdate``, ``birthplace``,
                ``deathdate``, ``deathplace`` and ``occupation``. Dates are
                ``datetime.date`` values, or ``None`` when missing or not in
                ``DD.MM.YYYY`` format (a parse failure is logged). When
                nothing matches ``cls.XPATH`` the defaults are returned.
                """
                bio = {
                    'birthdate': None,
                    'birthplace': '',
                    'deathdate': None,
                    'deathplace': '',
                    'occupation': ''
                }
                bio_data = response.xpath(cls.XPATH).extract()
                if not bio_data:
                    return bio

                label_xpath = "//em[contains(text(),'{}')]/parent::*/text()"

                def parse_date(raw, field):
                    # Return the parsed date, or log and return None.
                    try:
                        return datetime.datetime.strptime(
                            raw, "%d.%m.%Y").date()
                    except ValueError:
                        logger.error(
                            "Failed to parse {}: {}".format(field, raw))
                        return None

                for data in bio_data[0].split('<br>'):
                    # One parse per fragment serves all three lookups.
                    fragment = Selector(text=data)

                    # Birth Data
                    birth = fragment.xpath(label_xpath.format('Geb.')).extract()
                    if birth:
                        parts = birth[0].split(',')
                        bio['birthdate'] = parse_date(
                            _clean(parts[0]), 'birthdate')
                        if len(parts) > 1:
                            bio['birthplace'] = parts[1]

                    # Death Data
                    death = fragment.xpath(label_xpath.format('Verst.')).extract()
                    if death:
                        parts = death[0].split(',')
                        bio['deathdate'] = parse_date(
                            _clean(parts[0]), 'deathdate')
                        if len(parts) > 1:
                            bio['deathplace'] = parts[1]

                    # Occupation
                    occupation = fragment.xpath(
                        label_xpath.format('Beruf')).extract()
                    if occupation:
                        bio['occupation'] = occupation[0].split(',')[0]
                return bio
Exemple #7
0
                def xt(cls, step_selector):
                    """Build the title dict of a step.

                    Without a table the result is ``{'text': ...}``: the
                    cleaned markup matched by ``cls.XPATH`` with ``BASE_HOST``
                    prepended to each ``<a href="`` target.

                    With a table, each row that has both a person link and a
                    statement type becomes a statement dict; other rows are
                    skipped. A row without protocol text gets an empty list as
                    ``protocol_text``. The result then carries the fixed text
                    ``u'Wortmeldungen in der Debatte'`` plus the statements.
                    """
                    title_selector = step_selector.xpath('//td[2]')[0]

                    if not title_selector.xpath('//table'):
                        raw_cell = step_selector.xpath(cls.XPATH).extract()[0]
                        prefixed_href = '<a href="{}'.format(BASE_HOST)
                        text = _clean(remove_tags(raw_cell, 'td')).replace(
                            '<a href="', prefixed_href)
                        return {'text': text}

                    # we have wortmeldungen!
                    table_selector = title_selector.xpath('//table')[0]
                    row_selectors = [
                        Selector(text=markup)
                        for markup in table_selector.xpath('//tbody//tr').extract()]

                    statements = []
                    for position, row in enumerate(row_selectors):
                        person_links = row.xpath(cls.XP_P_LINK).extract()
                        if not person_links:
                            continue

                        person_name = row.xpath(cls.XP_P_NAME).extract()

                        type_hits = row.xpath(cls.XP_T_TYPE).extract()
                        if not type_hits:
                            continue

                        protocol_link = row.xpath(cls.XP_PROT_LINK).extract()

                        text_hits = row.xpath(cls.XP_PROT_TEXT).extract()
                        if text_hits:
                            protocol_text = _clean(
                                remove_tags(text_hits[0], 'td a'))
                        else:
                            protocol_text = []

                        statements.append({
                            'index': position,
                            'person_source_link': person_links[0],
                            'person_name': person_name,
                            'statement_type': _clean(type_hits[0]),
                            'protocol_link': protocol_link,
                            'protocol_text': protocol_text,
                        })

                    return {
                        'text': u'Wortmeldungen in der Debatte',
                        'statements': statements
                    }
Exemple #8
0
            def xt(cls, response):
                """Extract biographical data from the markup matched by ``cls.XPATH``.

                The first match is split on ``<br>`` and each fragment is
                searched for an ``<em>`` label containing ``Geb.`` (birth),
                ``Verst.`` (death) or ``Beruf`` (occupation). The text next to
                a label is split on commas: the first part is the date (or the
                occupation), the second part, if any, the place.

                Returns a dict with the keys ``birthdate``, ``birthplace``,
                ``deathdate``, ``deathplace`` and ``occupation``. Dates are
                ``datetime.date`` values, or ``None`` when missing or not in
                ``DD.MM.YYYY`` format (a parse failure is logged). When
                nothing matches ``cls.XPATH`` the defaults are returned.
                """
                bio = {
                    'birthdate': None,
                    'birthplace': '',
                    'deathdate': None,
                    'deathplace': '',
                    'occupation': ''
                }
                bio_data = response.xpath(cls.XPATH).extract()
                if not bio_data:
                    return bio

                label_xpath = "//em[contains(text(),'{}')]/parent::*/text()"

                def parse_date(raw, field):
                    # Return the parsed date, or log and return None.
                    try:
                        return datetime.datetime.strptime(
                            raw, "%d.%m.%Y").date()
                    except ValueError:
                        logger.error(
                            "Failed to parse {}: {}".format(field, raw))
                        return None

                for data in bio_data[0].split('<br>'):
                    # One parse per fragment serves all three lookups.
                    fragment = Selector(text=data)

                    # Birth Data
                    birth = fragment.xpath(label_xpath.format('Geb.')).extract()
                    if birth:
                        parts = birth[0].split(',')
                        bio['birthdate'] = parse_date(
                            _clean(parts[0]), 'birthdate')
                        if len(parts) > 1:
                            bio['birthplace'] = parts[1]

                    # Death Data
                    death = fragment.xpath(label_xpath.format('Verst.')).extract()
                    if death:
                        parts = death[0].split(',')
                        bio['deathdate'] = parse_date(
                            _clean(parts[0]), 'deathdate')
                        if len(parts) > 1:
                            bio['deathplace'] = parts[1]

                    # Occupation
                    occupation = fragment.xpath(
                        label_xpath.format('Beruf')).extract()
                    if occupation:
                        bio['occupation'] = occupation[0].split(',')[0]
                return bio
Exemple #9
0
        def xt(cls, response):
            """Extract persons and their mandate from rows matched by ``cls.XPATH``.

            Rows containing a ``<th>`` are skipped. For each remaining row the
            result holds a dict with:

            * ``source_link``: href of the first link inside a cell.
            * ``reversed_name``: cleaned link text (image tags removed
              first); for ``"A siehe B"`` entries only ``B`` is kept.
            * ``mandate``: ``short`` and ``title`` of the function found in
              the third cell (empty strings when the cell has neither a span
              nor text), plus ``administration`` with the first cell's span
              texts as ``title`` and the dates from ``cls.xt_admin_date``.
            """
            persons = []
            raw_persons = response.xpath(cls.XPATH).extract()
            for raw_person in raw_persons:
                person = Selector(text=raw_person)
                if person.xpath('//th'):
                    continue
                source_link = person.xpath(
                    '//td//a/@href').extract()[0]
                reversed_name = _clean(
                    Selector(
                        text=remove_tags(raw_person, 'img')
                    ).xpath('//td//a/text()').extract()[0])
                if ' siehe ' in reversed_name:
                    reversed_name = reversed_name.split(' siehe ')[1]
                admin_title = person.xpath(
                    '//td[1]/span/text()').extract()

                (admin_start_date, admin_end_date) = cls.xt_admin_date(
                    raw_person)

                administration = {
                    'title': admin_title,
                    'start_date': admin_start_date,
                    'end_date': admin_end_date
                }
                # TODO EXTRACT DATE(S) FROM BUNDESMINISTERIUM td
                # TODO ADD EITHER DATE(S) TO FUNCTION

                # Reset per row so a row without function data can neither
                # fail on an unbound name nor inherit the previous row's values.
                function_short = ''
                function_title = ''
                if person.xpath('//tr//td[3]/span/text()'):
                    span_texts = person.xpath(
                        '//td[3]/span/text()').extract()
                    span_titles = person.xpath(
                        '//td[3]/span/@title').extract()
                    if span_texts:
                        function_short = span_texts[0]
                    if span_titles:
                        function_title = span_titles[0]
                elif person.xpath('//tr//td[3]/text()'):
                    cell_texts = person.xpath('//td[3]/text()').extract()
                    if cell_texts:
                        function_short = _clean(cell_texts[0])

                mandate = {
                    'short': function_short,
                    'title': function_title,
                    'administration': administration}

                persons.append({
                    'source_link': source_link,
                    'reversed_name': reversed_name,
                    'mandate': mandate,
                })

            return persons
Exemple #10
0
            def xt(cls, response):
                """Extract mandates from the list items matched by ``cls.XPATH``.

                Each item is stripped of its ``li`` tags (and reduced to the
                content of its first ``div``, if it has one). The first
                ``<br>``-separated line is read as ``function, party``; the
                second as ``start`` and optional ``end`` date separated by an
                en dash, both in ``DD.MM.YYYY`` format.

                Returns a list of dicts with the keys ``function``, ``party``,
                ``start_date`` and ``end_date``. A missing party becomes an
                empty string; a missing or unparseable date becomes ``None``
                (unparseable values are logged).
                """
                date_format = "%d.%m.%Y"
                mandates_raw = response.xpath(cls.XPATH).extract()
                mandates = []
                for mandate in mandates_raw:
                    mandate = _clean(remove_tags(mandate, 'li'))

                    if "<div" in mandate and "</div>" in mandate:
                        mandate = _clean(
                            remove_tags(
                                Selector(
                                    text=mandate).xpath("//div").extract()[0],
                                'div'))

                    lines = mandate.split(u'<br>')
                    head = lines[0].split(',')
                    function = head[0]
                    party = _clean(head[1]) if len(head) > 1 else u''

                    # Split the raw values up front so the error handlers
                    # below only ever see names bound for this item.
                    period = lines[1].split(u'\u2013') if len(lines) > 1 else []
                    raw_start = _clean(period[0]) if period else u''
                    if len(period) > 1 and period[1]:
                        raw_end = _clean(period[1])
                    else:
                        raw_end = None

                    # Start Date
                    try:
                        start_date = datetime.datetime.strptime(
                            raw_start, date_format).date()
                    except (ValueError, TypeError):
                        logger.error(
                            u"Failed to parse mandate start date: {}".format(
                                raw_start))
                        start_date = None

                    # End Date
                    end_date = None
                    if raw_end is not None:
                        try:
                            end_date = datetime.datetime.strptime(
                                raw_end, date_format).date()
                        except (ValueError, TypeError):
                            logger.error(
                                u"Failed to parse mandate end date: {}".format(
                                    raw_end))

                    mandates.append({
                        'function': function,
                        'party': party,
                        'start_date': start_date,
                        'end_date': end_date,
                    })

                return mandates
Exemple #11
0
            def xt(cls, response):
                """Extract mandates from the list items matched by ``cls.XPATH``.

                Each item is stripped of its ``li`` tags (and reduced to the
                content of its first ``div``, if it has one). The first
                ``<br>``-separated line is read as ``function, party``; the
                second as ``start`` and optional ``end`` date separated by an
                en dash, both in ``DD.MM.YYYY`` format.

                Returns a list of dicts with the keys ``function``, ``party``,
                ``start_date`` and ``end_date``. A missing party becomes an
                empty string; a missing or unparseable date becomes ``None``
                (unparseable values are logged).
                """
                date_format = "%d.%m.%Y"
                mandates_raw = response.xpath(cls.XPATH).extract()
                mandates = []
                for mandate in mandates_raw:
                    mandate = _clean(remove_tags(mandate, 'li'))

                    if "<div" in mandate and "</div>" in mandate:
                        mandate = _clean(remove_tags(
                            Selector(text=mandate).xpath("//div").extract()[0],
                            'div'))

                    lines = mandate.split(u'<br>')
                    head = lines[0].split(',')
                    function = head[0]
                    party = _clean(head[1]) if len(head) > 1 else u''

                    # Split the raw values up front so the error handlers
                    # below only ever see names bound for this item.
                    period = lines[1].split(u'\u2013') if len(lines) > 1 else []
                    raw_start = _clean(period[0]) if period else u''
                    if len(period) > 1 and period[1]:
                        raw_end = _clean(period[1])
                    else:
                        raw_end = None

                    # Start Date
                    try:
                        start_date = datetime.datetime.strptime(
                            raw_start, date_format).date()
                    except (ValueError, TypeError):
                        logger.error(
                            u"Failed to parse mandate start date: {}".format(raw_start))
                        start_date = None

                    # End Date
                    end_date = None
                    if raw_end is not None:
                        try:
                            end_date = datetime.datetime.strptime(
                                raw_end, date_format).date()
                        except (ValueError, TypeError):
                            logger.error(
                                u"Failed to parse mandate end date: {}".format(raw_end))

                    mandates.append({
                        'function': function,
                        'party': party,
                        'start_date': start_date,
                        'end_date': end_date,
                    })

                return mandates
Exemple #12
0
        def xt(cls, response):
            """Extract the creators of a petition or citizens' initiative.

            ``cls.XPATH`` is a template that is formatted once with
            ``"eine Petition"`` and once with ``"Erstunterzeichner"``.

            If the petition query matches, each match contributes a
            ``(parl_id, name)`` tuple, where ``parl_id`` is the third
            ``/``-separated segment of the link target and ``name`` the link
            text; matches lacking either value are dropped. Otherwise, if the
            first-signer query matches, a single ``("", name)`` tuple is
            returned with the name taken from after the first tab character.

            Returns a (possibly empty) list of tuples.
            """
            XPATH_BI_creator = cls.XPATH.format("Erstunterzeichner")
            XPATH_PET_creator = cls.XPATH.format("eine Petition")

            creators = []

            raw_creators_list = response.xpath(XPATH_PET_creator).extract()
            if raw_creators_list:
                # PET started by members of parliament
                for raw_creator in raw_creators_list:
                    creator_sel = Selector(text=raw_creator)
                    name = u''
                    parl_id = u''

                    raw_parl_id_url = creator_sel.xpath("//a/@href").extract()
                    if raw_parl_id_url:
                        url_parts = raw_parl_id_url[0].split("/")
                        # Index 2 is read, so at least three segments are
                        # required.
                        if len(url_parts) > 2:
                            parl_id = url_parts[2]

                    raw_name = creator_sel.xpath("//a/text()").extract()
                    if raw_name:
                        name = raw_name[0]

                    if parl_id != u'' and name != u'':
                        creators.append((parl_id, name))
            else:
                raw_creators_list = response.xpath(XPATH_BI_creator).extract()
                if raw_creators_list:
                    # BI first signed by a person
                    name = _clean(raw_creators_list[0].split("\t")[1])
                    creators.append(("", name))
                # VBG seem to have no visible "starter"

            return creators
Exemple #13
0
        def xt(cls, response):
            """Collect persons with party mandates and electoral state.

            Every row matched by ``cls.XPATH`` yields a dict with the link
            target (``source_link``), the cleaned link text
            (``reversed_name``, reduced to the part after ``' siehe '`` when
            present), one ``{'short', 'title'}`` dict per span in the second
            cell (``mandates``) and the text/title of the span in the last
            cell (``electoral_state``).
            """
            persons = []
            for raw_person in response.xpath(cls.XPATH).extract():
                row = Selector(text=raw_person)

                link = row.xpath('//td//a/@href').extract()[0]
                name = _clean(row.xpath('//td//a/text()').extract()[0])
                if ' siehe ' in name:
                    name = name.split(' siehe ')[1]

                mandates = []
                for span_markup in row.xpath('//td[2]//span').extract():
                    span = Selector(text=span_markup)
                    short = span.xpath('//span/text()').extract()[0]
                    title = span.xpath('//span/@title').extract()[0]
                    mandates.append({'short': short, 'title': title})

                state_short = row.xpath(
                    '//td[last()]//span/text()').extract()[0]
                state_long = row.xpath(
                    '//td[last()]//span/@title').extract()[0]

                persons.append({
                    'source_link': link,
                    'reversed_name': name,
                    'mandates': mandates,
                    'electoral_state': {
                        'short': state_short,
                        'long': state_long,
                    },
                })

            return persons
Exemple #14
0
        def xt(cls, response):
            """Collect persons together with their fixed president mandate.

            Rows matched by ``cls.XPATH`` that contain a ``<th>`` are ignored.
            Each other row yields a dict with the link target
            (``source_link``), the cleaned link text taken after removing
            image tags (``reversed_name``) and a ``mandate`` dict whose dates
            come from ``cls.xt_pres_date``.
            """
            persons = []
            for raw_person in response.xpath(cls.XPATH).extract():
                row = Selector(text=raw_person)
                if row.xpath('//th'):
                    continue

                link = row.xpath('//td//a/@href').extract()[0]

                without_images = Selector(text=remove_tags(raw_person, 'img'))
                name = _clean(
                    without_images.xpath('//td//a/text()').extract()[0])

                pres_start_date, pres_end_date = cls.xt_pres_date(raw_person)

                persons.append({
                    'source_link': link,
                    'reversed_name': name,
                    'mandate': {
                        'title': u'RechnungshofpräsidentIn',
                        'short': u'RH-PräsidentIn',
                        'start_date': pres_start_date,
                        'end_date': pres_end_date
                    },
                })

            return persons
Exemple #15
0
        def xt(cls, response):
            """Extract the creators of a petition or citizens' initiative.

            ``cls.XPATH`` is a template that is formatted once with
            ``"eine Petition"`` and once with ``"Erstunterzeichner"``.

            If the petition query matches, each match contributes a
            ``(parl_id, name)`` tuple, where ``parl_id`` is the third
            ``/``-separated segment of the link target and ``name`` the link
            text; matches lacking either value are dropped. Otherwise, if the
            first-signer query matches, a single ``("", name)`` tuple is
            returned with the name taken from after the first tab character.

            Returns a (possibly empty) list of tuples.
            """
            XPATH_BI_creator = cls.XPATH.format("Erstunterzeichner")
            XPATH_PET_creator = cls.XPATH.format("eine Petition")

            creators = []

            raw_creators_list = response.xpath(XPATH_PET_creator).extract()
            if raw_creators_list:
                # PET started by members of parliament
                for raw_creator in raw_creators_list:
                    creator_sel = Selector(text=raw_creator)
                    name = u''
                    parl_id = u''

                    raw_parl_id_url = creator_sel.xpath("//a/@href").extract()
                    if raw_parl_id_url:
                        url_parts = raw_parl_id_url[0].split("/")
                        # Index 2 is read, so at least three segments are
                        # required.
                        if len(url_parts) > 2:
                            parl_id = url_parts[2]

                    raw_name = creator_sel.xpath("//a/text()").extract()
                    if raw_name:
                        name = raw_name[0]

                    if parl_id != u'' and name != u'':
                        creators.append((parl_id, name))
            else:
                raw_creators_list = response.xpath(XPATH_BI_creator).extract()
                if raw_creators_list:
                    # BI first signed by a person
                    name = _clean(raw_creators_list[0].split("\t")[1])
                    creators.append(("", name))
                # VBG seem to have no visible "starter"

            return creators
Exemple #16
0
        def xt(cls, response):
            """Collect persons with party mandates and electoral state.

            Every row matched by ``cls.XPATH`` yields a dict with the link
            target (``source_link``), the cleaned link text
            (``reversed_name``, reduced to the part after ``' siehe '`` when
            present), one ``{'short', 'title'}`` dict per span in the second
            cell (``mandates``) and the text/title of the span in the fourth
            cell (``electoral_state``).
            """
            persons = []
            for raw_person in response.xpath(cls.XPATH).extract():
                row = Selector(text=raw_person)

                link = row.xpath('//td//a/@href').extract()[0]
                name = _clean(row.xpath('//td//a/text()').extract()[0])
                if ' siehe ' in name:
                    name = name.split(' siehe ')[1]

                mandates = []
                for span_markup in row.xpath('//td[2]//span').extract():
                    span = Selector(text=span_markup)
                    short = span.xpath('//span/text()').extract()[0]
                    title = span.xpath('//span/@title').extract()[0]
                    mandates.append({'short': short, 'title': title})

                state_short = row.xpath('//td[4]//span/text()').extract()[0]
                state_long = row.xpath('//td[4]//span/@title').extract()[0]

                persons.append({
                    'source_link': link,
                    'reversed_name': name,
                    'mandates': mandates,
                    'electoral_state': {
                        'short': state_short,
                        'long': state_long,
                    },
                })

            return persons
Exemple #17
0
                def xt(cls, step_selector):
                    """Build the title dict of a step.

                    Without a table the result is ``{"text": ...}``: the
                    cleaned markup matched by ``cls.XPATH`` with ``BASE_HOST``
                    prepended to each ``<a href="`` target.

                    With a table, each row that has both a person link and a
                    statement type becomes a statement dict; other rows are
                    skipped. A row without protocol text gets an empty list as
                    ``protocol_text``. The result then carries the fixed text
                    ``u"Wortmeldungen in der Debatte"`` plus the statements.
                    """
                    title_selector = step_selector.xpath("//td[2]")[0]

                    if not title_selector.xpath("//table"):
                        raw_cell = step_selector.xpath(cls.XPATH).extract()[0]
                        cleaned = _clean(remove_tags(raw_cell, "td"))
                        return {"text": cleaned.replace('<a href="', '<a href="{}'.format(BASE_HOST))}

                    # we have wortmeldungen!
                    table_selector = title_selector.xpath("//table")[0]
                    row_selectors = [
                        Selector(text=markup) for markup in table_selector.xpath("//tbody//tr").extract()
                    ]

                    statements = []
                    for position, row in enumerate(row_selectors):
                        person_links = row.xpath(cls.XP_P_LINK).extract()
                        if not person_links:
                            continue

                        person_name = row.xpath(cls.XP_P_NAME).extract()

                        type_hits = row.xpath(cls.XP_T_TYPE).extract()
                        if not type_hits:
                            continue

                        protocol_link = row.xpath(cls.XP_PROT_LINK).extract()

                        text_hits = row.xpath(cls.XP_PROT_TEXT).extract()
                        if text_hits:
                            protocol_text = _clean(remove_tags(text_hits[0], "td a"))
                        else:
                            protocol_text = []

                        statements.append(
                            {
                                "index": position,
                                "person_source_link": person_links[0],
                                "person_name": person_name,
                                "statement_type": _clean(type_hits[0]),
                                "protocol_link": protocol_link,
                                "protocol_text": protocol_text,
                            }
                        )

                    return {"text": u"Wortmeldungen in der Debatte", "statements": statements}
Exemple #18
0
        def xt(cls, response):
            """Return False if a row is marked as dissolved, True otherwise.

            For every node matched by ``cls.XPATH`` the first text node of
            the second cell is cleaned and compared with the dissolution
            marker; the first match ends the scan.
            """
            for row in response.xpath(cls.XPATH):
                cell_texts = row.xpath('td[2]/text()').extract()
                if not cell_texts:
                    # no text in the second cell, nothing to compare
                    continue
                if _clean(cell_texts[0]) == u'Aufl\xf6sung':
                    return False

            return True
        def xt(cls, response):
            """Tell whether none of the matched rows carries the dissolution marker.

            Looks at the first text node of the second cell of each node
            matched by ``cls.XPATH``; returns False as soon as one of them,
            once cleaned, equals the marker, and True if none does.
            """
            marker = u'Aufl\xf6sung'
            # Generators keep the scan lazy, so it stops at the first hit.
            second_cells = (
                row.xpath('td[2]/text()').extract()
                for row in response.xpath(cls.XPATH)
            )
            return not any(
                len(texts) > 0 and _clean(texts[0]) == marker
                for texts in second_cells
            )
Exemple #20
0
        def xt(cls, response):
            """Extract the signature rows matched by ``cls.XPATH``.

            The text nodes of a row's cells are read positionally as
            full name, postal code, location and date (``%d.%m.%Y``).
            Rows without any cell text are skipped; a missing postal code
            or location becomes ``u''``.

            Returns a list of dicts with the keys ``full_name``,
            ``postal_code``, ``location`` and ``date`` (a ``datetime.date``).

            Raises ``ValueError`` if a date cell is present but does not
            match ``%d.%m.%Y``.
            """
            signatures = []
            for raw_signature in response.xpath(cls.XPATH).extract():
                cells = Selector(text=raw_signature).xpath('//td/text()').extract()
                if not cells:
                    continue

                full_name = _clean(cells[0])
                postal_code = _clean(cells[1]) if len(cells) > 1 else u''
                location = _clean(cells[2]) if len(cells) > 2 else u''

                if len(cells) > 3:
                    # Parse straight into a date. Going through
                    # time.mktime()/fromtimestamp() depends on the platform's
                    # time_t range and on the local timezone/DST rules.
                    date = datetime.datetime.strptime(
                        _clean(cells[3]), '%d.%m.%Y').date()
                else:
                    # Placeholder for rows without a date: the epoch as seen
                    # in local time (kept unchanged for existing consumers).
                    date = datetime.date.fromtimestamp(0)

                signatures.append({
                    'full_name': full_name,
                    'postal_code': postal_code,
                    'location': location,
                    'date': date
                })

            return signatures
        def xt(cls, response):
            """Collect the laws and reports linked from the response.

            Nodes matched by ``cls.XPATH_LAWS`` and ``cls.XPATH_REPORTS`` are
            handled alike: the first text node is the title, the ``href``
            is resolved through ``COMITTEE.url_to_parlid`` and prefixed with
            ``BASE_HOST``. Entries without a link, or whose legislative
            period or parl_id comes back empty, are left out.

            Returns a list of dicts with the keys ``title``, ``source_link``,
            ``parl_id`` and ``llp``.
            """
            anchors = response.xpath(cls.XPATH_LAWS) + response.xpath(cls.XPATH_REPORTS)

            laws = []
            for anchor in anchors:
                title_texts = anchor.xpath('text()').extract()
                law_title = _clean(title_texts[0]) if title_texts else u''

                hrefs = anchor.xpath('@href').extract()
                if not hrefs:
                    # without a link we can't get the necessary info
                    continue

                relative_link = hrefs[0]
                law_llp, law_parl_id = COMITTEE.url_to_parlid(relative_link)
                law_link = "{}/{}".format(BASE_HOST, relative_link)

                if law_llp == u'' or law_parl_id == u'':
                    # the link could not be resolved to a law
                    continue

                laws.append({
                    'title': law_title,
                    'source_link': law_link,
                    'parl_id': law_parl_id,
                    'llp': law_llp,
                })

            return laws
Exemple #22
0
        def xt(cls, response):
            """Return the laws and reports referenced by the response.

            Works on the combined matches of ``cls.XPATH_LAWS`` and
            ``cls.XPATH_REPORTS``. For each node with an ``href`` the
            legislative period and parl_id are taken from
            ``COMITTEE.url_to_parlid``; only nodes where both are non-empty
            end up in the result, as dicts with the keys ``title``,
            ``source_link``, ``parl_id`` and ``llp``.
            """
            law_nodes = response.xpath(cls.XPATH_LAWS)
            report_nodes = response.xpath(cls.XPATH_REPORTS)

            found = []
            for node in law_nodes + report_nodes:
                texts = node.xpath('text()').extract()
                if texts:
                    title = _clean(texts[0])
                else:
                    title = u''

                links = node.xpath('@href').extract()
                if len(links) == 0:
                    # without a link we can't get the necessary info
                    continue

                href = links[0]
                llp, parl_id = COMITTEE.url_to_parlid(href)
                source_link = "{}/{}".format(BASE_HOST, href)

                if llp != u'' and parl_id != u'':
                    found.append(dict(
                        title=title,
                        source_link=source_link,
                        parl_id=parl_id,
                        llp=llp,
                    ))

            return found
Exemple #23
0
            def xt(cls, response):
                """Extract mandates from the elements matched by ``cls.XPATH``.

                Each matched element is stripped of its ``li`` tags and, if it
                contains a ``div``, reduced to the content of the first one.
                The text before the first ``<br>`` is split on commas into
                function and party. A ``(<roman>. GP)`` marker inside the
                function gives the legislative period, or an inclusive range
                of periods when it contains a dash. The text after the first
                ``<br>`` is read as ``start<dash>end`` in ``%d.%m.%Y`` format.

                Returns a list with one dict per mandate and legislative
                period (keys ``function``, ``party``, ``start_date``,
                ``end_date``, ``llp``, ``llp_roman``). A mandate without a
                period marker yields one dict with ``llp`` and ``llp_roman``
                set to ``None``. Dates that cannot be parsed are logged once
                per mandate and returned as ``None``.
                """
                mandates = []
                for mandate in response.xpath(cls.XPATH).extract():
                    mandate = _clean(remove_tags(mandate, 'li'))

                    if "<div" in mandate and "</div>" in mandate:
                        mandate = _clean(remove_tags(
                            Selector(text=mandate).xpath("//div").extract()[0],
                            'div'))

                    lines = mandate.split(u'<br>')

                    # Look for the comma in the very piece that is indexed, so
                    # a comma that only occurs after the <br> can no longer
                    # cause an IndexError.
                    head_parts = lines[0].split(',')
                    function = head_parts[0]
                    party = head_parts[1] if len(head_parts) > 1 else ''

                    # Raw strings: the patterns contain regex escapes only.
                    llp_raw = re.match(r'^.*\((.*)\. GP\).*$', function)
                    function = re.sub(r'\((.*)\. GP\)', '', function).strip()

                    m_llp_roman_begin = m_llp_roman_end = (
                        llp_raw.group(1) if llp_raw else '')
                    if u'–' in m_llp_roman_begin:
                        m_llp_roman_begin, m_llp_roman_end = \
                            m_llp_roman_begin.split(u'–')

                    if m_llp_roman_begin:
                        llps = range(
                            roman.fromRoman(m_llp_roman_begin.strip('. ')),
                            roman.fromRoman(m_llp_roman_end.strip('. ')) + 1)
                    else:
                        llps = [None]

                    # The dates are the same for every legislative period of
                    # this mandate, so parse (and possibly log) them only once.
                    # The raw text is tracked separately so the error message
                    # never touches an unbound or stale variable.
                    raw_start = None
                    try:
                        raw_start = _clean(lines[1].split(u'\u2013')[0])
                        start_date = datetime.datetime.strptime(
                            raw_start, "%d.%m.%Y").date()
                    except Exception:
                        # best effort: a mandate without usable dates is kept
                        logger.error(
                            u"Failed to parse mandate start date: {}".format(
                                raw_start))
                        start_date = None

                    raw_end = None
                    try:
                        date_span = lines[1].split(u'\u2013')
                        if len(date_span) > 1 and date_span[1]:
                            raw_end = _clean(date_span[1])
                            end_date = datetime.datetime.strptime(
                                raw_end, "%d.%m.%Y").date()
                        else:
                            # open-ended mandate
                            end_date = None
                    except Exception:
                        logger.error(
                            u"Failed to parse mandate end date: {}".format(
                                raw_end))
                        end_date = None

                    for llp in llps:
                        mandates.append({
                            'function': function,
                            'party': _clean(party),
                            'start_date': start_date,
                            'end_date': end_date,
                            'llp': llp,
                            'llp_roman': roman.toRoman(llp) if llp else None,
                        })

                return mandates
        def xt(cls, response):
            """Extract committee memberships from the nodes matched by ``cls.XPATH``.

            For every matched node the second text node of its first anchor
            decides between ``u'Nationalrat'`` (with the legislative period
            taken from that text) and ``u'Bundesrat'``. The rows of the first
            table in the following sibling ``div`` are then read one by one:
            function (inherited from the previous row when the cell is
            empty), committee link and name, and the from/to dates
            (``%d.%m.%Y``, separated by ``\\u2013``).

            Returns a list of dicts with the keys ``comittee`` (``name``,
            ``parl_id``, ``nrbr``, ``legislative_period``, ``source_link``),
            ``function``, ``date_from`` and ``date_to``. Dates are
            ``datetime.datetime`` objects or ``None``. Rows whose committee
            parl_id is empty are dropped.

            Raises ``ValueError`` if a non-empty date does not match
            ``%d.%m.%Y``.
            """
            memberships = []

            for raw_membership in response.xpath(cls.XPATH):
                raw_llp = raw_membership.xpath('a[1]/text()').extract()[1]
                nrbr = u'Nationalrat'
                comittee_llp = None
                if nrbr in raw_llp:
                    comittee_llp = raw_llp.split()[-2][:-1]
                else:
                    nrbr = u'Bundesrat'

                tablerows = raw_membership.xpath('following-sibling::div[1]/table[1]/tbody/tr').extract()

                last_function = u''
                for row in tablerows:
                    row_sel = Selector(text=row)

                    raw_function = row_sel.xpath('//td[@class="biogr_am_funktext"]/text()').extract()
                    if raw_function:
                        function = _clean(raw_function[0])
                        # TODO: standardization of functions should be done on model level
                        last_function = function
                    else:
                        function = last_function

                    raw_comittee_link = row_sel.xpath('//td[@class="biogr_am_ausschuss"]/a/@href').extract()
                    if raw_comittee_link:
                        comittee_link = "{}/{}".format(BASE_HOST, raw_comittee_link[0])
                    else:
                        comittee_link = u''

                    _, comittee_parl_id = COMITTEE.url_to_parlid(comittee_link)

                    # Prefer the link text; fall back to the plain cell text.
                    raw_comitee_name = row_sel.xpath('//td[@class="biogr_am_ausschuss"]/a/text()').extract()
                    if not raw_comitee_name:
                        raw_comitee_name = row_sel.xpath('//td[@class="biogr_am_ausschuss"]/text()').extract()
                    comittee_name = _clean(raw_comitee_name[0]) if raw_comitee_name else u''

                    # Reset per row so a row without dates neither reuses the
                    # previous row's values nor hits an unbound name.
                    date_from = None
                    date_to = None
                    raw_dates = row_sel.xpath('//td[@class="biogr_am_vonbis"]/text()').extract()
                    if raw_dates and raw_dates[0]:
                        # \u2013 == - (dash)
                        date_parts = _clean(raw_dates[0]).split(u'\u2013')
                        # Compare by value; identity against a literal is an
                        # implementation detail of the interpreter.
                        if date_parts[0] != u'':
                            date_from = datetime.datetime.strptime(date_parts[0], '%d.%m.%Y')
                        if len(date_parts) > 1 and date_parts[1] != u'':
                            date_to = datetime.datetime.strptime(date_parts[1], '%d.%m.%Y')

                    # we cant add the membership if the parl_id of the comitee is empty
                    if comittee_parl_id != u'':
                        memberships.append({
                            'comittee':
                                {
                                    'name': comittee_name,
                                    'parl_id': comittee_parl_id,
                                    'nrbr': nrbr,
                                    'legislative_period': comittee_llp,
                                    'source_link': comittee_link
                                },
                            'function': function,
                            'date_from': date_from,
                            'date_to': date_to
                        })

            return memberships
Exemple #25
0
        def xt(cls, response):
            """Extract committee memberships from the nodes matched by ``cls.XPATH``.

            The second text node of a node's first anchor selects
            ``u'Nationalrat'`` (plus the legislative period parsed from that
            text) or ``u'Bundesrat'``. Each row of the first table in the
            following sibling ``div`` contributes function (carried over from
            the previous row when its cell is empty), committee link
            (``BASE_HOST`` directly followed by the href), committee name and
            the from/to dates (``%d.%m.%Y``, separated by ``\\u2013``).

            Returns a list of dicts with the keys ``comittee`` (``name``,
            ``parl_id``, ``nrbr``, ``legislative_period``, ``source_link``),
            ``function``, ``date_from`` and ``date_to``; the dates are
            ``datetime.datetime`` objects or ``None``. Rows whose committee
            parl_id is empty are dropped.

            Raises ``ValueError`` if a non-empty date does not match
            ``%d.%m.%Y``.
            """

            def parse_day(text):
                # Empty pieces mean "no date". Compared by value, not by
                # identity, and parsed without the time.mktime() round trip.
                if text == u'':
                    return None
                return datetime.datetime.strptime(text, '%d.%m.%Y')

            raw_memberships = response.xpath(cls.XPATH)

            memberships = []

            for raw_membership in raw_memberships:
                raw_llp = raw_membership.xpath('a[1]/text()').extract()[1]
                nrbr = u'Nationalrat'
                comittee_llp = None
                if nrbr in raw_llp:
                    comittee_llp = raw_llp.split()[-2][:-1]
                else:
                    nrbr = u'Bundesrat'

                tablerows = raw_membership.xpath(
                    'following-sibling::div[1]/table[1]/tbody/tr').extract()

                last_function = u''
                for row in tablerows:
                    row_sel = Selector(text=row)

                    raw_function = row_sel.xpath(
                        '//td[@class="biogr_am_funktext"]/text()').extract()
                    if len(raw_function) > 0:
                        function = _clean(raw_function[0])
                        # TODO: standardization of functions should be done on model level
                        last_function = function
                    else:
                        function = last_function

                    raw_comittee_link = row_sel.xpath(
                        '//td[@class="biogr_am_ausschuss"]/a/@href').extract()
                    if raw_comittee_link:
                        comittee_link = "{}{}".format(
                            BASE_HOST, raw_comittee_link[0])
                    else:
                        comittee_link = u''

                    _, comittee_parl_id = COMITTEE.url_to_parlid(comittee_link)

                    # Prefer the link text; fall back to the plain cell text.
                    raw_comitee_name = row_sel.xpath(
                        '//td[@class="biogr_am_ausschuss"]/a/text()').extract()
                    if len(raw_comitee_name) == 0:
                        raw_comitee_name = row_sel.xpath(
                            '//td[@class="biogr_am_ausschuss"]/text()'
                        ).extract()
                    if len(raw_comitee_name) > 0:
                        comittee_name = _clean(raw_comitee_name[0])
                    else:
                        comittee_name = u''

                    # Start from "no dates" on every row: a row with an empty
                    # or missing dates cell must not inherit the previous
                    # row's values or fail on an unbound name.
                    date_from, date_to = None, None
                    raw_dates = row_sel.xpath(
                        '//td[@class="biogr_am_vonbis"]/text()').extract()
                    if raw_dates and raw_dates[0]:
                        # \u2013 == - (dash)
                        pieces = _clean(raw_dates[0]).split(u'\u2013')
                        date_from = parse_day(pieces[0])
                        if len(pieces) > 1:
                            date_to = parse_day(pieces[1])

                    # we cant add the membership if the parl_id of the comitee is empty
                    if comittee_parl_id != u'':
                        memberships.append({
                            'comittee': {
                                'name': comittee_name,
                                'parl_id': comittee_parl_id,
                                'nrbr': nrbr,
                                'legislative_period': comittee_llp,
                                'source_link': comittee_link
                            },
                            'function': function,
                            'date_from': date_from,
                            'date_to': date_to
                        })

            return memberships
        def xt(cls, response):
            """Extract meetings and their agenda topics from ``cls.XPATH`` matches.

            For every matched node the ``historyHeader`` row supplies the
            date (``%d.%m.%Y``), the meeting number (only headers whose link
            text contains ``u'Sitzung'`` count as meetings) and the ``.pdf`` /
            ``.html`` document links, which are prefixed with ``BASE_HOST``
            unless they already start with it. The rows following the header
            are read as topics: ``TOP <n>`` number (0 when absent), text,
            comment and the referenced law resolved through
            ``COMITTEE.url_to_parlid``.

            Returns a list of dicts with the keys ``number``, ``date``
            (``datetime.datetime`` or ``None``), ``agenda`` (dict with
            ``title``, ``html_link``, ``pdf_link``, or ``None`` when there is
            no document) and ``topics`` (list of dicts with ``number``,
            ``text``, ``comment``, ``law``).

            Raises ``ValueError`` if a non-empty header date does not match
            ``%d.%m.%Y``.
            """
            meetings = []

            for raw_meeting in response.xpath(cls.XPATH):
                raw_header_row = raw_meeting.xpath('tr[@class="historyHeader"]')

                meeting_date = None
                raw_date = raw_header_row.xpath('td[1]/text()').extract()
                if len(raw_date) > 0:
                    date_text = _clean(raw_date[0])
                    # compare by value, and parse without the mktime() detour
                    if date_text != u'':
                        meeting_date = datetime.datetime.strptime(date_text, '%d.%m.%Y')

                raw_number = raw_header_row.xpath('td[2]/em/a/text()').extract()
                if len(raw_number) > 0 and u'Sitzung' in raw_number[0]:
                    meeting_number = raw_number[0].split()[0][:-1]
                else:
                    continue  # not a meeting

                raw_document_urls = raw_header_row.xpath('td[2]/a/@href').extract()

                html_link, pdf_link = u"", u""
                for url in raw_document_urls:
                    if url.endswith('.pdf'):
                        pdf_link = url
                        if not pdf_link.startswith(BASE_HOST):
                            pdf_link = "{}/{}".format(BASE_HOST, pdf_link)
                    elif url.endswith('.html'):
                        html_link = url
                        if not html_link.startswith(BASE_HOST):
                            html_link = "{}/{}".format(BASE_HOST, html_link)

                comittee_name = COMITTEE.NAME.xt(response)
                if meeting_date is not None:
                    title = u'Tagesordnung der {}. Sitzung des {} am {}'\
                        .format(meeting_number, comittee_name, str(meeting_date.date()))
                else:
                    # No date in the header: leave the date part out instead
                    # of failing on None.date().
                    title = u'Tagesordnung der {}. Sitzung des {}'\
                        .format(meeting_number, comittee_name)

                if html_link != u'' or pdf_link != u'':
                    meeting_document = {
                        'title': title,
                        'html_link': html_link,
                        'pdf_link': pdf_link
                    }
                else:
                    meeting_document = None

                meeting_topics = []

                for raw_row in raw_header_row.xpath('following-sibling::tr'):
                    topic_number = 0
                    raw_topic_number = raw_row.xpath('td[1]/text()').extract()
                    if len(raw_topic_number) > 0:
                        topic_number_list = _clean(raw_topic_number[0]).split()
                        if len(topic_number_list) == 2 and topic_number_list[0] == u'TOP':
                            topic_number = int(topic_number_list[1])

                    raw_topic_text = raw_row.xpath('td[2]/text()').extract()

                    if len(raw_topic_text) > 0:
                        topic_text = _clean(raw_topic_text[0])
                        # the text node ends where the law link's "(" opens
                        if topic_text.endswith('('):
                            topic_text = topic_text[:-1].rstrip()
                    else:
                        topic_text = u''

                    if len(raw_topic_text) > 1:
                        topic_comment = _clean(raw_topic_text[1])
                        # Drop the leading ")" that closes the law link; the
                        # previous [:-1] removed the last character instead.
                        if topic_comment.startswith(')'):
                            topic_comment = topic_comment[1:].lstrip()
                    else:
                        topic_comment = u''

                    raw_topic_law_text = raw_row.xpath('td[2]/a/text()').extract()

                    if len(raw_topic_law_text) > 0:
                        topic_law_text = u'({})'.format(raw_topic_law_text[0])
                    else:
                        topic_law_text = u''

                    topic_text = u'{} {}'.format(topic_text, topic_law_text)

                    raw_topic_law_link = raw_row.xpath('td[2]/a/@href').extract()

                    if len(raw_topic_law_link) > 0:
                        topic_law_llp, topic_law_id = COMITTEE.url_to_parlid(raw_topic_law_link[0])
                    else:
                        topic_law_llp, topic_law_id = u'', u''

                    if topic_law_id != u'':
                        topic_law = {
                            'parl_id': topic_law_id,
                            'llp': topic_law_llp
                        }
                    else:
                        topic_law = None

                    meeting_topics.append({
                        'number': topic_number,
                        'text': topic_text,
                        'comment': topic_comment,
                        'law': topic_law
                    })

                meetings.append({
                    'number': meeting_number,
                    'date': meeting_date,
                    'agenda': meeting_document,
                    'topics': meeting_topics,
                })

            return meetings
Exemple #27
0
        def xt(cls, response):
            """Extract meetings and their agenda topics from ``cls.XPATH`` matches.

            The ``historyHeader`` row of each matched node provides the date
            (``%d.%m.%Y``), the meeting number (headers whose link text lacks
            ``u'Sitzung'`` are skipped) and the ``.pdf`` / ``.html`` document
            links, prefixed with ``BASE_HOST`` when they do not start with it.
            The rows after the header are the topics: ``TOP <n>`` number
            (0 when absent), text, comment and the referenced law resolved
            through ``COMITTEE.url_to_parlid``.

            Returns a list of dicts with the keys ``number``, ``date``
            (``datetime.datetime`` or ``None``), ``agenda`` (dict with
            ``title``, ``html_link``, ``pdf_link``, or ``None`` when there is
            no document) and ``topics`` (list of dicts with ``number``,
            ``text``, ``comment``, ``law``).

            Raises ``ValueError`` if a non-empty header date does not match
            ``%d.%m.%Y``.
            """

            def absolute(url):
                # Links that already start with the host are kept as they are.
                if url.startswith(BASE_HOST):
                    return url
                return "{}/{}".format(BASE_HOST, url)

            raw_meetings = response.xpath(cls.XPATH)

            meetings = []

            for raw_meeting in raw_meetings:
                raw_header_row = raw_meeting.xpath(
                    'tr[@class="historyHeader"]')
                raw_date = raw_header_row.xpath('td[1]/text()').extract()

                meeting_date = None
                if len(raw_date) > 0:
                    raw_date = _clean(raw_date[0])
                    # Value comparison instead of identity, and a direct
                    # parse instead of the time.mktime() round trip.
                    if raw_date != u'':
                        meeting_date = datetime.datetime.strptime(
                            raw_date, '%d.%m.%Y')

                raw_number = raw_header_row.xpath(
                    'td[2]/em/a/text()').extract()
                if len(raw_number) == 0 or u'Sitzung' not in raw_number[0]:
                    continue  # not a meeting
                meeting_number = raw_number[0].split()[0][:-1]

                raw_document_urls = raw_header_row.xpath(
                    'td[2]/a/@href').extract()

                html_link, pdf_link = u"", u""
                for url in raw_document_urls:
                    if url.endswith('.pdf'):
                        pdf_link = absolute(url)
                    elif url.endswith('.html'):
                        html_link = absolute(url)

                # A header without a date used to fail on None.date(); the
                # title now simply omits the date part in that case.
                title = u'Tagesordnung der {}. Sitzung des {}'.format(
                    meeting_number, COMITTEE.NAME.xt(response))
                if meeting_date is not None:
                    title = u'{} am {}'.format(
                        title, str(meeting_date.date()))

                if html_link != u'' or pdf_link != u'':
                    meeting_document = {
                        'title': title,
                        'html_link': html_link,
                        'pdf_link': pdf_link
                    }
                else:
                    meeting_document = None

                raw_rows = raw_header_row.xpath('following-sibling::tr')

                meeting_topics = []

                for raw_row in raw_rows:
                    raw_topic_number = raw_row.xpath('td[1]/text()').extract()

                    topic_number = 0
                    if len(raw_topic_number) > 0:
                        topic_number_list = _clean(raw_topic_number[0]).split()
                        if len(topic_number_list
                               ) == 2 and topic_number_list[0] == u'TOP':
                            topic_number = int(topic_number_list[1])

                    raw_topic_text = raw_row.xpath('td[2]/text()').extract()

                    topic_text = u''
                    if len(raw_topic_text) > 0:
                        topic_text = _clean(raw_topic_text[0])
                        # the text node ends where the law link's "(" opens
                        if topic_text.endswith('('):
                            topic_text = topic_text[:-1].rstrip()

                    topic_comment = u''
                    if len(raw_topic_text) > 1:
                        topic_comment = _clean(raw_topic_text[1])
                        # Remove the leading ")" left over from the law
                        # link; slicing with [:-1] cut the wrong end.
                        if topic_comment.startswith(')'):
                            topic_comment = topic_comment[1:].lstrip()

                    raw_topic_law_text = raw_row.xpath(
                        'td[2]/a/text()').extract()

                    if len(raw_topic_law_text) > 0:
                        topic_law_text = u'({})'.format(raw_topic_law_text[0])
                    else:
                        topic_law_text = u''

                    topic_text = u'{} {}'.format(topic_text, topic_law_text)

                    raw_topic_law_link = raw_row.xpath(
                        'td[2]/a/@href').extract()

                    if len(raw_topic_law_link) > 0:
                        topic_law_llp, topic_law_id = COMITTEE.url_to_parlid(
                            raw_topic_law_link[0])
                    else:
                        topic_law_llp, topic_law_id = u'', u''

                    if topic_law_id != u'':
                        topic_law = {
                            'parl_id': topic_law_id,
                            'llp': topic_law_llp
                        }
                    else:
                        topic_law = None

                    topic = {
                        'number': topic_number,
                        'text': topic_text,
                        'comment': topic_comment,
                        'law': topic_law
                    }

                    meeting_topics.append(topic)

                meeting = {
                    'number': meeting_number,
                    'date': meeting_date,
                    'agenda': meeting_document,
                    'topics': meeting_topics,
                }

                meetings.append(meeting)

            return meetings