def xt_admin_date(cls, raw_person):
    """Extract the administration period for one person row.

    Reads the @title of the span in the first table cell; it holds one
    date or a "start - end" range, possibly followed by ';'/',' and
    extra text that is stripped first.

    Returns a (start_date, end_date) tuple of datetime.date objects;
    end_date is None for open-ended periods, both are None when the
    string cannot be parsed.
    """
    admin_datestring = Selector(text=raw_person).xpath(
        '//td[1]/span/@title').extract()[0]
    # Keep only the leading date portion of the title text.
    if ';' in admin_datestring:
        admin_datestring = admin_datestring.split(";")[0]
    if ',' in admin_datestring:
        admin_datestring = admin_datestring.split(",")[0]
    start_date = None
    end_date = None
    try:
        if " - " in admin_datestring:
            start_date = _clean(admin_datestring.split(' - ')[0])
            end_date = _clean(admin_datestring.split(' - ')[1])
            start_date = datetime.datetime.strptime(
                start_date, "%d.%m.%Y").date()
            end_date = datetime.datetime.strptime(
                end_date, "%d.%m.%Y").date()
        else:
            start_date = datetime.datetime.strptime(
                _clean(admin_datestring), "%d.%m.%Y").date()
            end_date = None
    except (ValueError, IndexError):
        # Was a bare except that dropped into an ipdb debugger session;
        # log and fall back to (None, None) instead of halting the
        # scraper (and never leave a half-parsed string in the result).
        logger.error(
            "Couldn't extract date from datestring {}".format(
                admin_datestring))
        start_date = None
        end_date = None
    return (start_date, end_date)
def xt(cls, response):
    """Extract opinion rows (date, url, email, title, parl_id) from the
    overview table; the first row (header) is skipped.
    """
    ops = []
    raw_ops = response.xpath(cls.XPATH).extract()
    for raw_op in raw_ops[1:]:
        op_sel = Selector(text=raw_op)
        date = op_sel.xpath("//td[1]/text()").extract()
        date = date[0]
        url = op_sel.xpath("//td[2]/a/@href").extract()[0]
        parl_id = u"({})".format(
            op_sel.xpath("//td[2]/a/text()").extract()[0])
        title = op_sel.xpath("//td[3]/text()").extract()[0]
        if title:
            title = _clean(title).replace("*", ", ")
        else:
            title = None
        email = op_sel.xpath("//td[3]/a/@href").extract()
        if email:
            email = email[0].replace("mailto:", "")
            # When the cell holds a mailto link, the visible link text
            # is the actual title.
            title = op_sel.xpath("//td[3]/a/text()").extract()[0]
        else:
            email = None
        try:
            date = datetime.datetime.strptime(
                _clean(date), "%d.%m.%Y").date()
        except ValueError:
            # Narrowed from a bare except: only a failed date parse
            # should be tolerated here.
            date = None
        ops.append({"date": date,
                    "url": url,
                    "email": email,
                    "title": title,
                    "parl_id": parl_id})
    return ops
def xt(cls, response):
    """Extract opinion rows (date, url, parl_id, title) using
    normalize-space() to collapse cell whitespace.

    The header row is skipped; this layout never carries an email.
    """
    ops = []
    raw_ops = response.xpath(cls.XPATH).extract()
    for raw_op in raw_ops[1:]:
        op_sel = Selector(text=raw_op)
        date = op_sel.xpath('//td[1]').xpath(
            "normalize-space()").extract()[0]
        url = op_sel.xpath('//td[2]/a/@href').extract()[0]
        parl_id = u"({})".format(
            op_sel.xpath('//td[3]/a').xpath('normalize-space()')
            .extract()[0])
        title = op_sel.xpath('//td[2]').xpath(
            'normalize-space()').extract()[0]
        if title:
            title = _clean(title).replace("*", ", ")
        else:
            title = None
        email = None
        try:
            date = datetime.datetime.strptime(
                _clean(date), "%d.%m.%Y").date()
        except ValueError:
            # Narrowed from a bare except: strptime only raises
            # ValueError on bad input.
            date = None
        ops.append({
            'date': date,
            'url': url,
            'email': email,
            'title': title,
            'parl_id': parl_id
        })
    return ops
def xt(cls, response):
    """Parse signature-table rows into dicts with full_name,
    postal_code, location and date.

    Missing trailing columns default to empty strings; a missing date
    column defaults to the Unix epoch.
    """
    signatures = []
    for raw_signature in response.xpath(cls.XPATH).extract():
        columns = Selector(
            text=raw_signature).xpath('//td/text()').extract()
        if not columns:
            continue
        full_name = _clean(columns[0])
        postal_code = _clean(columns[1]) if len(columns) > 1 else u''
        location = _clean(columns[2]) if len(columns) > 2 else u''
        if len(columns) > 3:
            parsed = time.strptime(_clean(columns[3]), '%d.%m.%Y')
            date = datetime.date.fromtimestamp(time.mktime(parsed))
        else:
            date = datetime.date.fromtimestamp(0)
        signatures.append({
            'full_name': full_name,
            'postal_code': postal_code,
            'location': location,
            'date': date
        })
    return signatures
def xt(cls, step_selector):
    """Extract a step title; when the cell contains a table we have
    debate statements ("Wortmeldungen") and parse them row by row.

    Returns {'text': ..., 'statements': [...]} for a debate, or
    {'text': ...} for a plain title.
    """
    title_selector = step_selector.xpath('//td[2]')[0]
    # we have wortmeldungen!
    if title_selector.xpath('//table'):
        table_selector = title_selector.xpath('//table')[0]
        raw_rows = [
            Selector(text=raw_row)
            for raw_row in table_selector.xpath('//tbody//tr').extract()
        ]
        statements = []
        # Extract statements data
        for index, row_selector in enumerate(raw_rows):
            try:
                person_source_link = row_selector.xpath(
                    cls.XP_P_LINK).extract()[0]
                person_name = row_selector.xpath(
                    cls.XP_P_NAME).extract()
                statement_type = _clean(
                    row_selector.xpath(
                        cls.XP_T_TYPE).extract()[0])
                protocol_link = row_selector.xpath(
                    cls.XP_PROT_LINK).extract()
                protocol_text = _clean(
                    remove_tags(
                        row_selector.xpath(
                            cls.XP_PROT_TEXT).extract()[0],
                        'td a'))
                statements.append({
                    'index': index,
                    'person_source_link': person_source_link,
                    'person_name': person_name,
                    'statement_type': statement_type,
                    'protocol_link': protocol_link,
                    'protocol_text': protocol_text,
                })
            except Exception:
                # Narrowed from a bare except so KeyboardInterrupt /
                # SystemExit are no longer swallowed; malformed rows are
                # still skipped with a log entry.
                logger.error(
                    "Skipping statement '{}' due to extraction error"
                    .format(row_selector.extract()))
                continue
        title = {
            'text': u'Wortmeldungen in der Debatte',
            'statements': statements
        }
    else:
        text = _clean(
            remove_tags(
                step_selector.xpath(cls.XPATH).extract()[0],
                'td')).replace('<a href="',
                               '<a href="{}'.format(BASE_HOST))
        title = {'text': text}
    return title
def xt(cls, response):
    """Extract biographical data from the profile markup.

    Scans each '<br>'-separated segment for 'Geb.' (birth), 'Verst.'
    (death) and 'Beruf' (occupation) markers.

    Returns a dict with birthdate/deathdate as datetime.date (or None)
    and birthplace/deathplace/occupation as strings.
    """
    bio = {
        'birthdate': None,
        'birthplace': '',
        'deathdate': None,
        'deathplace': '',
        'occupation': ''
    }
    bio_data = response.xpath(cls.XPATH).extract()
    if bio_data:
        bio_data = bio_data[0]
    else:
        return bio
    for data in bio_data.split('<br>'):
        # Birth Data
        birth = Selector(text=data)\
            .xpath("//em[contains(text(),'Geb.')]/parent::*/text()")\
            .extract()
        if birth:
            birth = birth[0]
            bio['birthdate'] = _clean(birth.split(',')[0])
            try:
                bio['birthdate'] = datetime.datetime.strptime(
                    bio['birthdate'], "%d.%m.%Y").date()
            except ValueError:
                # Narrowed from a bare except; strptime only raises
                # ValueError on bad input.
                logger.error(
                    "Failed to parse birthdate: {}".format(
                        bio['birthdate']))
                bio['birthdate'] = None
            if len(birth.split(',')) > 1:
                bio['birthplace'] = birth.split(',')[1]
        # Death Data
        death = Selector(text=data)\
            .xpath("//em[contains(text(),'Verst.')]/parent::*/text()")\
            .extract()
        if death:
            death = death[0]
            bio['deathdate'] = _clean(death.split(',')[0])
            try:
                bio['deathdate'] = datetime.datetime.strptime(
                    bio['deathdate'], "%d.%m.%Y").date()
            except ValueError:
                logger.error(
                    "Failed to parse deathdate: {}".format(
                        bio['deathdate']))
                bio['deathdate'] = None
            if len(death.split(',')) > 1:
                bio['deathplace'] = death.split(',')[1]
        # Occupation
        occupation = Selector(text=data)\
            .xpath("//em[contains(text(),'Beruf')]/parent::*/text()")\
            .extract()
        if occupation:
            occupation = occupation[0]
            bio['occupation'] = occupation.split(',')[0]
    return bio
def xt(cls, step_selector):
    """Build the title dict for a protocol step.

    If the second cell contains a table, each row is parsed into a
    statement record; rows lacking a person link or a statement type
    are skipped. Otherwise the plain cell text is returned.
    """
    cell = step_selector.xpath('//td[2]')[0]
    if not cell.xpath('//table'):
        text = _clean(
            remove_tags(
                step_selector.xpath(cls.XPATH).extract()[0],
                'td')).replace('<a href="',
                               '<a href="{}'.format(BASE_HOST))
        return {'text': text}
    table = cell.xpath('//table')[0]
    rows = [Selector(text=markup)
            for markup in table.xpath('//tbody//tr').extract()]
    statements = []
    for index, row in enumerate(rows):
        links = row.xpath(cls.XP_P_LINK).extract()
        if not links:
            continue  # no speaker link -> no usable statement
        types = row.xpath(cls.XP_T_TYPE).extract()
        if not types:
            continue  # no statement type -> skip row
        raw_text = row.xpath(cls.XP_PROT_TEXT).extract()
        if raw_text:
            protocol_text = _clean(remove_tags(raw_text[0], 'td a'))
        else:
            protocol_text = []
        statements.append({
            'index': index,
            'person_source_link': links[0],
            'person_name': row.xpath(cls.XP_P_NAME).extract(),
            'statement_type': _clean(types[0]),
            'protocol_link': row.xpath(cls.XP_PROT_LINK).extract(),
            'protocol_text': protocol_text,
        })
    return {
        'text': u'Wortmeldungen in der Debatte',
        'statements': statements
    }
def xt(cls, response):
    """Extract biographical data (birth, death, occupation).

    Each '<br>'-separated segment of the bio markup is scanned for the
    'Geb.', 'Verst.' and 'Beruf' markers. Dates come back as
    datetime.date (or None on parse failure), places/occupation as
    strings.
    """
    bio = {
        'birthdate': None,
        'birthplace': '',
        'deathdate': None,
        'deathplace': '',
        'occupation': ''
    }
    bio_data = response.xpath(cls.XPATH).extract()
    if bio_data:
        bio_data = bio_data[0]
    else:
        return bio
    for data in bio_data.split('<br>'):
        # Birth Data
        birth = Selector(text=data)\
            .xpath("//em[contains(text(),'Geb.')]/parent::*/text()")\
            .extract()
        if birth:
            birth = birth[0]
            bio['birthdate'] = _clean(birth.split(',')[0])
            try:
                bio['birthdate'] = datetime.datetime.strptime(
                    bio['birthdate'], "%d.%m.%Y").date()
            except ValueError:
                # Narrowed from a bare except: only a failed date
                # parse is expected here.
                logger.error("Failed to parse birthdate: {}".format(
                    bio['birthdate']))
                bio['birthdate'] = None
            if len(birth.split(',')) > 1:
                bio['birthplace'] = birth.split(',')[1]
        # Death Data
        death = Selector(text=data)\
            .xpath("//em[contains(text(),'Verst.')]/parent::*/text()")\
            .extract()
        if death:
            death = death[0]
            bio['deathdate'] = _clean(death.split(',')[0])
            try:
                bio['deathdate'] = datetime.datetime.strptime(
                    bio['deathdate'], "%d.%m.%Y").date()
            except ValueError:
                logger.error("Failed to parse deathdate: {}".format(
                    bio['deathdate']))
                bio['deathdate'] = None
            if len(death.split(',')) > 1:
                bio['deathplace'] = death.split(',')[1]
        # Occupation
        occupation = Selector(text=data)\
            .xpath("//em[contains(text(),'Beruf')]/parent::*/text()")\
            .extract()
        if occupation:
            occupation = occupation[0]
            bio['occupation'] = occupation.split(',')[0]
    return bio
def xt(cls, response):
    """Extract government members with their administration mandate.

    Header rows (containing <th>) are skipped. Returns a list of dicts
    with source_link, reversed_name and mandate.
    """
    persons = []
    raw_persons = response.xpath(cls.XPATH).extract()
    for raw_person in raw_persons:
        person = Selector(text=raw_person)
        if person.xpath('//th'):
            continue
        source_link = person.xpath(
            '//td//a/@href').extract()[0]
        reversed_name = _clean(
            Selector(
                text=remove_tags(raw_person, 'img')
            ).xpath('//td//a/text()').extract()[0])
        # "X siehe Y" rows are cross references; keep the target name.
        if ' siehe ' in reversed_name:
            reversed_name = reversed_name.split(' siehe ')[1]
        admin_title = person.xpath(
            '//td[1]/span/text()').extract()
        (admin_start_date, admin_end_date) = cls.xt_admin_date(
            raw_person)
        administration = {
            'title': admin_title,
            'start_date': admin_start_date,
            'end_date': admin_end_date
        }
        # TODO EXTRACT DATE(S) FROM BUNDESMINISTERIUM td
        # TODO ADD EITHER DATE(S) TO FUNCTION
        # Defaults keep `mandate` well-defined even when the third
        # cell matches neither branch (previously a NameError risk).
        function_short = u''
        function_title = u''
        try:
            if person.xpath('//tr//td[3]/span/text()'):
                function_short = person.xpath(
                    '//td[3]/span/text()').extract()[0]
                function_title = person.xpath(
                    '//td[3]/span/@title').extract()[0]
            elif person.xpath('//tr//td[3]/text()'):
                function_short = _clean(person.xpath(
                    '//td[3]/text()').extract()[0])
                function_title = ''
        except IndexError:
            # Was a bare except that dropped into an ipdb debugger;
            # log and keep the defaults instead of halting the scraper.
            logger.error(
                u"Failed to extract function from row: {}".format(
                    raw_person))
        mandate = {
            'short': function_short,
            'title': function_title,
            'administration': administration}
        persons.append({
            'source_link': source_link,
            'reversed_name': reversed_name,
            'mandate': mandate,
        })
    return persons
def xt(cls, response):
    """Extract political mandates (function, party, period) from the
    profile's mandate list items.
    """
    mandates_raw = response.xpath(cls.XPATH).extract()
    mandates = []
    for mandate in mandates_raw:
        mandate = _clean(remove_tags(mandate, 'li'))
        if "<div" in mandate and "</div>" in mandate:
            mandate = _clean(
                remove_tags(
                    Selector(
                        text=mandate).xpath("//div").extract()[0],
                    'div'))
        function = mandate.split(u'<br>')[0].split(',')[0]
        party = mandate.split(u'<br>')[0].split(',')[1]
        # Start Date
        # Pre-initialising avoids a NameError inside the handler when
        # the '<br>' split itself fails before any assignment.
        start_date = None
        try:
            start_date = _clean(
                mandate.split('<br>')[1].split(u'\u2013')[0])
            start_date = datetime.datetime.strptime(
                start_date, "%d.%m.%Y").date()
        except (ValueError, IndexError):
            logger.error(
                u"Failed to parse mandate start date: {}".format(
                    start_date))
            start_date = None
        # End Date
        end_date = None
        try:
            end_date = mandate.split('<br>')[1].split(u'\u2013')
            if len(end_date) > 1 and end_date[1]:
                end_date = datetime.datetime.strptime(
                    _clean(end_date[1]), "%d.%m.%Y").date()
            else:
                end_date = None
        except (ValueError, IndexError):
            logger.error(
                u"Failed to parse mandate end date: {}".format(
                    end_date))
            end_date = None
        mandates.append({
            'function': function,
            'party': _clean(party),
            'start_date': start_date,
            'end_date': end_date,
        })
    return mandates
def xt(cls, response):
    """Extract mandates (function, party, start/end date) from the
    list items of the profile page.
    """
    mandates_raw = response.xpath(cls.XPATH).extract()
    mandates = []
    for mandate in mandates_raw:
        mandate = _clean(remove_tags(mandate, 'li'))
        if "<div" in mandate and "</div>" in mandate:
            mandate = _clean(remove_tags(
                Selector(text=mandate).xpath("//div").extract()[0],
                'div'))
        function = mandate.split(u'<br>')[0].split(',')[0]
        party = mandate.split(u'<br>')[0].split(',')[1]
        # Start Date -- pre-initialised so the logging call in the
        # handler cannot hit a NameError when the split itself fails.
        start_date = None
        try:
            start_date = _clean(
                mandate.split('<br>')[1].split(u'\u2013')[0])
            start_date = datetime.datetime.strptime(
                start_date, "%d.%m.%Y").date()
        except (ValueError, IndexError):
            logger.error(
                u"Failed to parse mandate start date: {}".format(start_date))
            start_date = None
        # End Date
        end_date = None
        try:
            end_date = mandate.split(
                '<br>')[1].split(u'\u2013')
            if len(end_date) > 1 and end_date[1]:
                end_date = datetime.datetime.strptime(
                    _clean(end_date[1]), "%d.%m.%Y").date()
            else:
                end_date = None
        except (ValueError, IndexError):
            logger.error(
                u"Failed to parse mandate end date: {}".format(end_date))
            end_date = None
        mandates.append({
            'function': function,
            'party': _clean(party),
            'start_date': start_date,
            'end_date': end_date,
        })
    return mandates
def xt(cls, response):
    """Extract the creators of a petition (PET) or citizens' initiative
    (BI) as a list of (parl_id, name) tuples.

    PETs are started by members of parliament (linked, with a parl_id);
    BIs carry only a first signer's name; VBGs have no visible starter.
    """
    XPATH_BI_creator = cls.XPATH.format("Erstunterzeichner")
    XPATH_PET_creator = cls.XPATH.format("eine Petition")
    creators = []
    raw_creators_list = response.xpath(XPATH_PET_creator).extract()
    if len(raw_creators_list) > 0:
        # PET started by members of parliament
        for raw_creator in raw_creators_list:
            creator_sel = Selector(text=raw_creator)
            raw_parl_id_url = creator_sel.xpath("//a/@href").extract()
            name = u''
            parl_id = u''
            if len(raw_parl_id_url) > 0:
                raw_parl_id = raw_parl_id_url[0].split("/")
                # The id sits at index 2 of the split href, so the
                # guard must require at least three segments; the old
                # check (> 1) left an IndexError window.
                if len(raw_parl_id) > 2:
                    parl_id = raw_parl_id[2]
            raw_name = creator_sel.xpath("//a/text()").extract()
            if len(raw_name) > 0:
                name = raw_name[0]
            if parl_id != u'' and name != u'':
                creators.append((parl_id, name))
    else:
        raw_creators_list = response.xpath(XPATH_BI_creator).extract()
        if len(raw_creators_list) > 0:
            # BI first signed by a person
            name = _clean(raw_creators_list[0].split("\t")[1])
            creators.append(("", name))
        # VBG seem to have no visible "starter"
    return creators
def xt(cls, response):
    """Extract one person per table row: profile link, reversed name,
    party mandates (2nd column) and electoral state (last column).
    """
    persons = []
    for raw_person in response.xpath(cls.XPATH).extract():
        row = Selector(text=raw_person)
        source_link = row.xpath('//td//a/@href').extract()[0]
        reversed_name = _clean(
            row.xpath('//td//a/text()').extract()[0])
        # "X siehe Y" rows are cross references; keep the target.
        if ' siehe ' in reversed_name:
            reversed_name = reversed_name.split(' siehe ')[1]
        mandates = []
        for party_span in row.xpath('//td[2]//span').extract():
            span_sel = Selector(text=party_span)
            mandates.append({
                'short': span_sel.xpath('//span/text()').extract()[0],
                'title': span_sel.xpath('//span/@title').extract()[0]})
        electoral_state = {
            'short': row.xpath(
                '//td[last()]//span/text()').extract()[0],
            'long': row.xpath(
                '//td[last()]//span/@title').extract()[0]}
        persons.append({
            'source_link': source_link,
            'reversed_name': reversed_name,
            'mandates': mandates,
            'electoral_state': electoral_state,
        })
    return persons
def xt(cls, response):
    """Extract Rechnungshof presidents with their term as a fixed
    'RH-PräsidentIn' mandate; header rows (<th>) are skipped.
    """
    persons = []
    for raw_person in response.xpath(cls.XPATH).extract():
        row = Selector(text=raw_person)
        if row.xpath('//th'):
            continue  # header row
        link = row.xpath('//td//a/@href').extract()[0]
        # Strip <img> tags first so only the name text remains.
        name = _clean(
            Selector(text=remove_tags(raw_person, 'img'))
            .xpath('//td//a/text()').extract()[0])
        start, end = cls.xt_pres_date(raw_person)
        persons.append({
            'source_link': link,
            'reversed_name': name,
            'mandate': {
                'title': u'RechnungshofpräsidentIn',
                'short': u'RH-PräsidentIn',
                'start_date': start,
                'end_date': end
            },
        })
    return persons
def xt(cls, response):
    """Extract persons with their party mandates and electoral state
    (4th column) from the member table.
    """
    persons = []
    raw_persons = response.xpath(cls.XPATH).extract()
    for raw_person in raw_persons:
        sel = Selector(text=raw_person)
        source_link = sel.xpath('//td//a/@href').extract()[0]
        reversed_name = _clean(
            sel.xpath('//td//a/text()').extract()[0])
        if ' siehe ' in reversed_name:
            # cross reference row; keep the referenced name
            reversed_name = reversed_name.split(' siehe ')[1]
        mandates = [
            {'short': Selector(
                text=span).xpath('//span/text()').extract()[0],
             'title': Selector(
                text=span).xpath('//span/@title').extract()[0]}
            for span in sel.xpath('//td[2]//span').extract()]
        electoral_state = {
            'short': sel.xpath('//td[4]//span/text()').extract()[0],
            'long': sel.xpath('//td[4]//span/@title').extract()[0]}
        persons.append({
            'source_link': source_link,
            'reversed_name': reversed_name,
            'mandates': mandates,
            'electoral_state': electoral_state,
        })
    return persons
def xt(cls, step_selector):
    """Return the step title; debate steps ("Wortmeldungen") yield the
    parsed statement rows as well.
    """
    title_cell = step_selector.xpath("//td[2]")[0]
    if title_cell.xpath("//table"):
        statements = []
        table = title_cell.xpath("//table")[0]
        for index, markup in enumerate(
                table.xpath("//tbody//tr").extract()):
            sel = Selector(text=markup)
            person_links = sel.xpath(cls.XP_P_LINK).extract()
            if not person_links:
                continue  # no speaker link -> nothing to record
            type_cells = sel.xpath(cls.XP_T_TYPE).extract()
            if not type_cells:
                continue  # no statement type -> skip row
            prot_cells = sel.xpath(cls.XP_PROT_TEXT).extract()
            if prot_cells:
                protocol_text = _clean(
                    remove_tags(prot_cells[0], "td a"))
            else:
                protocol_text = []
            statements.append({
                "index": index,
                "person_source_link": person_links[0],
                "person_name": sel.xpath(cls.XP_P_NAME).extract(),
                "statement_type": _clean(type_cells[0]),
                "protocol_link": sel.xpath(cls.XP_PROT_LINK).extract(),
                "protocol_text": protocol_text,
            })
        return {"text": u"Wortmeldungen in der Debatte",
                "statements": statements}
    raw_cell = step_selector.xpath(cls.XPATH).extract()[0]
    text = _clean(remove_tags(raw_cell, "td")).replace(
        '<a href="', '<a href="{}'.format(BASE_HOST))
    return {"text": text}
def xt(cls, response):
    """Return False as soon as a row's second cell reads
    'Auflösung' (dissolution), True otherwise.
    """
    for row in response.xpath(cls.XPATH):
        cell_text = row.xpath('td[2]/text()').extract()
        if cell_text and _clean(cell_text[0]) == u'Aufl\xf6sung':
            return False
    return True
def xt(cls, response):
    """Parse the signature table into per-person dicts.

    Missing columns fall back to empty strings; a missing date column
    falls back to the Unix epoch.
    """
    signatures = []
    for markup in response.xpath(cls.XPATH).extract():
        cells = Selector(text=markup).xpath('//td/text()').extract()
        if len(cells) == 0:
            continue
        entry = {
            'full_name': _clean(cells[0]),
            'postal_code': u'',
            'location': u'',
            'date': datetime.date.fromtimestamp(0),
        }
        if len(cells) > 1:
            entry['postal_code'] = _clean(cells[1])
        if len(cells) > 2:
            entry['location'] = _clean(cells[2])
        if len(cells) > 3:
            parsed = time.strptime(_clean(cells[3]), '%d.%m.%Y')
            entry['date'] = datetime.date.fromtimestamp(
                time.mktime(parsed))
        signatures.append(entry)
    return signatures
def xt(cls, response):
    """Collect laws and reports linked from the page.

    Entries without an href are skipped (the link is needed to derive
    llp and parl_id); entries whose llp or parl_id comes back empty are
    dropped as well.
    """
    laws = []
    raw_laws = response.xpath(cls.XPATH_LAWS) + \
        response.xpath(cls.XPATH_REPORTS)
    for raw_law in raw_laws:
        titles = raw_law.xpath('text()').extract()
        law_title = _clean(titles[0]) if titles else u''
        links = raw_law.xpath('@href').extract()
        if not links:
            # without a link we can't get the necessary info
            continue
        law_llp, law_parl_id = COMITTEE.url_to_parlid(links[0])
        law_link = "{}/{}".format(BASE_HOST, links[0])
        if law_llp != u'' and law_parl_id != u'':
            laws.append({
                'title': law_title,
                'source_link': law_link,
                'parl_id': law_parl_id,
                'llp': law_llp,
            })
    return laws
def xt(cls, response):
    """Extract mandates including legislative period(s) (llp).

    The function text may carry "(<roman>. GP)" or a
    "(<roman>–<roman>. GP)" range; one mandate entry is emitted per
    period in the range (llp is None when no period is given).
    """
    mandates_raw = response.xpath(cls.XPATH).extract()
    mandates = []
    for mandate in mandates_raw:
        mandate = _clean(remove_tags(mandate, 'li'))
        if "<div" in mandate and "</div>" in mandate:
            mandate = _clean(remove_tags(
                Selector(text=mandate).xpath("//div").extract()[0],
                'div'))
        first_segment = mandate.split(u'<br>')[0]
        function = first_segment.split(',')[0]
        # The comma guard previously tested split(u'<br />') while the
        # indexing used split(u'<br>'): on '<br>' markup the guard saw
        # the WHOLE string, so a comma after the first line could still
        # raise IndexError. Test the same segment we index.
        party = first_segment.split(',')[1] \
            if ',' in first_segment else ''
        llp_raw = re.match(r'^.*\((.*)\. GP\).*$', function)
        function = re.sub(r'\((.*)\. GP\)', '', function).strip()
        m_llp_roman_begin = \
            m_llp_roman_end = \
            llp_raw.group(1) if llp_raw else ''
        if u'–' in m_llp_roman_begin:
            m_llp_roman_begin, m_llp_roman_end = \
                m_llp_roman_begin.split(u'–')
        if m_llp_roman_begin:
            llp_range = range(
                roman.fromRoman(m_llp_roman_begin.strip('. ')),
                roman.fromRoman(m_llp_roman_end.strip('. ')) + 1)
        else:
            llp_range = [None]
        for llp in llp_range:
            llp_roman = roman.toRoman(llp) if llp else None
            # Start Date -- pre-initialised so the logging call in the
            # handler cannot raise a NameError when the split fails.
            start_date = None
            try:
                start_date = _clean(
                    mandate.split('<br>')[1].split(u'\u2013')[0])
                start_date = datetime.datetime.strptime(
                    start_date, "%d.%m.%Y").date()
            except (ValueError, IndexError):
                logger.error(
                    u"Failed to parse mandate start date: {}".format(
                        start_date))
                start_date = None
            # End Date
            end_date = None
            try:
                end_date = mandate.split(
                    '<br>')[1].split(u'\u2013')
                if len(end_date) > 1 and end_date[1]:
                    end_date = datetime.datetime.strptime(
                        _clean(end_date[1]), "%d.%m.%Y").date()
                else:
                    end_date = None
            except (ValueError, IndexError):
                logger.error(
                    u"Failed to parse mandate end date: {}".format(
                        end_date))
                end_date = None
            mandates.append({
                'function': function,
                'party': _clean(party),
                'start_date': start_date,
                'end_date': end_date,
                'llp': llp,
                'llp_roman': llp_roman,
            })
    return mandates
def xt(cls, response):
    """Extract comittee memberships grouped under NR/BR headers.

    Each header anchor names the chamber (and, for the Nationalrat,
    the legislative period); the following table lists one membership
    per row with function, comittee link/name and a from-to range.
    Rows whose comittee parl_id cannot be derived are dropped.
    """
    raw_memberships = response.xpath(cls.XPATH)
    memberships = []
    for raw_membership in raw_memberships:
        raw_llp = raw_membership.xpath('a[1]/text()').extract()[1]
        nrbr = u'Nationalrat'
        comittee_llp = None
        if nrbr in raw_llp:
            # e.g. "... XXV. GP" -> "XXV"
            comittee_llp = raw_llp.split()[-2][:-1]
        else:
            nrbr = u'Bundesrat'
        tablerows = raw_membership.xpath(
            'following-sibling::div[1]/table[1]/tbody/tr').extract()
        last_function = u''
        for row in tablerows:
            row_sel = Selector(text=row)
            raw_function = row_sel.xpath(
                '//td[@class="biogr_am_funktext"]/text()').extract()
            if len(raw_function) > 0:
                function = _clean(raw_function[0])
                # TODO: standardization of functions should be done on model level
                last_function = function
            else:
                # Rows without a function cell inherit the previous one.
                function = last_function
            raw_comittee_link = row_sel.xpath(
                '//td[@class="biogr_am_ausschuss"]/a/@href').extract()
            if raw_comittee_link:
                comittee_link = raw_comittee_link[0]
                # hrefs already start with '/'; "{}{}" matches the
                # sibling extractor and avoids the double slash the
                # old "{}/{}" join produced.
                comittee_link = "{}{}".format(BASE_HOST, comittee_link)
            else:
                comittee_link = u''
            _, comittee_parl_id = COMITTEE.url_to_parlid(comittee_link)
            raw_comitee_name = row_sel.xpath(
                '//td[@class="biogr_am_ausschuss"]/a/text()').extract()
            if len(raw_comitee_name) > 0:
                comittee_name = _clean(raw_comitee_name[0])
            else:
                raw_comitee_name = row_sel.xpath(
                    '//td[@class="biogr_am_ausschuss"]/text()').extract()
                if len(raw_comitee_name) > 0:
                    comittee_name = _clean(raw_comitee_name[0])
                else:
                    comittee_name = u''
            raw_dates = row_sel.xpath(
                '//td[@class="biogr_am_vonbis"]/text()').extract()[0]
            date_from = None
            date_to = None
            if raw_dates:
                raw_dates = _clean(raw_dates)
                # \u2013 == - (dash)
                raw_dates = raw_dates.split(u'\u2013')
                if len(raw_dates) > 0:
                    raw_from = raw_dates[0]
                    # '!=' instead of 'is not': identity comparison
                    # against a string literal is unreliable.
                    if raw_from != u'':
                        raw_from = time.strptime(raw_from, '%d.%m.%Y')
                        date_from = datetime.datetime.fromtimestamp(
                            time.mktime(raw_from))
                if len(raw_dates) > 1:
                    raw_to = raw_dates[1]
                    if raw_to != u'':
                        raw_to = time.strptime(raw_to, '%d.%m.%Y')
                        date_to = datetime.datetime.fromtimestamp(
                            time.mktime(raw_to))
            # we cant add the membership if the parl_id of the comitee is empty
            if comittee_parl_id != u'':
                memberships.append({
                    'comittee': {
                        'name': comittee_name,
                        'parl_id': comittee_parl_id,
                        'nrbr': nrbr,
                        'legislative_period': comittee_llp,
                        'source_link': comittee_link
                    },
                    'function': function,
                    'date_from': date_from,
                    'date_to': date_to
                })
    return memberships
def xt(cls, response):
    """Extract comittee memberships (chamber, llp, function, comittee,
    from/to dates) from the biography's membership sections.

    Rows whose comittee parl_id cannot be derived are dropped.
    """
    raw_memberships = response.xpath(cls.XPATH)
    memberships = []
    for raw_membership in raw_memberships:
        raw_llp = raw_membership.xpath('a[1]/text()').extract()[1]
        nrbr = u'Nationalrat'
        comittee_llp = None
        if nrbr in raw_llp:
            # e.g. "... XXV. GP" -> "XXV"
            comittee_llp = raw_llp.split()[-2][:-1]
        else:
            nrbr = u'Bundesrat'
        tablerows = raw_membership.xpath(
            'following-sibling::div[1]/table[1]/tbody/tr').extract()
        last_function = u''
        for row in tablerows:
            row_sel = Selector(text=row)
            raw_function = row_sel.xpath(
                '//td[@class="biogr_am_funktext"]/text()').extract()
            if len(raw_function) > 0:
                function = _clean(raw_function[0])
                # TODO: standardization of functions should be done on model level
                last_function = function
            else:
                # Rows without a function cell inherit the previous one.
                function = last_function
            raw_comittee_link = row_sel.xpath(
                '//td[@class="biogr_am_ausschuss"]/a/@href').extract()
            if raw_comittee_link:
                comittee_link = raw_comittee_link[0]
                comittee_link = "{}{}".format(BASE_HOST, comittee_link)
            else:
                comittee_link = u''
            _, comittee_parl_id = COMITTEE.url_to_parlid(comittee_link)
            raw_comitee_name = row_sel.xpath(
                '//td[@class="biogr_am_ausschuss"]/a/text()').extract(
            )
            if len(raw_comitee_name) > 0:
                comittee_name = _clean(raw_comitee_name[0])
            else:
                raw_comitee_name = row_sel.xpath(
                    '//td[@class="biogr_am_ausschuss"]/text()'
                ).extract()
                if len(raw_comitee_name) > 0:
                    comittee_name = _clean(raw_comitee_name[0])
                else:
                    comittee_name = u''
            raw_dates = row_sel.xpath(
                '//td[@class="biogr_am_vonbis"]/text()').extract()[0]
            date_from = None
            date_to = None
            if raw_dates:
                raw_dates = _clean(raw_dates)
                # \u2013 == - (dash)
                raw_dates = raw_dates.split(u'\u2013')
                if len(raw_dates) > 0:
                    raw_from = raw_dates[0]
                    # '!=' replaces 'is not': identity comparison with
                    # a string literal is unreliable.
                    if raw_from != u'':
                        raw_from = time.strptime(raw_from, '%d.%m.%Y')
                        date_from = datetime.datetime.fromtimestamp(
                            time.mktime(raw_from))
                if len(raw_dates) > 1:
                    raw_to = raw_dates[1]
                    if raw_to != u'':
                        raw_to = time.strptime(raw_to, '%d.%m.%Y')
                        date_to = datetime.datetime.fromtimestamp(
                            time.mktime(raw_to))
            # we cant add the membership if the parl_id of the comitee is empty
            if comittee_parl_id != u'':
                memberships.append({
                    'comittee': {
                        'name': comittee_name,
                        'parl_id': comittee_parl_id,
                        'nrbr': nrbr,
                        'legislative_period': comittee_llp,
                        'source_link': comittee_link
                    },
                    'function': function,
                    'date_from': date_from,
                    'date_to': date_to
                })
    return memberships
def xt(cls, response):
    """Extract comittee meetings: date, number, agenda document and the
    list of agenda topics (optionally linked to a law).
    """
    raw_meetings = response.xpath(cls.XPATH)
    meetings = []
    for raw_meeting in raw_meetings:
        raw_header_row = raw_meeting.xpath('tr[@class="historyHeader"]')
        raw_date = raw_header_row.xpath('td[1]/text()').extract()
        meeting_date = None
        if len(raw_date) > 0:
            raw_date = _clean(raw_date[0])
            # '!=' instead of 'is not': identity comparison against a
            # string literal is unreliable.
            if raw_date != u'':
                raw_date = time.strptime(raw_date, '%d.%m.%Y')
                meeting_date = datetime.datetime.fromtimestamp(
                    time.mktime(raw_date))
        raw_number = raw_header_row.xpath('td[2]/em/a/text()').extract()
        if len(raw_number) > 0 and u'Sitzung' in raw_number[0]:
            meeting_number = raw_number[0].split()[0][:-1]
        else:
            continue  # not a meeting
        raw_document_urls = raw_header_row.xpath(
            'td[2]/a/@href').extract()
        html_link, pdf_link = u"", u""
        for url in raw_document_urls:
            if url.endswith('.pdf'):
                pdf_link = url
                if not pdf_link.startswith(BASE_HOST):
                    pdf_link = "{}/{}".format(BASE_HOST, pdf_link)
            elif url.endswith('.html'):
                html_link = url
                if not html_link.startswith(BASE_HOST):
                    html_link = "{}/{}".format(BASE_HOST, html_link)
        # Guard against a missing meeting date (previously an
        # AttributeError on None).
        title = u'Tagesordnung der {}. Sitzung des {} am {}'\
            .format(meeting_number,
                    COMITTEE.NAME.xt(response),
                    str(meeting_date.date()) if meeting_date else u'')
        if html_link != u'' or pdf_link != u'':
            meeting_document = {
                'title': title,
                'html_link': html_link,
                'pdf_link': pdf_link
            }
        else:
            meeting_document = None
        raw_rows = raw_header_row.xpath('following-sibling::tr')
        meeting_topics = []
        for raw_row in raw_rows:
            raw_topic_number = raw_row.xpath('td[1]/text()').extract()
            topic_number = 0
            if len(raw_topic_number) > 0:
                topic_number_list = _clean(raw_topic_number[0]).split()
                if len(topic_number_list) == 2 and \
                        topic_number_list[0] == u'TOP':
                    topic_number = int(topic_number_list[1])
            raw_topic_text = raw_row.xpath('td[2]/text()').extract()
            if len(raw_topic_text) > 0:
                topic_text = _clean(raw_topic_text[0])
                # Dangling '(' before a law link is stripped here and
                # re-added around the link text below.
                if topic_text.endswith('('):
                    topic_text = topic_text[:-1].rstrip()
            else:
                topic_text = u''
            if len(raw_topic_text) > 1:
                topic_comment = _clean(raw_topic_text[1])
                if topic_comment.startswith(')'):
                    # NOTE(review): this strips the LAST character even
                    # though the test matches a leading ')'; looks like
                    # it should be [1:] -- kept as-is pending
                    # confirmation against real pages.
                    topic_comment = topic_comment[:-1].lstrip()
            else:
                topic_comment = u''
            raw_topic_law_text = raw_row.xpath(
                'td[2]/a/text()').extract()
            if len(raw_topic_law_text) > 0:
                topic_law_text = u'({})'.format(raw_topic_law_text[0])
            else:
                topic_law_text = u''
            topic_text = u'{} {}'.format(topic_text, topic_law_text)
            raw_topic_law_link = raw_row.xpath(
                'td[2]/a/@href').extract()
            if len(raw_topic_law_link) > 0:
                topic_law_llp, topic_law_id = COMITTEE.url_to_parlid(
                    raw_topic_law_link[0])
            else:
                topic_law_llp, topic_law_id = u'', u''
            if topic_law_id != u'':
                topic_law = {
                    'parl_id': topic_law_id,
                    'llp': topic_law_llp
                }
            else:
                topic_law = None
            meeting_topics.append({
                'number': topic_number,
                'text': topic_text,
                'comment': topic_comment,
                'law': topic_law
            })
        meetings.append({
            'number': meeting_number,
            'date': meeting_date,
            'agenda': meeting_document,
            'topics': meeting_topics,
        })
    return meetings
def xt(cls, response):
    """Extract comittee meetings (number, date, agenda document and
    per-topic entries) from the history table.
    """
    raw_meetings = response.xpath(cls.XPATH)
    meetings = []
    for raw_meeting in raw_meetings:
        raw_header_row = raw_meeting.xpath(
            'tr[@class="historyHeader"]')
        raw_date = raw_header_row.xpath('td[1]/text()').extract()
        if len(raw_date) > 0:
            raw_date = _clean(raw_date[0])
            # '!=' replaces 'is not': identity comparison against a
            # string literal is unreliable.
            if raw_date != u'':
                raw_date = time.strptime(raw_date, '%d.%m.%Y')
                meeting_date = datetime.datetime.fromtimestamp(
                    time.mktime(raw_date))
            else:
                meeting_date = None
        else:
            meeting_date = None
        raw_number = raw_header_row.xpath(
            'td[2]/em/a/text()').extract()
        if len(raw_number) > 0 and u'Sitzung' in raw_number[0]:
            meeting_number = raw_number[0].split()[0][:-1]
        else:
            continue  # not a meeting
        raw_document_urls = raw_header_row.xpath(
            'td[2]/a/@href').extract()
        html_link, pdf_link = u"", u""
        for url in raw_document_urls:
            if url.endswith('.pdf'):
                pdf_link = url
                if not pdf_link.startswith(BASE_HOST):
                    pdf_link = "{}/{}".format(BASE_HOST, pdf_link)
            elif url.endswith('.html'):
                html_link = url
                if not html_link.startswith(BASE_HOST):
                    html_link = "{}/{}".format(BASE_HOST, html_link)
        # Guard against a missing meeting date (previously an
        # AttributeError on None).
        title = u'Tagesordnung der {}. Sitzung des {} am {}'\
            .format(meeting_number,
                    COMITTEE.NAME.xt(response),
                    str(meeting_date.date()) if meeting_date else u'')
        if html_link != u'' or pdf_link != u'':
            meeting_document = {
                'title': title,
                'html_link': html_link,
                'pdf_link': pdf_link
            }
        else:
            meeting_document = None
        raw_rows = raw_header_row.xpath('following-sibling::tr')
        meeting_topics = []
        for raw_row in raw_rows:
            raw_topic_number = raw_row.xpath('td[1]/text()').extract()
            if len(raw_topic_number) > 0:
                topic_number_list = _clean(raw_topic_number[0]).split()
                if len(topic_number_list
                       ) == 2 and topic_number_list[0] == u'TOP':
                    topic_number = int(topic_number_list[1])
                else:
                    topic_number = 0
            else:
                topic_number = 0
            raw_topic_text = raw_row.xpath('td[2]/text()').extract()
            if len(raw_topic_text) > 0:
                topic_text = _clean(raw_topic_text[0])
                if topic_text.endswith('('):
                    topic_text = topic_text[:-1].rstrip()
            else:
                topic_text = u''
            if len(raw_topic_text) > 1:
                topic_comment = _clean(raw_topic_text[1])
                if topic_comment.startswith(')'):
                    # NOTE(review): strips the LAST character although
                    # the test matches a leading ')'; likely should be
                    # [1:] -- kept as-is pending confirmation.
                    topic_comment = topic_comment[:-1].lstrip()
            else:
                topic_comment = u''
            raw_topic_law_text = raw_row.xpath(
                'td[2]/a/text()').extract()
            if len(raw_topic_law_text) > 0:
                topic_law_text = u'({})'.format(raw_topic_law_text[0])
            else:
                topic_law_text = u''
            topic_text = u'{} {}'.format(topic_text, topic_law_text)
            raw_topic_law_link = raw_row.xpath(
                'td[2]/a/@href').extract()
            if len(raw_topic_law_link) > 0:
                topic_law_llp, topic_law_id = COMITTEE.url_to_parlid(
                    raw_topic_law_link[0])
            else:
                topic_law_llp, topic_law_id = u'', u''
            if topic_law_id != u'':
                topic_law = {
                    'parl_id': topic_law_id,
                    'llp': topic_law_llp
                }
            else:
                topic_law = None
            topic = {
                'number': topic_number,
                'text': topic_text,
                'comment': topic_comment,
                'law': topic_law
            }
            meeting_topics.append(topic)
        meeting = {
            'number': meeting_number,
            'date': meeting_date,
            'agenda': meeting_document,
            'topics': meeting_topics,
        }
        meetings.append(meeting)
    return meetings