def scrape_membership(self): # Returns chambers membership list with the basic information data # for each member of every chamber for Armenia's parliament. print "\n\tScraping membership's data from Armenia's parliament...\n" mps = self.members_list() memberships = [] roles = self.membership_correction() chambers = {} all_chambers = vpapi.getall("organizations", where={"classification": "chamber"}) for chamber in all_chambers: chambers[chamber['identifiers'][0]["identifier"]] = chamber['id'] members = {} all_members = vpapi.getall("people") for member in all_members: members[member['name']] = member['id'] widgets = [' Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA(), ' ', FileTransferSpeed(), ' '] pbar = ProgressBar(widgets=widgets) for member in pbar(mps): p_id = members[member['name']] o_id = chambers[member['term']] role = "" membership_label = member['membership'] if member['membership'].encode('utf-8') in roles: role = roles["անդամ"] url = "http://www.parliament.am/deputies.php?lang=arm&sel=full&ord=alpha&show_session=" + member['term'] membership_json = self.build_memberships_doc(p_id, o_id, membership_label, role, url) memberships.append(membership_json) print "\n\tScraping completed! \n\tScraped " + str(len(memberships)) + " members" return memberships
def scrape_membership(self): # Iterates in chamber member json document and # returns the list with the json document structure that Visegrad+ API accepts print "\n\tScraping chambers membership's data from Belarus Upperhouse parliament...\n" members = {} all_members = vpapi.getall("people") for member in all_members: members[member['name']] = member['id'] chambers = {} all_chambers = vpapi.getall("organizations", where={"classification": "chamber"}) for chamber in all_chambers: chambers[chamber['identifiers'][0]['identifier']] = chamber['id'] terms = parser.terms mps_list = parser.members_list() chambers_membership = [] widgets = [' Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA(), " - Processed: ", Counter(), ' items '] pbar = ProgressBar(widgets=widgets) for member in pbar(mps_list): p_id = members[member['name']] o_id = chambers[member['term']] url = terms[member['term']]['url'] membership_label = member['membership'] role = member['role'] chamber_membership_json = self.build_memberships_doc(p_id, o_id, membership_label, role, url) chambers_membership.append(chamber_membership_json) print "\n\tScraping completed! \n\tScraped " + str(len(chambers_membership)) + " members" return chambers_membership
def scrape_parliamentary_group_membership(self):
    # Returns parliamentary groups membership list with the basic information data
    # for each member of every parliamentary group for Armenia's parliament.
    print "\n\tScraping parliamentary groups membership from Armenia's parliament...\n"
    chambers = {}
    groups = {}
    members = {}
    # correction table: utf-8 encoded Armenian membership label -> role string
    memberships = self.membership_correction()
    # chamber identifier (term number) -> organization API id
    all_chambers = vpapi.getall("organizations", where={"classification": "chamber"})
    for chamber in all_chambers:
        chambers[chamber['identifiers'][0]["identifier"]] = chamber['id']
    # group source url -> organization API id (populated but unused below)
    all_groups = vpapi.getall('organizations', where={"classification": "parliamentary group"})
    for group in all_groups:
        groups[group['sources'][0]['url']] = group['id']
    # person name -> person API id
    all_members = vpapi.getall("people")
    for member in all_members:
        members[member['name']] = member['id']
    parties_membership = []
    widgets = [' Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA(), " - Processed: ", Counter(), ' items ']
    pbar = ProgressBar(widgets=widgets)
    # walk terms from the most recent one backwards
    for term in pbar(list(reversed(sorted(self.terms.keys())))):
        url = "http://www.parliament.am/deputies.php?lang=arm&sel=factions&SubscribeEmail=&show_session=" + str(term)
        soup = scrape.download_html_file(url)
        for each_div in soup.findAll('div', {"class": "content"}):
            party_name = each_div.find("center").find("b").get_text()
            # normalize the whitespace variant used on the site
            party_name_ordered = party_name.replace(" ", " ")
            exist = vpapi.getfirst("organizations", where={'name': party_name_ordered, "parent_id": chambers[str(term)]})
            if exist:
                o_id = exist['id']
                for each_tr in each_div.find('table', {"style": "margin-top:10px; margin-bottom:10px;"}).findAll('tr'):
                    # rows with a bgcolor are table headers, not members
                    if each_tr.has_attr('bgcolor'):
                        continue
                    else:
                        td_array = each_tr.findAll('td')
                        # name cell holds "Last First Middle"; reorder to "First Middle Last"
                        # NOTE(review): assumes exactly three name parts — an IndexError
                        # would be raised otherwise; confirm against live pages
                        names = td_array[0].find('a').get_text().split(' ')
                        first_name = names[1]
                        last_name = names[0]
                        middle_name = names[2]
                        name_ordered = "%s %s %s" % (first_name, middle_name, last_name)
                        membership = each_tr.find('span', {'class': "news_date"}).get_text()
                        if membership == "":
                            # empty label means ordinary member ("անդամ")
                            membership = "անդամ".decode('utf-8')
                        else:
                            # strip surrounding parentheses from the label
                            membership = membership[1:len(membership)-1]
                        role = memberships[membership.encode('utf-8')]
                        if name_ordered in members:
                            p_id = members[name_ordered]
                            party_membership_json = self.build_memberships_doc(p_id, o_id, membership, role, url)
                            parties_membership.append(party_membership_json)
    print "\n\tScraping completed! \n\tScraped " + str(len(parties_membership)) + " members of parliamentary groups"
    return parties_membership
def scrape_committee_members(self):
    # Returns committee groups membership list with the basic information data
    # for each member of every committee group for Armenia's parliament.
    print "\n\tScraping committee groups membership from Armenia's parliament...\n"
    committees = self.committee_list()
    committee_membership = []
    chambers = {}
    groups = {}
    members = {}
    # correction table: utf-8 encoded Armenian membership label -> role string
    memberships = self.membership_correction()
    # chamber identifier (term number) -> organization API id
    all_chambers = vpapi.getall("organizations", where={"classification": "chamber"})
    for chamber in all_chambers:
        chambers[chamber['identifiers'][0]["identifier"]] = chamber['id']
    # committee source url -> organization API id
    # ("committe" (sic) is the classification string used throughout this project)
    all_groups = vpapi.getall('organizations', where={"classification": "committe"})
    for group in all_groups:
        groups[group['sources'][0]['url']] = group['id']
    # person name -> person API id
    all_members = vpapi.getall("people")
    for member in all_members:
        members[member['name']] = member['id']
    widgets = [' Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA(), " - Processed: ", Counter(), ' items ']
    pbar = ProgressBar(widgets=widgets)
    for committee in pbar(committees):
        # the members page of a committee is the "show" page with "members" instead
        url = committee['url'].replace('show', "members")
        soup = scrape.download_html_file(url)
        for each_tr in soup.find('table', {"style": "margin-top:10px; margin-bottom:10px;"}).findAll('tr'):
            # rows with a bgcolor are table headers, not members
            if each_tr.has_attr('bgcolor'):
                continue
            else:
                td_array = each_tr.findAll('td')
                if td_array:
                    # name cell holds "Last First Middle"; reorder to "First Middle Last"
                    names = td_array[0].find('a').get_text().split(' ')
                    first_name = names[1]
                    last_name = names[0]
                    middle_name = names[2]
                    name_ordered = "%s %s %s" % (first_name, middle_name, last_name)
                    membership = each_tr.find('span', {'class': "news_date"}).get_text()
                    # NOTE(review): if url is missing from groups, o_id keeps the
                    # value from a previous row/committee (or is unbound on the
                    # first one) — confirm every committee url is registered
                    if url in groups:
                        o_id = groups[url]
                    if membership == "":
                        # empty label means ordinary member ("անդամ")
                        membership = "անդամ".decode('utf-8')
                    else:
                        # strip surrounding parentheses from the label
                        membership = membership[1:len(membership)-1]
                    role = memberships[membership.encode('utf-8')]
                    if name_ordered in members:
                        p_id = members[name_ordered]
                        party_membership_json = self.build_memberships_doc(p_id, o_id, membership, role, url)
                        committee_membership.append(party_membership_json)
    print "\n\tScraping completed! \n\tScraped " + str(len(committee_membership)) + " members of committee groups"
    return committee_membership
def get_group_id(self):
    # Maps every person's API id to the API id of the parliamentary group
    # his/her membership points to, or to None for non-group memberships.
    party_ids = [org['id'] for org in vpapi.getall(
        "organizations", where={"classification": "parliamentary group"})]
    person_to_group = {}
    for membership in vpapi.getall("memberships"):
        org_id = membership['organization_id']
        if org_id in party_ids:
            person_to_group[membership['person_id']] = org_id
        else:
            person_to_group[membership['person_id']] = None
    return person_to_group
def committe_list(self):
    # Returns the list of committee groups with basic information for each
    # NOTE(review): the method name is misspelled ("committe_list") — kept as-is
    # because callers elsewhere may depend on it; other classes in this project
    # define a correctly-spelled committee_list().
    committee_list = []
    # chamber identifier (term number) -> organization API id
    chambers_list = {}
    chambers_api = vpapi.getall("organizations", where={"classification": "chamber"})
    for chamber in chambers_api:
        chambers_list[chamber['identifiers'][0]['identifier']] = chamber['id']
    chambers = self.chambers_list()
    for term in chambers:
        soup = scrape.download_html_file(chambers[term]['url'])
        for each_h2 in soup.find("div", {"id": "committee_bm_info"}).findAll("h2"):
            name = each_h2.find("a").get_text()
            url = each_h2.find("a").get("href")
            start_date = chambers[term]['start_date']
            # term "5" is the current one, hence no end date
            if term != "5":
                end_date = chambers[term]['end_date']
            else:
                end_date = None
            # the committee identifier is a number embedded in the url;
            # when more than two numbers occur the second one is the identifier
            identifiers = re.findall(r'\d+', url)
            if len(identifiers) > 2:
                identifier = identifiers[1]
            else:
                identifier = identifiers[0]
            chamber_id = chambers_list[term]
            committee_json = {
                "identifier": identifier,
                "parent_id": chamber_id,
                "name": name,
                "url": url,
                "start_date": start_date,
                "end_date": end_date
            }
            committee_list.append(committee_json)
    return committee_list
def save(self, update_only=False):
    """If a compatible membership already exists, update it. Otherwise,
    create a new one. If `update_only` is True, only existing memberships
    are updated, no new one is created.

    Memberships are compatible if their fields `start_date`, `role` and
    `post` are compatible. Field 'end_date' is not checked to allow for
    later corrections of guessed end dates used when a member disappears
    from a group profile.
    """
    memberships = vpapi.getall('memberships',
        where={'person_id': self.person_id, 'organization_id': self.organization_id},
        sort='-start_date')
    to_save = self.__dict__.copy()
    id = None
    for existing in memberships:
        # all four conditions must hold for `existing` to be compatible;
        # _merge_values mutates to_save as a side effect
        if self._merge_values('start_date', to_save, existing) \
                and to_save.get('end_date', '9999-12-31') >= existing.get('start_date', '0001-01-01') \
                and self._merge_values('role', to_save, existing) \
                and self._merge_values('post', to_save, existing):
            id = existing['id']
            self._merge_values('end_date', to_save, existing)
            break
        else:
            # a partial merge may have modified to_save — start from
            # a fresh copy before trying the next candidate
            to_save = self.__dict__.copy()
    if id:
        resp = vpapi.put('memberships', id, to_save)
    else:
        if update_only:
            return
        resp = vpapi.post('memberships', self.__dict__)
    if resp['_status'] != 'OK':
        raise Exception(self.name, resp)
def update_motion_url(self): print "\n\tUpdating url of motions" motions = vpapi.getall("motions") counter = 0 widgets = [ " Progress: ", Percentage(), " ", Bar(marker="#", left="[", right="]"), " ", ETA(), " - Processed: ", Counter(), " events ", ] pbar = ProgressBar(widgets=widgets) for motion in motions: counter += 1 sources = motion["sources"] url = sources[0]["url"] print (str(counter)) if "http://w1.c1.rada.gov.ua" not in url: motion_id = motion["id"] motion["sources"][0]["url"] = "http://w1.c1.rada.gov.ua" + url items_to_delete = ["created_at", "updated_at", "_links", "id"] for item_delete in items_to_delete: del motion[item_delete] vpapi.put("motions", motion_id, motion, effective_date=self.effective_date()) else: continue print "\n\tFinished updating motions url"
def scrape_membership(self):
    # Returns chambers membership list with the basic information data
    # for each member of every chamber for Moldova's parliament.
    chamber_membership = []
    print "\n\tScraping chambers membership from Moldova's parliament..."
    mps_list = self.mps_list()
    # person identifier -> person API id
    members = {}
    # correction table: utf-8 encoded membership label -> role string
    membership_correction = self.membership_correction()
    all_members = vpapi.getall("people")
    for member in all_members:
        members[member['identifiers'][0]['identifier']] = member['id']
    # the single chamber is identified by "20" in the parlament.md scheme
    chamber_id = vpapi.getfirst("organizations", where={"identifiers": {
        "$elemMatch": {
            "identifier": "20",
            "scheme": "parlament.md"
        }
    }})
    deputy_list_url = "http://www.parlament.md/StructuraParlamentului/" \
                      "Deputies/tabid/87/language/ro-RO/Default.aspx"
    for member in mps_list:
        p_id = members[member['identifier']]
        role = membership_correction[member['membership'].encode('utf-8')]
        chamber_membership_json = self.build_memberships_doc(p_id, chamber_id['id'], member['membership'], role, deputy_list_url)
        chamber_membership.append(chamber_membership_json)
    print "\n\tScraping completed! \n\tScraped " + str(len(chamber_membership)) + " members of chambers \n"
    return chamber_membership
def get_all_member_ids_for_votes(self):
    # Maps each person's primary identifier to the person's API id,
    # used when matching voters while scraping votes.
    return dict(
        (person['identifiers'][0]['identifier'], person['id'])
        for person in vpapi.getall("people"))
def test1(self): motions = vpapi.getall("motions") counter = 0 for motion in motions: counter += 1 print counter print motion["id"] print "------------------------------------------------>"
def scrape_parliamentary_groups(self): # Scrapes parliamentary groups and Returns the list of # parliamentary groups with all the information needed for each parties_list = [] terms_ids = {} all_terms = vpapi.getall("organizations", where={"classification": "chamber"}) for term in all_terms: terms_ids[term['identifiers'][0]['identifier']] = term['id'] parties_doc = self.parliamentary_groups() print "\n\tScraping parliamentary groups from Armenia's parliament...\n" widgets = [' Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA(), " - Processed: ", Counter(), ' items '] pbar = ProgressBar(widgets=widgets) for term in pbar(parties_doc): url = "http://www.parliament.am/deputies.php?lang=arm&sel=factions&SubscribeEmail=&show_session=" + term soup = scrape.download_html_file(url) all_divs = soup.findAll('div', {"class": "content"}) for each_div in all_divs: name = each_div.find("center").find("b").get_text() name_ordered = name.replace(" ", " ") if name_ordered in parties_doc[term]: identifier = parties_doc[term][name_ordered]['identifier'] url_faction = parties_doc[term][name_ordered]['url'] founding_date = self.terms[term]["start_date"] parent_id = terms_ids[str(term)] if each_div.find("center").find("a"): email = each_div.find("center").find("a").get_text() if term != "5": dissolution_date = self.terms[term]["end_date"] else: dissolution_date = None party_json = self.build_organization_doc("parliamentary group", name_ordered, identifier, founding_date, dissolution_date, url_faction, email, parent_id) if not dissolution_date: del party_json['dissolution_date'] if not email or email == None: del party_json['contact_details'] if not identifier: del party_json['identifiers'] parties_list.append(party_json) else: print "term: %s \nname: %s" % (term, name_ordered) print "\n\tScraping completed! \n\tScraped " + str(len(parties_list)) + " parliametary groups" return parties_list
def scrape_committee_members(self):
    # Returns committee groups membership list with the basic information data
    # for each member of every committee group for Moldova's parliament.
    print "\n\tScraping committees membership from Moldova's parliament..."
    committees_list = self.committee_list()
    # correction table: utf-8 encoded Romanian membership label -> role string
    membership_correction = self.membership_correction()
    # committee identifier -> organization API id
    # ("committe" (sic) is the classification string used throughout this project)
    committees = {}
    all_committees = vpapi.getall("organizations", where={"classification": "committe"})
    for committe in all_committees:
        committees[committe['identifiers'][0]['identifier']] = committe['id']
    # person identifier -> person API id
    members = {}
    all_members = vpapi.getall("people")
    for member in all_members:
        members[member['identifiers'][0]['identifier']] = member['id']
    committees_membership = []
    widgets = [' Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA(), " - Processed members from: ", Counter(), ' committees ']
    pbar = ProgressBar(widgets=widgets)
    for committee in pbar(committees_list):
        committee_identifier = committee['identifier']
        soup_party = scrape.download_html_file(committee['url'])
        for each_tr in soup_party.find("fieldset", {"id": "dnn_ctr486_ViewCommissionPermanent_ctrlViewCommissionType_fsMembers"}).findAll('tr'):
            td_array = each_tr.findAll('td')
            # member page link looks like ".../Id/<identifier>/la...";
            # extract the identifier between those two markers
            link = td_array[1].find('a').get('href')
            index_start = link.index('/Id/') + 4
            index_end = link.index('/la')
            member_identifier = link[index_start:index_end]
            membership = td_array[2].get_text().strip()
            member_id = members[member_identifier]
            o_id = committees[committee_identifier]
            # empty label means ordinary member ("Membru")
            if membership == "":
                membership = "Membru"
            role = membership_correction[membership.encode('utf-8')]
            committees_membership_json = self.build_memberships_doc(member_id, o_id, membership, role, committee['url'])
            committees_membership.append(committees_membership_json)
    print "\n\tScraping completed! \n\tScraped " + str(len(committees_membership)) + " members of committee groups\n"
    return committees_membership
def scrape_parliamentary_group_membership(self):
    # Returns parliamentary groups membership list with the basic information data
    # for each member of every parliamentary group for Moldova's parliament.
    print "\n\tScraping parliamentary groups membership from Moldova's parliament..."
    parties_list = self.parliamentary_group_list()
    # correction table: utf-8 encoded Romanian membership label -> role string
    membership_correction = self.membership_correction()
    # group identifier -> organization API id
    parties = {}
    all_parties = vpapi.getall("organizations", where={'classification': "parliamentary group"})
    for party in all_parties:
        parties[party['identifiers'][0]['identifier']] = party['id']
    # person identifier -> person API id
    members = {}
    all_members = vpapi.getall("people")
    for member in all_members:
        members[member['identifiers'][0]['identifier']] = member['id']
    parties_membership = []
    widgets = [' Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA(), " - Processed members from: ", Counter(), ' parliamentary groups ']
    pbar = ProgressBar(widgets=widgets)
    for party in pbar(parties_list):
        party_identifier = party['identifier']
        soup_party = scrape.download_html_file(party['url'])
        for each_tr in soup_party.find("fieldset", {"id": "dnn_ctr482_ViewFraction_fsMembers"}).findAll('tr'):
            td_array = each_tr.findAll('td')
            # member page link looks like ".../Id/<identifier>/la...";
            # extract the identifier between those two markers
            link = td_array[1].find('a').get('href')
            index_start = link.index('/Id/') + 4
            index_end = link.index('/la')
            member_identifier = link[index_start:index_end]
            membership = td_array[2].get_text().strip()
            member_id = members[member_identifier]
            o_id = parties[party_identifier]
            # empty label means ordinary member ("Membru")
            if membership == "":
                membership = "Membru"
            role = membership_correction[membership.encode('utf-8')]
            party_membership_json = self.build_memberships_doc(member_id, o_id, membership, role, party['url'])
            parties_membership.append(party_membership_json)
    print "\n\tScraping completed! \n\tScraped " + str(len(parties_membership)) + " members of parties \n"
    return parties_membership
def scrape_membership(self): print "\n\tScraping chambers membership's data from Ukraine's parliament..." print "\tPlease wait. This may take a few moments...\n" members = {} all_members = vpapi.getall("people") for member in all_members: members[member["name"]] = member["id"] chambers = {} all_chambers = vpapi.getall("organizations", where={"classification": "chamber"}) for chamber in all_chambers: chambers[chamber["identifiers"][0]["identifier"]] = chamber["id"] terms = parser.chambers() mps_list = parser.mps_list() chambers_membership = [] widgets = [ " Progress: ", Percentage(), " ", Bar(marker="#", left="[", right="]"), " ", ETA(), " - Processed: ", Counter(), " items ", ] pbar = ProgressBar(widgets=widgets) for member in pbar(mps_list): if member["name"] in members: p_id = members[member["name"]] o_id = chambers[member["term"]] url = terms[member["term"]]["url"] membership_label = member["membership"] role = member["role"] chamber_membership_json = self.build_memberships_doc(p_id, o_id, membership_label, role, url) chambers_membership.append(chamber_membership_json) print "\n\tScraping completed! \n\tScraped " + str(len(chambers_membership)) + " members" return chambers_membership
def committee_membership(self):
    # Returns committee groups membership list with all needed information data
    # for each member of every committee group for Belarus Lower house parliament.
    committee_list = self.committee_list()
    # per-committee row indexes delimiting the chairman / vice-chairmen /
    # members sections of the membership table
    element_positions = self.committee_membership_list()
    committee_members = {}
    # person source url -> person API id
    members = {}
    all_members = vpapi.getall("people")
    for member in all_members:
        members[member['sources'][0]['url']] = member['id']
    widgets = [' Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA(), " - Processed: ", Counter(), ' committees ']
    pbar = ProgressBar(widgets=widgets)
    for committee in pbar(committee_list):
        # the membership page id is the committee identifier shifted by 2
        identifier = int(committee['identifier']) + 2
        url = committee['url'].replace(committee['identifier'], str(identifier))
        soup = scrape.download_html_file(url)
        all_tr_elements = soup.find("table", {"cellpadding": "2"}).findAll('tr')
        # drop the two trailing footer rows
        all_tr = all_tr_elements[:len(all_tr_elements) - 2]
        # role buckets: chairman / vice-chairmen / members (Belarusian keys
        # are runtime data consumed by the caller — do not translate)
        committee_members[committee['identifier']] = {}
        committee_members[committee['identifier']]["Старшыня"] = []
        committee_members[committee['identifier']]["Намеснікі старшыні"] = []
        committee_members[committee['identifier']]["Члены камісіі"] = []
        if committee['identifier'] in element_positions:
            index_start_first = element_positions[committee['identifier']][0]
            index_start_middle = element_positions[committee['identifier']][1]
            index_penultimate = element_positions[committee['identifier']][2]
            index_start_last = element_positions[committee['identifier']][3]
            # chairman section
            for each_tr in all_tr[index_start_first:index_start_middle]:
                if each_tr.find("a"):
                    # rewrite the person link onto the page id used when
                    # people were scraped, so it matches `members` keys
                    url = "http://house.gov.by/" + each_tr.find('a').get('href').replace("15489", "17041")
                    member_id = members[url]
                    committee_members[committee['identifier']]["Старшыня"].append(member_id)
            # vice-chairmen section
            for each_tr in all_tr[index_start_middle:index_penultimate]:
                if each_tr.find("a"):
                    url = "http://house.gov.by/" + each_tr.find('a').get('href').replace("15489", "17041")
                    member_id = members[url]
                    committee_members[committee['identifier']]["Намеснікі старшыні"].append(member_id)
            # ordinary members section
            for each_tr in all_tr[index_penultimate:index_start_last]:
                if each_tr.find("a"):
                    url = "http://house.gov.by/" + each_tr.find('a').get('href').replace("15489", "17041")
                    member_id = members[url]
                    committee_members[committee['identifier']]["Члены камісіі"].append(member_id)
    return committee_members
def scrape_membership(self): # Iterates in chamber member json document and # returns the list with the json document structure that Visegrad+ API accepts print "\n\tScraping membership's data from Belarus Lowerhouese parliament..." mp_list = parser.mps_list() chamber_membership_list = [] members = {} url = "http://house.gov.by/index.php/,17041,,,,2,,,0.html" all_members = vpapi.getall("people") for person in all_members: members[person['identifiers'][0]['identifier']] = person['id'] chamber = vpapi.getfirst("organizations", where={"identifiers": {"$elemMatch": {"identifier": "2", "scheme": "house.by"}}}) for member in mp_list: p_id = members[member['identifier']] o_id = chamber['id'] chamber_membership_json = self.build_memberships_doc(p_id, o_id, member['membership'], member['role'], url) chamber_membership_list.append(chamber_membership_json) print "\n\tScraping completed! \n\tScraped " + str(len(chamber_membership_list)) + " members" return chamber_membership_list
def scrape_committee_members(self): # Iterates in every committee member json doc and returns the # list with the json document structure that Visegrad+ API accepts print "\n\tScraping committee groups membership from Belarus Lowerhouse parliament..." committee_membership_list = [] committee_list = parser.committee_membership() groups = {} all_groups = vpapi.getall("organizations", where={"classification": "committe"}) for group in all_groups: groups[group['identifiers'][0]['identifier']] = group['id'] roles = parser.membership_correction() for committee in committee_list: identifier = int(committee) + 2 url = "http://house.gov.by/index.php/,17230,,,,2,,,0.html".replace("17230", str(identifier)) for membership in committee_list[committee]: for members in committee_list[committee][membership]: role = roles[membership] membership_json = self.build_memberships_doc(members, groups[committee], membership, role, url) committee_membership_list.append(membership_json) print "\n\tScraping completed! \n\tScraped " + str(len(committee_membership_list)) + " members of committee groups" return committee_membership_list
def scrape_committee_members(self): # Iterates in every committee member json doc and returns the # list with the json document structure that Visegrad+ API accepts print "\n\tScraping committee groups from Belarus Upperhouse parliament...\n" members = {} committee_membership = [] all_members = vpapi.getall("people") for member in all_members: if member['identifiers'][0]['identifier'] not in members: members[member['identifiers'][0]['identifier']] = member['id'] else: continue committee_membership_list = parser.committee_membership() widgets = [' Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA(), " - Processed: ", Counter(), ' items '] pbar = ProgressBar(widgets=widgets) print "\n\tProcessing members of committee groups from Belarus Upperhouse parliament...\n" for member in pbar(committee_membership_list): if member['identifier'] in members: p_id = members[member['identifier']] else: p_id = None existing = vpapi.getfirst("organizations", where={"name": member['committee_name'], "parent_id": member['committee_parent_id']}) if existing: o_id = existing['id'] else: o_id = None if p_id and o_id: committee_membership_json = self.build_memberships_doc(p_id, o_id, member['membership'], member['role'], member['url']) committee_membership.append(committee_membership_json) else: continue print "\n\tScraping completed! \n\tScraped " + str(len(committee_membership)) + " members" return committee_membership
def committee_list(self):
    # Returns the list of committee groups with basic information for each
    committee_list = []
    # chamber identifier (term number) -> organization API id
    chambers = {}
    all_chambers = vpapi.getall("organizations", where={"classification": "chamber"})
    for chamber in all_chambers:
        chambers[chamber['identifiers'][0]['identifier']] = chamber['id']
    # committees are listed per term; terms 3 to 5 are published
    for i in range(3, 6):
        url = "http://www.parliament.am/committees.php?lang=arm&show_session=" + str(i)
        soup = scrape.download_html_file(url)
        for each_tr in soup.find("table", {"class": "com-table"}).findAll('tr', {"valign": "top"}):
            for each_td in each_tr.findAll('td'):
                name = each_td.find('a', {"class": "blue_mid_b"}).get_text()
                url = "http://www.parliament.am" + each_td.find('a', {"class": "blue_mid_b"}).get("href")
                # the first number embedded in the url is the committee identifier
                identifier = re.findall(r'\d+', url)
                committee_json = {
                    "name": name,
                    "url": url,
                    "identifier": identifier[0],
                    "parent_id": chambers[str(i)]
                }
                committee_list.append(committee_json)
    return committee_list
}, { "code_api" : "cz/senat", "code": "senat", "code_csv": "upper", "name": "Senát" } ] for p in parliaments: vpapi.parliament(p['code_api']) for ve in ves: if ve[p['code_csv'] + '_vote_event_id'] != '': votes = vpapi.getall("votes",where={"vote_event_id":ve[p['code_csv'] + '_vote_event_id']}) print(ve[p['code_csv'] + '_vote_event_id']) for v in votes: try: data[p['code'] + '_' + v['voter_id']] except: data[p['code'] + '_' + v['voter_id']] = {} data[p['code'] + '_' + v['voter_id']]['votes'] = {} data[p['code'] + '_' + v['voter_id']]['chamber'] = p['code'] data[p['code'] + '_' + v['voter_id']]['chamber_name'] = p['name'] data[p['code'] + '_' + v['voter_id']]['id'] = v['voter_id'] data[p['code'] + '_' + v['voter_id']]['votes'][ve['id']] = o2o[v['option']] * int(ve[p['code_csv'] + '_polarity']) data[p['code'] + '_' + v['voter_id']]['group_id'] = v['group_id'] os = {} for k in data:
try: out = m.group(1) except: out = "" return out organizations = {} people = {} vote_events = {} motions = {} votes = [] orgs = vpapi.getall("organizations") for org in orgs: organizations[org['id']] = org print("organizations downloaded: ",len(organizations)) peop = vpapi.getall("people") for person in peop: people[person['id']] = person print("people downloaded: ",len(people)) for term in terms: sli = term["since"].split('-') uli = term["until"].split('-') directory = "sk-nrsr-" + term["id"] + "-" + sli[0] + "-" + uli[0] + "-roll-call-votes"
def scrape_new_debates(term):
    """Scrape and save speeches from debates of the given term, one of those
    newer terms where transcripts of debates are published in parts
    assigned to individual speakers. Returns number of scraped speeches.
    """
    # mapping from the debate part kind (Slovak label used by NRSR) to the
    # speech `type` stored in the API; unknown kinds default to 'speech'
    debate_part_kinds = {
        'Uvádzajúci uvádza bod': 'speech',
        'Vstup predsedajúceho': 'speech',
        'Vystúpenie spoločného spravodajcu': 'speech',
        'Vystúpenie': 'speech',
        'Vystúpenie v rozprave': 'speech',
        'Vystúpenie s faktickou poznámkou': 'speech',
        'Vystúpenie s procedurálnym návrhom': 'speech',
        'Prednesenie otázky': 'question',
        'Zodpovedanie otázky': 'answer',
        'Doplňujúca otázka / reakcia zadávajúceho': 'question',
        'Prednesenie interpelácie': 'question',
        'Odpoveď na interpeláciu': 'answer',
        'scene': 'scene'
    }

    def insert_speech(kind):
        """Insert a speech entity for the given debate part kind and data
        from parent scope variables and update end date of the
        corresponding session and sitting. Delete `text` variable."""
        nonlocal text, last_speech_enddatetime
        if not text: return
        speech = {
            # square brackets are reserved for scene markup; normalize
            'text': text.strip().replace('[', '(').replace(']', ')'),
            'date': start_datetime,
            'type': debate_part_kinds.get(kind, 'speech'),
            'position': len(speeches) + 1,
            'event_id': sitting_id,
            'sources' : [{
                'url': dpart_url,
                'note': 'Prepis časti debaty na webe NRSR'
            }]
        }
        if dpart_video:
            speech['video'] = dpart_video
        if kind != 'scene':
            # scenes have no speaker
            speech['creator_id'] = speaker_id
            speech['attribution_text'] = attribution.strip()
        speeches.append(speech)
        text = ''
        # extend session/sitting end dates when this speech ends later
        if end_datetime > session_end_date:
            vpapi.patch('events', session_id, {'end_date': end_datetime})
        if end_datetime > sitting_end_date:
            vpapi.patch('events', sitting_id, {'end_date': end_datetime})
        last_speech_enddatetime = datetime.strptime(end_datetime, '%Y-%m-%dT%H:%M:%S')

    logging.info('Scraping debates of term `%s`' % term)
    chamber_id = get_chamber_id(term)

    # prepare mapping from MP's name to id
    people = vpapi.getall('people', projection={'name': 1})
    mps = {mp['name']: mp['id'] for mp in people}

    # load name corrections
    with open(os.path.join(CONF_DIR, 'name_corrections.json'), encoding='utf8') as f:
        name_corrections = json.load(f)

    # scraping will start since the most recent sitting start date
    last_sitting = vpapi.getfirst('events', where={'type': 'sitting', 'organization_id': chamber_id}, sort='-start_date')
    since_date = last_sitting['start_date'][:10] if last_sitting else None

    # scrape list of debate parts
    debate_parts = parse.new_debates_list(term, since_date)

    speech_count = 0
    session_name = ''
    speeches = []
    for dp in debate_parts['_items']:
        # stop at very recent debate parts (may be incomplete)
        start_datetime = sk_to_utc('%s %s' % (dp['dátum'], dp['trvanie']['od']))
        sd = datetime.strptime(start_datetime, '%Y-%m-%dT%H:%M:%S')
        if datetime.utcnow() - sd < timedelta(days=5):
            break

        # skip already scraped debate parts
        existing = vpapi.getfirst('speeches', where={'sources.url': dp['prepis']['url']})
        if existing: continue

        logging.info('Scraping debate part %s %s-%s (id=%s)' % (dp['dátum'], dp['trvanie']['od'], dp['trvanie']['do'], dp['prepis']['id']))
        dpart = parse.debate_of_terms56(dp['prepis']['id'])
        if not dpart['riadky']: continue

        end_datetime = sk_to_utc('%s %s' % (dp['dátum'], dp['trvanie']['do']))
        dpart_kind = dp['druh']
        dpart_url = dp['prepis']['url']
        dpart_video = dp['video']['url'] if 'video' in dp else None

        if not session_name.startswith('%s. ' % dp['schôdza']):
            # create new session event
            session_name = '%s. schôdza' % dp['schôdza']
            session = {
                'name': session_name,
                'identifier': dp['schôdza'],
                'organization_id': chamber_id,
                'type': 'session',
                'start_date': start_datetime,
                'end_date': end_datetime,
            }
            key = ('organization_id', 'type', 'identifier')
            session_id, _ = get_or_create('events', session, key)
            session_end_date = end_datetime

            # find the last moment of the last sitting of this session
            session_last_sitting = vpapi.getfirst('events', where={'type': 'sitting', 'parent_id': session_id}, sort='-start_date')
            if session_last_sitting:
                last_speech_enddatetime = datetime.strptime(session_last_sitting['end_date'], '%Y-%m-%dT%H:%M:%S')
                sitting_identifier = session_last_sitting['identifier']
                sitting_id = session_last_sitting['id']
                sitting_end_date = session_last_sitting['end_date']
            else:
                last_speech_enddatetime = datetime.min
                sitting_identifier = '0'

        # a gap of more than 5 hours between speeches starts a new sitting
        if sd - last_speech_enddatetime > timedelta(hours=5):
            # create new sitting event
            sitting_identifier = str(int(sitting_identifier) + 1)
            sitting_name = '%s. deň rokovania, %s' % (sitting_identifier, dp['dátum'])
            sitting = {
                'name': sitting_name,
                'identifier': sitting_identifier,
                'organization_id': chamber_id,
                'type': 'sitting',
                'start_date': start_datetime,
                'end_date': end_datetime,
                'parent_id': session_id,
            }
            key = ('parent_id', 'type', 'identifier')
            sitting_id, _ = get_or_create('events', sitting, key)
            sitting_end_date = end_datetime

            # save speeches of the previous sitting
            if len(speeches) > 0:
                vpapi.post('speeches', speeches)
                speech_count += len(speeches)
            if dp != debate_parts['_items'][0]:
                logging.info('Scraped %s speeches from previous sitting' % len(speeches))
            speeches = []

        # add the first speaker name that is sometimes missing
        first_speaker = '<strong>%s, %s</strong>' % (dp['osoba']['meno'], dp['osoba']['funkcia'])
        dpart['riadky'].insert(0, first_speaker)

        # extract speeches from the debate part
        text = ''
        within_scene = False
        for par in dpart['riadky']:
            if not par: continue
            par = par.replace('\n', ' ').strip()

            # skip eventual speech number
            if re.match('^(\d+)\.$', par): continue

            # convert brackets to parentheses
            par = re.sub(r'\[(.*?)\]', r'(\1)', par)
            # convert all inner nested parentheses to brackets
            n = 1
            while n >= 1:
                (par, n) = re.subn(r'\((.*?)\((\.*?)\)(.*?)\)', r'(\1[\2]\3)', par, flags=re.DOTALL)

            # process eventual multiparagraph scene
            if par.startswith('(') and par.count('(') > par.count(')'):
                # save eventual previous speech
                insert_speech(dpart_kind)
                text = '<p>%s</p>' % lxml.html.fromstring(par[1:]).text_content()
                within_scene = True
                continue
            if within_scene:
                if par.endswith(')') and par.count(')') > par.count('('):
                    # closing paragraph of the scene
                    text += '\n\n<p>%s</p>' % lxml.html.fromstring(par[:-1]).text_content()
                    insert_speech('scene')
                    within_scene = False
                else:
                    text += '\n\n<p>%s</p>' % lxml.html.fromstring(par).text_content()
                continue

            # process eventual new speaker
            # format `Doe, John, foreign minister`
            speech_start_pattern = r'<strong>(\w+), (\w+\.?)( (\w+\.?))?, (.*)</strong>'
            sp = re.match(speech_start_pattern, par, re.DOTALL)
            if sp:
                # save eventual previous speech
                insert_speech(dpart_kind)

                # identify speaker
                name = '%s %s' % (sp.group(2), sp.group(1))
                if (sp.group(4)):
                    # insert the middle name/initial between given and family name
                    name = name.replace(' ', ' %s ' % sp.group(4))
                attribution = sp.group(5)
                if name in name_corrections:
                    name = name_corrections[name]
                if len(name) == 0: continue
                speaker_id = mps.get(name)

                # create unknown speakers
                if not speaker_id:
                    logging.warn('Speaker `%s, %s` not found, creating new Person' % (name, attribution))
                    name_parts = re.match(r'(\w+\.?)( (\w+\.?))? (\w+)', name)
                    person = {
                        'name': name,
                        'family_name': name_parts.group(4),
                        'given_name': name_parts.group(1)
                    }
                    person['sort_name'] = '%s, %s' % (person['family_name'], person['given_name'])
                    if name_parts.group(3):
                        person['additional_name'] = name_parts.group(3)
                        person['sort_name'] += ' %s' % person['additional_name']
                    resp = vpapi.post('people', person)
                    speaker_id = resp['id']
                    mps[name] = speaker_id
                continue

            # remove HTML tags
            par = lxml.html.fromstring(par).text_content()

            # process eventual scene in this paragraph
            scene_pattern = r'(.*?)\(\s*([\d%s][^\(\)]{2,}[\.?!“])\s*\)(.*)$' % scrapeutils.CS_UPPERS
            while True:
                scene = re.match(scene_pattern, par, re.DOTALL)
                if not scene: break
                if scene.group(1):
                    text += '\n\n<p>%s</p>' % scene.group(1).strip()
                    insert_speech(dpart_kind)
                text = '<p>%s</p>' % scene.group(2).strip()
                insert_speech('scene')
                par = scene.group(3)

            if par:
                text += '\n\n<p>%s</p>' % par

        # flush the trailing speech of this debate part
        insert_speech(dpart_kind)

    # save speeches of the last sitting
    if len(speeches) > 0:
        vpapi.post('speeches', speeches)
        logging.info('Scraped %s speeches' % len(speeches))
        speech_count += len(speeches)

    logging.info('Scraped %s speeches in total' % speech_count)
def scrape_old_debates(term):
    """Scrape and save speeches from debates of the given term, one of those
    older terms where transcripts of debates are stored in RTF files.

    NOTE(review): the docstring claims a return value ("Returns number of
    scraped speeches") but no `return` statement is visible in this view;
    the function only logs `speech_count` — confirm against the full file.
    """
    def insert_speech(type):
        """Insert a speech entity with the given type and data from parent
        scope variables and update end date of the corresponding session
        and sitting. Clears the `text` accumulator afterwards."""
        nonlocal text, position
        # nothing accumulated -> nothing to insert
        if not text:
            return
        position = position + 1
        speech = {
            # any brackets left over from nested-parentheses conversion
            # are turned back into parentheses for the stored text
            'text': text.strip().replace('[', '(').replace(']', ')'),
            'type': type,
            'position': position,
            'event_id': sitting_id,
            'sources' : [{
                'url': debate['url'],
                'note': 'Prepis debaty v Digitálnej knižnici na webe NRSR'
            }]
        }
        # scenes (stage directions) have no speaker attribution
        if type != 'scene':
            speech['creator_id'] = speaker_id
            speech['attribution_text'] = attribution.strip()
        speeches.append(speech)
        text = ''
        # stretch session/sitting end dates to cover this speech
        if date > session_end_date:
            vpapi.patch('events', session_id, {'end_date': date})
        if date > sitting_end_date:
            vpapi.patch('events', sitting_id, {'end_date': date})

    logging.info('Scraping debates of term `%s`' % term)
    chamber_id = get_chamber_id(term)

    # prepare mapping from MP's name (in "G. A. Family" initials form) to id
    people = vpapi.getall('people',
        projection={'given_name': 1, 'additional_name': 1, 'family_name': 1})
    mps = {}
    for mp in people:
        if 'additional_name' in mp:
            name = '%s. %s. %s' % (mp['given_name'][0], mp['additional_name'][0], mp['family_name'])
        else:
            name = '%s. %s' % (mp['given_name'][0], mp['family_name'])
        mps[name] = mp['id']

    # load name corrections (maps misspelled transcript names to canonical ones)
    with open(os.path.join(CONF_DIR, 'name_corrections.json'), encoding='utf8') as f:
        name_corrections = json.load(f)

    # scrape list of debates
    debates = parse.old_debates_list(term)

    # add the debate missing in the list
    if term == '4':
        debates['_items'].append({
            'názov': 'Autorizovaná rozprava, 48. schôdza NR SR, 3. 2. 2010',
            'id': '2010_02_03',
            'url': 'http://www.nrsr.sk/dl/Browser/DsDocument?documentId=391413'
        })

    speech_count = 0
    session_identifier = None
    for debate in debates['_items']:
        # skip obsolete debates in the list (duplicates / bad source entries,
        # identified per-term by hard-coded ids)
        if term == '1':
            if (debate['názov'] == 'Stenozáznam' and debate['id'] != '198550'
                    or debate['id'] in ('65890', '65945', '65949')):
                continue
        elif term == '2':
            if debate['názov'].startswith('Stenografická') and debate['id'] != '92098':
                continue
        elif term == '3':
            if debate['id'] == '181047':
                continue

        logging.info('Scraping debate `%s` (id=%s)' % (debate['názov'], debate['id']))
        if term == '1':
            paragraphs = parse.debate_of_term1(debate['id'])
        else:
            paragraphs = parse.debate_of_terms234(debate['id'])

        # normalize header of the debate transcript
        if term == '2':
            # join first 4 paragraphs and add trailing underscores to mark the header
            paragraphs = ['%s %s %s %s\n___' % (paragraphs[0], paragraphs[1],
                paragraphs[2], paragraphs[3])] + paragraphs[4:]
        elif term in ('3', '4'):
            # join first paragraphs until " hodine" ending is found
            # and add trailing underscores to mark the header
            p = ''
            while True:
                p += ' ' + paragraphs.pop(0)
                if p.endswith('hodine'):
                    break
            if paragraphs[0].startswith('___'):
                paragraphs.pop(0)
            paragraphs.insert(0, p + '\n___')

        # extract speeches from the debate
        speeches = []
        text = ''
        within_scene = False
        for par in paragraphs:
            par = par.replace('\n', ' ').strip()
            if not par:
                continue

            # fix last scene: wrap the closing "session ended at ..." sentence
            # into parentheses if the transcript did not
            if re.search(r'\b(skončil.|skončené|prerušené|Prerušenie rokovani[ae])\s+o\s+(.*?)\s+hodine.', par):
                if not par[0] in ('(', '[', '/'):
                    par = '(%s)' % par

            # convert brackets to parentheses
            par = re.sub(r'\[(.*?)\]', r'(\1)', par)
            # slash pairs are converted to parentheses too in term 1
            if term == '1':
                par = re.sub(r'(^|\s)/(.*?)/(\s|$)', r'\1(\2)\3', par)
            # convert all inner nested parentheses to brackets
            # (repeat until no more substitutions occur)
            n = 1
            while n >= 1:
                (par, n) = re.subn(r'\((.*?)\((.*?)\)(.*?)\)', r'(\1[\2]\3)', par, flags=re.DOTALL)

            # process eventual multiparagraph scene (opening parenthesis
            # without a matching close in the same paragraph)
            if par.startswith('(') and par.count('(') > par.count(')'):
                # save eventual previous speech
                insert_speech('speech')
                text = '<p>%s</p>' % par[1:]
                within_scene = True
                continue
            if within_scene:
                if par.endswith(')') and par.count(')') > par.count('('):
                    text += '\n\n<p>%s</p>' % par[:-1]
                    insert_speech('scene')
                    within_scene = False
                else:
                    text += '\n\n<p>%s</p>' % par
                continue

            # process eventual header (start of a new session day)
            header_pattern = r'((\(?(\d+)\.\)?\s+schôdz)|slávnostn).*?(\d+)\..*\b(\w{3,})\s+(\d{4})(.*?)_{3,}$'
            hd = re.search(header_pattern, par, re.DOTALL)
            if hd:
                # save eventual previous speech
                insert_speech('speech')
                # date (and optional initial time) of the sitting
                sk_date = '%s. %s %s' % (hd.group(4), hd.group(5), hd.group(6))
                initial_time = re.search(r'\s+o\s+(.*?)\s+hodine', hd.group(7), re.DOTALL)
                if initial_time and initial_time.group(1) != '??':
                    h, m = initial_time.group(1).strip('.').split('.')
                    date = sk_to_utc(sk_date + ' %s:%s:00' % (h.strip().zfill(2), m.strip().zfill(2)))
                else:
                    date = sk_to_utc(sk_date) + 'T00:00:00'

                if hd.group(1).startswith('sláv'):
                    # ceremonial (extraordinary) session
                    new_session_name = 'Mimoriadna schôdza'
                    if term == '1':
                        new_session_identifier = debate['časť']
                    elif term == '2':
                        new_session_identifier = '1000'
                    else:
                        # find the session whose duration matches this date
                        sl = parse.session_list(term)
                        d = '%s. %s. %s' % (int(date[8:10]), int(date[5:7]), int(date[0:4]))
                        new_session_identifier = next((s['číslo'] for s in sl['_items'] if s['trvanie'] == d))
                else:
                    new_session_name = '%s. schôdza' % hd.group(3)
                    new_session_identifier = hd.group(3)

                if new_session_identifier != session_identifier:
                    # create new session event
                    session = {
                        'name': new_session_name,
                        'identifier': new_session_identifier,
                        'organization_id': chamber_id,
                        'type': 'session',
                        'start_date': date,
                    }
                    key = ('organization_id', 'type', 'identifier')
                    session_id, _ = get_or_create('events', session, key)
                    session_identifier = new_session_identifier
                    session_end_date = date
                    sitting_count = 0

                # create new sitting event
                sitting_count += 1
                sitting = {
                    'name': '%s. deň rokovania, %s' % (sitting_count, sk_date),
                    'identifier': str(sitting_count),
                    'organization_id': chamber_id,
                    'type': 'sitting',
                    'start_date': date,
                    'parent_id': session_id,
                }
                key = ('parent_id', 'type', 'identifier')
                sitting_id, created = get_or_create('events', sitting, key)
                sitting_end_date = date
                position = 0

                # delete existing speeches of the sitting (rescraping)
                if not created:
                    obsolete = vpapi.getall('speeches', where={'event_id': sitting_id})
                    for speech in obsolete:
                        vpapi.delete('speeches', speech['id'])
                continue

            # process eventual start of a speech; the transcript format
            # changed on 2001-09-04
            if date < '2001-09-04':
                # format `Foreign minister J. Doe:`
                speech_start_pattern = r'(.*?)\b([^\W\d])\.[\s_]+((\w)\.[\s_]+)?([\w-]+):$'
            else:
                # format `J. Doe, foreign minister: speech`
                speech_start_pattern = r'([^\W\d])\.[\s_]+((\w)\.[\s_]+)?([\w-]+),\s+(.+?):(.+)$'
            sp = re.match(speech_start_pattern, par, re.DOTALL)
            if sp:
                # save eventual previous speech
                insert_speech('speech')

                # identify speaker
                if date < '2001-09-04':
                    name = '%s. %s' % (sp.group(2), sp.group(5))
                    if (sp.group(4)):
                        name = name.replace(' ', ' %s. ' % sp.group(4))
                    attribution = sp.group(1)
                    par = ''
                else:
                    name = '%s. %s' % (sp.group(1), sp.group(4))
                    if (sp.group(3)):
                        name = name.replace(' ', ' %s. ' % sp.group(3))
                    attribution = sp.group(5)
                    par = sp.group(6)
                if name in name_corrections:
                    name = name_corrections[name]
                attribution = attribution[0].lower() + attribution[1:].strip()
                speaker_id = mps.get(name)

                # create unknown speakers
                if not speaker_id:
                    logging.warn('Speaker `%s, %s` not found, creating new Person' % (name, attribution))
                    name_parts = re.match(r'(\w)\. ((\w)\. )?(\w+)', name)
                    person = {
                        'name': name,
                        'family_name': name_parts.group(4),
                        'given_name': name_parts.group(1)
                    }
                    person['sort_name'] = '%s, %s.' % (person['family_name'], person['given_name'])
                    if name_parts.group(3):
                        person['additional_name'] = name_parts.group(3)
                        person['sort_name'] += ' %s.' % person['additional_name']
                    resp = vpapi.post('people', person)
                    speaker_id = resp['id']
                    mps[name] = speaker_id

            # recognize date(-time) stamps in transcripts
            ds = re.match(r'^\s*(\d+\.\s\w+\s\d{4})(.*hodine)?\s*$', par)
            if ds:
                dt = ds.group(1).strip()
                tm = re.search(r'o\s+(.*?)\s+', ds.group(2) or '')
                try:
                    if tm:
                        h, m = tm.group(1).strip('.').split('.')
                        date = sk_to_utc('%s %s:%s:00' % (dt, h.strip().zfill(2), m.strip().zfill(2)))
                    else:
                        date = sk_to_utc(dt) + 'T00:00:00'
                    continue
                except ValueError:
                    # not a parseable date stamp after all; treat as text
                    pass

            # process eventual scene in this paragraph (parenthesized
            # stage direction starting with a digit or uppercase letter)
            scene_pattern = r'(.*?)\(\s*([\d%s][^\(\)]{2,}[\.?!“])\s*\)(.*)$' % scrapeutils.CS_UPPERS
            while True:
                scene = re.match(scene_pattern, par, re.DOTALL)
                if not scene:
                    break
                if scene.group(1):
                    text += '\n\n<p>%s</p>' % scene.group(1).strip()
                    insert_speech('speech')
                text = '<p>%s</p>' % scene.group(2).strip()
                insert_speech('scene')
                par = scene.group(3)

            if par:
                text += '\n\n<p>%s</p>' % par.strip()

        # flush the last speech of the debate
        insert_speech('speech')

        # extract end time of the session from the last speech
        final_time = re.search(
            r'\b(skončil.|skončené|prerušené|Prerušenie rokovani[ae])\s+o\s+(.*?)\s+hodine.',
            speeches[-1]['text'])
        if final_time:
            tm = final_time.group(2)
            # fix common OCR/typing artifacts in the time value
            tm = tm.replace('O', '0').replace(',', '.')
            h, m = tm.strip('.').split('.')
            final_date = '%s.%s.%s %s:%s:00' % (date[8:10], date[5:7], date[0:4],
                h.strip().zfill(2), m.strip().zfill(2))
            final_date = sk_to_utc(final_date)
            vpapi.patch('events', session_id, {'end_date': final_date})
            vpapi.patch('events', sitting_id, {'end_date': final_date})

        vpapi.post('speeches', speeches)
        logging.info('Scraped %s speeches' % len(speeches))
        speech_count += len(speeches)

    logging.info('Scraped %s speeches in total' % speech_count)
def scrape_motions(term):
    """Scrape and save motions from the given term that are not scraped yet
    starting from the oldest ones. One Motion item, one VoteEvent item and
    many Vote items are created for each scraped motion detail page.

    Returns number of scraped motions.
    """
    logging.info('Scraping motions of term `%s`' % term)

    # prepare mappings from source identifier to id for MPs and parliamentary groups
    chamber_id = get_chamber_id(term)
    people = vpapi.getall('people', projection={'identifiers': 1})
    mps = {mp['identifiers'][0]['identifier']: mp['id'] for mp in people if 'identifiers' in mp}
    orgs = vpapi.getall('organizations',
        where={'classification': 'parliamentary group', 'parent_id': chamber_id})
    parl_groups = {c['name']: c['id'] for c in orgs}
    # add differently spelled parliamentary groups (aliases map to the
    # id of the canonical group name)
    group_corrections = {
        '2': {
            'Klub HZDS': 'Klub ĽS-HZDS',
            'Klub SMK': 'Klub SMK-MKP',
            'Klub Nezávislí': 'Klub Nezávislý',
        },
        '3': {
            'Klub HZDS': 'Klub ĽS-HZDS',
            'Klub SDKÚ': 'Klub SDKÚ-DS',
            'Klub Smer': 'Klub SMER-SD',
            'Klub Smer-SD': 'Klub SMER-SD',
            'Klub KNP': 'Klub nezávislých poslancov NR SR',
            'Klub Nezávislý': 'Klub nezávislých poslancov NR SR',
        },
    }
    for k, v in group_corrections.get(term, {}).items():
        parl_groups[k] = parl_groups[v]

    # prepare list of sessions that are not completely scraped yet:
    # walk sessions from newest to oldest and stop at the first one whose
    # last motion is already stored
    sessions_to_scrape = []
    session_list = parse.session_list(term)
    for session in session_list['_items']:
        motions = parse.session(session['číslo'], term)
        if len(motions['_items']) == 0:
            continue
        last_motion_id = motions['_items'][-1]['id']
        m_url = 'http://www.nrsr.sk/web/Default.aspx?sid=schodze/hlasovanie/hlasklub&ID=%s' % last_motion_id
        existing = vpapi.getfirst('motions', where={'sources.url': m_url})
        if existing:
            break
        sessions_to_scrape.append((session, motions))

    # scrape motions from those sessions, oldest session first
    scraped_motions_count = 0
    for s, motions in reversed(sessions_to_scrape):
        logging.info('Scraping session `%s`' % s['názov'])

        # insert the session event unless it already exists
        session = {
            'name': s['názov'],
            'identifier': s['číslo'],
            'organization_id': chamber_id,
            'type': 'session',
        }
        try:
            session['start_date'] = sk_to_utc(s['trvanie']) + 'T00:00:00'
            session['end_date'] = session['start_date']
        except ValueError:
            # multiday session contains votes; dates are set by debates scraping
            pass
        key = ('organization_id', 'type', 'identifier')
        session_id, _ = get_or_create('events', session, key)

        for i, m in enumerate(motions['_items']):
            # check if the motion is already present
            m_id = re.search(r'ID=(\d+)', m['url']['výsledok']).group(1)
            # we not use directly m['url']['kluby'] because it is not always present
            m_url = 'http://www.nrsr.sk/web/Default.aspx?sid=schodze/hlasovanie/hlasklub&ID=%s' % m_id
            existing = vpapi.getfirst('motions', where={'sources.url': m_url})
            if existing:
                continue

            try:
                motion_id = None
                vote_event_id = None

                # insert motion
                logging.info('Scraping motion %s of %s (voted at %s)' % (i+1, len(motions['_items']), m['dátum']))
                parsed_motion = parse.motion(m['id'])
                motion = {
                    'organization_id': chamber_id,
                    'legislative_session_id': session_id,
                    'identifier': parsed_motion['číslo'],
                    'text': parsed_motion['názov'],
                    'date': sk_to_utc(m['dátum']),
                    'sources': [{
                        'url': parsed_motion['url'],
                        'note': 'Hlasovanie na webe NRSR'
                    }],
                }
                if 'výsledok' in parsed_motion:
                    motion['result'] = 'pass' if parsed_motion['výsledok'] == 'Návrh prešiel' else 'fail'
                resp = vpapi.post('motions', motion)
                motion_id = resp['id']

                # insert vote event
                vote_event = {
                    'motion_id': motion_id,
                    'organization_id': chamber_id,
                    'legislative_session_id': session_id,
                    'identifier': parsed_motion['číslo'],
                    'start_date': motion['date'],
                    'sources': [{
                        'url': parsed_motion['url'],
                        'note': 'Hlasovanie na webe NRSR'
                    }],
                }
                if 'výsledok' in parsed_motion:
                    vote_event['result'] = motion['result']
                if 'súčty' in parsed_motion:
                    # map vote-count options to the source table's row labels
                    options = {
                        'yes': '[z] za',
                        'no': '[p] proti',
                        'abstain': '[?] zdržalo sa',
                        'absent': '[0] neprítomní',
                        'not voting': '[n] nehlasovalo'
                    }
                    # NOTE(review): `s` here shadows the session loop variable;
                    # harmless because `s` is not used again afterwards in this
                    # iteration, but worth renaming eventually
                    vote_event['counts'] = [
                        {'option': o, 'value': int(parsed_motion['súčty'][s])}
                        for o, s in options.items() if parsed_motion['súčty'][s] != ''
                    ]
                    if len(vote_event['counts']) == 0:
                        del vote_event['counts']
                resp = vpapi.post('vote-events', vote_event)
                vote_event_id = resp['id']

                # insert votes
                if 'hlasy' in parsed_motion and len(parsed_motion['hlasy']) > 0:
                    vote_options = {
                        'z': 'yes',
                        'p': 'no',
                        '?': 'abstain',
                        'n': 'not voting',
                        '0': 'absent'
                    }
                    votes = []
                    for v in parsed_motion['hlasy']:
                        # skip MPs not applying their mandate
                        if v['hlas'] == '-':
                            continue
                        pg = normalize_parlgroup_name(v['klub'])
                        votes.append({
                            'vote_event_id': vote_event_id,
                            'option': vote_options[v['hlas']],
                            'voter_id': mps.get(v['id']),
                            'group_id': parl_groups.get(pg),
                        })
                    if len(votes) > 0:
                        resp = vpapi.post('votes', votes)

            # delete incomplete data if insertion of the motion, vote event or votes failed
            # (bare except is deliberate here: compensate, then re-raise)
            except:
                if motion_id:
                    vpapi.delete('motions', motion_id)
                if vote_event_id:
                    vpapi.delete('vote-events', vote_event_id)
                raise

            scraped_motions_count += 1

    logging.info('Scraped %s motions of term `%s`' % (scraped_motions_count, term))
    return scraped_motions_count
def test_ids(self): committees_ids = {} all_committees = vpapi.getall("organizations", where={"classification": "committe"}) for committe in all_committees: committees_ids[committe["identifiers"][0]["identifier"]] = committe["id"] print len(committees_ids)
def scrape_from_group_and_save(group_type, id, term):
    """Scrape memberships in a given group and save (or update) them.
    If group or MP referred by the membership does not exist, scrape and
    save it/him/her.

    NOTE(review): parameter `id` shadows the builtin; kept for interface
    compatibility.
    """
    group = parse.group(group_type, id)

    # if group is not scraped yet, scrape and save it
    g = vpapi.getfirst('organizations', where={
        'classification': group_type,
        'identifiers': {'$elemMatch': {'identifier': id, 'scheme': 'nrsr.sk'}}},
        projection={'id': 1})
    if g:
        oid = g['id']
    else:
        o = Organization.scrape(group_type, id)
        oid = o.save()

    # Slovak role names (both genders) -> canonical English role codes
    roles = {
        'člen': 'member',
        'členka': 'member',
        'predseda': 'chairman',
        'predsedníčka': 'chairwoman',
        'podpredseda': 'vice-chairman',
        'podpredsedníčka': 'vice-chairwoman',
        'vedúci': 'chairman',
        'vedúca': 'chairwoman',
        'náhradník': 'substitute',
        'náhradníčka': 'substitute',
        'overovateľ': 'verifier',
        'overovateľka': 'verifier',
        'poverený vedením klubu': 'chairman',
        'podpredseda poverený vedením výboru': 'vice-chairman',
        'náhradný člen': 'substitute',
        'náhradná členka': 'substitute',
    }

    for member in group['členovia']:
        logging.info('Scraping membership of `%s`' % member['meno'])

        # if member MP is not scraped yet, scrape and save him
        existing = vpapi.getfirst('people',
            where={'identifiers': {'$elemMatch': {'identifier': member['id'], 'scheme': 'nrsr.sk'}}},
            projection={'id': 1})
        if existing:
            pid = existing['id']
        else:
            p = Person.scrape(member['id'], term)
            pid = p.save()

        m = Membership()
        m.person_id = pid
        m.organization_id = oid
        m.sources = [{
            'url': group['url'],
            'note': 'Profil na webe NRSR'
        }]
        # create or update all periods of the membership; one save per
        # period, clearing period-specific attributes between iterations
        for period in member['obdobia']:
            if period.get('rola'):
                m.label = period['rola'].capitalize() + ' v skupine ' + group['názov']
                m.role = roles[period['rola'].lower()]
            else:
                m.label = 'V skupine ' + group['názov']
            if period.get('od'):
                m.start_date = sk_to_utc(period.get('od'))
            if period.get('do'):
                m.end_date = sk_to_utc(period.get('do'))
            m.save()
            # reset so the next period does not inherit this period's values
            for attr in ('role', 'start_date', 'end_date'):
                if hasattr(m, attr):
                    delattr(m, attr)

    logging.info('Scraped %s memberships' % len(group['členovia']))

    # close all open memberships in this group that were not updated
    # (updated_at older than 10 minutes ago means the membership was not
    # touched by this run and is therefore no longer current)
    logging.info('Closing not updated open memberships')
    present = datetime.utcnow() - timedelta(minutes=10)
    query = {
        'organization_id': oid,
        '$or': [{'end_date': {'$exists': False}}, {'end_date': {'$in': [None, '']}}],
        'updated_at': {'$lt': present.isoformat()}
    }
    to_close = vpapi.getall('memberships', where=query)
    for m in to_close:
        # NOTE(review): `effective_date` is not defined anywhere in this
        # view — presumably a module-level global set by the caller; confirm
        vpapi.patch('memberships', m['id'], {'end_date': datestring_add(effective_date, -1)})
return -1 if vote == 'abstain': return -1 else: return 0 answers = {} groups = {} mps = {} vpapi.parliament('sk/nrsr') for ve in ves: print(ve) vedb = vpapi.get("vote-events", where={"sources.url":{"$regex":"ID="+ve+"$"}}) idd = vedb['_items'][0]['id'] r = vpapi.getall("votes",where={"vote_event_id":idd}) for row in r: try: answers[row['voter_id']] except: answers[row['voter_id']] = {"vote":{}} answers[row['voter_id']]['vote'][ve] = vote2vote(row['option']) if row['group_id'] is not None: try: groups[row['group_id']] except: group = vpapi.get("organizations/"+row['group_id']) groups[row['group_id']] = {"name": group['name'].replace('Klub ','')} groups[row['group_id']]['slug'] = slugify.slugify(groups[row['group_id']]['name']) print(groups[row['group_id']]['slug']) answers[row['voter_id']]['friendly_name'] = groups[row['group_id']]['slug']
def export_speeches(self):
    # Push locally stored speeches to the API. Speeches sourced from a PDF
    # transcript are first downloaded and split into individual speeches,
    # and each one is matched to its speaker (creating missing people).
    speeches = self.load_json('speeches')
    people = {}
    # strips chair/secretary function prefixes from speaker names before
    # matching (Python 2 `ur` raw-unicode literal)
    # NOTE(review): the backslash-newline wrapping inside this string
    # literal is reconstructed from a collapsed source — confirm the exact
    # original line break position
    prefix_regex = re.compile(
        ur'(pred\u015bedavaju\u0107i )|(pred\u015bednik )|\
(generalni sekretar )', re.U)
    for p in vpapi.getall('people'):
        name = self.normalize_name(p['name'])
        people[name] = p['id']
    for speech in speeches:
        # translate the scraped session id to the stored API event id
        session_id = speech.get('event_id')
        speech['event_id'] = self.events_ids[session_id]
        url = speech['sources'][0]['url']
        if url.endswith('.pdf'):
            # transcript in PDF: split into individual speeches
            parsed_speeches = self.download_pdf(url)
            for n, s in enumerate(parsed_speeches):
                text_speech = speech.copy()
                text_speech['text'] = s['text']
                text_speech['position'] = n + 1
                text_speech['type'] = 'speech'
                creator = self.normalize_name(s['creator'])
                creator = prefix_regex.sub('', creator)
                if creator in people:
                    # exact match on normalized name
                    text_speech['creator_id'] = people[creator]
                else:
                    # fall back to substring match against known names
                    creator_id = None
                    for name in people:
                        if name in creator:
                            creator_id = people[name]
                            break
                    if creator_id is None:
                        # last resort: case-insensitive regex lookup in the API
                        resp = vpapi.getfirst('people', where={
                            'name': {
                                '$regex': s['creator'],
                                'options': 'i'
                            }
                        })
                        if resp is None:
                            # NOTE(review): string wrapping reconstructed
                            # from collapsed source — confirm exact literal
                            self.log(
                                'Person "%(creator)s" not found. \
Creating one' % s, WARNING)
                            item = {
                                'name': s['creator'],
                                'sources': text_speech['sources']
                            }
                            resp = vpapi.post('people', item)
                        creator_id = resp['id']
                        people[creator] = creator_id
                    text_speech['creator_id'] = creator_id
                self.get_or_create('speeches', text_speech, where_keys=['event_id', 'position'])
        else:
            # non-PDF source: store the speech record as-is
            self.get_or_create('speeches', speech)
return 0 answers = {} groups = {} mps = {} vpapi.parliament('sk/nrsr') for ve in ves: print(ve) vedb = vpapi.get("vote-events", where={"sources.url": { "$regex": "ID=" + ve + "$" }}) idd = vedb['_items'][0]['id'] r = vpapi.getall("votes", where={"vote_event_id": idd}) for row in r: try: answers[row['voter_id']] except: answers[row['voter_id']] = {"vote": {}} answers[row['voter_id']]['vote'][ve] = vote2vote(row['option']) if row['group_id'] is not None: try: groups[row['group_id']] except: group = vpapi.get("organizations/" + row['group_id']) groups[row['group_id']] = { "name": group['name'].replace('Klub ', '') } groups[row['group_id']]['slug'] = slugify.slugify(
votesli = [] existingvotes = {} # terms = {} for rowp in hl_poslanec: # print(rowp) try: voteevents[rowp[1].strip()] except: voteevents[rowp[1].strip()] = vpapi.get('vote-events', where={'identifier': rowp[1].strip()}) r_voteevent = voteevents[rowp[1].strip()] try: existingvotes[r_voteevent["_items"][0]["id"]] except: rex = vpapi.getall('votes',where={"vote_event_id":r_voteevent["_items"][0]["id"]}) ids = [] for rowx in rex: ids.append(rowx['id']) if len(ids) > 0: existingvotes[r_voteevent["_items"][0]["id"]] = True if len(ids) < 200: print(r_voteevent["_items"][0]["id"] + ": " + len(ids)) else: existingvotes[r_voteevent["_items"][0]["id"]] = False print(r_voteevent["_items"][0]["id"] + ": " + str(len(ids))) # print(existingvotes) if not existingvotes[r_voteevent["_items"][0]["id"]]: try:
import scrapeutils
import vpapi
import authentication

# Sanity-check script for the cz/psp parliament: every vote-event is
# expected to contain exactly 200 or 400 individual votes; any event
# with a different count is printed.

vpapi.parliament('cz/psp')
vpapi.authorize(authentication.username, authentication.password)
vpapi.timezone('Europe/Prague')

# prepare an empty bucket per vote-event id
votes = {}
for vote_event in vpapi.getall("vote-events"):
    votes[vote_event['id']] = []
print(len(votes))

# group all votes by the event they belong to
i = 0
for vote in vpapi.getall("votes"):
    if i % 100 == 0:  # progress indicator
        print(i)
    # BUG FIX: group by the vote's own event id; the original appended to
    # votes[vote_event['id']], where `vote_event` was the stale variable
    # left over from the previous loop, so every vote landed in the last
    # event's bucket. setdefault guards against votes referencing an
    # event missing from the vote-events collection.
    votes.setdefault(vote['vote_event_id'], []).append(vote)
    i += 1
print(len(votes))

# report vote-events with an unexpected number of votes
for event_id in votes:
    count = len(votes[event_id])
    if count not in (200, 400):
        # BUG FIX: len() must be converted to str before concatenation
        # (the original raised TypeError: can only concatenate str to str)
        print(event_id + ":" + str(count))
def export_speeches(self):
    # Push locally stored speeches to the API. Speeches sourced from a PDF
    # transcript are first downloaded and split into individual speeches,
    # and each one is matched to its speaker (creating missing people).
    speeches = self.load_json('speeches')
    people = {}
    # strips chair/secretary function prefixes from speaker names before
    # matching (Python 2 `ur` raw-unicode literal)
    # NOTE(review): the backslash-newline wrapping inside this string
    # literal is reconstructed from a collapsed source — confirm the exact
    # original line break position
    prefix_regex = re.compile(
        ur'(pred\u015bedavaju\u0107i )|(pred\u015bednik )|\
(generalni sekretar )', re.U)
    for p in vpapi.getall('people'):
        name = self.normalize_name(p['name'])
        people[name] = p['id']
    for speech in speeches:
        # translate the scraped session id to the stored API event id
        session_id = speech.get('event_id')
        speech['event_id'] = self.events_ids[session_id]
        url = speech['sources'][0]['url']
        if url.endswith('.pdf'):
            # transcript in PDF: split into individual speeches
            parsed_speeches = self.download_pdf(url)
            for n, s in enumerate(parsed_speeches):
                text_speech = speech.copy()
                text_speech['text'] = s['text']
                text_speech['position'] = n + 1
                text_speech['type'] = 'speech'
                creator = self.normalize_name(s['creator'])
                creator = prefix_regex.sub('', creator)
                if creator in people:
                    # exact match on normalized name
                    text_speech['creator_id'] = people[creator]
                else:
                    # fall back to substring match against known names
                    creator_id = None
                    for name in people:
                        if name in creator:
                            creator_id = people[name]
                            break
                    if creator_id is None:
                        # last resort: case-insensitive regex lookup in the API
                        resp = vpapi.getfirst(
                            'people',
                            where={
                                'name': {
                                    '$regex': s['creator'],
                                    'options': 'i'
                                }
                            }
                        )
                        if resp is None:
                            # NOTE(review): string wrapping reconstructed
                            # from collapsed source — confirm exact literal
                            self.log('Person "%(creator)s" not found. \
Creating one' % s, WARNING)
                            item = {
                                'name': s['creator'],
                                'sources': text_speech['sources']
                            }
                            resp = vpapi.post('people', item)
                        creator_id = resp['id']
                        people[creator] = creator_id
                    text_speech['creator_id'] = creator_id
                self.get_or_create(
                    'speeches',
                    text_speech,
                    where_keys=['event_id', 'position']
                )
        else:
            # non-PDF source: store the speech record as-is
            self.get_or_create('speeches', speech)
handlers=[logging.FileHandler(logname, 'w', 'utf-8')]) logging.getLogger('requests').setLevel(logging.ERROR) logging.info(datetime.utcnow().strftime('%Y-%m-%d-%H:%M:%S') + '\tStarted 2') db_log = vpapi.post('logs', { 'status': 'running', 'file': logname, 'params': [] }) vpapi.parliament('cz/senat') vpapi.authorize(authentication.username, authentication.password) vpapi.timezone('Europe/Prague') o2id = {} organizations = vpapi.getall("organizations") for org in organizations: o2id[org['name']] = org['id'] p2id = {} persons = vpapi.getall('people') for p in persons: p2id[p['name']] = p['id'] def pp2id(name, date, p2id): if name == 'Jiří Dienstbier': if date < '2011-01-08': return '218' else: return '253'