def update_motion_url(self):
    """Prefix relative motion source URLs with the Rada host and save them.

    Every motion returned by ``vpapi.getall("motions")`` is inspected. When
    the URL of its first source does not contain
    ``http://w1.c1.rada.gov.ua``, that host is prepended and the motion is
    written back with ``vpapi.put``. Motions whose URL already contains the
    host, or that have no sources at all, are left untouched.
    """
    base_url = "http://w1.c1.rada.gov.ua"
    # Fields removed from the fetched document before it is sent back.
    fields_to_drop = ("created_at", "updated_at", "_links", "id")

    print("\n\tUpdating url of motions")
    for counter, motion in enumerate(vpapi.getall("motions"), 1):
        print(str(counter))
        sources = motion.get("sources")
        if not sources:
            # Nothing to rewrite; indexing sources[0] would raise here.
            continue
        url = sources[0]["url"]
        if base_url in url:
            continue
        # Remember the id before it is stripped from the document.
        motion_id = motion["id"]
        sources[0]["url"] = base_url + url
        for field in fields_to_drop:
            # pop() tolerates documents that lack one of the fields.
            motion.pop(field, None)
        vpapi.put("motions", motion_id, motion, effective_date=self.effective_date())
    print("\n\tFinished updating motions url")
def scrape_chamber(self):
    """Build, save and return the chamber documents of the Belarus lower house.

    Iterates over the chambers returned by ``parser.chambers()``, builds an
    organization document for each with ``self.build_organization_doc`` and
    posts it to (or updates it in) the ``organizations`` resource.

    Returns:
        The list of chamber documents that were sent to the API.

    Raises:
        Exception: when the API response status is not ``"OK"``.
    """
    print("\n\tScraping chambers from Belarus Lowerhouse parliament...")
    chambers = parser.chambers()
    chambers_list = []
    source_url = "http://house.gov.by/index.php/,10087,,,,2,,,0.html"
    for identifier in chambers:
        details = chambers[identifier]
        chamber_json = self.build_organization_doc(
            "chamber", details['name'], identifier,
            details['start_date'], details['end_date'], source_url, "", "")
        if identifier == "2":
            del chamber_json['dissolution_date']
        for unused_key in ('contact_details', 'parent_id'):
            del chamber_json[unused_key]

        lookup = {'identifiers': {'$elemMatch': chamber_json['identifiers'][0]}}
        existing = vpapi.getfirst("organizations", where=lookup)
        if existing:
            # update by PUT is preferred over PATCH to correctly remove
            # properties that no longer exist now
            resp = vpapi.put("organizations", existing['id'], chamber_json,
                             effective_date=self.effective_date())
        else:
            resp = vpapi.post("organizations", chamber_json)
        if resp["_status"] != "OK":
            raise Exception("Invalid status code")
        chambers_list.append(chamber_json)
    print("\n\tScraping completed! \n\tScraped " + str(len(chambers_list)) + " chambers")
    return chambers_list
def save(self, update_only=False):
    """Update a compatible existing membership, or create a new one.

    If `update_only` is True, only an existing membership is updated and
    nothing is created when no compatible one is found.

    Memberships are compatible if their fields `start_date`, `role` and
    `post` are compatible. Field `end_date` is not checked, to allow for
    later corrections of guessed end dates used when a member disappears
    from a group profile.

    Raises:
        Exception: when the API response status is not ``'OK'``.
    """
    candidates = vpapi.getall(
        'memberships',
        where={'person_id': self.person_id, 'organization_id': self.organization_id},
        sort='-start_date')
    to_save = self.__dict__.copy()
    matched_id = None
    for existing in candidates:
        compatible = (
            self._merge_values('start_date', to_save, existing)
            and to_save.get('end_date', '9999-12-31') >= existing.get('start_date', '0001-01-01')
            and self._merge_values('role', to_save, existing)
            and self._merge_values('post', to_save, existing)
        )
        if not compatible:
            # Start the next comparison from a fresh copy of this object's
            # fields (presumably `_merge_values` may have modified `to_save`).
            to_save = self.__dict__.copy()
            continue
        matched_id = existing['id']
        self._merge_values('end_date', to_save, existing)
        break

    if matched_id:
        resp = vpapi.put('memberships', matched_id, to_save)
    elif update_only:
        return
    else:
        resp = vpapi.post('memberships', self.__dict__)

    if resp['_status'] != 'OK':
        raise Exception(self.name, resp)
def scrape_chamber(self):
    """Scrape the chambers of Armenia's parliament, save them and return them.

    The options of the ``show_session`` select box on the deputies page are
    read; every option whose value does not contain ``"100"`` is turned into
    an organization document and posted to (or updated in) the
    ``organizations`` resource.

    Returns:
        The list of chamber documents that were sent to the API.

    Raises:
        Exception: when the API response status is not ``"OK"``.
    """
    url = "http://www.parliament.am/deputies.php?sel=ful&ord=photo&show_session=5&lang=arm&enc=utf8"
    soup = scrape.download_html_file(url)
    chambers_list = []
    print("\n\tScraping chambers from Armenia's parliament...\n")
    widgets = [' Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
               ' ', ETA(), " - Processed: ", Counter(), ' items ']
    pbar = ProgressBar(widgets=widgets)
    session_select = soup.find("select", {"name": "show_session"})
    for each_option in pbar(session_select.findAll("option")):
        identifier = each_option.get('value')
        name = each_option.get_text()
        url = "http://www.parliament.am/deputies.php?lang=arm&sel=&ord=&show_session=" + identifier
        if "100" in identifier:
            continue

        term = self.terms[identifier]
        chamber_json = self.build_organization_doc(
            "chamber", name, identifier, term["start_date"], term["end_date"], url, "", "")
        for unused_key in ('contact_details', 'parent_id'):
            del chamber_json[unused_key]
        if identifier == "5":
            del chamber_json['dissolution_date']

        lookup = {'identifiers': {'$elemMatch': chamber_json['identifiers'][0]}}
        existing = vpapi.getfirst("organizations", where=lookup)
        if existing:
            resp = vpapi.put("organizations", existing['id'], chamber_json,
                             effective_date=self.effective_date())
        else:
            resp = vpapi.post("organizations", chamber_json)
        if resp["_status"] != "OK":
            raise Exception("Invalid status code")
        chambers_list.append(chamber_json)
    print("\n\tScraping completed! \n\tScraped " + str(len(chambers_list)) + " chambers")
    return chambers_list
def save(self):
    """Create or update this person in the ``people`` resource.

    The person is looked up by its first identifier. A missing person is
    posted; an existing one is replaced with PUT, using the module-level
    ``effective_date``.

    Returns:
        The ``id`` field of the API response.

    Raises:
        Exception: when the API response status is not ``'OK'``.
    """
    payload = self.__dict__
    lookup = {'identifiers': {'$elemMatch': self.identifiers[0]}}
    existing = vpapi.getfirst('people', where=lookup)
    if existing:
        # update by PUT is preferred over PATCH to correctly remove
        # properties that no longer exist now
        resp = vpapi.put('people', existing['id'], payload, effective_date=effective_date)
    else:
        resp = vpapi.post('people', payload)

    if resp['_status'] != 'OK':
        raise Exception(self.name, resp)
    return resp['id']
def save(scraped):
    """Create or update an organization and return its id.

    The organization is looked up by the first entry of
    ``scraped["identifiers"]``. When nothing matches, `scraped` is posted;
    otherwise the first matching item is replaced with PUT.

    Args:
        scraped: dict describing the organization; must contain a
            non-empty ``"identifiers"`` list.

    Returns:
        The ``"id"`` field of the API response.

    Raises:
        Exception: when the API response status is not ``"OK"``. The
            exception carries the organization's name (if present) and the
            response.
    """
    lookup = {"identifiers": {"$elemMatch": scraped["identifiers"][0]}}
    r = vpapi.get("organizations", where=lookup)
    if not r["_items"]:
        r = vpapi.post("organizations", scraped)
    else:
        # update by PUT is preferred over PATCH to correctly remove
        # properties that no longer exist now
        existing = r["_items"][0]
        r = vpapi.put("organizations/%s" % existing["id"], scraped)

    if r["_status"] != "OK":
        # This is a plain function: there is no `self` or `resp` in scope,
        # so report the scraped document's name and the actual response.
        raise Exception(scraped.get("name"), r)
    return r["id"]
def get_or_create(self, endpoint, item, refresh=False, where_keys=None):
    """Create `item` at `endpoint`, or update the matching existing record.

    The lookup filter is built from `where_keys` when given; otherwise it
    depends on `endpoint`:

    * ``memberships``: person, organization and start date (or "no start
      date" when the item has none), newest start date first
    * ``motions`` / ``speeches``: URL of the first source
    * ``vote-events``: ``motion_id`` when present, else ``start_date``
    * ``votes``: ``vote_event_id`` and ``voter_id``
    * ``events``: ``identifier``
    * anything else: the first entry of ``identifiers``

    Args:
        endpoint: name of the API resource.
        item: dict with the data to store.
        refresh: when true, the stored record is fetched again after saving
            and that fetched response is returned.
        where_keys: optional iterable of keys of `item` to match on.

    Returns:
        The API response dict with an added ``'_created'`` boolean.

    Raises:
        Exception: when the API response status is not ``'OK'``.
    """
    sort = []
    embed = []
    if where_keys:
        where = {key: item[key] for key in where_keys}
    elif endpoint == 'memberships':
        where = {
            'person_id': item['person_id'],
            'organization_id': item['organization_id'],
            'start_date': item.get('start_date', {"$exists": False}),
        }
        sort = [('start_date', -1)]
    elif endpoint in ('motions', 'speeches'):
        where = {'sources.url': item['sources'][0]['url']}
    elif endpoint == 'vote-events':
        embed = ['votes']
        if 'motion_id' in item:
            where = {'motion_id': item['motion_id']}
        else:
            where = {'start_date': item['start_date']}
    elif endpoint == 'votes':
        where = {
            'vote_event_id': item['vote_event_id'],
            'voter_id': item['voter_id'],
        }
    elif endpoint == 'events':
        where = {'identifier': item['identifier']}
    else:
        where = {'identifiers': {'$elemMatch': item['identifiers'][0]}}

    existing = vpapi.getfirst(endpoint, where=where, sort=sort)
    created = not existing
    if created:
        resp = vpapi.post(endpoint, item)
    else:
        resp = vpapi.put("%s/%s" % (endpoint, existing['id']), item)

    # Check the status before touching resp['_links']: a failed response
    # may not carry links, and a KeyError would hide the real error.
    if resp['_status'] != 'OK':
        raise Exception(resp)

    if created:
        self.log('Created %s' % resp['_links']['self']['href'], DEBUG)
    else:
        self.log('Updated %s' % resp['_links']['self']['href'], DEBUG)

    if refresh:
        resp = vpapi.get(
            resp['_links']['self']['href'], sort=sort, embed=embed)
    resp['_created'] = created
    return resp
def scrape_chamber(self):
    """Scrape the chambers of Moldova's parliament, save them and return them.

    Every non-empty link inside the ``LocalizedContent`` div of the history
    page is treated as a chamber. Its Roman-numeral term is cut out of the
    link text, mapped to an Arabic identifier and used to look up the term
    dates in ``self.terms``. The resulting organization document is posted
    to (or updated in) the ``organizations`` resource.

    Returns:
        The list of chamber documents that were sent to the API.

    Raises:
        Exception: when the API response status is not ``"OK"``.
    """
    url = ("http://www.parlament.md/Parlamentarismul%C3%AEnRepublicaMoldova/"
           "Istorie%C8%99ievolu%C8%9Bie/tabid/96/language/ro-RO/Default.aspx")
    chambers_to_fix = {"XII": "12", "XIII": "13", "XIV": "14", "XV": "15", "XVI": "16",
                       "XVII": "17", "XVIII": "18", "XIX": "19", "XX": "20"}
    chambers = []
    soup = scrape.download_html_file(url)
    print("\n\tScraping chambers from Moldova's parliament...")
    widgets = [' Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
               ' ', ETA(), " - Processed: ", Counter(), ' items ']
    pbar = ProgressBar(widgets=widgets)
    links = soup.find('div', {"class": "LocalizedContent"}).findAll('a')
    for each_a in pbar(links):
        name = each_a.get_text().strip()
        if name == "":
            continue

        url = "http://www.parlament.md" + each_a.get('href')
        if "(" in name:
            # Numeral sits between the first 'X' and the opening parenthesis.
            chamber_roman = name[name.index('X'):name.index('(')].replace('-a', "").strip()
        else:
            # Numeral is taken from a fixed window near the end of the text.
            chamber_roman = name[-6:len(name)-3].strip()
        chamber_identifier = chambers_to_fix[chamber_roman]
        term = self.terms[chamber_identifier]
        founding_date = term['start_date']
        dissolution_date = term['end_date']

        chamber_json = self.build_organization_doc(
            "chamber", name, chamber_identifier, founding_date, dissolution_date, url, "", "")
        for unused_key in ('contact_details', 'parent_id'):
            del chamber_json[unused_key]
        if chamber_identifier == "20":
            del chamber_json['dissolution_date']

        lookup = {'identifiers': {'$elemMatch': chamber_json['identifiers'][0]}}
        existing = vpapi.getfirst("organizations", where=lookup)
        if existing:
            # update by PUT is preferred over PATCH to correctly remove
            # properties that no longer exist now
            resp = vpapi.put("organizations", existing['id'], chamber_json,
                             effective_date=self.effective_date())
        else:
            resp = vpapi.post("organizations", chamber_json)
        if resp["_status"] != "OK":
            raise Exception("Invalid status code")
        chambers.append(chamber_json)
    print("\n\tScraping completed! \n\tScraped " + str(len(chambers)) + " chambers")
    return chambers
def _scrape_progress_bar():
    """Return a new ProgressBar with the console widgets used while posting."""
    widgets = [' Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
               ' ', ETA(), " - Processed: ", Counter(), ' items ']
    return ProgressBar(widgets=widgets)


def _scrape_post_or_update(resource, where, json_doc):
    """POST `json_doc` when nothing matches `where`; otherwise PUT it if changed.

    Raises:
        Exception: when the API response status is not ``"OK"``.
    """
    existing = vpapi.getfirst(resource, where=where)
    if not existing:
        resp = vpapi.post(resource, json_doc)
    else:
        json_obj_id = existing['id']
        for item_delete in ("created_at", "updated_at", "_links", "id"):
            del existing[item_delete]
        # Round-trip the scraped document through JSON before comparing it
        # with the stored one; identical documents need no request.
        if json.loads(json.dumps(json_doc)) == existing:
            return
        # update by PUT is preferred over PATCH to correctly remove
        # properties that no longer exist now
        resp = vpapi.put(resource, json_obj_id, json_doc, effective_date=effective_date)
    if resp["_status"] != "OK":
        raise Exception("Invalid status code")


def _scrape_post_people_data(scraper, country):
    """Scrape people, chambers, groups and committees and send them to the API."""
    members = scraper.scrape_mp_bio_data()
    chamber = scraper.scrape_chamber()
    parliamentary_groups = scraper.scrape_parliamentary_groups()
    committee = scraper.scrape_committee()
    # The letter prefixes fix the posting order when the keys are sorted.
    data_collections = {
        "a-people": members,
        "b-chamber": chamber,
        "c-parliamentary_groups": parliamentary_groups,
        "d-committe": committee
    }
    match_on_parent = country in ("armenia", "belarus-upperhouse", "ukraine")

    # inserts data for each data collection in Visegrad+ Api
    for collection in sorted(data_collections):
        print("\n\tPosting and updating data to the Visegrad+ from %s data collection\n\n" %
              collection[2:])
        docs = data_collections[collection]
        if len(docs) == 0:
            continue
        resource = "people" if collection == "a-people" else "organizations"
        for json_doc in _scrape_progress_bar()(docs):
            if collection in ("a-people", "b-chamber"):
                where_condition = {'identifiers': {'$elemMatch': json_doc['identifiers'][0]}}
            elif match_on_parent:
                where_condition = {'name': json_doc['name'], "parent_id": json_doc['parent_id']}
            else:
                where_condition = {'name': json_doc['name']}
            _scrape_post_or_update(resource, where_condition, json_doc)
        print("\n\tFinished Posting and updating data from %s data collection\n" % collection[2:])


def _scrape_post_memberships(scraper, country):
    """Scrape the membership collections of one parliament and send them to the API."""
    if country != "georgia":
        memberships = {
            "chambers": scraper.scrape_membership(),
            "parliamentary_groups": scraper.scrape_parliamentary_group_membership(),
            "committees": scraper.scrape_committee_members()
        }
    else:
        memberships = {
            "chambers": scraper.scrape_membership()
        }

    for data_collection in memberships:
        docs = memberships[data_collection]
        if len(docs) > 0:
            print("\n\tPosting and updating data from %s membership data collection\n" % data_collection)
            for json_doc in _scrape_progress_bar()(docs):
                where_condition = {'organization_id': json_doc['organization_id'],
                                   "person_id": json_doc['person_id']}
                _scrape_post_or_update("memberships", where_condition, json_doc)
            print("\n\tFinished Posted and updated data from %s membership data collection\n" %
                  data_collection)
        else:
            print("\n\tThere is no data from %s membership data collection\n" % data_collection)


def _scrape_post_ukraine_votes(scraper, item):
    """Scrape and post events, vote events and votes of the Ukrainian parliament."""
    events = scraper.scrape_events()
    try:
        if len(events) > 0:
            for json_doc in _scrape_progress_bar()(events):
                existing_event = vpapi.getfirst("events", where={'identifier': json_doc['identifier']})
                if not existing_event:
                    resp = vpapi.post("events", json_doc)
                else:
                    # Address the stored record by its own id, as every other
                    # update in this file does.
                    resp = vpapi.put("events", existing_event['id'], json_doc,
                                     effective_date=effective_date)
                if resp["_status"] != "OK":
                    raise Exception("Invalid status code")
            print("\n\tFinished Posting and updating data from events data collection")
        else:
            print("\n\tThere are no new events")
    except Exception as ex:
        print(ex.message)
    else:
        # NOTE(review): this runs whenever the block above did not raise,
        # including after a successful post - confirm the message is intended.
        print("\tThere's not any event to post from %s parliament" % item)

    # The return value is not used here; the call is kept because the
    # original code made it (presumably for its side effects - TODO confirm).
    scraper.vote_events()
    voting_results = scraper.scrape_votes()
    try:
        if len(voting_results) > 0:
            resp = vpapi.post("votes", voting_results)
            if resp["_status"] != "OK":
                raise Exception("Invalid status code")
            print("\n\tFinished Posting and updating data from votes data collection")
    except Exception as ex:
        print(ex.message)


def _scrape_post_georgia_votes(scraper):
    """Scrape and post motions, vote events and votes of the Georgian parliament."""
    # The letter prefixes fix the posting order when the keys are sorted.
    voting_data_collections = {
        "amotions": scraper.motions(),
        "bvote-events": scraper.vote_events(),
    }
    voting_records = scraper.scrape_votes()
    for collection in sorted(voting_data_collections):
        try:
            if len(voting_data_collections[collection]) > 0:
                resp = vpapi.post(collection[1:], voting_data_collections[collection])
                if resp["_status"] != "OK":
                    raise Exception("Invalid status code")
                print("\n\tFinished Posting and updating data from %s data collection" % collection[1:])
        except Exception as ex:
            print(ex.message)

    print("\n\tPosting voting records from Georgia Parliament\n")
    try:
        if len(voting_records) > 0:
            vpapi.post("votes", voting_records)
            print("\n\tFinished Posting and updating data from votes data collection")
    except Exception as ex:
        print(ex.message)


def scrape(countries, people, votes):
    """Scrape the selected parliaments and post the data to the Visegrad+ API.

    Sets the module-level ``effective_date`` to today's ISO date, reads the
    API credentials from ``access.json`` in ``BASE_DIR`` and then, for every
    selected country in alphabetical order, authorizes against the API,
    posts the requested data and deauthorizes again. A country is skipped
    with a message when ``internet_on()`` reports no connection.

    Args:
        countries: ``"all"`` or a comma-separated list of country keys
            (``georgia``, ``armenia``, ``ukraine``, ``belarus-lowerhouse``,
            ``belarus-upperhouse``, ``moldova``). Unknown names are ignored.
        people: ``"yes"`` to scrape people, organizations and memberships.
        votes: ``"yes"`` to scrape voting data (only implemented for
            Ukraine and Georgia).

    Raises:
        Exception: when posting people, organizations or memberships
            returns a status other than ``"OK"``.
    """
    global effective_date
    effective_date = date.today().isoformat()

    # execute MP's bio data.
    georgia = georgia_scraper.GeorgiaScraper()
    armenia = armenia_scraper.ArmeniaScraper()
    ukraine = ukraine_scraper.UkraineScraper()
    belarus_lowerhouse = belarus_lowerhouse_scraper.BelarusLowerhouseScraper()
    belarus_upperhouse = belarus_upperhouse_scraper.BelarusUpperhouseScraper()
    moldova = moldova_scraper.MoldovaScraper()
    references = {"georgia": georgia, "armenia": armenia, "ukraine": ukraine,
                  "belarus-lowerhouse": belarus_lowerhouse, "moldova": moldova,
                  "belarus-upperhouse": belarus_upperhouse}

    if countries == "all":
        countries_array = list(references)
    else:
        # Keep only the names a scraper exists for. (list.pop() takes a
        # single integer index, so it cannot drop a list of positions.)
        countries_array = [country for country in countries.split(',')
                           if country.lower() in references]

    with open(os.path.join(BASE_DIR, 'access.json')) as f:
        creds = json.load(f)

    if not countries_array:
        print("\n\tInvalid country/ies added")
        return

    for item in sorted(countries_array):
        country = item.lower()
        scraper = references[country]
        if not internet_on():
            print("\n\tInternet connection problems for %s official parliament web page" % item)
            continue

        # scrape and post data from parliaments if there's internet connection
        print("\n\tPosting and updating data from %s parliament" % item)
        print("\tThis may take a few minutes...")
        country_creds = creds[country]
        vpapi.parliament(country_creds['parliament'])
        vpapi.timezone(country_creds['timezone'])
        vpapi.authorize(country_creds['api_user'], country_creds['password'])

        if people == "yes":
            _scrape_post_people_data(scraper, country)
            _scrape_post_memberships(scraper, country)

        # `votes` must stay the caller's flag for every country, so the
        # scraped voting records are kept under a different name.
        if votes == "yes":
            if country == "ukraine":
                _scrape_post_ukraine_votes(scraper, item)
            elif country == "georgia":
                _scrape_post_georgia_votes(scraper)
            else:
                print("\n\tThere are no voting records for %s" % item)
        vpapi.deauthorize()