def __init__(self, *args, **kwargs):
    super(VisegradSpider, self).__init__(*args, **kwargs)
    # set up API access and run spider_opened when Scrapy opens the spider
    vpapi.parliament(self.get_parliament())
    vpapi.authorize(self.get_user(), self.get_password())
    dispatcher.connect(self.spider_opened, signals.spider_opened)
def __init__(self, log=None):
    # set up API access for this parliament
    vpapi.parliament(self.get_parliament())
    vpapi.authorize(self.get_user(), self.get_password())
    self._chamber = None
    self._ids = {}
    # fall back to Scrapy's global log function when no logger is injected
    if log is None:
        self.log = scrapy.log.msg
    else:
        self.log = log
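# A minimal usage sketch of the log-injection hook above, assuming a
# hypothetical concrete subclass ExampleScraper that implements
# get_parliament(), get_user() and get_password(); log messages are
# routed into the standard logging module instead of scrapy.log.msg:
import logging

logging.basicConfig(level=logging.INFO)
scraper = ExampleScraper(log=lambda msg, **kwargs: logging.info(msg))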
def main():
    # read command-line arguments
    ap = argparse.ArgumentParser('Scrapes data from Slovak parliament website http://nrsr.sk')
    ap.add_argument('--people', choices=['initial', 'recent', 'none'], default='recent',
        help='scrape of people, organizations and memberships')
    ap.add_argument('--votes', choices=['initial', 'recent', 'none'], default='recent',
        help='scrape of motions and votes')
    ap.add_argument('--debates', choices=['initial', 'recent', 'none'], default='recent',
        help='scrape of speeches from debates')
    ap.add_argument('--term', help='term to scrape recent data from; current term is used when omitted')
    args = ap.parse_args()

    # set up logging to a local file
    if not os.path.exists(LOGS_DIR):
        os.makedirs(LOGS_DIR)
    logname = datetime.utcnow().strftime('%Y-%m-%d-%H%M%S') + '.log'
    logname = os.path.join(LOGS_DIR, logname)
    logname = os.path.abspath(logname)
    logging.basicConfig(level=logging.DEBUG, format='%(message)s',
        handlers=[logging.FileHandler(logname, 'w', 'utf-8')])
    logging.getLogger('requests').setLevel(logging.ERROR)
    logging.info('Started')

    try:
        # set up the API access
        vpapi.parliament('sk/nrsr')
        vpapi.timezone('Europe/Bratislava')
        with open(os.path.join(CONF_DIR, 'private.json'), encoding='utf8') as f:
            creds = json.load(f)
        vpapi.authorize(creds['api_user'], creds['password'])

        # indicate that the scraper has started
        db_log = vpapi.post('logs', {'status': 'running', 'file': logname, 'params': args.__dict__})

        # clear cached source files
        if scrapeutils.USE_WEBCACHE:
            logging.info('Clearing cached files')
            scrapeutils.clear_cache()

        # test parser functions
        logging.info('Testing parser functions')
        out = io.StringIO()
        suite = unittest.TestLoader().loadTestsFromModule(sys.modules['test'])
        result = unittest.TextTestRunner(stream=out).run(suite)
        logging.info(out.getvalue())
        if result.errors or result.failures:
            raise RuntimeError('Unit tests of parser functions failed, update canceled.')

        if args.people == 'initial':
            # initial scrape of the whole history of people and organizations
            logging.info('Initial scrape - deleting people, organizations and memberships')
            vpapi.delete('memberships')
            vpapi.delete('organizations')
            vpapi.delete('people')
            for term in sorted(parse.terms.keys()):
                scrape_people(term)
        elif args.people == 'recent':
            # incremental scrape of people and organizations since the last scrape
            term = args.term or parse.current_term()
            if term not in parse.terms:
                raise Exception('Unknown term `%s`. Scrape canceled. Add it to the terms list in parse.py '
                    'and rerun for the recently finished term once more.' % term)
            scrape_people(term)

        terms_with_old_debates = ('1', '2', '3', '4')
        if args.debates == 'initial':
            # initial scrape of debates from all terms
            logging.info('Initial scrape - deleting speeches and events')
            vpapi.delete('speeches')
            vpapi.delete('events')
            # newer terms are scraped first to get full names of unknown speakers
            for term in sorted(parse.terms.keys()):
                if term in terms_with_old_debates:
                    continue
                scrape_new_debates(term)
            for term in terms_with_old_debates:
                scrape_old_debates(term)
        elif args.debates == 'recent':
            # incremental scrape of debates since the last scrape
            term = args.term or parse.current_term()
            if term not in parse.terms:
                raise Exception('Unknown term `%s`. Scrape canceled. Add it to the terms list in parse.py '
                    'and rerun once more.' % term)
            if term in terms_with_old_debates:
                scrape_old_debates(term)
            else:
                scrape_new_debates(term)

        if args.votes == 'initial':
            # initial scrape of votes from all terms
            logging.info('Initial scrape - deleting votes, vote-events and motions')
            vpapi.delete('votes')
            vpapi.delete('vote-events')
            vpapi.delete('motions')
            for term in sorted(parse.terms.keys()):
                scrape_motions(term)
        elif args.votes == 'recent':
            # incremental scrape of votes since the last scrape
            term = args.term or parse.current_term()
            if term not in parse.terms:
                raise Exception('Unknown term `%s`. Scrape canceled. Add it to the terms list in parse.py '
                    'and rerun once more.' % term)
            scrape_motions(term)

        status = 'finished'

    except BaseException as e:
        logging.critical(e, exc_info=True)
        if hasattr(e, 'response') and hasattr(e.response, '_content'):
            logging.critical(e.response._content.decode('utf-8'))
        status = 'interrupted' if isinstance(e, KeyboardInterrupt) else 'failed'
        # output to console to provoke an e-mail from Cron
        print('Scraping of parliament sk/nrsr failed, see\n\n' + logname + '\n\nfor details.')

    finally:
        logging.info(status.capitalize())
        if 'db_log' in locals():
            vpapi.patch('logs', db_log['id'], {'status': status})
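# The scraper is presumably run as a script; a minimal sketch of the
# entry-point guard and a sample invocation (the script name scrape.py
# is an assumption):
if __name__ == '__main__':
    main()

# example:
#   python3 scrape.py --people recent --debates none --votes recent --term 6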
# adding some more motions and vote-events (> 60536)
import scrapeutils
import vpapi
import authentication
import io
import os.path
import logging
from datetime import date, datetime, timedelta
import argparse

LOGS_DIR = '/var/log/scrapers/cz/psp'

vpapi.parliament('cz/psp')
vpapi.authorize(authentication.username, authentication.password)
vpapi.timezone('Europe/Prague')

# motions, vote-events, votes:
def guess_majority(quorum, present):
    # a quorum of 120 marks a constitutional qualified majority; a quorum
    # of 101 with fewer than 200 present means a majority of all 200
    # representatives was required
    if int(quorum) == 120:
        return 'two-thirds representatives majority'
    if int(quorum) == 101 and int(present) < 200:
        return 'all representatives majority'
    else:
        return 'simple majority'

def result2result(res):
    # 'A' (ano = yes) means the motion passed; the original snippet is
    # truncated after the else, so the 'fail' branch is an assumed completion
    if res == "A":
        return "pass"
    else:
        return "fail"
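# A quick sanity check of the two mappers above; the expected values
# follow directly from their rules (the result2result 'fail' branch is
# the assumed completion noted above):
assert guess_majority(120, 180) == 'two-thirds representatives majority'
assert guess_majority(101, 150) == 'all representatives majority'
assert guess_majority(101, 200) == 'simple majority'
assert result2result('A') == 'pass'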
def scrape(countries, people, votes):
    global effective_date
    effective_date = date.today().isoformat()

    # instantiate a scraper for each supported parliament
    georgia = georgia_scraper.GeorgiaScraper()
    armenia = armenia_scraper.ArmeniaScraper()
    ukraine = ukraine_scraper.UkraineScraper()
    belarus_lowerhouse = belarus_lowerhouse_scraper.BelarusLowerhouseScraper()
    belarus_upperhouse = belarus_upperhouse_scraper.BelarusUpperhouseScraper()
    moldova = moldova_scraper.MoldovaScraper()
    references = {
        "georgia": georgia,
        "armenia": armenia,
        "ukraine": ukraine,
        "belarus-lowerhouse": belarus_lowerhouse,
        "moldova": moldova,
        "belarus-upperhouse": belarus_upperhouse,
    }

    countries_array = []
    if countries == "all":
        for key in references:
            countries_array.append(key)
    else:
        countries_array = countries.split(',')

    # collect indexes of unknown countries and drop them from the end, so
    # earlier indexes stay valid (the original passed the whole list to a
    # single pop() call, which raises TypeError)
    indexes = []
    for country in countries_array:
        if country.lower() not in references:
            indexes.append(countries_array.index(country))
    for index in sorted(indexes, reverse=True):
        countries_array.pop(index)

    with open(os.path.join(BASE_DIR, 'access.json')) as f:
        creds = json.load(f)

    if len(countries_array) > 0:
        for item in sorted(countries_array):
            if internet_on():  # scrape and post data from parliaments only if there's an internet connection
                print "\n\tPosting and updating data from %s parliament" % item
                print "\tThis may take a few minutes..."
                vpapi.parliament(creds[item.lower()]['parliament'])
                vpapi.timezone(creds[item.lower()]['timezone'])
                vpapi.authorize(creds[item.lower()]['api_user'], creds[item.lower()]['password'])

                if people == "yes":
                    members = references[item.lower()].scrape_mp_bio_data()
                    chamber = references[item.lower()].scrape_chamber()
                    parliamentary_groups = references[item.lower()].scrape_parliamentary_groups()
                    committee = references[item.lower()].scrape_committee()
                    # keys carry a letter prefix so that sorted() posts people
                    # before the organizations that reference them
                    data_collections = {
                        "a-people": members,
                        "b-chamber": chamber,
                        "c-parliamentary_groups": parliamentary_groups,
                        "d-committee": committee
                    }
                    # insert data of each collection into the Visegrad+ API
                    for collection in sorted(set(data_collections)):
                        widgets = [' Progress: ', Percentage(), ' ',
                                   Bar(marker='#', left='[', right=']'), ' ',
                                   ETA(), " - Processed: ", Counter(), ' items ']
                        pbar = ProgressBar(widgets=widgets)
                        print "\n\tPosting and updating data to the Visegrad+ from %s data collection\n\n" % \
                            collection[2:]
                        if len(data_collections[collection]) > 0:
                            for json_doc in pbar(data_collections[collection]):
                                if collection == "a-people":
                                    where_condition = {'identifiers': {'$elemMatch': json_doc['identifiers'][0]}}
                                    collection_of_data = "people"
                                elif collection in ("c-parliamentary_groups", "d-committee"):
                                    if item.lower() in ("armenia", "belarus-upperhouse", "ukraine"):
                                        where_condition = {'name': json_doc['name'], 'parent_id': json_doc['parent_id']}
                                    else:
                                        where_condition = {'name': json_doc['name']}
                                    collection_of_data = "organizations"
                                elif collection == "b-chamber":
                                    where_condition = {'identifiers': {'$elemMatch': json_doc['identifiers'][0]}}
                                    collection_of_data = "organizations"
                                existing = vpapi.getfirst(collection_of_data, where=where_condition)
                                if not existing:
                                    resp = vpapi.post(collection_of_data, json_doc)
                                else:
                                    json_obj_id = existing['id']
                                    for key_to_delete in ["created_at", "updated_at", "_links", "id"]:
                                        del existing[key_to_delete]
                                    if json.loads(json.dumps(json_doc)) == existing:
                                        continue
                                    # update by PUT is preferred over PATCH to correctly
                                    # remove properties that no longer exist now
                                    resp = vpapi.put(collection_of_data, json_obj_id, json_doc,
                                                     effective_date=effective_date)
                                if resp["_status"] != "OK":
                                    raise Exception("Invalid status code")
                        print "\n\tFinished posting and updating data from %s data collection\n" % collection[2:]

                    if item.lower() != "georgia":
                        memberships = {
                            "chambers": references[item.lower()].scrape_membership(),
                            "parliamentary_groups": references[item.lower()].scrape_parliamentary_group_membership(),
                            "committees": references[item.lower()].scrape_committee_members()
                        }
                    else:
                        memberships = {
                            "chambers": references[item.lower()].scrape_membership()
                        }
                    for data_collection in memberships:
                        widgets_stat = [' Progress: ', Percentage(), ' ',
                                        Bar(marker='#', left='[', right=']'), ' ',
                                        ETA(), " - Processed: ", Counter(), ' items ']
                        prog_bar = ProgressBar(widgets=widgets_stat)
                        if len(memberships[data_collection]) > 0:
                            print "\n\tPosting and updating data from %s membership data collection\n" % data_collection
                            for json_doc in prog_bar(memberships[data_collection]):
                                existing = vpapi.getfirst("memberships",
                                                          where={'organization_id': json_doc['organization_id'],
                                                                 'person_id': json_doc['person_id']})
                                if not existing:
                                    resp = vpapi.post("memberships", json_doc)
                                else:
                                    json_obj_id = existing['id']
                                    for key_to_delete in ["created_at", "updated_at", "_links", "id"]:
                                        del existing[key_to_delete]
                                    if json.loads(json.dumps(json_doc)) == existing:
                                        continue
                                    resp = vpapi.put("memberships", json_obj_id, json_doc,
                                                     effective_date=effective_date)
                                if resp["_status"] != "OK":
                                    raise Exception("Invalid status code")
                            print "\n\tFinished posting and updating data from %s membership data collection\n" % data_collection
                        else:
                            print "\n\tThere is no data from %s membership data collection\n" % data_collection
                            continue

                if votes == "yes":
                    if item.lower() == "ukraine":
                        events = references[item.lower()].scrape_events()
                        try:
                            if len(events) > 0:
                                widgets_events = [' Progress: ', Percentage(), ' ',
                                                  Bar(marker='#', left='[', right=']'), ' ',
                                                  ETA(), " - Processed: ", Counter(), ' items ']
                                pbar_events = ProgressBar(widgets=widgets_events)
                                for json_doc in pbar_events(events):
                                    existing_event = vpapi.getfirst("events",
                                                                    where={'identifier': json_doc['identifier']})
                                    if not existing_event:
                                        resp = vpapi.post("events", json_doc)
                                    else:
                                        # the original passed json_doc['id'] here, but freshly
                                        # scraped documents carry no 'id'; the existing event's
                                        # id is used instead
                                        resp = vpapi.put("events", existing_event['id'], json_doc,
                                                         effective_date=effective_date)
                                    if resp["_status"] != "OK":
                                        raise Exception("Invalid status code")
                                print "\n\tFinished posting and updating data from events data collection"
                            else:
                                print "\n\tThere are no new events"
                        except BaseException as ex:
                            print ex.message
                        else:
                            # try/else branch: runs only when no exception was raised above
                            print "\tThere's not any event to post from %s parliament" % item
                        motions_vote_events = references[item.lower()].vote_events()
                        voting_results = references[item.lower()].scrape_votes()
                        try:
                            if len(voting_results) > 0:
                                resp = vpapi.post("votes", voting_results)
                                if resp["_status"] != "OK":
                                    raise Exception("Invalid status code")
                                print "\n\tFinished posting and updating data from votes data collection"
                        except BaseException as ex:
                            print ex.message
                    elif item.lower() == "georgia":
                        # letter prefixes again order the POSTs: motions first,
                        # then the vote-events that reference them
                        voting_data_collections = {
                            "amotions": references[item.lower()].motions(),
                            "bvote-events": references[item.lower()].vote_events(),
                        }
                        # the original assigned to `votes` here, shadowing the
                        # function parameter and breaking the votes == "yes"
                        # check for any country processed later; renamed
                        georgia_votes = references[item.lower()].scrape_votes()
                        for collection in sorted(voting_data_collections):
                            try:
                                if len(voting_data_collections[collection]) > 0:
                                    resp = vpapi.post(collection[1:], voting_data_collections[collection])
                                    if resp["_status"] != "OK":
                                        raise Exception("Invalid status code")
                                    print "\n\tFinished posting and updating data from %s data collection" % collection[1:]
                            except BaseException as ex:
                                print ex.message
                        print "\n\tPosting voting records from Georgia Parliament\n"
                        try:
                            if len(georgia_votes) > 0:
                                vpapi.post("votes", georgia_votes)
                                print "\n\tFinished posting and updating data from votes data collection"
                        except BaseException as ex:
                            print ex.message
                    else:
                        print "\n\tThere are no voting records for %s" % item
                vpapi.deauthorize()
            else:
                print "\n\tInternet connection problems for %s official parliament web page" % item
                continue
    else:
        print "\n\tNo valid country names were given"
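# A sketch of driving scrape() from the command line; the argparse wiring
# is an assumption, only the parameter semantics come from the function above:
import argparse

ap = argparse.ArgumentParser(description='Scrape Eastern Partnership parliaments')
ap.add_argument('--countries', default='all',
                help='comma-separated names, e.g. "georgia,moldova", or "all"')
ap.add_argument('--people', choices=['yes', 'no'], default='yes')
ap.add_argument('--votes', choices=['yes', 'no'], default='yes')
args = ap.parse_args()
scrape(args.countries, args.people, args.votes)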
logname = os.path.join(LOGS_DIR, logname)
logname = os.path.abspath(logname)
logging.basicConfig(level=logging.DEBUG, format='%(message)s',
    handlers=[logging.FileHandler(logname, 'w', 'utf-8')])
logging.getLogger('requests').setLevel(logging.ERROR)
logging.info(datetime.utcnow().strftime('%Y-%m-%d-%H:%M:%S') + '\tStarted 2')

# indicate in the API log that the scraper has started
db_log = vpapi.post('logs', {
    'status': 'running',
    'file': logname,
    'params': []
})

vpapi.parliament('cz/senat')
vpapi.authorize(authentication.username, authentication.password)
vpapi.timezone('Europe/Prague')

# build a map from organization names to their API ids
o2id = {}
organizations = vpapi.getall("organizations")
for org in organizations:
    o2id[org['name']] = org['id']

# build a map from person names to their API ids
p2id = {}
persons = vpapi.getall('people')
for p in persons:
    p2id[p['name']] = p['id']

def pp2id(name, date, p2id):
    # resolve a person's name to an API id; the date parameter exists to
    # disambiguate the two senators (father and son) who share the name
    # below; the original snippet is truncated inside this branch, so
    # only a placeholder is kept here
    if name == 'Jiří Dienstbier':
        pass
    # plain name lookup as the general case (an assumed completion)
    return p2id[name]
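# The o2id/p2id maps above resolve scraped names to API ids; a hedged
# sketch of their intended use (the names below are made up):
membership = {
    'organization_id': o2id.get('Ústavně-právní výbor'),
    'person_id': pp2id('Jan Novák', '2014-10-20', p2id),
    'role': 'member',
}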
import scrapeutils
import vpapi
import io
import json

vpapi.parliament("cz/psp")
vpapi.authorize("admin", "secret")

def save(scraped):
    # look the organization up by its first identifier
    r = vpapi.get("organizations", where={"identifiers": {"$elemMatch": scraped["identifiers"][0]}})
    if not r["_items"]:
        r = vpapi.post("organizations", scraped)
    else:
        # update by PUT is preferred over PATCH to correctly remove
        # properties that no longer exist now
        existing = r["_items"][0]
        r = vpapi.put("organizations/%s" % existing["id"], scraped)
    if r["_status"] != "OK":
        # the original raised Exception(self.name, resp) with names that are
        # undefined in this scope; the failed response is reported instead
        raise Exception("saving organization failed", r)
    return r["id"]

# zfile = scrapeutils.download('http://www.psp.cz/eknih/cdrom/opendata/poslanci.zip', zipped=True)
# organy = scrapeutils.zipfile2rows(zfile, 'organy.unl')
## chamber:
# for row in organy:
#     if row[2] == '11':
#         term = row[3][3:]
#         org = {
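# A hedged example of calling save() above; the document fields, the
# identifier value and the scheme name are illustrative assumptions only:
org = {
    'name': 'Poslanecká sněmovna',
    'classification': 'chamber',
    'identifiers': [{'identifier': '170', 'scheme': 'psp.cz/organy'}],
}
org_id = save(org)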