Example #1
0
    def __init__(self, *args, **kwargs):
        """Initialise the spider and prepare the vpapi client.

        All positional and keyword arguments are forwarded unchanged to the
        parent class. The vpapi client is then pointed at the parliament
        returned by ``get_parliament()`` and authorised with the credentials
        returned by ``get_user()`` / ``get_password()``. Finally
        ``spider_opened`` is connected to the ``spider_opened`` signal.
        """
        super(VisegradSpider, self).__init__(*args, **kwargs)

        # The parliament is selected before authorising, as in the original order.
        parliament = self.get_parliament()
        vpapi.parliament(parliament)
        user, password = self.get_user(), self.get_password()
        vpapi.authorize(user, password)

        dispatcher.connect(self.spider_opened, signals.spider_opened)
    def __init__(self, log=None):
        """Prepare the vpapi client and the instance state.

        log: callable used for logging; when omitted (None),
            ``scrapy.log.msg`` is used instead.
        """
        vpapi.parliament(self.get_parliament())
        vpapi.authorize(self.get_user(), self.get_password())

        # Start with no chamber and an empty id mapping.
        self._chamber = None
        self._ids = {}
        self.log = scrapy.log.msg if log is None else log
'''Creates people from the API.

See people-example.json.
'''

import vpapi
import json

# Point the vpapi client at the "cz/psp" parliament for all following calls.
vpapi.parliament("cz/psp")

# Fields copied unchanged from every person record returned by the API.
PERSON_FIELDS = (
    "id",
    "name",
    "birth_date",
    "gender",
    "sort_name",
    "given_name",
    "identifiers",
    "family_name",
)

# people maps a person's id to the subset of their record kept by this script.
people = {}
# NOTE(review): `next` shadows the builtin; the name is kept because the rest
# of the loop (not visible here) presumably updates it together with `page`.
next = True
page = 1
while next:
    peo = vpapi.get("people", page=page)
    for p in peo["_items"]:
        people[p["id"]] = {field: p[field] for field in PERSON_FIELDS}
        # honorific_prefix is copied only when the record carries it
        if "honorific_prefix" in p:
            people[p["id"]]["honorific_prefix"] = p["honorific_prefix"]
Example #4
0
def main():
	"""Run the scraper of the Slovak parliament (sk/nrsr) as selected on the command line.

	Parses the --people, --votes, --debates and --term arguments, logs to a
	timestamped file in LOGS_DIR, registers the run in the API `logs` resource,
	runs the parser unit tests and then performs the requested scrapes.

	Every exception, including KeyboardInterrupt, is logged instead of being
	propagated. The final status ('finished', 'interrupted' or 'failed') is
	logged and, when the API log entry was created, patched into that entry.
	"""
	# read command-line arguments
	ap = argparse.ArgumentParser('Scrapes data from Slovak parliament website http://nrsr.sk')
	ap.add_argument('--people', choices=['initial', 'recent', 'none'], default='recent', help='scrape of people, organizations and memberships')
	ap.add_argument('--votes', choices=['initial', 'recent', 'none'], default='recent', help='scrape of motions and votes')
	ap.add_argument('--debates', choices=['initial', 'recent', 'none'], default='recent', help='scrape of speeches from debates')
	ap.add_argument('--term', help='term to scrape recent data from; current term is used when omitted')
	args = ap.parse_args()

	# set-up logging to a local file
	# exist_ok avoids the race between checking for the directory and creating it
	os.makedirs(LOGS_DIR, exist_ok=True)
	logname = datetime.utcnow().strftime('%Y-%m-%d-%H%M%S') + '.log'
	logname = os.path.join(LOGS_DIR, logname)
	logname = os.path.abspath(logname)
	logging.basicConfig(level=logging.DEBUG, format='%(message)s', handlers=[logging.FileHandler(logname, 'w', 'utf-8')])
	logging.getLogger('requests').setLevel(logging.ERROR)

	logging.info('Started')
	# stays None until the run has been registered in the API, so that the
	# `finally` clause knows whether there is a log entry to update
	db_log = None
	try:
		# set-up the API access
		vpapi.parliament('sk/nrsr')
		vpapi.timezone('Europe/Bratislava')
		with open(os.path.join(CONF_DIR, 'private.json'), encoding='utf8') as f:
			creds = json.load(f)
		vpapi.authorize(creds['api_user'], creds['password'])

		# indicate that the scraper has started
		db_log = vpapi.post('logs', {'status': 'running', 'file': logname, 'params': args.__dict__})

		# clear cached source files
		if scrapeutils.USE_WEBCACHE:
			logging.info('Clearing cached files')
			scrapeutils.clear_cache()

		# test parser functions
		logging.info('Testing parser functions')
		out = io.StringIO()
		suite = unittest.TestLoader().loadTestsFromModule(sys.modules['test'])
		result = unittest.TextTestRunner(stream=out).run(suite)
		logging.info(out.getvalue())
		if result.errors or result.failures:
			raise RuntimeError('Unit tests of parser functions failed, update canceled.')

		if args.people == 'initial':
			# initial scrape of all history of people and organizations
			logging.info('Initial scrape - deleting people, organizations and memberships')
			vpapi.delete('memberships')
			vpapi.delete('organizations')
			vpapi.delete('people')
			for term in sorted(parse.terms.keys()):
				scrape_people(term)

		elif args.people == 'recent':
			# incremental scrape of people and organizations since the last scrape
			term = args.term or parse.current_term()
			if term not in parse.terms:
				raise Exception('Unknown term `%s`. Scrape canceled. Add it to the terms list in parse.py an rerun for the recently finished term once more.' % term)
			scrape_people(term)

		# these terms are handled by scrape_old_debates, all others by scrape_new_debates
		terms_with_old_debates = ('1', '2', '3', '4')
		if args.debates == 'initial':
			# initial scrape of debates from all terms
			logging.info('Initial scrape - deleting speeches and events')
			vpapi.delete('speeches')
			vpapi.delete('events')
			# newer terms are scraped first to get full names of unknown speakers
			for term in sorted(parse.terms.keys()):
				if term in terms_with_old_debates:
					continue
				scrape_new_debates(term)
			for term in terms_with_old_debates:
				scrape_old_debates(term)

		elif args.debates == 'recent':
			# incremental scrape of debates since the last scrape
			term = args.term or parse.current_term()
			if term not in parse.terms:
				raise Exception('Unknown term `%s`. Scrape canceled. Add it to the terms list in parse.py an rerun once more.' % term)
			if term in terms_with_old_debates:
				scrape_old_debates(term)
			else:
				scrape_new_debates(term)

		if args.votes == 'initial':
			# initial scrape of votes from all terms
			logging.info('Initial scrape - deleting votes, vote-events and motions')
			vpapi.delete('votes')
			vpapi.delete('vote-events')
			vpapi.delete('motions')
			for term in sorted(parse.terms.keys()):
				scrape_motions(term)

		elif args.votes == 'recent':
			# incremental scrape of votes since the last scrape
			term = args.term or parse.current_term()
			if term not in parse.terms:
				raise Exception('Unknown term `%s`. Scrape canceled. Add it to the terms list in parse.py an rerun once more.' % term)
			scrape_motions(term)

		status = 'finished'

	except BaseException as e:
		# BaseException is caught on purpose so that Ctrl-C and exits are
		# recorded as a status too instead of leaving the log entry 'running'
		logging.critical(e, exc_info=True)
		if hasattr(e, 'response') and hasattr(e.response, '_content'):
			logging.critical(e.response._content.decode('utf-8'))
		status = 'interrupted' if isinstance(e, KeyboardInterrupt) else 'failed'

		# output to console to provoke an e-mail from Cron
		print('Scraping of parliament sk/nrsr failed, see\n\n' + logname + '\n\nfor details.')

	finally:
		logging.info(status.capitalize())
		if db_log is not None:
			vpapi.patch('logs', db_log['id'], {'status': status})
#adding some more motions, vote-events (> 60536)

import scrapeutils
import vpapi
import authentication
import io
import os.path
import logging
from datetime import date, datetime, timedelta
import argparse

# Directory path for this scraper's log files.
LOGS_DIR = '/var/log/scrapers/cz/psp'

# Configure the vpapi client at import time: select the 'cz/psp' parliament,
# authorise with the credentials from the `authentication` module and set the
# timezone to Europe/Prague.
vpapi.parliament('cz/psp')
vpapi.authorize(authentication.username,authentication.password)
vpapi.timezone('Europe/Prague')


#motions, vote-events, votes:
def guess_majority(quorum,present):
    """Guess the kind of majority a vote required from its quorum.

    quorum: quorum of the vote; anything accepted by int().
    present: number of representatives present; anything accepted by int().
        It is only converted when the quorum equals 101.

    Returns 'two-thirds representatives majority' for a quorum of 120,
    'all representatives majority' for a quorum of 101 with fewer than 200
    present, and 'simple majority' in every other case.
    """
    needed = int(quorum)
    if needed == 120:
        return 'two-thirds representatives majority'
    # `present` is converted lazily so that an unparsable value is tolerated
    # whenever the quorum alone decides the answer
    if needed == 101 and int(present) < 200:
        return 'all representatives majority'
    return 'simple majority'

def result2result(res):
    if res == "A":
        return "pass"
    else:
Example #6
0
'''Creates vote-events from the API and updates issue.json.

See vote-events-example.json.
'''

import vpapi
import json

# Point the vpapi client at the "cz/psp" parliament for all following calls.
vpapi.parliament("cz/psp")

# Module-level state: an empty vote_events mapping plus the `next` flag and
# `page` counter.
# NOTE(review): `next` shadows the builtin of the same name.
vote_events = {}
next = True
page = 1

# Load the issue definition; its "vote_events" mapping is iterated below.
with open('../www/json/issue.json') as data_file:
    issue = json.load(data_file)

for key in issue["vote_events"]:
    rve = vpapi.get("vote-events",where={"identifier":key},embed=["motion"])
    try:
        ve = rve["_items"][0]
        issue["vote_events"][key]['available_vote_event'] = True
        vote_event = {
            "id": ve["id"],
            "motion": {
                "text": ve["motion"]["text"],
                "requirement": ve["motion"]["requirement"],
                "id": ve["motion"]["id"]
            },
            "start_date": ve["start_date"],
            "identifier": ve["identifier"],
            "result": ve["result"],
Example #7
0
def scrape(countries, people, votes):
    global effective_date
    effective_date = date.today().isoformat()

    # execute MP's bio data.
    georgia = georgia_scraper.GeorgiaScraper()
    armenia = armenia_scraper.ArmeniaScraper()
    ukraine = ukraine_scraper.UkraineScraper()
    belarus_lowerhouse = belarus_lowerhouse_scraper.BelarusLowerhouseScraper()
    belarus_upperhouse = belarus_upperhouse_scraper.BelarusUpperhouseScraper()
    moldova = moldova_scraper.MoldovaScraper()
    references = {"georgia": georgia, "armenia": armenia, "ukraine": ukraine,
                  "belarus-lowerhouse": belarus_lowerhouse, "moldova": moldova,
                  "belarus-upperhouse": belarus_upperhouse}
    countries_array = []
    if countries == "all":
        for key in references:
            countries_array.append(key)
    else:
        countries_array = countries.split(',')
        indexes = []
        for country in countries_array:
            if country.lower() not in references:
                indexes.append(countries_array.index(country))
        if len(indexes) > 0:
            countries_array.pop(indexes)
    with open(os.path.join(BASE_DIR, 'access.json')) as f:
        creds = json.load(f)
    if len(countries_array) > 0:
        for item in sorted(countries_array):
            if internet_on(): # scrape and post data from parliaments if there's internet connection
                print "\n\tPosting and updating data from %s parliament" % item
                print "\tThis may take a few minutes..."
                vpapi.parliament(creds[item.lower()]['parliament'])
                vpapi.timezone(creds[item.lower()]['timezone'])
                vpapi.authorize(creds[item.lower()]['api_user'], creds[item.lower()]['password'])
                if people == "yes":
                    members = references[item.lower()].scrape_mp_bio_data()
                    chamber = references[item.lower()].scrape_chamber()
                    parliamentary_groups = references[item.lower()].scrape_parliamentary_groups()
                    committee = references[item.lower()].scrape_committee()
                    data_collections = {
                        "a-people": members,
                        "b-chamber": chamber,
                        "c-parliamentary_groups": parliamentary_groups,
                        "d-committe": committee
                    }
                    # inserts data for each data collection in Visegrad+ Api
                    for collection in sorted(set(data_collections)):
                        widgets = ['        Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
                                   ' ', ETA(), " - Processed: ", Counter(), ' items             ']
                        pbar = ProgressBar(widgets=widgets)
                        print "\n\tPosting and updating data to the Visegrad+ from %s data collection\n\n" % \
                              collection[2:]
                        if len(data_collections[collection]) > 0:
                            for json_doc in pbar(data_collections[collection]):
                                if collection == "a-people":
                                    where_condition = {'identifiers': {'$elemMatch': json_doc['identifiers'][0]}}
                                    collection_of_data = "people"
                                elif collection == "c-parliamentary_groups" or collection == "d-committe":
                                    if item.lower() == "armenia" or item.lower() == "belarus-upperhouse"\
                                            or item.lower() == "ukraine":
                                        where_condition = {'name': json_doc['name'], "parent_id": json_doc['parent_id']}
                                    else:
                                        where_condition = {'name': json_doc['name']}
                                    collection_of_data = "organizations"
                                elif collection == "b-chamber":
                                    where_condition = {'identifiers': {'$elemMatch': json_doc['identifiers'][0]}}
                                    collection_of_data = "organizations"

                                existing = vpapi.getfirst(collection_of_data, where=where_condition)
                                if not existing:
                                    resp = vpapi.post(collection_of_data, json_doc)
                                else:
                                    json_obj_id = existing['id']
                                    items_to_delete = ["created_at", "updated_at", "_links", "id"]
                                    for item_delete in items_to_delete:
                                        del existing[item_delete]
                                    if json.loads(json.dumps(json_doc)) == existing:
                                        continue
                                    else:
                                        resp = vpapi.put(collection_of_data, json_obj_id, json_doc, effective_date=effective_date)

                                    # update by PUT is preferred over PATCH to correctly remove properties that no longer exist now
                                if resp["_status"] != "OK":
                                    raise Exception("Invalid status code")

                            print "\n\tFinished Posting and updating data from %s data collection\n" % collection[2:]
                    if item.lower() != "georgia":
                        memberships = {
                            "chambers": references[item.lower()].scrape_membership(),
                            "parliamentary_groups": references[item.lower()].scrape_parliamentary_group_membership(),
                            "committees": references[item.lower()].scrape_committee_members()
                        }
                    elif item.lower() == "georgia":
                        memberships = {
                            "chambers": references[item.lower()].scrape_membership()
                        }

                    for data_collection in memberships:
                        widgets_stat = ['        Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
                                        ' ', ETA(), " - Processed: ", Counter(), ' items             ']
                        prog_bar = ProgressBar(widgets=widgets_stat)
                        if len(memberships[data_collection]) > 0:
                            print "\n\tPosting and updating data from %s membership data collection\n" % data_collection
                            for json_doc in prog_bar(memberships[data_collection]):
                                existing = vpapi.getfirst("memberships", where={'organization_id': json_doc['organization_id'],
                                                                                "person_id": json_doc['person_id']})
                                if not existing:
                                    resp = vpapi.post("memberships", json_doc)
                                else:
                                    json_obj_id = existing['id']
                                    items_to_delete = ["created_at", "updated_at", "_links", "id"]
                                    for item_delete in items_to_delete:
                                        del existing[item_delete]
                                    if json.loads(json.dumps(json_doc)) == existing:
                                        continue
                                    else:
                                        resp = vpapi.put("memberships", json_obj_id, json_doc, effective_date=effective_date)
                                if resp["_status"] != "OK":
                                    raise Exception("Invalid status code")
                            print "\n\tFinished Posted and updated data from %s membership data collection\n" % data_collection
                        else:
                            print "\n\tThere is no data from %s membership data collection\n" % data_collection
                            continue
                if votes == "yes":
                    if item.lower() == "ukraine":
                        events = references[item.lower()].scrape_events()
                        try:
                            if len(events) > 0:
                                widgets_events = ['        Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
                                           ' ', ETA(), " - Processed: ", Counter(), ' items             ']
                                pbar_events = ProgressBar(widgets=widgets_events)
                                for json_doc in pbar_events(events):
                                    existing_event = vpapi.getfirst("events", where={'identifier': json_doc['identifier']})
                                    if not existing_event:
                                        resp = vpapi.post("events", json_doc)
                                    else:
                                        resp = vpapi.put("events", json_doc['id'], json_doc, effective_date=effective_date)
                                    if resp["_status"] != "OK":
                                        raise Exception("Invalid status code")
                                print "\n\tFinished Posting and updating data from events data collection"
                            else:
                                print "\n\tThere are no new events"
                        except BaseException as ex:
                            print ex.message
                        else:
                            print "\tThere's not any event to post from %s parliament" % item
                        motions_vote_events = references[item.lower()].vote_events()
                        voting_results = references[item.lower()].scrape_votes()
                        try:
                            if len(voting_results) > 0:
                                resp = vpapi.post("votes", voting_results)
                                if resp["_status"] != "OK":
                                    raise Exception("Invalid status code")
                                print "\n\tFinished Posting and updating data from votes data collection"
                        except BaseException as ex:
                            print ex.message
                    elif item.lower() == "georgia":
                        voting_data_collections = {
                            "amotions": references[item.lower()].motions(),
                            "bvote-events": references[item.lower()].vote_events(),
                        }
                        votes = references[item.lower()].scrape_votes()
                        for collection in sorted(voting_data_collections):
                            try:
                                if len(voting_data_collections[collection]) > 0:
                                    resp = vpapi.post(collection[1:], voting_data_collections[collection])
                                    if resp["_status"] != "OK":
                                        raise Exception("Invalid status code")
                                    print "\n\tFinished Posting and updating data from %s data collection" % collection[1:]
                            except BaseException as ex:
                                print ex.message

                        print "\n\tPosting voting records from Georgia Parliament\n"
                        try:
                            if len(votes) > 0:
                                vpapi.post("votes", votes)
                            print "\n\tFinished Posting and updating data from votes data collection"
                        except BaseException as ex:
                            print ex.message
                    else:
                        print "\n\tThere are no voting records for %s" % item
                vpapi.deauthorize()
            else:
                print "\n\tInternet connection problems for %s official parliament web page" % item
                continue
    else:
        print "\n\tInvalid country/ies added"
Example #8
0
# Log to a fresh file in LOGS_DIR named after the current UTC time.
logname = datetime.utcnow().strftime('%Y-%m-%d-%H%M%S') + '.log'
logname = os.path.join(LOGS_DIR, logname)
logname = os.path.abspath(logname)
logging.basicConfig(level=logging.DEBUG,
                    format='%(message)s',
                    handlers=[logging.FileHandler(logname, 'w', 'utf-8')])
# Only errors of the 'requests' logger are recorded.
logging.getLogger('requests').setLevel(logging.ERROR)

logging.info(datetime.utcnow().strftime('%Y-%m-%d-%H:%M:%S') + '\tStarted 2')
# Register this run in the API `logs` resource.
# NOTE(review): this post happens before vpapi.parliament()/authorize() below;
# confirm that an earlier part of the file has already configured the client.
db_log = vpapi.post('logs', {
    'status': 'running',
    'file': logname,
    'params': []
})

# Configure the vpapi client for the 'cz/senat' parliament.
vpapi.parliament('cz/senat')
vpapi.authorize(authentication.username, authentication.password)
vpapi.timezone('Europe/Prague')

# o2id maps an organization's name to its id; when several organizations share
# a name, the last one returned wins.
o2id = {}
organizations = vpapi.getall("organizations")
for org in organizations:
    o2id[org['name']] = org['id']

# p2id maps a person's name to their id, with the same last-one-wins behaviour.
p2id = {}
persons = vpapi.getall('people')
for p in persons:
    p2id[p['name']] = p['id']


def pp2id(name, date, p2id):
Example #9
0
def vote2vote(vote):
    """Convert a vote option to a number.

    Returns 1 for 'yes', -1 for 'no' as well as for 'abstain', and 0 for
    any other value.
    """
    if vote == 'yes':
        return 1
    # abstaining is scored the same as voting against
    if vote in ('no', 'abstain'):
        return -1
    return 0


answers = {}
groups = {}
mps = {}

vpapi.parliament('sk/nrsr')
for ve in ves:
    print(ve)
    vedb = vpapi.get("vote-events",
                     where={"sources.url": {
                         "$regex": "ID=" + ve + "$"
                     }})
    idd = vedb['_items'][0]['id']
    r = vpapi.getall("votes", where={"vote_event_id": idd})
    for row in r:
        try:
            answers[row['voter_id']]
        except:
            answers[row['voter_id']] = {"vote": {}}
        answers[row['voter_id']]['vote'][ve] = vote2vote(row['option'])
        if row['group_id'] is not None:
Example #10
0
def vote2vote(vote):
  """Map a vote option to its numeric score.

  'yes' scores 1, 'no' and 'abstain' both score -1, and every other
  value scores 0.
  """
  if vote == 'yes':
    score = 1
  elif vote == 'no' or vote == 'abstain':
    score = -1
  else:
    score = 0
  return score

answers = {}
groups = {}
mps = {}

vpapi.parliament('sk/nrsr')
for ve in ves:
    print(ve)
    vedb = vpapi.get("vote-events", where={"sources.url":{"$regex":"ID="+ve+"$"}})
    idd = vedb['_items'][0]['id']
    r = vpapi.getall("votes",where={"vote_event_id":idd})
    for row in r:
        try:
            answers[row['voter_id']]
        except:
            answers[row['voter_id']] = {"vote":{}}
        answers[row['voter_id']]['vote'][ve] = vote2vote(row['option'])
        if row['group_id'] is not None:
            try:
                groups[row['group_id']]
            except:
Example #11
0
        "code": "psp",
        "code_csv": "lower",
        "name": "Sněmovna"
    },
    {
        "code_api" : "cz/senat",
        "code": "senat",
        "code_csv": "upper",
        "name": "Senát"
    }
    
    
]

# For every parliament, fetch the votes of each vote event that has an id for
# that chamber and collect them per voter in `data`, keyed by
# "<chamber code>_<voter id>".
for p in parliaments:
    vpapi.parliament(p['code_api'])
    for ve in ves:
        vote_event_id = ve[p['code_csv'] + '_vote_event_id']
        # an empty id means there is nothing to fetch for this chamber
        if vote_event_id != '':
            votes = vpapi.getall("votes",where={"vote_event_id":vote_event_id})
            print(vote_event_id)
            for v in votes:
                voter_key = p['code'] + '_' + v['voter_id']
                # first vote seen for this voter: create the record
                if voter_key not in data:
                    data[voter_key] = {
                        'votes': {},
                        'chamber': p['code'],
                        'chamber_name': p['name'],
                        'id': v['voter_id'],
                    }
                # the option's value from `o2o` is multiplied by this vote event's polarity
                data[voter_key]['votes'][ve['id']] = o2o[v['option']] * int(ve[p['code_csv'] + '_polarity'])
                # overwritten on every vote, so the last group seen is kept
                data[voter_key]['group_id'] = v['group_id']