def my_put(resource, item, vpapi):
    """Upsert `item` into the API `resource`.

    Any existing item with the same id is deleted first, then the new
    version is POSTed (a manual replacement for vpapi.put).
    """
    ex = vpapi.get(resource, where={"id": item['id']})
    if len(ex['_items']) >= 1:
        # somehow vpapi.put does not work for me, so delete and post
        # vpapi.put(resource,item['id'],item)
        vpapi.delete(resource, item['id'])
    # NOTE(review): post assumed unconditional (upsert semantics) — confirm
    vpapi.post(resource, item)
def scrape_chamber(self):
    # Scrapes chambers and Returns the list of chambers with all the information needed for each
    url = "http://www.parliament.am/deputies.php?sel=ful&ord=photo&show_session=5&lang=arm&enc=utf8"
    soup = scrape.download_html_file(url)
    chambers_list = []
    print "\n\tScraping chambers from Armenia's parliament...\n"
    widgets = [' Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA(), " - Processed: ", Counter(), ' items ']
    pbar = ProgressBar(widgets=widgets)
    # each <option> of the session selector is one chamber (term)
    all_options = soup.find("select", {"name": "show_session"}).findAll("option")
    for each_option in pbar(all_options):
        identifier = each_option.get('value')
        name = each_option.get_text()
        url = "http://www.parliament.am/deputies.php?lang=arm&sel=&ord=&show_session=" + identifier
        # values containing "100" are skipped — presumably non-term options; confirm
        if "100" not in identifier:
            founding_date = self.terms[identifier]["start_date"]
            dissolution_date = self.terms[identifier]["end_date"]
            chamber_json = self.build_organization_doc("chamber", name, identifier, founding_date, dissolution_date, url, "", "")
            del chamber_json['contact_details']
            del chamber_json['parent_id']
            # term "5" is the current one and has no dissolution date
            if identifier == "5":
                del chamber_json['dissolution_date']
            existing = vpapi.getfirst("organizations", where={'identifiers': {'$elemMatch': chamber_json['identifiers'][0]}})
            if not existing:
                resp = vpapi.post("organizations", chamber_json)
            else:
                resp = vpapi.put("organizations", existing['id'], chamber_json, effective_date=self.effective_date())
            if resp["_status"] != "OK":
                raise Exception("Invalid status code")
            chambers_list.append(chamber_json)
    print "\n\tScraping completed! \n\tScraped " + str(len(chambers_list)) + " chambers"
    return chambers_list
def log_start(self):
    """Record the start of a scraper run in the API `logs` resource.

    Includes the log file path when the LOG_FILE setting is present;
    keeps the API response in `self._log`.
    """
    entry = {'status': 'running'}
    log_file = settings.get('LOG_FILE')
    if log_file:
        entry['file'] = log_file
    self._log = vpapi.post('logs', entry)
def save(self, update_only=False):
    """If a compatible membership already exists, update it. Otherwise,
    create a new one. If `update_only` is True, only existing memberships
    are updated, no new one is created.
    Memberships are compatible if their fields `start_date`, `role` and
    `post` are compatible. Field 'end_date' is not checked to allow
    for later corrections of guessed end dates used when a member
    disappears from a group profile.
    """
    memberships = vpapi.getall('memberships',
        where={'person_id': self.person_id, 'organization_id': self.organization_id},
        sort='-start_date')
    to_save = self.__dict__.copy()
    id = None
    for existing in memberships:
        # compatible when start_date/role/post merge cleanly and the
        # intervals overlap (missing dates default to open bounds)
        if self._merge_values('start_date', to_save, existing) \
                and to_save.get('end_date', '9999-12-31') >= existing.get('start_date', '0001-01-01') \
                and self._merge_values('role', to_save, existing) \
                and self._merge_values('post', to_save, existing):
            id = existing['id']
            self._merge_values('end_date', to_save, existing)
            break
        else:
            # _merge_values may have partially mutated to_save; start over
            to_save = self.__dict__.copy()
    if id:
        resp = vpapi.put('memberships', id, to_save)
    else:
        if update_only:
            return
        resp = vpapi.post('memberships', self.__dict__)
    if resp['_status'] != 'OK':
        raise Exception(self.name, resp)
def scrape_chamber(self):
    # Iterates in every parliamentary group json document and
    # returns the list with the json document structure that Visegrad+ API accepts
    print "\n\tScraping chambers from Belarus Lowerhouse parliament..."
    chambers = parser.chambers()
    chambers_list = []
    url = "http://house.gov.by/index.php/,10087,,,,2,,,0.html"
    for chamber in chambers:
        chamber_json = self.build_organization_doc("chamber", chambers[chamber]['name'], chamber,
                                                   chambers[chamber]['start_date'], chambers[chamber]['end_date'], url, "", "")
        # chamber "2" is the current one and has no dissolution date
        if chamber == "2":
            del chamber_json['dissolution_date']
        del chamber_json['contact_details']
        del chamber_json['parent_id']
        existing = vpapi.getfirst("organizations", where={'identifiers': {'$elemMatch': chamber_json['identifiers'][0]}})
        if not existing:
            resp = vpapi.post("organizations", chamber_json)
        else:
            # update by PUT is preferred over PATCH to correctly remove properties that no longer exist now
            resp = vpapi.put("organizations", existing['id'], chamber_json, effective_date=self.effective_date())
        if resp["_status"] != "OK":
            raise Exception("Invalid status code")
        chambers_list.append(chamber_json)
    print "\n\tScraping completed! \n\tScraped " + str(len(chambers_list)) + " chambers"
    return chambers_list
def savevoteevent(self):
    """POST this vote-event to the API and return the response.

    Raises Exception when the API does not report an OK status.
    (A lookup-then-patch variant existed here but was disabled.)
    """
    resp = vpapi.post("vote-events", self)
    if resp["_status"] == "OK":
        return resp
    raise Exception(self.name, resp)
def savemotion(self):
    """POST this motion to the API and return the response.

    Raises Exception when the API does not report an OK status.
    (A lookup-then-put variant existed here but was disabled.)
    """
    resp = vpapi.post("motions", self)
    if resp["_status"] == "OK":
        return resp
    raise Exception(self.name, resp)
def save_organization(scraped):
    """Create or replace an organization in the API.

    Looks the organization up by its first identifier; when found it is
    replaced by delete + post (vpapi.put did not work here), otherwise it
    is created. Returns the id of the stored organization.

    Raises Exception when the API does not report an OK status.
    """
    r = vpapi.get('organizations', where={'identifiers': {'$elemMatch': scraped["identifiers"][0]}})
    if not r['_items']:
        r = vpapi.post('organizations', scraped)
        print ("POST " + scraped['id'])
    else:
        # update by PUT is preferred over PATCH to correctly remove
        # properties that no longer exist now, but vpapi.put did not work
        # here, so replace by delete + post instead
        existing = r['_items'][0]
        vpapi.delete("organizations", existing['id'])
        r = vpapi.post('organizations', scraped)
        print ("PUT " + scraped['id'])
    if r['_status'] != 'OK':
        # bug fix: `scraped.name` raised AttributeError on a dict and
        # masked the real API error; use the dict key instead
        raise Exception(scraped.get('name'), r)
    return r['id']
def saveperson(scraped):
    """Create or replace a person in the API.

    The person is matched by their "psp.cz/osoby" identifier; when found
    the record is replaced by delete + post (vpapi.put did not work here),
    otherwise it is created. Returns the id of the stored person.

    Raises Exception when the API does not report an OK status.
    """
    for ident in scraped["identifiers"]:
        if ident["scheme"] == "psp.cz/osoby":
            identifier = ident
            break
    r = vpapi.get('people', where={'identifiers': {'$elemMatch': identifier}})
    if not r['_items']:
        r = vpapi.post('people', scraped)
    else:
        # update by PUT is preferred over PATCH to correctly remove
        # properties that no longer exist now
        existing = r['_items'][0]
        # somehow vpapi.put does not work for me, so delete and post
        # r = vpapi.put('people', existing['id'], scraped)
        vpapi.delete("people", existing['id'])
        r = vpapi.post('people', scraped)
    if r['_status'] != 'OK':
        # bug fix: the original raised Exception(self.name, resp) — both
        # `self` and `resp` are undefined in this function (NameError)
        raise Exception(scraped.get('name'), r)
    return r['id']
def save(self):
    """Create this person in the API, or replace the stored record when a
    person with the same first identifier already exists.

    Returns the id of the stored person; raises Exception on a non-OK
    API status.
    """
    data = self.__dict__
    match = vpapi.getfirst('people',
        where={'identifiers': {'$elemMatch': self.identifiers[0]}})
    if match:
        # PUT rather than PATCH so properties that disappeared are removed
        resp = vpapi.put('people', match['id'], data, effective_date=effective_date)
    else:
        resp = vpapi.post('people', data)
    if resp['_status'] != 'OK':
        raise Exception(self.name, resp)
    return resp['id']
def save(scraped):
    """Create or update an organization in the API.

    The organization is matched by its first identifier; when found it is
    updated by PUT, otherwise created by POST. Returns the id of the
    stored organization.

    Raises Exception when the API does not report an OK status.
    """
    r = vpapi.get("organizations", where={"identifiers": {"$elemMatch": scraped["identifiers"][0]}})
    if not r["_items"]:
        r = vpapi.post("organizations", scraped)
    else:
        # update by PUT is preferred over PATCH to correctly remove
        # properties that no longer exist now
        existing = r["_items"][0]
        r = vpapi.put("organizations/%s" % existing["id"], scraped)
    if r["_status"] != "OK":
        # bug fix: the original raised Exception(self.name, resp) — both
        # `self` and `resp` are undefined in this function (NameError)
        raise Exception(scraped.get("name"), r)
    return r["id"]
def get_or_create(self, endpoint, item, refresh=False, where_keys=None):
    """Create `item` at `endpoint` unless a matching one already exists,
    in which case it is updated by PUT.

    The match query is built from `where_keys` when given, otherwise from
    an endpoint-specific set of identifying fields. Returns the API
    response with an extra `_created` flag; when `refresh` is True the
    stored item is re-fetched first.
    """
    sort = []
    embed = []
    where = {}
    if where_keys:
        # caller-specified identity fields
        for key in where_keys:
            where[key] = item[key]
    elif endpoint == 'memberships':
        where = {
            'person_id': item['person_id'],
            'organization_id': item['organization_id']
        }
        # a missing start_date must match only memberships without one
        where['start_date'] = item.get('start_date', {"$exists": False})
        sort = [('start_date', -1)]
    elif endpoint in ('motions', 'speeches'):
        where = {'sources.url': item['sources'][0]['url']}
    elif endpoint == 'vote-events':
        embed = ['votes']
        if 'motion_id' in item:
            where = {'motion_id': item['motion_id']}
        else:
            where = {'start_date': item['start_date']}
    elif endpoint == 'votes':
        where = {
            'vote_event_id': item['vote_event_id'],
            'voter_id': item['voter_id'],
        }
    elif endpoint == 'events':
        where = {'identifier': item['identifier']}
    else:
        # default: match on the first identifier
        where = {
            'identifiers': {'$elemMatch': item['identifiers'][0]}}
    created = False
    resp = vpapi.getfirst(endpoint, where=where, sort=sort)
    if not resp:
        resp = vpapi.post(endpoint, item)
        created = True
        self.log('Created %s' % resp['_links']['self']['href'], DEBUG)
    else:
        pk = resp['id']
        resp = vpapi.put("%s/%s" % (endpoint, pk), item)
        self.log('Updated %s' % resp['_links']['self']['href'], DEBUG)
    if resp['_status'] != 'OK':
        raise Exception(resp)
    if refresh:
        resp = vpapi.get(
            resp['_links']['self']['href'], sort=sort, embed=embed)
    resp['_created'] = created
    return resp
def savemembership(self):
    """Create or replace this membership in the API.

    A membership is matched by person, organization, role "member" and
    start date. An existing membership is replaced by delete + post
    (vpapi.put did not work here), unless its end_date already equals
    ours, in which case nothing is changed.

    Raises Exception when the API does not report an OK status.
    """
    r = vpapi.get('memberships', where={'person_id': self["person_id"], 'organization_id': self["organization_id"], "role": "member", "start_date": self["start_date"]})
    if not r['_items']:
        r = vpapi.post("memberships", self)
    else:
        # somehow vpapi.put does not work for me, so delete and post
        update = True
        try:
            if r['_items'][0]["end_date"] == self["end_date"]:
                update = False
                print("not updating: " + r['_items'][0]['id'])
        except KeyError:
            # bug fix: was a bare `except:` with a dead `nothing = 0`
            # placeholder; a missing end_date on either side simply means
            # the record must be updated
            pass
        if update:
            vpapi.delete("memberships", r['_items'][0]['id'])
            self['id'] = r['_items'][0]['id']
            r = vpapi.post('memberships', self)
            print("updating: " + self['id'])
    if r['_status'] != 'OK':
        raise Exception(self.name, r)
def get_or_create(resource, item, key=None):
    """Look `item` up in `resource` by the fields listed in `key` (all of
    the item's fields when `key` is omitted) and create it when no match
    is found.

    Returns a tuple (id, created) where `created` tells whether a new
    item was posted.
    """
    fields = item.keys() if key is None else key
    query = {}
    for field in fields:
        query[field] = item[field]
    found = vpapi.getfirst(resource, where=query)
    if not found:
        created = vpapi.post(resource, item)
        return created['id'], True
    return found['id'], False
def scrape_chamber(self):
    # Scrapes chambers and Returns the list of chambers with all the information needed for each
    url = "http://www.parlament.md/Parlamentarismul%C3%AEnRepublicaMoldova/" \
          "Istorie%C8%99ievolu%C8%9Bie/tabid/96/language/ro-RO/Default.aspx"
    # Roman term numerals as shown on the site mapped to plain identifiers
    chambers_to_fix = {"XII": "12", "XIII": "13", "XIV": "14", "XV": "15", "XVI": "16",
                       "XVII": "17", "XVIII": "18", "XIX": "19", "XX": "20"}
    chambers = []
    soup = scrape.download_html_file(url)
    print "\n\tScraping chambers from Moldova's parliament..."
    widgets = [' Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA(), " - Processed: ", Counter(), ' items ']
    pbar = ProgressBar(widgets=widgets)
    for each_a in pbar(soup.find('div', {"class": "LocalizedContent"}).findAll('a')):
        name = each_a.get_text().strip()
        if name != "":
            url = "http://www.parlament.md" + each_a.get('href')
            # the Roman numeral is extracted differently depending on
            # whether the link text carries a parenthesized suffix
            if "(" in name:
                chamber_roman = name[name.index('X'):name.index('(')].replace('-a', "").strip()
                chamber_identifier = chambers_to_fix[chamber_roman]
                founding_date = self.terms[chamber_identifier]['start_date']
                dissolution_date = self.terms[chamber_identifier]['end_date']
            else:
                chamber_roman = name[-6:len(name)-3].strip()
                chamber_identifier = chambers_to_fix[chamber_roman]
                founding_date = self.terms[chamber_identifier]['start_date']
                dissolution_date = self.terms[chamber_identifier]['end_date']
            chamber_json = self.build_organization_doc("chamber", name, chamber_identifier,
                                                       founding_date, dissolution_date, url, "", "")
            del chamber_json['contact_details']
            del chamber_json['parent_id']
            # term "20" is the current one and has no dissolution date
            if chamber_identifier == "20":
                del chamber_json['dissolution_date']
            existing = vpapi.getfirst("organizations", where={'identifiers': {'$elemMatch': chamber_json['identifiers'][0]}})
            if not existing:
                resp = vpapi.post("organizations", chamber_json)
            else:
                # update by PUT is preferred over PATCH to correctly remove properties that no longer exist now
                resp = vpapi.put("organizations", existing['id'], chamber_json,
                                 effective_date=self.effective_date())
            if resp["_status"] != "OK":
                raise Exception("Invalid status code")
            chambers.append(chamber_json)
    print "\n\tScraping completed! \n\tScraped " + str(len(chambers)) + " chambers"
    return chambers
def savemotion(self):
    """POST this motion to the API unless one with the same id exists."""
    existing = vpapi.get('motions', where={'id': self['id']})
    if not existing['_items']:
        vpapi.post("motions", self)
def main():
    """Entry point of the sk/nrsr scraper: parse CLI options, set up file
    logging and API access, run parser unit tests, then scrape people,
    debates and votes in either initial or incremental mode, recording
    the run's status in the API `logs` resource."""
    # read command-line arguments
    ap = argparse.ArgumentParser('Scrapes data from Slovak parliament website http://nrsr.sk')
    ap.add_argument('--people', choices=['initial', 'recent', 'none'], default='recent', help='scrape of people, organizations and memberships')
    ap.add_argument('--votes', choices=['initial', 'recent', 'none'], default='recent', help='scrape of motions and votes')
    ap.add_argument('--debates', choices=['initial', 'recent', 'none'], default='recent', help='scrape of speeches from debates')
    ap.add_argument('--term', help='term to scrape recent data from; current term is used when omitted')
    args = ap.parse_args()

    # set-up logging to a local file
    if not os.path.exists(LOGS_DIR):
        os.makedirs(LOGS_DIR)
    logname = datetime.utcnow().strftime('%Y-%m-%d-%H%M%S') + '.log'
    logname = os.path.join(LOGS_DIR, logname)
    logname = os.path.abspath(logname)
    logging.basicConfig(level=logging.DEBUG, format='%(message)s', handlers=[logging.FileHandler(logname, 'w', 'utf-8')])
    logging.getLogger('requests').setLevel(logging.ERROR)

    logging.info('Started')
    try:
        # set-up the API access
        vpapi.parliament('sk/nrsr')
        vpapi.timezone('Europe/Bratislava')
        with open(os.path.join(CONF_DIR, 'private.json'), encoding='utf8') as f:
            creds = json.load(f)
        vpapi.authorize(creds['api_user'], creds['password'])

        # indicate that the scraper has started
        db_log = vpapi.post('logs', {'status': 'running', 'file': logname, 'params': args.__dict__})

        # clear cached source files
        if scrapeutils.USE_WEBCACHE:
            logging.info('Clearing cached files')
            scrapeutils.clear_cache()

        # test parser functions
        logging.info('Testing parser functions')
        out = io.StringIO()
        suite = unittest.TestLoader().loadTestsFromModule(sys.modules['test'])
        result = unittest.TextTestRunner(stream=out).run(suite)
        logging.info(out.getvalue())
        if result.errors or result.failures:
            raise RuntimeError('Unit tests of parser functions failed, update canceled.')

        if args.people == 'initial':
            # initial scrape of all history of people and organizations
            logging.info('Initial scrape - deleting people, organizations and memberships')
            vpapi.delete('memberships')
            vpapi.delete('organizations')
            vpapi.delete('people')
            for term in sorted(parse.terms.keys()):
                scrape_people(term)
        elif args.people == 'recent':
            # incremental scrape of people and organizations since the last scrape
            term = args.term or parse.current_term()
            if term not in parse.terms:
                raise Exception('Unknown term `%s`. Scrape canceled. Add it to the terms list in parse.py an rerun for the recently finished term once more.' % term)
            scrape_people(term)

        terms_with_old_debates = ('1', '2', '3', '4')
        if args.debates == 'initial':
            # initial scrape of debates from all terms
            logging.info('Initial scrape - deleting speeches and events')
            vpapi.delete('speeches')
            vpapi.delete('events')
            # newer terms are scraped first to get full names of unknown speakers
            for term in sorted(parse.terms.keys()):
                if term in terms_with_old_debates:
                    continue
                scrape_new_debates(term)
            for term in terms_with_old_debates:
                scrape_old_debates(term)
        elif args.debates == 'recent':
            # incremental scrape of debates since the last scrape
            term = args.term or parse.current_term()
            if term not in parse.terms:
                raise Exception('Unknown term `%s`. Scrape canceled. Add it to the terms list in parse.py an rerun once more.' % term)
            if term in terms_with_old_debates:
                scrape_old_debates(term)
            else:
                scrape_new_debates(term)

        if args.votes == 'initial':
            # initial scrape of votes from all terms
            logging.info('Initial scrape - deleting votes, vote-events and motions')
            vpapi.delete('votes')
            vpapi.delete('vote-events')
            vpapi.delete('motions')
            for term in sorted(parse.terms.keys()):
                scrape_motions(term)
        elif args.votes == 'recent':
            # incremental scrape of votes since the last scrape
            term = args.term or parse.current_term()
            if term not in parse.terms:
                raise Exception('Unknown term `%s`. Scrape canceled. Add it to the terms list in parse.py an rerun once more.' % term)
            scrape_motions(term)

        status = 'finished'

    except BaseException as e:
        logging.critical(e, exc_info=True)
        if hasattr(e, 'response') and hasattr(e.response, '_content'):
            logging.critical(e.response._content.decode('utf-8'))
        status = 'interrupted' if isinstance(e, KeyboardInterrupt) else 'failed'

        # output to console to provoke an e-mail from Cron
        print('Scraping of parliament sk/nrsr failed, see\n\n' + logname + '\n\nfor details.')

    finally:
        logging.info(status.capitalize())
        # patch the run's log entry only when it was actually created
        if 'db_log' in locals():
            vpapi.patch('logs', db_log['id'], {'status': status})
def export_speeches(self):
    """Export locally stored speeches to the API, resolving each
    speaker's name to a person id (creating missing persons)."""
    speeches = self.load_json('speeches')
    people = {}
    # strips chairman/president/secretary-general titles from names
    prefix_regex = re.compile(
        ur'(pred\u015bedavaju\u0107i )|(pred\u015bednik )|\
(generalni sekretar )', re.U)
    for p in vpapi.getall('people'):
        name = self.normalize_name(p['name'])
        people[name] = p['id']
    for speech in speeches:
        # translate local session id to the API event id
        session_id = speech.get('event_id')
        speech['event_id'] = self.events_ids[session_id]
        url = speech['sources'][0]['url']
        if url.endswith('.pdf'):
            # PDF transcripts are split into individual speeches
            parsed_speeches = self.download_pdf(url)
            for n, s in enumerate(parsed_speeches):
                text_speech = speech.copy()
                text_speech['text'] = s['text']
                text_speech['position'] = n + 1
                text_speech['type'] = 'speech'
                creator = self.normalize_name(s['creator'])
                creator = prefix_regex.sub('', creator)
                if creator in people:
                    text_speech['creator_id'] = people[creator]
                else:
                    # fall back to substring match against known names
                    creator_id = None
                    for name in people:
                        if name in creator:
                            creator_id = people[name]
                            break
                    if creator_id is None:
                        # last resort: case-insensitive regex lookup in the API
                        resp = vpapi.getfirst('people', where={
                            'name': {
                                '$regex': s['creator'],
                                'options': 'i'
                            }
                        })
                        if resp is None:
                            self.log(
                                'Person "%(creator)s" not found. \
Creating one' % s, WARNING)
                            item = {
                                'name': s['creator'],
                                'sources': text_speech['sources']
                            }
                            resp = vpapi.post('people', item)
                        creator_id = resp['id']
                        people[creator] = creator_id
                    text_speech['creator_id'] = creator_id
                self.get_or_create('speeches', text_speech, where_keys=['event_id', 'position'])
        else:
            self.get_or_create('speeches', speech)
# NOTE(review): fragment — the opening of this statement (presumably a
# re.search over the same href, bound to `gid`) lies outside this chunk;
# the indentation below is reconstructed and should be verified.
                a.xpath('@href')[0]).group(1).strip()
            o = re.search('O=(\d{1,})', a.xpath('@href')[0]).group(1).strip()
            groups[a.text]["identifiers"][gid] = {
                "scheme": "senat.cz/" + o,
                "identifier": gid
            }

    # save it
    j = 0
    for person in people:
        print(j)
        j += 1
        # POST each person only when no item with that id exists yet
        ex = vpapi.get("people", where={"id": person['id']})
        if len(ex['_items']) < 1:
            vpapi.post("people", person)
    #vpapi.post("people",people)

    group = {"name": "Senát Parlamentu ČR", "classification": "chamber", "id": "1"}
    # some are not available by the algorithm above:
    vpapi.post("organizations", group)
    group = {
        "name": "Nezařazení",
        "classification": "political group",
        "parent_id": "1",
        "id": str(iid)
    }
    iid += 1
    vpapi.post("organizations", group)
    group = {
        "name": "Senátorský klub Zelení - nezávislí",
        "classification": "political group",
# NOTE(review): fragment truncated here — the rest of this dict and the
# following statements lie outside this chunk.
# NOTE(review): fragment — the function whose tail appears below starts
# outside this chunk; the indentation is reconstructed.
        test[row[0].strip()] = {"id":row[0].strip(),"ve":True}
    logging.info('Motion and vote-event saved: ' + str(r_motion['id']))

# set-up logging to a local file
if not os.path.exists(LOGS_DIR):
    os.makedirs(LOGS_DIR)
logname = datetime.utcnow().strftime('%Y-%m-%d-%H%M%S') + '.log'
logname = os.path.join(LOGS_DIR, logname)
logname = os.path.abspath(logname)
logging.basicConfig(level=logging.DEBUG, format='%(message)s', handlers=[logging.FileHandler(logname, 'w', 'utf-8')])
logging.getLogger('requests').setLevel(logging.ERROR)

logging.info('Started')
db_log = vpapi.post('logs', {'status': 'running', 'file': logname, 'params': []})

terms = [1993, 1996, 1998, 2002, 2006, 2010, 2013]
# only the 2013 term is actually processed; the full list above is kept
# for reference
terms = [2013]
test = {}
#terms = [2010]
for term in terms:
    # download the open-data dump of the term and store all motions and
    # vote-events from it
    zfile = scrapeutils.download('http://www.psp.cz/eknih/cdrom/opendata/hl-'+str(term)+'ps.zip',zipped=True)
    hl_hlasovani = scrapeutils.zipfile2rows(zfile,'hl'+str(term)+'s.unl')
    saveallmotionsandvoteevents(hl_hlasovani)
#j = 0
#last_ve_id = 0
def vote_events(self):
    """Scrape motions and vote-events from Ukraine's parliament and POST
    the new ones to the API; returns (motions, voting_events) lists of
    the documents built in this run."""
    print "\n\n\tScraping Motions and Vote Events data from Ukraine's parliament..."
    vote_events = parser.vote_events_list()
    # resume from the first item not yet present in either resource
    index_vote_events = self.get_index("vote-events", "-start_date", vote_events)
    index_motions = self.get_index("motions", "-date", vote_events)
    index = min(index_vote_events, index_motions)
    voting_events = []
    motions = []
    if len(vote_events) > 0:
        print "\n\n\tPosting Motions and Vote events data to the Visegrad+ API from Ukraine's parliament..."
        if len(vote_events[index:]) > 0:
            widgets = [
                " Progress: ", Percentage(), " ",
                Bar(marker="#", left="[", right="]"), " ",
                ETA(), " - Processed: ", Counter(), " events ",
            ]
            pbar = ProgressBar(widgets=widgets)
            for motion in pbar(vote_events[index:]):
                json_motion = self.build_json_motion(
                    motion["date"][:19], motion["sources"][0]["url"], motion["id"],
                    motion["legislative_session_id"], motion["organization_id"],
                    motion["text"], motion["result"],
                )
                motions.append(json_motion)
                existing = vpapi.getfirst("motions", where={"identifier": json_motion["identifier"]})
                if not existing:
                    vpapi.post("motions", json_motion)
                else:
                    # NOTE(review): an existing motion skips the vote-event
                    # step below as well — confirm this is intended
                    continue
                json_vote_event = self.build_vote_event_json(
                    motion["date"][:19], motion["legislative_session_id"], motion["id"],
                    motion["organization_id"], motion["result"], motion["counts"],
                )
                voting_events.append(json_vote_event)
                existing1 = vpapi.getfirst("vote-events", where={"id": json_vote_event["id"]})
                if not existing1:
                    vpapi.post("vote-events", json_vote_event)
                else:
                    continue
            print "\n\tFinished posting motions and vote events data."
            print "\tScraped %s motions and vote events" % str(len(vote_events[index:]) * 2)
        else:
            print "\n\tThere are no new motion and vote events data."
    else:
        print "\n\tThere are no new motions or vote events."
    return motions, voting_events
def savevoteevent(self):
    """POST this vote-event unless one with the same identifier exists."""
    found = vpapi.get('vote-events', where={'identifier': self["identifier"]})
    if not found['_items']:
        vpapi.post("vote-events", self)
def scrape_new_debates(term):
    """Scrape and save speeches from debates of the given term, one of those
    newer terms where transcripts of debates are published in parts
    assigned to individual speakers. Returns number of scraped speeches.
    """
    # maps the site's Slovak debate-part labels to speech types
    debate_part_kinds = {
        'Uvádzajúci uvádza bod': 'speech',
        'Vstup predsedajúceho': 'speech',
        'Vystúpenie spoločného spravodajcu': 'speech',
        'Vystúpenie': 'speech',
        'Vystúpenie v rozprave': 'speech',
        'Vystúpenie s faktickou poznámkou': 'speech',
        'Vystúpenie s procedurálnym návrhom': 'speech',
        'Prednesenie otázky': 'question',
        'Zodpovedanie otázky': 'answer',
        'Doplňujúca otázka / reakcia zadávajúceho': 'question',
        'Prednesenie interpelácie': 'question',
        'Odpoveď na interpeláciu': 'answer',
        'scene': 'scene'
    }

    def insert_speech(kind):
        """Insert a speech entity for the given debate part kind and data
        from parent scope variables and update end date of the corresponding
        session and sitting. Delete `text` variable."""
        nonlocal text, last_speech_enddatetime
        if not text:
            return
        speech = {
            'text': text.strip().replace('[', '(').replace(']', ')'),
            'date': start_datetime,
            'type': debate_part_kinds.get(kind, 'speech'),
            'position': len(speeches) + 1,
            'event_id': sitting_id,
            'sources' : [{
                'url': dpart_url,
                'note': 'Prepis časti debaty na webe NRSR'
            }]
        }
        if dpart_video:
            speech['video'] = dpart_video
        if kind != 'scene':
            speech['creator_id'] = speaker_id
            speech['attribution_text'] = attribution.strip()
        speeches.append(speech)
        text = ''

        # extend the enclosing session/sitting when this part ends later
        if end_datetime > session_end_date:
            vpapi.patch('events', session_id, {'end_date': end_datetime})
        if end_datetime > sitting_end_date:
            vpapi.patch('events', sitting_id, {'end_date': end_datetime})
        last_speech_enddatetime = datetime.strptime(end_datetime, '%Y-%m-%dT%H:%M:%S')

    logging.info('Scraping debates of term `%s`' % term)
    chamber_id = get_chamber_id(term)

    # prepare mapping from MP's name to id
    people = vpapi.getall('people', projection={'name': 1})
    mps = {mp['name']: mp['id'] for mp in people}

    # load name corrections
    with open(os.path.join(CONF_DIR, 'name_corrections.json'), encoding='utf8') as f:
        name_corrections = json.load(f)

    # scraping will start since the most recent sitting start date
    last_sitting = vpapi.getfirst('events', where={'type': 'sitting', 'organization_id': chamber_id}, sort='-start_date')
    since_date = last_sitting['start_date'][:10] if last_sitting else None

    # scrape list of debate parts
    debate_parts = parse.new_debates_list(term, since_date)

    speech_count = 0
    session_name = ''
    speeches = []
    for dp in debate_parts['_items']:
        # stop at very recent debate parts (may be incomplete)
        start_datetime = sk_to_utc('%s %s' % (dp['dátum'], dp['trvanie']['od']))
        sd = datetime.strptime(start_datetime, '%Y-%m-%dT%H:%M:%S')
        if datetime.utcnow() - sd < timedelta(days=5):
            break

        # skip already scraped debate parts
        existing = vpapi.getfirst('speeches', where={'sources.url': dp['prepis']['url']})
        if existing:
            continue

        logging.info('Scraping debate part %s %s-%s (id=%s)' % (dp['dátum'], dp['trvanie']['od'], dp['trvanie']['do'], dp['prepis']['id']))
        dpart = parse.debate_of_terms56(dp['prepis']['id'])
        if not dpart['riadky']:
            continue

        end_datetime = sk_to_utc('%s %s' % (dp['dátum'], dp['trvanie']['do']))
        dpart_kind = dp['druh']
        dpart_url = dp['prepis']['url']
        dpart_video = dp['video']['url'] if 'video' in dp else None

        if not session_name.startswith('%s. ' % dp['schôdza']):
            # create new session event
            session_name = '%s. schôdza' % dp['schôdza']
            session = {
                'name': session_name,
                'identifier': dp['schôdza'],
                'organization_id': chamber_id,
                'type': 'session',
                'start_date': start_datetime,
                'end_date': end_datetime,
            }
            key = ('organization_id', 'type', 'identifier')
            session_id, _ = get_or_create('events', session, key)
            session_end_date = end_datetime

            # find the last moment of the last sitting of this session
            session_last_sitting = vpapi.getfirst('events', where={'type': 'sitting', 'parent_id': session_id}, sort='-start_date')
            if session_last_sitting:
                last_speech_enddatetime = datetime.strptime(session_last_sitting['end_date'], '%Y-%m-%dT%H:%M:%S')
                sitting_identifier = session_last_sitting['identifier']
                sitting_id = session_last_sitting['id']
                sitting_end_date = session_last_sitting['end_date']
            else:
                last_speech_enddatetime = datetime.min
                sitting_identifier = '0'

        # a gap of more than 5 hours since the last speech starts a new sitting
        if sd - last_speech_enddatetime > timedelta(hours=5):
            # create new sitting event
            sitting_identifier = str(int(sitting_identifier) + 1)
            sitting_name = '%s. deň rokovania, %s' % (sitting_identifier, dp['dátum'])
            sitting = {
                'name': sitting_name,
                'identifier': sitting_identifier,
                'organization_id': chamber_id,
                'type': 'sitting',
                'start_date': start_datetime,
                'end_date': end_datetime,
                'parent_id': session_id,
            }
            key = ('parent_id', 'type', 'identifier')
            sitting_id, _ = get_or_create('events', sitting, key)
            sitting_end_date = end_datetime

            # save speeches of the previous sitting
            if len(speeches) > 0:
                vpapi.post('speeches', speeches)
                speech_count += len(speeches)
            if dp != debate_parts['_items'][0]:
                logging.info('Scraped %s speeches from previous sitting' % len(speeches))
            speeches = []

        # add the first speaker name that is sometimes missing
        first_speaker = '<strong>%s, %s</strong>' % (dp['osoba']['meno'], dp['osoba']['funkcia'])
        dpart['riadky'].insert(0, first_speaker)

        # extract speeches from the debate part
        text = ''
        within_scene = False
        for par in dpart['riadky']:
            if not par:
                continue
            par = par.replace('\n', ' ').strip()

            # skip eventual speech number
            if re.match('^(\d+)\.$', par):
                continue

            # convert brackets to parentheses
            par = re.sub(r'\[(.*?)\]', r'(\1)', par)
            # convert all inner nested parentheses to brackets
            n = 1
            while n >= 1:
                (par, n) = re.subn(r'\((.*?)\((\.*?)\)(.*?)\)', r'(\1[\2]\3)', par, flags=re.DOTALL)

            # process eventual multiparagraph scene
            if par.startswith('(') and par.count('(') > par.count(')'):
                # save eventual previous speech
                insert_speech(dpart_kind)
                text = '<p>%s</p>' % lxml.html.fromstring(par[1:]).text_content()
                within_scene = True
                continue
            if within_scene:
                if par.endswith(')') and par.count(')') > par.count('('):
                    text += '\n\n<p>%s</p>' % lxml.html.fromstring(par[:-1]).text_content()
                    insert_speech('scene')
                    within_scene = False
                else:
                    text += '\n\n<p>%s</p>' % lxml.html.fromstring(par).text_content()
                continue

            # process eventual new speaker
            # format `Doe, John, foreign minister`
            speech_start_pattern = r'<strong>(\w+), (\w+\.?)( (\w+\.?))?, (.*)</strong>'
            sp = re.match(speech_start_pattern, par, re.DOTALL)
            if sp:
                # save eventual previous speech
                insert_speech(dpart_kind)

                # identify speaker
                name = '%s %s' % (sp.group(2), sp.group(1))
                if (sp.group(4)):
                    name = name.replace(' ', ' %s ' % sp.group(4))
                attribution = sp.group(5)
                if name in name_corrections:
                    name = name_corrections[name]
                if len(name) == 0:
                    continue
                speaker_id = mps.get(name)

                # create unknown speakers
                if not speaker_id:
                    logging.warn('Speaker `%s, %s` not found, creating new Person' % (name, attribution))
                    name_parts = re.match(r'(\w+\.?)( (\w+\.?))? (\w+)', name)
                    person = {
                        'name': name,
                        'family_name': name_parts.group(4),
                        'given_name': name_parts.group(1)
                    }
                    person['sort_name'] = '%s, %s' % (person['family_name'], person['given_name'])
                    if name_parts.group(3):
                        person['additional_name'] = name_parts.group(3)
                        person['sort_name'] += ' %s' % person['additional_name']
                    resp = vpapi.post('people', person)
                    speaker_id = resp['id']
                    mps[name] = speaker_id
                continue

            # remove HTML tags
            par = lxml.html.fromstring(par).text_content()

            # process eventual scene in this paragraph
            scene_pattern = r'(.*?)\(\s*([\d%s][^\(\)]{2,}[\.?!“])\s*\)(.*)$' % scrapeutils.CS_UPPERS
            while True:
                scene = re.match(scene_pattern, par, re.DOTALL)
                if not scene:
                    break
                if scene.group(1):
                    text += '\n\n<p>%s</p>' % scene.group(1).strip()
                    insert_speech(dpart_kind)
                text = '<p>%s</p>' % scene.group(2).strip()
                insert_speech('scene')
                par = scene.group(3)

            if par:
                text += '\n\n<p>%s</p>' % par

        insert_speech(dpart_kind)

    if len(speeches) > 0:
        vpapi.post('speeches', speeches)
        logging.info('Scraped %s speeches' % len(speeches))
        speech_count += len(speeches)

    logging.info('Scraped %s speeches in total' % speech_count)
def export_speeches(self):
    """Export locally stored speeches to the API, resolving each
    speaker's name to a person id (creating missing persons).
    Near-duplicate of the earlier export_speeches in this file."""
    speeches = self.load_json('speeches')
    people = {}
    # strips chairman/president/secretary-general titles from names
    prefix_regex = re.compile(
        ur'(pred\u015bedavaju\u0107i )|(pred\u015bednik )|\
(generalni sekretar )', re.U)
    for p in vpapi.getall('people'):
        name = self.normalize_name(p['name'])
        people[name] = p['id']
    for speech in speeches:
        # translate local session id to the API event id
        session_id = speech.get('event_id')
        speech['event_id'] = self.events_ids[session_id]
        url = speech['sources'][0]['url']
        if url.endswith('.pdf'):
            # PDF transcripts are split into individual speeches
            parsed_speeches = self.download_pdf(url)
            for n, s in enumerate(parsed_speeches):
                text_speech = speech.copy()
                text_speech['text'] = s['text']
                text_speech['position'] = n + 1
                text_speech['type'] = 'speech'
                creator = self.normalize_name(s['creator'])
                creator = prefix_regex.sub('', creator)
                if creator in people:
                    text_speech['creator_id'] = people[creator]
                else:
                    # fall back to substring match against known names
                    creator_id = None
                    for name in people:
                        if name in creator:
                            creator_id = people[name]
                            break
                    if creator_id is None:
                        # last resort: case-insensitive regex lookup in the API
                        resp = vpapi.getfirst(
                            'people',
                            where={
                                'name': {
                                    '$regex': s['creator'],
                                    'options': 'i'
                                }
                            }
                        )
                        if resp is None:
                            self.log('Person "%(creator)s" not found. \
Creating one' % s, WARNING)
                            item = {
                                'name': s['creator'],
                                'sources': text_speech['sources']
                            }
                            resp = vpapi.post('people', item)
                        creator_id = resp['id']
                        people[creator] = creator_id
                    text_speech['creator_id'] = creator_id
                self.get_or_create(
                    'speeches', text_speech,
                    where_keys=['event_id', 'position']
                )
        else:
            self.get_or_create('speeches', speech)
# set-up logging to a local file if not os.path.exists(LOGS_DIR): os.makedirs(LOGS_DIR) logname = datetime.utcnow().strftime('%Y-%m-%d-%H%M%S') + '.log' logname = os.path.join(LOGS_DIR, logname) logname = os.path.abspath(logname) logging.basicConfig(level=logging.DEBUG, format='%(message)s', handlers=[logging.FileHandler(logname, 'w', 'utf-8')]) logging.getLogger('requests').setLevel(logging.ERROR) logging.info(datetime.utcnow().strftime('%Y-%m-%d-%H:%M:%S') + '\tStarted') db_log = vpapi.post('logs', { 'status': 'running', 'file': logname, 'params': [] }) try: # get all senators by districts and all political groups baseurl = 'http://senat.cz/' people = [] groups = {} iid = 100001 ## people for i in range(1, 82): print(i) url = "http://senat.cz/senat/volby/hledani/o_obvodu.php?ke_dni=" + time.strftime( "%d") + "." + time.strftime("%m") + "." + time.strftime(
def savevotes(hl_poslanec):
    """Convert raw ``hl_poslanec`` voting rows into Vote documents and post
    them to the API.

    Each row is expected to carry (MP source identifier, vote-event source
    identifier, option code) — TODO confirm against the caller.  All API
    lookups are memoized in local dicts so every vote event, person and
    membership list is fetched only once.
    """
    votes = {}          # vote-event API id -> list of vote documents
    voteevents = {}     # vote-event source identifier -> API response
    people = {}         # MP source identifier -> API response
    organizations = {}  # person API id -> memberships (embedded organization)
    terms = {}
    for rowp in hl_poslanec:
        # if rowp[0] == 0: erroneous vote in the source db, see http://www.psp.cz/sqw/hlasy.sqw?g=58297
        # try:
        #     terms[hl_hlasovani[i][1].strip()]
        # except:
        #     r_t = vpapi.get("organizations", where={'identifiers': {'$elemMatch': {"identifier": hl_hlasovani[0][1].strip(), "scheme": "psp.cz/organy"}}})
        #     for ident in r_t["_items"][0]["identifiers"]:
        #         if ident["scheme"] == "psp.cz/term":
        #             terms[hl_hlasovani[0][1].strip()] = ident["identifier"]

        # fetch the vote event unless cached (try/except used as "isset")
        try:
            voteevents[rowp[1].strip()]
        except:
            voteevents[rowp[1].strip()] = vpapi.get("vote-events", where={"identifier": rowp[1].strip()})
        r_voteevent = voteevents[rowp[1].strip()]

        # fetch the voting MP unless cached
        try:
            people[rowp[0].strip()]
        except:
            people[rowp[0].strip()] = vpapi.get(
                "people",
                where={
                    "identifiers": {
                        "$elemMatch": {
                            "identifier": rowp[0].strip(),
                            "scheme": {"$regex": "psp.cz/poslanec/*", "$options": "i"},
                        }
                    }
                },
            )
        r_pers = people[rowp[0].strip()]

        # fetch the MP's memberships (with embedded organizations) unless cached
        try:
            organizations[r_pers["_items"][0]["id"]]
        except:
            organizations[r_pers["_items"][0]["id"]] = vpapi.get(
                "memberships", where={"person_id": r_pers["_items"][0]["id"]}, embed=["organization"]
            )
        r_org = organizations[r_pers["_items"][0]["id"]]

        # find the political group the MP belonged to at the time of the vote:
        # membership started before the vote and either has no end_date
        # (still open) or ended after the vote
        for rowo in r_org["_items"]:
            if (
                rowo["organization"]["classification"] == "political group"
                and rowo["start_date"] <= r_voteevent["_items"][0]["start_date"]
            ):
                try:
                    rowo["end_date"]
                except:
                    fine = True
                else:
                    if rowo["end_date"] >= r_voteevent["_items"][0]["start_date"]:
                        fine = True
                    else:
                        fine = False
                # 9 lines to overcome no python's function "isset" ... )-:
                # NOTE(review): when no membership matches, `fine` and
                # `organization` keep their values from a previous row —
                # the vote would then be attributed to the wrong group.
                if fine:
                    organization = rowo["organization"]
                    break
        vote = {
            "voter_id": r_pers["_items"][0]["id"],
            "option": option2option(rowp[2].strip()),
            "group_id": organization["id"],
            "vote_event_id": r_voteevent["_items"][0]["id"],
        }
        # group votes by vote-event id
        try:
            votes[r_voteevent["_items"][0]["id"]]
        except:
            votes[r_voteevent["_items"][0]["id"]] = []
        votes[r_voteevent["_items"][0]["id"]].append(vote.copy())
    # for k in votes:
    #     vpapi.post("votes",votes[k])
    # NOTE(review): this posts the whole dict keyed by vote-event id in one
    # request, while the commented-out loop posted each list separately —
    # confirm which form the API expects.
    vpapi.post("votes", votes)
def scrape_motions(term):
    """Scrape and save motions from the given term that are not scraped yet
    starting from the oldest ones. One Motion item, one VoteEvent item and many
    Vote items are created for each scraped motion detail page.

    `term` is the parliamentary term identifier as a string (e.g. '2').

    Returns number of scraped motions.
    """
    logging.info('Scraping motions of term `%s`' % term)

    # prepare mappings from source identifier to id for MPs and parliamentary groups
    chamber_id = get_chamber_id(term)
    people = vpapi.getall('people', projection={'identifiers': 1})
    mps = {mp['identifiers'][0]['identifier']: mp['id'] for mp in people if 'identifiers' in mp}
    orgs = vpapi.getall('organizations', where={'classification': 'parliamentary group', 'parent_id': chamber_id})
    parl_groups = {c['name']: c['id'] for c in orgs}
    # add differently spelled parliamentary groups
    group_corrections = {
        '2': {
            'Klub HZDS': 'Klub ĽS-HZDS',
            'Klub SMK': 'Klub SMK-MKP',
            'Klub Nezávislí': 'Klub Nezávislý',
        },
        '3': {
            'Klub HZDS': 'Klub ĽS-HZDS',
            'Klub SDKÚ': 'Klub SDKÚ-DS',
            'Klub Smer': 'Klub SMER-SD',
            'Klub Smer-SD': 'Klub SMER-SD',
            'Klub KNP': 'Klub nezávislých poslancov NR SR',
            'Klub Nezávislý': 'Klub nezávislých poslancov NR SR',
        },
    }
    for k, v in group_corrections.get(term, {}).items():
        parl_groups[k] = parl_groups[v]

    # prepare list of sessions that are not completely scraped yet:
    # walk the session list and stop at the first session whose last
    # motion is already stored in the API
    sessions_to_scrape = []
    session_list = parse.session_list(term)
    for session in session_list['_items']:
        motions = parse.session(session['číslo'], term)
        if len(motions['_items']) == 0:
            continue
        last_motion_id = motions['_items'][-1]['id']
        m_url = 'http://www.nrsr.sk/web/Default.aspx?sid=schodze/hlasovanie/hlasklub&ID=%s' % last_motion_id
        existing = vpapi.getfirst('motions', where={'sources.url': m_url})
        if existing:
            break
        sessions_to_scrape.append((session, motions))

    # scrape motions from those sessions, oldest session first
    scraped_motions_count = 0
    for s, motions in reversed(sessions_to_scrape):
        logging.info('Scraping session `%s`' % s['názov'])

        # insert the session event unless it already exists
        session = {
            'name': s['názov'],
            'identifier': s['číslo'],
            'organization_id': chamber_id,
            'type': 'session',
        }
        try:
            session['start_date'] = sk_to_utc(s['trvanie']) + 'T00:00:00'
            session['end_date'] = session['start_date']
        except ValueError:
            # multiday session contains votes; dates are set by debates scraping
            pass
        key = ('organization_id', 'type', 'identifier')
        session_id, _ = get_or_create('events', session, key)

        for i, m in enumerate(motions['_items']):
            # check if the motion is already present
            m_id = re.search(r'ID=(\d+)', m['url']['výsledok']).group(1)
            # we do not use m['url']['kluby'] directly because it is not always present
            m_url = 'http://www.nrsr.sk/web/Default.aspx?sid=schodze/hlasovanie/hlasklub&ID=%s' % m_id
            existing = vpapi.getfirst('motions', where={'sources.url': m_url})
            if existing:
                continue

            try:
                # ids are tracked so a failed insertion can be rolled back below
                motion_id = None
                vote_event_id = None

                # insert motion
                logging.info('Scraping motion %s of %s (voted at %s)' % (i+1, len(motions['_items']), m['dátum']))
                parsed_motion = parse.motion(m['id'])
                motion = {
                    'organization_id': chamber_id,
                    'legislative_session_id': session_id,
                    'identifier': parsed_motion['číslo'],
                    'text': parsed_motion['názov'],
                    'date': sk_to_utc(m['dátum']),
                    'sources': [{
                        'url': parsed_motion['url'],
                        'note': 'Hlasovanie na webe NRSR'
                    }],
                }
                if 'výsledok' in parsed_motion:
                    motion['result'] = 'pass' if parsed_motion['výsledok'] == 'Návrh prešiel' else 'fail'
                resp = vpapi.post('motions', motion)
                motion_id = resp['id']

                # insert vote event
                vote_event = {
                    'motion_id': motion_id,
                    'organization_id': chamber_id,
                    'legislative_session_id': session_id,
                    'identifier': parsed_motion['číslo'],
                    'start_date': motion['date'],
                    'sources': [{
                        'url': parsed_motion['url'],
                        'note': 'Hlasovanie na webe NRSR'
                    }],
                }
                if 'výsledok' in parsed_motion:
                    vote_event['result'] = motion['result']
                if 'súčty' in parsed_motion:
                    # per-option aggregate counts; an empty string means the
                    # count is not available for that option
                    options = {
                        'yes': '[z] za',
                        'no': '[p] proti',
                        'abstain': '[?] zdržalo sa',
                        'absent': '[0] neprítomní',
                        'not voting': '[n] nehlasovalo'
                    }
                    # `s` here shadows the outer session variable only inside
                    # the comprehension scope (Python 3)
                    vote_event['counts'] = [
                        {'option': o, 'value': int(parsed_motion['súčty'][s])}
                        for o, s in options.items() if parsed_motion['súčty'][s] != ''
                    ]
                    if len(vote_event['counts']) == 0:
                        del vote_event['counts']
                resp = vpapi.post('vote-events', vote_event)
                vote_event_id = resp['id']

                # insert votes
                if 'hlasy' in parsed_motion and len(parsed_motion['hlasy']) > 0:
                    vote_options = {
                        'z': 'yes',
                        'p': 'no',
                        '?': 'abstain',
                        'n': 'not voting',
                        '0': 'absent'
                    }
                    votes = []
                    for v in parsed_motion['hlasy']:
                        # skip MPs not applying their mandate
                        if v['hlas'] == '-':
                            continue
                        pg = normalize_parlgroup_name(v['klub'])
                        votes.append({
                            'vote_event_id': vote_event_id,
                            'option': vote_options[v['hlas']],
                            'voter_id': mps.get(v['id']),
                            'group_id': parl_groups.get(pg),
                        })
                    if len(votes) > 0:
                        resp = vpapi.post('votes', votes)

            # delete incomplete data if insertion of the motion, vote event or votes failed
            except:
                if motion_id:
                    vpapi.delete('motions', motion_id)
                if vote_event_id:
                    vpapi.delete('vote-events', vote_event_id)
                raise

            scraped_motions_count += 1

    logging.info('Scraped %s motions of term `%s`' % (scraped_motions_count, term))
    return scraped_motions_count
rowo["end_date"] except: fine = True else: if rowo["end_date"] >= r_voteevent["_items"][0]["start_date"]: fine = True else: fine = False # 9 lines to overcome no python's function "isset" ... )-: if fine: organization = rowo["organization"] break vote = { "voter_id": r_pers["_items"][0]["id"], "option": option2option(rowp[2].strip()), "group_id": organization["id"], "vote_event_id": r_voteevent["_items"][0]["id"], } try: votes[r_voteevent["_items"][0]["id"]] except: votes[r_voteevent["_items"][0]["id"]] = [] votes[r_voteevent["_items"][0]["id"]].append(vote.copy()) for k in votes: vpapi.post("votes", votes[k]) print(j) j = j + 1 except: nothing = 1
def scrape_old_debates(term):
    """Scrape and save speeches from debates of the given term, one of those
    older terms where transcripts of debates are stored in RTF files.

    Returns number of scraped speeches.

    NOTE(review): no ``return`` statement is visible in this body although the
    docstring promises one — confirm a trailing ``return speech_count`` was
    not lost.
    """

    def insert_speech(type):
        """Insert a speech entity with the given type and data from
        parent scope variables and update end date of the corresponding
        session and sitting. Delete `text` variable."""
        nonlocal text, position
        if not text:
            return
        position = position + 1
        speech = {
            'text': text.strip().replace('[', '(').replace(']', ')'),
            'type': type,
            'position': position,
            'event_id': sitting_id,
            'sources': [{
                'url': debate['url'],
                'note': 'Prepis debaty v Digitálnej knižnici na webe NRSR'
            }]
        }
        # scenes (stage directions) have no speaker attached
        if type != 'scene':
            speech['creator_id'] = speaker_id
            speech['attribution_text'] = attribution.strip()
        speeches.append(speech)
        text = ''

        # extend session/sitting end date as later speeches are found
        if date > session_end_date:
            vpapi.patch('events', session_id, {'end_date': date})
        if date > sitting_end_date:
            vpapi.patch('events', sitting_id, {'end_date': date})

    logging.info('Scraping debates of term `%s`' % term)
    chamber_id = get_chamber_id(term)

    # prepare mapping from MP's name (in "I. Surname" / "I. I. Surname"
    # initials form) to id
    people = vpapi.getall('people', projection={'given_name': 1, 'additional_name': 1, 'family_name': 1})
    mps = {}
    for mp in people:
        if 'additional_name' in mp:
            name = '%s. %s. %s' % (mp['given_name'][0], mp['additional_name'][0], mp['family_name'])
        else:
            name = '%s. %s' % (mp['given_name'][0], mp['family_name'])
        mps[name] = mp['id']

    # load name corrections
    with open(os.path.join(CONF_DIR, 'name_corrections.json'), encoding='utf8') as f:
        name_corrections = json.load(f)

    # scrape list of debates
    debates = parse.old_debates_list(term)

    # add the debate missing in the list
    if term == '4':
        debates['_items'].append({
            'názov': 'Autorizovaná rozprava, 48. schôdza NR SR, 3. 2. 2010',
            'id': '2010_02_03',
            'url': 'http://www.nrsr.sk/dl/Browser/DsDocument?documentId=391413'
        })

    speech_count = 0
    session_identifier = None
    for debate in debates['_items']:
        # skip obsolete debates in the list
        if term == '1':
            if (debate['názov'] == 'Stenozáznam' and debate['id'] != '198550'
                    or debate['id'] in ('65890', '65945', '65949')):
                continue
        elif term == '2':
            if debate['názov'].startswith('Stenografická') and debate['id'] != '92098':
                continue
        elif term == '3':
            if debate['id'] == '181047':
                continue

        logging.info('Scraping debate `%s` (id=%s)' % (debate['názov'], debate['id']))
        if term == '1':
            paragraphs = parse.debate_of_term1(debate['id'])
        else:
            paragraphs = parse.debate_of_terms234(debate['id'])

        # normalize header of the debate transcript
        if term == '2':
            # join first 4 paragraphs and add trailing underscores to mark the header
            paragraphs = ['%s %s %s %s\n___' % (paragraphs[0], paragraphs[1],
                paragraphs[2], paragraphs[3])] + paragraphs[4:]
        elif term in ('3', '4'):
            # join first paragraphs until " hodine" ending is found
            # and add trailing underscores to mark the header
            p = ''
            while True:
                p += ' ' + paragraphs.pop(0)
                if p.endswith('hodine'):
                    break
            if paragraphs[0].startswith('___'):
                paragraphs.pop(0)
            paragraphs.insert(0, p + '\n___')

        # extract speeches from the debate
        speeches = []
        text = ''
        within_scene = False
        for par in paragraphs:
            par = par.replace('\n', ' ').strip()
            if not par:
                continue

            # fix last scene: wrap a trailing "ended/interrupted at ... hour"
            # sentence into parentheses so it is recognized as a scene
            if re.search(r'\b(skončil.|skončené|prerušené|Prerušenie rokovani[ae])\s+o\s+(.*?)\s+hodine.', par):
                if not par[0] in ('(', '[', '/'):
                    par = '(%s)' % par

            # convert brackets to parentheses
            par = re.sub(r'\[(.*?)\]', r'(\1)', par)
            # slash pairs are converted to parentheses too in term 1
            if term == '1':
                par = re.sub(r'(^|\s)/(.*?)/(\s|$)', r'\1(\2)\3', par)
            # convert all inner nested parentheses to brackets
            n = 1
            while n >= 1:
                (par, n) = re.subn(r'\((.*?)\((.*?)\)(.*?)\)', r'(\1[\2]\3)', par, flags=re.DOTALL)

            # process eventual multiparagraph scene (open paren not closed
            # within this paragraph)
            if par.startswith('(') and par.count('(') > par.count(')'):
                # save eventual previous speech
                insert_speech('speech')

                text = '<p>%s</p>' % par[1:]
                within_scene = True
                continue
            if within_scene:
                if par.endswith(')') and par.count(')') > par.count('('):
                    text += '\n\n<p>%s</p>' % par[:-1]
                    insert_speech('scene')
                    within_scene = False
                else:
                    text += '\n\n<p>%s</p>' % par
                continue

            # process eventual header (session number, date and starting time,
            # terminated by the underscores added above)
            header_pattern = r'((\(?(\d+)\.\)?\s+schôdz)|slávnostn).*?(\d+)\..*\b(\w{3,})\s+(\d{4})(.*?)_{3,}$'
            hd = re.search(header_pattern, par, re.DOTALL)
            if hd:
                # save eventual previous speech
                insert_speech('speech')

                sk_date = '%s. %s %s' % (hd.group(4), hd.group(5), hd.group(6))
                initial_time = re.search(r'\s+o\s+(.*?)\s+hodine', hd.group(7), re.DOTALL)
                if initial_time and initial_time.group(1) != '??':
                    h, m = initial_time.group(1).strip('.').split('.')
                    date = sk_to_utc(sk_date + ' %s:%s:00' % (h.strip().zfill(2), m.strip().zfill(2)))
                else:
                    date = sk_to_utc(sk_date) + 'T00:00:00'

                # ceremonial sittings have no session number in the header
                if hd.group(1).startswith('sláv'):
                    new_session_name = 'Mimoriadna schôdza'
                    if term == '1':
                        new_session_identifier = debate['časť']
                    elif term == '2':
                        new_session_identifier = '1000'
                    else:
                        # find the session by its date in the session list
                        sl = parse.session_list(term)
                        d = '%s. %s. %s' % (int(date[8:10]), int(date[5:7]), int(date[0:4]))
                        new_session_identifier = next((s['číslo']
                            for s in sl['_items']
                            if s['trvanie'] == d))
                else:
                    new_session_name = '%s. schôdza' % hd.group(3)
                    new_session_identifier = hd.group(3)

                if new_session_identifier != session_identifier:
                    # create new session event
                    session = {
                        'name': new_session_name,
                        'identifier': new_session_identifier,
                        'organization_id': chamber_id,
                        'type': 'session',
                        'start_date': date,
                    }
                    key = ('organization_id', 'type', 'identifier')
                    session_id, _ = get_or_create('events', session, key)
                    session_identifier = new_session_identifier
                    session_end_date = date
                    sitting_count = 0

                # create new sitting event
                sitting_count += 1
                sitting = {
                    'name': '%s. deň rokovania, %s' % (sitting_count, sk_date),
                    'identifier': str(sitting_count),
                    'organization_id': chamber_id,
                    'type': 'sitting',
                    'start_date': date,
                    'parent_id': session_id,
                }
                key = ('parent_id', 'type', 'identifier')
                sitting_id, created = get_or_create('events', sitting, key)
                sitting_end_date = date
                position = 0

                # delete existing speeches of the sitting (rescrape cleanly)
                if not created:
                    obsolete = vpapi.getall('speeches', where={'event_id': sitting_id})
                    for speech in obsolete:
                        vpapi.delete('speeches', speech['id'])
                continue

            # process eventual start of a speech
            if date < '2001-09-04':
                # format `Foreign minister J. Doe:`
                speech_start_pattern = r'(.*?)\b([^\W\d])\.[\s_]+((\w)\.[\s_]+)?([\w-]+):$'
            else:
                # format `J. Doe, foreign minister: speech`
                speech_start_pattern = r'([^\W\d])\.[\s_]+((\w)\.[\s_]+)?([\w-]+),\s+(.+?):(.+)$'
            sp = re.match(speech_start_pattern, par, re.DOTALL)
            if sp:
                # save eventual previous speech
                insert_speech('speech')

                # identify speaker
                if date < '2001-09-04':
                    name = '%s. %s' % (sp.group(2), sp.group(5))
                    if (sp.group(4)):
                        name = name.replace(' ', ' %s. ' % sp.group(4))
                    attribution = sp.group(1)
                    par = ''
                else:
                    name = '%s. %s' % (sp.group(1), sp.group(4))
                    if (sp.group(3)):
                        name = name.replace(' ', ' %s. ' % sp.group(3))
                    attribution = sp.group(5)
                    par = sp.group(6)
                if name in name_corrections:
                    name = name_corrections[name]
                attribution = attribution[0].lower() + attribution[1:].strip()
                speaker_id = mps.get(name)

                # create unknown speakers
                if not speaker_id:
                    logging.warn('Speaker `%s, %s` not found, creating new Person' % (name, attribution))
                    name_parts = re.match(r'(\w)\. ((\w)\. )?(\w+)', name)
                    person = {
                        'name': name,
                        'family_name': name_parts.group(4),
                        'given_name': name_parts.group(1)
                    }
                    person['sort_name'] = '%s, %s.' % (person['family_name'], person['given_name'])
                    if name_parts.group(3):
                        person['additional_name'] = name_parts.group(3)
                        person['sort_name'] += ' %s.' % person['additional_name']
                    resp = vpapi.post('people', person)
                    speaker_id = resp['id']
                    mps[name] = speaker_id

            # recognize date(-time) stamps in transcripts
            ds = re.match(r'^\s*(\d+\.\s\w+\s\d{4})(.*hodine)?\s*$', par)
            if ds:
                dt = ds.group(1).strip()
                tm = re.search(r'o\s+(.*?)\s+', ds.group(2) or '')
                try:
                    if tm:
                        h, m = tm.group(1).strip('.').split('.')
                        date = sk_to_utc('%s %s:%s:00' % (dt, h.strip().zfill(2), m.strip().zfill(2)))
                    else:
                        date = sk_to_utc(dt) + 'T00:00:00'
                    continue
                except ValueError:
                    # not a valid date stamp after all, treat as ordinary text
                    pass

            # process eventual scene in this paragraph (parenthesized text
            # starting with a digit or an uppercase letter)
            scene_pattern = r'(.*?)\(\s*([\d%s][^\(\)]{2,}[\.?!“])\s*\)(.*)$' % scrapeutils.CS_UPPERS
            while True:
                scene = re.match(scene_pattern, par, re.DOTALL)
                if not scene:
                    break
                if scene.group(1):
                    text += '\n\n<p>%s</p>' % scene.group(1).strip()
                    insert_speech('speech')
                text = '<p>%s</p>' % scene.group(2).strip()
                insert_speech('scene')
                par = scene.group(3)

            if par:
                text += '\n\n<p>%s</p>' % par.strip()

        # flush the last speech of the debate
        insert_speech('speech')

        # extract end time of the session
        final_time = re.search(
            r'\b(skončil.|skončené|prerušené|Prerušenie rokovani[ae])\s+o\s+(.*?)\s+hodine.',
            speeches[-1]['text'])
        if final_time:
            tm = final_time.group(2)
            # fix OCR-like artifacts: letter O for zero, comma for dot
            tm = tm.replace('O', '0').replace(',', '.')
            h, m = tm.strip('.').split('.')
            final_date = '%s.%s.%s %s:%s:00' % (date[8:10], date[5:7], date[0:4], h.strip().zfill(2), m.strip().zfill(2))
            final_date = sk_to_utc(final_date)
            vpapi.patch('events', session_id, {'end_date': final_date})
            vpapi.patch('events', sitting_id, {'end_date': final_date})

        vpapi.post('speeches', speeches)
        logging.info('Scraped %s speeches' % len(speeches))
        speech_count += len(speeches)

    logging.info('Scraped %s speeches in total' % speech_count)
try: votes[r_voteevent["_items"][0]["id"]] except: votes[r_voteevent["_items"][0]["id"]] = [] votes[r_voteevent["_items"][0]["id"]].append(vote.copy()) j = j + 1 print(str(j) + ':' + str(j/200)) j = 0 votesli = [] n = 0 # raise(Exception) for k in votes: if (j == 1): vpapi.post("votes",votesli) votesli = [] print(str(n) + "/" + str(len(votes))) print(k) j = 0 j = j + 1 n += 1 votesli = votesli + votes[k] # vpapi.post("votes",votesli) # vpapi.post("votes",votes[k]) # for k in votes: # votesli = votesli + votes[k] # vpapi.post("votes",votesli) except: nothing = 1 # logging.warning('Something went wrong with year ' + str(term) + 'and file ' + str(i) + ' (it may not exist), last vote_event_id: ' + str(last_ve_id))
def batch_create(self, endpoint, items):
    """Post `items` to the given API `endpoint` in one batch request.

    Raises Exception with the full response when the API does not
    report an OK status; otherwise logs the number of created items.
    """
    response = vpapi.post(endpoint, items)
    if response['_status'] == 'OK':
        self.log('Created %d items' % len(response['_items']), DEBUG)
        return
    raise Exception(response)
# set-up logging to a local file if not os.path.exists(LOGS_DIR): os.makedirs(LOGS_DIR) logname = datetime.utcnow().strftime('%Y-%m-%d-%H%M%S') + '.log' logname = os.path.join(LOGS_DIR, logname) logname = os.path.abspath(logname) logging.basicConfig(level=logging.DEBUG, format='%(message)s', handlers=[logging.FileHandler(logname, 'w', 'utf-8')]) logging.getLogger('requests').setLevel(logging.ERROR) logging.info(datetime.utcnow().strftime('%Y-%m-%d-%H:%M:%S') + '\tStarted 2') db_log = vpapi.post('logs', { 'status': 'running', 'file': logname, 'params': [] }) vpapi.parliament('cz/senat') vpapi.authorize(authentication.username, authentication.password) vpapi.timezone('Europe/Prague') o2id = {} organizations = vpapi.getall("organizations") for org in organizations: o2id[org['name']] = org['id'] p2id = {} persons = vpapi.getall('people') for p in persons:
def scrape(countries, people, votes):
    """Scrape the selected parliaments and push the data to the Visegrad+ API.

    `countries` is "all" or a comma-separated list of country keys,
    `people` is "yes" to scrape people/organizations/memberships,
    `votes` is "yes" to scrape events/motions/vote events/votes.
    """
    global effective_date
    effective_date = date.today().isoformat()
    # one scraper instance per supported parliament
    georgia = georgia_scraper.GeorgiaScraper()
    armenia = armenia_scraper.ArmeniaScraper()
    ukraine = ukraine_scraper.UkraineScraper()
    belarus_lowerhouse = belarus_lowerhouse_scraper.BelarusLowerhouseScraper()
    belarus_upperhouse = belarus_upperhouse_scraper.BelarusUpperhouseScraper()
    moldova = moldova_scraper.MoldovaScraper()
    references = {"georgia": georgia, "armenia": armenia, "ukraine": ukraine,
                  "belarus-lowerhouse": belarus_lowerhouse, "moldova": moldova,
                  "belarus-upperhouse": belarus_upperhouse}
    countries_array = []
    if countries == "all":
        for key in references:
            countries_array.append(key)
    else:
        countries_array = countries.split(',')
    # collect positions of unknown country names
    indexes = []
    for country in countries_array:
        if country.lower() not in references:
            indexes.append(countries_array.index(country))
    if len(indexes) > 0:
        # NOTE(review): list.pop() takes a single int index; passing the
        # `indexes` list raises TypeError whenever an unknown country is
        # supplied — the invalid entries are never actually removed.
        countries_array.pop(indexes)
    # credentials per parliament (api user, password, timezone, endpoint)
    with open(os.path.join(BASE_DIR, 'access.json')) as f:
        creds = json.load(f)
    if len(countries_array) > 0:
        for item in sorted(countries_array):
            if internet_on():
                # scrape and post data from parliaments if there's internet connection
                print "\n\tPosting and updating data from %s parliament" % item
                print "\tThis may take a few minutes..."
                vpapi.parliament(creds[item.lower()]['parliament'])
                vpapi.timezone(creds[item.lower()]['timezone'])
                vpapi.authorize(creds[item.lower()]['api_user'], creds[item.lower()]['password'])
                if people == "yes":
                    members = references[item.lower()].scrape_mp_bio_data()
                    chamber = references[item.lower()].scrape_chamber()
                    parliamentary_groups = references[item.lower()].scrape_parliamentary_groups()
                    committee = references[item.lower()].scrape_committee()
                    # letter prefixes enforce insertion order when sorted
                    data_collections = {
                        "a-people": members,
                        "b-chamber": chamber,
                        "c-parliamentary_groups": parliamentary_groups,
                        "d-committe": committee
                    }
                    # inserts data for each data collection in Visegrad+ Api
                    for collection in sorted(set(data_collections)):
                        widgets = [' Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
                                   ' ', ETA(), " - Processed: ", Counter(), ' items ']
                        pbar = ProgressBar(widgets=widgets)
                        print "\n\tPosting and updating data to the Visegrad+ from %s data collection\n\n" % \
                            collection[2:]
                        if len(data_collections[collection]) > 0:
                            for json_doc in pbar(data_collections[collection]):
                                # pick the lookup condition and target endpoint
                                # for this collection type
                                if collection == "a-people":
                                    where_condition = {'identifiers': {'$elemMatch': json_doc['identifiers'][0]}}
                                    collection_of_data = "people"
                                elif collection == "c-parliamentary_groups" or collection == "d-committe":
                                    if item.lower() == "armenia" or item.lower() == "belarus-upperhouse" \
                                            or item.lower() == "ukraine":
                                        where_condition = {'name': json_doc['name'], "parent_id": json_doc['parent_id']}
                                    else:
                                        where_condition = {'name': json_doc['name']}
                                    collection_of_data = "organizations"
                                elif collection == "b-chamber":
                                    where_condition = {'identifiers': {'$elemMatch': json_doc['identifiers'][0]}}
                                    collection_of_data = "organizations"
                                existing = vpapi.getfirst(collection_of_data, where=where_condition)
                                if not existing:
                                    resp = vpapi.post(collection_of_data, json_doc)
                                else:
                                    # strip server-managed fields before comparing
                                    json_obj_id = existing['id']
                                    items_to_delete = ["created_at", "updated_at", "_links", "id"]
                                    for item_delete in items_to_delete:
                                        del existing[item_delete]
                                    if json.loads(json.dumps(json_doc)) == existing:
                                        # unchanged document, skip the update
                                        continue
                                    else:
                                        resp = vpapi.put(collection_of_data, json_obj_id, json_doc,
                                                         effective_date=effective_date)
                                # update by PUT is preferred over PATCH to correctly remove properties that no longer exist now
                                if resp["_status"] != "OK":
                                    raise Exception("Invalid status code")
                        print "\n\tFinished Posting and updating data from %s data collection\n" % collection[2:]
                    # Georgia's scraper provides chamber memberships only
                    if item.lower() != "georgia":
                        memberships = {
                            "chambers": references[item.lower()].scrape_membership(),
                            "parliamentary_groups": references[item.lower()].scrape_parliamentary_group_membership(),
                            "committees": references[item.lower()].scrape_committee_members()
                        }
                    elif item.lower() == "georgia":
                        memberships = {
                            "chambers": references[item.lower()].scrape_membership()
                        }
                    for data_collection in memberships:
                        widgets_stat = [' Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
                                        ' ', ETA(), " - Processed: ", Counter(), ' items ']
                        prog_bar = ProgressBar(widgets=widgets_stat)
                        if len(memberships[data_collection]) > 0:
                            print "\n\tPosting and updating data from %s membership data collection\n" % data_collection
                            for json_doc in prog_bar(memberships[data_collection]):
                                # a membership is identified by its organization + person pair
                                existing = vpapi.getfirst("memberships",
                                                          where={'organization_id': json_doc['organization_id'],
                                                                 "person_id": json_doc['person_id']})
                                if not existing:
                                    resp = vpapi.post("memberships", json_doc)
                                else:
                                    json_obj_id = existing['id']
                                    items_to_delete = ["created_at", "updated_at", "_links", "id"]
                                    for item_delete in items_to_delete:
                                        del existing[item_delete]
                                    if json.loads(json.dumps(json_doc)) == existing:
                                        continue
                                    else:
                                        resp = vpapi.put("memberships", json_obj_id, json_doc,
                                                         effective_date=effective_date)
                                if resp["_status"] != "OK":
                                    raise Exception("Invalid status code")
                            print "\n\tFinished Posted and updated data from %s membership data collection\n" % data_collection
                        else:
                            print "\n\tThere is no data from %s membership data collection\n" % data_collection
                            continue
                if votes == "yes":
                    if item.lower() == "ukraine":
                        events = references[item.lower()].scrape_events()
                        try:
                            if len(events) > 0:
                                widgets_events = [' Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
                                                  ' ', ETA(), " - Processed: ", Counter(), ' items ']
                                pbar_events = ProgressBar(widgets=widgets_events)
                                for json_doc in pbar_events(events):
                                    existing_event = vpapi.getfirst("events", where={'identifier': json_doc['identifier']})
                                    if not existing_event:
                                        resp = vpapi.post("events", json_doc)
                                    else:
                                        # NOTE(review): updates with json_doc['id'] rather
                                        # than existing_event['id'] — confirm the scraped
                                        # document carries the API id.
                                        resp = vpapi.put("events", json_doc['id'], json_doc,
                                                         effective_date=effective_date)
                                    if resp["_status"] != "OK":
                                        raise Exception("Invalid status code")
                                print "\n\tFinished Posting and updating data from events data collection"
                            else:
                                print "\n\tThere are no new events"
                        except BaseException as ex:
                            print ex.message
                        else:
                            print "\tThere's not any event to post from %s parliament" % item
                        # NOTE(review): motions_vote_events is assigned but never
                        # used below.
                        motions_vote_events = references[item.lower()].vote_events()
                        voting_results = references[item.lower()].scrape_votes()
                        try:
                            if len(voting_results) > 0:
                                resp = vpapi.post("votes", voting_results)
                                if resp["_status"] != "OK":
                                    raise Exception("Invalid status code")
                                print "\n\tFinished Posting and updating data from votes data collection"
                        except BaseException as ex:
                            print ex.message
                    elif item.lower() == "georgia":
                        # letter prefixes enforce insertion order when sorted
                        voting_data_collections = {
                            "amotions": references[item.lower()].motions(),
                            "bvote-events": references[item.lower()].vote_events(),
                        }
                        # NOTE(review): this rebinding clobbers the `votes`
                        # function parameter.
                        votes = references[item.lower()].scrape_votes()
                        for collection in sorted(voting_data_collections):
                            try:
                                if len(voting_data_collections[collection]) > 0:
                                    resp = vpapi.post(collection[1:], voting_data_collections[collection])
                                    if resp["_status"] != "OK":
                                        raise Exception("Invalid status code")
                                    print "\n\tFinished Posting and updating data from %s data collection" % collection[1:]
                            except BaseException as ex:
                                print ex.message
                        print "\n\tPosting voting records from Georgia Parliament\n"
                        try:
                            if len(votes) > 0:
                                vpapi.post("votes", votes)
                                print "\n\tFinished Posting and updating data from votes data collection"
                        except BaseException as ex:
                            print ex.message
                        else:
                            print "\n\tThere are no voting records for %s" % item
                vpapi.deauthorize()
            else:
                print "\n\tInternet connection problems for %s official parliament web page" % item
                continue
    else:
        print "\n\tInvalid country/ies added"
motion['requirement'] = guess_majority(quorum, present) h2s = domtree1.xpath('//h2') h2s.pop(0) j = 0 tables.pop(0) for table in tables: tds = table.xpath('tr/td') for td in tds: li = td.text.strip().split('\xa0') vote = { "vote_event_id": iid, "voter_id": pp2id(" ".join([li[2], li[3]]), vote_event['start_date'], p2id), "option": option2option(li[0]), "group_id": o2id[h2s[j].text.strip()] } votes.append(vote) j += 1 vpapi.post("motions", motion) vpapi.post("vote-events", vote_event) vpapi.post("votes", votes) except: print("XXX:" + iid) nothing = 0 # "Zmatečné hlasování"