def save(data, stats):
    """Upsert a scraped MEP record, keeping a per-timestamp change log.

    data  -- freshly scraped MEP dict; must contain 'UserID', 'Name' and 'meta'.
    stats -- optional two-element list [added, updated], incremented in place.

    Returns stats when given, otherwise the (possibly updated) data dict.
    """
    res = Mep.get_by_id(data['UserID'])
    # bookkeeping/derived fields that must not participate in the diff
    skip = ['meta', 'changes', 'activities']
    if res is not None:
        # Gender is not always present in a fresh scrape; carry over the
        # previously stored value so it is not reported as deleted.
        if 'Gender' not in data and 'Gender' in res.data:
            # NOTE(review): original read res['Gender']; switched to
            # res.data['Gender'] for consistency with the membership test above.
            data['Gender'] = res.data['Gender']
        d = diff({k: v for k, v in res.data.items() if k not in skip},
                 {k: v for k, v in data.items() if k not in skip})
        data['changes'] = res.data.get('changes', {})
    else:
        d = diff({}, {k: v for k, v in data.items() if k not in skip})
        data['changes'] = {}
    if d:
        now = datetime.utcnow().replace(microsecond=0)
        if not res:
            logger.info('adding %s' % (data['Name']['full']))
            data['meta']['created'] = now
            if stats: stats[0] += 1
            data['changes'] = {}
        else:
            logger.info('updating %s' % (data['Name']['full']))
            logger.warning(jdump(d))  # warn() is a deprecated alias of warning()
            data['meta']['updated'] = now
            if stats: stats[1] += 1
            data['id'] = res.id
            data['changes'] = res.data.get('changes', {})
        # key the diff by the (second-resolution) timestamp of this update
        data['changes'][now.isoformat()] = d
        Mep.upsert(data)
    del res
    if stats:
        del data
        return stats
    else:
        return data
def run(args):
    """Command-line dispatcher (generator).

    args[0] selects the mode:
      "test"          -- yield scrapes of a fixed set of known MEP ids/urls
      "mepid" <id>    -- yield the json dump of a single MEP scrape
      <meplist name>  -- yield a (scrape, crawler) pair for bulk processing
    With no arguments, print usage and return.
    """
    if len(args) < 1:
        print("possible options: full|test|mepid <mepid>|" + '|'.join(meplists.keys()))
        return
    if args[0] == "test":
        yield scrape('28215')
        yield scrape('113959')
        yield scrape("http://www.europarl.europa.eu/meps/en/1934/get.html")
        yield scrape("http://www.europarl.europa.eu/meps/en/28576/get.html")
        yield scrape("http://www.europarl.europa.eu/meps/en/1263/Elmar_BROK.html")
        yield scrape("http://www.europarl.europa.eu/meps/en/96739/Reinhard_B%C3%9CTIKOFER.html")
        yield scrape("http://www.europarl.europa.eu/meps/en/28269/Jerzy_BUZEK.html")
        yield scrape("http://www.europarl.europa.eu/meps/en/1186/Astrid_LULLING.html")
    elif args[0] == 'mepid' and len(args) > 1:
        # was `and args[1]`, which raised IndexError when the id was missing
        yield jdump(scrape(int(args[1])))
    elif args[0] in meplists:
        yield (scrape, crawler(args[0]))
        return
def save(data, stats):
    """Upsert a scraped dossier, maintaining a timestamped change log.

    data  -- scraped dossier dict with 'meta' (incl. 'source' and 'timestamp')
             and 'procedure' keys; falsy data is a no-op.
    stats -- two-element list [added, updated], incremented in place.

    Returns stats.
    """
    if not data:
        return stats
    res = Dossier.get_by_src(data['meta']['source'])
    # bookkeeping fields that must not participate in the diff
    skip = ['meta', 'changes']
    if res is not None:
        # NOTE(review): original diffed res.items(); switched to
        # res.data.items() for consistency with res.data.get('changes', {})
        # executed in this same branch -- confirm against the Dossier record API.
        d = diff({k: v for k, v in res.data.items() if k not in skip},
                 {k: v for k, v in data.items() if k not in skip})
        data['changes'] = res.data.get('changes', {})
    else:
        d = diff({}, {k: v for k, v in data.items() if k not in skip})
        data['changes'] = {}
    if d:
        # the scrape timestamp becomes the change-log key; drop it from meta
        now = data['meta']['timestamp'].replace(microsecond=0).isoformat()
        del data['meta']['timestamp']
        if not res:
            logger.info(('adding %s - %s' % (data['procedure']['reference'], data['procedure']['title'])).encode('utf8'))
            data['meta']['created'] = now
            stats[0] += 1
        else:
            logger.info(('updating %s - %s' % (data['procedure']['reference'], data['procedure']['title'])).encode('utf8'))
            data['meta']['updated'] = now
            stats[1] += 1
        logger.info(jdump(d))
        data['changes'][now] = d
        Dossier.upsert(data)
    return stats
def process(obj, id, getter, table, name, nopreserve=None, nodiff=False, nostore=False, onchanged=None):
    """Diff a scraped object against its stored version and upsert it.

    obj        -- freshly scraped record (dict)
    id         -- primary key used by getter/db
    getter     -- callable returning the previously stored record (or None)
    table      -- db table name for storage
    name       -- human-readable label for log messages
    nopreserve -- top-level keys whose deletion should NOT be undone
                  (by default every deleted top-level key is restored from prev)
    nodiff     -- store unconditionally as a new record, skipping diffing
    nostore    -- compute everything but do not write to the db
    onchanged  -- callback(obj, diff) invoked after a successful store

    Returns the (possibly annotated) obj; raises ValueError on store or
    diff-verification failure.
    """
    if nopreserve is None:
        nopreserve = []
    # clear out empty values; `v == False` keeps literal False (and, since
    # 0 == False in Python, numeric zero as well)
    obj = {k: v for k, v in obj.items() if v or v == False}

    if nodiff:
        now = datetime.utcnow().replace(microsecond=0)
        if not 'meta' in obj:
            obj['meta'] = {}
        log(3, 'adding %s (%s)' % (name, id))
        obj['meta']['created'] = now
        obj['changes'] = {}
        if not nostore and not db.put(table, obj):
            log(1, "failed to store updated obj {}".format(id))
            raise ValueError
        if onchanged is not None:
            # fix: `d` was referenced here without ever being assigned in this
            # path (NameError); no diff is computed when nodiff is set, so
            # report an empty one.
            d = []
            onchanged(obj, d)
        return

    # generate diff against the previously stored version
    prev = getter(id)
    if prev is not None:
        d = diff({k: v for k, v in prev.items() if not k in ['meta', 'changes', '_id']},
                 {k: v for k, v in obj.items() if not k in ['meta', 'changes', '_id']})
        # preserve some top level items: undo deletions of whole top-level
        # keys unless they are listed in nopreserve or were already empty
        d1 = []
        for c in d:
            if c['type'] != 'deleted' or len(c['path']) != 1 or c['path'][0] in nopreserve:
                d1.append(c)
                continue
            if c['type'] == 'deleted' and len(c['path']) == 1 and c['data'] in ({}, []):
                d1.append(c)
                continue
            log(3, "preserving deleted path {} for obj id: {}".format(c['path'], id))
            obj[c['path'][0]] = prev[c['path'][0]]
        d = d1
    else:
        d = diff({}, {k: v for k, v in obj.items() if not k in ['meta', 'changes', '_id']})

    if d:
        # sanity check: recreate the current version by applying d to prev;
        # a json round-trip normalizes the diff the same way storage would
        o2 = patch(prev or {}, json.loads(jdump(d)))
        if not o2:
            log(1, "failed to recreate {} record by patching previous version with diff".format(id))
            raise ValueError
        else:
            # the diff between the current record and the recreated one must be empty
            zero = diff({k: v for k, v in o2.items() if not k in ['meta', 'changes', '_id']},
                        {k: v for k, v in obj.items() if not k in ['meta', 'changes', '_id']})
            if zero != []:
                log(1, "id:{} diff between current record and patched previous one is not empty\n{!r}".format(id, zero))
                raise ValueError("diff between new and patched old is not empty")
        now = datetime.utcnow().replace(microsecond=0)
        if not 'meta' in obj:
            obj['meta'] = {}
        if not prev or nodiff:
            log(3, 'adding %s (%s)' % (name, id))
            obj['meta']['created'] = now
            obj['changes'] = {}
        else:
            log(3, 'updating %s (%s)' % (name, id))
            log(4, "changes for %s\n%s" % (id, jdump(d)))
            obj['meta']['updated'] = now
            obj['changes'] = prev.get('changes', {})
        # key the diff by the (second-resolution) timestamp of this update
        obj['changes'][now.isoformat()] = d
        if not nostore and not db.put(table, obj):
            log(1, "failed to store updated obj {}".format(id))
            raise ValueError
        if onchanged is not None:
            onchanged(obj, d)
    del prev
    if __name__ == '__main__':
        # when run as a script, dump the processed object for inspection
        print(jdump(obj))
    return obj
mepid = int(mepid) else: mepid = db.getMep(name, v['ts'], abbr=g) if mepid: m['mepid']= mepid #if int(mep.get('MepId')) in ambiguous_meps: # oid = int(mep.get('MepId')) # ambiguous_meps.remove(oid) # log(2,'found mepid for previously ambigous obscure_id: "%s": %s' % (oid, mepid)) else: mepid = lost_meps.get(mep.get('MepId')) if mepid: m['mepid']= mepid else: m['name']= name m['obscure_id']=int(mep.get('MepId')) # it's a totally useless and confusing id that is nowhere else used v['votes'][stype]['groups'][g].append(m) # save process(v, v['voteid'], db.vote, 'ep_votes', v['title']) votes.append(v) return votes from utils.process import publish_logs def onfinished(daisy=True): publish_logs(get_all_jobs) if __name__ == '__main__': import sys url = sys.argv[1] print(jdump(scrape(url)))
def onfinished(daisy=True):
    # Publish the collected job logs once the crawl run has finished.
    from utils.process import publish_logs
    publish_logs(get_all_jobs)


if __name__ == '__main__':
    # Ad-hoc smoke test: scrape major interpellations (terms 7-9) for one MEP.
    # Other historical test invocations (written declarations/explanations,
    # plenary speeches, shadow reports) were previously kept here commented out.
    print(jdump(scrape(131749, [7, 8, 9], "some MEP")))
logger.error('[wtf] did not reach final state %s' % state) return {} else: if (len(data['occupation'])>1 and data['occupation'][-1][0] in [u"No occupation held during the three years preceding the current mandate", u"Καμία επαγγελματική δραστηριότητα κατά τη διάρκεια των τριών ετών που προηγήθηκαν της τρέχουσας εντολής", u"Atividade Liberal como autor/outras atividades artísticas (remuneração inferior a 500 € na totalidade dos 3 anos anteriores)", u"Brak działalności zawodowej w okresie trzech lat poprzedzających obecną kadencję", u"Geen beroep uitgeoefend gedurende de drie jaar voorafgaand aan de huidige zittingsperiode", u"Nessuna attività svolta durante i tre anni precedenti l'attuale mandato", u"Keine Berufstätigkeit während des Dreijahreszeitraums vor der laufenden Wahlperiode", u"Aucune activité professionnelle au cours des trois années ayant précédé le présent mandat", u"Sin ocupación durante los tres años anteriores al actual mandato", u"Intet erhvervsarbejde i de tre år forud for det nuværende mandate", u"Nicio activitate profesională în ultimii trei ani dinaintea preluării mandatului actual", u"Har inte utövat någon yrkesmässig verksamhet under de tre år som föregick det nuvarande mandatet", u"Sem atividade profissional durante os três anos que precederam o atual mandato", u"Nepostojanje profesionalne djelatnosti tijekom tri godine prije aktualnog mandata", u"Ei ammatillista toimintaa kolmena nykyistä edustajantointa edeltävänä vuotena", u"A jelenlegi megbízatást megelőző három évben nem végzett foglalkozást.", u"Без професионална дейност по време на трите години, предшестващи текущия мандат", u"Během tří let před současným mandátem jsem nevykonával(a) žádnou profesní činnost.", ]): del data['occupation'][-1] return data if __name__ == "__main__": DEBUG=True print(jdump(scrape(sys.argv[1])).encode('utf8')) #scrape(sys.argv[1])
from utils.process import publish_logs publish_logs(get_all_jobs) if __name__ == '__main__': #print(jdump(scrape(1275))) #scrape(28390) #scrape(96779) #scrape(96674) #scrape(28469) #scrape(96843) #scrape(1393) # 1-3rd term #scrape(96992) #scrape(1275) # test written decl: #print(jdump(scrape(28266, [8], "some MEP"))) # test written expl: #print(jdump(scrape(197682, [9], "some MEP"))) # test plen spch #print(jdump(scrape(28266, [9], "some MEP"))) # test report-shadow with double committee #print(jdump(scrape(28266, [7,8,9], "some MEP"))) # major interpellations: #print(jdump(scrape(131749, [7,8,9], "some MEP"))) #print(jdump(scrape(205452, [9], 'Chris MACMANUS'))) #print(jdump(scrape(204400, [9], 'Adrián VÁZQUEZ LÁZARA', save=False))) print(jdump(scrape(197767, [9], 'Eugen JURZYCA', save=False))) #import sys #print(jdump(scrape(int(sys.argv[1]), [9], "some MEP")))
pass continue if amstart.match(line): # parse block am=parse_block(block, url, reference, date, committee, meps, PE) if am is not None: process(am, am['id'], db.amendment, 'ep_amendments', am['reference']+' '+am['id'], nodiff=True) res.append(am) block=[line] continue block.append(line) if block and filter(None,block): am = parse_block(block, url, reference, date, committee, meps, PE) if am is not None: process(am, am['id'], db.amendment, 'ep_amendments', am['reference']+' '+am['id'], nodiff=True) res.append(am) log(3,"total amendments %d in %s" % (len(res),url)) return res from utils.process import publish_logs def onfinished(daisy=True): publish_logs(get_all_jobs) if __name__ == "__main__": from utils.utils import jdump #print(jdump(scrape('https://www.europarl.europa.eu/doceo/document/INTA-AM-658734_EN.pdf', ['Enikő GYŐRI']))) #print(jdump(scrape("http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-//EP//NONSGML+COMPARL+PE-609.623+01+DOC+PDF+V0//EN&language=EN", "Krišjānis Kariņš"))) #print(jdump(scrape(sys.argv[1],"ANDERSSON Max"))) print(jdump(scrape(sys.argv[1],sys.argv[2])))
log(1, "sidebar has not 1 element: %s" % url) raise ValueError for li in sidebar[0].xpath('./li'): title = li.xpath('./a/span[@class="t-x"]/text()') if len(title) != 1: log(1, "title has not 1 element: %s" % url) raise ValueError title = unws(title[0]) if title not in known_sidebar: log(2, '"%s" not in known_sidebar items, in %s' % (title, url)) subtitles = li.xpath('.//div/ul/li/a/span[@class="t-x"]/text()') for s in subtitles: s = unws(s) if s not in known_sidebar[title]: log( 2, '"%s" -> "%s" not in known_sidebar items, in %s' % (title, s, url)) if __name__ == '__main__': #scrape(28390) #scrape(96779) #scrape(96674) #scrape(28469) #scrape(96843) #scrape(1393) # 1-3rd term #scrape(96992) #scrape(1275) print(jdump(scrape(int(sys.argv[1])))) #print(jdump({k: v for k,v in scrape(1428).items() if k not in ['changes']}))
"\n %s" % (textdiff(diff) if diff else ''), "%sdossier/%s" % (ROOT_URL, doc['epdoc']), )))) from utils.process import publish_logs def onfinished(daisy=True): publish_logs(get_all_jobs) if __name__ == "__main__": if len(sys.argv) > 1: if sys.argv[1] == 'url' and len(sys.argv) == 4: print(jdump(scrape(sys.argv[2], sys.argv[3]))) sys.exit(0) elif sys.argv[1] == "url": print('-' * 30) print(jdump(scrape(sys.argv[2], 'XXXX'))) print('-' * 30) sys.exit(0) if sys.argv[1] == "test": #print(jdump([(u,d) for u,d in getComAgendas()])) print( jdump( scrape( 'http://www.europarl.europa.eu/sides/getDoc.do?type=COMPARL&reference=LIBE-OJ-20120112-1&language=EN', 'LIBE'))) #import code; code.interact(local=locals()); sys.exit(0)
name = junws(mep) mepid = db.getMep(name, v['ts'], abbr=g) if mepid: m['mepid']= mepid #if int(mep.get('MepId')) in ambiguous_meps: # oid = int(mep.get('MepId')) # ambiguous_meps.remove(oid) # log(2,'found mepid for previously ambigous obscure_id: "%s": %s' % (oid, mepid)) else: mepid = lost_meps.get(mep.get('MepId')) if mepid: m['mepid']= mepid else: m['name']= name m['obscure_id']=int(mep.get('MepId')) # it's a totally useless and confusing id that is nowhere else used v['votes'][stype]['groups'][g].append(m) # save process(v, v['voteid'], db.vote, 'ep_votes', v['title']) votes.append(v) return votes from utils.process import publish_logs def onfinished(daisy=True): publish_logs(get_all_jobs) if __name__ == '__main__': import sys term = int(sys.argv[1]) date = sys.argv[2] print(jdump(scrape(term, date)))
del (item['title']) if item.get('body') == 'EC' and len(d.get('commission', [])) == 1: item.update(d['commission'][0]) if isinstance(item['date'], list): if not len(item['date']): continue if len(set(item['date'])) == 1: item['date'] = item['date'][0] else: print("more than one date in: ", item) if not item.get("body") and item.get( 'type') != 'Final act published in Official Journal': log( 2, "merge_events: no body for {!r}".format( {k: v for k, v in item.items() if k != 'summary'})) #continue #print(item) activities.append(item) res = sorted(activities, key=lambda x: x['date'][0] if isinstance(x['date'], list) else x['date'], reverse=True) return res if __name__ == '__main__': from db import db d = db.dossier('2016/0279(COD)') from utils.utils import jdump print(jdump(merge_events(d)))