def save(data, stats):
    res = db.ep_meps2.find_one({'UserID': data['UserID']}) or {}
    if 'Gender' not in data and 'Gender' in res:
        data['Gender'] = res['Gender']
    d = diff(dict([(k, v) for k, v in res.items()
                   if k not in ['_id', 'meta', 'changes', 'activities']]),
             dict([(k, v) for k, v in data.items()
                   if k not in ['_id', 'meta', 'changes', 'activities']]))
    if d:
        now = datetime.utcnow().replace(microsecond=0)
        if not res:
            logger.info(('adding %s' % (data['Name']['full'])).encode('utf8'))
            data['meta']['created'] = now
            if stats:
                stats[0] += 1
        else:
            logger.info(('updating %s' % (data['Name']['full'])).encode('utf8'))
            logger.warn(jdump(d))
            data['meta']['updated'] = now
            if stats:
                stats[1] += 1
            data['_id'] = res['_id']
        data['changes'] = res.get('changes', {})
        data['changes'][now.isoformat()] = d
        db.ep_meps2.save(data)
    del res
    if stats:
        del data
        return stats
    else:
        return data
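The save() function above depends on a diff() helper that compares the stored and freshly scraped documents and returns a falsy value when nothing changed. Its implementation is not part of this excerpt; the following is a minimal sketch, assuming change records of the shape {'path', 'type', 'data'} (the real record format may differ):

# Minimal sketch of a recursive diff; the record shape is an assumption,
# not the project's actual implementation.
def diff(old, new, path=()):
    changes = []
    if isinstance(old, dict) and isinstance(new, dict):
        for k in set(old) | set(new):
            if k not in new:
                changes.append({'path': path + (k,), 'type': 'deleted', 'data': old[k]})
            elif k not in old:
                changes.append({'path': path + (k,), 'type': 'added', 'data': new[k]})
            else:
                changes.extend(diff(old[k], new[k], path + (k,)))
    elif old != new:
        # non-dict values (including lists) are compared wholesale here
        changes.append({'path': path, 'type': 'changed', 'data': (old, new)})
    return changes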
def save(data, stats):
    if not data:
        return stats
    src = data['meta']['source']
    res = db.dossiers2.find_one({'meta.source': src}) or {}
    d = diff(dict([(k, v) for k, v in res.items()
                   if k not in ['_id', 'meta', 'changes']]),
             dict([(k, v) for k, v in data.items()
                   if k not in ['_id', 'meta', 'changes']]))
    #logger.warn(pprint.pformat(d))
    if d:
        now = datetime.datetime.utcnow().replace(microsecond=0).isoformat()
        if not res:
            logger.info(('adding %s - %s' % (data['procedure']['reference'],
                                             data['procedure']['title'])).encode('utf8'))
            data['meta']['created'] = data['meta']['timestamp']
            del data['meta']['timestamp']
            sys.stdout.flush()
            stats[0] += 1
        else:
            logger.info(('updating %s - %s' % (data['procedure']['reference'],
                                               data['procedure']['title'])).encode('utf8'))
            data['meta']['updated'] = data['meta']['timestamp']
            del data['meta']['timestamp']
            sys.stdout.flush()
            stats[1] += 1
            data['_id'] = res['_id']
            logger.info(jdump(d))
        if not NOMAIL:
            m = db.notifications.find({'dossiers': data['procedure']['reference']},
                                      ['active_emails'])
            for g in m:
                if len(g['active_emails']) == 0:
                    continue
                msg = Message("[PT] %s %s" % (data['procedure']['reference'],
                                              data['procedure']['title']),
                              sender="*****@*****.**",
                              bcc=g['active_emails'])
                #msg.html = htmldiff(data,d)
                msg.body = makemsg(data, d)
                mail.send(msg)
        #logger.info(htmldiff(data,d))
        #logger.info(makemsg(data,d))
        data['changes'] = res.get('changes', {})
        data['changes'][now] = d
        db.dossiers2.save(data)
    return stats
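The notification branch above calls makemsg(data, d) to render the diff into a mail body. That helper is not shown in this excerpt; a plain-text rendering along these lines would fit the call site (the formatting details are guesses):

# Hypothetical makemsg(): formats diff() records into a plain-text mail body.
def makemsg(data, d):
    lines = ["%s %s changed:" % (data['procedure']['reference'],
                                 data['procedure']['title']), ""]
    for change in d:
        lines.append("  %s %s: %r" % (change.get('type'),
                                      '/'.join(map(unicode, change.get('path', ()))),
                                      change.get('data')))
    return u"\n".join(lines)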
                } for mep in group.xpath('PoliticalGroup.Member.Name')]
            } for group in type.xpath('Result.PoliticalGroup.List')]
        }
        # save: upsert keyed on vote title and timestamp
        q = {'title': res['voteid'], 'ts': res['ts']}
        db.ep_votes.update(q, {"$set": res}, upsert=True)
        votes.append(res)
    return votes

if __name__ == '__main__':
    #res = scrape("http://www.europarl.europa.eu/RegData/seance_pleniere/proces_verbal/2014/03-13/votes_nominaux/xml/P7_PV(2014)03-13(RCV)_XC.xml")
    #print jdump(res).encode('utf8')
    #exit(0)
    try:
        year = int(sys.argv[1])
    except (IndexError, ValueError):
        sys.stderr.write('[!] usage: %s [year(2004-2014)]\n' % sys.argv[0])
        sys.exit(1)
    if year >= 2004 and year < 2009:
        map(scrape, crawl(year, 6))
    elif year == 2009:
        map(scrape, crawl(year, 6))
        map(scrape, crawl(year, 7))
    elif year < 2014:
        print jdump(map(scrape, crawl(year, 7))).encode('utf8')
        #map(scrape, crawl(year, 7))
    else:
        map(scrape, crawl(year, 7))
        map(scrape, crawl(year, 8))
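Note that the year dispatch above drives the crawl purely through map()'s side effects, which only works because Python 2's map() is eager. A port to Python 3, where map() is lazy, would need an explicit loop, e.g.:

# Python 3 equivalent of map(scrape, crawl(year, 7)): force evaluation.
for url in crawl(year, 7):
    scrape(url)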
if sys.argv[1]=="full": NOMAIL=True crawl(get_all_dossiers(), threads=4) elif sys.argv[1]=="fullseq": NOMAIL=True crawlseq(get_all_dossiers(), null=null) elif sys.argv[1]=="newseq": crawlseq(get_new_dossiers(), null=null) elif sys.argv[1]=="new": crawl(get_new_dossiers()) elif sys.argv[1]=="update": crawl(get_active_dossiers()) elif sys.argv[1]=="updateseq": crawlseq(get_active_dossiers(), null=null) elif sys.argv[1]=="url": print jdump(scrape(sys.argv[2])).encode('utf8') #res=scrape(sys.argv[2]) #print >>sys.stderr, pprint.pformat(res) #save(res,[0,0]) elif sys.argv[1]=="test": save(scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?id=556397"),[0,0]) # telecoms package #pprint.pprint(scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?id=575084")) #pprint.pprint(scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?id=589377")) #pprint.pprint(scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?id=556208")) # with shadow rapporteurs #pprint.pprint(scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?reference=2011/0135(COD)")) # with shadow rapporteurs #pprint.pprint(scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?id=593187")) # with shadow rapporteur #pprint.pprint(scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?id=556397")) # telecoms package sys.exit(0) pprint.pprint(scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?id=16542")) pprint.pprint(scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?id=584049")) # two rapporteurs in one committee pprint.pprint(scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?id=593435")) # with forecast
            for mepid in meps
            if (null and db.ep_meps2.find_one({'UserID': mepid}, ['_id']) == None) or not null]

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print "%s full|test|mepid <mepid> [<seq>] [<dry>]" % (sys.argv[0])
        sys.exit(1)
    args = set(sys.argv[1:])
    saver = save
    null = False
    if 'dry' in args:
        saver = jdump
    if 'null' in args:
        null = True
    if sys.argv[1] == "test":
        print jdump(scrape('28215')).encode('utf8')
        print jdump(scrape('113959')).encode('utf8')
        #print jdump(scrape('108570')).encode('utf8')
        #print jdump(scrape('1934')).encode('utf8')
        #print jdump(scrape('96919')).encode('utf8')
        #import code; code.interact(local=locals());
        sys.exit(0)
        # unreachable scratch test calls below sys.exit(0):
        print jdump(scrape("http://www.europarl.europa.eu/meps/en/1934/get.html"), None)
        print jdump(scrape("http://www.europarl.europa.eu/meps/en/28576/get.html"), None)
        print jdump(scrape("http://www.europarl.europa.eu/meps/en/1263/Elmar_BROK.html"), None)
        print jdump(scrape("http://www.europarl.europa.eu/meps/en/96739/Reinhard_B%C3%9CTIKOFER.html"), None)
        print jdump(scrape("http://www.europarl.europa.eu/meps/en/28269/Jerzy_BUZEK.html"), None)
        print jdump(scrape("http://www.europarl.europa.eu/meps/en/1186/Astrid_LULLING.html"), None)
    elif sys.argv[1] == 'mepid' and sys.argv[2]:
        #print saver(scrape(int(sys.argv[2]))).encode('utf8')
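Throughout these scripts jdump() is used both as a JSON serializer (jdump(obj)) and, via the saver indirection, as a dry-run stand-in for save() that is called as saver(data, stats). A minimal implementation compatible with both call shapes might look like this (the datetime handling is an assumption):

import json
from datetime import datetime

# Hypothetical jdump(): serializes to JSON; the optional second argument
# lets it be swapped in wherever save(data, stats) is expected.
def jdump(obj, stats=None):
    return json.dumps(obj, indent=1, ensure_ascii=False, sort_keys=True,
                      default=lambda o: o.isoformat() if isinstance(o, datetime) else repr(o))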
def seqcrawler(saver=jdump):
    stats = [0, 0]
    for u, com in getComAgendas():
        try:
            saver(scrape(u, com), stats)
        except Exception:
            # ignore failed scrapes
            logger.warn("[!] failed to scrape: %s" % u)
            logger.warn(traceback.format_exc())
    logger.info("[o] added/updated: %s/%s" % (stats[0], stats[1]))

if __name__ == "__main__":
    if len(sys.argv) > 1:
        if sys.argv[1] == "url":
            print jdump(scrape(sys.argv[2], 'XXXX')).encode('utf8')
            sys.exit(0)
        if sys.argv[1] == "test":
            print jdump([(u, d) for u, d in getComAgendas()])
            #print jdump(scrape('http://www.europarl.europa.eu/sides/getDoc.do?type=COMPARL&reference=LIBE-OJ-20120112-1&language=EN', 'LIBE')).encode('utf8')
            #import code; code.interact(local=locals());
            sys.exit(0)
        elif sys.argv[1] == 'url' and len(sys.argv) == 4:
            print jdump(scrape(sys.argv[2], sys.argv[3]))
            sys.exit(0)
    # handle opts
    args = set(sys.argv[1:])
    saver = jdump
    if 'save' in args:
        saver = save
                print 'meh\n>>>%s' % '\n>>>'.join(text[istart:istart+14]).encode('utf8')
            raise IndexError
        state += 1
        if DEBUG:
            print >> sys.stderr, state
            #print >> sys.stderr, "\t", text[istart:ptr], state
        data[state_map[state]] = '\n'.join(x for x in map(unicode.strip, text[istart:ptr]) if x)
        #else:
        #    print >> sys.stderr, '>>>>>>>>', line.encode('utf8')
        ptr += 1
    if state != 9:
        print >> sys.stderr, '>>>>>>>>', "wtfwtf", state
        logger.error('[wtf] did not reach final state %s' % state)
        return {}
    else:
        if (len(data['occupation']) > 1 and
            data['occupation'][-1][0] in [
                u"No occupation held during the three years preceding the current mandate",
                u"Καμία επαγγελματική δραστηριότητα κατά τη διάρκεια των τριών ετών που προηγήθηκαν της τρέχουσας εντολής",
                u"Atividade Liberal como autor/outras atividades artísticas (remuneração inferior a 500 € na totalidade dos 3 anos anteriores)",
                u"Brak działalności zawodowej w okresie trzech lat poprzedzających obecną kadencję",
                u"Geen beroep uitgeoefend gedurende de drie jaar voorafgaand aan de huidige zittingsperiode",
                u"Nessuna attività svolta durante i tre anni precedenti l'attuale mandato",
                u"Keine Berufstätigkeit während des Dreijahreszeitraums vor der laufenden Wahlperiode",
            ]):
            # drop the trailing "no occupation" boilerplate entry (any language)
            del data['occupation'][-1]
        return data

if __name__ == "__main__":
    DEBUG = True
    print jdump(scrape(sys.argv[1]))
def crawl(saver=jdump, null=False):
    for celexid, data in sources("%s/index.htm" % crawlroot, []):
        if (null and db.eurlex.find_one({'id.celexid': celexid}, ['_id']) == None) or not null:
            try:
                tmp = saver(scrape(celexid, data), [0, 0])
            except Exception:
                logger.warn("[!] failed to scrape %s" % celexid)
                continue
            yield tmp

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print "%s [<chapter>] [<dry>] [<null>]" % (sys.argv[0])
    elif sys.argv[1] == 'url' and sys.argv[2]:
        print jdump(scrape(sys.argv[2], []))
        sys.exit(0)
    args = set(sys.argv[1:])
    saver = save
    null = False
    if 'dry' in args:
        saver = jdump
    if 'null' in args:
        null = True
    iter = crawl(saver=saver, null=null)
    if 'dry' in args:
        print "[\n%s" % iter.next()
    for res in iter:
        if 'dry' in args:
            print ",\n%s" % res.encode('utf8')
    print ".oOo." * 35
                tmpdate = tmp[2]
                del tmp[2]
                if tmp in [["Date", ":", "Signature", ":"]]:
                    data["date"] = tmpdate
                    break
            ptr += 1
            if ptr >= len(text):
                logger.error("[meh] fail find end in I")
                if DEBUG:
                    print "meh\n>>>%s" % "\n>>>".join(text[istart:istart + 14]).encode("utf8")
                raise IndexError
        state += 1
        if DEBUG:
            print >> sys.stderr, state
            #print >> sys.stderr, "\t", text[istart:ptr], state
        data[state_map[state]] = "\n".join(x for x in map(unicode.strip, text[istart:ptr]) if x)
        #else:
        #    print >> sys.stderr, '>>>>>>>>', line.encode('utf8')
        ptr += 1
    if state != 9:
        print >> sys.stderr, ">>>>>>>>", "wtfwtf", state
        logger.error("[wtf] did not reach final state %s" % state)
        return {}
    else:
        return data

if __name__ == "__main__":
    DEBUG = True
    print jdump(scrape(sys.argv[1]))
def seqcrawler(saver=jdump):
    stats = [0, 0]
    for u, com in getComAgendas():
        try:
            saver(scrape(u, com), stats)
        except Exception:
            # ignore failed scrapes
            logger.warn("[!] failed to scrape: %s" % u)
            logger.warn(traceback.format_exc())
    logger.info("[o] added/updated: %s/%s" % (stats[0], stats[1]))

if __name__ == "__main__":
    if len(sys.argv) > 1:
        if sys.argv[1] == "test":
            print jdump(scrape('http://www.europarl.europa.eu/sides/getDoc.do?type=COMPARL&reference=ECON-OJ-20120109-1&language=EN', 'ECON')).encode('utf8')
            print jdump(scrape('http://www.europarl.europa.eu/sides/getDoc.do?type=COMPARL&reference=LIBE-OJ-20120112-1&language=EN', 'LIBE')).encode('utf8')
            #import code; code.interact(local=locals());
            sys.exit(0)
        elif sys.argv[1] == 'url' and len(sys.argv) == 4:
            print jdump(scrape(sys.argv[2], sys.argv[3]))
            sys.exit(0)
    # handle opts
    args = set(sys.argv[1:])
    saver = jdump
    if 'save' in args:
        saver = save
    if 'seq' in args:
        res = seqcrawler(saver=saver)
    else: