def _group_votes(result, ts):
    """Collect the per-political-group member votes below one result node.

    :param result: element whose ``Result.PoliticalGroup.List`` children are
        read (one of the For / Against / Abstention nodes).
    :param ts: timestamp handed to ``getMep`` together with each member name.
    :returns: list of ``{u'group': ..., u'votes': [...]}`` dicts, one per
        ``Result.PoliticalGroup.List`` child.
    """
    groups = []
    for group in result.xpath('Result.PoliticalGroup.List'):
        members = []
        for mep in group.xpath('PoliticalGroup.Member.Name'):
            userid = int(mep.get('MepId'))
            # Evaluate the text node once: the stored name keeps its original
            # whitespace, only the getMep() lookup uses the stripped form.
            name = mep.xpath('text()')[0]
            members.append({u'userid': userid,
                            u'ep_id': getMep(name.strip(), ts),
                            u'name': name})
        groups.append({u'group': group.get('Identifier'),
                       u'votes': members})
    return groups


def scrape(url):
    """Parse a roll-call-vote XML document and upsert every vote it contains.

    The document is fetched with ``fetch_raw(url)`` and parsed with
    ``etree.parse``. For each ``RollCallVote.Result`` element a record is
    built from its ``Date`` and ``Identifier`` attributes and its description
    text, enriched with whatever ``votemeta`` returns, and extended with one
    entry per outcome present (``For``, ``Against``, ``Abstain``). Each record
    is upserted into ``db.ep_votes``.

    :param url: location of the XML document; also stored in every record.
    :returns: list of the record dicts, in document order.
    :raises IndexError: if a vote has no description text or a member element
        has no text node.
    """
    print("scraping %s" % url)
    root = etree.parse(fetch_raw(url))
    # root is:
    # PV.RollCallVoteResults EP.Number="PE 533.923"
    #   EP.Reference="P7_PV(2014)04-17" Sitting.Date="2014-04-17"
    #   Sitting.Identifier="1598443"
    votes = []
    for vote in root.xpath('//RollCallVote.Result'):
        res = {
            u"ts": datetime.strptime(vote.get('Date'), "%Y-%m-%d %H:%M:%S"),
            u"url": url,
            u"voteid": vote.get('Identifier'),
            u"title": vote.xpath("RollCallVote.Description.Text/text()")[0],
        }
        res.update(votemeta(res['title'], res['ts']))

        outcomes = (('Result.For', 'For'),
                    ('Result.Against', 'Against'),
                    ('Result.Abstention', 'Abstain'))
        for tag, stype in outcomes:
            nodes = vote.xpath(tag)
            if not nodes:
                continue
            if len(nodes) > 1:
                # Only the first entry is used; the rest are reported.
                print("[pff] more than one %s entry in vote" % stype)
            node = nodes[0]
            res[stype] = {u'total': node.get('Number'),
                          u'groups': _group_votes(node, res['ts'])}

        # save
        # NOTE(review): the filter matches 'title' against the vote id while
        # the $set payload stores the description text under 'title' --
        # confirm that this is the intended upsert key.
        q = {'title': res['voteid'], 'ts': res['ts']}
        db.ep_votes.update(q, {"$set": res}, upsert=True)
        votes.append(res)
    return votes
def getraw(pdf):
    """Download a PDF and return its text as produced by ``pdftotext``.

    The content read from ``fetch_raw(pdf)`` is written to a temporary file,
    which is converted with the ``-nopgbrk`` and ``-layout`` options and
    removed again before returning.

    :param pdf: location of the PDF, passed unchanged to ``fetch_raw``.
    :returns: the result of the ``pdftotext`` call.
    """
    (fd, fname) = mkstemp()
    try:
        # PDF content is binary, so write it through a binary-mode handle;
        # the ``with`` block closes it even if the download fails.
        with os.fdopen(fd, 'wb') as tmp:
            tmp.write(fetch_raw(pdf).read())
        return pdftotext('-nopgbrk', '-layout', fname, '-')
    finally:
        # Always remove the temp file, also when fetching/converting raised.
        os.unlink(fname)
def getraw(pdf):
    """Fetch a PDF and return the text ``pdftotext`` extracts from it.

    The bytes read from ``fetch_raw(pdf)`` are spooled into a temporary file
    that is handed to ``pdftotext`` with ``-nopgbrk -layout``; the temporary
    file is deleted before the function returns.

    :param pdf: location of the PDF, passed unchanged to ``fetch_raw``.
    :returns: the result of the ``pdftotext`` call.
    """
    (fd, fname) = mkstemp()
    try:
        # Binary mode: the payload is a PDF, not text. The context manager
        # guarantees the descriptor is closed if the download raises.
        with os.fdopen(fd, "wb") as tmp:
            tmp.write(fetch_raw(pdf).read())
        return pdftotext("-nopgbrk", "-layout", fname, "-")
    finally:
        # Clean up the temp file on success and on failure alike.
        os.unlink(fname)
def getraw(pdf):
    """Fetch a PDF and return the text of a fixed region of its pages.

    The content read from ``fetch_raw(pdf)`` is written to a temporary file
    and converted by ``pdftotext`` with ``-nopgbrk -layout`` plus the region
    arguments ``-x 70 -y 63 -H 631 -W 473``. The temporary file is removed
    before returning.

    :param pdf: location of the PDF, passed unchanged to ``fetch_raw``.
    :returns: the result of the ``pdftotext`` call.
    """
    # Region handed to pdftotext via -x/-y/-H/-W (presumably the crop box
    # that excludes page headers/footers -- confirm against the documents).
    x, y, h, w = 70, 63, 631, 473
    (fd, fname) = mkstemp()
    try:
        # PDF content is binary; ``with`` closes the handle even on error.
        with os.fdopen(fd, "wb") as tmp:
            tmp.write(fetch_raw(pdf).read())
        return pdftotext("-nopgbrk", "-layout",
                         "-x", x, "-y", y, "-H", h, "-W", w,
                         fname, "-")
    finally:
        # Do not leave the temp file behind if fetching or converting fails.
        os.unlink(fname)
def getraw(pdf):
    """Download a PDF and extract the text of a fixed page region.

    The data read from ``fetch_raw(pdf)`` goes into a temporary file, which
    ``pdftotext`` converts using ``-nopgbrk -layout`` and the region
    arguments ``-x 70 -y 63 -H 631 -W 473``. The temporary file is deleted
    before the function returns.

    :param pdf: location of the PDF, passed unchanged to ``fetch_raw``.
    :returns: the result of the ``pdftotext`` call.
    """
    # Values for pdftotext's -x/-y/-H/-W arguments (presumably a crop box
    # skipping page margins -- confirm against the source documents).
    x, y, h, w = 70, 63, 631, 473
    (fd, fname) = mkstemp()
    try:
        # Write in binary mode (the payload is a PDF) and let the context
        # manager close the descriptor even if the download raises.
        with os.fdopen(fd, 'wb') as tmp:
            tmp.write(fetch_raw(pdf).read())
        return pdftotext('-nopgbrk', '-layout',
                         '-x', x, '-y', y, '-H', h, '-W', w,
                         fname, '-')
    finally:
        # The temp file is removed whether or not the conversion succeeded.
        os.unlink(fname)
def getraw(pdf):
    """Return the ``pdftotext`` rendering of the PDF found at *pdf*.

    The document is read from ``fetch_raw(pdf)``, stored in a temporary
    file, converted with ``-nopgbrk -layout`` and the temporary file is
    unlinked before returning.

    :param pdf: location of the PDF, passed unchanged to ``fetch_raw``.
    :returns: the result of the ``pdftotext`` call.
    """
    (fd, fname) = mkstemp()
    try:
        # The payload is binary, hence 'wb'; the handle is closed by the
        # ``with`` block even when fetch_raw() or read() raises.
        with os.fdopen(fd, 'wb') as tmp:
            tmp.write(fetch_raw(pdf).read())
        return pdftotext('-nopgbrk', '-layout', fname, '-')
    finally:
        # Never leak the temp file, regardless of how we leave the function.
        os.unlink(fname)
def getraw(pdf):
    """Return the ``pdftotext`` rendering of a fixed region of a PDF.

    The document is read from ``fetch_raw(pdf)``, stored in a temporary
    file and converted with ``-nopgbrk -layout`` together with the region
    arguments ``-x 70 -y 63 -H 631 -W 473``. The temporary file is unlinked
    before returning.

    :param pdf: location of the PDF, passed unchanged to ``fetch_raw``.
    :returns: the result of the ``pdftotext`` call.
    """
    # Passed as pdftotext's -x/-y/-H/-W values (presumably the crop area
    # containing the body text -- confirm against the source documents).
    x, y, h, w = 70, 63, 631, 473
    (fd, fname) = mkstemp()
    try:
        # Binary payload, binary handle; closed automatically on any error.
        with os.fdopen(fd, 'wb') as tmp:
            tmp.write(fetch_raw(pdf).read())
        return pdftotext('-nopgbrk', '-layout',
                         '-x', x, '-y', y, '-H', h, '-W', w,
                         fname, '-')
    finally:
        # Remove the temp file even if the download or conversion failed.
        os.unlink(fname)
def parse(celexid):
    """Download the DOC rendition of a EUR-Lex document and load it as ODT.

    The ``.doc`` file is fetched through ``fetch_raw``, converted to ``.odt``
    by a headless LibreOffice run into a fresh temporary directory, and the
    converted file is wrapped in ``ODT``. Both the downloaded file and the
    temporary directory are removed before returning.

    :param celexid: identifier interpolated into the LexUriServ ``uri``
        query parameter.
    :returns: the ``ODT`` object built from the converted file.
    """
    url = "http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=%s:DOC" % celexid
    (fd, fname) = mkstemp('.doc')
    resdir = None
    try:
        # The .doc payload is binary; ``with`` closes the handle on error too.
        with os.fdopen(fd, 'wb') as doc:
            doc.write(fetch_raw(url).read())

        resdir = mkdtemp()
        # LibreOffice output is discarded; its exit status is not checked,
        # so a failed conversion surfaces when ODT() opens the missing file.
        with open(os.devnull, 'w+') as null:
            p = Popen(['/usr/bin/libreoffice', '--headless',
                       '--convert-to', 'odt', '--outdir', resdir, fname],
                      stdout=null, stderr=null)
            p.wait()

        # The output keeps the input's base name: swap '.doc' for '.odt'.
        odt_name = os.path.basename(fname)[:-4] + '.odt'
        return ODT(resdir + '/' + odt_name)
    finally:
        # Clean up on every exit path so failures do not litter the temp dir.
        os.unlink(fname)
        if resdir is not None:
            rmtree(resdir)
def getactivities(mepid, terms=(8,)):
    """Collect the activity listings of one MEP for the given terms.

    For every key of ``activitymap`` and every term, the paginated
    ``see_more.html`` endpoint is requested repeatedly, starting at index 0
    and following the ``nextIndex`` value of each JSON response until it is
    ``-1`` or ``0``. The ``documentList`` entries of all pages are
    concatenated.

    :param mepid: MEP identifier interpolated into the request URL.
    :param terms: iterable of legislative terms to query; each is converted
        with ``str()`` for both the URL and the result key.
    :returns: ``{activity type: {term as str: [documents]}}``; terms without
        documents and types without any term are left out.
    """
    urltpl = 'http://www.europarl.europa.eu/meps/en/%s/see_more.html?type=%s&leg=%s&index=%s'
    actions = {}
    for kind in activitymap:
        per_term = {}
        for term in terms:
            term_key = str(term)
            documents = []
            idx = 0
            while True:
                res = fetch_raw(urltpl % (mepid, kind, term_key, idx))
                ret = json.load(res)
                documents.extend(ret['documentList'])
                idx = ret['nextIndex']
                # -1 or 0 as next index marks the last page.
                if idx in (-1, 0):
                    break
            # Only record terms that actually produced documents.
            if documents:
                per_term[term_key] = documents
        if per_term:
            actions[kind] = per_term
    return actions
def parse(celexid):
    """Fetch a EUR-Lex document as DOC, convert it to ODT and load it.

    The DOC rendition is downloaded via ``fetch_raw`` into a temporary
    ``.doc`` file, converted by ``/usr/bin/libreoffice --headless`` into a
    newly created temporary directory, and the resulting ``.odt`` file is
    passed to ``ODT``. The temporary file and directory are deleted before
    the function returns.

    :param celexid: identifier interpolated into the LexUriServ ``uri``
        query parameter.
    :returns: the ``ODT`` object built from the converted file.
    """
    url = ("http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=%s:DOC"
           % celexid)
    (fd, fname) = mkstemp('.doc')
    resdir = None
    try:
        # Binary payload, binary handle; closed even if the download raises.
        with os.fdopen(fd, 'wb') as doc:
            doc.write(fetch_raw(url).read())

        resdir = mkdtemp()
        # Converter output goes to the null device. The exit status is not
        # inspected; a failed conversion shows up when ODT() is constructed.
        with open(os.devnull, 'w+') as null:
            p = Popen([
                '/usr/bin/libreoffice', '--headless', '--convert-to', 'odt',
                '--outdir', resdir, fname
            ], stdout=null, stderr=null)
            p.wait()

        # Converted file has the same base name with '.doc' replaced.
        odt_name = os.path.basename(fname)[:-4] + '.odt'
        return ODT(resdir + '/' + odt_name)
    finally:
        # Remove temporary artefacts on success and on failure alike.
        os.unlink(fname)
        if resdir is not None:
            rmtree(resdir)