Example #1
0
def scrape(url):
    """Parse the roll-call-vote XML at *url*, store each vote and return them.

    For every ``RollCallVote.Result`` element a dict is built with the keys
    ``ts``, ``url``, ``voteid`` and ``title``, extended with whatever
    ``votemeta(title, ts)`` returns, plus one entry per outcome that is
    present (``For``, ``Against``, ``Abstain``).  Each outcome entry holds the
    ``total`` attribute as found in the XML and a list of political groups
    with their member votes.

    Each vote dict is upserted into ``db.ep_votes`` and the list of all vote
    dicts is returned.

    Raises IndexError if a vote has no description text or a member element
    has no text node.
    """
    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    print("scraping %s" % (url,))
    root = etree.parse(fetch_raw(url))
    # root is:
    #PV.RollCallVoteResults EP.Number="PE 533.923" EP.Reference="P7_PV(2014)04-17" Sitting.Date="2014-04-17" Sitting.Identifier="1598443"
    outcomes = [('Result.For', 'For'),
                ('Result.Against', 'Against'),
                ('Result.Abstention', 'Abstain')]
    votes = []
    for vote in root.xpath('//RollCallVote.Result'):
        res = {u"ts": datetime.strptime(vote.get('Date'), "%Y-%m-%d %H:%M:%S"),
               u"url": url,
               u"voteid": vote.get('Identifier'),
               u"title": vote.xpath("RollCallVote.Description.Text/text()")[0]}
        res.update(votemeta(res['title'], res['ts']))
        for tag, stype in outcomes:
            found = vote.xpath(tag)
            if not found:
                continue
            if len(found) > 1:
                print("[pff] more than one %s entry in vote" % (stype,))
            # Only the first matching element is used.
            result = found[0]
            groups = []
            for group in result.xpath('Result.PoliticalGroup.List'):
                members = []
                for mep in group.xpath('PoliticalGroup.Member.Name'):
                    name = mep.xpath('text()')[0]
                    # The lookup uses the stripped name; the stored name
                    # keeps the text exactly as it appears in the XML.
                    members.append({u'userid': int(mep.get('MepId')),
                                    u'ep_id': getMep(name.strip(), res['ts']),
                                    u'name': name})
                groups.append({u'group': group.get('Identifier'),
                               u'votes': members})
            res[stype] = {u'total': result.get('Number'),
                          u'groups': groups}
        # save
        # NOTE(review): the query matches 'title' against the vote identifier,
        # while the stored 'title' is the description text -- confirm whether
        # the key was meant to be 'voteid'.
        q = {'title': res['voteid'],
             'ts':    res['ts']}
        db.ep_votes.update(q, {"$set": res}, upsert=True)
        votes.append(res)
    return votes
Example #2
0
def getraw(pdf):
    """Download *pdf* via ``fetch_raw`` and return its text from ``pdftotext``.

    The document is written to a temporary file, which is passed to
    ``pdftotext`` with the ``-nopgbrk`` and ``-layout`` options and ``-`` as
    the output argument.  The temporary file is removed even when the
    download or the conversion raises.
    """
    (fd, fname) = mkstemp()
    try:
        # PDF content is binary, so the handle must not be in text mode.
        with os.fdopen(fd, 'wb') as tmp:
            tmp.write(fetch_raw(pdf).read())
        return pdftotext('-nopgbrk', '-layout', fname, '-')
    finally:
        os.unlink(fname)
Example #3
0
def getraw(pdf):
    """Fetch the PDF at *pdf* and return the text produced by ``pdftotext``.

    The raw bytes returned by ``fetch_raw(pdf).read()`` are spooled to a
    temporary file that is handed to ``pdftotext`` together with
    ``-nopgbrk`` and ``-layout``; ``-`` is passed as the output argument.
    The temporary file is always deleted, including on errors.
    """
    (fd, fname) = mkstemp()
    try:
        # Binary mode: the payload is a PDF, not text.
        with os.fdopen(fd, "wb") as out:
            out.write(fetch_raw(pdf).read())
        text = pdftotext("-nopgbrk", "-layout", fname, "-")
    finally:
        os.unlink(fname)
    return text
Example #4
0
def getraw(pdf):
    """Fetch the PDF at *pdf* and return the text of a fixed page region.

    The download is stored in a temporary file and converted by
    ``pdftotext`` with ``-nopgbrk``, ``-layout`` and the fixed values
    ``-x 70 -y 63 -H 631 -W 473`` (presumably the crop rectangle of the
    pdftotext command line -- confirm against the tool's documentation).
    The temporary file is removed even if fetching or converting fails.
    """
    x, y, h, w = 70, 63, 631, 473
    (fd, fname) = mkstemp()
    try:
        # PDF data is binary; a text-mode handle could corrupt or reject it.
        with os.fdopen(fd, "wb") as out:
            out.write(fetch_raw(pdf).read())
        return pdftotext("-nopgbrk", "-layout",
                         "-x", x, "-y", y, "-H", h, "-W", w,
                         fname, "-")
    finally:
        os.unlink(fname)
Example #5
0
def getraw(pdf):
    """Return the ``pdftotext`` output for a fixed region of the PDF at *pdf*.

    ``fetch_raw(pdf)`` is read into a temporary file, which is then passed to
    ``pdftotext`` along with ``-nopgbrk``, ``-layout`` and the hard-coded
    ``-x``/``-y``/``-H``/``-W`` values below (presumably a crop rectangle --
    confirm against the pdftotext documentation).  The temporary file is
    deleted on both success and failure.
    """
    x, y, h, w = 70, 63, 631, 473
    (fd, fname) = mkstemp()
    try:
        # Write in binary mode because the payload is a PDF.
        with os.fdopen(fd, 'wb') as tmp:
            tmp.write(fetch_raw(pdf).read())
        text = pdftotext('-nopgbrk', '-layout',
                         '-x', x, '-y', y, '-H', h, '-W', w,
                         fname, '-')
    finally:
        os.unlink(fname)
    return text
Example #6
0
def getraw(pdf):
    """Download *pdf* and convert it to text with ``pdftotext``.

    The content read from ``fetch_raw(pdf)`` goes into a temporary file;
    ``pdftotext`` receives ``-nopgbrk``, ``-layout``, that file name and
    ``-`` as its arguments, and its result is returned.  Cleanup of the
    temporary file is guaranteed by the ``finally`` clause.
    """
    (fd, fname) = mkstemp()
    try:
        # The PDF is binary data, hence 'wb' rather than 'w'.
        with os.fdopen(fd, 'wb') as pdf_file:
            pdf_file.write(fetch_raw(pdf).read())
        return pdftotext('-nopgbrk',
                         '-layout',
                         fname,
                         '-')
    finally:
        os.unlink(fname)
Example #7
0
def getraw(pdf):
    """Download *pdf* and return ``pdftotext`` output for a fixed region.

    The bytes from ``fetch_raw(pdf)`` are saved to a temporary file and
    converted with ``-nopgbrk``, ``-layout`` and the constant
    ``-x``/``-y``/``-H``/``-W`` arguments defined below (presumably the crop
    area understood by pdftotext -- verify).  The temporary file is removed
    whether or not the conversion succeeds.
    """
    x, y, h, w = 70, 63, 631, 473
    (fd, fname) = mkstemp()
    try:
        # Binary mode keeps the PDF bytes intact.
        with os.fdopen(fd, 'wb') as pdf_file:
            pdf_file.write(fetch_raw(pdf).read())
        return pdftotext('-nopgbrk',
                         '-layout',
                         '-x', x,
                         '-y', y,
                         '-H', h,
                         '-W', w,
                         fname,
                         '-')
    finally:
        os.unlink(fname)
Example #8
0
def parse(celexid):
    """Fetch the DOC version of *celexid* from EUR-Lex and return it as an ODT.

    The document is downloaded to a temporary ``.doc`` file, converted to
    ``.odt`` by a headless LibreOffice run writing into a temporary
    directory, and the converted file is wrapped in ``ODT``.  The temporary
    file and directory are removed on success and on failure.

    The exit status of LibreOffice is not checked; if no ``.odt`` was
    produced the error surfaces from ``ODT``.
    """
    (fd, fname)=mkstemp('.doc')
    try:
        # The .doc payload is binary, so write it through a binary handle.
        with os.fdopen(fd, 'wb') as doc:
            doc.write(fetch_raw("http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=%s:DOC" % celexid).read())
        resdir=mkdtemp()
        try:
            # Discard everything the converter prints.
            with open(os.devnull, 'w') as null:
                p = Popen(['/usr/bin/libreoffice', '--headless', '--convert-to', 'odt', '--outdir', resdir, fname],
                          stdout=null,
                          stderr=null)
                p.wait()
            # The converter names its output after the input file, with the
            # extension replaced by .odt.
            stem=os.path.splitext(os.path.basename(fname))[0]
            # ODT is constructed before resdir is removed, as in the
            # original statement order.
            return ODT(os.path.join(resdir, stem+'.odt'))
        finally:
            rmtree(resdir)
    finally:
        os.unlink(fname)
Example #9
0
def getactivities(mepid, terms=(8,)):
    """Collect the activity documents of MEP *mepid* for the given *terms*.

    For every key of ``activitymap`` and every term, the paginated
    ``see_more.html`` endpoint is queried starting at index 0.  Each response
    is parsed as JSON; its ``documentList`` is accumulated and its
    ``nextIndex`` is used for the next request until it is ``-1`` or ``0``.

    Returns a dict ``{activity_type: {str(term): [documents, ...]}}``
    containing only the terms and activity types that yielded at least one
    document.
    """
    urltpl = 'http://www.europarl.europa.eu/meps/en/%s/see_more.html?type=%s&leg=%s&index=%s'
    #ctjson={'content-type': 'application/json'}
    actions = {}
    for kind in activitymap:
        per_term = {}
        for term in terms:
            term_key = str(term)
            docs = []
            idx = 0
            while True:
                res = fetch_raw(urltpl % (mepid, kind, term_key, idx)) #, headers=ctjson)
                ret = json.load(res)
                docs.extend(ret['documentList'])
                idx = ret['nextIndex']
                # -1 or 0 as the next index ends the pagination.
                if idx in (-1, 0):
                    break
            if docs:
                per_term[term_key] = docs
        if per_term:
            actions[kind] = per_term

    return actions
Example #10
0
def parse(celexid):
    """Download the DOC rendition of *celexid* and return it as an ``ODT``.

    Steps: save the EUR-Lex download to a temporary ``.doc`` file, run
    headless LibreOffice to convert it to ``.odt`` inside a temporary
    directory, then build an ``ODT`` from the converted file.  Both
    temporary artefacts are cleaned up even when a step raises.

    LibreOffice's exit status is not inspected; a failed conversion shows up
    as an error from ``ODT`` when the expected file is missing.
    """
    (fd, fname) = mkstemp('.doc')
    try:
        # Binary handle: the downloaded .doc is not text.
        with os.fdopen(fd, 'wb') as doc:
            doc.write(
                fetch_raw(
                    "http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=%s:DOC" %
                    celexid).read())
        resdir = mkdtemp()
        try:
            # Converter output is discarded.
            with open(os.devnull, 'w') as null:
                p = Popen([
                    '/usr/bin/libreoffice', '--headless', '--convert-to', 'odt',
                    '--outdir', resdir, fname
                ],
                          stdout=null,
                          stderr=null)
                p.wait()
            # Output file carries the input's base name with an .odt suffix.
            stem = os.path.splitext(os.path.basename(fname))[0]
            # Build the ODT while resdir still exists; it is removed after.
            return ODT(os.path.join(resdir, stem + '.odt'))
        finally:
            rmtree(resdir)
    finally:
        os.unlink(fname)