Code Example #1
def extract_table(table, url, date=None):
    # Turn one "+"/"-"/"0" vote-result table into {'total', 'type', 'meps'}.
    # log, junws, unws, tostring and db are helpers from the surrounding
    # parltrack project.
    trs = table.xpath('.//tr')
    header = trs[0]
    tds = header.xpath('.//td')
    if len(tds) < 2:
        log(1, "vote table has less than two columns in the header: %s %s" %
            (url, tostring(header)))
        raise ValueError("vote table header has less than two columns")
    vtype = junws(tds[1])  # renamed from `type` to avoid shadowing the builtin
    if vtype not in {"+", "-", "0"}:
        log(1, "vote header type is unexpected value %s in %s" %
            (repr(vtype), url))
        raise ValueError("unexpected vote type %s" % repr(vtype))
    res = {'total': int(junws(tds[0])), 'type': vtype, 'meps': []}
    for tr in trs[1:]:
        tds = tr.xpath('.//td')
        if len(tds) < 2:
            log(1, "vote table has less than two columns in the body: %s %s" %
                (url, tostring(tr)))
            raise ValueError("vote table row has less than two columns")
        # each <p> in the second column holds a comma-separated list of MEP names
        for names in tds[1].xpath(".//p"):
            names = junws(names)
            if not names: continue
            for m in names.split(','):
                m = unws(m)
                if not m: continue
                mepid = db.getMep(m, date=date)
                if not mepid:
                    log(2, "could not resolve MEP name: %s" % m)
                res['meps'].append(mepid or m)  # fall back to the raw name
    return res
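A minimal usage sketch for extract_table, assuming the page has already been fetched and parsed with lxml. The URL and the //table selector are illustrative placeholders, not taken from the project, and the project's log/junws/unws/db helpers must be importable for the call to run:

import requests
from lxml import html

page_url = "https://example.europa.eu/vote-results"  # hypothetical page
doc = html.fromstring(requests.get(page_url).content)
for table in doc.xpath('//table'):
    try:
        block = extract_table(table, page_url)
    except ValueError:
        continue  # malformed tables were already logged
    print(block['type'], block['total'], len(block['meps']))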
Code Example #2
def extract_proc(table, url):
    # Collect the two-column procedure table into a {title: value} dict.
    # log and junws are helpers from the surrounding parltrack project.
    res = {}
    if len(table) < 1:
        log(1, "could not find procedure table in %s" % url)
        raise ValueError("no procedure table in %s" % url)
    for tr in table[0].xpath('.//tr'):
        tds = tr.xpath('.//td')
        if len(tds) < 2: continue  # guard against rows missing a value cell
        title = junws(tds[0])
        val = junws(tds[1])
        if not title or not val: continue
        res[title] = val
    return res
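A hedged sketch of what extract_proc yields, using synthetic input built in memory (real tables come from the scraped page, and the project's junws/log helpers must be importable). The row data here is made up:

from lxml import html

snippet = ("<table><tr><td>Procedure</td><td>2019/0001(COD)</td></tr>"
           "<tr><td>Committee</td><td>AGRI</td></tr></table>")
tables = html.fromstring(snippet).xpath('//table')
print(extract_proc(tables, "https://example.europa.eu"))
# roughly: {'Procedure': '2019/0001(COD)', 'Committee': 'AGRI'}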
Code Example #3
def crawl(term, update=False, test=[], **kwargs):
    # Walk the committee amendment-search pages and queue one job per PDF.
    # fetch, log, junws, add_job, COMMITTEE_MAP, itemsPerPage and skipurls
    # are helpers/globals from the surrounding parltrack project.
    seen = set()
    url="https://www.europarl.europa.eu/committees/en/documents/search?committeeMnemoCode=%s&textualSearchMode=TITLE&textualSearch=&documentTypeCode=AMCO&reporterPersId=&procedureYear=&procedureNum=&procedureCodeType=&peNumber=&aNumber=&aNumberYear=&documentDateFrom=&documentDateTo=&meetingDateFrom=&meetingDateTo=&performSearch=true&term=%s&page=%s&pageSize={}".format(itemsPerPage)
    # committee codes are four letters; a non-empty `test` restricts the run
    for com in (k for k in test or COMMITTEE_MAP.keys() if len(k)==4):
        i=0
        log(3,'crawling %s, term: %s' % (com, term))
        try:
            root=fetch(url % (com, term, i))
        except requests.exceptions.HTTPError as e:
            log(3, "failed to get list of amendments for %s in term %d, http error code: %s" % (com, term, e.response.status_code))
            continue
        prev=[]
        while True:
            log(3, "crawling amendments search page %s for %s term %s" % (i, com, term))
            tmp=[]
            for a in root.xpath('//a[@class="erpl_document-subtitle-pdf"]'):
                u=a.get('href','')
                if len(u) <= 13:
                    log(2,'url is too short, skipping: "%s"' % u)
                    continue
                if u in seen or u in skipurls or (not u.endswith('EN') and not u.endswith('_EN.pdf')):
                    log(3,"skipping url: %s" % repr(u))
                    continue
                seen.add(u)
                tmp.append(u)
                rs = a.xpath('../../following-sibling::div/span[@class="erpl_document-subtitle-author"]')
                r = [y for y in [junws(x) for x in rs] if y]
                try:
                    payload = dict(kwargs)
                    payload['url'] = u
                    payload['meps'] = r
                    if test:
                        print(payload)
                    else:
                        add_job('amendment', payload=payload)
                except Exception:
                    print(u, r)

            if not tmp or prev==tmp or len(tmp) < itemsPerPage:
                break
            prev=tmp

            if update: break

            i+=1
            try:
                root=fetch(url % (com, term, i))
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 500:
                    log(3, "failed to fetch page %s of amendments for %s in term %d" % (i, com, term))
                break
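The test parameter doubles as a committee filter and a dry-run switch, and update=True stops after the first result page per committee. A hedged invocation sketch; COMMITTEE_MAP, fetch, add_job and the other globals belong to the real project, and 'AGRI' is just one four-letter committee code used for illustration:

# dry run: only the AGRI committee, payloads are printed instead of queued
crawl(9, test=['AGRI'])

# incremental update: scan just the first search page of every committee
crawl(9, update=True)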
Code Example #4
File: pvote.py Project: parltrack/parltrack
def scrape(url, **kwargs):
    # Parse one plenary roll-call-vote XML document into a list of vote dicts.
    # getXML, log, junws, unws, votemeta, db, process and lost_meps are
    # helpers from the surrounding parltrack project.
    log(3,"scraping %s" % (url))
    root = getXML(url)
    if root is None:
        log(1, "could not get votes for %s" % url)
        return # angrily o/
    log(3, "processing plenary votes xml from %s" % url)
    # root is:
    #PV.RollCallVoteResults EP.Number="PE 533.923" EP.Reference="P7_PV(2014)04-17" Sitting.Date="2014-04-17" Sitting.Identifier="1598443"
    votes=[]
    for vote in root.xpath('//RollCallVote.Result'):
        # hrmpf, vote timestamps sometimes include a time of day and sometimes only a date :/
        time = vote.get('Date')
        if len(time.split()) == 2:
            ts = datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
        else:
            ts = datetime.strptime(time, "%Y-%m-%d")
        tmp=vote.get('Identifier')
        if tmp:
            voteid = int(tmp)
        else:
            tmp = vote.get('Number')
            if not tmp:
                log(1, "blimey, could not deduce an id for the vote in %s" % url)
                raise ValueError("no id for vote in %s" % url)
            voteid = "%s-%s" % (ts,tmp)
        title = vote.xpath("RollCallVote.Description.Text")
        if len(title) != 1:
            log(2, "holy ambiguity Batman! This vote doesn't have one title, but %d: %s %s" % (len(title), voteid, url))
            title="!unknown!"
        else:
            title=junws(title[0])
        v={u"ts": ts,
           u"url": url,
           u"voteid": voteid,
           u"title": title,
           'votes':{}}
        v.update(votemeta(v['title'], v['ts']))
        if 'epref' not in v:
            ref = vote.xpath("RollCallVote.Description.Text/a/text()")
            if ref:
                v['epref']=unws(ref[0])
        for rtype, stype in [('Result.For','+'), ('Result.Against','-'), ('Result.Abstention','0')]:
            nodes = vote.xpath(rtype)  # renamed from `type` to avoid shadowing the builtin
            if not nodes: continue
            if len(nodes)>1:
                log(2, "[pff] more than one %s entry in vote (id:%s) in %s" % (stype, v['voteid'], url))
            node = nodes[0]
            v['votes'][stype]={'total': int(node.get('Number')),
                               'groups': {}}
            for group in node.xpath('Result.PoliticalGroup.List'):
                g = str(group.get('Identifier'))
                if g not in v['votes'][stype]['groups']:
                    v['votes'][stype]['groups'][g]=[]
                for tag in ['Member.Name', 'PoliticalGroup.Member.Name']:
                    for mep in group.xpath(tag):
                        m = {}
                        name = junws(mep)
                        mepid = mep.get("PersId")
                        if mepid:
                            mepid = int(mepid)
                        else:
                            mepid = db.getMep(name, v['ts'], abbr=g)
                        if mepid:
                            m['mepid']= mepid
                            #if int(mep.get('MepId')) in ambiguous_meps:
                            #    oid = int(mep.get('MepId'))
                            #    ambiguous_meps.remove(oid)
                            #    log(2,'found mepid for previously ambigous obscure_id: "%s": %s' % (oid, mepid))
                        else:
                            mepid = lost_meps.get(mep.get('MepId'))
                            if mepid:
                                m['mepid']= mepid
                            else:
                                m['name']= name
                                m['obscure_id']=int(mep.get('MepId'))  # it's a totally useless and confusing id that is nowhere else used
                        v['votes'][stype]['groups'][g].append(m)
        # save
        process(v, v['voteid'], db.vote, 'ep_votes', v['title'])
        votes.append(v)
    return votes
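A hedged usage sketch for scrape. The URL follows the plenary roll-call-vote XML naming as far as I can tell, but treat it as illustrative; getXML, db and process come from the surrounding project:

votes = scrape("https://www.europarl.europa.eu/doceo/document/PV-9-2020-01-15-RCV_FR.xml")
if votes:
    for v in votes:
        totals = {k: t['total'] for k, t in v['votes'].items()}
        print(v['voteid'], v['title'], totals)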