コード例 #1
0
ファイル: amendments.py プロジェクト: bjornkri/parltrack
def getComAms(leg=TERM, update=False):
    """Crawl the EP committee document search pages and yield amendment
    documents not yet stored in the database.

    :param leg:    parliamentary term to query (defaults to module-level TERM)
    :param update: if True, only the first result page per committee is
                   scanned (incremental update mode)
    :yields: (pdf_url, rapporteur_text_or_None) tuples for documents whose
             'src' is not already present in db.ep_ams
    """
    #urltpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html"
    urltpl="http://www.europarl.europa.eu/committees/en/%s/search-in-documents.html"
    # todo add to searchRPCD, OPCD
    for doctype in ['AMCO', 'RPCD', 'OPCD']:
        postdata="clean=false&leg=%s&docType=%s&miType=text" % (leg, doctype)
        # fix: pagination must use the same "search-in-documents" endpoint as
        # urltpl above; the old "documents-search" path is the retired URL
        # (see the matching template pair in getComAgendas)
        nexttpl="http://www.europarl.europa.eu/committees/en/%s/search-in-documents.html?action=%s&tabActif=tabResult#sidesForm"
        for com in (k for k in COMMITTEE_MAP.keys()
                    if len(k)==4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP', 'SURE', 'CRIM', 'CRIS']):
            url=urltpl % (com)
            i=0
            logger.info('%s %s crawling %s' % (datetime.now().isoformat(), doctype, com))
            root=fetch(url, params=postdata)
            prev=[]
            while True:
                logger.info("%s %s" % (datetime.now().isoformat(), url))
                #logger.info(tostring(root))
                # map pdf href -> rapporteur names (AMCO pages have no
                # rapporteur paragraph, so store None there)
                tmp={a.get('href'): ' '.join(a.xpath('../../../p[@class="rapporteurs"]//text()')) if doctype != 'AMCO' else None
                     for a in root.xpath('//a[@title="open this PDF in a new window"]')
                     if (len(a.get('href',''))>13)}
                # stop when the page is empty or identical to the previous one
                # (the site serves the last page again past the end)
                if not tmp or prev==tmp:
                    break
                prev=tmp
                for u, v in sorted(tmp.items()):
                    if db.ep_ams.find_one({'src': u}): continue
                    yield u, v
                if update: break
                i+=1
                url=nexttpl % (com,i)
                root=fetch(url)
コード例 #2
0
def getComAgendas():
    """Crawl the EP committee document search pages for draft agendas.

    Iterates over all 4-letter committees (minus a blacklist of defunct or
    special committees), pages through the search results, and yields
    (agenda_url, committee_abbrev) tuples for every document whose title
    starts with 'DRAFT AGENDA'.
    """
    #urltpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html"
    urltpl="http://www.europarl.europa.eu/committees/en/%s/search-in-documents.html"
    postdata="docType=AGEN&leg=8&miType=text&tabActif=tabResult#sidesForm"
    #nexttpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?action=%s&tabActif=tabResult#sidesForm"
    nexttpl="http://www.europarl.europa.eu/committees/en/%s/search-in-documents.html?action=%s&tabActif=tabResult#sidesForm"
    for com in (k for k in COMMITTEE_MAP.keys()
                if len(k)==4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP', 'SURE', 'CRIM', 'CRIS']):
        url=urltpl % (com)
        i=0
        # fix: removed unused local `agendas=[]` (results are yielded, never
        # accumulated)
        logger.info('scraping %s' % com)
        root=fetch(url, params=postdata)
        prev=[]
        while True:
            logger.info("%s %s" % (datetime.now().isoformat(), url))
            tmp=[(a.get('href'), unws(a.xpath('text()')[0]))
                 for a in root.xpath('//p[@class="title"]/a')
                 if len(a.get('href',''))>13]
            # stop when the page is empty or repeats (site serves the last
            # page again past the end)
            if not tmp or prev==tmp: break
            prev=tmp
            for u,title in tmp:
                if title.startswith('DRAFT AGENDA'):
                    yield (u,com)
            i+=1
            url=nexttpl % (com,i)
            root=fetch(url)
コード例 #3
0
ファイル: amendments.py プロジェクト: danohuiginn/parltrack
def getComAms(leg=7, update=False):
    """Yield (pdf_url, rapporteur_text_or_None) pairs for committee
    amendment documents of the given term that are not yet in db.ep_ams.

    :param leg:    parliamentary term to query
    :param update: if True, scan only the first result page per committee
    """
    urltpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html"
    # todo add to searchRPCD, OPCD
    excluded=['CODE', 'RETT', 'CLIM', 'TDIP']
    for doctype in ['AMCO', 'RPCD', 'OPCD']:
        postdata="clean=false&leg=%s&docType=%s&miType=text" % (leg, doctype)
        nexttpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?action=%s&tabActif=tabResult#sidesForm"
        for com in (c for c in COMMITTEE_MAP.keys() if len(c)==4 and c not in excluded):
            logger.info('%s %s crawling %s' % (datetime.now().isoformat(), doctype, com))
            url=urltpl % (com)
            root=fetch(url, params=postdata)
            seen=[]
            pageno=0
            while True:
                logger.info("%s %s" % (datetime.now().isoformat(), url))
                #logger.info(tostring(root))
                # collect pdf href -> rapporteur names; AMCO pages carry no
                # rapporteur paragraph, so store None for those
                found={}
                for anchor in root.xpath('//a[@title="open this PDF in a new window"]'):
                    if len(anchor.get('href',''))>13:
                        if doctype != 'AMCO':
                            rapporteurs=' '.join(anchor.xpath('../../../p[@class="rapporteurs"]//text()'))
                        else:
                            rapporteurs=None
                        found[anchor.get('href')]=rapporteurs
                # an empty or repeated page means we ran past the last one
                if not found or seen==found:
                    break
                seen=found
                for src, rapporteurs in sorted(found.items()):
                    if db.ep_ams.find_one({'src': src}):
                        continue
                    yield src, rapporteurs
                if update:
                    break
                pageno+=1
                url=nexttpl % (com, pageno)
                root=fetch(url)
コード例 #4
0
ファイル: ep_comagendas.py プロジェクト: ehj/parltrack
def getComAgendas():
    """Crawl the EP committee document search pages (term 7) for agendas.

    Pages through each committee's AGEN search results using the
    startValue offset pagination (10 results per page) and yields
    (agenda_url, committee_abbrev) tuples for every listed document.
    """
    urltpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?&docType=AGEN&leg=7&miType=text"
    nexttpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?tabActif=tabLast&startValue=%s"
    for com in (k for k in COMMITTEE_MAP.keys() if len(k)==4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP']):
        url=urltpl % (com)
        i=0
        # fix: removed unused local `agendas=[]` (results are yielded, never
        # accumulated)
        logger.info('scraping %s' % com)
        while True:
            logger.info("crawling %s" % (url))
            root=fetch(url)
            tmp=[(a.get('href'), unws(a.xpath('text()')[0]))
                 for a in root.xpath('//p[@class="title"]/a')
                 if len(a.get('href',''))>13]
            if not tmp: break
            for u,_ in tmp:
                yield (u,com)
            # startValue is a result offset, 10 results per page
            i+=10
            url=nexttpl % (com,i)
コード例 #5
0
def getComAgendas():
    """Crawl the EP committee document search pages (term 7) for agendas.

    Pages through each committee's AGEN search results using the
    startValue offset pagination (10 results per page) and yields
    (agenda_url, committee_abbrev) tuples for every listed document.
    """
    urltpl = "http://www.europarl.europa.eu/committees/en/%s/documents-search.html?&docType=AGEN&leg=7&miType=text"
    nexttpl = "http://www.europarl.europa.eu/committees/en/%s/documents-search.html?tabActif=tabLast&startValue=%s"
    for com in (k for k in COMMITTEE_MAP.keys()
                if len(k) == 4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP']):
        url = urltpl % (com)
        i = 0
        # fix: removed unused local `agendas = []` (results are yielded,
        # never accumulated)
        logger.info('scraping %s' % com)
        while True:
            logger.info("crawling %s" % (url))
            root = fetch(url)
            tmp = [(a.get('href'), unws(a.xpath('text()')[0]))
                   for a in root.xpath('//p[@class="title"]/a')
                   if len(a.get('href', '')) > 13]
            if not tmp: break
            for u, _ in tmp:
                yield (u, com)
            # startValue is a result offset, 10 results per page
            i += 10
            url = nexttpl % (com, i)
コード例 #6
0
ファイル: oeil.py プロジェクト: parltrack/parltrack
def scrape_epagents(table):
    """Parse one EP 'players' table from an OEIL procedure page.

    Extracts the committee heading (responsible vs. opinion), any shadow
    rapporteurs, and one agent dict per table row.

    :param table: lxml element of the committee players table
    :returns: list of agent dicts, each with u'body'==u'EP', u'committee',
              u'committee_full', optional u'rapporteur' and u'shadows'
    """
    heading=''.join(table.xpath('.//td[@class="players_committee"]')[0].xpath(".//text()")).strip()
    responsible=None
    if heading in [ "Committee responsible", "Former committee responsible"]:
        responsible=True
    elif heading in ["Committee for opinion", "Former committee for opinion"]:
        responsible=False
    else:
        logger.warn(u"[!] unknown committee heading %s" % heading)

    # handle shadows
    shadowelems=table.xpath('//a[@id="shadowRapporteurHeader"]/../following-sibling::div/p//span[@class="players_rapporter_text"]/a')
    # group label: either the tiptip span's text, or looked up from the
    # linked/imaged group via groupurlmap
    tips=[t.xpath('text()')[0]
          if len(t.xpath('text()'))>0
          else
              groupurlmap[t.xpath("a")[0].get('href')]
              if len(t.xpath("a"))>0
              else groupurlmap[t.xpath("img")[0].get('src')]
          for t in table.xpath('//a[@id="shadowRapporteurHeader"]/../following-sibling::div//span[@class="tiptip"]')]
    shadows={}
    for shadow, group in izip_longest(shadowelems, tips):
        committee=shadow.xpath('./ancestor::td/preceding-sibling::td//acronym/text()')[0]
        if not committee in shadows: shadows[committee]=[]
        if group=='NI': group=u'NI'
        mep={u'name': unicode(shadow.xpath('text()')[0]),
             u'group': unicode(group)}
        tmp=getMEPRef(shadow.xpath('text()')[0])
        if tmp:
           mep[u'mepref']=tmp
        #else:
        #    raise IndexError
        shadows[committee].append(mep)
    # delete the uneccessary shadow elements - so the following regular lst2obj get's what it expects
    for todel in table.xpath('//a[@id="shadowRapporteurHeader"]/..'):
        parent=todel.xpath('..')[0]
        parent.remove(todel.xpath('following-sibling::div')[0])
        parent.remove(todel)

    # handle each row of agents
    agents=[]
    for agent in lst2obj(table,epagents,1):
        agent[u'responsible']=responsible
        agent[u'body']=u'EP'
        if agent.get('rapporteur'):
            meps=[]
            # fix: iterate over a copy -- the previous code deleted entries
            # from agent['rapporteur'] while iterating it, which skips the
            # element following each deletion; the del was also redundant
            # since the list is replaced by meps below anyway
            for mep in list(agent['rapporteur']):
                if unws(mep['name']).startswith("The committee decided not to give an opinion"):
                    # placeholder row, not a real rapporteur: record the
                    # absent opinion and skip the entry
                    agent[u'opinion']=None
                    continue
                tmp=getMEPRef(mep['name'])
                if tmp:
                    meps.append({u'mepref': tmp,
                                 u'group': mep['group'],
                                 u'name': mep['name']})
                else:
                    meps.append({u'group': mep['group'],
                                 u'name': mep['name']})
            agent[u'rapporteur']=meps

        abbr=agent['committee'][:4]
        if abbr=='BUDE': abbr='BUDG'
        if not abbr in COMMITTEE_MAP.keys():
            logger.warn(u"[!] uknown committee abbrev %s" % abbr)
            agent[u'committee_full']=agent['committee']
            # fix: length guard -- a bare 4-char committee string would
            # raise IndexError on [4]
            if len(agent['committee'])>4 and agent['committee'][4]==' ' and abbr.isalpha():
                agent[u'committee']=abbr
        else:
            agent[u'committee_full']=agent['committee'][5:]
            agent[u'committee']=abbr

        if agent.get(u'committee') in shadows.keys():
            agent[u'shadows']=shadows[agent['committee']]

        if not agent in agents: agents.append(agent)
    return agents