Example #1
def getComAms(leg=TERM, update=False):
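    # Generator: crawl each committee's document search for amendments (AMCO)
    # and draft reports/opinions (RPCD, OPCD), paging until a result page is
    # empty or repeats, and yield (pdf_url, rapporteur_text) pairs for
    # documents not yet in db.ep_ams; update=True checks only the first page.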
    #urltpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html"
    urltpl="http://www.europarl.europa.eu/committees/en/%s/search-in-documents.html"
    # todo: add to search RPCD, OPCD
    for doctype in ['AMCO', 'RPCD', 'OPCD']:
        postdata="clean=false&leg=%s&docType=%s&miType=text" % (leg, doctype)
        nexttpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?action=%s&tabActif=tabResult#sidesForm"
        for com in (k for k in COMMITTEE_MAP.keys()
                    if len(k)==4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP', 'SURE', 'CRIM', 'CRIS']):
            url=urltpl % (com)
            i=0
            logger.info('%s %s crawling %s' % (datetime.now().isoformat(), doctype, com))
            root=fetch(url, params=postdata)
            prev={}
            while True:
                logger.info("%s %s" % (datetime.now().isoformat(), url))
                #logger.info(tostring(root))
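                # map each PDF link to the rapporteur names shown next to it;
                # amendments (AMCO) carry no rapporteur column, hence None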
                tmp={a.get('href'): ' '.join(a.xpath('../../../p[@class="rapporteurs"]//text()')) if doctype != 'AMCO' else None
                     for a in root.xpath('//a[@title="open this PDF in a new window"]')
                     if (len(a.get('href',''))>13)}
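                # an empty or repeated result page means we are past the end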
                if not tmp or prev==tmp:
                    break
                prev=tmp
                for u, v in sorted(tmp.items()):
                    if db.ep_ams.find_one({'src': u}): continue
                    yield u, v
                if update: break
                i+=1
                url=nexttpl % (com,i)
                root=fetch(url)
Example #2
def getComAgendas():
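    # Generator: page through each committee's 8th-term document search for
    # agendas (docType=AGEN) and yield (url, committee) for entries whose
    # title starts with 'DRAFT AGENDA'.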
    #urltpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html"
    urltpl="http://www.europarl.europa.eu/committees/en/%s/search-in-documents.html"
    postdata="docType=AGEN&leg=8&miType=text&tabActif=tabResult#sidesForm"
    #nexttpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?action=%s&tabActif=tabResult#sidesForm"
    nexttpl="http://www.europarl.europa.eu/committees/en/%s/search-in-documents.html?action=%s&tabActif=tabResult#sidesForm"
    for com in (k for k in COMMITTEE_MAP.keys()
                if len(k)==4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP', 'SURE', 'CRIM', 'CRIS']):
        url=urltpl % (com)
        i=0
        logger.info('scraping %s' % com)
        root=fetch(url, params=postdata)
        prev=[]
        while True:
            logger.info("%s %s" % (datetime.now().isoformat(), url))
            tmp=[(a.get('href'), unws(a.xpath('text()')[0]))
                 for a in root.xpath('//p[@class="title"]/a')
                 if len(a.get('href',''))>13]
            if not tmp or prev==tmp: break
            prev=tmp
            for u,title in tmp:
                if title.startswith('DRAFT AGENDA'):
                    yield (u,com)
            i+=1
            url=nexttpl % (com,i)
            root=fetch(url)
Example #3
def getComAms(leg=7, update=False):
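    # Generator: 7th-term variant of the crawler above, using the older
    # documents-search.html endpoint; yields (pdf_url, rapporteur_text)
    # pairs for documents not yet stored in db.ep_ams.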
    urltpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html"
    # todo: add to search RPCD, OPCD
    for doctype in ['AMCO', 'RPCD', 'OPCD']:
        postdata="clean=false&leg=%s&docType=%s&miType=text" % (leg, doctype)
        nexttpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?action=%s&tabActif=tabResult#sidesForm"
        for com in (k for k in COMMITTEE_MAP.keys()
                    if len(k)==4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP']):
            url=urltpl % (com)
            i=0
            logger.info('%s %s crawling %s' % (datetime.now().isoformat(), doctype, com))
            root=fetch(url, params=postdata)
            prev={}
            while True:
                logger.info("%s %s" % (datetime.now().isoformat(), url))
                #logger.info(tostring(root))
                tmp={a.get('href'): ' '.join(a.xpath('../../../p[@class="rapporteurs"]//text()')) if doctype != 'AMCO' else None
                     for a in root.xpath('//a[@title="open this PDF in a new window"]')
                     if (len(a.get('href',''))>13)}
                if not tmp or prev==tmp:
                    break
                prev=tmp
                for u, v in sorted(tmp.items()):
                    if db.ep_ams.find_one({'src': u}): continue
                    yield u, v
                if update: break
                i+=1
                url=nexttpl % (com,i)
                root=fetch(url)
Example #4
def getComAgendas():
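    # Generator: page through the 7th-term agenda search (startValue paging,
    # 10 hits per page) and yield (url, committee) for every result link.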
    urltpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?&docType=AGEN&leg=7&miType=text"
    nexttpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?tabActif=tabLast&startValue=%s"
    for com in (k for k in COMMITTEE_MAP.keys() if len(k)==4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP']):
        url=urltpl % (com)
        i=0
        logger.info('scraping %s' % com)
        while True:
            logger.info("crawling %s" % (url))
            root=fetch(url)
            tmp=[(a.get('href'), unws(a.xpath('text()')[0]))
                 for a in root.xpath('//p[@class="title"]/a')
                 if len(a.get('href',''))>13]
            if not tmp: break
            for u,_ in tmp:
                yield (u,com)
            i+=10
            url=nexttpl % (com,i)
Example #5
def getComAgendas():
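    # Generator: same 7th-term agenda crawler as above; yields
    # (url, committee) pairs.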
    urltpl = "http://www.europarl.europa.eu/committees/en/%s/documents-search.html?&docType=AGEN&leg=7&miType=text"
    nexttpl = "http://www.europarl.europa.eu/committees/en/%s/documents-search.html?tabActif=tabLast&startValue=%s"
    for com in (k for k in COMMITTEE_MAP.keys()
                if len(k) == 4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP']):
        url = urltpl % (com)
        i = 0
        logger.info('scraping %s' % com)
        while True:
            logger.info("crawling %s" % (url))
            root = fetch(url)
            tmp = [(a.get('href'), unws(a.xpath('text()')[0]))
                   for a in root.xpath('//p[@class="title"]/a')
                   if len(a.get('href', '')) > 13]
            if not tmp: break
            for u, _ in tmp:
                yield (u, com)
            i += 10
            url = nexttpl % (com, i)
Example #6
def parseMember(userid):
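    # Scrape a MEP's _history page into a dict: name, photo URL, birth/death
    # data, web and social links, addresses, constituencies, committee and
    # delegation memberships, and political groups; CV and assistants are
    # fetched from separate pages below.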
    url='http://www.europarl.europa.eu/meps/en/%s/_history.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url, ignore=[500])

    data = {
        u'active': False,
        u'Photo': unicode(urljoin(BASE_URL,"/mepphoto/%s.jpg" % userid)),
        u'meta': {u'url': url}
        }

    mepdiv=root.xpath('//div[@class="zone_info_mep_transparent_mep_details"]')
    if len(mepdiv) != 1:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    mepdiv = mepdiv[0]
    data[u'Name'] = mangleName(unws(' '.join(mepdiv.xpath('.//li[@class="mep_name"]//text()'))))

    borntxt=mepdiv.xpath('.//span[@class="more_info"]/text()')
    if len(borntxt)>0:
        if unws(borntxt[-1]).startswith('Date of death:'):
            try:
                data[u'Death'] = datetime.strptime(unws(borntxt[-1]), u"Date of death: %d %B %Y")
            except ValueError:
                logger.warn('[!] failed to parse date of death %s' % url)
                logger.warn(traceback.format_exc())
            tmp = borntxt[-2].split(',', 1)
        else:
            tmp = borntxt[-1].split(',', 1)
        if len(tmp)==2:
            (d, p) = tmp
        else:
            d,p = tmp[0], None
        try:
            data[u'Birth'] = { u'date': datetime.strptime(unws(d), u"Date of birth: %d %B %Y")}
        except ValueError:
            logger.warn(traceback.format_exc())
        finally:
            if p:
                if 'Birth' in data:
                    data[u'Birth'][u'place'] = unws(p)
                else:
                    data[u'Birth'] = unws(p)
    else:
        logger.warn('[!] no birth data %s' % url)

    # scrape stuff from right column
    addif(data,u'RSS',[unicode(urljoin(BASE_URL,x.get('href')),'utf8')
                       for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_rss"]')])
    addif(data,u'Homepage',[x.get('href')
                            for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_website"]')])
    addif(data,u'Twitter',[x.get('href')
                           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_twitt"]')])
    addif(data,u'Facebook',[x.get('href')
                           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_fb"]')])
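    # mailto: links are obfuscated: strip the 7-char 'mailto:' prefix, undo
    # the '[dot]'/'[at]' encoding, then reverse the string to get the address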
    addif(data,u'Mail',[x.get('href')[7:].replace('[dot]','.').replace('[at]','@')[::-1]
                        for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_email"]')])
    # contact information
    for span in root.xpath('//div[@id="content_right"]//h3'):
        title=unws(''.join(span.xpath('.//text()')))
        if title == "Contacts":
            addif(data,u'Addresses',getAddress(span))

    # scrape main content
    for section in root.xpath('//div[@id="content_left"]/div[@class="boxcontent nobackground"]/h4'):
        key=unws(''.join(section.xpath('.//text()')))
        if key=="National parties":
            # constituencies
            key='Constituencies'
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, party = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if not key in data: data[key]=[]
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                cstart = party.rfind(' (')
                if party[cstart+2:-1] in SEIRTNUOC:
                    country = party[cstart+2:-1]
                    party = party[:cstart]
                else:
                    logger.warn('unknown country: %s' % party[cstart+2:-1])
                    country='unknown'
                #print etree.tostring(constlm, pretty_print=True)
                data[key].append({
                    u'party':     party,
                    u'country':   country,
                    u'start':     datetime.strptime(unws(start), u"%d.%m.%Y"),
                    u'end':       datetime.strptime(unws(end), u"%d.%m.%Y"),
                    })
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President', 'Observer', 'Quaestor']:
            # memberships in various committees, delegations and EP mgt
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, org = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                item={u'role': key,
                      u'abbr': COMMITTEE_MAP.get(org),
                      u'Organization': org,
                      u'start':     datetime.strptime(unws(start), u"%d.%m.%Y"),
                      u'end':       datetime.strptime(unws(end), u"%d.%m.%Y"),
                      }
                for start, field in orgmaps:
                    if item['abbr'] in COMMITTEE_MAP or item['Organization'].startswith(start):
                        if not field in data: data[field]=[]
                        if field=='Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id']=COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        elif key == u'Political groups':
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                interval, org = line.split(' : ',1)
                tmp = org.split(u' - ')
                if len(tmp)>1:
                    org = ' - '.join(tmp[:-1])
                    role = tmp[-1]
                elif org.endswith(' -'):
                    org=org[:-2]
                    role=''
                else:
                    logger.error('[!] political group line %s' % line)
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                if not u'Groups' in data: data[u'Groups']=[]
                data[u'Groups'].append(
                    {u'role':         role,
                     u'Organization': org,
                     u'country':      COUNTRIES.get(unws(constlm.get('class')).upper(), 'unknown country: %s' % unws(constlm.get('class'))),
                     u'groupid':      group_map[org],
                     u'start':        datetime.strptime(unws(start), u"%d.%m.%Y"),
                     u'end':          datetime.strptime(unws(end), u"%d.%m.%Y"),
                     })
        else:
            logger.error('[!] unknown field %s' % key)

    # sort all lists in descending order
    for fld in ['Constituencies', 'Groups', 'Committees', 'Delegations', 'Staff']:
        if not fld in data: continue
        data[fld]=sorted(data[fld],
                         key=lambda x: x.get('end',x['start']),
                         reverse=True)

    # get CV page (lives on a separate http path :/)
    cvurl='http://www.europarl.europa.eu/meps/en/%s/_cv.html' % userid
    root = fetch(cvurl, ignore=[500])
    data[u'CV']=[unws(x) for x in root.xpath('//p[@class="details_cv"]/text()')]

    # get assistants also on a separate page :/
    assurl='http://www.europarl.europa.eu/meps/en/%s/_assistants.html' % userid
    root = fetch(assurl, ignore=[500])
    for h3 in root.xpath('//h3[@id="section"]'):
        title=unws(''.join(h3.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'],
                  title.lower().split()[0],
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])
        elif title in ['Accredited assistants (grouping)', 'Local assistants (grouping)',
                       'Service providers', 'Trainees', 'Paying agents (grouping)', 'Paying agents']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'],
                  title.lower(),
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])

    return data
Example #7
def scrape_epagents(table):
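    # Parse one committee players table: mark its agents as responsible or
    # opinion-giving, collect shadow rapporteurs per committee, then
    # normalize each agent row (MEP refs, committee abbreviation, shadows).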
    heading=''.join(table.xpath('.//td[@class="players_committee"]')[0].xpath(".//text()")).strip()
    responsible=None
    if heading in [ "Committee responsible", "Former committee responsible"]:
        responsible=True
    elif heading in ["Committee for opinion", "Former committee for opinion"]:
        responsible=False
    else:
        logger.warn(u"[!] unknown committee heading %s" % heading)

    # handle shadows
    shadowelems=table.xpath('//a[@id="shadowRapporteurHeader"]/../following-sibling::div/p//span[@class="players_rapporter_text"]/a')
    tips=[t.xpath('text()')[0] if len(t.xpath('text()'))>0
          else groupurlmap[t.xpath("a")[0].get('href')] if len(t.xpath("a"))>0
          else groupurlmap[t.xpath("img")[0].get('src')]
          for t in table.xpath('//a[@id="shadowRapporteurHeader"]/../following-sibling::div//span[@class="tiptip"]')]
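    # pair each shadow with its group label; the group shows up as plain
    # text, a linked name, or an image, hence the three-way lookup above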
    shadows={}
    for shadow, group in izip_longest(shadowelems, tips):
        committee=shadow.xpath('./ancestor::td/preceding-sibling::td//acronym/text()')[0]
        if not committee in shadows: shadows[committee]=[]
        if group=='NI': group=u'NI'
        mep={u'name': unicode(shadow.xpath('text()')[0]),
             u'group': unicode(group)}
        tmp=getMEPRef(shadow.xpath('text()')[0])
        if tmp:
           mep[u'mepref']=tmp
        #else:
        #    raise IndexError
        shadows[committee].append(mep)
    # delete the unnecessary shadow elements - so the following regular lst2obj gets what it expects
    for todel in table.xpath('//a[@id="shadowRapporteurHeader"]/..'):
        parent=todel.xpath('..')[0]
        parent.remove(todel.xpath('following-sibling::div')[0])
        parent.remove(todel)

    # handle each row of agents
    agents=[]
    for agent in lst2obj(table,epagents,1):
        agent[u'responsible']=responsible
        agent[u'body']=u'EP'
        if agent.get('rapporteur'):
            meps=[]
            # iterate over a copy, entries may be removed from the original list
            for mep in list(agent['rapporteur']):
                if unws(mep['name']).startswith("The committee decided not to give an opinion"):
                    del agent['rapporteur'][agent['rapporteur'].index(mep)]
                    agent[u'opinion']=None
                    continue
                tmp=getMEPRef(mep['name'])
                if tmp:
                    meps.append({u'mepref': tmp,
                                 u'group': mep['group'],
                                 u'name': mep['name']})
                else:
                    meps.append({u'group': mep['group'],
                                 u'name': mep['name']})
            agent[u'rapporteur']=meps

        abbr=agent['committee'][:4]
        if abbr=='BUDE': abbr='BUDG'
        if not abbr in COMMITTEE_MAP.keys():
            logger.warn(u"[!] uknown committee abbrev %s" % abbr)
            agent[u'committee_full']=agent['committee']
            if len(agent['committee'])>4 and agent['committee'][4]==' ' and abbr.isalpha():
                agent[u'committee']=abbr
        else:
            agent[u'committee_full']=agent['committee'][5:]
            agent[u'committee']=abbr

        if agent.get(u'committee') in shadows.keys():
            agent[u'shadows']=shadows[agent['committee']]

        if not agent in agents: agents.append(agent)
    return agents