Example #1
def getComAms(leg=TERM, update=False):
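    # yields (document url, rapporteurs text) pairs for committee documents (AMCO/RPCD/OPCD), skipping urls already stored in db.ep_ams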
    #urltpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html"
    urltpl="http://www.europarl.europa.eu/committees/en/%s/search-in-documents.html"
    # todo add to searchRPCD, OPCD
    for doctype in ['AMCO', 'RPCD', 'OPCD']:
        postdata="clean=false&leg=%s&docType=%s&miType=text" % (leg, doctype)
        nexttpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?action=%s&tabActif=tabResult#sidesForm"
        for com in (k for k in COMMITTEE_MAP.keys()
                    if len(k)==4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP', 'SURE', 'CRIM', 'CRIS']):
            url=urltpl % (com)
            i=0
            logger.info('%s %s crawling %s' % (datetime.now().isoformat(), doctype, com))
            root=fetch(url, params=postdata)
            prev=[]
            while True:
                logger.info("%s %s" % (datetime.now().isoformat(), url))
                #logger.info(tostring(root))
                tmp={a.get('href'): ' '.join(a.xpath('../../../p[@class="rapporteurs"]//text()')) if doctype != 'AMCO' else None
                     for a in root.xpath('//a[@title="open this PDF in a new window"]')
                     if (len(a.get('href',''))>13)}
                if not tmp or prev==tmp:
                    break
                prev=tmp
                for u, v in sorted(tmp.items()):
                    if db.ep_ams.find_one({'src': u}): continue
                    yield u, v
                if update: break
                i+=1
                url=nexttpl % (com,i)
                root=fetch(url)
Example #2
def getComAgendas():
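    # yields (document url, committee) pairs for committee DRAFT AGENDA documents, paging through the search results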
    #urltpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html"
    urltpl="http://www.europarl.europa.eu/committees/en/%s/search-in-documents.html"
    postdata="docType=AGEN&leg=8&miType=text&tabActif=tabResult#sidesForm"
    #nexttpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?action=%s&tabActif=tabResult#sidesForm"
    nexttpl="http://www.europarl.europa.eu/committees/en/%s/search-in-documents.html?action=%s&tabActif=tabResult#sidesForm"
    for com in (k for k in COMMITTEE_MAP.keys()
                if len(k)==4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP', 'SURE', 'CRIM', 'CRIS']):
        url=urltpl % (com)
        i=0
        agendas=[]
        logger.info('scraping %s' % com)
        root=fetch(url, params=postdata)
        prev=[]
        while True:
            logger.info("%s %s" % (datetime.now().isoformat(), url))
            tmp=[(a.get('href'), unws(a.xpath('text()')[0]))
                 for a in root.xpath('//p[@class="title"]/a')
                 if len(a.get('href',''))>13]
            if not tmp or prev==tmp: break
            prev=tmp
            for u,title in tmp:
                if title.startswith('DRAFT AGENDA'):
                    yield (u,com)
            i+=1
            url=nexttpl % (com,i)
            root=fetch(url)
Example #3
def getIncomming(term=7):
    # returns dict of new incoming meps. this is being checked when
    # crawling, to set more accurate groups and constituency info
    i=0
    page=fetch('http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?type=in')
    last=None
    res={}
    while True:
        meps=[((u'name', unws(x.xpath('text()')[0])),
               (u'meta', {u'url': urljoin(urljoin(BASE_URL,x.get('href')),'get.html')}),
               (u'Constituencies', {u'start': datetime.strptime(unws((x.xpath('../span[@class="meps_date_inout"]/text()') or [''])[0]), "%B %d, %Y"),
                                    u'country': unws((x.xpath('..//span[@class="ep_country"]/text()') or [''])[0])}),
               (u'Groups', {u'start': datetime.strptime(unws((x.xpath('../span[@class="meps_date_inout"]/text()') or [''])[0]), "%B %d, %Y"),
                            u'group': unws((x.xpath('..//span[@class="ep_group"]/text()') or [''])[0]),
                            u'groupid': group_map[unws((x.xpath('..//span[@class="ep_group"]/text()') or [''])[0])],
                            u'role': unws((x.xpath('..//span[@class="ep_group"]/span[@class="ep_title"]/text()') or [''])[0])}),
               )
              for x in page.xpath('//div[@class="ep_elementpeople1"]//a[@class="ep_title"]')]
        if meps==last:
            break
        last=meps
        for mep in meps:
            res[int(mep[1][1]['url'].split('/')[-2])]=dict(mep[1:])
        i+=1
        page=fetch('http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?action=%s&webCountry=&webTermId=%s&name=&politicalGroup=&bodyType=&bodyValue=&type=in&filter=' % (i, term))
    return res
Example #4
def getComAms(leg=7, update=False):
    urltpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html"
    # todo add to searchRPCD, OPCD
    for doctype in ['AMCO', 'RPCD', 'OPCD']:
        postdata="clean=false&leg=%s&docType=%s&miType=text" % (leg, doctype)
        nexttpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?action=%s&tabActif=tabResult#sidesForm"
        for com in (k for k in COMMITTEE_MAP.keys()
                    if len(k)==4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP']):
            url=urltpl % (com)
            i=0
            logger.info('%s %s crawling %s' % (datetime.now().isoformat(), doctype, com))
            root=fetch(url, params=postdata)
            prev=[]
            while True:
                logger.info("%s %s" % (datetime.now().isoformat(), url))
                #logger.info(tostring(root))
                tmp={a.get('href'): ' '.join(a.xpath('../../../p[@class="rapporteurs"]//text()')) if doctype != 'AMCO' else None
                     for a in root.xpath('//a[@title="open this PDF in a new window"]')
                     if (len(a.get('href',''))>13)}
                if not tmp or prev==tmp:
                    break
                prev=tmp
                for u, v in sorted(tmp.items()):
                    if db.ep_ams.find_one({'src': u}): continue
                    yield u, v
                if update: break
                i+=1
                url=nexttpl % (com,i)
                root=fetch(url)
Example #5
def getInOut(term=current_term, dir="in", res={}):
    # returns dict of new incoming meps. this is being checked when
    # crawling, to set more accurate groups and constituency info
    i = 0
    page = fetch("http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?type=%s" % dir, ignore=[500])
    last = None
    while True:
        meps = []
        for x in page.xpath('//div[@class="ep_elementpeople1"]//a[@class="ep_title"]'):
            mepid = int(urljoin(BASE_URL, x.get("href")).split("/")[-2])
            const = {u"country": unws((x.xpath('..//span[@class="ep_country"]/text()') or [""])[0])}
            if dir == "out":
                const["start"], const["end"] = [
                    datetime.strptime(d, "%B %d, %Y")
                    for d in unws((x.xpath('../span[@class="meps_date_inout"]/text()') or [""])[0]).split(" - ")
                ]
            else:
                const["start"] = datetime.strptime(
                    unws((x.xpath('../span[@class="meps_date_inout"]/text()') or [""])[0]), "%B %d, %Y"
                )
            if not mepid in res:
                res[mepid] = [const]
            else:
                res[mepid].append(const)
            meps.append((mepid, const))
        if meps == last:
            break
        last = meps
        i += 1
        page = fetch(
            "http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?action=%s&webCountry=&webTermId=%s&name=&politicalGroup=&bodyType=&bodyValue=&type=%s&filter="
            % (i, term, dir),
            ignore=[500],
        )
    return res
Example #6
def getOutgoing(term=current_term):
    # returns an iter over ex meps from the current term, these are
    # missing from the get_meps result
    global newbies
    i = 0
    page = fetch(
        'http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?type=out',
        ignore=[500])
    last = None
    while True:
        meps = [(
            (u'url', urljoin(BASE_URL, x.get('href'))),
            (u'name', unws(x.xpath('text()')[0])),
            ('dates',
             unws((x.xpath('../span[@class="meps_date_inout"]/text()')
                   or [''])[0])),
            ('country',
             unws((x.xpath('../span[@class="ep_country"]/text()')
                   or [''])[0])),
            ('group',
             unws((x.xpath('..//span[@class="ep_group"]/text()') or [''])[0])),
            ('role',
             unws((x.xpath(
                 '..//span[@class="ep_group"]/span[@class="ep_title"]/text()')
                   or [''])[0])),
        ) for x in page.xpath(
            '//div[@class="ep_elementpeople1"]//a[@class="ep_title"]')]
        if meps == last:
            break
        last = meps
        for mep in meps:
            mep = dict(mep)
            tmp = mep['dates'].split(' - ')
            if tmp:
                mep[u'Constituencies'] = [{
                    u'start':
                    datetime.strptime(tmp[0], "%B %d, %Y"),
                    u'end':
                    datetime.strptime(tmp[1], "%B %d, %Y"),
                    u'country':
                    mep['country']
                }]
                mep[u'Groups'] = [{
                    u'Organization': mep['group'],
                    u'role': mep['role']
                }]
                del mep['dates']
                del mep['country']
                del mep['group']
                del mep['role']
                newbies[int(mep['url'].split('/')[-2])] = mep
                yield (urljoin(urljoin(BASE_URL, mep['url']), 'get.html'), mep)
        i += 1
        page = fetch(
            'http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?action=%s&webCountry=&webTermId=%s&name=&politicalGroup=&bodyType=&bodyValue=&type=out&filter='
            % (i, term),
            ignore=[500])
Example #7
def get_all_dossiers():
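    # yields (dossier url, link text) pairs for all OEIL dossiers, one year at a time, newest first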
    for year in xrange(datetime.date.today().year, 1972, -1):
        tree=fetch('http://www.europarl.europa.eu/oeil/widgets/resultwidget.do?lang=en&noHeader=false&q=objectReferenceN:N-%s/????\(*\)'
                   % (year))
        count=int(tree.xpath('//span[@class="ep_title resultNum pdfHide"]/text()')[0].strip()[len('Results found: '):])
        tree=fetch('http://www.europarl.europa.eu/oeil/widgets/resultwidget.do?lang=en&limit=%s&noHeader=false&q=objectReferenceN:N-%s/????\(*\)'
                   % (count,year))
        links=tree.xpath('//a[@class="reference rssEntry_id rssEntry_title rssEntry_updated"]')
        for link in links:
            yield (urljoin(BASE_URL,link.get('href')),
                   (link.xpath('text()') or [''])[0])
Example #8
def get_meps(term='7'):
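    # pages through the MEP search results for a term, yielding (profile url, name) pairs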
    i=0
    page=fetch("http://www.europarl.europa.eu/meps/en/performsearch.html?webCountry=&webTermId=%s&name=&politicalGroup=&bodyType=ALL&bodyValue=&type=&filter=&search=Show+result" % (term))
    last=None
    while True:
        meps=[(x.get('href'), unws(x.xpath('text()')[0])) for x in page.xpath('//div[@class="ep_elementpeople1"]//a[@class="ep_title"]')]
        if meps==last:
            break
        for url,name in meps:
            yield (urljoin(urljoin(BASE_URL,url),'get.html'), name)
        last=meps
        i+=1
        page=fetch("http://www.europarl.europa.eu/meps/en/performsearch.html?action=%s&webCountry=&webTermId=%s&name=&politicalGroup=&bodyType=ALL&bodyValue=&type=&filter=" % (i, term))
Example #9
def getDates(params):
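    # yields sitting dates as YYYYMMDD strings, paging through the results 10 at a time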
    root = fetch(URL, params=params)
    #print tostring(root)
    prevdates = None
    dates = root.xpath('//span[@class="date"]/text()')
    i = 10
    while dates and dates != prevdates:
        for date in dates:
            yield datetime.strptime(date, "%d-%m-%Y").strftime("%Y%m%d")

        root = fetch(URL, params="%s&startValue=%s" % (params, i))
        prevdates = dates
        i += 10
        dates = root.xpath('//span[@class="date"]/text()')
Example #10
def getDates(params):
    root=fetch(URL, params=params)
    #print tostring(root)
    prevdates=None
    dates=root.xpath('//span[@class="date"]/text()')
    i=10
    while dates and dates!=prevdates:
        for date in dates:
            yield datetime.strptime(date.strip(), "%d-%m-%Y").strftime("%Y%m%d")

        root=fetch(URL, params="%s&startValue=%s" % (params,i))
        prevdates=dates
        i+=10
        dates=root.xpath('//span[@class="date"]/text()')
Example #11
def getmeps(query='current'):
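    # yields MEP ids from the xml lists; 'all' iterates the alphabet, 'unlisted' uses a hardcoded list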
    if query=='unlisted':
        for mep in unlisted:
            yield mep
    elif query=='all':
        for letter in xrange(26):
            tmp=meplists[query]
            a=ord('A')
            root=fetch(tmp%chr(a+letter), ignore=[500])
            for meplm in root.xpath('//id/text()'):
                yield int(meplm)
    else:
        root=fetch(meplists[query], ignore=[500])
        for meplm in root.xpath('//id/text()'):
            yield int(meplm)
Example #12
def get_all_dossiers():
    for year in xrange(datetime.date.today().year, 1972, -1):
        tree = fetch(
            'http://www.europarl.europa.eu/oeil/widgets/resultwidget.do?lang=en&noHeader=false&q=objectReferenceN:N-%s/????\(*\)'
            % (year))
        count = int(
            tree.xpath('//span[@class="resultNumber"]/text()')[0].strip())
        tree = fetch(
            'http://www.europarl.europa.eu/oeil/widgets/resultwidget.do?lang=en&limit=%s&noHeader=false&q=objectReferenceN:N-%s/????\(*\)'
            % (count, year))
        links = tree.xpath(
            '//a[@class="reference rssEntry_id rssEntry_title rssEntry_updated"]'
        )
        for link in links:
            yield (urljoin(BASE_URL, link.get('href')), (link.xpath('text()')
                                                         or [''])[0])
Example #13
def checkUrl(url):
    if not url: return False
    try:
        res = fetch(url)
    except Exception, e:
        #print >>sys.stderr, "[!] checkurl failed in %s\n%s" % (url, e)
        return False
Example #14
def scrape_docs(tree):
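    # collects document dicts from the per-institution "doc_gateway" tables, fetching linked summary texts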
    res = []
    docs = tree.xpath('//table[@id="doc_gateway"]')
    tabs = [x.xpath("preceding-sibling::h2")[0].xpath("text()")[0] for x in docs]
    for inst, table in izip(tabs, docs):
        if inst in instmap.keys():
            for doc in lst2obj(table, docFields):
                if inst != "Other institutions":
                    doc[u"body"] = instmap[inst]
                else:
                    try:
                        doc[u"body"] = otherinst[doc["type"].split(":")[0]]
                    except KeyError:
                        doc[u"body"] = ""
                if (
                    doc["body"] in ["EP", "CSL"]
                    and doc["type"] == "Joint text approved by Conciliation Committee co-chairs"
                ):
                    # skip it twice and hope it's listed in the all documents, so it becomes EP/CSL :)
                    continue
                if doc.get("text"):
                    try:
                        summary = fetch(doc["text"]["url"])
                    except:
                        continue
                    doc[u"text"] = [tostring(x) for x in summary.xpath('//div[@id="summary"]')]
                res.append(doc)
        elif inst != "All documents":
            print "[!] unrecognized tab in documents", inst
    return res
Example #15
def scrape_docs(tree):
    res=[]
    docs=tree.xpath('//table[@id="doc_gateway"]')
    tabs=[x.xpath('preceding-sibling::h2')[0].xpath('text()')[0] for x in docs]
    for inst, table in izip(tabs, docs):
        if inst in instmap.keys():
            for doc in lst2obj(table, docFields):
                if inst != 'Other institutions':
                    doc[u'body']=instmap[inst]
                else:
                    try:
                        doc[u'body']=otherinst[doc['type'].split(':')[0]]
                    except KeyError:
                        doc[u'body']=''
                if doc['body'] in ['EP','CSL'] and doc['type']=='Joint text approved by Conciliation Committee co-chairs':
                    # skip it twice and hope it's listed in the all documents, so it becomes EP/CSL :)
                    continue
                if doc.get('text'):
                    try: summary=fetch(doc['text']['url'])
                    except: continue
                    doc[u'text']=[unicode(tostring(x)) for x in summary.xpath('//div[@id="summary"]')]
                res.append(doc)
        elif inst != 'All':
            logger.warn(u"[!] unrecognized tab in documents %s" % inst)
    return res
Example #16
def checkUrl(url):
    if not url: return False
    try:
        res=fetch(url)
    except Exception, e:
        #print >>sys.stderr, "[!] checkurl failed in %s\n%s" % (url, e)
        return False
Example #17
def scrape_docs(tree):
    res = []
    docs = tree.xpath('//table[@id="doc_gateway"]')
    tabs = [
        x.xpath('preceding-sibling::h2')[0].xpath('text()')[0] for x in docs
    ]
    for inst, table in izip(tabs, docs):
        if inst in instmap.keys():
            for doc in lst2obj(table, docFields):
                if inst != 'Other institutions':
                    doc[u'body'] = instmap[inst]
                else:
                    try:
                        doc[u'body'] = otherinst[doc['type'].split(':')[0]]
                    except KeyError:
                        doc[u'body'] = ''
                if doc['body'] in ['EP', 'CSL'] and doc[
                        'type'] == 'Joint text approved by Conciliation Committee co-chairs':
                    # skip it twice and hope it's listed in the all documents, so it becomes EP/CSL :)
                    continue
                if doc.get('text'):
                    try:
                        summary = fetch(doc['text']['url'])
                    except:
                        continue
                    doc[u'text'] = [
                        tostring(x)
                        for x in summary.xpath('//div[@id="summary"]')
                    ]
                res.append(doc)
        elif inst != 'All documents':
            print "[!] unrecognized tab in documents", inst
    return res
Example #18
def getMEPGender(id):
    try:
        mepraw = fetch("http://www.europarl.europa.eu/meps/fr/%s/get.html" %
                       (id),
                       ignore=[500])
    except Exception, e:
        logger.error("mepgender %s" % e)
        return 'n/a'
Example #19
def checkUrl(url):
    if not url: return False
    if url in seenurls:
        return seenurls[url]
    try:
        res=fetch(url)
    except Exception, e:
        #print >>sys.stderr, "[!] checkurl failed in %s\n%s" % (url, e)
        seenurls[url]=False
Example #20
def checkUrl(url):
    if not url: return False
    if url in seenurls:
        return seenurls[url]
    try:
        res = fetch(url)
    except Exception, e:
        #print >>sys.stderr, "[!] checkurl failed in %s\n%s" % (url, e)
        seenurls[url] = False
Example #21
def getMEPDeclarations(id):
    try:
        dom = fetch(
            "http://www.europarl.europa.eu/meps/en/%s/_declarations.html" %
            (id),
            ignore=[500])
    except Exception, e:
        logger.error("mepdeclaration %s" % e)
        return []
Example #22
def scrape_events(tree):
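    # collects event dicts from the "key_events" table, attaching fetched summary texts where linked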
    res=[]
    for item in lst2obj((tree.xpath('//table[@id="key_events"]') or [None])[0],eventFields):
        if item.get('text'):
            try: summary=fetch(item['text']['url'])
            except: continue
            item['text']=[unicode(tostring(x)) for x in summary.xpath('//div[@id="summary"]')]
        res.append(item)
    return res
Example #23
def crawl(year, term):
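    # yields urls of the roll-call vote XML for every plenary sitting date in the given year and term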
    listurl = 'http://www.europarl.europa.eu/plenary/en/minutes.html'
    PARAMS = 'clean=false&leg=%s&refSittingDateStart=01/01/%s&refSittingDateEnd=31/12/%s&miType=title&miText=Roll-call+votes&tabActif=tabResult'
    voteurl = 'http://www.europarl.europa.eu/RegData/seance_pleniere/proces_verbal/%s/votes_nominaux/xml/P%s_PV%s(RCV)_XC.xml'
    params = PARAMS % (term, year, year)
    root=fetch(listurl, params=params)
    prevdates=None
    dates=root.xpath('//span[@class="date"]/text()')
    i=10
    while dates and dates!=prevdates:
        for date in dates:
            if not date.strip(): continue
            date = datetime.strptime(date.strip(), "%d-%m-%Y")
            yield voteurl % (date.strftime("%Y/%m-%d"), term, date.strftime("(%Y)%m-%d"))

        root=fetch(listurl, params="%s&startValue=%s" % (params,i))
        prevdates=dates
        i+=10
        dates=root.xpath('//span[@class="date"]/text()')
Example #24
def getOutgoing(term=current_term):
    # returns an iter over ex meps from the current term, these are
    # missing from the get_meps result
    i = 0
    page = fetch("http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?type=out", ignore=[500])
    last = None
    while True:
        meps = []
        for x in page.xpath('//div[@class="ep_elementpeople1"]//a[@class="ep_title"]'):
            url = urljoin(urljoin(BASE_URL, x.get("href")), "get.html")
            meps.append(url)
            yield (url, {})
        if meps == last:
            break
        last = meps
        i += 1
        page = fetch(
            "http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?action=%s&webCountry=&webTermId=%s&name=&politicalGroup=&bodyType=&bodyValue=&type=out&filter="
            % (i, term),
            ignore=[500],
        )
Example #25
def scrape_events(tree):
    res = []
    for item in lst2obj((tree.xpath('//table[@id="key_events"]') or [None])[0],
                        eventFields):
        if item.get('text'):
            try:
                summary = fetch(item['text']['url'])
            except:
                continue
            item['text'] = [
                tostring(x) for x in summary.xpath('//div[@id="summary"]')
            ]
        res.append(item)
    return res
Example #26
def getOutgoing(term=7):
    # returns an iter over ex meps from the current term, these are
    # missing from the get_meps result
    i=0
    page=fetch('http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?type=out')
    last=None
    while True:
        meps=[((u'url', urljoin(BASE_URL,x.get('href'))),
               (u'name', unws(x.xpath('text()')[0])),
               ('dates', unws((x.xpath('../span[@class="meps_date_inout"]/text()') or [''])[0])),
               ('country', unws((x.xpath('../span[@class="ep_country"]/text()') or [''])[0])),
               ('group', unws((x.xpath('..//span[@class="ep_group"]/text()') or [''])[0])),
               ('role', unws((x.xpath('..//span[@class="ep_group"]/span[@class="ep_title"]/text()') or [''])[0])),
               )
              for x in page.xpath('//div[@class="ep_elementpeople1"]//a[@class="ep_title"]')]
        if meps==last:
            break
        last=meps
        for mep in meps:
            mep=dict(mep)
            tmp=mep['dates'].split(' - ')
            if tmp:
                mep[u'Constituencies']={u'start': datetime.strptime(tmp[0], "%B %d, %Y"),
                                       u'end': datetime.strptime(tmp[1], "%B %d, %Y"),
                                       u'country': mep['country']}
                mep[u'Groups']={u'start': datetime.strptime(tmp[0], "%B %d, %Y"),
                               u'end': datetime.strptime(tmp[1], "%B %d, %Y"),
                               u'group': mep['group'],
                               u'role': mep['role']}
                del mep['dates']
                del mep['country']
                del mep['group']
                del mep['role']
                yield (urljoin(urljoin(BASE_URL,mep['url']),'get.html'), mep)
        i+=1
        page=fetch('http://www.europarl.europa.eu/meps/en/incoming-outgoing.html?action=%s&webCountry=&webTermId=%s&name=&politicalGroup=&bodyType=&bodyValue=&type=out&filter=' % (i, term))
Example #27
def getComAgendas():
    urltpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?&docType=AGEN&leg=7&miType=text"
    nexttpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?tabActif=tabLast&startValue=%s"
    for com in (k for k in COMMITTEE_MAP.keys() if len(k)==4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP']):
        url=urltpl % (com)
        i=0
        agendas=[]
        logger.info('scraping %s' % com)
        while True:
            logger.info("crawling %s" % (url))
            root=fetch(url)
            tmp=[(a.get('href'), unws(a.xpath('text()')[0]))
                 for a in root.xpath('//p[@class="title"]/a')
                 if len(a.get('href',''))>13]
            if not tmp: break
            for u,_ in tmp:
                yield (u,com)
            i+=10
            url=nexttpl % (com,i)
Example #28
def getComAgendas():
    urltpl = "http://www.europarl.europa.eu/committees/en/%s/documents-search.html?&docType=AGEN&leg=7&miType=text"
    nexttpl = "http://www.europarl.europa.eu/committees/en/%s/documents-search.html?tabActif=tabLast&startValue=%s"
    for com in (k for k in COMMITTEE_MAP.keys()
                if len(k) == 4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP']):
        url = urltpl % (com)
        i = 0
        agendas = []
        logger.info('scraping %s' % com)
        while True:
            logger.info("crawling %s" % (url))
            root = fetch(url)
            tmp = [(a.get('href'), unws(a.xpath('text()')[0]))
                   for a in root.xpath('//p[@class="title"]/a')
                   if len(a.get('href', '')) > 13]
            if not tmp: break
            for u, _ in tmp:
                yield (u, com)
            i += 10
            url = nexttpl % (com, i)
Example #29
def parseMember(userid):
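    # scrapes a MEP profile page into a dict: name, photo, birth, constituencies, groups, contacts, assistants, CV and memberships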
    url='http://www.europarl.europa.eu/meps/en/%s/get.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url)
    data = {u'active': True, 'meta': {u'url': url}} # return {'active': False}
    mepdiv=root.xpath('//div[@class="ep_elementpeople2"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[0]))
    data[u'Photo'] = unicode(urljoin(BASE_URL,mepdiv.xpath('.//span[@class="ep_img"]/img')[0].get('src')),'utf8')
    (d, p) = mepdiv.xpath('.//div[@class="ep_elementtext"]/p/text()')[0].split(',', 1)
    try:
        data[u'Birth'] = { u'date': datetime.strptime(unws(d), "Born on %d %B %Y"),
                           u'place': unws(p) }
    except ValueError:
        logger.warn('[!] failed to scrape birth data %s' % url)
        logger.warn(traceback.format_exc())
    const={u'country': unws(mepdiv.xpath('.//span[@class="ep_country"]/text()')[0])}
    data[u'Constituencies']=[const]
    try:
        const[u'party']=unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[1])
    except IndexError:
        data[u'active']=False
    else:
        group=unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[0])
        data[u'Groups'] = [{ u'role': unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[1]),
                             u'group': group,
                             u'groupid': group_map[group]}]
    cdiv=root.xpath('//div[@class="ep_elementcontact"]')
    if len(cdiv):
        addif(data,u'RSS',[unicode(urljoin(BASE_URL,x.get('href')),'utf8') for x in cdiv[0].xpath('.//li[@class="ep_rss"]//a')])
        addif(data,u'Homepage',[unicode(x.get('href'),'utf8') for x in cdiv[0].xpath('.//li[@class="ep_website"]//a')])
        addif(data,u'Mail',[decodemail(unws(x)) for x in cdiv[0].xpath('.//li[@class="ep_email"]//text()') if len(unws(x))])
    for span in root.xpath('//div[@id="contextzone"]//span[@class="ep_title"]'):
        title=unws(''.join(span.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            addif(data,title,[unws(x) for x in span.xpath('../../..//li/div/text()')])
    addif(data,u'Addresses',getAddress(root))
    for div in root.xpath('//div[@class="ep_content"]'):
        key=unws(u''.join(div.xpath('.//span[@class="ep_title"]/text()')))
        if not len(key):
            continue
        elif key.lower()=='curriculum vitae':
            data[u'CV'] = [unws(x) for x in div.xpath('.//div[@class="ep_elementtext"]//li/div/text()')]
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President']:
            for span in div.xpath('.//span[@class="commission_label"]'):
                item={u'role': key,
                      u'abbr': unws(''.join(span.xpath('text()'))),
                      u'Organization': unws(span.tail)}
                for start, field in orgmaps:
                    if item['Organization'].startswith(start):
                        if not field in data: data[field]=[]
                        if field=='Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id']=COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        else:
            logger.error('[!] unknown field %s' % key)
    return data
Example #30
def parseMember(userid):
    url = "http://www.europarl.europa.eu/meps/en/%s/get.html" % userid
    logger.info("scraping %s" % url)
    root = fetch(url, ignore=[500])
    data = {u"active": False, "meta": {u"url": url}}  # return {'active': False}
    mepdiv = root.xpath('//div[@class="ep_elementpeople2"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u"Name"] = mangleName(unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[0]))
    data[u"Photo"] = unicode(urljoin(BASE_URL, mepdiv.xpath('.//span[@class="ep_img"]/img')[0].get("src")), "utf8")
    borntxt = mepdiv.xpath('.//div[@class="ep_elementtext"]/p/text()')
    if len(borntxt) > 0:
        (d, p) = borntxt[0].split(",", 1)
        try:
            data[u"Birth"] = {u"date": datetime.strptime(unws(d), u"Born on %d %B %Y"), u"place": unws(p)}
        except ValueError:
            logger.warn("[!] failed to scrape birth data %s" % url)
            logger.warn(traceback.format_exc())
    else:
        logger.warn("[!] no birth data %s" % url)
    const = {u"country": unws(mepdiv.xpath('.//span[@class="ep_country"]/text()')[0]), u"start": datetime(2009, 7, 14)}
    data[u"Constituencies"] = [const]
    try:
        data[u"party"] = unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[1])
    except IndexError:
        pass
    else:
        group = unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[0])
        try:
            role = unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[1])
        except IndexError:
            role = u"Member"
        data[u"Groups"] = [{u"role": role, u"Organization": group, u"groupid": group_map[group]}]
    cdiv = root.xpath('//div[@class="ep_elementcontact"]')
    if len(cdiv):
        addif(
            data,
            u"RSS",
            [unicode(urljoin(BASE_URL, x.get("href")), "utf8") for x in cdiv[0].xpath('.//li[@class="ep_rss"]//a')],
        )
        addif(
            data, u"Homepage", [unicode(x.get("href"), "utf8") for x in cdiv[0].xpath('.//li[@class="ep_website"]//a')]
        )
        addif(
            data,
            u"Mail",
            [decodemail(unws(x)) for x in cdiv[0].xpath('.//li[@class="ep_email"]//text()') if len(unws(x))],
        )
    for span in root.xpath('//div[@id="contextzone"]//span[@class="ep_title"]'):
        title = unws("".join(span.xpath(".//text()")))
        if title in ["Accredited assistants", "Local assistants"]:
            if not "assistants" in data:
                data["assistants"] = {}
            addif(
                data["assistants"], title.lower().split()[0], [unws(x) for x in span.xpath("../../..//li/div/text()")]
            )
    addif(data, u"Addresses", getAddress(root))
    for div in root.xpath('//div[@class="ep_content"]'):
        key = unws(u"".join(div.xpath('.//span[@class="ep_title"]/text()')))
        if not len(key):
            continue
        elif key.lower() == "curriculum vitae":
            data[u"CV"] = [unws(x) for x in div.xpath('.//div[@class="ep_elementtext"]//li/div/text()')]
        elif key in ["Member", "Substitute", "Chair", "Vice-Chair", "Co-President", "President", "Vice-President"]:
            for span in div.xpath('.//span[@class="commission_label"]'):
                item = {u"role": key, u"abbr": unws("".join(span.xpath(".//text()"))), u"Organization": unws(span.tail)}
                for start, field in orgmaps:
                    if item["abbr"] in COMMITTEE_MAP or item["Organization"].startswith(start):
                        if not field in data:
                            data[field] = []
                        if field == "Committees" and item["Organization"] in COMMITTEE_MAP:
                            item[u"committee_id"] = COMMITTEE_MAP[item["Organization"]]
                        data[field].append(item)
                        break
        else:
            logger.error("[!] unknown field %s" % key)
    return data
Example #31
def scrape(url):
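    # scrapes a full OEIL procedure page: actors, committees, documents, events, forecasts, IPEX dates and the final act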
    try:
        logger.info('scrape '+url)
        tree=fetch(url)
        agents,committees=scrape_actors(tree)
        forecasts=lst2obj((tree.xpath('//table[@id="forecast"]') or [None])[0],forecastFields)
        events=scrape_events(tree)
        procedure=scrape_basic(tree)
        if not procedure: return
        ipext=[]
        for ipexd in IPEXMAP.get(procedure['reference'], {}).get('Dates',[]):
            skip=False
            for event in forecasts+events:
                if event['type'] in ipexevents.get(ipexd['type'],{}).get('oeil',[]) and event['date']==ipexd['date']:
                    skip=True
                    break
            if skip: continue
            ipext.append(ipexd)
        allevents=agents+scrape_docs(tree)+events+forecasts+ipext
        other=[x for x in allevents if not x.get('date')]
        allevents=sorted([x for x in allevents if x.get('date')],key=itemgetter('date'))
        allevents=merge_events(allevents,committees, agents)
        res={u'meta': {'source': url,
                       'timestamp': datetime.datetime.utcnow() },
             u'procedure': procedure,
             u'links': form2obj((tree.xpath('//table[@id="external_links"]') or [None])[0]),
             u'committees': committees,
             u'activities': sorted(allevents, key=itemgetter('date')),
             u'other': other,
             }
        tmp=url.split('id=')
        if len(tmp)>1:
            res['meta']['id']=int(tmp[1])
        # check for "final act"
        finalas=tree.xpath('//div[@id="final_act"]//a')
        final={}
        for link in finalas:
            if link.get('class')=='sumbutton':
                try: summary=fetch("http://www.europarl.europa.eu%s" % link.get('href'))
                except: continue
                final['text']=[unicode(tostring(x)) for x in summary.xpath('//div[@id="summary"]')]
            else:
                if not 'docs' in final: final['docs']=[]
                final['docs'].append({'title': link.xpath('text()')[0].strip(),
                                               'url': link.get('href')})
        if final and final.get('docs'):
            res[u'procedure'][u'final']=final.get('docs',[{}])[0]
            for item in res['activities']:
                if item.get('type')==u'Final act published in Official Journal':
                    if final.get('text'):
                        item[u'text']=final['text']
                    if len(final.get('docs'))>1:
                        if not 'docs' in item:
                            item[u'docs']=final['docs']
                        else:
                            item[u'docs'].extend(final['docs'])
                    break
        return res
    except:
        logger.error("%s\n%s" % (url,traceback.format_exc()))
        return
Example #32
def getMEPDeclarations(id):
    try:
        dom = fetch("http://www.europarl.europa.eu/meps/en/%s/_declarations.html" % (id), ignore=[500])
    except Exception, e:
        logger.error("mepdeclaration %s" % e)
        return []
Example #33
def getMEPGender(id):
    try:
        mepraw=fetch("http://www.europarl.europa.eu/meps/fr/%s/_home.html" % (id), ignore=[500])
    except Exception, e:
        logger.error("mepgender %s" % e)
        return 'n/a'
Example #34
def parseMember(userid):
    url='http://www.europarl.europa.eu/meps/en/%s/_history.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url, ignore=[500])

    data = {
        u'active': False,
        u'Photo': unicode(urljoin(BASE_URL,"/mepphoto/%s.jpg" % userid)),
        u'meta': {u'url': url}
        }

    mepdiv=root.xpath('//div[@class="zone_info_mep_transparent_mep_details"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(unws(' '.join(mepdiv.xpath('.//li[@class="mep_name"]//text()'))))

    borntxt=mepdiv.xpath('.//span[@class="more_info"]/text()')
    if len(borntxt)>0:
        if unws(borntxt[-1]).startswith('Date of death:'):
            try:
                data[u'Death'] = datetime.strptime(unws(borntxt[-1]), u"Date of death: %d %B %Y")
            except ValueError:
                logger.warn('[!] failed to scrape birth data %s' % url)
                logger.warn(traceback.format_exc())
            tmp = borntxt[-2].split(',', 1)
        else:
            tmp = borntxt[-1].split(',', 1)
        if len(tmp)==2:
            (d, p) = tmp
        else:
            d,p = tmp[0], None
        try:
            data[u'Birth'] = { u'date': datetime.strptime(unws(d), u"Date of birth: %d %B %Y")}
        except ValueError:
            logger.warn(traceback.format_exc())
        finally:
            if p:
                if 'Birth' in data:
                    data[u'Birth'][u'place'] = unws(p)
                else:
                    data[u'Birth'] = unws(p)
    else:
        logger.warn('[!] no birth data %s' % url)

    # scrape stuff from right column
    addif(data,u'RSS',[unicode(urljoin(BASE_URL,x.get('href')),'utf8')
                       for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_rss"]')])
    addif(data,u'Homepage',[x.get('href')
                            for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_website"]')])
    addif(data,u'Twitter',[x.get('href')
                           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_twitt"]')])
    addif(data,u'Facebook',[x.get('href')
                           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_fb"]')])
    addif(data,u'Mail',[x.get('href')[7:].replace('[dot]','.').replace('[at]','@')[::-1]
                        for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_email"]')])
    # contact information
    for span in root.xpath('//div[@id="content_right"]//h3'):
        title=unws(''.join(span.xpath('.//text()')))
        if title == "Contacts":
            addif(data,u'Addresses',getAddress(span))

    # scrape main content
    for section in root.xpath('//div[@id="content_left"]/div[@class="boxcontent nobackground"]/h4'):
        key=unws(''.join(section.xpath('.//text()')))
        if key=="National parties":
            # constituencies
            key='Constituencies'
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, party = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if not key in data: data[key]=[]
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                cstart = party.rfind(' (')
                if party[cstart+2:-1] in SEIRTNUOC:
                    country = party[cstart+2:-1]
                    party = party[:cstart]
                else:
                    logger.warn('unknown country: %s' % party[cstart+2:-1])
                    country='unknown'
                #print etree.tostring(constlm, pretty_print=True)
                data[key].append({
                    u'party':     party,
                    u'country':   country,
                    u'start':     datetime.strptime(unws(start), u"%d.%m.%Y"),
                    u'end':       datetime.strptime(unws(end), u"%d.%m.%Y"),
                    })
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President', 'Observer', 'Quaestor']:
            # memberships in various committees, delegations and EP mgt
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, org = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                item={u'role': key,
                      u'abbr': COMMITTEE_MAP.get(org),
                      u'Organization': org,
                      u'start':     datetime.strptime(unws(start), u"%d.%m.%Y"),
                      u'end':       datetime.strptime(unws(end), u"%d.%m.%Y"),
                      }
                for start, field in orgmaps:
                    if item['abbr'] in COMMITTEE_MAP or item['Organization'].startswith(start):
                        if not field in data: data[field]=[]
                        if field=='Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id']=COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        elif key == u'Political groups':
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                interval, org = line.split(' : ',1)
                tmp = org.split(u' - ')
                if len(tmp)>1:
                    org = ' - '.join(tmp[:-1])
                    role = tmp[-1]
                elif org.endswith(' -'):
                    org=org[:-2]
                    role=''
                else:
                    logger.error('[!] political group line %s' % line)
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                if not u'Groups' in data: data[u'Groups']=[]
                data[u'Groups'].append(
                    {u'role':         role,
                     u'Organization': org,
                     u'country':      COUNTRIES.get(unws(constlm.get('class')).upper(), 'unknown country: %s' % unws(constlm.get('class'))),
                     u'groupid':      group_map[org],
                     u'start':        datetime.strptime(unws(start), u"%d.%m.%Y"),
                     u'end':          datetime.strptime(unws(end), u"%d.%m.%Y"),
                     })
        else:
            logger.error('[!] unknown field %s' % key)

    # sort all lists in descending order
    for fld in ['Constituencies', 'Groups', 'Committees', 'Delegations', 'Staff']:
        if not fld in data: continue
        data[fld]=sorted(data[fld],
                         key=lambda x: x.get('end',x['start']),
                         reverse=True)

    # get CV - page (is on separate http path :/)
    cvurl='http://www.europarl.europa.eu/meps/en/%s/_cv.html' % userid
    root = fetch(cvurl, ignore=[500])
    data[u'CV']=[unws(x) for x in root.xpath('//p[@class="details_cv"]/text()')]

    # get assistants also on a separate page :/
    assurl='http://www.europarl.europa.eu/meps/en/%s/_assistants.html' % userid
    root = fetch(assurl, ignore=[500])
    for h3 in root.xpath('//h3[@id="section"]'):
        title=unws(''.join(h3.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'],
                  title.lower().split()[0],
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])
        elif title in ['Accredited assistants (grouping)', 'Local assistants (grouping)',
                       'Service providers', ' Trainees', 'Paying agents (grouping)', 'Paying agents']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'],
                  title.lower(),
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])

    return data
Example #35
def scrape(url, comid):
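    # parses a committee DRAFT AGENDA page into a list of agenda item dicts (title, schedule, dossiers, deadlines, actors)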
    root=fetch(url)
    lines=[x for x in root.xpath('//td[@class="contents"]/div/*') if unws(' '.join(x.xpath('.//text()')))]
    if not len(lines): return
    if not unws(' '.join(lines[2].xpath('.//text()')))=='DRAFT AGENDA':
        logger.error("NOT DRAFT AGENDA %s" % unws(' '.join(lines[2].xpath('.//text()'))))
    agenda={u'committee': comid,
            u'committee_full': unws(' '.join(lines[0].xpath('.//text()'))),
            u'src': url,
        }
    i=1
    if unws(' '.join(lines[3].xpath('.//text()')))=="INTERPARLIAMENTARY COMMITTEE MEETING":
        logger.warn("skipping interparl com meet")
        return
    if unws(' '.join(lines[6].xpath('.//text()'))).startswith('Room'):
            agenda.update({u'docid': unws(' '.join(lines[1].xpath('.//text()'))),
                           u'type': unws(' '.join(lines[3].xpath('.//text()'))),
                           u'time': toTime(unws(' '.join(lines[4].xpath('.//text()')))),
                           u'city': unws(' '.join(lines[5].xpath('.//text()'))),
                           u'room': unws(' '.join(lines[6].xpath('.//text()')))[6:],
                           })
            i=7
    itemcnt=0
    item={}
    schedule=None
    res=[]
    while i < len(lines):
        line=lines[i]
        i+=1
        txt=unws(' '.join(line.xpath('.//text()')))
        if txt in ['* * *', '***']:
            continue # skip end of schedule block

        # 20 December 2011, 16.00 – 16.30
        tmp=toTime(txt)
        if tmp:
            schedule=tmp
            if i<len(lines) and unws(' '.join(lines[i].xpath('.//text()'))) == 'In camera':
                schedule[u'incamera']=True
                i+=1
            continue

        if line.tag=='div':
            item[u'actors']=getactors(line)
            continue
        firsttoken=txt.split()[0]
        # 6. Alternative dispute resolution for consumer disputes and
        #    amending Regulation (EC) No 2006/2004 and Directive
        #    2009/22/EC (Directive on consumer ADR)
        if firsttoken[-1]=='.' and firsttoken[:-1].isdigit() and itemcnt+1==int(firsttoken[:-1]):
            if item: res.append(item)
            itemcnt+=1
            item=copy.deepcopy(agenda)
            item.update({u'title': ' '.join(txt.split()[1:]),
                         u'seq_no': itemcnt,})
            if schedule:
                item.update(schedule)
            continue
        # trailing list of "details"
        # · Presentation by the Commission of the proposal & Impact Assessment
        # · Exchange of views
        if firsttoken==u"·":
            if not 'list' in item: item[u'list']=[]
            tmp=' '.join(txt.split()[1:])
            if tmp.startswith('Deadline for tabling amendments:'):
                try:
                    item[u'tabling_deadline']=datetime.strptime(tmp.split(':')[1].strip(),"%d %B %Y, %H.%M")
                except ValueError:
                    try:
                        item[u'tabling_deadline']=datetime.strptime(tmp.split(':')[1].strip(),"%d.%m.%Y at %H.%M")
                    except:
                        logger.warn('[$] unknown tabling deadline format %s' % unws(tmp))
            item[u'list'].append(tmp)
            continue
        # committee dossier
        # IMCO/7/08130
        if txt.startswith("%s/7/" % comid) and len(txt)==12:
            item[u'comdossier']=txt
            continue
        # ***I    2011/0373(COD)       COM(2011)0793 – C7-0454/2011
        tmp=getdocs(txt)
        if tmp:
            item.update(tmp)
            continue
        # fall-through line
        logger.debug("(falltrough) %s %s" % (line.tag, txt.encode('utf8')))
    if item: res.append(item)
    return res
Example #36
def scrape(url):
    try:
        logger.info('scrape ' + url)
        tree = fetch(url)
        agents, committees = scrape_actors(tree)
        forecasts = lst2obj((tree.xpath('//table[@id="forecast"]')
                             or [None])[0], forecastFields)
        events = scrape_events(tree)
        procedure = scrape_basic(tree)
        ipext = []
        for ipexd in (IPEXMAP[procedure['reference']] or {}).get('Dates', []):
            skip = False
            for event in forecasts + events:
                if event['type'] == ipexevents.get(ipexd['type'], {}).get(
                        'oeil', 'asdf') and event['date'] == ipexd['date']:
                    skip = True
                    break
            if skip: continue
            ipext.append(ipexd)
        allevents = agents + scrape_docs(tree) + events + forecasts + ipext
        other = [x for x in allevents if not x.get('date')]
        allevents = sorted([x for x in allevents if x.get('date')],
                           key=itemgetter('date'))
        allevents = merge_events(allevents, committees)
        res = {
            u'meta': {
                'source': url,
                'id': int(url.split('id=')[1]),
                'timestamp': datetime.datetime.utcnow()
            },
            u'procedure':
            procedure,
            u'links':
            form2obj((tree.xpath('//table[@id="external_links"]')
                      or [None])[0]),
            u'committees':
            committees,
            u'activities':
            sorted(allevents, key=itemgetter('date')),
            u'other':
            other,
        }
        # check for "final act"
        finalas = tree.xpath('//div[@id="final_act"]//a')
        final = {}
        for link in finalas:
            if link.get('class') == 'sumbutton':
                try:
                    summary = fetch("http://www.europarl.europa.eu%s" %
                                    link.get('href'))
                except:
                    continue
                final['text'] = [
                    tostring(x) for x in summary.xpath('//div[@id="summary"]')
                ]
            else:
                if not 'docs' in final: final['docs'] = []
                final['docs'].append({
                    'title': link.xpath('text()')[0].strip(),
                    'url': link.get('href')
                })
        if final and final.get('docs'):
            res[u'procedure'][u'final'] = final.get('docs', [{}])[0]
            for item in res['activities']:
                if item.get(
                        'type') == u'Final act published in Official Journal':
                    if final.get('text'):
                        item[u'text'] = final['text']
                    if len(final.get('docs')) > 1:
                        if not 'docs' in item:
                            item[u'docs'] = final['docs']
                        else:
                            item[u'docs'].extend(final['docs'])
                    break
        return res
    except:
        logger.error("%s\n%s" % (url, traceback.format_exc()))
        return
Example #37
def scrape(url):
    try:
        logger.info("scrape " + url)
        tree = fetch(url)
        agents, committees = scrape_actors(tree)
        forecasts = lst2obj((tree.xpath('//table[@id="forecast"]') or [None])[0], forecastFields)
        events = scrape_events(tree)
        procedure = scrape_basic(tree)
        ipext = []
        for ipexd in (IPEXMAP[procedure["reference"]] or {}).get("Dates", []):
            skip = False
            for event in forecasts + events:
                if (
                    event["type"] == ipexevents.get(ipexd["type"], {}).get("oeil", "asdf")
                    and event["date"] == ipexd["date"]
                ):
                    skip = True
                    break
            if skip:
                continue
            ipext.append(ipexd)
        allevents = agents + scrape_docs(tree) + events + forecasts + ipext
        other = [x for x in allevents if not x.get("date")]
        allevents = sorted([x for x in allevents if x.get("date")], key=itemgetter("date"))
        allevents = merge_events(allevents, committees)
        res = {
            u"meta": {"source": url, "id": int(url.split("id=")[1]), "timestamp": datetime.datetime.utcnow()},
            u"procedure": procedure,
            u"links": form2obj((tree.xpath('//table[@id="external_links"]') or [None])[0]),
            u"committees": committees,
            u"activities": sorted(allevents, key=itemgetter("date")),
            u"other": other,
        }
        # check for "final act"
        finalas = tree.xpath('//div[@id="final_act"]//a')
        final = {}
        for link in finalas:
            if link.get("class") == "sumbutton":
                try:
                    summary = fetch("http://www.europarl.europa.eu%s" % link.get("href"))
                except:
                    continue
                final["text"] = [tostring(x) for x in summary.xpath('//div[@id="summary"]')]
            else:
                if not "docs" in final:
                    final["docs"] = []
                final["docs"].append({"title": link.xpath("text()")[0].strip(), "url": link.get("href")})
        if final and final.get("docs"):
            res[u"procedure"][u"final"] = final.get("docs", [{}])[0]
            for item in res["activities"]:
                if item.get("type") == u"Final act published in Official Journal":
                    if final.get("text"):
                        item[u"text"] = final["text"]
                    if len(final.get("docs")) > 1:
                        if not "docs" in item:
                            item[u"docs"] = final["docs"]
                        else:
                            item[u"docs"].extend(final["docs"])
                    break
        return res
    except:
        logger.error("%s\n%s" % (url, traceback.format_exc()))
        return