Example #1
def save(data, stats):
    res = db.ep_meps2.find_one({"UserID": data["UserID"]}) or {}
    d = diff(
        dict([(k, v) for k, v in res.items() if not k in ["_id", "meta", "changes"]]),
        dict([(k, v) for k, v in data.items() if not k in ["_id", "meta", "changes"]]),
    )
    if d:
        now = datetime.utcnow().replace(microsecond=0)
        if not res:
            logger.info(("adding %s" % (data["Name"]["full"])).encode("utf8"))
            data["meta"]["created"] = now
            if stats:
                stats[0] += 1
        else:
            logger.info(("updating %s" % (data["Name"]["full"])).encode("utf8"))
            logger.warn(jdump(d))
            data["meta"]["updated"] = now
            if stats:
                stats[1] += 1
            data["_id"] = res["_id"]
        data["changes"] = res.get("changes", {})
        data["changes"][now.isoformat()] = d
        db.ep_meps2.save(data)
    if stats:
        return stats
    else:
        return data
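These save() helpers all depend on a diff(old, new) function that is not shown in this listing. A minimal sketch of the assumed contract (the real helper is recursive and records change paths; this flat version only illustrates that it must return a truthy structure when something changed):

def diff(old, new):
    # minimal, flat sketch: list every top-level key whose value differs
    changes = []
    for k in set(old) | set(new):
        if old.get(k) != new.get(k):
            changes.append({'key': k, 'old': old.get(k), 'new': new.get(k)})
    return changes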
Example #2
def getComAms(leg=7, update=False):
    urltpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html"
    # todo add to searchRPCD, OPCD
    for doctype in ['AMCO', 'RPCD', 'OPCD']:
        postdata="clean=false&leg=%s&docType=%s&miType=text" % (leg, doctype)
        nexttpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?action=%s&tabActif=tabResult#sidesForm"
        for com in (k for k in COMMITTEE_MAP.keys()
                    if len(k)==4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP']):
            url=urltpl % (com)
            i=0
            logger.info('%s %s crawling %s' % (datetime.now().isoformat(), doctype, com))
            root=fetch(url, params=postdata)
            prev=[]
            while True:
                logger.info("%s %s" % (datetime.now().isoformat(), url))
                #logger.info(tostring(root))
                tmp={a.get('href'): ' '.join(a.xpath('../../../p[@class="rapporteurs"]//text()')) if doctype != 'AMCO' else None
                     for a in root.xpath('//a[@title="open this PDF in a new window"]')
                     if (len(a.get('href',''))>13)}
                if not tmp or prev==tmp:
                    break
                prev=tmp
                for u, v in sorted(tmp.items()):
                    if db.ep_ams.find_one({'src': u}): continue
                    yield u, v
                if update: break
                i+=1
                url=nexttpl % (com,i)
                root=fetch(url)
Example #3
def getComAgendas():
    #urltpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html"
    urltpl="http://www.europarl.europa.eu/committees/en/%s/search-in-documents.html"
    postdata="docType=AGEN&leg=8&miType=text&tabActif=tabResult#sidesForm"
    #nexttpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?action=%s&tabActif=tabResult#sidesForm"
    nexttpl="http://www.europarl.europa.eu/committees/en/%s/search-in-documents.html?action=%s&tabActif=tabResult#sidesForm"
    for com in (k for k in COMMITTEE_MAP.keys()
                if len(k)==4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP', 'SURE', 'CRIM', 'CRIS']):
        url=urltpl % (com)
        i=0
        agendas=[]
        logger.info('scraping %s' % com)
        root=fetch(url, params=postdata)
        prev=[]
        while True:
            logger.info("%s %s" % (datetime.now().isoformat(), url))
            tmp=[(a.get('href'), unws(a.xpath('text()')[0]))
                 for a in root.xpath('//p[@class="title"]/a')
                 if len(a.get('href',''))>13]
            if not tmp or prev==tmp: break
            prev=tmp
            for u,title in tmp:
                if title.startswith('DRAFT AGENDA'):
                    yield (u,com)
            i+=1
            url=nexttpl % (com,i)
            root=fetch(url)
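Most of these scrapers clean extracted strings with unws(), which is also not part of the listing. A minimal sketch, assuming it only collapses runs of whitespace:

def unws(txt):
    # collapse newlines, tabs and repeated spaces into single spaces
    return u' '.join(txt.split())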
Example #4
def save(data, stats):
    res=db.ep_meps2.find_one({ 'UserID' : data['UserID'] }) or {}
    if 'Gender' not in data and 'Gender' in res: data['Gender']=res['Gender']
    d=diff(dict([(k,v) for k,v in res.items() if not k in ['_id', 'meta', 'changes', 'activities',]]),
           dict([(k,v) for k,v in data.items() if not k in ['_id', 'meta', 'changes', 'activities',]]))
    if d:
        now=datetime.utcnow().replace(microsecond=0)
        if not res:
            logger.info(('adding %s' % (data['Name']['full'])).encode('utf8'))
            data['meta']['created']=now
            if stats: stats[0]+=1
        else:
            logger.info(('updating %s' % (data['Name']['full'])).encode('utf8'))
            logger.warn(jdump(d))
            data['meta']['updated']=now
            if stats: stats[1]+=1
            data['_id']=res['_id']
        data['changes']=res.get('changes',{})
        data['changes'][now.isoformat()]=d
        db.ep_meps2.save(data)
    del res
    if stats: 
        del data
        return stats
    else: return data
Example #5
def scrape(celexid, path):
    logger.info("scraping %s%s:NOT" % (EURLEXURL,celexid))
    path.reverse()
    (code,lang)=celexid.split(":")[1:3]
    st=6
    if len(code)>6:
        if code[6].isalpha(): st=7
        eurlex={'id': {u'celexid': celexid,
                       u'sector': code[0],
                       u'year': code[1:5],
                       u'doctype': code[5:st],
                       u'refno': code[st:],
                       u'lang': lang,
                       u'chapter': path,
                       }}
    else:
        eurlex={'id': {u'celexid': celexid,
                       u'sector': code[0],
                       u'year': code[1:5],
                       u'doctype': code[5:6],
                       u'lang': lang,
                       u'chapter': path,
                       }}

    try:
        eurlex['id'][u'typeDesc']= CELEXCODES[code[0]]['Document Types'][code[5:st]] if code[5:st] != 'C' else CELEXCODES[code[0]]['Sector']
    except:
        eurlex['id'][u'typeDesc']= u"Unknown"
        logger.warn("[!] unknown typedesc %s" % celexid)
    eurlex['meta']={u'src': "%s%s:NOT" % (EURLEXURL,celexid)}

    root = fetch("%s%s:NOT" % (EURLEXURL,celexid))
    if len(root.xpath('//h1[text()="No documents matching criteria."]'))>0:
        logger.warn('[!] nothing to scrape here: %s', "%s%s:NOT" % (EURLEXURL,celexid))
        return
    eurlex[u'title'] = root.xpath('//h2[text()="Title and reference"]/following-sibling::p/text()')[0]
    # dates
    dates=root.xpath('//h2[text()="Dates"]/following-sibling::ul/text()')
    for y in dates:
        if not unws(y): continue
        title, rest=unws(y).split(": ",1)
        item={u'type': title}
        date=rest[:10]
        tail=rest[10:]
        if tail.startswith('; '):
            tail=tail[2:]
        if date=='99/99/9999': item[u'date']= datetime(9999,12,31)
        elif date=='00/00/0000': item[u'date']= datetime(1,1,1)
        elif date=='//': continue
        else:
            try: item[u'date']= datetime.strptime(date, u"%d/%m/%Y")
            except ValueError:
                try: item[u'date']= datetime.strptime(date, u"%m/%d/%Y")
                except: pass
        if len(tail):
            item['note']=tail
        try:
            eurlex['dates'].append(item)
        except:
            eurlex['dates']=[item]
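As a worked illustration of the id parsing above, a hypothetical celexid such as u'CELEX:32012R1215:EN' would be decomposed like this (the sector/doctype readings are assumptions based on the CELEX numbering scheme):

celexid = u'CELEX:32012R1215:EN'          # hypothetical example id
code, lang = celexid.split(':')[1:3]      # '32012R1215', 'EN'
st = 7 if len(code) > 6 and code[6].isalpha() else 6   # here: 6
parsed = {u'sector': code[0],             # '3'    (legislation)
          u'year': code[1:5],             # '2012'
          u'doctype': code[5:st],         # 'R'    (regulation)
          u'refno': code[st:],            # '1215'
          u'lang': lang}                  # 'EN'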
Example #6
def crawl_allseq(saver=jdump):
    seen=[]
    stats=[0,0]
    for term in xrange(1,8):
        for url, name in get_meps(term=term):
            if not url in seen:
                saver(scrape(url),stats)
                seen.append(url)
    logger.info('end of crawl')
Example #7
def get_all(term=""):
    for term in xrange(1, current_term + 1):
        for url, name in get_meps(term=term):
            mep = db.ep_meps2.find_one({"Name.full": name})
            if not mep:
                yield (urljoin(urljoin(BASE_URL, url), "get.html"), {})
            else:
                mep["terms"] = list(set(mep.get("terms", []) + [term]))
                db.ep_meps2.save(mep)
                logger.info("updated %s" % name)
Example #8
def seqcrawler(saver=jdump):
    stats=[0,0]
    for u, com in getComAgendas():
        try:
            saver(scrape(u,com), stats)
        except:
            # ignore failed scrapes
            logger.warn("[!] failed to scrape: %s" % u)
            logger.warn(traceback.format_exc())
    logger.info("[o] added/updated: %s/%s" % (stats[0],stats[1]))
Example #9
def get_all(term=''):
    for term in xrange(1,current_term+1):
        for url, name in get_meps(term=term):
            mep=db.ep_meps2.find_one({'Name.full': name})
            if not mep:
                yield (urljoin(urljoin(BASE_URL,url),'get.html'), {})
            else:
                mep['terms']=list(set(mep.get('terms',[]) + [term]))
                db.ep_meps2.save(mep)
                logger.info('updated %s' % name)
Example #10
def crawl_all(saver=jdump,threads=4):
    m=Multiplexer(scrape,saver,threads=threads)
    m.start()
    seen=[]
    for term in xrange(1,8):
        for url, name in get_meps(term=term):
            if not url in seen:
                m.addjob(url)
                seen.append(url)
    m.finish()
    logger.info('end of crawl')
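The Multiplexer used by crawl_all (and by the crawler variants further down) is not included in these examples. A minimal sketch with the same start/addjob/finish surface, assuming it only has to run the scraper in worker threads and hand each result to the saver:

import threading
import Queue  # Python 2; use "queue" on Python 3

class Multiplexer(object):
    def __init__(self, worker, saver, threads=4):
        self.worker = worker      # e.g. scrape
        self.saver = saver        # e.g. jdump or save
        self.q = Queue.Queue()
        self.threads = threads

    def start(self):
        for _ in range(self.threads):
            t = threading.Thread(target=self._run)
            t.daemon = True
            t.start()

    def _run(self):
        while True:
            args = self.q.get()
            try:
                self.saver(self.worker(*args))
            finally:
                self.q.task_done()

    def addjob(self, *args):
        self.q.put(args)

    def finish(self):
        # block until every queued job has been processed
        self.q.join()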
Example #11
def fetch(url, **kwargs):
    timer=8
    while True:
        root=_fetch(url, **kwargs)
        fail=root.xpath('//h1[text()="The system could not serve your request in time because of a temporary problem; please try again shortly."]')
        if not len(fail):
            timer=8
            break
        logger.info('[i] getting "pls wait" msg, sleeping for %ss' % timer)
        time.sleep(timer)
        timer=timer*2
    return root
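The retry loop above doubles its sleep without an upper bound and never gives up. A hypothetical variant (not part of the original code) that caps the backoff and aborts after a fixed number of attempts:

def fetch_capped(url, retries=6, cap=128, **kwargs):
    timer = 8
    for _ in range(retries):
        root = _fetch(url, **kwargs)
        if not root.xpath('//h1[text()="The system could not serve your request in time because of a temporary problem; please try again shortly."]'):
            return root
        logger.info('[i] getting "pls wait" msg, sleeping for %ss' % timer)
        time.sleep(timer)
        timer = min(timer * 2, cap)   # exponential backoff with a ceiling
    raise IOError("giving up on %s after %s retries" % (url, retries))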
Example #12
def sources(url, path):
    root=fetch(url)
    regexpNS = "http://exslt.org/regular-expressions"
    if len(path): logger.info("[i] crawler: %s" % ' '.join(path[-1]))
    for doc in root.xpath("//a[re:test(@href, 'LexUriServ[.]do[?]uri=[0-9A-Z:]*:NOT', 'i')]",
                          namespaces={'re':regexpNS}):
        yield (doc.get('href').split('uri=')[1][:-4], path)
    for c in root.xpath("//div[@id='content']//a[re:test(@href, 'chap[0-9]*.htm', 'i')]",
                        namespaces={'re':regexpNS}):
        for res in sources("%s/%s" % (crawlroot,c.get('href')),
                           path+[tuple(c.text.split(' ',1))]):
            yield res
Example #13
def fetch(url, **kwargs):
    timer = 8
    while True:
        root = _fetch(url, **kwargs)
        fail = root.xpath(
            '//h1[text()="The system could not serve your request in time because of a temporary problem; please try again shortly."]'
        )
        if not len(fail):
            timer = 8
            break
        logger.info('[i] getting "pls wait" msg, sleeping for %ss' % timer)
        time.sleep(timer)
        timer = timer * 2
    return root
Example #14
def sources(url, path):
    root = fetch(url)
    regexpNS = "http://exslt.org/regular-expressions"
    if len(path): logger.info("[i] crawler: %s" % ' '.join(path[-1]))
    for doc in root.xpath(
            "//a[re:test(@href, 'LexUriServ[.]do[?]uri=[0-9A-Z:]*:NOT', 'i')]",
            namespaces={'re': regexpNS}):
        yield (doc.get('href').split('uri=')[1][:-4], path)
    for c in root.xpath(
            "//div[@id='content']//a[re:test(@href, 'chap[0-9]*.htm', 'i')]",
            namespaces={'re': regexpNS}):
        for res in sources("%s/%s" % (crawlroot, c.get('href')),
                           path + [tuple(c.text.split(' ', 1))]):
            yield res
Example #15
def save(data, stats):
    src = data['meta']['source']
    res = db.dossiers2.find_one({'meta.source': src}) or {}
    d = diff(
        dict([(k, v) for k, v in res.items()
              if not k in ['_id', 'meta', 'changes']]),
        dict([(k, v) for k, v in data.items() if not k in [
            '_id',
            'meta',
            'changes',
        ]]))
    #logger.warn(d)
    if d:
        now = datetime.datetime.utcnow().replace(microsecond=0).isoformat()
        if not res:
            logger.info(('adding %s - %s' %
                         (data['procedure']['reference'],
                          data['procedure']['title'])).encode('utf8'))
            data['meta']['created'] = data['meta']['timestamp']
            del data['meta']['timestamp']
            sys.stdout.flush()
            stats[0] += 1
        else:
            logger.info(('updating  %s - %s' %
                         (data['procedure']['reference'],
                          data['procedure']['title'])).encode('utf8'))
            data['meta']['updated'] = data['meta']['timestamp']
            del data['meta']['timestamp']
            sys.stdout.flush()
            stats[1] += 1
            data['_id'] = res['_id']
            #print >> sys.stderr, (d)
        m = db.notifications.find({'dossiers': data['procedure']['reference']},
                                  ['active_emails'])
        for g in m:
            if len(g['active_emails']) == 0:
                continue
            msg = Message(
                "[PT] %s %s" %
                (data['procedure']['reference'], data['procedure']['title']),
                sender="*****@*****.**",
                bcc=g['active_emails'])
            msg.html = htmldiff(data, d)
            msg.body = makemsg(data, d)
            mail.send(msg)
        data['changes'] = res.get('changes', {})
        data['changes'][now] = d
        db.dossiers2.save(data)
    return stats
Example #16
def save(data, stats):
    src = data["meta"]["source"]
    res = db.dossiers2.find_one({"meta.source": src}) or {}
    d = diff(
        dict([(k, v) for k, v in res.items() if not k in ["_id", "meta", "changes"]]),
        dict([(k, v) for k, v in data.items() if not k in ["_id", "meta", "changes"]]),
    )
    # logger.warn(d)
    if d:
        now = datetime.datetime.utcnow().replace(microsecond=0).isoformat()
        if not res:
            logger.info(
                ("adding %s - %s" % (data["procedure"]["reference"], data["procedure"]["title"])).encode("utf8")
            )
            data["meta"]["created"] = data["meta"]["timestamp"]
            del data["meta"]["timestamp"]
            sys.stdout.flush()
            stats[0] += 1
        else:
            logger.info(
                ("updating  %s - %s" % (data["procedure"]["reference"], data["procedure"]["title"])).encode("utf8")
            )
            data["meta"]["updated"] = data["meta"]["timestamp"]
            del data["meta"]["timestamp"]
            sys.stdout.flush()
            stats[1] += 1
            data["_id"] = res["_id"]
            # print >> sys.stderr, (d)
        m = db.notifications.find({"dossiers": data["procedure"]["reference"]}, ["active_emails"])
        for g in m:
            if len(g["active_emails"]) == 0:
                continue
            msg = Message(
                "[PT] %s %s" % (data["procedure"]["reference"], data["procedure"]["title"]),
                sender="*****@*****.**",
                bcc=g["active_emails"],
            )
            msg.html = htmldiff(data, d)
            msg.body = makemsg(data, d)
            mail.send(msg)
        data["changes"] = res.get("changes", {})
        data["changes"][now] = d
        db.dossiers2.save(data)
    return stats
Example #17
def getComAgendas():
    urltpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?&docType=AGEN&leg=7&miType=text"
    nexttpl="http://www.europarl.europa.eu/committees/en/%s/documents-search.html?tabActif=tabLast&startValue=%s"
    for com in (k for k in COMMITTEE_MAP.keys() if len(k)==4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP']):
        url=urltpl % (com)
        i=0
        agendas=[]
        logger.info('scraping %s' % com)
        while True:
            logger.info("crawling %s" % (url))
            root=fetch(url)
            tmp=[(a.get('href'), unws(a.xpath('text()')[0]))
                 for a in root.xpath('//p[@class="title"]/a')
                 if len(a.get('href',''))>13]
            if not tmp: break
            for u,_ in tmp:
                yield (u,com)
            i+=10
            url=nexttpl % (com,i)
Example #18
def getComAgendas():
    urltpl = "http://www.europarl.europa.eu/committees/en/%s/documents-search.html?&docType=AGEN&leg=7&miType=text"
    nexttpl = "http://www.europarl.europa.eu/committees/en/%s/documents-search.html?tabActif=tabLast&startValue=%s"
    for com in (k for k in COMMITTEE_MAP.keys()
                if len(k) == 4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP']):
        url = urltpl % (com)
        i = 0
        agendas = []
        logger.info('scraping %s' % com)
        while True:
            logger.info("crawling %s" % (url))
            root = fetch(url)
            tmp = [(a.get('href'), unws(a.xpath('text()')[0]))
                   for a in root.xpath('//p[@class="title"]/a')
                   if len(a.get('href', '')) > 13]
            if not tmp: break
            for u, _ in tmp:
                yield (u, com)
            i += 10
            url = nexttpl % (com, i)
Example #19
def save(data, stats):
    res=db.ep_meps2.find_one({ 'UserID' : data['UserID'] }) or {}
    d=diff(dict([(k,v) for k,v in res.items() if not k in ['_id', 'meta', 'changes']]),
           dict([(k,v) for k,v in data.items() if not k in ['_id', 'meta', 'changes',]]))
    if d:
        now=unicode(datetime.utcnow().replace(microsecond=0).isoformat())
        if not res:
            logger.info(('adding %s' % (data['Name']['full'])).encode('utf8'))
            data['meta']['created']=now
            stats[0]+=1
        else:
            logger.info(('updating %s' % (data['Name']['full'])).encode('utf8'))
            logger.warn(d)
            data['meta']['updated']=now
            stats[1]+=1
            data['_id']=res['_id']
        data['changes']=res.get('changes',{})
        data['changes'][now]=d
        db.ep_meps2.save(data)
    return stats
Example #20
    def html(self):
        res=[css]
        res.append("<p class='statut'>%s</p>" % self.statut)
        res.append("<p class='type'>%s</p>" % self.type)

        res.append("<h2><span id='reference'>%s</span> <span id='title'>%s</span></h2>" %
                   (self.ref, self.title))

        res.append("<p class='institutions'>%s</p>" % self.institutions)

        res.extend(["<div class='explanation %s'>%s</div>" % (p.type,'\n'.join(p.html()))
                    for p in self.explanation])

        res.extend(["<p class='preamble'>%s</p>" % '\n'.join(p.html())
                    for p in self.preamble])

        tmp=[u"<li class='recital' id='recital_%s'>%s</li>" % (i+1,r)
             for i, r in enumerate(self.recitals)]
        res.append("<ol class='number '>%s</ol>" % '\n'.join(tmp))

        res.append("<p class='adoption'>%s</p>" % self.adoption)

        for i, c in enumerate(self.chaps):
            res.append("<h3 class='chapter' id='chapter%s'>%s</h3>" %
                       (i+1, c.title))
            res.append("<ol>")
            for x in c.nodes:
                res.extend(x.html())
            res.append("</ol>")

        logger.info(self.annexes)
        res.extend(["<h3>%s</h3><div class='annex'>%s</div>" %
                    (p['title'],
                     '\n'.join(['\n'.join(x.html())
                                for x in p['content']]))
                    for p in self.annexes])
        res.append("<hr /><h3>Footnotes</h3>")
        res.extend([u"<p class='footnote'><a name='footnote%s'>[%s] %s</a></p>" % (i+1, i+1 ,r[1])
                    for i, r in enumerate(self.footnotes)])

        return u'\n'.join(res).encode('utf8')
Example #21
def save(data, stats):
    for item in data:
        if not 'committee' in item: continue
        query={'committee': item['committee'],
               'src': item['src'],
               'title': item['title']}
        if 'date' in item:
            query['date']= item['date']
            if 'end' in item:
                query['end']= item['end']
        else:
            query['seq_no']=item['seq_no']
        res=db.ep_comagendas.find_one(query) or {}
        d=diff(dict([(k,v) for k,v in res.items() if not k in ['_id', 'meta', 'changes']]),
               dict([(k,v) for k,v in item.items() if not k in ['_id', 'meta', 'changes',]]))
        if d:
            now=datetime.utcnow().replace(microsecond=0)
            if not 'meta' in item: item[u'meta']={}
            if not res:
                logger.info((u'adding %s %s' % (item['committee'], item['title'])).encode('utf8'))
                item['meta']['created']=now
                if stats: stats[0]+=1
            else:
                logger.info((u'updating %s %s' % (item['committee'], item['title'])).encode('utf8'))
                logger.info(d)
                item['meta']['updated']=now
                if stats: stats[1]+=1
                item['_id']=res['_id']
            item['changes']=res.get('changes',{})
            item['changes'][now.isoformat()]=d
            db.ep_comagendas.save(item)
    if stats: return stats
    else: return data
Example #22
def save(data, stats):
    if not data: return stats
    res=db.eurlex.find_one({ 'id.celexid' : data['id']['celexid'] }) or {}
    d=diff(dict([(k,v) for k,v in res.items() if not k in ['_id', 'meta', 'changes']]),
           dict([(k,v) for k,v in data.items() if not k in ['_id', 'meta', 'changes',]]))
    if d:
        now=unicode(datetime.utcnow().replace(microsecond=0).isoformat())
        if not res:
            logger.info(('adding %s' % (data['id']['celexid'])).encode('utf8'))
            data['meta']['created']=now
            if stats: stats[0]+=1
        else:
            logger.info(('updating %s' % (data['id']['celexid'])).encode('utf8'))
            logger.warn(d)
            data['meta']['updated']=now
            if stats: stats[1]+=1
            data['_id']=res['_id']
        data['changes']=res.get('changes',{})
        data['changes'][now]=d
        db.eurlex.save(data)
    if stats: return stats
    else: return data
Example #23
def getComAms(leg=TERM, update=False):
    urltpl = "http://www.europarl.europa.eu/committees/en/%s/documents-search.html"
    # todo add to searchRPCD, OPCD
    for doctype in ['AMCO', 'RPCD', 'OPCD']:
        postdata = "clean=false&leg=%s&docType=%s&miType=text" % (leg, doctype)
        nexttpl = "http://www.europarl.europa.eu/committees/en/%s/documents-search.html?action=%s&tabActif=tabResult#sidesForm"
        for com in (
                k for k in COMMITTEE_MAP.keys()
                if len(k) == 4 and k not in ['CODE', 'RETT', 'CLIM', 'TDIP']):
            url = urltpl % (com)
            i = 0
            logger.info('%s %s crawling %s' %
                        (datetime.now().isoformat(), doctype, com))
            root = fetch(url, params=postdata)
            prev = []
            while True:
                logger.info("%s %s" % (datetime.now().isoformat(), url))
                #logger.info(tostring(root))
                tmp = {
                    a.get('href'): ' '.join(
                        a.xpath('../../../p[@class="rapporteurs"]//text()'))
                    if doctype != 'AMCO' else None
                    for a in root.xpath(
                        '//a[@title="open this PDF in a new window"]')
                    if (len(a.get('href', '')) > 13)
                }
                if not tmp or prev == tmp:
                    break
                prev = tmp
                for u, v in sorted(tmp.items()):
                    if db.ep_ams.find_one({'src': u}): continue
                    yield u, v
                if update: break
                i += 1
                url = nexttpl % (com, i)
                root = fetch(url)
Example #24
def save(data, stats):
    for item in data:
        if not 'committee' in item: continue
        query = {
            'committee': item['committee'],
            'src': item['src'],
            'title': item['title']
        }
        if 'date' in item:
            query['date'] = item['date']
            if 'end' in item:
                query['end'] = item['end']
        else:
            query['seq_no'] = item['seq_no']
        res = db.ep_comagendas.find_one(query) or {}
        d = diff(
            dict([(k, v) for k, v in res.items()
                  if not k in ['_id', 'meta', 'changes']]),
            dict([(k, v) for k, v in item.items() if not k in [
                '_id',
                'meta',
                'changes',
            ]]))
        if d:
            now = datetime.utcnow().replace(microsecond=0)
            if not 'meta' in item: item[u'meta'] = {}
            if not res:
                logger.info(
                    (u'adding %s%s %s' %
                     (u'%s ' % item['epdoc'] if 'epdoc' in item else '',
                      item['committee'], item['title'])).encode('utf8'))
                item['meta']['created'] = now
                if stats: stats[0] += 1
                notify(item, None)
            else:
                logger.info(
                    (u'updating %s%s %s' %
                     (u'%s ' % item['epdoc'] if 'epdoc' in item else '',
                      item['committee'], item['title'])).encode('utf8'))
                logger.info(d)
                item['meta']['updated'] = now
                if stats: stats[1] += 1
                item['_id'] = res['_id']
                notify(item, d)
            item['changes'] = res.get('changes', {})
            item['changes'][now.isoformat()] = d
            db.ep_comagendas.save(item)
    if stats: return stats
    else: return data
Example #25
def crawler(saver=jdump, update=False):
    stats = [0, 0]
    for pdf, rapporteur in getComAms(update=update):
        logger.info(datetime.now().isoformat() + " " + pdf)
        ctr = [0, 0]
        try:
            saver(scrape(pdf, rapporteur), ctr)
        except:
            # log the failure and re-raise
            logger.warn("[!] %s failed to scrape: %s" % (datetime.now().isoformat(), pdf))
            # logger.warn(traceback.format_exc())
            raise
        logger.info("%s [i] added/updated: %s/%s" % (datetime.now().isoformat(), ctr[0], ctr[1]))
        stats[0] += ctr[0]
        stats[1] += ctr[1]
    logger.info("%s [o] total added/updated: %s/%s" % (datetime.now().isoformat(), stats[0], stats[1]))
Example #26
def crawler(saver=jdump, update=False):
    stats=[0,0]
    for pdf, rapporteur in getComAms(update=update):
        logger.info(datetime.now().isoformat()+" "+pdf)
        ctr=[0,0]
        try:
            saver(scrape(pdf, rapporteur), ctr)
        except:
            # log the failure and re-raise
            logger.warn("[!] %s failed to scrape: %s" % (datetime.now().isoformat(), pdf))
            #logger.warn(traceback.format_exc())
            raise
        logger.info("%s [i] added/updated: %s/%s" % (datetime.now().isoformat(), ctr[0],ctr[1]))
        stats[0]+=ctr[0]
        stats[1]+=ctr[1]
    logger.info("%s [o] total added/updated: %s/%s" % (datetime.now().isoformat(),stats[0],stats[1]))
Example #27
def save(data, stats):
    if not data: return stats
    src=data['meta']['source']
    res=db.dossiers2.find_one({ 'meta.source' : src }) or {}
    d=diff(dict([(k,v) for k,v in res.items() if not k in ['_id', 'meta', 'changes']]),
           dict([(k,v) for k,v in data.items() if not k in ['_id', 'meta', 'changes',]]))
    #logger.warn(pprint.pformat(d))
    if d:
        now=datetime.datetime.utcnow().replace(microsecond=0).isoformat()
        if not res:
            logger.info(('adding %s - %s' % (data['procedure']['reference'],data['procedure']['title'])).encode('utf8'))
            data['meta']['created']=data['meta']['timestamp']
            del data['meta']['timestamp']
            sys.stdout.flush()
            stats[0]+=1
        else:
            logger.info(('updating  %s - %s' % (data['procedure']['reference'],data['procedure']['title'])).encode('utf8'))
            data['meta']['updated']=data['meta']['timestamp']
            del data['meta']['timestamp']
            sys.stdout.flush()
            stats[1]+=1
            data['_id']=res['_id']
            logger.info(jdump(d))
        if not NOMAIL:
            m=db.notifications.find({'dossiers': data['procedure']['reference']},['active_emails'])
            for g in m:
                if len(g['active_emails'])==0:
                    continue
                msg = Message("[PT] %s %s" % (data['procedure']['reference'],data['procedure']['title']),
                              sender = "*****@*****.**",
                              bcc = g['active_emails'])
                #msg.html = htmldiff(data,d)
                msg.body = makemsg(data,d)
                mail.send(msg)
        #logger.info(htmldiff(data,d))
        #logger.info(makemsg(data,d))
        data['changes']=res.get('changes',{})
        data['changes'][now]=d
        db.dossiers2.save(data)
    return stats
Example #28
def crawlseq(urls):
    [save(scrape(url), [0, 0]) for url, title in urls]
    logger.info('end of crawl')
Example #29
def parseMember(userid):
    url='http://www.europarl.europa.eu/meps/en/%s/get.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url)
    data = {u'active': True, 'meta': {u'url': url}} # return {'active': False}
    mepdiv=root.xpath('//div[@class="ep_elementpeople2"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[0]))
    data[u'Photo'] = unicode(urljoin(BASE_URL,mepdiv.xpath('.//span[@class="ep_img"]/img')[0].get('src')),'utf8')
    (d, p) = mepdiv.xpath('.//div[@class="ep_elementtext"]/p/text()')[0].split(',', 1)
    try:
        data[u'Birth'] = { u'date': datetime.strptime(unws(d), "Born on %d %B %Y"),
                           u'place': unws(p) }
    except ValueError:
        logger.warn('[!] failed to scrape birth data %s' % url)
        logger.warn(traceback.format_exc())
    const={u'country': unws(mepdiv.xpath('.//span[@class="ep_country"]/text()')[0])}
    data[u'Constituencies']=[const]
    try:
        const[u'party']=unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[1])
    except IndexError:
        data[u'active']=False
    else:
        group=unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[0])
        data[u'Groups'] = [{ u'role': unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[1]),
                             u'group': group,
                             u'groupid': group_map[group]}]
    cdiv=root.xpath('//div[@class="ep_elementcontact"]')
    if len(cdiv):
        addif(data,u'RSS',[unicode(urljoin(BASE_URL,x.get('href')),'utf8') for x in cdiv[0].xpath('.//li[@class="ep_rss"]//a')])
        addif(data,u'Homepage',[unicode(x.get('href'),'utf8') for x in cdiv[0].xpath('.//li[@class="ep_website"]//a')])
        addif(data,u'Mail',[decodemail(unws(x)) for x in cdiv[0].xpath('.//li[@class="ep_email"]//text()') if len(unws(x))])
    for span in root.xpath('//div[@id="contextzone"]//span[@class="ep_title"]'):
        title=unws(''.join(span.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            addif(data,title,[unws(x) for x in span.xpath('../../..//li/div/text()')])
    addif(data,u'Addresses',getAddress(root))
    for div in root.xpath('//div[@class="ep_content"]'):
        key=unws(u''.join(div.xpath('.//span[@class="ep_title"]/text()')))
        if not len(key):
            continue
        elif key.lower()=='curriculum vitae':
            data[u'CV'] = [unws(x) for x in div.xpath('.//div[@class="ep_elementtext"]//li/div/text()')]
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President']:
            for span in div.xpath('.//span[@class="commission_label"]'):
                item={u'role': key,
                      u'abbr': unws(''.join(span.xpath('text()'))),
                      u'Organization': unws(span.tail)}
                for start, field in orgmaps:
                    if item['Organization'].startswith(start):
                        if not field in data: data[field]=[]
                        if field=='Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id']=COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        else:
            logger.error('[!] unknown field %s' % key)
    return data
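parseMember() stores optional fields through an addif() helper that is not shown here. A minimal sketch, assuming it only sets the key when the scraped value is non-empty:

def addif(data, key, value):
    # only record the field if scraping actually produced something
    if value:
        data[key] = value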
Example #30
def scrape(url):
    try:
        logger.info('scrape ' + url)
        tree = fetch(url)
        agents, committees = scrape_actors(tree)
        forecasts = lst2obj((tree.xpath('//table[@id="forecast"]')
                             or [None])[0], forecastFields)
        events = scrape_events(tree)
        procedure = scrape_basic(tree)
        ipext = []
        for ipexd in (IPEXMAP.get(procedure['reference']) or {}).get('Dates', []):
            skip = False
            for event in forecasts + events:
                if event['type'] == ipexevents.get(ipexd['type'], {}).get(
                        'oeil', 'asdf') and event['date'] == ipexd['date']:
                    skip = True
                    break
            if skip: continue
            ipext.append(ipexd)
        allevents = agents + scrape_docs(tree) + events + forecasts + ipext
        other = [x for x in allevents if not x.get('date')]
        allevents = sorted([x for x in allevents if x.get('date')],
                           key=itemgetter('date'))
        allevents = merge_events(allevents, committees)
        res = {
            u'meta': {
                'source': url,
                'id': int(url.split('id=')[1]),
                'timestamp': datetime.datetime.utcnow()
            },
            u'procedure':
            procedure,
            u'links':
            form2obj((tree.xpath('//table[@id="external_links"]')
                      or [None])[0]),
            u'committees':
            committees,
            u'activities':
            sorted(allevents, key=itemgetter('date')),
            u'other':
            other,
        }
        # check for "final act"
        finalas = tree.xpath('//div[@id="final_act"]//a')
        final = {}
        for link in finalas:
            if link.get('class') == 'sumbutton':
                try:
                    summary = fetch("http://www.europarl.europa.eu%s" %
                                    link.get('href'))
                except:
                    continue
                final['text'] = [
                    tostring(x) for x in summary.xpath('//div[@id="summary"]')
                ]
            else:
                if not 'docs' in final: final['docs'] = []
                final['docs'].append({
                    'title': link.xpath('text()')[0].strip(),
                    'url': link.get('href')
                })
        if final and final.get('docs'):
            res[u'procedure'][u'final'] = final.get('docs', [{}])[0]
            for item in res['activities']:
                if item.get(
                        'type') == u'Final act published in Official Journal':
                    if final.get('text'):
                        item[u'text'] = final['text']
                    if len(final.get('docs')) > 1:
                        if not 'docs' in item:
                            item[u'docs'] = final['docs']
                        else:
                            item[u'docs'].extend(final['docs'])
                    break
        return res
    except:
        logger.error("%s\n%s" % (url, traceback.format_exc()))
        return
Example #31
def crawl(urls, threads=4):
    m = Multiplexer(scrape, save, threads=threads)
    m.start()
    [m.addjob(url) for url, title in urls]
    m.finish()
    logger.info('end of crawl')
Example #32
def scrape(url, rapporteur=None):
    if (url in ['http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-483.680%2b02%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-454.387%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-456.679%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-494.504%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.705%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.767%2b02%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-454.385%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-465.012%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-496.504%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.724%2b01%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.721%2b02%2bDOC%2bPDF%2bV0%2f%2fEN',
                'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-469.723%2b03%2bDOC%2bPDF%2bV0%2f%2fEN']
        or not url.endswith('EN')):
        logger.info("skipping unparsable url")
        return []
    prolog=True
    res=[]
    block=None
    reference=None
    date=None
    committee=[]
    text=getraw(url).split('\n')
    for line in text:
        if prolog:
            if amstart.match(line):
                if reference==None:
                    logger.warn("%s [!] couldn't find ref: %s" %
                                (datetime.now().isoformat(),
                                 unws([x for x in text[:20] if unws(x)][2])))
                    # marking as scraped though
                    db.ep_ams.save({'src': url, 'error': "couldn't find reference in source pdf"})
                    return []
                if date==None or committee==[]:
                    return []
                    #raise ValueError
                block=[line]
                prolog=False
                continue

            line=unws(line)

            if not line: continue

            if line in COMMITTEE_MAP:
                committee.append(COMMITTEE_MAP[line])
                continue

            if (committee and
                  not reference and
                  re.match(refre, line)):
                reference=line
                if url == 'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-506.166%2b03%2bDOC%2bPDF%2bV0%2f%2fEN':
                    logger.info("adjusting reference to eudatap")
                    reference="2012/0011(COD)"
                continue

            if (reference and
                not date):
                try:
                    date = parse(unws(line), dayfirst=True)
                except ValueError:
                    pass
                except TypeError:
                    pass
            continue

        if amstart.match(line):
            # parse block
            res.append(parse_block(block, url, reference, date, committee, rapporteur))
            block=[line]
            continue
        block.append(line)
    if block and filter(None,block):
        res.append(parse_block(block, url, reference, date, committee, rapporteur))
    return res
Example #33
def scrape(decl):
    mep_id = decl.split('/')[-1].split('_')[0]
    data = {'mep_id': mep_id, 'url': unicode(decl), 'date': ''}
    logger.info("findecl scraping %s" % mep_id)

    text=getraw(decl).split('\n')
    state=0
    ptr=0
    while ptr<len(text):
        # bg: "А Б В Г Д Е  Ж З И"
        # el: "A B Γ Δ E ΣΤ Ζ H Θ"
        if (issectionhead(decl, text,ptr,state,0,('A',u'А','A')) or
            issectionhead(decl, text,ptr,state,2,('C',u'В',u'Γ')) or
            issectionhead(decl, text,ptr,state,3,('D',u'Г',u'Δ')) or
            issectionhead(decl, text,ptr,state,4,('E',u'Д',u'E')) or
            issectionhead(decl, text,ptr,state,5,('F',u'Е',u'ΣΤ'))):
            # skip to table
            while (text[ptr].split()[-4:]!=['1','2','3','4']):
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] %s table not found' % state)
                    raise IndexError
            start=ptr
            # skip empty lines
            while not text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] %s fail skip empty lines' % state)
                    raise IndexError
            while True:
                if ptr>len(text):
                    logger.error('[meh] fail past end of block %s' % state)
                    raise IndexError
                if (text[ptr].strip()=='' and
                    (text[ptr+1] in ['1',''] or
                    text[ptr+1].strip()[:3] == '1/6')):
                    break
                if text[ptr].startswith(' ' * 20) and (text[ptr].strip()[1]=='/' and
                                                       text[ptr].strip()[0] in ['2','3','4']):
                    break
                ptr+=1
            end=ptr
            state+=1
            #print >> sys.stderr, text[start:end]
            if state == 6:
                t = parse_table_f(text[start:end])
            else:
                t = parse_table(text[start:end])
            data[state_map[state]] = t
            if DEBUG:
                print "\t%s" % ('\n\t'.join((repr(x) for x in t)) or "none"), state
        elif issectionhead(decl, text,ptr,state,1,('B',u'Б', u'B')):
            while len([x for x in text[ptr].split(' ' * 10) if x]) != 2:
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] table B not found')
                    raise IndexError
            start=ptr
            # skip empty lines
            while ptr<len(text) and not text[ptr].split():
                ptr+=1
            while True:
                if ptr>len(text):
                    logger.error('[meh] fail skip empty lines in B')
                    raise IndexError
                if [text[ptr].strip(), text[ptr+1]] in (['','1'], ['','']):
                    break
                if text[ptr].startswith(' ' * 20) and (text[ptr].strip()[1]=='/' and
                                                       text[ptr].strip()[0] in ['2','3','4']):
                    break
                ptr+=1
            end=ptr
            state+=1
            t = parse_table_b(text[start:end])
            if DEBUG:
                print "\t%s" % ('\n\t'.join((repr(x) for x in t)) or "none"), state
            data[state_map[state]] = t
        elif state==6:
            while not issectionhead(decl, text,ptr,state,6,('G',u'Ж',u'Ζ')):
                ptr+=1
            # skip continuation lines
            while text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] continuation in G fail')
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] fail skip empty lines in G')
                    raise IndexError
            gstart=ptr
            state+=1
            while not issectionhead(decl, text,ptr,state,7,('H',u'З',u'H')):
                ptr+=1
            gend=ptr-1
            if DEBUG:
                print "\t", text[gstart:gend], state
            data[state_map[state]] = '\n'.join(x for x in map(unicode.strip, text[gstart:gend]) if x)
            # skip continuation lines
            while text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] continuation in H fail')
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] fail skip empty lines in H')
                    raise IndexError
            hstart=ptr
            state+=1
            while not issectionhead(decl, text,ptr,state,8,('I',u'И',u'Θ')):
                ptr+=1
            hend=ptr-1
            if DEBUG:
                print "\t", text[hstart:hend], state
            data[state_map[state]] = '\n'.join(x for x in map(unicode.strip, text[hstart:hend]) if x)
            # skip continuation lines
            while text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] continuation in I fail')
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] fail skip empty lines in I')
                    raise IndexError
            istart=ptr
            while True:
                tmp = text[ptr].split()
                if len(tmp)==3:
                    data['date']=tmp[1]
                    del tmp[1]
                    if tmp in iendsigs:
                        break
                elif len(tmp)==5:
                    # date=tmp[2] could be preserved in data
                    tmpdate=tmp[2]
                    del tmp[2]
                    if tmp in [['Date', ':','Signature', ':']]:
                        data['date']=tmpdate
                        break
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] fail find end in I')
                    if DEBUG:
                        print 'meh\n>>>%s' % '\n>>>'.join(text[istart:istart+14]).encode('utf8')
                    raise IndexError
            state+=1
            if DEBUG:
                print >> sys.stderr, state
                #print >> sys.stderr, "\t", text[istart:ptr], state
            data[state_map[state]] = '\n'.join(x for x in map(unicode.strip, text[istart:ptr]) if x)
        #else:
            #print >> sys.stderr, '>>>>>>>>', line.encode('utf8')
        ptr+=1
    if state!=9:
        print >> sys.stderr, '>>>>>>>>', "wtfwtf", state
        logger.error('[wtf] did not reach final state %s' % state)
        return {}
    else:
        if (len(data['occupation'])>1 and
            data['occupation'][-1][0] in [u"No occupation held during the three years preceding the current mandate",
                                          u"Καμία επαγγελματική δραστηριότητα κατά τη διάρκεια των τριών ετών που προηγήθηκαν της τρέχουσας εντολής",
                                          u"Atividade Liberal como autor/outras atividades artísticas (remuneração inferior a 500 € na totalidade dos 3 anos anteriores)",
                                          u"Brak działalności zawodowej w okresie trzech lat poprzedzających obecną kadencję",
                                          u"Geen beroep uitgeoefend gedurende de drie jaar voorafgaand aan de huidige zittingsperiode",
                                          u"Nessuna attività svolta durante i tre anni precedenti l'attuale mandato",
                                          u"Keine Berufstätigkeit während des Dreijahreszeitraums vor der laufenden Wahlperiode",
            ]):
            del data['occupation'][-1]
        return data
Example #34
def crawler(saver=jdump,threads=4):
    m=Multiplexer(scrape,saver,threads=threads)
    m.start()
    [m.addjob(url, data) for url, data in getComAgendas()]
    m.finish()
    logger.info('end of crawl')
Example #35
def parse_block(block, url, reference, date, committee, rapporteur):
    am={u'src': url,
        u'reference': reference,
        u'date': date,
        u'committee': committee}

    #logger.info(block)
    # get title
    try:
        am[u'seq']=int(unws(block[0]).split()[1])
    except ValueError:
        am[u'seq']=unws(block[0]).split()[1]
    except IndexError:
        logger.warn("%s wrong seq %s" % (datetime.now().isoformat(), block[0]))
        am[u'seq']=unws(block[0])
    del block[0]

    strip(block)

    # find and strip justification
    i=len(block)-1
    while i>2 and not (unws(block[i])=="Justification" and block[i].startswith(' ' * 6)):
        i-=1
    if i>2:
        if i<len(block)-1 and (not unws(block[i+1]) or not block[i+1].startswith(' ') ):
            am['justification']='\n'.join(block[i+2:])
            del block[i:]
            strip(block)
        else:
            logger.warn("%s wrong justification\n%s" % (datetime.now().isoformat(), '\n'.join(block[i:])))

    # get original language
    if 4<len(unws(block[-1]))<=6 and unws(block[-1]).startswith('Or.'):
        am['orig_lang']=unws(block[-1])[4:]
        del block[-1]
        strip(block)

    # find split column new/old heading
    i=len(block)-1
    while (i>2 and
           not ((block[i].endswith("     Amendment") or
                 block[i].endswith("     PARTICULARS") or
                 block[i].endswith("     Remedy") or
                 block[i].endswith("     Amended text") or
                 block[i].endswith("     Amendement") or
                 block[i].endswith("           Amendments by Parliament") or
                 block[i].endswith("           Proposal for rejection") or
                 block[i].endswith("           Proposal for a rejection") or
                 block[i].endswith("           Does not affect English version") or
                 block[i].endswith("           (Does not affect English version)") or
                 block[i].endswith("      Amendment by Parliament")) and
                len(block[i])>33) and
           not (unws(block[i])=='Text proposed by the Commission' or
                unws(block[i]) in types)):
        i-=1
    if i>2:
        #if block[i].endswith("               Proposal for rejection"):
        #    pass # location will be possibly '-'
        seq=False
        if unws(block[i]) in ["Amendment", "Amendment by Parliament"]:
            # sequential format, search for preceeding original text
            j=i
            while (j>2 and not (unws(block[j]) in types or unws(block[j])=='Text proposed by the Commission')):
                j-=1
            if j>2: i=j
            seq=True; key='old'
        elif unws(block[i])=='Text proposed by the Commission' or block[i].strip() in types:
            seq=True; key='old'
        # throw headers
        del block[i]
        while i<len(block) and not unws(block[i]): del block[i]        # skip blank lines
        mid=max([len(x) for x in block])/2
        while i<len(block):
            if seq:
                if unws(block[i]) in ["Amendment", "Amendment by Parliament", "Text Amended"]:
                    key='new'
                    del block[i]
                    continue
                try: am[key].append(block[i])
                except KeyError: am[key]=[block[i]]
                del block[i]
                continue
            # only new, old is empty
            if block[i].startswith('         '):
                try: am['new'].append(unws(block[i]))
                except KeyError: am['new']=[unws(block[i])]
                del block[i]
                continue
            newstart = block[i].rstrip().rfind('  ')
            # only old, new is empty
            if newstart < 6:
                try: am['old'].append(unws(block[i]))
                except KeyError: am['old']=[unws(block[i])]
                del block[i]
                continue
            #mid=len(block[i])/2
            #mid=40
            lsep=block[i].rfind('  ', 0, mid)
            # todo calculate both, and use the one closer to the center
            rsep=block[i].find('  ', mid)
            sep=None
            if abs(lsep-mid)<abs(rsep-mid):
                if abs(lsep-mid)<15:
                    sep=lsep
            else:
                if abs(rsep-mid)<15:
                    sep=rsep
            if sep:
                try: am['old'].append(unws(block[i][:sep]))
                except KeyError: am['old']=[unws(block[i][:sep])]
                try: am['new'].append(unws(block[i][sep:]))
                except KeyError: am['new']=[unws(block[i][sep:])]
            else:
                # no sane split found
                #logger.warn("no split: %s %s\n%s" % (datetime.now().isoformat(),
                #                                     (sep, mid, len(block[i]), newstart, block[i]),
                #                                     block[i][mid-1:mid+2]))
                # fallback to naive splitting
                try: am['old'].append(unws(block[i][:newstart]))
                except KeyError: am['old']=[unws(block[i][:newstart])]
                try: am['new'].append(unws(block[i][newstart:]))
                except KeyError: am['new']=[unws(block[i][newstart:])]
            del block[i]
        strip(block)
    else:
        logger.warn("%s no table\n%s" % (datetime.now().isoformat(), '\n'.join(block[i:])))
        am['content']=block[i:]
        return am

    i=0
    # find end of authors
    while (i<len(block) and
           unws(block[i]) and
           not unws(block[i]).lower().startswith('compromise') and
           not istype(block[i]) and
           not unws(block[i]).split()[0] in locstarts): i+=1
    if i<len(block):
        if i>0:
            names=' '.join(block[:i])
            am['authors']=names
            #logger.info("names \n%s" % names)

            # convert to pt mep _ids
            for text in filter(None,splitNames(names)):
                mep=getMep(text,None,False)
                if mep:
                    try: am['meps'].append(mep['UserID'])
                    except KeyError: am['meps']=[mep['UserID']]
                else:
                    logger.info("fix %s" % text)
            del block[:i]
            strip(block)
        elif rapporteur:
            am['authors']=rapporteur
            for text in filter(None,splitNames(rapporteur)):
                mep=getMep(text,None,False)
                if mep:
                    try: am['meps'].append(mep['UserID'])
                    except KeyError: am['meps']=[mep['UserID']]
                else:
                    logger.info("fix %s" % text)
        else:
            logger.info("%s no authors in Amendment %s" % (datetime.now().isoformat(), am['seq']))
    else:
        logger.warn("%s no boundaries in Amendment %s\n%s" % (datetime.now().isoformat(),
                                                              am['seq'],
                                                              '\n'.join(block)))
        am['rest']=block
        return am

    # handle compromise info
    i=0
    while (i<len(block) and
           unws(block[i]) and
           not istype(block[i]) and
           not unws(block[i]).split()[0] in locstarts): i+=1
    if i<len(block) and i>0:
        am['compromise']=block[:i]
        del block[:i]
        strip(block)

    i=0
    while (i<len(block) and unws(block[i])):
        if unws(block[i]).split()[0] in locstarts:
            try: am['location'].append((' '.join(block[:i]),unws(block[i])))
            except KeyError: am['location']=[(' '.join(block[:i]),unws(block[i]))]
            del block[:i+1]
            i=0
        else:
            i+=1
    if len(block)>0 and ((len(block)==1 or
                          not unws(block[1])) and
                         unws(block[0])!='1' and
                         'location' in am):
        am['location'][-1]=(am['location'][-1][0],"%s %s" % (am['location'][-1][1],block[0]))
        del block[0]
        strip(block)

    if block:
        if not ((len(block)==3 and
                unws(block[0])=='1' and
                not unws(block[1]) and
                block[2].startswith("  ")) or
                (len(block)==2 and
                unws(block[0])=='1' and
                block[1].startswith("  "))):
            # ignore obvious footnotes
            logger.info("rest in Amendment %s\n%s" % (am['seq'],'\n'.join(block)))
    return am
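parse_block() repeatedly calls a strip() helper on the working block that is not part of this listing. A minimal sketch, assuming it trims blank lines from both ends of the list in place:

def strip(block):
    # drop leading and trailing lines that are empty after whitespace collapsing
    while block and not unws(block[0]):
        del block[0]
    while block and not unws(block[-1]):
        del block[-1]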
Example #36
def scrape(url):
    try:
        logger.info('scrape '+url)
        tree=fetch(url)
        agents,committees=scrape_actors(tree)
        forecasts=lst2obj((tree.xpath('//table[@id="forecast"]') or [None])[0],forecastFields)
        events=scrape_events(tree)
        procedure=scrape_basic(tree)
        if not procedure: return
        ipext=[]
        for ipexd in IPEXMAP.get(procedure['reference'], {}).get('Dates',[]):
            skip=False
            for event in forecasts+events:
                if event['type'] in ipexevents.get(ipexd['type'],{}).get('oeil',[]) and event['date']==ipexd['date']:
                    skip=True
                    break
            if skip: continue
            ipext.append(ipexd)
        allevents=agents+scrape_docs(tree)+events+forecasts+ipext
        other=[x for x in allevents if not x.get('date')]
        allevents=sorted([x for x in allevents if x.get('date')],key=itemgetter('date'))
        allevents=merge_events(allevents,committees, agents)
        res={u'meta': {'source': url,
                       'timestamp': datetime.datetime.utcnow() },
             u'procedure': procedure,
             u'links': form2obj((tree.xpath('//table[@id="external_links"]') or [None])[0]),
             u'committees': committees,
             u'activities': sorted(allevents, key=itemgetter('date')),
             u'other': other,
             }
        tmp=url.split('id=')
        if len(tmp)>1:
            res['meta']['id']=int(tmp[1])
        # check for "final act"
        finalas=tree.xpath('//div[@id="final_act"]//a')
        final={}
        for link in finalas:
            if link.get('class')=='sumbutton':
                try: summary=fetch("http://www.europarl.europa.eu%s" % link.get('href'))
                except: continue
                final['text']=[unicode(tostring(x)) for x in summary.xpath('//div[@id="summary"]')]
            else:
                if not 'docs' in final: final['docs']=[]
                final['docs'].append({'title': link.xpath('text()')[0].strip(),
                                               'url': link.get('href')})
        if final and final.get('docs'):
            res[u'procedure'][u'final']=final.get('docs',[{}])[0]
            for item in res['activities']:
                if item.get('type')==u'Final act published in Official Journal':
                    if final.get('text'):
                        item[u'text']=final['text']
                    if len(final.get('docs'))>1:
                        if not 'docs' in item:
                            item[u'docs']=final['docs']
                        else:
                            item[u'docs'].extend(final['docs'])
                    break
        return res
    except:
        logger.error("%s\n%s" % (url,traceback.format_exc()))
        return
Ejemplo n.º 37
0
def crawlseq(urls, null=False):
    stats=[0,0]
    [save(scrape(url),stats)
     for url, title in urls
     if (null and db.dossiers2.find_one({'meta.source': url},['_id'])==None) or not null]
    logger.info('end of crawl %s' % stats)
Ejemplo n.º 38
0
def jdump(d, tmp=None):
    # simple json dumper default for saver (multiplexer related)
    res = json.dumps(d, indent=1, default=dateJSONhandler, ensure_ascii=False)
    logger.info(res.encode('utf-8'))
    return res
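
jdump passes a dateJSONhandler as the json.dumps default so that datetime values can be serialised; that handler is defined elsewhere in the code base, but a minimal sketch under that assumption could look like this:

from datetime import datetime

def dateJSONhandler(obj):
    # assumed behaviour: render datetimes as ISO strings, refuse anything else
    if isinstance(obj, datetime):
        return obj.isoformat()
    raise TypeError("%r is not JSON serializable" % obj)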
Ejemplo n.º 39
0
    if sys.argv[1]=="test":
        print jdump(scrape('28215')).encode('utf8')
        print jdump(scrape('113959')).encode('utf8')

        #print jdump(scrape('108570')).encode('utf8')
        #print jdump(scrape('1934')).encode('utf8')
        #print jdump(scrape('96919')).encode('utf8')
        #import code; code.interact(local=locals());
        sys.exit(0)
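        # note: the calls below never run because of the sys.exit(0) above;
        # they are kept only as ad-hoc test stubs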
        print jdump(scrape("http://www.europarl.europa.eu/meps/en/1934/get.html"),None)
        print jdump(scrape("http://www.europarl.europa.eu/meps/en/28576/get.html"), None)
        print jdump(scrape("http://www.europarl.europa.eu/meps/en/1263/Elmar_BROK.html"), None)
        print jdump(scrape("http://www.europarl.europa.eu/meps/en/96739/Reinhard_B%C3%9CTIKOFER.html"), None)
        print jdump(scrape("http://www.europarl.europa.eu/meps/en/28269/Jerzy_BUZEK.html"), None)
        print jdump(scrape("http://www.europarl.europa.eu/meps/en/1186/Astrid_LULLING.html"), None)
    elif sys.argv[1]=='mepid' and sys.argv[2]:
        #print saver(scrape(int(sys.argv[2]))).encode('utf8')
        print jdump(scrape(int(sys.argv[2]))).encode('utf8')
        sys.exit(0)

    elif sys.argv[1] in meplists.keys():
        logger.info('\n\tsaver: %s\n\tseq: %s' % (saver, 'seq' in args))
        meps=getmeps(sys.argv[1])
        if 'seq' in args:
            res=seqcrawl(meps,saver=saver, null=null)
            if 'dry' in args:
                print "[%s]" % ',\n'.join(res).encode('utf8')
        else:
            crawler(meps,saver=saver)
Ejemplo n.º 40
0
def crawler(targets,saver=jdump,threads=4, term='7'):
    m=Multiplexer(scrape,saver,threads=threads)
    m.start()
    [m.addjob(url, data) for url, data in targets(term=term)]
    m.finish()
    logger.info('end of crawl')
Ejemplo n.º 41
0
def crawl(saver=jdump,threads=4):
    m=Multiplexer(scrape,saver,threads=threads)
    m.start()
    [m.addjob(url) for url, name in get_meps()]
    m.finish()
    logger.info('end of crawl')
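
The Multiplexer used by the crawlers above is imported from another module; judging only from the calls here it exposes start(), addjob() and finish(), runs each job through the scrape function on a pool of threads and hands the result to the saver. A minimal sketch of a compatible class, purely as an assumption about the real implementation (error handling and the stats plumbing are omitted):

import threading
from Queue import Queue   # Python 2 module name, matching the rest of the code

class Multiplexer(object):
    # assumed interface only; the real class may differ substantially
    def __init__(self, worker, saver, threads=4):
        self.worker = worker
        self.saver = saver
        self.q = Queue()
        self.pool = [threading.Thread(target=self._run) for _ in range(threads)]

    def start(self):
        for t in self.pool:
            t.daemon = True
            t.start()

    def addjob(self, *args):
        self.q.put(args)

    def _run(self):
        while True:
            args = self.q.get()
            try:
                self.saver(self.worker(*args))
            finally:
                self.q.task_done()

    def finish(self):
        self.q.join()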
Ejemplo n.º 42
0
def scrape(decl):
    mep_id = decl.split('/')[-1].split('_')[0]
    data = {'mep_id': mep_id, 'url': unicode(decl), 'date': ''}
    logger.info("findecl scraping %s" % mep_id)

    text = getraw(decl).split('\n')
    state = 0
    ptr = 0
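    # 'state' counts the declaration sections parsed so far (A..I);
    # the result is only accepted at the end once all nine have been seen (state == 9)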
    while ptr < len(text):
        # bg: "А Б В Г Д Е  Ж З И"
        # el: "A B Γ Δ E ΣΤ Ζ H Θ"
        if (issectionhead(decl, text, ptr, state, 0, ('A', u'А', 'A'))
                or issectionhead(decl, text, ptr, state, 2, ('C', u'В', u'Γ'))
                or issectionhead(decl, text, ptr, state, 3, ('D', u'Г', u'Δ'))
                or issectionhead(decl, text, ptr, state, 4, ('E', u'Д', u'E'))
                or issectionhead(decl, text, ptr, state, 5,
                                 ('F', u'Е', u'ΣΤ'))):
            # skip to table
            while (text[ptr].split()[-4:] != ['1', '2', '3', '4']):
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] %s table not found' % state)
                    raise IndexError
            start = ptr
            # skip empty lines
            while not text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] %s fail skip empty lines' % state)
                    raise IndexError
            while True:
                if ptr >= len(text):  # >= so we never index past the end of the block
                    logger.error('[meh] fail past end of block %s' % state)
                    raise IndexError
                if (text[ptr].strip() == ''
                        and (text[ptr + 1] in ['1', '']
                             or text[ptr + 1].strip()[:3] == '1/6')):
                    break
                if text[ptr].startswith(' ' * 20) and (
                        text[ptr].strip()[1] == '/'
                        and text[ptr].strip()[0] in ['2', '3', '4']):
                    break
                ptr += 1
            end = ptr
            state += 1
            #print >> sys.stderr, text[start:end]
            if state == 6:
                t = parse_table_f(text[start:end])
            else:
                t = parse_table(text[start:end])
            data[state_map[state]] = t
            if DEBUG:
                print "\t%s" % ('\n\t'.join(
                    (repr(x) for x in t)) or "none"), state
        elif issectionhead(decl, text, ptr, state, 1, ('B', u'Б', u'B')):
            while len([x for x in text[ptr].split(' ' * 10) if x]) != 2:
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] table B not found')
                    raise IndexError
            start = ptr
            # skip empty lines
            while ptr < len(text) and not text[ptr].split():
                ptr += 1
            while True:
                if ptr >= len(text):  # >= so we never index past the end of table B
                    logger.error('[meh] fail skip empty lines in B')
                    raise IndexError
                if [text[ptr].strip(), text[ptr + 1]] in (['', '1'], ['', '']):
                    break
                if text[ptr].startswith(' ' * 20) and (
                        text[ptr].strip()[1] == '/'
                        and text[ptr].strip()[0] in ['2', '3', '4']):
                    break
                ptr += 1
            end = ptr
            state += 1
            t = parse_table_b(text[start:end])
            if DEBUG:
                print "\t%s" % ('\n\t'.join(
                    (repr(x) for x in t)) or "none"), state
            data[state_map[state]] = t
        elif state == 6:
            while not issectionhead(decl, text, ptr, state, 6,
                                    ('G', u'Ж', u'Ζ')):
                ptr += 1
            # skip continuation lines
            while text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] continuation in G fail')
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] fail skip empty lines in G')
                    raise IndexError
            gstart = ptr
            state += 1
            while not issectionhead(decl, text, ptr, state, 7,
                                    ('H', u'З', u'H')):
                ptr += 1
            gend = ptr - 1
            if DEBUG:
                print "\t", text[gstart:gend], state
            data[state_map[state]] = '\n'.join(
                x for x in map(unicode.strip, text[gstart:gend]) if x)
            # skip continuation lines
            while text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] continuation in H fail')
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] fail skip empty lines in H')
                    raise IndexError
            hstart = ptr
            state += 1
            while not issectionhead(decl, text, ptr, state, 8,
                                    ('I', u'И', u'Θ')):
                ptr += 1
            hend = ptr - 1
            if DEBUG:
                print "\t", text[hstart:hend], state
            data[state_map[state]] = '\n'.join(
                x for x in map(unicode.strip, text[hstart:hend]) if x)
            # skip continuation lines
            while text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] continuation in I fail')
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] fail skip empty lines in I')
                    raise IndexError
            istart = ptr
            while True:
                tmp = text[ptr].split()
                if len(tmp) == 3:
                    data['date'] = tmp[1]
                    del tmp[1]
                    if tmp in iendsigs:
                        break
                elif len(tmp) == 5:
                    # date=tmp[2] could be preserved in data
                    del tmp[2]
                    if tmp in [['Date', ':', 'Signature', ':']]:
                        break
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] fail find end in I')
                    if DEBUG:
                        print 'meh\n>>>%s' % '\n>>>'.join(
                            text[istart:istart + 14]).encode('utf8')
                    raise IndexError
            state += 1
            if DEBUG:
                print >> sys.stderr, state
                #print >> sys.stderr, "\t", text[istart:ptr], state
            data[state_map[state]] = '\n'.join(
                x for x in map(unicode.strip, text[istart:ptr]) if x)
        #else:
        #print >> sys.stderr, '>>>>>>>>', line.encode('utf8')
        ptr += 1
    if state != 9:
        print >> sys.stderr, '>>>>>>>>', "wtfwtf", state
        logger.error('[wtf] did not reach final state %s' % state)
        return {}
    else:
        return data
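
scrape() raises IndexError whenever the layout of a declaration cannot be matched, so callers would normally guard it; a hedged usage sketch (the URL below is a made-up placeholder, not taken from the source):

decl_url = "http://www.europarl.europa.eu/mepdif/EXAMPLE_DFI_EN.pdf"  # hypothetical URL
try:
    declaration = scrape(decl_url)
except IndexError:
    declaration = {}   # layout not recognised; scrape() already logged the reason
if declaration:
    logger.info(jdump(declaration))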
Ejemplo n.º 43
0
def parseMember(userid):
    url='http://www.europarl.europa.eu/meps/en/%s/_history.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url, ignore=[500])

    data = {
        u'active': False,
        u'Photo': unicode(urljoin(BASE_URL,"/mepphoto/%s.jpg" % userid)),
        u'meta': {u'url': url}
        }

    mepdiv=root.xpath('//div[@class="zone_info_mep_transparent_mep_details"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(unws(' '.join(mepdiv.xpath('.//li[@class="mep_name"]//text()'))))

    borntxt=mepdiv.xpath('.//span[@class="more_info"]/text()')
    if len(borntxt)>0:
        if unws(borntxt[-1]).startswith('Date of death:'):
            try:
                data[u'Death'] = datetime.strptime(unws(borntxt[-1]), u"Date of death: %d %B %Y")
            except ValueError:
                logger.warn('[!] failed to parse date of death %s' % url)
                logger.warn(traceback.format_exc())
            tmp = borntxt[-2].split(',', 1)
        else:
            tmp = borntxt[-1].split(',', 1)
        if len(tmp)==2:
            (d, p) = tmp
        else:
            d,p = tmp[0], None
        try:
            data[u'Birth'] = { u'date': datetime.strptime(unws(d), u"Date of birth: %d %B %Y")}
        except ValueError:
            logger.warn(traceback.format_exc())
        finally:
            if p:
                if 'Birth' in data:
                    data[u'Birth'][u'place'] = unws(p)
                else:
                    data[u'Birth'] = unws(p)
    else:
        logger.warn('[!] no birth data %s' % url)

    # scrape stuff from right column
    addif(data,u'RSS',[unicode(urljoin(BASE_URL,x.get('href')),'utf8')
                       for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_rss"]')])
    addif(data,u'Homepage',[x.get('href')
                            for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_website"]')])
    addif(data,u'Twitter',[x.get('href')
                           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_twitt"]')])
    addif(data,u'Facebook',[x.get('href')
                           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_fb"]')])
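    # e-mail links are obfuscated on the page: the code strips the mailto:
    # prefix, replaces [dot]/[at] and reverses the string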
    addif(data,u'Mail',[x.get('href')[7:].replace('[dot]','.').replace('[at]','@')[::-1]
                        for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_email"]')])
    # contact information
    for span in root.xpath('//div[@id="content_right"]//h3'):
        title=unws(''.join(span.xpath('.//text()')))
        if title == "Contacts":
            addif(data,u'Addresses',getAddress(span))

    # scrape main content
    for section in root.xpath('//div[@id="content_left"]/div[@class="boxcontent nobackground"]/h4'):
        key=unws(''.join(section.xpath('.//text()')))
        if key=="National parties":
            # constituencies
            key='Constituencies'
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, party = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if not key in data: data[key]=[]
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                cstart = party.rfind(' (')
                if party[cstart+2:-1] in SEIRTNUOC:
                    country = party[cstart+2:-1]
                    party = party[:cstart]
                else:
                    logger.warn('unknown country: %s' % party[cstart+2:-1])
                    country='unknown'
                #print etree.tostring(constlm, pretty_print=True)
                data[key].append({
                    u'party':     party,
                    u'country':   country,
                    u'start':     datetime.strptime(unws(start), u"%d.%m.%Y"),
                    u'end':       datetime.strptime(unws(end), u"%d.%m.%Y"),
                    })
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President', 'Observer', 'Quaestor']:
            # memberships in various committees, delegations and EP mgt
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, org = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                item={u'role': key,
                      u'abbr': COMMITTEE_MAP.get(org),
                      u'Organization': org,
                      u'start':     datetime.strptime(unws(start), u"%d.%m.%Y"),
                      u'end':       datetime.strptime(unws(end), u"%d.%m.%Y"),
                      }
                for start, field in orgmaps:
                    if item['abbr'] in COMMITTEE_MAP or item['Organization'].startswith(start):
                        if not field in data: data[field]=[]
                        if field=='Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id']=COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        elif key == u'Political groups':
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                interval, org = line.split(' : ',1)
                tmp = org.split(u' - ')
                if len(tmp)>1:
                    org = ' - '.join(tmp[:-1])
                    role = tmp[-1]
                elif org.endswith(' -'):
                    org=org[:-2]
                    role=''
                else:
                    logger.error('[!] political group line %s' % line)
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                if not u'Groups' in data: data[u'Groups']=[]
                data[u'Groups'].append(
                    {u'role':         role,
                     u'Organization': org,
                     u'country':      COUNTRIES.get(unws(constlm.get('class')).upper(), 'unknown country: %s' % unws(constlm.get('class'))),
                     u'groupid':      group_map[org],
                     u'start':        datetime.strptime(unws(start), u"%d.%m.%Y"),
                     u'end':          datetime.strptime(unws(end), u"%d.%m.%Y"),
                     })
        else:
            logger.error('[!] unknown field %s' % key)

    # sort all lists in descending order
    for fld in ['Constituencies', 'Groups', 'Committees', 'Delegations', 'Staff']:
        if not fld in data: continue
        data[fld]=sorted(data[fld],
                         key=lambda x: x.get('end',x['start']),
                         reverse=True)

    # get CV - page (is on separate http path :/)
    cvurl='http://www.europarl.europa.eu/meps/en/%s/_cv.html' % userid
    root = fetch(cvurl, ignore=[500])
    data[u'CV']=[unws(x) for x in root.xpath('//p[@class="details_cv"]/text()')]

    # get assistants also on a separate page :/
    assurl='http://www.europarl.europa.eu/meps/en/%s/_assistants.html' % userid
    root = fetch(assurl, ignore=[500])
    for h3 in root.xpath('//h3[@id="section"]'):
        title=unws(''.join(h3.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'],
                  title.lower().split()[0],
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])
        elif title in ['Accredited assistants (grouping)', 'Local assistants (grouping)',
                       'Service providers', ' Trainees', 'Paying agents (grouping)', 'Paying agents']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'],
                  title.lower(),
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])

    return data
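
parseMember leans on an addif helper that is not shown here; a minimal sketch of what it presumably does (store the key only when the value is non-empty), stated as an assumption:

def addif(d, key, value):
    # assumed behaviour: skip empty lists/strings so the record stays sparse
    if value:
        d[key] = value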
Ejemplo n.º 44
0
def scrape(url):
    try:
        logger.info("scrape " + url)
        tree = fetch(url)
        agents, committees = scrape_actors(tree)
        forecasts = lst2obj((tree.xpath('//table[@id="forecast"]') or [None])[0], forecastFields)
        events = scrape_events(tree)
        procedure = scrape_basic(tree)
        ipext = []
        for ipexd in IPEXMAP.get(procedure["reference"], {}).get("Dates", []):
            skip = False
            for event in forecasts + events:
                if (
                    event["type"] == ipexevents.get(ipexd["type"], {}).get("oeil", "asdf")
                    and event["date"] == ipexd["date"]
                ):
                    skip = True
                    break
            if skip:
                continue
            ipext.append(ipexd)
        allevents = agents + scrape_docs(tree) + events + forecasts + ipext
        other = [x for x in allevents if not x.get("date")]
        allevents = sorted([x for x in allevents if x.get("date")], key=itemgetter("date"))
        allevents = merge_events(allevents, committees)
        res = {
            u"meta": {"source": url, "id": int(url.split("id=")[1]), "timestamp": datetime.datetime.utcnow()},
            u"procedure": procedure,
            u"links": form2obj((tree.xpath('//table[@id="external_links"]') or [None])[0]),
            u"committees": committees,
            u"activities": sorted(allevents, key=itemgetter("date")),
            u"other": other,
        }
        # check for "final act"
        finalas = tree.xpath('//div[@id="final_act"]//a')
        final = {}
        for link in finalas:
            if link.get("class") == "sumbutton":
                try:
                    summary = fetch("http://www.europarl.europa.eu%s" % link.get("href"))
                except:
                    continue
                final["text"] = [tostring(x) for x in summary.xpath('//div[@id="summary"]')]
            else:
                if not "docs" in final:
                    final["docs"] = []
                final["docs"].append({"title": link.xpath("text()")[0].strip(), "url": link.get("href")})
        if final and final.get("docs"):
            res[u"procedure"][u"final"] = final.get("docs", [{}])[0]
            for item in res["activities"]:
                if item.get("type") == u"Final act published in Official Journal":
                    if final.get("text"):
                        item[u"text"] = final["text"]
                    if len(final.get("docs")) > 1:
                        if not "docs" in item:
                            item[u"docs"] = final["docs"]
                        else:
                            item[u"docs"].extend(final["docs"])
                    break
        return res
    except:
        logger.error("%s\n%s" % (url, traceback.format_exc()))
        return
Ejemplo n.º 45
0
def crawler(meps,saver=jdump,threads=4):
    m=Multiplexer(scrape,saver,threads=threads)
    m.start()
    [m.addjob(mepid) for mepid in meps]
    m.finish()
    logger.info('end of crawl')
Ejemplo n.º 46
0
def crawlseq(urls):
    [save(scrape(url), [0, 0]) for url, title in urls]
    logger.info("end of crawl")
Ejemplo n.º 47
0
def crawl(urls, threads=4):
    m=Multiplexer(scrape,save, threads=threads)
    m.start()
    [m.addjob(url) for url, title in urls]
    m.finish()
    logger.info('end of crawl')
Ejemplo n.º 48
0
def crawler(meps, saver=jdump, threads=4, term=current_term):
    m = Multiplexer(scrape, saver, threads=threads)
    m.start()
    [m.addjob(url, data) for url, data in meps(term=term)]
    m.finish()
    logger.info('end of crawl')
Ejemplo n.º 49
0
        ctr=[0,0]
        try:
            saver(scrape(pdf, rapporteur), ctr)
        except:
            # log failed scrapes, then re-raise
            logger.warn("[!] %s failed to scrape: %s" % (datetime.now().isoformat(), pdf))
            #logger.warn(traceback.format_exc())
            raise
        logger.info("%s [i] added/updated: %s/%s" % (datetime.now().isoformat(), ctr[0],ctr[1]))
        stats[0]+=ctr[0]
        stats[1]+=ctr[1]
    logger.info("%s [o] total added/updated: %s/%s" % (datetime.now().isoformat(),stats[0],stats[1]))

if __name__ == "__main__":
    import pprint, sys
    if len(sys.argv)>1:
        if sys.argv[1]=='update':
            crawler(saver=save,update=True)
            sys.exit(0)
        debug=True
        ctr=[0,0]
        while len(sys.argv)>1:
            logger.info(sys.argv[1])
            save(scrape(sys.argv[1], sys.argv[2]), ctr)
            #pprint.pprint(scrape(sys.argv[1], sys.argv[2]))
            del sys.argv[2]
            del sys.argv[1]
        sys.exit(0)
    else:
        crawler(saver=save)
Ejemplo n.º 50
0
def scrape(celexid, path):
    logger.info("scraping %s%s:NOT" % (EURLEXURL, celexid))
    path.reverse()
    (code, lang) = celexid.split(":")[1:3]
    st = 6
    if len(code) > 6:
        if code[6].isalpha(): st = 7
        eurlex = {
            'id': {
                u'celexid': celexid,
                u'sector': code[0],
                u'year': code[1:5],
                u'doctype': code[5:st],
                u'refno': code[st:],
                u'lang': lang,
            }
        }
    else:
        eurlex = {
            'id': {
                u'celexid': celexid,
                u'sector': code[0],
                u'year': code[1:5],
                u'doctype': code[5:6],
                u'lang': lang,
            }
        }
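    # for illustration: a celexid like "CELEX:32012L0013:EN" decomposes into
    # sector "3", year "2012", doctype "L", refno "0013" and lang "EN"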

    try:
        eurlex['id'][u'typeDesc'] = CELEXCODES[code[0]]['Document Types'][
            code[5:st]] if code[5:st] != 'C' else CELEXCODES[code[0]]['Sector']
    except:
        eurlex['id'][u'typeDesc'] = u"Unknown"
        logger.warn("[!] unknown typedesc %s" % celexid)
    eurlex['meta'] = {u'src': "%s%s:NOT" % (EURLEXURL, celexid)}

    root = fetch("%s%s:NOT" % (EURLEXURL, celexid))
    if len(root.xpath('//h1[text()="No documents matching criteria."]')) > 0:
        logger.warn('[!] nothing to scrape here: %s',
                    "%s%s:NOT" % (EURLEXURL, celexid))
        return
    eurlex[u'title'] = unws(
        root.xpath(
            '//h2[text()="Title and reference"]/following-sibling::p/text()')
        [0])
    # dates
    dates = root.xpath('//h2[text()="Dates"]/following-sibling::ul/text()')
    for y in dates:
        if not unws(y): continue
        title, rest = unws(y).split(": ", 1)
        item = {}
        date = rest[:10]
        tail = rest[10:]
        if tail.startswith('; '):
            tail = tail[2:]
        if date == '99/99/9999': item[u'date'] = datetime(9999, 12, 31)
        elif date == '00/00/0000': item[u'date'] = datetime(1, 1, 1)
        elif date == '//': continue
        else:
            try:
                item[u'date'] = datetime.strptime(date, u"%d/%m/%Y")
            except ValueError:
                try:
                    item[u'date'] = datetime.strptime(date, u"%m/%d/%Y")
                except:
                    pass
        if len(tail):
            item['note'] = tail
        try:
            eurlex['dates'][title] = item
        except:
            eurlex['dates'] = {title: item}