Example #1
def save(data, stats):
    res = db.ep_meps2.find_one({"UserID": data["UserID"]}) or {}
    d = diff(
        dict([(k, v) for k, v in res.items() if k not in ["_id", "meta", "changes"]]),
        dict([(k, v) for k, v in data.items() if k not in ["_id", "meta", "changes"]]),
    )
    if d:
        now = datetime.utcnow().replace(microsecond=0)
        if not res:
            logger.info(("adding %s" % (data["Name"]["full"])).encode("utf8"))
            data["meta"]["created"] = now
            if stats:
                stats[0] += 1
        else:
            logger.info(("updating %s" % (data["Name"]["full"])).encode("utf8"))
            logger.warn(jdump(d))
            data["meta"]["updated"] = now
            if stats:
                stats[1] += 1
            data["_id"] = res["_id"]
        data["changes"] = res.get("changes", {})
        data["changes"][now.isoformat()] = d
        db.ep_meps2.save(data)
    if stats:
        return stats
    else:
        return data
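
Both variants of save rely on a diff helper imported elsewhere: it compares the old and new documents (minus the bookkeeping keys) and returns something falsy when nothing changed. The real implementation is not part of this excerpt, so the following is only a minimal sketch consistent with the call sites above:

def diff(old, new):
    """Sketch of the assumed helper: map each top-level key whose
    value differs to its old/new pair; truthy iff anything changed."""
    changed = {}
    for k in set(old) | set(new):
        if old.get(k) != new.get(k):
            changed[k] = {'old': old.get(k), 'new': new.get(k)}
    return changed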
Example #2
def save(data, stats):
    res=db.ep_meps2.find_one({ 'UserID' : data['UserID'] }) or {}
    if 'Gender' not in data and 'Gender' in res: data['Gender']=res['Gender']
    d=diff(dict([(k,v) for k,v in res.items() if k not in ['_id', 'meta', 'changes', 'activities',]]),
           dict([(k,v) for k,v in data.items() if k not in ['_id', 'meta', 'changes', 'activities',]]))
    if d:
        now=datetime.utcnow().replace(microsecond=0)
        if not res:
            logger.info(('adding %s' % (data['Name']['full'])).encode('utf8'))
            data['meta']['created']=now
            if stats: stats[0]+=1
        else:
            logger.info(('updating %s' % (data['Name']['full'])).encode('utf8'))
            logger.warn(jdump(d))
            data['meta']['updated']=now
            if stats: stats[1]+=1
            data['_id']=res['_id']
        data['changes']=res.get('changes',{})
        data['changes'][now.isoformat()]=d
        db.ep_meps2.save(data)
    del res
    if stats: 
        del data
        return stats
    else: return data
Example #3
def save(data, stats):
    if not data: return stats
    src = data['meta']['source']
    res = db.dossiers2.find_one({'meta.source': src}) or {}
    d = diff(
        dict([(k, v) for k, v in res.items()
              if k not in ['_id', 'meta', 'changes']]),
        dict([(k, v) for k, v in data.items() if k not in [
            '_id',
            'meta',
            'changes',
        ]]))
    #logger.warn(pprint.pformat(d))
    if d:
        now = datetime.datetime.utcnow().replace(microsecond=0).isoformat()
        if not res:
            logger.info(('adding %s - %s' %
                         (data['procedure']['reference'],
                          data['procedure']['title'])).encode('utf8'))
            data['meta']['created'] = data['meta']['timestamp']
            del data['meta']['timestamp']
            sys.stdout.flush()
            stats[0] += 1
        else:
            logger.info(('updating %s - %s' %
                         (data['procedure']['reference'],
                          data['procedure']['title'])).encode('utf8'))
            data['meta']['updated'] = data['meta']['timestamp']
            del data['meta']['timestamp']
            sys.stdout.flush()
            stats[1] += 1
            data['_id'] = res['_id']
            logger.info(jdump(d))
        if not NOMAIL:
            m = db.notifications.find(
                {'dossiers': data['procedure']['reference']},
                ['active_emails'])
            for g in m:
                if len(g['active_emails']) == 0:
                    continue
                msg = Message("[PT] %s %s" % (data['procedure']['reference'],
                                              data['procedure']['title']),
                              sender="*****@*****.**",
                              bcc=g['active_emails'])
                #msg.html = htmldiff(data,d)
                msg.body = makemsg(data, d)
                mail.send(msg)
        #logger.info(htmldiff(data,d))
        #logger.info(makemsg(data,d))
        data['changes'] = res.get('changes', {})
        data['changes'][now] = d
        db.dossiers2.save(data)
    return stats
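
The notification body comes from makemsg(data, d), which is also defined outside this excerpt. A hypothetical sketch of the shape implied by the call site (everything beyond data['procedure'] and the diff-as-dict assumption is a guess):

def makemsg(data, d):
    # Hypothetical: one header line, then one line per changed path.
    lines = ['%s %s' % (data['procedure']['reference'],
                        data['procedure']['title']), '']
    for path in sorted(d):
        lines.append('%s: %s' % (path, d[path]))
    return '\n'.join(lines)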
Example #4
def save(data, stats):
    if not data: return stats
    src=data['meta']['source']
    res=db.dossiers2.find_one({ 'meta.source' : src }) or {}
    d=diff(dict([(k,v) for k,v in res.items() if k not in ['_id', 'meta', 'changes']]),
           dict([(k,v) for k,v in data.items() if k not in ['_id', 'meta', 'changes',]]))
    #logger.warn(pprint.pformat(d))
    if d:
        now=datetime.datetime.utcnow().replace(microsecond=0).isoformat()
        if not res:
            logger.info(('adding %s - %s' % (data['procedure']['reference'],data['procedure']['title'])).encode('utf8'))
            data['meta']['created']=data['meta']['timestamp']
            del data['meta']['timestamp']
            sys.stdout.flush()
            stats[0]+=1
        else:
        logger.info(('updating %s - %s' % (data['procedure']['reference'],data['procedure']['title'])).encode('utf8'))
            data['meta']['updated']=data['meta']['timestamp']
            del data['meta']['timestamp']
            sys.stdout.flush()
            stats[1]+=1
            data['_id']=res['_id']
            logger.info(jdump(d))
        if not NOMAIL:
            m=db.notifications.find({'dossiers': data['procedure']['reference']},['active_emails'])
            for g in m:
                if len(g['active_emails'])==0:
                    continue
                msg = Message("[PT] %s %s" % (data['procedure']['reference'],data['procedure']['title']),
                              sender = "*****@*****.**",
                              bcc = g['active_emails'])
                #msg.html = htmldiff(data,d)
                msg.body = makemsg(data,d)
                mail.send(msg)
        #logger.info(htmldiff(data,d))
        #logger.info(makemsg(data,d))
        data['changes']=res.get('changes',{})
        data['changes'][now]=d
        db.dossiers2.save(data)
    return stats
Example #5
                    } for mep in group.xpath('PoliticalGroup.Member.Name')]
                } for group in type.xpath('Result.PoliticalGroup.List')]
            }
        # save
        q = {'title': res['voteid'], 'ts': res['ts']}
        db.ep_votes.update(q, {"$set": res}, upsert=True)
        votes.append(res)
    return votes


if __name__ == '__main__':
    #res = scrape("http://www.europarl.europa.eu/RegData/seance_pleniere/proces_verbal/2014/03-13/votes_nominaux/xml/P7_PV(2014)03-13(RCV)_XC.xml")
    #print jdump(res).encode('utf8')
    #exit(0)
    try:
        year = int(sys.argv[1])
    except (IndexError, ValueError):
        sys.stderr.write('[!] usage: %s [year(2004-2014)]\n' % sys.argv[0])
        sys.exit(1)
    if year >= 2004 and year < 2009:
        map(scrape, crawl(year, 6))
    elif year == 2009:
        map(scrape, crawl(year, 6))
        map(scrape, crawl(year, 7))
    elif year < 2014:
        print jdump(map(scrape, crawl(year, 7))).encode('utf8')
        #map(scrape, crawl(year, 7))
    else:
        map(scrape, crawl(year, 7))
        map(scrape, crawl(year, 8))
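
Note that this __main__ block is Python 2 throughout: print is a statement, and map eagerly returns a list, so map(scrape, crawl(year, 6)) really performs every scrape. Under Python 3 map is lazy and nothing would run; a port would have to force the iteration:

# Python 3 equivalent of the eager Python 2 map above:
for url in crawl(year, 6):
    scrape(url)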
Example #6
 if sys.argv[1]=="full":
     NOMAIL=True
     crawl(get_all_dossiers(), threads=4)
 elif sys.argv[1]=="fullseq":
     NOMAIL=True
     crawlseq(get_all_dossiers(), null=null)
 elif sys.argv[1]=="newseq":
     crawlseq(get_new_dossiers(), null=null)
 elif sys.argv[1]=="new":
     crawl(get_new_dossiers())
 elif sys.argv[1]=="update":
     crawl(get_active_dossiers())
 elif sys.argv[1]=="updateseq":
     crawlseq(get_active_dossiers(), null=null)
 elif sys.argv[1]=="url":
     print jdump(scrape(sys.argv[2])).encode('utf8')
     #res=scrape(sys.argv[2])
     #print >>sys.stderr, pprint.pformat(res)
     #save(res,[0,0])
 elif sys.argv[1]=="test":
     save(scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?id=556397"),[0,0]) # telecoms package
     #pprint.pprint(scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?id=575084"))
     #pprint.pprint(scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?id=589377"))
     #pprint.pprint(scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?id=556208")) # with shadow rapporteurs
     #pprint.pprint(scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?reference=2011/0135(COD)")) # with shadow rapporteurs
     #pprint.pprint(scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?id=593187")) # with shadow rapporteur
     #pprint.pprint(scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?id=556397")) # telecoms package
     sys.exit(0)
     pprint.pprint(scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?id=16542"))
     pprint.pprint(scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?id=584049")) # two rapporteurs in one committee
     pprint.pprint(scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?id=593435")) # with forecast
Example #7
            for mepid in meps
            if (null and db.ep_meps2.find_one({'UserID': mepid},['_id']) is None) or not null]

if __name__ == "__main__":
    if len(sys.argv)<2:
        print "%s full|test|mepid <mepid> [<seq>] [<dry>]" % (sys.argv[0])
        sys.exit(1)
    args=set(sys.argv[1:])
    saver=save
    null=False
    if 'dry' in args:
        saver=jdump
    if 'null' in args:
        null=True

    if sys.argv[1]=="test":
        print jdump(scrape('28215')).encode('utf8')
        print jdump(scrape('113959')).encode('utf8')

        #print jdump(scrape('108570')).encode('utf8')
        #print jdump(scrape('1934')).encode('utf8')
        #print jdump(scrape('96919')).encode('utf8')
        #import code; code.interact(local=locals());
        sys.exit(0)
        print jdump(scrape("http://www.europarl.europa.eu/meps/en/1934/get.html"),None)
        print jdump(scrape("http://www.europarl.europa.eu/meps/en/28576/get.html"), None)
        print jdump(scrape("http://www.europarl.europa.eu/meps/en/1263/Elmar_BROK.html"), None)
        print jdump(scrape("http://www.europarl.europa.eu/meps/en/96739/Reinhard_B%C3%9CTIKOFER.html"), None)
        print jdump(scrape("http://www.europarl.europa.eu/meps/en/28269/Jerzy_BUZEK.html"), None)
        print jdump(scrape("http://www.europarl.europa.eu/meps/en/1186/Astrid_LULLING.html"), None)
    elif sys.argv[1]=='mepid' and len(sys.argv)>2:
        #print saver(scrape(int(sys.argv[2]))).encode('utf8')
Example #8
def seqcrawler(saver=jdump):
    stats=[0,0]
    for u, com in getComAgendas():
        try:
            saver(scrape(u,com), stats)
        except Exception:
            # ignore failed scrapes
            logger.warn("[!] failed to scrape: %s" % u)
            logger.warn(traceback.format_exc())
    logger.info("[o] added/updated: %s/%s" % (stats[0],stats[1]))

if __name__ == "__main__":
    if len(sys.argv)>1:
        if sys.argv[1]=="url":
            print jdump(scrape(sys.argv[2], 'XXXX')).encode('utf8')
            sys.exit(0)
        if sys.argv[1]=="test":
            print jdump([(u,d) for u,d in getComAgendas()])
            #print jdump(scrape('http://www.europarl.europa.eu/sides/getDoc.do?type=COMPARL&reference=LIBE-OJ-20120112-1&language=EN', 'LIBE')).encode('utf8')
            #import code; code.interact(local=locals());
            sys.exit(0)
        elif sys.argv[1]=='url' and len(sys.argv)==4:
            print jdump(scrape(sys.argv[2], sys.argv[3]))
            sys.exit(0)

    # handle opts
    args=set(sys.argv[1:])
    saver=jdump
    if 'save' in args:
        saver=save
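
The saver parameter is what makes the crawler dry-runnable: the same loop either prints JSON (jdump) or persists to MongoDB (save). This implies jdump tolerates the extra stats argument (elsewhere it is called with one argument, so a default parameter is assumed). Typical usage then reduces to:

seqcrawler(saver=jdump)   # dry run: print what would be stored
seqcrawler(saver=save)    # real run: persist and count adds/updates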
Example #9
 if len(sys.argv) < 2:
     print "%s full|fullseq|new|newseq|update|updateseq|url <url>|test" % (sys.argv[0])
     sys.exit(1)
 if sys.argv[1] == "full":
     crawl(get_all_dossiers(), threads=4)
 elif sys.argv[1] == "fullseq":
     crawlseq(get_all_dossiers(), null=null)
 elif sys.argv[1] == "newseq":
     crawlseq(get_new_dossiers(), null=null)
 elif sys.argv[1] == "new":
     crawl(get_new_dossiers())
 elif sys.argv[1] == "update":
     crawl(get_active_dossiers())
 elif sys.argv[1] == "updateseq":
     crawlseq(get_active_dossiers(), null=null)
 elif sys.argv[1] == "url":
     print jdump(scrape(sys.argv[2])).encode('utf8')
     #res=scrape(sys.argv[2])
     #print >>sys.stderr, pprint.pformat(res)
     #save(res,[0,0])
 elif sys.argv[1] == "test":
     save(
         scrape(
             "http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?id=556397"
         ), [0, 0])  # telecoms package
     #pprint.pprint(scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?id=575084"))
     #pprint.pprint(scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?id=589377"))
     #pprint.pprint(scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?id=556208")) # with shadow rapporteurs
     #pprint.pprint(scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?reference=2011/0135(COD)")) # with shadow rapporteurs
     #pprint.pprint(scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?id=593187")) # with shadow rapporteur
     #pprint.pprint(scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?id=556397")) # telecoms package
     sys.exit(0)
Example #10
    for celexid, data in sources("%s/index.htm" % crawlroot, []):
        if (null and db.eurlex.find_one({'id.celexid': celexid}, ['_id'])
                is None) or not null:
            try:
                tmp = saver(scrape(celexid, data), [0, 0])
            except Exception:
                logger.warn("[!] failed to scrape %s" % celexid)
                continue
            yield tmp


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print "%s [<chapter>] [<dry>] [<null>])" % (sys.argv[0])
    elif sys.argv[1] == 'url' and len(sys.argv) > 2:
        print jdump(scrape(sys.argv[2], []))
        sys.exit(0)
    args = set(sys.argv[1:])
    saver = save
    null = False
    if 'dry' in args:
        saver = jdump
    if 'null' in args:
        null = True
    it = crawl(saver=saver, null=null)
    if 'dry' in args:
        print "[\n%s" % it.next()
    for res in it:
        if 'dry' in args:
            print ",\n%s" % res.encode('utf8')
            print ".oOo." * 35
Example #11
    stats = [0, 0]
    for u, com in getComAgendas():
        try:
            saver(scrape(u, com), stats)
        except Exception:
            # ignore failed scrapes
            logger.warn("[!] failed to scrape: %s" % u)
            logger.warn(traceback.format_exc())
    logger.info("[o] added/updated: %s/%s" % (stats[0], stats[1]))


if __name__ == "__main__":
    if len(sys.argv) > 1:
        if sys.argv[1] == "test":
            print jdump(
                scrape(
                    'http://www.europarl.europa.eu/sides/getDoc.do?type=COMPARL&reference=ECON-OJ-20120109-1&language=EN',
                    'ECON')).encode('utf8')
            print jdump(
                scrape(
                    'http://www.europarl.europa.eu/sides/getDoc.do?type=COMPARL&reference=LIBE-OJ-20120112-1&language=EN',
                    'LIBE')).encode('utf8')
            #import code; code.interact(local=locals());
            sys.exit(0)
        elif sys.argv[1] == 'url' and len(sys.argv) == 4:
            print jdump(scrape(sys.argv[2], sys.argv[3]))
            sys.exit(0)

    # handle opts
    args = set(sys.argv[1:])
    saver = jdump
    if 'save' in args:
Example #12
                        print 'meh\n>>>%s' % '\n>>>'.join(text[istart:istart+14]).encode('utf8')
                    raise IndexError
            state+=1
            if DEBUG:
                print >> sys.stderr, state
                #print >> sys.stderr, "\t", text[istart:ptr], state
            data[state_map[state]] = '\n'.join(x for x in map(unicode.strip, text[istart:ptr]) if x)
        #else:
            #print >> sys.stderr, '>>>>>>>>', line.encode('utf8')
        ptr+=1
    if state!=9:
        print >> sys.stderr, '>>>>>>>>', "wtfwtf", state
        logger.error('[wtf] did not reach final state %s' % state)
        return {}
    else:
        if (len(data['occupation'])>1 and
            data['occupation'][-1][0] in [u"No occupation held during the three years preceding the current mandate",
                                          u"Καμία επαγγελματική δραστηριότητα κατά τη διάρκεια των τριών ετών που προηγήθηκαν της τρέχουσας εντολής",
                                          u"Atividade Liberal como autor/outras atividades artísticas (remuneração inferior a 500 € na totalidade dos 3 anos anteriores)",
                                          u"Brak działalności zawodowej w okresie trzech lat poprzedzających obecną kadencję",
                                          u"Geen beroep uitgeoefend gedurende de drie jaar voorafgaand aan de huidige zittingsperiode",
                                          u"Nessuna attività svolta durante i tre anni precedenti l'attuale mandato",
                                          u"Keine Berufstätigkeit während des Dreijahreszeitraums vor der laufenden Wahlperiode",
            ]):
            del data['occupation'][-1]
        return data

if __name__ == "__main__":
    DEBUG=True
    print jdump(scrape(sys.argv[1]))
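
The parser above is a linear state machine over the text lines: each time it reaches the next section marker it joins the accumulated lines under the name state_map[state], and it only returns data if the final state (9) was reached. state_map itself is defined above this excerpt; a stripped-down sketch of the pattern, with placeholder markers rather than the real ones:

def parse_sections(lines, markers):
    # markers: ordered list of (section_name, marker_prefix) pairs;
    # the real scraper's markers are not part of this excerpt.
    data, state, start = {}, -1, 0
    for ptr, line in enumerate(lines):
        if state + 1 < len(markers) and line.startswith(markers[state + 1][1]):
            if state >= 0:
                data[markers[state][0]] = '\n'.join(
                    x for x in (l.strip() for l in lines[start:ptr]) if x)
            state += 1
            start = ptr + 1
    if state != len(markers) - 1:
        return {}   # did not reach the final state, mirroring the code above
    data[markers[state][0]] = '\n'.join(
        x for x in (l.strip() for l in lines[start:]) if x)
    return data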
Example #13
def crawl(saver=jdump, null=False):
    for celexid, data in sources("%s/index.htm" % crawlroot, []):
        if (null and db.eurlex.find_one({'id.celexid': celexid},['_id']) is None) or not null:
            try:
                tmp = saver(scrape(celexid, data),[0,0])
            except Exception:
                logger.warn("[!] failed to scrape %s" % celexid)
                continue
            yield tmp

if __name__ == "__main__":
    if len(sys.argv)<2:
        print "%s [<chapter>] [<dry>] [<null>])" % (sys.argv[0])
    elif sys.argv[1]=='url' and len(sys.argv)>2:
        print jdump(scrape(sys.argv[2],[]))
        sys.exit(0)
    args=set(sys.argv[1:])
    saver=save
    null=False
    if 'dry' in args:
        saver=jdump
    if 'null' in args:
        null=True
    it=crawl(saver=saver, null=null)
    if 'dry' in args:
        print "[\n%s" % it.next()
    for res in it:
        if 'dry' in args:
            print ",\n%s" % res.encode('utf8')
            print ".oOo." * 35
Example #14
                    tmpdate = tmp[2]
                    del tmp[2]
                    if tmp == ["Date", ":", "Signature", ":"]:
                        data["date"] = tmpdate
                        break
                ptr += 1
                if ptr >= len(text):
                    logger.error("[meh] fail find end in I")
                    if DEBUG:
                        print "meh\n>>>%s" % "\n>>>".join(text[istart : istart + 14]).encode("utf8")
                    raise IndexError
            state += 1
            if DEBUG:
                print >> sys.stderr, state
                # print >> sys.stderr, "\t", text[istart:ptr], state
            data[state_map[state]] = "\n".join(x for x in map(unicode.strip, text[istart:ptr]) if x)
        # else:
        # print >> sys.stderr, '>>>>>>>>', line.encode('utf8')
        ptr += 1
    if state != 9:
        print >> sys.stderr, ">>>>>>>>", "wtfwtf", state
        logger.error("[wtf] did not reach final state %s" % state)
        return {}
    else:
        return data


if __name__ == "__main__":
    DEBUG = True
    print jdump(scrape(sys.argv[1]))
Example #15
def seqcrawler(saver=jdump):
    stats=[0,0]
    for u, com in getComAgendas():
        try:
            saver(scrape(u,com), stats)
        except Exception:
            # ignore failed scrapes
            logger.warn("[!] failed to scrape: %s" % u)
            logger.warn(traceback.format_exc())
    logger.info("[o] added/updated: %s/%s" % (stats[0],stats[1]))

if __name__ == "__main__":
    if len(sys.argv)>1:
        if sys.argv[1]=="test":
            print jdump([(u,d) for u,d in getComAgendas()])
            #print jdump(scrape('http://www.europarl.europa.eu/sides/getDoc.do?type=COMPARL&reference=ECON-OJ-20120109-1&language=EN', 'ECON')).encode('utf8')
            #print jdump(scrape('http://www.europarl.europa.eu/sides/getDoc.do?type=COMPARL&reference=LIBE-OJ-20120112-1&language=EN', 'LIBE')).encode('utf8')
            #import code; code.interact(local=locals());
            sys.exit(0)
        elif sys.argv[1]=='url' and len(sys.argv)==4:
            print jdump(scrape(sys.argv[2], sys.argv[3]))
            sys.exit(0)

    # handle opts
    args=set(sys.argv[1:])
    saver=jdump
    if 'save' in args:
        saver=save
    if 'seq' in args:
        res=seqcrawler(saver=saver)
Example #16
def seqcrawler(saver=jdump):
    stats=[0,0]
    for u, com in getComAgendas():
        try:
            saver(scrape(u,com), stats)
        except Exception:
            # ignore failed scrapes
            logger.warn("[!] failed to scrape: %s" % u)
            logger.warn(traceback.format_exc())
    logger.info("[o] added/updated: %s/%s" % (stats[0],stats[1]))

if __name__ == "__main__":
    if len(sys.argv)>1:
        if sys.argv[1]=="test":
            print jdump(scrape('http://www.europarl.europa.eu/sides/getDoc.do?type=COMPARL&reference=ECON-OJ-20120109-1&language=EN', 'ECON')).encode('utf8')
            print jdump(scrape('http://www.europarl.europa.eu/sides/getDoc.do?type=COMPARL&reference=LIBE-OJ-20120112-1&language=EN', 'LIBE')).encode('utf8')
            #import code; code.interact(local=locals());
            sys.exit(0)
        elif sys.argv[1]=='url' and len(sys.argv)==4:
            print jdump(scrape(sys.argv[2], sys.argv[3]))
            sys.exit(0)

    # handle opts
    args=set(sys.argv[1:])
    saver=jdump
    if 'save' in args:
        saver=save
    if 'seq' in args:
        res=seqcrawler(saver=saver)
    else: