# You should have received a copy of the GNU Affero General Public License # along with parltrack. If not, see <http://www.gnu.org/licenses/>. # (C) 2011 by Adam Tauber, <*****@*****.**> from lxml.html.soupparser import parse import urllib2, urllib, cookielib from string import strip from parltrack.environment import connect_db from parltrack.scrapers.oeil import scrape as oeil_scrape from os.path import realpath, exists, dirname from parltrack.scrapers.mappings import STAGES import sys db = connect_db() URL = 'http://www.europarl.europa.eu/oeil/' LAST_UPDATED_CACHE = "%s/.dossiers_last_updated" % dirname(realpath(__file__)) opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookielib.CookieJar())) #opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookielib.CookieJar()), # urllib2.ProxyHandler({'http': 'http://localhost:8123/'})) opener.addheaders = [('User-agent', 'weurstchen/0.5')] def fetch(url, retries=5): # url to etree try: f=urllib2.urlopen(url) except (urllib2.HTTPError, urllib2.URLError), e: if hasattr(e, 'code') and e.code>=400 and e.code not in [504]:
def index_actor(procedure, db, solr):
    """Add one actor document to the Solr index and commit.

    :param procedure: the actor document (a mapping). It must contain
        ``_id``; every other field is optional. The parameter keeps its
        existing name so current callers are unaffected.
    :param db: database handle. It is not used by this function and is
        accepted only so the call signature stays the same.
    :param solr: object providing ``add(**fields)`` and ``commit()``.
    :raises KeyError: if the document has no ``_id``.
    """
    # The body works on ``actor``; bind it to the argument, otherwise every
    # lookup below fails with NameError.
    actor = procedure

    entry = {
        '_collection': 'actor',
        'key': actor.get('key'),
        '_id': str(actor['_id']),
        'title': actor.get('name'),
    }

    # Optional attributes are copied only when they hold a truthy value.
    for field in ('department', 'group', 'function', 'party', 'state'):
        value = actor.get(field)
        if value:
            entry[field] = value

    # Only the constituency's name is indexed, not the whole sub-document.
    constituency = actor.get('constituency')
    if constituency:
        entry['constituency'] = constituency.get('name')

    # The biography is indexed whenever the key is present, even if empty.
    if 'bio' in actor:
        entry['description'] = actor['bio']

    solr.add(**entry)
    solr.commit()


def index_actors(db, solr):
    """Index every document returned by ``db.actor.find()``."""
    for actor in db.actor.find():
        index_actor(actor, db, solr)


if __name__ == '__main__':
    index_actors(connect_db(), connect_solr())
def index_actor(procedure, db, solr):
    """Add one actor document to the Solr index and commit.

    :param procedure: the actor document (a mapping). It must contain
        ``_id``; every other field is optional. The parameter keeps its
        existing name so current callers are unaffected.
    :param db: database handle. It is not used by this function and is
        accepted only so the call signature stays the same.
    :param solr: object providing ``add(**fields)`` and ``commit()``.
    :raises KeyError: if the document has no ``_id``.
    """
    # The body works on ``actor``; bind it to the argument, otherwise every
    # lookup below fails with NameError.
    actor = procedure

    entry = {
        "_collection": "actor",
        "key": actor.get("key"),
        "_id": str(actor["_id"]),
        "title": actor.get("name"),
    }

    # Optional attributes are copied only when they hold a truthy value.
    for field in ("department", "group", "function", "party", "state"):
        value = actor.get(field)
        if value:
            entry[field] = value

    # Only the constituency's name is indexed, not the whole sub-document.
    constituency = actor.get("constituency")
    if constituency:
        entry["constituency"] = constituency.get("name")

    # The biography is indexed whenever the key is present, even if empty.
    if "bio" in actor:
        entry["description"] = actor["bio"]

    solr.add(**entry)
    solr.commit()


def index_actors(db, solr):
    """Index every document returned by ``db.actor.find()``."""
    for actor in db.actor.find():
        index_actor(actor, db, solr)


if __name__ == "__main__":
    index_actors(connect_db(), connect_solr())
if committee_id: db.news.update(q, {"$addToSet": {"committee": committee_id}}) return doc = etree.parse(url) news = elem_to_dict(doc.find(".")).get("newsdetails") news['sourceURL'] = url db.news.update(q, {"$set": news}, upsert=True) prev = news.get('dokumentInfo', {}).get('previous') if 'dokumentInfo' in news: del news['dokumentInfo'] if committee_id: db.news.update(q, {"$addToSet": {"committee": committee_id}}) pprint(news) if prev is not None: load_news_item(db, prev) def load_news_index(db): doc = etree.parse(AKTUELL_URL) for info_url in doc.findall("//detailsXML"): if not info_url.text or 'impressum' in info_url.text: continue load_news_item(db, info_url.text) if __name__ == '__main__': db = connect_db() #load_news_index(db) load_ausschuss_index(db) load_mdb_index(db)
from pprint import pprint
from parltrack.environment import connect_db, connect_solr


def index_procedure(procedure, db, solr):
    """Print a procedure document, add it to the Solr index and commit.

    :param procedure: procedure document (a mapping). It must contain
        ``_id``; all other fields are read with ``get`` and are indexed
        as ``None`` when absent.
    :param db: database handle; not used by this function.
    :param solr: object providing ``add(**fields)`` and ``commit()``.
    """
    pprint(procedure)

    # (index field, document field) pairs; the two names differ only for
    # the plural source keys "subjects" and "tags".
    field_map = (
        ('finished', 'finished'),
        ('state', 'state'),
        ('reference', 'reference'),
        ('title', 'title'),
        ('parliament', 'parliament'),
        ('session', 'session'),
        ('subject', 'subjects'),
        ('tag', 'tags'),
        ('initiative', 'initiative'),
        ('description', 'description'),
    )

    doc = {'_id': procedure['_id'], '_collection': 'procedure'}
    for index_key, source_key in field_map:
        doc[index_key] = procedure.get(source_key)

    solr.add(**doc)
    solr.commit()


def index_procedures(db, solr):
    """Index every document returned by ``db.procedure.find()``."""
    for item in db.procedure.find():
        index_procedure(item, db, solr)


if __name__ == '__main__':
    index_procedures(connect_db(), connect_solr())