#    You should have received a copy of the GNU Affero General Public License
#    along with parltrack.  If not, see <http://www.gnu.org/licenses/>.

# (C) 2011 by Adam Tauber, <*****@*****.**>


from lxml.html.soupparser import parse
import urllib2, urllib, cookielib
from string import strip
from parltrack.environment import connect_db
from parltrack.scrapers.oeil import scrape as oeil_scrape
from os.path import realpath, exists, dirname
from parltrack.scrapers.mappings import STAGES
import sys

# Module-level database handle, obtained once at import time.
db = connect_db()

# Base URL of the European Parliament "oeil" site.
URL = 'http://www.europarl.europa.eu/oeil/'
# Path of a hidden file located in the same directory as this module.
# NOTE(review): presumably stores the time of the last dossier update -- confirm.
LAST_UPDATED_CACHE = "%s/.dossiers_last_updated" % dirname(realpath(__file__))

# URL opener that keeps cookies in an in-memory cookie jar.
# NOTE(review): fetch() below calls urllib2.urlopen() directly rather than
# opener.open(), so this opener (and its User-agent) is not used by it.
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookielib.CookieJar()))
# Alternative opener that additionally routes HTTP through a local proxy:
#opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookielib.CookieJar()),
#                              urllib2.ProxyHandler({'http': 'http://localhost:8123/'}))
# Replace the default headers with a custom User-agent.
opener.addheaders = [('User-agent', 'weurstchen/0.5')]

def fetch(url, retries=5):
    # url to etree
    try:
        f=urllib2.urlopen(url)
    except (urllib2.HTTPError, urllib2.URLError), e:
        if hasattr(e, 'code') and e.code>=400 and e.code not in [504]:
Exemple #2
0
def index_actor(procedure, db, solr):
    """Index a single actor document in Solr.

    Builds a flat entry from the actor document, adds it to Solr and commits
    immediately.

    :param procedure: the actor document (a mapping); must contain ``_id``.
        The parameter keeps its historical name for backward compatibility
        even though it carries an actor, not a procedure.
    :param db: unused here; accepted so the signature matches the caller.
    :param solr: client object providing ``add(**fields)`` and ``commit()``.
    :raises KeyError: if the document has no ``_id`` key.
    """
    # The body used to read an undefined name ``actor`` (NameError on every
    # call); bind it to the argument that actually holds the document.
    actor = procedure

    entry = {
        '_collection': 'actor',
        'key': actor.get('key'),
        '_id': str(actor['_id']),
        'title': actor.get('name'),
    }

    # Optional attributes are only indexed when present and truthy.
    for field in ('department', 'group', 'function', 'party', 'state'):
        value = actor.get(field)
        if value:
            entry[field] = value

    # Only the name of the constituency is indexed, not the whole sub-document.
    constituency = actor.get('constituency')
    if constituency:
        entry['constituency'] = constituency.get('name')

    # The biography is indexed whenever the key exists, even if it is empty.
    if 'bio' in actor:
        entry['description'] = actor['bio']

    solr.add(**entry)
    solr.commit()


def index_actors(db, solr):
    """Index every document of the ``actor`` collection in Solr.

    :param db: database handle exposing ``actor.find()``.
    :param solr: Solr client forwarded to ``index_actor``.
    """
    cursor = db.actor.find()
    for record in cursor:
        index_actor(record, db, solr)


if __name__ == '__main__':
    # Script entry point: index all actors using the handles returned by
    # connect_db() and connect_solr().
    index_actors(connect_db(), connect_solr())
Exemple #3
0
def index_actor(procedure, db, solr):
    """Index a single actor document in Solr.

    Builds a flat entry from the actor document, adds it to Solr and commits
    immediately.

    :param procedure: the actor document (a mapping); must contain ``_id``.
        The parameter keeps its historical name for backward compatibility
        even though it carries an actor, not a procedure.
    :param db: unused here; accepted so the signature matches the caller.
    :param solr: client object providing ``add(**fields)`` and ``commit()``.
    :raises KeyError: if the document has no ``_id`` key.
    """
    # The body used to read an undefined name ``actor`` (NameError on every
    # call); bind it to the argument that actually holds the document.
    actor = procedure

    entry = {
        "_collection": "actor",
        "key": actor.get("key"),
        "_id": str(actor["_id"]),
        "title": actor.get("name"),
    }

    # Optional attributes are only indexed when present and truthy.
    for field in ("department", "group", "function", "party", "state"):
        value = actor.get(field)
        if value:
            entry[field] = value

    # Only the name of the constituency is indexed, not the whole sub-document.
    constituency = actor.get("constituency")
    if constituency:
        entry["constituency"] = constituency.get("name")

    # The biography is indexed whenever the key exists, even if it is empty.
    if "bio" in actor:
        entry["description"] = actor["bio"]

    solr.add(**entry)
    solr.commit()


def index_actors(db, solr):
    """Index every document of the ``actor`` collection in Solr.

    :param db: database handle exposing ``actor.find()``.
    :param solr: Solr client forwarded to ``index_actor``.
    """
    cursor = db.actor.find()
    for record in cursor:
        index_actor(record, db, solr)


if __name__ == "__main__":
    # Script entry point: index all actors using the handles returned by
    # connect_db() and connect_solr().
    index_actors(connect_db(), connect_solr())
Exemple #4
0
        if committee_id:
            db.news.update(q, {"$addToSet": {"committee": committee_id}})
        return
    doc = etree.parse(url)
    news = elem_to_dict(doc.find(".")).get("newsdetails")
    news['sourceURL'] = url
    db.news.update(q, {"$set": news}, upsert=True)
    prev = news.get('dokumentInfo', {}).get('previous')
    if 'dokumentInfo' in news:
        del news['dokumentInfo']
    if committee_id:
        db.news.update(q, {"$addToSet": {"committee": committee_id}})
    pprint(news)
    if prev is not None:
        load_news_item(db, prev)


def load_news_index(db):
    """Load every news item referenced by the index at ``AKTUELL_URL``.

    Parses the index document and calls ``load_news_item`` with the text of
    each ``detailsXML`` element, skipping elements without text and those
    whose text contains ``'impressum'``.

    :param db: database handle forwarded to ``load_news_item``.
    """
    index = etree.parse(AKTUELL_URL)
    for node in index.findall("//detailsXML"):
        link = node.text
        if link and 'impressum' not in link:
            load_news_item(db, link)


if __name__ == '__main__':
    # Script entry point: connect once and reuse the handle for every loader.
    db = connect_db()
    # Loading the news index is currently disabled.
    #load_news_index(db)
    load_ausschuss_index(db)
    load_mdb_index(db)
Exemple #5
0
from pprint import pprint
from parltrack.environment import connect_db, connect_solr


def index_procedure(procedure, db, solr):
    """Index a single procedure document in Solr.

    Pretty-prints the document, copies a fixed set of its fields into a flat
    entry, adds that entry to Solr and commits immediately.

    :param procedure: mapping describing the procedure; must contain ``_id``.
    :param db: unused here; accepted so the signature matches the caller.
    :param solr: client object providing ``add(**fields)`` and ``commit()``.
    :raises KeyError: if ``procedure`` has no ``_id`` key.
    """
    pprint(procedure)
    # NOTE(review): ``_id`` is passed through unconverted; confirm the Solr
    # client can serialise whatever type the database uses for it.
    entry = {'_id': procedure['_id'], '_collection': 'procedure'}

    # (Solr field, document key) pairs.  The plural document keys
    # ``subjects``/``tags`` feed the singular Solr fields; keys missing from
    # the document are indexed as None.  ``session`` used to be assigned
    # twice in a row; the redundant copy is gone.
    field_map = (
        ('finished', 'finished'),
        ('state', 'state'),
        ('reference', 'reference'),
        ('title', 'title'),
        ('parliament', 'parliament'),
        ('session', 'session'),
        ('subject', 'subjects'),
        ('tag', 'tags'),
        ('initiative', 'initiative'),
        ('description', 'description'),
    )
    for solr_field, doc_key in field_map:
        entry[solr_field] = procedure.get(doc_key)

    solr.add(**entry)
    solr.commit()


def index_procedures(db, solr):
    """Index every document of the ``procedure`` collection in Solr.

    :param db: database handle exposing ``procedure.find()``.
    :param solr: Solr client forwarded to ``index_procedure``.
    """
    cursor = db.procedure.find()
    for record in cursor:
        index_procedure(record, db, solr)


if __name__ == '__main__':
    # Script entry point: index all procedures using the handles returned by
    # connect_db() and connect_solr().
    index_procedures(connect_db(), connect_solr())
Exemple #6
0
from pprint import pprint
from parltrack.environment import connect_db, connect_solr

def index_procedure(procedure, db, solr):
    """Index a single procedure document in Solr.

    Pretty-prints the document, copies a fixed set of its fields into a flat
    entry, adds that entry to Solr and commits immediately.

    :param procedure: mapping describing the procedure; must contain ``_id``.
    :param db: unused here; accepted so the signature matches the caller.
    :param solr: client object providing ``add(**fields)`` and ``commit()``.
    :raises KeyError: if ``procedure`` has no ``_id`` key.
    """
    pprint(procedure)
    # NOTE(review): ``_id`` is passed through unconverted; confirm the Solr
    # client can serialise whatever type the database uses for it.
    entry = {'_id': procedure['_id'], '_collection': 'procedure'}

    # (Solr field, document key) pairs.  The plural document keys
    # ``subjects``/``tags`` feed the singular Solr fields; keys missing from
    # the document are indexed as None.  ``session`` used to be assigned
    # twice in a row; the redundant copy is gone.
    field_map = (
        ('finished', 'finished'),
        ('state', 'state'),
        ('reference', 'reference'),
        ('title', 'title'),
        ('parliament', 'parliament'),
        ('session', 'session'),
        ('subject', 'subjects'),
        ('tag', 'tags'),
        ('initiative', 'initiative'),
        ('description', 'description'),
    )
    for solr_field, doc_key in field_map:
        entry[solr_field] = procedure.get(doc_key)

    solr.add(**entry)
    solr.commit()

def index_procedures(db, solr):
    """Index every document of the ``procedure`` collection in Solr.

    :param db: database handle exposing ``procedure.find()``.
    :param solr: Solr client forwarded to ``index_procedure``.
    """
    cursor = db.procedure.find()
    for record in cursor:
        index_procedure(record, db, solr)

if __name__ == '__main__':
    # Script entry point: index all procedures using the handles returned by
    # connect_db() and connect_solr().
    index_procedures(connect_db(), connect_solr())