コード例 #1
0
def scrape_epagents(table):
    """Extract the European Parliament committee agents listed in ``table``.

    The text of the ``td.players_committee`` heading cell decides the
    ``responsible`` flag: ``True`` for a (former) responsible committee,
    ``False`` for a (former) committee for opinion, ``None`` (plus a logged
    warning) for anything else.  Shadow rapporteurs are collected first,
    grouped by committee acronym, and their markup is removed from the tree
    so that ``lst2obj`` only sees the regular agent rows.

    Side effects: elements around ``a#shadowRapporteurHeader`` are removed
    from the parsed tree.

    Args:
        table: element exposing ``xpath()``; presumably an lxml element of
            the parsed page -- TODO confirm against the caller.

    Returns:
        A duplicate-free list of agent dicts in row order.  Every dict gets
        ``responsible``, ``body`` and ``committee_full``; ``rapporteur``,
        ``opinion`` and ``shadows`` are set when the row provides them.

    Raises:
        IndexError: if the heading cell, a shadow's name or a shadow's
            committee acronym cannot be found.
    """
    heading = ''.join(table.xpath('.//td[@class="players_committee"]')[0].xpath(".//text()")).strip()
    responsible = None
    if heading in ("Committee responsible", "Former committee responsible"):
        responsible = True
    elif heading in ("Committee for opinion", "Former committee for opinion"):
        responsible = False
    else:
        logger.warn(u"[!] unknown committee heading %s" % heading)

    # handle shadows
    # NOTE(review): the expressions below start with `//`, which XPath
    # evaluates from the document root rather than from `table` -- confirm
    # that collecting (and later deleting) shadows document-wide is intended.
    shadowelems = table.xpath('//a[@id="shadowRapporteurHeader"]/../following-sibling::div/p//span[@class="players_rapporter_text"]/a')
    tips = []
    for tip in table.xpath('//a[@id="shadowRapporteurHeader"]/../following-sibling::div//span[@class="tiptip"]'):
        text = tip.xpath('text()')
        # a tooltip without text is resolved through the target of its link
        tips.append(text[0] if text else groupurlmap[tip.xpath("a")[0].get('href')])

    shadows = {}
    for shadow, group in izip_longest(shadowelems, tips):
        if shadow is None:
            # more tooltips than shadow links: only padding is left to pair
            break
        committee = shadow.xpath('./ancestor::td/preceding-sibling::td//acronym/text()')[0]
        name = shadow.xpath('text()')[0]
        mep = {u'name': unicode(name),
               # padding for a missing tooltip must stay None instead of
               # being stringified to u'None'
               u'group': unicode(group) if group is not None else None}
        mepref = getMEPRef(name)
        if mepref:
            mep[u'mepref'] = mepref
        shadows.setdefault(committee, []).append(mep)

    # delete the unnecessary shadow elements - so the following regular
    # lst2obj gets what it expects
    for todel in table.xpath('//a[@id="shadowRapporteurHeader"]/..'):
        parent = todel.xpath('..')[0]
        parent.remove(todel.xpath('following-sibling::div')[0])
        parent.remove(todel)

    # handle each row of agents
    agents = []
    for agent in lst2obj(table, epagents, 1):
        agent[u'responsible'] = responsible
        agent[u'body'] = u'EP'
        if agent.get('rapporteur'):
            meps = []
            for mep in agent['rapporteur']:
                if unws(mep['name']).startswith("The committee decided not to give an opinion"):
                    # Placeholder text, not a person.  Skipping it is enough:
                    # the list is replaced by `meps` below, and deleting from
                    # it while iterating would skip the following entry.
                    agent[u'opinion'] = None
                    continue
                entry = {u'group': mep['group'],
                         u'name': mep['name']}
                mepref = getMEPRef(mep['name'])
                if mepref:
                    entry[u'mepref'] = mepref
                meps.append(entry)
            agent[u'rapporteur'] = meps

        abbr = agent['committee'][:4]
        if abbr == 'BUDE':
            abbr = 'BUDG'
        if abbr not in COMMITTEE_MAP:
            logger.warn(u"[!] uknown committee abbrev %s" % abbr)
            agent[u'committee_full'] = agent['committee']
            # slice rather than index so a name of four characters or fewer
            # does not raise IndexError
            if agent['committee'][4:5] == ' ' and abbr.isalpha():
                agent[u'committee'] = abbr
        else:
            agent[u'committee_full'] = agent['committee'][5:]
            agent[u'committee'] = abbr

        if agent.get(u'committee') in shadows:
            agent[u'shadows'] = shadows[agent['committee']]

        if agent not in agents:
            agents.append(agent)
    return agents
コード例 #2
0
ファイル: oeil.py プロジェクト: JacobOscarson/parltrack
def scrape_epagents(table):
    """Extract the European Parliament committee agents listed in ``table``.

    The text of the ``td.players_committee`` heading cell decides the
    ``responsible`` flag: ``True`` for a (former) responsible committee,
    ``False`` for a (former) committee for opinion, ``None`` (plus a printed
    notice) for anything else.  Tooltip spans are stripped, shadow
    rapporteurs are collected per committee acronym and their markup is
    removed, then the remaining rows are converted with ``lst2obj``.

    Side effects: ``span.tiptip`` elements and the elements around
    ``a#shadowRapporteurHeader`` are removed from the parsed tree; notices
    are written to standard output.

    Args:
        table: element exposing ``xpath()``; presumably an lxml element of
            the parsed page -- TODO confirm against the caller.

    Returns:
        A duplicate-free list of agent dicts in row order.  Every dict gets
        ``responsible``, ``body`` and ``committee_full``; ``committee`` is
        dropped when its abbreviation is not in ``COMMITTEE_MAP``.

    Raises:
        IndexError: if the heading cell or a shadow's committee acronym is
            missing, or if ``getMEPRef`` finds no reference for a shadow or
            rapporteur name.
    """
    heading = "".join(table.xpath('.//td[@class="players_committee"]')[0].xpath(".//text()")).strip()
    responsible = None
    if heading in ("Committee responsible", "Former committee responsible"):
        responsible = True
    elif heading in ("Committee for opinion", "Former committee for opinion"):
        responsible = False
    else:
        # one pre-formatted argument in parentheses prints the same text as
        # the Python 2 statement form and is also valid Python 3 syntax
        print("[!] unknown committee heading %s" % heading)

    # remove tooltips (a plain loop: this is done purely for its side effect)
    for tip in table.xpath('.//span[@class="tiptip"]'):
        tip.xpath("..")[0].remove(tip)

    # handle shadows
    # NOTE(review): the expressions below start with `//`, which XPath
    # evaluates from the document root rather than from `table` -- confirm
    # that collecting (and later deleting) shadows document-wide is intended.
    shadowelems = table.xpath(
        '//a[@id="shadowRapporteurHeader"]/../following-sibling::div/p//span[@class="players_rapporter_text"]/a'
    )
    shadows = {}
    for shadow in shadowelems:
        committee = shadow.xpath("./ancestor::td/preceding-sibling::td//acronym/text()")[0]
        name = shadow.xpath("text()")[0]
        mepref = getMEPRef(name)
        if not mepref:
            raise IndexError("no MEP reference found for shadow rapporteur %r" % name)
        shadows.setdefault(committee, []).append({u"name": name, u"mepref": mepref})

    # delete the unnecessary shadow elements - so the following regular
    # lst2obj gets what it expects
    for todel in table.xpath('//a[@id="shadowRapporteurHeader"]/..'):
        parent = todel.xpath("..")[0]
        parent.remove(todel.xpath("following-sibling::div")[0])
        parent.remove(todel)

    # handle each row of agents
    agents = []
    for agent in lst2obj(table, epagents, 1):
        agent[u"responsible"] = responsible
        agent[u"body"] = "EP"

        # `or [""]` also covers a present-but-empty rapporteur list, which
        # would otherwise raise IndexError on the [0] lookup
        rapporteurs = agent.get("rapporteur") or [""]
        if rapporteurs[0].strip().startswith("The committee decided not to give an opinion"):
            del agent["rapporteur"]
            agent[u"opinion"] = None
        elif agent.get("rapporteur"):
            meps = []
            for mep in agent["rapporteur"]:
                mepref = getMEPRef(mep)
                if not mepref:
                    raise IndexError("no MEP reference found for rapporteur %r" % mep)
                meps.append({u"mepref": mepref, u"name": mep})
            agent[u"rapporteur"] = meps

        abbr = agent["committee"][:4]
        if abbr not in COMMITTEE_MAP:
            print("[!] uknown committee abbrev %s" % abbr)
            agent[u"committee_full"] = agent["committee"]
            del agent["committee"]
        else:
            agent[u"committee_full"] = agent["committee"][4:]
            agent[u"committee"] = abbr

        # agents whose committee was deleted above yield None here and are
        # therefore never matched against the shadows
        if agent.get(u"committee") in shadows:
            agent[u"shadows"] = shadows[agent["committee"]]

        if agent not in agents:
            agents.append(agent)
    return agents
コード例 #3
0
ファイル: oeil.py プロジェクト: jacob414/parltrack
def scrape_epagents(table):
    """Extract the European Parliament committee agents listed in ``table``.

    The heading cell (``td.players_committee``) determines ``responsible``:
    ``True`` for a (former) responsible committee, ``False`` for a (former)
    committee for opinion, and ``None`` with a printed notice otherwise.
    Tooltip spans are removed, shadow rapporteurs are gathered per committee
    acronym and their markup deleted, and the remaining rows are turned into
    dicts by ``lst2obj``.

    Side effects: ``span.tiptip`` elements and the elements around
    ``a#shadowRapporteurHeader`` are removed from the parsed tree; notices
    are written to standard output.

    Args:
        table: element exposing ``xpath()``; presumably an lxml element of
            the parsed page -- TODO confirm against the caller.

    Returns:
        A duplicate-free list of agent dicts in row order.  Every dict gets
        ``responsible``, ``body`` and ``committee_full``; ``committee`` is
        dropped when its abbreviation is not in ``COMMITTEE_MAP``.

    Raises:
        IndexError: if the heading cell or a shadow's committee acronym is
            missing, or if ``getMEPRef`` finds no reference for a shadow or
            rapporteur name.
    """
    heading_cell = table.xpath('.//td[@class="players_committee"]')[0]
    heading = ''.join(heading_cell.xpath(".//text()")).strip()
    responsible = None
    if heading in ("Committee responsible", "Former committee responsible"):
        responsible = True
    elif heading in ("Committee for opinion", "Former committee for opinion"):
        responsible = False
    else:
        # a single pre-formatted argument keeps the Python 2 output unchanged
        # while also being valid Python 3 syntax
        print("[!] unknown committee heading %s" % heading)

    # remove tooltips (explicit loop instead of a throwaway list)
    for tip in table.xpath('.//span[@class="tiptip"]'):
        tip.xpath('..')[0].remove(tip)

    # handle shadows
    # NOTE(review): the expressions below start with `//`, which XPath
    # evaluates from the document root rather than from `table` -- confirm
    # that collecting (and later deleting) shadows document-wide is intended.
    shadowelems = table.xpath(
        '//a[@id="shadowRapporteurHeader"]/../following-sibling::div/p//span[@class="players_rapporter_text"]/a'
    )
    shadows = {}
    for shadow in shadowelems:
        committee = shadow.xpath(
            './ancestor::td/preceding-sibling::td//acronym/text()')[0]
        name = shadow.xpath('text()')[0]
        mepref = getMEPRef(name)
        if not mepref:
            raise IndexError(
                "no MEP reference found for shadow rapporteur %r" % name)
        shadows.setdefault(committee, []).append({
            u'name': name,
            u'mepref': mepref,
        })

    # delete the unnecessary shadow elements - so the following regular
    # lst2obj gets what it expects
    for todel in table.xpath('//a[@id="shadowRapporteurHeader"]/..'):
        parent = todel.xpath('..')[0]
        parent.remove(todel.xpath('following-sibling::div')[0])
        parent.remove(todel)

    # handle each row of agents
    agents = []
    for agent in lst2obj(table, epagents, 1):
        agent[u'responsible'] = responsible
        agent[u'body'] = 'EP'

        # fall back to [''] for a missing *or empty* rapporteur list so the
        # [0] lookup below cannot raise IndexError
        rapporteurs = agent.get('rapporteur') or ['']
        if rapporteurs[0].strip().startswith(
                "The committee decided not to give an opinion"):
            del agent['rapporteur']
            agent[u'opinion'] = None
        elif agent.get('rapporteur'):
            meps = []
            for mep in agent['rapporteur']:
                mepref = getMEPRef(mep)
                if not mepref:
                    raise IndexError(
                        "no MEP reference found for rapporteur %r" % mep)
                meps.append({u'mepref': mepref, u'name': mep})
            agent[u'rapporteur'] = meps

        abbr = agent['committee'][:4]
        if abbr not in COMMITTEE_MAP:
            print("[!] uknown committee abbrev %s" % abbr)
            agent[u'committee_full'] = agent['committee']
            del agent['committee']
        else:
            agent[u'committee_full'] = agent['committee'][4:]
            agent[u'committee'] = abbr

        # agents whose committee was deleted above yield None here and are
        # therefore never matched against the shadows
        if agent.get(u'committee') in shadows:
            agent[u'shadows'] = shadows[agent['committee']]

        if agent not in agents:
            agents.append(agent)
    return agents
コード例 #4
0
ファイル: oeil.py プロジェクト: ehj/parltrack
def scrape_epagents(table):
    """Extract the European Parliament committee agents listed in ``table``.

    The text of the ``td.players_committee`` heading cell decides the
    ``responsible`` flag: ``True`` for a (former) responsible committee,
    ``False`` for a (former) committee for opinion, ``None`` (plus a logged
    warning) for anything else.  Shadow rapporteurs are paired positionally
    with the ``span.tiptip`` group tooltips, grouped by committee acronym,
    and their markup is removed from the tree before ``lst2obj`` converts the
    regular agent rows.

    Side effects: elements around ``a#shadowRapporteurHeader`` are removed
    from the parsed tree.

    Args:
        table: element exposing ``xpath()``; presumably an lxml element of
            the parsed page -- TODO confirm against the caller.

    Returns:
        A duplicate-free list of agent dicts in row order.  Every dict gets
        ``responsible``, ``body`` and ``committee_full``; ``rapporteur``,
        ``opinion`` and ``shadows`` are set when the row provides them.

    Raises:
        IndexError: if the heading cell, a shadow's name or a shadow's
            committee acronym is missing, or if ``getMEPRef`` finds no
            reference for a shadow or rapporteur name.
    """
    heading_cell = table.xpath('.//td[@class="players_committee"]')[0]
    heading = ''.join(heading_cell.xpath(".//text()")).strip()
    responsible = None
    if heading in ("Committee responsible", "Former committee responsible"):
        responsible = True
    elif heading in ("Committee for opinion", "Former committee for opinion"):
        responsible = False
    else:
        logger.warn(u"[!] unknown committee heading %s" % heading)

    # handle shadows
    # NOTE(review): the expressions below start with `//`, which XPath
    # evaluates from the document root rather than from `table` -- confirm
    # that collecting (and later deleting) shadows document-wide is intended.
    shadowelems = table.xpath(
        '//a[@id="shadowRapporteurHeader"]/../following-sibling::div/p//span[@class="players_rapporter_text"]/a'
    )
    tips = []
    for tip in table.xpath(
            '//a[@id="shadowRapporteurHeader"]/../following-sibling::div//span[@class="tiptip"]'
    ):
        text = tip.xpath('text()')
        # a tooltip without text is resolved through the target of its link
        tips.append(
            text[0] if text else groupurlmap[tip.xpath("a")[0].get('href')])

    shadows = {}
    for shadow, group in izip_longest(shadowelems, tips):
        if shadow is None:
            # more tooltips than shadow links: only padding is left to pair
            break
        committee = shadow.xpath(
            './ancestor::td/preceding-sibling::td//acronym/text()')[0]
        if group == 'NI':
            group = u'NI'
        name = shadow.xpath('text()')[0]
        mepref = getMEPRef(name)
        if not mepref:
            raise IndexError(
                "no MEP reference found for shadow rapporteur %r" % name)
        shadows.setdefault(committee, []).append({
            u'name': unicode(name),
            u'group': group,
            u'mepref': mepref,
        })

    # delete the unnecessary shadow elements - so the following regular
    # lst2obj gets what it expects
    for todel in table.xpath('//a[@id="shadowRapporteurHeader"]/..'):
        parent = todel.xpath('..')[0]
        parent.remove(todel.xpath('following-sibling::div')[0])
        parent.remove(todel)

    # handle each row of agents
    agents = []
    for agent in lst2obj(table, epagents, 1):
        agent[u'responsible'] = responsible
        agent[u'body'] = u'EP'
        if agent.get('rapporteur'):
            meps = []
            for mep in agent['rapporteur']:
                if unws(mep['name']).startswith(
                        "The committee decided not to give an opinion"):
                    # Placeholder text, not a person.  Skipping it is enough:
                    # the list is replaced by `meps` below, and deleting from
                    # it while iterating would skip the following entry.
                    agent[u'opinion'] = None
                    continue
                mepref = getMEPRef(mep['name'])
                if not mepref:
                    raise IndexError(
                        "no MEP reference found for rapporteur %r" %
                        mep['name'])
                meps.append({
                    u'mepref': mepref,
                    u'group': mep['group'],
                    u'name': mep['name'],
                })
            agent[u'rapporteur'] = meps

        abbr = agent['committee'][:4]
        if abbr == 'BUDE':
            abbr = 'BUDG'
        if abbr not in COMMITTEE_MAP:
            logger.warn(u"[!] uknown committee abbrev %s" % abbr)
            agent[u'committee_full'] = agent['committee']
            # slice rather than index so a name of four characters or fewer
            # does not raise IndexError
            if agent['committee'][4:5] == ' ' and abbr.isalpha():
                agent[u'committee'] = abbr
        else:
            agent[u'committee_full'] = agent['committee'][5:]
            agent[u'committee'] = abbr

        if agent.get(u'committee') in shadows:
            agent[u'shadows'] = shadows[agent['committee']]

        if agent not in agents:
            agents.append(agent)
    return agents