Example #1
def getAddress(root):
    res={}
    for div in root.xpath('//div[@id="contextzone"]//div[@class="ep_title"]'):
        # getAddress(map(strip, div.xpath("../..//div[@class='ep_elementcontact']/ul")))
        key=unws(''.join(div.xpath('.//text()')))
        if key not in ['Bruxelles', 'Strasbourg', 'Postal address', 'Luxembourg']:
            continue
        res[key]={}
        if key in ['Bruxelles', 'Strasbourg', 'Luxembourg']:
            tmp=div.xpath('../..//li[@class="ep_phone"]/div/text()')
            if tmp:
                res[key]['Phone'] = unws(tmp[0]).replace('(0)','')
            tmp=div.xpath('../..//li[@class="ep_fax"]/div/text()')
            if tmp:
                res[key]['Fax'] = unws(tmp[0]).replace('(0)','')
        tmp=[unws(x) for x in div.xpath('../..//li[@class="ep_address"]/div/text()') if len(unws(x))]
        if key=='Strasbourg':
            res[key].update(dict(zip(['Organization','Building', 'Office', 'Street','Zip1', 'Zip2'],tmp)))
            res[key]['City']=res[key]['Zip2'].split()[1]
            res[key]['Zip2']=res[key]['Zip2'].split()[0]
            res[key]['building_code']=buildings[res[key]['Building']]
        elif key=='Bruxelles':
            res[key].update(dict(zip(['Organization','Building', 'Office', 'Street','Zip'],tmp)))
            res[key]['City']=res[key]['Zip'].split()[1]
            res[key]['Zip']=res[key]['Zip'].split()[0]
            res[key]['building_code']=buildings[res[key]['Building']]
        elif key=='Luxembourg':
            res[key]['Address']=tmp
        elif key=='Postal address':
            res[key]=tmp
        else:
            logger.error("wtf %s" % key)
    return res
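These examples lean on a handful of helpers that are not shown, most notably unws. From its use above (flattening lists of text nodes into one clean string), a minimal sketch might be the following; this is an assumption, not the project's actual code:

def unws(txt):
    # collapse every run of whitespace into a single space and strip the ends
    return u' '.join(txt.split())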
Example #2
def getAddress(root):
    res={}
    for div in root.xpath('../following-sibling::div[@class="boxcontent " or @class="boxcontent nobordertop"]/ul[@class="contact"]'):
        key=unws(''.join(div.xpath('./preceding-sibling::h4/text()')))
        if key not in ['Bruxelles', 'Strasbourg', 'Postal address', 'Luxembourg']:
            continue
        if key=='Bruxelles': key=u'Brussels'
        elif key=='Postal address': key=u'Postal'
        res[key]={}
        if key in ['Brussels', 'Strasbourg', 'Luxembourg']:
            tmp=div.xpath('./following-sibling::ul[@class="link_collection_noborder"]//span[@class="phone"]/text()')
            if tmp:
                res[key][u'Phone'] = unws(tmp[0]).replace('(0)','')
            tmp=div.xpath('./following-sibling::ul[@class="link_collection_noborder"]//span[@class="fax"]/text()')
            if tmp:
                res[key][u'Fax'] = unws(tmp[0]).replace('(0)','')
        tmp=[unws(x) for x in div.xpath('./li[@class="address"]//text()') if len(unws(x))]
        if key=='Strasbourg':
            res[key][u'Address']=dict(zip([u'Organization',u'Building', u'Office', u'Street',u'Zip1', u'Zip2'],tmp))
            res[key][u'Address']['City']=res[key]['Address']['Zip2'].split()[1]
            res[key][u'Address']['Zip2']=res[key]['Address']['Zip2'].split()[0]
            res[key][u'Address']['building_code']=buildings.get(res[key]['Address']['Building'])
        elif key=='Brussels':
            res[key][u'Address']=dict(zip([u'Organization',u'Building', u'Office', u'Street',u'Zip'],tmp))
            res[key][u'Address']['City']=res[key]['Address']['Zip'].split()[1]
            res[key][u'Address']['Zip']=res[key]['Address']['Zip'].split()[0]
            res[key][u'Address']['building_code']=buildings.get(res[key]['Address']['Building'])
        elif key=='Luxembourg':
            res[key][u'Address']=tmp
        elif key=='Postal':
            res[key]=tmp
        else:
            logger.error("wtf %s" % key)
    return res
Example #3
def getMEPGender(id):
    try:
        mepraw = fetch("http://www.europarl.europa.eu/meps/fr/%s/get.html" %
                       (id),
                       ignore=[500])
    except Exception, e:
        logger.error("mepgender %s" % e)
        return 'n/a'
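fetch is likewise assumed: it retrieves a URL and returns a parsed lxml document, with the ignore parameter apparently suppressing the listed HTTP error codes. A hypothetical stand-in built on requests and lxml.html, not the project's real implementation:

import requests
from lxml.html import fromstring

def fetch(url, ignore=()):
    # hypothetical: GET the page, tolerate status codes listed in ignore,
    # and hand back an lxml element tree for xpath queries
    r = requests.get(url)
    if r.status_code != 200 and r.status_code not in ignore:
        r.raise_for_status()
    return fromstring(r.content)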
Example #4
def getMEPDeclarations(id):
    try:
        dom = fetch(
            "http://www.europarl.europa.eu/meps/en/%s/_declarations.html" %
            (id),
            ignore=[500])
    except Exception, e:
        logger.error("mepdeclaration %s" % e)
        return []
Example #5
def getAddress(root):
    res = {}
    for div in root.xpath('//div[@id="contextzone"]//div[@class="ep_title"]'):
        # getAddress(map(strip, div.xpath("../..//div[@class='ep_elementcontact']/ul")))
        key = unws("".join(div.xpath(".//text()")))
        if key not in ["Bruxelles", "Strasbourg", "Postal address", "Luxembourg"]:
            continue
        if key == "Bruxelles":
            key = u"Brussels"
        elif key == "Postal address":
            key = u"Postal"
        res[key] = {}
        if key in ["Brussels", "Strasbourg", "Luxembourg"]:
            tmp = div.xpath('../..//li[@class="ep_phone"]/div/text()')
            if tmp:
                res[key][u"Phone"] = unws(tmp[0]).replace("(0)", "")
            tmp = div.xpath('../..//li[@class="ep_fax"]/div/text()')
            if tmp:
                res[key][u"Fax"] = unws(tmp[0]).replace("(0)", "")
        tmp = [unws(x) for x in div.xpath('../..//li[@class="ep_address"]/div/text()') if len(unws(x))]
        if key == "Strasbourg":
            res[key][u"Address"] = dict(
                zip([u"Organization", u"Building", u"Office", u"Street", u"Zip1", u"Zip2"], tmp)
            )
            res[key][u"Address"]["City"] = res[key]["Address"]["Zip2"].split()[1]
            res[key][u"Address"]["Zip2"] = res[key]["Address"]["Zip2"].split()[0]
            res[key][u"Address"]["building_code"] = buildings[res[key]["Address"]["Building"]]
        elif key == "Brussels":
            res[key][u"Address"] = dict(zip([u"Organization", u"Building", u"Office", u"Street", u"Zip"], tmp))
            res[key][u"Address"]["City"] = res[key]["Address"]["Zip"].split()[1]
            res[key][u"Address"]["Zip"] = res[key]["Address"]["Zip"].split()[0]
            res[key][u"Address"]["building_code"] = buildings[res[key]["Address"]["Building"]]
        elif key == "Luxembourg":
            res[key][u"Address"] = tmp
        elif key == "Postal":
            res[key] = tmp
        else:
            logger.error("wtf %s" % key)
    return res
Example #6
def parseMember(userid):
    url='http://www.europarl.europa.eu/meps/en/%s/_history.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url, ignore=[500])

    data = {
        u'active': False,
        u'Photo': unicode(urljoin(BASE_URL,"/mepphoto/%s.jpg" % userid)),
        u'meta': {u'url': url}
        }

    mepdiv=root.xpath('//div[@class="zone_info_mep_transparent_mep_details"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(unws(' '.join(mepdiv.xpath('.//li[@class="mep_name"]//text()'))))

    borntxt=mepdiv.xpath('.//span[@class="more_info"]/text()')
    if len(borntxt)>0:
        if unws(borntxt[-1]).startswith('Date of death:'):
            try:
                data[u'Death'] = datetime.strptime(unws(borntxt[-1]), u"Date of death: %d %B %Y")
            except ValueError:
                logger.warn('[!] failed to scrape death data %s' % url)
                logger.warn(traceback.format_exc())
            tmp = borntxt[-2].split(',', 1)
        else:
            tmp = borntxt[-1].split(',', 1)
        if len(tmp)==2:
            (d, p) = tmp
        else:
            d,p = tmp[0], None
        try:
            data[u'Birth'] = { u'date': datetime.strptime(unws(d), u"Date of birth: %d %B %Y")}
        except ValueError:
            logger.warn(traceback.format_exc())
        finally:
            if p:
                if 'Birth' in data:
                    data[u'Birth'][u'place'] = unws(p)
                else:
                    data[u'Birth'] = unws(p)
    else:
        logger.warn('[!] no birth data %s' % url)

    # scrape stuff from right column
    addif(data,u'RSS',[unicode(urljoin(BASE_URL,x.get('href')),'utf8')
                       for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_rss"]')])
    addif(data,u'Homepage',[x.get('href')
                            for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_website"]')])
    addif(data,u'Twitter',[x.get('href')
                           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_twitt"]')])
    addif(data,u'Facebook',[x.get('href')
                           for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_fb"]')])
    addif(data,u'Mail',[x.get('href')[7:].replace('[dot]','.').replace('[at]','@')[::-1]
                        for x in root.xpath('//ul[@class="link_collection_noborder"]/li/a[@class="link_email"]')])
    # contact information
    for span in root.xpath('//div[@id="content_right"]//h3'):
        title=unws(''.join(span.xpath('.//text()')))
        if title == "Contacts":
            addif(data,u'Addresses',getAddress(span))

    # scrape main content
    for section in root.xpath('//div[@id="content_left"]/div[@class="boxcontent nobackground"]/h4'):
        key=unws(''.join(section.xpath('.//text()')))
        if key=="National parties":
            # constituencies
            key='Constituencies'
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, party = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if not key in data: data[key]=[]
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                cstart = party.rfind(' (')
                if party[cstart+2:-1] in SEIRTNUOC:
                    country = party[cstart+2:-1]
                    party = party[:cstart]
                else:
                    logger.warn('unknown country: %s' % party[cstart+2:-1])
                    country='unknown'
                #print etree.tostring(constlm, pretty_print=True)
                data[key].append({
                    u'party':     party,
                    u'country':   country,
                    u'start':     datetime.strptime(unws(start), u"%d.%m.%Y"),
                    u'end':       datetime.strptime(unws(end), u"%d.%m.%Y"),
                    })
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President', 'Observer', 'Quaestor']:
            # memberships in various committees, delegations and EP mgt
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                try:
                    interval, org = line.split(' : ',1)
                except ValueError:
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                item={u'role': key,
                      u'abbr': COMMITTEE_MAP.get(org),
                      u'Organization': org,
                      u'start':     datetime.strptime(unws(start), u"%d.%m.%Y"),
                      u'end':       datetime.strptime(unws(end), u"%d.%m.%Y"),
                      }
                for start, field in orgmaps:
                    if item['abbr'] in COMMITTEE_MAP or item['Organization'].startswith(start):
                        if not field in data: data[field]=[]
                        if field=='Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id']=COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        elif key == u'Political groups':
            for constlm in section.xpath('./following-sibling::ul[@class="events_collection bullets"][1]/li'):
                line=unws(u' '.join([unicode(x) for x in constlm.xpath('.//text()')]))
                interval, org = line.split(' : ',1)
                tmp = org.split(u' - ')
                if len(tmp)>1:
                    org = ' - '.join(tmp[:-1])
                    role = tmp[-1]
                elif org.endswith(' -'):
                        org=org[:-2]
                        role=''
                else:
                    logger.error('[!] political group line %s' % line)
                    continue
                tmp = interval.split(' / ')
                if len(tmp)==2:
                    (start, end) = tmp
                else:
                    start = interval.split()[0]
                    end = "31.12.9999"
                if not u'Groups' in data: data[u'Groups']=[]
                data[u'Groups'].append(
                    {u'role':         role,
                     u'Organization': org,
                     u'country':      COUNTRIES.get(unws(constlm.get('class')).upper(), 'unknown country: %s' % unws(constlm.get('class'))),
                     u'groupid':      group_map[org],
                     u'start':        datetime.strptime(unws(start), u"%d.%m.%Y"),
                     u'end':          datetime.strptime(unws(end), u"%d.%m.%Y"),
                     })
        else:
            logger.error('[!] unknown field %s' % key)

    # sort all lists in descending order
    for fld in ['Constituencies', 'Groups', 'Committees', 'Delegations', 'Staff']:
        if not fld in data: continue
        data[fld]=sorted(data[fld],
                         key=lambda x: x.get('end',x['start']),
                         reverse=True)

    # get CV - page (is on separate http path :/)
    cvurl='http://www.europarl.europa.eu/meps/en/%s/_cv.html' % userid
    root = fetch(cvurl, ignore=[500])
    data[u'CV']=[unws(x) for x in root.xpath('//p[@class="details_cv"]/text()')]

    # get assistants also on a separate page :/
    assurl='http://www.europarl.europa.eu/meps/en/%s/_assistants.html' % userid
    root = fetch(assurl, ignore=[500])
    for h3 in root.xpath('//h3[@id="section"]'):
        title=unws(''.join(h3.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'],
                  title.lower().split()[0],
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])
        elif title in ['Accredited assistants (grouping)', 'Local assistants (grouping)',
                       'Service providers', ' Trainees', 'Paying agents (grouping)', 'Paying agents']:
            if not 'assistants' in data: data['assistants']={}
            addif(data['assistants'],
                  title.lower(),
                  [unws(x) for x in h3.xpath('../following-sibling::div[1]//li/text()')])

    return data
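addif appears throughout as a guard against storing empty values. A plausible one-liner consistent with that usage (again an assumption about unseen code):

def addif(d, key, val):
    # only store the value if it is non-empty
    if val:
        d[key] = val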
Example #7
def scrape(url):
    try:
        logger.info('scrape ' + url)
        tree = fetch(url)
        agents, committees = scrape_actors(tree)
        forecasts = lst2obj((tree.xpath('//table[@id="forecast"]')
                             or [None])[0], forecastFields)
        events = scrape_events(tree)
        procedure = scrape_basic(tree)
        ipext = []
        for ipexd in (IPEXMAP[procedure['reference']] or {}).get('Dates', []):
            skip = False
            for event in forecasts + events:
                if event['type'] == ipexevents.get(ipexd['type'], {}).get(
                        'oeil', 'asdf') and event['date'] == ipexd['date']:
                    skip = True
                    break
            if skip: continue
            ipext.append(ipexd)
        allevents = agents + scrape_docs(tree) + events + forecasts + ipext
        other = [x for x in allevents if not x.get('date')]
        allevents = sorted([x for x in allevents if x.get('date')],
                           key=itemgetter('date'))
        allevents = merge_events(allevents, committees)
        res = {
            u'meta': {
                'source': url,
                'id': int(url.split('id=')[1]),
                'timestamp': datetime.datetime.utcnow()
            },
            u'procedure':
            procedure,
            u'links':
            form2obj((tree.xpath('//table[@id="external_links"]')
                      or [None])[0]),
            u'committees':
            committees,
            u'activities':
            sorted(allevents, key=itemgetter('date')),
            u'other':
            other,
        }
        # check for "final act"
        finalas = tree.xpath('//div[@id="final_act"]//a')
        final = {}
        for link in finalas:
            if link.get('class') == 'sumbutton':
                try:
                    summary = fetch("http://www.europarl.europa.eu%s" %
                                    link.get('href'))
                except:
                    continue
                final['text'] = [
                    tostring(x) for x in summary.xpath('//div[@id="summary"]')
                ]
            else:
                if not 'docs' in final: final['docs'] = []
                final['docs'].append({
                    'title': link.xpath('text()')[0].strip(),
                    'url': link.get('href')
                })
        if final and final.get('docs'):
            res[u'procedure'][u'final'] = final.get('docs', [{}])[0]
            for item in res['activities']:
                if item.get(
                        'type') == u'Final act published in Official Journal':
                    if final.get('text'):
                        item[u'text'] = final['text']
                    if len(final.get('docs')) > 1:
                        if not 'docs' in item:
                            item[u'docs'] = final['docs']
                        else:
                            item[u'docs'].extend(final['docs'])
                    break
        return res
    except:
        logger.error("%s\n%s" % (url, traceback.format_exc()))
        return
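Hypothetical usage, matching the 'id=' query parameter the function parses out of the URL (the procedure id below is purely illustrative):

res = scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?id=556397")
if res:
    print res[u'meta']['id'], len(res[u'activities'])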
Example #8
def scrape(url):
    try:
        logger.info("scrape " + url)
        tree = fetch(url)
        agents, committees = scrape_actors(tree)
        forecasts = lst2obj((tree.xpath('//table[@id="forecast"]') or [None])[0], forecastFields)
        events = scrape_events(tree)
        procedure = scrape_basic(tree)
        ipext = []
        for ipexd in (IPEXMAP[procedure["reference"]] or {}).get("Dates", []):
            skip = False
            for event in forecasts + events:
                if (
                    event["type"] == ipexevents.get(ipexd["type"], {}).get("oeil", "asdf")
                    and event["date"] == ipexd["date"]
                ):
                    skip = True
                    break
            if skip:
                continue
            ipext.append(ipexd)
        allevents = agents + scrape_docs(tree) + events + forecasts + ipext
        other = [x for x in allevents if not x.get("date")]
        allevents = sorted([x for x in allevents if x.get("date")], key=itemgetter("date"))
        allevents = merge_events(allevents, committees)
        res = {
            u"meta": {"source": url, "id": int(url.split("id=")[1]), "timestamp": datetime.datetime.utcnow()},
            u"procedure": procedure,
            u"links": form2obj((tree.xpath('//table[@id="external_links"]') or [None])[0]),
            u"committees": committees,
            u"activities": sorted(allevents, key=itemgetter("date")),
            u"other": other,
        }
        # check for "final act"
        finalas = tree.xpath('//div[@id="final_act"]//a')
        final = {}
        for link in finalas:
            if link.get("class") == "sumbutton":
                try:
                    summary = fetch("http://www.europarl.europa.eu%s" % link.get("href"))
                except:
                    continue
                final["text"] = [tostring(x) for x in summary.xpath('//div[@id="summary"]')]
            else:
                if not "docs" in final:
                    final["docs"] = []
                final["docs"].append({"title": link.xpath("text()")[0].strip(), "url": link.get("href")})
        if final and final.get("docs"):
            res[u"procedure"][u"final"] = final.get("docs", [{}])[0]
            for item in res["activities"]:
                if item.get("type") == u"Final act published in Official Journal":
                    if final.get("text"):
                        item[u"text"] = final["text"]
                    if len(final.get("docs")) > 1:
                        if not "docs" in item:
                            item[u"docs"] = final["docs"]
                        else:
                            item[u"docs"].extend(final["docs"])
                    break
        return res
    except:
        logger.error("%s\n%s" % (url, traceback.format_exc()))
        return
Example #9
def parseMember(userid):
    url='http://www.europarl.europa.eu/meps/en/%s/get.html' % userid
    logger.info("scraping %s" % url)
    root = fetch(url)
    data = {u'active': True, 'meta': {u'url': url}} # return {'active': False}
    mepdiv=root.xpath('//div[@class="ep_elementpeople2"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u'Name'] = mangleName(unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[0]))
    data[u'Photo'] = unicode(urljoin(BASE_URL,mepdiv.xpath('.//span[@class="ep_img"]/img')[0].get('src')),'utf8')
    (d, p) = mepdiv.xpath('.//div[@class="ep_elementtext"]/p/text()')[0].split(',', 1)
    try:
        data[u'Birth'] = { u'date': datetime.strptime(unws(d), "Born on %d %B %Y"),
                           u'place': unws(p) }
    except ValueError:
        logger.warn('[!] failed to scrape birth data %s' % url)
        logger.warn(traceback.format_exc())
    const={u'country': unws(mepdiv.xpath('.//span[@class="ep_country"]/text()')[0])}
    data[u'Constituencies']=[const]
    try:
        const[u'party']=unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[1])
    except IndexError:
        data[u'active']=False
    else:
        group=unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[0])
        data[u'Groups'] = [{ u'role': unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[1]),
                             u'group': group,
                             u'groupid': group_map[group]}]
    cdiv=root.xpath('//div[@class="ep_elementcontact"]')
    if len(cdiv):
        addif(data,u'RSS',[unicode(urljoin(BASE_URL,x.get('href')),'utf8') for x in cdiv[0].xpath('.//li[@class="ep_rss"]//a')])
        addif(data,u'Homepage',[unicode(x.get('href'),'utf8') for x in cdiv[0].xpath('.//li[@class="ep_website"]//a')])
        addif(data,u'Mail',[decodemail(unws(x)) for x in cdiv[0].xpath('.//li[@class="ep_email"]//text()') if len(unws(x))])
    for span in root.xpath('//div[@id="contextzone"]//span[@class="ep_title"]'):
        title=unws(''.join(span.xpath('.//text()')))
        if title in ['Accredited assistants', 'Local assistants']:
            addif(data,title,[unws(x) for x in span.xpath('../../..//li/div/text()')])
    addif(data,u'Addresses',getAddress(root))
    for div in root.xpath('//div[@class="ep_content"]'):
        key=unws(u''.join(div.xpath('.//span[@class="ep_title"]/text()')))
        if not len(key):
            continue
        elif key.lower()=='curriculum vitae':
            data[u'CV'] = [unws(x) for x in div.xpath('.//div[@class="ep_elementtext"]//li/div/text()')]
        elif key in ['Member', 'Substitute', 'Chair', 'Vice-Chair', 'Co-President', 'President', 'Vice-President']:
            for span in div.xpath('.//span[@class="commission_label"]'):
                item={u'role': key,
                      u'abbr': unws(''.join(span.xpath('text()'))),
                      u'Organization': unws(span.tail)}
                for start, field in orgmaps:
                    if item['Organization'].startswith(start):
                        if not field in data: data[field]=[]
                        if field=='Committees' and item['Organization'] in COMMITTEE_MAP:
                            item[u'committee_id']=COMMITTEE_MAP[item['Organization']]
                        data[field].append(item)
                        break
        else:
            logger.error('[!] unknown field %s' % key)
    return data
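decodemail is not shown; Example #6 performs the same de-obfuscation inline (undo the [dot]/[at] substitutions and reverse the string), so presumably decodemail wraps that logic. A sketch under that assumption:

def decodemail(txt):
    # assumption: the site obfuscates addresses by spelling out '.' and '@'
    # and reversing the whole string, as seen inline in Example #6
    return txt.replace('[dot]', '.').replace('[at]', '@')[::-1]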
Example #10
def getMEPDeclarations(id):
    try:
        dom = fetch("http://www.europarl.europa.eu/meps/en/%s/_declarations.html" % (id), ignore=[500])
    except Exception, e:
        logger.error("mepdeclaration %s" % e)
        return []
Example #11
def parseMember(userid):
    url = "http://www.europarl.europa.eu/meps/en/%s/get.html" % userid
    logger.info("scraping %s" % url)
    root = fetch(url, ignore=[500])
    data = {u"active": False, "meta": {u"url": url}}  # return {'active': False}
    mepdiv = root.xpath('//div[@class="ep_elementpeople2"]')
    if len(mepdiv) == 1:
        mepdiv = mepdiv[0]
    else:
        logger.error("len(mepdiv) not 1: %s" % str(list(mepdiv)))
    data[u"Name"] = mangleName(unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[0]))
    data[u"Photo"] = unicode(urljoin(BASE_URL, mepdiv.xpath('.//span[@class="ep_img"]/img')[0].get("src")), "utf8")
    borntxt = mepdiv.xpath('.//div[@class="ep_elementtext"]/p/text()')
    if len(borntxt) > 0:
        (d, p) = borntxt[0].split(",", 1)
        try:
            data[u"Birth"] = {u"date": datetime.strptime(unws(d), u"Born on %d %B %Y"), u"place": unws(p)}
        except ValueError:
            logger.warn("[!] failed to scrape birth data %s" % url)
            logger.warn(traceback.format_exc())
    else:
        logger.warn("[!] no birth data %s" % url)
    const = {u"country": unws(mepdiv.xpath('.//span[@class="ep_country"]/text()')[0]), u"start": datetime(2009, 7, 14)}
    data[u"Constituencies"] = [const]
    try:
        data[u"party"] = unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[1])
    except IndexError:
        pass
    else:
        group = unws(mepdiv.xpath('.//span[@class="ep_group"]/text()')[0])
        try:
            role = unws(mepdiv.xpath('.//span[@class="ep_title"]/text()')[1])
        except IndexError:
            role = u"Member"
        data[u"Groups"] = [{u"role": role, u"Organization": group, u"groupid": group_map[group]}]
    cdiv = root.xpath('//div[@class="ep_elementcontact"]')
    if len(cdiv):
        addif(
            data,
            u"RSS",
            [unicode(urljoin(BASE_URL, x.get("href")), "utf8") for x in cdiv[0].xpath('.//li[@class="ep_rss"]//a')],
        )
        addif(
            data, u"Homepage", [unicode(x.get("href"), "utf8") for x in cdiv[0].xpath('.//li[@class="ep_website"]//a')]
        )
        addif(
            data,
            u"Mail",
            [decodemail(unws(x)) for x in cdiv[0].xpath('.//li[@class="ep_email"]//text()') if len(unws(x))],
        )
    for span in root.xpath('//div[@id="contextzone"]//span[@class="ep_title"]'):
        title = unws("".join(span.xpath(".//text()")))
        if title in ["Accredited assistants", "Local assistants"]:
            if not "assistants" in data:
                data["assistants"] = {}
            addif(
                data["assistants"], title.lower().split()[0], [unws(x) for x in span.xpath("../../..//li/div/text()")]
            )
    addif(data, u"Addresses", getAddress(root))
    for div in root.xpath('//div[@class="ep_content"]'):
        key = unws(u"".join(div.xpath('.//span[@class="ep_title"]/text()')))
        if not len(key):
            continue
        elif key.lower() == "curriculum vitae":
            data[u"CV"] = [unws(x) for x in div.xpath('.//div[@class="ep_elementtext"]//li/div/text()')]
        elif key in ["Member", "Substitute", "Chair", "Vice-Chair", "Co-President", "President", "Vice-President"]:
            for span in div.xpath('.//span[@class="commission_label"]'):
                item = {u"role": key, u"abbr": unws("".join(span.xpath(".//text()"))), u"Organization": unws(span.tail)}
                for start, field in orgmaps:
                    if item["abbr"] in COMMITTEE_MAP or item["Organization"].startswith(start):
                        if not field in data:
                            data[field] = []
                        if field == "Committees" and item["Organization"] in COMMITTEE_MAP:
                            item[u"committee_id"] = COMMITTEE_MAP[item["Organization"]]
                        data[field].append(item)
                        break
        else:
            logger.error("[!] unknown field %s" % key)
    return data
Example #12
def scrape(decl):
    mep_id = decl.split('/')[-1].split('_')[0]
    data = {'mep_id': mep_id, 'url': unicode(decl), 'date': ''}
    logger.info("findecl scraping %s" % mep_id)

    text = getraw(decl).split('\n')
    state = 0
    ptr = 0
    while ptr < len(text):
        # bg: "А Б В Г Д Е  Ж З И"
        # el: "A B Γ Δ E ΣΤ Ζ H Θ"
        if (issectionhead(decl, text, ptr, state, 0, ('A', u'А', 'A'))
                or issectionhead(decl, text, ptr, state, 2, ('C', u'В', u'Γ'))
                or issectionhead(decl, text, ptr, state, 3, ('D', u'Г', u'Δ'))
                or issectionhead(decl, text, ptr, state, 4, ('E', u'Д', u'E'))
                or issectionhead(decl, text, ptr, state, 5,
                                 ('F', u'Е', u'ΣΤ'))):
            # skip to table
            while (text[ptr].split()[-4:] != ['1', '2', '3', '4']):
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] %s table not found' % state)
                    raise IndexError
            start = ptr
            # skip empty lines
            while not text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] %s fail skip empty lines' % state)
                    raise IndexError
            while True:
                if ptr > len(text):
                    logger.error('[meh] fail past end of block %s' % state)
                    raise IndexError
                if (text[ptr].strip() == ''
                        and (text[ptr + 1] in ['1', '']
                             or text[ptr + 1].strip()[:3] == '1/6')):
                    break
                if text[ptr].startswith(' ' * 20) and (
                        text[ptr].strip()[1] == '/'
                        and text[ptr].strip()[0] in ['2', '3', '4']):
                    break
                ptr += 1
            end = ptr
            state += 1
            #print >> sys.stderr, text[start:end]
            if state == 6:
                t = parse_table_f(text[start:end])
            else:
                t = parse_table(text[start:end])
            data[state_map[state]] = t
            if DEBUG:
                print "\t%s" % ('\n\t'.join(
                    (repr(x) for x in t)) or "none"), state
        elif issectionhead(decl, text, ptr, state, 1, ('B', u'Б', u'B')):
            while len([x for x in text[ptr].split(' ' * 10) if x]) != 2:
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] table B not found')
                    raise IndexError
            start = ptr
            # skip empty lines
            while ptr < len(text) and not text[ptr].split():
                ptr += 1
            while True:
                if ptr > len(text):
                    logger.error('[meh] fail skip empty lines in B')
                    raise IndexError
                if [text[ptr].strip(), text[ptr + 1]] in (['', '1'], ['', '']):
                    break
                if text[ptr].startswith(' ' * 20) and (
                        text[ptr].strip()[1] == '/'
                        and text[ptr].strip()[0] in ['2', '3', '4']):
                    break
                ptr += 1
            end = ptr
            state += 1
            t = parse_table_b(text[start:end])
            if DEBUG:
                print "\t%s" % ('\n\t'.join(
                    (repr(x) for x in t)) or "none"), state
            data[state_map[state]] = t
        elif state == 6:
            while not issectionhead(decl, text, ptr, state, 6,
                                    ('G', u'Ж', u'Ζ')):
                ptr += 1
            # skip continuation lines
            while text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] continuation in G fail')
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] fail skip empty lines in G')
                    raise IndexError
            gstart = ptr
            state += 1
            while not issectionhead(decl, text, ptr, state, 7,
                                    ('H', u'З', u'H')):
                ptr += 1
            gend = ptr - 1
            if DEBUG:
                print "\t", text[gstart:gend], state
            data[state_map[state]] = '\n'.join(
                x for x in map(unicode.strip, text[gstart:gend]) if x)
            # skip continuation lines
            while text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] continuation in H fail')
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] fail skip empty lines in H')
                    raise IndexError
            hstart = ptr
            state += 1
            while not issectionhead(decl, text, ptr, state, 8,
                                    ('I', u'И', u'Θ')):
                ptr += 1
            hend = ptr - 1
            if DEBUG:
                print "\t", text[hstart:hend], state
            data[state_map[state]] = '\n'.join(
                x for x in map(unicode.strip, text[hstart:hend]) if x)
            # skip continuation lines
            while text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] continuation in I fail')
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] fail skip empty lines in I')
                    raise IndexError
            istart = ptr
            while True:
                tmp = text[ptr].split()
                if len(tmp) == 3:
                    data['date'] = tmp[1]
                    del tmp[1]
                    if tmp in iendsigs:
                        break
                elif len(tmp) == 5:
                    # date=tmp[2] could be preserved in data
                    del tmp[2]
                    if tmp in [['Date', ':', 'Signature', ':']]:
                        break
                ptr += 1
                if ptr >= len(text):
                    logger.error('[meh] fail find end in I')
                    if DEBUG:
                        print 'meh\n>>>%s' % '\n>>>'.join(
                            text[istart:istart + 14]).encode('utf8')
                    raise IndexError
            state += 1
            if DEBUG:
                print >> sys.stderr, state
                #print >> sys.stderr, "\t", text[istart:ptr], state
            data[state_map[state]] = '\n'.join(
                x for x in map(unicode.strip, text[istart:ptr]) if x)
        #else:
        #print >> sys.stderr, '>>>>>>>>', line.encode('utf8')
        ptr += 1
    if state != 9:
        print >> sys.stderr, '>>>>>>>>', "wtfwtf", state
        logger.error('[wtf] did not reach final state %s' % state)
        return {}
    else:
        return data
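The function is a nine-state machine over the declaration's sections A through I; state_map is assumed to map each completed state to an output field name. Only 'occupation' is attested elsewhere (Example #16), so the rest of this sketch is purely illustrative:

# hypothetical mapping of parser states to output fields; only the
# 'occupation' entry is confirmed by these examples
state_map = {1: u'occupation', 2: u'mandates', 3: u'activity',
             4: u'membership', 5: u'occasional', 6: u'holdings',
             7: u'support', 8: u'financial', 9: u'other'}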
Example #13
def scrape(decl):
    mep_id = decl.split("/")[-1].split("_")[0]
    data = {"mep_id": mep_id, "url": unicode(decl), "date": ""}
    logger.info("findecl scraping %s" % mep_id)

    text = getraw(decl).split("\n")
    state = 0
    ptr = 0
    while ptr < len(text):
        # bg: "А Б В Г Д Е  Ж З И"
        # el: "A B Γ Δ E ΣΤ Ζ H Θ"
        if (
            issectionhead(decl, text, ptr, state, 0, ("A", u"А", "A"))
            or issectionhead(decl, text, ptr, state, 2, ("C", u"В", u"Γ"))
            or issectionhead(decl, text, ptr, state, 3, ("D", u"Г", u"Δ"))
            or issectionhead(decl, text, ptr, state, 4, ("E", u"Д", u"E"))
            or issectionhead(decl, text, ptr, state, 5, ("F", u"Е", u"ΣΤ"))
        ):
            # skip to table
            while text[ptr].split()[-4:] != ["1", "2", "3", "4"]:
                ptr += 1
                if ptr >= len(text):
                    logger.error("[meh] %s table not found" % state)
                    raise IndexError
            start = ptr
            # skip empty lines
            while not text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error("[meh] %s fail skip empty lines" % state)
                    raise IndexError
            while True:
                if ptr > len(text):
                    logger.error("[meh] fail past end of block %s" % state)
                    raise IndexError
                if text[ptr].strip() == "" and (text[ptr + 1] in ["1", ""] or text[ptr + 1].strip()[:3] == "1/6"):
                    break
                if text[ptr].startswith(" " * 20) and (
                    text[ptr].strip()[1] == "/" and text[ptr].strip()[0] in ["2", "3", "4"]
                ):
                    break
                ptr += 1
            end = ptr
            state += 1
            # print >> sys.stderr, text[start:end]
            if state == 6:
                t = parse_table_f(text[start:end])
            else:
                t = parse_table(text[start:end])
            data[state_map[state]] = t
            if DEBUG:
                print "\t%s" % ("\n\t".join((repr(x) for x in t)) or "none"), state
        elif issectionhead(decl, text, ptr, state, 1, ("B", u"Б", u"B")):
            while len([x for x in text[ptr].split(" " * 10) if x]) != 2:
                ptr += 1
                if ptr >= len(text):
                    logger.error("[meh] table B not found")
                    raise IndexError
            start = ptr
            # skip empty lines
            while ptr < len(text) and not text[ptr].split():
                ptr += 1
            while True:
                if ptr > len(text):
                    logger.error("[meh] fail skip empty lines in B")
                    raise IndexError
                if [text[ptr].strip(), text[ptr + 1]] in (["", "1"], ["", ""]):
                    break
                if text[ptr].startswith(" " * 20) and (
                    text[ptr].strip()[1] == "/" and text[ptr].strip()[0] in ["2", "3", "4"]
                ):
                    break
                ptr += 1
            end = ptr
            state += 1
            t = parse_table_b(text[start:end])
            if DEBUG:
                print "\t%s" % ("\n\t".join((repr(x) for x in t)) or "none"), state
            data[state_map[state]] = t
        elif state == 6:
            while not issectionhead(decl, text, ptr, state, 6, ("G", u"Ж", u"Ζ")):
                ptr += 1
            # skip continuation lines
            while text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error("[meh] continuation in G fail")
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error("[meh] fail skip empty lines in G")
                    raise IndexError
            gstart = ptr
            state += 1
            while not issectionhead(decl, text, ptr, state, 7, ("H", u"З", u"H")):
                ptr += 1
            gend = ptr - 1
            if DEBUG:
                print "\t", text[gstart:gend], state
            data[state_map[state]] = "\n".join(x for x in map(unicode.strip, text[gstart:gend]) if x)
            # skip continuation lines
            while text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error("[meh] continuation in H fail")
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error("[meh] fail skip empty lines in H")
                    raise IndexError
            hstart = ptr
            state += 1
            while not issectionhead(decl, text, ptr, state, 8, ("I", u"И", u"Θ")):
                ptr += 1
            hend = ptr - 1
            if DEBUG:
                print "\t", text[hstart:hend], state
            data[state_map[state]] = "\n".join(x for x in map(unicode.strip, text[hstart:hend]) if x)
            # skip continuation lines
            while text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error("[meh] continuation in I fail")
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr += 1
                if ptr >= len(text):
                    logger.error("[meh] fail skip empty lines in I")
                    raise IndexError
            istart = ptr
            while True:
                tmp = text[ptr].split()
                if len(tmp) == 3:
                    data["date"] = tmp[1]
                    del tmp[1]
                    if tmp in iendsigs:
                        break
                elif len(tmp) == 5:
                    # date=tmp[2] could be preserved in data
                    tmpdate = tmp[2]
                    del tmp[2]
                    if tmp in [["Date", ":", "Signature", ":"]]:
                        data["date"] = tmpdate
                        break
                ptr += 1
                if ptr >= len(text):
                    logger.error("[meh] fail find end in I")
                    if DEBUG:
                        print "meh\n>>>%s" % "\n>>>".join(text[istart : istart + 14]).encode("utf8")
                    raise IndexError
            state += 1
            if DEBUG:
                print >> sys.stderr, state
                # print >> sys.stderr, "\t", text[istart:ptr], state
            data[state_map[state]] = "\n".join(x for x in map(unicode.strip, text[istart:ptr]) if x)
        # else:
        # print >> sys.stderr, '>>>>>>>>', line.encode('utf8')
        ptr += 1
    if state != 9:
        print >> sys.stderr, ">>>>>>>>", "wtfwtf", state
        logger.error("[wtf] did not reach final state %s" % state)
        return {}
    else:
        return data
Example #14
                "http://www.europarl.europa.eu/meps/en/28269/Jerzy_BUZEK.html"
            ), None)
        print jdump(
            scrape(
                "http://www.europarl.europa.eu/meps/en/1186/Astrid_LULLING.html"
            ), None)
    elif sys.argv[1] == 'url' and sys.argv[2]:
        print jdump(scrape(sys.argv[2])).encode('utf8')
        sys.exit(0)

    # handle opts
    if 'current' in args:
        newbies = getIncomming()
        meps = get_meps
    elif 'outgoing' in args:
        meps = getOutgoing
    elif 'new' in args:
        newbies = getIncomming()
        meps = get_new
    else:
        logger.error('Need either <current|outgoing|new>')
        sys.exit(0)
    logger.info('\n\tsaver: %s\n\tmeps: %s\n\tseq: %s' %
                (saver, meps, 'seq' in args))
    if 'seq' in args:
        res = seqcrawl(meps, saver=saver, null=null)
        if 'dry' in args:
            print "[%s]" % ',\n'.join(res).encode('utf8')
    else:
        crawler(meps, saver=saver)
Example #15
        sys.exit(0)
        print jdump(scrape("http://www.europarl.europa.eu/meps/en/1934/get.html"),None)
        print jdump(scrape("http://www.europarl.europa.eu/meps/en/28576/get.html"), None)
        print jdump(scrape("http://www.europarl.europa.eu/meps/en/1263/Elmar_BROK.html"), None)
        print jdump(scrape("http://www.europarl.europa.eu/meps/en/96739/Reinhard_B%C3%9CTIKOFER.html"), None)
        print jdump(scrape("http://www.europarl.europa.eu/meps/en/28269/Jerzy_BUZEK.html"), None)
        print jdump(scrape("http://www.europarl.europa.eu/meps/en/1186/Astrid_LULLING.html"), None)
    elif sys.argv[1]=='url' and sys.argv[2]:
        print jdump(scrape(sys.argv[2])).encode('utf8')
        sys.exit(0)

    # handle opts
    if 'current' in args:
        newbies=getIncomming()
        meps=get_meps
    elif 'outgoing' in args:
        meps=getOutgoing
    elif 'new' in args:
        newbies=getIncomming()
        meps=get_new
    else:
        logger.error('Need either <current|outgoing|new>')
        sys.exit(0)
    logger.info('\n\tsaver: %s\n\tmeps: %s\n\tseq: %s' % (saver, meps, 'seq' in args))
    if 'seq' in args:
        res=seqcrawl(meps,saver=saver, null=null)
        if 'dry' in args:
            print "[%s]" % ',\n'.join(res).encode('utf8')
    else:
        crawler(meps,saver=saver)
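jdump presumably JSON-encodes the scraped dict, which requires a default handler for the datetime values the scrapers store; its second argument is passed as None above and is treated here as an unused placeholder. A minimal sketch:

import json
from datetime import datetime

def jdump(obj, _unused=None):
    # hypothetical: serialize to JSON, rendering datetimes as ISO strings
    return json.dumps(obj, indent=2, ensure_ascii=False, sort_keys=True,
                      default=lambda o: o.isoformat() if isinstance(o, datetime) else unicode(o))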
Example #16
def scrape(decl):
    mep_id = decl.split('/')[-1].split('_')[0]
    data = {'mep_id': mep_id, 'url': unicode(decl), 'date': ''}
    logger.info("findecl scraping %s" % mep_id)

    text=getraw(decl).split('\n')
    state=0
    ptr=0
    while ptr<len(text):
        # bg: "А Б В Г Д Е  Ж З И"
        # el: "A B Γ Δ E ΣΤ Ζ H Θ"
        if (issectionhead(decl, text,ptr,state,0,('A',u'А','A')) or
            issectionhead(decl, text,ptr,state,2,('C',u'В',u'Γ')) or
            issectionhead(decl, text,ptr,state,3,('D',u'Г',u'Δ')) or
            issectionhead(decl, text,ptr,state,4,('E',u'Д',u'E')) or
            issectionhead(decl, text,ptr,state,5,('F',u'Е',u'ΣΤ'))):
            # skip to table
            while (text[ptr].split()[-4:]!=['1','2','3','4']):
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] %s table not found' % state)
                    raise IndexError
            start=ptr
            # skip empty lines
            while not text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] %s fail skip empty lines' % state)
                    raise IndexError
            while True:
                if ptr>len(text):
                    logger.error('[meh] fail past end of block %s' % state)
                    raise IndexError
                if (text[ptr].strip()=='' and
                    (text[ptr+1] in ['1',''] or
                    text[ptr+1].strip()[:3] == '1/6')):
                    break
                if text[ptr].startswith(' ' * 20) and (text[ptr].strip()[1]=='/' and
                                                       text[ptr].strip()[0] in ['2','3','4']):
                    break
                ptr+=1
            end=ptr
            state+=1
            #print >> sys.stderr, text[start:end]
            if state == 6:
                t = parse_table_f(text[start:end])
            else:
                t = parse_table(text[start:end])
            data[state_map[state]] = t
            if DEBUG:
                print "\t%s" % ('\n\t'.join((repr(x) for x in t)) or "none"), state
        elif issectionhead(decl, text,ptr,state,1,('B',u'Б', u'B')):
            while len([x for x in text[ptr].split(' ' * 10) if x]) != 2:
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] table B not found')
                    raise IndexError
            start=ptr
            # skip empty lines
            while ptr<len(text) and not text[ptr].split():
                ptr+=1
            while True:
                if ptr>len(text):
                    logger.error('[meh] fail skip empty lines in B')
                    raise IndexError
                if [text[ptr].strip(), text[ptr+1]] in (['','1'], ['','']):
                    break
                if text[ptr].startswith(' ' * 20) and (text[ptr].strip()[1]=='/' and
                                                       text[ptr].strip()[0] in ['2','3','4']):
                    break
                ptr+=1
            end=ptr
            state+=1
            t = parse_table_b(text[start:end])
            if DEBUG:
                print "\t%s" % ('\n\t'.join((repr(x) for x in t)) or "none"), state
            data[state_map[state]] = t
        elif state==6:
            while not issectionhead(decl, text,ptr,state,6,('G',u'Ж',u'Ζ')):
                ptr+=1
            # skip continuation lines
            while text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] continuation in G fail')
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] fail skip empty lines in G')
                    raise IndexError
            gstart=ptr
            state+=1
            while not issectionhead(decl, text,ptr,state,7,('H',u'З',u'H')):
                ptr+=1
            gend=ptr-1
            if DEBUG:
                print "\t", text[gstart:gend], state
            data[state_map[state]] = '\n'.join(x for x in map(unicode.strip, text[gstart:gend]) if x)
            # skip continuation lines
            while text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] continuation in H fail')
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] fail skip empty lines in H')
                    raise IndexError
            hstart=ptr
            state+=1
            while not issectionhead(decl, text,ptr,state,8,('I',u'И',u'Θ')):
                ptr+=1
            hend=ptr-1
            if DEBUG:
                print "\t", text[hstart:hend], state
            data[state_map[state]] = '\n'.join(x for x in map(unicode.strip, text[hstart:hend]) if x)
            # skip continuation lines
            while text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] continuation in I fail')
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] fail skip empty lines in I')
                    raise IndexError
            istart=ptr
            while True:
                tmp = text[ptr].split()
                if len(tmp)==3:
                    data['date']=tmp[1]
                    del tmp[1]
                    if tmp in iendsigs:
                        break
                elif len(tmp)==5:
                    # date=tmp[2] could be preserved in data
                    tmpdate=tmp[2]
                    del tmp[2]
                    if tmp in [['Date', ':','Signature', ':']]:
                        data['date']=tmpdate
                        break
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] fail find end in I')
                    if DEBUG:
                        print 'meh\n>>>%s' % '\n>>>'.join(text[istart:istart+14]).encode('utf8')
                    raise IndexError
            state+=1
            if DEBUG:
                print >> sys.stderr, state
                #print >> sys.stderr, "\t", text[istart:ptr], state
            data[state_map[state]] = '\n'.join(x for x in map(unicode.strip, text[istart:ptr]) if x)
        #else:
            #print >> sys.stderr, '>>>>>>>>', line.encode('utf8')
        ptr+=1
    if state!=9:
        print >> sys.stderr, '>>>>>>>>', "wtfwtf", state
        logger.error('[wtf] did not reach final state %s' % state)
        return {}
    else:
        if (len(data['occupation'])>1 and
            data['occupation'][-1][0] in [u"No occupation held during the three years preceding the current mandate",
                                          u"Καμία επαγγελματική δραστηριότητα κατά τη διάρκεια των τριών ετών που προηγήθηκαν της τρέχουσας εντολής",
                                          u"Atividade Liberal como autor/outras atividades artísticas (remuneração inferior a 500 € na totalidade dos 3 anos anteriores)",
                                          u"Brak działalności zawodowej w okresie trzech lat poprzedzających obecną kadencję",
                                          u"Geen beroep uitgeoefend gedurende de drie jaar voorafgaand aan de huidige zittingsperiode",
                                          u"Nessuna attività svolta durante i tre anni precedenti l'attuale mandato",
                                          u"Keine Berufstätigkeit während des Dreijahreszeitraums vor der laufenden Wahlperiode",
            ]):
            del data['occupation'][-1]
        return data
Example #17
def getMEPGender(id):
    try:
        mepraw=fetch("http://www.europarl.europa.eu/meps/fr/%s/_home.html" % (id), ignore=[500])
    except Exception, e:
        logger.error("mepgender %s" % e)
        return 'n/a'
Example #18
def scrape(url, comid):
    root=fetch(url)
    lines=[x for x in root.xpath('//td[@class="contents"]/div/*') if unws(' '.join(x.xpath('.//text()')))]
    if not len(lines): return
    if not unws(' '.join(lines[2].xpath('.//text()')))=='DRAFT AGENDA':
        logger.error("NOT DRAFT AGENDA %s" % unws(' '.join(lines[2].xpath('.//text()'))))
    agenda={u'committee': comid,
            u'committee_full': unws(' '.join(lines[0].xpath('.//text()'))),
            u'src': url,
        }
    i=1
    if unws(' '.join(lines[3].xpath('.//text()')))=="INTERPARLIAMENTARY COMMITTEE MEETING":
        logger.warn("skipping interparl com meet")
        return
    if unws(' '.join(lines[6].xpath('.//text()'))).startswith('Room'):
            agenda.update({u'docid': unws(' '.join(lines[1].xpath('.//text()'))),
                           u'type': unws(' '.join(lines[3].xpath('.//text()'))),
                           u'time': toTime(unws(' '.join(lines[4].xpath('.//text()')))),
                           u'city': unws(' '.join(lines[5].xpath('.//text()'))),
                           u'room': unws(' '.join(lines[6].xpath('.//text()')))[6:],
                           })
            i=7
    itemcnt=0
    item={}
    schedule=None
    res=[]
    while i < len(lines):
        line=lines[i]
        i+=1
        txt=unws(' '.join(line.xpath('.//text()')))
        if txt in ['* * *', '***']:
            continue # skip end of schedule block

        # 20 December 2011, 16.00 – 16.30
        tmp=toTime(txt)
        if tmp:
            schedule=tmp
            if i<len(lines) and unws(' '.join(lines[i].xpath('.//text()'))) == 'In camera':
                schedule[u'incamera']=True
                i+=1
            continue

        if line.tag=='div':
            item[u'actors']=getactors(line)
            continue
        firsttoken=txt.split()[0]
        # 6. Alternative dispute resolution for consumer disputes and
        #    amending Regulation (EC) No 2006/2004 and Directive
        #    2009/22/EC (Directive on consumer ADR)
        if firsttoken[-1]=='.' and firsttoken[:-1].isdigit() and itemcnt+1==int(firsttoken[:-1]):
            if item: res.append(item)
            itemcnt+=1
            item=copy.deepcopy(agenda)
            item.update({u'title': ' '.join(txt.split()[1:]),
                         u'seq_no': itemcnt,})
            if schedule:
                item.update(schedule)
            continue
        # trailing list of "details"
        # · Presentation by the Commission of the proposal & Impact Assessment
        # · Exchange of views
        if firsttoken==u"·":
            if not 'list' in item: item[u'list']=[]
            tmp=' '.join(txt.split()[1:])
            if tmp.startswith('Deadline for tabling amendments:'):
                try:
                    item[u'tabling_deadline']=datetime.strptime(tmp.split(':')[1].strip(),"%d %B %Y, %H.%M")
                except ValueError:
                    try:
                        item[u'tabling_deadline']=datetime.strptime(tmp.split(':')[1].strip(),"%d.%m.%Y at %H.%M")
                    except ValueError:
                        logger.warn('[$] unknown tabling deadline format %s' % unws(tmp))
            item[u'list'].append(tmp)
            continue
        # committee dossier reference, e.g. IMCO/7/08130 ("7" = 7th parliamentary term)
        if txt.startswith("%s/7/" % comid) and len(txt)==12:
            item[u'comdossier']=txt
            continue
        # ***I    2011/0373(COD)       COM(2011)0793 – C7-0454/2011
        tmp=getdocs(txt)
        if tmp:
            item.update(tmp)
            continue
        # fall-through line
        logger.debug("(falltrough) %s %s" % (line.tag, txt.encode('utf8')))
    if item: res.append(item)
    return res
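
A minimal usage sketch for the agenda scraper above (not part of the original listing); the URL and committee id are placeholders, and fetch, unws, toTime, getactors, getdocs and logger are assumed to come from the surrounding project:

if __name__ == '__main__':
    # hypothetical draft-agenda URL for the IMCO committee
    items = scrape("http://www.europarl.europa.eu/some/draft-agenda.html", "IMCO") or []
    for item in items:
        print item['seq_no'], item['title'].encode('utf8')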
Example #19
def scrape(url):
    try:
        logger.info('scrape '+url)
        tree=fetch(url)
        agents,committees=scrape_actors(tree)
        forecasts=lst2obj((tree.xpath('//table[@id="forecast"]') or [None])[0],forecastFields)
        events=scrape_events(tree)
        procedure=scrape_basic(tree)
        if not procedure: return
        ipext=[]
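        # merge in IPEX dates only when no equivalent OEIL event (same mapped
        # type and same date) is already present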
        for ipexd in IPEXMAP.get(procedure['reference'], {}).get('Dates',[]):
            skip=False
            for event in forecasts+events:
                if event['type'] in ipexevents.get(ipexd['type'],{}).get('oeil',[]) and event['date']==ipexd['date']:
                    skip=True
                    break
            if skip: continue
            ipext.append(ipexd)
        allevents=agents+scrape_docs(tree)+events+forecasts+ipext
        other=[x for x in allevents if not x.get('date')]
        allevents=sorted([x for x in allevents if x.get('date')],key=itemgetter('date'))
        allevents=merge_events(allevents,committees, agents)
        res={u'meta': {'source': url,
                       'timestamp': datetime.datetime.utcnow() },
             u'procedure': procedure,
             u'links': form2obj((tree.xpath('//table[@id="external_links"]') or [None])[0]),
             u'committees': committees,
             u'activities': sorted(allevents, key=itemgetter('date')),
             u'other': other,
             }
        tmp=url.split('id=')
        if len(tmp)>1:
            res['meta']['id']=int(tmp[1])
        # check for "final act"
        finalas=tree.xpath('//div[@id="final_act"]//a')
        final={}
        for link in finalas:
            if link.get('class')=='sumbutton':
                try: summary=fetch("http://www.europarl.europa.eu%s" % link.get('href'))
                except Exception: continue
                final['text']=[unicode(tostring(x)) for x in summary.xpath('//div[@id="summary"]')]
            else:
                if 'docs' not in final: final['docs']=[]
                final['docs'].append({'title': link.xpath('text()')[0].strip(),
                                      'url': link.get('href')})
        if final and final.get('docs'):
            res[u'procedure'][u'final']=final.get('docs',[{}])[0]
            for item in res['activities']:
                if item.get('type')==u'Final act published in Official Journal':
                    if final.get('text'):
                        item[u'text']=final['text']
                    if len(final.get('docs'))>1:
                        if 'docs' not in item:
                            item[u'docs']=final['docs']
                        else:
                            item[u'docs'].extend(final['docs'])
                    break
        return res
    except Exception:
        logger.error("%s\n%s" % (url,traceback.format_exc()))
        return
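
A minimal usage sketch for the procedure scraper above (not part of the original listing); the dossier id is a placeholder, and fetch, scrape_actors, scrape_events, scrape_basic, lst2obj, form2obj, IPEXMAP and merge_events are assumed to come from the surrounding project:

if __name__ == '__main__':
    # hypothetical OEIL procedure page; the numeric id selects a dossier
    res = scrape("http://www.europarl.europa.eu/oeil/popups/ficheprocedure.do?id=123456")
    if res:
        print res['procedure'].get('reference'), len(res['activities'])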
Example #20
def scrape(decl):
    mep_id = decl.split('/')[-1].split('_')[0]
    data = {'mep_id': mep_id, 'url': unicode(decl), 'date': ''}
    logger.info("findecl scraping %s" % mep_id)

    text=getraw(decl).split('\n')
    state=0
    ptr=0
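    # walk the raw text with a small state machine: states 0..8 correspond to
    # sections A..I of the declaration form, and each issectionhead() call
    # passes the section letter in the Latin, Bulgarian and Greek alphabets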
    while ptr<len(text):
        # bg: "А Б В Г Д Е  Ж З И"
        # el: "A B Γ Δ E ΣΤ Ζ H Θ"
        if (issectionhead(decl, text,ptr,state,0,('A',u'А','A')) or
            issectionhead(decl, text,ptr,state,2,('C',u'В',u'Γ')) or
            issectionhead(decl, text,ptr,state,3,('D',u'Г',u'Δ')) or
            issectionhead(decl, text,ptr,state,4,('E',u'Д',u'E')) or
            issectionhead(decl, text,ptr,state,5,('F',u'Е',u'ΣΤ'))):
            # skip ahead to the table, recognised by its trailing "1 2 3 4" column header
            while (text[ptr].split()[-4:]!=['1','2','3','4']):
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] %s table not found' % state)
                    raise IndexError
            start=ptr
            # skip empty lines
            while not text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] %s fail skip empty lines' % state)
                    raise IndexError
            while True:
                if ptr>=len(text):
                    logger.error('[meh] fail past end of block %s' % state)
                    raise IndexError
                if (text[ptr].strip()=='' and
                    (text[ptr+1] in ['1',''] or
                    text[ptr+1].strip()[:3] == '1/6')):
                    break
                # a "2/6".."4/6" page-number footer also ends the block
                if text[ptr].startswith(' ' * 20) and (text[ptr].strip()[1]=='/' and
                                                       text[ptr].strip()[0] in ['2','3','4']):
                    break
                ptr+=1
            end=ptr
            state+=1
            #print >> sys.stderr, text[start:end]
            if state == 6:
                t = parse_table_f(text[start:end])
            else:
                t = parse_table(text[start:end])
            data[state_map[state]] = t
            if DEBUG:
                print "\t%s" % ('\n\t'.join((repr(x) for x in t)) or "none"), state
        elif issectionhead(decl, text,ptr,state,1,('B',u'Б', u'B')):
            while len([x for x in text[ptr].split(' ' * 10) if x]) != 2:
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] table B not found')
                    raise IndexError
            start=ptr
            # skip empty lines
            while ptr<len(text) and not text[ptr].split():
                ptr+=1
            while True:
                if ptr>=len(text):
                    logger.error('[meh] fail skip empty lines in B')
                    raise IndexError
                if [text[ptr].strip(), text[ptr+1]] in (['','1'], ['','']):
                    break
                if text[ptr].startswith(' ' * 20) and (text[ptr].strip()[1]=='/' and
                                                       text[ptr].strip()[0] in ['2','3','4']):
                    break
                ptr+=1
            end=ptr
            state+=1
            t = parse_table_b(text[start:end])
            if DEBUG:
                print "\t%s" % ('\n\t'.join((repr(x) for x in t)) or "none"), state
            data[state_map[state]] = t
        elif state==6:
            while not issectionhead(decl, text,ptr,state,6,('G',u'Ж',u'Ζ')):
                ptr+=1
            # skip continuation lines
            while text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] continuation in G fail')
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] fail skip empty lines in G')
                    raise IndexError
            gstart=ptr
            state+=1
            while not issectionhead(decl, text,ptr,state,7,('H',u'З',u'H')):
                ptr+=1
            gend=ptr-1
            if DEBUG:
                print "\t", text[gstart:gend], state
            data[state_map[state]] = '\n'.join(x for x in map(unicode.strip, text[gstart:gend]) if x)
            # skip continuation lines
            while text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] continuation in H fail')
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] fail skip empty lines in H')
                    raise IndexError
            hstart=ptr
            state+=1
            while not issectionhead(decl, text,ptr,state,8,('I',u'И',u'Θ')):
                ptr+=1
            hend=ptr-1
            if DEBUG:
                print "\t", text[hstart:hend], state
            data[state_map[state]] = '\n'.join(x for x in map(unicode.strip, text[hstart:hend]) if x)
            # skip continuation lines
            while text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] continuation in I fail')
                    raise IndexError
            # skip empty lines
            while not text[ptr].split():
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] fail skip empty lines in I')
                    raise IndexError
            istart=ptr
            while True:
                tmp = text[ptr].split()
                if len(tmp)==3:
                    # store the date only once the remaining tokens match a
                    # known signature-line pattern
                    tmpdate=tmp[1]
                    del tmp[1]
                    if tmp in iendsigs:
                        data['date']=tmpdate
                        break
                elif len(tmp)==5:
                    # remember the date (tmp[2]) before matching the remaining tokens
                    tmpdate=tmp[2]
                    del tmp[2]
                    if tmp in [['Date', ':','Signature', ':']]:
                        data['date']=tmpdate
                        break
                ptr+=1
                if ptr>=len(text):
                    logger.error('[meh] fail find end in I')
                    if DEBUG:
                        print 'meh\n>>>%s' % '\n>>>'.join(text[istart:istart+14]).encode('utf8')
                    raise IndexError
            state+=1
            if DEBUG:
                print >> sys.stderr, state
                #print >> sys.stderr, "\t", text[istart:ptr], state
            data[state_map[state]] = '\n'.join(x for x in map(unicode.strip, text[istart:ptr]) if x)
        #else:
            #print >> sys.stderr, '>>>>>>>>', line.encode('utf8')
        ptr+=1
    if state!=9:
        print >> sys.stderr, '>>>>>>>>', "wtfwtf", state
        logger.error('[wtf] did not reach final state %s' % state)
        return {}
    else:
        # drop a trailing placeholder row: variants of "no occupation held during
        # the three years preceding the current mandate" in the official languages
        if (len(data['occupation'])>1 and
            data['occupation'][-1][0] in [u"No occupation held during the three years preceding the current mandate",
                                          u"Καμία επαγγελματική δραστηριότητα κατά τη διάρκεια των τριών ετών που προηγήθηκαν της τρέχουσας εντολής",
                                          u"Atividade Liberal como autor/outras atividades artísticas (remuneração inferior a 500 € na totalidade dos 3 anos anteriores)",
                                          u"Brak działalności zawodowej w okresie trzech lat poprzedzających obecną kadencję",
                                          u"Geen beroep uitgeoefend gedurende de drie jaar voorafgaand aan de huidige zittingsperiode",
                                          u"Nessuna attività svolta durante i tre anni precedenti l'attuale mandato",
                                          u"Keine Berufstätigkeit während des Dreijahreszeitraums vor der laufenden Wahlperiode",
                                          u"Aucune activité professionnelle au cours des trois années ayant précédé le présent mandat",
                                          u"Sin ocupación durante los tres años anteriores al actual mandato",
                                          u"Intet erhvervsarbejde i de tre år forud for det nuværende mandate",
                                          u"Nicio activitate profesională în ultimii trei ani dinaintea preluării mandatului actual",
                                          u"Har inte utövat någon yrkesmässig verksamhet under de tre år som föregick det nuvarande mandatet",
                                          u"Sem atividade profissional durante os três anos que precederam o atual mandato",
                                          u"Nepostojanje profesionalne djelatnosti tijekom tri godine prije aktualnog mandata",
                                          u"Ei ammatillista toimintaa kolmena nykyistä edustajantointa edeltävänä vuotena",
                                          u"A jelenlegi megbízatást megelőző három évben nem végzett foglalkozást.",
                                          u"Без професионална дейност по време на трите години, предшестващи текущия мандат",
                                          u"Během tří let před současným mandátem jsem nevykonával(a) žádnou profesní činnost.",
            ]):
            del data['occupation'][-1]
        return data
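
A minimal usage sketch for the declaration scraper above (not part of the original listing); the URL is a placeholder, and getraw, issectionhead, parse_table, parse_table_b, parse_table_f, state_map, iendsigs and DEBUG are assumed to come from the surrounding project:

if __name__ == '__main__':
    # hypothetical declaration URL; the filename must start with "<mep_id>_"
    data = scrape("http://www.europarl.europa.eu/mepdif/12345_DFI_rev0_EN.pdf")
    if data:
        print data['mep_id'], data['date']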