def parse(url):
    """Build a lookup of the entries linked from the list page at ``url``.

    The page content is obtained from ``cache.cachewp(url)`` and parsed as
    HTML.  Every ``<a>`` that is a direct child of an ``<ol><li>`` item is
    treated as one entry, and its target page is parsed with
    ``parse_ballotwiki_page``.

    Anchors without an ``href``, or whose ``href`` has no final path
    segment (for example one ending in "/"), are skipped because no key
    can be derived for them.

    Returns a dict with three sub-dicts:
        'wp'    -- decoded last path segment of the href mapped to the
                   value returned by ``parse_ballotwiki_page``
        'names' -- anchor text mapped to the entry dict built here
        'links' -- created empty here; ``reps`` is passed to
                   ``parse_ballotwiki_page``
    """
    reps = {
        'wp': {},
        'names': {},
        'links': {},
    }
    d = cache.cachewp(url)
    myparser = lxml.etree.HTMLParser(encoding="utf-8")
    html = lxml.etree.HTML(d, parser=myparser)
    for item in html.xpath("//ol/li"):
        for anchor in item.xpath("a"):
            f_name_link = anchor.get("href")
            f_name_element = anchor.text

            # The key is the last path segment of the href; without one
            # there is nothing to index this entry by, so skip it.
            match = re.search(r"/([^/]+)$", f_name_link) if f_name_link else None
            if match is None:
                continue

            obj = {
                'links': {
                    'homepage': {},
                },
                'link': f_name_link,
                'name': f_name_element,
            }
            link = urllib.unquote(match.group(1))
            link = encode.decode(link)

            # Collect all the links and point them at the entry object.
            reps['wp'][link] = parse_ballotwiki_page(f_name_link, reps, obj)
            reps['names'][f_name_element] = obj

    return reps
Exemple #2
0
def parse_rep():
    """Build a lookup of House members from the Wikipedia member list.

    The printable "Current members of the United States House of
    Representatives" page is obtained from ``cache.cachewp`` and parsed
    as HTML.  Every table row with exactly 10 ``<td>`` cells is treated
    as one member: the name link is read from the fourth cell and the
    district link from the second cell.

    Rows whose name cell yields no usable link are skipped, because no
    key can be derived for them.

    Returns a dict with three sub-dicts:
        'wp'    -- decoded last path segment of the member link mapped to
                   the value returned by ``wiki.parse_wiki_page``
        'names' -- link text mapped to the entry dict built here
        'links' -- created empty here; ``reps`` is passed to
                   ``wiki.parse_wiki_page``
    """
    reps = {
        'wp': {},
        'names': {},
        'links': {},
    }
    d = cache.cachewp(
        'http://en.wikipedia.org/wiki/Current_members_of_the_United_States_House_of_Representatives?printable=yes'
    )

    myparser = lxml.etree.HTMLParser(encoding="utf-8")
    html = lxml.etree.HTML(d, parser=myparser)

    tables = html.xpath("//table")
    table = tables[1]
    # NOTE(review): "//tr" is an absolute XPath, so this walks the rows of
    # the whole document rather than only `table`.  Kept as-is to preserve
    # behaviour; switch to ".//tr" if only this table is intended.
    for r in table.xpath("//tr"):
        data = r.xpath("td")
        if len(data) != 10:
            continue

        f_district = data[1]
        f_name = data[3]
        f_name_link = ""
        f_name_element = ""
        f_district_link = ""
        # When several anchors match, the last one wins.
        for l in f_name.xpath("span/span/a"):
            f_name_link = l.get("href")
            f_name_element = l.text

        for l in f_district.xpath("span/span/a"):
            f_district_link = l.get("href")

        # Without a link ending in a path segment there is no key for
        # this row; skip it instead of failing on a None match.
        match = re.search(r"/([^/]+)$", f_name_link) if f_name_link else None
        if match is None:
            continue

        obj = {
            'links': {
                'homepage': {},
            },
            'link': f_name_link,
            'district': f_district_link,
            'name': f_name_element,
        }
        link = urllib.unquote(match.group(1))
        link = encode.decode(link)

        # Collect all the links and point them at the entry object.
        reps['wp'][link] = wiki.parse_wiki_page(f_name_link, reps, obj)
        reps['names'][f_name_element] = obj

    return reps
def parse_rep():
    """Scrape the Wikipedia list of current U.S. House members.

    The printable list page is obtained from ``cache.cachewp`` and parsed
    as HTML.  Each table row holding exactly 10 ``<td>`` cells is read as
    one member; the member link comes from the ``span/span/a`` anchors of
    the fourth cell and the district link from those of the second cell.

    A row is skipped when its name cell provides no link with a final
    path segment, since the 'wp' key is derived from that segment.

    Returns:
        dict with keys 'wp' (decoded link segment mapped to the result of
        ``wiki.parse_wiki_page``), 'names' (link text mapped to the entry
        dict built here) and 'links' (created empty here; ``reps`` is
        passed to ``wiki.parse_wiki_page``).
    """
    reps = {
        'wp': {},
        'names': {},
        'links': {},
    }
    d = cache.cachewp('http://en.wikipedia.org/wiki/Current_members_of_the_United_States_House_of_Representatives?printable=yes')

    myparser = lxml.etree.HTMLParser(encoding="utf-8")
    html = lxml.etree.HTML(d, parser=myparser)

    tables = html.xpath("//table")
    table = tables[1]
    # NOTE(review): the leading "//" makes this XPath absolute, so rows
    # from every table in the document are visited, not just `table`.
    # Left unchanged on purpose; ".//tr" would restrict it to `table`.
    for r in table.xpath("//tr"):
        data = r.xpath("td")
        if len(data) != 10:
            continue

        f_name_link = ""
        f_name_element = ""
        f_district_link = ""

        # Same "last matching anchor wins" rule as a plain loop would give.
        name_anchors = data[3].xpath("span/span/a")
        if name_anchors:
            f_name_link = name_anchors[-1].get("href")
            f_name_element = name_anchors[-1].text

        district_anchors = data[1].xpath("span/span/a")
        if district_anchors:
            f_district_link = district_anchors[-1].get("href")

        # An empty or missing href cannot be turned into a key; skipping
        # the row avoids calling .group() on a failed match.
        match = re.search(r"/([^/]+)$", f_name_link) if f_name_link else None
        if match is None:
            continue

        obj = {
            'links': {
                'homepage': {},
            },
            'link': f_name_link,
            'district': f_district_link,
            'name': f_name_element,
        }
        link = urllib.unquote(match.group(1))
        link = encode.decode(link)

        # Collect all the links and point them at the entry object.
        reps['wp'][link] = wiki.parse_wiki_page(f_name_link, reps, obj)
        reps['names'][f_name_element] = obj

    return reps
Exemple #4
0
def parse():
    """Build a lookup of senators from the Wikipedia senator list.

    The printable "List of current United States Senators" page is
    obtained from ``cache.cachewp`` and parsed with ``lxml.html``.  Every
    table row with more than 7 ``<td>`` cells is treated as one senator:
    the text of the second and third cells is stored as 'state' and
    'district', and the first link found in the fifth cell is taken as
    the senator's page.

    Rows whose fifth cell contains no link, or whose link has no final
    path segment, are skipped because no key can be derived for them.

    Returns a dict with three sub-dicts:
        'wp'    -- decoded last path segment of the link mapped to the
                   value returned by ``wiki.parse_wiki_page``
        'names' -- link element text mapped to that same value
        'links' -- created empty here; ``reps`` is passed to
                   ``wiki.parse_wiki_page``
    """
    reps = {
        'wp': {},
        'names': {},
        'links': {},
    }
    d = cache.cachewp(
        'http://en.wikipedia.org/wiki/List_of_current_United_States_Senators?printable=yes'
    )
    html = lxml.html.document_fromstring(d)
    tables = html.xpath("//table")
    table = tables[1]
    # NOTE(review): "//tr" is an absolute XPath, so this walks the rows of
    # the whole document rather than only `table`.  Kept as-is to preserve
    # behaviour; switch to ".//tr" if only this table is intended.
    for r in table.xpath("//tr"):
        data = r.xpath("td")
        if len(data) <= 7:
            continue

        f_state = data[1]
        f_class = data[2]
        f_name = data[4]

        # iterlinks() yields (element, attribute, link, pos) tuples; take
        # the first one and skip the row if the cell has no link at all.
        first_link = next(f_name.iterlinks(), None)
        if first_link is None:
            continue
        f_name_element, _, f_name_link, _ = first_link

        # The key is the last path segment of the link target.
        match = re.search(r"/([^/]+)$", f_name_link) if f_name_link else None
        if match is None:
            continue

        obj = {
            'type': 'senate',
            'links': {
                'homepage': {},
            },
            'link': f_name_link,
            'state': f_state.text,
            'district': f_class.text,
            'name': f_name_element.text,
        }

        link = urllib.unquote(match.group(1))
        link = encode.decode(link)

        # Collect all the links and point them at the parsed object.
        obj = wiki.parse_wiki_page(f_name_link, reps, obj)
        reps['wp'][link] = obj
        reps['names'][f_name_element.text] = obj

    return reps
def parse():
    """Scrape the Wikipedia list of current U.S. senators.

    The printable list page is obtained from ``cache.cachewp`` and parsed
    with ``lxml.html``.  Each table row with more than 7 ``<td>`` cells is
    read as one senator: cells two and three supply the 'state' and
    'district' text, and the first link inside cell five is used as the
    senator's page.

    A row is skipped when cell five has no link or the link has no final
    path segment, since the 'wp' key is derived from that segment.

    Returns:
        dict with keys 'wp' (decoded link segment mapped to the result of
        ``wiki.parse_wiki_page``), 'names' (link element text mapped to
        that same result) and 'links' (created empty here; ``reps`` is
        passed to ``wiki.parse_wiki_page``).
    """
    reps = {
        'wp': {},
        'names': {},
        'links': {},
    }
    d = cache.cachewp('http://en.wikipedia.org/wiki/List_of_current_United_States_Senators?printable=yes')
    html = lxml.html.document_fromstring(d)
    tables = html.xpath("//table")
    table = tables[1]
    # NOTE(review): the leading "//" makes this XPath absolute, so rows
    # from every table in the document are visited, not just `table`.
    # Left unchanged on purpose; ".//tr" would restrict it to `table`.
    for r in table.xpath("//tr"):
        data = r.xpath("td")
        if len(data) <= 7:
            continue

        f_state, f_class, f_name = data[1], data[2], data[4]

        # Only the first (element, attribute, link, pos) tuple is needed;
        # the default keeps a link-less cell from raising StopIteration.
        link_info = next(f_name.iterlinks(), None)
        if link_info is None:
            continue
        f_name_element = link_info[0]
        f_name_link = link_info[2]

        # No trailing path segment means no key for this row.
        match = re.search(r"/([^/]+)$", f_name_link) if f_name_link else None
        if match is None:
            continue

        obj = {
            'type': 'senate',
            'links': {
                'homepage': {},
            },
            'link': f_name_link,
            'state': f_state.text,
            'district': f_class.text,
            'name': f_name_element.text,
        }

        link = urllib.unquote(match.group(1))
        link = encode.decode(link)

        # Collect all the links and point them at the parsed object.
        obj = wiki.parse_wiki_page(f_name_link, reps, obj)
        reps['wp'][link] = obj
        reps['names'][f_name_element.text] = obj

    return reps
def parse_ballotwiki_page(x, reps, obj):
    """Parse the links of the printable Ballotpedia page at path ``x``.

    ``x`` is appended to ``http://ballotpedia.org``; the content returned
    by ``cache.cachewp`` for that URL is parsed with ``lxml.html`` and
    handed, together with ``reps`` and ``obj``, to
    ``wiki.parse_wiki_page_links``, whose result is returned.
    """
    page_url = 'http://ballotpedia.org%s?printable=yes' % x
    document = lxml.html.document_fromstring(cache.cachewp(page_url))
    return wiki.parse_wiki_page_links(document, reps, obj)
def parse_wiki_page(x, reps, obj):
    """Parse the links of the printable English Wikipedia page at path ``x``.

    ``x`` is appended to ``http://en.wikipedia.org`` with the
    ``action=purge`` and ``printable=yes`` query parameters.  The content
    returned by ``cache.cachewp`` for that URL is parsed with
    ``lxml.html`` and passed, with ``reps`` and ``obj``, to
    ``parse_wiki_page_links``; its result is returned.
    """
    page_url = 'http://en.wikipedia.org%s?action=purge&printable=yes' % x
    document = lxml.html.document_fromstring(cache.cachewp(page_url))
    return parse_wiki_page_links(document, reps, obj)
def parse_wiki_source(x, reps):
    """Parse the raw wikitext of the English Wikipedia page titled ``x``.

    The title is substituted into an ``index.php?...&action=raw`` URL,
    the content returned by ``cache.cachewp`` for that URL is passed with
    ``reps`` to ``parse_wiki_text``, and that call's result is returned.
    """
    raw_text = cache.cachewp(
        'http://en.wikipedia.org/w/index.php?title=%s&action=raw' % x
    )
    return parse_wiki_text(raw_text, reps)