import re
import urllib

import lxml.etree
import lxml.html

# cache, encode, and wiki are project-local modules (not shown here):
# cache.cachewp returns a page body, encode.decode normalizes decoded
# page names, and wiki provides the page-level parsers used below.
import cache
import encode
import wiki


def parse(url):
    reps = {
        'wp': {},
        'names': {},
        'links': {},
    }
    d = cache.cachewp(url)
    myparser = lxml.etree.HTMLParser(encoding="utf-8")
    html = lxml.etree.HTML(d, parser=myparser)
    # Ballotpedia lists members as <ol><li><a href=...> entries.
    for r in html.xpath("//ol/li"):
        for l in r.xpath("a"):
            f_name_link = l.get("href")
            f_name_element = l.text
            obj = {
                'links': {
                    'homepage': {}
                },
                'link': f_name_link,
                'name': f_name_element
            }
            # The last path component of the href is the wiki page name.
            link = re.search(r"/([^/]+)$", f_name_link).group(1)
            link = urllib.unquote(link)
            link = encode.decode(link)
            # We are going to collect all the links and point them at the object.
            reps['wp'][link] = parse_ballotwiki_page(f_name_link, reps, obj)
            reps['names'][f_name_element] = obj
    return reps
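# A hedged usage sketch for parse(): the URL below is an illustrative
# assumption, standing in for any Ballotpedia list page whose entries
# are <ol><li><a> items.
#
#   reps = parse('http://ballotpedia.org/wiki/index.php/Some_member_list')
#   for name, obj in reps['names'].items():
#       print name, obj['link']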
def parse_rep():
    reps = {
        'wp': {},
        'names': {},
        'links': {},
    }
    d = cache.cachewp(
        'http://en.wikipedia.org/wiki/Current_members_of_the_United_States_House_of_Representatives?printable=yes'
    )
    myparser = lxml.etree.HTMLParser(encoding="utf-8")
    html = lxml.etree.HTML(d, parser=myparser)
    tables = html.xpath("//table")
    table = tables[1]
    for r in table.xpath("//tr"):
        data = r.xpath("td")
        # Member rows of the House table have exactly 10 cells.
        if len(data) == 10:
            f_district = data[1]
            f_image = data[2]
            f_name = data[3]
            f_name_link = ""
            f_name_element = ""
            f_district_link = ""
            for l in f_name.xpath("span/span/a"):
                f_name_link = l.get("href")
                f_name_element = l.text
            for l in f_district.xpath("span/span/a"):
                f_district_link = l.get("href")
            obj = {
                'links': {
                    # 'congbio' : '',
                    'homepage': {}
                },
                'link': f_name_link,
                'district': f_district_link,
                'name': f_name_element
            }
            link = re.search(r"/([^/]+)$", f_name_link).group(1)
            link = urllib.unquote(link)
            link = encode.decode(link)
            # We are going to collect all the links and point them at the object.
            reps['wp'][link] = wiki.parse_wiki_page(f_name_link, reps, obj)
            reps['names'][f_name_element] = obj
    return reps
def parse():
    reps = {
        'wp': {},
        'names': {},
        'links': {},
    }
    d = cache.cachewp(
        'http://en.wikipedia.org/wiki/List_of_current_United_States_Senators?printable=yes'
    )
    html = lxml.html.document_fromstring(d)
    tables = html.xpath("//table")
    table = tables[1]
    for r in table.xpath("//tr"):
        data = r.xpath("td")
        if len(data) > 7:
            f_state = data[1]
            f_class = data[2]
            f_image = data[3]
            f_name = data[4]
            # iterlinks() yields (element, attribute, link, pos); take
            # the first link in the name cell.
            (f_name_element, skip, f_name_link, skip) = f_name.iterlinks().next()
            obj = {
                'type': 'senate',
                'links': {
                    # 'congbio' : '',
                    'homepage': {}
                },
                'link': f_name_link,
                'state': f_state.text,
                'district': f_class.text,
                'name': f_name_element.text
            }
            link = re.search(r"/([^/]+)$", f_name_link).group(1)
            link = urllib.unquote(link)
            link = encode.decode(link)
            # We are going to collect all the links and point them at the object.
            obj = wiki.parse_wiki_page(f_name_link, reps, obj)
            reps['wp'][link] = obj
            reps['names'][f_name_element.text] = obj
    return reps
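# Shape of the dict the parsers above return (a sketch inferred from
# the code; per-member fields differ between the House and Senate
# variants, e.g. only the Senate objects carry 'type' and 'state'):
#
#   reps = {
#       'wp':    {'Page_name': obj, ...},     # keyed by decoded wiki page name
#       'names': {'Member Name': obj, ...},   # keyed by display name
#       'links': {},                          # passed along to the wiki helpers
#   }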
def parse_ballotwiki_page(x, reps, obj):
    d = cache.cachewp('http://ballotpedia.org%s?printable=yes' % x)
    html = lxml.html.document_fromstring(d)
    return wiki.parse_wiki_page_links(html, reps, obj)
def parse_wiki_page(x, reps, obj):
    d = cache.cachewp('http://en.wikipedia.org%s?action=purge&printable=yes' % x)
    html = lxml.html.document_fromstring(d)
    return parse_wiki_page_links(html, reps, obj)
def parse_wiki_source(x, reps):
    url = 'http://en.wikipedia.org/w/index.php?title=%s&action=raw' % x
    d = cache.cachewp(url)
    return parse_wiki_text(d, reps)
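# cache.cachewp and encode.decode are project-local helpers that are
# not shown in this file. Minimal sketches of what they are assumed to
# do follow; the cache directory and behavior are illustrative
# assumptions, not the project's actual implementation.
import hashlib
import os
import urllib2


def cachewp(url):
    # Assumed behavior: fetch a page once and reuse a local copy.
    cachedir = '/tmp/wpcache'  # illustrative location
    if not os.path.isdir(cachedir):
        os.makedirs(cachedir)
    path = os.path.join(cachedir, hashlib.md5(url).hexdigest())
    if os.path.exists(path):
        return open(path).read()
    d = urllib2.urlopen(url).read()
    open(path, 'w').write(d)
    return d


def decode(s):
    # Assumed behavior: turn the percent-decoded UTF-8 bytes of a
    # page name into a unicode string.
    return s.decode('utf-8')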