def plot_colors(xml):
    dom = web.Element(xml)
    result = {}
    for graph in dom.by_tag('graph'):
        title = _strip(graph.attributes['title'])
        result[title] = graph.attributes['color']
    return result
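# plot_colors calls a _strip helper that is not defined in this snippet.
# A minimal sketch, assuming it only needs to clean punctuation (e.g. a
# trailing ' (D)' party marker) out of the graph title:
def _strip(s):
    # collapse runs of non-alphanumeric characters into single spaces
    return re.sub(r'[\W_]+', ' ', s).strip()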
def rcp_poll_data(xml):
    dom = web.Element(xml)
    result = {}
    dates = dom.by_tag('series')[0]
    dates = {n.attributes['xid']: str(n.content)
             for n in dates.by_tag('value')}
    keys = dates.keys()
    result['date'] = pd.to_datetime([dates[k] for k in keys])
    for graph in dom.by_tag('graph'):
        name = graph.attributes['title']
        data = {n.attributes['xid']: float(n.content) if n.content else np.nan
                for n in graph.by_tag('value')}
        result[name] = [data[k] for k in keys]
    result = pd.DataFrame(result)
    result = result.sort_values('date')  # DataFrame.sort() is long deprecated
    return result
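# Usage sketch for rcp_poll_data: the XML argument is the raw chart data from
# RealClearPolitics (the same endpoint get_poll() builds further below); chart
# id 1044 is just an example:
#
#   xml = requests.get('http://charts.realclearpolitics.com/charts/1044.xml').text
#   polls = rcp_poll_data(xml)  # 'date' column plus one column per candidate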
def rcp_poll_data(xml):
    dom = web.Element(xml)
    result = {}
    # extract dates
    series = dom.by_tag('series')
    date_value = series[0].by_tag('value')
    date = []
    for d in date_value:
        date.append(pd.to_datetime(d.content))
    result['date'] = date
    # extract result data and titles
    graphs_tag = dom.by_tag('graphs')
    graph_tags = graphs_tag[0].by_tag('graph')
    for graph in graph_tags:
        title = graph.attributes['title']
        values = []
        for value in graph.by_tag('value'):
            try:
                values.append(float(value.content))
            except ValueError:
                values.append(np.nan)
        result[title] = values
    result = pd.DataFrame(result)
    return result
def parse_results(fname):
    with open('results_raw_2015/%s' % fname, 'r') as f:
        dom = web.Element(f.read())
    fields = dom('td')
    stud_det = {
        'roll_num': web.plaintext(fields[8].content),
        'name': web.plaintext(fields[10].content),
        'mother_name': web.plaintext(fields[12].content),
        'father_name': web.plaintext(fields[14].content),
    }
    for i in range(21, len(fields) - 7, 6):
        # an 'Additional Subject' header row shifts this block by one cell;
        # the shift applies only within the current iteration
        if web.plaintext(fields[i].content) == 'Additional Subject':
            i += 1
        subject = web.plaintext(fields[i + 1].content)
        stud_det.update({
            subject + '_theory': web.plaintext(fields[i + 2].content),
            subject + '_practical': web.plaintext(fields[i + 3].content),
            subject + '_total': web.plaintext(fields[i + 4].content),
            subject + '_grade': web.plaintext(fields[i + 5].content),
        })
    stud_det['fin_result'] = web.plaintext(fields[-5].content)[8:]
    return stud_det
def PQarchive_url_list(start_date, end_date, page, newspaper_tag='latimes',
                       query='romney OR obama', debug=False):
    '''
    Scrapes the PQ archive system to get a list of all URLs.
    Inputs: M(M)-D(D)-YYYY of start and end date, page number (1-indexed)
    Output: URL List
    '''
    # split dates into M, D, Y
    start_date = start_date.split('-')
    end_date = end_date.split('-')

    # build the query
    url = 'http://pqasb.pqarchiver.com/' + newspaper_tag + '/results.html'
    options = {
        'st': 'advanced',
        'sortby': 'CHRON',
        'datetype': 6,
        'frommonth': start_date[0],
        'fromday': start_date[1],
        'fromyear': start_date[2],
        'tomonth': end_date[0],
        'today': end_date[1],
        'toyear': end_date[2],
        'type': 'current',
        'start': (page - 1) * 10,
        'QryTxt': query,
    }

    # try to get url with specified parameters
    try:
        r = requests.get(url, params=options)
        html = r.text
        if debug:
            print r.url
    except requests.RequestException:
        print 'Unable to parse URL list for ' + str(url)
        return None

    # declare dom object to begin parsing the data
    dom = web.Element(html)
    url_list = []
    wp_pattern_good = re.compile(u'FMT=ABS')                    # (defined but unused)
    wp_pattern_bad = re.compile(u'washingtonpost_historical')   # (defined but unused)
    # find each url: the a tag must have a title, the title must match the
    # Preview string, and the href must not be the header FAQ section
    for a in dom('table a'):
        if (('title' in a.attrs)
                and (a.attrs['title'] == 'Preview (Abstract/Citation)')
                and (a.attrs['href'] != 'faq.html#abs')):
            url_list.append(str(a.attrs['href']))
    return url_list
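# Usage sketch (hypothetical dates): fetch the first page of 2012-election
# coverage, ten results per page:
#
#   urls = PQarchive_url_list('1-1-2012', '11-6-2012', 1, newspaper_tag='latimes')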
def race_result(url):
    html = requests.get(url).text
    dom = web.Element(html)

    # find tags unique to candidate names
    tr_tags = dom.by_tag('tr.omit')
    th_tags = tr_tags[0].by_tag('th')

    # extract candidate names, dropping a trailing party label such as ' (D)'
    candidate = []
    for tags in th_tags[3:-1]:
        if re.search(r"\(", tags.content):
            candidate.append(tags.content[:-4])
        else:
            candidate.append(tags.content)

    # find tags unique to final polling results
    td_tags = tr_tags[0].next.by_tag('td')

    # extract percentages
    percentage = []
    for tags in td_tags[3:-1]:
        percentage.append(float(tags.content))

    return dict(zip(candidate, percentage))
def convert_answers(self):
    """ Convert Posts.xml to the required answers_df. """
    with open('data/' + self.site_name + '/Posts.xml') as f:
        posts_dom = web.Element(f.read())
    aids, uids, parentids, bodies, scores = [], [], [], [], []
    for row in posts_dom.by_tag('row'):
        if row.attributes['posttypeid'] == '2':  # get the answers
            aids.append(row.attributes['id'])
            if 'owneruserid' in row.attributes:
                uids.append(row.attributes['owneruserid'])
            else:
                uids.append(u'-999')  # sentinel for deleted/unknown users
            scores.append(int(row.attributes['score']))
            parentids.append(row.attributes['parentid'])
            bodies.append(row.attributes['body'].encode('unicode-escape'))
    self.answers_df = pd.DataFrame(
        data={'post_id': aids, 'user_id': uids, 'parent_id': parentids,
              'score': scores, 'answer': bodies},
        columns=['post_id', 'user_id', 'parent_id', 'score', 'answer'])
    self.answers_df = self.answers_df.set_index('post_id')  # index by (unique) post_id
    return
def parsing_headline(date):
    # crawl the headlines ranked 1-30
    html = get_headline_page(date)  # assumed to return a parsed (BeautifulSoup) document, since findAll is called on it
    lst = []
    # ranks 1-3 use list items of class 'num1'..'num3'
    for i in range(1, 4):
        a = html.findAll('li', {'class': 'num' + str(i)})
        lst.append(
            str(a[0]).split('<dt>')[1].split('title=')[1].split('>')[0].decode('utf-8'))
    # ranks 4-30 use list items of class 'gnum4'..'gnum30'
    for i in range(4, 31):
        b = html.findAll('li', {'class': 'gnum' + str(i)})
        lst.append(
            (str(b[0]).split('title=')[1].split('"')[1]).decode('utf-8'))
    return lst
def convert_questions(self):
    """ Convert Posts.xml to the required questions_df. """
    with open('data/' + self.site_name + '/Posts.xml') as f:
        posts_dom = web.Element(f.read())
    # Warning: super-hacky code follows!
    qids, uids, tags, titles, bodies, creation_dates = [], [], [], [], [], []
    for row in posts_dom.by_tag('row'):
        if row.attributes['posttypeid'] == '1':  # get the questions
            qids.append(row.attributes['id'])
            if 'owneruserid' in row.attributes:
                uids.append(row.attributes['owneruserid'])
            else:
                uids.append(u'-999')
            if 'tags' in row.attributes:
                tags.append(re.sub('[^-a-zA-Z]', ' ', row.attributes['tags']).strip().split())
            else:
                tags.append([u''])
            titles.append(row.attributes['title'].encode('unicode-escape').replace(r'\n', ''))
            bodies.append(row.attributes['body'].encode('unicode-escape').replace(r'\n', ''))
            creation_dates.append(pd.to_datetime(row.attributes['creationdate']))
    self.questions_df = pd.DataFrame(
        data={'post_id': qids, 'user_id': uids, 'title': titles, 'tags': tags,
              'question': bodies, 'date': creation_dates},
        columns=['post_id', 'user_id', 'title', 'tags', 'question', 'date'])
    self.questions_df = self.questions_df.set_index('post_id')  # index by (unique) post_id
    return
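# Note: the Stack Exchange data dump capitalizes attribute names in Posts.xml
# (e.g. 'PostTypeId', 'OwnerUserId'), but the HTML parser behind web.Element
# lowercases them, which is why both converters above look up 'posttypeid',
# 'owneruserid', and so on.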
def find_governor_races(html):
    dom = web.Element(html)
    links = [a.attributes.get('href', '') for a in dom.by_tag('a')]
    links = [l for l in links if is_gov_race(l)]
    # eliminate duplicates!
    links = list(set(links))
    return links
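# is_gov_race is not defined in this snippet. A minimal sketch, assuming the
# same /epolls/<year>/governor/<state>/ URL shape that the fnmatch variant
# further below matches:
def is_gov_race(l):
    pattern = re.compile(r'^(http://www\.realclearpolitics\.com)?/epolls/\d{4}/governor/\w{2}/')
    return bool(pattern.match(l))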
def get_poll(id):
    url = "http://charts.realclearpolitics.com/charts/%i.xml" % int(id)
    xml = requests.get(url).text
    dom = web.Element(xml)
    result = {}

    # parse the M/D/YYYY date strings from the first <series> tag
    dates = dom.by_tag('series')[0]
    temp = [str(n.content) for n in dates.by_tag('value')]
    temp_dates = []
    for t in temp:
        split = t.split('/')
        d = datetime.date(int(split[2]), int(split[0]), int(split[1]))
        temp_dates.append(unicode(d))
    dates = temp_dates

    # one <graph> tag per candidate; empty values become NaN
    for graph in dom.by_tag('graph'):
        name = graph.attributes['title']
        poll_data = []
        for n in graph.by_tag('value'):
            poll_data.append(float(n.content) if n.content else np.nan)
        result[name] = poll_data

    frame = pd.DataFrame(result, index=dates)
    frame.index.name = 'date'
    return frame
def rcp_poll_data(xml):
    # parse the full polling-data table embedded in the page HTML
    dom = web.Element(xml)
    result = {}
    general = dom.by_tag("div#polling-data-full")[0]
    if_name = general.by_tag("th")
    if len(if_name) > 6:
        # three-candidate race: each row has 7 cells;
        # poll dates are 'M/D - M/D' ranges, so keep the end date and append year '10'
        dates = [date.content.split("-")[1].strip() + "/10"
                 for date in general.by_tag("td")[15::7]]
        dates = pd.to_datetime(dates)
        name_1 = str(if_name[3].content.split("(")[0].strip())
        name_2 = str(if_name[4].content.split("(")[0].strip())
        name_3 = str(if_name[5].content.split("(")[0].strip())
        result["date"] = dates
        result[name_1] = [float(c.content) if c.content != "--" else np.nan
                          for c in general.by_tag("td")[17::7]]
        result[name_2] = [float(c.content) if c.content != "--" else np.nan
                          for c in general.by_tag("td")[18::7]]
        result[name_3] = [float(c.content) if c.content != "--" else np.nan
                          for c in general.by_tag("td")[19::7]]
    else:
        # two-candidate race: each row has 6 cells
        dates = [date.content.split("-")[1].strip() + "/10"
                 for date in general.by_tag("td")[13::6]]
        dates = pd.to_datetime(dates)
        name_1 = str(if_name[3].content.split("(")[0].strip())
        name_2 = str(if_name[4].content.split("(")[0].strip())
        result["date"] = dates
        result[name_1] = [float(c.content) for c in general.by_tag("td")[15::6]]
        result[name_2] = [float(c.content) for c in general.by_tag("td")[16::6]]
    result = pd.DataFrame(result)
    return result
def url_to_text(html):
    # extract the text of a news article from its page HTML
    text = ''
    dom = web.Element(html)
    para = dom.by_tag('div.column1 grid8 grid-inside')[0]
    for p in para.by_tag('p'):
        for c in p.children:
            if c.type == 'text':
                text += c.source
    return text
def get_population_html_tables(html):
    dom = web.Element(html)
    # keep only the sortable wikitables; .get() avoids a KeyError on tables
    # that have no class attribute
    tbls = [t for t in dom.by_tag('table')
            if t.attributes.get('class') == ['sortable', 'wikitable']]
    return tbls
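# Usage sketch (hypothetical page): fetch a Wikipedia article and keep only
# its sortable wikitables:
#
#   html = requests.get('https://en.wikipedia.org/wiki/World_population').text
#   tables = get_population_html_tables(html)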
def __init__(self, schedule):
    self.schedule = schedule
    # render the schedule page with a headless browser, then parse the DOM
    self.browser_schedule = webdriver.PhantomJS()
    self.browser_schedule.set_window_size(1024, 768)
    self.browser_schedule.get(self.schedule)
    self.website_schedule = self.browser_schedule.page_source
    self.browser_schedule.quit()
    self.dom_schedule = web.Element(self.website_schedule)
def extract_dom(sort, start, url='http://www.imdb.com/search/title'):
    params = dict(sort=sort, at=0, start=start, title_type='feature',
                  year='1950,2014')
    r = requests.get(url, params=params)
    print bcolors.WARNING + r.url + bcolors.ENDC
    return web.Element(r.text)
def get_population_html_tables(html):
    dom = web.Element(html)
    ### 0. step: look at html source!
    #### 1. step: get all tables
    #tbls = dom('table')
    #### 2. step: get all tables we care about
    tbls = dom.by_class('sortable wikitable')
    return tbls
def __init__(self, main_website):
    self.main_website = main_website
    # render the main page with a headless browser, then parse the DOM
    self.browser_main = webdriver.PhantomJS()
    self.browser_main.set_window_size(1024, 768)
    self.browser_main.get(self.main_website)
    self.website_main = self.browser_main.page_source
    self.browser_main.quit()
    self.dom = web.Element(self.website_main)
    self.links = self.dom.by_class('expanded')
    self.main_url = URL(self.main_website)
def get_acestream(url):
    # render the page with a headless browser, then look for an acestream:// link
    browser = webdriver.PhantomJS()
    browser.set_window_size(1024, 768)
    browser.get(url)
    website = browser.page_source
    browser.quit()
    dom = web.Element(website)
    aces = dom.by_attr(target='_blank')
    for ace in aces:
        if ace.href[:3] == 'ace':
            return str(ace.href)
def PQarchive_url_list(start_date, end_date, page, newspaper_tag='latimes',
                       query='romney OR obama', debug=False):
    '''
    Scrapes the PQ archive system to get a list of all URLs.
    Inputs: M(M)-D(D)-YYYY of start and end date, page number (1-indexed)
    Output: URL List
    '''
    # split dates into M, D, Y
    start_date = start_date.split('-')
    end_date = end_date.split('-')

    # build the query
    url = 'http://pqasb.pqarchiver.com/' + newspaper_tag + '/results.html'
    options = {
        'st': 'advanced',
        'sortby': 'CHRON',
        'datetype': 6,
        'frommonth': start_date[0],
        'fromday': start_date[1],
        'fromyear': start_date[2],
        'tomonth': end_date[0],
        'today': end_date[1],
        'toyear': end_date[2],
        'type': 'current',
        'start': (page - 1) * 10,
        'QryTxt': query,
    }

    # try to get url with specified parameters
    try:
        r = requests.get(url, params=options)
        html = r.text
        if debug:
            print r.url
    except requests.RequestException:
        print 'Unable to parse URL list for ' + str(url)
        return None

    # declare dom object to begin parsing the data
    dom = web.Element(html)
    url_list = []
    # find each url
    for a in dom('.result_title a'):
        url_list.append(str(a.attrs['href']))
    return url_list
def get_tables(html):
    """Parse html and return html tables of wikipedia population data."""
    dom = web.Element(html)
    ### 0. step: look at html source!
    #### 1. step: get all tables
    #tbls = dom('table')
    #### 2. step: get all tables we care about
    tbls = dom.by_class('collapse1')
    # href = tbls.by_class('thumbnail')
    return tbls
def get_craigslist_postings():
    postings = json.load(open('results.json', 'r'))

    # filter cross-postings: the next-to-last URL segment is a unique post hash
    filtered = {}
    for boat_name, posts in postings.items():
        founds = []
        urls = []
        for post in posts:
            _hash = post.split('/')[-2]
            if _hash not in founds:
                founds.append(_hash)
                urls.append(post)
        filtered[boat_name] = urls

    for boat_name, posts in filtered.items():
        for post in posts:
            r = requests.get(post)
            w = web.Element(r.content)
            body = w.by_id('postingbody')
            content = body.content
            # strip the leading link and the QR-code label from the body text
            links = body.by_tag('a')
            if links:
                content = content.replace(links[0].content, '')
            c = w.by_class('print-qrcode-label')
            if c:
                content = content.replace(c[0].content, '')
            content = web.plaintext(content)

            # parse the 'key: value' attribute spans
            formatted_attrs = {}
            attrs = w.by_class('attrgroup')[0].by_tag('span')
            for attr in attrs:
                values = web.plaintext(attr.content).split(': ')
                if len(values) == 2:
                    key = values[0].replace(' ', '_').replace('/', '')
                    formatted_attrs[key] = values[1]

            price = web.plaintext(w.by_class('price')[0].content.replace('$', ''))

            # index the listing into Elasticsearch (`es` is a module-level client)
            post_data = {
                'body': {
                    'source': content,
                    'link': post,
                    'attrs': formatted_attrs,
                    'boat': boat_name,
                    'price': price,
                },
                'index': 'listings',
                'doc_type': 'listing',
            }
            es.index(**post_data)
def get_all_year_votes(year_url):
    senate_votes = []
    house_votes = []
    vote_page = requests.get(year_url).text
    element = web.Element(vote_page)
    link_list = element.by_tag("a")
    # note: only the first five links are processed here
    for link in link_list[0:5]:
        label = link[0]  # first child node: the link text
        if str(label)[0] == "s":
            senate_votes.append(get_vote(str(label)))
        if str(label)[0] == "h":
            house_votes.append(get_vote(str(label)))
    return (senate_votes, house_votes)
def get_region(self):
    if self.data:
        dom = web.Element(self.data)
        h = HTMLParser.HTMLParser()
        locality_section = dom.by_tag('span.locality')
        if locality_section:
            region = h.unescape(locality_section[0].content.strip())
            return region
    else:
        print self.name, 'get_region Error - no page given'
def find_governor_races(html):
    dom = web.Element(html)
    option_tags = dom.by_tag('option')
    gov_links = []
    # iterate through option tags, keeping only governor-race links
    for op in option_tags:
        value = op.attributes['value']
        if re.search("2010/governor", value):
            gov_links.append(value)
    return gov_links
def find_governor_races(html):
    dom = web.Element(html)
    result = []
    # match absolute and relative links to 2010 governor-race pages
    reg1 = 'http://www.realclearpolitics.com/epolls/????/governor/??/*-*.html'
    reg2 = '/epolls/????/governor/??/*-*.html'
    for ahref in dom.by_tag('a'):
        name = ahref.attributes.get('href', '')
        if fnmatch.fnmatch(name, reg1):
            result.append(name)
        elif fnmatch.fnmatch(name, reg2):
            result.append("http://www.realclearpolitics.com" + name)
    return result
def get_year_bills(chamber, session):
    bills = []
    url = bill_url(chamber, session)
    html = requests.get(url).text
    element = web.Element(html)
    bill_list = element.by_tag("a")
    for link in bill_list[1:]:
        number = link[0]
        bill = get_bill(url, number)
        bills.append(bill)
    return bills
def Globe_url_list(start_date, end_date, page):
    '''
    Scrapes the Boston Globe archives to get a list of all URLs.
    Inputs: M(M)-D(D)-YYYY of start and end date, page number (1-indexed)
    Output: URL List
    '''
    # split dates into M, D, Y
    start_date = start_date.split('-')
    end_date = end_date.split('-')

    # build the query
    url = 'http://pqasb.pqarchiver.com/boston/results.html'
    options = {
        'st': 'advanced',
        'sortby': 'CHRON',
        'datetype': 6,
        'frommonth': start_date[0],
        'fromday': start_date[1],
        'fromyear': start_date[2],
        'tomonth': end_date[0],
        'today': end_date[1],
        'toyear': end_date[2],
        'type': 'current',
        'start': (page - 1) * 10,
        'QryTxt': 'romney OR obama',
    }

    # try to get url with specified parameters
    try:
        html = requests.get(url, params=options).text
    except requests.RequestException:
        print 'Unable to parse URL list for ' + str(url)
        return None

    # declare dom object to begin parsing the data
    dom = web.Element(html)
    url_list = []
    # find each url: the a tag must have a title, the title must match the
    # Preview string, and the href must not be the header FAQ section
    for a in dom('table a'):
        if (('title' in a.attrs)
                and (a.attrs['title'] == 'Preview (Abstract/Citation)')
                and (a.attrs['href'] != 'faq.html#abs')):
            url_list.append(str(a.attrs['href']))
    return url_list
def get_population_html_tables(html):
    """Parse html and return html tables of wikipedia population data."""
    dom = web.Element(html)
    ### 0. step: look at html source!
    #### 1. step: get all tables
    #### 2. step: get all tables we care about
    tbls = dom.by_class('sortable wikitable')
    return tbls
def race_result(url):
    dom = web.Element(requests.get(url).text)
    table = dom.by_tag('div#polling-data-rcp')[0]
    result = table.by_tag('tr.final')[0]
    td = result.by_tag('td')
    results = [float(t.content) for t in td[3:-1]]
    tot = sum(results) / 100
    headers = table.by_tag('th')
    # use a list (not a set) comprehension so label order matches the result columns
    labels = [str(t.content).split('(')[0].strip() for t in headers[3:-1]]
    # normalize so the candidate shares sum to 100
    return {l: r / tot for l, r in zip(labels, results)}
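# Usage sketch: race_result pairs naturally with find_governor_races above;
# the map URL here is an assumption based on the 2010-governor context of
# those helpers:
#
#   page = requests.get('http://www.realclearpolitics.com/epolls/2010/governor/2010_elections_governor_map.html').text
#   for race_url in find_governor_races(page):
#       print race_result(race_url)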