def get_live_log_map(search, url=None):
    full_search = ' '.join(search)
    if url:
        soup = get_soup(url)
    else:
        last_name = format_live_search(search)
        soup = get_soup(espn_search_url.format(search=last_name))
    name_tag = soup.find('meta', attrs={'property': 'og:title'})
    if name_tag:
        try:
            name = name_tag.get('content').replace(' Stats, News, Bio | ESPN', '')
            is_playing = soup.find('h3', class_='Card__Header__Title Card__Header__Title--no-theme',
                                   text='Current Game')
            just_played = soup.find('h3', class_='Card__Header__Title Card__Header__Title--no-theme',
                                    text='Previous Game')
            has_stats = soup.findChildren('div', class_='StatBlockInner ph2 flex-expand')
            if (is_playing or just_played) and has_stats:
                log_map = {}
                game_summary = soup.findChild('a', attrs={'title': 'Game Summary'})
                stats_table = game_summary.find_next('tbody', class_='Table__TBODY')
                stats = [row.text for row in stats_table.findChildren(lambda tag: tag.name == 'td')]
                log_map['mp'] = stats[2]
                log_map['fg_pct'] = stats[3]
                log_map['tp_pct'] = stats[4]
                log_map['ft_pct'] = stats[5]
                log_map['trb'] = int(float(stats[6]))
                log_map['ast'] = int(float(stats[7]))
                log_map['blk'] = int(float(stats[8]))
                log_map['stl'] = int(float(stats[9]))
                log_map['pf'] = int(float(stats[10]))
                log_map['tov'] = int(float(stats[11]))
                log_map['pts'] = int(float(stats[12]))
                log_map['pm'] = has_stats[-1].text
                log_map['name'] = name
                return just_played is not None, log_map
            else:
                raise NoResultsError(f"Either {name} isn't currently playing or ESPN's site is lying to me")
        except Exception as ex:
            raise ex
    else:
        results_table = soup.find('div', attrs={'id': 'my-players-table'}).find_next('table')
        col_header = results_table.findChild('tr', class_='colhead')
        if col_header:
            player_results = results_table.findChildren(
                lambda tag: tag.name == 'tr' and tag.get('class') not in ['stathead', 'colhead'])
            result_map = {}
            for result in player_results:
                a = result.find_next('a')
                name = a.text.split(', ')
                name = f'{name[1]} {name[0]}'
                match = SequenceMatcher(None, full_search, name).ratio()
                result_map[a.get('href')] = match
            player_href = sorted(result_map, key=result_map.get, reverse=True)[0]
            return get_live_log_map(search, player_href)
        else:
            raise NoResultsError(f"No results for '{full_search}'")
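# Hypothetical usage sketch, not part of the original module: it assumes the
# espn_search_url, format_live_search and NoResultsError helpers used by
# get_live_log_map above live in this same module.
if __name__ == '__main__':
    try:
        finished, live_log = get_live_log_map(['lebron', 'james'])
        print(live_log['name'], live_log['pts'], 'pts', live_log['pm'])
    except NoResultsError as err:
        print(err)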
def get_bill_pages(scraper, url=None, doc_types=None):
    """Return a sequence of tuples by retrieving all the documents described
    in the given url (representing a specific GA and session.)

    Optionally filter the sequence to only the given document types
    ('house bill', 'senate bill', etc.).

    Each tuple returned will be in the form: (bill_id, short_name, status_url)
    """
    if url is None:
        url = legislation_url()
    s = get_soup(scraper, url)
    links = s("a", {"href": lambda x: x is not None and x.find("grplist.asp") != -1})
    links = [link['href'] for link in links]
    d = {}
    for link in links:
        types = re.findall("DocTypeID=(.+?)&", link)
        for t in types:
            d.setdefault(t, []).append(urljoin(url, link))
    pages = []
    if not doc_types:
        doc_types = ['HB', 'SB']  # sane default
    for doc_type in doc_types:
        if doc_type in d:  # dict.has_key() was removed in Python 3
            simplified_url = min_max(d[doc_type])
            pages.extend(extract_bill_links(scraper, simplified_url))
    return pages
def get_all_nfl_teams(year):
    """ Get all NFL teams and links to their season stats for a given year. """
    url = BASE_URL + '/years/{}/'.format(year)
    soup = get_soup(url)
    table = soup.find('table', attrs={'id': 'team_stats'})
    rows = table.find_all('tr')
    team_list = []
    for row in rows:
        team = row.find('td', attrs={'data-stat': 'team'})
        if not team:
            continue
        team_link = team.find('a')
        if not team_link:
            continue
        team_link = team_link.get('href')
        team_list.append(team_link)
    return team_list
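# Hypothetical usage sketch (not in the original source): it assumes BASE_URL
# and get_soup in this module point at pro-football-reference-style season pages.
for team_href in get_all_nfl_teams(2019):
    print(BASE_URL + team_href)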
def get_player_page(search=None, url=None):
    soup = get_soup(url if url else search_url.format(search=urllib.parse.quote(search)))
    log_holder = soup.find('span', text="Game Logs")
    if log_holder:
        return soup
    elif soup.findChild('div', class_='search-results'):
        nba_players = soup.find('div', attrs={"id": "players"})
        if nba_players:
            results = nba_players.findChildren('div', class_='search-item')
            if len(results) == 1:
                href = nba_players.find_next('div', class_='search-item-url').text
                return get_player_page(url=bbref_url + href)
            else:
                result_map = {}
                for result in results:
                    a = result.find_next('div', class_='search-item-name').find_next('a')
                    name = letters.sub('', a.text)
                    match = SequenceMatcher(None, search, name).ratio()
                    result_map[a.get('href')] = match
                href = sorted(result_map, key=result_map.get, reverse=True)[0]
                return get_player_page(url=bbref_url + href)
        else:
            raise NoResultsError("No NBA results for %s" % search)
    else:
        raise NoResultsError("No results for %s" % search)
def fetch_mimvp():
    """
    Fetch free proxies from http://proxy.mimvp.com/free.php
    """
    querys = [
        "proxy=in_tp",
        "proxy=in_hp",
        "proxy=in_tp&sort=p_transfer",
        "proxy=in_hp&sort=p_transfer",
        "proxy=in_tp&sort=p_ping",
        "proxy=in_hp&sort=p_ping",
    ]
    proxies = []
    try:
        for query in querys:
            url = "http://proxy.mimvp.com/free.php?%s" % (query)
            soup = get_soup(url)
            table = soup.find("div", attrs={"class": "free-list"}).table
            tds = table.tbody.find_all("td")
            for i in range(0, len(tds), 10):
                ip = tds[i + 1].text
                port = img2port(tds[i + 2].img["src"])  # the port is rendered as an image
                protocol_types = tds[i + 3]["title"].split("/")
                response_time = tds[i + 7]["title"][:-1]
                transport_time = tds[i + 8]["title"][:-1]
                proxy = "%s:%s" % (ip, port)
                if port is not None:
                    proxies += _filter_proxy(float(response_time), proxy)
    except Exception:
        logger.warning("fail to fetch from mimvp")
    return proxies
def extract_vote_pdf_links(scraper, url, chamber_filter=None):
    """Given a URL to a "votehistory.asp" page, return a sequence of tuples,
    each of which has the form (chamber, label, url)

    It's expected that the URLs are for PDF files.
    """
    l = []
    s = get_soup(scraper, url)
    if s.find(text="No vote detail available for the selected legislation."):
        return []
    tables = s("table")
    vote_table = tables[6]
    rows = vote_table("tr")
    rows = rows[1:]  # lose header
    for row in rows:
        tds = row("td")
        if len(tds) > 1:
            c2 = tds[1]
            chamber = c2(text=True)[0]
            links = row("a")
            if links:
                link = links[0]
                href = urljoin(url, link['href'])
                label = link(text=True)[0]
                if (not chamber_filter) or chamber_filter.lower() == chamber.lower():
                    l.append((chamber, label, href))
    return l
def get_weather(self, province, city, spell):
    month_list = date_range(self.start_time, self.end_time)
    for month in month_list:
        url = self.history_url % (spell, month)
        print(url)
        weather_list = get_soup(url).find(name='div', id='content').find_all(name='tr')
        # remove the first element
        del weather_list[0]
        for weather in weather_list:
            detail = weather.find_all(name='td')
            date = detail[0].find(name='a').get('href').split('.')[0].split('/')[-1]
            date = get_all(date)
            state = detail[1].get_text()
            state = get_all(state)
            temperature = detail[2].get_text()
            temperature = get_all(temperature)
            wind = detail[3].get_text()
            wind = get_all(wind)
            print(province, city, date, state, temperature, wind)
            sql = 'INSERT INTO weather_list(weather_date, province, city, spell, state, temperature, wind) ' \
                  'values (%s, %s, %s, %s, %s, %s, %s)'
            params = [date, province, city, spell, state, temperature, wind]
            self.mysql.insert(sql=sql, params=params)
def extract_bill_urls_from_group(scraper, chamber, url):
    """Given a url to a page grouping bills of a certain type in a certain
    session, return a sequence of all the URLs to the specific bill statuses
    from that page.
    """
    s = get_soup(scraper, url)
    bill_links = s("a", {"href": re.compile(".*BillStatus.*DocTypeID")})
    bill_links = [urljoin(url, link['href']) for link in bill_links]
    return bill_links
def scrape_cards_bridge_nl(url: str, browser=get_browser()) -> dict:
    winds_bridge_nl = 'NWES'
    soup = get_soup(browser=browser, url=url)
    hand_tags = soup.find_all('div', class_='vierkant33procent spelverdeling_hand')
    return {
        wind: hand_tag.text.split('\n')[1:5]
        for wind, hand_tag in zip(winds_bridge_nl, hand_tags)
    }
def get_recipes():
    soup = get_soup(BASE_URL + RECIPE_SUFFIX)
    tables = soup.find_all("table")  # 16 tables
    attack = tables[11]
    parse_table(attack, "attack")
    magic = tables[12]
    parse_table(magic, "magic")
    action = tables[13]
    parse_table(action, "action")
def get_city(self):
    content = get_soup(self.base_url)
    province_list = content.find_all(name='table')[-1]
    province_list = province_list.find_all(name='td')
    for index, province in enumerate(province_list):
        href = province.find(name='a')
        province_name = href.get_text()
        content = get_soup(self.base_url + href.get('href'))
        content = content.find(name='div', id='content')
        city_list = content.find(name='table').find_all(name='td')
        for city in city_list:
            city_href = city.find(name='a')
            city = city_href.get_text()
            spell = city_href.get('href').split('.')[0].split('/')[-1]
            sql = 'INSERT INTO city_list(province, city, spell) values (%s, %s, %s)'
            params = [province_name, city, spell]
            print(params)
            self.mysql.insert(sql, params)
            self.get_weather(province_name, city, spell)
def get_avg_log_table(search, last):
    player_soup = get_player_page(search)
    career_games = int(player_soup.find('h4', class_='poptip', attrs={'data-tip': 'Games'})
                       .find_next('p').find_next('p').text)
    name_node = player_soup.find('h1', attrs={'itemprop': 'name'})
    name = name_node.text
    if last > career_games:
        raise ValueError(f'{name} has only played {career_games} career games')
    page_id = player_soup.find('link', attrs={'rel': 'canonical'}).get('href').split('/')[-1].split('.')[0]
    log_soup = get_soup(last_url.format(page_id=page_id, last=career_games - last + 1, career=career_games))
    table = log_soup.find('table', attrs={'id': 'pgl_basic_span'}).find('tbody')
    return name, table
def get_stepbridge_tournament_overview_dataframe(stepbridge_user_url: str) -> pd.DataFrame:
    logged_in_browser = browser_login_stepbridge(util.get_browser())
    initial_soup = util.get_soup(browser=logged_in_browser, url=stepbridge_user_url)
    overview_page_urls = [stepbridge_user_url]
    overview_page_urls += get_other_page_urls_from_overview_page_stepbridge_my_results(initial_soup)
    result = get_all_tournament_overview_dataframe(browser=logged_in_browser,
                                                   tournament_result_overview_urls=overview_page_urls)
    return result
def get_all_bill_urls(scraper, chamber, session, types=None):
    """Given a session number (e.g. '96' for the 2009-2010 GA session) and a
    chamber, return all bill URLs which can be identified as associated with
    the given session.

    At this time, Executive Orders and Joint Session Resolutions will never be
    returned.
    """
    session_url = BASE_LEGISLATION_URL % session[0:2]
    s = get_soup(scraper, session_url)
    groups = extract_bill_groups(s, session_url)
    special_sessions = s(text=re.compile(".*View Special Session.*"))
    if special_sessions:
        ss_url = urljoin(session_url, special_sessions[0].parent['href'])
        ss = get_soup(scraper, ss_url)
        groups.extend(extract_bill_groups(ss, ss_url))
    urls = []
    for g in groups:
        doctype = extract_doctype(g)
        if (types is None or doctype in types) and (chamber == chamber_for_doctype(doctype)):
            urls.extend(extract_bill_urls_from_group(scraper, chamber, g))
    return urls
def fetch_stackoverflow():
    words = []
    for pageNo in range(1, 20):
        url = 'https://stackoverflow.com/tags?page=%d&tab=popular' % (pageNo)
        soup = get_soup(url)
        tags_list = soup.find('div', attrs={'id': 'tags_list'})
        trs = tags_list.table.find_all('tr')
        for tr in trs:
            tds = tr.find_all('td')
            for td in tds:
                words.append(td.a.text)
    return words
def get_all_tournament_overview_dataframe(browser: mechanize.Browser,
                                          tournament_result_overview_urls: list) -> pd.DataFrame:
    result = None
    for url in tournament_result_overview_urls:
        page_soup = util.get_soup(browser=browser, url=url)
        df_tournament_results_single_page = get_tournament_overview_dataframe(page_soup)
        if result is None:
            result = df_tournament_results_single_page
        else:
            # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
            result = pd.concat([result, df_tournament_results_single_page])
    result.reset_index(drop=True, inplace=True)
    return result
def fetch_lagou():
    words = []
    url = 'https://www.lagou.com/'
    soup = get_soup(url)
    category_list = soup.find_all('div', attrs={'class': 'menu_sub dn'})
    for category in category_list:
        dls = category.find_all('dl')
        for dl in dls:
            names = dl.dd.find_all('a')
            for name in names:
                words.append(name.text)
    return words
def get_all_bill_urls(scraper, chamber, session, types=None):
    """Given a session number (e.g. '96' for the 2009-2010 GA session) and a
    chamber, return all bill URLs which can be identified as associated with
    the given session.

    At this time, Executive Orders and Joint Session Resolutions will never be
    returned.
    """
    session_url = BASE_LEGISLATION_URL % session
    s = get_soup(scraper, session_url)
    groups = extract_bill_groups(s, session_url)
    special_sessions = s(text=re.compile(".*View Special Session.*"))
    if special_sessions:
        ss_url = urljoin(session_url, special_sessions[0].parent['href'])
        ss = get_soup(scraper, ss_url)
        groups.extend(extract_bill_groups(ss, ss_url))
    urls = []
    for g in groups:
        doctype = extract_doctype(g)
        if (types is None or doctype in types) and (chamber == chamber_for_doctype(doctype)):
            urls.extend(extract_bill_urls_from_group(scraper, chamber, g))
    return urls
def fetch_zhipin():
    words = []
    url = 'http://www.zhipin.com/'
    soup = get_soup(url)
    job_menu = soup.find('div', attrs={'class': 'job-menu'})
    dls = job_menu.find_all('dl')
    for dl in dls:
        divs = dl.find_all('div', attrs={'class': 'text'})
        for div in divs:
            names = div.find_all('a')
            for name in names:
                words.append(name.text)
    return words
def get_player_log_table(search):
    player_soup = get_player_page(search)
    log_holder = player_soup.find('span', text="Game Logs")
    name_node = player_soup.find('h1', attrs={'itemprop': 'name'})
    name = name_node.text
    game_log_link_list = log_holder.find_next('div').find('ul').findChildren('a')
    game_log_link = game_log_link_list.pop()
    if 'Playoffs' in game_log_link.text:
        game_log_link = game_log_link_list.pop()
    href = game_log_link.get('href')
    log_soup = get_soup(bbref_url + href)
    table = log_soup.find('table', attrs={'id': 'pgl_basic'}).find('tbody')
    return name, table
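# Hypothetical usage sketch, not in the original source: it assumes the
# bbref_url, search_url and NoResultsError helpers used by the functions above
# are defined in this module.
name, table = get_player_log_table('stephen curry')
rows = table.findChildren('tr')
print('fetched %d game-log rows for %s' % (len(rows), name))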
def get_highlight_lowlight_map(highlight=True):
    top_soup = get_soup(top_url)
    table = top_soup.find('table', attrs={'id': 'stats'})
    if not table:
        return None
    else:
        rows = table.find('tbody').findChildren(
            lambda tag: tag.name == 'tr'
            and not 'thead' == tag.get('class')
            and tag.findChild(lambda child: child.name == 'td'
                              and child.get('data-stat') == 'mp'
                              and int(child.text.split(':')[0]) >= 25))
        if highlight:
            return index_row(rows[0])
        else:
            return index_row(rows[-1])
def extract_bill_links(scraper, url):
    """Given a url to a page of BillStatus links (as expected from min_max),
    return a list of tuples of the form (id, title, url)
    """
    s = get_soup(scraper, url)
    links = s("a", {"href": lambda x: x is not None and x.find("BillStatus") != -1})
    l = []
    for link in links:
        text = link(text=True)[0].replace("\xa0", " ")  # collapse non-breaking spaces
        match = re.match(r"^(\S+)\s+(.+)$", text)
        if match:
            l.append((match.groups()[0], match.groups()[1], urljoin(url, link['href'])))
    return l
def get_state_shapes():
    url = 'https://www.mccurley.org/svg/data/states.svg'
    soup = get_soup(url)
    state_tags = soup.find_all('g', attrs={'statename': True})
    full_coords = pd.DataFrame()
    for tag in state_tags:
        new_coords = get_coords(tag)
        full_coords = pd.concat([full_coords, new_coords])
    return full_coords
def extract_versions(scraper, s):
    """Get the fulltext link from the page. Visit it. Get all links on that
    page that ref fulltext.asp, skip the 'printer friendly' one for the
    current page, append '&print=true' to each of the links, and return a
    sequence of 2-tuples (name, link).
    """
    versions = []
    links = s("a", {"class": "legislinks", "href": re.compile(".*fulltext.asp.*")})
    if links:
        s = get_soup(scraper, urljoin(s.orig_url, links[0]['href']))
        # target is used for printer friendly, we'll skip that one.
        links = s("a", {"href": re.compile(".*fulltext.asp.*"), "target": None})
        for link in links:
            versions.append((link.next, urljoin(s.orig_url, link['href'] + "&print=true")))
    return versions
def get_components(url, componentType):
    components = []
    soup = get_soup(url)
    attack_table_rows = soup.find_all("table")[10].find_all("tr")
    row_iter = iter(attack_table_rows)
    next(row_iter)  # ignore header row
    for tr in row_iter:
        items_in_tr = len(tr.findChildren())
        # ignore odd rows that get detected but aren't part of the chart
        if items_in_tr > 13 or items_in_tr < 4:
            continue
        components.append(parse_component_row(tr, componentType))
    print("found " + str(len(components)) + " " + componentType + "...")
    with open("../db/data/khbbs/components/KHBBS" + componentType + ".json", "w") as file_pointer:
        json.dump(components, file_pointer)
def fetch_ip181():
    """
    http://www.ip181.com/
    """
    proxies = []
    try:
        url = "http://www.ip181.com/"
        soup = get_soup(url)
        table = soup.find("table")
        trs = table.find_all("tr")
        for i in range(1, len(trs)):
            tds = trs[i].find_all("td")
            ip = tds[0].text
            port = tds[1].text
            response_time = tds[4].text[:-2]
            proxy = "%s:%s" % (ip, port)
            proxies += _filter_proxy(float(response_time), proxy)
    except Exception as e:
        logger.warning("fail to fetch from ip181: %s" % e)
    return proxies
def fetch_kxdaili(page):
    """
    Fetch free proxies from http://www.kxdaili.com
    """
    proxies = []
    try:
        url = "http://www.kxdaili.com/dailiip/1/%d.html" % page
        soup = get_soup(url)
        table_tag = soup.find("table", attrs={"class": "segment"})
        trs = table_tag.tbody.find_all("tr")
        for tr in trs:
            tds = tr.find_all("td")
            ip = tds[0].text
            port = tds[1].text
            latency = tds[4].text.split(" ")[0]
            proxy = "%s:%s" % (ip, port)
            proxies += _filter_proxy(float(latency), proxy)
    except Exception:
        logger.warning("fail to fetch from kxdaili")
    return proxies
def parse_bill(scraper, url):
    """Given a bill status URL, return a fully loaded Bill object, except for
    votes, which are expected to be handled externally.
    """
    session = extract_session(url)
    chamber = chamber_for_doctype(extract_doctype(url))
    s = get_soup(scraper, url)
    bill_id = extract_bill_id(s)
    landmark = s(text=re.compile(".*Short Description.*"))
    name_span = landmark[0].findParent().findNextSibling()
    bill_name = get_text(name_span)
    bill = Bill(session, chamber, bill_id, bill_name.strip(), status_url=url)
    actions = extract_actions(s)
    for chamber, action, date in actions:
        bill.add_action(chamber, action, date)  # kwargs are permitted if we have 'em.
    sponsor_dict = extract_sponsors_from_actions([action[1] for action in actions])
    for sponsor_type, namelist in sponsor_dict.items():  # dict.iteritems() is Python 2 only
        for name in namelist:
            bill.add_sponsor(sponsor_type, name)
    for name, link in extract_versions(scraper, s):
        bill.add_version(name, link)
    return bill
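# Hypothetical pipeline sketch, not part of the original Illinois GA scraper:
# it only chains helpers defined above. `scraper` stands in for whatever object
# get_soup(scraper, url) expects in that project.
def scrape_house_bills(scraper):
    bills = []
    for bill_id, short_name, status_url in get_bill_pages(scraper, doc_types=['HB']):
        bills.append(parse_bill(scraper, status_url))
    return bills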
def fetch_xici():
    """
    http://www.xicidaili.com/nn/
    """
    proxies = []
    try:
        url = "http://www.xicidaili.com/wt/"
        soup = get_soup(url)
        table = soup.find("table", attrs={"id": "ip_list"})
        trs = table.find_all("tr")
        for i in range(1, len(trs)):
            tr = trs[i]
            tds = tr.find_all("td")
            ip = tds[1].text
            port = tds[2].text
            speed = tds[6].div["title"][:-1]
            latency = tds[7].div["title"][:-1]
            if float(speed) < 0.5 and float(latency) < 1.0:
                proxies.append("%s:%s" % (ip, port))
    except Exception:
        logger.warning("fail to fetch from xici")
    return proxies
def fetch_ip002(page=1):
    """
    http://www.ip002.net/free.html
    """
    proxies = []
    try:
        url = "http://www.ip002.net/free_%d.html" % page
        soup = get_soup(url)
        table = soup.find("table", attrs={"class": "table table-bordered table-hover"})
        trs = table.tbody.find_all("tr")
        for i in range(2, len(trs)):
            tr = trs[i]
            tds = tr.find_all("td")
            ip = tds[1].text
            port = tds[2].text
            response_time = tds[4].text.split("/")[0]
            proxy = "%s:%s" % (ip, port)
            proxies += _filter_proxy(float(response_time) / 1000.00, proxy)
    except Exception:
        logger.warning("failed to fetch ip002")
    return proxies
def fetch_httpdaili():
    """
    http://www.httpdaili.com/mfdl/
    This source is updated fairly frequently.
    """
    proxies = []
    try:
        url = "http://www.httpdaili.com/mfdl/"
        soup = get_soup(url)
        table = soup.find("div", attrs={"kb-item-wrap11"}).table
        trs = table.find_all("tr")
        for i in range(1, len(trs)):
            try:
                tds = trs[i].find_all("td")
                ip = tds[0].text
                port = tds[1].text
                type = tds[2].text
                proxies.append("%s:%s" % (ip, port))
            except Exception:
                pass
    except Exception as e:
        logger.warning("fail to fetch from httpdaili: %s" % e)
    return proxies
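# Hypothetical aggregation sketch, not part of the original proxy project: it
# only calls the fetch_* helpers defined above and deduplicates their results.
def fetch_all_free_proxies():
    proxies = []
    proxies += fetch_mimvp()
    proxies += fetch_ip181()
    proxies += fetch_kxdaili(1)
    proxies += fetch_xici()
    proxies += fetch_ip002()
    proxies += fetch_httpdaili()
    return sorted(set(proxies))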
def __init__(self, page_sub_url):
    self.soup = get_soup(page_sub_url)
def __init__(self, url):
    self.url = url
    print('article url: ' + url)
    self.soup = get_soup(url, is_sub=False)
import bs4
import re
import util
import os

abspath = os.path.abspath(__file__)
dname = os.path.dirname(abspath)
os.chdir(dname)

out = "out/Objetivos.html"
soup = util.get_soup("out/LFS201.html")
soup.title.string = soup.title.string + ": Objetivos"
flds = soup.findAll("fieldset", attrs={'class': re.compile(r".*\bn2\b.*")})
for f in flds:
    if f.legend.get_text().strip().lower() == "objetivos de aprendizaje":
        f.legend.string = f.parent.h1.a.string
        f.div.p.extract()
    else:
        f.extract()
for h in soup.findAll("h1"):
    h.extract()
for div in soup.body.div.select(" > div"):
    div.unwrap()
h = str(soup)  # unicode() in the original Python 2 version
with open(out, "wb") as file:
    file.write(h.encode('utf8'))
def get_max_page_index(self):
    soup = get_soup(vars.init_url[self.name])
    maxpage = soup.find('div', {'class': 'btn-group btn-group-paging'}).findAll('a')[1]['href'].replace(
        vars.url_ending, '')
    self.max_page_index = int(maxpage[maxpage.index('index') + 5:]) + 1
i.attrs["class"]="item" i.append(a) def get_lab(f,txt): a=soup.new_tag("a", **{"href": "labs/"+f, "title":"Fichero original en: https://lms.360training.com/custom/12396/808239/"+f}) a.string=txt return a soup = util.get_tpt("LFS201","rec/lfs201.css") fldB=None divCp=None hts=sorted(glob.glob('html/clean/*.html')) for ht in hts: soup2 = util.get_soup(ht) t=soup2.title b=soup2.body if "_popup" in ht: n=3 else: ca=int(cp.sub("\\1",ht)) if ca>caB: n=1 f=1 caB=ca else: n=2 f=f+1