def replace_links_with_text(html):
    """any absolute links will be replaced with the
    url in plain text, same with any img tags
    """
    soup = BeautifulSoup(html, 'html5lib')
    abs_url_re = r'^http(s)?://'

    images = soup.find_all('img')
    for image in images:
        url = image.get('src', '')
        text = image.get('alt', '')
        if url == '' or re.match(abs_url_re, url):
            image.replaceWith(format_url_replacement(url, text))

    links = soup.find_all('a')
    for link in links:
        url = link.get('href', '')
        text = ''.join(link.text) or ''

        if text == '':  # this is due to an issue with url inlining in comments
            link.replaceWith('')
        elif url == '' or re.match(abs_url_re, url):
            link.replaceWith(format_url_replacement(url, text))

    return force_text(soup.find('body').renderContents(), 'utf-8')

def parse(html):
    '''Parse the page. If the received content is ERROR_NUM, the request timed
    out and there is nothing to parse; otherwise, extract each item's id, name,
    price and stat, to be written to a file named after the current date.
    '''
    if not html:
        logger.info('======pass parse=====')
        return {}
    items = {}
    # print isinstance(html, str)
    parse_page = BeautifulSoup(html)
    goods = parse_page.find_all('div', class_='goods-content')
    for good in goods:
        good_id = good['nctype_goods'][1:]  # the attribute value starts with a space
        good_name = good.select('div[class="goods-name"]')[0].a.text.replace(',', '_')
        good_price = good.select('em[class="sale-price"]')[0].text
        if re.findall(u'\u4e07', good_price):  # handle prices like '1.3万' (1.3 * 10,000)
            good_price = str(float(good_price[:-1]) * 10000)
        else:  # strip the RMB currency symbol from the price
            good_price = good_price[1:]
        good_stat = good.select('a[class="status"]')[0].text
        items[good_id] = good_name + ',' + good_price + ',' + good_stat
    return items

def from_pmml(self, pmml):
    """Returns a model with the intercept and coefficients represented in PMML file."""
    model = self()

    # Reads the input PMML file with BeautifulSoup.
    with open(pmml, "r") as f:
        lm_soup = BeautifulSoup(f, "xml")

    if not lm_soup.RegressionTable:
        raise ValueError("RegressionTable not found in the input PMML file.")
    else:
        ##### DO I WANT TO PULL THIS OUT AS ITS OWN FUNCTION? #####
        # Pulls out intercept from the PMML file and assigns it to the
        # model. If the intercept does not exist, assign it to zero.
        intercept = 0
        if "intercept" in lm_soup.RegressionTable.attrs:
            intercept = lm_soup.RegressionTable['intercept']
        model.intercept_ = float(intercept)

        # Pulls out coefficients from the PMML file, and assigns them
        # to the model.
        if not lm_soup.find_all('NumericPredictor'):
            raise ValueError("NumericPredictor not found in the input PMML file.")
        else:
            coefs = []
            numeric_predictors = lm_soup.find_all('NumericPredictor')
            for i in numeric_predictors:
                i_coef = float(i['coefficient'])
                coefs.append(i_coef)
            model.coef_ = numpy.array(coefs)

    return model

def moderate_tags(html):
    """replaces instances of <a> and <img>
    with "item in moderation" alerts
    """
    from askbot.conf import settings
    soup = BeautifulSoup(html, 'html5lib')
    replaced = False
    if settings.MODERATE_LINKS:
        links = soup.find_all('a')
        if links:
            template = get_template('widgets/moderated_link.jinja')
            aviso = BeautifulSoup(template.render(), 'html5lib').find('body')
            map(lambda v: v.replaceWith(aviso), links)
            replaced = True

    if settings.MODERATE_IMAGES:
        images = soup.find_all('img')
        if images:
            template = get_template('widgets/moderated_link.jinja')
            aviso = BeautifulSoup(template.render(), 'html5lib').find('body')
            map(lambda v: v.replaceWith(aviso), images)
            replaced = True

    if replaced:
        return force_text(soup.find('body').renderContents(), 'utf-8')

    return html

def test_23_admin_add_category(self):
    """Test ADMIN add category works"""
    self.create()
    category = {'name': 'cat', 'short_name': 'cat',
                'description': 'description'}
    # Anonymous user
    url = '/admin/categories'
    res = self.app.post(url, data=category, follow_redirects=True)
    dom = BeautifulSoup(res.data)
    err_msg = "Anonymous users should be redirected to sign in"
    assert dom.find(id='signin') is not None, err_msg

    # Authenticated user but not admin
    self.signin(email=self.email_addr2, password=self.password)
    res = self.app.post(url, data=category, follow_redirects=True)
    err_msg = "Non-Admin users should get 403"
    assert res.status_code == 403, err_msg
    self.signout()

    # Admin
    self.signin(email=self.root_addr, password=self.root_password)
    res = self.app.post(url, data=category, follow_redirects=True)
    err_msg = "Category should be added"
    assert "Category added" in res.data, err_msg
    assert category['name'] in res.data, err_msg

    category = {'name': 'cat', 'short_name': 'cat',
                'description': 'description'}
    self.signin(email=self.root_addr, password=self.root_password)
    res = self.app.post(url, data=category, follow_redirects=True)
    err_msg = "Category form validation should work"
    assert "Please correct the errors" in res.data, err_msg

def scrap_items():
    for itemlist in ITEMLIST:
        soup = BS(urllib2.urlopen(''.join([LOLWIKI, itemlist])).read())
        item_table = soup.find('table', class_='stdt sortable')

        for tr in item_table.find_all('tr'):
            tds = tr.find_all('td')
            if len(tds) < 1:
                continue
            if tr.find('p') is None:
                continue

            item_name = tr.find('p').text.strip()
            item_url = tr.find('img')['src']
            if item_url.split(':')[0] == 'data':
                item_url = tr.find('img')['data-src']

            if not HOOKED:
                continue

            # store item in database
            d_item = Item()
            d_item.name = item_name
            t_img = NamedTemporaryFile(delete=True)
            t_img.write(urllib2.urlopen(item_url).read())
            t_img.flush()
            t_img.name = '.'.join([item_name, 'jpg'])
            d_item.picture = File(t_img)
            d_item.save()

def __call__(self, url, count_of_crawler):
    """Fetches the content at the given URL, collects all the URLs found in
    that content, and passes the first unvisited URL on to be fetched next.
    """
    try:
        page = urllib2.urlopen(url)
        soup = BeautifulSoup(page.read())
        links_on_page = map(lambda anchor: anchor.get('href'), soup.find_all('a'))
        cleaned_url = map(
            lambda link: link if urlparse(link).scheme and urlparse(url).netloc
            else (urlparse(url).scheme + "://" + urlparse(url).netloc + link
                  if link[0] == "/" else url + link),
            links_on_page
        )
        visited_url.append(url)
        total_collected_url.append(cleaned_url)

        next_url_to_visit = [next_url for next_url in cleaned_url
                             if next_url not in visited_url and "#" not in next_url][0]
        if count_of_crawler and next_url_to_visit:
            count_of_crawler = crawler(next_url_to_visit, count_of_crawler - 1)
    except:
        print "It seems there is some issue in URL " + url

    return count_of_crawler

def crawlSearch(url, pages):
    try:
        arr = []
        countS = 0  # limits how many off-site results are crawled recursively
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)
        for link in soup.findAll('a'):
            href = link.get('href')
            href_test = str(href)
            #if href_test[0]!='/' and href_test[0]!='j' and href_test!='none' and href_test[0]!='#':
            if not is_in_arr(pages, str(href)):
                if "microsoft" not in href_test and "facebook" not in href_test and "twitter" not in href_test and "google" not in href_test:
                    if href_test.startswith("http"):
                        if "bing" not in href_test:
                            if "scholarships.com" not in href_test:
                                pages.append(href)
                                print str(href)
                            else:
                                if countS < 2:
                                    crawl(href, pages)
                                    print "Crawling " + str(href)
                                    countS = countS + 1
                                else:
                                    print "Skipping " + str(href)
                    else:
                        pass
    except:
        print "Error at: " + str(url)

def show_options(id):
    r = requests.get("https://interaktiv.mx.dk/toolbox/" + votetype + "/get/" + id)
    soup2 = BeautifulSoup(r.text, "lxml")
    clear_console()
    print_logo()
    # Danish: "(Interactive version. Run the script with -h or --help for more options.)"
    print "(Interaktiv version. Kør scriptet med -h eller --help for flere indstillinger.)"
    print
    vote_text = soup2.find("div", attrs={"id": "vote_text"}).text
    print vote_text
    print
    if votetype == "advancedvotes":
        for option in soup2.find_all("div", attrs={"class": "vote_button"}):
            number = option.get("data-vote")
            text = option.text
            print "(%s) %s" % (number, text)
        print
    else:
        for option in soup2.find_all("div", attrs={"class": "vote_button"}):
            if option.get("id") == "vote_yes":
                number = "1"
            else:
                number = "0"
            text = option.text
            print "(%s) %s" % (number, text)
        print

def reverseIP(self):
    # arrange the url into the form we need (www.url.com)
    if self.url.startswith("http://"):
        url = self.url.replace("http://", "")  # replace the scheme with an empty string
    else:
        url = self.url

    # sent via POST since the page uses a form to ask for the URL to scan
    # data is the POST payload, i.e. the url
    # remoteHost is the parameter name used to send it (the url given to connection)
    data = {"remoteHost": url}
    connection = requests.post(
        # parameters required for the connection
        url="http://www.ipfingerprints.com/scripts/getReverseIP.php",
        data=data
    )

    # connection.text is the HTML returned by the request
    # BeautifulSoup parses it into something easier to work with
    # html.parser for cleaner output
    beautifulOut = BeautifulSoup(connection.text, "html.parser")

    # here we collect every link found in the tags
    response = list()

    # find_all finds every tag; 'a' filters for only that tag type
    for link in beautifulOut.find_all("a"):
        # href holds the domain name (the only part of the tag we care about)
        currentLink = link.get("href")
        response.append(currentLink[11:-2])

    return response

def crawlLinkScoial(url):
    try:
        pages = []
        arr = []
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)
        for link in soup.findAll('a'):
            href = link.get('href')
            href_test = str(href)
            #if href_test[0]!='/' and href_test[0]!='j' and href_test!='none' and href_test[0]!='#':
            if not is_in_arr(pages, str(href)):
                if "facebook" in href_test or "twitter" in href_test or "google" in href_test:
                    lin = getGoodLink(url)
                    pages.append(lin + str(href))
        newArr = deleteDuplicates(pages)
        for page in newArr:
            socialFile.write(page)
            socialFile.write("\n")
        allFile.write("Social-Media-Links: \n")
        for page in newArr:
            allFile.write(page)
            allFile.write("\n")
    except:
        print "Error at: " + str(url)

def convert_links(text, quote="\""):
    soup = BeautifulSoup(text, "html.parser")
    for t in soup.findAll(text=True):
        if has_link_parent(t):
            continue
        split = re.split(r"(?:(https?://)|(www\.))([\S]+\.[^\s<>\"\']+)", t)
        if len(split) == 1:
            continue
        r = ""
        n = 0
        split = [s or "" for s in split]
        while split:
            if n % 2 == 0:
                r += split[0]
                split.pop(0)
            else:
                r += "<a href=%shttp://%s%s%s>%s%s%s</a>" % (
                    quote, split[1], split[2], quote,
                    split[0], split[1], split[2]
                )
                split.pop(0)
                split.pop(0)
                split.pop(0)
            n += 1
        t.replaceWith(BeautifulSoup(r, "html.parser"))
    return str(soup)

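# A minimal, hedged usage sketch for convert_links. It assumes this module also
# provides the has_link_parent() helper referenced above; the sample string and
# expected output are illustrative only.
def _demo_convert_links():
    sample = "see www.example.com for details"
    html = convert_links(sample)
    # Expected: 'see <a href="http://www.example.com">www.example.com</a> for details'
    # Text nodes already inside an <a> tag are skipped via has_link_parent().
    return html
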
def parse_data(data):
    page = BeautifulSoup(data)
    results = page.find("div", id="res")
    if results is None:
        raise NoResultsException

    calc = results.find("img", src="/images/icons/onebox/calculator-40.gif")
    if calc is not None:
        calc = results.find("h2", {"class": "r"})
        if calc is not None:
            superscripts = calc.find_all("sup")
            if superscripts is not None and len(superscripts):
                for x in superscripts:
                    x.contents[0].replaceWith("^" + x.contents[0])
            return [dict(type="string", string=util.strip_html(calc).decode("utf-8"))]

    nresults = results.find_all("li", {"class": "g"})
    if len(nresults) == 0:
        raise NoResultsException

    processed_results = []
    for x in nresults:
        a_tag = x.find("a")
        if a_tag is not None:
            processed_results.append(
                dict(type="result",
                     href=urlparse.parse_qs(urlparse.urlparse(a_tag["href"]).query)["q"][0],
                     text=util.strip_html(a_tag).decode("utf-8")))

    return processed_results

def get_sp500_symbols():
    page_html = wiki_html('List_of_S%26P_500_companies', 'SP500.html')
    wiki_soup = BeautifulSoup(page_html, "html.parser")
    symbol_table = wiki_soup.find(attrs={'class': 'wikitable sortable'})

    symbol_data_list = list()

    for symbol in symbol_table.find_all("tr"):
        symbol_data_content = dict()
        symbol_raw_data = symbol.find_all("td")
        td_count = 0
        for symbol_data in symbol_raw_data:
            if td_count == 0:
                symbol_data_content['symbol'] = symbol_data.text
            elif td_count == 1:
                symbol_data_content['company'] = symbol_data.text
            elif td_count == 3:
                symbol_data_content['sector'] = symbol_data.text
            elif td_count == 4:
                symbol_data_content['industry'] = symbol_data.text
            elif td_count == 5:
                symbol_data_content['headquarters'] = symbol_data.text
            td_count += 1

        symbol_data_list.append(symbol_data_content)

    return symbol_data_list[1::]

def parse(self, response):
    logger.info("Parsing {}".format(response.url))
    soup = BeautifulSoup(response.body, "html.parser")
    trs = soup.find_all("tr", "item")
    if trs:
        for tr in trs:
            link = tr.find("a")
            article_url = DETAIL_URL.format(link["href"])
            r = scrapy.Request(article_url, callback=self.parse_article)
            yield r

    # next urls
    try:
        next_url = soup.find(class_="next").a
        cat_url = response.url
        u = urlparse(cat_url)
        query = None
        # Strip the query part
        u = u._replace(query=query)
        follow_url = urlunparse(u) + next_url["href"]
        r = scrapy.Request(follow_url, callback=self.parse)
        yield r
    except AttributeError:
        logger.info("Done with {}".format(response.url))
        pass

def _get_new_brunswick_flows(requests_obj):
    """
    Gets current electricity flows in and out of New Brunswick.

    There is no reported data timestamp in the page. The page returns
    current time and says "Times at which values are sampled may vary
    by as much as 5 minutes."
    """
    url = 'https://tso.nbpower.com/Public/en/SystemInformation_realtime.asp'
    response = requests_obj.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.find('table', attrs={'bordercolor': '#191970'})

    rows = table.find_all('tr')
    headers = rows[1].find_all('td')
    values = rows[2].find_all('td')

    flows = {headers[i].text.strip(): float(row.text.strip())
             for i, row in enumerate(values)}

    return flows

def get_page_info(id_no, s=None):
    '''
    Extract restaurant information from Charlotte's health inspection website

    INPUT:  id_no = int, id # for ESTABLISHMENT
            s = request.Session(), [OPTIONAL]
    OUTPUT: out = dict, establishment-level information
    '''
    if s is None:
        s = requests.Session()
    link = 'https://public.cdpehs.com/NCENVPBL/INSPECTION/ShowESTABLISHMENTPage.aspx'
    payload = {'ESTABLISHMENT': id_no, 'esttst_cty': 60}
    z = s.get(link, params=payload)
    soup = BeautifulSoup(z.content, from_encoding='UTF-8')
    t = soup.findAll('table')[0]
    insp_info = np.array([y.text for y in t.findAll('td', attrs={'class': 'ttc'})]).reshape(-1, 4)
    if insp_info.shape[0] < 1:
        return None
    r = t.findAll('td', attrs={'class': 'dfv'})
    rest_info = [x.text for x in r]
    return {'name':        rest_info[0],
            'address':     rest_info[2],
            'city':        rest_info[8],
            'state':       rest_info[9],
            'zip':         rest_info[10],
            'type':        rest_info[16],
            'county':      rest_info[19],
            'inspections': insp_info}

def htmlfile(url):
    r = urllib2.urlopen(url)
    soup = BeautifulSoup(r)
    html = []
    # html - title, css (body width 960px)
    html.append('<html><head><title>' + soup.title.string + '</title><link rel="stylesheet" type="text/css" href="page.css"></head><body>')
    # parses for content only in article div - depends on site obviously
    content = soup.find('div', {'class': 'layout-block-a'})
    # gets html paragraphs and h1 headings - should be altered for each website's style
    for text in content.find_all(['p', 'h1']):
        if text.name == 'p':
            html.append(str(text).decode("ascii", "ignore"))
        else:
            html.append(str(text).decode("ascii", "ignore"))
    html.append('</body></html>')
    # creates html files here
    out = open(soup.title.string + '.html', 'a')
    for line in html:
        out.write(line)
    out.close()


if __name__ == '__main__':
    main()

def getWeibos(self, keyword, page=1, count=None):
    url = 'http://t.hexun.com/k/topic.html?type=1&value=%s&pg=%d' % (
        json.dumps(keyword).replace('\\', '%').replace('"', ''), page)
    result = WeiboCrawler.request(self, url, self.headers)
    if 'result' in result and result['result']:
        infos = result['info'].decode('gb2312')
        soup = BeautifulSoup(infos)
        total_soup = soup.select('.headerR1')[0]
        # the total count is displayed as "共 N 条" (N results in total)
        total_num = total_soup.get_text().split('共')[-1].split('条')[0].strip()
        return_val = {'total_count': int(total_num), 'msgs': []}
        allmsgs = []
        msgs_soup = soup.select('.nr_con')
        for msg_soup in msgs_soup:
            avatar = 'http://t.hexun.com%s' % msg_soup.select('.nr_conLa > a')[0].get('href')
            nickandtext = msg_soup.select('.nr_shuo')[0].get_text().split(':')
            nickname = nickandtext[0]
            text = nickandtext[1]
            ts = msg_soup.select('.nr_tan > h3 > a')[0].get_text()
            allmsgs.append({
                'avatar': avatar,
                'nickname': nickname,
                'text': text,
                'datetime': ts,
            })
        return_val['msgs'] = allmsgs
        return return_val

def getCategoryUrl(site="", url=""):
    catDb = openTable(tableName=global_setting['catTable'])
    r = session.get(url)
    if not r.text:
        return False
    soup = BeautifulSoup(r.text)
    for level1 in soup.select('.classify_books'):
        curLevel1 = level1.select('.classify_title')[0].text
        curLevel1 = re.sub(r'\s', '', curLevel1)
        for level2 in level1.select('.classify_kind'):
            curLevel2 = level2.select('.classify_kind_name')[0].text
            curLevel2 = re.sub(r'\s', '', curLevel2)
            for level3 in level2.select('ul li a'):
                #curLevel3 = re.sub('\s', '', level3.text)
                curLevel3 = level3.text.strip()
                curlUrl = level3['href']
                retFind = re.findall(r'\/cp(.*)\.html', curlUrl)
                if retFind:
                    curCatID = retFind[0]
                    catType = 'book'
                else:
                    retFind = re.findall(r'\/cid(.*)\.html', curlUrl)
                    if retFind:
                        curCatID = retFind[0]
                        catType = 'nonbook'
                if retFind:
                    if catDb.find({'catId': curCatID}).count() > 0:
                        logger.debug('category %s exists, skip\n' % (curCatID))
                    else:
                        catDb.insert({'catId': curCatID, 'level1': curLevel1,
                                      'level2': curLevel2, 'level3': curLevel3,
                                      'catUrl': curlUrl, 'catType': catType,
                                      'site': site})
    return True

def extract_images(base, html):
    images = []
    soup = BeautifulSoup(html)
    for img in soup.find_all("img"):
        if img.has_attr("src"):
            images.append(urljoin(base, img["src"]))
    return images

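# A minimal, hedged usage sketch for extract_images; it assumes BeautifulSoup and
# a urljoin import are available at module level, as the function above requires.
def _demo_extract_images():
    sample = '<p><img src="/logo.png"> <img alt="no src attribute"></p>'
    urls = extract_images("http://example.com/page", sample)
    # Expected: ['http://example.com/logo.png'] -- the <img> without a src is skipped.
    return urls
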
def insert_push(self):
    uw = user_website.UserWebsite()
    userids = uw.get_user_ids_by_website_id(self.website_id)
    for id in userids:
        p = push.Push()
        p.website_id = self.website_id
        p.user_id = id
        p.title = "has new notice"

        soup_diff = BeautifulSoup(self.get_different())
        new_link_list = soup_diff.find_all('a')
        new_link_count = len(new_link_list)
        if new_link_count == 1:
            content = "one notice is published:\n"
        else:
            content = str(new_link_count) + " notices are published:\n"
        content += self.get_different()

        p.content = content
        p.content = p.content.replace('"', "'")
        p.date = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        p.website_id = self.website_id
        p.content_url = ""
        p.insert()

def prettify(which, id):
    prefix = which[0]
    bs = BeautifulSoup(open(os.path.join(root, which, id + "-" + prefix + ".xml")), 'xml')
    sgm = id + "-" + prefix + ".sgm"
    out = bs.prettify(encoding='utf-8')
    [first, rest] = out.split("\n", 1)
    # the IDs in the files look like "atwoma-b.sgm" rather than "atwoma"
    return rest.replace(sgm, id)

def Get_All_Teams():
    data_path = '../data/'

    # get the teams
    url = 'http://espn.go.com/nba/teams'
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html, 'lxml')
    # print (soup.prettify())

    tables = soup.find_all('ul', class_='medium-logos')
    tables[0].find_all('li')[0].h5.a

    name_pref_Tuples = []
    city_name_Dict = {}
    for table in tables:
        lis = table.find_all('li')
        for li in lis:
            info = li.h5.a
            team_url = info['href']
            team_name = info.text
            pref = team_url.split('/')[-2]
            city_name = ' '.join(info.text.split()[:-1])
            if team_name == 'Portland Trail Blazers':
                city_name = 'Portland'
            city_name_Dict[city_name] = team_name
            name_pref_Tuples.append((team_name, pref))

    print 'output two files: city_name.pickle and name_pref.pickle'
    print 'city_name.pickle is a dict with (city, team_name) pairs'
    print 'name_pref.pickle is a list of (team_name, team_name_prefix) tuples'
    pk.dump(city_name_Dict, open(data_path + 'city_name.pickle', 'wb'))
    pk.dump(name_pref_Tuples, open(data_path + 'name_pref.pickle', 'wb'))

def get_text_from_html(html_text):
    """Returns the text content of an HTML document,
    retaining links, references to images, and line breaks.
    """
    soup = BeautifulSoup(html_text, 'html5lib')

    # replace <a> links with plain text
    links = soup.find_all('a')
    for link in links:
        url = link.get('href', '')
        text = ''.join(link.text) or ''
        link.replaceWith(format_url_replacement(url, text))

    # replace <img> tags with plain text
    images = soup.find_all('img')
    for image in images:
        url = image.get('src', '')
        text = image.get('alt', '')
        image.replaceWith(format_url_replacement(url, text))

    # extract and join phrases
    body_element = soup.find('body')
    filter_func = lambda s: bool(s.strip())
    phrases = map(
        lambda s: s.strip(),
        filter(filter_func, body_element.get_text().split('\n'))
    )
    return '\n\n'.join(phrases)

def _login(self, username=None, store_password=False):
    if username is None:
        if self.USERNAME == "":
            raise LoginError("If you do not pass a username to login(), "
                             "you should configure a default one!")
        else:
            username = self.USERNAME

    # Get password from keyring or prompt
    password_from_keyring = keyring.get_password("astroquery:www.eso.org", username)
    if password_from_keyring is None:
        if system_tools.in_ipynb():
            log.warn("You may be using an ipython notebook:"
                     " the password form will appear in your terminal.")
        password = getpass.getpass("{0}, enter your ESO password:\n".format(username))
    else:
        password = password_from_keyring

    # Authenticate
    log.info("Authenticating {0} on www.eso.org...".format(username))
    # Do not cache pieces of the login process
    login_response = self._request("GET", "https://www.eso.org/sso/login", cache=False)
    login_result_response = self._activate_form(
        login_response, form_index=-1,
        inputs={'username': username, 'password': password})
    root = BeautifulSoup(login_result_response.content, 'html5lib')
    authenticated = not root.select('.error')
    if authenticated:
        log.info("Authentication successful!")
    else:
        log.exception("Authentication failed!")

    # When authenticated, save password in keyring if needed
    if authenticated and password_from_keyring is None and store_password:
        keyring.set_password("astroquery:www.eso.org", username, password)

    return authenticated

def get_visible_text(html):
    """returns visible text from html
    http://stackoverflow.com/a/19760007/110274
    """
    soup = BeautifulSoup(html, 'html5lib')
    [s.extract() for s in soup(['style', 'script', '[document]', 'head', 'title'])]
    return soup.get_text()

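# A minimal, hedged usage sketch for get_visible_text (assumes the html5lib parser
# used above is installed); the sample markup is illustrative only.
def _demo_get_visible_text():
    sample = ('<html><head><title>ignored</title><style>p {color: red}</style></head>'
              '<body><p>Hello</p><script>var x = 1;</script></body></html>')
    text = get_visible_text(sample)
    # <style>, <script>, <head> and <title> contents are extracted away; "Hello" remains.
    return text
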
def get_games(date, output_file=None):
    # games_url = base + '/scoreboard/' + format_date(date) + '/games.json'
    games_url = si_base + 'schedule'
    #print format_date(date)
    result = requests.get(games_url, params={'date': format_date(date)})
    #print games_url + format_date(date)
    soup = BeautifulSoup(result.text)
    #date_string = date.strftime('%B %d,%Y')
    games = soup.find_all('tr', 'component-scoreboard-list final')
    game_ids = []
    for game in games:
        game_date_elem = game.find('div', 'game-anchor')
        game_date_text = game_date_elem['id']
        game_date = date_parser.parse(game_date_text).date()
        if game_date == date:
            game_id = int(game['data-id'])
            game_ids.append(game_id)

    if output_file is not None:
        of = open(output_file, 'w')
        of.write(json.dumps({'game_date': format_date(date), 'game_ids': game_ids}))
        of.close()

    return game_ids

def getMoviesActors(movieList):
    """
    :param movieList: a mapping of formatted movie entries (each with "Url" and "ID")
    :return: a dict mapping each movie's ID to the actors in that movie, keyed by actor ID
    """
    actorsInMovies = {}

    for x in movieList:
        req = urllib.request.Request(BASE_URL + movieList[x]["Url"] + "/fullcredits")
        #print(req.full_url)

        # Header is necessary to get the right movie titles, as in the english title
        req.add_header('Accept-Language', 'en-US,en')

        # Send the request and get response
        response = urllib.request.urlopen(req)
        bsoup = BeautifulSoup(response)

        findCastList = bsoup.find("table", {"class": "cast_list"})
        findAllActors = findCastList.findAll("td", itemprop="actor")

        actors = {}
        for d in findAllActors:
            actorName = d.find("span", itemprop="name")
            actorNumber = d.find("a", href=re.compile(r"\/name\/nm"))
            actorID = re.match(r"(?:\/name\/nm)(?P<userid>\d+)", actorNumber["href"]).group("userid")
            actors[actorID] = actorName.contents[0]

        actorsInMovies[movieList[x]["ID"]] = actors

    return actorsInMovies

def get_Comics(self, name, comic_url):
    if not self.mkdir(name):
        again = ''
        while True:
            again = str(input('Directory ' + name + ' already exists, do you want to download it again? (Y/N)'))
            if again == 'Y' or again == 'N':
                break
        if again == 'N':
            print('Folder \'BLEACH/' + name + '\' already exists!')
            return
        else:
            shutil.rmtree(self.path)
            self.mkdir(name)

    # Parse html
    page_url = self.prefix + comic_url
    data = urllib.request.urlopen(page_url).read().decode('utf-8', 'ignore')
    data.encode('utf-8')
    soup = BeautifulSoup(data, 'lxml')
    lists = soup.findAll('img', {'class': 'BDE_Image'})

    print('Downloading: ' + name)
    # Define progress bar's length
    progress_bar = tqdm(unit='Pic', total=len(lists))

    count = 0
    for each in lists:
        pic_url = each['src']
        # number the file and keep the extension from the picture url, e.g. 000.jpg
        filename = '%03d' % count + '.' + pic_url.split('.')[-1]
        urllib.request.urlretrieve(pic_url, filename=self.path + '/' + filename)
        progress_bar.update(1)
        count = count + 1

    # Close bar
    progress_bar.close()