def get_comment(movieid):
    """Scrape Douban short comments for *movieid* over 25 pages.

    Appends "name: comment votes" lines to a local text file and returns
    the list of comment texts.

    :param movieid: Douban subject id as a string (it is concatenated into the URL).
    :return: list of comment strings.
    """
    pagenum = 25
    comment_list = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/60.0.3112.101 Safari/537.36'
    }
    for i in range(pagenum):
        start = i * 20
        # BUG FIX: 'status=P' was concatenated without '&', producing the
        # malformed query 'limit=20status=P'.
        url = ("https://movie.douban.com/subject/" + movieid + "/comments" + "?" +
               "start=" + str(start) + "&limit=20" + "&status=P")
        # BUG FIX: headers must be passed as a keyword argument; as the second
        # positional argument of requests.get it was sent as query *params*.
        r = requests.get(url, headers=headers)
        websoup = B(r.text, "html.parser")
        div_list = websoup.find("div", id="wrapper").find_all('div', class_="comment")
        for link in div_list:
            comment_name = link.find("a", class_="").get_text()
            comment = link.find("span", class_="short").get_text().replace(
                ' ', '').replace('\n', '')
            vote = link.find("span", class_="votes").get_text()
            comment_list.append(comment)
            with open('D:\\python_study\\Spider_Douban\\comment.txt', 'a+',
                      encoding="utf-8") as f:
                f.write(comment_name + ": " + comment + " " + vote + "\r\n")
    return comment_list
def english(art_album, sing):
    """Fetch an azlyrics.com page and print artist/album heading, song title, and lyrics.

    BUG FIX: the body previously referenced undefined globals ``artistic`` and
    ``sing_song``; it now uses the actual parameters.

    :param art_album: artist path segment of the azlyrics URL.
    :param sing: song path segment of the azlyrics URL.
    """
    ext = ".html"
    url = "https://www.azlyrics.com/lyrics"
    links = "".join([url, "/", art_album, "/", sing, ext])
    page = requests.get(links)
    # Parsing the HTML request
    soup = B(page.content, "html.parser")
    # Finding Artist/Album name
    heading = soup.find("div", class_="lyricsh")
    # Extracting Artist/Album name
    heading_extr = heading.get_text()
    print(heading_extr)
    # Finding the lyrics name
    lyrics_name = soup.find_all("b")[1]
    # Extracting the lyrics name
    name_extr = lyrics_name.get_text()
    print(name_extr)
    # Finding the lyrics — div index 20 is where azlyrics keeps the lyric body
    # (position-based; fragile if the site layout changes).
    lyrics = soup.find_all("div")[20]
    # Extracting the lyrics
    lyrics_extr = lyrics.get_text()
    print(lyrics_extr)
def clean(review):
    """Strip HTML from *review*, keep letters only, lowercase, and drop stopwords.

    Returns the cleaned words joined by single spaces.
    """
    stripped = B(review).get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", stripped)
    words = letters_only.lower().split()
    kept = [word for word in words if word not in stop]
    return " ".join(kept)
def parse(self, html):
    """Split a company-detail page into second-level sections and dispatch parsers.

    The page is sliced with regexes between successive <h2> (first-level) and
    <h3> (second-level) headings, then each second-level chunk is routed to
    self.base_parse / self.table_parse by its Chinese title.
    NOTE(review): indentation reconstructed from a whitespace-mangled source;
    no return statement is visible — the block may be truncated.
    """
    # Slice out the first-level headings (h2 tags as raw strings).
    first_level_tags = [str(_) for _ in html.find_all('h2')]
    all_tags = []
    # HTML between each pair of h2 tags = one top-level module
    # (company background, shareholders, investments, ...).
    for i in range(len(first_level_tags)):
        if i == len(first_level_tags) - 1:
            # Last heading: capture everything to the end.
            first_pattern_str = '({}.*)'.format(first_level_tags[i])
        else:
            # Non-greedy capture up to the next h2.
            first_pattern_str = '({}.*?){}'.format(first_level_tags[i], first_level_tags[i + 1])
        first_level_html = re.search(first_pattern_str, str(html)).group(1)
        # Find the second-level headings (h3) inside this module.
        second_level_tags = [str(_) for _ in B(first_level_html, 'html.parser').find_all('h3')]
        for j in range(len(second_level_tags)):
            if j == len(second_level_tags) - 1:
                second_pattern_str = '({}.*)'.format(second_level_tags[j])
            else:
                second_pattern_str = '({}.*?){}'.format(second_level_tags[j], second_level_tags[j + 1])
            second_level_html = re.search(second_pattern_str, first_level_html).group(1)
            all_tags.append(second_level_html)
    dic = { }
    # Maps Chinese section titles to output keys.
    second_level_dic = {
        '工商信息': 'baseInfo',
        '分支机构': 'branch',
        '变更记录': 'changeInfo',
        '主要人员': 'staffCount',
        '股东信息': 'holderInfo',
    }
    for _ in all_tags:
        html = B(_, 'html.parser')
        title = html.find('h3').find('span').get_text(strip=True)
        if '工商信息' in title:
            # Business-registration section uses a dedicated parser.
            data = self.base_parse(html)
            if data:
                dic[second_level_dic[title]] = data
        elif title in ['分支机构','变更记录']:
            # Branch / change-record sections are plain tables.
            data = self.table_parse(html)
            if data:
                dic[second_level_dic[title]] = data
def parsing_page(data):
    """Walk the listing page markup, saving every post image and printing a count."""
    soup = B(data, 'html.parser')
    collected = []
    # The first 'single-post' div is skipped (not a real post).
    for post in soup.find_all('div', class_='single-post')[1:]:
        title = post.find('h2').text.strip()
        print(title)
        # The image URL lives in the inline style attribute, second token.
        image_link = post.find('img')['style'].split()[1]
        collected.append(image_link)
        saving_img(title, image_link)
    print('Total image link got ={0}'.format(len(collected)))
def cityinfo(bot, update):
    """Telegram handler: reply with the info-table rows for the requested city."""
    text = update.message.text
    # Drop the command prefix (first 5 characters) to get the city slug.
    city = text[5:]
    page = urlopen('http://nesiditsa.ru/city/' + city)
    soup = B(page, "html.parser")
    info_block = soup.find('div', 'city-info-block row')
    cells = info_block.find_all('td')
    #joke_list = [i.find('div', 'anekdot_text').text for i in content]
    chat = update.message.chat_id
    bot.sendMessage(chat_id=chat, text=city[0].upper() + city[1:] + ' city:')
    for cell in cells:
        bot.sendMessage(chat_id=chat, text=cell.text)
def parsing_page(url):
    """Fetch *url* with a browser UA and feed every table row (minus the header) to parser/save_to_db."""
    header = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:69.0) Gecko/20100101 Firefox/69.0'
    }
    page = requests.get(url, headers=header)
    print('Received status code {0}'.format(page.status_code))
    soup = B(page.content, 'html.parser')
    # Skip the first <tr>, which is the column header.
    for row in soup.find_all('tr')[1:]:
        print('working on it')
        try:
            save_to_db(parser(row))
        except Exception as e:
            print('Something happen wrong {0}'.format(e))
def hindi(sing):
    """Fetch a gaana.com lyrics page and print the song title and lyrics.

    BUG FIX: the body previously referenced an undefined global ``sing_song``;
    it now uses the actual parameter.

    :param sing: song path segment of the gaana.com lyrics URL.
    """
    ext = ".html"
    url = "https://gaana.com/lyrics/amp/"
    links = "".join([url, sing, ext])
    page = requests.get(links)
    # Parsing the HTML request
    soup = B(page.content, "html.parser")
    # Finding the lyrics name
    lyrics_name = soup.find_all("li", class_="current")[0]
    # Extracting the lyrics name
    name_extr = lyrics_name.get_text()
    print(name_extr)
    print()
    # Finding the lyrics
    lyrics = soup.find_all("pre")[0]
    # Extracting the lyrics
    lyrics_extr = lyrics.get_text()
    print(lyrics_extr)
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError
from bs4 import BeautifulSoup as B

# Scrape coreyms.com: print the title, summary paragraph, and embedded
# YouTube iframe tag of every article on the front page.
try:
    html = urlopen('https://coreyms.com/')
except HTTPError as e:
    # BUG FIX: previously execution fell through after printing the error and
    # crashed with NameError because `html` was never bound; abort instead.
    print(e)
    raise SystemExit(1)
except URLError as e:
    print('The server is not found!')
    raise SystemExit(1)

bsobj = B(html.read(), 'lxml')
for article in bsobj.find_all('article'):
    title = article.h2.a.text
    content = article.div.p.text
    link = article.find('iframe', class_='youtube-player')
    print(title)
    print(content)
    print(link)
    print()
import requests as rq
from bs4 import BeautifulSoup as B

# Scrapes a Nexacro-rendered university notice board.
# NOTE(review): this fragment is broken/truncated — see inline notes.
base_url='https://edward.kmu.ac.kr/nx/'
#page_path='/page%d'
#page=2
res=rq.get(base_url)
soup=B(res.content, 'lxml')
posts=soup.select('body div.mainframe_VFrameSet_HFrameSet_VFrameSet1_WorkFrame_Child_M503056_form_div_Work_Tab01_tabpage9_grd_scho101_body_gridrow_7_cell_7_4GridCellTextContainerElement')
# NOTE(review): the next line is a bare identifier statement — it raises
# NameError at runtime; it looks like a stray paste of another CSS selector.
mainframe_VFrameSet_HFrameSet_VFrameSet1_WorkFrame_Child_M503056_form_div_Work_Tab01_tabpage9_grd_scho101_body_gridrow_4_cell_4_4GridCellTextContainerElement
for post in posts:
    title=post.find('h3').text.strip()
    descript=post.find('h4').text.strip()
    author=post.find('span').text.strip()
    print(title, descript, author +"\n")
while True:
    # NOTE(review): page_path and page are commented out above, so this loop
    # raises NameError as written; the trailing `for post in posts:` below has
    # no body — the fragment appears truncated at this point.
    sub_path=page_path%(page)
    page+=1
    res=rq.get(base_url + sub_path)
    if(res.status_code !=200):
        break
    soup=B(res.content, 'lxml')
    posts=soup.select('body main.page-content div.wrapper div.home div.p')
    for post in posts:
def get_random_joke():
    """Scrape a random page of anekdotme.ru and return one random joke text."""
    html = urlopen('http://anekdotme.ru/lenta/page_' + str(randint(1, 464)))
    soup = B(html)
    content = soup.find_all('div', 'anekdot')
    joke_list = [i.find('div', 'anekdot_text').text for i in content]
    # BUG FIX: randint's upper bound is inclusive, so randint(0, len(joke_list))
    # could index one past the end and raise IndexError.
    return str(joke_list[randint(0, len(joke_list) - 1)])
# -*- coding: utf-8 -*- """ Created on Fri Feb 23 19:04:11 2018 @author: arvin """ from bs4 import BeautifulSoup as B from urllib.request import Request, urlopen req = Request( 'https://www.quora.com/search?q=US+Mortgage', headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36' }) page = urlopen(req).read() soup = B(page, 'html.parser') #print(soup) res = soup.find_all('span', class_="question_text") for i in res: #print(i.string) print(i.get_text()) #To get only the text #questions = soup.find(class_='Question')
# NOTE(review): Tianyancha search-scraper fragment. SECURITY: the Cookie
# header below embeds a personal auth token and phone number — it should be
# moved out of source control. The fragment is truncated: the Cookie string
# literal is split across the next two lines and the trailing key_list
# literal is cut off mid-definition, so this text does not parse as-is.
#-*- coding:UTF-8 -*- import requests,re from common import redis_conn from bs4 import BeautifulSoup as B url = "https://www.tianyancha.com/search?key=%E5%BA%B7%E4%B8%96%E4%BF%AD" headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'Cookie': 'ssuid=7703879792; TYCID=8971320038af11e99421a35550a0a6c7; undefined=8971320038af11e99421a35550a0a6c7; _ga=GA1.2.210168479.1551066219; tyc-user-info=%257B%2522claimEditPoint%2522%253A%25220%2522%252C%2522myAnswerCount%2522%253A%25220%2522%252C%2522myQuestionCount%2522%253A%25220%2522%252C%2522explainPoint%2522%253A%25220%2522%252C%2522privateMessagePointWeb%2522%253A%25220%2522%252C%2522nickname%2522%253A%2522%25E9%2583%25AD%25E8%2594%25B7%25E8%2596%2587%2522%252C%2522integrity%2522%253A%25220%2525%2522%252C%2522privateMessagePoint%2522%253A%25220%2522%252C%2522state%2522%253A%25220%2522%252C%2522announcementPoint%2522%253A%25220%2522%252C%2522isClaim%2522%253A%25220%2522%252C%2522vipManager%2522%253A%25220%2522%252C%2522discussCommendCount%2522%253A%25221%2522%252C%2522monitorUnreadCount%2522%253A%2522129%2522%252C%2522onum%2522%253A%252240%2522%252C%2522claimPoint%2522%253A%25220%2522%252C%2522token%2522%253A%2522eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTg0NDUwMTE0MiIsImlhdCI6MTU1MTA2NjI0MiwiZXhwIjoxNTY2NjE4MjQyfQ.6BJfIf_rAdYIwkneCRXeic9ZtL7xY4mGErRIZo_vCWhqC6k8-POwOQn95M24lAnY6CrFZE2NIwmNtOglyR5_zA%2522%252C%2522pleaseAnswerCount%2522%253A%25221%2522%252C%2522redPoint%2522%253A%25220%2522%252C%2522bizCardUnread%2522%253A%25220%2522%252C%2522vnum%2522%253A%25220%2522%252C%2522mobile%2522%253A%252215844501142%2522%257D; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTg0NDUwMTE0MiIsImlhdCI6MTU1MTA2NjI0MiwiZXhwIjoxNTY2NjE4MjQyfQ.6BJfIf_rAdYIwkneCRXeic9ZtL7xY4mGErRIZo_vCWhqC6k8-POwOQn95M24lAnY6CrFZE2NIwmNtOglyR5_zA; __insp_ss=1551075666199; aliyungf_tc=AQAAAIRlRSq+mAQAQBSWtu0EKZQZqkeP; csrfToken=HKgW16znHhTzPMQsyL4_cSYd; 
Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1551066219,1551403276,1551659277; refresh_page=null; bannerFlag=true; _gid=GA1.2.838233971.1551920883; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1551947451; __insp_wid=677961980; __insp_slim=1551947452884; __insp_nv=true; __insp_targlpu=aHR0cHM6Ly93d3cudGlhbnlhbmNoYS5jb20vc2VhcmNoP2tleT0lRTUlQkElQjclRTQlQjglOTYlRTQlQkYlQUQ%3D; __insp_targlpt=5bq35LiW5L_tX_ebuOWFs_aQnOe0oue7k_aenC3lpKnnnLzmn6U%3D; __insp_norec_sess=true; __insp_slim=1551920897962', 'Host': 'www.tianyancha.com', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36', } s = requests.session() res = s.get(url,headers=headers) html = B(res.text, 'html.parser') spans = html.find_all('span',class_='tt hidden') key_list = [ 'phoneList', 'emailList', 'id', 'name', 'regStatus', 'base', 'regCapital', 'estiblishTime', 'creditCode', 'regLocation', 'businessScope', 'categoryStr', 'city',
"Host":"www.smzdm.com", "Referer":"https://www.smzdm.com/tag/%E6%AF%8F%E5%A4%A9%E5%88%B7%E4%BB%80%E4%B9%88/youhui/", "Upgrade-Insecure-Requests":"1", "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36" } todoList=[] taskList=[] infos=[] lnkPrefix="https:" lnk="https://www.smzdm.com/tag/%E6%AF%8F%E5%A4%A9%E5%88%B7%E4%BB%80%E4%B9%88/youhui/" visited=[lnk] rp=ses.get(lnk) s=B(rp.text,"html.parser") updateTodo(s) getMsg(s) while len(todoList)!=0 or len(taskList)!=0: if len(todoList)==0: pass else: visited.append(todoList.pop()) lnk=lnkPrefix+visited[-1] #print("Getting %s"%lnk) tr.Thread(target=nowtask,args=(lnk,)).start() with open("DumpResource2.txt","w") as f:
def request_jisho(self, key):
    """Fetch the jisho.org search page for *key* and return it parsed with lxml."""
    markup = requests.get(f'https://jisho.org/search/{key}').content
    return B(markup, 'lxml')
"https://www.smzdm.com/tag/%E6%AF%8F%E5%A4%A9%E5%88%B7%E4%BB%80%E4%B9%88/youhui/", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36" } todoList = [] infos = [] lnkPrefix = "https:" lnk = "https://www.smzdm.com/tag/%E6%AF%8F%E5%A4%A9%E5%88%B7%E4%BB%80%E4%B9%88/youhui/" visited = [lnk] rp = ses.get(lnk) s = B(rp.text, "html.parser") updateTodo(s) getMsg(s) while len(todoList) != 0: visited.append(todoList.pop()) lnk = lnkPrefix + visited[-1] #print("Getting %s"%lnk) s = B(ses.get(lnk).text, "html.parser") updateTodo(s) getMsg(s) with open("DumpResource.txt", "w") as f: f.write(json.dumps(infos)) print(time.time() - timeS)
# http://docs.python-guide.org/en/latest/scenarios/scrape/
import time as t
import requests as r
from lxml import html
from bs4 import BeautifulSoup as B

# Collect Dota 2 match ids from several Dotabuff e-sports listing pages and
# write them, one per line, to matchids.txt.
base_url = 'http://www.dotabuff.com/esports'
pages = [
    '/events/121/series', '/leagues/4716/series', '/leagues/4716/series?page=2',
    '/events/112/series', '/leagues/4700/series'
]
all_match_ids = set()
for page in pages:
    t.sleep(1)  # be polite: at most one request per second
    p = r.get(base_url + page, headers={'User-agent': 'Mozilla/5.0'})
    soup = B(p.content)
    for link in soup.find_all('a'):
        # BUG FIX: link.get('href') is None for anchors without href, and the
        # .split call sat OUTSIDE the old try block, so it crashed the script.
        href = link.get('href')
        if not href:
            continue
        parts = href.split('/')
        # Match links look like /matches/<id>/...
        if len(parts) > 2 and parts[1] == 'matches':
            all_match_ids.add(parts[2])

# BUG FIX: use a context manager so the file is closed even if a write fails.
with open('matchids.txt', 'w') as f:
    for match in all_match_ids:
        f.write(match + '\n')
def crawler(url, neighbour_names, hrff, name, dictf):
    """Scrape one Zillow region page into a flat metrics dict and queue nearby regions.

    Each metric defaults to "Nodata"; every section is scraped inside its own
    try/except so a missing section leaves the default in place.
    Appends the result dict to *dictf* and appends newly-seen neighborhood
    names/links to *neighbour_names*/*hrff* (all mutated in place).
    NOTE(review): indentation reconstructed from a whitespace-mangled source.
    """
    header = {
        "dnt": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
    }
    # NOTE(review): local name `re` shadows the regex module within this function.
    re = requests.get(url, headers=header)
    base = B(re.content, 'html.parser')
    data_dictionary = {
        "Location": name,
        "zillow-value": "Nodata",
        "one-year-change": "Nodata",
        "one-year-forcast": "Nodata",
        "market-temperature": "Nodata",
        "price-sqft": "Nodata",
        "median-listing-price": "Nodata",
        "median-sale-price": "Nodata",
        "avg-days-on-market": "Nodata",
        "negative-equity": "Nodata",
        "delinquincy": "Nodata",
        "rent-list-price": "Nodata",
        "rent-sqft": "Nodata",
    }
    # Market temperature (e.g. hot/cold gauge).
    try:
        market_temp = base.find('div', {'class': 'market-temperature'})
        temperature = market_temp.find('div', {'class': 'zsg-h2'}).text
        data_dictionary['market-temperature'] = temperature
    except:
        pass
    # One-year change from the forecast chart; the fineprint span is removed
    # first so only the percentage text remains.
    try:
        outer = base.find('section', {'class': 'zm-forecast-chart'})
        content_box = outer.find('ul', {'class': 'zsg-g'})
        all_li = content_box.find('li', {'class': 'zsg-lg-1-2'})
        sp = all_li.find('span', {'class': 'zsg-fineprint'})
        sp.decompose()
        all_li = content_box.find('li', {'class': 'zsg-lg-1-2'}).text
        temp = all_li.replace(" ", "")
        temp = temp.replace("\n", "")
        temp = temp.replace("%", "")
        data_dictionary['one-year-change'] = temp
    except:
        pass
    # Market overview: value, forecast, median listing/sale prices (in order).
    try:
        outer = base.find('section', {'class': 'zsg-content-section market-overview'})
        content_box = outer.find('ul', {'class': 'value-info-list'})
        all_li = content_box.find_all('li')
        ab = []
        for i in all_li:
            temp = i.find('span', {'class': 'value'}).text
            temp = temp.replace(" ", "")
            temp = temp.replace("\n", "")
            temp = temp.replace("%", "")
            temp = temp.replace("$", "")
            ab.append(temp)
        data_dictionary['zillow-value'] = ab[0]
        data_dictionary['one-year-forcast'] = ab[1]
        data_dictionary['median-listing-price'] = ab[2]
        data_dictionary['median-sale-price'] = ab[3]
    except:
        pass
    # Market health: days-on-market may be absent, hence the length check.
    try:
        outer = base.find('section', {'class': 'zsg-content-section market-health'})
        content_box = outer.find('ul', {'class': 'value-info-list'})
        all_li = content_box.find_all('li')
        ab = []
        for i in all_li:
            temp = i.find('span', {'class': 'value'}).text
            temp = temp.replace(" ", "")
            temp = temp.replace("\n", "")
            temp = temp.replace("%", "")
            ab.append(temp)
        if len(ab) >= 3:
            data_dictionary['avg-days-on-market'] = ab[0]
            data_dictionary['negative-equity'] = ab[1]
            data_dictionary['delinquincy'] = ab[2]
        else:
            data_dictionary['negative-equity'] = ab[0]
            data_dictionary['delinquincy'] = ab[1]
    except:
        pass
    # Convert scraped percentages to fractions (e.g. "12.3" -> 0.123).
    if data_dictionary['negative-equity'] != 'Nodata':
        data_dictionary['negative-equity'] = round(
            float(data_dictionary['negative-equity']) / 100, 3)
    if data_dictionary['delinquincy'] != 'Nodata':
        data_dictionary['delinquincy'] = round(
            float(data_dictionary['delinquincy']) / 100, 3)
    # Price per square foot.
    try:
        outer = base.find('section', {'class': 'zsg-content-section listing-to-sales'})
        content_box = outer.find('ul', {'class': 'value-info-list'})
        all_li = content_box.find_all('li')
        ab = []
        for i in all_li:
            temp = i.find('span', {'class': 'value'}).text
            temp = temp.replace(" ", "")
            temp = temp.replace("\n", "")
            temp = temp.replace("$", "")
            ab.append(temp)
        data_dictionary['price-sqft'] = ab[0]
    except:
        pass
    # Rental metrics come from the second region-info section.
    try:
        outer = base.find_all('section', {'class': 'zsg-content-section region-info'})
        content_box = outer[1].find('ul', {'class': 'value-info-list'})
        spans = content_box.find_all('span', {'class': 'value'})
        ab = []
        for i in spans:
            temp = i.text
            temp = temp.replace(" ", "")
            temp = temp.replace("\n", "")
            temp = temp.replace("$", "")
            ab.append(temp)
        data_dictionary['rent-list-price'] = ab[1]
        data_dictionary['rent-sqft'] = ab[2]
    except:
        pass
    print(data_dictionary)
    dictf.append(data_dictionary)
    # Queue not-yet-seen nearby neighborhoods for the caller to crawl next.
    try:
        nearby = base.find('section', {'class': 'zsg-content-section nearby-regions'})
        neighbourhoods = nearby.find('div', {
            'class': 'zsg-content-section'
        }).text.split()
        if 'Neighborhoods' in neighbourhoods:
            tables = nearby.find_all('table')
            for k in tables:
                at = k.find_all('a')
                for p in at:
                    n_n = p.text
                    n_l = p['href']
                    if n_n not in neighbour_names and n_l not in hrff:
                        neighbour_names.append(n_n)
                        hrff.append(n_l)
    except:
        pass
def update_moose_tcg():
    """Reprice every listed Moose Loot card on TCGplayer.

    For each listed (non-Unopened) card whose current price differs from the
    low price: non-English non-foils are simply priced at low - $0.01; for all
    others the TCGplayer seller listings are scraped for the two cheapest
    listings from high-volume sellers (>= 10,000 sales) and the result is fed
    into moose_price_algorithm. Results are saved to MooseInventory and pushed
    back via api.update_sku_price; errors and timing are reported by e-mail.
    NOTE(review): indentation reconstructed from a whitespace-mangled source —
    the nesting of the post-while pricing block is the most plausible reading.
    """
    moose_inventory = MooseInventory.objects
    start_time = time()
    # Entire Moose Loot Listed inventory
    listed_cards = api.get_category_skus('magic')
    if listed_cards['success'] is True:
        print(f"Updating {listed_cards['totalItems']} for Moose Inventory")
        for index, card in enumerate(listed_cards['results']):
            try:
                condition = card['conditionName']
                printing = card['printingName']
                print(index)
                if condition != 'Unopened':
                    current_price = card['currentPrice']
                    low = card['lowPrice']
                    if current_price != low:
                        sku = card['skuId']
                        product_id = card['productId']
                        name = card['productName']
                        expansion = card['groupName']
                        market = card['marketPrice']
                        language = card['languageName']
                        '''
                        If the card is not English it will be priced at the
                        low price minus one cent.

                        For each card in the MooseLoot inventory we will make
                        a request to the tcgplayer page containing all seller
                        data for a given product. We request and scan pages
                        (10 results per page) until we find 2 listings with
                        sellers that have 10,000 sales or more. We break the
                        while loop once we have found those two listings and
                        move on to the next card. In the case where only one
                        or zero listings are found, we break the loop and use
                        one price to match against or default to the market
                        price.
                        '''
                        if language != 'English' and printing != 'Foil':
                            # catch instances where there is no low price
                            try:
                                updated_price = low - .01
                            except TypeError:
                                updated_price = None
                            if updated_price is not None:
                                api.update_sku_price(sku_id=sku,
                                                     price=updated_price,
                                                     _json=True)
                        else:
                            # NOTE(review): 'Updated_price' (capital U) is
                            # initialized here, but the code later writes the
                            # key 'updated_price' — likely a latent typo.
                            card_data = {
                                'card_name': '',
                                'card_set': '',
                                'card_condition': '',
                                'seller_1_name': '',
                                'seller_1_total_sales': '',
                                'seller_2_name': '',
                                'seller_2_total_sales': '',
                                'seller_1_total_price': '',
                                'seller_2_total_price': '',
                                'Updated_price': '',
                            }
                            next_page = True
                            page = 1
                            seller_data_list = []
                            while next_page is True:
                                request_path = url(product_id=product_id,
                                                   condition=condition,
                                                   foil=printing,
                                                   page=page)
                                r = requests.get(request_path).content
                                soup = B(r, 'html.parser')
                                data = soup.find_all(
                                    'div', {'class': 'product-listing '})
                                # Check if there are products in the request.
                                # If not, that indicates no more listings and
                                # thus we break the loop.
                                if not data:
                                    break
                                # loop over each item on the page and get Seller Info
                                for d in data:
                                    check = d.find('span', {'class': 'seller__sales'})
                                    if check is not None:
                                        seller_total_sales = integers_from_string(
                                            d.find('span', {
                                                'class': 'seller__sales'
                                            }).text)
                                        seller_name = d.find(
                                            'a', {
                                                'class': 'seller__name'
                                            }).text.strip()
                                        seller_condition = d.find(
                                            'div', {
                                                'class': 'product-listing__condition'
                                            }).text.strip()
                                        if seller_total_sales >= 10000 and seller_name != 'MTGFirst' and seller_name != 'Moose Loot' and condition == seller_condition:
                                            # seller_feedback = d.find('span', {'class': 'seller__feedback-rating'}).text
                                            # function extracts all floating points from string.
                                            price = float_from_string(
                                                d.find(
                                                    'span', {
                                                        'class': 'product-listing__price'
                                                    }).text)
                                            # Fail Safe in the case where html is changed and no real value is extracted.
                                            # NOTE(review): `price is not 0` compares identity, not equality —
                                            # should be `price != 0` (SyntaxWarning on modern Python).
                                            if price is not None and price is not 0:
                                                shipping = float_from_string(
                                                    d.find(
                                                        'span', {
                                                            'class': 'product-listing__shipping'
                                                        }).text.strip())
                                                # 25 would be extracted from shipping text that
                                                # states "Free shipping over 25". We make this
                                                # result 0 and handle additional shipping costs
                                                # using defaults.
                                                if shipping == 25.:
                                                    shipping = 0
                                                # Default shipping added to cards under five.
                                                if price >= 5:
                                                    total_price = price + shipping
                                                else:
                                                    total_price = price
                                                # We are appending the two cheapest listings with
                                                # 10,000 minimum sales and that meet the other if
                                                # requirements. Break once we get 2.
                                                seller_data_list.append(
                                                    total_price)
                                                if len(seller_data_list) == 1:
                                                    card_data[
                                                        'seller_1_name'] = seller_name
                                                    card_data[
                                                        'seller_1_total_sales'] = seller_total_sales
                                                    card_data[
                                                        'seller_1_total_price'] = total_price
                                                    card_data[
                                                        'card_name'] = name
                                                    card_data[
                                                        'card_set'] = expansion
                                                    card_data[
                                                        'card_condition'] = condition
                                                if len(seller_data_list) == 2:
                                                    card_data[
                                                        'seller_2_name'] = seller_name
                                                    card_data[
                                                        'seller_2_total_sales'] = seller_total_sales
                                                    card_data[
                                                        'seller_2_total_price'] = total_price
                                                    next_page = False
                                                    break
                                page += 1
                            '''
                            We will check the number of other seller listings.
                            If there were zero listings found we simply make
                            the updated price the market price. If just one
                            listing is found, we run the price algorithm which
                            will just add shipping if default and price .01c
                            less. If there are 2 10,000+ listings, algorithm
                            will compare and take the best/cheapest listings
                            price.
                            '''
                            if len(seller_data_list) == 1:
                                seller_data_list.append(0)
                            updated_price = moose_price_algorithm(
                                seller_data_list=seller_data_list,
                                market_price=market,
                                low_price=low,
                                condition=condition)
                            card_data['updated_price'] = updated_price
                            new = moose_inventory.create(
                                name=card_data['card_name'],
                                expansion=card_data['card_set'],
                                condition=card_data['card_condition'],
                                printing=printing,
                                seller_1_name=card_data['seller_1_name'],
                                seller_1_total_sales=card_data[
                                    'seller_1_total_sales'],
                                seller_1_total_price=card_data[
                                    'seller_1_total_price'],
                                seller_2_name=card_data['seller_2_name'],
                                seller_2_total_sales=card_data[
                                    'seller_2_total_sales'],
                                seller_2_total_price=card_data[
                                    'seller_2_total_price'],
                                updated_price=card_data['updated_price'],
                            )
                            new.save()
                            if updated_price is not None:
                                api.update_sku_price(sku_id=sku,
                                                     price=updated_price,
                                                     _json=True)
                            if index < 100:
                                print(index, name, expansion, condition, printing)
                                print(
                                    f"Current: {current_price}, Market: {market}, low: {low}, Updated: {updated_price}"
                                )
            except Exception as e:
                # Any per-card failure is reported by e-mail and the loop continues.
                print(e)
                subject = "Error on function to update MooseLoot tcg"
                message = f"Error on function to update MooseLoot tcg:\n {card}\n\nSeller Info: {seller_name, seller_total_sales}"
                mail_from = 'tcgfirst'
                mail_to = [
                    '*****@*****.**',
                ]
                send_mail(subject, message, mail_from, mail_to)
    # Report total cycle time by e-mail.
    end_time = time()
    elapsed = end_time - start_time
    subject = "Time elapsed for Moose Tcg Auto Price - 1 cycle"
    message = f"Time auto price completed: {elapsed} seconds"
    mail_from = 'tcgfirst'
    mail_to = [
        '*****@*****.**',
    ]
    send_mail(subject, message, mail_from, mail_to)
# Selenium flow: log in to bananamall.co.kr, open a brand listing page, and
# collect (brand name, brand URL) pairs for subsequent per-brand scraping.
# SECURITY NOTE(review): username and password are hard-coded below — move
# them to configuration/environment variables.
# NOTE(review): fragment is truncated — the final `while` loop has no body.
driver = webdriver.Chrome(chromedriver, options=headless_options)
driver.get('https://www.bananamall.co.kr/index.php')
id_field = driver.find_element_by_name("id")
id_field.clear()
id_field.send_keys('amen03')
pw_field = driver.find_element_by_name("passwd")
pw_field.send_keys('2435570js!')
pw_field.send_keys(Keys.RETURN)
time.sleep(2)
driver.get('https://www.bananamall.co.kr/etc/womanizer_sp_v1.php?cl=womanizer')
html = driver.page_source
soup = B(html, 'html.parser')
# Brand names via Selenium XPath; matching URLs via BeautifulSoup CSS select
# (the two lists are assumed to line up index-for-index).
brand_name = driver.find_elements_by_xpath(
    '/html/body/div[2]/div[2]/div[6]/div[2]/ul/li/a/span[2]')
brand_url = soup.select(
    'div.contents > div.brands.brands_cate.brands_cate_whole.clearfix > ul > li > a'
)
brand_list = []
for num, item in enumerate(brand_name):
    brand_list.append([item.text, brand_url[num]['href']])
for item_url in brand_list:
    driver.get('https://www.bananamall.co.kr' + item_url[1])
    loop, count = True, 0
    while loop and count < 10:
def btc():
    """Scrape bitkurs.ru and report the current BTC price in USD and RUB."""
    page = urlopen('http://bitkurs.ru/')
    soup = B(page)
    dollars = soup.find('span', 'usd_c currencies').text
    roubles = soup.find('span', 'rub_c currencies').text
    return '1 BTC = %s или %s' % (dollars, roubles)
# Admin order-list crawler for msdepart.com: read the last page number from
# the pagination widget, walk every page of the date-filtered order list, and
# open each order's detail view.
# NOTE(review): fragment is truncated — `start_data`, `end_date`, `delay`,
# `num1`/`num2` usage, and the tail of the detail-view loop are out of view.
end_page = int(
    driver.find_element_by_css_selector(
        '#container > div > nav > span > strong').text) + 1
print(end_page)
num1 = 1
for page in range(1, end_page):
    driver.get(
        'https://msdepart.com/adm/shop_admin/orderlist.php?od_status=&od_settle_case=&od_misu=&od_cancel_price=&od_refund_price=&od_receipt_point=&od_coupon=&fr_date='
        + start_data + '&to_date=' + end_date +
        '&sel_field=od_id&search=&save_search=&sort1=od_id&sort2=desc&page=' +
        str(page))
    html = driver.page_source
    soup = B(html, 'html.parser')
    # Hidden spans carry the order ids for every row of the list table.
    data_raw = soup.select(
        '#sodr_list > tbody > tr > td.td_mng.td_mng_s > a > span.sound_only')
    driver.implicitly_wait(delay)
    num2 = 0
    for index in data_raw:
        site = 'https://msdepart.com/shop/orderinquiryview.php?od_id=' + index.text
        driver.get(site)
        driver.implicitly_wait(delay)
reader = csv.DictReader(csvfile) # reader = itertools.islice(csv.DictReader(csvfile), 1000) for row in reader: url = row['url'] if (url): wd = webdriver.Chrome() wd.get(url) # And grab the page HTML source html_page = wd.page_source wd.quit() # Now you can use html_page as you like soup = B(html_page, "lxml") image_url = soup.find('link', { 'itemprop': 'image' }).get('href') else: image_url = 'https://vignette.wikia.nocookie.net/international-entertainment-project/images/9/94/SpongeBob_SquarePants_%28SpongeBob_SquarePants%29.png' #failed_url_list.append(url) image_url_list.append(image_url) except: pass csv_input['image_url'] = image_url_list csv_input.to_csv('Data/output1.csv', index=False)
def detail_parse(self, url):
    """Open a Tianyancha company-detail page in Selenium and scrape the header
    block and business-registration table into dicts.

    Each field is wrapped in its own try/except so missing page elements are
    skipped silently.
    NOTE(review): indentation reconstructed from a whitespace-mangled source;
    the function is garbled/truncated at the end (see final note).
    """
    # url = company_dic.get('companyUrl')
    self.driver.get(url)
    time.sleep(3)
    # print(self.driver.page_source)
    try:
        html = B(str(self.driver.page_source), 'lxml')
        header_html = html.find('div', class_='detail')
    except Exception as e:
        log.error('获取头部信息失败 {}'.format(e))
        return
    divs = header_html.find_all('div', class_='in-block')
    header_data = {}
    # Company logo
    try:
        header_data['clUrl'] = html.find('div', class_='logo -w100').attrs['data-src']
    except Exception:
        pass
    # Former company name
    try:
        header_data['usedName'] = html.find('div', class_='history-content').get_text(strip=True)
    except Exception:
        pass
    # Stock-listing information
    try:
        # Stock board/segment
        header_data['plate'] = html.find('span', class_='line').get_text()
        bond = html.find('span', class_='bond').get_text()
        bond_name = html.find('span', class_='bond_name').get_text()
        # Stock ticker
        header_data['stockNum'] = bond + bond_name
    except Exception:
        pass
    # Header basic-info fields (Chinese label -> output key)
    header_dic = {
        '电话:': 'companyTel',
        '邮箱': 'companyEmail',
        '网址': 'companyWebeUrl',
        '地址': 'registerAddress',
        '简介': 'companyBrief',
    }
    # NOTE(review): iterating a dict yields keys only, so unpacking into
    # (name, value) raises ValueError — .items() was almost certainly intended.
    for name, value in header_dic:
        try:
            if '网址' in name:
                header_data[value] = divs.find('span',text='网址:').find('a').get_text(strip=True)
            else:
                header_data[value] = divs.find('span', text=name).find('script', attrs={'type': 'text/html'}).get_text(strip=True)
        except Exception:
            pass
    """工商信息"""
    base_data = {}
    try:
        tables = html.find('div',id='_container_baseInfo').find_all('table')
    except Exception:
        log.error('工商信息获取失败')
        return
    # Legal representative and their photo/logo
    try:
        base_data['legalMan'] = tables[0].find('div',class_='humancompany').get_text(strip=True)
        base_data['mlUrl'] = tables[0].find('div',class_='lazy-img -image').img.attrs['data-src']
    except Exception:
        pass
    # Registered capital
    try:
        base_data['registerMoney'] = tables[0].find_all('tr', recursive=False)[0].find_all('td',recursive=False)[1].find_all('div',recursive=False)[1].attrs['title']
    except Exception:
        pass
    # Registration date
    try:
        # base_data['registerTime']
        registerTimes = tables[0].find_all('tr', recursive=False)[1].td.find_all('div',recursive=False)[1].get_text(strip=True).split('-')
        # NOTE(review): the source is truncated/garbled here — the inner
        # `for` has no body, which is a syntax error as written.
        for registerTime in registerTimes:
            for _ in registerTime:
    except Exception:
        pass
def parse_and_get_list_company(self):
    '''
    Parse the search-results (listing) page and persist one dict per company.
    :return: None — results are written out via self.save_data.
    '''
    """另一种策略,不具体定位class里面的值,因为class里面的值会变,所以定位到标签,再用正则做匹配"""
    # (Strategy: do not match on class values — they change; locate tags and
    # regex-match instead.)
    # NOTE(review): indentation reconstructed from a whitespace-mangled source.
    try:
        # Get the page HTML.
        html = B(self.driver.page_source, 'html.parser')
        # Find all outer company blocks (divs carrying a numeric data-id).
        div_lists = html.find_all('div', attrs={'data-id': re.compile('\d+')})
    except Exception as e:
        log.info('[error]: 找不到所有的公司外层的模块{}'.format(e))
        # Mark the account record as failed, shut the browser down, and exit.
        self.db.account_results.update_one(
            {'_id': self.accunt_item['_id']}, {'$set': {
                "flag": 0
            }})
        self.driver.quit()
        quit()
    for div in div_lists:
        dic = {}
        try:
            tmp = div.find(
                'a',
                attrs={
                    'href': re.compile('https://www.tianyancha.com/company/\d+')
                })
            # Company name
            dic['companyName'] = tmp.get_text(strip=True)
            # Company URL
            dic['companyUrl'] = tmp.attrs['href']
            # Business state
            dic['businessState'] = tmp.next_sibling.get_text(strip=True)
            # Province the company belongs to
            dic['companyProvince'] = div.contents[2].get_text(strip=True)
        except Exception as e:
            log.info('[error]: {}'.format(e))
            quit()
            # log.info('[error]: {}'.format(e))
        # Legal person / registered capital / registration date / phone /
        # e-mail / representative info — collected as flat text first.
        tags = div.contents[1].contents[1:-2]
        data = []
        for tag in tags:
            for _ in tag.contents:
                data.append(_.get_text(strip=True))
        # Split the preliminary text further (Chinese label -> output key).
        tmp_dic = {
            '法定代表人': 'legalMan',
            '代表人': 'representMan',
            '负责人': 'chargeMan',
            '注册资本': 'registerMoney',
            '注册时间': 'registerTime',
            '联系电话': 'companyTel',
            '邮箱': 'companyEmail',
        }
        for _ in data:
            key, value = _.split(":")
            # There may be multiple phone numbers.
            if key in ['法定代表人', '代表人', '负责人']:
                # Legal representative's profile URL
                try:
                    dic['manUrl'] = div.find('a', attrs={
                        'title': value
                    }).attrs['href']
                except Exception as e:
                    log.info('[error]: 获取法人链接失败{}'.format(e))
            if key in ['联系电话', '邮箱']:
                try:
                    tel_lists = re.search('.*\[(.*)\].*', value.replace(
                        '\"', '')).group(1).split(',')
                except Exception:
                    tel_lists = [value]
                dic[tmp_dic[key]] = tel_lists
            else:
                dic[tmp_dic[key]] = value
        # Persist the parsed record.
        self.save_data(dic)
    """---------"""
import smtplib
import requests as R
from bs4 import BeautifulSoup as B
import datetime

# Hacker News digest: scrape the front page, take the first 10 story links,
# and build the body and subject of a notification e-mail.
# NOTE(review): the smtplib sending code is presumably below this fragment.
url = 'https://news.ycombinator.com'
req = R.get(url)
data = req.text
soup = B(data, "html.parser")
#lt_text = soup.find_all("a",class_ = 'storylink').string.strip()
#lt_link = soup.find_all("a",class_ = 'storylink').get('href')
lt_text = soup.find_all("a", class_='storylink')
email_text = ''
# Number each headline and append its URL: "1. Title \n http://... \n".
for i in range(10):
    email_text += str(i + 1) + '. ' + lt_text[i].string + ' \n ' + lt_text[i].get(
        'href') + ' \n '
#print(email_text.encode('utf-8'))
email_text = email_text.encode('utf-8')
subject = 'HackerNews Headlines : ' + str(datetime.date.today())
# Credentials are placeholders; fill in before use (better: read from env).
gmail_user = '******'  #mailid
gmail_password = '******'  #password
from bs4 import BeautifulSoup as B

# Minimal BeautifulSoup demo: parse an inline HTML document with the lxml
# parser and print every element whose id attribute equals 'd'.
# NOTE(review): the whitespace inside the HTML literal was likely newlines in
# the original source; it was flattened by whichever tool mangled this file.
html = """<html><head><title>test site</title></head> <body> <p>test1</p> <p id='d'>test2</p> <p>test3</p> </body></html>"""
soup = B(html, 'lxml')
print(soup.find_all(id='d'))
def read_html(self):
    """Load the locally cached 'tyc.html' file and return it as a parsed soup."""
    with open('tyc.html', 'r', encoding='utf-8') as cached:
        markup = cached.read()
    return B(markup, 'html.parser', )
def nowtask(nowlnk):
    """Worker body: register *nowlnk* in taskList, fetch and process the page,
    then deregister so the main loop can tell when all workers are done."""
    taskList.append(nowlnk)
    page = B(ses.get(nowlnk).text, "html.parser")
    updateTodo(page)
    getMsg(page)
    taskList.remove(nowlnk)
def html_detail(self):
    """
    Parse the (Tianyancha) listing page currently loaded in self.driver.
    :return: parsed records, or None when nothing was extracted
    :rtype: list[dict] or None
    """
    """另一种策略,不具体定位class里面的值,因为class里面的值会变,所以定位到标签,再用正则做匹配"""
    # (Strategy: do not match on class values — they change; locate tags and
    # regex-match instead.)
    # NOTE(review): indentation reconstructed from a whitespace-mangled source.
    try:
        # Get the page HTML.
        html = B(self.driver.page_source, 'html.parser')
        # Find all outer company blocks.
        # div_lists = html.find_all('div', attrs={'data-id': re.compile('\d+')})
        div_lists = html.find(
            'div', class_='result-list sv-search-container').find_all(
                'div', attrs={'data-id': re.compile('\d+')})
    except Exception as e:
        self.log.error('找不到所有的公司外层的模块{}'.format(e))
        return
    items = []
    for div in div_lists:
        dic = {}
        # Embedded basic-info JSON blob, if present.
        try:
            base_txt = str(div.find('span', class_='tt hidden').get_text())
        except Exception:
            base_txt = None
        # If the JSON blob is missing (e.g. public institutions), fall back to
        # the tag-by-tag parsing branch below.
        if base_txt:
            # Normalize the blob so it can be parsed/eval'd as a Python dict.
            base_txt = base_txt.replace('\"\"', 'None').replace(
                '\'', '\"').replace('null', 'None').replace('true', 'True')
            try:
                base_dic = self.re_sub(base_txt)
            except Exception:
                base_txt = base_txt.replace('\"\"', 'None').replace(
                    '\'', '\"').replace('null', 'None').replace('true', 'True')
                # NOTE(review): eval on scraped page content is dangerous —
                # consider ast.literal_eval or a JSON parser instead.
                base_dic = eval(base_txt)
            # Company name
            dic['companyName'] = base_dic.get('name')
            # Company URL
            dic['companyUrl'] = "https://www.tianyancha.com/company/{}".format(
                base_dic.get('id'))
            # Business state
            dic['businessState'] = base_dic.get('regStatus')
            # Province
            dic['companyProvince'] = base_dic.get('base')
            # Registered capital
            dic['registerMoney'] = base_dic.get('regCapital')
            # Registration date (drop the time-of-day part)
            dic['registerTime'] = base_dic.get('estiblishTime').split(
                ' ')[0]
            # Phone number(s)
            dic['companyTel'] = base_dic['phoneList'] if base_dic.get(
                'phoneList') else ''
            # E-mail(s)
            dic['companyEmail'] = base_dic.get(
                'emailList') if base_dic.get('emailList') else ''
            # Unified social credit code
            dic['creditCode'] = base_dic.get('creditCode')
            # Registered address
            dic['registerAddress'] = base_dic.get('regLocation')
            # Business scope
            dic['businessScope'] = base_dic.get('businessScope')
            # Industry
            dic['industry'] = base_dic.get('categoryStr')
            # City
            dic['companyCity'] = base_dic.get('city')
            # District
            dic['companyArea'] = base_dic.get('district')
            # Raw basic-info JSON, kept for debugging/reprocessing
            dic['base_txt'] = base_txt
        else:
            try:
                tmp = div.find(
                    'a',
                    attrs={
                        'href':
                        re.compile(
                            'https://www.tianyancha.com/company/\d+')
                    })
                # Company name
                dic['companyName'] = tmp.get_text(strip=True)
                # Company URL
                dic['companyUrl'] = tmp.attrs['href']
            except Exception as e:
                self.log.error(e)
                continue
            # Business state
            try:
                dic['businessState'] = tmp.next_sibling.get_text(
                    strip=True)
            except Exception:
                dic['businessState'] = ''
            # Province
            try:
                dic['companyProvince'] = div.contents[3].get_text(
                    strip=True)
            except Exception:
                dic['companyProvince'] = ''
            # Legal person / registered capital / registration date / phone /
            # e-mail / representative info, collected as flat text first.
            tags = div.contents[2].contents[1:-1]
            data = []
            for tag in tags:
                for _ in tag.contents:
                    data.append(_.get_text(strip=True))
            # Split the preliminary text further (Chinese label -> output key).
            tmp_dic = {
                '法定代表人': 'legalMan',
                '代表人': 'representMan',
                '负责人': 'chargeMan',
                '注册资本': 'registerMoney',
                '资本总额': 'registerMoney',
                '注册时间': 'registerTime',
                '联系电话': 'companyTel',
                '邮箱': 'companyEmail',
            }
            for _ in data:
                try:
                    key, value = _.split(":")
                except Exception:
                    continue
                # There may be multiple phone numbers.
                if key in ['法定代表人', '代表人', '负责人']:
                    # Legal representative's profile URL
                    try:
                        dic['manUrl'] = \
                            div.find('a', attrs={'title': value}).attrs['href']
                    except Exception:
                        pass
                        # log.error('获取法人链接失败')
                if key in ['联系电话', '邮箱']:
                    try:
                        tel_lists = re.search('.*\[(.*)\].*', value.replace(
                            '\"', '')).group(1).split(',')
                    except Exception:
                        tel_lists = [value]
                    dic[tmp_dic[key]] = tel_lists
                else:
                    try:
                        dic[tmp_dic[key]] = value
                    except Exception:
                        pass
        # Collect the parsed record.
        items.append(dic)
    return items if items else None