import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup as btfs  # assumed alias: the snippets below call btfs(...) as BeautifulSoup


def parser_article(url_list):
    titles = []
    contents = []
    for url in url_list:
        req = requests.get(url)
        soup = btfs(req.content, 'html.parser')
        try:
            title = soup.find('h1', class_='art-title').get_text()
            sub_title = soup.find('p', class_='art-sub').get_text()
            sub_title = re.sub(r'\s', '', sub_title)
            title = title + ' : ' + sub_title
            content = str(soup.find('div', class_='art-main'))
            content = re.sub(r'(<br/></p>)|(</p><br/>)|(<br/><br/>)', _addparag, content)
            content = btfs(content, 'html.parser').get_text()
            content = re.sub(r'\s', '', content).split('|||')
            content = '\n'.join(content)
            titles.append(title)
            contents.append(content)
        except Exception as e:
            print('something wrong in', url)
            print('Error:', e)
        time.sleep(1)
    data = pd.DataFrame({'titles': titles, 'contents': contents})
    return data
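# The re.sub calls above and below pass `_addparag` (or `self._addparag`) as a
# replacement callable, but its definition is not part of this snippet. A minimal
# sketch of one plausible implementation, assuming its only job is to insert the
# '|||' marker that the later split('|||') uses to restore paragraph breaks:
def _addparag(match):
    # Hypothetical helper: keep the matched tag boundary and append the paragraph marker.
    return match.group(0) + '|||'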
def parser(self, url):
    req = requests.get(url)
    if req.status_code == 200:
        soup = btfs(req.content, 'html.parser')
        try:
            category = soup.find('h4', class_='catgr').get_text().strip()
            title = soup.find('h1', class_='art-title').get_text().strip()
            sub_title = soup.find('p', class_='art-sub').get_text()
            sub_title = re.sub(r'\s', '', sub_title)
            title = title + ' : ' + sub_title
            content = str(soup.find('div', class_='art-main'))
            content = re.sub(r'(<br/></p>)|(</p><br/>)|(<br/><br/>)', self._addparag, content)
            content = btfs(content, 'html.parser').get_text()
            content = re.sub(r'\s', '', content).split('|||')
            content = '\n'.join(content)
        except Exception as e:
            print('something wrong in', url)
            print('Error:', e)
            return False
        time.sleep(1)  # pause between requests so we do not query the site too quickly
        return self.DATATUPLE(category, title, content, url)
    else:
        return False
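# `self.DATATUPLE` is returned by the parser methods but never defined in this
# section. A minimal sketch, assuming it is a collections.namedtuple whose fields
# mirror the positional arguments used above (other parsers in this section pass
# different field lists, e.g. datetime, author, or view):
from collections import namedtuple

DATATUPLE = namedtuple('DATATUPLE', ['category', 'title', 'content', 'url'])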
def parser_article(url):
    req = requests.get(url)
    if req.status_code == 200:
        soup = btfs(req.content, 'html.parser')
        try:
            title = soup.find('h1', class_='art-title').get_text().strip()
            sub_title = soup.find('p', class_='art-sub').get_text()
            sub_title = re.sub(r'\s', '', sub_title)
            title = title + ' : ' + sub_title
            content = str(soup.find('div', class_='art-main'))
            content = re.sub(r'(<br/></p>)|(</p><br/>)|(<br/><br/>)', _addparag, content)
            content = btfs(content, 'html.parser').get_text()
            content = re.sub(r'\s', '', content).split('|||')
            content = '\n'.join(content)
        except Exception as e:
            print('something wrong in', url)
            print('Error:', e)
            return {'stat': 0}
    else:
        return {'stat': 0}
    return {'title': title, 'content': content, 'stat': 1}
def parser(self, url):
    req = requests.get(url)
    if req.status_code == 200:  # confirm the site responded correctly
        soup = btfs(req.content, 'html.parser')  # page source
        # -- start parsing
        try:
            category = soup.find('meta', {'name': 'section'})['content']
            datetime = soup.find('time').get_text().strip()
            title = soup.find('meta', {'property': 'og:title'})['content']
            paragraphs = soup.select('article > p')
            paragraphs = [p.get_text().strip() for p in paragraphs]
            content = '\r\n'.join(paragraphs)
            author = soup.find('a', {'rel': 'author'})
            author = author.get_text().strip() if author else ''
        except Exception as e:
            print('something wrong in', url)
            print('Error:', e)
            return False
        time.sleep(1)  # pause between requests so we do not query the site too quickly
        return self.DATATUPLE(category, datetime, title, content, author, url)
    else:
        return False
def parser(self, url):
    chrome_options = Options()  # set Chrome options
    chrome_options.add_argument('--headless')  # run without opening a browser window
    with webdriver.Chrome(options=chrome_options) as driver:
        # start parsing
        try:
            driver.get(url)  # load the page
            soup = btfs(driver.page_source, 'html.parser')  # page source
            category = soup.find('h2', text=re.compile('《.*》')).get_text().strip()
            hgroup = soup.find('hgroup')
            datetime = hgroup.find('div', class_='ndArticle_creat').get_text().strip()
            title = hgroup.find('h1').get_text().strip()
            content = soup.select('.ndArticle_margin > p')
            content = [p.get_text().strip() for p in content]
            content = '\r\n'.join(content)
            view = hgroup.find('div', class_='ndArticle_view')
            view = view.get_text().strip() if view else 0
            time.sleep(0.5)  # pause between requests so we do not query the site too quickly
        except Exception as e:
            print(e)
            print(url)
            return False
    return self.DATATUPLE(category, datetime, title, content, url, view)
def parser(self, url):
    req = requests.get(url)
    if req.status_code == 200:  # confirm the site responded correctly
        try:
            soup = btfs(req.content, 'html.parser')  # page source
            meta = soup.find('script', {'type': 'application/ld+json'}).get_text().strip()
            meta = json.loads(meta)
            category = meta['articleSection'][3:]
            hgroup = soup.find('hgroup')
            datetime = hgroup.find('div', class_='ndArticle_creat').get_text().strip()[5:]
            title = hgroup.find('h1').get_text().strip()
            content = soup.select('.ndArticle_margin > p')[0]
            # remove span and a tags (unwanted content)
            for span in content.select('span'):
                span.decompose()
            for a in content.select('a'):
                a.decompose()
            content = content.get_text().strip()
            view = hgroup.find('div', class_='ndArticle_view')
            view = view.get_text().strip() if view else 0
            time.sleep(0.5)  # pause between requests so we do not query the site too quickly
            return self.DATATUPLE(category, datetime, title, content, url, view)
        except Exception as e:
            print(e)
            print(url)
            return False
    else:
        return False
def open_url2(url_2, to_print=''):
    try:
        r_2 = requests.get(url_2, headers=hds)
        r_2.raise_for_status()
        # r_2.encoding = r_2.apparent_encoding
        # print(r_2.status_code)
        dem_2 = r_2.text
        sp_2 = btfs(dem_2, "html.parser")
        seach_next = ''
        print(to_print, '1')  # progress display
        sp_j = get_picurl_download(sp_2, to_print + '1_')
        seach_next = sp_j.parent.attrs['href'].split('_')[0]
        # print(' ', seach_next)
        # find the largest page number among the "next page" links
        nub = 1
        for link in sp_2.find_all(href=re.compile('^%s_' % seach_next)):
            nub = max(nub, int(link['href'].split('_')[-1].split('.')[0]))
        # loop over the remaining pages
        for i in range(2, nub + 1):
            to_next = src_url + seach_next + '_' + str(i) + '.html'
            # print(to_next)
            r_2 = requests.get(to_next, headers=hds)
            r_2.raise_for_status()
            dem_2 = r_2.text
            sp_2 = btfs(dem_2, "html.parser")
            print(to_print, '%d' % i)  # progress display
            get_picurl_download(sp_2, to_print + str(i) + '_')
            # time.sleep(0.5)
    except Exception as e:
        print('failed to open the second-level page:', e)
def get_url(self, maxpage):
    url = []
    for page in range(1, maxpage + 1):
        req = requests.get(self.BASIC_URL + str(page))
        if req.status_code == 200:  # confirm the site responded correctly
            soup = btfs(req.content, 'html.parser')  # page source
            # all news links on this page
            a_link = soup.select('.rtddt > a')
            if a_link:  # stop once a page has no links
                for a in a_link:
                    url.append(a['href'])
            else:
                break
            time.sleep(1)  # pause between requests so we do not query the site too quickly
    return url
def get_url(self, maxpage=100):
    url = []  # news article links
    for page in range(1, maxpage + 1):
        req = requests.get(self.BASIC_URL, params={'page': page})  # request the listing page
        if req.status_code == 200:  # confirm the site responded correctly
            soup = btfs(req.content, 'html.parser')  # page source
            # a disabled "next page" button means this is the last page
            last_page_btn = soup.find('li', text='下一頁')
            if last_page_btn.has_attr('class') and 'disabled' in last_page_btn['class']:
                break
            # all news links on this page
            for a in soup.select('.listRight > ul > li > h2 > a'):
                url.append(urllib.parse.urljoin(self.BASIC_URL, a['href']))
            time.sleep(1)  # pause between requests so we do not query the site too quickly
    return url
def get_url(self, maxpage):
    url = []
    # iterate over every site in SITE
    for site in self.SITE:
        for page in range(1, maxpage + 1):
            req = requests.get(site + str(page))
            if req.status_code == 200:  # confirm the site responded correctly
                soup = btfs(req.content, 'html.parser')
                # all links on this page
                a_link = soup.select('.rtddt > a')
                if a_link:  # stop once a page has no links
                    for a in a_link:
                        url.append(urllib.parse.urljoin(site, a['href']))
                else:
                    break
                time.sleep(1)  # pause between requests so we do not query the site too quickly
    return url
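# A hypothetical driver loop showing how a get_url/parser pair might be wired
# together; `crawl` and its DataFrame output are an assumption, not part of the
# original code:
def crawl(self, maxpage=10):
    # Parse every collected link and keep only successful results
    # (the parser methods return False on failure).
    rows = [self.parser(u) for u in self.get_url(maxpage)]
    return pd.DataFrame([r for r in rows if r])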
def get_article_url(driver):
    url = []  # magazine article links
    for _ in range(MAXPAGE):
        soup = btfs(driver.page_source, 'html.parser')  # page source
        # parse out the magazine article links
        for a in soup.select('.content_mixbox_txt > h4 > a'):
            url.append(urllib.parse.urljoin(BASIC_URL, a['href']))
        try:
            next_page_btn = driver.find_element_by_id(
                'ctl00_ContentPlaceHolder2_lnkbtnNext')  # grab the next-page button
            if next_page_btn.get_attribute('disabled'):  # disabled means this is the last page
                break
            else:
                time.sleep(0.5)  # pause between requests so we do not query the site too quickly
                next_page_btn.click()
        except NoSuchElementException:
            print('NoSuchElementException')
    return url
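# Example usage (assumed): get_article_url expects a driver that has already
# navigated to the magazine listing page; MAXPAGE and BASIC_URL are module-level
# constants not shown in this snippet.
chrome_options = Options()
chrome_options.add_argument('--headless')
with webdriver.Chrome(options=chrome_options) as driver:
    driver.get(BASIC_URL)  # assumed to be the listing page URL
    links = get_article_url(driver)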
def get_url(self, maxpage=100):
    chrome_options = Options()  # set Chrome options
    chrome_options.add_argument('--headless')  # run without opening a browser window
    with webdriver.Chrome(options=chrome_options) as driver:
        url = []  # magazine article links
        # -- iterate over the magazine categories
        for site in self.SITE:
            driver.get(self.SA_URL + '?Unit=featurearticles&Cate=' + site)  # load the category listing
            for _ in range(maxpage):
                soup = btfs(driver.page_source, 'html.parser')  # page source
                # parse out the magazine article links
                for a in soup.select('.content_mixbox_txt > h4 > a'):
                    url.append(urllib.parse.urljoin(self.BASIC_URL, a['href']))
                try:
                    next_page_btn = driver.find_element_by_id(
                        'ctl00_ContentPlaceHolder2_lnkbtnNext')  # grab the next-page button
                    if next_page_btn.get_attribute('disabled'):  # disabled means this is the last page
                        break
                    else:
                        time.sleep(0.5)  # pause between requests so we do not query the site too quickly
                        next_page_btn.click()  # go to the next page
                except NoSuchElementException:
                    print('NoSuchElementException')
        # driver.close()  # close the browser window and release resources
    return url
titles = []
years = []
time = []
imdb_ratings = []
metascores = []
votes = []
us_gross = []

pages = np.arange(1, 951, 50)

for page in pages:
    page = requests.get(
        'https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,&user_rating=7.0,7.9&languages=en&start='
        + str(page) + '&ref_=adv_nxt')
    soup = btfs(page.text, 'html.parser')
    # print(soup.prettify())
    movie_div = soup.find_all('div', class_='lister-item mode-advanced')
    sleep(randint(2, 10))

    # SCRAPING ELEMENTS
    for i in movie_div:
        nv = i.find_all('span', {'name': 'nv'})
        try:
            vote = nv[0].text
        except:
            vote = '0'
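# Sketch (not part of the original snippet): once the lists above are filled,
# they are typically combined into a DataFrame, mirroring parser_article earlier
# in this section; the column names here are assumptions.
movies = pd.DataFrame({
    'title': titles,
    'year': years,
    'imdb_rating': imdb_ratings,
    'metascore': metascores,
    'vote': votes,
})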
def get_divdnd_cal(currentTab='nextWeek', country='United States', min_yield=3):
    global currentText
    currentText = currentTab
    url = "https://www.investing.com/dividends-calendar/Service/getCalendarFilteredData"
    headers = {
        "User-Agent": random_user_agent(),
        "X-Requested-With": "XMLHttpRequest",
        "Accept": "text/html",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive"
    }
    data = {
        'timeZone': 'GMT -11:00',
        'timeFilter': {
            'time_remaining': 'timeRemain',
            'time_only': 'timeOnly'
        },
        'currentTab': currentTab,  # 'thisWeek' or 'nextWeek'
        'submitFilters': 1,
        'limit_from': 0,
        'country': ["United States", "South Korea"],
    }
    req = requests.post(url, headers=headers, data=data)
    # root = fromstring(req.json()['data'])
    # table = root.xpath(".//tr")  # table
    html = req.content
    trData = btfs(req.json()['data'], 'lxml').find_all('tr')
    # tdData = trData[0].find_all('td')

    rowList = []
    columnList = []
    flags = []
    trDataLen = len(trData)
    for i in range(trDataLen):
        tdData = trData[i].find_all('td')
        # print(tdData)
        flag = str(btfs(str(tdData[0]), 'lxml').find_all('td', {'class': 'flag'}))
        # print(flag.split('title=')[1].replace('></span></td>', ''))
        if 'flag' in flag:
            flag_text = flag.split('title=')[1].replace(
                '></span></td>', '').replace('"]', '').replace('"', '')
            flags.append(flag_text)
        else:
            flags.append('-')
        tdDataLen = len(tdData)
        for j in range(tdDataLen):
            element = tdData[j].text
            columnList.append(element)
        rowList.append(columnList)
        columnList = []

    cols = ['idx', 'Ticker', 'Ex_divd', 'Divdn', 'Type', 'Pay_date', 'Yield']
    result = pd.DataFrame(rowList, columns=cols)
    result['flag'] = flags
    # print(icons)
    # result['type'] = icons
    result = result[result.flag == country]
    result['Yield'] = pd.to_numeric(result['Yield'].str.replace('%', ''), errors='coerce')
    print(result.info())
    return result[result['Yield'] >= min_yield].reset_index(drop=True)
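# Example call (assumed usage): next week's United States dividend calendar,
# keeping only entries with a yield of at least 4%.
df = get_divdnd_cal(currentTab='nextWeek', country='United States', min_yield=4)
print(df.head())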
Main function:
Supports resuming downloads from a breakpoint:
    start_i = 0  # page number to start from
    start_j = 0  # each page contains 20 sub-galleries
'''
start_i = 0
start_j = 3
try:
    r = requests.get(url_1, headers=hds)
    r.raise_for_status()
    # r.encoding = r.apparent_encoding
    # print(r.status_code)
    dem = r.text
    sp = btfs(dem, "html.parser").body
    # print(sp.prettify())
    if start_i == 0:
        loop_url1(sp, start_i, start_j)
    all_loop = sp.find_all('span')[2]
    end_url_nub = 2
    for url_loop in all_loop.find_all(href=re.compile('/tag/%s' % tag)):
        nub = int(url_loop['href'].split('/')[-1].split('.')[0])
        end_url_nub = max(end_url_nub, nub)
        # print(nub)
    # print(set_url)