import json
import time
from datetime import date

import eventlet
import requests
from bs4 import BeautifulSoup
from lxml import etree

import input_sql  # local module providing select_sql / insert_sql


def crawler_m01(mydb, category):
    """Crawl Mobile01 travel articles (category c=18) page by page."""
    page = 1
    today = date.today()
    print('Today is {}'.format(today))
    while True:
        data_list = []
        url = 'https://www.mobile01.com/articles.php?c=18&p={}'.format(page)
        session = requests.session()
        headers = {
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/79.0.3945.79 Chrome/79.0.3945.79 Safari/537.36'
        }
        html = session.get(url, headers=headers)
        soup = BeautifulSoup(html.text, 'lxml')
        print('page:{}'.format(page))
        print('※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※')
        article_list = soup.find_all("a", {"class": "c-articleCard"})
        if not article_list:
            print('On last page, over the App.')
            return
        article_num = 0
        for article in article_list:
            article_title = article.find_all("div", {"class": "l-articleCardDesc"})[0].text.strip()
            article_href = article.get('href')
            article_url = 'https://www.mobile01.com/{}'.format(article_href)
            # select_sql returns False once this URL is already stored.
            sel_sql = input_sql.select_sql(mydb, article_url)
            if article_num == 0 and (sel_sql is False or article_title == ''):
                # The newest article is already in the database: nothing new at all.
                print('Over the App.')
                return
            elif sel_sql is False or article_title == '':
                # An already-stored article appears mid-page: stop this page.
                print('Over the page.')
                break
            article_num += 1
            print('Article number:{}, url:{}'.format(article_num, article_url))
            print('Title:{}'.format(article_title))
            print('-----------------------------------------------------------')
            article_html = session.get(article_url, headers=headers)
            article_soup = BeautifulSoup(article_html.text, 'lxml')
            article_content_list = article_soup.find_all("div", {"itemprop": "articleBody"})
            text_list = []
            for content_text in article_content_list:
                text_list.append(content_text.text)
            article_content = ''.join(text_list)
            data = {
                "title": article_title,
                "url": article_url,
                "content": article_content,
                "category": category,
                'date': today.strftime('%Y-%m-%d')
            }
            if data in data_list:  # skip duplicates within this page
                continue
            data_list.append(data)
        input_sql.insert_sql(mydb, data_list)
        page += 1
def crawler_ltn(mydb, category):
    """Crawl Liberty Times (playing.ltn) travel articles page by page."""
    page = 1
    today = date.today()
    print('Today is {}'.format(today))
    while True:
        data_list = []
        url = 'https://playing.ltn.com.tw/list/travel/{}'.format(page)
        print('page:{}'.format(page))
        print('※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※')
        html = requests.get(url)
        soup = BeautifulSoup(html.text, 'lxml')
        article_list = soup.find_all("a", {"class": "tit"})
        if not article_list:
            print('On last page, over the App.')
            return
        article_num = 0
        for article in article_list:
            article_title = article.text.strip()
            article_href = article.get('href')
            article_url = 'https:{}'.format(article_href)  # hrefs are protocol-relative
            sel_sql = input_sql.select_sql(mydb, article_url)
            if article_num == 0 and (sel_sql is False or article_title == ''):
                print('Over the App.')
                return
            elif sel_sql is False or article_title == '':
                print('Over the page.')
                break
            article_num += 1
            print('Article number:{}, url:{}'.format(article_num, article_url))
            print('title:{}'.format(article_title))
            print('-----------------------------------------------------------')
            article_html = requests.get(article_url)
            article_soup = BeautifulSoup(article_html.text, 'lxml')
            article_content_list = article_soup.find_all('p')
            text_list = []
            for content_text in article_content_list:
                text_list.append(content_text.text)
            article_content = ''.join(text_list)
            data = {
                "title": article_title,
                "url": article_url,
                "content": article_content,
                "category": category,
                'date': today.strftime('%Y-%m-%d')
            }
            if data in data_list:
                continue
            data_list.append(data)
        input_sql.insert_sql(mydb, data_list)
        page += 1
def hotal_info(mydb, category):
    """Collect hotel descriptions from Booking.com for each city in city_list."""
    city_list = ['上海', '三亞', '南京', '紐約']  # Shanghai, Sanya, Nanjing, New York
    today = date.today()
    print('Today is {}'.format(today))
    session = requests.session()
    # monkey_patch() only needs to run once, not once per hotel as in the
    # original; ideally it is called at module import time, before requests
    # is first used.
    eventlet.monkey_patch()
    for city in city_list:
        all_hotal_data = crawler_booking(city)  # assumed helper; see the sketch below
        data_list = []
        for hotal_data in all_hotal_data:
            time.sleep(1)
            # Abort any single hotel page after 60 seconds without raising.
            with eventlet.Timeout(60, False):
                headers = {
                    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'
                }
                hotal_html = session.get(hotal_data['url'], headers=headers)
                hotal_soup = BeautifulSoup(hotal_html.text, 'lxml')
                hotal_content_list = hotal_soup.find_all("div", {"class": "hp_desc_main_content"})
                text_list = []
                for content_text in hotal_content_list:
                    text_list.append(content_text.text.strip())
                hotal_content = ''.join(text_list)
                data = {
                    "title": hotal_data['name'],
                    "url": hotal_data['url'],
                    "content": hotal_content,
                    "keyword": city,
                    "category": category,
                    'date': today.strftime('%Y-%m-%d')
                }
                if data in data_list:
                    continue
                data_list.append(data)
                print('Total records so far: {}'.format(len(data_list)))
        input_sql.insert_sql(mydb, data_list)
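# hotal_info depends on a crawler_booking(city) helper defined elsewhere in the
# project. The sketch below only illustrates the interface assumed from its use
# above: it should return a list of dicts, each with the hotel's 'name' and the
# absolute 'url' of its Booking.com page. The search endpoint, parameters, and
# markup selectors here are hypothetical, not the project's actual code.
def crawler_booking_sketch(city):
    headers = {'user-agent': 'Mozilla/5.0'}
    # Hypothetical search request; Booking.com's real markup may differ.
    html = requests.get('https://www.booking.com/searchresults.zh-tw.html',
                        params={'ss': city}, headers=headers)
    soup = BeautifulSoup(html.text, 'lxml')
    results = []
    for link in soup.find_all('a'):
        href = link.get('href') or ''
        if '/hotel/' in href:  # hotel detail pages live under /hotel/
            results.append({
                'name': link.text.strip(),
                'url': 'https://www.booking.com{}'.format(href.split('?')[0]),
            })
    return results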
def crawler_lehman(mydb, category):
    """Crawl the Lehman military site via its paginated JSON POST endpoint."""
    page = 1
    today = date.today()
    print('Today is {}'.format(today))
    while True:
        data_list = []
        url = 'http://123.57.143.90/art/articleHomeShows.htm'
        session = requests.Session()
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/79.0.3945.79 Chrome/79.0.3945.79 Safari/537.36'
        }
        post_data = {'pageSize': '15', 'curPage': '{}'.format(page)}
        resp = session.post(url, data=post_data, headers=headers)
        data_json = json.loads(resp.text)
        article_list = data_json['rows']
        print('page:{}'.format(page))
        print('※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※')
        if not article_list:
            print('On last page, over the App.')
            return
        article_num = 0
        for article in article_list:
            article_href = article['id']
            article_url = 'http://123.57.143.90/art/show.htm?id={}'.format(article_href)
            article_title = article['name']
            sel_sql = input_sql.select_sql(mydb, article_url)
            if article_num == 0 and (sel_sql is False or article_title.strip() == ''):
                print('Over the App.')
                return
            elif sel_sql is False or article_title.strip() == '':
                print('Over the page.')
                break
            article_num += 1
            print('Article number:{}, url:{}'.format(article_num, article_url))
            print('title:{}'.format(article_title))
            print('-----------------------------------------------------------')
            article_html = session.get(article_url, headers=headers)
            article_soup = BeautifulSoup(article_html.text, 'lxml')
            article_content_list = article_soup.find_all("p")
            text_list = []
            for content_text in article_content_list:
                # Drop empty paragraphs and the site's "雷曼军事网" watermark text.
                if content_text.text.strip() == '' or '雷曼军事网' in content_text.text:
                    continue
                text_list.append(content_text.text.strip())
            article_content = ''.join(text_list)
            data = {
                "title": article_title,
                "url": article_url,
                "content": article_content,
                "category": category,
                'date': today.strftime('%Y-%m-%d')
            }
            if data in data_list:
                continue
            data_list.append(data)
        input_sql.insert_sql(mydb, data_list)
        page += 1
        time.sleep(5)
def crawler_udn_opinion(mydb, category):
    data_list = []
    crawler_page = 1
    today = date.today()
    print('Today is {}'.format(today))
    while True:
        print('category:{}, page:{}'.format(category, crawler_page))
        if category == 'military':
            url = 'https://opinion.udn.com/opinion/ajax_articletag/%E8%BB%8D%E4%BA%8B%E8%A9%95%E8%AB%96/{}?_=1576737799589'.format(crawler_page)
        elif category == 'travel':
            url = 'https://udn.com/rank/ajax_newest/1013/0/{}?_=1576829807430'.format(crawler_page)
        else:
            break
        crawler_page += 1
        html = requests.get(url)
        soup = BeautifulSoup(html.text, 'lxml')
        article_list = soup.find_all('h2')
        if not article_list:
            print('On last page, over the App.')
            return
        article_num = 0
        for article in article_list:
            article_title = article.a.text.strip()
            if category == 'military':
                article_url = 'https://opinion.udn.com{}'.format(article.a.get('href'))
            elif category == 'travel':
                article_url = article.a.get('href')
            sel_sql = input_sql.select_sql(mydb, article_url)
            if article_num == 0 and (sel_sql is False or article_title == ''):
                print('Over the App.')
                return
            elif sel_sql is False or article_title == '':
                print('Over the page.')
                break
            article_num += 1
            print('Article number:{}, url:{}'.format(article_num, article_url))
            print('--------------------------------------------------------')
            article_html = requests.get(article_url)
            article_soup = BeautifulSoup(article_html.text, 'lxml')
            content_text_list = article_soup.find_all('p')
            text_list = []
            for content_text in content_text_list:
                text_list.append(content_text.text)
            content = ''.join(text_list)
            data = {
                'title': article_title,
                'url': article_url,
                'content': content,
                'category': category,
                'date': today.strftime('%Y-%m-%d')
            }
            if data in data_list:
                continue
            data_list.append(data)
        input_sql.insert_sql(mydb, data_list)
def crawler_udn(mydb, category):
    crawler_page = 1
    today = date.today()
    print('Today is {}'.format(today))
    while True:
        print('category:{}, page:{}'.format(category, crawler_page))
        data_list = []
        if category == 'constellation':
            url = 'https://udn.com/news/get_article/{}/2/6649/7268?_=1575266787923'.format(crawler_page)
        elif category == 'military':
            url = 'https://udn.com/news/get_article/{}/2/6638/10930?_=1575956277623'.format(crawler_page)
        else:
            break
        crawler_page += 1
        html = requests.get(url)
        html_et = etree.HTML(html.text)
        article_num = 0
        for news_list_num in range(1, 21):  # each page lists at most 20 <dt> entries
            try:
                article = html_et.xpath('/html/body/dt[{}]/a[2]'.format(news_list_num))
                article_title = html_et.xpath('/html/body/dt[{}]/a[2]/h2/text()'.format(news_list_num))
            except Exception as e:
                print(e)
                return
            if not article:
                continue
            for news in article:
                news_url = 'http://udn.com{}'.format(news.attrib['href'])
                sel_sql = input_sql.select_sql(mydb, news_url)
                # article_title is a list returned by xpath(); the original
                # compared it against '', which could never match, so test for
                # emptiness instead.
                if article_num == 0 and (sel_sql is False or not article_title):
                    print('Over the App.')
                    return
                elif sel_sql is False or not article_title:
                    print('Over the page.')
                    break
                article_num += 1
                print('Article number:{}, url:{}'.format(article_num, news_url))
                print('--------------------------------------------------------')
                news_html = requests.get(news_url)
                new_soup = BeautifulSoup(news_html.text, 'lxml')
                new_content = new_soup.find_all('p')
                content_select = []
                for content in new_content:
                    content_select.append(content.text)
                content = ''.join(content_select)
                data = {
                    'title': article_title[0].strip(),
                    'url': news_url,
                    'content': content,
                    'category': category,
                    'date': today.strftime('%Y-%m-%d')
                }
                if data in data_list:
                    continue
                print('----------------------------------------------------------------------------')
                data_list.append(data)
        input_sql.insert_sql(mydb, data_list)
def crawler_storm(mydb, category):
    """Crawl Storm Media articles by the military-affairs author page by page."""
    page = 1
    today = date.today()
    print('Today is {}'.format(today))
    while True:
        data_list = []
        url = 'https://www.storm.mg/authors/126954/%E9%A2%A8%E4%BA%91%E8%BB%8D%E4%BA%8B/{}'.format(page)
        html = requests.get(url)
        soup = BeautifulSoup(html.text, 'lxml')
        print('page:{}'.format(page))
        print('※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※')
        article_list = soup.find_all('a', {'class': 'card_link link_title'})
        if not article_list:
            print('On last page, over the App.')
            return
        article_num = 0
        for article in article_list:
            article_title = article.text.strip()
            article_href = article.get('href')
            article_url = 'https://www.storm.mg{}'.format(article_href)
            sel_sql = input_sql.select_sql(mydb, article_url)
            print(article_title)
            if article_num == 0 and (sel_sql is False or article_title == ''):
                print('Over the App.')
                return
            elif sel_sql is False or article_title == '':
                print('Over the page.')
                break
            article_num += 1
            print('Article number:{}, url:{}'.format(article_num, article_url))
            print('title:{}'.format(article_title))
            print('-----------------------------------------------------------')
            article_html = requests.get(article_url)
            article_soup = BeautifulSoup(article_html.text, 'lxml')
            article_content_list = article_soup.find_all('div', {'id': 'CMS_wrapper'})
            text_list = []
            for content_text in article_content_list:
                # The original re-parsed content_text.text, whose tags are
                # already stripped, so no <p> could ever be found; search the
                # tag itself instead.
                for text_p in content_text.find_all('p'):
                    text_list.append(text_p.text)
            article_content = ''.join(text_list)
            data = {
                "title": article_title,
                "url": article_url,
                "content": article_content,
                "category": category,
                'date': today.strftime('%Y-%m-%d')
            }
            if data in data_list:
                continue
            data_list.append(data)
        input_sql.insert_sql(mydb, data_list)
        page += 1
def crawler_upmedia(mydb, category):
    """Crawl UP Media list pages (Type=157) page by page."""
    page = 1
    today = date.today()
    print('Today is {}'.format(today))
    while True:
        data_list = []
        url = 'https://www.upmedia.mg/news_list.php?currentPage={}&Type=157'.format(page)
        session = requests.session()
        headers = {
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/79.0.3945.79 Chrome/79.0.3945.79 Safari/537.36'
        }
        html = session.get(url, headers=headers)
        soup = BeautifulSoup(html.text, 'lxml')
        article_list = soup.find_all('dd')
        print('page:{}'.format(page))
        print('※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※')
        if not article_list:
            print('On last page, over the App.')
            return
        article_num = 0
        for articles in article_list:
            # Find the first link in this <dd> that points at an article page;
            # the original left article_title unbound when no link matched.
            article_href = ''
            article_title = ''
            for article_detail in articles.find_all('a'):
                href = article_detail.get('href')
                if href and 'news_info' in href:
                    article_href = href
                    article_title = article_detail.text
                    break
            if article_href == '':
                continue  # this <dd> carries no article link
            article_url = 'https://www.upmedia.mg/{}'.format(article_href)
            sel_sql = input_sql.select_sql(mydb, article_url)
            if article_num == 0 and (sel_sql is False or article_title.strip() == ''):
                print('Over the App.')
                return
            elif sel_sql is False or article_title.strip() == '':
                print('Over the page.')
                break
            article_num += 1
            print('Article number:{}, url:{}'.format(article_num, article_url))
            print('Title:{}'.format(article_title))
            print('-----------------------------------------------------------')
            article_html = session.get(article_url, headers=headers)
            article_soup = BeautifulSoup(article_html.text, 'lxml')
            article_content_list = article_soup.find_all("div", {"class": "editor"})
            text_list = []
            for content_text in article_content_list:
                if content_text.text.strip() == '':
                    continue
                text_list.append(content_text.text.strip())
            article_content = ''.join(text_list)
            data = {
                "title": article_title,
                "url": article_url,
                "content": article_content,
                "category": category,
                'date': today.strftime('%Y-%m-%d')
            }
            if data in data_list:
                continue
            data_list.append(data)
        input_sql.insert_sql(mydb, data_list)
        page += 1
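# None of the crawlers above run without a live database handle. From how it is
# called here, the input_sql module is assumed to expose select_sql(mydb, url),
# returning False once a URL is already stored, and insert_sql(mydb, data_list)
# for bulk inserts of one page's rows. Below is a minimal driver sketch under
# that assumption; the MySQL connector, credentials, and category strings are
# placeholders, not the project's actual configuration.
if __name__ == '__main__':
    import mysql.connector  # assumed driver; the project may use a different one

    mydb = mysql.connector.connect(host='localhost', user='user',
                                   password='password',  # placeholder credentials
                                   database='crawler')
    crawler_m01(mydb, 'travel')
    crawler_ltn(mydb, 'travel')
    crawler_storm(mydb, 'military')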