import json
import time
from datetime import date

import requests
from bs4 import BeautifulSoup
from lxml import etree

import input_sql


def crawler_m01(mydb, category):
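    """Crawl the Mobile01 article listing (c=18) page by page, scrape each
    article body, and store new rows via input_sql, stopping once a
    previously stored article is reached."""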
    page = 1
    today = date.today()
    print('Today is {}'.format(today))
    while True:
        data_list = []
        url = 'https://www.mobile01.com/articles.php?c=18&p={}'.format(page)
        session = requests.Session()
        headers = {
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/79.0.3945.79 Chrome/79.0.3945.79 Safari/537.36'
        }
        html = session.get(url, headers=headers)
        soup = BeautifulSoup(html.text, 'lxml')
        print('page:{}'.format(page))
        print('※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※')
        article_list = soup.find_all("a", {"class": "c-articleCard"})
        if not article_list:
            print('Last page reached, stopping the crawler.')
            return

        article_num = 0
        for article in article_list:
            article_title = article.find("div", {"class": "l-articleCardDesc"}).text.strip()
            article_href = article.get('href')
            article_url = 'https://www.mobile01.com/{}'.format(article_href)
            sel_sql = input_sql.select_sql(mydb, article_url)
            if article_num == 0 and (not sel_sql or article_title == ''):
                print('Nothing new, stopping the crawler.')
                return
            elif not sel_sql or article_title == '':
                print('Reached stored articles, finishing this page.')
                break

            article_num += 1
            print('Article number:{}, url:{}'.format(article_num, article_url))
            print('Title:{}'.format(article_title))
            print('-----------------------------------------------------------')
            article_html = session.get(article_url, headers=headers)
            article_soup = BeautifulSoup(article_html.text, 'lxml')

            article_content_list = article_soup.find_all("div", {"itemprop": "articleBody"})
            text_list = []
            for content_text in article_content_list:
                text_list.append(content_text.text)

            article_content = ''.join(text_list)
            data = {
                "title":article_title,
                "url":article_url,
                "content":article_content,
                "category":category,
                'date':today.strftime('%Y-%m-%d')
            }
            if data in data_list:
                continue
            data_list.append(data)
        input_sql.insert_sql(mydb, data_list)
        page += 1
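
All of these examples depend on an input_sql helper module that is not
shown. A minimal sketch of the assumed interface, inferred from how the
crawlers call it (sqlite3 is an assumption; the real project may use MySQL
or another driver):

import sqlite3

def connect_db(path='articles.db'):
    # Hypothetical helper; substitute the project's real connection factory.
    return sqlite3.connect(path)

def select_sql(mydb, url):
    # Assumed contract: True when `url` is not stored yet, False when it
    # already exists, so the crawlers stop once they catch up.
    cur = mydb.cursor()
    cur.execute('SELECT 1 FROM articles WHERE url = ?', (url,))
    return cur.fetchone() is None

def insert_sql(mydb, data_list):
    # Assumed contract: bulk-insert one page's worth of scraped articles.
    cur = mydb.cursor()
    cur.executemany(
        'INSERT INTO articles (title, url, content, category, date) '
        'VALUES (:title, :url, :content, :category, :date)', data_list)
    mydb.commit()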
Example #2
def crawler_ltn(mydb, category):
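    """Crawl the Liberty Times (playing.ltn.com.tw) travel listing page by
    page and store new articles via input_sql, stopping at the first
    previously stored article."""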
    page = 1
    today = date.today()
    print('Today is {}'.format(today))
    while True:
        data_list = []
        url = 'https://playing.ltn.com.tw/list/travel/{}'.format(page)

        print('page:{}'.format(page))
        print('※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※')

        html = requests.get(url)
        soup = BeautifulSoup(html.text, 'lxml')
        article_list = soup.find_all("a", {"class": "tit"})

        if not article_list:
            print('Last page reached, stopping the crawler.')
            return

        article_num = 0
        for article in article_list:
            article_title = article.text.strip()
            article_href = article.get('href')
            article_url = 'https:{}'.format(article_href)
            sel_sql = input_sql.select_sql(mydb, article_url)
            if article_num == 0 and (not sel_sql or article_title == ''):
                print('Nothing new, stopping the crawler.')
                return
            elif not sel_sql or article_title == '':
                print('Reached stored articles, finishing this page.')
                break
            article_num += 1
            print('Article number:{}, url:{}'.format(article_num, article_url))
            print('title:{}'.format(article_title))
            print(
                '-----------------------------------------------------------')
            article_html = requests.get(article_url)
            article_soup = BeautifulSoup(article_html.text, 'lxml')
            article_content_list = article_soup.find_all('p')
            text_list = []
            for content_text in article_content_list:
                text_list.append(content_text.text)
            article_content = ''.join(text_list)
            data = {
                "title": article_title,
                "url": article_url,
                "content": article_content,
                "category": category,
                'date': today.strftime('%Y-%m-%d')
            }
            if data in data_list:
                continue
            data_list.append(data)

        input_sql.insert_sql(mydb, data_list)
        page += 1
Example #3
def test(mydb, category):
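    """Crawl justlaw.com.tw keyword-search results (keyword: 法院) page by
    page. Currently a dry run: scraped rows are printed, and the insert_sql
    call at the end of the page loop is commented out."""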
    page = 1
    today = date.today()
    print('Today is {}'.format(today))
    while True:
        data_list = []
        url = 'http://www.justlaw.com.tw/TxtSearch.php?keystr=%E6%B3%95%E9%99%A2&page={}'.format(page)
        session = requests.Session()
        headers = {
            'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/79.0.3945.79 Chrome/79.0.3945.79 Safari/537.36'
        }
        html = session.get(url, headers=headers)
        print('Connection status: {}'.format(html))
        soup = BeautifulSoup(html.text, 'lxml')

        print('page:{}'.format(page))
        print('※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※')

        article_list = soup.find_all('a', {'class':'link1'})

        if not article_list:
            print('Last page reached, stopping the crawler.')
            return

        article_num = 0
        for article in article_list:
            time.sleep(1)
            article_title = article.text.strip()
            article_href = article.get('href')
            article_url = 'http://www.justlaw.com.tw/{}'.format(article_href)

            sel_sql = input_sql.select_sql(mydb, article_url)
            if article_num == 0 and (not sel_sql or article_title == ''):
                print('Nothing new, stopping the crawler.')
                return
            elif not sel_sql or article_title == '':
                print('Reached stored articles, finishing this page.')
                break

            article_num += 1
            print('Article number:{}, url:{}'.format(article_num, article_url))
            print('title:{}'.format(article_title))
            print('-----------------------------------------------------------')
            headers = {
                'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/79.0.3945.79 Chrome/79.0.3945.79 Safari/537.36',
                'Referer':url
            }
            article_html = session.get(article_url, headers=headers)
            article_soup = BeautifulSoup(article_html.text, 'lxml')
            content_divs = article_soup.find_all('div',
                                                 {'class': 'law_inform_cnt'})
            if not content_divs:
                continue
            article_content = ''.join(div.text for div in content_divs)
            data = {
                "title":article_title,
                "url":article_url,
                "content":article_content,
                "category":category,
                'date':today.strftime('%Y-%m-%d')
            }
            if data in data_list:
                continue
            print(data)
            print('θθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθθ')
            data_list.append(data)

        # input_sql.insert_sql(mydb, data_list)
        page += 1
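
None of the crawlers set a timeout or a retry policy, so one hung request
stalls the whole loop. A hedged sketch of a more robust session factory,
using the standard urllib3 Retry support in requests (not part of the
original code):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session(user_agent):
    # Retry transient failures (429/5xx) with exponential backoff.
    session = requests.Session()
    retry = Retry(total=3, backoff_factor=1,
                  status_forcelist=[429, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    session.headers.update({'User-Agent': user_agent})
    return session

Callers would still pass an explicit timeout, e.g.
session.get(url, timeout=10).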
Example #4
def crawler_lehman(mydb, category):
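    """Crawl the Lehman military site's JSON listing endpoint page by page
    (15 articles per POST) and store new articles via input_sql."""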
    page = 1
    today = date.today()
    print('Today is {}'.format(today))
    while True:
        data_list = []
        url = 'http://123.57.143.90/art/articleHomeShows.htm'
        session = requests.Session()
        headers = {
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/79.0.3945.79 Chrome/79.0.3945.79 Safari/537.36'
        }
        post_data = {'pageSize': '15', 'curPage': str(page)}
        resp = session.post(url, data=post_data, headers=headers)
        data_json = resp.json()
        article_list = data_json['rows']
        print('page:{}'.format(page))
        print('※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※')

        if not article_list:
            print('Last page reached, stopping the crawler.')
            return

        article_num = 0
        for article in article_list:
            article_href = article['id']
            article_url = 'http://123.57.143.90/art/show.htm?id={}'.format(
                article_href)
            article_title = article['name']
            sel_sql = input_sql.select_sql(mydb, article_url)
            if article_num == 0 and (not sel_sql
                                     or article_title.strip() == ''):
                print('Nothing new, stopping the crawler.')
                return
            elif not sel_sql or article_title.strip() == '':
                print('Reached stored articles, finishing this page.')
                break
            article_num += 1
            print('Article number:{}, url:{}'.format(article_num, article_url))
            print('title:{}'.format(article_title))
            print(
                '-----------------------------------------------------------')
            article_html = session.get(article_url, headers=headers)
            article_soup = BeautifulSoup(article_html.text, 'lxml')
            article_content_list = article_soup.find_all("p")
            text_list = []
            for content_text in article_content_list:
                text = content_text.text.strip()
                # Skip empty paragraphs and the site-name watermark.
                if text == '' or '雷曼军事网' in text:
                    continue
                text_list.append(text)

            article_content = ''.join(text_list)
            data = {
                "title": article_title,
                "url": article_url,
                "content": article_content,
                "category": category,
                'date': today.strftime('%Y-%m-%d')
            }
            if data in data_list:
                continue
            data_list.append(data)
        input_sql.insert_sql(mydb, data_list)
        page += 1
        time.sleep(5)
Example #5
def crawler_udn_opinion(mydb, category):
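    """Crawl udn.com opinion/newest AJAX listings for the 'military' or
    'travel' category and store new articles via input_sql."""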
    crawler_page = 1
    today = date.today()
    print('Today is {}'.format(today))
    while True:
        data_list = []
        print('category:{}, page:{}'.format(category, crawler_page))

        if category == 'military':
            url = 'https://opinion.udn.com/opinion/ajax_articletag/%E8%BB%8D%E4%BA%8B%E8%A9%95%E8%AB%96/{}?_=1576737799589'.format(
                crawler_page)
        elif category == 'travel':
            url = 'https://udn.com/rank/ajax_newest/1013/0/{}?_=1576829807430'.format(
                crawler_page)
        else:
            break
        crawler_page += 1
        html = requests.get(url)
        soup = BeautifulSoup(html.text, 'lxml')
        article_list = soup.find_all('h2')

        if not article_list:
            print('Last page reached, stopping the crawler.')
            return

        article_num = 0
        for article in article_list:
            article_title = article.a.text.strip()
            if category == 'military':
                article_url = 'https://opinion.udn.com{}'.format(
                    article.a.get('href'))
            elif category == 'travel':
                article_url = article.a.get('href')
            sel_sql = input_sql.select_sql(mydb, article_url)
            if article_num == 0 and (not sel_sql or article_title == ''):
                print('Nothing new, stopping the crawler.')
                return
            elif not sel_sql or article_title == '':
                print('Reached stored articles, finishing this page.')
                break
            article_num += 1
            print('Article number:{}, url:{}'.format(article_num, article_url))
            print('--------------------------------------------------------')
            article_html = requests.get(article_url)
            article_soup = BeautifulSoup(article_html.text, 'lxml')
            content_text_list = article_soup.find_all('p')
            text_list = []
            for content_text in content_text_list:
                text_list.append(content_text.text)
            content = ''.join(text_list)
            data = {
                'title': article_title,
                'url': article_url,
                'content': content,
                'category': category,
                'date': today.strftime('%Y-%m-%d')
            }
            if data in data_list:
                continue
            data_list.append(data)
        input_sql.insert_sql(mydb, data_list)
Example #6
def crawler_udn(mydb, category):
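    """Crawl udn.com get_article AJAX listings ('constellation' or
    'military') with lxml XPath and store new articles via input_sql."""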
    crawler_page = 1
    today = date.today()
    print('Today is {}'.format(today))
    while True:
        print('category:{}, page:{}'.format(category, crawler_page))

        data_list = []
        if category == 'constellation':
            url = 'https://udn.com/news/get_article/{}/2/6649/7268?_=1575266787923'.format(
                crawler_page)
        elif category == 'military':
            url = 'https://udn.com/news/get_article/{}/2/6638/10930?_=1575956277623'.format(
                crawler_page)
        else:
            break

        crawler_page += 1
        html = requests.get(url)
        html_et = etree.HTML(html.text)
        article_num = 0
        for news_list_num in range(1, 21):
            try:
                article = html_et.xpath(
                    '/html/body/dt[{}]/a[2]'.format(news_list_num))
                article_title = html_et.xpath(
                    '/html/body/dt[{}]/a[2]/h2/text()'.format(news_list_num))
            except Exception as e:
                print(e)
                return
            if not article:
                continue
            for news in article:
                news_url = 'http://udn.com{}'.format(news.attrib['href'])
                sel_sql = input_sql.select_sql(mydb, news_url)
                # article_title is a list returned by xpath(), so test for
                # emptiness rather than comparing with ''.
                if article_num == 0 and (not sel_sql or not article_title):
                    print('Nothing new, stopping the crawler.')
                    return
                elif not sel_sql or not article_title:
                    print('Reached stored articles, finishing this page.')
                    break
                article_num += 1
                print('Article number:{}, url:{}'.format(
                    article_num, news_url))
                print(
                    '--------------------------------------------------------')

                news_html = requests.get(news_url)
                news_soup = BeautifulSoup(news_html.text, 'lxml')
                paragraphs = news_soup.find_all('p')
                content = ''.join(p.text for p in paragraphs)

                data = {
                    'title': article_title[0].strip(),
                    'url': news_url,
                    'content': content,
                    'category': category,
                    'date': today.strftime('%Y-%m-%d')
                }
                if data in data_list:
                    continue
                print(
                    '----------------------------------------------------------------------------'
                )
                data_list.append(data)
        input_sql.insert_sql(mydb, data_list)
Example #7
def crawler_storm(mydb, category):
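    """Crawl a storm.mg author listing (military commentary) page by page
    and store new articles via input_sql."""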
    page = 1
    today = date.today()
    print('Today is {}'.format(today))
    while True:
        data_list = []
        url = 'https://www.storm.mg/authors/126954/%E9%A2%A8%E4%BA%91%E8%BB%8D%E4%BA%8B/{}'.format(
            page)
        html = requests.get(url)
        soup = BeautifulSoup(html.text, 'lxml')

        print('page:{}'.format(page))
        print('※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※')

        article_list = soup.find_all('a', {'class': 'card_link link_title'})
        if not article_list:
            print('Last page reached, stopping the crawler.')
            return

        article_num = 0
        for article in article_list:
            article_title = article.text.strip()
            article_href = article.get('href')
            article_url = 'https://www.storm.mg{}'.format(article_href)
            sel_sql = input_sql.select_sql(mydb, article_url)
            print(article_title)
            if article_num == 0 and (not sel_sql or article_title == ''):
                print('Nothing new, stopping the crawler.')
                return
            elif not sel_sql or article_title == '':
                print('Reached stored articles, finishing this page.')
                break
            article_num += 1
            print('Article number:{}, url:{}'.format(article_num, article_url))
            print('title:{}'.format(article_title))
            print(
                '-----------------------------------------------------------')
            article_html = requests.get(article_url)
            article_soup = BeautifulSoup(article_html.text, 'lxml')
            article_content_list = article_soup.find_all(
                'div', {'id': 'CMS_wrapper'})
            text_list = []
            for content_div in article_content_list:
                # Pull <p> tags straight from the parsed div rather than
                # re-parsing its plain text, which discards the markup.
                for text_p in content_div.find_all('p'):
                    text_list.append(text_p.text)
            article_content = ''.join(text_list)
            data = {
                "title": article_title,
                "url": article_url,
                "content": article_content,
                "category": category,
                'date': today.strftime('%Y-%m-%d')
            }
            if data in data_list:
                continue
            data_list.append(data)

        input_sql.insert_sql(mydb, data_list)
        page += 1
def crawler_upmedia(mydb, category):
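    """Crawl the upmedia.mg news list (Type=157) page by page and store
    new articles via input_sql."""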
    page = 1
    today = date.today()
    print('Today is {}'.format(today))
    while True:
        data_list = []
        url = 'https://www.upmedia.mg/news_list.php?currentPage={}&Type=157'.format(
            page)
        session = requests.Session()
        headers = {
            'user-agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/79.0.3945.79 Chrome/79.0.3945.79 Safari/537.36'
        }
        html = session.get(url, headers=headers)
        soup = BeautifulSoup(html.text, 'lxml')
        article_list = soup.find_all('dd')
        print('page:{}'.format(page))
        print('※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※※')

        if not article_list:
            print('Last page reached, stopping the crawler.')
            return

        article_num = 0
        for articles in article_list:
            article_href = None
            article_title = ''
            for article_detail in articles.find_all('a'):
                href = article_detail.get('href')
                # Keep only links that point at an article page.
                if href and 'news_info' in href:
                    article_href = href
                    article_title = article_detail.text
                    break
            if article_href is None:
                continue
            article_url = 'https://www.upmedia.mg/{}'.format(article_href)
            sel_sql = input_sql.select_sql(mydb, article_url)
            if article_num == 0 and (not sel_sql
                                     or article_title.strip() == ''):
                print('Nothing new, stopping the crawler.')
                return
            elif not sel_sql or article_title.strip() == '':
                print('Reached stored articles, finishing this page.')
                break
            article_num += 1
            print('Article number:{}, url:{}'.format(article_num, article_url))
            print('Title:{}'.format(article_title))
            print(
                '-----------------------------------------------------------')
            article_html = session.get(article_url, headers=headers)
            article_soup = BeautifulSoup(article_html.text, 'lxml')

            article_content_list = article_soup.find_all(
                "div", {"class": "editor"})
            text_list = []
            for content_text in article_content_list:
                if content_text.text.strip() == '':
                    continue
                text_list.append(content_text.text.strip())

            article_content = ''.join(text_list)
            data = {
                "title": article_title,
                "url": article_url,
                "content": article_content,
                "category": category,
                'date': today.strftime('%Y-%m-%d')
            }
            if data in data_list:
                continue
            data_list.append(data)
        input_sql.insert_sql(mydb, data_list)
        page += 1
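
A minimal driver sketch tying the examples together. connect_db is the
hypothetical helper sketched after the first example; the category strings
for crawler_udn and crawler_udn_opinion come from the functions themselves,
while the rest are assumptions:

if __name__ == '__main__':
    mydb = connect_db()  # hypothetical; use the project's real DB handle
    crawler_udn(mydb, 'constellation')
    crawler_udn(mydb, 'military')
    crawler_udn_opinion(mydb, 'travel')
    crawler_ltn(mydb, 'travel')      # assumed label: the listing is a travel feed
    crawler_storm(mydb, 'military')  # assumed label: a military-commentary author feed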