Example 1
def fetch_news():
    news_list = []
    resp = r.get(index_url, headers=headers)
    print("抓取:", resp.url)
    if resp is not None:
        resp.encoding = 'utf8'
        pq = PyQuery(resp.text)
        data_list = pq('ul#date-list-ul')
        for li in data_list('li').items():
            img = li('a > img')
            print(li('p').text())
            news_list.append(
                News(
                    url=li('a').attr('href'),
                    _id=li('a').attr('href').split('/')[-1].replace(
                        '.html', ''),
                    title=img.attr('alt'),
                    image=img.attr('src'),
                    overview=li('div#list-t p#list-abs').text(),
                    publish_time=li(
                        'div#list-t > p#list-sm span:first').text(),
                    origin=li('div#list-t > p#list-sm > span:last').text(),
                ).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('xinlvjie')
    client.insert_many(fetch_news())
    print("新旅社爬取完毕!")
Example 2
    resp = r.post(ajax_url, data=json.dumps(ajax_params), headers=headers)
    if resp is not None:
        res = resp.json()
        for i in res['res']:
            news_list.append(
                News(_id=i['id'],
                     title=i['title'],
                     overview=i['content'],
                     publish_time=i['create_time'],
                     origin=i['src_name'],
                     url=news_detail_url + i['uid']).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('dongmaiwang')
    cur_page = 1
    while True:
        print("爬取第%d页" % cur_page)
        result_list = fetch_news(cur_page)
        client.insert_many(result_list)
        if int(round(time.time())) - int(
                time.mktime(
                    time.strptime(result_list[-1]['publish_time'],
                                  "%Y-%m-%d %H:%M:%S"))) < 43200:
            cur_page += 1
            continue
        else:
            break
    print("动脉网爬取完毕!")
Example 3
def fetch_diyicaijing_news():
    news_list = []
    resp = r.get(diyicaijing_url,
                 params={'page': 2},
                 headers=diyicaijing_headers)
    bs = BeautifulSoup(resp.text, 'lxml')
    articles = bs.findAll('article', attrs={'class': 'article-item clearfix'})
    for article in articles:
        detail_url = diyicaijing_url[:-1] + article.a['href']
        if not detail_url.endswith('subscribe'):
            news_content = article.div.text.replace(' ', '').replace('\n', '')
            text_result = msg_extract_pattern.findall(news_content)
            if text_result:  # findall() returns a (possibly empty) list, never None
                for content in text_result:
                    news_list.append(
                        News(
                            _id=detail_url.split('/')[-1],
                            url=detail_url,
                            image=url_extract_pattern.search(
                                article.a['style']).group(1),
                            origin=content[0],
                            title=content[1],
                            publish_time=content[2],
                        ).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('diyicaijing')
    client.insert_many(fetch_diyicaijing_news())
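
Example 3 relies on two pre-compiled regular expressions defined outside the excerpt: msg_extract_pattern, which splits the flattened list text into (origin, title, publish_time) tuples and depends on markup that is not shown, and url_extract_pattern, which pulls the image address out of the anchor's inline style (typically background-image: url(...)). A plausible definition for the latter, offered as an assumption rather than the original pattern:

import re

# Captures the address inside "background-image: url(...)"; quotes are optional.
url_extract_pattern = re.compile(r"url\(['\"]?(.*?)['\"]?\)")
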
Example 4
            data_dict = json.loads(data_json)
            for data in data_dict['data']['list']:
                news_list.append(News(
                    _id=data['id'],
                    title=data['title'],
                    overview=data['brief'],
                    image=data['thumb'],
                    publish_time=data['time'],
                    url=data['url'],
                    origin=data['columnName']
                ).to_dict())
                sort_field = data['sort_field']
    return news_list, sort_field


if __name__ == '__main__':
    client = MongodbClient('jueshengwang')
    result = fetch_index_news()
    client.insert_many(result[0])
    count_time -= 1
    min_id = result[1]
    while True:
        result = fetch_more_news(min_id)
        client.insert_many(result[0])
        if int(round(time.time())) - int(result[0][-1]['publish_time']) < 432000:
            count_time -= 1
            min_id = result[1]
            continue
        else:
            break
Example 5
        resp = r.get(penpai_ajax_url,
                     params=ajax_params,
                     headers=penpai_headers)
        resp_content = resp.text
        print("爬取:", resp.url)
        results = news_pattern.findall(resp_content)
        for result in results:
            if '小时前' in result[5]:
                hours_before = hours_pattern.search(result[5])
                if hours_before is not None:
                    if int(hours_before.group(1)) > 12:
                        return news_list
                    else:
                        news_list.append(
                            News(_id=result[0].split('_')[-1],
                                 title=result[2],
                                 overview=result[3].replace('\n', '').replace(
                                     ' ', ''),
                                 url=penpai_url + result[0],
                                 image='http:' + result[1],
                                 publish_time=result[5],
                                 origin=result[4]).to_dict())
        pageidx += 1
        time.sleep(random.randint(0, 2))


if __name__ == '__main__':
    client = MongodbClient('penpai')
    data_list = fetch_penpai_news()
    client.insert_many(data_list)
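
The penpai example filters by relative timestamps: it only keeps items whose time field contains "小时前" ("hours ago") and gives up once that number exceeds 12. Its hours_pattern is defined outside the excerpt; a plausible version, given that group(1) must yield the hour count, would be:

import re

# Matches relative timestamps such as "3小时前" ("3 hours ago") and captures
# the number of hours; an assumption, not necessarily the original pattern.
hours_pattern = re.compile(r'(\d+)\s*小时前')
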
Example 6
                         'pagesize': 20
                     },
                     headers=iheima_headers)
        print("爬取:", resp.url)
        if resp is not None:
            resp_json = resp.json()
            contents = resp_json['contents']
            for content in contents:
                # Only keep recent news; stop once a story is more than 86400 s (24 h) old
                if int(round(time.time())) - int(
                        time.mktime(
                            time.strptime(content['published'],
                                          "%Y-%m-%d %H:%M"))) > 86400:
                    return news_list
                else:
                    news_list.append(
                        News(_id=content['contentid'],
                             title=content['title'],
                             url=iheima_url[:-1] + content['url'],
                             image=content['thumb'],
                             publish_time=content['published'],
                             origin=content['author'],
                             overview=content['description']).to_dict())
            page += 1


if __name__ == '__main__':
    client = MongodbClient('iheima')
    client.insert_many(fetch_iheima_news())
    print("爱黑马爬取完毕!")
Example 7
    bs = BeautifulSoup(resp.text, 'lxml')
    data_list = bs.find("ul", attrs={'class': 'gallery l-list-selected l-m'})
    lis = data_list.findAll('li')
    for li in lis:
        l_cbox = li.find('div', attrs={'class': 'l-cbox'})
        spans = l_cbox.find('div', attrs={
            'class': 'l-foot-par'
        }).findAll('span')
        news_id_result = xhs_news_id_pattern.search(li.a['href'])
        if news_id_result is not None:
            # Compare the publish time with the current time; keep only news from the last 12 hours (43200 s)
            publish_time = spans[1].text.replace('\n', '').strip()
            if int(round(time.time())) - int(
                    time.mktime(
                        time.strptime(publish_time,
                                      "%Y-%m-%d %H:%M:%S"))) < 43200:
                news_list.append(
                    News(_id=news_id_result.group(1),
                         url=li.a['href'],
                         title=li.a.img['alt'],
                         image=xhs_gd_url + li.a.img['src'],
                         origin=spans[0].text,
                         publish_time=publish_time,
                         overview=l_cbox.p.text).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('xinhuashe')
    client.insert_many(fetch_xh_focus())
    client.insert_many(fetch_gd_news())
Example 8
import requests as r  # assumed: "r" is requests, matching its use as r.get()/r.post()
from pyquery import PyQuery

from news import News, MongodbClient
from tools import user_agents

base_url = 'https://www.iyiou.com/breaking/'
headers = {
    'User-Agent': user_agents.random_user_agent()
}


def fetch_news(url):
    news_list = []
    resp = r.get(url, headers=headers)
    print("抓取:", resp.url)
    if resp is not None:
        pq = PyQuery(resp.text)
        a_s = pq('.newsFlashListWrap > div > ul > li > a')
        for item in a_s.items():
            news_list.append(News(
                _id=item.attr('href').split('/')[-1].replace('.html', ''),
                url=item.attr('href'),
                title=item('span.fl').text(),
                publish_time=item('span.fr').text()
            ).to_dict())
    return news_list


if __name__ == '__main__':
    client = MongodbClient('yiou')
    for i in range(1, 3):
        client.insert_many(fetch_news("{}p{}.html".format(base_url, i)))
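
Example 8 draws its User-Agent header from a local tools.user_agents module that is not included in the excerpt. A minimal stand-in under that assumption (the agent strings are only examples):

# tools/user_agents.py -- hypothetical stand-in for the missing helper module.
import random

_USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 '
    '(KHTML, like Gecko) Version/14.0 Safari/605.1.15',
]


def random_user_agent():
    # Return one UA string at random so consecutive requests vary.
    return random.choice(_USER_AGENTS)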