def __init__(self):
    self.bf = BloomFilter(
        host=setting['redies_host'],
        port=setting['redis_port'],
        key='article_toutiao_test',
        blockNum=1,
        db=0,
    )
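Every crawler below shares this same BloomFilter as its dedup store. A minimal sketch of the check-then-insert pattern the examples repeat, assuming only the is_contains and insert methods used throughout this file:

    def seen_before(bf, url):
        # True if the url was already crawled; otherwise record it
        # so the next worker that sees it will skip it
        if bf.is_contains(url):
            return True
        bf.insert(url)
        return False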
Example #2
    def __init__(self):
        self.start_url = 'https://www.toutiao.com/ch/news_house/'

        options = webdriver.ChromeOptions()
        options.add_argument('--headless')

        self.driver = webdriver.Chrome(options=options)

        self.bf = BloomFilter(host=setting['redies_host'],
                              port=setting['redis_port'],
                              key='article_toutiao_test',
                              blockNum=1,
                              db=0, )
        self.rabbit = Rabbit(host=setting['rabbitmq_host'], port=setting['rabbitmq_port'], )
Example #3
def __init__(self):
     self.headers = {
         "User-Agent":
         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
     }
     self.start_url = "http://is.snssdk.com/api/news/feed/v46/?category=news_house"
     self.proxies = [
         {
             "http": "http://192.168.0.96:3234"
         },
         {
             "http": "http://192.168.0.93:3234"
         },
         {
             "http": "http://192.168.0.90:3234"
         },
         {
             "http": "http://192.168.0.94:3234"
         },
         {
             "http": "http://192.168.0.98:3234"
         },
         {
             "http": "http://192.168.0.99:3234"
         },
         {
             "http": "http://192.168.0.100:3234"
         },
         {
             "http": "http://192.168.0.101:3234"
         },
         {
             "http": "http://192.168.0.102:3234"
         },
         {
             "http": "http://192.168.0.103:3234"
         },
     ]
     self.bf = BloomFilter(
         host=setting['redies_host'],
         port=setting['redis_port'],
         key='article_toutiao_test',
         blockNum=1,
         db=0,
     )
     self.rabbit = Rabbit(host=setting['rabbitmq_host'],
                          port=setting['rabbitmq_port'])
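Each crawler picks a proxy with self.proxies[random.randint(0, 9)]. A hedged alternative sketch (get_with_proxy is a hypothetical helper, not from the original) that uses random.choice so the list length is not hard-coded:

    import random
    import requests

    def get_with_proxy(url, headers, proxies):
        # retry with a fresh random proxy until a request succeeds
        while True:
            try:
                return requests.get(url, headers=headers,
                                    proxies=random.choice(proxies))
            except Exception:
                continue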
Example #4
def __init__(self):
     self.headers = {
         "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36",
         # "Cookie": "TEMP_USER_ID=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1aWQiOiI1YWYxNDk5OTY4ZDYzIiwidGltZSI6MTUyNTc2MjQ1N30.yT2cDnBOA7Zj9lFxI52f064z6zI4zxPv78HWjvXvwyc; city_redirected=2; prov=cn021; city=021; weather_city=sh; region_ip=116.247.70.x; region_ver=1.2; userid=1525762465015_d0klfz8748; Hm_lvt_2618c9646a4a7be2e5f93653be3d5429=1525762465; Hm_lpvt_2618c9646a4a7be2e5f93653be3d5429=1525762465; ifh_site=3066%2C"
     }
     self.start_url = "http://sh.house.163.com/news/"
     self.proxies = [{"http": "http://192.168.0.96:3234"},
                     {"http": "http://192.168.0.93:3234"},
                     {"http": "http://192.168.0.90:3234"},
                     {"http": "http://192.168.0.94:3234"},
                     {"http": "http://192.168.0.98:3234"},
                     {"http": "http://192.168.0.99:3234"},
                     {"http": "http://192.168.0.100:3234"},
                     {"http": "http://192.168.0.101:3234"},
                     {"http": "http://192.168.0.102:3234"},
                     {"http": "http://192.168.0.103:3234"}, ]
     self.bf = BloomFilter(host=setting['redies_host'],
                           port=setting['redis_port'],
                           key='article_toutiao_test',
                           blockNum=1,
                           db=0, )
class Meijing(object):
    def __init__(self):
        self.bf = BloomFilter(
            host=setting['redies_host'],
            port=setting['redis_port'],
            key='article_toutiao_test',
            blockNum=1,
            db=0,
        )

    def meijingstart(self):
        try:
            headers = {
                'Accept':
                'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Cache-Control': 'max-age=0',
                'Connection': 'keep-alive',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
                'Upgrade-Insecure-Requests': '1',
                'Host': 'www.nbd.com.cn',
            }
            url = 'http://www.nbd.com.cn/fangchan'
            response = requests.get(url=url, headers=headers)
            soup = BeautifulSoup(response.text, 'lxml')

            piece = soup.select('.m-columnnews-list')[0]
            eachpiece = piece.select('li')
            for i in eachpiece:
                read_num = i.select('.f-source > span')[2].text.strip().strip(
                    '阅读')  # read count; strips the trailing "阅读" label
                link = i.select('.f-title')[0].get('href')  # article link
                if self.bf.is_contains(link):
                    log.info('already in bloom_filter: {}'.format(link))
                else:
                    self.bf.insert(link)
                    log.info('not in bloom_filter, inserting new url: {}'.format(link))
                    proxies = [
                        {
                            "http": "http://192.168.0.96:3234"
                        },
                        {
                            "http": "http://192.168.0.93:3234"
                        },
                        {
                            "http": "http://192.168.0.90:3234"
                        },
                        {
                            "http": "http://192.168.0.94:3234"
                        },
                        {
                            "http": "http://192.168.0.98:3234"
                        },
                        {
                            "http": "http://192.168.0.99:3234"
                        },
                        {
                            "http": "http://192.168.0.100:3234"
                        },
                        {
                            "http": "http://192.168.0.101:3234"
                        },
                        {
                            "http": "http://192.168.0.102:3234"
                        },
                        {
                            "http": "http://192.168.0.103:3234"
                        },
                    ]
                    headers = {
                        'Accept':
                        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                        'Accept-Encoding': 'gzip, deflate',
                        'Accept-Language': 'zh-CN,zh;q=0.9',
                        'Cache-Control': 'max-age=0',
                        'Connection': 'keep-alive'
                    }
                    while True:
                        try:
                            response = requests.get(
                                url=link,
                                headers=headers,
                                proxies=proxies[random.randint(0, 9)])
                            break
                        except Exception as e:
                            log.error(e)

                    soup1 = BeautifulSoup(response.text, 'lxml')

                    title = soup1.select('.g-article-top > h1')[0].text.strip()
                    source = soup1.select('.source')[0].text.strip()
                    time = soup1.select('.time')[0].text.strip()
                    content = soup1.select('.g-articl-text')[0]
                    content = content.prettify()
                    img_replace = ImageReplace()
                    con = img_replace.image_download(content)
                    tag = soup1.select('.u-aticle-tag > span')
                    category = soup1.select('.u-column')[0].text
                    tag_list = [j.text for j in tag]

                    article = Article('每经')  # create the article record before filling its fields
                    try:
                        desc = soup1.select('.g-article-abstract > p')[0].text
                        article.desc = desc
                        imglink = i.select('.u-columnnews-img > img')[0].get(
                            'data-aload')  # title image link
                        file_name = imglink
                        imglink = qiniufetch(imglink, file_name)
                        article.title_img = imglink
                    except Exception:
                        article.desc = ''
                        article.title_img = ''
                    article.title = title
                    article.source_detail = source
                    article.post_time = time
                    article.body = con
                    article.tag = tag_list
                    article.category = category
                    article.read_num = read_num
                    article.url = link
                    article.crawler_time = datetime.datetime.now()
                    article.insert_db()
                    log.info("{}文章入库".format("每经"))

            morelink = soup.select('#more')[0].get('href')
            return morelink
        except Exception as e:
            log.error(e)
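meijingstart crawls one list page and returns the 'more' link, which the snippet itself never consumes. A possible driver, as a sketch only:

    if __name__ == '__main__':
        crawler = Meijing()
        more_link = crawler.meijingstart()
        log.info('next list page: {}'.format(more_link))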
Example #6
class Wangyi:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36",
            # "Cookie": "TEMP_USER_ID=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1aWQiOiI1YWYxNDk5OTY4ZDYzIiwidGltZSI6MTUyNTc2MjQ1N30.yT2cDnBOA7Zj9lFxI52f064z6zI4zxPv78HWjvXvwyc; city_redirected=2; prov=cn021; city=021; weather_city=sh; region_ip=116.247.70.x; region_ver=1.2; userid=1525762465015_d0klfz8748; Hm_lvt_2618c9646a4a7be2e5f93653be3d5429=1525762465; Hm_lpvt_2618c9646a4a7be2e5f93653be3d5429=1525762465; ifh_site=3066%2C"
        }
        self.start_url = "http://sh.house.163.com/news/"
        self.proxies = [{"http": "http://192.168.0.96:3234"},
                        {"http": "http://192.168.0.93:3234"},
                        {"http": "http://192.168.0.90:3234"},
                        {"http": "http://192.168.0.94:3234"},
                        {"http": "http://192.168.0.98:3234"},
                        {"http": "http://192.168.0.99:3234"},
                        {"http": "http://192.168.0.100:3234"},
                        {"http": "http://192.168.0.101:3234"},
                        {"http": "http://192.168.0.102:3234"},
                        {"http": "http://192.168.0.103:3234"}, ]
        self.bf = BloomFilter(host=setting['redies_host'],
                              port=setting['redis_port'],
                              key='article_toutiao_test',
                              blockNum=1,
                              db=0, )



    def connect(self):
        connection = pika.BlockingConnection(
            pika.ConnectionParameters(host=setting['rabbitmq_host'],
                                      port=setting['rabbitmq_port']))
        self.channel = connection.channel()
        self.channel.exchange_declare('article', 'direct', durable=True)
        self.channel.queue_declare('wangyi_article', durable=True)
        self.channel.queue_bind(exchange='article',
                                queue='wangyi_article',
                                routing_key='white')

    def start_crawler(self):
        res = requests.get(self.start_url, headers=self.headers)
        res.encoding = 'gbk'
        html = etree.HTML(res.text)
        city_list = html.xpath("//div[@class='city']/a")
        for city in city_list:
            city_name = city.xpath("./text()")[0]
            city_url = city.xpath("./@href")[0]
            city_news_url = city_url+'news'
            self.city_news(city_name, city_news_url)

    def city_news(self, city_name, city_url):
        while True:
            try:
                proxy = self.proxies[random.randint(0, 9)]
                news_res = requests.get(city_url, headers=self.headers, proxies=proxy)
                break
            except Exception as e:
                log.error(e)
                continue
        news_res.encoding = 'gbk'
        news_html = etree.HTML(news_res.text)
        try:
            cate_list = news_html.xpath("//div[@class='importent-news']")
        except Exception as e:
            log.info(e)
            return
        for cate in cate_list:
            cate_name = cate.xpath("./h2/a/text()")[0]
            news_list = cate.xpath("./ul/li")
            for news in news_list:
                url = news.xpath("./h3/a/@href")[0]
                if self.bf.is_contains(url):  # dedupe detail-page urls
                    log.info('already in bloom_filter: {}'.format(url))
                    continue
                else:
                    self.bf.insert(url)
                    log.info('not in bloom_filter, inserting new url: {}'.format(url))
                try:
                    desc = news.xpath("./div[@class='news-detail']/p/text()")[0]
                except Exception:
                    desc = None
                article = Article('网易')
                article.url = url
                article.desc = desc
                article.city = city_name
                article.category = cate_name
                message = json.dumps(article.to_dict())

                disconnected = True
                while disconnected:
                    try:
                        disconnected = False
                        self.channel.basic_publish(exchange='article',
                                                   routing_key='white',
                                                   body=message,
                                                   properties=pika.BasicProperties(delivery_mode=2))
                        log.info('published to queue')
                    except Exception as e:
                        log.error(e)
                        self.connect()
                        disconnected = True
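Wangyi only publishes; the consumer side of the 'wangyi_article' queue is not shown. A minimal consumer sketch, assuming pika 1.x and the same setting dict (not part of the original example):

    import json
    import pika

    connection = pika.BlockingConnection(pika.ConnectionParameters(
        host=setting['rabbitmq_host'], port=setting['rabbitmq_port']))
    channel = connection.channel()
    channel.queue_declare('wangyi_article', durable=True)

    def on_message(ch, method, properties, body):
        article = json.loads(body)  # the dict built by Article.to_dict()
        # ... store or process the article here ...
        ch.basic_ack(delivery_tag=method.delivery_tag)  # ack only after handling

    channel.basic_consume(queue='wangyi_article', on_message_callback=on_message)
    channel.start_consuming()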
Example #7
class Toutiao:
    def __init__(self):
        self.headers = {
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
        }
        self.start_url = "http://is.snssdk.com/api/news/feed/v46/?category=news_house"
        self.proxies = [
            {
                "http": "http://192.168.0.96:3234"
            },
            {
                "http": "http://192.168.0.93:3234"
            },
            {
                "http": "http://192.168.0.90:3234"
            },
            {
                "http": "http://192.168.0.94:3234"
            },
            {
                "http": "http://192.168.0.98:3234"
            },
            {
                "http": "http://192.168.0.99:3234"
            },
            {
                "http": "http://192.168.0.100:3234"
            },
            {
                "http": "http://192.168.0.101:3234"
            },
            {
                "http": "http://192.168.0.102:3234"
            },
            {
                "http": "http://192.168.0.103:3234"
            },
        ]
        self.bf = BloomFilter(
            host=setting['redies_host'],
            port=setting['redis_port'],
            key='article_toutiao_test',
            blockNum=1,
            db=0,
        )
        self.rabbit = Rabbit(host=setting['rabbitmq_host'],
                             port=setting['rabbitmq_port'])

    def start_crawler(self):
        channel = self.rabbit.get_channel()
        channel.queue_declare(queue='toutiao')
        while True:
            try:
                self.url_list_crawler(channel)
                time.sleep(60)
            except Exception:
                continue

    def url_list_crawler(self, channel):
        while True:
            proxy = self.proxies[random.randint(0, 9)]
            try:
                response = requests.get(self.start_url,
                                        headers=self.headers,
                                        proxies=proxy)
                url_dict = json.loads(response.text)
                url_list = url_dict["data"]
                break
            except Exception:
                continue
        for url_content in url_list:
            con = url_content["content"]
            try:
                url = re.search('display_url":"(.*?)"', con).group(1)
            except Exception:
                continue
            if re.search('wukong', url):
                continue
            else:
                if self.bf.is_contains(url):  # dedupe detail-page urls
                    log.info('already in bloom_filter: {}'.format(url))
                    continue
                else:
                    self.bf.insert(url)
                    log.info('not in bloom_filter, inserting new url: {}'.format(url))
                    article = Article('今日头条')
                    comment_code = Comment_url()
                    try:
                        organization_author = re.search(
                            r'\"source\":\"(.*?)\"', con).group(1)
                        article.organization_author = organization_author
                    except Exception as e:
                        log.info('no organization_author')
                    title = re.findall('"title":"(.*?)"', con)[1]
                    article.title = title
                    article.url = url
                    article.article_id = re.search(r'group/(\d+)', url).group(1)
                    comment_code.group_id = article.article_id
                    comment_code.crawler_time = datetime.datetime.utcnow()
                    try:
                        comment_count = re.search(r'\"comment_count\":(\d+)',
                                                  con).group(1)
                        article.comment_count = comment_count
                        comment_code.comment_count = comment_count
                    except Exception as e:
                        log.info('article "{}" has no comments'.format(title))
                    try:
                        title_img = re.search(
                            'middle_image.*?"url":"(.*?.webp)', con).group(1)
                        new_title_img = qiniufetch(title_img, title_img)
                        article.title_img = new_title_img
                    except Exception as e:
                        log.info('article "{}" has no list image'.format(title))

                    channel.basic_publish(exchange='',
                                          routing_key='toutiao',
                                          body=json.dumps(article.to_dict()))
                    log.info('published to queue')
Example #8
class Toutiao:
    def __init__(self):
        self.start_url = 'https://www.toutiao.com/ch/news_house/'

        options = webdriver.ChromeOptions()
        options.add_argument('--headless')

        self.driver = webdriver.Chrome(options=options)

        self.bf = BloomFilter(host=setting['redies_host'],
                              port=setting['redis_port'],
                              key='article_toutiao_test',
                              blockNum=1,
                              db=0, )
        self.rabbit = Rabbit(host=setting['rabbitmq_host'], port=setting['rabbitmq_port'], )

    def start_crawler(self):
        self.driver.get(self.start_url)
        time.sleep(5)
        channel = self.rabbit.get_channel()
        channel.queue_declare(queue='article_test')
        while True:
            self.find_list_info(channel)
            self.driver.refresh()
            time.sleep(20)

    def find_list_info(self, channel):
        article_list = self.driver.find_elements_by_xpath('/html/body/div/div[4]/div[2]/div[2]/div/div/div/ul/li')
        print('list length:', len(article_list))
        for i in article_list:
            if '看到这里' in i.text:
                print('reached the end-of-feed marker (看到这里)')
                break
            try:
                wenda = i.find_element_by_xpath('div/div[1]/div/div[2]/div[1]/div/a[2]').text
            except Exception as e:
                continue
            if '悟空问答' in wenda:
                print('Wukong Q&A post, skipping')
                continue
            article_id = i.get_attribute('group_id')

            # check article_id against the bloom filter
            if self.bf.is_contains(article_id):
                print('already in bloom_filter!')
                continue
            else:
                self.bf.insert(article_id)
                print('not in bloom_filter, inserting article_id!')

                article = Article('今日头条')
                try:
                    organization_author = i.find_element_by_xpath('div/div[1]/div/div[2]/div[1]/div/a[2]').text.replace(
                        '⋅', '')
                    article.organization_author = organization_author.strip()
                except Exception as e:
                    print('no organization_author')
                title = i.find_element_by_xpath('div/div[1]/div/div[1]/a').text
                article.title = title
                url = i.find_element_by_xpath('div/div[1]/div/div[1]/a').get_attribute('href')
                article.url = url
                # post_time = i.find_element_by_xpath('div/div[1]/div/div[2]/div[1]/span').text
                # article.post_time = post_time

                try:
                    comment_str = i.find_element_by_xpath('div/div[1]/div/div[2]/div[1]/div/a[3]').text
                    comment_count = int(re.search(r'\d+', comment_str, re.S | re.M).group())
                    article.comment_count = comment_count
                except Exception as e:
                    print('article has no comments:', title)

                try:
                    title_img = i.find_element_by_xpath('div/div[2]/a/img').get_attribute('src')
                    article.title_img = [title_img]
                except Exception as e:
                    print('article has no list image:', title)

                print(article.to_dict())
                # articles not already in the filter go to rabbitmq

                channel.basic_publish(exchange='',
                                      routing_key='article_test',
                                      body=json.dumps(article.to_dict()))
                print('published to queue')
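A usage sketch for this selenium-based variant. start_crawler loops forever, so wrapping it in try/finally with driver.quit() (cleanup added here, not in the original) keeps headless Chrome processes from leaking:

    if __name__ == '__main__':
        crawler = Toutiao()
        try:
            crawler.start_crawler()
        finally:
            crawler.driver.quit()  # always release the headless browser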
import pika
import yaml
from itertools import cycle
from pymongo import MongoClient
from article_img.qiniu_fetch import qiniufetch
# BloomFilter, LogHandler and the other project-local helpers are imported
# elsewhere in the original file; their import paths are not shown here

setting = yaml.safe_load(open('config_local.yaml'))

m = MongoClient(setting['mongo_config']['config_host'],
                setting['mongo_config']['port'])
m.admin.authenticate(setting['mongo_config']['user_name'],
                     setting['mongo_config']['password'])
collection = m[setting['mongo_config']['config_db']][
    setting['mongo_config']['coll_list']]

bf = BloomFilter(
    host=setting['redies_host'],
    port=setting['redis_port'],
    key='article_test',
    blockNum=1,
    db=0,
)

log = LogHandler(__name__)

connect = pika.BlockingConnection(
    pika.ConnectionParameters(
        host=setting['rabbitmq_host'],
        port=setting['rabbitmq_port'],
    ))


class CrawlerArticleListUrl:
    def __init__(self):
        ...  # body truncated in the source listing

class Fenghuang:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36",
            "Cookie" : "TEMP_USER_ID=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1aWQiOiI1YWYxNDk5OTY4ZDYzIiwidGltZSI6MTUyNTc2MjQ1N30.yT2cDnBOA7Zj9lFxI52f064z6zI4zxPv78HWjvXvwyc; city_redirected=2; prov=cn021; city=021; weather_city=sh; region_ip=116.247.70.x; region_ver=1.2; userid=1525762465015_d0klfz8748; Hm_lvt_2618c9646a4a7be2e5f93653be3d5429=1525762465; Hm_lpvt_2618c9646a4a7be2e5f93653be3d5429=1525762465; ifh_site=3066%2C"
        }
        self.start_url = "http://sh.house.ifeng.com/news/wap"
        self.proxies = [{"http": "http://192.168.0.96:3234"},
                        {"http": "http://192.168.0.93:3234"},
                        {"http": "http://192.168.0.90:3234"},
                        {"http": "http://192.168.0.94:3234"},
                        {"http": "http://192.168.0.98:3234"},
                        {"http": "http://192.168.0.99:3234"},
                        {"http": "http://192.168.0.100:3234"},
                        {"http": "http://192.168.0.101:3234"},
                        {"http": "http://192.168.0.102:3234"},
                        {"http": "http://192.168.0.103:3234"}, ]
        self.bf = BloomFilter(host=setting['redies_host'],
                              port=setting['redis_port'],
                              key='article_toutiao_test',
                              blockNum=1,
                              db=0, )
        # self.rabbit = Rabbit(host=setting['rabbitmq_host'], port=setting['rabbitmq_port'])

    def connect(self):
        connection = pika.BlockingConnection(
            pika.ConnectionParameters(host=setting['rabbitmq_host'],
                                      port=setting['rabbitmq_port']))
        self.channel = connection.channel()
        self.channel.exchange_declare('article', 'direct', durable=True)
        self.channel.queue_declare('fenghuang_article', durable=True)
        self.channel.queue_bind(exchange='article',
                                queue='fenghuang_article',
                                routing_key='black')

    def start_crawler(self):
        res = requests.get('http://house.ifeng.com/news', headers=self.headers)
        res.encoding = 'utf-8'
        html = etree.HTML(res.text)
        city_list = html.xpath("//ul[@id='city_hot1']/li/a")
        for city in city_list:
            city_name = city.xpath("./text()")[0]
            city_url = city.xpath("./@href")[0]
            city_id = city.xpath("./@siteid")[0]
            news_url = city_url + '/news'

            news_res = requests.get(news_url, headers=self.headers)
            news_res.encoding = 'utf-8'
            news_html = etree.HTML(news_res.text)
            cate_id_list = news_html.xpath("//ul[@id='newsNavScroll']/li[@cateid]")

            self.article_url_crawler(city_name, city_id, news_url, cate_id_list)


    def article_url_crawler(self, city_name, city_id, news_url, cate_id_list):
        post_url = news_url + '/wap'
        count = 1
        while True:
            for i in cate_id_list:
                cate_id = i.xpath("./@cateid")[0]
                cate_name = i.xpath("./a/text()")[0]
                formdata = {
                    'pageid': count,
                    'cateid': cate_id,
                    'siteid': city_id,
                    'type': 2
                }
                # time.sleep(10)
                while True:
                    proxy = self.proxies[random.randint(0, 9)]
                    try:
                        res = requests.post(post_url, data=formdata, headers=self.headers, proxies=proxy)
                        json_dict = json.loads(res.text)
                        break
                    except Exception as e:
                        log.error(e)
                        continue

                if len(json_dict['data']['newslist']) == 0:
                    count = 1
                    continue
                else:
                    news_list = json_dict['data']['newslist']
                    for news_info in news_list:

                        try:
                            desc = news_info['desc']
                            url = news_info['url']
                            keywords_list = list(news_info['keywords'].values())
                        except Exception:
                            continue
                        title_img_url = news_info['pic_url']
                        if self.bf.is_contains(url):  # dedupe detail-page urls
                            log.info('already in bloom_filter: {}'.format(url))
                            continue
                        else:
                            self.bf.insert(url)
                            log.info('not in bloom_filter, inserting new url: {}'.format(url))
                            new_title_img = qiniufetch(title_img_url, title_img_url)
                            article = Article('凤凰网')
                            article.url = url
                            article.desc = desc
                            article.tag = str(keywords_list)
                            article.title_img = new_title_img
                            article.city = city_name
                            article.category = cate_name
                            message = json.dumps(article.to_dict())

                            disconnected = True
                            while disconnected:
                                try:
                                    disconnected = False
                                    self.channel.basic_publish(
                                        exchange='article',
                                        routing_key='black',
                                        body=message,
                                        properties=pika.BasicProperties(delivery_mode=2))
                                    log.info('published to queue')
                                except Exception as e:
                                    log.error(e)
                                    self.connect()
                                    disconnected = True

            count += 1
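start_crawler publishes through self.channel, which only exists after connect(); the inner while-disconnected loop can recover from a dropped channel, but calling connect() up front avoids relying on that error path. A usage sketch:

    if __name__ == '__main__':
        crawler = Fenghuang()
        crawler.connect()  # create the channel before publishing
        crawler.start_crawler()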