class Meijing(object):
    def __init__(self):
        self.bf = BloomFilter(
            host=setting['redies_host'],
            port=setting['redis_port'],
            key='article_toutiao_test',
            blockNum=1,
            db=0,
        )

    def meijingstart(self):
        try:
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Cache-Control': 'max-age=0',
                'Connection': 'keep-alive',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
                'Upgrade-Insecure-Requests': '1',
                'Host': 'www.nbd.com.cn',
            }
            url = 'http://www.nbd.com.cn/fangchan'
            response = requests.get(url=url, headers=headers)
            soup = BeautifulSoup(response.text, 'lxml')
            piece = soup.select('.m-columnnews-list')[0]
            eachpiece = piece.select('li')
            for i in eachpiece:
                read_num = i.select('.f-source > span')[2].text.strip().strip('阅读')  # read count
                link = i.select('.f-title')[0].get('href')  # article link
                if self.bf.is_contains(link):
                    log.info('already in bloom_filter: {}'.format(link))
                else:
                    self.bf.insert(link)
                    log.info('not in bloom_filter, inserting new url: {}'.format(link))
                    proxies = [
                        {"http": "http://192.168.0.96:3234"},
                        {"http": "http://192.168.0.93:3234"},
                        {"http": "http://192.168.0.90:3234"},
                        {"http": "http://192.168.0.94:3234"},
                        {"http": "http://192.168.0.98:3234"},
                        {"http": "http://192.168.0.99:3234"},
                        {"http": "http://192.168.0.100:3234"},
                        {"http": "http://192.168.0.101:3234"},
                        {"http": "http://192.168.0.102:3234"},
                        {"http": "http://192.168.0.103:3234"},
                    ]
                    headers = {
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                        'Accept-Encoding': 'gzip, deflate',
                        'Accept-Language': 'zh-CN,zh;q=0.9',
                        'Cache-Control': 'max-age=0',
                        'Connection': 'keep-alive',
                    }
                    while True:
                        try:
                            response = requests.get(
                                url=link,
                                headers=headers,
                                proxies=proxies[random.randint(0, 9)])
                            break
                        except Exception as e:
                            log.error(e)
                    soup1 = BeautifulSoup(response.text, 'lxml')
                    title = soup1.select('.g-article-top > h1')[0].text.strip()
                    source = soup1.select('.source')[0].text.strip()
                    time = soup1.select('.time')[0].text.strip()
                    content = soup1.select('.g-articl-text')[0]
                    content = content.prettify()
                    img_replace = ImageReplace()
                    con = img_replace.image_download(content)
                    tag = soup1.select('.u-aticle-tag > span')
                    category = soup1.select('.u-column')[0].text
                    L = []
                    for j in tag:
                        tagList = j.text
                        L.append(tagList)
                    article = Article('每经')  # assumption: an Article record is created per item, mirroring the other crawlers
                    try:
                        desc = soup1.select('.g-article-abstract > p')[0].text
                        article.desc = desc
                        imglink = i.select('.u-columnnews-img > img')[0].get('data-aload')  # title image link
                        file_name = imglink
                        imglink = qiniufetch(imglink, file_name)
                        article.title_img = imglink
                    except:
                        article.desc = ''
                        article.title_img = ''
                    article.title = title
                    article.source_detail = source
                    article.post_time = time
                    article.body = con
                    article.tag = L
                    article.category = category
                    article.read_num = read_num
                    article.url = link
                    article.crawler_time = datetime.datetime.now()
                    article.insert_db()
                    log.info('{} article saved to DB'.format('每经'))
            morelink = soup.select('#more')[0].get('href')
            return morelink
        except Exception as e:
            log.error(e)
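# Usage note: meijingstart() returns the href of the page's "#more" element, which a caller
# could use to request the next page. A minimal, hypothetical entry point -- it assumes the
# module-level setting/log objects and the Article/ImageReplace/qiniufetch helpers are already
# imported, as in the rest of this repo.
if __name__ == '__main__':
    crawler = Meijing()
    more_link = crawler.meijingstart()  # href of the "#more" element, usable for a next-page request
    log.info('next page link: {}'.format(more_link))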
class Wangyi:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36",
            # "Cookie": "TEMP_USER_ID=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1aWQiOiI1YWYxNDk5OTY4ZDYzIiwidGltZSI6MTUyNTc2MjQ1N30.yT2cDnBOA7Zj9lFxI52f064z6zI4zxPv78HWjvXvwyc; city_redirected=2; prov=cn021; city=021; weather_city=sh; region_ip=116.247.70.x; region_ver=1.2; userid=1525762465015_d0klfz8748; Hm_lvt_2618c9646a4a7be2e5f93653be3d5429=1525762465; Hm_lpvt_2618c9646a4a7be2e5f93653be3d5429=1525762465; ifh_site=3066%2C"
        }
        self.start_url = "http://sh.house.163.com/news/"
        self.proxies = [
            {"http": "http://192.168.0.96:3234"},
            {"http": "http://192.168.0.93:3234"},
            {"http": "http://192.168.0.90:3234"},
            {"http": "http://192.168.0.94:3234"},
            {"http": "http://192.168.0.98:3234"},
            {"http": "http://192.168.0.99:3234"},
            {"http": "http://192.168.0.100:3234"},
            {"http": "http://192.168.0.101:3234"},
            {"http": "http://192.168.0.102:3234"},
            {"http": "http://192.168.0.103:3234"},
        ]
        self.bf = BloomFilter(
            host=setting['redies_host'],
            port=setting['redis_port'],
            key='article_toutiao_test',
            blockNum=1,
            db=0,
        )

    def connect(self):
        connection = pika.BlockingConnection(
            pika.ConnectionParameters(host=setting['rabbitmq_host'],
                                      port=setting['rabbitmq_port']))
        self.channel = connection.channel()
        self.channel.exchange_declare('article', 'direct', durable=True)
        self.channel.queue_declare('wangyi_article', durable=True)
        self.channel.queue_bind(exchange='article', queue='wangyi_article', routing_key='white')

    def start_crawler(self):
        res = requests.get(self.start_url, headers=self.headers)
        res.encoding = 'gbk'
        html = etree.HTML(res.text)
        city_list = html.xpath("//div[@class='city']/a")
        for city in city_list:
            city_name = city.xpath("./text()")[0]
            city_url = city.xpath("./@href")[0]
            city_news_url = city_url + 'news'
            self.city_news(city_name, city_news_url)

    def city_news(self, city_name, city_url):
        while True:
            try:
                proxy = self.proxies[random.randint(0, 9)]
                news_res = requests.get(city_url, headers=self.headers, proxies=proxy)
                break
            except Exception as e:
                log.error(e)
                continue
        news_res.encoding = 'gbk'
        news_html = etree.HTML(news_res.text)
        try:
            cate_list = news_html.xpath("//div[@class='importent-news']")
        except Exception as e:
            log.info(e)
            return
        for cate in cate_list:
            cate_name = cate.xpath("./h2/a/text()")[0]
            news_list = cate.xpath("./ul/li")
            for news in news_list:
                url = news.xpath("./h3/a/@href")[0]
                if self.bf.is_contains(url):  # filter out already-seen detail-page urls
                    log.info('already in bloom_filter: {}'.format(url))
                    continue
                self.bf.insert(url)
                log.info('not in bloom_filter, inserting new url: {}'.format(url))
                try:
                    desc = news.xpath("./div[@class='news-detail']/p/text()")[0]
                except:
                    desc = None
                article = Article('网易')
                article.url = url
                article.desc = desc
                article.city = city_name
                article.category = cate_name
                message = json.dumps(article.to_dict())
                disconnected = True
                while disconnected:
                    try:
                        disconnected = False
                        self.channel.basic_publish(exchange='article',
                                                   routing_key='white',
                                                   body=message,
                                                   properties=pika.BasicProperties(delivery_mode=2))
                        log.info('pushed to queue')
                    except Exception as e:
                        log.error(e)
                        self.connect()
                        disconnected = True
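# Usage note: start_crawler() publishes through self.channel, which is only created in connect(),
# so the RabbitMQ connection must be established first. A minimal, hypothetical run sketch
# (assumes the module-level setting and log objects from this repo are configured):
if __name__ == '__main__':
    wangyi = Wangyi()
    wangyi.connect()        # declares the 'article' exchange and binds 'wangyi_article' with routing_key='white'
    wangyi.start_crawler()  # walks the 163.com city list and publishes new article urls to the queue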
class Toutiao:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
        }
        self.start_url = "http://is.snssdk.com/api/news/feed/v46/?category=news_house"
        self.proxies = [
            {"http": "http://192.168.0.96:3234"},
            {"http": "http://192.168.0.93:3234"},
            {"http": "http://192.168.0.90:3234"},
            {"http": "http://192.168.0.94:3234"},
            {"http": "http://192.168.0.98:3234"},
            {"http": "http://192.168.0.99:3234"},
            {"http": "http://192.168.0.100:3234"},
            {"http": "http://192.168.0.101:3234"},
            {"http": "http://192.168.0.102:3234"},
            {"http": "http://192.168.0.103:3234"},
        ]
        self.bf = BloomFilter(
            host=setting['redies_host'],
            port=setting['redis_port'],
            key='article_toutiao_test',
            blockNum=1,
            db=0,
        )
        self.rabbit = Rabbit(host=setting['rabbitmq_host'], port=setting['rabbitmq_port'])

    def start_crawler(self):
        channel = self.rabbit.get_channel()
        channel.queue_declare(queue='toutiao')
        while True:
            try:
                self.url_list_crawler(channel)
                time.sleep(60)
            except:
                continue

    def url_list_crawler(self, channel):
        while True:
            proxy = self.proxies[random.randint(0, 9)]
            try:
                response = requests.get(self.start_url, headers=self.headers, proxies=proxy)
                url_dict = json.loads(response.text)
                url_list = url_dict["data"]
                break
            except:
                continue
        for url_content in url_list:
            con = url_content["content"]
            try:
                url = re.search('display_url":"(.*?)"', con).group(1)
            except:
                continue
            if re.search('wukong', url):  # skip Wukong Q&A entries
                continue
            if self.bf.is_contains(url):  # filter out already-seen detail-page urls
                log.info('already in bloom_filter: {}'.format(url))
                continue
            self.bf.insert(url)
            log.info('not in bloom_filter, inserting new url: {}'.format(url))
            article = Article('今日头条')
            comment_code = Comment_url()
            try:
                organization_author = re.search(r'\"source\":\"(.*?)\"', con).group(1)
                article.organization_author = organization_author
            except Exception as e:
                log.info('no organization_author')
            title = re.findall('"title":"(.*?)"', con)[1]
            article.title = title
            article.url = url
            article.article_id = re.search(r'group/(\d+)', url).group(1)
            comment_code.group_id = article.article_id
            comment_code.crawler_time = datetime.datetime.utcnow()
            try:
                comment_count = re.search(r'\"comment_count\":(\d+)', con).group(1)
                article.comment_count = comment_count
                comment_code.comment_count = comment_count
            except Exception as e:
                log.info('no comments on article {}'.format(title))
            try:
                title_img = re.search('middle_image.*?"url":"(.*?.webp)', con).group(1)
                new_title_img = qiniufetch(title_img, title_img)
                article.title_img = new_title_img
            except Exception as e:
                log.info('no list image for article {}'.format(title))
            channel.basic_publish(exchange='',
                                  routing_key='toutiao',
                                  body=json.dumps(article.to_dict()))
            log.info('pushed to queue')
class Toutiao:
    def __init__(self):
        self.start_url = 'https://www.toutiao.com/ch/news_house/'
        browser = webdriver.ChromeOptions()
        browser.add_argument('--headless')
        self.driver = webdriver.Chrome(chrome_options=browser)
        self.bf = BloomFilter(
            host=setting['redies_host'],
            port=setting['redis_port'],
            key='article_toutiao_test',
            blockNum=1,
            db=0,
        )
        self.rabbit = Rabbit(host=setting['rabbitmq_host'], port=setting['rabbitmq_port'])

    def start_crawler(self):
        self.driver.get(self.start_url)
        time.sleep(5)
        channel = self.rabbit.get_channel()
        channel.queue_declare(queue='article_test')
        while True:
            self.find_list_info(channel)
            self.driver.refresh()
            time.sleep(20)

    def find_list_info(self, channel):
        article_list = self.driver.find_elements_by_xpath('/html/body/div/div[4]/div[2]/div[2]/div/div/div/ul/li')
        print('len, ', len(article_list))
        for i in article_list:
            if '看到这里' in i.text:  # end-of-feed marker
                print('reached end-of-feed marker')
                break
            try:
                wenda = i.find_element_by_xpath('div/div[1]/div/div[2]/div[1]/div/a[2]').text
            except Exception as e:
                continue
            if '悟空问答' in wenda:
                print('skipping Wukong Q&A entry')
                continue
            article_id = i.get_attribute('group_id')
            # check article_id against the bloom filter
            if self.bf.is_contains(article_id):
                print('already in bloom_filter!')
                continue
            self.bf.insert(article_id)
            print('not in bloom_filter, inserting article_id!')
            article = Article('今日头条')
            try:
                organization_author = i.find_element_by_xpath('div/div[1]/div/div[2]/div[1]/div/a[2]').text.replace('⋅', '')
                article.organization_author = organization_author.strip()
            except Exception as e:
                print('no organization_author')
            title = i.find_element_by_xpath('div/div[1]/div/div[1]/a').text
            article.title = title
            url = i.find_element_by_xpath('div/div[1]/div/div[1]/a').get_attribute('href')
            article.url = url
            # post_time = i.find_element_by_xpath('div/div[1]/div/div[2]/div[1]/span').text
            # article.post_time = post_time
            try:
                comment_str = i.find_element_by_xpath('div/div[1]/div/div[2]/div[1]/div/a[3]').text
                comment_count = int(re.search(r'\d+', comment_str, re.S | re.M).group())
                article.comment_count = comment_count
            except Exception as e:
                print('no comments on this article:', title)
            try:
                title_img = i.find_element_by_xpath('div/div[2]/a/img').get_attribute('src')
                article.title_img = [title_img]
            except Exception as e:
                print('no list image for this article:', title)
            print(article.to_dict())
            # articles not already in the filter are pushed to rabbitmq
            channel.basic_publish(exchange='',
                                  routing_key='article_test',
                                  body=json.dumps(article.to_dict()))
            print('pushed to queue')
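# Usage note: a minimal, hypothetical entry point for the Selenium-based crawler. It assumes a
# chromedriver binary compatible with the installed Chrome is on PATH, plus the module-level
# setting object and the Article/Rabbit helpers used above.
if __name__ == '__main__':
    toutiao = Toutiao()          # __init__ starts headless Chrome
    try:
        toutiao.start_crawler()  # loops forever: scrape the list page, publish to 'article_test', refresh
    finally:
        toutiao.driver.quit()    # make sure the headless browser is shut down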
import pika
import yaml
from itertools import cycle
from pymongo import MongoClient

from article_img.qiniu_fetch import qiniufetch
# BloomFilter, LogHandler (and the Article/Rabbit helpers used elsewhere) are assumed to come
# from the project's own utility modules.

setting = yaml.load(open('config_local.yaml'))

m = MongoClient(setting['mongo_config']['config_host'],
                setting['mongo_config']['port'])
m.admin.authenticate(setting['mongo_config']['user_name'],
                     setting['mongo_config']['password'])
collection = m[setting['mongo_config']['config_db']][setting['mongo_config']['coll_list']]

bf = BloomFilter(
    host=setting['redies_host'],
    port=setting['redis_port'],
    key='article_test',
    blockNum=1,
    db=0,
)
log = LogHandler(__name__)

connect = pika.BlockingConnection(
    pika.ConnectionParameters(
        host=setting['rabbitmq_host'],
        port=setting['rabbitmq_port'],
    ))


class CrawlerArticleListUrl:
    def __init__(self):
class Fenghuang:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36",
            "Cookie": "TEMP_USER_ID=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1aWQiOiI1YWYxNDk5OTY4ZDYzIiwidGltZSI6MTUyNTc2MjQ1N30.yT2cDnBOA7Zj9lFxI52f064z6zI4zxPv78HWjvXvwyc; city_redirected=2; prov=cn021; city=021; weather_city=sh; region_ip=116.247.70.x; region_ver=1.2; userid=1525762465015_d0klfz8748; Hm_lvt_2618c9646a4a7be2e5f93653be3d5429=1525762465; Hm_lpvt_2618c9646a4a7be2e5f93653be3d5429=1525762465; ifh_site=3066%2C"
        }
        self.start_url = "http://sh.house.ifeng.com/news/wap"
        self.proxies = [
            {"http": "http://192.168.0.96:3234"},
            {"http": "http://192.168.0.93:3234"},
            {"http": "http://192.168.0.90:3234"},
            {"http": "http://192.168.0.94:3234"},
            {"http": "http://192.168.0.98:3234"},
            {"http": "http://192.168.0.99:3234"},
            {"http": "http://192.168.0.100:3234"},
            {"http": "http://192.168.0.101:3234"},
            {"http": "http://192.168.0.102:3234"},
            {"http": "http://192.168.0.103:3234"},
        ]
        self.bf = BloomFilter(
            host=setting['redies_host'],
            port=setting['redis_port'],
            key='article_toutiao_test',
            blockNum=1,
            db=0,
        )
        # self.rabbit = Rabbit(host=setting['rabbitmq_host'], port=setting['rabbitmq_port'])

    def connect(self):
        connection = pika.BlockingConnection(
            pika.ConnectionParameters(host=setting['rabbitmq_host'],
                                      port=setting['rabbitmq_port']))
        self.channel = connection.channel()
        self.channel.exchange_declare('article', 'direct', durable=True)
        self.channel.queue_declare('fenghuang_article', durable=True)
        self.channel.queue_bind(exchange='article', queue='fenghuang_article', routing_key='black')

    def start_crawler(self):
        res = requests.get('http://house.ifeng.com/news', headers=self.headers)
        res.encoding = 'utf-8'
        html = etree.HTML(res.text)
        city_list = html.xpath("//ul[@id='city_hot1']/li/a")
        for city in city_list:
            city_name = city.xpath("./text()")[0]
            city_url = city.xpath("./@href")[0]
            city_id = city.xpath("./@siteid")[0]
            news_url = city_url + '/news'
            news_res = requests.get(news_url, headers=self.headers)
            news_res.encoding = 'utf-8'
            news_html = etree.HTML(news_res.text)
            cate_id_list = news_html.xpath("//ul[@id='newsNavScroll']/li[@cateid]")
            self.article_url_crawler(city_name, city_id, news_url, cate_id_list)

    def article_url_crawler(self, city_name, city_id, news_url, cate_id_list):
        post_url = news_url + '/wap'
        count = 1
        while True:
            for i in cate_id_list:
                cate_id = i.xpath("./@cateid")[0]
                cate_name = i.xpath("./a/text()")[0]
                formdata = {
                    'pageid': count,
                    'cateid': cate_id,
                    'siteid': city_id,
                    'type': 2
                }
                # time.sleep(10)
                while True:
                    proxy = self.proxies[random.randint(0, 9)]
                    try:
                        res = requests.post(post_url, data=formdata, headers=self.headers, proxies=proxy)
                        json_dict = json.loads(res.text)
                        break
                    except Exception as e:
                        log.error(e)
                        continue
                if len(json_dict['data']['newslist']) == 0:
                    count = 1
                    continue
                news_list = json_dict['data']['newslist']
                for news_info in news_list:
                    try:
                        desc = news_info['desc']
                        url = news_info['url']
                        keywords = news_info['keywords'].values()
                        keywords_list = []
                        for keyword in keywords:
                            keywords_list.append(keyword)
                    except:
                        continue
                    title_img_url = news_info['pic_url']
                    if self.bf.is_contains(url):  # filter out already-seen detail-page urls
                        log.info('already in bloom_filter: {}'.format(url))
                        continue
                    self.bf.insert(url)
                    log.info('not in bloom_filter, inserting new url: {}'.format(url))
                    new_title_img = qiniufetch(title_img_url, title_img_url)
                    article = Article('凤凰网')
                    article.url = url
                    article.desc = desc
                    article.tag = str(keywords_list)
                    article.title_img = new_title_img
                    article.city = city_name
                    article.category = cate_name
                    message = json.dumps(article.to_dict())
                    disconnected = True
                    while disconnected:
                        try:
                            disconnected = False
                            self.channel.basic_publish(exchange='article',
                                                       routing_key='black',
                                                       body=message,
                                                       properties=pika.BasicProperties(delivery_mode=2))
                            log.info('pushed to queue')
                        except Exception as e:
                            log.error(e)
                            self.connect()
                            disconnected = True
            count += 1
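# Both Wangyi and Fenghuang publish to the durable direct exchange 'article' (routing keys
# 'white'/'black', queues 'wangyi_article'/'fenghuang_article'). Below is a hypothetical
# consumer sketch for the 'fenghuang_article' queue; it assumes pika >= 1.0 and the same
# config_local.yaml used by the crawlers, and the callback name and print output are
# illustrative only.
import json

import pika
import yaml

setting = yaml.safe_load(open('config_local.yaml'))  # same config file as the crawlers above

connection = pika.BlockingConnection(
    pika.ConnectionParameters(host=setting['rabbitmq_host'], port=setting['rabbitmq_port']))
channel = connection.channel()
channel.queue_declare('fenghuang_article', durable=True)  # idempotent; matches the producer's declaration


def handle_article(ch, method, properties, body):
    # body is the json.dumps(article.to_dict()) string published by the crawler
    article_dict = json.loads(body)
    print(article_dict.get('url'))
    ch.basic_ack(delivery_tag=method.delivery_tag)


channel.basic_consume(queue='fenghuang_article', on_message_callback=handle_article)
channel.start_consuming()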