def get_details(self):
    """
    Fetch self.page_url and publish the HTML plus the analyzer config to
    the 'hilder_gv' RabbitMQ queue.

    :return: list of URLs for the current page when self.current_url_rule
             is truthy, otherwise None.
    """
    r = Rabbit(host=setting['rabbitmq_host'], port=setting['rabbitmq_port'])
    channel = r.get_channel()
    channel.queue_declare(queue='hilder_gv')
    try:
        html_str = do_request(self.page_url, self.request_type, self.headers, self.encode)
        body = {
            'html': html_str,
            'analyzer_type': self.analyzer_type,
            'analyzer_rules_dict': self.analyzer_rules_dict,
        }
        # Publish the page payload as JSON.
        channel.basic_publish(exchange='',
                              routing_key='hilder_gv',
                              body=json.dumps(body))
        print('已经放入队列')
        if self.current_url_rule:
            return self.get_current_page_url()
    except Exception as e:
        print(self.page_url, e)
    finally:
        # Close on both success and failure: the original leaked the
        # connection whenever do_request raised.
        r.connection.close()
def asyn_message(_url):
    """
    Query the AMap-style API at _url and route the JSON result.

    Responses with fewer than 50 records are published whole to the
    'amap_result_json' queue; larger result sets instead enqueue the URLs
    of the remaining pages (50 records per page) on 'amap_page_url'.
    """
    try:
        result = requests.get(_url, timeout=5)
        print(result.text, _url)
    except Exception as e:
        log.info('request error,url={}'.format(_url))
        return
    data = result.json()  # parse once instead of re-parsing per field
    status = data['status']
    # Was `status is '1'`: identity comparison on strings depends on
    # interning and is not a reliable equality test.
    if status == '1':
        count = int(data['count'])
        if count != 0:
            if count < 50:
                print('count < 50')
                channel_result = connection_result.channel()
                channel_result.queue_declare(queue='amap_result_json')
                channel_result.basic_publish(exchange='',
                                             routing_key='amap_result_json',
                                             body=json.dumps(data))
                channel_result.close()
            else:
                print('count > 50')
                r = Rabbit('192.168.0.192', 5673)
                channel_page = r.get_channel()
                channel_page.queue_declare(queue='amap_page_url')
                # Page 1 is this response; enqueue pages 2..ceil(count/50).
                # The original used int(count/50 + 0.5) (round-to-nearest),
                # which enqueued nothing for e.g. count == 51.
                total_pages = -(-count // 50)  # ceiling division
                for i in range(1, total_pages):
                    channel_page.basic_publish(
                        exchange='',
                        routing_key='amap_page_url',
                        body=result.url + '&page=' + str(i + 1),
                    )
                    print('分页 的url放入')
                channel_page.close()
    else:
        log.info('url={},result={}'.format(_url, result.text))
from lib.mongo import Mongo
import datetime
import yaml

# Load configuration. safe_load avoids the deprecated Loader-less
# yaml.load (which can construct arbitrary Python objects), and the
# `with` block closes the file handle the original left open.
with open('config.yaml') as _cfg_file:
    setting = yaml.safe_load(_cfg_file)

# Connect to MongoDB.
m = Mongo(setting['comm_price']['host'], setting['comm_price']['port'])
fgg = m.connect[setting['comm_price']['db']]
coll = fgg[setting['comm_price']['fgg_coll']]
coll_login = fgg[setting['fgg']['user_info']]

# Connect to RabbitMQ.
r = Rabbit('192.168.0.235', 5673)
channel = r.get_channel()

# Host:port pool — presumably crawl proxies; TODO confirm against users of IPS.
IPS = ["192.168.0.90:4234", "192.168.0.93:4234", "192.168.0.94:4234",
       "192.168.0.96:4234", "192.168.0.98:4234", "192.168.0.99:4234",
       "192.168.0.100:4234", "192.168.0.101:4234", "192.168.0.102:4234",
       "192.168.0.103:4234"]

login = Login()
class Toutiao:
    """Crawler for Toutiao's house-news feed API (requests-based)."""

    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
        }
        self.start_url = "http://is.snssdk.com/api/news/feed/v46/?category=news_house"
        # Pool of HTTP proxies; one is chosen at random per request.
        self.proxies = [
            {"http": "http://192.168.0.96:3234"},
            {"http": "http://192.168.0.93:3234"},
            {"http": "http://192.168.0.90:3234"},
            {"http": "http://192.168.0.94:3234"},
            {"http": "http://192.168.0.98:3234"},
            {"http": "http://192.168.0.99:3234"},
            {"http": "http://192.168.0.100:3234"},
            {"http": "http://192.168.0.101:3234"},
            {"http": "http://192.168.0.102:3234"},
            {"http": "http://192.168.0.103:3234"},
        ]
        # Redis-backed bloom filter de-duplicating article URLs across runs.
        self.bf = BloomFilter(
            host=setting['redies_host'],
            port=setting['redis_port'],
            key='article_toutiao_test',
            blockNum=1,
            db=0,
        )
        self.rabbit = Rabbit(host=setting['rabbitmq_host'],
                             port=setting['rabbitmq_port'])

    def start_crawler(self):
        """Poll the feed forever, publishing new articles to the 'toutiao' queue."""
        channel = self.rabbit.get_channel()
        channel.queue_declare(queue='toutiao')
        while True:
            try:
                self.url_list_crawler(channel)
                time.sleep(60)
            # Was a bare `except:`, which also swallowed KeyboardInterrupt/
            # SystemExit and made the loop unstoppable.
            except Exception:
                continue

    def url_list_crawler(self, channel):
        """Fetch one feed page, skip seen/irrelevant entries, publish the rest."""
        # Retry with random proxies until the feed endpoint answers.
        while True:
            proxy = random.choice(self.proxies)
            try:
                response = requests.get(self.start_url, headers=self.headers, proxies=proxy)
                url_dict = json.loads(response.text)
                url_list = url_dict["data"]
                break
            except Exception:  # network or JSON error: try another proxy
                continue
        for url_content in url_list:
            con = url_content["content"]
            try:
                url = re.search('display_url":"(.*?)"', con).group(1)
            except Exception:  # entry has no display_url
                continue
            if re.search('wukong', url):
                # Skip Wukong Q&A entries.
                continue
            if self.bf.is_contains(url):
                # Detail URL already crawled.
                log.info('bloom_filter已经存在{}'.format(url))
                continue
            self.bf.insert(url)
            log.info('bloom_filter不存在,插入新的url:{}'.format(url))
            article = Article('今日头条')
            comment_code = Comment_url()
            try:
                organization_author = re.search(
                    '\\"source\\":\\"(.*?)\\"', con).group(1)
                article.organization_author = organization_author
            except Exception as e:
                log.info('没有organization_author')
            title = re.findall('"title":"(.*?)"', con)[1]
            article.title = title
            article.url = url
            article.article_id = re.search('group/(\d+)', url).group(1)
            comment_code.group_id = article.article_id
            comment_code.crawler_time = datetime.datetime.utcnow()
            try:
                comment_count = re.search('\\"comment_count\\":(\d+)', con).group(1)
                article.comment_count = comment_count
                comment_code.comment_count = comment_count
            except Exception as e:
                log.info('{}这篇文章没有评论'.format(title))
            try:
                title_img = re.search(
                    'middle_image.*?"url":"(.*?.webp)', con).group(1)
                new_title_img = qiniufetch(title_img, title_img)
                article.title_img = new_title_img
            except Exception as e:
                log.info('{}这篇文章没有list图片:'.format(title))
            channel.basic_publish(exchange='',
                                  routing_key='toutiao',
                                  body=json.dumps(article.to_dict()))
            log.info('已经放入队列')
class Toutiao:
    """Crawler for Toutiao's house-news channel, driven by headless Chrome."""

    def __init__(self):
        self.start_url = 'https://www.toutiao.com/ch/news_house/'
        opts = webdriver.ChromeOptions()
        opts.add_argument('--headless')
        self.driver = webdriver.Chrome(chrome_options=opts)
        # Redis bloom filter: de-duplicates articles by group_id.
        self.bf = BloomFilter(host=setting['redies_host'],
                              port=setting['redis_port'],
                              key='article_toutiao_test',
                              blockNum=1,
                              db=0)
        self.rabbit = Rabbit(host=setting['rabbitmq_host'],
                             port=setting['rabbitmq_port'])

    def start_crawler(self):
        """Open the channel page, then scrape and refresh in an endless loop."""
        self.driver.get(self.start_url)
        time.sleep(5)
        channel = self.rabbit.get_channel()
        channel.queue_declare(queue='article_test')
        while True:
            self.find_list_info(channel)
            self.driver.refresh()
            time.sleep(20)

    def find_list_info(self, channel):
        """Scrape the visible article list and publish unseen entries to 'article_test'."""
        article_list = self.driver.find_elements_by_xpath(
            '/html/body/div/div[4]/div[2]/div[2]/div/div/div/ul/li')
        print('len, ', len(article_list))
        for item in article_list:
            if '看到这里' in item.text:
                print('看到这里')
                break
            try:
                source_label = item.find_element_by_xpath(
                    'div/div[1]/div/div[2]/div[1]/div/a[2]').text
            except Exception as e:
                continue
            if '悟空问答' in source_label:
                print('悟空问答')
                continue
            article_id = item.get_attribute('group_id')
            # Bloom-filter gate on article_id.
            if self.bf.is_contains(article_id):
                print('bloom_filter已经存在!')
                continue
            self.bf.insert(article_id)
            print('bloom_filter不存在,插入article_id!')
            article = Article('今日头条')
            try:
                author_text = item.find_element_by_xpath(
                    'div/div[1]/div/div[2]/div[1]/div/a[2]').text.replace('⋅', '')
                article.organization_author = author_text.strip()
            except Exception as e:
                print('没有organization_author')
            headline = item.find_element_by_xpath('div/div[1]/div/div[1]/a')
            article.title = headline.text
            article.url = headline.get_attribute('href')
            try:
                comment_str = item.find_element_by_xpath(
                    'div/div[1]/div/div[2]/div[1]/div/a[3]').text
                article.comment_count = int(
                    re.search('\d+', comment_str, re.S | re.M).group())
            except Exception as e:
                print('这篇文章没有评论', article.title)
            try:
                img_src = item.find_element_by_xpath(
                    'div/div[2]/a/img').get_attribute('src')
                article.title_img = [img_src]
            except Exception as e:
                print('这篇文章没有list图片:', article.title)
            print(article.to_dict())
            # Articles that passed the filter go to RabbitMQ.
            channel.basic_publish(exchange='',
                                  routing_key='article_test',
                                  body=json.dumps(article.to_dict()))
            print('已经放入队列')
def connect_rabbit(self):
    """Open a Rabbit connection using this object's host/port and return a channel."""
    rabbit = Rabbit(self.r_host, self.r_port)
    return rabbit.get_channel()