Example #1
0
    def get_details(self):
        """
        Fetch the page and publish it to the 'hilder_gv' RabbitMQ queue.

        If ``self.current_url_rule`` is set, also return the list of URLs
        extracted from the current page; otherwise return None.

        :return: list of page URLs, or None
        """
        r = Rabbit(host=setting['rabbitmq_host'], port=setting['rabbitmq_port'])
        channel = r.get_channel()
        channel.queue_declare(queue='hilder_gv')

        try:
            html_str = do_request(self.page_url, self.request_type, self.headers, self.encode)
            body = {'html': html_str,
                    'analyzer_type': self.analyzer_type,
                    'analyzer_rules_dict': self.analyzer_rules_dict,
                    }
            # Publish the raw HTML plus parsing rules for downstream analyzers.
            channel.basic_publish(exchange='',
                                  routing_key='hilder_gv',
                                  body=json.dumps(body))
            print('已经放入队列')
            if self.current_url_rule:
                current_page_list_url = self.get_current_page_url()
                return current_page_list_url
        except Exception as e:
            print(self.page_url, e)
        finally:
            # BUG FIX: the original only closed the connection on the success
            # path, leaking it whenever do_request/basic_publish raised.
            r.connection.close()
def asyn_message(_url):
    """
    Query the map API at *_url* and route its JSON result to RabbitMQ.

    Responses with fewer than 50 records are published whole to the
    'amap_result_json' queue; larger responses additionally push paging
    URLs (page 2..N) onto the 'amap_page_url' queue so other workers can
    fetch the remaining pages.

    :param _url: full API request URL (str)
    :return: None
    """
    try:
        result = requests.get(_url, timeout=5)
        print(result.text, _url)
    except Exception:
        log.info('request error,url={}'.format(_url))
        return

    # Parse the JSON body once instead of re-parsing on every access.
    payload = result.json()
    status = payload['status']

    # BUG FIX: the original tested `status is '1'` — an identity comparison
    # that only ever worked via CPython string interning; use equality.
    if status == '1':
        count = int(payload['count'])
        if count != 0:
            if count < 50:
                print('count < 50')
                channel_result = connection_result.channel()

                channel_result.queue_declare(queue='amap_result_json')
                channel_result.basic_publish(exchange='',
                                             routing_key='amap_result_json',
                                             body=json.dumps(payload))
                channel_result.close()
            else:
                print('count > 50')

                r = Rabbit('192.168.0.192', 5673)
                channel_page = r.get_channel()
                channel_page.queue_declare(queue='amap_page_url')
                # Page 1 was already fetched above, so publish URLs for
                # pages 2..N (N = count/50 rounded to nearest integer).
                for i in range(1, int(count / 50 + 0.5)):
                    channel_page.basic_publish(
                        exchange='',
                        routing_key='amap_page_url',
                        body=result.url + '&page=' + str(i + 1),
                    )
                    print('分页 的url放入')
                channel_page.close()
    else:
        log.info('url={},result={}'.format(_url, result.text))
Example #3
0
from lib.mongo import Mongo
import datetime
import yaml

# Load crawler settings.
# BUG FIX: the original `yaml.load(open('config.yaml'))` leaked the file
# handle and called yaml.load without an explicit Loader (deprecated and
# unsafe); safe_load builds only plain Python objects, which is all a
# config file needs.
with open('config.yaml') as _config_file:
    setting = yaml.safe_load(_config_file)

# Connect to MongoDB: fgg database plus the price and login collections.
m = Mongo(setting['comm_price']['host'], setting['comm_price']['port'])
fgg = m.connect[setting['comm_price']['db']]
coll = fgg[setting['comm_price']['fgg_coll']]

coll_login = fgg[setting['fgg']['user_info']]

# Connect to RabbitMQ.
r = Rabbit('192.168.0.235', 5673)
channel = r.get_channel()

# Proxy endpoints used by the crawler workers.
IPS = ["192.168.0.90:4234",
       "192.168.0.93:4234",
       "192.168.0.94:4234",
       "192.168.0.96:4234",
       "192.168.0.98:4234",
       "192.168.0.99:4234",
       "192.168.0.100:4234",
       "192.168.0.101:4234",
       "192.168.0.102:4234",
       "192.168.0.103:4234"]

login = Login()

Example #4
0
class Toutiao:
    """Crawl the Toutiao house-news API feed and publish parsed articles
    to the 'toutiao' RabbitMQ queue, de-duplicating URLs via a Redis-backed
    Bloom filter."""

    def __init__(self):
        self.headers = {
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
        }
        self.start_url = "http://is.snssdk.com/api/news/feed/v46/?category=news_house"
        # Pool of HTTP proxies; one is picked at random per feed request.
        self.proxies = [
            {
                "http": "http://192.168.0.96:3234"
            },
            {
                "http": "http://192.168.0.93:3234"
            },
            {
                "http": "http://192.168.0.90:3234"
            },
            {
                "http": "http://192.168.0.94:3234"
            },
            {
                "http": "http://192.168.0.98:3234"
            },
            {
                "http": "http://192.168.0.99:3234"
            },
            {
                "http": "http://192.168.0.100:3234"
            },
            {
                "http": "http://192.168.0.101:3234"
            },
            {
                "http": "http://192.168.0.102:3234"
            },
            {
                "http": "http://192.168.0.103:3234"
            },
        ]
        # Bloom filter used to skip detail URLs that were already crawled.
        self.bf = BloomFilter(
            host=setting['redies_host'],
            port=setting['redis_port'],
            key='article_toutiao_test',
            blockNum=1,
            db=0,
        )
        self.rabbit = Rabbit(host=setting['rabbitmq_host'],
                             port=setting['rabbitmq_port'])

    def start_crawler(self):
        """Declare the 'toutiao' queue and poll the feed every 60 seconds."""
        channel = self.rabbit.get_channel()
        channel.queue_declare(queue='toutiao')
        while True:
            try:
                self.url_list_crawler(channel)
                time.sleep(60)
            # BUG FIX: was a bare `except:`, which also swallowed
            # SystemExit/KeyboardInterrupt and made the worker unkillable.
            except Exception:
                continue

    def url_list_crawler(self, channel):
        """Fetch one feed page through a random proxy, filter out seen or
        unwanted URLs, extract article metadata and publish each article."""
        while True:
            # BUG FIX: the original indexed self.proxies[random.randint(0, 9)],
            # which raises IndexError as soon as the proxy list shrinks;
            # random.choice adapts to any list length.
            proxy = random.choice(self.proxies)
            try:
                response = requests.get(self.start_url,
                                        headers=self.headers,
                                        proxies=proxy)
                url_dict = json.loads(response.text)
                url_list = url_dict["data"]
                break
            # Narrowed from a bare `except:`; retry with another proxy.
            except Exception:
                continue
        for url_content in url_list:
            con = url_content["content"]
            try:
                url = re.search('display_url":"(.*?)"', con).group(1)
            except Exception:
                # Feed entry carries no detail URL; skip it.
                continue
            if re.search('wukong', url):
                # Skip Wukong Q&A entries.
                continue
            if self.bf.is_contains(url):  # detail URL already crawled
                log.info('bloom_filter已经存在{}'.format(url))
                continue
            self.bf.insert(url)
            log.info('bloom_filter不存在,插入新的url:{}'.format(url))
            article = Article('今日头条')
            comment_code = Comment_url()
            try:
                organization_author = re.search(
                    '\\"source\\":\\"(.*?)\\"', con).group(1)
                article.organization_author = organization_author
            except Exception:
                log.info('没有organization_author')
            # Index 1: the first "title" match is the feed card, the second
            # is the article itself — preserved from the original.
            title = re.findall('"title":"(.*?)"', con)[1]
            article.title = title
            article.url = url
            article.article_id = re.search('group/(\d+)', url).group(1)
            comment_code.group_id = article.article_id
            comment_code.crawler_time = datetime.datetime.utcnow()
            try:
                comment_count = re.search('\\"comment_count\\":(\d+)',
                                          con).group(1)
                article.comment_count = comment_count
                comment_code.comment_count = comment_count
            except Exception:
                log.info('{}这篇文章没有评论'.format(title))
            try:
                title_img = re.search(
                    'middle_image.*?"url":"(.*?.webp)', con).group(1)
                # Re-host the list image on Qiniu before storing its URL.
                new_title_img = qiniufetch(title_img, title_img)
                article.title_img = new_title_img
            except Exception:
                log.info('{}这篇文章没有list图片:'.format(title))

            channel.basic_publish(exchange='',
                                  routing_key='toutiao',
                                  body=json.dumps(article.to_dict()))
            log.info('已经放入队列')
Example #5
0
class Toutiao:
    """Scrape the Toutiao house-news channel with a headless Chrome browser
    and publish new articles to the 'article_test' RabbitMQ queue,
    de-duplicating by article id via a Redis-backed Bloom filter."""

    def __init__(self):
        self.start_url = 'https://www.toutiao.com/ch/news_house/'

        # Run Chrome headless so the crawler works on servers without a display.
        browser = webdriver.ChromeOptions()
        browser.add_argument('--headless')

        self.driver = webdriver.Chrome(chrome_options=browser)

        # Bloom filter keyed on article ids already published.
        self.bf = BloomFilter(host=setting['redies_host'],
                              port=setting['redis_port'],
                              key='article_toutiao_test',
                              blockNum=1,
                              db=0, )
        self.rabbit = Rabbit(host=setting['rabbitmq_host'], port=setting['rabbitmq_port'], )

    def start_crawler(self):
        """Open the channel page, then scrape/refresh it in an endless loop
        (20 s between refreshes). Never returns."""
        self.driver.get(self.start_url)
        time.sleep(5)
        channel = self.rabbit.get_channel()
        channel.queue_declare(queue='article_test')
        while True:
            self.find_list_info(channel)
            self.driver.refresh()
            time.sleep(20)

    def find_list_info(self, channel):
        """Walk the article list items currently in the DOM, extract metadata
        from each unseen one and publish it to the queue.

        :param channel: open RabbitMQ channel with 'article_test' declared
        """
        article_list = self.driver.find_elements_by_xpath('/html/body/div/div[4]/div[2]/div[2]/div/div/div/ul/li')
        print('len, ', len(article_list))
        for i in article_list:
            # "看到这里" marks the end-of-new-content divider — stop here.
            if '看到这里' in i.text:
                print('看到这里')
                break
            try:
                wenda = i.find_element_by_xpath('div/div[1]/div/div[2]/div[1]/div/a[2]').text
            except Exception as e:
                continue
            # Skip Wukong Q&A entries — they are not articles.
            if '悟空问答' in wenda:
                print('悟空问答')
                continue
            article_id = i.get_attribute('group_id')

            # Check article_id against the Bloom filter.
            if self.bf.is_contains(article_id):
                print('bloom_filter已经存在!')
                continue
            else:
                self.bf.insert(article_id)
                print('bloom_filter不存在,插入article_id!')

                article = Article('今日头条')
                try:
                    # Author link text carries a trailing '⋅' separator; strip it.
                    organization_author = i.find_element_by_xpath('div/div[1]/div/div[2]/div[1]/div/a[2]').text.replace(
                        '⋅', '')
                    article.organization_author = organization_author.strip()
                except Exception as e:
                    print('没有organization_author')
                title = i.find_element_by_xpath('div/div[1]/div/div[1]/a').text
                article.title = title
                url = i.find_element_by_xpath('div/div[1]/div/div[1]/a').get_attribute('href')
                article.url = url
                # post_time = i.find_element_by_xpath('div/div[1]/div/div[2]/div[1]/span').text
                # article.post_time = post_time

                try:
                    # Comment link text looks like "123评论"; pull the number out.
                    comment_str = i.find_element_by_xpath('div/div[1]/div/div[2]/div[1]/div/a[3]').text
                    comment_count = int(re.search('\d+', comment_str, re.S | re.M).group())
                    article.comment_count = comment_count
                except Exception as e:
                    print('这篇文章没有评论', title)

                try:
                    title_img = i.find_element_by_xpath('div/div[2]/a/img').get_attribute('src')
                    article.title_img = [title_img]
                except Exception as e:
                    print('这篇文章没有list图片:', title)

                print(article.to_dict())
                # Articles that passed the filter are pushed to RabbitMQ.

                channel.basic_publish(exchange='',
                                      routing_key='article_test',
                                      body=json.dumps(article.to_dict()))
                print('已经放入队列')
Example #6
0
 def connect_rabbit(self):
     """Open and return a fresh channel on a new Rabbit connection."""
     rabbit_conn = Rabbit(self.r_host, self.r_port)
     channel = rabbit_conn.get_channel()
     return channel