Code Example #1
    def get_details(self):
        """
        把网页放入队列
        如果有list_page_url,返回url列表
        :return:
        """
        r = Rabbit(host=setting['rabbitmq_host'], port=setting['rabbitmq_port'])
        channel = r.get_channel()
        channel.queue_declare(queue='hilder_gv')

        try:
            html_str = do_request(self.page_url, self.request_type, self.headers, self.encode)
            body = {'html': html_str,
                    'analyzer_type': self.analyzer_type,
                    'analyzer_rules_dict': self.analyzer_rules_dict,
                    }
            # enqueue json.dumps(body)
            channel.basic_publish(exchange='',
                                  routing_key='hilder_gv',
                                  body=json.dumps(body))
            r.connection.close()
            # print(json.dumps(body))
            print('enqueued')
            if self.current_url_rule:
                current_page_list_url = self.get_current_page_url()
                return current_page_list_url
        except Exception as e:
            print(self.page_url, e)
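The snippet above relies on a do_request helper that is not shown here. A minimal sketch of what it might look like, assuming it wraps requests and honors the request type, headers, and encoding (the names and behavior are assumptions, not the project's actual helper):

import requests

def do_request(page_url, request_type, headers, encode):
    # hypothetical helper: GET or POST the url and return the decoded body
    if request_type == 'post':
        resp = requests.post(page_url, headers=headers, timeout=10)
    else:
        resp = requests.get(page_url, headers=headers, timeout=10)
    resp.encoding = encode
    return resp.text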
Code Example #2
    def __init__(self):
        self.start_url = 'https://www.toutiao.com/ch/news_house/'

        browser = webdriver.ChromeOptions()
        browser.add_argument('--headless')

        self.driver = webdriver.Chrome(chrome_options=browser)

        self.bf = BloomFilter(host=setting['redies_host'],
                              port=setting['redis_port'],
                              key='article_toutiao_test',
                              blockNum=1,
                              db=0, )
        self.rabbit = Rabbit(host=setting['rabbitmq_host'], port=setting['rabbitmq_port'], )
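A note on the webdriver.Chrome call above: Selenium 4 removed the chrome_options keyword in favor of options (Selenium 3.8+ already emitted a deprecation warning for it). A minimal equivalent under a current Selenium, assuming nothing else changes:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)  # Selenium 4 style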
Code Example #3
def consume_all_url():
    rabbit = Rabbit(host='192.168.0.192', port=5673)
    connect_result = rabbit.connection
    channel = connect_result.channel()
    channel.basic_qos(prefetch_count=1)
    channel.basic_consume(callback, queue='amap_test')
    channel.start_consuming()
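This consumer assumes a module-level callback defined elsewhere. With the pika 0.x-style basic_consume used here, the callback receives the channel, delivery metadata, message properties, and the raw body. A minimal sketch (the body handling is an assumption):

def callback(ch, method, properties, body):
    # decode the queued message and acknowledge it
    message = body.decode('utf8')
    print('consumed from amap_test:', message)
    ch.basic_ack(delivery_tag=method.delivery_tag)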
Code Example #4
File: split_url.py Project: BHBSA/hilder_other
 def __init__(self):
     self.proxy = [
         {
             "https": "http://192.168.0.96:4234"
         },
         {
             "https": "http://192.168.0.93:4234"
         },
         {
             "https": "http://192.168.0.90:4234"
         },
         {
             "https": "http://192.168.0.94:4234"
         },
         {
             "https": "http://192.168.0.98:4234"
         },
         {
             "https": "http://192.168.0.99:4234"
         },
         {
             "https": "http://192.168.0.100:4234"
         },
         {
             "https": "http://192.168.0.101:4234"
         },
         {
             "https": "http://192.168.0.102:4234"
         },
         {
             "https": "http://192.168.0.103:4234"
         },
     ]
     self.rabbit_connection = Rabbit(setting['CEIC']['rabbit']['host'],
                                     setting['CEIC']['rabbit']['port'])
Code Example #5
File: toutiao_api.py Project: BHBSA/hilder_articles
 def __init__(self):
     self.headers = {
         "User-Agent":
         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
     }
     self.start_url = "http://is.snssdk.com/api/news/feed/v46/?category=news_house"
     self.proxies = [
         {
             "http": "http://192.168.0.96:3234"
         },
         {
             "http": "http://192.168.0.93:3234"
         },
         {
             "http": "http://192.168.0.90:3234"
         },
         {
             "http": "http://192.168.0.94:3234"
         },
         {
             "http": "http://192.168.0.98:3234"
         },
         {
             "http": "http://192.168.0.99:3234"
         },
         {
             "http": "http://192.168.0.100:3234"
         },
         {
             "http": "http://192.168.0.101:3234"
         },
         {
             "http": "http://192.168.0.102:3234"
         },
         {
             "http": "http://192.168.0.103:3234"
         },
     ]
     self.bf = BloomFilter(
         host=setting['redies_host'],
         port=setting['redis_port'],
         key='article_toutiao_test',
         blockNum=1,
         db=0,
     )
     self.rabbit = Rabbit(host=setting['rabbitmq_host'],
                          port=setting['rabbitmq_port'])
Code Example #6
def consume_all_url(api_key):
    rabbit = Rabbit(host=host, port=port)
    connection_result = rabbit.connection
    # connection = pika.BlockingConnection(pika.ConnectionParameters(host=host, port=port))
    channel = connection_result.channel()
    # channel.basic_qos(prefetch_count=1)
    channel.basic_consume(callback, queue='amap_url_list', consumer_tag=api_key)
    channel.start_consuming()
Code Example #7
def asyn_message(_url):
    try:
        result = requests.get(_url, timeout=5)
        print(result.text, _url)
    except Exception as e:
        log.info('request error,url={}'.format(_url))
        return

    status = result.json()['status']

    if status == '1':
        count = int(result.json()['count'])
        if count != 0:
            if count < 50:
                print('count < 50')
                channel_result = connection_result.channel()

                channel_result.queue_declare(queue='amap_result_json')
                channel_result.basic_publish(exchange='',
                                             routing_key='amap_result_json',
                                             body=json.dumps(result.json()))
                channel_result.close()
            else:
                print('count > 50')

                r = Rabbit('192.168.0.192', 5673)
                channel_page = r.get_channel()
                # connection_page = pika.BlockingConnection(
                #     pika.ConnectionParameters(host='192.168.0.192', port=5673))
                # channel_page = connection_page.channel()
                channel_page.queue_declare(queue='amap_page_url')
                for i in range(1, int(count / 50 + 0.5)):
                    channel_page.basic_publish(
                        exchange='',
                        routing_key='amap_page_url',
                        body=result.url + '&page=' + str(i + 1),
                    )
                    print('enqueued the paging url')
                channel_page.close()
    else:
        log.info('url={},result={}'.format(_url, result.text))
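A note on the paging arithmetic above: int(count / 50 + 0.5) rounds to the nearest page count, so it undercounts when the last page is less than half full (for example, count=120 gives 2 instead of 3). Ceiling division gives the exact number of pages; a small illustration using the standard library:

import math

count = 120                      # e.g. 120 results at 50 per page
pages = math.ceil(count / 50)    # 3 pages
# page 1 was the original request, so enqueue pages 2..pages
for i in range(1, pages):
    print('&page=' + str(i + 1))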
Code Example #8
        "https": "http://192.168.0.103:4234"
    },
]

m = Mongo('192.168.0.235')
connect = m.connect

setting = yaml.load(open('config.yaml'))

db_name = setting['CEIC']['mongo']['db']
State_indicators_name = setting['CEIC']['mongo']['State_indicators']
State_indicators_details_name = setting['CEIC']['mongo'][
    'State_indicators_details']
log = LogHandler('ceic_detail')

r = Rabbit(setting['CEIC']['rabbit']['host'],
           setting['CEIC']['rabbit']['port'])


class Detail:
    def create_date(
        self,
        indexFrequency,
        start_year,
        start_mouth,
        end_year,
    ):
        """

        :return: ['from=2016-1&to=2017-1', 'from=2016-1&to=2017-1', 'from=2016-1&to=2017-1', 'from=2016-1&to=2017-1',]
        """
        """
Code Example #9
"""
import json
from lib.rabbitmq import Rabbit
from functools import partial
import sys
import yaml
import pika
import trip

setting = yaml.load(open('config.yaml'))
host = setting['amap']['rabbitmq']['host']
port = setting['amap']['rabbitmq']['port']

message_list = []

r_result = Rabbit(host=host, port=port)
connection_result = r_result.connection

channel = connection_result.channel()


def callback(ch, method, properties, body):
    """
    {'type': '010000', 'square_list': [73.010906, 44.471043, 73.510906, 43.971043]}
    :param ch:
    :param method:
    :param properties:
    :param body:
    :return:
    """
    body = json.loads(body.decode('utf8'))
Code Example #10
File: consumer_detail.py Project: BHBSA/hilder_other
from lib.log import LogHandler
from lib.mongo import Mongo
import yaml

setting = yaml.load(open('config.yaml'))

# mongodb
m = Mongo('192.168.0.235')
connect = m.connect

db_name = setting['CEIC']['mongo']['db']
State_indicators_name = setting['CEIC']['mongo']['State_indicators']
State_indicators_details_name = setting['CEIC']['mongo'][
    'State_indicators_details']
# rabbit
r = Rabbit(setting['CEIC']['rabbit']['host'],
           setting['CEIC']['rabbit']['port'])
channel = r.get_channel()
queue = setting['CEIC']['rabbit']['queue']
channel.queue_declare(queue=queue)
log = LogHandler('ceic_detail')


class Consumer(object):
    def callback(self, ch, method, properties, body):
        ip = method.consumer_tag
        body = json.loads(body.decode())
        url = body['url']
        countryEnName = body['countryEnName']
        indexEnName = body['indexEnName']
        while True:
            proxy_ = {'https': ip}
Code Example #11
File: toutiao_api.py Project: BHBSA/hilder_articles
class Toutiao:
    def __init__(self):
        self.headers = {
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"
        }
        self.start_url = "http://is.snssdk.com/api/news/feed/v46/?category=news_house"
        self.proxies = [
            {
                "http": "http://192.168.0.96:3234"
            },
            {
                "http": "http://192.168.0.93:3234"
            },
            {
                "http": "http://192.168.0.90:3234"
            },
            {
                "http": "http://192.168.0.94:3234"
            },
            {
                "http": "http://192.168.0.98:3234"
            },
            {
                "http": "http://192.168.0.99:3234"
            },
            {
                "http": "http://192.168.0.100:3234"
            },
            {
                "http": "http://192.168.0.101:3234"
            },
            {
                "http": "http://192.168.0.102:3234"
            },
            {
                "http": "http://192.168.0.103:3234"
            },
        ]
        self.bf = BloomFilter(
            host=setting['redies_host'],
            port=setting['redis_port'],
            key='article_toutiao_test',
            blockNum=1,
            db=0,
        )
        self.rabbit = Rabbit(host=setting['rabbitmq_host'],
                             port=setting['rabbitmq_port'])

    def start_crawler(self):

        channel = self.rabbit.get_channel()
        channel.queue_declare(queue='toutiao')
        while True:
            try:
                self.url_list_crawler(channel)
                time.sleep(60)
            except:
                continue

    def url_list_crawler(self, channel):
        while True:
            proxy = self.proxies[random.randint(0, 9)]
            try:
                response = requests.get(self.start_url,
                                        headers=self.headers,
                                        proxies=proxy)
                url_dict = json.loads(response.text)
                url_list = url_dict["data"]
                break
            except:
                continue
        for url_content in url_list:
            con = url_content["content"]
            try:
                url = re.search('display_url":"(.*?)"', con).group(1)
            except:
                continue
            if re.search('wukong', url):
                continue
            else:
                if self.bf.is_contains(url):  # filter detail-page urls
                    log.info('already in bloom_filter: {}'.format(url))
                    continue
                else:
                    self.bf.insert(url)
                    log.info('new url, inserted into bloom_filter: {}'.format(url))
                    article = Article('今日头条')
                    comment_code = Comment_url()
                    try:
                        organization_author = re.search(
                            r'\"source\":\"(.*?)\"', con).group(1)
                        article.organization_author = organization_author
                    except Exception as e:
                        log.info('no organization_author')
                    title = re.findall('"title":"(.*?)"', con)[1]
                    article.title = title
                    article.url = url
                    article.article_id = re.search(r'group/(\d+)', url).group(1)
                    comment_code.group_id = article.article_id
                    comment_code.crawler_time = datetime.datetime.utcnow()
                    try:
                        comment_count = re.search(r'\"comment_count\":(\d+)',
                                                  con).group(1)
                        article.comment_count = comment_count
                        comment_code.comment_count = comment_count
                    except Exception as e:
                        log.info('article {} has no comments'.format(title))
                    try:
                        title_img = re.search(
                            'middle_image.*?"url":"(.*?.webp)', con).group(1)
                        new_title_img = qiniufetch(title_img, title_img)
                        article.title_img = new_title_img
                    except Exception as e:
                        log.info('article {} has no list image'.format(title))

                    channel.basic_publish(exchange='',
                                          routing_key='toutiao',
                                          body=json.dumps(article.to_dict()))
                    log.info('enqueued')
Code Example #12
"""
Crawl order: city -> region -> street -> cuisine
start: 3
"""

from dianping.request_detail import request_get
from lxml import etree
import json
import yaml
import pika
from lib.rabbitmq import Rabbit

setting = yaml.load(open('config.yaml'))

# rabbit
r = Rabbit(setting['dianping']['rabbit']['host'],
           setting['dianping']['rabbit']['port'])
connection = r.connection
channel = connection.channel()
region_queue = setting['dianping']['rabbit']['queue']['region_queue']
street_queue = setting['dianping']['rabbit']['queue']['street_queue']
first_queue = setting['dianping']['rabbit']['queue']['first_queue']
list_queue = setting['dianping']['rabbit']['queue']['list_queue']
channel.queue_declare(queue=region_queue)


# enqueue the html
def html_put_in_queue(data):
    channel.queue_declare(queue=first_queue)
    channel.basic_publish(exchange='',
                          routing_key=first_queue,
                          body=json.dumps(data),
Code Example #13
If there are more than 50 results, enqueue them into the amap_page_url queue

"""
import json
from lib.rabbitmq import Rabbit
from functools import partial
import sys
import yaml
import pika
import trip

setting = yaml.load(open('config.yaml'))
host = setting['amap']['rabbitmq']['host']
port = setting['amap']['rabbitmq']['port']

r_result = Rabbit(host=host, port=port)
r_page = Rabbit(host=host, port=port)


def requests_a(result):
    print('-----------------{}'.format(result.text))
    if 'DAILY_QUERY_OVER_LIMIT' in result.text:
        sys.exit()
    try:
        status = result.json()['status']
    except Exception as e:
        print(e)
        print(result)
        return
    if status == '1':
        count = int(result.json()['count'])
Code Example #14
File: split_url.py Project: BHBSA/hilder_other
 def __init__(self):
     self.rabbit_connection = Rabbit(setting['CEIC']['rabbit']['host'],
                                     setting['CEIC']['rabbit']['port'])
Code Example #15
from xiaozijia.user_headers import get_headers

log = LogHandler('小资家_build')

setting = yaml.load(open('config.yaml'))

# mongo
m = Mongo(setting['xiaozijia']['mongo']['host'],
          setting['xiaozijia']['mongo']['port'],
          user_name=setting['xiaozijia']['mongo']['user_name'],
          password=setting['xiaozijia']['mongo']['password'])
coll_build = m.connect[setting['xiaozijia']['mongo']['db']][
    setting['xiaozijia']['mongo']['build_coll']]

# rabbit
r = Rabbit(setting['xiaozijia']['rabbit']['host'],
           setting['xiaozijia']['rabbit']['port'])
channel = r.get_channel()
build_queue = setting['xiaozijia']['rabbit']['queue']['xiaozijia_build']
house_queue = setting['xiaozijia']['rabbit']['queue']['xiaozijia_house']
channel.queue_declare(queue=build_queue)
channel.queue_declare(queue=house_queue)

headers = get_headers()


def get_build_info(ch, method, properties, body):
    """
    Consume the xiaozijia_build queue, fetch each page, store it into the community collection, and enqueue the house-number pages
    :param ch:
    :param method:
    :param properties:
Code Example #16
m = Mongo(setting['fgg_price_mongo']['host'], setting['fgg_price_mongo']['port'])

fgg = m.connect[setting['fgg_price_mongo']['db']]
coll = fgg[setting['fgg_price_mongo']['coll_fanggugu_price']]
coll_test = fgg[setting['fgg_price_mongo']['coll_fanggugu_price_update']]
coll_user = fgg[setting['fgg_price_mongo']['coll_user_info']]
coll_login = fgg[setting['fgg_price_mongo']['coll_login']]

# connect to rabbit
r = Rabbit(setting['fgg_price_rabbit']['host'], setting['fgg_price_rabbit']['port'], )

channel = r.get_channel()
channel.queue_declare(queue='fgg_comm_id')

IPS = ["192.168.0.90:4234",
       "192.168.0.93:4234",
       "192.168.0.94:4234",
       "192.168.0.96:4234",
       "192.168.0.98:4234",
       "192.168.0.99:4234",
       "192.168.0.100:4234",
       "192.168.0.101:4234",
       "192.168.0.102:4234",
       "192.168.0.103:4234"]
Code Example #17
import json
import datetime
import re
import time

setting = yaml.load(open('config_local.yaml'))
log = LogHandler('article_consumer')
m = MongoClient(setting['mongo_config']['config_host'],
                setting['mongo_config']['port'])
m.admin.authenticate(setting['mongo_config']['user_name'],
                     setting['mongo_config']['password'])
collection = m[setting['mongo_config']['config_db']][setting['mongo_config']
                                                     ['coll_detail']]
clean_coll = m[setting['mongo_config']['config_db']][setting['mongo_config']
                                                     ['clean']]
rabbit = Rabbit(setting['rabbitmq_host'], setting['rabbitmq_port'])
connection = rabbit.connection


class CrawlerDetail:
    def __init__(self):
        self.proxy = Proxies()

    def start_consume(self):
        channel = connection.channel()
        channel.queue_declare(queue='usual_article')
        channel.basic_qos(prefetch_count=1)
        channel.basic_consume(self.consume_article_detail_url,
                              queue='usual_article',
                              no_ack=False)
        channel.start_consuming()
Code Example #18
import requests
from lib.mongo import Mongo
from lib.rabbitmq import Rabbit
from urllib import parse
from lib.log import LogHandler
from backup.anew_fanggugu.user_names import username_list
from progressbar import *

log = LogHandler(__name__)

r = Rabbit('127.0.0.1', 5673)
channel = r.get_channel()
channel.queue_declare(queue='fgg_user_city')

m = Mongo('114.80.150.196', 27777, user_name='goojia', password='******')
# m = Mongo('127.0.0.1', 27018)
coll_comm = m.connect['fgg']['comm']
coll_build = m.connect['fgg']['build']
coll_house = m.connect['fgg']['house']


class ConsumerCity(object):
    """
        Fetch the communities, buildings, house numbers, and floor areas for every city
    """
    def __init__(self):
        self.p = ProgressBar()
        self.headers = {'Authorization': ""}
        self.s = requests.session()
        self.currentCity = ''
        self.currentCityPy = ''
Code Example #19
 def __init__(self):
     self.rabbit = Rabbit(host=setting['rabbitmq_host'], port=setting['rabbitmq_port'], )
Code Example #20
class Toutiao_Consumer:
    def __init__(self):
        self.rabbit = Rabbit(host=setting['rabbitmq_host'], port=setting['rabbitmq_port'], )

    @staticmethod
    def parse_html(res):
        # res = requests.get(url=url, headers=headers)

        # split the url
        # article_id = re.search('\d+', url).group()
        if 'articleInfo' in res.text:
            # a Toutiao url
            readable_title = Document(res.content).short_title()
            readable_article_ = re.search("articleInfo.*?content.*?'(.*?)'", res.content.decode(), re.S | re.M).group(1)
            readable_article = html_parser.unescape(readable_article_)
            source_detail = '今日头条'
            img_change = ImageReplace()
            readable_article = img_change.image_download(readable_article)  # replace image links in Toutiao-sourced article content

        else:
            # articles from other sources
            html_byte = re.sub(b'<script.*script>', b'', res.content, )
            encode_dict = chardet.detect(html_byte)
            encode_type = encode_dict['encoding']
            readable_title = Document(html_byte.decode(encode_type)).short_title()
            readable_article = Document(html_byte.decode(encode_type)).summary()
            source_detail = 'other'
        return readable_title, readable_article, source_detail

    @staticmethod
    def get_post_time(res):
        if 'articleInfo' in res.text:
            # a Toutiao url
            time = re.search("time: '(.*?)'", res.content.decode(), re.S | re.M).group(1)
            return time
        else:
            return None

    def callback(self, ch, method, properties, body):
        body = json.loads(body.decode())
        article = Article(body['source'])
        article.dict_to_attr(body)
        url = article.url

        while True:
            try:
                res = requests.get(url=url, headers=headers, proxies=proxies[random.randint(0, 9)], timeout=10)
                res.encoding = 'utf-8'
                if '<html><head></head><body></body></html>' not in res.text:
                    break
            except Exception as e:
                log.error('network request error: {}'.format(e))

        readable_title, readable_article, source_detail = self.parse_html(res)
        article.post_time = self.get_post_time(res)
        article.body = readable_article
        article.source_detail = source_detail
        article.crawler_time = datetime.datetime.now()
        if '<body id="readabilityBody"/>' in article.body:
            log.info('article body is empty')
        else:
            article.insert_db()
            log.info('one {} article stored successfully'.format('今日头条'))
        ch.basic_ack(delivery_tag=method.delivery_tag)

    def consume_connect(self):
        connect = self.rabbit.get_connection()
        self.channel = connect.channel()
        self.channel.basic_qos(prefetch_count=1)
        self.channel.basic_consume(self.callback,
                                   queue='toutiao',
                                   no_ack=False)

    def start_consume(self):
        disconnected = True
        while disconnected:
            try:
                disconnected = False
                self.channel.start_consuming()
            except Exception as e:
                disconnected = True
                self.consume_connect()
Code Example #21
from lib.rabbitmq import Rabbit
from lib.mongo import Mongo
import datetime
import yaml

setting = yaml.load(open('config.yaml'))

# connect to MongoDB
m = Mongo(setting['comm_price']['host'], setting['comm_price']['port'])
fgg = m.connect[setting['comm_price']['db']]
coll = fgg[setting['comm_price']['fgg_coll']]

coll_login = fgg[setting['fgg']['user_info']]

# connect to rabbit
r = Rabbit('192.168.0.235', 5673)
channel = r.get_channel()

IPS = ["192.168.0.90:4234",
       "192.168.0.93:4234",
       "192.168.0.94:4234",
       "192.168.0.96:4234",
       "192.168.0.98:4234",
       "192.168.0.99:4234",
       "192.168.0.100:4234",
       "192.168.0.101:4234",
       "192.168.0.102:4234",
       "192.168.0.103:4234"]

login = Login()
Code Example #22
class Toutiao:
    def __init__(self):
        self.start_url = 'https://www.toutiao.com/ch/news_house/'

        browser = webdriver.ChromeOptions()
        browser.add_argument('--headless')

        self.driver = webdriver.Chrome(chrome_options=browser)

        self.bf = BloomFilter(host=setting['redies_host'],
                              port=setting['redis_port'],
                              key='article_toutiao_test',
                              blockNum=1,
                              db=0, )
        self.rabbit = Rabbit(host=setting['rabbitmq_host'], port=setting['rabbitmq_port'], )

    def start_crawler(self):
        self.driver.get(self.start_url)
        time.sleep(5)
        channel = self.rabbit.get_channel()
        channel.queue_declare(queue='article_test')
        while True:
            self.find_list_info(channel)
            self.driver.refresh()
            time.sleep(20)

    def find_list_info(self, channel):
        article_list = self.driver.find_elements_by_xpath('/html/body/div/div[4]/div[2]/div[2]/div/div/div/ul/li')
        print('len, ', len(article_list))
        for i in article_list:
            if '看到这里' in i.text:  # the page's "you have seen it all" end marker
                print('reached the end-of-list marker')
                break
            try:
                wenda = i.find_element_by_xpath('div/div[1]/div/div[2]/div[1]/div/a[2]').text
            except Exception as e:
                continue
            if '悟空问答' in wenda:  # skip Wukong Q&A entries
                print('悟空问答, skipping')
                continue
            article_id = i.get_attribute('group_id')

            # run article_id through the bloom filter
            if self.bf.is_contains(article_id):
                print('already in bloom_filter!')
                continue
            else:
                self.bf.insert(article_id)
                print('not in bloom_filter, inserted article_id!')

                article = Article('今日头条')
                try:
                    organization_author = i.find_element_by_xpath('div/div[1]/div/div[2]/div[1]/div/a[2]').text.replace(
                        '⋅', '')
                    article.organization_author = organization_author.strip()
                except Exception as e:
                    print('no organization_author')
                title = i.find_element_by_xpath('div/div[1]/div/div[1]/a').text
                article.title = title
                url = i.find_element_by_xpath('div/div[1]/div/div[1]/a').get_attribute('href')
                article.url = url
                # post_time = i.find_element_by_xpath('div/div[1]/div/div[2]/div[1]/span').text
                # article.post_time = post_time

                try:
                    comment_str = i.find_element_by_xpath('div/div[1]/div/div[2]/div[1]/div/a[3]').text
                    comment_count = int(re.search(r'\d+', comment_str, re.S | re.M).group())
                    article.comment_count = comment_count
                except Exception as e:
                    print('article has no comments:', title)

                try:
                    title_img = i.find_element_by_xpath('div/div[2]/a/img').get_attribute('src')
                    article.title_img = [title_img]
                except Exception as e:
                    print('article has no list image:', title)

                print(article.to_dict())
                # articles not already in the filter go into rabbitmq

                channel.basic_publish(exchange='',
                                      routing_key='article_test',
                                      body=json.dumps(article.to_dict()))
                print('enqueued')
Code Example #23
File: consumer.py Project: w4205/hilder_gv
class Consumer(object):
    r = Rabbit(host=setting['rabbitmq_host'], port=setting['rabbitmq_port'])
    channel = r.get_channel()
    channel.queue_declare(queue='hilder_gv')

    def callback(self, ch, method, properties, body):
        body = json.loads(body.decode())
        analyzer_rules_dict = body['analyzer_rules_dict']
        analyzer_type = body['analyzer_type']
        co_index = analyzer_rules_dict['co_index']
        data_type = analyzer_rules_dict['data_type']
        html = body['html']
        try:
            self.common_use(analyzer_type, co_index, data_type, html,
                            analyzer_rules_dict)
        except Exception as e:
            print(e)
        ch.basic_ack(delivery_tag=method.delivery_tag)

    def common_use(self, analyzer_type, co_index, data_type, html,
                   analyzer_rules_dict):
        if data_type == 'comm':
            info = self.rule_data(analyzer_type, analyzer_rules_dict, html)
            try:
                self.put_database(info, data_type, co_index=co_index)
            except Exception as e:
                print(e)
        elif data_type == 'build':
            co_id_rule = analyzer_rules_dict['co_id']
            co_name_rule = analyzer_rules_dict['co_name']
            co_id = self.rule_type(analyzer_type, html, co_id_rule)
            co_name = self.rule_type(analyzer_type, html, co_name_rule)
            co_id = self.have_no_have(co_id)
            co_name = self.have_no_have(co_name)
            del analyzer_rules_dict['co_id']
            del analyzer_rules_dict['co_name']
            info = self.rule_data(analyzer_type, analyzer_rules_dict, html)
            try:
                self.put_database(info,
                                  data_type,
                                  co_index=co_index,
                                  co_id=co_id,
                                  co_name=co_name)
            except Exception as e:
                print(e)
        elif data_type == 'house':
            bu_id_rule = analyzer_rules_dict['bu_id']
            bu_num_rule = analyzer_rules_dict['bu_num']

            bu_id = self.rule_type(analyzer_type, html, bu_id_rule)
            bu_num = self.rule_type(analyzer_type, html, bu_num_rule)
            bu_id = self.have_no_have(bu_id)
            bu_num = self.have_no_have(bu_num)
            del analyzer_rules_dict['bu_id']
            del analyzer_rules_dict['bu_num']
            info = self.rule_data(analyzer_type, analyzer_rules_dict, html)
            try:
                self.put_database(info,
                                  data_type,
                                  co_index=co_index,
                                  bu_id=bu_id,
                                  bu_num=bu_num)
            except Exception as e:
                print(e)

    def rule_type(self, rule_type, html, rule):
        if rule:
            if rule_type == 'regex':
                data = re.findall(rule, html, re.S | re.M)
                return data
            else:
                tree = etree.HTML(html)
                data = tree.xpath(rule)
                return data
        else:
            return None

    @staticmethod
    def rule_data(analyzer_type, analyzer_rules_dict, html):
        tree = etree.HTML(html)
        info = {}
        for i in analyzer_rules_dict:
            if not analyzer_rules_dict[i]: continue
            if i == 'co_index' or i == 'data_type': continue
            if analyzer_type == 'regex':
                info_list = re.findall(analyzer_rules_dict[i], html,
                                       re.M | re.S)
            else:
                info_list = tree.xpath(analyzer_rules_dict[i])
            if info_list:
                info[i] = info_list
            if not info: print('\n\nno information extracted\n\n')
        return info

    @staticmethod
    def have_no_have(num):
        if num:
            return num[0]
        else:
            return None

    @staticmethod
    def add_attr(obj, info, index):
        for key, value in info.items():
            if value:
                setattr(obj, key, value[index].strip())
        obj.insert_db()

    # iterate over the dict and insert into the database
    def put_database(self,
                     info,
                     analyzer,
                     co_index,
                     bu_id=None,
                     bu_num=None,
                     co_id=None,
                     co_name=None):
        key = sorted(info.items())[0][0]
        length = len(info[key])
        for i in range(0, length):
            obj = self.get_data_obj(analyzer, co_index)
            if analyzer == 'comm':
                pass
            elif analyzer == 'build':
                if co_id: setattr(obj, 'co_id', co_id)
                if co_name: setattr(obj, 'co_name', co_name)
            elif analyzer == 'house':
                if bu_id:
                    setattr(obj, 'bu_id', bu_id.strip())
                if bu_num:
                    setattr(obj, 'bu_num', bu_num.strip())
            self.add_attr(obj, info, i)

    # create the object matching data_type
    def get_data_obj(self, analyzer, co_index):
        if analyzer == 'comm':
            return Comm(co_index)
        elif analyzer == 'build':
            return Building(co_index)
        elif analyzer == 'house':
            return House(co_index)

    def consume_queue(self):
        self.channel.basic_qos(prefetch_count=1)
        self.channel.basic_consume(self.callback, queue='hilder_gv')
        self.channel.start_consuming()
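For reference, the callback above expects a JSON body shaped like the one the producer in Code Example #1 publishes. A hypothetical message (field values are purely illustrative):

example_body = {
    'html': '<html>...</html>',
    'analyzer_type': 'regex',  # 'regex' or an xpath analyzer, per rule_type()
    'analyzer_rules_dict': {
        'co_index': 1,                    # illustrative
        'data_type': 'comm',              # 'comm', 'build', or 'house'
        'co_name': '"co_name":"(.*?)"',   # illustrative extraction rule
    },
}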
Code Example #24
 def connect_rabbit(self):
     r = Rabbit(self.r_host, self.r_port)
     return r.get_channel()
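A usage sketch for the helper above, assuming an object that carries r_host and r_port attributes (the variable and queue names are hypothetical):

channel = crawler.connect_rabbit()
channel.queue_declare(queue='example_queue')
channel.basic_publish(exchange='', routing_key='example_queue', body='{}')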
Code Example #25
import requests
import json
from lib.log import LogHandler
from lib.mongo import Mongo
from lib.rabbitmq import Rabbit
from xiaozijia_gevent.user_headers import get_headers
from xiaozijia_gevent.user_list import user_list
from multiprocessing import Process
from gevent import monkey
import random

log = LogHandler(__name__)
m = Mongo('114.80.150.196', 27777, user_name='goojia', password='******')
coll_detail = m.connect['friends']['xiaozijia_detail']

r = Rabbit('localhost', 5673)
channel = r.get_channel()
monkey.patch_all()  # only "from gevent import monkey" is imported above
headers = ''


def detail_message(info):
    global headers
    data = json.loads(info)
    username = random.choice(user_list)
    headers = get_headers(username)
    id = data['Id']
    ConstructionName = data['ConstructionName']
    try:
        detail_url = 'http://www.xiaozijia.cn/HouseInfo/' + str(id)
        result = requests.get(detail_url, headers=headers, timeout=10)
Code Example #26
from login_fgg import Login
from lib.mongo import Mongo
from lib.rabbitmq import Rabbit
import random
import json
import requests

r = Rabbit('192.168.0.190', 5673)
connection = r.connection
channel = connection.channel()
channel.queue_declare(queue='fgg_all_city_code')

m = Mongo('192.168.0.235', 27017)
connect = m.connect
coll = connect['fgg']['user_info']

login = Login()

ips = [
    "192.168.0.90:4234", "192.168.0.93:4234", "192.168.0.94:4234",
    "192.168.0.96:4234", "192.168.0.98:4234", "192.168.0.99:4234",
    "192.168.0.100:4234", "192.168.0.101:4234", "192.168.0.102:4234",
    "192.168.0.103:4234"
]

known = '上海 35484,北京 20866,广州 16641,深圳 23559,大连 20751,厦门 15265,银川 17000,成都 13000,杭州 13000'


def put_queue_comm_id():
    headers = {
        'Cookie':