Example 1
class ProxyCheck(object):

    check_urls = ["http://news.cnstock.com/news/sns_yw/index.html"]

    def __init__(self):
        self.redis_action = RedisAction()

    def check_rules(self, proxy_url):
        # fetch each check URL through the proxy; any failure marks the proxy as bad
        for curl in self.check_urls:
            try:
                proxy_res = requests.get(curl,
                                         proxies={"http": proxy_url},
                                         timeout=20)
                if proxy_res.status_code not in [200]:
                    return False
            except Exception as e:
                traceback.print_exc()
                return False
        return True

    def period_check(self):
        # re-validate every proxy currently stored in the "proxy_set" Redis set
        for proxy_url in self.redis_action.members_set("proxy_set"):
            if not self.check_rules(proxy_url):
                self.redis_action.pop_set("proxy_set", proxy_url)
                print "proxy {} is disabled".format(proxy_url)
            else:
                print "proxy {} is enabled".format(proxy_url)

    def add_new(self):
        # fetch a fresh batch of proxies from the ip3366 API and keep only those that work
        url_list = [
            "http://dec.ip3366.net/api/?key=20171207221341061&getnum=30&anonymoustype=3&filter=1&area=1&sarea=1&formats=2&proxytype=0",
            "http://dec.ip3366.net/api/?key=20171207221341061&getnum=30&anonymoustype=4&filter=1&area=1&sarea=1&formats=2&proxytype=0"
        ]
        res = requests.get(choice(url_list))
        proxy_json = json.loads(res.text)
        for proxy in proxy_json:
            proxy_url = "http://{}:{}".format(proxy["Ip"], proxy["Port"])
            for curl in self.check_urls:
                try:
                    proxy_res = requests.get(curl,
                                             proxies={"http": proxy_url},
                                             timeout=10)
                    if proxy_res.status_code not in [200]:
                        print "proxy {} is disabled".format(proxy_url)
                        break
                except Exception as e:
                    traceback.print_exc()
                    print "proxy {} is disabled".format(proxy_url)
                    break
            else:
                self.redis_action.add_set("proxy_set", proxy_url)
                print "proxy {} is enabled".format(proxy_url)
Example 2
class CrawlProducer(object):
    """
    定时扫描 hbase表中应该爬去的url, 然后写入crawl_task_queue
    crontab -e
    * * * * * python /home/szliu/fintech_crawler/period/crawl_task_producer.py >> /niub/crontab_log/crontab.log 2>&1
    hbase info
        url: 网址
        next_time: 下一次抓取时间
        last_time: 最近一次抓取时间
        channel: url所属频道
        priority: 优先级
        parse_func: url的解析函数
        once_every_minutes: 多少分钟抓一次
    """
    def __init__(self):
        self.redis_action = RedisAction()
        self.hbase_action = HBASEAction()

    def run(self, queue_name, hbase_table_name):
        # scan the schedule table
        count = 0
        for hbase_dict in self.hbase_action.scan_table(hbase_table_name, [
                "info:url", "info:priority", "info:parse_func",
                "info:next_time", "info:once_every_minutes"
        ]):
            base_time = datetime.now()
            rate = int(hbase_dict["info:once_every_minutes"])
            # has this URL's next crawl time arrived?
            if hbase_dict["info:next_time"] <= base_time.strftime(
                    "%Y-%m-%d %H:%M:%S"):
                crawl_dict = {
                    "info:url": hbase_dict["info:url"],
                    "info:priority": hbase_dict["info:priority"],
                    "info:parse_func": hbase_dict["info:parse_func"],
                }
                count += 1
                self.redis_action.priority_queue_push(
                    queue_name, json.dumps(crawl_dict),
                    int(crawl_dict["info:priority"]))
                # write back last_time and the next scheduled crawl time
                self.hbase_action.insert_data(
                    hbase_table_name, {
                        "info:last_time":
                        base_time.strftime("%Y-%m-%d %H:%M:%S"),
                        "info:url":
                        hbase_dict["info:url"],
                        "info:next_time":
                        (base_time +
                         timedelta(minutes=rate)).strftime("%Y-%m-%d %H:%M:%S")
                    })
        print "[%s] write %s finished" % (datetime.now(), count)
Example 3
class Handler(BaseHandler):
    file_store = FileStore()
    redis_action = RedisAction()

    crawl_config = {
    }

    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0",
    }

    @every(minutes=1)
    def on_start(self):
        data_list = self.redis_action.priority_queue_pop("crawl_task_queue", 0)
        proxy_list = self.redis_action.get_random_set("crawler_set", 3)

        for data in data_list:
            if proxy_list:
                choice_proxy = choice(proxy_list)
                self.crawl(data["info:url"], callback=self.index_page, save=data,
                           proxy="{}:{}".format(choice_proxy["Ip"], choice_proxy["Port"]), headers=self.headers)
            else:
                self.crawl(data["info:url"], callback=self.index_page, save=data, headers=self.headers)

    @catch_status_code_error
    def index_page(self, response):
        if response.status_code in [404, 403, 302, 312, 500]:
            return {'result': response.url,
                    'html': response.save.get("html", ""),
                    'status_code': response.status_code,
                    'crawl_time': datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
        if isinstance(response.text, unicode):
            response.save.update({"html": response.text})
        else:
            response.save.update({"html": response.text.decode(response.encoding)})
        self.file_store.save(json.dumps(response.save))
        return {'result': response.url,
                'html': response.save["html"],
                'crawl_time': datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
                                           "info:status": "pending",
                                           "info:channel": "",
                                           "info:url": data["info:url"]})
        if hb_action.get_raw(table_name, data["info:url"], "info:is_hub"):
            redis_action.priority_queue_push(crawl_name, json.dumps(data), int(data["info:priority"]))
        return "update"


if __name__ == "__main__":
    """
    Launch commands:
    nohup /usr/local/spark2/bin/spark-submit --master spark://abc-cloudera001:7077 --py-files /home/szliu/fintech_crawler/spark_submit_library/db_actions.zip,/home/szliu/fintech_crawler/spark_submit_library/hbase.zip --conf spark.pyspark.virtualenv.enabled=true --conf spark.pyspark.virtualenv.type=native --conf spark.pyspark.virtualenv.requirements=/home/szliu/venv/crawler/requirements.txt --conf spark.pyspark.virtualenv.bin.path=/usr/local/python2.7/bin/virtualenv dupfilter_task_worker.py --executor-memory 2G --total-executor-cores 2 >> /niub/szliu/dupfilter_task_worker.log 2>&1 &
    nohup /usr/local/spark2/bin/spark-submit --py-files /home/szliu/fintech_crawler/spark_submit_library/db_actions.zip,/home/szliu/fintech_crawler/spark_submit_library/hbase.zip --conf spark.pyspark.virtualenv.enabled=true --conf spark.pyspark.virtualenv.type=native --conf spark.pyspark.virtualenv.requirements=/home/szliu/venv/crawler/requirements.txt --conf spark.pyspark.virtualenv.bin.path=/usr/local/python2.7/bin/virtualenv dupfilter_task_worker.py --executor-memory 2G --total-executor-cores 2  >> /niub/szliu/dupfilter_task_worker.log 2>&1 &
    """
    sc = SparkContext(appName="dupfilter_task_work")
    # sc.addPyFile("{base_path}/redis.zip".format(base_path=project_path))
    # sc.addPyFile("{base_path}/pyhdfs.zip".format(base_path=project_path))
    from db_actions.hbase_action import HBASEAction
    from db_actions.redis_action import RedisAction
    redis_action = RedisAction()
    while True:
        data_lines = redis_action.priority_queue_pop("dupfilter_task_queue", 50)
        if any(data_lines):
            news = sc.parallelize(data_lines).filter(lambda x: x).map(lambda x: dupfilter_task_work(x, HBASEAction(),
                                                                      RedisAction(), "crawl_task_queue", "url_schedule"))
            # news.count()
            print news.collect()
        else:
            time.sleep(10)

Example 5
 def __init__(self):
     self.redis_action = RedisAction()
     self.hbase_action = HBASEAction()
Example 6
 def __init__(self):
     self.link_analysis = LinkExtractor()
     self.redis_action = RedisAction()
Example 7
class BaseStock(object):
    def __init__(self):
        self.link_analysis = LinkExtractor()
        self.redis_action = RedisAction()

    def is_detail_url(self, dom):
        """
        判断url是否详情页
        :param dom:
        :return True or False:
        """
        return True

    def get_channel(self, dom):
        return ""

    def parse_content(self, response_text, params):
        """
        判断是否时详情页, 如果是则解析网页内容, 如果不是则提取网页所有网址
        :param response_text: 网页内容
        :param params: {'info:url': 'info:priority': 'info:pase_func': ,}
        :return:
        """
        _params = {k: v for k, v in params.items() if k.startswith("info")}
        dom = PyQuery(response_text.strip())
        if self.is_detail_url(dom):
            return self.parse_detail_url(dom=dom, params=_params)
        else:
            return self.parse_other_url(dom=dom, params=_params)

    def parse_other_url(self, dom, params):
        """
        获取所有url, 并写入depfilter_task_queue
        :param dom:
        :param params:
        :return:
        """
        result_list = []
        channel = self.get_channel(dom)
        for e in dom.find('a'):
            sub_url = PyQuery(e).attr('href')
            if sub_url and sub_url.startswith("."):
                sub_url = self.link_analysis.url_join(params["info:url"],
                                                      sub_url)

            if self.link_analysis.url_legal(sub_url, self.allow_domains):
                if not self.link_filter(sub_url):
                    # push the new URL into the Redis dedup queue
                    _params = dict(
                        params.copy(), **{
                            "info:url": sub_url,
                            "info:channel": channel
                        })
                    result_list.extend(
                        [json.dumps(_params),
                         int(_params["info:priority"])])
        self.redis_action.priority_queue_push("dupfilter_task_queue",
                                              *result_list)
        return "parse urls"

    @upload_hbase
    def parse_detail_url(self, dom, params):
        pass

    @html_to_dom
    def detect_anti(self):
        pass

    def link_filter(self, url):
        return False
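
BaseStock only provides the plumbing; a site-specific parser is expected to subclass it and fill in the hooks it calls (allow_domains, is_detail_url, get_channel, parse_detail_url, and optionally link_filter). The sketch below is illustrative only: the class name, domain, channel value, and CSS selectors are hypothetical and not taken from the original project:

class ExampleStockParser(BaseStock):
    # hypothetical subclass; the selectors below are placeholders
    allow_domains = ["example.com"]

    def is_detail_url(self, dom):
        # treat any page with an article container as a detail page
        return bool(dom("div.article-content").html())

    def get_channel(self, dom):
        return "news"

    @upload_hbase
    def parse_detail_url(self, dom, params):
        # return the HBase column dict; upload_hbase (Example 10) writes it to "news_data"
        params.update({
            "info:title": dom("h1.title").text(),
            "info:content": dom("div.article-content").html(),
        })
        return params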
Example 8
# encoding: utf-8
import os
import sys
import urlparse
import traceback
import requests
import urllib2
import urllib
from pyquery import PyQuery
from datetime import datetime
sys.path.append("..")
from db_actions.hbase_action import HBASEAction
from db_actions.redis_action import RedisAction
from utilty.link_analysis import LinkExtractor
hbase_client = HBASEAction()
redis_client = RedisAction()


def parse_item(html):
    dom = PyQuery(html)
    store_json = {
        "info:title":
        dom("div#img-content > h2#activity-name").text(),
        "info:publish_time":
        dom("div.rich_media_meta_list > em#post-date").text(),
        "info:source":
        dom("a#post-user").text(),
        "info:author":
        dom("div.rich_media_meta_list > em").eq(1).text(),
        "info:content":
        LinkExtractor().strip_html5_whitespace(dom("div#page-content").html()),
Example 9
 def __init__(self):
     self.redis_action = RedisAction()
Example 10
# encoding: utf-8
import sys
import json
sys.path.append("..")
import traceback
from pyquery import PyQuery
from db_actions.hbase_action import HBASEAction
from db_actions.redis_action import RedisAction


hbase_action = HBASEAction()
redis_action = RedisAction()


def upload_hbase(func):

    def wrap(*args, **kwargs):
        try:
            data = func(*args, **kwargs)
            hbase_action.insert_data("news_data", data)
            return "data save in hbase"
        except Exception as e:
            traceback.print_exc()
            print kwargs["params"]["info:url"], "css selector error"
            return kwargs["params"]["info:url"], "css selector error"

    return wrap
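
# Usage note (illustrative, not part of the original module): upload_hbase wraps a parse
# function that returns a dict of HBase columns, as BaseStock.parse_detail_url does in
# Example 7:
#
#     @upload_hbase
#     def parse_detail_url(self, dom, params):
#         params["info:content"] = dom("div.article-content").html()  # hypothetical selector
#         return params
#
# On success the dict is written to the "news_data" table; on a parsing exception the
# wrapper logs the failing info:url and returns a "css selector error" result instead of
# raising.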


def html_to_dom(is_detect_anti):