Example #1
    def urls_add(self):
        add_url_list = list(set(self.__request_json['urls_add']))  # de-duplicate

        # URLs already present in the queue collection
        exist_queue_url_list = []
        res = Mongo.get()['queue_' + self.project_name].find({'url_md5': {'$in': [md5(l) for l in add_url_list]}},
                                                             {'url': 1})

        for doc in res:
            exist_queue_url_list.append(doc['url'])

        # URLs already present in the parsed collection
        exist_parsed_url_list = []

        res = Mongo.get()['parsed_' + self.project_name].find({'url_md5': {'$in': [md5(l) for l in add_url_list]}},
                                                              {'url': 1})
        for doc in res:  # TODO: check how long ago the URL was parsed, so re-crawling can be allowed
            exist_parsed_url_list.append(doc['url'])

        # enqueue the remaining URLs
        add_urls_data = []
        for url in add_url_list:
            if url not in exist_queue_url_list and url not in exist_parsed_url_list:  # enqueue only URLs found in neither queue nor parsed
                add_urls_data.append(
                    {'domain': get_domain(url),
                     'url': url,
                     'url_md5': md5(url),
                     'flag_time': 0,
                     'add_time': int(time.time()),
                     'slave_ip': self.__request_address[0]})

        add_urls_data and Mongo.get()['queue_' + self.project_name].insert(add_urls_data)
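The `get_domain` and `md5` helpers used above are not part of this listing; a minimal sketch, assuming `get_domain` returns the host part of a URL and `md5` a hex digest used as the de-duplication key, could look like this:

import hashlib

try:
    from urlparse import urlparse        # Python 2, matching the print statements later in these examples
except ImportError:
    from urllib.parse import urlparse    # Python 3

def get_domain(url):
    # Host portion of a URL, e.g. 'http://example.com/a' -> 'example.com'
    return urlparse(url).netloc

def md5(text):
    # Stable hex digest used as the de-duplication key (assumes ASCII/UTF-8 input)
    return hashlib.md5(text.encode('utf-8')).hexdigest()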
Example #2
    def sort_urls_by_freq(cls, urls):
        """
        Sort urls by how many times their host has been crawled.
        """
        sorted_urls = {}
        for url in urls:
            sorted_urls[url] = len(cls.host_freq_pool.get(get_domain(url), []))

        return cls.__sort_dict_by_value_return_keys(sorted_urls)
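`__sort_dict_by_value_return_keys` is defined elsewhere in the same class; a plausible sketch, assuming it returns the URLs ordered from the least-crawled host to the most-crawled one, is:

    @classmethod
    def __sort_dict_by_value_return_keys(cls, d):
        # Sort the url -> crawl-count mapping by count and return only the URLs,
        # least-crawled hosts first so they are fetched before busy ones.
        return [k for k, v in sorted(d.items(), key=lambda item: item[1])]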
Example #3
    def run(self, func, current_url, project_name, init_url, gevent_id):
        """
        Fetch current_url, report its status back, and run the user parse callback.

        :param func: user-defined parse callback; a dict return value is pushed to the server
        :return: None
        """
        self.handle_method = func

        # while True:
        # TODO: needs some rate control, e.g. gevent.sleep
        self.current_url = current_url

        print 'gevent_id: ' + str(gevent_id) + ' -- ' + self.project_name + ' -- ' + self.current_url
        if not self.current_url:
            # continue
            return
        self.put_data(urls_parsed=[self.current_url, ])
        crawl_result = self.http_helper.get(self.current_url)
        if not str(crawl_result[1]).startswith('20') \
                and not str(crawl_result[1]).startswith('30'):  # report an error unless the status code is 2xx or 3xx
            echo_err('gevent_id: ' + str(gevent_id) + ' -- ' + self.project_name +
                     ' -- URL: ' + self.current_url + ' fetch failed, HTTP code: ' + str(crawl_result[1]) + ' Runtime: ' + str(
                crawl_result[2]) + 'ms')
            # continue
            self.put_data(
                urls_fail=(
                    get_domain(self.current_url),
                    int(crawl_result[1] if str(crawl_result[1]).isdigit() else 0),
                    int(time.time()),
                ),
            )
            return

        # if the user-defined parse function returns a dict, push that dict to the server
        try:
            parse_result = self.handle_method(
                S(self, crawl_result[0], get_urls_form_html(self.current_url, crawl_result[0]), project_name, init_url))
        except Exception:
            print traceback.format_exc()
            return

        if not isinstance(parse_result, dict):
            # continue
            return

        if 'url' not in parse_result:
            parse_result['url'] = self.current_url
        # if 'runtime' not in parse_result:
        #     parse_result['runtime'] = crawl_result[2]

        self.put_data(save=parse_result)
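`get_urls_form_html` (spelling as in the original identifier) is assumed to extract the links from the fetched page and resolve them against the page URL; a rough sketch using only the standard library might be:

import re

try:
    from urlparse import urljoin        # Python 2
except ImportError:
    from urllib.parse import urljoin    # Python 3

def get_urls_form_html(base_url, html):
    # Collect href attributes and absolutize them against the page that was fetched.
    hrefs = re.findall(r'href=[\'"]?([^\'" >]+)', html)
    return [urljoin(base_url, h) for h in hrefs
            if not h.startswith(('javascript:', 'mailto:', '#'))]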
Example #4
def secure(sec, url, message, box, button):

    url = get_domain(url)

    if sec == 1:
        message.set_markup("<span size='small'>{} {}.\r{}.</span>".format(url, _("has no security"),\
        _("An attacker could see any information you send, or control the content that you see")))
        box.show_all()
        button.hide()
        return

    if sec == 0:
        message.set_markup("<span size='small'>{} {}\r{}</span>".format(
            _("Connected to"), url,
            _("Your connection seems to be secure. Want to know more about?")))

    if sec == 2:
        message.set_markup("<span size='small'>{} {}\r{}</span>".format(
            _("Connected to"), url,
            _("This web site did not properly secure your connection. Want to know more about?")))

    box.show_all()
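A hedged call-site sketch, reading the `sec` codes off the branches above (the widget variable names are assumptions):

# sec: 1 = no security, 0 = appears secure, 2 = improperly secured, as handled above
secure(0, 'https://example.com/login', info_label, info_box, details_button)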
Example #5
    def get_sleep_times(cls, url):
        domain = get_domain(url)
        parsed_list = cls.host_freq_pool.get(domain, [])
        if not parsed_list:
            return 1

        list_403 = ['jandan.net', 'meizu.com', 'meizu.cn']  # sites with fairly strict anti-crawler protections
        parsed_list_len = len(parsed_list)

        if parsed_list_len < 5:
            return 1

        if parsed_list_len < 10:
            return 1

        if parsed_list_len < 20:
            if domain in list_403:
                return random.randint(5, 20)
            return 2

        if parsed_list_len < 30:
            if domain in list_403:
                return random.randint(5, 35)
            return 4

        if parsed_list_len < 40:
            if domain in list_403:
                return random.randint(5, 40)
            return 6

        if parsed_list_len < 50:
            if domain in list_403:
                return random.randint(5, 56)
            return 8

        if parsed_list_len < 60:
            if domain in list_403:
                return random.randint(5, 70)
            return 10

        if parsed_list_len < 70:
            if domain in list_403:
                return random.randint(5, 90)
            return 12

        if parsed_list_len < 80:
            if domain in list_403:
                return random.randint(5, 100)
            return 14

        if parsed_list_len < 90:
            if domain in list_403:
                return random.randint(5, 110)
            return 16

        if parsed_list_len < 100:
            if domain in list_403:
                return random.randint(5, 120)
            return 18

        if parsed_list_len < 110:
            if domain in list_403:
                return random.randint(5, 130)
            return 20

        if parsed_list_len < 120:
            if domain in list_403:
                return random.randint(5, 140)
            return 22

        if parsed_list_len < 130:
            if domain in list_403:
                return random.randint(5, 150)
            return 24

        return 200
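The sleep time computed here is presumably consumed by the worker loop that the TODO in Example #3 refers to; a hedged usage sketch (the `HostFreq` class name and the surrounding loop are assumptions) could be:

import gevent

# Hypothetical call site inside the crawl loop: wait longer the more often this
# host has been fetched recently, then proceed with the next request.
gevent.sleep(HostFreq.get_sleep_times(current_url))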
Example #6
 def add_parsed(cls, url):
     # get the main domain and update that domain's crawl frequency
     cls.__update_host_freq(get_domain(url))
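`__update_host_freq` and `host_freq_pool` are not shown in this listing; a minimal sketch, assuming the pool maps each domain to the timestamps of its recent fetches and that entries older than an hour are pruned, could be:

import time

class HostFreq(object):                      # the class name is an assumption
    host_freq_pool = {}

    @classmethod
    def __update_host_freq(cls, domain):
        # Record this fetch and drop timestamps older than an hour, so that
        # len(cls.host_freq_pool[domain]) reflects recent crawl pressure only.
        now = int(time.time())
        recent = [t for t in cls.host_freq_pool.get(domain, []) if now - t < 3600]
        recent.append(now)
        cls.host_freq_pool[domain] = recent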
Example #7
 def get(self, url, params=()):
     self.domain = get_domain(url)
     self.url = url
     return self.__request('get', params)
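Assuming `__request` returns the `(body, status_code, runtime_ms)` tuple that Example #3 unpacks from `crawl_result`, a call site might look like this (the `HttpHelper` class name is an assumption):

http_helper = HttpHelper()                   # class name is an assumption
body, status, runtime_ms = http_helper.get('http://example.com/page')
if str(status).startswith('20'):
    # a 2xx response body would be handed on to the parse callback, as in Example #3
    print('HTTP %s in %sms, %d bytes' % (status, runtime_ms, len(body)))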
Example #8
"""
Open a csv and populate a dictionary with its contents. Prompt the user to
add members.
"""

from functions import (open_csv_populate_lst, get_domain,
                       prompt_user_for_domain, prompt_user_for_prefix,
                       concat_lists, write_lst_to_csv)

employees = open_csv_populate_lst("employees.csv")
domain = get_domain(employees)
domain_confirmed = prompt_user_for_domain(domain)
employees_to_add = prompt_user_for_prefix(domain, employees)
all_employees = concat_lists(employees, employees_to_add)
write_lst_to_csv("employees.csv", all_employees, ["email"])
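In this last example `get_domain` operates on the list of employee e-mail addresses loaded from the CSV rather than on a URL; a hedged sketch of that variant, assuming every row is an address such as 'name@company.com', could be:

def get_domain(employees):
    # Take the part after '@' from the first address; the remaining rows are
    # assumed to share the same domain, which the user is then asked to confirm.
    return employees[0].split('@')[-1]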