def urls_add(self):
    add_url_list = list(set(self.__request_json['urls_add']))  # deduplicate
    # URLs already present in the queue collection
    exist_queue_url_list = []
    res = Mongo.get()['queue_' + self.project_name].find(
        {'url_md5': {'$in': [md5(l) for l in add_url_list]}}, {'url': 1})
    for doc in res:
        exist_queue_url_list.append(doc['url'])
    # URLs already present in the parsed collection
    exist_parsed_url_list = []
    res = Mongo.get()['parsed_' + self.project_name].find(
        {'url_md5': {'$in': [md5(l) for l in add_url_list]}}, {'url': 1})
    for doc in res:
        # TODO: check how long ago the URL was parsed, so re-crawling can be allowed
        exist_parsed_url_list.append(doc['url'])
    # Enqueue only URLs found in neither the queue nor the parsed collection
    add_urls_data = []
    for url in add_url_list:
        if url not in exist_queue_url_list and url not in exist_parsed_url_list:
            add_urls_data.append({
                'domain': get_domain(url),
                'url': url,
                'url_md5': md5(url),
                'flag_time': 0,
                'add_time': int(time.time()),
                'slave_ip': self.__request_address[0],
            })
    if add_urls_data:
        Mongo.get()['queue_' + self.project_name].insert(add_urls_data)
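# The md5() helper used above is assumed to return a hex digest of the URL,
# which serves as a compact dedup key in Mongo. A minimal sketch of such a
# helper (hypothetical -- the real implementation may differ):
import hashlib

def md5(text):
    # Hex digest of a URL string; encode first so unicode input also works.
    if isinstance(text, bytes):
        return hashlib.md5(text).hexdigest()
    return hashlib.md5(text.encode('utf-8')).hexdigest()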
def sort_urls_by_freq(cls, urls):
    """Sort urls by how many times each URL's host has been crawled."""
    sorted_urls = {}
    for url in urls:
        sorted_urls[url] = len(cls.host_freq_pool.get(get_domain(url), []))
    return cls.__sort_dict_by_value_return_keys(sorted_urls)
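# A quick illustration of the ordering, assuming host_freq_pool maps each
# domain to a list of recent crawl timestamps and that the private sorter
# returns keys in ascending value order (least-crawled hosts first). The
# class name Scheduler is hypothetical:
#
#   Scheduler.host_freq_pool = {
#       'example.com': [1700000000, 1700000005, 1700000010],
#       'blog.test': [1700000002],
#   }
#   Scheduler.sort_urls_by_freq(['http://example.com/a', 'http://blog.test/b'])
#   # -> ['http://blog.test/b', 'http://example.com/a']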
def run(self, func, current_url, project_name, init_url, gevent_id):
    """
    :param func: user-defined parse callback
    :return:
    """
    self.handle_method = func
    # while True:  # TODO: needs some rate-limiting mechanism, e.g. gevent.sleep
    self.current_url = current_url
    print 'gevent_id: ' + str(gevent_id) + ' -- ' + self.project_name + ' -- ' + self.current_url
    if not self.current_url:
        # continue
        return
    self.put_data(urls_parsed=[self.current_url, ])
    crawl_result = self.http_helper.get(self.current_url)
    if not str(crawl_result[1]).startswith('20') \
            and not str(crawl_result[1]).startswith('30'):
        # Any status code outside the 2xx and 3xx ranges is reported as an error
        echo_err('gevent_id: ' + str(gevent_id) + ' -- ' + self.project_name
                 + ' -- URL: ' + self.current_url + ' fetch failed, HTTP code: '
                 + str(crawl_result[1]) + ' Runtime: ' + str(crawl_result[2]) + 'ms')
        # continue
        self.put_data(
            urls_fail=(
                get_domain(self.current_url),
                int(crawl_result[1] if str(crawl_result[1]).isdigit() else 0),
                int(time.time()),
            ),
        )
        return
    # If the user-defined parse function returns a dict, push it to the server
    try:
        parse_result = self.handle_method(
            S(self, crawl_result[0],
              get_urls_form_html(self.current_url, crawl_result[0]),
              project_name, init_url))
    except:
        print traceback.format_exc()
        return
    if not isinstance(parse_result, dict):
        # continue
        return
    if 'url' not in parse_result:
        parse_result['url'] = self.current_url
    # if 'runtime' not in parse_result:
    #     parse_result['runtime'] = crawl_result[2]
    self.put_data(save=parse_result)
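# The func argument is a user-defined parse callback. Judging from how its
# return value is handled above, it receives the S() wrapper built in run()
# and may return a dict that gets pushed to the server via put_data(save=...).
# A minimal sketch (the attributes of S and the field names here are
# illustrative assumptions, not the framework's contract):
def parse_page(s):
    # Real code would extract fields from the fetched HTML carried by `s`.
    result = {'title': 'untitled'}
    # 'url' may be omitted; run() fills it in with the current URL.
    return result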
def secure(sec, url, message, box, button):
    url = get_domain(url)
    if sec == 1:
        message.set_markup("<span size='small'>{} {}.\r{}.</span>".format(
            url, _("has no security"),
            _("An attacker could see any information you send, or control the content that you see")))
        box.show_all()
        button.hide()
        return
    if sec == 0:
        message.set_markup("<span size='small'>{} {}\r{}</span>".format(
            _("Connected to"), url,
            _("Your connection seems to be secure. Want to know more?")))
    if sec == 2:
        message.set_markup("<span size='small'>{} {}\r{}</span>".format(
            _("Connected to"), url,
            _("This web site did not properly secure your connection. Want to know more?")))
    box.show_all()
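# A hedged usage sketch with PyGObject widgets, assuming _ is installed via
# gettext and that message/box/button are ordinary Gtk widgets (the real
# application wiring may differ):
import gettext
import gi
gi.require_version('Gtk', '3.0')
from gi.repository import Gtk

_ = gettext.gettext

message = Gtk.Label()
button = Gtk.Button(label=_("More information"))
box = Gtk.Box(orientation=Gtk.Orientation.VERTICAL)
box.add(message)
box.add(button)

# sec=1: plain-HTTP page -- show the warning and hide the details button
secure(1, 'http://example.com/page', message, box, button)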
def get_sleep_times(cls, url):
    domain = get_domain(url)
    parsed_list = cls.host_freq_pool.get(domain, [])
    if not parsed_list:
        return 1
    # Sites with strict anti-crawler measures get a longer, randomized delay
    list_403 = ['jandan.net', 'meizu.com', 'meizu.cn']
    parsed_list_len = len(parsed_list)
    # (crawl-count upper bound, randint upper bound for strict sites, normal delay)
    delay_table = [
        (10, None, 1),
        (20, 20, 2),
        (30, 35, 4),
        (40, 40, 6),
        (50, 56, 8),
        (60, 70, 10),
        (70, 90, 12),
        (80, 100, 14),
        (90, 110, 16),
        (100, 120, 18),
        (110, 130, 20),
        (120, 140, 22),
        (130, 150, 24),
    ]
    for limit, rand_max, delay in delay_table:
        if parsed_list_len < limit:
            if rand_max is not None and domain in list_403:
                return random.randint(5, rand_max)
            return delay
    return 200
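# The ramp-up is easiest to see by tabulating the delay against the per-host
# crawl count. A small demo, using a non-strict domain and the hypothetical
# Scheduler class name from above:
#
#   for count in (1, 15, 45, 95, 125, 300):
#       Scheduler.host_freq_pool = {'example.com': [0] * count}
#       print Scheduler.get_sleep_times('http://example.com/')
#   # -> 1, 2, 8, 18, 24, 200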
def add_parsed(cls, url):
    # Extract the main domain and update that domain's crawl-frequency record
    cls.__update_host_freq(get_domain(url))
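# __update_host_freq itself is not shown here. A plausible minimal sketch,
# consistent with host_freq_pool holding a list of recent crawl timestamps
# per domain (purely illustrative, including the one-hour decay window):
@classmethod
def __update_host_freq(cls, domain):
    now = int(time.time())
    pool = cls.host_freq_pool.setdefault(domain, [])
    pool.append(now)
    # Drop old entries so per-host counters decay instead of growing forever
    cls.host_freq_pool[domain] = [t for t in pool if now - t < 3600]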
def get(self, url, params=()):
    self.domain = get_domain(url)
    self.url = url
    return self.__request('get', params)
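# Judging from how run() consumes crawl_result, __request is expected to
# return a (body, status_code, runtime_ms) tuple. A hedged usage sketch,
# with http_helper standing in for an instance of this class:
#
#   body, status, runtime_ms = http_helper.get('http://example.com/')
#   if str(status).startswith('20'):
#       print 'fetched %d bytes in %sms' % (len(body), runtime_ms)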
""" Open a csv and populate a dictionary with its contents. Prompt the user to add members. """ from functions import (open_csv_populate_lst, get_domain, prompt_user_for_domain, prompt_user_for_prefix, concat_lists, write_lst_to_csv) employees = open_csv_populate_lst("employees.csv") domain = get_domain(employees) domain_confirmed = prompt_user_for_domain(domain) employees_to_add = prompt_user_for_prefix(domain, employees) all_employees = concat_lists(employees, employees_to_add) write_lst_to_csv("employees.csv", all_employees, ["email"])