import logging

import requests
from scrapy.selector import Selector

# NativeMysql, create_table_id, get_now_date and the redis helper `obj` are
# assumed to be imported from the project's own db/cache utility modules.


def get_scrapy_list():
    # Return the registered scrapy service URLs.
    mysql2 = NativeMysql()
    sql = """
        SELECT url FROM scrapy_service where 1=1
    """
    result = mysql2.getAll(sql)
    return result
def reset_crawl_task(task_id):
    mysql2 = NativeMysql()
    sql = """
        update crawl_task set task_status = 3 where id = %s
    """
    params = (task_id,)
    mysql2.update(sql, param=params)
def crawl_ips():
    # Crawl free proxy IPs from xicidaili.com and store them in proxy_ip.
    mysql1 = NativeMysql()
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"
    }
    for i in range(3444):
        result = requests.get("http://www.xicidaili.com/nn/{0}".format(i), headers=headers)
        selector = Selector(text=result.text)
        all_trs = selector.css("#ip_list tr")
        ip_list = []
        for tr in all_trs[1:]:
            speed_str = tr.css(".bar::attr(title)").extract()[0]
            speed = 0.0  # default so rows without a parsable speed still insert
            if '秒' in speed_str:  # the site reports speed as "x.xx秒" (seconds)
                speed = float(speed_str.split("秒")[0])
            all_texts = tr.css("td::text").extract()
            ip = all_texts[0]
            port = all_texts[1]
            proxy_type = all_texts[5]
            if proxy_type not in ['HTTP', 'HTTPS']:
                proxy_type = all_texts[4]
            ip_list.append((ip, port, proxy_type, speed))
        for ip_info in ip_list:
            sql = (
                "insert into proxy_ip(ip, port, proxy_type, speed) "
                "VALUES ('{0}', '{1}', '{2}', {3})".format(
                    ip_info[0], ip_info[1], ip_info[2], ip_info[3]
                )
            )
            mysql1.insertOne(sql)
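# Note: crawl_ips() above interpolates scraped values straight into the INSERT
# string. The helper below is a hypothetical, hedged sketch of a parameterized
# variant; it assumes NativeMysql's _execute_commit(sql, arg=...) accepts %s
# placeholders the same way add_crawl_record() uses it further down.
def insert_proxy_ip(mysql_conn, ip_info):
    # ip_info is the (ip, port, proxy_type, speed) tuple built by crawl_ips().
    sql = """
        insert into proxy_ip(ip, port, proxy_type, speed)
        values (%s, %s, %s, %s)
    """
    mysql_conn._execute_commit(sql, arg=ip_info)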
def add_crawl_record(task_id):
    mysql2 = NativeMysql()
    tid = create_table_id()
    create_time = get_now_date()
    sql = """
        insert into crawl_record
            (id, task_id, create_time, start_time, crawl_num, in_db_num, history)
        values (%s, %s, %s, %s, %s, %s, %s)
    """
    params = (tid, task_id, create_time, create_time, 0, 0, 0)
    mysql2._execute_commit(sql, arg=params)
    return tid
def add_crawl_job(value_map, service):
    db_map = value_map['dbMap']
    mysql2 = NativeMysql()
    tid = create_table_id()
    sql = """
        insert into scrapy_task
            (id, task_id, status, service_name, crawl_num, lost_num, total_page, history)
        values (%s, %s, %s, %s, %s, %s, %s, %s)
    """
    params = (tid, db_map["task_id"], 0, service, 0, 0, 0, 0)
    mysql2._execute_commit(sql, arg=params)
    return tid
def update_crawl_status(crawl_id, task_id, failed_num, success_num):
    if failed_num is None:
        failed_num = 0
    if success_num is None:
        success_num = 0
    total_page = failed_num + success_num
    logging.info("start crawl data ===failed_num %s %s", task_id, failed_num)
    logging.info("start crawl data ===success_num %s %s", task_id, success_num)
    # Accumulate the per-task crawl counters in redis.
    obj.hash_inrc_key("'" + task_id + "'", 'failed_num', failed_num)
    obj.hash_inrc_key("'" + task_id + "'", 'success_num', success_num)
    try:
        sql = """
            update scrapy_task
            set status = '1', history = '1', crawl_num = %s, lost_num = %s, total_page = %s
            where id = %s
        """
        mysql2 = NativeMysql()
        params = (success_num, failed_num, total_page, crawl_id)
        mysql2.update(sql, param=params)
        # Check whether any scrapy_task of this crawl task is still unfinished.
        query_sql = """
            select status from scrapy_task WHERE history = '0' and task_id = %s for update
        """
        query_param = (task_id,)
        cursor = mysql2.getAll(query_sql, param=query_param)
        if not cursor:
            # All sub-tasks are done: close out crawl_task and crawl_record.
            uptask_sql = """
                update crawl_task set task_status = 3 where id = %s
            """
            record_sql = """
                update crawl_record
                set crawl_num = %s, in_db_num = %s, end_time = %s
                where task_id = %s
            """
            task_total_success = obj.get_hash_value("'" + task_id + "'", 'success_num')
            if task_total_success is None:
                task_total_success = 0
            task_total_fail = obj.get_hash_value("'" + task_id + "'", 'failed_num')
            if task_total_fail is None:
                task_total_fail = 0
            total_page = int(task_total_success) + int(task_total_fail)
            update_param = (total_page, task_total_success, get_now_date(), task_id)
            mysql2.update(uptask_sql, param=query_param)  # query_param is (task_id,)
            mysql2.update(record_sql, param=update_param)
            # Clean up the redis counters for this task.
            obj.del_key(task_id)
            obj.del_key(task_id + 'dog')
            obj.remove_hash("'" + task_id + "'", {'failed_num', 'success_num'})
    except Exception as e:
        logging.exception(e)
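# Hedged usage sketch (not part of the original flow): how add_crawl_record,
# add_crawl_job and update_crawl_status are expected to chain together. The
# function name and the literal counts below are illustrative only.
def _example_crawl_flow(value_map, service):
    task_id = value_map['dbMap']['task_id']
    add_crawl_record(task_id)                      # open a crawl_record row
    crawl_id = add_crawl_job(value_map, service)   # register one scrapy_task
    # ... dispatch the spider, then report its result back:
    update_crawl_status(crawl_id, task_id, failed_num=2, success_num=98)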
class GetIp(object):
    mysql2 = NativeMysql()

    def delete_ip(self, ip):
        # Remove an invalid proxy ip from the database.
        delete_sql = """
            delete from proxy_ip where ip='{0}'
        """.format(ip)
        self.mysql2.delete(delete_sql)
        return True

    def judge_ip(self, ip, port):
        # Check whether the proxy ip:port is still usable.
        http_url = "http://www.baidu.com"
        proxy_url = "http://{0}:{1}".format(ip, port)
        try:
            proxy_dict = {
                "http": proxy_url,
            }
            response = requests.get(http_url, proxies=proxy_dict, timeout=4)
        except Exception:
            print("invalid ip and port")
            self.delete_ip(ip)
            return False
        else:
            code = response.status_code
            if 200 <= code < 300:
                print("effective ip")
                return True
            print("invalid ip and port")
            self.delete_ip(ip)
            return False

    def get_random_ip(self):
        # Pick a random HTTPS proxy from the database and validate it.
        random_sql = """
            SELECT ip, port FROM proxy_ip where proxy_type = 'HTTPS'
            ORDER BY RAND()
            LIMIT 1
        """
        result = self.mysql2.getOne(random_sql)
        ip = result[0]
        port = result[1]
        judge_re = self.judge_ip(ip, port)
        if judge_re:
            return "http://{0}:{1}".format(ip, port)
        return self.get_random_ip()
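# Illustrative sketch: GetIp is typically consumed from a Scrapy downloader
# middleware. The class name and the settings snippet below are assumptions
# (they are not defined in this module); setting request.meta["proxy"] is the
# standard Scrapy mechanism for routing a request through a proxy.
class RandomProxyMiddleware(object):
    def __init__(self):
        self.get_ip = GetIp()

    def process_request(self, request, spider):
        # Attach a random, validated proxy to every outgoing request.
        request.meta["proxy"] = self.get_ip.get_random_ip()

# Example activation in settings.py (module path and priority are examples):
# DOWNLOADER_MIDDLEWARES = {
#     "myproject.middlewares.RandomProxyMiddleware": 605,
# }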