Code example #1
File: run.py Project: dasinWalk/ArticleSpider
def get_scrapy_list():
    # Fetch the URLs of all registered scrapy services.
    mysql2 = NativeMysql()
    sql = """
          SELECT url FROM scrapy_service
        """
    result = mysql2.getAll(sql)
    return result
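Every snippet on this page calls methods on NativeMysql, a project wrapper that the page itself does not show. Here is a minimal sketch of the interface the snippets rely on, assuming it wraps pymysql; the connection settings and method bodies are guesses, not the project's actual code:

import pymysql

class NativeMysql(object):
    # Hypothetical stand-in for the ArticleSpider wrapper used in these examples.
    def __init__(self):
        self.conn = pymysql.connect(host="localhost", user="root", password="",
                                    db="spider", charset="utf8mb4")  # assumed settings

    def getAll(self, sql, param=None):
        # Run a SELECT and return all rows.
        with self.conn.cursor() as cursor:
            cursor.execute(sql, param)
            return cursor.fetchall()

    def getOne(self, sql, param=None):
        # Run a SELECT and return the first row.
        with self.conn.cursor() as cursor:
            cursor.execute(sql, param)
            return cursor.fetchone()

    def _execute_commit(self, sql, arg=None):
        # Execute a statement and commit immediately.
        with self.conn.cursor() as cursor:
            cursor.execute(sql, arg)
        self.conn.commit()

    def update(self, sql, param=None):
        self._execute_commit(sql, arg=param)

    def delete(self, sql, param=None):
        self._execute_commit(sql, arg=param)

    def insertOne(self, sql, param=None):
        self._execute_commit(sql, arg=param)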
Code example #2
File: run.py Project: dasinWalk/ArticleSpider
def reset_crawl_task(task_id):
    # Set the crawl_task row's task_status to 3 for the given task.
    mysql2 = NativeMysql()
    sql = """
              update crawl_task set task_status = 3 where id = %s
            """
    params = (task_id, )
    mysql2.update(sql, param=params)
Code example #3
import requests
from scrapy import Selector

def crawl_ips():
    # Crawl the free proxy IPs listed on Xici (xicidaili.com).
    mysql1 = NativeMysql()
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"}
    for i in range(3444):
        result = requests.get("http://www.xicidaili.com/nn/{0}".format(i), headers=headers)
        selector = Selector(text=result.text)
        all_trs = selector.css("#ip_list tr")
        ip_list = []
        for tr in all_trs[1:]:  # skip the table header row
            speed_str = tr.css(".bar::attr(title)").extract()[0]
            if '秒' in speed_str:  # the title looks like "0.123秒" (seconds)
                speed = float(speed_str.split("秒")[0])
                all_texts = tr.css("td::text").extract()
                ip = all_texts[0]
                port = all_texts[1]
                proxy_type = all_texts[5]
                # Some rows carry an extra text cell, shifting the type column.
                if proxy_type not in ['HTTP', 'HTTPS']:
                    proxy_type = all_texts[4]
                ip_list.append((ip, port, proxy_type, speed))
        for ip_info in ip_list:
            # Note: the values are interpolated directly into the SQL string,
            # which is vulnerable to injection; see the parameterized sketch below.
            sql = "insert into proxy_ip(ip, port, proxy_type, speed) VALUES ('{0}', '{1}', '{2}', {3})".format(
                ip_info[0], ip_info[1], ip_info[2], ip_info[3]
            )
            mysql1.insertOne(sql)
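The INSERT above formats scraped values straight into the SQL string, which is open to SQL injection. Since this page does not show whether NativeMysql.insertOne accepts bind parameters, here is a sketch of a parameterized batch insert using pymysql directly; the helper name and connection settings are assumptions:

import pymysql

def insert_proxy_rows(ip_list):
    # Hypothetical helper: bind values instead of formatting them into the SQL.
    conn = pymysql.connect(host="localhost", user="root", password="",
                           db="spider", charset="utf8mb4")  # assumed settings
    try:
        with conn.cursor() as cursor:
            cursor.executemany(
                "insert into proxy_ip(ip, port, proxy_type, speed) "
                "values (%s, %s, %s, %s)",
                ip_list,  # list of (ip, port, proxy_type, speed) tuples
            )
        conn.commit()
    finally:
        conn.close()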
Code example #4
File: run.py Project: dasinWalk/ArticleSpider
def add_crawl_record(task_id):
    # Insert a new crawl_record row for the task and return its generated id.
    mysql2 = NativeMysql()
    tid = create_table_id()
    create_time = get_now_date()
    sql = """
          insert into crawl_record (id,task_id,create_time,start_time,crawl_num,in_db_num,history) 
          values (%s, %s, %s, %s, %s, %s, %s)
        """
    params = (tid, task_id, create_time, create_time, 0, 0, 0)
    mysql2._execute_commit(sql, arg=params)
    return tid
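create_table_id and get_now_date are project helpers that this page does not include. A plausible minimal sketch, where the names match the calls above but the implementations are assumptions:

import uuid
from datetime import datetime

def create_table_id():
    # Assumed: a random hex string used as the primary key.
    return uuid.uuid4().hex

def get_now_date():
    # Assumed: the current time as a MySQL-friendly timestamp string.
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")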
Code example #5
File: run.py Project: dasinWalk/ArticleSpider
def add_crawl_job(value_map, service):
    # Insert a scrapy_task row for one service under the parent crawl task.
    db_map = value_map['dbMap']
    mysql2 = NativeMysql()
    tid = create_table_id()
    sql = """
          insert into scrapy_task (id,task_id,status,service_name,crawl_num,lost_num,total_page,history) 
          values (%s, %s, %s, %s, %s, %s, %s, %s)
        """
    params = (tid, db_map["task_id"], 0, service, 0, 0, 0, 0)
    mysql2._execute_commit(sql, arg=params)
    return tid
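A call would look roughly like this; the dbMap shape is inferred from the lookup above, and both values are placeholders:

value_map = {"dbMap": {"task_id": "some-task-id"}}  # hypothetical payload
scrapy_task_id = add_crawl_job(value_map, "article_service")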
Code example #6
import logging

def update_crawl_status(crawl_id, task_id, failed_num, success_num):
    # Record one spider's results, then close out the parent crawl task
    # once every scrapy_task row for it has finished.
    if failed_num is None:
        failed_num = 0
    if success_num is None:
        success_num = 0
    total_page = failed_num + success_num
    logging.info("crawl task %s: failed_num=%s", task_id, failed_num)
    logging.info("crawl task %s: success_num=%s", task_id, success_num)
    # Accumulate the task's crawl counters in Redis; `obj` is the project's
    # Redis helper (sketched after this example). The hash key is the task id
    # wrapped in quotes, kept as in the original code.
    obj.hash_inrc_key("'" + task_id + "'", 'failed_num', failed_num)
    obj.hash_inrc_key("'" + task_id + "'", 'success_num', success_num)
    try:
        sql = """
                  update scrapy_task set status = '1', history = '1', crawl_num = %s, lost_num = %s,
                  total_page = %s where id = %s
              """
        mysql2 = NativeMysql()
        params = (success_num, failed_num, total_page, crawl_id)
        mysql2.update(sql, param=params)
        # Look for spiders of this task that have not finished yet.
        query_sql = """
                        select status from scrapy_task where history = '0' and task_id = %s for update
                    """
        query_param = (task_id, )
        cursor = mysql2.getAll(query_sql, param=query_param)
        if not cursor:
            # All spiders are done: update crawl_record and crawl_task.
            uptask_sql = """
                             update crawl_task set task_status = 3 where id = %s
                           """
            record_sql = """
                         update crawl_record set crawl_num = %s, in_db_num = %s, end_time = %s where task_id = %s
                     """
            task_total_success = obj.get_hash_value("'" + task_id + "'",
                                                    'success_num')
            if task_total_success is None:
                task_total_success = 0
            task_total_fail = obj.get_hash_value("'" + task_id + "'",
                                                 'failed_num')
            if task_total_fail is None:
                task_total_fail = 0
            total_page = int(task_total_success) + int(task_total_fail)
            update_param = (total_page, task_total_success, get_now_date(),
                            task_id)
            # The crawl_task row's id is the task id itself.
            mysql2.update(uptask_sql, param=query_param)
            mysql2.update(record_sql, param=update_param)
            # Clear the task's Redis bookkeeping.
            obj.del_key(task_id)
            obj.del_key(task_id + 'dog')
            obj.remove_hash("'" + task_id + "'", {'failed_num', 'success_num'})
    except Exception as e:
        logging.exception(e)
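The `obj` helper used above is not shown on this page. Judging from the calls, it wraps Redis hash increments, reads, and deletes; here is a minimal sketch assuming redis-py, where the class name and connection settings are assumptions:

import redis

class RedisHelper(object):
    # Hypothetical stand-in for the `obj` helper used above.
    def __init__(self):
        self.client = redis.StrictRedis(host="localhost", port=6379, db=0)

    def hash_inrc_key(self, name, key, amount):
        # HINCRBY: add `amount` to a hash field, creating it at 0 if missing.
        self.client.hincrby(name, key, amount)

    def get_hash_value(self, name, key):
        # HGET: returns bytes, or None if the field does not exist.
        return self.client.hget(name, key)

    def del_key(self, name):
        self.client.delete(name)

    def remove_hash(self, name, keys):
        # HDEL: drop the given fields from the hash.
        self.client.hdel(name, *keys)

obj = RedisHelper()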
Code example #7
import requests

class GetIp(object):
    mysql2 = NativeMysql()

    def delete_ip(self, ip):
        # Remove an invalid IP from the database.
        delete_sql = """
            delete from proxy_ip where ip='{0}'
        """.format(ip)
        self.mysql2.delete(delete_sql)
        return True

    def judge_ip(self, ip, port):
        # Check whether the proxy still works by fetching a page through it.
        http_url = "http://www.baidu.com"
        proxy_url = "http://{0}:{1}".format(ip, port)
        try:
            proxy_dict = {
                "http": proxy_url,
            }
            response = requests.get(http_url, proxies=proxy_dict, timeout=4)
        except Exception:
            print("invalid ip and port")
            self.delete_ip(ip)
            return False
        else:
            code = response.status_code
            if 200 <= code < 300:
                print("effective ip")
                return True
            else:
                print("invalid ip and port")
                self.delete_ip(ip)
                return False

    def get_random_ip(self):
        # Pick a random HTTPS proxy from the database and validate it;
        # recurses until a working proxy is found.
        random_sql = """
              SELECT ip, port FROM proxy_ip where proxy_type = 'HTTPS'
            ORDER BY RAND()
            LIMIT 1
            """
        result = self.mysql2.getOne(random_sql)
        ip = result[0]
        port = result[1]
        judge_re = self.judge_ip(ip, port)
        if judge_re:
            return "http://{0}:{1}".format(ip, port)
        else:
            return self.get_random_ip()
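In a Scrapy project such as ArticleSpider, GetIp is typically consumed from a downloader middleware that assigns a proxy to each outgoing request. A minimal sketch follows; the middleware itself is hypothetical and would still need to be registered in the project's DOWNLOADER_MIDDLEWARES setting:

class RandomProxyMiddleware(object):
    # Hypothetical middleware: route each outgoing request through a random
    # working proxy picked by GetIp.
    def process_request(self, request, spider):
        get_ip = GetIp()
        request.meta["proxy"] = get_ip.get_random_ip()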