Example #1
class BaseSpider(Spider):
    name = 'basespider'

    def __init__(self, *a, **kw):
        super(BaseSpider, self).__init__(*a, **kw)

        self.urls = []
        self.headers = {}
        self.timeout = 10

        self.sql = SqlHelper()

        self.dir_log = 'log/proxy/%s' % self.name

        self.is_record_web_page = False

    def init(self):
        self.meta = {
            'download_timeout': self.timeout,
        }

        utils.make_dir(self.dir_log)

        command = utils.get_create_table_command(config.free_ipproxy_table)
        self.sql.execute(command)

    def start_requests(self):
        for i, url in enumerate(self.urls):
            yield Request(
                url=url,
                headers=self.headers,
                meta=self.meta,
                dont_filter=True,
                callback=self.parse_page,
                errback=self.error_parse,
            )

    def parse_page(self, response):
        self.write(response.body)

    def error_parse(self, failure):
        request = failure.request
        pass

    def add_proxy(self, proxy):
        utils.sql_insert_proxy(self.sql, config.free_ipproxy_table, proxy)

    def write(self, data):
        if self.is_record_web_page:
            filename = '%s/%s.html' % (
                self.dir_log,
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f'))
            with open(filename, 'w') as f:
                f.write(data)

    def close(spider, reason):
        spider.sql.commit()
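
A minimal usage sketch of the BaseSpider above, assuming it and the project's utils/config helpers are importable; the source URL, XPath expressions and proxy fields are hypothetical and only illustrate how a concrete spider fills in urls, headers and parse_page:

class ExampleProxySpider(BaseSpider):
    name = 'exampleproxy'

    def __init__(self, *a, **kw):
        super(ExampleProxySpider, self).__init__(*a, **kw)

        # hypothetical free-proxy listing page and headers
        self.urls = ['http://www.example.com/free-proxy-list']
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.init()

    def parse_page(self, response):
        self.write(response.body)
        # hypothetical page layout: one <tr> per proxy with ip/port cells;
        # the exact fields utils.sql_insert_proxy expects are project-specific
        for row in response.xpath('//table//tr'):
            ip = row.xpath('td[1]/text()').extract_first()
            port = row.xpath('td[2]/text()').extract_first()
            if ip and port:
                self.add_proxy({'ip': ip, 'port': port})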
Example #2
    def GET(self):
        try:
            sql = SqlHelper()
            inputs = web.input()
            name = inputs.get('name')
            ip = inputs.get('ip')
            command = "DELETE FROM {0} WHERE ip=\'{1}\'".format(name, ip)
            sql.execute(command)

            command = "SELECT ip FROM {0} WHERE ip=\'{1}\'".format(name, ip)
            res = sql.query_one(command)
            return res is None
        except Exception:
            pass
        return False
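
Both statements above interpolate request parameters straight into the SQL text, so a crafted name or ip value can change the query. A hedged sketch of the same handler using parameterized queries, assuming a plain pymysql connection rather than the project's SqlHelper (whose execute signature may not accept bound parameters); the connection settings and class name are placeholders:

import pymysql
import web


class DeleteIp(object):
    def GET(self):
        inputs = web.input()
        name = inputs.get('name')
        ip = inputs.get('ip')
        # placeholder connection settings
        conn = pymysql.connect(host='localhost', user='root', password='',
                               db='ipproxy', charset='utf8')
        try:
            with conn.cursor() as cursor:
                # a table name cannot be bound as a parameter, so validate it
                # against a known whitelist before formatting it in
                cursor.execute("DELETE FROM {0} WHERE ip=%s".format(name), (ip,))
                cursor.execute("SELECT ip FROM {0} WHERE ip=%s".format(name), (ip,))
                res = cursor.fetchone()
            conn.commit()
            return res is None
        except Exception:
            return False
        finally:
            conn.close()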
Example #3
class SendSms(object):
    def __init__(self):
        self.sql = SqlHelper()

        self.weather_table_name = config.weather_table
        self.user_table_name = config.user_table

    def send_sms(self):
        command = ("SELECT * FROM {};".format(self.user_table_name))
        self.sql.execute(command)
        users = self.sql.cursor.fetchall()
        if users is not None:
            for user in users:
                utils.log('send_sms get user info user:%s' % str(user))
                # Check the user-defined time; only send the SMS when it matches
                user_time = user[5]  # user-defined send time, e.g. '08:30'
                time_info = user_time.split(':')

                u_hour = time_info[0]
                u_minute = time_info[1]

                # Get the current system time
                s_hour = datetime.datetime.now().hour
                s_minute = datetime.datetime.now().minute

                if int(u_hour) == s_hour and int(u_minute) == s_minute:
                    utils.log('send sms to user:%s' % str(user))

                    command = (
                        "select * from {0} where city_name='{1}' order by id desc limit 1;"
                        .format(self.weather_table_name, user[3]))
                    self.sql.execute(command)
                    weather = self.sql.cursor.fetchone()
                    if weather is not None:
                        temp_code = 'SMS_41855112'
                        phone = user[2]
                        info = {
                            'name': user[1],
                            'city': user[3],
                            'weather': weather[15],
                            'temp': '%s ~ %s' % (weather[9], weather[8]),
                            'aqilevel': utils.get_aqi_level_info(weather[12]),
                        }

                        sms = AliyunSms()
                        sms.send_sms(temp_code, info, phone)
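
The minute-equality check in send_sms only fires if the method happens to run during the exact minute a user configured, so it has to be driven by a loop or scheduler that wakes up roughly once a minute. A minimal sketch of such a driver (the scheduling interval is an assumption; the original project may use cron or another scheduler):

import datetime
import time


def run_sms_loop():
    sender = SendSms()
    while True:
        sender.send_sms()  # compares each user's configured HH:MM with now
        # sleep until just past the next minute boundary so every slot is checked once
        time.sleep(61 - datetime.datetime.now().second)

if __name__ == '__main__':
    run_sms_loop()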
Example #4
class Validator(Spider):
    name = 'base'
    concurrent_requests = 16
    retry_enabled = False

    def __init__(self, name=None, **kwargs):
        super(Validator, self).__init__(name, **kwargs)
        self.sql = SqlHelper()

        self.dir_log = 'log/validator/%s' % self.name
        self.timeout = 10

        self.urls = []
        self.headers = None
        self.success_mark = ''
        self.is_record_web_page = False

    def init(self):
        utils.make_dir(self.dir_log)

        command = utils.get_create_table_command(self.name)
        self.sql.create_table(command)

    @classmethod
    def update_settings(cls, settings):
        settings.setdict(cls.custom_settings or {
            'CONCURRENT_REQUESTS': cls.concurrent_requests,
            'RETRY_ENABLED': cls.retry_enabled,
        }, priority='spider')

    def start_requests(self):
        count = utils.get_table_length(self.sql, self.name)
        count_free = utils.get_table_length(self.sql, config.httpbin_table)

        ids = utils.get_table_ids(self.sql, self.name)
        ids_free = utils.get_table_ids(self.sql, config.httpbin_table)

        for i in range(0, count + count_free):
            table = self.name if (i < count) else config.httpbin_table
            id = ids[i] if i < count else ids_free[i - len(ids)]

            proxy = utils.get_proxy_info(self.sql, table, id)
            if proxy is None:
                continue

            for url in self.urls:
                cur_time = time.time()
                yield Request(
                    url=url,
                    headers=self.headers,
                    meta={
                        'cur_time': cur_time,
                        'download_timeout': self.timeout,
                        'proxy_info': proxy,
                        'table': table,
                        'id': proxy.get('id'),
                        'proxy': 'http://%s:%s' % (proxy.get('ip'), proxy.get('port')),
                        'vali_count': proxy.get('vali_count', 0),
                    },
                    dont_filter=True,
                    callback=self.success_parse,
                    errback=self.error_parse,
                )

    def success_parse(self, response):
        utils.log('success_parse speed:%s meta:%s' %
                  (time.time() - response.meta.get('cur_time'), response.meta))

        proxy = response.meta.get('proxy_info')
        table = response.meta.get('table')
        id = response.meta.get('id')
        ip = proxy.get('ip')

        self.save_page(ip, response.body)

        if self.success_mark == '' or self.success_mark in response.body:
            speed = time.time() - response.meta.get('cur_time')
            if table == self.name:
                if speed > self.timeout:
                    command = utils.get_delete_data_command(table, id)
                    self.sql.execute(command)
                else:
                    vali_count = response.meta.get('vali_count', 0) + 1
                    command = utils.get_update_data_command(
                        table, id, speed, vali_count)
                    self.sql.execute(command)
            else:
                if speed < self.timeout:
                    command = utils.get_insert_data_command(self.name)
                    msg = (None, proxy.get('ip'), proxy.get('port'),
                           proxy.get('country'), proxy.get('anonymity'),
                           proxy.get('https'), speed, proxy.get('source'),
                           None, 1)

                    self.sql.insert_data(command, msg, commit=True)
        else:
            # The success mark was not found, so the response is invalid;
            # delete this IP from the current table
            if table == self.name:
                command = utils.get_delete_data_command(table, id)
                self.sql.execute(command)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse value:%s url:%s meta:%s' %
                  (failure.value, request.url, request.meta))

        proxy = failure.request.meta.get('proxy_info')
        table = failure.request.meta.get('table')
        id = failure.request.meta.get('id')

        if table == self.name:
            command = utils.get_delete_data_command(table, id)
            self.sql.execute(command)
        else:
            # TODO: when IP validation fails, handle it according to the specific error type
            pass

            #
            # request = failure.request.meta
            # utils.log('request meta:%s' % str(request))
            #
            # # log all errback failures,
            # # in case you want to do something special for some errors,
            # # you may need the failure's type
            # self.logger.error(repr(failure))
            #
            # #if isinstance(failure.value, HttpError):
            # if failure.check(HttpError):
            #     # you can get the response
            #     response = failure.value.response
            #     self.logger.error('HttpError on %s', response.url)
            #
            # #elif isinstance(failure.value, DNSLookupError):
            # elif failure.check(DNSLookupError):
            #     # this is the original request
            #     request = failure.request
            #     self.logger.error('DNSLookupError on %s', request.url)
            #
            # #elif isinstance(failure.value, TimeoutError):
            # elif failure.check(TimeoutError):
            #     request = failure.request
            #     self.logger.error('TimeoutError on url:%s', request.url)

    def save_page(self, ip, data):
        filename = '{time} {ip}'.format(
            time=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f'),
            ip=ip)
        utils.log('filename:%s' % filename)

        if self.is_record_web_page:
            with open('%s/%s.html' % (self.dir_log, filename), 'w') as f:
                f.write(data)

    def close(spider, reason):
        spider.sql.commit()
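
A minimal usage sketch of the Validator above, assuming it is importable from the project; the target URL and success mark are illustrative (config.httpbin_table in the code suggests httpbin-style validation, but the concrete values here are assumptions):

class HttpBinValidator(Validator):
    name = 'httpbin'

    def __init__(self, name=None, **kwargs):
        super(HttpBinValidator, self).__init__(name, **kwargs)

        # httpbin echoes the requesting IP, so a successful proxied request
        # should contain the 'origin' field in its body
        self.urls = ['http://httpbin.org/get']
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.success_mark = 'origin'
        self.init()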
Example #5
def runspider(request):
    data = {
        'status': 'failure',
        'guid': '0',
        'info': '',
    }

    try:
        # In production, the request comes in via POST
        url = request.POST.get('url')
        force = request.POST.get('force', 'false')
        pattern = re.compile(r'\d+', re.S)
        product_id = re.search(pattern, url).group()
        sql = SqlHelper()

        utils.log('product_id:%s' % product_id)

        if 'item.jd.com' in url and product_id is not None:
            data['status'] = 'success'
            data['guid'] = str(uuid.uuid4())
            data['info'] = '成功接收数据,正在为您抓取并分析数据,精彩稍候呈现'

            command = "SELECT id FROM {table} WHERE id={product_id}". \
                format(table = config.jd_item_table, product_id = product_id)
            result = sql.query_one(command)

            if result is None:
                name = 'jd'
                cmd = 'cd {dir};python manage.py real_time_analysis -a name={name} -a guid={guid} ' \
                      '-a product_id={product_id} -a url={url};'. \
                    format(url = str(url), name = name, dir = settings.BASE_DIR, guid = data.get('guid'),
                           product_id = product_id)

                subprocess.Popen(cmd, shell=True)
            else:
                if force == 'false':
                    utils.log('数据库中存在数据,从数据库中取出分析结果')
                    command = "SELECT * FROM {0} WHERE product_id={1} ORDER BY id". \
                        format(config.analysis_item_table, product_id)
                    result = sql.query(command)
                    for res in result:
                        utils.push_redis(data.get('guid'),
                                         res[1],
                                         res[2],
                                         res[3],
                                         save_to_mysql=False)
                else:
                    command = "DELETE FROM {0} WHERE produce_id={1}".format(
                        config.analysis_item_table, product_id)
                    sql.execute(command)
                    # Re-analyse the data
                    cmd = 'cd {dir};python manage.py analysis -a url={url} -a name={name} -a guid={guid} -a ' \
                          'product_id={product_id};'. \
                        format(url = url, name = 'jd', dir = settings.BASE_DIR, guid = data.get('guid'),
                               product_id = product_id)

                    subprocess.Popen(cmd, shell=True)
        else:
            data['info'] = '传入网址有误,请检查后重新输入,请输入以下格式的网址:\n%s' % 'https://item.jd.com/3995645.html'
    except Exception as e:
        logging.error('run spider exception:%s' % e)
        data['info'] = '出现错误,错误原因:%s' % e
Example #6
    sql = SqlHelper()

    spiders = [
        XiCiDaiLiSpider,
        SixSixIpSpider,
        IpOneEightOneSpider,
        KuaiDaiLiSpider,  # the site adds a JS check before access (anti-crawling)
        GatherproxySpider,
        HidemySpider,
        ProxylistplusSpider,
        FreeProxyListsSpider,
        # PeulandSpider,  # the target site is no longer available
        UsProxySpider,
        ProxyDBSpider,
        ProxyRoxSpider,
    ]

    while True:
        utils.log('*******************run spider start...*******************')

        command = "DELETE FROM {table} where save_time < SUBDATE(NOW(), INTERVAL 0.2 DAY)".format(
                table = config.free_ipproxy_table)
        sql.execute(command)

        for spider in spiders:
            scrapydo.run_spider(spider)

        utils.log('*******************run spider waiting...*******************')
        time.sleep(1200)
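
This fragment assumes the module has already initialised scrapydo: scrapydo.setup() has to be called once (it installs the crochet-managed reactor) before scrapydo.run_spider() can be used. A sketch of the missing preamble, with project import paths as assumptions:

import time

import scrapydo

import config
import utils
from sqlhelper import SqlHelper               # project helper, path assumed
from proxy.spiders import XiCiDaiLiSpider     # project spiders, paths assumed

scrapydo.setup()  # must run once before any scrapydo.run_spider() call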
Example #7
class Validator(Spider):
    name = 'base'

    def __init__(self, name=None, **kwargs):
        super(Validator, self).__init__(name, **kwargs)
        self.sql = SqlHelper()

        self.dir_log = 'log/validator/%s' % self.name
        self.timeout = 10

        self.urls = []
        self.headers = None
        self.success_mark = ''

    def init(self):
        utils.make_dir(self.dir_log)

        command = utils.get_create_table_command(self.name)
        self.sql.create_table(command)

    def start_requests(self):
        count = utils.get_table_length(self.sql, self.name)
        count_free = utils.get_table_length(self.sql, free_ipproxy_table)

        for i in range(0, count + count_free):
            table = self.name if (i < count) else free_ipproxy_table

            proxy = utils.get_proxy_info(self.sql, table, i)
            if proxy is None:
                continue

            for url in self.urls:
                cur_time = time.time()
                yield Request(
                    url=url,
                    headers=self.headers,
                    meta={
                        'cur_time': cur_time,
                        'download_timeout': self.timeout,
                        'proxy_info': proxy,
                        'table': table,
                        'id': proxy.get('id'),
                        'proxy': 'http://%s:%s' % (proxy.get('ip'), proxy.get('port')),
                    },
                    dont_filter=True,
                    callback=self.success_parse,
                    errback=self.error_parse,
                )

    def success_parse(self, response):
        utils.log('name:%s success_parse proxy:%s meta:%s' %
                  (self.name, str(
                      response.meta.get('proxy_info')), str(response.meta)))

        filename = datetime.datetime.now().strftime('%Y-%m-%d %H_%M_%S_%f')
        self.save_page(filename, response.body)

        if self.success_mark == '' or self.success_mark in response.body:
            proxy = response.meta.get('proxy_info')
            speed = time.time() - response.meta.get('cur_time')
            table = response.meta.get('table')
            id = response.meta.get('id')

            utils.log('speed:%s table:%s id:%s' % (speed, table, id))

            if table == self.name:
                if speed > self.timeout:
                    command = utils.get_delete_data_command(table, id)
                    self.sql.execute(command)
                else:
                    command = utils.get_update_data_command(table, id, speed)
                    self.sql.execute(command)
            else:
                if speed < self.timeout:
                    command = utils.get_insert_data_command(self.name)
                    msg = (None, proxy.get('ip'), proxy.get('port'),
                           proxy.get('country'), proxy.get('anonymity'),
                           proxy.get('https'), speed, proxy.get('source'),
                           None)

                    self.sql.insert_data(command, msg)

    def error_parse(self, failure):
        utils.log('error_parse value:%s' % failure.value)

        proxy = failure.request.meta.get('proxy_info')
        table = failure.request.meta.get('table')
        id = failure.request.meta.get('id')

        if table == self.name:
            command = utils.get_delete_data_command(table, id)
            self.sql.execute(command)
        else:
            # TODO: when IP validation fails, handle it according to the specific error type
            pass

            #
            # request = failure.request.meta
            # utils.log('request meta:%s' % str(request))
            #
            # # log all errback failures,
            # # in case you want to do something special for some errors,
            # # you may need the failure's type
            # self.logger.error(repr(failure))
            #
            # #if isinstance(failure.value, HttpError):
            # if failure.check(HttpError):
            #     # you can get the response
            #     response = failure.value.response
            #     self.logger.error('HttpError on %s', response.url)
            #
            # #elif isinstance(failure.value, DNSLookupError):
            # elif failure.check(DNSLookupError):
            #     # this is the original request
            #     request = failure.request
            #     self.logger.error('DNSLookupError on %s', request.url)
            #
            # #elif isinstance(failure.value, TimeoutError):
            # elif failure.check(TimeoutError):
            #     request = failure.request
            #     self.logger.error('TimeoutError on url:%s', request.url)

    def save_page(self, filename, data):
        if get_project_settings().get('IS_RECODE_HTML', False):
            with open('%s/%s.html' % (self.dir_log, filename), 'w') as f:
                f.write(data)
Example #8
def main(n):
    _tc = TimeCnt()
    _tc.cnt_time()
    #model_labels_2 = model_labels_2
    #model_labels = model_labels
    #model_columns_base = model_columns_base
    #print("> this is sql test")
    #shandong_ = SqlHelper(Config_map_shandong)
    #guangxi_ = SqlHelper(Config_guangxi)
    #storm_shandong_= SqlHelper(Configstormshandong)
    #storm_110_ = SqlHelper(Configooo)
    _tc.cnt_time()
    today = todayStr()

    #from load_mysqL_from_localcpk import load_mongodb_conn
    #bond_risk_ = load_mysqL_from_localcpk.load_mongodb_conn()
    mysql_bond_risk_ = SqlHelper(Config_bond_risk)
    #cursor = bond_risk_.find()
    #for i in cursor:
        #print(i)
        #pdb.set_trace()
    #    timeFormat(i['date_input'])
    label_120_ = mysql_bond_risk_.execute("select label from middleTable where (to_days(now()) - to_days(date)>=%d);"% n)
    compname_120_ = mysql_bond_risk_.execute("select compname from middleTable where (to_days(now()) - to_days(date)>=%d);"% n)
    #label_120_ = mysql_bond_risk_.execute("select label from middleTable where (to_days(now()) - to_days(date_input) <=720 and to_days(now()) - to_days(date)<=%d);"% n)
    #_tc.cnt_time()
    #compname_120_ = mysql_bond_risk_.execute("select compname from middleTable where (to_days(now()) - to_days(date_input) <=720 and to_days(now()) - to_days(date)<=%d);"% n)
    #label_120_, compname_120_ = get_label_time_window(bond_risk_, n, n-120)
    # deduplicate labels and company names
    label_lst_ = list(set(label_120_))
    _tc.cnt_time()
    compname_lst_ = list(set(compname_120_))
    _dic = get_label_120(mysql_bond_risk_, compname_lst_, label_lst_, n)
    _tc.cnt_time()
    _panel = pd.Panel(_dic)
    _panel = _panel.fillna(0.0)
    #filter_data_by_time(bond_risk_, today)
    _tc.cnt_time()

    df_4_model = pd.DataFrame(index=compname_lst_, columns = model_labels + model_labels_2)
    df_4_model = df_4_model.fillna(0.0)
    df_4_model = df_4_model.astype(np.float64)
    _index = list(df_4_model.index)
    _columns = model_labels
    _columns_2 = model_labels_2
    #print("> ready to get data")
    _cnt = 0
    for i in _index:
        _cnt += 1
        if _cnt % 100 == 1:
            pass
            #print(">>>> !!! handle the,", i, _cnt)
        #if _cnt > 300:
            #break
            #print("> handle the,", i, _cnt)
        for c in _columns:
            df_4_model.loc[i,c] = cell_fill(_panel, i,c)
        df_4_model.loc[i, "企业名称"] = i
        df_4_model.loc[i, "发布日期"] = datetime.datetime.now()
        df_4_model.loc[i, "credit_recent"] = 0
        df_4_model.loc[i, "credit_ago"] = 0
        df_4_model.loc[i, "credit_trend"] = 0

        df_4_model.loc[i, "60"] = _panel[i].loc[60,:].sum()
        df_4_model.loc[i, "120"] = _panel[i].loc[120,:].sum()
        df_4_model.loc[i, "180"] = _panel[i].loc[180,:].sum()
        df_4_model.loc[i, "债券风险60"] = group_cnt_key_word("债券风险60",i,_panel)
        df_4_model.loc[i, "债券风险120"] = group_cnt_key_word("债券风险120",i,_panel)
        df_4_model.loc[i, "债券风险180"] = group_cnt_key_word("债券风险180",i,_panel)
        df_4_model.loc[i, "个人风险60"] = group_cnt_key_word("个人风险60",i,_panel)
        df_4_model.loc[i, "个人风险120"] = group_cnt_key_word("个人风险120",i,_panel)
        df_4_model.loc[i, "个人风险180"] = group_cnt_key_word("个人风险180",i,_panel)
        df_4_model.loc[i, "财务风险60"] = group_cnt_key_word("财务风险60",i,_panel)
        df_4_model.loc[i, "财务风险120"] = group_cnt_key_word("财务风险120",i,_panel)
        df_4_model.loc[i, "财务风险180"] = group_cnt_key_word("财务风险180",i,_panel)
        df_4_model.loc[i, "经营风险60"] = group_cnt_key_word("经营风险60",i,_panel)
        df_4_model.loc[i, "经营风险120"] = group_cnt_key_word("经营风险120",i,_panel)
        df_4_model.loc[i, "经营风险180"] = group_cnt_key_word("经营风险180",i,_panel)
        df_4_model.loc[i, "行业风险60"] = group_cnt_key_word("行业风险60",i,_panel)
        df_4_model.loc[i, "行业风险120"] = group_cnt_key_word("行业风险120",i,_panel)
        df_4_model.loc[i, "行业风险180"] = group_cnt_key_word("行业风险180",i,_panel)
        df_4_model.loc[i, "企业风险60"] = group_cnt_key_word("企业风险60",i,_panel)
        df_4_model.loc[i, "企业风险120"] = group_cnt_key_word("企业风险120",i,_panel)
        df_4_model.loc[i, "企业风险180"] = group_cnt_key_word("企业风险180",i,_panel)
        #df_4_model = df_4_model.applymap(lambda x : np.NaN if x==0 else x)
        df_4_model.loc[i, "sub120_60"] = df_4_model.loc[i, "120"] - df_4_model.loc[i, "60"]
        df_4_model.loc[i, "sub180_120"] = df_4_model.loc[i, "180"] - df_4_model.loc[i, "120"]
        #df_4_model = df_4_model.applymap(lambda x : np.NaN if x==-1 else x)
        #df_4_model = df_4_model.applymap(lambda x : np.NaN if x==0 else x)

    _x = df_4_model.drop(["企业名称","发布日期","Label"],1)
    _z = pd.read_csv("/home/siyuan/bond_risk/_z.csv").drop(["Unnamed: 0","发布日期","Label"],1)
    #_z.index = _z["企业名称"]
    _z = _z.drop("企业名称", axis=1)
    _x.columns = list(_z.columns)
    # !! filter
    #_x = _x[(_x["sub120_60"]>0) & (_x["60"]>0)]
    #_x = _x[(_x["60"]>0)]
    _x = _x[(_x["120"]>0)]
    train_separator = len(_x.index)
    #print(train_separator)
    _pred_data = pd.concat([_x, _z], axis=0)

    _pred_data = set_dummy(_pred_data, False)
    # output predict label
    bst = xgb.Booster()
    bst.load_model("/home/siyuan/data/xgb.model")
    #pdb.set_trace()

    #_lz = pd.read_csv("/home/siyuan/bond_risk/_z.csv")["Label"]
    result_ = predict(bst, _pred_data, _pred_data.iloc[1])
    dict_ = dict(zip(list(_pred_data.index), result_))
    dict_res = dict(zip(list(_pred_data.index)[:train_separator], result_[:train_separator]))
    #dict_res = dict(zip(list(_pred_data.index), result_))

    #print(collections.Counter(list(result_)))
    #print(collections.Counter(list(result_)[:train_separator]))
    cnt = 0
    #pdb.set_trace()
    #print(dict_res)
    for i in dict_res.keys():
        sql_ = "INSERT INTO resultTable VALUES('', '%s', CURTIME(), '%s');"%(i,str(format(dict_res[i],'.9e')))
        #print(sql_)
        sql_res_ = mysql_bond_risk_.execute(sql_)
        #print(sql_res_)
        cnt += 1
    mysql_bond_risk_.connect.commit()
Example #9
def runspider(request):
    data = {
        'status': 'failure',
        'guid': '0',
        'info': '',
    }

    try:
        # In production, the request comes in via POST
        url = request.POST.get('url')
        force = request.POST.get('force', 'false')
        pattern = re.compile('user-rate-')
        urls = re.split(pattern, url)
        user_id = urls[1]
        pattern = re.compile(r'\w+', re.S)
        user_id = re.search(pattern, user_id).group()
        sql = SqlHelper()

        utils.log('user_id:%s' % user_id)

        if 'rate.taobao.com' in url and user_id is not None:
            data['status'] = 'success'
            data['guid'] = str(random.randint(1000000000000,
                                              9999999999999)) + '_' + str(
                                                  random.randint(100, 999))
            data['info'] = '成功接收数据,正在为您抓取并分析数据,精彩稍候呈现'

            command = "SELECT id FROM {table} WHERE id={user_id}". \
                format(table = config.tb_item_table, user_id = user_id)
            result = sql.query_one(command)

            if result is None:
                name = 'tb_comment'
                cmd = 'python manage.py real_time_analysis -a name={name} -a guid={guid} ' \
                      '-a user_id={user_id} -a url={url};'. \
                    format(url = str(url), name = name, dir = settings.BASE_DIR, guid = data.get('guid'),
                           user_id = user_id)

                logging.warn(cmd)
                subprocess.Popen(cmd, shell=True)
            else:
                if force == 'false':
                    utils.log('数据库中存在数据,从数据库中取出分析结果')
                    command = "SELECT * FROM {0} WHERE user_id={1} ORDER BY id". \
                        format(config.analysis_item_table, user_id)
                    result = sql.query(command)
                    for res in result:
                        utils.push_redis(data.get('guid'),
                                         res[1],
                                         res[2],
                                         res[3],
                                         save_to_mysql=False)
                else:
                    command = "DELETE FROM {0} WHERE produce_id={1}".format(
                        config.analysis_item_table, user_id)
                    sql.execute(command)
                    # Re-analyse the data
                    cmd = 'cd {dir};python manage.py analysis -a url={url} -a name={name} -a guid={guid} -a ' \
                          'user_id={user_id};'. \
                        format(url = url, name = 'tb', dir = settings.BASE_DIR, guid = data.get('guid'),
                               user_id = user_id)

                    subprocess.Popen(cmd, shell=True)
        else:
            data['info'] = '传入网址有误,请检查后重新输入,请输入以下格式的网址:\n%s' % 'https://rate.taobao.com/user-rate-UvGv0MFc0vFILvgTT.htm'
    except Exception as e:
        logging.error('run spider exception:%s' % e)
        data['info'] = '出现错误,错误原因:%s' % e
Example #10
class GameInfo(CrawlSpider):
    name = 'game_info'

    def __init__(self, *a, **kw):
        super(GameInfo, self).__init__(*a, **kw)

        self.dir_game = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()

        utils.make_dir(self.dir_game)

        self.error_count = 0

    def init(self):
        command = ("CREATE TABLE IF NOT EXISTS {} ("
                   "`id` INT(8) NOT NULL AUTO_INCREMENT,"
                   "`name` TEXT NOT NULL,"
                   "`price` INT(5) NOT NULL,"
                   "`metacritic_score` FLOAT DEFAULT NULL,"
                   "`user_reviews_count` INT(6) NOT NULL,"
                   "`positive_user_reviews_count` INT(6) NOT NULL,"
                   "`positive_percent` FLOAT NOT NULL ,"
                   "`negative_user_reviews_count` INT(6) NOT NULL,"
                   '`steam_user_reviews_count` INT(6) NOT NULL,'
                   '`non_steam_user_reviews_count` INT(6) NOT NULL,'
                   '`english_user_reviews_count` INT(6) NOT NULL,'
                   '`non_english_user_reviews_count` INT(6) NOT NULL,'
                   "`tag_list` TEXT DEFAULT NULL,"
                   "`achievements_count` INT(4) DEFAULT NULL,"
                   "`category` TEXT NOT NULL,"
                   "`genre` TEXT NOT NULL,"
                   "`developer` TEXT NOT NULL,"
                   "`publisher` TEXT NOT NULL,"
                   "`release_date` TEXT NOT NULL,"
                   "`url` TEXT NOT NULL,"
                   "`language_number` INT(3) DEFAULT NULL,"
                   "`description` TEXT DEFAULT NULL,"
                   "`save_time` TIMESTAMP NOT NULL,"
                   "PRIMARY KEY(id)"
                   ") ENGINE=InnoDB".format(config.steam_game_info_table))
        self.sql.create_table(command)

    def start_requests(self):
        command = "SELECT * FROM {} WHERE is_crawled = \'no\' AND type = \'app\'".format(
            config.steam_game_urls_table)
        data = self.sql.query(command)
        for i, item in enumerate(data):
            yield Request(
                url=item[3],
                dont_filter=True,
                method='GET',
                headers={
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Encoding': 'gzip, deflate',
                    'Accept-Language': 'en-US,en;q=0.5',
                    'Connection': 'keep-alive',
                    'Host': 'store.steampowered.com',
                    'Upgrade-Insecure-Requests': '1',
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 '
                                  'Firefox/51.0',
                },
                meta={
                    'item': item,
                    'id': item[0],
                },
                cookies={
                    'mature_content': '1',
                },
                callback=self.parse_game,
                errback=self.error_parse,
            )

    def parse_game(self, response):
        self.log('parse_game url:%s' % response.url)
        id = response.meta.get('id')

        # file_name = '%s/%s.html' % (self.dir_game, id)
        # self.save_page(file_name, response.body)

        if u'Please enter your birth date to continue' in response.body:
            self.log('Please enter your birth date to continue meta:%s' %
                     response.meta)

            url = 'http://store.steampowered.com/agecheck/app/%s/' % str(id)
            return FormRequest(url=url,
                               dont_filter=True,
                               method='POST',
                               formdata={
                                   'ageDay': str(range(1, 25)),
                                   'ageMonth': 'January',
                                   'ageYear': str(range(1980, 1995)),
                                   'snr': '1_agecheck_agecheck__age-gate',
                               },
                               callback=self.parse_game)

        soup = BeautifulSoup(response.body, 'lxml')
        sel = Selector(text=response.body)

        name = sel.xpath(
            '//div[@class="apphub_AppName"]/text()').extract_first()
        if name is None or name == '':
            self.log('no get data meta:%s' % response.meta)
            return

        price = sel.xpath('//div[@class="game_purchase_price price"]/text()'
                          ).extract_first()
        try:
            p = price.split('¥')
            price = int(p[1])
        except Exception:
            price = -1

        # The game's score on Metacritic
        metacritic_score = sel.xpath(
            '//div[@class="score high"]/text()').extract_first()
        try:
            metacritic_score = int(metacritic_score)
        except Exception:
            metacritic_score = -1

        # Total number of user reviews
        user_reviews_count = sel.xpath(
            '//label[@for="review_type_all"]/span/text()').extract_first()
        user_reviews_count = self.count_to_int(user_reviews_count)

        # Number of positive user reviews
        positive_user_reviews_count = sel.xpath(
            '//label[@for="review_type_positive"]/span/text()').extract_first(
            )
        positive_user_reviews_count = self.count_to_int(
            positive_user_reviews_count)

        # Percentage of positive reviews
        if user_reviews_count != -1 and positive_user_reviews_count != -1:
            positive_percent = positive_user_reviews_count * 1.0 / user_reviews_count * 100
        else:
            positive_percent = 0

        # Number of negative user reviews
        negative_user_reviews_count = sel.xpath(
            '//label[@for="review_type_negative"]/span/text()').extract_first(
            )
        negative_user_reviews_count = self.count_to_int(
            negative_user_reviews_count)

        # Number of reviews from users who bought the game on Steam
        steam_user_reviews_count = sel.xpath(
            '//label[@for="purchase_type_steam"]/span/text()').extract_first()
        steam_user_reviews_count = self.count_to_int(steam_user_reviews_count)

        # Number of reviews from users who bought the game elsewhere
        non_steam_user_reviews_count = sel.xpath(
            '//label[@for="purchase_type_non_steam"]/span/text()'
        ).extract_first()
        non_steam_user_reviews_count = self.count_to_int(
            non_steam_user_reviews_count)

        # Number of English-language reviews
        english_user_reviews_count = sel.xpath(
            '//label[@for="review_language_mine"]/span/text()').extract_first(
            )
        english_user_reviews_count = self.count_to_int(
            english_user_reviews_count)

        # Number of non-English reviews
        non_english_user_reviews_count = user_reviews_count - english_user_reviews_count

        # The game's tag list
        try:
            tags = soup.find(attrs={'class': 'glance_tags popular_tags'})
            tag_list = tags.text.replace('\t', '')
            tag_list = tag_list.replace('\n', ',')
        except Exception:
            tag_list = ''

        # Number of achievements for the game
        achievements = sel.xpath(
            '//div[@id="achievement_block"]/div/text()').extract_first()
        try:
            achievements_count = re.search(r'\d+', achievements, re.S).group(0)
            achievements_count = int(achievements_count)
        except Exception:
            achievements_count = 0

        # The game's category, e.g. All Games > Action Games > Counter-Strike
        try:
            category = soup.find(name='div', attrs={
                'class': 'breadcrumbs'
            }).text
            category = category.replace('\t', '')
            category = category.replace('\n', '')
        except Exception:
            category = ''

        # Game genre
        genre = sel.xpath(
            '//div[@class="block_content"]/div/div/a/text()').extract_first()

        # Game developer
        developer = sel.xpath(
            '//div[@class="block_content"]/div/div/a[2]/text()').extract_first(
            )

        # Game publisher
        publisher = sel.xpath(
            '//div[@class="block_content"]/div/div/a[3]/text()').extract_first(
            )

        # Game release date
        release_date = sel.xpath(
            '//div[@class="release_date"]/span/text()').extract_first()

        # Number of languages the game supports
        language_number = len(
            sel.xpath(
                '//table[@class="game_language_options"]/tr').extract()) - 1

        # Game description
        description = sel.xpath(
            '//div[@class="game_description_snippet"]/text()').extract_first()

        # Time this game was crawled
        save_time = None

        msg = (id, name, price, response.url, metacritic_score,
               user_reviews_count, positive_user_reviews_count,
               positive_percent, negative_user_reviews_count,
               steam_user_reviews_count, non_steam_user_reviews_count,
               english_user_reviews_count, non_english_user_reviews_count,
               tag_list, achievements_count, category, genre, developer,
               publisher, release_date, language_number, description,
               save_time)

        command = (
            "INSERT IGNORE INTO {} "
            "(id, name, price, url, metacritic_score, user_reviews_count, positive_user_reviews_count, "
            "positive_percent, negative_user_reviews_count, steam_user_reviews_count, "
            "non_steam_user_reviews_count, english_user_reviews_count, non_english_user_reviews_count, "
            "tag_list, achievements_count, category, genre, developer, publisher, release_date, "
            "language_number, description, save_time)"
            "VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, "
            "%s)".format(config.steam_game_info_table))

        self.sql.insert_data(command, msg)

        command = "UPDATE {0} SET is_crawled=\'yes\' WHERE id=\'{1}\'".format(
            config.steam_game_urls_table, id)
        self.sql.execute(command)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))

    def get_id(self, url):
        type = ''
        if '/sub/' in url:
            pattern = re.compile(r'/sub/(\d+)/')
            type = 'sub'
        elif '/app/' in url:
            pattern = re.compile(r'/app/(\d+)/', re.S)
            type = 'app'
        elif '/bundle/' in url:
            pattern = re.compile(r'/bundle/(\d+)/', re.S)
            type = 'bundle'
        else:
            pattern = re.compile(r'/(\d+)/', re.S)
            type = 'other'
            utils.log('get_id other url:%s' % url)

        id = re.search(pattern, url)
        if id:
            id = id.group(1)
            return id

        self.error_count = self.error_count + 1
        utils.log('get_id error url:%s' % url)
        return -self.error_count

    def count_to_int(self, data):
        try:
            ret = data
            ret = ret.replace('(', '')
            ret = ret.replace(')', '')
            ret = ret.replace(',', '')

            return int(ret)
        except Exception:
            return -1

    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)