Example #1
class BaseSpider(Spider):
    name = 'basespider'

    def __init__(self, *a, **kw):
        super(BaseSpider, self).__init__(*a, **kw)

        self.urls = []
        self.headers = {}
        self.timeout = 10

        self.sql = SqlHelper()

        self.dir_log = 'log/proxy/%s' % self.name

        self.is_record_web_page = False

    def init(self):
        self.meta = {
            'download_timeout': self.timeout,
        }

        utils.make_dir(self.dir_log)

        command = utils.get_create_table_command(config.free_ipproxy_table)
        self.sql.execute(command)

    def start_requests(self):
        for i, url in enumerate(self.urls):
            yield Request(
                url=url,
                headers=self.headers,
                meta=self.meta,
                dont_filter=True,
                callback=self.parse_page,
                errback=self.error_parse,
            )

    def parse_page(self, response):
        self.write(response.body)

    def error_parse(self, failure):
        request = failure.request
        pass

    def add_proxy(self, proxy):
        utils.sql_insert_proxy(self.sql, config.free_ipproxy_table, proxy)

    def write(self, data):
        if self.is_record_web_page:
            filename = '%s/%s.html' % (
                self.dir_log,
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f'))
            with open(filename, 'w') as f:
                f.write(data)

    def close(spider, reason):
        spider.sql.commit()
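
A minimal usage sketch of the BaseSpider above, assuming it and the project's utils/config helpers are importable; the source URL, XPath expressions and proxy fields are hypothetical and only illustrate how a concrete spider fills in urls, headers and parse_page:

class ExampleProxySpider(BaseSpider):
    name = 'exampleproxy'

    def __init__(self, *a, **kw):
        super(ExampleProxySpider, self).__init__(*a, **kw)

        # hypothetical free-proxy listing page and headers
        self.urls = ['http://www.example.com/free-proxy-list']
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.init()

    def parse_page(self, response):
        self.write(response.body)
        # hypothetical page layout: one <tr> per proxy with ip/port cells;
        # the exact fields utils.sql_insert_proxy expects are project-specific
        for row in response.xpath('//table//tr'):
            ip = row.xpath('td[1]/text()').extract_first()
            port = row.xpath('td[2]/text()').extract_first()
            if ip and port:
                self.add_proxy({'ip': ip, 'port': port})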
Example #2
    def GET(self):
        try:
            sql = SqlHelper()
            inputs = web.input()
            name = inputs.get('name')
            ip = inputs.get('ip')
            command = "DELETE FROM {0} WHERE ip=\'{1}\'".format(name, ip)
            sql.execute(command)

            command = "SELECT ip FROM {0} WHERE ip=\'{1}\'".format(name, ip)
            res = sql.query_one(command)
            return res is None
        except Exception:
            pass
        return False
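
Both statements above interpolate request parameters straight into the SQL text, so a crafted name or ip value can change the query. A hedged sketch of the same handler using parameterized queries, assuming a plain pymysql connection rather than the project's SqlHelper (whose execute signature may not accept bound parameters); the connection settings and class name are placeholders:

import pymysql
import web


class DeleteIp(object):
    def GET(self):
        inputs = web.input()
        name = inputs.get('name')
        ip = inputs.get('ip')
        # placeholder connection settings
        conn = pymysql.connect(host='localhost', user='root', password='',
                               db='ipproxy', charset='utf8')
        try:
            with conn.cursor() as cursor:
                # a table name cannot be bound as a parameter, so validate it
                # against a known whitelist before formatting it in
                cursor.execute("DELETE FROM {0} WHERE ip=%s".format(name), (ip,))
                cursor.execute("SELECT ip FROM {0} WHERE ip=%s".format(name), (ip,))
                res = cursor.fetchone()
            conn.commit()
            return res is None
        except Exception:
            return False
        finally:
            conn.close()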
Example #3
class SendSms(object):
    def __init__(self):
        self.sql = SqlHelper()

        self.weather_table_name = config.weather_table
        self.user_table_name = config.user_table

    def send_sms(self):
        command = ("SELECT * FROM {};".format(self.user_table_name))
        self.sql.execute(command)
        users = self.sql.cursor.fetchall()
        if users is not None:
            for user in users:
                utils.log('send_sms get user info user:%s' % str(user))
                # Check the user-defined time; only send the SMS when it matches
                user_time = user[5]  # user-defined send time, e.g. '08:30'
                time_info = user_time.split(':')

                u_hour = time_info[0]
                u_minute = time_info[1]

                # Get the current system time
                s_hour = datetime.datetime.now().hour
                s_minute = datetime.datetime.now().minute

                if int(u_hour) == s_hour and int(u_minute) == s_minute:
                    utils.log('send sms to user:%s' % str(user))

                    command = (
                        "select * from {0} where city_name='{1}' order by id desc limit 1;"
                        .format(self.weather_table_name, user[3]))
                    self.sql.execute(command)
                    weather = self.sql.cursor.fetchone()
                    if weather is not None:
                        temp_code = 'SMS_41855112'
                        phone = user[2]
                        info = {
                            'name': user[1],
                            'city': user[3],
                            'weather': weather[15],
                            'temp': '%s ~ %s' % (weather[9], weather[8]),
                            'aqilevel': utils.get_aqi_level_info(weather[12]),
                        }

                        sms = AliyunSms()
                        sms.send_sms(temp_code, info, phone)
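
The minute-equality check in send_sms only fires if the method happens to run during the exact minute a user configured, so it has to be driven by a loop or scheduler that wakes up roughly once a minute. A minimal sketch of such a driver (the scheduling interval is an assumption; the original project may use cron or another scheduler):

import datetime
import time


def run_sms_loop():
    sender = SendSms()
    while True:
        sender.send_sms()  # compares each user's configured HH:MM with now
        # sleep until just past the next minute boundary so every slot is checked once
        time.sleep(61 - datetime.datetime.now().second)

if __name__ == '__main__':
    run_sms_loop()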
Example #4
class Validator(Spider):
    name = 'base'
    concurrent_requests = 16
    retry_enabled = False

    def __init__(self, name=None, **kwargs):
        super(Validator, self).__init__(name, **kwargs)
        self.sql = SqlHelper()

        self.dir_log = 'log/validator/%s' % self.name
        self.timeout = 10

        self.urls = []
        self.headers = None
        self.success_mark = ''
        self.is_record_web_page = False

    def init(self):
        utils.make_dir(self.dir_log)

        command = utils.get_create_table_command(self.name)
        self.sql.create_table(command)

    @classmethod
    def update_settings(cls, settings):
        settings.setdict(cls.custom_settings or {
            'CONCURRENT_REQUESTS': cls.concurrent_requests,
            'RETRY_ENABLED': cls.retry_enabled,
        }, priority='spider')

    def start_requests(self):
        count = utils.get_table_length(self.sql, self.name)
        count_free = utils.get_table_length(self.sql, config.httpbin_table)

        ids = utils.get_table_ids(self.sql, self.name)
        ids_free = utils.get_table_ids(self.sql, config.httpbin_table)

        for i in range(0, count + count_free):
            table = self.name if (i < count) else config.httpbin_table
            id = ids[i] if i < count else ids_free[i - len(ids)]

            proxy = utils.get_proxy_info(self.sql, table, id)
            if proxy is None:
                continue

            for url in self.urls:
                cur_time = time.time()
                yield Request(
                    url=url,
                    headers=self.headers,
                    meta={
                        'cur_time': cur_time,
                        'download_timeout': self.timeout,
                        'proxy_info': proxy,
                        'table': table,
                        'id': proxy.get('id'),
                        'proxy': 'http://%s:%s' % (proxy.get('ip'), proxy.get('port')),
                        'vali_count': proxy.get('vali_count', 0),
                    },
                    dont_filter=True,
                    callback=self.success_parse,
                    errback=self.error_parse,
                )

    def success_parse(self, response):
        utils.log('success_parse speed:%s meta:%s' %
                  (time.time() - response.meta.get('cur_time'), response.meta))

        proxy = response.meta.get('proxy_info')
        table = response.meta.get('table')
        id = response.meta.get('id')
        ip = proxy.get('ip')

        self.save_page(ip, response.body)

        if self.success_mark == '' or self.success_mark in response.body:
            speed = time.time() - response.meta.get('cur_time')
            if table == self.name:
                if speed > self.timeout:
                    command = utils.get_delete_data_command(table, id)
                    self.sql.execute(command)
                else:
                    vali_count = response.meta.get('vali_count', 0) + 1
                    command = utils.get_update_data_command(
                        table, id, speed, vali_count)
                    self.sql.execute(command)
            else:
                if speed < self.timeout:
                    command = utils.get_insert_data_command(self.name)
                    msg = (None, proxy.get('ip'), proxy.get('port'),
                           proxy.get('country'), proxy.get('anonymity'),
                           proxy.get('https'), speed, proxy.get('source'),
                           None, 1)

                    self.sql.insert_data(command, msg, commit=True)
        else:
            # The success mark was not found, so the response is invalid;
            # delete this IP from the current table
            if table == self.name:
                command = utils.get_delete_data_command(table, id)
                self.sql.execute(command)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse value:%s url:%s meta:%s' %
                  (failure.value, request.url, request.meta))

        proxy = failure.request.meta.get('proxy_info')
        table = failure.request.meta.get('table')
        id = failure.request.meta.get('id')

        if table == self.name:
            command = utils.get_delete_data_command(table, id)
            self.sql.execute(command)
        else:
            # TODO: when IP validation fails, handle it according to the specific error type
            pass

            #
            # request = failure.request.meta
            # utils.log('request meta:%s' % str(request))
            #
            # # log all errback failures,
            # # in case you want to do something special for some errors,
            # # you may need the failure's type
            # self.logger.error(repr(failure))
            #
            # #if isinstance(failure.value, HttpError):
            # if failure.check(HttpError):
            #     # you can get the response
            #     response = failure.value.response
            #     self.logger.error('HttpError on %s', response.url)
            #
            # #elif isinstance(failure.value, DNSLookupError):
            # elif failure.check(DNSLookupError):
            #     # this is the original request
            #     request = failure.request
            #     self.logger.error('DNSLookupError on %s', request.url)
            #
            # #elif isinstance(failure.value, TimeoutError):
            # elif failure.check(TimeoutError):
            #     request = failure.request
            #     self.logger.error('TimeoutError on url:%s', request.url)

    def save_page(self, ip, data):
        filename = '{time} {ip}'.format(
            time=datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f'),
            ip=ip)
        utils.log('filename:%s' % filename)

        if self.is_record_web_page:
            with open('%s/%s.html' % (self.dir_log, filename), 'w') as f:
                f.write(data)

    def close(spider, reason):
        spider.sql.commit()
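
A minimal usage sketch of the Validator above, assuming it is importable from the project; the target URL and success mark are illustrative (config.httpbin_table in the code suggests httpbin-style validation, but the concrete values here are assumptions):

class HttpBinValidator(Validator):
    name = 'httpbin'

    def __init__(self, name=None, **kwargs):
        super(HttpBinValidator, self).__init__(name, **kwargs)

        # httpbin echoes the requesting IP, so a successful proxied request
        # should contain the 'origin' field in its body
        self.urls = ['http://httpbin.org/get']
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.success_mark = 'origin'
        self.init()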
Example #5
def runspider(request):
    data = {
        'status': 'failure',
        'guid': '0',
        'info': '',
    }

    try:
        # In production, the request comes in via POST
        url = request.POST.get('url')
        force = request.POST.get('force', 'false')
        pattern = re.compile(r'\d+', re.S)
        product_id = re.search(pattern, url).group()
        sql = SqlHelper()

        utils.log('product_id:%s' % product_id)

        if 'item.jd.com' in url and product_id is not None:
            data['status'] = 'success'
            data['guid'] = str(uuid.uuid4())
            data['info'] = '成功接收数据,正在为您抓取并分析数据,精彩稍候呈现'

            command = "SELECT id FROM {table} WHERE id={product_id}". \
                format(table = config.jd_item_table, product_id = product_id)
            result = sql.query_one(command)

            if result is None:
                name = 'jd'
                cmd = 'cd {dir};python manage.py real_time_analysis -a name={name} -a guid={guid} ' \
                      '-a product_id={product_id} -a url={url};'. \
                    format(url = str(url), name = name, dir = settings.BASE_DIR, guid = data.get('guid'),
                           product_id = product_id)

                subprocess.Popen(cmd, shell=True)
            else:
                if force == 'false':
                    utils.log('数据库中存在数据,从数据库中取出分析结果')
                    command = "SELECT * FROM {0} WHERE product_id={1} ORDER BY id". \
                        format(config.analysis_item_table, product_id)
                    result = sql.query(command)
                    for res in result:
                        utils.push_redis(data.get('guid'),
                                         res[1],
                                         res[2],
                                         res[3],
                                         save_to_mysql=False)
                else:
                    command = "DELETE FROM {0} WHERE produce_id={1}".format(
                        config.analysis_item_table, product_id)
                    sql.execute(command)
                    # Re-analyse the data
                    cmd = 'cd {dir};python manage.py analysis -a url={url} -a name={name} -a guid={guid} -a ' \
                          'product_id={product_id};'. \
                        format(url = url, name = 'jd', dir = settings.BASE_DIR, guid = data.get('guid'),
                               product_id = product_id)

                    subprocess.Popen(cmd, shell=True)
        else:
            data['info'] = '传入网址有误,请检查后重新输入,请输入以下格式的网址:\n%s' % 'https://item.jd.com/3995645.html'
    except Exception as e:
        logging.error('run spider exception:%s' % e)
        data['info'] = '出现错误,错误原因:%s' % e
Example #6
    sql = SqlHelper()

    spiders = [
        XiCiDaiLiSpider,
        SixSixIpSpider,
        IpOneEightOneSpider,
        KuaiDaiLiSpider,  # the site adds a JS check before access (anti-crawling)
        GatherproxySpider,
        HidemySpider,
        ProxylistplusSpider,
        FreeProxyListsSpider,
        # PeulandSpider,  # the target site is no longer available
        UsProxySpider,
        ProxyDBSpider,
        ProxyRoxSpider,
    ]

    while True:
        utils.log('*******************run spider start...*******************')

        command = "DELETE FROM {table} where save_time < SUBDATE(NOW(), INTERVAL 0.2 DAY)".format(
                table = config.free_ipproxy_table)
        sql.execute(command)

        for spider in spiders:
            scrapydo.run_spider(spider)

        utils.log('*******************run spider waiting...*******************')
        time.sleep(1200)
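
This fragment assumes the module has already initialised scrapydo: scrapydo.setup() has to be called once (it installs the crochet-managed reactor) before scrapydo.run_spider() can be used. A sketch of the missing preamble, with project import paths as assumptions:

import time

import scrapydo

import config
import utils
from sqlhelper import SqlHelper               # project helper, path assumed
from proxy.spiders import XiCiDaiLiSpider     # project spiders, paths assumed

scrapydo.setup()  # must run once before any scrapydo.run_spider() call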
Example #7
class Validator(Spider):
    name = 'base'

    def __init__(self, name=None, **kwargs):
        super(Validator, self).__init__(name, **kwargs)
        self.sql = SqlHelper()

        self.dir_log = 'log/validator/%s' % self.name
        self.timeout = 10

        self.urls = []
        self.headers = None
        self.success_mark = ''

    def init(self):
        utils.make_dir(self.dir_log)

        command = utils.get_create_table_command(self.name)
        self.sql.create_table(command)

    def start_requests(self):
        count = utils.get_table_length(self.sql, self.name)
        count_free = utils.get_table_length(self.sql, free_ipproxy_table)

        for i in range(0, count + count_free):
            table = self.name if (i < count) else free_ipproxy_table

            proxy = utils.get_proxy_info(self.sql, table, i)
            if proxy is None:
                continue

            for url in self.urls:
                cur_time = time.time()
                yield Request(
                    url=url,
                    headers=self.headers,
                    meta={
                        'cur_time': cur_time,
                        'download_timeout': self.timeout,
                        'proxy_info': proxy,
                        'table': table,
                        'id': proxy.get('id'),
                        'proxy': 'http://%s:%s' % (proxy.get('ip'), proxy.get('port')),
                    },
                    dont_filter=True,
                    callback=self.success_parse,
                    errback=self.error_parse,
                )

    def success_parse(self, response):
        utils.log('name:%s success_parse proxy:%s meta:%s' %
                  (self.name, str(
                      response.meta.get('proxy_info')), str(response.meta)))

        filename = datetime.datetime.now().strftime('%Y-%m-%d %H_%M_%S_%f')
        self.save_page(filename, response.body)

        if self.success_mark == '' or self.success_mark in response.body:
            proxy = response.meta.get('proxy_info')
            speed = time.time() - response.meta.get('cur_time')
            table = response.meta.get('table')
            id = response.meta.get('id')

            utils.log('speed:%s table:%s id:%s' % (speed, table, id))

            if table == self.name:
                if speed > self.timeout:
                    command = utils.get_delete_data_command(table, id)
                    self.sql.execute(command)
                else:
                    command = utils.get_update_data_command(table, id, speed)
                    self.sql.execute(command)
            else:
                if speed < self.timeout:
                    command = utils.get_insert_data_command(self.name)
                    msg = (None, proxy.get('ip'), proxy.get('port'),
                           proxy.get('country'), proxy.get('anonymity'),
                           proxy.get('https'), speed, proxy.get('source'),
                           None)

                    self.sql.insert_data(command, msg)

    def error_parse(self, failure):
        utils.log('error_parse value:%s' % failure.value)

        proxy = failure.request.meta.get('proxy_info')
        table = failure.request.meta.get('table')
        id = failure.request.meta.get('id')

        if table == self.name:
            command = utils.get_delete_data_command(table, id)
            self.sql.execute(command)
        else:
            # TODO: when IP validation fails, handle it according to the specific error type
            pass

            #
            # request = failure.request.meta
            # utils.log('request meta:%s' % str(request))
            #
            # # log all errback failures,
            # # in case you want to do something special for some errors,
            # # you may need the failure's type
            # self.logger.error(repr(failure))
            #
            # #if isinstance(failure.value, HttpError):
            # if failure.check(HttpError):
            #     # you can get the response
            #     response = failure.value.response
            #     self.logger.error('HttpError on %s', response.url)
            #
            # #elif isinstance(failure.value, DNSLookupError):
            # elif failure.check(DNSLookupError):
            #     # this is the original request
            #     request = failure.request
            #     self.logger.error('DNSLookupError on %s', request.url)
            #
            # #elif isinstance(failure.value, TimeoutError):
            # elif failure.check(TimeoutError):
            #     request = failure.request
            #     self.logger.error('TimeoutError on url:%s', request.url)

    def save_page(self, filename, data):
        if get_project_settings().get('IS_RECODE_HTML', False):
            with open('%s/%s.html' % (self.dir_log, filename), 'w') as f:
                f.write(data)
Example #8
def main(n):
    _tc = TimeCnt()
    _tc.cnt_time()
    #model_labels_2 = model_labels_2
    #model_labels = model_labels
    #model_columns_base = model_columns_base
    #print("> this is sql test")
    #shandong_ = SqlHelper(Config_map_shandong)
    #guangxi_ = SqlHelper(Config_guangxi)
    #storm_shandong_= SqlHelper(Configstormshandong)
    #storm_110_ = SqlHelper(Configooo)
    _tc.cnt_time()
    today = todayStr()

    #from load_mysqL_from_localcpk import load_mongodb_conn
    #bond_risk_ = load_mysqL_from_localcpk.load_mongodb_conn()
    mysql_bond_risk_ = SqlHelper(Config_bond_risk)
    #cursor = bond_risk_.find()
    #for i in cursor:
        #print(i)
        #pdb.set_trace()
    #    timeFormat(i['date_input'])
    label_120_ = mysql_bond_risk_.execute("select label from middleTable where (to_days(now()) - to_days(date)>=%d);"% n)
    compname_120_ = mysql_bond_risk_.execute("select compname from middleTable where (to_days(now()) - to_days(date)>=%d);"% n)
    #label_120_ = mysql_bond_risk_.execute("select label from middleTable where (to_days(now()) - to_days(date_input) <=720 and to_days(now()) - to_days(date)<=%d);"% n)
    #_tc.cnt_time()
    #compname_120_ = mysql_bond_risk_.execute("select compname from middleTable where (to_days(now()) - to_days(date_input) <=720 and to_days(now()) - to_days(date)<=%d);"% n)
    #label_120_, compname_120_ = get_label_time_window(bond_risk_, n, n-120)
    # deduplicate labels and company names
    label_lst_ = list(set(label_120_))
    _tc.cnt_time()
    compname_lst_ = list(set(compname_120_))
    _dic = get_label_120(mysql_bond_risk_, compname_lst_, label_lst_, n)
    _tc.cnt_time()
    _panel = pd.Panel(_dic)
    _panel = _panel.fillna(0.0)
    #filter_data_by_time(bond_risk_, today)
    _tc.cnt_time()

    df_4_model = pd.DataFrame(index=compname_lst_, columns = model_labels + model_labels_2)
    df_4_model = df_4_model.fillna(0.0)
    df_4_model = df_4_model.astype(np.float64)
    _index = list(df_4_model.index)
    _columns = model_labels
    _columns_2 = model_labels_2
    #print("> ready to get data")
    _cnt = 0
    for i in _index:
        _cnt += 1
        if _cnt % 100 == 1:
            pass
            #print(">>>> !!! handle the,", i, _cnt)
        #if _cnt > 300:
            #break
            #print("> handle the,", i, _cnt)
        for c in _columns:
            df_4_model.loc[i,c] = cell_fill(_panel, i,c)
        df_4_model.loc[i, "企业名称"] = i
        df_4_model.loc[i, "发布日期"] = datetime.datetime.now()
        df_4_model.loc[i, "credit_recent"] = 0
        df_4_model.loc[i, "credit_ago"] = 0
        df_4_model.loc[i, "credit_trend"] = 0

        df_4_model.loc[i, "60"] = _panel[i].loc[60,:].sum()
        df_4_model.loc[i, "120"] = _panel[i].loc[120,:].sum()
        df_4_model.loc[i, "180"] = _panel[i].loc[180,:].sum()
        df_4_model.loc[i, "债券风险60"] = group_cnt_key_word("债券风险60",i,_panel)
        df_4_model.loc[i, "债券风险120"] = group_cnt_key_word("债券风险120",i,_panel)
        df_4_model.loc[i, "债券风险180"] = group_cnt_key_word("债券风险180",i,_panel)
        df_4_model.loc[i, "个人风险60"] = group_cnt_key_word("个人风险60",i,_panel)
        df_4_model.loc[i, "个人风险120"] = group_cnt_key_word("个人风险120",i,_panel)
        df_4_model.loc[i, "个人风险180"] = group_cnt_key_word("个人风险180",i,_panel)
        df_4_model.loc[i, "财务风险60"] = group_cnt_key_word("财务风险60",i,_panel)
        df_4_model.loc[i, "财务风险120"] = group_cnt_key_word("财务风险120",i,_panel)
        df_4_model.loc[i, "财务风险180"] = group_cnt_key_word("财务风险180",i,_panel)
        df_4_model.loc[i, "经营风险60"] = group_cnt_key_word("经营风险60",i,_panel)
        df_4_model.loc[i, "经营风险120"] = group_cnt_key_word("经营风险120",i,_panel)
        df_4_model.loc[i, "经营风险180"] = group_cnt_key_word("经营风险180",i,_panel)
        df_4_model.loc[i, "行业风险60"] = group_cnt_key_word("行业风险60",i,_panel)
        df_4_model.loc[i, "行业风险120"] = group_cnt_key_word("行业风险120",i,_panel)
        df_4_model.loc[i, "行业风险180"] = group_cnt_key_word("行业风险180",i,_panel)
        df_4_model.loc[i, "企业风险60"] = group_cnt_key_word("企业风险60",i,_panel)
        df_4_model.loc[i, "企业风险120"] = group_cnt_key_word("企业风险120",i,_panel)
        df_4_model.loc[i, "企业风险180"] = group_cnt_key_word("企业风险180",i,_panel)
        #df_4_model = df_4_model.applymap(lambda x : np.NaN if x==0 else x)
        df_4_model.loc[i, "sub120_60"] = df_4_model.loc[i, "120"] - df_4_model.loc[i, "60"]
        df_4_model.loc[i, "sub180_120"] = df_4_model.loc[i, "180"] - df_4_model.loc[i, "120"]
        #df_4_model = df_4_model.applymap(lambda x : np.NaN if x==-1 else x)
        #df_4_model = df_4_model.applymap(lambda x : np.NaN if x==0 else x)

    _x = df_4_model.drop(["企业名称","发布日期","Label"],1)
    _z = pd.read_csv("/home/siyuan/bond_risk/_z.csv").drop(["Unnamed: 0","发布日期","Label"],1)
    #_z.index = _z["企业名称"]
    _z = _z.drop("企业名称", axis=1)
    _x.columns = list(_z.columns)
    # !! filter
    #_x = _x[(_x["sub120_60"]>0) & (_x["60"]>0)]
    #_x = _x[(_x["60"]>0)]
    _x = _x[(_x["120"]>0)]
    train_separator = len(_x.index)
    #print(train_separator)
    _pred_data = pd.concat([_x, _z], axis=0)

    _pred_data = set_dummy(_pred_data, False)
    # output predict label
    bst = xgb.Booster()
    bst.load_model("/home/siyuan/data/xgb.model")
    #pdb.set_trace()

    #_lz = pd.read_csv("/home/siyuan/bond_risk/_z.csv")["Label"]
    result_ = predict(bst, _pred_data, _pred_data.iloc[1])
    dict_ = dict(zip(list(_pred_data.index), result_))
    dict_res = dict(zip(list(_pred_data.index)[:train_separator], result_[:train_separator]))
    #dict_res = dict(zip(list(_pred_data.index), result_))

    #print(collections.Counter(list(result_)))
    #print(collections.Counter(list(result_)[:train_separator]))
    cnt = 0
    #pdb.set_trace()
    #print(dict_res)
    for i in dict_res.keys():
        sql_ = "INSERT INTO resultTable VALUES('', '%s', CURTIME(), '%s');"%(i,str(format(dict_res[i],'.9e')))
        #print(sql_)
        sql_res_ = mysql_bond_risk_.execute(sql_)
        #print(sql_res_)
        cnt += 1
    mysql_bond_risk_.connect.commit()
Example #9
def runspider(request):
    data = {
        'status': 'failure',
        'guid': '0',
        'info': '',
    }

    try:
        # In production, the request comes in via POST
        url = request.POST.get('url')
        force = request.POST.get('force', 'false')
        pattern = re.compile('user-rate-')
        urls = re.split(pattern, url)
        user_id = urls[1]
        pattern = re.compile(r'\w+', re.S)
        user_id = re.search(pattern, user_id).group()
        sql = SqlHelper()

        utils.log('user_id:%s' % user_id)

        if 'rate.taobao.com' in url and user_id is not None:
            data['status'] = 'success'
            data['guid'] = str(random.randint(1000000000000,
                                              9999999999999)) + '_' + str(
                                                  random.randint(100, 999))
            data['info'] = '成功接收数据,正在为您抓取并分析数据,精彩稍候呈现'

            command = "SELECT id FROM {table} WHERE id={user_id}". \
                format(table = config.tb_item_table, user_id = user_id)
            result = sql.query_one(command)

            if result is None:
                name = 'tb_comment'
                cmd = 'python manage.py real_time_analysis -a name={name} -a guid={guid} ' \
                      '-a user_id={user_id} -a url={url};'. \
                    format(url = str(url), name = name, dir = settings.BASE_DIR, guid = data.get('guid'),
                           user_id = user_id)

                logging.warn(cmd)
                subprocess.Popen(cmd, shell=True)
            else:
                if force == 'false':
                    utils.log('数据库中存在数据,从数据库中取出分析结果')
                    command = "SELECT * FROM {0} WHERE user_id={1} ORDER BY id". \
                        format(config.analysis_item_table, user_id)
                    result = sql.query(command)
                    for res in result:
                        utils.push_redis(data.get('guid'),
                                         res[1],
                                         res[2],
                                         res[3],
                                         save_to_mysql=False)
                else:
                    command = "DELETE FROM {0} WHERE produce_id={1}".format(
                        config.analysis_item_table, user_id)
                    sql.execute(command)
                    # Re-analyse the data
                    cmd = 'cd {dir};python manage.py analysis -a url={url} -a name={name} -a guid={guid} -a ' \
                          'user_id={user_id};'. \
                        format(url = url, name = 'tb', dir = settings.BASE_DIR, guid = data.get('guid'),
                               user_id = user_id)

                    subprocess.Popen(cmd, shell=True)
        else:
            data['info'] = '传入网址有误,请检查后重新输入,请输入以下格式的网址:\n%s' % 'https://rate.taobao.com/user-rate-UvGv0MFc0vFILvgTT.htm'
    except Exception as e:
        logging.error('run spider exception:%s' % e)
        data['info'] = '出现错误,错误原因:%s' % e
Example #10
class GameInfo(CrawlSpider):
    name = 'game_info'

    def __init__(self, *a, **kw):
        super(GameInfo, self).__init__(*a, **kw)

        self.dir_game = 'log/%s' % self.name
        self.sql = SqlHelper()
        self.init()

        utils.make_dir(self.dir_game)

        self.error_count = 0

    def init(self):
        command = ("CREATE TABLE IF NOT EXISTS {} ("
                   "`id` INT(8) NOT NULL AUTO_INCREMENT,"
                   "`name` TEXT NOT NULL,"
                   "`price` INT(5) NOT NULL,"
                   "`metacritic_score` FLOAT DEFAULT NULL,"
                   "`user_reviews_count` INT(6) NOT NULL,"
                   "`positive_user_reviews_count` INT(6) NOT NULL,"
                   "`positive_percent` FLOAT NOT NULL ,"
                   "`negative_user_reviews_count` INT(6) NOT NULL,"
                   '`steam_user_reviews_count` INT(6) NOT NULL,'
                   '`non_steam_user_reviews_count` INT(6) NOT NULL,'
                   '`english_user_reviews_count` INT(6) NOT NULL,'
                   '`non_english_user_reviews_count` INT(6) NOT NULL,'
                   "`tag_list` TEXT DEFAULT NULL,"
                   "`achievements_count` INT(4) DEFAULT NULL,"
                   "`category` TEXT NOT NULL,"
                   "`genre` TEXT NOT NULL,"
                   "`developer` TEXT NOT NULL,"
                   "`publisher` TEXT NOT NULL,"
                   "`release_date` TEXT NOT NULL,"
                   "`url` TEXT NOT NULL,"
                   "`language_number` INT(3) DEFAULT NULL,"
                   "`description` TEXT DEFAULT NULL,"
                   "`save_time` TIMESTAMP NOT NULL,"
                   "PRIMARY KEY(id)"
                   ") ENGINE=InnoDB".format(config.steam_game_info_table))
        self.sql.create_table(command)

    def start_requests(self):
        command = "SELECT * FROM {} WHERE is_crawled = \'no\' AND type = \'app\'".format(
            config.steam_game_urls_table)
        data = self.sql.query(command)
        for i, item in enumerate(data):
            yield Request(
                url=item[3],
                dont_filter=True,
                method='GET',
                headers={
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Encoding': 'gzip, deflate',
                    'Accept-Language': 'en-US,en;q=0.5',
                    'Connection': 'keep-alive',
                    'Host': 'store.steampowered.com',
                    'Upgrade-Insecure-Requests': '1',
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 '
                                  'Firefox/51.0',
                },
                meta={
                    'item': item,
                    'id': item[0],
                },
                cookies={
                    'mature_content': '1',
                },
                callback=self.parse_game,
                errback=self.error_parse,
            )

    def parse_game(self, response):
        self.log('parse_game url:%s' % response.url)
        id = response.meta.get('id')

        # file_name = '%s/%s.html' % (self.dir_game, id)
        # self.save_page(file_name, response.body)

        if u'Please enter your birth date to continue' in response.body:
            self.log('Please enter your birth date to continue meta:%s' %
                     response.meta)

            url = 'http://store.steampowered.com/agecheck/app/%s/' % str(id)
            return FormRequest(url=url,
                               dont_filter=True,
                               method='POST',
                               formdata={
                                   'ageDay': str(range(1, 25)),
                                   'ageMonth': 'January',
                                   'ageYear': str(range(1980, 1995)),
                                   'snr': '1_agecheck_agecheck__age-gate',
                               },
                               callback=self.parse_game)

        soup = BeautifulSoup(response.body, 'lxml')
        sel = Selector(text=response.body)

        name = sel.xpath(
            '//div[@class="apphub_AppName"]/text()').extract_first()
        if name is None or name == '':
            self.log('no get data meta:%s' % response.meta)
            return

        price = sel.xpath('//div[@class="game_purchase_price price"]/text()'
                          ).extract_first()
        try:
            p = price.split('¥')
            price = int(p[1])
        except Exception:
            price = -1

        # The game's score on Metacritic
        metacritic_score = sel.xpath(
            '//div[@class="score high"]/text()').extract_first()
        try:
            metacritic_score = int(metacritic_score)
        except Exception:
            metacritic_score = -1

        # Total number of user reviews
        user_reviews_count = sel.xpath(
            '//label[@for="review_type_all"]/span/text()').extract_first()
        user_reviews_count = self.count_to_int(user_reviews_count)

        # Number of positive user reviews
        positive_user_reviews_count = sel.xpath(
            '//label[@for="review_type_positive"]/span/text()').extract_first(
            )
        positive_user_reviews_count = self.count_to_int(
            positive_user_reviews_count)

        # Percentage of positive reviews
        if user_reviews_count != -1 and positive_user_reviews_count != -1:
            positive_percent = positive_user_reviews_count * 1.0 / user_reviews_count * 100
        else:
            positive_percent = 0

        # Number of negative user reviews
        negative_user_reviews_count = sel.xpath(
            '//label[@for="review_type_negative"]/span/text()').extract_first(
            )
        negative_user_reviews_count = self.count_to_int(
            negative_user_reviews_count)

        # Number of reviews from users who bought the game on Steam
        steam_user_reviews_count = sel.xpath(
            '//label[@for="purchase_type_steam"]/span/text()').extract_first()
        steam_user_reviews_count = self.count_to_int(steam_user_reviews_count)

        # Number of reviews from users who bought the game elsewhere
        non_steam_user_reviews_count = sel.xpath(
            '//label[@for="purchase_type_non_steam"]/span/text()'
        ).extract_first()
        non_steam_user_reviews_count = self.count_to_int(
            non_steam_user_reviews_count)

        # Number of English-language reviews
        english_user_reviews_count = sel.xpath(
            '//label[@for="review_language_mine"]/span/text()').extract_first(
            )
        english_user_reviews_count = self.count_to_int(
            english_user_reviews_count)

        # Number of non-English reviews
        non_english_user_reviews_count = user_reviews_count - english_user_reviews_count

        # The game's tag list
        try:
            tags = soup.find(attrs={'class': 'glance_tags popular_tags'})
            tag_list = tags.text.replace('\t', '')
            tag_list = tag_list.replace('\n', ',')
        except Exception:
            tag_list = ''

        # Number of achievements for the game
        achievements = sel.xpath(
            '//div[@id="achievement_block"]/div/text()').extract_first()
        try:
            achievements_count = re.search(r'\d+', achievements, re.S).group(0)
            achievements_count = int(achievements_count)
        except Exception:
            achievements_count = 0

        # The game's category, e.g. All Games > Action Games > Counter-Strike
        try:
            category = soup.find(name='div', attrs={
                'class': 'breadcrumbs'
            }).text
            category = category.replace('\t', '')
            category = category.replace('\n', '')
        except Exception:
            category = ''

        # Game genre
        genre = sel.xpath(
            '//div[@class="block_content"]/div/div/a/text()').extract_first()

        # Game developer
        developer = sel.xpath(
            '//div[@class="block_content"]/div/div/a[2]/text()').extract_first(
            )

        # Game publisher
        publisher = sel.xpath(
            '//div[@class="block_content"]/div/div/a[3]/text()').extract_first(
            )

        # Game release date
        release_date = sel.xpath(
            '//div[@class="release_date"]/span/text()').extract_first()

        # Number of languages the game supports
        language_number = len(
            sel.xpath(
                '//table[@class="game_language_options"]/tr').extract()) - 1

        # Game description
        description = sel.xpath(
            '//div[@class="game_description_snippet"]/text()').extract_first()

        # Time this game was crawled
        save_time = None

        msg = (id, name, price, response.url, metacritic_score,
               user_reviews_count, positive_user_reviews_count,
               positive_percent, negative_user_reviews_count,
               steam_user_reviews_count, non_steam_user_reviews_count,
               english_user_reviews_count, non_english_user_reviews_count,
               tag_list, achievements_count, category, genre, developer,
               publisher, release_date, language_number, description,
               save_time)

        command = (
            "INSERT IGNORE INTO {} "
            "(id, name, price, url, metacritic_score, user_reviews_count, positive_user_reviews_count, "
            "positive_percent, negative_user_reviews_count, steam_user_reviews_count, "
            "non_steam_user_reviews_count, english_user_reviews_count, non_english_user_reviews_count, "
            "tag_list, achievements_count, category, genre, developer, publisher, release_date, "
            "language_number, description, save_time)"
            "VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, "
            "%s)".format(config.steam_game_info_table))

        self.sql.insert_data(command, msg)

        command = "UPDATE {0} SET is_crawled=\'yes\' WHERE id=\'{1}\'".format(
            config.steam_game_urls_table, id)
        self.sql.execute(command)

    def error_parse(self, failure):
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta))

    def get_id(self, url):
        type = ''
        if '/sub/' in url:
            pattern = re.compile(r'/sub/(\d+)/')
            type = 'sub'
        elif '/app/' in url:
            pattern = re.compile(r'/app/(\d+)/', re.S)
            type = 'app'
        elif '/bundle/' in url:
            pattern = re.compile(r'/bundle/(\d+)/', re.S)
            type = 'bundle'
        else:
            pattern = re.compile(r'/(\d+)/', re.S)
            type = 'other'
            utils.log('get_id other url:%s' % url)

        id = re.search(pattern, url)
        if id:
            id = id.group(1)
            return id

        self.error_count = self.error_count + 1
        utils.log('get_id error url:%s' % url)
        return -self.error_count

    def count_to_int(self, data):
        try:
            ret = data
            ret = ret.replace('(', '')
            ret = ret.replace(')', '')
            ret = ret.replace(',', '')

            return int(ret)
        except Exception:
            return -1

    def save_page(self, file_name, data):
        with open(file_name, 'w') as f:
            f.write(data)