Esempio n. 1
0
    def record_result(self,
                      result,
                      color='default',
                      font_size=16,
                      strong=False,
                      type='word',
                      br=True,
                      default=False,
                      new_line=False):
        """Turn one result into an HTML fragment and push it to redis.

        The fragment is stored in ``self.full_result`` (reset on every call)
        and then sent through ``utils.push_redis`` together with ``self.guid``
        and ``self.user_id``.

        Args:
            result: Text to publish; for ``type == 'image'`` it is passed
                through ``markdown2.markdown`` first.
            color: CSS colour used in the inline style of ``'word'`` results.
            font_size: Font size in px used in the inline style.
            strong: Wrap the text in ``<strong>`` instead of ``<span>``.
            type: Result kind; ``'word'`` and ``'image'`` get special handling,
                any other value is published unchanged.
            br: Append ``<br>`` to the fragment.
            default: When not equal to ``False`` the ``'word'`` styling is
                skipped and the text is published as-is.
            new_line: Append a newline character to the fragment.
        """
        logging.error("RealTimeAnalysis 1111")
        self.full_result = ''

        if type == 'word' and default == False:
            # Same inline style for both tags; only the element differs.
            template = (
                '<strong style="color: %s; font-size: %spx;">%s</strong>'
                if strong else
                '<span style="color: %s; font-size: %spx;">%s</span>')
            result = template % (color, font_size, result)
        elif type == 'image':
            result = markdown2.markdown(result)

        self.full_result += result

        # Optional trailers, appended in the same order as before.
        for enabled, trailer in ((br, '<br>'), (new_line, '\n')):
            if enabled:
                self.full_result += trailer

        summary = "full_result:%s result:%s" % (self.full_result, result)
        logging.error(summary)
        details = "guid:%s, user_id:%s, info:%s, type:%s" % (
            self.guid, self.user_id, self.full_result, type)
        logging.error(details)

        utils.push_redis(guid=self.guid,
                         user_id=self.user_id,
                         info=self.full_result,
                         type=type)
Esempio n. 2
0
    def record_result(self,
                      result,
                      color='default',
                      font_size=16,
                      strong=False,
                      type='word',
                      br=True,
                      default=False,
                      new_line=False):
        """Turn one result into an HTML fragment and push it to redis.

        The fragment is stored in ``self.full_result`` (reset on every call)
        and then sent through ``utils.push_redis`` together with ``self.guid``
        and ``self.product_id``.

        Args:
            result: Text to publish; for ``type == 'image'`` it is passed
                through ``markdown2.markdown`` first.
            color: CSS colour used in the inline style of ``'word'`` results.
            font_size: Font size in px used in the inline style.
            strong: Wrap the text in ``<strong>`` instead of ``<span>``.
            type: Result kind; ``'word'`` and ``'image'`` get special handling,
                any other value is published unchanged.
            br: Append ``<br>`` to the fragment.
            default: When not equal to ``False`` the ``'word'`` styling is
                skipped and the text is published as-is.
            new_line: Append a newline character to the fragment.
        """
        self.full_result = ''

        styled_word = type == 'word' and default == False
        if styled_word:
            values = (color, font_size, result)
            if strong:
                result = '<strong style="color: %s; font-size: %spx;">%s</strong>' % values
            else:
                result = '<span style="color: %s; font-size: %spx;">%s</span>' % values
        elif type == 'image':
            result = markdown2.markdown(result)

        self.full_result += result

        # Optional trailers, appended in the same order as before.
        for wanted, trailer in ((br, '<br>'), (new_line, '\n')):
            if wanted:
                self.full_result += trailer

        utils.push_redis(guid=self.guid,
                         product_id=self.product_id,
                         info=self.full_result,
                         type=type)
Esempio n. 3
0
    def init(self):
        """Create the per-item comment table and announce the crawl start.

        Builds a ``CREATE TABLE IF NOT EXISTS`` statement for
        ``self.item_table``, runs it through ``self.sql.create_table`` and
        then pushes a start message for ``self.guid`` / ``self.product_id``
        via ``utils.push_redis``.
        """
        # NOTE(review): `creation_time` and `reference_time` both carry
        # ON UPDATE CURRENT_TIMESTAMP, so they would presumably be rewritten
        # whenever a row is updated -- confirm this is intended.
        ddl_template = (
            "CREATE TABLE IF NOT EXISTS {} ("
            "`id` BIGINT (15) NOT NULL AUTO_INCREMENT,"  # comment id
            "`content` TEXT NOT NULL,"  # comment text
            "`creation_time` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,"  # time the comment was created
            "`reply_count` INT(4) DEFAULT NULL ,"  # number of replies
            "`score` INT(2) DEFAULT NULL,"  # star rating
            "`useful_vote_count` INT(5) DEFAULT NULL,"  # "useful" votes from other users
            "`useless_vote_count` INT(4) DEFAULT NULL,"  # "useless" votes from other users
            "`user_level_id` INT(4) DEFAULT NULL,"  # id of the commenter's user level
            '`user_province` CHAR(8) DEFAULT NULL,'  # commenter's province
            '`nickname` CHAR(20) DEFAULT NULL,'  # commenter's nickname
            '`product_color` CHAR(50) DEFAULT NULL,'  # product colour
            "`product_size` CHAR(50) DEFAULT NULL,"  # product size
            "`user_level_name` CHAR(20) DEFAULT NULL,"  # name of the commenter's user level
            "`user_client` INT(5) DEFAULT NULL,"  # platform the review was posted from
            "`user_client_show` CHAR(20) DEFAULT NULL,"  # platform the review was posted from (display text)
            "`is_mobile` INT(3) DEFAULT NULL,"  # whether the review was posted from a mobile client
            "`days` INT(3) DEFAULT NULL,"  # days between purchase and comment
            "`reference_time` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,"  # purchase time
            "`after_days` INT(3) DEFAULT NULL,"  # days between purchase and follow-up comment
            "`images_count` INT(3) DEFAULT NULL,"  # total number of images in the comment
            "`ip` CHAR(20) DEFAULT NULL,"  # IP address of the follow-up comment
            "`after_content` TEXT DEFAULT NULL,"  # follow-up comment text
            "`save_time` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,"  # time the row was scraped
            "PRIMARY KEY(id)"
            ") ENGINE=InnoDB")
        self.sql.create_table(ddl_template.format(self.item_table))

        start_message = '开始抓取京东商城该商品的评价信息...'
        utils.push_redis(self.guid, self.product_id, start_message)
Esempio n. 4
0
    def get_comment_count(self, response):
        """Parse the product page and request the first page of its comments.

        Saves the page, pushes the product title and link to redis, collects
        the ``data-sku`` ids found on the page and extracts ``commentVersion``
        from the page body. It then yields one ``Request`` for page 0 of the
        comment listing whose callback is ``self.get_all_comment``.

        Args:
            response: Response for the product page; ``response.body`` and
                ``response.xpath`` are used.

        Yields:
            A single ``Request`` for the comment listing, or nothing when the
            page does not contain a ``commentVersion`` value.
        """
        self.save_page('%s.html' % self.product_id, response.body)

        name = response.xpath('//head/title/text()').extract_first()
        self.log('name:%s' % name)

        # The anchor is now closed; previously the <a> tag was left open.
        utils.push_redis(
            self.guid, self.product_id,
            '商品名称:%s 链接:<a href="%s" target="_blank">%s</a>' %
            (name, self.url, self.url))

        ids = response.xpath('//div[@class="dd"]/div/@data-sku').extract()
        item_ids = ','.join(ids)
        self.log('item_ids:%s' % item_ids)

        # Raw string so that `\d` is a regex class, not a string escape.
        match = re.search(r"commentVersion:'(\d+)'", response.body, re.S)
        if match is None:
            # Without the version the comment URL cannot be built; stop here
            # instead of failing with AttributeError on `None.group`.
            self.log('commentVersion not found, product_id:%s' %
                     self.product_id)
            return
        comment_version = match.group(1)

        # sort type 5: recommended order, 6: chronological order
        url = ('https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv'
               '{comment_version}&productId={product_id}&score=0&sortType={sort_type}&page=0&pageSize=10'
               '&isShadowSku=0').format(product_id=self.product_id,
                                        comment_version=comment_version,
                                        sort_type='6')

        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Host': 'club.jd.com',
            'Referer': 'https://item.jd.com/%s.html' % self.product_id,
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 '
            'Firefox/52.0',
        }

        yield Request(
            url=url,
            headers=headers,
            method='GET',
            meta={
                'name': name,
                'comment_version': comment_version,
                'item_ids': item_ids,
            },
            dont_filter=True,
            callback=self.get_all_comment)
Esempio n. 5
0
    def handle(self, *args, **options):
        """Run the spider and then the real-time analysis for one request.

        Reads the spider arguments from ``options['spargs']``, configures file
        logging under ``log/``, validates ``guid`` and ``user_id`` and, when
        both are present, runs ``runspider`` followed by
        ``RealTimeAnalysis(**spargs).run()``.

        Args:
            *args: Unused.
            **options: Must contain ``'spargs'``, which is converted with
                ``utils.arglist_to_dict``.
        """
        # Python 2 only: make utf-8 the implicit str/unicode codec.
        reload(sys)
        sys.setdefaultencoding('utf-8')
        os.chdir(sys.path[0])

        spargs = utils.arglist_to_dict(options['spargs'])
        guid = spargs.get('guid', '0')
        user_id = spargs.get('user_id', '0')

        if not os.path.exists('log'):
            os.makedirs('log')

        configure_logging(install_root_handler=False)
        # Use the defaulted user_id so a missing argument no longer produces
        # a file called "None.log".
        logging.basicConfig(filename='log/%s.log' % user_id,
                            format='%(levelname)s %(asctime)s: %(message)s',
                            level=logging.ERROR)

        logging.warning('user_id:%s', user_id)
        if guid == '0' or user_id == '0':
            # Missing arguments: report the problem and signal completion.
            utils.log('分析数据传入参数不对,接收到的参数为: spargs:%s' % spargs)
            utils.push_redis(guid=guid,
                             user_id=user_id,
                             info='分析数据传入参数不对,接收到的参数为:%s' % spargs)
            utils.push_redis(guid=guid, user_id=user_id, info='finish')
            return

        utils.log('开始分析:%s' % spargs)
        sql = SqlHelper()
        # NOTE(review): `config.redis_part` looks like a typo for
        # `redis_port`; it is defined outside this block, so it is kept.
        red = redis.StrictRedis(host=config.redis_host,
                                port=config.redis_part,
                                db=config.redis_db,
                                password=config.redis_pass)
        spargs['sql'] = sql
        spargs['red'] = red

        # Run the spider.
        logging.warning(spargs)
        runspider(spargs)

        # Start the analysis.
        logging.warning(spargs)
        analysis = RealTimeAnalysis(**spargs)
        analysis.run()
Esempio n. 6
0
    def close(spider, reason):
        """Persist the product summary and report the crawled comment counts.

        Inserts ``spider.product_msg`` into ``config.jd_item_table`` when it is
        set, clears the product's redis keys when ``config.is_distributed`` is
        true, counts the rows of ``item_<product_id>`` in total and per score
        bucket, pushes those counts to redis and finally commits.

        Args:
            spider: Object providing ``product_msg``, ``product_id``, ``guid``,
                ``sql`` and ``log``.
            reason: Unused.
        """
        if spider.product_msg is not None:
            spider.sql.insert_json(spider.product_msg, config.jd_item_table)

        # When crawling in distributed mode, clean up redis.
        if config.is_distributed:
            utils.red.delete('%s_page' % spider.product_id)
            utils.red.delete(spider.product_id)
            spider.log('clear redis product_id:%s' % spider.product_id)

            # Give the other crawling processes a moment.
            time.sleep(5)

        table = 'item_%s' % spider.product_id
        # Total, good (5), general (3-4) and poor (<=2), in reporting order.
        filters = (
            '',
            ' WHERE score=5',
            ' WHERE score>=3 and score <=4',
            ' WHERE score<=2',
        )
        counts = []
        for condition in filters:
            command = "SELECT COUNT(*) FROM {}{}".format(table, condition)
            spider.sql.execute(command, commit=False)
            (value, ) = spider.sql.cursor.fetchone()
            counts.append(value)

        utils.push_redis(
            spider.guid,
            spider.product_id,
            info=
            '抓取信息完成,实际抓取评价信息,<strong style="color: red; font-size: 24px;">总共抓取评价数:%s、好评数:%s、'
            '中评数:%s、差评数:%s</strong>' % tuple(counts))

        # Commit the transaction.
        spider.sql.commit()
Esempio n. 7
0
def randitem(spargs):
    """Pick a random product and either analyse it or replay stored results.

    Fetches a recommendation feed, extracts every ``"sku":<digits>`` id from
    the response and chooses one at random. If the id is not present in
    ``config.tb_item_table`` a ``real_time_analysis`` management command is
    started; otherwise the rows stored in ``config.analysis_item_table`` for
    that product are pushed to redis again.

    Args:
        spargs: Mapping of arguments; only ``'guid'`` is read (default 0).

    Raises:
        IndexError: From ``random.choice`` when the feed contains no ids.
    """
    guid = spargs.get('guid', 0)
    utils.push_redis(guid, 0, '正在随机产生商品链接', save_to_mysql=False)

    url = 'https://diviner.taobao.com/diviner?p=610009&callback=jsonpCallbackMoreGood&lid=1&uuid=122270672' \
          '.1492415671516609876050.1492415672.1492415672.1492415672.1&pin=&lim=100&ec=utf-8&_=1492415813682'
    headers = {
        'Host': 'diviner.taobao.com',
        'Referer': 'https://www.taobao.com/',
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 Firefox/52.0',
    }
    cookies = {
        '__jda':
        '122270672.1492415671516609876050.1492415672.1492415672.1492415672.1',
        '__jdb': '122270672.1.1492415671516609876050|1.1492415672',
        '__jdc': '122270672',
        '__jdv': '122270672|direct|-|none|-|1492415671524',
        '__jdu': '1492415671516609876050',
    }

    r = requests.get(url=url, headers=headers, cookies=cookies, timeout=20)
    # Raw string so that `\d` is a regex class, not a string escape.
    ids = re.findall(r'"sku":(\d+),', r.text, re.S)
    product_id = random.choice(ids)

    url = 'https://item.taobao.com/%s.html' % str(product_id)
    # The anchor is now closed; previously the <a> tag was left open.
    utils.push_redis(guid,
                     0,
                     '生成商品链接:<a href="%s" target="_blank">%s</a>' % (url, url),
                     save_to_mysql=False)

    sql = SqlHelper()
    # product_id consists of digits only (see the regex above).
    command = "SELECT id FROM {table} WHERE id={product_id}".format(
        table=config.tb_item_table, product_id=product_id)
    result = sql.query_one(command)

    if result is None:
        # Not in the database yet: crawl it. An argument list plus `cwd`
        # replaces the former `cd ...;python ...` shell string so that the
        # caller-supplied guid can never be interpreted by a shell.
        subprocess.Popen(
            ['python', 'manage.py', 'real_time_analysis',
             '-a', 'name=tb',
             '-a', 'guid=%s' % guid,
             '-a', 'product_id=%s' % product_id,
             '-a', 'url=%s' % url],
            cwd=settings.BASE_DIR)
    else:
        # Already in the database: replay the stored analysis rows.
        command = "SELECT * FROM {0} WHERE product_id={1} ORDER BY id".format(
            config.analysis_item_table, product_id)
        for res in sql.query(command):
            utils.push_redis(guid, res[1], res[2], res[3], save_to_mysql=False)
Esempio n. 8
0
def runspider(request):
    """Start (or replay) the analysis of a JD product given its URL.

    Reads ``url`` and ``force`` from ``request.POST``, extracts the first run
    of digits in the URL as the product id and then:

    * starts ``real_time_analysis`` when the product is not yet stored in
      ``config.jd_item_table``;
    * replays the rows of ``config.analysis_item_table`` through redis when
      the product is stored and ``force`` is ``'false'``;
    * otherwise deletes those rows and starts the ``analysis`` command.

    Any exception is logged and reported in ``data['info']``.

    Args:
        request: Object whose ``POST`` mapping carries ``url`` and,
            optionally, ``force``.
    """
    data = {
        'status': 'failure',
        'guid': '0',
        'info': '',
    }

    try:
        # The production environment uses POST requests.
        url = request.POST.get('url')
        force = request.POST.get('force', 'false')
        # A missing url or a url without digits raises here and is reported
        # by the except clause below.
        product_id = re.search(r'\d+', url).group()
        sql = SqlHelper()

        utils.log('product_id:%s' % product_id)

        if 'item.jd.com' in url:
            guid = str(uuid.uuid4())
            data['status'] = 'success'
            data['guid'] = guid
            # Previously a trailing comma turned this value into a tuple.
            data['info'] = '成功接收数据,正在为您抓取并分析数据,精彩稍候呈现'

            # product_id consists of digits only (see the regex above).
            command = "SELECT id FROM {table} WHERE id={product_id}".format(
                table=config.jd_item_table, product_id=product_id)
            result = sql.query_one(command)

            if result is None:
                # An argument list plus `cwd` replaces the former
                # `cd ...;python ...` shell string: the url comes straight
                # from the request and must never reach a shell.
                subprocess.Popen(
                    ['python', 'manage.py', 'real_time_analysis',
                     '-a', 'name=jd',
                     '-a', 'guid=%s' % guid,
                     '-a', 'product_id=%s' % product_id,
                     '-a', 'url=%s' % url],
                    cwd=settings.BASE_DIR)
            elif force == 'false':
                utils.log('数据库中存在数据,从数据库中取出分析结果')
                command = "SELECT * FROM {0} WHERE product_id={1} ORDER BY id".format(
                    config.analysis_item_table, product_id)
                for res in sql.query(command):
                    utils.push_redis(guid,
                                     res[1],
                                     res[2],
                                     res[3],
                                     save_to_mysql=False)
            else:
                # Column name fixed: the SELECT on this table filters on
                # `product_id`, the DELETE used the misspelt `produce_id`.
                command = "DELETE FROM {0} WHERE product_id={1}".format(
                    config.analysis_item_table, product_id)
                sql.execute(command)
                # Re-run the analysis.
                subprocess.Popen(
                    ['python', 'manage.py', 'analysis',
                     '-a', 'url=%s' % url,
                     '-a', 'name=jd',
                     '-a', 'guid=%s' % guid,
                     '-a', 'product_id=%s' % product_id],
                    cwd=settings.BASE_DIR)
        else:
            data[
                'info'] = '传入网址有误,请检查后重新输入,请输入以下格式的网址:\n%s' % 'https://item.jd.com/3995645.html'
    except Exception as e:
        logging.error('run spider exception:%s' % e)
        data['info'] = '出现错误,错误原因:%s' % e
    # NOTE(review): `data` is never returned in the visible code -- confirm
    # whether a response built from it belongs here.
Esempio n. 9
0
    def get_all_comment(self, response):
        """Parse the comment summary and schedule the comment page requests.

        Decodes the JSONP body, stores the ``productCommentSummary`` values in
        ``self.product_msg``, pushes a summary line and a loader image to
        redis and then yields one ``Request`` per comment page, each handled
        by ``self.parse_comment``. When ``config.is_distributed`` is true, the
        pages beyond ``INNER_CRAWL_PAGE`` are queued in redis instead of being
        requested here.

        Args:
            response: Response of the comment listing; ``response.body`` and
                ``response.meta`` (``name``, ``item_ids``,
                ``comment_version``) are used.

        Yields:
            ``Request`` objects for the comment pages crawled by this process.
            Nothing is yielded when the body has no ``(...);`` payload or the
            payload has no ``productCommentSummary``.
        """
        self.save_page('%s_all_comment.html' % self.product_id, response.body)

        detect = chardet.detect(response.body)
        # chardet may report None as encoding; fall back to utf-8 instead of
        # failing inside decode().
        encoding = detect.get('encoding') or 'utf-8'
        body = response.body.decode(encoding, 'ignore')

        # Raw string so that `\(` is a regex escape, not a string escape.
        item = re.search(r'\((.*?)\);', body, re.S)
        if item is None:
            return

        data = json.loads(item.group(1))
        # productCommentSummary
        pcs = data.get('productCommentSummary')
        if pcs is None:
            # Nothing to report or paginate without the summary.
            self.log('productCommentSummary missing, product_id:%s' %
                     self.product_id)
            return

        self.product_msg = {
            'id': self.product_id,
            'name': response.meta.get('name'),
            'good_rate_show': pcs.get('goodRateShow'),
            'poor_rate_show': pcs.get('poorRateShow'),
            'average_score': pcs.get('averageScore'),
            'good_count': pcs.get('goodCount'),
            'general_rate': pcs.get('generalRate'),
            'general_count': pcs.get('generalCount'),
            'poor_rate': pcs.get('poorRate'),
            'after_count': pcs.get('afterCount'),
            'good_rate_style': pcs.get('goodRateStyle'),
            'poor_count': pcs.get('poorCount'),
            'poor_rate_style': pcs.get('poorRateStyle'),
            'general_rate_style': pcs.get('generalRateStyle'),
            'comment_count': pcs.get('commentCount'),
            'product_id': pcs.get('productId'),
            'good_rate': pcs.get('goodRate'),
            'general_rate_show': pcs.get('generalRateShow'),
            'url': self.url,
            'item_ids': response.meta.get('item_ids'),
            'save_time':
            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        }

        info = '京东商城显示的评价信息,<strong style="color: red; font-size: 24px;">总的评价数:{comment_count}、好评数:{good_count}、' \
               '好评百分比:{good_rate}%、中评数:{general_count}、中评百分比:{general_rate}%、差评数:{poor_count}、差评百分比:{poor_rate}% ' \
               '</strong>' \
            .format(comment_count=pcs.get('commentCount'),
                    good_count=pcs.get('goodCount'),
                    general_count=pcs.get('generalCount'),
                    poor_count=pcs.get('poorCount'),
                    good_rate=pcs.get('goodRate', 0) * 100,
                    general_rate=pcs.get('generalRate', 0) * 100,
                    poor_rate=pcs.get('poorRate', 0) * 100)

        utils.push_redis(self.guid, self.product_id, info)
        # Show the "loading" image.
        utils.push_redis(
            self.guid,
            self.product_id,
            '<li id="loader"><img src="/static/loader.gif"  height="90" width="90"></li>',
            type='image',
            save_to_mysql=False)

        comment_version = response.meta.get('comment_version')
        comment_count = pcs.get('commentCount')
        # Floor division keeps page_count an int under true division too.
        # TODO(review): the original author also asks why 10 is added here.
        page_count = int(comment_count) // 10 + 10

        inner_crawl_page = get_project_settings().get(
            'INNER_CRAWL_PAGE', 20)
        if page_count > inner_crawl_page and config.is_distributed:
            for page in range(inner_crawl_page, page_count):
                # Queue the page in redis for distributed crawling.
                # NOTE(review): the key 'prodyct_id' looks misspelt; it is
                # kept because the consumer is not visible from here.
                task = {
                    'prodyct_id': self.product_id,
                    'comment_version': comment_version,
                    'sort_type': '6',
                    'page': page
                }
                self.red.rpush(self.product_id, json.dumps(task))

            spider_count = self.red.llen('spiders')
            self.red.set('%s_page' % self.product_id,
                         page_count - inner_crawl_page)
            for index in range(spider_count):
                guid = self.red.lindex('spiders', index)
                self.red.rpush(guid, self.product_id)

        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Host': 'club.jd.com',
            'Referer': 'https://item.jd.com/%s.html' % self.product_id,
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 '
            'Firefox/52.0',
        }

        # Pages crawled by this process.
        for page in range(min(page_count, inner_crawl_page)):
            # sort type 5: recommended order, 6: chronological order
            url = ('https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv'
                   '{comment_version}&productId={product_id}&score=0&sortType={sort_type}&page={page}&'
                   'pageSize=10&isShadowSku=0').format(
                       product_id=self.product_id,
                       comment_version=comment_version,
                       sort_type='6',
                       page=page)

            yield Request(
                url=url,
                # A fresh dict per request, as before.
                headers=dict(headers),
                method='GET',
                meta={
                    'page': page,
                    'name': response.meta.get('name'),
                },
                dont_filter=True,
                callback=self.parse_comment)
Esempio n. 10
0
def runspider(request):
    """Start (or replay) the analysis of a Taobao seller-rating page.

    Reads ``url`` and ``force`` from ``request.POST``, takes the word that
    follows ``user-rate-`` in the URL as the user id and then:

    * starts ``real_time_analysis`` when the id is not yet stored in
      ``config.tb_item_table``;
    * replays the rows of ``config.analysis_item_table`` through redis when
      the id is stored and ``force`` is ``'false'``;
    * otherwise deletes those rows and starts the ``analysis`` command.

    Any exception is logged and reported in ``data['info']``.

    Args:
        request: Object whose ``POST`` mapping carries ``url`` and,
            optionally, ``force``.
    """
    data = {
        'status': 'failure',
        'guid': '0',
        'info': '',
    }

    try:
        # The production environment uses POST requests.
        url = request.POST.get('url')
        force = request.POST.get('force', 'false')
        # A url without "user-rate-<id>" raises here and is reported by the
        # except clause below.
        tail = re.split('user-rate-', url)[1]
        user_id = re.search(r'\w+', tail, re.S).group()
        sql = SqlHelper()

        utils.log('user_id:%s' % user_id)

        if 'rate.taobao.com' in url:
            guid = '%s_%s' % (random.randint(1000000000000, 9999999999999),
                              random.randint(100, 999))
            data['status'] = 'success'
            data['guid'] = guid
            # Previously a trailing comma turned this value into a tuple.
            data['info'] = '成功接收数据,正在为您抓取并分析数据,精彩稍候呈现'

            # user_id is alphanumeric (e.g. "UvGv0MFc0vFILvgTT"), so it has
            # to be quoted; `\w+` guarantees it contains no quote character.
            command = "SELECT id FROM {table} WHERE id='{user_id}'".format(
                table=config.tb_item_table, user_id=user_id)
            result = sql.query_one(command)

            if result is None:
                # An argument list replaces the former shell string: the url
                # comes straight from the request and must never reach a
                # shell. `cwd` restores the `cd {dir}` that the old string
                # had dropped although `dir` was still passed to format().
                cmd = ['python', 'manage.py', 'real_time_analysis',
                       '-a', 'name=tb_comment',
                       '-a', 'guid=%s' % guid,
                       '-a', 'user_id=%s' % user_id,
                       '-a', 'url=%s' % url]
                logging.warning(cmd)
                subprocess.Popen(cmd, cwd=settings.BASE_DIR)
            elif force == 'false':
                utils.log('数据库中存在数据,从数据库中取出分析结果')
                command = "SELECT * FROM {0} WHERE user_id='{1}' ORDER BY id".format(
                    config.analysis_item_table, user_id)
                for res in sql.query(command):
                    utils.push_redis(guid,
                                     res[1],
                                     res[2],
                                     res[3],
                                     save_to_mysql=False)
            else:
                # Column fixed: the SELECT on this table filters on
                # `user_id`, the DELETE used the misspelt `produce_id`.
                command = "DELETE FROM {0} WHERE user_id='{1}'".format(
                    config.analysis_item_table, user_id)
                sql.execute(command)
                # Re-run the analysis.
                subprocess.Popen(
                    ['python', 'manage.py', 'analysis',
                     '-a', 'url=%s' % url,
                     '-a', 'name=tb',
                     '-a', 'guid=%s' % guid,
                     '-a', 'user_id=%s' % user_id],
                    cwd=settings.BASE_DIR)
        else:
            data[
                'info'] = '传入网址有误,请检查后重新输入,请输入以下格式的网址:\n%s' % 'https://rate.taobao.com/user-rate-UvGv0MFc0vFILvgTT.htm'
    except Exception as e:
        logging.error('run spider exception:%s' % e)
        data['info'] = '出现错误,错误原因:%s' % e
    # NOTE(review): `data` is never returned in the visible code -- confirm
    # whether a response built from it belongs here.