Example #1
    def GET(self):
        try:
            sql = SqlHelper()

            inputs = web.input()
            name = inputs.get('name')

            proxy = Proxy()
            proxy.set_value(
                ip=inputs.get('ip'),
                port=inputs.get('port'),
                country=inputs.get('country', None),
                anonymity=inputs.get('anonymity', None),
                https=inputs.get('https', 'no'),
                speed=inputs.get('speed', -1),
                source=inputs.get('source', name),
            )

            utils.sql_insert_proxy(sql, name, proxy)

            command = "SELECT ip FROM {0} WHERE ip={1} AND port={2}".format(
                name, inputs.get('ip'), inputs.get('port'))
            res = sql.query_one(command)
            return res is None
        except:
            pass

        return False
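
The handlers in these examples build SQL by string formatting, which is open to injection. A minimal sketch of the same insert-and-verify flow with parameterised queries, assuming a plain PyMySQL connection rather than the project's SqlHelper (whose internals are not shown in the snippets):

# Sketch only: insert-and-verify with parameterised queries (assumes PyMySQL).
import pymysql

def insert_and_verify(conn, table, ip, port):
    # table names cannot be bound as parameters, so `table` is assumed to come
    # from a trusted whitelist rather than from user input
    with conn.cursor() as cursor:
        cursor.execute(
            "INSERT INTO {0} (ip, port) VALUES (%s, %s)".format(table),
            (ip, port))
        cursor.execute(
            "SELECT ip FROM {0} WHERE ip = %s AND port = %s".format(table),
            (ip, port))
        row = cursor.fetchone()
    conn.commit()
    return row is not None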
Example #2
def randitem(spargs):
    guid = spargs.get('guid', 0)
    utils.push_redis(guid, 0, '正在随机产生商品链接', save_to_mysql=False)

    url = 'https://diviner.taobao.com/diviner?p=610009&callback=jsonpCallbackMoreGood&lid=1&uuid=122270672' \
          '.1492415671516609876050.1492415672.1492415672.1492415672.1&pin=&lim=100&ec=utf-8&_=1492415813682'
    headers = {
        'Host':
        'diviner.taobao.com',
        'Referer':
        'https://www.taobao.com/',
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 Firefox/52.0'
    }
    cookies = {
        '__jda':
        '122270672.1492415671516609876050.1492415672.1492415672.1492415672.1',
        '__jdb': '122270672.1.1492415671516609876050|1.1492415672',
        '__jdc': '122270672',
        '__jdv': '122270672|direct|-|none|-|1492415671524',
        '__jdu': '1492415671516609876050',
    }

    r = requests.get(url=url, headers=headers, cookies=cookies, timeout=20)
    pattern = re.compile('"sku":(\d+),', re.S)
    ids = re.findall(pattern, r.text)
    id = random.choice(ids)

    url = 'https://item.taobao.com/%s.html' % str(id)
    utils.push_redis(guid,
                     0,
                     '生成商品链接:<a href="%s" target="_blank">%s</a>' % (url, url),
                     save_to_mysql=False)

    sql = SqlHelper()
    command = "SELECT id FROM {table} WHERE id={product_id}". \
        format(table = config.tb_item_table, product_id = id)
    result = sql.query_one(command)

    # if the product is not in the database yet, crawl it from scratch
    if result is None:
        cmd = 'cd {dir};python manage.py real_time_analysis -a name={name} -a guid={guid} ' \
              '-a product_id={product_id} -a url={url};'. \
            format(url = str(url), name = 'tb', dir = settings.BASE_DIR, guid = guid,
                   product_id = id)
        subprocess.Popen(cmd, shell=True)
    else:
        # if it is already in the database, read the stored analysis results directly
        command = "SELECT * FROM {0} WHERE product_id={1} ORDER BY id". \
            format(config.analysis_item_table, id)
        result = sql.query(command)
        for res in result:
            utils.push_redis(guid, res[1], res[2], res[3], save_to_mysql=False)
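
A minimal call sketch for randitem: the function only reads 'guid' from its argument dict, so a bare invocation (with an illustrative guid and the surrounding project imports of utils, SqlHelper, config and settings assumed to be in place) looks like this:

# Hypothetical driver for randitem; the guid value is illustrative only.
spargs = {'guid': 'demo-guid-1234'}
randitem(spargs)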
Example #3
    def GET(self):
        try:
            sql = SqlHelper()
            inputs = web.input()
            name = inputs.get('name')
            ip = inputs.get('ip')
            command = "DELETE FROM {0} WHERE ip=\'{1}\'".format(name, ip)
            sql.execute(command)

            command = "SELECT ip FROM {0} WHERE ip=\'{1}\'".format(name, ip)
            res = sql.query_one(command)
            return res is None
        except:
            pass
        return False
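
The GET handlers in Examples #1 and #3 are web.py class methods whose enclosing classes are not shown. A small sketch of how such a handler is typically wired into a web.py application (the route and class names here are assumptions):

# Hypothetical web.py wiring for handlers like the ones above.
import web

urls = (
    '/delete', 'DeleteProxy',   # route and class name are illustrative
)

class DeleteProxy(object):
    def GET(self):
        inputs = web.input()          # query-string parameters
        return inputs.get('ip', '')   # echo back, just for illustration

if __name__ == '__main__':
    app = web.application(urls, globals())
    app.run()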
Example #4
def runspider(request):
    data = {
        'status': 'failure',
        'guid': '0',
        'info': '',
    }

    try:
        # the production environment uses a POST request
        url = request.POST.get('url')
        force = request.POST.get('force', 'false')
        pattern = re.compile(r'\d+', re.S)
        product_id = re.search(pattern, url).group()
        sql = SqlHelper()

        utils.log('product_id:%s' % product_id)

        if 'item.jd.com' in url and product_id is not None:
            data['status'] = 'success'
            data['guid'] = str(uuid.uuid4())
            data['info'] = '成功接收数据,正在为您抓取并分析数据,精彩稍候呈现'

            command = "SELECT id FROM {table} WHERE id={product_id}". \
                format(table = config.jd_item_table, product_id = product_id)
            result = sql.query_one(command)

            if result is None:
                name = 'jd'
                cmd = 'cd {dir};python manage.py real_time_analysis -a name={name} -a guid={guid} ' \
                      '-a product_id={product_id} -a url={url};'. \
                    format(url = str(url), name = name, dir = settings.BASE_DIR, guid = data.get('guid'),
                           product_id = product_id)

                subprocess.Popen(cmd, shell=True)
            else:
                if force == 'false':
                    utils.log('数据库中存在数据,从数据库中取出分析结果')
                    command = "SELECT * FROM {0} WHERE product_id={1} ORDER BY id". \
                        format(config.analysis_item_table, product_id)
                    result = sql.query(command)
                    for res in result:
                        utils.push_redis(data.get('guid'),
                                         res[1],
                                         res[2],
                                         res[3],
                                         save_to_mysql=False)
                else:
                    command = "DELETE FROM {0} WHERE produce_id={1}".format(
                        config.analysis_item_table, product_id)
                    sql.execute(command)
                    #重新分析数据
                    cmd = 'cd {dir};python manage.py analysis -a url={url} -a name={name} -a guid={guid} -a ' \
                          'product_id={product_id};'. \
                        format(url = url, name = 'jd', dir = settings.BASE_DIR, guid = data.get('guid'),
                               product_id = product_id)

                    subprocess.Popen(cmd, shell=True)
        else:
            data['info'] = '传入网址有误,请检查后重新输入,请输入以下格式的网址:\n%s' % 'https://item.jd.com/3995645.html'
    except Exception as e:
        logging.error('run spider exception:%s' % e)
        data['info'] = '出现错误,错误原因:%s' % e
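
A hedged usage sketch for this view, assuming it is routed at a hypothetical /runspider/ URL (the URLconf and the view's return statement are not part of the excerpt); Django's test client is used here only to show the expected POST parameters:

# Hypothetical invocation via Django's test client.
from django.test import Client

client = Client()
response = client.post('/runspider/', {          # route is an assumption
    'url': 'https://item.jd.com/3995645.html',   # format taken from the error message above
    'force': 'false',
})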
Example #5
class RunAnalysis(object):
    def __init__(self, red, key, user):
        self.key = key
        self.red = red

        data = json.loads(user)
        self.product_id = data.get('product_id')
        self.url = data.get('url')
        self.email = data.get('email')
        self.guid = data.get('guid')
        self.spider_name = 'tb_comment'
        self.spargs = data

        self.sql = SqlHelper()
        self.spargs['red'] = self.red
        self.spargs['sql'] = self.sql

        if not os.path.exists('log'):
            os.makedirs('log')

        configure_logging(install_root_handler = False)
        logging.basicConfig(
                filename = 'log/%s.log' % self.product_id,
                format = '%(levelname)s %(asctime)s: %(message)s',
                level = logging.DEBUG
        )

    def run(self):
        self.runspider()
        self.analysis()
        self.send_notice()
        self.clear_cache()

    # run the crawler: fetch all product reviews through proxies
    def runspider(self):
        configure_logging(install_root_handler = False)
        s = get_project_settings()
        runner = CrawlerRunner(settings = s)

        @defer.inlineCallbacks
        def crawl(**spargs):
            yield runner.crawl(TBItemInfoSpider, **spargs)
            yield runner.crawl(TBCommentSpider, **spargs)
            reactor.stop()

        crawl(**self.spargs)
        reactor.run()  # the script will block here until the last crawl call is finished

    # run the analysis step
    def analysis(self):
        analysis = Analysis(**self.spargs)
        result = analysis.run()

        tb_comment = TBCommentAnalysis(id = None, guid = self.guid, product_id = self.product_id, item_name = 'name',
                                       content = result, email = self.email, create_time = datetime.datetime.now())
        tb_comment.save()

    # send a notification email to the address the user provided
    def send_notice(self):
        subject = '淘宝店铺 - 商品评价分析结果展示'

        blog_url = '%stb/full_result/%s' % ('http://127.0.0.1:8000/', self.guid)

        command = "SELECT name FROM {0} WHERE id={1}".format(config.tb_item_table, self.product_id)
        (item_name,) = self.sql.query_one(command)

        body = '''
        您好~
        您订阅的淘宝店铺商品评价信息分析服务已经完成。商品名称:{item_name},商品链接:{tb_url},分析结果请见:{blog_url}
        '''.format(tb_url = self.url, blog_url = blog_url, item_name = item_name)

        send_email(to_email = self.email, subject = subject, body = body)

    def clear_cache(self):
        data = self.red.delete(self.key)
        logging.debug('clear_cache data:%s' % data)
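
A minimal driver sketch for RunAnalysis, assuming a redis-py client and a JSON payload carrying the keys the constructor reads (product_id, url, email, guid); the key name and all values are illustrative:

# Hypothetical driver: build the JSON payload RunAnalysis expects and run it.
import json
import redis

red = redis.StrictRedis(host='127.0.0.1', port=6379)
user = json.dumps({
    'product_id': '12345678',   # illustrative values only
    'url': 'https://item.taobao.com/item.htm?id=12345678',
    'email': 'user@example.com',
    'guid': 'demo-guid',
})
RunAnalysis(red, 'analysis:demo-guid', user).run()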
Example #6
def runspider(request):
    data = {
        'status': 'failure',
        'guid': '0',
        'info': '',
    }

    try:
        # the production environment uses a POST request
        url = request.POST.get('url')
        force = request.POST.get('force', 'false')
        pattern = re.compile('user-rate-')
        urls = re.split(pattern, url)
        user_id = urls[1]
        pattern = re.compile(r'\w+', re.S)
        user_id = re.search(pattern, user_id).group()
        sql = SqlHelper()

        utils.log('user_id:%s' % user_id)

        if 'rate.taobao.com' in url and user_id is not None:
            data['status'] = 'success'
            data['guid'] = '%s_%s' % (random.randint(1000000000000, 9999999999999),
                                      random.randint(100, 999))
            data['info'] = '成功接收数据,正在为您抓取并分析数据,精彩稍候呈现'

            command = "SELECT id FROM {table} WHERE id={user_id}". \
                format(table = config.tb_item_table, user_id = user_id)
            result = sql.query_one(command)

            if result is None:
                name = 'tb_comment'
                cmd = 'python manage.py real_time_analysis -a name={name} -a guid={guid} ' \
                      '-a user_id={user_id} -a url={url};'. \
                    format(url = str(url), name = name, dir = settings.BASE_DIR, guid = data.get('guid'),
                           user_id = user_id)

                logging.warning(cmd)
                subprocess.Popen(cmd, shell=True)
            else:
                if force == 'false':
                    utils.log('数据库中存在数据,从数据库中取出分析结果')
                    command = "SELECT * FROM {0} WHERE user_id={1} ORDER BY id". \
                        format(config.analysis_item_table, user_id)
                    result = sql.query(command)
                    for res in result:
                        utils.push_redis(data.get('guid'),
                                         res[1],
                                         res[2],
                                         res[3],
                                         save_to_mysql=False)
                else:
                    command = "DELETE FROM {0} WHERE produce_id={1}".format(
                        config.analysis_item_table, user_id)
                    sql.execute(command)
                    #重新分析数据
                    cmd = 'cd {dir};python manage.py analysis -a url={url} -a name={name} -a guid={guid} -a ' \
                          'user_id={user_id};'. \
                        format(url = url, name = 'tb', dir = settings.BASE_DIR, guid = data.get('guid'),
                               user_id = user_id)

                    subprocess.Popen(cmd, shell=True)
        else:
            data['info'] = '传入网址有误,请检查后重新输入,请输入以下格式的网址:\n%s' % 'https://rate.taobao.com/user-rate-UvGv0MFc0vFILvgTT.htm'
    except Exception as e:
        logging.error('run spider exception:%s' % e)
        data['info'] = '出现错误,错误原因:%s' % e
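
The two-step re.split/re.search extraction above can be collapsed into a single pattern; a small sketch, using the URL format quoted in the error message:

import re

# Hypothetical helper: pull the user id straight out of a rate.taobao.com URL.
def extract_user_id(url):
    match = re.search(r'user-rate-(\w+)', url)
    return match.group(1) if match else None

# extract_user_id('https://rate.taobao.com/user-rate-UvGv0MFc0vFILvgTT.htm')
# -> 'UvGv0MFc0vFILvgTT'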