def GET(self):
    """Insert a proxy record (fields taken from the query string) into the
    table named by the ``name`` parameter, then verify the row exists.

    Returns:
        True when the ip/port pair can be read back after the insert,
        False on verification failure or on any exception.
    """
    try:
        sql = SqlHelper()
        inputs = web.input()
        name = inputs.get('name')

        proxy = Proxy()
        proxy.set_value(
                ip=inputs.get('ip'),
                port=inputs.get('port'),
                country=inputs.get('country', None),
                anonymity=inputs.get('anonymity', None),
                https=inputs.get('https', 'no'),
                speed=inputs.get('speed', -1),
                source=inputs.get('source', name),
        )
        utils.sql_insert_proxy(sql, name, proxy)

        # NOTE(review): table name and values are interpolated directly into
        # the SQL text -- injection risk if this endpoint is untrusted.
        # FIX 1: quote the ip value -- it is a string column and the sibling
        # delete handler quotes it; an unquoted dotted IP is invalid SQL.
        # FIX 2: report success when the inserted row IS found; the original
        # returned ``res is None``, i.e. True only when the insert failed.
        command = "SELECT ip FROM {0} WHERE ip=\'{1}\' AND port={2}".format(
                name, inputs.get('ip'), inputs.get('port'))
        res = sql.query_one(command)
        return res is not None
    except Exception:
        # Narrowed from a bare ``except:``; endpoint stays best-effort.
        pass
    return False
def randitem(spargs):
    """Pick a random product id from the recommendation feed, push the
    generated item url to redis, then either spawn a real-time crawl for a
    product not yet in the database or replay its cached analysis rows.

    Args:
        spargs: dict of job arguments; only ``guid`` is read here (default 0).
    """
    guid = spargs.get('guid', 0)
    utils.push_redis(guid, 0, '正在随机产生商品链接', save_to_mysql=False)

    url = 'https://diviner.taobao.com/diviner?p=610009&callback=jsonpCallbackMoreGood&lid=1&uuid=122270672' \
          '.1492415671516609876050.1492415672.1492415672.1492415672.1&pin=&lim=100&ec=utf-8&_=1492415813682'
    headers = {
        'Host': 'diviner.taobao.com',
        'Referer': 'https://www.taobao.com/',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 Firefox/52.0'
    }
    # NOTE(review): these cookie names (__jd*) look like JD cookies although
    # the request targets taobao.com -- confirm they are actually required.
    cookies = {
        '__jda': '122270672.1492415671516609876050.1492415672.1492415672.1492415672.1',
        '__jdb': '122270672.1.1492415671516609876050|1.1492415672',
        '__jdc': '122270672',
        '__jdv': '122270672|direct|-|none|-|1492415671524',
        '__jdu': '1492415671516609876050',
    }

    r = requests.get(url=url, headers=headers, cookies=cookies, timeout=20)

    # FIX: raw string for the regex (``\d`` in a plain string is a deprecated
    # escape); also rename the local so it no longer shadows builtin ``id``.
    pattern = re.compile(r'"sku":(\d+),', re.S)
    ids = re.findall(pattern, r.text)
    product_id = random.choice(ids)

    url = 'https://item.taobao.com/%s.html' % str(product_id)
    utils.push_redis(guid, 0, '生成商品链接:<a href="%s" target="_blank">%s' % (url, url),
                     save_to_mysql=False)

    sql = SqlHelper()
    command = "SELECT id FROM {table} WHERE id={product_id}". \
        format(table=config.tb_item_table, product_id=product_id)
    result = sql.query_one(command)

    # If the product is not in the database yet, crawl it afresh.
    if result is None:
        cmd = 'cd {dir};python manage.py real_time_analysis -a name={name} -a guid={guid} ' \
              '-a product_id={product_id} -a url={url};'. \
            format(url=str(url), name='tb', dir=settings.BASE_DIR,
                   guid=guid, product_id=product_id)
        subprocess.Popen(cmd, shell=True)
    else:
        # Cached: stream the stored analysis rows straight to redis.
        command = "SELECT * FROM {0} WHERE product_id={1} ORDER BY id". \
            format(config.analysis_item_table, product_id)
        result = sql.query(command)
        for res in result:
            utils.push_redis(guid, res[1], res[2], res[3], save_to_mysql=False)
def GET(self):
    """Delete every row matching the ``ip`` query parameter from the table
    named by the ``name`` parameter, then verify the deletion.

    Returns:
        True when no row with that ip remains, False otherwise or on error.
    """
    try:
        sql = SqlHelper()
        inputs = web.input()
        name = inputs.get('name')
        ip = inputs.get('ip')

        # NOTE(review): table name and ip are interpolated directly into the
        # SQL text -- injection risk if this endpoint receives untrusted input.
        command = "DELETE FROM {0} WHERE ip=\'{1}\'".format(name, ip)
        sql.execute(command)

        # Read back to confirm the row is really gone.
        command = "SELECT ip FROM {0} WHERE ip=\'{1}\'".format(name, ip)
        res = sql.query_one(command)
        return res is None
    except Exception:
        # FIX: narrowed from a bare ``except:`` so SystemExit and
        # KeyboardInterrupt are no longer swallowed; stays best-effort.
        pass
    return False
def runspider(request):
    """Handle a JD item-analysis request: validate the url, then either spawn
    a crawler for an unseen product, replay cached analysis rows, or (with
    ``force``) wipe the cache and re-analyse.

    NOTE(review): ``data`` is built as the response payload but nothing is
    returned here -- confirm the caller serializes it elsewhere.
    """
    data = {
        'status': 'failure',
        'guid': '0',
        'info': '',
    }

    try:
        # Production uses a POST request.
        url = request.POST.get('url')
        force = request.POST.get('force', 'false')

        pattern = re.compile(r'\d+', re.S)  # FIX: raw string for the regex
        product_id = re.search(pattern, url).group()

        sql = SqlHelper()
        utils.log('product_id:%s' % product_id)

        if 'item.jd.com' in url and product_id is not None:
            data['status'] = 'success'
            data['guid'] = str(uuid.uuid4())
            # FIX: the original line ended with a trailing comma, storing a
            # one-element tuple instead of the message string.
            data['info'] = '成功接收数据,正在为您抓取并分析数据,精彩稍候呈现'

            command = "SELECT id FROM {table} WHERE id={product_id}". \
                format(table=config.jd_item_table, product_id=product_id)
            result = sql.query_one(command)

            if result is None:
                # Unseen product: spawn the real-time analysis crawler.
                name = 'jd'
                cmd = 'cd {dir};python manage.py real_time_analysis -a name={name} -a guid={guid} ' \
                      '-a product_id={product_id} -a url={url};'. \
                    format(url=str(url), name=name, dir=settings.BASE_DIR,
                           guid=data.get('guid'), product_id=product_id)
                subprocess.Popen(cmd, shell=True)
            else:
                if force == 'false':
                    utils.log('数据库中存在数据,从数据库中取出分析结果')
                    command = "SELECT * FROM {0} WHERE product_id={1} ORDER BY id". \
                        format(config.analysis_item_table, product_id)
                    result = sql.query(command)
                    for res in result:
                        utils.push_redis(data.get('guid'), res[1], res[2], res[3],
                                         save_to_mysql=False)
                else:
                    # FIX: column was misspelled ``produce_id``; every other
                    # query on this table filters on ``product_id``.
                    command = "DELETE FROM {0} WHERE product_id={1}".format(
                            config.analysis_item_table, product_id)
                    sql.execute(command)

                    # Re-run the analysis from scratch.
                    cmd = 'cd {dir};python manage.py analysis -a url={url} -a name={name} -a guid={guid} -a ' \
                          'product_id={product_id};'. \
                        format(url=url, name='jd', dir=settings.BASE_DIR,
                               guid=data.get('guid'), product_id=product_id)
                    subprocess.Popen(cmd, shell=True)
        else:
            data['info'] = '传入网址有误,请检查后重新输入,请输入以下格式的网址:\n%s' % \
                           'https://item.jd.com/3995645.html'
    except Exception as e:
        logging.error('run spider exception:%s' % e)
        data['info'] = '出现错误,错误原因:%s' % e
class RunAnalysis(object):
    """Drive one queued taobao comment-analysis job end to end: crawl the
    item info and its comments, run the analysis, persist and email the
    result, then clear the redis entry that triggered the job.
    """

    def __init__(self, red, key, user):
        # red: redis connection; key: redis key holding the job payload;
        # user: json string with product_id / url / email / guid.
        self.key = key
        self.red = red

        data = json.loads(user)
        self.product_id = data.get('product_id')
        self.url = data.get('url')
        self.email = data.get('email')
        self.guid = data.get('guid')
        self.spider_name = 'tb_comment'

        # spargs is forwarded wholesale to the spiders and the analyser;
        # the redis and sql handles ride along inside it.
        self.spargs = data
        self.sql = SqlHelper()
        self.spargs['red'] = self.red
        self.spargs['sql'] = self.sql

        if not os.path.exists('log'):
            os.makedirs('log')

        configure_logging(install_root_handler=False)
        logging.basicConfig(
            filename='log/%s.log' % self.product_id,
            format='%(levelname)s %(asctime)s: %(message)s',
            level=logging.DEBUG
        )

    def run(self):
        # The four phases must run in this order: crawl -> analyse ->
        # notify -> cleanup.
        self.runspider()
        self.analysis()
        self.send_notice()
        self.clear_cache()

    # Run the crawl: fetch the item info then all of its comments.
    def runspider(self):
        configure_logging(install_root_handler=False)
        s = get_project_settings()
        runner = CrawlerRunner(settings=s)

        @defer.inlineCallbacks
        def crawl(**spargs):
            yield runner.crawl(TBItemInfoSpider, **spargs)
            yield runner.crawl(TBCommentSpider, **spargs)
            reactor.stop()

        crawl(**self.spargs)
        reactor.run()  # the script will block here until the last crawl call is finished

    # Run the analysis over the crawled comments and persist the result.
    def analysis(self):
        analysis = Analysis(**self.spargs)
        result = analysis.run()

        tb_comment = TBCommentAnalysis(
            id=None, guid=self.guid, product_id=self.product_id,
            item_name='name', content=result, email=self.email,
            create_time=datetime.datetime.now())
        tb_comment.save()

    # Email the analysis-result link to the address the user left.
    def send_notice(self):
        subject = '淘宝店铺 - 商品评价分析结果展示'
        blog_url = '%stb/full_result/%s' % ('http://127.0.0.1:8000/', self.guid)

        command = "SELECT name FROM {0} WHERE id={1}".format(config.tb_item_table, self.product_id)
        (item_name,) = self.sql.query_one(command)

        body = '''
        您好~ 您订阅的淘宝店铺商品评价信息分析服务已经完成。商品名称:{item_name},商品链接:{tb_url},分析结果请见:{blog_url}
        '''.format(tb_url=self.url, blog_url=blog_url, item_name=item_name)

        send_email(to_email=self.email, subject=subject, body=body)

    # Remove the redis job entry so the job is not picked up again.
    def clear_cache(self):
        data = self.red.delete(self.key)
        # FIX: log-message typo ('clear_cacha' -> 'clear_cache').
        logging.debug('clear_cache data:%s' % data)
def runspider(request):
    """Handle a taobao shop-rating analysis request: extract the shop/user
    id from a ``rate.taobao.com/user-rate-<id>.htm`` url, then either spawn
    a crawler, replay cached rows, or (with ``force``) re-analyse.

    NOTE(review): ``data`` is built as the response payload but nothing is
    returned here -- confirm the caller serializes it elsewhere.
    """
    data = {
        'status': 'failure',
        'guid': '0',
        'info': '',
    }

    try:
        # Production uses a POST request.
        url = request.POST.get('url')
        force = request.POST.get('force', 'false')

        # Extract the id that follows 'user-rate-' in the url.
        pattern = re.compile('user-rate-')
        urls = re.split(pattern, url)
        user_id = urls[1]
        pattern = re.compile(r'\w+', re.S)  # FIX: raw string for the regex
        user_id = re.search(pattern, user_id).group()

        sql = SqlHelper()
        utils.log('user_id:%s' % user_id)

        if 'rate.taobao.com' in url and user_id is not None:
            data['status'] = 'success'
            data['guid'] = str(random.randint(1000000000000, 9999999999999)) + '_' + \
                           str(random.randint(100, 999))
            # FIX: the original line ended with a trailing comma, storing a
            # one-element tuple instead of the message string.
            data['info'] = '成功接收数据,正在为您抓取并分析数据,精彩稍候呈现'

            command = "SELECT id FROM {table} WHERE id={user_id}". \
                format(table=config.tb_item_table, user_id=user_id)
            result = sql.query_one(command)

            if result is None:
                name = 'tb_comment'
                # FIX: added the 'cd {dir};' prefix -- the format call already
                # supplied dir but the command never used it, so manage.py was
                # only found when cwd happened to be BASE_DIR (the sibling JD
                # handler and the re-analysis branch below both cd first).
                cmd = 'cd {dir};python manage.py real_time_analysis -a name={name} -a guid={guid} ' \
                      '-a user_id={user_id} -a url={url};'. \
                    format(url=str(url), name=name, dir=settings.BASE_DIR,
                           guid=data.get('guid'), user_id=user_id)
                logging.warn(cmd)
                subprocess.Popen(cmd, shell=True)
            else:
                if force == 'false':
                    utils.log('数据库中存在数据,从数据库中取出分析结果')
                    command = "SELECT * FROM {0} WHERE user_id={1} ORDER BY id". \
                        format(config.analysis_item_table, user_id)
                    result = sql.query(command)
                    for res in result:
                        utils.push_redis(data.get('guid'), res[1], res[2], res[3],
                                         save_to_mysql=False)
                else:
                    # FIX: column was misspelled ``produce_id``; this handler
                    # filters the same table on ``user_id`` above.
                    command = "DELETE FROM {0} WHERE user_id={1}".format(
                            config.analysis_item_table, user_id)
                    sql.execute(command)

                    # Re-run the analysis from scratch.
                    cmd = 'cd {dir};python manage.py analysis -a url={url} -a name={name} -a guid={guid} -a ' \
                          'user_id={user_id};'. \
                        format(url=url, name='tb', dir=settings.BASE_DIR,
                               guid=data.get('guid'), user_id=user_id)
                    subprocess.Popen(cmd, shell=True)
        else:
            data['info'] = '传入网址有误,请检查后重新输入,请输入以下格式的网址:\n%s' % \
                           'https://rate.taobao.com/user-rate-UvGv0MFc0vFILvgTT.htm'
    except Exception as e:
        logging.error('run spider exception:%s' % e)
        data['info'] = '出现错误,错误原因:%s' % e