def api(sql=None, method='first'):
    """Execute a raw SQL statement on ``mysql_session`` and return its result.

    NOTE(review): ``sql`` is handed to ``mysql_session.execute`` verbatim;
    callers are responsible for any escaping of values embedded in it.

    Args:
        sql: SQL text passed unchanged to ``mysql_session.execute``.
        method: Selects how the result is consumed:
            ``'first'``    - ``.first()``; a row is converted with ``dict()``,
                             a missing row yields None.
            ``'scalar'``   - ``.scalar()``.
            ``'fetchall'`` - ``.fetchall()``.
            ``'execute'``  - run the statement, commit the session and
                             return the result's ``lastrowid``.
            Any other value executes nothing and yields None.

    Returns:
        The value described above, or None if any exception was raised
        (the session is rolled back and the error is logged in that case).
    """
    try:
        data = None
        if method == 'first':
            row = mysql_session.execute(sql).first()
            if row is not None:
                data = dict(row)
        elif method == 'scalar':
            data = mysql_session.execute(sql).scalar()
        elif method == 'fetchall':
            data = mysql_session.execute(sql).fetchall()
        elif method == 'execute':
            result = mysql_session.execute(sql)
            mysql_session.commit()
            data = result.lastrowid

        # Optionally log every statement that went through this function.
        if Config.MYSQL_DEBUG:
            service_logger.warn("sql:api", {"sql": sql})
        return data
    except Exception:
        # Boundary handler: undo the pending transaction, record the
        # traceback, and signal failure to the caller with None.
        mysql_session.rollback()
        service_logger.error("sql:error", {
            "sql": sql,
            "data": traceback.format_exc()
        })
        return None
def update(self, up_str='', id=0):
    """Build and run the ``SqlService.TASK_UPDATE`` statement for one task.

    Args:
        up_str: Update fragment passed to ``SqlService.sql``. Must not be
            empty.
        id: Task id passed to ``SqlService.sql``.

    Returns:
        True after the statement has been handed to ``SqlService.api``;
        False when ``up_str`` is empty, in which case nothing is executed.
    """
    if up_str == '':
        service_logger.error({"msg": "up_str error", "task_id": id})
        # Nothing to update: stop here instead of building and executing a
        # statement from an empty fragment.
        return False

    sql = SqlService.sql(SqlService.TASK_UPDATE, up_str, id)
    SqlService.api(sql, 'execute')
    return True
def tengxun_detail(url, links):
    """Fetch and import every detail page listed in ``links`` via ``Tengxun``.

    Args:
        url: URL of the list page. Substrings of it select the category, and
            it is passed to ``delete_file(url, ext='.list')`` at the end.
        links: Iterable of dicts, each with the keys ``'link'`` and
            ``'image'``. Nothing is done beyond printing when it is empty
            or None.

    Returns:
        None.
    """
    print(json.dumps(links))

    # The first rule with a marker contained in ``url`` wins; no match leaves
    # the category empty.
    # NOTE(review): these are plain substring tests, so a short marker such
    # as 'ai' matches any URL that merely contains those letters.
    category_rules = (
        (('tech',), ['科技']),
        (('finance',), ['财经']),
        (('edu',), ['教育']),
        (('house',), ['房产']),
        (('visit',), ['旅游']),
        (('internet', 'tcctit', 'ai'), ["互联网"]),
    )
    cate = []
    for markers, label in category_rules:
        if any(marker in url for marker in markers):
            cate = label
            break

    if not links:
        return

    for vo in links:
        link = vo['link']

        # TODO: check the link. Links for which check_url is truthy are skipped.
        if ImportService.check_url(link):
            continue

        # Pause 4-10 seconds (random) before each fetch.
        time.sleep(random.randint(4, 10))

        try:
            page = Tengxun(link)
            # Fill in the category before extracting the content.
            page.set_category(cate)
            data = page.get_content()

            # An image supplied by the list entry overrides the page's own.
            if vo['image'] != '':
                data['image'] = vo['image']
            # Scheme-relative image URLs ("//host/...") get an explicit scheme.
            if data['image'].startswith('//'):
                data['image'] = 'http:' + data['image']

            # Log the extracted record.
            service_logger.warn(data=data)

            # Records without a send time or a title are not stored.
            if data['send_time'] == '' or data['title'] == '':
                continue

            # TODO: save the data.
            ImportService.insert_handle(data)
        except Exception:
            # One failing page must not abort the remaining links.
            service_logger.error("tengxun-exception", {
                "msg": traceback.format_exc(),
                "link": link
            })

        # Delete the file for this link.
        delete_file(link)

    # Delete the list.
    delete_file(url, ext='.list')
def handle(self):
    """Run ``self._handle()`` and log, rather than propagate, any exception.

    ``self.url`` is logged before the task starts. If ``_handle`` raises an
    ``Exception``, the traceback and the URL are logged under
    ``"task-exception"`` and the method returns normally.
    """
    service_logger.log(self.url)
    try:
        self._handle()
    except Exception:
        service_logger.error("task-exception", {
            "msg": traceback.format_exc(),
            "url": self.url
        })
def toutiao_detail(url, links):
    """Fetch and import every detail page listed in ``links`` via ``Toutiao``.

    Args:
        url: URL of the list page. Substrings of it select the category, and
            it is passed to ``delete_file(url, ext='.list')`` at the end.
        links: Iterable of dicts, each with the keys ``'link'`` and
            ``'image'``. Nothing is done beyond printing when it is empty
            or None.

    Returns:
        None.
    """
    print(json.dumps(links))

    # The first rule with a marker contained in ``url`` wins; no match leaves
    # the category empty.
    category_rules = (
        (('news_baby',), ['教育']),
        (('news_travel',), ['旅游']),
        (('人工智能', '大数据'), ['技术']),
    )
    cate = []
    for markers, label in category_rules:
        if any(marker in url for marker in markers):
            cate = label
            break

    if not links:
        return

    for vo in links:
        link = vo['link']

        # TODO: check the link. Links for which check_url is truthy are skipped.
        if ImportService.check_url(link):
            continue

        # Pause 4-10 seconds (random) before each fetch.
        time.sleep(random.randint(4, 10))

        try:
            page = Toutiao(link)
            # Fill in the category, but only when the URL selected one.
            if len(cate) > 0:
                page.set_category(cate)
            data = page.get_content()

            # An image supplied by the list entry overrides the page's own.
            if vo['image'] != '':
                data['image'] = vo['image']
            # Scheme-relative image URLs ("//host/...") get an explicit scheme.
            if data['image'].startswith('//'):
                data['image'] = 'http:' + data['image']

            # Log the extracted record.
            service_logger.warn(data=data)

            # Records without a send time or a title are not stored.
            if data['send_time'] == '' or data['title'] == '':
                continue

            # TODO: save the data.
            ImportService.insert_handle(data)
        except Exception:
            # One failing page must not abort the remaining links.
            service_logger.error("toutiao-exception", {
                "msg": traceback.format_exc(),
                "link": link
            })

        # Delete the file for this link.
        delete_file(link)

    # Delete the list.
    delete_file(url, ext='.list')
def dytt_detail(url, links):
    """Fetch and import every detail page listed in ``links`` via ``Dytt``.

    Args:
        url: URL of the list page. Substrings of it select the category, and
            it is passed to ``delete_file(url, ext='.list')`` at the end.
        links: Iterable of dicts, each with the key ``'link'``. Nothing is
            done beyond printing when it is empty or None.

    Returns:
        None.
    """
    print(json.dumps(links))

    # The first marker contained in ``url`` wins; no match leaves the
    # category empty.
    category_rules = (
        ('jddy', ['综合电影']),
        ('oumei', ['欧美电影']),
        ('china', ['国内电影']),
        ('rihan', ['日韩电影']),
        ('dyzz', ['最新电影']),
    )
    cate = []
    for marker, label in category_rules:
        if marker in url:
            cate = label
            break

    if not links:
        return

    for vo in links:
        link = vo['link']
        print(link)

        # TODO: check the link. Links for which check_url is truthy are skipped.
        if ImportService.check_url(link):
            continue

        # Pause 4-10 seconds (random) before each fetch.
        time.sleep(random.randint(4, 10))

        try:
            page = Dytt(link)
            # Fill in the category before extracting the content.
            page.set_category(cate)
            data = page.get_content(flag=False)

            # Log the extracted record.
            service_logger.warn(data=data)

            # Records without a send time or a title are not stored.
            if data['send_time'] == '' or data['title'] == '':
                continue

            # TODO: save the data (stored with the 'video' argument).
            ImportService.insert_handle(data, 'video')
        except Exception:
            # One failing page must not abort the remaining links.
            service_logger.error("dytt-exception", {
                "msg": traceback.format_exc(),
                "link": link
            })

        # Delete the file for this link.
        delete_file(link)

    # Delete the list.
    delete_file(url, ext='.list')
def upload_image(image, iscut=False, w=300, h=200):
    """Download ``image`` and write the raw bytes below ``Config.DIR_PATH``.

    The stored file is named ``<YYYYmmddHHMMSS>_<5 random digits>.<ext>``,
    where ``ext`` is the text after the last ``.`` of the URL's final path
    segment, or ``jpg`` when that segment contains no dot. The directory
    ``Config.IMAGE_PATH/<year>/<month>`` is created if it does not exist.

    NOTE(review): in the code visible here ``iscut``, ``w``, ``h`` and the
    prepared ``newfile`` path are never used, and the success path returns
    nothing. Confirm whether a processing step is missing.

    Args:
        image: URL of the image to download.
        iscut: Unused in the visible code.
        w: Unused in the visible code.
        h: Unused in the visible code.

    Returns:
        None when ``image`` is empty or after the download has been written;
        ``''`` when the response status is not 200, when the body contains
        ``<!DOCTYPE`` or ``<iframe``, or when any exception occurs.
    """
    if image == '':
        return

    # Sample the clock once so the file name and the year/month directory
    # can never disagree across a second/month boundary.
    now = time.localtime()
    base_name = (time.strftime("%Y%m%d%H%M%S", now) + '_' +
                 str(random.randint(10000, 99999)))

    # Extension: text after the last '.' of the last path segment, else 'jpg'.
    last_segment = image.split('/')[-1]
    _, dot, suffix = last_segment.rpartition('.')
    ext = suffix if dot else 'jpg'
    filename = base_name + '.' + ext

    filepath = (Config.IMAGE_PATH + '/' + time.strftime("%Y", now) +
                '/' + time.strftime("%m", now))
    if not os.path.isdir(filepath):
        os.makedirs(filepath, 0o775)

    # Destination path; prepared here but not used within this block.
    newfile = filepath + '/' + filename
    # Where the untouched download is written.
    oldfile = Config.DIR_PATH + filename

    try:
        # Store the original image.
        headers = {
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36"
        }
        response = requests.get(image, headers=headers)
        if response.status_code != 200:
            return ''

        body = response.content
        # An HTML page instead of image data counts as a failed download.
        # Byte literals keep the test valid whether ``content`` is a
        # Python 2 ``str`` or Python 3 ``bytes``.
        if b'<!DOCTYPE' in body or b'<iframe' in body:
            return ''

        with open(oldfile, "wb") as f:
            f.write(body)
    except Exception:
        service_logger.error("task-exception", {
            "msg": traceback.format_exc(),
            "image": image
        })
        return ''
def error_handle(msg='', data=None):
    """Log an error record and abort by raising ``ApiException``.

    Args:
        msg: Message that is logged and used to construct the exception.
        data: Extra context logged together with ``msg``.

    Raises:
        ApiException: Always, constructed from ``msg``.
    """
    payload = {"msg": msg, "data": data}
    service_logger.error(data=payload)
    raise ApiException(msg)