# -*- coding: utf-8 -*-
import json
import os
import random
import re
import time
import traceback

import requests
from pyquery import PyQuery as pq
from PIL import Image
from resizeimage import resizeimage

# mysql_session, service_logger, Config, PostsModel, Tengxun, Toutiao and
# Dytt are project-local names imported from the project's own modules
# (not shown here).


def api(sql=None, method='first'):
    try:
        data = None
        for case in switch(method):
            if case('first'):
                data = mysql_session.execute(sql).first()
                if data is not None:
                    data = dict(data)
                break
            if case('scalar'):
                data = mysql_session.execute(sql).scalar()
                break
            if case('fetchall'):
                data = mysql_session.execute(sql).fetchall()
                break
            if case('execute'):
                data = mysql_session.execute(sql)
                mysql_session.commit()
                data = data.lastrowid
                break
        # print the SQL when debugging is enabled
        if Config.MYSQL_DEBUG:
            service_logger.warn("sql:api", {"sql": sql})
        return data
    except Exception, err:
        mysql_session.rollback()
        service_logger.error("sql:error", {
            "sql": sql,
            "data": traceback.format_exc()
        })
        return None
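
# api() relies on the classic iterator-based switch/case recipe rather than a
# language construct. A minimal sketch of the helper it assumes follows; the
# project may ship its own variant elsewhere.
class switch(object):
    def __init__(self, value):
        self.value = value
        self.fall = False

    def __iter__(self):
        # yield the match method once, then stop
        yield self.match
        raise StopIteration

    def match(self, *args):
        # True on the first matching case, or for a bare case() fall-through
        if self.fall or not args:
            return True
        elif self.value in args:
            self.fall = True
            return True
        return False

# usage sketch (the table name is only an example):
#   row = api("SELECT * FROM posts WHERE id = 1", method='first')
#   total = api("SELECT COUNT(*) FROM posts", method='scalar')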
def dytt_list(url=''):
    data = []
    # serve the list page from the local cache, or fetch and cache it
    if check_file(url, ext='.list'):
        html = read_file(url, ext='.list')
    else:
        html = get_url_html(url)
        write_file(url, html, ext='.list')
    if html == '':
        return data
    doc = pq(html)
    tables = doc('.co_content8 table').items()
    for tb in tables:
        txt = pq(tb)
        links = txt('.ulink').items()
        for link in links:
            href = pq(link).attr('href')
            # skip pagination/index links, keep only detail pages;
            # build a fresh dict per link so entries are not shared
            if 'index.html' not in href:
                item = {
                    'title': pq(link).text(),
                    'link': 'http://www.ygdy8.net' + href
                }
                data.append(item)
    # log the collected links
    service_logger.warn(data=data)
    return data
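
# The *_list()/*_detail() functions in this module depend on a small set of
# fetch-and-cache helpers defined elsewhere in the project. The sketch below
# illustrates only the assumed contract (the url hashed to a file under a
# cache dir); CACHE_DIR and every body here are assumptions, not the
# project's actual code.
import hashlib

CACHE_DIR = '/tmp/spider_cache'  # hypothetical location


def _cache_path(url, ext=''):
    # one cache file per url; hash the url so the name is filesystem-safe
    return os.path.join(CACHE_DIR, hashlib.md5(url).hexdigest() + ext)


def check_file(url, ext=''):
    return os.path.isfile(_cache_path(url, ext))


def read_file(url, ext=''):
    with open(_cache_path(url, ext), 'rb') as f:
        return f.read()


def write_file(url, content, ext=''):
    if not os.path.isdir(CACHE_DIR):
        os.makedirs(CACHE_DIR)
    with open(_cache_path(url, ext), 'wb') as f:
        f.write(content)


def delete_file(url, ext=''):
    path = _cache_path(url, ext)
    if os.path.isfile(path):
        os.remove(path)


def get_url_html(url, cookie=''):
    # plain GET with a browser UA; returns '' on any failure
    headers = {"User-Agent": "Mozilla/5.0"}
    if cookie != '':
        headers['Cookie'] = cookie
    try:
        resp = requests.get(url, headers=headers, timeout=15)
        return resp.content if resp.status_code == 200 else ''
    except Exception:
        return ''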
def toutiao_list(url=''):
    data = []
    # serve the feed from the local cache, or fetch and cache it
    if check_file(url, ext='.list'):
        html = read_file(url, ext='.list')
    else:
        cookie = 'UM_distinctid=165e23b8bd863a-02b6bf44638b1e-541b371f-100200-165e23b8bd9812; tt_webid=6601789411817768455; WEATHER_CITY=%E5%8C%97%E4%BA%AC; uuid="w:be3b8ee49353488b825ded5ccbcf16b3"; CNZZDATA1259612802=1933104973-1537094542-%7C1539087142; __tasessionId=qgp2gufge1539087164145; csrftoken=afc50bb8fb759393b3c1da8340182cd6; tt_webid=6601789411817768455'
        html = get_url_html(url, cookie)
        write_file(url, html, ext='.list')
    if html == '':
        return data
    # pull article urls out of the JSON feed
    resu = json.loads(html)
    if 'data' in resu:
        for vo in resu['data']:
            if 'item_source_url' in vo and 'media_avatar_url' in vo:
                # keep only relative, non-local urls
                if "http" not in vo['item_source_url'] and 'local//' not in vo['item_source_url']:
                    dt = {
                        'link': 'https://www.toutiao.com' + vo['item_source_url'],
                        'image': vo['media_avatar_url']
                    }
                    data.append(dt)
            elif 'source_url' in vo and 'image_url' in vo:
                if "http" not in vo['source_url'] and 'local//' not in vo['source_url']:
                    dt = {
                        'link': 'https://www.toutiao.com' + vo['source_url'],
                        'image': vo['image_url']
                    }
                    data.append(dt)
    # log the collected links
    service_logger.warn(data=data)
    return data
def tengxun_detail(url, links):
    print json.dumps(links)
    # map the channel in the url to a category
    cate = []
    if 'tech' in url:
        cate = ['科技']  # tech
    elif 'finance' in url:
        cate = ['财经']  # finance
    elif 'edu' in url:
        cate = ['教育']  # education
    elif 'house' in url:
        cate = ['房产']  # real estate
    elif 'visit' in url:
        cate = ['旅游']  # travel
    elif 'internet' in url or 'tcctit' in url or 'ai' in url:
        cate = ['互联网']  # internet
    if len(links) > 0:
        for vo in links:
            # skip links that were already imported
            if ImportService.check_url(vo['link']):
                continue
            # random delay between fetches
            tm = random.randint(4, 10)
            time.sleep(tm)
            try:
                page = Tengxun(vo['link'])
                # fill in the category before parsing
                page.set_category(cate)
                data = page.get_content()
                # prefer the image found on the list page
                if vo['image'] != '':
                    data['image'] = vo['image']
                # protocol-relative image urls need an 'http:' prefix
                if data['image'] != '' and data['image'][0:2] == '//':
                    data['image'] = 'http:' + data['image']
                # log the parsed article
                service_logger.warn(data=data)
                if data['send_time'] == '' or data['title'] == '':
                    continue
                # save the article
                ImportService.insert_handle(data)
            except Exception, err:
                service_logger.error("tengxun-exception", {
                    "msg": traceback.format_exc(),
                    "link": vo['link']
                })
            # drop the cached detail page
            delete_file(vo['link'])
    # drop the cached list page
    delete_file(url, ext='.list')
def toutiao_detail(url, links):
    print json.dumps(links)
    # map the channel in the url to a category
    cate = []
    if 'news_baby' in url:
        cate = ['教育']  # education
    elif 'news_travel' in url:
        cate = ['旅游']  # travel
    elif '人工智能' in url or '大数据' in url:  # AI / big-data channels
        cate = ['技术']  # technology
    if len(links) > 0:
        for vo in links:
            # skip links that were already imported
            if ImportService.check_url(vo['link']):
                continue
            # random delay between fetches
            tm = random.randint(4, 10)
            time.sleep(tm)
            try:
                page = Toutiao(vo['link'])
                # fill in the category before parsing
                if len(cate) > 0:
                    page.set_category(cate)
                data = page.get_content()
                # prefer the image found on the list page
                if vo['image'] != '':
                    data['image'] = vo['image']
                # protocol-relative image urls need an 'http:' prefix
                if data['image'] != '' and data['image'][0:2] == '//':
                    data['image'] = 'http:' + data['image']
                # log the parsed article
                service_logger.warn(data=data)
                if data['send_time'] == '' or data['title'] == '':
                    continue
                # save the article
                ImportService.insert_handle(data)
            except Exception, err:
                service_logger.error("toutiao-exception", {
                    "msg": traceback.format_exc(),
                    "link": vo['link']
                })
            # drop the cached detail page
            delete_file(vo['link'])
    # drop the cached list page
    delete_file(url, ext='.list')
def dytt_detail(url, links):
    print json.dumps(links)
    # map the channel in the url to a category
    cate = []
    if 'jddy' in url:
        cate = ['综合电影']  # general movies
    elif 'oumei' in url:
        cate = ['欧美电影']  # western movies
    elif 'china' in url:
        cate = ['国内电影']  # domestic movies
    elif 'rihan' in url:
        cate = ['日韩电影']  # Japanese/Korean movies
    elif 'dyzz' in url:
        cate = ['最新电影']  # latest movies
    if len(links) > 0:
        for vo in links:
            print vo['link']
            # skip links that were already imported
            if ImportService.check_url(vo['link']):
                continue
            # random delay between fetches
            tm = random.randint(4, 10)
            time.sleep(tm)
            try:
                page = Dytt(vo['link'])
                # fill in the category before parsing
                page.set_category(cate)
                data = page.get_content(flag=False)
                # log the parsed entry
                service_logger.warn(data=data)
                if data['send_time'] == '' or data['title'] == '':
                    continue
                # save as a video post
                ImportService.insert_handle(data, 'video')
            except Exception, err:
                service_logger.error("dytt-exception", {
                    "msg": traceback.format_exc(),
                    "link": vo['link']
                })
            # drop the cached detail page
            delete_file(vo['link'])
    # drop the cached list page
    delete_file(url, ext='.list')
def tengxun_list(url=''):
    data = []
    # serve the list page from the local cache, or fetch and cache it
    if check_file(url, ext='.list'):
        html = read_file(url, ext='.list')
    else:
        html = get_url_html(url)
        # the page is served as GBK; re-encode to UTF-8
        html = unicode(html, 'GBK').encode('UTF-8')
        write_file(url, html, ext='.list')
    # the link list is embedded in the page as a JS object: window.chData={...};
    res = re.findall(r'window\.chData={(.*?)};', html, re.S)
    if len(res) > 0:
        txt = '{' + res[0] + '}'
        arrs = json.loads(txt)
        for vo in arrs['data']:
            dt = {'link': vo['url'], 'image': vo['img']}
            data.append(dt)
    # log the collected links
    service_logger.warn(data=data)
    return data
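
# A typical crawl entry point tying the list and detail steps together. Every
# url below is a hypothetical placeholder; substitute the channel/feed urls
# the project actually targets.
def run_spiders():
    # dytt: movie list page -> movie detail pages ('dyzz' maps to 最新电影)
    dytt_url = 'http://www.ygdy8.net/html/gndy/dyzz/index.html'  # hypothetical
    dytt_detail(dytt_url, dytt_list(dytt_url))

    # tengxun: channel page -> article detail pages ('tech' maps to 科技)
    qq_url = 'http://tech.qq.com/'  # hypothetical
    tengxun_detail(qq_url, tengxun_list(qq_url))

    # toutiao: JSON feed -> article detail pages ('news_travel' maps to 旅游)
    tt_url = 'https://www.toutiao.com/api/pc/feed/?category=news_travel'  # hypothetical
    toutiao_detail(tt_url, toutiao_list(tt_url))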
class ImportService(object):
    @staticmethod
    def check_url(url):
        # True when the url has already been imported
        res = PostsModel.check(url)
        if res:
            return True
        return False

    @staticmethod
    def get_douban_image(name, w=480, h=320):
        # search Baidu for the title and take the first matching thumbnail
        image = ''
        url = 'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=' + name
        html = get_url_html(url)
        doc = pq(html)
        tables = doc('.c-container').items()
        i = 0
        for tb in tables:
            i = i + 1
            txt = pq(tb)
            title = txt.text()
            imgObj = txt('img')
            if name in title:
                # attr() may return None when the result has no image
                image = imgObj.attr('src') or ''
                break
            if i > 8:
                break
        if image != '':
            service_logger.log('baidu image search: ' + image)
            image = ImportService.upload_image(image, iscut=False, w=w, h=h)
        return image

    @staticmethod
    def upload_image(image, iscut=False, w=300, h=200):
        if image == '':
            return ''
        # build a unique file name: timestamp + random suffix + original extension
        fname = time.strftime("%Y%m%d%H%M%S", time.localtime()) + '_' + str(
            random.randint(10000, 99999))
        subs = image.split('/')[-1]
        exts = subs.split('.')
        ext = 'jpg'
        if len(exts) > 1:
            ext = exts[-1]
        filename = fname + '.' + ext
        y = time.strftime("%Y", time.localtime())
        m = time.strftime("%m", time.localtime())
        filepath = Config.IMAGE_PATH + '/' + y + '/' + m
        if not os.path.isdir(filepath):
            os.makedirs(filepath, 0775)
        newfile = filepath + '/' + filename
        oldfile = Config.DIR_PATH + filename
        try:
            # download the original image
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36"
            }
            response = requests.get(image, headers=headers)
            if response.status_code != 200:
                return ''
            # some hosts answer with an html page instead of image bytes
            if '<!DOCTYPE' in response.content or '<iframe' in response.content:
                return ''
            cat_img = response.content
            with open(oldfile, "wb") as f:
                f.write(cat_img)
        except Exception, err:
            service_logger.error("task-exception", {
                "msg": traceback.format_exc(),
                "image": image
            })
            return ''
        if iscut:
            # save a cover-cropped copy when the source is larger than w x h
            with open(oldfile, 'rb') as f:
                with Image.open(f) as img:
                    print img.size
                    if img.size[0] > w or img.size[1] > h:
                        cover = resizeimage.resize_cover(img, [w, h])
                        cover.save(newfile, img.format)
                    else:
                        with open(newfile, 'wb') as fo:
                            fo.write(cat_img)
        else:
            # store the image unchanged
            with open(newfile, 'wb') as f:
                f.write(cat_img)
        # log where the image was stored
        service_logger.warn(data={
            "image": image,
            'old': oldfile,
            'new': newfile
        })
        # remove the temporary original
        os.remove(oldfile)
        return y + '/' + m + '/' + filename
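
# Usage sketch for the image helpers: look up a cover image for a title via
# Baidu and store it under Config.IMAGE_PATH/<year>/<month>/. The title below
# is only an example.
#
#   rel_path = ImportService.get_douban_image(u'肖申克的救赎', w=480, h=320)
#   if rel_path != '':
#       print 'stored at', Config.IMAGE_PATH + '/' + rel_path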