def store2pg(ps=None, url=None, flag=None):
    '''Store a raw page into PostgreSQL.

    ps: page source
    url: page url
    flag: source-site flag
    '''
    sess = session()  # create the session before try so rollback/close in except/finally cannot hit an unbound name
    try:
        uid, _obj = tran2pg(ps=ps, url=url, flag=flag)
        sess.add(_obj)
        sess.commit()
        return uid
    except Exception:
        sess.rollback()
        traceback.print_exc()
        return False
    finally:
        sess.close()
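# A minimal usage sketch of store2pg (assumptions: tran2pg maps the raw page
# into a (uid, ORM row) pair and ``flag`` tags the source site; the url and
# helper name below are illustrative only, not part of the original module).
def _demo_store2pg(page_source):
    uid = store2pg(ps=page_source,
                   url='http://finance.eastmoney.com/news/cjjsp_1.html',
                   flag='dongfangcaifu')
    if uid is False:
        print('pg insert failed')  # store2pg returns False after rollback
    return uid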
def store2pg_parse(url=None, author=None, public_time=None, page_source=None,
                   content=None, website_name=None, channel_name=None,
                   title=None, topic=None, tag=None, meta_keywords=None,
                   pic=None, flag=None):
    '''Store a parsed article into PostgreSQL.

    page_source: raw html
    url: page url
    '''
    sess = session()  # created before try so rollback/close are always safe
    try:
        uid, _obj = tran2pg_parse(url=url, author=author, public_time=public_time,
                                  page_source=page_source, content=content,
                                  website_name=website_name, channel_name=channel_name,
                                  title=title, topic=topic, tag=tag,
                                  meta_keywords=meta_keywords, flag=flag, pic=pic)
        sess.add(_obj)
        sess.commit()
        print('insert into pg done')
        return uid
    except Exception as e:
        print(e)
        sess.rollback()
        return False
    finally:
        sess.close()
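# A usage sketch for the parsed-article writer (assumptions: tran2pg_parse
# builds the Jz_dongfangcaifu_content row; the ``article`` dict keys and the
# helper name are illustrative, not part of the original module).
def _demo_store2pg_parse(url, html, article):
    return store2pg_parse(url=url,
                          page_source=html,
                          content=article.get('content'),
                          title=article.get('title'),
                          author=article.get('author'),
                          public_time=article.get('public_time'),
                          website_name='东方财富',
                          channel_name=article.get('channel'),
                          flag='dongfangcaifu')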
""" import __init__ import requests import re import time import random from mrq.task import Task from mrq.job import queue_job from mrq.context import log, traceback from retrying import retry from __dber.pg_client import session, session1 from __dber.pg_orm_dongfangcaifu import Jz_dongfangcaifu_PageSource, Jz_dongfangcaifu_content from __utils.base import get_date_str, get_header, get_now_str, str_simhash, get_proxy_redis sess = session() sess1 = session1() class Start(Task): def run(self, params): #初始化的时候,选用页面长度为30 #每日增量,只要一页就可以 url_s1 = ('http://finance.eastmoney.com/news/cjjsp_%s.html', '经济时评') url_s2 = ('http://finance.eastmoney.com/news/cgnjj_%s.html', '国内经济') url_s3 = ('http://finance.eastmoney.com/news/cgjjj_%s.html', '国际经济') # for i in range(1,4): for i in range(1, 26): url1 = url_s1[0] % str(i) log.info('入队列 jz_cj_pagesource') queue_job('main_dongfangcaifu.Crawler1', {