# coding=utf-8
import json
import re

import bs4

from util.my_logger import log
# Assumed import path for RequestUtil; a hedged sketch of this helper
# follows parse_sub_page below.
from util.request_util import RequestUtil


class Eastmoney(object):

    def __init__(self):
        # Parsed articles keyed by URL, so the same page is never parsed twice.
        self.results = {}

    def get(self):
        # Eastmoney channel page to crawl
        url = 'http://stock.eastmoney.com/a/chyyj.html'
        # url = 'http://finance.eastmoney.com/a/ccjdd.html'
        # url = 'http://finance.eastmoney.com/a/202008121590598091.html'
        response = RequestUtil.get(url)
        code = response.status_code
        result = response.text
        log.info(code)
        if code != 200:
            return None

        # Parse the channel page and collect links to article sub-pages.
        soup = bs4.BeautifulSoup(result, 'html.parser')
        pls = soup.find_all('a', href=re.compile(r'finance.eastmoney.com/a/'))
        self.parse_sub_page(pls)

        log.info('Parsing finished, printing data...')
        for url, article in self.results.items():
            log.info(url)
            log.info(json.dumps(article, ensure_ascii=False))
    def parse_sub_page(self, pls):
        for p in pls:
            url = p['href']
            # Skip URLs that have already been parsed.
            if url in self.results:
                log.info(url + "\tparse repetition")
                continue
            log.info(url)
            response = RequestUtil.get(url)
            html = response.text

            soup = bs4.BeautifulSoup(html, 'html.parser')
            # Strip tags we do not want in the extracted content.
            for s in soup(['img', 'iframe', 'video']):
                s.extract()

            news = soup.find('div', class_='newsContent')
            if news is None:
                # Not an article page; skip it.
                continue
            h1 = news.find('h1')
            time = news.find('div', class_='time')
            source = news.find('div', class_='source data-source')
            content_body = news.find('div', id=re.compile('ContentBody'))

            result = {
                'title': h1.get_text(),
                'time': time.get_text(),
                'source': source.get_text().replace("来源:", ""),
                'content': str(content_body)
            }
            self.results.setdefault(url, result)
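# RequestUtil is used above but its implementation is not part of this
# excerpt. Below is a minimal, hypothetical sketch of what such a helper
# might look like, assuming it is a thin wrapper around the requests
# library; the module path, headers, and timeout are all assumptions.
import requests


class RequestUtil(object):
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }

    @classmethod
    def get(cls, url, **kwargs):
        # A classmethod supports both call styles seen in this code:
        # RequestUtil.get(url) and RequestUtil().get(url).
        kwargs.setdefault('headers', cls.HEADERS)
        kwargs.setdefault('timeout', 10)
        return requests.get(url, **kwargs)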
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from util.my_logger import log

# Shared session handle; the entry script assigns get_session()'s return
# value to this once at startup (get_session itself never sets it).
DBSession = None


def get_session(user, password, host, port, db):
    try:
        if DBSession is None:
            url = 'mysql+pymysql://{}:{}@{}:{}/{}'.format(
                user, password, host, port, db)
            # Note: this logs the connection URL including the password.
            log.info(url)
            log.info("Connecting to the database...")
            # engine = create_engine(url, echo=True)
            engine = create_engine(url)
            log.info("Database connected...")
            Session = sessionmaker(bind=engine)
            log.info("Session created")
            return Session()
        return DBSession
    except Exception as e:
        log.error("Database connection failed...")
        log.error(e.args)
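# Usage sketch: get_session only reads the module-level DBSession, so the
# caller is responsible for storing the returned session there, as the
# entry script below does; the credentials here mirror that script.
import database
from entity.user_test import UserTest

database.DBSession = database.get_session(
    "root", "root", "localhost", "3306", "test")

# Later queries reuse the shared session:
rows = database.DBSession.query(UserTest).all()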
from util.my_logger import log
# Assumed import path for RequestUtil (see the sketch above).
from util.request_util import RequestUtil


def test_req():
    log.info("Starting requests...")
    req = RequestUtil()
    i = 0
    while i < 10:
        # Index trend line and anchor endpoints from cls.cn (CailianpressWeb).
        res = req.get(
            'https://x-quote.cls.cn/quote/index/tline?app=CailianpressWeb&date=20200821&os=web&sv=7.2.2&sign=19451680ce43b6be73f28481e91cfc32'
        )
        res2 = req.get(
            'https://www.cls.cn/v3/transaction/anchor?app=CailianpressWeb&cdate=2020-08-21&os=web&sv=7.2.2&sign=c1a1f220a04f4aa92d04bc57bc4a9836'
        )
        log.info(res.status_code)
        log.info(res.text)
        log.info(res2.status_code)
        log.info(res2.text)
        i += 1
    log.info("Requests finished...")
    log.info("")
    # Method on the UserTest entity; the class definition itself is not
    # part of this excerpt (a hypothetical sketch of it follows below).
    def get(self):
        # Fetch a single row by primary key; .one() raises if the query
        # does not return exactly one row.
        ut = database.DBSession.query(UserTest).filter(UserTest.id == 41).one()
        log.info(ut.id)
        log.info(ut.name)
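# Hypothetical sketch of the UserTest entity. Only the id and name
# attributes are confirmed by the query above; the table name, column
# types, and declarative base are assumptions.
from sqlalchemy import Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()


class UserTest(Base):
    __tablename__ = 'user_test'

    id = Column(Integer, primary_key=True)
    name = Column(String(64))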
# coding=utf-8
import request
import database
from service.eastmoney import Eastmoney
from util.my_logger import log
from util.my_schedule import run_schedule
from util.my_thread_pool_executor import MyThreadPoolExecutor
from entity.user_test import UserTest

if __name__ == '__main__':
    # Connect to the database and store the shared session.
    database.DBSession = database.get_session("root", "root", "localhost",
                                              "3306", "test")
    # Start the scheduled jobs on the thread pool.
    future = MyThreadPoolExecutor.add(run_schedule)
    log.info('start %s', not future.done())
    # Crawl Eastmoney news.
    MyThreadPoolExecutor.add(Eastmoney().get)
    # Request index / sector quote data.
    MyThreadPoolExecutor.add(request.test_req)
    ut = UserTest()
    ut.get()
    log.info('end...')
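# MyThreadPoolExecutor is only ever used above as MyThreadPoolExecutor.add(fn),
# which returns a Future. A minimal, hypothetical sketch of such a wrapper
# over concurrent.futures; the pool size is an assumption.
from concurrent.futures import ThreadPoolExecutor


class MyThreadPoolExecutor(object):
    _pool = ThreadPoolExecutor(max_workers=8)

    @classmethod
    def add(cls, fn, *args, **kwargs):
        # Submit a callable to the shared pool and return its Future.
        return cls._pool.submit(fn, *args, **kwargs)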