def get_rate(conn):
    """
    Fetch the latest USD exchange rate and update it in the database.
    :param conn: database connection
    :return:
    """
    now = datetime.datetime.now()
    current_time = now.strftime("%Y-%m-%d %H:%M:%S")
    # Re-parse the formatted string to get a datetime truncated to whole seconds
    stamp_current_time = datetime.datetime.strptime(current_time, '%Y-%m-%d %H:%M:%S')
    c = CurrencyRates()
    stock_price = c.get_rate('USD', 'CNY')  # latest USD -> CNY rate
    stock_time = str(current_time)
    stock_name = '人民币汇率'
    stock_applies = None
    spider_data = stamp_current_time
    # Update the USD exchange rate row
    sql_update = """
        UPDATE public.stock_code
           SET stock_time = %s, spider_data = %s, stock_name = %s,
               stock_price = %s, stock_applies = %s
         WHERE stock_name = '人民币汇率'
    """
    sql_params = [stock_time, spider_data, stock_name, stock_price, stock_applies]
    logger.debug(sql_update)
    logger.debug(sql_params)
    execute_sql(conn, sql_update, sql_params)
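# The functions in this module rely on imports and setup that would normally
# sit at the top of the file. A reconstructed sketch follows; the logger
# configuration and the value of `chromedriver_path` are assumptions not
# present in the original source.
import datetime
import logging
import random
import re
import sys
import time

import requests
import tushare as ts
from bs4 import BeautifulSoup
from forex_python.converter import CurrencyRates
from lxml import etree
from selenium import webdriver
from xvfbwrapper import Xvfb

logger = logging.getLogger(__name__)
# Assumed path; despite the variable name, webdriver.Firefox needs geckodriver
chromedriver_path = '/usr/local/bin/geckodriver'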
def main():
    now = datetime.datetime.now()
    current_time = now.strftime("%Y-%m-%d %H:%M:%S")
    one_day = now - datetime.timedelta(days=1)
    one_day_ago = one_day.strftime('%Y-%m-%d %H:%M:%S')[:16]
    history_day = (now - datetime.timedelta(days=10)).strftime('%Y-%m-%d %H:%M:%S')[:16]
    conn = None
    try:
        conn = get_conn()
        with conn:
            # Latest stored news date per source; fall back to one day ago if empty
            sql_max_date = """
                SELECT max(CASE WHEN news_source='新浪财经' THEN news_date END),
                       max(CASE WHEN news_source='华尔街见闻' THEN news_date END)
                  FROM news_cj
            """
            res = execute_select(conn, sql_max_date)
            max_date_sina = res[0][0] if res[0][0] else one_day_ago
            max_date_news = res[0][1] if res[0][1] else one_day_ago
            # Drop stale rows plus the boundary rows that will be re-crawled
            sql_delete = """
                DELETE FROM news_cj
                 WHERE news_date <= %s
                    OR (news_source='华尔街见闻' AND news_date=%s)
                    OR (news_source='新浪财经' AND news_date=%s)
            """
            execute_sql(conn, sql_delete, (history_day, max_date_news, max_date_sina))
            get_news(conn, max_date_news, current_time)
            get_sina_news(conn, max_date_sina, current_time)
    except Exception as e:
        logger.error(str(e))
    finally:
        if conn:
            conn.close()
        sys.exit()
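# `get_conn`, `execute_sql`, and `execute_select` are used throughout this
# module but never defined in it. A minimal sketch of what they might look
# like, assuming a PostgreSQL database accessed via psycopg2 (the DSN values
# are placeholders):
import psycopg2

def get_conn():
    # Placeholder credentials; replace with the real DSN.
    return psycopg2.connect(host='localhost', dbname='spider',
                            user='spider', password='secret')

def execute_sql(conn, sql, params=None):
    # Run a statement that modifies data and commit it.
    with conn.cursor() as cur:
        cur.execute(sql, params)
    conn.commit()

def execute_select(conn, sql, params=None):
    # Run a query and return all rows.
    with conn.cursor() as cur:
        cur.execute(sql, params)
        return cur.fetchall()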
def get_code1(conn):
    """
    Fetch market index quotes (e.g. the Shanghai and Shenzhen composite
    indices) and store them using the regular storage scheme.
    :return:
    """
    df_index = ts.get_index()
    sql_market = """
        INSERT INTO middle_news_market(
            code, name, change_market, open_market, preclose,
            close_market, high, low, volume, amount
        ) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    # Store the first 25 index rows returned by tushare
    for i in range(0, 25):
        code = df_index['code'][i]
        name = df_index['name'][i]
        change_market = str('%.2f' % df_index['change'][i])
        open_market = str('%.4f' % df_index['open'][i])
        preclose = str('%.4f' % df_index['preclose'][i])
        close = str('%.4f' % df_index['close'][i])
        high = str('%.4f' % df_index['high'][i])
        low = str('%.4f' % df_index['low'][i])
        volume = str(df_index['volume'][i])
        amount = str('%.4f' % df_index['amount'][i])
        sql_params = [
            code, name, change_market, open_market, preclose,
            close, high, low, volume, amount
        ]
        logger.debug(sql_market)
        logger.debug(sql_params)
        # Persist one index row
        execute_sql(conn, sql_market, sql_params)
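# A tighter variant of the loop above: iterate over however many rows the
# DataFrame actually has (instead of a hard-coded 25) and insert them in one
# round trip with executemany. This is a sketch, not part of the original code.
def get_code1_batch(conn):
    df_index = ts.get_index()
    sql_market = """
        INSERT INTO middle_news_market(
            code, name, change_market, open_market, preclose,
            close_market, high, low, volume, amount
        ) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    rows = [
        (r.code, r.name, '%.2f' % r.change, '%.4f' % r.open, '%.4f' % r.preclose,
         '%.4f' % r.close, '%.4f' % r.high, '%.4f' % r.low,
         str(r.volume), '%.4f' % r.amount)
        for r in df_index.itertuples(index=False)
    ]
    with conn.cursor() as cur:
        cur.executemany(sql_market, rows)
    conn.commit()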
def get_news(conn, max_date, current_time):
    """
    Crawl the Wallstreetcn (华尔街见闻) global live feed.
    :param conn:
    :param max_date: date of the newest news item already in the database
    :param current_time: current time
    :return:
    """
    func_name = "crawl Wallstreetcn news"
    logger.debug('start %s ' % func_name)
    spider_data = datetime.datetime.strptime(current_time, '%Y-%m-%d %H:%M:%S')
    driver = None
    xvfb = None
    try:
        xvfb = Xvfb(width=1280, height=720)
        xvfb.start()
        driver = webdriver.Firefox(executable_path=chromedriver_path)
        driver.get('https://wallstreetcn.com/live/global')
        # Scroll the page down to trigger lazy loading:
        # window.scrollBy(0, scrollStep), where scrollStep is the scroll increment
        js = 'window.scrollBy(0,3000)'
        driver.execute_script(js)
        time.sleep(5)
        js = 'window.scrollBy(0,60000)'
        driver.execute_script(js)
        time.sleep(5)
        pages = driver.page_source
        soup = BeautifulSoup(pages, 'html.parser')
        soup1 = soup.find('div', class_='livenews-main')
        content = soup1.find_all('div', class_='live-item')
        news_source = '华尔街见闻'
        news_type = '宏观'
        last_news_time = '23:59'
        d_date = datetime.datetime.strptime(current_time, '%Y-%m-%d %H:%M:%S')
        for cont in content:
            news_time = cont.find('time', attrs={'class': 'live-item_created'}).get_text()
            news = cont.find('div', attrs={'class': 'live-item_main'}).find(
                'div', attrs={'class': 'live-item_html'})
            if news is None:
                return
            news = news.get_text().strip().replace('//', '')
            # The feed only shows HH:MM; when the time jumps forward we have
            # crossed a day boundary, so step the date back one day
            if last_news_time < news_time:
                d_date = d_date - datetime.timedelta(days=1)
            s_date = d_date.strftime("%Y-%m-%d")
            over_time = s_date + ' ' + news_time
            if max_date > over_time:
                # Everything from here on is already stored
                break
            sql_params = [over_time, spider_data, news_source, news_type, news]
            logger.debug(sql_cj)
            logger.debug(sql_params)
            execute_sql(conn, sql_cj, sql_params)
            last_news_time = news_time
        logger.debug('end %s ' % func_name)
    except Exception as e:
        msg = func_name + ' failed: ' + str(e)
        logger.error(msg)
    finally:
        if driver:
            driver.quit()
        if xvfb:
            xvfb.stop()
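# `sql_cj` is referenced by get_news and get_sina_news but never defined in
# this module. Given the parameter order [over_time, spider_data, news_source,
# news_type, news] and the news_cj table used in main(), it is presumably an
# INSERT like this sketch (the news_type and news_content column names are
# assumptions):
sql_cj = """
    INSERT INTO news_cj(news_date, spider_data, news_source, news_type, news_content)
    VALUES(%s, %s, %s, %s, %s)
"""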
def del_his_info(conn, his_dtm):
    """
    Given the data volume and the short shelf life of news, delete news items
    dated more than two months ago.
    :param conn:
    :param his_dtm: current date minus 60 days
    :return:
    """
    del_fun_nm = " delete historical data "
    logger.debug('start %s ' % del_fun_nm)
    sql_del_params = (his_dtm, )
    sql_del = " DELETE FROM infos.rslt_url WHERE pub_dtm < %s "
    execute_sql(conn, sql_del, sql_del_params)
    logger.debug('end %s ' % del_fun_nm)
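# Example call: compute the 60-day cutoff the docstring describes and pass it
# in. The timestamp format is an assumption matching the rest of this module.
def purge_old_urls(conn):
    his_dtm = (datetime.datetime.now()
               - datetime.timedelta(days=60)).strftime('%Y-%m-%d %H:%M:%S')
    del_his_info(conn, his_dtm)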
def remv_repeat_info(conn, sql_params):
    """
    Insert only if the URL is not already stored (deduplication check).
    :param conn:
    :param sql_params:
    :return:
    """
    sql_rept = " SELECT url_nm FROM infos.rslt_url WHERE url_nm=%s "
    rept_re = execute_select(conn, sql_rept, (sql_params[0], ))
    if len(rept_re) == 0:
        logger.debug('insert sql: %s' % sql_info)
        logger.debug('params: %s' % sql_params)
        execute_sql(conn, sql_info, sql_params)
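# `sql_info` is another module-level statement that is used but not defined
# here. Based on the dedup check above (url_nm is the first parameter) and the
# pub_dtm column used by del_his_info, it is presumably shaped like this
# sketch; any further columns are unknown:
sql_info = """
    INSERT INTO infos.rslt_url(url_nm, pub_dtm)
    VALUES(%s, %s)
"""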
def spider_Item(self, item, spider):
    try:
        with conn:
            sql_repeat = """
                SELECT * FROM public.db_movie WHERE user_name=%s
            """
            print('pipelines')
            # Parameters must be passed as a sequence, not a bare string
            res = execute_select(conn, sql_repeat, (item['user_name'], ))
            if not res:
                sql_insert = """
                    INSERT INTO public.db_movie(user_name, comment_time, film_critics)
                    VALUES(%s, %s, %s)
                """
                execute_sql(conn, sql_insert,
                            (item['user_name'], item['comment_time'], item['film_critics']))
                print('inserted row')
    finally:
        if conn:
            conn.close()
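# Note: in a standard Scrapy pipeline this hook is called process_item and
# must return the item, and the connection would be opened once in open_spider
# rather than closed after every item. A minimal sketch of that shape (the
# class name is an assumption; the hook names follow Scrapy's pipeline API):
class DbMoviePipeline:
    def open_spider(self, spider):
        self.conn = get_conn()

    def process_item(self, item, spider):
        res = execute_select(self.conn,
                             "SELECT 1 FROM public.db_movie WHERE user_name=%s",
                             (item['user_name'], ))
        if not res:
            execute_sql(self.conn,
                        "INSERT INTO public.db_movie(user_name, comment_time, film_critics) "
                        "VALUES(%s, %s, %s)",
                        (item['user_name'], item['comment_time'], item['film_critics']))
        return item

    def close_spider(self, spider):
        self.conn.close()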
def get_sina_news(conn, max_date, current_time):
    """
    Crawl the breaking-news live feed of Sina Finance (7x24).
    :param conn:
    :param max_date: date of the newest news item already in the database
    :param current_time: current time
    :return:
    """
    func_name = "crawl Sina Finance news"
    logger.debug('start %s ' % func_name)
    spider_data = datetime.datetime.strptime(current_time, '%Y-%m-%d %H:%M:%S')
    driver = None
    xvfb = None
    try:
        xvfb = Xvfb(width=1280, height=720)
        xvfb.start()
        driver = webdriver.Firefox(executable_path=chromedriver_path)
        for num in range(1, 2):
            # url = 'http://live.sina.com.cn/zt/app_zt/f/v/finance/globalnews1/?page=' + str(num)
            url = 'http://finance.sina.com.cn/7x24/'
            driver.get(url)
            # Scroll the page down to trigger lazy loading
            js = 'window.scrollBy(0,3000)'
            driver.execute_script(js)
            time.sleep(5)
            js = 'window.scrollBy(0,5000)'
            driver.execute_script(js)
            time.sleep(5)
            pages = driver.page_source
            xml = etree.HTML(pages)
            time_list = xml.xpath('//*[@class="bd_c0"]/div[@class="bd_list"]/div["bd_i"]/@data-time')
            soup = BeautifulSoup(pages, 'html.parser')
            save_file(soup.encode('utf-8'))
            soup1 = soup.find('div', id='liveList01')
            content = soup1.select('.bd_i')
            news_source = '新浪财经'
            for i in range(len(time_list)):
                time_stamp = time_list[i]
                data = content[i]
                over_time_1 = data.find('p', attrs={'class': 'bd_i_time_c'}).get_text()
                over_time = time_stamp + over_time_1
                over_time_d = datetime.datetime.strptime(over_time, "%Y%m%d%H:%M:%S")
                over_time = datetime.datetime.strftime(over_time_d, "%Y-%m-%d %H:%M:%S")
                if max_date <= over_time:
                    # data_type = data.find('p', attrs={'class': 'bd_i_tags'}).get_text().strip().replace("\n", "")
                    # news_type = data_type.replace(' ', '')
                    news_type = ''
                    try:
                        message = data.find('p', attrs={'class': 'bd_i_txt_c'}).get_text()
                        mes = re.sub(r"http(.*)", '', message)  # strip trailing links
                        news = re.sub(r'\s$', '', mes)
                    except Exception as e:
                        logger.error(e)
                        continue  # skip this item rather than reuse a stale `news`
                    sql_params = [over_time, spider_data, news_source, news_type, news]
                    logger.debug(sql_cj)
                    logger.debug(sql_params)
                    execute_sql(conn, sql_cj, sql_params)
                else:
                    # Older than what is already stored; stop
                    return
        logger.debug('end %s ' % func_name)
    except Exception as e:
        msg = func_name + ' failed: ' + str(e)
        logger.error(msg)
    finally:
        if driver:
            driver.quit()
        if xvfb:
            xvfb.stop()
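# `save_file` is used above to dump the rendered page for debugging but is not
# defined in this module. A minimal sketch (the output file name is an
# assumption):
def save_file(page_bytes):
    # Write the rendered page snapshot to disk for offline inspection.
    with open('sina_7x24_snapshot.html', 'wb') as f:
        f.write(page_bytes)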
def sina(conn, ips):
    logger.debug('collecting Sina Weibo comments...sina')
    ip = random.choice(ips)  # pick a proxy IP
    uid = '4193705642468999'
    url = 'https://m.weibo.cn/single/rcList?format=cards&id=' + uid + '&type=comment&hot=0&page={}'
    i = 200
    comment_num = 1  # running comment counter
    try:
        for i in range(i + 1, 67000):
            ip = random.choice(ips)
            proxies = {'http': ip}
            headers = {
                "Accept": "application/json, text/javascript, */*; q=0.01",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "zh-CN,zh;q=0.9",
                "Connection": "keep-alive",
                "Cookie": "your cookie here",
                "Host": "m.weibo.cn",
                "Referer": "https://m.weibo.cn/status/" + uid,
                "User-Agent": random.choice(ua_list),
                "X-Requested-With": "XMLHttpRequest",
            }
            logger.debug(proxies)
            try:
                logger.debug(url.format(i))
                res = requests.get(url=url.format(i), headers=headers, proxies=proxies)
                r = res.json()
                content = r[0]['card_group']
                if res.status_code == 200:
                    logger.debug('fetching page %s of comments' % i)
                    for j in range(0, len(content)):
                        logger.debug('comment no. %s' % comment_num)
                        hot_data = content[j]
                        comment_id = hot_data['user']['id']  # user id
                        user_name = hot_data['user']['screen_name']  # user name
                        created_at = hot_data['created_at']  # comment time
                        # Comment text: strip HTML tags, reply prefixes and emoji
                        comment = re.sub(
                            '<.*?>|回复<.*?>:|[\U00010000-\U0010ffff]|[\uD800-\uDBFF][\uDC00-\uDFFF]',
                            '', hot_data['text'])
                        like_counts = hot_data['like_counts']  # number of likes
                        sql_params = [
                            comment_id, user_name, created_at, comment, like_counts
                        ]
                        logger.debug(sql_params)
                        execute_sql(conn, sql, sql_params)
                        comment_num += 1
                time.sleep(random.randint(2, 5))
            except requests.exceptions.ConnectionError:
                logger.debug('ConnectionError')
                if not ips:
                    logger.debug('no usable proxy IPs left')
                    sys.exit()
                # Drop the dead proxy IP
                if ip in ips:
                    ips.remove(ip)
    except Exception as e:
        logger.error(e)
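# `sql` and `ua_list` are used by sina() but never defined in this module.
# Plausible sketches follow; the table and column names are assumptions
# inferred from the parameter order [comment_id, user_name, created_at,
# comment, like_counts], and the user agents are ordinary examples:
sql = """
    INSERT INTO public.weibo_comment(comment_id, user_name, created_at, comment, like_counts)
    VALUES(%s, %s, %s, %s, %s)
"""
ua_list = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/605.1.15 '
    '(KHTML, like Gecko) Version/13.0 Safari/605.1.15',
]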