def main():
    now = datetime.datetime.now()
    current_time = now.strftime("%Y-%m-%d %H:%M:%S")
    one_day = now - datetime.timedelta(days=1)
    one_day_ago = one_day.strftime('%Y-%m-%d %H:%M:%S')[:16]
    history_day = (now - datetime.timedelta(days=10)).strftime('%Y-%m-%d %H:%M:%S')[:16]
    conn = None
    try:
        conn = get_conn()
        with conn:
            # Latest stored news_date per source; fall back to one day ago
            # when a source has no rows yet.
            sql_max_date = """
                SELECT max(CASE WHEN news_source='新浪财经' THEN news_date END),
                       max(CASE WHEN news_source='华尔街见闻' THEN news_date END)
                  FROM news_cj
            """
            res = execute_select(conn, sql_max_date)
            max_date_sina = res[0][0] if res[0][0] else one_day_ago
            max_date_news = res[0][1] if res[0][1] else one_day_ago
            # Drop rows older than ten days, plus the rows at each source's
            # latest timestamp, which are re-fetched below.
            sql_delete = """
                DELETE FROM news_cj
                 WHERE news_date <= %s
                    OR (news_source='华尔街见闻' AND news_date=%s)
                    OR (news_source='新浪财经' AND news_date=%s)
            """
            execute_sql(conn, sql_delete, (history_day, max_date_news, max_date_sina))
            get_news(conn, max_date_news, current_time)
            get_sina_news(conn, max_date_sina, current_time)
    except Exception as e:
        logger.error(str(e))
    finally:
        if conn:
            conn.close()
        sys.exit()
def main():
    conn = None
    try:
        get_code()
        conn = get_conn()
        with conn:
            get_rate(conn)
    finally:
        if conn:
            conn.close()
def main():
    conn = None
    try:
        conn = get_conn()
        with conn:
            ips = []
            for i in range(6000):
                # Refresh the proxy pool every 1000 iterations; each refresh
                # fetches up to 100 of the latest proxy IPs.
                if i % 1000 == 0:
                    ips.extend(get_ip())
                sina(conn, ips)
    finally:
        if conn:
            conn.close()
def deal():
    remv_his_log()
    last_mon = (now - datetime.timedelta(days=20)).strftime("%Y-%m-%d %H:%M:%S")
    str_his_dtm = (now - datetime.timedelta(days=60)).strftime("%Y-%m-%d %H:%M:%S")
    logger.debug('last_mon:%s, str_his_dtm:%s ' % (last_mon, str_his_dtm))
    conn = None
    try:
        conn = get_conn()
        with conn:
            del_his_info(conn, str_his_dtm)
            # Load the hot keywords to search for.
            sql_key_word = " SELECT word FROM infos.hot_keyword "
            re_key_word = execute_select(conn, sql_key_word)
            key_word_list = []
            for key_word in re_key_word:
                key_word_list.append(key_word[0])
            # Load the site URLs to crawl.
            sql_url_link = " SELECT url, url_nm FROM infos.url_link "
            re_url_link = execute_select(conn, sql_url_link)
            url_list = []
            for url in re_url_link:
                url_list.append(url[0])
            # Load the related Baijiahao account numbers.
            sql_bj_num = " SELECT bj_num FROM infos.url_link WHERE bj_num>'' "
            re_bj_num = execute_select(conn, sql_bj_num)
            bj_num_list = []
            for bj_num in re_bj_num:
                # Take the column value, not the whole row tuple.
                bj_num_list.append(bj_num[0])
            key_word_srch(conn, last_mon, key_word_list, url_list, bj_num_list)
    except Exception as e:
        logger.error(str(e))
    finally:
        if conn:
            conn.close()
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import logging.config

from config import logger_path
from common.pgutils import get_conn, execute_select, execute_sql

logging.config.fileConfig(logger_path)
logger = logging.getLogger("root")

conn = get_conn()


class DoubanQrsPipeline(object):
    def process_item(self, item, spider):
        return item


class crawlerDouban(object):
    def spider_Item(self, item, spider):
        try:
            with conn:
                # Check whether this user already has a record.
                sql_repeat = """
                    select * from public.db_movie where user_name=%s
                """
                logger.debug('pipelines')
                # Pass the parameter as a tuple, as the other callers of the
                # pgutils helpers do.
                res = execute_select(conn, sql_repeat, (item['user_name'],))
                if not res:
                    # No record for this user yet; the insertion logic is
                    # truncated in the original snippet.
                    pass
        except Exception as e:
            logger.error(str(e))
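# The header comment above points out that a pipeline only runs once it is
# registered in Scrapy's ITEM_PIPELINES setting. A minimal sketch of that
# registration in the project's settings.py; the module path
# 'douban_qrs.pipelines' is an assumed project name, not taken from the
# original code:
#
# ITEM_PIPELINES = {
#     'douban_qrs.pipelines.DoubanQrsPipeline': 300,
# }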