import copy from settings import REDIS_CONFIG_LOCAL, MYSQL_CONFIG_LOCAL from store import AmazonRedis, AmazonStorePro from config import Config rds = AmazonRedis(2, **copy.deepcopy(REDIS_CONFIG_LOCAL)) asin_set = set() with open('asin', encoding='utf-8') as f: for line in f: line = line.strip() if line: asin_set.add(line) print(asin_set) # 1、选择站点 suffix = 'com' # com\co.uk\co.jp\fr\it\es\de\in\ca # 2、选择采集类型 entry = 7 # 1详情、7评论 task_lst = [] for asin in asin_set: mp = {'entry': entry} if entry == Config.DETAIL: mp['page_url'] = "https://www.amazon.{}/dp/{}".format(suffix, asin) task_lst.append(mp) elif entry == Config.COMMENT: # 3、1表示采全部星级,0表示采特定星级 all_star = 1 if all_star:
data['scgs_category_url'], data['scgs_tags'], data['scgs_shop_name'], data['scgs_shop_url'], data['scgs_generation_time'], data['scgs_platform'], data['scgs_platform_url'], data['scgs_crawl_time'], data['scgs_create_time'], data['scgs_status'], data['scgs_questions'], data['scgs_is_delete'], data['scgs_reserve_field_1'], data['scgs_reserve_field_2'], data['scgs_reserve_field_3'], data['scgs_reserve_field_4'], data['scgs_reserve_field_5'], data['scgs_reserve_field_6'], data['scgs_reserve_field_7']) except Exception as exp: traceback.print_exc() item_json['error'] = '{!r}'.format(exp) rds.rds.lpush(Config.REDIS_DATA_ERROR, json.dumps(item_json)) print('finished insert') store.close() else: print('no item') time.sleep(30) if __name__ == '__main__': rds = AmazonRedis() while True: data_insert(rds)
import queue from config import Config from crawl_func import clear_rds, start_crawl, start_thread from store import AmazonRedis from scan_task import scan_database class ConfigSub(Config): REDIS_START_URLS_NAME = 'list' REDIS_SUB_DIR_NAME = 'lc' if __name__ == '__main__': conf = ConfigSub() que = queue.Queue() rds = AmazonRedis() clear_rds(rds, conf) sign = scan_database(rds, conf) if sign: rds.delete_key('amazon:di:cy:dc01:markdate') new_loop = asyncio.new_event_loop() start_thread(new_loop) try: while True: start_crawl(rds, que, conf, new_loop) # 队列都为空,采集完成
"""Run a crawl against the local Redis instance until every work queue is drained."""
import asyncio
import copy
import queue

from config import Config
from crawl_func import clear_rds, start_crawl, start_thread
from store import AmazonRedis
from settings import REDIS_CONFIG_LOCAL

if __name__ == '__main__':
    que = queue.Queue()
    # Deep-copy the shared settings dict so the client cannot mutate it.
    rds = AmazonRedis(Config.REDIS_NUM, **copy.deepcopy(REDIS_CONFIG_LOCAL))
    clear_rds(rds, Config)

    new_loop = asyncio.new_event_loop()
    # NOTE(review): start_thread lives in crawl_func; presumably it runs
    # new_loop on a worker thread -- confirm.
    start_thread(new_loop)

    # The crawl is considered finished once none of these keys exist.
    pending_keys = (
        Config.REDIS_START_URLS,
        Config.REDIS_REQUEST_URLS,
        Config.REDIS_CRAWL_URLS,
    )
    try:
        while True:
            start_crawl(rds, que, Config, new_loop)
            # All queues are empty: the crawl is finished.
            if not any(rds.exists_key(key) for key in pending_keys):
                break
    except KeyboardInterrupt:
        print('KeyboardInterrupt')
    finally:
        # Event loops are not thread-safe: a stop requested from outside the
        # loop's own thread must be scheduled with call_soon_threadsafe.
        # Doing it in `finally` also stops the loop after a normal finish,
        # not only on Ctrl-C.
        new_loop.call_soon_threadsafe(new_loop.stop)
"""Run a crawl against the local Redis instance until every work queue is drained."""
import asyncio
import copy
import queue

from config import Config
from crawl_func import clear_rds, start_crawl, start_thread
from store import AmazonRedis
from settings import REDIS_CONFIG_LOCAL

if __name__ == '__main__':
    que = queue.Queue()
    # Deep-copy the shared settings dict so the client cannot mutate it.
    rds = AmazonRedis(Config.REDIS_NUM, **copy.deepcopy(REDIS_CONFIG_LOCAL))
    clear_rds(rds, Config)

    new_loop = asyncio.new_event_loop()
    # NOTE(review): start_thread lives in crawl_func; presumably it runs
    # new_loop on a worker thread -- confirm.
    start_thread(new_loop)

    # The crawl is considered finished once none of these keys exist.
    pending_keys = (
        Config.REDIS_START_URLS,
        Config.REDIS_REQUEST_URLS,
        Config.REDIS_CRAWL_URLS,
    )
    try:
        while True:
            start_crawl(rds, que, Config, new_loop)
            # All queues are empty: the crawl is finished.
            if not any(rds.exists_key(key) for key in pending_keys):
                break
    except KeyboardInterrupt:
        print('KeyboardInterrupt')
    finally:
        # Event loops are not thread-safe: a stop requested from outside the
        # loop's own thread must be scheduled with call_soon_threadsafe.
        # Doing it in `finally` also stops the loop after a normal finish,
        # not only on Ctrl-C.
        new_loop.call_soon_threadsafe(new_loop.stop)
push_data_into_redis(rds, Config, data_mp) else: print('no exist asin') print('push repeat done') store.close() else: print('no repeat asin') def push_data_into_redis(rds, conf, data_mp): data_json = json.dumps(data_mp) rds.rds.lpush(conf.REDIS_DATA_LIST, data_json) if __name__ == '__main__': rds = AmazonRedis() detail_today = rds.get_hash_field('amazon:di:cy:dc01:markdate', 'today') if detail_today: detail_today = detail_today.split()[0] today = time.strftime("%Y-%m-%d") if detail_today == today: print('start handling repeat asin') select_asin(rds) else: print('wait for detail finish')
"""Send the daily "crawl finished" e-mail once, after the detail crawl is done."""
import time
import sys

from store import AmazonRedis
from send_email import SendEmail

if __name__ == '__main__':
    rds = AmazonRedis()
    mail_today = rds.get_hash_field('amazon:di:cy:mail', 'today')
    detail_today = rds.get_hash_field('amazon:di:cy:dc01:markdate', 'today')
    # Keep only the first whitespace-separated token of the detail marker.
    detail_date = detail_today.split()[0] if detail_today else detail_today
    today = time.strftime("%Y-%m-%d")

    # The mail marker already carries today's date: nothing left to do.
    if mail_today == today:
        print('DI finish')
        sys.exit()

    # Finished means: detail marker is dated today and the repeat-asin key
    # is absent. The key is only checked when the date matches.
    crawl_finished = (
        detail_date == today
        and not rds.exists_key('amazon:di:cy:repeatasin')
    )
    if not crawl_finished:
        print('DI no finish')
    else:
        mailer = SendEmail()
        body = 'ok'
        mailer.send_message('DI', '*****@*****.**', '今日采集完成', body)
        # Record today's date so later runs take the 'DI finish' path.
        sent_on = time.strftime("%Y-%m-%d")
        rds.set_hash('amazon:di:cy:mail', {'today': sent_on})
rds.rds.lpush(Config.REDIS_REPEAT_ASIN, repeat_mp) print('repeat asin') rds.add_set(Config.REDIS_CATE_ASIN, cate_asin) continue rds.add_set(Config.REDIS_CATE_ASIN, cate_asin) print(row['scgs_id']) page_url = 'https://www.amazon.{}/dp/{}'.format(suffix, asin) mp = {'entry': task_type, 'page_url': page_url, 'category_info': task_category, 'category_entry': category_entry, 'category_url': category_url, 'rank': 101, 'create_time': create_time} rds.rds.rpush('amazon:di:cy:detail', mp) store.close() if __name__ == '__main__': rds = AmazonRedis() today = time.strftime("%Y-%m-%d") asin_today = rds.get_hash_field('amazon:di:cy:asin:markdate', 'today') if asin_today: asin_today = asin_today.split()[0] if asin_today == today: print('toady finish') else: list_today = rds.get_hash_field('amazon:di:cy:lc:markdate', 'today') if list_today: list_today = list_today.split()[0] if list_today == today: rds.set_hash('amazon:di:cy:asin:markdate', {'today': time.strftime("%Y-%m-%d %H:%M:%S")}) print('scan_database') select_asin(rds) rds.delete_key(Config.REDIS_CATE_ASIN)