def spider(item): nonlocal total_count nonlocal post_poor nonlocal error_count error = True try: total_count += 1 if error: for themelinkfun in themes: if not error: break error = themelinkfun.get_last_post(item, post_poor) if error: print("-----------获取主页信息失败,采取sitemap策略----------") error, post_poor = sitmap_get(item, post_poor) except Exception as e: print('\n') print(item, "运用主页及sitemap爬虫爬取失败!请检查") print('\n') print(e) error_count += 1 if error: error = 'true' else: error = 'false' item.append(error) return item
def spider(item): nonlocal total_count nonlocal post_poor nonlocal error_count error = 'false' try: total_count += 1 error = butterfly.get_last_post_from_butterfly(item, post_poor) if error == 'true': error = matery.get_last_post_from_matery(item, post_poor) if error == 'true': error = volantis.get_last_post_from_volantis(item, post_poor) if error == 'true': print("-----------获取主页信息失败,采取sitemap策略----------") error, post_poor = sitmap_get(item, post_poor) except Exception as e: print('\n') print(item, "运用主页及sitemap爬虫爬取失败!请检查") print('\n') print(e) error_count += 1 item.append(error) return item
def main():
    """Collect friend links, crawl their latest posts and upload both to LeanCloud.

    With ``configs.DEBUG`` set, the LeanCloud credentials and the friend-page
    URL come from ``configs``; otherwise they are read from ``sys.argv[1]``
    (app id), ``sys.argv[2]`` (app key) and ``sys.argv[3]`` (friend-page URL).

    Friend links are gathered from the optional gitee / github sources and
    from the three supported theme parsers, then de-duplicated and filtered
    through the block list. Each friend is crawled with the theme parsers in
    order, falling back to the sitemap; the string flag ``'true'`` (failed) or
    ``'false'`` is appended to the friend record. Finally the friend records
    and the posts (sorted newest first by their ``'time'`` key) are pushed.
    """

    def banner(message):
        # Every stage announcement is framed by the same two rule lines.
        print('----------------------')
        print(message)
        print('----------------------')

    # Initialise LeanCloud credentials.
    if configs.DEBUG:
        leancloud.init(configs.LC_APPID, configs.LC_APPKEY)
        friendpage_link = configs.FRIENPAGE_LINK
    else:
        leancloud.init(sys.argv[1], sys.argv[2])
        friendpage_link = sys.argv[3]

    # Use the already-parsed YAML configuration.
    config = configs.yml

    banner('-----------!!开始执行爬取文章任务!!----------')
    print('\n')

    # Date / time-limit settings were moved out to handlers.coreDatas.py.
    friend_poor = []
    post_poor = []

    banner('-----------!!开始执行友链获取任务!!----------')

    # Optional friend-link sources. Each one is best-effort: a failure is
    # reported and the run continues. ``except Exception`` (not a bare
    # ``except``) lets Ctrl-C and SystemExit still stop the program.
    gitee_cfg = config['setting']['gitee_friends_links']
    if gitee_cfg['enable'] and gitee_cfg['type'] == 'normal':
        try:
            kang_api(friend_poor)
        except Exception:
            print('读取gitee友链失败')
    else:
        print('未开启gitee友链获取')

    github_cfg = config['setting']['github_friends_links']
    if github_cfg['enable'] and github_cfg['type'] == 'normal':
        try:
            github_issuse(friend_poor)
        except Exception:
            print('读取github友链失败')
    else:
        print('未开启gihub友链获取')

    # The friend page's theme is not known up front, so every parser is
    # tried; one that raises is reported as "not this theme".
    try:
        butterfly.butterfly_get_friendlink(friendpage_link, friend_poor)
    except Exception:
        print('不是butterfly主题')
    try:
        matery.matery_get_friendlink(friendpage_link, friend_poor)
    except Exception:
        print('不是matery主题')
    try:
        volantis.volantis_get_friendlink(friendpage_link, friend_poor)
    except Exception:
        print('不是volantis主题或未配置gitee友链')

    friend_poor = delete_same_link(friend_poor)
    friend_poor = block_link(friend_poor)

    print('当前友链数量', len(friend_poor))
    banner('-----------!!结束友链获取任务!!----------')

    total_count = 0
    error_count = 0
    for item in friend_poor:
        error = 'false'
        try:
            total_count += 1
            # Each parser returns the string 'true' on failure; move on to
            # the next one only while the previous one failed.
            error = butterfly.get_last_post_from_butterfly(item, post_poor)
            if error == 'true':
                error = matery.get_last_post_from_matery(item, post_poor)
            if error == 'true':
                error = volantis.get_last_post_from_volantis(item, post_poor)
            if error == 'true':
                print("-----------获取主页信息失败,采取sitemap策略----------")
                error, post_poor = sitmap_get(item, post_poor)
        except Exception as e:
            print('\n')
            print(item, "运用主页及sitemap爬虫爬取失败!请检查")
            print('\n')
            print(e)
            error_count += 1
            # A crashed crawl must be flagged as failed; before this the flag
            # stayed 'false' when the first parser raised.
            error = 'true'
        item.append(error)

    print('\n')
    print('----------------------')
    print("一共进行%s次" % total_count)
    print("一共失败%s次" % error_count)
    print('----------------------')
    print('\n')

    banner('-----------!!执行用户信息上传!!----------')
    leancloud_push_userinfo(friend_poor)
    banner('-----------!!用户信息上传完毕!!----------')

    # Newest posts first.
    post_poor.sort(key=itemgetter('time'), reverse=True)

    banner('-----------!!执行文章信息上传!!----------')
    leancloud_push(post_poor)
    banner('-----------!!文章信息上传完毕!!----------')