def __init__(self):
    """Load request settings from the global config: cookie, user-agent,
    cookie-pool switch and the per-request stop times.

    Exits the process (sys.exit) on any invalid / missing setting, since
    the spider cannot run without them.
    """
    requests_times = global_config.getRaw('config', 'requests_times')
    self.cookie = global_config.getRaw('config', 'Cookie')
    self.ua = global_config.getRaw('config', 'user-agent')
    # TODO: creating the fake-UA engine here noticeably slows startup, while a
    # one-off lazy creation is cheap — the fake request header is shelved for now.
    # TODO: as a stopgap, require a non-empty user-agent here instead.
    # self.ua_engine = Factory.create()
    if self.ua is None:
        logger.error('user agent 暂时不支持为空')
        sys.exit()
    # The config value is a string; normalize it to a real bool.
    self.cookie_pool = global_config.getRaw('config', 'use_cookie_pool') == 'True'
    if self.cookie_pool:
        logger.info('使用cookie池')
        # The cookie pool is file-backed; fail fast if the file is missing.
        if not os.path.exists('cookies.txt'):
            logger.error('cookies.txt文件不存在')
            sys.exit()
    try:
        self.stop_times = self.parse_stop_time(requests_times)
    except Exception:  # malformed requests_times (e.g. full-width punctuation)
        logger.error('配置文件requests_times解析错误,检查输入(必须英文标点)')
        sys.exit()
    self.global_time = 0
def __init__(self):
    """Load request settings from the global config: cookie, user-agent,
    cookie-pool switch, proxy switch and the per-request stop times.

    Exits the process (sys.exit) on any invalid / missing setting, since
    the spider cannot run without them.
    """
    requests_times = global_config.getRaw('config', 'requests_times')
    self.cookie = global_config.getRaw('config', 'Cookie')
    self.ua = global_config.getRaw('config', 'user-agent')
    self.ua_engine = Factory.create()
    if self.ua is None:
        logger.error('user agent 暂时不支持为空')
        sys.exit()
    # Config values are strings; normalize them to real bools.
    self.cookie_pool = global_config.getRaw('config', 'use_cookie_pool') == 'True'
    if self.cookie_pool:
        logger.info('使用cookie池')
        # The cookie pool is file-backed; fail fast if the file is missing.
        if not os.path.exists('cookies.txt'):
            logger.error('cookies.txt文件不存在')
            sys.exit()
    self.ip_proxy = global_config.getRaw('proxy', 'use_proxy') == 'True'
    if self.ip_proxy:
        # Lazily filled by get_proxy(); holds [ip, port] pairs.
        self.proxy_pool = []
    try:
        self.stop_times = self.parse_stop_time(requests_times)
    except Exception:  # malformed requests_times (e.g. full-width punctuation)
        logger.error('配置文件requests_times解析错误,检查输入(必须英文标点)')
        sys.exit()
    self.global_time = 0
def get_proxy(self):
    """Return a proxies mapping for the next request.

    In HTTP-extract mode, refills ``self.proxy_pool`` from the configured
    link when it runs dry (each proxy is inserted ``repeat_nub`` times so it
    is reused that many times), then consumes the pool front-to-back.
    Key-extract mode is not implemented yet and returns None.
    """
    try:
        repeat_nub = int(global_config.getRaw('proxy', 'repeat_nub'))
    except (TypeError, ValueError):  # missing or non-integer config value
        logger.warning('repeat_nub 格式不正确,应为正整数')
        sys.exit()
    # HTTP extraction mode
    if global_config.getRaw('proxy', 'http_extract') == '1':
        # Pool exhausted — fetch a fresh batch from the extraction link.
        if not self.proxy_pool:
            proxy_url = global_config.getRaw('proxy', 'http_link')
            # NOTE(review): no timeout / status check here — a dead extraction
            # endpoint will hang or raise; consider requests.get(url, timeout=...).
            r = requests.get(proxy_url)
            r_json = r.json()
            for proxy in r_json:
                # Insert repeatedly so each proxy is used repeat_nub times.
                for _ in range(repeat_nub):
                    self.proxy_pool.append([proxy['ip'], proxy['port']])
        # Consume the front of the pool (pop(0) == use-then-remove first entry).
        ip, port = self.proxy_pool.pop(0)
        return self.http_proxy_utils(ip, port)
    # Key/secret extraction mode — not implemented yet.
    elif global_config.getRaw('proxy', 'key_extract') == '1':
        pass
def __init__(self):
    """Read the search-related settings from the global config and keep a
    reference to the shared requests utility."""
    read = global_config.getRaw
    self.location_id = read('detail', 'location_id')
    self.channel_id = read('detail', 'channel_id')
    self.custom_search_url = read('detail', 'search_url')
    self.need_detail = read('detail', 'need_detail')
    self.need_comment = read('detail', 'need_comment')
    self.requests_util = requests_util
    # Set to True elsewhere to skip waiting between requests.
    self.jump_wait = False
def __init__(self):
    """Load request settings (cookie, user-agent, stop times) from the
    global config and create the fake-UA engine.

    Exits the process (sys.exit) when requests_times cannot be parsed.
    """
    requests_times = global_config.getRaw('config', 'requests_times')
    self.cookie = global_config.getRaw('config', 'cookie')
    self.ua = global_config.getRaw('config', 'user-agent')
    self.ua_engine = Factory.create()
    try:
        self.stop_times = self.parse_stop_time(requests_times)
    except Exception:  # malformed requests_times (e.g. full-width punctuation)
        logger.error('配置文件requests_times解析错误,检查输入(必须英文标点)')
        sys.exit()
    self.global_time = 0
def get_header(self, cookie):
    """Build the request headers for the given cookie.

    Uses the configured user-agent when present, otherwise generates a
    random one from the fake-UA factory.
    """
    user_agent = global_config.getRaw('config', 'user-agent')
    if user_agent is None:
        user_agent = Factory.create().user_agent()
    return {'User-Agent': user_agent, 'Cookie': cookie}
type=int, required=False, default=0, help='spider as custom(just review)') parser.add_argument('--shop_id', type=str, required=False, default='', help='custom shop id') args = parser.parse_args() if __name__ == '__main__': # args.review = 1 # args.normal = 0 # args.shop_id = 'l8QDQukrl2tXhzmY' if args.normal == 1: keyword = global_config.getRaw('detail', 'keyword') need_first = True if global_config.getRaw( 'detail', 'need_first') is 'True' else False need_pages = int(global_config.getRaw('detail', 'need_pages')) s = Search() s.search(keyword, need_first, need_pages) if args.detail == 1: from function.detail import Detail shop_id = args.shop_id logger.info('爬取店铺id:' + shop_id + '详情') d = Detail() d.get_detail(shop_id) if args.review == 1: from function.review import Review
def update_cookie(self):
    """Re-read the cookie from the config file so subsequent requests use
    the latest value."""
    refreshed = global_config.getRaw('config', 'Cookie')
    self.cookie = refreshed
def __init__(self):
    """Load and validate every setting the spider needs, from both the
    global config and the requirements config.

    Boolean-ish config strings are normalized to real bools; integer
    settings are parsed with a fatal error (exit) on bad input.
    """
    # config — [config] section
    self.USE_COOKIE_POOL = global_config.getRaw('config', 'use_cookie_pool') == 'True'
    self.COOKIE = global_config.getRaw('config', 'Cookie')
    self.USER_AGENT = global_config.getRaw('config', 'user-agent')
    self.SAVE_MODE = global_config.getRaw('config', 'save_mode')
    self.MONGO_PATH = global_config.getRaw('config', 'mongo_path')
    self.REQUESTS_TIMES = global_config.getRaw('config', 'requests_times')
    self.UUID = global_config.getRaw('config', 'uuid')
    self.TCV = global_config.getRaw('config', 'tcv')
    # config — [detail] section
    self.KEYWORD = global_config.getRaw('detail', 'keyword')
    self.LOCATION_ID = global_config.getRaw('detail', 'location_id')
    self.CHANNEL_ID = global_config.getRaw('detail', 'channel_id')
    self.SEARCH_URL = global_config.getRaw('detail', 'search_url')
    # NOTE(review): assert is stripped under `python -O`; an explicit raise
    # would be safer, kept as-is so callers catching AssertionError still work.
    assert self.SEARCH_URL == '' or self.SEARCH_URL.endswith('p'), 'search_url 没有拼接p'
    self.NEED_FIRST = global_config.getRaw('detail', 'need_first') == 'True'
    try:
        self.NEED_SEARCH_PAGES = int(global_config.getRaw('detail', 'need_pages'))
    except (TypeError, ValueError):
        logger.error('need_pages 必须为整数')
        exit()
    # config — [proxy] section
    self.USE_PROXY = global_config.getRaw('proxy', 'use_proxy') == 'True'
    if self.USE_PROXY:
        try:
            self.REPEAT_NUMBER = int(global_config.getRaw('proxy', 'repeat_nub'))
        except (TypeError, ValueError):
            logger.error('repeat_nub 必须为整数')
            exit()
    else:
        self.REPEAT_NUMBER = 0
    self.HTTP_EXTRACT = global_config.getRaw('proxy', 'http_extract') == 'True'
    self.HTTP_LINK = global_config.getRaw('proxy', 'http_link')
    self.KEY_EXTRACT = global_config.getRaw('proxy', 'key_extract') == 'True'
    self.KEY_ID = global_config.getRaw('proxy', 'key_id')
    self.KEY_KEY = global_config.getRaw('proxy', 'key_key')
    # The two extraction modes are mutually exclusive.
    assert not (self.HTTP_EXTRACT is True and self.KEY_EXTRACT is True), '代理模式不可以全为True'
    # require — shop phone
    self.NEED_DETAIL = require_config.getRaw('shop_phone', 'need') == 'True'
    self.NEED_PHONE_DETAIL = require_config.getRaw('shop_phone', 'need_detail') == 'True'
    if self.NEED_PHONE_DETAIL:
        logger.warn('开启了电话详情模式,会降低速度并增加反爬概率')
    # require — shop review
    self.NEED_REVIEW = require_config.getRaw('shop_review', 'need') == 'True'
    self.NEED_REVIEW_DETAIL = require_config.getRaw('shop_review', 'more_detail') == 'True'
    if self.NEED_REVIEW_DETAIL:
        logger.warn('开启了评论详情模式,会降低速度并增加反爬概率')
    # BUG FIX: the original attached `else: self.NEED_REVIEW_PAGES = 0` to the
    # try/except, which runs on the SUCCESS path and overwrote the parsed
    # value with 0. Restructured to mirror the USE_PROXY/REPEAT_NUMBER
    # pattern above: only parse pages when reviews are wanted at all.
    if self.NEED_REVIEW:
        try:
            self.NEED_REVIEW_PAGES = int(require_config.getRaw('shop_review', 'need_pages'))
        except (TypeError, ValueError):
            logger.error('need_pages 必须为整数')
            exit()
    else:
        self.NEED_REVIEW_PAGES = 0
┃ ┃ 神兽保佑 ┃ ┃ 代码无BUG! ┃ ┗━━━━━━━━━┓ ┃CREATE BY SNIPER┣┓ ┃ ┏┛ ┗━┓ ┓ ┏━━━┳ ┓ ┏━┛ ┃ ┫ ┫ ┃ ┫ ┫ ┗━┻━┛ ┗━┻━┛ """ import requests from function.search import Search from utils.config import global_config from utils.get_font_map import get_review_map_file cookie = global_config.getRaw('config', 'cookie') ua = global_config.getRaw('config', 'user-agent') def get_header(): """ 获取请求头 :return: """ header = {'User-Agent': ua, 'Cookie': cookie} return header if __name__ == '__main__': # debug search Search().search('一方', only_need_first=False, needed_pages=10)
def __init__(self):
    """Keep the shared requests utility and read how many review pages to crawl."""
    self.requests_util = requests_util
    self.pages_needed = global_config.getRaw('save', 'review_pages')
def __init__(self):
    """Load cookie / user-agent / location settings and set up the fake-UA
    factory and the result saver."""
    read = global_config.getRaw
    self.cookie = read('config', 'cookie')
    self.ua = read('config', 'user-agent')
    self.location_id = read('config', 'location_id')
    self.ua_engine = Factory.create()
    self.saver = Saver()