def __init__(
    self,
    refer=My.get('refer', ''),
    encoding='gb2312',
    parser='html5lib',
):
    """Initialise the ``Crawl`` base with this crawler's defaults.

    Args:
        refer: Passed positionally to ``Crawl.__init__``. The default is
            looked up in ``My`` once, when this method is defined.
        encoding: Forwarded as the ``encoding`` keyword.
        parser: Forwarded as the ``parser`` keyword.
    """
    base_options = {'encoding': encoding, 'parser': parser}
    Crawl.__init__(self, refer, **base_options)
def __init__(self, refer=M.get('refer', ''), parser='html5lib'):
    """Initialise the ``Crawl`` base and reset ``end_date``.

    Args:
        refer: Passed positionally to ``Crawl.__init__``. The default is
            looked up in ``M`` once, when this method is defined.
        parser: Forwarded as the ``parser`` keyword.
    """
    Crawl.__init__(self, refer, parser=parser)
    # Starts out as an empty string; this constructor never fills it in.
    self.end_date = ''
def __init__(self, refer=M.get('refer', ''), catalog_index=''):
    """Prepare request headers, an HTTP session and optional catalog info.

    Args:
        refer: Passed positionally to ``Crawl.__init__``. The default is
            looked up in ``M`` once, when this method is defined.
        catalog_index: Key into the module-level ``catalog`` mapping. When
            falsy, ``self.catalog_info`` is not created at all.
    """
    Crawl.__init__(self, refer)

    user_agent = (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) '
        'Gecko/20100101 Firefox/57.0'
    )
    self.get_headers = {
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': user_agent,
    }
    self.post_headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'User-Agent': user_agent,
        # Unlike the ``refer`` default, this lookup raises KeyError when
        # ``M`` has no 'refer' entry.
        'Referer': M['refer'],
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    self.sess = requests.session()

    if catalog_index:
        entry = catalog[catalog_index]
        self.catalog_info = {
            'index': catalog_index,
            'info': entry,
            'base_pth': os.path.join(base_pth, entry['name']),
            'url': M['practice'].format(catalog_index),
            'cache_file': 'da2017_{}.txt'.format(catalog_index),
        }

    self.da2017 = []
def __init__(self, base_dir='/tmp/mzt', parser='lxml'):
    """Initialize the local directory setting and empty crawl state.

    Args:
        base_dir: Stored on ``self.base_dir``.
        parser: Stored on ``self.parser`` after ``Crawl.__init__`` has run;
            it is not forwarded to the base initialiser.
    """
    Crawl.__init__(self, M['index'])
    self.base_dir = base_dir
    self.parser = parser
    # Nothing has been fetched yet, so every cached result starts as None.
    self.all_indexes = self.archives = self.archive = None
def __init__(self, meta):
    """Build the volume URL and base record from ``meta``, then call ``init``.

    Args:
        meta: Mapping read with ``.get`` for the keys ``vol_index``,
            ``vol_num``, ``title`` and ``cover``; missing keys become None.
    """
    Crawl.__init__(self)

    vol_index = meta.get('vol_index')
    self.url = URL_VOLUME.format(vol_index)
    self.vol_index = vol_index
    self.bs = None
    self.base = {
        field: meta.get(field)
        for field in ('vol_index', 'vol_num', 'title', 'cover')
    }

    # Must run last: every attribute above is in place before init() is called.
    self.init()
def __init__(
    self,
    refer=M.get('refer', ''),
    big_head=False,
    img_cache_dir='/tmp/weibo',
    img_height=6,
    use_cache=True,
):
    """Prepare header sets, an HTTP session and image-cache settings.

    Args:
        refer: Passed positionally to ``Crawl.__init__``. The default is
            looked up in ``M`` once, when this method is defined.
        big_head: Stored on ``self.big_head``; also selects the ``large``
            (truthy) or ``little`` (falsy) sub-directory of the image cache.
        img_cache_dir: Parent directory for ``self.img_cache_dir``.
        img_height: Stored on ``self.img_height``.
        use_cache: Stored on ``self.use_cache``.
    """
    Crawl.__init__(self, refer)

    # Values shared by several header sets below.
    user_agent = (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) '
        'Gecko/20100101 Firefox/57.0'
    )
    accept_language = 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2'
    accept_encoding = 'gzip, deflate, br'
    form_content_type = 'application/x-www-form-urlencoded'

    self.get_headers = {
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': user_agent,
    }
    self.post_headers = {
        'Host': 'login.sina.com.cn',
        'User-Agent': user_agent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': accept_language,
        'Accept-Encoding': accept_encoding,
        'Referer': 'https://weibo.com/',
        'Content-Type': form_content_type,
    }
    # 'Content-Type' used to be listed twice in this literal with the same
    # value. Only the first position counts for ordering, so a single entry
    # produces exactly the same dict.
    self.json_header = {
        'Content-Type': form_content_type,
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent': user_agent,
        'Host': 'weibo.com',
        'Accept': '*/*',
        'Accept-Language': accept_language,
        'Accept-Encoding': accept_encoding,
        'Referer': 'https://weibo.com',
    }

    self.sess = requests.session()
    self.domid = ''

    self.big_head = big_head
    if big_head:
        size_dir = 'large'
    else:
        size_dir = 'little'
    self.img_cache_dir = os.path.join(img_cache_dir, size_dir)
    self.img_height = img_height
    self.use_cache = use_cache

    self.cached_users_followed = []
    self.cached_users_followed_index = 0
    self.personal_info = {}
def __init__(self, base_dir='/tmp/d4', parser='html.parser', encoding="GBK"):
    """Initialize the local directory.

    Original note: "44.style: support html.parser".

    Args:
        base_dir: Stored on ``self.base_dir``.
        parser: Stored on ``self.parser`` after ``Crawl.__init__`` has run;
            it is not forwarded to the base initialiser.
        encoding: Forwarded to ``Crawl.__init__`` as the ``encoding`` keyword.
    """
    Crawl.__init__(self, encoding=encoding)
    self.base_dir, self.parser = base_dir, parser
    self.all_tags = None
    # Two separate, initially empty lookup tables.
    self.t2i, self.i2t = {}, {}
def __init__(self):
    """Initialise the ``Crawl`` base with ``M['refer']`` as the referer.

    Raises:
        KeyError: If ``M`` has no ``'refer'`` entry.
    """
    site_refer = M['refer']
    Crawl.__init__(self, refer=site_refer)
def __init__(
    self,
    refer=My.get('refer', ''),
    baidu_pwd_len=4,
    encoding='utf-8',
):
    """Initialise the ``Crawl`` base and remember ``baidu_pwd_len``.

    Args:
        refer: Passed positionally to ``Crawl.__init__``. The default is
            looked up in ``My`` once, when this method is defined.
        baidu_pwd_len: Stored unchanged on ``self.baidu_pwd_len``.
        encoding: Forwarded as the ``encoding`` keyword.
    """
    Crawl.__init__(self, refer, encoding=encoding)
    self.baidu_pwd_len = baidu_pwd_len
def __init__(self, refer=My.get('refer', ''), encoding='utf-8'):
    """Initialise the ``Crawl`` base and reset the ``soft*`` attributes.

    Args:
        refer: Passed positionally to ``Crawl.__init__``. The default is
            looked up in ``My`` once, when this method is defined.
        encoding: Forwarded as the ``encoding`` keyword.
    """
    Crawl.__init__(self, refer, encoding=encoding)
    self.soft = self.soft_flatten = None
    # Note: the count starts as an empty string, not as a number.
    self.soft_count = ''
def __init__(self, refer=M.get('refer', '')):
    """Prepare header sets, two HTTP sessions and cfg-driven cache settings.

    Args:
        refer: Forwarded to ``Crawl.__init__`` as the ``refer`` keyword. The
            default is looked up in ``M`` once, when this method is defined.
    """
    Crawl.__init__(self, refer=refer)

    # Values shared by several header sets below.
    user_agent = (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) '
        'Gecko/20100101 Firefox/57.0'
    )
    accept_html = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    accept_language = 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2'
    accept_encoding = 'gzip, deflate, br'
    form_content_type = 'application/x-www-form-urlencoded'
    xhr = 'XMLHttpRequest'

    self.get_headers = {
        'Accept': '*/*',
        'X-Requested-With': xhr,
        'User-Agent': user_agent,
    }
    self.post_headers = {
        'Host': 'login.sina.com.cn',
        'User-Agent': user_agent,
        'Accept': accept_html,
        'Accept-Language': accept_language,
        'Accept-Encoding': accept_encoding,
        'Referer': 'https://weibo.com/',
        'Content-Type': form_content_type,
    }
    self.json_headers = {
        'X-Requested-With': xhr,
        'User-Agent': user_agent,
        'Host': 'weibo.com',
        'Accept': '*/*',
        'Accept-Language': accept_language,
        'Accept-Encoding': accept_encoding,
        'Referer': 'https://weibo.com',
        'Content-Type': form_content_type,
    }
    self.mobile_json_headers = {
        'X-Requested-With': xhr,
        'Host': 'm.weibo.cn',
        'User-Agent': user_agent,
        'Accept': 'application/json, */*',
        'Accept-Language': accept_language,
        'Accept-Encoding': accept_encoding,
        'Referer': 'https://m.weibo.cn',
    }
    # This header set carries no 'X-Requested-With' and no 'Cookie' entry.
    self.pic_headers = {
        'Accept': accept_html,
        'Host': 'picupload.weibo.com',
        'User-Agent': user_agent,
        'Accept-Language': accept_language,
        'Accept-Encoding': accept_encoding,
        'Referer': 'https://weibo.com',
        'Upgrade-Insecure-Requests': '1',
        'Content-Type': form_content_type,
    }
    self.mobile_login_headers = {
        'Content-Type': form_content_type,
        'X-Requested-With': xhr,
        'Host': 'passport.weibo.cn',
        'User-Agent': user_agent,
        'Accept': '*/*',
        'Accept-Language': accept_language,
        'Accept-Encoding': accept_encoding,
        'Referer': 'https://m.weibo.cn',
    }

    # Separate sessions for the two sets of endpoints.
    self.sess = requests.session()
    self.mobile_sess = requests.session()
    self.my_info = {}
    # The domid returned by weibo stk (needed to build the URL when parsing
    # the accounts a user follows) is not stored by this constructor.

    # Cache and display settings come from cfg, with built-in fallbacks.
    self.img_cache_dir = cfg.get('weibo.img_cache_dir', '/tmp/weibo')
    self.use_cache = cfg.get('weibo.use_cache', True)
    self.big_head = cfg.get('weibo.big_head', False)
    self.img_height = cfg.get('weibo.img_height', 12)

    self.cached_users_followed = []
    self.cached_users_followed_index = 0
def __init__(self, refer=M.get('refer', '')):
    """Initialise the ``Crawl`` base with the given referer.

    Args:
        refer: Passed positionally to ``Crawl.__init__``. The default is
            looked up in ``M`` once, when this method is defined.
    """
    Crawl.__init__(self, refer)
def __init__(self, base_dir='/tmp/one'):
    """Initialise the ``Crawl`` base, an HTTP session and local state.

    Args:
        base_dir: Stored on ``self.base_dir``.

    Raises:
        KeyError: If ``M`` has no ``'index'`` entry.
    """
    index_url = M['index']
    Crawl.__init__(self, refer=index_url)
    self.sess = requests.session()
    self.base_dir = base_dir
    self.current_id = 0
def __init__(self, base_dir):
    """Initialise the ``Crawl`` base with ``MJ['index']`` and keep ``base_dir``.

    Args:
        base_dir: Stored on ``self.base_dir``.

    Raises:
        KeyError: If ``MJ`` has no ``'index'`` entry.
    """
    index_url = MJ['index']
    Crawl.__init__(self, index_url)
    self.base_dir = base_dir
def __init__(self):
    """Initialise the ``Crawl`` base with its defaults and reset local state."""
    Crawl.__init__(self)
    # Nothing has been loaded yet.
    self.bs = self.category = self.hot = None
    self.page_count = 0