Ejemplo n.º 1
0
 def __init__(self, refer=My.get('refer', ''), encoding='gb2312',
              parser='html5lib'):
     """Initialise the instance by delegating to ``Crawl.__init__``.

     :param refer: passed to ``Crawl.__init__`` as its first positional
         argument. The default, ``My.get('refer', '')``, is evaluated once
         when the method is defined.
     :param encoding: passed through as the ``encoding`` keyword.
     :param parser: passed through as the ``parser`` keyword.
     """
     base_kwargs = {'encoding': encoding, 'parser': parser}
     Crawl.__init__(self, refer, **base_kwargs)
Ejemplo n.º 2
0
 def __init__(self, refer=M.get('refer', ''), parser='html5lib'):
     """Initialise the base ``Crawl`` state and clear ``end_date``.

     :param refer: passed to ``Crawl.__init__`` as its first positional
         argument. The default, ``M.get('refer', '')``, is evaluated once
         when the method is defined.
     :param parser: passed through as the ``parser`` keyword.
     """
     base_kwargs = {'parser': parser}
     Crawl.__init__(self, refer, **base_kwargs)
     # Starts out as an empty string.
     self.end_date = ''
Ejemplo n.º 3
0
    def __init__(self, refer=M.get('refer', ''), catalog_index=''):
        """Initialise request headers, the HTTP session and catalog details.

        :param refer: passed to ``Crawl.__init__`` as its first positional
            argument. The default, ``M.get('refer', '')``, is evaluated once
            when the method is defined.
        :param catalog_index: key into the ``catalog`` mapping defined outside
            this method. When it is falsy no catalog is selected and
            ``self.catalog_info`` is left as an empty dict.
        """
        Crawl.__init__(self, refer)

        # Both header sets send the same browser identification string.
        user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0'
        self.get_headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': user_agent,
        }
        self.post_headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'User-Agent': user_agent,
            'Referer': M['refer'],
            'Content-Type': 'application/x-www-form-urlencoded',
        }
        self.sess = requests.session()

        # Always define the attribute: previously it only existed when a
        # catalog index was supplied, so reading it on a default-constructed
        # instance raised AttributeError.
        self.catalog_info = {}
        if catalog_index:
            entry = catalog[catalog_index]
            self.catalog_info = {
                'index': catalog_index,
                'info': entry,
                'base_pth': os.path.join(base_pth, entry['name']),
                'url': M['practice'].format(catalog_index),
                'cache_file': 'da2017_{}.txt'.format(catalog_index),
            }
        self.da2017 = []
Ejemplo n.º 4
0
 def __init__(self, base_dir='/tmp/mzt', parser='lxml'):
     """Initialise the base ``Crawl`` state and record the local directory.

     ``Crawl.__init__`` is called with ``M['index']`` as its first
     positional argument.

     :param base_dir: stored on ``self.base_dir``.
     :param parser: stored on ``self.parser``.
     """
     Crawl.__init__(self, M['index'])
     # These three placeholders all start out as None.
     self.all_indexes = self.archives = self.archive = None
     self.base_dir, self.parser = base_dir, parser
Ejemplo n.º 5
0
    def __init__(self, meta):
        """Build the volume URL and base record from ``meta``, then run ``init``.

        :param meta: object with a ``get`` method; the keys ``vol_index``,
            ``vol_num``, ``title`` and ``cover`` are read from it.
        """
        Crawl.__init__(self)

        vol_index = meta.get('vol_index')
        self.url = URL_VOLUME.format(vol_index)
        self.vol_index = vol_index
        self.bs = None
        # Copy the selected fields; a missing key yields None via ``get``.
        self.base = {
            field: meta.get(field)
            for field in ('vol_index', 'vol_num', 'title', 'cover')
        }

        self.init()
Ejemplo n.º 6
0
    def __init__(
        self,
        refer=M.get('refer', ''),
        big_head=False,
        img_cache_dir='/tmp/weibo',
        img_height=6,
        use_cache=True,
    ):
        """Initialise request headers, the HTTP session and image-cache options.

        :param refer: passed to ``Crawl.__init__`` as its first positional
            argument. The default, ``M.get('refer', '')``, is evaluated once
            when the method is defined.
        :param big_head: stored on ``self.big_head``; also selects the
            ``'large'`` (truthy) or ``'little'`` (falsy) sub-directory of
            ``img_cache_dir``.
        :param img_cache_dir: parent directory joined with the sub-directory
            above to form ``self.img_cache_dir``.
        :param img_height: stored on ``self.img_height``.
        :param use_cache: stored on ``self.use_cache``.
        """
        Crawl.__init__(self, refer)

        # Literals shared by several header sets.
        user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0'
        accept_language = 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2'

        self.get_headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': user_agent,
        }
        self.post_headers = {
            'Host': 'login.sina.com.cn',
            'User-Agent': user_agent,
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': accept_language,
            'Accept-Encoding': 'gzip, deflate, br',
            'Referer': 'https://weibo.com/',
            'Content-Type': 'application/x-www-form-urlencoded',
        }
        # 'Content-Type' used to be listed twice in this literal with the same
        # value; one entry produces the identical dict (same key order).
        self.json_header = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': user_agent,
            'Host': 'weibo.com',
            'Accept': '*/*',
            'Accept-Language': accept_language,
            'Accept-Encoding': 'gzip, deflate, br',
            'Referer': 'https://weibo.com',
        }
        self.sess = requests.session()
        self.domid = ''
        self.big_head = big_head
        self.img_cache_dir = os.path.join(img_cache_dir,
                                          'large' if big_head else 'little')
        self.img_height = img_height
        self.use_cache = use_cache
        self.cached_users_followed = []
        self.cached_users_followed_index = 0

        self.personal_info = {}
Ejemplo n.º 7
0
 def __init__(self, base_dir='/tmp/d4', parser='html.parser', encoding="GBK"):
     """Initialise the base ``Crawl`` state and record the local directory.

     :param base_dir: stored on ``self.base_dir``.
     :param parser: stored on ``self.parser``; defaults to ``'html.parser'``.
     :param encoding: passed to ``Crawl.__init__`` as the ``encoding``
         keyword.
     """
     Crawl.__init__(self, encoding=encoding)
     self.base_dir, self.parser = base_dir, parser
     self.all_tags = None
     # Two separate, initially empty lookup dicts.
     self.t2i, self.i2t = {}, {}
Ejemplo n.º 8
0
 def __init__(self):
     """Initialise ``Crawl`` with ``M['refer']`` as the ``refer`` keyword."""
     refer_value = M['refer']
     Crawl.__init__(self, refer=refer_value)
Ejemplo n.º 9
0
 def __init__(
         self,
         refer=My.get('refer', ''),
         baidu_pwd_len=4,
         encoding='utf-8',
 ):
     """Initialise the base ``Crawl`` state and record ``baidu_pwd_len``.

     :param refer: passed to ``Crawl.__init__`` as its first positional
         argument. The default, ``My.get('refer', '')``, is evaluated once
         when the method is defined.
     :param baidu_pwd_len: stored on ``self.baidu_pwd_len``.
     :param encoding: passed through as the ``encoding`` keyword.
     """
     base_kwargs = {'encoding': encoding}
     Crawl.__init__(self, refer, **base_kwargs)
     self.baidu_pwd_len = baidu_pwd_len
Ejemplo n.º 10
0
 def __init__(
         self,
         refer=My.get('refer', ''),
         encoding='utf-8',
 ):
     """Initialise the base ``Crawl`` state and reset the ``soft*`` attributes.

     :param refer: passed to ``Crawl.__init__`` as its first positional
         argument. The default, ``My.get('refer', '')``, is evaluated once
         when the method is defined.
     :param encoding: passed through as the ``encoding`` keyword.
     """
     Crawl.__init__(self, refer, encoding=encoding)
     self.soft = self.soft_flatten = None
     # Note: the count starts as an empty string, not a number.
     self.soft_count = ''
Ejemplo n.º 11
0
    def __init__(self, refer=M.get('refer', '')):
        """Initialise header sets, two HTTP sessions and cfg-driven options.

        :param refer: passed to ``Crawl.__init__`` as the ``refer`` keyword.
            The default, ``M.get('refer', '')``, is evaluated once when the
            method is defined.

        The image-cache options are read from ``cfg`` under the ``weibo.*``
        keys, each with the fallback shown in the code.
        """
        Crawl.__init__(self, refer=refer)

        # Literals shared by several of the header sets below.
        user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0'
        accept_html = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        accept_language = 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2'
        accept_encoding = 'gzip, deflate, br'
        form_content_type = 'application/x-www-form-urlencoded'
        xhr = 'XMLHttpRequest'

        self.get_headers = {
            'Accept': '*/*',
            'X-Requested-With': xhr,
            'User-Agent': user_agent,
        }
        self.post_headers = {
            'Host': 'login.sina.com.cn',
            'User-Agent': user_agent,
            'Accept': accept_html,
            'Accept-Language': accept_language,
            'Accept-Encoding': accept_encoding,
            'Referer': 'https://weibo.com/',
            'Content-Type': form_content_type,
        }
        self.json_headers = {
            'X-Requested-With': xhr,
            'User-Agent': user_agent,
            'Host': 'weibo.com',
            'Accept': '*/*',
            'Accept-Language': accept_language,
            'Accept-Encoding': accept_encoding,
            'Referer': 'https://weibo.com',
            'Content-Type': form_content_type,
        }
        self.mobile_json_headers = {
            'X-Requested-With': xhr,
            'Host': 'm.weibo.cn',
            'User-Agent': user_agent,
            'Accept': 'application/json, */*',
            'Accept-Language': accept_language,
            'Accept-Encoding': accept_encoding,
            'Referer': 'https://m.weibo.cn',
        }
        # No 'X-Requested-With' or 'Cookie' entry is sent with this set.
        self.pic_headers = {
            'Accept': accept_html,
            'Host': 'picupload.weibo.com',
            'User-Agent': user_agent,
            'Accept-Language': accept_language,
            'Accept-Encoding': accept_encoding,
            'Referer': 'https://weibo.com',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': form_content_type,
        }
        self.mobile_login_headers = {
            'Content-Type': form_content_type,
            'X-Requested-With': xhr,
            'Host': 'passport.weibo.cn',
            'User-Agent': user_agent,
            'Accept': '*/*',
            'Accept-Language': accept_language,
            'Accept-Encoding': accept_encoding,
            'Referer': 'https://m.weibo.cn',
        }

        # Separate sessions are created for the two sets of requests.
        self.sess = requests.session()
        self.mobile_sess = requests.session()
        self.my_info = {}

        # NOTE: a ``stk_dom_id`` attribute (the domid returned by weibo stk,
        # needed when building URLs while parsing the followed accounts) is
        # currently disabled and therefore not set here.
        self.img_cache_dir = cfg.get('weibo.img_cache_dir', '/tmp/weibo')
        self.use_cache = cfg.get('weibo.use_cache', True)
        self.big_head = cfg.get('weibo.big_head', False)
        self.img_height = cfg.get('weibo.img_height', 12)

        self.cached_users_followed = []
        self.cached_users_followed_index = 0
Ejemplo n.º 12
0
 def __init__(self, refer=M.get('refer', '')):
     """Initialise the instance by delegating to ``Crawl.__init__``.

     :param refer: passed to ``Crawl.__init__`` as its first positional
         argument. The default, ``M.get('refer', '')``, is evaluated once
         when the method is defined.
     """
     base_args = (refer,)
     Crawl.__init__(self, *base_args)
Ejemplo n.º 13
0
 def __init__(self, base_dir='/tmp/one'):
     """Initialise the base ``Crawl`` state, local directory and HTTP session.

     ``Crawl.__init__`` receives ``M['index']`` as its ``refer`` keyword.

     :param base_dir: stored on ``self.base_dir``.
     """
     index_refer = M['index']
     Crawl.__init__(self, refer=index_refer)
     self.base_dir, self.current_id = base_dir, 0
     self.sess = requests.session()
Ejemplo n.º 14
0
 def __init__(self, base_dir):
     """Initialise the base ``Crawl`` state and record the local directory.

     ``Crawl.__init__`` receives ``MJ['index']`` as its first positional
     argument.

     :param base_dir: stored on ``self.base_dir``.
     """
     index_refer = MJ['index']
     Crawl.__init__(self, index_refer)
     self.base_dir = base_dir
Ejemplo n.º 15
0
 def __init__(self):
     """Initialise the base ``Crawl`` state and reset the page attributes."""
     Crawl.__init__(self)
     # The three placeholders start as None; the page counter starts at 0.
     self.bs = self.category = self.hot = None
     self.page_count = 0