Example 1
 def getAllPage(self):
     url = self.API_HOST + "?p=1"
     res = requests.get(url, headers=self.HEADER)
     res = res.text
     # The first listing page carries pagination markup like "<span>1/NNN</span>";
     # capture the total page count from it.
     pages = re.search(r'<span>1.*?(\d+)</span>', res)
     if pages is None:
         logger.error('Pagination markup not found; the page layout may have changed')
         return
     self.API_PAGEALL = pages.group(1)
     logger.info(f'Total image pages found: {self.API_PAGEALL}')
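Extracting the page count only pays off once something walks those pages. A minimal sketch of such a loop, assuming a hypothetical per-page parser getPage (not part of the original code) and the INTERVAL attribute from the spiderBiYing constructor in Example 7:

 from time import sleep

 # Paging sketch; getPage is hypothetical, not part of the original code.
 def getAllData(self):
     self.getAllPage()
     for page in range(1, int(self.API_PAGEALL) + 1):
         url = self.API_HOST + f"?p={page}"
         self.getPage(url)  # hypothetical: parse one listing page
         sleep(self.INTERVAL)  # be polite between requests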
Example 2
 def __init__(self):
     logger.info('spider360 module starting...')
     self.header = {
         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5603.400 QQBrowser/10.1.1775.400",
     }
     self.header1 = {
         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
         "Accept-Encoding": "gzip, deflate",
         "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
         "Cache-Control": "no-cache",
         "Connection": "keep-alive",
         # Session cookie captured from a browser; it will eventually expire.
         "Cookie": "__guid=92280206.530040451316176400.1579142607132.9788; Q=u%3D360H2548941901%26n%3D%26le%3D%26m%3DZGtmWGWOWGWOWGWOWGWOWGWOAQZ2%26qid%3D2548941901%26im%3D1_t01dbdd7e726a89e0ec%26src%3Dpcw_so_image_qq%26t%3D1; T=s%3Dcfec7fa03be89cefab2bd998153c13fc%26t%3D1579142992%26lm%3D%26lf%3D%26sk%3D2d7d2aa636fd90ec7e4053dc960da44b%26mt%3D1579142992%26rc%3D%26v%3D2.0%26a%3D1",
         "Host": "wallpaper.apc.360.cn",
         "Pragma": "no-cache",
         "Upgrade-Insecure-Requests": "1",
         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36",
     }
     self.API_HOST = 'http://wallpaper.apc.360.cn/index.php?from=360chrome&c=WallPaper'
     self.API_TYPE = '&a=getAllCategoriesV2'
     # '@parmas' tokens are positional placeholders filled per request
     # (see the sketch below).
     self.API_NEW = '&a=getAppsByOrder&order=create_time&start=@parmas&count=@parmas'
     self.API_ARG = '&a=getAppsByCategory&cid=@parmas&start=@parmas&count=@parmas'
     self.dao = DBS()
     self.start()
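The @parmas tokens in API_NEW and API_ARG are positional placeholders. A minimal standalone sketch of the ordered str.replace pattern that Examples 8 and 10 use to fill them, with made-up cid/start/count values:

 # Each replace with count=1 consumes one '@parmas' token, left to right.
 api = '&a=getAppsByCategory&cid=@parmas&start=@parmas&count=@parmas'
 url = api.replace('@parmas', '6', 1).replace('@parmas', '0', 1).replace('@parmas', '150')
 print(url)  # -> &a=getAppsByCategory&cid=6&start=0&count=150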
Example 3
 def set_up(self):
     logger.info(f'Spider threads: {self.list_lib}')
     for lib in self.list_lib:
         self.list_process.append(Thread(target=lib))
     for task in self.list_process:
         task.daemon = True  # setDaemon() is deprecated since Python 3.10
         task.start()
     # Wait for every spider thread to finish
     for task in self.list_process:
         task.join()
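The same fan-out-and-join pattern can also be written with concurrent.futures. A sketch of an equivalent alternative under that assumption, not the original implementation:

 from concurrent.futures import ThreadPoolExecutor

 # Alternative sketch: the with-block joins every worker before returning.
 def set_up(self):
     logger.info(f'Spider threads: {self.list_lib}')
     with ThreadPoolExecutor(max_workers=max(len(self.list_lib), 1)) as pool:
         for lib in self.list_lib:
             pool.submit(lib)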
Example 4
 def __init__(self):
     logger.info('Spider module starting...')
     # start timing
     self.START_TIME = time()
     # spider threads
     self.list_process = []
     # spider modules to run
     self.list_lib = self.filterSpider()
     # launch the spider threads
     self.set_up()
Example 5
 def parseLink(self):
     for d in self.DATE_LIS:
         logger.info(f'Fetching data for {d}')
         url = self.API_HOST + d
         res = requests.get(url, headers=self.HEADER)
         if res.status_code == 200:
             res.encoding = 'utf-8'
             self.parseJSON(res.text)
             sleep(self.INTERVAL)
         else:
             logger.error(f"Failed to fetch data for {d}")
Example 6
 def __init__(self):
     logger.info('spiderJinShan module starting...')
     self.HEADER = {
         "User-Agent":
         "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5603.400 QQBrowser/10.1.1775.400",
     }
     self.DAO = DBS()
     self.API_HOST = r'http://sentence.iciba.com/index.php?c=dailysentence&m=getdetail&title='
     self.DATE_LIS = []    # date strings to query (see the sketch below)
     self.INTERVAL = 0.5   # delay between requests (seconds)
     self.TOTAL = 0        # newly inserted records
     self.TOTAL_EXIST = 0  # records skipped as already present
     self.start()
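DATE_LIS starts empty, and the iciba daily-sentence endpoint above takes a date in its title parameter, so the list presumably gets filled with date strings. A minimal sketch, assuming a YYYY-MM-DD format and a hypothetical makeDateList helper; neither is confirmed by the source:

 from datetime import date, timedelta

 # Hypothetical helper; the date format is an assumption.
 def makeDateList(self, days=30):
     today = date.today()
     for i in range(days):
         self.DATE_LIS.append(str(today - timedelta(days=i)))  # 'YYYY-MM-DD'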
Example 7
 def __init__(self):
     logger.info('spiderBiYing module starting...')
     self.HEADER = {
         "User-Agent":
         "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5603.400 QQBrowser/10.1.1775.400",
     }
     self.DAO = DBS()
     self.API_HOST = 'https://bing.ioliu.cn/'
     self.API_PAGE = 1
     self.API_PAGEALL = None
     self.INTERVAL = 1  # delay between requests (seconds)
     self.TOTAL = 0
     self.TOTAL_EXIST = 0
     self.start()
Example 8
 def getAll(self):
     logger.info('Fetching all 360 wallpaper data')
     url = self.API_HOST + self.API_TYPE
     res = self.getText(url, self.header)
     data = json.loads(res).get('data', '')
     totalAll, fetchedAll = 0, 0
     if isinstance(data, list) and len(data) > 0:
         for cate in data:
             cid, category = cate['id'], cate['name']
             start, count, fetched, total = 0, 150, 0, 0
             logger.debug(f"Fetching category <{category}>")
             while True:
                 # Fill the placeholders in order: cid, start, count
                 url = self.API_HOST + self.API_ARG.replace(
                     '@parmas', str(cid), 1).replace(
                         "@parmas", str(start), 1).replace(
                             "@parmas", str(count))
                 res = json.loads(self.getText(url, self.header))
                 total = int(res.get("total", 0))
                 lis = res.get('data', '')
                 if not isinstance(lis, list) or len(lis) == 0:
                     break  # no more pages in this category
                 for item in lis:
                     print(f'fetching  ---> {category}: {fetched}',
                           end="\r")
                     # request the full-size image (__100) instead of the thumbnail
                     url = sub(r'__\d\d', "__100", item['url'])
                     tag = item['tag']
                     reso = item['resolution'].replace("*", "x")
                     result = self.dao.insertImage(url, tag, reso, category)
                     if result is None:
                         fetched += 1
                 start += count  # advance the paging offset to the next batch
                 if start >= total:
                     break
             totalAll += total
             fetchedAll += fetched
             print(f"fetched ---> category: {category}  new: {fetched}")
     logger.info(f'spider360--getAll finished, fetched {fetchedAll}/{totalAll} images')
Example 9
 def __init__(self, **args):
     logger.info("Database module starting...")
     # load config
     config_path = os.path.abspath(
         os.path.join(os.path.dirname(__file__),
                      "../../config/config.json"))
     with open(config_path) as file:
         self.config = json.load(file)
     # initialize the connection
     try:
         self.cnn = pymysql.connect(
             host=self.config.get("db_host", ''),
             user=self.config.get("db_user", ''),
             password=self.config.get("db_passwd", ''),
             db=self.config.get("db_database", ""),
             charset=self.config.get("db_charset", ''),
             cursorclass=pymysql.cursors.DictCursor)
     except Exception as err:
         logger.error(f'Failed to initialize the database connection, check the config file: {err}')
         exit()
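For reference, the constructor above expects config/config.json to carry these keys. A minimal sketch that writes such a file; only the key names come from the code above, every value is a placeholder:

 import json

 # All values are placeholders, not real credentials.
 config = {
     "db_host": "localhost",
     "db_user": "root",
     "db_passwd": "secret",
     "db_database": "spider",
     "db_charset": "utf8mb4",
 }
 with open("config/config.json", "w") as f:
     json.dump(config, f, indent=2)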
Example 10
 def getNEWS(self):
     logger.info('Fetching the latest 360 wallpaper data...')
     start, page, fetched, total = 0, 30, 0, 0
     # page through the newest wallpapers until an empty batch comes back
     while True:
         url = self.API_NEW.replace("@parmas", str(start),
                                    1).replace("@parmas", str(page))
         res = self.getText(self.API_HOST + url, self.header)
         data = json.loads(res).get("data", '')
         if isinstance(data, list) and len(data) > 0:
             for item in data:
                 print(f"fetching --> {total}", end='\r')
                 # request the full-size image (__100) instead of the thumbnail
                 url = sub(r'__\d\d', "__100", item['url'])
                 tag = item['tag']
                 result = self.dao.insert(url, tag)
                 if result is None:
                     fetched += 1
                 total += 1
             start += page  # advance the offset by the batch size
         else:
             break
     logger.info(f'spider360--getNEWS finished, fetched {fetched}/{total} images')
Example 11
 def __del__(self):
     logger.info(
         f'SpiderJinShan crawl finished, newly fetched: {self.TOTAL}, skipped as existing: {self.TOTAL_EXIST}'
     )
Example 12
 def __del__(self):
     logger.info('SpiderBiYing crawl finished')
Example 13
 def __del__(self):
     # The connection may never have been created if __init__ failed early.
     if hasattr(self, 'cnn'):
         self.cnn.close()
         logger.info('Database connection closed')
Example 14
 def __del__(self):
     during_time = time() - float(self.START_TIME)
     logger.info('Spider module finished, elapsed: %fs', during_time)
Example 15
 def __del__(self):
     # TODO: refine later
     logger.info('spider360 crawl finished')