def get_fluent_id_count():
    """Return the total number of courses reported by the filtered-curriculums API.

    Issues one GET request with a freshly fetched token and reads the
    ``total`` field of the JSON payload. Returns ``None`` when the payload
    carries no ``total`` key.
    """
    import requests  # local import kept from the original; file-level imports are not visible here

    url = "https://apineo.llsapp.com/api/v1/curriculums/filtered"
    querystring = {
        "type": "1",
        "pageSize": "20",
        "level": "",
        "sort": "diamond_consume_desc",
        "page": "1",
        "appVer": "6",
        "deviceId": "354730011088642",
        "sDeviceId": "354730010301566",
        "appId": "lls",
        # BUG FIX: the original put a stale hard-coded token here and then
        # immediately overwrote it; fetch a fresh one directly.
        "token": get_token(),
    }
    headers = {
        'Accept-Language': "zh-cn",
        'User-Agent': "Lingome/5.0 (SM-G955N;Android 4.4.2;)",
        'Host': "apineo.llsapp.com",
        'Connection': "Keep-Alive",
        'Accept-Encoding': "gzip",
        'cache-control': "no-cache",
        'Postman-Token': "b4d8e656-09c2-4c88-a280-d0405f2f78d2",
    }
    response = requests.request("GET", url, headers=headers, params=querystring)
    print(response.text)
    response_text = json.loads(response.text)
    return response_text.get('total')
def course():
    """Fetch one curriculum's detail payload and persist it via storage().

    The curriculum id is hard-coded in the URL; the auth token is fetched
    fresh on every call.
    """
    url = "https://apineo.llsapp.com/api/v1/curriculums/3-cccccccccccccccccccccccc"
    querystring = {
        "appVer": "6",
        "clientAppVersion": "5.",
        # BUG FIX: the original put a stale hard-coded token here and then
        # immediately overwrote it; fetch a fresh one directly.
        "token": get_token(),
        "deviceId": "354730011088642",
        "sDeviceId": "354730010301566",
        "appId": "lls",
        "orderSourceType": "1",
    }
    headers = {
        'cache-control': "no-cache",
        'Postman-Token': "ad36915a-dc71-43ee-a819-779ddf840eb9",
    }
    # Dropped the empty `data=payload` body: a GET request carries no payload.
    response = requests.request("GET", url, headers=headers, params=querystring)
    # BUG FIX: force UTF-8 BEFORE the first .text access; the original set the
    # encoding only after printing, so the printed text could be mojibake.
    response.encoding = "utf-8"
    print(response.text)
    response_text = json.loads(response.text)
    storage(response_text)
def parse_list(self, response):
    """Collect course ids from one listing page and request details for unseen ids.

    Accumulates every seen id into the module-level ``fluent_id_list``,
    subtracts the ids already stored in our own database, and dispatches one
    detail request per remaining id. Retries the same URL when rate limited.
    """
    known_ids = get_course_id()  # ids already stored in our own database
    print('课程id库课程总长度', len(known_ids))
    if "操作过于频繁,请稍后再试" in response.text:
        # Rate limited: re-issue the identical request.
        yield scrapy.Request(url=response.url,
                             callback=self.parse_list,
                             dont_filter=True)
    else:
        response_text = json.loads(response.text)
        curriculums = response_text.get("curriculums")
        for curriculum in curriculums:
            # Renamed from `id` to avoid shadowing the builtin.
            course_id = curriculum.get("course").get("id")
            fluent_id_list.append(course_id)  # every course id seen so far
        print('获取全部流利课程ID', fluent_id_list)
        print('获取全部流利课程ID长度', len(fluent_id_list))
        left_id_list = set(fluent_id_list) - set(known_ids)
        print('剩余IDlist', left_id_list)
        if left_id_list:
            for course_id in left_id_list:
                url = ("https://apineo.llsapp.com/api/v1/courses/" + course_id
                       + "?appVer=6&deviceId=354730011088642&sDeviceId=354730010301566&appId=lls&token="
                       + get_token())
                yield scrapy.Request(url,
                                     callback=self.parse_detail,
                                     dont_filter=True)
def parse(self, response):
    """Read the course total from page 1 and dispatch one listing request per page.

    Pages are 20 courses each; a fresh token is fetched for every request.
    Retries page 1 when the API answers with its rate-limit message.
    """
    if "操作过于频繁,请稍后再试" in response.text:
        url = ("https://apineo.llsapp.com/api/v1/curriculums/filtered?type=1&pageSize=20&level=&sort=diamond_consume_desc&page=1&appVer=6&deviceId=354730011088642&sDeviceId=354730010301566&appId=lls&token="
               + get_token())
        yield scrapy.Request(url, callback=self.parse, dont_filter=True)
        # BUG FIX: the original fell through here and tried to json-parse the
        # rate-limit body, crashing on int(None) below.
        return
    response_text = json.loads(response.text)
    total = response_text.get("total")
    id_count = int(course_id_count())  # number of course ids already stored (currently unused)
    # if total > id_count:
    total_page = math.ceil(int(total) / 20) + 1  # +1 because range() end is exclusive
    for page in range(1, int(total_page)):
        url = ("https://apineo.llsapp.com/api/v1/curriculums/filtered?type=1&pageSize=20&level=&sort=diamond_consume_desc&page="
               + str(page)
               + "&appVer=6&deviceId=354730011088642&sDeviceId=354730010301566&appId=lls&token="
               + get_token())
        yield scrapy.Request(url, callback=self.parse_list, dont_filter=True)
class Community(scrapy.Spider):
    """Spider 'Course': discovers every Liulishuo course id and emits a CourseItem per new id.

    Flow: parse (page count) -> parse_list (ids per page) -> parse_detail
    (one CourseItem per course id not yet in our database).
    """

    name = 'Course'
    start_urls = [
        "https://apineo.llsapp.com/api/v1/curriculums/filtered?type=1&pageSize=20&level=&sort=diamond_consume_desc&page=1&appVer=6&deviceId=354730011088642&sDeviceId=354730010301566&appId=lls&token="
        + get_token()
    ]
    custom_settings = {
        # Retry policy
        "RETRY_ENABLED": True,
        "RETRY_TIMES": 5,
        "COOKIES_ENABLED": False,
        "HTTPERROR_ALLOWED_CODES": [429, 401],  # don't raise on HTTP 429/401
        # "DOWNLOAD_DELAY": 0.2,
        # 'AUTOTHROTTLE_ENABLED': True,  # enable auto-throttling
        "ITEM_PIPELINES": {
            'fast_fluent.pipelines.Mysql_Course_ID_Pipeline': 300
        },
        "DOWNLOADER_MIDDLEWARES": {
            # 'fast_fluent.middlewares.ProxyMiddleware': None,  # proxy disabled
            'fast_fluent.middlewares.ProxyMiddleware': 543,  # proxy enabled
        },
        'DEFAULT_REQUEST_HEADERS': {
            'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            'Accept-Language': "zh-cn",
            'User-Agent': "Lingome/5.0 (SM-G955N;Android 4.4.2;)",
            'Accept-Encoding': "gzip,deflate",
            'Host': "apineo.llsapp.com",
            'cache-control': "no-cache",
        }
    }

    # Fetch the full course listing.
    def parse(self, response):
        """Read the course total and dispatch one listing request per page of 20."""
        if "操作过于频繁,请稍后再试" in response.text:
            url = ("https://apineo.llsapp.com/api/v1/curriculums/filtered?type=1&pageSize=20&level=&sort=diamond_consume_desc&page=1&appVer=6&deviceId=354730011088642&sDeviceId=354730010301566&appId=lls&token="
                   + get_token())
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)
            # BUG FIX: the original fell through and json-parsed the
            # rate-limit body, crashing on int(None) below.
            return
        response_text = json.loads(response.text)
        total = response_text.get("total")
        id_count = int(course_id_count())  # number of stored course ids (currently unused)
        # if total > id_count:
        total_page = math.ceil(int(total) / 20) + 1  # +1: range() end is exclusive
        for page in range(1, int(total_page)):
            url = ("https://apineo.llsapp.com/api/v1/curriculums/filtered?type=1&pageSize=20&level=&sort=diamond_consume_desc&page="
                   + str(page)
                   + "&appVer=6&deviceId=354730011088642&sDeviceId=354730010301566&appId=lls&token="
                   + get_token())
            yield scrapy.Request(url, callback=self.parse_list, dont_filter=True)
        # print('当前没有新增的课程')

    def parse_list(self, response):
        """Collect course ids from one listing page; request details for unseen ids."""
        known_ids = get_course_id()  # ids already stored in our own database
        print('课程id库课程总长度', len(known_ids))
        if "操作过于频繁,请稍后再试" in response.text:
            # Rate limited: re-issue the identical request.
            yield scrapy.Request(url=response.url,
                                 callback=self.parse_list,
                                 dont_filter=True)
        else:
            response_text = json.loads(response.text)
            curriculums = response_text.get("curriculums")
            for curriculum in curriculums:
                # Renamed from `id` to avoid shadowing the builtin.
                course_id = curriculum.get("course").get("id")
                fluent_id_list.append(course_id)  # module-level accumulator
            print('获取全部流利课程ID', fluent_id_list)
            print('获取全部流利课程ID长度', len(fluent_id_list))
            left_id_list = set(fluent_id_list) - set(known_ids)
            print('剩余IDlist', left_id_list)
            if left_id_list:
                for course_id in left_id_list:
                    url = ("https://apineo.llsapp.com/api/v1/courses/" + course_id
                           + "?appVer=6&deviceId=354730011088642&sDeviceId=354730010301566&appId=lls&token="
                           + get_token())
                    yield scrapy.Request(url,
                                         callback=self.parse_detail,
                                         dont_filter=True)

    def parse_detail(self, response):
        """Emit a CourseItem carrying the course id and today's insert date."""
        courseitem = CourseItem()
        # Lazy %-args instead of eager string formatting.
        logger.info('详情页的数据%s', response.text)
        if "操作过于频繁,请稍后再试" in response.text:
            yield scrapy.Request(url=response.url,
                                 callback=self.parse_detail,
                                 dont_filter=True)
        else:
            response_text = json.loads(response.text)
            courseitem["ID"] = response_text.get("id")
            localtime = time.localtime(time.time())
            insert_time = time.strftime("%Y-%m-%d", localtime)
            courseitem["insert_time"] = insert_time
            yield courseitem
def parse_detail(self, response):
    """Parse a user's profile JSON into a UserItem, then fan out to follower pages.

    Yields the populated UserItem first, then one request per follower
    listing page (20 users per page) when the user has followers.
    """
    user_detail = json.loads(response.text)
    useritem = UserItem()
    user = user_detail.get('user')
    useritem['id'] = user.get('id')
    useritem['repliesCount'] = user.get('repliesCount')  # reply count
    useritem['topicsCount'] = user.get('topicsCount')  # topics posted
    useritem['coins'] = user.get('coins')  # coin count
    useritem['stars'] = user.get('stars')  # star count
    useritem['nick'] = user.get('nick')  # nickname
    useritem['gender'] = user.get('gender')  # gender
    useritem['birthYear'] = user.get('birthYear')  # birth year
    useritem['location'] = user.get('location')  # location
    useritem['profession'] = user.get('profession')  # profession
    useritem['level'] = user.get('level')  # level
    useritem['followersCount'] = user_detail.get('followersCount')  # follower count
    useritem['followingsCount'] = user_detail.get('followingsCount')  # following count
    useritem['dialogCount'] = user_detail.get('dialogCount')  # total dialogs completed
    useritem['nonstopStudyDays'] = user_detail.get('nonstopStudyDays')  # consecutive study days
    useritem['studyDays'] = user_detail.get('studyDays')  # cumulative study days
    useritem['dialogAvgScore'] = user_detail.get('dialogAvgScore')  # average dialog score
    useritem['theSpeakingForce'] = user_detail.get('theSpeakingForce')  # speaking-force score
    useritem['rank'] = user_detail.get('rank')  # share of users outranked
    useritem['recordTime'] = user_detail.get('recordTime')  # total recording seconds
    yield useritem
    followersCount = useritem['followersCount']
    user_id = useritem['id']
    # BUG FIX: guard against a missing followersCount (None) before comparing;
    # the original raised TypeError on `None > 0`.
    if followersCount and followersCount > 0:
        followers_maxpage = math.ceil(followersCount / 20) + 1  # +1: range() end is exclusive
        for page_followers in range(1, int(followers_maxpage)):
            url = ("https://apineo.llsapp.com/api/v1/users/" + user_id
                   + "/followers?page=" + str(page_followers)
                   + "&pageSize=20&appId=lls&deviceId=354730010301566&sDeviceId=354730010301566&appVer=4&token="
                   + get_token())
            yield scrapy.Request(url, callback=self.follow_list)
class Community(scrapy.Spider):
    """Spider 'Community': harvests user profiles from hot circles and global leaderboards.

    Flow: parse routes the two start URLs (circle list vs. leaderboard dict)
    -> parse_list (circle top-100) / follow_list (follower pages)
    -> parse_detail (one UserItem per profile, fanning out to followers).
    """

    name = 'Community'
    start_urls = [
        "http://apineo.llsapp.com/api/v1/circles/hot?appId=lls&deviceId=354730010301566&sDeviceId=354730010301566&appVer=4&token="
        + get_token(),
        "http://apineo.llsapp.com/api/v1/leaderboard/global?appId=lls&deviceId=354730010301566&sDeviceId=354730010301566&appVer=4&token="
        + get_token()
    ]
    custom_settings = {
        # Retry policy
        "RETRY_ENABLED": True,
        "RETRY_TIMES": 3,
        "COOKIES_ENABLED": False,
        "HTTPERROR_ALLOWED_CODES": [429],  # don't raise on HTTP 429
        "DOWNLOAD_DELAY": 0.1,
        # 'AUTOTHROTTLE_ENABLED': True,  # enable auto-throttling
        'DEFAULT_REQUEST_HEADERS': {
            'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            'Accept-Language': "zh-cn",
            'User-Agent': "Lingome/5.0 (SM-G955N;Android 4.4.2;)",
            'Accept-Encoding': "gzip,deflate",
            'Host': "apineo.llsapp.com",
            'cache-control': "no-cache",
        }
    }

    # Obtain every circle id via the API.
    def parse(self, response):
        """Route start-URL responses: a list payload is the hot-circle index, a dict is the global leaderboard."""
        response_text = json.loads(response.text)
        # Hot circles: a JSON array of circle objects.
        if isinstance(response_text, list):
            for circle in response_text:
                community = circle.get('id')
                url = ("https://apineo.llsapp.com/api/v1/circles/" + community
                       + "/leaderboards/users?appId=lls&deviceId=354730010301566&sDeviceId=354730010301566&appVer=4&token="
                       + get_token())
                yield scrapy.Request(url, callback=self.parse_list)
        # Global leaderboard: one dict with a member list per board.
        if isinstance(response_text, dict):
            # print(response_text)
            rank_list = []
            # The original extracted each board with five copy-pasted blocks;
            # a loop over the board keys is equivalent and shorter.
            # ('recordDuration' had no data during testing.)
            for board in ('star', 'coin', 'recordDuration', 'dialogCount', 'days'):
                rank_list.extend(response_text.get(board).get('members'))
            for rank in rank_list:
                user_id = rank.get('id')
                url = ("http://apineo.llsapp.com/api/v1/users/" + user_id
                       + "/profile?appId=lls&deviceId=354730010301566&sDeviceId=354730010301566&appVer=4&token="
                       + get_token())
                yield scrapy.Request(url, callback=self.parse_detail)

    # Top-100 leaderboard of one circle.
    def parse_list(self, response):
        """Request the profile page of every leader of a circle."""
        leaders = json.loads(response.text).get('leaders')
        for leader in leaders:
            user_id = leader.get('id')
            # Dedup via get_user() is intentionally skipped here; it is only
            # needed on follower pages.
            url = ("http://apineo.llsapp.com/api/v1/users/" + user_id
                   + "/profile?appId=lls&deviceId=354730010301566&sDeviceId=354730010301566&appVer=4&token="
                   + get_token())
            yield scrapy.Request(url, callback=self.parse_detail)

    # Follower or following listing pages.
    def follow_list(self, response):
        """Request profiles for listed users not yet stored in the database."""
        users = json.loads(response.text).get('users')
        for user in users:
            user_id = user.get('id')
            # get_user() returns 1 when the user is already stored, 0 otherwise.
            # The original's `if user_count == 1: pass` branch was dead code.
            if get_user(user_id) == 0:
                url = ("http://apineo.llsapp.com/api/v1/users/" + user_id
                       + "/profile?appId=lls&deviceId=354730010301566&sDeviceId=354730010301566&appVer=4&token="
                       + get_token())
                yield scrapy.Request(url, callback=self.parse_detail)

    # User profile page.
    def parse_detail(self, response):
        """Parse one profile JSON into a UserItem, then fan out to follower pages."""
        user_detail = json.loads(response.text)
        useritem = UserItem()
        user = user_detail.get('user')
        useritem['id'] = user.get('id')
        useritem['repliesCount'] = user.get('repliesCount')  # reply count
        useritem['topicsCount'] = user.get('topicsCount')  # topics posted
        useritem['coins'] = user.get('coins')  # coin count
        useritem['stars'] = user.get('stars')  # star count
        useritem['nick'] = user.get('nick')  # nickname
        useritem['gender'] = user.get('gender')  # gender
        useritem['birthYear'] = user.get('birthYear')  # birth year
        useritem['location'] = user.get('location')  # location
        useritem['profession'] = user.get('profession')  # profession
        useritem['level'] = user.get('level')  # level
        useritem['followersCount'] = user_detail.get('followersCount')  # follower count
        useritem['followingsCount'] = user_detail.get('followingsCount')  # following count
        useritem['dialogCount'] = user_detail.get('dialogCount')  # total dialogs completed
        useritem['nonstopStudyDays'] = user_detail.get('nonstopStudyDays')  # consecutive study days
        useritem['studyDays'] = user_detail.get('studyDays')  # cumulative study days
        useritem['dialogAvgScore'] = user_detail.get('dialogAvgScore')  # average dialog score
        useritem['theSpeakingForce'] = user_detail.get('theSpeakingForce')  # speaking-force score
        useritem['rank'] = user_detail.get('rank')  # share of users outranked
        useritem['recordTime'] = user_detail.get('recordTime')  # total recording seconds
        yield useritem
        followersCount = useritem['followersCount']
        user_id = useritem['id']
        # BUG FIX: guard against a missing followersCount (None) before
        # comparing; the original raised TypeError on `None > 0`.
        if followersCount and followersCount > 0:
            followers_maxpage = math.ceil(followersCount / 20) + 1  # +1: range() end is exclusive
            for page_followers in range(1, int(followers_maxpage)):
                # print(page)
                url = ("https://apineo.llsapp.com/api/v1/users/" + user_id
                       + "/followers?page=" + str(page_followers)
                       + "&pageSize=20&appId=lls&deviceId=354730010301566&sDeviceId=354730010301566&appVer=4&token="
                       + get_token())
                yield scrapy.Request(url, callback=self.follow_list)
def follow_list(self, response):
    """Walk one follower/following listing page and request profiles for unseen users."""
    users = json.loads(response.text).get('users')
    for user in users:
        user_id = user.get('id')
        # get_user() returns 1 when the user is already stored, 0 otherwise.
        # The original's `if user_count == 1: pass` branch was dead code.
        if get_user(user_id) == 0:
            url = ("http://apineo.llsapp.com/api/v1/users/" + user_id
                   + "/profile?appId=lls&deviceId=354730010301566&sDeviceId=354730010301566&appVer=4&token="
                   + get_token())
            yield scrapy.Request(url, callback=self.parse_detail)
def parse_list(self, response):
    """Request the profile page of every user on a circle's top-100 leaderboard."""
    payload = json.loads(response.text)
    for leader in payload.get('leaders'):
        uid = leader.get('id')
        # Dedup via get_user() is intentionally skipped here; only the
        # follower pages need it.
        profile_url = (
            "http://apineo.llsapp.com/api/v1/users/" + uid
            + "/profile?appId=lls&deviceId=354730010301566&sDeviceId=354730010301566&appVer=4&token="
            + get_token()
        )
        yield scrapy.Request(profile_url, callback=self.parse_detail)
def parse(self, response):
    """Route start-URL responses: a list payload is the hot-circle index, a dict is the global leaderboard."""
    response_text = json.loads(response.text)
    # Hot circles: a JSON array of circle objects.
    if isinstance(response_text, list):
        for circle in response_text:
            community = circle.get('id')
            url = ("https://apineo.llsapp.com/api/v1/circles/" + community
                   + "/leaderboards/users?appId=lls&deviceId=354730010301566&sDeviceId=354730010301566&appVer=4&token="
                   + get_token())
            yield scrapy.Request(url, callback=self.parse_list)
    # Global leaderboard: one dict with a member list per board.
    if isinstance(response_text, dict):
        # print(response_text)
        rank_list = []
        # The original extracted each board with five copy-pasted blocks; a
        # loop over the board keys is equivalent and shorter.
        # ('recordDuration' had no data during testing.)
        for board in ('star', 'coin', 'recordDuration', 'dialogCount', 'days'):
            rank_list.extend(response_text.get(board).get('members'))
        for rank in rank_list:
            user_id = rank.get('id')
            url = ("http://apineo.llsapp.com/api/v1/users/" + user_id
                   + "/profile?appId=lls&deviceId=354730010301566&sDeviceId=354730010301566&appVer=4&token="
                   + get_token())
            yield scrapy.Request(url, callback=self.parse_detail)
def parse(self, response):
    """Dispatch one detail request for every course id that has not been scraped yet."""
    # id_set = get_course_id()  # freshly created ids from the course pool
    pending = un_get_id()  # course ids not yet scraped
    if not pending:
        print('今日数据已经抓取完毕')
        return
    for course_id in pending:
        detail_url = (
            "https://apineo.llsapp.com/api/v1/courses/" + course_id
            + "?appVer=6&deviceId=354730011088642&sDeviceId=354730010301566&appId=lls&token="
            + get_token()
        )
        yield scrapy.Request(detail_url,
                             callback=self.parse_detail,
                             dont_filter=True)
class Community(scrapy.Spider):
    """Spider 'Course_mysql': scrapes price/usage details for pending course ids.

    Flow: parse pulls the not-yet-scraped ids -> parse_detail emits one
    CourseItem (id, price in RMB, learner count, translated title, date).
    """

    name = 'Course_mysql'
    start_urls = [
        "https://apineo.llsapp.com/api/v1/curriculums/filtered?type=1&pageSize=20&level=&sort=diamond_consume_desc&page=1&appVer=6&deviceId=354730011088642&sDeviceId=354730010301566&appId=lls&token="
        + get_token()
    ]
    custom_settings = {
        # Retry policy
        "RETRY_ENABLED": True,
        "RETRY_TIMES": 3,
        "COOKIES_ENABLED": False,
        "HTTPERROR_ALLOWED_CODES": [429, 401],  # 429 = rate limit, 401 = expired token
        # "DOWNLOAD_DELAY": 0.2,
        "ITEM_PIPELINES": {
            'fast_fluent.pipelines.MysqlPipeline': 300
        },
        "DOWNLOADER_MIDDLEWARES": {
            # 'fast_fluent.middlewares.ProxyMiddleware': None,  # proxy disabled
            'fast_fluent.middlewares.ProxyMiddleware': 543,  # proxy enabled
        },
        'DEFAULT_REQUEST_HEADERS': {
            'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            'Accept-Language': "zh-cn",
            'User-Agent': "Lingome/5.0 (SM-G955N;Android 4.4.2;)",
            'Accept-Encoding': "gzip,deflate",
            'Host': "apineo.llsapp.com",
            'cache-control': "no-cache",
        }
    }

    def parse(self, response):
        """Dispatch one detail request per course id that has not been scraped yet."""
        # id_set = get_course_id()  # freshly created ids from the course pool
        pending = un_get_id()  # course ids not yet scraped
        if not pending:
            print('今日数据已经抓取完毕')
            return
        for course_id in pending:
            detail_url = (
                "https://apineo.llsapp.com/api/v1/courses/" + course_id
                + "?appVer=6&deviceId=354730011088642&sDeviceId=354730010301566&appId=lls&token="
                + get_token()
            )
            yield scrapy.Request(detail_url,
                                 callback=self.parse_detail,
                                 dont_filter=True)

    def parse_detail(self, response):
        """Emit one CourseItem from a course detail payload; retry on rate limit."""
        item = CourseItem()
        logger.info('详情页数据%s', response.text)
        if "操作过于频繁,请稍后再试" in response.text:
            # Rate limited: re-issue the identical request.
            yield scrapy.Request(url=response.url,
                                 callback=self.parse_detail,
                                 dont_filter=True)
            return
        data = json.loads(response.text)
        item["ID"] = data.get("id")
        item["diamondPrice"] = data.get("diamondPrice") / 10  # divide by 10 to normalize to RMB
        item["studyUsersCount"] = data.get("studyUsersCount")
        item["translatedTitle"] = data.get("translatedTitle")
        item["times"] = time.strftime("%Y-%m-%d", time.localtime(time.time()))
        yield item