Esempio n. 1
0
def get_fluent_id_count():
    """Return the total number of Liulishuo courses.

    Queries the filtered-curriculums listing endpoint and returns the
    ``total`` field of the JSON payload (may be ``None`` if absent).

    Side effects: prints the raw response body.
    """
    import requests
    import json  # local import keeps the function self-contained, matching the local `requests` import

    url = "https://apineo.llsapp.com/api/v1/curriculums/filtered"
    querystring = {
        "type": "1",
        "pageSize": "20",
        "level": "",
        "sort": "diamond_consume_desc",
        "page": "1",
        "appVer": "6",
        "deviceId": "354730011088642",
        "sDeviceId": "354730010301566",
        "appId": "lls",
        # Always use a freshly generated token; the old code shipped a
        # hard-coded token here only to overwrite it immediately.
        "token": get_token(),
    }
    headers = {
        'Accept-Language': "zh-cn",
        'User-Agent': "Lingome/5.0 (SM-G955N;Android 4.4.2;)",
        'Host': "apineo.llsapp.com",
        'Connection': "Keep-Alive",
        'Accept-Encoding': "gzip",
        'cache-control': "no-cache",
        'Postman-Token': "b4d8e656-09c2-4c88-a280-d0405f2f78d2"
    }
    response = requests.get(url, headers=headers, params=querystring)

    print(response.text)
    response_text = json.loads(response.text)
    return response_text.get('total')
Esempio n. 2
0
def course():
    """Fetch one course-detail page and hand the parsed JSON to ``storage()``.

    Side effects: prints the raw response body and persists the parsed
    payload via the module-level ``storage()`` helper.
    """
    url = "https://apineo.llsapp.com/api/v1/curriculums/3-cccccccccccccccccccccccc"
    querystring = {
        "appVer": "6",
        "clientAppVersion": "5.",
        "deviceId": "354730011088642",
        "sDeviceId": "354730010301566",
        "appId": "lls",
        "orderSourceType": "1",
        # Fresh token per call; the stale hard-coded token was dead weight.
        "token": get_token(),
    }
    headers = {
        'cache-control': "no-cache",
        'Postman-Token': "ad36915a-dc71-43ee-a819-779ddf840eb9"
    }
    # GET requests carry no body, so the old empty `data=payload` was dropped.
    response = requests.get(url, headers=headers, params=querystring)
    # Set the encoding BEFORE the first `.text` access so the printed body
    # is decoded correctly (the old code printed first, risking mojibake).
    response.encoding = "utf-8"
    print(response.text)
    response_text = json.loads(response.text)
    storage(response_text)
Esempio n. 3
0
 def parse_list(self, response):
     """Parse one listing page of curriculums and schedule a detail request
     for every course ID not yet present in our own ID store.

     Re-queues the same page when the API answers with its rate-limit
     message instead of JSON.
     """
     known_ids = get_course_id()  # course IDs already persisted in our DB
     print('课程id库课程总长度', len(known_ids))
     if "操作过于频繁,请稍后再试" in response.text:
         # Rate limited: retry the exact same listing page.
         yield scrapy.Request(url=response.url,
                              callback=self.parse_list,
                              dont_filter=True)
     else:
         response_text = json.loads(response.text)
         curriculums = response_text.get("curriculums")
         for curriculum in curriculums:
             # Collect every course ID exposed by this listing page.
             # Renamed from `id` to avoid shadowing the builtin.
             course_id = curriculum.get("course").get("id")
             fluent_id_list.append(course_id)
         print('获取全部流利课程ID', fluent_id_list)
         print('获取全部流利课程ID长度', len(fluent_id_list))
         left_id_list = set(fluent_id_list) - set(known_ids)
         print('剩余IDlist', left_id_list)
         # Iterating an empty set is a no-op, so no truthiness guard needed.
         for course_id in left_id_list:
             url = ("https://apineo.llsapp.com/api/v1/courses/" + course_id
                    + "?appVer=6&deviceId=354730011088642&sDeviceId=354730010301566&appId=lls&token="
                    + get_token())
             yield scrapy.Request(url,
                                  callback=self.parse_detail,
                                  dont_filter=True)
Esempio n. 4
0
 def parse(self, response):
     """Read ``total`` from the first listing page and schedule one listing
     request per page of 20 courses.
     """
     if "操作过于频繁,请稍后再试" in response.text:
         url = ("https://apineo.llsapp.com/api/v1/curriculums/filtered?type=1&pageSize=20&level=&sort=diamond_consume_desc&page=1&appVer=6&deviceId=354730011088642&sDeviceId=354730010301566&appId=lls&token="
                + get_token())
         yield scrapy.Request(url, callback=self.parse, dont_filter=True)
         # BUG FIX: the rate-limit body is not JSON — the old code fell
         # through to json.loads() below and would raise. Stop here.
         return
     response_text = json.loads(response.text)
     total = response_text.get("total")
     id_count = int(course_id_count())  # number of course IDs already stored
     # if total > id_count:
     total_page = math.ceil(int(total) / 20)  # round up: 20 courses per page
     for page in range(1, total_page + 1):
         url = ("https://apineo.llsapp.com/api/v1/curriculums/filtered?type=1&pageSize=20&level=&sort=diamond_consume_desc&page="
                + str(page)
                + "&appVer=6&deviceId=354730011088642&sDeviceId=354730010301566&appId=lls&token="
                + get_token())
         yield scrapy.Request(url,
                              callback=self.parse_list,
                              dont_filter=True)
Esempio n. 5
0
class Community(scrapy.Spider):
    """Spider that discovers every Liulishuo course ID and stores new ones
    through the ``Mysql_Course_ID_Pipeline``.
    """

    name = 'Course'
    start_urls = [
        "https://apineo.llsapp.com/api/v1/curriculums/filtered?type=1&pageSize=20&level=&sort=diamond_consume_desc&page=1&appVer=6&deviceId=354730011088642&sDeviceId=354730010301566&appId=lls&token="
        + get_token()
    ]
    custom_settings = {
        # Retry configuration.
        "RETRY_ENABLED": True,
        "RETRY_TIMES": 5,
        "COOKIES_ENABLED": False,
        "HTTPERROR_ALLOWED_CODES": [429, 401],  # 429 = rate limit, 401 = expired token; neither is fatal
        # "DOWNLOAD_DELAY": 0.2,
        # 'AUTOTHROTTLE_ENABLED': True,  # auto-throttle (disabled)
        "ITEM_PIPELINES": {
            'fast_fluent.pipelines.Mysql_Course_ID_Pipeline': 300
        },
        "DOWNLOADER_MIDDLEWARES": {
            # 'fast_fluent.middlewares.ProxyMiddleware': None,  # proxy disabled
            'fast_fluent.middlewares.ProxyMiddleware': 543,  # proxy enabled
        },
        'DEFAULT_REQUEST_HEADERS': {
            'Accept':
            "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            'Accept-Language': "zh-cn",
            'User-Agent': "Lingome/5.0 (SM-G955N;Android 4.4.2;)",
            'Accept-Encoding': "gzip,deflate",
            'Host': "apineo.llsapp.com",
            'cache-control': "no-cache",
        }
    }

    # Entry point: discover how many listing pages exist.
    def parse(self, response):
        """Read ``total`` from the first listing page and schedule one
        listing request per page of 20 courses."""
        if "操作过于频繁,请稍后再试" in response.text:
            url = ("https://apineo.llsapp.com/api/v1/curriculums/filtered?type=1&pageSize=20&level=&sort=diamond_consume_desc&page=1&appVer=6&deviceId=354730011088642&sDeviceId=354730010301566&appId=lls&token="
                   + get_token())
            yield scrapy.Request(url, callback=self.parse, dont_filter=True)
            # BUG FIX: the rate-limit body is not JSON — previously the code
            # fell through to json.loads() and would raise. Stop here.
            return
        response_text = json.loads(response.text)
        total = response_text.get("total")
        id_count = int(course_id_count())  # number of course IDs already stored
        # if total > id_count:
        total_page = math.ceil(int(total) / 20)  # round up: 20 courses per page
        for page in range(1, total_page + 1):
            url = ("https://apineo.llsapp.com/api/v1/curriculums/filtered?type=1&pageSize=20&level=&sort=diamond_consume_desc&page="
                   + str(page)
                   + "&appVer=6&deviceId=354730011088642&sDeviceId=354730010301566&appId=lls&token="
                   + get_token())
            yield scrapy.Request(url,
                                 callback=self.parse_list,
                                 dont_filter=True)
        # print('当前没有新增的课程')

    def parse_list(self, response):
        """Parse one listing page and schedule detail requests for course
        IDs missing from our own ID store."""
        known_ids = get_course_id()  # course IDs already persisted in our DB
        print('课程id库课程总长度', len(known_ids))
        if "操作过于频繁,请稍后再试" in response.text:
            # Rate limited: retry the exact same listing page.
            yield scrapy.Request(url=response.url,
                                 callback=self.parse_list,
                                 dont_filter=True)
        else:
            response_text = json.loads(response.text)
            curriculums = response_text.get("curriculums")
            for curriculum in curriculums:
                # Renamed from `id` to avoid shadowing the builtin.
                course_id = curriculum.get("course").get("id")
                fluent_id_list.append(course_id)
            print('获取全部流利课程ID', fluent_id_list)
            print('获取全部流利课程ID长度', len(fluent_id_list))
            left_id_list = set(fluent_id_list) - set(known_ids)
            print('剩余IDlist', left_id_list)
            # Iterating an empty set is a no-op, so no truthiness guard needed.
            for course_id in left_id_list:
                url = ("https://apineo.llsapp.com/api/v1/courses/" + course_id
                       + "?appVer=6&deviceId=354730011088642&sDeviceId=354730010301566&appId=lls&token="
                       + get_token())
                yield scrapy.Request(url,
                                     callback=self.parse_detail,
                                     dont_filter=True)

    def parse_detail(self, response):
        """Turn one course-detail response into a ``CourseItem`` carrying
        the course ID and today's date."""
        courseitem = CourseItem()
        logger.info('详情页的数据%s' % response.text)
        if "操作过于频繁,请稍后再试" in response.text:
            # Rate limited: retry the same detail page.
            yield scrapy.Request(url=response.url,
                                 callback=self.parse_detail,
                                 dont_filter=True)
        else:
            response_text = json.loads(response.text)
            courseitem["ID"] = response_text.get("id")
            localtime = time.localtime(time.time())
            insert_time = time.strftime("%Y-%m-%d", localtime)  # e.g. "2024-01-31"
            courseitem["insert_time"] = insert_time
            yield courseitem
Esempio n. 6
0
    def parse_detail(self, response):
        """Build a ``UserItem`` from one user-profile response, yield it,
        then schedule a request for every page of the user's followers.
        """
        user_detail = json.loads(response.text)
        useritem = UserItem()
        user = user_detail.get('user')
        useritem['id'] = user.get('id')
        useritem['repliesCount'] = user.get('repliesCount')  # replies posted
        useritem['topicsCount'] = user.get('topicsCount')  # topics posted
        useritem['coins'] = user.get('coins')  # coin balance
        useritem['stars'] = user.get('stars')  # star balance
        useritem['nick'] = user.get('nick')  # nickname
        useritem['gender'] = user.get('gender')  # gender
        useritem['birthYear'] = user.get('birthYear')  # birth year
        useritem['location'] = user.get('location')  # location
        useritem['profession'] = user.get('profession')  # profession
        useritem['level'] = user.get('level')  # level
        useritem['followersCount'] = user_detail.get('followersCount')  # follower count
        useritem['followingsCount'] = user_detail.get('followingsCount')  # following count
        useritem['dialogCount'] = user_detail.get('dialogCount')  # dialogs completed
        useritem['nonstopStudyDays'] = user_detail.get(
            'nonstopStudyDays')  # consecutive study days
        useritem['studyDays'] = user_detail.get('studyDays')  # total study days
        useritem['dialogAvgScore'] = user_detail.get('dialogAvgScore')  # average dialog score
        useritem['theSpeakingForce'] = user_detail.get(
            'theSpeakingForce')  # speaking-force score
        useritem['rank'] = user_detail.get('rank')  # percentile rank
        useritem['recordTime'] = user_detail.get('recordTime')  # total recording seconds

        yield useritem
        followersCount = useritem['followersCount']
        user_id = useritem['id']
        # Only followers are crawled; followings are not fetched here.
        # NOTE(review): assumes followersCount is numeric — TODO confirm API contract.
        if followersCount > 0:
            followers_maxpage = math.ceil(followersCount / 20)  # 20 followers per page
            for page_followers in range(1, followers_maxpage + 1):
                url = ("https://apineo.llsapp.com/api/v1/users/" + user_id
                       + "/followers?page=" + str(page_followers)
                       + "&pageSize=20&appId=lls&deviceId=354730010301566&sDeviceId=354730010301566&appVer=4&token="
                       + get_token())

                yield scrapy.Request(url, callback=self.follow_list)
Esempio n. 7
0
class Community(scrapy.Spider):
    """Spider that crawls Liulishuo user profiles via circle leaderboards,
    the global leaderboard, and each discovered user's follower lists.
    """

    name = 'Community'

    start_urls = [
        "http://apineo.llsapp.com/api/v1/circles/hot?appId=lls&deviceId=354730010301566&sDeviceId=354730010301566&appVer=4&token="
        + get_token(),
        "http://apineo.llsapp.com/api/v1/leaderboard/global?appId=lls&deviceId=354730010301566&sDeviceId=354730010301566&appVer=4&token="
        + get_token()
    ]
    custom_settings = {
        # Retry configuration.
        "RETRY_ENABLED": True,
        "RETRY_TIMES": 3,
        "COOKIES_ENABLED": False,
        "HTTPERROR_ALLOWED_CODES": [429],  # 429 (rate limit) is not fatal
        "DOWNLOAD_DELAY": 0.1,
        # 'AUTOTHROTTLE_ENABLED': True,  # auto-throttle (disabled)
        'DEFAULT_REQUEST_HEADERS': {
            'Accept':
            "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            'Accept-Language': "zh-cn",
            'User-Agent': "Lingome/5.0 (SM-G955N;Android 4.4.2;)",
            'Accept-Encoding': "gzip,deflate",
            'Host': "apineo.llsapp.com",
            'cache-control': "no-cache",
        }
    }

    # Entry point: dispatch on payload shape.
    def parse(self, response):
        """A list payload is circle data; a dict payload is the global
        leaderboards. Schedule follow-up requests accordingly."""
        response_text = json.loads(response.text)
        # Circle data: one leaderboard request per circle.
        if isinstance(response_text, list):
            for circle in response_text:
                community = circle.get('id')
                url = ("https://apineo.llsapp.com/api/v1/circles/" + community
                       + "/leaderboards/users?appId=lls&deviceId=354730010301566&sDeviceId=354730010301566&appVer=4&token="
                       + get_token())
                yield scrapy.Request(url, callback=self.parse_list)

        # Global leaderboard data: flatten every board's member list.
        if isinstance(response_text, dict):
            rank_list = []
            # NOTE(review): assumes all five boards are present in the payload
            # (a missing board would raise AttributeError) — TODO confirm.
            for board in ('star', 'coin', 'recordDuration', 'dialogCount',
                          'days'):
                rank_list.extend(response_text.get(board).get('members'))
            for rank in rank_list:
                user_id = rank.get('id')
                url = ("http://apineo.llsapp.com/api/v1/users/" + user_id
                       + "/profile?appId=lls&deviceId=354730010301566&sDeviceId=354730010301566&appVer=4&token="
                       + get_token())
                yield scrapy.Request(url, callback=self.parse_detail)

    # Circle leaderboard (top 100).
    def parse_list(self, response):
        """Schedule a profile request for every leaderboard entry."""
        leaders = json.loads(response.text).get('leaders')
        for leader in leaders:
            user_id = leader.get('id')
            # Leaderboard pages are small; no dedup check needed here —
            # only follower pages use get_user().
            url = ("http://apineo.llsapp.com/api/v1/users/" + user_id
                   + "/profile?appId=lls&deviceId=354730010301566&sDeviceId=354730010301566&appVer=4&token="
                   + get_token())
            yield scrapy.Request(url, callback=self.parse_detail)

    # Follower / following page.
    def follow_list(self, response):
        """Schedule a profile request for every not-yet-crawled user on a
        follower page. get_user() returns 0 (new) or 1 (already stored)."""
        users = json.loads(response.text).get('users')
        for user in users:
            user_id = user.get('id')
            user_count = get_user(user_id)
            # Only fetch users we have not stored yet (the old dead
            # `if user_count == 1: pass` branch was removed).
            if user_count == 0:
                url = ("http://apineo.llsapp.com/api/v1/users/" + user_id
                       + "/profile?appId=lls&deviceId=354730010301566&sDeviceId=354730010301566&appVer=4&token="
                       + get_token())
                yield scrapy.Request(url, callback=self.parse_detail)

    # User profile page.
    def parse_detail(self, response):
        """Build a ``UserItem`` from one profile response, yield it, then
        schedule every page of the user's follower list."""
        user_detail = json.loads(response.text)
        useritem = UserItem()
        user = user_detail.get('user')
        useritem['id'] = user.get('id')
        useritem['repliesCount'] = user.get('repliesCount')  # replies posted
        useritem['topicsCount'] = user.get('topicsCount')  # topics posted
        useritem['coins'] = user.get('coins')  # coin balance
        useritem['stars'] = user.get('stars')  # star balance
        useritem['nick'] = user.get('nick')  # nickname
        useritem['gender'] = user.get('gender')  # gender
        useritem['birthYear'] = user.get('birthYear')  # birth year
        useritem['location'] = user.get('location')  # location
        useritem['profession'] = user.get('profession')  # profession
        useritem['level'] = user.get('level')  # level
        useritem['followersCount'] = user_detail.get('followersCount')  # follower count
        useritem['followingsCount'] = user_detail.get('followingsCount')  # following count
        useritem['dialogCount'] = user_detail.get('dialogCount')  # dialogs completed
        useritem['nonstopStudyDays'] = user_detail.get(
            'nonstopStudyDays')  # consecutive study days
        useritem['studyDays'] = user_detail.get('studyDays')  # total study days
        useritem['dialogAvgScore'] = user_detail.get('dialogAvgScore')  # average dialog score
        useritem['theSpeakingForce'] = user_detail.get(
            'theSpeakingForce')  # speaking-force score
        useritem['rank'] = user_detail.get('rank')  # percentile rank
        useritem['recordTime'] = user_detail.get('recordTime')  # total recording seconds

        yield useritem
        followersCount = useritem['followersCount']
        user_id = useritem['id']
        # Only followers are crawled; followings are not fetched here.
        if followersCount > 0:
            followers_maxpage = math.ceil(followersCount / 20)  # 20 followers per page
            for page_followers in range(1, followers_maxpage + 1):
                url = ("https://apineo.llsapp.com/api/v1/users/" + user_id
                       + "/followers?page=" + str(page_followers)
                       + "&pageSize=20&appId=lls&deviceId=354730010301566&sDeviceId=354730010301566&appVer=4&token="
                       + get_token())

                yield scrapy.Request(url, callback=self.follow_list)
Esempio n. 8
0
 def follow_list(self, response):
     """Schedule a profile request for every not-yet-crawled user on a
     follower/following page. ``get_user()`` returns 0 for a new user and
     1 for one already stored.
     """
     users = json.loads(response.text).get('users')
     for user in users:
         user_id = user.get('id')
         user_count = get_user(user_id)
         # Only fetch users we have not stored yet; the old dead
         # `if user_count == 1: pass` branch was removed.
         if user_count == 0:
             url = ("http://apineo.llsapp.com/api/v1/users/" + user_id
                    + "/profile?appId=lls&deviceId=354730010301566&sDeviceId=354730010301566&appVer=4&token="
                    + get_token())
             yield scrapy.Request(url, callback=self.parse_detail)
Esempio n. 9
0
 def parse_list(self, response):
     """Fan out one profile request per leaderboard entry."""
     payload = json.loads(response.text)
     for entry in payload.get('leaders'):
         uid = entry.get('id')
         profile_url = (
             "http://apineo.llsapp.com/api/v1/users/" + uid +
             "/profile?appId=lls&deviceId=354730010301566&sDeviceId=354730010301566&appVer=4&token="
             + get_token())
         yield scrapy.Request(profile_url, callback=self.parse_detail)
Esempio n. 10
0
    def parse(self, response):
        """Dispatch on payload shape: a list is circle data, a dict is the
        set of global leaderboards."""
        payload = json.loads(response.text)

        # Circle data: one leaderboard request per circle ID.
        if isinstance(payload, list):
            for circle in payload:
                circle_id = circle.get('id')
                board_url = (
                    "https://apineo.llsapp.com/api/v1/circles/" + circle_id +
                    "/leaderboards/users?appId=lls&deviceId=354730010301566&sDeviceId=354730010301566&appVer=4&token="
                    + get_token())
                yield scrapy.Request(board_url, callback=self.parse_list)

        # Leaderboard data: flatten all five boards' member lists, in the
        # same order as before, then request each member's profile.
        if isinstance(payload, dict):
            members = []
            for board in ('star', 'coin', 'recordDuration', 'dialogCount',
                          'days'):
                members.extend(payload.get(board).get('members'))
            for member in members:
                uid = member.get('id')
                profile_url = (
                    "http://apineo.llsapp.com/api/v1/users/" + uid +
                    "/profile?appId=lls&deviceId=354730010301566&sDeviceId=354730010301566&appVer=4&token="
                    + get_token())
                yield scrapy.Request(profile_url, callback=self.parse_detail)
Esempio n. 11
0
 def parse(self, response):
     """Schedule a detail request for every course ID not yet crawled today.

     ``un_get_id()`` returns the set of pending course IDs from the ID pool.
     """
     # id_set = get_course_id()  # alternative: all IDs from the course pool
     id_set = un_get_id()  # course IDs not yet crawled
     if id_set:
         # Renamed loop variable from `id` to avoid shadowing the builtin.
         for course_id in id_set:
             url = ("https://apineo.llsapp.com/api/v1/courses/" + course_id
                    + "?appVer=6&deviceId=354730011088642&sDeviceId=354730010301566&appId=lls&token="
                    + get_token())
             yield scrapy.Request(url,
                                  callback=self.parse_detail,
                                  dont_filter=True)
     else:
         print('今日数据已经抓取完毕')
Esempio n. 12
0
class Community(scrapy.Spider):
    """Spider that crawls detail data (price, learner count, title) for
    course IDs pending in the local pool and stores them via MysqlPipeline.
    """

    name = 'Course_mysql'
    start_urls = [
        "https://apineo.llsapp.com/api/v1/curriculums/filtered?type=1&pageSize=20&level=&sort=diamond_consume_desc&page=1&appVer=6&deviceId=354730011088642&sDeviceId=354730010301566&appId=lls&token="
        + get_token()
    ]
    custom_settings = {
        # Retry configuration.
        "RETRY_ENABLED": True,
        "RETRY_TIMES": 3,
        "COOKIES_ENABLED": False,
        "HTTPERROR_ALLOWED_CODES": [429, 401],  # 429 = rate limit, 401 = expired token; neither is fatal
        # "DOWNLOAD_DELAY": 0.2,
        "ITEM_PIPELINES": {
            'fast_fluent.pipelines.MysqlPipeline': 300
        },
        "DOWNLOADER_MIDDLEWARES": {
            # 'fast_fluent.middlewares.ProxyMiddleware': None,  # proxy disabled
            'fast_fluent.middlewares.ProxyMiddleware': 543,  # proxy enabled
        },
        'DEFAULT_REQUEST_HEADERS': {
            'Accept':
            "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            'Accept-Language': "zh-cn",
            'User-Agent': "Lingome/5.0 (SM-G955N;Android 4.4.2;)",
            'Accept-Encoding': "gzip,deflate",
            'Host': "apineo.llsapp.com",
            'cache-control': "no-cache",
        }
    }

    def parse(self, response):
        """Schedule a detail request for every course ID not yet crawled."""
        # id_set = get_course_id()  # alternative: all IDs from the course pool
        id_set = un_get_id()  # course IDs not yet crawled
        if id_set:
            # Renamed loop variable from `id` to avoid shadowing the builtin.
            for course_id in id_set:
                url = ("https://apineo.llsapp.com/api/v1/courses/" + course_id
                       + "?appVer=6&deviceId=354730011088642&sDeviceId=354730010301566&appId=lls&token="
                       + get_token())
                yield scrapy.Request(url,
                                     callback=self.parse_detail,
                                     dont_filter=True)
        else:
            print('今日数据已经抓取完毕')

    def parse_detail(self, response):
        """Turn one course-detail response into a ``CourseItem``; retry the
        page when the API answers with its rate-limit message."""
        courseitem = CourseItem()
        logger.info('详情页数据%s', response.text)
        if "操作过于频繁,请稍后再试" in response.text:
            yield scrapy.Request(url=response.url,
                                 callback=self.parse_detail,
                                 dont_filter=True)
        else:
            response_text = json.loads(response.text)
            courseitem["ID"] = response_text.get("id")
            # Divide by 10 to convert the diamond price into RMB.
            # NOTE(review): assumes "diamondPrice" is always present and
            # numeric (a missing key would make this raise) — TODO confirm.
            courseitem["diamondPrice"] = response_text.get(
                "diamondPrice") / 10
            courseitem["studyUsersCount"] = response_text.get(
                "studyUsersCount")
            courseitem["translatedTitle"] = response_text.get(
                "translatedTitle")
            localtime = time.localtime(time.time())
            strtime = time.strftime("%Y-%m-%d", localtime)  # e.g. "2024-01-31"
            courseitem["times"] = strtime
            yield courseitem