Code Example #1
    def running(self, pcode, pykm):
        """Collect the list of issues of the specified journal

        :param pcode: <str> CNKI database the journal belongs to
        :param pykm: <str> CNKI journal ID (primary ID)
        :return: <list:dict> list of journal issues
        """
        issue_list = []

        ajax_url = "http://navi.cnki.net/knavi/JournalDetail/GetJournalYearList?pcode={}&pykm={}&pIdx=0".format(
            pcode, pykm)
        if html_text := tool.do_request(ajax_url).content.decode(
                errors="ignore"):  # request the Ajax page listing the journal issues

            bs = BeautifulSoup(html_text, "lxml")  # parse the Ajax response with BeautifulSoup

            for journal_label in bs.select("#page1 > div > dl > dd > a"):  # locate each issue tag
                if match := re.search("[0-9]{6}", journal_label["id"]):  # extract the issue name from an id such as "yq201908"
                    journal_name = match.group()
                    issue_list.append({
                        "year": journal_name[0:4],
                        "issue": journal_name[4:]
                    })

        return issue_list
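
A minimal usage sketch for this method (the class name and the pcode/pykm values below are hypothetical; it assumes tool.do_request performs a plain HTTP GET):

spider = JournalIssueSpider()  # hypothetical class that defines running()
issues = spider.running(pcode="CJFD", pykm="JSJX")  # hypothetical CNKI IDs
# e.g. [{"year": "2019", "issue": "08"}, ...]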
Code Example #2
File: Acfun_Video.py Project: xfyer/CxSpider
    def running(self, page_url) -> List[Dict]:
        # Fetch the basic video information
        response = tool.do_request(page_url, headers=self._HEADERS).text
        title = re.search(r"(?<=<title>)[^<]+(?=</title>)", response).group()  # extract the video title
        video_id = re.search(r"(?<=\"vid\":\")\d+(?=\",)", response).group()  # extract the video ID
        resource_id = re.search(r"(?<=\"ac\":\")\d+(?=\",)", response).group()  # extract the video resource ID

        # Fetch the video download URL
        rep_info = tool.do_request(self._DOWNLOAD_URL.format(video_id, resource_id), headers=self._HEADERS)
        video_url = rep_info.json()["playInfo"]["streams"][0]["playUrls"][0]

        # Return the result
        return [{
            "Title": title,
            "Download Url": video_url
        }]
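
The extractions above rely on fixed-width lookbehind/lookahead, so .group() returns only the value between the anchors. A standalone check of the same patterns against made-up page text:

import re

sample = '<title>Demo Video</title> ... "vid":"123456","ac":"7890",'
print(re.search(r"(?<=<title>)[^<]+(?=</title>)", sample).group())  # Demo Video
print(re.search(r"(?<=\"vid\":\")\d+(?=\",)", sample).group())      # 123456
print(re.search(r"(?<=\"ac\":\")\d+(?=\",)", sample).group())       # 7890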
Code Example #3
    def running(self, start_date, end_date) -> List[Dict]:
        result = []

        # Build the full list of dates to crawl
        all_date_list = list()  # dates to fetch, newest first
        curr_date = end_date
        while curr_date >= start_date:
            all_date_list.append(curr_date.strftime("%Y%m%d"))
            curr_date += datetime.timedelta(days=-1)
        # print("Total dates to crawl:", len(all_date_list), all_date_list)

        if len(all_date_list) == 0:  # if there is nothing to crawl, stop here
            return result

        # Build the list of dates that actually need to be requested
        need_date_list = list()  # request dates (every Monday, plus the earliest day)
        for curr_date_str in all_date_list:
            curr_date = datetime.datetime.strptime(curr_date_str, "%Y%m%d")
            if curr_date.weekday() == 0:  # the date is a Monday
                need_date_list.append(curr_date_str)
        need_date_list.append(all_date_list[-1])  # add the earliest day
        # print("Total request dates:", len(need_date_list))

        # Format the bounds as strings for easy comparison
        end_date = end_date.strftime("%Y%m%d")
        start_date = start_date.strftime("%Y%m%d")

        # Crawl the match data for each request date
        for i in range(len(need_date_list)):
            curr_date_str = need_date_list[i]
            # print("Requesting:", i + 1, "/", len(need_date_list), "(", curr_date_str, ")")
            curr_date_timestamp = str((datetime.datetime.strptime(curr_date_str, "%Y%m%d") - datetime.datetime(1970, 1, 1)).total_seconds())
            self._DATE_LIST_DATA["time"] = curr_date_timestamp  # form data of the list request
            response = tool.do_request(self._DATE_LIST_URL, method="post", headers=self._DATE_LIST_HEADERS, data=self._DATE_LIST_DATA)
            # print(response.content)
            if response.status_code == 200:
                response_json = json.loads(response.content.decode())
                # print(response_json)
                for curr_date_str, date_info in response_json["data"]["scheduleList"].items():
                    if int(end_date) >= int(curr_date_str) >= int(start_date):
                        if date_info["list"]:
                            for match in date_info["list"]:
                                result.append({
                                    "date": curr_date_str,
                                    "race_id": match["scheduleid"],
                                    "team_a_name": match["oneseedname"],
                                    "team_b_name": match["twoseedname"],
                                    "start_time": match["starttime"],
                                    "team_a_score": match["onewin"],
                                    "team_b_score": match["twowin"],
                                    "contest_name": match["ename"],
                                    "match_name": match["groupname"],
                                    "team_a_score_per": match["oneScore"],
                                    "team_b_score_per": match["twoScore"],
                                })
            time.sleep(5)

        return result
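
The "time" value sent in the form is seconds since 1970-01-01 computed from naive datetimes, which effectively treats the parsed date as UTC. A standalone check of the arithmetic:

import datetime

d = datetime.datetime.strptime("20200101", "%Y%m%d")
print((d - datetime.datetime(1970, 1, 1)).total_seconds())  # 1577836800.0
# equivalent: d.replace(tzinfo=datetime.timezone.utc).timestamp()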
Code Example #4
    def running(self, race_id):
        match_id_list = []
        response = tool.do_request(self._RACE_LIST_URL % race_id,
                                   headers=self._RACE_LIST_HEADERS)
        bs = BeautifulSoup(response.content.decode(), "lxml")
        game_labels = bs.select(
            "body > div > div.content > div.left > div:nth-child(1) > div > a")
        for game_label in game_labels:
            if game_label.has_attr("data-matchid"):
                match_id_list.append(game_label["data-matchid"])
        return [{"race_id": race_id, "match_id_list": match_id_list}]
Code Example #5
def spider():
    # Connect to the MySQL database
    mysql = tool.db.MySQL(host=MYSQL_HOST,
                          database=MYSQL_DATABASE,
                          user=MYSQL_USER,
                          password=MYSQL_PASSWORD)

    # Iterate over every year
    for year, page_num in YEAR_PAGE_NUM_LIST:

        # Iterate over every page of the year
        for page in range(1, page_num + 1):

            print("Crawling:", year, "-", page)

            data_list = []

            # Build the URL parameters of the request
            url_params = {
                "t": 3,  # t=3 : events that have finished
                "year": year,
                "page": page
            }

            # Execute the request
            response = tool.do_request(EVENT_LIST_PAGE_URL, params=url_params)

            # Parse the response
            lxml = BeautifulSoup(response.text, "lxml")

            # Iterate over the outer tag of every event
            for label in lxml.select(
                    "#info > div.left-slide > div.left-box > div.event-list > ul > li"
            ):
                wanplus_event_id = re.search(
                    "[0-9]+",
                    label.select_one("li > a")["href"]).group()  # WanPlus event ID
                event_name = label.select_one("li > a > span").text  # event name

                time_frame = label.select_one(
                    "li > a > p:nth-child(3)").text  # time range
                start_date, end_date = time_frame.split(" — ")  # event start date, event end date

                data_list.append({
                    "event_name": event_name,
                    "start_date": start_date,
                    "end_date": end_date,
                    "wanplus_event_id": wanplus_event_id
                })

            # Write the data to the MySQL database
            mysql.insert("event", data_list)

            time.sleep(3)
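
tool.db.MySQL is CxSpider's own wrapper; its insert("event", data_list) presumably expands each dict into one row. A rough sketch of such a helper on top of pymysql (an assumed shape, not the project's actual implementation):

import pymysql

def insert(connection, table, data_list):
    # Insert a list of dicts into `table`, one column per dict key
    if not data_list:
        return
    columns = list(data_list[0])
    sql = "INSERT INTO {} ({}) VALUES ({})".format(
        table, ", ".join(columns), ", ".join(["%s"] * len(columns)))
    with connection.cursor() as cursor:
        cursor.executemany(sql, [[row[c] for c in columns] for row in data_list])
    connection.commit()

# usage: insert(pymysql.connect(host=..., database=..., user=..., password=...), "event", data_list)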
Code Example #6
    def running(self, race_id, match_id):
        # Execute the per-game request
        self._MATCH_LIST_HEADERS["referer"] = self._MATCH_LIST_REFERER % race_id
        response = tool.do_request(self._MATCH_LIST_URL % match_id,
                                   headers=self._MATCH_LIST_HEADERS)
        return [{
            "race_id": race_id,
            "match_id": match_id,
            "match_detail": json.loads(response.content.decode())
        }]
Code Example #7
File: spider.py Project: SmallLowBee/CxSpider
    def running(self):
        response = tool.do_request("https://www.qidian.com/rank/collect?style=2&chn=-1&page=1", headers=self._HEADERS)

        # Parse the response
        bs = BeautifulSoup(response.content.decode(), "lxml")

        book_list = []
        for book_label in bs.select("#rank-view-list > div > table > tbody > tr"):
            book_list.append({
                "book_name": book_label.select_one("tr > td:nth-child(2)").text.replace("「", "").replace("」", "")
            })

        return book_list
Code Example #8
    def running(self, schedule_id, match_id):
        # ----- Execute the request -----
        response = tool.do_request(self._MATCH_URL % str(match_id),
                                   headers=self._MATCH_HEADERS)
        response_text = response.content.decode()

        # If the response is empty or not JSON-formatted, return immediately
        if not response_text or response_text[0] == "<":
            return

        response_json = json.loads(response_text)

        # If the request was rejected (as a request of unknown origin), return immediately
        if response_json["ret"] == -400:
            return

        # ----- Return the response text -----
        return response_text
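
The response_text[0] == "<" test is a heuristic for HTML error pages. An alternative guard that lets the JSON parser decide, sketched with json.JSONDecodeError:

import json

def parse_json_or_none(text):
    # Return the parsed JSON document, or None for empty or non-JSON responses
    if not text:
        return None
    try:
        return json.loads(text)
    except json.JSONDecodeError:  # e.g. an HTML error page instead of JSON
        return None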
Code Example #9
    def running(self):
        # Execute the page request
        response = tool.do_request("https://s.weibo.com/top/summary", headers=self._HEADERS)  # request the Weibo trending-search list
        bs = BeautifulSoup(response.content.decode(errors="ignore"), "lxml")

        # Parse the page
        hot_list = []
        empty_rank = 0  # count of empty (advertisement) trending entries
        for label_item in bs.select("#pl_top_realtimehot > table > tbody > tr"):  # iterate over the trending entries
            # Extract the trending rank
            if label_rank := label_item.select_one("tr > td.td-01"):
                if len(label_rank.text) == 0:
                    continue
                if match_rank := re.search("[0-9]+", label_rank.text):
                    ranking = int(match_rank.group()) - empty_rank
                else:
                    tool.console("Error", "The extracted trending rank contains no digits!")
                    continue
Code Example #10
    def running(self, journal, pcode, pykm, year, issue):
        """Collect the list of papers in the specified journal issue

        :param journal: <str> journal name
        :param pcode: <str> CNKI database the journal belongs to
        :param pykm: <str> CNKI journal ID (primary ID)
        :param year: <int> year the issue belongs to
        :param issue: <int> ordinal of the issue within the year
        :return: <list:dict> list of papers in the specified issue
        """
        ajax_url = "http://navi.cnki.net/knavi/JournalDetail/GetArticleList?year={}&issue={}&pykm={}&pageIdx=0&pcode={}"
        ajax_url = ajax_url.format(year, issue, pykm, pcode)
        response = tool.do_request(ajax_url)  # request the Ajax page listing the papers
        html_text = response.content.decode(errors="ignore")
        if not html_text:  # decode() never returns None, so test for an empty string
            return None
        bs = BeautifulSoup(html_text, "lxml")  # parse the Ajax response with BeautifulSoup

        article_list = []  # list of papers to return
        now_column = None  # column (section) currently being processed

        label_wrapper = bs.select_one("html > body")
        if label_wrapper is None:
            return None
        for article_label in label_wrapper.children:  # iterate over papers and their sections
            if article_label.name == "dt":
                now_column = article_label.get_text()
            elif article_label.name == "dd":
                title = re.sub(r"\s", "", article_label.select_one("dd > span > a").text)  # read the paper title
                href = article_label.select_one("dd > span > a")["href"]  # read the paper link
                if match := re.search("(?<=dbCode=)[^&]+(?=&)", href):
                    db_code = match.group()  # extract the variable value from the paper link
                else:
                    continue
                if match := re.search("(?<=filename=)[^&]+(?=&)", href):
                    file_name = match.group()  # extract the variable value from the paper link
                else:
                    continue
                if match := re.search("(?<=tableName=)[^&]+(?=&)", href):
                    db_name = match.group()  # extract the variable value from the paper link
Code Example #11
    def running(self, mid):

        self.headers["referer"] = "https://space.bilibili.com/{}/video".format(mid)

        now_page = 1
        max_page = 2

        video_list = []
        while now_page <= max_page:
            print("Requesting page", now_page, "......")

            # Build the request parameters
            param_dict = {
                "mid": mid,
                "ps": 30,
                "tid": 0,
                "pn": now_page,  # fill the current page into the parameter list
                "keyword": "",
                "order": "pubdate",
                "jsonp": "jsonp",
            }
            response = tool.do_request(
                "https://api.bilibili.com/x/space/arc/search?" + urlencode(param_dict),
                headers=self._HEADERS)
            response_json = response.json()  # parse the response as JSON

            now_page += 1  # advance the page counter
            max_page = math.ceil(response_json["data"]["page"]["count"] / 30)  # total video count of the uploader (controls the number of pages)

            for video_item in response_json["data"]["list"]["vlist"]:  # iterate over the video entries
                video_list.append({
                    "video_title": video_item["title"],  # video title
                    "video_play": video_item["play"]  # play count
                })

            time.sleep(5)

        return video_list
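
The loop seeds max_page = 2 only so the first iteration runs; the real page count is then recomputed from the response as ceil(count / ps). For example:

import math

print(math.ceil(61 / 30))  # an uploader with 61 videos spans 3 pages of 30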
Code Example #12
File: spider.py Project: SmallLowBee/CxSpider
    def running(self, params_item):
        response = tool.do_request(
            url="https://qt.qq.com/lua/mlol_battle_info/get_total_tier_rank_list",
            params={
                "area_id": str(params_item[0]),
                "offset": str(params_item[1]),
                "sign": str(params_item[2])
            },
            verify=False)
        summoner_json = response.json()

        if "data" not in summoner_json or "player_list" not in summoner_json[
                "data"]:
            return None

        summoner_list = []
        for summoner_item in summoner_json["data"]["player_list"]:
            if "tier_title" not in summoner_item:
                continue
            if "name" not in summoner_item:
                continue
            if "uuid" not in summoner_item:
                continue
            if "ranking" not in summoner_item:
                continue
            if "league_points" not in summoner_item:
                continue
            summoner_list.append({
                "tier": summoner_item["tier_title"],
                "name": summoner_item["name"],
                "uuid": summoner_item["uuid"],
                "area": 1,
                "ranking": summoner_item["ranking"],
                "points": summoner_item["league_points"]
            })
        return summoner_list
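
The run of key-membership checks can be collapsed into one test; a small helper in the same spirit (hypothetical, not part of CxSpider):

REQUIRED_KEYS = ("tier_title", "name", "uuid", "ranking", "league_points")

def has_required_keys(item, keys=REQUIRED_KEYS):
    # True if every required field is present in the player record
    return all(key in item for key in keys)

# inside the loop:
#     if not has_required_keys(summoner_item):
#         continue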
Code Example #13
    def running(self, skey, exploit_id, end_time, user_id):
        # Request the summoner's match record
        response = tool.do_request(
            url="https://mlol.qt.qq.com/gorpc/exploit/exploit/query_exploit_detail/proxy",
            params={
                "exploit_id": exploit_id,
                "game_area": "1",
                "scene": "tft_mlol",
                "user_id": user_id,
                "endtime": end_time,
                "plat": "android",
                "version": "9914"
            },
            headers=self._HEADERS,
            cookies={
                "l_uin": "o12345678",  # "o" + QQ number
                "p_uin": "o12345678",  # "o" + QQ number
                "uin": "o12345678",  # "o" + QQ number
                "skey": skey
            },  # login-state SKEY (captured with an emulator plus Fiddler; valid for roughly 24 hours)
            verify=False)

        exploit = response.json()

        if "info" not in exploit:
            print("Missing: info")
            return False
        if "exploit_detail" not in exploit["info"]:
            print("Missing: info - exploit_detail")
            return False
        if exploit["info"]["exploit_detail"] is None:
            print("Missing: info - exploit_detail - is None")
            return False
        if "exploit_id" not in exploit["info"]["exploit_detail"]:
            print("Missing: info - exploit_detail - exploit_id")
            return False
        if "end_time" not in exploit["info"]["exploit_detail"]:
            print("Missing: info - exploit_detail - end_time")
            return False
        if "game_match_type" not in exploit["info"]["exploit_detail"]:
            print("Missing: info - exploit_detail - game_match_type")
            return False
        if "duration" not in exploit["info"]["exploit_detail"]:
            print("Missing: info - exploit_detail - duration")
            return False
        if "specific_user_exploit" not in exploit["info"]["exploit_detail"]:
            print("Missing: info - exploit_detail - specific_user_exploit")
            return False
        if "buff_version" not in exploit["info"]["exploit_detail"][
                "specific_user_exploit"]:
            print(
                "Missing: info - exploit_detail - specific_user_exploit - buff_version"
            )
            return False
        if "member_exploit_list" not in exploit["info"]["exploit_detail"]:
            print("Missing: info - exploit_detail - member_exploit_list")
            return False

        member_list = [{} for _ in range(8)]
        member_info_list = [[] for _ in range(8)]

        for member_exploit in exploit["info"]["exploit_detail"][
                "member_exploit_list"]:
            if "user_id" not in member_exploit:
                print(
                    "Missing: info - exploit_detail - member_exploit_list - - user_id"
                )
                return False
            if "nickname" not in member_exploit:
                print(
                    "Missing: info - exploit_detail - member_exploit_list - - nickname"
                )
                return False
            if "ranking" not in member_exploit:
                print(
                    "Missing: info - exploit_detail - member_exploit_list - - ranking"
                )
                return False
            if "piece_list" not in member_exploit:
                print(
                    "Missing: info - exploit_detail - member_exploit_list - - piece_list"
                )
                return False
            if "game_rank_list" not in member_exploit:
                print(
                    "Missing: info - exploit_detail - member_exploit_list - - game_rank_list"
                )
                return False
            if "full_rank_title" not in member_exploit["game_rank_list"][0]:
                print(
                    "Missing: info - exploit_detail - member_exploit_list - - game_rank_list - full_rank_title"
                )
                return False

            ranking = int(member_exploit["ranking"])
            if ranking < 1 or ranking > 8:
                return False

            member_list[ranking - 1] = {
                "user_id": member_exploit["user_id"],
                "nickname": member_exploit["nickname"],
                "rank_title": member_exploit["game_rank_list"][0]["full_rank_title"]
            }

            if member_exploit["piece_list"] is None:
                print(
                    "Missing: info - exploit_detail - member_exploit_list - - piece_list - is None"
                )
                continue

            for piece in member_exploit["piece_list"]:
                if "star_num" not in piece:
                    print(
                        "Missing: info - exploit_detail - member_exploit_list - - piece_list - - star_num"
                    )
                    return False
                if "hero_name" not in piece:
                    print(
                        "Missing: info - exploit_detail - member_exploit_list - - piece_list - - hero_name"
                    )
                    return False
                member_info_list[ranking - 1].append(
                    [piece["hero_name"], piece["star_num"]])

        exploit_write = list()
        exploit_write.append({
            "exploit_id": exploit["info"]["exploit_detail"]["exploit_id"],
            "end_time": exploit["info"]["exploit_detail"]["end_time"],
            "game_match_type": exploit["info"]["exploit_detail"]["game_match_type"],
            "duration": exploit["info"]["exploit_detail"]["duration"],
            "version": exploit["info"]["exploit_detail"]["specific_user_exploit"]["buff_version"],
            "member_list": json.dumps(member_list, ensure_ascii=False),
            "member_1": json.dumps(member_info_list[0], ensure_ascii=False),
            "member_2": json.dumps(member_info_list[1], ensure_ascii=False),
            "member_3": json.dumps(member_info_list[2], ensure_ascii=False),
            "member_4": json.dumps(member_info_list[3], ensure_ascii=False),
            "member_5": json.dumps(member_info_list[4], ensure_ascii=False),
            "member_6": json.dumps(member_info_list[5], ensure_ascii=False),
            "member_7": json.dumps(member_info_list[6], ensure_ascii=False),
            "member_8": json.dumps(member_info_list[7], ensure_ascii=False)
        })

        return exploit_write
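
Every Missing: ... branch above walks the same nested path one level further; a hedged sketch of a path-based lookup that reports the first missing segment (a hypothetical helper, not in the original project):

def get_path(doc, *path):
    # Walk nested dicts; return (value, None) on success or (None, error message)
    node = doc
    seen = []
    for key in path:
        seen.append(str(key))
        if not isinstance(node, dict) or key not in node or node[key] is None:
            return None, "Missing: " + " - ".join(seen)
        node = node[key]
    return node, None

# usage:
#     detail, err = get_path(exploit, "info", "exploit_detail")
#     if err:
#         print(err)
#         return False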
Code Example #14
    def running(self, skey, summoner, start_time, end_time) -> List[Dict]:
        params = {
            "user_id": summoner,
            "scene": "tft_mlol",
            "plat": "android",
            "version": "9914",
            "game_area": "1",
            "login_account_type": "1"
        }

        next_baton = None
        exploit_list = []

        # Execute the requests
        for i in range(100):

            print("Executing request", i + 1, "...")

            if next_baton:
                params["baton"] = next_baton

            # Request the summoner's match record list (cookies are required, p_skey is not;
            # the baton parameter controls paging)
            response = tool.do_request(
                url="https://mlol.qt.qq.com/gorpc/exploit/exploit/query_player_exploit_list/proxy",
                params=params,
                headers=self._HEADERS,
                cookies={
                    "l_uin": "o13578660",
                    "p_uin": "o13578660",
                    "uin": "o13578660",
                    "skey": skey
                },
                verify=False)
            exploit_json = response.json()

            if "info" not in exploit_json:
                break
            if "exploit_list" not in exploit_json["info"]:
                break
            if "next_baton" not in exploit_json["info"]:
                break
            next_baton = exploit_json["info"]["next_baton"]

            if exploit_json["info"]["exploit_list"] is None:
                break

            for exploit_item in exploit_json["info"]["exploit_list"]:
                if "exploit_id" not in exploit_item:
                    continue
                if "end_time" not in exploit_item:
                    continue
                if "game_match_type" not in exploit_item:
                    continue
                if "specific_user_exploit" not in exploit_item:
                    continue
                if "user_id" not in exploit_item["specific_user_exploit"]:
                    continue
                if exploit_item["end_time"] < start_time:
                    break
                if exploit_item["end_time"] < end_time:
                    exploit_list.append({
                        "exploit_id": exploit_item["exploit_id"],
                        "end_time": exploit_item["end_time"],
                        "user_id": exploit_item["specific_user_exploit"]["user_id"],
                        "game_match_type": exploit_item["game_match_type"]
                    })

        return exploit_list
Code Example #15
def spider():
    # Connect to the MySQL database
    mysql = tool.db.MySQL(host=MYSQL_HOST,
                          database=MYSQL_DATABASE,
                          user=MYSQL_USER,
                          password=MYSQL_PASSWORD)

    # Read the event list from the database
    event_list = mysql.select(
        "event",
        columns=["event_id", "event_name", "wanplus_event_id"],
        where="event_id>=251")

    # Iterate over every event
    for event_id, event_name, wanplus_event_id in event_list:
        print("Crawling:", event_id, "-", event_name, "-", wanplus_event_id)

        # ----- Collect the stage list of the event -----
        # Execute the request
        response = tool.do_request(EVENT_INFO_URL.format(wanplus_event_id))
        time.sleep(5)

        # Parse the response
        lxml = BeautifulSoup(response.text, "lxml")

        # Extract all stage IDs and names
        stage_id_list = []
        stage_name_list = []
        for label in lxml.select("#event_stage > li"):
            stage_id_list.append(int(label["data-stageid"]))
            stage_name_list.append(label.text.replace("\n", "").lstrip())

        # Write the stage information to the database
        data_list_1 = []
        for i in range(len(stage_id_list)):
            data_list_1.append({
                "event_id": event_id,
                "stage_name": stage_name_list[i],
                "wanplus_stage_id": stage_id_list[i]
            })
        mysql.insert("stage", data_list_1)

        # ----- Iterate over the matches contained in each stage -----
        for stage_id in stage_id_list:
            print("Crawling stage:", stage_id)

            data_list_2 = []

            # Build the URL parameters of the request
            url_params = {
                "_gtk": 868258461,  # can be treated as a constant
                "eId": wanplus_event_id,
                "stageId": stage_id,
                "gameType": 2  # can be treated as a constant
            }

            # Execute the request
            response = tool.do_request(STAGE_INFO_URL,
                                       method="POST",
                                       data=url_params)
            time.sleep(3)

            # Parse the response
            lxml = BeautifulSoup(response.text, "lxml")

            # Extract the schedule IDs of the stage
            for label in lxml.select("a"):
                wanplus_schedule_id = int(
                    re.search("[0-9]+", label["href"]).group())

                data_list_2.append({
                    "event_id": event_id,
                    "stage_id": stage_id,
                    "wanplus_schedule_id": wanplus_schedule_id
                })

            # Write the data to the MySQL database
            mysql.insert("schedule", data_list_2)
Code Example #16
    def running(self):
        movie_list = []

        for page_num in range(10):
            url = "https://movie.douban.com/top250?start={0}&filter=".format(
                page_num * 25)

            response = tool.do_request(url, headers=self._HEADERS)
            bs = BeautifulSoup(response.content.decode(errors="ignore"), "lxml")

            for movie_label in bs.select(
                    "#content > div > div.article > ol > li"):  # locate each movie tag
                # Movie link (the href attribute of the <a> tag)
                url = movie_label.select_one("li > div > div.pic > a")["href"]

                # Parse the title line
                title_text = movie_label.select_one(
                    "li > div > div.info > div.hd > a").text.replace(
                        "\n", "")  # title line text with newlines removed
                title_chinese = title_text.split("/")[0].strip()  # Chinese title, whitespace stripped
                title_other = [
                    title.strip() for title in title_text.split("/")[1:]
                ]  # other titles, whitespace stripped

                # Parse the info block (cast lists are mostly truncated, so lead actors are not parsed)
                info_text = movie_label.select_one(
                    "li > div > div.info > div.bd > p:nth-child(1)").text  # info block text
                info_text = re.sub("\n *", "\n", info_text)  # strip extra spaces at the start of each line
                info_text = re.sub("^\n", "", info_text)  # drop the leading blank line
                info_line_1, info_line_2 = info_text.split("\n")[0:2]  # line 1: director and leads; line 2: year, country, genres
                director = re.sub(" *(主演|主\\.{3}|\\.{3}).*$", "", info_line_1)  # keep only the director part
                year = int(re.search("[0-9]+", info_line_2.split("/")[0]).group())  # movie year as an integer
                country = info_line_2.split("/")[1].strip() if len(
                    info_line_2.split("/")) >= 2 else None  # movie country
                classify = info_line_2.split("/")[2].strip() if len(
                    info_line_2.split("/")) >= 3 else None  # movie genres
                classify = re.split(" +", classify) if classify else None  # genres as a list (classify may be None)

                # Parse the rating
                rating_num = movie_label.select_one(
                    "li > div > div.info > div.bd > div > span.rating_num").text
                rating_num = float(re.search("[0-9.]+", rating_num).group())  # rating as a float

                # Parse the number of raters
                rating_people = movie_label.select_one(
                    "li > div > div.info > div.bd > div > span:nth-child(4)").text
                rating_people = int(re.search("[0-9]+", rating_people).group())  # rater count as an integer

                # Parse the quote (this tag may be absent)
                if quote_label := movie_label.select_one(
                        "li > div > div.info > div.bd > p.quote"):
                    quote = quote_label.text.replace("\n", "")  # quote with newlines removed
                else:
                    quote = None

                movie_list.append({
                    "url": url,
                    "title_chinese": title_chinese,
                    "title_others": title_other,
                    "director": director,
                    "year": year,
                    "country": country,
                    "classify": classify,
                    "rating_num": rating_num,
                    "rating_people": rating_people,
                    "quote": quote
                })

            time.sleep(5)

        return movie_list
Code Example #17
    def running(self, start_date, end_date) -> List[Dict]:
        # Initialize the result list
        result = []

        # ----- Build the full list of dates to crawl -----
        all_date_list = []  # dates to fetch, newest first
        curr_date = end_date
        while curr_date >= start_date:
            all_date_list.append(curr_date.strftime("%Y%m%d"))
            curr_date += timedelta(days=-1)

        # If there is nothing to crawl, finish immediately
        if len(all_date_list) == 0:
            return result

        # ----- Build the list of dates that actually need to be requested -----
        # Each request returns data for all 7 days of the week containing the requested
        # date, so it is enough to request one day per week: the Monday of every week,
        # plus the latest day of the final week if that week has not reached a Monday yet.
        need_date_list = []
        # If the latest date is not a Monday, add it to cover the final partial week
        if datetime.strptime(all_date_list[0], "%Y%m%d").weekday() != 0:
            need_date_list.append(all_date_list[0])
        # Add the Monday of every earlier week
        for curr_date in all_date_list:
            if datetime.strptime(curr_date, "%Y%m%d").weekday() == 0:
                need_date_list.append(curr_date)

        # ----- Crawl the match data for each request date -----
        for i in range(len(need_date_list)):

            print("Crawling:", self.format_date(need_date_list[i]), "(", i + 1,
                  "/", len(need_date_list), ")")
            curr_date = need_date_list[i]

            # Compute the request parameters and execute the request
            curr_date_timestamp = str((datetime.strptime(curr_date, "%Y%m%d") -
                                       datetime(1970, 1, 1)).total_seconds())
            self._DATE_LIST_DATA["time"] = curr_date_timestamp
            response = tool.do_request(self._DATE_LIST_URL,
                                       method="post",
                                       headers=self._DATE_LIST_HEADERS,
                                       data=self._DATE_LIST_DATA)

            if response.status_code != 200:
                print("Request failed!")
                continue

            # Parse the response
            response_json = json.loads(response.content.decode())
            for curr_date, date_info in response_json["data"][
                    "scheduleList"].items():  # iterate over each day of the week
                print("Crawling date:", self.format_date(curr_date))
                if int(start_date.strftime("%Y%m%d")) <= int(curr_date) <= int(
                        end_date.strftime("%Y%m%d")):
                    if date_info["list"]:  # check whether the day has any matches
                        for match in date_info["list"]:  # iterate over each match of the day
                            result.append({
                                "schedule_id": int(match["scheduleid"]),  # match ID (a full BO1/BO3/BO5 series counts as one match)
                                "date": self.format_date(curr_date),  # match date
                                "time": match["starttime"],  # match start time
                                "event_id": int(match["eid"]),  # event ID
                                "event_name": match["ename"],  # event name
                                "event_group_name": match["groupname"],  # event stage
                                "stage_id": int(match["stageid"]),  # suspected event ID
                                "bo_num": int(match["bonum"]),  # series length: BO1=1, BO3=3, BO5=5
                                "team_a_id": int(match["oneseedid"]),  # team A ID
                                "team_a_name": match["oneseedname"],  # team A name
                                "team_b_id": int(match["twoseedid"]),  # team B ID
                                "team_b_name": match["twoseedname"],  # team B name
                                "team_a_win": int(match["onewin"]),  # games won by team A
                                "team_b_win": int(match["twowin"]),  # games won by team B
                                "team_a_score": str(match["oneScore"]),  # team A per-game scores
                                "team_b_score": str(match["twoScore"]),  # team B per-game scores
                            })

            # Throttle the requests
            time.sleep(5)

        return result
Code Example #18
    def running(self, schedule_id: int):
        result = []  # initialize the result list

        # ----- Compute the request parameters and execute the request -----
        response = tool.do_request(self._RACE_LIST_URL % str(schedule_id),
                                   headers=self._RACE_LIST_HEADERS)

        if response.status_code != 200:
            print("Request failed!")
            return result

        bs = BeautifulSoup(response.content.decode(), "lxml")

        # ----- Check whether the response is valid -----
        label = bs.select_one("body")
        # If the match has not been played yet, bail out
        if label.has_attr("class") and "matchbf" in label["class"]:
            return
        # If the page was not found, bail out
        if label.has_attr("class") and "mess_html" in label["class"]:
            return

        # ----- Parse the basic match information -----
        # Parse the event ID and event name
        selector = "body > div.body-inner > div.content > div.left > div:nth-child(1) > h1 > a"
        label = bs.select_one(selector)
        event_id = int(re.search("[0-9]+", label["href"]).group())  # event ID
        event_name = label.text  # event name

        # Parse the match date and time
        selector = "body > div.body-inner > div.content > div.left > div:nth-child(1) > ul > li:nth-child(2) > span.time"
        label = bs.select_one(selector)
        if " " in label.text:
            schedule_date, schedule_time = label.text.split(" ")[0:2]  # match date, match time
        else:
            schedule_date, schedule_time = "", ""

        # Parse both teams' information
        selector = "body > div.body-inner > div.content > div.left > div:nth-child(1) > ul > li.team-left > a"
        label = bs.select_one(selector)
        team_a_id = int(re.search("[0-9]+", label["href"]).group())  # team A ID
        team_a_name = label.text.replace("\n", "")  # team A name
        selector = "body > div.body-inner > div.content > div.left > div:nth-child(1) > ul > li.team-right.tr > a"
        label = bs.select_one(selector)
        team_b_id = int(re.search("[0-9]+", label["href"]).group())  # team B ID
        team_b_name = label.text.replace("\n", "")  # team B name

        # Parse the score and the series format
        selector = "body > div.body-inner > div.content > div.left > div:nth-child(1) > ul > li:nth-child(2) > p"
        marks = bs.select_one(selector).text.split(":")
        team_a_win, team_b_win = int(marks[0]), int(marks[1])  # games won by team A and team B
        schedule_bo_num = max(team_a_win, team_b_win)  # series format

        # ----- Parse the per-game information -----
        game_labels = bs.select(
            "body > div > div.content > div.left > div:nth-child(1) > div > a")
        for game_label in game_labels:
            if game_label.has_attr("data-matchid"):
                result.append({
                    "match_id": game_label["data-matchid"],  # game ID
                    "schedule_id": schedule_id,  # match ID
                    "schedule_date": schedule_date,
                    "schedule_time": schedule_time,
                    "schedule_bo_num": schedule_bo_num,
                    "event_id": event_id,
                    "event_name": event_name,
                    "team_a_id": team_a_id,
                    "team_a_name": team_a_name,
                    "team_b_id": team_b_id,
                    "team_b_name": team_b_name,
                    "team_a_win": team_a_win,
                    "team_b_win": team_b_win
                })

        # Output the results
        self.output(result)