Example #1
    def save_shaped_race_info(self,
                              shaped_race_info_list: List[ShapedRaceData]):
        # Convert the shaped race data into a DataFrame
        df_data = self.formatter.data_to_df(shaped_race_info_list)
        if df_data.shape[0] == 0:
            logger.info("no data to save.")
            return
        current_date_ymd = self.current_datetime.strftime("%Y-%m-%d")
        current_time = self.current_datetime.time().strftime("%H%M%S")
        # TODO: move the credentials path out of the code (e.g. into configuration)
        os.environ[
            'GOOGLE_APPLICATION_CREDENTIALS'] = r'/Users/daikimiyazaki/.config/pndnism-project-fc40cb799b41.json'
        os.makedirs(
            f"./horse_info_crawler/race/data/race_histories/{current_date_ymd}",
            exist_ok=True)

        client = storage.Client()
        bucket = client.get_bucket('pndnism_horse_data')
        save_path = f"./horse_info_crawler/race/data/race_histories/{current_date_ymd}/shaped_race_history_{current_time}.csv"
        cs_save_path = f"race/data/race_histories/{current_date_ymd}/shaped_race_history_{current_time}.csv"
        # Save the DataFrame locally as a CSV file
        df_data.to_csv(save_path, index=False)
        # Upload the same CSV content to Cloud Storage
        bucket.blob(cs_save_path).upload_from_string(
            df_data.to_csv(index=False), 'text/csv')
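Since the DataFrame has already been written to save_path, the Cloud Storage upload could equally reuse that local file instead of serializing the frame a second time. A minimal sketch, assuming the bucket and paths defined in the example above:

    blob = bucket.blob(cs_save_path)
    # Sketch only: upload the CSV file that was just written locally
    # (the example above uploads the serialized string instead).
    blob.upload_from_filename(save_path, content_type="text/csv")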
Example #2
 def get(self, listing_page_url: str) -> ListingPage:
     # Convert listing_page_url to an absolute URL if it is a relative path
     listing_page_absolute_url = urllib.parse.urljoin(
         NETKEIBA_BASE_URL, listing_page_url)
     logger.info(f"Accessing {listing_page_absolute_url}.")
     response = requests.get(listing_page_absolute_url)
     response.raise_for_status()
     return self.parser.parse(response.content)
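What makes this getter tolerant of both relative and absolute inputs is urllib.parse.urljoin. A standalone sketch of that behaviour; the NETKEIBA_BASE_URL value below is an assumption for illustration only:

    import urllib.parse

    NETKEIBA_BASE_URL = "https://db.netkeiba.com/"  # assumed value, for illustration

    # A relative path is resolved against the base URL ...
    print(urllib.parse.urljoin(NETKEIBA_BASE_URL, "/race/list/20210101/"))
    # -> https://db.netkeiba.com/race/list/20210101/

    # ... while an already absolute URL passes through unchanged.
    print(urllib.parse.urljoin(NETKEIBA_BASE_URL, "https://db.netkeiba.com/horse/2017101234/"))
    # -> https://db.netkeiba.com/horse/2017101234/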
Example #3
    def exec(self, crawl_limit: Optional[int] = None):
        logger.info(f'Start crawl_race_histories. crawl_limit: {crawl_limit}')
        race_histories = self._crawl_race_histories(crawl_limit)

        # Convert race_histories to CSV and save them locally
        self.race_info_repository.save_shaped_race_info(
            self._shape_race_infos(race_histories))
        logger.info("End crawl_race_histories.")
Example #4
 def get(self, race_info_page_url: str) -> RaceInfo:
     # Convert race_info_page_url to an absolute URL if it is a relative path
     race_info_page_absolute_url = urllib.parse.urljoin(
         NETKEIBA_BASE_URL, race_info_page_url)
     logger.info(f"Accessing {race_info_page_absolute_url}.")
     response = requests.get(race_info_page_absolute_url)
     response.raise_for_status()
     return self.parser.parse(response.content)
Example #5
    def exec(self, crawl_limit: Optional[int] = None):
        logger.info(f'Start crawl_horse_info. crawl_limit: {crawl_limit}')
        horse_info = self._crawl_horse_info(crawl_limit)

        # Convert horse_info to CSV and save it locally
        self.horse_info_repository.save_shaped_horse_info(
            self._shape_horse_infos(horse_info))
        logger.info("End crawl_horse_info.")
Example #6
    def _shape_race_infos(
            self, race_info_list: List[RaceInfo]) -> List[ShapedRaceData]:
        shaped_race_info_list = []
        for race_info in race_info_list:
            try:
                # If an error occurs, skip the corresponding RaceInfo
                shaped_race_info_list.append(self._shape_race_info(race_info))
            except UnsupportedFormatError as e:
                logger.info(f"Skip getting race info:{e}")
                # TODO: implement error reporting, e.g. with Sentry
            except InvalidFormatError as e:
                logger.warning(f"Skip getting race info:{e}")
                # TODO: implement error reporting, e.g. with Sentry

        return shaped_race_info_list
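The UnsupportedFormatError and InvalidFormatError classes that drive the skip logic are not shown on this page. A minimal sketch of how they could be declared, assuming they are plain application-specific exceptions:

    # Sketch only: the actual project may define these differently.
    class UnsupportedFormatError(Exception):
        """Raised when a page uses a layout the shaper does not support."""

    class InvalidFormatError(Exception):
        """Raised when a page has the expected layout but contains invalid values."""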
Example #7
    def _shape_horse_infos(
            self, horse_info_list: List[HorseInfo]) -> List[ShapedHorseInfo]:
        shaped_horse_info_list = []
        for horse_info in horse_info_list:
            try:
                # If an error occurs, skip the corresponding HorseInfo
                shaped_horse_info_list.append(
                    self._shape_horse_info(horse_info))
            except UnsupportedFormatError as e:
                logger.info(f"Skip getting horse info:{e}")
                # TODO: implement error reporting, e.g. with Sentry
            except InvalidFormatError as e:
                logger.warning(f"Skip getting horse info:{e}")
                # TODO: implement error reporting, e.g. with Sentry

        return shaped_horse_info_list
Example #8
    def _crawl_horse_info(self,
                          crawl_limit: Optional[int] = None
                          ) -> List[HorseInfo]:
        horse_info = []
        crawl_end_flg = False
        crawled_urls = self._check_crawled_urls()
        # Crawl the listing pages to collect the horse detail page URLs
        listing_page_url = self.horse_info_listing_page_scraper.LISTING_PAGE_START_URLS
        count = 0
        while listing_page_url:
            listing_page = self.horse_info_listing_page_scraper.get(
                listing_page_url)
            for_log = listing_page_url[:20] + "~" + listing_page_url[-20:]
            logger.info(
                f"listing_page_url: {for_log}, horse_info_page_urls count: {len(listing_page.horse_info_page_urls)}"
            )

            # Access each horse detail page and fetch the horse data
            for horse_info_page_url in listing_page.horse_info_page_urls:
                # Build the data structure that will be uploaded as CSV
                # If an error occurs, skip the corresponding horse
                count += 1
                if NETKEIBA_BASE_URL[:-1] + horse_info_page_url in crawled_urls:
                    logger.info("already crawled. skip...")
                    crawl_end_flg = True
                    break
                try:
                    # Fetch the detail page once and reuse the result
                    horse_info_detail = self._get_horse_info(horse_info_page_url)
                    if horse_info_detail:
                        horse_info.append(horse_info_detail)
                    else:
                        raise DetailPageNotFoundError("table not found.")
                except DetailPageNotFoundError as e:
                    logger.warning(f"Skip getting horse:{e}")
                    # TODO: add an error monitoring tool such as Sentry

                # if count == 100:
                #     logger.info("10sec crawler idling... ")
                #     time.sleep(10)
                #     count = 0
                if crawl_limit and len(horse_info) >= crawl_limit:
                    # Stop crawling once crawl_limit entries have been collected
                    logger.info(
                        f"Finish crawl. horse_histories count: {len(horse_info)}"
                    )
                    return horse_info

            # If next_page_url exists, move on to the next page
            if crawl_end_flg:
                break
            logger.info(f"next_page_url: {listing_page.next_page_url}")
            listing_page_url = listing_page.next_page_url

        logger.info(f"Finish crawl. horse_histories count: {len(horse_info)}")
        return horse_info
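Stripped of the project-specific details, the loop above is a standard follow-the-next-page pagination crawl with an optional item limit. A compact sketch with hypothetical names (scraper, detail_urls, get_detail are assumptions, not part of the example):

    def crawl_all_pages(scraper, start_url, limit=None):
        # Hypothetical helper illustrating the pagination pattern used above.
        items = []
        page_url = start_url
        while page_url:
            page = scraper.get(page_url)            # fetch and parse one listing page
            for detail_url in page.detail_urls:     # visit each detail link on it
                items.append(scraper.get_detail(detail_url))
                if limit and len(items) >= limit:   # stop early once the limit is reached
                    return items
            page_url = page.next_page_url           # a falsy next_page_url ends the loop
        return items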
Example #9
 def get(self, horse_info_page_url: str) -> HorseInfo:
     #args = ['sudo', 'service', 'tor','restart']
     #subprocess.call(args)
     #socks.set_default_proxy(socks.PROXY_TYPE_SOCKS5, '127.0.0.1', 9050)
     #socket.socket = socks.socksocket
     #proxies = {
     #'http':'socks5://127.0.0.1:9050',
     #'https':'socks5://127.0.0.1:9050'
     #}
     # Convert horse_info_page_url to an absolute URL if it is a relative path
     horse_info_page_absolute_url = urllib.parse.urljoin(
         NETKEIBA_BASE_URL, horse_info_page_url)
     logger.info(f"Accessing {horse_info_page_absolute_url}.")
     response = requests.get(horse_info_page_absolute_url)
     response.raise_for_status()
     return self.parser.parse(response.content,
                              horse_info_page_absolute_url)
Example #10
    def _crawl_race_histories(self,
                              crawl_limit: Optional[int] = None
                              ) -> List[RaceInfo]:
        race_histories = []
        crawl_end_flg = False
        crawled_urls = self._check_crawled_urls()
        # Crawl the listing pages to collect the race detail page URLs
        listing_page_url = self.race_info_listing_page_scraper.LISTING_PAGE_START_URLS
        while listing_page_url:
            listing_page = self.race_info_listing_page_scraper.get(
                listing_page_url)
            for_log = listing_page_url[:20] + "~" + listing_page_url[-20:]
            logger.info(
                f"listing_page_url: {for_log}, race_info_page_urls count: {len(listing_page.race_info_page_urls)}"
            )

            # Access each race detail page and fetch the race data
            for race_info_page_url in listing_page.race_info_page_urls:
                # Build the data structure that will be uploaded as CSV
                # If an error occurs, skip the corresponding race

                if NETKEIBA_BASE_URL[:-1] + race_info_page_url in crawled_urls:
                    logger.info("already crawled. skip...")
                    #crawl_end_flg = True
                    #break
                try:
                    # Fetch the detail page once and reuse the result
                    race_info_detail = self._get_race_info(race_info_page_url)
                    if race_info_detail:
                        race_histories.append(race_info_detail)
                    else:
                        raise DetailPageNotFoundError("table not found.")
                except DetailPageNotFoundError as e:
                    logger.warning(f"Skip getting race:{e}")
                    # TODO: add an error monitoring tool such as Sentry

                if crawl_limit and len(race_histories) >= crawl_limit:
                    # Stop crawling once crawl_limit entries have been collected
                    logger.info(
                        f"Finish crawl. race_histories count: {len(race_histories)}"
                    )
                    return race_histories

            # If next_page_url exists, move on to the next page
            if crawl_end_flg:
                break
            logger.info(f"next_page_url: {listing_page.next_page_url}")
            listing_page_url = listing_page.next_page_url

        logger.info(
            f"Finish crawl. race_histories count: {len(race_histories)}")
        return race_histories
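The duplicate check depends on _check_crawled_urls, whose implementation is not included on this page. A hypothetical sketch, assuming it gathers the URLs already present in previously saved CSVs that contain a race_url column (the real method may work differently):

    import glob

    import pandas as pd

    def _check_crawled_urls_sketch(data_dir="./horse_info_crawler/race/data/race_histories"):
        # Hypothetical: collect URLs from every CSV saved under data_dir.
        crawled = set()
        for path in glob.glob(f"{data_dir}/*/*.csv"):
            crawled.update(pd.read_csv(path)["race_url"].dropna())
        return crawled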
Example #11
    def parse(self, html, url) -> HorseInfo:
        soup = BeautifulSoup(html, "lxml")
        # Try to locate the profile table in the statically served HTML
        if len(soup.find_all("table", summary="のプロフィール")) != 0:
            profile_table = soup.find_all("table", summary="のプロフィール")[0]
        else:
            # Fall back to headless Chrome when the profile table is not in the static HTML
            logger.info("crawling by selenium...")
            options = webdriver.ChromeOptions()
            options.add_argument('--headless')
            driver = webdriver.Chrome(
                options=options,
                executable_path=
                '/Users/daikimiyazaki/workspace/pndnism/horse_race_prediction/horse_info_crawler/horse_info_crawler/components/chromedriver',
            )
            driver.get(url)
            content = driver.page_source
            soup = BeautifulSoup(content, "lxml")
            profile_table = soup.find_all("table", summary="のプロフィール")[0]

        # Map each profile table header (th) to its value (td)
        profile_dic = {}
        for i, j in zip(profile_table.find_all("th"),
                        profile_table.find_all("td")):
            profile_dic[i.text] = j.text

        # The pedigree block holds the parents and grandparents
        blood_table = soup.find_all("dd", class_="DB_ProfHead_dd_01")[0]
        return HorseInfo(
            horse_url=self._parse_horse_url(soup),
            name=self._parse_name(soup),
            birthday=self._parse_birthday(profile_dic),
            trainer_name=self._parse_trainer_name(profile_dic),
            owner_name=self._parse_owner_name(profile_dic),
            producer=self._parse_producer(profile_dic),
            origin_place=self._parse_origin_place(profile_dic),
            mother=self._parse_mother(blood_table),
            father=self._parse_father(blood_table),
            mother_of_father=self._parse_mother_of_father(blood_table),
            father_of_father=self._parse_father_of_father(blood_table),
            mother_of_mother=self._parse_mother_of_mother(blood_table),
            father_of_mother=self._parse_father_of_mother(blood_table))
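The webdriver.Chrome(executable_path=...) call matches Selenium 3; newer Selenium 4 releases drop the executable_path argument in favour of a Service object. A sketch under that assumption, with the chromedriver location left as a placeholder:

    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    # Selenium 4 style: the chromedriver path goes into a Service object.
    driver = webdriver.Chrome(service=Service("/path/to/chromedriver"), options=options)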