Example #1
    def get_changes(self) -> pd.DataFrame:
        logger.info("get sp500 history changes......")
        # NOTE: Wikipedia may change the position of the changes table on the page
        changes_df = pd.read_html(self.WIKISP500_CHANGES_URL)[-1]
        # keep the date column, the added-ticker column and the removed-ticker column
        changes_df = changes_df.iloc[:, [0, 1, 3]]
        changes_df.columns = [self.DATE_FIELD_NAME, self.ADD, self.REMOVE]
        changes_df[self.DATE_FIELD_NAME] = pd.to_datetime(
            changes_df[self.DATE_FIELD_NAME])
        _result = []
        for _type in [self.ADD, self.REMOVE]:
            _df = changes_df.copy()
            _df[self.CHANGE_TYPE_FIELD] = _type
            _df[self.SYMBOL_FIELD_NAME] = _df[_type]
            _df.dropna(subset=[self.SYMBOL_FIELD_NAME], inplace=True)
            if _type == self.ADD:
                # additions take effect on the announced trading date
                _df[self.DATE_FIELD_NAME] = _df[self.DATE_FIELD_NAME].apply(
                    lambda x: get_trading_date_by_shift(
                        self.calendar_list, x, 0))
            else:
                # removals are recorded on the previous trading date
                _df[self.DATE_FIELD_NAME] = _df[self.DATE_FIELD_NAME].apply(
                    lambda x: get_trading_date_by_shift(
                        self.calendar_list, x, -1))
            _result.append(_df[[
                self.DATE_FIELD_NAME, self.CHANGE_TYPE_FIELD,
                self.SYMBOL_FIELD_NAME
            ]])
        logger.info("end of get sp500 history changes.")
        return pd.concat(_result, sort=False)
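
Every example in this listing resolves announcement dates to trading days through a get_trading_date_by_shift helper. A minimal sketch of it, assuming calendar_list is a sorted list of pd.Timestamp trading days (the signature and the clamping behavior are assumptions, not the source's implementation):

    import bisect

    import pandas as pd

    def get_trading_date_by_shift(calendar_list, date, shift: int = 0):
        # assumed behavior: snap `date` to the first trading day >= date,
        # then move `shift` trading days forward (backward if negative)
        date = pd.Timestamp(date)
        _index = bisect.bisect_left(calendar_list, date)
        # clamp to the calendar bounds so shifts never fall off either end
        _index = min(max(_index + shift, 0), len(calendar_list) - 1)
        return calendar_list[_index]

With shift=0 this returns the effective trading day itself, shift=1 the next trading day, and shift=-1 the previous one, which matches how the examples use it.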
Example #2
    def get_changes_with_history_companies(
            self, history_companies: pd.DataFrame) -> pd.DataFrame:
        """get changes with history companies

        Parameters
        ----------
        history_companies : pd.DataFrame
            symbol        date
            SH600000   2020-11-11

            dtypes:
                symbol: str
                date: pd.Timestamp

        Returns
        -------
            pd.DataFrame:
                symbol      date        type
                SH600000  2019-11-11    add
                SH600000  2020-11-10    remove
            dtypes:
                symbol: str
                date: pd.Timestamp
                type: str, value from ["add", "remove"]

        """
        logger.info("parse changes from history companies......")
        last_code = []
        result_df_list = []
        _columns = [
            self.DATE_FIELD_NAME, self.SYMBOL_FIELD_NAME,
            self.CHANGE_TYPE_FIELD
        ]
        # trading dates are iterated from newest to oldest, so `last_code`
        # always holds the constituents of the next later trading date
        for _trading_date in tqdm(
                sorted(history_companies[self.DATE_FIELD_NAME].unique(),
                       reverse=True)):
            _current_code = history_companies[history_companies[
                self.DATE_FIELD_NAME] == _trading_date][
                    self.SYMBOL_FIELD_NAME].tolist()
            if last_code:
                # in the later snapshot but not in this one: added after this date
                add_code = list(set(last_code) - set(_current_code))
                # in this snapshot but not in the later one: removed after this date
                remove_code = list(set(_current_code) - set(last_code))
                for _code in add_code:
                    result_df_list.append(
                        pd.DataFrame(
                            [[
                                get_trading_date_by_shift(
                                    self.calendar_list, _trading_date, 1),
                                _code, self.ADD
                            ]],
                            columns=_columns,
                        ))
                for _code in remove_code:
                    result_df_list.append(
                        pd.DataFrame(
                            [[
                                get_trading_date_by_shift(
                                    self.calendar_list, _trading_date, 0),
                                _code, self.REMOVE
                            ]],
                            columns=_columns,
                        ))
            last_code = _current_code
        df = pd.concat(result_df_list)
        logger.info("end of parse changes from history companies.")
        return df
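
A hypothetical call, to make the set logic concrete (the collector instance and the "symbol"/"date" field names are assumptions for illustration):

    # hypothetical usage: SH600001 first appears on 2020-11-11
    history = pd.DataFrame({
        "symbol": ["SH600000", "SH600000", "SH600001"],
        "date": pd.to_datetime(["2020-11-10", "2020-11-11", "2020-11-11"]),
    })
    changes = collector.get_changes_with_history_companies(history)
    # -> a single "add" row for SH600001, dated the first trading day
    #    after 2020-11-10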
Example #3
    def _read_change_from_url(self, url: str) -> pd.DataFrame:
        """read change from url

        Parameters
        ----------
        url : str
            change url

        Returns
        -------
            pd.DataFrame:
                symbol      date        type
                SH600000  2019-11-11    add
                SH600000  2020-11-10    remove
            dtypes:
                symbol: str
                date: pd.Timestamp
                type: str, value from ["add", "remove"]
        """
        resp = retry_request(url).json()["data"]
        title = resp["title"]
        # only parse announcements whose title starts with "关于" ("Regarding ...")
        if not title.startswith("关于"):
            return pd.DataFrame()
        # and which concern the CSI 300 index ("沪深300")
        if "沪深300" not in title:
            return pd.DataFrame()

        logger.info(
            f"load index data from https://www.csindex.com.cn/#/about/newsDetail?id={url.split('id=')[-1]}"
        )
        _text = resp["content"]
        # extract full "YYYY年MM月DD日" dates from the announcement body
        date_list = re.findall(r"(\d{4}).*?年.*?(\d+).*?月.*?(\d+).*?日", _text)
        if len(date_list) >= 2:
            add_date = pd.Timestamp("-".join(date_list[0]))
        else:
            # fall back to a "YYYY年MM月" date and snap it to a trading day
            _date = pd.Timestamp("-".join(
                re.findall(r"(\d{4}).*?年.*?(\d+).*?月", _text)[0]))
            add_date = get_trading_date_by_shift(self.calendar_list,
                                                 _date,
                                                 shift=0)
        # "盘后"/"市后" mean after the close: the change takes effect the next trading day
        if "盘后" in _text or "市后" in _text:
            add_date = get_trading_date_by_shift(self.calendar_list,
                                                 add_date,
                                                 shift=1)
        # removals are recorded on the trading day before the additions
        remove_date = get_trading_date_by_shift(self.calendar_list,
                                                add_date,
                                                shift=-1)

        excel_url = None
        # prefer the attachment list; otherwise look for an .xls link in the body
        if resp.get("enclosureList", []):
            excel_url = resp["enclosureList"][0]["fileUrl"]
        else:
            excel_url_list = re.findall('.*href="(.*?xls.*?)".*', _text)
            if excel_url_list:
                excel_url = excel_url_list[0]
                # relative links need the csindex host prepended
                if not excel_url.startswith("http"):
                    excel_url = excel_url if excel_url.startswith(
                        "/") else "/" + excel_url
                    excel_url = f"http://www.csindex.com.cn{excel_url}"
        # prefer the excel attachment; fall back to parsing the inline table
        if excel_url:
            logger.info(
                f"get {add_date} changes from excel, title={title}, excel_url={excel_url}"
            )
            try:
                df = self._parse_excel(excel_url, add_date, remove_date)
            except ValueError:
                logger.warning(
                    f"error downloading file: {excel_url}, will parse the table from the content"
                )
                df = self._parse_table(_text, add_date, remove_date)
        else:
            logger.info(
                f"get {add_date} changes from url content, title={title}")
            df = self._parse_table(_text, add_date, remove_date)
        return df
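
All three _read_change_from_url variants go through a retry_request helper. A minimal sketch of what the examples assume it does, namely wrap requests.get with simple retries (the signature, retry count, and sleep interval are assumptions; only the exclude_status keyword is taken from the call in Example #5):

    import time

    import requests

    def retry_request(url: str, retry: int = 5, sleep: int = 3, exclude_status=None):
        # assumed behavior: GET `url`, retrying on errors; responses whose
        # status code is listed in `exclude_status` are returned as-is
        exclude_status = exclude_status or []
        last_error = None
        for _ in range(retry):
            try:
                resp = requests.get(url, timeout=30)
                if resp.status_code == 200 or resp.status_code in exclude_status:
                    return resp
                last_error = ValueError(f"unexpected status code: {resp.status_code}")
            except requests.RequestException as e:
                last_error = e
            time.sleep(sleep)
        raise last_error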
Example #4
    def _read_change_from_url(self, url: str) -> pd.DataFrame:
        """read change from url
        The parameter url is from the _get_change_notices_url method.
        Determine the stock add_date/remove_date based on the title.
        The response contains three cases:
            1.Only excel_url(extract data from excel_url)
            2.Both the excel_url and the body text(try to extract data from excel_url first, and then try to extract data from body text)
            3.Only body text(extract data from body text)

        Parameters
        ----------
        url : str
            change url

        Returns
        -------
            pd.DataFrame:
                symbol      date        type
                SH600000  2019-11-11    add
                SH600000  2020-11-10    remove
            dtypes:
                symbol: str
                date: pd.Timestamp
                type: str, value from ["add", "remove"]
        """
        resp = retry_request(url).json()["data"]
        title = resp["title"]
        # only parse announcements whose title starts with "关于" ("Regarding ...")
        if not title.startswith("关于"):
            return pd.DataFrame()
        # and which concern the CSI 300 index ("沪深300")
        if "沪深300" not in title:
            return pd.DataFrame()

        logger.info(
            f"load index data from https://www.csindex.com.cn/#/about/newsDetail?id={url.split('id=')[-1]}"
        )
        _text = resp["content"]
        # extract full "YYYY年MM月DD日" dates from the announcement body
        date_list = re.findall(r"(\d{4}).*?年.*?(\d+).*?月.*?(\d+).*?日", _text)
        if len(date_list) >= 2:
            add_date = pd.Timestamp("-".join(date_list[0]))
        else:
            # fall back to a "YYYY年MM月" date and snap it to a trading day
            _date = pd.Timestamp("-".join(
                re.findall(r"(\d{4}).*?年.*?(\d+).*?月", _text)[0]))
            add_date = get_trading_date_by_shift(self.calendar_list,
                                                 _date,
                                                 shift=0)
        # "盘后"/"市后" mean after the close: the change takes effect the next trading day
        if "盘后" in _text or "市后" in _text:
            add_date = get_trading_date_by_shift(self.calendar_list,
                                                 add_date,
                                                 shift=1)
        # removals are recorded on the trading day before the additions
        remove_date = get_trading_date_by_shift(self.calendar_list,
                                                add_date,
                                                shift=-1)

        excel_url = None
        # prefer the attachment list; otherwise look for an .xls link in the body
        if resp.get("enclosureList", []):
            excel_url = resp["enclosureList"][0]["fileUrl"]
        else:
            excel_url_list = re.findall('.*href="(.*?xls.*?)".*', _text)
            if excel_url_list:
                excel_url = excel_url_list[0]
                # relative links need the csindex host prepended
                if not excel_url.startswith("http"):
                    excel_url = excel_url if excel_url.startswith(
                        "/") else "/" + excel_url
                    excel_url = f"http://www.csindex.com.cn{excel_url}"
        # prefer the excel attachment; fall back to parsing the web page
        if excel_url:
            try:
                logger.info(
                    f"get {add_date} changes from the excel, title={title}, excel_url={excel_url}"
                )
                df = self._parse_excel(excel_url, add_date, remove_date)
            except ValueError:
                logger.info(
                    f"get {add_date} changes from the web page, title={title}, url=https://www.csindex.com.cn/#/about/newsDetail?id={url.split('id=')[-1]}"
                )
                df = self._parse_table(_text, add_date, remove_date)
        else:
            logger.info(
                f"get {add_date} changes from the web page, title={title}, url=https://www.csindex.com.cn/#/about/newsDetail?id={url.split('id=')[-1]}"
            )
            df = self._parse_table(_text, add_date, remove_date)
        return df
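
The date-extraction regex is easy to sanity-check in isolation. A small demo on a made-up announcement snippet (the Chinese sentence is hypothetical; it only mimics the "YYYY年MM月DD日 ... 盘后" phrasing the code looks for):

    import re

    import pandas as pd

    _text = "定于2020年12月14日（即2020年12月11日盘后）实施调整"  # hypothetical snippet
    date_list = re.findall(r"(\d{4}).*?年.*?(\d+).*?月.*?(\d+).*?日", _text)
    # date_list == [('2020', '12', '14'), ('2020', '12', '11')]
    add_date = pd.Timestamp("-".join(date_list[0]))
    # add_date == Timestamp('2020-12-14 00:00:00')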
Example #5
    def _read_change_from_url(self, url: str) -> pd.DataFrame:
        """read change from url

        Parameters
        ----------
        url : str
            change url

        Returns
        -------
            pd.DataFrame:
                symbol      date        type
                SH600000  2019-11-11    add
                SH600000  2020-11-10    remove
            dtypes:
                symbol: str
                date: pd.Timestamp
                type: str, value from ["add", "remove"]
        """
        resp = retry_request(url)
        _text = resp.text
        # extract full "YYYY年MM月DD日" dates from the page
        date_list = re.findall(r"(\d{4}).*?年.*?(\d+).*?月.*?(\d+).*?日", _text)
        if len(date_list) >= 2:
            add_date = pd.Timestamp("-".join(date_list[0]))
        else:
            # fall back to a "YYYY年MM月" date and snap it to a trading day
            _date = pd.Timestamp("-".join(
                re.findall(r"(\d{4}).*?年.*?(\d+).*?月", _text)[0]))
            add_date = get_trading_date_by_shift(self.calendar_list,
                                                 _date,
                                                 shift=0)
        # removals are recorded on the trading day before the additions
        remove_date = get_trading_date_by_shift(self.calendar_list,
                                                add_date,
                                                shift=-1)
        logger.info(f"get {add_date} changes")
        try:
            # the announcement usually links an excel file listing the changes
            excel_url = re.findall('.*href="(.*?xls.*?)".*', _text)[0]
            content = retry_request(f"http://www.csindex.com.cn{excel_url}",
                                    exclude_status=[404]).content
            _io = BytesIO(content)
            df_map = pd.read_excel(_io, sheet_name=None)
            # cache the raw excel file
            with self.cache_dir.joinpath(
                    f"{self.index_name.lower()}_changes_{add_date.strftime('%Y%m%d')}.{excel_url.split('.')[-1]}"
            ).open("wb") as fp:
                fp.write(content)
            tmp = []
            # "调入" is the additions sheet, "调出" the removals sheet
            for _s_name, _type, _date in [("调入", self.ADD, add_date),
                                          ("调出", self.REMOVE, remove_date)]:
                _df = df_map[_s_name]
                # keep this index's rows ("指数代码" = index code) and the
                # security-code column ("证券代码")
                _df = _df.loc[_df["指数代码"] == self.index_code, ["证券代码"]]
                _df = _df.applymap(self.normalize_symbol)
                _df.columns = [self.SYMBOL_FIELD_NAME]
                _df["type"] = _type
                _df[self.DATE_FIELD_NAME] = _date
                tmp.append(_df)
            df = pd.concat(tmp)
        except Exception:
            # fall back to parsing the HTML tables in the page body
            df = None
            _tmp_count = 0
            for _df in pd.read_html(resp.content):
                # the changes table has exactly 4 columns
                if _df.shape[-1] != 4:
                    continue
                _tmp_count += 1
                if self.html_table_index + 1 > _tmp_count:
                    continue
                tmp = []
                # the first column lists removals, the third lists additions;
                # the first two rows are headers
                for _s, _type, _date in [
                    (_df.iloc[2:, 0], self.REMOVE, remove_date),
                    (_df.iloc[2:, 2], self.ADD, add_date),
                ]:
                    _tmp_df = pd.DataFrame()
                    _tmp_df[self.SYMBOL_FIELD_NAME] = _s.map(
                        self.normalize_symbol)
                    _tmp_df["type"] = _type
                    _tmp_df[self.DATE_FIELD_NAME] = _date
                    tmp.append(_tmp_df)
                df = pd.concat(tmp)
                # cache the parsed changes
                df.to_csv(
                    str(
                        self.cache_dir.joinpath(
                            f"{self.index_name.lower()}_changes_{add_date.strftime('%Y%m%d')}.csv"
                        ).resolve()))
                break
        return df
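
The excel and HTML branches both map raw security codes through self.normalize_symbol. A minimal sketch of what the docstrings imply it does, i.e. turn 6-digit codes into "SH600000"-style symbols (the SH/SZ prefix rule is an assumption; the real method may differ):

    def normalize_symbol(self, symbol) -> str:
        # assumed behavior: zero-pad the raw code to 6 digits, then prefix
        # the exchange; codes starting with 6 trade in Shanghai, the rest
        # are treated as Shenzhen here
        symbol = f"{int(symbol):06}"
        return f"SH{symbol}" if symbol.startswith("6") else f"SZ{symbol}"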