def record(self, entity, start, end, size, timestamps, http_session):
        """Download quarterly kdata tables for ``entity`` and return them as records.

        One page is requested per (year, quarter) pair produced by
        ``get_year_quarters(start, now_pd_timestamp(Region.CHN))``; the URL is
        built from ``self.url`` with the entity code, year and quarter.

        Args:
            entity: Object providing ``code``, ``name`` and ``timestamp``.
            start: Start passed to ``get_year_quarters``.
            end: Unused by this method.
            size: Unused by this method.
            timestamps: Unused by this method.
            http_session: Session object handed to ``request_get``.

        Returns:
            A list of row dicts (``DataFrame.to_dict(orient='records')``)
            sorted by ``timestamp``. An empty list is returned when no quarter
            yielded a usable table.
        """
        the_quarters = get_year_quarters(start, now_pd_timestamp(Region.CHN))
        # When the entity's timestamp is not on the same date as ``start`` and
        # more than one quarter is listed, the first quarter is skipped.
        if not is_same_date(entity.timestamp, start) and len(the_quarters) > 1:
            the_quarters = the_quarters[1:]

        level = self.level.value

        # Collect per-quarter frames and concatenate once at the end instead
        # of re-copying the accumulated rows on every iteration.
        frames = []
        for year, quarter in the_quarters:
            query_url = self.url.format(entity.code, year, quarter)
            response = request_get(http_session, query_url)
            response.encoding = 'gbk'

            try:
                dfs = pd.read_html(response.text)
            except ValueError as error:
                # pd.read_html raised ValueError for this page; log and skip.
                self.logger.error(
                    f'skip ({year}-{quarter:02d}){entity.code}{entity.name}({error})'
                )
                self.sleep()
                continue

            # The data is read from the fifth table (index 4) of the page.
            if len(dfs) < 5:
                self.sleep()
                continue

            # Drop the first row (presumably a header row - TODO confirm) and
            # take an independent copy before adding columns.
            df = dfs[4].iloc[1:].copy()
            df.columns = [
                'timestamp', 'open', 'high', 'close', 'low', 'volume',
                'turnover'
            ]
            df['name'] = entity.name
            df['level'] = level
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            df['provider'] = Provider.Sina

            frames.append(df)

            self.logger.info(
                f'({entity.code}{entity.name})({year}-{quarter:02d})'
            )
            self.sleep()

        # Sorting an empty frame by 'timestamp' would raise KeyError because
        # the column does not exist, so return early when nothing was fetched.
        if not frames:
            return []

        result_df = pd.concat(frames).sort_values(by='timestamp')

        return result_df.to_dict(orient='records')
Beispiel #2
0
    def record(self, entity, start, end, size, timestamps):
        """Download quarterly kdata tables for ``entity`` and return them as records.

        One page is requested with ``requests.get`` per (year, quarter) pair
        produced by ``get_year_quarters(start)``; the URL is built from
        ``self.url`` with the entity code, year and quarter.

        Args:
            entity: Object providing ``code``, ``name`` and ``timestamp``.
            start: Start passed to ``get_year_quarters``.
            end: Unused by this method.
            size: Unused by this method.
            timestamps: Unused by this method.

        Returns:
            A list of row dicts (``DataFrame.to_dict(orient="records")``)
            sorted by ``timestamp``. An empty list is returned when no quarter
            yielded a usable table.
        """
        the_quarters = get_year_quarters(start)
        # When the entity's timestamp is not on the same date as ``start`` and
        # more than one quarter is listed, the first quarter is skipped.
        if not is_same_date(entity.timestamp, start) and len(the_quarters) > 1:
            the_quarters = the_quarters[1:]

        level = self.level.value

        # Collect per-quarter frames and concatenate once at the end instead
        # of re-copying the accumulated rows on every iteration.
        frames = []
        for year, quarter in the_quarters:
            query_url = self.url.format(entity.code, year, quarter)
            response = requests.get(query_url)
            response.encoding = "gbk"

            try:
                dfs = pd.read_html(response.text)
            except ValueError as error:
                # pd.read_html raised ValueError for this page; log and skip.
                self.logger.error(
                    f"skip ({year}-{quarter:02d}){entity.code}{entity.name}({error})"
                )
                time.sleep(10.0)
                continue

            # The data is read from the fifth table (index 4) of the page.
            if len(dfs) < 5:
                time.sleep(10.0)
                continue

            # Drop the first row (presumably a header row - TODO confirm) and
            # take an independent copy before adding columns.
            df = dfs[4].iloc[1:].copy()
            df.columns = [
                "timestamp", "open", "high", "close", "low", "volume",
                "turnover"
            ]
            df["name"] = entity.name
            df["level"] = level
            df["timestamp"] = pd.to_datetime(df["timestamp"])
            df["provider"] = "sina"

            frames.append(df)

            self.logger.info(
                f"({entity.code}{entity.name})({year}-{quarter:02d})"
            )
            time.sleep(10.0)

        # Sorting an empty frame by "timestamp" would raise KeyError because
        # the column does not exist, so return early when nothing was fetched.
        if not frames:
            return []

        result_df = pd.concat(frames).sort_values(by="timestamp")

        return result_df.to_dict(orient="records")
Beispiel #3
0
    def generate_request_param(self, security_item, start, end, size,
                               timestamp):
        """Build the request parameters for ``security_item``.

        The quarters come from ``get_year_quarters(start)``. When the item's
        timestamp is not on the same date as ``start`` and more than one
        quarter is listed, the first quarter is left out.

        ``end``, ``size`` and ``timestamp`` are not used here.

        Returns:
            A dict with the keys ``security_item``, ``quarters`` and ``level``
            (the latter being ``self.level.value``).
        """
        quarters = get_year_quarters(start)

        starts_on_item_date = is_same_date(security_item.timestamp, start)
        if not starts_on_item_date and len(quarters) > 1:
            quarters = quarters[1:]

        request_param = dict(
            security_item=security_item,
            quarters=quarters,
            level=self.level.value,
        )
        return request_param
Beispiel #4
0
    def record(self, entity, start, end, size, timestamps, http_session):
        """Download quarterly kdata tables for ``entity`` into one DataFrame.

        One page is fetched with ``sync_get`` per (year, quarter) pair
        produced by ``get_year_quarters(start, now_pd_timestamp(Region.CHN))``;
        the URL is built from ``self.url`` with the entity code, year and
        quarter.

        Args:
            entity: Object providing ``code``, ``name`` and ``timestamp``.
            start: Start passed to ``get_year_quarters``.
            end: Unused by this method.
            size: Unused by this method.
            timestamps: Unused by this method.
            http_session: Session object handed to ``sync_get``.

        Returns:
            The concatenated DataFrame with an added ``level`` column when
            ``pd_is_not_null`` accepts it, otherwise ``None``.
        """
        the_quarters = get_year_quarters(start, now_pd_timestamp(Region.CHN))
        # When the entity's timestamp is not on the same date as ``start`` and
        # more than one quarter is listed, the first quarter is skipped.
        if not is_same_date(entity.timestamp, start) and len(the_quarters) > 1:
            the_quarters = the_quarters[1:]

        level = self.level.value

        # Collect per-quarter frames and concatenate once at the end instead
        # of re-copying the accumulated rows on every iteration.
        frames = []
        for year, quarter in the_quarters:
            query_url = self.url.format(entity.code, year, quarter)
            text = sync_get(http_session,
                            query_url,
                            encoding='gbk',
                            return_type='text')
            # No text came back for this quarter; move on without sleeping,
            # as the original flow did.
            if text is None:
                continue

            try:
                dfs = pd.read_html(text)
            except ValueError as error:
                # pd.read_html raised ValueError for this page; log and skip.
                self.logger.error(
                    f'skip ({year}-{quarter:02d}){entity.code}{entity.name}({error})'
                )
                self.sleep()
                continue

            # The data is read from the fifth table (index 4) of the page.
            if len(dfs) < 5:
                self.sleep()
                continue

            # Drop the first row (presumably a header row - TODO confirm) and
            # take an independent copy before renaming the columns.
            df = dfs[4].iloc[1:].copy()
            df.columns = [
                'timestamp', 'open', 'high', 'close', 'low', 'volume',
                'turnover'
            ]
            frames.append(df)

            self.sleep()

        # Keep an empty DataFrame as the fallback so pd_is_not_null receives
        # the same kind of argument as before when nothing was fetched.
        result_df = pd.concat(frames) if frames else pd.DataFrame()

        if pd_is_not_null(result_df):
            result_df['level'] = level
            return result_df
        return None
Beispiel #5
0
    def record(self, security_item, start, end, size, timestamps):
        """Scrape one quarter of kdata rows for ``security_item``.

        For the first quarter in the list, two pages are requested: 'bfq'
        first and 'hfq' second (URLs come from ``self.get_kdata_url``). Rows
        are taken from the ``FundHoldSharesTable`` element, skipping its first
        ``tr``. 'bfq' rows create the kdata dicts; 'hfq' rows add the
        ``hfq_*`` and ``factor`` fields to the dict with the same id, or
        create a dict with prices divided by the factor when no 'bfq' row
        exists for that id. The latest factor seen is stored in
        ``self.latest_factors[security_item.id]``.

        NOTE(review): the method returns inside the quarter loop, so only the
        first quarter is processed per call - presumably the caller invokes
        it repeatedly with an advancing ``start``; confirm.

        Args:
            security_item: Object providing ``id``, ``code``, ``name`` and
                ``timestamp``.
            start: Start passed to ``get_year_quarters``.
            end: Unused by this method.
            size: Unused by this method.
            timestamps: Unused by this method.

        Returns:
            A list of kdata dicts for the processed quarter, or ``None`` when
            there is no quarter to process.
        """
        the_quarters = get_year_quarters(start)
        # If the item's timestamp is not on the same date as ``start``, the
        # first quarter is treated as already recorded and skipped.
        if not is_same_date(security_item.timestamp,
                            start) and len(the_quarters) > 1:
            the_quarters = the_quarters[1:]

        for year, quarter in the_quarters:
            kdatas = []
            # Index of the first kdata dict per id, replacing a linear scan of
            # ``kdatas`` for every hfq row (assumes ids are hashable, e.g.
            # strings - TODO confirm).
            kdata_by_id = {}

            for fuquan in ['bfq', 'hfq']:
                the_url = self.get_kdata_url(security_item.code, year, quarter,
                                             fuquan)
                resp = requests.get(the_url)

                trs = Selector(text=resp.text).xpath(
                    '//*[@id="FundHoldSharesTable"]/tr[position()>1 and position()<=last()]'
                ).extract()

                for tr in trs:
                    tds = Selector(text=tr).xpath('//td//text()').extract()
                    tds = [x.strip() for x in tds if x.strip()]

                    # Cell order: date, open, high, close, low, volume,
                    # turnover, and (hfq pages only) factor.
                    open_price = tds[1]
                    high = tds[2]
                    close = tds[3]
                    low = tds[4]
                    volume = tds[5]
                    turnover = tds[6]
                    if fuquan == 'hfq':
                        factor = tds[7]

                    the_timestamp = to_pd_timestamp(tds[0])
                    the_id = generate_kdata_id(security_id=security_item.id,
                                               timestamp=the_timestamp,
                                               level=self.level)

                    if fuquan == 'hfq':
                        # we got bfq at first and then update hfq data
                        kdata = kdata_by_id.get(the_id)

                        if kdata is None:
                            self.logger.error(
                                "bfq not got for:{}".format(the_id))
                            # No bfq row for this id: derive the unadjusted
                            # prices by dividing the hfq prices by the factor.
                            kdata = {
                                'id': the_id,
                                'timestamp': the_timestamp,
                                'name': security_item.name,
                                'level': self.level.value,
                                'open': to_float(open_price) / to_float(factor),
                                'close': to_float(close) / to_float(factor),
                                'high': to_float(high) / to_float(factor),
                                'low': to_float(low) / to_float(factor),
                                'volume': to_float(volume),
                                'turnover': to_float(turnover)
                            }
                            kdatas.append(kdata)
                            kdata_by_id[the_id] = kdata

                        kdata['hfq_open'] = to_float(open_price)
                        kdata['hfq_high'] = to_float(high)
                        kdata['hfq_close'] = to_float(close)
                        kdata['hfq_low'] = to_float(low)
                        kdata['factor'] = to_float(factor)

                        self.latest_factors[security_item.id] = to_float(
                            factor)

                    else:
                        kdata = {
                            'id': the_id,
                            'timestamp': the_timestamp,
                            'name': security_item.name,
                            'level': self.level.value,
                            'open': to_float(open_price),
                            'close': to_float(close),
                            'high': to_float(high),
                            'low': to_float(low),
                            'volume': to_float(volume),
                            'turnover': to_float(turnover)
                        }
                        kdatas.append(kdata)
                        # setdefault keeps the first dict for an id, matching
                        # the previous "first match in kdatas" lookup.
                        kdata_by_id.setdefault(the_id, kdata)

            return kdatas

        # No quarter to process.
        return None