def get_report(self, no=0, id_=None):
    """
    Fetch the plain-text body of one fund report from eastmoney.

    :param no: int. Index of the report within the ``type_=3`` report list.
    :param id_: Optional report ID, as found in the ``ID`` field of the
        entries returned by :meth:`show_report_list`. When given, ``no``
        and the per-index cache are bypassed.
    :return: list of str. The non-empty paragraph strings of the report.
    """

    def _fetch(report_id):
        # Download the report page and pull paragraph strings out of its
        # <pre> section.  (Previously this logic was duplicated verbatim
        # in both branches below.)
        report_url = "http://fund.eastmoney.com/gonggao/{code},{id_}.html".format(
            code=self.code, id_=report_id)
        r = rget(report_url)
        b = BeautifulSoup(r.text, "lxml")
        seasonr = b.find("pre")
        return [s.string.strip() for s in seasonr.findAll("p") if s.string]

    if id_:
        # Explicit ID: fetch directly, no caching.
        return _fetch(id_)
    if not self.report_detail.get(no):
        # Cache miss: resolve the ID from the stored report list.
        self.report_detail[no] = _fetch(self.report_list[no]["ID"])
    return self.report_detail[no]
def _basic_init(self):
    """Fetch and parse the money-fund page into ``self.price``.

    Reads the eastmoney pingzhongdata JS page, compounds the daily
    income-per-10k-shares series into a cumulative net-value series, and
    stores it as a DataFrame restricted to open trade days up to yesterday.
    """
    self._page = rget(self._url)
    # Pages for regular funds carry Data_fundSharesPositions; a money
    # fund page must not, so redirect the caller to fundinfo.
    if self._page.text[:800].find("Data_fundSharesPositions") >= 0:
        raise FundTypeError(
            "This code seems to be a fund, use fundinfo instead")
    # SECURITY NOTE(review): eval on remote page content — trusted
    # source assumed; the payload is a JS array literal.
    l = eval(
        re.match(r".*Data_millionCopiesIncome = ([^;]*);.*",
                 self._page.text).groups()[0])
    self.name = re.match(r".*fS_name = \"([^;]*)\";.*",
                         self._page.text).groups()[0]
    # Timestamps on the page are epoch milliseconds in Beijing time;
    # convert, then drop tzinfo to keep naive datetimes throughout.
    tz_bj = dt.timezone(dt.timedelta(hours=8))
    datel = [
        dt.datetime.fromtimestamp(int(d[0]) / 1e3, tz=tz_bj).replace(tzinfo=None)
        for d in l
    ]
    ratel = [float(d[1]) for d in l]
    # Compound the daily per-10k income into a net value seeded at 1.
    netvalue = [1]
    for dailyrate in ratel:
        netvalue.append(netvalue[-1] * (1 + dailyrate * 1e-4))
    netvalue.remove(1)  # drop the seed element at index 0
    df = pd.DataFrame(
        data={
            "date": datel,
            "netvalue": netvalue,
            "totvalue": netvalue,  # money funds: total value == net value
            "comment": [0 for _ in datel],
        })
    df = df[df["date"].isin(opendate)]  # keep trading days only
    df = df.reset_index(drop=True)
    self.price = df[df["date"] <= yesterdaydash()]
def _basic_init(self):
    """Fetch and parse the fund page into price table and metadata.

    Populates ``self.name``, ``self.rate`` (subscription fee in percent)
    and the ``self.price`` DataFrame; optionally preprocesses redemption
    fees when ``self.priceonly`` is falsy.
    """
    self._page = rget(self._url)
    if self._page.status_code == 404:
        raise ParserFailure(
            "Unrecognized fund, please check fund code you input.")
    # Money-fund pages expose Data_millionCopiesIncome instead of a
    # net-worth trend; redirect the caller to mfundinfo.
    if self._page.text[:800].find("Data_millionCopiesIncome") >= 0:
        raise FundTypeError(
            "This code seems to be a mfund, use mfundinfo instead")
    l = re.match(r"[\s\S]*Data_netWorthTrend = ([^;]*);[\s\S]*",
                 self._page.text).groups()[0]
    # No fund with null net values has been observed so far; if one
    # exists, other places would very likely break as well!
    l = l.replace("null", "None")
    l = eval(l)  # SECURITY NOTE(review): eval on remote JS payload
    ltot = re.match(r"[\s\S]*Data_ACWorthTrend = ([^;]*);[\s\S]*",
                    self._page.text).groups()[0]  # .* doesn't match \n
    ltot = ltot.replace("null", "None")  ## 096001 has null in its total-value data!
    ltot = eval(ltot)
    ## timestamp transform tzinfo must be taken into consideration
    tz_bj = dt.timezone(dt.timedelta(hours=8))
    infodict = {
        "date": [
            dt.datetime.fromtimestamp(int(d["x"]) / 1e3,
                                      tz=tz_bj).replace(tzinfo=None)
            for d in l
        ],
        "netvalue": [float(d["y"]) for d in l],
        "comment": [_nfloat(d["unitMoney"]) for d in l],
    }
    # Guard against mismatched lengths between the total-value and
    # net-value series; known affected fund: 502010.
    if len(l) == len(ltot):
        infodict["totvalue"] = [d[1] for d in ltot]
    try:
        rate = float(
            eval(
                re.match(r"[\s\S]*fund_Rate=([^;]*);[\s\S]*",
                         self._page.text).groups()[0]))
    except ValueError:
        rate = 0
        logger.info(
            "warning: this fund has no data for rate")  # know cases: ETF
    name = eval(
        re.match(r"[\s\S]*fS_name = ([^;]*);[\s\S]*",
                 self._page.text).groups()[0])
    self.rate = rate  # shengou rate in tiantianjijin, daeshengou rate discount is not considered
    self.name = name  # the name of the fund
    df = pd.DataFrame(data=infodict)
    df = df[df["date"].isin(opendate)]  # restrict to open trade days
    df = df.reset_index(drop=True)
    if len(df) == 0:
        raise ParserFailure("no price table found for this fund %s" %
                            self.code)
    self.price = df[df["date"] <= yesterdaydash()]
    # deal with the redemption fee attrs finally
    if not self.priceonly:
        self._feepreprocess()
def __init__(self, code):
    """Fetch the realtime-estimate JS snippet for fund *code* and parse it.

    Sets ``self.rtvalue`` (estimated net value), ``self.name`` and
    ``self.time`` (estimate timestamp) from the fundgz payload.
    """
    self.code = code
    resp = rget("http://fundgz.1234567.com.cn/js/" + code + ".js")
    text = resp.text
    # Pull the individual JSON-ish fields out of the JS payload by regex.
    self.rtvalue = float(match(r'.*"gsz":"(\d*\.\d*)",.*', text)[1])
    self.name = match(r'.*"name":"([^,]*)",.*', text)[1]
    stamp = match(r'.*"gztime":"([\d\s\-\:]*)".*', text)[1]
    self.time = dt.datetime.strptime(stamp, "%Y-%m-%d %H:%M")
def get_fund_list(ft):  # hh, zq, zs, gp, qdii, fof
    """Return all fund codes of a given category from the eastmoney guide API.

    :param ft: str. Fund category abbreviation: hh/zq/zs/gp/qdii/fof
        (mixed/bond/index/stock/QDII/FOF).
    :return: list of fund code strings.
    """
    r = rget(
        "http://fund.eastmoney.com/data/FundGuideapi.aspx?\
dt=0&ft={ft}&sd=&ed=&sc=z&st=desc&pi=1&pn=5000&zf=diy&sh=list".format(ft=ft),
        headers={
            "Host": "fund.eastmoney.com",
            "Referer": "http://fund.eastmoney.com/daogou/",
        },
    )
    # SECURITY NOTE(review): eval on a remote payload. The response is a
    # JS assignment whose right-hand side is JSON-like but not strict
    # JSON, hence eval after mapping null -> None. Trusted source assumed.
    d = eval(r.text.split("=")[1].replace("null", "None"))
    # Each entry is a comma-joined record whose first field is the code.
    return [code.split(",")[0] for code in d["datas"] if code.strip()]
def __init__(self, code):
    """Load the type-3 (announcement) report list for fund *code*.

    Prepares ``self.report_list`` (raw API entries, each carrying an
    ``ID``) and an empty ``self.report_detail`` cache.
    """
    self.code = code
    referer = "http://fundf10.eastmoney.com/jjgg_{code}_3.html".format(
        code=code)
    api_url = (
        "http://api.fund.eastmoney.com/f10/JJGG?callback=&fundcode={code}&pageIndex=1&pageSize=20&type={type_}"
        .format(code=code, type_="3"))
    resp = rget(api_url, headers={"Referer": referer})
    self.report_list = resp.json()["Data"]
    self.report_detail = {}  # cache: report index -> list of paragraphs
def get_investing_id(suburl):
    """Resolve the numeric pair id of an instrument page on cn.investing.com.

    :param suburl: str. Path part of the instrument URL, with or without
        a leading slash.
    :return: str. The pair id embedded in the last-price span's CSS class.
    """
    base = "https://cn.investing.com"
    full_url = base + suburl if suburl.startswith("/") else base + "/" + suburl
    page = rget(
        full_url,
        headers={
            "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36"
        },
    )
    soup = BeautifulSoup(page.text, "lxml")
    # The span with id "last_last" carries a class like "pid-12345-last".
    return soup.find("span", id="last_last")["class"][-1].split("-")[1]
def show_report_list(self, type_=3):
    """
    List fund report entries of a given column from the eastmoney JJGG API.

    :param type_: int. Column index; see the fund-report page on
        tiantianjijin for the meaning of each column.
    :return: list of dict. Raw entries from the API response.
    """
    referer = "http://fundf10.eastmoney.com/jjgg_{code}_3.html".format(
        code=self.code)
    api_url = (
        "http://api.fund.eastmoney.com/f10/JJGG?callback=&fundcode={code}&pageIndex=1&pageSize=20&type={type_}"
        .format(code=self.code, type_=str(type_)))
    resp = rget(api_url, headers={"Referer": referer})
    return resp.json()["Data"]
def _feepreprocess(self):
    """Populate ``self.feeinfo`` and ``self.segment`` from the redemption-fee page."""
    page = rget(self._feeurl)
    # parse the redemption fee html page with beautiful soup
    tree = BeautifulSoup(page.text, "lxml")
    anchor = tree.findAll("a", {"name": "shfl"})[0]
    fee_cells = anchor.parent.parent.next_sibling.next_sibling.find_all("td")
    # Collect fee-table cells, skipping placeholder dashes;
    # this could be [], known case 510030.
    info = []
    for cell in fee_cells:
        if cell.string != "---":
            info.append(cell.string)
    self.feeinfo = info
    # A typical table alternates (holding-period, fee%) pairs; fall back
    # to a default schedule otherwise (mainly ETFs).
    if not self.feeinfo or len(self.feeinfo) % 2 != 0:
        logger.debug("feeinfo is not typical, mainly due to ETF: %s" %
                     self.feeinfo)
        self.feeinfo = ["小于7天", "1.50%", "大于等于7天", "0.00%"]
    self.segment = fundinfo._piecewise(self.feeinfo)
def get_tdx_holidays(holidays=None, format="%Y-%m-%d"):
    """Fetch the holiday table from tdx.com.cn and group dates by region.

    :param holidays: Optional[dict]. Existing mapping of region name to
        a list of date strings, extended in place; a new dict is created
        when omitted or falsy.
    :param format: str. strftime format for the emitted date strings.
    :return: dict. Region name -> list of formatted holiday dates.
    """
    resp = rget("https://www.tdx.com.cn/url/holiday/")
    resp.encoding = "gbk"  # the page is GBK encoded
    soup = BeautifulSoup(resp.text, "lxml")
    rows = soup.find("textarea").string.split("\n")
    holidays = holidays or {}
    for row in rows:
        if not row.strip():
            continue  # skip blank lines
        fields = row.split("|")
        if fields[2] not in region_trans:
            continue  # unknown region marker
        region = region_trans[fields[2]]
        datestr = dt.datetime.strptime(fields[0], "%Y%m%d").strftime(format)
        holidays.setdefault(region, []).append(datestr)
    return holidays
def _feepreprocess(self):
    """ Preprocess to add self.feeinfo and self.segment attr according to redemption fee info """
    feepage = rget(self._feeurl)
    soup = BeautifulSoup(
        feepage.text,
        "lxml")  # parse the redemption fee html page with beautiful soup
    somethingwrong = False
    if not soup.findAll("a", {"name": "shfl"}):
        # No redemption-fee anchor at all: likely a terminated fund.
        somethingwrong = True
        logger.warning("%s 基金赎回信息为空,可能由于该基金已终止运作" % self.code)
        self.feeinfo = []
    else:
        self.feeinfo = [
            item.string for item in soup.findAll("a", {"name": "shfl"})
            [0].parent.parent.next_sibling.next_sibling.find_all("td")
            if item.string != "---"
        ]  # this could be [], known case 510030
    # A valid table alternates (holding-period, fee%) pairs, so an odd
    # or empty list is malformed.
    if not self.feeinfo or len(self.feeinfo) % 2 != 0:
        somethingwrong = True
    else:
        for item in self.feeinfo:
            if "开放期" in item or "封闭" in item or "开放日期" in item or "运作期" in item:
                # No plan yet to perfectly handle redemption fees of
                # fixed-open (定开) funds.
                somethingwrong = True
    if somethingwrong:
        logger.warning("%s 赎回费信息异常,多是因为定开基金,封闭基金或场内 ETF: %s" %
                       (self.code, self.feeinfo))
        # Fall back to a default fee schedule.
        self.feeinfo = ["小于7天", "1.50%", "大于等于7天", "0.00%"]
    # print(self.feeinfo)
    try:
        self.segment = fundinfo._piecewise(self.feeinfo)
    except (ValueError, IndexError) as e:
        logger.warning("%s 赎回费信息抓取异常,请手动设定 ``self.segment`` 和 ``self.feeinfo``: %s" %
                       (self.code, self.feeinfo))
        # below is default one
        self.feeinfo = ["小于7天", "1.50%", "大于等于7天", "0.00%"]
        self.segment = fundinfo._piecewise(self.feeinfo)
def get_ri_status(suburl=None):
    """Scrape a richvest.com data table into a DataFrame.

    :param suburl: Optional[str]. Query-string part of the page URL;
        defaults to the convertible-bond overview (m=cb&a=cb_all).
    :return: pd.DataFrame whose columns are the table headers.
    """
    if not suburl:
        suburl = "m=cb&a=cb_all"  # convertible bonds by default
    page = rget("http://www.richvest.com/index.php?" + suburl,
                headers={"user-agent": "Mozilla/5.0"})
    soup = BeautifulSoup(page.text, "lxml")
    headers_ = [th.text for th in soup.findAll("th")]
    ncols = len(headers_)
    table_rows = []
    row = None
    for idx, cell in enumerate(soup.findAll("td")):
        if idx % ncols == 0:
            row = []  # start a new record every ncols cells
        row.append(cell.text)
        if idx % ncols == ncols - 1:
            table_rows.append(row)  # record complete
    return pd.DataFrame(table_rows, columns=headers_)
def fetch_treasure_bond_rate(self):
    """Fetch the 5-year treasury bond yield from chinabond and cache it.

    The value is stored on the class attribute ``riskfree`` as a
    fraction (e.g. 0.03 for 3%), so it is fetched at most once per
    class, not per instance.

    :raises Exception: when the page cannot be fetched or the 5-year
        label cannot be located on it.
    """
    if getattr(self.__class__, 'riskfree', None) is not None:
        return  # already cached on the class
    r = rget('http://yield.chinabond.com.cn/cbweb-czb-web/czb/moreInfo')
    if r.status_code != 200:
        raise Exception('fetch treasure bond rate failed')
    soup = bs(r.content, 'lxml')
    ref = '5年'
    cells = soup.findAll('td')  # hoisted: previously re-parsed twice
    for index, label in enumerate(cells):
        if label.string is not None and label.string == ref:
            break
    else:
        # Fix: previously a missing label left `index` past the end and
        # the lookup below failed with an opaque IndexError.
        raise Exception('fetch treasure bond rate failed: label %s not found'
                        % ref)
    # The yield value sits in the cell right after the label.
    setattr(self.__class__, 'riskfree',
            round(float(cells[index + 1].string) / 100, 6))
def __init__(self, code, bondrate=None, riskfreerate=None, volatility=None, name=None): """ :param code: str. 转债代码,包含 SH 或 SZ 字头 :param bondrate: Optional[float]. 评估所用的债券折现率,默认使用中证企业债对应信用评级对应久期的利率 :param riskfreerate: Optioal[float]. 评估期权价值所用的无风险利率,默认使用国债对应久期的年利率。 :param volatility: Optional[float]. 正股波动性百分点,默认在一个范围浮动加上历史波动率的小幅修正。 :param name: str. 对于历史回测,可以直接提供 str,免得多次 get_rt 获取 name """ # 应该注意到该模型除了当天外,其他时间估计会利用现在的转股价,对于以前下修过转股价的转债历史价值估计有问题 self.code = code self.refbondrate = bondrate self.bondrate = self.refbondrate self.refriskfreerate = riskfreerate self.riskfreerate = self.refriskfreerate self.refvolatility = volatility self.volatility = self.refvolatility self.name = name r = rget("https://www.jisilu.cn/data/convert_bond_detail/" + code[2:]) r.encoding = "utf-8" b = BeautifulSoup(r.text, "lxml") self.rlist = [ float(re.search(r"[\D]*([\d]*.[\d]*)\%", s).group(1)) for s in b.select("td[id=cpn_desc]")[0].string.split("、") ] self.rlist.append(float(b.select("td[id=redeem_price]")[0].string)) self.rlist[-1] -= self.rlist[-2] # 最后一年不含息返多少 self.scode = (b.select("td[class=jisilu_nav]") [0].contents[1].string.split("-")[1].strip()) self.scode = ttjjcode(self.scode) # 标准化股票代码 self.zgj = float(b.select("td[id=convert_price]")[0].string) # 转股价 self.rating = b.select("td[id=rating_cd]")[0].string self.enddate = b.select("td[id=maturity_dt]")[0].string
def get_cninvesting_rt(suburl):
    """Scrape realtime quote info from a cn.investing.com instrument page.

    :param suburl: str. Path part of the instrument URL, with or without
        a leading slash.
    :return: dict with keys ``name``, ``current``, ``current_ext``
        (after-hours quote or None), ``currency`` (or None when not
        found) and ``percent``.
    """
    url = "https://cn.investing.com"
    if not suburl.startswith("/"):
        url += "/"
    url += suburl
    r = rget(
        url,
        headers={
            "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36"
        },
    )
    s = BeautifulSoup(r.text, "lxml")
    last_last = s.find("span", id="last_last")  # span holding last price
    q = _float(last_last.string)
    name = s.find("h1").string.strip()
    # Currency lookup: find the "货币" label in the header strip; the
    # currency code is the element right before it.
    ind = 0
    l = s.find("div", class_="lighterGrayFont").contents
    for i, c in enumerate(l):
        if isinstance(c, str) and c.strip() == "货币":
            ind = i
            break
    if ind == 0:
        currency = None  # label not found
    else:
        currency = l[ind - 1].string
    # Percent change is rendered in parentheses, e.g. "(1.23%)"; strip
    # the trailing character before conversion.
    percent = _float(
        s.find("span", attrs={"dir": "ltr", "class": "parentheses"}).string[:-1]
    )
    panhou = s.find("div", class_="afterHoursInfo")  # after-hours block
    if panhou:
        q_ext = _float(panhou.find("span").string)
    else:
        q_ext = None
    return {
        "name": name,
        "current": q,
        "current_ext": q_ext,
        "currency": currency,
        "percent": percent,
    }
def _basic_init(self):
    """Download the index CSV and build ``self.price`` and ``self.name``.

    CSV layout (header row first, data rows newest-last after reversal):
    column 0 is the date, column 2 the index name, column 3 the total
    value. Net value is the total value normalized by the total value of
    the file's last row.
    """
    raw = rget(self._url)
    reader = csv.reader(raw.text.splitlines(), delimiter=",")
    rows = list(reader)
    # Normalization factor: total value of the last row.
    factor = float(rows[-1][3])
    # Fix (idiom/perf): single pass over the data rows instead of three
    # separate range(len(...)-1) passes with index arithmetic.
    dates, netvalues, totvalues = [], [], []
    for row in rows[1:]:  # skip the header line
        dates.append(dt.datetime.strptime(row[0], "%Y-%m-%d"))
        totvalue = float(row[3])
        totvalues.append(totvalue)
        netvalues.append(totvalue / factor)
    index = pd.DataFrame(
        data={
            "date": dates,
            "netvalue": netvalues,
            "totvalue": totvalues,
            "comment": [0] * len(dates),
        })
    index = index.iloc[::-1]  # reverse into chronological order
    index = index.reset_index(drop=True)
    self.price = index[index["date"].isin(opendate)]
    self.price = self.price[self.price["date"] <= yesterdaydash()]
    self.name = rows[-1][2]  # index name from the last data row
def get_token():
    """Return a fresh xueqiu.com API token (the ``xq_a_token`` cookie)."""
    home = rget("https://xueqiu.com", headers={"user-agent": "Mozilla"})
    return home.cookies["xq_a_token"]
def update(self):
    """
    Incrementally extend ``self.price`` after loading a saved copy.

    Fetches daily earn-rate rows newer than the last saved date from the
    eastmoney lsjz API, compounds them onto the last saved total value
    and appends them to ``self.price``.

    :return: pd.DataFrame of the newly appended rows (possibly empty),
        or None when the table is already up to date.
    :raises TradeBehaviorError: when the saved table contains records
        newer than yesterday.
    """
    lastdate = self.price.iloc[-1].date
    startvalue = self.price.iloc[-1].totvalue
    diffdays = (yesterdayobj() - lastdate).days
    if diffdays == 0:
        return None
    if diffdays < 0:
        # Fix: this guard previously lived in an unreachable else branch
        # (`diffdays <= 10` / `diffdays > 10` is exhaustive), so a table
        # with future records silently issued a request with a negative
        # "per" parameter instead of failing loudly.
        raise TradeBehaviorError(
            "Weird incremental update: the saved copy has future records")
    # Probe with one row to check whether today's value is already online.
    self._updateurl = (
        "http://fund.eastmoney.com/f10/F10DataApi.aspx?type=lsjz&code=" +
        self.code + "&page=1&per=1")
    con = rget(self._updateurl)
    soup = BeautifulSoup(con.text, "lxml")
    items = soup.findAll("td")
    if dt.datetime.strptime(str(items[0].string), "%Y-%m-%d") == today():
        diffdays += 1  # today's record exists, fetch one extra row
    if diffdays <= 10:
        # caution: there may be today data!! then a day gap will be in table
        self._updateurl = (
            "http://fund.eastmoney.com/f10/F10DataApi.aspx?type=lsjz&code=" +
            self.code + "&page=1&per=" + str(diffdays))
        con = rget(self._updateurl)
        soup = BeautifulSoup(con.text, "lxml")
        items = soup.findAll("td")
    else:
        ## there is a 20 item per page limit in the API, so to be safe,
        ## we query each page by 10 items only
        items = []
        for pg in range(1, int(diffdays / 10) + 2):
            self._updateurl = (
                "http://fund.eastmoney.com/f10/F10DataApi.aspx?type=lsjz&code="
                + self.code + "&page=" + str(pg) + "&per=10")
            con = rget(self._updateurl)
            soup = BeautifulSoup(con.text, "lxml")
            items.extend(soup.findAll("td"))
    date = []
    earnrate = []
    comment = []
    # Each record spans 6 <td> cells: date, daily earn rate per 10k
    # shares, ..., comment in the sixth cell.
    for i in range(int(len(items) / 6)):
        ts = pd.Timestamp(str(items[6 * i].string))
        if (ts - lastdate).days > 0:  # keep only rows newer than saved copy
            date.append(ts)
            earnrate.append(float(items[6 * i + 1].string) * 1e-4)
            comment.append(_nfloat(items[6 * i + 5].string))
    date = date[::-1]  # reverse into chronological order
    earnrate = earnrate[::-1]
    comment = comment[::-1]
    # Compound the earn rates onto the last saved total value.
    netvalue = [startvalue]
    for earn in earnrate:
        netvalue.append(netvalue[-1] * (1 + earn))
    netvalue.pop(0)  # drop the seed element
    df = pd.DataFrame({
        "date": date,
        "netvalue": netvalue,
        "totvalue": netvalue,
        "comment": comment,
    })
    df = df[df["date"].isin(opendate)]
    df = df.reset_index(drop=True)
    df = df[df["date"] <= yesterdayobj()]
    if len(df) != 0:
        self.price = self.price.append(df, ignore_index=True, sort=True)
    return df
def get_fund_holdings(code, year="", season="", month="", category="jjcc"):
    """
    Get detailed underlying holdings of a fund from eastmoney.

    :param code: str. 6-digit fund code (a leading "F" is stripped).
    :param year: int. e.g. 2019
    :param season: int, 1, 2, 3, 4
    :param month: Optional[int]. Derived from ``season``; normally no
        need to set it explicitly.
    :param category: str. "stock" for stock holdings, "bond" for bond
        holdings. Tiantianjijin cannot automatically provide holdings of
        overseas funds; domestic holdings of FOF funds are not supported yet.
    :return: pd.DataFrame or None. None when no matching holdings exist.
    """
    if not month and season:
        month = 3 * int(season)  # last month of the season
    if category in ["stock", "stocks", "jjcc", "", "gp", "s"]:
        category = "jjcc"
    elif category in ["bond", "bonds", "zq", "zqcc", "b"]:
        category = "zqcc"
    else:
        raise ParserFailure("unrecognized category %s" % category)
    if code.startswith("F"):
        code = code[1:]
    r = rget(
        "http://fundf10.eastmoney.com/FundArchivesDatas.aspx?type={category}&code={code}&topline=10&\
year={year}&month={month}".format(year=str(year), month=str(month),
                                  code=code, category=category),
        headers={
            "Host": "fundf10.eastmoney.com",
            "Referer": "http://fundf10.eastmoney.com/ccmx_{code}.html".format(code=code),
        },
    )
    if len(r.text) < 50:
        # response too short: no holdings data at all for this query
        return
        # raise ParserFailure(
        #     "This fund has no holdings on stock or bonds in this period"
        # )
    s = BeautifulSoup(
        re.match("[\s\S]*apidata={ content:(.*),arryear:", r.text).groups()[0],
        "lxml")
    if len(s.text) < 30:
        return
        # raise ParserFailure(
        #     "This fund has no holdings on stock or bonds in this period"
        # )
    # Timeline of report dates shown on the page (e.g. "2019-06-30").
    timeline = [
        i.string for i in s.findAll("font", class_="px12")
        if i.text.startswith("2")
    ]
    ind = 0
    if month:
        for i, d in enumerate(timeline):
            if d.split("-")[1][-1] == str(month)[-1]:  # avoid 09 compare to 9
                ind = i
                break
        else:
            return  # not update to this month
    t1 = s.findAll("table")[ind]
    main = [[j.text for j in i.contents] for i in t1.findAll("tr")[1:]]
    cols = [j.text for j in t1.findAll("tr")[0].contents if j.text.strip()]
    # Default column positions; overridden below by header inspection.
    icode = 1
    iname = 2
    iratio = 4
    ishare = 5
    ivalue = 6
    for j, col in enumerate(cols):
        if col.endswith("代码"):
            icode = j
        elif col.endswith("名称"):
            iname = j
        elif col.endswith("比例"):
            iratio = j
        elif col.startswith("持股数"):
            ishare = j
        elif col.startswith("持仓市值"):
            ivalue = j
    if category == "jjcc":
        # Stock holdings: code, name, ratio (%), share count, market value.
        result = {
            "code": [],
            "name": [],
            "ratio": [],
            "share": [],
            "value": []
        }
        for l in main:
            result["code"].append(l[icode])
            result["name"].append(l[iname])
            result["ratio"].append(float(l[iratio][:-1]))  # strip trailing %
            result["share"].append(_float(l[ishare]))
            result["value"].append(_float(l[ivalue]))
    elif category == "zqcc":
        # Bond holdings: fixed column positions on this page.
        result = {"code": [], "name": [], "ratio": [], "value": []}
        for l in main:
            result["code"].append(l[1])
            result["name"].append(l[2])
            result["ratio"].append(float(l[3][:-1]))
            result["value"].append(_float(l[4]))
    return pd.DataFrame(result)
def update(self):
    """
    Incrementally extend ``self.price`` after loading a saved copy.

    Fetches net/total value rows newer than the last saved date from the
    eastmoney lsjz API and appends them to ``self.price``.

    :return: pd.DataFrame of the newly appended rows (possibly empty),
        or None when the table is already up to date.
    :raises TradeBehaviorError: when the saved table contains records
        newer than yesterday.
    """
    lastdate = self.price.iloc[-1].date
    diffdays = (yesterdayobj() - lastdate).days
    if (
            diffdays == 0
    ):  ## for some QDII, this value is 1, anyways, trying update is compatible (d+2 update)
        return None
    if diffdays < 0:
        # Fix: this guard previously lived in an unreachable else branch
        # (`diffdays <= 10` / `diffdays > 10` is exhaustive), so a table
        # with future records silently issued a request with a negative
        # "per" parameter instead of failing loudly.
        raise TradeBehaviorError(
            "Weird incremental update: the saved copy has future records")
    # Probe with one row to check whether today's value is already online.
    self._updateurl = (
        "http://fund.eastmoney.com/f10/F10DataApi.aspx?type=lsjz&code=" +
        self.code + "&page=1&per=1")
    con = rget(self._updateurl)
    soup = BeautifulSoup(con.text, "lxml")
    items = soup.findAll("td")
    if dt.datetime.strptime(str(items[0].string), "%Y-%m-%d") == today():
        diffdays += 1  # today's record exists, fetch one extra row
    if diffdays <= 10:
        self._updateurl = (
            "http://fund.eastmoney.com/f10/F10DataApi.aspx?type=lsjz&code=" +
            self.code + "&page=1&per=" + str(diffdays))
        con = rget(self._updateurl)
        soup = BeautifulSoup(con.text, "lxml")
        items = soup.findAll("td")
    else:
        ## there is a 20 item per page limit in the API, so to be safe,
        ## we query each page by 10 items only
        items = []
        for pg in range(1, int(diffdays / 10) + 2):
            self._updateurl = (
                "http://fund.eastmoney.com/f10/F10DataApi.aspx?type=lsjz&code="
                + self.code + "&page=" + str(pg) + "&per=10")
            con = rget(self._updateurl)
            soup = BeautifulSoup(con.text, "lxml")
            items.extend(soup.findAll("td"))
    date = []
    netvalue = []
    totvalue = []
    comment = []
    # Each record spans 7 <td> cells: date, net value, total value, ...,
    # comment (dividend etc.) in the seventh cell.
    for i in range(int(len(items) / 7)):
        ts = pd.Timestamp(str(items[7 * i].string))
        if (ts - lastdate).days > 0:
            date.append(ts)
            netvalue.append(float(items[7 * i + 1].string))
            totvalue.append(float(items[7 * i + 2].string))
            comment.append(_nfloat(items[7 * i + 6].string))
        else:
            break  # rows are newest-first; everything after is older
    df = pd.DataFrame({
        "date": date,
        "netvalue": netvalue,
        "totvalue": totvalue,
        "comment": comment,
    })
    df = df.iloc[::-1]  ## reverse the time order
    df = df[df["date"].isin(opendate)]
    df = df.reset_index(drop=True)
    df = df[df["date"] <= yesterdayobj()]
    if len(df) != 0:
        self.price = self.price.append(df, ignore_index=True, sort=True)
    return df