Example 1
class IndexDataLoader:
    engine = cfg.load_engine()["2Gb"]

    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            setattr(self, k, v)

    @common.unhash_inscache()
    def load_bm(self):
        sql = "SELECT index_id, date, value " \
              "FROM base_finance.index_value " \
              "WHERE index_id in ({iids}) AND date BETWEEN '{sd}' AND '{ed}'".format(
            iids=sqlfmt(self.index_id), sd=self.start, ed=self.end)
        df = pd.read_sql(sql, self.engine)  # SQL Table
        df["date"] = df["date"].apply(
            lambda x: dt.datetime(x.year, x.month, x.day))
        return df.pivot(index="date", columns="index_id", values="value")

    @common.unhash_inscache()
    def load_rf(self):
        # fetch one extra year of the 1-year treasury rate so there are always values to fill with
        sql = "SELECT statistic_date as date, y1_treasury_rate as value " \
              "FROM base.market_index " \
              "WHERE statistic_date BETWEEN '{sd}' AND '{ed}'".format(
            sd=self.start - relativedelta(years=1), ed=self.end)
        df = pd.read_sql(sql, self.engine)  # SQL Table
        df["date"] = df["date"].apply(
            lambda x: dt.datetime(x.year, x.month, x.day))
        return df.set_index("date")
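
A note on the query style: building SQL with str.format, as above, is fragile around quoting and injection. A minimal sketch of the same query shape with bound parameters instead, using an in-memory SQLite engine as a stand-in for the project's cfg.load_engine()["2Gb"]:

import pandas as pd
from sqlalchemy import bindparam, create_engine, text

engine = create_engine("sqlite://")  # stand-in; the snippets target MySQL
with engine.connect() as conn:
    conn.execute(text("CREATE TABLE index_value (index_id TEXT, date TEXT, value REAL)"))
    conn.execute(text("INSERT INTO index_value VALUES ('I1', '2018-01-02', 100.5)"))

    # expanding=True renders the IN-list from a Python sequence safely
    sql = text(
        "SELECT index_id, date, value FROM index_value "
        "WHERE index_id IN :iids AND date BETWEEN :sd AND :ed"
    ).bindparams(bindparam("iids", expanding=True))
    rows = conn.execute(sql, {"iids": ["I1"], "sd": "2018-01-01", "ed": "2018-12-31"}).fetchall()

df = pd.DataFrame(rows, columns=["index_id", "date", "value"])
print(df)
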
Example 2
def main():
    db = input("choose a db...")
    engine = cfg.load_engine()[db]
    df_dict = parse(engine)
    for tb, df in df_dict.items():
        print(tb)
        df["comments"] = df["comments"].apply(lambda x: re.search(
            "(?P<name_2>.*?)[:|:]\s*\{(?P<comment>.*)\}|(?P<name_1>.*)", x).
                                              groupdict())
        df["name_sc_1"] = df["comments"].apply(lambda x: x["name_1"])
        df["name_sc_2"] = df["comments"].apply(lambda x: x["name_2"])
        df["name_sc"] = df["name_sc_1"].fillna(df["name_sc_2"])
        df["comments"] = df["comments"].apply(lambda x: x["comment"])
        df["default"] = df["default"].apply(
            lambda x: x.replace("DEFAULT ", ""))
        del df["name_sc_1"], df["name_sc_2"]
        df = df.fillna("")
        df = df.astype(str)
        df = df[[
            "name", "name_sc", "type", "nullable", "default", "comments",
            "is_pk"
        ]]
        df_dict[tb] = df

    io.export_to_xl(df_dict, db)
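
What the groupdict call above yields: the first regex alternative captures "name: {comment}" pairs, the fallback alternative captures a bare name, and groups from whichever branch did not match come back as None, which is exactly what the later fillna relies on. A standalone check:

import re

pat = re.compile(r"(?P<name_2>.*?)[:|:]\s*\{(?P<comment>.*)\}|(?P<name_1>.*)")
print(pat.search("fund_id: {primary key}").groupdict())
# {'name_2': 'fund_id', 'comment': 'primary key', 'name_1': None}
print(pat.search("fund_name").groupdict())
# {'name_2': None, 'comment': None, 'name_1': 'fund_name'}
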
Example 3
def main():
    server_name, db_name, file_name = get_source()
    engine_read = cfg.load_engine()[server_name]
    ddls = get_ddl(db_name, engine_read)
    cmts = get_cmt()
    dfs = merge(ddls, cmts)
    export(dfs, file_name=file_name)
    su.tic("done...")
Example 4
class WindStockSeasonDataCrawler(ICrawler, WindStockTimeSeriesCrawler):
    ENGINE = cfg.load_engine()["2Gb"]
    cols_query = {**wsdconst.CommonQuery.S_STOCK_OTHERS}

    cols_store = {
        "base_finance.stock_ability_revenue": ["stock_id", "date", "roe_ttm2"],
        "base_finance.stock_ability_debt":
        ["stock_id", "date", "longdebttodebt"],
        "base_finance.stock_ability_growth":
        ["stock_id", "date", "yoyeps_basic"],
        "base_finance.stock_asset": ["stock_id", "date", "tot_assets"],
        "base_finance.stock_liability": ["stock_id", "date", "tot_liab"],
        "base_finance.stock_equity": ["stock_id", "date", "cap_stk"],
    }

    def _fetch_season_data(self, stock_ids):
        result = pd.DataFrame()

        s_start, s_end = [self.timefmt(x) for x in [self.start, self.end]]
        query_str = ",".join(self.cols_query)
        for stock_id in stock_ids:
            resprox = wind.wsd(stock_id, query_str, s_start, s_end,
                               "unit=1;rptType=1;Period=Q;Days=Alldays")

            if resprox.ErrorCode == 0:
                print("Success", stock_id)
                dates = resprox.Times
                resprox = resprox.Data
            else:
                print("Error: ", self.start, self.end, stock_id)
                continue

            d = pd.DataFrame(resprox).T
            d.columns = list(self.cols_query.values())
            d["stock_id"] = stock_id
            d["date"] = dates
            result = result.append(d)

        return result.dropna(how="all", subset=list(self.cols_query.values()))

    def crawl(self):
        wind.start()
        STEP = 5
        sliced = [
            self.ids_to_crawl[i:i + STEP]
            for i in range(0, len(self.ids_to_crawl), STEP)
        ]

        # crawl asynchronously and write results to the DB
        pool = ThreadPool(8)
        for sids in sliced:
            pool.apply_async(self._fetch_season_data,
                             args=(sids, ),
                             callback=self.store)
        pool.close()
        pool.join()
        print("done")
Example 5
def test1():
    from utils.database import io
    for y in range(2016, 2018):
        for m in (3, 6, 9, 12):
            d = cld.monthrange(y, m)[1]
            date = dt.date(y, m, d)
            print("RANK DATE: ", date)
            f = MutRanking(date)
            a = f.rank_all
            io.to_sql("base_public.fund_rank", cfg.load_engine()["2Gb"], a)
Example 6
def main():
    from utils.database import io
    from utils.algorithm.ranking.ranking import CalculateHelper
    t = CalculateHelper.last_rank_date(dt.date.today())
    if (t.month, t.day) in {(3, 31), (6, 30), (9, 30), (12, 31)}:
        print("RANK DATE: ", t)
        f = MutRanking(t)
        res = f.rank_all
        io.to_sql("base_public.fund_rank", cfg.load_engine()["2Gb"], res)
Example 7
class FundDataLoader:
    engine = cfg.load_engine()["2Gb"]

    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            setattr(self, k, v)

    def load_nv(self):
        sql = "SELECT fund_id, statistic_date as date, swanav as nav " \
              "FROM base_public.fund_nv " \
              "WHERE fund_id in ({fid}) AND statistic_date BETWEEN '{sd}' AND '{ed}'".format(
            fid=sqlfmt(self.fund_id), sd=self.start, ed=self.end)
        df = pd.read_sql(sql, self.engine)  # SQL Table
        df["date"] = df["date"].apply(
            lambda x: dt.datetime(x.year, x.month, x.day))
        return df.pivot(index="date", columns="fund_id", values="nav")

    @classmethod
    def load_foundation_date(cls):
        sql = "SELECT fund_id, MIN(statistic_date) as foundation_date FROM base_public.fund_nv GROUP BY fund_id"
        return pd.read_sql(sql, cls.engine)

    @classmethod
    def load_funds_structured(cls):
        ids = const.MUTFUNDS_STRUCTURED

        return ids

    @classmethod
    def load_funds_open(cls):
        sql = "SELECT fund_id " \
              "FROM base_public.fund_type_mapping " \
              "WHERE typestandard_code = '01' AND type_code = '0102'"
        return set([x[0] for x in cls.engine.execute(sql).fetchall()])

    @classmethod
    def load_funds_daily(cls):
        sql = "SELECT fund_id " \
              "FROM base_public.fund_info " \
              "WHERE nv_freq = '日度'"
        return set([x[0] for x in cls.engine.execute(sql).fetchall()])

    @classmethod
    def load_funds_of_type(cls, args: tuple):
        tcode2, tcode4 = args
        if len(tcode2) == 4:
            s = "type_code"
        elif len(tcode4) == 6:
            s = "stype_code"
        else:
            raise ValueError("unrecognized type codes: %r" % (args,))

        sql = "SELECT fund_id FROM base_public.fund_type_mapping " \
              "WHERE typestandard_code = '02' AND `{s}` = '{tcode2}' " \
              "AND fund_id IN (" \
              "SELECT fund_id FROM base_public.fund_type_mapping " \
              "WHERE typestandard_code = '04' AND stype_code = '{tcode4}')".format(s=s, tcode2=tcode2, tcode4=tcode4)
        return set([x[0] for x in cls.engine.execute(sql).fetchall()])
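
load_nv returns the long (fund_id, date, nav) records pivoted to one column per fund, indexed by date; dates missing for a fund become NaN. A toy illustration of the reshape:

import datetime as dt
import pandas as pd

df = pd.DataFrame({
    "fund_id": ["F1", "F1", "F2"],
    "date": [dt.datetime(2018, 1, 1), dt.datetime(2018, 1, 2), dt.datetime(2018, 1, 1)],
    "nav": [1.00, 1.01, 2.00],
})
print(df.pivot(index="date", columns="fund_id", values="nav"))
# fund_id       F1   F2
# 2018-01-01  1.00  2.0
# 2018-01-02  1.01  NaN
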
Example 8
def main():
    engines = cfg.load_engine()
    engine_4G = engines["4G"]
    conn_4G = engine_4G.connect()

    record = pd.read_excel("C:/Users/Yu/Desktop/CRM/Accounts.xlsx").iloc[-1]
    acc, pwd = record["Account"], record["Pwd"]

    generate_account(acc, pwd, conn_4G, "trial")
    test_connection(acc, pwd)
Example 9
def test():
    from utils.etlkit.ext import tools
    ENGINE = cfg.load_engine()["2Gbp"]

    tc = tools.TableComparer(
        "base_test.fund_position_bond_test",
        "base_public.fund_position_bond",
        ENGINE,
        cols_included={"quantity", "scale", "asset_scale", "proportion_net"})
    res = tc.result
    res.to_csv("fund_position_bond.csv", )
Example 10
class SyncInitializer:
    INIT_FROM = {
        FundNvDataStandard.__tablename__: (FundNvDataSource.__tablename__, )
    }

    CONFIG_TABLE = "config_private.sync_source"

    ENGINE = cfg.load_engine()["2Gb"]

    def __init__(self, target_table, default_prio=1):
        self._target_table = target_table
        splitted = target_table.split(".")
        self._target_table_name = splitted[-1]
        if len(splitted) == 2:
            self._target_table_db = splitted[0]
        self._default_prio = default_prio
        self._cfg = pd.DataFrame()

    def _constructor(self):
        df = pd.DataFrame()  # fallback so `df` is always bound
        if self._target_table_name == "fund_nv_data_standard":
            dbsession = sessionmaker()
            session = dbsession(bind=self.ENGINE)
            stmt = session.query(FundNvDataSource).filter(
                FundNvDataSource.is_used == 1,
                FundNvDataSource.source_id.notin_({"04", "05"}),
                FundNvDataSource.update_time.between(FROM, TO)).group_by(
                    FundNvDataSource.fund_id,
                    FundNvDataSource.source_id).with_entities(
                        FundNvDataSource.fund_id.label("pk"),
                        FundNvDataSource.source_id.label("source_id"))
            df = pd.DataFrame(stmt.all())
        return df

    def _load_default_cfg(self):
        if len(self._cfg) == 0:
            return
        self._cfg = self._cfg.drop_duplicates()
        self._cfg["priority"] = self._default_prio
        self._cfg["target_table"] = self._target_table_name

    def initialize(self):
        self._cfg = self._cfg.append(self._constructor())
        self._load_default_cfg()

    def update_syncsource(self):
        io.to_sql(self.CONFIG_TABLE, self.ENGINE, self.cfg,
                  type="ignore")  # `type` must be ignore

    @property
    def cfg(self):
        return self._cfg
Example 11
def main():
    engine_rd = cfg.load_engine()["2G"]
    engine_wt = cfg.load_engine()["2Gc"]

    sql = "SELECT * FROM base.org_info \
    WHERE address IS NOT NULL and org_id NOT IN (SELECT oim.org_ID as matched_id \
    FROM crawl.x_fund_org xfo \
    JOIN (SELECT org_id, MAX(entry_time) msd FROM crawl.x_fund_org GROUP BY org_id ) tb_msd ON xfo.org_id = tb_msd.org_id AND xfo.entry_time = tb_msd.msd \
    JOIN base.org_id_match oim ON xfo.org_id = oim.source_org_ID)"

    with engine_rd.connect() as conn:
        df = pd.read_sql(sql, conn)
        df["data_date"] = df["entry_time"].apply(
            lambda x: x.date())  # use the entry (DB insertion) date as data_date

        sql2 = "SELECT * FROM crawl.im_org_info LIMIT 0, 1"
        df2 = pd.read_sql(sql2, conn)
        df = df[list(set(df2).intersection(set(df)))]  # keep only columns the target table has

    with engine_wt.connect() as conn:
        io.to_sql("im_org_info", conn, df, chunksize=500)
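
The set(df2).intersection(set(df)) line works because iterating a DataFrame yields its column labels, so this keeps only the columns the target table also has. Note the resulting column order is arbitrary, since it goes through a set:

import pandas as pd

df = pd.DataFrame({"a": [1], "b": [2], "c": [3]})
df2 = pd.DataFrame(columns=["b", "c", "d"])  # schema of the target table
print(sorted(df[list(set(df2).intersection(set(df)))].columns))  # ['b', 'c']
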
Example 12
def test():
    import os
    from utils.etlkit.ext import tools
    from utils.database import config as cfg
    ENGINE = cfg.load_engine()["2Gbp"]

    tc = tools.TableComparer(
        "base_test.fund_position_bond_test_20180515",
        "base_public.fund_position_bond",
        ENGINE,
        cols_included={"quantity", "scale", "asset_scale", "proportion_net"},
    )
    res = tc.result
    res.to_csv(os.path.expanduser("~/Desktop/fund_position_bond.csv"),
               encoding="gbk")
Example 13
def rematch_oid():
    eg_2Gb = cfg.load_engine()["2Gb"]

    df = pd.read_sql(
        "SELECT org_ID, source_org_ID, match_type FROM org_id_match", eg_2Gb)
    df["match_type"] = df["match_type"].apply(lambda x: SOURCES.get(x))
    df["id_type"] = 2
    df = df.rename(columns={
        "source_org_ID": "source_id",
        "org_ID": "matched_id",
        "match_type": "source"
    })
    df = df.dropna(subset=["matched_id", "source_id", "source"], how="any")
    io.to_sql("id_match", eg_2Gb, df)
Example 14
def main():
    tb_m = sf.Table("m", "indicator")

    engines = cfg.load_engine()
    engine_read = engines["2Gb"]
    engine_write = engines["2Gb"]
    conn_read = engine_read.connect()
    conn_write = engine_write.connect()

    tb_names = [_tb["re_m"], _tb["ri_m"], _tb["sub_m"]]
    sqls = gen_sql_indicator_latest(tb_names)
    dfs = fetch_data(sqls, conn_read)

    dm = merge_result(dfs["m"], how="inner")

    refresh(dm, tb_m, conn_write)
Example 15
def main():
    from utils.database import io
    from utils.algorithm.ranking.ranking import CalculateHelper
    # t = dt.date.today()
    t = CalculateHelper.last_rank_date(dt.date.today())
    if (t.month, t.day) in {(3, 31), (6, 30), (9, 30), (12, 31)}:
        print("RANK DATE: ", t)
        types = [
            '+1b1', '+1b2', '+1bl1', '+1bl2', '+1bl3', '+1bl4', '+1bl5', '+1s1'
        ]
        for rank_type in types:
            f = MutRanking(t, [rank_type])
            res = f.rank_all
            if len(res):
                io.to_sql("base_public.fund_rank",
                          cfg.load_engine()["2Gbp"], res)
                print(res.head())
Example 16
def rematch_fid():
    # re-map match_type of matched records in fund_id_match to the unified source code
    eg_2Gb = cfg.load_engine()["2Gb"]

    df = pd.read_sql(
        "SELECT fund_ID, source_ID, match_type FROM fund_id_match", eg_2Gb)
    df["match_type"] = df["match_type"].apply(lambda x: SOURCES.get(x))
    df["id_type"] = 1
    df = df.rename(columns={
        "source_ID": "source_id",
        "fund_ID": "matched_id",
        "match_type": "source"
    })
    df = df.dropna(subset=["matched_id", "source_id", "source"], how="any")
    io.to_sql("id_match", eg_2Gb, df)
Example 17
def rematch_source_used():
    eg_2Gb = cfg.load_engine()["2Gb"]
    eg_2Gcfgp = create_engine(
        "mysql+pymysql://root:[email protected]:4171/config_private?charset=utf8"
    )

    df = pd.read_sql("SELECT * FROM fund_nv_updata_source WHERE is_updata = 1",
                     eg_2Gb)
    tmp = df.copy(False)
    tmp = tmp.loc[tmp["fund_id"].apply(lambda x: x[:2] == "JR")]
    tmp["source_id"] = tmp["data_source"].apply(lambda x: SOURCES.get(x))
    tmp["is_granted"] = tmp["data_source"].apply(lambda x: {9: 1}.get(x, 0))
    tmp["pk"] = tmp["fund_id"]
    tmp["target_table"] = "fund_nv_data_standard"
    tmp = tmp[["target_table", "pk", "source_id"]]
    tmp = tmp.dropna()

    io.to_sql("sync_source", eg_2Gcfgp, tmp)
Example 18
def test():
    # temporary
    _engine_wt2 = cfg.load_engine()["4Gpp"]
    def tmp(fund_id):
        """

        Args:
            update_time: record update time

        Returns:
            pandas.DataFrame
        """

        query_fundincome = _session.query(IdMatch).join(
            FundInfo, and_(IdMatch.id_type == 1, IdMatch.matched_id == FundInfo.fund_id)
        ).join(
            DFundAnnouncement, and_(IdMatch.id_type == 1, IdMatch.source_id == DFundAnnouncement.fund_id, )
        # IdMatch.data_source == DFundAnnouncement.data_source

        ).filter(
            DFundAnnouncement.fund_id == fund_id
        ).with_entities(
            *_input_entities
        )
        df = pd.DataFrame(query_fundincome.all())
        df = df.rename(
            columns={x.name: y.name if y is not None else x.name for x, y in zip(_input_entities, _map_entities)}
        )
        # df.columns = [x.name for x in _map_entities]
        df.index = df[[FundAnnouncement.fund_id.name, ]]
        df.drop([DFundAnnouncement.data_source.name], axis=1, inplace=True)
        return df

    fids = sorted(pd.read_sql("SELECT DISTINCT fund_id FROM fund_info", _engine_wt)["fund_id"].tolist())
    for fid in fids:
        try:
            print(fid)
            res = tmp(fid)
            io.to_sql(FundAnnouncement.__tablename__, _engine_wt, res, chunksize=10)
            io.to_sql(FundAnnouncement.__tablename__, _engine_wt2, res, chunksize=10)
        except Exception:
            continue
Example 19
File: fund_nv.py Project: dxcv/fund
import pandas as pd
from sqlalchemy import and_
from sqlalchemy.orm import sessionmaker
from utils.database import io, config as cfg
from utils.algorithm import etl
from utils.database.models.config import ConfigSource
from utils.database.models.base_public import FundInfo, FundNvSource, FundNv

_engine_wt = cfg.load_engine()["2Gbp"]
_db_session = sessionmaker(bind=_engine_wt)
_session = _db_session()
UPDATE_TIME = etl.update_time["incremental"]

_entities_map = [
    (FundInfo.fund_id, FundNv.fund_id),
    (FundInfo.fund_name, FundNv.fund_name),
    (FundNvSource.data_source, FundNv.data_source),
    (FundNvSource.statistic_date, FundNv.statistic_date),
    (FundNvSource.nav, FundNv.nav),
    (FundNvSource.added_nav, FundNv.added_nav),
]
_input_entities = [x[0] for x in _entities_map]
_map_entities = [x[1] for x in _entities_map]

_derivative_entities = []
_output_entities = [*_map_entities, *_derivative_entities]


def fetch_multisource_nv(update_time_l, update_time_r=None):
    """
    Fetch records from the FundNvSource table where record update time >= `update_time_l`
Example 20
Sync - 4119product DB sync, part 1 (3H)
"""
import sys
import os
try:
    sys.path.append(os.getcwd()[:os.getcwd().index("SCRIPT")])
except ValueError:  # cwd is not inside a SCRIPT tree
    pass

import datetime as dt
from dateutil.relativedelta import relativedelta
from utils.database import config as cfg
from utils.synckit.cli import tools
from utils.synckit.mysqlreader import splitter

ENGINE_RD = cfg.load_engine()["2Gb"]
ENGINE_4119PRODUCT = cfg.load_engine()["4Gp"]

tasks = [
    (splitter.MysqlReader("fund_info", ENGINE_RD),
     splitter.MysqlReader("fund_info", ENGINE_4119PRODUCT)),
    (splitter.MysqlReader("fund_asset_scale", ENGINE_RD),
     splitter.MysqlReader("fund_asset_scale", ENGINE_4119PRODUCT)),
    (splitter.MysqlReader("fund_allocation_data", ENGINE_RD),
     splitter.MysqlReader("fund_allocation_data", ENGINE_4119PRODUCT)),
    (splitter.MysqlReader("fund_fee_data", ENGINE_RD),
     splitter.MysqlReader("fund_fee_data", ENGINE_4119PRODUCT)),
    (splitter.MysqlReader("fund_manager_mapping", ENGINE_RD),
     splitter.MysqlReader("fund_manager_mapping", ENGINE_4119PRODUCT)),
    (splitter.MysqlReader("fund_org_mapping", ENGINE_RD),
     splitter.MysqlReader("fund_org_mapping", ENGINE_4119PRODUCT)),
Example 21
    "商业贸易": 450000,
    "休闲服务": 460000,
    "银行": 480000,
    "非银金融": 490000,
    "综合": 510000,
    "建筑材料": 610000,
    "建筑装饰": 620000,
    "电气设备": 630000,
    "机械设备": 640000,
    "国防军工": 650000,
    "计算机": 710000,
    "传媒": 720000,
    "通信": 730000
}

engine_wt = cfg.load_engine()["2Gt"]

_path = "C:/Users/Yu/Desktop/上市股票一览.xlsx"


def read_type(path):
    df = pd.read_excel(path)
    df = df[["代码", "名称", "申万一级分类名称"]]
    df.columns = ["subject_id", "subject_name", "type_name_sws"]
    df.loc[df["type_name_sws"] == 0, "type_name_sws"] = None
    df["type_code_sws"] = df["type_name_sws"].apply(
        lambda x: str(type_code_sws.get(x)))
    return df


def main():
Example 22
import numpy as np
import pandas as pd
from utils.database import io, config as cfg
from utils.script.scriptutils import tic

# fix fund_nv_data_source records with data_source == 2 where added_nav is empty

engines = cfg.load_engine()
engine_rd = engines["2Gb"]
engine_wt = engines["2Gb"]


def check_equal(dataframe):
    # NOTE: mixes positional (list.index) and label-based (index[-1] + 1)
    # offsets, so it assumes the default contiguous RangeIndex.
    dataframe = dataframe.dropna(subset=["nav", "added_nav"])
    is_equivalent = list(dataframe["nav"] == dataframe["added_nav"])
    if len(is_equivalent) >= 1:
        if False in is_equivalent:
            first_unequal = is_equivalent.index(False)
        else:
            first_unequal = dataframe.index[-1] + 1
    else:
        return 0
    return first_unequal
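
A quick sanity check of check_equal, appended to the snippet (using the default RangeIndex the function assumes):

import pandas as pd

print(check_equal(pd.DataFrame({"nav": [1.0, 1.1, 1.2],
                                "added_nav": [1.0, 1.1, 1.3]})))      # 2: first unequal row
print(check_equal(pd.DataFrame({"nav": [1.0], "added_nav": [1.0]})))  # 1: all equal, one past the end
print(check_equal(pd.DataFrame({"nav": [], "added_nav": []})))        # 0: empty input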


def main():
    sql_tgt = "SELECT fund_id, statistic_date, nav, added_nav, swanav1 FROM fund_nv_data_source WHERE data_source = 2 AND \
               fund_id IN (SELECT DISTINCT fund_id FROM fund_nv_updata_source WHERE data_source = 2)\
               ORDER BY fund_id ASC, statistic_date ASC"

    df = pd.read_sql(sql_tgt, engine_rd)
Example 23
from utils.database import config as cfg, io
from utils.etlkit.core.base import Stream, Confluence
from utils.etlkit.core import transform
from utils.etlkit.reader.mysqlreader import MysqlInput

ENGINE_RD = cfg.load_engine()["2Gb"]


class StreamsMain:
    @classmethod
    def stream_03xxxx_type2(cls):
        """
            清洗id_match, 03xxx源基金管理人
        """
        sql = "SELECT idm.matched_id, oi.org_id, oi.org_full_name " \
              "FROM (" \
              "SELECT DISTINCT matched_id,source FROM base.id_match " \
              "WHERE id_type = 1 AND is_used = 1 AND source LIKE '030%%'" \
              "AND matched_id NOT IN (" \
              "SELECT fund_id FROM base.fund_org_mapping WHERE org_type_code=2)) idm " \
              "JOIN data_test.source_info_org as sig ON idm.source = sig.source_id " \
              "JOIN base.org_info oi ON sig.org_id = oi.org_id"

        inp = MysqlInput(ENGINE_RD, sql)

        vm = transform.ValueMap({
            "org_type": "基金管理人",
            "org_type_code": 2
        })
Example 24
import datetime as dt
import pandas as pd
from utils.algorithm.base.exceptions import DataError
from utils.algorithm.fama import resultproxy
from utils.database import config as cfg, io
from utils.timeutils import const

engine = cfg.load_engine()["2Gf"]


def cal(start=None, end=None):
    start = start or dt.date(2015, 4, 1)
    end = end or dt.date.today()
    dates = [x.date() for x in pd.date_range(start, end, freq=const.bday_chn)]
    for ed in dates:
        try:
            prox = resultproxy.Fama3(ed, "d")
            io.to_sql("factor_style_d", engine, prox.result)
            print(prox.result)
        except DataError:
            continue


def main():
    cal()


if __name__ == "__main__":
    main()
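
const.bday_chn above is a project-specific trading-calendar frequency; a generic stand-in built from pandas' CustomBusinessDay (with a hypothetical holiday list) shows the idea:

import pandas as pd
from pandas.tseries.offsets import CustomBusinessDay

bday = CustomBusinessDay(holidays=["2015-04-06"])  # hypothetical holiday
dates = [x.date() for x in pd.date_range("2015-04-01", "2015-04-10", freq=bday)]
print(dates)  # weekends and the listed holiday are skipped
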
Example 25
from utils.database import config as cfg, io
from utils.etlkit.core.base import Stream, Confluence
from utils.etlkit.core import transform
from utils.etlkit.reader.mysqlreader import MysqlNativeInput
from utils.database.sqlfactory import SQL
from collections import OrderedDict
import re
import datetime as dt
from dateutil.relativedelta import relativedelta

engine_c = cfg.load_engine()["2Gcpri"]
engine_b = cfg.load_engine()["etl_base_private"]
engine_t = cfg.load_engine()["etl_base_test"]


def fetch_fids_to_update():
    upt = (dt.datetime.now() -
           relativedelta(minutes=12)).strftime("%Y%m%d%H%M%S")
    fids = []
    sql_srcs = [
        "SELECT DISTINCT matched_id FROM base.id_match WHERE update_time >= '{upt}'"
        .format(upt=upt),
        "SELECT DISTINCT fund_id FROM crawl_private.y_fund_info WHERE update_time >= '{upt}' AND is_used = 1"
        .format(upt=upt),
        "SELECT DISTINCT im.matched_id FROM crawl_private.x_fund_info_fundaccount xfi JOIN base.id_match im ON xfi.fund_id = im.source_id AND im.source='010002' WHERE xfi.update_time >= '{upt}' AND im.id_type=1 AND im.is_used=1"
        .format(upt=upt),
        "SELECT DISTINCT im.matched_id FROM crawl_private.x_fund_info_private xfi JOIN base.id_match im ON xfi.fund_id = im.source_id AND im.source='010003' WHERE xfi.update_time >= '{upt}' AND im.id_type=1 AND im.is_used=1"
        .format(upt=upt),
        "SELECT DISTINCT im.matched_id FROM crawl_private.x_fund_info_securities xfi JOIN base.id_match im ON xfi.fund_id = im.source_id AND im.source='010004' WHERE xfi.update_time >= '{upt}' AND im.id_type=1 AND im.is_used=1"
        .format(upt=upt),
        "SELECT DISTINCT im.matched_id FROM crawl_private.x_fund_info_futures xfi JOIN base.id_match im ON xfi.fund_id = im.source_id AND im.source='010005' WHERE xfi.update_time >= '{upt}' AND im.id_type=1 AND im.is_used=1"
Example 26
from utils.database import config as cfg
from utils.etlkit.core import transform
from utils.etlkit.core.base import Frame, Stream, Confluence
from utils.etlkit.reader.mysqlreader import MysqlNativeInput, MysqlOrmInput
from sqlalchemy import and_
from sqlalchemy.orm import sessionmaker, Session
from utils.database.models.crawl_finance import DBondRating
from utils.database.models.base_finance import BondRating
from utils.database import io
import re


engine_r = cfg.load_engine()["etl_finance"]
engine_w = cfg.load_engine()["etl_base_finance"]
dbsession = sessionmaker()


def stream_020004():
    """
    债券评级信息(万得)
    """
    session = dbsession(bind=engine_r)

    stmt = session.query(DBondRating).filter(
        and_(DBondRating.source_id == "020004", DBondRating.is_used == 1)
    ).with_entities(
        DBondRating.bond_id, DBondRating.source_id, DBondRating.statistic_date, DBondRating.rating_type,
        DBondRating.credit_rating, DBondRating.rating_agency, DBondRating.rating_outlook
    )

    vm = transform.ValueMap(
Example 27
import datetime as dt
import pandas as pd
from utils.database import io, config as cfg

engine_rd = cfg.load_engine()["2Gb"]
engine_wt = cfg.load_engine()["2Gb"]

yesterday = dt.date.today() - dt.timedelta(1)
sql_fi = "SELECT fund_id FROM fund_info"
sql_fom = "SELECT fund_id, org_name, org_type_code FROM fund_org_mapping WHERE org_type_code IN (1, 2)"


def reduce_dup(df):
    df_tmp = df.copy()
    df_tmp.index = df_tmp["fund_id"]
    grouped = df.groupby("fund_id")["fund_id"]
    grouped_cnt = grouped.count()
    idxs = df_tmp.drop_duplicates(subset=["fund_id"]).index[grouped_cnt >= 2]
    return df_tmp.loc[idxs]


def gen_dict(df):
    dict_dup = {}

    df_2 = reduce_dup(df)
    for idx in df_2.index:
        tmp = str(df_2.loc[idx]["org_name"].tolist())[1:-1]
        tmp = tmp.replace("'", "")
        dict_dup[idx] = tmp

    df_1 = df.loc[df["fund_id"].apply(lambda x: x not in set(df_2.index))]
Example 28
File: tools.py Project: dxcv/fund
from utils.database import config as cfg
from utils.decofactory import common
from sqlalchemy.engine import reflection
from utils.database.sqlfactory import SQL
import pandas as pd
import re

ENGINE = cfg.load_engine()["2Gb"]


class TableComparer:
    def __init__(self,
                 table_new,
                 table_old,
                 engine,
                 engine_old=None,
                 **kwargs):
        """

        Args:
            table_new: str
                in "schema.table" format
            table_old: str
                in "schema.table" format
            engine: sqlalchemy.engine.base.Engine
            engine_old: sqlalchemy.engine.base.Engine, default None
                if None, use `engine` instead

            **kwargs:
                col_map: dict
                    old-to-new column-name mapping, e.g. {"col_old_name": "col_new_name", ...}
Example 29
import numpy as np
from utils.database import config as cfg, io
from utils.etlkit.core.base import Stream, Confluence
from utils.etlkit.core import transform
from utils.etlkit.reader.mysqlreader import MysqlNativeInput

engine_t = cfg.load_engine()["etl_base_test"]
engines = cfg.load_engine()
engine_base, engine_crawl_private = engines["2Gb"], engines["2Gcpri"]


def turn_dict(lst):
    """Flatten nested [key, value(, extra)] lists into a flat dict."""
    dic = {}
    if all([not isinstance(item, list) for item in lst]):
        if len(lst) == 2:
            key, value = lst
        elif len(lst) == 1:
            key = lst[0]
            value = ''
        elif len(lst) == 3:
            key = lst[0]
            value = lst[1]
        else:
            return dic  # nothing recognizable to extract
        dic[key] = value
    else:
        for item in lst:
            subdic = turn_dict(item)
            dic.update(subdic)
    return dic
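
Appended to the snippet, a quick check of what turn_dict produces on mixed input: 2-element pairs map directly, 1-element lists get an empty value, 3-element lists keep the first two items, and nesting is flattened recursively:

print(turn_dict([["a", 1], ["b"], [["c", 2, "extra"]]]))
# {'a': 1, 'b': '', 'c': 2}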

Example 30
class MainStream:
    engine = cfg.load_engine()["2Gbp"]
    upt_from = dt.datetime.now() - relativedelta(hours=3)
    upt_until = dt.datetime.now()
    from multiprocessing.dummy import Pool as ThreadPool
    pool = ThreadPool(8)

    @classmethod
    def _clean_amount(cls, string):
        unit_trans = {"万": 1, "亿": 1e4}
        sre = re.search("(?P<amt>\d*(\.\d*)?)(?P<unit>万|亿).*", string)
        if sre:
            return float(sre.groupdict()["amt"]) * unit_trans.get(
                sre.groupdict()["unit"], 1)
        return None

    @classmethod
    def _fetch_fund_ids(cls, fetch_all=False):
        if fetch_all:
            sql = "SELECT DISTINCT fund_id FROM base_public.fund_info"
        else:
            sql = "SELECT DISTINCT matched_id FROM crawl_public.d_fund_position dfp " \
                  "JOIN base_public.id_match im ON dfp.fund_id = im.source_id AND dfp.data_source = im.data_source " \
                  "WHERE dfp.update_time >= '{upt_from}' AND dfp.update_time <= '{upt_until}'".format(
                upt_from=str(cls.upt_from), upt_until=str(cls.upt_until))

        fids = sorted([x[0] for x in cls.engine.execute(sql).fetchall()])
        return fids

    @classmethod
    def stream_020002(cls, fund_ids):
        sql = "SELECT im.matched_id, fi.fund_name, im.data_source, dfp.statistic_date, dfp.subject_id, dfp.subject_name, dfp.scale, " \
              "dfp.proportion, fas.total_asset " \
              "FROM base_public.id_match im " \
              "JOIN crawl_public.d_fund_position dfp ON im.source_id = dfp.fund_id AND im.data_source = dfp.data_source " \
              "JOIN base_public.fund_info fi ON im.matched_id = fi.fund_id " \
              "LEFT JOIN base_public.fund_asset_scale fas " \
              "ON dfp.fund_id = fas.fund_id AND dfp.statistic_date = fas.statistic_date " \
              "WHERE type = '债券' AND im.matched_id IN {fids} AND im.id_type = 1 AND im.is_used = 1 " \
              "AND im.data_source = '020002'".format(fids=sf.SQL.values4sql(fund_ids))

        inp = MysqlInput(cls.engine, sql)

        vm = transform.ValueMap({
            "proportion":
            lambda x: float(x.replace("%", "")) / 100,
            "scale":
            lambda x: cls._clean_amount(x),
        })

        sk = transform.MapSelectKeys({
            "matched_id": "fund_id",
            "fund_name": None,
            "data_source": None,
            "statistic_date": None,
            "subject_id": None,
            "subject_name": None,
            "scale": None,
            "proportion": "proportion_net",
            "total_asset": "asset_scale",
        })

        return base.Stream(inp, transform=[vm, sk])

    @classmethod
    def stream_020001(cls, fund_ids):
        sql = "SELECT im.matched_id, fi.fund_name, im.data_source, dfp.statistic_date, dfp.subject_id, dfp.subject_name, " \
              "dfp.scale, dfp.proportion, fas.total_asset " \
              "FROM base_public.id_match im " \
              "JOIN crawl_public.d_fund_position dfp " \
              "ON im.source_id = dfp.fund_id AND im.data_source = dfp.data_source " \
              "JOIN base_public.fund_info fi ON im.matched_id = fi.fund_id " \
              "left JOIN base_public.fund_asset_scale fas ON dfp.fund_id = fas.fund_id AND dfp.statistic_date = fas.statistic_date " \
              "WHERE type = '债券' AND im.matched_id IN {fids} AND im.id_type = 1 AND im.is_used = 1 " \
              "AND dfp.fund_id NOT IN (" \
              "SELECT DISTINCT fund_id FROM crawl_public.d_fund_position WHERE data_source = '020002')" \
              "AND im.data_source = '020001'".format(fids=sf.SQL.values4sql(fund_ids))

        inp = MysqlInput(cls.engine, sql)

        vm = transform.ValueMap({
            "proportion":
            lambda x: float(x.replace("%", "")) / 100,
            "scale":
            lambda x: cls._clean_amount(x),
        })

        sk = transform.MapSelectKeys({
            "matched_id": "fund_id",
            "fund_name": None,
            "data_source": None,
            "statistic_date": None,
            "subject_id": None,
            "subject_name": None,
            "scale": None,
            "proportion": "proportion_net",
            "total_asset": "asset_scale",
        })

        return base.Stream(inp, transform=[vm, sk])

    @classmethod
    def stream_020003(cls, fund_ids):
        sql = "SELECT im.matched_id, fi.fund_name, im.data_source, dfp.subject_id, dfp.subject_name, dfp.statistic_date, " \
              "dfp.scale, dfp.proportion, fas.total_asset " \
              "FROM base_public.id_match im " \
              "JOIN crawl_public.d_fund_position dfp " \
              "ON im.source_id = dfp.fund_id AND im.data_source = dfp.data_source " \
              "JOIN base_public.fund_info fi ON im.matched_id = fi.fund_id " \
              "LEFT JOIN base_public.fund_asset_scale fas ON dfp.fund_id = fas.fund_id AND dfp.statistic_date = fas.statistic_date " \
              "WHERE type = '债券' AND im.matched_id IN {fids} AND im.id_type = 1 AND im.is_used = 1 " \
              "AND dfp.fund_id NOT IN (" \
              "SELECT DISTINCT fund_id FROM crawl_public.d_fund_position WHERE data_source IN ('020002', '020001'))" \
              "AND im.data_source = '020003'".format(fids=sf.SQL.values4sql(fund_ids))

        inp = MysqlInput(cls.engine, sql)

        vm = transform.ValueMap({
            "proportion":
            lambda x: float(x.replace("%", "")) / 100,
            "scale":
            lambda x: cls._clean_amount(x),
        })

        sk = transform.MapSelectKeys({
            "matched_id": "fund_id",
            "fund_name": None,
            "data_source": None,
            "statistic_date": None,
            "subject_id": None,
            "subject_name": None,
            "scale": None,
            "proportion": "proportion_net",
            "total_asset": "asset_scale",
        })

        return base.Stream(inp, transform=[vm, sk])

    @classmethod
    def clean(cls):
        fids = cls._fetch_fund_ids(True)
        cls.save(cls.stream_020001(fids))
        cls.save(cls.stream_020003(fids))
        chunks = [fids[i:i + 100] for i in range(0, len(fids), 100)]
        for id_chunk in chunks:
            cls.pool.apply_async(cls.stream_020002,
                                 args=(id_chunk, ),
                                 callback=cls.save)
        cls.pool.close()
        cls.pool.join()

    @classmethod
    def save(cls, stream):
        io.to_sql("base_test.fund_position_bond_test_20180515", cls.engine,
                  stream.flow()[0])
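
_clean_amount normalizes Chinese scale suffixes to 万 (ten-thousand) units; a standalone restatement for reference:

import re

def clean_amount(string):  # standalone copy of MainStream._clean_amount
    unit_trans = {"万": 1, "亿": 1e4}
    sre = re.search(r"(?P<amt>\d*(\.\d*)?)(?P<unit>万|亿).*", string)
    if sre:
        return float(sre.groupdict()["amt"]) * unit_trans.get(sre.groupdict()["unit"], 1)
    return None

print(clean_amount("3.5亿"))   # 35000.0
print(clean_amount("1200万"))  # 1200.0
print(clean_amount("n/a"))     # None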