def work2(df, f):
    """Apply the later feature splits of the ``sp500`` set to ``df`` and analyse them.

    The ``base1`` features of ``sp500`` are loaded and split with
    ``feat_select.split_dates`` into three parts.  Only the second and third
    parts are used: each is applied to the running frame via
    ``feat_select.apply`` (passing ``"label5"``, with suffixes ``_p2`` and
    ``_p3``), and the result goes to ``feat_select.ana2``.

    Args:
        df: frame handed to the first ``feat_select.apply`` call.
        f: forwarded unchanged to ``feat_select.ana2`` (presumably an open
            report file -- confirm against the caller).
    """
    dataset = "sp500"
    indicator = "base1"
    feats = feat_select.load_feat(indicator, dataset)
    # The first part of the split is not needed by this step.
    _, second_part, third_part = feat_select.split_dates(feats)
    applied = feat_select.apply(df, second_part, "label5", "_p2")
    applied = feat_select.apply(applied, third_part, "label5", "_p3")
    feat_select.ana2(applied, f, dataset)
def work2(df, f):
    """Run the ``_p2`` / ``_p3`` feature application for ``sp500`` + ``base1``.

    ``feat_select.split_dates`` is expected to yield exactly three parts;
    the last two are applied to ``df`` one after the other (with
    ``"label5"``) and the combined frame is passed, together with ``f`` and
    the set name, to ``feat_select.ana2``.
    """
    setname = "sp500"
    splits = feat_select.split_dates(feat_select.load_feat("base1", setname))
    # Unpacking keeps the "exactly three parts" requirement of the split.
    _unused, p2_feats, p3_feats = splits
    result = feat_select.apply(
        feat_select.apply(df, p2_feats, "label5", "_p2"),
        p3_feats,
        "label5",
        "_p3",
    )
    feat_select.ana2(result, f, setname)
# Example #3
0
def phase1_dump(taname, setname):
    """Pickle the flattened phase-1 feature metas of one set.

    Loads the features with ``feat_select.load_feat(taname, setname)``,
    keeps the first of the three parts returned by
    ``feat_select.split_dates``, and writes
    ``feat_select.flat_metas(feat_select.get_metas(phase1))`` to
    ``<root>/data/feat_select/phase1_dump/<setname>_<taname>.pkl``.

    Args:
        taname: first argument of ``load_feat``; also part of the file name.
        setname: second argument of ``load_feat``; also part of the file name.

    Raises:
        OSError: if the output directory cannot be created.
    """
    import errno

    dfTa = feat_select.load_feat(taname, setname)
    # Only the first part is dumped; the other two are ignored here.
    phase1, _, _ = feat_select.split_dates(dfTa)
    dfmetas = feat_select.flat_metas(feat_select.get_metas(phase1))
    outdir = os.path.join(root, "data", "feat_select", "phase1_dump")
    # Create-then-handle instead of exists()+makedirs(): the check-then-act
    # form crashes if another process creates the directory in between.
    try:
        os.makedirs(outdir)
    except OSError as exc:
        if exc.errno != errno.EEXIST or not os.path.isdir(outdir):
            raise
    dfmetas.to_pickle(os.path.join(outdir, "%s_%s.pkl" % (setname, taname)))
# Example #4
0
def phase1_dump(taname, setname):
    """Dump the phase-1 feature metas for ``(setname, taname)`` as a pickle.

    The features come from ``feat_select.load_feat``; the first element of
    ``feat_select.split_dates`` is turned into metas
    (``get_metas`` then ``flat_metas``) and pickled as
    ``<setname>_<taname>.pkl`` under ``<root>/data/feat_select/phase1_dump``.
    The directory is created when missing.

    Raises:
        OSError: when the output directory does not exist and cannot be
            created.
    """
    import errno

    feats = feat_select.load_feat(taname, setname)
    first_part, _, _ = feat_select.split_dates(feats)
    metas = feat_select.flat_metas(feat_select.get_metas(first_part))

    outdir = os.path.join(root, "data", "feat_select", "phase1_dump")
    try:
        os.makedirs(outdir)
    except OSError as err:
        # An already existing directory is fine (possibly created by a
        # concurrent run); anything else is a real failure.
        if err.errno != errno.EEXIST or not os.path.isdir(outdir):
            raise

    target = os.path.join(outdir, "%s_%s.pkl" % (setname, taname))
    metas.to_pickle(target)
def work(df, f):
    """Run the ``_p2`` / ``_p3`` feature application for ten ``sp500`` windows.

    The windows are ``sp500R0T50``, ``sp500R50T100``, ... ``sp500R450T500``.
    For each one the ``base1`` features are loaded and split into three
    parts; parts two and three are applied to ``df`` (with ``"label5"``) and
    the result is handed to ``feat_select.ana2`` along with ``f`` and the
    window's set name.
    """
    indicator = "base1"
    width = 50
    # Window starts 0, 50, ..., 450 -- ten windows in total.
    for start in range(0, 10 * width, width):
        dataset = "sp500R%dT%d" % (start, start + width)
        feats = feat_select.load_feat(indicator, dataset)
        _, second_part, third_part = feat_select.split_dates(feats)
        applied = feat_select.apply(df, second_part, "label5", "_p2")
        applied = feat_select.apply(applied, third_part, "label5", "_p3")
        feat_select.ana2(applied, f, dataset)
def work(df, f):
    """Analyse ``df`` against each of the ten ``sp500R<from>T<to>`` sets.

    Every set covers a 50-wide range (0-50 up to 450-500).  Per set, the
    second and third parts of ``feat_select.split_dates`` on the ``base1``
    features are applied to ``df`` in turn, and the combined frame is passed
    to ``feat_select.ana2`` with ``f`` and the set name.
    """
    setnames = ["sp500R%dT%d" % (50 * k, 50 * k + 50) for k in range(10)]
    for setname in setnames:
        parts = feat_select.split_dates(
            feat_select.load_feat("base1", setname))
        # Exactly three parts are expected; the first one is unused here.
        _unused, p2_feats, p3_feats = parts
        df2 = feat_select.apply(df, p2_feats, "label5", "_p2")
        df2 = feat_select.apply(df2, p3_feats, "label5", "_p3")
        feat_select.ana2(df2, f, setname)
def cross_test(df, sets, dates, name, depth):
    f = open(os.path.join(dataroot, "feat_select_phase1_%s_%d.ana" \
                          % (name,args.depth)), "w")
    feat_select.ana_fmetas(df, "base1", "sp500", f)

    abs_direct_p_set = Counter(set(df[df.direct == 1].name.unique()))
    abs_direct_n_set = Counter(set(df[df.direct == -1].name.unique()))

    orig_direct_p_set = abs_direct_p_set.copy()
    orig_direct_n_set = abs_direct_n_set.copy()
    print len(set(abs_direct_p_set))
    print len(set(abs_direct_n_set))

    abs_direct_p_set = abs_direct_p_set + abs_direct_p_set
    abs_direct_n_set = abs_direct_n_set + abs_direct_n_set
    print >> f, "=" * 8
    for s in sets:
        for d in dates:
            setname = s
            taname = "base1"
            filename = os.path.join(
                dataroot, "phase1_dump",
                "sp500_base1_apply_phase1_%s_%s_%d_%s_%s.pkl" %
                (setname, taname, args.depth, d[0], d[1]))
            if not os.path.exists(filename):
                fs = feat_select.load_feat(taname, setname, d[0], d[1])
                df2 = feat_select.apply(df, fs, "label5", "_p1")
                df2.to_pickle(filename)
            df2 = pd.read_pickle(filename)
            feat_select.ana_apply(df2, "_p1", setname, f)
            cur_p_set = set(df2[df2.direct_p1 == 1].name.unique())
            cur_n_set = set(df2[df2.direct_p1 == -1].name.unique())
            abs_direct_p_set = abs_direct_p_set - Counter(
                set(abs_direct_p_set) - cur_p_set)
            abs_direct_n_set = abs_direct_n_set - Counter(
                set(abs_direct_n_set) - cur_n_set)
            print len(set(abs_direct_n_set))
    df.loc[:,"istable"] = df.apply(lambda row: 1 if row["name"] in set(abs_direct_p_set) else \
             (1 if row["name"] in set(abs_direct_n_set) else 0), axis = 1)
    df.loc[:, "direct"] = df.apply(lambda row: 0
                                   if row["istable"] == 0 else row["direct"],
                                   axis=1)
    df.to_pickle(os.path.join(dataroot,
                                "phase1_dump",
                                "sp500_base1_%s_%d_stable.pkl" % \
                                (name,args.depth)
                             )
                 )
    print "|%d|%d|%d|" % (len(
        set(orig_direct_p_set)), len(set(abs_direct_p_set)),
                          len(set(orig_direct_p_set) - set(abs_direct_p_set)))
    print "|%d|%d|%d|" % (len(
        set(orig_direct_n_set)), len(set(abs_direct_n_set)),
                          len(set(orig_direct_n_set) - set(abs_direct_n_set)))
    print >> f, "## stable feats on postive direct"
    for name in abs_direct_p_set:
        idx = 0
        for i, each in df[df.name == name].iterrows():
            print >> f, "|%s|%.4f|%.4f|" % (each["fname"], each["start"],
                                            each["end"])
            assert idx < 1
            idx += 1

    print >> f, "## UNstable feats on postive direct"
    for name in set(orig_direct_p_set) - set(abs_direct_p_set):
        idx = 0
        for i, each in df[df.name == name].iterrows():
            print >> f, "|%s|%.4f|%.4f|" % (each["fname"], each["start"],
                                            each["end"])
            assert idx < 1
            idx += 1

    print >> f, "## stable feats on negtive direct"
    for name in set(abs_direct_n_set):
        idx = 0
        for i, each in df[df.name == name].iterrows():
            print >> f, "|%s|%.4f|%.4f|" % (each["fname"], each["start"],
                                            each["end"])
            assert idx < 1
            idx += 1

    print >> f, "## unstable feats on negtive direct"
    for name in set(orig_direct_n_set) - set(abs_direct_n_set):
        idx = 0
        for i, each in df[df.name == name].iterrows():
            print >> f, "|%s|%.4f|%.4f|" % (each["fname"], each["start"],
                                            each["end"])
            assert idx < 1
            idx += 1

    f.close()
    orig_direct_n_set = abs_direct_n_set.copy()
    print len(abs_direct_p_set)
    print len(abs_direct_n_set)
    print >> f, "=" * 8
    for i in range(10):
        frm = 50 * i
        to = frm + 50
        setname = "sp500R%dT%d" % (frm, to)
        taname = "base1"
        filename = os.path.join(
            dataroot, "phase1_dump", "sp500_base1_apply_phase1_%s_%s_%d.pkl" %
            (setname, taname, args.depth))
        if not os.path.exists(filename):
            df2 = feat_select.apply(
                df,
                feat_select.split_dates(feat_select.load_feat(taname,
                                                              setname))[0],
                "label5", "_p1")

            df2.to_pickle(filename)
        df2 = pd.read_pickle(filename)
        feat_select.ana_apply(df2, "_p1", setname, f)
        cur_p_set = set(df2[df2.direct_p1 == 1].name.unique())
        cur_n_set = set(df2[df2.direct_p1 == -1].name.unique())
        abs_direct_p_set = abs_direct_p_set.intersection(cur_p_set)
        abs_direct_n_set = abs_direct_n_set.intersection(cur_n_set)
        print list(abs_direct_n_set)
    df.loc[:,"istable"] = df.apply(lambda row: 1 if row["name"] in abs_direct_p_set else \
             (1 if row["name"] in abs_direct_n_set else 0), axis = 1)
    df.loc[:, "direct"] = df.apply(lambda row: 0
                                   if row["istable"] == 0 else row["direct"],
                                   axis=1)
def cross_test(df, sets, dates, name, depth):
    f = open(os.path.join(dataroot, "feat_select_phase1_%s_%d.ana" % (name, args.depth)), "w")
    feat_select.ana_fmetas(df, "base1", "sp500", f)

    abs_direct_p_set = Counter(set(df[df.direct == 1].name.unique()))
    abs_direct_n_set = Counter(set(df[df.direct == -1].name.unique()))

    orig_direct_p_set = abs_direct_p_set.copy()
    orig_direct_n_set = abs_direct_n_set.copy()
    print len(set(abs_direct_p_set))
    print len(set(abs_direct_n_set))

    abs_direct_p_set = abs_direct_p_set + abs_direct_p_set
    abs_direct_n_set = abs_direct_n_set + abs_direct_n_set
    print >> f, "=" * 8
    for s in sets:
        for d in dates:
            setname = s
            taname = "base1"
            filename = os.path.join(
                dataroot,
                "phase1_dump",
                "sp500_base1_apply_phase1_%s_%s_%d_%s_%s.pkl" % (setname, taname, args.depth, d[0], d[1]),
            )
            if not os.path.exists(filename):
                fs = feat_select.load_feat(taname, setname, d[0], d[1])
                df2 = feat_select.apply(df, fs, "label5", "_p1")
                df2.to_pickle(filename)
            df2 = pd.read_pickle(filename)
            feat_select.ana_apply(df2, "_p1", setname, f)
            cur_p_set = set(df2[df2.direct_p1 == 1].name.unique())
            cur_n_set = set(df2[df2.direct_p1 == -1].name.unique())
            abs_direct_p_set = abs_direct_p_set - Counter(set(abs_direct_p_set) - cur_p_set)
            abs_direct_n_set = abs_direct_n_set - Counter(set(abs_direct_n_set) - cur_n_set)
            print len(set(abs_direct_n_set))
    df.loc[:, "istable"] = df.apply(
        lambda row: 1 if row["name"] in set(abs_direct_p_set) else (1 if row["name"] in set(abs_direct_n_set) else 0),
        axis=1,
    )
    df.loc[:, "direct"] = df.apply(lambda row: 0 if row["istable"] == 0 else row["direct"], axis=1)
    df.to_pickle(os.path.join(dataroot, "phase1_dump", "sp500_base1_%s_%d_stable.pkl" % (name, args.depth)))
    print >> f, "|%d|%d|%d|" % (
        len(set(orig_direct_p_set)),
        len(set(abs_direct_p_set)),
        len(set(orig_direct_p_set) - set(abs_direct_p_set)),
    )
    print >> f, "|%d|%d|%d|" % (
        len(set(orig_direct_n_set)),
        len(set(abs_direct_n_set)),
        len(set(orig_direct_n_set) - set(abs_direct_n_set)),
    )
    print >> f, "## stable feats on postive direct"

    print >> f, "=" * 8, "Top feature analysis"
    for name in abs_direct_p_set:
        idx = 0
        for i, each in df[df.name == name].iterrows():
            print >> f, "|%s|%.4f|%.4f|" % (each["fname"], each["start"], each["end"])
            assert idx < 1
            idx += 1

    print >> f, "## UNstable feats on postive direct"
    for name in set(orig_direct_p_set) - set(abs_direct_p_set):
        idx = 0
        for i, each in df[df.name == name].iterrows():
            print >> f, "|%s|%.4f|%.4f|" % (each["fname"], each["start"], each["end"])
            assert idx < 1
            idx += 1

    print >> f, "## stable feats on negtive direct"
    for name in set(abs_direct_n_set):
        idx = 0
        for i, each in df[df.name == name].iterrows():
            print >> f, "|%s|%.4f|%.4f|" % (each["fname"], each["start"], each["end"])
            assert idx < 1
            idx += 1

    print >> f, "## unstable feats on negtive direct"
    for name in set(orig_direct_n_set) - set(abs_direct_n_set):
        idx = 0
        for i, each in df[df.name == name].iterrows():
            print >> f, "|%s|%.4f|%.4f|" % (each["fname"], each["start"], each["end"])
            assert idx < 1
            idx += 1

    f.close()