Example #1
    def test_replace_mixed(self):
        self.mixed_frame.ix[5:20, "foo"] = nan
        self.mixed_frame.ix[-10:, "A"] = nan

        result = self.mixed_frame.replace(np.nan, -18)
        expected = self.mixed_frame.fillna(value=-18)
        assert_frame_equal(result, expected)
        assert_frame_equal(result.replace(-18, nan), self.mixed_frame)

        result = self.mixed_frame.replace(np.nan, -1e8)
        expected = self.mixed_frame.fillna(value=-1e8)
        assert_frame_equal(result, expected)
        assert_frame_equal(result.replace(-1e8, nan), self.mixed_frame)

        # int block upcasting
        df = DataFrame({"A": Series([1.0, 2.0], dtype="float64"), "B": Series([0, 1], dtype="int64")})
        expected = DataFrame({"A": Series([1.0, 2.0], dtype="float64"), "B": Series([0.5, 1], dtype="float64")})
        result = df.replace(0, 0.5)
        assert_frame_equal(result, expected)

        df.replace(0, 0.5, inplace=True)
        assert_frame_equal(df, expected)

        # int block splitting
        df = DataFrame(
            {
                "A": Series([1.0, 2.0], dtype="float64"),
                "B": Series([0, 1], dtype="int64"),
                "C": Series([1, 2], dtype="int64"),
            }
        )
        expected = DataFrame(
            {
                "A": Series([1.0, 2.0], dtype="float64"),
                "B": Series([0.5, 1], dtype="float64"),
                "C": Series([1, 2], dtype="int64"),
            }
        )
        result = df.replace(0, 0.5)
        assert_frame_equal(result, expected)

        # to object block upcasting
        df = DataFrame({"A": Series([1.0, 2.0], dtype="float64"), "B": Series([0, 1], dtype="int64")})
        expected = DataFrame({"A": Series([1, "foo"], dtype="object"), "B": Series([0, 1], dtype="int64")})
        result = df.replace(2, "foo")
        assert_frame_equal(result, expected)

        expected = DataFrame({"A": Series(["foo", "bar"], dtype="object"), "B": Series([0, "foo"], dtype="object")})
        result = df.replace([1, 2], ["foo", "bar"])
        assert_frame_equal(result, expected)

        # test case from
        df = DataFrame({"A": Series([3, 0], dtype="int64"), "B": Series([0, 3], dtype="int64")})
        result = df.replace(3, df.mean().to_dict())
        expected = df.copy().astype("float64")
        m = df.mean()
        expected.iloc[0, 0] = m[0]
        expected.iloc[1, 1] = m[1]
        assert_frame_equal(result, expected)
def kurtosis(str, list):

    s = list
    w = pd.read_csv(str, usecols=s)

    frame = DataFrame(w)

    h = len(w)

    print(h)
    t = frame.mean()

    d = frame.std()

    e = ((w - t) / d) ** 4

    g = e.sum()

    p1 = h * (h + 1)
    p2 = float((h - 1) * (h - 2) * (h - 3))
    p3 = float(3 * ((h - 1) ** 2))
    p4 = (h - 2) * (h - 3)

    i = ((p1 / p2) * g) - (p3 / p4)

    print "kurtosis=", i
def mydeviate(str, list, Deviation=0, MeanAbsDeviation=1, MeanSqDev=0):

    s = list

    w = pd.read_csv(str, usecols=s)

    frame = DataFrame(w)
    t = frame.mean()

    if Deviation == 1:

        b = w - t

        print(b)

    if MeanAbsDeviation == 1:

        a = abs(w - t)  # absolute deviation from the column means
        print(a)

    if MeanSqDev == 1:

        c = (w - t) ** 2
        print(c)

    return
Example #4
def BackTestSignal(dfXAlpha, dfXReturn, XPrice, strategy, riskmgr=None, freq=252):
    dfAlphaWeight = strategy.GenSingleAlphaWeight(dfXAlpha)
    if riskmgr is not None:
        dfAlphaWeight = riskmgr.AdjustAlphaWeight(dfAlphaWeight)
    dfSignalReturn = GenSingleFactorReturn(dfAlphaWeight, dfXReturn)

    # (simple_sharpe, geo_sharpe, sim_mean * N, geo_mean * N, vol)
    sharpe = CalcSharpeRatio(dfSignalReturn["Return"], freq)

    # Detailed Data
    dfLongCount = DataFrame(columns=["LongCount"], data=dfAlphaWeight.apply(lambda s: s[s > 0].count(), axis=1))
    dfShortCount = DataFrame(columns=["ShortCount"], data=dfAlphaWeight.apply(lambda s: s[s < 0].count(), axis=1))
    dfLongExposure = DataFrame(columns=["LongExposure"], data=dfAlphaWeight.apply(lambda s: s[s > 0].sum(), axis=1))
    dfShortExposure = DataFrame(columns=["ShortExposure"], data=dfAlphaWeight.apply(lambda s: s[s < 0].sum(), axis=1))
    dfNetExposure = DataFrame(columns=["NetExposure"], data=dfAlphaWeight.apply(sum, axis=1))
    dfTotalDollarInvest = DataFrame(columns=["I"], data=dfAlphaWeight.apply(lambda s: abs(s).sum(), axis=1))
    dfTotalDollarTraded = DataFrame(
        columns=["D"], data=(dfAlphaWeight - dfAlphaWeight.shift(1)).apply(lambda s: abs(s).sum(), axis=1)
    )
    dfSharesTraded = dfAlphaWeight / XPrice
    dfTotalSharesTraded = DataFrame(
        columns=["Q"], data=(dfSharesTraded - dfSharesTraded.shift(1)).apply(lambda s: abs(s).sum(), axis=1)
    )

    TurnOver = dfTotalDollarTraded.mean()[0] / dfTotalDollarInvest.mean()[0]
    CentsPerShare = 100 * dfSignalReturn["Return"].iloc[1:].mean() / dfTotalSharesTraded.mean()[0]

    dfMetrics = DataFrame(list(sharpe)).T
    dfMetrics.columns = ["Simple Sharpe", "Geo. Sharpe", "Simple Mean", "Geo. Mean", "Anual Vol"]
    dfMetrics["Turnover"] = TurnOver
    dfMetrics["CentsPerShare"] = CentsPerShare
    dfMetrics["AvgHolding"] = 1.0 / TurnOver
    dfMetrics.index = [dfXAlpha.index.name]

    dfSignalReturn = pd.merge(dfSignalReturn, dfLongCount, left_index=True, right_index=True, how="outer")
    dfSignalReturn = pd.merge(dfSignalReturn, dfShortCount, left_index=True, right_index=True, how="outer")
    dfSignalReturn = pd.merge(dfSignalReturn, dfLongExposure, left_index=True, right_index=True, how="outer")
    dfSignalReturn = pd.merge(dfSignalReturn, dfShortExposure, left_index=True, right_index=True, how="outer")
    dfSignalReturn = pd.merge(dfSignalReturn, dfNetExposure, left_index=True, right_index=True, how="outer")
    dfSignalReturn = pd.merge(dfSignalReturn, dfTotalDollarInvest, left_index=True, right_index=True, how="outer")
    dfSignalReturn = pd.merge(dfSignalReturn, dfTotalDollarTraded, left_index=True, right_index=True, how="outer")
    dfSignalReturn = pd.merge(dfSignalReturn, dfTotalSharesTraded, left_index=True, right_index=True, how="outer")

    return dfMetrics, dfSignalReturn, dfAlphaWeight
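The CalcSharpeRatio helper used above is not shown in this example. Based only on the tuple described in the comment and the metric column names, a minimal sketch might look like the following (an assumption, not the original implementation):

import numpy as np

def calc_sharpe_ratio_sketch(returns, freq=252):
    # returns: a pandas Series of per-period strategy returns
    sim_mean = returns.mean()
    geo_mean = np.expm1(np.log1p(returns).mean())  # geometric mean per period
    vol = returns.std()
    simple_sharpe = np.sqrt(freq) * sim_mean / vol
    geo_sharpe = np.sqrt(freq) * geo_mean / vol
    # (simple_sharpe, geo_sharpe, sim_mean * N, geo_mean * N, vol)
    return simple_sharpe, geo_sharpe, sim_mean * freq, geo_mean * freq, vol * np.sqrt(freq)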
Example #5
    def test_replace_series_dict(self):
        # from GH 3064
        df = DataFrame({"zero": {"a": 0.0, "b": 1}, "one": {"a": 2.0, "b": 0}})
        result = df.replace(0, {"zero": 0.5, "one": 1.0})
        expected = DataFrame({"zero": {"a": 0.5, "b": 1}, "one": {"a": 2.0, "b": 1.0}})
        assert_frame_equal(result, expected)

        result = df.replace(0, df.mean())
        assert_frame_equal(result, expected)

        # series to series/dict
        df = DataFrame({"zero": {"a": 0.0, "b": 1}, "one": {"a": 2.0, "b": 0}})
        s = Series({"zero": 0.0, "one": 2.0})
        result = df.replace(s, {"zero": 0.5, "one": 1.0})
        expected = DataFrame({"zero": {"a": 0.5, "b": 1}, "one": {"a": 1.0, "b": 0.0}})
        assert_frame_equal(result, expected)

        result = df.replace(s, df.mean())
        assert_frame_equal(result, expected)
    def test_align_int_fill_bug(self):
        # GH #910
        X = np.arange(10 * 10, dtype="float64").reshape(10, 10)
        Y = np.ones((10, 1), dtype=int)

        df1 = DataFrame(X)
        df1["0.X"] = Y.squeeze()

        df2 = df1.astype(float)

        result = df1 - df1.mean()
        expected = df2 - df2.mean()
        assert_frame_equal(result, expected)
def stndize(str, list):

    s = list
    w = pd.read_csv(str, usecols=s)
    frame = DataFrame(w)

    t = frame.mean()
    print(t)
    z = frame.std()
    print(z)
    print((w - t) / z)

    return
Example #8
def combine_spread(file_set, shift, drop_return_data=False):
    """
    Combine the spread data from the input files and return it with the mean
    and standard deviation calculated.

    """

    data = []
    values = {}
    for val in ("left", "right", "com", "dist", "radius", "diameter"):
        values[val] = {}

    # Collect data from all files into dictionaries
    for i, _file in enumerate(file_set):
        data.append(Spread().read(_file))
        for val in values.keys():
            values[val][i] = Series(data=data[i].spread[val]["val"], index=data[i].times)
        data[i].times = np.array(data[i].times) - shift[i]

    spread = Spread()
    spread.spread["num"] = len(file_set)

    for val in values.keys():

        # Shift time as per synchronisation
        for i in values[val]:
            values[val][i].index = np.array(values[val][i].index) - shift[i]

        # Convert to DataFrame
        df = DataFrame(data=values[val])

        # If more than one file, keep only indices that are non-NaN in every file
        if len(file_set) > 1:
            df = df.dropna()

        # If return data dropped, fill data here
        if drop_return_data:
            for i in df.columns:
                data[i].spread[val]["val"] = df[i].tolist()

        # Get times, mean and standard deviation (across files) as lists
        mean = list(df.mean(axis=1))
        std_error = list(df.std(axis=1))
        times = list(df.index)

        # Add to Spread object
        spread.spread[val]["val"] = mean
        spread.spread[val]["std"] = std_error
        spread.spread["times"] = times

    return spread, data
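A hypothetical call might look like the following; the file names and shift values are invented, and only the signature and the spread.spread[...]["val"]/["std"] layout come from the code above:

spread, data = combine_spread(["run1.spread", "run2.spread"], shift=[0.0, 1.5])
mean_radius = spread.spread["radius"]["val"]
radius_std = spread.spread["radius"]["std"]
times = spread.spread["times"]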
Example #9
    def test_ops(self):

        # test ops and reversed ops in evaluation
        # GH7198

        # smaller hits python, larger hits numexpr
        for n in [4, 4000]:

            df = DataFrame(1, index=range(n), columns=list("abcd"))
            df.iloc[0] = 2
            m = df.mean()

            for op_str, op, rop in [
                ("+", "__add__", "__radd__"),
                ("-", "__sub__", "__rsub__"),
                ("*", "__mul__", "__rmul__"),
                ("/", "__truediv__", "__rtruediv__"),
            ]:

                base = DataFrame(np.tile(m.values, n).reshape(n, -1), columns=list("abcd"))  # noqa

                expected = eval("base{op}df".format(op=op_str))

                # ops as strings
                result = eval("m{op}df".format(op=op_str))
                assert_frame_equal(result, expected)

                # these are commutative
                if op in ["+", "*"]:
                    result = getattr(df, op)(m)
                    assert_frame_equal(result, expected)

                # these are not
                elif op in ["-", "/"]:
                    result = getattr(df, rop)(m)
                    assert_frame_equal(result, expected)

        # GH7192
        df = DataFrame(dict(A=np.random.randn(25000)))
        df.iloc[0:5] = np.nan
        expected = 1 - np.isnan(df.iloc[0:25])
        result = (1 - np.isnan(df)).iloc[0:25]
        assert_frame_equal(result, expected)
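For context on the "smaller hits python, larger hits numexpr" comment above: pandas only dispatches elementwise DataFrame arithmetic to numexpr once the frame exceeds an internal size threshold, and the accelerated path can be toggled globally. A small sketch (the option name is the documented one; the exact threshold is internal to pandas):

import pandas as pd

pd.set_option("compute.use_numexpr", False)  # force the plain Python/numpy path
pd.set_option("compute.use_numexpr", True)   # restore the default behaviour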
def skewness(str, list):
    s = list

    w = pd.read_csv(str, usecols=s)

    frame = DataFrame(w)

    h = len(w)

    t = frame.mean()

    d = frame.std()

    e = ((w - t) / d) ** 3

    g = e.sum()

    i = (h * g) / ((h - 1) * (h - 2))

    print "skewness=", i
Example #11
class GetGenes(object):
    def __init__(self, data):
        self.dataframe = DataFrame(data)

        # read a text file and return a data frame. Records should be separated by TAB
        # There should not be duplicate column names

    def import_file(self, filename):
        # helper used to convert strings to floats where possible
        def convert(x):
            try:
                x = float(x)
            except ValueError:
                pass
            return x

        table = []
        for line in open(filename):
            if line.strip():  # If not empty line
                line = line.rstrip("\n").split("\t")
                line = list(map(convert, line))
                table.append(line)
        self.dataframe = DataFrame(table[1:], columns=table[0])
        return

    def houseKeepingGenes(self, geneNum):
        # compute the coefficient of variation (CV) of each row
        std = array(self.dataframe.std(axis=1))
        mean = array(self.dataframe.mean(axis=1))
        CV = std / mean
        CV = list(map(abs, CV))  # take the absolute value

        # get the first N smallest CV values
        mins = nsmallest(geneNum, CV)
        print("The GOOD genes are:\n")
        for item in mins:
            print(self.dataframe.iloc[CV.index(item), 0])
        return
        if False:
            kw = dict(method="time")
            df = df.reindex(index).interpolate(**kw).ix[index]
        dfs.update({model: df})

dfs = Panel.fromDict(dfs).swapaxes(0, 2)


# In[ ]:

from pandas import DataFrame

means = dict()
for station, df in dfs.iteritems():
    df.dropna(axis=1, how="all", inplace=True)
    mean = df.mean()
    df = df - mean + mean["OBS_DATA"]
    means.update({station: mean["OBS_DATA"] - mean.drop("OBS_DATA")})

bias = DataFrame.from_dict(means).dropna(axis=1, how="all")
bias = bias.applymap("{:.2f}".format).replace("nan", "--")

columns = {station: get_coops_longname(station) for station in bias.columns.values}

bias.rename(columns=columns, inplace=True)

to_html(bias.T, "style.css")


# In[ ]:
print(string_data[string_data.notnull()])

data = DataFrame([[1.0, 6.5, 3.0], [1.0, NA, NA], [NA, NA, NA], [NA, 6.5, 3.0]])
cleaned = data.dropna()
print(data)
print(cleaned)
print(data.dropna(how="all"))
data[4] = NA
print(data)
print(data.dropna(axis=1, how="all"))

df = DataFrame(np.random.randn(7, 3))
df.loc[:4, 1] = NA
df.loc[:2, 2] = NA
print(df)
print(df.dropna(thresh=3))
print(df.ffill(0))
print(df)
print(df.fillna({1: 0.5, 3: -1}))
print(df.fillna(0, inplace=True))
print(df)

df = DataFrame(np.random.randn(7, 3))
df.loc[2:, 1] = NA
df.loc[4:, 2] = NA
print(df.fillna(method="ffill"))
print(df.fillna(method="ffill", limit=2))

data = Series([1.0, NA, 3.5, NA, 7])
print(data.fillna(data.mean()))
Example #14
def getRepos(user):
    myrepos = requests.get(
        "https://api.github.com/users/" + user + "/repos",
        headers={"Authorization": "token 5218551eb082bffa572318de0c2de10d255170b1"},
    ).json()
    return myrepos


# Getting number of stars
data = DataFrame()
i = 0
for user in topGitUsers:
    userRepos = getRepos(user)
    i += 1
    print(i)  # check progress
    if len(userRepos) > 0:
        stars = []
        listUserStars = [("", 0)]
        for repo in userRepos:
            # print repo['stargazers_count']
            stars.append(repo["stargazers_count"])
        userStars = DataFrame(stars)
        userMeanSt = userStars.mean(axis=0)
        listUserStars.append((user, userMeanSt))
        # print user + str(userMeanSt[0])
        result = DataFrame({"userId": user, "Mean of stars": userMeanSt})
        data = data.append(result)
    else:
        print(user + ": No repos found for this user")

data.to_csv("gitTopUsersMean.csv")
Example #15
    def bin(self, cleared, binsize, reject_count=100, dropna=False):
        """Bin spike data by `binsize` millisecond bins.

        Roughly, sum up the ones (and zeros) in the data using bins of size
        `binsize`.

        See :func:`span.utils.utils.bin_data` for the actual loop that
        executes this binning. This method is a wrapper around that function.

        Parameters
        ----------
        cleared : array_like
            The "refractory-period-cleared" array of booleans to bin.

        binsize : numbers.Real
            The size of the bins to use, in milliseconds

        reject_count : numbers.Real, optional, default 100
            Assign ``NaN`` to channels whose firing rates are less than this
            number over the whole recording.

        dropna : bool, optional
            Whether to drop NaN'd values if any

        Raises
        ------
        AssertionError
            * If `binsize` is not a positive number or if `reject_count` is
              not a nonnegative number

        Returns
        -------
        binned : SpikeGroupedDataFrame of float64

        See Also
        --------
        span.utils.utils.bin_data
        """
        assert binsize > 0 and isinstance(binsize, numbers.Real), '"binsize" must be a positive number'
        assert reject_count >= 0 and isinstance(
            reject_count, numbers.Real
        ), '"reject_count" must be a nonnegative real number'

        ms_per_s = 1e3
        bin_samples = cast(np.floor(binsize * self.fs / ms_per_s), np.uint64)
        bins = np.arange(0, self.nsamples - 1, bin_samples, np.uint64)

        shape = bins.shape[0] - 1, cleared.shape[1]
        btmp = np.empty(shape, np.uint64)

        bin_data(cleared.values.view(np.uint8), bins, btmp)

        # make a datetime index of milliseconds
        freq = binsize * datetools.Milli()
        index = (
            date_range(start=self.date, periods=btmp.shape[0], freq=freq, name=r"$t\left(i\right)$", tz="US/Eastern")
            + freq
        )
        binned = DataFrame(btmp, index=index, columns=cleared.columns, dtype=np.float64)

        # samples / (samples / s) == s
        rec_len_s = self.nsamples / self.fs

        # spikes / s
        min_sp_per_s = reject_count / rec_len_s

        # spikes / s * ms / ms == spikes / s
        sp_per_s = binned.mean() * ms_per_s / binsize

        # get rid of channels that have fewer than "reject_count" spikes over
        # the whole recording
        binned.loc[:, sp_per_s < min_sp_per_s] = np.nan

        if dropna:
            binned = binned.dropna(axis=1)

        return SpikeGroupedDataFrame(binned)
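span.utils.utils.bin_data itself is not shown in this example. A rough sketch of the loop the docstring describes (an assumption about its behaviour, not the real implementation) would sum the 0/1 samples of every channel between consecutive bin edges:

import numpy as np

def bin_data_sketch(cleared, bins, out):
    # cleared: (nsamples, nchannels) array of uint8 zeros and ones
    # bins: 1-D array of bin edges, in samples
    # out: preallocated (len(bins) - 1, nchannels) output array
    for k in range(len(bins) - 1):
        out[k] = cleared[bins[k]:bins[k + 1]].sum(axis=0)
    return out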
Example #16
def test():
    # a : single adult
    # b : couple
    # c : child in a couple household
    # d : child of a single parent
    # e : teenager in a couple household
    # f : teenager of a single parent
    # g : child's bedroom

    # A: 2a,2e
    #  b + 2*c + g
    fa = [0, 1, 2, 0, 0, 0, 1]
    ma = 2754.74

    # B : 2a,2ea,supp:
    #  b + 2*e + 2*g
    fb = [0, 1, 0, 0, 2, 0, 2]
    mb = 3165.15

    # C : 1a,2e:
    #  a + 2*d + g
    fc = [1, 0, 0, 2, 0, 0, 1]
    mc = 2291.04

    # D: 2a, 2e, 2ea, 2*supp :
    #   b + 2*c + 2*e + 3*g
    fd = [0, 1, 2, 0, 2, 0, 3]
    md = 3969.81

    # E : 2a,1ea
    #    b + e + g
    fe = [0, 1, 0, 0, 1, 0, 1]
    me = 2549.17

    # F : 2a, 1e, 2ea
    #    b + c + 2*e + 2*g
    ff = [0, 1, 1, 0, 2, 0, 2]
    mf = 3514.12

    # G: 2a, 1e ,1ea, supp
    #   b + c + e + 2*g
    fg = [0, 1, 1, 0, 1, 0, 2]
    mg = 3042.39

    # H: 1a, 1ea
    #    a + f + g
    fh = [1, 0, 0, 0, 0, 1, 1]
    mh = 2103.91

    # solve f*x = m

    # A supplementary equation is needed because the system is inconsistent
    fsup = [1, -1 / 1.5, 0, 0, 0, 0, 0]
    msup = 0
    f = [fa, fb, fc, fd, fe, ff, fg, fh, fsup]
    m = [ma, mb, mc, md, me, mf, mg, mh, msup]

    results = DataFrame()

    for i in range(8):
        selected_f1 = list(f)
        selected_m1 = list(m)
        selected_f1.pop(i)
        selected_m1.pop(i)
        for j in range(7):
            selected_f = list(selected_f1)
            selected_m = list(selected_m1)
            selected_f.pop(j)
            selected_m.pop(j)

            f_mat = np.array(selected_f)

            m_vec = np.array(selected_m)

            # print i, np.linalg.det(f_mat)
            try:
                x = DataFrame({str(i) + str(j): np.linalg.solve(f_mat, m_vec)}).T
            except:

                x = None

            from pandas import concat

            if x is not None:
                results = concat([results, x])

    print(results)
    print(results.mean())
    print(results.std())
    print(results.std() / results.mean())
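    # Added sketch (not in the original test): since the nine stacked equations
    # are inconsistent, a single least-squares fit over all of them is an
    # alternative to dropping two equations at a time.
    x_lstsq, _res, _rank, _sv = np.linalg.lstsq(np.array(f), np.array(m), rcond=None)
    print(DataFrame({"lstsq": x_lstsq}).T)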
Example #17
def avg_medal_count():
    """
    Using the dataframe's apply method, create a new Series called
    avg_medal_count that indicates the average number of gold, silver,
    and bronze medals earned among countries that earned at
    least one medal of any kind at the 2014 Sochi Olympics. Note that
    the countries list already includes only countries that have earned
    at least one medal. No additional filtering is necessary.
    
    You do not need to call the function in your code when running it in the
    browser - the grader will do that automatically when you submit or test it.
    """

    countries = [
        "Russian Fed.",
        "Norway",
        "Canada",
        "United States",
        "Netherlands",
        "Germany",
        "Switzerland",
        "Belarus",
        "Austria",
        "France",
        "Poland",
        "China",
        "Korea",
        "Sweden",
        "Czech Republic",
        "Slovenia",
        "Japan",
        "Finland",
        "Great Britain",
        "Ukraine",
        "Slovakia",
        "Italy",
        "Latvia",
        "Australia",
        "Croatia",
        "Kazakhstan",
    ]

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]

    olympic_medal_counts = {
        "country_name": countries,
        "gold": Series(gold),
        "silver": Series(silver),
        "bronze": Series(bronze),
    }
    df = DataFrame(olympic_medal_counts)

    # YOUR CODE HERE
    # df['average_medal_count'] = df.mean(axis=1)
    # avg_medal_count_by_country = df[['country_name','average_medal_count']]
    avg_medal_count = df.mean()
    # Or, we could do it this way
    avg_medal_count = df[["gold", "silver", "bronze"]].apply(numpy.mean)
    print(avg_medal_count)

    return avg_medal_count
df = pd.read_csv("demo_data_1.csv")
pprint(df)

# <demo> --- stop ---

# 'Panel' objects are 3D.

wp = Panel({"Item1": DataFrame(randn(4, 3)), "Item2": DataFrame(randn(4, 2))})
pprint(wp)

# There are also 'TimeSeries', 'SparseSeries', and 'SparsePanel' objects.
# In newer versions, there is experimental support for higher-dimensional
# panels.

# Stats can also be performed on Pandas objects.
df = DataFrame(randn(6, 4), columns=["A", "B", "C", "D"])
pprint(df)

# You can choose which axis number to perform the operation along.
pprint(df.mean(0))
pprint(df.mean(1))

# Much more to Pandas, but that's the basic idea.

# For more information, see:
#   http://pandas.pydata.org/pandas-docs/stable/index.html
# Also, definitely have a look at StatsModels:
#   http://statsmodels.sourceforge.net/
#   http://statsmodels.sourceforge.net/stable/

# <demo> --- stop ---
Example #19
# reductions or summary statistics
df = DataFrame([[1.4, np.nan], [7.1, -4.5],
                [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
df
df.sum()  # column sums
df.sum(axis=1)  # sum row by row
df
(7.10 - 4.5)/2
df.mean(axis=1, skipna=False)
df
df.idxmax()
df
df.cumsum()  # accumulation
df.describe() # multiple summary statistics in one shot.
obj = Series(['a', 'a', 'b', 'c'] * 4)
obj
obj.describe()
## Correlation and Covariance
import pandas.io.data as web
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')
    
price = DataFrame({tic: data['Adj Close'],
Example #20
import numpy as np
from pandas import DataFrame, Series
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats

df = DataFrame(abs(np.random.randn(30).reshape(6, 5)) * 100)

plt.bar(
    np.arange(len(df.mean())),
    df.mean(),
    align="center",
    color="white",
    yerr=df.std(),
    ecolor="black",
    capsize=5,
    linewidth=1,
)
plt.grid()


plt.show()
Example #21
experiment_data_Raw = DataFrame({"Timestamp": timestampsRaws, "Raw key": raws, "Dataset": datasetR})
experiment_data_Raw = experiment_data_Raw.set_index("Timestamp")

final_data = concat([experiment_data_Qber, experiment_data_Raw])

final_data = final_data.sort_index()

# after preparing the data, time to plot it:

for new_counter in range(counter + 1):
    # print new_counter
    Qbers = final_data[(final_data["Dataset"] == new_counter) & (final_data["Qber"] > 0)]
    x1 = Qbers.index.tolist()
    y1 = Qbers["Qber"].tolist()
    x1_average = DataFrame.mean(Qbers)["Qber"]
    x1_std_dev = DataFrame.std(Qbers)["Qber"]
    # preparing proper time:
    x1[:] = [x - initialTimestamps[new_counter] for x in x1]

    Raws = final_data[(final_data["Dataset"] == new_counter) & (final_data["Raw key"] > 0)]
    x2 = Raws.index.tolist()
    y2 = Raws["Raw key"].tolist()
    # x2_average = 2
    # x2_std_dev = 3
    # once again correcting counter:
    x2[:] = [x - initialTimestamps[new_counter] for x in x2]
    print(x1[0], x2[0], initialTimestamps[new_counter])
    # Two subplots, the axes array is 1-d http://matplotlib.org/examples/pylab_examples/subplots_demo.html
    f, axarr = plt.subplots(2, sharex=True)
    axarr[0].grid()
Example #22
# -*- coding: utf-8 -*-

import numpy as np
from pandas import Series, DataFrame

print "求和"
df = DataFrame(
    [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], index=["a", "b", "c", "d"], columns=["one", "two"]
)
print df
print df.sum()  # 按列求和
print df.sum(axis=1)  # 按行求和
print

print "平均数"
print df.mean(axis=1, skipna=False)
print df.mean(axis=1)
print

print "其它"
print df.idxmax()
print df.cumsum()
print df.describe()
obj = Series(["a", "a", "b", "c"] * 4)
print obj.describe()
plt.plot(np.random.randn(1000).cumsum())
np.arange(5)[:2]

# index in Series
index = ["a", "b", "c", "d", "e"]
s = Series(np.arange(5), index=index)
s[:3]
s["d"]
s["b":]
s[[4]]
s[["a", "c"]]

# create date_range by day
dates = pd.date_range("2012-07-16", "2012-07-21")
atemps = Series([101.4, 99, 90, 232, 233, 123], index=dates)
atemps.index[2]

sdtemps = Series([73, 78, 77, 78, 78, 77], index=dates)
temps = DataFrame({"Austin": atemps, "San Diego": sdtemps})
temps["diff"] = temps["San Diego"] - temps["Austin"]

del temps["diff"]
temps["Austin"]
idx = temps.index[2]
temps["Austin"].iloc[[1, 2, 3]]

temps.mean()
# column means (mean over the rows, axis=0)
np.random.randn(5, 5).mean(0)
# row means (mean over the columns, axis=1)
np.random.randn(5, 5).mean(1)
def main():
    """
    Calculation and aggregation of summary statistics
    """

    # Summary of statistics
    # return is not ndarray
    df = DataFrame(
        [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], index=list("abcd"), columns=["one", "two"]
    )
    print(df)
    print(df.sum())
    print(df.sum(axis=1))
    print(df.mean(axis=1))  # excludes NaN
    print(df.mean(axis=1, skipna=False))
    print(df.idxmin())
    print(df.idxmax())
    print(df.cumsum())
    print(df.describe())
    # values are not numbers
    obj = Series(list("aabc") * 4)
    print(obj.describe())

    methods = [
        "count",
        "min",
        "max",  # 'argmin', 'argmax',
        "quantile",
        "median",
        "mad",
        "var",
        "std",
        "skew",
        "kurt",
        "cummin",
        "cummax",
        "cumprod",
        "diff",
        "pct_change",
    ]

    for method in methods:
        print("「{0}」".format(method))
        print(getattr(df, method)())
        print("")

    # Correlation and Covariance
    all_data = {}
    lst = []  # ['AAPL', 'IBM', 'MSFT'] #, 'GOOG']:
    for ticket in lst:  # , 'GOOG']:
        # IOError: after 3 tries, Yahoo! did not return a 200
        # for url 'http://ichart.finance.yahoo.com/table.csv?s=GOOG&a=0&b=1&c=2000&d=0&e=1&f=2010&g=d&ignore=.csv'
        all_data[ticket] = pd.io.data.get_data_yahoo(ticket, "1/1/2000", "1/1/2010")
    price = DataFrame({tic: data["Adj Close"] for tic, data in all_data.items()})
    volume = DataFrame({tic: data["Volume"] for tic, data in all_data.items()})
    if all_data:
        returns = price.pct_change()
        print(returns.tail())
        print("")
        print(returns.MSFT.corr(returns.IBM))
        print(returns.MSFT.cov(returns.IBM))
        print("")
        print(returns.corr())
        print(returns.cov())
        print("")
        print(returns.corrwith(returns.IBM))
        print(returns.corrwith(volume))

    # unique values, frequencies, membership
    print("", "")
    obj = Series(list("cadaabbcc"))
    uniques = obj.unique()
    print(uniques)
    print(obj.value_counts())
    print(pd.value_counts(obj.values, sort=False))
    mask = obj.isin(["b", "c"])
    print(mask)
    print(obj[mask])

    data = DataFrame({"Qu1": [1, 3, 4, 3, 4], "Qu2": [2, 3, 1, 2, 3], "Qu3": [1, 5, 2, 4, 4]})
    print(data)
    print(data.apply(pd.value_counts).fillna(0))
Example #25
    a = DataFrame([np.linspace(i, i * 5, 5)], index=[index[i]], columns=["shut", "the", "fuck", "up", "fucking"])
    df = pd.concat([df, a], axis=0)
print "----------------------------------"
print df

print "打印一列或多列的数据,并筛选行数----------------------------------"
print "----------------------------------"
print df["fucking"]
print df.fucking
print type(df["fucking"])
print df.loc["r5"]
print df.loc["r1":"r3"]
print "----------------------------------"
print df[["shut", "fuck", "up"]]
print type(df[["shut", "fuck", "up"]])
print df[["fuck", "up", "fucking"]].loc[["r1", "r2"]]

print "三种访问特定元素的方法:at.双[]-------------------------------------"
print df.at["r5", "fucking"]
print df.fucking["r5"]
print df["fucking"]["r5"]
print type(df["fucking"]["r5"])

print df.mean()

# 将数据库与数据框结合: http://www.dcharm.com/?p=341

reTest = open("reTest.txt").read()  # read the contents so re.findall gets a string
pattern = re.compile(r'^<td.*?href(http.*?)"\s', re.I | re.S)
model = re.findall(pattern, reTest)
print(model)
        except IOError as e:
            print("Unable to download data for %s. Reason: %s" % (symbol, str(e)))
            return None

        f[symbol] = hist_prices

        # ret[symbol] = (hist_prices['Adj Close'] - hist_prices['Adj Close'].shift(1)) / hist_prices['Adj Close'].shift(1)
        ret[symbol] = hist_prices["Adj Close"].pct_change()
        excRet[symbol] = ret[symbol] - (riskFreeRate / 252)  # RiskFreeRate is annualized

    # Create a new DataFrame based on the Excess Returns.
    df = DataFrame(excRet).dropna()

    # Calculate the CoVariance and Mean of the DataFrame
    C = 252 * df.cov()
    M = 252 * df.mean()

    # Calculate the Kelly-Optimal Leverages using Matrix Multiplication
    F = inv(C).dot(M)

    # Return a list of (security, leverage) tuples
    return list(zip(df.columns.values.tolist(), F))
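# Note (added; not part of the original snippet): F above is the standard
# continuous-time Kelly-optimal leverage vector, f* = C^{-1} M, where C is the
# annualized covariance matrix of excess returns and M their annualized mean.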


def main(argv):
    """Entry point of Kelly Criterion calculation."""

    print "Kelly Criterion calculation"
    args = docopt(__doc__, argv[1:])

    # Parse risk-free-rate
Example #27
import numpy as np
from pandas import DataFrame, Series
import pandas as pd
import matplotlib.pyplot as plt

df = DataFrame(abs(np.random.randn(30).reshape(6, 5)) * 100)

plt.bar(np.arange(len(df.mean())), df.mean(), align="center", color="white", linewidth=1.5)

plt.hold(True)

plt.errorbar(np.arange(len(df.mean())), df.mean(), df.std(), elinewidth=1.2, capsize=7.5, fmt=None)
plt.show()
Example #28
    def run(
        self,
        Model="ridge",
        kernel="linear",
        cross_validationMethod="KFold",
        FeatureSelection="PCA",
        n_features=20,
        scoringList=["specificity", "sensitivity", "precision", "f1", "accuracy", "ss_mean"],
        isSaveCsv=None,
        isSavePickle=None,
        isSaveFig=None,
        isPerm=0,
        isBetweenSubjects=True,
        isConcatTwoLabels=False,
    ):
        # -- TODO :
        # --  # Greedy selection on features + other feature selection types...
        # --  # Make sure features are selected based on train data only!!!
        # --  # Keep a list of n_train, n_test from each Label and scoring (accuracy, f1..) in each cross validation iteration
        # --  # Plot results summary (see CARS paper for desired results for Ein Gedi Poster 22-1-2015)
        # --  # remove irrelevant data using 'Tracking Success' and consider 'TimeStamps' for feature calculation
        # --  # add feature analysis by facial part (see excel)
        # --  # select best model (svm, otherwise ridge regression)
        # --  # compare svc results with regression results (using LOO and different params for regression - params for unbalanced data, different kernels, etc.), model evaluation - http://scikit-learn.org/stable/modules/model_evaluation.html)
        # --  # check how the model weights behave - feature selection analysis
        # --  # calc model error
        # --  # divide data into subparts for training and testing - try within/between subject, and analyze the distribution of features when the data is divided
        # --  # LOO - also on bool labels (patients vs controls and mental status bool)
        # --  # add mental status rank scores (0-4)
        # --  # make sure p-val returns the right value in 'scores'
        # --  # run it over random data (permutation test)
        # --  # continue here - check regression results - make sure regression works (not so good).. check what happens in svc for G7 (high train R, negative test R)

        ## init
        FeatureTypeList = [j for j in tuple(self.FeaturesDF.index)]
        self.FullResults = DF()
        self.Learningdetails = {
            "Model": Model,
            "Kernel": kernel,
            "CrossVal": cross_validationMethod,
            "FeatureSelection": FeatureSelection,
            "LabelBy": self.Details["LabelDetails"].keys()[0],
            "FeatureMethod": self.Details["FeatureMethod"],
            "PieceLength": self.Details["PieceLength"],
        }
        print("\n------------Learning Details------------")
        print(DF.from_dict(self.Learningdetails, orient="index"))
        print("\n----" + cross_validationMethod + " Cross validation Results:----")

        # Set learning params (cross validation method, and model for learning)
        isBoolLabel = self.LabelsObject.isBoolLabel
        isBoolScores = isBoolLabel
        model, isBoolModel, featureSelectionMethod, selectFeaturesFunction = learningUtils.setModel(
            Model, FeatureSelection, n_features
        )
        # define global variables over modules (to be used in myUtils)
        globalVars.transformMargins = 0  # lambda x:x
        globalVars.isBoolLabel = isBoolLabel
        globalVars.isBoolModel = isBoolModel
        global trainLabels_all, testLabels_all, TrueLabels, isAddDroppedSubjects
        trainLabels_all, testLabels_all, TrueLabels, isAddDroppedSubjects = labelUtils.initTrainTestLabels_all(
            self.LabelsObject
        )
        trainLabels_all2, testLabels_all2, TrueLabels2, isAddDroppedSubjects2 = labelUtils.initTrainTestLabels_all(
            self.LabelsObject2
        )

        LabelingList = ["N1"]  # trainLabels_all.columns
        self.ResultsDF = DF()
        self.BestFeatures = DF(columns=LabelingList)  # dict of BestFeaturesDF according to Labeling methods
        YpredictedOverAllLabels = pandas.Panel(
            items=range(len(trainLabels_all)), major_axis=LabelingList, minor_axis=TrueLabels.index
        )  # panel: items=cv_ind, major=labels, minor=#TODO

        ## Create train and test sets according to LabelBy, repeat learning each time on different Labels from LabelingList.
        for label_ind, Labeling in enumerate(LabelingList):
            """if isPerm: #TODO - fix this to work with continous / bool data
                try:
                    trainLabels=self.LabelsObject.permedLabelsDF[Labeling]
                except AttributeError:
                    self.LabelsObject.permLabels()
                    trainLabels=self.LabelsObject.permedLabelsDF[Labeling]"""
            # set subjects list according to labels and features
            X, SubjectsList, droppedSubjects, Xdropped = featuresUtils.initX(self.FeaturesDF, trainLabels_all, Labeling)
            X2, SubjectsList2, droppedSubjects2, Xdropped2 = featuresUtils.initX(
                self.FeaturesDF, trainLabels_all2, Labeling, is2=1
            )

            # init train and test labels
            trainLabels, testLabels, LabelRange = labelUtils.initTrainTestLabels(
                Labeling, SubjectsList, trainLabels_all, testLabels_all
            )
            trainLabels2, testLabels2, LabelRange2 = labelUtils.initTrainTestLabels(
                Labeling, SubjectsList2, trainLabels_all2, testLabels_all2
            )

            # make sure only labeled subjects are used for classification
            X = X.query("subject == " + str(list(trainLabels.index)))
            X.index.get_level_values(X.index.names[0])
            SubjectIndex = list(set(X.index.get_level_values("subject")))

            X2 = X2.query("subject == " + str(list(trainLabels2.index)))
            X2.index.get_level_values(X2.index.names[0])
            SubjectIndex2 = list(set(X2.index.get_level_values("subject")))
            # init vars
            if isBetweenSubjects:
                cv_param = len(SubjectIndex)
                self.Learningdetails["CrossValSubjects"] = "between"
                isWithinSubjects = False
            else:
                isWithinSubjects = True
                X = X.swaplevel(0, 1)
                PieceIndex = list(set(X.index.get_level_values("Piece_ind")))
                cv_param = len(PieceIndex)
                self.Learningdetails["CrossValSubjects"] = "within"

            self.Learningdetails["NumOfFeatures"] = n_features

            print("\n**" + Labeling + "**")

            cv, crossValScores = learningUtils.setCrossValidation(
                cross_validationMethod, cv_param, trainLabels, isWithinSubjects
            )

            ## Learning - feature selection for different scoring types, with cross validation -

            BestFeaturesForLabel = self.BestFeaturesForLabel(
                FeatureTypeList, LabelingList, n_features
            )  # saves dataframe with best features for each label, for later analysis
            cv_ind = 0
            # used for transforming margins returned from svm to continuous labels (e.g. PANSS)
            trainScores = DF()
            test_index = X.index
            testScores = concat([DF(index=test_index), DF(index=["std_train_err"])])
            testScores2 = concat([DF(index=testLabels.index), DF(index=["std_train_err"])])
            # impt=Imputer(missing_values='NaN', strategy='median', axis=0)

            globalVars.LabelRange = LabelRange

            ModelWeights1 = DF(columns=range(len(cv)), index=X.columns)
            Components = pandas.Panel(
                items=range(len(cv)), major_axis=X.columns, minor_axis=range(n_features)
            )  # todo fix this for 1st and second learning
            ExplainedVar = DF(columns=range(len(cv)))
            ModelWeights2 = DF(columns=range(len(cv)))
            for train, test in cv:

                if isBetweenSubjects:
                    # set X and Y
                    train_subjects = trainLabels.iloc[train].index
                    test_subjects = testLabels.iloc[test].index
                    Xtrain, Xtest, Ytrain, YtrainTrue, Ytest = learningUtils.setXYTrainXYTest(
                        X, Labeling, trainLabels, testLabels, TrueLabels, train_subjects, test_subjects
                    )
                    Xtrain2, Xtest2, Ytrain2, YtrainTrue2, Ytest2 = learningUtils.setXYTrainXYTest(
                        X2, Labeling, trainLabels2, testLabels2, TrueLabels2, train_subjects, test_subjects
                    )

                    if isConcatTwoLabels:  # used when there is more than one doctor
                        Xtrain = concat([Xtrain, Xtrain2])
                        Xtest = concat([Xtest, Xtest2])
                        Ytrain = concat([Ytrain, Ytrain2])
                        YtrainTrue = concat([YtrainTrue, YtrainTrue2])
                        Ytest = concat([Ytest, Ytest2])
                        Xdropped = concat([Xdropped, Xdropped2])
                        SubjectsList = list(set(SubjectsList).intersection(set(SubjectsList2)))
                        droppedSubjects = list(
                            set(droppedSubjects).union(set(droppedSubjects2)).difference(set(SubjectsList))
                        )  # diff from SubjectsList to make sure no subjects are both in train and test.
                    """else:
                        Xtrain=Xtrain1
                        Xtest=Xtest1
                        Xdropped=Xdropped1
                        Ytrain=Ytrain1
                        YtrainTrue=YtrainTrue1
                        Ytest=Ytest1"""

                    # select N best features:
                    Xtrain, Xtest, bestNfeatures, components, explainedVar, decomposeFunc = learningUtils.selectBestNfeatures(
                        Xtrain, Xtest, Ytrain, n_features, selectFeaturesFunction
                    )
                    BestFeaturesForLabel.add(bestNfeatures)  # todo - delete this??

                    # train 1
                    TrainModel = model
                    TrainModel.fit(Xtrain.sort_index(), Ytrain.T.sort_index())
                    try:
                        Components[cv_ind] = components.T
                        ExplainedVar[cv_ind] = explainedVar
                        isDecompose = True
                        if cv_ind == 0:
                            ModelWeights1 = DF(columns=range(len(cv)), index=range(len(bestNfeatures)))
                        ModelWeights1[cv_ind] = TrainModel.coef_.flatten()
                    except AttributeError:
                        isDecompose = False
                        ModelWeights1[cv_ind].loc[bestNfeatures] = TrainModel.coef_.flatten()
                    self.isDecompose = isDecompose
                    # train 2
                    if isBoolLabel:
                        PiecePrediction_train = DF(
                            TrainModel.predict(Xtrain), index=Xtrain.index, columns=["prediction"]
                        )
                        TrainModel2 = svm.SVC(kernel="linear", probability=True, class_weight={0: 1, 1: 1})
                    else:
                        PiecePrediction_train = DF(
                            TrainModel.decision_function(Xtrain), index=Xtrain.index, columns=["prediction"]
                        )
                        TrainModel2 = linear_model.LinearRegression()

                    Xtrain2, Ytrain2, YtrainTrue2 = learningUtils.getX2Y2(
                        Xtrain, Ytrain, YtrainTrue, PiecePrediction_train, isBoolLabel
                    )
                    TrainModel2.fit(Xtrain2, Ytrain2)
                    if cv_ind == 0:
                        ModelWeights2 = DF(columns=range(len(cv)), index=Xtrain2.columns)
                    ModelWeights2[cv_ind] = TrainModel2.coef_.flatten()

                    # test 1
                    if (
                        isAddDroppedSubjects
                    ):  # take test subjects from cv + subjects that were dropped for labeling used for test
                        if isDecompose:
                            dXdropped = DF(decomposeFunc(Xdropped).values, index=Xdropped.index)
                        XtestDropped = dXdropped[bestNfeatures]
                        YtestDropped = Series(XtestDropped.copy().iloc[:, 0])
                        # YTrueDropped=Series(Xdropped.copy().icol(0))
                        for subject in droppedSubjects:
                            YtestDropped[subject] = testLabels_all[Labeling].loc[subject]
                            # YTrueAll.loc[subject]=TrueLabels[Labeling].loc[subject]
                        Ytest = concat([Ytest, YtestDropped]).sort_index()
                        Xtest = concat([Xtest, XtestDropped]).sort_index()

                    if isPerm:  # TODO- Check this!!
                        Ytest = y_perms.loc[Ytest.index]
                    Xtest = Xtest.fillna(0.0)

                elif isWithinSubjects:
                    # train 1
                    train_pieces = PieceIndex[train]
                    test_pieces = PieceIndex[
                        test
                    ]  # TODO - make sure that if test/train > piece index, it ignores it and repeats the process

                    XtrainAllFeatures = X.query("Piece_ind == " + str(list(train_pieces)))
                    Ytrain = Series(index=X.index)
                    Ytest = Series(index=X.index)
                    YtrainTrue = Series(index=X.index)

                    for subject in PieceIndex:
                        for piece in train_pieces:
                            Ytrain.loc[piece].loc[subject] = trainLabels[subject]
                            YtrainTrue.loc[piece].loc[subject] = TrueLabels[Labeling].loc[subject]
                            Ytest.loc[piece].loc[subject] = testLabels[subject]
                    Ytrain = Ytrain.dropna()
                    YtrainTrue = YtrainTrue.dropna()
                    for subject in test_subjects:
                        Ytest.loc[piece].loc[subject] = testLabels[subject]
                # train scores 1
                if cv_ind == 0:
                    trainScores, YtrainPredicted = learningUtils.getTrainScores(Ytrain, Xtrain, YtrainTrue, TrainModel)
                    plt.figure(1)
                    if len(LabelingList) > 1:
                        plt.subplot(round(len(LabelingList) / 2), 2, label_ind + 1)
                    if isBoolLabel:
                        testScores = learningUtils.getTestScores(Ytest, Xtest, TrainModel)
                    else:
                        testScores[cv_ind] = learningUtils.getTestScores(Ytest, Xtest, TrainModel)
                        plt.title(Labeling, fontsize=10)
                else:
                    plt.figure(3)
                    new_trainScores, YtrainPredicted = learningUtils.getTrainScores(
                        Ytrain, Xtrain, YtrainTrue, TrainModel
                    )
                    trainScores = concat([trainScores, new_trainScores], axis=1)
                    # test 1
                    testScores[cv_ind] = learningUtils.getTestScores(Ytest, Xtest, TrainModel)

                # train2

                if isBoolLabel:
                    PiecePrediction_test = DF(TrainModel.predict(Xtest), index=Xtest.index, columns=["prediction"])
                else:
                    PiecePrediction_test = DF(
                        TrainModel.decision_function(Xtest), index=Xtest.index, columns=["prediction"]
                    )
                Xtest2, Ytest2, YtestTrue2 = learningUtils.getX2Y2(
                    Xtest, Ytest, Ytest, PiecePrediction_test, isBoolLabel
                )

                if cv_ind == 0:
                    trainScores2, YtrainPredicted2 = learningUtils.getTrainScores(
                        Ytrain2, Xtrain2, YtrainTrue2, TrainModel2
                    )
                    YpredictedOverAllLabels[cv_ind].loc[Labeling] = YtrainPredicted2
                    # plt.figure(1)
                    # if len(LabelingList)>1:
                    # plt.subplot(round(len(LabelingList)/2),2,label_ind+1)
                    # test2
                    if isBoolLabel:
                        testScores2 = learningUtils.getTestScores(Ytest2, Xtest2, TrainModel2)
                    else:
                        testScores2[cv_ind] = learningUtils.getTestScores(Ytest2, Xtest2, TrainModel2)
                    # plt.title(Labeling,fontsize=10)
                else:
                    new_trainScores2, YtrainPredicted2 = learningUtils.getTrainScores(
                        Ytrain2, Xtrain2, YtrainTrue2, TrainModel2
                    )
                    YpredictedOverAllLabels[cv_ind].loc[Labeling] = YtrainPredicted2
                    trainScores2 = concat([trainScores2, new_trainScores2], axis=1)
                    testScores2[cv_ind] = learningUtils.getTestScores(Ytest2, Xtest2, TrainModel2)
                cv_ind += 1

                # crossValScores=crossValScores.append(CVscoresDF,ignore_index=True) #information about entire train test data.
            fig2 = plt.figure(2)
            if len(LabelingList) > 1:
                plt.subplot(round(len(LabelingList) / 2), 2, label_ind + 1)
            # if isAddDroppedSubjects:
            # testLabelsSummary=testLabels_all[Labeling].loc[AllSubjects]
            # else:
            # testLabelsSummary=testLabels
            scoresSummary = learningUtils.getScoresSummary(trainScores2, testScores2, TrueLabels[Labeling])
            # reset global vars
            globalVars.fitYscale = "notDefined"
            globalVars.beta = DF()

            plt.title(Labeling, fontsize=10)
            plt.xlabel("Ytrue", fontsize=8)
            plt.ylabel("Ypredicted", fontsize=8)
            plt.tick_params(labelsize=6)
            # print(crossValScores.T)
            scores = scoresSummary.fillna(0.0)

            # analyze feature weightsL

            WeightedFeatures1 = DF(
                [ModelWeights1.mean(axis=1), ModelWeights1.std(axis=1)], index=["mean", "std"]
            ).T.fillna(0)
            if isDecompose == 0:
                WeightedFeatures1FeatureType = WeightedFeatures1.mean(level="FeatureType")
                WeightedFeatures1FsSingal = WeightedFeatures1.mean(level="fs-signal")
                WeightedFeatures1 = concat(
                    [
                        DF(index=["-------(A) FeatureType-------"]),
                        WeightedFeatures1FeatureType,
                        DF(index=["-------(B) faceshift signal-------"]),
                        WeightedFeatures1FsSingal,
                    ]
                )

            WeightedFeatures2 = DF(
                [ModelWeights2.mean(axis=1), ModelWeights2.std(axis=1)], index=["mean", "std"]
            ).T.fillna(0)
            BestFeatures = concat(
                [
                    DF(index=["------------- Learning 1 -------------"]),
                    WeightedFeatures1,
                    DF(index=["------------- Learning 2 -------------"]),
                    WeightedFeatures2,
                ]
            )
            self.BestFeatures[Labeling] = BestFeatures["mean"]

            # analyze decomposition
            if isDecompose:
                Components_mean = Components.mean(axis=0)
                Components_std = Components.std(axis=0)
                ExplainedVar_mean = DF(ExplainedVar.mean(axis=1)).T  # todo- check!
                ExplainedVar_mean.index = ["ExplainedVar_mean"]
                ExplainedVar_std = DF(ExplainedVar.std(axis=1)).T  # todo- check!
                ExplainedVar_std.index = ["ExplainedVar_std"]
                try:
                    self.LabelComponents[Labeling] = concat(
                        [
                            DF(index=["---components mean---"]),
                            Components_mean,
                            ExplainedVar_mean,
                            DF(index=["---components std over cross validation---"]),
                            Components_std,
                            ExplainedVar_std,
                        ]
                    )
                except AttributeError:
                    self.LabelComponents = dict.fromkeys(LabelingList)
                    self.LabelComponents[Labeling] = concat(
                        [
                            DF(index=["---components mean---"]),
                            Components_mean,
                            ExplainedVar_mean,
                            DF(index=["---components std over cross validation---"]),
                            Components_std,
                            ExplainedVar_std,
                        ]
                    )

                """print(Components_mean)
                print(ExplainedVar_mean)
                print(WeightedFeatures1)"""

            # BestFeaturesForLabel.analyze(ByLevel=0) #TODO change to regression coeff
            LabelFullResults = concat([DF(index=[Labeling]), scores])

            self.FullResults = concat([self.FullResults, LabelFullResults])
            self.ResultsDF = concat([self.ResultsDF, DF(scores[0], columns=[Labeling])], axis=1)
        # continue here!! to build pseudo inverse matrix from predicted to true - make sure columns + rows are set!

        # self.BestFeatures[Labeling]=BestFeaturesForLabel.WeightedMean

        # plt.savefig('C:\\Users\\taliat01\\Desktop\\TALIA\\Code-Python\\Results\\'+Labeling+'png')
        testScores3 = pandas.Panel(items=range(len(X2.index)))  # for each cv score...
        FullSubjectsList = YpredictedOverAllLabels[0].columns
        YdroppNans = YpredictedOverAllLabels.dropna(axis=0, how="all")
        YdroppNans = YdroppNans.dropna(axis=1, how="all")
        YpredictedOverAllLabels = YdroppNans.dropna(axis=2, how="all")
        notNans_cv_ind = YpredictedOverAllLabels.items
        notNans_trainSubjects = YpredictedOverAllLabels.minor_axis
        notNans_LabelsList = YpredictedOverAllLabels.major_axis
        notNans_TrueLabels = TrueLabels.T[notNans_trainSubjects].loc[notNans_LabelsList]
        cv_ind = 0
        for train, test in cv:
            if cv_ind in notNans_cv_ind:
                print(test)
                train = list(set(FullSubjectsList[train]).intersection(set(notNans_trainSubjects)))
                test = list(set(FullSubjectsList[test]).intersection(set(notNans_trainSubjects)))
                if len(train) > 0 and len(test) > 0:
                    AllLabelsYTrainPredicted = YpredictedOverAllLabels[cv_ind][train]
                    AllLabelsYTrainPredicted = AllLabelsYTrainPredicted.fillna(0)
                    AllLabelsYTrainTrue = notNans_TrueLabels[train]
                    AllLabelsYTestPredicted = YpredictedOverAllLabels[cv_ind][test]
                    AllLabelsYTestTrue = notNans_TrueLabels[test]

                    pseudoInverse_AllLabelsYTrainTrue = DF(
                        np.linalg.pinv(AllLabelsYTrainTrue),
                        columns=AllLabelsYTrainTrue.index,
                        index=AllLabelsYTrainTrue.columns,
                    )
                    global AllLabelsTransformationMatrix
                    AllLabelsTransformationMatrix = DF(
                        AllLabelsYTrainPredicted.dot(pseudoInverse_AllLabelsYTrainTrue),
                        columns=pseudoInverse_AllLabelsYTrainTrue.columns,
                    )  # change to real code!!
                TrainModel3 = lambda y: y.T.dot(AllLabelsTransformationMatrix)
                testScores3[cv_ind] = learningUtils.getTestScores(
                    AllLabelsYTrainTrue, AllLabelsYTrainPredicted, TrainModel3
                )
            cv_ind += 1

        self.ResultsDF = self.ResultsDF.fillna(0.0)

        ## Print and save results
        print("\n")
        print(self.ResultsDF)
        print("\n")
        D = self.Learningdetails
        savePath = (
            resultsPath
            + "\\"
            + D["Model"]
            + "_"
            + D["CrossVal"]
            + "_LabelBy"
            + D["LabelBy"]
            + "_Features"
            + D["FeatureMethod"]
            + "_FS"
            + FeatureSelection
            + "_Kernel"
            + D["Kernel"]
            + "_"
            + D["CrossValSubjects"]
            + "Subjects_PieceSize"
            + D["PieceLength"]
        )
        if isPerm:
            savePath = savePath + "_PERMStest"
        saveName = savePath + "\\" + str(n_features) + "_features"
        self.Learningdetails["saveDir"] = savePath
        dir = os.path.dirname(saveName)
        if not os.path.exists(dir):
            os.makedirs(dir)
        if isSavePickle is None:
            isSavePickle = int(input("Save Results to pickle? "))
        if isSaveCsv is None:
            isSaveCsv = int(input("save Results to csv? "))
        if isSaveFig is None:
            isSaveFig = int(input("save Results to figure? "))

        if isSavePickle:
            self.ResultsDF.to_pickle(saveName + ".pickle")
            self.BestFeatures.to_pickle(saveName + "_bestFeatures.pickle")

        if isSaveCsv:
            DetailsDF = DF.from_dict(self.Learningdetails, orient="index")
            ResultsCSV = concat(
                [
                    self.ResultsDF,
                    DF(index=["-------Label Details-------"]),
                    self.N,
                    DF(index=["-------Learning Details-------"]),
                    DetailsDF,
                    DF(index=["-------Selected Features Analysis------"]),
                    self.BestFeatures,
                ]
            )
            ResultsCSV.to_csv(saveName + ".csv")

        if isSaveCsv or isSavePickle:
            print("successfully saved as:\n" + saveName)

        if isSaveFig:
            plt.figure(1)
            plt.savefig(saveName + "Train.png")
            plt.figure(2)
            plt.savefig(saveName + "Test.png")
        plt.close()
        plt.close()
Example #29
final_data = concat([experiment_data_Qber, experiment_data_Raw])

final_data = final_data.sort_index()

# after preparing the data, time to plot it:

for new_counter in range(file_counter + 1):
    # print new_counter
    Qbers = final_data[(final_data["Dataset"] == new_counter) & (final_data["Qber"] > 0)]
    x1 = Qbers.index.tolist()
    y1 = Qbers["Qber"].tolist()
    x1_average = DataFrame.mean(Qbers)["Qber"]
    x1_std_dev = DataFrame.std(Qbers)["Qber"]
    # preparing proper time:
    x1[:] = [x - quelle_initialTimestamps[new_counter] for x in x1]

    Raws = final_data[(final_data["Dataset"] == new_counter) & (final_data["Raw key"] > 0)]
    x2_average = DataFrame.mean(Raws)["Raw key"]
    x2_median = DataFrame.median(Raws)["Raw key"]
    x2_max = DataFrame.max(Raws)["Raw key"]

    Raws = Raws[Raws["Raw key"] < (x2_max - (x2_max / 100) * 20)]

    x2 = Raws.index.tolist()
    y2 = Raws["Raw key"].tolist()

    print(x2_average)
    # x2_std_dev = 3
    # once again correcting counter:
    x2[:] = [x - quelle_initialTimestamps[new_counter] for x in x2]
Example #30
import numpy as np
from pandas import Series, DataFrame

###############################################################

df = DataFrame(
    [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], index=["a", "b", "c", "d"], columns=["one", "two"]
)

print(df)
print("\n")
print(df.sum())
print("\n")
print(df.sum(axis=1))
print("\n")
print(df.mean())
print("\n")
print(df.mean(axis=1, skipna=False))
print("\n")
print(df.idxmax())
print("\n")
print(df.cumsum())
print("\n")
print(df.cumsum(axis=1))
print("\n")
print(df.describe())
print("\n")

###############################################################

obj = Series(["a", "a", "b", "c"] * 4)
print(obj)