Example #1
    def test_replace_mixed(self):
        mf = self.mixed_frame
        mf.iloc[5:20, mf.columns.get_loc('foo')] = nan
        mf.iloc[-10:, mf.columns.get_loc('A')] = nan

        result = self.mixed_frame.replace(np.nan, -18)
        expected = self.mixed_frame.fillna(value=-18)
        assert_frame_equal(result, expected)
        assert_frame_equal(result.replace(-18, nan), self.mixed_frame)

        result = self.mixed_frame.replace(np.nan, -1e8)
        expected = self.mixed_frame.fillna(value=-1e8)
        assert_frame_equal(result, expected)
        assert_frame_equal(result.replace(-1e8, nan), self.mixed_frame)

        # int block upcasting
        df = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                        'B': Series([0, 1], dtype='int64')})
        expected = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                              'B': Series([0.5, 1], dtype='float64')})
        result = df.replace(0, 0.5)
        assert_frame_equal(result, expected)

        df.replace(0, 0.5, inplace=True)
        assert_frame_equal(df, expected)

        # int block splitting
        df = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                        'B': Series([0, 1], dtype='int64'),
                        'C': Series([1, 2], dtype='int64')})
        expected = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                              'B': Series([0.5, 1], dtype='float64'),
                              'C': Series([1, 2], dtype='int64')})
        result = df.replace(0, 0.5)
        assert_frame_equal(result, expected)

        # to object block upcasting
        df = DataFrame({'A': Series([1.0, 2.0], dtype='float64'),
                        'B': Series([0, 1], dtype='int64')})
        expected = DataFrame({'A': Series([1, 'foo'], dtype='object'),
                              'B': Series([0, 1], dtype='int64')})
        result = df.replace(2, 'foo')
        assert_frame_equal(result, expected)

        expected = DataFrame({'A': Series(['foo', 'bar'], dtype='object'),
                              'B': Series([0, 'foo'], dtype='object')})
        result = df.replace([1, 2], ['foo', 'bar'])
        assert_frame_equal(result, expected)

        # test case from
        df = DataFrame({'A': Series([3, 0], dtype='int64'),
                        'B': Series([0, 3], dtype='int64')})
        result = df.replace(3, df.mean().to_dict())
        expected = df.copy().astype('float64')
        m = df.mean()
        expected.iloc[0, 0] = m[0]
        expected.iloc[1, 1] = m[1]
        assert_frame_equal(result, expected)
Example #2
def kurtosis(path, columns):
    # Read the selected columns and compute the bias-corrected sample excess
    # kurtosis by hand.
    w = pd.read_csv(path, usecols=columns)
    frame = DataFrame(w)

    h = len(w)
    print(h)

    t = frame.mean()
    d = frame.std()

    # standardized fourth moment
    e = ((w - t) / d) ** 4
    g = e.sum()

    p1 = h * (h + 1)
    p2 = float((h - 1) * (h - 2) * (h - 3))
    p3 = float(3 * ((h - 1) ** 2))
    p4 = (h - 2) * (h - 3)

    i = ((p1 / p2) * g) - (p3 / p4)

    print('kurtosis=', i)
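
For a quick sanity check, the statistic computed above is the standard bias-corrected sample excess kurtosis, which pandas also exposes as DataFrame.kurt(). A minimal comparison sketch, assuming a placeholder CSV file 'data.csv' with a numeric column 'x':

import pandas as pd

# 'data.csv' and ['x'] are placeholder inputs for illustration only.
frame = pd.read_csv('data.csv', usecols=['x'])
n = len(frame)
z = (frame - frame.mean()) / frame.std()
manual = (n * (n + 1) / float((n - 1) * (n - 2) * (n - 3))) * (z ** 4).sum() \
         - 3.0 * (n - 1) ** 2 / ((n - 2) * (n - 3))
print('manual:', manual.values)
print('pandas:', frame.kurt().values)  # should agree up to floating-point error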
def mydeviate(path, columns, Deviation=0, MeanAbsDeviation=1, MeanSqDev=0):
    # Print the requested deviation measures for the selected columns.
    w = pd.read_csv(path, usecols=columns)

    s = DataFrame(w)
    t = s.mean()

    if Deviation == 1:
        # deviation from the mean
        b = w - t
        print(b)

    if MeanAbsDeviation == 1:
        # absolute deviation from the mean
        a = abs(w - t)
        print(a)

    if MeanSqDev == 1:
        # squared deviation from the mean
        c = (w - t) ** 2
        print(c)

    return
class LogAggregate:
    def __init__(self, dataset):
        self.dataset = DataFrame(dataset)

    def get_median(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).median()[kwarg['key']]
        else:
            return self.dataset.median()[kwarg['key']]

    def get_average(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).mean()[kwarg['key']]
        else:
            return self.dataset.mean()[kwarg['key']]

    def get_min(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).min()[kwarg['key']]
        else:
            return self.dataset.min()[kwarg['key']]
    
    def get_max(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).max()[kwarg['key']]
        else:
            return self.dataset.max()[kwarg['key']]

    def get_count(self, *arg, **kwarg):
        if 'group_by' in kwarg:
            return self.dataset.groupby(kwarg['group_by']).count()[kwarg['key']]
        else:
            return self.dataset.count()[kwarg['key']]
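
A minimal usage sketch, assuming the dataset is a list of record dicts; the 'endpoint' and 'duration' field names below are placeholders:

logs = [
    {'endpoint': '/a', 'duration': 120},
    {'endpoint': '/a', 'duration': 80},
    {'endpoint': '/b', 'duration': 200},
]
agg = LogAggregate(logs)
print(agg.get_count(key='duration', group_by='endpoint'))    # records per endpoint
print(agg.get_average(key='duration', group_by='endpoint'))  # mean duration per endpoint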
Example #5
def skewness(path, columns):
    # Read the selected columns and compute the adjusted sample skewness by hand.
    w = pd.read_csv(path, usecols=columns)
    frame = DataFrame(w)

    h = len(w)
    t = frame.mean()
    d = frame.std()

    # standardized third moment
    e = ((w - t) / d) ** 3
    g = e.sum()

    i = (h * g) / ((h - 1) * (h - 2))

    print('skewness=', i)
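
As with the kurtosis example, this is the adjusted Fisher-Pearson sample skewness that pandas exposes as DataFrame.skew(). A minimal cross-check sketch, again with a placeholder CSV file 'data.csv' and column 'x':

import pandas as pd

# 'data.csv' and ['x'] are placeholder inputs for illustration only.
frame = pd.read_csv('data.csv', usecols=['x'])
n = len(frame)
z = (frame - frame.mean()) / frame.std()
manual = n * (z ** 3).sum() / ((n - 1) * (n - 2))
print('manual:', manual.values)
print('pandas:', frame.skew().values)  # should agree up to floating-point error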
Example #6
def summarize_he( analytical_sets ):

    results = {}
    he = {}

    for analytical_set in analytical_sets:
        he[analytical_set.label] = calculate_he(analytical_set.allele_df)

    he_df = DataFrame( he )
    labels = list(he_df.columns)
    if len(labels) == 2:
        # two groups: use the Wilcoxon signed-rank (paired) test
        results['test'] = 'Wilcoxon test (paired)'
        results['stats'] = wilcoxon( he_df[labels[0]], he_df[labels[1]])

    elif len(labels) > 2:
        # use Kruskal Wallis
        results['test'] = 'Kruskal-Wallis test'
        results['stats'] = kruskal( * [he_df[x] for x in labels])
        results['warning'] = ''

    results['data'] = he_df
    results['mean'] = he_df.mean()
    results['stddev'] = he_df.std()
    #raise RuntimeError

    return results
    def get_topwords(self, countries, thresh=10, tf_idf=False):
        # Collect the rows for the requested countries, then rank terms by mean.
        rows = []
        for r in range(len(self.df)):
            if self.df.loc[r, 'country_id'] in countries:
                if tf_idf:
                    rows.append(self.tf_idf.loc[r, :])
                else:
                    rows.append(self.df.loc[r, :])

        tw = DataFrame(rows)
        return tw.mean().sort_values(ascending=False)[:thresh]
Example #8
def pd_03():
    df = DataFrame(np.random.randn(6, 3))
    df.iloc[2:, 1] = np.nan
    df.iloc[4:, 2] = np.nan
    print(df)
    print(df.fillna(method='ffill'))
    print(df.fillna(method='ffill', limit=2))
    data = Series([1., None, 3.5, None, 7])
    print(data.fillna(data.mean()))
    print(df.fillna(df.mean()))
Example #9
def combine_spread(file_set, shift, drop_return_data=False):
    """
    Combine the spread of input files, return with mean and standard
    deviation calculated.

    """

    data = []
    values = {}
    for val in ('left', 'right', 'com', 'dist', 'radius', 'diameter'):
        values[val] = {}

    # Collect data from all files into dictionaries
    for i, _file in enumerate(file_set):
        data.append(Spread().read(_file))
        for val in values.keys():
            values[val][i] = Series(
                    data=data[i].spread[val]['val'],
                    index=data[i].times
                    )
        data[i].times = (np.array(data[i].times) - shift[i])

    spread = Spread()
    spread.spread['num'] = len(file_set)

    for val in values.keys():

        # Shift time as per synchronisation
        for i in values[val]:
            values[val][i].index = np.array(values[val][i].index) - shift[i]

        # Convert to DataFrame
        df = DataFrame(data=values[val])

        # If not a single file, keep only indices with at least two non-NaN
        if len(file_set) > 1:
            df = df.dropna()

        # If return data dropped, fill data here
        if drop_return_data:
            for i in df.columns:
                data[i].spread[val]['val'] = df[i].tolist()

        # Get times, mean and standard deviation as lists
        mean = list(df.mean(axis=1))
        std_error = list(df.std(axis=1))
        times = list(df.index)

        # Add to Spread object
        spread.spread[val]['val'] = mean
        spread.spread[val]['std'] = std_error
        spread.spread['times'] = times

    return spread, data
Example #10
def moments_features(path):
    if not os.path.exists(path):
        logger.error(path + " does not exist!")
        return
    im = cv2.imread(path)
    [b, g, r] = cv2.split(im)
    moments = []
    for n in [b, g, r]:
        df = DataFrame(np.array(n.flatten()))
        moments.extend(float(x) for x in [df.mean()[0], df.std()[0], df.skew()[0]])
    return moments
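
A minimal usage sketch: the function returns nine color moments, the per-channel mean, standard deviation, and skewness for B, G, and R. The image path below is a placeholder.

# 'sample.jpg' is a placeholder path; result order is
# [b_mean, b_std, b_skew, g_mean, g_std, g_skew, r_mean, r_std, r_skew].
features = moments_features('sample.jpg')
if features is not None:
    print(len(features))  # 9
    print(features)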
Example #11
    def test_replace_series_dict(self):
        # from GH 3064
        df = DataFrame({'zero': {'a': 0.0, 'b': 1}, 'one': {'a': 2.0, 'b': 0}})
        result = df.replace(0, {'zero': 0.5, 'one': 1.0})
        expected = DataFrame(
            {'zero': {'a': 0.5, 'b': 1}, 'one': {'a': 2.0, 'b': 1.0}})
        assert_frame_equal(result, expected)

        result = df.replace(0, df.mean())
        assert_frame_equal(result, expected)

        # series to series/dict
        df = DataFrame({'zero': {'a': 0.0, 'b': 1}, 'one': {'a': 2.0, 'b': 0}})
        s = Series({'zero': 0.0, 'one': 2.0})
        result = df.replace(s, {'zero': 0.5, 'one': 1.0})
        expected = DataFrame(
            {'zero': {'a': 0.5, 'b': 1}, 'one': {'a': 1.0, 'b': 0.0}})
        assert_frame_equal(result, expected)

        result = df.replace(s, df.mean())
        assert_frame_equal(result, expected)
Example #12
def BackTestSignal(dfXAlpha, dfXReturn, XPrice, strategy, riskmgr=None, freq = 252):
    dfAlphaWeight  = strategy.GenSingleAlphaWeight(dfXAlpha)
    if riskmgr is not None:
        dfAlphaWeight = riskmgr.AdjustAlphaWeight(dfAlphaWeight)
    dfSignalReturn = GenSingleFactorReturn(dfAlphaWeight, dfXReturn)
    
    # (simple_sharpe, geo_sharpe, sim_mean * N, geo_mean * N, vol)
    sharpe = CalcSharpeRatio(dfSignalReturn['Return'], freq)
    
    # Detailed Data
    dfLongCount = DataFrame(columns=['LongCount'], data=dfAlphaWeight.apply(lambda s: s[s>0].count(), axis=1))
    dfShortCount = DataFrame(columns=['ShortCount'], data=dfAlphaWeight.apply(lambda s: s[s<0].count(), axis=1))
    dfLongExposure = DataFrame(columns=['LongExposure'], data=dfAlphaWeight.apply(lambda s: s[s>0].sum(), axis=1))
    dfShortExposure = DataFrame(columns=['ShortExposure'], data=dfAlphaWeight.apply(lambda s: s[s<0].sum(), axis=1))
    dfNetExposure = DataFrame(columns=['NetExposure'], data=dfAlphaWeight.apply(sum, axis=1))
    dfTotalDollarInvest = DataFrame(columns=['I'], data=dfAlphaWeight.apply(lambda s: abs(s).sum(), axis=1))
    dfTotalDollarTraded = DataFrame(columns=['D'], data=(dfAlphaWeight - dfAlphaWeight.shift(1)).apply(lambda s: abs(s).sum(), axis=1))
    dfSharesTraded = dfAlphaWeight / XPrice
    dfTotalSharesTraded = DataFrame(columns=['Q'], data=(dfSharesTraded - dfSharesTraded.shift(1)).apply(lambda s: abs(s).sum(), axis=1))
    
    TurnOver = dfTotalDollarTraded.mean()[0]/dfTotalDollarInvest.mean()[0]
    CentsPerShare = 100 * dfSignalReturn['Return'].iloc[1:].mean() / dfTotalSharesTraded.mean()[0]
    
    dfMetrics = DataFrame(list(sharpe)).T
    dfMetrics.columns = ['Simple Sharpe', 'Geo. Sharpe', 'Simple Mean', 'Geo. Mean', 'Annual Vol']
    dfMetrics['Turnover'] = TurnOver
    dfMetrics['CentsPerShare'] = CentsPerShare
    dfMetrics['AvgHolding'] = 1.0/TurnOver
    dfMetrics.index = [dfXAlpha.index.name]
    
    dfSignalReturn = pd.merge(dfSignalReturn, dfLongCount,left_index=True,right_index=True,how='outer')
    dfSignalReturn = pd.merge(dfSignalReturn, dfShortCount,left_index=True,right_index=True,how='outer')
    dfSignalReturn = pd.merge(dfSignalReturn, dfLongExposure,left_index=True,right_index=True,how='outer')
    dfSignalReturn = pd.merge(dfSignalReturn, dfShortExposure,left_index=True,right_index=True,how='outer')
    dfSignalReturn = pd.merge(dfSignalReturn, dfNetExposure,left_index=True,right_index=True,how='outer')
    dfSignalReturn = pd.merge(dfSignalReturn, dfTotalDollarInvest,left_index=True,right_index=True,how='outer')
    dfSignalReturn = pd.merge(dfSignalReturn, dfTotalDollarTraded,left_index=True,right_index=True,how='outer')
    dfSignalReturn = pd.merge(dfSignalReturn, dfTotalSharesTraded,left_index=True,right_index=True,how='outer')    
    
    return dfMetrics, dfSignalReturn, dfAlphaWeight
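
For reference, the turnover metrics computed above can be written out explicitly: with w_{i,t} the dollar weight of security i on date t and q_{i,t} = w_{i,t} / price_{i,t} the implied share count,

    TurnOver      = mean_t( sum_i |w_{i,t} - w_{i,t-1}| ) / mean_t( sum_i |w_{i,t}| )
    AvgHolding    = 1 / TurnOver
    CentsPerShare = 100 * mean_t( Return_t ) / mean_t( sum_i |q_{i,t} - q_{i,t-1}| )

These restate the D, I, and Q aggregates built in the code (the return mean skips the first date).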
    def test_align_int_fill_bug(self):
        # GH #910
        X = np.arange(10 * 10, dtype='float64').reshape(10, 10)
        Y = np.ones((10, 1), dtype=int)

        df1 = DataFrame(X)
        df1['0.X'] = Y.squeeze()

        df2 = df1.astype(float)

        result = df1 - df1.mean()
        expected = df2 - df2.mean()
        assert_frame_equal(result, expected)
def stndize(path, columns):
    # Standardize the selected columns: (x - mean) / std.
    w = pd.read_csv(path, usecols=columns)
    frame = DataFrame(w)

    t = frame.mean()
    print(t)
    z = frame.std()
    print(z)
    print((w - t) / z)

    return
Example #15
    def test_common_start_returns(self, before, after, mean_by_date, demeaned,
                                  expected_vals):
        dr = date_range(start='2015-1-17', end='2015-2-2')
        dr.name = 'date'
        tickers = ['A', 'B', 'C', 'D']
        r1, r2, r3, r4 = (1.20, 1.40, 0.90, 0.80)
        prices = DataFrame(index=dr, columns=tickers,
                           data=[[r1**1, r2**1, r3**1, r4**1],
                                 [r1**2, r2**2, r3**2, r4**2],
                                 [r1**3, r2**3, r3**3, r4**3],
                                 [r1**4, r2**4, r3**4, r4**4],
                                 [r1**5, r2**5, r3**5, r4**5],
                                 [r1**6, r2**6, r3**6, r4**6],
                                 [r1**7, r2**7, r3**7, r4**7],
                                 [r1**8, r2**8, r3**8, r4**8],
                                 [r1**9, r2**9, r3**9, r4**9],
                                 [r1**10, r2**10, r3**10, r4**10],
                                 [r1**11, r2**11, r3**11, r4**11],
                                 [r1**12, r2**12, r3**12, r4**12],
                                 [r1**13, r2**13, r3**13, r4**13],
                                 [r1**14, r2**14, r3**14, r4**14],
                                 [r1**15, r2**15, r3**15, r4**15],
                                 [r1**16, r2**16, r3**16, r4**16],
                                 [r1**17, r2**17, r3**17, r4**17]])
        dr2 = date_range(start='2015-1-21', end='2015-1-29')
        factor = DataFrame(index=dr2, columns=tickers,
                           data=[[3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1]]).stack()
        factor.index = factor.index.set_names(['date', 'asset'])
        factor.name = 'factor'

        cmrt = common_start_returns(
            factor,
            prices,
            before,
            after,
            False,
            mean_by_date,
            factor if demeaned else None)
        cmrt = DataFrame({'mean': cmrt.mean(axis=1), 'std': cmrt.std(axis=1)})
        expected = DataFrame(index=range(-before, after + 1),
                             columns=['mean', 'std'], data=expected_vals)
        assert_frame_equal(cmrt, expected)
def calc_kelly_leverages(securities: Set[str],
                         start_date: date,
                         end_date: date,
                         risk_free_rate: float = 0.04) -> Dict[str, float]:
    """Calculates the optimal leverages for the given securities and
    time frame. Returns a list of (security, leverage) tuple with the
    calculate optimal leverages.

    Note: risk_free_rate is annualized
    """
    f = {}
    ret = {}
    excess_return = {}

    # Download the historical prices from Yahoo Finance and calculate the
    # excess return (return of security - risk free rate) for each security.
    for symbol in securities:
        try:
            hist_prices = get_historical_data(
                symbol, start=start_date, end=end_date,
                output_format='pandas')
        except IOError as e:
            raise ValueError(f'Unable to download data for {symbol}. '
                             f'Reason: {str(e)}')

        f[symbol] = hist_prices

        ret[symbol] = hist_prices['close'].pct_change()
        # risk_free_rate is annualized
        excess_return[symbol] = (ret[symbol] - (risk_free_rate / 252))

    # Create a new DataFrame based on the Excess Returns.
    df = DataFrame(excess_return).dropna()

    # Calculate the CoVariance and Mean of the DataFrame
    C = 252 * df.cov()
    M = 252 * df.mean()

    # Calculate the Kelly-Optimal Leverages using Matrix Multiplication
    F = inv(C).dot(M)

    # Return a dict mapping each security to its leverage
    return {security: leverage
            for security, leverage in zip(df.columns.values.tolist(), F)}
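
For reference, the matrix step above is the continuous Kelly solution: with C = 252 * cov(excess returns) and M = 252 * mean(excess returns),

    F = C^{-1} M

and each security's optimal leverage is the corresponding entry of F.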
Example #17
    def test_ops(self):

        # tst ops and reversed ops in evaluation
        # GH7198

        # smaller hits python, larger hits numexpr
        for n in [4, 4000]:

            df = DataFrame(1, index=range(n), columns=list('abcd'))
            df.iloc[0] = 2
            m = df.mean()

            for op_str, op, rop in [('+', '__add__', '__radd__'),
                                    ('-', '__sub__', '__rsub__'),
                                    ('*', '__mul__', '__rmul__'),
                                    ('/', '__truediv__', '__rtruediv__')]:

                base = (DataFrame(np.tile(m.values, n)  # noqa
                                  .reshape(n, -1),
                                  columns=list('abcd')))

                expected = eval("base{op}df".format(op=op_str))

                # ops as strings
                result = eval("m{op}df".format(op=op_str))
                assert_frame_equal(result, expected)

                # these are commutative
                if op_str in ['+', '*']:
                    result = getattr(df, op)(m)
                    assert_frame_equal(result, expected)

                # these are not
                elif op_str in ['-', '/']:
                    result = getattr(df, rop)(m)
                    assert_frame_equal(result, expected)

        # GH7192
        df = DataFrame(dict(A=np.random.randn(25000)))
        df.iloc[0:5] = np.nan
        expected = (1 - np.isnan(df.iloc[0:25]))
        result = (1 - np.isnan(df)).iloc[0:25]
        assert_frame_equal(result, expected)
def cross_validate_trades(trades, N = 20, subset_fraction = 0.7):
    
    tickers = trades.tickers
    sample_size = round(len(tickers) * subset_fraction)
    summary = DataFrame(dtype = float)

    for n in range(N):
        sample_tickers = list(random.choice(tickers, sample_size, replace = False))
        trade_subset = trades.find(lambda T: T.ticker in sample_tickers)
        summary[n] = summary_report(trade_subset)

    result = DataFrame(dtype = float)
    result['Base'] = summary_report(trades)
    result['Mean'] = summary.mean(axis = 1)
    result['Std'] = summary.std(axis = 1)
    result['Median'] = summary.median(axis = 1)
    result['Max'] = summary.max(axis = 1)
    result['Min'] = summary.min(axis = 1)

    return (result, summary)
Example #19
def avg_medal_count():
    '''
    Using the dataframe's apply method, create a new Series called 
    avg_medal_count that indicates the average number of gold, silver,
    and bronze medals earned amongst countries who earned at 
    least one medal of any kind at the 2014 Sochi olympics.  Note that
    the countries list already only includes countries that have earned
    at least one medal. No additional filtering is necessary.
    
    You do not need to call the function in your code when running it in the
    browser - the grader will do that automatically when you submit or test it.
    '''

    countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
                 'Netherlands', 'Germany', 'Switzerland', 'Belarus',
                 'Austria', 'France', 'Poland', 'China', 'Korea', 
                 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
                 'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
                 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]
    
    olympic_medal_counts = {'country_name':countries,
                            'gold': Series(gold),
                            'silver': Series(silver),
                            'bronze': Series(bronze)}
    df = DataFrame(olympic_medal_counts)
    
    # YOUR CODE HERE
    #df['average_medal_count'] = df.mean(axis=1)
    #avg_medal_count_by_country = df[['country_name','average_medal_count']]
    avg_medal_count = df.mean()
    # Or, we could do it this way
    avg_medal_count = df[['gold', 'silver', 'bronze']].apply(numpy.mean)
    print(avg_medal_count)
    
    return avg_medal_count
Example #20
class GetGenes(object):

	def __init__(self, data):
		self.dataframe = DataFrame(data)

	# read a text file and return a data frame. Records should be separated by TAB
	# There should not be duplicate column names
	def import_file(self, filename):
		# this function use to convert string to float
		def convert(x):
			try:
				x = float(x)
			except ValueError:
				pass
			return(x)

		table = []
		for line in open(filename):
			if(line.strip()):	# If not empty line
				line = line.rstrip('\n').split('\t')
				line = list(map(convert, line))
				table.append(line)
		self.dataframe = DataFrame(table[1:],columns=table[0])
		return

	def houseKeepingGenes(self, geneNum):
		# compute the CV of data
		std = array(self.dataframe.std(axis = 1))
		mean = array(self.dataframe.mean(axis = 1))
		CV = std/mean
		CV = list(map(abs, CV))		# convert to positive number

		# get the first N minimum values
		mins = nsmallest(geneNum, CV)
		print("The GOOD genes are:\n")
		for item in mins:
			print(self.dataframe.iloc[CV.index(item), 0])
		return
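
A minimal usage sketch, assuming a tab-separated file whose first row holds column names and whose first column holds gene identifiers; 'expression.txt' and the gene count are placeholders:

genes = GetGenes([])                 # start with an empty frame
genes.import_file('expression.txt')  # placeholder TAB-separated expression table
genes.houseKeepingGenes(10)          # print the 10 genes with the smallest |CV| = |std/mean|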
Example #21
import numpy as np
from pandas import DataFrame, Series
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats

df=DataFrame(abs(np.random.randn(30).reshape(6,5))*100)

plt.bar(np.arange(len(df.mean())), df.mean(),
        align='center',
        color='white',
        linewidth=1.5)
# plt.hold was removed from newer matplotlib; repeated plotting calls already
# draw on the same axes, so no call is needed here.
plt.errorbar(np.arange(len(df.mean())), df.mean(), df.std(),
             elinewidth=1.2,
             capsize=7.5,
             fmt='none')


plt.show()
Example #22
    def run(self, Model='svc', kernel='linear', is_cross_validation=True,
            cross_validationMethod='LOO', DecompositionMethod='PCA',
            decompositionLevel='FeatureType', n_components=30,
            FeatureSelection='TopExplainedVarianceComponents', n_features=10,
            isPerm=0, isBetweenSubjects=True, isConcatTwoLabels=False,
            isSaveCsv=None, isSavePickle=None, isSaveFig=None,
            isSelectSubFeatures=False, SubFeatures='ExpressionLevel'):
        # -- TODO :
        # --  # Greedy selection on features + other feature selection types...
        # --  # Make sure 'best' features are chosen based only on train data!!!
        # --  # Keep a list of n_train, n_test from each Label and scoring (accuracy, f1..) in each cross validation iteration
        # --  # Plot results summary (see CARS paper for desired results for Ein Gedi Poster 22-1-2015)
        # --  # remove irrelevant data using 'Tracking Success' and consider 'TimeStamps' for feature calculation
        # --  # add feature analysis by facial part (see excel)
        # --  # select best model (svm, otherwise ridge regression)
        # --  # compare svc results with regression results (using LOO and different params for regression - params for unbalanced data, different kernels, etc.), model evaluation - http://scikit-learn.org/stable/modules/model_evaluation.html)
        # --  # check how the model weights behave - feature selection analysis
        # --  # calc model error
        # --  # divide data to subparts for training and testing - try within/between subject, and analyze distribution of features when data is divided
        # --  # LOO - also on bool labels (patients vs controls and mental status bool)
        # --  # add mental status rank scores (0-4)
        # --  # make sure p-val returns the right value in 'scores'
        # --  # run it over random data (permutation test)
        # --  # continue here - check regression results - make sure regression works (not so good).. check what happens in svc for G7 (high train R, negative test R)

        ## init 
        if isSelectSubFeatures:
            print('Features : ' + SubFeatures)
            f=self.FeaturesDF.copy()
            featureNames=self.FeaturesDF.index.names
            try:
               f=f.loc[SubFeatures]
               f.index=MultiIndex.from_product([[SubFeatures],f.index], names=featureNames)
            except KeyError:
               f.index=f.index.swaplevel(0,1)
               f=f.loc[SubFeatures]
               f.index=MultiIndex.from_product([f.index,[SubFeatures]], names=featureNames)
            self.FeaturesDF=f.copy()
        else:
            SubFeatures='allFeatureTypes'

        FeatureTypeList=[j for j in tuple(self.FeaturesDF.index)]
        self.FullResults=DF()
           
        # set learning params (cross validation method, and model for learning)
        isBoolLabel=self.LabelsObject.isBoolLabel
        isBoolScores=isBoolLabel
        if DecompositionMethod==None and (FeatureSelection == 'TopExplainedVarianceComponents' or FeatureSelection == 'TopNComponents'):
            print("ERROR- feature selection method cannot be '"+ FeatureSelection +"' when X is not decomposed")
            FeatureSelection=raw_input("Choose a different feature selection method ('RFE','f_regression','dPrime','AllFeatures'): ")

        model, isBoolModel= learningUtils.setModel(Model)
        selectFeatures =learningUtils.setFeatureSelection(FeatureSelection,n_features)
        n_components=min(n_components,n_features) # cannot have more components than features
        decompositionTitle, decomposeFunction= learningUtils.setDecomposition(DecompositionMethod,n_components,decompositionLevel)
        isDecompose=  decompositionTitle!='noDecomposition'


        # save learning params
        self.Learningdetails={'Model':Model,'Kernel':kernel,'CrossVal':cross_validationMethod,'FeatureSelection':FeatureSelection,'Decomposition':decompositionTitle,'LabelBy':self.Details['LabelDetails'].keys()[0],'FeatureMethod':self.Details['FeatureMethod'],'PieceLength':self.Details['PieceLength']}
        print('\n------------Learning Details------------')
        print(DF.from_dict(self.Learningdetails,orient='index'))
        print('\n----' + cross_validationMethod + ' Cross validation Results:----')
        
        #define global variables over modules (to be used in myUtils)

        globalVars.transformMargins=0#lambda x:x         
        globalVars.isBoolLabel=isBoolLabel
        globalVars.isBoolModel=isBoolModel
        global trainLabels_all, testLabels_all, TrueLabels,isAddDroppedSubjects 
        trainLabels_all, testLabels_all, TrueLabels,isAddDroppedSubjects=labelUtils.initTrainTestLabels_all(self.LabelsObject)
        trainLabels_all2, testLabels_all2, TrueLabels2,isAddDroppedSubjects2=labelUtils.initTrainTestLabels_all(self.LabelsObject2)

        
        LabelingList=trainLabels_all.columns #['N1']
        self.ResultsDF=DF()
        self.BestFeatures=DF(columns=LabelingList) #dict of BestFeaturesDF according to Labeling methods
        YpredictedOverAllLabels=pandas.Panel(items=range(len(trainLabels_all)),major_axis=LabelingList,minor_axis=TrueLabels.index) #panel: items=cv_ind, major=labels, minor=#TODO 
       
                                              
        ## Create train and test sets according to LabelBy, repeat learning each time on different Labels from LabelingList
        
        isMultivarLabels=False      
        LabelingIndex=enumerate(LabelingList)
        if isMultivarLabels:
            LabelingIndex=enumerate([LabelingList])

        for label_ind, Labeling in LabelingIndex:
            """if isPerm: #TODO - fix this to work with continous / bool data
                try:
                    trainLabels=self.LabelsObject.permedLabelsDF[Labeling]
                except AttributeError:
                    self.LabelsObject.permLabels()
                    trainLabels=self.LabelsObject.permedLabelsDF[Labeling]"""
            #set subjects list according to labels and features
            X,SubjectsList,droppedSubjects,Xdropped=featuresUtils.initX(self.FeaturesDF,trainLabels_all,Labeling)
            X2,SubjectsList2,droppedSubjects2,Xdropped2=featuresUtils.initX(self.FeaturesDF,trainLabels_all2,Labeling,is2=1)
            
            #init train and test labels
            trainLabels, testLabels, LabelRange = labelUtils.initTrainTestLabels(Labeling,SubjectsList,trainLabels_all, testLabels_all)
            trainLabels2, testLabels2, LabelRange2 = labelUtils.initTrainTestLabels(Labeling,SubjectsList2,trainLabels_all2, testLabels_all2)
            
            #make sure only labeled subjects are used for classification
            X=X.query('subject == '+ str(list(trainLabels.index)) ) 
            X.index.get_level_values(X.index.names[0]) 
            SubjectIndex=list(set(X.index.get_level_values('subject')))

            X2=X2.query('subject == '+ str(list(trainLabels2.index)) )  
            X2.index.get_level_values(X2.index.names[0]) 
            SubjectIndex2=list(set(X2.index.get_level_values('subject')))                       
            #init vars
            if isBetweenSubjects:
                cv_param=len(SubjectIndex)
                self.Learningdetails['CrossValSubjects']='between'
                isWithinSubjects=False
            else:
                isWithinSubjects=True
                X=X.swaplevel(0,1)
                PieceIndex=list(set(X.index.get_level_values('Piece_ind')))
                cv_param=len(PieceIndex)
                self.Learningdetails['CrossValSubjects']='within'
            
            self.Learningdetails['NumOfFeatures']=n_features
            
            
            try:
                print('\n**' + Labeling + '**')
            except TypeError:
                print('\n*******')
                print(Labeling)
            
            cv, crossValScores= learningUtils.setCrossValidation(cross_validationMethod,cv_param,trainLabels,isWithinSubjects) 
            
            ## Learning - feature selection for different scoring types, with cross validation - 

            BestFeaturesForLabel=self.BestFeaturesForLabel(FeatureTypeList,LabelingList,n_features) #saves dataframe with best features for each label, for later analysis
            cv_ind=0
            # used for transforming from margins returned by svm to continuous labels (e.g. PANSS)
            trainScores=DF()
            test_index=X.index
            testScores=concat([DF(index=test_index),DF(index=['std_train_err'])])
            testScores2=concat([DF(index=testLabels.index),DF(index=['std_train_err'])]) 
            testProbas=DF(index=X.index)
            testProbas2=DF(index=SubjectIndex)

            #impt=Imputer(missing_values='NaN', strategy='median', axis=0)

            globalVars.LabelRange=LabelRange

            ModelWeights1=DF(columns=range(len(cv)),index=X.columns)
            Components=pandas.Panel(items=range(len(cv)),major_axis=X.columns,minor_axis=range(n_features)) #todo fix this for 1st and second learning
            ExplainedVar=DF(columns=range(len(cv)))
            ModelWeights2=DF(columns=range(len(cv)))
            bestNfeaturesPanel=Panel(items=LabelingList,minor_axis=range(len(cv)),major_axis=range(n_features))
            
            #bestNfeaturesPanel=Panel(items=LabelingList,major_axis=range(len(cv)),minor_axis=MultiIndex.from_tuples(('a','b')))
            

            for train, test in cv:

                if not is_cross_validation:
                   train=np.append(train,test)
                   #test=np.append(train,test)
                   self.Learningdetails['CrossVal']='NONE'
                   #if cv_ind>0:
                    #    break

                if isBetweenSubjects:
                    #set X and Y
                    train_subjects=trainLabels.iloc[train].index
                    test_subjects=testLabels.iloc[test].index 
                    Xtrain,Xtest, Ytrain, YtrainTrue, Ytest=learningUtils.setXYTrainXYTest(X,Labeling,trainLabels,testLabels,TrueLabels,train_subjects,test_subjects)
                    Xtrain2,Xtest2, Ytrain2, YtrainTrue2, Ytest2=learningUtils.setXYTrainXYTest(X2,Labeling,trainLabels2,testLabels2,TrueLabels2,train_subjects,test_subjects)

                    
                    if isConcatTwoLabels: #used when there is more than one doctor
                        Xtrain=concat([Xtrain,Xtrain2])
                        Xtest=concat([Xtest,Xtest2])
                        Ytrain=concat([Ytrain,Ytrain2])
                        YtrainTrue=concat([YtrainTrue,YtrainTrue2])
                        Ytest=concat([Ytest,Ytest2])
                        Xdropped=concat([Xdropped,Xdropped2])
                        SubjectsList=list(set(SubjectsList).intersection(set(SubjectsList2)))
                        droppedSubjects=list(set(droppedSubjects).union(set(droppedSubjects2)).difference(set(SubjectsList)))#diff from SubjectsList to make sure no subjects are both in train and test.
                 

                    #select N best features:
                    Xtrain, Xtest, bestNfeatures, components, explainedVar = learningUtils.decomposeAndSelectBestNfeatures(Xtrain,Xtest,Ytrain,n_features,selectFeatures,decomposeFunction)
                    BestFeaturesForLabel.add(bestNfeatures) #todo - delete this??  
                    bestNfeaturesPanel[Labeling][cv_ind]=bestNfeatures   
                    """for  feature_ind,feature_name in enumerate(bestNfeatures):
                         
                         try:
                            bestNfeaturesPanel[Labeling][feature_name].loc[cv_ind]=feature_ind
                        except KeyError:
                            bestNfeaturesPanel[Labeling].columns=bestNfeaturesPanel[Labeling].columns.append(feature_name)#continue here!! use 
                            bestNfeaturesPanel[Labeling][feature_name].loc[cv_ind]=feature_ind



                    [bestNfeatures].iloc[cv_ind]=range(len(bestNfeatures))"""
                    #train 1 
                    TrainModel=model
                    TrainModel.fit(Xtrain.sort_index(),Ytrain.T.sort_index())
                    """try:
                        #Components[cv_ind]=components.T
                        #ExplainedVar[cv_ind]=explainedVar
                        isDecompose=True"""
                    if cv_ind==0:
                        ModelWeights1=DF(columns=range(len(cv)),index=range(len(bestNfeatures)))    
                    ModelWeights1[cv_ind]=TrainModel.coef_.flatten()
                  
                    #get ROC scores without cross validation:
                                           
                    #train 2
                    if isBoolLabel:
                       PiecePrediction_train=DF(TrainModel.predict_proba(Xtrain).T[1],index=Xtrain.index,columns=['prediction'])
                       TrainModel2=svm.SVC(kernel='linear', probability=True,class_weight={0:1,1:1})
                    else:
                       PiecePrediction_train=DF(TrainModel.decision_function(Xtrain),index=Xtrain.index,columns=['prediction'])
                       TrainModel2=linear_model.LinearRegression()

                    Xtrain2, Ytrain2, YtrainTrue2=learningUtils.getX2Y2(Xtrain,Ytrain,YtrainTrue,PiecePrediction_train, isBoolLabel)                 
                    TrainModel2.fit(Xtrain2, Ytrain2)
                    if cv_ind==0:
                        ModelWeights2=DF(columns=range(len(cv)),index= Xtrain2.columns)
                    ModelWeights2[cv_ind]=TrainModel2.coef_.flatten()         

                              
                    #test 1
                    if isAddDroppedSubjects: #take test subjects from cv + subjects that were dropped for labeling used for test
                        if isDecompose:
                            dXdropped=DF(decomposeFunction(Xdropped).values,index=Xdropped.index)
                        XtestDropped=dXdropped[bestNfeatures]
                        YtestDropped=Series(XtestDropped.copy().icol(0))
                        #YTrueDropped=Series(Xdropped.copy().icol(0))
                        for subject in droppedSubjects:
                            YtestDropped[subject]=testLabels_all[Labeling].loc[subject]
                            #YTrueAll.loc[subject]=TrueLabels[Labeling].loc[subject]
                        Ytest=concat([Ytest,YtestDropped]).sort_index()
                        Xtest=concat([Xtest,XtestDropped]).sort_index()


                    if isPerm: #TODO- Check this!!
                        Ytest=y_perms.loc[Ytest.index]
                    Xtest=Xtest.fillna(0.)
                    
                    
                elif isWithinSubjects:
                    #train 1
                    train_pieces=PieceIndex[train]
                    test_pieces=PieceIndex[test] #TODO - make sure that if test/train> piece index, it ignores it and repeate the process
                    
                    XtrainAllFeatures=X.query('Piece_ind == '+ str(list(train_pieces)))
                    Ytrain=Series(index=X.index)
                    Ytest=Series(index=X.index)
                    YtrainTrue=Series(index=X.index)
                    
                    for subject in PieceIndex: 
                        for piece in train_pieces:
                            Ytrain.loc[piece].loc[subject]=trainLabels[subject]
                            YtrainTrue.loc[piece].loc[subject]=TrueLabels[Labeling].loc[subject] 
                            Ytest.loc[piece].loc[subject]=testLabels[subject]   
                    Ytrain=Ytrain.dropna()
                    YtrainTrue=YtrainTrue.dropna() 
                    for subject in test_subjects:
                        Ytest.loc[piece].loc[subject]=testLabels[subject]
                #train scores 1       
                if cv_ind==0:
                    trainScores,YtrainPredicted=learningUtils.getTrainScores(Ytrain,Xtrain,YtrainTrue,TrainModel)
                    plt.figure(1)
                    if len(LabelingList)>1:
                        plt.subplot(round(len(LabelingList)/2),2,label_ind+1)
                    if isBoolLabel:
                        testScores,testProbas=learningUtils.getTestScores(Ytest,Xtest,TrainModel)
                    else:
                        testScores[cv_ind],testProbas=learningUtils.getTestScores(Ytest,Xtest,TrainModel)
                        plt.title(Labeling,fontsize=10)
                else:
                    plt.figure(3)
                    new_trainScores,YtrainPredicted=learningUtils.getTrainScores(Ytrain,Xtrain,YtrainTrue,TrainModel)
                    trainScores=concat([trainScores,new_trainScores],axis=1)
                #test 1   
                    testScores[cv_ind],testProbas_new=learningUtils.getTestScores(Ytest,Xtest,TrainModel)
                    testProbas=concat([testProbas,testProbas_new])
                
                #train2

                if isBoolLabel:
                    PiecePrediction_test=DF(TrainModel.predict_proba(Xtest).T[1],index=Xtest.index,columns=['prediction'])
                else:
                    PiecePrediction_test=DF(TrainModel.decision_function(Xtest),index=Xtest.index,columns=['prediction'])
                Xtest2, Ytest2 , YtestTrue2 =learningUtils.getX2Y2(Xtest,Ytest,Ytest,PiecePrediction_test,isBoolLabel)
                
                if cv_ind==0:
                    trainScores2,YtrainPredicted2=learningUtils.getTrainScores(Ytrain2,Xtrain2,YtrainTrue2,TrainModel2)
                    YpredictedOverAllLabels[cv_ind].loc[Labeling]=YtrainPredicted2
                    #plt.figure(1)
                    #if len(LabelingList)>1:
                        #plt.subplot(round(len(LabelingList)/2),2,label_ind+1)
                #test2
                    if isBoolLabel:
                        testScores2,testProbas2=learningUtils.getTestScores(Ytest2,Xtest2,TrainModel2)
                    else:
                        testScores2[cv_ind],testProbas2=learningUtils.getTestScores(Ytest2,Xtest2,TrainModel2)
                    #plt.title(Labeling,fontsize=10)
                else:
                    new_trainScores2,YtrainPredicted2=learningUtils.getTrainScores(Ytrain2,Xtrain2,YtrainTrue2,TrainModel2)
                    YpredictedOverAllLabels[cv_ind].loc[Labeling]=YtrainPredicted2
                    trainScores2=concat([trainScores2,new_trainScores2],axis=1)
                    if len(Xtest2)>0: # if there is more than one segment for subject
                        testScores2[cv_ind],testProbas2_new=learningUtils.getTestScores(Ytest2,Xtest2,TrainModel2)     
                        testProbas2=concat([testProbas2,testProbas2_new])
                cv_ind+=1

                #crossValScores=crossValScores.append(CVscoresDF,ignore_index=True) #information about entire train test data. 
            fig2=plt.figure(2)
            if len(LabelingList)>1:
                plt.subplot(round(len(LabelingList)/2),2,label_ind+1)
            #if isAddDroppedSubjects:
               # testLabelsSummary=testLabels_all[Labeling].loc[AllSubjects]
           # else:
               # testLabelsSummary=testLabels
            scoresSummary,rocDF = learningUtils.getScoresSummary(trainScores2,testScores2,testProbas2,TrueLabels[Labeling])

            # reset global vars
            globalVars.fitYscale='notDefined'
            globalVars.beta=DF()

            plt.title(Labeling,fontsize=10)
            plt.xlabel('Ytrue',fontsize=8)
            plt.ylabel('Ypredicted',fontsize=8)
            plt.tick_params(labelsize=6)
            #print(crossValScores.T)    
            scores=scoresSummary.fillna(0.)
            
            #analyze feature weights             
            ModelWeights1=ModelWeights1.dropna(how='all')
            WeightedFeatures1_index0=analysisUtils.getFeaturesWeights(0,bestNfeaturesPanel[Labeling],ModelWeights1) #FeatureAnalysisIndex=0 for featureType, 1= au's (if not decomposed) or component rank (if decomposed)
            WeightedFeatures1_index1=analysisUtils.getFeaturesWeights(1,bestNfeaturesPanel[Labeling],ModelWeights1)
            WeightedFeatures1=concat([DF(index=['-------(A) Index0-------']),WeightedFeatures1_index0,DF(index=['-------(B) Index1 -------']),WeightedFeatures1_index1])
            
            WeightedFeatures2=DF(ModelWeights2.mean(axis=1)).fillna(0)
            #WeightedFeatures2=DF([ModelWeights2.mean(axis=1),ModelWeights2.std(axis=1)],index=['mean','std']).T.fillna(0)
            BestFeatures=concat([DF(index=['------------- Learning 1 -------------']),WeightedFeatures1,DF(index=['------------- Learning 2 -------------']),WeightedFeatures2])
            self.BestFeatures[Labeling]=Series(BestFeatures.values.flatten(),index=BestFeatures.index)

            #analyze decomposition
            if isDecompose:
                Components_mean = Components.mean(axis=0)
                Components_std = Components.std(axis=0)
                normalize=lambda df:DF(StandardScaler().fit_transform(df.T).T,index=df.index,columns=df.columns) 

                """#componentsMeanFeatureType=normalize(Components.mean(axis=1,level='FeatureType'))
                #componentsMeanFeatureTypeABS=normalize(componentsDF.abs().mean(axis=1,level='FeatureType'))
                #componentsMeanFSsignal=normalize(componentsDF.mean(axis=1,level='fs-signal'))
                #componentsMeanFSsignalABS=normalize(componentsDF.abs().mean(axis=1,level='fs-signal'))
                #ExplainedVar_mean = DF(ExplainedVar.mean(axis=1)).T#todo- check!
                #ExplainedVar_mean.index=['ExplainedVar_mean']
                #ExplainedVar_std = DF(ExplainedVar.std(axis=1)).T#todo- check!
                #ExplainedVar_std.index=['ExplainedVar_std']
                #componentsToCSV=concat([DF(index='---meanFeatureType----'),componentsMeanFeatureType,DF(index='---meanFeatureType - abs ----'),componentsMeanFeatureTypeABS,DF(index='---mean fs-signal ----'),componentsMeanFSsignal,DF(index='---mean fs-signal - abs ----'),componentsMeanFSsignalABS])
                try:
                    self.LabelComponents[Labeling]=concat([DF(index=['---components mean---']),Components_mean,ExplainedVar_mean,DF(index=['---components std over cross validation---']),Components_std,ExplainedVar_std])
                except AttributeError:
                    self.LabelComponents=dict.fromkeys(LabelingList)
                    self.LabelComponents[Labeling]=concat([DF(index=['---components mean---']),Components_mean,ExplainedVar_mean,DF(index=['---components std over cross validation---']),Components_std,ExplainedVar_std])"""

                """print(Components_mean)
                print(ExplainedVar_mean)
                print(WeightedFeatures1)"""

                        
            #BestFeaturesForLabel.analyze(ByLevel=0) #TODO change to regression coeff
            LabelFullResults=concat([DF(index=[Labeling]),scores]) 
  
            self.FullResults=concat([self.FullResults,LabelFullResults])            
            self.ResultsDF=concat([self.ResultsDF,DF(scores[0],columns=[Labeling])],axis=1)

            #self.BestFeatures[Labeling]=BestFeaturesForLabel.WeightedMean

            #plt.savefig('C:\\Users\\taliat01\\Desktop\\TALIA\\Code-Python\\Results\\'+Labeling+'png')
        testScores3=pandas.Panel(items=range(len(X2.index))) #for each cv score...
        FullSubjectsList=YpredictedOverAllLabels[0].columns
        YdroppNans=YpredictedOverAllLabels.dropna(axis=0,how='all')
        YdroppNans=YdroppNans.dropna(axis=1,how='all')
        YpredictedOverAllLabels=YdroppNans.dropna(axis=2,how='all')
        notNans_cv_ind=YpredictedOverAllLabels.items
        notNans_trainSubjects=YpredictedOverAllLabels.minor_axis
        notNans_LabelsList=YpredictedOverAllLabels.major_axis
        notNans_TrueLabels=TrueLabels.T[notNans_trainSubjects].loc[notNans_LabelsList]
        cv_ind=0
        for train, test in cv:
            if cv_ind in notNans_cv_ind:
                print(test)
                train=list(set(FullSubjectsList[train]).intersection(set(notNans_trainSubjects)))
                test=list(set(FullSubjectsList[test]).intersection(set(notNans_trainSubjects)))
                if len(train)>0 and len(test)>0: 
                    AllLabelsYTrainPredicted=YpredictedOverAllLabels[cv_ind][train]
                    AllLabelsYTrainPredicted=AllLabelsYTrainPredicted.fillna(0)
                    AllLabelsYTrainTrue=notNans_TrueLabels[train]
                    AllLabelsYTestPredicted=YpredictedOverAllLabels[cv_ind][test]
                    AllLabelsYTestTrue=notNans_TrueLabels[test]

                    pseudoInverse_AllLabelsYTrainTrue=DF(np.linalg.pinv(AllLabelsYTrainTrue),columns=AllLabelsYTrainTrue.index,index=AllLabelsYTrainTrue.columns)
                    global AllLabelsTransformationMatrix
                    AllLabelsTransformationMatrix=DF(AllLabelsYTrainPredicted.dot(pseudoInverse_AllLabelsYTrainTrue),columns=pseudoInverse_AllLabelsYTrainTrue.columns)#change to real code!!
                TrainModel3=lambda y: y.T.dot(AllLabelsTransformationMatrix)
                #testscores3[cv_ind]=learningUtils.getTestScores(AllLabelsYTrainTrue,AllLabelsYTrainPredicted,TrainModel3)
            cv_ind+=1

        self.BestNFeaturesAll=bestNfeaturesPanel 
        self.ResultsDF=self.ResultsDF.fillna(0.)  
        
        ## Print and save results  
        print('\n')
        print(self.ResultsDF)
        print('\n')
        D=self.Learningdetails 
        savePath=resultsPath+'\\'+D['Model']+'_'+D['CrossVal']+'_LabelBy'+D['LabelBy']+ '_FSelection'+FeatureSelection+'_Decompostion'+D['Decomposition']+'PieceSize'+D['PieceLength']+'_'+SubFeatures
        if isPerm:
            savePath=savePath+'_PERMStest'
        saveName=savePath+'\\'+str(n_features)+'_features'        
        self.Learningdetails['saveDir']=savePath
        dir=os.path.dirname(saveName)
        if not os.path.exists(dir):
            os.makedirs(dir)
        if isSavePickle is None:
            isSavePickle=int(raw_input('Save Results to pickle? '))
        if isSaveCsv is None:
            isSaveCsv= int(raw_input('save Results to csv? '))
        if isSaveFig is None:
            isSaveFig=int(raw_input('save Results to figure? '))

       
        if isSavePickle:        
            self.ResultsDF.to_pickle(saveName+'.pickle')
            self.BestFeatures.to_pickle(saveName+'_bestFeatures.pickle')
                
        if isSaveCsv:
            DetailsDF=DF.from_dict(self.Learningdetails,orient='index')
            ResultsCSV=concat([self.ResultsDF,DF(index=['-------Label Details-------']),self.N,DF(index=['-------Learning Details-------']),DetailsDF,DF(index=['-------Selected Features Analysis------']),self.BestFeatures])
            ResultsCSV.to_csv(saveName+'.csv')
            if isBoolLabel:
                ROCfig=learningUtils.save_plotROC(rocDF,isSave=True,saveName=saveName,title=SubFeatures)

        if isSaveCsv or isSavePickle:
            print('successfully saved as:\n' + saveName)
        
        if isSaveFig:
            plt.figure(1)
            plt.savefig(saveName + 'Train.png')
            plt.figure(2)
            plt.savefig(saveName + 'Test.png')
        plt.close()
        plt.close()
    sys.stdout = Logger(title)

if t == 2:
    authors = y_options.get(0)[1]
    recipients = y_options.get(1)[1]
    accuracies = []
    results = {}
    clf, author_score = load_classifier(clfs.get(7), X, y_options.get(0))

    for a in np.unique(authors):
        s_targets = list(compress(recipients, authors == a))
        s_data = list(compress(X, authors == a))
        _, score = classify(clfs.get(c), ("Recipients of {}".format(class_labels[a]), s_targets), s_data)
        accuracies.append(np.mean(score))
        results[a] = dict(zip(np.unique(s_targets), np.atleast_1d(score)))

    df = DataFrame(results).T
    plot_accuracy_matrix(df, class_labels[np.unique(recipients)], class_labels[np.unique(authors)], title)

    df = df.T.fillna(df.mean(axis=1)).T
    plot_accuracy_matrix(df, class_labels[np.unique(recipients)], class_labels[np.unique(authors)], title+" (filled)")
    print("##################################")
    print("Mean Recipient Score: {}".format(np.mean(accuracies)))
    print("##################################")
    print("FINAL SCORE: {}".format(np.mean(author_score * np.mean(accuracies))))
    print("##################################")

else:
    classify(clfs.get(c), y, X, save=save_results if t == 0 else False)

Example #24
# reductions or summary statistics
df = DataFrame([[1.4, np.nan], [7.1, -4.5],
                [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
df
df.sum() # columns sum
df.sum(axis=1) # sum row by row
df
(7.10 - 4.5)/2
df.mean(axis=1, skipna=False)
df
df.idxmax()
df
df.cumsum() # accumulation
df.describe() # multiple summary statistics in one shot.
obj = Series(['a', 'a', 'b', 'c'] * 4)
obj
obj.describe()
## Correlation and Covariance
import pandas.io.data as web
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')
    
price = DataFrame({tic: data['Adj Close']
                   for tic, data in all_data.iteritems()})
Example #25
BPbin= []
for y in frame2.sbp:
    BPbin.append(int((y - min(frame2.sbp))/3))
frame2['bin'] = BPbin
frame2 = frame2[:(len(frame2))-2] #removes trailing incomplete cardiac cycle
print frame2

groupedRR = frame2['RR'].groupby(frame2['bin'])
RRarray = groupedRR.mean() 

groupedSBP = frame2['sbp'].groupby(frame2['bin'])
SBParray = np.asarray(groupedSBP.mean())
print SBParray

bin_weight = groupedSBP.size()/frame2['hb'].max()
frame3 = frame2.mean()


#linear regression
#RR vs SBP
slope, intercept, r_value, p_value, std_err = linregress(SBParray, RRarray)
frame3['BRS slope'] = slope
frame3['R^2'] = r_value**2
print frame3
bestfit = [(i*0.012020)+intercept for i in SBParray]

#plots plots plots plots plots plots plots plots plots plots plots
fig = plt.figure()

#ECG plot
ax1 = fig.add_subplot(2, 1, 1)
# 'Panel' objects are 3D.

wp = Panel( {
    'Item1' : DataFrame(randn(4, 3)),
    'Item2' : DataFrame(randn(4, 2))
} )
pprint( wp )

# There are also 'TimeSeries', 'SparseSeries', and 'SparsePanel' objects.
# In newer versions, there is experimental support for higher-dimensional
# panels.

# Stats can also be performed on Pandas objects.
df = DataFrame( randn( 6, 4 ), columns=[ 'A', 'B', 'C', 'D' ] )
pprint( df )

# You can choose which axis number to perform the operation along.
pprint( df.mean( 0 ) )
pprint( df.mean( 1 ) )

# Much more to Pandas, but that's the basic idea.

# For more information, see:
#   http://pandas.pydata.org/pandas-docs/stable/index.html
# Also, definitely have a look at StatsModels:
#   http://statsmodels.sourceforge.net/
#   http://statsmodels.sourceforge.net/stable/

# <demo> --- stop ---
def main():
    """
    Calculation and aggregation of summary statistics
    """

    # Summary of statistics
    # return is not ndarray
    df = DataFrame([[1.4, np.nan],
                    [7.1, -4.5],
                    [np.nan, np.nan],
                    [0.75, -1.3]],
                   index=list('abcd'),
                   columns=['one', 'two'])
    print df
    print df.sum()
    print df.sum(axis=1)
    print df.mean(axis=1) # exclude nan
    print df.mean(axis=1, skipna=False)
    print df.idxmin()
    print df.idxmax()
    print df.cumsum()
    print df.describe()
    # values are not number
    obj = Series(list('aabc') * 4)
    print obj.describe()


    methods = ['count', 'min', 'max', # 'argmin', 'argmax',
               'quantile', 'median', 'mad', 'var', 'std',
               'skew', 'kurt', 'cummin', 'cummax', 'cumprod',
               'diff', 'pct_change']

    for method in methods:
        print u'「{0}」'.format(method)
        print getattr(df, method)()
        print ''

    # Correlation and covariance
    all_data = {}
    lst = [] # ['AAPL', 'IBM', 'MSFT'] #, 'GOOG']:
    for ticket in lst: #, 'GOOG']:
        # IOError: after 3 tries, Yahoo! did not return a 200
        # for url 'http://ichart.finance.yahoo.com/table.csv?s=GOOG&a=0&b=1&c=2000&d=0&e=1&f=2010&g=d&ignore=.csv'
        all_data[ticket] = pd.io.data.get_data_yahoo(ticket, '1/1/2000', '1/1/2010')
    price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.iteritems()})
    volume = DataFrame({tic: data['Volume'] for tic, data in all_data.iteritems()})
    if all_data:
        returns = price.pct_change()
        print returns.tail()
        print ''
        print returns.MSFT.corr(returns.IBM)
        print returns.MSFT.cov(returns.IBM)
        print ''
        print returns.corr()
        print returns.cov()
        print ''
        print returns.corrwith(returns.IBM)
        print returns.corrwith(volume)

    # unique, frequency, belong
    print '',''
    obj = Series(list('cadaabbcc'))
    uniques = obj.unique()
    print uniques
    print obj.value_counts()
    print pd.value_counts(obj.values, sort=False)
    mask = obj.isin(['b', 'c'])
    print mask
    print obj[mask]

    data = DataFrame({
        'Qu1' : [1,3,4,3,4],
        'Qu2' : [2,3,1,2,3],
        'Qu3' : [1,5,2,4,4],
    })
    print data
    print data.apply(pd.value_counts).fillna(0)
        if False:
            kw = dict(method='time')
            df = df.reindex(index).interpolate(**kw).ix[index]
        dfs.update({model: df})

dfs = Panel.fromDict(dfs).swapaxes(0, 2)



from pandas import DataFrame

means = dict()
for station, df in dfs.iteritems():
    df.dropna(axis=1, how='all', inplace=True)
    mean = df.mean()
    df = df - mean + mean['OBS_DATA']
    means.update({station: mean['OBS_DATA'] - mean.drop('OBS_DATA')})

bias = DataFrame.from_dict(means).dropna(axis=1, how='all')
bias = bias.applymap('{:.2f}'.format).replace('nan', '--')

columns = dict()
[columns.update({station: get_coops_longname(station)}) for
 station in bias.columns.values]

bias.rename(columns=columns, inplace=True)

to_html(bias.T, 'style.css')

Example #29
## Axis indexes with duplicate labels
obj = Series(range(5), index=['a','a','b','b','c'])
# check whether the index values are unique
obj.index.is_unique
# an index label that maps to several values returns all of them
obj['a']

## Summarizing and computing descriptive statistics
df = DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],
	index=['a','b','c','d'], columns=['one','two'])
# column sums
df.sum()
# row sums
df.sum(axis=1)
# NA values are excluded by default; disable this with skipna
df.mean(axis=1,skipna=False)
# index of the maximum value
df.idxmax()
# cumulative sum
df.cumsum()
df.describe()
# correlation coefficients
returns.MSFT.corr(returns.IBM)
returns.corr()
returns.cov()
returns.corrwith(returns.IBM)

## Unique values, value counts, and membership
obj = Series(['c','a','d','a','a','b','b','c','c'])
uniques = obj.unique()
# count occurrences of each value
Example #30
# Returns all repos data from a given user
def getRepos(user):
    myrepos=requests.get("https://api.github.com/users/"+ user +"/repos", \
    	headers={'Authorization': 'token 5218551eb082bffa572318de0c2de10d255170b1'}).json()
    return myrepos

# Getting number of stars
data = DataFrame()
i = 0
for user in topGitUsers:
    userRepos = getRepos(user)
    i += 1
    print i #check progress
    if len(userRepos) > 0:
	    stars = []
	    listUserStars = [('',0)]
	    for repo in userRepos:
	        #print repo['stargazers_count']
	        stars.append(repo['stargazers_count'])
	    userStars = DataFrame(stars)
	    userMeanSt = userStars.mean(axis=0)
	    listUserStars.append((user,userMeanSt))
	    #print user + str(userMeanSt[0])
	    result = DataFrame({'userId': user,'Mean of stars': userMeanSt})
	    data = data.append(result)
    else:
		print user + ': No repos found for this user'

data.to_csv('gitTopUsersMean.csv')