Example #1
from pandas import DataFrame
from scipy.stats import kruskal, wilcoxon


def summarize_he(analytical_sets):

    results = {}
    he = {}

    for analytical_set in analytical_sets:
        he[analytical_set.label] = calculate_he(analytical_set.allele_df)

    he_df = DataFrame( he )
    labels = list(he_df.columns)
    if len(labels) == 2:
        # use Mann-Whitney / Wilcoxon test
        results['test'] = 'Wilcoxon test (paired)'
        results['stats'] = wilcoxon( he_df[labels[0]], he_df[labels[1]])

    elif len(labels) > 2:
        # use Kruskal Wallis
        results['test'] = 'Kruskal-Wallis test'
        results['stats'] = kruskal( * [he_df[x] for x in labels])
        results['warning'] = ''

    results['data'] = he_df
    results['mean'] = he_df.mean()
    results['stddev'] = he_df.std()
    #raise RuntimeError

    return results
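The branching above (a paired Wilcoxon test for exactly two groups, Kruskal-Wallis for three or more) can be reproduced directly with scipy; a minimal, self-contained sketch on made-up heterozygosity values (the column names are hypothetical):

import pandas as pd
from scipy.stats import kruskal, wilcoxon

# toy per-marker heterozygosity values for three hypothetical sample sets
he_df = pd.DataFrame({
    'set_A': [0.61, 0.55, 0.72, 0.48, 0.66],
    'set_B': [0.58, 0.60, 0.70, 0.51, 0.63],
    'set_C': [0.40, 0.47, 0.59, 0.36, 0.52],
})

labels = list(he_df.columns)
if len(labels) == 2:
    stats = wilcoxon(he_df[labels[0]], he_df[labels[1]])   # paired, two groups
else:
    stats = kruskal(*[he_df[label] for label in labels])   # three or more groups
print(stats)
print(he_df.mean())
print(he_df.std())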
Example #2
import pandas as pd


def skewness(path, columns):
    # Read only the requested columns from the CSV file
    frame = pd.read_csv(path, usecols=columns)

    n = len(frame)
    mean = frame.mean()
    std = frame.std()

    # Adjusted (bias-corrected) sample skewness, column by column
    cubed = ((frame - mean) / std) ** 3
    skew = (n * cubed.sum()) / ((n - 1) * (n - 2))

    print('skewness=', skew)
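The quantity printed above is the adjusted (bias-corrected) sample skewness; it matches scipy.stats.skew with bias=False. A short cross-check on random data (any 1-D numeric array works):

import numpy as np
from scipy.stats import skew

x = np.random.randn(50)
n = len(x)
z = (x - x.mean()) / x.std(ddof=1)              # standardise with the sample std
g1 = n * (z ** 3).sum() / ((n - 1) * (n - 2))   # same formula as in skewness() above
print(g1, skew(x, bias=False))                  # the two values agree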
Example #3
import pandas as pd


def kurtosis(path, columns):
    # Read only the requested columns from the CSV file
    frame = pd.read_csv(path, usecols=columns)

    n = len(frame)
    print(n)

    mean = frame.mean()
    std = frame.std()

    # Bias-corrected sample excess kurtosis, column by column
    fourth = ((frame - mean) / std) ** 4
    s4 = fourth.sum()

    p1 = n * (n + 1)
    p2 = float((n - 1) * (n - 2) * (n - 3))
    p3 = float(3 * ((n - 1) ** 2))
    p4 = (n - 2) * (n - 3)

    k = ((p1 / p2) * s4) - (p3 / p4)

    print('kurtosis=', k)
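The value printed here is the bias-corrected sample excess kurtosis; it agrees with scipy.stats.kurtosis with fisher=True and bias=False. A small cross-check on random data:

import numpy as np
from scipy.stats import kurtosis as scipy_kurtosis

x = np.random.randn(100)
n = len(x)
z = (x - x.mean()) / x.std(ddof=1)
g2 = (n * (n + 1) / ((n - 1) * (n - 2) * (n - 3))) * (z ** 4).sum() \
     - 3 * (n - 1) ** 2 / ((n - 2) * (n - 3))
print(g2, scipy_kurtosis(x, fisher=True, bias=False))   # both report excess kurtosis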
Example #4
def bse(data: pd.DataFrame,
        weight_name: Optional[str]=None,
        ignore: Optional[List[str]] = None) -> pd.DataFrame:
    """
    Calculate the Block Standard Error (BSE).

    Parameters
    ----------
    data : Dataframe with CV data over time and weights.
    weight_name : Name of the weight column.
    ignore : List of column names to ignore.

    Returns
    -------
    bse : Dataframe containing BSEs over all iterations.

    References
    ----------
    Flyvbjerg, H., Petersen, H. G. Error estimates on averages of correlated
    data. The Journal of Chemical Physics, 91(1), 461 (1989)

    """
    if ignore is None:
        ignore = []
    if 'time' not in ignore:
        ignore.append('time')

    # Prepare input, first element
    if weight_name is not None:
        weights = data[weight_name].values
        ignore.append(weight_name)

    length = data.shape[0]
    width = data.shape[1]
    index = data.T.index
    data = data.values
    blist = [data.std(axis=0) / np.sqrt(length)]
    length //= 2

    # Iteratively increase the block size by halving the data
    while length > 2:
        halved = np.empty((length, width))

        # Each iteration, we average neighbouring pairs (blocks of two)
        for i in range(0, length):
            if weight_name is not None:
                halved[i] = (1 / (weights[2 * i] + weights[2 * i + 1]) *
                             (data[2 * i] * weights[2 * i] +
                              data[2 * i + 1] * weights[2 * i + 1]))
            else:
                halved[i] = 0.5 * (data[2 * i] + data[2 * i + 1])

        # Calculate the BSE for this block size
        bse = halved.std(axis=0) / np.sqrt(length)
        blist.append(bse)

        # Carry the halved data (and summed pair weights) into the next iteration
        data = halved
        if weight_name is not None:
            weights = np.array([weights[2 * i] + weights[2 * i + 1]
                                for i in range(length)])
        length //= 2

    # Reconstruct Dataframe
    return pd.DataFrame(np.asarray(blist), columns=index).drop(ignore, axis=1)
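A minimal usage sketch, assuming the bse() function above is available in the same session. It builds an AR(1)-correlated series, where the naive standard error of the mean is too small and the block standard error should grow toward a plateau once the blocks are longer than the correlation time:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n = 4096
noise = rng.normal(size=n)
series = np.empty(n)
series[0] = noise[0]
for i in range(1, n):                 # AR(1) process with autocorrelation 0.9
    series[i] = 0.9 * series[i - 1] + noise[i]

data = pd.DataFrame({'time': np.arange(n, dtype=float), 'cv': series})
print(bse(data))                      # one row per halving; the 'cv' column should level off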
Example #5
File: views.py Project: Mihkorz/AMD
 def form_valid(self, form):
     document = form.save(commit=False)
     project = form.cleaned_data['project']
     document.save()
     filename = settings.MEDIA_ROOT+"/"+document.document.name
     sniffer = csv.Sniffer()
     with open(filename, 'r') as f:
         dialect = sniffer.sniff(f.read(), delimiters='\t,;') # detect the separator of the csv file
     df = read_csv(filename, delimiter=dialect.delimiter)
     tumour_cols = [col for col in df.columns if 'Tumour' in col]
     norm_cols = [col for col in df.columns if 'Norm' in col]
     document.sample_num = len(tumour_cols)
     document.norm_num = len(norm_cols)
     document.row_num = len(df)
     document.save()
     
     """ Use PANDAS to preprocess input file(calculate Mean_norm CNR and STD) and save to process folder 
         Create ProcessDocument instance to store the file in database"""
         
     path = os.path.join('users', str(document.project.owner),
                                         str(document.project),'process', 'process_'+str(document.get_filename()))
     if not os.path.exists(settings.MEDIA_ROOT+'/'+os.path.join('users', str(document.project.owner),
                                         str(document.project),'process')):
         os.mkdir(settings.MEDIA_ROOT+'/'+os.path.join('users', str(document.project.owner),
                                         str(document.project),'process'))
     
     process_doc = ProcessDocument()
     process_doc.document = path
     process_doc.input_doc = document
     process_doc.created_by = self.request.user
     process_doc.save()
     
     new_file = settings.MEDIA_ROOT+"/"+path
             
     df = df.set_index('SYMBOL') #create index by SYMBOL column
       
     df = df.groupby(df.index, level=0).mean() #deal with duplicate genes by taking mean value
     
     mean_norm = df[norm_cols].mean(axis=1)
     from scipy.stats.mstats import gmean
     gmean_norm = df[norm_cols].apply(gmean, axis=1)

     df1 = df[norm_cols].std(axis=1)
             
     df['Mean_norm'] = mean_norm
            
     df = df.div(df.Mean_norm, axis='index')
    
     df['Mean_norm'] = mean_norm
     df['gMean_norm'] = gmean_norm
     df['std'] = df1
             
     
     df.to_csv(new_file, sep='\t')
 
      
     return HttpResponseRedirect(self.success_url+project.name)
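The core pandas preprocessing in this view (collapse duplicate gene symbols by their mean, then express every value relative to the mean of the Norm columns) can be tried in isolation; a small sketch with made-up column names and values:

import pandas as pd
from scipy.stats.mstats import gmean

df = pd.DataFrame({'SYMBOL': ['TP53', 'TP53', 'EGFR'],
                   'Norm1': [10.0, 12.0, 5.0],
                   'Norm2': [11.0, 13.0, 4.0],
                   'Tumour1': [30.0, 28.0, 2.0]}).set_index('SYMBOL')

df = df.groupby(level=0).mean()                 # collapse duplicate genes by mean
norm_cols = [c for c in df.columns if 'Norm' in c]
mean_norm = df[norm_cols].mean(axis=1)
gmean_norm = df[norm_cols].apply(gmean, axis=1)
std_norm = df[norm_cols].std(axis=1)

cnr = df.div(mean_norm, axis='index')           # each value relative to Mean_norm
cnr['Mean_norm'] = mean_norm
cnr['gMean_norm'] = gmean_norm
cnr['std'] = std_norm
print(cnr)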
Example #6
    def testWLS(self):
        X = DataFrame(np.random.randn(30, 4), columns=['A', 'B', 'C', 'D'])
        Y = Series(np.random.randn(30))
        weights = X.std(1)

        self._check_wls(X, Y, weights)

        weights.loc[[5, 15]] = np.nan
        Y[[2, 21]] = np.nan
        self._check_wls(X, Y, weights)
Example #7
def moments_features(path):
    if not os.path.exists(path):
        logger.error(path + " does not exist!")
        return
    im = cv2.imread(path)
    [b, g, r] = cv2.split(im)
    moments = []
    for n in [b, g, r]:
        df = DataFrame(np.array(n.flatten()))
        moments.extend(float(x) for x in [df.mean()[0], df.std()[0], df.skew()[0]])
    return moments
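The nine colour moments collected above (mean, standard deviation and skewness for each BGR channel) can be checked without pandas; a small sketch on a random image array, using ddof=1 and bias-corrected skewness to match DataFrame.std() and DataFrame.skew():

import numpy as np
from scipy.stats import skew

img = np.random.randint(0, 256, size=(32, 32, 3), dtype=np.uint8)
moments = []
for c in range(3):                                   # channel order is BGR when read with cv2
    channel = img[:, :, c].flatten().astype(float)
    moments.extend([channel.mean(),
                    channel.std(ddof=1),
                    skew(channel, bias=False)])
print(moments)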
Example #8
def combine_spread(file_set, shift, drop_return_data=False):
    """
    Combine the spread of input files, return with mean and standard
    deviation calculated.

    """

    data = []
    values = {}
    for val in ('left', 'right', 'com', 'dist', 'radius', 'diameter'):
        values[val] = {}

    # Collect data from all files into dictionaries
    for i, _file in enumerate(file_set):
        data.append(Spread().read(_file))
        for val in values.keys():
            values[val][i] = Series(
                    data=data[i].spread[val]['val'],
                    index=data[i].times
                    )
        data[i].times = (np.array(data[i].times) - shift[i])

    spread = Spread()
    spread.spread['num'] = len(file_set)

    for val in values.keys():

        # Shift time as per synchronisation
        for i in values[val]:
            values[val][i].index = np.array(values[val][i].index) - shift[i]

        # Convert to DataFrame
        df = DataFrame(data=values[val])

        # If more than one file, keep only time indices present in every file (drop rows with any NaN)
        if len(file_set) > 1:
            df = df.dropna()

        # If return data dropped, fill data here
        if drop_return_data:
            for i in df.columns:
                data[i].spread[val]['val'] = df[i].tolist()

        # Get times, mean and standard error as lists
        mean = list(df.mean(axis=1))
        std_error = list(df.std(axis=1))
        times = list(df.index)

        # Add to Spread object
        spread.spread[val]['val'] = mean
        spread.spread[val]['std'] = std_error
        spread.spread['times'] = times

    return spread, data
Example #9
import pandas as pd


def stndize(path, columns):
    # Read the requested columns and z-score them column-wise
    frame = pd.read_csv(path, usecols=columns)

    mean = frame.mean()
    print(mean)
    std = frame.std()
    print(std)
    print((frame - mean) / std)

    return
Example #10
    def testWLS(self):
        # WLS centered SS changed (fixed) in 0.5.0
        if sm.version.version < '0.5.0':
            raise nose.SkipTest

        X = DataFrame(np.random.randn(30, 4), columns=['A', 'B', 'C', 'D'])
        Y = Series(np.random.randn(30))
        weights = X.std(1)

        self._check_wls(X, Y, weights)

        weights.loc[[5, 15]] = np.nan
        Y[[2, 21]] = np.nan
        self._check_wls(X, Y, weights)
Example #11
    def test_common_start_returns(self, before, after, mean_by_date, demeaned,
                                  expected_vals):
        dr = date_range(start='2015-1-17', end='2015-2-2')
        dr.name = 'date'
        tickers = ['A', 'B', 'C', 'D']
        r1, r2, r3, r4 = (1.20, 1.40, 0.90, 0.80)
        prices = DataFrame(index=dr, columns=tickers,
                           data=[[r1**1, r2**1, r3**1, r4**1],
                                 [r1**2, r2**2, r3**2, r4**2],
                                 [r1**3, r2**3, r3**3, r4**3],
                                 [r1**4, r2**4, r3**4, r4**4],
                                 [r1**5, r2**5, r3**5, r4**5],
                                 [r1**6, r2**6, r3**6, r4**6],
                                 [r1**7, r2**7, r3**7, r4**7],
                                 [r1**8, r2**8, r3**8, r4**8],
                                 [r1**9, r2**9, r3**9, r4**9],
                                 [r1**10, r2**10, r3**10, r4**10],
                                 [r1**11, r2**11, r3**11, r4**11],
                                 [r1**12, r2**12, r3**12, r4**12],
                                 [r1**13, r2**13, r3**13, r4**13],
                                 [r1**14, r2**14, r3**14, r4**14],
                                 [r1**15, r2**15, r3**15, r4**15],
                                 [r1**16, r2**16, r3**16, r4**16],
                                 [r1**17, r2**17, r3**17, r4**17]])
        dr2 = date_range(start='2015-1-21', end='2015-1-29')
        factor = DataFrame(index=dr2, columns=tickers,
                           data=[[3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1],
                                 [3, 4, 2, 1]]).stack()
        factor.index = factor.index.set_names(['date', 'asset'])
        factor.name = 'factor'

        cmrt = common_start_returns(
            factor,
            prices,
            before,
            after,
            False,
            mean_by_date,
            factor if demeaned else None)
        cmrt = DataFrame({'mean': cmrt.mean(axis=1), 'std': cmrt.std(axis=1)})
        expected = DataFrame(index=range(-before, after + 1),
                             columns=['mean', 'std'], data=expected_vals)
        assert_frame_equal(cmrt, expected)
Example #12
    def testWLS(self):
        # WLS centered SS changed (fixed) in 0.5.0
        sm_version = sm.version.version
        if sm_version < LooseVersion("0.5.0"):
            raise nose.SkipTest("WLS centered SS not fixed in statsmodels" " version {0}".format(sm_version))

        X = DataFrame(np.random.randn(30, 4), columns=["A", "B", "C", "D"])
        Y = Series(np.random.randn(30))
        weights = X.std(1)

        self._check_wls(X, Y, weights)

        weights.loc[[5, 15]] = np.nan
        Y[[2, 21]] = np.nan
        self._check_wls(X, Y, weights)
Example #13
    def testWLS(self):
        # WLS centered SS changed (fixed) in 0.5.0
        if sm.version.version < '0.5.0':
            raise nose.SkipTest

        print( "Make sure you're using statsmodels 0.5.0.dev-cec4f26 or later.")

        X = DataFrame(np.random.randn(30, 4), columns=['A', 'B', 'C', 'D'])
        Y = Series(np.random.randn(30))
        weights = X.std(1)

        self._check_wls(X, Y, weights)

        weights.loc[[5, 15]] = np.nan
        Y[[2, 21]] = np.nan
        self._check_wls(X, Y, weights)
Example #14
def cross_validate_trades(trades, N=20, subset_fraction=0.7):
    
    tickers = trades.tickers
    sample_size = round(len(tickers) * subset_fraction)
    summary = DataFrame(dtype = float)

    for n in range(N):
        sample_tickers = list(random.choice(tickers, sample_size, replace = False))
        trade_subset = trades.find(lambda T: T.ticker in sample_tickers)
        summary[n] = summary_report(trade_subset)

    result = DataFrame(dtype = float)
    result['Base'] = summary_report(trades)
    result['Mean'] = summary.mean(axis = 1)
    result['Std'] = summary.std(axis = 1)
    result['Median'] = summary.median(axis = 1)
    result['Max'] = summary.max(axis = 1)
    result['Min'] = summary.min(axis = 1)

    return (result, summary)
Example #15
from heapq import nsmallest

from numpy import array
from pandas import DataFrame


class GetGenes(object):

	def __init__(self, data):
		self.dataframe = DataFrame(data)

	# Read a TAB-separated text file into self.dataframe.
	# There should not be duplicate column names.
	def import_file(self, filename):
		# helper that converts a string to a float where possible
		def convert(x):
			try:
				x = float(x)
			except ValueError:
				pass
			return(x)

		table = []
		for line in open(filename):
			if(line.strip()):	# If not empty line
				line = line.rstrip('\n').split('\t')
				line = list(map(convert, line))
				table.append(line)
		self.dataframe = DataFrame(table[1:],columns=table[0])
		return

	def houseKeepingGenes(self, geneNum):
		# compute the coefficient of variation (CV) of the data
		std = array(self.dataframe.std(axis=1))
		mean = array(self.dataframe.mean(axis=1))
		CV = std / mean
		CV = list(map(abs, CV))		# take absolute values

		# get the first N minimum values
		mins = nsmallest(geneNum, CV)
		print("The GOOD genes are:\n")
		for item in mins:
			print(self.dataframe.iloc[CV.index(item), 0])
		return
Example #16
    def run(self, Model='ridge', kernel='linear', cross_validationMethod='KFold',
            FeatureSelection='PCA', n_features=20,
            scoringList=['specificity', 'sensitivity', 'precision', 'f1', 'accuracy', 'ss_mean'],
            isSaveCsv=None, isSavePickle=None, isSaveFig=None, isPerm=0,
            isBetweenSubjects=True, isConcatTwoLabels=False):
        # -- TODO :
        # --  # Greedy selection on features + other feature selection types...
        # --  # Make sure features are selected based on train data only!!!
        # --  # Keep a list of n_train, n_test from each Label and scoring (accuracy, f1..) in each cross validation iteration
        # --  # Plot results summary (see CARS paper for desired results for Ein Gedi Poster 22-1-2015)
        # --  # Remove irrelevant data using 'Tracking Success' and consider 'TimeStamps' for feature calculation
        # --  # Add feature analysis by facial part (see excel)
        # --  # Select best model (svm, otherwise ridge regression)
        # --  # Compare svc results with regression results (using LOO and different params for regression - params for unbalanced data, different kernels, etc.), model evaluation - http://scikit-learn.org/stable/modules/model_evaluation.html)
        # --  # Check how the model weights behave - feature selection analysis
        # --  # Calc model error
        # --  # Divide data into subparts for training and testing - try within/between subject, and analyse the distribution of features when data is divided
        # --  # LOO - also on bool labels (patients vs controls and mental status bool)
        # --  # Add mental status rank scores (0-4)
        # --  # Make sure p-val returns the right value in 'scores'
        # --  # Run it over random data (permutation test)
        # --  # Continue here - check regression results - make sure regression works (not so good).. check what happens in svc for G7 (high train R, negative test R)

        ## init        
        FeatureTypeList=[j for j in tuple(self.FeaturesDF.index)]
        self.FullResults=DF()
        self.Learningdetails={'Model':Model,'Kernel':kernel,'CrossVal':cross_validationMethod,'FeatureSelection':FeatureSelection,'LabelBy':self.Details['LabelDetails'].keys()[0],'FeatureMethod':self.Details['FeatureMethod'],'PieceLength':self.Details['PieceLength']}
        print('\n------------Learning Details------------')
        print(DF.from_dict(self.Learningdetails,orient='index'))
        print('\n----' + cross_validationMethod + ' Cross validation Results:----')
           
        # Set learning params (cross validation method, and model for learning)
        isBoolLabel=self.LabelsObject.isBoolLabel
        isBoolScores=isBoolLabel
        model, isBoolModel, featureSelectionMethod,selectFeaturesFunction= learningUtils.setModel(Model,FeatureSelection,n_features)
        #define global variables over modules (to be used in myUtils)
        globalVars.transformMargins=0#lambda x:x         
        globalVars.isBoolLabel=isBoolLabel
        globalVars.isBoolModel=isBoolModel
        global trainLabels_all, testLabels_all, TrueLabels,isAddDroppedSubjects 
        trainLabels_all, testLabels_all, TrueLabels,isAddDroppedSubjects=labelUtils.initTrainTestLabels_all(self.LabelsObject)
        trainLabels_all2, testLabels_all2, TrueLabels2,isAddDroppedSubjects2=labelUtils.initTrainTestLabels_all(self.LabelsObject2)



        
        
        
        LabelingList=['N1']#trainLabels_all.columns
        self.ResultsDF=DF()
        self.BestFeatures=DF(columns=LabelingList) #dict of BestFeaturesDF according to Labeling methods
        YpredictedOverAllLabels=pandas.Panel(items=range(len(trainLabels_all)),major_axis=LabelingList,minor_axis=TrueLabels.index) #panel: items=cv_ind, major=labels, minor=#TODO 
       
                                              
        ## Create train and test sets according to LabelBy, repeat learning each time on different Labels from LabelingList.
        for label_ind, Labeling in enumerate(LabelingList):
            """if isPerm: #TODO - fix this to work with continous / bool data
                try:
                    trainLabels=self.LabelsObject.permedLabelsDF[Labeling]
                except AttributeError:
                    self.LabelsObject.permLabels()
                    trainLabels=self.LabelsObject.permedLabelsDF[Labeling]"""
            #set subjects list according to labels and features
            X,SubjectsList,droppedSubjects,Xdropped=featuresUtils.initX(self.FeaturesDF,trainLabels_all,Labeling)
            X2,SubjectsList2,droppedSubjects2,Xdropped2=featuresUtils.initX(self.FeaturesDF,trainLabels_all2,Labeling,is2=1)
            
            #init train and test labels
            trainLabels, testLabels, LabelRange = labelUtils.initTrainTestLabels(Labeling,SubjectsList,trainLabels_all, testLabels_all)
            trainLabels2, testLabels2, LabelRange2 = labelUtils.initTrainTestLabels(Labeling,SubjectsList2,trainLabels_all2, testLabels_all2)
            
            #make sure only labeled subjects are used for classification
            X=X.query('subject == '+ str(list(trainLabels.index)) ) 
            X.index.get_level_values(X.index.names[0]) 
            SubjectIndex=list(set(X.index.get_level_values('subject')))

            X2=X2.query('subject == '+ str(list(trainLabels2.index)) )  
            X2.index.get_level_values(X2.index.names[0]) 
            SubjectIndex2=list(set(X2.index.get_level_values('subject')))                       
            #init vars
            if isBetweenSubjects:
                cv_param=len(SubjectIndex)
                self.Learningdetails['CrossValSubjects']='between'
                isWithinSubjects=False
            else:
                isWithinSubjects=True
                X=X.swaplevel(0,1)
                PieceIndex=list(set(X.index.get_level_values('Piece_ind')))
                cv_param=len(PieceIndex)
                self.Learningdetails['CrossValSubjects']='within'
            
            self.Learningdetails['NumOfFeatures']=n_features
            
            print('\n**' + Labeling + '**')
            
            cv, crossValScores= learningUtils.setCrossValidation(cross_validationMethod,cv_param,trainLabels,isWithinSubjects) 
            
            ## Learning - feature selection for different scoring types, with cross validation - 

            BestFeaturesForLabel=self.BestFeaturesForLabel(FeatureTypeList,LabelingList,n_features) #saves dataframe with best features for each label, for later analysis
            cv_ind=0
            # used for transforming the margins returned from svm to continuous labels (e.g. PANSS)
            trainScores=DF()
            test_index=X.index
            testScores=concat([DF(index=test_index),DF(index=['std_train_err'])])
            testScores2=concat([DF(index=testLabels.index),DF(index=['std_train_err'])]) 
            #impt=Imputer(missing_values='NaN', strategy='median', axis=0)

            globalVars.LabelRange=LabelRange

            ModelWeights1=DF(columns=range(len(cv)),index=X.columns)
            Components=pandas.Panel(items=range(len(cv)),major_axis=X.columns,minor_axis=range(n_features)) #todo fix this for 1st and second learning
            ExplainedVar=DF(columns=range(len(cv)))
            ModelWeights2=DF(columns=range(len(cv)))
            for train, test in cv:

                if isBetweenSubjects:
                    #set X and Y
                    train_subjects=trainLabels.iloc[train].index
                    test_subjects=testLabels.iloc[test].index 
                    Xtrain,Xtest, Ytrain, YtrainTrue, Ytest=learningUtils.setXYTrainXYTest(X,Labeling,trainLabels,testLabels,TrueLabels,train_subjects,test_subjects)
                    Xtrain2,Xtest2, Ytrain2, YtrainTrue2, Ytest2=learningUtils.setXYTrainXYTest(X2,Labeling,trainLabels2,testLabels2,TrueLabels2,train_subjects,test_subjects)

                    
                    if isConcatTwoLabels: #used when there is more than one doctor
                        Xtrain=concat([Xtrain,Xtrain2])
                        Xtest=concat([Xtest,Xtest2])
                        Ytrain=concat([Ytrain,Ytrain2])
                        YtrainTrue=concat([YtrainTrue,YtrainTrue2])
                        Ytest=concat([Ytest,Ytest2])
                        Xdropped=concat([Xdropped,Xdropped2])
                        SubjectsList=list(set(SubjectsList).intersection(set(SubjectsList2)))
                        droppedSubjects=list(set(droppedSubjects).union(set(droppedSubjects2)).difference(set(SubjectsList)))#diff from SubjectsList to make sure no subjects are both in train and test.
                    """else:
                        Xtrain=Xtrain1
                        Xtest=Xtest1
                        Xdropped=Xdropped1
                        Ytrain=Ytrain1
                        YtrainTrue=YtrainTrue1
                        Ytest=Ytest1"""

                    #select N best features:
                    Xtrain, Xtest, bestNfeatures, components, explainedVar,decomposeFunc=learningUtils.selectBestNfeatures(Xtrain,Xtest,Ytrain,n_features,selectFeaturesFunction)
                    BestFeaturesForLabel.add(bestNfeatures) #todo - delete this??     

                    #train 1 
                    TrainModel=model
                    TrainModel.fit(Xtrain.sort_index(),Ytrain.T.sort_index())
                    try:
                        Components[cv_ind]=components.T
                        ExplainedVar[cv_ind]=explainedVar
                        isDecompose=True
                        if cv_ind==0:
                            ModelWeights1=DF(columns=range(len(cv)),index=range(len(bestNfeatures)))
                        ModelWeights1[cv_ind]=TrainModel.coef_.flatten()
                    except AttributeError:
                        isDecompose=False
                        ModelWeights1[cv_ind].loc[bestNfeatures]=TrainModel.coef_.flatten()
                    self.isDecompose=isDecompose                    
                    #train 2
                    if isBoolLabel:
                       PiecePrediction_train=DF(TrainModel.predict(Xtrain),index=Xtrain.index,columns=['prediction'])
                       TrainModel2=svm.SVC(kernel='linear', probability=True,class_weight={0:1,1:1})
                    else:
                       PiecePrediction_train=DF(TrainModel.decision_function(Xtrain),index=Xtrain.index,columns=['prediction'])
                       TrainModel2=linear_model.LinearRegression()

                    Xtrain2, Ytrain2, YtrainTrue2=learningUtils.getX2Y2(Xtrain,Ytrain,YtrainTrue,PiecePrediction_train, isBoolLabel)                 
                    TrainModel2.fit(Xtrain2, Ytrain2)
                    if cv_ind==0:
                        ModelWeights2=DF(columns=range(len(cv)),index= Xtrain2.columns)
                    ModelWeights2[cv_ind]=TrainModel2.coef_.flatten()         

                              
                    #test 1
                    if isAddDroppedSubjects: #take test subjects from cv + subjects that were dropped for labeling used for test
                        if isDecompose:
                            dXdropped=DF(decomposeFunc(Xdropped).values,index=Xdropped.index)
                        XtestDropped=dXdropped[bestNfeatures]
                        YtestDropped=Series(XtestDropped.copy().icol(0))
                        #YTrueDropped=Series(Xdropped.copy().icol(0))
                        for subject in droppedSubjects:
                            YtestDropped[subject]=testLabels_all[Labeling].loc[subject]
                            #YTrueAll.loc[subject]=TrueLabels[Labeling].loc[subject]
                        Ytest=concat([Ytest,YtestDropped]).sort_index()
                        Xtest=concat([Xtest,XtestDropped]).sort_index()


                    if isPerm: #TODO- Check this!!
                        Ytest=y_perms.loc[Ytest.index]
                    Xtest=Xtest.fillna(0.)
                    
                    
                elif isWithinSubjects:
                    #train 1
                    train_pieces=PieceIndex[train]
                    test_pieces=PieceIndex[test] #TODO - make sure that if test/train > piece index, it is ignored and the process repeats
                    
                    XtrainAllFeatures=X.query('Piece_ind == '+ str(list(train_pieces)))
                    Ytrain=Series(index=X.index)
                    Ytest=Series(index=X.index)
                    YtrainTrue=Series(index=X.index)
                    
                    for subject in PieceIndex: 
                        for piece in train_pieces:
                            Ytrain.loc[piece].loc[subject]=trainLabels[subject]
                            YtrainTrue.loc[piece].loc[subject]=TrueLabels[Labeling].loc[subject] 
                            Ytest.loc[piece].loc[subject]=testLabels[subject]   
                    Ytrain=Ytrain.dropna()
                    YtrainTrue=YtrainTrue.dropna() 
                    for subject in test_subjects:
                        Ytest.loc[piece].loc[subject]=testLabels[subject]
                #train scores 1       
                if cv_ind==0:
                    trainScores,YtrainPredicted=learningUtils.getTrainScores(Ytrain,Xtrain,YtrainTrue,TrainModel)
                    plt.figure(1)
                    if len(LabelingList)>1:
                        plt.subplot(round(len(LabelingList)/2),2,label_ind+1)
                    if isBoolLabel:
                        testScores=learningUtils.getTestScores(Ytest,Xtest,TrainModel)
                    else:
                        testScores[cv_ind]=learningUtils.getTestScores(Ytest,Xtest,TrainModel)
                        plt.title(Labeling,fontsize=10)
                else:
                    plt.figure(3)
                    new_trainScores,YtrainPredicted=learningUtils.getTrainScores(Ytrain,Xtrain,YtrainTrue,TrainModel)
                    trainScores=concat([trainScores,new_trainScores],axis=1)
                #test 1   
                    testScores[cv_ind]=learningUtils.getTestScores(Ytest,Xtest,TrainModel)
                
                #train2

                if isBoolLabel:
                    PiecePrediction_test=DF(TrainModel.predict(Xtest),index=Xtest.index,columns=['prediction'])
                else:
                    PiecePrediction_test=DF(TrainModel.decision_function(Xtest),index=Xtest.index,columns=['prediction'])
                Xtest2, Ytest2 , YtestTrue2 =learningUtils.getX2Y2(Xtest,Ytest,Ytest,PiecePrediction_test, isBoolLabel)
                
                if cv_ind==0:
                    trainScores2,YtrainPredicted2=learningUtils.getTrainScores(Ytrain2,Xtrain2,YtrainTrue2,TrainModel2)
                    YpredictedOverAllLabels[cv_ind].loc[Labeling]=YtrainPredicted2
                    #plt.figure(1)
                    #if len(LabelingList)>1:
                        #plt.subplot(round(len(LabelingList)/2),2,label_ind+1)
                #test2
                    if isBoolLabel:
                        testScores2=learningUtils.getTestScores(Ytest2,Xtest2,TrainModel2)
                    else:
                        testScores2[cv_ind]=learningUtils.getTestScores(Ytest2,Xtest2,TrainModel2)
                    #plt.title(Labeling,fontsize=10)
                else:
                    new_trainScores2,YtrainPredicted2=learningUtils.getTrainScores(Ytrain2,Xtrain2,YtrainTrue2,TrainModel2)
                    YpredictedOverAllLabels[cv_ind].loc[Labeling]=YtrainPredicted2
                    trainScores2=concat([trainScores2,new_trainScores2],axis=1)
                    testScores2[cv_ind]=learningUtils.getTestScores(Ytest2,Xtest2,TrainModel2)     
                cv_ind+=1

                #crossValScores=crossValScores.append(CVscoresDF,ignore_index=True) #information about entire train test data. 
            fig2=plt.figure(2)
            if len(LabelingList)>1:
                plt.subplot(round(len(LabelingList)/2),2,label_ind+1)
            #if isAddDroppedSubjects:
               # testLabelsSummary=testLabels_all[Labeling].loc[AllSubjects]
           # else:
               # testLabelsSummary=testLabels
            scoresSummary = learningUtils.getScoresSummary(trainScores2,testScores2,TrueLabels[Labeling])
            # reset global vars
            globalVars.fitYscale='notDefined'
            globalVars.beta=DF()

            plt.title(Labeling,fontsize=10)
            plt.xlabel('Ytrue',fontsize=8)
            plt.ylabel('Ypredicted',fontsize=8)
            plt.tick_params(labelsize=6)
            #print(crossValScores.T)    
            scores=scoresSummary.fillna(0.)
            
            #analyze feature weightsL

            WeightedFeatures1=DF([ModelWeights1.mean(axis=1),ModelWeights1.std(axis=1)],index=['mean','std']).T.fillna(0)
            if isDecompose==0:
                WeightedFeatures1FeatureType=WeightedFeatures1.mean(level='FeatureType')
                WeightedFeatures1FsSingal=WeightedFeatures1.mean(level='fs-signal')
                WeightedFeatures1=concat([DF(index=['-------(A) FeatureType-------']),WeightedFeatures1FeatureType,DF(index=['-------(B) faceshift signal-------']),WeightedFeatures1FsSingal])
            
            WeightedFeatures2=DF([ModelWeights2.mean(axis=1),ModelWeights2.std(axis=1)],index=['mean','std']).T.fillna(0)
            BestFeatures=concat([DF(index=['------------- Learning 1 -------------']),WeightedFeatures1,DF(index=['------------- Learning 2 -------------']),WeightedFeatures2])
            self.BestFeatures[Labeling]=BestFeatures['mean']

            #analyze decomposition
            if isDecompose:
                Components_mean = Components.mean(axis=0)
                Components_std = Components.std(axis=0)
                ExplainedVar_mean = DF(ExplainedVar.mean(axis=1)).T#todo- check!
                ExplainedVar_mean.index=['ExplainedVar_mean']
                ExplainedVar_std = DF(ExplainedVar.std(axis=1)).T#todo- check!
                ExplainedVar_std.index=['ExplainedVar_std']
                try:
                    self.LabelComponents[Labeling]=concat([DF(index=['---components mean---']),Components_mean,ExplainedVar_mean,DF(index=['---components std over cross validation---']),Components_std,ExplainedVar_std])
                except AttributeError:
                    self.LabelComponents=dict.fromkeys(LabelingList)
                    self.LabelComponents[Labeling]=concat([DF(index=['---components mean---']),Components_mean,ExplainedVar_mean,DF(index=['---components std over cross validation---']),Components_std,ExplainedVar_std])

                """print(Components_mean)
                print(ExplainedVar_mean)
                print(WeightedFeatures1)"""

                        
            #BestFeaturesForLabel.analyze(ByLevel=0) #TODO change to regression coeff
            LabelFullResults=concat([DF(index=[Labeling]),scores]) 
  
            self.FullResults=concat([self.FullResults,LabelFullResults])            
            self.ResultsDF=concat([self.ResultsDF,DF(scores[0],columns=[Labeling])],axis=1)
#continue here!! to build pseudo inverse matrix from predicted to true - make sure columns + rows are set!

            #self.BestFeatures[Labeling]=BestFeaturesForLabel.WeightedMean

            #plt.savefig('C:\\Users\\taliat01\\Desktop\\TALIA\\Code-Python\\Results\\'+Labeling+'png')
        testScores3=pandas.Panel(items=range(len(X2.index))) #for each cv score...
        FullSubjectsList=YpredictedOverAllLabels[0].columns
        YdroppNans=YpredictedOverAllLabels.dropna(axis=0,how='all')
        YdroppNans=YdroppNans.dropna(axis=1,how='all')
        YpredictedOverAllLabels=YdroppNans.dropna(axis=2,how='all')
        notNans_cv_ind=YpredictedOverAllLabels.items
        notNans_trainSubjects=YpredictedOverAllLabels.minor_axis
        notNans_LabelsList=YpredictedOverAllLabels.major_axis
        notNans_TrueLabels=TrueLabels.T[notNans_trainSubjects].loc[notNans_LabelsList]
        cv_ind=0
        for train, test in cv:
            if cv_ind in notNans_cv_ind:
                print(test)
                train=list(set(FullSubjectsList[train]).intersection(set(notNans_trainSubjects)))
                test=list(set(FullSubjectsList[test]).intersection(set(notNans_trainSubjects)))
                if len(train)>0 and len(test)>0: 
                    AllLabelsYTrainPredicted=YpredictedOverAllLabels[cv_ind][train]
                    AllLabelsYTrainPredicted=AllLabelsYTrainPredicted.fillna(0)
                    AllLabelsYTrainTrue=notNans_TrueLabels[train]
                    AllLabelsYTestPredicted=YpredictedOverAllLabels[cv_ind][test]
                    AllLabelsYTestTrue=notNans_TrueLabels[test]

                    pseudoInverse_AllLabelsYTrainTrue=DF(np.linalg.pinv(AllLabelsYTrainTrue),columns=AllLabelsYTrainTrue.index,index=AllLabelsYTrainTrue.columns)
                    global AllLabelsTransformationMatrix
                    AllLabelsTransformationMatrix=DF(AllLabelsYTrainPredicted.dot(pseudoInverse_AllLabelsYTrainTrue),columns=pseudoInverse_AllLabelsYTrainTrue.columns)#change to real code!!
                TrainModel3=lambda y: y.T.dot(AllLabelsTransformationMatrix)
                testScores3[cv_ind]=learningUtils.getTestScores(AllLabelsYTrainTrue,AllLabelsYTrainPredicted,TrainModel3)
            cv_ind+=1

           
        self.ResultsDF=self.ResultsDF.fillna(0.)  
        
        ## Print and save results  
        print('\n')
        print(self.ResultsDF)
        print('\n')
        D=self.Learningdetails 
        savePath=resultsPath+'\\'+D['Model']+'_'+D['CrossVal']+'_LabelBy'+D['LabelBy']+'_Features'+D['FeatureMethod']+ '_FS'+FeatureSelection+'_Kernel'+D['Kernel']+'_'+D['CrossValSubjects']+'Subjects_PieceSize'+D['PieceLength']
        if isPerm:
            savePath=savePath+'_PERMStest'
        saveName=savePath+'\\'+str(n_features)+'_features'        
        self.Learningdetails['saveDir']=savePath
        dir=os.path.dirname(saveName)
        if not os.path.exists(dir):
            os.makedirs(dir)
        if isSavePickle is None:
            isSavePickle=int(raw_input('Save Results to pickle? '))
        if isSaveCsv is None:
            isSaveCsv=int(raw_input('save Results to csv? '))
        if isSaveFig is None:
            isSaveFig=int(raw_input('save Results to figure? '))

       
        if isSavePickle:        
            self.ResultsDF.to_pickle(saveName+'.pickle')
            self.BestFeatures.to_pickle(saveName+'_bestFeatures.pickle')
                
        if isSaveCsv:
            DetailsDF=DF.from_dict(self.Learningdetails,orient='index')
            ResultsCSV=concat([self.ResultsDF,DF(index=['-------Label Details-------']),self.N,DF(index=['-------Learning Details-------']),DetailsDF,DF(index=['-------Selected Features Analysis------']),self.BestFeatures])
            ResultsCSV.to_csv(saveName+'.csv')

        if isSaveCsv or isSavePickle:
            print('successfully saved as:\n' + saveName)
        
        if isSaveFig:
            plt.figure(1)
            plt.savefig(saveName + 'Train.png')
            plt.figure(2)
            plt.savefig(saveName + 'Test.png')
        plt.close()
        plt.close()
Example #17
experiment_data_Raw = DataFrame({"Timestamp": quelle_timestampsRaws, "Raw key": quelle_raws, "Dataset": quelle_datasetR})
experiment_data_Raw = experiment_data_Raw.set_index("Timestamp")

final_data = concat([experiment_data_Qber,experiment_data_Raw])

final_data = final_data.sort_index()

# after preparing the data, time to plot it:

for new_counter in range(file_counter+1):
    #print new_counter
    Qbers = final_data[(final_data["Dataset"]==new_counter) & (final_data["Qber"] > 0) ]
    x1 = Qbers.index.tolist()
    y1 = Qbers["Qber"].tolist()
    x1_average = DataFrame.mean(Qbers)["Qber"]
    x1_std_dev = DataFrame.std(Qbers)["Qber"]
    # preparing proper time:
    x1[:] = [x - quelle_initialTimestamps[new_counter] for x in x1]
    
    Raws = final_data[(final_data["Dataset"]==new_counter) & (final_data["Raw key"] > 0) ]
    x2_average = DataFrame.mean(Raws)["Raw key"]
    x2_median = DataFrame.median(Raws)["Raw key"]
    x2_max = DataFrame.max(Raws)["Raw key"]
    
    Raws = Raws[Raws["Raw key"]<(x2_max - (x2_max/100)*20)]
    
    x2 = Raws.index.tolist()
    y2 = Raws["Raw key"].tolist()

    print(x2_average)
    #x2_std_dev = 3
Example #18
test.loc[ser_id2].value_counts(sort=False).plot(kind='bar')
test.loc[ser_id1].value_counts(sort=False).plot(kind='bar')

# Sampling from the overlapped rated movies to calculate the correlation
periods_test = DataFrame(np.zeros((20,7)),columns=[int(ser_max/100),int(ser_max/50),int(ser_max/20),int(ser_max/10),int(ser_max/5),int(ser_max/2),ser_max])
for i in periods_test.index:   # Sampling 20 times
    for j in periods_test.columns:
         sample = test.reindex(columns=np.random.permutation(test.columns)[:j])
         periods_test.loc[i,j] = sample.iloc[0].corr(sample.iloc[1])  # .loc indexes by label, .iloc by integer position
print(periods_test[:5])
print(periods_test.describe())

threshold = 0.1
temp_std = 0
# Take the threshold num which makes sampling correlation stable
for i, std in enumerate(periods_test.std()):
    if std < threshold and temp_std >= threshold:
        mini_period = periods_test.columns[i]
        break
    temp_std = std

# Decide the value of min_periods. Set std 0.1 as the threshold
# mini_period = 200
check_size = int(len(data.index) * 0.2)   # 20% dataset for testing
check = {}
check_data = data.copy() # Avoid the changes on original data
check_data = check_data.loc[check_data.count(axis=1) > mini_period]    # Filter out users with few ratings (count(axis=1) counts each user's ratings)
for user in np.random.permutation(check_data.index):
    movie = np.random.permutation(check_data.loc[user].dropna().index)[0]
    check[(user,movie)] = check_data.loc[user,movie]
    check_data.loc[user,movie] = np.nan
Example #19
def discretise_cnv(matrix, filter_sd=True, lower_bound=-1, upper_bound=1):
    matrix_discrete = DataFrame(0, index=matrix.axes[0], columns=matrix.axes[1])
    matrix_discrete[matrix <= lower_bound] = -1.2
    matrix_discrete[matrix >= upper_bound] = 1.2
    return matrix_discrete.loc[:, matrix_discrete.std() != 0] if filter_sd else matrix_discrete
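A quick usage sketch for the thresholding above, with a toy copy-number matrix (gene and sample names made up):

from pandas import DataFrame

cnv = DataFrame({'sample1': [-2.0, 0.1, 1.5],
                 'sample2': [-1.3, 0.0, 1.5],
                 'sample3': [0.2, 0.0, 0.5]},
                index=['geneA', 'geneB', 'geneC'])

disc = discretise_cnv(cnv)
print(disc)   # values <= -1 become -1.2, values >= 1 become 1.2, the rest stay 0;
              # sample3 is all zeros after thresholding, so filter_sd drops it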
Example #20
meat_to_animal = {
  'corned beef': 'cow',
  'honey ham': 'pig',
  'nova lox': 'salmon'
}

data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data

data['food'].map(lambda x: meat_to_animal[x.lower()])

# Data normalization
datafile = 'd:/data/normalization_data.xls' # initialise parameters
data = pd.read_excel(datafile, header = None) # read the data

(data - data.min())/(data.max() - data.min()) # min-max normalization
(data - data.mean())/data.std() # zero-mean (z-score) normalization
data/10**np.ceil(np.log10(data.abs().max())) # decimal scaling normalization
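# A tiny self-contained check of the three normalizations above on made-up data
# (no Excel file needed):
import numpy as np
import pandas as pd

demo = pd.DataFrame({'a': [78, 144, 95], 'b': [521, -600, -457]})
print((demo - demo.min()) / (demo.max() - demo.min()))   # min-max scaling -> values in [0, 1]
print((demo - demo.mean()) / demo.std())                 # z-score scaling -> mean 0, std 1
print(demo / 10 ** np.ceil(np.log10(demo.abs().max())))  # decimal scaling -> values in (-1, 1)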


### Replacing values
data = Series([1., -999., 2., -999., -1000., 3.])
data

data.replace(-999, np.nan)

data.replace([-999, -1000], np.nan)

data.replace([-999, -1000], [np.nan, 0])

data.replace({-999: np.nan, -1000: 0})
Example #21
WEEKCOLS=[WEEKDF,updf,dwndf]
WEEKDF=pd.concat(WEEKCOLS,axis=1)
WEEKDF.columns=['PLUSMINUSWEEK','UP RATE','DOWN RATE']
'''
print(' ')
print (WEEKDF)
'''

#find the current high low of futures


POSITION=DataFrame([0])
VALUE=FIVEPATTERNCHANGE.mean()*CLWEEK.iloc[0,0]+CLWEEK.iloc[0,0]


WEEKSTDUP=float((VALUE*FIVEPATTERNCHANGE.std())+VALUE)
WEEKSTDDOWN=float(VALUE-(VALUE*FIVEPATTERNCHANGE.std()))


print ("The last week through the last five weeks have done the following")
print (FINALDF)
print (' ')
VALUE=FIVEPATTERNCHANGE.mean()*CLWEEK.iloc[0,0]+CLWEEK.iloc[0,0]
VALUE=DataFrame(VALUE)
print (' ')
print ('Value price based on pattern')
print ("%.2f" % VALUE.iloc[0,0])
print (' ')

print ('One Standard Deviation Up')
print ((VALUE*FIVEPATTERNCHANGE.std())+VALUE)
Example #22
def ExerciseCheckerAlmostCorrect(path):
    
    # Gather the solution
    solution_path = path + "Solutions/Week1.xlsx"
    solution = load_workbook(solution_path, read_only=True, use_iterators=False, 
                             keep_vba=False, guess_types=False, data_only=True)

    solution_rows = prepare_book(solution)
    num_solution_records = len(solution_rows)
    
    print "The number of solution records is: " + str(num_solution_records) + "\n"
    
    num_responses = 0
    
    all_accuracy_array = []
    almost_accuracy_array = []
    
    # Gather the response
    answer_path = path + "Response/Week_1/"
    files = os.listdir(answer_path)
    for file in files:
                        
        file_type_array = file.split(".")
        file_type = file_type_array[len(file_type_array)-1]
        
        if file_type not in ["xlsx", "xlsm", "xltx", "xltm"]:
            continue
        
        print(file)
        
        num_responses += 1
        
        num_check = 0
        num_contain = 0
         
        try:
                        
            answer = load_workbook(answer_path + file, read_only=True, use_iterators=False, 
                               keep_vba=False, guess_types=False, data_only=True)
        
            # Gather each sheet in the answer file
            for sheet in answer:
            
                answer_rows = {}
                       
                for row in sheet.rows:
                    full_address = str.lower(str.strip(str(row[0].value)))
                    remaining_elements = set()
                    for i in range(1, len(row)):
                        remaining_elements.add(str.lower(str.strip(str(row[i].value))))
                    answer_rows[full_address] = remaining_elements
            
                # Compare the answer and the solution
                for row in solution_rows:
                    full_address = str.lower(str.strip(str(row[0].value)))
                    if full_address in answer_rows:
                        for i in range(1, len(row)):
                            num_check += 1
                            row_element = str.lower(str.strip(str(row[i].value)))
                            
                            if row_element in answer_rows[full_address] or row_element[1:len(row_element)] in answer_rows[full_address]:
                                num_contain += 1
                    
        except Exception as e:
            print "False\t" + str(e)
                            
        if num_check > 0:            
            accuracy = float(num_contain) / num_check
            all_accuracy_array.append(accuracy)
            if accuracy < 1:
                almost_accuracy_array.append(accuracy)
        else:
            almost_accuracy_array.append(0.0)   # no cells could be checked for this file
    
    print()
    df1 = DataFrame(all_accuracy_array)
    print("All: " + str(num_responses))
    print(df1.mean())
    print(df1.std())
    
    print()
    df2 = DataFrame(almost_accuracy_array)
    print("Almost: " + str(len(almost_accuracy_array)))
    print(df2.mean())
    print(df2.std())
Example #23
import numpy as np
from pandas import DataFrame, Series
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats

df=DataFrame(abs(np.random.randn(30).reshape(6,5))*100)

plt.bar(np.arange(len(df.mean())), df.mean(),
        align='center',
        color='white',
        linewidth=1.5)
plt.errorbar(np.arange(len(df.mean())), df.mean(), df.std(),
             elinewidth=1.2,
             capsize=7.5,
             fmt='none')


plt.show()
Example #24
    timedeltas_above_double_average_percent = \
        float( len( timedeltas_above_double_average ) ) / len( delta_t ) * 100

    print "Timedeltas above double average",
    print len(timedeltas_above_double_average), 
    print timedeltas_above_double_average_percent

    last_timestamp = timestamps[-1]

    print "Last timestamp", timestamps[-1]
    print "Maximal timestamp", max( timestamps )
    print "Average frequency", float( len( timestamps ) ) / ( float( last_timestamp ) / 1000 ) 


    delta_t = DataFrame( delta_t )
    delta_t.plot()
    pyplot.show()

    print "Timedelta standard deviation", float( delta_t.std() )

    font = {
        'family': 'Consolas',
        'weight': 'x-small',
        'size': 11.0,
        'stretch': 0
    }

    
    
    pyplot.rc( 'font', **font )
    pyplot.show( block=True )    
Example #25
def test():
    # a : single adult
    # b : couple
    # c : child in a couple household
    # d : child in a single-adult household
    # e : teenager in a couple household
    # f : teenager in a single-adult household
    # g : child's bedroom

    # A: 2a,2e
    #  b + 2*c + g
    fa = [0, 1, 2, 0, 0, 0, 1]
    ma = 2754.74

    # B : 2a,2ea,supp:
    #  b + 2*e + 2*g
    fb = [0, 1, 0, 0, 2, 0, 2]
    mb = 3165.15

    # C : 1a,2e:
    #  a + 2*d + g
    fc = [1, 0, 0, 2, 0, 0, 1]
    mc = 2291.04

    # D: 2a, 2e, 2ea, 2*supp :
    #   b + 2*c + 2*e + 3*g
    fd = [0, 1, 2, 0, 2, 0, 3]
    md = 3969.81

    # E : 2a,1ea
    #    b + e + g
    fe = [0, 1, 0, 0, 1, 0, 1]
    me = 2549.17

    # F : 2a, 1e, 2ea
    #    b + c + 2*e + 2*g
    ff = [0, 1, 1, 0, 2, 0, 2]
    mf = 3514.12

    # G: 2a, 1e ,1ea, supp
    #   b + c + e + 2*g
    fg = [0, 1, 1, 0, 1, 0, 2]
    mg = 3042.39

    # H: 1a, 1ea
    #    a + f + g
    fh = [1, 0, 0, 0, 0, 1, 1]
    mh = 2103.91

    # solve f*x = m

    # A supplementary equation is needed because the system is inconsistent
    fsup = [1, -1 / 1.5, 0, 0, 0, 0, 0]
    msup = 0
    f = [fa, fb, fc, fd, fe, ff, fg, fh, fsup]
    m = [ma, mb, mc, md, me, mf, mg, mh, msup]

    results = DataFrame()

    for i in range(8):
        selected_f1 = list(f)
        selected_m1 = list(m)
        selected_f1.pop(i)
        selected_m1.pop(i)
        for j in range(7):
            selected_f = list(selected_f1)
            selected_m = list(selected_m1)
            selected_f.pop(j)
            selected_m.pop(j)

            f_mat = np.array(selected_f)

            m_vec = np.array(selected_m)

            # print i, np.linalg.det(f_mat)
            try:
                x = DataFrame({str(i) + str(j): np.linalg.solve(f_mat, m_vec)}).T
            except np.linalg.LinAlgError:
                # singular subsystem: skip this combination
                x = None

            from pandas import concat

            if x is not None:
                results = concat([results, x])

    print(results)
    print(results.mean())
    print(results.std())
    print(results.std() / results.mean())
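The loop above solves every square subsystem obtained by dropping two of the nine equations and then averages the solutions. A hedged alternative sketch: fit the full, inconsistent system once by ordinary least squares with np.linalg.lstsq, which should land close to the averaged estimate:

import numpy as np

# f and m as assembled above: nine equations (eight households plus the
# supplementary constraint) in seven unknown per-person amounts
f_mat = np.array(f, dtype=float)
m_vec = np.array(m, dtype=float)

x_ls, residuals, rank, _ = np.linalg.lstsq(f_mat, m_vec, rcond=None)
print(x_ls)          # least-squares estimate of the seven unknowns
print(residuals)     # sum of squared residuals of the inconsistent system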
Example #26
import numpy as np
from pandas import DataFrame, Series
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats

df=DataFrame(abs(np.random.randn(30).reshape(6,5))*100)

plt.bar(np.arange(len(df.mean())), df.mean(),
        align='center',
        color='white',
        yerr=df.std(),
        ecolor='black',
        capsize=5,
        linewidth=1)
plt.grid()



plt.show()