Example #1
def get_sms_feats(df):
    df['request_datetime'] = pd.to_datetime(df['request_datetime'])
    df["hour"] = df['request_datetime'].dt.hour
    df["day"] = df['request_datetime'].dt.day

    phone_no_m = df[["phone_no_m"]].copy()
    phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'], keep='last')
    # Number of distinct contacts and number of messages
    tmp = df.groupby("phone_no_m")["opposite_no_m"].agg(sms_count="count",
                                                        sms_nunique="nunique")
    tmp["sms_rate"] = tmp["sms_count"] / tmp["sms_nunique"]
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    """短信下行比例
    """
    calltype2 = df[df["calltype_id"] == 2].copy()
    calltype2 = calltype2.groupby("phone_no_m")["calltype_id"].agg(
        calltype_2="count")
    phone_no_m = phone_no_m.merge(calltype2, on="phone_no_m", how="left")
    phone_no_m[
        "calltype_rate"] = phone_no_m["calltype_2"] / phone_no_m["sms_count"]
    """短信时间
    """
    tmp = df.groupby("phone_no_m")["hour"].agg(
        hour_mode=lambda x: stats.mode(x)[0][0],
        hour_mode_count=lambda x: stats.mode(x)[1][0],
        hour_nunique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")

    tmp = df.groupby("phone_no_m")["day"].agg(
        day_mode=lambda x: stats.mode(x)[0][0],
        day_mode_count=lambda x: stats.mode(x)[1][0],
        day_nunique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")

    return phone_no_m
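A minimal call sketch for the feature builder above, on hypothetical data. It assumes the usual imports, pandas >= 0.25 (keyword-style named aggregation), and SciPy < 1.11 so that stats.mode(x)[0][0] still indexes into an array, which is the older API every example on this page relies on.

import pandas as pd
from scipy import stats

toy = pd.DataFrame({
    "phone_no_m": ["A", "A", "A", "B"],
    "opposite_no_m": ["x", "x", "y", "z"],
    "calltype_id": [1, 2, 2, 1],
    "request_datetime": ["2020-01-01 08:00", "2020-01-01 09:00",
                         "2020-01-02 08:30", "2020-01-03 21:00"],
})
print(get_sms_feats(toy))  # one feature row per phone_no_m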
Example #2
 def actionUpdate(self):
     x = np.arange(-np.pi, np.pi, np.pi / 10)
     y = eval(str(self.ui.lineEdit.text()))
     self.mpl.axes.plot(x, y, '--rx', linewidth=2)
     self.mpl.axes.set_title('Sine Function')
     self.mpl.draw()
     print stats.mode([1, 2, 3, 3, 4, 5])
Example #3
 def actionUpdate(self):
     x = np.arange(-np.pi, np.pi, np.pi / 10)
     if str(self.ui.lineEdit.text()) != "":
         y = eval(str(self.ui.lineEdit.text()))
         self.mpl.axes.plot(x, y, '--rx', linewidth=2)
         self.mpl.axes.set_title('Sine Function')
         self.mpl.draw()
         print stats.mode([1, 2, 3, 3, 4, 5])
    def stat_mode(df, cate_fea, num_fea):
        aim_ = pd.concat([
            # dict-style renaming in .agg was deprecated and later removed in
            # pandas; keyword (named) aggregation is the supported equivalent
            df.groupby(cate_fea)[num_].agg(**{
                num_ + '_mode': lambda x: stats.mode(x)[0][0],
                num_ + '_mode_count': lambda x: stats.mode(x)[1][0]
            }) for num_ in num_fea
        ], axis=1)
        aim_.reset_index(inplace=True)

        return aim_
def get_data(column, np_values, alpha):

    mvs = bayes_mvs(np_values, alpha)

    #report these metrics
    output = [
        present("Column", column),
        present("Length", len(np_values)),
        present("Unique", len(np.unique(np_values))),
        present("Min", np_values.min()),
        present("Max", np_values.max()),
        present("Mid-Range", (np_values.max() - np_values.min())/2),
        present("Range", np_values.max() - np_values.min()),
        present("Mean", np_values.mean()),
        present("Mean-%s-CI" % alpha, tupleToString(mvs[0][1])),
        present("Variance", mvs[1][0]),
        present("Var-%s-CI" % alpha, tupleToString(mvs[1][1])),
        present("StdDev", mvs[2][0]),
        present("Std-%s-CI" % alpha, tupleToString(mvs[2][1])),
        present("Mode", stats.mode(np_values)[0][0]),
        present("Q1", stats.scoreatpercentile(np_values, 25)),
        present("Q2", stats.scoreatpercentile(np_values, 50)),
        present("Q3", stats.scoreatpercentile(np_values, 75)),
        present("Trimean", trimean(np_values)),
        present("Minhinge", midhinge(np_values)),
        present("Skewness", stats.skew(np_values)),
        present("Kurtosis", stats.kurtosis(np_values)),
        present("StdErr", sem(np_values)),
        present("Normal-P-value", normaltest(np_values)[1])
        ]
    return output
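get_data leans on several helpers that are not shown here (present, tupleToString, trimean, midhinge) plus bayes_mvs, sem and normaltest from scipy.stats. Below is a sketch with plausible stand-ins, purely illustrative; the real project may define them differently, and it assumes SciPy < 1.11 so that stats.mode(np_values)[0][0] works.

import numpy as np
from scipy import stats
from scipy.stats import bayes_mvs, sem, normaltest

def present(name, value):
    # One formatted report line per metric
    return "%s: %s" % (name, value)

def tupleToString(t):
    # Render a (lower, upper) confidence interval
    return "(%.4f, %.4f)" % t

def trimean(values):
    q1 = stats.scoreatpercentile(values, 25)
    q2 = stats.scoreatpercentile(values, 50)
    q3 = stats.scoreatpercentile(values, 75)
    return (q1 + 2 * q2 + q3) / 4.0

def midhinge(values):
    return (stats.scoreatpercentile(values, 25) +
            stats.scoreatpercentile(values, 75)) / 2.0

values = np.array([1., 2, 2, 3, 5, 4, 4, 4, 6, 7, 3, 2, 8, 5, 5, 6, 2, 3, 4, 5])
print("\n".join(get_data("demo", values, 0.9)))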
Example #6
def arrayStatistics(numpy_array, missing_value=N.nan):
    if N.isfinite(missing_value):
        valid_values = numpy_array[N.where(numpy_array!=missing_value)]
        if numpy_array.dtype.kind == 'f':
            valid_values = valid_values[N.where(N.isfinite(valid_values))]
    else:
        valid_values = numpy_array[N.where(N.isfinite(numpy_array))]

    if len(valid_values) > 0:
        statistics =  { 'min' : N.min(valid_values),
                        'max' : N.max(valid_values),
                        'mean' : N.mean(valid_values),
                        'stddev' : N.std(valid_values),
                        'median' : N.median(valid_values),
                        'mode' : scipy_stats.mode(valid_values),
                        'missing' : len(numpy_array) - len(valid_values),
                      }
    else:
        statistics =  { 'min' : missing_value, 'max' : missing_value,
                        'mean' : missing_value, 'stddev' : 0.0,
                        'median' : missing_value,
                        'mode' : ( N.array([missing_value,]),
                                   N.array([len(numpy_array),]) ),
                        'missing' : len(numpy_array),
                      }
    return statistics
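A short usage sketch, assuming the module-level aliases this function relies on (N for numpy and scipy_stats for scipy.stats):

import numpy as N
from scipy import stats as scipy_stats

values = N.array([1.0, 2.0, 2.0, N.nan, 7.0])
print(arrayStatistics(values))                        # NaN counted as missing
print(arrayStatistics(values, missing_value=-999.0))  # explicit sentinel value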
def knn_classifier(X_train, y_train, X_validation, X_test, k):
    # Returns the labels for the validation and test data, predicted by the k-NN classifier trained on X_train and y_train
    # Input:
    # X_train - num_train x num_features matrix with features for the training data
    # y_train - num_train x 1 vector with labels for the training data
    # X_validation - num_val x num_features matrix with features for the validation data
    # X_test - num_test x num_features matrix with features for the test data
    # k - Number of neighbors to take into account
    # Output:
    # y_pred_validation - num_val x 1 predicted vector with labels for the validation data
    # y_pred_test - num_test x 1 predicted vector with labels for the test data

    X_test_val = np.vstack((X_validation, X_test))
    # Compute standardized euclidian distance of validation and test points to the other points
    D = cdist(X_test_val, X_train, metric='seuclidean')
    # Sort distances per row and return array of indices from low to high
    sort_ix = np.argsort(D, axis=1)
    # Get the k smallest distances
    sort_ix_k = sort_ix[:, :k]
    predicted_labels = y_train[sort_ix_k]
    # Predictions for each point is the mode of the K labels closest to the point
    predicted_labels = mode(predicted_labels, axis=1)[0]
    y_pred_validation = predicted_labels[:len(X_validation)]
    y_pred_test = predicted_labels[len(X_validation):]
    
    return y_pred_validation, y_pred_test
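A quick sanity check on hypothetical data; it assumes the names the function uses are available at module level (np, cdist from scipy.spatial.distance, mode from scipy.stats):

import numpy as np
from scipy.spatial.distance import cdist
from scipy.stats import mode

X_train = np.array([[0.0], [0.1], [0.2], [5.0], [5.1]])
y_train = np.array([0, 0, 0, 1, 1])
X_val = np.array([[0.05]])
X_test = np.array([[5.05]])
y_val_pred, y_test_pred = knn_classifier(X_train, y_train, X_val, X_test, k=3)
print(y_val_pred.ravel(), y_test_pred.ravel())  # expected: [0] [1]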
Example #8
def get_diff(set_val,time_slots,num_type,conf_lev):

    time_slots_utc = dtime_to_unix(time_slots)
    TIMELET_INV_seconds = (time_slots[1]-time_slots[0]).seconds
    diff_mean = list()

    for r, utc_t in enumerate(time_slots_utc):
        utc_t_s = utc_t
        utc_t_e = utc_t + TIMELET_INV_seconds
        idx = np.nonzero((set_val[0] >= utc_t_s) & (set_val[0] < utc_t_e))[0]

        if len(idx) < 2:
            diff_val = np.inf
        else:
            temp_val = abs(np.diff(set_val[1][idx]))
            upper_val = np.sort(temp_val)[int(np.floor(len(temp_val)*conf_lev)):]
            if len(upper_val) == 0:
                 diff_val = np.inf
            else:
                if num_type == FLOAT_TYPE:
                    diff_val = np.mean(upper_val)
                elif num_type == INT_TYPE:
                    diff_val = int(stats.mode(upper_val)[0])
                else:
                    log.error('Sample type must be either INT or FLOAT type')
                    raise NameError('Sample type must be either INT or FLOAT type')

            #diff_val=max(abs(diff(set_val[1][idx])))
            #sort(abs(diff(set_val[1][idx])))[::-1]

        diff_mean.append(diff_val)

    #diff_mean=np.array(diff_mean)[:,np.newaxis]
    diff_mean = np.array(diff_mean)
    return diff_mean
def get_data(column, np_values, alpha):

    mvs = bayes_mvs(np_values, alpha)

    #report these metrics
    output = [
        present("Column", column),
        present("Length", len(np_values)),
        present("Unique", len(np.unique(np_values))),
        present("Min", np_values.min()),
        present("Max", np_values.max()),
        present("Mid-Range", (np_values.max() - np_values.min()) / 2),
        present("Range",
                np_values.max() - np_values.min()),
        present("Mean", np_values.mean()),
        present("Mean-%s-CI" % alpha, tupleToString(mvs[0][1])),
        present("Variance", mvs[1][0]),
        present("Var-%s-CI" % alpha, tupleToString(mvs[1][1])),
        present("StdDev", mvs[2][0]),
        present("Std-%s-CI" % alpha, tupleToString(mvs[2][1])),
        present("Mode",
                stats.mode(np_values)[0][0]),
        present("Q1", stats.scoreatpercentile(np_values, 25)),
        present("Q2", stats.scoreatpercentile(np_values, 50)),
        present("Q3", stats.scoreatpercentile(np_values, 75)),
        present("Trimean", trimean(np_values)),
        present("Minhinge", midhinge(np_values)),
        present("Skewness", stats.skew(np_values)),
        present("Kurtosis", stats.kurtosis(np_values)),
        present("StdErr", sem(np_values)),
        present("Normal-P-value",
                normaltest(np_values)[1])
    ]
    return output
def predict(clf2, test_set):
    uid = pd.DataFrame()
    # test_set = processing(trainSpan=(1, 30), label=False)
    uid["user_id"] = test_set["user_id"]
    test_set = test_set.drop(labels=["user_id"], axis=1)
    # if isinstance(selector,RFECV):
    #     test_set_new = selector.transform(test_set.values)
    # elif isinstance(selector,list):
    #     test_set_new = test_set[selector]
    # else:
    #     test_set_new = test_set
    print("begin to make predictions")
    res = clf2.predict(test_set.values)
    uid["y_hat"] = pd.Series(res)
    uid["label"] = uid.groupby(by=["user_id"])["y_hat"].transform(lambda x: stats.mode(x)[0][0])
    str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
    uid_file = "result/uid_" + str_time + ".csv"
    uid.to_csv(uid_file,header=True,index=False)
    active_users = (uid.loc[uid["label"] == 1]).user_id.unique().tolist()
    print(len(active_users))
    print(active_users)
    str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
    submission_file = "result/submission_" + str_time + ".csv"
    with open(submission_file, "a", newline="") as f:
        writer = csv.writer(f)
        for i in active_users:
            writer.writerow([i])
Example #11
    def fit_predict(self, X, y, **fit_params):
        self.X_ = X
        self.y_ = y

        # To keep the computation vectorized with NumPy, predictions on the training set itself are obtained by repeatedly rolling the training set.
        X_roll = X
        distances = np.zeros((len(X), len(X) - 1))
        for i in range(len(self.X_) - 1):
            X_roll = np.roll(X_roll, -1, axis=0)
            # Roll the whole sample array to get pairwise distances between samples
            distances[:, i] = np.power(
                np.sum(np.power(X - X_roll, self.p), axis=1), 1 / self.p)
        min_k_ind = np.argpartition(distances, self.k, axis=1)[:, :self.k]
        # distances[i, j] is the distance between x[i] and x[(i + j + 1) % len(X)], so the neighbor indices need to be shifted accordingly
        neighbors_ind = (min_k_ind +
                         np.arange(1,
                                   len(X) + 1).reshape(-1, 1)) % len(X)
        neighbors_labels = y[neighbors_ind]

        # Uniformly weighted prediction
        if self.weights == 'uniform':
            return stats.mode(neighbors_labels, axis=1).mode.flatten()

        # Weighted prediction (a list rather than a generator, since NumPy has
        # deprecated generator input to np.vstack)
        neighbors_distances = np.vstack(
            [distances[i][min_k_ind[i]] for i in range(len(X))])
        # Inverse-distance weights
        if self.weights == 'inverse':
            weights_ = 1 / neighbors_distances
        # Gaussian function of the distance
        if self.weights == 'Gaussian':
            weights_ = np.exp(-np.square(neighbors_distances) / 2)
        return self.weighted_predict(neighbors_labels, weights_)
Example #12
 def extractfromfits(self, filename, loc, size, sky=0.):
     """
     Extracts a subregion from a fits file  and converts it according
     to the astro and math convention: pixel (0,0) is at the bottom left
     """
     x, y = loc
     radius = int(size/2)
     r = size-radius * 2
     hdulist = pyfits.open(filename)  # open a FITS file
     if len(hdulist) != 1:
         raise RuntimeError, "extractfromfits : len(hdulist) > 1 not allowed"
     fulldata = hdulist[0].data       # assumes the first extension is an image
     if x< 0 or y< 0 or x>= fulldata.shape[1] or y>= fulldata.shape[0]:
         raise RuntimeError, "extractfromfits : bad extraction parameters"
     if x+radius+r >= fulldata.shape[1] or y+radius+r >= fulldata.shape[0] or x-radius<0 or y-radius<0:
         #TODO: set outside pixels to NaN
         print "outside"
     self.array = self.array + np.zeros(self.array.shape, dtype=np.float64)    # switch to 8 byte   
     self.array = fulldata[y-radius:y+radius+r, x-radius:x+radius+r].transpose()            # get values from the subsection 
     #    This transposition makes the pixel-array coordinates (x, y) equal to those in the ds9 display etc.
     #    In other words, we are in the math and astro convention.
     #    x = horizontal, y = vertical, (0, 0) is bottom left.
     self.array[np.where(np.isnan(self.array))] = sky
     hdulist.close()
     if sky is None:
         #TODO: check...
         self.array -= stats.mode(self.array.ravel())[0][0]
     else:
         self.array -= sky
     self.setzscale()
Example #13
 def create_dataset(X, y, time_steps=1, step=1):
     Xs, ys = [], []
     for i in range(0, len(X) - time_steps, step):
         v = X.iloc[i:(i + time_steps)].values
         labels = y.iloc[i:i + time_steps]
         Xs.append(v)
         ys.append(stats.mode(labels)[0][0])
     return np.array(Xs), np.array(ys).reshape(-1, 1)
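A toy run of the sliding-window helper above, on hypothetical data; it assumes pandas/NumPy/SciPy are imported as in the other examples and SciPy < 1.11 so that stats.mode(labels)[0][0] indexes into an array:

import numpy as np
import pandas as pd
from scipy import stats

X = pd.DataFrame({"acc_x": np.arange(10, dtype=float)})
y = pd.Series([0, 0, 0, 1, 1, 1, 1, 2, 2, 2])
Xs, ys = create_dataset(X, y, time_steps=4, step=2)
print(Xs.shape, ys.ravel())  # (3, 4, 1) and the majority label of each window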
Example #14
 def unsupervised_habitat_class_modes(self):
     hcm = {}
     for hab in self.habitats:
         md, cn = mode( self.unsupervised_habitat_class_dict[hab] )
         if len( md )==1:
             hcm[hab] = md[0]
         else:
             hcm[hab] = None
     return hcm
Example #15
 def unsupervised_habitat_class_modes(self):
     hcm = {}
     for hab in self.habitats:
         md, cn = mode(self.unsupervised_habitat_class_dict[hab])
         if len(md) == 1:
             hcm[hab] = md[0]
         else:
             hcm[hab] = None
     return hcm
def calculate_weighted_loss(data):
    weighted_loss = 0
    if data.shape[0] != 0:
        label_column = data[:, -1]
        label_column = np.array(label_column.tolist())
        y_prediction_mode = stats.mode(label_column, axis=0)[0][0]
        y_prediction = [y_prediction_mode] * label_column.shape[0]
        weighted_loss = ch.custom_weighted_loss(label_column, np.array(y_prediction))
    return weighted_loss
Example #17
    def Aggregate(self,
                  sourceRaster,
                  outFName,
                  method=None,
                  numSourcePerTarget=10):

        tmpOutput = os.path.splitext(outFName)[0] + ".tif"
        tmpOutDataset = SpatialUtilities.SAHMRaster(tmpOutput)
        tmpOutDataset.pullParamsFromRaster(self.templateRaster.source)
        tmpOutDataset.createNewRaster()

        rows = int(sourceRaster.height)
        cols = int(sourceRaster.width)

        # loop of 'blocks' of data maybe.
        bSize = 2048  # source pixels
        # convert this to the nearest whole number of target pixels
        bSize = int(round(bSize / numSourcePerTarget) * numSourcePerTarget)
        if bSize == 0:
            bSize = int(numSourcePerTarget)

        for i in range(0, rows, bSize):
            if i + bSize < rows:
                numRows = bSize
            else:
                numRows = rows - i

            for j in range(0, cols, bSize):
                if j + bSize < cols:
                    numCols = bSize
                else:
                    numCols = cols - j

                data = sourceRaster.getBlock(j, i, numCols, numRows)

                if method is None:
                    method = "Mean"
                if method in ["Mean", "Max", "Min", "STD"]:
                    ans = self.rebin(data, (numRows / numSourcePerTarget,
                                            numCols / numSourcePerTarget),
                                     method)
                else:
                    X, Y = data.shape
                    x = X // numSourcePerTarget
                    y = Y // numSourcePerTarget
                    ndMask = data.reshape(
                        (x, numSourcePerTarget, y, numSourcePerTarget))
                    ndMask = ndMask.transpose([0, 2, 1, 3])
                    ndMask = ndMask.reshape(
                        (x * y, numSourcePerTarget * numSourcePerTarget))
                    ans = np.array(stats.mode(ndMask, 1)[0]).reshape(x, y)

                tmpOutDataset.putBlock(ans, int(j / numSourcePerTarget),
                                       int(i / numSourcePerTarget))

        tmpOutDataset.calcStats()
        tmpOutDataset.close()
def HMM(X, Y, human_bounds):
    """fit hidden markov model
  
       Fit HMM to average data and cross-validate with leftout subject using within song and between song average correlations              

       Parameters
       ----------
       A: voxel by time ndarray (2D)
       B: voxel by time ndarray (2D)
       C: voxel by time ndarray (2D)
       D: voxel by time ndarray (2D)
       K: # of events for HMM (scalar)
 
       Returns
       -------
       z: z-score after performing permuted cross-validation analysis      

    """

    # Fit to all but one subject
    nPerm = 1000
    within_across = np.zeros(nPerm + 1)
    K = len(human_bounds) + 1
    nTR = X.shape[1]

    # create events vector out of human bounds
    add_zero = np.append([0], human_bounds)
    full_human_bounds = np.append(add_zero, [nTR])
    diff_bounds = np.diff(full_human_bounds)
    events = np.zeros((nTR))

    for l in range(len(diff_bounds)):
        events[full_human_bounds[l]:full_human_bounds[l + 1]] = l

    max_event_length = stats.mode(events)[1][0]

    # compute timepoint by timepoint correlation matrix
    cc = np.corrcoef(Y.T)  # Should be a time by time correlation matrix

    # Create a mask to only look at values up to max_event_length
    local_mask = np.zeros(cc.shape, dtype=bool)
    for k in range(1, max_event_length):
        local_mask[np.diag(np.ones(cc.shape[0] - k, dtype=bool), k)] = True

    for p in range(nPerm + 1):
        same_event = events[:, np.newaxis] == events
        within = fisher_mean(cc[same_event * local_mask])
        across = fisher_mean(cc[(~same_event) * local_mask])
        within_across[p] = within - across

        np.random.seed(p)
        events = np.zeros(nTR, dtype=int)  # np.int alias was removed from NumPy
        events[np.random.choice(nTR, K - 1, replace=False)] = 1
        events = np.cumsum(events)

    return within_across
def HMM(X, Y, human_bounds):
    """fit hidden markov model
  
       Fit HMM to average data and cross-validate with leftout subject using within song and between song average correlations              

       Parameters
       ----------
       A: voxel by time ndarray (2D)
       B: voxel by time ndarray (2D)
       C: voxel by time ndarray (2D)
       D: voxel by time ndarray (2D)
       K: # of events for HMM (scalar)
 
       Returns
       -------
       z: z-score after performing permuted cross-validation analysis      

    """

    # Fit to all but one subject
    nPerm = 1000
    within_across = np.zeros(nPerm + 1)
    K = len(human_bounds) + 1
    nTR = X.shape[1]
    ev = brainiak.eventseg.event.EventSegment(K,
                                              split_merge=True,
                                              split_merge_proposals=3)
    ev.fit(X.T)
    events = np.argmax(ev.segments_[0], axis=1)
    bounds = np.where(np.diff(np.argmax(ev.segments_[0], axis=1)))[0]
    _, event_lengths = np.unique(events, return_counts=True)
    max_event_length = stats.mode(events)[1][0]

    # compute timepoint by timepoint correlation matrix
    cc = np.corrcoef(Y.T)  # Should be a time by time correlation matrix

    # Create a mask to only look at values up to max_event_length
    local_mask = np.zeros(cc.shape, dtype=bool)
    for k in range(1, max_event_length):
        local_mask[np.diag(np.ones(cc.shape[0] - k, dtype=bool), k)] = True

    for p in range(nPerm + 1):
        same_event = events[:, np.newaxis] == events
        within = fisher_mean(cc[same_event * local_mask])
        across = fisher_mean(cc[(~same_event) * local_mask])
        within_across[p] = within - across

        np.random.seed(p)
        perm_lengths = np.random.permutation(event_lengths)
        events = np.zeros(nTR, dtype=int)  # np.int alias was removed from NumPy
        events[np.cumsum(perm_lengths[:-1])] = 1
        events = np.cumsum(events)

    return within_across, bounds
Example #20
def test_kpi_level_model(predict_result_file, final_result_file):
    df = pd.read_csv(predict_result_file, sep=',', dtype=str)
    df = df[df['predict_event'] == '1']
    mapping_dict = {'Biz': 0, 'Mon': 1, 'Ora': 2, 'Trd': 3, 'Other': 4}
    knn_model_list = []
    knn_model_list = kpi_level_model.test_KNN_model(cluster_data_dir)
    all_df = pd.DataFrame(columns=[
        'alertgroup', 'classifier', 'hostname', 'predict_event',
        'predict_level'
    ])
    for alertgroup, group in df.groupby('alertgroup'):
        column_list = [
            'cpu_max', 'cpu_min', 'mem_max', 'mem_min', 'cpu_max_1',
            'cpu_min_1', 'mem_max_1', 'mem_min_1', 'cpu_max_2', 'cpu_min_2',
            'mem_max_2', 'mem_min_2'
        ]
        data = group[column_list]
        kpi_predict_result = []
        for i in knn_model_list:
            kpi_predict_result.append(i.predict(data))
        print(kpi_predict_result)
        predict_results = np.zeros(len(group))
        df_res = pd.DataFrame(columns=['predict_level'])
        for idx in range(len(group)):
            sample_predict_vec = np.array([
                np.round(kpi_predict_result[0][idx]),
                np.round(kpi_predict_result[1][idx]),
                np.round(kpi_predict_result[2][idx]),
                np.round(kpi_predict_result[3][idx]),
                np.round(kpi_predict_result[4][idx])
            ])
            # print(sample_predict_vec)
            # Mode of the five models' predictions
            mode_prediction_res = stats.mode(sample_predict_vec)[0][0]
            print(mode_prediction_res)
            # Maximum of the five models' predictions
            max_prediction_res = sample_predict_vec[np.argmax(sample_predict_vec)]
            print(max_prediction_res)
            # group_prediction_res <= max_prediction_res; the prediction of the
            # business model matching this record's alert group
            group_prediction_res = sample_predict_vec[mapping_dict[alertgroup]]
            print(group_prediction_res)
            if (mode_prediction_res <= 2 and max_prediction_res <= 2):
                predict_results[idx] = group_prediction_res
            else:
                predict_results[idx] = max_prediction_res
            df_res.loc[idx] = int(predict_results[idx])

        new_df = group[[
            'alertgroup', 'classifier', 'hostname', 'predict_event'
        ]].reset_index(drop=True).join(df_res, how='outer')
        all_df = pd.concat([all_df, new_df])

    print(all_df)
    all_df.to_csv(final_result_file, sep=',', index=False)
Example #21
def average_predictions(models, test_features, np_value="int", method="mode"):
    predictions = np.column_stack([model.predict(test_features) for model in models])
    print(f"Predictions of first 5 rows: {predictions[:5]}")
    if method == "mode":
        averaged_predictions = stats.mode(predictions, axis=1)[0].astype(np_value)
    elif method == "mean":
        averaged_predictions = np.average(predictions, axis=1).astype(np_value)
    else:
        raise Exception("Method undefined")
    print(f"Averaged predictions of first 5 rows: {averaged_predictions[:5]}")
    return np.hstack(averaged_predictions)
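An illustrative call with two small fitted models on hypothetical data; scikit-learn (already used by other examples on this page) is only needed for the demo, not by the function itself:

import numpy as np
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 0, 1, 1])
models = [DecisionTreeClassifier().fit(X, y), LogisticRegression().fit(X, y)]
print(average_predictions(models, X))  # majority vote per row -> [0 0 1 1]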
Example #22
    def predict(self, examples):
        X = vstack( [reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in examples] )

        dist, ind = self.lsh.kneighbors(X)

        rows, columns = ind.shape
        for row in xrange(0, rows):
            for column in xrange(0, columns):
                ind[row, column] = self.Y[ind[row, column]]
                
        vals, counts = mode(ind, axis=1)
        
        return reshape(vals, (1, len(examples))).tolist()[0]
Example #23
def info_univariate(data, features_name):
    df_np = np.array(data)
    df_transposed = np.transpose(df_np)
    for f in range(0, len(df_transposed), 1):
        ds = sorted(df_transposed[f])
        moda = stats.mode(ds)
        print(
            'Feature: {}:\nMAX: --> {}\nMIN:  --> {}\nAVG:  --> {}\nMODE:  --> V:{} --> {}\nMed  --> {}\n'
            .format(features_name[f], np.max(df_transposed[f]),
                    np.min(df_transposed[f]),
                    round(np.mean(df_transposed[f]), 1), moda[0], moda[1],
                    np.median(ds)))
    plot_boxnotch_univariateanalysis(df_transposed, features_name)
    return
def ensemble_voting(predictions, gold, dataset):
    stacked = numpy.stack(predictions, axis=0)
    modals = stats.mode(stacked, axis=0)[0].squeeze().astype(int)

    if dataset != "test":
        accuracy = acc(gold, modals)
        f1 = f1_macro(gold, modals)
        print("acc: ", accuracy)
        print("f1: ", f1)
    else:
        accuracy = 0
        f1 = 0

    return modals, accuracy, f1
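A quick check of the majority-vote ensembling with made-up predictions; passing dataset="test" skips the accuracy/F1 step, so the undefined acc and f1_macro helpers are not needed:

import numpy
from scipy import stats

preds = [numpy.array([0, 1, 1]), numpy.array([0, 1, 0]), numpy.array([1, 1, 0])]
modals, _, _ = ensemble_voting(preds, None, "test")
print(modals)  # per-position majority vote -> [0 1 0]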
def density_categorical_accuracy(labels, predicted_labels, classes):
    assert (len(labels) == len(predicted_labels))
    if len(labels) == 0:
        return 0

    n_cluster = np.max(predicted_labels) + 1
    clusters = [[] for _ in range(n_cluster)]

    for label, predicted_label in zip(labels, predicted_labels):
        clusters[predicted_label].append(label)

    catacc = np.average([stats.mode(d)[1][0] / len(d) for d in clusters],
                        weights=[len(d) for d in clusters])
    corrected_catacc = (catacc - 1.0 / classes) / (1.0 - 1.0 / classes)

    return corrected_catacc
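A worked toy example of this cluster-purity score (hypothetical labels; assumes SciPy < 1.11, where stats.mode(d)[1][0] is the count of the most frequent true label in cluster d):

import numpy as np
from scipy import stats

labels = np.array([0, 0, 1, 1, 1, 2])
predicted = np.array([0, 0, 0, 1, 1, 1])  # two clusters found
# Each cluster is 2/3 pure, so catacc = 2/3 and the chance-corrected
# score is (2/3 - 1/3) / (1 - 1/3) = 0.5
print(density_categorical_accuracy(labels, predicted, classes=3))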
Example #26
def get_qda_oof_prediction(x_train,y_train,x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS,ntest))
    
    for i,(train_ind,test_ind) in enumerate(skf.split(x_train,y_train)):
        model = QuadraticDiscriminantAnalysis()
        y_tr = y_train[train_ind]
        x_tr = x_train[train_ind]
        x_ts = x_train[test_ind]
        model.fit(x_tr,y_tr)
        oof_train[test_ind] = model.predict(x_ts)
        oof_test_skf[i,:] = model.predict(x_test)
        print("Test score {} ".format(f1_score(y_train[test_ind],oof_train[test_ind])))        
    oof_test = stats.mode(oof_test_skf,axis=0)[0]
    return oof_train.reshape(-1,1),oof_test.reshape(-1,1)
Example #27
    def predict(self, X):
        distances = np.zeros((len(X), len(self.X_)))
        for i in range(len(X)):
            distances[i, :] = np.power(
                np.sum(np.power(X[i] - self.X_, self.p), axis=1), 1 / self.p)
        neighbors_ind = np.argpartition(distances, self.k, axis=1)[:, :self.k]
        neighbors_labels = self.y_[neighbors_ind]

        if self.weights == 'uniform':
            # Majority vote per row over the k neighbor labels
            return stats.mode(neighbors_labels, axis=1).mode.flatten()

        neighbors_distances = np.vstack(
            [distances[i][neighbors_ind[i]] for i in range(len(X))])
        if self.weights == 'inverse':
            weights_ = 1 / neighbors_distances
        if self.weights == 'Gaussian':
            weights_ = np.exp(-np.square(neighbors_distances) / 2)
        return self.weighted_predict(neighbors_labels, weights_)
Example #28
def get_feature(data_dict_samples,num_type):
    x_temp=[]
    for i,sample in enumerate(data_dict_samples):
        # If sample == [], np.std returns 0. To avoid a zero std, add an infinitesimal number
        if len(sample)==0: # Set to infinity if no sample is available
            x_temp.append(np.inf)                

        else:
            if num_type==INT_TYPE:
                x_temp.append(int(stats.mode(sample)[0]))                
            elif num_type==FLOAT_TYPE:
                x_temp.append(np.mean(sample))                
            else:
                raise NameError('Sample type must be either INT or FLOAT type')

    x_temp=np.array(x_temp)[:,np.newaxis]
    return x_temp
    def fit(self, trainExamples):
        self.expectedValues = {}
        
        for x in trainExamples:
            for (key, value) in x.items():
                if key != "Image":
                    if not key in self.expectedValues:
                        self.expectedValues[key] = []
                    
                    if len(value) > 0:
                        self.expectedValues[key].append(round(float(value)/0.5,0)*0.5)
                        
        for key in self.expectedValues.keys():
            self.expectedValues[key], _ = mode(self.expectedValues[key])
            self.expectedValues[key] = self.expectedValues[key][0]
            

        return self
Example #30
def get_create_feature(row):
    feature = pd.Series()
    feature['user_id'] = list(row['user_id'])[0]
    # feature['create_count'] = len(row)
    diff_day = np.diff(row['day'])
    if len(diff_day) != 0:
        # feature['create_day_diff_mean'] = np.mean(diff_day)
        # feature['create_day_diff_std'] = np.std(diff_day)
        # feature['create_day_diff_min'] = np.min(diff_day)
        # feature['create_day_diff_mode'] = stats.mode(interval_data)[0][0]
        feature['create_day_diff_ske'] = stats.skew(diff_day)
        feature['create_day_diff_kur'] = stats.kurtosis(diff_day)
        # feature['create_day_diff_max'] = np.max(diff_day)
        feature['create_day_last'] = diff_day[-1]
        feature['create_sub_register'] = np.subtract(np.max(row['max_day']),
                                                     np.max(row['day']))
        feature['create_mode'] = stats.mode(row['day'])[0][0]
        return feature
Example #31
def get_sgd_oof_prediction(SEED,x_train,y_train,x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS,ntest))
    
    for i,(train_ind,test_ind) in enumerate(skf.split(x_train,y_train)):
        model = SGDClassifier(max_iter=100,random_state=SEED,loss="squared_hinge",alpha=0.009,penalty='l1')
        y_tr = y_train[train_ind]
        scaler = StandardScaler()
        x_tr = scaler.fit_transform(x_train[train_ind])
        x_ts = scaler.transform(x_train[test_ind])
        x_test_s = scaler.transform(x_test)
        model.fit(x_tr,y_tr)
        oof_train[test_ind] = model.predict(x_ts)
        oof_test_skf[i,:] = model.predict(x_test_s)
        print("Test score {} ".format(f1_score(y_train[test_ind],oof_train[test_ind])))
        
    oof_test = stats.mode(oof_test_skf,axis=0)[0]
    return oof_train.reshape(-1,1),oof_test.reshape(-1,1)
Example #32
def get_log_oof_prediction(SEED,x_train,y_train,x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS,ntest))
    
    for i,(train_ind,test_ind) in enumerate(skf.split(x_train,y_train)):
        model = LogisticRegression(random_state=SEED,C=0.8252042855888113,penalty='l1',verbose=2)
        y_tr = y_train[train_ind]
        scaler = StandardScaler()
        x_tr = scaler.fit_transform(x_train[train_ind])
        x_ts = scaler.transform(x_train[test_ind])
        x_test_s = scaler.transform(x_test)
        model.fit(x_tr,y_tr)
        oof_train[test_ind] = model.predict(x_ts)
        oof_test_skf[i,:] = model.predict(x_test_s)
        print("Test score {} ".format(f1_score(y_train[test_ind],oof_train[test_ind])))
        
    oof_test = stats.mode(oof_test_skf,axis=0)[0]
    return oof_train.reshape(-1,1),oof_test.reshape(-1,1)
Example #33
    def process(img, head, gray_old):
        if shot_frame:
            cv2.imwrite(folder_depth + head + f"{loop:05d}.jpg", img)
        img = cv2.resize(img, resize_shape, interpolation=cv2.INTER_LINEAR)
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        gray = cv2.medianBlur(gray, 3)
        if mix_image:
            diff = cv2.absdiff(gray, gray_old)
            th = stats.mode(diff, axis=None)[0][0]
            diff_valid = diff > th + gray_diff_th
            if gray_mode:
                diff[np.logical_not(diff_valid)] = 0
                return diff, gray

            if np.sum(diff_valid) > change_th: return None, gray
            return gray, gray
        elif gray_mode:
            return gray, gray
        return img, gray
Example #34
def split_data_into_steps(data, N_TIME_STEPS, N_FEATURES, step, RANDOM_SEED):
    segments = []
    labels = []
    activities = set()
    for i in range(0, len(data) - N_TIME_STEPS, step):
        xs = data['x'].values[i: i + N_TIME_STEPS]
        ys = data['y'].values[i: i + N_TIME_STEPS]
        zs = data['z'].values[i: i + N_TIME_STEPS]
        label = stats.mode(data['activity'][i: i + N_TIME_STEPS])[0][0]
        segments.append([xs, ys, zs])
        labels.append(label)
        activities.add(label)

    reshaped_segments = np.asarray(segments, dtype=np.float32).reshape(-1, N_TIME_STEPS, N_FEATURES)  # pd.np was removed from pandas; assumes numpy is imported as np
    labels = np.asarray(pd.get_dummies(labels), dtype=np.float32)

    validation_split = get_validation_split()
    x_train, x_test, y_train, y_test = train_test_split(reshaped_segments, labels, test_size=validation_split,
                                                        random_state=RANDOM_SEED)
    return x_train, x_test, y_train, y_test, activities
Example #35
def test_voting(model, ipca, tensor, labels, times, interval_len, cnt):
    ok = 0.
    total = 0.
    with tf.Session() as sess:
        for i in range(cnt):
            x, y = prepate_data_for_voting(sess, ipca, tensor, labels, times, interval_len)
            num, width, temp, feat = x.shape
            x = np.reshape(x, (-1, x.shape[2] * x.shape[3]))

            x = (x - ipca.mean_my) / ipca.var_my
            x = ipca.transform(x)
            y_pred = model.predict(x)
            y_pred = np.reshape(y_pred, (-1, width))
            y_pred = stats.mode(y_pred, axis=1)[0]
            y_pred = np.reshape(y_pred, (-1))
            y = np.argmax(y, axis=1)
            ok += np.sum(y == y_pred, axis=0)
            total += y.shape[0]

    return ok / total
Example #36
def get_feature(data_dict_samples,num_type):
    x_temp = []
    for i, sample in enumerate(data_dict_samples):

        # If sample == [], np.std returns 0. To avoid a zero std, add an infinitesimal number

        # Set to infinity if no sample is available
        if len(sample) == 0:
            x_temp.append(np.inf)                

        else:
            if num_type == INT_TYPE:
                x_temp.append(int(stats.mode(sample)[0]))                
            elif num_type == FLOAT_TYPE:
                x_temp.append(np.mean(sample))                
            else:
                raise NameError('Sample type must be either INT or FLOAT type')

    x_temp = np.array(x_temp)[:, np.newaxis]
    return x_temp
Example #37
 def train(self, input_data, target_data):
     (self._most_frequent_value, ), _ = mode(target_data, axis = 0)
     self._target_type = target_data.dtype
Example #38
    #A special column
    #This would be obsolete with:
    #from matplotlib.dates import MONDAY, MonthLocator, WeekdayLocator, DateFormatter
    time = getTime(DateTimeUT)
    
    #Just an empty class
    class Dummy(object): pass
    
    #Lets get some statistics
    tdfst = Dummy()
    tdfStat = []
    
    tdfst.name, tdfst.median, tdfst.max, tdfst.min, tdfst.mean, tdfst.stdev = \
    "Seeing", median(medianFWHM), max(medianFWHM), min(medianFWHM), mean(medianFWHM), std(medianFWHM)
    
    tdfst.mode = mode(medianFWHM)
 
    tdfStat.append(tdfst)
    
    if opts.verbose == True:
        print
        print ("%5s" + "%11s"*6) % ("Name", "Median", "Max", "Min", "Mean", "Stdev", "Mode")
        frmt = "%5s" + "%11.2f"*6
        print frmt % (tdfst.name, tdfst.median, tdfst.max, tdfst.min, tdfst.mean, tdfst.stdev, tdfst.mode[0])
        print
    
    #Calculates some 2D correlations
    WDCorr = spearmanr(medianFWHM, WindDirectionDeg)
    Humidity = spearmanr(medianFWHM, HumidityPercent)
    Pressure = spearmanr(medianFWHM, PressureHPA)
Example #39
def open_behavioural(path, subj, **kwargs):
    ############# BOLOGNA ##################
    
    dropped_trials = []
    behavioural_data = []
    for arg in kwargs:
        if arg == 'dropped_trials':
            dropped_trials = np.int_(kwargs[arg].split(','))
        if arg == 'behavioural_data':
            behavioural_data = kwargs[arg].split(',')
    
    import xlrd
    fn = os.path.join(path, subj)
    
    book = xlrd.open_workbook(fn) #Open workbook
    sh = book.sheet_by_index(0) #Choose sheet
    
    labels = sh.row_values(0)
    labels = [unicode.lower(unicode(l)) for l in labels]
    l_array = np.array(labels, dtype = np.str)
    
    indexes = []
    data_tot = []
    dtype = []
    for field in behavioural_data:
        index = np.nonzero(l_array == str.lower(field))[0][0]
        
        data = sh.col_values(int(index))[1:]
        print field
        type_ = mode([x.__class__ for x in data])[0][0]
        if type_ == unicode or type_ == str:
            data = [x.__class__.lower(x) for x in data]
            t = (field, np.str_, 45)
        else:
            #print data
            data = [(int(x) if (x != 'NULL') and (x != '') else 0) for x in data]
            t = (field, np.int_, 1)
        
        dtype.append(t)
        data_tot.append(data)
    
    data_tot.append(range(1,len(sh.col_values(0)[1:])+1))
    dtype.append(('TrialNo.', np.int_, 1))
    
    '''    
    behavioural = np.array(zip(
                               sh.col_values(6)[1:], #Condition Label
                               sh.col_values(19)[1:],
                               np.float_([(int(x) if x else 0) for x in sh.col_values(18)[1:]]), #Accuracy
                               np.int_([(int(x) if x else 0) for x in sh.col_values(4)[1:]]),
                               np.arange(len(sh.col_values(0)[1:]))+1 #Combination
                            ), 
                           dtype=[('Condition', np.str_,2),
                                  ('SlideImage', np.str_,10),
                                  ('Accuracy', np.int_, 1),
                                  ('Combination', np.int_, 1),
                                  ('TrialNo.', np.int_, 1)]
                           )
    '''
    
    behavioural = np.array(zip(*data_tot), dtype=dtype)
    
    
    if len(dropped_trials) > 0:
        mask = 0
        for trial in dropped_trials:
            mask = mask + np.int_(behavioural['TrialNo.'] == trial)
    
        behavioural = behavioural[~np.bool_(mask)]
    
    return behavioural
Example #40
Southwest, 4.79, 2.71
Wales, 5.27, 3.53
Scotland, 6.08, 4.51
Northern Ireland, 4.02, 4.56'''

data = data.splitlines()
data = [i.split(', ') for i in data]

column_names = data[0]
data_rows = data[1::]
df = pd.DataFrame(data_rows, columns=column_names)

df['Alcohol'] = df['Alcohol'].astype(float)
df['Tobacco'] = df['Tobacco'].astype(float)

print "The mean for the Alcohol dataset is", df['Alcohol'].mean() 
print  "The median for the Alcohol dataset is", df['Alcohol'].median() 
# Not sure how to return only the first array (the mode value itself),
# or how to print "there is no mode" when the returned frequency is 1; see the sketch after this script.
print  "The mode for the Alcohol dataset is", stats.mode(df['Alcohol']) 
print "The range for the Alcohol dataset is", max(df['Alcohol']) - min(df['Alcohol'])
print "The standard deviation for the Alcohol dataset is", df['Alcohol'].std() 
print "The variance for the Alcohol dataset is", df['Alcohol'].var() 

print  "The mean for the Tobacco dataset is", df['Tobacco'].mean() 
print "The median for the Tobacco dataset is", df['Tobacco'].median() 
print "The mode for the Tobacco dataset is", stats.mode(df['Tobacco']) 
print "The range for the Tobacco dataset is", max(df['Tobacco']) - min(df['Tobacco'])
print "The standard deviation for the Tobacco dataset is", df['Tobacco'].std() 
print "The variance for the Tobacco dataset is", df['Tobacco'].var()