def group_log_means_predict_manual(df_train, df_test, vali):
    start_time = time.time()
    all_mean = df_train['log_Demanda_uni_equil'].mean()
    P_mean = df_train.groupby(by=['short_name'])['log_Demanda_uni_equil'].mean()
    C_mean = df_train.groupby(by=['Cliente_ID'])['log_Demanda_uni_equil'].mean()
    PA_mean = df_train.groupby(by=['short_name', 'Agencia_ID'])['log_Demanda_uni_equil'].mean()
    PR_mean = df_train.groupby(by=['short_name', 'Ruta_SAK'])['log_Demanda_uni_equil'].mean()
    PCA_mean = df_train.groupby(by=['short_name', 'Cliente_ID', 'Agencia_ID'])['log_Demanda_uni_equil'].mean()

    print('mean calculating time=', time.time()-start_time)

    start_time = time.time()
    if not vali:
        df_test['Demanda_uni_equil']=np.apply_along_axis((lambda x:log_means_pred_demand_manual_func(x,\
            P_mean, C_mean, PA_mean, PR_mean, PCA_mean, all_mean)), 1, df_test.values)
        df_test.to_csv('output/'+'manual_group_log_mean_'+ \
                str(datetime.datetime.now().strftime('%Y-%m-%d-%H-%M'))+'.csv', \
                columns=['id','Demanda_uni_equil'], index=False)
    else:
        # global pred_demand, true_demand
        pred_demand = np.apply_along_axis((lambda x:log_means_pred_demand_manual_func(x, \
            P_mean, C_mean, PA_mean, PR_mean, PCA_mean, all_mean)), 1, df_test[labels].values)
        true_demand = df_test['Demanda_uni_equil'].values
        RMSLE = np.sqrt(MSE(np.log1p(pred_demand), np.log1p(true_demand)))
        print('RMSLE=', RMSLE)

    print('predicting time=', time.time()-start_time)
Example #2
    def run_cv(self, num_round, params):
        '''
        Using FoldTubeID split, loop over CV to get RMSLE for each split
        params is a list of parameters for XGBoost.

        After finishing CV, run score() to get the results
        '''
        self.pred = []
        self.real = []
        if len(params) == 0:
            raise ValueError('Please read in parameters')

        for tr, te in self.cv:
            self.train = self.trainset.loc[tr,:].copy()
            self.test = self.trainset.loc[te,:].copy()

            # Randomize and set seed
            # np.random.permutation(len(trainp1))
            np.random.seed(1)
            self.train = self.train.iloc[np.random.permutation(len(self.train))]
            np.random.seed(2)
            self.test = self.test.iloc[np.random.permutation(len(self.test))]
            y_real = np.array(self.test.iloc[:,-1])

            # Section for training multi-models if you like
            y_pred_xgb = xgboost_model(self.train, self.test, num_round, params)

            y_pred = y_pred_xgb
            self.pred += [y_pred]
            self.real += [y_real]
            self.rmsle_score += [np.sqrt(mean_squared_error(np.log1p(y_real),
                                 np.log1p(y_pred)))]
        print('===========================================================')
        print('Finished Cross-validation')
        print('===========================================================')
Example #3
def _gpinv(p, k, sigma):
    """Inverse Generalized Pareto distribution function"""
    x = np.full_like(p, np.nan)
    if sigma <= 0:
        return x
    ok = (p > 0) & (p < 1)
    if np.all(ok):
        if np.abs(k) < np.finfo(float).eps:
            x = - np.log1p(-p)
        else:
            x = np.expm1(-k * np.log1p(-p)) / k
        x *= sigma
    else:
        if np.abs(k) < np.finfo(float).eps:
            x[ok] = - np.log1p(-p[ok])
        else:
            x[ok] = np.expm1(-k * np.log1p(-p[ok])) / k
        x *= sigma
        x[p == 0] = 0
        if k >= 0:
            x[p == 1] = np.inf
        else:
            x[p == 1] = - sigma / k

    return x
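As a quick sanity check (assuming SciPy is installed), _gpinv should agree with SciPy's generalized Pareto quantile function, with k playing the role of the shape parameter c and sigma the scale; a minimal sketch:

import numpy as np
from scipy import stats

p = np.array([0.05, 0.5, 0.95])
k, sigma = 0.3, 2.0
# both lines should print the same quantiles for 0 < p < 1
print(_gpinv(p, k, sigma))
print(stats.genpareto.ppf(p, c=k, scale=sigma))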
Example #4
def transform_log(series, robust=True):
    """Perform element-wise logarithm transformation in a numerical series.
    
    Parameters
    ----------
    series : pandas.Series
        series to transform
    robust : bool
        True - handle negative and zero values properly
        False - transform negative value to nan, zero to -inf

    Returns
    -------
    log_series : pandas.Series
        ANOTHER series consisting of the transformed values
    """
    # TODO: support log10
    # TODO: separate log1p and log explicitly
    if not isinstance(series, pd.Series):
        raise TypeError("argument 'series' is NOT 'pandas.Series' type")
    if not is_numerical_type(series):
        raise ValueError("value type of argument 'series' is NOT numerical")
    
    if robust:
        return series.apply(lambda x: np.log1p(x) if x>=0 else -np.log1p(-x))
    else:
        return series.apply(np.log)
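A short usage sketch for transform_log, assuming pandas is imported as pd and the module's is_numerical_type helper is available; with robust=True negative values go through a signed log1p instead of producing NaN:

s = pd.Series([-10.0, -1.0, 0.0, 1.0, 10.0])
print(transform_log(s))                 # signed log1p, roughly [-2.40, -0.69, 0.00, 0.69, 2.40]
print(transform_log(s, robust=False))   # plain log: NaN for negatives, -inf for zero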
 def transform(self, X):
   if self.columns:
     for column in self.columns:
       X[column] = np.log1p(X[column])
     return X
   else:
     return np.log1p(X)
Example #6
    def __init__(self, past, future, features = None):
        """Create a training pattern.

        Parameters:
        past -- past feature vectors as a tensor of shape [P, V]
            where P is past days and V is the vectors/day
        future -- future feature vectors as a tensor of [F, V]
            where F is future days and V is the vectors/day
        features -- a sequence of feature names to use
            where None means use all features
        """

        # calculate training input from past features
        past_subfeatures = [[self._subfeatures(vector, features)
            for vector in vectors]
                for vectors in past]
        self._input = numpy.array(
            [list(util.flatten(vectors)) for vectors in past_subfeatures])

        # calculate training output from future volatility
        future_returns = numpy.log1p(
            [[vector.ret for vector in vectors] for vectors in future])
        self._output = numpy.std(future_returns, axis = 0, ddof = 1)\
            * numpy.sqrt(252)

        # calculate past returns for forecasts
        self._past_returns = numpy.log1p(
            [[vector.ret for vector in vectors] for vectors in past])
Example #7
def compute_weights(data, Nlive):
    """Returns log_ev, log_wts for the log-likelihood samples in data,
    assumed to be a result of nested sampling with Nlive live points."""

    start_data=concatenate(([float('-inf')], data[:-Nlive]))
    end_data=data[-Nlive:]

    log_wts=zeros(data.shape[0])

    log_vols_start=cumsum(ones(len(start_data)+1)*log1p(-1./Nlive))-log1p(-1./Nlive)
    log_vols_end=np.zeros(len(end_data))
    log_vols_end[-1]=np.NINF
    log_vols_end[0]=log_vols_start[-1]+np.log1p(-1.0/Nlive)
    for i in range(len(end_data)-1):
        log_vols_end[i+1]=log_vols_end[i]+np.log1p(-1.0/(Nlive-i))

    log_likes = concatenate((start_data,end_data,[end_data[-1]]))

    log_vols=concatenate((log_vols_start,log_vols_end))

    log_ev = log_integrate_log_trap(log_likes, log_vols)

    log_dXs = logsubexp(log_vols[:-1], log_vols[1:])
    log_wts = log_likes[1:-1] + log_dXs[:-1]

    log_wts -= log_ev

    return log_ev, log_wts
def exp1():
	train,y,test,idx = get_data_1()
	train = np.log1p(train.astype(float))
	test = np.log1p(test.astype(float))
	scaler = StandardScaler().fit(train)
	train = scaler.transform(train)
	test = scaler.transform(test)
	mtrain = pd.read_csv('meta_features_train.csv')
	mtest = pd.read_csv('meta_features_test.csv')
	scaler2 = StandardScaler().fit(mtrain)
	mtrain = scaler2.transform(mtrain)
	mtest = scaler2.transform(mtest)
	train = np.column_stack((train,mtrain))
	test = np.column_stack((test,mtest))
	rtrain_nn,rtest_nn = nn_features(train,y,test,model=build_nn2,random_state=1,n_folds=5,early_stop=50)
	rtrain_nn_total = rtrain_nn
	rtest_nn_total = rtest_nn
	for i in range(9):
		rand_seed = i*113+9201
		rtrain_nn,rtest_nn = nn_features(train,y,test,model=build_nn2,random_state=rand_seed,n_folds=5,early_stop=50)
		rtrain_nn_total += rtrain_nn
		rtest_nn_total += rtest_nn
		pd.DataFrame(data=rtrain_nn_total).to_csv('rtrain_nn_last.csv',index=False)
		pd.DataFrame(data=rtest_nn_total).to_csv('rtest_nn_last.csv',index=False)
	
	pd.DataFrame(data=rtrain_nn_total/10).to_csv('rtrain_nn_final.csv',index=False)
	pd.DataFrame(data=rtest_nn_total/10).to_csv('rtest_nn_final.csv',index=False)
def my_logaddexp(a, b):
    tmp = a - b
    return np.select([a == b, tmp > 0, tmp <= 0], [
        a + 0.69314718055994529,
        a + np.log1p(np.exp(-tmp)),
        b + np.log1p(np.exp(tmp))
    ], default=tmp)
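For arguments away from -inf this should match NumPy's built-in np.logaddexp, including the equal-argument branch where log(2) is added; a small check with arbitrary values:

import numpy as np

a = np.array([0.0, -1.5, 3.0])
b = np.array([0.0, 2.5, 3.0])
print(my_logaddexp(a, b))
print(np.logaddexp(a, b))   # same values, e.g. log(2) ~ 0.693 where a == b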
Example #10
    def keras_cv(self, params):
        """
        Using FoldTubeID split, loop over CV to get RMSLE for each split
        params is a list of parameters for Keras Neural Networks.

        After finishing CV, run score() to get the results
        """
        self.pred = []
        self.real = []
        if len(params) == 0:
            raise ValueError("Please read in parameters")

        for tr, te in self.cv:
            self.train = self.trainset.loc[tr, :].copy()
            self.test = self.trainset.loc[te, :].copy()

            # Randomize and set seed
            # np.random.permutation(len(trainp1))
            np.random.seed(1)
            self.train = self.train.iloc[np.random.permutation(len(self.train))]
            np.random.seed(2)
            self.test = self.test.iloc[np.random.permutation(len(self.test))]
            y_real = np.array(self.test.iloc[:, -1])

            # Section for training multi-models if you like
            y_pred = keras_model(self.train, self.test, params)

            self.pred += [y_pred]
            self.real += [y_real]
            self.rmsle_score += [np.sqrt(mean_squared_error(np.log1p(y_real), np.log1p(y_pred)))]
        print("===========================================================")
        print("Finished Keras Cross-validation")
        print("===========================================================")
    def getval(self, keys):
        array = tuple(keys)

        array = np.unique(array)
        array_length = len(array)
        rmsle_values_array = []
        count = 0
        maxValue = np.amax(array)
        minValue = np.amin(array)

        for i in array:
            count = 0
            for j in array:
                count = count + (np.log1p(i) - np.log1p(j))**2
            rmsle_values_array.append(np.sqrt(count / array_length))

        count = -1
        index = 0
        min_error_value = rmsle_values_array[0]
        for val in rmsle_values_array:
            count += 1
            if val < min_error_value:
                min_error_value = val
                index = count

        demand = array[index]
        return demand
def logsum_pair(logx, logy):
    """
    Return log(x+y), avoiding arithmetic underflow/overflow.

    logx: log(x)
    logy: log(y)

    Rationale:

    x + y    = e^logx + e^logy
             = e^logx (1 + e^(logy-logx))
    log(x+y) = logx + log(1 + e^(logy-logx)) (1)

    Likewise,
    log(x+y) = logy + log(1 + e^(logx-logy)) (2)

    The computation of the exponential overflows earlier and is less precise
    for big values than for small values. Due to the presence of logy-logx
    (resp. logx-logy), (1) is preferred when logx > logy and (2) is preferred
    otherwise.
    """
    if logx == logzero():
        return logy
    elif logx > logy:
        return logx + np.log1p(np.exp(logy - logx))
    else:
        return logy + np.log1p(np.exp(logx - logy))
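Away from the logzero() sentinel (assumed here to be -inf or a very large negative value), logsum_pair should agree with np.logaddexp; a minimal check, including a large-magnitude pair that would overflow a naive exp:

import numpy as np

for lx, ly in [(0.0, 0.0), (-2.0, 3.0), (100.0, 101.0)]:
    assert np.isclose(logsum_pair(lx, ly), np.logaddexp(lx, ly))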
Example #13
    def __init__(self, daily_returns, benchmark_daily_returns, risk_free_rate, days, period=DAILY):
        assert(len(daily_returns) == len(benchmark_daily_returns))

        self._portfolio = daily_returns
        self._benchmark = benchmark_daily_returns
        self._risk_free_rate = risk_free_rate
        self._annual_factor = _annual_factor(period)
        self._daily_risk_free_rate = self._risk_free_rate / self._annual_factor

        self._alpha = None
        self._beta = None
        self._sharpe = None
        self._return = np.expm1(np.log1p(self._portfolio).sum())
        self._annual_return = (1 + self._return) ** (365 / days) - 1
        self._benchmark_return = np.expm1(np.log1p(self._benchmark).sum())
        self._benchmark_annual_return = (1 + self._benchmark_return) ** (365 / days) - 1
        self._max_drawdown = None
        self._volatility = None
        self._annual_volatility = None
        self._benchmark_volatility = None
        self._benchmark_annual_volatility = None
        self._information_ratio = None
        self._sortino = None
        self._tracking_error = None
        self._annual_tracking_error = None
        self._downside_risk = None
        self._annual_downside_risk = None
        self._calmar = None
        self._avg_excess_return = None
Example #14
    def pdf(self, x: Array, log=False):

        n, d = x.shape

        theta = self.params
        ok = valid_rows_in_u(x)
        log_pdf = np.repeat(np.nan, n)
        if not ok.any():
            return log_pdf
        elif theta == 0:
            log_pdf[ok] = 0
            return log_pdf

        lu = np.log(x).sum(1)
        t = self.ipsi(x).sum(1)

        if theta < 0:  # dim == 2
            pos_t = t < 1
            log_pdf = np.log1p(theta) - (1 + theta) * lu - (d + 1 / theta) * np.log1p(-t)
            log_pdf[~ok] = np.nan
            log_pdf[ok & ~pos_t] = -np.inf
        else:
            p = np.log1p(theta * np.arange(1, d)).sum()
            log_pdf = p - (1 + theta) * lu - (d + 1 / theta) * np.log1p(t)

        return log_pdf if log else np.exp(log_pdf)
Example #15
    def fit ( self , X , y ):
        N = len( y )

        # num of happy tweets
        N_1  = np.sum( y )
        # num of sad tweets
        N_0  = N - N_1

        # ratio of happy/sad tweet
        Pi_0 = ( N_0 + 1 ) / ( N + 2 )
        Pi_1 = ( N_1 + 1 ) / ( N + 2 )

        #output is an array, N_jc[0] is the count
        #of how many 'obamas' when happy/sad
        N_j0 = (1-y)*X
        N_j1 = y*X

        Theta_j0 = ( ( N_j0 + 1 ) / ( N_0 + 2 ) )
        Theta_j1 = ( ( N_j1 + 1 ) / ( N_1 + 2 ) )

        logpi = [ np.log( Pi_0 ), np.log( Pi_1 ) ]
        self.logpi = np.array( logpi ) 
        self.logtheta = np.array([ np.log( Theta_j0 ), np.log( Theta_j1 ) ])
        self.log1theta = np.array( [ np.log1p( -1*Theta_j0 ), np.log1p( -1*Theta_j1 ) ] )

        save_params( self, 'params' )
Example #16
def log_likelihood_state(params,sender,time):
    #params = [theta,A,alpha,delta,epsilon,sigma]
    tol = 1e-24
    theta = float(params[0])
    alpha = float(params[2])
    if min(theta,alpha)<0:
        ll = -float('inf')
    else:
        (S,X,SX,m1,m2,N)= sufficient_statistics(sender,time)
        
        if theta < 0:
            theta = 0

        if 1 - theta + tol < 0:
            theta = 1   

        puu = alpha*np.log(theta+tol)
        pvv = alpha*np.log(1-theta+tol)
        puv = np.log1p(-np.exp(alpha*np.log(theta+tol)))
        pvu = np.log1p(-np.exp(alpha*np.log(1-theta+tol)))
        try:
            ll = (N[0]*puu+N[1]*puv+
                  N[2]*pvu+N[3]*pvv)
        except:
            print('ll error: theta = %s, alpha = %s' % (theta, alpha))
            ll=0
    return -ll #take negative for minimization
def _logistic(X, y, w):
    """Compute the logistic function of the data: sum(sigmoid(yXw))

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Design matrix.

    y : ndarray, shape (n_samples,)
        Target / response vector. Each entry must be +1 or -1.

    w : ndarray, shape (n_features,)
        Unmasked, ravelized input map.

    Returns
    -------
    energy : float
        Energy contribution due to logistic data-fit term.
    """

    z = np.dot(X, w[:-1]) + w[-1]
    yz = y * z
    idx = yz > 0
    out = np.empty_like(yz)
    out[idx] = np.log1p(np.exp(-yz[idx]))
    out[~idx] = -yz[~idx] + np.log1p(np.exp(yz[~idx]))
    out = out.sum()
    return out
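The two-branch formulation is the usual numerically stable way to evaluate sum(log(1 + exp(-y*z))): for positive y*z the first branch is safe, and for negative y*z it is rewritten as -y*z + log(1 + exp(y*z)) to avoid overflowing exp. A small check on made-up data (X, y, w are arbitrary here, so the naive form is also safe and the two should match):

import numpy as np

X = np.array([[0.5, -1.0], [2.0, 0.3], [-1.5, 0.8]])
y = np.array([1.0, -1.0, 1.0])
w = np.array([0.2, -0.4, 0.1])          # last entry is the intercept
z = X.dot(w[:-1]) + w[-1]
naive = np.log1p(np.exp(-y * z)).sum()  # fine here because |y*z| is small
print(_logistic(X, y, w), naive)        # should agree closely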
Example #18
def stitch(record1, record2):
    seq1 = array([record1.seq.tostring()])
    seq2 = array([reverse_complement(record2.seq.tostring())])
    seq1.dtype = '|S1'
    seq2.dtype = '|S1'
    quals1 = array(record1.letter_annotations['phred_quality'])
    quals2 = array(record2.letter_annotations['phred_quality'][::-1])
    
    log10p_consensus_1 = log1p(-power(10, -quals1 / 10.)) / log(10)
    log10p_consensus_2 = log1p(-power(10, -quals2 / 10.)) / log(10)
    log10p_error_1 = -log10(3) - (quals1 / 10.)
    log10p_error_2 = -log10(3) - (quals2 / 10.)
    
    min_overlap = 1
    max_overlap = max(len(record1), len(record2))
    overlaps = {}
    for overlap in range(1, max_overlap):
        s1 = seq1[-overlap:]
        s2 = seq2[:overlap]
        q1 = quals1[-overlap:]
        q2 = quals2[:overlap]
        lpc1 = log10p_consensus_1[-overlap:]
        lpc2 = log10p_consensus_2[:overlap]
        lpe1 = log10p_error_1[-overlap:]
        lpe2 = log10p_error_2[:overlap]
        
        consensus = choose(q1 < q2, [s1, s2])
        score = sum(choose(consensus == s1, [lpe1, lpc1])) + sum(choose(consensus == s2, [lpe2, lpc2])) + len(consensus) * log10(4) * 2    # last term is null hypothesis, p=1/4
        consensus.dtype = '|S%i' % len(consensus)
        overlaps[overlap] = (consensus[0],score)
    
    return overlaps
def ndcg_at_k(
    rating_true,
    rating_pred,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    col_prediction=DEFAULT_PREDICTION_COL,
    relevancy_method="top_k",
    k=DEFAULT_K,
    threshold=DEFAULT_THRESHOLD,
):
    """Normalized Discounted Cumulative Gain (nDCG).
    
    Info: https://en.wikipedia.org/wiki/Discounted_cumulative_gain
    
    Args:
        rating_true (pd.DataFrame): True DataFrame
        rating_pred (pd.DataFrame): Predicted DataFrame
        col_user (str): column name for user
        col_item (str): column name for item
        col_rating (str): column name for rating
        col_prediction (str): column name for prediction
        relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold']
        k (int): number of top k items per user
        threshold (float): threshold of top items per user (optional)

    Returns:
        float: nDCG at k (min=0, max=1).
    """

    df_hit, df_hit_count, n_users = merge_ranking_true_pred(
        rating_true=rating_true,
        rating_pred=rating_pred,
        col_user=col_user,
        col_item=col_item,
        col_rating=col_rating,
        col_prediction=col_prediction,
        relevancy_method=relevancy_method,
        k=k,
        threshold=threshold,
    )

    if df_hit.shape[0] == 0:
        return 0.0

    # calculate discounted gain for hit items
    df_dcg = df_hit.copy()
    # relevance in this case is always 1
    df_dcg["dcg"] = 1 / np.log1p(df_dcg["rank"])
    # sum up discount gained to get discount cumulative gain
    df_dcg = df_dcg.groupby(col_user, as_index=False).agg({"dcg": "sum"})
    # calculate ideal discounted cumulative gain
    df_ndcg = pd.merge(df_dcg, df_hit_count, on=[col_user])
    df_ndcg["idcg"] = df_ndcg["actual"].apply(
        lambda x: sum(1 / np.log1p(range(1, min(x, k) + 1)))
    )

    # DCG over IDCG is the normalized DCG
    return (df_ndcg["dcg"] / df_ndcg["idcg"]).sum() / n_users
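To make the log1p-based discounting concrete, here is a small hand computation for a hypothetical single user whose recommendations hit the relevant items at ranks 1 and 3, with two relevant items in total (mirroring the dcg and idcg columns built above):

import numpy as np

dcg = 1 / np.log1p(1) + 1 / np.log1p(3)            # hits at ranks 1 and 3
idcg = sum(1 / np.log1p(r) for r in range(1, 3))   # ideal ranking: hits at ranks 1 and 2
print(dcg / idcg)                                  # nDCG for this user, roughly 0.92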
Example #20
def Devroye(N, dvc, delta):
    tol = 1e-5
    prev_result = 0.5
    result = np.sqrt((4*prev_result*(1+prev_result + np.log1p(4/delta) + np.log1p(N)*2*dvc))/(2*N))
    while abs(result - prev_result) > tol:
        prev_result = result
        result = np.sqrt((4*prev_result*(1+prev_result + np.log1p(4/delta) + np.log1p(N)*2*dvc))/(2*N))
    return result
def preprocess(df):
    df = create_datetime_features(df)
    df['period'] = df.datetime.map(calculate_period)
    if 'count' in df.columns:
        df['log_count'] = np.log1p(df['count'])
        df['log_registered'] = np.log1p(df['registered'])
        df['log_casual'] = np.log1p(df['casual'])
    return df
def rmsle(pred, ans):
    """
    [list of ints], [list of ints] -> float
    Calculate the RMS Log Error between a set of predictions, and their correponding answers.
    """
    no_samps = float(len(pred))
    err = math.sqrt( 1.0/no_samps * np.sum((np.log1p(np.float64(pred)) - np.log1p(np.float64(ans)))**2.0))
    return err
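Assuming scikit-learn is available, the same quantity can be cross-checked against mean_squared_log_error (note the argument order differs: this rmsle takes predictions first):

import numpy as np
from sklearn.metrics import mean_squared_log_error

pred = [2.5, 5.0, 4.0, 8.0]
ans = [3.0, 5.0, 2.5, 7.0]
print(rmsle(pred, ans))
print(np.sqrt(mean_squared_log_error(ans, pred)))   # same value, ~0.199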
Example #23
def rmsle(y_true, y_pred):
    loss_sum = 0
    loss_count = 0
    for t, p in zip(y_true.values, y_pred):
        loss_sum += (np.log1p(t[0]) - np.log1p(p))**2
        loss_count += 1

    return np.sqrt(loss_sum/loss_count)
  def test_correct(self):
    self.assertAllClose(
        self.evaluate(tfp.vi.modified_gan(self._logu)),
        np.log1p(self._u) - self._logu)

    self.assertAllClose(
        self.evaluate(tfp.vi.modified_gan(self._logu, self_normalized=True)),
        np.log1p(self._u) - self._logu + 0.5 * (self._u - 1))
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    n = len(labels)
    preds = np.log1p(np.power(preds, 16.0))
    labels = np.log1p(np.power(labels, 16.0))
    delta_error = preds - labels
    error_metric = np.sqrt((np.linalg.norm(delta_error) ** 2) / n)
    return 'error', error_metric
  def test_correct(self):
    with self.test_session():
      self.assertAllClose(
          cd.modified_gan(self._logu).eval(),
          np.log1p(self._u) - self._logu)

      self.assertAllClose(
          cd.modified_gan(self._logu, self_normalized=True).eval(),
          np.log1p(self._u) - self._logu + 0.5 * (self._u - 1))
def var_transform_log(df,var_name):
    col = np.array(df[var_name])
    if col.min() >= 0:
        col_sqrt = np.log1p(col)
        df[var_name+"_Log"] = col_sqrt
    elif col.max() <= 0:
        col_sqrt = np.log1p(-col)
        df[var_name+"_NegLog"] = col_sqrt
    return df
Example #28
def test_run(fn, features, type):
    """ load dataset, build feature set, and do learning
        Parameters
        ----------
        fn: file name of dataset
        features: a list of list, each of which is a feature list for different models
        type: str for indicating feature set
        
        Returns
        -------
        predictions and feature-engineered dataset are saved to files
    """
    np.set_printoptions(precision=4)
    print('test_run ' + type)
    df = load_data(fn)
    check_df(df)
    df = feature_engineering(df)
    
    print(df.columns)
#    print(df.head())
#    print(df.groupby(['peak_hr'])['cnt'].agg(sum))
    y_pred_list = []
    for i, est in enumerate((
        DecisionTreeRegressor(min_samples_split=20),
        ExtraTreesRegressor(n_estimators=100, max_depth=None, min_samples_split=1, random_state=1234),
        RandomForestRegressor(n_estimators=1000, max_depth=15, random_state=1234, min_samples_split=3, n_jobs=-1),
        GradientBoostingRegressor(n_estimators=150, max_depth=10, random_state=0, min_samples_leaf=20, learning_rate=0.1, subsample=0.7, loss='ls'),
        svm.SVR(C=30)
        )):
#        print(features[i])
        df, X_train, X_test, y_train, y_test, y_train_cas, y_test_cas, y_train_reg, y_test_reg, time_test = split_data(df, features=features[i])
        y_pred, mse = predict_evaluate(est, X_train, y_train, X_test, y_test)
        est_name = str(est).split('(')[0]
        print(type, est_name, np.round(mse, 4))
        """ feature importance
        if est_name != 'SVR':
            # print out feature importance
            sfi = sorted([(x[0], float('%.4f'%x[1])) for x in zip(features[i], est.feature_importances_)], key=lambda x: x[1], reverse=True)
            print(sfi)
            print([x[0] for x in sfi])
        """
        y_pred_list.append([est_name, mse, y_pred])

    # blending models
    y_pred_blend = np.log1p(.2*(np.exp(y_pred_list[2][2])-1) + .8*(np.exp(y_pred_list[3][2])-1))
    print(type+' blending: 0.2*'+y_pred_list[2][0]+' + 0.8*'+y_pred_list[3][0], metrics.mean_squared_error(y_test, y_pred_blend).round(4))
    y_pred_blend = np.log1p(.3*(np.exp(y_pred_list[1][2])-1) + .7*(np.exp(y_pred_list[3][2])-1))
    print(type+' blending: 0.3*'+y_pred_list[1][0]+' + 0.7*'+y_pred_list[3][0], metrics.mean_squared_error(y_test, y_pred_blend).round(4))
    y_pred_blend = np.log1p(.3*(np.exp(y_pred_list[3][2])-1) + .7*(np.exp(y_pred_list[4][2])-1))
    print(type+ ' blending: 0.3*'+y_pred_list[3][0]+' + 0.7*'+y_pred_list[4][0], metrics.mean_squared_error(y_test, y_pred_blend).round(4))
    y_pred_blend = np.log1p(.6*(np.exp(y_pred_list[3][2])-1) + .4*(np.exp(y_pred_list[4][2])-1))
    print(type+ ' blending: 0.6*'+y_pred_list[3][0]+' + 0.4*'+y_pred_list[4][0], metrics.mean_squared_error(y_test, y_pred_blend).round(4))
    dff = pd.DataFrame({'datetime': time_test[:, 0], 'mnth': time_test[:, 1], 'hr': time_test[:, 2], 'cnt': np.expm1(y_test), 'prediction': y_pred_blend})
    dff.to_csv('../output/prediction_blended.csv', index = False, columns=['datetime', 'mnth', 'hr', 'cnt', 'prediction'])
    print('blended predictions saved in ../output/prediction_blended.csv')
    df.to_csv('../data/hour_ext.csv')
    print('extended dataset saved in ../data/hour_ext.csv')
Example #29
def inspect_zeros(trainer, filedir, inspect=None, FIGWIDTH=FIGWIDTH, FIGHEIGHT=FIGHEIGHT):
    '''Produce side-by-side log histograms.'''
    plt.close()
    complete = []
    D = trainer.now.copy()
    if not inspect:
        inspect = D.columns.tolist()
    save_this_directory = filedir + '/{}'.format(trainer.name)
    save_this_here = save_this_directory + '/zeros'
    try:
        os.mkdir(filedir)
    except:
        pass
    try:
        os.mkdir(save_this_directory)
    except:
        pass
    try:
        os.mkdir(save_this_here)
    except:
        pass
    for feature in inspect:
        print('Inspect {} for Zeros'.format(feature))
        plt.close()
        for x in inspect:
            if x != feature and (x, feature) not in complete:
                compare = (x, feature)
                complete += [compare]
                try:
                    fig, axs = plt.subplots(figsize=(FIGWIDTH, FIGHEIGHT))
                    np.log1p(D[D[feature] == 0][x]).hist(bins=30, label ='{} == 0'.format(feature), normed=True)
                    np.log1p(D[D[feature] > 0][x]).hist(bins=30, label ='{} > 0'.format(feature), normed=True).legend(loc='upper right')
                    t = "Log {} | {} = Zero.".format(x, feature)
                    plt.title(t)
                    axs.grid(False)
                    doc = '{}/{}.png'.format(save_this_here,'inspect_{}_when_{}_zero'.format(x, feature))
                    plt.tight_layout()
                    plt.savefig(doc) 
                except:
                    plt.close('all')
                    pass
                plt.close()
        
        fig, axs = plt.subplots(figsize=(FIGWIDTH, FIGHEIGHT)) 
        tag = '{}_pairplot_when_zero'.format(feature)
        t = "General Distribution | {} = Zero.".format(feature)
        doc = '{}/{}.png'.format(save_this_here, tag)  
        try:
            g = sns.pairplot(data=D[D[feature]==0][inspect].dropna(), 
                             hue = trainer.target, palette="Set1", ax=axs)
            plt.title(t)
            plt.tight_layout()
            fig.savefig(doc) 
        except:
            plt.close('all')
            pass
        plt.close('all')
def _calc_Jeff(inds, l_x, J):
    """
    Coupling between two indices
    """
    x, y = inds // l_x, inds % l_x  # integer row/column indices on the l_x-wide lattice
    dist = np.abs(x[1:] - x[:-1]) + np.abs(y[1:] - y[:-1])
    res = np.tanh(J) ** dist
    res = .5 * np.log1p(res) - .5 * np.log1p(-res)
    return res
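The final transform, 0.5*log1p(res) - 0.5*log1p(-res), is just the inverse hyperbolic tangent, so the effective coupling is arctanh(tanh(J)**dist); a minimal check with made-up indices on a 4-wide lattice:

import numpy as np

inds = np.array([0, 5, 6, 10])   # arbitrary flat indices
res = _calc_Jeff(inds, l_x=4, J=0.7)
x, y = inds // 4, inds % 4
dist = np.abs(np.diff(x)) + np.abs(np.diff(y))
print(res)
print(np.arctanh(np.tanh(0.7) ** dist))   # identical values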
Example #31
def highly_variable_genes(adata,
                          min_disp=None,
                          max_disp=None,
                          min_mean=None,
                          max_mean=None,
                          n_top_genes=None,
                          n_bins=20,
                          flavor='seurat',
                          binning_method='equal_width',
                          subset=False,
                          inplace=True):
    """Annotate highly variable genes [Satija15]_ [Zheng17]_.

    Expects logarithmized data.

    Depending on `flavor`, this reproduces the R-implementations of Seurat
    [Satija15]_ and Cell Ranger [Zheng17]_.

    The normalized dispersion is obtained by scaling with the mean and standard
    deviation of the dispersions for genes falling into a given bin for mean
    expression of genes. This means that for each bin of mean expression, highly
    variable genes are selected.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        The annotated data matrix of shape `n_obs` × `n_vars`. Rows correspond
        to cells and columns to genes.
    min_mean : `float`, optional (default: 0.0125)
        If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
        normalized dispersions are ignored.
    max_mean : `float`, optional (default: 3)
        If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
        normalized dispersions are ignored.
    min_disp : `float`, optional (default: 0.5)
        If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
        normalized dispersions are ignored.
    max_disp : `float`, optional (default: `None`)
        If `n_top_genes` unequals `None`, this and all other cutoffs for the means and the
        normalized dispersions are ignored.
    n_top_genes : `int` or `None`, optional (default: `None`)
        Number of highly-variable genes to keep.
    n_bins : `int`, optional (default: 20)
        Number of bins for binning the mean gene expression. Normalization is
        done with respect to each bin. If just a single gene falls into a bin,
        the normalized dispersion is artificially set to 1. You'll be informed
        about this if you set `settings.verbosity = 4`.
    flavor : `{'seurat', 'cell_ranger'}`, optional (default: 'seurat')
        Choose the flavor for computing normalized dispersion. In their default
        workflows, Seurat passes the cutoffs whereas Cell Ranger passes
        `n_top_genes`.
    binning_method : `{'equal_width', 'equal_frequency'}`, optional (default: 'equal_width')
        Choose the binning method for the means. In `equal_width`, each bin covers the same width.
        For `equal_frequency`, each bin has an equal number of genes. 
    subset : `bool`, optional (default: `False`)
        Inplace subset to highly-variable genes if `True` otherwise merely indicate
        highly variable genes.
    inplace : `bool`, optional (default: `True`)
        Whether to place calculated metrics in `.var` or return them.

    Returns
    -------
    :class:`~numpy.recarray`, `None`
        Depending on `inplace` returns calculated metrics (:class:`~numpy.recarray`) or
        updates `.var` with the following fields

        * `highly_variable` - boolean indicator of highly-variable genes
        * `means` - means per gene
        * `dispersions` - dispersions per gene
        * `dispersions_norm` - normalized dispersions per gene

    Notes
    -----
    This function replaces :func:`~scanpy.pp.filter_genes_dispersion`.
    """
    logg.msg('extracting highly variable genes', r=True, v=4)

    if not isinstance(adata, AnnData):
        raise ValueError(
            '`pp.highly_variable_genes` expects an `AnnData` argument, '
            'pass `inplace=False` if you want to return a `np.recarray`.')

    if n_top_genes is not None and not all([
            min_disp is None, max_disp is None, min_mean is None,
            max_mean is None
    ]):
        logg.info('If you pass `n_top_genes`, all cutoffs are ignored.')
    if min_disp is None: min_disp = 0.5
    if min_mean is None: min_mean = 0.0125
    if max_mean is None: max_mean = 3

    X = np.expm1(adata.X) if flavor == 'seurat' else adata.X

    mean, var = materialize_as_ndarray(_get_mean_var(X))
    # now actually compute the dispersion
    mean[mean == 0] = 1e-12  # set entries equal to zero to small value
    dispersion = var / mean
    if flavor == 'seurat':  # logarithmized mean as in Seurat
        dispersion[dispersion == 0] = np.nan
        dispersion = np.log(dispersion)
        mean = np.log1p(mean)
    # all of the following quantities are "per-gene" here
    df = pd.DataFrame()
    df['mean'] = mean
    df['dispersion'] = dispersion
    if flavor == 'seurat':
        if binning_method == 'equal_width':
            df['mean_bin'] = pd.cut(df['mean'], bins=n_bins)
        elif binning_method == 'equal_frequency':
            df['mean_bin'] = pd.qcut(df['mean'], q=n_bins, duplicates='drop')
        else:
            raise ValueError(
                '`binning_method` needs to be "equal_width" or "equal_frequency"'
            )
        disp_grouped = df.groupby('mean_bin')['dispersion']
        disp_mean_bin = disp_grouped.mean()
        disp_std_bin = disp_grouped.std(ddof=1)
        # retrieve those genes that have nan std, these are the ones where
        # only a single gene fell in the bin and implicitly set them to have
        # a normalized disperion of 1
        one_gene_per_bin = disp_std_bin.isnull()
        gen_indices = np.where(
            one_gene_per_bin[df['mean_bin'].values])[0].tolist()
        if len(gen_indices) > 0:
            logg.msg(
                'Gene indices {} fell into a single bin: their '
                'normalized dispersion was set to 1.\n    '
                'Decreasing `n_bins` will likely avoid this effect.'.format(
                    gen_indices),
                v=4)
        # Circumvent pandas 0.23 bug. Both sides of the assignment have dtype==float32,
        # but there’s still a dtype error without “.value”.
        disp_std_bin[one_gene_per_bin.values] = disp_mean_bin[
            one_gene_per_bin.values].values
        disp_mean_bin[one_gene_per_bin.values] = 0
        # actually do the normalization
        df['dispersion_norm'] = ((
            df['dispersion'].values  # use values here as index differs
            - disp_mean_bin[df['mean_bin'].values].values) /
                                 disp_std_bin[df['mean_bin'].values].values)
    elif flavor == 'cell_ranger':
        from statsmodels import robust
        df['mean_bin'] = pd.cut(
            df['mean'],
            np.r_[-np.inf,
                  np.percentile(df['mean'], np.linspace(10, 100, n_bins - 1)),
                  np.inf])
        disp_grouped = df.groupby('mean_bin')['dispersion']
        disp_median_bin = disp_grouped.median()
        # the next line raises the warning: "Mean of empty slice"
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            disp_mad_bin = disp_grouped.apply(robust.mad)
        df['dispersion_norm'] = (
            np.abs(df['dispersion'].values -
                   disp_median_bin[df['mean_bin'].values].values) /
            disp_mad_bin[df['mean_bin'].values].values)
    else:
        raise ValueError('`flavor` needs to be "seurat" or "cell_ranger"')
    dispersion_norm = df['dispersion_norm'].values.astype('float32')
    if n_top_genes is not None:
        dispersion_norm = dispersion_norm[~np.isnan(dispersion_norm)]
        dispersion_norm[::-1].sort(
        )  # interestingly, np.argpartition is slightly slower
        disp_cut_off = dispersion_norm[n_top_genes - 1]
        gene_subset = np.nan_to_num(
            df['dispersion_norm'].values) >= disp_cut_off
        logg.msg(
            'the {} top genes correspond to a normalized dispersion cutoff of {}'.
            format(n_top_genes, disp_cut_off),
            v=5,
        )
    else:
        max_disp = np.inf if max_disp is None else max_disp
        dispersion_norm[np.isnan(dispersion_norm)] = 0  # similar to Seurat
        gene_subset = np.logical_and.reduce((
            mean > min_mean,
            mean < max_mean,
            dispersion_norm > min_disp,
            dispersion_norm < max_disp,
        ))

    logg.msg('    finished', time=True, v=4)

    if inplace or subset:
        logg.hint('added\n'
                  '    \'highly_variable\', boolean vector (adata.var)\n'
                  '    \'means\', float vector (adata.var)\n'
                  '    \'dispersions\', float vector (adata.var)\n'
                  '    \'dispersions_norm\', float vector (adata.var)')
        adata.var['highly_variable'] = gene_subset
        adata.var['means'] = df['mean'].values
        adata.var['dispersions'] = df['dispersion'].values
        adata.var['dispersions_norm'] = df['dispersion_norm'].values.astype(
            'float32', copy=False)
        if subset:
            adata._inplace_subset_var(gene_subset)
    else:
        arrays = (gene_subset, df['mean'].values, df['dispersion'].values,
                  df['dispersion_norm'].values.astype('float32', copy=False))
        dtypes = [
            ('highly_variable', np.bool_),
            ('means', 'float32'),
            ('dispersions', 'float32'),
            ('dispersions_norm', 'float32'),
        ]
        return np.rec.fromarrays(arrays, dtype=dtypes)
Example #32
 def _cdf(self, x, p):
     k = floor(x)
     return -expm1(log1p(-p) * k)
Example #33
 def _ppf(self, q, lambda_):
     vals = ceil(-1.0 / lambda_ * log1p(-q) - 1)
     vals1 = (vals - 1).clip(self.a, np.inf)
     temp = self._cdf(vals1, lambda_)
     return np.where(temp >= q, vals1, vals)
Example #34
def std_income(df: pd.DataFrame) -> None:
    """Change "$84,835.00 " to float; then get log1p"""
    df[cst.H_INCOME] = np.log1p(
        df[cst.H_INCOME].map(lambda s: float(s[1:-1].replace(',', ''))
                             if isinstance(s, str) else np.nan))
    df[cst.H_INCOME].fillna(df[cst.H_INCOME].mean(), inplace=True)
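The lambda drops the leading dollar sign and the trailing character before parsing; a tiny illustration of the mapping applied to the docstring's sample value (cst.H_INCOME itself is an external column-name constant):

import numpy as np

raw = "$84,835.00 "
parsed = float(raw[1:-1].replace(',', ''))
print(parsed, np.log1p(parsed))   # 84835.0 and its log1p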
Example #35
def update_map(region, color_var, size_var, map_layout_data):
    print(region, color_var, size_var)

    if region:
        trd_selection = trd[trd.CONJ.isin(region)]

        color_norm = trd_selection[color_var].clip(
            0, trd_selection[color_var].quantile(0.9))

        if size_var == 'FIX_SIZE':
            size_norm = 10
            trd_selection['FIX_SIZE'] = trd_selection[color_var]

        else:
            trd_max = trd_selection[size_var].quantile(0.95)
            size_norm = np.log1p(trd_selection[size_var] / trd_max) * 30
            size_norm.clip(6, 25, inplace=True)

        info = trd_selection.FIC.map('<b>Frec Corte:</b> {:,.2f}'.format) + \
               trd_selection.DIC.map('<br><b>Dur Corte:</b> {:,.2f}'.format) + \
               trd_selection.ENE_12.map('<br><b>Consumo:</b> {:,.2f}'.format)

        map_data = [
            go.Scattermapbox(
                lat=trd_selection.lat,
                lon=trd_selection.lon,
                text=info,
                hoverinfo='text',
                mode='markers',
                marker=dict(size=size_norm,
                            color=color_norm,
                            colorscale='RdBu',
                            showscale=True,
                            opacity=0.7),
            )
        ]

        side_graph = dcc.Graph(
            figure=go.Figure(data=[
                go.Line(x=trd_selection[color_var],
                        y=trd_selection[size_var],
                        mode='markers',
                        name='Correlacion')
            ],
                             layout=go.Layout(
                                 title='Correlacion Entre Variables',
                                 margin=dict(l=20, t=50, b=20, r=20),
                             )))

        if map_layout_data:
            print(map_layout_data)
            print(map_layout_data.keys())

            if 'mapbox.center' in map_layout_data.keys():
                # Lock Camera Position
                cam_lat = float(map_layout_data['mapbox.center']['lat'])
                cam_lon = float(map_layout_data['mapbox.center']['lon'])
                cam_zoom = float(map_layout_data['mapbox.zoom'])

                map_layout.mapbox.center.lat = cam_lat
                map_layout.mapbox.center.lon = cam_lon
                map_layout.mapbox.zoom = cam_zoom

    else:
        map_data = [go.Scattermapbox(lat=[], lon=[], mode='markers')]

        side_graph = []

    return dict(data=map_data, layout=map_layout), side_graph
Example #36
params = {
    "objective": "reg:linear",
    "booster": "gbtree",
    "eta": 0.3,
    "max_depth": 10,
    "subsample": 0.9,
    "colsample_bytree": 0.7,
    "silent": 1,
    "seed": 1301
}
num_boost_round = 300

print("Train a XGBoost model")
X_train, X_valid = train_test_split(train, test_size=0.5, random_state=10)
y_train = np.log1p(X_train.unit_sales)
y_valid = np.log1p(X_valid.unit_sales)
dtrain = xgb.DMatrix(X_train[features], y_train)
dvalid = xgb.DMatrix(X_valid[features], y_valid)

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
model_xgb = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \
  early_stopping_rounds=20, verbose_eval=True)

create_feature_map(features)
importance = model_xgb.get_fscore(fmap='xgb.fmap')
print(importance)

#-------------------------------------------------------------------------------------
#Load test
#test = valid
Example #37
# 1. Find the best-fitting normal distribution curve
sns.distplot(train['SalePrice'], fit=norm)
plt.title('SalePrice before normalized')
(mu, sigma) = norm.fit(train['SalePrice'])
print('SalePrice distribution fit before normalization:')
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
plt.legend(
    ['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
    loc='best')
plt.show()
# 2. Use a Q-Q plot to check normality: the more closely the blue points follow the red line, the closer the data is to a normal distribution
fig, ax = plt.subplots(1, 1, figsize=(12, 8))
stats.probplot(train['SalePrice'], plot=ax)
plt.show()
# Log-transform SalePrice to bring it closer to a normal distribution
train['SalePrice'] = np.log1p(train['SalePrice'])

# 3. SalePrice distribution after the transformation
sns.distplot(train['SalePrice'], fit=norm)
plt.title('SalePrice after normalized')

(mu, sigma) = norm.fit(train['SalePrice'])
print('SalePrice distribution fit after normalization:')
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
plt.legend(
    ['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
    loc='best')
plt.show()

# First merge the data and list the top 20 columns with the highest missing ratio
n_train = train.shape[0]
Example #38
# print missing_data.head(20)

df_train = train_df.drop((missing_data[missing_data['Total'] > 250]).index, 1)
# print df_train.isnull().sum().max()

#deleting points
df_train.sort_values(by='GrLivArea', ascending=False)[:2]
df_train = df_train.drop(df_train[df_train['Id'] == 1299].index)
df_train = df_train.drop(df_train[df_train['Id'] == 524].index)

# pd.concat stacks the frames row-wise: rows are added, the number of columns stays the same
all_data = pd.concat((df_train.loc[:, 'MSSubClass':'SaleCondition'],
                      test_df.loc[:, 'MSSubClass':'SaleCondition']))

#log transform the target:
df_train["SalePrice"] = np.log1p(df_train["SalePrice"])

size_mapping = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}

size_mapping2 = {'Ex': 6, 'Gd': 5, 'TA': 4, 'Fa': 3, 'Po': 2, 'NA': 1}

all_data['ExterQual'] = all_data['ExterQual'].map(size_mapping)
all_data['ExterCond'] = all_data['ExterCond'].map(size_mapping)
all_data['BsmtQual'] = all_data['BsmtQual'].map(size_mapping2)
all_data['BsmtCond'] = all_data['BsmtCond'].map(size_mapping2)
all_data['HeatingQC'] = all_data['HeatingQC'].map(size_mapping)
all_data['KitchenQual'] = all_data['KitchenQual'].map(size_mapping)
all_data['GarageQual'] = all_data['GarageQual'].map(size_mapping2)
all_data['GarageCond'] = all_data['GarageCond'].map(size_mapping2)

all_data = pd.get_dummies(all_data)
Example #39
 def param_var(self, alpha):
     import scipy.stats as stats
     log_return = np.log1p(self._portfolio)
     mean = np.mean(log_return)
     std = np.std(log_return)
     return np.expm1(-stats.norm(mean, std).ppf(alpha))
Example #40
def log_alpha(values):
    min = values.min()
    alpha = np.log1p(values - min)
    return alpha / alpha.max() * 0.9 + 0.1
Example #41
def root_mean_squared_logarithmic_error(true, pred):
     return np.sqrt( mean_squared_error( np.log1p(true), np.log1p(pred) ) )
Example #42
            y_train = target.iloc[mask[0]].astype('float32').values.reshape(-1, 1)

            X_test = train.iloc[mask[1]].drop(['tube_assembly_id'], axis=1)
            y_test = target.iloc[mask[1]].astype('float32').values.reshape(-1, 1)

            X_train_ = preprocess.fit_transform( X_train ).astype('float32').values
            X_test_ = preprocess.transform( X_test ).astype('float32').values
            X_val_ = preprocess.transform( X_val ).astype('float32').values
            test_ = preprocess.transform( test ).astype('float32').values
            X_train_, y_train_ = sklearn.utils.shuffle(X_train_, y_train, random_state=random_state)

            directory = os.path.join(cwd, 'stage2', name, '%i'%(iparam), fold_outer_dir, fold_inner_dir)
            reg = sklearn.clone(nnet_cater3)
#            reg.fit(X_train_, y_train_, X_test_, y_test)
#            reg.fit(X_train_, np.log1p(y_train_), X_test_, np.log1p(y_test))
            reg.fit(X_train_, np.log1p(y_train_))

            y_pred_test = np.expm1( reg.predict( X_test_ ) )
            print('RMSLE (%5s)= %.5f'%( epoch_save_range[-1], root_mean_squared_logarithmic_error( y_test, y_pred_test ) ))

        print(''.join(['-']*90))
        print('refit on train_val set')
        X_train_val_ = preprocess.fit_transform( X_train_val )
        X_val_ = preprocess.transform( X_val )
        X_train_val_, y_train_val_ = sklearn.utils.shuffle(X_train_val_, y_train_val, random_state=random_state)

        directory = os.path.join(cwd, 'stage2', name, '%i', fold_outer_dir, refit_train_val_dir)
#        os.makedirs(directory)
        nn_params = {}
        nn_params.update(core_params)
        nn_params.update(params)
 def plot_series(self,
                 omic1=OMIC.transcriptomic,
                 omic2=OMIC.proteomic,
                 var_names1='auto',
                 var_names2='auto',
                 log1=True,
                 log2=True,
                 fontsize=10,
                 title='',
                 return_figure=False):
   r""" Plot lines of 2 OMICs sorted in ascending order of `omic1` """
   import seaborn as sns
   ## prepare
   omic1 = OMIC.parse(omic1)
   omic2 = OMIC.parse(omic2)
   omic1_ids = self.get_var_indices(omic1)
   omic2_ids = self.get_var_indices(omic2)
   if isinstance(var_names1, string_types) and var_names1 == 'auto':
     var_names1 = omic1.markers
   if isinstance(var_names2, string_types) and var_names2 == 'auto':
     var_names2 = omic2.markers
   ## filtering variables
   ids1 = []
   ids2 = []
   for v1, v2 in zip(var_names1, var_names2):
     i1 = omic1_ids.get(v1, None)
     i2 = omic2_ids.get(v2, None)
     if i1 is not None and i2 is not None:
       ids1.append(i1)
       ids2.append(i2)
   assert len(ids1) > 0, \
     (f"No variables found for omic1={omic1} var1={var_names1} "
      f"and omic2={omic2} var2={var_names2}")
   x1 = self.get_omic(omic1)[:, ids1]
   x2 = self.get_omic(omic2)[:, ids2]
   if log1:
     x1 = np.log1p(x1)
   if log2:
     x2 = np.log1p(x2)
   names1 = self.get_var_names(omic1)[ids1]
   names2 = self.get_var_names(omic2)[ids2]
   n_series = len(names1)
   ### prepare the plot
   colors = sns.color_palette(n_colors=2)
   fig = plt.figure(figsize=(12, n_series * 4))
   for idx in range(n_series):
     y1 = x1[:, idx]
     y2 = x2[:, idx]
     order = np.argsort(y1)
     ax = plt.subplot(n_series, 1, idx + 1)
      ## the first series
     ax.plot(y1[order],
             linewidth=1.8,
             color=colors[0],
             label=f"{omic1.name}-{names1[idx]}")
     ax.set_ylabel(f"{'log' if log1 else 'raw'}-{omic1.name}-{names1[idx]}",
                   color=colors[0])
     ax.set_xlabel(f"Cell in ascending order of {omic1.name}")
     ax.tick_params(axis='y', colors=colors[0], labelcolor=colors[0])
     ax.grid(False)
     ## the second series
     ax = ax.twinx()
     ax.plot(y2[order],
             linestyle='--',
             alpha=0.88,
             linewidth=1.2,
             color=colors[1])
      ax.set_ylabel(f"{'log' if log2 else 'raw'}-{omic2.name}-{names2[idx]}",
                   color=colors[1])
     ax.tick_params(axis='y', colors=colors[1], labelcolor=colors[1])
     ax.grid(False)
   ### finalize the figure style
   if len(title) > 0:
     plt.suptitle(title, fontsize=fontsize + 2)
   with catch_warnings_ignore(UserWarning):
     plt.tight_layout(rect=[0., 0.02, 1., 0.98])
   if return_figure:
     return fig
   return self.add_figure(f'series_{omic1.name}_{omic2.name}', fig)
Example #44
 def excess_return_rate(self):
     if self._excess_return_rate is None:
         self._excess_return_rate = np.expm1(
             np.log1p(self._excess_portfolio).sum())
     return self._excess_return_rate
  def plot_correlation_scatter(self,
                               omic1=OMIC.transcriptomic,
                               omic2=OMIC.proteomic,
                               var_names1='auto',
                               var_names2='auto',
                               is_marker_pairs=True,
                               log1=True,
                               log2=True,
                               max_scatter_points=200,
                               top=3,
                               bottom=3,
                               title='',
                               return_figure=False):
    r""" Mapping from omic1 to omic2

    Arguments:
      omic1, omic2 : instance of OMIC.
        With `omic1` represent the x-axis, and `omic2` represent the y-axis.
      var_names1 : list of all variable name for `omic1`
    """
    omic1 = OMIC.parse(omic1)
    omic2 = OMIC.parse(omic2)
    if isinstance(var_names1, string_types) and var_names1 == 'auto':
      var_names1 = omic1.markers
    if isinstance(var_names2, string_types) and var_names2 == 'auto':
      var_names2 = omic2.markers
    if var_names1 is None or var_names2 is None:
      is_marker_pairs = False
    max_scatter_points = int(max_scatter_points)
    # get all correlations
    corr = self.get_correlation(omic1, omic2)
    corr_map = {(x[0], x[1]):
                (0 if np.isnan(x[2]) else x[2], 0 if np.isnan(x[3]) else x[3])
                for x in corr}
    om1_names = self.get_var_names(omic1)
    om2_names = self.get_var_names(omic2)
    om1_idx = {j: i for i, j in enumerate(om1_names)}
    om2_idx = {j: i for i, j in enumerate(om2_names)}
    # extract the data and normalization
    X1 = self.numpy(omic1)
    library = np.sum(X1, axis=1, keepdims=True)
    library = discretizing(library, n_bins=10, strategy='quantile').ravel()
    if log1:
      s = np.sum(X1, axis=1, keepdims=True)
      X1 = np.log1p(X1 / s * np.median(s))
    X2 = self.numpy(omic2)
    if log2:
      s = np.sum(X2, axis=1, keepdims=True)
      X2 = np.log1p(X2 / s * np.median(s))
    ### getting the marker pairs
    all_pairs = []
    # coordinate marker pairs
    if is_marker_pairs:
      pairs = [(i1, i2)
               for i1, i2 in zip(var_names1, var_names2)
               if i1 in om1_idx and i2 in om2_idx]
      var_names1 = [i for i, _ in pairs]
      var_names2 = [i for _, i in pairs]
    # filter omic2
    if var_names2 is not None:
      var_names2 = [i for i in var_names2 if i in om2_names]
    else:
      var_names2 = om2_names
    assert len(var_names2) > 0, \
      (f"None of the variables {var_names2} is contained in variable list "
       f"of OMIC {omic2.name}")
    nrow = len(var_names2)
    # filter omic1
    if var_names1 is not None:
      var_names1 = [i for i in var_names1 if i in om1_names]
      ncol = len(var_names1)
      assert len(var_names1) > 0, \
        (f"None of the variables {var_names1} is contained in variable list "
         f"of OMIC {omic1.name}")
      for name2 in var_names2:
        for name1 in var_names1:
          all_pairs.append((om1_idx[name1], om2_idx[name2]))
    else:
      # top and bottom correlation pairs
      top = int(top)
      bottom = int(bottom)
      ncol = top + bottom
      # pick all top and bottom of omic1 coordinated to omic2
      for name in var_names2:
        i2 = om2_idx[name]
        pairs = sorted(
            [[sum(corr_map[(i1, i2)]), i1] for i1 in range(len(om1_names))])
        for _, i1 in pairs[-top:][::-1] + pairs[:bottom][::-1]:
          all_pairs.append((i1, i2))
    ### downsampling scatter points
    if max_scatter_points > 0:
      ids = np.random.permutation(len(X1))[:max_scatter_points]
    else:
      ids = np.arange(len(X1), dtype=np.int32)
    ### plotting
    fig = plt.figure(figsize=(ncol * 2, nrow * 2 + 2), dpi=80)
    for i, pair in enumerate(all_pairs):
      ax = plt.subplot(nrow, ncol, i + 1)
      p, s = corr_map[pair]
      idx1, idx2 = pair
      x1 = X1[:, idx1]
      x2 = X2[:, idx2]
      crow = i // ncol
      ccol = i % ncol
      if is_marker_pairs:
        color = 'salmon' if crow == ccol else 'blue'
      else:
        color = 'salmon' if ccol < top else 'blue'
      vs.plot_scatter(x=x1[ids],
                      y=x2[ids],
                      color=color,
                      ax=ax,
                      size=library[ids],
                      size_range=(6, 30),
                      legend_enable=False,
                      linewidths=0.,
                      cbar=False,
                      alpha=0.3)
      # additional title for first column
      ax.set_title(f"{om1_names[idx1]}\n$p={p:.2g}$ $s={s:.2g}$", fontsize=8)
      # beginning of every column
      if i % ncol == 0:
        ax.set_ylabel(f"{om2_names[idx2]}", fontsize=8, weight='bold')
    ## big title
    plt.suptitle(f"[x:{omic1.name}_y:{omic2.name}]{title}", fontsize=10)
    fig.tight_layout(rect=[0.0, 0.02, 1.0, 0.98])
    ### store and return
    if return_figure:
      return fig
    self.add_figure(
        f"corr_{omic1.name}{'log' if log1 else 'raw'}_"
        f"{omic2.name}{'log' if log2 else 'raw'}", fig)
    return self
#for optimal_model in optimal_models:
#    print(optimal_model.best_params_)
"""
Model Selection, Ensembling and Local Validation Result

We set parameters here again so that we do not have to rerun the above cell.
Hyper parameter tuning is time costly.
"""

n = round(len(train) * 0.012)
X_train = train[n:]
X_valid = train[:n]
y_train = labels[n:]
y_valid = labels[:n]
y_train = np.log1p(np.array(y_train, dtype=np.int32))
y_valid = np.log1p(np.array(y_valid, dtype=np.int32))

## XGBoost Results using optimal parameters selected above
params = {
    "objective": "reg:linear",
    "booster": "gbtree",
    "eta": 0.3,
    "max_depth": 8,
    "subsample": 0.8,
    "colsample_bytree": 0.7,
    "silent": 1,
    "seed": 3244,
    "n_estimators": 1000
}
Example #47
# Isaac Li
# 1.25.2018

import time
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import function

train, test = function.read_file()
train["血糖"] = np.log1p(train["血糖"])
train, test = function.add_column(train, test, sqrt=True)
train, test = function.transform(train, test)

print('\n\nStart...')
t0, mses = time.time(), []
train_preds, test_preds = np.zeros(train.shape[0]), np.zeros(
    (test.shape[0], 5))
predictors = [f for f in test.columns if f not in ['血糖']]
kf = KFold(n_splits=5, shuffle=True, random_state=520)

for i, (train_index, test_index) in enumerate(kf.split(train)):
    print('   .{}/5.'.format(i + 1))
    train_feat1, train_feat2 = train.iloc[train_index], train.iloc[test_index]
    gbm = function.settings.model_xgb.fit(train_feat1[predictors],
                                          train_feat1['血糖'])

    predict = gbm.predict(train_feat2[predictors])
    base, power, minimum = 1.7, 1, 7

    predict = np.expm1(predict)
Example #48
def mean_squared_log_error(y_true,
                           y_pred,
                           *,
                           sample_weight=None,
                           multioutput="uniform_average",
                           squared=True):
    """Mean squared logarithmic error regression loss.

    Read more in the :ref:`User Guide <mean_squared_log_error>`.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Ground truth (correct) target values.

    y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Estimated target values.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    multioutput : {'raw_values', 'uniform_average'} or array-like of shape \
            (n_outputs,), default='uniform_average'

        Defines aggregating of multiple output values.
        Array-like value defines weights used to average errors.

        'raw_values' :
            Returns a full set of errors when the input is of multioutput
            format.

        'uniform_average' :
            Errors of all outputs are averaged with uniform weight.
    squared : bool, default=True
        If True returns MSLE (mean squared log error) value.
        If False returns RMSLE (root mean squared log error) value.

    Returns
    -------
    loss : float or ndarray of floats
        A non-negative floating point value (the best value is 0.0), or an
        array of floating point values, one for each individual target.

    Examples
    --------
    >>> from sklearn.metrics import mean_squared_log_error
    >>> y_true = [3, 5, 2.5, 7]
    >>> y_pred = [2.5, 5, 4, 8]
    >>> mean_squared_log_error(y_true, y_pred)
    0.039...
    >>> mean_squared_log_error(y_true, y_pred, squared=False)
    0.199...
    >>> y_true = [[0.5, 1], [1, 2], [7, 6]]
    >>> y_pred = [[0.5, 2], [1, 2.5], [8, 8]]
    >>> mean_squared_log_error(y_true, y_pred)
    0.044...
    >>> mean_squared_log_error(y_true, y_pred, multioutput='raw_values')
    array([0.00462428, 0.08377444])
    >>> mean_squared_log_error(y_true, y_pred, multioutput=[0.3, 0.7])
    0.060...
    """
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
        y_true, y_pred, multioutput)
    check_consistent_length(y_true, y_pred, sample_weight)

    if (y_true < 0).any() or (y_pred < 0).any():
        raise ValueError("Mean Squared Logarithmic Error cannot be used when "
                         "targets contain negative values.")

    return mean_squared_error(
        np.log1p(y_true),
        np.log1p(y_pred),
        sample_weight=sample_weight,
        multioutput=multioutput,
        squared=squared,
    )
Example #49
def load_adnimerge(remove_outliers=False, outcome="ADAS13"):
    adni = pd.read_csv(_ADNIMERGE_PATH, low_memory=False)
    baseline = adni.sort_values(
        by=["PTID", "M"]).groupby("PTID").first().set_index("RID")
    assert baseline.index.is_unique

    baseline.loc[:, "PTGENDER"] = baseline.loc[:, "PTGENDER"]  # .replace({"Female": 0, "Male": 1})

    baseline.loc[:, "EDU-ATTAIN"] = pd.cut(
        baseline.PTEDUCAT,
        bins=[0, 12, 16, np.infty],
        labels=["less_or_equal_12", "12-16", "more_than_16"],
        right=True,
    )
    LOG.info("\n%s\n", baseline.loc[:, "EDU-ATTAIN"].value_counts())

    log_cols = ["PTAU", "TAU"]
    for col, series in baseline.loc[:, log_cols].iteritems():
        baseline.loc[:, col] = np.log1p(series.values)

    features_sum = {}
    for col in Volumes.VOLUMES_LR:
        features_sum[col] = baseline.loc[:, [f"Left-{col}", f"Right-{col}"]].sum(axis=1)
    # Sum all CC volumes
    features_sum["CC"] = baseline.loc[:, Volumes.VOLUMES_CC].sum(axis=1)
    features_sum["Ventricle"] = baseline.loc[:, Volumes.VOLUMES_VENTRICLE].sum(
        axis=1)

    for col in Volumes.THICKNESS:
        features_sum[col] = baseline.loc[:, [f"lh_{col}", f"rh_{col}"]].mean(
            axis=1)
    features_sum = pd.DataFrame.from_dict(features_sum)

    mri_features = pd.concat(
        (baseline.loc[:, Volumes.VOLUMES_SINGLE], features_sum),
        axis=1).dropna(axis=0)
    sd = mri_features.std(ddof=1)
    assert (sd > 1e-6).all(), "features with low variance:\n{}".format(
        sd[sd <= 1e-6])

    eTIV = mri_features.loc[:, "eTIV"]
    mri_features.drop("eTIV", axis=1, inplace=True)
    mri_features.loc[:, Volumes.VOLUMES_LR] = (
        mri_features.loc[:, Volumes.VOLUMES_LR].div(eTIV, axis=0))

    if remove_outliers:
        mri_features = drop_outliers(mri_features)

    is_atn = baseline.loc[:, "ATN_status"].isin(
        ["A+/T+/N+", "A+/T+/N-", "A+/T-/N-"])
    has_outcome = baseline.loc[:, outcome].notnull()
    positive_outcome = baseline.loc[:, outcome] > 0
    LOG.info("Dropping %d with missing or zero %s\n",
             baseline.shape[0] - positive_outcome.sum(), outcome)

    data = baseline.loc[is_atn & has_outcome & positive_outcome, :]

    y = data.loc[:, outcome].round(0).astype(int)
    LOG.info("\n%s\n", data.loc[:, "ATN_status"].value_counts())

    csf_features = ['ABETA', 'TAU', 'PTAU']

    demo_features = [
        'IMAGEUID', 'COLPROT', 'SITE', 'AGE', 'PTGENDER', 'PTEDUCAT',
        'EDU-ATTAIN'
    ]
    features = pd.concat(
        (data.loc[:, demo_features], data.loc[:, "ATN_status"],
         data.loc[:, csf_features], mri_features),
        axis=1,
        join="inner")

    assert features.notnull().all().all()
    assert y.notnull().all()

    return features, y
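# Hedged usage sketch (assumes the module-level _ADNIMERGE_PATH, Volumes, LOG and
# drop_outliers referenced above are available):
features, y = load_adnimerge(remove_outliers=True, outcome="ADAS13")
y_log = np.log1p(y)  # the outcome is strictly positive here, so log1p is well defined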
Example #50
params = {"objective": "reg:linear",
          "booster": "gbtree",
          "eta": 0.3,
          "max_depth": 10,
          "subsample": 0.9,
          "colsample_bytree": 0.7,
          "silent": 1,
          "seed": 1301
          }
num_boost_round = 300

print("Train a XGBoost model")
X_train, X_valid = train_test_split(train, test_size=0.012, random_state=10)
print(X_train.columns)

y_train = np.log1p(X_train.Sales)
y_valid = np.log1p(X_valid.Sales)
dtrain = xgb.DMatrix(X_train[features], y_train)
dvalid = xgb.DMatrix(X_valid[features], y_valid)

watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \
                early_stopping_rounds=100, feval=rmspe_xg, verbose_eval=True)

print("Validating")
yhat = gbm.predict(xgb.DMatrix(X_valid[features]))
error = rmspe(X_valid.Sales.values, np.expm1(yhat))
print('RMSPE: {:.6f}'.format(error))

print("Make predictions on the test set")
dtest = xgb.DMatrix(test[features])
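# The snippet above relies on rmspe and rmspe_xg without defining them.  One common
# definition is sketched below (hedged -- the original helpers may differ); note that
# the labels seen by rmspe_xg are in log1p space, hence the expm1 round-trip.
def rmspe(y, yhat):
    mask = y != 0  # ignore zero-sales rows, which would otherwise divide by zero
    return np.sqrt(np.mean(((y[mask] - yhat[mask]) / y[mask]) ** 2))

def rmspe_xg(yhat, dtrain):
    y = np.expm1(dtrain.get_label())
    return 'rmspe', rmspe(y, np.expm1(yhat))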
Example #51
 def _softsign_ildj_before_reduction(self, y):
     """Inverse log det jacobian, before being reduced."""
     return -2. * np.log1p(-np.abs(y))
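# Hedged sanity check of the formula above: for softsign, x = y / (1 - |y|), so
# log|dx/dy| = -2 * log(1 - |y|), which a central finite difference confirms.
import numpy as np
y0, eps = 0.5, 1e-6
fd = ((y0 + eps) / (1 - (y0 + eps)) - (y0 - eps) / (1 - (y0 - eps))) / (2 * eps)
assert np.isclose(-2. * np.log1p(-abs(y0)), np.log(fd), atol=1e-4)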
Example #52
        'TA': 0,
        'Fa': 1,
        'Po': 1
    })

    # take explicit copies so the log1p / scaling assignments below do not hit
    # pandas' SettingWithCopy behaviour on a slice of alldata
    train_new = alldata[alldata['SalePrice'].notnull()].copy()
    test_new = alldata[alldata['SalePrice'].isnull()].copy()

    numeric_features = [
        f for f in train_new.columns if train_new[f].dtype != object
    ]
    skewed = train_new[numeric_features].apply(
        lambda x: skew(x.dropna().astype(float)))
    skewed = skewed[skewed > 0.75]
    skewed = skewed.index
    train_new[skewed] = np.log1p(train_new[skewed])
    test_new[skewed] = np.log1p(test_new[skewed])
    del test_new['SalePrice']

    scaler = StandardScaler()
    scaler.fit(train_new[numeric_features])
    scaled = scaler.transform(train_new[numeric_features])

    for i, col in enumerate(numeric_features):
        train_new[col] = scaled[:, i]

    numeric_features.remove('SalePrice')
    scaled = scaler.fit_transform(test_new[numeric_features])

    for i, col in enumerate(numeric_features):
        test_new[col] = scaled[:, i]
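# Standalone hedged illustration (toy data, not part of the snippet above) of the
# skew-threshold idea: only columns whose sample skewness exceeds 0.75 receive the
# log1p transform.  Assumes pandas as pd, numpy as np and scipy.stats.skew, as above.
toy = pd.DataFrame({
    'heavy_tail': np.exp(np.random.randn(1000)),  # lognormal, skew >> 0.75
    'symmetric': np.random.randn(1000),           # skew around 0, stays untouched
})
toy_skew = toy.apply(lambda x: skew(x.dropna()))
high_skew = toy_skew[toy_skew > 0.75].index
toy[high_skew] = np.log1p(toy[high_skew])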
Example #53
 def _logsf(self, x, p):
     k = floor(x)
     return k * log1p(-p)
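# Hedged check of the formula above: the geometric survival function is
# sf(k) = (1 - p)**floor(k), so logsf(5, 0.3) should equal 5 * log1p(-0.3).
import numpy as np
from scipy.stats import geom
assert np.isclose(geom.logsf(5, 0.3), 5 * np.log1p(-0.3))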
Example #54
File: LITA.py Project: ifuding/ASAC
    test,
    sub3,
    how='left',
    on='ID',
)

from scipy.sparse import csr_matrix, vstack
train = train.replace(0, np.nan)
test = test.replace(0, np.nan)
train = pd.concat((train, test), axis=0, ignore_index=True)

test['target'] = 0.0
folds = 5
for fold in range(folds):
    x1, x2, y1, y2 = model_selection.train_test_split(train[col],
                                                      np.log1p(
                                                          train.target.values),
                                                      test_size=0.20,
                                                      random_state=fold)
    params = {
        'learning_rate': 0.02,
        'max_depth': 7,
        'boosting': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'is_training_metric': True,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'seed': fold
    }
    # the original call is truncated here; a minimal assumed completion follows
    model = lgb.train(params, lgb.Dataset(x1, label=y1),
                      num_boost_round=3000,
                      valid_sets=[lgb.Dataset(x2, label=y2)])
    test['target'] += np.expm1(model.predict(test[col])) / folds  # assumed fold averaging
    print()

    # conserve memory
    #del train_df
    #return pandas_frame
    return train_df


# Get data
pandas_train = pd.read_csv(
    'C:/Users/Soomin/Google Drive/01. MSBA/03. Summer 2017/Machine Learning/Project/House Prices/train.csv'
)
pandas_train.shape

# Log transform the target for official scoring
pandas_train.SalePrice = np.log1p(pandas_train.SalePrice)
y = pandas_train.SalePrice

# << Preprocessing >>

# Lotfrontage
temp = pandas_train.groupby('Neighborhood',
                            as_index=False)['LotFrontage'].median()
temp = temp.rename(columns={"LotFrontage": "LotFrontage2"})
pandas_train = pd.merge(pandas_train, temp, how='left', on='Neighborhood')
missing_lf = pandas_train['LotFrontage'].isnull()
# use .loc so the fill is not lost to chained-assignment (SettingWithCopy) behaviour
pandas_train.loc[missing_lf, 'LotFrontage'] = pandas_train['LotFrontage2'][missing_lf]
pandas_train = pandas_train.drop('LotFrontage2', axis=1)

# Alley
pandas_train["Alley"].fillna("None", inplace=True)
Example #56
def logsubexp(x, y):
    assert all(x >= y), 'cannot take log of negative number %s - %s' % (str(x),
                                                                        str(y))

    return x + log1p(-exp(y - x))
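# Hedged numeric check (assumes log1p/exp above are numpy's, and passes arrays so
# that `all(x >= y)` iterates): log(exp(x) - exp(y)) for 5 and 3 is log(2).
import numpy as np
x, y = np.log(np.array([5.0])), np.log(np.array([3.0]))
assert np.allclose(logsubexp(x, y), np.log(2.0))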
Example #57
    def read_floor_ys(self,
                      output_dim,
                      include_floor_number=None,
                      only_biggest_floor=False,
                      sorted_xs=False,
                      upscale_xs_factor=1,
                      xs_from_biggest_floor=False,
                      floor_always_positive=False,
                      verbose=0):
        """
        :param int output_dim:
        :param bool include_floor_number:
        :param bool only_biggest_floor:
        :param bool sorted_xs: this is useful for plotting (dump-dataset --type plot).
            Otherwise you probably do not want this, because if your output_dim < len(xs), you might miss
            important information.
            Except with upscale_xs_factor, where this again probably makes sense.
        :param float|int upscale_xs_factor:
        :param bool xs_from_biggest_floor: False is old behavior, but probably you want to use this
          (only relevant if not only_biggest_floor)
        :param bool floor_always_positive:
        :param int verbose:
        :return: float values in [-1,1], shape (time,dim)
        :rtype: numpy.ndarray
        """
        if only_biggest_floor:
            assert include_floor_number in (None, False)
            include_floor_number = False
        if include_floor_number is None:
            include_floor_number = True
        floor_multipliers = []
        floor_xs = []
        floor_xs_upscaled = []
        while True:
            name, channel, data = self.read_entry()
            if name == "floor1_unpack multiplier":
                assert len(data) == 1
                floor_multipliers.append(data[0])
            if name == "floor1_unpack xs":
                if sorted_xs:
                    data = sorted(data)
                floor_xs.append(numpy.array(data))
                if upscale_xs_factor != 1:
                    import scipy.ndimage
                    data_upscaled = scipy.ndimage.zoom(numpy.array(
                        data, dtype="float32"),
                                                       zoom=upscale_xs_factor,
                                                       order=1,
                                                       mode="nearest")
                    data_upscaled = numpy.round(data_upscaled).astype("int32")
                    assert data_upscaled.shape[0] == len(
                        data) * upscale_xs_factor
                    floor_xs_upscaled.append(data_upscaled)
            if name == "finish_setup":
                break
        assert len(floor_multipliers) == len(floor_xs) > 0
        res_float = numpy.zeros((500, output_dim), dtype="float32")
        num_floors = len(floor_xs)
        biggest_floor_idx = max(range(num_floors),
                                key=lambda i: len(floor_xs[i]))
        dim = output_dim
        if include_floor_number:
            dim -= 1
        if verbose:
            if verbose >= 5:
                for i in range(num_floors):
                    print(
                        "Floor %i/%i, multiplier %i, xs: %r" %
                        (i + 1, num_floors, floor_multipliers[i], floor_xs[i]))
                print(
                    "Biggest floor: %i, len(xs) = %i" %
                    (biggest_floor_idx + 1, len(floor_xs[biggest_floor_idx])))
            if dim > len(floor_xs[biggest_floor_idx]):
                print("Warning: Dim = %i > len(biggest floor xs) = %i" %
                      (dim, len(floor_xs[biggest_floor_idx])))
        recent_floor_number = None
        frame_num = 0
        offset_dim = 0
        while True:
            try:
                name, channel, data = self.read_entry()
            except EOFError:
                break

            if name == "floor_number":
                recent_floor_number = data[0]
                assert 0 <= recent_floor_number < len(floor_xs)

            xs = None
            factor = None
            if recent_floor_number is not None:
                if only_biggest_floor and recent_floor_number != biggest_floor_idx:
                    continue
                xs = floor_xs_upscaled if floor_xs_upscaled else floor_xs
                if xs_from_biggest_floor:
                    xs = xs[biggest_floor_idx]
                    if biggest_floor_idx != recent_floor_number:
                        max_big_x = max(floor_xs[biggest_floor_idx])
                        max_cur_x = max(floor_xs[recent_floor_number])
                        factor = int(round(
                            float(max_big_x) / float(max_cur_x)))
                        xs = xs // factor
                    xs = numpy.clip(xs, 0, len(data) - 1)
                else:
                    xs = xs[recent_floor_number]

            if name in {"floor1 ys", "floor1 final_ys"}:
                assert recent_floor_number is not None
                if only_biggest_floor and recent_floor_number != biggest_floor_idx:
                    continue
                assert len(data) == len(floor_xs[recent_floor_number])
                # values [0..255]
                data_int = numpy.array(
                    data[:dim],
                    dtype="float32") * floor_multipliers[recent_floor_number]
                if floor_always_positive:
                    # values [0,1.0]
                    data_float = data_int.astype("float32") / 255.0
                else:
                    # values [-1.0,1.0]
                    data_float = (data_int.astype("float32") - 127.5) / 127.5
                frame_float = numpy.zeros((output_dim, ), dtype="float32")
                offset_dim = 0
                if include_floor_number:
                    frame_float[0] = (recent_floor_number +
                                      1.0) / num_floors - 0.5  # (-0.5,0.5)
                    offset_dim = 1
                frame_float[offset_dim:offset_dim +
                            data_float.shape[0]] = data_float
                if frame_num >= res_float.shape[0]:
                    res_float = numpy.concatenate(
                        [res_float, numpy.zeros_like(res_float)], axis=0)
                res_float[frame_num] = frame_float
                frame_num += 1
            elif name == "floor1 floor":
                assert recent_floor_number is not None
                data = numpy.array(data)[xs]
                # values [0..255] (data is already with multiplier)
                data_int = numpy.array(data[:dim], dtype="float32")
                if floor_always_positive:
                    # values [0,1.0]
                    data_float = data_int.astype("float32") / 255.0
                else:
                    # values [-1.0,1.0]
                    data_float = (data_int.astype("float32") - 127.5) / 127.5
                frame_float = numpy.zeros((output_dim, ), dtype="float32")
                offset_dim = 0
                if include_floor_number:
                    frame_float[0] = (recent_floor_number +
                                      1.0) / num_floors - 0.5  # (-0.5,0.5)
                    offset_dim = 1
                frame_float[offset_dim:offset_dim +
                            data_float.shape[0]] = data_float
                offset_dim += data_float.shape[0]
                if frame_num >= res_float.shape[0]:
                    res_float = numpy.concatenate(
                        [res_float, numpy.zeros_like(res_float)], axis=0)
                res_float[frame_num] = frame_float
                frame_num += 1
            elif name == "after_residue":
                assert recent_floor_number is not None
                if offset_dim == 0:  # no floor before, can happen for some
                    continue
                assert frame_num > 0  # had floor before
                assert output_dim >= offset_dim
                # Could use xs, but instead, this seems more interesting.
                idxs = numpy.arange(start=0, stop=len(data), step=1)
                if factor:
                    idxs = idxs // factor
                # Some hardcoded hyper params here...
                data = numpy.array(data)[idxs]
                data = numpy.log1p(numpy.abs(data)) * 0.1
                import scipy.ndimage
                data = scipy.ndimage.zoom(data, zoom=0.5)
                data = data[:output_dim - offset_dim]
                res_float[frame_num - 1,
                          offset_dim:offset_dim + data.shape[0]] = data
                offset_dim = 0
        return res_float[:frame_num]
Example #58
def log1p(obj):
    obj = to_dual(obj)
    return Dual(np.log1p(obj.re), obj.im / (1 + obj.re))
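# Hedged usage sketch of the dual-number log1p above (Dual and to_dual are assumed to
# come from the surrounding module, with .re/.im fields): the dual part carries the
# derivative d/dx log1p(x) = 1 / (1 + x).
d = log1p(Dual(1.0, 1.0))
assert np.isclose(d.re, np.log(2.0)) and np.isclose(d.im, 0.5)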
Example #59
def normalizing(frame):
    normalised_dset = np.log1p(frame)
    return normalised_dset
Example #60
 def read_residue_ys(self,
                     output_dim,
                     scale=1.0,
                     clip_abs_max=None,
                     log1p_abs_space=False,
                     sorted_xs=False,
                     ignore_xs=False,
                     floor_base_factor=1):
     """
     :param int output_dim:
     :param float scale:
     :param float clip_abs_max:
     :param bool log1p_abs_space:
     :param float floor_base_factor:
     :param bool sorted_xs: this is useful for plotting (dump-dataset --type plot).
         Otherwise you probably do not want this, because if your output_dim < len(xs), you might miss
         important information.
     :param bool ignore_xs:
     :return: float values in [-1,1], shape (time,dim)
     :rtype: numpy.ndarray
     """
     floor_multipliers = []
     floor_xs = []
     while True:
         name, channel, data = self.read_entry()
         if name == "floor1_unpack multiplier":
             assert len(data) == 1
             floor_multipliers.append(data[0])
         if name == "floor1_unpack xs":
             if sorted_xs:
                 data = sorted(data)
             floor_xs.append(numpy.array(data, dtype="int32"))
         if name == "finish_setup":
             break
     assert len(floor_multipliers) == len(floor_xs) > 0
     res_float = numpy.zeros((500, output_dim), dtype="float32")
     num_floors = len(floor_xs)
     biggest_floor_idx = max(range(num_floors),
                             key=lambda i: len(floor_xs[i]))
     recent_floor_number = None
     frame_num = 0
     floor_base = None
     while True:
         try:
             name, channel, data = self.read_entry()
         except EOFError:
             break
         if name == "floor_number":
             recent_floor_number = data[0]
             assert 0 <= recent_floor_number < len(floor_xs)
         idxs = None
         if recent_floor_number is not None:
             if ignore_xs:
                 idxs = numpy.arange(start=0, stop=len(data),
                                     step=1)[:output_dim]
             else:
                 idxs = floor_xs[recent_floor_number][:output_dim]
                 # We might be just at the edge (e.g. idx==512 and len(data)==512).
                 idxs = numpy.clip(idxs, 0, len(data) - 1)
         if name == "floor1 floor":
             assert recent_floor_number is not None
             if recent_floor_number != biggest_floor_idx:
                 continue
             data = numpy.array(data)[idxs]
             # values [0..255] (data is already with multiplier)
             data_int = numpy.array(data, dtype="float32")
             # values [0.0,1.0]
             data_float = (data_int.astype("float32")) / 255.0
             floor_base = data_float
         if name == "after_residue":
             assert recent_floor_number is not None
             if recent_floor_number != biggest_floor_idx:
                 continue
             data_float = numpy.array(data, dtype="float32")
             selected_data = data_float[idxs]
             if not ignore_xs:
                 assert len(selected_data) == len(
                     floor_xs[recent_floor_number])
             assert isinstance(selected_data, numpy.ndarray)
             if log1p_abs_space:
                 selected_data = numpy.log1p(numpy.abs(selected_data))
             if floor_base is not None:
                 if log1p_abs_space:
                     selected_data += floor_base * floor_base_factor
                 else:
                     selected_data *= numpy.exp(
                         (floor_base - 1.0) * floor_base_factor)
             if scale != 1:
                 selected_data *= scale
             if clip_abs_max is not None and clip_abs_max > 0:
                 selected_data = numpy.clip(selected_data, -clip_abs_max,
                                            clip_abs_max)
             frame_float = numpy.zeros((output_dim, ), dtype="float32")
             frame_float[0:selected_data.shape[0]] = selected_data
             if frame_num >= res_float.shape[0]:
                 res_float = numpy.concatenate(
                     [res_float, numpy.zeros_like(res_float)], axis=0)
             res_float[frame_num] = frame_float
             frame_num += 1
     return res_float[:frame_num]