Beispiel #1
0
def skew_report(dataframe: pd.DataFrame, threshold: int = 5) -> None:
    """Print which columns of *dataframe* are highly skewed.

    A column is flagged when the absolute value of its skewness exceeds
    *threshold*.  Only numeric columns are considered.

    Args:
        dataframe: data to inspect.
        threshold: absolute-skew cutoff above which a column is reported.
    """
    # Pair each skew value with its own column label.  The original zipped
    # dataframe.columns against skew(numeric_only=True), which misaligns the
    # names whenever the frame contains non-numeric columns.
    skew_values = dataframe.skew(numeric_only=True).abs()
    highly_skewed = [col for col, val in skew_values.items() if val > threshold]
    print(f"There are {len(highly_skewed)} highly skewed data columns.")
    if highly_skewed:
        print("Please check them for miscoded na's")
        print(highly_skewed)
Beispiel #2
0
    def test_skew(self):
        """Series/DataFrame.skew matches scipy's bias-corrected skew, and
        returns NaN when fewer than three observations are available."""
        from scipy.stats import skew

        string_series = tm.makeStringSeries().rename("series")

        def alt(values):
            # scipy's bias=False matches pandas' sample skewness.
            return skew(values, bias=False)

        self._check_stat_op("skew", alt, string_series)

        # Corner cases: skew() is only defined for three or more values;
        # below that it must come back NaN for both Series and DataFrame.
        minimum = 3
        for size in range(1, minimum + 1):
            ser = Series(np.ones(size))
            frame = DataFrame(np.ones((size, size)))
            if size < minimum:
                assert np.isnan(ser.skew())
                assert np.isnan(frame.skew()).all()
            else:
                # Constant data has exactly zero skew once it is defined.
                assert ser.skew() == 0
                assert (frame.skew() == 0).all()
    def test_skew(self):
        """Series/DataFrame.skew agrees with scipy.stats.skew(bias=False)
        and returns NaN when there are fewer than three observations."""
        from scipy.stats import skew

        string_series = tm.makeStringSeries().rename('series')

        # Reference implementation: scipy's bias-corrected sample skewness.
        alt = lambda x: skew(x, bias=False)
        self._check_stat_op('skew', alt, string_series)

        # test corner cases, skew() returns NaN unless there's at least 3
        # values
        min_N = 3
        for i in range(1, min_N + 1):
            s = Series(np.ones(i))
            df = DataFrame(np.ones((i, i)))
            if i < min_N:
                assert np.isnan(s.skew())
                assert np.isnan(df.skew()).all()
            else:
                # Constant data: skew is exactly zero once defined.
                assert 0 == s.skew()
                assert (df.skew() == 0).all()
Beispiel #4
0
def moments_features(path):
    """Return the first three color moments of each B, G, R channel of an image.

    Args:
        path: filesystem path to the image file.

    Returns:
        A flat list of nine floats -- mean, std and skew for each of the
        B, G and R channels (OpenCV channel order) -- or None when the file
        does not exist or cannot be decoded as an image.
    """
    if not os.path.exists(path):
        # Lazy %-formatting; original message had broken grammar ("is not exist").
        logger.error("%s does not exist!", path)
        return
    im = cv2.imread(path)
    if im is None:
        # cv2.imread returns None (no exception) for unreadable/corrupt files.
        logger.error("%s could not be read as an image!", path)
        return
    [b, g, r] = cv2.split(im)
    moments = []
    for channel in [b, g, r]:
        # One-column DataFrame over the flattened channel gives mean/std/skew
        # via pandas; [0] selects the single column's statistic.
        df = DataFrame(np.array(channel.flatten()))
        moments.extend(float(x) for x in [df.mean()[0], df.std()[0], df.skew()[0]])
    return moments
Beispiel #5
0
    def myconnect(self):
        """Connect to the MySQL database described by the form fields, prompt
        for a table name, and display summary statistics for that table.

        NOTE(review): Python 2 syntax (`except Exception, e:`) -- this method
        will not parse under Python 3.
        """
        # Raw text from the connection-form widgets.
        user_text1 = str(self.host_le.text())
        user_text2 = str(self.user_le.text())
        user_text3 = str(self.password_le.text())
        user_text4 = str(self.db_le.text())

        try:
            mcon = MySQLdb.connect(host=user_text1, user=user_text2, passwd=user_text3, db=user_text4)
            self.browser.setText("[*] Welcome, connection successful.")
            text, ok = QInputDialog.getText(self, "Table Name", "Enter table name:")
            if ok and text:
                tb_name = str(text)
                try:
                    # SECURITY: the table name is interpolated directly into the
                    # SQL string -- vulnerable to SQL injection; whitelist or
                    # validate `tb_name` before use.
                    sq_tb = pis.read_sql('select * from '+ ' %s ' % tb_name, mcon)
                    df = DataFrame(sq_tb)
                    mcon.close()
                    # Pre-render every statistic as text for the report below.
                    size = str(len(df))
                    stat_description = df.describe()
                    stats = str(stat_description)
                    kt = str(df.kurt())
                    skew = str(df.skew())
                    cov = str(df.cov())
                    corr = str(df.corr())
                    head = str(df.head())
                    tail = str(df.tail())
                    summation = str(stat_description.sum())
                    # Assemble the full report into the second browser widget.
                    self.browser1.setText("Size: " +"%s " %size +"\n"\
                                          +"Statistics:" +"\n"\
                                          +" %s " %stats +"\n"\
                                          +"Kurt:" +"\n"\
                                          +"%s" %kt +"\n"\
                                          +"Skew:" +"\n"\
                                          +"%s" %skew +"\n"\
                                          +"Covarriance:" +"\n"\
                                          +"%s" %cov +"\n"\
                                          +"Correlation:" +"\n"\
                                          +"%s" %corr +"\n"\
                                          +"Summation:" +"\n"\
                                          +"%s" %summation +"\n"\
                                          +"Head:" +"\n"\
                                          +"%s" % head +"\n"\
                                          +"Tail:" +"\n"\
                                          +"%s" %tail)
                    self.browser.setText(stats)
                    # Clear the credential fields after a successful run.
                    self.host_le.clear()
                    self.user_le.clear()
                    self.password_le.clear()
                    self.db_le.clear()
                    
                except Exception, e:
                    # Bare best-effort handling: any query/formatting failure is
                    # reported as a bad table name; `e` is discarded.
                    self.browser.setText("[*] Ensure that the table name is correct and try again.")
        except Exception, e:
            # Connection failures (wrong host/user/password/db) end up here.
            self.browser.setText("Please specify correct connection details and try again")
Beispiel #6
0
def getstastv(a, apiset):
    """Build a per-API statistics table.

    Args:
        a: 2-D array, one row per reference point ('RP1', 'RP2', ...),
           one column per API in *apiset*.
        apiset: column labels for *a*.

    Returns:
        (sta, stv) where `sta` is the transposed DataFrame (APIs as rows,
        reference points as columns) and `stv` holds range, variance, skew,
        kurtosis and coefficient of variation computed across reference
        points for each API.
    """
    row_labels = ['RP' + str(k + 1) for k in range(a.shape[0])]
    sta = DataFrame(data=a, index=row_labels, columns=apiset).transpose()

    # Each statistic is taken across the reference points (axis=1).
    summary = {
        'range': sta.max(axis=1) - sta.min(axis=1),
        'var': sta.var(axis=1),
        'skew': sta.skew(axis=1),
        'kurt': sta.kurt(axis=1),
        'cv': sta.std(axis=1) / sta.mean(axis=1),
    }
    stv = DataFrame(summary, index=sta.index)

    return sta, stv
Beispiel #7
0
    def extractFeatures(self, data: DataFrame, columns):
        """Concatenate per-column statistics with FFT features.

        For every column of *data* this collects the mean, covariance,
        kurtosis and skewness, appends the FFT features produced by
        `self.fft_T_function`, and returns one flat numpy feature vector.
        """
        mean_feats = np.array(data.apply(np.average, axis=0))
        cov_feats = np.array(list(data.apply(np.cov, axis=0).values))
        # DataFrame.kurt()/skew() give per-column kurtosis and skewness.
        kurt_feats = np.array(data.kurt(axis=0))
        skew_feats = np.array(data.skew(axis=0))

        fft_feats = np.array(self.fft_T_function(data, columns))

        # 36 columns * 5 feature groups = 180 entries.
        return np.concatenate(
            (mean_feats, cov_feats, kurt_feats, skew_feats, fft_feats))
Beispiel #8
0
def moments_features(path):
    """Return the first three color moments of each B, G, R channel of an image.

    Args:
        path: filesystem path to the image file.

    Returns:
        A flat list of nine floats -- mean, std and skew for each of the
        B, G and R channels (OpenCV channel order) -- or None when the file
        does not exist or cannot be decoded as an image.
    """
    if not os.path.exists(path):
        # Lazy %-formatting; original message had broken grammar ("is not exist").
        logger.error("%s does not exist!", path)
        return
    im = cv2.imread(path)
    if im is None:
        # cv2.imread returns None (no exception) for unreadable/corrupt files.
        logger.error("%s could not be read as an image!", path)
        return
    [b, g, r] = cv2.split(im)
    moments = []
    for channel in [b, g, r]:
        # One-column DataFrame over the flattened channel gives mean/std/skew
        # via pandas; [0] selects the single column's statistic.
        df = DataFrame(np.array(channel.flatten()))
        moments.extend(float(x) for x in [df.mean()[0], df.std()[0], df.skew()[0]])
    return moments
Beispiel #9
0
def _feature_extraction(data: pd.DataFrame) -> pd.Series:
    def nlargest_index(df, n):
        return df.nlargest(n).index.unique()[0:n]

    # first 225 statistical features
    statistical = data.min()
    statistical = statistical.append(data.max(), ignore_index=True)
    statistical = statistical.append(data.mean(), ignore_index=True)
    statistical = statistical.append(data.skew(), ignore_index=True)
    statistical = statistical.append(data.kurtosis(), ignore_index=True)

    # FFT features
    fft = pd.DataFrame(np.fft.fft(data))
    fft_angle = fft.applymap(np.angle)
    fft = fft.applymap(np.abs)
    largest_values = pd.Series()
    largest_angles = pd.Series()
    largest_indices = pd.Series()
    for i in range(0, 45):
        five_largest_idx = nlargest_index(fft.ix[:, i].map(abs), 5)  # is map(abs) redundant?
        largest_indices = largest_indices.append(pd.Series(five_largest_idx),
                                                 ignore_index=True)
        five_largest = fft_angle.ix[five_largest_idx, i].T
        largest_angles = largest_angles.append(five_largest)
        five_largest = fft.ix[five_largest_idx, i].T
        largest_values = largest_values.append(five_largest)

    # Autocorrelation
    autocorrelation = pd.Series()
    autocorrelation = autocorrelation.append(data.apply(lambda col: col.autocorr(1), axis=0))
    for i in range(5, 51, 5):
        autocorrelation = autocorrelation.append(data.apply(lambda col: col.autocorr(i), axis=0))

    # Make result
    feature_vector = pd.Series()
    feature_vector = feature_vector.append(statistical)
    feature_vector = feature_vector.append(largest_values)
    feature_vector = feature_vector.append(largest_angles)
    feature_vector = feature_vector.append(largest_indices)
    feature_vector = feature_vector.append(autocorrelation)
    return feature_vector
Beispiel #10
0
def filedata(filename):
    """Parse an accelerometer log file and return the averaged feature matrix.

    Each data line carries '*'-separated "x,y,z" acceleration triples in its
    last whitespace-separated token; lines starting with '#' are skipped, and
    lines whose first token contains 'ACT' carry the activity label that is
    appended to the most recently computed feature vector.

    NOTE(review): if an 'ACT' line appears before any data line,
    `eigen_choose` is referenced before assignment -- presumably the input
    format guarantees data lines come first; verify against real logs.
    `passmean` and `avg_data` are helpers defined elsewhere in the project.
    """
    f=open(filename,'r')
    eigen_list = []
    Act_fact = []
    for line in f.readlines():
        line=line.strip().split(" ")
        if '#' in line[0]:
            # Comment line in the log: ignored (prints a blank separator).
            print(' ')
        else:
            # Collect per-axis samples: AccXYZ[0]=x, AccXYZ[1]=y, AccXYZ[2]=z.
            AccXYZ =[[],[],[]]
            Acc = line[-1].split("*")
            for Accxyz in Acc:
                Accxyz=Accxyz.split(",")
                if len(Accxyz) == 3:
                    AccXYZ[0].append(float(Accxyz[0]))
                    AccXYZ[1].append(float(Accxyz[1]))
                    AccXYZ[2].append(float(Accxyz[2]))
            AccXYZ = np.array(AccXYZ)
            num = float(AccXYZ.shape[1])
            df = DataFrame(AccXYZ)
            skew = df.skew(1)  # skewness per axis
            kurt = df.kurt(1)  # kurtosis per axis
            mean = AccXYZ.mean(1)  # mean per axis
            std = AccXYZ.std(1)  # standard deviation per axis
            fft = np.fft.fft(AccXYZ)  # Fourier transform per axis
            pass_mean = passmean(AccXYZ,mean)/num  # mean-crossing rate
            # Stack all per-axis features into one flat vector:
            # [skew, kurt, mean, crossing-rate, std, normalized spectral energy].
            eigen_choose1 = np.append(np.array(skew),np.array(kurt))
            eigen_choose2= np.append(mean,pass_mean)
            eigen_choose3= np.append(std,pow(abs(fft),2).sum(1)/num)
            eigen_choose12 = np.append(eigen_choose1,eigen_choose2)
            eigen_choose = np.append(eigen_choose12,eigen_choose3)
        if  'ACT' in line[0] :
            # Activity label line: append the integer label and store the row.
            Act = line[-1].split(",")
            eigen_choose = np.append(eigen_choose,int(Act[-1]))
            eigen_list.append(list(eigen_choose))
    f.close()
    eigen_fact_matrix = avg_data(eigen_list)
    return eigen_fact_matrix
Beispiel #11
0
def transform_features(X_train: pd.DataFrame, X_valid: pd.DataFrame,
                       X_test: pd.DataFrame, parameters: dict) -> list:
    """Apply log transformations to skewed features

    Features whose skew (measured on the training set) exceeds the configured
    threshold are log-transformed in all three data sets; inclusion and
    exclusion substrings further narrow the selection.

    Args:
        X_train: training data.

        X_valid: validation data.

        X_test: test data.

        parameters: nested configuration; reads
            parameters["features"]["transformation"] with keys
            "skew_threshold", "inclusions", "exclusions" and "drop_vars".

    Returns:
        A list containing the transformed training, validation and test data.
    """

    log = logging.getLogger(__name__)
    paras = parameters["features"]["transformation"]
    threshold = paras[
        "skew_threshold"]  # threshold for applying log transformation
    inclusions = paras[
        "inclusions"]  # strings to include in the transformation
    exclusions = paras[
        "exclusions"]  # strings to exclude from the transformation

    # Select highly skewed variables (skew measured on training data only).
    feature_skew = X_train.skew().sort_values(ascending=False)
    skewed = list(feature_skew[feature_skew > threshold].index)
    log.info(
        blue("{} feature(s) exceed skew threshold of {}.".format(
            len(skewed), threshold)))
    if inclusions:
        n_skewed = len(skewed)
        # Loop variable renamed: the original shadowed the builtin `str`.
        skewed = [
            var for var in skewed if any(sub in var for sub in inclusions)
        ]
        n_excluded = n_skewed - len(skewed)
        log.warning(
            red("Including {} variable(s) containing:\n{}.".format(
                n_excluded, inclusions)))
        pause()
    if exclusions:
        n_skewed = len(skewed)
        skewed = [
            var for var in skewed if all(sub not in var for sub in exclusions)
        ]
        n_excluded = n_skewed - len(skewed)
        log.warning(
            red("Excluding {} variable(s) containing:\n{}.".format(
                n_excluded, exclusions)))
        pause()

    # Check variables are correct
    log.warning(
        red("Transforming {} variable(s): {}.".format(len(skewed), skewed)))
    pause()

    # Apply log transformation to skewed variables
    log.info(
        blue("Applying log transformation to {} feature(s).".format(
            len(skewed))))
    # Work on copies: the original aliased the inputs, so adding the log_*
    # columns mutated the callers' DataFrames in place.
    log_X_train = X_train.copy()
    log_X_valid = X_valid.copy()
    log_X_test = X_test.copy()
    for var in skewed:
        var_name = "log_" + var
        log_X_train[var_name] = log_offset(log_X_train[var])
        log_X_valid[var_name] = log_offset(log_X_valid[var])
        log_X_test[var_name] = log_offset(log_X_test[var])
        if paras["drop_vars"]:
            log_X_train = log_X_train.drop(var, axis=1)
            log_X_valid = log_X_valid.drop(var, axis=1)
            log_X_test = log_X_test.drop(var, axis=1)

    # Reorder columns -- all three sets are aligned to the (sorted) training
    # columns so they share a consistent column order.
    log_X_train = log_X_train.reindex(sorted(log_X_train.columns), axis=1)
    log_X_valid = log_X_valid.reindex(sorted(log_X_train.columns), axis=1)
    log_X_test = log_X_test.reindex(sorted(log_X_train.columns), axis=1)

    return [log_X_train, log_X_valid, log_X_test]
Beispiel #12
0
    del yhat
    #del path1
    #del path2

# Making Data for RMSE statistics
# NOTE(review): `data_frame` and `directory_name3` are defined earlier in the
# original script and are not visible in this excerpt.
data_frame.to_csv('%s/after_RMSE.csv' % (directory_name3))

# Round-trip through CSV; the DataFrame() call is redundant (read_csv already
# returns a DataFrame) but is kept as written.
data = read_csv('%s/after_RMSE.csv' % (directory_name3), header=0, index_col=0)
data = DataFrame(data)

# Column-wise summary statistics of the RMSE table.
s = data.sum()
ave = data.mean()
median = data.median()
var = data.var()
std = data.std()
skew = data.skew()
kurt = data.kurt()

# Row labels for the statistics table assembled below.
frame_name = ['sum', 'ave', 'median', 'var', 'std', 'skew', 'kurt']
frame_name = DataFrame(frame_name)

# Strip the pandas indices, keeping raw numpy arrays only.
s = s.values
ave = ave.values
median = median.values
var = var.values
std = std.values
skew = skew.values
kurt = kurt.values

# Rebuild as a frame: one row per statistic, one column per original column.
data_numpy = [s, ave, median, var, std, skew, kurt]
data = DataFrame(data_numpy)
# In[227]:

# NOTE(review): `df` comes from an earlier notebook cell not shown here.
df.describe()

# In[229]:

# Object-dtype Series example for describe().
obj = Series(['a', 'a', 'b', 'c'] * 4)
obj

# In[230]:

obj.describe()

# In[231]:

# Per-column skewness of df.
df.skew()

# In[233]:

from pandas_datareader import data as web

# Download a decade of daily quotes per ticker.
# NOTE(review): get_data_yahoo relies on an external Yahoo endpoint and may
# fail with current pandas-datareader versions -- verify before relying on it.
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')

# In[237]:

# Wide frames: one column per ticker.
price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.items()})
volume = DataFrame({tic: data['Volume'] for tic, data in all_data.items()})

# In[239]: