Example #1
def data_process(load_train, load_test):

    # Read the data and define the column groups to process
    df = pd.read_csv(load_train)
    df_test = pd.read_csv(load_test)
    feature_index = [
        "BILL_1_s", "BILL_2_s", "BILL_3_s", "BILL_4_s", "BILL_5_s",
        "BILL_6_s", "AGE", "SEX_le", "EDUCATION_le", "MARRIAGE_le"
    ]
    encoder_index = ["SEX", "EDUCATION", "MARRIAGE", "RISK"]
    stande_index = ["BILL_1", "BILL_2", "BILL_3", "BILL_4", "BILL_5", "BILL_6"]

    # Standardize the numeric columns
    for i in stande_index:
        df_s = StandardScaler().fit_transform(df[[i]])
        df[i + "_s"] = df_s[:, 0]
    df_s = df.drop(stande_index, axis=1)
    # Label-encode each categorical column and append the result
    for i in encoder_index:
        df_encoder = LabelEncoder().fit_transform(df_s[i])
        df_s[i + "_le"] = df_encoder
    # Drop the original categorical columns
    df_new = df_s.drop(encoder_index, axis=1)
    # Split the dataset
    train_features, test_features, train_labels, test_labels = train_test_split(
        df_new[feature_index],
        df_new['RISK_le'],
        test_size=0.2,
        random_state=42)
    return train_features, test_features, train_labels, test_labels
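A minimal call sketch for data_process, assuming pandas and the scikit-learn preprocessing/model_selection imports are in scope; the CSV file names here are hypothetical and must contain the BILL_1..BILL_6, AGE, SEX, EDUCATION, MARRIAGE and RISK columns used above:

# Hypothetical paths, for illustration only
train_features, test_features, train_labels, test_labels = data_process(
    "credit_train.csv", "credit_test.csv")
print(train_features.shape, test_features.shape)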
Example #2
def main(csv_path):
    df = pd.read_csv(csv_path)

    # Separating out the features
    X = df[df.columns[:-1]]
    # Separating out the target
    Y = df[df.columns[-1]]

    # Standardizing the features
    X_standard = StandardScaler().fit_transform(X)

    pca = PCA(n_components='mle')
    pca.fit_transform(X_standard)
    pca_1 = pd.DataFrame(pca.components_, columns=X.columns)

    output = pca_1.copy()
    output['explained_variance_ratio'] = pca.explained_variance_ratio_
    print(output)

    X_standard = pd.DataFrame(X_standard, index=X.index, columns=X.columns)
    # Drop the features whose loading on the first principal component is negative
    X_standard = X_standard.drop(
        pca_1.columns[pca_1.apply(lambda col: col[0] < 0)], axis=1)

    pca = PCA(n_components='mle')
    pca.fit_transform(X_standard)
    pca_2 = pd.DataFrame(pca.components_, columns=X_standard.columns)

    output = pca_2.copy()
    output['explained_variance_ratio'] = pca.explained_variance_ratio_
    print(output)
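A self-contained sketch of the PCA calls used above, on random data, assuming numpy and scikit-learn; n_components='mle' selects the number of components automatically (Minka's MLE) and needs more samples than features:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
data = rng.normal(size=(200, 6))          # toy matrix: 200 samples, 6 features
data_std = StandardScaler().fit_transform(data)

pca = PCA(n_components='mle')
scores = pca.fit_transform(data_std)      # projected samples
print(pca.components_.shape)              # (n_components_kept, 6)
print(pca.explained_variance_ratio_)      # variance share per kept component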
Example #3
def feature_selection(df, target):
    convert_dct = {'integer': 'int64', 'string': 'object', 'float': 'float64', 'boolean': 'bool',
                   'date-iso-8601': 'datetime64[ns]', 'date-eu': 'datetime64[ns]',
                   'date-non-std-subtype': 'datetime64[ns]', 'date-non-std': 'datetime64[ns]', 'gender': 'category',
                   'all-identical': 'category'}
    ptype = Ptype()
    ptype.run_inference(df)
    predicted = ptype.predicted_types
    count_normal_vars = 0
    count_continuous_vars = 0
    features = []
    for key in predicted:
        # print(key, predicted[key])
        if predicted[key] in ('integer', 'float'):
            features.append(key)
    x = df.loc[:, features].values
    x = StandardScaler().fit_transform(x)
    x = pd.DataFrame(x)
    x.columns = features


    X = x.drop(target, axis=1)  # Feature Matrix
    y = x[target]  # Target Variable

    # no of features
    nof_list = np.arange(1, len(features))
    high_score = 0
    # Variable to store the optimum features
    nof = 0
    score_list = []
    for n in range(len(nof_list)):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
        model = LinearRegression()
        rfe = RFE(model, n_features_to_select=nof_list[n])
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_test_rfe = rfe.transform(X_test)
        model.fit(X_train_rfe, y_train)
        score = model.score(X_test_rfe, y_test)
        score_list.append(score)
        if (score > high_score):
            high_score = score
            nof = nof_list[n]
    # print("Optimum number of features: %d" % nof)
    # print("Score with %d features: %f" % (nof, high_score))
    cols = list(X.columns)
    model = LinearRegression()
    # Initializing RFE model
    rfe = RFE(model, n_features_to_select=nof)
    # Transforming data using RFE
    X_rfe = rfe.fit_transform(X, y)
    # Fitting the data to model
    model.fit(X_rfe, y)
    temp = pd.Series(rfe.support_, index=cols)
    selected_features_rfe = temp[temp].index

    quality_measure = nof/len(features)
    return quality_measure
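A minimal, self-contained sketch of the RFE pattern used in feature_selection, on synthetic regression data (scikit-learn's keyword-only n_features_to_select argument assumed):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_regression(n_samples=200, n_features=8, n_informative=4, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.3, random_state=0)

rfe = RFE(LinearRegression(), n_features_to_select=4)
X_tr_rfe = rfe.fit_transform(X_tr, y_tr)
X_te_rfe = rfe.transform(X_te)

model = LinearRegression().fit(X_tr_rfe, y_tr)
print(rfe.support_)                 # boolean mask of the selected features
print(model.score(X_te_rfe, y_te))  # R^2 on the held-out split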
Example #4
def _get_data(case):
    if case == 'case1':
        df_birth = pd.read_csv('dataset_birthwt.csv')
        df_part1 = StandardScaler().fit_transform(df_birth[['age', 'lwt']])
        df_part2 = df_birth[['race', 'smoke', 'ptl', 'ht', 'ui', 'ftv']]
        feat_names = ['age', 'lwt'] + list(df_part2.columns)
        y = StandardScaler().fit_transform(df_birth['bwt'].values[:, None])[:, 0]
        X = np.hstack((df_part1, df_part2))
    elif case == 'case2':
        df_prostate = pd.read_csv('dataset_prostate.csv')
        y = df_prostate['lpsa']
        feat_names = [
            'lcavol', 'lweight', 'age', 'lbph', 'svi', 'lcp', 'gleason',
            'pgg45'
        ]
        X = StandardScaler().fit_transform(df_prostate[feat_names])
    elif case == 'case3':
        bun = ds.load_diabetes()
        X, y = bun.data, bun.target
        X = StandardScaler().fit_transform(X)
        feat_names = bun.feature_names
    elif case == 'case4':
        df_fev = pd.read_csv('dataset_FEV.csv')

        df_fev.drop(labels='id', axis=1, inplace=True)

        feat_names = ['age', u'fev', u'height', u'sex', u'smoke']
        df_part1 = pd.DataFrame(StandardScaler().fit_transform(
            df_fev[feat_names[:-2]].values),
                                columns=feat_names[:-2])
        df_part2 = pd.get_dummies(df_fev[feat_names[-2:]], drop_first=True)
        y = StandardScaler().fit_transform(df_part1['fev'].values[:, None])[:, 0]
        df_part1.drop(labels='fev', axis=1, inplace=True)
        X = np.hstack((df_part1.values, df_part2.values))
        feat_names = list(df_part1.columns) + list(df_part2.columns)
        feat_names = feat_names[:-1] + ['smoker']
    return X, y, feat_names
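A minimal call sketch for _get_data; case3 relies only on scikit-learn's bundled diabetes dataset (the snippet's ds alias for sklearn.datasets is assumed), so no local CSV is needed:

X, y, feat_names = _get_data('case3')
print(X.shape, y.shape)
print(feat_names)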
Example #6
## Save output

np.savetxt("{}Ha{}_{}_{}_{}.csv".format(NMFreg_output, K, alpha, l1_ratio,
                                        random_state),
           Ha,
           delimiter=",")
np.savetxt("{}Wa{}_{}_{}_{}.csv".format(NMFreg_output, K, alpha, l1_ratio,
                                        random_state),
           Wa,
           delimiter=",")

Ha_norm = StandardScaler(with_mean=False).fit_transform(Ha)
Ha_norm = pd.DataFrame(Ha_norm)
Ha_norm['barcode'] = atlasdge.index.tolist()
maxloc = Ha_norm.drop('barcode', axis=1).values.argmax(axis=1)
cell_clusters['maxloc'] = maxloc
cell_clusters.head()

# In[21]:

#num_atlas_clusters = np.unique(cell_clusters['cluster']).size
#### Interpret these factors to cell type assignments #####
f = open("{}Log.txt".format(NMFreg_output), 'a')
f.write("Mapping Atlas Cells to Clusters\n")
f.close()

num_atlas_clusters = max(cell_clusters['cluster'])
bins = num_atlas_clusters
factor_to_celltype_df = pd.DataFrame(0,
                                     index=range(1, num_atlas_clusters + 1),
Example #7
pca_fit = pca.fit_transform(df)
# this is the explained variance ratio; it shows how much of the information each component explains
pca.explained_variance_ratio_

# working through an example

df = pd.read_csv("diabetes.csv", sep=",")
df = df.dropna()

dms = pd.get_dummies(df[['Age', 'DiabetesPedigreeFunction', 'Insulin']])
y = df["Outcome"]
# I need to drop the unreadable (missing) values

df

X_ = df.drop(['Outcome', 'Age', 'DiabetesPedigreeFunction', 'Insulin'],
             axis=1).astype('float64')
X = pd.concat([X_, dms[['DiabetesPedigreeFunction', 'Insulin']]], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42)

# fit the model
lgb_model = LGBMRegressor().fit(X_train, y_train)

lgb_model

# make predictions

y_pred = lgb_model.predict(X_test)
y_pred
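A short follow-up sketch for scoring the predictions above, assuming scikit-learn's metrics module; y_test and y_pred come from the split and model above:

import numpy as np
from sklearn.metrics import mean_squared_error

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Test RMSE:", rmse)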
Example #8
    a_rob.append(b_rob)

#df_num_norm['mahal_normal'] = a
df_num_norm['mahal_rob'] = a_rob

#df_v2['mahal_normal'] = a
df_v2['mahal_rob'] = a_rob

#a = pd.DataFrame(a)
a_rob = pd.DataFrame(a_rob)
#p = a.quantile(0.9)
p_rob = a_rob.quantile(0.9)
df_num_norm = df_num_norm[df_num_norm.mahal_rob < p_rob.iloc[0]]
df_v2 = df_v2[df_v2.mahal_rob < p_rob.iloc[0]]

df_num_norm = df_num_norm.drop(['mahal_rob'], axis=1)

# In[]

# Eigenvectors and eigenvalues of the sample covariance matrix
eig_vals, eig_vecs = np.linalg.eig(np.array(cov_df))
print('Eigenvectors \n%s' % eig_vecs)
print('\nEigenvalues \n%s' % eig_vals)

# Build a list of (eigenvalue, eigenvector) pairs
eig_pares = [(np.abs(eig_vals[i]), eig_vecs[:, i])
             for i in range(len(eig_vals))]

# Sort these pairs in descending order
eig_pares.sort(key=lambda x: x[0], reverse=True)
# Run KMeans
kmeans = KMeans(n_clusters=7).fit(users)
# Plot the KMeans results
df_labeled = pd.DataFrame(kmeans.labels_, columns=list(['labels']))
df_labeled['labels'] = df_labeled['labels'].astype('category')
plt.figure(figsize=(10, 8))
df_labeled['labels'].value_counts().plot.bar(color='y')
plt.xlabel("Cluster")
plt.ylabel("Number of customers")
plt.title("Number of customers per cluster")
#plt.show()
# Append the KMeans cluster labels to users
users = users.join(df_labeled)
# Plot the dendrogram
plt.figure(figsize=(20, 10))
merg = linkage(users.drop('labels', axis=1), method='ward')
dendrogram(merg, leaf_rotation=360)
plt.title('Dendrogram')
#plt.show()
# Define the hierarchical clustering
hier_clus = AgglomerativeClustering(n_clusters=5,
                                    affinity='euclidean',
                                    linkage='ward')
cluster = hier_clus.fit_predict(users.drop('labels', axis=1))
# Append the hierarchical clustering labels
users['Agg_label'] = cluster
# Plot for the hierarchical clustering
df_labeled = pd.DataFrame(hier_clus.labels_, columns=list(['labels']))
df_labeled['labels'] = df_labeled['labels'].astype('category')
plt.figure(figsize=(10, 8))
df_labeled['labels'].value_counts().plot.bar(color='y')
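The eigen-pairs built earlier in this snippet are not used further; a small sketch, reusing the eig_pares list from above, of how the sorted eigenvalues are typically turned into an explained-variance profile:

eig_vals_sorted = [val for val, _ in eig_pares]
total = sum(eig_vals_sorted)
var_exp = [val / total for val in eig_vals_sorted]   # variance share per component
cum_var_exp = np.cumsum(var_exp)                     # cumulative share
print(var_exp)
print(cum_var_exp)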
Example #10
# Backfill Volume data
###############################################################################
if 1:

    # Get historical Volume data
    volume, rat_volume = backfill_volume_with_singular(currencies, granularity,
                                                       _from, _to)
    volume.reset_index(inplace=True)
    volume = volume.rename({'index': 'timestamp'}, axis=1)
    volume.to_pickle(volume_path)

    # try standardized
    ss = StandardScaler().fit_transform(volume.iloc[:, 1:])
    ss = pd.DataFrame(ss, columns=volume.columns[1:], index=volume.index)
    ss.insert(0, 'timestamp', volume.timestamp)
    ss.drop('jpy', axis=1, inplace=True)
    ss.drop('hkd', axis=1, inplace=True)

    cu, ratios = backfill_with_singular(currencies, granularity, _from, _to)
    cu.reset_index(inplace=True)
    cu = cu.rename({'index': 'timestamp'}, axis=1)
    cu.to_pickle(cu_small_path)

    ratios.reset_index(inplace=True)
    ratios = ratios.rename({'index': 'timestamp'}, axis=1)

###############################################################################
# Update Volume and Diff
###############################################################################
if 0:
Example #11
def NMFreg(
    counts,
    coords,
    size,
    metacell_dict,
    gene_intersection,
    num_atlas_clusters,
    celltype_to_factor_dict,
    celltype_dict,
    plot_size_dict,
):

    puckcounts = counts[["barcode"] + gene_intersection]
    puckcounts = puckcounts.set_index(counts["barcode"])
    puckcounts = puckcounts.drop("barcode", axis=1)

    cell_totalUMI = np.sum(puckcounts, axis=1)
    puckcounts_cellnorm = np.divide(puckcounts, cell_totalUMI.values[:, None])
    puckcounts_scaled = StandardScaler(
        with_mean=False).fit_transform(puckcounts_cellnorm)

    XsT = puckcounts_scaled.T

    Hs_hat = []
    for b in tqdm(range(XsT.shape[1])):
        h_hat = scipy.optimize.nnls(WaT, XsT[:, b])[0]
        if b == 0:
            Hs_hat = h_hat
        else:
            Hs_hat = np.vstack((Hs_hat, h_hat))

    Hs = pd.DataFrame(Hs_hat)
    Hs["barcode"] = puckcounts.index.tolist()

    Hs_norm = StandardScaler(with_mean=False).fit_transform(
        Hs.drop("barcode", axis=1))

    Hs_norm = pd.DataFrame(Hs_norm)
    Hs_norm["barcode"] = puckcounts.index.tolist()

    maxloc_s = Hs_norm.drop("barcode", axis=1).values.argmax(axis=1)
    barcode_clusters = pd.DataFrame()
    barcode_clusters["barcode"] = Hs_norm["barcode"]
    barcode_clusters["max_factor"] = maxloc_s

    barcode_clusters["atlas_cluster"] = barcode_clusters["barcode"]

    for c in range(1, num_atlas_clusters + 1):
        condition = np.isin(barcode_clusters["max_factor"],
                            celltype_to_factor_dict[c])
        barcode_clusters["atlas_cluster"][condition] = c

    bead_deconv_df = Hs_norm.apply(
        lambda x: deconv_factor_to_celltype(
            row=x,
            adict=factor_to_celltype_dict,
            K=K,
            num_atlas_clusters=num_atlas_clusters,
        ),
        axis=1,
    )
    bead_deconv_df.insert(0, "barcode", Hs_norm["barcode"])
    bead_deconv_df.columns = ["barcode"
                              ] + (bead_deconv_df.columns[1:] + 1).tolist()
    bead_deconv_df = pd.DataFrame(bead_deconv_df)
    bead_deconv_df = bead_deconv_df.rename(columns=celltype_dict)

    maxloc_ct = bead_deconv_df.drop("barcode",
                                    axis=1).values.argmax(axis=1) + 1
    bead_maxct_df = pd.DataFrame()
    bead_maxct_df["barcode"] = bead_deconv_df["barcode"]
    bead_maxct_df["max_cell_type"] = maxloc_ct

    return Hs, Hs_norm, puckcounts, bead_deconv_df, barcode_clusters, bead_maxct_df
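The per-bead loop above solves one non-negative least-squares problem per column of XsT; a tiny standalone sketch of scipy.optimize.nnls on made-up numbers:

import numpy as np
from scipy.optimize import nnls

A = np.array([[1.0, 0.0], [1.0, 1.0], [0.0, 2.0]])   # design matrix (e.g. genes x factors)
b = np.array([1.0, 2.0, 3.0])                        # one observation vector
coef, residual = nnls(A, b)                          # coefficients constrained to be non-negative
print(coef, residual)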
Example #12
    # Create the target directory if it doesn't exist
    model_dirname = folderPath + 'model/'
    if not os.path.exists(model_dirname):
        os.mkdir(model_dirname)
        print("Directory", model_dirname, "created")
    else:
        print("Directory", model_dirname, "already exists")
    model.save(model_dirname + "model.h5")  # model_dirname already includes folderPath
    print("Saved model to disk")

    print('Preparing Testing data')
    X_test = pd.read_csv(dataPath + 'preprocessed_test.csv')
    y_train = pd.read_csv(dataPath + 'preprocessed_train_y.csv')
    # Remove ID
    X_test = X_test.drop('Id', axis=1)
    y_train = y_train.drop('Id', axis=1)
    # Divide the input into 2 groups
    X_related_test = X_test[['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']]
    X_effected_test = X_test.drop(['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt'], axis=1)
    # Scale Data
    X_test = StandardScaler().fit_transform(X_test)
    X_related_test = StandardScaler().fit_transform(X_related_test)
    X_effected_test = StandardScaler().fit_transform(X_effected_test)
    scaler = StandardScaler().fit(y_train)

    print('Predicting...')
    result = model.predict([X_related_test, X_effected_test])

    # Inverse transformation
    result = scaler.inverse_transform(result)
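A hedged variant of the scaling step above: the snippet fits fresh scalers on the test matrices, whereas a common alternative is to fit on the training matrices and reuse those scalers; X_related_train and X_effected_train are assumed to exist from the training phase and are not shown in this excerpt:

# Sketch only: the *_train frames are assumed from the training step above this excerpt
related_scaler = StandardScaler().fit(X_related_train)
effected_scaler = StandardScaler().fit(X_effected_train)
X_related_test = related_scaler.transform(X_related_test)
X_effected_test = effected_scaler.transform(X_effected_test)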
Example #13
## Standardising

X.describe()

features = X[X.columns[:5]]
features_standard = StandardScaler().fit_transform(
    features)  # Gaussian Standardisation
X = pd.DataFrame(features_standard,
                 columns=['pgc', 'np', 'age', 'dpf', 'bmi'])
features = X[X.columns]
features.describe()
Y = df['class']
Y.info()
Y.value_counts()

##### Logistic regression model
Xtrain, Xtest, ytrain, ytest = model_selection.train_test_split(
    X, Y, test_size=0.15, random_state=42)
Xtrain.info()
ytrain.value_counts()
ytest.value_counts()
lrmodel = linear_model.LogisticRegression()
lrmodel.fit(Xtrain, ytrain)
lrpredicted = lrmodel.predict(Xtest)
printresult(ytest, lrpredicted)
distances = lrmodel.decision_function(Xtest)
precision, recall, thresh = metrics.precision_recall_curve(ytest, distances)
plt.plot(thresh, precision[:-1], color="b")
plt.plot(thresh, recall[:-1], color="g")