Example #1
    def dbscan(self, min_samples, eps=0.5, window=None):
        """Cluster tickers on their fundamental data with DBSCAN and
        return the candidate pairs in each cluster.

        Requires: itertools, numpy as np, pandas as pd,
        sklearn.cluster.DBSCAN, sklearn.preprocessing.StandardScaler.
        """
        self.window = window
        # Disabled alternative: cluster on normalised price data instead.
        # self.tickers = window.snp_live_tickers
        # flat_data = self.average_over_time(window)
        # self.normalised = StandardScaler().fit_transform(flat_data)
        fundamental = self.window.get_fundamental()

        def get_ticker(index_list):
            # Collect the unique tickers (first element of each index
            # entry), preserving the order of first appearance.
            result = []
            for i in index_list:
                ticker = i[0]
                if ticker not in result:
                    result.append(ticker)
            return result

        fundamental_tickers = get_ticker(fundamental.index)
        # Reshape to one row per ticker (7 fundamental features each),
        # standardise the features, and drop tickers with missing values.
        num_of_feature = 7
        fundamental_reshaped = fundamental.values.reshape(-1, num_of_feature)
        fundamental_normalised = StandardScaler().fit_transform(
            fundamental_reshaped)
        fundamental_normalised = pd.DataFrame(fundamental_normalised,
                                              index=fundamental_tickers)
        fundamental_normalised = fundamental_normalised.dropna()
        self.tickers = fundamental_normalised.index
        dbscan = DBSCAN(eps=eps,
                        min_samples=min_samples).fit(fundamental_normalised)

        # Cluster labels (-1 marks noise) and a boolean mask of the
        # core samples found by DBSCAN.
        self.dbscan_labels = labels = dbscan.labels_
        core_samples_mask = np.zeros_like(dbscan.labels_, dtype=bool)
        core_samples_mask[dbscan.core_sample_indices_] = True
        self.dbscan_core_indices = dbscan.core_sample_indices_
        self.dbscan_core_length = len(dbscan.core_sample_indices_)
        self.dbscan_core_mask = core_samples_mask

        # Number of clusters, excluding the noise label (-1).
        self.unique_labels = set(labels)
        self.n_clusters = n_clusters = len(
            set(labels)) - (1 if -1 in labels else 0)
        self.n_noise = list(labels).count(-1)
        self.noise = np.where(labels == -1)[0]

        # Build every within-cluster pair; report tickers rather than
        # positional indices when a window was supplied.
        clusters = {}
        for j in range(n_clusters):
            pairs = []
            for i in itertools.combinations(np.where(labels == j)[0], 2):
                pair = (i[0], i[1])
                if window is not None:
                    pair = (self.tickers[i[0]], self.tickers[i[1]])
                pairs.append(pair)
            clusters[j] = pairs
        pair_count = sum(len(pairs) for pairs in clusters.values())
        print('total pairs: ', pair_count)
        return clusters
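
A minimal sketch of how eps might be chosen before calling this method, assuming a standalone standardised feature matrix like fundamental_normalised above (suggest_eps and its percentile cutoff are illustrative, not part of the original class). The k-distance heuristic sorts each point's distance to its k-th nearest neighbour; the elbow of that curve is a common eps choice for DBSCAN.

import numpy as np
from sklearn.neighbors import NearestNeighbors

def suggest_eps(features, k=4):
    # Ask for k+1 neighbours because each query point is returned as
    # its own nearest neighbour at distance zero.
    nn = NearestNeighbors(n_neighbors=k + 1).fit(features)
    distances, _ = nn.kneighbors(features)
    kth_distances = np.sort(distances[:, -1])
    # Crude stand-in for an elbow read off the plot: take a high
    # percentile of the sorted k-distances.
    return float(np.percentile(kth_distances, 90))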
Example #2
import pandas as pd
from sklearn import neighbors
from sklearn.preprocessing import StandardScaler


def fill_knn(df, train_cols, label_cols):
    """Impute missing values in label_cols in place, using a KNN
    regressor trained on the standardised train_cols."""
    # Disabled: optionally drop constant columns before training.
    # del_cols = []
    # for col in train_cols:
    #     if (df[col] == df[col][0]).all():
    #         del_cols.append(col)
    # for col in del_cols:
    #     train_cols.remove(col)
    # Standardise the training features; columns with any missing
    # values are dropped so the KNN only sees complete features.
    data = StandardScaler().fit_transform(df.loc[:, train_cols].values)
    data = pd.DataFrame(data, columns=train_cols)
    data.dropna(axis=1, how='any', inplace=True)
    knn_train_cols = [col for col in train_cols if col in data.columns]
    for col in label_cols:
        # Fit on the rows where the label is present ...
        data_train = data.loc[df[col].notnull().tolist(),
                              knn_train_cols].values
        data_label = df.loc[df[col].notnull().tolist(), col].values
        knn = neighbors.KNeighborsRegressor()
        knn.fit(data_train, data_label)
        # ... and predict it for the rows where it is missing.
        df.loc[df[col].isnull(),
               col] = knn.predict(data.loc[df[col].isnull().tolist(),
                                           knn_train_cols].values)
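
A hypothetical usage sketch (the toy frame is mine, not from the original): impute a column with gaps from two complete numeric columns. Note fill_knn mutates df in place, and the default KNeighborsRegressor needs at least 5 non-null training rows.

import numpy as np
import pandas as pd

df = pd.DataFrame({
    'a': [1., 2., 3., 4., 5., 6., 7., 8.],
    'b': [2., 4., 6., 8., 10., 12., 14., 16.],
    'c': [1., np.nan, 3., 4., 5., np.nan, 7., 8.],
})
fill_knn(df, train_cols=['a', 'b'], label_cols=['c'])
print(df['c'])  # the two NaNs are now KNN estimates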
Example #3
# Choosing the optimal number of components
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

pca = PCA().fit(df)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Variance Ratio")
plt.show()
print(pca.explained_variance_ratio_)

# Final model
pca = PCA(n_components=3)
pca_fit = pca.fit_transform(df)
# The explained variance ratio shows how much of the information in the
# data the selected components still capture.
print(pca.explained_variance_ratio_)
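
A short sketch of the same component-selection idea done automatically, assuming the numeric df used above: passing a float in (0, 1) as n_components tells scikit-learn's PCA to keep just enough components to explain that share of the variance.

from sklearn.decomposition import PCA

pca_auto = PCA(n_components=0.95)  # keep components covering 95% of variance
reduced = pca_auto.fit_transform(df)
print(pca_auto.n_components_, "components retained")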

# Worked example

df = pd.read_csv("diabetes.csv", sep=",")
df = df.dropna()

# Note: get_dummies leaves purely numeric columns unchanged, so dms is
# simply a copy of these three columns.
dms = pd.get_dummies(df[['Age', 'DiabetesPedigreeFunction', 'Insulin']])
y = df["Outcome"]
# The unreadable (missing) values need to be dropped, done above.

print(df.head())

X_ = df.drop(['Outcome', 'Age', 'DiabetesPedigreeFunction', 'Insulin'],
             axis=1).astype('float64')
X = pd.concat([X_, dms[['DiabetesPedigreeFunction', 'Insulin']]], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42)
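
A possible continuation, not in the original snippet: fit a baseline classifier on the split, chaining the PCA step into a Pipeline so the reduction is learned on the training fold only (the logistic-regression choice is an assumption for illustration).

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

model = Pipeline([
    ('pca', PCA(n_components=3)),           # same reduction as above
    ('clf', LogisticRegression(max_iter=1000)),
])
model.fit(X_train, y_train)
print("test accuracy:", model.score(X_test, y_test))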