# Module-level imports assumed by this method (not shown in the original snippet).
import itertools

import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler


def dbscan(self, min_samples, eps=None, window=None):
    self.window = window
    # Alternative path (disabled): cluster on price data averaged over the window.
    # self.tickers = window.snp_live_tickers
    # flat_data = self.average_over_time(window)
    # self.normalised = StandardScaler().fit_transform(flat_data)

    fundamental = self.window.get_fundamental()

    def get_ticker(index_list):
        # Collect the unique tickers (first element of each index entry), preserving order.
        result = []
        for i in index_list:
            ticker = i[0]
            if ticker not in result:
                result.append(ticker)
        return result

    fundamental_tickers = get_ticker(fundamental.index)

    # Reshape to one row of 7 fundamental features per ticker and standardise.
    num_of_feature = 7
    fundamental_reshaped = fundamental.values.reshape(-1, num_of_feature)
    fundamental_normalised = StandardScaler().fit_transform(fundamental_reshaped)
    fundamental_normalised = pd.DataFrame(fundamental_normalised,
                                          index=fundamental_tickers)
    fundamental_normalised = fundamental_normalised.dropna()
    self.tickers = fundamental_normalised.index

    # Note: eps must be supplied by the caller; DBSCAN will not accept eps=None.
    dbscan = DBSCAN(eps=eps, min_samples=min_samples).fit(fundamental_normalised)
    self.dbscan_labels = labels = dbscan.labels_

    core_samples_mask = np.zeros_like(labels, dtype=bool)
    core_samples_mask[dbscan.core_sample_indices_] = True
    self.dbscan_core_indices = dbscan.core_sample_indices_
    self.dbscan_core_length = len(dbscan.core_sample_indices_)
    self.dbscan_core_mask = core_samples_mask

    self.unique_labels = set(labels)
    self.n_clusters = n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    self.n_noise = list(labels).count(-1)
    self.noise = np.where(labels == -1)[0]

    # Enumerate every pair of members within each (non-noise) cluster.
    clusters = {}
    for j in range(n_clusters):
        pairs = []
        for i in itertools.combinations(np.where(labels == j)[0], 2):
            pair = (i[0], i[1])
            if window is not None:
                # Map positional indices back to ticker symbols.
                pair = (self.tickers[i[0]], self.tickers[i[1]])
            pairs.append(pair)
        clusters[j] = pairs

    pair_count = sum(len(pairs) for pairs in clusters.values())
    print('total pairs: ', pair_count)
    return clusters
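# A standalone sketch of the same flow (standardise the fundamentals, run
# DBSCAN, then list every ticker pair inside each non-noise cluster) on
# synthetic data. The ticker names, feature matrix and the eps/min_samples
# values below are illustrative assumptions, not taken from the class above.
import itertools

import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
tickers = [f"TICK{i}" for i in range(10)]
# Synthetic "fundamental" matrix: 10 tickers x 7 features.
fundamentals = pd.DataFrame(rng.normal(size=(10, 7)), index=tickers)

scaled = StandardScaler().fit_transform(fundamentals)
labels = DBSCAN(eps=2.5, min_samples=2).fit(scaled).labels_

clusters = {}
for label in set(labels) - {-1}:          # -1 marks noise points
    members = np.where(labels == label)[0]
    clusters[label] = [(tickers[a], tickers[b])
                       for a, b in itertools.combinations(members, 2)]
print(clusters)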
import pandas as pd
from sklearn import neighbors
from sklearn.preprocessing import StandardScaler


def fill_knn(df, train_cols, label_cols):
    """Impute missing values in label_cols with a KNN regressor trained on train_cols."""
    # Originally disabled: drop training columns that are constant.
    # del_cols = []
    # for col in train_cols:
    #     if (df[col] == df[col][0]).all():
    #         del_cols.append(col)
    # for col in del_cols:
    #     train_cols.remove(col)

    # Standardise the training columns and drop any that still contain NaNs.
    data = StandardScaler().fit_transform(df.loc[:, train_cols].values)
    data = pd.DataFrame(data, columns=train_cols)
    data.dropna(axis=1, how='any', inplace=True)
    knn_train_cols = [col for col in train_cols if col in data.columns]

    for col in label_cols:
        # Fit on the rows where the label column is present...
        data_train = data.loc[df[col].notnull().tolist(), knn_train_cols].values
        data_label = df.loc[df[col].notnull().tolist(), col].values
        knn = neighbors.KNeighborsRegressor()
        knn.fit(data_train, data_label)
        # ...and fill the missing entries in place with the predictions.
        df.loc[df[col].isnull(), col] = knn.predict(
            data.loc[df[col].isnull().tolist(), knn_train_cols].values)
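# A minimal usage sketch for fill_knn on a toy frame; the column names and
# values here are illustrative assumptions. Note that the default
# KNeighborsRegressor needs at least 5 non-missing rows per label column.
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    "a": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
    "b": [2.0, 1.0, 4.0, 3.0, 6.0, 5.0, 8.0],
    "c": [1.5, 1.5, np.nan, 3.5, 5.5, np.nan, 7.5],  # column with gaps to impute
})
fill_knn(toy, train_cols=["a", "b"], label_cols=["c"])
print(toy)  # the NaNs in "c" are now filled with KNN predictions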
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

# Optimal number of components: plot the cumulative explained variance.
pca = PCA().fit(df)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Variance Ratio")
pca.explained_variance_ratio_

# Final model with the chosen number of components.
pca = PCA(n_components=3)
pca_fit = pca.fit_transform(df)
# The explained variance ratio shows how much of the information
# the retained components capture.
pca.explained_variance_ratio_

# Worked example.
df = pd.read_csv("diabetes.csv", sep=",")
df = df.dropna()
dms = pd.get_dummies(df[['Age', 'DiabetesPedigreeFunction', 'Insulin']])
y = df["Outcome"]
# The unusable values need to be dropped before building the feature matrix.
df
X_ = df.drop(['Outcome', 'Age', 'DiabetesPedigreeFunction', 'Insulin'],
             axis=1).astype('float64')
X = pd.concat([X_, dms[['DiabetesPedigreeFunction', 'Insulin']]], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.25,
                                                    random_state=42)
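# A follow-on sketch, assuming the X_train/X_test split above: standardise the
# features, keep 3 principal components (matching PCA(n_components=3) earlier),
# and fit a simple classifier. LogisticRegression is an illustrative choice,
# not part of the original snippet.
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
pca = PCA(n_components=3).fit(scaler.transform(X_train))

X_train_pca = pca.transform(scaler.transform(X_train))
X_test_pca = pca.transform(scaler.transform(X_test))

model = LogisticRegression(max_iter=1000).fit(X_train_pca, y_train)
print("test accuracy:", accuracy_score(y_test, model.predict(X_test_pca)))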