def classify_hazardous(datasets, clf, crossval=False): xtrain, ytrain, xtest, ytest = ld.get_learndata(datasets) # map(normalize_dataset, [xtrain, xtest]) # clf = KNeighborsClassifier(n_neighbors=n_neighbors) print "clf:", clf fit_predict(xtrain, ytrain, xtest, ytest, clf) if crossval: print "init cross validation..." xdata, ydata = ld.get_learndata(datasets, split=False) k_fold = cross_validation.KFold(n=len(xdata), n_folds=3) for tr, ts in k_fold: xtrain, ytrain = xdata[tr], ydata[tr] xtest, ytest = xdata[ts], ydata[ts] # map(normalize_dataset, [xtrain, xtest]) xtrain, s_ = ld.normalize_dataset(xtrain) xtest, s_ = ld.normalize_dataset(xtest) fit_predict(xtrain, ytrain, xtest, ytest, clf) print "done." # if plotclf: # haz_real, nohaz_real = map(normalize_dataset, # [datasets[i][:, :-1] for i in [2,3]]) # vd.plot_classifier(xtrain, clf, num=200, haz=haz_real, figsize=figsize, # nohaz=nohaz_real, labels=['Perihelion distance (q)', # 'Argument of periapsis (w)']) return xtrain, clf
def split_by_clf(clf, cutcol, haz_train, nohaz_train, haz_test=None,
                 nohaz_test=None, verbose=True):
    """Split the test datasets by a classifier trained on the train datasets.

    The classifier is trained on the *_train sets restricted to *cutcol*
    (normalized over the common bounds of all four cut sets) and then used
    to predict the *_test sets.

    :param clf: classifier with the scikit-learn fit/predict interface.
    :param cutcol: columns of the dataframes used as features.
    :param haz_train, nohaz_train: hazardous / non-hazardous training frames.
    :param haz_test, nohaz_test: test frames; default to copies of the
        corresponding training frames when omitted.
    :param verbose: forwarded to clf_split_quality.
    :returns: ((haz_test_1, nohaz_test_1), (haz_test_0, nohaz_test_0), scales)
        — the test rows predicted as class 1 and class 0 respectively, plus
        the common normalization scales of each column.
    """
    haz_test = deepcopy(haz_train) if haz_test is None else haz_test
    nohaz_test = deepcopy(nohaz_train) if nohaz_test is None else nohaz_test
    haz_train_cut, nohaz_train_cut = ld.cut_params(haz_train, nohaz_train, cutcol)
    haz_test_cut, nohaz_test_cut = ld.cut_params(haz_test, nohaz_test, cutcol)
    # normalize every subset over the same bounds so train and test live in
    # the same feature space
    bounds = ld.common_bounds(
        [haz_train_cut, nohaz_train_cut, haz_test_cut, nohaz_test_cut])
    haz_train_cut, haz_train_sc = ld.normalize_dataset(haz_train_cut, bounds)
    nohaz_train_cut, nohaz_train_sc = ld.normalize_dataset(
        nohaz_train_cut, bounds)
    haz_test_cut, haz_test_sc = ld.normalize_dataset(haz_test_cut, bounds)
    nohaz_test_cut, nohaz_test_sc = ld.normalize_dataset(
        nohaz_test_cut, bounds)
    scales = ld.common_scales(
        [haz_train_sc, nohaz_train_sc, haz_test_sc, nohaz_test_sc])
    xtrain, ytrain = ld.mix_up(haz_train_cut, nohaz_train_cut)
    clf = clf.fit(xtrain, ytrain)
    predicted = clf_split_quality(clf, haz_test_cut, nohaz_test_cut,
                                  verbose=verbose)
    # indices of test rows predicted as class 1 / class 0 in each test frame
    haz_1, nohaz_1, haz_0, nohaz_0 = predicted
    haz_test_1 = haz_test.iloc[haz_1]
    nohaz_test_1 = nohaz_test.iloc[nohaz_1]
    haz_test_0 = haz_test.iloc[haz_0]
    nohaz_test_0 = nohaz_test.iloc[nohaz_0]
    return (haz_test_1, nohaz_test_1), (haz_test_0, nohaz_test_0), scales
def sgmask_clf2d_fit(clf, cutcol, inner, outer, scales):
    """ Fits classifier to separate asteroids belonging to the subgroup
    from the rest of asteroids. """
    xcol, ycol = cutcol
    (xmin, xmax), (ymin, ymax) = scales[0], scales[1]

    def _clip(df):
        # restrict the frame to the cut columns and to the rectangle
        # spanned by the scales, then return it as a plain matrix
        sel = df[cutcol]
        sel = sel[(sel[xcol] >= xmin) & (sel[xcol] <= xmax)]
        sel = sel[(sel[ycol] >= ymin) & (sel[ycol] <= ymax)]
        return sel.as_matrix()

    # normalize both sets over the same bounds derived from the scales
    bounds = np.asarray(scales).T
    inner_mat, _insc = ld.normalize_dataset(_clip(inner), bounds=bounds)
    outer_mat, _outsc = ld.normalize_dataset(_clip(outer), bounds=bounds)

    # label subgroup members with 1 and everything else with 0
    labeled_inner = np.append(
        inner_mat, np.reshape(np.ones(len(inner_mat)), (len(inner_mat), 1)),
        axis=1)
    labeled_outer = np.append(
        outer_mat, np.reshape(np.zeros(len(outer_mat)), (len(outer_mat), 1)),
        axis=1)

    # shuffle the combined set before fitting
    pool = np.random.permutation(np.concatenate((labeled_inner, labeled_outer)))
    xtrain, ytrain = ld.split_by_lastcol(pool)
    return clf.fit(xtrain, ytrain)
def sgmask_clf(hazdf, nohazdf, hazdf_rest, nohazdf_rest, clf, cutcol):
    """ Fits classifier to separate asteroids belonging to the subgroup
    from the rest of asteroids. """
    # subgroup members (label 1): all rows of hazdf + nohazdf
    df = pd.concat((hazdf, nohazdf))
    x, y = cutcol[0], cutcol[1]
    # bounding rectangle of the subgroup in the two cut columns
    xmin, xmax = min(df[x]), max(df[x])
    ymin, ymax = min(df[y]), max(df[y])
    datacut = df[cutcol].as_matrix()
    datacut, scales = ld.normalize_dataset(datacut)
    ndata = len(datacut)
    # append a column of ones: class label 1 = "in subgroup"
    sgincol = np.reshape(np.ones(ndata), (ndata, 1))
    datacut_ = np.append(datacut, sgincol, axis=1)
    # rest of asteroids (label 0), clipped to the subgroup's bounding box
    rest = pd.concat((hazdf_rest, nohazdf_rest))
    rest = rest[rest[x] >= xmin]
    rest = rest[rest[x] <= xmax]
    rest = rest[rest[y] >= ymin]
    rest = rest[rest[y] <= ymax]
    restcut = rest[cutcol].as_matrix()
    # NOTE(review): restcut is normalized with its OWN min/max, not with the
    # subgroup's bounds — the two classes end up in different coordinate
    # spaces, unlike sgmask_clf2d_fit which shares `bounds` for both sets.
    # Also, `scales` from the first normalize_dataset call is clobbered here.
    # Looks like a bug — confirm against sgmask_clf2d_fit before relying on it.
    restcut, scales = ld.normalize_dataset(restcut)
    nrest = len(restcut)
    # append a column of zeros: class label 0 = "outside subgroup"
    sgoutcol = np.reshape(np.zeros(nrest), (nrest, 1))
    restcut_ = np.append(restcut, sgoutcol, axis=1)
    # shuffle the combined labeled set and fit the classifier on it
    data_rest = np.concatenate((datacut_, restcut_))
    data_rest = np.random.permutation(data_rest)
    xtrain, ytrain = ld.split_by_lastcol(data_rest)
    clf = clf.fit(xtrain, ytrain)
    # c = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    # c1i = np.where(c==1)[0]
    # c0i = np.where(c==0)[0]
    return clf
def density_clusters(data_x, eps=0.015, min_samples=100, plotclusters=False,
                     figsize=(10, 10)):
    """Find density-based clusters in the data with DBSCAN.

    :param data_x: samples; normalized by ld.normalize_dataset before
        clustering, so *eps* is expressed in normalized units.
    :param eps: DBSCAN neighborhood radius (previously tried values:
        0.015/100 and 0.021/160 for eps/min_samples).
    :param min_samples: DBSCAN core-point threshold.
    :param plotclusters: if True, scatter-plot the clusters; noise points
        (label -1) are drawn in white, core points slightly larger.
    :param figsize: figure size used when plotting.
    :returns: the fitted DBSCAN instance.
    """
    data_x_norm, scales = ld.normalize_dataset(data_x)
    dbsc = DBSCAN(eps=eps, min_samples=min_samples).fit(data_x_norm)
    labels = dbsc.labels_
    unique_labels = np.unique(labels)
    # boolean mask of the core samples found by DBSCAN
    core_samples = np.zeros_like(labels, dtype=bool)
    core_samples[dbsc.core_sample_indices_] = True
    colors_ = vd.get_colorlist(len(unique_labels))
    if plotclusters:
        plt.figure(figsize=figsize)
        for (label, color) in zip(unique_labels, colors_):
            if label == -1:
                color = "white"  # noise
            class_member_mask = (labels == label)
            # core members of the cluster, drawn bigger
            xy = data_x_norm[class_member_mask & core_samples]
            plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=color,
                     markersize=5)
            # border (non-core) members of the cluster
            xy2 = data_x_norm[class_member_mask & ~core_samples]
            plt.plot(xy2[:, 0], xy2[:, 1], 'o', markerfacecolor=color,
                     markersize=4)
        plt.show()
    return dbsc
def extract_dbclusters(data, dens_layers, verbose=False):
    """ Iteratively finds density-based clusters in the data with DBSCAN.
    Continues the search for clusters in the outliers left after the
    previous iteration if the dens_layers parameter contains several
    (eps, min_samples) pairs.

    Returns a list of clusters, each with an extra last column holding a
    unique integer cluster ID; the final entry holds the leftover outliers
    under the last ID. """
    data_ = deepcopy(data)
    # next free cluster ID; incremented after every DBSCAN pass
    level = 1
    extracted_clusters = []
    for eps, min_samples in dens_layers:
        # densclust = density_clusters(data_, eps=eps, min_samples=min_samples)
        data_norm, scales = ld.normalize_dataset(data_)
        densclust = DBSCAN(eps=eps, min_samples=min_samples).fit(data_norm)
        max_ind = max(densclust.labels_)
        # pull out every cluster found at this density layer, tagging each
        # with a globally unique ID column (i + level)
        for i in range(max_ind + 1):
            clusters_ind = np.where(densclust.labels_ == i)[0]
            extracted = data_[clusters_ind]
            id_col = (i + level) * np.ones((len(extracted), 1), dtype=int)
            extracted = np.append(extracted, id_col, axis=1)
            extracted_clusters.append(extracted)
        # keep only the DBSCAN outliers (label -1) for the next, denser pass
        rest_ind = np.where(densclust.labels_ == -1)[0]
        data_ = data_[rest_ind]
        level += (max(densclust.labels_) + 1)
    # whatever never joined a cluster is stored under the final ID
    id_col = (level + 0) * np.ones((len(data_), 1), dtype=int)
    extracted = np.append(data_, id_col, axis=1)
    extracted_clusters.append(extracted)
    data_ = []
    if verbose:
        print "extracted_clusters\n [ID, number of elements]"
        for ec in extracted_clusters:
            print[ec[0][-1], len(ec)]
    return extracted_clusters