def pull_merged_data(feature=None, drop_outliers=True, force_reread=False, report=False) -> pd.DataFrame:
    """Return the merged wine + fruit table, optionally filtered and narrowed.

    Parameters
    ----------
    feature
        Column to keep next to the ``PARAM`` columns. When truthy, only
        ``[feature] + PARAM`` is returned and rows containing NaN are
        dropped. When falsy, the whole frame is returned as-is.
    drop_outliers
        Keep only the rows with ``DH2 < 135`` and ``DH1 > 90``.
    force_reread
        Rebuild the table from ``WineData`` and ``FruitData`` and overwrite
        ``Merged.xlsx``; otherwise the existing ``Merged.xlsx`` is read.
    report
        Print the columns, the ``describe()`` summary and the dtypes of the
        table (before any outlier filtering).
    """
    merged_path = projectroot + "Merged.xlsx"

    if force_reread:
        wine = WineData()
        # Bring the wine key columns in line with the fruit table's names.
        wine_part = wine.raw[["COUNTY", "YEAR"] + PARAM].rename(
            columns={"COUNTY": "MEGYE", "YEAR": "EV"}
        )
        fruit = FruitData()
        fruit_part = fruit.raw[["MEGYE", "EV"] + PARAM]
        df = pd.concat((wine_part, fruit_part))
        # NOTE(review): written and re-read with pandas defaults; the index
        # column presumably comes back as an extra column on the cached
        # path - confirm whether callers rely on that.
        df.to_excel(merged_path)
    else:
        df = pd.read_excel(merged_path)

    if report:
        print(df.columns, end="\n\n")
        print(df.describe(), end="\n\n")
        print(df.dtypes)

    if drop_outliers:
        inlier = (df["DH2"] < 135) & (df["DH1"] > 90)
        df = df[inlier]

    if not feature:
        return df
    return df[[feature] + PARAM].dropna()
def load_dataset(feature, dset=None):
    """Load a standardized feature matrix and a target vector from FruitData.

    Parameters
    ----------
    feature
        Key used to index the ``FruitData`` object for the target values.
    dset
        Which block of columns to use as inputs: ``"volatile"``,
        ``"isotope"`` or ``None`` (the ``X`` attribute).

    Returns
    -------
    (X, y)
        ``X`` is the selected block with every column centred on its mean
        and divided by its standard deviation; ``y`` holds the values of
        ``feature``.

    Raises
    ------
    KeyError
        If ``dset`` is not one of the three accepted values.
    """
    frame = FruitData(transform=True)
    # Select by attribute name so only the requested block is converted;
    # an unknown `dset` still raises KeyError, as the original dict lookup did.
    block_attr = {"volatile": "volatile", "isotope": "isotope", None: "X"}[dset]
    # `.values` replaces `.as_matrix()`, which pandas removed in 1.0.
    X = getattr(frame, block_attr).values
    # NOTE(review): a constant column has zero std and would yield NaN/inf
    # here - confirm the data cannot contain one.
    X = (X - X.mean(axis=0)) / X.std(axis=0)
    y = frame[feature].values
    return X, y
def xperiment_euclidean_distance(sugar="maize"):
    """Print a distance-based ``sugar`` content estimate for every sample.

    For each sample returned by ``get_sample_xs()`` the distance to the
    mean of its species' fruit data (``d1``) and to the reference centre of
    ``sugar`` (``d2``) is computed, and ``d1 / (d1 + d2)`` is printed as a
    percentage.

    Parameters
    ----------
    sugar
        Name passed to ``sugar_parameters``; the first element of its return
        value is used as the reference centre.
    """

    def euclidean(sample, reference):
        # Square root of the SUM of squared differences. The original took
        # the root element-wise before summing, which yields the sum of
        # absolute differences (Manhattan distance) instead.
        return np.sqrt(((sample - reference) ** 2).sum())

    sugar_center = sugar_parameters(sugar)[0]
    samples = get_sample_xs()
    fruit = FruitData()
    for smplnm in sorted(samples):
        species, dh1, d13c = samples[smplnm]
        # NOTE(review): the +0.6 / -0.6 offsets are kept from the original;
        # their purpose is not visible in this file - confirm.
        fruit_center = fruit(species).mean(axis=0) + 0.6
        s = np.array([dh1, d13c]) - 0.6
        d1 = euclidean(s, fruit_center)
        d2 = euclidean(s, sugar_center)
        print(f"{smplnm} ({species}) {sugar} content: {d1 / (d1+d2):>.4%}")
def xperiment_twoclass_svm():
    """Fit a linear two-class SVM per sample and print its class probabilities.

    The training inputs are the ``DH1``/``D13C`` isotope columns of
    ``FruitData`` stacked on top of ``fake_maize_data()``. For every sample
    from ``get_sample_xs()`` the fruit rows are labelled with that sample's
    species and the maize rows with ``"Kukorica"``; a fresh ``SVC`` is
    fitted and the sample's two class probabilities and predicted label are
    printed as one table row.
    """
    print("SMPL PROB PROB PREDICTION")
    print("--------------------------------------")
    maize = fake_maize_data()
    samples = get_sample_xs()
    # The feature matrix does not depend on the sample, so load and stack it
    # once instead of rebuilding FruitData on every iteration.
    # `.values` replaces `.as_matrix()`, which pandas removed in 1.0.
    fruit = FruitData().isotope[["DH1", "D13C"]].values
    X = np.concatenate((fruit, maize))
    for smplnm in sorted(samples):
        species, dh1, d13c = samples[smplnm]
        # NOTE(review): every fruit row gets this sample's species as its
        # label, whatever species the row belongs to - confirm this is
        # intended.
        Y = np.array([species] * len(fruit) + ["Kukorica"] * len(maize))
        svm = SVC(kernel="linear", probability=True)
        svm.fit(X, Y)
        sX = np.array([[dh1, d13c]])
        # Assuming SVC is scikit-learn's: probability columns follow the
        # order of `svm.classes_`, not the order the labels were built in.
        probs = svm.predict_proba(sX).ravel()
        pred = svm.predict(sX).ravel()
        print("{} {:.2%} {:.2%} {:>10}".format(
            smplnm, probs[0], probs[1], pred[0]))
def plot_transform(trname, feature, ndim):
    """Scatter-plot FruitData after a transformation into 2 or 3 dimensions.

    Parameters
    ----------
    trname
        Transformation name, forwarded to ``transform``.
    feature
        Key used to index ``FruitData`` for the labels; its upper-cased
        form appears in the plot title.
    ndim
        Number of output dimensions; must be 2 or 3.

    Raises
    ------
    ValueError
        If ``ndim`` is neither 2 nor 3. This is now checked before any data
        is loaded or transformed, instead of after.
    """
    if ndim == 2:
        scatter_cls, n_axes = Scatter2D, 2
    elif ndim == 3:
        scatter_cls, n_axes = Scatter3D, 3
    else:
        raise ValueError(f"Invalid ndim: {ndim}")

    df = FruitData(transform=True)
    X, Y = df.X, df[feature]
    lX = transform(X, trname, ndim, Y)
    # `.values` replaces `.as_matrix()`, which pandas removed in 1.0.
    labels = Y.values
    # The statistic shown in the title is computed on X, not on lX.
    F, p = manova(X.values, labels)

    scat = scatter_cls(
        lX, labels,
        axlabels=[f"LatentFactor{i}" for i in range(1, n_axes + 1)],
        title=f"Transformation: {feature.upper()}\nF = {F:.4f}; p = {p:.4f}"
    )
    scat.split_scatter(show=True)
from csxdata.visual import spiderplot
from csxdata.utilities.vectorop import standardize
from SciProjects.fruits.fruitframe import FruitData

# Script: grid of spider plots of the standardized volatile columns,
# with the FAMILIA column passed as the grouping labels.
df = FruitData(transform=False)
X = df.volatile
Y = df["FAMILIA"]

standardized = standardize(X)
column_names = X.columns
spiderplot.split_gridlike(standardized, Y, column_names, ncols=3)
from csxdata.stats import correlation, normaltest
from csxdata.visual.histogram import fullplot
from SciProjects.fruits.fruitframe import FruitData

# Script: correlation analysis, normality tests and one histogram plot per
# column for the volatile block of the transformed FruitData.
frame = FruitData(transform=True)
X = frame.volatile

correlation(X, names=X.columns)
normaltest.full(X, names=X.columns)

# Iterating the frame yields its column labels.
for column in X:
    fullplot(X[column], paramname=column)