def plotit(what):
    """Scatter the chosen isotope feature ("DHI" or D13C) against the Y coordinate.

    Drops NaN rows and categories with fewer than 5 members before plotting.
    """
    feature = DHI if what == "DHI" else D13C
    stacked = np.stack((Y_C, feature), axis=1)
    xs, ys = dropna(stacked, country_codes)
    xs, ys = drop_lowNs(xs, ys, threshold=5)
    category_frequencies(ys)
    plot(xs, ys, axlabels=["Y", what])
def xperiment():
    """Run LDA on the merged dataset for FEATURE and scatter the 2D projection.

    Pulls the merged data, drops sparse categories (< 10 members), prints
    diagnostics (frequencies, correlation, pairwise T2, MANOVA), then fits a
    2-component LDA and plots the transformed points coloured by category.
    """
    df = pull_merged_data(feature=FEATURE).dropna()
    # .values instead of .as_matrix(): as_matrix was deprecated in pandas 0.23
    # and removed in 1.0; .values is equivalent and works on all versions.
    X, Y = df[PARAM].values, df[FEATURE].values
    inspection.category_frequencies(Y)
    Y, X = drop_lowNs(10, Y, X)
    inspection.correlation(X, names=PARAM)
    pairwise_T2(X, Y, dumproot=projectroot, xpid=f"PairwiseT2_{FEATURE}.xlsx")
    F, p = manova(X, Y)
    print("-"*50)
    lda = LDA(n_components=2).fit(X, Y)  # type: LDA
    smexvar = lda.explained_variance_ratio_
    scat = scatter.Scatter2D(
        lda.transform(X), Y,
        title=f"LDA ({smexvar.sum():.2%})\nMANOVA: F = {F:.4f}, p = {p:.4f}",
        axlabels=[f"Latent0{i} ({ev:.2%})"
                  for i, ev in enumerate(smexvar, start=1)],
    )
    # With many classes a legend becomes unreadable: label class centers instead.
    is_many = len(np.unique(Y)) > 5
    scat.split_scatter(legend=not is_many, show=True, center=is_many, label=is_many)
def inspect_classes():
    """Print class frequencies, correlations, and normality diagnostics,
    then show a full histogram plot for every feature column."""
    from csxdata.stats import normaltest, inspection
    from csxdata.visual.histogram import fullplot

    # Feature names: Y0..Y9 followed by P0..P9.
    names = [letter + str(digit) for letter in "YP" for digit in range(10)]
    X, Y = load_dataset(as_matrix=False, as_string=True)
    inspection.category_frequencies(Y)
    inspection.correlation(X, names=names)
    normaltest.full(X, names=names)
    for label, column in zip(names, X.T):
        fullplot(column, label)
def main():
    """Normality diagnostics on the merged fruit-wine datasets: per-parameter
    skew, histogram and normal-probability plot, on a 3x2 figure grid."""
    df = pull_merged_data("MEGYE")
    print()
    inspection.category_frequencies(df["MEGYE"])
    normaltest.full(df[PARAM], names=PARAM)
    # inspection.correlation(df[PARAM], names=PARAM)
    print()
    fig, axarr = plt.subplots(3, 2, figsize=(5, 10))
    # One row of axes per parameter: histogram on the left, prob-plot on the right.
    for param, (hist_ax, prob_ax) in zip(PARAM, axarr):
        series = df[param]
        print(f"SKEW of {param}: {series.skew()}")
        histogram.Histogram(series, ax=hist_ax).plot(axtitle=f"{param} histogram")
        histogram.NormProb(series, ax=prob_ax).plot(axtitle=f"{param} Norm. prob. plot")
    plt.suptitle("Normality test on the merged fruit-wine datasets")
    plt.tight_layout()
    plt.show()
def xperiment(transform, ndim):
    """Project the yearly dataset into `ndim` latent dimensions with the named
    transform and show a split scatter plot annotated with MANOVA results.

    Raises ValueError for any `ndim` other than 2 or 3.
    """
    X, Y = read_datasets(ycol="YEAR", dropthresh=10)
    category_frequencies(Y)
    F, p = manova(X, Y)
    X = standardize(X)
    model = get_transformator(ndim, transform)
    lX = model.fit_transform(X, Y)
    expvar = model.explained_variance_ratio_[:ndim]
    plottitle = (f"{transform.upper()} ({sum(expvar):.2%})"
                 f"\nMANOVA F = {F:.4f}, p = {p:.4f}")
    axlabels = [f"Latent0{i+1} ({expvar[i]:.2%})" for i in range(ndim)]
    if ndim not in (2, 3):
        raise ValueError(f"Unsupported dimensionality: {ndim}")
    scatter_type = Scatter2D if ndim == 2 else Scatter3D
    scat = scatter_type(lX, Y, title=plottitle, axlabels=axlabels)
    scat.split_scatter(legend=True, show=True)
import numpy as np
from csxdata.stats.inspection import category_frequencies
from csxdata.utilities.highlevel import plot
from csxdata.utilities.parser import parse_csv
# NOTE(review): drop_lowNs is imported but never used in this script.
from csxdata.utilities.vectorop import drop_lowNs, dropna
from SciProjects.sophie import projectroot

# Parse the geographic dataset: 4 independent (label) columns, 1 header row.
X, Y, head = parse_csv(projectroot + "01GEO.csv", indeps=4, headers=1, decimal=True)
# Last label column: presumably the Y (north-south) coordinate — TODO confirm.
y_coord = Y[:, -1].astype(float)
# First label column: the category code used for frequency reporting.
categ = Y[:, 0]
DHI, D13C = X.T
# Assemble a 3-column matrix (DHI, D13C, Y) and drop rows containing NaNs.
plotme = np.stack((DHI, D13C, y_coord), axis=1)
plotme, categ = dropna(plotme, categ)
category_frequencies(categ)
# NOTE(review): unlike the sibling scripts, plot() is called without the
# category labels — confirm whether `categ` should be the second argument.
plot(plotme, axlabels=["DHI", "D13C", "Y"])
from SciProjects.sophie import projectroot
from csxdata.utilities.parser import parse_csv
from csxdata.utilities.vectorop import dropna
from csxdata.stats.inspection import category_frequencies, correlation
from csxdata.stats.normaltest import full

# Parse the geographic dataset: 2 independent (label) columns, 1 header row.
X, Y, head = parse_csv(projectroot + "01GEO.csv", indeps=2, headers=1, decimal=True)
# NOTE(review): frequencies are reported *before* the NaN drop here, unlike
# sibling scripts which report after — confirm which ordering is intended.
category_frequencies(Y)
X, Y = dropna(X, Y)
# Correlation matrix over the four measured features, then full normality tests.
correlation(X, ["X", "Y", "DH1", "DH2"])
full(X)
def category_frequencies():
    """Report the frequency of each category in the module-level Y."""
    # Import under an alias so the helper does not shadow this function's name.
    from csxdata.stats.inspection import category_frequencies as _catfreq
    _catfreq(Y)
import numpy as np
from csxdata.utilities.highlevel import plot
from csxdata.stats.inspection import category_frequencies
from SciProjects.zsindstat.util import pull_data, axlab_latex


# NOTE(review): filter_out is defined here but not used below.
def filter_out(X, Y, unwanted):
    """Return X, Y restricted to the rows whose label differs from `unwanted`."""
    keep = np.argwhere(Y != unwanted).ravel()
    return X[keep], Y[keep]


# Pull the "Pru" (Prunus) family subset and scatter it with 2-sigma ellipses.
frame = pull_data("FRUIT", filterby="FAM", selection="Pru")
category_frequencies(frame.indeps)
plot(frame.data, frame.indeps, ellipse_sigma=2, axlabels=axlab_latex)
def plotit3d():
    """3D scatter of (Y coordinate, DHI, D13C), coloured by country code,
    after dropping rows that contain NaNs."""
    stacked = np.stack((Y_C, DHI, D13C), axis=1)
    xs, ys = dropna(stacked, country_codes)
    category_frequencies(ys)
    plot(xs, ys, axlabels=["Y", "DHI", "D13C"])
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from SciProjects.fruits.xperiment05.util import pull_data
from csxdata.visual import scatter
from csxdata.utilities.vectorop import drop_lowNs
from csxdata.stats import manova, inspection

FEATURE = "EV"

df = pull_data(FEATURE)
# .values instead of .as_matrix(): as_matrix was deprecated in pandas 0.23
# and removed in 1.0; .values is equivalent and works on all versions.
X = df[["DH1", "DH2", "D13C"]].values
Y = df[FEATURE].values
# Drop categories with fewer than 15 observations before the LDA fit.
Y, X = drop_lowNs(15, Y, X)
inspection.category_frequencies(Y)
lX = LDA(n_components=2).fit_transform(X, Y)
# Label the p-value explicitly — the original format string printed the second
# MANOVA value with no "p =" prefix, unlike the sibling experiment scripts.
title = "LDA\nMANOVA F = {:.4f}, p = {:.4f}".format(*manova(X, Y))
scat = scatter.Scatter2D(lX, Y, title=title,
                         axlabels=[f"Latent0{i}" for i in range(1, 3)])
scat.split_scatter(show=True)
import numpy as np
from scipy import stats
from matplotlib import pyplot as plt
from csxdata.visual import Plotter2D
from csxdata.utilities.vectorop import dropna
from csxdata.stats.inspection import category_frequencies
from SciProjects.sophie import pull_data, axtitles

# Correlate (D/H)I with the distance from the equator (Y coordinate) for the
# European subset, coloured by country code.
X_C, Y_C, DHI, D13C, CCode = pull_data("04GEO_eu.csv")
DHI, Y_C, CCode = dropna(DHI, Y_C, CCode)
category_frequencies(CCode)
# Spearman rank correlation between the isotope ratio and the Y coordinate.
R, p = stats.spearmanr(DHI, Y_C)
# Least-squares regression line of DHI on Y_C.
# NOTE(review): `line` is computed but never used below — confirm whether an
# overlay of the fitted line on the scatter plot is missing.
line = np.polyfit(Y_C, DHI, 1)
line = np.poly1d(line)
# Plot title (Hungarian): "Correlation between (D/H)I and the distance from
# the equator in Europe"; the second row reports the Spearman statistics and
# whether they are significant at the conventional p < 0.05 level.
ttl = (
    "Korreláció $(D/H)_I$ és az egyenlítőtől való távolság között Európában",
    f"Spearman-korreláció: R = {R:.2f}, p = {p:.2f}, {('nem' if p > 0.05 else '')}szignifikáns"
)
# Axis labels: "Distance from the equator" vs. the project's DHI axis title.
axttl = ["Egyenlítőtől való távolság", axtitles["DHI"]]
plotter = Plotter2D(plt.figure(), np.stack((Y_C, DHI), axis=1), CCode,
                    title="\n".join(ttl), axlabels=axttl)
plotter.split_scatter(center=True, sigma=2, alpha=0.5)
# Fix: the script imported `f_oneway` but calls `f_classif` below, which would
# raise NameError at runtime (and `f_oneway` is not a public export of
# sklearn.feature_selection). Import the function that is actually used.
from sklearn.feature_selection import f_classif
from csxdata.stats import inspection
from csxdata.utilities.highlevel import plot, transform
from SciProjects.zsindstat.util import pull_data, axlab_latex

# Sour-cherry ("meggy") samples, labelled by vintage year.
frame = pull_data("YEAR", filterby="FRUIT", selection="meggy")
inspection.category_frequencies(frame.indeps)
X = frame.learning
plot(X, frame.indeps, axlabels=axlab_latex, ellipse_sigma=2)
# One-factor LDA projection, then an ANOVA F-test on the latent axis.
tX = transform(X, factors=1, get_model=False, method="lda", y=frame.indeps)
print("F: {}, pval: {}".format(*f_classif(tX, frame.indeps)))