Esempio n. 1
0
def pull_merged_data(feature=None,
                     drop_outliers=True,
                     force_reread=False,
                     report=False) -> pd.DataFrame:
    """Return the merged wine + fruit table, optionally filtered.

    Parameters
    ----------
    feature
        Optional column name. When truthy, only ``[feature] + PARAM`` is
        returned and rows containing missing values are dropped. When
        falsy, the whole table is returned unchanged.
    drop_outliers
        When true, keep only rows with ``DH2 < 135`` and ``DH1 > 90``.
    force_reread
        When true, rebuild the table from ``WineData`` and ``FruitData``
        and overwrite ``Merged.xlsx`` under ``projectroot``; otherwise the
        table is read back from that file.
    report
        When true, print the columns, the ``describe()`` summary and the
        dtypes of the table (before any outlier filtering).

    Returns
    -------
    pd.DataFrame
        The (possibly filtered) merged table.
    """
    merged_path = projectroot + "Merged.xlsx"

    if force_reread:
        # rename() without inplace returns a new frame, so the column
        # selection taken from the raw wine table is never mutated.
        wdf = WineData().raw[["COUNTY", "YEAR"] + PARAM].rename(
            columns={"COUNTY": "MEGYE", "YEAR": "EV"})
        fdf = FruitData().raw[["MEGYE", "EV"] + PARAM]

        df = pd.concat((wdf, fdf))
        # NOTE(review): the index is written too, so the cached read path
        # below presumably yields an extra unnamed index column -- confirm.
        df.to_excel(merged_path)
    else:
        df = pd.read_excel(merged_path)

    if report:
        print(df.columns)
        print()
        print(df.describe())
        print()
        print(df.dtypes)

    if drop_outliers:
        # Single combined mask; equivalent to filtering on DH2 and then DH1.
        df = df[(df["DH2"] < 135) & (df["DH1"] > 90)]

    if feature:
        return df[[feature] + PARAM].dropna()
    return df
Esempio n. 2
0
def load_dataset(feature, dset=None):
    """Load a standardized feature matrix and the matching label vector.

    Parameters
    ----------
    feature
        Column of the transformed ``FruitData`` used as the target ``y``.
    dset
        Which feature table to use: ``"volatile"``, ``"isotope"`` or
        ``None`` for the ``X`` attribute of the data object.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is centred and scaled column-wise
        (``(X - mean) / std`` along axis 0) and ``y`` holds the values of
        ``feature``.

    Raises
    ------
    KeyError
        If ``dset`` is not one of the supported keys.
    """
    df = FruitData(transform=True)
    # Map the selector to an attribute name so that only the requested
    # table is converted (the previous dict literal converted all three).
    attribute = {"volatile": "volatile", "isotope": "isotope", None: "X"}[dset]
    # ``.values`` replaces ``as_matrix()``, which newer pandas no longer has.
    X = getattr(df, attribute).values
    X = (X - X.mean(axis=0)) / X.std(axis=0)
    y = df[feature].values
    return X, y
Esempio n. 3
0
def xperiment_euclidean_distance(sugar="maize"):
    """Print a distance-based sugar content estimate for every sample.

    For each sample returned by ``get_sample_xs()`` the Euclidean distance
    to the centre of its fruit species (``d1``) and to the centre of the
    given sugar (``d2``) is computed; the printed percentage is
    ``d1 / (d1 + d2)``.

    Parameters
    ----------
    sugar
        Name handed to ``sugar_parameters``; the first element of its
        result is used as the sugar centre. Also used in the printed label.
    """
    def euclidean(sample, reference):
        # Sum the squared differences *before* taking the square root;
        # taking the root first would yield the sum of absolute
        # differences instead of the Euclidean distance.
        return np.sqrt(((sample - reference) ** 2).sum())

    sugar_center = sugar_parameters(sugar)[0]
    samples = get_sample_xs()
    fruit = FruitData()
    for smplnm in sorted(samples):
        species, dh1, d13c = samples[smplnm]
        # NOTE(review): the +0.6 / -0.6 shifts are kept exactly as they
        # were; their rationale is not visible here -- confirm with author.
        fruit_center = fruit(species).mean(axis=0) + 0.6

        s = np.array([dh1, d13c]) - 0.6

        d1 = euclidean(s, fruit_center)
        d2 = euclidean(s, sugar_center)

        print(f"{smplnm} ({species}) {sugar} content: {d1 / (d1+d2):>.4%}")
Esempio n. 4
0
def xperiment_twoclass_svm():
    """Fit a fruit-vs-maize linear SVM per sample and print its verdict.

    For every sample from ``get_sample_xs()`` a two-class linear ``SVC`` is
    trained on the ``DH1``/``D13C`` columns of the fruit isotope table
    (labelled with the sample's species) together with the rows from
    ``fake_maize_data()`` (labelled ``"Kukorica"``). The two class
    probabilities and the predicted label for the sample are printed as one
    table row.
    """
    print("SMPL     PROB      PROB     PREDICTION")
    print("--------------------------------------")
    maize = fake_maize_data()
    samples = get_sample_xs()

    # The training features do not depend on the sample, so load and stack
    # them once instead of on every iteration. ``.values`` replaces
    # ``as_matrix()``, which newer pandas no longer has.
    fruit = FruitData().isotope[["DH1", "D13C"]].values
    X = np.concatenate((fruit, maize))

    for smplnm in sorted(samples):
        species, dh1, d13c = samples[smplnm]

        # Only the label of the fruit rows changes between samples.
        Y = np.array([species] * len(fruit) + ["Kukorica"] * len(maize))

        svm = SVC(kernel="linear", probability=True)
        svm.fit(X, Y)

        sX = np.array([[dh1, d13c]])
        probs = svm.predict_proba(sX).ravel()
        pred = svm.predict(sX).ravel()
        print("{}    {:.2%}    {:.2%}    {:>10}".format(
            smplnm, probs[0], probs[1], pred[0]))
Esempio n. 5
0
def plot_transform(trname, feature, ndim):
    """Plot the transformed fruit data in 2 or 3 latent dimensions.

    Parameters
    ----------
    trname
        Transformation name forwarded to ``transform``.
    feature
        Column of the transformed ``FruitData`` used as the grouping label;
        its upper-cased name appears in the plot title.
    ndim
        Number of latent dimensions; must be 2 or 3.

    Raises
    ------
    ValueError
        If ``ndim`` is neither 2 nor 3. This is checked before any data is
        loaded or transformed.
    """
    # Pick the plotter first so an invalid ``ndim`` fails fast.
    if ndim == 2:
        scatter_cls, n_axes = Scatter2D, 2
    elif ndim == 3:
        scatter_cls, n_axes = Scatter3D, 3
    else:
        raise ValueError(f"Invalid ndim: {ndim}")

    df = FruitData(transform=True)
    X, Y = df.X, df[feature]
    lX = transform(X, trname, ndim, Y)
    # ``.values`` replaces ``as_matrix()``, which newer pandas no longer has.
    labels = Y.values
    F, p = manova(X.values, labels)

    scat = scatter_cls(
        lX,
        labels,
        axlabels=[f"LatentFactor{i}" for i in range(1, n_axes + 1)],
        title=f"Transformation: {feature.upper()}\nF = {F:.4f}; p = {p:.4f}"
    )
    scat.split_scatter(show=True)
Esempio n. 6
0
from csxdata.visual import spiderplot
from csxdata.utilities.vectorop import standardize

from SciProjects.fruits.fruitframe import FruitData

# Load the fruit data without transformation (transform=False).
df = FruitData(transform=False)
X = df.volatile
Y = df["FAMILIA"]

# Standardize the volatile table, then hand it to the spider-plot helper
# together with the FAMILIA labels and the column names (ncols=3).
standardized = standardize(X)
spiderplot.split_gridlike(standardized, Y, X.columns, ncols=3)
Esempio n. 7
0
from csxdata.stats import correlation
from csxdata.stats import normaltest
from csxdata.visual.histogram import fullplot

from SciProjects.fruits.fruitframe import FruitData


# Work on the volatile table of the transformed fruit data.
frame = FruitData(transform=True)
X = frame.volatile

# Run the correlation and normality helpers on the whole table,
# passing the column labels as names.
correlation(X, names=X.columns)
normaltest.full(X, names=X.columns)

# One ``fullplot`` call per column, labelled with the column name.
for column in X:
    fullplot(X[column], paramname=column)