Example #1
0
def point_point_plot(df, columns):
    """Save a default seaborn joint plot for every unordered column pair.

    Args:
        df: Data passed to ``sns.jointplot`` as ``data``.
        columns: Sequence of column names; each pair ``(columns[i],
            columns[j])`` with ``i < j`` is plotted once.

    One PNG per pair, named ``<x>--<y>.png``, is written into the folder
    held in ``out_dir`` (created through ``create_folder``).
    """
    out_dir = "outs\\point2point"
    create_folder(out_dir)
    for position, x_col in enumerate(columns):
        # Only pair with later columns so each pair is drawn once.
        for y_col in columns[position + 1:]:
            sns.jointplot(x=x_col, y=y_col, data=df)
            plt.savefig(out_dir + "\\{}--{}.png".format(x_col, y_col))
            plt.close()
Example #2
0
def hex_bin(df, columns):
    """Save a hexbin joint plot for every unordered column pair.

    Args:
        df: Data passed to ``sns.jointplot`` as ``data``.
        columns: Sequence of column names; pairs are taken with the first
            index strictly below the second.

    Each figure is saved as ``<x>--<y>.png`` inside the ``hex_bin`` output
    folder, which is created through ``create_folder``.
    """
    create_folder("outs\\hex_bin")
    count = len(columns)
    # Lazily enumerate pairs in the same (i, j) order as a nested index loop.
    pairs = ((columns[first], columns[second])
             for first in range(count)
             for second in range(first + 1, count))
    for x_name, y_name in pairs:
        sns.jointplot(x=x_name, y=y_name, data=df, kind="hex")
        target = "outs\\hex_bin\\{}--{}.png".format(x_name, y_name)
        plt.savefig(target)
        plt.close()
Example #3
0
def self_hist2d(df, columns):
    """Save a 50x50 2-D histogram of each column plotted against itself.

    Args:
        df: Frame indexed by column name.
        columns: Column names to plot, one figure per name.

    Bins with fewer than one sample are left blank (``cmin=1``). Each figure
    gets a colorbar and an x-axis label and is saved as ``<column>.png`` in
    the folder held in ``target_dir``.
    """
    target_dir = "outs\\self_hist2d"
    create_folder(target_dir)
    for name in columns:
        values = df[name]
        # The same data is used on both axes.
        plt.hist2d(values, values, (50, 50), cmin=1)
        plt.colorbar()
        plt.xlabel(name)
        plt.savefig("{}\\{}.png".format(target_dir, name))
        plt.close()
Example #4
0
def hist2d(df, columns):
    """Save a 50x50 2-D histogram for every unordered column pair.

    Args:
        df: Frame indexed by column name.
        columns: Sequence of column names; each pair with the first index
            below the second is plotted once.

    Bins with fewer than one sample are left blank (``cmin=1``). Figures are
    saved as ``<x>--<y>.png`` inside the folder held in ``folder``.
    """
    folder = "outs\\hist2d"
    create_folder(folder)
    for offset, x_name in enumerate(columns):
        for y_name in columns[offset + 1:]:
            plt.hist2d(df[x_name], df[y_name], (50, 50), cmin=1)
            plt.colorbar()
            file_name = "{}--{}.png".format(x_name, y_name)
            plt.savefig(folder + "\\" + file_name)
            plt.close()
Example #5
0
def pre_processing(df: DataFrame) -> DataFrame:
    """Clean *df*, persist every stage to ``information.sqlite``, return it.

    Steps, in order:
        1. Store the raw frame in table ``before_process``.
        2. Store the output of ``missing_data(df)`` in table
           ``missing_information``.
        3. Drop the columns ``status_id``, ``status_published`` and
           ``Column1`` .. ``Column4``, then drop every row containing a null.
        4. Separate outliers with ``drop_numerical_outliers`` and restrict
           the kept rows to ``main_columns``.
        5. Store the outliers in table ``outliers`` and the kept rows in
           table ``after_clear``.
        6. Run ``label_encode`` on the kept rows, robust-scale them and store
           the scaled copy in table ``information``.
        7. Store ``describe()`` of the kept rows in table ``describe`` and
           write their dtypes to ``dtypes.txt`` in the ``outs`` folder.

    Args:
        df: Raw input frame. It must contain the columns dropped in step 3
            and those listed in ``main_columns``.

    Returns:
        The cleaned frame restricted to ``main_columns`` (not the scaled
        copy that is written to the ``information`` table).
    """
    # A single manager/connection serves every write below; all tables live
    # in the same database file, so extra connections are unnecessary.
    sql_manager = SqlManager("information.sqlite")
    conn = sql_manager.conn

    df.to_sql(name="before_process", con=conn, if_exists="replace")
    missing_data_df = missing_data(df)
    missing_data_df.to_sql(name="missing_information",
                           con=conn,
                           if_exists="replace")

    df = df.drop(columns=[
        "status_id", "status_published", 'Column1', "Column2", "Column3",
        "Column4"
    ])
    main_df = df.dropna()
    print(main_df.shape)

    # NOTE(review): the earlier docstring described this as an IQR-based
    # filter; confirm against drop_numerical_outliers.
    outliers_df, main_df = drop_numerical_outliers(main_df)
    main_df = main_df[main_columns]
    outliers_df.to_sql(name="outliers",
                       con=conn,
                       if_exists="replace",
                       index=False)
    main_df.to_sql(name="after_clear",
                   con=conn,
                   if_exists="replace",
                   index=False)

    # The return value is discarded, so label_encode presumably modifies
    # main_df in place - TODO confirm.
    label_encode(main_df)
    scaled_df = DataFrame(preprocessing.robust_scale(main_df),
                          columns=main_columns)
    scaled_df.to_sql(name="information",
                     con=conn,
                     if_exists="replace",
                     index=False)
    print(main_df.shape)

    main_df.describe().to_sql(name="describe",
                              con=conn,
                              if_exists='replace')
    create_folder("outs")
    with open("outs\\dtypes.txt", "w") as file:
        file.write(str(main_df.dtypes))
    return main_df
Example #6
0
def density(df, columns):
    """Save a KDE joint plot for every unordered column pair.

    Args:
        df: Data passed to ``sns.jointplot`` as ``data``.
        columns: Sequence of column names; each pair with the first index
            below the second is plotted once.

    Figures are saved as ``<x>--<y>.png`` inside the ``density`` output
    folder, which is created through ``create_folder``.
    """
    create_folder("outs\\density")
    name_template = "outs\\density\\{}--{}.png"
    for start, x_col in enumerate(columns, 1):
        for y_col in columns[start:]:
            grid = sns.jointplot(x=x_col, y=y_col, data=df, kind="kde")
            # A scatter overlay on the joint axes was left disabled here:
            # grid.plot_joint(plt.scatter, c="w", s=30, linewidth=1, marker="+")
            # grid.ax_joint.collections[0].set_alpha(0)
            plt.savefig(name_template.format(x_col, y_col))
            plt.close()
Example #7
0
def db_scan_plots(df):
    """Save a cluster-coloured scatter plot for every unordered column pair.

    The column names come from the ``columns`` name in the enclosing scope.

    Args:
        df: Frame holding every column in ``columns`` plus a ``cluster``
            column, which is used as the point colour.

    Figures are saved as ``<x>---<y>.png`` in the ``MainDBSCAN`` output
    folder, which is created through ``create_folder``.
    """
    create_folder("outs\\MainDBSCAN")
    for i in range(len(columns)):
        for j in range(i + 1, len(columns)):
            print(columns[i], "   ", columns[j])
            plt.scatter(df[columns[i]], df[columns[j]], c=df["cluster"])
            # Label through the Axes object. Assigning to plt.xlabel /
            # plt.ylabel (as this code previously did) rebinds the pyplot
            # functions instead of setting any label.
            axes = plt.gca()
            axes.set_xlabel(columns[i])
            axes.set_ylabel(columns[j])
            plt.savefig("outs\\MainDBSCAN\\{}---{}.png".format(columns[i], columns[j]))
            plt.close()
Example #8
0
def k_means_plots(df, centers):
    """Save cluster scatter plots with centres for every column pair.

    The column names come from the ``columns`` name in the enclosing scope.

    Args:
        df: Frame holding every column in ``columns`` plus a ``cluster``
            column, which is used as the point colour.
        centers: Iterable of cluster centres. Each centre is indexed with the
            same positions ``i`` / ``j`` used to index ``columns``.

    Centres are drawn as red ``+`` markers. Figures are saved as
    ``<x>---<y>.png`` in the ``MainKMeans`` output folder.
    """
    create_folder("outs\\MainKMeans")
    for i in range(len(columns)):
        for j in range(i + 1, len(columns)):
            print(columns[i], "   ", columns[j])
            plt.scatter(df[columns[i]], df[columns[j]], c=df["cluster"])
            x_centers = [center[i] for center in centers]
            y_centers = [center[j] for center in centers]
            plt.scatter(x_centers, y_centers, c="r", marker="+", s=200)
            # Label through the Axes object. Assigning to plt.xlabel /
            # plt.ylabel (as this code previously did) rebinds the pyplot
            # functions instead of setting any label.
            axes = plt.gca()
            axes.set_xlabel(columns[i])
            axes.set_ylabel(columns[j])
            plt.savefig("outs\\MainKMeans\\{}---{}.png".format(
                columns[i], columns[j]))
            plt.close()
Example #9
0
def db_scan_each_2_columns(df):
    """Run DBSCAN on every column pair separately and save the scatter plots.

    The column names come from the ``columns`` name in the enclosing scope.
    For each pair, a copy of those two columns is clustered with a
    default-constructed ``DBSCAN`` and the points are coloured by label.

    Args:
        df: Frame holding every column in ``columns``. It is not modified;
            labels are stored on a per-pair copy.

    Figures are saved as ``<x>---<y>.png`` in the ``DBSCAN_each2columns``
    output folder.
    """
    plt.close()
    create_folder("outs\\DBSCAN_each2columns")

    for i in range(len(columns)):
        for j in range(i + 1, len(columns)):
            print(columns[i], "  ", columns[j])
            samples = df[[columns[i], columns[j]]].copy()
            db_scan = DBSCAN()
            db_scan.fit(samples)
            samples["cluster"] = db_scan.labels_
            plt.scatter(samples[columns[i]], samples[columns[j]], c=samples["cluster"])
            # Label through the Axes object. Assigning to plt.xlabel /
            # plt.ylabel (as this code previously did) rebinds the pyplot
            # functions instead of setting any label.
            axes = plt.gca()
            axes.set_xlabel(columns[i])
            axes.set_ylabel(columns[j])
            plt.savefig("outs\\DBSCAN_each2columns\\{}---{}.png".format(columns[i], columns[j]))
            plt.close()
Example #10
0
def pie_plots(columns_name):
    """Save a pie chart of value frequencies for each named column.

    For every column, the ``information`` table is queried through
    ``sql_manager.crs.execute`` for each distinct value and its count; the
    counts become the wedge sizes and the values the wedge labels.

    Args:
        columns_name: Iterable of column names. Each name is interpolated
            directly into the SQL text.

    Each chart is saved as ``<column>.png`` in the ``pie_plots`` output
    folder.
    """
    query_template = (
        "select distinct {0},count({0}) from information group by {0}")
    for column in columns_name:
        rows = sql_manager.crs.execute(query_template.format(column)).fetchall()
        wedge_labels = [row[0] for row in rows]
        wedge_sizes = [row[1] for row in rows]
        _, axis = plt.subplots()
        axis.pie(wedge_sizes,
                 labels=wedge_labels,
                 autopct='%1.1f%%',
                 shadow=True,
                 startangle=90)
        # Equal aspect ratio ensures that the pie is drawn as a circle.
        axis.axis('equal')
        create_folder("outs\\pie_plots")
        plt.savefig("outs\\pie_plots\\{}.png".format(column))
        plt.close()
Example #11
0
def agglomerative_each_2_columns(df, k):
    """Cluster every column pair agglomeratively and save the scatter plots.

    The column names come from the ``columns`` name in the enclosing scope.
    For each pair, a copy of those two columns is clustered with
    ``AgglomerativeClustering(n_clusters=k)`` and coloured by label.

    Args:
        df: Frame holding every column in ``columns``. It is not modified;
            labels are stored on a per-pair copy.
        k: Number of clusters passed as ``n_clusters``.

    Figures are saved as ``<x>---<y>.png`` in the
    ``agglomerative_each2columns`` output folder.
    """
    plt.close()
    create_folder("outs\\agglomerative_each2columns")

    for i in range(len(columns)):
        for j in range(i + 1, len(columns)):
            print(columns[i], "  ", columns[j])
            samples = df[[columns[i], columns[j]]].copy()
            agglomerative = AgglomerativeClustering(n_clusters=k)
            agglomerative.fit(samples)
            samples["cluster"] = agglomerative.labels_
            plt.scatter(samples[columns[i]],
                        samples[columns[j]],
                        c=samples["cluster"])
            # Label through the Axes object. Assigning to plt.xlabel /
            # plt.ylabel (as this code previously did) rebinds the pyplot
            # functions instead of setting any label.
            axes = plt.gca()
            axes.set_xlabel(columns[i])
            axes.set_ylabel(columns[j])
            plt.savefig("outs\\agglomerative_each2columns\\{}---{}.png".format(
                columns[i], columns[j]))
            plt.close()
Example #12
0
def k_means_each_2_columns(df, k):
    """Run KMeans on every column pair separately and save the scatter plots.

    The column names come from the ``columns`` name in the enclosing scope.
    For each pair, a copy of those two columns is clustered with
    ``KMeans(n_clusters=k, random_state=5)``. Points are coloured by label
    and the fitted centres are drawn as red ``+`` markers.

    Args:
        df: Frame holding every column in ``columns``. It is not modified;
            labels are stored on a per-pair copy.
        k: Number of clusters passed as ``n_clusters``.

    Figures are saved as ``<x>---<y>.png`` in the ``KMeans_each2columns``
    output folder.
    """
    plt.close()
    create_folder("outs\\KMeans_each2columns")

    for i in range(len(columns)):
        for j in range(i + 1, len(columns)):
            print(columns[i], "  ", columns[j])
            samples = df[[columns[i], columns[j]]].copy()
            k_means = KMeans(n_clusters=k, random_state=5)
            k_means.fit(samples)
            samples["cluster"] = k_means.labels_
            plt.scatter(samples[columns[i]],
                        samples[columns[j]],
                        c=samples["cluster"])
            # The model was fitted on two columns only, so positions 0 and 1
            # of each centre are read as its x and y coordinates.
            centers = k_means.cluster_centers_
            x_centers = [center[0] for center in centers]
            y_centers = [center[1] for center in centers]
            plt.scatter(x_centers, y_centers, c="r", marker="+", s=200)
            # Label through the Axes object. Assigning to plt.xlabel /
            # plt.ylabel (as this code previously did) rebinds the pyplot
            # functions instead of setting any label.
            axes = plt.gca()
            axes.set_xlabel(columns[i])
            axes.set_ylabel(columns[j])
            plt.savefig("outs\\KMeans_each2columns\\{}---{}.png".format(
                columns[i], columns[j]))
            plt.close()
Example #13
0
def diff(df, cols):
    """Save a histogram of the row-to-row differences of each column.

    Args:
        df: Frame indexed by column name.
        cols: Iterable of column names, one figure per name.

    Each figure is saved as ``<column>.png`` in the folder held in
    ``folder``, which is (re)requested through ``create_folder`` on every
    iteration.
    """
    folder = "outs\\diff_hists"
    for name in cols:
        deltas = df[name].diff()
        deltas.hist()
        create_folder(folder)
        plt.savefig(folder + "\\{}.png".format(name))
        plt.close()
Example #14
0
def boxes(columns_name, df):
    """Save a box plot for each named column of *df*.

    Args:
        columns_name: Iterable of column names, one figure per name.
        df: Frame indexed by column name.

    Each figure is saved as ``<column>.png`` in the folder held in
    ``folder``, which is (re)requested through ``create_folder`` on every
    iteration.
    """
    folder = "outs\\boxes"
    for name in columns_name:
        series = df[name]
        series.plot.box()
        create_folder(folder)
        plt.savefig("{}\\{}.png".format(folder, name))
        plt.close()