Esempio n. 1
0
def saveListAsExcel(list,
                    outputDirectory,
                    fileName,
                    outputColumnNames,
                    targeName=""):
    """Write a list of rows to an Excel workbook.

    The rows are wrapped in a pandas DataFrame labelled with
    ``outputColumnNames`` and written to the sheet ``_Sheet1`` of the file
    ``<outputDirectory>/<du.timeStamped()><targeName>_<fileName>.xlsx``.
    The destination path is also logged at debug level.

    Args:
        list: Row data accepted by ``pd.DataFrame``. The name shadows the
            builtin but is kept so existing keyword callers keep working.
        outputDirectory: Directory the workbook is written to.
        fileName: Base name of the workbook, without the extension.
        outputColumnNames: Column labels for the DataFrame.
        targeName: Optional text inserted between the ``du.timeStamped()``
            prefix and the file name.
    """
    outputDataframe = pd.DataFrame(list, columns=outputColumnNames)

    # Build the path exactly once: calling du.timeStamped() separately for
    # the writer and for the log message could yield two different values,
    # making the log point at a file that was never written.
    outputPath = (outputDirectory + "/" + du.timeStamped() + targeName +
                  "_" + fileName + '.xlsx')
    du.getLogger().debug("Saving to " + outputPath)

    # The context manager saves and closes the workbook even if to_excel
    # raises; ExcelWriter.save() no longer exists in pandas >= 2.0.
    with ExcelWriter(outputPath) as writer:
        outputDataframe.to_excel(writer, sheet_name='_Sheet1')
Esempio n. 2
0
def sparsityPlot(X, outputDirectory, title="figure"):
    """Save a sparsity-pattern plot of ``X`` as a PNG file.

    A new figure is created at five times its default size, ``X`` is drawn
    with ``spy`` on a single subplot, and the result is written to
    ``<outputDirectory>/<du.timeStamped()><title>_figure.png``.

    Args:
        X: Matrix handed to ``spy`` (presumably a 2-D dense or sparse
            array; confirm against the callers).
        outputDirectory: Directory the PNG is written to.
        title: Text inserted between the ``du.timeStamped()`` prefix and
            the ``_figure.png`` suffix.
    """
    from matplotlib import pyplot as plt

    fig = figure()
    defaultWidth, defaultHeight = fig.get_size_inches()
    fig.set_size_inches((defaultWidth * 5, defaultHeight * 5))
    try:
        axes = fig.add_subplot(111)
        axes.spy(X, precision=1, markersize=0.1, marker=".", aspect="auto")
        fig.savefig(outputDirectory + "/" + du.timeStamped() + title +
                    "_figure.png")
    finally:
        # Figures registered with pyplot stay alive until closed explicitly;
        # release this one so repeated calls do not accumulate figures.
        plt.close(fig)
Esempio n. 3
0
def plotClusterCentroids(kmeans, outputDirectory, title="Cluster_Centroids"):
    """Save a scatter plot of the first two centroid coordinates as a PNG.

    Column 0 of ``kmeans.cluster_centers_`` is plotted against column 1 in
    black on a figure five times the default size. The image is written to
    ``<outputDirectory>/<du.timeStamped()><title>.png``.

    Args:
        kmeans: Object exposing a 2-D ``cluster_centers_`` array with at
            least two columns (presumably a fitted scikit-learn KMeans;
            confirm against the callers).
        outputDirectory: Directory the PNG is written to.
        title: Text inserted between the ``du.timeStamped()`` prefix and
            the ``.png`` extension.
    """
    from matplotlib import pyplot as plt

    fig = figure()
    defaultWidth, defaultHeight = fig.get_size_inches()
    fig.set_size_inches((defaultWidth * 5, defaultHeight * 5))
    try:
        axes = fig.add_subplot(111)
        centroids = kmeans.cluster_centers_
        # Only the first two dimensions of each centroid are shown.
        axes.scatter(centroids[:, 0], centroids[:, 1], color='black')
        fig.savefig(outputDirectory + "/" + du.timeStamped() + title + ".png")
    finally:
        # Figures registered with pyplot stay alive until closed explicitly;
        # release this one so repeated calls do not accumulate figures.
        plt.close(fig)
Esempio n. 4
0
def elbow(X, outputDirectory, title, fromCount, toCount):
    """Plot the k-means inertia for a range of cluster counts and save it.

    One ``KMeans`` model is fitted to ``X`` for every ``k`` in
    ``range(fromCount, toCount)`` (``toCount`` itself is excluded) and its
    ``inertia_`` is recorded. The resulting curve is written to
    ``<outputDirectory>/<du.timeStamped()><title>_figure.png``.

    Args:
        X: Training data passed unchanged to ``KMeans.fit``.
        outputDirectory: Directory the PNG is written to.
        title: Text inserted between the ``du.timeStamped()`` prefix and
            the ``_figure.png`` suffix.
        fromCount: First number of clusters to try (inclusive).
        toCount: Upper bound on the number of clusters (exclusive).
    """
    from sklearn.cluster import KMeans
    from matplotlib import pyplot as plt

    # Shared by the fitting loop and the x axis so the two cannot drift apart.
    clusterCounts = range(fromCount, toCount)
    distorsions = [KMeans(n_clusters=k).fit(X).inertia_
                   for k in clusterCounts]

    fig = plt.figure(figsize=(15, 5))
    try:
        plt.plot(clusterCounts, distorsions)
        plt.grid(True)
        plt.title('Elbow curve')
        fig.savefig(outputDirectory + "/" + du.timeStamped() + title +
                    "_figure.png")
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # release it so repeated calls do not accumulate open figures.
        plt.close(fig)
import datautils
import dataprocessing
import datamanipulation_conversions_jake as conversions
import kmeans_sklearn_jake as kmeansTools
import reporting_kmeansplots_sklearn_jake as kmeansplots
############################################################
#              main program starts here
############################################################

# Set the important variables: input data folder, input data file name and
# the folder that receives all output.
#dataDirectory="C:/Users/jake/Dropbox/Big Data Career/Data/tinysample"
dataDirectory = "Z:/Dropbox/Big Data Career/Data/jd_bydocument/freq-2018-11-01_300000_by_55000_95Percentile"
#dataFile="fjd201808bysentence_merged"
dataFile = "fjd201810byjob_subset_life sciences95"
outputDirectory = "C:/users/jake/Desktop/outputFolder/"
timestamp = datautils.timeStamped()
datautils.createDirectory(outputDirectory)

##############################################
# This creates a logger. You can use it to create an output file
# which logs the steps of your analysis.
##############################################
logger = datautils.createLogger(outputDirectory)

######################################################
# Now let's try with tfidf
######################################################

# "\l" is not a valid escape sequence (SyntaxWarning on Python 3.12+); the
# message was evidently meant to start with a newline like it ends with two.
logger.info("\nload data\n\n")

# NOTE(review): `dataloader` is not among the modules imported above this
# section; confirm it is imported, otherwise this line raises NameError.
data_tuple = dataloader.load(dataDirectory, dataFile)