Python clusterByKmeansLabelの例、utility.clusterByKmeansLabel Pythonの例

コード例 #1

0

ファイルを表示

ファイル: experiments.py プロジェクト: akondrahman/DataAnalysisAndLearning

def experiemnt_three(dbFileName, meanFlag, outputStrParam, clusterFlag):
    from sklearn import cluster
    import plotter

    clusteringType = None
    if clusterFlag:
        clusteringType = cluster.KMeans(n_clusters=13)
    else:
        clusteringType = cluster.AgglomerativeClustering(n_clusters=13)

    print "Performing experiemnt # 3: Clustering score into two clusters "
    versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName)
    sanitizedVersions = sanityCheck.getCodeQualityofVersions(versionAndCodeQualityDict, meanFlag)
    sanitizedVersions_CQ = sanitizedVersions
    # print "Sanitized versions that will be used in study ", len(sanitizedVersions)
    # print "Sanitized versions ..." , sanitizedVersions
    NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions(sanitizedVersions)
    # print "zzzz", len(NonZero_sanitizedVersionsWithScore)
    ### dyumping scores ...

    brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore)
    onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[0], brokenDict[1]
    # print "lalalaa ", onlyTheNonZeroSanitizedVScores

    # strOfScoresToDump=""
    # for elem in onlyTheNonZeroSanitizedVScores:
    #  strOfScoresToDump = strOfScoresToDump + str(elem) +  "," + "\n"

    ###
    # IO_.writeStrToFile("scores_for_clustering_measure.csv", strOfScoresToDump)

    reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores, (-1, 1))
    clusteringType.fit(reshapedNonZerSanitizedScores)
    labelsFroVersions = clusteringType.labels_
    if clusterFlag:
        centroids = clusteringType.cluster_centers_
        print "And the centroids are .... ", centroids
        NonZer_Santized_versionDictWithLabels = utility.clusterByKmeansLabel(
            onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions
        )
        ##### plotting clusters start
        # low_cluster_y, high_cluster_y = utility.plotClusterByLabel( onlyTheNonZeroSanitizedVersionIDs , labelsFroVersions, NonZero_sanitizedVersionsWithScore)
        # low_cluster_x = [ 22.35294118 for x in low_cluster_y]
        # hig_cluster_x = [ 50.82030058 for x in high_cluster_y]
        # plotter.createClusterPlots(low_cluster_x, low_cluster_y, hig_cluster_x, high_cluster_y)
        ##### plottign clusters end
    else:
        print "No centroids for Aggolomerative clustering"
        NonZer_Santized_versionDictWithLabels = utility.clusterByAggoloLabel(
            onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions
        )
    print "And the labels are .... "
    print len(labelsFroVersions)
    cluster_labels = clusteringType.fit_predict(reshapedNonZerSanitizedScores)
    silhouette_avg = silhouette_score(reshapedNonZerSanitizedScores, cluster_labels)
    print "Silhouette average---> ", silhouette_avg

    ##############################
    themegaFile_All = outputStrParam + "_" + "culsterified_non_zero_all-CQ-HL.csv"
    IO_.dumpIntoClusterifiedFile(themegaFile_All, sanitizedVersions_CQ, NonZer_Santized_versionDictWithLabels, False)

コード例 #2

0

ファイルを表示

ファイル: experiments.py プロジェクト: akondrahman/DataAnalysisAndLearning

def experiemnt_correlation(dbFileName, meanFlag, outputStrParam, clusterFlag):
    import correlation
    from sklearn import cluster
    clusteringType = None
    if clusterFlag:
        clusteringType = cluster.KMeans(n_clusters=2)
    else:
        clusteringType = cluster.AgglomerativeClustering(n_clusters=2)

    print "Performing experiemnt # Correlation: Clustering score into two clusters "
    versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName)
    sanitizedVersions = sanityCheck.getCodeQualityofVersions(
        versionAndCodeQualityDict, meanFlag)
    sanitizedVersions_CQ = sanitizedVersions
    #print "Sanitized versions that will be used in study ", len(sanitizedVersions)
    #print "Sanitized versions ..." , sanitizedVersions
    NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions(
        sanitizedVersions)
    #print "zzzz", len(NonZero_sanitizedVersionsWithScore)
    brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore)
    onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[
        0], brokenDict[1]
    #print "lalalaa ", onlyTheNonZeroSanitizedVScores
    reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores,
                                               (-1, 1))
    clusteringType.fit(reshapedNonZerSanitizedScores)
    labelsFroVersions = clusteringType.labels_
    if clusterFlag:
        centroids = clusteringType.cluster_centers_
        print "And the centroids are .... ", centroids
        NonZer_Santized_versionDictWithLabels = utility.clusterByKmeansLabel(
            onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions)
    else:
        print "No centroids for Aggolomerative clustering"
        NonZer_Santized_versionDictWithLabels = utility.clusterByAggoloLabel(
            onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions)
    #print "And the labels are .... "
    #print labelsFroVersions

    #print "versionDictWithLabels"
    #print len(versionDictWithLabels)
    onlyHighV_Scores_Dict = utility.getH_Scores_ForCorr(
        NonZer_Santized_versionDictWithLabels,
        NonZero_sanitizedVersionsWithScore)
    correlation.performCorrBasedOnIndiMetrics(onlyHighV_Scores_Dict,
                                              sanitizedVersions_CQ)

コード例 #3

0

ファイルを表示

ファイル: experiments.py プロジェクト: akondrahman/DataAnalysisAndLearning

def experiemnt_correlation(dbFileName, meanFlag, outputStrParam, clusterFlag):
    import correlation
    from sklearn import cluster

    clusteringType = None
    if clusterFlag:
        clusteringType = cluster.KMeans(n_clusters=2)
    else:
        clusteringType = cluster.AgglomerativeClustering(n_clusters=2)

    print "Performing experiemnt # Correlation: Clustering score into two clusters "
    versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName)
    sanitizedVersions = sanityCheck.getCodeQualityofVersions(versionAndCodeQualityDict, meanFlag)
    sanitizedVersions_CQ = sanitizedVersions
    # print "Sanitized versions that will be used in study ", len(sanitizedVersions)
    # print "Sanitized versions ..." , sanitizedVersions
    NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions(sanitizedVersions)
    # print "zzzz", len(NonZero_sanitizedVersionsWithScore)
    brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore)
    onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[0], brokenDict[1]
    # print "lalalaa ", onlyTheNonZeroSanitizedVScores
    reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores, (-1, 1))
    clusteringType.fit(reshapedNonZerSanitizedScores)
    labelsFroVersions = clusteringType.labels_
    if clusterFlag:
        centroids = clusteringType.cluster_centers_
        print "And the centroids are .... ", centroids
        NonZer_Santized_versionDictWithLabels = utility.clusterByKmeansLabel(
            onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions
        )
    else:
        print "No centroids for Aggolomerative clustering"
        NonZer_Santized_versionDictWithLabels = utility.clusterByAggoloLabel(
            onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions
        )
        # print "And the labels are .... "
        # print labelsFroVersions

        # print "versionDictWithLabels"
        # print len(versionDictWithLabels)
    onlyHighV_Scores_Dict = utility.getH_Scores_ForCorr(
        NonZer_Santized_versionDictWithLabels, NonZero_sanitizedVersionsWithScore
    )
    correlation.performCorrBasedOnIndiMetrics(onlyHighV_Scores_Dict, sanitizedVersions_CQ)

コード例 #4

0

ファイルを表示

ファイル: experiments.py プロジェクト: akondrahman/DataAnalysisAndLearning

def experiemnt_three(dbFileName, meanFlag, outputStrParam, clusterFlag):
    from sklearn import cluster
    import plotter
    clusteringType = None
    if clusterFlag:
        clusteringType = cluster.KMeans(n_clusters=13)
    else:
        clusteringType = cluster.AgglomerativeClustering(n_clusters=13)

    print "Performing experiemnt # 3: Clustering score into two clusters "
    versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName)
    sanitizedVersions = sanityCheck.getCodeQualityofVersions(
        versionAndCodeQualityDict, meanFlag)
    sanitizedVersions_CQ = sanitizedVersions
    #print "Sanitized versions that will be used in study ", len(sanitizedVersions)
    #print "Sanitized versions ..." , sanitizedVersions
    NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions(
        sanitizedVersions)
    #print "zzzz", len(NonZero_sanitizedVersionsWithScore)
    ### dyumping scores ...

    brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore)
    onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[
        0], brokenDict[1]
    #print "lalalaa ", onlyTheNonZeroSanitizedVScores

    #strOfScoresToDump=""
    #for elem in onlyTheNonZeroSanitizedVScores:
    #  strOfScoresToDump = strOfScoresToDump + str(elem) +  "," + "\n"

    ###
    #IO_.writeStrToFile("scores_for_clustering_measure.csv", strOfScoresToDump)

    reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores,
                                               (-1, 1))
    clusteringType.fit(reshapedNonZerSanitizedScores)
    labelsFroVersions = clusteringType.labels_
    if clusterFlag:
        centroids = clusteringType.cluster_centers_
        print "And the centroids are .... ", centroids
        NonZer_Santized_versionDictWithLabels = utility.clusterByKmeansLabel(
            onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions)
        ##### plotting clusters start
        #low_cluster_y, high_cluster_y = utility.plotClusterByLabel( onlyTheNonZeroSanitizedVersionIDs , labelsFroVersions, NonZero_sanitizedVersionsWithScore)
        #low_cluster_x = [ 22.35294118 for x in low_cluster_y]
        #hig_cluster_x = [ 50.82030058 for x in high_cluster_y]
        #plotter.createClusterPlots(low_cluster_x, low_cluster_y, hig_cluster_x, high_cluster_y)
        ##### plottign clusters end
    else:
        print "No centroids for Aggolomerative clustering"
        NonZer_Santized_versionDictWithLabels = utility.clusterByAggoloLabel(
            onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions)
    print "And the labels are .... "
    print len(labelsFroVersions)
    cluster_labels = clusteringType.fit_predict(reshapedNonZerSanitizedScores)
    silhouette_avg = silhouette_score(reshapedNonZerSanitizedScores,
                                      cluster_labels)
    print "Silhouette average---> ", silhouette_avg

    ##############################
    themegaFile_All = outputStrParam + "_" + "culsterified_non_zero_all-CQ-HL.csv"
    IO_.dumpIntoClusterifiedFile(themegaFile_All, sanitizedVersions_CQ,
                                 NonZer_Santized_versionDictWithLabels, False)