def experiemnt_three(dbFileName, meanFlag, outputStrParam, clusterFlag):
    from sklearn import cluster
    import plotter

    clusteringType = None
    if clusterFlag:
        clusteringType = cluster.KMeans(n_clusters=13)
    else:
        clusteringType = cluster.AgglomerativeClustering(n_clusters=13)

    print "Performing experiemnt # 3: Clustering score into two clusters "
    versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName)
    sanitizedVersions = sanityCheck.getCodeQualityofVersions(versionAndCodeQualityDict, meanFlag)
    sanitizedVersions_CQ = sanitizedVersions
    # print "Sanitized versions that will be used in study ", len(sanitizedVersions)
    # print "Sanitized versions ..." , sanitizedVersions
    NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions(sanitizedVersions)
    # print "zzzz", len(NonZero_sanitizedVersionsWithScore)
    ### dyumping scores ...

    brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore)
    onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[0], brokenDict[1]
    # print "lalalaa ", onlyTheNonZeroSanitizedVScores

    # strOfScoresToDump=""
    # for elem in onlyTheNonZeroSanitizedVScores:
    #  strOfScoresToDump = strOfScoresToDump + str(elem) +  "," + "\n"

    ###
    # IO_.writeStrToFile("scores_for_clustering_measure.csv", strOfScoresToDump)

    reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores, (-1, 1))
    clusteringType.fit(reshapedNonZerSanitizedScores)
    labelsFroVersions = clusteringType.labels_
    if clusterFlag:
        centroids = clusteringType.cluster_centers_
        print "And the centroids are .... ", centroids
        NonZer_Santized_versionDictWithLabels = utility.clusterByKmeansLabel(
            onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions
        )
        ##### plotting clusters start
        # low_cluster_y, high_cluster_y = utility.plotClusterByLabel( onlyTheNonZeroSanitizedVersionIDs , labelsFroVersions, NonZero_sanitizedVersionsWithScore)
        # low_cluster_x = [ 22.35294118 for x in low_cluster_y]
        # hig_cluster_x = [ 50.82030058 for x in high_cluster_y]
        # plotter.createClusterPlots(low_cluster_x, low_cluster_y, hig_cluster_x, high_cluster_y)
        ##### plottign clusters end
    else:
        print "No centroids for Aggolomerative clustering"
        NonZer_Santized_versionDictWithLabels = utility.clusterByAggoloLabel(
            onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions
        )
    print "And the labels are .... "
    print len(labelsFroVersions)
    cluster_labels = clusteringType.fit_predict(reshapedNonZerSanitizedScores)
    silhouette_avg = silhouette_score(reshapedNonZerSanitizedScores, cluster_labels)
    print "Silhouette average---> ", silhouette_avg

    ##############################
    themegaFile_All = outputStrParam + "_" + "culsterified_non_zero_all-CQ-HL.csv"
    IO_.dumpIntoClusterifiedFile(themegaFile_All, sanitizedVersions_CQ, NonZer_Santized_versionDictWithLabels, False)
def experiemnt_select_classifier(dbFileName, meanFlag, outputStrParam,
                                 clusterFlag, scoreTypeParam):
    from sklearn import cluster
    from sklearn.metrics import silhouette_samples, silhouette_score, v_measure_score, adjusted_mutual_info_score, completeness_score

    scoreListToret = []
    print "Performing experiemnt: Select Classifier"
    clusters = [x for x in xrange(100) if x > 1]
    #clusters=[2]
    for clsuter_cnt in clusters:
        print "this is iteration #", clsuter_cnt
        versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(
            dbFileName)
        sanitizedVersions = sanityCheck.getCodeQualityofVersions(
            versionAndCodeQualityDict, meanFlag)
        sanitizedVersions_CQ = sanitizedVersions

        NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions(
            sanitizedVersions)
        #print "zzzz", len(NonZero_sanitizedVersionsWithScore)
        brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore)
        onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[
            0], brokenDict[1]
        #print "lalalaa ", onlyTheNonZeroSanitizedVScores
        reshapedNonZerSanitizedScores = np.reshape(
            onlyTheNonZeroSanitizedVScores, (-1, 1))
        clusteringType = None
        if clusterFlag:
            clusteringType = cluster.KMeans(n_clusters=clsuter_cnt)
        else:
            clusteringType = cluster.AgglomerativeClustering(
                n_clusters=clsuter_cnt)

        cluster_labels = clusteringType.fit_predict(
            reshapedNonZerSanitizedScores)
        scores = 0
        if scoreTypeParam == 0:
            scores = silhouette_score(reshapedNonZerSanitizedScores,
                                      cluster_labels)
            score_type = 'Silhouette'
        # elif scoreTypeParam==1:
        # 	scores = v_measure_score(reshapedNonZerSanitizedScores, cluster_labels)
        # 	score_type='V-measure'
        # elif scoreTypeParam==2:
        # 	scores = adjusted_mutual_info_score(reshapedNonZerSanitizedScores, cluster_labels)
        # 	score_type='adjusted mutual info'
        # elif scoreTypeParam==3:
        # 	scores = completeness_score(reshapedNonZerSanitizedScores, cluster_labels)
        # 	score_type='completeness'
        score_combo = (clsuter_cnt, scores)
        scoreListToret.append(score_combo)
        print "::::: score_type={}, For n_clusters ={}, The clustering _score is ={} :::::".format(
            score_type, clsuter_cnt, scores)

    return scoreListToret
def experiemnt_select_classifier(dbFileName, meanFlag, outputStrParam, clusterFlag, scoreTypeParam):
    from sklearn import cluster
    from sklearn.metrics import (
        silhouette_samples,
        silhouette_score,
        v_measure_score,
        adjusted_mutual_info_score,
        completeness_score,
    )

    scoreListToret = []
    print "Performing experiemnt: Select Classifier"
    clusters = [x for x in xrange(100) if x > 1]
    # clusters=[2]
    for clsuter_cnt in clusters:
        print "this is iteration #", clsuter_cnt
        versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName)
        sanitizedVersions = sanityCheck.getCodeQualityofVersions(versionAndCodeQualityDict, meanFlag)
        sanitizedVersions_CQ = sanitizedVersions

        NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions(
            sanitizedVersions
        )
        # print "zzzz", len(NonZero_sanitizedVersionsWithScore)
        brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore)
        onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[0], brokenDict[1]
        # print "lalalaa ", onlyTheNonZeroSanitizedVScores
        reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores, (-1, 1))
        clusteringType = None
        if clusterFlag:
            clusteringType = cluster.KMeans(n_clusters=clsuter_cnt)
        else:
            clusteringType = cluster.AgglomerativeClustering(n_clusters=clsuter_cnt)

        cluster_labels = clusteringType.fit_predict(reshapedNonZerSanitizedScores)
        scores = 0
        if scoreTypeParam == 0:
            scores = silhouette_score(reshapedNonZerSanitizedScores, cluster_labels)
            score_type = "Silhouette"
            # elif scoreTypeParam==1:
            # 	scores = v_measure_score(reshapedNonZerSanitizedScores, cluster_labels)
            # 	score_type='V-measure'
            # elif scoreTypeParam==2:
            # 	scores = adjusted_mutual_info_score(reshapedNonZerSanitizedScores, cluster_labels)
            # 	score_type='adjusted mutual info'
            # elif scoreTypeParam==3:
            # 	scores = completeness_score(reshapedNonZerSanitizedScores, cluster_labels)
            # 	score_type='completeness'
        score_combo = (clsuter_cnt, scores)
        scoreListToret.append(score_combo)
        print "::::: score_type={}, For n_clusters ={}, The clustering _score is ={} :::::".format(
            score_type, clsuter_cnt, scores
        )

    return scoreListToret
def experiemnt_mobilesoft(dbFileName, outputStrParam):
    from sklearn import cluster
    import plotter
    clusteringType = cluster.AgglomerativeClustering(n_clusters=5)

    print "Performing experiemnt # Mobilesoft"
    versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName)
    sanitizedVersions = sanityCheck.getMobilesoftCodeQualityVersions(
        versionAndCodeQualityDict, 1.00)
    sanitizedVersions_CQ = sanitizedVersions

    NonZero_sanitizedVersionsWithScore = sanityCheck.getAllVulnerbailityScoreOfSelectedVersions(
        sanitizedVersions)

    brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore)
    onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[
        0], brokenDict[1]

    # strOfScoresToDump=""
    # for elem in onlyTheNonZeroSanitizedVScores:
    #   strOfScoresToDump = strOfScoresToDump + str(elem) +  "," + "\n"
    #
    # ##
    # IO_.writeStrToFile("scores_for_clustering_measure.csv", strOfScoresToDump)

    reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores,
                                               (-1, 1))
    clusteringType.fit(reshapedNonZerSanitizedScores)
    labelsFroVersions = clusteringType.labels_
    print "No centroids for Aggolomerative clustering"
    NonZer_Santized_versionDictWithLabels = utility.clusterByAggoloLabel(
        onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions)
    print "And the labels are .... "
    print len(labelsFroVersions)
    cluster_labels = clusteringType.fit_predict(reshapedNonZerSanitizedScores)
    silhouette_avg = silhouette_score(reshapedNonZerSanitizedScores,
                                      cluster_labels)
    print "Silhouette average---> ", silhouette_avg

    # clusteringType = cluster.KMeans(n_clusters=5)
    # clusteringType.fit(reshapedNonZerSanitizedScores)
    # centroids = clusteringType.cluster_centers_
    # print "And the centroids are .... ", centroids
    ##############################
    themegaFile_All = outputStrParam + "_" + "cluster_Headered_1407.csv"
    IO_.dumpIntoClusterifiedFile(themegaFile_All, sanitizedVersions_CQ,
                                 NonZer_Santized_versionDictWithLabels)
    '''
def experiemnt_correlation(dbFileName, meanFlag, outputStrParam, clusterFlag):
    import correlation
    from sklearn import cluster
    clusteringType = None
    if clusterFlag:
        clusteringType = cluster.KMeans(n_clusters=2)
    else:
        clusteringType = cluster.AgglomerativeClustering(n_clusters=2)

    print "Performing experiemnt # Correlation: Clustering score into two clusters "
    versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName)
    sanitizedVersions = sanityCheck.getCodeQualityofVersions(
        versionAndCodeQualityDict, meanFlag)
    sanitizedVersions_CQ = sanitizedVersions
    #print "Sanitized versions that will be used in study ", len(sanitizedVersions)
    #print "Sanitized versions ..." , sanitizedVersions
    NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions(
        sanitizedVersions)
    #print "zzzz", len(NonZero_sanitizedVersionsWithScore)
    brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore)
    onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[
        0], brokenDict[1]
    #print "lalalaa ", onlyTheNonZeroSanitizedVScores
    reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores,
                                               (-1, 1))
    clusteringType.fit(reshapedNonZerSanitizedScores)
    labelsFroVersions = clusteringType.labels_
    if clusterFlag:
        centroids = clusteringType.cluster_centers_
        print "And the centroids are .... ", centroids
        NonZer_Santized_versionDictWithLabels = utility.clusterByKmeansLabel(
            onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions)
    else:
        print "No centroids for Aggolomerative clustering"
        NonZer_Santized_versionDictWithLabels = utility.clusterByAggoloLabel(
            onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions)
    #print "And the labels are .... "
    #print labelsFroVersions

    #print "versionDictWithLabels"
    #print len(versionDictWithLabels)
    onlyHighV_Scores_Dict = utility.getH_Scores_ForCorr(
        NonZer_Santized_versionDictWithLabels,
        NonZero_sanitizedVersionsWithScore)
    correlation.performCorrBasedOnIndiMetrics(onlyHighV_Scores_Dict,
                                              sanitizedVersions_CQ)
def experiemnt_correlation(dbFileName, meanFlag, outputStrParam, clusterFlag):
    import correlation
    from sklearn import cluster

    clusteringType = None
    if clusterFlag:
        clusteringType = cluster.KMeans(n_clusters=2)
    else:
        clusteringType = cluster.AgglomerativeClustering(n_clusters=2)

    print "Performing experiemnt # Correlation: Clustering score into two clusters "
    versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName)
    sanitizedVersions = sanityCheck.getCodeQualityofVersions(versionAndCodeQualityDict, meanFlag)
    sanitizedVersions_CQ = sanitizedVersions
    # print "Sanitized versions that will be used in study ", len(sanitizedVersions)
    # print "Sanitized versions ..." , sanitizedVersions
    NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions(sanitizedVersions)
    # print "zzzz", len(NonZero_sanitizedVersionsWithScore)
    brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore)
    onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[0], brokenDict[1]
    # print "lalalaa ", onlyTheNonZeroSanitizedVScores
    reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores, (-1, 1))
    clusteringType.fit(reshapedNonZerSanitizedScores)
    labelsFroVersions = clusteringType.labels_
    if clusterFlag:
        centroids = clusteringType.cluster_centers_
        print "And the centroids are .... ", centroids
        NonZer_Santized_versionDictWithLabels = utility.clusterByKmeansLabel(
            onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions
        )
    else:
        print "No centroids for Aggolomerative clustering"
        NonZer_Santized_versionDictWithLabels = utility.clusterByAggoloLabel(
            onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions
        )
        # print "And the labels are .... "
        # print labelsFroVersions

        # print "versionDictWithLabels"
        # print len(versionDictWithLabels)
    onlyHighV_Scores_Dict = utility.getH_Scores_ForCorr(
        NonZer_Santized_versionDictWithLabels, NonZero_sanitizedVersionsWithScore
    )
    correlation.performCorrBasedOnIndiMetrics(onlyHighV_Scores_Dict, sanitizedVersions_CQ)
def experiemnt_three(dbFileName, meanFlag, outputStrParam, clusterFlag):
    from sklearn import cluster
    import plotter
    clusteringType = None
    if clusterFlag:
        clusteringType = cluster.KMeans(n_clusters=13)
    else:
        clusteringType = cluster.AgglomerativeClustering(n_clusters=13)

    print "Performing experiemnt # 3: Clustering score into two clusters "
    versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName)
    sanitizedVersions = sanityCheck.getCodeQualityofVersions(
        versionAndCodeQualityDict, meanFlag)
    sanitizedVersions_CQ = sanitizedVersions
    #print "Sanitized versions that will be used in study ", len(sanitizedVersions)
    #print "Sanitized versions ..." , sanitizedVersions
    NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions(
        sanitizedVersions)
    #print "zzzz", len(NonZero_sanitizedVersionsWithScore)
    ### dyumping scores ...

    brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore)
    onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[
        0], brokenDict[1]
    #print "lalalaa ", onlyTheNonZeroSanitizedVScores

    #strOfScoresToDump=""
    #for elem in onlyTheNonZeroSanitizedVScores:
    #  strOfScoresToDump = strOfScoresToDump + str(elem) +  "," + "\n"

    ###
    #IO_.writeStrToFile("scores_for_clustering_measure.csv", strOfScoresToDump)

    reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores,
                                               (-1, 1))
    clusteringType.fit(reshapedNonZerSanitizedScores)
    labelsFroVersions = clusteringType.labels_
    if clusterFlag:
        centroids = clusteringType.cluster_centers_
        print "And the centroids are .... ", centroids
        NonZer_Santized_versionDictWithLabels = utility.clusterByKmeansLabel(
            onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions)
        ##### plotting clusters start
        #low_cluster_y, high_cluster_y = utility.plotClusterByLabel( onlyTheNonZeroSanitizedVersionIDs , labelsFroVersions, NonZero_sanitizedVersionsWithScore)
        #low_cluster_x = [ 22.35294118 for x in low_cluster_y]
        #hig_cluster_x = [ 50.82030058 for x in high_cluster_y]
        #plotter.createClusterPlots(low_cluster_x, low_cluster_y, hig_cluster_x, high_cluster_y)
        ##### plottign clusters end
    else:
        print "No centroids for Aggolomerative clustering"
        NonZer_Santized_versionDictWithLabels = utility.clusterByAggoloLabel(
            onlyTheNonZeroSanitizedVersionIDs, labelsFroVersions)
    print "And the labels are .... "
    print len(labelsFroVersions)
    cluster_labels = clusteringType.fit_predict(reshapedNonZerSanitizedScores)
    silhouette_avg = silhouette_score(reshapedNonZerSanitizedScores,
                                      cluster_labels)
    print "Silhouette average---> ", silhouette_avg

    ##############################
    themegaFile_All = outputStrParam + "_" + "culsterified_non_zero_all-CQ-HL.csv"
    IO_.dumpIntoClusterifiedFile(themegaFile_All, sanitizedVersions_CQ,
                                 NonZer_Santized_versionDictWithLabels, False)