def getCodeQualityofVersions(dictParam, meanFlag=True):
    versionDictToRet = {}
    versionRiskDict = DEFT.getValuesFrom_Vulnerability(dbFileName)
    ###
    fileList = []
    for k_, v_ in versionRiskDict.items():
        versionIDInRiskDict = k_
        ## first check that vulnerability scores exist for the version; if not, it is excluded
        if versionIDInRiskDict in dictParam:
            fileCount = dictParam[versionIDInRiskDict][10]
            if fileCount is not None:
                fileList.append(fileCount)
                # print "Version #{} has #{} files".format(versionIDInRiskDict, fileCount)
    print "Stats on file count : len={}, median={}, mean={}, max={}, min={}".format(
        len(fileList), np.median(fileList), np.mean(fileList), max(fileList), min(fileList)
    )

    if meanFlag:
        thres = np.mean(fileList)
    else:
        # thres = np.median(fileList)
        thres = np.percentile(fileList, 25)
        # print "25th percentile of file count ... ", thres
    for k_, v_ in versionRiskDict.items():
        versionIDInRiskDict = k_
        if versionIDInRiskDict in dictParam:
            fileCount = dictParam[versionIDInRiskDict][10]
            # keep the version only if its file count is at least the threshold
            if fileCount >= thres:
                versionDictToRet[versionIDInRiskDict] = dictParam[versionIDInRiskDict]
    return versionDictToRet
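
# A minimal, self-contained sketch (not part of the original pipeline) of the
# filtering rule above: versions survive only when their file count reaches
# the threshold (mean when meanFlag is True, else the 25th percentile).
# The data and the helper name below are hypothetical.
def _demoFileCountFilter():
    import numpy as np
    fileCounts = {"v1": 2, "v2": 8, "v3": 40, "v4": 100}
    thres = np.mean(fileCounts.values())  # (2+8+40+100)/4 = 37.5
    kept = {k: v for k, v in fileCounts.items() if v >= thres}
    print "threshold={}, kept={}".format(thres, kept)  # keeps v3 and v4
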
def getMobilesoftCodeQualityVersions(dictParam, threshParam):
    versionDictToRet = {}
    versionRiskDict = DEFT.getValuesFrom_Vulnerability(dbFileName)
    ###
    fileList = []
    for k_, v_ in versionRiskDict.items():
        versionIDInRiskDict = k_
        ## first check that vulnerability scores exist for the version; if not, it is excluded
        if versionIDInRiskDict in dictParam:
            fileCount = dictParam[versionIDInRiskDict][10]
            if fileCount is not None:
                fileList.append(fileCount)
                # print "Version #{} has #{} files".format(versionIDInRiskDict, fileCount)
    print "Stats on file count : len={}, median={}, mean={}, max={}, min={}".format(
        len(fileList), np.median(fileList), np.mean(fileList), max(fileList), min(fileList)
    )

    for k_, v_ in versionRiskDict.items():
        versionIDInRiskDict = k_
        if versionIDInRiskDict in dictParam:
            fileCount = dictParam[versionIDInRiskDict][10]
            if fileCount >= threshParam:
                versionDictToRet[versionIDInRiskDict] = dictParam[versionIDInRiskDict]
    return versionDictToRet
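
# Hypothetical usage note: unlike getCodeQualityofVersions above, this variant
# takes the file-count cutoff directly, e.g.
#   kept = getMobilesoftCodeQualityVersions(versionAndCodeQualityDict, 1.00)
# keeps every version with at least one analyzed file.
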
def experiemnt_three(dbFileName, meanFlag, outputStrParam, clusterFlag):
    from sklearn import cluster
    from sklearn.metrics import silhouette_score
    import plotter

    clusteringType = None
    if clusterFlag:
        clusteringType = cluster.KMeans(n_clusters=13)
    else:
        clusteringType = cluster.AgglomerativeClustering(n_clusters=13)

    print "Performing experiment # 3: clustering vulnerability scores into 13 clusters"
    versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName)
    sanitizedVersions = sanityCheck.getCodeQualityofVersions(versionAndCodeQualityDict, meanFlag)
    sanitizedVersions_CQ = sanitizedVersions
    # print "Sanitized versions that will be used in study ", len(sanitizedVersions)
    # print "Sanitized versions ..." , sanitizedVersions
    NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions(sanitizedVersions)
    # print "zzzz", len(NonZero_sanitizedVersionsWithScore)
    ### dumping scores ...

    brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore)
    onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[0], brokenDict[1]
    # print "lalalaa ", onlyTheNonZeroSanitizedVScores

    # strOfScoresToDump=""
    # for elem in onlyTheNonZeroSanitizedVScores:
    #  strOfScoresToDump = strOfScoresToDump + str(elem) +  "," + "\n"

    ###
    # IO_.writeStrToFile("scores_for_clustering_measure.csv", strOfScoresToDump)

    reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores, (-1, 1))
    clusteringType.fit(reshapedNonZerSanitizedScores)
    labelsForVersions = clusteringType.labels_
    if clusterFlag:
        centroids = clusteringType.cluster_centers_
        print "And the centroids are .... ", centroids
        NonZer_Santized_versionDictWithLabels = utility.clusterByKmeansLabel(
            onlyTheNonZeroSanitizedVersionIDs, labelsForVersions
        )
        ##### plotting clusters start
        # low_cluster_y, high_cluster_y = utility.plotClusterByLabel( onlyTheNonZeroSanitizedVersionIDs , labelsForVersions, NonZero_sanitizedVersionsWithScore)
        # low_cluster_x = [ 22.35294118 for x in low_cluster_y]
        # hig_cluster_x = [ 50.82030058 for x in high_cluster_y]
        # plotter.createClusterPlots(low_cluster_x, low_cluster_y, hig_cluster_x, high_cluster_y)
        ##### plotting clusters end
    else:
        print "No centroids for agglomerative clustering"
        NonZer_Santized_versionDictWithLabels = utility.clusterByAggoloLabel(
            onlyTheNonZeroSanitizedVersionIDs, labelsForVersions
        )
    print "And the labels are .... "
    print len(labelsForVersions)
    # reuse the fitted labels; refitting via fit_predict could relabel the data
    silhouette_avg = silhouette_score(reshapedNonZerSanitizedScores, labelsForVersions)
    print "Silhouette average---> ", silhouette_avg

    ##############################
    themegaFile_All = outputStrParam + "_" + "culsterified_non_zero_all-CQ-HL.csv"
    IO_.dumpIntoClusterifiedFile(themegaFile_All, sanitizedVersions_CQ, NonZer_Santized_versionDictWithLabels, False)
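
# A self-contained sketch (synthetic scores, hypothetical helper name) of the
# clustering step above: 1-D scores must be reshaped into an (n_samples, 1)
# column before scikit-learn accepts them, and the silhouette average is then
# computed from the fitted labels.
def _demoClusterScores():
    import numpy as np
    from sklearn import cluster
    from sklearn.metrics import silhouette_score
    scores = [15.0, 16.2, 14.8, 50.1, 51.3, 49.7]
    X = np.reshape(scores, (-1, 1))
    km = cluster.KMeans(n_clusters=2)
    labels = km.fit_predict(X)
    print "centroids:", km.cluster_centers_.ravel()
    print "silhouette:", silhouette_score(X, labels)
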
def getVulnerbailityScoreStatus(dictParam):
    # returns (mean, median) of the risk scores for the given versions;
    # relies on the module-level dbFileName
    riskList = []
    original_versionRiskDict = DEFT.getValuesFrom_Vulnerability(dbFileName)
    for k_, v_ in dictParam.items():
        ## get the scores for the valid versions
        riskScore = original_versionRiskDict[k_]
        riskList.append(riskScore)
    return np.mean(riskList), np.median(riskList)
def experiemnt_select_classifier(dbFileName, meanFlag, outputStrParam, clusterFlag, scoreTypeParam):
    from sklearn import cluster
    from sklearn.metrics import (
        silhouette_samples,
        silhouette_score,
        v_measure_score,
        adjusted_mutual_info_score,
        completeness_score,
    )

    scoreListToret = []
    print "Performing experiment: Select Classifier"
    clusters = range(2, 100)
    # clusters=[2]
    for cluster_cnt in clusters:
        print "this is iteration #", cluster_cnt
        versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName)
        sanitizedVersions = sanityCheck.getCodeQualityofVersions(versionAndCodeQualityDict, meanFlag)
        sanitizedVersions_CQ = sanitizedVersions

        NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions(
            sanitizedVersions
        )
        # print "zzzz", len(NonZero_sanitizedVersionsWithScore)
        brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore)
        onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[0], brokenDict[1]
        # print "lalalaa ", onlyTheNonZeroSanitizedVScores
        reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores, (-1, 1))
        clusteringType = None
        if clusterFlag:
            clusteringType = cluster.KMeans(n_clusters=cluster_cnt)
        else:
            clusteringType = cluster.AgglomerativeClustering(n_clusters=cluster_cnt)

        cluster_labels = clusteringType.fit_predict(reshapedNonZerSanitizedScores)
        scores = 0
        score_type = "unknown"  # default so the print below never hits an unbound name
        if scoreTypeParam == 0:
            scores = silhouette_score(reshapedNonZerSanitizedScores, cluster_labels)
            score_type = "Silhouette"
        # elif scoreTypeParam==1:
        # 	scores = v_measure_score(reshapedNonZerSanitizedScores, cluster_labels)
        # 	score_type='V-measure'
        # elif scoreTypeParam==2:
        # 	scores = adjusted_mutual_info_score(reshapedNonZerSanitizedScores, cluster_labels)
        # 	score_type='adjusted mutual info'
        # elif scoreTypeParam==3:
        # 	scores = completeness_score(reshapedNonZerSanitizedScores, cluster_labels)
        # 	score_type='completeness'
        score_combo = (cluster_cnt, scores)
        scoreListToret.append(score_combo)
        print "::::: score_type={}, For n_clusters={}, the clustering score is {} :::::".format(
            score_type, cluster_cnt, scores
        )

    return scoreListToret
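
# A small follow-up sketch (hypothetical helper) showing how the
# (cluster_count, score) pairs returned above can be reduced to the best
# cluster count; for the silhouette measure, higher is better.
def pickBestClusterCount(scoreList):
    bestCount, bestScore = max(scoreList, key=lambda pair: pair[1])
    print "best n_clusters={} (score={})".format(bestCount, bestScore)
    return bestCount
# e.g. pickBestClusterCount(experiemnt_select_classifier(dbFileName, True, "out", True, 0))
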
def experiemnt_one(dbFileName, meanFlag, outputStrParam):
    print "Performing experiment # 1"
    #import correlation as corr_
    versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName)
    sanitizedVersions = sanityCheck.getCodeQualityofVersions(versionAndCodeQualityDict, meanFlag)
    print "Sanitized versions that will be used in study ", len(sanitizedVersions)
    #print "Sanitized versions ..." , sanitizedVersions
    sanitizedVersionsWithScore = sanityCheck.getVulnerbailityScoreOfSelectedVersions(sanitizedVersions)
    '''
    Stats on risk score-->len=721, median=51.1111111111,  mean=38.0255199862, max=53.3333333333, min=0.0,
    '''

    riskStatus = sanityCheck.getVulnerbailityScoreStatus(sanitizedVersionsWithScore)
    if meanFlag:
        threshold = riskStatus[0]  ## first returned index is the mean
    else:
        threshold = riskStatus[1]  ## second is the median

    ##############################
    sanitizedVersions_CQ = sanitizedVersions

    #######  high vScore versions start

    high_CQ_dict = utility.getHighVScoreVersions_CQ(sanitizedVersionsWithScore, sanitizedVersions_CQ, threshold)
    high_vScore_Dict = utility.getHighVScoreVersions_VScore(sanitizedVersionsWithScore, threshold)
    print "high_vscore_versions ", len(high_vScore_Dict)
    #######  high vScore versions end

    #######  low vScore versions start
    low_CQ_dict = utility.getLowVScoreVersions_CQ(sanitizedVersionsWithScore, sanitizedVersions_CQ, threshold)
    low_vScore_Dict = utility.getLowVScoreVersions_VScore(sanitizedVersionsWithScore, threshold)
    print "low_vscore_versions ", len(low_vScore_Dict)
    #######  low vScore versions end
    ##### dumping time
    ### three ways: first, dump all highs then all lows
    themegaFile_Seperated = outputStrParam + "_" + "all-CQ-HL-Seperated.csv"
    IO_.dumpIntoFileByHighAndLow(themegaFile_Seperated, high_CQ_dict, low_CQ_dict)

    ### three ways: second, dump as is
    themegaFile_All = outputStrParam + "_" + "all-CQ-HL.csv"
    IO_.dumpIntoFile(themegaFile_All, sanitizedVersions_CQ, sanitizedVersionsWithScore, threshold, False)
    LGR.performLogiRegression(themegaFile_All)
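
# A synthetic sketch of the high/low split performed above, assuming the
# utility.getHigh*/getLow* helpers partition versions by comparing each
# vulnerability score against the threshold (names and data are hypothetical).
def _demoHighLowSplit():
    import numpy as np
    vScores = {"v1": 10.0, "v2": 52.0, "v3": 38.0}
    threshold = np.mean(vScores.values())  # ~33.3
    high = {k: s for k, s in vScores.items() if s >= threshold}
    low = {k: s for k, s in vScores.items() if s < threshold}
    print "high:", high, "low:", low
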
def getVulnerbailityScoreOfSelectedVersions(dictParam):
    validDictToret = {}
    riskList = []
    original_versionRiskDict = DEFT.getValuesFrom_Vulnerability(dbFileName)
    for k_, v_ in dictParam.items():
        ## get the scores for the valid versions
        riskScore = original_versionRiskDict[k_]
        validDictToret[k_] = riskScore
        riskList.append(riskScore)
    print "Stats on risk score-->len={}, median={},  mean={}, max={}, min={},".format(
        len(riskList), np.median(riskList), np.mean(riskList), max(riskList),
        min(riskList))
    return validDictToret
def experiemnt_two(dbFileName, meanFlag, outputStrParam):
    print "Performing experiment # 2"

    #import correlation as corr_
    versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName)
    sanitizedVersions = sanityCheck.getCodeQualityofVersions(versionAndCodeQualityDict, meanFlag)
    print "Sanitized versions that will be used in study ", len(sanitizedVersions)
    #print "Sanitized versions ..." , sanitizedVersions
    NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions(sanitizedVersions)

    '''
    Stats on risk score (non-zero elements)-->len=549, median=51.1111111111,  mean=49.9387976503, max=53.3333333333, min=15.0
    '''

    ##############################
    sanitizedVersions_CQ = sanitizedVersions

    riskStatus = sanityCheck.getVulnerbailityScoreStatus(NonZero_sanitizedVersionsWithScore)
    if meanFlag:
        threshold = riskStatus[0]  ## first returned index is the mean
    else:
        threshold = riskStatus[1]  ## second is the median

    #######  high vScore versions start
    high_CQ_dict = utility.getHighVScoreVersions_CQ(NonZero_sanitizedVersionsWithScore, sanitizedVersions_CQ, threshold)
    high_vScore_Dict = utility.getHighVScoreVersions_VScore(NonZero_sanitizedVersionsWithScore, threshold)
    print "non zero high_vscore_versions ", len(high_vScore_Dict)
    #######  high vScore versions end

    #######  low vScore versions start
    low_CQ_dict = utility.getLowVScoreVersions_CQ(NonZero_sanitizedVersionsWithScore, sanitizedVersions_CQ, threshold)
    low_vScore_Dict = utility.getLowVScoreVersions_VScore(NonZero_sanitizedVersionsWithScore, threshold)
    print "non zero low_vscore_versions ", len(low_vScore_Dict)
    #######  low vScore versions end
    ##### dumping time
    ### three ways: first, dump all highs then all lows
    themegaFile_Seperated = outputStrParam + "_" + "non_zero_all-CQ-HL-Seperated.csv"
    IO_.dumpIntoFileByHighAndLow(themegaFile_Seperated, high_CQ_dict, low_CQ_dict)

    ### three ways: second, dump as is
    themegaFile_All = outputStrParam + "_" + "non_zero_all-CQ-HL.csv"
    IO_.dumpIntoFile(themegaFile_All, sanitizedVersions_CQ, NonZero_sanitizedVersionsWithScore, threshold, False)
    LGR.performLogiRegression(themegaFile_All)
def experiemnt_mobilesoft(dbFileName, outputStrParam):
    from sklearn import cluster
    from sklearn.metrics import silhouette_score
    import plotter
    clusteringType = cluster.AgglomerativeClustering(n_clusters=5)

    print "Performing experiment # MobileSoft"
    versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName)
    sanitizedVersions = sanityCheck.getMobilesoftCodeQualityVersions(versionAndCodeQualityDict, 1.00)
    sanitizedVersions_CQ = sanitizedVersions

    NonZero_sanitizedVersionsWithScore = sanityCheck.getAllVulnerbailityScoreOfSelectedVersions(sanitizedVersions)

    brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore)
    onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[0], brokenDict[1]

    # strOfScoresToDump=""
    # for elem in onlyTheNonZeroSanitizedVScores:
    #   strOfScoresToDump = strOfScoresToDump + str(elem) +  "," + "\n"
    #
    # ##
    # IO_.writeStrToFile("scores_for_clustering_measure.csv", strOfScoresToDump)

    reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores, (-1, 1))
    clusteringType.fit(reshapedNonZerSanitizedScores)
    labelsForVersions = clusteringType.labels_
    print "No centroids for agglomerative clustering"
    NonZer_Santized_versionDictWithLabels = utility.clusterByAggoloLabel(
        onlyTheNonZeroSanitizedVersionIDs, labelsForVersions)
    print "And the labels are .... "
    print len(labelsForVersions)
    # reuse the fitted labels; refitting via fit_predict would repeat the clustering
    silhouette_avg = silhouette_score(reshapedNonZerSanitizedScores, labelsForVersions)
    print "Silhouette average---> ", silhouette_avg

    # clusteringType = cluster.KMeans(n_clusters=5)
    # clusteringType.fit(reshapedNonZerSanitizedScores)
    # centroids = clusteringType.cluster_centers_
    # print "And the centroids are .... ", centroids
    ##############################
    themegaFile_All = outputStrParam + "_" + "cluster_Headered_1407.csv"
    IO_.dumpIntoClusterifiedFile(themegaFile_All, sanitizedVersions_CQ,
                                 NonZer_Santized_versionDictWithLabels)
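
# A small sketch (synthetic data) of the point printed above: agglomerative
# clustering exposes labels_ but, unlike KMeans, has no cluster_centers_.
def _demoAgglomerativeLabels():
    import numpy as np
    from sklearn import cluster
    X = np.reshape([1.0, 1.1, 9.0, 9.2, 5.0], (-1, 1))
    agg = cluster.AgglomerativeClustering(n_clusters=2)
    labels = agg.fit_predict(X)
    print "labels:", labels  # centroids would need to be derived manually
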
def experiemnt_correlation(dbFileName, meanFlag, outputStrParam, clusterFlag):
    import correlation
    from sklearn import cluster

    clusteringType = None
    if clusterFlag:
        clusteringType = cluster.KMeans(n_clusters=2)
    else:
        clusteringType = cluster.AgglomerativeClustering(n_clusters=2)

    print "Performing experiment # Correlation: clustering scores into two clusters"
    versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName)
    sanitizedVersions = sanityCheck.getCodeQualityofVersions(versionAndCodeQualityDict, meanFlag)
    sanitizedVersions_CQ = sanitizedVersions
    # print "Sanitized versions that will be used in study ", len(sanitizedVersions)
    # print "Sanitized versions ..." , sanitizedVersions
    NonZero_sanitizedVersionsWithScore = sanityCheck.getNonZeroVulnerbailityScoreOfSelectedVersions(sanitizedVersions)
    # print "zzzz", len(NonZero_sanitizedVersionsWithScore)
    brokenDict = utility.getVScoreList(NonZero_sanitizedVersionsWithScore)
    onlyTheNonZeroSanitizedVersionIDs, onlyTheNonZeroSanitizedVScores = brokenDict[0], brokenDict[1]
    # print "lalalaa ", onlyTheNonZeroSanitizedVScores
    reshapedNonZerSanitizedScores = np.reshape(onlyTheNonZeroSanitizedVScores, (-1, 1))
    clusteringType.fit(reshapedNonZerSanitizedScores)
    labelsForVersions = clusteringType.labels_
    if clusterFlag:
        centroids = clusteringType.cluster_centers_
        print "And the centroids are .... ", centroids
        NonZer_Santized_versionDictWithLabels = utility.clusterByKmeansLabel(
            onlyTheNonZeroSanitizedVersionIDs, labelsForVersions
        )
    else:
        print "No centroids for agglomerative clustering"
        NonZer_Santized_versionDictWithLabels = utility.clusterByAggoloLabel(
            onlyTheNonZeroSanitizedVersionIDs, labelsForVersions
        )
    # print "And the labels are .... "
    # print labelsForVersions

    # print "versionDictWithLabels"
    # print len(versionDictWithLabels)
    onlyHighV_Scores_Dict = utility.getH_Scores_ForCorr(
        NonZer_Santized_versionDictWithLabels, NonZero_sanitizedVersionsWithScore
    )
    correlation.performCorrBasedOnIndiMetrics(onlyHighV_Scores_Dict, sanitizedVersions_CQ)
"""
Created on Sun Feb 14 21:35:01 2016

@author: akond
"""



import DataExtractionFromTables as DEFT
dbFileName="/Users/akond/Documents/Spring-2016/CSC522/OSSAndroidAppDataset/androSec.db"




# table 1 
androidManifestAppList = DEFT.getValuesFrom_Android_Manifest_AppInfo(dbFileName)
#print "Output for androidManifestApp ... ", androidManifestAppList

# table 2
Android_Manifest_CommitInfoList = DEFT.getValuesFrom_Android_Manifest_CommitInfo(dbFileName)
#print "Output for Android_Manifest_CommitInfo ... ", Android_Manifest_CommitInfoList

#table 3
Android_Manifest_intent_joinList = DEFT.getValuesFrom_android_Manifest_intent_join(dbFileName)
#print "Output for android_Manifest_intent_join ... ", Android_Manifest_intent_joinList

#table 4
Android_Manifest_IntentList = DEFT.getValuesFrom_Android_Manifest_Intent(dbFileName)
#print "Output for Android_Manifest_IntentList ... ", Android_Manifest_IntentList

# -*- coding: utf-8 -*-
"""
Created on Sun Feb 14 21:35:01 2016

@author: akond
"""

import DataExtractionFromTables as DEFT
dbFileName = "/Users/akond/Documents/Spring-2016/CSC522/OSSAndroidAppDataset/androSec.db"

# table 1
androidManifestAppList = DEFT.getValuesFrom_Android_Manifest_AppInfo(dbFileName)
#print "Output for androidManifestApp ... ", androidManifestAppList

# table 2
Android_Manifest_CommitInfoList = DEFT.getValuesFrom_Android_Manifest_CommitInfo(dbFileName)
#print "Output for Android_Manifest_CommitInfo ... ", Android_Manifest_CommitInfoList

# table 3
Android_Manifest_intent_joinList = DEFT.getValuesFrom_android_Manifest_intent_join(dbFileName)
#print "Output for android_Manifest_intent_join ... ", Android_Manifest_intent_joinList

# table 4
Android_Manifest_IntentList = DEFT.getValuesFrom_Android_Manifest_Intent(dbFileName)
#print "Output for Android_Manifest_IntentList ... ", Android_Manifest_IntentList

  print "------------------"
  print "Correlating vScore and major violations"
  _corr_score = doCorrelation( major_violationsCntList, vScoreList)
  print "Pearson:{}, P-P:{}, Spearman:{}, S-P:{}, MIC:{}, Non-Linearity:{}".format(_corr_score[0], _corr_score[1], _corr_score[2], _corr_score[3], _corr_score[4], _corr_score[5])
  _corr_score = []
  print "------------------"
  print "Correlating vScore and minor violations"
  _corr_score = doCorrelation( minor_violationsCntList, vScoreList)
  print "Pearson:{}, P-P:{}, Spearman:{}, S-P:{}, MIC:{}, Non-Linearity:{}".format(_corr_score[0], _corr_score[1], _corr_score[2], _corr_score[3], _corr_score[4], _corr_score[5])
############################ CORRELATION ZONE ##############################



import sanityCheck, utility
dbFileName = "/Users/akond/Documents/Spring-2016/CSC522/OSSAndroidAppDataset/androSec.db"
import DataExtractionFromTables as DEFT
#import correlation as corr_
versionAndCodeQualityDict = DEFT.getValuesFrom_CodingStandard(dbFileName)
sanitizedVersions_CQ = sanityCheck.getCodeQualityofVersions(versionAndCodeQualityDict)
print "Sanitized versions that will be used in study ", len(sanitizedVersions_CQ)
#print "Sanitized versions ..." , sanitizedVersions
##### all the vulnerability versions started 
sanitizedVersionsWithScore = sanityCheck.getVulnerbailityScoreOfSelectedVersions(sanitizedVersions_CQ)
### call for correlation 
#performCorrBasedOnIndiMetrics(sanitizedVersionsWithScore, sanitizedVersions_CQ)
##### all the vulnerability versions ended 



#######  high vScore versions started  
medianScore = 51.11111  ## median risk score recorded in the stats above
high_CQ_dict = utility.getHighVScoreVersions_CQ( sanitizedVersionsWithScore , sanitizedVersions_CQ , medianScore)
high_vScore_Dict = utility.getHighVScoreVersions_VScore(sanitizedVersionsWithScore, medianScore)