Example #1
def extract_metafeature(a):
    # Extract simple, statistical and information-theoretic meta-features
    # from DataFrame a, whose label is stored in column 2000.
    import pandas as pd
    from statistics import mean
    from sklearn.feature_selection import mutual_info_classif
    from info_gain import info_gain
    #statistical(3)
    #print(mean(a.kurtosis()))
    #print(mean(a.skew()))
    #print (mean(a.mean()))
    y = a[2000]
    X = a.drop(2000, axis=1)
    ft2 = pd.DataFrame({
     #simple
    'nr_instances':[len(a)],
    'nr_features':[len(a.columns)],
    'nr_missing_values':[a.isnull().sum().sum()],
    #statistical 
    #"max_value":[a.values.max()],
    #"min_value":[a.values.min()],
    'mean_kurtosis':[mean(a.kurtosis())],
    'mean_skewness':[mean(a.skew())],
    'mean':[mean(a.mean())],
    #information_theoretic
    #'MI':[mean(mutual_info_classif(X, y))],
    #model_based
    'Info_gain':[info_gain.info_gain(X,y)],
    #'Intistic_value':[info_gain.intrinsic_value(X,y)],
    'Inf_gain_ratio':[info_gain.info_gain_ratio(X,y)]
    })
    
    return(ft2)
Example #2
def informationGain(data):
    # Print the information gain ratio of every attribute (all columns except
    # the last, which holds the class label) with respect to that label.
    label = [x[-1] for x in data]
    for i in range(len(data[0]) - 1):
        attribute = [x[i] for x in data]
        igr = info_gain.info_gain_ratio(label, attribute)
        print(igr)
    draw(data)
def calc_gr(df, label_column, progress=True):
    """Calculated the Gain Ratio for each column of a df in relation to a 
    specified label_column.

    Args:
        df (pd.DataFrame): Dataframe the Gain Ratio values need to be 
            calculated for.
        label_column (str): Name of the label_column.
        progress (bool, optional): If True, progress bars will be shown to inform the 
            user about the progress made by the process. Defaults to True.

    Raises:
        RuntimeWarning: Is raised if the gain ratio calculation fails 
            for a column (and returns a nan).

    Returns:
        dict: Dictionary with the column names as keys and the corresponding
        Gain Ratio values as values.
    """

    gr_values = dict()

    if progress:
        iterator = tqdm(
            df.columns,
            desc="Greedy Top Down - (2/3) Calculating Gain Ratios.")
    else:
        iterator = df.columns

    for column in iterator:
        gr_values[column] = info_gain.info_gain_ratio(df[label_column],
                                                      df[column])

        # Check if the gain ratio was successfully calculated.
        if pd.isna(gr_values[column]):
            # If all values of the column are equal, the gain ratio is not
            # defined (nan) -> set it to 0.
            if len(np.unique(df[column])) == 1:
                gr_values[column] = 0
            else:
                raise RuntimeWarning("The information gain ratio of column " +
                                     column +
                                     " could not be calculated (is nan).")

    gr_values["VRN"] = 0.0

    return gr_values
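
A minimal usage sketch for calc_gr on a toy frame (the data and column names below are made up; numpy as np, pandas as pd, tqdm and info_gain are assumed to be imported in the module that defines calc_gr):

# Toy usage sketch for calc_gr; the data below is hypothetical.
import pandas as pd

toy = pd.DataFrame({
    'f1':    [1, 1, 2, 2],
    'f2':    [0, 1, 0, 1],
    'label': ['a', 'a', 'b', 'b'],
})

gain_ratios = calc_gr(toy, 'label', progress=False)
print(gain_ratios)  # gain ratio per column, plus the hard-coded 'VRN': 0.0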
Example #4
def extract_metafeature(a):
    # Meta-features of DataFrame a; the label is assumed to be the last column.
    # Requires: from statistics import mean; from info_gain import info_gain
    y = a[a.columns[-1]]
    X = a[a.columns[:-1]]
    return {
        #simple
        'nr_instances': len(a),
        'nr_features': len(a.columns),
        'nr_missing_values': a.isnull().sum().sum(),
        'mean_kurtosis': mean(a.kurtosis()),
        'mean_skewness': mean(a.skew()),
        'mean': mean(a.mean()),
        'Info_gain': info_gain.info_gain(X, y),
        'Inf_gain_ratio': info_gain.info_gain_ratio(X, y)
    }
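
The info_gain helpers operate on one-dimensional sequences, so a hedged alternative to passing the whole feature frame is to aggregate the gain ratio column by column; a minimal sketch on made-up data (the toy frame and mean_igr are illustrative names, not part of the original):

# Per-column aggregation sketch (toy data; same info_gain package as above).
import pandas as pd
from statistics import mean
from info_gain import info_gain

toy = pd.DataFrame({
    'x1': [0, 0, 1, 1],
    'x2': [1, 2, 2, 3],
    'y':  [0, 0, 1, 1],   # label in the last column
})

y = toy[toy.columns[-1]]
X = toy[toy.columns[:-1]]

# mean gain ratio of the feature columns with respect to the label
mean_igr = mean(info_gain.info_gain_ratio(y, X[c]) for c in X.columns)
print(mean_igr)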
def training_dic(label):
    manually_extracted_terms = ['heartbeat', 'ping', 'ping/echo', 'beat', 'decorator', 'piggybacking', 'outbound',
                                'period', 'audit', 'trail', 'wizard', 'log', 'string', 'category', 'thread', 'pooling',
                                'pool', 'thread', 'connect', 'sparrow', 'processor', 'worker', 'time-wait', 'prototype',
                                'singleton', 'strategy', 'chain of responsibility', 'lazy load', 'static scheduling',
                                'dynamic priority scheduling', 'authentic', 'credential', 'challenge', 'login', 'FIFO',
                                'fixed-priority', 'dynamic priority scheduling', 'schedule', 'task', 'priority',
                                'adaptor', 'bridge', 'composite', 'flyweight', 'memento', 'observer', 'proxy',
                                'strategy', 'checkpoint', 'checkpoints', 'barrier', 'weak point', 'layoff', 'restraint',
                                'austerity', 'abridgement', 'deliver', 'spare', 'unoccupied', 'option', 'unused',
                                'logging', 'minutes', 'redundancy replication', 'redundancy storage', 'zone-redundant',
                                'geo-redundant', 'replication', 'voting', 'vote', 'balloting', 'choosing', 'voter',
                                'processor', 'preferred', 'shadow operation', 'shadow mode', 'secure session',
                                'security', 'removal', 'time out', 'run out', 'constraint', 'action', 'monitor',
                                'timer', 'runtime', 'time stamp', 'timestamp', 'time strap', 'sanity checking',
                                'sanity check', 'functional redundancy', 'function requirement allocation', 'parallel',
                                'separate', 'warm restart', 'dual redundancy', 'resisting attacks', 'detecting',
                                'detect', 'recovering', 'recover', 'sensor', 'authenticate', 'confidentiality',
                                'exposure', 'limit access', 'passwords', 'one-time', 'passwords',
                                'digital certificates', 'maintain data confidentiality', 'handle', 'protecting',
                                'routine', 'storage', 'mandatory', 'recovering from attacks', 'state', 'maintain',
                                'maintaining', 'redundant', 'access control', 'profile', 'performance',
                                'processing_time', 'response_time', 'resource_consumption', 'throughput', 'efficiency',
                                'carrying_into_action', 'carrying_out', 'operation', 'achievement', 'interaction',
                                'accomplishment', 'action', 'maintainability', 'update', 'modify', 'modular',
                                'decentralized', 'encapsulation', 'dependency', 'interdependent', 'interdependent',
                                'understandability', 'modifiability', 'modularity', 'maintain', 'analyzability',
                                'changeability', 'testability', 'encapsulation', 'compatibility', 'co-existence',
                                'interoperability', 'exchange', 'sharing', 'usability', 'flexibility', 'interface',
                                'user-friendly', 'default', 'configure', 'serviceability', 'convention',
                                'accessibility', 'gui', 'serviceableness', 'useableness', 'utility', 'useable',
                                'learnability', 'understandability', 'operability', 'function', 'use', 'reliability',
                                'failure', 'bug', 'resilience', 'crash', 'stability', 'dependable', 'dependability',
                                'irresponsibleness', 'recover', 'recoverability', 'tolerance', 'error', 'fails',
                                'redundancy', 'integrity', 'irresponsibleness', 'dependable', 'maturity',
                                'recoverability', 'accountability', 'answerableness', 'functional', 'function',
                                'accuracy', 'completeness', 'suitability', 'compliance', 'performing', 'employable',
                                'functionality', 'complexity', 'functioning', 'security', 'safe', 'vulnerability',
                                'trustworthy', 'firewall', 'login', 'password', 'pin', 'auth', 'verification',
                                'protection', 'certificate', 'security_system', 'law', 'portability', 'portable',
                                'cross_platform', 'transfer', 'transformability', 'documentation', 'standardized',
                                'migration', 'specification', 'movability', 'moveableness', 'replaceability',
                                'adaptability']

    manually_extracted_terms = set(manually_extracted_terms)

    # Load architecture posts from SO (assumed path, matching the later
    # variant of this function)
    architecture_posts = pandas.read_excel(
        '../dataset/Architecture posts - Stack Overflow.xlsx')

    # preprocess
    pre_arch_posts = text_preprocess(architecture_posts)

    # training by Word2Vec
    items = []

    for s in pre_arch_posts:
        items.append(str(s).split(' '))

    model = gensim.models.word2vec.Word2Vec(items, min_count=2, size=150)
    automatic_extracted_terms = set()

    for w in manually_extracted_terms:
        if w in model:
            for t in model.wv.similar_by_word(w):
                if t[1] > 0.350:
                    automatic_extracted_terms.add(t[0])

    bow_vector = CountVectorizer()
    bow_arch_data = bow_vector.fit_transform(pre_arch_posts).toarray()

    # iterate over a copy so terms can be removed from the set while filtering
    for t in list(automatic_extracted_terms):
        # find term index in bow vocabulary
        index = bow_vector.vocabulary_.get(t, -1)

        if index != -1:
            # filtering terms by calculating information gain ratio
            igr = info_gain.info_gain_ratio(bow_arch_data[:, index], label)

            if igr < 0.300:
                automatic_extracted_terms.remove(t)

    final_terms = manually_extracted_terms | automatic_extracted_terms

    return final_terms
Example #6
def run_evaluation(mode):
    # Train a defect predictor on subject/train.csv, evaluate it on subject/test.csv,
    # and write the class rankings needed for the effort-aware measures
    # (P_opt, IFA) to CSV files under the subject directory.

    All_Bug_num=0
    All_Loc_num=0
    
    
    data_train,label_train=read_data_python(subject+'/train.csv')
    data_test,label_test=read_data_python(subject+'/test.csv')
    
    print(data_train.shape)
    this_column=data_train[:,3]
    print(info_gain.info_gain_ratio(label_train,this_column))
    
#     min_max_scaler = preprocessing.MinMaxScaler()
#     data_train_minmax = min_max_scaler.fit_transform(data_train)
#     data_test_minmax = min_max_scaler.transform(data_test)
#     
#     data_train=data_train_minmax
#     data_test=data_test_minmax
    
#     if(mode=="origin"):
#         min_max_scaler = preprocessing.MinMaxScaler()
#         data_train_minmax = min_max_scaler.fit_transform(data_train)
#         data_test_minmax = min_max_scaler.transform(data_test)
#          
#         data_train=data_train_minmax
#         data_test=data_test_minmax
#          
#     if(mode=="all"):
#          
#         tradi_list=[]
#         embed_list=[]
#         for j in range(0,20):
#             tradi_list.append(j)
#         for j in range(20,52):
#             embed_list.append(j)
#          
#         min_max_scaler = preprocessing.MinMaxScaler()
#         tradi_data_train=data_train[:,tradi_list]
#         tradi_data_test=data_test[:,tradi_list]
#          
#         embed_data_train=data_train[:,embed_list]
#         embed_data_test=data_test[:,embed_list]
#          
#         data_train_minmax_tradi = min_max_scaler.fit_transform(tradi_data_train)
#         data_test_minmax_tradi = min_max_scaler.transform(tradi_data_test)
#          
#         data_train=numpy.hstack((data_train_minmax_tradi,embed_data_train))
#         data_test=numpy.hstack((data_test_minmax_tradi,embed_data_test))
    
    # LOC and defect count of each class, read from the process metrics file
    package_loc_dict={}
    package_defect_dict={}
    
    defect_file=open(subject+'/Process-Origin.csv','r')
    lines=defect_file.readlines()
    for index,each_line in enumerate(lines):
        if(index!=0):
            records=each_line.strip('\n').split(',')
            class_name=records[0]
            defect_count=records[21]
            package_defect_dict[class_name]=int(defect_count)
            package_loc_dict[class_name]=records[11]

    class_test_defect_dense={}
    class_test_defect={}
    class_test_loc={}
    class_name_list=[]
    test_instance_num=0
    
    
    test_file=open(subject+'/test.csv','r')
    lines=test_file.readlines()
    for index,each_line in enumerate(lines):
        if(not index==0):
            records=each_line.strip('\n').split(',')
            class_name=records[0]
            class_name_list.append(class_name)
            class_test_defect_dense[class_name]=float(package_defect_dict[class_name])/(float(package_loc_dict[class_name])+1)
            class_test_defect[class_name]=package_defect_dict[class_name]        
            test_instance_num=test_instance_num+1
            class_test_loc[class_name]=float(package_loc_dict[class_name])
            
            All_Loc_num=All_Loc_num+float(package_loc_dict[class_name])
            All_Bug_num=All_Bug_num+int(package_defect_dict[class_name])
            
    # "optimal" ranking: classes sorted by defect density (defects per LOC), descending
    defect_order=sorted(class_test_defect_dense.items(), key=lambda x:x[1], reverse=True)
    
    order_file=open(subject+'/optimal.csv','w')
    for each_turple in defect_order:
        each_class=each_turple[0]
        order_file.write(each_class+','+str(class_test_defect[each_class])+','+str(package_loc_dict[each_class])+'\n')
    order_file.close()
    
    # "worst" ranking: ascending defect density
    reverse_order=sorted(class_test_defect_dense.items(), key=lambda x:x[1], reverse=False)
    reverse_file=open(subject+'/worst.csv','w')
    for each_turple in reverse_order:
        each_class=each_turple[0]
        reverse_file.write(each_class+','+str(class_test_defect[each_class])+','+str(package_loc_dict[each_class])+'\n')
    reverse_file.close()
    
    # Oversample with SMOTE only when defective instances are the minority class
    if(label_sum(label_train)>(len(label_train)/2)):
        print("The training data does not need balancing.")
        predprob_auc,predprob,precision,recall,fmeasure,auc,mcc=classifier_output(classifier,data_train,label_train,data_test,label_test,grid_sear=True)
        print(precision,recall,fmeasure,auc,mcc)
        if(mode=="origin"):
            Precision_list_origin.append(precision)
            Recall_list_origin.append(recall)
            F_measure_list_origin.append(fmeasure)
            AUC_list_origin.append(auc)
            MCC_list_origin.append(mcc)
        elif(mode=="vector"):
            Precision_list_vector.append(precision)
            Recall_list_vector.append(recall)
            F_measure_list_vector.append(fmeasure)
            AUC_list_vector.append(auc)
            MCC_list_vector.append(mcc)
        elif(mode=="all"):
            Precision_list_all.append(precision)
            Recall_list_all.append(recall)
            F_measure_list_all.append(fmeasure)
            AUC_list_all.append(auc)
            MCC_list_all.append(mcc)
    else:
#         smo = SVMSMOTE()
#         data_bin_, label_bin_= smo.fit_sample(data_train, label_train)
        opt_para=[5,200,2]
        data_bin_, label_bin_=smote_wrapper(opt_para, data_train, label_train)
        predprob_auc,predprob,precision,recall,fmeasure,auc,mcc=classifier_output(classifier,data_bin_,label_bin_,data_test,label_test,grid_sear=True)#False is only for debugging.
        print(precision,recall,fmeasure,auc,mcc)
        if(mode=="origin"):
            Precision_list_origin.append(precision)
            Recall_list_origin.append(recall)
            F_measure_list_origin.append(fmeasure)
            AUC_list_origin.append(auc)
            MCC_list_origin.append(mcc)
        elif(mode=="vector"):
            Precision_list_vector.append(precision)
            Recall_list_vector.append(recall)
            F_measure_list_vector.append(fmeasure)
            AUC_list_vector.append(auc)
            MCC_list_vector.append(mcc)
        elif(mode=="all"):
            Precision_list_all.append(precision)
            Recall_list_all.append(recall)
            F_measure_list_all.append(fmeasure)
            AUC_list_all.append(auc)
            MCC_list_all.append(mcc)
    
    
    # Effort-aware scores: predicted defect probability per LOC, optionally
    # weighted by the file's coreness (file_core_dict)
    class_in_prediction_effortaware={}
    class_in_prediction_effortaware_coreness={}
    for i in range(len(predprob_auc)):
        class_name=class_name_list[i]
        predict_result=predprob_auc[i]

        if(float(package_loc_dict[class_name])==0.0):
            class_in_prediction_effortaware[class_name]=0.0
        else:
            class_in_prediction_effortaware[class_name]=float(predict_result)/class_test_loc[class_name]
        
        if(float(package_loc_dict[class_name])==0.0):
            class_in_prediction_effortaware_coreness[class_name]=0.0
        else:
            if(not class_name in file_core_dict):
                class_in_prediction_effortaware_coreness[class_name]=float(predict_result)/class_test_loc[class_name]
            else:
                class_in_prediction_effortaware_coreness[class_name]=float(predict_result)*file_core_dict[class_name]/class_test_loc[class_name]
            
    model_file=open(subject+'/model.csv','w')
    effort_order=sorted(class_in_prediction_effortaware.items(), key=lambda x:x[1], reverse=True)
    for each_turple in effort_order:
        class_name=each_turple[0]
        model_file.write(class_name+','+str(class_test_defect[class_name])+','+str(package_loc_dict[class_name])+','+str(each_turple[1])+'\n')
    model_file.close()
    
    effort_file=open(subject+'/effort_core.csv','w')        
    effort_order=sorted(class_in_prediction_effortaware_coreness.items(), key=lambda x:x[1], reverse=True)
    for each_turple in effort_order:
        class_name=each_turple[0]
        effort_file.write(class_name+','+str(class_test_defect[class_name])+','+str(package_loc_dict[class_name])+','+str(each_turple[1])+'\n')
    effort_file.close()
    
    loc_order=sorted(class_test_loc.items(), key=lambda x:x[1], reverse=False)
    loc_file=open(subject+'/loc.csv','w')
    for each_turple in loc_order:
        each_class=each_turple[0]
        loc_file.write(each_class+','+str(class_test_defect[each_class])+','+str(package_loc_dict[each_class])+'\n')
    loc_file.close()
    
    
    class_in_prediction_effortaware_positive={}
    class_in_prediction_effortaware_negative={}
    
    class_in_prediction_core_positive={}
    class_in_prediction_core_negative={}
    
    for i in range(len(predprob_auc)):
        class_name=class_name_list[i]
        predict_result=predprob_auc[i]
        
        if(predict_result>=0.5):
            if(float(package_loc_dict[class_name])==0.0):
                class_in_prediction_effortaware_positive[class_name]=0.0
                class_in_prediction_core_positive[class_name]=0.0
            else:
                class_in_prediction_effortaware_positive[class_name]=float(predict_result)/class_test_loc[class_name]
                if(not class_name in file_core_dict):
                    class_in_prediction_core_positive[class_name]=float(predict_result)/class_test_loc[class_name]
                else:
                    class_in_prediction_core_positive[class_name]=float(predict_result)*file_core_dict[class_name]/class_test_loc[class_name]
        else:
            if(float(package_loc_dict[class_name])==0.0):
                class_in_prediction_effortaware_negative[class_name]=0.0
                class_in_prediction_core_negative[class_name]=0.0
            else:
                class_in_prediction_effortaware_negative[class_name]=float(predict_result)/class_test_loc[class_name]
                if(not class_name in file_core_dict):
                    class_in_prediction_core_negative[class_name]=float(predict_result)/class_test_loc[class_name]
                else:
                    class_in_prediction_core_negative[class_name]=float(predict_result)*file_core_dict[class_name]/class_test_loc[class_name]
    
    effort_file=open(subject+'/effort_CBS.csv','w')         
    effort_order=sorted(class_in_prediction_effortaware_positive.items(), key=lambda x:x[1], reverse=True)
    for each_turple in effort_order:
        class_name=each_turple[0]
        effort_file.write(class_name+','+str(class_test_defect[class_name])+','+str(package_loc_dict[class_name])+','+str(each_turple[1])+'\n')
    effort_order=sorted(class_in_prediction_effortaware_negative.items(), key=lambda x:x[1], reverse=True)
    for each_turple in effort_order:
        class_name=each_turple[0]
        effort_file.write(class_name+','+str(class_test_defect[class_name])+','+str(package_loc_dict[class_name])+','+str(each_turple[1])+'\n')
    effort_file.close()
    
    effort_file=open(subject+'/effort_CBS_core.csv','w') 
    effort_order=sorted(class_in_prediction_core_positive.items(), key=lambda x:x[1], reverse=True)
    for each_turple in effort_order:
        class_name=each_turple[0]
        effort_file.write(class_name+','+str(class_test_defect[class_name])+','+str(package_loc_dict[class_name])+','+str(each_turple[1])+'\n')
    effort_order=sorted(class_in_prediction_core_negative.items(), key=lambda x:x[1], reverse=True)
    for each_turple in effort_order:
        class_name=each_turple[0]
        effort_file.write(class_name+','+str(class_test_defect[class_name])+','+str(package_loc_dict[class_name])+','+str(each_turple[1])+'\n')
    effort_file.close()
    
    twenty_per=float(All_Loc_num)/5  # 20% of the total LOC
 
    IFA_model=output_results('model',twenty_per,All_Loc_num,All_Bug_num,mode)      
    output_results('optimal',twenty_per,All_Loc_num,All_Bug_num,mode)
    output_results('worst',twenty_per,All_Loc_num,All_Bug_num,mode)
    output_results('loc',twenty_per,All_Loc_num,All_Bug_num,mode)
    output_results('effort_core',twenty_per,All_Loc_num,All_Bug_num,mode)
    IFA_CBS=output_results('effort_CBS',twenty_per,All_Loc_num,All_Bug_num,mode)
    output_results('effort_CBS_core',twenty_per,All_Loc_num,All_Bug_num,mode)
    
    if(mode=='origin'):
        IFA_list_origin.append(IFA_model)
        IFA_list_CBS_origin.append(IFA_CBS)
    if(mode=='vector'):
        IFA_list_vector.append(IFA_model)
        IFA_list_CBS_vector.append(IFA_CBS)
    if(mode=='all'):
        IFA_list_all.append(IFA_model)
        IFA_list_CBS_all.append(IFA_CBS)
    
    result_file=open(subject+"/ALL_POPT_Record"+"_"+mode+".csv","a")
    
    result_class_file=open(subject+"/ALL_Classification_Record"+"_"+mode+".csv","a")
    
    model_matrix = numpy.loadtxt(open(subject+"/RESULT_model.csv","rb"),delimiter=",",skiprows=0)
    optimal_matrix = numpy.loadtxt(open(subject+"/RESULT_optimal.csv","rb"),delimiter=",",skiprows=0)
    worst_matrix = numpy.loadtxt(open(subject+"/RESULT_worst.csv","rb"),delimiter=",",skiprows=0)
    loc_matrix = numpy.loadtxt(open(subject+"/RESULT_loc.csv","rb"),delimiter=",",skiprows=0)
    effortcore_matrix = numpy.loadtxt(open(subject+"/RESULT_effort_core.csv","rb"),delimiter=",",skiprows=0)
    CBS_matrix = numpy.loadtxt(open(subject+"/RESULT_effort_CBS.csv","rb"),delimiter=",",skiprows=0)
    CBScore_matrix = numpy.loadtxt(open(subject+"/RESULT_effort_CBS_core.csv","rb"),delimiter=",",skiprows=0)
    
    # area under each effort-vs-defect curve, used for the normalised P_opt below
    model=numpy.trapz(model_matrix[:,1],x=model_matrix[:,0])
    optimal=numpy.trapz(optimal_matrix[:,1],x=optimal_matrix[:,0])
    
    
    worst=numpy.trapz(worst_matrix[:,1],x=worst_matrix[:,0])
    loc=numpy.trapz(loc_matrix[:,1],x=loc_matrix[:,0])
    effortcore=numpy.trapz(effortcore_matrix[:,1],x=effortcore_matrix[:,0])
    CBS=numpy.trapz(CBS_matrix[:,1],x=CBS_matrix[:,0])
    CBScore=numpy.trapz(CBScore_matrix[:,1],x=CBScore_matrix[:,0])
    
    # normalise each area between the worst and the optimal ranking
    P_opt_model=(model-worst)/(optimal-worst)
    P_opt_loc=(loc-worst)/(optimal-worst)
    P_opt_effortcore=(effortcore-worst)/(optimal-worst)
    P_opt_CBS=(CBS-worst)/(optimal-worst)
    P_opt_CBScore=(CBScore-worst)/(optimal-worst)
    
    result_class_file.write(str(precision)+','+str(recall)+','+str(fmeasure)+','+str(auc)+','+str(mcc)+"\n")
    result_class_file.flush()
    result_class_file.close()
    
    result_file.write(str(P_opt_model)+','+str(P_opt_effortcore)+','+str(P_opt_CBS)+','+str(P_opt_CBScore)+"\n")
    result_file.flush()
    result_file.close()
    
    return P_opt_model,P_opt_effortcore,P_opt_CBS,P_opt_CBScore
Example #7
origin_data=read_data(subject+'/Process-Binary.csv')

vector_text=read_text_data(subject+'/Process-Vector.csv')

all_text=read_text_data(subject+'/Process-All.csv')

data_all,label_all=read_data_python(subject+'/Process-All.csv')
print(data_all.shape)
IGR_result_file=open(subject+'/IGR_results.csv','w')
num_metrics=data_all.shape[1]
metrics_cursor=0
IGR_dict={}

# information gain ratio of every metric column with respect to the label
while(metrics_cursor<num_metrics):
    this_column=data_all[:,metrics_cursor]
    IGR=info_gain.info_gain_ratio(label_all,this_column)
    IGR_dict[metrics_cursor]=IGR
    metrics_cursor=metrics_cursor+1

# write all gain ratios (descending) and record the indices of the top 10 metrics
all_IGR_file.write(subject)
IGR_order=sorted(IGR_dict.items(), key=lambda x:x[1], reverse=True)
IGR_counter=0
for each_turple in IGR_order:
    IGR_result_file.write(str(each_turple[0])+','+str(each_turple[1])+'\n')
    IGR_result_file.flush()
    if(IGR_counter<10):
        all_IGR_file.write(','+str(each_turple[0]))
        IGR_counter=IGR_counter+1
all_IGR_file.write('\n')
all_IGR_file.flush()
 
def training_dic(QA_AT_data, QA_AT_label):
    manually_extracted_term_lists = [
        'heartbeat', 'ping', 'ping/echo', 'beat', 'decorator', 'piggybacking',
        'outbound', 'period', 'audit', 'trail', 'wizard', 'log', 'string',
        'category', 'thread', 'pooling', 'pool', 'thread', 'connect',
        'sparrow', 'processor', 'worker', 'time-wait', 'prototype',
        'singleton', 'strategy', 'chain of responsibility', 'lazy load',
        'static scheduling', 'dynamic priority scheduling', 'authentic',
        'credential', 'challenge', 'login', 'FIFO', 'fixed-priority',
        'dynamic priority scheduling', 'schedule', 'task', 'priority',
        'adaptor', 'bridge', 'composite', 'flyweight', 'memento', 'observer',
        'proxy', 'strategy', 'checkpoint', 'checkpoints', 'barrier',
        'weak point', 'layoff', 'restraint', 'austerity', 'abridgement',
        'deliver', 'spare', 'unoccupied', 'option', 'unused', 'logging',
        'minutes', 'redundancy replication', 'redundancy storage',
        'zone-redundant', 'geo-redundant', 'replication', 'voting', 'vote',
        'balloting', 'choosing', 'voter', 'processor', 'preferred',
        'shadow operation', 'shadow mode', 'secure session', 'security',
        'removal', 'time out', 'run out', 'constraint', 'action', 'monitor',
        'timer', 'runtime', 'time stamp', 'timestamp', 'time strap',
        'sanity checking', 'sanity check', 'functional redundancy',
        'function requirement allocation', 'parallel', 'separate',
        'warm restart', 'dual redundancy', 'resisting attacks', 'detecting',
        'detect', 'recovering', 'recover', 'sensor', 'authenticate',
        'confidentiality', 'exposure', 'limit access', 'passwords', 'one-time',
        'passwords', 'digital certificates', 'maintain data confidentiality',
        'handle', 'protecting', 'routine', 'storage', 'mandatory',
        'recovering from attacks', 'state', 'maintain', 'maintaining',
        'redundant', 'access control', 'profile', 'performance',
        'processing_time', 'response_time', 'resource_consumption',
        'throughput', 'efficiency', 'carrying_into_action', 'carrying_out',
        'operation', 'achievement', 'interaction', 'accomplishment', 'action',
        'maintainability', 'update', 'modify', 'modular', 'decentralized',
        'encapsulation', 'dependency', 'interdependent', 'interdependent',
        'understandability', 'modifiability', 'modularity', 'maintain',
        'analyzability', 'changeability', 'testability', 'encapsulation',
        'compatibility', 'co-existence', 'interoperability', 'exchange',
        'sharing', 'usability', 'flexibility', 'interface', 'user-friendly',
        'default', 'configure', 'serviceability', 'convention',
        'accessibility', 'gui', 'serviceableness', 'useableness', 'utility',
        'useable', 'learnability', 'understandability', 'operability',
        'function', 'use', 'reliability', 'failure', 'bug', 'resilience',
        'crash', 'stability', 'dependable', 'dependability',
        'irresponsibleness', 'recover', 'recoverability', 'tolerance', 'error',
        'fails', 'redundancy', 'integrity', 'irresponsibleness', 'dependable',
        'maturity', 'recoverability', 'accountability', 'answerableness',
        'functional', 'function', 'accuracy', 'completeness', 'suitability',
        'compliance', 'performing', 'employable', 'functionality',
        'complexity', 'functioning', 'security', 'safe', 'vulnerability',
        'trustworthy', 'firewall', 'login', 'password', 'pin', 'auth',
        'verification', 'protection', 'certificate', 'security_system', 'law',
        'portability', 'portable', 'cross_platform', 'transfer',
        'transformability', 'documentation', 'standardized', 'migration',
        'specification', 'movability', 'moveableness', 'replaceability',
        'adaptability'
    ]

    manually_extracted_terms = set(manually_extracted_term_lists)

    # Load architecture posts from SO
    architecture_posts = pandas.read_excel(
        '../dataset/Architecture posts - Stack Overflow.xlsx',
        0,
        usecols=[1, 5, 6],
        names=['title', 'question', 'answers'])
    arch_items = []
    arch_terms = set()

    for i in range(len(architecture_posts)):
        arch_items.append(
            str(architecture_posts['title'][i]) + ' ' +
            str(architecture_posts['question'][i]) + ' ' +
            str(architecture_posts['answers'][i]))

    # preprocess the architecture posts, then tokenise them and collect their vocabulary
    arch_items = text_preprocess(arch_items)

    for i in range(len(arch_items)):
        arch_items[i] = str(arch_items[i]).split(' ')

        for w in arch_items[i]:
            arch_terms.add(w)

    QA_AT_items = []

    for i in range(len(QA_AT_data)):
        if QA_AT_label[i] == 1:
            QA_AT_items.append(str(QA_AT_data[i]).split(' '))

    # training by Word2Vec
    items = arch_items + QA_AT_items

    word2vec_model = gensim.models.word2vec.Word2Vec(items, size=150)
    automatic_extracted_terms = set()

    for w in manually_extracted_terms:
        if w in word2vec_model:
            for t in word2vec_model.wv.similar_by_word(w):
                # retain terms that are in architecture posts and have a similarity greater than 0.35
                if t[1] > 0.35 and t[0] in arch_terms:
                    automatic_extracted_terms.add(t[0])

    bow_vector = CountVectorizer()
    bow_QA_AT_data = bow_vector.fit_transform(QA_AT_data).todense()

    final_training_dictionary = set()
    v_len = bow_QA_AT_data.shape[0]

    for t in automatic_extracted_terms:
        # find current term's index in bow vocabulary
        index = bow_vector.vocabulary_.get(t, -1)

        if index != -1:
            # represent the distribution of current term in QA_AT data
            v = np.reshape(bow_QA_AT_data[:, index], v_len).tolist()[0]
            igr = info_gain.info_gain_ratio(v, QA_AT_label)

            if igr > 0.30:
                # retain terms in QA_AT data and have an information gain ratio greater than 0.30
                final_training_dictionary.add(t)

    final_training_dictionary = manually_extracted_terms | final_training_dictionary

    for t in final_training_dictionary:
        print(t)

    return final_training_dictionary, word2vec_model
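
The core of training_dic is the Word2Vec-based expansion of the manually extracted seed terms; the following is a self-contained sketch of just that step on toy sentences, written against the current gensim 4 API (vector_size, key_to_index, most_similar) rather than the older size/similar_by_word calls used above:

# Seed-term expansion sketch (toy sentences; gensim 4 API).
import gensim

sentences = [
    ['heartbeat', 'monitor', 'detects', 'node', 'failure'],
    ['ping', 'echo', 'checks', 'node', 'availability'],
    ['logging', 'records', 'system', 'events'],
] * 50  # repeat the toy corpus so every word satisfies min_count

model = gensim.models.word2vec.Word2Vec(sentences, vector_size=50, min_count=2)

expanded_terms = set()
for seed in ['heartbeat', 'ping']:
    if seed in model.wv.key_to_index:
        for term, similarity in model.wv.most_similar(seed, topn=10):
            # keep neighbours above the similarity threshold used above (0.35)
            if similarity > 0.35:
                expanded_terms.add(term)

print(expanded_terms)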
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report

g_nb = GaussianNB(priors=None)
g_nb_fit = g_nb.fit(x_train, y_train)

g_nb_pred = g_nb.predict(x_test)
print(confusion_matrix(y_test, g_nb_pred))
print('\n')
print(classification_report(y_test, g_nb_pred))
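
The x_train/x_test/y_train/y_test above come from an earlier, unshown cell; a self-contained variant of the same Gaussian Naive Bayes evaluation on synthetic data might look like this (all data below is generated, not taken from the notebook):

# Self-contained Gaussian Naive Bayes sketch on synthetic data.
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 4))
y = (X[:, 0] + X[:, 1] > 0).astype(int)

x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)

g_nb = GaussianNB()
g_nb.fit(x_train, y_train)
g_nb_pred = g_nb.predict(x_test)

print(confusion_matrix(y_test, g_nb_pred))
print(classification_report(y_test, g_nb_pred))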

#info_gain,Gain_Ratio/14columns
!pip install info_gain
from info_gain import info_gain
noShow_plus = noShow.drop('Status',axis=1)
for item in noShow_plus:
  ig = info_gain.info_gain(noShow[item], noShow['Status'])
  igr = info_gain.info_gain_ratio(noShow[item], noShow['Status'])

  print("%s的info_gain:" %(item),ig)
  print("%s的Gain_Ratio:" %(item),igr)

#info_gain,Gain_Ratio/16columns
!pip install info_gain
from info_gain import info_gain
noShow_plus = noShow.drop('Status',axis=1)
for item in noShow_plus:
  ig = info_gain.info_gain(noShow[item], noShow['Status'])
  igr = info_gain.info_gain_ratio(noShow[item], noShow['Status'])

  print("%s的info_gain:" %(item),ig)
  print("%s的Gain_Ratio:" %(item),igr)
Example #10
#CORRELATION ANALYSIS (numerical)
#N1 and N2 appear to have the highest correlation to the output but also seem to suffer from multicollinearity.
#A somewhat significant negative correlation also appears between N2 and N3
#A somewhat significant positive correlation also appears between N4 and N5
plt.figure(figsize=(10, 7))
plot = sn.heatmap(
    df.corr(), square=True,
    annot=True).get_figure().savefig("Viz/correlation_matrix.png")
plt.clf()

#INFORMATION GAIN RATIO ANALYSIS (categorical)
#Mostly low values. C1 and C2 are the highest
print("\nInformation Gain Ratio:")
for column in df[df.columns[pd.Series(df.columns).str.startswith('C')]]:
    print("Info gain ratio for column ", column, ": ",
          info_gain.info_gain_ratio(df[column], df['LABEL']))

#plot = sn.scatterplot(x="N3", y="N2", data=df[['N3','N2']]).get_figure().savefig("N1_N2_scatter.png")
#plt.clf()

#==============================================================================
# FEATURE ENGINEERING
#==============================================================================

#CATEGORICAL VARIABLES 1-HOT ENCODING
df = pd.get_dummies(df)

#Plot new correlation matrix
plt.figure(figsize=(50, 47))
plot = sn.heatmap(
    (df[df.columns[pd.Series(df.columns).str.contains('_')]].join(