def do_local_train_test_split(self, train_obj: StyloFeatures, trainproblemlength,
                              config_learning, threshold):
    # Inner stratified split (grouped by problem id) on the training data only.
    skflocal = StratifiedKFoldProblemId.StratifiedKFoldProblemId(
        iids=train_obj.getiids(), n_splits=3, shuffle=True, random_state=411,
        nocodesperprogrammer=trainproblemlength)

    listoftraintestsplits = []
    for train_index, test_index in skflocal.split(None, None):
        trainfiles: StyloFeatures = train_obj[train_index]
        testfiles: StyloFeatures = train_obj[test_index]

        # Fit tf-idf / feature selection on the local train split and apply it to the local test split.
        trainfiles, testfiles = self._tfidf_feature_selection(
            train_obj=trainfiles, test_obj=testfiles,
            config_learning=config_learning, threshold=threshold)

        listoftraintestsplits.append([
            trainfiles.getfeaturematrix(), testfiles.getfeaturematrix(),
            trainfiles.getlabels(), testfiles.getlabels()
        ])

    return listoftraintestsplits
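
# A minimal usage sketch (not part of the original pipeline) showing how the returned
# [x_train, x_test, y_train, y_test] splits could be consumed, e.g. to estimate an
# inner-fold accuracy. The RandomForestClassifier below is only an illustrative
# stand-in estimator, not necessarily the one used elsewhere in this project.
from sklearn.ensemble import RandomForestClassifier


def mean_inner_fold_accuracy(listoftraintestsplits) -> float:
    accs = []
    for x_train, x_test, y_train, y_test in listoftraintestsplits:
        clf = RandomForestClassifier(n_estimators=100, random_state=411, n_jobs=-1)
        clf.fit(x_train, y_train)
        accs.append(clf.score(x_test, y_test))
    return sum(accs) / len(accs)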
def learn_process(features_merged: StyloFeatures, learn_method: str,
                  learn_config: typing.Union[ConfigurationLearning, ConfigurationLearningRNN]):
    skf2 = StratifiedKFoldProblemId.StratifiedKFoldProblemId(
        iids=features_merged.getiids(), n_splits=8, shuffle=True, random_state=411,
        nocodesperprogrammer=learn_config.probsperprogrammer)

    for train_index, test_index in skf2.split(None, None):
        curproblemid = "_".join(features_merged.getiids()[test_index[0]].split("_")[0:2])
        if curproblemid == PROBLEM_ID_LOADED:

            # I. Do classification
            accuracy, curtestlearnsetup = perform_standard_classification_for_split(
                features_merged=features_merged, train_index=train_index,
                test_index=test_index, problem_id_test=curproblemid,
                configuration_learning=learn_config, modelsavedir=None,
                threshold_sel=1.5, learn_method=learn_method, skf2=skf2)

            # II. Extract values and return them for checking...
            currentaccuracy = accuracy[curproblemid]
            print(currentaccuracy)
            print(curtestlearnsetup.data_final_test.getfeaturematrix().mean())
            print(curtestlearnsetup.data_final_train.getfeaturematrix().mean())
            print(curtestlearnsetup.data_final_test.getfeaturematrix()[:, :10].mean())
            print(curtestlearnsetup.data_final_train.getfeaturematrix().max())

            return currentaccuracy, \
                curtestlearnsetup.data_final_test.getfeaturematrix().mean(), \
                curtestlearnsetup.data_final_train.getfeaturematrix().mean(), \
                curtestlearnsetup.data_final_test.getfeaturematrix()[:, :10].mean(), \
                curtestlearnsetup.data_final_train.getfeaturematrix().max()
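
# An illustrative sketch (not from the original test suite) of how learn_process could
# back a regression check; `expected_accuracy` is a hypothetical reference value that
# would have to be measured once and stored alongside the test.
def check_learn_process(features_merged, learn_config, expected_accuracy: float) -> None:
    acc, test_mean, train_mean, test_head_mean, train_max = learn_process(
        features_merged=features_merged, learn_method="RF", learn_config=learn_config)
    assert abs(acc - expected_accuracy) < 1e-6, (acc, expected_accuracy)
    # The matrix statistics can be compared the same way once reference values are known.
    print(test_mean, train_mean, test_head_mean, train_max)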
def setUp(self):
    """
    Runs before any test.
    """
    configuration_learning: ConfigurationLearning = ConfigurationLearning(
        repo_path=Config.repo_path,
        dataset_features_dir=os.path.join(Config.repo_path, "data/dataset_2017"),
        suffix_data="_2017_8_formatted_macrosremoved",
        learnmodelspath=Config.learnmodelspath,
        use_lexems=False,
        use_lexical_features=False,
        stop_words=Config.stop_words_codestylo,
        probsperprogrammer=Config.probsperprogrammer,
        no_of_programmers=204,
        noofparallelthreads=8,
        hyperparameters=None)

    # A. First, get the author iids so that we have a unique key for each source file. To this end,
    # we simply load the whole dataset (but just the arff features).
    arffmatrix = StyloARFFFeatures(inputdata=configuration_learning.arffile, removelog=True)
    iids = arffmatrix.getiids()
    del arffmatrix

    # B. Split dataset into train - test set
    # Our adapted stratified k-fold
    skf2 = StratifiedKFoldProblemId.StratifiedKFoldProblemId(
        iids=iids, n_splits=8, shuffle=True, random_state=411,
        nocodesperprogrammer=configuration_learning.probsperprogrammer)

    train_index, test_index = None, None
    for train_index, test_index in skf2.split(None, None):
        if iids[test_index][0].startswith("3264486_5736519012712448"):
            break

    # use a small helper function that represents a default set of settings
    features_merged: StyloFeaturesProxy = utils_extraction.extract_link_train_test_usenix_features(
        config_learning=configuration_learning)
    print("jo-cl",
          features_merged.codestyloreference.codestyloreference.getfeaturematrix().mean(),
          "should be: 0.0929585")

    # C. Merge the objects into one matrix / feature representation
    self.data_final_train = features_merged[train_index]
    self.data_final_test = features_merged[test_index]

    # set up tf-idf
    self.data_final_train.createtfidffeatures(trainobject=None)
    self.data_final_test.createtfidffeatures(trainobject=self.data_final_train)

    # with lexems
    configuration_learning_lex = copy.deepcopy(configuration_learning)
    configuration_learning_lex.use_lexems = True
    features_merged2: StyloFeaturesProxy = utils_extraction.extract_link_train_test_usenix_features(
        config_learning=configuration_learning_lex)
    self.data_final_train_lexems = features_merged2[train_index]
    self.data_final_test_lexems = features_merged2[test_index]

    # with lexical features and arff
    configuration_learning_lexical = copy.deepcopy(configuration_learning)
    configuration_learning_lexical.use_lexical_features = True
    features_merged3: StyloFeaturesProxy = utils_extraction.extract_link_train_test_usenix_all(
        config_learning=configuration_learning_lexical)
    self.data_final_train_lexicals = features_merged3[train_index]
    self.data_final_test_lexicals = features_merged3[test_index]
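
# An illustrative test method (not from the original suite) showing how the fixtures built
# in setUp could be exercised: the tf-idf train and test matrices must share the feature
# dimension, and each matrix must provide one label per row.
def test_train_test_shapes_consistent(self):
    x_train = self.data_final_train.getfeaturematrix()
    x_test = self.data_final_test.getfeaturematrix()
    self.assertEqual(x_train.shape[1], x_test.shape[1])
    self.assertEqual(x_train.shape[0], len(self.data_final_train.getlabels()))
    self.assertEqual(x_test.shape[0], len(self.data_final_test.getlabels()))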
def perform_standard_classification_for_split(
        features_merged: StyloFeatures,
        train_index: np.ndarray,
        test_index: np.ndarray,
        problem_id_test: str,
        configuration_learning: ConfigurationLearning,
        modelsavedir: typing.Optional[str],
        threshold_sel: typing.Union[int, float],
        learn_method: str,
        skf2: StratifiedKFoldProblemId,
) -> typing.Tuple[dict, LearnSetup]:
    """
    Convenience function. Performs the classification task for a given train-test split where the
    test set contains one challenge/problem.
    If modelsavedir is given, the model, the accuracy value and the configuration file are saved
    into this directory.
    :param features_merged: feature matrix object
    :param train_index: indices of training objects
    :param test_index: indices of test objects
    :param problem_id_test: the unique challenge/problem ID of the test set
    :param configuration_learning: config
    :param modelsavedir: directory where the model, accuracy value and config file will be saved.
        If None, nothing will be saved.
    :param threshold_sel: threshold for feature selection. If int, we select 'threshold' features;
        if float, we select all features whose mutual information score is above 'threshold'.
        Look at __tfidf_feature_selection in Learning.py
    :param learn_method: learning method; "RF", "SVM", "DNN", and "RNN" are supported.
    :param skf2: StratifiedKFoldProblemId object.
    :return: accuracy as dict (so that the problem id from the test set is directly available),
        and LearnSetup. (Both are also stored in modelsavedir, if modelsavedir is not None.)
    """

    accuracies = {}

    # I. Load features
    trainfiles: StyloFeatures = features_merged[train_index]
    testfiles: StyloFeatures = features_merged[test_index]

    # II. Learning
    problemid_test_learn = "_".join(testfiles.getiids()[0].split("_")[0:2])
    assert problem_id_test == problemid_test_learn

    learning: Learning = Learning()
    currentaccuracy, curtestlearnsetup = learning.do_learn(
        train_obj=trainfiles, test_obj=testfiles,
        config_learning=configuration_learning, modelsavedir=modelsavedir,
        problemid_test=problemid_test_learn, threshold=threshold_sel,
        learn_method=learn_method,
        trainproblemlength=len(skf2.get_n_problems()[0]))

    # III. Save accuracy dict, overwrite it in each iteration.
    accuracies[problemid_test_learn] = currentaccuracy
    if modelsavedir is not None:
        accfile = os.path.join(
            modelsavedir,
            "acc_" + learn_method + "_" + str(threshold_sel) + "__" + problemid_test_learn + ".pck")
        with open(accfile, 'wb') as acc_handle:
            pickle.dump(accuracies, file=acc_handle)

    # IV. Save configuration in a readable text file (use str method of config class), added after Version 0.2
    if modelsavedir is not None:
        with open(os.path.join(
                modelsavedir,
                "configuration_" + str(threshold_sel) + "__" + problemid_test_learn + ".txt"),
                "w") as text_file:
            print(str(configuration_learning), file=text_file)

    return accuracies, curtestlearnsetup
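
# A small illustrative helper (not part of the original API) showing how the accuracy dict
# pickled above could be read back for later inspection; the file name mirrors the pattern
# used in perform_standard_classification_for_split.
def load_saved_accuracy(modelsavedir: str, learn_method: str,
                        threshold_sel: typing.Union[int, float], problem_id: str) -> dict:
    accfile = os.path.join(
        modelsavedir,
        "acc_" + learn_method + "_" + str(threshold_sel) + "__" + problem_id + ".pck")
    with open(accfile, 'rb') as f:
        return pickle.load(f)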
    features_merged: StyloFeaturesProxy = utils_extraction.extract_link_train_test_usenix_features(
        config_learning=configuration_learning)
elif feature_method == "CCS18":
    assert configuration_learning.use_lexems is not True
    unigrammmatrix_train: StyloFeatures = utils_extraction.extract_train_test_unigram(
        config_learning=configuration_learning, tf=True, idf=True, ngram_range=(1, 3))
    features_merged: StyloFeaturesProxy = StyloFeaturesProxy(codestyloreference=unigrammmatrix_train)
else:
    raise Exception("feature_method")

############## Split dataset into train - test set with our grouped stratified k-fold ##############
skf2 = StratifiedKFoldProblemId.StratifiedKFoldProblemId(
    iids=features_merged.getiids(), n_splits=8, shuffle=True, random_state=411,
    nocodesperprogrammer=configuration_learning.probsperprogrammer)
print("No splits:", skf2.get_n_splits())

############## Do training + testing on each split ##############
accuracy = {}
for train_index, test_index in skf2.split(None, None):
    curproblemid = "_".join(features_merged.getiids()[test_index[0]].split("_")[0:2])

    if curproblemid == PROBLEM_ID_LOADED:
        # the following method saves the model and config file into modelsavedir if given
        accuracy, _ = classification.NovelAPI.utils_classification.perform_standard_classification_for_split(
            features_merged=features_merged, train_index=train_index,