Ejemplo n.º 1
0
    def do_local_train_test_split(self, train_obj: StyloFeatures,
                                  trainproblemlength, config_learning,
                                  threshold):
        """
        Build local train/test splits from the given training data.

        Performs a 3-fold, problem-id-aware stratified split, runs tf-idf
        based feature selection per fold (fitted on the fold's train part),
        and returns for each fold [train matrix, test matrix, train labels,
        test labels].
        """
        splitter = StratifiedKFoldProblemId.StratifiedKFoldProblemId(
            iids=train_obj.getiids(),
            n_splits=3,
            shuffle=True,
            random_state=411,
            nocodesperprogrammer=trainproblemlength)

        splits = []
        for tr_idx, te_idx in splitter.split(None, None):
            fold_train: StyloFeatures = train_obj[tr_idx]
            fold_test: StyloFeatures = train_obj[te_idx]

            # Feature selection is fitted on the train part of this fold only.
            fold_train, fold_test = self._tfidf_feature_selection(
                train_obj=fold_train,
                test_obj=fold_test,
                config_learning=config_learning,
                threshold=threshold)

            splits.append([
                fold_train.getfeaturematrix(),
                fold_test.getfeaturematrix(),
                fold_train.getlabels(),
                fold_test.getlabels(),
            ])

        return splits
Ejemplo n.º 2
0
    def learn_process(features_merged: StyloFeatures, learn_method: str,
                      learn_config: typing.Union[ConfigurationLearning,
                                                 ConfigurationLearningRNN]):
        """
        Run classification for the split whose problem id equals
        PROBLEM_ID_LOADED and return summary statistics for checking.

        Returns (accuracy, test-matrix mean, train-matrix mean, mean of the
        first 10 test columns, train-matrix max) for the matching split, or
        None implicitly if no split matches.
        """
        splitter = StratifiedKFoldProblemId.StratifiedKFoldProblemId(
            iids=features_merged.getiids(),
            n_splits=8,
            shuffle=True,
            random_state=411,
            nocodesperprogrammer=learn_config.probsperprogrammer)

        for tr_idx, te_idx in splitter.split(None, None):
            # Problem id = first two "_"-separated components of the iid.
            problem_id = "_".join(
                features_merged.getiids()[te_idx[0]].split("_")[0:2])
            if problem_id != PROBLEM_ID_LOADED:
                continue

            # I. Do classification
            accuracy, testsetup = perform_standard_classification_for_split(
                features_merged=features_merged,
                train_index=tr_idx,
                test_index=te_idx,
                problem_id_test=problem_id,
                configuration_learning=learn_config,
                modelsavedir=None,
                threshold_sel=1.5,
                learn_method=learn_method,
                skf2=splitter)

            # II. Extract values and return them for checking...
            currentaccuracy = accuracy[problem_id]
            test_matrix = testsetup.data_final_test.getfeaturematrix()
            train_matrix = testsetup.data_final_train.getfeaturematrix()

            print(currentaccuracy)
            print(test_matrix.mean())
            print(train_matrix.mean())
            print(test_matrix[:, :10].mean())
            print(train_matrix.max())

            return (currentaccuracy,
                    test_matrix.mean(),
                    train_matrix.mean(),
                    test_matrix[:, :10].mean(),
                    train_matrix.max())
Ejemplo n.º 3
0
    def setUp(self):
        """
        Runs before any test.

        Builds three train/test feature setups on a fixed split of the 2017
        dataset: plain usenix features (with tf-idf), lexem-based features,
        and lexical+arff features.
        """

        configuration_learning: ConfigurationLearning = ConfigurationLearning(
            repo_path=Config.repo_path,
            dataset_features_dir=os.path.join(Config.repo_path,
                                              "data/dataset_2017"),
            suffix_data="_2017_8_formatted_macrosremoved",
            learnmodelspath=Config.learnmodelspath,
            use_lexems=False,
            use_lexical_features=False,
            stop_words=Config.stop_words_codestylo,
            probsperprogrammer=Config.probsperprogrammer,
            no_of_programmers=204,
            noofparallelthreads=8,
            hyperparameters=None)

        # A. First, get the author iids so that we have a unique key for each source file. To this end, we simply load
        # the whole dataset (but just the arff features).
        arffmatrix = StyloARFFFeatures(
            inputdata=configuration_learning.arffile, removelog=True)
        iids = arffmatrix.getiids()
        del arffmatrix

        # B. Split dataset into train - test set

        # Our adapted stratified k-fold
        skf2 = StratifiedKFoldProblemId.StratifiedKFoldProblemId(
            iids=iids,
            n_splits=8,
            shuffle=True,
            random_state=411,
            nocodesperprogrammer=configuration_learning.probsperprogrammer)

        # Pick the deterministic split whose first test iid matches this
        # known problem id, so all tests run on the same train/test split.
        train_index, test_index = None, None
        for train_index, test_index in skf2.split(None, None):
            if iids[test_index][0].startswith("3264486_5736519012712448"):
                break

        # use a small helper function that represents a default set of settings
        features_merged: StyloFeaturesProxy = utils_extraction.extract_link_train_test_usenix_features(
            config_learning=configuration_learning)

        print(
            "jo-cl",
            features_merged.codestyloreference.codestyloreference.
            getfeaturematrix().mean(), "should be: 0.0929585")

        # C. Merge the objects into one matrix / feature representation
        self.data_final_train = features_merged[train_index]
        self.data_final_test = features_merged[test_index]

        # set up tf-idf; fitted on the train part only, then applied to test.
        self.data_final_train.createtfidffeatures(trainobject=None)
        self.data_final_test.createtfidffeatures(
            trainobject=self.data_final_train)

        # with lexems
        configuration_learning_lex = copy.deepcopy(configuration_learning)
        configuration_learning_lex.use_lexems = True
        features_merged2: StyloFeaturesProxy = utils_extraction.extract_link_train_test_usenix_features(
            config_learning=configuration_learning_lex)

        self.data_final_train_lexems = features_merged2[train_index]
        self.data_final_test_lexems = features_merged2[test_index]

        # with lexical features and arff
        configuration_learning_lexical = copy.deepcopy(configuration_learning)
        # Bugfix: the flag must be set on configuration_learning_lexical (the
        # config actually passed below); previously it was set on the lexems
        # config, so use_lexical_features was never enabled here.
        configuration_learning_lexical.use_lexical_features = True
        features_merged3: StyloFeaturesProxy = utils_extraction.extract_link_train_test_usenix_all(
            config_learning=configuration_learning_lexical)

        self.data_final_train_lexicals = features_merged3[train_index]
        self.data_final_test_lexicals = features_merged3[test_index]
Ejemplo n.º 4
0
def perform_standard_classification_for_split(
    features_merged: StyloFeatures,
    train_index: np.ndarray,
    test_index: np.ndarray,
    problem_id_test: str,
    configuration_learning: ConfigurationLearning,
    modelsavedir: typing.Optional[str],
    threshold_sel: typing.Union[int, float],
    learn_method: str,
    skf2: StratifiedKFoldProblemId,
) -> typing.Tuple[dict, LearnSetup]:
    """
    Convenience Function.
    Perform classification task with given train- test split where test set contains one challenge/problem.
    If modelsavedir is given, it will save the model, the accuracy value and the configuration file
    into this directory.
    :param features_merged: feature matrix object
    :param train_index: indices of training objects
    :param test_index: indices of test objects
    :param problem_id_test: the unique challenge/problem ID of the test set
    :param configuration_learning: config
    :param modelsavedir: directory where the model, accuracy value and config file will be saved. If none, nothing
    will be saved.
    :param threshold_sel: threshold for feature selection.
    If int, we select 'threshold' features, if float, we select all features whose mutual information score
    is above 'threshold'. Look at __tfidf_feature_selection in Learning.py
    :param learn_method: learning method, "RF", "SVM", "DNN", and "RNN" are supported.
    :param skf2: StratifiedKFoldProblemId object.
    :return: accuracy as dict (so that problem id from test set is directly available), and LearnSetup.
    (Both are also stored in modelsavedir, if modelsavedir is not None)
    """
    accuracies = {}

    # I. Load features
    trainfiles: StyloFeatures = features_merged[train_index]
    testfiles: StyloFeatures = features_merged[test_index]

    # II. Learning
    # Problem id = first two "_"-separated components of the test iid;
    # must match the id the caller derived from the same split.
    problemid_test_learn = "_".join(testfiles.getiids()[0].split("_")[0:2])
    assert problem_id_test == problemid_test_learn

    learning: Learning = Learning()
    currentaccuracy, curtestlearnsetup = learning.do_learn(
        train_obj=trainfiles,
        test_obj=testfiles,
        config_learning=configuration_learning,
        modelsavedir=modelsavedir,
        problemid_test=problemid_test_learn,
        threshold=threshold_sel,
        learn_method=learn_method,
        trainproblemlength=len(skf2.get_n_problems()[0]))

    # III. save accuracy dict, overwrite it in each iteration.
    accuracies[problemid_test_learn] = currentaccuracy
    if modelsavedir is not None:
        accfile = os.path.join(
            modelsavedir, "acc_" + learn_method + "_" + str(threshold_sel) +
            "__" + problemid_test_learn + ".pck")
        # Use a context manager so the pickle file handle is closed
        # deterministically (the previous bare open() leaked the handle).
        with open(accfile, 'wb') as accfileobj:
            pickle.dump(accuracies, file=accfileobj)

    # IV. Save configuration in a readable text file (use str method of config class), added after Version 0.2
    if modelsavedir is not None:
        with open(
                os.path.join(
                    modelsavedir, "configuration_" + str(threshold_sel) +
                    "__" + problemid_test_learn + ".txt"), "w") as text_file:
            print(str(configuration_learning), file=text_file)

    return accuracies, curtestlearnsetup
Ejemplo n.º 5
0
    features_merged: StyloFeaturesProxy = utils_extraction.extract_link_train_test_usenix_features(
        config_learning=configuration_learning)

elif feature_method == "CCS18":
    assert configuration_learning.use_lexems is not True
    unigrammmatrix_train: StyloFeatures = utils_extraction.extract_train_test_unigram(
        config_learning=configuration_learning, tf=True, idf=True, ngram_range=(1, 3))
    features_merged: StyloFeaturesProxy = StyloFeaturesProxy(codestyloreference=unigrammmatrix_train)

else:
    raise Exception("feature_method")


############## Split dataset into train - test set with our grouped stratified k-fold ##############
# Problem-id-aware stratified k-fold over all iids; fixed random_state keeps
# the splits reproducible across runs.
skf2 = StratifiedKFoldProblemId.StratifiedKFoldProblemId(iids=features_merged.getiids(), n_splits=8, shuffle=True,
                                                         random_state=411,
                                                         nocodesperprogrammer=configuration_learning.probsperprogrammer)
print("No splits:", skf2.get_n_splits())


############## Do training + testing on each split ##############
# Maps test problem id -> accuracy; filled for the matching split below.
accuracy = {}

for train_index, test_index in skf2.split(None, None):
    curproblemid = "_".join(features_merged.getiids()[test_index[0]].split("_")[0:2])
    if curproblemid == PROBLEM_ID_LOADED:

        # the following method saves the model and config file into modelsavedir if given
        accuracy, _ = classification.NovelAPI.utils_classification.perform_standard_classification_for_split(
            features_merged=features_merged,
            train_index=train_index,