Esempio n. 1
0
 def load_dfs(self):
     """Load the AR-Miner train/test CSVs selected by ``config.dataset``.

     Each CSV is read with pandas, passed through ``__preprocess_df`` and
     stored on ``self.train_df`` / ``self.test_df``.
     """
     train_csv = util.data_path("ar_miner",
                                f"{self.config.dataset}_train.csv")
     self.train_df = self.__preprocess_df(pd.read_csv(train_csv))

     test_csv = util.data_path("ar_miner", f"{self.config.dataset}_test.csv")
     self.test_df = self.__preprocess_df(pd.read_csv(test_csv))
Esempio n. 2
0
    def __init__(self, config, tokenizer):
        """Keep the config/tokenizer and load the coherence "shuffle" splits.

        The train and validation CSVs are read with pandas, run through
        ``__preprocess_df`` and stored as ``train_df`` and ``valid_df``.
        """
        self.config, self.tokenizer = config, tokenizer

        train_csv = util.data_path("coherence", "shuffle", "train.csv")
        self.train_df = self.__preprocess_df(pd.read_csv(train_csv))

        valid_csv = util.data_path("coherence", "shuffle", "valid.csv")
        self.valid_df = self.__preprocess_df(pd.read_csv(valid_csv))
Esempio n. 3
0
    def load_dfs(self):
        """Load the readability dataset and, optionally, the feature tables.

        ``scalabrino.csv`` is always loaded into ``self.scalabrino_df``.
        When ``self.config.features`` is truthy the scalabrino, buse and dorn
        feature CSVs are loaded too and checked against whichever of the
        matching main frames exist on the instance; otherwise the three
        ``*_features_df`` attributes are set to ``None``.  Finally, special
        tokens are added when ``self.config.line_length_tokens`` is truthy.

        Raises:
            AssertionError: if a main frame and its feature frame differ in
                row count, or (scalabrino and buse only) if the ``label``
                column does not equal the feature frame's ``Readable``
                column.
        """
        self.scalabrino_df = self.__preprocess_df(
            pd.read_csv(util.data_path("readability", "scalabrino.csv")),
            'scalabrino')
        # Loading of the buse / dorn main frames is currently disabled:
        #self.buse_df = self.__preprocess_df(pd.read_csv(util.data_path("readability", "buse.csv")), 'buse')
        #self.dorn_df = self.__preprocess_df(pd.read_csv(util.data_path("readability", "dorn.csv")), 'dorn')

        if self.config.features:
            self.scalabrino_features_df = self.__preprocess_features_df(
                pd.read_csv(
                    util.data_path("readability", "features",
                                   "scalabrino_reduced.csv")))
            self.buse_features_df = self.__preprocess_features_df(
                pd.read_csv(
                    util.data_path("readability", "features", "buse.csv")))
            self.dorn_features_df = self.__preprocess_features_df(
                pd.read_csv(
                    util.data_path("readability", "features", "dorn.csv")))

            # Validate only the pairs whose main frame is actually present.
            # With the buse/dorn loaders above disabled, touching
            # self.buse_df / self.dorn_df unconditionally would raise
            # AttributeError instead of performing a meaningful check.
            # The dorn labels are deliberately not compared with ``Readable``
            # (that comparison was already disabled).
            for name, compare_labels in (("scalabrino", True),
                                         ("buse", True),
                                         ("dorn", False)):
                main_df = getattr(self, f"{name}_df", None)
                if main_df is None:
                    continue
                features_df = getattr(self, f"{name}_features_df")
                assert main_df.shape[0] == features_df.shape[0]
                if compare_labels:
                    assert main_df.label.equals(features_df.Readable)
        else:
            self.scalabrino_features_df = None
            self.buse_features_df = None
            self.dorn_features_df = None

        if self.config.line_length_tokens:
            self.__add_special_tokens()
Esempio n. 4
0
def load_backtrans_dfs(dir_name, langs, prefix, **kwargs):
    """Read one ``{prefix}_{lang}.csv`` per language from ``dir_name``.

    Args:
        dir_name: data sub-directory handed to ``util.data_path``.
        langs: iterable of language identifiers; each becomes a key of the
            result.
        prefix: file-name prefix placed before ``_{lang}.csv``.
        **kwargs: forwarded verbatim to ``pd.read_csv`` (also printed).

    Returns:
        dict mapping each language to the DataFrame read for it.
    """
    print(kwargs)
    frames = {}
    for lang in langs:
        csv_path = util.data_path(dir_name, f"{prefix}_{lang}.csv")
        frames[lang] = pd.read_csv(csv_path, **kwargs)
    return frames
Esempio n. 5
0
    def load_self_train_df(self, self_train_thresh):
        """Load the self-training examples below an uncertainty threshold.

        ``self.self_train_df`` is always reset to ``None`` first.  When
        ``self_train_thresh`` is ``None`` nothing else happens.  Otherwise
        ``satd/unclassified_pos.csv`` is read, rows whose ``uncertainty`` is
        not below the threshold are discarded, the frame is passed through
        ``preprocess_df`` with the binary label mapping, reduced to the four
        columns used downstream and stripped of rows containing NaN.

        Args:
            self_train_thresh: exclusive upper bound on the ``uncertainty``
                column, or ``None`` to disable self-training data.

        Raises:
            ValueError: if the dataset is not in binary mode.
        """
        self.self_train_df = None

        if not self.binary:
            raise ValueError('self-training only available in binary mode')

        if self_train_thresh is None:
            return

        self.self_train_df = pd.read_csv(
            util.data_path("satd", "unclassified_pos.csv"), delimiter=',')
        print("self-train shape before filter", self.self_train_df.shape)

        self.self_train_df = self.self_train_df[
            self.self_train_df.uncertainty < self_train_thresh]
        self.self_train_df = self.preprocess_df(self.self_train_df, {
            'NO_TECHNICAL_DEBT': 0,
            'TECHNICAL_DEBT': 1
        })

        print(self.self_train_df.head())
        # Reassign instead of dropna(inplace=True): the column selection
        # yields a derived frame, and mutating it in place is the pattern
        # pandas flags with SettingWithCopyWarning.
        self.self_train_df = self.self_train_df[[
            'projectname', 'commenttext', 'preprocessed', 'label'
        ]].dropna()
        print(self.self_train_df.head())
        print("unclassified shape after filter", self.self_train_df.shape)
Esempio n. 6
0
    def load_df(self):
        """Load ``cado/dataset.csv`` and initialise the dataset frames.

        The raw frame is handed to ``__add_none_label`` and kept as
        ``orig_df``; the result of ``preprocess_df`` on it becomes ``df``.
        ``test_df`` is reset to ``None``.
        """
        dataset_csv = util.data_path("cado", "dataset.csv")
        raw = pd.read_csv(dataset_csv, delimiter=',')
        self.__add_none_label(raw)

        self.orig_df = raw
        self.df = self.preprocess_df(raw)
        self.test_df = None
Esempio n. 7
0
    def load_test_df(self):
        """Load the test frame from ``cado/python_stdlib.csv`` once.

        Does nothing when ``self.test_df`` is already set.  Otherwise the CSV
        is read, passed to ``__add_none_label`` and the result of
        ``preprocess_df(..., col='text')`` is stored in ``self.test_df``.
        """
        if self.test_df is not None:
            return

        df = pd.read_csv(util.data_path("cado", "python_stdlib.csv"),
                         delimiter=',')
        self.__add_none_label(df)

        self.test_df = self.preprocess_df(df, col='text')
    def __init__(self, config, tokenizer):
        """Keep the config/tokenizer and build train/valid/test splits.

        ``complexity/cmpx.csv`` is read, run through ``__preprocess_df`` and
        its head is printed.  The frame is then split twice with
        ``train_test_split(..., test_size=0.2, shuffle=False)``: first into a
        remainder and ``test_df``, then the remainder into ``train_df`` and
        ``valid_df``.
        """
        self.config, self.tokenizer = config, tokenizer

        frame = pd.read_csv(util.data_path("complexity", "cmpx.csv"))
        frame = self.__preprocess_df(frame)

        print(frame.head())

        # Both splits keep the row order (shuffle=False).
        remainder, held_out = train_test_split(frame,
                                               test_size=0.2,
                                               shuffle=False)
        fit_part, valid_part = train_test_split(remainder,
                                                test_size=0.2,
                                                shuffle=False)

        self.train_df, self.valid_df, self.test_df = (fit_part, valid_part,
                                                      held_out)
Esempio n. 9
0
    def load_df(self):
        """Load ``satd/dataset.csv`` and derive project/label metadata.

        Sets ``project_names`` (unique ``projectname`` values),
        ``_label_names`` (sorted unique ``classification`` values with the
        negative class moved to index 0), ``label_map`` (from
        ``build_label_map``) and ``df`` (result of ``preprocess_df``).  In
        non-binary mode only DESIGN, IMPLEMENTATION and negative-class rows
        are kept before preprocessing.
        """
        raw = pd.read_csv(util.data_path("satd", "dataset.csv"), delimiter=',')

        self.project_names = raw.projectname.unique()

        self._label_names = sorted(raw.classification.unique().tolist())
        # move WITHOUT_CLASSIFICATION to 0th position (so it has class_id 0)
        self._label_names.remove(self.NEGATIVE_CLASS_NAME)
        self._label_names.insert(0, self.NEGATIVE_CLASS_NAME)

        self.label_map = self.build_label_map()

        if self.binary:
            selected = raw
        else:
            kept_classes = ['DESIGN', 'IMPLEMENTATION', self.NEGATIVE_CLASS_NAME]
            selected = raw[raw.classification.isin(kept_classes)]

        self.df = self.preprocess_df(selected, self.label_map)
Esempio n. 10
0
 def load_dfs(self):
     """Load the corazza and wang coherence CSVs.

     Each frame goes through ``__preprocess_df`` with a second argument of
     ``"method"`` (corazza) or ``"code"`` (wang).
     """
     corazza_csv = util.data_path("coherence", "corazza.csv")
     self.corazza_df = self.__preprocess_df(pd.read_csv(corazza_csv),
                                            "method")

     wang_csv = util.data_path("coherence", "wang.csv")
     self.wang_df = self.__preprocess_df(pd.read_csv(wang_csv), "code")
Esempio n. 11
0
 def load_df(self):
     """Load ``smell_detection/smells.csv`` and keep only labelled rows.

     The CSV is preprocessed with ``__preprocess_df``; rows whose
     ``problematic`` column is NaN are then dropped.
     """
     smells_csv = util.data_path("smell_detection", "smells.csv")
     self.df = self.__preprocess_df(pd.read_csv(smells_csv))

     has_label = self.df.problematic.notna()
     self.df = self.df[has_label]
Esempio n. 12
0
    def get_app_reviews_dataloader(self):
        """Return an unshuffled evaluation dataloader over ``AppReviews.csv``.

        The CSV is preprocessed on its ``oracle`` column with
        ``LABEL_MAP_SIGN``; the dataloader reads text from ``sentence`` and
        uses ``config.eval_bs`` as batch size.
        """
        reviews_csv = util.data_path("sentidata/AppReviews.csv")
        reviews = pd.read_csv(reviews_csv, delimiter=',')
        reviews = self.preprocess(reviews,
                                  'oracle',
                                  label_map=self.LABEL_MAP_SIGN)

        return self.get_dataloader(reviews,
                                   bs=self.config.eval_bs,
                                   text_col='sentence',
                                   shuffle=False)
Esempio n. 13
0
    def get_jira_dataloader(self):
        """Return an unshuffled evaluation dataloader over ``jira.csv``.

        The CSV is preprocessed on its ``label`` column with
        ``LABEL_MAP_SIGN``; ``config.eval_bs`` is used as batch size.
        """
        jira_csv = util.data_path("sentidata/jira.csv")
        issues = pd.read_csv(jira_csv, delimiter=',')
        #NOTE: Replaces the label column with the preprocessed one
        issues = self.preprocess(issues,
                                 'label',
                                 label_map=self.LABEL_MAP_SIGN)

        return self.get_dataloader(issues,
                                   bs=self.config.eval_bs,
                                   shuffle=False)
Esempio n. 14
0
    def get_stack_overflow_dataloader(self):
        """Return an unshuffled evaluation dataloader over ``StackOverflow.csv``.

        The CSV is preprocessed on its ``oracle`` column with
        ``LABEL_MAP_SIGN``; ``config.eval_bs`` is used as batch size.
        """
        posts_csv = util.data_path("sentidata/StackOverflow.csv")
        posts = pd.read_csv(posts_csv, delimiter=',')
        posts = self.preprocess(posts,
                                'oracle',
                                label_map=self.LABEL_MAP_SIGN)

        return self.get_dataloader(posts,
                                   bs=self.config.eval_bs,
                                   shuffle=False)
Esempio n. 15
0
    def load_dfs(self):
        """Load the Senti4SD extended train and test CSVs.

        Both files are read before either is preprocessed; the results of
        ``preprocess`` are stored in ``train_valid_df`` and ``test_df``.
        """
        raw_train_valid = pd.read_csv(
            util.data_path("senti4sd", "train_ext.csv"))
        raw_test = pd.read_csv(util.data_path("senti4sd", "test_ext.csv"))

        self.train_valid_df = self.preprocess(raw_train_valid)
        self.test_df = self.preprocess(raw_test)
def main():
    """Label ``satd/unclassified.csv`` with the binary SATD model.

    Steps:
      1. Configure a RoBERTa run pointing at the ``satd_complete_binary``
         model path.
      2. Read the unclassified comments, drop rows with NaN, and preprocess
         ``commenttext`` with ``TDDataset.preprocess``.
      3. Predict in batches of 128 (dummy zero labels are supplied because
         the dataloader helper takes a label array).
      4. Store per-row ``uncertainty``, ``probs0``, ``probs1`` and the
         predicted ``classification`` name.
      5. Write all rows to ``satd/unclassified_evaled.csv`` and the rows
         predicted as ``TECHNICAL_DEBT`` to ``satd/unclassified_pos.csv``.
    """
    config = get_config()
    with config:
        config.logging_steps = 400
        config.train_epochs = 2
        config.lr = 4e-5
        config.model_type = 'roberta'
        config.model_path = util.models_path('satd_complete_binary')

    tokenizer = tu.load_tokenizer(config)
    # NOTE(review): the returned class is not used below; the call is kept
    # in case it validates the config - confirm and remove if it does not.
    model_cls = tu.get_model_cls(config)

    df = pd.read_csv(util.data_path('satd', 'unclassified.csv'))
    df.dropna(inplace=True)

    print(df.dtypes)

    print(df.head())

    df['preprocessed'] = df.commenttext.map(TDDataset.preprocess)
    # Second dropna removes rows for which preprocessing produced NaN/None.
    df.dropna(inplace=True)
    preprocessed = df.preprocessed.values
    dummy_labels = np.zeros(preprocessed.shape[0])
    dataloader = tu.get_dataloader(config,
                                   tokenizer,
                                   preprocessed,
                                   dummy_labels,
                                   bs=128,
                                   shuffle=False)

    model = tu.load_model(config)
    model.to(config.device)
    util.set_seed(config)

    experiment = Experiment(config, model, tokenizer)

    # ``predict`` yields a NumPy array (it is fed to torch.from_numpy below).
    # Take the argmax on that array directly rather than calling np.argmax
    # on a torch tensor, which only works through NumPy's duck-typing
    # fallback.
    logits = experiment.predict(dataloader)
    labels = np.argmax(logits, axis=1)

    probs = F.softmax(torch.from_numpy(logits), dim=1)
    uncertainty = least_conf(probs).numpy()

    df['uncertainty'] = uncertainty
    df['probs0'] = probs[:, 0].numpy()
    df['probs1'] = probs[:, 1].numpy()
    df['classification'] = labels
    df.drop('preprocessed', axis='columns', inplace=True)

    # class id -> class name
    label_name_map = dict(enumerate(TDDataset.BINARY_LABEL_NAMES))
    print(label_name_map)

    df.classification = df.classification.map(label_name_map)
    df.to_csv(util.data_path('satd', 'unclassified_evaled.csv'), index=False)

    tech_debt_df = df[df.classification == 'TECHNICAL_DEBT']
    print(tech_debt_df.shape)
    tech_debt_df.to_csv(util.data_path('satd', 'unclassified_pos.csv'),
                        index=False)
Esempio n. 17
0
 def load_df(self):
     """Load ``review_classification/reviews_backtrans.csv`` into ``self.df``.

     The frame read by pandas is passed through ``__preprocess_df``.
     """
     reviews_csv = util.data_path("review_classification",
                                  "reviews_backtrans.csv")
     raw = pd.read_csv(reviews_csv)
     self.df = self.__preprocess_df(raw)
Esempio n. 18
0
 def load_dfs(self):
     """Load ``corcod/dataset.csv`` into ``self.df`` via ``__preprocess_df``."""
     dataset_csv = util.data_path("corcod", "dataset.csv")
     raw = pd.read_csv(dataset_csv)
     self.df = self.__preprocess_df(raw)
 def load_df(self):
     """Load ``comment_classification/comments.csv`` into ``self.df``.

     The frame read by pandas is passed through ``__preprocess_df``.
     """
     comments_csv = util.data_path("comment_classification", "comments.csv")
     raw = pd.read_csv(comments_csv)
     self.df = self.__preprocess_df(raw)