def load_dfs(self):
    """Read the train and test CSVs of the configured ``ar_miner`` dataset.

    The file names are built from ``self.config.dataset``. Each raw frame is
    run through ``__preprocess_df`` and stored on ``self.train_df`` and
    ``self.test_df`` respectively.
    """
    train_csv = util.data_path("ar_miner",
                               f"{self.config.dataset}_train.csv")
    raw_train = pd.read_csv(train_csv)
    self.train_df = self.__preprocess_df(raw_train)

    test_csv = util.data_path("ar_miner",
                              f"{self.config.dataset}_test.csv")
    raw_test = pd.read_csv(test_csv)
    self.test_df = self.__preprocess_df(raw_test)
def __init__(self, config, tokenizer):
    """Keep the config and tokenizer and load the shuffle train/valid splits.

    Both CSVs live under ``coherence/shuffle`` in the data directory and are
    passed through ``__preprocess_df`` before being stored.
    """
    self.config = config
    self.tokenizer = tokenizer

    train_csv = util.data_path("coherence", "shuffle", "train.csv")
    self.train_df = self.__preprocess_df(pd.read_csv(train_csv))

    valid_csv = util.data_path("coherence", "shuffle", "valid.csv")
    self.valid_df = self.__preprocess_df(pd.read_csv(valid_csv))
def load_dfs(self):
    """Load the readability snippets and, optionally, their feature tables.

    ``readability/scalabrino.csv`` is always read into
    ``self.scalabrino_df``. When ``self.config.features`` is truthy the three
    feature CSVs (scalabrino_reduced, buse, dorn) are read too and
    cross-checked against every snippet frame that is actually present on
    the instance; otherwise the three ``*_features_df`` attributes are set
    to ``None``. Finally, special tokens are added when
    ``self.config.line_length_tokens`` is truthy.

    Raises:
        AssertionError: if a loaded snippet frame and its feature table
            differ in row count, or (scalabrino and buse only) if the
            ``label`` column does not equal the ``Readable`` column.
    """
    self.scalabrino_df = self.__preprocess_df(
        pd.read_csv(util.data_path("readability", "scalabrino.csv")),
        'scalabrino')
    # Loading of the buse / dorn snippet frames is currently disabled:
    #self.buse_df = self.__preprocess_df(pd.read_csv(util.data_path("readability", "buse.csv")), 'buse')
    #self.dorn_df = self.__preprocess_df(pd.read_csv(util.data_path("readability", "dorn.csv")), 'dorn')

    if self.config.features:
        self.scalabrino_features_df = self.__preprocess_features_df(
            pd.read_csv(
                util.data_path("readability", "features",
                               "scalabrino_reduced.csv")))
        self.buse_features_df = self.__preprocess_features_df(
            pd.read_csv(
                util.data_path("readability", "features", "buse.csv")))
        self.dorn_features_df = self.__preprocess_features_df(
            pd.read_csv(
                util.data_path("readability", "features", "dorn.csv")))

        # (dataset name, also compare labels?). The dorn label comparison
        # was already switched off in the previous version of this method
        # (presumably a row-order mismatch -- TODO confirm).
        consistency_checks = (
            ('scalabrino', True),
            ('buse', True),
            ('dorn', False),
        )
        for name, compare_labels in consistency_checks:
            snippets = getattr(self, f"{name}_df", None)
            if snippets is None:
                # The snippet frame is not loaded (see the disabled loads
                # above), so there is nothing to cross-check; previously
                # this path dereferenced a frame this method never set.
                continue
            features = getattr(self, f"{name}_features_df")
            # Raised explicitly (instead of `assert`) so the check also
            # runs under `python -O`; the exception type is unchanged.
            if snippets.shape[0] != features.shape[0]:
                raise AssertionError(
                    f"{name}: {snippets.shape[0]} snippets but "
                    f"{features.shape[0]} feature rows")
            if compare_labels and not snippets.label.equals(
                    features.Readable):
                raise AssertionError(
                    f"{name}: snippet labels differ from the feature "
                    f"table's Readable column")
    else:
        self.scalabrino_features_df = None
        self.buse_features_df = None
        self.dorn_features_df = None

    if self.config.line_length_tokens:
        self.__add_special_tokens()
def load_backtrans_dfs(dir_name, langs, prefix, **kwargs):
    """Read one ``<prefix>_<lang>.csv`` file per language from *dir_name*.

    Args:
        dir_name: data sub-directory handed to ``util.data_path``.
        langs: iterable of language identifiers used in the file names.
        prefix: file-name prefix shared by all languages.
        **kwargs: forwarded unchanged to ``pd.read_csv`` (also printed).

    Returns:
        A dict mapping each language to the frame read for it.
    """
    print(kwargs)
    frames = {}
    for lang in langs:
        csv_path = util.data_path(dir_name, f"{prefix}_{lang}.csv")
        frames[lang] = pd.read_csv(csv_path, **kwargs)
    return frames
def load_self_train_df(self, self_train_thresh):
    """Load the self-training candidates below an uncertainty threshold.

    ``self.self_train_df`` is reset to ``None`` first. When
    *self_train_thresh* is ``None`` nothing else happens; otherwise
    ``satd/unclassified_pos.csv`` is read, rows whose ``uncertainty`` is
    below the threshold are kept, the frame is preprocessed with the binary
    label map, reduced to four columns, and rows with missing values are
    dropped.

    Raises:
        ValueError: if ``self.binary`` is falsy (checked before the
            threshold is looked at).
    """
    self.self_train_df = None
    if not self.binary:
        raise ValueError('self-training only available in binary mode')
    if self_train_thresh is None:
        return

    binary_label_ids = {'NO_TECHNICAL_DEBT': 0, 'TECHNICAL_DEBT': 1}
    kept_columns = ['projectname', 'commenttext', 'preprocessed', 'label']

    self.self_train_df = pd.read_csv(
        util.data_path("satd", "unclassified_pos.csv"), delimiter=',')
    print("self-train shape before filter", self.self_train_df.shape)

    below_thresh = self.self_train_df.uncertainty < self_train_thresh
    self.self_train_df = self.self_train_df[below_thresh]
    self.self_train_df = self.preprocess_df(self.self_train_df,
                                            binary_label_ids)
    print(self.self_train_df.head())

    self.self_train_df = self.self_train_df[kept_columns]
    self.self_train_df.dropna(inplace=True)
    print(self.self_train_df.head())
    print("unclassified shape after filter", self.self_train_df.shape)
def load_df(self):
    """Read ``cado/dataset.csv`` and store its raw and preprocessed forms.

    The raw frame is handed to ``__add_none_label`` and kept as
    ``self.orig_df``; the result of ``preprocess_df`` becomes ``self.df``.
    ``self.test_df`` is reset to ``None``.
    """
    dataset_csv = util.data_path("cado", "dataset.csv")
    raw_frame = pd.read_csv(dataset_csv, delimiter=',')
    self.__add_none_label(raw_frame)

    self.orig_df = raw_frame
    self.df = self.preprocess_df(raw_frame)
    self.test_df = None
def load_test_df(self):
    """Load ``cado/python_stdlib.csv`` into ``self.test_df`` on first use.

    Returns immediately when ``self.test_df`` is already set (anything other
    than ``None``). Otherwise the CSV is read, handed to
    ``__add_none_label``, and preprocessed using the ``text`` column.
    """
    # `is not None` is the idiomatic spelling of the former `not ... is None`.
    if self.test_df is not None:
        return
    df = pd.read_csv(util.data_path("cado", "python_stdlib.csv"),
                     delimiter=',')
    self.__add_none_label(df)
    self.test_df = self.preprocess_df(df, col='text')
def __init__(self, config, tokenizer):
    """Keep config/tokenizer and split ``complexity/cmpx.csv`` three ways.

    The preprocessed frame is split without shuffling: 20% is held out as
    the test set, then 20% of the remainder becomes the validation set
    (roughly 64% / 16% / 20% train / valid / test).
    """
    self.config = config
    self.tokenizer = tokenizer

    frame = self.__preprocess_df(
        pd.read_csv(util.data_path("complexity", "cmpx.csv")))
    print(frame.head())

    remainder, held_out = train_test_split(frame,
                                           test_size=0.2,
                                           shuffle=False)
    fit_part, valid_part = train_test_split(remainder,
                                            test_size=0.2,
                                            shuffle=False)
    self.train_df, self.valid_df, self.test_df = (fit_part, valid_part,
                                                  held_out)
def load_df(self):
    """Read ``satd/dataset.csv`` and derive project names, labels and ``df``.

    Sets ``self.project_names``, ``self._label_names`` (sorted, with the
    negative class moved to the front), ``self.label_map`` and finally
    ``self.df``. Outside binary mode only DESIGN, IMPLEMENTATION and
    negative-class rows are kept before preprocessing.
    """
    raw_frame = pd.read_csv(util.data_path("satd", "dataset.csv"),
                            delimiter=',')
    self.project_names = raw_frame.projectname.unique()

    self._label_names = sorted(raw_frame.classification.unique().tolist())
    # put the negative class at position 0 (so it has class_id 0)
    self._label_names.remove(self.NEGATIVE_CLASS_NAME)
    self._label_names.insert(0, self.NEGATIVE_CLASS_NAME)
    self.label_map = self.build_label_map()

    if not self.binary:
        multiclass_labels = [
            'DESIGN', 'IMPLEMENTATION', self.NEGATIVE_CLASS_NAME
        ]
        keep_rows = raw_frame.classification.isin(multiclass_labels)
        raw_frame = raw_frame[keep_rows]

    self.df = self.preprocess_df(raw_frame, self.label_map)
def load_dfs(self):
    """Read the two coherence CSVs (corazza, wang) and preprocess each.

    ``__preprocess_df`` receives ``"method"`` for the corazza frame and
    ``"code"`` for the wang frame as its second argument.
    """
    corazza_raw = pd.read_csv(util.data_path("coherence", "corazza.csv"))
    self.corazza_df = self.__preprocess_df(corazza_raw, "method")

    wang_raw = pd.read_csv(util.data_path("coherence", "wang.csv"))
    self.wang_df = self.__preprocess_df(wang_raw, "code")
def load_df(self):
    """Read ``smell_detection/smells.csv`` and keep only labelled rows.

    After preprocessing, rows whose ``problematic`` value is missing are
    removed from ``self.df``.
    """
    smells_csv = util.data_path("smell_detection", "smells.csv")
    self.df = self.__preprocess_df(pd.read_csv(smells_csv))

    unlabelled = self.df.problematic.isna()
    self.df = self.df[~unlabelled]
def get_app_reviews_dataloader(self):
    """Build an unshuffled evaluation dataloader for the AppReviews CSV.

    Labels are taken from the ``oracle`` column and mapped with
    ``LABEL_MAP_SIGN``; the text is read from the ``sentence`` column.
    """
    reviews_csv = util.data_path("sentidata/AppReviews.csv")
    reviews = pd.read_csv(reviews_csv, delimiter=',')
    reviews = self.preprocess(reviews,
                              'oracle',
                              label_map=self.LABEL_MAP_SIGN)
    loader = self.get_dataloader(reviews,
                                 bs=self.config.eval_bs,
                                 text_col='sentence',
                                 shuffle=False)
    return loader
def get_jira_dataloader(self):
    """Build an unshuffled evaluation dataloader for the jira CSV.

    Labels come from the ``label`` column and are mapped with
    ``LABEL_MAP_SIGN``.
    """
    jira_csv = util.data_path("sentidata/jira.csv")
    issues = pd.read_csv(jira_csv, delimiter=',')
    # NOTE: this replaces the label column with the preprocessed one
    issues = self.preprocess(issues, 'label', label_map=self.LABEL_MAP_SIGN)
    loader = self.get_dataloader(issues,
                                 bs=self.config.eval_bs,
                                 shuffle=False)
    return loader
def get_stack_overflow_dataloader(self):
    """Build an unshuffled evaluation dataloader for the StackOverflow CSV.

    Labels come from the ``oracle`` column and are mapped with
    ``LABEL_MAP_SIGN``.
    """
    posts_csv = util.data_path("sentidata/StackOverflow.csv")
    posts = pd.read_csv(posts_csv, delimiter=',')
    posts = self.preprocess(posts, 'oracle', label_map=self.LABEL_MAP_SIGN)
    loader = self.get_dataloader(posts,
                                 bs=self.config.eval_bs,
                                 shuffle=False)
    return loader
def load_dfs(self):
    """Read the senti4sd train and test CSVs, then preprocess both.

    Both files are read before either is preprocessed; the results are
    stored on ``self.train_valid_df`` and ``self.test_df``.
    """
    raw_train_valid = pd.read_csv(util.data_path("senti4sd", "train_ext.csv"))
    raw_test = pd.read_csv(util.data_path("senti4sd", "test_ext.csv"))

    self.train_valid_df = self.preprocess(raw_train_valid)
    self.test_df = self.preprocess(raw_test)
def main():
    """Classify the unclassified SATD comments and write the results to CSV.

    Reads ``satd/unclassified.csv``, preprocesses the ``commenttext`` column,
    runs the model loaded from the ``satd_complete_binary`` model path over
    it, and writes two files into the ``satd`` data directory:

    * ``unclassified_evaled.csv`` -- every row, with the added columns
      ``uncertainty``, ``probs0``, ``probs1`` and ``classification``;
    * ``unclassified_pos.csv`` -- only the rows classified as
      ``'TECHNICAL_DEBT'``.
    """
    config = get_config()
    # NOTE(review): the source layout was lost; the `with` body is assumed to
    # cover only the config assignments -- confirm against the original file.
    with config:
        config.logging_steps = 400
        config.train_epochs = 2
        config.lr = 4e-5
        # config.lr = 1e-4
        config.model_type = 'roberta'
        config.model_path = util.models_path('satd_complete_binary')
        # config.train_head_only = True

    tokenizer = tu.load_tokenizer(config)
    # NOTE(review): model_cls is not used anywhere below.
    model_cls = tu.get_model_cls(config)

    df = pd.read_csv(util.data_path('satd', 'unclassified.csv'))
    # df = pd.read_csv(util.data_path('satd', 'dataset.csv'))
    df.dropna(inplace=True)
    # df.rename(columns={'classification': 'orig_classification'}, inplace=True)
    print(df.dtypes)
    print(df.head())

    df['preprocessed'] = df.commenttext.map(TDDataset.preprocess)
    # Drop rows that have missing values after preprocessing.
    df.dropna(inplace=True)
    # df = df.head(100)

    preprocessed = df.preprocessed.values
    # Placeholder labels (all zeros), one per text, passed to the dataloader.
    dummy_labels = np.zeros(preprocessed.shape[0])
    # shuffle=False keeps predictions aligned with the rows of `df`.
    dataloader = tu.get_dataloader(config,
                                   tokenizer,
                                   preprocessed,
                                   dummy_labels,
                                   bs=128,
                                   shuffle=False)

    model = tu.load_model(config)
    model.to(config.device)
    util.set_seed(config)

    experiment = Experiment(config, model, tokenizer)
    preds = experiment.predict(dataloader)
    preds = torch.from_numpy(preds)
    # Per-row class probabilities.
    probs = F.softmax(preds, dim=1)
    # presumably a least-confidence score per row -- verify against least_conf
    uncertainty = least_conf(probs).numpy()
    # Index of the highest-scoring class per row.
    labels = np.argmax(preds, axis=1)

    df['uncertainty'] = uncertainty
    df['probs0'] = probs[:, 0].numpy()
    df['probs1'] = probs[:, 1].numpy()
    df['classification'] = labels
    df.drop('preprocessed', axis='columns', inplace=True)

    # Map class indices to the names in TDDataset.BINARY_LABEL_NAMES.
    label_name_map = {i: l for i, l in enumerate(TDDataset.BINARY_LABEL_NAMES)}
    print(label_name_map)
    # convert_label = {'DEFECT': 1, 'DESIGN': 1,
    #                  'IMPLEMENTATION': 1, 'TEST': 1,
    #                  'WITHOUT_CLASSIFICATION': 0, 'DOCUMENTATION': 1}
    # df['correct'] = (df.orig_classification.map(convert_label) == df.classification)
    # print(df.correct.value_counts(normalize=True))
    df.classification = df.classification.map(label_name_map)
    df.to_csv(util.data_path('satd', 'unclassified_evaled.csv'), index=False)

    # Keep only the rows predicted as technical debt for the second file.
    tech_debt_df = df[df.classification == 'TECHNICAL_DEBT']
    print(tech_debt_df.shape)
    tech_debt_df.to_csv(util.data_path('satd', 'unclassified_pos.csv'),
                        index=False)
def load_df(self):
    """Read ``review_classification/reviews_backtrans.csv`` into ``self.df``.

    The raw frame is passed through ``__preprocess_df`` before being stored.
    """
    reviews_csv = util.data_path("review_classification",
                                 "reviews_backtrans.csv")
    raw_reviews = pd.read_csv(reviews_csv)
    self.df = self.__preprocess_df(raw_reviews)
def load_dfs(self):
    """Read ``corcod/dataset.csv`` into ``self.df`` after preprocessing."""
    dataset_csv = util.data_path("corcod", "dataset.csv")
    raw_frame = pd.read_csv(dataset_csv)
    self.df = self.__preprocess_df(raw_frame)
def load_df(self):
    """Read ``comment_classification/comments.csv`` into ``self.df``.

    The raw frame is passed through ``__preprocess_df`` before being stored.
    """
    comments_csv = util.data_path("comment_classification", "comments.csv")
    raw_comments = pd.read_csv(comments_csv)
    self.df = self.__preprocess_df(raw_comments)