Example #1
0
    def get_dataloader(self, df, features_df, bs, include_df=False, **kwargs):
        """Build a dataloader from *df*, optionally preparing scaled extra features.

        Args:
            df: DataFrame with ``preprocessed`` text and ``label`` columns.
            features_df: optional DataFrame of extra numeric features; its
                ``Readable`` column is dropped and the remainder standard-scaled.
            bs: batch size.
            include_df: when True, also return *df* alongside the dataloader.
            **kwargs: forwarded to ``tu.get_dataloader``.

        Returns:
            The dataloader, or ``(dataloader, df)`` when *include_df* is True.

        Raises:
            ValueError: if the extra-feature width does not match
                ``config.extra_features_size``.
        """
        text_values = df.preprocessed.values
        label_ids = df.label.values

        if features_df is not None:
            extra_features = features_df.drop('Readable', axis=1).to_numpy()
            # NOTE: fit_transform here fits the scaler on this split alone,
            # so different splits are scaled independently.
            extra_features = StandardScaler().fit_transform(extra_features)
            # Explicit raise instead of assert: asserts vanish under `python -O`.
            if extra_features.shape[1] != self.config.extra_features_size:
                raise ValueError(
                    f'expected {self.config.extra_features_size} extra features, '
                    f'got {extra_features.shape[1]}')
        else:
            extra_features = None

        # NOTE(review): extra_features is computed but not currently passed to
        # tu.get_dataloader — presumably disabled on purpose; confirm.

        if self.config.sep_token:
            # Leave the first text segment empty; the text goes into the
            # second (pair) segment after the separator token.
            dataloader = tu.get_dataloader(self.config,
                                           self.tokenizer,
                                           [''] * len(text_values),
                                           label_ids,
                                           bs,
                                           text_pair_values=text_values,
                                           **kwargs)
        else:
            dataloader = tu.get_dataloader(self.config, self.tokenizer,
                                           text_values, label_ids, bs,
                                           **kwargs)

        if include_df:
            return (dataloader, df)

        return dataloader
    def get_dataloader(self, df, bs, include_df=False, fake=False, **kwargs):
        """Build a dataloader over ``methodText`` with ``cyclomatic`` labels.

        When *fake* is set, only the first 128 samples are kept and their
        labels are shuffled — a null-model sanity check.
        """
        texts = df.methodText.values
        labels = df.cyclomatic.values

        if fake:
            # Shuffle a copy so the original label array is left intact.
            labels = labels.copy()[:128]
            texts = texts[:128]
            np.random.shuffle(labels)

        if self.config.sep_token:
            # First text segment stays empty; content goes into the pair segment.
            dataloader = tu.get_dataloader(self.config, self.tokenizer,
                                           [''] * len(texts), labels, bs,
                                           text_pair_values=texts, **kwargs)
        else:
            dataloader = tu.get_dataloader(self.config, self.tokenizer,
                                           texts, labels, bs, **kwargs)

        return (dataloader, df) if include_df else dataloader
Example #3
0
    def get_dataloader(self, df, bs, include_df=False, **kwargs):
        """Build a dataloader from preprocessed text with the ``class`` column as labels.

        Returns the dataloader, or ``(dataloader, df)`` when *include_df* is True.
        """
        # Use .values for consistency with the sibling implementations:
        # pass plain ndarrays, not pandas Series, to tu.get_dataloader.
        text_values = df.preprocessed.values
        label_ids = df['class'].values
        dataloader = tu.get_dataloader(self.config, self.tokenizer,
                                       text_values, label_ids, bs, **kwargs)

        if include_df:
            return (dataloader, df)
        return dataloader
Example #4
0
    def get_dataloader(self, df, bs, include_df=False, **kwargs):
        """Build a dataloader from the ``text`` column with the boolean
        ``informative`` column cast to integer labels."""
        texts = df.text.values
        labels = df.informative.astype(int).values

        loader = tu.get_dataloader(self.config, self.tokenizer,
                                   texts, labels, bs, **kwargs)

        return (loader, df) if include_df else loader
Example #5
0
    def get_dataloader(self, df, bs, text_col='text', **kwargs):
        """Build a dataloader from *text_col*.

        Labels are the per-class soft-label matrix (``LABEL_NAMES`` columns)
        when ``config.soft_label`` is set, otherwise the hard ``label`` column.
        """
        texts = df[text_col].values

        if self.config.soft_label:
            labels = df[self.LABEL_NAMES].to_numpy()
        else:
            labels = df.label.values

        return tu.get_dataloader(self.config, self.tokenizer,
                                 texts, labels, bs, **kwargs)
Example #6
0
    def get_dataloader(self, df, bs, include_df=False, **kwargs):
        """Build a dataloader over ``code`` with the ``problematic`` column
        cast to integer labels.

        Returns the dataloader, or ``(dataloader, df)`` when *include_df* is True.
        """
        text_values = df.code.values
        label_ids = df.problematic.astype(int).values
        # (debug print of label_ids removed)

        if self.config.sep_token:
            # Leave the first text segment empty; the code goes into the
            # second (pair) segment after the separator token.
            dataloader = tu.get_dataloader(self.config,
                                           self.tokenizer,
                                           [''] * len(text_values),
                                           label_ids,
                                           bs,
                                           text_pair_values=text_values,
                                           **kwargs)
        else:
            dataloader = tu.get_dataloader(self.config, self.tokenizer,
                                           text_values, label_ids, bs,
                                           **kwargs)

        if include_df:
            return (dataloader, df)

        return dataloader
Example #7
0
    def get_dataloader(self, df, bs, include_df=False, **kwargs):
        """Build a dataloader over ``code``, mapping ``complexity`` class names
        to their index in ``LABEL_NAMES``."""
        name_to_index = {name: i for i, name in enumerate(self.LABEL_NAMES)}
        texts = df.code.values
        labels = df.complexity.map(name_to_index).values

        if self.config.sep_token:
            # First text segment stays empty; code goes into the pair segment.
            loader = tu.get_dataloader(self.config, self.tokenizer,
                                       [''] * len(texts), labels, bs,
                                       text_pair_values=texts, **kwargs)
        else:
            loader = tu.get_dataloader(self.config, self.tokenizer,
                                       texts, labels, bs, **kwargs)

        return (loader, df) if include_df else loader
Example #8
0
    def get_dataloader(self, tokenizer, df, bs, include_df=False, **kwargs):
        """Build a dataloader from preprocessed text.

        Labels come from the single column named by ``config.single_class``
        when set, otherwise from all ``Dataset.LABELS`` columns.
        """
        texts = df.preprocessed.values
        single_class = self.config.single_class

        # A string column name yields a 1-D label array; the full column
        # list yields a 2-D multi-label matrix.
        columns = single_class if single_class else Dataset.LABELS
        labels = df[columns].to_numpy()

        loader = tu.get_dataloader(self.config, tokenizer, texts,
                                   labels, bs, **kwargs)

        return (loader, df) if include_df else loader
Example #9
0
    def get_dataloader(self, tokenizer, df, bs, include_df=False, **kwargs):
        """Build a dataloader from preprocessed text with the ``label`` column.

        Returns the dataloader, or ``(dataloader, df)`` when *include_df* is True.
        """
        text_values = df.preprocessed.values
        label_ids = df.label.values
        # (debug prints of label dtype/uniques and the commented-out
        # convert_label mapping removed)

        dataloader = tu.get_dataloader(self.config, tokenizer, text_values,
                                       label_ids, bs, **kwargs)

        if include_df:
            return (dataloader, df)

        return dataloader
Example #10
0
    def get_dataloader(self, df, bs, backtrans_langs=None, include_df=False, **kwargs):
        """Build a dataloader, optionally augmented with back-translated text.

        Args:
            df: DataFrame with ``preprocessed`` text and ``label_id`` columns,
                plus a ``preprocessed_{lang}`` column per back-translation language.
            bs: batch size.
            backtrans_langs: language codes whose back-translated columns are
                appended to the text values; ``None`` or empty disables
                augmentation.
            include_df: when True, also return *df* alongside the dataloader.
            **kwargs: forwarded to ``tu.get_dataloader``.
        """
        # None sentinel instead of a mutable default argument ([]).
        backtrans_langs = backtrans_langs or []

        text_values = df.preprocessed.values
        label_ids = df.label_id.astype(int).values

        if backtrans_langs:
            logger.info("Text values array shape before augmentation: %s", text_values.shape)
            logger.info("Label ids array shape before augmentation: %s", label_ids.shape)

            for lang in backtrans_langs:
                text_values = np.append(text_values, df[f'preprocessed_{lang}'].values, axis=0)
            # One copy of the labels per text variant: original + each language.
            label_ids = np.tile(label_ids, len(backtrans_langs) + 1)

            logger.info("Text values array shape AFTER augmentation: %s", text_values.shape)
            logger.info("Label ids array shape AFTER augmentation: %s", label_ids.shape)

        dataloader = tu.get_dataloader(self.config, self.tokenizer, text_values, label_ids, bs, **kwargs)

        if include_df:
            return (dataloader, df)

        return dataloader
Example #11
0
    def get_dataloader(self, df, bs, include_df=False, **kwargs):
        """Build a dataloader with ``label`` column labels.

        With ``config.sep_token`` set, the preprocessed comment and code form
        a text pair; otherwise a single ``preprocessed`` text is used.
        """
        if self.config.sep_token:
            primary = df.preprocessed_comment.values
            secondary = df.preprocessed_code.values
        else:
            primary = df.preprocessed.values
            secondary = None

        loader = tu.get_dataloader(self.config,
                                   self.tokenizer,
                                   primary,
                                   df.label.values,
                                   bs,
                                   text_pair_values=secondary,
                                   **kwargs)

        return (loader, df) if include_df else loader
def main():
    """Run a trained binary SATD model over unclassified comments and write
    the predictions, class probabilities, and uncertainty back to CSV.

    Reads ``satd/unclassified.csv``; writes ``satd/unclassified_evaled.csv``
    (all rows) and ``satd/unclassified_pos.csv`` (predicted technical debt).
    """
    config = get_config()
    with config:
        config.logging_steps = 400
        config.train_epochs = 2
        config.lr = 4e-5
        config.model_type = 'roberta'
        config.model_path = util.models_path('satd_complete_binary')

    tokenizer = tu.load_tokenizer(config)

    df = pd.read_csv(util.data_path('satd', 'unclassified.csv'))
    df.dropna(inplace=True)

    print(df.dtypes)
    print(df.head())

    df['preprocessed'] = df.commenttext.map(TDDataset.preprocess)
    # Preprocessing may yield NaN for some comments; drop those rows too.
    df.dropna(inplace=True)

    preprocessed = df.preprocessed.values
    # No ground-truth labels here — the dataloader API still expects an array.
    dummy_labels = np.zeros(preprocessed.shape[0])
    dataloader = tu.get_dataloader(config,
                                   tokenizer,
                                   preprocessed,
                                   dummy_labels,
                                   bs=128,
                                   shuffle=False)

    model = tu.load_model(config)
    model.to(config.device)
    util.set_seed(config)

    experiment = Experiment(config, model, tokenizer)

    preds = torch.from_numpy(experiment.predict(dataloader))
    probs = F.softmax(preds, dim=1)
    uncertainty = least_conf(probs).numpy()
    # argmax over logits == argmax over softmax probabilities.
    labels = preds.argmax(dim=1).numpy()

    df['uncertainty'] = uncertainty
    df['probs0'] = probs[:, 0].numpy()
    df['probs1'] = probs[:, 1].numpy()
    df['classification'] = labels
    df.drop('preprocessed', axis='columns', inplace=True)

    label_name_map = dict(enumerate(TDDataset.BINARY_LABEL_NAMES))
    print(label_name_map)

    df.classification = df.classification.map(label_name_map)
    df.to_csv(util.data_path('satd', 'unclassified_evaled.csv'), index=False)

    # Keep only rows predicted as technical debt for the positive-only dump.
    tech_debt_df = df[df.classification == 'TECHNICAL_DEBT']
    print(tech_debt_df.shape)
    tech_debt_df.to_csv(util.data_path('satd', 'unclassified_pos.csv'),
                        index=False)