예제 #1
0
 def make_patterns(args, cur_work_dir):
     """Run ``make patterns`` in *cur_work_dir* and report the dataset location.

     On success the captured output of ``make`` is printed. If
     ``args.dataset_file`` is truthy, the file at ``Config.dataset_file()``
     is copied to ``Path(cur_work_dir, args.dataset_file)`` (missing parent
     directories are created); otherwise the dataset stays where
     ``Config.dataset_file()`` points. The final absolute path is printed.

     Args:
         args: object exposing a ``dataset_file`` attribute (presumably an
             ``argparse.Namespace`` -- confirm against the caller).
         cur_work_dir: directory in which ``make`` is executed and against
             which ``args.dataset_file`` is resolved.

     Raises:
         SystemExit: with code 3 when ``make patterns`` returns non-zero.
     """
     print('Compute patterns...')
     # stderr must be piped too: without it ``result.stderr`` is None and the
     # failure branch would print "None" instead of the make diagnostics.
     result = subprocess.run(
         ['make', 'patterns'],
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
         encoding='utf-8',
         cwd=cur_work_dir,
     )
     print(result.returncode)
     if result.returncode != 0:
         print(result.stderr)
         # ``exit`` is only injected by the ``site`` module; raising
         # SystemExit directly is the same exception without that dependency.
         raise SystemExit(3)

     print(result.stdout)
     if result.stderr:
         # Warnings emitted by a successful build are now captured, so
         # forward them instead of silently dropping them.
         print(result.stderr)

     if args.dataset_file:
         dataset_file_path = Path(cur_work_dir, args.dataset_file)
         # exist_ok avoids the check-then-create race of a manual exists().
         dataset_file_path.parent.mkdir(parents=True, exist_ok=True)
         shutil.copy(Path(Config.dataset_file()), dataset_file_path)
     else:
         dataset_file_path = Path(Config.dataset_file())
     print('dataset was saved to {}'.format(str(dataset_file_path.absolute())))
예제 #2
0
    def preprocess_file(self, scale_ncss=True, scale=False, **kwargs):
        """Load the dataset CSV and fill ``input``, ``target``, ``feature_order``.

        Steps, in order:
          1. read ``Config.dataset_file()`` and drop rows whose ``filename``
             contains "test" (case-insensitive);
          2. optionally rename code-named columns to their configured names
             (when ``self.do_rename_columns`` is truthy);
          3. drop rows with missing values and rows that duplicate another
             row in every column except ``filename``;
          4. keep rows with ``20 < ncss < 100`` and
             ``npath_method_avg < 100000``;
          5. take column ``M4`` as the target and the ``self.only_patterns``
             columns as features.

        Args:
            scale_ncss: when true, divide every feature column and the target
                by column ``M2`` (presumably the NCSS metric -- confirm
                against the patterns config).
            scale: when true, standardise the features with
                ``StandardScaler``.
            **kwargs: accepted for interface compatibility; not used here.

        Side effects:
            Sets ``self.target`` (1-D array), ``self.input`` (2-D array) and
            ``self.feature_order`` (list of feature column names).
        """
        df = pd.read_csv(Config.dataset_file())
        # na=False keeps the mask boolean when a file name is missing; such
        # rows are removed by dropna() below anyway.
        is_test_file = df["filename"].str.lower().str.contains("test", na=False)
        df = df[~is_test_file]
        config = Config.get_patterns_config()
        if self.do_rename_columns:
            patterns = config['patterns']
            metrics = config['metrics']
            keys = (
                [x['code'] for x in patterns]
                + ['lines' + x['code'] for x in patterns]
                + [x['code'] for x in metrics]
            )
            vals = (
                [x['name'] for x in patterns]
                + ['lines' + x['name'] for x in patterns]
                + [x['name'] for x in metrics]
            )
            replace_dict = dict(zip(keys, vals))
            # rename() maps *index* labels unless ``columns=`` is given, so the
            # mapping has to be passed by keyword. Renaming by label (instead
            # of overwriting df.columns positionally) leaves unmapped columns
            # such as 'filename' intact regardless of column order.
            # NOTE(review): the code below still addresses columns as 'ncss',
            # 'npath_method_avg', 'M2' and 'M4' -- confirm they survive the
            # rename for the configs this flag is used with.
            df = df.rename(columns=replace_dict)
            # Format the frame into the message; ``str + DataFrame`` is an
            # element-wise operation, not string concatenation.
            print('Columns renamed:\n{}'.format(df.head()))

        df = df.dropna().drop_duplicates(
            subset=df.columns.difference(['filename']))
        in_range = (
            (df.ncss > 20)
            & (df.ncss < 100)
            & (df.npath_method_avg < 100000.00)
        )
        # reset_index() stores the old index in an 'index' column, which is
        # discarded together with 'filename' right below.
        df = df[in_range].copy().reset_index()
        df = df.drop(columns=['filename', 'index'])

        self.target = df[['M4']].values[:, 0]
        if scale_ncss:
            divisor = df['M2'].values
            new = pd.DataFrame(
                df[self.only_patterns].values / divisor.reshape((-1, 1)),
                columns=self.only_patterns)
            # Out-of-place division: ``/=`` raises for an integer-typed 'M4'
            # (float result cannot be cast back) and could write through to
            # the frame's underlying buffer.
            self.target = self.target / divisor.reshape(-1)
        else:
            new = df[self.only_patterns].copy()

        if scale:
            # fit_transform already returns the plain array that was
            # previously obtained by wrapping in a DataFrame and unwrapping.
            self.input = StandardScaler().fit_transform(new.values)
        else:
            self.input = new.values

        self.feature_order = list(new.columns)