def train(self, model):
    """Train `model` epoch-by-epoch, streaming one HDF5 chunk per epoch.

    Resumes from a saved checkpoint when train_params['reload'] > 0,
    tracks the best validation loss seen so far, and periodically saves
    and/or validates the model according to train_params.

    :param model: project model object exposing fit(); also passed to
                  self.save_model / self.valid.
    """
    if self.train_params['reload'] > 0:
        self.load_model(model, self.train_params['reload'])
    valid_every = self.train_params.get('valid_every', None)
    save_every = self.train_params.get('save_every', None)
    batch_size = self.train_params.get('batch_size', 128)
    nb_epoch = self.train_params.get('nb_epoch', 10)
    split = self.train_params.get('validation_split', 0)
    val_loss = {'loss': 1., 'epoch': 0}  # best validation loss so far
    chunk_size = self.train_params.get('chunk_size', 100000)

    for i in range(self.train_params['reload'] + 1, nb_epoch):
        print('Epoch %d :: \n' % i, end='')
        logger.debug('loading data chunk..')
        # Each epoch reads a distinct slice of the training files.
        # (Was re-fetching chunk_size from train_params here; use the
        # local bound above so one value governs both offset and length.)
        offset = (i - 1) * chunk_size
        names = data_loader.load_hdf5(
            self.data_path + self.data_params['train_methname'], offset, chunk_size)
        apis = data_loader.load_hdf5(
            self.data_path + self.data_params['train_apiseq'], offset, chunk_size)
        tokens = data_loader.load_hdf5(
            self.data_path + self.data_params['train_tokens'], offset, chunk_size)
        descs = data_loader.load_hdf5(
            self.data_path + self.data_params['train_desc'], offset, chunk_size)

        logger.debug('padding data..')
        methnames = pad(names, self.data_params['methname_len'])
        apiseqs = pad(apis, self.data_params['apiseq_len'])
        tokens = pad(tokens, self.data_params['tokens_len'])
        good_descs = pad(descs, self.data_params['desc_len'])
        # Negative samples: shuffle a copy of the descriptions so that
        # good/bad description pairs are misaligned with the code inputs.
        bad_descs = list(descs)
        random.shuffle(bad_descs)
        bad_descs = pad(bad_descs, self.data_params['desc_len'])

        hist = model.fit(
            [methnames, apiseqs, tokens, good_descs, bad_descs],
            epochs=1, batch_size=batch_size, validation_split=split)

        if hist.history['val_loss'][0] < val_loss['loss']:
            val_loss = {'loss': hist.history['val_loss'][0], 'epoch': i}
        print('Best: Loss = {}, Epoch = {}'.format(val_loss['loss'], val_loss['epoch']))

        if save_every is not None and i % save_every == 0:
            self.save_model(model, i)
        if valid_every is not None and i % valid_every == 0:
            # `mean_ap` (was `map`) avoids shadowing the builtin.
            acc, mrr, mean_ap, ndcg = self.valid(model, 1000, 1)
def repr_code(self, model):
    """Embed the whole 'use' code corpus with `model`.

    Loads method names, API sequences and tokens in full, pads each to
    its configured length, and returns the resulting embedding vectors
    as L2-normalized float32.
    """
    def _load(key):
        # Read the entire dataset (length -1) for the given config key.
        return data_loader.load_hdf5(self.data_path + self.data_params[key], 0, -1)

    logger.info('Loading the use data ..')
    padded_names = pad(_load('use_methname'), self.data_params['methname_len'])
    padded_apis = pad(_load('use_apiseq'), self.data_params['apiseq_len'])
    padded_tokens = pad(_load('use_tokens'), self.data_params['tokens_len'])

    logger.info('Representing code ..')
    code_vecs = model.repr_code([padded_names, padded_apis, padded_tokens],
                                batch_size=1000)
    return normalize(code_vecs.astype('float32'))
def repr_code(self, model):
    """Embed the 'use' method names with `model`.

    Loads the full method-name dataset, pads it, embeds it, and returns
    the vectors L2-normalized.
    """
    logger.info('Loading the use data ..')
    methnames = data_loader.load_hdf5(
        self.data_path + self.data_params['use_methname'], 0, -1)
    methnames = pad(methnames, self.data_params['methname_len'])

    logger.info('Representing code ..')
    vecs = model.repr_code([methnames], batch_size=10000)
    # Fix: `np.float` was removed in NumPy 1.24; it aliased the builtin
    # float (i.e. float64), so np.float64 preserves the original dtype.
    vecs = vecs.astype(np.float64)
    vecs = normalize(vecs)
    return vecs
def valid(self, model, poolsize, K):
    """
    validate in a code pool.
    param: poolsize - size of the code pool, if -1, load the whole test set
    """

    # Nested metric helpers: `real` is the list of relevant indices,
    # `predict` the ranked top-K prediction list. Locals renamed so they
    # no longer shadow the builtins `sum` and `id`.
    def ACC(real, predict):
        # Fraction of relevant items found anywhere in `predict`.
        hits = 0.0
        for val in real:
            if val in predict:
                hits += 1
        return hits / float(len(real))

    def MAP(real, predict):
        # Mean average precision over the relevant items.
        total = 0.0
        for pos, val in enumerate(real):
            try:
                rank = predict.index(val) + 1
            except ValueError:
                continue
            total += (pos + 1) / float(rank)
        return total / float(len(real))

    def MRR(real, predict):
        # Mean reciprocal rank of the relevant items.
        total = 0.0
        for val in real:
            try:
                rank = predict.index(val) + 1
            except ValueError:
                continue
            total += 1.0 / float(rank)
        return total / float(len(real))

    def NDCG(real, predict):
        # Normalized discounted cumulative gain with binary relevance.
        dcg = 0.0
        idcg = IDCG(len(real))
        for pos, predict_item in enumerate(predict):
            if predict_item in real:
                item_relevance = 1
                rank = pos + 1
                dcg += ((math.pow(2, item_relevance) - 1.0)
                        * (math.log(2) / math.log(rank + 1)))
        return dcg / float(idcg)

    def IDCG(n):
        # Ideal DCG: all n relevant items ranked first.
        idcg = 0
        item_relevance = 1
        for pos in range(n):
            idcg += ((math.pow(2, item_relevance) - 1.0)
                     * (math.log(2) / math.log(pos + 2)))
        return idcg

    # Load validation dataset once and cache it on the instance.
    if self._eval_sets is None:
        methnames = data_loader.load_hdf5(
            self.data_path + self.data_params['valid_methname'], 0, poolsize)
        apiseqs = data_loader.load_hdf5(
            self.data_path + self.data_params['valid_apiseq'], 0, poolsize)
        tokens = data_loader.load_hdf5(
            self.data_path + self.data_params['valid_tokens'], 0, poolsize)
        descs = data_loader.load_hdf5(
            self.data_path + self.data_params['valid_desc'], 0, poolsize)
        self._eval_sets = {
            'methnames': methnames,
            'apiseqs': apiseqs,
            'tokens': tokens,
            'descs': descs
        }

    acc, mrr, mean_ap, ndcg = 0, 0, 0, 0  # `mean_ap` avoids shadowing builtin `map`
    data_len = len(self._eval_sets['descs'])
    # Hoisted out of the loop: the code-side inputs do not depend on the
    # query description, so padding them once avoids O(n^2) rework.
    methnames = pad(self._eval_sets['methnames'], self.data_params['methname_len'])
    apiseqs = pad(self._eval_sets['apiseqs'], self.data_params['apiseq_len'])
    tokens = pad(self._eval_sets['tokens'], self.data_params['tokens_len'])
    for i in tqdm(range(data_len)):
        desc = self._eval_sets['descs'][i]  # good desc
        # Broadcast the single query description against the whole pool.
        descs = pad([desc] * data_len, self.data_params['desc_len'])
        n_results = K
        sims = model.predict([methnames, apiseqs, tokens, descs],
                             batch_size=data_len).flatten()
        negsims = np.negative(sims)
        predict = np.argsort(
            negsims)  #predict = np.argpartition(negsims, kth=n_results-1)
        predict = predict[:n_results]
        predict = [int(k) for k in predict]
        real = [i]  # the matching code snippet has the query's own index
        acc += ACC(real, predict)
        mrr += MRR(real, predict)
        mean_ap += MAP(real, predict)
        ndcg += NDCG(real, predict)
    acc = acc / float(data_len)
    mrr = mrr / float(data_len)
    mean_ap = mean_ap / float(data_len)
    ndcg = ndcg / float(data_len)
    logger.info(f'ACC={acc}, MRR={mrr}, MAP={mean_ap}, nDCG={ndcg}')
    return acc, mrr, mean_ap, ndcg
#################################################################################################################################################################### data_param = config.get('data_params', dict()) ### Loading training set data = pd.read_csv('use.rawcode.txt', sep="\n", header=None) for i in range(len(data[0])): method_string = data[0][i] code_embedding_DeepCS.replace_methname(method_string, i) methname = pd.read_csv('new_methname.txt', sep="\n", header=None) methnames = code_embedding_DeepCS.text_to_array(methname[0]) descs = data_loader.load_hdf5(data_path + data_param['train_desc'], 0, args.chunk_size) good_descs = pad(descs, data_param['desc_len']) bad_descs = [desc for desc in descs] random.shuffle(bad_descs) bad_descs = pad(bad_descs, data_param['desc_len']) #refactoring refactored_mn = TestSuiteGenerator.generateTestSuite(model, methnames, args.chunk_size, args.gen, args.mutation_rate) refactored_mn = np.array(refactored_mn).reshape(args.chunk_size, 6) methnames = np.concatenate((methnames, refactored_mn), axis=0) good_descs = np.concatenate((good_descs, good_descs), axis=0) bad_descs = np.concatenate((bad_descs, bad_descs), axis=0)
def valid(self, model, poolsize, K):
    """
    validate in a code pool.
    param: poolsize - size of the code pool, if -1, load the whole test set
    """

    # Nested metric helpers: `real` holds the relevant indices, `predict`
    # the ranked top-K predictions. Locals renamed so they no longer
    # shadow the builtins `sum` and `id`.
    def ACC(real, predict):
        # Fraction of relevant items found anywhere in `predict`.
        hits = 0.0
        for val in real:
            if val in predict:
                hits += 1
        return hits / float(len(real))

    def MAP(real, predict):
        # Mean average precision over the relevant items.
        total = 0.0
        for pos, val in enumerate(real):
            try:
                rank = predict.index(val) + 1
            except ValueError:
                continue
            total += (pos + 1) / float(rank)
        return total / float(len(real))

    def MRR(real, predict):
        # Mean reciprocal rank of the relevant items.
        total = 0.0
        for val in real:
            try:
                rank = predict.index(val) + 1
            except ValueError:
                continue
            total += 1.0 / float(rank)
        return total / float(len(real))

    def NDCG(real, predict):
        # Normalized discounted cumulative gain with binary relevance.
        dcg = 0.0
        idcg = IDCG(len(real))
        for pos, predict_item in enumerate(predict):
            if predict_item in real:
                item_relevance = 1
                rank = pos + 1
                dcg += ((math.pow(2, item_relevance) - 1.0)
                        * (math.log(2) / math.log(rank + 1)))
        return dcg / float(idcg)

    def IDCG(n):
        # Ideal DCG: all n relevant items ranked first.
        idcg = 0
        item_relevance = 1
        for pos in range(n):
            idcg += ((math.pow(2, item_relevance) - 1.0)
                     * (math.log(2) / math.log(pos + 2)))
        return idcg

    #load valid dataset
    print('self._eval_sets,', self._eval_sets)
    if self._eval_sets is None:
        methnames = data_loader.load_hdf5(
            self.data_path + self.data_params['valid_methname'], 0, poolsize)
        descs = data_loader.load_hdf5(
            self.data_path + self.data_params['valid_desc'], 0, poolsize)
        self._eval_sets = {'methnames': methnames, 'descs': descs}

    accs, mrrs, maps, ndcgs = [], [], [], []
    data_len = len(self._eval_sets['descs'])
    print('data_len', data_len)
    # Hoisted out of the loop: the code-side input does not depend on the
    # query description, so pad it once instead of every iteration.
    methnames = pad(self._eval_sets['methnames'], self.data_params['methname_len'])
    for i in tqdm(range(data_len)):
        desc = self._eval_sets['descs'][i]  #good desc
        descs = pad([desc] * data_len, self.data_params['desc_len'])
        n_results = K
        sims = model.predict([methnames, descs], batch_size=data_len).flatten()
        negsims = np.negative(sims)
        # Fix: argpartition only guarantees the K smallest values land in
        # the first K slots, NOT their order — but MRR/MAP/nDCG depend on
        # rank within `predict`. Sort the partitioned top-K by similarity.
        top_k = np.argpartition(negsims, kth=n_results - 1)[:n_results]
        predict = top_k[np.argsort(negsims[top_k])]
        predict = [int(k) for k in predict]
        real = [i]  # the matching code snippet has the query's own index
        accs.append(ACC(real, predict))
        mrrs.append(MRR(real, predict))
        maps.append(MAP(real, predict))
        ndcgs.append(NDCG(real, predict))
    logger.info(
        f'ACC={np.mean(accs)}, MRR={np.mean(mrrs)}, MAP={np.mean(maps)}, nDCG={np.mean(ndcgs)}'
    )
    return accs, mrrs, maps, ndcgs
mode = 'test' filename = './log_folder/record.txt' threshold_CC = 0 threshold_MC = 0.7 symbols_SQ = 2 seq = '[0 ,1]' seq = re.findall(r"\d+\.?\d*", seq) TargMetri = None CoverageStop = 0.9 TestCaseNum = 2000 minimalTest = 0 r = record(filename, time.time()) data_param = config.get('data_params', dict()) ### Loading training set names = data_loader.load_hdf5(data_path + data_param['train_methname'], 0, data_param.get('chunk_size', 100000)) methnames = pad(names, data_param['methname_len']) print(methnames[0]) z = methnames[0].reshape(1, 6) print(z) z[0][1] = 0 print(z) apis = data_loader.load_hdf5( data_path + data_param['train_apiseq'], 0, data_param.get('chunk_size', 100000), ) tokens = data_loader.load_hdf5( data_path + data_param['train_tokens'], 0, data_param.get('chunk_size', 100000),