def __init__(self, tripletFile, bugDataExtractor, trainingDatasetPath, bugDatabase):
    # Store the features extracted for each anchor bug and its duplicate.
    self.inputBug = []
    self.inputDuplicateBug = []
    self.cache = {}
    self.trainingDataset = BugDataset(trainingDatasetPath)
    self.pairs = []
    self.bugIds = [bugDatabase.bugList[idx]['bug_id'] for idx in range(self.trainingDataset.end)]
    self.masterIdByBugId = bugDatabase.getMasterIdByBugId(self.bugIds)
    self.bugDataExtractor = bugDataExtractor
    self.logger = logging.getLogger(__name__)

    # Each line of the triplet file contains an anchor bug, one of its duplicates
    # and a non-duplicate bug, separated by commas.
    with open(tripletFile, 'r') as f:
        for l in f:
            bugId, duplicateBugId, nonDuplicateBugId = l.strip().split(',')

            ftrsBug = bugDataExtractor.extract(bugId)
            ftrsDuplicateBug = bugDataExtractor.extract(duplicateBugId)

            self.inputBug.append(ftrsBug)
            self.inputDuplicateBug.append(ftrsDuplicateBug)
            self.pairs.append((bugId, duplicateBugId))
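# Illustration (not part of the original source): the triplet file parsed above is a
# plain-text CSV where each row holds an anchor bug, one of its duplicates and a
# non-duplicate, e.g. a file containing lines such as
#
#   4000,4123,5077
#   4123,4000,6512
#
# The ids shown here are hypothetical; only the "anchor,duplicate,non-duplicate"
# layout is implied by the split(',') above.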
parser = argparse.ArgumentParser(description='')
parser.add_argument('--bug_data', required=True, help="")
parser.add_argument('--dataset', required=True, help="")
parser.add_argument('--list_size', required=True, type=int, help="")
parser.add_argument('--type', required=True, help="")
parser.add_argument('--save', help="")
parser.add_argument('--model', help="")
parser.add_argument('--nproc', type=int, default=6, help="")
parser.add_argument('--same_prod', action='store_true', help="")

logging.basicConfig(level=logging.DEBUG, datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger()

args = parser.parse_args()
logger.info(args)

bugDataset = BugDataset(args.dataset)
bugIds = bugDataset.bugIds
duplicateBugs = bugDataset.duplicateIds

if args.type in {'tfidf', 'binary'}:
    # Insert imports to load TfIdfVectorizer class
    from data.bug_dataset import BugDataset

    bugReportDatabase = BugReportDatabase.fromJson(args.bug_data)
    masterBugIdByBugId = bugReportDatabase.getMasterIdByBugId()

    vectorizer = pickle.load(open(args.model, 'rb'))
    normalize = args.type == 'tfidf'

    negativeSimMatrix = generateNegativeListSparseVector(args.list_size, bugReportDatabase, bugIds, vectorizer,
"Performs the recall rate estimation each epoch. This parameter receives the file that contains the list of bug ids." ) logger = logging.getLogger() logger.setLevel(logging.DEBUG) logHandler = logging.StreamHandler() formatter = JsonLogFormatter() logHandler.setFormatter(formatter) logger.addHandler(logHandler) args = parser.parse_args() logger.info(args.__dict__) args.recall_ratio_k = [int(k) for k in args.recall_ratio_k] bugSetDataset = BugDataset(args.input) bugReportDatabase = BugReportDatabase.fromJson(args.bug_dataset) if args.recall_estimation: bugIds, listByBugId = pickle.load(open(args.recall_estimation, 'rb')) duplicateBugs = list(listByBugId.keys()) else: listByBugId = None bugIds = [] for idx in range(len(bugReportDatabase)): bugIds.append(bugReportDatabase.getBugByIndex(idx)['bug_id']) duplicateBugs = bugSetDataset.duplicateIdxs similarityListByDuplicate = []
def main(_run, _config, _seed, _log):
    """
    :param _run:
    :param _config:
    :param _seed:
    :param _log:
    :return:
    """
    """
    Setting and loading parameters
    """
    # Setting logger
    args = _config
    logger = _log
    logger.info(args)
    logger.info('It started at: %s' % datetime.now())

    torch.manual_seed(_seed)

    bugReportDatabase = BugReportDatabase.fromJson(args['bug_database'])
    paddingSym = "</s>"
    batchSize = args['batch_size']

    device = torch.device('cuda' if args['cuda'] else "cpu")

    if args['cuda']:
        logger.info("Turning CUDA on")
    else:
        logger.info("Turning CUDA off")

    # Folder where the preprocessed information will be cached.
    cacheFolder = args['cache_folder']

    # Parameters that are saved with the model and restored when a model is loaded.
    importantParameters = ['compare_aggregation', 'categorical']
    parametersToSave = dict([(parName, args[parName]) for parName in importantParameters])

    if args['load'] is not None:
        mapLocation = (lambda storage, loc: storage.cuda()) if args['cuda'] else 'cpu'
        modelInfo = torch.load(args['load'], map_location=mapLocation)
        modelState = modelInfo['model']

        for paramName, paramValue in modelInfo['params'].items():
            args[paramName] = paramValue
    else:
        modelState = None

    preprocessors = PreprocessorList()
    inputHandlers = []

    categoricalOpt = args.get('categorical')

    if categoricalOpt is not None and len(categoricalOpt) != 0:
        categoricalEncoder, _, _ = processCategoricalParam(
            categoricalOpt, bugReportDatabase, inputHandlers, preprocessors, None, logger)
    else:
        categoricalEncoder = None

    filterInputHandlers = []

    compareAggOpt = args['compare_aggregation']
    databasePath = args['bug_database']

    # Loading word embedding
    if compareAggOpt["lexicon"]:
        emb = np.load(compareAggOpt["word_embedding"])

        lexicon = Lexicon(unknownSymbol=None)
        with codecs.open(compareAggOpt["lexicon"]) as f:
            for l in f:
                lexicon.put(l.strip())

        lexicon.setUnknown("UUUKNNN")
        paddingId = lexicon.getLexiconIndex(paddingSym)
        embedding = Embedding(lexicon, emb, paddingIdx=paddingId)

        logger.info("Lexicon size: %d" % (lexicon.getLen()))
        logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize()))
    elif compareAggOpt["word_embedding"]:
        # todo: Allow the use of embeddings and other representations together
        lexicon, embedding = Embedding.fromFile(
            compareAggOpt['word_embedding'], 'UUUKNNN', hasHeader=False, paddingSym=paddingSym)
        logger.info("Lexicon size: %d" % (lexicon.getLen()))
        logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize()))
        paddingId = lexicon.getLexiconIndex(paddingSym)
    else:
        embedding = None

    if compareAggOpt["norm_word_embedding"]:
        embedding.zscoreNormalization()

    # Tokenizer
    if compareAggOpt['tokenizer'] == 'default':
        logger.info("Using the default tokenizer to tokenize summary information")
        tokenizer = MultiLineTokenizer()
    elif compareAggOpt['tokenizer'] == 'white_space':
        logger.info("Using the white space tokenizer to tokenize summary information")
        tokenizer = WhitespaceTokenizer()
    else:
        raise ArgumentError(
            "Tokenizer value %s is invalid. "
            "You should choose one of these: default and white_space"
            % compareAggOpt['tokenizer'])

    # Preparing input handlers, preprocessors and cache
    minSeqSize = max(compareAggOpt['aggregate']["window"]) if compareAggOpt['aggregate']["model"] == "cnn" else -1
    bow = compareAggOpt.get('bow', False)
    freq = compareAggOpt.get('frequency', False) and bow

    logger.info("BoW={} and TF={}".format(bow, freq))

    if compareAggOpt['extractor'] is not None:
        # Use summary and description (concatenated) to address this problem
        logger.info("Using Summary and Description information.")
        # Loading Filters
        extractorFilters = loadFilters(compareAggOpt['extractor']['filters'])

        arguments = (
            databasePath, compareAggOpt['word_embedding'], str(compareAggOpt['lexicon']),
            ' '.join(sorted([fil.__class__.__name__ for fil in extractorFilters])),
            compareAggOpt['tokenizer'], str(bow), str(freq), SABDEncoderPreprocessor.__name__)

        inputHandlers.append(SABDInputHandler(paddingId, minSeqSize))
        extractorCache = PreprocessingCache(cacheFolder, arguments)

        if bow:
            extractorPreprocessor = SABDBoWPreprocessor(
                lexicon, bugReportDatabase, extractorFilters, tokenizer, paddingId, freq, extractorCache)
        else:
            extractorPreprocessor = SABDEncoderPreprocessor(
                lexicon, bugReportDatabase, extractorFilters, tokenizer, paddingId, extractorCache)

        preprocessors.append(extractorPreprocessor)

    # Create model
    model = SABD(embedding, categoricalEncoder, compareAggOpt['extractor'],
                 compareAggOpt['matching'], compareAggOpt['aggregate'],
                 compareAggOpt['classifier'], freq)

    if args['loss'] == 'bce':
        logger.info("Using BCE Loss: margin={}".format(args['margin']))
        lossFn = BCELoss()
        lossNoReduction = BCELoss(reduction='none')
        cmp_collate = PairBugCollate(inputHandlers, torch.float32, unsqueeze_target=True)
    elif args['loss'] == 'triplet':
        logger.info("Using Triplet Loss: margin={}".format(args['margin']))
        lossFn = TripletLoss(args['margin'])
        lossNoReduction = TripletLoss(args['margin'], reduction='none')
        cmp_collate = TripletBugCollate(inputHandlers)

    model.to(device)

    if modelState:
        model.load_state_dict(modelState)

    """
    Loading the training and validation datasets. This also sets how the negative
    examples will be generated.
    """
    # load training
    if args.get('pairs_training'):
        negativePairGenOpt = args.get('neg_pair_generator', )
        trainingFile = args.get('pairs_training')

        offlineGeneration = not (negativePairGenOpt is None or negativePairGenOpt['type'] == 'none')
        masterIdByBugId = bugReportDatabase.getMasterIdByBugId()
        randomAnchor = negativePairGenOpt['random_anchor']

        if not offlineGeneration:
            logger.info("Not generating the negative examples dynamically.")
            negativePairGenerator = None
        else:
            pairGenType = negativePairGenOpt['type']

            if pairGenType == 'random':
                logger.info("Random Negative Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = RandomGenerator(
                    preprocessors, cmp_collate, negativePairGenOpt['rate'], bugIds,
                    masterIdByBugId, randomAnchor=randomAnchor)

            elif pairGenType == 'non_negative':
                logger.info("Non Negative Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. "
                    "Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = NonNegativeRandomGenerator(
                    preprocessors, cmp_collate, negativePairGenOpt['rate'], bugIds,
                    masterIdByBugId, negativePairGenOpt['n_tries'], device, randomAnchor=randomAnchor)

            elif pairGenType == 'misc_non_zero':
                logger.info("Misc Non Zero Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = MiscNonZeroRandomGen(
                    preprocessors, cmp_collate, negativePairGenOpt['rate'], bugIds,
                    trainingDataset.duplicateIds, masterIdByBugId,
                    negativePairGenOpt['n_tries'], device, randomAnchor=randomAnchor)

            elif pairGenType == 'product_component':
                logger.info("Product Component Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = ProductComponentRandomGen(
                    bugReportDatabase, preprocessors, cmp_collate, negativePairGenOpt['rate'],
                    bugIds, masterIdByBugId, negativePairGenOpt['n_tries'], device,
                    randomAnchor=randomAnchor)

            elif pairGenType == 'random_k':
                logger.info("Random K Negative Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = KRandomGenerator(
                    preprocessors, cmp_collate, negativePairGenOpt['rate'], bugIds,
                    masterIdByBugId, negativePairGenOpt['k'], device, randomAnchor=randomAnchor)

            elif pairGenType == "pre":
                logger.info("Pre-selected list generator")

                negativePairGenerator = PreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'], preprocessors, negativePairGenOpt['rate'],
                    masterIdByBugId, negativePairGenOpt['preselected_length'], randomAnchor=randomAnchor)

            elif pairGenType == "positive_pre":
                logger.info("Positive Pre-selected list generator")

                negativePairGenerator = PositivePreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'], preprocessors, cmp_collate,
                    negativePairGenOpt['rate'], masterIdByBugId,
                    negativePairGenOpt['preselected_length'], randomAnchor=randomAnchor)

            elif pairGenType == "misc_non_zero_pre":
                logger.info("Misc: non-zero and Pre-selected list generator")

                negativePairGenerator1 = PreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'], preprocessors, negativePairGenOpt['rate'],
                    masterIdByBugId, negativePairGenOpt['preselected_length'], randomAnchor=randomAnchor)

                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                negativePairGenerator2 = NonNegativeRandomGenerator(
                    preprocessors, cmp_collate, negativePairGenOpt['rate'], bugIds,
                    masterIdByBugId, negativePairGenOpt['n_tries'], device, randomAnchor=randomAnchor)

                negativePairGenerator = MiscOfflineGenerator(
                    (negativePairGenerator1, negativePairGenerator2))

            elif pairGenType == "misc_non_zero_positive_pre":
                logger.info("Misc: non-zero and Positive Pre-selected list generator")

                negativePairGenerator1 = PositivePreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'], preprocessors, cmp_collate,
                    negativePairGenOpt['rate'], masterIdByBugId,
                    negativePairGenOpt['preselected_length'], randomAnchor=randomAnchor)
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                negativePairGenerator2 = NonNegativeRandomGenerator(
                    preprocessors, cmp_collate, negativePairGenOpt['rate'], bugIds,
                    masterIdByBugId, negativePairGenOpt['n_tries'], device, randomAnchor=randomAnchor)

                negativePairGenerator = MiscOfflineGenerator(
                    (negativePairGenerator1, negativePairGenerator2))
            else:
                raise ArgumentError(
                    "Offline generator is invalid (%s). You should choose one of these: random, hard and pre"
                    % pairGenType)

        if isinstance(lossFn, BCELoss):
            training_reader = PairBugDatasetReader(
                trainingFile, preprocessors, negativePairGenerator, randomInvertPair=args['random_switch'])
        elif isinstance(lossFn, TripletLoss):
            training_reader = TripletBugDatasetReader(
                trainingFile, preprocessors, negativePairGenerator, randomInvertPair=args['random_switch'])

        trainingLoader = DataLoader(training_reader, batch_size=batchSize,
                                    collate_fn=cmp_collate.collate, shuffle=True)
        logger.info("Training size: %s" % (len(trainingLoader.dataset)))

    # load validation
    if args.get('pairs_validation'):
        if isinstance(lossFn, BCELoss):
            validation_reader = PairBugDatasetReader(args.get('pairs_validation'), preprocessors)
        elif isinstance(lossFn, TripletLoss):
            validation_reader = TripletBugDatasetReader(args.get('pairs_validation'), preprocessors)

        validationLoader = DataLoader(validation_reader, batch_size=batchSize,
                                      collate_fn=cmp_collate.collate)

        logger.info("Validation size: %s" % (len(validationLoader.dataset)))
    else:
        validationLoader = None

    """
    Training and evaluating the model.
    """
    optimizer_opt = args.get('optimizer', 'adam')

    if optimizer_opt == 'sgd':
        logger.info('SGD')
        optimizer = optim.SGD(model.parameters(), lr=args['lr'], weight_decay=args['l2'])
    elif optimizer_opt == 'adam':
        logger.info('Adam')
        optimizer = optim.Adam(model.parameters(), lr=args['lr'], weight_decay=args['l2'])

    # Recall rate
    rankingScorer = GeneralScorer(
        model, preprocessors, device, PairBugCollate(inputHandlers, ignore_target=True),
        args['ranking_batch_size'], args['ranking_n_workers'])

    recallEstimationTrainOpt = args.get('recall_estimation_train')

    if recallEstimationTrainOpt:
        preselectListRankingTrain = PreselectListRanking(recallEstimationTrainOpt, args['sample_size_rr_tr'])

    recallEstimationOpt = args.get('recall_estimation')

    if recallEstimationOpt:
        preselectListRanking = PreselectListRanking(recallEstimationOpt, args['sample_size_rr_val'])

    # LR scheduler
    lrSchedulerOpt = args.get('lr_scheduler', None)

    if lrSchedulerOpt is None:
        logger.info("Scheduler: Constant")
        lrSched = None
    elif lrSchedulerOpt["type"] == 'step':
        logger.info("Scheduler: StepLR (step:%s, decay:%f)" % (lrSchedulerOpt["step_size"], args["decay"]))
        lrSched = StepLR(optimizer, lrSchedulerOpt["step_size"], lrSchedulerOpt["decay"])
    elif lrSchedulerOpt["type"] == 'exp':
        logger.info("Scheduler: ExponentialLR (decay:%f)" % (lrSchedulerOpt["decay"]))
        lrSched = ExponentialLR(optimizer, lrSchedulerOpt["decay"])
    elif lrSchedulerOpt["type"] == 'linear':
        logger.info("Scheduler: Divide by (1 + epoch * decay) ---- (decay:%f)" % (lrSchedulerOpt["decay"]))
        lrDecay = lrSchedulerOpt["decay"]
        lrSched = LambdaLR(optimizer, lambda epoch: 1 / (1.0 + epoch * lrDecay))
    else:
        raise ArgumentError(
            "LR Scheduler is invalid (%s). "
            "You should choose one of these: step, exp and linear"
            % lrSchedulerOpt["type"])  # report the scheduler type, not the pair generator type

    # Set training functions
    def trainingIteration(engine, batch):
        engine.kk = 0
        model.train()
        optimizer.zero_grad()
        x, y = cmp_collate.to(batch, device)
        output = model(*x)
        loss = lossFn(output, y)
        loss.backward()
        optimizer.step()
        return loss, output, y

    def scoreDistanceTrans(output):
        if len(output) == 3:
            _, y_pred, y = output
        else:
            y_pred, y = output

        if lossFn == F.nll_loss:
            return torch.exp(y_pred[:, 1]), y
        elif isinstance(lossFn, BCELoss):
            return y_pred, y

    trainer = Engine(trainingIteration)
    trainingMetrics = {'training_loss': AverageLoss(lossFn)}

    if isinstance(lossFn, BCELoss):
        trainingMetrics['training_dist_target'] = MeanScoreDistance(output_transform=scoreDistanceTrans)
        trainingMetrics['training_acc'] = AccuracyWrapper(output_transform=thresholded_output_transform)
        trainingMetrics['training_precision'] = PrecisionWrapper(output_transform=thresholded_output_transform)
        trainingMetrics['training_recall'] = RecallWrapper(output_transform=thresholded_output_transform)

    # Add metrics to trainer
    for name, metric in trainingMetrics.items():
        metric.attach(trainer, name)

    # Set validation functions
    def validationIteration(engine, batch):
        if not hasattr(engine, 'kk'):
            engine.kk = 0

        model.eval()
        with torch.no_grad():
            x, y = cmp_collate.to(batch, device)
            y_pred = model(*x)
            return y_pred, y

    validationMetrics = {
        'validation_loss': LossWrapper(
            lossFn, output_transform=lambda x: (x[0], x[0][0]) if x[1] is None else x)
    }

    if isinstance(lossFn, BCELoss):
        validationMetrics['validation_dist_target'] = MeanScoreDistance(output_transform=scoreDistanceTrans)
        validationMetrics['validation_acc'] = AccuracyWrapper(output_transform=thresholded_output_transform)
        validationMetrics['validation_precision'] = PrecisionWrapper(output_transform=thresholded_output_transform)
        validationMetrics['validation_recall'] = RecallWrapper(output_transform=thresholded_output_transform)

    evaluator = Engine(validationIteration)

    # Add metrics to evaluator
    for name, metric in validationMetrics.items():
        metric.attach(evaluator, name)

    # recommendation
    recommendation_fn = generateRecommendationList

    @trainer.on(Events.EPOCH_STARTED)
    def onStartEpoch(engine):
        epoch = engine.state.epoch
        logger.info("Epoch: %d" % epoch)

        if lrSched:
            lrSched.step()

        logger.info("LR: %s" % str(optimizer.param_groups[0]["lr"]))

    @trainer.on(Events.EPOCH_COMPLETED)
    def onEndEpoch(engine):
        epoch = engine.state.epoch

        logMetrics(_run, logger, engine.state.metrics, epoch)

        # Evaluate Training
        if validationLoader:
            evaluator.run(validationLoader)
            logMetrics(_run, logger, evaluator.state.metrics, epoch)

        lastEpoch = args['epochs'] - epoch == 0

        if recallEstimationTrainOpt and (epoch % args['rr_train_epoch'] == 0):
            logRankingResult(_run, logger, preselectListRankingTrain, rankingScorer, bugReportDatabase,
                             None, epoch, "train", recommendationListfn=recommendation_fn)
            rankingScorer.free()

        if recallEstimationOpt and (epoch % args['rr_val_epoch'] == 0):
            logRankingResult(_run, logger, preselectListRanking, rankingScorer, bugReportDatabase,
                             args.get("ranking_result_file"), epoch, "validation",
                             recommendationListfn=recommendation_fn)
            rankingScorer.free()

        if not lastEpoch:
            training_reader.sampleNewNegExamples(model, lossNoReduction)

        if args.get('save'):
            save_by_epoch = args['save_by_epoch']

            if save_by_epoch and epoch in save_by_epoch:
                file_name, file_extension = os.path.splitext(args['save'])
                file_path = file_name + '_epoch_{}'.format(epoch) + file_extension
            else:
                file_path = args['save']

            modelInfo = {
                'model': model.state_dict(),
                'params': parametersToSave
            }

            logger.info("==> Saving Model: %s" % file_path)
            torch.save(modelInfo, file_path)

    if args.get('pairs_training'):
        trainer.run(trainingLoader, max_epochs=args['epochs'])
    elif args.get('pairs_validation'):
        # Evaluate Training
        evaluator.run(validationLoader)
        logMetrics(_run, logger, evaluator.state.metrics, 0)

        if recallEstimationOpt:
            logRankingResult(_run, logger, preselectListRanking, rankingScorer, bugReportDatabase,
                             args.get("ranking_result_file"), 0, "validation",
                             recommendationListfn=recommendation_fn)

    # Test Dataset (accuracy, recall, precision, F1)
    pair_test_dataset = args.get('pair_test_dataset')

    if pair_test_dataset is not None and len(pair_test_dataset) > 0:
        pairTestReader = PairBugDatasetReader(pair_test_dataset, preprocessors)
        testLoader = DataLoader(pairTestReader, batch_size=batchSize, collate_fn=cmp_collate.collate)

        if not isinstance(cmp_collate, PairBugCollate):
            raise NotImplementedError('Evaluation of pairs using tanh was not implemented yet')

        logger.info("Test size: %s" % (len(testLoader.dataset)))

        testMetrics = {
            'test_accuracy': ignite.metrics.Accuracy(output_transform=thresholded_output_transform),
            'test_precision': ignite.metrics.Precision(output_transform=thresholded_output_transform),
            'test_recall': ignite.metrics.Recall(output_transform=thresholded_output_transform),
            'test_predictions': PredictionCache(),
        }
        test_evaluator = Engine(validationIteration)

        # Add metrics to evaluator
        for name, metric in testMetrics.items():
            metric.attach(test_evaluator, name)

        test_evaluator.run(testLoader)

        for metricName, metricValue in test_evaluator.state.metrics.items():
            metric = testMetrics[metricName]

            if isinstance(metric, ignite.metrics.Accuracy):
                logger.info({
                    'type': 'metric', 'label': metricName, 'value': metricValue, 'epoch': None,
                    'correct': metric._num_correct, 'total': metric._num_examples
                })
                _run.log_scalar(metricName, metricValue)
            elif isinstance(metric, (ignite.metrics.Precision, ignite.metrics.Recall)):
                logger.info({
                    'type': 'metric', 'label': metricName, 'value': metricValue, 'epoch': None,
                    'tp': metric._true_positives.item(), 'total_positive': metric._positives.item()
                })
                _run.log_scalar(metricName, metricValue)
            elif isinstance(metric, ConfusionMatrix):
                acc = cmAccuracy(metricValue)
                prec = cmPrecision(metricValue, False)
                recall = cmRecall(metricValue, False)
                f1 = 2 * (prec * recall) / (prec + recall + 1e-15)

                logger.info({
                    'type': 'metric', 'label': metricName,
                    'accuracy': float(acc),
                    'precision': prec.cpu().numpy().tolist(),
                    'recall': recall.cpu().numpy().tolist(),
                    'f1': f1.cpu().numpy().tolist(),
                    'confusion_matrix': metricValue.cpu().numpy().tolist(),
                    'epoch': None
                })
                _run.log_scalar('test_f1', f1[1])
            elif isinstance(metric, PredictionCache):
                logger.info({
                    'type': 'metric', 'label': metricName, 'predictions': metric.predictions
                })

    # Calculate recall rate
    recallRateOpt = args.get('recall_rate', {'type': 'none'})
    if recallRateOpt['type'] != 'none':
        if recallRateOpt['type'] == 'sun2011':
            logger.info("Calculating recall rate: {}".format(recallRateOpt['type']))

            recallRateDataset = BugDataset(recallRateOpt['dataset'])
            rankingClass = SunRanking(bugReportDatabase, recallRateDataset, recallRateOpt['window'])
            # In the Sun 2011 methodology, the results are always grouped by the master report.
            group_by_master = True
        elif recallRateOpt['type'] == 'deshmukh':
            logger.info("Calculating recall rate: {}".format(recallRateOpt['type']))

            recallRateDataset = BugDataset(recallRateOpt['dataset'])
            rankingClass = DeshmukhRanking(bugReportDatabase, recallRateDataset)
            group_by_master = recallRateOpt['group_by_master']
        else:
            raise ArgumentError(
                "recall_rate.type is invalid (%s). You should choose one of these: sun2011 and deshmukh"
                % recallRateOpt['type'])

        logRankingResult(_run, logger, rankingClass, rankingScorer, bugReportDatabase,
                         recallRateOpt["result_file"], 0, None, group_by_master,
                         recommendationListfn=recommendation_fn)
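def _load_checkpoint_sketch(path, cuda=False):
    # Sketch (not part of the original source): how a checkpoint written by the training
    # loop above can be reloaded. torch.save stores a dict with the 'model' state_dict
    # and the 'params' listed in importantParameters, which is exactly what the
    # args['load'] branch at the top of main() unpacks.
    import torch
    map_location = (lambda storage, loc: storage.cuda()) if cuda else 'cpu'
    info = torch.load(path, map_location=map_location)
    return info['model'], info['params']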
parser.add_argument("-test", '--test', dest='test', help="") parser.add_argument("--result_file", help="") parser.add_argument("--window", type=int, help="") logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S') logger = logging.getLogger() args = parser.parse_args() logger.info(args) training = True bugReportDatabase = BugReportDatabase.fromJson(args.db) testDataset = BugDataset(args.test) preprocessing = DBRDPreprocessing() if args.dt is not None: trainingDataset = BugDataset(args.dt) np.random.seed(args.seed) random.seed(args.seed) data_model = [] data_alpha = [] # Split the dataset into two sets: the first set is used to train the model and # the second is used to tune the ensemble weights duplicate_reports = trainingDataset.duplicateIds
out_file.write("A-T={}\n".format(format_tf_to_text(bug['total_tri']))) out_file.write("DID={}\n".format('' if len(bug['dup_id']) == 0 else bug['dup_id'])) out_file.write("VERSION={}\n".format(version_dict[bug['version']])) out_file.write("COMPONENT={}\n".format(product_dict[bug['product']])) out_file.write("SUB-COMPONENT={}\n".format( component_dict[bug['component']])) out_file.write("TYPE={}\n".format(type_dict[bug['bug_severity']])) out_file.write("PRIORITY={}\n".format(priority_dict[bug['priority']])) if __name__ == '__main__': parser = argparse.ArgumentParser(description='') parser.add_argument('--database', required=True, help="") parser.add_argument('--test', required=True, help="") parser.add_argument('--output', required=True, help="") logging.basicConfig(level=logging.DEBUG, datefmt='%Y-%m-%d %H:%M:%S') logger = logging.getLogger() args = parser.parse_args() logger.info(args) output_path = args.output database = BugReportDatabase.fromJson(args.database) test = BugDataset(args.test) max_bug_id = max(map(lambda bug_id: int(bug_id), test.bugIds)) generate_input(database, max_bug_id, output_path)
def main(_run, _config, _seed, _log):
    """
    :param _run:
    :param _config:
    :param _seed:
    :param _log:
    :return:
    """
    """
    Setting and loading parameters
    """
    # Setting logger
    args = _config
    logger = _log
    logger.info(args)
    logger.info('It started at: %s' % datetime.now())

    torch.manual_seed(_seed)

    bugReportDatabase = BugReportDatabase.fromJson(args['bug_database'])
    paddingSym = "</s>"
    batchSize = args['batch_size']

    device = torch.device('cuda' if args['cuda'] else "cpu")

    if args['cuda']:
        logger.info("Turning CUDA on")
    else:
        logger.info("Turning CUDA off")

    # Folder where the preprocessed information will be cached.
    cacheFolder = args['cache_folder']

    # Parameters that are saved with the model and restored when a model is loaded.
    importantParameters = ['summary', 'description', 'sum_desc', 'scorer', 'categorical']
    parametersToSave = dict([(parName, args[parName]) for parName in importantParameters])

    if args['load'] is not None:
        mapLocation = (lambda storage, loc: storage.cuda()) if args['cuda'] else 'cpu'
        modelInfo = torch.load(args['load'], map_location=mapLocation)
        modelState = modelInfo['model']

        for paramName, paramValue in modelInfo['params'].items():
            args[paramName] = paramValue
    else:
        modelState = None

    """
    Set the preprocessors that transform the raw information of the bug reports.
    Each type of information has a specific encoder (NN), preprocessor and input handler.
    """
    preprocessors = PreprocessorList()
    encoders = []
    inputHandlers = []

    sum_desc_opts = args['sum_desc']
    databasePath = args['bug_database']

    if sum_desc_opts is not None:
        processSumDescParam(sum_desc_opts, bugReportDatabase, inputHandlers, preprocessors,
                            encoders, cacheFolder, databasePath, logger, paddingSym)

    sumOpts = args.get("summary")

    if sumOpts is not None:
        processSumParam(sumOpts, bugReportDatabase, inputHandlers, preprocessors, encoders,
                        databasePath, cacheFolder, logger, paddingSym)

    descOpts = args.get("description")

    if descOpts is not None:
        processDescriptionParam(descOpts, bugReportDatabase, inputHandlers, preprocessors,
                                encoders, databasePath, cacheFolder, logger, paddingSym)

    categoricalOpt = args.get('categorical')

    if categoricalOpt is not None and len(categoricalOpt) != 0:
        processCategoricalParam(categoricalOpt, bugReportDatabase, inputHandlers,
                                preprocessors, encoders, logger)

    """
    Set the final scorer and the loss. Load the scorer if this argument was set.
    """
    scorerOpts = args['scorer']
    scorerType = scorerOpts['type']

    if scorerType == 'binary':
        pass
        # withoutBugEmbedding = scorerOpts.get('without_embedding', False)
        # batchNorm = scorerOpts.get('batch_normalization', True)
        # hiddenSizes = scorerOpts.get('hidden_sizes', [100])
        # model = ProbabilityPairNN(encoders, withoutBugEmbedding, hiddenSizes, batchNorm)
        # lossFn = BCELoss()
        # lossNoReduction = BCELoss(reduction='none')
        #
        # logger.info("Using BCELoss")
    elif scorerType == 'cosine':
        model = CosineTripletNN(encoders, scorerOpts['dropout'])
        margin = scorerOpts.get('margin', 0.0)

        if (categoricalOpt is not None and categoricalOpt.get('bn_last_layer', False)) or (
                sum_desc_opts is not None and sum_desc_opts.get('bn_last_layer', False)) or (
                sumOpts is not None and sumOpts.get('bn_last_layer')):
            raise Exception('You are applying batch normalization in the bug embedding.')

        lossFn = TripletLoss(margin)
        lossNoReduction = TripletLoss(margin, reduction='none')
        logger.info("Using Cosine Embedding Loss: margin={}".format(margin))

    model.to(device)

    if modelState:
        model.load_state_dict(modelState)

    """
    Loading the training and validation datasets.
    This also sets how the negative examples will be generated.
    """
    tripletCollate = TripletBugCollate(inputHandlers)

    # load training
    if args.get('triplets_training'):
        negativePairGenOpt = args.get('neg_pair_generator', )
        tripletTrainingFile = args.get('triplets_training')

        offlineGeneration = not (negativePairGenOpt is None or negativePairGenOpt['type'] == 'none')

        if not offlineGeneration:
            logger.info("Not generating the negative examples dynamically.")
            tripletTrainingReader = TripletBugDatasetReader(tripletTrainingFile, preprocessors)
        else:
            pairGenType = negativePairGenOpt['type']
            masterIdByBugId = bugReportDatabase.getMasterIdByBugId()

            if pairGenType == 'random':
                logger.info("Random Negative Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = RandomGenerator(
                    preprocessors, tripletCollate, negativePairGenOpt['rate'], bugIds, masterIdByBugId)

            elif pairGenType == 'non_negative':
                logger.info("Non Negative Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = NonNegativeRandomGenerator(
                    preprocessors, tripletCollate, negativePairGenOpt['rate'], bugIds,
                    masterIdByBugId, negativePairGenOpt['n_tries'], device,
                    decimals=negativePairGenOpt['decimals'])

            elif pairGenType == 'random_k':
                logger.info("Random K Negative Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = KRandomGenerator(
                    preprocessors, tripletCollate, negativePairGenOpt['rate'], bugIds,
                    masterIdByBugId, negativePairGenOpt['k'], device)

            elif pairGenType == "pre":
                logger.info("Pre-selected list generator")

                negativePairGenerator = PreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'], preprocessors, negativePairGenOpt['rate'],
                    masterIdByBugId, negativePairGenOpt['preselected_length'])

            elif pairGenType == "misc_non_zero_pre":
                logger.info("Misc: non-zero and Pre-selected list generator")

                negativePairGenerator1 = PreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'], preprocessors, negativePairGenOpt['rate'],
                    masterIdByBugId, negativePairGenOpt['preselected_length'])

                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                negativePairGenerator2 = NonNegativeRandomGenerator(
                    preprocessors, tripletCollate, negativePairGenOpt['rate'], bugIds,
                    masterIdByBugId, negativePairGenOpt['n_tries'], device)

                negativePairGenerator = MiscOfflineGenerator(
                    (negativePairGenerator1, negativePairGenerator2))
            else:
                raise ArgumentError(
                    "Offline generator is invalid (%s). "
                    "You should choose one of these: random, hard and pre"
                    % pairGenType)

            tripletTrainingReader = TripletBugDatasetReader(
                tripletTrainingFile, preprocessors, negativePairGenerator)

        trainingLoader = DataLoader(tripletTrainingReader, batch_size=batchSize,
                                    collate_fn=tripletCollate.collate, shuffle=True)
        logger.info("Training size: %s" % (len(trainingLoader.dataset)))

    # load validation
    if args.get('triplets_validation'):
        tripletValidationReader = TripletBugDatasetReader(
            args.get('triplets_validation'), preprocessors)
        validationLoader = DataLoader(tripletValidationReader, batch_size=batchSize,
                                      collate_fn=tripletCollate.collate)

        logger.info("Validation size: %s" % (len(validationLoader.dataset)))
    else:
        validationLoader = None

    """
    Training and evaluating the model.
    """
    optimizer_opt = args.get('optimizer', 'adam')

    if optimizer_opt == 'sgd':
        logger.info('SGD')
        optimizer = optim.SGD(model.parameters(), lr=args['lr'], weight_decay=args['l2'])
    elif optimizer_opt == 'adam':
        logger.info('Adam')
        optimizer = optim.Adam(model.parameters(), lr=args['lr'], weight_decay=args['l2'])

    # Recall rate
    rankingScorer = SharedEncoderNNScorer(preprocessors, inputHandlers, model, device,
                                          batchSize=args['ranking_batch_size'])
    recallEstimationTrainOpt = args.get('recall_estimation_train')

    if recallEstimationTrainOpt:
        preselectListRankingTrain = PreselectListRanking(recallEstimationTrainOpt)

    recallEstimationOpt = args.get('recall_estimation')

    if recallEstimationOpt:
        preselectListRanking = PreselectListRanking(recallEstimationOpt)

    # LR scheduler
    lrSchedulerOpt = args.get('lr_scheduler', None)

    if lrSchedulerOpt is None:
        logger.info("Scheduler: Constant")
        lrSched = None
    elif lrSchedulerOpt["type"] == 'step':
        logger.info("Scheduler: StepLR (step:%s, decay:%f)" % (lrSchedulerOpt["step_size"], args["decay"]))
        lrSched = StepLR(optimizer, lrSchedulerOpt["step_size"], lrSchedulerOpt["decay"])
    elif lrSchedulerOpt["type"] == 'exp':
        logger.info("Scheduler: ExponentialLR (decay:%f)" % (lrSchedulerOpt["decay"]))
        lrSched = ExponentialLR(optimizer, lrSchedulerOpt["decay"])
    elif lrSchedulerOpt["type"] == 'linear':
        logger.info("Scheduler: Divide by (1 + epoch * decay) ---- (decay:%f)" % (lrSchedulerOpt["decay"]))
        lrDecay = lrSchedulerOpt["decay"]
        lrSched = LambdaLR(optimizer, lambda epoch: 1 / (1.0 + epoch * lrDecay))
    else:
        raise ArgumentError(
            "LR Scheduler is invalid (%s). "
            "You should choose one of these: step, exp and linear"
            % lrSchedulerOpt["type"])  # report the scheduler type, not the pair generator type

    # Set training functions
    def trainingIteration(engine, batch):
        model.train()
        optimizer.zero_grad()
        x, y = tripletCollate.to(batch, device)
        output = model(*x)
        loss = lossFn(output, y)
        loss.backward()
        optimizer.step()
        return loss, output

    trainer = Engine(trainingIteration)
    trainingMetrics = {
        'training_loss': AverageLoss(lossFn, batch_size=lambda x: x.shape[0])
    }

    # Add metrics to trainer
    for name, metric in trainingMetrics.items():
        metric.attach(trainer, name)

    # Set validation functions
    def validationIteration(engine, batch):
        model.eval()
        with torch.no_grad():
            x, y = tripletCollate.to(batch, device)
            y_pred = model(*x)
            # The triplet loss has no explicit target, so the prediction is returned twice.
            return y_pred, y_pred

    validationMetrics = {
        'validation_loss': LossWrapper(lossFn, batch_size=lambda x: x.shape[0])
    }

    evaluator = Engine(validationIteration)

    # Add metrics to evaluator
    for name, metric in validationMetrics.items():
        metric.attach(evaluator, name)

    @trainer.on(Events.EPOCH_STARTED)
    def onStartEpoch(engine):
        epoch = engine.state.epoch
        logger.info("Epoch: %d" % epoch)

        if lrSched:
            lrSched.step()

        logger.info("LR: %s" % str(optimizer.param_groups[0]["lr"]))

    @trainer.on(Events.EPOCH_COMPLETED)
    def onEndEpoch(engine):
        epoch = engine.state.epoch

        logMetrics(_run, logger, engine.state.metrics, epoch)

        # Evaluate Training
        if validationLoader:
            evaluator.run(validationLoader)
            logMetrics(_run, logger, evaluator.state.metrics, epoch)

        if recallEstimationTrainOpt and (epoch % args['rr_train_epoch'] == 0):
            logRankingResult(_run, logger, preselectListRankingTrain, rankingScorer,
                             bugReportDatabase, None, epoch, "train")

        if recallEstimationOpt and (epoch % args['rr_val_epoch'] == 0):
            logRankingResult(_run, logger, preselectListRanking, rankingScorer, bugReportDatabase,
                             args.get("ranking_result_file"), epoch, "validation")

        if offlineGeneration:
            tripletTrainingReader.sampleNewNegExamples(model, lossNoReduction)

        if args.get('save'):
            modelInfo = {
                'model': model.state_dict(),
                'params': parametersToSave
            }

            logger.info("==> Saving Model: %s" % args['save'])
            torch.save(modelInfo, args['save'])

    if args.get('triplets_training'):
        trainer.run(trainingLoader, max_epochs=args['epochs'])
    elif args.get('triplets_validation'):
        # Evaluate on the validation set only
        evaluator.run(validationLoader)
        logMetrics(_run, logger, evaluator.state.metrics, 0)

        if recallEstimationOpt:
            logRankingResult(_run, logger, preselectListRanking, rankingScorer, bugReportDatabase,
                             args.get("ranking_result_file"), 0, "validation")

    recallRateOpt = args.get('recall_rate', {'type': 'none'})
    if recallRateOpt['type'] != 'none':
        if recallRateOpt['type'] == 'sun2011':
            logger.info("Calculating recall rate: {}".format(recallRateOpt['type']))

            recallRateDataset = BugDataset(recallRateOpt['dataset'])
            rankingClass = SunRanking(bugReportDatabase, recallRateDataset, recallRateOpt['window'])
            # In the Sun 2011 methodology, the results are always grouped by the master report.
            group_by_master = True
        elif recallRateOpt['type'] == 'deshmukh':
            logger.info("Calculating recall rate: {}".format(recallRateOpt['type']))

            recallRateDataset = BugDataset(recallRateOpt['dataset'])
            rankingClass = DeshmukhRanking(bugReportDatabase, recallRateDataset)
            group_by_master = recallRateOpt['group_by_master']
        else:
            raise ArgumentError(
                "recall_rate.type is invalid (%s). You should choose one of these: sun2011 and deshmukh"
                % recallRateOpt['type'])

        logRankingResult(_run, logger, rankingClass, rankingScorer, bugReportDatabase,
                         recallRateOpt["result_file"], 0, None, group_by_master)
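def _linear_decay_sketch(base_lr=1e-3, decay=0.5, epochs=4):
    # Sketch (not part of the original source): the effective learning rate produced by
    # the 'linear' scheduler above, LambdaLR(optimizer, lambda epoch: 1 / (1.0 + epoch * decay)).
    # For base_lr=1e-3 and decay=0.5 this yields 1e-3, ~6.7e-4, 5e-4, 4e-4, ...
    return [base_lr / (1.0 + epoch * decay) for epoch in range(epochs)]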
trainingBugs = set()
triplets = []

if args.training:
    if args.is_pairs:
        logger.info("Reading training with pairs")

        # Each line contains a pair and its label: "bugId1,bugId2,label"
        with open(args.training, 'r') as f:
            for l in f:
                bugId1, bugId2, label = l.strip().split(',')

                trainingBugs.add(bugId1)
                trainingBugs.add(bugId2)
    else:
        logger.info("Reading training")
        bugDataset = BugDataset(args.training)
        trainingBugs.update(bugDataset.bugIds)

logger.info("Preprocessing and fitting data")

trainingText = []

for bugId in trainingBugs:
    bugReport = bugReportDataset.getBug(bugId)
    text = concatenateSummaryAndDescription(bugReport)
    trainingText.append(text)

if args.load:
    logger.info('Loading object')
    vectorizer = pickle.load(open(args.load, 'rb'))
else:
def main(_run, _config, _seed, _log):
    # Setting logger
    args = _config
    logger = _log
    logger.info(args)
    logger.info('It started at: %s' % datetime.now())

    torch.manual_seed(_seed)

    device = torch.device('cuda' if args['cuda'] else "cpu")

    if args['cuda']:
        logger.info("Turning CUDA on")
    else:
        logger.info("Turning CUDA off")

    # Parameters that are saved with the model and restored when a model is loaded.
    important_parameters = ['dbr_cnn']
    parameters_to_save = dict([(name, args[name]) for name in important_parameters])

    if args['load'] is not None:
        map_location = (lambda storage, loc: storage.cuda()) if args['cuda'] else 'cpu'
        model_info = torch.load(args['load'], map_location=map_location)
        model_state = model_info['model']

        for param_name, param_value in model_info['params'].items():
            args[param_name] = param_value
    else:
        model_state = None

    # Set basic variables
    preprocessors = PreprocessorList()
    input_handlers = []
    report_database = BugReportDatabase.fromJson(args['bug_database'])
    batchSize = args['batch_size']
    dbr_cnn_opt = args['dbr_cnn']

    # Loading word embedding and lexicon
    emb = np.load(dbr_cnn_opt["word_embedding"])
    padding_sym = "</s>"

    lexicon = Lexicon(unknownSymbol=None)
    with codecs.open(dbr_cnn_opt["lexicon"]) as f:
        for l in f:
            lexicon.put(l.strip())

    lexicon.setUnknown("UUUKNNN")
    padding_id = lexicon.getLexiconIndex(padding_sym)
    embedding = Embedding(lexicon, emb, paddingIdx=padding_id)

    logger.info("Lexicon size: %d" % (lexicon.getLen()))
    logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize()))

    # Load filters and tokenizer
    filters = loadFilters(dbr_cnn_opt['filters'])

    if dbr_cnn_opt['tokenizer'] == 'default':
        logger.info("Using the default tokenizer to tokenize summary information")
        tokenizer = MultiLineTokenizer()
    elif dbr_cnn_opt['tokenizer'] == 'white_space':
        logger.info("Using the white space tokenizer to tokenize summary information")
        tokenizer = WhitespaceTokenizer()
    else:
        raise ArgumentError(
            "Tokenizer value %s is invalid. You should choose one of these: default and white_space"
            % dbr_cnn_opt['tokenizer'])

    # Add preprocessors
    preprocessors.append(
        DBR_CNN_CategoricalPreprocessor(dbr_cnn_opt['categorical_lexicon'], report_database))
    preprocessors.append(
        SummaryDescriptionPreprocessor(lexicon, report_database, filters, tokenizer, padding_id))

    # Add input_handlers
    input_handlers.append(DBRDCNN_CategoricalInputHandler())
    input_handlers.append(TextCNNInputHandler(padding_id, min(dbr_cnn_opt["window"])))

    # Create Model
    model = DBR_CNN(embedding, dbr_cnn_opt["window"], dbr_cnn_opt["nfilters"],
                    dbr_cnn_opt['update_embedding'])

    model.to(device)

    if model_state:
        model.load_state_dict(model_state)

    # Set loss function
    logger.info("Using BCE Loss")
    loss_fn = BCELoss()
    loss_no_reduction = BCELoss(reduction='none')
    cmp_collate = PairBugCollate(input_handlers, torch.float32, unsqueeze_target=True)

    # Loading the training data and setting how the negative examples will be generated.
    if args.get('pairs_training'):
        negative_pair_gen_opt = args.get('neg_pair_generator', )
        pairsTrainingFile = args.get('pairs_training')

        offlineGeneration = not (negative_pair_gen_opt is None or negative_pair_gen_opt['type'] == 'none')

        if not offlineGeneration:
            logger.info("Not generating the negative examples dynamically.")
            pair_training_reader = PairBugDatasetReader(
                pairsTrainingFile, preprocessors, randomInvertPair=args['random_switch'])
        else:
            pair_gen_type = negative_pair_gen_opt['type']
            master_id_by_bug_id = report_database.getMasterIdByBugId()
            # Only read this option when a generator is configured; otherwise
            # negative_pair_gen_opt can be None.
            random_anchor = negative_pair_gen_opt['random_anchor']

            if pair_gen_type == 'random':
                logger.info("Random Negative Pair Generator")
                training_dataset = BugDataset(negative_pair_gen_opt['training'])
                bug_ids = training_dataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (training_dataset.info, len(bug_ids)))

                negative_pair_generator = RandomGenerator(
                    preprocessors, cmp_collate, negative_pair_gen_opt['rate'], bug_ids, master_id_by_bug_id)

            elif pair_gen_type == 'non_negative':
                logger.info("Non Negative Pair Generator")
                training_dataset = BugDataset(negative_pair_gen_opt['training'])
                bug_ids = training_dataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (training_dataset.info, len(bug_ids)))

                negative_pair_generator = NonNegativeRandomGenerator(
                    preprocessors, cmp_collate, negative_pair_gen_opt['rate'], bug_ids,
                    master_id_by_bug_id, negative_pair_gen_opt['n_tries'], device,
                    randomAnchor=random_anchor)

            elif pair_gen_type == 'misc_non_zero':
                logger.info("Misc Non Zero Pair Generator")
                training_dataset = BugDataset(negative_pair_gen_opt['training'])
                bug_ids = training_dataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (training_dataset.info, len(bug_ids)))

                negative_pair_generator = MiscNonZeroRandomGen(
                    preprocessors, cmp_collate, negative_pair_gen_opt['rate'], bug_ids,
                    training_dataset.duplicateIds, master_id_by_bug_id, device,
                    negative_pair_gen_opt['n_tries'], negative_pair_gen_opt['random_anchor'])

            elif pair_gen_type == 'random_k':
                logger.info("Random K Negative Pair Generator")
                training_dataset = BugDataset(negative_pair_gen_opt['training'])
                bug_ids = training_dataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. "
                    "Number of bugs in the training: %d"
                    % (training_dataset.info, len(bug_ids)))

                negative_pair_generator = KRandomGenerator(
                    preprocessors, cmp_collate, negative_pair_gen_opt['rate'], bug_ids,
                    master_id_by_bug_id, negative_pair_gen_opt['k'], device)

            elif pair_gen_type == "pre":
                logger.info("Pre-selected list generator")

                negative_pair_generator = PreSelectedGenerator(
                    negative_pair_gen_opt['pre_list_file'], preprocessors,
                    negative_pair_gen_opt['rate'], master_id_by_bug_id,
                    negative_pair_gen_opt['preselected_length'])

            elif pair_gen_type == "misc_non_zero_pre":
                logger.info("Misc: non-zero and Pre-selected list generator")

                negativePairGenerator1 = PreSelectedGenerator(
                    negative_pair_gen_opt['pre_list_file'], preprocessors,
                    negative_pair_gen_opt['rate'], master_id_by_bug_id,
                    negative_pair_gen_opt['preselected_length'])

                training_dataset = BugDataset(negative_pair_gen_opt['training'])
                bug_ids = training_dataset.bugIds

                negativePairGenerator2 = NonNegativeRandomGenerator(
                    preprocessors, cmp_collate, negative_pair_gen_opt['rate'], bug_ids,
                    master_id_by_bug_id, device, negative_pair_gen_opt['n_tries'])

                negative_pair_generator = MiscOfflineGenerator(
                    (negativePairGenerator1, negativePairGenerator2))
            else:
                raise ArgumentError(
                    "Offline generator is invalid (%s). You should choose one of these: random, hard and pre"
                    % pair_gen_type)

            pair_training_reader = PairBugDatasetReader(
                pairsTrainingFile, preprocessors, negative_pair_generator,
                randomInvertPair=args['random_switch'])

        training_loader = DataLoader(pair_training_reader, batch_size=batchSize,
                                     collate_fn=cmp_collate.collate, shuffle=True)
        logger.info("Training size: %s" % (len(training_loader.dataset)))

    # load validation
    if args.get('pairs_validation'):
        pair_validation_reader = PairBugDatasetReader(args.get('pairs_validation'), preprocessors)
        validation_loader = DataLoader(pair_validation_reader, batch_size=batchSize,
                                       collate_fn=cmp_collate.collate)

        logger.info("Validation size: %s" % (len(validation_loader.dataset)))
    else:
        validation_loader = None

    """
    Training and evaluating the model.
""" optimizer_opt = args.get('optimizer', 'adam') if optimizer_opt == 'sgd': logger.info('SGD') optimizer = optim.SGD(model.parameters(), lr=args['lr'], weight_decay=args['l2'], momentum=args['momentum']) elif optimizer_opt == 'adam': logger.info('Adam') optimizer = optim.Adam(model.parameters(), lr=args['lr'], weight_decay=args['l2']) # Recall rate ranking_scorer = DBR_CNN_Scorer(preprocessors[0], preprocessors[1], input_handlers[0], input_handlers[1], model, device, args['ranking_batch_size']) recallEstimationTrainOpt = args.get('recall_estimation_train') if recallEstimationTrainOpt: preselectListRankingTrain = PreselectListRanking( recallEstimationTrainOpt) recallEstimationOpt = args.get('recall_estimation') if recallEstimationOpt: preselect_list_ranking = PreselectListRanking(recallEstimationOpt) lr_scheduler_opt = args.get('lr_scheduler', None) if lr_scheduler_opt is None or lr_scheduler_opt['type'] == 'constant': logger.info("Scheduler: Constant") lr_sched = None elif lr_scheduler_opt["type"] == 'step': logger.info("Scheduler: StepLR (step:%s, decay:%f)" % (lr_scheduler_opt["step_size"], args["decay"])) lr_sched = StepLR(optimizer, lr_scheduler_opt["step_size"], lr_scheduler_opt["decay"]) elif lr_scheduler_opt["type"] == 'exp': logger.info("Scheduler: ExponentialLR (decay:%f)" % (lr_scheduler_opt["decay"])) lr_sched = ExponentialLR(optimizer, lr_scheduler_opt["decay"]) elif lr_scheduler_opt["type"] == 'linear': logger.info( "Scheduler: Divide by (1 + epoch * decay) ---- (decay:%f)" % (lr_scheduler_opt["decay"])) lrDecay = lr_scheduler_opt["decay"] lr_sched = LambdaLR(optimizer, lambda epoch: 1 / (1.0 + epoch * lrDecay)) else: raise ArgumentError( "LR Scheduler is invalid (%s). You should choose one of these: step, exp and linear " % pair_gen_type) # Set training functions def trainingIteration(engine, batch): model.train() optimizer.zero_grad() x, y = cmp_collate.to(batch, device) output = model(*x) loss = loss_fn(output, y) loss.backward() optimizer.step() return loss, output, y trainer = Engine(trainingIteration) negTarget = 0.0 if isinstance(loss_fn, NLLLoss) else -1.0 trainingMetrics = { 'training_loss': AverageLoss(loss_fn), 'training_acc': AccuracyWrapper(output_transform=thresholded_output_transform), 'training_precision': PrecisionWrapper(output_transform=thresholded_output_transform), 'training_recall': RecallWrapper(output_transform=thresholded_output_transform), } # Add metrics to trainer for name, metric in trainingMetrics.items(): metric.attach(trainer, name) # Set validation functions def validationIteration(engine, batch): model.eval() with torch.no_grad(): x, y = cmp_collate.to(batch, device) y_pred = model(*x) return y_pred, y validationMetrics = { 'validation_loss': LossWrapper(loss_fn), 'validation_acc': AccuracyWrapper(output_transform=thresholded_output_transform), 'validation_precision': PrecisionWrapper(output_transform=thresholded_output_transform), 'validation_recall': RecallWrapper(output_transform=thresholded_output_transform), } evaluator = Engine(validationIteration) # Add metrics to evaluator for name, metric in validationMetrics.items(): metric.attach(evaluator, name) @trainer.on(Events.EPOCH_STARTED) def onStartEpoch(engine): epoch = engine.state.epoch logger.info("Epoch: %d" % epoch) if lr_sched: lr_sched.step() logger.info("LR: %s" % str(optimizer.param_groups[0]["lr"])) @trainer.on(Events.EPOCH_COMPLETED) def onEndEpoch(engine): epoch = engine.state.epoch logMetrics(_run, logger, engine.state.metrics, epoch) # Evaluate Training if validation_loader: 
            evaluator.run(validation_loader)
            logMetrics(_run, logger, evaluator.state.metrics, epoch)

        lastEpoch = args['epochs'] - epoch == 0

        if recallEstimationTrainOpt and (epoch % args['rr_train_epoch'] == 0):
            logRankingResult(_run, logger, preselectListRankingTrain, ranking_scorer,
                             report_database, None, epoch, "train")
            ranking_scorer.free()

        if recallEstimationOpt and (epoch % args['rr_val_epoch'] == 0):
            logRankingResult(_run, logger, preselect_list_ranking, ranking_scorer, report_database,
                             args.get("ranking_result_file"), epoch, "validation")
            ranking_scorer.free()

        if not lastEpoch:
            pair_training_reader.sampleNewNegExamples(model, loss_no_reduction)

        if args.get('save'):
            save_by_epoch = args['save_by_epoch']

            if save_by_epoch and epoch in save_by_epoch:
                file_name, file_extension = os.path.splitext(args['save'])
                file_path = file_name + '_epoch_{}'.format(epoch) + file_extension
            else:
                file_path = args['save']

            modelInfo = {
                'model': model.state_dict(),
                'params': parameters_to_save
            }

            logger.info("==> Saving Model: %s" % file_path)
            torch.save(modelInfo, file_path)

    if args.get('pairs_training'):
        trainer.run(training_loader, max_epochs=args['epochs'])
    elif args.get('pairs_validation'):
        # Evaluate Training
        evaluator.run(validation_loader)
        logMetrics(_run, logger, evaluator.state.metrics, 0)

        if recallEstimationOpt:
            logRankingResult(_run, logger, preselect_list_ranking, ranking_scorer, report_database,
                             args.get("ranking_result_file"), 0, "validation")

    # Test Dataset (accuracy, recall, precision, F1)
    pair_test_dataset = args.get('pair_test_dataset')

    if pair_test_dataset is not None and len(pair_test_dataset) > 0:
        pairTestReader = PairBugDatasetReader(pair_test_dataset, preprocessors)
        testLoader = DataLoader(pairTestReader, batch_size=batchSize, collate_fn=cmp_collate.collate)

        if not isinstance(cmp_collate, PairBugCollate):
            raise NotImplementedError('Evaluation of pairs using tanh was not implemented yet')

        logger.info("Test size: %s" % (len(testLoader.dataset)))

        testMetrics = {
            'test_accuracy': ignite.metrics.Accuracy(output_transform=thresholded_output_transform),
            'test_precision': ignite.metrics.Precision(output_transform=thresholded_output_transform),
            'test_recall': ignite.metrics.Recall(output_transform=thresholded_output_transform),
            'test_predictions': PredictionCache(),
        }
        test_evaluator = Engine(validationIteration)

        # Add metrics to evaluator
        for name, metric in testMetrics.items():
            metric.attach(test_evaluator, name)

        test_evaluator.run(testLoader)

        for metricName, metricValue in test_evaluator.state.metrics.items():
            metric = testMetrics[metricName]

            if isinstance(metric, ignite.metrics.Accuracy):
                logger.info({
                    'type': 'metric', 'label': metricName, 'value': metricValue, 'epoch': None,
                    'correct': metric._num_correct, 'total': metric._num_examples
                })
                _run.log_scalar(metricName, metricValue)
            elif isinstance(metric, (ignite.metrics.Precision, ignite.metrics.Recall)):
                logger.info({
                    'type': 'metric', 'label': metricName, 'value': metricValue, 'epoch': None,
                    'tp': metric._true_positives.item(), 'total_positive': metric._positives.item()
                })
                _run.log_scalar(metricName, metricValue)
            elif isinstance(metric, ConfusionMatrix):
                acc = cmAccuracy(metricValue)
                prec = cmPrecision(metricValue, False)
                recall = cmRecall(metricValue, False)
                f1 = 2 * (prec * recall) / (prec + recall + 1e-15)

                logger.info({
                    'type': 'metric', 'label': metricName,
                    'accuracy': float(acc),
                    'precision': prec.cpu().numpy().tolist(),
                    'recall': recall.cpu().numpy().tolist(),
                    'f1': f1.cpu().numpy().tolist(),
                    'confusion_matrix': metricValue.cpu().numpy().tolist(),
                    'epoch': None
                })
                _run.log_scalar('test_f1', f1[1])
            elif isinstance(metric, PredictionCache):
                logger.info({
                    'type': 'metric', 'label': metricName, 'predictions': metric.predictions
                })

    # Calculate recall rate
    recall_rate_opt = args.get('recall_rate', {'type': 'none'})
    if recall_rate_opt['type'] != 'none':
        if recall_rate_opt['type'] == 'sun2011':
            logger.info("Calculating recall rate: {}".format(recall_rate_opt['type']))

            recall_rate_dataset = BugDataset(recall_rate_opt['dataset'])
            ranking_class = SunRanking(report_database, recall_rate_dataset, recall_rate_opt['window'])
            # In the Sun 2011 methodology, the results are always grouped by the master report.
            group_by_master = True
        elif recall_rate_opt['type'] == 'deshmukh':
            logger.info("Calculating recall rate: {}".format(recall_rate_opt['type']))

            recall_rate_dataset = BugDataset(recall_rate_opt['dataset'])
            ranking_class = DeshmukhRanking(report_database, recall_rate_dataset)
            group_by_master = recall_rate_opt['group_by_master']
        else:
            raise ArgumentError(
                "recall_rate.type is invalid (%s). You should choose one of these: sun2011 and deshmukh"
                % recall_rate_opt['type'])

        logRankingResult(_run, logger, ranking_class, ranking_scorer, report_database,
                         recall_rate_opt["result_file"], 0, None, group_by_master)
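def _thresholded_output_transform_sketch(output):
    # Sketch (assumption, not the repository's definition): thresholded_output_transform
    # is referenced above but defined elsewhere. For BCE scores in [0, 1], an ignite
    # output transform of this kind typically rounds the predictions to hard 0/1 labels
    # before Accuracy/Precision/Recall are computed.
    import torch
    y_pred, y = output
    return torch.round(y_pred), y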
bugReportDatabase = BugReportDatabase.fromJson(args.bug_database)

descIdfFileName = args.idf_basename + '_description_tfidf.pk'
sumIdfFileName = args.idf_basename + '_summary_tfidf.pk'
bothIdfFileName = args.idf_basename + '_both_tfidf.pk'

tokenizer = TreebankWordTokenizer()
stemmer = SnowballStemmer('english', ignore_stopwords=True)
stopWords = set(stopwords.words('english'))
classicalPreProcessing = ClassicalPreprocessing(tokenizer, stemmer, stopWords)

if args.training_reports is not None:
    bugSetDataset = BugDataset(args.training_reports)

    bugIds = []
    for idx in range(bugSetDataset.end):
        bugIds.append(bugReportDatabase.getBugByIndex(idx)['bug_id'])

    if os.path.isfile(descIdfFileName):
        logger.warning("Idf file %s exists and it will be overwritten." % descIdfFileName)

    logger.info("Computing and saving idf of the description in the training")
    descTfidf = calculateIdfs(bugReportDatabase, classicalPreProcessing, bugIds, 'description')
    pickle.dump(descTfidf, open(descIdfFileName, 'wb'))
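# Illustration (not part of the original source): the idf object saved above can later be
# restored with pickle, mirroring how other scripts in this codebase load their
# vectorizers, e.g.
#
#   descTfidf = pickle.load(open(args.idf_basename + '_description_tfidf.pk', 'rb'))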
def main(_run, _config, _seed, _log):
    # Setting logger
    args = _config
    logger = _log
    logger.info(args)
    logger.info('It started at: %s' % datetime.now())

    torch.manual_seed(_seed)

    bugReportDatabase = BugReportDatabase.fromJson(args['bug_database'])
    paddingSym = "</s>"
    batchSize = args['batch_size']

    device = torch.device('cuda' if args['cuda'] else "cpu")

    if args['cuda']:
        logger.info("Turning CUDA on")
    else:
        logger.info("Turning CUDA off")

    # Folder where the preprocessed information will be cached.
    cacheFolder = args['cache_folder']

    # Parameters that are saved with the model and restored when a model is loaded.
    importantParameters = ['summary', 'description', 'sum_desc', 'classifier', 'categorical']
    parametersToSave = dict([(parName, args[parName]) for parName in importantParameters])

    if args['load'] is not None:
        mapLocation = (lambda storage, loc: storage.cuda()) if args['cuda'] else 'cpu'
        modelInfo = torch.load(args['load'], map_location=mapLocation)
        modelState = modelInfo['model']

        for paramName, paramValue in modelInfo['params'].items():
            args[paramName] = paramValue
    else:
        modelState = None

    """
    Set the preprocessors that transform the raw information of the bug reports.
    Each type of information has a specific encoder (NN), preprocessor and input handler.
    """
    preprocessors = PreprocessorList()
    encoders = []
    inputHandlers = []

    globalDropout = args['dropout']
    databasePath = args['bug_database']

    sum_desc_opts = args['sum_desc']

    if sum_desc_opts is not None:
        if globalDropout:
            args['sum_desc']['dropout'] = globalDropout

        processSumDescParam(sum_desc_opts, bugReportDatabase, inputHandlers, preprocessors,
                            encoders, cacheFolder, databasePath, logger, paddingSym)

    sumOpts = args.get("summary")

    if sumOpts is not None:
        if globalDropout:
            args['summary']['dropout'] = globalDropout

        processSumParam(sumOpts, bugReportDatabase, inputHandlers, preprocessors, encoders,
                        databasePath, cacheFolder, logger, paddingSym)

    descOpts = args.get("description")

    if descOpts is not None:
        if globalDropout:
            args['description']['dropout'] = globalDropout

        processDescriptionParam(descOpts, bugReportDatabase, inputHandlers, preprocessors,
                                encoders, databasePath, cacheFolder, logger, paddingSym)

    categoricalOpt = args.get('categorical')

    if categoricalOpt is not None and len(categoricalOpt) != 0:
        if globalDropout:
            args['categorical']['dropout'] = globalDropout

        processCategoricalParam(categoricalOpt, bugReportDatabase, inputHandlers,
                                preprocessors, encoders, logger)

    """
    Set the final classifier and the loss. Load the classifier if this argument was set.
""" classifierOpts = args['classifier'] classifierType = classifierOpts['type'] labelDType = None if globalDropout: args['classifier']['dropout'] = globalDropout if classifierType == 'binary': withoutBugEmbedding = classifierOpts.get('without_embedding', False) batchNorm = classifierOpts.get('batch_normalization', True) dropout = classifierOpts.get('dropout', 0.0) hiddenSizes = classifierOpts.get('hidden_sizes', [100]) model = ProbabilityPairNN(encoders, withoutBugEmbedding, hiddenSizes, batchNorm, dropout) lossFn = NLLLoss() lossNoReduction = NLLLoss(reduction='none') labelDType = torch.int64 logger.info("Using NLLLoss") elif classifierType == 'cosine': model = CosinePairNN(encoders) margin = classifierOpts.get('margin', 0.0) if classifierOpts['loss'] == 'cosine_loss': lossFn = CosineLoss(margin) lossNoReduction = CosineLoss(margin, reduction='none') labelDType = torch.float32 logger.info("Using Cosine Embeding Loss: margin={}".format(margin)) elif classifierOpts['loss'] == 'neculoiu_loss': lossFn = NeculoiuLoss(margin) lossNoReduction = NeculoiuLoss(margin, reduction='none') labelDType = torch.float32 logger.info("Using Neculoiu Loss: margin={}".format(margin)) model.to(device) if modelState: model.load_state_dict(modelState) """ Loading the training and validation. Also, it sets how the negative example will be generated. """ pairCollate = PairBugCollate(inputHandlers, labelDType) # load training if args.get('pairs_training'): negativePairGenOpt = args.get('neg_pair_generator', ) pairsTrainingFile = args.get('pairs_training') randomAnchor = negativePairGenOpt['random_anchor'] offlineGeneration = not (negativePairGenOpt is None or negativePairGenOpt['type'] == 'none') if not offlineGeneration: logger.info("Not generate dynamically the negative examples.") pairTrainingReader = PairBugDatasetReader( pairsTrainingFile, preprocessors, randomInvertPair=args['random_switch']) else: pairGenType = negativePairGenOpt['type'] masterIdByBugId = bugReportDatabase.getMasterIdByBugId() if pairGenType == 'random': logger.info("Random Negative Pair Generator") trainingDataset = BugDataset(negativePairGenOpt['training']) bugIds = trainingDataset.bugIds logger.info( "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d" % (trainingDataset.info, len(bugIds))) negativePairGenerator = RandomGenerator( preprocessors, pairCollate, negativePairGenOpt['rate'], bugIds, masterIdByBugId, randomAnchor=randomAnchor) elif pairGenType == 'non_negative': logger.info("Non Negative Pair Generator") trainingDataset = BugDataset(negativePairGenOpt['training']) bugIds = trainingDataset.bugIds logger.info( "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d" % (trainingDataset.info, len(bugIds))) negativePairGenerator = NonNegativeRandomGenerator( preprocessors, pairCollate, negativePairGenOpt['rate'], bugIds, masterIdByBugId, negativePairGenOpt['n_tries'], device, randomAnchor=randomAnchor) elif pairGenType == 'misc_non_zero': logger.info("Misc Non Zero Pair Generator") trainingDataset = BugDataset(negativePairGenOpt['training']) bugIds = trainingDataset.bugIds logger.info( "Using the following dataset to generate negative examples: %s. 
Number of bugs in the training: %d" % (trainingDataset.info, len(bugIds))) negativePairGenerator = MiscNonZeroRandomGen( preprocessors, pairCollate, negativePairGenOpt['rate'], bugIds, trainingDataset.duplicateIds, masterIdByBugId, negativePairGenOpt['n_tries'], device) elif pairGenType == 'random_k': logger.info("Random K Negative Pair Generator") trainingDataset = BugDataset(negativePairGenOpt['training']) bugIds = trainingDataset.bugIds logger.info( "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d" % (trainingDataset.info, len(bugIds))) negativePairGenerator = KRandomGenerator( preprocessors, pairCollate, negativePairGenOpt['rate'], bugIds, masterIdByBugId, negativePairGenOpt['k'], device) elif pairGenType == "pre": logger.info("Pre-selected list generator") negativePairGenerator = PreSelectedGenerator( negativePairGenOpt['pre_list_file'], preprocessors, negativePairGenOpt['rate'], masterIdByBugId, negativePairGenOpt['preselected_length']) elif pairGenType == "misc_non_zero_pre": logger.info("Misc: non-zero and Pre-selected list generator") negativePairGenerator1 = PreSelectedGenerator( negativePairGenOpt['pre_list_file'], preprocessors, negativePairGenOpt['rate'], masterIdByBugId, negativePairGenOpt['preselected_length']) trainingDataset = BugDataset(negativePairGenOpt['training']) bugIds = trainingDataset.bugIds negativePairGenerator2 = NonNegativeRandomGenerator( preprocessors, pairCollate, negativePairGenOpt['rate'], bugIds, masterIdByBugId, negativePairGenOpt['n_tries'], device) negativePairGenerator = MiscOfflineGenerator( (negativePairGenerator1, negativePairGenerator2)) else: raise ArgumentError( "Offline generator is invalid (%s). You should choose one of these: random, non_negative, misc_non_zero, random_k, pre and misc_non_zero_pre" % pairGenType) pairTrainingReader = PairBugDatasetReader( pairsTrainingFile, preprocessors, negativePairGenerator, randomInvertPair=args['random_switch']) trainingLoader = DataLoader(pairTrainingReader, batch_size=batchSize, collate_fn=pairCollate.collate, shuffle=True) logger.info("Training size: %s" % (len(trainingLoader.dataset))) # load validation if args.get('pairs_validation'): pairValidationReader = PairBugDatasetReader( args.get('pairs_validation'), preprocessors) validationLoader = DataLoader(pairValidationReader, batch_size=batchSize, collate_fn=pairCollate.collate) logger.info("Validation size: %s" % (len(validationLoader.dataset))) else: validationLoader = None """ Train and evaluate the model. 
""" optimizer_opt = args.get('optimizer', 'adam') if optimizer_opt == 'sgd': logger.info('SGD') optimizer = optim.SGD(model.parameters(), lr=args['lr'], weight_decay=args['l2'], momentum=args['momentum']) elif optimizer_opt == 'adam': logger.info('Adam') optimizer = optim.Adam(model.parameters(), lr=args['lr'], weight_decay=args['l2']) # Recall rate rankingScorer = SharedEncoderNNScorer(preprocessors, inputHandlers, model, device, args['ranking_batch_size']) recallEstimationTrainOpt = args.get('recall_estimation_train') if recallEstimationTrainOpt: preselectListRankingTrain = PreselectListRanking( recallEstimationTrainOpt) recallEstimationOpt = args.get('recall_estimation') if recallEstimationOpt: preselectListRanking = PreselectListRanking(recallEstimationOpt) lrSchedulerOpt = args.get('lr_scheduler', None) if lrSchedulerOpt is None: logger.info("Scheduler: Constant") lrSched = None elif lrSchedulerOpt["type"] == 'step': logger.info("Scheduler: StepLR (step:%s, decay:%f)" % (lrSchedulerOpt["step_size"], args["decay"])) lrSched = StepLR(optimizer, lrSchedulerOpt["step_size"], lrSchedulerOpt["decay"]) elif lrSchedulerOpt["type"] == 'exp': logger.info("Scheduler: ExponentialLR (decay:%f)" % (lrSchedulerOpt["decay"])) lrSched = ExponentialLR(optimizer, lrSchedulerOpt["decay"]) elif lrSchedulerOpt["type"] == 'linear': logger.info( "Scheduler: Divide by (1 + epoch * decay) ---- (decay:%f)" % (lrSchedulerOpt["decay"])) lrDecay = lrSchedulerOpt["decay"] lrSched = LambdaLR(optimizer, lambda epoch: 1 / (1.0 + epoch * lrDecay)) else: raise ArgumentError( "LR Scheduler is invalid (%s). You should choose one of these: step, exp and linear " % pairGenType) def scoreDistanceTrans(output): if len(output) == 3: _, y_pred, y = output else: y_pred, y = output if isinstance(lossFn, NLLLoss): return torch.exp(y_pred[:, 1]), y elif isinstance(lossFn, CosineLoss): return y_pred, (y * 2) - 1 # Set training functions def trainingIteration(engine, batch): model.train() optimizer.zero_grad() (bug1, bug2), y = pairCollate.to(batch, device) output = model(bug1, bug2) loss = lossFn(output, y) loss.backward() optimizer.step() return loss, output, y trainer = Engine(trainingIteration) negTarget = 0.0 if isinstance(lossFn, NLLLoss) else -1.0 trainingMetrics = { 'training_loss': AverageLoss(lossFn), 'training_dist_target': MeanScoreDistance(negTarget=negTarget, output_transform=scoreDistanceTrans), 'training_confusion_matrix': ConfusionMatrix(2, output_transform=lambda x: (x[1], x[2])), } # Add metrics to trainer for name, metric in trainingMetrics.items(): metric.attach(trainer, name) # Set validation functions def validationIteration(engine, batch): model.eval() with torch.no_grad(): (bug1, bug2), y = pairCollate.to(batch, device) y_pred = model(bug1, bug2) return y_pred, y validationMetrics = { 'validation_loss': LossWrapper(lossFn), 'validation_dist_target': MeanScoreDistance(negTarget=negTarget, output_transform=scoreDistanceTrans), 'validation_confusion_matrix': ConfusionMatrix(2), } evaluator = Engine(validationIteration) # Add metrics to evaluator for name, metric in validationMetrics.items(): metric.attach(evaluator, name) @trainer.on(Events.EPOCH_STARTED) def onStartEpoch(engine): epoch = engine.state.epoch logger.info("Epoch: %d" % epoch) if lrSched: lrSched.step() logger.info("LR: %s" % str(optimizer.param_groups[0]["lr"])) @trainer.on(Events.EPOCH_COMPLETED) def onEndEpoch(engine): epoch = engine.state.epoch logConfusionMatrix(_run, logger, 'training_confusion_matrix', 
engine.state.metrics['training_confusion_matrix'], epoch) logMetrics(_run, logger, engine.state.metrics, epoch) # Evaluate Training if validationLoader: evaluator.run(validationLoader) logConfusionMatrix( _run, logger, 'validation_confusion_matrix', evaluator.state.metrics['validation_confusion_matrix'], epoch) logMetrics(_run, logger, evaluator.state.metrics, epoch) if recallEstimationTrainOpt and (epoch % args['rr_train_epoch'] == 0): logRankingResult(_run, logger, preselectListRankingTrain, rankingScorer, bugReportDatabase, None, epoch, "train") if recallEstimationOpt and (epoch % args['rr_val_epoch'] == 0): logRankingResult(_run, logger, preselectListRanking, rankingScorer, bugReportDatabase, args.get("ranking_result_file"), epoch, "validation") if offlineGeneration: pairTrainingReader.sampleNewNegExamples(model, lossNoReduction) if args.get('save'): modelInfo = { 'model': model.state_dict(), 'params': parametersToSave } logger.info("==> Saving Model: %s" % args['save']) torch.save(modelInfo, args['save']) if args.get('pairs_training'): trainer.run(trainingLoader, max_epochs=args['epochs']) elif args.get('pairs_validation'): # Evaluate the validation set only evaluator.run(validationLoader) logMetrics(_run, logger, evaluator.state.metrics, 0) if recallEstimationOpt: logRankingResult(_run, logger, preselectListRanking, rankingScorer, bugReportDatabase, args.get("ranking_result_file"), 0, "validation") # Test Dataset (accuracy, recall, precision, F1) pair_test_dataset = args.get('pair_test_dataset') if pair_test_dataset is not None and len(pair_test_dataset) > 0: pairTestReader = PairBugDatasetReader(pair_test_dataset, preprocessors) testLoader = DataLoader(pairTestReader, batch_size=batchSize, collate_fn=pairCollate.collate) logger.info("Test size: %s" % (len(testLoader.dataset))) testMetrics = { 'test_accuracy': ignite.metrics.Accuracy(), 'test_precision': ignite.metrics.Precision(), 'test_recall': ignite.metrics.Recall(), 'test_confusion_matrix': ConfusionMatrix(2), 'test_predictions': PredictionCache(), } test_evaluator = Engine(validationIteration) # Add metrics to evaluator for name, metric in testMetrics.items(): metric.attach(test_evaluator, name) test_evaluator.run(testLoader) for metricName, metricValue in test_evaluator.state.metrics.items(): metric = testMetrics[metricName] if isinstance(metric, ignite.metrics.Accuracy): logger.info({ 'type': 'metric', 'label': metricName, 'value': metricValue, 'epoch': None, 'correct': metric._num_correct, 'total': metric._num_examples }) _run.log_scalar(metricName, metricValue) elif isinstance(metric, (ignite.metrics.Precision, ignite.metrics.Recall)): logger.info({ 'type': 'metric', 'label': metricName, 'value': float(metricValue.cpu().numpy()[1]), 'epoch': None, 'tp': metric._true_positives.cpu().numpy().tolist(), 'total_positive': metric._positives.cpu().numpy().tolist() }) _run.log_scalar(metricName, metricValue[1]) elif isinstance(metric, ConfusionMatrix): acc = cmAccuracy(metricValue) prec = cmPrecision(metricValue, False) recall = cmRecall(metricValue, False) f1 = 2 * (prec * recall) / (prec + recall + 1e-15) logger.info({ 'type': 'metric', 'label': metricName, 'accuracy': float(acc), 'precision': prec.cpu().numpy().tolist(), 'recall': recall.cpu().numpy().tolist(), 'f1': f1.cpu().numpy().tolist(), 'confusion_matrix': metricValue.cpu().numpy().tolist(), 'epoch': None }) _run.log_scalar('test_f1', f1[1]) elif isinstance(metric, PredictionCache): logger.info({ 'type': 'metric', 'label': metricName, 'predictions': metric.predictions }) # Calculate recall rate
recallRateOpt = args.get('recall_rate', {'type': 'none'}) if recallRateOpt['type'] != 'none': if recallRateOpt['type'] == 'sun2011': logger.info("Calculating recall rate: {}".format( recallRateOpt['type'])) recallRateDataset = BugDataset(recallRateOpt['dataset']) rankingClass = SunRanking(bugReportDatabase, recallRateDataset, recallRateOpt['window']) # We always group the bug reports by master in the results when using the Sun 2011 methodology group_by_master = True elif recallRateOpt['type'] == 'deshmukh': logger.info("Calculating recall rate: {}".format( recallRateOpt['type'])) recallRateDataset = BugDataset(recallRateOpt['dataset']) rankingClass = DeshmukhRanking(bugReportDatabase, recallRateDataset) group_by_master = recallRateOpt["group_by_master"] else: raise ArgumentError( "recall_rate.type is invalid (%s). You should choose one of these: none, sun2011 and deshmukh" % recallRateOpt['type']) logRankingResult(_run, logger, rankingClass, rankingScorer, bugReportDatabase, recallRateOpt["result_file"], 0, None, group_by_master)
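# --- Illustrative sketch (added; not part of the original sources): the
# recall rate reported by logRankingResult is conventionally RR@k, the
# fraction of duplicate queries whose master report (or any report sharing
# its master, when group_by_master is True) appears in the top-k candidates.
# The helper below is hypothetical; `rankings` maps a duplicate bug id to its
# ranked candidate ids and `master_set_by_dup` to the acceptable answers.
def recall_rate_at_k(rankings, master_set_by_dup, k):
    hits = sum(
        1 for dup_id, candidates in rankings.items()
        if any(c in master_set_by_dup[dup_id] for c in candidates[:k]))
    return hits / len(rankings)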
if __name__ == '__main__': parser = argparse.ArgumentParser(description='') parser.add_argument('--bug_data', required=True, help="") parser.add_argument('--dataset', required=True, help="") parser.add_argument('--n', required=True, type=int, help="") parser.add_argument('--type', required=True, help="") parser.add_argument('--aux_file', help="") parser.add_argument('--model', help="") logging.basicConfig(level=logging.DEBUG, datefmt='%Y-%m-%d %H:%M:%S') logger = logging.getLogger() args = parser.parse_args() logger.info(args) bugDataset = BugDataset(args.dataset) bugReportDatabase = BugReportDatabase.fromJson(args.bug_data) bugIds = bugDataset.bugIds duplicateBugs = bugDataset.duplicateIds if args.aux_file: ''' In our methodology, we compare a new bug with all the bugs that were previously added to the database. To better generate pairs and triplets, we use the bugs that were reported before the ones in the validation set. ''' auxBugDataset = BugDataset(args.aux_file) bugsFromMainFile = list(bugIds) bugsFromMainFileSet = set(bugIds)
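# --- Illustrative sketch (added; not part of the original sources): one
# simple way to draw a (bug, duplicate, non-duplicate) triplet from such a
# candidate pool, as described in the comment above. The helper and the
# master_id_by_bug_id mapping are hypothetical.
import random

def sample_triplet(bug_id, duplicate_id, candidate_pool, master_id_by_bug_id):
    master = master_id_by_bug_id.get(bug_id, bug_id)
    while True:
        negative_id = random.choice(candidate_pool)
        # A valid negative example must not share a master with the query.
        if master_id_by_bug_id.get(negative_id, negative_id) != master:
            return bug_id, duplicate_id, negative_id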
def main(_run, _config, _seed, _log): """ :param _run: :param _config: :param _seed: :param _log: :return: """ """ Setting and loading parameters """ # Setting logger args = _config logger = _log logger.info(args) logger.info('It started at: %s' % datetime.now()) torch.manual_seed(_seed) bugReportDatabase = BugReportDatabase.fromJson(args['bug_database']) paddingSym = "</s>" batchSize = args['batch_size'] device = torch.device('cuda' if args['cuda'] else "cpu") cudaOn = args['cuda'] if args['cuda']: logger.info("Turning CUDA on") else: logger.info("Turning CUDA off") # It is the folder where the preprocessed information will be stored. cacheFolder = args['cache_folder'] # Setting the parameter to save and loading parameters importantParameters = ['compare_aggregation', 'categorical'] parametersToSave = dict([(parName, args[parName]) for parName in importantParameters]) if args['load'] is not None: mapLocation = ( lambda storage, loc: storage.cuda()) if cudaOn else 'cpu' modelInfo = torch.load(args['load'], map_location=mapLocation) modelState = modelInfo['model'] for paramName, paramValue in modelInfo['params'].items(): args[paramName] = paramValue else: modelState = None if args['rep'] is not None and args['rep']['model']: logger.info("Loading REP") rep = read_weights(args['rep']['model']) rep_input, max_tkn_id = read_dbrd_file(args['rep']['input'], math.inf) rep_recommendation = args['rep']['k'] rep.fit_transform(rep_input, max_tkn_id, True) rep_input_by_id = {} for inp in rep_input: rep_input_by_id[inp[SUN_REPORT_ID_INDEX]] = inp else: rep = None preprocessors = PreprocessorList() inputHandlers = [] categoricalOpt = args.get('categorical') if categoricalOpt is not None and len(categoricalOpt) != 0: categoricalEncoder, _, _ = processCategoricalParam( categoricalOpt, bugReportDatabase, inputHandlers, preprocessors, None, logger, cudaOn) else: categoricalEncoder = None filterInputHandlers = [] compareAggOpt = args['compare_aggregation'] databasePath = args['bug_database'] # Loading word embedding if compareAggOpt["word_embedding"]: # todo: Allow use embeddings and other representation lexicon, embedding = Embedding.fromFile( compareAggOpt['word_embedding'], 'UUUKNNN', hasHeader=False, paddingSym=paddingSym) logger.info("Lexicon size: %d" % (lexicon.getLen())) logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize())) paddingId = lexicon.getLexiconIndex(paddingSym) lazy = False else: embedding = None # Tokenizer if compareAggOpt['tokenizer'] == 'default': logger.info("Use default tokenizer to tokenize summary information") tokenizer = MultiLineTokenizer() elif compareAggOpt['tokenizer'] == 'white_space': logger.info( "Use white space tokenizer to tokenize summary information") tokenizer = WhitespaceTokenizer() else: raise ArgumentError( "Tokenizer value %s is invalid. 
You should choose one of these: default and white_space" % compareAggOpt['tokenizer']) # Preparing input handlers, preprocessors and cache minSeqSize = max(compareAggOpt['aggregate']["window"] ) if compareAggOpt['aggregate']["model"] == "cnn" else -1 if compareAggOpt['summary'] is not None: # Use summary and description (concatenated) to address this problem logger.info("Using Summary information.") # Loading Filters sumFilters = loadFilters(compareAggOpt['summary']['filters']) if compareAggOpt['summary']['model_type'] in ('lstm', 'gru', 'word_emd', 'residual'): arguments = (databasePath, compareAggOpt['word_embedding'], ' '.join( sorted([ fil.__class__.__name__ for fil in sumFilters ])), compareAggOpt['tokenizer'], SummaryPreprocessor.__name__) inputHandlers.append( RNNInputHandler(paddingId, minInputSize=minSeqSize)) summaryCache = PreprocessingCache(cacheFolder, arguments) summaryPreprocessor = SummaryPreprocessor(lexicon, bugReportDatabase, sumFilters, tokenizer, paddingId, summaryCache) elif compareAggOpt['summary']['model_type'] == 'ELMo': raise NotImplementedError("ELMO is not implemented!") # inputHandlers.append(ELMoInputHandler(cudaOn, minInputSize=minSeqSize)) # summaryPreprocessor = ELMoPreprocessor(0, elmoEmbedding) # compareAggOpt['summary']["input_size"] = elmoEmbedding.get_size() elif compareAggOpt['summary']['model_type'] == 'BERT': arguments = (databasePath, "CADD SUMMARY", "BERT", "bert-base-uncased") inputHandlers.append(BERTInputHandler(0, minInputSize=minSeqSize)) summaryCache = PreprocessingCache(cacheFolder, arguments) summaryPreprocessor = TransformerPreprocessor( "short_desc", "bert-base-uncased", BertTokenizer, 0, bugReportDatabase, summaryCache) # compareAggOpt['summary']["input_size"] = 768 preprocessors.append(summaryPreprocessor) if compareAggOpt['desc'] is not None: # Use summary and description (concatenated) to address this problem logger.info("Using Description information.") descFilters = loadFilters(compareAggOpt['desc']['filters']) if compareAggOpt['desc']['model_type'] in ('lstm', 'gru', 'word_emd', 'residual'): arguments = (databasePath, compareAggOpt['word_embedding'], ' '.join( sorted([ fil.__class__.__name__ for fil in descFilters ])), compareAggOpt['tokenizer'], "CADD DESC", str(compareAggOpt['desc']['summarization'])) inputHandlers.append( RNNInputHandler(paddingId, minInputSize=minSeqSize)) descriptionCache = PreprocessingCache(cacheFolder, arguments) descPreprocessor = DescriptionPreprocessor(lexicon, bugReportDatabase, descFilters, tokenizer, paddingId, cache=descriptionCache) elif compareAggOpt['desc']['model_type'] == 'ELMo': raise NotImplementedError("ELMO is not implemented!") # inputHandlers.append(ELMoInputHandler(cudaOn, minInputSize=minSeqSize)) # descPreprocessor = ELMoPreprocessor(1, elmoEmbedding) # compareAggOpt['desc']["input_size"] = elmoEmbedding.get_size() elif compareAggOpt['desc']['model_type'] == 'BERT': arguments = (databasePath, "CADD DESC", "BERT", "bert-base-uncased") inputHandlers.append(BERTInputHandler(0, minInputSize=minSeqSize)) descriptionCache = PreprocessingCache(cacheFolder, arguments) descPreprocessor = TransformerPreprocessor("description", "bert-base-uncased", BertTokenizer, 0, bugReportDatabase, descriptionCache) # compareAggOpt['desc']["input_size"] = 768 preprocessors.append(descPreprocessor) # Create model model = CADD(embedding, categoricalEncoder, compareAggOpt, compareAggOpt['summary'], compareAggOpt['desc'], compareAggOpt['matching'], compareAggOpt['aggregate'], cudaOn=cudaOn) lossFn = F.nll_loss 
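# Note (added comment): F.nll_loss expects log-probabilities, so this wiring
# assumes CADD's classifier head ends in a log_softmax (the scoreDistanceTrans
# below exponentiates the output, which is consistent with that); if the model
# produced raw logits, F.cross_entropy would be the equivalent loss to use.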
lossNoReduction = NLLLoss(reduction='none') if cudaOn: model.cuda() if modelState: model.load_state_dict(modelState) """ Load the training and validation datasets. Also, set how the negative examples will be generated. """ cmpAggCollate = PairBugCollate(inputHandlers, torch.int64) # load training if args.get('pairs_training'): negativePairGenOpt = args.get('neg_pair_generator') pairTrainingFile = args.get('pairs_training') offlineGeneration = not (negativePairGenOpt is None or negativePairGenOpt['type'] == 'none') masterIdByBugId = bugReportDatabase.getMasterIdByBugId() randomAnchor = negativePairGenOpt['random_anchor'] if offlineGeneration else False if rep: logger.info("Generate negative examples using REP.") trainingDataset = BugDataset(args['rep']['training']) bugIds = trainingDataset.bugIds negativePairGenerator = REPGenerator(rep, rep_input_by_id, args['rep']['neg_training'], preprocessors, bugIds, masterIdByBugId, args['rep']['rate'], randomAnchor) elif not offlineGeneration: logger.info("Not generating negative examples dynamically.") negativePairGenerator = None else: pairGenType = negativePairGenOpt['type'] if pairGenType == 'random': logger.info("Random Negative Pair Generator") trainingDataset = BugDataset(negativePairGenOpt['training']) bugIds = trainingDataset.bugIds logger.info( "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d" % (trainingDataset.info, len(bugIds))) negativePairGenerator = RandomGenerator( preprocessors, cmpAggCollate, negativePairGenOpt['rate'], bugIds, masterIdByBugId, randomAnchor=randomAnchor) elif pairGenType == 'non_negative': logger.info("Non Negative Pair Generator") trainingDataset = BugDataset(negativePairGenOpt['training']) bugIds = trainingDataset.bugIds logger.info( "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d" % (trainingDataset.info, len(bugIds))) negativePairGenerator = NonNegativeRandomGenerator( preprocessors, cmpAggCollate, negativePairGenOpt['rate'], bugIds, masterIdByBugId, negativePairGenOpt['n_tries'], device, randomAnchor=randomAnchor) elif pairGenType == 'misc_non_zero': logger.info("Misc Non Zero Pair Generator") trainingDataset = BugDataset(negativePairGenOpt['training']) bugIds = trainingDataset.bugIds logger.info( "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d" % (trainingDataset.info, len(bugIds))) negativePairGenerator = MiscNonZeroRandomGen( preprocessors, cmpAggCollate, negativePairGenOpt['rate'], bugIds, trainingDataset.duplicateIds, masterIdByBugId, negativePairGenOpt['n_tries'], device, randomAnchor=randomAnchor) elif pairGenType == 'random_k': logger.info("Random K Negative Pair Generator") trainingDataset = BugDataset(negativePairGenOpt['training']) bugIds = trainingDataset.bugIds logger.info( "Using the following dataset to generate negative examples: %s. 
Number of bugs in the training: %d" % (trainingDataset.info, len(bugIds))) negativePairGenerator = KRandomGenerator( preprocessors, cmpAggCollate, negativePairGenOpt['rate'], bugIds, masterIdByBugId, negativePairGenOpt['k'], device, randomAnchor=randomAnchor) elif pairGenType == "pre": logger.info("Pre-selected list generator") negativePairGenerator = PreSelectedGenerator( negativePairGenOpt['pre_list_file'], preprocessors, negativePairGenOpt['rate'], masterIdByBugId, negativePairGenOpt['preselected_length'], randomAnchor=randomAnchor) elif pairGenType == "positive_pre": logger.info("Positive Pre-selected list generator") negativePairGenerator = PositivePreSelectedGenerator( negativePairGenOpt['pre_list_file'], preprocessors, cmpAggCollate, negativePairGenOpt['rate'], masterIdByBugId, negativePairGenOpt['preselected_length'], randomAnchor=randomAnchor) elif pairGenType == "misc_non_zero_pre": logger.info("Misc: non-zero and Pre-selected list generator") negativePairGenerator1 = PreSelectedGenerator( negativePairGenOpt['pre_list_file'], preprocessors, negativePairGenOpt['rate'], masterIdByBugId, negativePairGenOpt['preselected_length'], randomAnchor=randomAnchor) trainingDataset = BugDataset(negativePairGenOpt['training']) bugIds = trainingDataset.bugIds negativePairGenerator2 = NonNegativeRandomGenerator( preprocessors, cmpAggCollate, negativePairGenOpt['rate'], bugIds, masterIdByBugId, negativePairGenOpt['n_tries'], device, randomAnchor=randomAnchor) negativePairGenerator = MiscOfflineGenerator( (negativePairGenerator1, negativePairGenerator2)) elif pairGenType == "misc_non_zero_positive_pre": logger.info( "Misc: non-zero and Positive Pre-selected list generator") negativePairGenerator1 = PositivePreSelectedGenerator( negativePairGenOpt['pre_list_file'], preprocessors, cmpAggCollate, negativePairGenOpt['rate'], masterIdByBugId, negativePairGenOpt['preselected_length'], randomAnchor=randomAnchor) trainingDataset = BugDataset(negativePairGenOpt['training']) bugIds = trainingDataset.bugIds negativePairGenerator2 = NonNegativeRandomGenerator( preprocessors, cmpAggCollate, negativePairGenOpt['rate'], bugIds, masterIdByBugId, negativePairGenOpt['n_tries'], device, randomAnchor=randomAnchor) negativePairGenerator = MiscOfflineGenerator( (negativePairGenerator1, negativePairGenerator2)) else: raise ArgumentError( "Offline generator is invalid (%s). You should choose one of these: random, non_negative, misc_non_zero, random_k, pre, positive_pre, misc_non_zero_pre and misc_non_zero_positive_pre" % pairGenType) pairTrainingReader = PairBugDatasetReader( pairTrainingFile, preprocessors, negativePairGenerator, randomInvertPair=args['random_switch']) trainingCollate = cmpAggCollate trainingLoader = DataLoader(pairTrainingReader, batch_size=batchSize, collate_fn=trainingCollate.collate, shuffle=True) logger.info("Training size: %s" % (len(trainingLoader.dataset))) # load validation if args.get('pairs_validation'): pairValidationReader = PairBugDatasetReader( args.get('pairs_validation'), preprocessors) validationLoader = DataLoader(pairValidationReader, batch_size=batchSize, collate_fn=cmpAggCollate.collate) logger.info("Validation size: %s" % (len(validationLoader.dataset))) else: validationLoader = None """ Train and evaluate the model. 
""" optimizer_opt = args.get('optimizer', 'adam') if optimizer_opt == 'sgd': logger.info('SGD') optimizer = optim.SGD(model.parameters(), lr=args['lr'], weight_decay=args['l2']) elif optimizer_opt == 'adam': logger.info('Adam') optimizer = optim.Adam(model.parameters(), lr=args['lr'], weight_decay=args['l2']) # Recall rate rankingScorer = GeneralScorer(model, preprocessors, device, cmpAggCollate) recallEstimationTrainOpt = args.get('recall_estimation_train') if recallEstimationTrainOpt: preselectListRankingTrain = PreselectListRanking( recallEstimationTrainOpt, args['sample_size_rr_tr']) recallEstimationOpt = args.get('recall_estimation') if recallEstimationOpt: preselectListRanking = PreselectListRanking(recallEstimationOpt, args['sample_size_rr_val']) # LR scheduler lrSchedulerOpt = args.get('lr_scheduler', None) if lrSchedulerOpt is None: logger.info("Scheduler: Constant") lrSched = None elif lrSchedulerOpt["type"] == 'step': logger.info("Scheduler: StepLR (step:%s, decay:%f)" % (lrSchedulerOpt["step_size"], args["decay"])) lrSched = StepLR(optimizer, lrSchedulerOpt["step_size"], lrSchedulerOpt["decay"]) elif lrSchedulerOpt["type"] == 'exp': logger.info("Scheduler: ExponentialLR (decay:%f)" % (lrSchedulerOpt["decay"])) lrSched = ExponentialLR(optimizer, lrSchedulerOpt["decay"]) elif lrSchedulerOpt["type"] == 'linear': logger.info( "Scheduler: Divide by (1 + epoch * decay) ---- (decay:%f)" % (lrSchedulerOpt["decay"])) lrDecay = lrSchedulerOpt["decay"] lrSched = LambdaLR(optimizer, lambda epoch: 1 / (1.0 + epoch * lrDecay)) else: raise ArgumentError( "LR Scheduler is invalid (%s). You should choose one of these: step, exp and linear " % pairGenType) # Set training functions def trainingIteration(engine, batch): engine.kk = 0 model.train() optimizer.zero_grad() x, y = batch output = model(*x) loss = lossFn(output, y) loss.backward() optimizer.step() return loss, output, y def scoreDistanceTrans(output): if len(output) == 3: _, y_pred, y = output else: y_pred, y = output if lossFn == F.nll_loss: return torch.exp(y_pred[:, 1]), y trainer = Engine(trainingIteration) trainingMetrics = { 'training_loss': AverageLoss(lossFn, batch_size=lambda x: x[0].shape[0]), 'training_dist_target': MeanScoreDistance(output_transform=scoreDistanceTrans) } # Add metrics to trainer for name, metric in trainingMetrics.items(): metric.attach(trainer, name) # Set validation functions def validationIteration(engine, batch): if not hasattr(engine, 'kk'): engine.kk = 0 model.eval() with torch.no_grad(): x, y = batch y_pred = model(*x) # for k, (pred, t) in enumerate(zip(y_pred, y)): # engine.kk += 1 # print("{}: {} \t {}".format(engine.kk, torch.round(torch.exp(pred) * 100), t)) return y_pred, y validationMetrics = { 'validation_loss': ignite.metrics.Loss(lossFn), 'validation_dist_target': MeanScoreDistance(output_transform=scoreDistanceTrans) } evaluator = Engine(validationIteration) # Add metrics to evaluator for name, metric in validationMetrics.items(): metric.attach(evaluator, name) # recommendation if rep: recommendation_fn = REP_CADD_Recommender( rep, rep_input_by_id, rep_recommendation).generateRecommendationList else: recommendation_fn = generateRecommendationList @trainer.on(Events.EPOCH_STARTED) def onStartEpoch(engine): epoch = engine.state.epoch logger.info("Epoch: %d" % epoch) if lrSched: lrSched.step() logger.info("LR: %s" % str(optimizer.param_groups[0]["lr"])) @trainer.on(Events.EPOCH_COMPLETED) def onEndEpoch(engine): epoch = engine.state.epoch logMetrics(_run, logger, engine.state.metrics, epoch) # 
if validationLoader: evaluator.run(validationLoader) logMetrics(_run, logger, evaluator.state.metrics, epoch) if recallEstimationTrainOpt and (epoch % args['rr_train_epoch'] == 0): logRankingResult(_run, logger, preselectListRankingTrain, rankingScorer, bugReportDatabase, None, epoch, "train", recommendationListfn=recommendation_fn) rankingScorer.free() if recallEstimationOpt and (epoch % args['rr_val_epoch'] == 0): logRankingResult(_run, logger, preselectListRanking, rankingScorer, bugReportDatabase, args.get("ranking_result_file"), epoch, "validation", recommendationListfn=recommendation_fn) rankingScorer.free() pairTrainingReader.sampleNewNegExamples(model, lossNoReduction) if args.get('save'): save_by_epoch = args['save_by_epoch'] if save_by_epoch and epoch in save_by_epoch: file_name, file_extension = os.path.splitext(args['save']) file_path = file_name + '_epoch_{}'.format( epoch) + file_extension else: file_path = args['save'] modelInfo = { 'model': model.state_dict(), 'params': parametersToSave } logger.info("==> Saving Model: %s" % file_path) torch.save(modelInfo, file_path) if args.get('pairs_training'): trainer.run(trainingLoader, max_epochs=args['epochs']) elif args.get('pairs_validation'): # Evaluate the validation set only evaluator.run(validationLoader) logMetrics(_run, logger, evaluator.state.metrics, 0) if recallEstimationOpt: logRankingResult(_run, logger, preselectListRanking, rankingScorer, bugReportDatabase, args.get("ranking_result_file"), 0, "validation", recommendationListfn=recommendation_fn) recallRateOpt = args.get('recall_rate', {'type': 'none'}) if recallRateOpt['type'] != 'none': if recallRateOpt['type'] == 'sun2011': logger.info("Calculating recall rate: {}".format( recallRateOpt['type'])) recallRateDataset = BugDataset(recallRateOpt['dataset']) rankingClass = SunRanking(bugReportDatabase, recallRateDataset, recallRateOpt['window']) # We always group the bug reports by master in the results when using the Sun 2011 methodology group_by_master = True elif recallRateOpt['type'] == 'deshmukh': logger.info("Calculating recall rate: {}".format( recallRateOpt['type'])) recallRateDataset = BugDataset(recallRateOpt['dataset']) rankingClass = DeshmukhRanking(bugReportDatabase, recallRateDataset) group_by_master = recallRateOpt['group_by_master'] else: raise ArgumentError( "recall_rate.type is invalid (%s). You should choose one of these: none, sun2011 and deshmukh" % recallRateOpt['type']) logRankingResult(_run, logger, rankingClass, rankingScorer, bugReportDatabase, recallRateOpt["result_file"], 0, None, group_by_master, recommendationListfn=recommendation_fn)
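# --- Illustrative sketch (added; not part of the original sources): the
# 'linear' scheduler configured above multiplies the base learning rate by
# 1 / (1 + epoch * decay) through LambdaLR. A minimal, self-contained
# demonstration of the resulting decay curve:
import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import LambdaLR

param = torch.nn.Parameter(torch.zeros(1))
optimizer = SGD([param], lr=0.1)
decay = 0.5
sched = LambdaLR(optimizer, lambda epoch: 1 / (1.0 + epoch * decay))
for epoch in range(4):
    # epoch 0 -> 0.1000, epoch 1 -> 0.0667, epoch 2 -> 0.0500, epoch 3 -> 0.0400
    print(epoch, optimizer.param_groups[0]['lr'])
    sched.step()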