def processDescriptionParam(descOpts, bugReportDatabase, inputHandlers, preprocessors, encoders, databasePath, cacheFolder, logger, paddingSym): # Use summary and description (concatenated) to address this problem logger.info("Using Description information.") # Loading word embedding lexicon, embedding = load_embedding(descOpts, paddingSym) logger.info("Lexicon size: %d" % (lexicon.getLen())) logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize())) paddingId = lexicon.getLexiconIndex(paddingSym) # Loading Filters filters = loadFilters(descOpts['filters']) # Tokenizer if descOpts['tokenizer'] == 'default': logger.info("Use default tokenizer to tokenize summary information") tokenizer = MultiLineTokenizer() elif descOpts['tokenizer'] == 'white_space': logger.info("Use white space tokenizer to tokenize summary information") tokenizer = WhitespaceTokenizer() else: raise ArgumentError( "Tokenizer value %s is invalid. You should choose one of these: default and white_space" % descOpts['tokenizer']) arguments = ( databasePath, descOpts['word_embedding'], str(descOpts['lexicon']), ' '.join(sorted([fil.__class__.__name__ for fil in filters])), descOpts['tokenizer'], "description") descCache = PreprocessingCache(cacheFolder, arguments) descPreprocessor = DescriptionPreprocessor(lexicon, bugReportDatabase, filters, tokenizer, paddingId, descCache) preprocessors.append(descPreprocessor) if descOpts['encoder_type'] == 'rnn': rnnType = descOpts.get('rnn_type') hiddenSize = descOpts.get('hidden_size') bidirectional = descOpts.get('bidirectional', False) numLayers = descOpts.get('num_layers', 1) dropout = descOpts.get('dropout', 0.0) updateEmb = descOpts.get('update_embedding', False) fixedOpt = descOpts.get('fixed_opt', False) descRNN = SortedRNNEncoder(rnnType, embedding, hiddenSize, numLayers, bidirectional, updateEmb, dropout) if fixedOpt == 'self_att': att = SelfAttention(descRNN.getOutputSize(), descOpts['self_att_hidden'], descOpts['n_hops']) descEncoder = RNN_Self_Attention(descRNN, att, paddingId, dropout) else: descEncoder = RNNFixedOuput(descRNN, fixedOpt, dropout) encoders.append(descEncoder) inputHandlers.append(RNNInputHandler(paddingId)) elif descOpts['encoder_type'] == 'cnn': windowSizes = descOpts.get('window_sizes', [3]) nFilters = descOpts.get('nfilters', 100) updateEmb = descOpts.get('update_embedding', False) actFunc = loadActivationFunction(descOpts.get('activation', 'relu')) batchNorm = descOpts.get('batch_normalization', False) dropout = descOpts.get('dropout', 0.0) descEncoder = TextCNN(windowSizes, nFilters, embedding, updateEmb, actFunc, batchNorm, dropout) encoders.append(descEncoder) inputHandlers.append(TextCNNInputHandler(paddingId, max(windowSizes))) elif descOpts['encoder_type'] == 'cnn+dense': windowSizes = descOpts.get('window_sizes', [3]) nFilters = descOpts.get('nfilters', 100) updateEmb = descOpts.get('update_embedding', False) actFunc = loadActivationFunction(descOpts.get('activation', 'relu')) batchNorm = descOpts.get('batch_normalization', False) dropout = descOpts.get('dropout', 0.0) hiddenSizes = descOpts.get('hidden_sizes') hiddenAct = loadActivationClass(descOpts.get('hidden_act')) hiddenDropout = descOpts.get('hidden_dropout') batchLast = descOpts.get("bn_last_layer", False) cnnEnc = TextCNN(windowSizes, nFilters, embedding, updateEmb, actFunc, batchNorm, dropout) descEncoder = MultilayerDense(cnnEnc, hiddenSizes, hiddenAct, batchNorm, batchLast, hiddenDropout) encoders.append(descEncoder) inputHandlers.append(TextCNNInputHandler(paddingId, max(windowSizes))) elif descOpts['encoder_type'] == 'dense+self_att': dropout = descOpts.get('dropout', 0.0) hiddenSize = descOpts.get('hidden_size') self_att_hidden = descOpts['self_att_hidden'] n_hops = descOpts['n_hops'] updateEmb = descOpts.get('update_embedding', False) descEncoder = Dense_Self_Attention(embedding, hiddenSize, self_att_hidden, n_hops, paddingId, updateEmb, dropout=dropout) encoders.append(descEncoder) inputHandlers.append(TextCNNInputHandler(paddingId, -1)) elif descOpts['encoder_type'] == 'word_mean': standardization = descOpts.get('standardization', False) dropout = descOpts.get('dropout', 0.0) updateEmb = descOpts.get('update_embedding', False) batch_normalization = descOpts.get('update_embedding', False) hiddenSize = descOpts.get('hidden_size') descEncoder = WordMean( embedding, updateEmb, hiddenSize, standardization, dropout, batch_normalization) encoders.append(descEncoder) inputHandlers.append(RNNInputHandler(paddingId)) else: raise ArgumentError( "Encoder type of summary and description is invalid (%s). You should choose one of these: cnn" % descOpts['encoder_type'])
def processSumDescParam(sum_desc_opts, bugReportDatabase, inputHandlers, preprocessors, encoders, cacheFolder, databasePath, logger, paddingSym): # Use summary and description (concatenated) to address this problem logger.info("Using summary and description information.") # Loading word embedding lexicon, embedding = load_embedding(sum_desc_opts, paddingSym) logger.info("Lexicon size: %d" % (lexicon.getLen())) logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize())) paddingId = lexicon.getLexiconIndex(paddingSym) # Loading Filters filters = loadFilters(sum_desc_opts['filters']) # Tokenizer if sum_desc_opts['tokenizer'] == 'default': logger.info("Use default tokenizer to tokenize summary+description information") tokenizer = MultiLineTokenizer() elif sum_desc_opts['tokenizer'] == 'white_space': logger.info("Use white space tokenizer to tokenize summary+description information") tokenizer = WhitespaceTokenizer() else: raise ArgumentError( "Tokenizer value %s is invalid. You should choose one of these: default and white_space" % sum_desc_opts['tokenizer']) arguments = ( databasePath, sum_desc_opts['word_embedding'], ' '.join(sorted([fil.__class__.__name__ for fil in filters])), sum_desc_opts['tokenizer'], "summary_description") cacheSumDesc = PreprocessingCache(cacheFolder, arguments) sumDescPreprocessor = SummaryDescriptionPreprocessor(lexicon, bugReportDatabase, filters, tokenizer, paddingId, cacheSumDesc) preprocessors.append(sumDescPreprocessor) if sum_desc_opts['encoder_type'] == 'cnn': windowSizes = sum_desc_opts.get('window_sizes', [3]) nFilters = sum_desc_opts.get('nfilters', 100) updateEmb = sum_desc_opts.get('update_embedding', False) actFunc = loadActivationFunction(sum_desc_opts.get('activation', 'relu')) batchNorm = sum_desc_opts.get('batch_normalization', False) dropout = sum_desc_opts.get('dropout', 0.0) sumDescEncoder = TextCNN(windowSizes, nFilters, embedding, updateEmb, actFunc, batchNorm, dropout) encoders.append(sumDescEncoder) inputHandlers.append(TextCNNInputHandler(paddingId, max(windowSizes))) elif sum_desc_opts['encoder_type'] == 'cnn+dense': windowSizes = sum_desc_opts.get('window_sizes', [3]) nFilters = sum_desc_opts.get('nfilters', 100) updateEmb = sum_desc_opts.get('update_embedding', False) actFunc = loadActivationFunction(sum_desc_opts.get('activation', 'relu')) batchNorm = sum_desc_opts.get('batch_normalization', False) dropout = sum_desc_opts.get('dropout', 0.0) hiddenSizes = sum_desc_opts.get('hidden_sizes') hiddenAct = loadActivationClass(sum_desc_opts.get('hidden_act')) hiddenDropout = sum_desc_opts.get('hidden_dropout') batchLast = sum_desc_opts.get("bn_last_layer", False) cnnEnc = TextCNN(windowSizes, nFilters, embedding, updateEmb, actFunc, batchNorm, dropout) sumDescEncoder = MultilayerDense(cnnEnc, hiddenSizes, hiddenAct, batchNorm, batchLast, hiddenDropout) encoders.append(sumDescEncoder) inputHandlers.append(TextCNNInputHandler(paddingId, max(windowSizes))) elif sum_desc_opts['encoder_type'] == 'word_mean': standardization = sum_desc_opts.get('standardization', False) dropout = sum_desc_opts.get('dropout', 0.0) updateEmb = sum_desc_opts.get('update_embedding', False) batch_normalization = sum_desc_opts.get('update_embedding', False) hiddenSize = sum_desc_opts.get('hidden_size') sumDescEncoder = WordMean(embedding, updateEmb, hiddenSize, standardization, dropout, batch_normalization) encoders.append(sumDescEncoder) inputHandlers.append(RNNInputHandler(paddingId)) else: raise ArgumentError( "Encoder type of summary and description is invalid (%s). You should choose one of these: cnn" % sum_desc_opts['encoder_type'])
def main(_run, _config, _seed, _log): # Setting logger args = _config logger = _log logger.info(args) logger.info('It started at: %s' % datetime.now()) torch.manual_seed(_seed) device = torch.device('cuda' if args['cuda'] else "cpu") if args['cuda']: logger.info("Turning CUDA on") else: logger.info("Turning CUDA off") # Setting the parameter to save and loading parameters important_parameters = ['dbr_cnn'] parameters_to_save = dict([(name, args[name]) for name in important_parameters]) if args['load'] is not None: map_location = ( lambda storage, loc: storage.cuda()) if args['cuda'] else 'cpu' model_info = torch.load(args['load'], map_location=map_location) model_state = model_info['model'] for param_name, param_value in model_info['params'].items(): args[param_name] = param_value else: model_state = None # Set basic variables preprocessors = PreprocessorList() input_handlers = [] report_database = BugReportDatabase.fromJson(args['bug_database']) batchSize = args['batch_size'] dbr_cnn_opt = args['dbr_cnn'] # Loading word embedding and lexicon emb = np.load(dbr_cnn_opt["word_embedding"]) padding_sym = "</s>" lexicon = Lexicon(unknownSymbol=None) with codecs.open(dbr_cnn_opt["lexicon"]) as f: for l in f: lexicon.put(l.strip()) lexicon.setUnknown("UUUKNNN") padding_id = lexicon.getLexiconIndex(padding_sym) embedding = Embedding(lexicon, emb, paddingIdx=padding_id) logger.info("Lexicon size: %d" % (lexicon.getLen())) logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize())) # Load filters and tokenizer filters = loadFilters(dbr_cnn_opt['filters']) if dbr_cnn_opt['tokenizer'] == 'default': logger.info("Use default tokenizer to tokenize summary information") tokenizer = MultiLineTokenizer() elif dbr_cnn_opt['tokenizer'] == 'white_space': logger.info( "Use white space tokenizer to tokenize summary information") tokenizer = WhitespaceTokenizer() else: raise ArgumentError( "Tokenizer value %s is invalid. You should choose one of these: default and white_space" % dbr_cnn_opt['tokenizer']) # Add preprocessors preprocessors.append( DBR_CNN_CategoricalPreprocessor(dbr_cnn_opt['categorical_lexicon'], report_database)) preprocessors.append( SummaryDescriptionPreprocessor(lexicon, report_database, filters, tokenizer, padding_id)) # Add input_handlers input_handlers.append(DBRDCNN_CategoricalInputHandler()) input_handlers.append( TextCNNInputHandler(padding_id, min(dbr_cnn_opt["window"]))) # Create Model model = DBR_CNN(embedding, dbr_cnn_opt["window"], dbr_cnn_opt["nfilters"], dbr_cnn_opt['update_embedding']) model.to(device) if model_state: model.load_state_dict(model_state) # Set loss function logger.info("Using BCE Loss") loss_fn = BCELoss() loss_no_reduction = BCELoss(reduction='none') cmp_collate = PairBugCollate(input_handlers, torch.float32, unsqueeze_target=True) # Loading the training and setting how the negative example will be generated. if args.get('pairs_training'): negative_pair_gen_opt = args.get('neg_pair_generator', ) pairsTrainingFile = args.get('pairs_training') random_anchor = negative_pair_gen_opt['random_anchor'] offlineGeneration = not (negative_pair_gen_opt is None or negative_pair_gen_opt['type'] == 'none') if not offlineGeneration: logger.info("Not generate dynamically the negative examples.") pair_training_reader = PairBugDatasetReader( pairsTrainingFile, preprocessors, randomInvertPair=args['random_switch']) else: pair_gen_type = negative_pair_gen_opt['type'] master_id_by_bug_id = report_database.getMasterIdByBugId() if pair_gen_type == 'random': logger.info("Random Negative Pair Generator") training_dataset = BugDataset( negative_pair_gen_opt['training']) bug_ids = training_dataset.bugIds logger.info( "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d" % (training_dataset.info, len(bug_ids))) negative_pair_generator = RandomGenerator( preprocessors, cmp_collate, negative_pair_gen_opt['rate'], bug_ids, master_id_by_bug_id) elif pair_gen_type == 'non_negative': logger.info("Non Negative Pair Generator") training_dataset = BugDataset( negative_pair_gen_opt['training']) bug_ids = training_dataset.bugIds logger.info( "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d" % (training_dataset.info, len(bug_ids))) negative_pair_generator = NonNegativeRandomGenerator( preprocessors, cmp_collate, negative_pair_gen_opt['rate'], bug_ids, master_id_by_bug_id, negative_pair_gen_opt['n_tries'], device, randomAnchor=random_anchor) elif pair_gen_type == 'misc_non_zero': logger.info("Misc Non Zero Pair Generator") training_dataset = BugDataset( negative_pair_gen_opt['training']) bug_ids = training_dataset.bugIds logger.info( "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d" % (training_dataset.info, len(bug_ids))) negative_pair_generator = MiscNonZeroRandomGen( preprocessors, cmp_collate, negative_pair_gen_opt['rate'], bug_ids, training_dataset.duplicateIds, master_id_by_bug_id, device, negative_pair_gen_opt['n_tries'], negative_pair_gen_opt['random_anchor']) elif pair_gen_type == 'random_k': logger.info("Random K Negative Pair Generator") training_dataset = BugDataset( negative_pair_gen_opt['training']) bug_ids = training_dataset.bugIds logger.info( "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d" % (training_dataset.info, len(bug_ids))) negative_pair_generator = KRandomGenerator( preprocessors, cmp_collate, negative_pair_gen_opt['rate'], bug_ids, master_id_by_bug_id, negative_pair_gen_opt['k'], device) elif pair_gen_type == "pre": logger.info("Pre-selected list generator") negative_pair_generator = PreSelectedGenerator( negative_pair_gen_opt['pre_list_file'], preprocessors, negative_pair_gen_opt['rate'], master_id_by_bug_id, negative_pair_gen_opt['preselected_length']) elif pair_gen_type == "misc_non_zero_pre": logger.info("Pre-selected list generator") negativePairGenerator1 = PreSelectedGenerator( negative_pair_gen_opt['pre_list_file'], preprocessors, negative_pair_gen_opt['rate'], master_id_by_bug_id, negative_pair_gen_opt['preselected_length']) training_dataset = BugDataset( negative_pair_gen_opt['training']) bug_ids = training_dataset.bugIds negativePairGenerator2 = NonNegativeRandomGenerator( preprocessors, cmp_collate, negative_pair_gen_opt['rate'], bug_ids, master_id_by_bug_id, device, negative_pair_gen_opt['n_tries']) negative_pair_generator = MiscOfflineGenerator( (negativePairGenerator1, negativePairGenerator2)) else: raise ArgumentError( "Offline generator is invalid (%s). You should choose one of these: random, hard and pre" % pair_gen_type) pair_training_reader = PairBugDatasetReader( pairsTrainingFile, preprocessors, negative_pair_generator, randomInvertPair=args['random_switch']) training_loader = DataLoader(pair_training_reader, batch_size=batchSize, collate_fn=cmp_collate.collate, shuffle=True) logger.info("Training size: %s" % (len(training_loader.dataset))) # load validation if args.get('pairs_validation'): pair_validation_reader = PairBugDatasetReader( args.get('pairs_validation'), preprocessors) validation_loader = DataLoader(pair_validation_reader, batch_size=batchSize, collate_fn=cmp_collate.collate) logger.info("Validation size: %s" % (len(validation_loader.dataset))) else: validation_loader = None """ Training and evaluate the model. """ optimizer_opt = args.get('optimizer', 'adam') if optimizer_opt == 'sgd': logger.info('SGD') optimizer = optim.SGD(model.parameters(), lr=args['lr'], weight_decay=args['l2'], momentum=args['momentum']) elif optimizer_opt == 'adam': logger.info('Adam') optimizer = optim.Adam(model.parameters(), lr=args['lr'], weight_decay=args['l2']) # Recall rate ranking_scorer = DBR_CNN_Scorer(preprocessors[0], preprocessors[1], input_handlers[0], input_handlers[1], model, device, args['ranking_batch_size']) recallEstimationTrainOpt = args.get('recall_estimation_train') if recallEstimationTrainOpt: preselectListRankingTrain = PreselectListRanking( recallEstimationTrainOpt) recallEstimationOpt = args.get('recall_estimation') if recallEstimationOpt: preselect_list_ranking = PreselectListRanking(recallEstimationOpt) lr_scheduler_opt = args.get('lr_scheduler', None) if lr_scheduler_opt is None or lr_scheduler_opt['type'] == 'constant': logger.info("Scheduler: Constant") lr_sched = None elif lr_scheduler_opt["type"] == 'step': logger.info("Scheduler: StepLR (step:%s, decay:%f)" % (lr_scheduler_opt["step_size"], args["decay"])) lr_sched = StepLR(optimizer, lr_scheduler_opt["step_size"], lr_scheduler_opt["decay"]) elif lr_scheduler_opt["type"] == 'exp': logger.info("Scheduler: ExponentialLR (decay:%f)" % (lr_scheduler_opt["decay"])) lr_sched = ExponentialLR(optimizer, lr_scheduler_opt["decay"]) elif lr_scheduler_opt["type"] == 'linear': logger.info( "Scheduler: Divide by (1 + epoch * decay) ---- (decay:%f)" % (lr_scheduler_opt["decay"])) lrDecay = lr_scheduler_opt["decay"] lr_sched = LambdaLR(optimizer, lambda epoch: 1 / (1.0 + epoch * lrDecay)) else: raise ArgumentError( "LR Scheduler is invalid (%s). You should choose one of these: step, exp and linear " % pair_gen_type) # Set training functions def trainingIteration(engine, batch): model.train() optimizer.zero_grad() x, y = cmp_collate.to(batch, device) output = model(*x) loss = loss_fn(output, y) loss.backward() optimizer.step() return loss, output, y trainer = Engine(trainingIteration) negTarget = 0.0 if isinstance(loss_fn, NLLLoss) else -1.0 trainingMetrics = { 'training_loss': AverageLoss(loss_fn), 'training_acc': AccuracyWrapper(output_transform=thresholded_output_transform), 'training_precision': PrecisionWrapper(output_transform=thresholded_output_transform), 'training_recall': RecallWrapper(output_transform=thresholded_output_transform), } # Add metrics to trainer for name, metric in trainingMetrics.items(): metric.attach(trainer, name) # Set validation functions def validationIteration(engine, batch): model.eval() with torch.no_grad(): x, y = cmp_collate.to(batch, device) y_pred = model(*x) return y_pred, y validationMetrics = { 'validation_loss': LossWrapper(loss_fn), 'validation_acc': AccuracyWrapper(output_transform=thresholded_output_transform), 'validation_precision': PrecisionWrapper(output_transform=thresholded_output_transform), 'validation_recall': RecallWrapper(output_transform=thresholded_output_transform), } evaluator = Engine(validationIteration) # Add metrics to evaluator for name, metric in validationMetrics.items(): metric.attach(evaluator, name) @trainer.on(Events.EPOCH_STARTED) def onStartEpoch(engine): epoch = engine.state.epoch logger.info("Epoch: %d" % epoch) if lr_sched: lr_sched.step() logger.info("LR: %s" % str(optimizer.param_groups[0]["lr"])) @trainer.on(Events.EPOCH_COMPLETED) def onEndEpoch(engine): epoch = engine.state.epoch logMetrics(_run, logger, engine.state.metrics, epoch) # Evaluate Training if validation_loader: evaluator.run(validation_loader) logMetrics(_run, logger, evaluator.state.metrics, epoch) lastEpoch = args['epochs'] - epoch == 0 if recallEstimationTrainOpt and (epoch % args['rr_train_epoch'] == 0): logRankingResult(_run, logger, preselectListRankingTrain, ranking_scorer, report_database, None, epoch, "train") ranking_scorer.free() if recallEstimationOpt and (epoch % args['rr_val_epoch'] == 0): logRankingResult(_run, logger, preselect_list_ranking, ranking_scorer, report_database, args.get("ranking_result_file"), epoch, "validation") ranking_scorer.free() if not lastEpoch: pair_training_reader.sampleNewNegExamples(model, loss_no_reduction) if args.get('save'): save_by_epoch = args['save_by_epoch'] if save_by_epoch and epoch in save_by_epoch: file_name, file_extension = os.path.splitext(args['save']) file_path = file_name + '_epoch_{}'.format( epoch) + file_extension else: file_path = args['save'] modelInfo = { 'model': model.state_dict(), 'params': parameters_to_save } logger.info("==> Saving Model: %s" % file_path) torch.save(modelInfo, file_path) if args.get('pairs_training'): trainer.run(training_loader, max_epochs=args['epochs']) elif args.get('pairs_validation'): # Evaluate Training evaluator.run(validation_loader) logMetrics(logger, evaluator.state.metrics) if recallEstimationOpt: logRankingResult(_run, logger, preselect_list_ranking, ranking_scorer, report_database, args.get("ranking_result_file"), 0, "validation") # Test Dataset (accuracy, recall, precision, F1) pair_test_dataset = args.get('pair_test_dataset') if pair_test_dataset is not None and len(pair_test_dataset) > 0: pairTestReader = PairBugDatasetReader(pair_test_dataset, preprocessors) testLoader = DataLoader(pairTestReader, batch_size=batchSize, collate_fn=cmp_collate.collate) if not isinstance(cmp_collate, PairBugCollate): raise NotImplementedError( 'Evaluation of pairs using tanh was not implemented yet') logger.info("Test size: %s" % (len(testLoader.dataset))) testMetrics = { 'test_accuracy': ignite.metrics.Accuracy( output_transform=thresholded_output_transform), 'test_precision': ignite.metrics.Precision( output_transform=thresholded_output_transform), 'test_recall': ignite.metrics.Recall( output_transform=thresholded_output_transform), 'test_predictions': PredictionCache(), } test_evaluator = Engine(validationIteration) # Add metrics to evaluator for name, metric in testMetrics.items(): metric.attach(test_evaluator, name) test_evaluator.run(testLoader) for metricName, metricValue in test_evaluator.state.metrics.items(): metric = testMetrics[metricName] if isinstance(metric, ignite.metrics.Accuracy): logger.info({ 'type': 'metric', 'label': metricName, 'value': metricValue, 'epoch': None, 'correct': metric._num_correct, 'total': metric._num_examples }) _run.log_scalar(metricName, metricValue) elif isinstance(metric, (ignite.metrics.Precision, ignite.metrics.Recall)): logger.info({ 'type': 'metric', 'label': metricName, 'value': metricValue, 'epoch': None, 'tp': metric._true_positives.item(), 'total_positive': metric._positives.item() }) _run.log_scalar(metricName, metricValue) elif isinstance(metric, ConfusionMatrix): acc = cmAccuracy(metricValue) prec = cmPrecision(metricValue, False) recall = cmRecall(metricValue, False) f1 = 2 * (prec * recall) / (prec + recall + 1e-15) logger.info({ 'type': 'metric', 'label': metricName, 'accuracy': np.float(acc), 'precision': prec.cpu().numpy().tolist(), 'recall': recall.cpu().numpy().tolist(), 'f1': f1.cpu().numpy().tolist(), 'confusion_matrix': metricValue.cpu().numpy().tolist(), 'epoch': None }) _run.log_scalar('test_f1', f1[1]) elif isinstance(metric, PredictionCache): logger.info({ 'type': 'metric', 'label': metricName, 'predictions': metric.predictions }) # Calculate recall rate recall_rate_opt = args.get('recall_rate', {'type': 'none'}) if recall_rate_opt['type'] != 'none': if recall_rate_opt['type'] == 'sun2011': logger.info("Calculating recall rate: {}".format( recall_rate_opt['type'])) recall_rate_dataset = BugDataset(recall_rate_opt['dataset']) ranking_class = SunRanking(report_database, recall_rate_dataset, recall_rate_opt['window']) # We always group all bug reports by master in the results in the sun 2011 methodology group_by_master = True elif recall_rate_opt['type'] == 'deshmukh': logger.info("Calculating recall rate: {}".format( recall_rate_opt['type'])) recall_rate_dataset = BugDataset(recall_rate_opt['dataset']) ranking_class = DeshmukhRanking(report_database, recall_rate_dataset) group_by_master = recall_rate_opt['group_by_master'] else: raise ArgumentError( "recall_rate.type is invalid (%s). You should choose one of these: step, exp and linear " % recall_rate_opt['type']) logRankingResult( _run, logger, ranking_class, ranking_scorer, report_database, recall_rate_opt["result_file"], 0, None, group_by_master, )