def __init__(self, tripletFile, bugDataExtractor, trainingDatasetPath,
                 bugDatabase):
        """
        Load the triplets listed in a file and extract the features of each
        (bug, duplicate bug) pair.

        :param tripletFile: path of a text file that has one triplet per line
            in the form ``bugId,duplicateBugId,nonDuplicateBugId``.
        :param bugDataExtractor: object whose ``extract(bugId)`` method
            returns the features of a bug report.
        :param trainingDatasetPath: value forwarded to ``BugDataset``; the
            ``end`` attribute of the resulting dataset defines how many
            entries of ``bugDatabase.bugList`` are collected.
        :param bugDatabase: object that exposes ``bugList`` (indexable, each
            item has a ``'bug_id'`` key) and ``getMasterIdByBugId(bugIds)``.
        """
        # Store the feature generated by bug
        self.inputBug = []
        self.inputDuplicateBug = []
        self.cache = {}
        self.trainingDataset = BugDataset(trainingDatasetPath)
        self.pairs = []

        self.bugIds = [
            bugDatabase.bugList[idx]['bug_id']
            for idx in range(self.trainingDataset.end)
        ]
        self.masterIdByBugId = bugDatabase.getMasterIdByBugId(self.bugIds)
        self.bugDataExtractor = bugDataExtractor

        self.logger = logging.getLogger(__name__)

        # The context manager guarantees that the file handle is released,
        # even when a malformed line or the extractor raises an exception.
        with open(tripletFile, 'r') as tripletStream:
            for rawLine in tripletStream:
                line = rawLine.strip()

                # Blank lines carry no triplet and would break the unpacking.
                if not line:
                    continue

                # The non-duplicate id is not used here; unpacking the three
                # fields still validates the format of the line.
                bugId, duplicateBugId, _nonDuplicateBugId = line.split(',')

                ftrsBug = bugDataExtractor.extract(bugId)
                ftrsDuplicateBug = bugDataExtractor.extract(duplicateBugId)

                self.inputBug.append(ftrsBug)
                self.inputDuplicateBug.append(ftrsDuplicateBug)
                self.pairs.append((bugId, duplicateBugId))
Exemple #2
0
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--bug_data', required=True, help="")
    parser.add_argument('--dataset', required=True, help="")
    parser.add_argument('--list_size', required=True, type=int, help="")
    parser.add_argument('--type', required=True, help="")
    parser.add_argument('--save', help="")
    parser.add_argument('--model', help="")
    parser.add_argument('--nproc', type=int, default=6, help="")
    parser.add_argument('--same_prod', action='store_true', help="")

    logging.basicConfig(level=logging.DEBUG, datefmt='%Y-%m-%d %H:%M:%S')
    logger = logging.getLogger()
    args = parser.parse_args()
    logger.info(args)

    bugDataset = BugDataset(args.dataset)

    bugIds = bugDataset.bugIds
    duplicateBugs = bugDataset.duplicateIds

    if args.type in set(['tfidf', 'binary']):
        # Insert imports to load TfIdfVectorizer class
        from data.bug_dataset import BugDataset

        bugReportDatabase = BugReportDatabase.fromJson(args.bug_data)
        masterBugIdByBugId = bugReportDatabase.getMasterIdByBugId()

        vectorizer = pickle.load(open(args.model, 'rb'))
        normalize = True if args.type == 'tfidf' else False

        negativeSimMatrix = generateNegativeListSparseVector(args.list_size, bugReportDatabase, bugIds, vectorizer,
Exemple #3
0
        "Performs the recall rate estimation each epoch. This parameter receives the file that contains the list of bug ids."
    )

    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    logHandler = logging.StreamHandler()
    formatter = JsonLogFormatter()
    logHandler.setFormatter(formatter)
    logger.addHandler(logHandler)

    args = parser.parse_args()

    logger.info(args.__dict__)

    args.recall_ratio_k = [int(k) for k in args.recall_ratio_k]
    bugSetDataset = BugDataset(args.input)
    bugReportDatabase = BugReportDatabase.fromJson(args.bug_dataset)

    if args.recall_estimation:
        bugIds, listByBugId = pickle.load(open(args.recall_estimation, 'rb'))
        duplicateBugs = list(listByBugId.keys())
    else:
        listByBugId = None
        bugIds = []

        for idx in range(len(bugReportDatabase)):
            bugIds.append(bugReportDatabase.getBugByIndex(idx)['bug_id'])

        duplicateBugs = bugSetDataset.duplicateIdxs

    similarityListByDuplicate = []
Exemple #4
0
def main(_run, _config, _seed, _log):
    """
    Build the SABD model and, depending on the configuration, train it,
    validate it, evaluate it on a pair test dataset and compute recall rates.

    :param _run: experiment run object. It is forwarded to ``logMetrics`` and
        ``logRankingResult`` and its ``log_scalar`` method is used to record
        the test metrics.
    :param _config: dict-like configuration. It is read with ``[]`` and
        ``.get`` and it is updated in place with the parameters stored in a
        loaded model (``load`` option).
    :param _seed: seed passed to ``torch.manual_seed``.
    :param _log: logger used for all messages.
    :return: None
    :raises ArgumentError: when the tokenizer, the loss, the offline negative
        pair generator, the LR scheduler or the recall rate type is invalid.
    :raises NotImplementedError: when a pair test dataset is provided but the
        collate object is not a ``PairBugCollate``.
    """
    # ------------------------------------------------------------------
    # Setting and loading parameters
    # ------------------------------------------------------------------
    # Setting logger
    args = _config
    logger = _log

    logger.info(args)
    logger.info('It started at: %s' % datetime.now())

    torch.manual_seed(_seed)
    bugReportDatabase = BugReportDatabase.fromJson(args['bug_database'])
    paddingSym = "</s>"
    batchSize = args['batch_size']

    device = torch.device('cuda' if args['cuda'] else "cpu")

    if args['cuda']:
        logger.info("Turning CUDA on")
    else:
        logger.info("Turning CUDA off")

    # It is the folder where the preprocessed information will be stored.
    cacheFolder = args['cache_folder']

    # Setting the parameter to save and loading parameters
    importantParameters = ['compare_aggregation', 'categorical']
    parametersToSave = {
        parName: args[parName]
        for parName in importantParameters
    }

    if args['load'] is not None:
        mapLocation = (
            lambda storage, loc: storage.cuda()) if args['cuda'] else 'cpu'
        modelInfo = torch.load(args['load'], map_location=mapLocation)
        modelState = modelInfo['model']

        # The parameters saved with the model override the configuration.
        for paramName, paramValue in modelInfo['params'].items():
            args[paramName] = paramValue
    else:
        modelState = None

    preprocessors = PreprocessorList()
    inputHandlers = []

    categoricalOpt = args.get('categorical')

    if categoricalOpt is not None and len(categoricalOpt) != 0:
        categoricalEncoder, _, _ = processCategoricalParam(
            categoricalOpt, bugReportDatabase, inputHandlers, preprocessors,
            None, logger)
    else:
        categoricalEncoder = None

    filterInputHandlers = []

    compareAggOpt = args['compare_aggregation']
    databasePath = args['bug_database']

    # Loading word embedding
    if compareAggOpt["lexicon"]:
        emb = np.load(compareAggOpt["word_embedding"])

        lexicon = Lexicon(unknownSymbol=None)
        with codecs.open(compareAggOpt["lexicon"]) as f:
            for l in f:
                lexicon.put(l.strip())

        lexicon.setUnknown("UUUKNNN")
        paddingId = lexicon.getLexiconIndex(paddingSym)
        embedding = Embedding(lexicon, emb, paddingIdx=paddingId)

        logger.info("Lexicon size: %d" % (lexicon.getLen()))
        logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize()))
    elif compareAggOpt["word_embedding"]:
        # todo: Allow use embeddings and other representation
        lexicon, embedding = Embedding.fromFile(
            compareAggOpt['word_embedding'],
            'UUUKNNN',
            hasHeader=False,
            paddingSym=paddingSym)
        logger.info("Lexicon size: %d" % (lexicon.getLen()))
        logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize()))
        paddingId = lexicon.getLexiconIndex(paddingSym)
    else:
        embedding = None

    if compareAggOpt["norm_word_embedding"]:
        embedding.zscoreNormalization()

    # Tokenizer
    if compareAggOpt['tokenizer'] == 'default':
        logger.info("Use default tokenizer to tokenize summary information")
        tokenizer = MultiLineTokenizer()
    elif compareAggOpt['tokenizer'] == 'white_space':
        logger.info(
            "Use white space tokenizer to tokenize summary information")
        tokenizer = WhitespaceTokenizer()
    else:
        raise ArgumentError(
            "Tokenizer value %s is invalid. You should choose one of these: default and white_space"
            % compareAggOpt['tokenizer'])

    # Preparing input handlers, preprocessors and cache
    # The CNN aggregator needs sequences at least as long as its largest
    # window; the other aggregators do not impose a minimum size.
    minSeqSize = max(compareAggOpt['aggregate']["window"]
                     ) if compareAggOpt['aggregate']["model"] == "cnn" else -1
    bow = compareAggOpt.get('bow', False)
    freq = compareAggOpt.get('frequency', False) and bow

    logger.info("BoW={} and TF={}".format(bow, freq))

    if compareAggOpt['extractor'] is not None:
        # Use summary and description (concatenated) to address this problem
        logger.info("Using Summary and Description information.")
        # Loading Filters
        extractorFilters = loadFilters(compareAggOpt['extractor']['filters'])

        # Values that identify the preprocessing setup in the cache.
        arguments = (databasePath, compareAggOpt['word_embedding'],
                     str(compareAggOpt['lexicon']), ' '.join(
                         sorted([
                             fil.__class__.__name__ for fil in extractorFilters
                         ])), compareAggOpt['tokenizer'], str(bow), str(freq),
                     SABDEncoderPreprocessor.__name__)

        inputHandlers.append(SABDInputHandler(paddingId, minSeqSize))
        extractorCache = PreprocessingCache(cacheFolder, arguments)

        if bow:
            extractorPreprocessor = SABDBoWPreprocessor(
                lexicon, bugReportDatabase, extractorFilters, tokenizer,
                paddingId, freq, extractorCache)
        else:
            extractorPreprocessor = SABDEncoderPreprocessor(
                lexicon, bugReportDatabase, extractorFilters, tokenizer,
                paddingId, extractorCache)
        preprocessors.append(extractorPreprocessor)

    # Create model
    model = SABD(embedding, categoricalEncoder, compareAggOpt['extractor'],
                 compareAggOpt['matching'], compareAggOpt['aggregate'],
                 compareAggOpt['classifier'], freq)

    if args['loss'] == 'bce':
        logger.info("Using BCE Loss: margin={}".format(args['margin']))
        lossFn = BCELoss()
        lossNoReduction = BCELoss(reduction='none')
        cmp_collate = PairBugCollate(inputHandlers,
                                     torch.float32,
                                     unsqueeze_target=True)
    elif args['loss'] == 'triplet':
        logger.info("Using Triplet Loss: margin={}".format(args['margin']))
        lossFn = TripletLoss(args['margin'])
        lossNoReduction = TripletLoss(args['margin'], reduction='none')
        cmp_collate = TripletBugCollate(inputHandlers)
    else:
        # Fail here with a clear message instead of a NameError on lossFn
        # further down.
        raise ArgumentError(
            "Loss value %s is invalid. You should choose one of these: bce and triplet"
            % args['loss'])

    model.to(device)

    if modelState:
        model.load_state_dict(modelState)

    # ------------------------------------------------------------------
    # Loading the training and validation. Also, it sets how the negative
    # example will be generated.
    # ------------------------------------------------------------------
    # load training
    if args.get('pairs_training'):
        negativePairGenOpt = args.get('neg_pair_generator')
        trainingFile = args.get('pairs_training')

        offlineGeneration = not (negativePairGenOpt is None
                                 or negativePairGenOpt['type'] == 'none')
        masterIdByBugId = bugReportDatabase.getMasterIdByBugId()

        def loadGeneratorDataset(announce=True):
            """Load the dataset used to sample negatives and return it with its bug ids."""
            dataset = BugDataset(negativePairGenOpt['training'])
            datasetBugIds = dataset.bugIds

            if announce:
                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (dataset.info, len(datasetBugIds)))

            return dataset, datasetBugIds

        if not offlineGeneration:
            logger.info("Not generate dynamically the negative examples.")
            negativePairGenerator = None
        else:
            pairGenType = negativePairGenOpt['type']
            # Only read when a generator is configured: the options can be
            # None when no negative pair generation was requested.
            randomAnchor = negativePairGenOpt['random_anchor']

            if pairGenType == 'random':
                logger.info("Random Negative Pair Generator")
                trainingDataset, bugIds = loadGeneratorDataset()

                negativePairGenerator = RandomGenerator(
                    preprocessors,
                    cmp_collate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    masterIdByBugId,
                    randomAnchor=randomAnchor)

            elif pairGenType == 'non_negative':
                logger.info("Non Negative Pair Generator")
                trainingDataset, bugIds = loadGeneratorDataset()

                negativePairGenerator = NonNegativeRandomGenerator(
                    preprocessors,
                    cmp_collate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    masterIdByBugId,
                    negativePairGenOpt['n_tries'],
                    device,
                    randomAnchor=randomAnchor)
            elif pairGenType == 'misc_non_zero':
                logger.info("Misc Non Zero Pair Generator")
                trainingDataset, bugIds = loadGeneratorDataset()

                negativePairGenerator = MiscNonZeroRandomGen(
                    preprocessors,
                    cmp_collate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    trainingDataset.duplicateIds,
                    masterIdByBugId,
                    negativePairGenOpt['n_tries'],
                    device,
                    randomAnchor=randomAnchor)
            elif pairGenType == 'product_component':
                logger.info("Product Component Pair Generator")
                trainingDataset, bugIds = loadGeneratorDataset()

                negativePairGenerator = ProductComponentRandomGen(
                    bugReportDatabase,
                    preprocessors,
                    cmp_collate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    masterIdByBugId,
                    negativePairGenOpt['n_tries'],
                    device,
                    randomAnchor=randomAnchor)

            elif pairGenType == 'random_k':
                logger.info("Random K Negative Pair Generator")
                trainingDataset, bugIds = loadGeneratorDataset()

                negativePairGenerator = KRandomGenerator(
                    preprocessors,
                    cmp_collate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    masterIdByBugId,
                    negativePairGenOpt['k'],
                    device,
                    randomAnchor=randomAnchor)
            elif pairGenType == "pre":
                logger.info("Pre-selected list generator")
                negativePairGenerator = PreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'],
                    preprocessors,
                    negativePairGenOpt['rate'],
                    masterIdByBugId,
                    negativePairGenOpt['preselected_length'],
                    randomAnchor=randomAnchor)

            elif pairGenType == "positive_pre":
                logger.info("Positive Pre-selected list generator")
                negativePairGenerator = PositivePreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'],
                    preprocessors,
                    cmp_collate,
                    negativePairGenOpt['rate'],
                    masterIdByBugId,
                    negativePairGenOpt['preselected_length'],
                    randomAnchor=randomAnchor)
            elif pairGenType == "misc_non_zero_pre":
                logger.info("Misc: non-zero and Pre-selected list generator")
                negativePairGenerator1 = PreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'],
                    preprocessors,
                    negativePairGenOpt['rate'],
                    masterIdByBugId,
                    negativePairGenOpt['preselected_length'],
                    randomAnchor=randomAnchor)

                trainingDataset, bugIds = loadGeneratorDataset(announce=False)

                negativePairGenerator2 = NonNegativeRandomGenerator(
                    preprocessors,
                    cmp_collate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    masterIdByBugId,
                    negativePairGenOpt['n_tries'],
                    device,
                    randomAnchor=randomAnchor)

                negativePairGenerator = MiscOfflineGenerator(
                    (negativePairGenerator1, negativePairGenerator2))
            elif pairGenType == "misc_non_zero_positive_pre":
                logger.info(
                    "Misc: non-zero and Positive Pre-selected list generator")
                negativePairGenerator1 = PositivePreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'],
                    preprocessors,
                    cmp_collate,
                    negativePairGenOpt['rate'],
                    masterIdByBugId,
                    negativePairGenOpt['preselected_length'],
                    randomAnchor=randomAnchor)

                trainingDataset, bugIds = loadGeneratorDataset(announce=False)

                negativePairGenerator2 = NonNegativeRandomGenerator(
                    preprocessors,
                    cmp_collate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    masterIdByBugId,
                    negativePairGenOpt['n_tries'],
                    device,
                    randomAnchor=randomAnchor)

                negativePairGenerator = MiscOfflineGenerator(
                    (negativePairGenerator1, negativePairGenerator2))

            else:
                raise ArgumentError(
                    "Offline generator is invalid (%s). You should choose one of these: "
                    "random, non_negative, misc_non_zero, product_component, random_k, pre, "
                    "positive_pre, misc_non_zero_pre and misc_non_zero_positive_pre"
                    % pairGenType)

        if isinstance(lossFn, BCELoss):
            training_reader = PairBugDatasetReader(
                trainingFile,
                preprocessors,
                negativePairGenerator,
                randomInvertPair=args['random_switch'])
        elif isinstance(lossFn, TripletLoss):
            training_reader = TripletBugDatasetReader(
                trainingFile,
                preprocessors,
                negativePairGenerator,
                randomInvertPair=args['random_switch'])

        trainingLoader = DataLoader(training_reader,
                                    batch_size=batchSize,
                                    collate_fn=cmp_collate.collate,
                                    shuffle=True)
        logger.info("Training size: %s" % (len(trainingLoader.dataset)))

    # load validation
    if args.get('pairs_validation'):
        if isinstance(lossFn, BCELoss):
            validation_reader = PairBugDatasetReader(
                args.get('pairs_validation'), preprocessors)
        elif isinstance(lossFn, TripletLoss):
            validation_reader = TripletBugDatasetReader(
                args.get('pairs_validation'), preprocessors)

        validationLoader = DataLoader(validation_reader,
                                      batch_size=batchSize,
                                      collate_fn=cmp_collate.collate)

        logger.info("Validation size: %s" % (len(validationLoader.dataset)))
    else:
        validationLoader = None

    # ------------------------------------------------------------------
    # Training and evaluate the model.
    # ------------------------------------------------------------------
    optimizer_opt = args.get('optimizer', 'adam')

    if optimizer_opt == 'sgd':
        logger.info('SGD')
        optimizer = optim.SGD(model.parameters(),
                              lr=args['lr'],
                              weight_decay=args['l2'])
    elif optimizer_opt == 'adam':
        logger.info('Adam')
        optimizer = optim.Adam(model.parameters(),
                               lr=args['lr'],
                               weight_decay=args['l2'])

    # Recall rate
    rankingScorer = GeneralScorer(
        model, preprocessors, device,
        PairBugCollate(inputHandlers, ignore_target=True),
        args['ranking_batch_size'], args['ranking_n_workers'])
    recallEstimationTrainOpt = args.get('recall_estimation_train')

    if recallEstimationTrainOpt:
        preselectListRankingTrain = PreselectListRanking(
            recallEstimationTrainOpt, args['sample_size_rr_tr'])

    recallEstimationOpt = args.get('recall_estimation')

    if recallEstimationOpt:
        preselectListRanking = PreselectListRanking(recallEstimationOpt,
                                                    args['sample_size_rr_val'])

    # LR scheduler
    lrSchedulerOpt = args.get('lr_scheduler', None)

    if lrSchedulerOpt is None:
        logger.info("Scheduler: Constant")
        lrSched = None
    elif lrSchedulerOpt["type"] == 'step':
        # The decay belongs to the scheduler options, not to the top-level
        # configuration.
        logger.info("Scheduler: StepLR (step:%s, decay:%f)" %
                    (lrSchedulerOpt["step_size"], lrSchedulerOpt["decay"]))
        lrSched = StepLR(optimizer, lrSchedulerOpt["step_size"],
                         lrSchedulerOpt["decay"])
    elif lrSchedulerOpt["type"] == 'exp':
        logger.info("Scheduler: ExponentialLR (decay:%f)" %
                    (lrSchedulerOpt["decay"]))
        lrSched = ExponentialLR(optimizer, lrSchedulerOpt["decay"])
    elif lrSchedulerOpt["type"] == 'linear':
        logger.info(
            "Scheduler: Divide by (1 + epoch * decay) ---- (decay:%f)" %
            (lrSchedulerOpt["decay"]))

        lrDecay = lrSchedulerOpt["decay"]
        lrSched = LambdaLR(optimizer, lambda epoch: 1 /
                           (1.0 + epoch * lrDecay))
    else:
        # Report the scheduler type that was rejected (the negative pair
        # generator type is unrelated and may not even be defined here).
        raise ArgumentError(
            "LR Scheduler is invalid (%s). You should choose one of these: step, exp and linear "
            % lrSchedulerOpt["type"])

    # Set training functions
    def trainingIteration(engine, batch):
        """Run one optimization step and return (loss, model output, target)."""
        engine.kk = 0
        model.train()

        optimizer.zero_grad()
        x, y = cmp_collate.to(batch, device)
        output = model(*x)
        loss = lossFn(output, y)
        loss.backward()
        optimizer.step()
        return loss, output, y

    def scoreDistanceTrans(output):
        """Convert an engine output into the (score, target) pair used by MeanScoreDistance."""
        if len(output) == 3:
            _, y_pred, y = output
        else:
            y_pred, y = output

        if lossFn == F.nll_loss:
            return torch.exp(y_pred[:, 1]), y
        elif isinstance(lossFn, (BCELoss)):
            return y_pred, y

    trainer = Engine(trainingIteration)
    trainingMetrics = {'training_loss': AverageLoss(lossFn)}

    if isinstance(lossFn, BCELoss):
        trainingMetrics['training_dist_target'] = MeanScoreDistance(
            output_transform=scoreDistanceTrans)
        trainingMetrics['training_acc'] = AccuracyWrapper(
            output_transform=thresholded_output_transform)
        trainingMetrics['training_precision'] = PrecisionWrapper(
            output_transform=thresholded_output_transform)
        trainingMetrics['training_recall'] = RecallWrapper(
            output_transform=thresholded_output_transform)

    # Add metrics to trainer
    for name, metric in trainingMetrics.items():
        metric.attach(trainer, name)

    # Set validation functions
    def validationIteration(engine, batch):
        """Score one batch without gradient tracking and return (prediction, target)."""
        if not hasattr(engine, 'kk'):
            engine.kk = 0

        model.eval()

        with torch.no_grad():
            x, y = cmp_collate.to(batch, device)
            y_pred = model(*x)

            return y_pred, y

    validationMetrics = {
        'validation_loss':
        LossWrapper(lossFn,
                    output_transform=lambda x: (x[0], x[0][0])
                    if x[1] is None else x)
    }

    if isinstance(lossFn, BCELoss):
        validationMetrics['validation_dist_target'] = MeanScoreDistance(
            output_transform=scoreDistanceTrans)
        validationMetrics['validation_acc'] = AccuracyWrapper(
            output_transform=thresholded_output_transform)
        validationMetrics['validation_precision'] = PrecisionWrapper(
            output_transform=thresholded_output_transform)
        validationMetrics['validation_recall'] = RecallWrapper(
            output_transform=thresholded_output_transform)

    evaluator = Engine(validationIteration)

    # Add metrics to evaluator
    for name, metric in validationMetrics.items():
        metric.attach(evaluator, name)

    # recommendation
    recommendation_fn = generateRecommendationList

    @trainer.on(Events.EPOCH_STARTED)
    def onStartEpoch(engine):
        """Log the epoch number, advance the LR scheduler and log the learning rate."""
        epoch = engine.state.epoch
        logger.info("Epoch: %d" % epoch)

        if lrSched:
            lrSched.step()

        logger.info("LR: %s" % str(optimizer.param_groups[0]["lr"]))

    @trainer.on(Events.EPOCH_COMPLETED)
    def onEndEpoch(engine):
        """Log metrics, run validation/ranking, resample negatives and save the model."""
        epoch = engine.state.epoch

        logMetrics(_run, logger, engine.state.metrics, epoch)

        # Evaluate Training
        if validationLoader:
            evaluator.run(validationLoader)
            logMetrics(_run, logger, evaluator.state.metrics, epoch)

        lastEpoch = epoch == args['epochs']

        if recallEstimationTrainOpt and (epoch % args['rr_train_epoch'] == 0):
            logRankingResult(_run,
                             logger,
                             preselectListRankingTrain,
                             rankingScorer,
                             bugReportDatabase,
                             None,
                             epoch,
                             "train",
                             recommendationListfn=recommendation_fn)
            rankingScorer.free()

        if recallEstimationOpt and (epoch % args['rr_val_epoch'] == 0):
            logRankingResult(_run,
                             logger,
                             preselectListRanking,
                             rankingScorer,
                             bugReportDatabase,
                             args.get("ranking_result_file"),
                             epoch,
                             "validation",
                             recommendationListfn=recommendation_fn)
            rankingScorer.free()

        # New negatives are useless after the last epoch.
        if not lastEpoch:
            training_reader.sampleNewNegExamples(model, lossNoReduction)

        if args.get('save'):
            save_by_epoch = args['save_by_epoch']

            if save_by_epoch and epoch in save_by_epoch:
                # Keep a separate checkpoint for the requested epochs.
                file_name, file_extension = os.path.splitext(args['save'])
                file_path = file_name + '_epoch_{}'.format(
                    epoch) + file_extension
            else:
                file_path = args['save']

            modelInfo = {
                'model': model.state_dict(),
                'params': parametersToSave
            }

            logger.info("==> Saving Model: %s" % file_path)
            torch.save(modelInfo, file_path)

    if args.get('pairs_training'):
        trainer.run(trainingLoader, max_epochs=args['epochs'])
    elif args.get('pairs_validation'):
        # Evaluate Training
        evaluator.run(validationLoader)
        logMetrics(_run, logger, evaluator.state.metrics, 0)

        if recallEstimationOpt:
            logRankingResult(_run,
                             logger,
                             preselectListRanking,
                             rankingScorer,
                             bugReportDatabase,
                             args.get("ranking_result_file"),
                             0,
                             "validation",
                             recommendationListfn=recommendation_fn)

    # Test Dataset (accuracy, recall, precision, F1)
    pair_test_dataset = args.get('pair_test_dataset')

    if pair_test_dataset is not None and len(pair_test_dataset) > 0:
        pairTestReader = PairBugDatasetReader(pair_test_dataset, preprocessors)
        testLoader = DataLoader(pairTestReader,
                                batch_size=batchSize,
                                collate_fn=cmp_collate.collate)

        if not isinstance(cmp_collate, PairBugCollate):
            raise NotImplementedError(
                'Evaluation of pairs using tanh was not implemented yet')

        logger.info("Test size: %s" % (len(testLoader.dataset)))

        testMetrics = {
            'test_accuracy':
            ignite.metrics.Accuracy(
                output_transform=thresholded_output_transform),
            'test_precision':
            ignite.metrics.Precision(
                output_transform=thresholded_output_transform),
            'test_recall':
            ignite.metrics.Recall(
                output_transform=thresholded_output_transform),
            'test_predictions':
            PredictionCache(),
        }
        test_evaluator = Engine(validationIteration)

        # Add metrics to evaluator
        for name, metric in testMetrics.items():
            metric.attach(test_evaluator, name)

        test_evaluator.run(testLoader)

        for metricName, metricValue in test_evaluator.state.metrics.items():
            metric = testMetrics[metricName]

            if isinstance(metric, ignite.metrics.Accuracy):
                logger.info({
                    'type': 'metric',
                    'label': metricName,
                    'value': metricValue,
                    'epoch': None,
                    'correct': metric._num_correct,
                    'total': metric._num_examples
                })
                _run.log_scalar(metricName, metricValue)
            elif isinstance(metric,
                            (ignite.metrics.Precision, ignite.metrics.Recall)):
                logger.info({
                    'type': 'metric',
                    'label': metricName,
                    'value': metricValue,
                    'epoch': None,
                    'tp': metric._true_positives.item(),
                    'total_positive': metric._positives.item()
                })
                _run.log_scalar(metricName, metricValue)
            elif isinstance(metric, ConfusionMatrix):
                acc = cmAccuracy(metricValue)
                prec = cmPrecision(metricValue, False)
                recall = cmRecall(metricValue, False)
                # The small constant avoids a division by zero.
                f1 = 2 * (prec * recall) / (prec + recall + 1e-15)

                logger.info({
                    'type':
                    'metric',
                    'label':
                    metricName,
                    # The np.float alias was removed from NumPy; the builtin
                    # float performs the same conversion.
                    'accuracy':
                    float(acc),
                    'precision':
                    prec.cpu().numpy().tolist(),
                    'recall':
                    recall.cpu().numpy().tolist(),
                    'f1':
                    f1.cpu().numpy().tolist(),
                    'confusion_matrix':
                    metricValue.cpu().numpy().tolist(),
                    'epoch':
                    None
                })

                _run.log_scalar('test_f1', f1[1])
            elif isinstance(metric, PredictionCache):
                logger.info({
                    'type': 'metric',
                    'label': metricName,
                    'predictions': metric.predictions
                })

    # Calculate recall rate
    recallRateOpt = args.get('recall_rate', {'type': 'none'})
    if recallRateOpt['type'] != 'none':
        if recallRateOpt['type'] == 'sun2011':
            logger.info("Calculating recall rate: {}".format(
                recallRateOpt['type']))
            recallRateDataset = BugDataset(recallRateOpt['dataset'])

            rankingClass = SunRanking(bugReportDatabase, recallRateDataset,
                                      recallRateOpt['window'])
            # We always group all bug reports by master in the results in the sun 2011 methodology
            group_by_master = True
        elif recallRateOpt['type'] == 'deshmukh':
            logger.info("Calculating recall rate: {}".format(
                recallRateOpt['type']))
            recallRateDataset = BugDataset(recallRateOpt['dataset'])
            rankingClass = DeshmukhRanking(bugReportDatabase,
                                           recallRateDataset)
            group_by_master = recallRateOpt['group_by_master']
        else:
            raise ArgumentError(
                "recall_rate.type is invalid (%s). You should choose one of these: sun2011 and deshmukh"
                % recallRateOpt['type'])

        logRankingResult(_run,
                         logger,
                         rankingClass,
                         rankingScorer,
                         bugReportDatabase,
                         recallRateOpt["result_file"],
                         0,
                         None,
                         group_by_master,
                         recommendationListfn=recommendation_fn)
    parser.add_argument("-test", '--test', dest='test', help="")
    parser.add_argument("--result_file", help="")
    parser.add_argument("--window", type=int, help="")

    logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s',
                        level=logging.INFO,
                        datefmt='%Y-%m-%d %H:%M:%S')
    logger = logging.getLogger()
    args = parser.parse_args()
    logger.info(args)

    training = True

    bugReportDatabase = BugReportDatabase.fromJson(args.db)
    testDataset = BugDataset(args.test)
    preprocessing = DBRDPreprocessing()

    if args.dt is not None:
        trainingDataset = BugDataset(args.dt)

        np.random.seed(args.seed)
        random.seed(args.seed)

        data_model = []
        data_alpha = []

        # Split the dataset into two sets: the first set is used to train the model and
        # the second is used to tune the ensemble weights
        duplicate_reports = trainingDataset.duplicateIds
        out_file.write("A-T={}\n".format(format_tf_to_text(bug['total_tri'])))

        out_file.write("DID={}\n".format('' if len(bug['dup_id']) ==
                                         0 else bug['dup_id']))
        out_file.write("VERSION={}\n".format(version_dict[bug['version']]))
        out_file.write("COMPONENT={}\n".format(product_dict[bug['product']]))
        out_file.write("SUB-COMPONENT={}\n".format(
            component_dict[bug['component']]))
        out_file.write("TYPE={}\n".format(type_dict[bug['bug_severity']]))
        out_file.write("PRIORITY={}\n".format(priority_dict[bug['priority']]))


if __name__ == '__main__':
    # Script entry point: read the command-line options, load the bug report
    # database and the test dataset, and call generate_input with the largest
    # bug id found in the test dataset.
    parser = argparse.ArgumentParser(description='')

    # The three options are mandatory and share the same declaration.
    for option in ('--database', '--test', '--output'):
        parser.add_argument(option, required=True, help="")

    logging.basicConfig(level=logging.DEBUG, datefmt='%Y-%m-%d %H:%M:%S')
    logger = logging.getLogger()

    args = parser.parse_args()
    logger.info(args)

    database = BugReportDatabase.fromJson(args.database)
    test = BugDataset(args.test)
    output_path = args.output

    # Bug ids are converted to integers so that the comparison is numeric.
    max_bug_id = max(int(bug_id) for bug_id in test.bugIds)

    generate_input(database, max_bug_id, output_path)
Exemple #7
0
def main(_run, _config, _seed, _log):
    """
    Train and/or evaluate a triplet-based siamese model for duplicate bug
    report detection.

    The function builds the preprocessors/encoders selected in the
    configuration, creates the scorer and its loss, optionally loads a saved
    model, trains on the triplet file (re-sampling negative examples after
    each epoch when a negative pair generator is configured), evaluates on the
    validation triplets and finally computes the recall rate when requested.

    :param _run: run object forwarded unchanged to ``logMetrics`` and
        ``logRankingResult`` (presumably the Sacred run -- TODO confirm).
    :param _config: dict-like configuration. Keys read here include
        ``bug_database``, ``batch_size``, ``cuda``, ``cache_folder``, ``load``,
        ``save``, ``scorer``, ``lr``, ``l2``, ``epochs`` and
        ``ranking_batch_size``; entries are overwritten with the saved
        parameters when ``load`` is set.
    :param _seed: seed passed to ``torch.manual_seed``.
    :param _log: logger used for every message emitted by this function.
    :return: None
    :raises ArgumentError: when the scorer, optimizer, negative pair
        generator, LR scheduler or recall rate type is not supported.
    """
    # ------------------------------------------------------------------
    # Setting and loading parameters
    # ------------------------------------------------------------------
    args = _config
    logger = _log

    logger.info(args)
    logger.info('It started at: %s' % datetime.now())

    torch.manual_seed(_seed)

    bugReportDatabase = BugReportDatabase.fromJson(args['bug_database'])
    paddingSym = "</s>"
    batchSize = args['batch_size']

    device = torch.device('cuda' if args['cuda'] else "cpu")

    if args['cuda']:
        logger.info("Turning CUDA on")
    else:
        logger.info("Turning CUDA off")

    # It is the folder where the preprocessed information will be stored.
    cacheFolder = args['cache_folder']

    # Parameters that are stored together with the model weights so that the
    # same architecture can be rebuilt when the model is loaded.
    importantParameters = [
        'summary', 'description', 'sum_desc', 'scorer', 'categorical'
    ]
    parametersToSave = {
        parName: args[parName]
        for parName in importantParameters
    }

    if args['load'] is not None:
        mapLocation = (
            lambda storage, loc: storage.cuda()) if args['cuda'] else 'cpu'
        modelInfo = torch.load(args['load'], map_location=mapLocation)
        modelState = modelInfo['model']

        # The saved architecture parameters take precedence over the ones
        # given in the current configuration.
        for paramName, paramValue in modelInfo['params'].items():
            args[paramName] = paramValue
    else:
        modelState = None

    # ------------------------------------------------------------------
    # Set preprocessor that will pre-process the raw information from the
    # bug reports. Each different information has a specific encoder(NN),
    # preprocessor and input handler.
    # ------------------------------------------------------------------
    preprocessors = PreprocessorList()
    encoders = []
    inputHandlers = []

    sum_desc_opts = args['sum_desc']
    databasePath = args['bug_database']

    if sum_desc_opts is not None:
        # NOTE(review): cacheFolder/databasePath are passed in the opposite
        # order of the two calls below -- confirm against the signature of
        # processSumDescParam.
        processSumDescParam(sum_desc_opts, bugReportDatabase, inputHandlers,
                            preprocessors, encoders, cacheFolder, databasePath,
                            logger, paddingSym)

    sumOpts = args.get("summary")

    if sumOpts is not None:
        processSumParam(sumOpts, bugReportDatabase, inputHandlers,
                        preprocessors, encoders, databasePath, cacheFolder,
                        logger, paddingSym)

    descOpts = args.get("description")

    if descOpts is not None:
        processDescriptionParam(descOpts, bugReportDatabase, inputHandlers,
                                preprocessors, encoders, databasePath,
                                cacheFolder, logger, paddingSym)

    categoricalOpt = args.get('categorical')
    if categoricalOpt is not None and len(categoricalOpt) != 0:
        processCategoricalParam(categoricalOpt, bugReportDatabase,
                                inputHandlers, preprocessors, encoders, logger)

    # ------------------------------------------------------------------
    # Set the final scorer and the loss. Load the scorer if this argument
    # was set.
    # ------------------------------------------------------------------
    scorerOpts = args['scorer']
    scorerType = scorerOpts['type']

    if scorerType == 'cosine':
        model = CosineTripletNN(encoders, scorerOpts['dropout'])
        margin = scorerOpts.get('margin', 0.0)

        if (categoricalOpt is not None
                and categoricalOpt.get('bn_last_layer', False)) or (
                    sum_desc_opts is not None
                    and sum_desc_opts.get('bn_last_layer', False)) or (
                        sumOpts is not None and sumOpts.get('bn_last_layer')):
            raise Exception(
                'You are applying batch normalization in the bug embedding.')

        lossFn = TripletLoss(margin)
        lossNoReduction = TripletLoss(margin, reduction='none')
        logger.info("Using Cosine Embeding Loss: margin={}".format(margin))
    elif scorerType == 'binary':
        # The binary scorer (ProbabilityPairNN trained with BCELoss) is
        # disabled in this script. Fail here with a clear message instead of
        # hitting a NameError on the undefined model below.
        raise ArgumentError(
            "Scorer type 'binary' is not supported by this script. You should choose: cosine"
        )
    else:
        raise ArgumentError(
            "Scorer type is invalid (%s). You should choose: cosine" %
            scorerType)

    model.to(device)

    if modelState:
        model.load_state_dict(modelState)

    # ------------------------------------------------------------------
    # Loading the training and validation. Also, it sets how the negative
    # example will be generated.
    # ------------------------------------------------------------------
    tripletCollate = TripletBugCollate(inputHandlers)

    # load training
    if args.get('triplets_training'):
        negativePairGenOpt = args.get('neg_pair_generator')
        tripletTrainingFile = args.get('triplets_training')

        # Negative examples are re-sampled after each epoch only when a
        # generator different from 'none' is configured.
        offlineGeneration = not (negativePairGenOpt is None
                                 or negativePairGenOpt['type'] == 'none')

        if not offlineGeneration:
            logger.info("Not generate dynamically the negative examples.")
            tripletTrainingReader = TripletBugDatasetReader(
                tripletTrainingFile, preprocessors)
        else:
            pairGenType = negativePairGenOpt['type']
            masterIdByBugId = bugReportDatabase.getMasterIdByBugId()

            if pairGenType == 'random':
                logger.info("Random Negative Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = RandomGenerator(
                    preprocessors, tripletCollate, negativePairGenOpt['rate'],
                    bugIds, masterIdByBugId)

            elif pairGenType == 'non_negative':
                logger.info("Non Negative Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = NonNegativeRandomGenerator(
                    preprocessors,
                    tripletCollate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    masterIdByBugId,
                    negativePairGenOpt['n_tries'],
                    device,
                    decimals=negativePairGenOpt['decimals'])
            elif pairGenType == 'random_k':
                logger.info("Random K Negative Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = KRandomGenerator(
                    preprocessors, tripletCollate, negativePairGenOpt['rate'],
                    bugIds, masterIdByBugId, negativePairGenOpt['k'], device)
            elif pairGenType == "pre":
                logger.info("Pre-selected list generator")
                negativePairGenerator = PreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'], preprocessors,
                    negativePairGenOpt['rate'], masterIdByBugId,
                    negativePairGenOpt['preselected_length'])
            elif pairGenType == "misc_non_zero_pre":
                logger.info("Pre-selected list generator")

                # Combination of the pre-selected list generator and the
                # non-negative random generator.
                negativePairGenerator1 = PreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'], preprocessors,
                    negativePairGenOpt['rate'], masterIdByBugId,
                    negativePairGenOpt['preselected_length'])

                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                negativePairGenerator2 = NonNegativeRandomGenerator(
                    preprocessors, tripletCollate, negativePairGenOpt['rate'],
                    bugIds, masterIdByBugId, negativePairGenOpt['n_tries'],
                    device)

                negativePairGenerator = MiscOfflineGenerator(
                    (negativePairGenerator1, negativePairGenerator2))
            else:
                raise ArgumentError(
                    "Offline generator is invalid (%s). You should choose one of these: random, hard and pre"
                    % pairGenType)

            tripletTrainingReader = TripletBugDatasetReader(
                tripletTrainingFile, preprocessors, negativePairGenerator)

        trainingLoader = DataLoader(tripletTrainingReader,
                                    batch_size=batchSize,
                                    collate_fn=tripletCollate.collate,
                                    shuffle=True)
        logger.info("Training size: %s" % (len(trainingLoader.dataset)))

    # load validation
    if args.get('triplets_validation'):
        tripletValidationReader = TripletBugDatasetReader(
            args.get('triplets_validation'), preprocessors)
        validationLoader = DataLoader(tripletValidationReader,
                                      batch_size=batchSize,
                                      collate_fn=tripletCollate.collate)

        logger.info("Validation size: %s" % (len(validationLoader.dataset)))
    else:
        validationLoader = None

    # ------------------------------------------------------------------
    # Training and evaluate the model.
    # ------------------------------------------------------------------
    optimizer_opt = args.get('optimizer', 'adam')

    if optimizer_opt == 'sgd':
        logger.info('SGD')
        optimizer = optim.SGD(model.parameters(),
                              lr=args['lr'],
                              weight_decay=args['l2'])
    elif optimizer_opt == 'adam':
        logger.info('Adam')
        optimizer = optim.Adam(model.parameters(),
                               lr=args['lr'],
                               weight_decay=args['l2'])
    else:
        # Without this branch an unknown optimizer only fails later with a
        # NameError when the optimizer is first used.
        raise ArgumentError(
            "Optimizer is invalid (%s). You should choose one of these: sgd and adam"
            % optimizer_opt)

    # Recall rate
    rankingScorer = SharedEncoderNNScorer(preprocessors,
                                          inputHandlers,
                                          model,
                                          device,
                                          batchSize=args['ranking_batch_size'])
    recallEstimationTrainOpt = args.get('recall_estimation_train')

    if recallEstimationTrainOpt:
        preselectListRankingTrain = PreselectListRanking(
            recallEstimationTrainOpt)

    recallEstimationOpt = args.get('recall_estimation')

    if recallEstimationOpt:
        preselectListRanking = PreselectListRanking(recallEstimationOpt)

    # LR scheduler
    lrSchedulerOpt = args.get('lr_scheduler', None)

    if lrSchedulerOpt is None:
        logger.info("Scheduler: Constant")
        lrSched = None
    elif lrSchedulerOpt["type"] == 'step':
        # Log the decay that is actually given to StepLR.
        logger.info("Scheduler: StepLR (step:%s, decay:%f)" %
                    (lrSchedulerOpt["step_size"], lrSchedulerOpt["decay"]))
        lrSched = StepLR(optimizer, lrSchedulerOpt["step_size"],
                         lrSchedulerOpt["decay"])
    elif lrSchedulerOpt["type"] == 'exp':
        logger.info("Scheduler: ExponentialLR (decay:%f)" %
                    (lrSchedulerOpt["decay"]))
        lrSched = ExponentialLR(optimizer, lrSchedulerOpt["decay"])
    elif lrSchedulerOpt["type"] == 'linear':
        logger.info(
            "Scheduler: Divide by (1 + epoch * decay) ---- (decay:%f)" %
            (lrSchedulerOpt["decay"]))

        lrDecay = lrSchedulerOpt["decay"]
        lrSched = LambdaLR(optimizer, lambda epoch: 1 /
                           (1.0 + epoch * lrDecay))
    else:
        # Report the scheduler type; the negative pair generator type is
        # unrelated and may not even be defined at this point.
        raise ArgumentError(
            "LR Scheduler is invalid (%s). You should choose one of these: step, exp and linear "
            % lrSchedulerOpt["type"])

    # Set training functions
    def trainingIteration(engine, batch):
        """Run one optimization step and return (loss, model output)."""
        model.train()
        optimizer.zero_grad()
        x, y = tripletCollate.to(batch, device)
        output = model(*x)
        loss = lossFn(output, y)
        loss.backward()
        optimizer.step()
        return loss, output

    trainer = Engine(trainingIteration)
    trainingMetrics = {
        'training_loss': AverageLoss(lossFn, batch_size=lambda x: x.shape[0])
    }

    # Add metrics to trainer
    for name, metric in trainingMetrics.items():
        metric.attach(trainer, name)

    # Set validation functions
    def validationIteration(engine, batch):
        """Score one validation batch and return (prediction, target)."""
        model.eval()
        with torch.no_grad():
            # Use the collate instance, exactly like the training step does.
            x, y = tripletCollate.to(batch, device)
            y_pred = model(*x)
            # The loss metric consumes (prediction, target) pairs.
            return y_pred, y

    validationMetrics = {
        'validation_loss': LossWrapper(lossFn, batch_size=lambda x: x.shape[0])
    }
    evaluator = Engine(validationIteration)

    # Add metrics to evaluator
    for name, metric in validationMetrics.items():
        metric.attach(evaluator, name)

    @trainer.on(Events.EPOCH_STARTED)
    def onStartEpoch(engine):
        epoch = engine.state.epoch
        logger.info("Epoch: %d" % epoch)

        if lrSched:
            lrSched.step()

        logger.info("LR: %s" % str(optimizer.param_groups[0]["lr"]))

    @trainer.on(Events.EPOCH_COMPLETED)
    def onEndEpoch(engine):
        epoch = engine.state.epoch

        logMetrics(_run, logger, engine.state.metrics, epoch)

        # Evaluate Training
        if validationLoader:
            evaluator.run(validationLoader)
            logMetrics(_run, logger, evaluator.state.metrics, epoch)

        if recallEstimationTrainOpt and (epoch % args['rr_train_epoch'] == 0):
            logRankingResult(_run, logger, preselectListRankingTrain,
                             rankingScorer, bugReportDatabase, None, epoch,
                             "train")

        if recallEstimationOpt and (epoch % args['rr_val_epoch'] == 0):
            logRankingResult(_run, logger, preselectListRanking,
                             rankingScorer, bugReportDatabase,
                             args.get("ranking_result_file"), epoch,
                             "validation")

        # This handler only runs while training, so offlineGeneration and
        # tripletTrainingReader are always defined here.
        if offlineGeneration:
            tripletTrainingReader.sampleNewNegExamples(model, lossNoReduction)

        if args.get('save'):
            modelInfo = {
                'model': model.state_dict(),
                'params': parametersToSave
            }

            logger.info("==> Saving Model: %s" % args['save'])
            torch.save(modelInfo, args['save'])

    if args.get('triplets_training'):
        trainer.run(trainingLoader, max_epochs=args['epochs'])
    elif args.get('triplets_validation'):
        # Only the validation set was given: evaluate the (loaded) model on
        # it. trainingLoader does not exist in this branch.
        evaluator.run(validationLoader)
        logMetrics(_run, logger, evaluator.state.metrics, 0)

        if recallEstimationOpt:
            logRankingResult(_run, logger, preselectListRanking,
                             rankingScorer, bugReportDatabase,
                             args.get("ranking_result_file"), 0, "validation")

    recallRateOpt = args.get('recall_rate', {'type': 'none'})
    if recallRateOpt['type'] != 'none':
        if recallRateOpt['type'] == 'sun2011':
            logger.info("Calculating recall rate: {}".format(
                recallRateOpt['type']))
            recallRateDataset = BugDataset(recallRateOpt['dataset'])

            rankingClass = SunRanking(bugReportDatabase, recallRateDataset,
                                      recallRateOpt['window'])
            # We always group all bug reports by master in the results in the sun 2011 methodology
            group_by_master = True
        elif recallRateOpt['type'] == 'deshmukh':
            logger.info("Calculating recall rate: {}".format(
                recallRateOpt['type']))
            recallRateDataset = BugDataset(recallRateOpt['dataset'])
            rankingClass = DeshmukhRanking(bugReportDatabase,
                                           recallRateDataset)

            group_by_master = recallRateOpt['group_by_master']
        else:
            raise ArgumentError(
                "recall_rate.type is invalid (%s). You should choose one of these: none, sun2011 and deshmukh "
                % recallRateOpt['type'])

        logRankingResult(_run, logger, rankingClass, rankingScorer,
                         bugReportDatabase, recallRateOpt["result_file"], 0,
                         None, group_by_master)
    trainingBugs = set()
    triplets = []
    if args.training:

        if args.is_pairs:
            logger.info("Reading training with pairs")
            f = open(args.training, 'r')

            for l in f:
                bugId1, bugId2, label = l.strip().split(',')

                trainingBugs.add(bugId1)
                trainingBugs.add(bugId2)
        else:
            logger.info("Reading training")
            bugDataset = BugDataset(args.training)
            trainingBugs.update(bugDataset.bugIds)


    logger.info("Preprocessing and fitting data")
    trainingText = []

    for bugId in trainingBugs:
        bugReport = bugReportDataset.getBug(bugId)
        text = concatenateSummaryAndDescription(bugReport)
        trainingText.append(text)

    if args.load:
        logger.info('Loading  object')
        vectorizer = pickle.load(open(args.load, 'rb'))
    else:
Exemple #9
0
def main(_run, _config, _seed, _log):
    # Setting logger
    args = _config
    logger = _log

    logger.info(args)
    logger.info('It started at: %s' % datetime.now())

    torch.manual_seed(_seed)

    device = torch.device('cuda' if args['cuda'] else "cpu")
    if args['cuda']:
        logger.info("Turning CUDA on")
    else:
        logger.info("Turning CUDA off")

    # Setting the parameter to save and loading parameters
    important_parameters = ['dbr_cnn']
    parameters_to_save = dict([(name, args[name])
                               for name in important_parameters])

    if args['load'] is not None:
        map_location = (
            lambda storage, loc: storage.cuda()) if args['cuda'] else 'cpu'
        model_info = torch.load(args['load'], map_location=map_location)
        model_state = model_info['model']

        for param_name, param_value in model_info['params'].items():
            args[param_name] = param_value
    else:
        model_state = None

    # Set basic variables
    preprocessors = PreprocessorList()
    input_handlers = []
    report_database = BugReportDatabase.fromJson(args['bug_database'])
    batchSize = args['batch_size']
    dbr_cnn_opt = args['dbr_cnn']

    # Loading word embedding and lexicon
    emb = np.load(dbr_cnn_opt["word_embedding"])
    padding_sym = "</s>"

    lexicon = Lexicon(unknownSymbol=None)
    with codecs.open(dbr_cnn_opt["lexicon"]) as f:
        for l in f:
            lexicon.put(l.strip())

    lexicon.setUnknown("UUUKNNN")
    padding_id = lexicon.getLexiconIndex(padding_sym)
    embedding = Embedding(lexicon, emb, paddingIdx=padding_id)

    logger.info("Lexicon size: %d" % (lexicon.getLen()))
    logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize()))

    # Load filters and tokenizer
    filters = loadFilters(dbr_cnn_opt['filters'])

    if dbr_cnn_opt['tokenizer'] == 'default':
        logger.info("Use default tokenizer to tokenize summary information")
        tokenizer = MultiLineTokenizer()
    elif dbr_cnn_opt['tokenizer'] == 'white_space':
        logger.info(
            "Use white space tokenizer to tokenize summary information")
        tokenizer = WhitespaceTokenizer()
    else:
        raise ArgumentError(
            "Tokenizer value %s is invalid. You should choose one of these: default and white_space"
            % dbr_cnn_opt['tokenizer'])

    # Add preprocessors
    preprocessors.append(
        DBR_CNN_CategoricalPreprocessor(dbr_cnn_opt['categorical_lexicon'],
                                        report_database))
    preprocessors.append(
        SummaryDescriptionPreprocessor(lexicon, report_database, filters,
                                       tokenizer, padding_id))

    # Add input_handlers
    input_handlers.append(DBRDCNN_CategoricalInputHandler())
    input_handlers.append(
        TextCNNInputHandler(padding_id, min(dbr_cnn_opt["window"])))

    # Create Model
    model = DBR_CNN(embedding, dbr_cnn_opt["window"], dbr_cnn_opt["nfilters"],
                    dbr_cnn_opt['update_embedding'])

    model.to(device)

    if model_state:
        model.load_state_dict(model_state)

    # Set loss function
    logger.info("Using BCE Loss")
    loss_fn = BCELoss()
    loss_no_reduction = BCELoss(reduction='none')
    cmp_collate = PairBugCollate(input_handlers,
                                 torch.float32,
                                 unsqueeze_target=True)

    # Loading the training and setting how the negative example will be generated.
    if args.get('pairs_training'):
        negative_pair_gen_opt = args.get('neg_pair_generator', )
        pairsTrainingFile = args.get('pairs_training')
        random_anchor = negative_pair_gen_opt['random_anchor']

        offlineGeneration = not (negative_pair_gen_opt is None
                                 or negative_pair_gen_opt['type'] == 'none')

        if not offlineGeneration:
            logger.info("Not generate dynamically the negative examples.")
            pair_training_reader = PairBugDatasetReader(
                pairsTrainingFile,
                preprocessors,
                randomInvertPair=args['random_switch'])
        else:
            pair_gen_type = negative_pair_gen_opt['type']
            master_id_by_bug_id = report_database.getMasterIdByBugId()

            if pair_gen_type == 'random':
                logger.info("Random Negative Pair Generator")
                training_dataset = BugDataset(
                    negative_pair_gen_opt['training'])
                bug_ids = training_dataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (training_dataset.info, len(bug_ids)))

                negative_pair_generator = RandomGenerator(
                    preprocessors, cmp_collate, negative_pair_gen_opt['rate'],
                    bug_ids, master_id_by_bug_id)

            elif pair_gen_type == 'non_negative':
                logger.info("Non Negative Pair Generator")
                training_dataset = BugDataset(
                    negative_pair_gen_opt['training'])
                bug_ids = training_dataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (training_dataset.info, len(bug_ids)))

                negative_pair_generator = NonNegativeRandomGenerator(
                    preprocessors,
                    cmp_collate,
                    negative_pair_gen_opt['rate'],
                    bug_ids,
                    master_id_by_bug_id,
                    negative_pair_gen_opt['n_tries'],
                    device,
                    randomAnchor=random_anchor)
            elif pair_gen_type == 'misc_non_zero':
                logger.info("Misc Non Zero Pair Generator")
                training_dataset = BugDataset(
                    negative_pair_gen_opt['training'])
                bug_ids = training_dataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (training_dataset.info, len(bug_ids)))

                negative_pair_generator = MiscNonZeroRandomGen(
                    preprocessors, cmp_collate, negative_pair_gen_opt['rate'],
                    bug_ids, training_dataset.duplicateIds,
                    master_id_by_bug_id, device,
                    negative_pair_gen_opt['n_tries'],
                    negative_pair_gen_opt['random_anchor'])
            elif pair_gen_type == 'random_k':
                logger.info("Random K Negative Pair Generator")
                training_dataset = BugDataset(
                    negative_pair_gen_opt['training'])
                bug_ids = training_dataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (training_dataset.info, len(bug_ids)))

                negative_pair_generator = KRandomGenerator(
                    preprocessors, cmp_collate, negative_pair_gen_opt['rate'],
                    bug_ids, master_id_by_bug_id, negative_pair_gen_opt['k'],
                    device)
            elif pair_gen_type == "pre":
                logger.info("Pre-selected list generator")
                negative_pair_generator = PreSelectedGenerator(
                    negative_pair_gen_opt['pre_list_file'], preprocessors,
                    negative_pair_gen_opt['rate'], master_id_by_bug_id,
                    negative_pair_gen_opt['preselected_length'])
            elif pair_gen_type == "misc_non_zero_pre":
                logger.info("Pre-selected list generator")

                negativePairGenerator1 = PreSelectedGenerator(
                    negative_pair_gen_opt['pre_list_file'], preprocessors,
                    negative_pair_gen_opt['rate'], master_id_by_bug_id,
                    negative_pair_gen_opt['preselected_length'])

                training_dataset = BugDataset(
                    negative_pair_gen_opt['training'])
                bug_ids = training_dataset.bugIds

                negativePairGenerator2 = NonNegativeRandomGenerator(
                    preprocessors, cmp_collate, negative_pair_gen_opt['rate'],
                    bug_ids, master_id_by_bug_id, device,
                    negative_pair_gen_opt['n_tries'])

                negative_pair_generator = MiscOfflineGenerator(
                    (negativePairGenerator1, negativePairGenerator2))
            else:
                raise ArgumentError(
                    "Offline generator is invalid (%s). You should choose one of these: random, hard and pre"
                    % pair_gen_type)

            pair_training_reader = PairBugDatasetReader(
                pairsTrainingFile,
                preprocessors,
                negative_pair_generator,
                randomInvertPair=args['random_switch'])

        training_loader = DataLoader(pair_training_reader,
                                     batch_size=batchSize,
                                     collate_fn=cmp_collate.collate,
                                     shuffle=True)
        logger.info("Training size: %s" % (len(training_loader.dataset)))

    # load validation
    if args.get('pairs_validation'):
        pair_validation_reader = PairBugDatasetReader(
            args.get('pairs_validation'), preprocessors)
        validation_loader = DataLoader(pair_validation_reader,
                                       batch_size=batchSize,
                                       collate_fn=cmp_collate.collate)

        logger.info("Validation size: %s" % (len(validation_loader.dataset)))
    else:
        validation_loader = None
    """
    Training and evaluate the model. 
    """
    optimizer_opt = args.get('optimizer', 'adam')

    if optimizer_opt == 'sgd':
        logger.info('SGD')
        optimizer = optim.SGD(model.parameters(),
                              lr=args['lr'],
                              weight_decay=args['l2'],
                              momentum=args['momentum'])
    elif optimizer_opt == 'adam':
        logger.info('Adam')
        optimizer = optim.Adam(model.parameters(),
                               lr=args['lr'],
                               weight_decay=args['l2'])

    # Recall rate
    ranking_scorer = DBR_CNN_Scorer(preprocessors[0], preprocessors[1],
                                    input_handlers[0], input_handlers[1],
                                    model, device, args['ranking_batch_size'])
    recallEstimationTrainOpt = args.get('recall_estimation_train')

    if recallEstimationTrainOpt:
        preselectListRankingTrain = PreselectListRanking(
            recallEstimationTrainOpt)

    recallEstimationOpt = args.get('recall_estimation')

    if recallEstimationOpt:
        preselect_list_ranking = PreselectListRanking(recallEstimationOpt)

    lr_scheduler_opt = args.get('lr_scheduler', None)

    if lr_scheduler_opt is None or lr_scheduler_opt['type'] == 'constant':
        logger.info("Scheduler: Constant")
        lr_sched = None
    elif lr_scheduler_opt["type"] == 'step':
        logger.info("Scheduler: StepLR (step:%s, decay:%f)" %
                    (lr_scheduler_opt["step_size"], args["decay"]))
        lr_sched = StepLR(optimizer, lr_scheduler_opt["step_size"],
                          lr_scheduler_opt["decay"])
    elif lr_scheduler_opt["type"] == 'exp':
        logger.info("Scheduler: ExponentialLR (decay:%f)" %
                    (lr_scheduler_opt["decay"]))
        lr_sched = ExponentialLR(optimizer, lr_scheduler_opt["decay"])
    elif lr_scheduler_opt["type"] == 'linear':
        logger.info(
            "Scheduler: Divide by (1 + epoch * decay) ---- (decay:%f)" %
            (lr_scheduler_opt["decay"]))

        lrDecay = lr_scheduler_opt["decay"]
        lr_sched = LambdaLR(optimizer, lambda epoch: 1 /
                            (1.0 + epoch * lrDecay))
    else:
        raise ArgumentError(
            "LR Scheduler is invalid (%s). You should choose one of these: step, exp and linear "
            % pair_gen_type)

    # Set training functions
    def trainingIteration(engine, batch):
        model.train()

        optimizer.zero_grad()
        x, y = cmp_collate.to(batch, device)
        output = model(*x)
        loss = loss_fn(output, y)
        loss.backward()
        optimizer.step()
        return loss, output, y

    trainer = Engine(trainingIteration)
    negTarget = 0.0 if isinstance(loss_fn, NLLLoss) else -1.0

    trainingMetrics = {
        'training_loss':
        AverageLoss(loss_fn),
        'training_acc':
        AccuracyWrapper(output_transform=thresholded_output_transform),
        'training_precision':
        PrecisionWrapper(output_transform=thresholded_output_transform),
        'training_recall':
        RecallWrapper(output_transform=thresholded_output_transform),
    }

    # Add metrics to trainer
    for name, metric in trainingMetrics.items():
        metric.attach(trainer, name)

    # Set validation functions
    def validationIteration(engine, batch):
        model.eval()

        with torch.no_grad():
            x, y = cmp_collate.to(batch, device)
            y_pred = model(*x)

            return y_pred, y

    validationMetrics = {
        'validation_loss':
        LossWrapper(loss_fn),
        'validation_acc':
        AccuracyWrapper(output_transform=thresholded_output_transform),
        'validation_precision':
        PrecisionWrapper(output_transform=thresholded_output_transform),
        'validation_recall':
        RecallWrapper(output_transform=thresholded_output_transform),
    }
    evaluator = Engine(validationIteration)

    # Add metrics to evaluator
    for name, metric in validationMetrics.items():
        metric.attach(evaluator, name)

    @trainer.on(Events.EPOCH_STARTED)
    def onStartEpoch(engine):
        epoch = engine.state.epoch
        logger.info("Epoch: %d" % epoch)

        if lr_sched:
            lr_sched.step()

        logger.info("LR: %s" % str(optimizer.param_groups[0]["lr"]))

    @trainer.on(Events.EPOCH_COMPLETED)
    def onEndEpoch(engine):
        epoch = engine.state.epoch

        logMetrics(_run, logger, engine.state.metrics, epoch)

        # Evaluate Training
        if validation_loader:
            evaluator.run(validation_loader)
            logMetrics(_run, logger, evaluator.state.metrics, epoch)

        lastEpoch = args['epochs'] - epoch == 0

        if recallEstimationTrainOpt and (epoch % args['rr_train_epoch'] == 0):
            logRankingResult(_run, logger, preselectListRankingTrain,
                             ranking_scorer, report_database, None, epoch,
                             "train")
            ranking_scorer.free()

        if recallEstimationOpt and (epoch % args['rr_val_epoch'] == 0):
            logRankingResult(_run, logger, preselect_list_ranking,
                             ranking_scorer, report_database,
                             args.get("ranking_result_file"), epoch,
                             "validation")
            ranking_scorer.free()

        if not lastEpoch:
            pair_training_reader.sampleNewNegExamples(model, loss_no_reduction)

        if args.get('save'):
            save_by_epoch = args['save_by_epoch']

            if save_by_epoch and epoch in save_by_epoch:
                file_name, file_extension = os.path.splitext(args['save'])
                file_path = file_name + '_epoch_{}'.format(
                    epoch) + file_extension
            else:
                file_path = args['save']

            modelInfo = {
                'model': model.state_dict(),
                'params': parameters_to_save
            }

            logger.info("==> Saving Model: %s" % file_path)
            torch.save(modelInfo, file_path)

    if args.get('pairs_training'):
        trainer.run(training_loader, max_epochs=args['epochs'])
    elif args.get('pairs_validation'):
        # Evaluate Training
        evaluator.run(validation_loader)
        logMetrics(logger, evaluator.state.metrics)

        if recallEstimationOpt:
            logRankingResult(_run, logger, preselect_list_ranking,
                             ranking_scorer, report_database,
                             args.get("ranking_result_file"), 0, "validation")

    # Test Dataset (accuracy, recall, precision, F1)
    pair_test_dataset = args.get('pair_test_dataset')

    if pair_test_dataset is not None and len(pair_test_dataset) > 0:
        pairTestReader = PairBugDatasetReader(pair_test_dataset, preprocessors)
        testLoader = DataLoader(pairTestReader,
                                batch_size=batchSize,
                                collate_fn=cmp_collate.collate)

        if not isinstance(cmp_collate, PairBugCollate):
            raise NotImplementedError(
                'Evaluation of pairs using tanh was not implemented yet')

        logger.info("Test size: %s" % (len(testLoader.dataset)))

        testMetrics = {
            'test_accuracy':
            ignite.metrics.Accuracy(
                output_transform=thresholded_output_transform),
            'test_precision':
            ignite.metrics.Precision(
                output_transform=thresholded_output_transform),
            'test_recall':
            ignite.metrics.Recall(
                output_transform=thresholded_output_transform),
            'test_predictions':
            PredictionCache(),
        }
        test_evaluator = Engine(validationIteration)

        # Add metrics to evaluator
        for name, metric in testMetrics.items():
            metric.attach(test_evaluator, name)

        test_evaluator.run(testLoader)

        for metricName, metricValue in test_evaluator.state.metrics.items():
            metric = testMetrics[metricName]

            if isinstance(metric, ignite.metrics.Accuracy):
                logger.info({
                    'type': 'metric',
                    'label': metricName,
                    'value': metricValue,
                    'epoch': None,
                    'correct': metric._num_correct,
                    'total': metric._num_examples
                })
                _run.log_scalar(metricName, metricValue)
            elif isinstance(metric,
                            (ignite.metrics.Precision, ignite.metrics.Recall)):
                logger.info({
                    'type': 'metric',
                    'label': metricName,
                    'value': metricValue,
                    'epoch': None,
                    'tp': metric._true_positives.item(),
                    'total_positive': metric._positives.item()
                })
                _run.log_scalar(metricName, metricValue)
            elif isinstance(metric, ConfusionMatrix):
                acc = cmAccuracy(metricValue)
                prec = cmPrecision(metricValue, False)
                recall = cmRecall(metricValue, False)
                f1 = 2 * (prec * recall) / (prec + recall + 1e-15)

                logger.info({
                    'type':
                    'metric',
                    'label':
                    metricName,
                    'accuracy':
                    np.float(acc),
                    'precision':
                    prec.cpu().numpy().tolist(),
                    'recall':
                    recall.cpu().numpy().tolist(),
                    'f1':
                    f1.cpu().numpy().tolist(),
                    'confusion_matrix':
                    metricValue.cpu().numpy().tolist(),
                    'epoch':
                    None
                })

                _run.log_scalar('test_f1', f1[1])
            elif isinstance(metric, PredictionCache):
                logger.info({
                    'type': 'metric',
                    'label': metricName,
                    'predictions': metric.predictions
                })

    # Calculate recall rate
    recall_rate_opt = args.get('recall_rate', {'type': 'none'})
    if recall_rate_opt['type'] != 'none':
        if recall_rate_opt['type'] == 'sun2011':
            logger.info("Calculating recall rate: {}".format(
                recall_rate_opt['type']))
            recall_rate_dataset = BugDataset(recall_rate_opt['dataset'])

            ranking_class = SunRanking(report_database, recall_rate_dataset,
                                       recall_rate_opt['window'])
            # We always group all bug reports by master in the results in the sun 2011 methodology
            group_by_master = True
        elif recall_rate_opt['type'] == 'deshmukh':
            logger.info("Calculating recall rate: {}".format(
                recall_rate_opt['type']))
            recall_rate_dataset = BugDataset(recall_rate_opt['dataset'])
            ranking_class = DeshmukhRanking(report_database,
                                            recall_rate_dataset)
            group_by_master = recall_rate_opt['group_by_master']
        else:
            raise ArgumentError(
                "recall_rate.type is invalid (%s). You should choose one of these: step, exp and linear "
                % recall_rate_opt['type'])

        logRankingResult(
            _run,
            logger,
            ranking_class,
            ranking_scorer,
            report_database,
            recall_rate_opt["result_file"],
            0,
            None,
            group_by_master,
        )
    bugReportDatabase = BugReportDatabase.fromJson(args.bug_database)

    descIdfFileName = args.idf_basename + '_description_tfidf.pk'
    sumIdfFileName = args.idf_basename + '_summary_tfidf.pk'
    bothIdfFileName = args.idf_basename + '_both_tfidf.pk'

    tokenizer = TreebankWordTokenizer()
    stemmer = SnowballStemmer('english', ignore_stopwords=True)
    stopWords = set(stopwords.words('english'))

    classicalPreProcessing = ClassicalPreprocessing(tokenizer, stemmer,
                                                    stopWords)

    if args.training_reports is not None:
        bugSetDataset = BugDataset(args.training_reports)
        bugIds = []

        for idx in range(bugSetDataset.end):
            bugIds.append(bugReportDatabase.getBugByIndex(idx)['bug_id'])

        if os.path.isfile(descIdfFileName):
            logger.warning("Idf file %s exists and it will be overwritten." %
                           descIdfFileName)

        logger.info(
            "Computing and saving idf of the description in the training")
        descTfidf = calculateIdfs(bugReportDatabase, classicalPreProcessing,
                                  bugIds, 'description')
        pickle.dump(descTfidf, open(descIdfFileName, 'wb'))
def main(_run, _config, _seed, _log):
    """Train and/or evaluate a pairwise duplicate bug report classifier.

    The function builds the preprocessors/encoders selected in the
    configuration, creates the pair classifier and its loss, optionally
    trains it, and then reports test metrics and recall rate.

    Depending on the configuration it:
      * trains for ``epochs`` epochs when ``pairs_training`` is set;
      * otherwise only evaluates on ``pairs_validation`` when that is set;
      * evaluates accuracy/precision/recall/confusion matrix on
        ``pair_test_dataset`` when it is a non-empty value;
      * computes the recall rate when ``recall_rate.type`` is not 'none'.

    :param _run: run object; only ``_run.log_scalar`` is used directly here,
        and it is also forwarded to the logging helpers (presumably a Sacred
        ``Run`` -- confirm against the experiment setup).
    :param _config: dict-like configuration. Values loaded from a saved model
        (``load``) and the global ``dropout`` are written back into it.
    :param _seed: seed passed to ``torch.manual_seed``.
    :param _log: logger used for all the messages.
    :raises ArgumentError: if the classifier type, cosine loss, optimizer,
        negative pair generator, LR scheduler or recall rate type is unknown.
    """
    # Setting logger
    args = _config
    logger = _log

    logger.info(args)
    logger.info('It started at: %s' % datetime.now())

    torch.manual_seed(_seed)

    bugReportDatabase = BugReportDatabase.fromJson(args['bug_database'])
    paddingSym = "</s>"
    batchSize = args['batch_size']

    device = torch.device('cuda' if args['cuda'] else "cpu")
    if args['cuda']:
        logger.info("Turning CUDA on")
    else:
        logger.info("Turning CUDA off")

    # It is the folder where the preprocessed information will be stored.
    cacheFolder = args['cache_folder']

    # Setting the parameter to save and loading parameters
    importantParameters = [
        'summary', 'description', 'sum_desc', 'classifier', 'categorical'
    ]
    parametersToSave = {
        parName: args[parName]
        for parName in importantParameters
    }

    if args['load'] is not None:
        mapLocation = (
            lambda storage, loc: storage.cuda()) if args['cuda'] else 'cpu'
        modelInfo = torch.load(args['load'], map_location=mapLocation)
        modelState = modelInfo['model']

        # The architecture options stored with the model override the ones
        # from the current configuration.
        for paramName, paramValue in modelInfo['params'].items():
            args[paramName] = paramValue
    else:
        modelState = None

    # Set preprocessor that will pre-process the raw information from the
    # bug reports. Each different information has a specific encoder(NN),
    # preprocessor and input handler.
    preprocessors = PreprocessorList()
    encoders = []
    inputHandlers = []
    globalDropout = args['dropout']

    databasePath = args['bug_database']

    sum_desc_opts = args['sum_desc']

    if sum_desc_opts is not None:
        if globalDropout:
            args['sum_desc']['dropout'] = globalDropout

        processSumDescParam(sum_desc_opts, bugReportDatabase, inputHandlers,
                            preprocessors, encoders, cacheFolder, databasePath,
                            logger, paddingSym)

    sumOpts = args.get("summary")

    if sumOpts is not None:
        if globalDropout:
            args['summary']['dropout'] = globalDropout

        processSumParam(sumOpts, bugReportDatabase, inputHandlers,
                        preprocessors, encoders, databasePath, cacheFolder,
                        logger, paddingSym)

    descOpts = args.get("description")

    if descOpts is not None:
        if globalDropout:
            args['description']['dropout'] = globalDropout

        processDescriptionParam(descOpts, bugReportDatabase, inputHandlers,
                                preprocessors, encoders, databasePath,
                                cacheFolder, logger, paddingSym)

    categoricalOpt = args.get('categorical')

    if categoricalOpt is not None and len(categoricalOpt) != 0:
        if globalDropout:
            args['categorical']['dropout'] = globalDropout

        processCategoricalParam(categoricalOpt, bugReportDatabase,
                                inputHandlers, preprocessors, encoders, logger)

    # Set the final classifier and the loss. Load the classifier if this
    # argument was set.
    classifierOpts = args['classifier']
    classifierType = classifierOpts['type']
    labelDType = None

    if globalDropout:
        args['classifier']['dropout'] = globalDropout

    if classifierType == 'binary':
        withoutBugEmbedding = classifierOpts.get('without_embedding', False)
        batchNorm = classifierOpts.get('batch_normalization', True)
        dropout = classifierOpts.get('dropout', 0.0)
        hiddenSizes = classifierOpts.get('hidden_sizes', [100])
        model = ProbabilityPairNN(encoders, withoutBugEmbedding, hiddenSizes,
                                  batchNorm, dropout)
        lossFn = NLLLoss()
        lossNoReduction = NLLLoss(reduction='none')

        labelDType = torch.int64

        logger.info("Using NLLLoss")
    elif classifierType == 'cosine':
        model = CosinePairNN(encoders)
        margin = classifierOpts.get('margin', 0.0)

        if classifierOpts['loss'] == 'cosine_loss':
            lossFn = CosineLoss(margin)
            lossNoReduction = CosineLoss(margin, reduction='none')
            labelDType = torch.float32
            logger.info("Using Cosine Embeding Loss: margin={}".format(margin))
        elif classifierOpts['loss'] == 'neculoiu_loss':
            lossFn = NeculoiuLoss(margin)
            lossNoReduction = NeculoiuLoss(margin, reduction='none')
            labelDType = torch.float32
            logger.info("Using Neculoiu Loss: margin={}".format(margin))
        else:
            # Fail here instead of with a NameError on lossFn much later.
            raise ArgumentError(
                "classifier.loss is invalid (%s). You should choose one of these: cosine_loss and neculoiu_loss"
                % classifierOpts['loss'])
    else:
        # Fail here instead of with a NameError on model below.
        raise ArgumentError(
            "classifier.type is invalid (%s). You should choose one of these: binary and cosine"
            % classifierType)

    model.to(device)

    if modelState:
        model.load_state_dict(modelState)

    # Loading the training and validation. Also, it sets how the negative
    # example will be generated.
    pairCollate = PairBugCollate(inputHandlers, labelDType)

    # load training
    if args.get('pairs_training'):
        negativePairGenOpt = args.get('neg_pair_generator')
        pairsTrainingFile = args.get('pairs_training')

        offlineGeneration = not (negativePairGenOpt is None
                                 or negativePairGenOpt['type'] == 'none')

        if not offlineGeneration:
            logger.info("Not generate dynamically the negative examples.")
            pairTrainingReader = PairBugDatasetReader(
                pairsTrainingFile,
                preprocessors,
                randomInvertPair=args['random_switch'])
        else:
            pairGenType = negativePairGenOpt['type']
            # Read only here: the generator options may be None when no
            # negative examples are generated.
            randomAnchor = negativePairGenOpt['random_anchor']
            masterIdByBugId = bugReportDatabase.getMasterIdByBugId()

            if pairGenType == 'random':
                logger.info("Random Negative Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = RandomGenerator(
                    preprocessors,
                    pairCollate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    masterIdByBugId,
                    randomAnchor=randomAnchor)

            elif pairGenType == 'non_negative':
                logger.info("Non Negative Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = NonNegativeRandomGenerator(
                    preprocessors,
                    pairCollate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    masterIdByBugId,
                    negativePairGenOpt['n_tries'],
                    device,
                    randomAnchor=randomAnchor)
            elif pairGenType == 'misc_non_zero':
                logger.info("Misc Non Zero Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = MiscNonZeroRandomGen(
                    preprocessors, pairCollate, negativePairGenOpt['rate'],
                    bugIds, trainingDataset.duplicateIds, masterIdByBugId,
                    device, negativePairGenOpt['n_tries'])
            elif pairGenType == 'random_k':
                logger.info("Random K Negative Pair Generator")
                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (trainingDataset.info, len(bugIds)))

                negativePairGenerator = KRandomGenerator(
                    preprocessors, pairCollate, negativePairGenOpt['rate'],
                    bugIds, masterIdByBugId, negativePairGenOpt['k'], device)
            elif pairGenType == "pre":
                logger.info("Pre-selected list generator")
                negativePairGenerator = PreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'], preprocessors,
                    negativePairGenOpt['rate'], masterIdByBugId,
                    negativePairGenOpt['preselected_length'])
            elif pairGenType == "misc_non_zero_pre":
                logger.info("Pre-selected list generator")

                negativePairGenerator1 = PreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'], preprocessors,
                    negativePairGenOpt['rate'], masterIdByBugId,
                    negativePairGenOpt['preselected_length'])

                trainingDataset = BugDataset(negativePairGenOpt['training'])
                bugIds = trainingDataset.bugIds

                # NOTE(review): 'device' and 'n_tries' are passed in the
                # opposite order of the 'non_negative' branch above. One of
                # the two calls is probably wrong -- confirm against the
                # NonNegativeRandomGenerator signature.
                negativePairGenerator2 = NonNegativeRandomGenerator(
                    preprocessors, pairCollate, negativePairGenOpt['rate'],
                    bugIds, masterIdByBugId, device,
                    negativePairGenOpt['n_tries'])

                negativePairGenerator = MiscOfflineGenerator(
                    (negativePairGenerator1, negativePairGenerator2))
            else:
                raise ArgumentError(
                    "Offline generator is invalid (%s). You should choose one of these: random, non_negative, misc_non_zero, random_k, pre and misc_non_zero_pre"
                    % pairGenType)

            pairTrainingReader = PairBugDatasetReader(
                pairsTrainingFile,
                preprocessors,
                negativePairGenerator,
                randomInvertPair=args['random_switch'])

        trainingLoader = DataLoader(pairTrainingReader,
                                    batch_size=batchSize,
                                    collate_fn=pairCollate.collate,
                                    shuffle=True)
        logger.info("Training size: %s" % (len(trainingLoader.dataset)))

    # load validation
    if args.get('pairs_validation'):
        pairValidationReader = PairBugDatasetReader(
            args.get('pairs_validation'), preprocessors)
        validationLoader = DataLoader(pairValidationReader,
                                      batch_size=batchSize,
                                      collate_fn=pairCollate.collate)

        logger.info("Validation size: %s" % (len(validationLoader.dataset)))
    else:
        validationLoader = None

    # Training and evaluate the model.
    optimizer_opt = args.get('optimizer', 'adam')

    if optimizer_opt == 'sgd':
        logger.info('SGD')
        optimizer = optim.SGD(model.parameters(),
                              lr=args['lr'],
                              weight_decay=args['l2'],
                              momentum=args['momentum'])
    elif optimizer_opt == 'adam':
        logger.info('Adam')
        optimizer = optim.Adam(model.parameters(),
                               lr=args['lr'],
                               weight_decay=args['l2'])
    else:
        # Fail here instead of with a NameError on optimizer below.
        raise ArgumentError(
            "Optimizer is invalid (%s). You should choose one of these: sgd and adam"
            % optimizer_opt)

    # Recall rate
    rankingScorer = SharedEncoderNNScorer(preprocessors, inputHandlers, model,
                                          device, args['ranking_batch_size'])
    recallEstimationTrainOpt = args.get('recall_estimation_train')

    if recallEstimationTrainOpt:
        preselectListRankingTrain = PreselectListRanking(
            recallEstimationTrainOpt)

    recallEstimationOpt = args.get('recall_estimation')

    if recallEstimationOpt:
        preselectListRanking = PreselectListRanking(recallEstimationOpt)

    lrSchedulerOpt = args.get('lr_scheduler', None)

    if lrSchedulerOpt is None:
        logger.info("Scheduler: Constant")
        lrSched = None
    elif lrSchedulerOpt["type"] == 'step':
        # Log the decay that is really given to the scheduler.
        logger.info("Scheduler: StepLR (step:%s, decay:%f)" %
                    (lrSchedulerOpt["step_size"], lrSchedulerOpt["decay"]))
        lrSched = StepLR(optimizer, lrSchedulerOpt["step_size"],
                         lrSchedulerOpt["decay"])
    elif lrSchedulerOpt["type"] == 'exp':
        logger.info("Scheduler: ExponentialLR (decay:%f)" %
                    (lrSchedulerOpt["decay"]))
        lrSched = ExponentialLR(optimizer, lrSchedulerOpt["decay"])
    elif lrSchedulerOpt["type"] == 'linear':
        logger.info(
            "Scheduler: Divide by (1 + epoch * decay) ---- (decay:%f)" %
            (lrSchedulerOpt["decay"]))

        lrDecay = lrSchedulerOpt["decay"]
        lrSched = LambdaLR(optimizer, lambda epoch: 1 /
                           (1.0 + epoch * lrDecay))
    else:
        raise ArgumentError(
            "LR Scheduler is invalid (%s). You should choose one of these: step, exp and linear "
            % lrSchedulerOpt["type"])

    def scoreDistanceTrans(output):
        # The trainer returns (loss, y_pred, y) and the evaluator (y_pred, y).
        if len(output) == 3:
            _, y_pred, y = output
        else:
            y_pred, y = output

        # NOTE(review): any loss that is neither NLLLoss nor CosineLoss makes
        # this function return None. Confirm whether NeculoiuLoss is a
        # subclass of CosineLoss.
        if isinstance(lossFn, NLLLoss):
            return torch.exp(y_pred[:, 1]), y
        elif isinstance(lossFn, CosineLoss):
            return y_pred, (y * 2) - 1

    # Set training functions
    def trainingIteration(engine, batch):
        model.train()
        optimizer.zero_grad()
        (bug1, bug2), y = pairCollate.to(batch, device)
        output = model(bug1, bug2)
        loss = lossFn(output, y)
        loss.backward()
        optimizer.step()
        return loss, output, y

    trainer = Engine(trainingIteration)
    negTarget = 0.0 if isinstance(lossFn, NLLLoss) else -1.0

    trainingMetrics = {
        'training_loss':
        AverageLoss(lossFn),
        'training_dist_target':
        MeanScoreDistance(negTarget=negTarget,
                          output_transform=scoreDistanceTrans),
        'training_confusion_matrix':
        ConfusionMatrix(2, output_transform=lambda x: (x[1], x[2])),
    }

    # Add metrics to trainer
    for name, metric in trainingMetrics.items():
        metric.attach(trainer, name)

    # Set validation functions
    def validationIteration(engine, batch):
        model.eval()
        with torch.no_grad():
            (bug1, bug2), y = pairCollate.to(batch, device)
            y_pred = model(bug1, bug2)
            return y_pred, y

    validationMetrics = {
        'validation_loss':
        LossWrapper(lossFn),
        'validation_dist_target':
        MeanScoreDistance(negTarget=negTarget,
                          output_transform=scoreDistanceTrans),
        'validation_confusion_matrix':
        ConfusionMatrix(2),
    }
    evaluator = Engine(validationIteration)

    # Add metrics to evaluator
    for name, metric in validationMetrics.items():
        metric.attach(evaluator, name)

    @trainer.on(Events.EPOCH_STARTED)
    def onStartEpoch(engine):
        epoch = engine.state.epoch
        logger.info("Epoch: %d" % epoch)

        if lrSched:
            lrSched.step()

        logger.info("LR: %s" % str(optimizer.param_groups[0]["lr"]))

    @trainer.on(Events.EPOCH_COMPLETED)
    def onEndEpoch(engine):
        epoch = engine.state.epoch

        logConfusionMatrix(_run, logger, 'training_confusion_matrix',
                           engine.state.metrics['training_confusion_matrix'],
                           epoch)
        logMetrics(_run, logger, engine.state.metrics, epoch)

        # Evaluate Training
        if validationLoader:
            evaluator.run(validationLoader)
            logConfusionMatrix(
                _run, logger, 'validation_confusion_matrix',
                evaluator.state.metrics['validation_confusion_matrix'], epoch)
            logMetrics(_run, logger, evaluator.state.metrics, epoch)

        if recallEstimationTrainOpt and (epoch % args['rr_train_epoch'] == 0):
            logRankingResult(_run, logger, preselectListRankingTrain,
                             rankingScorer, bugReportDatabase, None, epoch,
                             "train")

        if recallEstimationOpt and (epoch % args['rr_val_epoch'] == 0):
            logRankingResult(_run, logger, preselectListRanking,
                             rankingScorer, bugReportDatabase,
                             args.get("ranking_result_file"), epoch,
                             "validation")

        # This handler only fires while training, so offlineGeneration is
        # always bound when it is read.
        if offlineGeneration:
            pairTrainingReader.sampleNewNegExamples(model, lossNoReduction)

        if args.get('save'):
            modelInfo = {
                'model': model.state_dict(),
                'params': parametersToSave
            }

            logger.info("==> Saving Model: %s" % args['save'])
            torch.save(modelInfo, args['save'])

    if args.get('pairs_training'):
        trainer.run(trainingLoader, max_epochs=args['epochs'])
    elif args.get('pairs_validation'):
        # Evaluation only: there is no training loader in this branch, so the
        # evaluator has to run on the validation data.
        evaluator.run(validationLoader)
        # Same argument layout as the other logMetrics calls; epoch 0 matches
        # the epoch given to logRankingResult below.
        logMetrics(_run, logger, evaluator.state.metrics, 0)

        if recallEstimationOpt:
            logRankingResult(_run, logger, preselectListRanking,
                             rankingScorer, bugReportDatabase,
                             args.get("ranking_result_file"), 0, "validation")

    # Test Dataset (accuracy, recall, precision, F1)
    pair_test_dataset = args.get('pair_test_dataset')

    if pair_test_dataset is not None and len(pair_test_dataset) > 0:
        pairTestReader = PairBugDatasetReader(pair_test_dataset, preprocessors)
        testLoader = DataLoader(pairTestReader,
                                batch_size=batchSize,
                                collate_fn=pairCollate.collate)

        logger.info("Test size: %s" % (len(testLoader.dataset)))

        testMetrics = {
            'test_accuracy': ignite.metrics.Accuracy(),
            'test_precision': ignite.metrics.Precision(),
            'test_recall': ignite.metrics.Recall(),
            'test_confusion_matrix': ConfusionMatrix(2),
            'test_predictions': PredictionCache(),
        }
        test_evaluator = Engine(validationIteration)

        # Add metrics to evaluator
        for name, metric in testMetrics.items():
            metric.attach(test_evaluator, name)

        test_evaluator.run(testLoader)

        for metricName, metricValue in test_evaluator.state.metrics.items():
            metric = testMetrics[metricName]

            if isinstance(metric, ignite.metrics.Accuracy):
                logger.info({
                    'type': 'metric',
                    'label': metricName,
                    'value': metricValue,
                    'epoch': None,
                    'correct': metric._num_correct,
                    'total': metric._num_examples
                })
                _run.log_scalar(metricName, metricValue)
            elif isinstance(metric,
                            (ignite.metrics.Precision, ignite.metrics.Recall)):
                # Index 1 is the value that is logged as the scalar metric.
                # np.float was only an alias of the builtin float and it was
                # removed from NumPy, so the builtin is used.
                logger.info({
                    'type':
                    'metric',
                    'label':
                    metricName,
                    'value':
                    float(metricValue.cpu().numpy()[1]),
                    'epoch':
                    None,
                    'tp':
                    metric._true_positives.cpu().numpy().tolist(),
                    'total_positive':
                    metric._positives.cpu().numpy().tolist()
                })
                _run.log_scalar(metricName, metricValue[1])
            elif isinstance(metric, ConfusionMatrix):
                acc = cmAccuracy(metricValue)
                prec = cmPrecision(metricValue, False)
                recall = cmRecall(metricValue, False)
                # The small constant avoids a division by zero.
                f1 = 2 * (prec * recall) / (prec + recall + 1e-15)

                logger.info({
                    'type':
                    'metric',
                    'label':
                    metricName,
                    'accuracy':
                    float(acc),
                    'precision':
                    prec.cpu().numpy().tolist(),
                    'recall':
                    recall.cpu().numpy().tolist(),
                    'f1':
                    f1.cpu().numpy().tolist(),
                    'confusion_matrix':
                    metricValue.cpu().numpy().tolist(),
                    'epoch':
                    None
                })

                _run.log_scalar('test_f1', f1[1])
            elif isinstance(metric, PredictionCache):
                logger.info({
                    'type': 'metric',
                    'label': metricName,
                    'predictions': metric.predictions
                })

    # Calculate recall rate
    recallRateOpt = args.get('recall_rate', {'type': 'none'})
    if recallRateOpt['type'] != 'none':
        if recallRateOpt['type'] == 'sun2011':
            logger.info("Calculating recall rate: {}".format(
                recallRateOpt['type']))
            recallRateDataset = BugDataset(recallRateOpt['dataset'])

            rankingClass = SunRanking(bugReportDatabase, recallRateDataset,
                                      recallRateOpt['window'])
            # We always group all bug reports by master in the results in the sun 2011 methodology
            group_by_master = True
        elif recallRateOpt['type'] == 'deshmukh':
            logger.info("Calculating recall rate: {}".format(
                recallRateOpt['type']))
            recallRateDataset = BugDataset(recallRateOpt['dataset'])
            rankingClass = DeshmukhRanking(bugReportDatabase,
                                           recallRateDataset)

            group_by_master = recallRateOpt["group_by_master"]
        else:
            raise ArgumentError(
                "recall_rate.type is invalid (%s). You should choose one of these: none, sun2011 and deshmukh"
                % recallRateOpt['type'])

        logRankingResult(_run, logger, rankingClass, rankingScorer,
                         bugReportDatabase, recallRateOpt["result_file"], 0,
                         None, group_by_master)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Process some integers.')
    parser.add_argument('--bug_data', required=True, help="")
    parser.add_argument('--dataset', required=True, help="")
    parser.add_argument('--n', required=True, type=int, help="")
    parser.add_argument('--type', required=True, help="")
    parser.add_argument('--aux_file', help="")
    parser.add_argument('--model', help="")

    logging.basicConfig(level=logging.DEBUG, datefmt='%Y-%m-%d %H:%M:%S')
    logger = logging.getLogger()
    args = parser.parse_args()
    logger.info(args)

    bugDataset = BugDataset(args.dataset)
    bugReportDatabase = BugReportDatabase.fromJson(args.bug_data)

    bugIds = bugDataset.bugIds
    duplicateBugs = bugDataset.duplicateIds

    if args.aux_file:
        '''
        In our methodology, we compare new bug with all the previously bugs that are in the database.
        To better generate pairs and triplets, we use the bugs that were reported before the ones 
        from the validation.
        '''
        auxBugDataset = BugDataset(args.aux_file)
        bugsFromMainFile = list(bugIds)
        bugsFromMainFileSet = set(bugIds)
Exemple #13
0
def main(_run, _config, _seed, _log):
    """
    Train and/or evaluate the CADD model for duplicate bug report detection.

    The function builds the preprocessors and input handlers selected by the
    configuration, creates the model, optionally restores a checkpoint, trains
    on pairs of bug reports (when ``pairs_training`` is set), evaluates on the
    validation pairs (when ``pairs_validation`` is set) and finally computes
    the recall rate requested in ``recall_rate``.

    :param _run: run object forwarded to ``logMetrics`` and
        ``logRankingResult``.
    :param _config: dict-like experiment configuration. Keys read here include
        ``bug_database``, ``batch_size``, ``cuda``, ``cache_folder``, ``load``,
        ``rep``, ``categorical``, ``compare_aggregation``, ``pairs_training``,
        ``pairs_validation``, ``neg_pair_generator``, ``optimizer``, ``lr``,
        ``l2``, ``lr_scheduler``, ``recall_estimation``,
        ``recall_estimation_train``, ``save`` and ``recall_rate``.
        When a checkpoint is loaded, its stored parameters overwrite the
        corresponding entries of this mapping.
    :param _seed: seed given to ``torch.manual_seed``.
    :param _log: logger used for every message emitted by this function.
    :return: None
    :raises ArgumentError: if the tokenizer, a summary/description model type,
        the negative pair generator, the optimizer, the LR scheduler or the
        recall rate type is not one of the supported values.
    :raises NotImplementedError: if an ``ELMo`` model type is requested.
    """
    # ------------------------------------------------------------------
    # Setting and loading parameters
    # ------------------------------------------------------------------
    args = _config
    logger = _log

    logger.info(args)
    logger.info('It started at: %s' % datetime.now())

    torch.manual_seed(_seed)

    bugReportDatabase = BugReportDatabase.fromJson(args['bug_database'])
    paddingSym = "</s>"
    batchSize = args['batch_size']

    # Single source of truth for the CUDA switch: it is needed by the
    # checkpoint map_location, the categorical encoder and the model.
    cudaOn = args['cuda']
    device = torch.device('cuda' if cudaOn else "cpu")

    if cudaOn:
        logger.info("Turning CUDA on")
    else:
        logger.info("Turning CUDA off")

    # It is the folder where the preprocessed information will be stored.
    cacheFolder = args['cache_folder']

    # Parameters stored together with the model weights; they are restored
    # into ``args`` when a checkpoint is loaded.
    importantParameters = ['compare_aggregation', 'categorical']
    parametersToSave = {
        parName: args[parName]
        for parName in importantParameters
    }

    if args['load'] is not None:
        mapLocation = (
            lambda storage, loc: storage.cuda()) if cudaOn else 'cpu'
        modelInfo = torch.load(args['load'], map_location=mapLocation)
        modelState = modelInfo['model']

        # The architecture options saved with the checkpoint take precedence
        # over the ones given in the current configuration.
        for paramName, paramValue in modelInfo['params'].items():
            args[paramName] = paramValue
    else:
        modelState = None

    # Optional REP model. When present it is used to generate negative
    # examples and the recommendation lists.
    if args['rep'] is not None and args['rep']['model']:
        logger.info("Loading REP")
        rep = read_weights(args['rep']['model'])
        rep_input, max_tkn_id = read_dbrd_file(args['rep']['input'], math.inf)
        rep_recommendation = args['rep']['k']

        rep.fit_transform(rep_input, max_tkn_id, True)

        # Index the REP inputs by report id.
        rep_input_by_id = {}

        for inp in rep_input:
            rep_input_by_id[inp[SUN_REPORT_ID_INDEX]] = inp

    else:
        rep = None

    preprocessors = PreprocessorList()
    inputHandlers = []

    categoricalOpt = args.get('categorical')

    if categoricalOpt is not None and len(categoricalOpt) != 0:
        categoricalEncoder, _, _ = processCategoricalParam(
            categoricalOpt, bugReportDatabase, inputHandlers, preprocessors,
            None, logger, cudaOn)
    else:
        categoricalEncoder = None

    compareAggOpt = args['compare_aggregation']
    databasePath = args['bug_database']

    # Loading word embedding
    # NOTE(review): ``lexicon`` and ``paddingId`` are only bound when a word
    # embedding is configured, yet the lstm/gru/word_emd/residual branches
    # below read them -- confirm those model types always come with one.
    if compareAggOpt["word_embedding"]:
        # todo: Allow use embeddings and other representation
        lexicon, embedding = Embedding.fromFile(
            compareAggOpt['word_embedding'],
            'UUUKNNN',
            hasHeader=False,
            paddingSym=paddingSym)
        logger.info("Lexicon size: %d" % (lexicon.getLen()))
        logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize()))
        paddingId = lexicon.getLexiconIndex(paddingSym)
    else:
        embedding = None

    # Tokenizer
    if compareAggOpt['tokenizer'] == 'default':
        logger.info("Use default tokenizer to tokenize summary information")
        tokenizer = MultiLineTokenizer()
    elif compareAggOpt['tokenizer'] == 'white_space':
        logger.info(
            "Use white space tokenizer to tokenize summary information")
        tokenizer = WhitespaceTokenizer()
    else:
        raise ArgumentError(
            "Tokenizer value %s is invalid. You should choose one of these: default and white_space"
            % compareAggOpt['tokenizer'])

    # Preparing input handlers, preprocessors and cache.
    # With a CNN aggregator the minimum input size is the largest window
    # (presumably so that every window fits in the input); -1 otherwise.
    aggregateOpt = compareAggOpt['aggregate']
    minSeqSize = max(
        aggregateOpt["window"]) if aggregateOpt["model"] == "cnn" else -1

    rnnModelTypes = ('lstm', 'gru', 'word_emd', 'residual')

    if compareAggOpt['summary'] is not None:
        logger.info("Using Summary information.")
        summaryOpt = compareAggOpt['summary']
        # Loading Filters
        sumFilters = loadFilters(summaryOpt['filters'])

        if summaryOpt['model_type'] in rnnModelTypes:
            # The cache is identified by everything that influences the
            # preprocessed output.
            arguments = (databasePath, compareAggOpt['word_embedding'],
                         ' '.join(
                             sorted([
                                 fil.__class__.__name__ for fil in sumFilters
                             ])), compareAggOpt['tokenizer'],
                         SummaryPreprocessor.__name__)

            inputHandlers.append(
                RNNInputHandler(paddingId, minInputSize=minSeqSize))

            summaryCache = PreprocessingCache(cacheFolder, arguments)
            summaryPreprocessor = SummaryPreprocessor(lexicon,
                                                      bugReportDatabase,
                                                      sumFilters, tokenizer,
                                                      paddingId, summaryCache)
        elif summaryOpt['model_type'] == 'ELMo':
            raise NotImplementedError("ELMO is not implemented!")
        elif summaryOpt['model_type'] == 'BERT':
            arguments = (databasePath, "CADD SUMMARY", "BERT",
                         "bert-base-uncased")

            inputHandlers.append(BERTInputHandler(0, minInputSize=minSeqSize))

            summaryCache = PreprocessingCache(cacheFolder, arguments)
            summaryPreprocessor = TransformerPreprocessor(
                "short_desc", "bert-base-uncased", BertTokenizer, 0,
                bugReportDatabase, summaryCache)
        else:
            # Fail with a clear message instead of a NameError on the
            # unbound preprocessor below.
            raise ArgumentError(
                "Summary model_type is invalid (%s). You should choose one of these: lstm, gru, word_emd, residual and BERT"
                % summaryOpt['model_type'])

        preprocessors.append(summaryPreprocessor)

    if compareAggOpt['desc'] is not None:
        logger.info("Using Description information.")
        descOpt = compareAggOpt['desc']
        descFilters = loadFilters(descOpt['filters'])

        if descOpt['model_type'] in rnnModelTypes:
            arguments = (databasePath, compareAggOpt['word_embedding'],
                         ' '.join(
                             sorted([
                                 fil.__class__.__name__ for fil in descFilters
                             ])), compareAggOpt['tokenizer'], "CADD DESC",
                         str(descOpt['summarization']))

            inputHandlers.append(
                RNNInputHandler(paddingId, minInputSize=minSeqSize))

            descriptionCache = PreprocessingCache(cacheFolder, arguments)
            descPreprocessor = DescriptionPreprocessor(lexicon,
                                                       bugReportDatabase,
                                                       descFilters,
                                                       tokenizer,
                                                       paddingId,
                                                       cache=descriptionCache)
        elif descOpt['model_type'] == 'ELMo':
            raise NotImplementedError("ELMO is not implemented!")
        elif descOpt['model_type'] == 'BERT':
            arguments = (databasePath, "CADD DESC", "BERT",
                         "bert-base-uncased")

            inputHandlers.append(BERTInputHandler(0, minInputSize=minSeqSize))

            descriptionCache = PreprocessingCache(cacheFolder, arguments)
            descPreprocessor = TransformerPreprocessor("description",
                                                       "bert-base-uncased",
                                                       BertTokenizer, 0,
                                                       bugReportDatabase,
                                                       descriptionCache)
        else:
            raise ArgumentError(
                "Description model_type is invalid (%s). You should choose one of these: lstm, gru, word_emd, residual and BERT"
                % descOpt['model_type'])

        preprocessors.append(descPreprocessor)

    # Create model
    model = CADD(embedding,
                 categoricalEncoder,
                 compareAggOpt,
                 compareAggOpt['summary'],
                 compareAggOpt['desc'],
                 compareAggOpt['matching'],
                 compareAggOpt['aggregate'],
                 cudaOn=cudaOn)

    lossFn = F.nll_loss
    # Per-example loss, handed to the training reader when it samples new
    # negative examples at the end of each epoch.
    lossNoReduction = NLLLoss(reduction='none')

    if cudaOn:
        model.cuda()

    if modelState:
        model.load_state_dict(modelState)

    # ------------------------------------------------------------------
    # Loading the training and validation. Also, it sets how the negative
    # example will be generated.
    # ------------------------------------------------------------------
    cmpAggCollate = PairBugCollate(inputHandlers, torch.int64)

    # load training
    if args.get('pairs_training'):
        negativePairGenOpt = args.get('neg_pair_generator')
        pairTrainingFile = args.get('pairs_training')

        offlineGeneration = not (negativePairGenOpt is None
                                 or negativePairGenOpt['type'] == 'none')
        masterIdByBugId = bugReportDatabase.getMasterIdByBugId()
        # The generator options are optional (see offlineGeneration), so do
        # not subscript them when they are missing.
        randomAnchor = (None if negativePairGenOpt is None else
                        negativePairGenOpt['random_anchor'])

        def loadGeneratorBugIds(logInfo=True):
            """Load the dataset named by neg_pair_generator.training and
            return it together with its bug ids."""
            dataset = BugDataset(negativePairGenOpt['training'])

            if logInfo:
                logger.info(
                    "Using the following dataset to generate negative examples: %s. Number of bugs in the training: %d"
                    % (dataset.info, len(dataset.bugIds)))

            return dataset, dataset.bugIds

        if rep:
            logger.info("Generate negative examples using REP.")
            trainingDataset = BugDataset(args['rep']['training'])

            bugIds = trainingDataset.bugIds
            negativePairGenerator = REPGenerator(rep, rep_input_by_id,
                                                 args['rep']['neg_training'],
                                                 preprocessors, bugIds,
                                                 masterIdByBugId,
                                                 args['rep']['rate'],
                                                 randomAnchor)
        elif not offlineGeneration:
            logger.info("Not generate dynamically the negative examples.")
            negativePairGenerator = None
        else:
            pairGenType = negativePairGenOpt['type']

            if pairGenType == 'random':
                logger.info("Random Negative Pair Generator")
                trainingDataset, bugIds = loadGeneratorBugIds()

                negativePairGenerator = RandomGenerator(
                    preprocessors,
                    cmpAggCollate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    masterIdByBugId,
                    randomAnchor=randomAnchor)

            elif pairGenType == 'non_negative':
                logger.info("Non Negative Pair Generator")
                trainingDataset, bugIds = loadGeneratorBugIds()

                negativePairGenerator = NonNegativeRandomGenerator(
                    preprocessors,
                    cmpAggCollate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    masterIdByBugId,
                    negativePairGenOpt['n_tries'],
                    device,
                    randomAnchor=randomAnchor)
            elif pairGenType == 'misc_non_zero':
                logger.info("Misc Non Zero Pair Generator")
                trainingDataset, bugIds = loadGeneratorBugIds()

                negativePairGenerator = MiscNonZeroRandomGen(
                    preprocessors,
                    cmpAggCollate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    trainingDataset.duplicateIds,
                    masterIdByBugId,
                    negativePairGenOpt['n_tries'],
                    device,
                    randomAnchor=randomAnchor)
            elif pairGenType == 'random_k':
                logger.info("Random K Negative Pair Generator")
                trainingDataset, bugIds = loadGeneratorBugIds()

                negativePairGenerator = KRandomGenerator(
                    preprocessors,
                    cmpAggCollate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    masterIdByBugId,
                    negativePairGenOpt['k'],
                    device,
                    randomAnchor=randomAnchor)
            elif pairGenType == "pre":
                logger.info("Pre-selected list generator")
                negativePairGenerator = PreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'],
                    preprocessors,
                    negativePairGenOpt['rate'],
                    masterIdByBugId,
                    negativePairGenOpt['preselected_length'],
                    randomAnchor=randomAnchor)

            elif pairGenType == "positive_pre":
                logger.info("Positive Pre-selected list generator")
                negativePairGenerator = PositivePreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'],
                    preprocessors,
                    cmpAggCollate,
                    negativePairGenOpt['rate'],
                    masterIdByBugId,
                    negativePairGenOpt['preselected_length'],
                    randomAnchor=randomAnchor)
            elif pairGenType == "misc_non_zero_pre":
                logger.info("Misc: non-zero and Pre-selected list generator")
                negativePairGenerator1 = PreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'],
                    preprocessors,
                    negativePairGenOpt['rate'],
                    masterIdByBugId,
                    negativePairGenOpt['preselected_length'],
                    randomAnchor=randomAnchor)

                trainingDataset, bugIds = loadGeneratorBugIds(logInfo=False)

                negativePairGenerator2 = NonNegativeRandomGenerator(
                    preprocessors,
                    cmpAggCollate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    masterIdByBugId,
                    negativePairGenOpt['n_tries'],
                    device,
                    randomAnchor=randomAnchor)

                negativePairGenerator = MiscOfflineGenerator(
                    (negativePairGenerator1, negativePairGenerator2))
            elif pairGenType == "misc_non_zero_positive_pre":
                logger.info(
                    "Misc: non-zero and Positive Pre-selected list generator")
                negativePairGenerator1 = PositivePreSelectedGenerator(
                    negativePairGenOpt['pre_list_file'],
                    preprocessors,
                    cmpAggCollate,
                    negativePairGenOpt['rate'],
                    masterIdByBugId,
                    negativePairGenOpt['preselected_length'],
                    randomAnchor=randomAnchor)

                trainingDataset, bugIds = loadGeneratorBugIds(logInfo=False)

                negativePairGenerator2 = NonNegativeRandomGenerator(
                    preprocessors,
                    cmpAggCollate,
                    negativePairGenOpt['rate'],
                    bugIds,
                    masterIdByBugId,
                    negativePairGenOpt['n_tries'],
                    device,
                    randomAnchor=randomAnchor)

                negativePairGenerator = MiscOfflineGenerator(
                    (negativePairGenerator1, negativePairGenerator2))

            else:
                raise ArgumentError(
                    "Offline generator is invalid (%s). You should choose one of these: random, hard and pre"
                    % pairGenType)

        pairTrainingReader = PairBugDatasetReader(
            pairTrainingFile,
            preprocessors,
            negativePairGenerator,
            randomInvertPair=args['random_switch'])
        trainingCollate = cmpAggCollate
        trainingLoader = DataLoader(pairTrainingReader,
                                    batch_size=batchSize,
                                    collate_fn=trainingCollate.collate,
                                    shuffle=True)
        logger.info("Training size: %s" % (len(trainingLoader.dataset)))

    # load validation
    if args.get('pairs_validation'):
        pairValidationReader = PairBugDatasetReader(
            args.get('pairs_validation'), preprocessors)
        validationLoader = DataLoader(pairValidationReader,
                                      batch_size=batchSize,
                                      collate_fn=cmpAggCollate.collate)

        logger.info("Validation size: %s" % (len(validationLoader.dataset)))
    else:
        validationLoader = None

    # ------------------------------------------------------------------
    # Training and evaluate the model.
    # ------------------------------------------------------------------
    optimizer_opt = args.get('optimizer', 'adam')

    if optimizer_opt == 'sgd':
        logger.info('SGD')
        optimizer = optim.SGD(model.parameters(),
                              lr=args['lr'],
                              weight_decay=args['l2'])
    elif optimizer_opt == 'adam':
        logger.info('Adam')
        optimizer = optim.Adam(model.parameters(),
                               lr=args['lr'],
                               weight_decay=args['l2'])
    else:
        # Without this branch an unknown value would only surface later as a
        # NameError on ``optimizer``.
        raise ArgumentError(
            "Optimizer is invalid (%s). You should choose one of these: sgd and adam"
            % optimizer_opt)

    # Recall rate
    rankingScorer = GeneralScorer(model, preprocessors, device, cmpAggCollate)
    recallEstimationTrainOpt = args.get('recall_estimation_train')

    if recallEstimationTrainOpt:
        preselectListRankingTrain = PreselectListRanking(
            recallEstimationTrainOpt, args['sample_size_rr_tr'])

    recallEstimationOpt = args.get('recall_estimation')

    if recallEstimationOpt:
        preselectListRanking = PreselectListRanking(recallEstimationOpt,
                                                    args['sample_size_rr_val'])

    # LR scheduler
    lrSchedulerOpt = args.get('lr_scheduler', None)

    if lrSchedulerOpt is None:
        logger.info("Scheduler: Constant")
        lrSched = None
    elif lrSchedulerOpt["type"] == 'step':
        # Log the decay that is actually given to the scheduler.
        logger.info("Scheduler: StepLR (step:%s, decay:%f)" %
                    (lrSchedulerOpt["step_size"], lrSchedulerOpt["decay"]))
        lrSched = StepLR(optimizer, lrSchedulerOpt["step_size"],
                         lrSchedulerOpt["decay"])
    elif lrSchedulerOpt["type"] == 'exp':
        logger.info("Scheduler: ExponentialLR (decay:%f)" %
                    (lrSchedulerOpt["decay"]))
        lrSched = ExponentialLR(optimizer, lrSchedulerOpt["decay"])
    elif lrSchedulerOpt["type"] == 'linear':
        logger.info(
            "Scheduler: Divide by (1 + epoch * decay) ---- (decay:%f)" %
            (lrSchedulerOpt["decay"]))

        lrDecay = lrSchedulerOpt["decay"]
        lrSched = LambdaLR(optimizer, lambda epoch: 1 /
                           (1.0 + epoch * lrDecay))
    else:
        raise ArgumentError(
            "LR Scheduler is invalid (%s). You should choose one of these: step, exp and linear "
            % lrSchedulerOpt["type"])

    # Set training functions
    def trainingIteration(engine, batch):
        """Run one optimisation step and return (loss, output, target)."""
        engine.kk = 0

        model.train()
        optimizer.zero_grad()
        x, y = batch
        output = model(*x)
        loss = lossFn(output, y)
        loss.backward()
        optimizer.step()
        return loss, output, y

    def scoreDistanceTrans(output):
        """Adapt an engine output to the (score, target) pair expected by
        MeanScoreDistance. Training outputs carry the loss as first item."""
        if len(output) == 3:
            _, y_pred, y = output
        else:
            y_pred, y = output

        if lossFn == F.nll_loss:
            # The model output is exponentiated (presumably log-probabilities,
            # as required by nll_loss); column 1 is used as the score.
            return torch.exp(y_pred[:, 1]), y

    trainer = Engine(trainingIteration)
    trainingMetrics = {
        'training_loss':
        AverageLoss(lossFn, batch_size=lambda x: x[0].shape[0]),
        'training_dist_target':
        MeanScoreDistance(output_transform=scoreDistanceTrans)
    }

    # Add metrics to trainer
    for name, metric in trainingMetrics.items():
        metric.attach(trainer, name)

    # Set validation functions
    def validationIteration(engine, batch):
        """Return (prediction, target) for a batch without tracking
        gradients."""
        if not hasattr(engine, 'kk'):
            engine.kk = 0

        model.eval()
        with torch.no_grad():
            x, y = batch
            y_pred = model(*x)

            return y_pred, y

    validationMetrics = {
        'validation_loss':
        ignite.metrics.Loss(lossFn),
        'validation_dist_target':
        MeanScoreDistance(output_transform=scoreDistanceTrans)
    }
    evaluator = Engine(validationIteration)

    # Add metrics to evaluator
    for name, metric in validationMetrics.items():
        metric.attach(evaluator, name)

    # recommendation
    if rep:
        recommendation_fn = REP_CADD_Recommender(
            rep, rep_input_by_id,
            rep_recommendation).generateRecommendationList
    else:
        recommendation_fn = generateRecommendationList

    @trainer.on(Events.EPOCH_STARTED)
    def onStartEpoch(engine):
        epoch = engine.state.epoch
        logger.info("Epoch: %d" % epoch)

        if lrSched:
            lrSched.step()

        logger.info("LR: %s" % str(optimizer.param_groups[0]["lr"]))

    @trainer.on(Events.EPOCH_COMPLETED)
    def onEndEpoch(engine):
        epoch = engine.state.epoch

        logMetrics(_run, logger, engine.state.metrics, epoch)

        # Evaluate Training
        if validationLoader:
            evaluator.run(validationLoader)
            logMetrics(_run, logger, evaluator.state.metrics, epoch)

        if recallEstimationTrainOpt and (epoch % args['rr_train_epoch'] == 0):
            logRankingResult(_run,
                             logger,
                             preselectListRankingTrain,
                             rankingScorer,
                             bugReportDatabase,
                             None,
                             epoch,
                             "train",
                             recommendationListfn=recommendation_fn)
            rankingScorer.free()

        if recallEstimationOpt and (epoch % args['rr_val_epoch'] == 0):
            logRankingResult(_run,
                             logger,
                             preselectListRanking,
                             rankingScorer,
                             bugReportDatabase,
                             args.get("ranking_result_file"),
                             epoch,
                             "validation",
                             recommendationListfn=recommendation_fn)
            rankingScorer.free()

        # Refresh the negative examples for the next epoch. This handler only
        # fires while training, so the reader is always defined here.
        pairTrainingReader.sampleNewNegExamples(model, lossNoReduction)

        if args.get('save'):
            save_by_epoch = args['save_by_epoch']

            # Epochs listed in save_by_epoch get their own file; every other
            # epoch overwrites the main checkpoint.
            if save_by_epoch and epoch in save_by_epoch:
                file_name, file_extension = os.path.splitext(args['save'])
                file_path = file_name + '_epoch_{}'.format(
                    epoch) + file_extension
            else:
                file_path = args['save']

            modelInfo = {
                'model': model.state_dict(),
                'params': parametersToSave
            }

            logger.info("==> Saving Model: %s" % file_path)
            torch.save(modelInfo, file_path)

    if args.get('pairs_training'):
        trainer.run(trainingLoader, max_epochs=args['epochs'])
    elif args.get('pairs_validation'):
        # Evaluation-only mode: no training pairs were given.
        evaluator.run(validationLoader)
        logMetrics(_run, logger, evaluator.state.metrics, 0)

        if recallEstimationOpt:
            logRankingResult(_run,
                             logger,
                             preselectListRanking,
                             rankingScorer,
                             bugReportDatabase,
                             args.get("ranking_result_file"),
                             0,
                             "validation",
                             recommendationListfn=recommendation_fn)

    recallRateOpt = args.get('recall_rate', {'type': 'none'})
    if recallRateOpt['type'] != 'none':
        if recallRateOpt['type'] == 'sun2011':
            logger.info("Calculating recall rate: {}".format(
                recallRateOpt['type']))
            recallRateDataset = BugDataset(recallRateOpt['dataset'])

            rankingClass = SunRanking(bugReportDatabase, recallRateDataset,
                                      recallRateOpt['window'])
            # We always group all bug reports by master in the results in the sun 2011 methodology
            group_by_master = True
        elif recallRateOpt['type'] == 'deshmukh':
            logger.info("Calculating recall rate: {}".format(
                recallRateOpt['type']))
            recallRateDataset = BugDataset(recallRateOpt['dataset'])
            rankingClass = DeshmukhRanking(bugReportDatabase,
                                           recallRateDataset)
            group_by_master = recallRateOpt['group_by_master']
        else:
            raise ArgumentError(
                "recall_rate.type is invalid (%s). You should choose one of these: sun2011 and deshmukh"
                % recallRateOpt['type'])

        logRankingResult(_run,
                         logger,
                         rankingClass,
                         rankingScorer,
                         bugReportDatabase,
                         recallRateOpt["result_file"],
                         0,
                         None,
                         group_by_master,
                         recommendationListfn=recommendation_fn)