def processCategoricalParam(categoricalOpt, bugReportDatabase, inputHandlers, preprocessors, encoders, logger):
    logger.info("Using Categorical Information.")

    categoricalPreprocessors, categoricalLexicons = createCategoricalPreprocessorAndLexicons(
        categoricalOpt['lexicons'], bugReportDatabase)
    handler = BasicInputHandler(transpose_input=True)

    if inputHandlers is not None:
        inputHandlers.append(handler)

    if preprocessors is not None:
        preprocessors.append(categoricalPreprocessors)

    # Create model
    embeddingSize = categoricalOpt.get('emb_size', 20)
    hiddenSizes = categoricalOpt.get('hidden_sizes')
    batchNorm = categoricalOpt.get('batch_normalization', False)
    layerNorm = categoricalOpt.get('layer_norm', False)
    dropout = categoricalOpt.get('dropout', 0.0)
    actFunc = loadActivationClass(categoricalOpt.get('activation'))
    bnLastLayer = categoricalOpt.get('bn_last_layer', False)

    categoricalEncoder = CategoricalEncoder(categoricalLexicons, embeddingSize, hiddenSizes, actFunc, batchNorm,
                                            bnLastLayer, dropout, layerNorm)

    if encoders is not None:
        encoders.append(categoricalEncoder)

    return categoricalEncoder, handler, categoricalPreprocessors
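
# Illustrative usage sketch (an assumption, not part of the original wiring): the option
# keys below are exactly the ones read by processCategoricalParam; the lexicon path,
# bugReportDatabase and logger are hypothetical placeholders created elsewhere.
#
#   categoricalOpt = {
#       'lexicons': 'dataset/categorical_lexicons.json',  # hypothetical path
#       'emb_size': 20,
#       'hidden_sizes': [100],
#       'batch_normalization': False,
#       'layer_norm': False,
#       'dropout': 0.0,
#       'activation': 'relu',
#       'bn_last_layer': False,
#   }
#   inputHandlers, preprocessors, encoders = [], [], []
#   catEncoder, catHandler, catPreproc = processCategoricalParam(
#       categoricalOpt, bugReportDatabase, inputHandlers, preprocessors, encoders, logger)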
def processSumDescParam(sum_desc_opts, bugReportDatabase, inputHandlers, preprocessors, encoders, cacheFolder,
                        databasePath, logger, paddingSym):
    # Use summary and description (concatenated) to address this problem
    logger.info("Using summary and description information.")

    # Loading word embedding
    lexicon, embedding = load_embedding(sum_desc_opts, paddingSym)
    logger.info("Lexicon size: %d" % (lexicon.getLen()))
    logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize()))
    paddingId = lexicon.getLexiconIndex(paddingSym)

    # Loading Filters
    filters = loadFilters(sum_desc_opts['filters'])

    # Tokenizer
    if sum_desc_opts['tokenizer'] == 'default':
        logger.info("Use default tokenizer to tokenize summary+description information")
        tokenizer = MultiLineTokenizer()
    elif sum_desc_opts['tokenizer'] == 'white_space':
        logger.info("Use white space tokenizer to tokenize summary+description information")
        tokenizer = WhitespaceTokenizer()
    else:
        raise ArgumentError(
            "Tokenizer value %s is invalid. You should choose one of these: default and white_space"
            % sum_desc_opts['tokenizer'])

    arguments = (
        databasePath, sum_desc_opts['word_embedding'],
        ' '.join(sorted([fil.__class__.__name__ for fil in filters])),
        sum_desc_opts['tokenizer'], "summary_description")

    cacheSumDesc = PreprocessingCache(cacheFolder, arguments)
    sumDescPreprocessor = SummaryDescriptionPreprocessor(lexicon, bugReportDatabase, filters, tokenizer, paddingId,
                                                         cacheSumDesc)
    preprocessors.append(sumDescPreprocessor)

    if sum_desc_opts['encoder_type'] == 'cnn':
        windowSizes = sum_desc_opts.get('window_sizes', [3])
        nFilters = sum_desc_opts.get('nfilters', 100)
        updateEmb = sum_desc_opts.get('update_embedding', False)
        actFunc = loadActivationFunction(sum_desc_opts.get('activation', 'relu'))
        batchNorm = sum_desc_opts.get('batch_normalization', False)
        dropout = sum_desc_opts.get('dropout', 0.0)

        sumDescEncoder = TextCNN(windowSizes, nFilters, embedding, updateEmb, actFunc, batchNorm, dropout)
        encoders.append(sumDescEncoder)
        inputHandlers.append(TextCNNInputHandler(paddingId, max(windowSizes)))
    elif sum_desc_opts['encoder_type'] == 'cnn+dense':
        windowSizes = sum_desc_opts.get('window_sizes', [3])
        nFilters = sum_desc_opts.get('nfilters', 100)
        updateEmb = sum_desc_opts.get('update_embedding', False)
        actFunc = loadActivationFunction(sum_desc_opts.get('activation', 'relu'))
        batchNorm = sum_desc_opts.get('batch_normalization', False)
        dropout = sum_desc_opts.get('dropout', 0.0)
        hiddenSizes = sum_desc_opts.get('hidden_sizes')
        hiddenAct = loadActivationClass(sum_desc_opts.get('hidden_act'))
        hiddenDropout = sum_desc_opts.get('hidden_dropout')
        batchLast = sum_desc_opts.get("bn_last_layer", False)

        cnnEnc = TextCNN(windowSizes, nFilters, embedding, updateEmb, actFunc, batchNorm, dropout)
        sumDescEncoder = MultilayerDense(cnnEnc, hiddenSizes, hiddenAct, batchNorm, batchLast, hiddenDropout)
        encoders.append(sumDescEncoder)
        inputHandlers.append(TextCNNInputHandler(paddingId, max(windowSizes)))
    elif sum_desc_opts['encoder_type'] == 'word_mean':
        standardization = sum_desc_opts.get('standardization', False)
        dropout = sum_desc_opts.get('dropout', 0.0)
        updateEmb = sum_desc_opts.get('update_embedding', False)
        # Optional batch normalization over the averaged word vectors
        batch_normalization = sum_desc_opts.get('batch_normalization', False)
        hiddenSize = sum_desc_opts.get('hidden_size')

        sumDescEncoder = WordMean(embedding, updateEmb, hiddenSize, standardization, dropout, batch_normalization)
        encoders.append(sumDescEncoder)
        inputHandlers.append(RNNInputHandler(paddingId))
    else:
        raise ArgumentError(
            "Encoder type of summary and description is invalid (%s). You should choose one of these: "
            "cnn, cnn+dense and word_mean" % sum_desc_opts['encoder_type'])
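
# Illustrative usage sketch (an assumption): a minimal 'cnn' configuration built only from
# keys that processSumDescParam reads; the embedding path, filter names, and the shared
# objects (bugReportDatabase, cacheFolder, databasePath, logger, paddingSym) are
# hypothetical placeholders assumed to be set up elsewhere.
#
#   sum_desc_opts = {
#       'word_embedding': 'embeddings/glove_300d.txt',  # hypothetical path
#       'filters': [],                                   # accepted names depend on loadFilters
#       'tokenizer': 'white_space',
#       'encoder_type': 'cnn',
#       'window_sizes': [3, 4, 5],
#       'nfilters': 100,
#       'update_embedding': False,
#       'activation': 'relu',
#       'batch_normalization': False,
#       'dropout': 0.0,
#   }
#   processSumDescParam(sum_desc_opts, bugReportDatabase, inputHandlers, preprocessors,
#                       encoders, cacheFolder, databasePath, logger, paddingSym)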
def processDescriptionParam(descOpts, bugReportDatabase, inputHandlers, preprocessors, encoders, databasePath,
                            cacheFolder, logger, paddingSym):
    # Use only the description to address this problem
    logger.info("Using Description information.")

    # Loading word embedding
    lexicon, embedding = load_embedding(descOpts, paddingSym)
    logger.info("Lexicon size: %d" % (lexicon.getLen()))
    logger.info("Word Embedding size: %d" % (embedding.getEmbeddingSize()))
    paddingId = lexicon.getLexiconIndex(paddingSym)

    # Loading Filters
    filters = loadFilters(descOpts['filters'])

    # Tokenizer
    if descOpts['tokenizer'] == 'default':
        logger.info("Use default tokenizer to tokenize description information")
        tokenizer = MultiLineTokenizer()
    elif descOpts['tokenizer'] == 'white_space':
        logger.info("Use white space tokenizer to tokenize description information")
        tokenizer = WhitespaceTokenizer()
    else:
        raise ArgumentError(
            "Tokenizer value %s is invalid. You should choose one of these: default and white_space"
            % descOpts['tokenizer'])

    arguments = (
        databasePath, descOpts['word_embedding'], str(descOpts['lexicon']),
        ' '.join(sorted([fil.__class__.__name__ for fil in filters])),
        descOpts['tokenizer'], "description")

    descCache = PreprocessingCache(cacheFolder, arguments)
    descPreprocessor = DescriptionPreprocessor(lexicon, bugReportDatabase, filters, tokenizer, paddingId, descCache)
    preprocessors.append(descPreprocessor)

    if descOpts['encoder_type'] == 'rnn':
        rnnType = descOpts.get('rnn_type')
        hiddenSize = descOpts.get('hidden_size')
        bidirectional = descOpts.get('bidirectional', False)
        numLayers = descOpts.get('num_layers', 1)
        dropout = descOpts.get('dropout', 0.0)
        updateEmb = descOpts.get('update_embedding', False)
        fixedOpt = descOpts.get('fixed_opt', False)

        descRNN = SortedRNNEncoder(rnnType, embedding, hiddenSize, numLayers, bidirectional, updateEmb, dropout)

        if fixedOpt == 'self_att':
            att = SelfAttention(descRNN.getOutputSize(), descOpts['self_att_hidden'], descOpts['n_hops'])
            descEncoder = RNN_Self_Attention(descRNN, att, paddingId, dropout)
        else:
            descEncoder = RNNFixedOuput(descRNN, fixedOpt, dropout)

        encoders.append(descEncoder)
        inputHandlers.append(RNNInputHandler(paddingId))
    elif descOpts['encoder_type'] == 'cnn':
        windowSizes = descOpts.get('window_sizes', [3])
        nFilters = descOpts.get('nfilters', 100)
        updateEmb = descOpts.get('update_embedding', False)
        actFunc = loadActivationFunction(descOpts.get('activation', 'relu'))
        batchNorm = descOpts.get('batch_normalization', False)
        dropout = descOpts.get('dropout', 0.0)

        descEncoder = TextCNN(windowSizes, nFilters, embedding, updateEmb, actFunc, batchNorm, dropout)
        encoders.append(descEncoder)
        inputHandlers.append(TextCNNInputHandler(paddingId, max(windowSizes)))
    elif descOpts['encoder_type'] == 'cnn+dense':
        windowSizes = descOpts.get('window_sizes', [3])
        nFilters = descOpts.get('nfilters', 100)
        updateEmb = descOpts.get('update_embedding', False)
        actFunc = loadActivationFunction(descOpts.get('activation', 'relu'))
        batchNorm = descOpts.get('batch_normalization', False)
        dropout = descOpts.get('dropout', 0.0)
        hiddenSizes = descOpts.get('hidden_sizes')
        hiddenAct = loadActivationClass(descOpts.get('hidden_act'))
        hiddenDropout = descOpts.get('hidden_dropout')
        batchLast = descOpts.get("bn_last_layer", False)

        cnnEnc = TextCNN(windowSizes, nFilters, embedding, updateEmb, actFunc, batchNorm, dropout)
        descEncoder = MultilayerDense(cnnEnc, hiddenSizes, hiddenAct, batchNorm, batchLast, hiddenDropout)
        encoders.append(descEncoder)
        inputHandlers.append(TextCNNInputHandler(paddingId, max(windowSizes)))
    elif descOpts['encoder_type'] == 'dense+self_att':
        dropout = descOpts.get('dropout', 0.0)
        hiddenSize = descOpts.get('hidden_size')
        self_att_hidden = descOpts['self_att_hidden']
        n_hops = descOpts['n_hops']
        updateEmb = descOpts.get('update_embedding', False)

        descEncoder = Dense_Self_Attention(embedding, hiddenSize, self_att_hidden, n_hops, paddingId, updateEmb,
                                           dropout=dropout)
        encoders.append(descEncoder)
        inputHandlers.append(TextCNNInputHandler(paddingId, -1))
    elif descOpts['encoder_type'] == 'word_mean':
        standardization = descOpts.get('standardization', False)
        dropout = descOpts.get('dropout', 0.0)
        updateEmb = descOpts.get('update_embedding', False)
        # Optional batch normalization over the averaged word vectors
        batch_normalization = descOpts.get('batch_normalization', False)
        hiddenSize = descOpts.get('hidden_size')

        descEncoder = WordMean(embedding, updateEmb, hiddenSize, standardization, dropout, batch_normalization)
        encoders.append(descEncoder)
        inputHandlers.append(RNNInputHandler(paddingId))
    else:
        raise ArgumentError(
            "Encoder type of description is invalid (%s). You should choose one of these: "
            "rnn, cnn, cnn+dense, dense+self_att and word_mean" % descOpts['encoder_type'])
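
# Illustrative usage sketch (an assumption): an 'rnn' configuration using only keys that
# processDescriptionParam reads; the path, the 'rnn_type' and 'fixed_opt' values, and the
# shared objects are hypothetical placeholders (their accepted values depend on
# SortedRNNEncoder and RNNFixedOuput, which are defined elsewhere).
#
#   descOpts = {
#       'word_embedding': 'embeddings/glove_300d.txt',  # hypothetical path
#       'lexicon': None,
#       'filters': [],
#       'tokenizer': 'default',
#       'encoder_type': 'rnn',
#       'rnn_type': 'gru',        # hypothetical value, interpreted by SortedRNNEncoder
#       'hidden_size': 300,
#       'bidirectional': True,
#       'num_layers': 1,
#       'dropout': 0.0,
#       'update_embedding': False,
#       'fixed_opt': 'self_att',  # any other value is passed straight to RNNFixedOuput
#       'self_att_hidden': 100,
#       'n_hops': 20,
#   }
#   processDescriptionParam(descOpts, bugReportDatabase, inputHandlers, preprocessors,
#                           encoders, databasePath, cacheFolder, logger, paddingSym)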