#----------------------------------------------------------------------------------------------------------------------
# 2. Data Preprocessing
#----------------------------------------------------------------------------------------------------------------------
# Builds one vocabulary from the 'Context' column and one from the 'Label'
# column of df_train, keeps the raw context/label values in row order, applies
# the config.min_frequency filter to both vocabularies, and records the size of
# the filtered input vocabulary in context_vocab_size.
print('2. Data Preprocessing....')
logger.info("Data Preprocessing for train dataset startting.....")
logger.info(
    "Creating input vocabulary for context and output vocabulary for lable/method name"
)

input_vocab = Vocabulary("Context")
output_vocab = Vocabulary("label")
contexts = []
labels = []

# Walk the two columns in lockstep rather than via DataFrame.iterrows(): the
# row index was never used, and iterrows() materialises a Series per row.
# The per-row call order (input vocab, contexts, output vocab, labels) is kept.
for context, label in zip(df_train['Context'], df_train['Label']):
    input_vocab.addSentence(context)
    contexts.append(context)
    output_vocab.addSentence(label)
    labels.append(label)

logger.info("Number of word in input vocabulary: %d", input_vocab.n_words)
logger.info("Number of word in output vocabulary: %d", output_vocab.n_words)

# NOTE(review): per the log message below, removeWordLessThan presumably drops
# words seen fewer than config.min_frequency times -- confirm in Vocabulary.
logger.info(
    "Removing the words that appear less than %d in input and output vocabulary",
    config.min_frequency)
input_vocab.removeWordLessThan(config.min_frequency)
output_vocab.removeWordLessThan(config.min_frequency)

logger.info("After Filtering")
logger.info("Number of word in input vocabulary: %d", input_vocab.n_words)
logger.info("Number of word in output vocabulary: %d", output_vocab.n_words)

# Read after filtering, so this is the size of the filtered input vocabulary.
context_vocab_size = input_vocab.n_words