コード例 #1
0
ファイル: __main__.py プロジェクト: ocktree/CMPT470
#----------------------------------------------------------------------------------------------------------------------
# 2. Data Preprocessing
#
# Builds two Vocabulary objects from the training frame (one fed from the
# 'Context' column, one from the 'Label' column), keeps the raw column values
# in `contexts` / `labels` in row order, and then filters both vocabularies
# with config.min_frequency (per the log message below: words that appear
# fewer than that many times are removed).
#----------------------------------------------------------------------------------------------------------------------
print('2. Data Preprocessing....')
logger.info("Data Preprocessing for train dataset startting.....")
logger.info(
    "Creating input vocabulary for context and output vocabulary for lable/method name"
)
input_vocab = Vocabulary("Context")
output_vocab = Vocabulary("label")

# Pull the two columns out directly instead of walking DataFrame.iterrows():
# iterrows() materialises a Series for every row and its index was never used.
# NOTE(review): assumes both columns hold plain Python objects (strings), so
# the values handed to addSentence are the same as before - confirm.
contexts = df_train['Context'].tolist()
labels = df_train['Label'].tolist()

# Feed the vocabularies row by row, alternating input/output exactly as the
# original loop did.
for context, label in zip(contexts, labels):
    input_vocab.addSentence(context)
    output_vocab.addSentence(label)

logger.info("Number of word in input vocabulary: %d", input_vocab.n_words)
logger.info("Number of word in output vocabulary: %d", output_vocab.n_words)
logger.info(
    "Removing the words that appear less than %d in input and output vocabulary",
    config.min_frequency)
input_vocab.removeWordLessThan(config.min_frequency)
output_vocab.removeWordLessThan(config.min_frequency)
logger.info("After Filtering")
logger.info("Number of word in input vocabulary: %d", input_vocab.n_words)
logger.info("Number of word in output vocabulary: %d", output_vocab.n_words)

# Size of the (filtered) input vocabulary, read after removeWordLessThan ran.
context_vocab_size = input_vocab.n_words