# --- SQuAD data preparation for a BiDAF-style model ---------------------
# Assumes `data_folder` / `embedding_folder` and the toolkit classes
# (SquadReader, SquadEvaluator, Vocabulary, BatchGenerator) are defined
# earlier in the file — TODO(review): confirm.
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"
reader = SquadReader()
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

# Build a vocabulary over both splits and load the pretrained embedding.
vocab = Vocabulary()
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
word_embedding = vocab.make_word_embedding(embedding_folder + "glove.6B.100d.txt")

# Save vocab so later runs can reload it instead of rebuilding.
vocab_save_path = 'H:/result/bidaf/vocab.json'
vocab.save(vocab_save_path)

# Use the feature extractor, which is only necessary when using linguistic features.
# feature_transformer = FeatureExtractor(features=['match_lemma','match_lower','pos','ner','context_tf'],
#     build_vocab_feature_names=set(['pos','ner']),word_counter=vocab.get_word_counter())
# train_data = feature_transformer.fit_transform(dataset=train_data)
# eval_data = feature_transformer.transform(dataset=eval_data)

# Build a batch generator for training and evaluation, where additional features
# and a feature vocabulary are necessary when a linguistic feature is used.
train_batch_generator = BatchGenerator(vocab, train_data, batch_size=64, training=True)
eval_batch_generator = BatchGenerator(vocab, eval_data, batch_size=64)

# Train and save checkpoints in save_dir.
save_dir = 'H:/result/bidaf'
# Import the built-in model and compile the training operation; call functions
# such as train_and_evaluate for training and evaluation.
# --- SQuAD pipeline variant: case-sensitive vocab, GloVe 840B.300d ------
# Emit toolkit progress messages with timestamps.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
data_folder = '/root/ZX/SMRCToolkit/data_folder/'
embedding_folder = '/root/ZX/SMRCToolkit/embedding_folder/'
train_file = data_folder + "train-v1.1.json"
dev_file = data_folder + "dev-v1.1.json"

# NOTE(review): semantics of fine_grained come from SquadReader — presumably
# a more detailed tokenization; confirm against the toolkit documentation.
reader = SquadReader(fine_grained=True)
train_data = reader.read(train_file)
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

# Case-sensitive vocabulary built over both splits.
vocab = Vocabulary(do_lowercase=False)
vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10)
# NOTE(review): the vocabulary is saved twice (testvocab.json here and
# vocab.json below) — verify whether both copies are intentional.
vocab.save("/root/ZX/SMRCToolkit/vocab_save_folder/testvocab.json")
word_embedding = vocab.make_word_embedding(embedding_folder + "glove.840B.300d.txt")

# save vocab
vocab_save_path = '/root/ZX/SMRCToolkit/vocab_save_folder/vocab.json'
vocab.save(vocab_save_path)

train_batch_generator = BatchGenerator(vocab, train_data, batch_size=50, training=True)
eval_batch_generator = BatchGenerator(vocab, eval_data, batch_size=50)
train_batch_generator.init()
# --- CoQA vocabulary: load from cache when available, else build --------
# Assumes `coqa_reader` and `Vocabulary` are defined earlier in the
# file — TODO(review): confirm.
data_folder = os.path.join("/", "home", "baheti", "QADialogueSystem", "Data", "QA_datasets", "coqa/")
train_filename = "coqa-train-v1.0.json"
eval_filename = "coqa-dev-v1.0.json"
vocab = Vocabulary(do_lowercase=True)
vocab_filepath = os.path.join("models", "vocab.txt")
if os.path.exists(vocab_filepath):
    print("loading from filepath")
    # load from the filepath
    vocab.load(vocab_filepath)
else:
    print("creating vocab as new")
    # Build the vocabulary from both CoQA splits, then cache it for reuse.
    train_data = coqa_reader.read(data_folder + train_filename, 'train')
    eval_data = coqa_reader.read(data_folder + eval_filename, 'dev')
    vocab.build_vocab(train_data + eval_data)
    vocab.save(vocab_filepath)

# Squad seq2seq_train_moses_tokenized (alternative input set, kept for reference)
# DATA_DIR = os.path.join("/", "home", "baheti", "QADialogueSystem", "RuleBasedQuestionsToAnswer", "squad_seq2seq_train_moses_tokenized")
# coqa_format_test_save_file = os.path.join(DATA_DIR, "squad_seq2seq_predicted_responses_test_coqa_format.json")
# src_squad_seq2seq_predicted_responses_file = os.path.join(DATA_DIR, "src_squad_seq2seq_predicted_responses_test.txt")
# predictions_save_file = "coqa_predictions_on_squad_seq2seq_predicted_responses_test.txt"

# SQUAD seq2seq dev moses tokenized
DATA_DIR = os.path.join("..", "RuleBasedQuestionsToAnswer", "squad_seq2seq_dev_moses_tokenized")
coqa_format_test_save_file = os.path.join(
    DATA_DIR, "squad_seq2seq_dev_moses_test_coqa_format.json")
src_squad_seq2seq_predicted_responses_file = os.path.join(
    DATA_DIR, "src_squad_seq2seq_dev_moses_test.txt")
predictions_save_file = "coqa_predictions_on_squad_seq2seq_dev_moses_test.txt"