def load_datasets(load_existing_dump=False):
    model_config = ModelConfig()
    data_reader = DataReader()

    train_lines = open(
        os.path.join(DataConfig.data_dir_path, DataConfig.train_path),
        "r").readlines()
    valid_lines = open(
        os.path.join(DataConfig.data_dir_path, DataConfig.valid_path),
        "r").readlines()
    test_lines = open(
        os.path.join(DataConfig.data_dir_path, DataConfig.test_path),
        "r").readlines()

    # Load data
    train_data = data_reader.read_data(train_lines)
    print("Loaded Train data")
    valid_data = data_reader.read_data(valid_lines)
    print("Loaded Dev data")
    test_data = data_reader.read_data(test_lines)
    print("Loaded Test data")

    feature_extractor = FeatureExtractor(model_config)
    dataset = Dataset(model_config, train_data, valid_data, test_data,
                      feature_extractor)

    # Vocab processing
    if load_existing_dump:
        # Restore previously dumped word/POS/dependency vocabularies.
        dataset.word2idx = get_pickle(
            os.path.join(DataConfig.dump_dir, DataConfig.word_vocab_file))
        dataset.idx2word = {
            idx: word
            for (word, idx) in dataset.word2idx.items()
        }

        dataset.pos2idx = get_pickle(
            os.path.join(DataConfig.dump_dir, DataConfig.pos_vocab_file))
        dataset.idx2pos = {idx: pos for (pos, idx) in dataset.pos2idx.items()}

        dataset.dep2idx = get_pickle(
            os.path.join(DataConfig.dump_dir, DataConfig.dep_vocab_file))
        dataset.idx2dep = {idx: dep for (dep, idx) in dataset.dep2idx.items()}

        dataset.model_config.load_existing_vocab = True
        print("loaded existing Vocab!")

        dataset.word_embedding_matrix = get_pickle(
            os.path.join(DataConfig.dump_dir, DataConfig.word_emb_file))
        dataset.pos_embedding_matrix = get_pickle(
            os.path.join(DataConfig.dump_dir, DataConfig.pos_emb_file))
        dataset.dep_embedding_matrix = get_pickle(
            os.path.join(DataConfig.dump_dir, DataConfig.dep_emb_file))
        print("loaded existing embedding matrix!")
    else:
        # Build vocabularies from the training data and dump them to disk.
        dataset.build_vocab()
        dump_pickle(
            dataset.word2idx,
            os.path.join(DataConfig.dump_dir, DataConfig.word_vocab_file))
        dump_pickle(
            dataset.pos2idx,
            os.path.join(DataConfig.dump_dir, DataConfig.pos_vocab_file))
        dump_pickle(
            dataset.dep2idx,
            os.path.join(DataConfig.dump_dir, DataConfig.dep_vocab_file))
        dataset.model_config.load_existing_vocab = True
        print("Vocab Build Done!")

        dataset.build_embedding_matrix()
        print("embedding matrix Build Done")
        dump_pickle(
            dataset.word_embedding_matrix,
            os.path.join(DataConfig.dump_dir, DataConfig.word_emb_file))
        dump_pickle(
            dataset.pos_embedding_matrix,
            os.path.join(DataConfig.dump_dir, DataConfig.pos_emb_file))
        dump_pickle(
            dataset.dep_embedding_matrix,
            os.path.join(DataConfig.dump_dir, DataConfig.dep_emb_file))

    print("converting data into ids..")
    dataset.convert_data_to_ids()
    print("Done!")

    # Record feature dimensions inferred from the first training example.
    dataset.model_config.word_features_types = len(dataset.train_inputs[0][0])
    dataset.model_config.pos_features_types = len(dataset.train_inputs[1][0])
    dataset.model_config.dep_features_types = len(dataset.train_inputs[2][0])
    dataset.model_config.num_features_types = (
        dataset.model_config.word_features_types +
        dataset.model_config.pos_features_types +
        dataset.model_config.dep_features_types)
    dataset.model_config.num_classes = len(dataset.train_targets[0])

    return dataset
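# get_pickle / dump_pickle are used above but not defined in this snippet.
# A minimal sketch of what they are assumed to be: thin wrappers around the
# standard-library pickle module. The actual project helpers may differ.
import pickle


def get_pickle(path):
    # Load a previously dumped Python object (vocab dict, embedding matrix, ...).
    with open(path, "rb") as f:
        return pickle.load(f)


def dump_pickle(obj, path):
    # Serialize an object so later runs can reload it with load_existing_dump=True.
    with open(path, "wb") as f:
        pickle.dump(obj, f)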
def load_datasets(load_existing_dump=False):
    model_config = ModelConfig()
    data_reader = DataReader()

    train_lines = open(
        os.path.join(DataConfig.data_dir_path, DataConfig.train_path),
        "r").readlines()
    valid_lines = open(
        os.path.join(DataConfig.data_dir_path, DataConfig.valid_path),
        "r").readlines()
    test_lines = open(
        os.path.join(DataConfig.data_dir_path, DataConfig.test_path),
        "r").readlines()

    # Load data
    train_data_obj = data_reader.read_data(train_lines)
    print("Loaded Train data")
    valid_data_obj = data_reader.read_data(valid_lines)
    print("Loaded Dev data")
    test_data_obj = data_reader.read_data(test_lines)
    print("Loaded Test data")

    feature_extractor = FeatureExtractor(model_config)
    dataset = Dataset(model_config, train_data_obj, valid_data_obj,
                      test_data_obj, feature_extractor)
    dataset.model_config.max_seq_len = dataset.get_max_seq_len()
    dataset.model_config.max_word_len = dataset.get_max_word_len()

    # Vocab processing
    if load_existing_dump:
        # Restore previously dumped word/char/label vocabularies.
        dataset.word2idx = get_pickle(
            os.path.join(DataConfig.dump_dir, DataConfig.word_vocab_file))
        dataset.idx2word = {
            idx: word
            for (word, idx) in dataset.word2idx.items()
        }

        dataset.char2idx = get_pickle(
            os.path.join(DataConfig.dump_dir, DataConfig.char_vocab_file))
        dataset.idx2char = {
            idx: char
            for (char, idx) in dataset.char2idx.items()
        }

        dataset.label2idx = get_pickle(
            os.path.join(DataConfig.dump_dir, DataConfig.label_vocab_file))
        dataset.idx2label = {
            idx: label
            for (label, idx) in dataset.label2idx.items()
        }

        dataset.model_config.load_existing_vocab = True
        print("loaded existing Vocab!")

        dataset.word_embedding_matrix = get_pickle(
            os.path.join(DataConfig.dump_dir, DataConfig.word_emb_file))
        dataset.char_embedding_matrix = get_pickle(
            os.path.join(DataConfig.dump_dir, DataConfig.char_emb_file))
        print("loaded existing embedding matrix!")
    else:
        # Build vocabularies from the training data and dump them to disk.
        dataset.build_vocab()
        dump_pickle(
            dataset.word2idx,
            os.path.join(DataConfig.dump_dir, DataConfig.word_vocab_file))
        dump_pickle(
            dataset.char2idx,
            os.path.join(DataConfig.dump_dir, DataConfig.char_vocab_file))
        dump_pickle(
            dataset.label2idx,
            os.path.join(DataConfig.dump_dir, DataConfig.label_vocab_file))
        dataset.model_config.load_existing_vocab = True
        print("Vocab Build Done!")

        dataset.build_embedding_matrix()
        print("embedding matrix Build Done")
        dump_pickle(
            dataset.word_embedding_matrix,
            os.path.join(DataConfig.dump_dir, DataConfig.word_emb_file))
        dump_pickle(
            dataset.char_embedding_matrix,
            os.path.join(DataConfig.dump_dir, DataConfig.char_emb_file))

    dataset.model_config.num_classes = len(dataset.label2idx)

    print("converting data into ids..")
    dataset.convert_data_to_ids()
    print("Done!")

    return dataset
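# Usage sketch (illustrative, not part of the original module): build the
# vocab and embedding dumps on the first run, then reuse them on later runs.
# Only attributes that load_datasets() itself sets on the returned dataset
# are referenced here.
if __name__ == "__main__":
    dataset = load_datasets(load_existing_dump=False)   # first run: build + dump
    # dataset = load_datasets(load_existing_dump=True)  # later runs: reuse dumps

    cfg = dataset.model_config
    print("max_seq_len = %d, max_word_len = %d" %
          (cfg.max_seq_len, cfg.max_word_len))
    print("num_classes = %d" % cfg.num_classes)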