def build_graph(config):
    word2idx, idx2word = get_vocabs(config['vocab_file'])
    embeddings = get_embeddings(word2idx, config['s2v_file'])

    weights = config.get('weights', [1 for _ in config['metrics']])
    assert len(config['metrics']) == len(weights)
    metrics = {m: {'weight': w} for m, w in zip(config['metrics'], weights)}

    if 'lm' in metrics:
        metrics['lm'].update(
            dict(forward=config['lm_save_dir'],
                 reverse=config.get('lm_rev_save_dir', None),
                 num_words=len(word2idx)))
    if 'cos' in metrics:
        idf_file = config.get('idf_file', None)
        if idf_file is not None:
            metrics['cos'].update(
                dict(idf=get_idf_vector(idf_file, word2idx),
                     embeddings=embeddings))
        else:
            metrics['cos'].update(dict(embeddings=embeddings))

    sess_config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
    sess = tf.Session(config=sess_config)
    model_inputs, model_outputs = get_model(metrics, mode=config['mode'])
    if 'lm' in metrics:
        init_lm_checkpoints(metrics['lm'])
    sess.run(tf.global_variables_initializer())
    return sess, model_inputs, model_outputs, embeddings, word2idx, idx2word
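
# Illustrative usage sketch for build_graph (not from the original source). The
# config keys mirror the ones read above; the paths, metric weights, and mode
# value are placeholder assumptions.
example_config = {
    'vocab_file': 'data/vocab.txt',           # assumed path
    's2v_file': 'data/sent2vec.bin',          # assumed path
    'metrics': ['lm', 'cos'],
    'weights': [0.5, 0.5],
    'lm_save_dir': 'checkpoints/lm_forward',  # assumed path
    'idf_file': 'data/idf.txt',               # assumed path
    'mode': 'eval',                           # assumed mode name
}
sess, model_inputs, model_outputs, embeddings, word2idx, idx2word = build_graph(example_config)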

def build_data(config):
    """Procedure to build data.

    Args:
        config: defines the attributes needed in the function

    Creates vocab files from the datasets and an npz embedding file
    from trimmed GloVe vectors.
    """
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build word and tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)
    vocab = list(vocab)
    vocab.insert(0, PAD)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = CoNLLDataset(config.train_filename, processing_word)
    vocab_chars = get_char_vocab(train)
    vocab_chars = list(vocab_chars)
    vocab_chars.insert(0, PAD)
    write_vocab(vocab_chars, config.chars_filename)

    # Build and save type vocab
    vocab_types = set()
    print(len(vocab_tags))
    for tag in vocab_tags:
        if tag != 'O':
            vocab_types.add(tag[2:])
    write_vocab(vocab_types, config.types_filename)
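
# Minimal sketch of the config object build_data expects; the attribute names are
# taken from the accesses above, while the concrete paths and embedding dimension
# are assumptions.
class ExampleConfig:
    dev_filename = 'data/dev.txt'                # assumed path
    test_filename = 'data/test.txt'              # assumed path
    train_filename = 'data/train.txt'            # assumed path
    glove_filename = 'data/glove.6B.300d.txt'    # assumed path
    words_filename = 'data/words.txt'            # assumed path
    tags_filename = 'data/tags.txt'              # assumed path
    chars_filename = 'data/chars.txt'            # assumed path
    types_filename = 'data/types.txt'            # assumed path
    trimmed_filename = 'data/glove.trimmed.npz'  # assumed path
    dim = 300                                    # assumed embedding dimension

build_data(ExampleConfig())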

def prepare(args, config):
    word2idx, idx2word = get_vocabs(args.vocab_file)
    try:
        embeddings = get_embeddings(word2idx, args.w2v_file)
    except FileNotFoundError:
        logging.info(
            'Embedding file not found; training embeddings from scratch instead')
        embeddings = None

    with tf.variable_scope('LanguageModel'):
        model_inputs, model_outputs = get_model(config, embeddings, len(word2idx))

    sess_config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
    sess = tf.Session(config=sess_config)
    sess.run(tf.global_variables_initializer())
    return word2idx, model_inputs, model_outputs, sess
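
# Illustrative call to prepare (not from the original source). args only needs
# the vocab_file and w2v_file attributes used above; the paths are placeholder
# assumptions, and the contents of config depend on what get_model expects.
from types import SimpleNamespace

example_args = SimpleNamespace(vocab_file='data/vocab.txt',   # assumed path
                               w2v_file='data/word2vec.bin')  # assumed path
example_lm_config = {}  # assumed: model hyperparameters consumed by get_model
word2idx, model_inputs, model_outputs, sess = prepare(example_args, example_lm_config)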

import torch
import numpy as np

from chu_liu_edmonds import decode_mst
from utils import get_vocabs, nll_loss, UAS
from models import model_1, model_2
from hp import hp_dict
from data_reader import PosDataset

data_dir = "C:\\Users\\jeremy.levy\\OneDrive - Technion\\MSc\\Courses\\courses_gal\\NLP\\HW\\HW2 - wet\\HW2-files\\"
# data_dir = "C:\\Users\\galye\\Dropbox\\studies\\MSc\\NLP\\HW2 - wet\\HW2-files\\"
path_train = data_dir + "train.labeled"
path_test = data_dir + "test.labeled"

word_dict, pos_dict = get_vocabs([path_train, path_test])

dataset_saved = False
if dataset_saved is True:
    print("Loading dataset")
    training_sentences = torch.load('training_sentences.pt')
    test_sentences = torch.load('test_sentences.pt')
else:
    print("Extracting dataset")
    training_sentences = PosDataset(path_train, word_dict, pos_dict, padding=False)
    test_sentences = PosDataset(path_test, word_dict, pos_dict, padding=False)
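
    # Companion step (an assumption, not in the original snippet): persist the
    # freshly built datasets so the dataset_saved branch above can reload them
    # on later runs via torch.load.
    torch.save(training_sentences, 'training_sentences.pt')
    torch.save(test_sentences, 'test_sentences.pt')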

            sentence_pos_idx_list.append(
                torch.tensor(pos_idx_list, dtype=torch.long, requires_grad=False))
            sentence_len_list.append(sentence_len)

        # if padding:
        #     all_sentence_word_idx = torch.tensor(sentence_word_idx_list, dtype=torch.long)
        #     all_sentence_pos_idx = torch.tensor(sentence_pos_idx_list, dtype=torch.long)
        #     all_sentence_len = torch.tensor(sentence_len_list, dtype=torch.long, requires_grad=False)
        #     return TensorDataset(all_sentence_word_idx, all_sentence_pos_idx, all_sentence_len)

        return {
            i: sample_tuple
            for i, sample_tuple in enumerate(
                zip(sentence_word_idx_list, sentence_pos_idx_list, sentence_len_list))
        }


if __name__ == "__main__":
    path_train = "data_new/train.labeled"
    path_test = "data_new/test.labeled"
    paths_list = [path_train, path_test]
    word_dict, pos_dict = get_vocabs(paths_list)

    train = DepDataset(word_dict, pos_dict, 'data_new', 'train', padding=False)
    train_dataloader = DataLoader(train, shuffle=True)
    test = DepDataset(word_dict, pos_dict, 'data_new', 'test', padding=False)
    test_dataloader = DataLoader(test, shuffle=False)

    print("Number of Train Tagged Sentences ", len(train))
    print("Number of Test Tagged Sentences ", len(test))
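
    # Quick sanity check (illustrative; assumes each sample is the
    # (word_idx, pos_idx, sentence_len) tuple built above, so the default
    # collate yields three batched items per DataLoader step).
    words_idx, pos_idx, sentence_len = next(iter(train_dataloader))
    print("First train batch shapes:", words_idx.shape, pos_idx.shape, sentence_len)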

        with torch.no_grad():
            words_idx_tensor, pos_idx_tensor, heads_tensor = input_data
            tag_scores = model(words_idx_tensor, pos_idx_tensor)
            predicted_mst, _ = decode_mst(energy=tag_scores.detach().cpu(),
                                          length=tag_scores.shape[0],
                                          has_labels=False)
            tags.append(predicted_mst[1:])
    return tags


# create data sets
data_dir = "HW2-files/"
path_train = data_dir + "train.labeled"
path_test = data_dir + "test.labeled"
paths_list = [path_train, path_test]
word_cnt, word_dict, pos_dict = utils.get_vocabs(paths_list)

train = utils.PosDataset(word_cnt, word_dict, pos_dict, data_dir, 'train')
train_dataloader = utils.DataLoader(train, shuffle=True)
test = utils.PosDataset(word_cnt, word_dict, pos_dict, data_dir, 'test')
test_dataloader = utils.DataLoader(test, shuffle=False)

word_vocab_size = len(train.word2idx)
tag_vocab_size = len(train.pos_idx_mappings)

use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

# create and load trained model
base_model = basic_model.DnnDependencyParser(basic_model.WORD_EMBEDDING_DIM,
                                             basic_model.POS_EMBEDDING_DIM,
                                             basic_model.HIDDEN_DIM,
                                             word_vocab_size,