def worker(parser, rank, pos_sents, neg_sents, name, return_dict):
    """Parallel worker."""
    results = []
    num_correct = 0
    sentences = list(zip(pos_sents, neg_sents))
    if rank == 0:
        sentences = tqdm(list(zip(pos_sents, neg_sents)))
    for i, (pos, neg) in enumerate(sentences):
        pos_pp = parser.perplexity(pos)
        neg_pp = parser.perplexity(neg)
        correct = pos_pp < neg_pp
        num_correct += correct
        # see which words are unked during prediction
        pos = process_sentence(pos, parser.grammar.w2i)
        neg = process_sentence(neg, parser.grammar.w2i)
        result = (name, str(i), str(round(pos_pp, 2)), str(round(neg_pp, 2)),
                  str(int(correct)), ' '.join(pos), ' '.join(neg))
        results.append(result)
    return_dict[rank] = (results, num_correct)
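# In the parser-related snippets here (worker, parse, perplexity, syneval),
# process_sentence(words, w2i) is called but not defined. Judging from the
# "unked" comment above, it appears to replace out-of-vocabulary words before
# lookup in w2i. A minimal sketch under that assumption (the UNK token name is
# a guess, not taken from the source):
def process_sentence_sketch(words, w2i, unk='<UNK>'):
    """Replace words missing from the vocabulary with the UNK token."""
    return [word if word in w2i else unk for word in words]
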
def main():
    # load the data
    #en_es_train_dev, es_en_train, es_en_dev, es_en_test, mappings, u1, c1 = get_en_es_data(0, 0)
    #for more datasets, uncomment the following two lines
    #es_en_train_dev, es_en_train, es_en_dev, es_en_test, mappings, u2, c2 = get_es_en_data(0, 0)
    fr_en_train_dev, fr_en_train, fr_en_dev, fr_en_test, mappings, u3, c3 = get_fr_en_data(0, 0)

    # convert the data to token to idx
    all_tokens = np.array(list(fr_en_train[0]) + list(fr_en_dev[0]) + list(fr_en_test[0]))
    token_to_idx = create_token_to_idx(all_tokens)

    # original code line: train_user_idx = prepare_data(es_en_train[0], es_en_train[1], token_to_idx, user_to_idx)
    # code now: split into processing tokens and importing the already processed metadata
    # the metadata originally had a shape of (num_of_exercises, MAX_TOKEN_SIZE),
    # but it is now (num_of_exercises, MAX_TOKEN_SIZE, num_of_features)
    train_sentence_idx = process_sentence(fr_en_train_dev[0], token_to_idx)
    train_metadata = fr_en_train_dev[1]
    instance_id_to_dict = fr_en_train_dev[3]

    # convert true_labels to idx
    labels_array = np.zeros((len(fr_en_train_dev[0]), MAX_TOKEN_SIZE))
    for i in range(len(labels_array)):
        idx = np.array([instance_id_to_dict[i_id] for i_id in fr_en_train_dev[2][i]] +
                       [0] * (MAX_TOKEN_SIZE - len(fr_en_train_dev[2][i])))
        labels_array[i] = idx

    model = EncoderDecoder(len(token_to_idx), mappings, 100, 100, 4, 100, 100)

    for j in range(10):
        print("Epoch ", j + 1)
        total_loss = 0
        for i in tqdm(range(0, len(train_sentence_idx), BATCH_SIZE)):
            x_batch = train_sentence_idx[i: i + BATCH_SIZE]
            y_batch = labels_array[i: i + BATCH_SIZE]
            x_metadata_batch = train_metadata[i: i + BATCH_SIZE]
            mask = create_padding_mask(x_batch)
            with tf.GradientTape() as tape:
                logits = model.call(x_batch, x_metadata_batch, mask, training=True)
                loss = model.loss_function(logits, y_batch, mask)
                total_loss += loss.numpy()
            gradients = tape.gradient(loss, model.trainable_variables)
            model.optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        print("Avg batch loss", total_loss / (i + 1))
        # if i == 40:
        #     break

    # print("====Dev ====")
    # flattened_instance_ids, actual, preds = predict(model, es_en_dev, token_to_idx)
    print("====Test====")
    flattened_instance_ids, actual, preds = predict(model, fr_en_test, token_to_idx)
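# create_padding_mask is used throughout the training loops but not defined in
# these snippets. A minimal sketch, assuming padding positions carry token id 0
# (both the padding id and the mask convention are assumptions):
import tensorflow as tf

def create_padding_mask_sketch(batch_token_ids):
    """Return 1.0 at padded positions (token id 0) and 0.0 elsewhere; add
    broadcast axes, e.g. mask[:, tf.newaxis, tf.newaxis, :], if the attention
    code expects them."""
    return tf.cast(tf.math.equal(batch_token_ids, 0), tf.float32)
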
def predict(model, data, token_to_idx):
    """Generate instance-wise predictions.

    The function computes the probability of getting each word incorrect.

    Arguments:
        model {tf.keras.Model} -- the TensorFlow model trained on the Duolingo train data
        data {tuple} -- tuple containing the raw sentences, user/metadata features,
            instance ids, and the instance-id-to-label dict
        token_to_idx {dict} -- the mapping from token to idx
    """
    raw_sent, raw_users, all_instance_ids, labels_dict = data
    # TODO: same as above
    sent_idx = process_sentence(raw_sent, token_to_idx)
    user_idx = raw_users
    flattened_instance_ids = []

    # create the mask and predict the logits
    mask = create_padding_mask(sent_idx)
    actual = []
    preds = []
    for i in tqdm(range(0, len(sent_idx), BATCH_SIZE)):
        x_batch = sent_idx[i: i + BATCH_SIZE]
        x_user_batch = user_idx[i: i + BATCH_SIZE]
        instance_ids_list = all_instance_ids[i: i + BATCH_SIZE]
        mask = create_padding_mask(x_batch)
        logits = model.call(x_batch, x_user_batch, mask, training=False)
        probs = tf.nn.softmax(logits)
        predictions = probs[:, :, 1]
        # assert len(preds) == len(actual)
        for j, instance_ids in enumerate(instance_ids_list):
            instance_ids_length = len(instance_ids)
            _preds = predictions[j][:instance_ids_length]
            true = [int(labels_dict[instance]) for instance in instance_ids]
            preds.extend(_preds.numpy().tolist())
            actual.extend(true)
            # add to final list of instance ids
            flattened_instance_ids.extend(instance_ids)

    compute_metrics(actual, preds)
    return flattened_instance_ids, actual, preds
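# compute_metrics is not shown here. Since `preds` holds probabilities of a
# word being answered incorrectly and `actual` holds 0/1 labels, a plausible
# sketch is below; the exact metric set is an assumption:
from sklearn.metrics import f1_score, log_loss, roc_auc_score

def compute_metrics_sketch(actual, preds, threshold=0.5):
    """Report ranking and classification quality of instance-wise predictions."""
    print('AUROC   :', roc_auc_score(actual, preds))
    print('F1      :', f1_score(actual, [int(p >= threshold) for p in preds]))
    print('Log loss:', log_loss(actual, preds))
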
def main():
    #for more datasets, uncomment the following two lines
    es_en_train_dev, es_en_train, es_en_dev, es_en_test, mappings, u2, c2 = get_es_en_data(0, 0)

    # convert the data to token to idx
    all_tokens = np.array(
        list(es_en_train[0]) + list(es_en_dev[0]) + list(es_en_test[0]))
    token_to_idx = create_token_to_idx(all_tokens)

    train_sentence_idx = process_sentence(es_en_train_dev[0], token_to_idx)
    train_metadata = es_en_train_dev[1]
    instance_id_to_dict = es_en_train_dev[3]

    # convert true_labels to idx
    labels_array = np.zeros((len(es_en_train_dev[0]), MAX_TOKEN_SIZE))
    for i in range(len(labels_array)):
        idx = np.array([instance_id_to_dict[i_id] for i_id in es_en_train_dev[2][i]] +
                       [0] * (MAX_TOKEN_SIZE - len(es_en_train_dev[2][i])))
        labels_array[i] = idx

    model = EncoderDecoder(len(token_to_idx), mappings, 100, 100, 4, 100, 100)

    for j in range(10):
        print("Epoch ", j + 1)
        total_loss = 0
        for i in tqdm(range(0, len(train_sentence_idx), BATCH_SIZE)):
            x_batch = train_sentence_idx[i:i + BATCH_SIZE]
            y_batch = labels_array[i:i + BATCH_SIZE]
            x_metadata_batch = train_metadata[i:i + BATCH_SIZE]
            mask = create_padding_mask(x_batch)
            with tf.GradientTape() as tape:
                logits = model.call(x_batch, x_metadata_batch, mask, training=True)
                loss = model.loss_function(logits, y_batch, mask)
                total_loss += loss.numpy()
            gradients = tape.gradient(loss, model.trainable_variables)
            model.optimizer.apply_gradients(
                zip(gradients, model.trainable_variables))
        print("Avg batch loss", total_loss / (i + 1))

    print("====Test====")
    flattened_instance_ids, actual, preds = predict(
        model, es_en_test, token_to_idx)
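# create_token_to_idx is also undefined in these snippets. A sketch, assuming
# each element of all_tokens is a tokenized exercise and that indices 0 and 1
# are reserved for padding and unknown tokens (all three points are assumptions):
def create_token_to_idx_sketch(all_tokens, pad='<pad>', unk='<unk>'):
    """Build a token -> index mapping with reserved padding and unknown ids."""
    token_to_idx = {pad: 0, unk: 1}
    for exercise in all_tokens:
        for token in exercise:
            if token not in token_to_idx:
                token_to_idx[token] = len(token_to_idx)
    return token_to_idx
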
def parse(self, sentence, verbose=True, use_numpy=False, num_trees=10, root='TOP'):
    processed_sentence = process_sentence(sentence, self.grammar.w2i)
    if verbose:
        print('Processed sentence: `{}`'.format(' '.join(processed_sentence)))
        print('Running CKY...')
    score, back = self.cky(processed_sentence, use_numpy=use_numpy)
    root_id = self.grammar.n2i[root]
    score = score[root_id, 0, -1]
    if verbose:
        print('Building tree...')
    tree = self.build_tree(back, sentence, root=root)
    return tree, score
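# Usage sketch for parse() (parser construction and grammar loading assumed;
# the example sentence is illustrative):
# tree, score = parser.parse('the cat sat on the mat .'.split(), verbose=False)
# print(tree)   # highest-scoring tree for the requested root symbol
# print(score)  # its score from the CKY chart entry score[root_id, 0, -1]
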
def make_dataset(root_path, annotation_path, sample_duration, dictionary):
    video2text = load_annotation_data(annotation_path)
    valid_videos = video2text.keys()

    dataset = []
    dataset_size = len(os.listdir(root_path))
    for i, video in enumerate(os.listdir(root_path)):
        if video == ".DS_Store" or video not in valid_videos:
            continue
        if i % 1000 == 0:
            print('dataset loading [{}/{}]'.format(i, dataset_size))

        video_path = os.path.join(root_path, video)
        if not os.path.exists(video_path):
            print(f"Path {video_path} not found!")
            continue

        n_frames = len(os.listdir(video_path))
        begin_t = 1
        end_t = n_frames
        sample = {
            'video': video_path,
            'segment': [begin_t, end_t],
            'n_frames': n_frames,
            'label': process_sentence(video2text[video], dictionary)
        }

        step = sample_duration
        video_sample = []
        for j in range(1, n_frames, step):
            sample_j = copy.deepcopy(sample)
            sample_j['frame_indices'] = list(
                range(j, min(n_frames + 1, j + sample_duration)))
            video_sample.append(sample_j)
        dataset.append(video_sample)

    return dataset
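# load_annotation_data is not shown. Assuming the annotation file is a JSON
# object mapping each video id to its caption text (the file format is an
# assumption), a sketch could be:
import json

def load_annotation_data_sketch(annotation_path):
    """Return a dict: video id -> caption string."""
    with open(annotation_path) as f:
        return json.load(f)
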
def perplexity(self, sentence):
    processed_sentence = process_sentence(sentence, self.grammar.w2i)
    sent_len = len(sentence)
    sentence_array = np.array(
        [self.grammar.w2i[word] for word in processed_sentence], dtype=np.int32)
    score = -np.inf * np.ones(
        (self.grammar.num_nonterminals, sent_len + 1, sent_len + 1),
        dtype=np.float32)
    back = -1 * np.ones(
        (self.grammar.num_nonterminals, sent_len + 1, sent_len + 1, 3),
        dtype=np.int32)
    # Inside recursion
    logprob = _cky.inside(
        sentence_array,
        sent_len,
        score,
        self.grammar.num_lexical_rules,
        self.grammar.num_unary_rules,
        self.grammar.num_binary_rules,
        self.grammar.num_nonterminals,
        self.grammar.lexical,
        self.grammar.unary,
        self.grammar.binary,
        self.grammar.top,
        self.grammar.lexical_prob,
        self.grammar.unary_prob,
        self.grammar.binary_prob,
        self.grammar.top_prob,
    )
    return np.exp(-logprob / sent_len)
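# Note on the return value: the CKY inside pass yields log P(sentence), and the
# function returns exp(-logprob / sent_len), i.e. per-word perplexity under the
# grammar, so lower values mean the sentence is judged more likely.
# Usage sketch (parser setup assumed):
# pos_pp = parser.perplexity('the dogs bark .'.split())
# neg_pp = parser.perplexity('the dogs barks .'.split())
# syneval counts the pair as correct when pos_pp < neg_pp.
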
def syneval(parser, indir, outpath, parallel=False, short=False, add_period=True):
    print(f'Loading syneval examples from directory `{indir}`.')
    print(f'Writing predictions to `{outpath}`.')

    files = SHORT if short else ALL

    with open(outpath, 'w') as outfile:
        print('\t'.join(
            ('name', 'index', 'pos-perplexity', 'neg-perplexity', 'correct',
             'pos-sentence-processed', 'neg-sentence-processed')),
            file=outfile)

        print('Predicting syneval for:', '\n', '\n '.join(files))
        for fname in files:
            print(f'Predicting `{fname}`...')

            inpath = os.path.join(indir, fname)
            with open(inpath + '.pos') as f:
                pos_sents = [line.strip() for line in f.readlines()]
                if add_period:
                    pos_sents = [sent + ' .' for sent in pos_sents]
            with open(inpath + '.neg') as f:
                neg_sents = [line.strip() for line in f.readlines()]
                if add_period:
                    neg_sents = [sent + ' .' for sent in neg_sents]

            pos_sents = [sent.split() for sent in pos_sents]
            neg_sents = [sent.split() for sent in neg_sents]
            assert len(pos_sents) == len(neg_sents)

            if parallel:
                size = mp.cpu_count()
                print(f'Predicting in parallel with {size} processes...')
                chunk_size = ceil_div(len(pos_sents), size)
                pos_parts = [pos_sents[i:i + chunk_size]
                             for i in range(0, len(pos_sents), chunk_size)]
                neg_parts = [neg_sents[i:i + chunk_size]
                             for i in range(0, len(neg_sents), chunk_size)]
                # spawn processes
                manager = mp.Manager()
                return_dict = manager.dict()
                processes = []
                for rank in range(size):
                    p = mp.Process(
                        target=worker,
                        args=(parser, rank, pos_parts[rank], neg_parts[rank],
                              fname, return_dict))
                    p.start()
                    processes.append(p)
                for p in processes:
                    p.join()
                # merge all results
                results = sum([return_dict[rank][0] for rank in range(size)], [])
                # sum number of correct results
                num_correct = sum([return_dict[rank][1] for rank in range(size)])
            else:
                results = []
                num_correct = 0
                for i, (pos, neg) in enumerate(tqdm(list(zip(pos_sents, neg_sents)))):
                    pos_pp = parser.perplexity(pos)
                    neg_pp = parser.perplexity(neg)
                    correct = pos_pp < neg_pp
                    num_correct += correct
                    # see which words are unked during prediction
                    pos = process_sentence(pos, parser.grammar.w2i)
                    neg = process_sentence(neg, parser.grammar.w2i)
                    result = (fname, str(i), str(round(pos_pp, 2)), str(round(neg_pp, 2)),
                              str(int(correct)), ' '.join(pos), ' '.join(neg))
                    results.append(result)

            for result in results:
                print('\t'.join(result), file=outfile)

            print(f'{fname}: {num_correct}/{len(pos_sents)} = {num_correct / len(pos_sents):.2%} correct', '\n')
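# ceil_div is used above to size the per-process chunks but is not defined in
# this snippet. A one-line sketch of the intended ceiling division:
def ceil_div_sketch(a, b):
    """Smallest integer greater than or equal to a / b."""
    return -(-a // b)
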
def main():
    # load the data
    en_es_train_dev, en_es_train, en_es_dev, en_es_test, mappings1, u1, c1 = get_en_es_data(0, 0)
    #for more datasets, uncomment the following two lines
    es_en_train_dev, es_en_train, es_en_dev, es_en_test, mappings2, u2, c2 = get_es_en_data(u1, c1)
    fr_en_train_dev, fr_en_train, fr_en_dev, fr_en_test, mappings3, u3, c3 = get_fr_en_data(u2, c2)

    ## combine train_dev for all three datasets
    # get each attribute
    en_es_sentence, en_es_meta, en_es_inst, en_es_label = en_es_train_dev
    es_en_sentence, es_en_meta, es_en_inst, es_en_label = es_en_train_dev
    fr_en_sentence, fr_en_meta, fr_en_inst, fr_en_label = fr_en_train_dev

    # concatenate
    print(en_es_sentence.shape)
    print(es_en_sentence.shape)
    print(fr_en_sentence.shape)
    print(en_es_meta.shape)
    print(es_en_meta.shape)
    print(fr_en_meta.shape)
    print(en_es_inst.shape)
    print(es_en_inst.shape)
    print(fr_en_inst.shape)
    combined_sentence = np.concatenate((en_es_sentence, es_en_sentence, fr_en_sentence), axis=0)
    combined_meta = np.concatenate((en_es_meta, es_en_meta, fr_en_meta), axis=0)
    combined_inst = np.concatenate((en_es_inst, es_en_inst, fr_en_inst), axis=0)

    # combine labels
    combined_labels = copy.deepcopy(en_es_label)
    combined_labels.update(es_en_label)  # add es_en to dict
    combined_labels.update(fr_en_label)  # add fr_en to dict

    index = np.random.permutation(combined_sentence.shape[0])
    shuffled_combined_sentence = combined_sentence[index]
    shuffled_combined_meta = combined_meta[index]
    shuffled_combined_inst = combined_inst[index]
    combined_train_dev = (shuffled_combined_sentence, shuffled_combined_meta,
                          shuffled_combined_inst, combined_labels)

    # combine mappings1, mappings2, mappings3
    usid1, ctid1, clt1, sessid1, fmatid1, speechid1, dep1, morph1 = mappings1
    usid2, ctid2, clt2, sessid2, fmatid2, speechid2, dep2, morph2 = mappings2
    usid3, ctid3, clt3, sessid3, fmatid3, speechid3, dep3, morph3 = mappings3
    usid = combine_dicts(usid1, usid2, usid3)
    ctid = combine_dicts(ctid1, ctid2, ctid3)
    clt = combine_dicts(clt1, clt2, clt3)
    sess = combine_dicts(sessid1, sessid2, sessid3)
    fmat = combine_dicts(fmatid1, fmatid2, fmatid3)
    speech = combine_dicts(speechid1, speechid2, speechid3)
    dep = combine_dicts(dep1, dep2, dep3)
    morph = combine_dicts(morph1, morph2, morph3)
    combined_mappings = (usid, ctid, clt, sess, fmat, speech, dep, morph)

    # convert the data to token to idx
    all_tokens = np.array(list(es_en_train[0]) + list(es_en_dev[0]) + list(es_en_test[0]) +
                          list(en_es_train[0]) + list(en_es_dev[0]) + list(en_es_test[0]) +
                          list(fr_en_train[0]) + list(fr_en_dev[0]) + list(fr_en_test[0]))
    token_to_idx = create_token_to_idx(all_tokens)

    # TODO:
    # original code line: train_user_idx = prepare_data(es_en_train[0], es_en_train[1], token_to_idx, user_to_idx)
    # code now: split into processing tokens and importing the already processed metadata
    # the metadata originally had a shape of (num_of_exercises, MAX_TOKEN_SIZE),
    # but it is now (num_of_exercises, MAX_TOKEN_SIZE, num_of_features)
    train_sentence_idx = process_sentence(combined_train_dev[0], token_to_idx)
    train_metadata = combined_train_dev[1]
    instance_id_to_dict = combined_train_dev[3]

    # convert true_labels to idx
    labels_array = np.zeros((len(combined_train_dev[0]), MAX_TOKEN_SIZE))
    for i in range(len(labels_array)):
        idx = np.array([instance_id_to_dict[i_id] for i_id in combined_train_dev[2][i]] +
                       [0] * (MAX_TOKEN_SIZE - len(combined_train_dev[2][i])))
        labels_array[i] = idx

    model = EncoderDecoder(len(token_to_idx), combined_mappings, 300, 300, 4, 100, 100)

    for j in range(10):
        print("Epoch ", j + 1)
        # TODO: shuffle training data
        total_loss = 0
        for i in tqdm(range(0, len(train_sentence_idx) // 50, BATCH_SIZE)):
            x_batch = train_sentence_idx[i: i + BATCH_SIZE]
            y_batch = labels_array[i: i + BATCH_SIZE]
            x_metadata_batch = train_metadata[i: i + BATCH_SIZE]
            mask = create_padding_mask(x_batch)
            with tf.GradientTape() as tape:
                logits = model.call(x_batch, x_metadata_batch, mask, training=True)
                loss = model.loss_function(logits, y_batch, mask)
                total_loss += loss.numpy()
            gradients = tape.gradient(loss, model.trainable_variables)
            model.optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        print("Avg batch loss", total_loss / (i + 1))
        # if i == 40:
        #     break

    # print("====Dev ====")
    # flattened_instance_ids, actual, preds = predict(model, es_en_dev, token_to_idx)
    print("====Test====")
    flattened_instance_ids1, actual1, preds1 = predict(model, es_en_test, token_to_idx)
    flattened_instance_ids2, actual2, preds2 = predict(model, en_es_test, token_to_idx)
    flattened_instance_ids3, actual3, preds3 = predict(model, fr_en_test, token_to_idx)
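# combine_dicts, used above to merge the per-dataset mappings, is not defined
# in this snippet. A minimal sketch, assuming a plain dictionary union where
# later datasets win on duplicate keys (the merge policy is an assumption):
def combine_dicts_sketch(d1, d2, d3):
    """Merge three mapping dicts into one."""
    combined = dict(d1)
    combined.update(d2)
    combined.update(d3)
    return combined
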
pad_idx = inputText.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)


# In[12]:


# Loading the pretrained model
if load_model:
    load_checkpoint(torch.load("./training/data/model/my_checkpoint.pth.tar"),
                    model, optimizer)


# In[13]:


# Example sentence
src = "The feasibility study estimates that it would take passengers about four minutes to cross the Potomac River on the gondola."

prediction = process_sentence(model, src, inputText, outputText, device)
prediction = prediction[:-1]  # remove <eos> token
print(prediction)


# ## Evaluation

# In[14]:


# Loading the test dataset
test_data = pd.read_csv("./training/test/test-sample.txt",
                        header=0,
                        names=['InputText', 'OutputText'],
                        sep='\t',
                        encoding='utf-8')
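# Sketch of running the loaded test set through the same translation helper
# (column names follow the read_csv call above; the loop itself is an
# illustrative assumption, not code from the notebook):
# predictions = []
# for sentence in test_data['InputText']:
#     translated = process_sentence(model, sentence, inputText, outputText, device)
#     predictions.append(' '.join(translated[:-1]))  # drop the <eos> token as above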