def create_vocabulary(self):
        """Build and persist the token vocabulary for this dataset.

        Tokenizes every input and output line of the train, validation,
        and test splits with the NLTK tokenizer, collects the unique
        tokens, and writes them to ``<base_path>/vocab.txt``.
        """
        base_path = self.base_path
        splits = [
            self.read_raw_lines('%s/train' % base_path),
            self.read_raw_lines('%s/validation' % base_path),
            self.read_raw_lines('%s/test' % base_path),
        ]

        all_tokens = []
        for split in splits:
            # Inputs first, then outputs, mirroring the on-disk order.
            for key in ('inputs', 'outputs'):
                for sentence in split[key]:
                    all_tokens.extend(
                        tokenizer.split_sentence(
                            sentence,
                            tokenizer_type=constants.TOKENIZER_NLTK))

        vocab = utils.unique_vals(all_tokens, min_num_occurences=0)
        utils.save_lines(vocab, '%s/vocab.txt' % base_path)
def create_dataset(save_dir, data_path, shared_path):
    """Extract answer-span candidates and write parallel dataset files.

    For every paragraph token list in ``shared_data['x'][i][j][0]``,
    candidate answer spans are produced by
    ``spacy_tokenizer.extract_phrases`` and six aligned line files are
    written under ``save_dir``: ids, outputs (placeholder questions),
    answer_starts, answer_ends, inputs (tab-joined tokens), and indices
    (which input line each span belongs to).

    Args:
        save_dir: Directory the six output text files are written to.
        data_path: Path to the main dataset JSON. NOTE(review): loaded
            but never used below — kept so a missing/corrupt file still
            fails loudly; confirm whether it can be dropped.
        shared_path: Path to the shared JSON with tokenized paragraphs.
    """
    print("Loading data from path %s" % data_path)
    # Use context managers: the original `json.load(open(...))` calls
    # leaked open file handles.
    with open(data_path) as data_file:
        data = json.load(data_file)
    print("Done loading data")
    with open(shared_path) as shared_file:
        shared_data = json.load(shared_file)
    print("Done loading shared data from path %s" % shared_path)

    # (Removed unused inner helper `count_sums` — it was never called.)
    idxs = []
    xs = []
    answer_starts = []
    answer_ends = []
    indices = []
    questions = []

    articles = shared_data['x']
    num_articles = len(articles)  # hoisted out of the progress print
    for i, article in enumerate(articles):
        print("On %s of %s" % (i, num_articles))
        for paragraph in article:
            cur_tokens = paragraph[0]
            cur_text = " ".join(cur_tokens)
            cur_ans_starts, cur_ans_ends = spacy_tokenizer.extract_phrases(
                cur_text, 2)
            answer_starts.extend([str(ans) for ans in cur_ans_starts])
            answer_ends.extend([str(ans) for ans in cur_ans_ends])
            # Ids continue sequentially from the number of spans so far.
            idxs.extend(range(len(idxs), len(idxs) + len(cur_ans_starts)))
            # Placeholder question for every candidate span.
            questions.extend(["<NONE>"] * len(cur_ans_starts))
            # Every span from this paragraph maps to the same input line.
            indices.extend([str(len(xs))] * len(cur_ans_starts))
            xs.append('\t'.join(cur_tokens))

    idxs = [str(idx) for idx in idxs]
    utils.save_lines(idxs, '%s/ids.txt' % save_dir)
    utils.save_lines(questions, '%s/outputs.txt' % save_dir)
    utils.save_lines(answer_starts, '%s/answer_starts.txt' % save_dir)
    utils.save_lines(answer_ends, '%s/answer_ends.txt' % save_dir)
    utils.save_lines(xs, '%s/inputs.txt' % save_dir)
    utils.save_lines(indices, '%s/indices.txt' % save_dir)
 def save_predictions(self, epoch_num, predictions):
     """Write prediction lines to ``<save_directory>/predictions_<epoch_num>``."""
     out_dir = self.config['save_directory']
     out_path = '%s/predictions_%s' % (out_dir, epoch_num)
     utils.save_lines(predictions, out_path)
# Beispiel #4
# 0
from torch.autograd import variable

# Generate train-split predictions for NewsQA from a truncated
# SQuAD-pretrained model and save them to disk.
dataset_path = 'datasets/newsqa'
load_path = 'logs/squad_saved_data_truncated/model_2.pyt7'

language_model_loader = LanguageModelLoaderTruncate(
    dataset_path, tokenizer_type=constants.TOKENIZER_TAB)
language_model = torch_utils.load_model(load_path).cuda()
language_model.config['save_directory'] = 'logs/newsqa_saved_data'

language_wrapper = LanguageWrapper(language_model,
                                   language_model_loader.get_vocab())
language_trainer = LanguageTrainer(language_model.config, language_wrapper,
                                   language_model_loader)

#test_predictions = language_trainer.predict(dataset_type=constants.DATASET_TEST,
#            epoch_num=10, max_length=20)
#dev_predictions = language_trainer.predict(dataset_type=constants.DATASET_VALIDATION,
#            epoch_num=10, max_length=10)
train_predictions = language_trainer.predict(
    dataset_type=constants.DATASET_TRAIN, epoch_num=10, max_length=10)

utils.save_lines(
    train_predictions,
    'logs/newsqa_saved_data/dummy5_train_predictions_epoch_6.txt')
# BUG FIX: dev_predictions / test_predictions are only defined by the
# commented-out predict() calls above, so saving them raised NameError.
# Keep these saves disabled until those predictions are re-enabled.
#utils.save_lines(
#    dev_predictions,
#    'logs/newsqa_saved_data/dummy5_validation_predictions_epoch_6.txt')
#utils.save_lines(test_predictions,
#                 'logs/newsqa_saved_data/dummy5_test_predictions_epoch_6.txt')
from models.language_wrapper import LanguageWrapper 
from helpers import constants 
import torch 
from helpers import torch_utils, utils
from torch.autograd import variable

# Restore a SQuAD-pretrained model and produce train-split predictions
# for the unsupervised, verb-filtered NewsQA dataset.
dataset_path = 'datasets/newsqa_unsupervised_large_verb_filtered'
load_path = 'logs/squad_saved_data/model_8.pyt7' # CHANGE THIS TO WHATEVER YOU WANT

language_model_loader = LanguageModelLoader(
    dataset_path, tokenizer_type=constants.TOKENIZER_TAB)
language_model = torch_utils.load_model(load_path).cuda()
language_model.config['save_directory'] = 'logs/newsqa_unsupervised_saved_data'

language_wrapper = LanguageWrapper(
    language_model, language_model_loader.get_vocab())
language_trainer = LanguageTrainer(
    language_model.config, language_wrapper, language_model_loader)

# Validation/test prediction runs are intentionally disabled.
##test_predictions = language_trainer.predict(dataset_type=constants.DATASET_TEST,
#            epoch_num=10, max_length=20, beam_size=5)
#dev_predictions = language_trainer.predict(dataset_type=constants.DATASET_VALIDATION,
#            epoch_num=10, max_length=10, beam_size=5)
train_predictions = language_trainer.predict(
    dataset_type=constants.DATASET_TRAIN, epoch_num=10, max_length=10)

utils.save_lines(
    train_predictions, 'logs/newsqa_saved_data/train_predictions_epoch_6.txt')
#utils.save_lines(dev_predictions, 'logs/newsqa_saved_data/validation_predictions_epoch_6.txt')
#utils.save_lines(test_predictions, 'logs/newsqa_saved_data/test_predictions_epoch_6.txt')




    input_max_length=2100)#00)

# Restore an IOBModel and dump its train-split predictions, one joined
# token string per input, to ``predictions_save_path``.
config = utils.load_json(config_path)
config['batch_size'] = 25
config['input_max_length'] = data_loader.input_max_length
model = IOBModel(config, embeddings=embeddings)
model.restore(params_path)

# BUG FIX: ``num_steps`` was initialized twice, and was incremented and
# printed even for the terminal ``None`` batch; count real batches only.
num_steps = 0

data_loader.reset_indices()
total_predictions = []

while True:
    batch = data_loader.get_batch(constants.DATASET_TRAIN, config['batch_size'])
    if batch is None:
      break
    num_steps += config['batch_size']
    print(num_steps)
    predictions = model.predict(batch)
    # Map predicted label ids back to token strings.
    texts = data_loader.label_vocab.tokens_list(predictions)
    for i in range(0, len(texts)):
      # Truncate each prediction to its input's true (unpadded) length.
      cur_input_length = batch['input_lengths'][i]
      cur_text = texts[i]
      text_str = " ".join(cur_text[0:cur_input_length])
      total_predictions.append(text_str)

utils.save_lines(total_predictions, predictions_save_path)
      # NOTE(review): orphaned fragment — the enclosing loop's header
      # (defining ``i``, ``batch``, ``model``, ``data_loader``) is not
      # visible in this chunk; the logic below is left byte-identical.
      # One training forward pass, then fetch the next batch and log.
      loss, predictions = model.forward(batch)  
      batch = data_loader.get_batch(constants.DATASET_TRAIN, config['batch_size'])
      print(predictions)
      print(loss)

    # Every third outer iteration: run prediction over the full test
    # split and write the results to disk.
    if i % 3 == 0:
      data_loader.reset_indices()
      total_predictions = []
      while True:
        batch = data_loader.get_batch(constants.DATASET_TEST, config['batch_size'])
        if batch is None:
          break
        predictions = model.predict(batch)
        texts = data_loader.label_vocab.tokens_list(predictions)
        # Truncate each prediction to its input's unpadded length.
        for i in range(0, len(texts)):
          cur_input_length = batch['input_lengths'][i]
          cur_text = texts[i]
          text_str = " ".join(cur_text[0:cur_input_length])
          total_predictions.append(text_str)
      # NOTE(review): the inner ``for i`` loop above shadows the outer
      # loop index, so the ``i`` used in this filename is the last text
      # index, not the epoch/step — looks like a bug; confirm intent.
      utils.save_lines(total_predictions, \
        '%s/predictions_test_%s.txt' % (config['save_path'], i))

    data_loader.mix_indices()
    batch = data_loader.get_batch(constants.DATASET_TRAIN, config['batch_size'])
  





# Beispiel #8
# 0
def save_batch_results(batch_list, save_dir='results', f1_thres=0.1):
    """Persist examples whose best-vs-predicted F1 gap exceeds ``f1_thres``.

    Aggregates F1 scores over all batches, selects the samples where the
    best achievable F1 beats the predicted F1 by more than ``f1_thres``,
    and writes the selection plus an overall score summary as parallel
    text files under ``save_dir``.
    """
    questions = []
    gold_answer_lists = []
    paragraphs = []
    predicted_answers = []
    best_answers = []
    best_f1_strs = []
    predicted_f1_strs = []

    all_best_f1 = []
    all_predicted_f1 = []
    for batch in batch_list:
        all_best_f1.extend(batch['best_f1_scores'])
        all_predicted_f1.extend(batch['predicted_f1_scores'])
        for idx in range(len(batch['answerss'])):
            gap = (batch['best_f1_scores'][idx]
                   - batch['predicted_f1_scores'][idx])
            if gap <= f1_thres:
                continue
            # Keep only samples where the model clearly underperformed.
            paragraphs.append((' '.join(batch['x'][idx][0])).rstrip('\r\n'))
            predicted_answers.append(batch['predicted_answers'][idx])
            best_answers.append(batch['best_answers'][idx])
            best_f1_strs.append(str(batch['best_f1_scores'][idx]))
            predicted_f1_strs.append(str(batch['predicted_f1_scores'][idx]))
            gold_answer_lists.append(
                [ans.rstrip('\r\n') for ans in batch['answerss'][idx]])
            questions.append(batch['q'][idx])

    avg_best = np.mean(all_best_f1)
    avg_predicted = np.mean(all_predicted_f1)

    score_msg = ["PREDICTED %s BEST %s NUM_SAMPLES %s NUM_TOTAL %s" %
                 (avg_predicted, avg_best,
                  len(best_answers), len(all_best_f1))]
    utils.save_lines(score_msg, "%s/f1_score_comparison.txt" % save_dir)
    utils.save_lines(best_f1_strs, "%s/best_f1_scores.txt" % save_dir)
    utils.save_lines(predicted_f1_strs,
                     "%s/predicted_f1_scores.txt" % save_dir)
    utils.save_lines(best_answers, "%s/best_answers.txt" % save_dir)
    utils.save_lines(predicted_answers, "%s/predicted_answers.txt" % save_dir)
    utils.save_lines(paragraphs, "%s/paragraphs.txt" % save_dir)
    utils.save_tabbed_lines(gold_answer_lists, "%s/gold_answers.txt" % save_dir)
    utils.save_tabbed_lines(questions, "%s/questions.txt" % save_dir)