def read_dataset(dataset, tier, vocab):
    """Reads the dataset, extracts context, question, answer,
    and answer pointer in their own file. Returns the number
    of questions and answers processed for the dataset"""

    context_data = []
    query_data = []
    question_uuid_data = []

    for articles_id in tqdm(range(len(dataset['data'])), desc="Preprocessing {}".format(tier)):
        article_paragraphs = dataset['data'][articles_id]['paragraphs']
        for pid in range(len(article_paragraphs)):
            context = article_paragraphs[pid]['context']
            # The following replacements are suggested in the paper
            # BidAF (Seo et al., 2016)
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')

            context_tokens = tokenize(context)

            qas = article_paragraphs[pid]['qas']
            for qid in range(len(qas)):
                question = qas[qid]['question']
                question_tokens = tokenize(question)
                question_uuid = qas[qid]['id']

                context_ids = [str(vocab.get(w, qa_data.UNK_ID)) for w in context_tokens]
                question_ids = [str(vocab.get(w, qa_data.UNK_ID)) for w in question_tokens]

                context_data.append(' '.join(context_ids))
                query_data.append(' '.join(question_ids))
                question_uuid_data.append(question_uuid)

    return context_data, query_data, question_uuid_data
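A minimal usage sketch for the function above (not from the original repository): the file path is illustrative, and the vocabulary is assumed to be a dict mapping tokens to ids, as returned by the initialize_vocab helper used further down this page.

import json

with open('data/dev-v1.1.json') as f:      # illustrative path
    dev_data = json.load(f)
vocab, rev_vocab = initialize_vocab('data/vocab.dat')   # helper used in a later example
context_data, query_data, uuid_data = read_dataset(dev_data, 'dev', vocab)
print(len(uuid_data), 'questions preprocessed')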
def preprocess_dataset(dataset):
    qn_uuid_data = []
    context_token_data = []
    qn_token_data = []

    for articles_id in tqdm(range(len(dataset['data'])),
                            desc="Preprocessing data"):
        article_paragraphs = dataset['data'][articles_id]['paragraphs']
        for pid in range(len(article_paragraphs)):

            context = str(article_paragraphs[pid]['context'])  # string

            context = context.replace("''", '" ')
            context = context.replace("``", '" ')

            context_tokens = tokenize(context)
            context = context.lower()

            qas = article_paragraphs[pid]['qas']

            for qn in qas:
                question = str(qn['question'])
                question_tokens = tokenize(question)

                question_uuid = qn['id']
                qn_uuid_data.append(question_uuid)
                context_token_data.append(context_tokens)
                qn_token_data.append(question_tokens)

    return qn_uuid_data, context_token_data, qn_token_data
Example #4
def preprocess_dataset(dataset):
    """
    Note: this is similar to squad_preprocess.preprocess_and_write, but:
      (1) We only extract the context and question information from the JSON file.
        We don't extract answer information. This makes this function much simpler
        than squad_preprocess.preprocess_and_write, because we don't have to convert
        the character spans to word spans. This also means that we don't have to
        discard any examples due to tokenization problems.

    Input:
      dataset: data read from SQuAD JSON file

    Returns:
      qn_uuid_data, context_token_data, qn_token_data: lists of uuids, tokenized context and tokenized questions
    """
    qn_uuid_data = []
    context_token_data = []
    qn_token_data = []

    for articles_id in tqdm(list(range(len(dataset['data']))),
                            desc="Preprocessing data"):
        article_paragraphs = dataset['data'][articles_id]['paragraphs']
        for pid in range(len(article_paragraphs)):

            context = str(article_paragraphs[pid]['context'])  # string

            # The following replacements are suggested in the paper
            # BidAF (Seo et al., 2016)
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')

            context_tokens = tokenize(context)  # list of strings (lowercase)
            context = context.lower()

            qas = article_paragraphs[pid]['qas']  # list of questions

            # for each question
            for qn in qas:

                # read the question text and tokenize
                question = str(qn['question'])  # string
                question_tokens = tokenize(question)  # list of strings

                # also get the question_uuid
                question_uuid = qn['id']

                # Append to data lists
                qn_uuid_data.append(question_uuid)
                context_token_data.append(context_tokens)
                qn_token_data.append(question_tokens)

    return qn_uuid_data, context_token_data, qn_token_data
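The docstring above notes that squad_preprocess.preprocess_and_write additionally has to convert character spans to word spans. That conversion is not shown on this page; a minimal sketch of one way to do it (the helper name is made up, and it assumes every token appears verbatim in the context string):

def char_span_to_word_span(context, context_tokens, char_start, char_end):
    # Hypothetical helper: record each token's character offsets, then pick
    # the first and last tokens that overlap the answer's character span.
    offsets, idx = [], 0
    for token in context_tokens:
        idx = context.find(token, idx)
        offsets.append((idx, idx + len(token)))
        idx += len(token)
    word_start = next(i for i, (_, e) in enumerate(offsets) if e > char_start)
    word_end = max(i for i, (s, _) in enumerate(offsets) if s < char_end)
    return word_start, word_end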
def preprocess_dataset(dataset):
    """
    Note: this is similar to squad_preprocess.preprocess_and_write, but:
      (1) We only extract the context and question information from the JSON file.
        We don't extract answer information. This makes this function much simpler
        than squad_preprocess.preprocess_and_write, because we don't have to convert
        the character spans to word spans. This also means that we don't have to
        discard any examples due to tokenization problems.

    Input:
      dataset: data read from SQuAD JSON file

    Returns:
      qn_uuid_data, context_token_data, qn_token_data: lists of uuids, tokenized context and tokenized questions
    """
    qn_uuid_data = []
    context_token_data = []
    qn_token_data = []

    for articles_id in tqdm(range(len(dataset['data'])), desc="Preprocessing data"):
        article_paragraphs = dataset['data'][articles_id]['paragraphs']
        for pid in range(len(article_paragraphs)):

            context = str(article_paragraphs[pid]['context']) # string

            # The following replacements are suggested in the paper
            # BidAF (Seo et al., 2016)
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')

            context_tokens = tokenize(context) # list of strings (lowercase)
            context = context.lower()

            qas = article_paragraphs[pid]['qas'] # list of questions

            # for each question
            for qn in qas:

                # read the question text and tokenize
                question = str(qn['question']) # string
                question_tokens = tokenize(question) # list of strings

                # also get the question_uuid
                question_uuid = qn['id']

                # Append to data lists
                qn_uuid_data.append(question_uuid)
                context_token_data.append(context_tokens)
                qn_token_data.append(question_tokens)

    return qn_uuid_data, context_token_data, qn_token_data
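A hedged usage sketch for preprocess_dataset: the path is illustrative, data_from_json is the loader used elsewhere on this page, and vocab/UNK_ID are assumed to be a word-to-id dict and its unknown-word id.

dev_data = data_from_json('data/dev-v1.1.json')            # illustrative path
qn_uuid_data, context_token_data, qn_token_data = preprocess_dataset(dev_data)
# Tokens can then be mapped to ids, mirroring the read_dataset examples above:
context_ids = [[vocab.get(w, UNK_ID) for w in tokens] for tokens in context_token_data]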
def read_dev_dataset(dev_dataset, tier, vocab):
    """Reads the dev dataset json file and extracts the input data (context and question
    vectors) and question uuid data.
    """

    dev_question_data = []
    dev_context_data = []
    dev_question_uuid_data = []

    for articles_id in tqdm(range(len(dev_dataset['data'])),
                            desc="Preprocessing {}".format(tier)):
        article_paragraphs = dev_dataset['data'][articles_id]['paragraphs']
        for pid in range(len(article_paragraphs)):
            context = article_paragraphs[pid]['context']
            # The following replacements are suggested in the paper
            # BidAF (Seo et al., 2016)
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')

            context_tokens = tokenize(context)

            qas = article_paragraphs[pid]['qas']
            for qid in range(len(qas)):
                question = qas[qid]['question']
                question_tokens = tokenize(question)
                question_uuid = qas[qid]['id']

                context_ids = [
                    int(vocab.get(w, UNK_ID)) for w in context_tokens
                ]
                question_ids = [
                    int(vocab.get(w, UNK_ID)) for w in question_tokens
                ]



                dev_question_datum = np.array([
                    question_ids[i] if i < len(question_ids) else 0
                    for i in range(FLAGS.question_seq_length)
                ])
                dev_question_data.append(dev_question_datum)

                dev_context_datum = np.array([
                    context_ids[i] if i < len(context_ids) else 0
                    for i in range(FLAGS.context_seq_length)
                ])
                dev_context_data.append(dev_context_datum)

                dev_question_uuid_data.append(question_uuid)

    dev_question_data = np.array(dev_question_data)
    dev_context_data = np.array(dev_context_data)
    dev_question_uuid_data = np.array(dev_question_uuid_data)
    return dev_question_data, dev_context_data, dev_question_uuid_data
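The two list comprehensions above share the same truncate-and-pad pattern; a small helper that captures it (the name and the pad id of 0 are choices made here, not part of the original code):

def pad_or_truncate(ids, max_len, pad_id=0):
    # Keep the first max_len ids and fill any remainder with pad_id.
    return np.array([ids[i] if i < len(ids) else pad_id for i in range(max_len)])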
Example #7
def get_raw_tokens(dataset, tier, vocab, rev_vocab, embeddings):
    vocab2 = {}
    context_maps = []

    for articles_id in tqdm(range(len(dataset['data'])),
                            desc="Preprocessing {}".format(tier)):
        article_paragraphs = dataset['data'][articles_id]['paragraphs']
        for pid in range(len(article_paragraphs)):

            context = article_paragraphs[pid]['context']
            # The following replacements are suggested in the paper
            # BidAF (Seo et al., 2016)
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')

            context_tokens, _, _ = tokenize(context, tokenizer=FLAGS.tokenizer)

            for token in context_tokens:
                vocab2[token] = 1
            context_map = {
                'context_tokens': context_tokens,
                'question_maps': []
            }

            qas = article_paragraphs[pid]['qas']
            for qid in range(len(qas)):
                question = qas[qid]['question']
                question_tokens, _, _ = tokenize(question,
                                                 tokenizer=FLAGS.tokenizer)
                question_uuid = qas[qid]['id']

                for token in question_tokens:
                    vocab2[token] = 1
                question_map = {
                    'question_tokens': question_tokens,
                    'question_uuid': question_uuid
                }
                context_map['question_maps'].append(question_map)

            context_maps.append(context_map)

    if FLAGS.word_lookup:
        missing_words = find_missing_words(vocab2, vocab)
        vocab, rev_vocab, embeddings, _ = adu.enhance_vocabulary(
            vocab, rev_vocab, embeddings, missing_words)

    return context_maps, vocab, rev_vocab, embeddings
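find_missing_words and adu.enhance_vocabulary come from the surrounding repository and are not shown on this page. A plausible sketch of the first one, under the assumption that it simply collects tokens seen in the data that have no entry in the existing vocabulary:

def find_missing_words(observed_vocab, vocab):
    # Hypothetical sketch: tokens that occur in the dataset but have no id yet.
    return [w for w in observed_vocab if w not in vocab]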
Example #8
def get_question_context_data(question_string, context_json_file):

    context_string = data_from_json(context_json_file)['context']
    context = str(context_string)  # string

    # The following replacements are suggested in the paper
    # BidAF (Seo et al., 2016)
    context = context.replace("''", '" ')
    context = context.replace("``", '" ')

    context_tokens = tokenize(context)  # list of strings (lowercase)
    context = context.lower()

    question = str(question_string)  # string
    question_tokens = tokenize(question)  # list of strings

    # there is no real uuid for an ad-hoc question, so use the token count
    # as a placeholder id
    question_uuid = len(question_tokens)

    return [question_uuid], [context_tokens], [question_tokens]
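A hypothetical call of the function above, assuming paragraph.json is a JSON file with a top-level "context" field:

uuids, context_tokens, question_tokens = get_question_context_data(
    "When was the university founded?", "paragraph.json")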
Example #9
    def evaluate(self, thresh=0.05):
        dataset = squad.Squad(train=True)
        prediction = []
        for index, [context, qas] in enumerate(dataset):
            if index % 100 == 0:
                print(index)
            contexts = []
            for sentence in sent_tokenizer.tokenize(context):
                sentence = tokenize(sentence)
                sentence = [
                    word if in_vocab(self.vocab, word) else UNK
                    for word in sentence
                ]
                contexts.append(" ".join(sentence))

            context_vec = self.vectorizer.transform(contexts)
            for qa in qas:
                question, answer, answer_start, is_impossible = qa
                answer_end = answer_start + len(answer)

                question = [
                    word if in_vocab(self.vocab, word) else UNK
                    for word in tokenize(question)
                ]
                question = " ".join(question)
                question_vec = self.vectorizer.transform([question])
                scores = [
                    cosine_similarity(question_vec, vec).flatten()
                    for vec in context_vec
                ]
                scores = np.asarray(scores).flatten()

                ranks = np.argsort(scores)[::-1]

                if scores[ranks[0]] > thresh:
                    prediction.append(
                        is_correct(contexts, ranks[0], answer_start,
                                   answer_end))
        accuracy = sum(prediction) / len(prediction)
        print(accuracy)
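is_correct is not defined on this page. One possible reading, offered only as a hedged sketch: it checks whether the gold answer's character span falls inside the character range of the sentence chosen by the retriever, with offsets computed over the joined tokenized sentences (which only approximates offsets in the original context):

def is_correct(contexts, rank, answer_start, answer_end):
    # Hypothetical sketch: character range covered by the chosen sentence,
    # assuming one separating space between sentences.
    start = sum(len(s) + 1 for s in contexts[:rank])
    end = start + len(contexts[rank])
    return start <= answer_start and answer_end <= end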
Example #10
def compute_vectors():
    """ Computes tfidf vectors for the dataset and pickles the vectorizer """
    files = ["answer", "question", "context"]
    types = ["dev", "train"]
    files = ["{}.{}".format(t, f) for f in files for t in types]
    files = [(os.path.join("data", file)) for file in files]

    text = []
    for file in files:
        with open(file, mode='r', encoding='utf8') as f:
            text.append(f.read())

    text = " ".join(text)
    sentences = text.splitlines()

    words = tokenize(text)
    vocab = Counter(words).most_common(10000)
    vocab = [word for word, count in vocab]
    vocab.append(UNK)
    vocab = sorted(vocab)
    print("Vocab size : ", len(vocab))
    with open(vocab_file, mode='w', encoding='utf8') as file:
        file.write("\n".join(vocab))

    # Add more text to aid tf-idf computation
    print("Processing sentences")
    base_text = open("baseline/base", encoding='utf8', mode='r').readlines()
    for sentence in base_text:
        sentence = tokenize(sentence)
        sentence = [
            word if in_vocab(vocab, word) else UNK for word in sentence
        ]
        sentences.append(" ".join(sentence))

    print("Fitting vectorizer")
    vectorizer = TfidfVectorizer().fit(sentences)
    with open(vector_file, mode='wb') as f:
        pickle.dump(vectorizer, f)
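A sketch of how the pickled vectorizer could be reused for ranking later, assuming vector_file is the pickle written above, sentences is any list of candidate sentence strings, and cosine_similarity is the sklearn function already used in the evaluate example:

with open(vector_file, mode='rb') as f:
    vectorizer = pickle.load(f)
question_vec = vectorizer.transform(["when was the university founded"])
sentence_vecs = vectorizer.transform(sentences)
scores = cosine_similarity(question_vec, sentence_vecs).flatten()
best_sentence = sentences[int(scores.argmax())]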
Example #11
def do_shell(model, dev, input_model=None):
    """ Interactive shell

    Type a question, write next for the next paragraph or enter a blank for another human's question.  

    Args:  
        model: QA model that has an instance variable 'answer' that returns answer span and takes placeholders  
        question, question_length, paragraph, paragraph_length  
        dev: Development set
    """
    # what is is_training if import_meta_graph
    checkpoint_dir = os.path.join(FLAGS.train_dir, FLAGS.model_name)
    vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    vocab, rev_vocab = initialize_vocab(vocab_path)
    # TODO no logs
    saver = tf.train.Saver()
    with tf.Session() as session:
        if False:  # load_meta
            last_meta = next(
                reversed(
                    [f for f in os.listdir(checkpoint_dir) if '.meta' in f]))
            saver = tf.train.import_meta_graph(os.path.join(last_meta))
        saver.restore(session, tf.train.latest_checkpoint(checkpoint_dir))
        print('HINT: Input as question "next" for next paragraph')
        while True:
            original_question, paragraphs, question_lengths, paragraph_lengths, answers = dev.get_batch(
                1)
            for i in itertools.count():
                paragraph = reverse_indices(paragraphs[0], rev_vocab)
                if not i:
                    print('\n')
                    print(paragraph, end='\n\n')

                question_input = input('QUESTION: ')

                if question_input == 'next':
                    break
                elif question_input:
                    question = [
                        vocab.get(word, UNK_ID)
                        for word in tokenize(question_input)
                    ]
                    question, question_length = pad_sequence(
                        question, FLAGS.max_question_length)
                    questions, question_lengths = [question], [question_length]
                else:
                    question_words = reverse_indices(original_question[0],
                                                     rev_vocab)
                    questions = original_question
                    print(question_words)

                if input_model:
                    #feed into siamese model instead
                    question = feed_dict_inputs[0]
                    question = input_model.run(question)
                feed_dict = model.fill_feed_dict(questions, paragraphs,
                                                 question_lengths,
                                                 paragraph_lengths)

                if False:  #load_meta
                    start, end = session.run([
                        'prediction/answer_start:0', 'prediction/answer_end:0'
                    ], feed_dict)
                    start, end = start[0], end[0]
                else:
                    start, end = session.run(model.answer, feed_dict)
                    start, end = start[0], end[0]

                answer_idxs = paragraphs[0][start:end + 1]
                answer_words = ''.join(reverse_indices(answer_idxs, rev_vocab))
                print(f'COMPUTER: {answer_words}')

                if not question_input:
                    start, end = answers[0]
                    correct_answer_idxs = paragraphs[0][start:end + 1]
                    correct_answer = ''.join(
                        reverse_indices(correct_answer_idxs, rev_vocab))
                    print(f'HUMAN: {correct_answer}')
                print()
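pad_sequence and reverse_indices come from the surrounding repository and are not shown here. Hedged sketches of what they plausibly do, given how do_shell uses them:

def pad_sequence(ids, max_len, pad_id=0):
    # Hypothetical: truncate or pad to max_len and also return the clipped
    # length, since do_shell expects (padded_ids, length).
    length = min(len(ids), max_len)
    return ids[:max_len] + [pad_id] * (max_len - length), length

def reverse_indices(ids, rev_vocab):
    # Hypothetical: map ids back to tokens and join them into a string.
    return ' '.join(rev_vocab[i] for i in ids)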
def read_dataset(dataset, tier, vocab):
    """Reads the dataset, extracts context, question, answer,
    and answer pointer in their own file. Returns the number
    of questions and answers processed for the dataset"""
    context_word_cnt = 0
    context_ukn_word_cnt = 0

    context_tokens_data = []
    context_data = []
    question_tokens_data = []
    query_data = []
    question_uuid_data = []
    rand_max = len(vocab.values())
    context_lengths = []

    if FLAGS.eval_on_train:
        s_labels = []
        e_labels = []
#        true_answers = []

    for articles_id in tqdm(range(len(dataset['data'])),
                            desc="Preprocessing {}".format(tier)):
        article_paragraphs = dataset['data'][articles_id]['paragraphs']
        for pid in range(len(article_paragraphs)):
            context = article_paragraphs[pid]['context']
            # The following replacements are suggested in the paper
            # BidAF (Seo et al., 2016)
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')

            context_tokens = tokenize(context)

            qas = article_paragraphs[pid]['qas']

            for qid in range(len(qas)):
                context_lengths.append(len(context_tokens))
                question = qas[qid]['question']
                question_tokens = tokenize(question)
                question_uuid = qas[qid]['id']

                context_ids = [
                    vocab.get(w, qa_data.UNK_ID) for w in context_tokens
                ]
                question_ids = [
                    vocab.get(w, qa_data.UNK_ID) for w in question_tokens
                ]
                context_word_cnt += len(context_ids)

                for i in range(len(context_ids)):
                    if context_ids[i] == qa_data.UNK_ID:
                        if FLAGS.rand_unknown:
                            context_ids[i] = random.randint(0, rand_max - 1)
                        context_ukn_word_cnt += 1
                        #print(context_tokens[i])

                if FLAGS.rand_unknown:
                    for i in range(len(question_ids)):
                        if question_ids[i] == qa_data.UNK_ID:
                            question_ids[i] = random.randint(0, rand_max - 1)

                context_data.append(context_ids)
                query_data.append(question_ids)
                question_uuid_data.append(question_uuid)
                context_tokens_data.append(context_tokens)
                question_tokens_data.append(question_tokens)


#                if FLAGS.eval_on_train:
#                    answer = qas[qid]['answers'][0]['text'].split()
# Wrong because qas[qid]['answers'][0]['answer_start'] is the token, not index
#s_labels.append(qas[qid]['answers'][0]['answer_start'])
#e_labels.append(qas[qid]['answers'][0]['answer_start'] + len(answer) - 1)
# remove answer
#                    true_answers.append(answer)
#print(sorted(context_lengths))
    context_lengths_over = [
        context_length > 300 for context_length in context_lengths
    ]
    print('+' * 100)
    print('Fraction of questions with context longer than 300 tokens: ' +
          str(sum(context_lengths_over) / len(context_lengths)))
    print('Fraction of unknown context words: ' +
          str(context_ukn_word_cnt / context_word_cnt))
    # remove answer
    #    if FLAGS.eval_on_train:
    #        return context_tokens_data, context_data, question_tokens_data, query_data, question_uuid_data, s_labels, e_labels, true_answers
    return context_tokens_data, context_data, question_tokens_data, query_data, question_uuid_data
Example #13
def read_dataset(dataset, tier, vocab):
    """Reads the dataset, extracts context, question, answer,
    and answer pointer in their own file. Returns the number
    of questions and answers processed for the dataset"""

    context_data = []
    query_data = []
    question_uuid_data = []
    context_mask = []
    query_mask = []
    mask = []

    for articles_id in tqdm(range(len(dataset['data'])),
                            desc="Preprocessing {}".format(tier)):
        article_paragraphs = dataset['data'][articles_id]['paragraphs']
        for pid in range(len(article_paragraphs)):
            context = article_paragraphs[pid]['context']
            # The following replacements are suggested in the paper
            # BidAF (Seo et al., 2016)
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')

            context_tokens = tokenize(context)
            # note this function is added by ourselves

            if len(context_tokens) > FLAGS.output_size:
                context_tokens = context_tokens[:FLAGS.output_size]
            vec = []
            vec.extend([True] * len(context_tokens))
            if len(context_tokens) < FLAGS.output_size:
                vec.extend([False] * (FLAGS.output_size - len(context_tokens)))

            qas = article_paragraphs[pid]['qas']
            for qid in range(len(qas)):
                question = qas[qid]['question']
                question_tokens = tokenize(question)
                # note this part
                if len(question_tokens) > FLAGS.max_length:
                    question_tokens = question_tokens[:FLAGS.max_length]
                query_mask.append(len(question_tokens))
                question_uuid = qas[qid]['id']

                context_ids = [
                    vocab.get(w, qa_data.UNK_ID) for w in context_tokens
                ]
                context_mask.append(len(context_ids))
                if len(context_ids) < FLAGS.output_size:
                    context_ids.extend([0] *
                                       (FLAGS.output_size - len(context_ids)))
                question_ids = [
                    vocab.get(w, qa_data.UNK_ID) for w in question_tokens
                ]
                if len(question_ids) < FLAGS.max_length:
                    question_ids.extend([0] *
                                        (FLAGS.max_length - len(question_ids)))

                # context_data.append(' '.join(context_ids))
                # query_data.append(' '.join(question_ids))
                context_data.append(context_ids)
                query_data.append(question_ids)
                question_uuid_data.append(question_uuid)
                mask.append(vec)

    return context_data, query_data, question_uuid_data, context_mask, query_mask, mask
Example #14
def expand_vocab(prefix, dev_filename, vocab, embd, raw_glove,
                 raw_glove_vocab):

    # Don't check file size, since we could be using other datasets
    dev_dataset = maybe_download(squad_base_url, dev_filename, prefix)
    dev_data = data_from_json(os.path.join(prefix, dev_filename))
    #context_data, question_data, question_uuid_data = read_dataset(dev_data, 'dev', vocab)
    dataset = dev_data
    context_data = []
    query_data = []
    question_uuid_data = []
    tier = 'dev'
    new_vocab = {}
    found = 0
    notfound = 0

    for articles_id in tqdm(range(len(dataset['data'])),
                            desc="Preprocessing {}".format(tier)):
        article_paragraphs = dataset['data'][articles_id]['paragraphs']
        for pid in range(len(article_paragraphs)):
            context = article_paragraphs[pid]['context']
            # The following replacements are suggested in the paper
            # BidAF (Seo et al., 2016)
            context = context.replace("''", '" ')
            context = context.replace("``", '" ')

            context_tokens = tokenize(context)

            qas = article_paragraphs[pid]['qas']
            for qid in range(len(qas)):
                question = qas[qid]['question']
                question_tokens = tokenize(question)
                question_uuid = qas[qid]['id']

                #context_ids = [str(vocab.get(w, qa_data.UNK_ID)) for w in context_tokens]
                #qustion_ids = [str(vocab.get(w, qa_data.UNK_ID)) for w in question_tokens]
                #print(context_ids)
                for w in context_tokens:
                    if w not in vocab:
                        new_vocab[w] = new_vocab.get(w, 0) + 1
                        notfound += 1
                    else:
                        found += 1

                for w in question_tokens:
                    if w not in vocab:
                        new_vocab[w] = new_vocab.get(w, 0) + 1
                        notfound += 1
                    else:
                        found += 1

    print('found/not found: {}/{}, {}% not found'.format(
        found, notfound, 100 * notfound / float(found + notfound)))
    print('New vocabulary:', len(new_vocab))

    vocab_list = list(vocab.items())
    vn = len(vocab_list)
    for i, w in enumerate(new_vocab):
        vocab_list.append((w, vn + i))

    vocab = dict(vocab_list)
    rev_vocab = dict([(x, y) for (y, x) in vocab_list])
    #context_data.append(' '.join(context_ids))
    #query_data.append(' '.join(qustion_ids))
    #question_uuid_data.append(question_uuid)
    #return context_data, question_data, question_uuid_data
    _, dim = embd.shape
    new_glove = np.random.randn(len(vocab), dim)
    new_glove[:vn, :] = embd

    found = 0
    for i in range(vn, vn + (len(new_vocab))):
        word = vocab_list[i][0]
        if word in raw_glove_vocab:
            found += 1
            idx = raw_glove_vocab[word]
            new_glove[i, :] = raw_glove[idx, :]
        elif word.capitalize() in raw_glove_vocab:
            found += 1
            idx = raw_glove_vocab[word.capitalize()]
            new_glove[i, :] = raw_glove[idx, :]
        elif word.upper() in raw_glove_vocab:
            found += 1
            idx = raw_glove_vocab[word.upper()]
            new_glove[i, :] = raw_glove[idx, :]
    #from IPython import embed; embed()
    print("{} unseen words found embeddings".format(found))

    return vocab, rev_vocab, new_glove
Example #15
def read_write_dataset_(dataset, tier, prefix):
    """Reads the dataset, extracts context, question, answer,
    and answer pointer in their own file. Returns the number
    of questions and answers processed for the dataset"""
    qn, an = 0, 0
    skipped = 0

    with open(os.path.join(prefix, tier +'.context'), 'w') as context_file,  \
         open(os.path.join(prefix, tier +'.question'), 'w') as question_file:#,\
         #open(os.path.join(prefix, tier +'.answer'), 'w') as text_file, \
         #open(os.path.join(prefix, tier +'.span'), 'w') as span_file:
        question_uuid_data = []
        for articles_id in tqdm(range(len(dataset['data'])), desc="Preprocessing {}".format(tier)):
            article_paragraphs = dataset['data'][articles_id]['paragraphs']
            for pid in range(len(article_paragraphs)):
                context = article_paragraphs[pid]['context']
                # The following replacements are suggested in the paper
                # BidAF (Seo et al., 2016)
                context = context.replace("''", '" ')
                context = context.replace("``", '" ')

                context_tokens = tokenize(context)
                #answer_map = token_idx_map(context, context_tokens)

                qas = article_paragraphs[pid]['qas']
                for qid in range(len(qas)):
                    question = qas[qid]['question']
                    question_tokens = tokenize(question)
                    question_uuid = qas[qid]['id']
                    #answers = qas[qid]['answers']
                    qn += 1

                    num_answers = range(1)

                    for ans_id in num_answers:
                        # it contains answer_start, text
                        #text = qas[qid]['answers'][ans_id]['text']
                        #a_s = qas[qid]['answers'][ans_id]['answer_start']

                        #text_tokens = tokenize(text)

                        #answer_start = qas[qid]['answers'][ans_id]['answer_start']

                        #answer_end = answer_start + len(text)

                        #last_word_answer = len(text_tokens[-1]) # add one to get the first char

                        try:
                            #a_start_idx = answer_map[answer_start][1]

                            #a_end_idx = answer_map[answer_end - last_word_answer][1]

                            # remove length restraint since we deal with it later
                            context_file.write(' '.join(context_tokens) + '\n')
                            question_file.write(' '.join(question_tokens) + '\n')
                            #text_file.write(' '.join(text_tokens) + '\n')
                            #span_file.write(' '.join([str(a_start_idx), str(a_end_idx)]) + '\n')
                            question_uuid_data.append(question_uuid)
                        except Exception:# as e:
                            skipped += 1

                        an += 1

    print("Skipped {} question/answer pairs in {}".format(skipped, tier))
    return qn, an, question_uuid_data
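A hedged usage sketch for the function above: the paths are illustrative, and data_from_json is the loader used in the earlier examples.

dev_data = data_from_json(os.path.join('data', 'dev-v1.1.json'))
num_questions, num_answers, uuids = read_write_dataset_(dev_data, 'dev', 'data')
print('Wrote dev.context and dev.question for', num_questions, 'questions')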