Exemple #1
0
def main():
    """Interactively print TriviaQA-web training paragraphs with many answers.

    Every training document that has more than three answer spans is split
    into merged paragraphs, pruned with a TF-IDF selector, and each surviving
    paragraph that itself has more than three answer spans is printed with
    its answer spans wrapped in ``{{{ ... }}}``.  The function waits for the
    user to press enter after each printed paragraph.
    """
    dataset = TriviaQaWebDataset()

    stop_words = NltkPlusStopWords()
    para_splitter = MergeParagraphs(400)
    para_selector = TopTfIdf(stop_words, 4)

    print("Loading data..")
    train_questions = dataset.get_train()
    print("Start")
    for question in train_questions:
        for doc in question.all_docs:
            # Only documents with more than three answer spans are inspected.
            if len(doc.answer_spans) <= 3:
                continue
            paragraphs = para_splitter.split_annotated(
                dataset.evidence.get_document(doc.doc_id), doc.answer_spans)
            paragraphs = para_selector.prune(question.question, paragraphs)
            for para in paragraphs:
                if len(para.answer_spans) <= 3:
                    continue
                print(question.question)
                tokens = flatten_iterable(para.text)
                # Mark the first and last token of every answer span.
                for start, end in para.answer_spans:
                    tokens[start] = "{{{" + tokens[start]
                    tokens[end] = tokens[end] + "}}}"
                print(" ".join(tokens))
                input()
Exemple #2
0
def read_input_data(model):
  """Read (document, question) pairs from ``OPTS.input_file`` and prepare model input.

  Each line of the input file must hold exactly two tab-separated fields: the
  raw document text and the raw question.  The document is split into
  paragraphs, tokenized, truncated, and pruned with a TF-IDF selector.

  Args:
    model: object exposing a ``preprocessor`` attribute; when it is not None
      its ``encode_text(question, paragraph)`` is used to build each context,
      otherwise the paragraph text is simply flattened.

  Returns:
    A ``(data, vocab)`` tuple.  ``data`` is a list with one
    ``(document_raw, question_raw, context, examples)`` tuple per input line
    and ``vocab`` is the set of all question and context tokens seen.

  Raises:
    ValueError: if a line does not contain exactly two tab-separated fields
      (the offending line and its index are printed first).
  """
  data = []
  vocab = set()
  tokenizer = NltkAndPunctTokenizer()
  splitter = Truncate(400)  # NOTE: we truncate past 400 tokens
  selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
  # Raw string: "\s" inside a plain literal is an invalid escape sequence and
  # triggers a warning on recent Python versions.  Compiled once, outside the loop.
  paragraph_sep = re.compile(r"\s*\n\s*")
  with open(OPTS.input_file) as f:
    for i, line in enumerate(f):
      try:
        document_raw, question_raw = line.strip().split('\t')
      except ValueError:
        print(line.strip())
        print('Error at line %d' % i)
        raise
      document = paragraph_sep.split(document_raw)
      question = tokenizer.tokenize_paragraph_flat(question_raw)
      doc_toks = [tokenizer.tokenize_paragraph(p) for p in document]
      split_doc = splitter.split(doc_toks)
      context = selector.prune(question, split_doc)
      if model.preprocessor is not None:
        context = [model.preprocessor.encode_text(question, x) for x in context]
      else:
        context = [flatten_iterable(x.text) for x in context]
      vocab.update(question)
      for txt in context:
        vocab.update(txt)
      # The id suffix is the paragraph's index within this line's context,
      # not the line number; a distinct name avoids shadowing `i`.
      ex = [ParagraphAndQuestion(x, question, None, "user-question%d" % para_ix)
            for para_ix, x in enumerate(context)]
      data.append((document_raw, question_raw, context, ex))
  return data, vocab
def show_web_paragraphs():
    """Interactively display the two top-ranked paragraphs of TriviaQA-web documents.

    Question/document pairs from the training set are visited in random
    order.  A pair is shown only when at least two paragraphs were ranked and
    the second-ranked paragraph contains an answer.  Answer spans are wrapped
    in ``bcolors.CYAN`` and tokens matching a non-stop question word in
    ``bcolors.ERROR``.  The function waits for enter after each shown pair.
    """
    para_splitter = MergeParagraphs(400)
    stop = NltkPlusStopWords(True)
    ranker = TopTfIdf(stop, 6)
    stop_words = stop.words

    corpus = TriviaQaWebDataset()
    train = corpus.get_train()
    points = flatten_iterable([(q, d) for d in q.all_docs] for q in train)
    np.random.shuffle(points)

    for question, doc_ref in points:
        # Accent-stripped, lower-cased question words minus stop words.
        normalized = {strip_accents_unicode(w.lower()) for w in question.question}
        q_words = {w for w in normalized if w not in stop_words}

        paragraphs = para_splitter.split_annotated(
            corpus.evidence.get_document(doc_ref.doc_id), doc_ref.answer_spans)
        ranked = ranker.dists(question.question, paragraphs)
        if len(ranked) < 2 or len(ranked[1][0].answer_spans) == 0:
            continue
        print(" ".join(question.question))
        print(question.answer.all_answers)
        for rank, (para, dist) in enumerate(ranked[0:2]):
            tokens = flatten_iterable(para.text)
            print("Start=%d, Rank=%d, Dist=%.4f" % (para.start, rank, dist))
            if len(para.answer_spans) == 0:
                continue
            for s, e in para.answer_spans:
                tokens[s] = bcolors.CYAN + tokens[s]
                tokens[e] = tokens[e] + bcolors.ENDC
            for pos, tok in enumerate(tokens):
                if strip_accents_unicode(tok.lower()) in q_words:
                    tokens[pos] = bcolors.ERROR + tok + bcolors.ENDC
            print(" ".join(tokens))
        input()
def main():
  """Re-rank SQuAD paragraphs per question and report length statistics.

  Builds a TF-IDF ranker (vector-distance based when ``OPTS.use_vec_dist`` is
  set), runs it over the split named by ``OPTS.split``, prints the count and
  mean length before and after preprocessing, and writes the result when
  ``OPTS.out_file`` is set.
  """
  corpus = SquadCorpus()
  normalizer = WordNormalizer() if OPTS.normalize_before_ranking else None
  if OPTS.use_vec_dist:
    word_vecs = corpus.get_pruned_word_vecs('glove.840B.300d')
    prepro = SquadVectorTfIdfRanker(
        NltkPlusStopWords(True), OPTS.num_per_orig, True, word_vecs,
        word_normalizer=normalizer)
  else:
    prepro = SquadTfIdfRanker(
        NltkPlusStopWords(True), OPTS.num_per_orig, True,
        word_normalizer=normalizer)

  if OPTS.split == 'train':
    orig_data = corpus.get_train()
  else:
    orig_data = corpus.get_dev()

  # One length entry per question, so a paragraph is counted once for each
  # question attached to it.
  orig_lens = []
  for doc in orig_data:
    for para in doc.paragraphs:
      orig_lens.extend(len(para.text[0]) for _ in para.questions)

  new_data = preprocess_par(orig_data, corpus.evidence, prepro, n_processes=1)
  new_lens = [len(para.text) for q in new_data for para in q.paragraphs]
  print('%d original, mean %.2f words' % (len(orig_lens), np.mean(orig_lens)))
  print('%d new, mean %.2f words' % (len(new_lens), np.mean(new_lens)))
  if OPTS.out_file:
    write_output(OPTS.split, new_data, OPTS.out_file)
def get_para_filter(filter_name, per_document, n_paragraphs):
    """Build the paragraph filter identified by ``filter_name``.

    Args:
        filter_name: ``"tfidf"``, ``"truncate"``, ``"linear"`` or None.  When
            None, ``"tfidf"`` is used if ``per_document`` is truthy and
            ``"linear"`` otherwise.
        per_document: only consulted to pick the default filter name.
        n_paragraphs: passed through to the filter's constructor.

    Returns:
        The constructed paragraph filter.

    Raises:
        ValueError: if ``filter_name`` is not one of the known names.
    """
    if filter_name is None:
        filter_name = 'tfidf' if per_document else 'linear'

    if filter_name == "tfidf":
        return TopTfIdf(NltkPlusStopWords(punctuation=True), n_paragraphs)
    if filter_name == "truncate":
        return FirstN(n_paragraphs)
    if filter_name == "linear":
        return ShallowOpenWebRanker(n_paragraphs)
    raise ValueError()
Exemple #6
0
def contains_question_word():
    """Measure how often answer-bearing paragraphs share a word with the question.

    Takes a fixed pseudo-random sample of up to 1000 (question, document)
    pairs from the TriviaQA-web dev set, splits each document into merged
    paragraphs, and for every paragraph that contains an answer checks whether
    any of its (lower-cased) tokens is a non-stop question word.  Prints the
    200 most frequently matched words followed by the fraction of
    answer-bearing paragraphs with at least one match.
    """
    data = TriviaQaWebDataset()
    stop = NltkPlusStopWords(punctuation=True).words
    splits = MergeParagraphs(400)
    questions = data.get_dev()
    pairs = flatten_iterable([(q, doc) for doc in q.all_docs]
                             for q in questions)
    # Sort first so the seeded shuffle below produces the same sample each run.
    pairs.sort(key=lambda x: (x[0].question_id, x[1].doc_id))
    # Shuffle the pairs themselves: shuffling `questions` at this point would
    # not change `pairs`, and the [:1000] slice would just be a sorted prefix.
    np.random.RandomState(0).shuffle(pairs)
    has_token = 0
    total = 0
    used = Counter()

    for q, doc in tqdm(pairs[:1000]):
        text = data.evidence.get_document(doc.doc_id, splits.reads_first_n)
        q_tokens = set(x.lower() for x in q.question)
        q_tokens -= stop
        for para in splits.split_annotated(text, doc.answer_spans):
            if len(para.answer_spans) == 0:
                continue
            total += 1
            # Compare and count on the lower-cased form so that capitalised
            # occurrences are tallied as well as detected.
            matched = [w for w in (x.lower() for x in flatten_iterable(para.text))
                       if w in q_tokens]
            if matched:
                has_token += 1
                used.update(matched)

    for k, v in used.most_common(200):
        print("%s: %d" % (k, v))
    if total == 0:
        # Avoid a ZeroDivisionError when no sampled paragraph holds an answer.
        print("No paragraphs with answers found")
        return
    print(has_token / total)
def find_answer(documents, raw_question):
    """Return the most confident answer span for a question over some documents.

    The question and documents are lower-cased, the documents are split into
    paragraphs on newlines, tokenized, and merged into paragraphs of up to 400
    tokens.  With a single document the top 5 paragraphs are chosen by TF-IDF;
    with several, the top 10 are chosen by ``ShallowOpenWebRanker`` across all
    documents.  The selected paragraphs are run through the module-level
    ``model`` using the module-level ``sess``, ``best_spans`` and ``conf``.

    Args:
        documents: iterable of raw document strings.
        raw_question: the question as a raw string.

    Returns:
        ``(ans, confidence)``: the answer tokens joined by spaces and the
        confidence value of the paragraph the answer was taken from.
    """
    global best_spans, conf

    raw_question = raw_question.lower()
    documents = [d.lower() for d in documents]

    # Raw string: "\s" inside a plain literal is an invalid escape sequence.
    paragraph_sep = re.compile(r"\s*\n\s*")
    documents = [paragraph_sep.split(doc) for doc in documents]
    tokenizer = NltkAndPunctTokenizer()

    question = tokenizer.tokenize_paragraph_flat(raw_question)

    documents = [[tokenizer.tokenize_paragraph(p) for p in doc]
                 for doc in documents]

    splitter = MergeParagraphs(400)

    documents = [splitter.split(doc) for doc in documents]

    if len(documents) == 1:
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
        context = selector.prune(question, documents[0])
    else:
        selector = ShallowOpenWebRanker(n_to_select=10)
        context = selector.prune(question, flatten_iterable(documents))

    context = [flatten_iterable(x.text) for x in context]

    data = [
        ParagraphAndQuestion(x, question, None, "user-question%d" % i)
        for i, x in enumerate(context)
    ]

    encoded = model.encode(data, is_train=False)

    with sess.as_default():
        spans, confid = sess.run([best_spans, conf], feed_dict=encoded)

    # One prediction per paragraph; keep the most confident one.
    best_para = np.argmax(confid)
    span_start, span_end = spans[best_para][0], spans[best_para][1]
    # The end index is sliced as inclusive, hence the +1.
    ans = " ".join(context[best_para][span_start:span_end + 1])
    confidence = confid[best_para]

    return ans, confidence
def main():
    """Re-rank SQuAD paragraphs per question and report length statistics.

    Runs a TF-IDF ranker over the split named by ``OPTS.split``, prints the
    count and mean length before and after preprocessing, and writes the
    result when ``OPTS.out_file`` is set.
    """
    corpus = SquadCorpus()
    prepro = SquadTfIdfRanker(NltkPlusStopWords(True), OPTS.num_per_orig, True)

    if OPTS.split == 'train':
        orig_data = corpus.get_train()
    else:
        orig_data = corpus.get_dev()

    # One length entry per question, so a paragraph is counted once for each
    # question attached to it.
    orig_lens = []
    for doc in orig_data:
        for para in doc.paragraphs:
            orig_lens.extend(len(para.text[0]) for _ in para.questions)

    new_data = preprocess_par(
        orig_data, corpus.evidence, prepro, n_processes=1)
    new_lens = [len(para.text) for q in new_data for para in q.paragraphs]

    print('%d original, mean %.2f words' %
          (len(orig_lens), np.mean(orig_lens)))
    print('%d new, mean %.2f words' % (len(new_lens), np.mean(new_lens)))
    if OPTS.out_file:
        write_output(OPTS.split, new_data, OPTS.out_file)
def show_open_paragraphs(start: int, end: int):
    """Interactively display ranked paragraphs ``start`` (inclusive) to ``end`` (exclusive).

    For each question of the TriviaQA-open dev set, visited in random order,
    all of the question's documents are split into merged paragraphs and
    ranked with ``ShallowOpenWebRanker``.  The paragraphs whose rank falls in
    ``[start, end)`` are printed with answer spans wrapped in ``bcolors.CYAN``
    and tokens matching a non-stop question word in ``bcolors.ERROR``.
    Questions with no paragraph in that rank window are skipped.  The function
    waits for enter after each shown question.

    Args:
        start: first rank to show.
        end: rank at which to stop (not shown).
    """
    splitter = MergeParagraphs(400)
    stop = NltkPlusStopWords(True)
    ranker = ShallowOpenWebRanker(6)
    stop_words = stop.words

    # NOTE(review): the message says "train" but the dev split is what is loaded.
    print("Loading train")
    corpus = TriviaQaOpenDataset()
    train = corpus.get_dev()
    np.random.shuffle(train)

    for q in train:
        q_words = {strip_accents_unicode(w.lower()) for w in q.question}
        q_words = {x for x in q_words if x not in stop_words}

        paragraphs = []
        for d in q.all_docs:
            doc = corpus.evidence.get_document(d.doc_id)
            paragraphs += splitter.split_annotated(doc, d.answer_spans)

        ranked = ranker.prune(q.question, paragraphs)
        window = ranked[start:end]
        if len(window) == 0:
            continue

        print(" ".join(q.question))
        print(q.answer.all_answers)
        # Enumerate the slice itself, offset by `start`, so the printed rank
        # is the real rank and we never index past the end of the slice.
        for rank, para in enumerate(window, start):
            text = flatten_iterable(para.text)
            print("Start=%d, Rank=%d" % (para.start, rank))
            if len(para.answer_spans) == 0:
                continue
            for s, e in para.answer_spans:
                text[s] = bcolors.CYAN + text[s]
                text[e] = text[e] + bcolors.ENDC
            for pos, w in enumerate(text):
                if strip_accents_unicode(w.lower()) in q_words:
                    text[pos] = bcolors.ERROR + w + bcolors.ENDC
            print(" ".join(text))
        input()
def main():
    """Print answer-recall statistics by paragraph rank for TriviaQA-open dev.

    Keeps the dev questions that have an answer span in at least one document,
    shuffles them, preprocesses the first 2000 with a TF-IDF selector keeping
    20 paragraphs, and prints three arrays indexed by rank: the cumulative
    count of questions whose first answer-bearing paragraph is at or before
    that rank divided by ``out.true_len``, the fraction of paragraphs at that
    rank holding an answer, and the number of paragraphs seen at that rank.
    """
    data = TriviaQaOpenDataset()
    # data = TriviaQaWebDataset()
    print("Loading...")
    all_questions = data.get_dev()

    def _has_answer(question):
        return any(len(doc.answer_spans) > 0 for doc in question.all_docs)

    questions = [q for q in all_questions if _has_answer(q)]
    n_with_answer, n_total = len(questions), len(all_questions)
    print("%d/%d (%.4f) have an answer" %
          (n_with_answer, n_total, n_with_answer / n_total))

    np.random.shuffle(questions)

    pre = ExtractMultiParagraphsPerQuestion(
        MergeParagraphs(400),
        TopTfIdf(NltkPlusStopWords(), 20),
        require_an_answer=False)
    print("Done")

    out = preprocess_par(questions[:2000], data.evidence, pre, 2, 1000)

    n_counts = np.zeros(20)
    n_any = np.zeros(20)
    n_any_all = np.zeros(20)

    for q in out.data:
        first_hit = None
        for rank, para in enumerate(q.paragraphs):
            has_answer = len(para.answer_spans) > 0
            n_counts[rank] += 1
            n_any[rank] += has_answer
            if has_answer and first_hit is None:
                first_hit = rank
        # Credit every rank from the first answer-bearing paragraph onwards.
        if first_hit is not None:
            n_any_all[first_hit:] += 1

    print(n_any_all / out.true_len)
    print(n_any / n_counts)
    print(n_counts)
def show_stats():
    """Print where answers fall among TF-IDF-ranked TriviaQA-web paragraphs.

    Uses a random sample of up to 1000 (question, document) training pairs.
    Prints, per rank, the fraction of ranked paragraphs that contain an
    answer; then, for every combination of at most two answer-bearing ranks,
    the fraction of pairs showing that combination; and finally the fraction
    of pairs with more than two answer-bearing paragraphs.
    """
    splitter = MergeParagraphs(400)
    stop = NltkPlusStopWords(True)
    ranker = TopTfIdf(stop, 6)

    corpus = TriviaQaWebDataset()
    points = flatten_iterable(
        [(q, d) for d in q.all_docs] for q in corpus.get_train())
    np.random.shuffle(points)

    counts = np.zeros(6)
    answers = np.zeros(6)
    n_answers = []

    points = points[:1000]
    for question, doc_ref in tqdm(points):
        paragraphs = splitter.split_annotated(
            corpus.evidence.get_document(doc_ref.doc_id), doc_ref.answer_spans)
        ranked = ranker.prune(question.question, paragraphs)
        counts[:len(ranked)] += 1
        # Ranks of the paragraphs that contain at least one answer span.
        answer_ranks = tuple(
            rank for rank, para in enumerate(ranked)
            if len(para.answer_spans) > 0)
        for rank in answer_ranks:
            answers[rank] += 1
        n_answers.append(answer_ranks)

    print(answers / counts)
    pattern_counts = Counter()
    other = 0
    for ranks in n_answers:
        if len(ranks) > 2:
            other += 1
        else:
            pattern_counts[ranks] += 1

    for pattern in sorted(pattern_counts):
        print(pattern, pattern_counts[pattern] / len(points))
    print(other / len(points))
Exemple #12
0
 def __init__(self, n_to_select):
     """Store the selection size and set up the stop words and TF-IDF vectorizer.

     Args:
         n_to_select: stored unchanged on ``self.n_to_select``.
     """
     self.n_to_select = n_to_select
     self._stop = NltkPlusStopWords(True).words
     # Hand the vectorizer a list: recent scikit-learn releases validate
     # `stop_words` and accept only 'english', a list, or None, while older
     # releases accept a list as well.  `self._stop` itself is left as-is.
     self._tfidf = TfidfVectorizer(strip_accents="unicode",
                                   stop_words=list(self._stop))
Exemple #13
0
def main():
    """Train a document-level SQuAD model in the mode given on the command line.

    Positional arguments are the training ``mode`` and the output ``name``;
    the model directory is ``name`` suffixed with a month/day-time stamp.
    The "paragraph" mode trains in the known-paragraph setting; every other
    mode trains on TF-IDF-ranked paragraphs and preprocesses the data first.
    The text of this script, prefixed by the mode, is passed to the trainer
    as notes.

    Raises:
        NotImplementedError: in "paragraph" mode when the model defines a
            preprocessor.
    """
    parser = argparse.ArgumentParser(
        description='Train a model on document-level SQuAD')
    parser.add_argument(
        'mode',
        choices=["paragraph", "confidence", "shared-norm", "merge", "sigmoid"])
    parser.add_argument("name", help="Output directory")
    args = parser.parse_args()
    mode = args.mode
    out = args.name + "-" + datetime.now().strftime("%m%d-%H%M%S")

    corpus = SquadCorpus()
    # "merge" concatenates paragraphs together, so it needs paragraph start
    # tokens; the other modes use no extra preprocessing.
    pre = (WithIndicators(True, para_tokens=False, doc_start_token=False)
           if mode == "merge" else None)

    model = get_model(50, 100, mode, pre)

    if mode == "paragraph":
        # The "standard" known-paragraph setting.
        if model.preprocessor is not None:
            raise NotImplementedError()
        n_epochs = 26
        train_batching = ClusteredBatcher(
            45, ContextLenBucketedKey(3), True, False)
        eval_batching = ClusteredBatcher(45, ContextLenKey(), False, False)
        data = DocumentQaTrainingData(
            corpus, None, train_batching, eval_batching)
        evaluators = [
            LossEvaluator(),
            SpanEvaluator(bound=[17], text_eval="squad"),
        ]
    else:
        eval_set_modes = {
            "confidence": "flatten",
            "sigmoid": "flatten",
            "shared-norm": "group",
            "merge": "merge"
        }
        eval_dataset = RandomParagraphSetDatasetBuilder(
            100, eval_set_modes[mode], True, 0)

        if mode in ("confidence", "sigmoid"):
            # "sigmoid" needs a really long training run for reasons unknown
            # (even 100 might be too small); "confidence" gets extra epochs
            # since the label is only "seen" roughly every other epoch.
            n_epochs = 100 if mode == "sigmoid" else 50
            train_batching = ClusteredBatcher(
                45, ContextLenBucketedKey(3), True, False)

            def make_train_builder():
                return StratifyParagraphsBuilder(train_batching, 1)
        else:
            n_epochs = 26

            def make_train_builder():
                return StratifyParagraphSetsBuilder(
                    25, mode == "merge", True, 1)

        data = PreprocessedData(
            SquadCorpus(),
            SquadTfIdfRanker(NltkPlusStopWords(True), 4, True,
                             model.preprocessor),
            make_train_builder(),
            eval_dataset,
            eval_on_verified=False,
        )

        evaluators = [LossEvaluator(), MultiParagraphSpanEvaluator(17, "squad")]
        data.preprocess(1)

    # Record the mode and this script's own source as the run notes.
    with open(__file__, "r") as f:
        notes = mode + "\n" + f.read()

    trainer.start_training(data, model, train_params(n_epochs), evaluators,
                           model_dir.ModelDir(out), notes)
def main():
    """Answer a command-line question from one or more text documents.

    Loads the model stored in the first entry of the hard-coded model
    directory list, tokenizes the question and the documents, selects
    candidate paragraphs (TF-IDF for a single document, a linear ranker for
    several), runs the model, and prints the best paragraph, span, answer
    text and confidence.

    Raises:
        ValueError: if the loaded model is not a ``ParagraphQuestionModel`` or
            one of the document paths does not exist.
    """
    parser = argparse.ArgumentParser(description="Run an ELMo model on user input")
    # parser.add_argument("model", type=int, help="Model directory")
    parser.add_argument("question", help="Question to answer")
    parser.add_argument("documents", help="List of text documents to answer the question with", nargs='+')
    args = parser.parse_args()

    # Models path
    SQUAD_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/squad'
    SQUAD_SHARED_NORM_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/squad-shared-norm'
    TRIVIAQA_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/triviaqa-unfiltered-shared-norm'
    TRIVIAQA_SHARED_NORM_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/triviaqa-web-shared-norm'

    models_directory = [
        SQUAD_MODEL_DIRECTORY_PATH,
        SQUAD_SHARED_NORM_MODEL_DIRECTORY_PATH,
        TRIVIAQA_MODEL_DIRECTORY_PATH,
        TRIVIAQA_SHARED_NORM_MODEL_DIRECTORY_PATH
    ]

    print("Preprocessing...")

    # Load the model (currently always the first entry of `models_directory`)
    # model_dir = ModelDir(args.model)
    model_dir = ModelDir(models_directory[0])
    model = model_dir.get_model()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError("This script is built to work for ParagraphQuestionModel models only")

    # Read the documents
    documents = []
    for doc in args.documents:
        if not isfile(doc):
            raise ValueError(doc + " does not exist")
        with open(doc, "r") as f:
            documents.append(f.read())
    print("Loaded %d documents" % len(documents))

    # Split documents into lists of paragraphs.
    # Raw string: "\s" inside a plain literal is an invalid escape sequence.
    paragraph_sep = re.compile(r"\s*\n\s*")
    documents = [paragraph_sep.split(doc) for doc in documents]

    # Tokenize the input, the models expects data to be tokenized using `NltkAndPunctTokenizer`
    # Note the model expects case-sensitive input
    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(args.question)  # List of words
    # Now list of document->paragraph->sentence->word
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc] for doc in documents]

    # Now group the document into paragraphs, this returns `ExtractedParagraph` objects
    # that additionally remember the start/end token of the paragraph within the source document
    splitter = MergeParagraphs(400)
    # splitter = PreserveParagraphs() # Uncomment to use the natural paragraph grouping
    documents = [splitter.split(doc) for doc in documents]

    # Now select the top paragraphs using a `ParagraphFilter`
    if len(documents) == 1:
        # Use TF-IDF to select top paragraphs from the document
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
        context = selector.prune(question, documents[0])
    else:
        # Use a linear classifier to select top paragraphs among all the documents
        selector = ShallowOpenWebRanker(n_to_select=10)
        context = selector.prune(question, flatten_iterable(documents))

    print("Select %d paragraph" % len(context))

    if model.preprocessor is not None:
        # Models are allowed to define an additional pre-processing step
        # This will turn the `ExtractedParagraph` objects back into simple lists of tokens
        context = [model.preprocessor.encode_text(question, x) for x in context]
    else:
        # Otherwise just use flattened text
        context = [flatten_iterable(x.text) for x in context]

    print("Setting up model")

    # Tell the model the batch size (can be None) and vocab to expect, This will load the
    # needed word vectors and fix the batch size to use when building the graph / encoding the input
    voc = set(question)
    for txt in context:
        voc.update(txt)
    model.set_input_spec(ParagraphAndQuestionSpec(batch_size=len(context)), voc)

    # Now we build the actual tensorflow graph, `best_span` and `conf` are
    # tensors holding the predicted span (inclusive) and confidence scores for each
    # element in the input batch, confidence scores being the pre-softmax logit for the span
    print("Build tf graph")
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # We need to use sess.as_default when working with the cuDNN stuff, since we need an active
    # session to figure out the # of parameters needed for each layer. The cpu-compatible models don't need this.
    with sess.as_default():
        # 10 means to limit the span to size 10 or less
        best_spans, conf = model.get_prediction().get_best_span(10)

    # Loads the saved weights
    model_dir.restore_checkpoint(sess)

    # Now the model is ready to run
    # The model takes input in the form of `ContextAndQuestion` objects, for example:
    data = [ParagraphAndQuestion(x, question, None, "user-question%d" % i)
            for i, x in enumerate(context)]

    print("Starting run")
    # The model is run in two steps, first it "encodes" a batch of paragraph/context pairs
    # into numpy arrays, then we use `sess` to run the actual model get the predictions
    encoded = model.encode(data, is_train=False)  # batch of `ContextAndQuestion` -> feed_dict
    best_spans, conf = sess.run([best_spans, conf], feed_dict=encoded)  # feed_dict -> predictions

    best_para = np.argmax(conf)  # We get output for each paragraph, select the most-confident one to print
    print("Best Paragraph: " + str(best_para))
    para_id = int(best_para)
    # print("Best Paragraph: \n" + (" ".join((paras[para_id].text)[0])))
    print("Best Paragraph: \n" + " ".join(context[para_id]))
    print("Best span: " + str(best_spans[best_para]))
    # The predicted span end is inclusive, hence the +1 when slicing.
    print("Answer text: " + " ".join(context[best_para][best_spans[best_para][0]:best_spans[best_para][1]+1]))
    print("Confidence: " + str(conf[best_para]))
Exemple #15
0
def main():
    """Build ranked question/paragraph pairs for an XQA split and pickle them.

    Parses the command line, loads the dev or test questions of the chosen
    corpus, selects paragraphs with the requested filter, converts every
    selected paragraph into a ``DocumentParagraphQuestion``, sorts them
    longest-context first, and writes the list to
    ``<corpus>_<n_paragraphs>.pkl`` in the current directory.

    Raises:
        AssertionError: if the corpus name does not end in "dev" or "test".
        ValueError: if the filter name is not recognised.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--n_processes',
        type=int,
        default=1,
        help=
        "Number of processes to do the preprocessing (selecting paragraphs+loading context) with"
    )
    parser.add_argument('-a', '--async', type=int, default=10)
    parser.add_argument('-t',
                        '--tokens',
                        type=int,
                        default=400,
                        help="Max tokens per a paragraph")
    parser.add_argument('-n',
                        '--n_sample',
                        type=int,
                        default=None,
                        help="Number of questions to evaluate on")
    parser.add_argument('-g',
                        '--n_paragraphs',
                        type=int,
                        default=15,
                        help="Number of paragraphs to run the model on")
    parser.add_argument('-f',
                        '--filter',
                        type=str,
                        default=None,
                        choices=["tfidf", "truncate", "linear"],
                        help="How to select paragraphs")
    parser.add_argument(
        '-c',
        '--corpus',
        choices=[
            "en_dev", "en_test", "fr_dev", "fr_test", "de_dev", "de_test",
            "ru_dev", "ru_test", "pt_dev", "pt_test", "zh_dev", "zh_test",
            "pl_dev", "pl_test", "uk_dev", "uk_test", "ta_dev", "ta_test",
            "fr_trans_en_dev", "fr_trans_en_test", "de_trans_en_dev",
            "de_trans_en_test", "ru_trans_en_dev", "ru_trans_en_test",
            "pt_trans_en_dev", "pt_trans_en_test", "zh_trans_en_dev",
            "zh_trans_en_test", "pl_trans_en_dev", "pl_trans_en_test",
            "uk_trans_en_dev", "uk_trans_en_test", "ta_trans_en_dev",
            "ta_trans_en_test"
        ],
        required=True)
    args = parser.parse_args()

    # "<corpus name>_<split>": split on the last underscore.
    split_at = args.corpus.rfind("_")
    corpus_name = args.corpus[:split_at]
    eval_set = args.corpus[split_at + 1:]
    dataset = XQADataset(corpus_name)
    if eval_set == "dev":
        test_questions = dataset.get_dev()
    elif eval_set == "test":
        test_questions = dataset.get_test()
    else:
        raise AssertionError()

    corpus = dataset.evidence
    splitter = MergeParagraphs(args.tokens)

    # wiki and web are both multi-document.  None of the corpus choices above
    # starts with "web", so this is False for every accepted value.
    per_document = args.corpus.startswith("web")

    filter_name = args.filter
    if filter_name is None:
        # Pick default depending on the kind of data we are using
        if per_document:
            filter_name = "tfidf"
        else:
            filter_name = "linear"

    print("Selecting %d paragraphs using method \"%s\" per %s" %
          (args.n_paragraphs, filter_name,
           ("question-document pair" if per_document else "question")))

    if filter_name == "tfidf":
        para_filter = TopTfIdf(NltkPlusStopWords(punctuation=True),
                               args.n_paragraphs)
    elif filter_name == "truncate":
        para_filter = FirstN(args.n_paragraphs)
    elif filter_name == "linear":
        para_filter = ShallowOpenWebRanker(args.n_paragraphs)
    else:
        raise ValueError()

    n_questions = args.n_sample
    if n_questions is not None:
        # Sort, then shuffle with a fixed seed, so the sample is reproducible.
        test_questions.sort(key=lambda x: x.question_id)
        np.random.RandomState(0).shuffle(test_questions)
        test_questions = test_questions[:n_questions]

    preprocessor = WithIndicators()
    print("Building question/paragraph pairs...")
    # Loads the relevant questions/documents, selects the right paragraphs, and runs the model's preprocessor
    if per_document:
        prep = ExtractMultiParagraphs(splitter,
                                      para_filter,
                                      preprocessor,
                                      require_an_answer=False)
    else:
        prep = ExtractMultiParagraphsPerQuestion(splitter,
                                                 para_filter,
                                                 preprocessor,
                                                 require_an_answer=False)
    prepped_data = preprocess_par(test_questions, corpus, prep,
                                  args.n_processes, 1000)

    data = []
    for q in prepped_data.data:
        for i, p in enumerate(q.paragraphs):
            if q.answer_text is None:
                ans = None
            else:
                ans = TokenSpans(q.answer_text, p.answer_spans)
            data.append(
                DocumentParagraphQuestion(q.question_id, p.doc_id,
                                          (p.start, p.end), q.question, p.text,
                                          ans, i))

    # Reverse so our first batch will be the largest (so OOMs happen early)
    questions = sorted(data,
                       key=lambda x: (x.n_context_words, len(x.question)),
                       reverse=True)

    # dump eval data for bert
    import pickle
    # Context manager so the file is flushed and closed even if dumping fails.
    with open("%s_%d.pkl" % (args.corpus, args.n_paragraphs), "wb") as out_file:
        pickle.dump(questions, out_file)
def main():
    """Evaluate a model on document-level SQuAD and save per-paragraph results.

    Ranks paragraphs for each question with a TF-IDF ranker (or, for the
    "doc-rd-dev" corpus, ranks the document-reader versions of the documents
    and pairs them with dummy answer spans), runs the model over every
    question/paragraph pair, prints a table of EM/F1 by number of paragraphs
    used, and writes the per-paragraph predictions as CSV to ``output``.

    Raises:
        ValueError: if an answer-span array is not 2-dimensional, or if the
            model directory holds no checkpoint.
    """
    parser = argparse.ArgumentParser(
        description='Evaluate a model on document-level SQuAD')
    parser.add_argument('model', help='model to use')
    parser.add_argument(
        'output',
        type=str,
        help="Store the per-paragraph results in csv format in this file")
    parser.add_argument('-n',
                        '--n_sample',
                        type=int,
                        default=None,
                        help="(for testing) sample documents")
    parser.add_argument(
        '-s',
        '--async',
        type=int,
        default=10,
        help="Encoding batch asynchronously, queueing up to this many")
    parser.add_argument('-a',
                        '--answer_bound',
                        type=int,
                        default=17,
                        help="Max answer span length")
    parser.add_argument('-p',
                        '--n_paragraphs',
                        type=int,
                        default=None,
                        help="Max number of paragraphs to use")
    parser.add_argument(
        '-b',
        '--batch_size',
        type=int,
        default=200,
        help="Batch size, larger sizes can be faster but uses more memory")
    parser.add_argument('-c',
                        '--corpus',
                        choices=["dev", "train", "doc-rd-dev"],
                        default="dev")
    parser.add_argument('--no_ema',
                        action="store_true",
                        help="Don't use EMA weights even if they exist")
    args = parser.parse_args()

    # `async` is a reserved keyword since Python 3.7, so `args.async` no
    # longer parses; read the option through getattr instead.
    async_encoding = getattr(args, "async")

    model_dir = ModelDir(args.model)
    print("Loading data")

    questions = []
    ranker = SquadTfIdfRanker(NltkPlusStopWords(True),
                              args.n_paragraphs,
                              force_answer=False)

    if args.corpus == "doc-rd-dev":
        docs = SquadCorpus().get_dev()
        if args.n_sample is not None:
            # Sort, then shuffle with a fixed seed, so the sample is reproducible.
            docs.sort(key=lambda x: x.doc_id)
            np.random.RandomState(0).shuffle(docs)
            docs = docs[:args.n_sample]

        print("Fetching document reader docs...")
        doc_rd_versions = get_doc_rd_doc(docs)
        print("Ranking and matching with questions...")
        for doc in tqdm(docs):
            doc_questions = flatten_iterable(x.questions
                                             for x in doc.paragraphs)
            paragraphs = doc_rd_versions[doc.title]
            ranks = ranker.rank([x.words for x in doc_questions],
                                [x.text for x in paragraphs])
            for i, question in enumerate(doc_questions):
                para_ranks = np.argsort(ranks[i])
                for para_rank, para_num in enumerate(
                        para_ranks[:args.n_paragraphs]):
                    # Just use dummy answers spans for these pairs
                    questions.append(
                        RankedParagraphQuestion(
                            question.words,
                            TokenSpans(question.answer.answer_text,
                                       np.zeros((0, 2), dtype=np.int32)),
                            question.question_id, paragraphs[para_num],
                            para_rank, para_num))
        rl = ResourceLoader()
    else:
        if args.corpus == "dev":
            docs = SquadCorpus().get_dev()
        else:
            docs = SquadCorpus().get_train()
        rl = SquadCorpus().get_resource_loader()

        if args.n_sample is not None:
            docs.sort(key=lambda x: x.doc_id)
            np.random.RandomState(0).shuffle(docs)
            docs = docs[:args.n_sample]

        for q in ranker.ranked_questions(docs):
            for i, p in enumerate(q.paragraphs):
                questions.append(
                    RankedParagraphQuestion(
                        q.question, TokenSpans(q.answer_text, p.answer_spans),
                        q.question_id,
                        ParagraphWithInverse([p.text], p.original_text,
                                             p.spans), i, p.paragraph_num))

    print("Split %d docs into %d paragraphs" % (len(docs), len(questions)))

    # Longest contexts first.
    questions = sorted(questions,
                       key=lambda x: (x.n_context_words, len(x.question)),
                       reverse=True)
    for q in questions:
        if len(q.answer.answer_spans.shape) != 2:
            raise ValueError()

    checkpoint = model_dir.get_best_weights()
    if checkpoint is not None:
        print("Using best weights")
    else:
        print("Using latest checkpoint")
        checkpoint = model_dir.get_latest_checkpoint()
        if checkpoint is None:
            raise ValueError("No checkpoints found")

    data = ParagraphAndQuestionDataset(
        questions, FixedOrderBatcher(args.batch_size, True))

    model = model_dir.get_model()
    evaluation = trainer.test(
        model, [RecordParagraphSpanPrediction(args.answer_bound, True)],
        {args.corpus: data}, rl, checkpoint, not args.no_ema,
        async_encoding)[args.corpus]

    print("Saving result")
    output_file = args.output

    df = pd.DataFrame(evaluation.per_sample)

    df.sort_values(["question_id", "rank"], inplace=True, ascending=True)
    group_by = ["question_id"]
    f1 = compute_ranked_scores(df, "predicted_score", "text_f1", group_by)
    em = compute_ranked_scores(df, "predicted_score", "text_em", group_by)
    table = [["N Paragraphs", "EM", "F1"]]
    table += [[str(i + 1), "%.4f" % e, "%.4f" % f]
              for i, (e, f) in enumerate(zip(em, f1))]
    print_table(table)

    df.to_csv(output_file, index=False)
Exemple #17
0
def paragraph_stats(corpus, splitter: DocumentSplitter, sample):
    """Print answer / question-word overlap statistics for sampled paragraphs.

    Draws `sample` random (question, document) pairs from `corpus.get_dev()`,
    splits every document that has at least one answer span with `splitter`,
    and prints how many of the resulting paragraphs contain an answer span
    and/or a non-stop-word from the question.

    Every paragraph that holds an answer but shares no word with the question
    is shown with `print_paragraph`, after which the function blocks on
    `input()` until the user presses enter.

    :param corpus: object providing `get_dev()` and
        `evidence.get_document(doc_id)`
    :param splitter: object providing `split_annotated(text, answer_spans)`
    :param sample: number of (question, document) pairs to draw, without
        replacement
    """
    stop = NltkPlusStopWords(punctuation=True).words

    questions = corpus.get_dev()
    pairs = flatten_iterable(
        [(q, doc) for doc in q.all_docs] for q in questions)
    chosen = np.random.choice(np.arange(0, len(pairs)), sample, replace=False)
    data = [pairs[i] for i in chosen]

    n_para = []            # paragraphs per answer-bearing (question, doc) pair
    n_answers = []         # answer spans per paragraph
    n_question_words = []  # question-word token hits per paragraph

    for q, doc in data:
        if len(doc.answer_spans) == 0:
            continue
        q_words = set(x.lower() for x in q.question)
        q_words -= stop

        doc_text = corpus.evidence.get_document(doc.doc_id)
        para = splitter.split_annotated(doc_text, doc.answer_spans)
        n_para.append(len(para))
        n_answers += [len(x.answer_spans) for x in para]

        for x in para:
            # Case-insensitive token matches against the question words
            hits = [
                word for word in (w.lower() for w in flatten_iterable(x.text))
                if word in q_words
            ]
            if len(hits) == 0 and len(x.answer_spans) > 0:
                # Answer present but no lexical overlap: show it and pause
                print_paragraph(q, x)
                input()
            n_question_words.append(len(hits))

    total_q = len(n_para)
    if total_q == 0:
        # Nothing to average over; the ratios below would divide by zero.
        print("0/%d pairs have an answer" % len(data))
        return

    n_answers = np.array(n_answers)
    n_question_words = np.array(n_question_words)
    any_answers = n_answers > 0
    any_question_word = n_question_words > 0

    total_para = len(any_answers)

    # Answer flags restricted to paragraphs without any question word
    no_question_and_answer = any_answers[np.logical_not(any_question_word)]

    print("%d/%d (%.4f) pairs have an answer" %
          (total_q, len(data), total_q / len(data)))
    print("%d para in %d questions (av %.4f)" %
          (sum(n_para), total_q, sum(n_para) / total_q))
    print("%d/%d (%.4f) paragraphs have answers" %
          (any_answers.sum(), total_para, any_answers.mean()))
    print("%d/%d (%.4f) paragraphs have question word" %
          (any_question_word.sum(), total_para, any_question_word.mean()))
    print("%d/%d (%.4f) no question words have answers" %
          (no_question_and_answer.sum(), len(no_question_and_answer),
           no_question_and_answer.mean()))
Exemple #18
0
def main(Data: pd.DataFrame, nlp, model_dir, model):
    """Answer the question in row 0 of `Data` using that row's text.

    `Data.at[0, 'Filetext']` is treated as the single source document and
    `Data.at[0, 'Question']` as the question. The document is split into
    paragraphs, the top paragraphs are selected, the model is run over them,
    and the span from the most confident paragraph is returned.

    :param Data: frame with 'Filetext' and 'Question' columns; only row 0 is
        read
    :param nlp: forwarded as the first argument of `model.set_input_spec`
    :param model_dir: object whose `restore_checkpoint(sess)` loads the weights
    :param model: the model to run; must be a `ParagraphQuestionModel`
    :return: the predicted answer tokens joined with single spaces
    :raises ValueError: if `model` is not a `ParagraphQuestionModel`
    """
    print(model)
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError(
            "This script is built to work for ParagraphQuestionModel models only"
        )

    # Read the documents (a single one, taken from the first row)
    documents = [Data.at[0, 'Filetext']]

    # Split documents into lists of paragraphs
    documents = [re.split(r"\s*\n\s*", doc) for doc in documents]
    # Tokenize the input, the models expects data to be tokenized using `NltkAndPunctTokenizer`
    # Note the model expects case-sensitive input
    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(
        Data.at[0, 'Question'])  # List of words
    # Now list of document->paragraph->sentence->word
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc]
                 for doc in documents]

    # Now group the document into paragraphs, this returns `ExtractedParagraph` objects
    # that additionally remember the start/end token of the paragraph within the source document
    splitter = MergeParagraphs(400)
    documents = [splitter.split(doc) for doc in documents]

    # Now select the top paragraphs using a `ParagraphFilter`
    print(len(documents))
    if len(documents) == 1:
        # Use TF-IDF to select top paragraphs from the document
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
        context = selector.prune(question, documents[0])
    else:
        # Use a linear classifier to select top paragraphs among all the documents
        selector = ShallowOpenWebRanker(n_to_select=10)
        context = selector.prune(question, flatten_iterable(documents))

    if model.preprocessor is not None:
        # Models are allowed to define an additional pre-processing step
        # This will turn the `ExtractedParagraph` objects back into simple lists of tokens
        context = [
            model.preprocessor.encode_text(question, x) for x in context
        ]
    else:
        # Otherwise just use flattened text
        context = [flatten_iterable(x.text) for x in context]

    print("Setting up model")
    # Tell the model the batch size and vocab to expect
    voc = set(question)
    for txt in context:
        voc.update(txt)
    model.set_input_spec(nlp,
                         ParagraphAndQuestionSpec(batch_size=len(context)),
                         voc)
    print("after set input spec")

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    try:
        # The prediction ops are built with `sess` as the default session
        with sess.as_default():
            # 8 means to limit the span to size 8 or less
            best_spans, conf = model.get_prediction().get_best_span(8)

        # Loads the saved weights
        model_dir.restore_checkpoint(sess)
        print("after loading weights")

        # The model takes input in the form of `ContextAndQuestion` objects
        data = [
            ParagraphAndQuestion(x, question, None, "user-question%d" % i)
            for i, x in enumerate(context)
        ]

        # First "encode" the batch into a feed_dict, then run the prediction ops
        # NOTE(review): encoding with is_train=True at inference time - confirm
        # this is intended.
        encoded = model.encode(data, is_train=True)
        best_spans, conf = sess.run([best_spans, conf], feed_dict=encoded)
    finally:
        # Release the session's resources even when prediction fails
        sess.close()

    # We get output for each paragraph, select the most-confident one
    best_para = np.argmax(conf)
    span_start, span_end = best_spans[best_para]

    # The predicted span is inclusive, hence the +1 on the slice end
    return " ".join(context[best_para][span_start:span_end + 1])


#if __name__ == "__main__":
#    main()
def main():
    """Start a bottle web server that answers questions about posted documents.

    Loads the model from `OPTS.model`, builds the TensorFlow prediction ops
    once, restores the checkpoint, and then serves two routes:

    * ``/`` renders the ``index`` template.
    * ``/post_query`` (POST) reads the ``document`` and ``question`` form
      fields, runs the model and renders the ``results`` template.

    :raises ValueError: if the loaded model is not a `ParagraphQuestionModel`
    """
    print('Starting...')
    model_dir = ModelDir(OPTS.model)
    model = model_dir.get_model()
    tokenizer = NltkAndPunctTokenizer()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError(
            "This script is built to work for ParagraphQuestionModel models only"
        )
    if OPTS.reload_vocab:
        loader = ResourceLoader()
    else:
        loader = CachingResourceLoader()
    print('Loading word vectors...')
    # Start from a one-word vocabulary; each request extends the embeddings
    # through `model.word_embed.update` below.
    model.set_input_spec(ParagraphAndQuestionSpec(batch_size=None),
                         {','},
                         word_vec_loader=loader,
                         allow_update=True)
    print('Starting Tensorflow session...')
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    with sess.as_default():
        prediction = model.get_prediction()
        # Take 0-th here because we know we only truncate to one paragraph
        start_logits_tf = prediction.start_logits[0]
        end_logits_tf = prediction.end_logits[0]
        none_logit_tf = prediction.none_logit[0]
    model_dir.restore_checkpoint(sess)
    splitter = Truncate(400)  # NOTE: we truncate past 400 tokens
    selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
    app = bottle.Bottle()

    @app.route('/')
    def index():
        """Render the landing page."""
        return bottle.template('index')

    @app.route('/post_query', method='post')
    def post_query():
        """Run the model on the posted document/question and render results."""
        document_raw = bottle.request.forms.getunicode('document').strip()
        question_raw = bottle.request.forms.getunicode('question').strip()
        # Paragraphs are separated by newlines (surrounding blanks dropped)
        document = re.split(r"\s*\n\s*", document_raw)
        question = tokenizer.tokenize_paragraph_flat(question_raw)
        doc_toks = [tokenizer.tokenize_paragraph(p) for p in document]
        split_doc = splitter.split(doc_toks)
        context = selector.prune(question, split_doc)
        if model.preprocessor is not None:
            context = [
                model.preprocessor.encode_text(question, x) for x in context
            ]
        else:
            context = [flatten_iterable(x.text) for x in context]
        vocab = set(question)
        for txt in context:
            vocab.update(txt)
        data = [
            ParagraphAndQuestion(x, question, None, "user-question%d" % i)
            for i, x in enumerate(context)
        ]
        # Make sure embeddings exist for every word seen in this request
        model.word_embed.update(loader, vocab)
        encoded = model.encode(data, is_train=False)
        start_logits, end_logits, none_logit = sess.run(
            [start_logits_tf, end_logits_tf, none_logit_tf], feed_dict=encoded)
        beam, p_na = logits_to_probs(document_raw,
                                     context[0],
                                     start_logits,
                                     end_logits,
                                     none_logit,
                                     beam_size=BEAM_SIZE)
        return bottle.template('results',
                               document=document_raw,
                               question=question_raw,
                               beam=beam,
                               p_na=p_na)

    # Templates are looked up in the `views` directory next to this file
    cur_dir = os.path.abspath(os.path.dirname(__file__))
    bottle.TEMPLATE_PATH.insert(0, os.path.join(cur_dir, 'views'))
    bottle.run(app, host=OPTS.hostname, port=OPTS.port, debug=OPTS.debug)
    def getAnswer(self):
        """Answer `self.Question` from the text stored for `self.ObjectMasterId`.

        The `filetext` of every `dbo.UserworkspaceData` row matching
        `self.ObjectMasterId` is concatenated (ordered by id) into a single
        document. The document is split into paragraphs, the top paragraphs
        are selected, and the model loaded from `MODEL_DIR` is run over them.

        :return: dict with the keys
            ``answer`` (predicted span text, or "" when the confidence of the
            best paragraph is not above 10),
            ``name`` and ``filetype`` (taken from the last row read),
            ``paragraph`` (excerpt around the answer, answer wrapped in
            ``<em>``/``</em>``) and ``ObjectMasterId``
        :raises ValueError: if the loaded model is not a
            `ParagraphQuestionModel`
        """
        # Load the model
        model_dir = ModelDir(MODEL_DIR)
        model = model_dir.get_model()
        if not isinstance(model, ParagraphQuestionModel):
            raise ValueError(
                "This script is built to work for ParagraphQuestionModel models only"
            )

        # The id is bound as a parameter instead of being spliced into the SQL
        # text. It is passed as a string, mirroring the str() the query text
        # was previously built with.
        query = ("select cast(filetext as varchar(max)) as filetext, name, type "
                 "from dbo.UserworkspaceData where objectmasterid = ? "
                 "order by id asc")
        chunks = []
        name = ""
        filetype = 0
        conn = pyodbc.connect(DB_CONN)
        try:
            cursor = conn.cursor()
            for row in cursor.execute(query, str(self.ObjectMasterId)):
                chunks.append(row[0])
                # name / type of the last row win
                name = row[1]
                filetype = row[2]
        finally:
            conn.close()
        documents = ["".join(chunks)]

        # Split documents into lists of paragraphs
        documents = [re.split(r"\s*\n\s*", doc) for doc in documents]
        # Tokenize the input, the models expects data to be tokenized using `NltkAndPunctTokenizer`
        # Note the model expects case-sensitive input
        tokenizer = NltkAndPunctTokenizer()
        question = tokenizer.tokenize_paragraph_flat(
            self.Question)  # List of words

        # Now list of document->paragraph->sentence->word
        documents = [[tokenizer.tokenize_paragraph(p) for p in doc]
                     for doc in documents]

        # Now group the document into paragraphs, this returns `ExtractedParagraph` objects
        # that additionally remember the start/end token of the paragraph within the source document
        splitter = MergeParagraphs(400)
        documents = [splitter.split(doc) for doc in documents]

        # Now select the top paragraphs using a `ParagraphFilter`
        if len(documents) == 1:
            # Use TF-IDF to select top paragraphs from the document
            selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
            context = selector.prune(question, documents[0])
        else:
            # Use a linear classifier to select top paragraphs among all the documents
            selector = ShallowOpenWebRanker(n_to_select=10)
            context = selector.prune(question, flatten_iterable(documents))

        if model.preprocessor is not None:
            # Models are allowed to define an additional pre-processing step
            # This will turn the `ExtractedParagraph` objects back into simple lists of tokens
            context = [
                model.preprocessor.encode_text(question, x) for x in context
            ]
        else:
            # Otherwise just use flattened text
            context = [flatten_iterable(x.text) for x in context]

        # Tell the model the batch size and vocab to expect
        voc = set(question)
        for txt in context:
            voc.update(txt)

        model.set_input_spec(self.nlp,
                             ParagraphAndQuestionSpec(batch_size=len(context)),
                             voc)

        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        try:
            # The prediction ops are built with `sess` as the default session
            with sess.as_default():
                # 8 means to limit the span to size 8 or less
                best_spans, conf = model.get_prediction().get_best_span(8)

            # Loads the saved weights
            model_dir.restore_checkpoint(sess)

            # The model takes input in the form of `ContextAndQuestion` objects
            data = [
                ParagraphAndQuestion(x, question, None, "user-question%d" % i)
                for i, x in enumerate(context)
            ]

            # First "encode" the batch into a feed_dict, then run the prediction ops
            # NOTE(review): encoding with is_train=True at inference time -
            # confirm this is intended.
            encoded = model.encode(data, is_train=True)
            best_spans, conf = sess.run(
                [best_spans, conf], feed_dict=encoded)
        finally:
            # Release the session's resources even when prediction fails
            sess.close()

        # We get output for each paragraph, select the most-confident one
        best_para = np.argmax(conf)
        tokens = context[best_para]
        span_start, span_end = best_spans[best_para]

        # The predicted span is inclusive, hence the +1 on the slice end
        Answer = " ".join(tokens[span_start:span_end + 1])

        print("Confidence: " + str(conf[best_para]))
        print("Best Paragraph: " + str(best_para))
        print("Best span: " + str(best_spans[best_para]))
        print("Answer text: " + Answer)
        print(" ".join(tokens))

        # Wrap the answer span in <em> tags (done in place on the token list)
        tokens[span_start] = "<em>" + tokens[span_start]
        tokens[span_end] = tokens[span_end] + "</em>"

        # Pick the excerpt bounds from "." tokens around the answer span
        start = 0
        end = len(tokens)

        # Begin right after the second-to-last "." before the span, if any
        before = [
            x for x, n in enumerate(tokens[0:span_start]) if n == "."
        ]
        if len(before) >= 2:
            start = before[-2] + 1
        # Stop at the second "." after the span, if any
        after = [
            x for x, n in enumerate(tokens[span_end + 1:]) if n == "."
        ]
        if len(after) > 1:
            end = span_end + 1 + after[1]

        return {
            # The answer is only reported when the confidence is above 10
            "answer": Answer if conf[best_para] > 10 else "",
            "name": name,
            "filetype": filetype,
            # Drop the space that joining left before non-word characters
            "paragraph": re.sub(r' (?=\W)', '', " ".join(tokens[start:end])),
            "ObjectMasterId": self.ObjectMasterId,
        }


#if __name__ == "__main__":
#    main()
Exemple #21
0
def main():
    """Evaluate a saved model on a TriviaQA corpus from the command line.

    Parses the command-line arguments, loads the model from the given model
    directory, selects paragraphs for every question (per question-document
    pair for the "web" corpora, per question for the "open" corpora), runs the
    model, and prints a table of EM / F1 scores by number of paragraphs used.
    Optionally writes an official-format answer file (`--official_output`) and
    the per-paragraph results (`--paragraph_output`, as json, pkl or csv).

    :raises NotImplementedError: if `--official_output` is requested for a
        corpus other than "web-test" or "web-dev"
    :raises ValueError: for an unknown filter name or an unrecognized
        `--paragraph_output` extension
    :raises RuntimeError: if the evaluation does not return one value per
        evaluated paragraph
    """
    parser = argparse.ArgumentParser(description='Evaluate a model on TriviaQA data')
    parser.add_argument('model', help='model directory')
    parser.add_argument('-p', '--paragraph_output', type=str,
                        help="Save fine grained results for each paragraph in csv format")
    parser.add_argument('-o', '--official_output', type=str, help="Build an offical output file with the model's"
                                                                  " most confident span for each (question, doc) pair")
    parser.add_argument('--no_ema', action="store_true", help="Don't use EMA weights even if they exist")
    parser.add_argument('--n_processes', type=int, default=None,
                        help="Number of processes to do the preprocessing (selecting paragraphs+loading context) with")
    # Parsed as a string so that the literal "latest" is accepted alongside
    # numeric steps (with type=int the "latest" branch below was unreachable).
    parser.add_argument('-i', '--step', type=str, default=None,
                        help="checkpoint step to load, or 'latest'; by default the best weights are used, "
                             "falling back to the latest checkpoint")
    parser.add_argument('-n', '--n_sample', type=int, default=None, help="Number of questions to evaluate on")
    # `async` is a reserved keyword since Python 3.7, so `args.async` is a
    # SyntaxError; store the value under a different attribute name.
    parser.add_argument('-a', '--async', dest='async_', type=int, default=10)
    parser.add_argument('-t', '--tokens', type=int, default=400,
                        help="Max tokens per a paragraph")
    parser.add_argument('-g', '--n_paragraphs', type=int, default=15,
                        help="Number of paragraphs to run the model on")
    parser.add_argument('-f', '--filter', type=str, default=None, choices=["tfidf", "truncate", "linear"],
                        help="How to select paragraphs")
    parser.add_argument('-b', '--batch_size', type=int, default=200,
                        help="Batch size, larger sizes might be faster but wll take more memory")
    parser.add_argument('--max_answer_len', type=int, default=8,
                        help="Max answer span to select")
    parser.add_argument('-c', '--corpus',
                        choices=["web-dev", "web-test", "web-verified-dev", "web-train",
                                 "open-dev", "open-train"],
                        default="web-verified-dev")
    args = parser.parse_args()

    model_dir = ModelDir(args.model)
    model = model_dir.get_model()

    if args.corpus.startswith('web'):
        dataset = TriviaQaWebDataset()
        corpus = dataset.evidence
        if args.corpus == "web-dev":
            test_questions = dataset.get_dev()
        elif args.corpus == "web-test":
            test_questions = dataset.get_test()
        elif args.corpus == "web-verified-dev":
            test_questions = dataset.get_verified()
        elif args.corpus == "web-train":
            test_questions = dataset.get_train()
        else:
            raise RuntimeError()
    else:
        dataset = TriviaQaOpenDataset()
        corpus = dataset.evidence
        if args.corpus == "open-dev":
            test_questions = dataset.get_dev()
        elif args.corpus == "open-train":
            test_questions = dataset.get_train()
        else:
            raise RuntimeError()

    splitter = MergeParagraphs(args.tokens)

    per_document = not args.corpus.startswith("open")

    # Default filter: "linear" for the open corpora, "tfidf" otherwise
    filter_name = args.filter
    if filter_name is None:
        if args.corpus.startswith("open"):
            filter_name = "linear"
        else:
            filter_name = "tfidf"

    print("Selecting %d paragraphs using %s method per %s" % (args.n_paragraphs, filter_name,
                                                              ("question-document pair" if per_document else "question")))

    if filter_name == "tfidf":
        para_filter = TopTfIdf(NltkPlusStopWords(punctuation=True), args.n_paragraphs)
    elif filter_name == "truncate":
        para_filter = FirstN(args.n_paragraphs)
    elif filter_name == "linear":
        para_filter = ShallowOpenWebRanker(args.n_paragraphs)
    else:
        raise ValueError()

    n_questions = args.n_sample
    if n_questions is not None:
        # Sort, then shuffle with a fixed seed, so the sample is reproducible
        test_questions.sort(key=lambda x: x.question_id)
        np.random.RandomState(0).shuffle(test_questions)
        test_questions = test_questions[:n_questions]

    print("Building question/paragraph pairs...")
    # Loads the relevant questions/documents, selects the right paragraphs, and runs the model's preprocessor
    if per_document:
        prep = ExtractMultiParagraphs(splitter, para_filter, model.preprocessor, require_an_answer=False)
    else:
        prep = ExtractMultiParagraphsPerQuestion(splitter, para_filter, model.preprocessor, require_an_answer=False)
    prepped_data = preprocess_par(test_questions, corpus, prep, args.n_processes, 1000)

    data = []
    for q in prepped_data.data:
        for i, p in enumerate(q.paragraphs):
            if q.answer_text is None:
                ans = None
            else:
                ans = TokenSpans(q.answer_text, p.answer_spans)
            data.append(DocumentParagraphQuestion(q.question_id, p.doc_id,
                                                  (p.start, p.end), q.question, p.text,
                                                  ans, i))

    # Reverse so our first batch will be the largest (so OOMs happen early)
    questions = sorted(data, key=lambda x: (x.n_context_words, len(x.question)), reverse=True)

    print("Done, starting eval")

    if args.step is not None:
        if args.step == "latest":
            checkpoint = model_dir.get_latest_checkpoint()
        else:
            checkpoint = model_dir.get_checkpoint(int(args.step))
    else:
        checkpoint = model_dir.get_best_weights()
        if checkpoint is not None:
            print("Using best weights")
        else:
            print("Using latest checkpoint")
            checkpoint = model_dir.get_latest_checkpoint()

    test_questions = ParagraphAndQuestionDataset(questions, FixedOrderBatcher(args.batch_size, True))

    evaluation = trainer.test(model,
                              [RecordParagraphSpanPrediction(args.max_answer_len, True)],
                              {args.corpus: test_questions}, ResourceLoader(), checkpoint,
                              not args.no_ema, args.async_)[args.corpus]

    # Every recorded column must have exactly one entry per evaluated paragraph
    if not all(len(x) == len(data) for x in evaluation.per_sample.values()):
        raise RuntimeError()

    df = pd.DataFrame(evaluation.per_sample)

    if args.official_output is not None:
        print("Saving question result")

        # I didn't store the unormalized filenames exactly, so unfortunately we have to reload
        # the source data to get exact filename to output an official test script
        fns = {}
        print("Loading proper filenames")
        if args.corpus == 'web-test':
            source = join(TRIVIA_QA, "qa", "web-test-without-answers.json")
        elif args.corpus == "web-dev":
            source = join(TRIVIA_QA, "qa", "web-dev.json")
        else:
            raise NotImplementedError()

        with open(source) as f:
            source_data = json.load(f)["Data"]
        for point in source_data:
            for doc in point["EntityPages"]:
                filename = doc["Filename"]
                fn = join("wikipedia", filename[:filename.rfind(".")])
                fn = normalize_wiki_filename(fn)
                fns[(point["QuestionId"], fn)] = filename

        # Keep, per (question, document) key, the answer with the highest score
        answers = {}
        scores = {}
        for q_id, doc_id, start, end, txt, score in df[["question_id", "doc_id", "para_start", "para_end",
                                                        "text_answer", "predicted_score"]].itertuples(index=False):
            filename = dataset.evidence.file_id_map[doc_id]
            if filename.startswith("web"):
                true_name = filename[4:] + ".txt"
            else:
                true_name = fns[(q_id, filename)]

            key = q_id + "--" + true_name
            prev_score = scores.get(key)
            if prev_score is None or prev_score < score:
                scores[key] = score
                answers[key] = txt

        with open(args.official_output, "w") as f:
            json.dump(answers, f)

    if per_document:
        group_by = ["question_id", "doc_id"]
    else:
        group_by = ["question_id"]

    # Print a table of scores as more paragraphs are used
    df.sort_values(group_by + ["rank"], inplace=True)
    f1 = compute_model_scores(df, "predicted_score", "text_f1", group_by)
    em = compute_model_scores(df, "predicted_score", "text_em", group_by)
    table = [["N Paragraphs", "EM", "F1"]]
    table += list([str(i+1), "%.4f" % e, "%.4f" % f] for i, (e, f) in enumerate(zip(em, f1)))
    print_table(table)

    output_file = args.paragraph_output
    if output_file is not None:
        print("Saving paragraph result")
        # The output format is chosen from the file name's suffix
        if output_file.endswith("json"):
            with open(output_file, "w") as f:
                json.dump(evaluation.per_sample, f)
        elif output_file.endswith("pkl"):
            with open(output_file, "wb") as f:
                pickle.dump(evaluation.per_sample, f)
        elif output_file.endswith("csv"):
            df.to_csv(output_file, index=False)
        else:
            raise ValueError("Unrecognized file format")
def main():
    """Evaluate a saved model on a TriviaQA corpus and report EM/F1 by rank.

    Parses the command line, selects paragraphs for every question (or every
    question-document pair for the ``web-*`` corpora), runs the model through
    ``trainer.test`` and then:

    * optionally writes the highest-scoring answer per key to
      ``--official_output`` as JSON,
    * optionally writes the per-paragraph predictions to
      ``--paragraph_output`` as CSV,
    * logs the best EM/F1 through ``ElasticLogger``,
    * writes a flattened per-span table to ``results.csv``, and
    * prints the EM/F1 table.

    Raises:
        AssertionError: for an unsupported corpus name, or when official
            output is requested for a web corpus other than web-dev/web-test.
        ValueError: for an unknown paragraph filter.
        RuntimeError: if the evaluation returns a different number of samples
            than were submitted.
    """
    parser = argparse.ArgumentParser(
        description='Evaluate a model on TriviaQA data')
    parser.add_argument('model', help='model directory')
    parser.add_argument(
        '-p',
        '--paragraph_output',
        type=str,
        help="Save fine grained results for each paragraph in csv format")
    parser.add_argument('-o',
                        '--official_output',
                        type=str,
                        help="Build an official output file with the model's"
                        " most confident span for each (question, doc) pair")
    parser.add_argument('--no_ema',
                        action="store_true",
                        help="Don't use EMA weights even if they exist")
    parser.add_argument(
        '--n_processes',
        type=int,
        default=None,
        help=
        "Number of processes to do the preprocessing (selecting paragraphs+loading context) with"
    )
    # Kept as a string so that the literal "latest" is accepted as well as a
    # numeric step; the numeric case is converted with int() below.
    parser.add_argument('-i',
                        '--step',
                        type=str,
                        default=None,
                        help="checkpoint step to load, or 'latest'; defaults "
                        "to the best weights if saved, else the latest "
                        "checkpoint")
    parser.add_argument('-n',
                        '--n_sample',
                        type=int,
                        default=None,
                        help="Number of questions to evaluate on")
    # `async` is a reserved word since Python 3.7, so `args.async` no longer
    # parses; keep the flag spelling but store it under a legal attribute name.
    parser.add_argument('-a',
                        '--async',
                        dest='async_encoding',
                        type=int,
                        default=10,
                        help="Value forwarded to trainer.test as its last "
                        "positional argument")
    parser.add_argument('-t',
                        '--tokens',
                        type=int,
                        default=400,
                        help="Max tokens per a paragraph")
    parser.add_argument('-g',
                        '--n_paragraphs',
                        type=int,
                        default=15,
                        help="Number of paragraphs to run the model on")
    parser.add_argument('-f',
                        '--filter',
                        type=str,
                        default=None,
                        choices=["tfidf", "truncate", "linear"],
                        help="How to select paragraphs")
    parser.add_argument(
        '-b',
        '--batch_size',
        type=int,
        default=200,
        help="Batch size, larger sizes might be faster but will take more memory"
    )
    parser.add_argument('--max_answer_len',
                        type=int,
                        default=8,
                        help="Max answer span to select")
    parser.add_argument('-c',
                        '--corpus',
                        choices=[
                            "web-dev", "web-test", "web-verified-dev",
                            "web-train", "open-dev", "open-train", "wiki-dev",
                            "wiki-test"
                        ],
                        default="web-verified-dev")
    parser.add_argument("-s",
                        "--source_dir",
                        type=str,
                        default=None,
                        help="where to take input files")
    parser.add_argument("--n_span_per_q",
                        type=int,
                        default=1,
                        help="Value assigned to docqa.config.SPANS_PER_QUESTION")
    args = parser.parse_args()

    # --source_dir is optional, so fall back to the corpus name for logging
    # instead of crashing on None.
    if args.source_dir is not None:
        dataset_name = args.source_dir.split('/')[-1]
    else:
        dataset_name = args.corpus
    model_name = args.model.split('/')[-1]
    ElasticLogger().write_log('INFO',
                              'Start Evaluation',
                              context_dict={
                                  'model': model_name,
                                  'dataset': dataset_name
                              })

    model_dir = ModelDir(args.model)
    model = model_dir.get_model()

    # Pick the dataset class from the corpus prefix, then the split.
    if args.corpus.startswith('web'):
        dataset = TriviaQaWebDataset()
        if args.corpus == "web-dev":
            test_questions = dataset.get_dev()
        elif args.corpus == "web-test":
            test_questions = dataset.get_test()
        elif args.corpus == "web-verified-dev":
            test_questions = dataset.get_verified()
        elif args.corpus == "web-train":
            test_questions = dataset.get_train()
        else:
            raise AssertionError()
    elif args.corpus.startswith("wiki"):
        dataset = TriviaQaWikiDataset()
        if args.corpus == "wiki-dev":
            test_questions = dataset.get_dev()
        elif args.corpus == "wiki-test":
            test_questions = dataset.get_test()
        else:
            raise AssertionError()
    else:
        dataset = TriviaQaOpenDataset(args.source_dir)
        if args.corpus == "open-dev":
            # just loading the pkl that was saved in build_span_corpus
            test_questions = dataset.get_dev()
        elif args.corpus == "open-train":
            test_questions = dataset.get_train()
        else:
            raise AssertionError()

    corpus = dataset.evidence
    splitter = MergeParagraphs(args.tokens)

    # Only the web corpora are scored per (question, document) pair; every
    # other corpus is scored per question.
    per_document = args.corpus.startswith("web")

    filter_name = args.filter
    if filter_name is None:
        # Pick default depending on the kind of data we are using
        if per_document:
            filter_name = "tfidf"
        else:
            filter_name = "linear"

    print("Selecting %d paragraphs using method \"%s\" per %s" %
          (args.n_paragraphs, filter_name,
           ("question-document pair" if per_document else "question")))

    if filter_name == "tfidf":
        para_filter = TopTfIdf(NltkPlusStopWords(punctuation=True),
                               args.n_paragraphs)
    elif filter_name == "truncate":
        para_filter = FirstN(args.n_paragraphs)
    elif filter_name == "linear":
        para_filter = ShallowOpenWebRanker(args.n_paragraphs)
    else:
        raise ValueError()

    n_questions = args.n_sample
    docqa.config.SPANS_PER_QUESTION = args.n_span_per_q
    if n_questions is not None:
        # Sort first so the seeded shuffle picks the same sample every run.
        test_questions.sort(key=lambda x: x.question_id)
        np.random.RandomState(0).shuffle(test_questions)
        test_questions = test_questions[:n_questions]

    print("Building question/paragraph pairs...")
    # Loads the relevant questions/documents, selects the right paragraphs, and runs the model's preprocessor
    if per_document:
        prep = ExtractMultiParagraphs(splitter,
                                      para_filter,
                                      model.preprocessor,
                                      require_an_answer=False)
    else:
        prep = ExtractMultiParagraphsPerQuestion(splitter,
                                                 para_filter,
                                                 model.preprocessor,
                                                 require_an_answer=False)
    prepped_data = preprocess_par(test_questions, corpus, prep,
                                  args.n_processes, 1000)

    # Flatten to one entry per (question, paragraph); `i` is the paragraph's
    # position within its question.
    data = []
    for q in prepped_data.data:
        for i, p in enumerate(q.paragraphs):
            if q.answer_text is None:
                ans = None
            else:
                ans = TokenSpans(q.answer_text, p.answer_spans)
            data.append(
                DocumentParagraphQuestion(q.question_id, p.doc_id,
                                          (p.start, p.end), q.question, p.text,
                                          ans, i))

    # Reverse so our first batch will be the largest (so OOMs happen early)
    questions = sorted(data,
                       key=lambda x: (x.n_context_words, len(x.question)),
                       reverse=True)

    print("Done, starting eval")

    if args.step is not None:
        if args.step == "latest":
            checkpoint = model_dir.get_latest_checkpoint()
        else:
            checkpoint = model_dir.get_checkpoint(int(args.step))
    else:
        checkpoint = model_dir.get_best_weights()
        if checkpoint is not None:
            print("Using best weights")
        else:
            print("Using latest checkpoint")
            checkpoint = model_dir.get_latest_checkpoint()

    test_questions = ParagraphAndQuestionDataset(
        questions, FixedOrderBatcher(args.batch_size, True))

    evaluation = trainer.test(
        model, [RecordParagraphSpanPrediction(args.max_answer_len, True)],
        {args.corpus: test_questions}, ResourceLoader(), checkpoint,
        not args.no_ema, args.async_encoding)[args.corpus]

    # Every recorded column must have one value per submitted paragraph.
    if not all(len(x) == len(data) for x in evaluation.per_sample.values()):
        raise RuntimeError()

    df = pd.DataFrame(evaluation.per_sample)

    if args.official_output is not None:
        print("Saving question result")

        fns = {}
        if per_document:
            # I didn't store the unnormalized filenames exactly, so unfortunately we have to reload
            # the source data to get exact filename to output an official test script
            print("Loading proper filenames")
            if args.corpus == 'web-test':
                source = join(TRIVIA_QA, "qa", "web-test-without-answers.json")
            elif args.corpus == "web-dev":
                source = join(TRIVIA_QA, "qa", "web-dev.json")
            else:
                raise AssertionError()

            with open(source) as f:
                source_data = json.load(f)["Data"]
            for point in source_data:
                for doc in point["EntityPages"]:
                    filename = doc["Filename"]
                    fn = join("wikipedia", filename[:filename.rfind(".")])
                    fn = normalize_wiki_filename(fn)
                    fns[(point["QuestionId"], fn)] = filename

        # Keep only the highest-scoring answer text for each output key.
        answers = {}
        scores = {}
        for q_id, doc_id, _start, _end, txt, score in df[[
                "question_id", "doc_id", "para_start", "para_end",
                "text_answer", "predicted_score"
        ]].itertuples(index=False):
            filename = dataset.evidence.file_id_map[doc_id]
            if per_document:
                if filename.startswith("web"):
                    true_name = filename[4:] + ".txt"
                else:
                    true_name = fns[(q_id, filename)]
                # Alon Patch for triviaqa test results
                true_name = true_name.replace('TriviaQA_Org/', '')
                key = q_id + "--" + true_name
            else:
                key = q_id

            prev_score = scores.get(key)
            if prev_score is None or prev_score < score:
                scores[key] = score
                answers[key] = txt

        with open(args.official_output, "w") as f:
            json.dump(answers, f)

    output_file = args.paragraph_output
    if output_file is not None:
        print("Saving paragraph result")
        df.to_csv(output_file, index=False)

    print("Computing scores")

    if per_document:
        group_by = ["question_id", "doc_id"]
    else:
        group_by = ["question_id"]

    # Print a table of scores as more paragraphs are used
    df.sort_values(group_by + ["rank"], inplace=True)
    # Each row's predicted_score is reduced to its maximum before ranking;
    # the original `df` keeps the full values for the flattened export below.
    df_scores = df.copy(deep=True)
    df_scores['predicted_score'] = df_scores['predicted_score'].apply(
        lambda x: pd.Series(x).max())

    em = compute_ranked_scores(df_scores, "predicted_score", "text_em",
                               group_by)
    f1 = compute_ranked_scores(df_scores, "predicted_score", "text_f1",
                               group_by)
    table = [["N Paragraphs", "EM", "F1"]]
    table += list([str(i + 1), "%.4f" % e, "%.4f" % f]
                  for i, (e, f) in enumerate(zip(em, f1)))

    table_df = pd.DataFrame(table[1:], columns=table[0]).drop(['N Paragraphs'],
                                                              axis=1)
    # `.ix` was removed in pandas 1.0; `.loc` is the label-based equivalent.
    best_scores = table_df.max()
    ElasticLogger().write_log('INFO',
                              'Results',
                              context_dict={
                                  'model': model_name,
                                  'dataset': dataset_name,
                                  'max_EM': best_scores.loc['EM'],
                                  'max_F1': best_scores.loc['F1'],
                                  'result_table': str(table_df)
                              })

    # Expand each paragraph row into one row per predicted span.
    df_flat = []
    for _, question in df.iterrows():
        for text_answer, predicted_span, predicted_score in zip(
                question['text_answer'], question['predicted_span'],
                question['predicted_score']):
            new_question = dict(question.copy())
            new_question.update({
                'text_answer': text_answer,
                'predicted_span': predicted_span,
                'predicted_score': predicted_score
            })
            df_flat.append(new_question)

    results_df = pd.DataFrame(df_flat)
    results_df.sort_values(by=['question_id', 'predicted_score'],
                           ascending=False).set_index([
                               'question_id', 'text_answer'
                           ])[['question', 'predicted_score',
                               'text_em']].to_csv('results.csv')

    print_table(table)
def main():
    """Train a TriviaQA-web model in the mode chosen on the command line.

    The positional ``mode`` picks the paragraph extractor, the train/dev
    dataset builders, the evaluators and the number of epochs. The model is
    saved under ``name`` with a month/day-time suffix, and a copy of this
    script is stored as the training notes.
    """
    arg_parser = argparse.ArgumentParser(description='Train a model on TriviaQA web')
    arg_parser.add_argument('mode', choices=["paragraph-level", "confidence", "merge",
                                             "shared-norm", "sigmoid", "shared-norm-600"])
    arg_parser.add_argument("name", help="Where to store the model")
    arg_parser.add_argument('-n', '--n_processes', type=int, default=2,
                            help="Number of processes (i.e., select which paragraphs to train on) "
                                 "the data with")
    args = arg_parser.parse_args()
    mode = args.mode

    output_name = args.name + "-" + datetime.now().strftime("%m%d-%H%M%S")

    model = get_model(100, 140, mode, WithIndicators())

    stop_words = NltkPlusStopWords(True)
    single_paragraph = mode == "paragraph-level"

    # Paragraph extraction: one top paragraph, or the top four (optionally
    # with a larger 600-token merge window).
    if single_paragraph:
        extractor = ExtractSingleParagraph(
            MergeParagraphs(400), TopTfIdf(stop_words, 1), model.preprocessor, intern=True)
    else:
        max_tokens = 600 if mode == "shared-norm-600" else 400
        extractor = ExtractMultiParagraphs(
            MergeParagraphs(max_tokens), TopTfIdf(stop_words, 4), model.preprocessor, intern=True)

    # Dataset builders, evaluators and schedule for the chosen mode.
    if single_paragraph:
        n_epochs = 16
        train_builder = ParagraphAndQuestionsBuilder(
            ClusteredBatcher(60, ContextLenBucketedKey(3), True))
        test_builder = ParagraphAndQuestionsBuilder(
            ClusteredBatcher(60, ContextLenKey(), False))
        n_dev, n_train = 21000, 12000
        evaluators = [LossEvaluator(), SpanEvaluator([4, 8], "triviaqa")]
    else:
        is_merge = mode == "merge"
        evaluators = [LossEvaluator(), MultiParagraphSpanEvaluator(8, "triviaqa", not is_merge)]
        # we sample two paragraphs per a (question, doc) pair, so evaluate on fewer questions
        n_dev, n_train = 15000, 8000

        if mode in ("confidence", "sigmoid"):
            # "sigmoid" trains very slowly, do this at your own risk
            n_epochs = 71 if mode == "sigmoid" else 28
            test_builder = RandomParagraphSetDatasetBuilder(120, "flatten", True, 1)
            train_builder = StratifyParagraphsBuilder(
                ClusteredBatcher(60, ContextLenBucketedKey(3), True), 0, 1)
        else:
            n_epochs = 14
            grouping = "merge" if is_merge else "group"
            test_builder = RandomParagraphSetDatasetBuilder(120, grouping, True, 1)
            train_builder = StratifyParagraphSetsBuilder(35, is_merge, True, 1)

    corpus = TriviaQaWebDataset()

    params = get_triviaqa_train_params(n_epochs, n_dev, n_train)

    data = PreprocessedData(corpus, extractor, train_builder, test_builder,
                            eval_on_verified=False)
    data.preprocess(args.n_processes, 1000)

    # Store the script source, prefixed with a banner naming the mode, as
    # the run's notes.
    with open(__file__, "r") as script:
        source_text = script.read()
    banner = "*" * 10
    notes = banner + "\nMode: " + args.mode + "\n" + banner + "\n" + source_text

    trainer.start_training(data, model, params, evaluators, model_dir.ModelDir(output_name), notes)
Exemple #24
0
def predict():
    """Answer the question in the current Flask request body.

    Loads the pretrained model and the fixed document list, selects
    paragraphs for the question, runs the model on them and returns the text
    of the most confident span.

    Returns:
        A ``flask.jsonify`` response of the form
        ``{"success": bool, "predictions": [answer_text]}``. ``success`` is
        ``False`` with no predictions when no paragraph could be selected.

    Raises:
        ValueError: if the loaded model is not a ``ParagraphQuestionModel``
            or a listed document file does not exist.
    """
    json_data = {"success": False, "predictions": []}
    print("Preprocessing...")

    # Load the model
    model_dir = ModelDir(
        "/home/antriv/conversation_ai/Transfer_Learning/ALLENAI_DocumentQA/document-qa/pretrained_models/models/triviaqa-unfiltered-shared-norm"
    )
    model = model_dir.get_model()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError(
            "This script is built to work for ParagraphQuestionModel models only"
        )

    # Load the question
    question = (flask.request.data).decode("utf-8")

    # Read the documents
    documents = []
    doclist = ["/home/antriv/data/The-Future-Computed.txt"]
    for doc in doclist:
        if not isfile(doc):
            raise ValueError(doc + " does not exist")
        with open(doc, "r") as f:
            documents.append(f.read())
    print("Loaded %d documents" % len(documents))

    # Split documents into lists of paragraphs. The pattern must be a raw
    # string: "\s" in a normal literal is an invalid escape sequence.
    documents = [re.split(r"\s*\n\s*", doc) for doc in documents]

    # Tokenize the input, the models expects data to be tokenized using `NltkAndPunctTokenizer`
    # Note the model expects case-sensitive input
    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(question)  # List of words
    # Now list of document->paragraph->sentence->word
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc]
                 for doc in documents]

    # Now group the document into paragraphs, this returns `ExtractedParagraph` objects
    # that additionally remember the start/end token of the paragraph within the source document
    splitter = MergeParagraphs(400)
    documents = [splitter.split(doc) for doc in documents]

    # Now select the top paragraphs using a `ParagraphFilter`
    if len(documents) == 1:
        # Use TF-IDF to select top paragraphs from the document
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=1000)
        context = selector.prune(question, documents[0])
    else:
        # Use a linear classifier to select top paragraphs among all the documents
        selector = ShallowOpenWebRanker(n_to_select=1000)
        context = selector.prune(question, flatten_iterable(documents))

    print("Select %d paragraph" % len(context))

    if model.preprocessor is not None:
        # Models are allowed to define an additional pre-processing step
        # This will turn the `ExtractedParagraph` objects back into simple lists of tokens
        context = [
            model.preprocessor.encode_text(question, x) for x in context
        ]
    else:
        # Otherwise just use flattened text
        context = [flatten_iterable(x.text) for x in context]

    if not context:
        # Nothing to run the model on; np.argmax below would raise on an
        # empty batch, so report failure to the client instead.
        return flask.jsonify(json_data)

    print("Setting up model")
    # Tell the model the batch size (can be None) and vocab to expect, This will load the
    # needed word vectors and fix the batch size to use when building the graph / encoding the input
    voc = set(question)
    for txt in context:
        voc.update(txt)
    model.set_input_spec(ParagraphAndQuestionSpec(batch_size=len(context)),
                         voc)

    # Now we build the actual tensorflow graph, `best_span` and `conf` are
    # tensors holding the predicted span (inclusive) and confidence scores for each
    # element in the input batch, confidence scores being the pre-softmax logit for the span
    print("Build tf graph")
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    try:
        # We need to use sess.as_default when working with the cuNND stuff, since we need an active
        # session to figure out the # of parameters needed for each layer. The cpu-compatible models don't need this.
        with sess.as_default():
            # 8 means to limit the span to size 8 or less
            best_spans, conf = model.get_prediction().get_best_span(8)

        # Loads the saved weights
        model_dir.restore_checkpoint(sess)

        # Now the model is ready to run
        # The model takes input in the form of `ContextAndQuestion` objects, for example:
        data = [
            ParagraphAndQuestion(x, question, None, "user-question%d" % i)
            for i, x in enumerate(context)
        ]

        print("Starting run")
        # The model is run in two steps, first it "encodes" a batch of paragraph/context pairs
        # into numpy arrays, then we use `sess` to run the actual model get the predictions
        encoded = model.encode(
            data, is_train=False)  # batch of `ContextAndQuestion` -> feed_dict
        best_spans, conf = sess.run([best_spans, conf],
                                    feed_dict=encoded)  # feed_dict -> predictions
    finally:
        # A new session is created on every request; release it so repeated
        # calls do not accumulate open sessions.
        sess.close()

    # We get output for each paragraph, select the most-confident one to print
    best_para = np.argmax(conf)
    span = best_spans[best_para]
    # Spans are inclusive, hence the +1 on the end index.
    y_output = " ".join(context[best_para][span[0]:span[1] + 1])

    print("Best Paragraph: " + str(best_para))
    print("Best span: " + str(span))
    print("Answer text: " + y_output)
    print("Confidence: " + str(conf[best_para]))
    print(y_output)
    json_data["predictions"].append(str(y_output))

    # indicate that the request was a success
    json_data["success"] = True
    # return the data dictionary as a JSON response
    return flask.jsonify(json_data)
Exemple #25
0
def main():
    """Train the ELMo attention model on SQuAD.

    ``loss_mode`` selects either the plain span-bounds setup ('default') or
    the confidence setup, which preprocesses the corpus and, unless
    ``--no-tfidf`` is given, uses the TF-IDF ranker preprocessor. The model
    is written to ``output_dir`` with a month/day-time suffix; the sorted
    arguments plus this script's source are saved as the run notes.
    """
    arg_parser = argparse.ArgumentParser("Train our ELMo model on SQuAD")
    arg_parser.add_argument("loss_mode", choices=['default', 'confidence'])
    arg_parser.add_argument("output_dir")
    arg_parser.add_argument("--dim", type=int, default=90)
    arg_parser.add_argument("--l2", type=float, default=0)
    arg_parser.add_argument("--mode",
                            choices=["input", "output", "both", "none"],
                            default="both")
    arg_parser.add_argument("--top_layer_only", action="store_true")
    arg_parser.add_argument("--no-tfidf",
                            action='store_true',
                            help="Don't add TF-IDF negative examples")
    args = arg_parser.parse_args()

    output_path = args.output_dir + "-" + datetime.now().strftime("%m%d-%H%M%S")

    dim = args.dim
    # The same recurrent layer object is passed to every component below.
    recurrent_layer = CudnnGru(dim, w_init=TruncatedNormal(stddev=0.05))

    if args.loss_mode == 'default':
        n_epochs = 24
        answer_encoder = SingleSpanAnswerEncoder()
        bounds_mapper = ChainBiMapper(first_layer=recurrent_layer,
                                      second_layer=recurrent_layer)
        predictor = BoundsPredictor(bounds_mapper)
        batcher = ClusteredBatcher(45, ContextLenKey(), False, False)
        data = DocumentQaTrainingData(SquadCorpus(), None, batcher, batcher)
    elif args.loss_mode == 'confidence':
        if args.no_tfidf:
            prepro = SquadDefault()
            n_epochs = 15
        else:
            prepro = SquadTfIdfRanker(NltkPlusStopWords(True), 4, True)
            n_epochs = 50
        answer_encoder = DenseMultiSpanAnswerEncoder()
        predictor = ConfidencePredictor(
            ChainBiMapper(first_layer=recurrent_layer,
                          second_layer=recurrent_layer),
            AttentionEncoder(),
            FullyConnected(80, activation="tanh"),
            aggregate="sum")
        eval_dataset = RandomParagraphSetDatasetBuilder(
            100, 'flatten', True, 0)
        train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3), True,
                                          False)
        train_dataset = StratifyParagraphsBuilder(train_batching, 1)
        data = PreprocessedData(SquadCorpus(),
                                prepro,
                                train_dataset,
                                eval_dataset,
                                eval_on_verified=False)
        data.preprocess(1)

    optimizer = trainer.SerializableOptimizer("Adadelta",
                                              dict(learning_rate=1.0))
    params = trainer.TrainParams(optimizer,
                                 ema=0.999,
                                 max_checkpoints_to_keep=2,
                                 async_encoding=10,
                                 num_epochs=n_epochs,
                                 log_period=30,
                                 eval_period=1200,
                                 save_period=1200,
                                 best_weights=("dev", "b17/text-f1"),
                                 eval_samples=dict(dev=None, train=8000))

    elmo_layer = ElmoLayer(args.l2,
                           layer_norm=False,
                           top_layer_only=args.top_layer_only)
    lm_reduce = MapperSeq(elmo_layer, DropoutLayer(0.5))

    # Model components, built in the same order the model receives them.
    question_encoder = DocumentAndQuestionEncoder(answer_encoder)
    lm_model = SquadContextConcatSkip()
    # --mode decides where the language-model output gets appended.
    lm_before_attention = args.mode in ("both", "output")
    lm_with_embeddings = args.mode in ("both", "input")
    word_embed = FixedWordEmbedder(vec_name="glove.840B.300d",
                                   word_vec_init_scale=0,
                                   learn_unk=False,
                                   cpu=True)
    char_embedder = LearnedCharEmbedder(word_size_th=14,
                                        char_th=49,
                                        char_dim=20,
                                        init_scale=0.05,
                                        force_cpu=True)
    char_embed = CharWordEmbedder(char_embedder,
                                  MaxPool(Conv1d(100, 5, 0.8)),
                                  shared_parameters=True)
    embed_mapper = SequenceMapperSeq(
        VariationalDropoutLayer(0.8),
        recurrent_layer,
        VariationalDropoutLayer(0.8),
    )
    memory_builder = NullBiMapper()
    attention = BiAttention(TriLinear(bias=True), True)
    match_encoder = SequenceMapperSeq(
        FullyConnected(dim * 2, activation="relu"),
        ResidualLayer(
            SequenceMapperSeq(
                VariationalDropoutLayer(0.8),
                recurrent_layer,
                VariationalDropoutLayer(0.8),
                StaticAttentionSelf(TriLinear(bias=True),
                                    ConcatWithProduct()),
                FullyConnected(dim * 2, activation="relu"),
            )),
        VariationalDropoutLayer(0.8))

    model = AttentionWithElmo(encoder=question_encoder,
                              lm_model=lm_model,
                              append_before_atten=lm_before_attention,
                              append_embed=lm_with_embeddings,
                              max_batch_size=128,
                              word_embed=word_embed,
                              char_embed=char_embed,
                              embed_mapper=embed_mapper,
                              lm_reduce=None,
                              lm_reduce_shared=lm_reduce,
                              per_sentence=False,
                              memory_builder=memory_builder,
                              attention=attention,
                              match_encoder=match_encoder,
                              predictor=predictor)

    # Notes = the sorted argument list followed by this script's source.
    with open(__file__, "r") as script:
        source_text = script.read()
    sorted_args = sorted(vars(args).items(), key=lambda item: item[0])
    notes = str(sorted_args) + "\n" + source_text

    evaluators = [
        LossEvaluator(),
        SpanEvaluator(bound=[17], text_eval="squad"),
    ]
    trainer.start_training(data, model, params, evaluators,
                           ModelDir(output_path), notes)
Exemple #26
0
                return out
            if on_token + len(sent) < start:
                pass
            on_para += sent
            on_token += len(sent)
        if len(on_para) > 0:
            out.append(on_para)
            on_para = []

    out.append(on_para)
    if len(flatten_iterable(out)) != end - start:
        raise ValueError(len(flatten_iterable(out)), end-start)
    return out


# Module-level stop-word collection, built once at import time; display_para
# uses it to drop stop words from the question.
stop = NltkPlusStopWords(True).words

class bcolors:
    """ANSI escape sequences for colouring terminal output.

    Prefix text with one of the colour codes and append ``ENDC`` to reset.
    """

    ENDC = "\033[0m"      # reset all attributes
    CYAN = "\033[96m"     # bright cyan
    ERROR = "\033[91m"    # bright red
    CORRECT = "\033[94m"  # bright blue


def display_para(text: List[str], answers, question, p_start, p_end):
    words = {w.lower() for w in question if w.lower() not in stop}
    text = list(text)
    if answers is not None:
        for s,e in answers:
            text[s] = bcolors.CORRECT + text[s]
            text[e] = text[e] + bcolors.ENDC