def get_stats(mapper):
    """Print the 30 most frequent `mapper` outputs for known and unknown questions.

    Each entry of the training JSON is split into sentences and only the
    last sentence is used. That sentence is classified with
    `get_question_type`; `mapper(sentence)` is then counted either in the
    "unknown" tally (type is `QType.UNKNOWN`) or in the "known" tally.
    Sentences whose `len` is below 2 are skipped.

    Args:
        mapper: callable applied to the last sentence; its result is used as
            a dictionary key, so it must be hashable.
    """
    entries = read_data_as_json("data/10_train-v1.1_with_context.json")
    recognised = {}
    unrecognised = {}

    for entry in tqdm.tqdm(entries):
        sentences = texts.split_in_sentences(entry)
        assert len(sentences) >= 1
        last_sentence = sentences[-1]
        qtype = get_question_type(last_sentence)
        if len(last_sentence) < 2:
            continue

        key = mapper(last_sentence)
        # Pick the tally that matches the classification, then bump the key.
        bucket = unrecognised if qtype == QType.UNKNOWN else recognised
        bucket[key] = bucket.get(key, 0) + 1

    def by_count_desc(counts):
        # Most frequent first; ties keep their insertion order (stable sort).
        return sorted(counts.items(), key=lambda pair: pair[1], reverse=True)

    top_known = by_count_desc(recognised)[:30]
    top_unknown = by_count_desc(unrecognised)[:30]

    print("Known: ")
    for construct, cnt in top_known:
        print("\t", construct, ': ', cnt)

    print("")

    print("Unknown: ")
    for construct, cnt in top_unknown:
        print("\t", construct, ': ', cnt)
 def process(question):
     """Translate the last sentence of `question` and re-attach the rest.

     The text is split into sentences, the final one is passed through
     `QTS.__translate`, and the preceding sentences (stringified and joined
     with single spaces) are prepended when there are any.

     NOTE(review): if this def sits inside a class body, Python name-mangles
     the `__translate` attribute access -- confirm against the enclosing
     scope, which is not visible here.
     """
     sentences = texts.split_in_sentences(question)
     assert len(sentences) >= 1
     translated = QTS.__translate(sentences[-1])
     prefix = ' '.join(map(str, sentences[:-1]))
     if not prefix:
         # Nothing precedes the final sentence (or it stringifies to "").
         return translated
     return prefix + " " + translated
def iterate():
    """Feed every START_WITH_BE question of the training set to `start_with_be.process`.

    Only the last sentence of each dataset entry is classified and, when its
    type is `QType.START_WITH_BE`, forwarded. The result of
    `start_with_be.process` is discarded.
    """
    for entry in read_data_as_json("data/10_train-v1.1_with_context.json"):
        sentences = texts.split_in_sentences(entry)
        assert len(sentences) >= 1
        last_sentence = sentences[-1]
        starts_with_be = get_question_type(last_sentence) == QType.START_WITH_BE
        if not starts_with_be:
            continue
        start_with_be.process(last_sentence)
def proc():
    """Print `QTS.process` output for each UNKNOWN-typed line of kaggle_unknown.txt.

    Every line is stripped and must split into exactly one sentence
    (asserted). The sentence is classified, but the full stripped line, not
    the sentence object, is what gets passed to `QTS.process`.
    """
    with open("kaggle_unknown.txt", "r") as handle:
        for raw_line in handle:
            text = raw_line.strip()
            sentences = texts.split_in_sentences(text)
            assert len(sentences) == 1
            if get_question_type(sentences[0]) == QType.UNKNOWN:
                print(QTS.process(text))
 def get_qtype_as_int(question):
     """Return the position of the question's type within `QType`.

     The text is split into sentences and only the last one is classified
     with `get_question_type`. The returned integer is the zero-based index
     of the first `QType` member (in iteration order) that compares equal to
     that classification.

     Args:
         question: text to classify; must yield at least one sentence.

     Returns:
         Zero-based index of the matching `QType` member.

     Raises:
         AssertionError: if no sentence is found, or if the classification
             does not match any member of `QType`.
     """
     sents = texts.split_in_sentences(question)
     assert (len(sents) >= 1)
     qtype = get_question_type(sents[-1])
     for idx, member in enumerate(QType):
         if member == qtype:
             return idx
     # Raised explicitly rather than via `assert False`: assert statements
     # are removed under `python -O`, which would turn this into a silent
     # `return None`.
     raise AssertionError(
         "question type {!r} is not a member of QType".format(qtype))
def coverage():
    """Show, on a tqdm progress bar, the share of questions with a known type.

    The last sentence of every training-set entry is classified; anything
    other than `QType.UNKNOWN` counts as known. The bar description is
    rewritten with the running percentage after every tenth entry.
    """
    entries = read_data_as_json("data/10_train-v1.1_with_context.json")
    recognised = 0
    progress = tqdm.tqdm(entries, desc='Coverage')

    # `seen` is the number of entries handled so far, including this one.
    for seen, entry in enumerate(progress, start=1):
        sentences = texts.split_in_sentences(entry)
        assert len(sentences) >= 1
        is_unknown = get_question_type(sentences[-1]) == QType.UNKNOWN
        if not is_unknown:
            recognised += 1

        if seen % 10:
            # Only touch the bar on every tenth entry.
            continue

        percent = 100.0 * recognised / seen
        progress.set_description("Coverage: {0:.3f}%".format(percent))
        progress.refresh()