Example #1
def generate_templates(fname,
                       stanza_lang,
                       rtl=False,
                       min_support=2,
                       strict=True,
                       case_folding=False,
                       remove_punct=False,
                       temp_fname="gen_templates.txt",
                       sent_fname="gen_sentences.txt",
                       remove_diacritics=True,
                       dot_fix=False,
                       join_char=' ',
                       idf_file=None):
    def record(key):
        problems[key] += 1

    def print_report(templates):
        print("{} templates".format(len(templates)))
        print("{} impossible questions".format(problems['impossible']))
        print(
            "{} possible questions of which {} share the root with the original sentence"
            .format(problems['possible'], problems['same_root']))
        print("{} copula questions".format(problems['copula']))
        print("{} have no question templates".format(
            problems['no_q_template']))
        print("{} have no answer templates".format(problems['no_a_template']))

    problems = {
        'possible': 0,
        'impossible': 0,
        'same_root': 0,
        'copula': 0,
        'no_q_template': 0,
        'no_a_template': 0
    }

    templates = {}
    with open(fname) as f:
        for line in tqdm.tqdm(f):
            if line.strip():
                question, answer, base_sentence = line.split(" #|@ ")

                if remove_diacritics:
                    question = remove_unicode_diacritics(question)
                    base_sentence = remove_unicode_diacritics(base_sentence)
                    answer = remove_unicode_diacritics(answer)

                # lowercasing is a necessary step to mitigate the parser's errors
                if case_folding:
                    question = question.lower()
                    answer = answer.lower()
                    base_sentence = base_sentence.lower()

                if remove_punct:
                    question = remove_unicode_punctuation(question)
                    base_sentence = remove_unicode_punctuation(base_sentence)
                    answer = remove_unicode_punctuation(answer)

                question = question.strip()
                base_sentence = base_sentence.strip()
                answer = answer.strip()
                if dot_fix:
                    if not is_punctuation(question[-1]):
                        question += "?"
                    if not is_punctuation(base_sentence[-1]):
                        base_sentence += "."

                # have to go through files, because the C++ package works with files
                with open('sentence.conll', 'w') as f1:
                    ss = stanza_lang(base_sentence)
                    conll_list = CoNLL.convert_dict(ss.to_dict())
                    sentence_tokenized = [
                        w.text for s in ss.sentences for w in s.words
                    ]
                    f1.write(CoNLL.conll_as_string(conll_list))

                with open('question.conll', 'w') as f1:
                    qq = stanza_lang(question)
                    conll_list = CoNLL.convert_dict(qq.to_dict())
                    question_tokenized = [
                        w.text for s in qq.sentences for w in s.words
                    ]
                    f1.write(CoNLL.conll_as_string(conll_list))

                ud_s = udon2.ConllReader.read_file('sentence.conll')[0]
                ud_q = udon2.ConllReader.read_file('question.conll')[0]

                # s_roots = udon2.ConllReader.read_file('sentence.conll')
                # q_roots = udon2.ConllReader.read_file('question.conll')
                # ud_s = s_roots[0]
                # ud_q = q_roots[0]

                s_root_word = ud_s.children[0]
                q_root_word = ud_q.children[0]

                if strict:
                    diff = get_difference(question_tokenized,
                                          sentence_tokenized)
                    cond = not diff
                else:
                    same = get_intersection(question_tokenized,
                                            sentence_tokenized)
                    cond = len(same) > 0

                if cond:
                    # means there's a direct dependency tree transformation!
                    record('possible')
                    if s_root_word.form.lower() == q_root_word.form.lower():
                        # many questions that can be asked share the root with the original sentence
                        record('same_root')
                    elif q_root_word.prop_exists("deprel", "cop"):
                        # means this is a copula question
                        record('copula')

                    q_temp = generate_question_template(s_root_word,
                                                        q_root_word,
                                                        strict=strict,
                                                        join_char=join_char)

                    to_check = q_temp[:-1] if rtl else q_temp[1:]
                    S_t = sum([type(x) == TemplateElement for x in to_check])
                    S_nt = len(to_check) - S_t

                    if not q_temp:
                        record('no_q_template')
                        continue
                    if S_t == 0:
                        continue

                    qw = q_temp[-1] if rtl else q_temp[0]
                    if type(qw) == TemplateElement:
                        # the word in the question-word position is not a constant, so there is no question word
                        continue

                    if rtl:
                        qw = q_temp.pop()
                        q_temp.append('<qw>')
                    else:
                        qw = q_temp.pop(0)
                        q_temp.insert(0, '<qw>')

                    a_temp = generate_answer_template(s_root_word,
                                                      answer,
                                                      join_char=join_char)
                    if not a_temp:
                        record('no_a_template')
                        continue

                    q_temp, a_temp = normalize_templates(q_temp, a_temp)
                    qtemp_without_qw = join_char.join(map(str, q_temp))

                    if qtemp_without_qw not in templates:
                        templates[qtemp_without_qw] = {
                            'question': q_temp,
                            'all_templates': S_nt == 0,
                            'answer': a_temp,
                            'qw': {}
                        }

                    assert templates[qtemp_without_qw]['all_templates'] == (
                        S_nt == 0), "Inconsistency in templates found"

                    if qw not in templates[qtemp_without_qw]['qw']:
                        templates[qtemp_without_qw]['qw'][qw] = {}

                    atemp_str = join_char.join(map(str, a_temp))
                    if atemp_str not in templates[qtemp_without_qw]['qw'][qw]:
                        templates[qtemp_without_qw]['qw'][qw][atemp_str] = {
                            'answer': a_temp,
                            'examples': []
                        }

                    templates[qtemp_without_qw]['qw'][qw][atemp_str]['examples'].append({
                        'sentence': base_sentence.strip(),
                        'question': question.strip(),
                        'answer': answer.strip(),
                        # copy the node: otherwise the associated TreeList may already be
                        # freed when the node is used later, leading to a memory error
                        'node': s_root_word.copy(),
                    })

                    # templates[f"{s_root_word.upos} #|@ {str(s_root_word.feats)} #|@ {s_root_word.child_has_prop('deprel', 'aux')} #|@ {non_temp_el} #|@ {q_temp}"][qw][a_temp].add(
                    #     base_sentence.strip() + " | " + question.strip() + " | " + answer.strip())
                elif strict:
                    record('impossible')

    idf = load_idf(idf_file) if idf_file else None

    final_templates, temp_id = [], 1
    temp_base = os.path.splitext(os.path.basename(temp_fname))[0]
    with open(temp_fname, "w") as f, open(sent_fname, 'w') as f1:
        for _, passport in templates.items():
            N_ex = sum([
                len(data['examples']) for _, endings in passport['qw'].items()
                for _, data in endings.items()
            ])

            q_tmpl = join_char.join(map(str, passport['question']))

            if passport['all_templates'] or N_ex >= min_support:
                idfs = [
                    idf.get(t, float('inf')) for t in passport['question']
                    if type(t) == str and t != '<qw>'
                ] if idf else []  # without an IDF table, skip the IDF filter
                max_idf = max(idfs) if idfs else 0
                # max_idf <= log(4): every template token appeared in at least 25% of the documents
                if max_idf <= math.log(4):
                    for qw, endings in passport['qw'].items():
                        for a_tmpl, data in endings.items():
                            logging.debug(
                                "-- {} - {} - {} -> PASSED --".format(
                                    q_tmpl, passport['all_templates'], N_ex))

                            final_templates.append({
                                'question': q_tmpl.replace("<qw>", qw),
                                'answer': a_tmpl,
                                'props': [{
                                    'pos': x['node'].upos,
                                    'has_aux': x['node'].child_has_prop('deprel', 'aux'),
                                    'feats': x['node'].feats
                                } for x in data['examples']]
                            })

                            sent = "\n".join([
                                " | ".join([
                                    x['sentence'], x['question'], x['answer']
                                ]) for x in data['examples']
                            ])

                            tmpl = "{} => {}".format(
                                q_tmpl.replace('<qw>', qw), a_tmpl)
                            f.write("{}\n".format(tmpl))
                            f1.write("id: {}{}\n{}\n\n".format(
                                temp_base, temp_id, sent))
                            temp_id += 1
                else:
                    logging.debug(
                        "-- {} - {} - {} -> FAILED IDF ({}) --".format(
                            q_tmpl, passport['all_templates'], N_ex, max_idf))
            else:
                logging.debug("-- {} - {} - {} -> FAILED --".format(
                    q_tmpl, passport['all_templates'], N_ex))

    print_report(final_templates)

    return final_templates, temp_fname
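
The reusable core of Example #1 is the round trip stanza parse -> CoNLL-U file -> udon2 tree. Below is a minimal sketch of just that step, assuming stanza (with the depparse processor) and udon2 are installed; the helper name and temporary path are illustrative only.

import udon2
from stanza.utils.conll import CoNLL

def parse_to_udon2(text, stanza_lang, conll_path="tmp_sentence.conll"):
    # Parse with stanza, serialize to CoNLL-U, and read it back with udon2,
    # which (being C++-backed) loads trees from files.
    doc = stanza_lang(text)
    with open(conll_path, "w") as f:
        f.write(CoNLL.conll_as_string(CoNLL.convert_dict(doc.to_dict())))
    # read_file returns one tree per sentence; return the first root
    return udon2.ConllReader.read_file(conll_path)[0]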
Example #2
def serve(doc, port=5000, RtoL=False):
    s = str(type(doc))
    if s.find("spacy") == 8:
        c = ""
        for t in doc:
            try:
                m = str(t.morph)
                if m.startswith("<spacy"):
                    m = ""
            except:
                m = ""
            c += str(t.i + 1)
            for i in [
                    t.orth_, t.lemma_, t.pos_, t.tag_, m,
                    str(0 if t.head == t else t.head.i + 1), t.dep_, ""
            ]:
                c += "\t_" if i.strip() == "" else "\t" + i
            if t.ent_iob_ == "B" or t.ent_iob_ == "I":
                u = "NE=" + t.ent_iob_ + "-" + t.ent_type_
            else:
                u = ""
            if RtoL and len(t.orth_) > 1:
                if len([c for c in t.orth_ if ord(c) > 12287]) > 0:
                    u += ("" if u == "" else "|") + "Direction=RtoL"
            if not t.whitespace_:
                u += ("" if u == "" else "|") + "SpaceAfter=No"
            if t.norm_ != "" and t.norm_ != t.orth_:
                u += ("" if u == "" else "|") + "Translit=" + t.norm_
            if u == "":
                u = "_"
            c += "\t" + u + "\n"
    elif s.find("stanza") == 8:
        from stanza.utils.conll import CoNLL
        c = CoNLL.conll_as_string(CoNLL.convert_dict(doc.to_dict()))
    elif s.find("classla") == 8 or s.find("stanfordnlp") == 8:
        c = doc.conll_file.conll_as_string()
    elif s.find("nltk") == 8:
        c = doc.to_conll(10)
    elif s.find("combo") == 8:
        from combo.data import sentence2conllu
        c = sentence2conllu(doc, False).serialize()
    elif s.find("list") == 8:
        c = "".join("".join(str(t) + "\n" for t in s) + "\n" for s in doc)
    else:
        c = str(doc)
    if port is None:
        from IPython.display import IFrame, display
        from urllib.parse import quote
        if RtoL:
            display(
                IFrame(src=EDITOR_RTOL + "#" + quote(c),
                       width="100%",
                       height="400"))
        else:
            display(
                IFrame(src=EDITOR_URL + "#" + quote(c),
                       width="100%",
                       height="400"))
        return
    import sys
    from http.server import HTTPServer
    f = TEMPFILE
    f.seek(0)
    f.truncate(0)
    f.write(c.encode("utf-8"))
    if RtoL:
        httpd = HTTPServer(("", port), DeplacyRequestHandlerRtoL)
    else:
        httpd = HTTPServer(("", port), DeplacyRequestHandler)
    print("http://127.0.0.1:" + str(port) + "   " + VERSION, file=sys.stderr)
    try:
        httpd.serve_forever()
    except:
        return
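
A possible way to call serve(), assuming a stanza pipeline is available (sentence and port are illustrative only):

import stanza

nlp = stanza.Pipeline(lang="en", processors="tokenize,pos,lemma,depparse")
doc = nlp("The quick brown fox jumps over the lazy dog.")
serve(doc, port=5000)  # then open http://127.0.0.1:5000 in a browser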
Example #3
        if args.max_examples > 0:
            # sample one gold question and answer
            q_dict_keys = list(q_dict.keys())
            ind = np.random.choice(range(len(q_dict_keys)))
            gold_q = q_dict_keys[ind]
            ind_a = np.random.choice(range(len(q_dict[gold_q])))
            gold_a = q_dict[gold_q][ind_a]

        total += len(q_dict)

        # collapse runs of multiple spaces
        sent = re.sub(r' {2,}', ' ', sent)

        stanza_sent = stanza_dep_pipe(sent)
        with open(fname, 'w') as f:
            conll_list = CoNLL.convert_dict(stanza_sent.to_dict())
            f.write(CoNLL.conll_as_string(conll_list))
        trees = udon2.ConllReader.read_file(fname)

        res = overgenerate_questions(trees,
                                     guards_root,
                                     templates,
                                     template_examples,
                                     return_first=False)

        if res:
            idx_sorted_by_scores, qwf, atf, scores = rank(
                res,
                stanza_pipe,
                stanza_dep_pipe,
                qw_stat,
Example #4
                    arquivos[arquivo.rsplit(".txt")[0]] = nlp(text)
                except:
                    sys.stderr.write('\nerro: ' + arquivo)
        pickle_path = (diretorio + "/" +
                       (diretorio.rsplit("/", 1)[1] if "/" in diretorio else diretorio) +
                       ".p")
        with open(pickle_path, "wb") as f:
            pickle.dump(arquivos, f)
    else:
        pickle_path = (diretorio + "/" +
                       (diretorio.rsplit("/", 1)[1] if "/" in diretorio else diretorio) +
                       ".p")
        with open(pickle_path, "rb") as f:
            arquivos = pickle.load(f)
    for arquivo in arquivos:
        arquivos[arquivo] = CoNLL.convert_dict(arquivos[arquivo].to_dict())
elif os.path.isfile(diretorio):
    arquivo = diretorio
    if not os.path.isfile(diretorio + ".p"):
        with open(arquivo, encoding="utf-8") as f:
            text = f.read()
        arquivos[arquivo.rsplit(".txt")[0]] = nlp(text)
        with open(diretorio + ".p", "wb") as w:
            pickle.dump(arquivos, w)
    else:
        with open(diretorio + ".p", "rb") as f:
            arquivos = pickle.load(f)
    for arquivo in arquivos:
        arquivos[arquivo] = CoNLL.convert_dict(arquivos[arquivo].to_dict())

sentences = []
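
The caching pattern in Example #4 (annotate each file once, pickle the stanza document, convert with CoNLL.convert_dict on load) can be condensed into a helper. A rough sketch with a hypothetical function name, assuming nlp is a stanza pipeline:

import os
import pickle
from stanza.utils.conll import CoNLL

def annotate_with_cache(path, nlp):
    cache = path + ".p"
    if os.path.isfile(cache):
        # reuse the pickled stanza Document if it exists
        with open(cache, "rb") as f:
            doc = pickle.load(f)
    else:
        with open(path, encoding="utf-8") as f:
            doc = nlp(f.read())
        with open(cache, "wb") as f:
            pickle.dump(doc, f)
    # CoNLL-style list of sentences, each a list of 10-field token rows
    return CoNLL.convert_dict(doc.to_dict())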
Example #5
def test_conllu(processed_doc):
    assert CoNLL.conll_as_string(CoNLL.convert_dict(
        processed_doc.to_dict())) == EN_DOC_CONLLU_GOLD
Example #6
def write_doc_to_file(doc, out_file):
    conll_string = CoNLL.conll_as_string(CoNLL.convert_dict(doc.to_dict()))
    with open(str(out_file), "w") as fp:
        fp.write(conll_string)
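
A minimal usage sketch for write_doc_to_file, assuming an English stanza pipeline with the depparse processor (the file name is illustrative):

import stanza

nlp = stanza.Pipeline(lang="en", processors="tokenize,pos,lemma,depparse")
write_doc_to_file(nlp("Stanza can export CoNLL-U."), "example.conllu")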
Example #7
# Converts a text processed by stanza into an NLTK corpus

import nltk
from nltk.corpus.reader import conll
import stanza

from stanza.utils.conll import CoNLL

# stanza Spanish tagging pipeline
nlp = stanza.Pipeline(lang='es')

# The input here is arbitrary; it could just as well come from a file.
doc = nlp("Yo soy Diego. Soy de Puerto Rico.")

# Convert to conll format
stanza_conll = CoNLL.convert_dict(doc.to_dict())

# Write to conll format file - we could write multiple files for multiple
# different input sources here
with open('conll.txt', 'w+') as f:
    f.write(CoNLL.conll_as_string(stanza_conll))

# The columns we want (maybe we can get more info, I'm not sure)
COLUMN_TYPES = (
    'ignore',
    'words',
    'ignore',
    'pos',
    'ignore',
    'ignore',
    'ignore',
Example #8
    with open(sys.argv[2] + ".json") as f:
        tokenized_dict = json.load(f)
else:
    tokenized = [[
        token.split("\t")[0] for token in sentence.splitlines()
        if len(token.split("\t")) > 7 and '-=' not in token.split("\t")[0]
    ] for sentence in arquivo]
    print("1/4 dicionário tokenizado: ok")
    nlp = stanza.Pipeline('pt', tokenize_pretokenized=True)
    tokenized_nlp = nlp([x for x in tokenized if x])
    print("2/4 anotação: ok")
    tokenized_dict = tokenized_nlp.to_dict()
    with open(sys.argv[2] + ".json", "w") as f:
        json.dump(tokenized_dict, f)
    print(":: checkpoint :: conversão para dict: salva em json")
tokenized = CoNLL.convert_dict(tokenized_dict)
print("3/4 conversão para CoNLL: ok")

sentences = []
for s, sentence in enumerate([x for x in tokenized if x]):
    metadados = {}
    #text = arquivo[s].split("# text = ")[1].split("\n")[0]
    #sent_id = arquivo[s].split("# sent_id = ")[1].split("\n")[0]
    for token in arquivo[s].splitlines():
        #print(token)
        if token.startswith("# "):
            key = token.split("# ", 1)[1].split(" ")[0]
            metadados[key] = token.split(" = ", 1)[1]
        if '-=' in token:
            for t, _token in enumerate(sentence):
Example #9
def test_dict_to_conll():
    conll = CoNLL.convert_dict(DICT)
    assert conll == CONLL
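
For context (not part of the original test): DICT is expected to hold stanza's Document.to_dict() output (one list of token dicts per sentence) and CONLL the matching list of 10-field CoNLL-U rows (ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC) per sentence; the actual fixtures are defined elsewhere. A purely hypothetical illustration of the shapes:

# Hypothetical shapes only, not the real fixtures:
# DICT  = [[{'id': 1, 'text': 'Hi', 'lemma': 'hi', 'upos': 'INTJ', 'head': 0, 'deprel': 'root'}]]
# CONLL = [[['1', 'Hi', 'hi', 'INTJ', '_', '_', '0', 'root', '_', '_']]]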
Example #10
def to_conllu(doc, RtoL=False):
    s = str(type(doc))
    if s.find("spacy") == 8:
        c = ""
        for s in doc.sents:
            for t in s:
                try:
                    m = str(t.morph)
                    if m.startswith("<spacy"):
                        m = ""
                except:
                    m = ""
                c += str(t.i - s.start + 1)
                for i in [
                        t.orth_, t.lemma_, t.pos_, t.tag_, m,
                        str(0 if t.head == t else t.head.i - s.start + 1),
                        t.dep_, ""
                ]:
                    c += "\t_" if i.strip() == "" else "\t" + i
                if t.ent_iob_ == "B" or t.ent_iob_ == "I":
                    u = "NE=" + t.ent_iob_ + "-" + t.ent_type_
                else:
                    u = ""
                if RtoL and len(t.orth_) > 1:
                    if len([c for c in t.orth_ if ord(c) > 12287]) > 0:
                        u = "Direction=RtoL" if u == "" else "Direction=RtoL|" + u
                if not t.whitespace_:
                    u += ("" if u == "" else "|") + "SpaceAfter=No"
                if t.norm_ != "" and t.norm_ != t.orth_:
                    u += ("" if u == "" else "|") + "Translit=" + t.norm_
                if u == "":
                    u = "_"
                c += "\t" + u + "\n"
            c += "\n"
        return c
    elif s.find("stanza") == 8:
        from stanza.utils.conll import CoNLL
        return CoNLL.conll_as_string(CoNLL.convert_dict(doc.to_dict()))
    elif s.find("classla") == 8:
        return doc.to_conll()
    elif s.find("stanfordnlp") == 8:
        return doc.conll_file.conll_as_string()
    elif s.find("nltk") == 8:
        return doc.to_conll(10)
    elif s.find("combo") == 8:
        from combo.data import sentence2conllu
        return sentence2conllu(doc, False).serialize()
    elif s.find("supar") == 8:
        if hasattr(doc, "sentences"):
            return "".join([str(s) + "\n" for s in doc.sentences])
        else:
            return str(doc) + "\n"
    elif s.find("list") == 8:
        return "".join("".join(str(t) + "\n" for t in s) + "\n" for s in doc)
    elif s.find("dict") == 8 and "sentences" in doc:
        from trankit.utils.conll import CoNLL
        d = []
        for s in doc["sentences"]:
            e = []
            for t in s["tokens"]:
                if "span" in t:
                    i, j = t["span"]
                    t["misc"] = "start_char=" + str(i) + "|end_char=" + str(j)
                e.append(t)
                if "expanded" in t:
                    e.extend(t["expanded"])
            d.append(list(e))
        return CoNLL.conll_as_string(CoNLL.convert_dict(d))
    return str(doc)
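
A possible call for the stanza branch of to_conllu(), assuming an English pipeline (illustrative only):

import stanza

nlp = stanza.Pipeline(lang="en", processors="tokenize,pos,lemma,depparse")
print(to_conllu(nlp("deplacy can render this tree.")))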
Example #11
import io
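# NOTE: this excerpt assumes MosesDetokenizer (e.g. from sacremoses), nltk, stanza,
# CoNLL (stanza.utils.conll) and the load_conllu / evaluate / print_results helpers
# (presumably from the CoNLL 2018 UD shared-task evaluation script) are imported elsewhere.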


detok = MosesDetokenizer()

# English
gold_conll_en = ""
for s in nltk.corpus.dependency_treebank.parsed_sents()[:200]:
    gold_conll_en += s.to_conll(10) + '\r\n'

nlp = stanza.Pipeline(processors='tokenize,mwt,pos,lemma,depparse')
stanza_conll_en = ""
for toks in nltk.corpus.dependency_treebank.sents()[:200]:
    sent = detok.detokenize(toks)
    doc = nlp(sent)
    for conll_sent in CoNLL.convert_dict(doc.to_dict()):
        for w in conll_sent:
            stanza_conll_en += '\t'.join(w) + '\r\n'
        stanza_conll_en += '\r\n'

f_gold_en = io.StringIO(gold_conll_en)
f_stanza_en = io.StringIO(stanza_conll_en)

gold_en_eval = load_conllu(f_gold_en)
stanza_en_eval = load_conllu(f_stanza_en)

stanza_en_evaluation = evaluate(gold_en_eval, stanza_en_eval)

print_results(stanza_en_evaluation,