Esempio n. 1
0
def makeDoc(doc):
    """Convert a parser result (or CoNLL-U text) into a list of token objects.

    The input kind is detected from the text of ``str(type(doc))``:

    * spaCy objects are returned unchanged;
    * stanza, classla / stanfordnlp, nltk and combo results are serialized
      to CoNLL-U text first;
    * a ``list`` is treated as sentences, each an iterable of lines;
    * anything else is converted with ``str(doc)``.

    Only lines with exactly ten tab-separated columns are used; every other
    line, and any line whose ID / HEAD columns cannot be interpreted, is
    silently skipped.

    Returns:
        ``doc`` itself for spaCy input.  Otherwise a list with one attribute
        container per token line (each container is a throwaway class made
        with ``type()``), carrying:

        * ``i``           -- integer value of the ID column;
        * ``orth_``       -- FORM column;
        * ``pos_``        -- UPOS column;
        * ``dep_``        -- DEPREL column;
        * ``head``        -- the head token container, or the token itself
          when the HEAD column is ``0``;
        * ``whitespace_`` -- ``False`` when MISC contains ``SpaceAfter=No``
          or when the next token's ``start_char`` equals this token's
          ``end_char``; ``True`` otherwise;
        * ``contract``    -- only on the last token of a multiword range
          ``j-k``: ``(form, [list indices of tokens j..k])``.
    """
    s = str(type(doc))
    # In "<class 'pkg.mod.Name'>" the module path starts at index 8.
    if s.find("spacy") == 8:
        return doc
    elif s.find("stanza") == 8:
        from stanza.utils.conll import CoNLL
        d = CoNLL.conll_as_string(CoNLL.convert_dict(doc.to_dict()))
    elif s.find("classla") == 8 or s.find("stanfordnlp") == 8:
        d = doc.conll_file.conll_as_string()
    elif s.find("nltk") == 8:
        d = doc.to_conll(10)
    elif s.find("combo") == 8:
        from combo.data import sentence2conllu
        d = sentence2conllu(doc, False).serialize()
    elif s.find("list") == 8:
        d = "".join("".join(str(t) + "\n" for t in s) + "\n" for s in doc)
    else:
        d = str(doc)
    DOC = []
    m = []  # multiword ranges: (index of next token in DOC, first, last, form)
    misc = ""  # MISC column of the previously accepted token
    for line in d.split("\n"):
        x = line.split("\t")
        if len(x) != 10:
            continue
        try:
            i, j = int(x[0]), int(x[6])
        except ValueError:
            # Not a plain token line: accept "first-last" multiword ranges
            # and skip everything else (empty nodes, "_" heads, ...).
            try:
                dash = x[0].index("-")
                first = int(x[0][0:dash])
                last = int(x[0][dash + 1:])
            except ValueError:
                continue
            m.append((len(DOC), first, last, x[1]))
            continue
        tok = type("", (object, ), {"i": i})
        tok.orth_ = x[1]
        tok.pos_ = x[3]
        tok.head = j
        tok.dep_ = x[7]
        tok.whitespace_ = (x[9].find("SpaceAfter=No") < 0)
        if tok.whitespace_:
            p = x[9].find("start_char=")
            if p >= 0:
                q = x[9].find("|", p)
                # Skipping the 5 characters of "start" leaves "_char=N";
                # prefixed with "end" it becomes the "end_char=N" looked up
                # in the previous token's MISC column.
                k = x[9][p + 5:] if q < 0 else x[9][p + 5:q]
                if misc.find("end" + k) >= 0:
                    DOC[-1].whitespace_ = False
        DOC.append(tok)
        misc = x[9]
    n = len(DOC)
    for i, j, k, f in reversed(m):
        if i >= n:
            # Range line that is not followed by any token line.
            continue
        offset = i - DOC[i].i
        if k + offset >= n:
            # The range runs past the last token that was read.
            continue
        DOC[k + offset].contract = (f, [w + offset for w in range(j, k + 1)])
    for i, t in enumerate(DOC):
        if t.head == 0:
            t.head = t
        else:
            # Translate the sentence-local head ID into an index of DOC.
            t.head = DOC[i + t.head - t.i]
    return DOC
Esempio n. 2
0
 def dump_line(self, outputs: data.Sentence) -> str:
     """Serialize one predicted sentence as CoNLL-U text or as JSON.

     When ``self.without_sentence_embedding`` is set, the
     ``sentence_embedding`` attribute of ``outputs`` is replaced by an empty
     list first (the passed object is modified in place).

     Returns the ``serialize()`` result of ``sentence2conllu`` when
     ``self.line_to_conllu`` is set, otherwise ``outputs.to_json()``.
     """
     if self.without_sentence_embedding:
         outputs.sentence_embedding = []
     # JSON output needs no conversion, so handle it first.
     if not self.line_to_conllu:
         return outputs.to_json()
     converted = sentence2conllu(
         outputs, keep_semrel=self._dataset_reader.use_sem)
     return converted.serialize()
Esempio n. 3
0
def serve(doc, port=5000, RtoL=False):
    """Show a parse result in the dependency editor.

    ``doc`` is first converted to CoNLL-U text.  The input kind is detected
    from the text of ``str(type(doc))``: spaCy documents are converted token
    by token (the whole document is emitted as one block numbered from 1),
    stanza / classla / stanfordnlp / nltk / combo results use their own
    serializers, a ``list`` is treated as sentences of lines, and anything
    else is converted with ``str(doc)``.

    Args:
        doc: Parser result or CoNLL-U text.
        port: TCP port for the local HTTP server.  With ``None`` no server
            is started; the editor page is displayed in an IPython
            ``IFrame`` with the URL-quoted CoNLL-U text as the fragment.
        RtoL: Select the right-to-left editor page / request handler.  For
            spaCy input it also adds ``Direction=RtoL`` to the MISC column
            of tokens longer than one character that contain a character
            with a code point above 12287.

    Returns:
        ``None``.  With a port, the call blocks in ``serve_forever()`` until
        it is interrupted by any exception (for example Ctrl-C).
    """
    s = str(type(doc))
    # In "<class 'pkg.mod.Name'>" the module path starts at index 8.
    if s.find("spacy") == 8:
        lines = []
        for t in doc:
            try:
                m = str(t.morph)
                if m.startswith("<spacy"):
                    m = ""
            except Exception:
                # Best effort: no usable morphology on this token.
                m = ""
            head = 0 if t.head == t else t.head.i + 1
            fields = [str(t.i + 1)]
            for v in (t.orth_, t.lemma_, t.pos_, t.tag_, m, str(head),
                      t.dep_, ""):
                fields.append("_" if v.strip() == "" else v)
            misc = []
            if t.ent_iob_ == "B" or t.ent_iob_ == "I":
                misc.append("NE=" + t.ent_iob_ + "-" + t.ent_type_)
            if RtoL and len(t.orth_) > 1:
                if any(ord(ch) > 12287 for ch in t.orth_):
                    misc.append("Direction=RtoL")
            if not t.whitespace_:
                misc.append("SpaceAfter=No")
            if t.norm_ != "" and t.norm_ != t.orth_:
                misc.append("Translit=" + t.norm_)
            fields.append("|".join(misc) if misc else "_")
            lines.append("\t".join(fields) + "\n")
        c = "".join(lines)
    elif s.find("stanza") == 8:
        from stanza.utils.conll import CoNLL
        c = CoNLL.conll_as_string(CoNLL.convert_dict(doc.to_dict()))
    elif s.find("classla") == 8 or s.find("stanfordnlp") == 8:
        c = doc.conll_file.conll_as_string()
    elif s.find("nltk") == 8:
        c = doc.to_conll(10)
    elif s.find("combo") == 8:
        from combo.data import sentence2conllu
        c = sentence2conllu(doc, False).serialize()
    elif s.find("list") == 8:
        c = "".join("".join(str(t) + "\n" for t in s) + "\n" for s in doc)
    else:
        c = str(doc)
    if port is None:
        from IPython.display import IFrame, display
        from urllib.parse import quote
        editor = EDITOR_RTOL if RtoL else EDITOR_URL
        display(IFrame(src=editor + "#" + quote(c),
                       width="100%",
                       height="400"))
        return
    import sys
    from http.server import HTTPServer
    # Store the CoNLL-U text in the module-level temporary file before the
    # server starts.
    f = TEMPFILE
    f.seek(0)
    f.truncate(0)
    f.write(c.encode("utf-8"))
    handler = DeplacyRequestHandlerRtoL if RtoL else DeplacyRequestHandler
    httpd = HTTPServer(("", port), handler)
    print("http://127.0.0.1:" + str(port) + "   " + VERSION, file=sys.stderr)
    try:
        httpd.serve_forever()
    except BaseException:
        # Deliberate catch-all: any interruption, including Ctrl-C, simply
        # ends the session.
        return
    finally:
        # Close the listening socket so the port can be reused.
        httpd.server_close()
Esempio n. 4
0
 def _to_input_instance(self,
                        sentence: data.Sentence) -> allen_data.Instance:
     """Build a model input instance from ``sentence``.

     The sentence is passed through ``sentence2conllu`` and the result is
     handed to the dataset reader's ``text_to_instance``.
     """
     to_instance = self._dataset_reader.text_to_instance
     converted = sentence2conllu(sentence)
     return to_instance(converted)
Esempio n. 5
0
def to_conllu(doc, RtoL=False):
    """Return ``doc`` serialized as CoNLL-U text.

    The input kind is detected from the text of ``str(type(doc))``:

    * spaCy documents are converted sentence by sentence, with token IDs
      and heads renumbered from 1 inside each sentence and a blank line
      after each sentence;
    * stanza, classla, stanfordnlp, nltk and combo results use their own
      serializers;
    * supar results are rendered with ``str()``, one sentence per block;
    * a ``list`` is treated as sentences, each an iterable of lines;
    * a ``dict`` with a ``"sentences"`` key is converted with trankit's
      CoNLL helper; tokens that have a ``"span"`` get
      ``start_char=..|end_char=..`` as their MISC value (the caller's
      dictionaries are left unmodified);
    * anything else is converted with ``str(doc)``.

    Args:
        doc: Parser result or CoNLL-U text.
        RtoL: For spaCy input, put ``Direction=RtoL`` first in the MISC
            column of tokens longer than one character that contain a
            character with a code point above 12287.

    Returns:
        The CoNLL-U text as ``str``.
    """
    s = str(type(doc))
    # In "<class 'pkg.mod.Name'>" the module path starts at index 8.
    if s.find("spacy") == 8:
        lines = []
        for sent in doc.sents:
            for t in sent:
                try:
                    m = str(t.morph)
                    if m.startswith("<spacy"):
                        m = ""
                except Exception:
                    # Best effort: no usable morphology on this token.
                    m = ""
                head = 0 if t.head == t else t.head.i - sent.start + 1
                fields = [str(t.i - sent.start + 1)]
                for v in (t.orth_, t.lemma_, t.pos_, t.tag_, m, str(head),
                          t.dep_, ""):
                    fields.append("_" if v.strip() == "" else v)
                misc = []
                if t.ent_iob_ == "B" or t.ent_iob_ == "I":
                    misc.append("NE=" + t.ent_iob_ + "-" + t.ent_type_)
                if RtoL and len(t.orth_) > 1:
                    if any(ord(ch) > 12287 for ch in t.orth_):
                        # The direction marker goes before any NE entry.
                        misc.insert(0, "Direction=RtoL")
                if not t.whitespace_:
                    misc.append("SpaceAfter=No")
                if t.norm_ != "" and t.norm_ != t.orth_:
                    misc.append("Translit=" + t.norm_)
                fields.append("|".join(misc) if misc else "_")
                lines.append("\t".join(fields) + "\n")
            lines.append("\n")
        return "".join(lines)
    elif s.find("stanza") == 8:
        from stanza.utils.conll import CoNLL
        return CoNLL.conll_as_string(CoNLL.convert_dict(doc.to_dict()))
    elif s.find("classla") == 8:
        return doc.to_conll()
    elif s.find("stanfordnlp") == 8:
        return doc.conll_file.conll_as_string()
    elif s.find("nltk") == 8:
        return doc.to_conll(10)
    elif s.find("combo") == 8:
        from combo.data import sentence2conllu
        return sentence2conllu(doc, False).serialize()
    elif s.find("supar") == 8:
        if hasattr(doc, "sentences"):
            return "".join([str(s) + "\n" for s in doc.sentences])
        else:
            return str(doc) + "\n"
    elif s.find("list") == 8:
        return "".join("".join(str(t) + "\n" for t in s) + "\n" for s in doc)
    elif s.find("dict") == 8 and "sentences" in doc:
        from trankit.utils.conll import CoNLL
        sentences = []
        for sent in doc["sentences"]:
            rows = []
            for t in sent["tokens"]:
                if "span" in t:
                    i, j = t["span"]
                    # Work on a shallow copy so the caller's token
                    # dictionary is not modified.
                    t = dict(t)
                    t["misc"] = "start_char=" + str(i) + "|end_char=" + str(j)
                rows.append(t)
                if "expanded" in t:
                    rows.extend(t["expanded"])
            sentences.append(rows)
        return CoNLL.conll_as_string(CoNLL.convert_dict(sentences))
    return str(doc)