def makeDoc(doc):
    # Normalize the output of several toolkits (spaCy, Stanza, CLASSLA,
    # StanfordNLP, NLTK, COMBO, a list of CoNLL-U token lines, or raw
    # CoNLL-U text) into a list of spaCy-like token objects.
    s = str(type(doc))
    if s.find("spacy") == 8:
        return doc
    elif s.find("stanza") == 8:
        from stanza.utils.conll import CoNLL
        d = CoNLL.conll_as_string(CoNLL.convert_dict(doc.to_dict()))
    elif s.find("classla") == 8 or s.find("stanfordnlp") == 8:
        d = doc.conll_file.conll_as_string()
    elif s.find("nltk") == 8:
        d = doc.to_conll(10)
    elif s.find("combo") == 8:
        from combo.data import sentence2conllu
        d = sentence2conllu(doc, False).serialize()
    elif s.find("list") == 8:
        d = "".join("".join(str(t) + "\n" for t in s) + "\n" for s in doc)
    else:
        d = str(doc)
    DOC = []
    m = []
    misc = ""
    for t in d.split("\n"):
        x = t.split("\t")
        if len(x) != 10:
            continue
        try:
            i, j = int(x[0]), int(x[6])
        except ValueError:
            # Multi-word token line with an ID range "j-k": remember its
            # surface form and span, then skip it.
            try:
                i = x[0].index("-")
                j = int(x[0][0:i])
                k = int(x[0][i + 1:])
                m.append((len(DOC), j, k, x[1]))
                continue
            except ValueError:
                continue
        s = type("", (object,), {"i": i})
        s.orth_ = x[1]
        s.pos_ = x[3]
        s.head = j
        s.dep_ = x[7]
        s.whitespace_ = (x[9].find("SpaceAfter=No") < 0)
        if s.whitespace_:
            # If this token's start_char equals the previous token's end_char
            # in MISC, the previous token is not followed by a space.
            i = x[9].find("start_char=")
            if i >= 0:
                j = x[9].find("|", i)
                k = x[9][i + 5:] if j < 0 else x[9][i + 5:j]
                if misc.find("end" + k) >= 0:
                    DOC[-1].whitespace_ = False
        DOC.append(s)
        misc = x[9]
    # Attach multi-word tokens: the last member of each span records the
    # surface form and the indices of the tokens it contracts.
    for i, j, k, f in reversed(m):
        offset = i - DOC[i].i
        DOC[k + offset].contract = (f, [i + offset for i in range(j, k + 1)])
    # Replace CoNLL-U head IDs with direct references to the head tokens.
    for i, t in enumerate(DOC):
        if t.head == 0:
            t.head = t
        else:
            t.head = DOC[i + t.head - t.i]
    return DOC
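# Usage sketch (not part of the original code): makeDoc() also accepts raw
# CoNLL-U text through its final fallback branch, which is handy for a quick
# check without any parser installed. The sentence below is a made-up example.
def _example_makeDoc():
    conllu = ("1\tShe\tshe\tPRON\t_\t_\t2\tnsubj\t_\t_\n"
              "2\tleft\tleave\tVERB\t_\t_\t0\troot\t_\tSpaceAfter=No\n"
              "3\t.\t.\tPUNCT\t_\t_\t2\tpunct\t_\t_\n\n")
    for t in makeDoc(conllu):
        print(t.i, t.orth_, t.pos_, t.dep_, t.head.i)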
def dump_line(self, outputs: data.Sentence) -> str:
    # Decide whether to dump the serialized (CoNLL-U) tree or the token list
    # as JSON; a serialized tree already has separators between lines.
    if self.without_sentence_embedding:
        outputs.sentence_embedding = []
    if self.line_to_conllu:
        return sentence2conllu(
            outputs, keep_semrel=self._dataset_reader.use_sem).serialize()
    else:
        return outputs.to_json()
def serve(doc, port=5000, RtoL=False):
    # Convert the analysis to CoNLL-U text, then either serve it over HTTP or
    # (with port=None) render it inline through an IFrame in a notebook.
    s = str(type(doc))
    if s.find("spacy") == 8:
        c = ""
        for t in doc:
            try:
                m = str(t.morph)
                if m.startswith("<spacy"):
                    m = ""
            except:
                m = ""
            c += str(t.i + 1)
            for i in [
                    t.orth_, t.lemma_, t.pos_, t.tag_, m,
                    str(0 if t.head == t else t.head.i + 1), t.dep_, ""
            ]:
                c += "\t_" if i.strip() == "" else "\t" + i
            if t.ent_iob_ == "B" or t.ent_iob_ == "I":
                u = "NE=" + t.ent_iob_ + "-" + t.ent_type_
            else:
                u = ""
            if RtoL and len(t.orth_) > 1:
                # In RtoL mode, mark multi-character tokens containing code
                # points above U+2FFF (e.g. CJK).
                if len([c for c in t.orth_ if ord(c) > 12287]) > 0:
                    u += ("" if u == "" else "|") + "Direction=RtoL"
            if not t.whitespace_:
                u += ("" if u == "" else "|") + "SpaceAfter=No"
            if t.norm_ != "" and t.norm_ != t.orth_:
                u += ("" if u == "" else "|") + "Translit=" + t.norm_
            if u == "":
                u = "_"
            c += "\t" + u + "\n"
    elif s.find("stanza") == 8:
        from stanza.utils.conll import CoNLL
        c = CoNLL.conll_as_string(CoNLL.convert_dict(doc.to_dict()))
    elif s.find("classla") == 8 or s.find("stanfordnlp") == 8:
        c = doc.conll_file.conll_as_string()
    elif s.find("nltk") == 8:
        c = doc.to_conll(10)
    elif s.find("combo") == 8:
        from combo.data import sentence2conllu
        c = sentence2conllu(doc, False).serialize()
    elif s.find("list") == 8:
        c = "".join("".join(str(t) + "\n" for t in s) + "\n" for s in doc)
    else:
        c = str(doc)
    if port is None:
        from IPython.display import IFrame, display
        from urllib.parse import quote
        if RtoL:
            display(IFrame(src=EDITOR_RTOL + "#" + quote(c), width="100%",
                           height="400"))
        else:
            display(IFrame(src=EDITOR_URL + "#" + quote(c), width="100%",
                           height="400"))
        return
    import sys
    from http.server import HTTPServer
    f = TEMPFILE
    f.seek(0)
    f.truncate(0)
    f.write(c.encode("utf-8"))
    if RtoL:
        httpd = HTTPServer(("", port), DeplacyRequestHandlerRtoL)
    else:
        httpd = HTTPServer(("", port), DeplacyRequestHandler)
    print("http://127.0.0.1:" + str(port) + " " + VERSION, file=sys.stderr)
    try:
        httpd.serve_forever()
    except:
        return
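# Usage sketch (not part of the original code): serve() relies on the
# module-level globals TEMPFILE, DeplacyRequestHandler(RtoL), EDITOR_URL,
# EDITOR_RTOL and VERSION defined elsewhere in this package, so call it from
# within the module or through the installed package (e.g. deplacy.serve).
# The spaCy model name below is only an illustrative assumption.
def _example_serve():
    import spacy
    nlp = spacy.load("en_core_web_sm")
    doc = nlp("She left early.")
    serve(doc, port=5000)    # blocks; open http://127.0.0.1:5000 in a browser
    # serve(doc, port=None)  # inside Jupyter: renders an IFrame instead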
def _to_input_instance(self, sentence: data.Sentence) -> allen_data.Instance:
    return self._dataset_reader.text_to_instance(sentence2conllu(sentence))
def to_conllu(doc, RtoL=False):
    # Serialize the analysis of any supported toolkit as CoNLL-U text.
    s = str(type(doc))
    if s.find("spacy") == 8:
        c = ""
        for s in doc.sents:
            for t in s:
                try:
                    m = str(t.morph)
                    if m.startswith("<spacy"):
                        m = ""
                except:
                    m = ""
                c += str(t.i - s.start + 1)
                for i in [
                        t.orth_, t.lemma_, t.pos_, t.tag_, m,
                        str(0 if t.head == t else t.head.i - s.start + 1),
                        t.dep_, ""
                ]:
                    c += "\t_" if i.strip() == "" else "\t" + i
                if t.ent_iob_ == "B" or t.ent_iob_ == "I":
                    u = "NE=" + t.ent_iob_ + "-" + t.ent_type_
                else:
                    u = ""
                if RtoL and len(t.orth_) > 1:
                    # In RtoL mode, mark multi-character tokens containing
                    # code points above U+2FFF (e.g. CJK).
                    if len([c for c in t.orth_ if ord(c) > 12287]) > 0:
                        u = "Direction=RtoL" if u == "" else "Direction=RtoL|" + u
                if not t.whitespace_:
                    u += ("" if u == "" else "|") + "SpaceAfter=No"
                if t.norm_ != "" and t.norm_ != t.orth_:
                    u += ("" if u == "" else "|") + "Translit=" + t.norm_
                if u == "":
                    u = "_"
                c += "\t" + u + "\n"
            c += "\n"
        return c
    elif s.find("stanza") == 8:
        from stanza.utils.conll import CoNLL
        return CoNLL.conll_as_string(CoNLL.convert_dict(doc.to_dict()))
    elif s.find("classla") == 8:
        return doc.to_conll()
    elif s.find("stanfordnlp") == 8:
        return doc.conll_file.conll_as_string()
    elif s.find("nltk") == 8:
        return doc.to_conll(10)
    elif s.find("combo") == 8:
        from combo.data import sentence2conllu
        return sentence2conllu(doc, False).serialize()
    elif s.find("supar") == 8:
        if hasattr(doc, "sentences"):
            return "".join([str(s) + "\n" for s in doc.sentences])
        else:
            return str(doc) + "\n"
    elif s.find("list") == 8:
        return "".join("".join(str(t) + "\n" for t in s) + "\n" for s in doc)
    elif s.find("dict") == 8 and "sentences" in doc:
        # Trankit returns a plain dict: rebuild character offsets in MISC and
        # splice in expanded multi-word tokens before converting.
        from trankit.utils.conll import CoNLL
        d = []
        for s in doc["sentences"]:
            e = []
            for t in s["tokens"]:
                if "span" in t:
                    i, j = t["span"]
                    t["misc"] = "start_char=" + str(i) + "|end_char=" + str(j)
                e.append(t)
                if "expanded" in t:
                    e.extend(t["expanded"])
            d.append(list(e))
        return CoNLL.conll_as_string(CoNLL.convert_dict(d))
    return str(doc)
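# Usage sketch (not part of the original code): dump a spaCy parse as CoNLL-U.
# Assumes a spaCy pipeline with a dependency parser is installed;
# "en_core_web_sm" is only an illustrative choice.
def _example_to_conllu():
    import spacy
    nlp = spacy.load("en_core_web_sm")
    print(to_conllu(nlp("The quick brown fox jumps over the lazy dog.")))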