Example #1
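All six examples implement `__call__` on a Cylleneus corpus tokenizer. Each body relies on a little class state: a `cached` flag, a `_cache` list, a `_docix` marker, and a `cache` property that replays cached tokens. A minimal sketch of that scaffolding, with the class and base names assumed rather than taken from the source:

    import copy

    class CachedTokenizer(Tokenizer):  # both names are assumptions
        def __init__(self, cached=True, **kwargs):
            super().__init__()
            self.cached = cached  # whether to cache tokens per document
            self._cache = None    # tokens emitted for the last document
            self._docix = None    # document index the cache belongs to

        @property
        def cache(self):
            # hand out copies so callers cannot mutate the cached tokens
            return copy.deepcopy(self._cache)

    # This variant walks treebank XML whose "postag" morphology is mapped
    # with agldt2wn (AGLDT-style annotation); citation divisions come from
    # each token's "cite" attribute.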
    def __call__(
        self,
        data,
        positions=True,
        chars=True,
        keeporiginal=True,
        removestops=True,
        tokenize=True,
        start_pos=0,
        start_char=0,
        mode="",
        **kwargs,
    ):
        if kwargs.get("docix", None) == self._docix and self._cache:
            yield from self.cache
        else:
            t = CylleneusToken(positions,
                               chars,
                               removestops=removestops,
                               mode=mode,
                               **kwargs)

            if t.mode == "query":
                t.original = t.text = data
                yield t
            else:
                self._cache = []
                self._docix = kwargs.get("docix", None)

                if not tokenize:
                    t.original = ""
                    for token in data["text"].iter("token"):
                        form = token.get("form")
                        if not form:
                            continue
                        t.original += form
                    t.text = t.original
                    t.boost = 1.0
                    if positions:
                        t.pos = start_pos
                    if chars:
                        t.startchar = start_char
                        t.endchar = start_char + len(t.original)
                    yield t
                else:
                    for sentence in data["text"].iter("sentence"):
                        sect_pos = -1
                        curr_line = None
                        for pos, token in enumerate(sentence.iter("word")):
                            if token.get("artificial", False):
                                continue

                            form = token.get("form")
                            if not form:
                                continue
                            t.text = form

                            lemma = token.get("lemma")
                            if not lemma or lemma in (
                                    "???",
                                    ".",
                                    ",",
                                    ";",
                                    "·",
                                    "punc1",
                                    "comma1",
                                    "PERIOD1",
                            ):
                                continue
                            t.lemma = lemma

                            t.morpho = agldt2wn(token.get("postag"))
                            t.morphosyntax = token.get("relation", None)
                            t.boost = 1.0

                            meta = {"meta": data["meta"].lower()}
                            divs = data["meta"].split("-")

                            refs = (token.get("cite")
                                    .rsplit(":", maxsplit=1)[1]
                                    .split("."))
                            for i, div in enumerate(divs):
                                meta[div] = refs[i]
                            meta["sent_id"] = sentence.get("id")
                            meta["sent_pos"] = str(int(token.get("id")))

                            # start a new in-line counter when the citation's
                            # last component advances (string comparison, so
                            # line numbers are assumed to sort ascending)
                            if curr_line and refs[-1] > curr_line:
                                sect_pos = 0
                            else:
                                sect_pos += 1
                            curr_line = refs[-1]

                            meta["sect_pos"] = sect_pos  # ref in line
                            t.meta = meta

                            if keeporiginal:
                                t.original = form
                            t.stopped = False
                            if positions:
                                t.pos = start_pos + pos
                            if chars:
                                t.startchar = start_char
                                t.endchar = start_char + len(form)
                            if self.cached:
                                self._cache.append(copy.copy(t))
                            yield t

                            start_char += len(form)
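A usage sketch under the same assumptions. Here `doc` stands for the parsed document this variant expects, a dict with `"text"` (an XML tree) and `"meta"` (a citation scheme such as "book-line"); `mode="index"` is simply any mode other than `"query"`:

    tokenizer = CachedTokenizer()

    # query mode: the input string passes through as a single token
    for token in tokenizer("arma", mode="query"):
        print(token.text)

    # indexing mode: a repeated call with the same docix replays the cache
    for token in tokenizer(doc, docix=0, mode="index"):
        print(token.meta["sent_id"], token.lemma, token.text)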
Example #2
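    # This variant appears to target Diorisis-style TEI XML of Greek texts:
    # forms arrive as beta code (converted with beta2unicode), morphology is
    # mapped with diorisis2wn, and the TLG author/work codes in the teiHeader
    # are looked up in AUTHOR_TAB to recover the citation scheme.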
    def __call__(
        self,
        data,
        positions=True,
        chars=True,
        keeporiginal=True,
        removestops=True,
        tokenize=True,
        start_pos=0,
        start_char=0,
        mode="",
        **kwargs,
    ):
        if kwargs.get("docix", None) == self._docix and self._cache:
            yield from self.cache
        else:
            t = CylleneusToken(positions,
                               chars,
                               removestops=removestops,
                               mode=mode,
                               **kwargs)

            if t.mode == "query":
                t.original = data
                t.text = normalize("NFKC", data)
                yield t
            else:
                self._cache = []
                self._docix = kwargs.get("docix", None)

                if tokenize:
                    titleStmt = data.find('.//teiHeader').find(
                        'fileDesc').find('titleStmt')
                    auth_code = f"tlg{titleStmt.find('tlgAuthor').text}"
                    work_code = f"tlg{titleStmt.find('tlgId').text}"

                    body = data.find('.//text').find('body')

                    divs = AUTHOR_TAB[auth_code]["works"][work_code]["meta"]

                    meta = {"meta": divs}
                    divv = divs.split("-")
                    for k in divv:
                        meta[k] = None

                    sect_sent = 0
                    sect_pos = 0
                    current_refs = None
                    pos = 0
                    for sentence in body.iter("sentence"):
                        refs = sentence.get("location")
                        if refs != current_refs:
                            current_refs = refs
                            sect_pos = 0
                            sect_sent = 0
                        sent_id = sentence.get("id")
                        sect_sent += 1

                        for i, ref in enumerate(refs.split(".")):
                            meta[divv[i]] = ref

                        for sent_pos, word in enumerate(sentence.iter("word")):
                            t.boost = 1.0

                            sect_pos += 1
                            pos += 1

                            lemma = word.find("lemma").get("entry", None)
                            if not lemma:
                                # guard: normalize() would raise on a missing
                                # "entry" attribute
                                continue
                            t.lemma = normalize("NFKC", lemma)

                            meta["sent_id"] = sent_id
                            meta["sent_pos"] = word.get("id")
                            meta["sect_pos"] = str(sect_pos)
                            meta["sect_sent"] = str(sect_sent)
                            t.meta = copy.copy(meta)

                            beta = word.get("form").upper()
                            # a trailing newline cues beta2unicode to render
                            # a word-final "S" as final sigma
                            form = normalize(
                                "NFKC",
                                beta2unicode(
                                    beta + "\n" if beta.endswith("S") else beta
                                ),
                            )
                            if t.lemma.istitle():
                                form = form.title()
                            t.text = form

                            if keeporiginal:
                                t.original = beta
                            t.stopped = False
                            if positions:
                                t.pos = start_pos + pos

                            original_len = len(form)
                            if chars:
                                t.startchar = start_char
                                t.endchar = start_char + original_len
                            start_char += original_len

                            POS = word.find("lemma").get("POS", None)
                            analyses = [
                                analysis.get("morph", None) for analysis in
                                word.find("lemma").iter("analysis")
                            ]
                            morphos = []
                            for analysis in analyses:
                                morphos += diorisis2wn(POS, analysis)
                            t.morpho = " ".join(morphos)

                            if self.cached:
                                self._cache.append(copy.deepcopy(t))
                            yield t
                else:
                    body = data.find('.//text').find('body')

                    tokens = []
                    for sentence in body.iter("sentence"):
                        for word in sentence.iter("word"):
                            form = word.get("form")
                            if not form:
                                continue
                            else:
                                tokens.append(form)
                    t.original = t.text = " ".join(tokens)
                    t.boost = 1.0
                    if positions:
                        t.pos = start_pos
                    if chars:
                        t.startchar = start_char
                        t.endchar = start_char + len(t.original)
                    yield t
Example #3
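    # This variant reads AGLDT-style treebank XML for Latin: query strings
    # are normalized with jvmap (i/j, u/v), lemmas are stripped of
    # disambiguating digits, and forms listed in the `editorial` mapping
    # additionally yield a regularized token.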
    def __call__(
        self,
        data,
        positions=True,
        chars=True,
        keeporiginal=True,
        removestops=True,
        tokenize=True,
        start_pos=0,
        start_char=0,
        mode="",
        **kwargs,
    ):
        if kwargs.get("docix", None) == self._docix and self._cache:
            yield from self.cache
        else:
            t = CylleneusToken(positions,
                               chars,
                               removestops=removestops,
                               mode=mode,
                               **kwargs)

            if t.mode == "query":
                t.original = data
                t.text = data.translate(jvmap)
                yield t
            else:
                self._cache = []
                self._docix = kwargs.get("docix", None)

                if tokenize:
                    for sentence in data["text"].iter("sentence"):
                        for pos, token in enumerate(sentence.iter("word")):
                            if token.get("artificial", False):
                                continue
                            form = token.get("form")
                            if not form:
                                continue
                            # normalize non-ASCII space characters to plain
                            # spaces (assumed; the exact characters were
                            # mangled in extraction)
                            form = form.replace("\u00a0", " ").replace("\u2009", " ")
                            form = re.sub(r"\.([^ ]|^$)", r". \1", form)
                            lemma = token.get("lemma", None)
                            if not lemma or lemma in (
                                    ".",
                                    ",",
                                    "punc1",
                                    "comma1",
                                    "PERIOD1",
                            ):
                                continue
                            t.lemma = lemma.strip("0123456789")
                            t.morpho = agldt2wn(token.get("postag"))
                            t.morphosyntax = token.get("relation", None)
                            t.boost = 1.0

                            meta = {"meta": data["meta"].lower()}
                            divs = data["meta"].split("-")
                            for i, div in enumerate(divs):
                                if len(divs) <= 2 or div != "line":
                                    meta[div] = sentence.get("subdoc").split(".")[i]
                            meta["sent_id"] = sentence.get("id")
                            meta["sent_pos"] = token.get("id")
                            t.meta = meta

                            if keeporiginal:
                                t.original = form
                            t.stopped = False
                            if positions:
                                t.pos = start_pos + pos
                            original_len = len(form)

                            if (form.istitle() and pos == 0
                                    and not t.lemma.istitle()):
                                form = form.lower()
                            t.text = form
                            if chars:
                                t.startchar = start_char
                                t.endchar = start_char + original_len
                            if self.cached:
                                self._cache.append(copy.deepcopy(t))
                            yield t

                            if form in editorial:
                                t.text = editorial[form]
                                if self.cached:
                                    self._cache.append(copy.copy(t))
                                yield t
                            start_char += len(form)
                else:
                    t.original = ""
                    for token in data.iter("token"):
                        form = token.get("form")
                        if not form:
                            continue
                        t.original += form
                    t.text = t.original
                    t.boost = 1.0
                    if positions:
                        t.pos = start_pos
                    if chars:
                        t.startchar = start_char
                        t.endchar = start_char + len(t.original)
                    yield t
Example #4
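    # This variant reads PROIEL-style treebank XML: morphology is mapped with
    # proiel2wn, citations come from the "citation-part" attribute, and the
    # "presentation-before"/"presentation-after" attributes carry the
    # punctuation surrounding each form.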
    def __call__(
        self,
        data,
        positions=True,
        chars=True,
        keeporiginal=True,
        removestops=True,
        tokenize=True,
        start_pos=0,
        start_char=0,
        mode="",
        **kwargs,
    ):
        if kwargs.get("docix", None) == self._docix and self._cache:
            yield from self.cache
        else:
            t = CylleneusToken(
                positions, chars, removestops=removestops, mode=mode, **kwargs
            )

            if t.mode == "query":
                t.original = data
                t.text = data.translate(jvmap)
                yield t
            else:
                if not tokenize:
                    t.original = ""
                    for token in data.iter("token"):
                        form = token.get("form")
                        if not form:
                            continue
                        after = token.get("presentation-after", "")
                        before = token.get("presentation-before", "")
                        t.original += f"{before}{form}{after}"
                    t.text = t.original
                    t.boost = 1.0
                    if positions:
                        t.pos = start_pos
                    if chars:
                        t.startchar = start_char
                        t.endchar = start_char + len(t.original)
                    yield t
                else:
                    self._cache = []
                    self._docix = kwargs.get("docix", None)

                    for sentence in data["text"].iter("sentence"):
                        for pos, token in enumerate(sentence.iter("token")):
                            form = token.get("form")
                            if not form:
                                continue
                            # normalize non-ASCII space characters to plain
                            # spaces (assumed; the exact characters were
                            # mangled in extraction)
                            form = form.replace("\u00a0", " ").replace("\u2009", " ")
                            form = re.sub(r"\.([^ ]|^$)", r". \1", form)
                            t.lemma = token.get("lemma")
                            t.morpho = proiel2wn(
                                token.get("part-of-speech"),
                                token.get("morphology"),
                            )
                            t.morphosyntax = token.get("relation", None)
                            t.boost = 1.0

                            meta = {"meta": data["meta"].lower()}
                            for i, div in enumerate(data["meta"].split("-")):
                                meta[div] = token.get("citation-part").split(
                                    "."
                                )[i]
                            meta["sent_id"] = sentence.get("id")
                            meta["sent_pos"] = token.get("id")
                            t.meta = meta

                            before = token.get("presentation-before", "")
                            after = token.get("presentation-after", "")

                            if keeporiginal:
                                t.original = f"{before}{form}{after}"
                            t.stopped = False
                            if positions:
                                t.pos = start_pos + pos
                            original_len = len(form)

                            if (
                                form.istitle()
                                and pos == 0
                                and not t.lemma.istitle()
                            ):
                                form = form.lower()
                            t.text = form
                            if chars:
                                t.startchar = start_char + len(before)
                                t.endchar = (
                                    start_char + len(before) + original_len
                                )
                            self._cache.append(copy.deepcopy(t))
                            yield t

                            if form in editorial:
                                t.text = editorial[form]
                                self._cache.append(copy.deepcopy(t))
                                yield t
                            start_char += len(before) + len(form) + len(after)
Example #5
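    # This variant parses LASLA-style BPN lines with parse_bpn, dispatching
    # on the form code ("&"/"+" ordinary forms, "@" combined forms, "="
    # enclitic -que) while tracking sentence- and section-relative positions.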
    def __call__(self,
                 value: dict,
                 positions=False,
                 chars=False,
                 keeporiginal=True,
                 removestops=True,
                 tokenize=True,
                 start_pos=0,
                 start_char=0,
                 mode="",
                 **kwargs):
        if kwargs.get("docix", None) == self._docix and self._cache:
            yield from self.cache
        else:
            t = CylleneusToken(positions,
                               chars,
                               removestops=removestops,
                               mode=mode,
                               **kwargs)

            if t.mode == "query":
                t.original = value
                t.text = value.translate(jvmap)
                yield t
            else:
                if not tokenize:
                    t.original = t.text = "\n".join(value["text"])
                    t.boost = 1.0
                    if positions:
                        t.pos = start_pos
                    if chars:
                        t.startchar = start_char
                        # span the joined text, not the number of lines
                        t.endchar = start_char + len(t.original)
                    yield t
                else:
                    self._cache = []
                    self._docix = kwargs.get("docix", None)

                    punctuation = str.maketrans("", "", string.punctuation)
                    editorial = str.maketrans("", "", "[{(<>)}]")
                    added = re.compile(r"(\s?[<(][\w .]+[>)]\s?)")

                    t.boost = 1.0
                    t.pos = t.startchar = t.endchar = 0

                    sect_sent = 0  # sentence count within passage
                    sent_id = "0001"
                    sect_pos = 0  # word pos within passage
                    sent_pos = 0  # word pos within sentence
                    current_refs = tuple(["0"] * len(value["meta"]))
                    nflag = None
                    for pos, line in enumerate(value["text"]):
                        t.pos = pos

                        parsed = parse_bpn(line)

                        if not parsed:
                            continue

                        if int(parsed["sent_id"]) > int(sent_id):
                            sent_pos = 0
                            sent_id = parsed["sent_id"]
                            new_refs = tuple(
                                alnum(i) for i in parsed["refs"].split(",")
                            )
                            if new_refs > current_refs:
                                sect_sent = 1
                                sect_pos = 0
                            else:
                                sect_sent += 1

                        if keeporiginal:
                            if added.search(parsed["form"]):
                                t.original = added.sub("", parsed["form"])
                            else:
                                t.original = parsed["form"]
                        t.stopped = False

                        if parsed["form_code"] in "&+":
                            if parsed["lemma"] != "#":
                                if parsed["lemma"] == "_SVM":
                                    t.morpho = None
                                    t.lemma = parsed["lemma"]
                                    t.lemma_n = parsed["lemma_n"]
                                    t.original = added.sub("", parsed["form"])
                                    t.text = parsed["form"].translate(
                                        editorial)
                                else:
                                    form = parsed["form"]
                                    t.morpho = parsed["morpho"]

                                    if " " in form:
                                        t.original = added.sub("", form)
                                        text = form.translate(editorial)
                                    else:
                                        t.original = form
                                        text = form
                                    t.lemma = parsed["lemma"]
                                    t.lemma_n = parsed["lemma_n"]
                                    if added.search(parsed["form"]):
                                        t.original = added.sub(
                                            "", parsed["form"])
                                    t.text = text.translate(editorial)
                                    nflag = False
                            else:
                                # could be a Greek form, do we index it?
                                t.morpho = ""
                                t.lemma = ""
                                t.lemma_n = ""
                                t.original = added.sub("", parsed["form"])
                                t.text = parsed["form"].translate(editorial)
                        elif parsed["form_code"] == "@":  # combined forms
                            if parsed["lemma"] != "#":
                                t.lemma = parsed["lemma"]
                                t.lemma_n = parsed["lemma_n"]
                                t.text = parsed["form"].translate(editorial)
                                t.morpho = parsed["morpho"]
                                if nflag:
                                    sect_pos -= 1
                                    sent_pos -= 1
                                else:
                                    nflag = True
                            else:
                                sent_pos += 1
                                sect_pos += 1
                                continue
                        elif parsed["form_code"] == "=":  # que
                            t.text = parsed["form"].translate(editorial)
                            t.lemma = parsed["lemma"]
                            t.lemma_n = parsed["lemma_n"]
                            t.morpho = parsed["morpho"]
                            sent_pos -= 1
                            sect_pos -= 1
                            nflag = False
                        meta = {"meta": value["meta"].lower()}
                        tags = value["meta"].split("-")
                        divs = {i: div.lower() for i, div in enumerate(tags)}
                        refs = tuple([
                            ref.translate(punctuation)
                            for ref in parsed["refs"].strip().split(",")
                        ])
                        for i in range(len(divs)):
                            meta[divs[i]] = refs[i]

                        current_refs = refs

                        t.morphosyntax = parsed["subord"]

                        meta["sect_sent"] = str(sect_sent)
                        meta["sect_pos"] = str(sect_pos)
                        meta["sent_id"] = parsed["sent_id"]
                        meta["sent_pos"] = str(sent_pos)
                        t.meta = meta
                        t.startchar = start_char
                        t.endchar = start_char + len(t.original)

                        if t.text != t.original:
                            tc = copy.deepcopy(t)
                            tc.text = t.original
                            yield tc

                        yield t
                        sent_pos += 1
                        sect_pos += 1
                        start_char += len(t.original) + 1
Example #6
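    # This variant reads tab-separated CoNLL-U-like rows, apparently from the
    # Digital Corpus of Sanskrit (note dcs_id and the commented-out Devanagari
    # conversion), with "# label: value" comment lines carrying chapter and
    # line metadata.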
    def __call__(
        self,
        value,
        positions=False,
        chars=False,
        keeporiginal=True,
        removestops=True,
        tokenize=True,
        start_pos=0,
        start_char=0,
        mode="",
        **kwargs
    ):
        if kwargs.get("docix", None) == self._docix and self._cache:
            yield from self.cache
        else:
            t = CylleneusToken(
                positions, chars, removestops=removestops, mode=mode, **kwargs
            )

            if t.mode == "query":
                t.text = t.original = value
                yield t
            else:
                if not tokenize:
                    lines = []
                    for line in value["text"]:
                        line = re.sub(r"\t+", "\t", line.strip())
                        # match only "# text_line: ..." lines; matching the
                        # bare prefix "# text_line" would also catch
                        # "# text_line_id" and crash on the split below
                        if line.startswith("# text_line: "):
                            text = line.split("# text_line: ", maxsplit=1)[1]
                            lines.append(text)
                    t.original = t.text = "\n".join(lines)
                    t.boost = 1.0
                    if positions:
                        t.pos = start_pos
                    if chars:
                        t.startchar = start_char
                        t.endchar = start_char + len(t.text)
                    yield t
                else:
                    self._cache = []
                    self._docix = kwargs.get("docix", None)

                    t.boost = 1.0
                    t.pos = t.startchar = t.endchar = 0

                    meta = {
                        "text":                 None,  # work title
                        "text_id":              None,
                        "chapter":              None,  # reference
                        "chapter_id":           None,
                        "text_line":            None,  # the text
                        "text_line_id":         None,
                        "text_line_counter":    None,  # line number
                        "text_line_subcounter": None,  # token number
                    }

                    sect_pos = 0
                    sent_pos = 0
                    for line in value["text"]:
                        line = line.strip()
                        if line:
                            if line.startswith("#"):
                                try:
                                    # local name avoids shadowing the `value`
                                    # parameter
                                    label, val = line.split(":", maxsplit=1)
                                except ValueError:
                                    continue
                                label = label.split(" ", maxsplit=1)[1].strip()
                                val = val.strip()
                                meta[label] = (
                                    val if not val.isnumeric() else int(val)
                                )

                                if label in [
                                    "text_line_counter",
                                    "text_line_subcounter",
                                ]:
                                    sent_pos = 0
                            else:
                                # try the full 13-column DCS annotation first
                                try:
                                    (
                                        ID,
                                        FORM,
                                        LEMMA,
                                        UPOS,
                                        XPOS,
                                        MORPHO,
                                        _,
                                        _,
                                        _,
                                        _,
                                        LEMMA_ID,
                                        PADA,
                                        SEM,
                                    ) = line.split("\t")
                                except ValueError:
                                    # fall back to a 12-column variant
                                    try:
                                        (
                                            ID,
                                            FORM,
                                            LEMMA,
                                            _,
                                            XPOS,
                                            _,
                                            _,
                                            _,
                                            _,
                                            LEMMA_ID,
                                            _,
                                            _,
                                        ) = line.split("\t")
                                    except ValueError:
                                        # finally try a bare 10-column row
                                        try:
                                            (
                                                ID,
                                                FORM,
                                                _,
                                                _,
                                                _,
                                                _,
                                                _,
                                                _,
                                                _,
                                                _,
                                            ) = line.split("\t")
                                        except ValueError:
                                            continue
                                        else:
                                            t.original = FORM
                                            sect_pos += 1
                                            sent_pos += 1
                                            t.pos = sent_pos
                                            continue
                                    else:
                                        if FORM == "_":
                                            t.text = t.original
                                        else:
                                            sect_pos += 1
                                            sent_pos += 1

                                            t.text = FORM
                                            t.original = FORM
                                            t.pos = sent_pos
                                        t.lemma = LEMMA
                                        t.dcs_id = LEMMA_ID
                                        t.morphosyntax = XPOS
                                        t.morpho = None
                                        t.synset = None

                                        t.meta = {
                                            "meta":      "chapter-line",
                                            "chapter":   meta["chapter"],
                                            "line":      meta["text_line_counter"],
                                            "sect_pos":  sect_pos,
                                            "sect_sent": meta[
                                                             "text_line_counter"
                                                         ],
                                            "sent_id":   meta["text_line_id"],
                                            "sent_pos":  sent_pos,
                                        }
                                        t.startchar = start_char
                                        t.endchar = start_char + len(
                                            t.original
                                        )
                                        yield t

                                        # # Emit Devanagari
                                        # t.text = slp2deva(iast2slp(t.text))
                                        # t.mode = "skip"
                                        # yield t

                                        start_char += len(t.original) + 1
                                else:
                                    if FORM == "_":
                                        t.text = t.original
                                    else:
                                        sect_pos += 1
                                        sent_pos += 1

                                        t.text = FORM
                                        t.original = FORM
                                        t.pos = sent_pos
                                    t.lemma = LEMMA
                                    t.dcs_id = LEMMA_ID
                                    t.morphosyntax = XPOS
                                    t.morpho = (
                                        None
                                        if MORPHO == "_" or not MORPHO
                                        else parse_morpho(XPOS, MORPHO)
                                    )
                                    t.synset = None if SEM == "_" else SEM
                                    t.meta = {
                                        "meta":      "chapter-line",
                                        "chapter":   meta["chapter"],
                                        "line":      meta["text_line_counter"],
                                        "sect_pos":  sect_pos,
                                        "sect_sent": meta[
                                                         "text_line_counter"
                                                     ],
                                        "sent_id":   meta["text_line_id"],
                                        "sent_pos":  sent_pos,
                                    }
                                    t.startchar = start_char
                                    t.endchar = start_char + len(t.original)
                                    yield t

                                    start_char += len(t.original) + 1
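The rows this expects look roughly like the following (a hypothetical fragment; every value is invented for illustration, columns are tab-separated, and only the fields the code reads matter):

    # chapter: 1
    # text_line: rāmo rājamaṇiḥ ...
    # text_line_id: 7
    # text_line_counter: 1
    1	rāmaḥ	rāma	NOUN	n	Nom.Sg.	_	_	_	_	12345	_	_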