Esempio n. 1
0
    def __call__(self, tokens, **kwargs):
        """Expand morphological annotations into index terms or query patterns.

        Index mode: ``t.morpho`` is split as ``"<morpho>><annotations>"`` and
        ``<morpho>`` as ``"<x>::<uri>:<n>"``.  For the ``i``-th
        whitespace-separated annotation the token is yielded with
        ``t.text = "<annotation>::<uri>:<n>:<i>"`` and then once more per
        character that is not ``"-"``, with every other position masked by
        ``"-"`` (the mask is padded for a 10-character annotation).

        Query mode: ``leipzig2wn(t.original)`` supplies the annotation.  An
        all-dash annotation yields nothing; otherwise one regular-expression
        pattern per non-dash position is yielded, matching the index terms
        produced above.

        The same token object is mutated and re-yielded for every term, so a
        consumer has to read ``t.text`` before advancing the generator.

        Args:
            tokens: iterable of token objects exposing ``mode`` and, depending
                on the mode, ``morpho`` or ``original``.
            **kwargs: accepted for interface compatibility; not read here.

        Yields:
            The input tokens with ``text`` rewritten.
        """
        for t in tokens:
            if t.mode == "index":
                packed = t.morpho
                if not packed:
                    continue
                morpho, annotations = packed.split(">")
                uri, n = morpho.split("::")[1].split(":")
                for i, annotation in enumerate(annotations.split()):
                    suffix = f"::{uri}:{n}:{i}"
                    # Full annotation first, then one term per set feature.
                    t.text = f"{annotation}{suffix}"
                    yield t

                    for j, v in enumerate(annotation):
                        if v != "-":
                            t.text = f"{'-' * j}{v}{'-' * (9 - j)}{suffix}"
                            yield t
            elif t.mode == "query":
                annotation = leipzig2wn(t.original)
                if annotation == "----------":
                    # No morphological feature requested: nothing to match.
                    continue
                for i, v in enumerate(annotation):
                    if v != "-":
                        mask = f"{'-' * i}{v}{'-' * (9 - i)}"
                        # Raw f-string: ``\w`` and ``\d`` are regex classes,
                        # not string escapes.
                        t.text = rf"{mask}::([\w\d$]+):(\d+):(\d+)$"
                        yield t
Esempio n. 2
0
    def __call__(self, tokens, **kwargs):
        """Rewrite tokens as ``lemma:uri=morpho`` terms for indexing or querying.

        Index mode: a token with a lemma is emitted once per morphological
        description listed in ``lemma_morpho`` for its ``dcs_id`` (tokens whose
        ``dcs_id`` is unknown are skipped).  ``t.morpho`` becomes
        ``"{morpho}::{uri}:{i}>{annotation}"`` and ``t.text`` becomes
        ``"{lemma}:{uri}={morpho}"``; when the token carries its own tag the
        annotation is ``str(Morph(morpho) + Morph(tag))``, otherwise it is the
        lemma's ``morpho`` itself.

        Query mode: an optional ``"<reltype>::"`` prefix is moved to
        ``t.reltype``.  Language-qualified (``lang?word``), ``#``/``=``
        queries, texts for which ``leipzig2wn`` returns a non-empty tag,
        numbers and complete ``lemma:uri=morpho`` terms are passed through.
        Anything else is looked up in ``SWN`` and emitted once per matching
        lemma (or per related lemma when a relation type was given).

        The same token object is mutated and re-yielded for every term.

        Args:
            tokens: iterable of token objects exposing ``mode`` and the
                attributes read below.
            **kwargs: only ``docix`` is read.  When it equals the previously
                seen value and a cache exists, ``self.cache`` is replayed
                instead of analysing ``tokens`` again.

        Yields:
            The (mutated) input tokens, or the cached tokens.
        """
        docix = kwargs.get("docix", None)
        if docix == self._docix and self._cache:
            yield from self.cache
            return

        self._cache = []
        self._docix = docix

        # transliteration[:uri][=morpho]
        # NOTE(review): ``A-z`` also admits ``[ \ ] ^ _`` and the backtick;
        # kept as-is in case URIs rely on ``_`` -- confirm before tightening.
        query_re = re.compile(r"(\w+)(?::([A-z0-9]+))?(?:=(.+))?")
        # Relation types read from ``.relations``; every other type is read
        # from ``.synsets_relations``.
        lemma_relations = ("\\", "/", "+c", "-c")

        for t in tokens:
            if t.mode == "index":
                if not t.lemma:
                    continue
                lemma = t.lemma
                dcs_id = t.dcs_id if t.dcs_id else None
                try:
                    morphos = lemma_morpho[dcs_id]
                except KeyError:
                    continue

                # Keep the token's own tag untouched: ``t.morpho`` is
                # overwritten below and every reading must be merged with the
                # original tag, not with the previous reading's result.
                tag = t.morpho
                for i, morpho in enumerate(morphos.split()):
                    uri = lemma_id[lemma][morpho]
                    if tag is not None:
                        annotation = str(Morph(morpho) + Morph(tag))
                    else:
                        annotation = morpho
                    t.morpho = f"{morpho}::{uri}:{i}>{annotation}"
                    t.text = f"{lemma}:{uri}={morpho}"
                    if self.cached:
                        self._cache.append(copy.copy(t))
                    yield t
            elif t.mode == "query":
                # Lexical relation
                if "::" in t.text:
                    reltype, query = t.text.split("::")
                    t.reltype = reltype
                    t.text = query

                text = t.text
                if "?" in text:
                    language, word = text.split("?")
                    t.language = language
                    t.text = word
                    yield t
                elif "#" in text or text.startswith("="):
                    yield t
                elif leipzig2wn(t.original) != "----------":
                    yield t
                elif text.isnumeric():
                    yield t
                elif hasattr(t, "reltype"):
                    found = query_re.search(text)
                    if found is None:
                        # No word character to use as a lemma: there is
                        # nothing to look up, hence no relation to emit.
                        continue
                    transliteration, uri, morpho = found.groups()
                    if uri is not None:
                        lemmas = SWN.lemmas_by_uri(uri)
                    else:
                        lemmas = SWN.lemmas(
                            transliteration=transliteration, morpho=morpho)
                    if t.reltype in lemma_relations:
                        results = lemmas.relations
                    else:
                        results = lemmas.synsets_relations
                    if results:
                        for result in results:
                            wanted = relation_types[t.reltype]
                            relations = result["relations"]
                            if wanted in relations:
                                for relation in relations[wanted]:
                                    t.text = (
                                        f"{relation['transliteration']}:{relation['uri']}"
                                        f"={relation['morpho']}")
                                    yield t
                else:
                    # query may be provided as lemma:uri=morpho
                    anchored = query_re.match(text)
                    if anchored is not None and all(anchored.groups()):
                        t.text = text
                        yield t
                        continue

                    found = (anchored if anchored is not None
                             else query_re.search(text))
                    if found is None:
                        # Nothing resembling a lemma in the query.
                        continue
                    transliteration, uri, morpho = found.groups()
                    if uri is not None:
                        results = SWN.lemmas_by_uri(uri)
                    else:
                        results = SWN.lemmas(
                            transliteration=transliteration, morpho=morpho)

                    if results:
                        for result in results:
                            if result["uri"] is not None:
                                t.text = f"{result['transliteration']}:{result['uri']}={result['morpho']}"
                                yield t
Esempio n. 3
0
    def __call__(self, tokens, **kwargs):
        """Rewrite tokens as ``lemma:uri=morpho`` terms using ``GreekWordNet``.

        Index mode: a token with a lemma is looked up with
        ``GWN.lemmas(lemma=..., pos=<first char of t.morpho>)`` and emitted
        once per result.  ``t.text`` becomes ``"{lemma}:{uri}={morpho}"`` of
        the result and ``t.morpho`` becomes
        ``"{result morpho}::{uri}:0>{adjusted tag}"``, where the adjusted tag
        is the token's own tag with its last two characters taken from the
        result (``"--"`` when the result has no morpho) and position 5 changed
        from ``"p"`` to ``"d"`` when the result has ``"d"`` there.

        Query mode: an optional ``"<reltype>::"`` prefix is moved to
        ``t.reltype``.  Language-qualified (``lang?word``), ``#``/``=``
        queries, texts for which ``leipzig2wn`` returns a non-empty tag,
        numbers and complete ``lemma:uri=morpho`` terms are passed through.
        Anything else is looked up and emitted once per matching lemma (or per
        related lemma when a relation type was given); a plain query without
        any match is passed through unchanged.

        The same token object is mutated and re-yielded for every term.

        Args:
            tokens: iterable of token objects exposing ``mode`` and the
                attributes read below.
            **kwargs: only ``docix`` is read.  When it equals the previously
                seen value and a cache exists, ``self.cache`` is replayed
                instead of analysing ``tokens`` again.

        Yields:
            The (mutated) input tokens, or the cached tokens.
        """
        docix = kwargs.get("docix", None)
        if docix == self._docix and self._cache:
            yield from self.cache
            return

        self._cache = []
        self._docix = docix

        GWN = GreekWordNet()

        # lemma[:uri][=morpho]
        # NOTE(review): ``A-z`` also admits ``[ \ ] ^ _`` and the backtick;
        # kept as-is in case URIs rely on ``_`` -- confirm before tightening.
        query_re = re.compile(r"(\w+)(?::([A-z0-9]+))?(?:=(.+))?")
        # Relation types read from ``.relations``; every other type is read
        # from ``.synsets_relations``.
        lemma_relations = ("\\", "/", "+c", "-c")

        for t in tokens:
            if t.mode == "index":
                tag = t.morpho
                lemma = t.lemma
                if not lemma:
                    continue
                results = GWN.lemmas(lemma=lemma, pos=tag[0]).get()
                if not results:
                    continue
                for result in results:
                    # Every result is adjusted from the token's own tag, so a
                    # change made for one lemma never leaks into the next.
                    entry_morpho = result["morpho"]
                    if entry_morpho:
                        adjusted = tag[:-2] + entry_morpho[-2:]
                        # The position-5 check only makes sense when the
                        # lemma actually has a morpho string.
                        if (adjusted[5:6] == "p"
                                and entry_morpho[5:6] == "d"):
                            adjusted = adjusted[:5] + "d" + adjusted[6:]
                    else:
                        adjusted = tag[:-2] + "--"
                    t.morpho = f"{result['morpho']}::{result['uri']}:0>{adjusted}"
                    t.text = (
                        f"{result['lemma']}:"
                        f"{result['uri']}={result['morpho']}")
                    if self.cached:
                        self._cache.append(copy.copy(t))
                    yield t
            elif t.mode == "query":
                # Lexical relation
                if "::" in t.text:
                    reltype, query = t.text.split("::")
                    t.reltype = reltype
                    t.text = query

                text = t.text
                if "?" in text:
                    language, word = text.split("?")
                    t.language = language
                    t.text = word
                    yield t
                elif "#" in text or text.startswith("="):
                    yield t
                elif leipzig2wn(t.original) != "----------":
                    yield t
                elif text.isnumeric():
                    yield t
                elif hasattr(t, "reltype"):
                    found = query_re.search(text)
                    if found is None:
                        # No word character to use as a lemma: there is
                        # nothing to look up, hence no relation to emit.
                        continue
                    lemma, uri, morpho = found.groups()
                    if uri is not None:
                        lemmas = GWN.lemmas_by_uri(uri)
                    else:
                        lemmas = GWN.lemmas(lemma=lemma, morpho=morpho)
                    if t.reltype in lemma_relations:
                        results = lemmas.relations
                    else:
                        results = lemmas.synsets_relations
                    if results:
                        for result in results:
                            wanted = relation_types[t.reltype]
                            relations = result["relations"]
                            if wanted in relations:
                                for relation in relations[wanted]:
                                    t.text = f"{relation['lemma']}:{relation['uri']}={relation['morpho']}"
                                    yield t
                else:
                    # query may be provided as lemma:uri=morpho
                    anchored = query_re.match(text)
                    if anchored is not None and all(anchored.groups()):
                        t.text = text
                        yield t
                        continue

                    found = (anchored if anchored is not None
                             else query_re.search(text))
                    if found is None:
                        # Nothing resembling a lemma: treat it like a lookup
                        # without results and pass the token through.
                        yield t
                        continue
                    lemma, uri, morpho = found.groups()
                    if uri is not None:
                        results = GWN.lemmas_by_uri(uri)
                    else:
                        results = GWN.lemmas(lemma=lemma, morpho=morpho)

                    if results:
                        for result in results:
                            t.text = f"{result['lemma']}:{result['uri']}={result['morpho']}"
                            yield t
                    else:
                        yield t
Esempio n. 4
0
    def __call__(self, tokens, **kwargs):
        """Rewrite tokens as ``lemma:uri=morpho`` terms using ``LWN``.

        Index mode: a token with text is passed to ``LWN.lemmatize`` and
        emitted once per analysis.  ``t.text`` becomes
        ``"{lemma}:{uri}={morpho}"`` of the analysed lemma and ``t.morpho``
        becomes ``"{lemma morpho}::{uri}:{i}>{tags}"``, where ``tags`` is the
        space-joined ``morpho`` list of the analysis.

        Query mode: an optional ``"<reltype>::"`` prefix is moved to
        ``t.reltype``.  Language-qualified (``lang?word``), ``#``/``=``
        queries, texts for which ``leipzig2wn`` returns a non-empty tag,
        numbers and complete ``lemma:uri=morpho`` terms are passed through.
        Anything else is looked up and emitted once per matching lemma (or per
        related lemma when a relation type was given).

        The same token object is mutated and re-yielded for every term.

        Args:
            tokens: iterable of token objects exposing ``mode`` and the
                attributes read below.
            **kwargs: only ``docix`` is read.  When it equals the previously
                seen value and a cache exists, ``self.cache`` is replayed
                instead of analysing ``tokens`` again.

        Yields:
            The (mutated) input tokens, or the cached tokens.
        """
        docix = kwargs.get("docix", None)
        if docix == self._docix and self._cache:
            yield from self.cache
            return

        self._cache = []
        self._docix = docix

        # lemma[:uri][=morpho]
        # NOTE(review): relation queries accept ``\w+`` lemmas while plain
        # queries also accept ``-``; both patterns are kept as they were --
        # confirm whether the difference is intended.  ``A-z`` also admits
        # ``[ \ ] ^ _`` and the backtick.
        relation_re = re.compile(r"(\w+)(?::([A-z0-9]+))?(?:=(.+))?")
        lemma_re = re.compile(r"([\w\-]+)(?::([A-z0-9]+))?(?:=(.+))?")
        # Relation types read from ``.relations``; every other type is read
        # from ``.synsets_relations``.
        lemma_relations = ("\\", "/", "+c", "-c")

        for t in tokens:
            if t.mode == "index":
                if not t.text:
                    continue
                analyses = LWN.lemmatize(t.text)
                if not analyses:
                    continue
                for i, analysis in enumerate(analyses):
                    entry = analysis["lemma"]
                    t.morpho = (
                        f"{entry['morpho']}::{entry['uri']}:{i}>"
                        f"{' '.join(analysis['morpho'])}")
                    t.text = f"{entry['lemma']}:{entry['uri']}={entry['morpho']}"
                    if self.cached:
                        self._cache.append(copy.copy(t))
                    yield t
            elif t.mode == "query":
                # Lexical relation
                if "::" in t.text:
                    reltype, query = t.text.split("::")
                    t.reltype = reltype
                    t.text = query

                text = t.text
                if "?" in text:
                    language, word = text.split("?")
                    t.language = language
                    t.text = word
                    yield t
                elif "#" in text or text.startswith("="):
                    yield t
                elif leipzig2wn(t.original) != "----------":
                    yield t
                elif text.isnumeric():
                    yield t
                elif hasattr(t, "reltype"):
                    found = relation_re.search(text)
                    if found is None:
                        # No word character to use as a lemma: there is
                        # nothing to look up, hence no relation to emit.
                        continue
                    lemma, uri, morpho = found.groups()
                    if uri is not None:
                        lemmas = LWN.lemmas_by_uri(uri)
                    else:
                        lemmas = LWN.lemmas(lemma=lemma, morpho=morpho)
                    if t.reltype in lemma_relations:
                        results = lemmas.relations
                    else:
                        results = lemmas.synsets_relations
                    if results:
                        for result in results:
                            relations = result["relations"]
                            if t.reltype in relations:
                                for relation in relations[t.reltype]:
                                    t.text = f"{relation['lemma']}:{relation['uri']}={relation['morpho']}"
                                    yield t
                else:
                    # query may be provided as lemma:uri=morpho
                    anchored = lemma_re.match(text)
                    if anchored is not None and all(anchored.groups()):
                        t.text = text
                        yield t
                        continue

                    found = (anchored if anchored is not None
                             else lemma_re.search(text))
                    if found is None:
                        # Nothing resembling a lemma in the query.
                        continue
                    lemma, uri, morpho = found.groups()
                    if uri is not None:
                        results = LWN.lemmas_by_uri(uri)
                    else:
                        results = LWN.lemmas(lemma=lemma, morpho=morpho)

                    if results:
                        for result in results:
                            t.text = f"{result['lemma']}:{result['uri']}={result['morpho']}"
                            yield t
Esempio n. 5
0
    def __call__(self, tokens, **kwargs):
        """Rewrite tokens as ``lemma:uri=morpho`` terms using ``LatinWordNet``.

        Index mode: a token with a lemma is resolved through
        ``mapping[lemma][lemma_n or "-"]`` into lists of URIs and lemma
        ``morpho`` strings and emitted once per (uri, morpho, annotation)
        combination.  The annotation is ``bpn2wn(t.morpho)``; a tag holding
        two single-character alternatives (``x/y``) is expanded into one
        annotation per alternative.  Without a tag the lemma's ``morpho`` is
        used as the annotation.  ``t.morpho`` becomes
        ``"{morpho}::{uri}:{i}>{annotation}"`` and ``t.text`` becomes
        ``"{lemma}:{uri}={morpho}"`` with the lemma lower-cased, stripped of
        ``_`` and with ``j``/``v`` mapped to ``i``/``u``.

        Query mode: an optional ``"<reltype>::"`` prefix is moved to
        ``t.reltype``.  Language-qualified (``lang?word``), ``#``/``=``
        queries, texts for which ``leipzig2wn`` returns a non-empty tag,
        numbers and complete ``lemma:uri=morpho`` terms are passed through.
        Anything else is looked up and emitted once per matching lemma (or per
        related lemma when a relation type was given); a plain query without
        any match is passed through unchanged.

        The same token object is mutated and re-yielded for every term.

        Args:
            tokens: iterable of token objects exposing ``mode`` and the
                attributes read below.
            **kwargs: only ``docix`` is read.  When it equals the previously
                seen value and a cache exists, ``self.cache`` is replayed
                instead of analysing ``tokens`` again.

        Yields:
            The (mutated) input tokens, or the cached tokens.
        """
        docix = kwargs.get("docix", None)
        if docix == self._docix and self._cache:
            yield from self.cache
            return

        self._cache = []
        self._docix = docix

        LWN = LatinWordNet()

        jvmap = str.maketrans("jv", "iu", "")
        # <head><a>/<b><tail>, e.g. n-s---m/nn3-
        alternatives_re = re.compile(
            r"^(.*?)([a-z1-9\-])/([a-z1-9\-])(.*?)$")
        # lemma[:uri][=morpho]
        # NOTE(review): ``A-z`` also admits ``[ \ ] ^ _`` and the backtick;
        # kept as-is in case URIs rely on ``_`` -- confirm before tightening.
        query_re = re.compile(r"(\w+)(?::([A-z0-9]+))?(?:=(.+))?")
        # Relation types read from ``.relations``; every other type is read
        # from ``.synsets_relations``.
        lemma_relations = ("\\", "/", "+c", "-c")

        for t in tokens:
            if t.mode == "index":
                if not t.lemma:
                    continue
                lemma = t.lemma
                ix = t.lemma_n if t.lemma_n.strip() else "-"
                morphos = mapping[lemma][ix]["morpho"]
                uris = mapping[lemma][ix]["uri"]

                if t.morpho is not None:
                    annotation = bpn2wn(t.morpho)
                else:
                    annotation = None
                lemma = lemma.lower().strip("_").translate(jvmap)

                # Expand the alternatives once per token, so that every
                # (uri, morpho) pair is emitted with all of them.
                if annotation is None:
                    annotations = None
                elif "/" in annotation:
                    split = alternatives_re.search(annotation)
                    if split is None:
                        # Unexpected shape: index the tag as it is.
                        annotations = [annotation]
                    else:
                        head, *alts, tail = split.groups()
                        annotations = [f"{head}{alt}{tail}" for alt in alts]
                else:
                    annotations = [annotation]

                for uri in uris:
                    for i, morpho in enumerate(morphos):
                        # Without a tag the lemma's own morpho stands in.
                        variants = (annotations if annotations is not None
                                    else [morpho])
                        for variant in variants:
                            t.morpho = f"{morpho}::{uri}:{i}>{variant}"
                            t.text = f"{lemma}:{uri}={morpho}"
                            if self.cached:
                                self._cache.append(copy.copy(t))
                            yield t
            elif t.mode == "query":
                # Lexical relation
                if "::" in t.text:
                    reltype, query = t.text.split("::")
                    t.reltype = reltype
                    t.text = query

                text = t.text
                if "?" in text:
                    language, word = text.split("?")
                    t.language = language
                    t.text = word
                    yield t
                elif "#" in text or text.startswith("="):
                    yield t
                elif leipzig2wn(t.original) != "----------":
                    yield t
                elif text.isnumeric():
                    yield t
                elif hasattr(t, "reltype"):
                    found = query_re.search(text)
                    if found is None:
                        # No word character to use as a lemma: there is
                        # nothing to look up, hence no relation to emit.
                        continue
                    lemma, uri, morpho = found.groups()
                    if uri is not None:
                        lemmas = LWN.lemmas_by_uri(uri)
                    else:
                        lemmas = LWN.lemmas(lemma=lemma, morpho=morpho)
                    if t.reltype in lemma_relations:
                        results = lemmas.relations
                    else:
                        results = lemmas.synsets_relations
                    if results:
                        for result in results:
                            wanted = relation_types[t.reltype]
                            relations = result["relations"]
                            if wanted in relations:
                                for relation in relations[wanted]:
                                    t.text = f"{relation['lemma']}:{relation['uri']}={relation['morpho']}"
                                    yield t
                else:
                    # query may be provided as lemma:uri=morpho
                    anchored = query_re.match(text)
                    if anchored is not None and all(anchored.groups()):
                        t.text = text
                        yield t
                        continue

                    found = (anchored if anchored is not None
                             else query_re.search(text))
                    if found is None:
                        # Nothing resembling a lemma: treat it like a lookup
                        # without results and pass the token through.
                        yield t
                        continue
                    lemma, uri, morpho = found.groups()
                    if uri is not None:
                        results = LWN.lemmas_by_uri(uri)
                    else:
                        results = LWN.lemmas(lemma=lemma, morpho=morpho)

                    if results:
                        for result in results:
                            t.text = f"{result['lemma']}:{result['uri']}={result['morpho']}"
                            yield t
                    else:
                        yield t