import copy
import re
import string
from unicodedata import normalize

# Module-level names assumed in scope below (project helpers, not stdlib):
# CylleneusToken, AUTHOR_TAB, agldt2wn, diorisis2wn, proiel2wn, parse_bpn,
# parse_morpho, beta2unicode, alnum, jvmap, editorial.


def __call__(
    self,
    data,
    positions=True,
    chars=True,
    keeporiginal=True,
    removestops=True,
    tokenize=True,
    start_pos=0,
    start_char=0,
    mode="",
    **kwargs,
):
    if kwargs.get("docix", None) == self._docix and self._cache:
        yield from self.cache
    else:
        t = CylleneusToken(
            positions, chars, removestops=removestops, mode=mode, **kwargs
        )
        if t.mode == "query":
            t.original = t.text = data
            yield t
        else:
            self._cache = []
            self._docix = kwargs.get("docix", None)

            if not tokenize:
                # Untokenized mode: emit the whole text as a single token.
                t.original = ""
                for token in data["text"].iter("token"):
                    form = token.get("form")
                    if not form:
                        continue
                    t.original += f"{form}"
                t.text = t.original
                t.boost = 1.0
                if positions:
                    t.pos = start_pos
                if chars:
                    t.startchar = start_char
                    t.endchar = start_char + len(t.original)
                yield t
            else:
                for sentence in data["text"].iter("sentence"):
                    sect_pos = -1
                    curr_line = None
                    for pos, token in enumerate(sentence.iter("word")):
                        if token.get("artificial", False):
                            continue
                        form = token.get("form")
                        if not form:
                            continue
                        t.text = form

                        # Skip tokens without a usable lemma (punctuation
                        # and unknowns).
                        lemma = token.get("lemma")
                        if not lemma or lemma in (
                            "???",
                            ".",
                            ",",
                            ";",
                            "·",
                            "punc1",
                            "comma1",
                            "PERIOD1",
                        ):
                            continue
                        t.lemma = lemma
                        t.morpho = agldt2wn(token.get("postag"))
                        t.morphosyntax = token.get("relation", None)
                        t.boost = 1.0

                        meta = {"meta": data["meta"].lower()}
                        divs = data["meta"].split("-")
                        # The citation follows the last colon of the cite
                        # attribute, as dot-separated references.
                        refs = (
                            token.get("cite")
                            .rsplit(":", maxsplit=1)[1]
                            .split(".")
                        )
                        for i, div in enumerate(divs):
                            meta[div] = refs[i]
                        meta["sent_id"] = sentence.get("id")
                        meta["sent_pos"] = str(int(token.get("id")))

                        # NB: string comparison of reference labels.
                        if curr_line and refs[-1] > curr_line:
                            sect_pos = 0
                        else:
                            sect_pos += 1
                        curr_line = refs[-1]
                        meta["sect_pos"] = sect_pos  # ref in line

                        t.meta = meta

                        if keeporiginal:
                            t.original = f"{form}"
                        t.stopped = False
                        if positions:
                            t.pos = start_pos + pos
                        if chars:
                            t.startchar = start_char
                            original_len = len(form)
                            t.endchar = start_char + original_len
                        if self.cached:
                            self._cache.append(copy.copy(t))
                        yield t
                        start_char += len(form)
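# How the citation metadata above is assembled, as a standalone sketch. The
# scheme string and cite value are invented samples; the real ones come from
# data["meta"] and the <word> element's cite attribute.
def _demo_cite_to_meta():
    meta_scheme = "book-line"
    cite = "urn:cts:greekLit:tlg0012.tlg001:1.33"
    divs = meta_scheme.split("-")
    refs = cite.rsplit(":", maxsplit=1)[1].split(".")
    return dict(zip(divs, refs))  # {'book': '1', 'line': '33'}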
def __call__(
    self,
    data,
    positions=True,
    chars=True,
    keeporiginal=True,
    removestops=True,
    tokenize=True,
    start_pos=0,
    start_char=0,
    mode="",
    **kwargs,
):
    if kwargs.get("docix", None) == self._docix and self._cache:
        yield from self.cache
    else:
        t = CylleneusToken(
            positions, chars, removestops=removestops, mode=mode, **kwargs
        )
        if t.mode == "query":
            t.original = data
            t.text = normalize("NFKC", data)
            yield t
        else:
            self._cache = []
            self._docix = kwargs.get("docix", None)

            if tokenize:
                # Author and work codes come from the TEI header; the
                # citation scheme for this work is looked up in AUTHOR_TAB.
                titleStmt = (
                    data.find(".//teiHeader")
                    .find("fileDesc")
                    .find("titleStmt")
                )
                auth_code = f"tlg{titleStmt.find('tlgAuthor').text}"
                work_code = f"tlg{titleStmt.find('tlgId').text}"
                body = data.find(".//text").find("body")

                divs = AUTHOR_TAB[auth_code]["works"][work_code]["meta"]
                meta = {"meta": divs}
                divv = divs.split("-")
                for k in divv:
                    meta[k] = None

                sect_sent = 0
                sect_pos = 0
                current_refs = None
                pos = 0
                for sentence in body.iter("sentence"):
                    refs = sentence.get("location")
                    if refs != current_refs:
                        # New passage: reset section counters.
                        current_refs = refs
                        sect_pos = 0
                        sect_sent = 0
                    sent_id = sentence.get("id")
                    sect_sent += 1
                    for i, ref in enumerate(refs.split(".")):
                        meta[divv[i]] = ref
                    for sent_pos, word in enumerate(sentence.iter("word")):
                        t.boost = 1.0
                        sect_pos += 1
                        pos += 1

                        lemma = word.find("lemma").get("entry", None)
                        t.lemma = normalize("NFKC", lemma)

                        meta["sent_id"] = sent_id
                        meta["sent_pos"] = word.get("id")
                        meta["sect_pos"] = str(sect_pos)
                        meta["sect_sent"] = str(sect_sent)
                        t.meta = copy.copy(meta)

                        # Beta code is uppercased before conversion; the
                        # trailing newline presumably marks word-final "S"
                        # so that the converter emits final sigma.
                        beta = word.get("form").upper()
                        form = normalize(
                            "NFKC",
                            beta2unicode(
                                beta + "\n" if beta.endswith("S") else beta
                            ),
                        )
                        if t.lemma.istitle():
                            form = form.title()
                        t.text = form
                        if keeporiginal:
                            t.original = beta
                        t.stopped = False
                        if positions:
                            t.pos = start_pos + pos
                        original_len = len(form)
                        if chars:
                            t.startchar = start_char
                            t.endchar = start_char + original_len
                        start_char += original_len

                        POS = word.find("lemma").get("POS", None)
                        analyses = [
                            analysis.get("morph", None)
                            for analysis
                            in word.find("lemma").iter("analysis")
                        ]
                        morphos = []
                        for analysis in analyses:
                            morphos += diorisis2wn(POS, analysis)
                        t.morpho = " ".join(morphos)

                        if self.cached:
                            self._cache.append(copy.deepcopy(t))
                        yield t
            else:
                body = data.find(".//text").find("body")
                tokens = []
                for sentence in body.iter("sentence"):
                    for word in sentence.iter("word"):
                        form = word.get("form")
                        if not form:
                            continue
                        tokens.append(form)
                t.original = t.text = " ".join(tokens)
                t.boost = 1.0
                if positions:
                    t.pos = start_pos
                if chars:
                    t.startchar = start_char
                    t.endchar = start_char + len(t.original)
                yield t
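# The NFKC normalization used throughout the method folds decomposed Greek
# (base letter plus combining accent) into precomposed codepoints, so equal
# forms compare equal. A minimal self-contained check; the sample string is
# invented:
def _demo_nfkc():
    from unicodedata import normalize
    # alpha + combining acute (two codepoints) -> alpha-with-tonos (one)
    assert normalize("NFKC", "\u03b1\u0301") == "\u03ac"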
def __call__(
    self,
    data,
    positions=True,
    chars=True,
    keeporiginal=True,
    removestops=True,
    tokenize=True,
    start_pos=0,
    start_char=0,
    mode="",
    **kwargs,
):
    if kwargs.get("docix", None) == self._docix and self._cache:
        yield from self.cache
    else:
        t = CylleneusToken(
            positions, chars, removestops=removestops, mode=mode, **kwargs
        )
        if t.mode == "query":
            t.original = data
            t.text = data.translate(jvmap)
            yield t
        else:
            self._cache = []
            self._docix = kwargs.get("docix", None)

            if tokenize:
                for sentence in data["text"].iter("sentence"):
                    for pos, token in enumerate(sentence.iter("word")):
                        if token.get("artificial", False):
                            continue
                        form = token.get("form")
                        if not form:
                            continue
                        # Normalize whitespace and force a space after any
                        # period fused to the following character.
                        form = form.replace(" ", " ").replace(" ", " ")
                        form = re.sub(r"\.([^ ]|^$)", r". \1", form)

                        lemma = token.get("lemma", None)
                        if not lemma or lemma in (
                            ".",
                            ",",
                            "punc1",
                            "comma1",
                            "PERIOD1",
                        ):
                            continue
                        t.lemma = lemma.strip("0123456789")
                        t.morpho = agldt2wn(token.get("postag"))
                        t.morphosyntax = token.get("relation", None)
                        t.boost = 1.0

                        meta = {"meta": data["meta"].lower()}
                        divs = data["meta"].split("-")
                        for i, div in enumerate(divs):
                            if len(divs) <= 2 or div != "line":
                                meta[div] = sentence.get("subdoc").split(
                                    "."
                                )[i]
                        meta["sent_id"] = sentence.get("id")
                        meta["sent_pos"] = token.get("id")
                        t.meta = meta

                        if keeporiginal:
                            t.original = f"{form}"
                        t.stopped = False
                        if positions:
                            t.pos = start_pos + pos
                        original_len = len(form)
                        # Lowercase sentence-initial titlecase forms unless
                        # the lemma itself is titlecase (proper noun).
                        if (
                            form.istitle()
                            and pos == 0
                            and not t.lemma.istitle()
                        ):
                            form = form.lower()
                        t.text = form
                        if chars:
                            t.startchar = start_char
                            t.endchar = start_char + original_len
                        if self.cached:
                            self._cache.append(copy.deepcopy(t))
                        yield t

                        # Editorial variants are emitted as extra tokens at
                        # the same position.
                        if form in editorial:
                            t.text = editorial[form]
                            if self.cached:
                                self._cache.append(copy.copy(t))
                            yield t
                        start_char += len(form)
            else:
                t.original = ""
                for token in data.iter("token"):
                    form = token.get("form")
                    if not form:
                        continue
                    t.original += f"{form}"
                t.text = t.original
                t.boost = 1.0
                if positions:
                    t.pos = start_pos
                if chars:
                    t.startchar = start_char
                    t.endchar = start_char + len(t.original)
                yield t
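# What the period-spacing substitution above does, in isolation: any period
# immediately followed by a non-space character gets a space inserted after
# it, separating abbreviations fused to the next word. The sample form is
# invented:
def _demo_period_spacing():
    import re
    assert re.sub(r"\.([^ ]|^$)", r". \1", "M.Tullius") == "M. Tullius"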
def __call__(
    self,
    data,
    positions=True,
    chars=True,
    keeporiginal=True,
    removestops=True,
    tokenize=True,
    start_pos=0,
    start_char=0,
    mode="",
    **kwargs,
):
    if kwargs.get("docix", None) == self._docix and self._cache:
        yield from self.cache
    else:
        t = CylleneusToken(
            positions, chars, removestops=removestops, mode=mode, **kwargs
        )
        if t.mode == "query":
            t.original = data
            t.text = data.translate(jvmap)
            yield t
        else:
            if not tokenize:
                t.original = ""
                for token in data.iter("token"):
                    form = token.get("form")
                    if not form:
                        continue
                    after = token.get("presentation-after", "")
                    before = token.get("presentation-before", "")
                    t.original += f"{before}{form}{after}"
                t.text = t.original
                t.boost = 1.0
                if positions:
                    t.pos = start_pos
                if chars:
                    t.startchar = start_char
                    t.endchar = start_char + len(t.original)
                yield t
            else:
                self._cache = []
                self._docix = kwargs.get("docix", None)

                for sentence in data["text"].iter("sentence"):
                    for pos, token in enumerate(sentence.iter("token")):
                        form = token.get("form")
                        if not form:
                            continue
                        form = form.replace(" ", " ").replace(" ", " ")
                        form = re.sub(r"\.([^ ]|^$)", r". \1", form)

                        t.lemma = token.get("lemma")
                        t.morpho = proiel2wn(
                            token.get("part-of-speech"),
                            token.get("morphology"),
                        )
                        t.morphosyntax = token.get("relation", None)
                        t.boost = 1.0

                        meta = {"meta": data["meta"].lower()}
                        for i, div in enumerate(data["meta"].split("-")):
                            meta[div] = token.get("citation-part").split(
                                "."
                            )[i]
                        meta["sent_id"] = sentence.get("id")
                        meta["sent_pos"] = token.get("id")
                        t.meta = meta

                        before = token.get("presentation-before", "")
                        after = token.get("presentation-after", "")
                        if keeporiginal:
                            t.original = f"{before}{form}{after}"
                        t.stopped = False
                        if positions:
                            t.pos = start_pos + pos
                        original_len = len(form)
                        if (
                            form.istitle()
                            and pos == 0
                            and not t.lemma.istitle()
                        ):
                            form = form.lower()
                        t.text = form
                        if chars:
                            t.startchar = start_char + len(before)
                            t.endchar = (
                                start_char + len(before) + original_len
                            )
                        self._cache.append(copy.deepcopy(t))
                        yield t

                        if form in editorial:
                            t.text = editorial[form]
                            self._cache.append(copy.deepcopy(t))
                            yield t
                        start_char += len(before) + len(form) + len(after)
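# Character offsets in the branch above skip the PROIEL "presentation-before"
# text and advance over before + form + after. A standalone sketch with
# invented values:
def _demo_proiel_offsets(start_char=0, before="(", form="Gallia", after=") "):
    startchar = start_char + len(before)                            # 1
    endchar = startchar + len(form)                                 # 7
    next_start = start_char + len(before) + len(form) + len(after)  # 9
    return startchar, endchar, next_start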
def __call__(
    self,
    value: dict,
    positions=False,
    chars=False,
    keeporiginal=True,
    removestops=True,
    tokenize=True,
    start_pos=0,
    start_char=0,
    mode="",
    **kwargs,
):
    if kwargs.get("docix", None) == self._docix and self._cache:
        yield from self.cache
    else:
        t = CylleneusToken(
            positions, chars, removestops=removestops, mode=mode, **kwargs
        )
        if t.mode == "query":
            t.original = value
            t.text = value.translate(jvmap)
            yield t
        else:
            if not tokenize:
                t.original = t.text = "\n".join(value["text"])
                t.boost = 1.0
                if positions:
                    t.pos = start_pos
                if chars:
                    t.startchar = start_char
                    t.endchar = start_char + len(t.original)
                yield t
            else:
                self._cache = []
                self._docix = kwargs.get("docix", None)

                punctuation = str.maketrans("", "", string.punctuation)
                editorial = str.maketrans("", "", "[{(<>)}]")
                added = re.compile(r"(\s?[<(][\w .]+[>)]\s?)")

                t.boost = 1.0
                t.pos = t.startchar = t.endchar = 0

                sect_sent = 0  # sentence count within passage
                sent_id = "0001"
                sect_pos = 0  # word pos within passage
                sent_pos = 0  # word pos within sentence
                current_refs = tuple(["0"] * len(value["meta"]))
                nflag = None
                for pos, line in enumerate(value["text"]):
                    t.pos = pos
                    parsed = parse_bpn(line)
                    if not parsed:
                        continue

                    if int(parsed["sent_id"]) > int(sent_id):
                        sent_pos = 0
                        sent_id = parsed["sent_id"]
                        if (
                            tuple(
                                alnum(i)
                                for i in parsed["refs"].split(",")
                            )
                            > current_refs
                        ):
                            sect_sent = 1
                            sect_pos = 0
                        else:
                            sect_sent += 1

                    if keeporiginal:
                        if added.search(parsed["form"]):
                            t.original = added.sub("", parsed["form"])
                        else:
                            t.original = parsed["form"]
                    t.stopped = False

                    if parsed["form_code"] in "&+":
                        if parsed["lemma"] != "#":
                            if parsed["lemma"] == "_SVM":
                                t.morpho = None
                                t.lemma = parsed["lemma"]
                                t.lemma_n = parsed["lemma_n"]
                                t.original = added.sub("", parsed["form"])
                                t.text = parsed["form"].translate(editorial)
                            else:
                                form = parsed["form"]
                                t.morpho = parsed["morpho"]
                                if " " in form:
                                    t.original = added.sub("", form)
                                    text = form.translate(editorial)
                                else:
                                    t.original = form
                                    text = form
                                t.lemma = parsed["lemma"]
                                t.lemma_n = parsed["lemma_n"]
                                if added.search(parsed["form"]):
                                    t.original = added.sub(
                                        "", parsed["form"]
                                    )
                                t.text = text.translate(editorial)
                            nflag = False
                        else:
                            # could be a Greek form, do we index it?
                            t.morpho = ""
                            t.lemma = ""
                            t.lemma_n = ""
                            t.original = added.sub("", parsed["form"])
                            t.text = parsed["form"].translate(editorial)
                    elif parsed["form_code"] == "@":  # combined forms
                        if parsed["lemma"] != "#":
                            t.lemma = parsed["lemma"]
                            t.lemma_n = parsed["lemma_n"]
                            t.text = parsed["form"].translate(editorial)
                            t.morpho = parsed["morpho"]
                            if nflag:
                                sect_pos -= 1
                                sent_pos -= 1
                            else:
                                nflag = True
                        else:
                            sent_pos += 1
                            sect_pos += 1
                            continue
                    elif parsed["form_code"] == "=":  # que
                        t.text = parsed["form"].translate(editorial)
                        t.lemma = parsed["lemma"]
                        t.lemma_n = parsed["lemma_n"]
                        t.morpho = parsed["morpho"]
                        sent_pos -= 1
                        sect_pos -= 1
                        nflag = False

                    meta = {"meta": value["meta"].lower()}
                    tags = value["meta"].split("-")
                    divs = {i: div.lower() for i, div in enumerate(tags)}
                    refs = tuple(
                        ref.translate(punctuation)
                        for ref in parsed["refs"].strip().split(",")
                    )
                    for i in range(len(divs)):
                        meta[divs[i]] = refs[i]
                    current_refs = refs

                    t.morphosyntax = parsed["subord"]

                    meta["sect_sent"] = str(sect_sent)
                    meta["sect_pos"] = str(sect_pos)
                    meta["sent_id"] = parsed["sent_id"]
                    meta["sent_pos"] = str(sent_pos)
                    t.meta = meta

                    t.startchar = start_char
                    t.endchar = start_char + len(t.original)
                    if t.text != t.original:
                        # Emit the unmodified original alongside the
                        # normalized text.
                        tc = copy.deepcopy(t)
                        tc.text = t.original
                        yield tc
                    yield t
                    sent_pos += 1
                    sect_pos += 1
                    start_char += len(t.original) + 1
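# The two translation tables built above, shown in isolation: one strips all
# punctuation from citation references, the other strips only editorial
# brackets from forms. Sample inputs are invented:
def _demo_translation_tables():
    import string
    punctuation = str.maketrans("", "", string.punctuation)
    editorial = str.maketrans("", "", "[{(<>)}]")
    assert "1,2.".translate(punctuation) == "12"
    assert "[quo]que".translate(editorial) == "quoque"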
def __call__(
    self,
    value,
    positions=False,
    chars=False,
    keeporiginal=True,
    removestops=True,
    tokenize=True,
    start_pos=0,
    start_char=0,
    mode="",
    **kwargs,
):
    if kwargs.get("docix", None) == self._docix and self._cache:
        yield from self.cache
    else:
        t = CylleneusToken(
            positions, chars, removestops=removestops, mode=mode, **kwargs
        )
        if t.mode == "query":
            t.text = t.original = value
            yield t
        else:
            if not tokenize:
                # Untokenized mode: concatenate the "# text_line" comments.
                lines = []
                for line in value["text"]:
                    line = re.sub(r"\t+", "\t", line.strip())
                    if line and line.startswith("# text_line"):
                        text = line.split("# text_line: ")[1]
                        lines.append(text)
                t.original = t.text = "\n".join(lines)
                t.boost = 1.0
                if positions:
                    t.pos = start_pos
                if chars:
                    t.startchar = start_char
                    t.endchar = start_char + len(t.text)
                yield t
            else:
                self._cache = []
                self._docix = kwargs.get("docix", None)

                t.boost = 1.0
                t.pos = t.startchar = t.endchar = 0
                meta = {
                    "text": None,                  # work title
                    "text_id": None,
                    "chapter": None,               # reference
                    "chapter_id": None,
                    "text_line": None,             # the text
                    "text_line_id": None,
                    "text_line_counter": None,     # line number
                    "text_line_subcounter": None,  # token number
                }
                sect_pos = 0
                sent_pos = 0
                for line in value["text"]:
                    line = line.strip()
                    if not line:
                        continue
                    if line.startswith("#"):
                        # Comment lines carry metadata, e.g. "# chapter: ...";
                        # "val" avoids shadowing the value parameter.
                        try:
                            label, val = line.split(":", maxsplit=1)
                        except ValueError:
                            continue
                        label = label.split(" ", maxsplit=1)[1].strip()
                        val = val.strip()
                        meta[label] = (
                            val if not val.isnumeric() else int(val)
                        )
                        if label in [
                            "text_line_counter",
                            "text_line_subcounter",
                        ]:
                            sent_pos = 0
                    else:
                        # Data lines come in 13-, 12-, or 10-column
                        # tab-separated layouts; try each in turn.
                        try:
                            (
                                ID,
                                FORM,
                                LEMMA,
                                UPOS,
                                XPOS,
                                MORPHO,
                                _,
                                _,
                                _,
                                _,
                                LEMMA_ID,
                                PADA,
                                SEM,
                            ) = line.split("\t")
                        except ValueError:
                            try:
                                (
                                    ID,
                                    FORM,
                                    LEMMA,
                                    _,
                                    XPOS,
                                    _,
                                    _,
                                    _,
                                    _,
                                    LEMMA_ID,
                                    _,
                                    _,
                                ) = line.split("\t")
                            except ValueError:
                                try:
                                    (
                                        ID,
                                        FORM,
                                        _,
                                        _,
                                        _,
                                        _,
                                        _,
                                        _,
                                        _,
                                        _,
                                    ) = line.split("\t")
                                except ValueError:
                                    continue
                                else:
                                    # Form only: advance counters, emit
                                    # nothing.
                                    t.original = FORM
                                    sect_pos += 1
                                    sent_pos += 1
                                    t.pos = sent_pos
                                    continue
                            else:
                                # 12-column layout: no morphology or
                                # semantics available.
                                if FORM == "_":
                                    t.text = t.original
                                else:
                                    sect_pos += 1
                                    sent_pos += 1
                                    t.text = FORM
                                    t.original = FORM
                                t.pos = sent_pos
                                t.lemma = LEMMA
                                t.dcs_id = LEMMA_ID
                                t.morphosyntax = XPOS
                                t.morpho = None
                                t.synset = None
                                t.meta = {
                                    "meta": "chapter-line",
                                    "chapter": meta["chapter"],
                                    "line": meta["text_line_counter"],
                                    "sect_pos": sect_pos,
                                    "sect_sent": meta["text_line_counter"],
                                    "sent_id": meta["text_line_id"],
                                    "sent_pos": sent_pos,
                                }
                                t.startchar = start_char
                                t.endchar = start_char + len(t.original)
                                yield t
                                # # Emit Devanagari
                                # t.text = slp2deva(iast2slp(t.text))
                                # t.mode = "skip"
                                # yield t
                                start_char += len(t.original) + 1
                        else:
                            # Full 13-column layout.
                            if FORM == "_":
                                t.text = t.original
                            else:
                                sect_pos += 1
                                sent_pos += 1
                                t.text = FORM
                                t.original = FORM
                            t.pos = sent_pos
                            t.lemma = LEMMA
                            t.dcs_id = LEMMA_ID
                            t.morphosyntax = XPOS
                            t.morpho = (
                                None
                                if MORPHO == "_" or not MORPHO
                                else parse_morpho(XPOS, MORPHO)
                            )
                            t.synset = None if SEM == "_" else SEM
                            t.meta = {
                                "meta": "chapter-line",
                                "chapter": meta["chapter"],
                                "line": meta["text_line_counter"],
                                "sect_pos": sect_pos,
                                "sect_sent": meta["text_line_counter"],
                                "sent_id": meta["text_line_id"],
                                "sent_pos": sent_pos,
                            }
                            t.startchar = start_char
                            t.endchar = start_char + len(t.original)
                            yield t
                            start_char += len(t.original) + 1
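# The nested try/except above amounts to a cascade over three line layouts
# (13, 12, or 10 tab-separated columns). The same idea written as a plain
# loop, with hypothetical column indices for form and lemma:
def _demo_split_dcs_line(line):
    for width, lemma_i in ((13, 2), (12, 2), (10, None)):
        cols = line.split("\t")
        if len(cols) == width:
            form = cols[1]
            lemma = cols[lemma_i] if lemma_i is not None else None
            return form, lemma
    return None, None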