def __call__(
    self,
    value,
    positions=False,
    chars=False,
    keeporiginal=False,
    removestops=True,
    start_pos=0,
    start_char=0,
    mode="",
    **kwargs
):
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = CylleneusToken(
        positions, chars, removestops=removestops, mode=mode, **kwargs
    )
    t.text = value
    t.boost = 1.0
    if keeporiginal:
        t.original = value
    if positions:
        t.pos = start_pos + 1
    if chars:
        t.startchar = start_char
        t.endchar = start_char + len(value)
    yield t
def __call__( self, data, positions=True, chars=True, keeporiginal=True, removestops=True, tokenize=True, start_pos=0, start_char=0, mode="", **kwargs, ): if kwargs.get("docix", None) == self._docix and self._cache: yield from self.cache else: t = CylleneusToken(positions, chars, removestops=removestops, mode=mode, **kwargs) if t.mode == "query": t.original = t.text = data yield t else: self._cache = [] self._docix = kwargs.get("docix", None) if not tokenize: t.original = "" for token in data["text"].iter("token"): form = token.get("form") if not form: continue t.original += f"{form}" t.text = t.original t.boost = 1.0 if positions: t.pos = start_pos if chars: t.startchar = start_char t.endchar = start_char + len(t.original) yield t else: for sentence in data["text"].iter("sentence"): sect_pos = -1 curr_line = None for pos, token in enumerate(sentence.iter("word")): if token.get("artificial", False): continue form = token.get("form") if not form: continue t.text = form lemma = token.get("lemma") if not lemma or lemma in ( "???", ".", ",", ";", "·", "punc1", "comma1", "PERIOD1", ): continue t.lemma = lemma t.morpho = agldt2wn(token.get("postag")) t.morphosyntax = token.get("relation", None) t.boost = 1.0 meta = {"meta": data["meta"].lower()} divs = data["meta"].split("-") refs = (token.get("cite").rsplit( ":", maxsplit=1)[1].split(".")) for i, div in enumerate(divs): meta[div] = refs[i] meta["sent_id"] = sentence.get("id") meta["sent_pos"] = str(int(token.get("id"))) if curr_line and refs[-1] > curr_line: sect_pos = 0 else: sect_pos += 1 curr_line = refs[-1] meta["sect_pos"] = sect_pos # ref in line t.meta = meta if keeporiginal: t.original = f"{form}" t.stopped = False if positions: t.pos = start_pos + pos if chars: t.startchar = start_char original_len = len(form) t.endchar = start_char + original_len if self.cached: self._cache.append(copy.copy(t)) yield t start_char += len(form)
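All of the cached tokenizers in this module follow the same memoization pattern: token streams are keyed by the document index passed in as `docix`, and a repeated call for the same document replays the cached tokens instead of re-parsing the source. A standalone sketch of that pattern, with illustrative names rather than the project's own classes:

```python
import copy


class CachedTokenGenerator:
    """Illustrative docix-keyed cache, mirroring the pattern used above."""

    def __init__(self):
        self._docix = None   # document index of the cached run
        self._cache = None   # tokens yielded for that document

    def __call__(self, data, docix=None):
        # Replay cached tokens if the same document is requested again.
        if docix == self._docix and self._cache:
            yield from self._cache
            return
        self._cache = []
        self._docix = docix
        for item in self._tokenize(data):
            # Store a copy: the real tokenizers reuse one mutable Token object.
            self._cache.append(copy.copy(item))
            yield item

    def _tokenize(self, data):
        yield from data.split()


tokenizer = CachedTokenGenerator()
first = list(tokenizer("arma virumque cano", docix=0))
again = list(tokenizer("ignored input", docix=0))   # served from the cache
assert first == again
```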
def __call__(self, data, positions=True, chars=True, keeporiginal=True, removestops=True, tokenize=True, start_pos=0, start_char=0, mode="", **kwargs): if kwargs.get("docix", None) == self._docix and self._cache: yield from self.cache else: t = CylleneusToken(positions, chars, removestops=removestops, mode=mode, **kwargs) if t.mode == "query": t.original = data t.text = data.translate(jvmap) yield t else: self._cache = [] self._docix = kwargs.get("docix", None) if tokenize: punc = str.maketrans("", "", string.punctuation) tags = data["meta"].split("-") meta = {"meta": data["meta"].lower()} meta.update({tag: "-" for tag in tags}) divs = ["div1", "div2", "div3", "div4", "div5"] sect_sent = 0 sect_pos = 0 sent_id = 0 pos = 0 for el in data["text"].find("text").find("body").iter(): if el.tag in divs: current_div_ix = divs.index(el.tag) meta[tags[current_div_ix]] = el.get("n", "-") sect_sent = 0 sect_pos = 0 elif el.tag in ["head", "p", "l"]: sent_id += 1 sect_sent += 1 if el.text: text = el.text else: text = "".join([ subel.text + subel.tail for subel in el.iter() if subel.tag != el.tag ]) subs = [ r"<note>(.*?)</note>", r'<sic corr="(\w+?)">\w+?</sic>', r'<reg orig="\w+?">(\w+?)</reg>', ] for sub in subs: text = re.sub(sub, "\1", text) tokens = word_tokenizer.word_tokenize(text) for i, token in enumerate(tokens): pos += 1 sect_pos += 1 t.text = (token.translate( punc).lower().translate(jvmap)) if not t.text or t.text in string.whitespace: start_char += 1 continue t.boost = 1.0 meta["sent_id"] = sent_id meta["sent_pos"] = i meta["sect_sent"] = sect_sent meta["sect_pos"] = sect_pos t.meta = copy.copy(meta) if keeporiginal: t.original = token t.stopped = False if positions: t.pos = start_pos + pos is_enclitic = False for enclitic in enclitics: if token.endswith(enclitic): if enclitic == "n": t.text = (token[:-len(enclitic)] + "s") t.startchar = start_char t.endchar = (start_char + len(token) - len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t t.text = "ne" t.startchar = start_char + len( token[:-len(enclitic)]) t.endchar = ( start_char + len(token[:-len(enclitic)]) + len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t elif enclitic == "ne": t.text = token[:-len(enclitic)] t.startchar = start_char t.endchar = start_char + ( len(token) - len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t t.text = "ne" t.startchar = start_char + len( token[:-len(enclitic)]) t.endchar = ( start_char + len(token[:-len(enclitic)]) + len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t elif enclitic == "st": if token.endswith("ust"): t.text = token[:-len(enclitic)] t.startchar = start_char t.endchar = (start_char + len( token[:-len(enclitic)]) - len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t t.text = "est" t.startchar = start_char + len( token[:-len(enclitic)]) t.endchar = (start_char + len( token[:-len(enclitic)]) + len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t else: t.text = token[:-len(enclitic)] t.startchar = start_char t.endchar = (start_char + len( token[:-len(enclitic)]) - len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t t.text = "est" t.startchar = start_char + len( token[:-len(enclitic)]) t.endchar = (start_char + len( token[:-len(enclitic)]) + len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t elif enclitic == "'s": t.text = token + "s" t.startchar = start_char 
t.endchar = start_char + len(token) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t t.text = "es" t.startchar = (start_char + len(token) + 1) t.endchar = (start_char + len(token) + len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t else: t.text = token[:-len(enclitic)] t.startchar = start_char t.endchar = start_char + len( token[:-len(enclitic)]) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t t.text = enclitic t.startchar = start_char + len( token[:-len(enclitic)]) t.endchar = ( start_char + len(token[:-len(enclitic)]) + len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t is_enclitic = True break if not is_enclitic: if chars: t.startchar = start_char original_len = len(token) t.endchar = start_char + original_len if self.cached: self._cache.append(copy.copy(t)) yield t start_char += len(token) else: t.original = "" for el in data["text"].find("text").find("body").iter(): if el.tag in ["head", "p", "l"]: if el.text: text = el.text else: text = "".join([ subel.text + subel.tail for subel in el.iter() if subel.tag != el.tag ]) subs = [ r"<note>(.*?)</note>", r'<sic corr="(\w+?)">\w+?</sic>', r'<reg orig="\w+?">(\w+?)</reg>', ] for sub in subs: text = re.sub(sub, "\1", text) t.original += text t.text = t.original t.boost = 1.0 if positions: t.pos = start_pos if chars: t.startchar = start_char t.endchar = start_char + len(t.original) yield t
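The indexing branch above splits Latin enclitics (-ne, -n, -st, 's, plus a generic fallback) into a host token and a restored enclitic, each with its own character offsets. A minimal sketch of the core splitting idea, without the offset bookkeeping; the enclitic list and special cases here are simplified assumptions, not the project's `enclitics` constant:

```python
# Illustrative only: the real code also records start/end offsets and
# skips exception words that merely look enclitic-bearing.
ENCLITICS = ("que", "ne", "ve", "st")


def split_enclitic(token):
    """Return (host, enclitic) for an enclitic-bearing form, else (token, None)."""
    for enc in ENCLITICS:
        if token.endswith(enc) and len(token) > len(enc):
            host = token[:-len(enc)]
            if enc == "st":              # "laborest" -> "labore" + "est"
                return host, "est"
            return host, enc             # "-que", "-ne", "-ve"
    return token, None


assert split_enclitic("virumque") == ("virum", "que")
assert split_enclitic("laborest") == ("labore", "est")
assert split_enclitic("arma") == ("arma", None)
```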
def __call__( self, data, positions=True, chars=True, keeporiginal=True, removestops=True, tokenize=True, start_pos=0, start_char=0, mode="", **kwargs, ): if kwargs.get("docix", None) == self._docix and self._cache: yield from self.cache else: t = CylleneusToken(positions, chars, removestops=removestops, mode=mode, **kwargs) if t.mode == "query": t.original = data t.text = normalize("NFKC", data) yield t else: self._cache = [] self._docix = kwargs.get("docix", None) if tokenize: titleStmt = data.find('.//teiHeader').find( 'fileDesc').find('titleStmt') auth_code = f"tlg{titleStmt.find('tlgAuthor').text}" work_code = f"tlg{titleStmt.find('tlgId').text}" body = data.find('.//text').find('body') divs = AUTHOR_TAB[auth_code]["works"][work_code]["meta"] meta = {"meta": divs} divv = divs.split("-") for k in divv: meta[k] = None sect_sent = 0 sect_pos = 0 current_refs = None pos = 0 for sentence in body.iter("sentence"): refs = sentence.get("location") if refs != current_refs: current_refs = refs sect_pos = 0 sect_sent = 0 sent_id = sentence.get("id") sect_sent += 1 for i, ref in enumerate(refs.split(".")): meta[divv[i]] = ref for sent_pos, word in enumerate(sentence.iter("word")): t.boost = 1.0 sect_pos += 1 pos += 1 lemma = word.find("lemma").get("entry", None) t.lemma = normalize("NFKC", lemma) meta["sent_id"] = sent_id meta["sent_pos"] = word.get("id") meta["sect_pos"] = str(sect_pos) meta["sect_sent"] = str(sect_sent) t.meta = copy.copy(meta) beta = word.get("form").upper() form = normalize( "NFKC", beta2unicode( beta + "\n" if beta.endswith("S") else beta)) if (t.lemma.istitle()): form = form.title() t.text = form if keeporiginal: t.original = beta t.stopped = False if positions: t.pos = start_pos + pos original_len = len(form) if chars: t.startchar = start_char t.endchar = start_char + original_len start_char += original_len POS = word.find("lemma").get("POS", None) analyses = [ analysis.get("morph", None) for analysis in word.find("lemma").iter("analysis") ] morphos = [] for analysis in analyses: morphos += diorisis2wn(POS, analysis) t.morpho = " ".join(morphos) if self.cached: self._cache.append(copy.deepcopy(t)) yield t else: body = data.find('.//text').find('body') tokens = [] for sentence in body.iter("sentence"): for word in sentence.iter("word"): form = word.get("form") if not form: continue else: tokens.append(form) t.original = t.text = " ".join(tokens) t.boost = 1.0 if positions: t.pos = start_pos if chars: t.startchar = start_char t.endchar = start_char + len(t.original) yield t
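The Diorisis tokenizer converts each uppercase Beta Code form with `beta2unicode`, appending a newline when the form ends in "S", apparently so the converter renders a trailing sigma as word-final ς rather than medial σ. A toy converter, covering only a handful of letters, to illustrate that sigma distinction (this is not the project's converter):

```python
# Toy Beta Code mapping: just enough to show why final sigma needs a signal.
BETA_MAP = {"A": "α", "N": "ν", "D": "δ", "R": "ρ", "E": "ε", "S": "σ"}


def toy_beta2unicode(beta):
    out = []
    for i, ch in enumerate(beta):
        if ch == "S" and i == len(beta) - 1:
            out.append("ς")          # word-final sigma
        else:
            out.append(BETA_MAP.get(ch, ch))
    return "".join(out)


print(toy_beta2unicode("ANDRES"))    # -> 'ανδρες' (final sigma, no accents)
```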
def __call__( self, value, positions=True, chars=True, keeporiginal=True, removestops=True, tokenize=True, start_pos=0, start_char=0, mode="", **kwargs, ): if kwargs.get("docix") == self._docix and self._cache: yield from self.cache else: t = CylleneusToken(positions, chars, removestops=removestops, mode=mode, **kwargs) if t.mode == "query": t.original = t.text = value.translate(jvmap) yield t else: if not tokenize: t.original = t.text = value["text"] t.boost = 1.0 if positions: t.pos = start_pos if chars: t.startchar = start_char t.endchar = start_char + len(value["text"]) yield t else: self._cache = [] self._docix = kwargs.get("docix", None) word_tokenizer = PunktLatinCharsVars() stopchars = str.maketrans( "", "", string.punctuation.replace("&", "").replace("^", "") + "†“”—\n\ŕ", ) divs = { i: div.lower() for i, div in enumerate(value["meta"].split("-")) } lines = iter(value["text"].split("\n")) tpos = start_pos xtitle = ytitle = ztitle = speaker = "" buffer = deque() for line in lines: def parse_phi_line(_line): result = [] nonlocal xtitle, ytitle, ztitle, speaker, buffer try: ref, text = _line.split("\t") except ValueError: result.append((None, None)) else: v, w, x, y, z = ref.rstrip(".").split(".") offset = 0 # d is a number, followed by -, t, a then possibly another number or . for a title # d can be 'opinc' 'sedinc' 'dub', 'inc', # c can be 'Summ' if x == "t": xtitle = text.translate(stopchars).strip() if y == "t": if z: ytitle = text.translate( stopchars).strip() else: speaker = text.translate( stopchars).strip() result.append((None, [text])) elif z == "t": ztitle = text.translate(stopchars).strip() result.append((None, [text])) elif " {" in text: result.append((None, [text])) else: temp_tokens = word_tokenizer.word_tokenize( text) if temp_tokens: if (temp_tokens[0].replace( "j", "i").replace("v", "u") not in proper_names.proper_names): temp_tokens[0] = temp_tokens[ 0].lower() if (temp_tokens[-1].endswith(".") and temp_tokens[-1] != ". . ."): final_word = temp_tokens[-1][:-1] del temp_tokens[-1] temp_tokens += [final_word, "."] if temp_tokens[-1].endswith("-"): buffer += list( parse_phi_line(next(lines))) new_ref, new_tokens = buffer.pop() merged_word = ( "2&" + temp_tokens[-1][:-1] + new_tokens[0]) del temp_tokens[-1] temp_tokens += [merged_word] del new_tokens[0] if new_tokens: if (new_tokens[0] in string.punctuation): new_token = ( f"^1{new_tokens[0]}") del new_tokens[0] new_tokens.insert( 0, new_token) buffer.appendleft( (new_ref, new_tokens)) for ix, token in enumerate( temp_tokens): if temp_tokens[ix] == ". . .": temp_tokens.insert( ix + 1, "&1") if "&" in token: ppp = compound.is_ppp( re.sub( r"[&\d]", "", token)) else: ppp = compound.is_ppp(token) if ppp: if ix == len(temp_tokens) - 1: if not buffer: try: buffer += list( parse_phi_line( next(lines) )) except StopIteration: continue if "&" in buffer[0][1][0]: copula = compound.is_copula( buffer[0][1][0] [2:]) else: copula = compound.is_copula( buffer[0][1][0]) else: copula = compound.is_copula( temp_tokens[ix + 1]) if (copula and ppp[1] == copula[2]): ( tense, mood, number, i, ) = copula if buffer: token = f"{token} &2{compound.copula[tense][mood][number][i]}" else: token = f"{token} {compound.copula[tense][mood][number][i]}" del temp_tokens[ix] if buffer: del buffer[0][1][0] else: del temp_tokens[ix] temp_tokens.insert( ix, token) if (ix != len(temp_tokens) - 1): if (temp_tokens[ix + 1] in string. 
punctuation): new_token = f"^1{temp_tokens[ix + 1]} " del temp_tokens[ix + 1] temp_tokens.insert( ix + 1, new_token, ) if buffer: for i in range(len(buffer)): result.append(buffer.pop()) result.append( ((v, w, x, y, z), temp_tokens)) yield from result result = list(parse_phi_line(line)) act = scene = None for ref, tokens in reversed(result): enjambed = False if not ref and not tokens: start_char += len(line) + 1 continue elif not ref: text = tokens[0].strip().strip("{}") if re.match( r"[IVXLDMivxldm]+\.[IVXLDMivxldm]+", text): act, scene = text.split(".") act = str(roman_to_arabic(act)) scene = str(roman_to_arabic(scene)) start_char += len(line.split("\t")[1]) + 1 continue notoken = 0 skip = False for line_pos, token in enumerate(tokens): if token == "{" or token == "}": skip = not skip start_char += len(token) continue if skip: speaker = token.replace("v", "u") start_char += len(token) continue offset = 0 line_pos -= notoken meta = {} # extra['meta'] = value['meta'].lower() # setattr(t, 'meta', value['meta'].lower()) for i in range(len(divs)): meta[divs[len(divs) - (i + 1)]] = ref[-( 5 - (5 - (i + 1)))].strip("t") # setattr(t, divs[len(divs) - (i + 1)], ref[-(5 - (5 - (i + 1)))].strip('t')) if xtitle: if len(divs) >= 3: meta[ f"{divs[len(divs) - 3]}_title"] = xtitle # setattr(t, f"{divs[len(divs)-3]}_title", xtitle) if ytitle: if len(divs) >= 2: meta[ f"{divs[len(divs) - 2]}_title"] = ytitle # setattr(t, f"{divs[len(divs)-2]}_title", ytitle) if ztitle: if len(divs) >= 1: meta[ f"{divs[len(divs) - 1]}_title"] = ztitle # setattr(t, f"{divs[len(divs)-1]}_title", ztitle) if act: meta["act"] = act if scene: meta["scene"] = scene # if speaker: # t.speaker = speaker t.boost = 1.0 pre = re.search(r"^\^(\d+?)", token) if pre: start_char -= int(pre.group(1)) token = re.sub(r"^\^\d+?", "", token) pre = re.search(r"^&(\d+?)", token) if pre: start_char += int(pre.group(1)) token = re.sub(r"^&\d+?", "", token) if keeporiginal: t.original = token t.stopped = False original_length = len(token) ltoken = token.lstrip(string.punctuation) ldiff = original_length - len(ltoken) if ldiff != 0: token = ltoken rtoken = token.rstrip(string.punctuation) rdiff = len(token) - len(rtoken) if rdiff != 0: token = rtoken ntoken = token.translate(stopchars) ndiff = len(token) - len(ntoken) if ndiff: token = ntoken if not re.match( r"(?:[\d]&)?[\w]+\s(?:&[\d])?[\w]+", token): token = token.replace(" ", "") if not token: start_char += original_length notoken += 1 continue else: if positions: meta["line_pos"] = line_pos t.pos = tpos t.meta = meta if (token not in exceptions and token.lower() not in exceptions and re.sub(r"\d&|&\d", "", token) not in exceptions): if token in replacements: # t.original for subtoken in replacements[ token]: t.text = subtoken.lower() t.startchar = start_char t.endchar = (start_char + original_length) if mode == "index": if self.cached: self._cache.append( copy.copy(t)) yield t start_char += original_length tpos += 1 continue if re.match( r"(?:[\d]&)?[\w]+\s(?:&[\d])?[\w]+", token, ): ppp, copula = token.split(" ") post = re.match( r"([\d])&[\w]+", ppp) if post: offset += int(post.group(1)) ppp = re.sub(r"[\d]&", "", ppp) original_length -= 2 enjambed = True t.text = ppp.lower() t.startchar = start_char t.endchar = (start_char + len(ppp) + offset) if mode == "index": if self.cached: self._cache.append( copy.copy(t)) yield t pre = re.search(r"&(\d+?)", copula) if pre: start_char += int(pre.group(1)) copula = re.sub( r"&\d+?", "", copula) original_length -= 2 enjambed = True t.text = copula.lower() 
t.startchar = (start_char + len(ppp) + 1) t.endchar = (start_char + len(ppp) + 1 + len(copula)) if mode == "index": if self.cached: self._cache.append( copy.copy(t)) yield t start_char += original_length tpos += 1 continue else: post = re.match( r"([\d])&[\w]+", token) if post: offset += int(post.group(1)) token = re.sub( r"[\d]&", "", token) original_length -= 2 enjambed = True else: offset = 0 is_enclitic = False for enclitic in enclitics: if token.lower().endswith( enclitic): is_enclitic = True if enclitic == "ne": t.text = ( token[:-len(enclitic)] ).lower() t.startchar = start_char t.endchar = start_char + ( len(token) - len(enclitic)) if mode == "index": if self.cached: self._cache.append( copy.copy(t)) yield t t.text = "ne" t.startchar = ( start_char + len(token[:-len( enclitic)]) + offset) t.endchar = ( start_char + len(token[:-len( enclitic)]) + len(enclitic) + offset) if mode == "index": if self.cached: self._cache.append( copy.copy(t)) yield t elif enclitic == "n": t.text = ( token[:-len(enclitic)] + "s").lower() t.startchar = start_char t.endchar = ( start_char + (len(token) + 1) - len(enclitic)) if mode == "index": if self.cached: self._cache.append( copy.copy(t)) yield t t.text = "ne" t.startchar = ( start_char + len(token[:-len( enclitic)]) + offset) t.endchar = ( start_char + len(token[:-len( enclitic)]) + len(enclitic) + offset) if mode == "index": if self.cached: self._cache.append( copy.copy(t)) yield t elif enclitic == "st": if token.endswith("ust"): t.text = ( token[:-len( enclitic) + 1]).lower() t.startchar = ( start_char) t.endchar = ( start_char + len(token[:-len( enclitic) + 1]) - len(enclitic)) if mode == "index": if self.cached: self._cache.append( copy.copy( t)) yield t t.text = "est" t.startchar = ( start_char + len(token[:-len( enclitic) + 1]) + offset) t.endchar = ( start_char + len(token[:-len( enclitic) + 1]) + len(enclitic) + offset) if mode == "index": if self.cached: self._cache.append( copy.copy( t)) yield t else: t.text = (token[:-len( enclitic)] ).lower() t.startchar = ( start_char) t.endchar = ( start_char + len(token[:-len( enclitic)]) - len(enclitic)) if mode == "index": if self.cached: self._cache.append( copy.copy( t)) yield t t.text = "est" t.startchar = ( start_char + len(token[:-len( enclitic)]) + offset) t.endchar = ( start_char + len(token[:-len( enclitic)]) + len(enclitic) + offset) if mode == "index": if self.cached: self._cache.append( copy.copy( t)) yield t elif enclitic == "'s": t.text = (token.lower() + "s") t.startchar = start_char t.endchar = (start_char + len(token)) if mode == "index": if self.cached: self._cache.append( copy.copy(t)) yield t t.text = "es" t.startchar = (start_char + len(token) + 1) t.endchar = (start_char + len(token) + len(enclitic)) if mode == "index": if self.cached: self._cache.append( copy.copy(t)) yield t else: t.text = ( token[:-len(enclitic)] ).lower() t.startchar = start_char t.endchar = ( start_char + len(token[:-len( enclitic)])) if mode == "index": if self.cached: self._cache.append( copy.copy(t)) yield t t.text = enclitic t.startchar = ( start_char + len(token[:-len( enclitic)]) + offset) t.endchar = ( start_char + len(token[:-len( enclitic)]) + len(enclitic) + offset) if mode == "index": if self.cached: self._cache.append( copy.copy(t)) yield t break else: is_enclitic = False post = re.match(r"([\d])&[\w]+", token) if post: offset += int(post.group(1)) token = re.sub(r"[\d]&", "", token) original_length -= 2 enjambed = True if not is_enclitic: t.text = token if chars: t.startchar = start_char + ldiff 
t.endchar = (start_char + original_length - rdiff + offset) if mode == "index": if self.cached: self._cache.append( copy.copy(t)) yield t tpos += 1 if enjambed: start_char += original_length + offset else: start_char += original_length start_char += 1 # \n
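When a PHI line carries an act/scene heading such as "IV.ii", the tokenizer above converts each part with `roman_to_arabic`. That helper is defined elsewhere; a conventional implementation would look like this:

```python
ROMAN_VALUES = {"i": 1, "v": 5, "x": 10, "l": 50, "c": 100, "d": 500, "m": 1000}


def roman_to_arabic(numeral):
    """Convert a Roman numeral (any case) to an integer, e.g. 'iv' -> 4."""
    total = 0
    prev = 0
    for ch in reversed(numeral.lower()):
        value = ROMAN_VALUES[ch]
        # A smaller value preceding a larger one is subtractive (IV, IX, ...).
        total += value if value >= prev else -value
        prev = value
    return total


assert roman_to_arabic("IV") == 4
assert roman_to_arabic("ii") == 2
```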
def __call__(self, value, positions=True, chars=True, keeporiginal=True, removestops=True, tokenize=True, start_pos=0, start_char=0, mode="", **kwargs): if (kwargs.get("docix", None) == self._docix and self._cache is not None): yield from self.cache else: t = CylleneusToken(positions, chars, removestops=removestops, mode=mode, **kwargs) if t.mode == "query": t.original = t.text = value yield t else: if not tokenize: text = "\n".join([el for el in flatten(value["text"])]) t.original = t.text = text t.boost = 1.0 if positions: t.pos = start_pos if chars: t.startchar = start_char t.endchar = start_char + len(text) yield t else: self._cache = [] self._docix = kwargs.get("docix", None) tokenizer = word_tokenize stopchars = string.punctuation doc = value["text"] divs = [ cref.get("n") for cref in reversed( doc.findall( ".//{http://www.tei-c.org/ns/1.0}cRefPattern")) ] ss = doc.iter("{http://www.tei-c.org/ns/1.0}s") sect_sent = 0 sect_pos = 0 for n, s in enumerate(ss): meta = { "meta": "-".join(divs), "sent_id": s.get("{http://www.w3.org/XML/1998/namespace}id"), "sect_sent": sect_sent, "alignment": s.get("n"), } el = s j = 0 while el is not None: if el.getparent() is not None: if (el.getparent().get("type", None) == "textpart"): j -= 1 if (divs[j] in meta and el.getparent().get("n") != meta[divs[j]]): sect_sent = 0 sect_pos = 0 meta[divs[j]] = el.getparent().get("n") el = el.getparent() text = stringify(s) sent_pos = 0 for i, token in enumerate(tokenizer(text)): if token == " " or token in stopchars: sect_pos += 1 sent_pos += 1 continue t.boost = 1.0 if keeporiginal: t.original = token t.stopped = False meta["sent_pos"] = sent_pos meta["sect_pos"] = sect_pos if positions: t.pos = start_pos + sect_pos length = len(token) token = token.strip() if not token: start_char += length continue t.meta = copy.deepcopy(meta) t.text = token if chars: t.startchar = start_char t.endchar = start_char + length if mode == "index": self._cache.append(copy.deepcopy(t)) yield t start_char += length sect_pos += 1 sent_pos += 1 sect_sent += 1 start_char += 1
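The tokenizer above iterates TEI `<s>` elements by their fully qualified tag and climbs `getparent()` (an lxml feature) to recover the enclosing `textpart` divisions. A minimal sketch of the namespace-qualified iteration using only the standard library, which does not provide `getparent()`:

```python
import xml.etree.ElementTree as ET

TEI = "{http://www.tei-c.org/ns/1.0}"
XML_ID = "{http://www.w3.org/XML/1998/namespace}id"

sample = """
<TEI xmlns="http://www.tei-c.org/ns/1.0">
  <text><body>
    <div type="textpart" n="1">
      <s xml:id="s1" n="1.1">arma virumque cano</s>
    </div>
  </body></text>
</TEI>
"""

root = ET.fromstring(sample)
for s in root.iter(TEI + "s"):
    print(s.get(XML_ID), s.text)   # -> s1 arma virumque cano
```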
def __call__( self, data, positions=True, chars=True, keeporiginal=True, removestops=True, tokenize=True, start_pos=0, start_char=0, mode="", **kwargs, ): if kwargs.get("docix", None) == self._docix and self._cache: yield from self.cache else: t = CylleneusToken( positions, chars, removestops=removestops, mode=mode, **kwargs ) if t.mode == "query": t.original = data t.text = data.translate(jvmap) yield t else: self._cache = [] self._docix = kwargs.get("docix", None) if not tokenize: t.original = "" for el in data["text"].find("text").find("body").iter(): if el.tag in ["head", "p", "l"]: if not el.text: text = "".join( [ subel.text + subel.tail for subel in el.iter() if subel.tag != el.tag ] ) else: text = el.text subs = [ r"<note>(.*?)</note>", r'<sic corr="(\w+?)">\w+?</sic>', r'<reg orig="\w+?">(\w+?)</reg>', ] for sub in subs: text = re.sub(sub, "\1", text) t.original += text t.text = t.original t.boost = 1.0 if positions: t.pos = start_pos if chars: t.startchar = start_char t.endchar = start_char + len(t.original) yield t else: punc = str.maketrans("", "", string.punctuation) tags = data["meta"].lower() meta = {"meta": tags} if tags != "-": divs = data["meta"].split("-") meta.update({div: "-" for div in divs}) sect_sent = 0 sect_pos = 0 sent_id = 0 pos = 0 for el in ( data["text"] .find("{http://www.tei-c.org/ns/1.0}text") .find("{http://www.tei-c.org/ns/1.0}body") .findall(".//{http://www.tei-c.org/ns/1.0}*") ): if el.tag == "{http://www.tei-c.org/ns/1.0}milestone": meta[el.get("unit")] = el.get("n", "-") sect_sent = 0 sect_pos = 0 elif ( el.tag == "{http://www.tei-c.org/ns/1.0}div" and el.get("n") ): meta[el.get("type")] = el.get("n", "-") sect_sent = 0 sect_pos = 0 if not el.text: text = el.tail if el.tail else "" else: text = el.text + (el.tail if el.tail else "") subs = [ (r"<supplied>(.*?)</supplied>", "\1"), (r'<quote type="\w+?">(.+?)</quote>', "\1"), (r'<hi rend="\w+?">(.+?)</hi>', "\1"), (r'<g ref="\w+?">(.+?)</g>', "\1"), ( r'<foreign xml:lang="\w+?">(\w+?)</foreign>', "\1", ), (r"<del>.+?</del>", ""), ] for old, new in subs: text = re.sub(old, new, text) if text: for sentence in sent_tokenizer.tokenize(text): sent_id += 1 sect_sent += 1 sentence = sentence.strip() replacements = [(r"\n", ""), (r"\s+", " ")] for old, new in replacements: sentence = re.sub(old, new, sentence) sent_pos = 0 tokens = word_tokenizer.word_tokenize(sentence) for token in tokens: token = ( token.translate(punc) .lower() .translate(jvmap) .strip() ) if not token or token in string.whitespace: start_char += 1 continue else: pos += 1 sect_pos += 1 sent_pos += 1 t.text = token t.boost = 1.0 meta["sent_id"] = sent_id meta["sent_pos"] = sent_pos meta["sect_sent"] = sect_sent meta["sect_pos"] = sect_pos t.meta = copy.copy(meta) if keeporiginal: t.original = token t.stopped = False if positions: t.pos = start_pos + pos is_enclitic = False for enclitic in enclitics: if ( token.endswith(enclitic) and token not in latin_exceptions ): if enclitic == "ne": t.text = token[ : -len(enclitic) ] t.startchar = start_char t.endchar = start_char + ( len(token) - len(enclitic) ) if mode == "index": self._cache.append( copy.deepcopy(t) ) yield t t.text = "ne" t.startchar = ( start_char + len( token[ : -len( enclitic ) ] ) ) t.endchar = ( start_char + len( token[ : -len( enclitic ) ] ) + len(enclitic) ) if mode == "index": self._cache.append( copy.deepcopy(t) ) yield t elif enclitic == "n": t.text = ( token[: -len(enclitic)] + "s" ) t.startchar = start_char t.endchar = ( start_char + len(token) - len(enclitic) ) if mode == 
"index": self._cache.append( copy.deepcopy(t) ) yield t t.text = "ne" t.startchar = ( start_char + len( token[ : -len( enclitic ) ] ) ) t.endchar = ( start_char + len( token[ : -len( enclitic ) ] ) + len(enclitic) ) if mode == "index": self._cache.append( copy.deepcopy(t) ) yield t elif enclitic == "st": if token.endswith("ust"): t.text = token[ : -len(enclitic) ] t.startchar = ( start_char ) t.endchar = ( start_char + len( token[ : -len( enclitic ) ] ) - len(enclitic) ) if mode == "index": self._cache.append( copy.deepcopy( t ) ) yield t t.text = "est" t.startchar = ( start_char + len( token[ : -len( enclitic ) ] ) ) t.endchar = ( start_char + len( token[ : -len( enclitic ) ] ) + len(enclitic) ) if mode == "index": self._cache.append( copy.deepcopy( t ) ) yield t else: t.text = token[ : -len(enclitic) ] t.startchar = ( start_char ) t.endchar = ( start_char + len( token[ : -len( enclitic ) ] ) - len(enclitic) ) if mode == "index": self._cache.append( copy.deepcopy( t ) ) yield t t.text = "est" t.startchar = ( start_char + len( token[ : -len( enclitic ) ] ) ) t.endchar = ( start_char + len( token[ : -len( enclitic ) ] ) + len(enclitic) ) if mode == "index": self._cache.append( copy.deepcopy( t ) ) yield t elif enclitic == "'s": t.text = token + "s" t.startchar = start_char t.endchar = ( start_char + len(token) ) if mode == "index": self._cache.append( copy.deepcopy(t) ) yield t t.text = "es" t.startchar = ( start_char + len(token) + 1 ) t.endchar = ( start_char + len(token) + len(enclitic) ) if mode == "index": self._cache.append( copy.deepcopy(t) ) yield t else: t.text = token[ : -len(enclitic) ] t.startchar = start_char t.endchar = ( start_char + len( token[ : -len( enclitic ) ] ) ) if mode == "index": self._cache.append( copy.deepcopy(t) ) yield t t.text = enclitic t.startchar = ( start_char + len( token[ : -len( enclitic ) ] ) ) t.endchar = ( start_char + len( token[ : -len( enclitic ) ] ) + len(enclitic) ) if mode == "index": self._cache.append( copy.deepcopy(t) ) yield t is_enclitic = True break if not is_enclitic: original_len = len(token) if chars: t.startchar = start_char t.endchar = ( start_char + original_len ) if self.cached: self._cache.append( copy.copy(t) ) yield t start_char += len(token)
def __call__( self, data, positions=True, chars=True, keeporiginal=True, removestops=True, tokenize=True, start_pos=0, start_char=0, mode="", **kwargs, ): if kwargs.get("docix", None) == self._docix and self._cache: yield from self.cache else: t = CylleneusToken(positions, chars, removestops=removestops, mode=mode, **kwargs) if t.mode == "query": t.original = data t.text = data.translate(jvmap) yield t else: self._cache = [] self._docix = kwargs.get("docix", None) if tokenize: for sentence in data["text"].iter("sentence"): for pos, token in enumerate(sentence.iter("word")): if token.get("artificial", False): continue form = token.get("form") if form: form = form.replace(" ", " ").replace(" ", " ") form = re.sub(r"\.([^ ]|^$)", r". \1", form) else: continue lemma = token.get("lemma", None) if not lemma or lemma in ( ".", ",", "punc1", "comma1", "PERIOD1", ): continue t.lemma = lemma.strip("0123456789") t.morpho = agldt2wn(token.get("postag")) t.morphosyntax = token.get("relation", None) t.boost = 1.0 meta = {"meta": data["meta"].lower()} divs = data["meta"].split("-") for i, div in enumerate(divs): if len(divs) <= 2 or div != "line": meta[div] = sentence.get("subdoc").split( ".")[i] meta["sent_id"] = sentence.get("id") meta["sent_pos"] = token.get("id") t.meta = meta if keeporiginal: t.original = f"{form}" t.stopped = False if positions: t.pos = copy.copy(start_pos + pos) original_len = len(form) if (form.istitle() and pos == 0 and not t.lemma.istitle()): form = form.lower() t.text = form if chars: t.startchar = copy.copy(start_char) t.endchar = copy.copy(start_char + original_len) if self.cached: self._cache.append(copy.deepcopy(t)) yield t if form in editorial: t.text = editorial[form] if self.cached: self._cache.append(copy.copy(t)) yield t start_char += len(form) else: t.original = "" for token in data.iter("token"): form = token.get("form") if not form: continue t.original += f"{form}" t.text = t.original t.boost = 1.0 if positions: t.pos = start_pos if chars: t.startchar = start_char t.endchar = start_char + len(t.original) yield t
def __call__( self, data, positions=True, chars=True, keeporiginal=True, removestops=True, tokenize=True, start_pos=0, start_char=0, mode="", **kwargs, ): if kwargs.get("docix", None) == self._docix and self._cache: yield from self.cache else: t = CylleneusToken( positions, chars, removestops=removestops, mode=mode, **kwargs ) if t.mode == "query": t.original = data t.text = data.translate(jvmap) yield t else: if not tokenize: t.original = "" for token in data.iter("token"): form = token.get("form") if not form: continue after = token.get("presentation-after", "") before = token.get("presentation-before", "") t.original += f"{before}{form}{after}" t.text = t.original t.boost = 1.0 if positions: t.pos = start_pos if chars: t.startchar = start_char t.endchar = start_char + len(t.original) yield t else: self._cache = [] self._docix = kwargs.get("docix", None) for sentence in data["text"].iter("sentence"): for pos, token in enumerate(sentence.iter("token")): form = token.get("form") if not form: continue else: form = form.replace(" ", " ").replace(" ", " ") form = re.sub(r"\.([^ ]|^$)", r". \1", form) t.lemma = token.get("lemma") t.morpho = proiel2wn( token.get("part-of-speech"), token.get("morphology"), ) t.morphosyntax = token.get("relation", None) t.boost = 1.0 meta = {"meta": data["meta"].lower()} for i, div in enumerate(data["meta"].split("-")): meta[div] = token.get("citation-part").split( "." )[i] meta["sent_id"] = sentence.get("id") meta["sent_pos"] = token.get("id") t.meta = meta before = token.get("presentation-before", "") after = token.get("presentation-after", "") if keeporiginal: t.original = f"{before}{form}{after}" t.stopped = False if positions: t.pos = start_pos + pos original_len = len(form) if ( form.istitle() and pos == 0 and not t.lemma.istitle() ): form = form.lower() t.text = form if chars: t.startchar = start_char + len(before) t.endchar = ( start_char + len(before) + original_len ) self._cache.append(copy.deepcopy(t)) yield t if form in editorial: t.text = editorial[form] self._cache.append(copy.deepcopy(t)) yield t start_char += len(before) + len(form) + len(after)
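In the PROIEL tokenizer, each token's `citation-part` (for example "1.2.3") is split on dots and paired positionally with the division labels taken from `data["meta"]` (for example "book-chapter-section"). The pairing reduces to a `zip`:

```python
meta_spec = "book-chapter-section"        # illustrative value of data["meta"]
citation = "1.2.3"                        # illustrative token.get("citation-part")

meta = dict(zip(meta_spec.lower().split("-"), citation.split(".")))
print(meta)   # {'book': '1', 'chapter': '2', 'section': '3'}
```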
def __call__(
    self,
    value,
    positions=False,
    chars=False,
    keeporiginal=False,
    removestops=True,
    start_pos=0,
    start_char=0,
    tokenize=True,
    mode="",
    **kwargs
):
    """
    :param value: The unicode string to tokenize.
    :param positions: Whether to record token positions in the token.
    :param chars: Whether to record character offsets in the token.
    :param start_pos: The position number of the first token. For example,
        if you set start_pos=2, the tokens will be numbered 2,3,4,...
        instead of 0,1,2,...
    :param start_char: The offset of the first character of the first token.
        For example, if you set start_char=2, the text "aaa bbb" will have
        chars (2,5),(6,9) instead of (0,3),(4,7).
    :param tokenize: if True, the text should be tokenized.
    """
    assert isinstance(value, text_type), "%s is not unicode" % repr(value)

    t = CylleneusToken(
        positions, chars, removestops=removestops, mode=mode, **kwargs
    )
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    elif not self.gaps:
        # The default: expression matches are used as tokens
        for pos, match in enumerate(self.expression.finditer(value)):
            t.text = match.group(0)
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = start_pos + pos
            if chars:
                t.startchar = start_char + match.start()
                t.endchar = start_char + match.end()
            yield t
    else:
        # When gaps=True, iterate through the matches and
        # yield the text between them.
        prevend = 0
        pos = start_pos
        for match in self.expression.finditer(value):
            start = prevend
            end = match.start()
            text = value[start:end]
            if text:
                t.text = text
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                    pos += 1
                if chars:
                    t.startchar = start_char + start
                    t.endchar = start_char + end
                yield t
            prevend = match.end()

        # If the last "gap" was before the end of the text,
        # yield the last bit of text as a final token.
        if prevend < len(value):
            t.text = value[prevend:]
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = pos
            if chars:
                t.startchar = prevend
                t.endchar = len(value)
            yield t
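The two tokenizing branches above mirror each other: with `gaps=False` every regex match becomes a token, while with `gaps=True` the stretches of text between matches do. A quick demonstration with plain patterns:

```python
import re

value = "arma virumque cano"

# gaps=False: the matches themselves are the tokens.
expression = re.compile(r"\w+")
print([m.group(0) for m in expression.finditer(value)])
# ['arma', 'virumque', 'cano']

# gaps=True: the text between matches is the tokens.
sep = re.compile(r"\s+")
prevend, pieces = 0, []
for m in sep.finditer(value):
    pieces.append(value[prevend:m.start()])
    prevend = m.end()
pieces.append(value[prevend:])            # the final "gap" after the last match
print(pieces)                             # ['arma', 'virumque', 'cano']
```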
def __call__(
    self,
    value,
    positions=False,
    chars=False,
    keeporiginal=False,
    removestops=True,
    start_pos=0,
    start_char=0,
    tokenize=True,
    mode="",
    **kwargs
):
    """
    :param value: The unicode string to tokenize.
    :param positions: Whether to record token positions in the token.
    :param chars: Whether to record character offsets in the token.
    :param start_pos: The position number of the first token. For example,
        if you set start_pos=2, the tokens will be numbered 2,3,4,...
        instead of 0,1,2,...
    :param start_char: The offset of the first character of the first token.
        For example, if you set start_char=2, the text "aaa bbb" will have
        chars (2,5),(6,9) instead of (0,3),(4,7).
    :param tokenize: if True, the text should be tokenized.
    """
    assert isinstance(value, text_type), "%r is not unicode" % value

    t = CylleneusToken(
        positions, chars, removestops=removestops, mode=mode, **kwargs
    )
    if not tokenize:
        t.original = t.text = value
        t.boost = 1.0
        if positions:
            t.pos = start_pos
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
    else:
        text = u("")
        charmap = self.charmap
        pos = start_pos
        startchar = currentchar = start_char
        for char in value:
            tchar = charmap[ord(char)]
            if tchar:
                text += tchar
            else:
                if currentchar > startchar:
                    t.text = text
                    t.boost = 1.0
                    if keeporiginal:
                        t.original = t.text
                    if positions:
                        t.pos = pos
                        pos += 1
                    if chars:
                        t.startchar = startchar
                        t.endchar = currentchar
                    yield t
                startchar = currentchar + 1
                text = u("")
            currentchar += 1

        if currentchar > startchar:
            t.text = value[startchar:currentchar]
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            if positions:
                t.pos = pos
            if chars:
                t.startchar = startchar
                t.endchar = currentchar
            yield t
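Here every character is looked up in `self.charmap`; characters that map to the empty string act as separators, and everything else is normalized and accumulated into the current token. A small self-contained charmap built on the same principle (the real table presumably covers the full character range and folds accents as well):

```python
# Letters map to their lowercase form; everything else maps to "" and
# therefore acts as a separator, like an empty charmap entry above.
charmap = {cp: (chr(cp).lower() if chr(cp).isalpha() else "")
           for cp in range(0x250)}        # Latin ranges only, for the demo


def charset_tokenize(value, charmap):
    text = ""
    for char in value:
        mapped = charmap[ord(char)]
        if mapped:
            text += mapped
        elif text:
            yield text
            text = ""
    if text:
        yield text


print(list(charset_tokenize("Arma Virumque, cano.", charmap)))
# ['arma', 'virumque', 'cano']
```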
def __call__(self, value: dict, positions=False, chars=False, keeporiginal=True, removestops=True, tokenize=True, start_pos=0, start_char=0, mode="", **kwargs): if kwargs.get("docix", None) == self._docix and self._cache: yield from self.cache else: t = CylleneusToken(positions, chars, removestops=removestops, mode=mode, **kwargs) if t.mode == "query": t.original = value t.text = value.translate(jvmap) yield t else: if not tokenize: t.original = t.text = "\n".join( [el for el in value["text"]]) t.boost = 1.0 if positions: t.pos = start_pos if chars: t.startchar = start_char t.endchar = start_char + len(value["text"]) yield t else: self._cache = [] self._docix = kwargs.get("docix", None) punctuation = str.maketrans("", "", string.punctuation) editorial = str.maketrans("", "", "[{(<>)}]") added = re.compile(r"(\s?[<(][\w .]+[>)]\s?)") t.boost = 1.0 t.pos = t.startchar = t.endchar = 0 sect_sent = 0 # sentence count within passage sent_id = "0001" sect_pos = 0 # word pos within passage sent_pos = 0 # word pos within sentence current_refs = tuple(["0"] * len(value["meta"])) nflag = None for pos, line in enumerate(value["text"]): t.pos = pos parsed = parse_bpn(line) if not parsed: continue if int(parsed["sent_id"]) > int(sent_id): sent_pos = 0 sent_id = parsed["sent_id"] if (tuple([ alnum(i) for i in parsed["refs"].split(",") ]) > current_refs): sect_sent = 1 sect_pos = 0 else: sect_sent += 1 if keeporiginal: if added.search(parsed["form"]): t.original = added.sub("", parsed["form"]) else: t.original = parsed["form"] t.stopped = False if parsed["form_code"] in "&+": if parsed["lemma"] != "#": if parsed["lemma"] == "_SVM": t.morpho = None t.lemma = parsed["lemma"] t.lemma_n = parsed["lemma_n"] t.original = added.sub("", parsed["form"]) t.text = parsed["form"].translate( editorial) else: form = parsed["form"] t.morpho = parsed["morpho"] if " " in form: t.original = added.sub("", form) text = form.translate(editorial) else: t.original = form text = form t.lemma = parsed["lemma"] t.lemma_n = parsed["lemma_n"] if added.search(parsed["form"]): t.original = added.sub( "", parsed["form"]) t.text = text.translate(editorial) nflag = False else: # could be a Greek form, do we index it? t.morpho = "" t.lemma = "" t.lemma_n = "" t.original = added.sub("", parsed["form"]) t.text = parsed["form"].translate(editorial) elif parsed["form_code"] == "@": # combined forms if parsed["lemma"] != "#": t.lemma = parsed["lemma"] t.lemma_n = parsed["lemma_n"] t.text = parsed["form"].translate(editorial) t.morpho = parsed["morpho"] if nflag: sect_pos -= 1 sent_pos -= 1 else: nflag = True else: sent_pos += 1 sect_pos += 1 continue elif parsed["form_code"] == "=": # que t.text = parsed["form"].translate(editorial) t.lemma = parsed["lemma"] t.lemma_n = parsed["lemma_n"] t.morpho = parsed["morpho"] sent_pos -= 1 sect_pos -= 1 nflag = False meta = {"meta": value["meta"].lower()} tags = value["meta"].split("-") divs = {i: div.lower() for i, div in enumerate(tags)} refs = tuple([ ref.translate(punctuation) for ref in parsed["refs"].strip().split(",") ]) for i in range(len(divs)): meta[divs[i]] = refs[i] current_refs = refs t.morphosyntax = parsed["subord"] meta["sect_sent"] = str(sect_sent) meta["sect_pos"] = str(sect_pos) meta["sent_id"] = parsed["sent_id"] meta["sent_pos"] = str(sent_pos) t.meta = meta t.startchar = start_char t.endchar = start_char + len(t.original) if t.text != t.original: tc = copy.deepcopy(t) tc.text = t.original yield tc yield t sent_pos += 1 sect_pos += 1 start_char += len(t.original) + 1
def __call__( self, value, positions=True, chars=True, keeporiginal=True, removestops=True, tokenize=True, start_pos=0, start_char=0, mode="", **kwargs, ): if kwargs.get("docix", None) == self._docix and self._cache: yield from self.cache else: t = CylleneusToken(positions, chars, removestops=removestops, mode=mode, **kwargs) if t.mode == "query": t.original = t.text = value.translate(jvmap) yield t elif t.mode == "index": if not tokenize: t.original = t.text = "\n".join( [el for el in flatten(value["text"])]) t.boost = 1.0 if positions: t.pos = start_pos if chars: t.startchar = start_char t.endchar = start_char + len(t.original) yield t else: self._cache = [] self._docix = kwargs.get("docix", None) tokenizer = PunktLatinCharsVars() stopchars = str.maketrans("", "", string.punctuation + "“”—\n") divs = { i: div.lower() for i, div in enumerate(value["meta"].split("-")) } sect_sent = 0 prev_sect = 0 sect_pos = 0 for i, (path, text) in enumerate( nested_dict_iter(value["text"])): sent_id = i if len(path) >= 2 and int(path[-2]) > prev_sect: sect_sent = 0 sect_pos = 0 prev_sect = int(path[-2]) tokens = [] temp_tokens = tokenizer.word_tokenize(text) if temp_tokens: if (temp_tokens[0].replace("j", "i").replace("v", "u") not in proper_names.proper_names): temp_tokens[0] = temp_tokens[0] for ix, token in enumerate(temp_tokens): ppp = compound.is_ppp(token) if ppp and ix < len(temp_tokens) - 2: copula = compound.is_copula( temp_tokens[ix + 2]) # whitespace if copula and ppp[1] == copula[2]: tense, mood, number, i = copula token = f"{token} {compound.copula[tense][mood][number][i]}" del temp_tokens[ix + 1:ix + 3] tokens.insert(ix, token) else: tokens.append(token) else: tokens.append(token) pos = 0 sent_pos = 0 for token in tokens: meta = {"meta": value["meta"].lower()} for i in range(len(divs)): meta[divs[i]] = str(int(path[i]) + 1) t.boost = 1.0 if keeporiginal: t.original = token t.stopped = False token = convert_diphthongs( strip_diacritics(token)).translate(jvmap) if (token in (" ", "\n") or token in punctuation or token in stopchars): pos -= 1 else: pos += 2 if positions: t.pos = start_pos + pos original_length = len(token) token = token.strip() ltoken = token.lstrip(string.punctuation) ldiff = original_length - len(ltoken) if ldiff != 0: token = ltoken rtoken = token.rstrip(string.punctuation) rdiff = len(token) - len(rtoken) if rdiff != 0: token = rtoken ntoken = token.translate(stopchars) ndiff = len(token) - len(ntoken) if ndiff: token = ntoken if not token: start_char += original_length continue meta["sect_sent"] = sect_sent meta["sect_pos"] = sect_pos meta["sent_id"] = sent_id meta["sent_pos"] = sent_pos t.meta = meta is_enclitic = False if token not in exceptions: if t.original in replacements: for subtoken in replacements[t.original]: t.text = subtoken t.startchar = start_char t.endchar = (start_char + original_length) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t start_char += original_length continue if re.match(r"(?:\w+) (?:\w+)", token): ppp, copula = token.split(" ") t.text = ppp t.startchar = start_char t.endchar = start_char + len(ppp) + 1 if mode == "index": self._cache.append(copy.deepcopy(t)) yield t t.text = copula t.startchar = start_char + len(ppp) t.endchar = (start_char + len(ppp) + len(copula)) if mode == "index": self._cache.append(copy.deepcopy(t)) yield t start_char += original_length continue for enclitic in enclitics: if token.endswith(enclitic): if enclitic == "ne": t.text = token[:-len(enclitic)] t.startchar = start_char t.endchar = start_char 
+ ( len(token) - len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t t.text = "ne" t.startchar = start_char + len( token[:-len(enclitic)]) t.endchar = ( start_char + len(token[:-len(enclitic)]) + len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t elif enclitic == "n": t.text = (token[:-len(enclitic)] + "s") t.startchar = start_char t.endchar = (start_char + len(token) - len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t t.text = "ne" t.startchar = start_char + len( token[:-len(enclitic)]) t.endchar = ( start_char + len(token[:-len(enclitic)]) + len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t elif enclitic == "st": if token.endswith("ust"): t.text = token[:-len(enclitic)] t.startchar = start_char t.endchar = (start_char + len( token[:-len(enclitic)]) - len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t t.text = "est" t.startchar = start_char + len( token[:-len(enclitic)]) t.endchar = (start_char + len( token[:-len(enclitic)]) + len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t else: t.text = token[:-len(enclitic)] t.startchar = start_char t.endchar = (start_char + len( token[:-len(enclitic)]) - len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t t.text = "est" t.startchar = start_char + len( token[:-len(enclitic)]) t.endchar = (start_char + len( token[:-len(enclitic)]) + len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t elif enclitic == "'s": t.text = token + "s" t.startchar = start_char t.endchar = start_char + len(token) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t t.text = "es" t.startchar = (start_char + len(token) + 1) t.endchar = (start_char + len(token) + len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t else: t.text = token[:-len(enclitic)] t.startchar = start_char t.endchar = start_char + len( token[:-len(enclitic)]) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t t.text = enclitic t.startchar = start_char + len( token[:-len(enclitic)]) t.endchar = ( start_char + len(token[:-len(enclitic)]) + len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t is_enclitic = True break if not is_enclitic: t.text = token if chars: t.startchar = start_char + ldiff t.endchar = (start_char + original_length - rdiff) # - ndiff - rdiff if mode == "index": self._cache.append(copy.deepcopy(t)) yield t start_char += original_length sent_pos += 1 sect_pos += 1 start_char += 1
def __call__( self, value, positions=False, chars=False, keeporiginal=True, removestops=True, tokenize=True, start_pos=0, start_char=0, mode="", **kwargs ): if kwargs.get("docix", None) == self._docix and self._cache: yield from self.cache else: t = CylleneusToken( positions, chars, removestops=removestops, mode=mode, **kwargs ) if t.mode == "query": t.text = t.original = value yield t else: if not tokenize: lines = [] for line in value["text"]: line = re.sub(r"\t+", "\t", line.strip()) if line and line.startswith("# text_line"): text = line.split("# text_line: ")[1] lines.append(text) t.original = t.text = "\n".join([line for line in lines]) t.boost = 1.0 if positions: t.pos = start_pos if chars: t.startchar = start_char t.endchar = start_char + len(t.text) yield t else: self._cache = [] self._docix = kwargs.get("docix", None) t.boost = 1.0 t.pos = t.startchar = t.endchar = 0 meta = { "text": None, # work title "text_id": None, "chapter": None, # reference "chapter_id": None, "text_line": None, # the text "text_line_id": None, "text_line_counter": None, # line number "text_line_subcounter": None, # token number } sect_pos = 0 sent_pos = 0 for line in value["text"]: line = line.strip() if line: if line.startswith("#"): try: label, value = line.split(":", maxsplit=1) except ValueError: continue label = label.split(" ", maxsplit=1)[1].strip() value = value.strip() meta[label] = ( value if not value.isnumeric() else int(value) ) if label in [ "text_line_counter", "text_line_subcounter", ]: sent_pos = 0 else: try: ( ID, FORM, LEMMA, UPOS, XPOS, MORPHO, _, _, _, _, LEMMA_ID, PADA, SEM, ) = line.split("\t") except ValueError: try: ( ID, FORM, LEMMA, _, XPOS, _, _, _, _, LEMMA_ID, _, _, ) = line.split("\t") except ValueError: try: ( ID, FORM, _, _, _, _, _, _, _, _, ) = line.split("\t") except ValueError: continue else: t.original = FORM sect_pos += 1 sent_pos += 1 t.pos = sent_pos continue else: if FORM == "_": t.text = t.original else: sect_pos += 1 sent_pos += 1 t.text = FORM t.original = FORM t.pos = sent_pos t.lemma = LEMMA t.dcs_id = LEMMA_ID t.morphosyntax = XPOS t.morpho = None t.synset = None t.meta = { "meta": "chapter-line", "chapter": meta["chapter"], "line": meta["text_line_counter"], "sect_pos": sect_pos, "sect_sent": meta[ "text_line_counter" ], "sent_id": meta["text_line_id"], "sent_pos": sent_pos, } t.startchar = start_char t.endchar = start_char + len( t.original ) yield t # # Emit Devanagari # t.text = slp2deva(iast2slp(t.text)) # t.mode = "skip" # yield t start_char += len(t.original) + 1 else: if FORM == "_": t.text = t.original else: sect_pos += 1 sent_pos += 1 t.text = FORM t.original = FORM t.pos = sent_pos t.lemma = LEMMA t.dcs_id = LEMMA_ID t.morphosyntax = XPOS t.morpho = None if MORPHO == "_" or not MORPHO else parse_morpho(XPOS, MORPHO) t.synset = None if SEM == "_" else SEM t.meta = { "meta": "chapter-line", "chapter": meta["chapter"], "line": meta["text_line_counter"], "sect_pos": sect_pos, "sect_sent": meta[ "text_line_counter" ], "sent_id": meta["text_line_id"], "sent_pos": sent_pos, } t.startchar = start_char t.endchar = start_char + len(t.original) yield t start_char += len(t.original) + 1
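The DCS reader above distinguishes record layouts by how many tab-separated fields a data line has, falling back from a 13-column layout to 12- and 10-column ones. A compact sketch of the same dispatch; the field names follow the 13-column unpacking in the code, and the shorter layouts are assumed to be older export formats:

```python
def parse_dcs_line(line):
    """Return a dict of the fields worth indexing, or None for unusable lines."""
    fields = line.rstrip("\n").split("\t")
    if len(fields) == 13:
        ID, FORM, LEMMA, UPOS, XPOS, MORPHO, *_, LEMMA_ID, PADA, SEM = fields
        return {"form": FORM, "lemma": LEMMA, "xpos": XPOS,
                "morpho": MORPHO, "lemma_id": LEMMA_ID, "sem": SEM}
    if len(fields) == 12:
        ID, FORM, LEMMA, _, XPOS, *_, LEMMA_ID, _, _ = fields
        return {"form": FORM, "lemma": LEMMA, "xpos": XPOS,
                "lemma_id": LEMMA_ID}
    if len(fields) == 10:                 # plain CoNLL-U row: keep the form only
        return {"form": fields[1]}
    return None
```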
def __call__( self, value: str, positions=True, chars=True, keeporiginal=True, removestops=True, tokenize=True, start_pos=0, start_char=0, mode="", **kwargs, ): if kwargs.get("docix") == self._docix and self._cache: yield from self.cache else: t = CylleneusToken(positions, chars, removestops=removestops, mode=mode, **kwargs) stopchars = '!"#$%()*+,-—./:;<=>?@[\]^_`{|}~' punctuation = str.maketrans("", "", stopchars) if t.mode == "query": t.original = t.text = value.translate(jvmap) yield t else: if not tokenize: # Assume value is a list for pos, token in enumerate(value): t.original = t.text = token t.boost = 1.0 if positions: t.pos = pos if chars: t.startchar = start_char t.endchar = start_char + len(token) start_char += len(token) yield t else: self._cache = [] self._docix = kwargs.get("docix", None) work_pos = 0 for i, sentence in enumerate( sent_tokenizer.tokenize(value)): sent_pos = 0 for token in word_tokenizer.word_tokenize(sentence): if token in string.whitespace: start_char += 1 continue t.boost = 1.0 if keeporiginal: t.original = token original_length = len(token) t.stopped = False token = convert_diphthongs( strip_diacritics(token)).translate(jvmap) if token in stopchars: start_char += original_length continue t.text = token.translate(punctuation) if positions: t.pos = start_pos + work_pos if chars: t.startchar = start_char t.endchar = start_char + original_length t.meta = {"sent_id": i, "sent_pos": sent_pos} if mode == "index" and self.cached: self._cache.append(copy.copy(t)) yield t work_pos += 1 sent_pos += 1 start_char += original_length start_char += 1
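This plain-text tokenizer normalizes each token with `convert_diphthongs`, `strip_diacritics`, and the `jvmap` translation (j to i, v to u). Those helpers are defined elsewhere in the project; a plausible reading of the diacritic stripping, using standard Unicode decomposition, would be:

```python
import unicodedata

JVMAP = str.maketrans("jvJV", "iuIU")     # assumed equivalent of the project's jvmap


def strip_diacritics(token):
    # Decompose, then drop combining marks (macrons, accents, breathings).
    decomposed = unicodedata.normalize("NFD", token)
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))


print(strip_diacritics("vergilī").translate(JVMAP))   # -> uergili
```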
def __call__( self, value, positions=True, chars=True, keeporiginal=True, removestops=True, tokenize=True, start_pos=0, start_char=0, mode="", **kwargs, ): if (kwargs.get("docix", None) == self._docix and self._cache is not None): yield from self.cache else: t = CylleneusToken(positions, chars, removestops=removestops, mode=mode, **kwargs) if t.mode == "query": t.original = t.text = value.translate(jvmap) yield t else: if not tokenize: text = "\n".join([el for el in flatten(value["text"])]) t.original = t.text = text t.boost = 1.0 if positions: t.pos = start_pos if chars: t.startchar = start_char t.endchar = start_char + len(text) yield t else: self._cache = [] self._docix = kwargs.get("docix", None) tokenizer = PunktLatinCharsVars() stopchars = str.maketrans("", "", string.punctuation) doc = value["text"] divs = [ cref.get("n") for cref in doc.findall( ".//{http://www.tei-c.org/ns/1.0}cRefPattern") ] tei_base = "/tei:TEI/tei:text/tei:body/tei:div" # Prose divisions sentences = doc.xpath( tei_base + ("/tei:div" * len(divs)), namespaces={"tei": "http://www.tei-c.org/ns/1.0"}, ) # Fall back to poetry divisions if len(sentences) == 0: sentences = doc.xpath( tei_base + ("/tei:div" * (len(divs) - 1)) + "/tei:l", namespaces={"tei": "http://www.tei-c.org/ns/1.0"}, ) # Fall back to speaker divisions (plays) if len(sentences) == 0: sentences = doc.xpath( tei_base + ("/tei:div" * (len(divs) - 1)) + "/tei:sp/tei:l", namespaces={"tei": "http://www.tei-c.org/ns/1.0"}, ) for i, sentence in enumerate(sentences): meta = { "meta": "-".join(divs), divs[-1]: sentence.get("n"), "sent_id": i, } el = sentence j = -1 while el is not None: if el.getparent() is not None: if (el.getparent().get("type", None) == "textpart" or el.getparent().tag == "{http://www.tei-c.org/ns/1.0}sp"): if (el.getparent().tag == "{http://www.tei-c.org/ns/1.0}sp"): meta["speaker"] = (el.getparent().find( ".//{http://www.tei-c.org/ns/1.0}speaker" ).text) elif (el.getparent().get( "type", None) == "textpart"): j -= 1 meta[divs[j]] = el.getparent().get("n") el = el.getparent() text = sentence.text # If the text is not embedded in an XML node, use the 'text' attribute if not text: text = stringify(sentence) if not text: continue tokens = [] temp_tokens = tokenizer.word_tokenize(text.strip()) if temp_tokens: if (temp_tokens[0].replace("j", "i").replace("v", "u") not in proper_names.proper_names): temp_tokens[0] = temp_tokens[0] for ix, token in enumerate(temp_tokens): ppp = compound.is_ppp(token) if ppp and ix < len(temp_tokens) - 2: copula = compound.is_copula( temp_tokens[ix + 2]) # whitespace if copula and ppp[1] == copula[2]: tense, mood, number, i = copula token = f"{token} {compound.copula[tense][mood][number][i]}" del temp_tokens[ix + 1:ix + 3] tokens.insert(ix, token) else: tokens.append(token) else: tokens.append(token) pos = 0 for token in tokens: meta["sent_pos"] = pos t.boost = 1.0 if keeporiginal: t.original = token t.stopped = False token = convert_diphthongs( strip_diacritics(token)).translate(jvmap) if positions: t.pos = start_pos + pos if (token == " " or token in punctuation or token in stopchars): pos += 1 continue original_length = len(token) token = token.strip() ltoken = token.lstrip(string.punctuation) ldiff = original_length - len(ltoken) if ldiff != 0: token = ltoken rtoken = token.rstrip(string.punctuation) rdiff = len(token) - len(rtoken) if rdiff != 0: token = rtoken ntoken = token.translate(stopchars) ndiff = len(token) - len(ntoken) if ndiff: token = ntoken if not token: start_char += original_length continue 
t.meta = copy.deepcopy(meta) is_enclitic = False if token not in exceptions: if t.original in replacements: for subtoken in replacements[t.original]: t.text = subtoken t.startchar = start_char t.endchar = (start_char + original_length) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t start_char += original_length continue if re.match(r"(?:\w+) (?:\w+)", token): ppp, copula = token.split(" ") t.text = ppp t.startchar = start_char t.endchar = start_char + len(ppp) + 1 if mode == "index": self._cache.append(copy.deepcopy(t)) yield t t.text = copula t.startchar = start_char + len(ppp) t.endchar = (start_char + len(ppp) + len(copula)) if mode == "index": self._cache.append(copy.deepcopy(t)) yield t start_char += original_length continue for enclitic in enclitics: if token.endswith(enclitic): if enclitic == "ne": t.text = token[:-len(enclitic)] t.startchar = start_char t.endchar = start_char + ( len(token) - len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t t.text = "ne" t.startchar = start_char + len( token[:-len(enclitic)]) t.endchar = ( start_char + len(token[:-len(enclitic)]) + len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t elif enclitic == "n": t.text = (token[:-len(enclitic)] + "s") t.startchar = start_char t.endchar = (start_char + len(token) - len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t t.text = "ne" t.startchar = start_char + len( token[:-len(enclitic)]) t.endchar = ( start_char + len(token[:-len(enclitic)]) + len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t elif enclitic == "st": if token.endswith("ust"): t.text = token[:-len(enclitic)] t.startchar = start_char t.endchar = (start_char + len( token[:-len(enclitic)]) - len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t t.text = "est" t.startchar = start_char + len( token[:-len(enclitic)]) t.endchar = (start_char + len( token[:-len(enclitic)]) + len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t else: t.text = token[:-len(enclitic)] t.startchar = start_char t.endchar = (start_char + len( token[:-len(enclitic)]) - len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t t.text = "est" t.startchar = start_char + len( token[:-len(enclitic)]) t.endchar = (start_char + len( token[:-len(enclitic)]) + len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t elif enclitic == "'s": t.text = token + "s" t.startchar = start_char t.endchar = start_char + len(token) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t t.text = "es" t.startchar = (start_char + len(token) + 1) t.endchar = (start_char + len(token) + len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t else: t.text = token[:-len(enclitic)] t.startchar = start_char t.endchar = start_char + len( token[:-len(enclitic)]) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t t.text = enclitic t.startchar = start_char + len( token[:-len(enclitic)]) t.endchar = ( start_char + len(token[:-len(enclitic)]) + len(enclitic)) if mode == "index": self._cache.append( copy.deepcopy(t)) yield t is_enclitic = True break if not is_enclitic: t.text = token if chars: t.startchar = start_char + ldiff t.endchar = (start_char + original_length - rdiff) if mode == "index": self._cache.append(copy.deepcopy(t)) yield t start_char += original_length start_char += 1
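The sentence query above is built by appending `/tei:div` segments (and finally `/tei:l` or `/tei:sp/tei:l`) to a base XPath and evaluating it with the `tei` namespace prefix bound. A minimal reproduction of that namespace-prefixed query with lxml, over an illustrative document:

```python
from lxml import etree

NS = {"tei": "http://www.tei-c.org/ns/1.0"}
doc = etree.fromstring(b"""
<TEI xmlns="http://www.tei-c.org/ns/1.0">
  <text><body>
    <div type="textpart" n="1"><l n="1">arma virumque cano</l></div>
  </body></text>
</TEI>""")

# One textpart level deep, verse lines as the leaf unit.
lines = doc.xpath("/tei:TEI/tei:text/tei:body/tei:div/tei:l", namespaces=NS)
print([(l.get("n"), l.text) for l in lines])   # [('1', 'arma virumque cano')]
```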