def split_sent(parsed, validate=True):
    """Split a parsed token sequence into sentences at tokens whose misc
    column carries SentenceBreak=Yes, returning one TokenList per sentence."""
    if validate:
        extract_text(parsed, validate)
    result = []
    tokens = []
    for t in parsed:
        tokens.append(t)
        if 'misc' not in t or t['misc'] is None or 'SentenceBreak' not in t['misc']:
            continue
        assert 'Yes' == t['misc']['SentenceBreak']
        result.append(tokens)
        tokens = []
    if len(tokens):
        result.append(tokens)
    # Drop SpaceAfter from the last token of each sentence; clear misc entirely
    # if nothing else remains in it.
    for i in range(len(result)):
        r = result[i][-1]
        if 'misc' not in r or r['misc'] is None or 'SpaceAfter' not in r['misc']:
            continue
        del result[i][-1]['misc']['SpaceAfter']
        if not len(result[i][-1]['misc']):
            result[i][-1]['misc'] = None
    return [TokenList(r) for r in result]
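# A minimal usage sketch for split_sent (invented data, not from the source).
# validate=False sidesteps the extract_text dependency; the misc column carries
# the SentenceBreak=Yes markers that trigger the splits.
import conllu

_demo = ("1\tHi\thi\tINTJ\t_\t_\t0\troot\t_\t_\n"
         "2\t.\t.\tPUNCT\t_\t_\t1\tpunct\t_\tSentenceBreak=Yes\n"
         "3\tBye\tbye\tINTJ\t_\t_\t0\troot\t_\t_\n"
         "4\t.\t.\tPUNCT\t_\t_\t3\tpunct\t_\tSentenceBreak=Yes|SpaceAfter=No\n")
_sents = split_sent(conllu.parse(_demo)[0], validate=False)
assert len(_sents) == 2  # split after each SentenceBreak=Yes token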
def get_treebank_annotations(urn: str) -> List[TokenList]:
    """ Retrieves annotations from a treebank. """
    cc: CustomCorpus = next(x for x in CustomCorpusService.custom_corpora if x.corpus.source_urn in urn)
    annotations: List[TokenList] = []
    file_name: str = ntpath.basename(cc.file_path)
    cache_file_path: str = os.path.join(Config.TREEBANKS_CACHE_DIRECTORY, file_name + ".json")
    if os.path.exists(cache_file_path):
        try:
            annotations = [TokenList(tokens=x["tokens"], metadata=x["metadata"])
                           for x in json.loads(FileService.get_file_content(cache_file_path))]
        except ValueError:
            pass
    if not annotations:
        annotations = conllu.parse(FileService.get_file_content(cc.file_path))
        # need to cache the result because the syntax parser is so slow;
        # json.dumps needs a list here, since a bare generator is not JSON-serializable
        with open(cache_file_path, "w+") as f:
            f.write(json.dumps([dict(tokens=x.tokens, metadata=x.metadata) for x in annotations]))
    if cc.corpus.source_urn != urn:
        # the given URN points to a sub-graph, so we make a selection from our annotations
        annotations = CustomCorpusService.get_treebank_sub_annotations(urn, annotations, cc)
    # add an artificial punctuation sign at the end of each sentence
    for sent in annotations:
        sent.metadata["urn"] = ":".join(urn.split(":")[:-1] + [sent.tokens[0]["misc"]["ref"]])
        if sent.tokens[-1]["form"] != ".":
            # default to None so a sentence without an explicit root does not raise StopIteration
            root_token: OrderedDict = next((x for x in sent.tokens if x[Config.AQL_DEPREL] == "root"), None)
            sent.append(OrderedDict([
                ("id", sent.tokens[-1]["id"] + 1),
                ("form", "."),
                ("lemma", "."),
                ("upostag", "PUNCT"),
                ("xpostag", None),
                ("feats", None),
                ("head", root_token["id"] if root_token else 0),
                ("deps", None),
                ("misc", OrderedDict([("ref", sent.tokens[0]["misc"]["ref"])]))
            ]))
    # add root dependencies as separate node annotations so we can search for them later
    for token_list in annotations:
        for token in token_list:
            if token["head"] == 0:
                token["deps"] = token[Config.AQL_DEPREL]
    return annotations
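# Standalone sketch of the cache round-trip above (file names invented; assumes
# the conllu package and simple sentences without multiword-token ids): a
# TokenList survives a JSON dump/load because only its tokens and metadata
# need to be stored.
import json
import conllu
from conllu import TokenList

_sents = conllu.parse(open("corpus.conllu").read())  # hypothetical input file
with open("corpus.conllu.json", "w") as _f:
    json.dump([dict(tokens=list(s), metadata=dict(s.metadata)) for s in _sents], _f)
with open("corpus.conllu.json") as _f:
    _cached = [TokenList(tokens=x["tokens"], metadata=x["metadata"]) for x in json.load(_f)]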
def read_conllu_file(file_path: str, seg_threshold: bool = True) -> List[TokenList]:
    document = []
    with open(file_path, "r") as file:
        contents = file.read()
    tokenlists = parse(contents)
    if len(tokenlists) == 0:
        print(f"WARNING: {file_path} is empty--likely conversion error.")
        return []
    for annotation in tokenlists:
        m = annotation.metadata
        if seg_threshold and "segmentation" in m and m["segmentation"] not in ["checked", "gold"]:
            print("Skipping " + file_path + " because its segmentation is not checked or gold.")
            return []
        if len(annotation) > 200:
            logger.info(f"Breaking up huge sentence in {file_path} with length {len(annotation)} "
                        f"into chunks of 200 norms")
            # Slice the remaining tokens, not the current chunk; otherwise everything
            # after the first 200 tokens would be silently dropped.
            remaining = annotation
            while len(remaining) > 0:
                subannotation = TokenList(remaining[:200])
                subannotation.metadata = annotation.metadata.copy()
                document.append(subannotation)
                remaining = remaining[200:]
        else:
            document.append(annotation)
    return document
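# Hypothetical call (file name invented): read one document, accepting only
# sentences whose segmentation metadata is "checked" or "gold".
_doc = read_conllu_file("GUM_academic_art.conllu", seg_threshold=True)
for _sent in _doc:
    print(_sent.metadata.get("sent_id"), len(_sent))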
def compile_sent(sent):
    """compile a list of sublists, [word, head, full POS] format, into a CoNLL-U format sentence"""
    sent_list = list()
    for i, tok_data in enumerate(sent):
        tok_id = i + 1
        tok = tok_data[0]
        head = tok_data[1]  # "head" here is the dictionary headword, stored as the lemma
        pos = get_pos(tok_data[2])
        feats = get_feats(tok_data[2])
        compiled_tok = OrderedDict({'id': tok_id,
                                    'form': tok,
                                    'lemma': head,
                                    'upostag': pos,
                                    'xpostag': None,
                                    'feats': feats,
                                    'head': None,
                                    'deprel': None,
                                    'deps': None,
                                    'misc': None})
        sent_list.append(compiled_tok)
    return TokenList(sent_list).serialize()
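# Minimal sketch (invented tokens): each entry is [word, headword, full POS tag];
# assumes the module's get_pos/get_feats helpers understand this tag format.
print(compile_sent([["imchomarc", "imchomarc", "NOUN Neuter"],
                    ["inso", "inso", "DET Dem"]]))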
def handle_exercise_data(ed: ExerciseData, ctx_left: int, ctx_right: int) -> str:
    """ Constructs an SVG image (for POS and syntactic dependencies) from given annotations. """
    conllu_list: List[TokenList] = []
    dep_links: List[LinkMC] = [x for x in ed.graph.links
                               if x.annis_component_name == Config.GRAPHANNIS_DEPENDENCY_LINK]
    current_sentence_id: str = "-1"
    salt_id_to_conll_id_dict: Dict[str, str] = {}
    for node in ed.graph.nodes:
        new_sentence_id: str = str(AnnotationService.get_sentence_id(node))
        if new_sentence_id != current_sentence_id:
            # a new sentence starts: resolve the heads of the finished one first
            update_heads(conllu_list=conllu_list, salt_id_to_conll_id_dict=salt_id_to_conll_id_dict)
            conllu_list.append(TokenList(tokens=[], metadata=OrderedDict([("sent_id", new_sentence_id)])))
            current_sentence_id = new_sentence_id
            salt_id_to_conll_id_dict = {}
        relevant_link: LinkMC = next((x for x in dep_links if x.target == node.id), None)
        salt_id_to_conll_id_dict[node.id] = str(len(conllu_list[-1].tokens) + 1)
        conllu_list[-1].tokens.append({
            "id": salt_id_to_conll_id_dict[node.id],
            "form": node.annis_tok,
            "lemma": node.udep_lemma,
            "upostag": node.udep_upostag,
            "xpostag": node.udep_xpostag,
            "feats": node.udep_feats,
            "head": "0" if relevant_link is None else relevant_link.source,
            Config.AQL_DEPREL: "root" if (relevant_link is None or not hasattr(relevant_link, "udep_deprel"))
            else relevant_link.udep_deprel,
            "deps": None,
            "misc": None
        })
    update_heads(conllu_list=conllu_list, salt_id_to_conll_id_dict=salt_id_to_conll_id_dict)
    conllu_string: str = "".join(x.tokens.serialize() for x in conllu_list)
    # generate temp file
    (handle, tmp_file_path) = mkstemp(suffix=".conllu")
    with open(tmp_file_path, "w+") as f:
        f.write(conllu_string)
    conllu2svg_path: str = Config.CONLLU2SVG_PATH_OSX if platform == Config.PLATFORM_MACOS \
        else Config.CONLLU2SVG_PATH_LINUX
    html_bytes: bytes = subprocess.check_output("{0} {1}".format(conllu2svg_path, tmp_file_path), shell=True)
    os.remove(tmp_file_path)
    html_string: str = html_bytes.decode('utf-8')
    svg_list: List[str] = re.findall(r"<svg[\s\S]*?svg>", html_string)
    ret_val: str = "".join(svg_list) + "<br>"
    # strip inline JavaScript event handlers from the generated SVG
    ret_val = re.sub(r' onclick=".*?"', "", ret_val)
    ret_val = re.sub(r' onmouseover=".*?"', "", ret_val)
    ret_val = re.sub(r' onmouseout=".*?"', "", ret_val)
    return highlight_targets(ret_val, ctx_left, ctx_right, ed)
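# Isolated sketch of the SVG cleanup step above (sample markup invented):
# inline JS event handlers are removed before the SVG is returned to the client.
import re

_svg = '<svg onclick="go()" onmouseover="hl()"><text>root</text></svg>'
for _attr in ("onclick", "onmouseover", "onmouseout"):
    _svg = re.sub(r' {0}=".*?"'.format(_attr), "", _svg)
assert _svg == '<svg><text>root</text></svg>'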
def conllu_parse(json_file, tok_style=2, tagged_only=True):
    """takes the Wb. Glosses from the .json file and changes their format to match
       the Sg. Glosses from the .conllu file"""
    # Extract data from the JSON file format
    parse_list = list()
    for level_0 in json_file:
        fol = level_0.get("folios")
        for level_1 in fol:
            fol_col = level_1.get("folio")
            glosses = level_1.get("glosses")
            for level_2 in glosses:
                glossnum = level_2.get("glossNo")
                gloss_text = level_2.get("glossText")
                gloss_trans = level_2.get("glossTrans")
                gloss_hand = level_2.get("glossHand")
                tokens = level_2.get(f"glossTokens{tok_style}")
                # Check that glosses have been tagged before inclusion by ensuring they contain
                # at least one POS which does not appear in Latin/Greek-only glosses.
                if tagged_only:
                    vernacular_pos = ['ADJ', 'ADP', 'AUX', 'DET', 'NOUN', 'NUM',
                                      'PART', 'PRON', 'PROPN', 'SCONJ', 'VERB']
                    if [i for i in [tok[1] for tok in tokens] if i in vernacular_pos]:
                        parse_list.append([fol_col[3:] + glossnum, gloss_text, gloss_trans, gloss_hand,
                                           [[tok[0], tok[1], tok[2], tok[3]] for tok in tokens]])
                else:
                    parse_list.append([fol_col[3:] + glossnum, gloss_text, gloss_trans, gloss_hand,
                                       [[tok[0], tok[1], tok[2], tok[3]] for tok in tokens]])
    # Compile the data into CoNLL-U file format
    conllu_format = None
    for sentnum, sent in enumerate(parse_list):
        sent_id = sentnum + 1
        this_id = f'# sent_id = {sent_id}'
        gloss_id = sent[0]
        ref = f'# reference = {gloss_id}'
        sent_toks = sent[4]
        # Strip markup tags from the gloss text and its translation
        full_gloss = f'# text = {sent[1]}'
        translation = f'# translation = {sent[2]}'
        for tag in ["<em>", "</em>", "<sup>", "</sup>"]:
            full_gloss = "".join(full_gloss.split(tag))
            translation = "".join(translation.split(tag))
        hand = f'# scribe = {sent[3]}'
        meta = f'{this_id}\n{ref}\n{hand}\n{full_gloss}\n{translation}\n'
        sent_list = list()
        for i, tok_data in enumerate(sent_toks):
            tok_id = i + 1
            tok = tok_data[0]
            head = tok_data[2]
            if not head:
                head = "_"
            pos = tok_data[1]
            feats = tok_data[3]
            if pos in ["<Latin>", "<Latin CCONJ>", "<Greek>"] and (not feats or feats == "Foreign=Yes"):
                pos = "X"
                feats = "Foreign=Yes"
            elif pos in ["<Latin>", "<Latin CCONJ>"]:
                raise RuntimeError(f"Latin word found with features: {feats}")
            if feats:
                feats = feats.split("|")
                feats = OrderedDict({i.split("=")[0]: i.split("=")[1] for i in feats})
            compiled_tok = OrderedDict({'id': tok_id, 'form': tok, 'lemma': head,
                                        'upostag': pos, 'xpostag': None, 'feats': feats,
                                        'head': None, 'deprel': None, 'deps': None, 'misc': None})
            sent_list.append(compiled_tok)
        sent_list = TokenList(sent_list).serialize()
        if not conllu_format:
            conllu_format = meta + sent_list
        else:
            conllu_format = conllu_format + meta + sent_list
    conllu_format = conllu_format.strip("\n") + "\n"
    return parse(conllu_format)
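# Minimal, invented JSON fragment in the shape conllu_parse expects
# (folios -> glosses, with glossTokens2 entries of [form, POS, headword, feats]).
_demo_json = [{
    "folios": [{
        "folio": "f. 1a",
        "glosses": [{
            "glossNo": "1",
            "glossText": "inso",
            "glossTrans": "this",
            "glossHand": "A",
            "glossTokens2": [["inso", "DET", "inso", None]],
        }],
    }],
}]
print(conllu_parse(_demo_json)[0].serialize())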
for t in tokenlist:
    if isinstance(t["id"], tuple):
        # This is a contraction (multiword token): merge the UniMorph features
        # of its constituent tokens, then add it
        constituent_ids = list(range(t["id"][0], t["id"][2] + 1))
        t["um_feats"] = merge_attributes(
            [non_contracted_token_dict[x] for x in constituent_ids],
            _UNIMORPH_VALUES_ATTRIBUTE)
        final_tokens.append(t)
    elif t["id"] not in contracted_ids:
        # Keep this token only if it is not part of a contraction
        final_tokens.append(t)
final_tokens: TokenList = TokenList(final_tokens)

# Skip if this would have more than 512 subtokens
labelled_subwords = subword_tokenize(tokenizer, [t["form"] for t in final_tokens])
subtoken_indices, subtokens = zip(*labelled_subwords)
if len(subtokens) >= 512:
    if "subtoken_count" not in skipped:
        skipped["subtoken_count"] = 0
    skipped["subtoken_count"] += 1
    continue
if "total_sents" not in skipped:
    skipped["total_sents"] = 0
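# Context sketch (invented sentence): the conllu package parses a multiword
# token line like "1-2  du  ..." with id == (1, "-", 2), which is why the
# tuple's first and third elements bound the constituent range above.
import conllu

_mwt_sent = conllu.parse(
    "1-2\tdu\t_\t_\t_\t_\t_\t_\t_\t_\n"
    "1\tde\tde\tADP\t_\t_\t3\tcase\t_\t_\n"
    "2\tle\tle\tDET\t_\t_\t3\tdet\t_\t_\n"
    "3\tchat\tchat\tNOUN\t_\t_\t0\troot\t_\t_\n")[0]
assert _mwt_sent[0]["id"] == (1, "-", 2)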
def remove_foreign(sentence):
    """Divide glosses into separate sentences at points where non-vernacular words occur,
       return only Irish text"""
    # Create the metadata to be added to each sub-gloss
    meta = get_metadata(sentence)
    this_id = f'# sent_id = {meta.get("sent_id")}'
    ref = f'# reference = {meta.get("reference")}'
    full_gloss = f'# text = {meta.get("text")}'
    translation = f'# translation = {meta.get("translation")}'
    if meta.get("scribe"):
        hand = f'# scribe = {meta.get("scribe")}'
        meta = f'{this_id}\n{ref}\n{hand}\n{full_gloss}\n{translation}\n'
    else:
        meta = f'{this_id}\n{ref}\n{full_gloss}\n{translation}\n'

    def flush(split_sents, cur_sent):
        """If the compiled sentence contains at least one valid Irish word (not an
        exclusion), trim any trailing exclusions, prepend the metadata, and
        concatenate the serialized split with any preceding splits."""
        if [i for i in [[tok.get("lemma"), tok.get("upos"), tok.get("feats")]
                        for tok in cur_sent] if i not in exclusions]:
            # If the split ends with an invalid word (an exclusion), remove it from the end
            while [cur_sent[-1].get("lemma"), cur_sent[-1].get("upos"),
                   cur_sent[-1].get("feats")] in exclusions:
                cur_sent = cur_sent[:-1]
            if not split_sents:
                split_sents = meta + TokenList(cur_sent).serialize()
            else:
                split_sents = split_sents + meta + TokenList(cur_sent).serialize()
        return split_sents

    split_sents = None
    cur_sent = list()
    tok_id = 0
    # Check each token in the sentence to see whether it is non-vernacular
    for tok_num, tok_data in enumerate(sentence):
        tok_id += 1
        tok_data["id"] = tok_id
        feats = tok_data.get("feats")
        if feats:
            # If a non-vernacular word is found (other than Latin "et"),
            # flush any sentence compiled up to this point and drop the word
            if feats.get("Foreign") == "Yes" and tok_data.get("form") != "et":
                if cur_sent:
                    split_sents = flush(split_sents, cur_sent)
                    cur_sent = list()
                    tok_id = 0
            # If this is the last token in the sentence and it has features, add it to the current split
            elif tok_num == len(sentence) - 1:
                cur_sent.append(tok_data)
                split_sents = flush(split_sents, cur_sent)
                cur_sent = list()
                tok_id = 0
            else:
                cur_sent.append(tok_data)
        # If this is the last token in the sentence but it has no features, add it to the current split
        elif tok_num == len(sentence) - 1:
            cur_sent.append(tok_data)
            split_sents = flush(split_sents, cur_sent)
            cur_sent = list()
            tok_id = 0
        else:
            cur_sent.append(tok_data)
    if not split_sents:
        split_sents = []
    else:
        split_sents = split_sents.strip("\n") + "\n"
        split_sents = parse(split_sents)
    # Renumber each substring's sentence ID to separately identify any splits made
    for i, sub_sent in enumerate(split_sents):
        meta = get_metadata(sub_sent)
        meta["sent_id"] = f'{meta.get("sent_id")}.{i}'
    return split_sents
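# Hypothetical call (invented gloss; relies on the module-level get_metadata
# and exclusions definitions, assumed not to match these tokens): the Latin
# word is dropped and the remaining Irish text comes back renumbered.
import conllu

_demo_gloss = conllu.parse(
    "# sent_id = 1\n"
    "# reference = 1a1\n"
    "# text = amen ataat\n"
    "# translation = truly, they are\n"
    "1\tamen\tamen\tX\t_\tForeign=Yes\t0\troot\t_\t_\n"
    "2\tataat\tat·tá\tVERB\t_\t_\t0\troot\t_\t_\n")[0]
for _sub in remove_foreign(_demo_gloss):
    print(_sub.metadata["sent_id"], " ".join(t["form"] for t in _sub))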
def clean_punct(sentence):
    """Remove various forms of punctuation as necessary to split glosses into sentences"""
    # Check if any POS in the sentence identifies punctuation. If not, return the sentence unchanged.
    unknown_check = get_pos(sentence)
    if "PUNCT" not in unknown_check:
        return [sentence]
    # Create the metadata to be added to each sub-gloss
    meta = get_metadata(sentence)
    this_id = f'# sent_id = {meta.get("sent_id")}'
    ref = f'# reference = {meta.get("reference")}'
    full_gloss = f'# text = {meta.get("text")}'
    translation = f'# translation = {meta.get("translation")}'
    if meta.get("scribe"):
        hand = f'# scribe = {meta.get("scribe")}'
        meta = f'{this_id}\n{ref}\n{hand}\n{full_gloss}\n{translation}\n'
    else:
        meta = f'{this_id}\n{ref}\n{full_gloss}\n{translation}\n'
    # List punctuation types which can be removed without splitting a sentence,
    # and types which should be kept as tokens
    removables = [",", ":"]
    keepables = ["·"]

    def flush(split_sents, cur_sent):
        """If the compiled sentence contains at least one valid Irish word (not an
        exclusion), trim any trailing exclusions, prepend the metadata, and
        concatenate the serialized split with any preceding splits."""
        if [i for i in [[tok.get("lemma"), tok.get("upos"), tok.get("feats")]
                        for tok in cur_sent] if i not in exclusions]:
            # If the split ends with an invalid word (an exclusion), remove it from the end
            while [cur_sent[-1].get("lemma"), cur_sent[-1].get("upos"),
                   cur_sent[-1].get("feats")] in exclusions:
                cur_sent = cur_sent[:-1]
            if not split_sents:
                split_sents = meta + TokenList(cur_sent).serialize()
            else:
                split_sents = split_sents + meta + TokenList(cur_sent).serialize()
        return split_sents

    split_sents = None
    cur_sent = list()
    tok_id = 0
    # Check each token's POS in the sentence
    for tok_num, tok_data in enumerate(sentence):
        tok_id += 1
        tok_data["id"] = tok_id
        pos = tok_data.get("upos")
        lemma = tok_data.get("lemma")
        # If a POS identifies punctuation that splits a sentence,
        # flush any sentence compiled up to this point
        if pos == "PUNCT" and lemma not in removables and lemma not in keepables:
            if cur_sent:
                split_sents = flush(split_sents, cur_sent)
                cur_sent = list()
                tok_id = 0
        # If a POS identifies punctuation that can be removed without splitting a sentence
        elif pos == "PUNCT" and lemma in removables:
            if tok_num == len(sentence) - 1:
                split_sents = flush(split_sents, cur_sent)
                cur_sent = list()
                tok_id = 0
            else:
                tok_id -= 1
        # If this is the last token in the sentence, add it to the current split
        elif tok_num == len(sentence) - 1:
            cur_sent.append(tok_data)
            split_sents = flush(split_sents, cur_sent)
            cur_sent = list()
            tok_id = 0
        else:
            cur_sent.append(tok_data)
    if not split_sents:
        split_sents = []
    else:
        split_sents = split_sents.strip("\n") + "\n"
        split_sents = parse(split_sents)
    # Renumber each substring's sentence ID to separately identify any splits made
    for i, sub_sent in enumerate(split_sents):
        meta = get_metadata(sub_sent)
        meta["sent_id"] = f'{meta.get("sent_id")}.{i}'
    return split_sents
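# Hypothetical call (invented gloss; relies on the module-level get_pos,
# get_metadata and exclusions): the full stop splits the gloss into two
# sub-sentences, renumbered 2.0 and 2.1.
import conllu

_demo_punct = conllu.parse(
    "# sent_id = 2\n"
    "# reference = 1a2\n"
    "# text = ished ón . cechduine\n"
    "# translation = it is that . every person\n"
    "1\tished\tis\tVERB\t_\t_\t0\troot\t_\t_\n"
    "2\tón\tón\tPRON\t_\t_\t1\tobl\t_\t_\n"
    "3\t.\t.\tPUNCT\t_\t_\t1\tpunct\t_\t_\n"
    "4\tcechduine\tcach\tNOUN\t_\t_\t0\troot\t_\t_\n")[0]
for _sub in clean_punct(_demo_punct):
    print(_sub.metadata["sent_id"], " ".join(t["form"] for t in _sub))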
def annotate_sample(example_conllu: conllu.TokenList, lang: str,
                    use_triggers: bool = True) -> List[AnnotatedSample]:
    sent, metadata = parse_conllu(example_conllu.serialize())
    assert len(sent) == len(metadata) == 1
    sent = sent[0]
    _ = sent.pop(0)  # for internal use, we remove the stub root-node
    metadata = metadata[0]
    tokens = [node.get_conllu_field("form") for node in sent.values()]
    tags = [node.get_conllu_field("xpos") for node in sent.values()]
    lemmas = [node.get_conllu_field("lemma") for node in sent.values()]
    entities = get_entities(example_conllu)
    chunks = ["O"] * len(tokens)  # chunks - not interesting
    # create a networkX graph from the returned graph. one multiDi and one not - for later use.
    g = nx.Graph()
    mdg = nx.MultiDiGraph()
    for node in sent.values():
        for parent, label in node.get_new_relations():
            if parent.get_conllu_field("id") == 0:
                continue
            # shift ids by -1 so the graph nodes are 0-based, like the token lists
            g.add_node(parent.get_conllu_field("id") - 1, label=parent.get_conllu_field("form"))
            g.add_node(node.get_conllu_field("id") - 1, label=node.get_conllu_field("form"))
            g.add_edge(parent.get_conllu_field("id") - 1, node.get_conllu_field("id") - 1, label=label)
            mdg.add_node(parent.get_conllu_field("id") - 1, label=parent.get_conllu_field("form"))
            mdg.add_node(node.get_conllu_field("id") - 1, label=node.get_conllu_field("form"))
            mdg.add_edge(parent.get_conllu_field("id") - 1, node.get_conllu_field("id") - 1, label=label)
    # add an annotated sample to the list for each trigger on the path
    rel = example_conllu.metadata['relation']
    ann_samples = []
    trigger_toks = search_triggers(int(example_conllu.metadata['subj_start']),
                                   int(example_conllu.metadata['subj_end']),
                                   int(example_conllu.metadata['obj_start']),
                                   int(example_conllu.metadata['obj_end']),
                                   rel, tokens, lang) if use_triggers else []
    # The AnnotatedSample class expects exclusive ranges that start from 0
    trigger_tokens_fixed = []
    for trigger_range in trigger_toks:
        (start, end) = trigger_range
        trigger_tokens_fixed.append((start - 1, end))
    if len(trigger_tokens_fixed) == 0:
        trigger_tokens_fixed = [None]
    for trigger_tok in trigger_tokens_fixed:
        ann_samples.append(
            AnnotatedSample(
                " ".join(tokens), " ".join(tokens), rel,
                example_conllu.metadata['subj_type'].title(),
                example_conllu.metadata['obj_type'].title(),
                tokens, tags, entities, chunks, lemmas,
                # subtract 1 from start because AnnotatedSample expects a 0-based,
                # end-exclusive range, while the conllu spans are 1-based and inclusive
                (int(example_conllu.metadata['subj_start']) - 1, int(example_conllu.metadata['subj_end'])),
                (int(example_conllu.metadata['obj_start']) - 1, int(example_conllu.metadata['obj_end'])),
                trigger_tok, g, mdg))
    assert len(ann_samples) != 0
    return ann_samples
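# Worked note on the span conventions handled above (values invented): the
# conllu metadata spans are 1-based and inclusive, while AnnotatedSample wants
# 0-based, end-exclusive spans, so tokens 2..3 become the Python-style span (1, 3).
_subj_start, _subj_end = "2", "3"               # 1-based, inclusive metadata strings
_span = (int(_subj_start) - 1, int(_subj_end))  # 0-based, end-exclusive
assert _span == (1, 3)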