def make_lematizer():
    """Yield, per parsed document, a mapping from normalized form to the
    set of lemmata observed for that form.

    Iterates every XML root produced by ``get_data``; for each root,
    extracts the ``tok_anno`` annotations and groups lemmata by the
    token's normalized form. Roots that failed to parse (``None``) are
    skipped with a diagnostic print, matching the sibling builders.

    Yields:
        collections.defaultdict(set): normalized form -> {lemma, ...}
    """
    for root in get_data(DATA_DIRECTORY, parser):
        if root is None:
            print("no root")
            continue
        tokens = [
            extract_annotations(entry) for entry in root.findall(".//tok_anno")
        ]
        normalized_to_lemma = collections.defaultdict(set)
        # Guard on "norm" as well as "lemma": the original indexed
        # token["norm"] unconditionally, which raises KeyError for a
        # token annotated with a lemma but no normalized form (the
        # lemma-to-forms builder already guards both keys).
        for token in tokens:
            if "lemma" in token and "norm" in token:
                normalized_to_lemma[token["norm"]].add(token["lemma"])
        yield normalized_to_lemma
def make_pos_tagger_to_lemma():
    """Yield, per parsed document, a mapping from POS tag to the set of
    lemmata attested with that tag.

    Every POS value observed in the document gets a key — even when no
    co-occurring lemma exists, in which case the value is the empty set —
    matching the original dict-comprehension behavior.

    Yields:
        dict[str, set]: POS tag -> {lemma, ...}
    """
    for root in get_data(DATA_DIRECTORY, parser):
        if root is not None:
            tokens = [
                extract_annotations(entry)
                for entry in root.findall(".//tok_anno")
            ]
            # Pre-seed with every observed POS so tags without lemmata
            # keep an empty-set entry, then fill in a single pass over
            # tokens instead of one full rescan per POS tag (the
            # original comprehension was O(|pos_set| * |tokens|)).
            pos_to_lemmata = {
                pos: set() for pos in extract_by_tag("pos", tokens)
            }
            for token in tokens:
                if "lemma" in token and "pos" in token:
                    # assumes extract_by_tag("pos", ...) covers every
                    # token that has a "pos" key — TODO confirm
                    pos_to_lemmata[token["pos"]].add(token["lemma"])
            yield pos_to_lemmata
def make_norm_to_pos_tagger():
    """Yield, per parsed document, a mapping from normalized form to the
    set of POS tags attested with that form.

    Every normalized form observed in the document gets a key, with an
    empty set when no co-occurring POS tag exists.

    Yields:
        dict[str, set]: normalized form -> {POS tag, ...}
    """
    for root in get_data(DATA_DIRECTORY, parser):
        if root is not None:
            tokens = [
                extract_annotations(entry)
                for entry in root.findall(".//tok_anno")
            ]
            norm_set = set(extract_by_tag("norm", tokens))
            # Fixed filter: the original tested `"lemma" in token` (a
            # copy-paste from the pos->lemma builder) and then indexed
            # token["norm"] without checking it exists — risking a
            # KeyError and silently dropping norm/pos pairs that merely
            # lacked a lemma. The correct guard for a norm->pos map is
            # the presence of "norm" and "pos". Also replaced the
            # per-norm rescan with a single pass over tokens.
            norm_to_pos = {norm: set() for norm in norm_set}
            for token in tokens:
                if "norm" in token and "pos" in token:
                    norm_to_pos[token["norm"]].add(token["pos"])
            yield norm_to_pos
def make_lemma_to_pos_tagger():
    """Yield, per parsed document, a mapping from lemma to the set of
    POS tags attested with that lemma.

    Every lemma observed in the document gets a key, with an empty set
    when no co-occurring POS tag exists.

    Yields:
        dict[str, set]: lemma -> {POS tag, ...}
    """
    for root in get_data(DATA_DIRECTORY, parser):
        if root is not None:
            tokens = [
                extract_annotations(entry)
                for entry in root.findall(".//tok_anno")
            ]
            lemmata_set = set(extract_by_tag("lemma", tokens))
            # Removed the unused pos_tags/pos_set locals and replaced
            # the per-lemma rescan (O(|lemmata_set| * |tokens|)) with a
            # single pass; empty-set entries are preserved by seeding
            # the dict from lemmata_set first.
            lemmata_to_pos = {lemma: set() for lemma in lemmata_set}
            for token in tokens:
                if "lemma" in token and "pos" in token:
                    lemmata_to_pos[token["lemma"]].add(token["pos"])
            yield lemmata_to_pos
def make_lemma_to_forms():
    """Yield, per parsed document, a mapping from lemma to the set of
    normalized surface forms attested for it.

    Every lemma observed in the document gets a key, with an empty set
    when no token carries both that lemma and a normalized form —
    matching the original dict-comprehension behavior.

    Yields:
        dict[str, set]: lemma -> {normalized form, ...}
    """
    for root in get_data(DATA_DIRECTORY, parser):
        if root is not None:
            tokens = [
                extract_annotations(entry)
                for entry in root.findall(".//tok_anno")
            ]
            lemmata_set = set(extract_by_tag("lemma", tokens))
            # Single pass over tokens instead of one full rescan per
            # lemma (the original was O(|lemmata_set| * |tokens|)).
            lemma_to_normalized = {lemma: set() for lemma in lemmata_set}
            for token in tokens:
                if "norm" in token and "lemma" in token:
                    lemma_to_normalized[token["lemma"]].add(token["norm"])
            yield lemma_to_normalized
def read_xml_annotations():
    """Yield the text extracted (via ``read_text_from_root``) from each
    successfully parsed XML root; roots that failed to parse are skipped.
    """
    parsed_roots = get_data(DATA_DIRECTORY, parser)
    yield from (
        read_text_from_root(parsed) for parsed in parsed_roots if parsed is not None
    )