def read_rogetmap(xml="roget_hierarchy.xml", verbose=True):
    """Parse the Roget hierarchy XML into a dictionary keyed by Roget head word.

    Return {headword: (subsection, section, class)} — i.e. the head word mapped
    to its ancestors, innermost level first.

    - xml: path to the Roget hierarchy XML file.
    - verbose: if True, log progress messages.
    """
    # cElementTree was removed in Python 3.9; plain ElementTree has been the
    # C-accelerated implementation since Python 3.3.
    import xml.etree.ElementTree as etree
    if verbose:
        util.log.info("Reading XML lexicon")
    lexicon = {}

    # Parse only "start" events: they fire in document order, so the current
    # class/section/subsection names are always set before any headword below
    # them is seen. (The original handled both "start" and "end" events, which
    # processed every element — and wrote every lexicon entry — twice.)
    context = etree.iterparse(xml, events=("start",))
    for _event, elem in context:
        if elem.tag == "class":
            l1 = elem.get("name")
        elif elem.tag == "section":
            l2 = elem.get("name")
        elif elem.tag == "subsection":
            l3 = elem.get("name")
        elif elem.tag == "headword":
            head = elem.get("name")
            lexicon[head] = (l3, l2, l1)

    testwords = ["Existence", "Health", "Amusement", "Marriage"]
    util.test_annotations(lexicon, testwords)
    if verbose:
        util.log.info("OK, read.")
    return lexicon
def read_blingbring(tsv="blingbring.txt", classmap="rogetMap.xml", verbose=True):
    """Read the tsv version of the Blingbring lexicon (blingbring.xml).

    Return a lexicon dictionary:
    {senseid: {roget_head: roget_head,
               roget_subsection: roget_subsection,
               roget_section: roget_section,
               roget_class: roget_class,
               bring: bring_ID}
    """
    rogetdict = read_rogetmap(xml=classmap, verbose=True)

    import csv

    if verbose:
        util.log.info("Reading tsv lexicon")
    lexicon = {}
    classmapping = {}

    with open(tsv) as f:
        for row in csv.reader(f, delimiter="\t"):
            # Comment lines start with "#"
            if row[0].startswith("#"):
                continue
            rogetid = row[1].split("/")[-1]
            # Look up the (subsection, section, class) levels for this Roget id;
            # unknown ids get empty strings for all three levels.
            hierarchy = rogetdict.get(rogetid)
            if hierarchy is not None:
                roget_l3, roget_l2, roget_l1 = hierarchy
            else:
                roget_l3 = roget_l2 = roget_l1 = ""

            for senseid in set(row[3].split(":")):
                lexicon.setdefault(senseid, set()).add(
                    (rogetid, roget_l3, roget_l2, roget_l1))

            # Make mapping between Roget and Bring classes
            if row[0].split("/")[1] == "B":
                classmapping[rogetid] = row[2]

    # Collapse each sense's tuples into per-level sets (empty levels dropped).
    for senseid, rogetids in lexicon.items():
        roget_head = {tup[0] for tup in rogetids}
        roget_subsection = {tup[1] for tup in rogetids if tup[1]}
        roget_section = {tup[2] for tup in rogetids if tup[2]}
        roget_class = {tup[3] for tup in rogetids if tup[3]}
        lexicon[senseid] = {
            "roget_head": roget_head,
            "roget_subsection": roget_subsection,
            "roget_section": roget_section,
            "roget_class": roget_class,
            "bring": {classmapping[r] for r in roget_head}
        }

    testwords = ["fågel..1", "behjälplig..1", "köra_ner..1"]
    util.test_annotations(lexicon, testwords)
    if verbose:
        util.log.info("OK, read")
    return lexicon
def read_swefn(xml='swefn.xml', verbose=True):
    """Read the XML version of the swedish Framenet resource.

    Return a lexicon dictionary, {saldoID: {swefnID}}.

    - xml: path to the SweFN XML file.
    - verbose: if True, log progress messages.
    """
    # cElementTree was removed in Python 3.9; plain ElementTree has been the
    # C-accelerated implementation since Python 3.3.
    import xml.etree.ElementTree as etree
    if verbose:
        util.log.info("Reading XML lexicon")
    lexicon = {}

    context = etree.iterparse(
        xml, events=("start", "end"))  # "start" needed to save reference to root element
    context = iter(context)
    _event, root = next(context)

    for event, elem in context:
        if event == "end":
            if elem.tag == 'LexicalEntry':
                sense = elem.find("Sense")
                sid = sense.get("id")
                # Strip the "swefn--" prefix. NOTE: str.lstrip("swefn--") would
                # strip the *character set* {s, w, e, f, n, -}, not the prefix,
                # mangling ids whose name starts with any of those letters.
                if sid.startswith("swefn--"):
                    sid = sid[len("swefn--"):]
                for lu in sense.findall("feat[@att='LU']"):
                    saldosense = lu.get("val")
                    lexicon.setdefault(saldosense, set()).add(sid)

            # Done parsing section. Clear tree to save memory
            if elem.tag in ['LexicalEntry', 'frame', 'resFrame']:
                root.clear()

    testwords = [
        "slant..1", "befrielse..1", "granne..1", "sisådär..1",
        "mjölkcentral..1"
    ]
    util.test_annotations(lexicon, testwords)
    if verbose:
        util.log.info("OK, read.")
    return lexicon
def read_sensaldo(tsv="sensaldo-base-v02.txt", verbose=True):
    """Read the TSV version of the sensaldo lexicon (sensaldo-base.txt).

    Return a lexicon dictionary: {senseid: label}.
    (Each data line holds exactly one sense id and one sentiment label;
    the previous docstring's "(class, ranking)" tuple was never produced.)

    - tsv: path to the sensaldo TSV file.
    - verbose: if True, log progress messages.
    """
    if verbose:
        util.log.info("Reading TSV lexicon")
    lexicon = {}

    with open(tsv) as f:
        for line in f:
            line = line.strip()
            # Skip blank lines (they used to crash the two-value unpacking
            # below) and comment lines starting with "#".
            if not line or line.startswith("#"):
                continue
            saldoid, label = line.split()
            lexicon[saldoid] = label

    testwords = ["förskräcklig..1", "ödmjukhet..1", "handla..1"]
    util.test_annotations(lexicon, testwords)
    if verbose:
        util.log.info("OK, read")
    return lexicon
def read_xml(xml='dalinm.xml', annotation_elements='writtenForm lemgram',
             tagset='SUC', verbose=True, skip_multiword=False, translate_tags=True):
    """Read the XML version of a morphological lexicon in lmf format (dalinm.xml).

    Return a lexicon dictionary:
    {wordform: {{annotation-type: annotation}:
                (set(possible tags), set(tuples with following words),
                 mwe_gap, particle)}}
    (mwe_gap and particle are always False here, but the fields are kept so
    that the format matches the normal saldo-pickle format.)

    - annotation_elements: space-separated XML element names for the annotation
      values: 'writtenForm' for baseform, 'lemgram' for lemgram.
      writtenForm is translated to 'gf' and lemgram to 'lem' (for compatability
      with Saldo).
    - tagset: target tagset name; resolved to a tag mapping via
      util.tagsets.saldo_to_<tagset>.
    - verbose: if True, log progress and run the test-word self check.
    - skip_multiword: if True, do NOT build special multi-word entries; each
      part of a multi-word form is then entered as a single word instead.
      (The code below only builds multi-word entries when this is False.)
    - translate_tags: if True, translate tags through the tagmap; otherwise
      store the raw "pos msd" string as the tag.
    """
    annotation_elements = annotation_elements.split()
    # assert annotation_element in ("writtenForm lemgram") "Invalid annotation element"
    # NOTE(review): cElementTree was removed in Python 3.9 — confirm target version.
    import xml.etree.cElementTree as cet
    if verbose:
        util.log.info("Reading XML lexicon")
    lexicon = {}
    # Tag mapping from Saldo tags to the requested tagset.
    tagmap = getattr(util.tagsets, "saldo_to_" + tagset.lower())

    context = cet.iterparse(
        xml, events=("start", "end"))  # "start" needed to save reference to root element
    context = iter(context)
    event, root = next(context)

    for event, elem in context:
        if event == "end":
            if elem.tag == 'LexicalEntry':
                # Hashable so the annotation dict can be used as a lexicon key.
                annotations = saldo.HashableDict()
                lem = elem.find("Lemma").find("FormRepresentation")
                for a in annotation_elements:
                    # Translate element names to Saldo-compatible keys.
                    if a == "writtenForm":
                        key = "gf"
                    elif a == "lemgram":
                        key = "lem"
                    annotations[key] = tuple([findval(lem, a)])

                pos = findval(lem, "partOfSpeech")
                inhs = findval(lem, "inherent")
                # "-" means no inherent features.
                if inhs == "-":
                    inhs = ""
                inhs = inhs.split()

                # there may be several WordForms
                for forms in elem.findall("WordForm"):
                    word = findval(forms, "writtenForm")
                    param = findval(forms, "msd")

                    multiwords = []
                    wordparts = word.split()
                    # NOTE: the loop variable deliberately rebinds `word` to
                    # each space-separated part of the written form.
                    for i, word in enumerate(wordparts):
                        if (not skip_multiword) and len(wordparts) > 1:
                            # Handle multi-word expressions
                            multiwords.append(word)
                            particle = False  # we don't use any particles or mwe:s with gaps
                            # since that information is not formally expressed in the historical lexicons
                            mwe_gap = False  # but keep the fields so that the file format match the normal saldo-pickle format

                            # is it the last word in the multi word expression?
                            if i == len(wordparts) - 1:
                                # Key on the first word; store the rest as the
                                # expected continuation tuple.
                                lexicon.setdefault(
                                    multiwords[0],
                                    {}).setdefault(annotations,
                                                   (set(), set(), mwe_gap, particle))[1].add(
                                                       tuple(multiwords[1:]))
                                multiwords = []
                        else:
                            # Single word expressions
                            particle = False  # we don't use any particles or mwe:s with gaps
                            mwe_gap = False  # but keep the fields so that the file format match the normal saldo-pickle format
                            if translate_tags:
                                tags = convert_default(pos, inhs, param, tagmap)
                                # Only store the entry if the tag translated.
                                if tags:
                                    lexicon.setdefault(word, {}).setdefault(
                                        annotations,
                                        (set(), set(), mwe_gap, particle))[0].update(tags)
                            else:
                                saldotag = " ".join(
                                    [pos, param]
                                )  # this tag is rather useless, but at least gives some information
                                tags = tuple([saldotag])
                                lexicon.setdefault(word, {}).setdefault(
                                    annotations,
                                    (set(), set(), mwe_gap, particle))[0].update(tags)

            # Done parsing section. Clear tree to save memory
            if elem.tag in ['LexicalEntry', 'frame', 'resFrame']:
                root.clear()

    if verbose:
        testwords = ["äplebuske", "stöpljus", "katt"]
        util.test_annotations(lexicon, testwords)
        util.log.info("OK, read")
    return lexicon
def read_xml(xml='saldom.xml', annotation_elements='gf lem saldo',
             tagset='SUC', verbose=True):
    """Read the XML version of SALDO's morphological lexicon (saldom.xml).

    Return a lexicon dictionary:
    {wordform: {{annotation-type: annotation}:
                (set(possible tags), set(tuples with following words),
                 gap_allowed, particle)}}

    - annotation_elements: space-separated XML element names for the annotation
      values (currently: 'gf' for baseform, 'lem' for lemgram or 'saldo' for
      SALDO id).
    - tagset is the tagset for the possible tags (currently: 'SUC', 'Parole',
      'Saldo'); resolved via util.tagsets.saldo_to_<tagset>.
    - verbose: if True, log progress messages.
    """
    annotation_elements = annotation_elements.split()
    # assert annotation_element in ("gf", "lem", "saldo"), "Invalid annotation element"

    # NOTE(review): cElementTree was removed in Python 3.9 — confirm target version.
    import xml.etree.cElementTree as cet
    tagmap = getattr(util.tagsets, "saldo_to_" + tagset.lower())
    if verbose:
        util.log.info("Reading XML lexicon")
    lexicon = {}

    context = cet.iterparse(
        xml, events=("start", "end"))  # "start" needed to save reference to root element
    context = iter(context)
    event, root = next(context)

    for event, elem in context:
        if event == "end":
            if elem.tag == 'LexicalEntry':
                # Hashable so the annotation dict can be used as a lexicon key.
                annotations = HashableDict()

                for a in annotation_elements:
                    annotations[a] = tuple(x.text for x in elem.findall(a))

                pos = elem.findtext("pos")
                inhs = elem.findtext("inhs")
                # "-" means no inherent features.
                if inhs == "-":
                    inhs = ""
                inhs = inhs.split()

                # Check the paradigm for an "x", meaning a multi-word expression with a required gap
                p = elem.findtext("p")
                x_find = re.search(r"_x(\d*)_", p)
                # The digits (if any) say after which part the gap goes;
                # a bare "_x_" defaults to position 1.
                x_insert = x_find.groups()[0] if x_find else None
                if x_insert == "":
                    x_insert = "1"

                # Only vbm and certain paradigms allow gaps
                gap_allowed = (pos == "vbm"
                               or p in (u"abm_x1_var_än", u"knm_x_ju_ju",
                                        u"pnm_x1_inte_ett_dugg",
                                        u"pnm_x1_vad_än", u"ppm_x1_för_skull"))

                table = elem.find("table")
                multiwords = []

                for form in list(table):
                    word = form.findtext("wf")
                    param = form.findtext("param")

                    if param in ("frag", "c", "ci", "cm"):
                        # We don't use these wordforms, so skip
                        continue
                    elif param[-1].isdigit() and param[-2:] != "-1":
                        # Handle multi-word expressions
                        multiwords.append(word)
                        # param ends in "part-total" (after the last colon).
                        multipart, multitotal = param.split(":")[-1].split("-")
                        particle = bool(re.search(
                            r"vbm_.+?p.*?\d+_", p))  # Multi-word with particle

                        # Add a "*" where the gap should be
                        if x_insert and multipart == x_insert:
                            multiwords.append("*")

                        # Last part reached: store the full expression, keyed
                        # on its first word, then reset the accumulator.
                        if multipart == multitotal:
                            lexicon.setdefault(multiwords[0], {}).setdefault(
                                annotations,
                                (set(), set(), gap_allowed, particle))[1].add(
                                    tuple(multiwords[1:]))
                            multiwords = []
                    else:
                        # Single word expressions
                        if param[-2:] == "-1":
                            # Drop the trailing part marker from the msd string.
                            param = param.rsplit(" ", 1)[0]
                        if pos == "vbm":
                            pos = "vb"
                        saldotag = " ".join([pos] + inhs + [param])
                        # Only keep forms whose Saldo tag translates to the
                        # requested tagset.
                        tags = tagmap.get(saldotag)
                        if tags:
                            lexicon.setdefault(word, {}).setdefault(
                                annotations,
                                (set(), set(), False, False))[0].update(tags)

            # Done parsing section. Clear tree to save memory
            if elem.tag in ["LexicalEntry", "frame", "resFrame"]:
                root.clear()

    testwords = [
        "äggtoddyarna", "Linköpingsbors", "katabatiska", "väg-", "formar",
        "in", "datorrelaterade"
    ]
    util.test_annotations(lexicon, testwords)
    if verbose:
        util.log.info("OK, read")
    return lexicon