def infer(self):
    """Rewrite the link file at ``self.in_file`` so its targets are NOAD IDs.

    Builds a lexid -> complement map from the distilled 'ode' file, then
    walks every ``<link>`` child of the input document's root:

    * the ``targetHref`` attribute is dropped (if present);
    * when the link's ``targetID`` has a complement, ``targetID`` is
      replaced by that complement;
    * otherwise the link is removed from the document.

    The root element's ``type`` attribute is set to ``'noad'`` and the
    result is written to ``self.out_file`` as UTF-8.
    """
    dst = Distiller(dictName='ode')
    dst.load_distilled_file()

    noad_map = {}
    for entry in dst.entries:
        complements = [block.complement for block in entry.wordclass_blocks
                       if block.complement is not None]
        if complements:
            # When several wordclass blocks carry a complement, the last
            # one wins.
            noad_map[entry.lexid] = complements[-1]

    tree = etree.parse(self.in_file)
    # findall() returns a list, so removing links while looping is safe.
    for link in tree.findall('./link'):
        # Supply a default so a link without targetHref does not abort
        # the whole run with a KeyError.
        link.attrib.pop('targetHref', None)
        lexid = link.get('targetID')
        if lexid in noad_map:
            link.set('targetID', noad_map[lexid])
        else:
            link.getparent().remove(link)

    tree.getroot().set('type', 'noad')
    # The serialised text carries no XML declaration, so readers will
    # assume UTF-8; write it as UTF-8 explicitly instead of relying on
    # the platform's default encoding.
    with open(self.out_file, 'w', encoding='utf-8') as filehandle:
        filehandle.write(etree.tostring(tree,
                                        pretty_print=True,
                                        encoding='unicode'))
def __init__(self, **kwargs):
    """Record the configuration and load the two lookup indexes.

    Keyword arguments read (each is stored as given, or None if omitted):
        dictName: passed on to Distiller as its ``dictName``.
        oedIn, oedOut: stored as ``self.oed_in`` / ``self.oed_out``.
        odoIn, odoOut: stored as ``self.odo_in`` / ``self.odo_out``.
    """
    option = kwargs.get
    self.dict_name = option('dictName')
    self.oed_in = option('oedIn')
    self.oed_out = option('oedOut')
    self.odo_in = option('odoIn')
    self.odo_out = option('odoOut')

    # Build both indexes up front; the distilled file is loaded eagerly.
    self.oed_index = VitalStatisticsCache()
    self.odo_index = Distiller(dictName=self.dict_name)
    self.odo_index.load_distilled_file()
def parse_distilled_file(self):
    """Load the distilled file for ``self.dict_name`` and index its entries.

    Every entry is stored in ``self.entry_content`` under its ``lexid``;
    a later entry with the same lexid overwrites an earlier one.
    """
    source = Distiller(dictName=self.dict_name)
    source.load_distilled_file()
    for record in source.entries:
        self.entry_content[record.lexid] = record
class LinkUpdater(object):
    """Refresh the label text stored in the ODO-side and OED-side link files.

    Labels are looked up afresh from two indexes: ``self.oed_index`` (a
    VitalStatisticsCache, queried by OED ID) and ``self.odo_index`` (a
    Distiller loaded for ``dictName``, queried by lexid). An ID that cannot
    be resolved gets ``error_message`` as its label.
    """

    # Placeholder label written when an ID cannot be resolved in its index.
    error_message = '!ERROR entry not found'

    # Inline markup flattened out of a label element before its text is
    # replaced, so that the new text is not followed by stale child elements.
    _inline_tags = ('i', 'sup', 'sub', 'hm')

    def __init__(self, **kwargs):
        """Record the file locations and load the two lookup indexes.

        Keyword arguments read (each defaults to None when omitted):
            dictName: passed on to Distiller as its ``dictName``.
            oedIn, oedOut: input/output for :meth:`update_oed`.
            odoIn, odoOut: input/output for :meth:`update_odo`.
        """
        self.dict_name = kwargs.get('dictName')
        self.oed_in = kwargs.get('oedIn', None)
        self.oed_out = kwargs.get('oedOut', None)
        self.odo_in = kwargs.get('odoIn', None)
        self.odo_out = kwargs.get('odoOut', None)
        self.oed_index = VitalStatisticsCache()
        self.odo_index = Distiller(dictName=self.dict_name)
        self.odo_index.load_distilled_file()

    def update_odo(self, **kwargs):
        """Rewrite the labels in ``self.odo_in`` and save to ``self.odo_out``.

        For every ``<e>`` child of the root, the ``<label>`` text is replaced
        by the headword found for the entry's ``lexid``, and the text of the
        first ``linkSet/link`` (if any) by the OED label found for its
        ``refentry`` attribute.

        Keyword arguments:
            validLinksOnly: when true, an entry is removed if it has no
                link, if either label could not be resolved, or if
                ``check_match`` rejects the pair of labels. Default False.
        """
        valid_links_only = kwargs.get('validLinksOnly', False)
        tree = etree.parse(self.odo_in)
        # findall() returns a list, so removing entries while looping is safe.
        for entry in tree.findall('./e'):
            lexid = entry.get('lexid', None)
            odo_label = entry.find('./label')
            self._relabel(odo_label, self.odo_index.headword_by_id(lexid))

            link = entry.find('./linkSet/link')
            if link is not None:
                refentry = link.get('refentry', '0')
                self._relabel(
                    link, self.oed_index.find(refentry, field='label'))

            if valid_links_only and (
                    link is None or
                    not self._labels_agree(link, odo_label)):
                entry.getparent().remove(entry)

        self._write_tree(tree, self.odo_out)

    def update_oed(self, **kwargs):
        """Rewrite the labels in ``self.oed_in`` and save to ``self.oed_out``.

        For every ``<link>`` child of the root, the ``<sourceLabel>`` text
        is replaced by the OED label found for ``sourceID`` and the
        ``<targetLabel>`` text by the headword found for ``targetID``.

        Keyword arguments:
            validLinksOnly: when true, a link is removed if either ID
                attribute is missing, if either label could not be resolved,
                or if ``check_match`` rejects the pair of labels.
                Default False.
        """
        valid_links_only = kwargs.get('validLinksOnly', False)
        tree = etree.parse(self.oed_in)
        # findall() returns a list, so removing links while looping is safe.
        for entry in tree.findall('./link'):
            oed_id = entry.get('sourceID', None)
            source_label = entry.find('./sourceLabel')
            self._relabel(
                source_label, self.oed_index.find(oed_id, field='label'))

            lexid = entry.get('targetID', None)
            target_label = entry.find('./targetLabel')
            self._relabel(target_label, self.odo_index.headword_by_id(lexid))

            if valid_links_only and (
                    oed_id is None or
                    lexid is None or
                    not self._labels_agree(source_label, target_label)):
                entry.getparent().remove(entry)

        self._write_tree(tree, self.oed_out)

    @classmethod
    def _relabel(cls, element, text):
        """Flatten inline markup in *element* and set its text.

        A falsy *text* (lookup failed) is replaced by the error placeholder.
        """
        etree.strip_tags(element, *cls._inline_tags)
        element.text = text or LinkUpdater.error_message

    @staticmethod
    def _labels_agree(first, second):
        """Return true if both labels were resolved and check_match accepts them."""
        return (first.text != LinkUpdater.error_message and
                second.text != LinkUpdater.error_message and
                check_match(first.text, second.text))

    @staticmethod
    def _write_tree(tree, path):
        """Serialise *tree* (pretty-printed) to *path* as UTF-8 text."""
        # The serialised text carries no XML declaration, so readers will
        # assume UTF-8; write it as UTF-8 explicitly instead of relying on
        # the platform's default encoding.
        with open(path, 'w', encoding='utf-8') as filehandle:
            filehandle.write(etree.tostring(tree,
                                            pretty_print=True,
                                            encoding='unicode'))