Exemple #1
0
    def infer(self):
        """Rewrite the links in ``self.in_file`` to point at NOAD targets.

        A map from ODE ``lexid`` to a replacement ID is built from the
        distilled 'ode' file: for each entry, the last non-None
        ``complement`` found on its wordclass blocks is used (presumably
        the ID of the matching NOAD entry -- confirm against Distiller).

        Every ``link`` element directly under the root of ``self.in_file``
        then has its ``targetHref`` attribute dropped and its ``targetID``
        replaced by the mapped value; links whose target has no mapping
        are removed. The root's ``type`` attribute is set to ``'noad'`` and
        the tree is written to ``self.out_file`` as UTF-8.
        """
        noad_map = {}
        dst = Distiller(dictName='ode')
        dst.load_distilled_file()
        for entry in dst.entries:
            noad_id = None
            # No early exit: when several blocks carry a complement, the
            # last one wins.
            for block in entry.wordclass_blocks:
                if block.complement is not None:
                    noad_id = block.complement
            if noad_id is not None:
                noad_map[entry.lexid] = noad_id

        tree = etree.parse(self.in_file)
        # findall() returns a list, so removing links while looping is safe.
        for link in tree.findall('./link'):
            # Supplying a default avoids a KeyError on links that have no
            # targetHref attribute.
            link.attrib.pop('targetHref', None)
            # Values in noad_map are never None, so a None result means
            # "no mapping" (this also covers a missing targetID).
            noad_id = noad_map.get(link.get('targetID'))
            if noad_id is None:
                link.getparent().remove(link)
            else:
                link.set('targetID', noad_id)
        tree.getroot().set('type', 'noad')

        # Explicit UTF-8: the serialized XML carries no declaration, so it
        # must not be written in a platform-dependent locale encoding.
        with open(self.out_file, 'w', encoding='utf-8') as filehandle:
            filehandle.write(etree.tostring(tree,
                                            pretty_print=True,
                                            encoding='unicode'))
Exemple #2
0
    def __init__(self, **kwargs):
        """Record the configured file locations and build both indexes.

        Keyword arguments read: ``dictName``, ``oedIn``, ``oedOut``,
        ``odoIn`` and ``odoOut``. Any that are omitted are stored as None.
        """
        option = kwargs.get
        self.dict_name = option('dictName')
        self.oed_in, self.oed_out = option('oedIn'), option('oedOut')
        self.odo_in, self.odo_out = option('odoIn'), option('odoOut')

        self.oed_index = VitalStatisticsCache()

        # The distilled file for the chosen dictionary is loaded right
        # away, at construction time.
        self.odo_index = Distiller(dictName=self.dict_name)
        self.odo_index.load_distilled_file()
Exemple #3
0
 def parse_distilled_file(self):
     """Load the distilled file for ``self.dict_name`` and index its
     entries in ``self.entry_content``, keyed by each entry's ``lexid``.
     """
     source = Distiller(dictName=self.dict_name)
     source.load_distilled_file()
     for record in source.entries:
         # A later entry with the same lexid replaces the earlier one.
         self.entry_content[record.lexid] = record
Exemple #4
0
class LinkUpdater(object):
    """Refresh the labels stored in the ODO and OED link files.

    Labels are looked up again from two indexes built at construction
    time: ``oed_index`` (a ``VitalStatisticsCache``) and ``odo_index`` (a
    ``Distiller`` for the configured dictionary). A label that cannot be
    resolved is replaced by ``error_message``.
    """

    # Placeholder written in place of a label that could not be resolved.
    error_message = '!ERROR entry not found'

    # Inline markup flattened out of a label element before its text is
    # replaced.
    _inline_tags = ('i', 'sup', 'sub', 'hm')

    def __init__(self, **kwargs):
        """Store the file locations and build both lookup indexes.

        Keyword arguments read: ``dictName``, ``oedIn``, ``oedOut``,
        ``odoIn`` and ``odoOut``. Any that are omitted are stored as None.
        The distilled file for ``dictName`` is loaded immediately.
        """
        self.dict_name = kwargs.get('dictName')
        self.oed_in = kwargs.get('oedIn')
        self.oed_out = kwargs.get('oedOut')
        self.odo_in = kwargs.get('odoIn')
        self.odo_out = kwargs.get('odoOut')

        self.oed_index = VitalStatisticsCache()

        self.odo_index = Distiller(dictName=self.dict_name)
        self.odo_index.load_distilled_file()

    @classmethod
    def _set_label(cls, element, text):
        """Strip the inline markup tags from *element* and set its text."""
        etree.strip_tags(element, *cls._inline_tags)
        element.text = text

    @staticmethod
    def _write_tree(tree, path):
        """Serialize *tree* (pretty-printed) to *path* as UTF-8."""
        # Explicit UTF-8: the serialized XML carries no declaration, so it
        # must not be written in a platform-dependent locale encoding.
        with open(path, 'w', encoding='utf-8') as filehandle:
            filehandle.write(etree.tostring(tree,
                                            pretty_print=True,
                                            encoding='unicode'))

    def update_odo(self, **kwargs):
        """Refresh the labels in ``self.odo_in``; write to ``self.odo_out``.

        For each ``e`` element directly under the root, the ``label`` text
        is replaced by the headword that ``odo_index`` returns for the
        entry's ``lexid``. If the entry has a ``linkSet/link`` element, its
        text is replaced by the label that ``oed_index`` returns for the
        link's ``refentry``.

        Keyword arguments:
            validLinksOnly: when true (default False), also remove every
                entry that has no link, has an unresolved label on either
                side, or whose two labels fail ``check_match``.
        """
        valid_links_only = kwargs.get('validLinksOnly', False)
        tree = etree.parse(self.odo_in)
        # findall() returns a list, so removing entries while looping is
        # safe.
        for entry in tree.findall('./e'):
            lexid = entry.get('lexid', None)
            odo_label = entry.find('./label')
            odo_label_text = (self.odo_index.headword_by_id(lexid) or
                              LinkUpdater.error_message)
            self._set_label(odo_label, odo_label_text)

            link = entry.find('./linkSet/link')
            if link is not None:
                refentry = link.get('refentry', '0')
                oed_label_text = (
                    self.oed_index.find(refentry, field='label') or
                    LinkUpdater.error_message)
                self._set_label(link, oed_label_text)

            # The `link is None` test comes first so that link.text is
            # never read on a missing link.
            if (valid_links_only and
                (link is None or
                 link.text == LinkUpdater.error_message or
                 odo_label.text == LinkUpdater.error_message or
                 not check_match(link.text, odo_label.text))):
                entry.getparent().remove(entry)

        self._write_tree(tree, self.odo_out)

    def update_oed(self, **kwargs):
        """Refresh the labels in ``self.oed_in``; write to ``self.oed_out``.

        For each ``link`` element directly under the root, ``sourceLabel``
        is replaced by the label that ``oed_index`` returns for the link's
        ``sourceID``, and ``targetLabel`` by the headword that
        ``odo_index`` returns for its ``targetID``.

        Keyword arguments:
            validLinksOnly: when true (default False), also remove every
                link that lacks a ``sourceID`` or ``targetID``, has an
                unresolved label on either side, or whose two labels fail
                ``check_match``.
        """
        valid_links_only = kwargs.get('validLinksOnly', False)
        tree = etree.parse(self.oed_in)
        # findall() returns a list, so removing links while looping is safe.
        for entry in tree.findall('./link'):
            oed_id = entry.get('sourceID', None)
            oed_label_text = (self.oed_index.find(oed_id, field='label') or
                              LinkUpdater.error_message)
            source_label = entry.find('./sourceLabel')
            self._set_label(source_label, oed_label_text)

            lexid = entry.get('targetID', None)
            ode_label_text = (self.odo_index.headword_by_id(lexid) or
                              LinkUpdater.error_message)
            target_label = entry.find('./targetLabel')
            self._set_label(target_label, ode_label_text)

            if (valid_links_only and
                (oed_id is None or
                 lexid is None or
                 source_label.text == LinkUpdater.error_message or
                 target_label.text == LinkUpdater.error_message or
                 not check_match(source_label.text, target_label.text))):
                entry.getparent().remove(entry)

        self._write_tree(tree, self.oed_out)