def _process_wordclass_set(wordclass_set, id):
    """
    Add fragment morphSet nodes for cached variant forms.

    Looks up the variant set for (id, wordclass) in VARIANTS_CACHE. Every
    variant whose sort key is not already among the wordclass set's types,
    and whose end date is not earlier than MINIMUM_DATE, is serialized as a
    <morphSet fragment="true"> node and appended to the wordclass set's
    morphset block. Does nothing if the cache has no matching variant set.

    Arguments:
     -- wordclass_set: object providing wordclass(), types(), date() and
        morphset_block()
     -- id: identifier passed to VARIANTS_CACHE.find() (the name shadows
        the builtin, but is kept because it is part of the signature)
    """
    wordclass = wordclass_set.wordclass()
    varset = VARIANTS_CACHE.find(id=id, wordclass=wordclass)
    if not varset:
        return

    # Sort keys already present; also used to de-duplicate new variants
    known_sorts = {unit.sort for unit in wordclass_set.types()}

    # First pass: pick the variants worth adding, tagging each one with
    # the inflection under which it was found
    additions = []
    for inflection in INFLECTIONS[wordclass]:
        for candidate in varset.variants.get(inflection, []):
            if candidate.sort in known_sorts:
                continue
            if candidate.date.end < MINIMUM_DATE:
                continue
            candidate.wordclass = inflection
            additions.append(candidate)
            known_sorts.add(candidate.sort)

    # Second pass: serialize each selected variant as a morphSet node
    for variant in additions:
        node = etree.Element('morphSet', fragment='true')

        # Keep the variant's dates within the wordclass set's date range
        limits = (wordclass_set.date().start, wordclass_set.date().end)
        first, last = variant.date.constrain(limits)
        constrained = DateRange(start=first, end=last, hardEnd=True)
        node.append(constrained.to_xml(omitProjected=True))

        if variant.regional:
            node.set('regional', 'true')
        if variant.irregular:
            node.set('irregular', 'true')

        type_node = etree.SubElement(node, 'type')
        etree.SubElement(type_node, 'form').text = variant.form
        type_node.append(Wordclass(variant.wordclass).to_xml())

        wordclass_set.morphset_block().append(node)
def _variant_date_node(variant_date, block_date):
    """
    Build the date-range XML node for a variant form.

    The variant's dates are first constrained to the limits
    (block_date.start, block_date.projected_end()); a new hard-ended
    DateRange is then created from the constrained pair and serialized
    with projected dates omitted.
    """
    limits = (block_date.start, block_date.projected_end())
    first, last = variant_date.constrain(limits)
    return DateRange(start=first, end=last, hardEnd=True).to_xml(
        omitProjected=True)
def construct_entry_node(self, entry):
    """
    Build and return the <e> element for an entry.

    The element carries the entry's lemma(s) followed by one
    <wordclassSet> per wordclass block; each of those holds the wordclass
    node, an optional date node, a <morphSetBlock>, and the definition and
    resource nodes produced by self.definition_node() and
    self.resource_node().
    """
    root = etree.Element('e', odoLexid=entry.lexid)
    if entry.wordclass_blocks[0].wordclass == 'NP':
        root.set('encyclopedic', 'true')

    # Lemma(s): when a US headword exists, the main lemma is marked 'uk'
    # and a second, 'us' lemma is added straight after it
    main_lemma = etree.Element('lemma', src=self.dictname)
    main_lemma.text = entry.headword
    root.append(main_lemma)
    if entry.headword_us:
        main_lemma.set('locale', 'uk')
        us_lemma = etree.Element('lemma', locale='us', src=self.dictname)
        us_lemma.text = entry.headword_us
        root.append(us_lemma)

    for block in entry.wordclass_blocks:
        is_proper_name = block.wordclass == 'NP'

        # The date node is held in serialized form, so that a fresh copy
        # can be parsed for every place in which it is inserted
        if entry.date is not None:
            span = DateRange(start=entry.date, end=END_DATE, estimated=False)
            if is_proper_name:
                span.is_estimated = True
            date_xml = etree.tostring(span.to_xml(omitProjected=True))
        elif not is_proper_name:
            date_xml = DEFAULT_DATE_NODE
        else:
            date_xml = None

        set_node = etree.SubElement(root, 'wordclassSet')
        set_node.append(Wordclass(block.wordclass).to_xml())
        if date_xml:
            set_node.append(etree.fromstring(date_xml))

        morph_block = etree.SubElement(set_node, 'morphSetBlock')
        for group in block.morphgroups:
            morphset = etree.SubElement(morph_block, 'morphSet')
            if date_xml:
                morphset.append(etree.fromstring(date_xml))
            for unit in group.morphunits:
                type_node = etree.SubElement(morphset, 'type')
                etree.SubElement(type_node, 'form').text = unit.form
                type_node.append(Wordclass(unit.wordclass).to_xml())

        set_node.append(self.definition_node(block))
        set_node.append(self.resource_node(block, entry.lexid))

    return root
def __init__(self, form, start_date, end_date):
    """
    Store the raw form and its date range, and reset all other
    attributes to their defaults.
    """
    # Core data supplied by the caller
    self.original_form = form
    self.date = DateRange(start=start_date, end=end_date, hardEnd=True)

    # Usually stays undefined
    self.wordclass = None
    self._lemma_manager = None

    # Boolean flags, all off by default
    self.regional = self.irregular = self.has_en_ending = False
    self.undated = self.computed = False

    # Position and labelling, empty by default
    self.structural_id = 0
    self.headers = []
    self.header_labels = []
    self.label = ''
    self.grammatical_information = ''
def date(self):
    """
    Return a DateRange object for this entry.

    An entry has no date range of its own, so one is compiled from the
    date ranges of its wordclass sets on first call and then cached in
    self._date.
    """
    if not hasattr(self, '_date'):
        ranges = [block.date() for block in self.wordclass_sets()]

        combined = DateRange(
            start=min(r.start for r in ranges),
            end=max(r.end for r in ranges),
        )
        combined.set_exact('start', min(r.exact('start') for r in ranges))
        # NOTE(review): 'end' uses max() above but the exact end uses
        # min() here - confirm against DateRange.exact() that this
        # asymmetry is intended.
        combined.set_exact('end', min(r.exact('end') for r in ranges))
        self._date = combined
    return self._date
class GelEntry(_GelComponent):
    """
    An entry node, giving access to its attributes, its lemmas, and
    values aggregated from its <wordclassSet> children.
    """

    def __init__(self, node, filename):
        _GelComponent.__init__(self, node=node, file=filename)

    # ----- Simple attribute accessors -----

    def oed_id(self):
        return self.attribute('oedId')

    def oed_lexid(self):
        return self.attribute('oedLexid')

    def tag(self):
        return self.attribute('tag')

    # ----- Wordclass sets -----

    def wordclass_sets(self):
        """
        Return a list of WordclassSet objects, one per <wordclassSet>
        child (built on first call, then cached).
        """
        if not hasattr(self, '_wordclass_sets'):
            children = self.node.findall('./wordclassSet')
            self._wordclass_sets = [WordclassSet(child, self.filepath)
                                    for child in children]
        return self._wordclass_sets

    def definition(self, **kwargs):
        """
        Return the definition from the first wordclass block.
        """
        first_block = self.wordclass_sets()[0]
        return first_block.definition(**kwargs)

    def primary_wordclass(self):
        """
        Return the Penn wordclass from the first wordclass block.
        """
        first_block = self.wordclass_sets()[0]
        return first_block.wordclass()

    # ----- Lemmas -----

    def lemma_manager(self):
        """
        Return a Lemma object for the text of the first <lemma> child
        (built on first call, then cached).
        """
        if not hasattr(self, '_lemma_manager'):
            self._lemma_manager = Lemma(self.node.findtext('./lemma'))
        return self._lemma_manager

    @property
    def lemma(self):
        return self.lemma_manager().lemma

    @property
    def headword(self):
        # Same value as self.lemma
        return self.lemma_manager().lemma

    @property
    def sort(self):
        return self.lemma_manager().lexical_sort()

    def lemmas(self):
        """
        Return a list of Lemma objects, one per <lemma> child.
        """
        return [Lemma(lemma_node.text)
                for lemma_node in self.node.findall('./lemma')]

    def us_variant(self):
        """
        Return the text of the first <lemma> child whose locale is 'us',
        or None if there is no such child.
        """
        us_texts = (lemma_node.text
                    for lemma_node in self.node.findall('./lemma')
                    if lemma_node.get('locale') == 'us')
        return next(us_texts, None)

    # ----- Values aggregated across wordclass sets -----

    def frequency(self):
        """
        Return the modern frequency for this entry.
        """
        # Entries don't have their own frequency tables; so we sum
        # the frequencies of its component wordclasses
        return sum(block.frequency() for block in self.wordclass_sets())

    def date(self):
        """
        Return a DateRange object for this entry.
        """
        # Entries don't have their own date ranges; so we compile this
        # from the date ranges of its component wordclasses (once only;
        # the result is cached in self._date)
        if not hasattr(self, '_date'):
            ranges = [block.date() for block in self.wordclass_sets()]

            combined = DateRange(
                start=min(r.start for r in ranges),
                end=max(r.end for r in ranges),
            )
            combined.set_exact('start',
                               min(r.exact('start') for r in ranges))
            # NOTE(review): 'end' uses max() above but the exact end uses
            # min() here - confirm against DateRange.exact() that this
            # asymmetry is intended.
            combined.set_exact('end', min(r.exact('end') for r in ranges))
            self._date = combined
        return self._date

    def types(self):
        """
        Return the types of every wordclass set, as one flat list.
        """
        return [type_unit
                for block in self.wordclass_sets()
                for type_unit in block.types()]

    def morphsets(self):
        """
        Return the morphsets of every wordclass set, as one flat list.
        """
        return [morphset
                for block in self.wordclass_sets()
                for morphset in block.morphsets()]

    def oed_entry_type(self):
        """
        Return the first non-None oed_entry_type() found among the
        wordclass sets, or None if there is none.
        """
        for block in self.wordclass_sets():
            entry_type = block.oed_entry_type()
            if entry_type is not None:
                return entry_type
        return None
class VariantForm(object):
    """
    Class representing an OED variant form and its date range.

    Arguments to init:
     -- form (string)
     -- start_date (int)
     -- end_date (int)
    """

    def __init__(self, form, start_date, end_date):
        # Core data supplied by the caller
        self.original_form = form
        self.date = DateRange(start=start_date, end=end_date, hardEnd=True)

        # Usually stays undefined
        self.wordclass = None
        self._lemma_manager = None

        # Boolean flags, all off by default
        self.regional = self.irregular = self.has_en_ending = False
        self.undated = self.computed = False

        # Position and labelling, empty by default
        self.structural_id = 0
        self.headers = []
        self.header_labels = []
        self.label = ''
        self.grammatical_information = ''

    def __repr__(self):
        details = (self.form, self.date.start, self.date.projected_end())
        return '<VariantForm: %s (%d\u2014%d)>' % details

    @property
    def form(self):
        """
        Return the variant form itself (shortcut for
        self.lemma_manager().lemma).
        """
        return self.lemma_manager().lemma

    @property
    def sort(self):
        return self.lemma_manager().lexical_sort()

    def lemma_manager(self):
        """
        Return a Lemma object based on the variant form.

        The Lemma is built on first use from the original form, stripped
        of surrounding whitespace and of any parentheses.
        """
        if self._lemma_manager is None:
            cleaned = self.original_form.strip()
            cleaned = cleaned.replace('(', '').replace(')', '')
            self._lemma_manager = Lemma(cleaned)
        return self._lemma_manager

    def reset_form(self, new_lemma):
        """
        Replace the Lemma object with a new Lemma object using a new form.

        Used by Formslist.detruncate when detruncating the forms list.
        """
        replacement = Lemma(new_lemma)
        self._lemma_manager = replacement
        return replacement

    def set_grammatical_information(self, value):
        """
        Store and return the value after applying DISTIL_PATTERNS to it.
        """
        distilled = DISTIL_PATTERNS.sub(r'\1', value)
        self.grammatical_information = distilled
        return distilled

    def is_truncated(self):
        """
        Return True if the form is truncated, or False if not.
        """
        return self.lemma_manager().is_affix()

    def check_en_ending(self, wordclass, headword):
        """
        Switch self.has_en_ending to True if this is a verb ending in -en
        (and the headword does *not* end in -n).
        """
        if wordclass != 'VB':
            return
        if not (self.date.end <= 1600):
            return
        if not re.search(r'[eay]n+$', self.form):
            return
        if re.search(r'ne?$', headword):
            return
        self.has_en_ending = True

    def merge(self, other):
        """
        Merge another VariantForm instance into this one.

        Argument is another VariantForm instance.

        Used by Formslist.forms_uniq.

        Since merging will only be done in cases where the two instances
        have exactly the same form and exactly the same grammatical info,
        we only have to check if the date range needs to be extended
        in either direction.
        """
        # Unless this form is the more important of the two, take over
        # the other form's 'regional' and 'irregular' settings
        if not self.is_more_important_than(other):
            self.regional = other.regional
            self.irregular = other.irregular

        # Extend this form's date range using the other form's range
        self.date.extend_range(other.date)

    def is_more_important_than(self, other):
        """
        Return True if this form has a later projected end date than
        the other; or, when those are equal, a longer date span.
        """
        own_end = self.date.projected_end()
        other_end = other.date.projected_end()
        if own_end > other_end:
            return True
        return own_end == other_end and self.date.span() > other.date.span()