def _process_wordclass_set(wordclass_set, id):
    """
    Add fragment morphSets for extra variant forms to a wordclass set.

    Looks up the variant set cached for this ID and wordclass. Each variant
    form listed under one of the wordclass's inflections is turned into a
    <morphSet fragment="true"> node and appended to the wordclass set's
    morphset block, unless its ``sort`` value is already taken (by an
    existing type or by an earlier variant) or its date range ends before
    MINIMUM_DATE. Does nothing if no variant set is found.
    """
    wordclass = wordclass_set.wordclass()
    varset = VARIANTS_CACHE.find(id=id, wordclass=wordclass)
    if not varset:
        return

    # Pass 1: choose the variant forms to add, de-duplicating on sort value
    known_sorts = {type_unit.sort for type_unit in wordclass_set.types()}
    selected = []
    for inflection in INFLECTIONS[wordclass]:
        for variant_form in varset.variants.get(inflection, []):
            sort_value = variant_form.sort
            if (sort_value in known_sorts or
                    variant_form.date.end < MINIMUM_DATE):
                continue
            # Remember which inflection this form was listed under
            variant_form.wordclass = inflection
            selected.append(variant_form)
            known_sorts.add(sort_value)

    # Pass 2: serialize each chosen form as a fragment morphSet
    for variant_form in selected:
        morphset_node = etree.Element('morphSet', fragment='true')

        # Keep the variant's dates within the wordclass set's own range
        block_date = wordclass_set.date()
        limits = (block_date.start, block_date.end)
        first_year, last_year = variant_form.date.constrain(limits)
        constrained = DateRange(start=first_year, end=last_year, hardEnd=True)
        morphset_node.append(constrained.to_xml(omitProjected=True))

        for flag in ('regional', 'irregular'):
            if getattr(variant_form, flag):
                morphset_node.set(flag, 'true')

        type_node = etree.SubElement(morphset_node, 'type')
        etree.SubElement(type_node, 'form').text = variant_form.form
        type_node.append(Wordclass(variant_form.wordclass).to_xml())

        wordclass_set.morphset_block().append(morphset_node)
Beispiel #2
0
def _variant_date_node(variant_date, block_date):
    """
    Build the XML date-range node for a variant form.

    The variant's dates are first constrained to the parent block's span
    (its start through its projected end), so that they cannot fall outside
    the limits of the parent entry. The constrained dates are then wrapped
    in a new hard-ended DateRange, which is serialized without its
    projected dates.
    """
    parent_limits = (block_date.start, block_date.projected_end())
    clipped_start, clipped_end = variant_date.constrain(parent_limits)
    clipped_range = DateRange(start=clipped_start,
                              end=clipped_end,
                              hardEnd=True)
    return clipped_range.to_xml(omitProjected=True)
Beispiel #3
0
    def construct_entry_node(self, entry):
        """
        Build and return the <e> element for one entry.

        The element carries the entry's lemma(s) followed by one
        <wordclassSet> per wordclass block. Each wordclass set holds its
        wordclass node, an optional date node, a <morphSetBlock> with one
        <morphSet> per morph group, and the nodes returned by
        self.definition_node() and self.resource_node().
        """
        entry_node = etree.Element('e', odoLexid=entry.lexid)
        if entry.wordclass_blocks[0].wordclass == 'NP':
            entry_node.set('encyclopedic', 'true')

        # Main lemma; it is only marked as the UK form when a separate
        # US headword is supplied alongside it
        primary_lemma = etree.SubElement(entry_node, 'lemma',
                                         src=self.dictname)
        primary_lemma.text = entry.headword
        if entry.headword_us:
            primary_lemma.set('locale', 'uk')
            us_lemma = etree.SubElement(entry_node, 'lemma',
                                        locale='us', src=self.dictname)
            us_lemma.text = entry.headword_us

        for block in entry.wordclass_blocks:
            is_np = block.wordclass == 'NP'

            # Serialized date node, reused (re-parsed) for every element
            # that needs its own copy; None means "emit no date node"
            if entry.date is None:
                date_xml = None if is_np else DEFAULT_DATE_NODE
            else:
                daterange = DateRange(start=entry.date,
                                      end=END_DATE,
                                      estimated=False)
                if is_np:
                    daterange.is_estimated = True
                date_xml = etree.tostring(
                    daterange.to_xml(omitProjected=True))

            wordclass_set_node = etree.SubElement(entry_node, 'wordclassSet')
            wordclass_set_node.append(Wordclass(block.wordclass).to_xml())
            if date_xml:
                wordclass_set_node.append(etree.fromstring(date_xml))

            morphsetblock_node = etree.SubElement(wordclass_set_node,
                                                  'morphSetBlock')
            for morphgroup in block.morphgroups:
                morphset_node = etree.SubElement(morphsetblock_node,
                                                 'morphSet')
                if date_xml:
                    morphset_node.append(etree.fromstring(date_xml))
                for unit in morphgroup.morphunits:
                    type_node = etree.SubElement(morphset_node, 'type')
                    etree.SubElement(type_node, 'form').text = unit.form
                    type_node.append(Wordclass(unit.wordclass).to_xml())

            wordclass_set_node.append(self.definition_node(block))
            wordclass_set_node.append(self.resource_node(block, entry.lexid))

        return entry_node
Beispiel #4
0
 def __init__(self, form, start_date, end_date):
     """
     Store the form as given, build its hard-ended date range, and set
     every other attribute to its default value.
     """
     self.original_form = form
     self.date = DateRange(start=start_date, end=end_date, hardEnd=True)

     # Not known at construction time
     self.wordclass = None  # usually stays undefined
     self._lemma_manager = None

     # Boolean flags: all off by default
     self.regional = self.irregular = self.has_en_ending = False
     self.undated = self.computed = False

     # Remaining annotations start out empty/zero
     self.structural_id = 0
     self.headers, self.header_labels = [], []
     self.label = self.grammatical_information = ''
Beispiel #5
0
 def date(self):
     """
     Return a DateRange object for this entry.

     Entries don't have their own date ranges, so the range is compiled
     from the date ranges of the component wordclass sets on the first
     call, then cached on the instance as ``_date``.
     """
     if not hasattr(self, '_date'):
         block_dates = [block.date() for block in self.wordclass_sets()]
         # Widest span: earliest start through latest end
         earliest = min(d.start for d in block_dates)
         latest = max(d.end for d in block_dates)
         # For both ends, the minimum of the blocks' exact() values is used
         start_exactness = min(d.exact('start') for d in block_dates)
         end_exactness = min(d.exact('end') for d in block_dates)
         self._date = DateRange(start=earliest, end=latest)
         self._date.set_exact('start', start_exactness)
         self._date.set_exact('end', end_exactness)
     return self._date
Beispiel #6
0
class GelEntry(_GelComponent):
    """
    An entry component whose node holds <lemma> and <wordclassSet> children.

    Most accessors delegate to, or aggregate over, the entry's
    WordclassSet objects.
    """

    def __init__(self, node, filename):
        _GelComponent.__init__(self, node=node, file=filename)

    def oed_id(self):
        """Return the value of the 'oedId' attribute."""
        return self.attribute('oedId')

    def oed_lexid(self):
        """Return the value of the 'oedLexid' attribute."""
        return self.attribute('oedLexid')

    def tag(self):
        """Return the value of the 'tag' attribute."""
        return self.attribute('tag')

    def wordclass_sets(self):
        """
        Return the list of WordclassSet objects for this entry, one per
        child <wordclassSet> node (built on first call, then cached).
        """
        if not hasattr(self, '_wordclass_sets'):
            nodes = self.node.findall('./wordclassSet')
            self._wordclass_sets = [WordclassSet(n, self.filepath)
                                    for n in nodes]
        return self._wordclass_sets

    def definition(self, **kwargs):
        """
        Return the definition from the first wordclass block.
        """
        first_block = self.wordclass_sets()[0]
        return first_block.definition(**kwargs)

    def primary_wordclass(self):
        """
        Return the Penn wordclass from the first wordclass block.
        """
        first_block = self.wordclass_sets()[0]
        return first_block.wordclass()

    def lemma_manager(self):
        """
        Return a Lemma object for the text of the first <lemma> child
        (built on first call, then cached).
        """
        if not hasattr(self, '_lemma_manager'):
            self._lemma_manager = Lemma(self.node.findtext('./lemma'))
        return self._lemma_manager

    @property
    def lemma(self):
        """The lemma string held by the lemma manager."""
        return self.lemma_manager().lemma

    @property
    def headword(self):
        """Same value as ``lemma``."""
        return self.lemma_manager().lemma

    @property
    def sort(self):
        """The lemma manager's lexical_sort() value."""
        return self.lemma_manager().lexical_sort()

    def lemmas(self):
        """Return a Lemma object for every <lemma> child, in node order."""
        return [Lemma(lemma_node.text)
                for lemma_node in self.node.findall('./lemma')]

    def us_variant(self):
        """
        Return the text of the first <lemma> child whose locale is 'us',
        or None if there is no such child.
        """
        us_texts = (lemma_node.text
                    for lemma_node in self.node.findall('./lemma')
                    if lemma_node.get('locale') == 'us')
        return next(us_texts, None)

    def frequency(self):
        """
        Return the modern frequency for this entry.
        """
        # Entries don't have their own frequency tables; so we sum
        # the frequencies of its component wordclasses
        return sum(block.frequency() for block in self.wordclass_sets())

    def date(self):
        """
        Return a DateRange object for this entry.

        Entries don't have their own date ranges, so the range is compiled
        from the date ranges of the component wordclass sets on the first
        call, then cached on the instance as ``_date``.
        """
        if not hasattr(self, '_date'):
            block_dates = [block.date() for block in self.wordclass_sets()]
            # Widest span: earliest start through latest end
            earliest = min(d.start for d in block_dates)
            latest = max(d.end for d in block_dates)
            # For both ends, the minimum of the blocks' exact() values
            start_exactness = min(d.exact('start') for d in block_dates)
            end_exactness = min(d.exact('end') for d in block_dates)
            self._date = DateRange(start=earliest, end=latest)
            self._date.set_exact('start', start_exactness)
            self._date.set_exact('end', end_exactness)
        return self._date

    def types(self):
        """Return the types of every wordclass set as one flat list."""
        return [type_unit
                for block in self.wordclass_sets()
                for type_unit in block.types()]

    def morphsets(self):
        """Return the morphsets of every wordclass set as one flat list."""
        return [morphset
                for block in self.wordclass_sets()
                for morphset in block.morphsets()]

    def oed_entry_type(self):
        """
        Return the first non-None oed_entry_type() reported by a wordclass
        set, or None if every wordclass set reports None.
        """
        entry_types = (block.oed_entry_type()
                       for block in self.wordclass_sets())
        return next((etype for etype in entry_types if etype is not None),
                    None)
Beispiel #7
0
class VariantForm(object):
    """
    Class representing an OED variant form and its date range.

    Arguments to init:
     -- form (string)
     -- start_date (int)
     -- end_date (int)
    """

    def __init__(self, form, start_date, end_date):
        self.original_form = form
        self.date = DateRange(start=start_date, end=end_date, hardEnd=True)

        # Filled in later, if at all
        self.wordclass = None  # usually stays undefined
        self._lemma_manager = None

        # Boolean flags: all off by default
        self.regional = self.irregular = self.has_en_ending = False
        self.undated = self.computed = False

        # Remaining annotations start out empty/zero
        self.structural_id = 0
        self.headers, self.header_labels = [], []
        self.label = self.grammatical_information = ''

    def __repr__(self):
        details = (self.form, self.date.start, self.date.projected_end())
        return '<VariantForm: %s (%d\u2014%d)>' % details

    @property
    def form(self):
        """
        The variant form itself (shortcut for the lemma manager's lemma).
        """
        return self.lemma_manager().lemma

    @property
    def sort(self):
        """The lemma manager's lexical_sort() value for this form."""
        return self.lemma_manager().lexical_sort()

    def lemma_manager(self):
        """
        Return a Lemma object based on the variant form.

        Created on first use from the original form, stripped of
        surrounding whitespace and of any round brackets.
        """
        if self._lemma_manager is not None:
            return self._lemma_manager
        cleaned = self.original_form.strip()
        for bracket in ('(', ')'):
            cleaned = cleaned.replace(bracket, '')
        self._lemma_manager = Lemma(cleaned)
        return self._lemma_manager

    def reset_form(self, new_lemma):
        """
        Swap in a new Lemma object built from new_lemma, and return it.

        Used by Formslist.detruncate when detruncating the forms list.
        """
        replacement = Lemma(new_lemma)
        self._lemma_manager = replacement
        return self._lemma_manager

    def set_grammatical_information(self, value):
        """
        Store value, after substituting every DISTIL_PATTERNS match with
        its first group, as the grammatical information; return the result.
        """
        distilled = DISTIL_PATTERNS.sub(r'\1', value)
        self.grammatical_information = distilled
        return self.grammatical_information

    def is_truncated(self):
        """
        Return True if the form is truncated, or False if not.
        """
        return self.lemma_manager().is_affix()

    def check_en_ending(self, wordclass, headword):
        """
        Switch self.has_en_ending to True if this is a verb
        ending in -en (and the headword does *not* end in -n).

        Only applies to forms whose date range ends in 1600 or earlier.
        """
        if wordclass != 'VB' or not self.date.end <= 1600:
            return
        form_ends_in_en = re.search(r'[eay]n+$', self.form)
        if form_ends_in_en and not re.search(r'ne?$', headword):
            self.has_en_ending = True

    def merge(self, other):
        """
        Merge another VariantForm instance into this one.

        Argument is another VariantForm instance.

        Used by Formslist.forms_uniq. Since merging will only be done in
        cases where the two instances have exactly the same form and exactly
        the same grammatical info, we only have to check if the date range
        needs to be extended in either direction.
        """
        # Unless this form is the more important one, take over the
        #  other form's 'regional' and 'irregular' settings.
        if not self.is_more_important_than(other):
            self.regional, self.irregular = other.regional, other.irregular
        # Let the date range absorb the other form's range
        self.date.extend_range(other.date)

    def is_more_important_than(self, other):
        """
        Return True if this form has a later projected end date than
        other, or the same projected end date and a longer span;
        return False otherwise.
        """
        own_end = self.date.projected_end()
        other_end = other.date.projected_end()
        if own_end > other_end:
            return True
        return own_end == other_end and self.date.span() > other.date.span()