Example #1
0
    def __init__(self, **kwargs):
        """
        Store the lemma's details and create the helper objects.

        Keyword arguments read:
          lemma -- the lemma; also handed to LemmaWithVariants
          id -- stored as self.id
          wordclass -- wordclass tag; 'NNS' is stored as 'NN'
          daterange -- date object for the lemma; DateRange(start=0, end=0)
              is used when this is missing or None
          headwords -- iterable of headword strings; parentheses are
              stripped from each one. Missing or None means no headwords.
        """
        self.lemma_manager = LemmaWithVariants(kwargs.get('lemma'))
        self.lemma = kwargs.get('lemma')
        self.id = kwargs.get('id')

        wordclass = kwargs.get('wordclass')
        self.wordclass = 'NN' if wordclass == 'NNS' else wordclass

        # Only build the fallback range when it is actually needed, and
        # treat an explicit daterange=None like a missing one.
        daterange = kwargs.get('daterange')
        if daterange is None:
            daterange = DateRange(start=0, end=0)
        self.date = daterange

        # 'or []' also covers an explicit headwords=None, which would
        # otherwise fail when iterated below.
        headwords = kwargs.get('headwords') or []
        self.headwords = [hw.replace('(', '').replace(')', '')
                          for hw in headwords]

        self.hint_ids = []
        self.etyma = []
        self.variants_cache = VariantsCache()
Example #2
0
class VariantsComputer(object):
    """
    Assemble the list of variant forms for a single lemma.

    Variant sets are fetched from a VariantsCache, both for the identifier
    given to the constructor (the "primary" sets) and for any identifiers
    supplied through set_hint_ids() (the "hint" sets). compute() filters
    them and collects the results in self.lemma_manager.variants.
    """

    def __init__(self, **kwargs):
        """
        Store the lemma's details and create the helper objects.

        Keyword arguments read:
          lemma -- the lemma; also handed to LemmaWithVariants
          id -- identifier used to look up the primary variant sets
          wordclass -- wordclass tag; 'NNS' is stored as 'NN'
          daterange -- date object for the lemma; DateRange(start=0, end=0)
              is used when this is missing or None
          headwords -- iterable of headword strings; parentheses are
              stripped from each one. Missing or None means no headwords.
        """
        self.lemma_manager = LemmaWithVariants(kwargs.get('lemma'))
        self.lemma = kwargs.get('lemma')
        self.id = kwargs.get('id')

        wordclass = kwargs.get('wordclass')
        self.wordclass = 'NN' if wordclass == 'NNS' else wordclass

        # Only build the fallback range when it is actually needed, and
        # treat an explicit daterange=None like a missing one.
        daterange = kwargs.get('daterange')
        if daterange is None:
            daterange = DateRange(start=0, end=0)
        self.date = daterange

        # 'or []' also covers an explicit headwords=None, which would
        # otherwise fail when iterated below.
        headwords = kwargs.get('headwords') or []
        self.headwords = [hw.replace('(', '').replace(')', '')
                          for hw in headwords]

        self.hint_ids = []
        self.etyma = []
        self.variants_cache = VariantsCache()

    def set_hint_ids(self, identifiers):
        """
        Set the identifiers whose variant sets are used as hints.

        Any hint sets memoized by an earlier hint_sets() call are dropped,
        so the next call is rebuilt from the new identifiers.
        """
        self.hint_ids = identifiers
        try:
            del self._hint_variant_sets
        except AttributeError:
            # Nothing memoized yet.
            pass

    def set_etyma(self, etyma):
        """Store etyma on the instance."""
        self.etyma = etyma

    def primary_sets(self):
        """
        Return the variant sets the cache holds for self.id.

        The cache is queried on the first call only; the result is
        memoized on the instance.
        """
        try:
            return self._primary_variant_sets
        except AttributeError:
            self._primary_variant_sets = self.variants_cache.find_all(id=self.id)
            return self._primary_variant_sets

    def hint_sets(self):
        """
        Return a flat list of the variant sets found for every hint id.

        The result is memoized on the instance until set_hint_ids() is
        called again.
        """
        try:
            return self._hint_variant_sets
        except AttributeError:
            pass
        # Build into a local first, so that a failing lookup cannot leave
        # a partially filled list memoized.
        hint_variant_sets = [
            varset
            for hint_id in self.hint_ids
            for varset in self.variants_cache.find_all(id=hint_id)]
        self._hint_variant_sets = hint_variant_sets
        return hint_variant_sets

    def _dated_variant_form(self, form):
        """Return a VariantForm for form, dated from self.date."""
        return VariantForm(form,
                           self.date.start,
                           self.date.projected_end())

    def compute(self):
        """
        Populate self.lemma_manager.variants.

        Primary and hint variant sets whose lemma equals self.lemma and
        whose revised_status is not 'omitted' are passed through
        _filter_varsets(). If there are no such sets, the lemma is a
        compound and self.date.start is truthy, variants are built from
        the compound's components instead. If the list is still empty, the
        lemma itself is added. The _check_* methods then run in turn.
        """
        varsets = [varset
                   for getter in (self.primary_sets, self.hint_sets)
                   for varset in getter()
                   if (varset.lemma == self.lemma and
                       varset.revised_status != 'omitted')]
        if varsets:
            filtered_varsets = _filter_varsets(varsets,
                                               self.wordclass,
                                               self.date)
            self.lemma_manager.variants.extend(filtered_varsets)

        if (not varsets and
                self.lemma_manager.is_compound() and
                self.date.start):
            self._build_compound_variants()

        # If no variants listed, then default to the lemma itself
        if not self.lemma_manager.variants:
            self.lemma_manager.variants.append(
                self._dated_variant_form(self.lemma))

        self._check_for_omissions()
        self._check_dating()
        self._check_compound_variation()

    def _build_compound_variants(self):
        """
        Give each component of the compound its own variants, then ask the
        lemma manager to recombine the components.
        """
        for i, component in enumerate(self.lemma_manager.components()):
            if not STOPWORDS_PATTERN.search(component.lemma):
                # Prefer hint sets for this component; otherwise look the
                # component up in the cache by lemma, skipping 'omitted'
                # sets.
                component_varsets = [vs for vs in self.hint_sets()
                                     if vs.lemma == component.lemma]
                if not component_varsets:
                    component_varsets = [
                        vs for vs in
                        self.variants_cache.find_all(lemma=component.lemma)
                        if vs.revised_status != 'omitted']

                # Choose the wordclass used to filter the component's
                # sets: 'JJ' for a first component that passes the 'JJ'
                # test when the lemma is 'NN' or 'JJ'; no wordclass for a
                # component starting with '-'; else the lemma's own.
                if (i == 0 and
                        self.wordclass in ('NN', 'JJ') and
                        _test_varsets_for_wordclass(component_varsets, 'JJ')):
                    wordclass = 'JJ'
                elif component.lemma.startswith('-'):
                    wordclass = None
                else:
                    wordclass = self.wordclass
                filtered_varsets = _filter_varsets(component_varsets,
                                                   wordclass,
                                                   self.date)
                component.variants.extend(filtered_varsets)

            # Fall back to the component's own lemma, with the date range
            # widened by 10 at each end.
            if not component.variants:
                dummy_vf = VariantForm(component.lemma,
                                       self.date.start - 10,
                                       self.date.projected_end() + 10)
                component.variants.append(dummy_vf)
        self.lemma_manager.recombine_components(date=self.date,
                                                cap=COMPOUNDS_CAP)

    def _check_for_omissions(self):
        """
        Add forms that should be in the variants list but are not: the
        alternative headword, filtered forms from the primary sets, and
        (when self.date.end > 1750) the entry headwords.
        """
        manager = self.lemma_manager

        # If there's a secondary/alternative headword, check that this has
        #   ended up included in the list of variants
        if manager.alt is not None:
            manager.refresh_variants_set()
            if not manager.in_variants_list(manager.alt.dictionary_sort):
                manager.variants.append(
                    self._dated_variant_form(manager.alt.lemma))

        # NOTE(review): the original code first collected the primary sets
        #  whose lemma equals self.lemma into a local list and then never
        #  used it; all primary sets are filtered here. Possibly the
        #  lemma-matched list was meant to be passed instead - confirm.
        variant_forms = _filter_varsets(self.primary_sets(),
                                        self.wordclass,
                                        self.date)
        if variant_forms:
            manager.refresh_variants_set()
            for variant_form in variant_forms:
                if not manager.in_variants_list(lexical_sort(variant_form.form)):
                    manager.variants.append(variant_form)

        # Check that the entry headword(s) is represented; given that the ODE
        #  lemma form may be substituted for the original OED lemma form, it's
        #  possible that it's not.
        if self.date.end > 1750:
            for headword in self.headwords:
                bare_headword = headword.replace('~', '')
                matches = [vf for vf in manager.variants
                           if vf.form.replace('~', '') == bare_headword]
                if not matches:
                    manager.variants.append(
                        self._dated_variant_form(headword))

    def _check_compound_variation(self):
        """
        For a two-word, downcased, non-affix compound, make sure an
        alternative form of the lemma is among the variants: spaced for
        hyphenated, hyphenated for spaced or '~'-joined, depending on
        wordclass.
        """
        manager = self.lemma_manager
        if not (manager.is_compound() and
                not manager.is_affix() and
                manager.num_words() == 2 and
                manager.capitalization_type() == 'downcased'):
            return

        compound_variants = []
        if self.wordclass == 'NN' and '-' in self.lemma:
            compound_variants.append(self.lemma.replace('-', ' '))
        elif (self.wordclass == 'NN' and
              ' ' in self.lemma and
              '\'s ' not in self.lemma):
            compound_variants.append(self.lemma.replace(' ', '-'))
        elif (self.wordclass == 'NN' and
              '~' in self.lemma and
              len(manager.words()[1]) > 3 and
              manager.words()[1] not in DERIVATIVE_AFFIXES):
            compound_variants.append(self.lemma.replace('~', '-'))
        elif self.wordclass == 'JJ' and ' ' in self.lemma:
            compound_variants.append(self.lemma.replace(' ', '-'))

        for compound_form in compound_variants:
            matches = [vf for vf in manager.variants
                       if vf.form == compound_form]
            if not matches:
                manager.variants.append(
                    self._dated_variant_form(compound_form))

    def _check_dating(self):
        """
        If the lemma is still current, make sure that the variant
        representing the lemma is also current. (To avoid e.g. the variant
        for 'whereupon' being dated according to the dates for 'where' and
        'upon' adv., the latter of which is obsolete.)
        """
        if self.date.projected_end() > 1950:
            for variant_form in self.lemma_manager.variants:
                if (variant_form.form == self.lemma and
                        variant_form.date.end < self.date.projected_end()):
                    variant_form.date.reset('end', self.date.projected_end())