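# NOTE: the names used below (NgramSetManager, HierarchicalModel, FlatModel,
# Calibrator, CORPUS_MANAGERS, _find_oec, adjust_to_unity, _crosscheck) are
# assumed to be imported or defined elsewhere in this module.
#
# As a rough sketch (an assumption, not the actual implementation),
# adjust_to_unity is expected to rescale a dict of ratios so that its
# values sum to 1.0:
#
#     def adjust_to_unity(ratios):
#         total = sum(ratios.values())
#         if not total:
#             return ratios
#         return {key: value / total for key, value in ratios.items()}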
class WordclassRatios(object):

    """
    Manage the way that an overall ngram score (for the main untagged
    ngram) is subdivided between different parts of speech, e.g. between
    impact n. and impact v.

    There are four possible methods:
     1. base subdivision on ratios found in OEC pos tables;
     2. base subdivision on ratios found in BNC pos tables;
     3. base subdivision on ratios found in OEC lempos tables;
     4. base subdivision on predicted frequency.

    (1)/(2) is used if the OEC/BNC tables cover all the right parts of
    speech; failing that, (3) is used if the OEC lempos tables cover all
    the right parts of speech; failing that, (4) is used.

    Special handling ('verblike') for cases where there's a VBG or VBN
    alongside a JJ or NN: these can be handled by the BNC method, but
    can't be usefully handled by any of the other methods.
    """

    def __init__(self, **kwargs):
        self.wordform = kwargs.get('form', None)
        self.lex_items = kwargs.get('lex_items', [])
        self.ngram = kwargs.get('ngram', None)
        self.ngram_manager = NgramSetManager(self.ngram,
                                             kwargs.get('tagged_ngrams', []))
        self.wordclass_model = HierarchicalModel(self.lex_items,
                                                 self.wordform)
        self.corpus_probability_sets = {
            'bnc': CORPUS_MANAGERS['bnc'].find(self.wordform),
            'oec': CORPUS_MANAGERS['oecpos'].find(self.wordform),
            'oec_lempos': _find_oec(self.lex_items),
        }
        self.calibrator = None
        self._set_ratios()

    def find_ratios(self, wordclasses, year):
        """
        Derive the appropriate set of ratios for a given decade.
        """
        ratios = {}
        # Shortcut in case of just a single wordclass
        if len(self.wordclass_model.full_set_of_wordclasses()) <= 1:
            for w in wordclasses:
                ratios[w] = 1.0 / len(wordclasses)
            for lex_item in self.lex_items:
                lex_item.wordclass_method = 'singleton'
            return ratios
        else:
            if self.calibrator:
                calibrated_value = self.calibrator.calibrate(year)
                self.wordclass_model.inject_calibration(calibrated_value)
            for w in wordclasses:
                ratios[w] = self.wordclass_model.pos_ratio(w)
            if 'NP' not in ratios and self.wordclass_model.pos_ratio('NP') > 0:
                ratios['NP'] = self.wordclass_model.pos_ratio('NP')
            return adjust_to_unity(ratios)

    def _set_ratios(self):
        self._set_partofspeech_ratios()
        self._set_base_ratios()
        self._set_group_ratios()
        self.wordclass_model.log_methods()
        self.wordclass_model = FlatModel(self.wordclass_model)
        calibrator = Calibrator(self.wordclass_model, self.ngram_manager)
        if calibrator.is_viable():
            self.calibrator = calibrator

    def _set_partofspeech_ratios(self):
        """
        Establish ratios for specific parts of speech (lowest level of
        the wordclass model, below base wordclasses).

        Note that OEC lempos probabilities can't be used here, since the
        OEC lempos tables are not granular enough (they only give
        probabilities for base wordclasses, not for specific parts of
        speech).
        """
        for group in self.wordclass_model.model().values():
            for base in group.model().values():
                method_type = None
                ratio_set = {}

                # No need to bother when there is only one part of speech
                if len(base.model()) == 1:
                    for pos in base.model().keys():
                        ratio_set[pos] = 1.0
                    method_type = 'singleton'

                if not method_type:
                    # Take ratios from OEC/BNC pos, if it's available and
                    # covers the right set of parts of speech
                    for corpus in ('oec', 'bnc'):
                        probability_set = self.corpus_probability_sets[corpus]
                        if (probability_set and
                                probability_set.covers(base.full_set_of_wordclasses())):
                            for pos in base.model().keys():
                                ratio_set[pos] = probability_set.ratio(pos)
                            method_type = corpus
                            break

                # Fall back on predictions
                if not method_type:
                    for pos, item in base.model().items():
                        ratio_set[pos] = item.predicted_frequency()
                    method_type = 'predictions'

                if ('NP' in base.model() and 'NN' in base.model() and
                        len(base.model().keys()) == 2):
                    ratio_set = self._np_adjuster(base.model(), ratio_set)

                ratio_set = adjust_to_unity(ratio_set)
                base.set_ratios(ratio_set, method_type)

    def _set_base_ratios(self):
        """
        Set ratios for base wordclasses within each group.

        Based on measured or predicted frequencies.
        """
        for group in self.wordclass_model.model().values():
            method_type = None
            ratio_set = {}

            # No need to bother when there is only one wordclass (singleton)
            if len(group.model()) == 1:
                for wc in group.model().keys():
                    ratio_set[wc] = 1.0
                method_type = 'singleton'

            if not method_type:
                # Take ratios from OEC/BNC pos, if it's available and covers
                # the right set of wordclasses
                for corpus in ('bnc', 'oec'):
                    probability_set = self.corpus_probability_sets[corpus]
                    if (probability_set and
                            probability_set.covers(group.base_set_of_wordclasses(),
                                                   base=True)):
                        for wc in group.model().keys():
                            ratio_set[wc] = probability_set.base_ratios()[wc]
                        method_type = corpus
                        break

            if not method_type:
                # Take ratios from OEC/BNC pos, if it's available and covers
                # *nearly* the right set of wordclasses. If a minor wordclass
                # is not covered, use an estimate for this.
                for corpus in ('bnc', 'oec'):
                    probability_set = self.corpus_probability_sets[corpus]
                    if (probability_set and
                            probability_set.almost_covers(group.base_set_of_wordclasses())):
                        missing = probability_set.almost_covers(
                            group.base_set_of_wordclasses())
                        est = self._estimate_missing(missing=missing,
                                                     corpus=corpus,
                                                     model=group.model())
                        if est is not None:
                            # Set the ratios of the wordclasses that
                            # *are* covered
                            for wc in group.base_set_of_wordclasses():
                                if wc != missing:
                                    ratio_set[wc] = probability_set.base_ratios()[wc]
                            # Use estimate as the ratio of the missing
                            # wordclass
                            ratio_set[missing] = est
                            method_type = corpus
                            break

            if not method_type:
                # Take ratios from OEC lempos, if it's available and covers
                # the right set of wordclasses
                probability_set = self.corpus_probability_sets['oec_lempos']
                if (probability_set and
                        probability_set.covers(group.base_set_of_wordclasses(),
                                               base=True) and
                        not group.is_verblike()):
                    for wc in group.model().values():
                        ratio_set[wc.wordclass] = probability_set.sum_subcategories(
                            list(wc.model().keys()))
                    method_type = 'oeclempos'

            if not method_type:
                # Take ratios from OEC lempos, if it's available and covers
                # *nearly* the right set of wordclasses. If a minor wordclass
                # is not covered, use an estimate for this.
                probability_set = self.corpus_probability_sets['oec_lempos']
                if (probability_set and
                        probability_set.almost_covers(group.base_set_of_wordclasses()) and
                        not group.is_verblike()):
                    missing = probability_set.almost_covers(
                        group.base_set_of_wordclasses())
                    est = self._estimate_missing(missing=missing, trace=False,
                                                 corpus='oec',
                                                 model=group.model())
                    if est is not None:
                        # Set the ratios of the wordclasses that *are* covered
                        for wc in group.base_set_of_wordclasses():
                            if wc != missing:
                                ratio_set[wc] = probability_set.base_ratios()[wc]
                        # Use estimate as the ratio of the missing wordclass
                        ratio_set[missing] = est
                        method_type = 'oeclempos'

            # Fall back on predictions
            if not method_type:
                for wc, item in group.model().items():
                    ratio_set[wc] = item.predicted_frequency()
                ratio_set = _crosscheck(ratio_set, group.model())
                method_type = 'predictions'

            ratio_set = adjust_to_unity(ratio_set)
            group.set_ratios(ratio_set, method_type)

    def _set_group_ratios(self):
        """
        Set ratios for main groups.

        Based on measured or predicted frequencies.

        Groups are either 'core' (NN + VB + JJ) or 'other'
        (everything else).
        """
        method_type = None
        if len(self.wordclass_model.model()) == 1:
            ratio_set = {grp: 1.0 for grp in self.wordclass_model.model().keys()}
            method_type = 'singleton'
        if not method_type:
            probability_set = self.corpus_probability_sets['bnc']
            if (probability_set and
                    self.wordclass_model.groupset() == probability_set.groupset()):
                ratio_set = {grp: probability_set.group_ratios()[grp]
                             for grp in self.wordclass_model.groupset()}
                method_type = 'bnc'
        if not method_type:
            probability_set = self.corpus_probability_sets['oec_lempos']
            if (probability_set and
                    self.wordclass_model.groupset() == probability_set.groupset() and
                    probability_set.covers(
                        self.wordclass_model.base_set_of_wordclasses(),
                        base=True) and
                    probability_set.sum_ratios(
                        self.wordclass_model.base_set_of_wordclasses()) > 0.9):
                ratio_set = {grp: probability_set.group_ratios()[grp]
                             for grp in self.wordclass_model.groupset()}
                method_type = 'oeclempos'
        if not method_type:
            ratio_set = {pos: item.predicted_frequency()
                         for pos, item in self.wordclass_model.model().items()}
            ratio_set = _crosscheck(ratio_set, self.wordclass_model.model())
            method_type = 'predictions'
        ratio_set = adjust_to_unity(ratio_set)
        self.wordclass_model.set_ratios(ratio_set, method_type)

    def method(self):
        return self.wordclass_model.method()

    def _estimate_missing(self, **kwargs):
        """
        If all but one of the wordclasses are accounted for by the corpus
        probability set, and the missing wordclass appears to be minor
        (based on predicted frequency), then we'll just estimate the
        missing wordclass.

        This is most likely to capture VBG+JJ+NN sets where either the
        JJ or NN is missing in BNC. May also capture some VBN+JJ+NN sets,
        where the NN is vanishingly rare.
        """
        missing = kwargs.get('missing')
        model = kwargs.get('model')
        corpus = kwargs.get('corpus', 'bnc').lower()
        trace = kwargs.get('trace', False)

        # The missing item has to have a predicted frequency ratio
        # below this threshold
        if corpus == 'bnc':
            threshold = 0.2
        else:  # 'oec'
            threshold = 0.1

        # How significant is the missing wordclass, as a proportion of
        # the total predicted frequencies? The ratio needs to be low, in
        # order for it to be plausible that it's missing from the BNC data
        sum_predictions = sum([b.predicted_frequency() for b in model.values()])
        if sum_predictions:
            predicted_ratio = (model[missing].predicted_frequency() /
                               sum_predictions)
        else:
            predicted_ratio = 1

        if trace:
            print('-------------------------------------------------')
            print(self.wordform)
            print('\t%s:' % corpus.upper())
            probability_set = self.corpus_probability_sets[corpus]
            for wordclass, f in probability_set.base_ratios().items():
                print('\t\t%s\t%0.3g' % (wordclass, f))
            print('\tpredictions:')
            for b in model.values():
                print('\t\t%s\t%0.3g' % (b.wordclass, b.predicted_frequency()))
            if predicted_ratio < threshold:
                print('---> %s = %0.3g' % (missing, predicted_ratio))
            else:
                print('FAILED (%s = %0.3g)' % (missing, predicted_ratio))
            print('-------------------------------------------------')

        if predicted_ratio < threshold:
            return predicted_ratio
        else:
            return None

    def _np_adjuster(self, model, ratio_set):
        """
        Where a set consists of NN and NP, make sure that the NN is not
        scoring artificially high.

        If the NN's calculated score (based on the ratio already derived)
        is higher than its predicted frequency (based on size), then the
        ratio is recalculated from the NN's predicted frequency.
        """
        ratio_set = adjust_to_unity(ratio_set)
        ngram_total = self.ngram.frequency('1970-2000')
        nn_freq_calculated = ngram_total * ratio_set['NN']
        nn_freq_predicted = model['NN'].predicted_frequency()
        if nn_freq_predicted < nn_freq_calculated:
            nn_revised_ratio = nn_freq_predicted / ngram_total
            ratio_set = {'NN': nn_revised_ratio, 'NP': 1 - nn_revised_ratio}
        return ratio_set