def store_values(self):
        print('Loading coordinates...')
        coords = Coordinates()
        print('Checking language overrides...')
        overrides = LanguageOverrides().list_language_overrides()
        print('Loading OED vital statistics...')
        vitalstats = VitalStatisticsCache()

        entries = []
        iterator = FrequencyIterator(message='Listing entries')
        for entry in iterator.iterate():
            if (entry.has_frequency_table() and
                    ' ' not in entry.lemma and
                    '-' not in entry.lemma):
                language_breadcrumb = vitalstats.find(entry.id, field='language')
                year = vitalstats.find(entry.id, field='first_date') or 0

                if language_breadcrumb is not None:
                    languages = [l for l in language_breadcrumb.split('/')
                                 if coords.is_listed(l)
                                 or l == 'English']
                else:
                    languages = ['unspecified', ]
                if entry.id in overrides:
                    languages = [overrides[entry.id], ]

                if languages:
                    # pick the most granular level (e.g. 'Icelandic' in
                    #  preference to 'Germanic')
                    language = languages[-1]
                    # Find frequency for this word
                    freq_table = entry.frequency_table()
                    frequency = freq_table.frequency(period='modern')
                    band = freq_table.band(period='modern')
                    row = (entry.lemma,
                           entry.label,
                           entry.id,
                           year,
                           frequency,
                           band,
                           language)
                    entries.append(row)

        entries = sorted(entries, key=lambda entry: entry[2])

        with open(self.out_file, 'w') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerows(entries)
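# A minimal standalone sketch (invented breadcrumb and listing, not part of the
# class above) of the "most granular level" rule used when picking a language:
# breadcrumbs run from broad to specific, so the last element surviving the
# filter is the one recorded.
LISTED = {'Germanic', 'Norse', 'Icelandic'}    # stand-in for coords.is_listed()
breadcrumb = 'Germanic/Norse/Icelandic'
languages = [l for l in breadcrumb.split('/') if l in LISTED or l == 'English']
assert languages[-1] == 'Icelandic'            # preferred over 'Germanic'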
Example #2
    def __init__(self, **kwargs):
        self.dict_name = kwargs.get('dictName')
        self.oed_in = kwargs.get('oedIn', None)
        self.oed_out = kwargs.get('oedOut', None)
        self.odo_in = kwargs.get('odoIn', None)
        self.odo_out = kwargs.get('odoOut', None)

        self.oed_index = VitalStatisticsCache()

        self.odo_index = Distiller(dictName=self.dict_name)
        self.odo_index.load_distilled_file()
    def store_values(self):
        def nullvalues():
            return {y: 0 for y in YEARS}
        languages = defaultdict(nullvalues)
        num_entries = defaultdict(nullvalues)
        vitalstats = VitalStatisticsCache()
        iterator = FrequencyIterator(message='Measuring language frequency')
        for entry in iterator.iterate():
            if (entry.has_frequency_table() and
                ' ' not in entry.lemma and
                '-' not in entry.lemma):
                freq_table = entry.frequency_table()
                ltext = vitalstats.find(entry.id, field='indirect_language') or 'unspecified'
                langs = ltext.split('/')
                for year in YEARS:
                    frequency = freq_table.frequency(year=year, interpolated=True)
                    for language in langs:
                        languages[language][year] += frequency
                        if entry.start < year:
                            num_entries[language][year] += 1

        rows1 = []
        rows1.append(['language', ] + YEARS)
        for lang in sorted(languages.keys()):
            row = [lang, ] + [languages[lang][y] for y in YEARS]
            rows1.append(row)

        rows2 = []
        rows2.append(['language', ] + YEARS)
        for lang in sorted(languages.keys()):
            row = [lang, ] + [num_entries[lang][y] for y in YEARS]
            rows2.append(row)

        with open(self.csv1, 'w') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerows(rows1)

        with open(self.csv2, 'w') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerows(rows2)
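# Standalone sketch of the accumulation pattern used above, with made-up YEARS
# and frequencies: defaultdict(nullvalues) gives every new language a zeroed
# per-year dict, so totals can be summed without any key checks.
from collections import defaultdict

YEARS = [1800, 1900, 2000]                     # assumed sample years
totals = defaultdict(lambda: {y: 0 for y in YEARS})
for lang, year, freq in [('French', 1900, 0.4), ('French', 2000, 0.7)]:
    totals[lang][year] += freq
assert totals['French'][2000] == 0.7 and totals['French'][1800] == 0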
    def build_currency_data(self):
        self.vs = VitalStatisticsCache()
        iterator = FrequencyIterator(in_dir=self.in_dir,
                                     letters=None,
                                     message='Getting data')
        self.candidates = []
        self.candidates.append(list(RawCurrencyData.headers))
        for e in iterator.iterate():
            if (e.end and
                    RawCurrencyData.start <= e.end <= RawCurrencyData.end and
                    not e.is_obsolete() and
                    not self.vs.find(e.id, field='revised') and
                    not e.lemma.startswith('-') and
                    not e.lemma.endswith('-')):
                if e.frequency_table() is not None:
                    freqs = [e.frequency_table().frequency(period=p)
                             for p in RawCurrencyData.periods]
                    delta = self.find_delta(e.frequency_table())
                else:
                    freqs = [0.0 for _ in RawCurrencyData.periods]
                    delta = 1.0
                definition = e.definition or ''
                definition = '.' + definition

                row = [
                    e.id,
                    e.label,
                    e.wordclass(),
                    self.vs.find(e.id, field='header'),
                    self.vs.find(e.id, field='subject'),
                    self.vs.find(e.id, field='region'),
                    self.vs.find(e.id, field='usage'),
                    definition,
                    e.start,
                    e.end,
                    self.vs.find(e.id, field='quotations'),
                    self.vs.find(e.id, field='weighted_size'),
                    self.is_linked_to_odo(e),
                    self.is_logically_current(e),
                ]
                row.extend(['%0.2g' % f for f in freqs])
                row.append('%0.2g' % delta)
                self.candidates.append(tuple(row))
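# Quick illustration of the '%0.2g' formatting applied to the frequency columns
# above: two significant figures, switching to exponent notation for large or
# very small values.
assert '%0.2g' % 0.03456 == '0.035'
assert '%0.2g' % 123.4 == '1.2e+02'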
Example #5
def refine_index():
    """
    Refine the data build by index_raw_forms(), in particular removing minor
    homographs (both lemma-level homographs and wordform-level homographs).

    Also swaps in standard lemma forms, main-sense definitions, and
    thesaurus links.
    """
    # Determine which alien variants are okay to keep (because they don't
    #  shadow main forms). - Alien types are wordforms which begin with
    #  a different letter from their parent lemma, and so wouldn't be
    #  correctly filtered by the main letter-by-letter filtering process.
    stdout.write("Filtering alien types...\n")
    allowed_alien_types = _filter_alien_types()
    stdout.write("...done\n")

    # Initialize the resources that will be used for look-ups
    vitalstats = VitalStatisticsCache()
    main_sense_checker = MainSensesCache(with_definitions=True)

    for letter in string.ascii_lowercase:
        stdout.write("Refining index for %s...\n" % letter)
        blocks = list(_raw_pickle_iterator(letter))

        # Remove duplicate types, so that only the version
        #  in the block with the highest frequency is retained.
        # Cluster together typeunits with the same wordform + wordclass
        standardmap = defaultdict(lambda: defaultdict(list))
        for i, block in enumerate(blocks):
            for typeunit in block.standard_types:
                standardmap[typeunit.wordform][typeunit.wordclassflat].append((i, typeunit))
        # Go through each wordclass-cluster for each wordform, and pick
        #  the highest-frequency instance in each case
        for wordform, wordclasses in standardmap.items():
            winners = []
            for candidates in wordclasses.values():
                # Sort by frequency (highest first)
                candidates.sort(key=lambda c: c[1].f2000, reverse=True)
                # Remove the first candidate (the highest-frequency one);
                #  this is the one we'll keep.
                winners.append(candidates.pop(0))
                # Delete all the rest
                for index, typeunit in candidates:
                    blocks[index].standard_types.discard(typeunit)
            # We should now be left with the highest-scoring wordclasses
            #  for the current wordform (e.g. the highest-frequency
            #  homograph for spell_VB and the highest-frequency
            #  homograph for one spell_NN). We now need to decide which
            #  of these to keep and which to discard
            discards = _discardable_homographs(winners)
            for index, typeunit in discards:
                blocks[index].standard_types.discard(typeunit)

        # Remove variant types which either duplicate each other
        #  or that shadow a standard type. (Standard types are always
        #  given precedence.)
        varmap = defaultdict(list)
        for i, block in enumerate(blocks):
            for typeunit in block.variant_types:
                varmap[typeunit.wordform].append((i, typeunit, block.f2000))
        for wordform, candidates in varmap.items():
            if wordform not in standardmap:
                # Sort by the frequency of the parent lemma
                candidates.sort(key=lambda c: c[2], reverse=True)
                # Remove the first candidate (the highest-frequency
                #  one); this is the one we'll keep.
                candidates.pop(0)
            # Delete all the rest
            for index, typeunit, _ in candidates:
                blocks[index].variant_types.discard(typeunit)

        # Remove any alien types that are not allowed (because they
        #  shadow other standard types or variants).
        for block in blocks:
            to_be_deleted = set()
            for typeunit in block.alien_types:
                if typeunit.wordform not in allowed_alien_types:
                    to_be_deleted.add(typeunit)
            for typeunit in to_be_deleted:
                block.alien_types.discard(typeunit)

        # Remove any blocks whose standard_types and
        #  variant_types sets have now been completely emptied
        # For the remainder, turn standard_forms and variant_forms
        #  from sets into lists
        blocks = [_listify_forms(b) for b in blocks if b.standard_types or b.variant_types]

        blocks_filtered = []
        for block in blocks:
            language = vitalstats.find(block.refentry, field="indirect_language")
            if not language and block.start and block.start < 1200:
                language = "West Germanic"
            block = _replace_language(block, language)

            # Acquire main-sense data for this block (which will be
            #  used to swap in a new definition and a thesaurus link)
            if block.type == "entry":
                ms_block_data = main_sense_checker.find_block_data(block.refentry, block.refid)
                if ms_block_data and ms_block_data.senses:
                    main_sense_data = ms_block_data.senses[0]
                    main_sense_confidence = ms_block_data.confidence()
                else:
                    main_sense_data = None
                    main_sense_confidence = None
            else:
                main_sense_data = None
                main_sense_confidence = None

            # Swap in thesaurus-class link
            block = _replace_htclass(block, main_sense_data, main_sense_confidence)

            if block.type == "entry":
                # Make sure we use the OED headword, not the headword
                #  that's been used in GEL (which could be the version
                #  of the headword found in ODE or NOAD).
                headword = vitalstats.find(block.refentry, field="headword")
                if headword and headword != block.lemma:
                    block = _replace_lemma(block, headword)
                # Make sure we use the best main-sense definition
                if main_sense_data and main_sense_data.definition:
                    block = _replace_definition(block, main_sense_data.definition)
            blocks_filtered.append(block)

        out_file = os.path.join(FORM_INDEX_DIR, "refined", letter + ".json")
        with open(out_file, "w") as filehandle:
            for block in blocks_filtered:
                filehandle.write(json.dumps(block) + "\n")
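# Standalone sketch (invented tuples) of the de-duplication step above: cluster
# candidate types by wordform + wordclass, keep the highest-frequency member of
# each cluster, and discard the rest.
from collections import defaultdict

candidates = [('spell', 'NN', 4.1), ('spell', 'NN', 0.3), ('spell', 'VB', 2.2)]
clusters = defaultdict(list)
for wordform, wordclass, f2000 in candidates:
    clusters[(wordform, wordclass)].append(f2000)
winners = {key: max(freqs) for key, freqs in clusters.items()}
assert winners[('spell', 'NN')] == 4.1 and winners[('spell', 'VB')] == 2.2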
Example #6
class LinkUpdater(object):
    error_message = '!ERROR entry not found'

    def __init__(self, **kwargs):
        self.dict_name = kwargs.get('dictName')
        self.oed_in = kwargs.get('oedIn', None)
        self.oed_out = kwargs.get('oedOut', None)
        self.odo_in = kwargs.get('odoIn', None)
        self.odo_out = kwargs.get('odoOut', None)

        self.oed_index = VitalStatisticsCache()

        self.odo_index = Distiller(dictName=self.dict_name)
        self.odo_index.load_distilled_file()

    def update_odo(self, **kwargs):
        valid_links_only = kwargs.get('validLinksOnly', False)
        tree = etree.parse(self.odo_in)
        for entry in tree.findall('./e'):
            lexid = entry.get('lexid', None)
            odo_label = entry.find('./label')
            odo_label_text = self.odo_index.headword_by_id(lexid) or LinkUpdater.error_message
            etree.strip_tags(odo_label, 'i', 'sup', 'sub', 'hm')
            odo_label.text = odo_label_text
            link = entry.find('./linkSet/link')

            if link is not None:
                refentry = link.get('refentry', '0')
                refid = link.get('refid', '0')
                oed_label_text = self.oed_index.find(refentry, field='label') or LinkUpdater.error_message
                etree.strip_tags(link, 'i', 'sup', 'sub', 'hm')
                link.text = oed_label_text

            if (valid_links_only and
                (link is None or
                 link.text == LinkUpdater.error_message or
                 odo_label.text == LinkUpdater.error_message or
                 not check_match(link.text, odo_label.text))):
                entry.getparent().remove(entry)

        with open(self.odo_out, 'w') as filehandle:
            filehandle.write(etree.tostring(tree,
                                            pretty_print=True,
                                            encoding='unicode'))

    def update_oed(self, **kwargs):
        valid_links_only = kwargs.get('validLinksOnly', False)
        tree = etree.parse(self.oed_in)
        for entry in tree.findall('./link'):
            oed_id = entry.get('sourceID', None)
            oed_label_text = self.oed_index.find(oed_id, field='label') or LinkUpdater.error_message
            source_label = entry.find('./sourceLabel')
            etree.strip_tags(source_label, 'i', 'sup', 'sub', 'hm')
            source_label.text = oed_label_text

            lexid = entry.get('targetID', None)
            ode_label_text = self.odo_index.headword_by_id(lexid) or LinkUpdater.error_message
            target_label = entry.find('./targetLabel')
            etree.strip_tags(target_label, 'i', 'sup', 'sub', 'hm')
            target_label.text = ode_label_text

            if (valid_links_only and
                (oed_id is None or
                 lexid is None or
                 source_label.text == LinkUpdater.error_message or
                 target_label.text == LinkUpdater.error_message or
                 not check_match(source_label.text, target_label.text))):
                entry.getparent().remove(entry)

        with open(self.oed_out, 'w') as filehandle:
            filehandle.write(etree.tostring(tree,
                                            pretty_print=True,
                                            encoding='unicode'))
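    # Hypothetical usage sketch of LinkUpdater, based only on the keyword
    #  arguments read in __init__ above; the file names are placeholders.
    #
    #   updater = LinkUpdater(dictName='ode',
    #                         oedIn='oed_links_in.xml', oedOut='oed_links_out.xml',
    #                         odoIn='odo_links_in.xml', odoOut='odo_links_out.xml')
    #   updater.update_oed(validLinksOnly=True)
    #   updater.update_odo(validLinksOnly=True)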
    def analyse(self):
        vs = VitalStatisticsCache()
        self.track = {
            'band_distribution': defaultdict(lambda: 0),
            'total_frequency': defaultdict(lambda: 0),
            'high_frequency': [],
            'high_delta_up': [],
            'high_delta_down': [],
            'delta_dist': defaultdict(lambda: 0),
            'plural_to_singular': [],
            'high_frequency_rare': [],
            'frequency_to_size_high': [],
            'frequency_to_size_low': [],
        }

        iterator = FrequencyIterator(in_dir=self.in_dir,
                                     letters=None,
                                     message='Analysing frequency data')
        for e in iterator.iterate():
            if not e.has_frequency_table():
                self.track['band_distribution'][16] += 1
            else:
                ft = e.frequency_table()
                self.track['band_distribution'][ft.band(period='modern')] += 1

                if ft.band(period='modern') <= 5:
                    self.track['high_frequency'].append({
                        'label': e.label,
                        'id': e.id,
                        'ftable': ft
                    })

                if ft.frequency(period='modern') > 0.5 and e.start < 1750:
                    delta = ft.delta('1800-49', 'modern')
                    if delta is not None:
                        self.log_delta(delta, reciprocal=True)
                        if delta > 2:
                            self.track['high_delta_up'].append({
                                'label': e.label,
                                'id': e.id,
                                'ftable': ft
                            })

                if (ft.frequency(period='1800-49') > 0.5 and
                        not e.is_obsolete()):
                    delta = ft.delta('1800-49', 'modern')
                    if delta is not None and delta < 0.5:
                        self.track['high_delta_down'].append({
                            'label': e.label,
                            'id': e.id,
                            'ftable': ft
                        })
                        self.log_delta(delta)

                if ' ' not in e.lemma and '-' not in e.lemma:
                    for p in ft.data.keys():
                        self.track['total_frequency'][p] += \
                            ft.frequency(period=p)

                if (ft.frequency() > 0.01 and
                        self.is_marked_rare(vs.find(e.id, 'header'))):
                    self.track['high_frequency_rare'].append({
                        'label': e.label,
                        'id': e.id,
                        'header': vs.find(e.id, 'header'),
                        'fpm': ft.frequency()
                    })

                if ft.frequency() > 1:
                    self.compare_singular_to_plural(e)

                if ft.frequency() >= 0.0001 and vs.find(e.id, 'quotations') > 0:
                    ratio = log(ft.frequency()) / vs.find(e.id, 'quotations')
                    if ratio > 0.2:
                        self.track['frequency_to_size_high'].append({
                            'label': e.label,
                            'id': e.id,
                            'quotations': vs.find(e.id, 'quotations'),
                            'fpm': ft.frequency(),
                            'ratio': ratio,
                        })
                    if vs.find(e.id, 'quotations') >= 20:
                        self.track['frequency_to_size_low'].append({
                            'label': e.label,
                            'id': e.id,
                            'quotations': vs.find(e.id, 'quotations'),
                            'fpm': ft.frequency(),
                            'ratio': ratio,
                        })
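    # Worked example (assumed numbers) of the frequency-to-size ratio used in
    #  analyse() above: the natural log of frequency-per-million, divided by
    #  the quotation count; entries with a ratio above 0.2 are flagged as
    #  unusually frequent for their size.
    #
    #   log(50.0) / 12  ->  3.912 / 12  ->  ~0.33  (> 0.2, so flagged)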
    def refine_index(self):
        allowed_alien_types = _filter_alien_types()

        vitalstats = VitalStatisticsCache()
        main_sense_checker = MainSensesCache(with_definitions=True)
        for letter in string.ascii_lowercase:
            print('Refining index for %s...' % letter)
            blocks = list(raw_pickle_iterator(letter))

            # Remove duplicate types, so that only the version
            #  in the block with the highest frequency is retained.
            standardmap = defaultdict(list)
            for i, block in enumerate(blocks):
                for wordform in block.standard_types:
                    standardmap[wordform].append((i, block.frequency))
            for wordform, candidates in standardmap.items():
                if len(candidates) > 1:
                    # Sort by frequency
                    candidates.sort(key=lambda c: c[1], reverse=True)
                    # Remove the first candidate (the highest-frequency
                    #  one); this is the one we'll keep.
                    candidates.pop(0)
                    # Delete all the rest
                    for index in [c[0] for c in candidates]:
                        blocks[index].standard_types.discard(wordform)

            # Remove variant types which either duplicate each other
            #  or that shadow a standard type (standard types are always
            #  given precedence).
            varmap = defaultdict(list)
            for i, block in enumerate(blocks):
                for wordform in block.variant_types:
                    varmap[wordform].append((i, block.frequency))
            for wordform, candidates in varmap.items():
                if wordform not in standardmap:
                    # Sort by frequency
                    candidates.sort(key=lambda c: c[1], reverse=True)
                    # Remove the first candidate (the highest-frequency
                    #  one); this is the one we'll keep.
                    candidates.pop(0)
                # Delete all the rest
                for index in [c[0] for c in candidates]:
                    blocks[index].variant_types.discard(wordform)

            # Remove any alien types that are not allowed (because they
            #  shadow other standard types or variants).
            for block in blocks:
                to_be_deleted = set()
                for wordform in block.alien_types:
                    if wordform not in allowed_alien_types:
                        to_be_deleted.add(wordform)
                for wordform in to_be_deleted:
                    block.alien_types.discard(wordform)

            # Remove any blocks whose standard_types and
            #  variant_types sets have now been completely emptied
            # For the remainder, turn standard_forms and variant_forms
            #  from sets into lists
            blocks = [_listify_forms(b) for b in blocks if b.standard_types
                      or b.variant_types]

            blocks_filtered = []
            for block in blocks:
                language = vitalstats.find(block.refentry,
                                           field='indirect_language')
                if not language and block.start and block.start < 1200:
                    language = 'West Germanic'
                block = _replace_language(block, language)

                if block.type == 'entry':
                    # Make sure we use the OED headword, not the headword
                    #  that's been used in GEL (which could be the version
                    #  of the headword found in ODE or NOAD).
                    headword = vitalstats.find(block.refentry,
                                               field='headword')
                    if headword and headword != block.lemma:
                        block = _replace_lemma(block, headword)
                    # Make sure we use the correct main-sense definition
                    main_sense = main_sense_checker.find_main_sense_data(
                        block.refentry,
                        block.refid)
                    if main_sense and main_sense.definition:
                        block = _replace_definition(block, main_sense.definition)
                blocks_filtered.append(block)

            out_file = os.path.join(FORM_INDEX_DIR, 'refined', letter + '.json')
            with open(out_file, 'w') as filehandle:
                for block in blocks_filtered:
                    filehandle.write(json.dumps(block) + '\n')
class RawCurrencyData(object):

    start = frequencyconfig.RANGE_START
    end = frequencyconfig.RANGE_END

    periods = ('1800-49', '1850-99', '1900-49', '1950-99', '2000-')
    headers = ['id', 'label', 'wordclass', 'header', 'subject', 'region',
               'usage', 'definition', 'start', 'end', 'quotations',
               'weighted size', 'ODO-linked', 'logically current']
    headers.extend(periods)
    headers.append('frequency change')

    # parameters for testing logical currency
    logical = {
        'date': frequencyconfig.LOGICAL_CURRENCY_DATE,
        'size': frequencyconfig.LOGICAL_CURRENCY_SIZE,
        'suffixes1': ['-' + j for j in
                      frequencyconfig.LOGICAL_CURRENCY_SUFFIXES1.split('|')],
        # Note: 'suffixes2' reuses LOGICAL_CURRENCY_SUFFIXES1 here; a separate
        #  LOGICAL_CURRENCY_SUFFIXES2 setting may have been intended.
        'suffixes2': ['-' + j for j in
                      frequencyconfig.LOGICAL_CURRENCY_SUFFIXES1.split('|')],
    }

    def __init__(self, **kwargs):
        self.in_dir = kwargs.get('in_dir')

    def build_currency_data(self):
        self.vs = VitalStatisticsCache()
        iterator = FrequencyIterator(in_dir=self.in_dir,
                                     letters=None,
                                     message='Getting data')
        self.candidates = []
        self.candidates.append(list(RawCurrencyData.headers))
        for e in iterator.iterate():
            if (e.end and
                    RawCurrencyData.start <= e.end <= RawCurrencyData.end and
                    not e.is_obsolete() and
                    not self.vs.find(e.id, field='revised') and
                    not e.lemma.startswith('-') and
                    not e.lemma.endswith('-')):
                if e.frequency_table() is not None:
                    freqs = [e.frequency_table().frequency(period=p)
                             for p in RawCurrencyData.periods]
                    delta = self.find_delta(e.frequency_table())
                else:
                    freqs = [0.0 for _ in RawCurrencyData.periods]
                    delta = 1.0
                definition = e.definition or ''
                definition = '.' + definition

                row = [
                    e.id,
                    e.label,
                    e.wordclass(),
                    self.vs.find(e.id, field='header'),
                    self.vs.find(e.id, field='subject'),
                    self.vs.find(e.id, field='region'),
                    self.vs.find(e.id, field='usage'),
                    definition,
                    e.start,
                    e.end,
                    self.vs.find(e.id, field='quotations'),
                    self.vs.find(e.id, field='weighted_size'),
                    self.is_linked_to_odo(e),
                    self.is_logically_current(e),
                ]
                row.extend(['%0.2g' % f for f in freqs])
                row.append('%0.2g' % delta)
                self.candidates.append(tuple(row))

    def is_logically_current(self, e):
        etyma = self.vs.find(e.id, field='etyma')
        if len(etyma) == 2:
            if etyma[1][0] in RawCurrencyData.logical['suffixes1']:
                parent_id = etyma[0][1]
                tier = 'high'
            elif etyma[1][0] in RawCurrencyData.logical['suffixes2']:
                parent_id = etyma[0][1]
                tier = 'low'
            else:
                tier = None
            if (tier is not None and
                    (self.vs.find(parent_id, field='last_date') > RawCurrencyData.end or
                    (self.vs.find(parent_id, field='last_date') > RawCurrencyData.logical['date'] and
                    self.vs.find(parent_id, field='quotations') > RawCurrencyData.logical['size']))):
                return tier
        return None
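    # Worked example (invented dates and sizes; RANGE_END and the config values
    #  below are stand-ins) of the currency test above: a suffixed derivative
    #  counts as logically current when its parent lemma is either attested
    #  beyond RANGE_END, or attested recently enough and with enough quotations.
    #
    #   parent last_date = 1990, quotations = 40
    #   RANGE_END = 2000, LOGICAL_CURRENCY_DATE = 1850, LOGICAL_CURRENCY_SIZE = 10
    #   1990 > 2000?  no;  1990 > 1850 and 40 > 10?  yes  ->  tier returned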

    def is_linked_to_odo(self, e):
        if (self.vs.find(e.id, field='ode') is not None or
                self.vs.find(e.id, field='noad') is not None):
            return True
        else:
            return False

    def write(self, filepath):
        with open(filepath, 'w') as csvfile:
            csvw = csv.writer(csvfile)
            csvw.writerows(self.candidates)

    def find_delta(self, ft):
        f1 = ft.frequency(period='1800-99')
        f2 = ft.frequency(period='1950-99')
        if f1 == 0:
            d = 1.0
        elif f2 == 0:
            d = 0.0001 / f1
        else:
            d = f2 / f1
        if d < 1:
            d = -(1 / d)
        return d
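# Worked example (assumed frequencies) of find_delta above: the ratio of the
# 1950-99 frequency to the 1800-99 frequency, reported as a negative reciprocal
# when the word has declined.
f1, f2 = 2.0, 0.5          # frequency per million, 1800-99 vs 1950-99
d = f2 / f1                # 0.25: fell to a quarter of its earlier frequency
if d < 1:
    d = -(1 / d)           # reported as -4.0, i.e. a four-fold decline
assert d == -4.0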