def index_frequency_files(in_dir, out_file):
    # Map each letter to its files, and each file to the labels it contains.
    entry_list = defaultdict(lambda: defaultdict(list))
    iterator = FrequencyIterator(in_dir=in_dir,
                                 message='Compiling index')
    for e in iterator.iterate():
        entry_list[e.letter][e.filename].append(e.label)

    doc = etree.Element('letters')
    # Attach the XSL processing instruction ahead of the root element.
    doc.addprevious(XSLPI)

    for letter in sorted(entry_list.keys()):
        num_files = len(entry_list[letter])
        num_entries = sum(len(labels)
                          for labels in entry_list[letter].values())
        letter_node = etree.SubElement(doc, 'letterSet',
                                       letter=letter,
                                       files=str(num_files),
                                       entries=str(num_entries))

        for filename in sorted(entry_list[letter].keys()):
            fnode = etree.SubElement(letter_node, 'file',
                                     name=filename,
                                     letter=letter,
                                     entries=str(len(entry_list[letter][filename])))
            # Record the first and last entry labels in the file.
            t1 = etree.SubElement(fnode, 'first')
            t1.text = entry_list[letter][filename][0]
            t2 = etree.SubElement(fnode, 'last')
            t2.text = entry_list[letter][filename][-1]

    with open(out_file, 'w', encoding='utf-8') as filehandle:
        filehandle.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        filehandle.write(etree.tounicode(doc.getroottree(),
                                         pretty_print=True))
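
This snippet leans on lxml (addprevious and tounicode are lxml-only) and collections.defaultdict; FrequencyIterator and XSLPI come from the surrounding package. A hedged sketch of the assumed imports, plus a hypothetical XSLPI and invocation:

from collections import defaultdict
from lxml import etree

# Hypothetical stylesheet reference; the real XSLPI is defined elsewhere
# in the package.
XSLPI = etree.ProcessingInstruction(
    'xml-stylesheet', 'type="text/xsl" href="index.xsl"')

index_frequency_files(in_dir='frequency_xml', out_file='index.xml')
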
Example 2
    def store_values(self):
        print('Loading coordinates...')
        coords = Coordinates()
        print('Checking language overrides...')
        overrides = LanguageOverrides().list_language_overrides()
        print('Loading OED vital statistics...')
        vitalstats = VitalStatisticsCache()

        entries = []
        iterator = FrequencyIterator(message='Listing entries')
        for entry in iterator.iterate():
            # Single-word lemmas only: skip compounds and hyphenated forms.
            if (entry.has_frequency_table() and
                    ' ' not in entry.lemma and
                    '-' not in entry.lemma):
                language_breadcrumb = vitalstats.find(entry.id, field='language')
                year = vitalstats.find(entry.id, field='first_date') or 0

                if entry.id in overrides:
                    # A manual override trumps the breadcrumb.
                    languages = [overrides[entry.id]]
                elif language_breadcrumb is not None:
                    languages = [l for l in language_breadcrumb.split('/')
                                 if coords.is_listed(l)
                                 or l == 'English']
                else:
                    languages = ['unspecified']

                if languages:
                    # pick the most granular level (e.g. 'Icelandic' in
                    #  preference to 'Germanic')
                    language = languages[-1]
                    # Find frequency for this word
                    freq_table = entry.frequency_table()
                    frequency = freq_table.frequency(period='modern')
                    band = freq_table.band(period='modern')
                    row = (entry.lemma,
                           entry.label,
                           entry.id,
                           year,
                           frequency,
                           band,
                           language)
                    entries.append(row)

        entries.sort(key=lambda row: row[2])    # sort by entry ID

        with open(self.out_file, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerows(entries)
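
Note on the granularity rule above: a language breadcrumb runs from coarse to fine, so taking the last element keeps the most specific label. A toy illustration with a made-up breadcrumb:

breadcrumb = 'Indo-European/Germanic/Icelandic'    # made-up value
languages = breadcrumb.split('/')
print(languages[-1])    # 'Icelandic', preferred over 'Germanic'
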
Example 3
    def build_currency_data(self):
        self.vs = VitalStatisticsCache()
        iterator = FrequencyIterator(in_dir=self.in_dir,
                                     letters=None,
                                     message='Getting data')
        self.candidates = []
        self.candidates.append(list(RawCurrencyData.headers))
        for e in iterator.iterate():
            # Unrevised, non-obsolete entries whose last date falls inside
            # the target window; prefixes and suffixes are skipped.
            if (e.end and
                    RawCurrencyData.start <= e.end <= RawCurrencyData.end and
                    not e.is_obsolete() and
                    not self.vs.find(e.id, field='revised') and
                    not e.lemma.startswith('-') and
                    not e.lemma.endswith('-')):
                freq_table = e.frequency_table()
                if freq_table is not None:
                    freqs = [freq_table.frequency(period=p)
                             for p in RawCurrencyData.periods]
                    delta = self.find_delta(freq_table)
                else:
                    freqs = [0.0 for p in RawCurrencyData.periods]
                    delta = 1.0
                definition = e.definition or ''
                definition = '.' + definition

                row = [
                    e.id,
                    e.label,
                    e.wordclass(),
                    self.vs.find(e.id, field='header'),
                    self.vs.find(e.id, field='subject'),
                    self.vs.find(e.id, field='region'),
                    self.vs.find(e.id, field='usage'),
                    definition,
                    e.start,
                    e.end,
                    self.vs.find(e.id, field='quotations'),
                    self.vs.find(e.id, field='weighted_size'),
                    self.is_linked_to_odo(e),
                    self.is_logically_current(e),
                ]
                row.extend(['%0.2g' % f for f in freqs])
                row.append('%0.2g' % delta)
                self.candidates.append(tuple(row))
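
The '%0.2g' conversions in the last few lines keep two significant figures (not two decimal places), switching to scientific notation for large magnitudes:

print('%0.2g' % 0.012345)   # 0.012
print('%0.2g' % 123.45)     # 1.2e+02
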
Example 4
    def measure_ratios(self):
        ratios = defaultdict(list)
        iterator = FrequencyIterator(in_dir=self.in_dir,
                                     letters=None,
                                     message='Analysing p.o.s. ratios')
        for e in iterator.iterate():
            for wcs in e.wordclass_sets():
                # Only nouns and verbs with frequency data are measured.
                if ((wcs.wordclass == 'NN' or wcs.wordclass == 'VB') and
                        wcs.has_frequency_table()):
                    total = wcs.frequency_table().frequency()
                    local = defaultdict(int)
                    for wtype in wcs.types():
                        fpm = wtype.frequency_table().frequency()
                        if fpm > 0:
                            local[wtype.wordclass] += fpm
                    for wordclass, fpm in local.items():
                        ratios[wordclass].append(total / fpm)

        for wordclass in ratios:
            print('%s\t%0.4g' % (wordclass, numpy.median(ratios[wordclass])))
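
A toy check of the ratio bookkeeping, with invented figures, shows why the median is the right summary here:

import numpy

ratios = {'NN': [1.2, 1.5, 9.0], 'VB': [2.0, 2.2]}
for wordclass, values in ratios.items():
    # The 9.0 outlier barely moves the NN median (1.5), though it
    # would drag the mean up to about 3.9.
    print('%s\t%0.4g' % (wordclass, numpy.median(values)))
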
Example 5
    def store_values(self):
        def nullvalues():
            return {y: 0 for y in YEARS}
        languages = defaultdict(nullvalues)
        num_entries = defaultdict(nullvalues)
        vitalstats = VitalStatisticsCache()
        iterator = FrequencyIterator(message='Measuring language frequency')
        for entry in iterator.iterate():
            if (entry.has_frequency_table() and
                    ' ' not in entry.lemma and
                    '-' not in entry.lemma):
                freq_table = entry.frequency_table()
                ltext = (vitalstats.find(entry.id, field='indirect_language')
                         or 'unspecified')
                langs = ltext.split('/')
                for year in YEARS:
                    frequency = freq_table.frequency(year=year, interpolated=True)
                    for language in langs:
                        languages[language][year] += frequency
                        if entry.start < year:
                            num_entries[language][year] += 1

        rows1 = [['language'] + YEARS]
        for lang in sorted(languages.keys()):
            rows1.append([lang] + [languages[lang][y] for y in YEARS])

        rows2 = [['language'] + YEARS]
        for lang in sorted(languages.keys()):
            rows2.append([lang] + [num_entries[lang][y] for y in YEARS])

        with open(self.csv1, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerows(rows1)

        with open(self.csv2, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerows(rows2)
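
The nullvalues factory above simply hands defaultdict a fresh zeroed year-map whenever a new language key appears. A compact equivalent, with a hypothetical YEARS grid:

from collections import defaultdict

YEARS = [1800, 1850, 1900, 1950, 2000]    # hypothetical grid
languages = defaultdict(lambda: {y: 0 for y in YEARS})
languages['Latin'][1900] += 0.25          # zeroed map is created on demand
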
Example 6
def xml_to_csv(in_dir, out_file):
    iterator = FrequencyIterator(in_dir=in_dir, message='Populating .csv file')
    entries = []
    for e in iterator.iterate():
        if not e.has_frequency_table():
            continue

        freq_table = e.frequency_table()
        frequency = freq_table.frequency(period='modern')
        band = freq_table.band(period='modern')
        label = e.label
        entry_id = e.id
        # Main entries need no node ID; subentries record their xrnode.
        if e.is_main_entry:
            node_id = None
        else:
            node_id = e.xrnode

        row = (entry_id, node_id, label, frequency, band)
        entries.append(row)

    with open(out_file, 'w', newline='') as filehandle:
        csvwriter = csv.writer(filehandle)
        csvwriter.writerows(entries)
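
A hypothetical invocation, plus a quick read-back of the first few rows to sanity-check the output (paths are made up):

import csv

xml_to_csv(in_dir='frequency_xml', out_file='frequencies.csv')
with open('frequencies.csv', newline='') as fh:
    for entry_id, node_id, label, frequency, band in list(csv.reader(fh))[:5]:
        print(label, frequency, band)
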
Example 7
def store_rankings(**kwargs):
    in_dir = kwargs.get('in_dir')
    out_file = kwargs.get('out_file') or DEFAULT_FILE

    iterator = FrequencyIterator(in_dir=in_dir,
                                 letters=None,
                                 message='Compiling frequency ranking')

    entryrank = []
    for e in iterator.iterate():
        if e.has_frequency_table():
            entryrank.append((
                e.label,
                e.lemma,
                e.xrid,
                e.frequency_table().frequency(),
            ))

    # Rank by frequency, highest first.
    entryrank.sort(key=lambda row: row[3], reverse=True)
    with open(out_file, 'w', newline='') as filehandle:
        csv_writer = csv.writer(filehandle)
        csv_writer.writerows(entryrank)
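
Since the function reads keyword arguments with fallbacks, both calls below are valid (paths are hypothetical):

store_rankings(in_dir='frequency_xml', out_file='ranking.csv')
store_rankings(in_dir='frequency_xml')    # falls back to DEFAULT_FILE
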
Example 8
def populate_lemmas():

    ranking = EntryRank()
    iterator = FrequencyIterator(in_dir=INPUT_DIR, message='Populating database')
    count = 0
    entries = []
    for e in iterator.iterate():
        count += 1
        # Non-obsolete entries get a notional end date in the future.
        if e.is_obsolete():
            last_date = e.end
        else:
            last_date = 2050

        # Only main entries are ranked; anything unranked falls back to
        # a notional bottom rank.
        if e.is_main_entry:
            try:
                rank = ranking.entry(e.id).rank
            except AttributeError:
                rank = 250000
        else:
            rank = None

        xrnode = None if e.is_main_entry else e.xrnode

        entry = Lemma(
            xrnode=xrnode,
            label=e.label[:LABEL_LENGTH],
            alphasort=e.alphasort()[:ALPHASORT_LENGTH],
            definition=(e.definition or '')[:DEFINITION_LENGTH],
            dictsort=count,
            json=e.todict(),
            wordclass=e.wordclass() or 'X',
            startdate=e.start,
            enddate=last_date,
            rank=rank,
            entry_id=e.id,
            mainentry=e.is_main_entry,
        )

        # Frequency + frequency-band fields
        if not e.has_frequency_table():
            for year in FREQUENCY_FIELDS:
                setattr(entry, 'f%d' % year, 0)
            entry.fmodern = 0
            for year in BAND_FIELDS:
                setattr(entry, 'fb%d' % year, NULL_FREQUENCY_BAND)
            entry.fbmodern = NULL_FREQUENCY_BAND
        else:
            freq_table = e.frequency_table()
            for year in FREQUENCY_FIELDS:
                setattr(entry, 'f%d' % year, freq_table.frequency(year=year))
            entry.fmodern = freq_table.frequency(period='modern')
            for year in BAND_FIELDS:
                setattr(entry, 'fb%d' % year, freq_table.band(year=year))
            entry.fbmodern = freq_table.band(period='modern')

        entries.append(entry)
        # Flush to the database in batches of roughly 1,000 rows.
        if len(entries) > 1000:
            Lemma.objects.bulk_create(entries)
            entries = []

    # Flush the final partial batch.
    Lemma.objects.bulk_create(entries)
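
The buffering above is the standard bulk-insert idiom: accumulate unsaved instances, flush every ~1,000, then flush the remainder. Distilled into a helper (a sketch; the rows iterable is a hypothetical producer of unsaved Lemma instances):

def bulk_insert(rows, batch_size=1000):
    buffer = []
    for obj in rows:
        buffer.append(obj)
        if len(buffer) >= batch_size:
            Lemma.objects.bulk_create(buffer)   # one multi-row INSERT
            buffer = []
    if buffer:
        Lemma.objects.bulk_create(buffer)       # final partial batch
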
Example 9
    def analyse(self):
        vs = VitalStatisticsCache()
        self.track = {
            'band_distribution': defaultdict(lambda: 0),
            'total_frequency': defaultdict(lambda: 0),
            'high_frequency': [],
            'high_delta_up': [],
            'high_delta_down': [],
            'delta_dist': defaultdict(lambda: 0),
            'plural_to_singular': [],
            'high_frequency_rare': [],
            'frequency_to_size_high': [],
            'frequency_to_size_low': [],
        }

        iterator = FrequencyIterator(in_dir=self.in_dir,
                                     letters=None,
                                     message='Analysing frequency data')
        for e in iterator.iterate():
            if not e.has_frequency_table():
                # No frequency data: bucket into the catch-all band.
                self.track['band_distribution'][16] += 1
            else:
                ft = e.frequency_table()
                self.track['band_distribution'][ft.band(period='modern')] += 1

                if ft.band(period='modern') <= 5:
                    self.track['high_frequency'].append({
                        'label': e.label,
                        'id': e.id,
                        'ftable': ft
                    })

                if ft.frequency(period='modern') > 0.5 and e.start < 1750:
                    delta = ft.delta('1800-49', 'modern')
                    if delta is not None:
                        self.log_delta(delta, reciprocal=True)
                        if delta > 2:
                            self.track['high_delta_up'].append({
                                'label': e.label,
                                'id': e.id,
                                'ftable': ft
                            })

                if (ft.frequency(period='1800-49') > 0.5 and
                        not e.is_obsolete()):
                    delta = ft.delta('1800-49', 'modern')
                    if delta is not None and delta < 0.5:
                        self.track['high_delta_down'].append({
                            'label': e.label,
                            'id': e.id,
                            'ftable': ft
                        })
                        self.log_delta(delta)

                if ' ' not in e.lemma and '-' not in e.lemma:
                    for p in ft.data.keys():
                        self.track['total_frequency'][p] += \
                            ft.frequency(period=p)

                header = vs.find(e.id, 'header')
                if ft.frequency() > 0.01 and self.is_marked_rare(header):
                    self.track['high_frequency_rare'].append({
                        'label': e.label,
                        'id': e.id,
                        'header': header,
                        'fpm': ft.frequency()
                    })

                if ft.frequency() > 1:
                    self.compare_singular_to_plural(e)

                quotations = vs.find(e.id, 'quotations')
                if ft.frequency() >= 0.0001 and quotations > 0:
                    ratio = log(ft.frequency()) / quotations
                    if ratio > 0.2:
                        self.track['frequency_to_size_high'].append({
                            'label': e.label,
                            'id': e.id,
                            'quotations': quotations,
                            'fpm': ft.frequency(),
                            'ratio': ratio,
                        })
                    if quotations >= 20:
                        self.track['frequency_to_size_low'].append({
                            'label': e.label,
                            'id': e.id,
                            'quotations': quotations,
                            'fpm': ft.frequency(),
                            'ratio': ratio,
                        })
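
For completeness, the imports this method relies on beyond the project's own classes are presumably:

from collections import defaultdict
from math import log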