Beispiel #1
0
    def test_markconcservative(self):
        # Runs markconservative() twice with the API's hhtype triggers: first
        # on a title that should yield an hhtype, then on an entry whose
        # counterpart in the third argument already carries an hhtype.
        from pyglottolog.monsterlib._libmonster import markconservative

        # First and third argument are equal but deliberately distinct objects.
        entries = {1: ('article', {'title': 'Grammar'})}
        hhbib = {1: ('article', {'title': 'Grammar'})}
        marked = markconservative(
            entries,
            self.api.hhtypes.triggers,
            hhbib,
            self.api.hhtypes,
            self.tmp_path('marks.txt'),
            verbose=False)
        fields = marked[1][1]
        # Only the first whitespace-separated token of the hhtype is checked.
        leading_token = fields['hhtype'].split()[0]
        self.assertEqual(leading_token, 'grammar')

        # If a higher hhtype is computed, this cancels out previous computations.
        entries = {1: ('article', {'title': 'grammar', 'lgcode': 'abc'})}
        hhbib = {
            1: ('article', {'title': 'other', 'hhtype': 'other', 'lgcode': 'abc'}),
        }
        marked = markconservative(
            entries,
            self.api.hhtypes.triggers,
            hhbib,
            self.api.hhtypes,
            self.tmp_path('marks.txt'),
            verbose=False)
        fields = marked[1][1]
        self.assertNotIn('hhtype', fields)
Beispiel #2
0
    def test_markconcservative(self):
        # Runs markconservative() twice against an HHTypes instance built from
        # the test repository: once expecting an hhtype to be assigned, once
        # expecting none to be present in the result.
        from pyglottolog.monsterlib._libmonster import markconservative

        hhtypes = HHTypes(repos=self.repos)

        # First and third argument are equal but deliberately distinct objects.
        entries = {1: ('article', {'title': 'Grammar'})}
        hhbib = {1: ('article', {'title': 'Grammar'})}
        marked = markconservative(
            entries,
            hhtypes.triggers,
            hhbib,
            hhtypes,
            self.tmp_path('marks.txt'),
            verbose=False)
        fields = marked[1][1]
        # Only the first whitespace-separated token of the hhtype is checked.
        leading_token = fields['hhtype'].split()[0]
        self.assertEqual(leading_token, 'grammar')

        # If a higher hhtype is computed, this cancels out previous computations.
        entries = {1: ('article', {'title': 'grammar', 'lgcode': 'abc'})}
        hhbib = {
            1: ('article', {'title': 'other', 'hhtype': 'other', 'lgcode': 'abc'}),
        }
        marked = markconservative(
            entries,
            hhtypes.triggers,
            hhbib,
            hhtypes,
            self.tmp_path('marks.txt'),
            verbose=False)
        fields = marked[1][1]
        self.assertNotIn('hhtype', fields)
Beispiel #3
0
def main(repos=DATA_DIR, rebuild=False):
    """Compile the merged bibliography and save it as ``monster-utf8.bib``.

    The steps run in a fixed order: open (or rebuild) the bibfiles sqlite
    database, merge its entries, annotate them (macro_area, hhtype, lgcode,
    inlg), print per-field statistics, write the CSV and replacements files,
    trickle changes back into the bibfiles and finally save the result.
    Progress is reported on stdout, each line prefixed with ``time.ctime()``.

    :param repos: repository location passed to every path helper and to
        ``HHTypes``.
    :param rebuild: forwarded unchanged to ``to_sqlite``.
    """
    def announce(template):
        # Fill the single %s slot of `template` with the current time.
        print(template % time.ctime())

    collection = _bibfiles.Collection(references_path('bibtex', repos=repos))
    csv_path = references_path('monster.csv', repos=repos)
    replacements_path = build_path('monster-replacements.json', repos=repos)
    monster_bib = _bibfiles.BibFile(
        build_path('monster-utf8.bib', repos=repos),
        encoding='utf-8',
        sortkey='bibkey',
    )
    tree_path = languoids_path('tree', repos=repos)
    hhtypes = HHTypes(repos=repos)

    announce('%s open/rebuild bibfiles db')
    sqlite_file = build_path('_bibfiles.sqlite3', repos=repos).as_posix()
    database = collection.to_sqlite(sqlite_file, rebuild=rebuild)

    announce('%s compile_monster')
    entries = dict(database.merged())

    announce('%s load hh.bib')
    hh_entries = collection['hh.bib'].load()

    # Annotate with macro_area from lgcode when lgcode is assigned manually
    announce('%s macro_area_from_lgcode')
    entries = macro_area_from_lgcode(entries, tree_path)

    # Annotate with hhtype
    announce('%s annotate hhtype')
    entries = markconservative(
        entries,
        hhtypes.triggers,
        hh_entries,
        hhtypes,
        build_path('monstermark-hht.txt', repos=repos),
        rank=lambda hhtype: hhtypes[hhtype],
    )

    lang_triggers = languoids.load_triggers(tree=tree_path)

    # Annotate with lgcode
    announce('%s annotate lgcode')
    entries = markconservative(
        entries,
        lang_triggers['lgcode'],
        hh_entries,
        hhtypes,
        build_path('monstermark-lgc.txt', repos=repos),
    )

    # Annotate with inlg
    announce('%s add_inlg_e')
    entries = add_inlg_e(entries, lang_triggers['inlg'])

    # Print some statistics: total entries and how many carry each field
    tracked = ['lgcode', 'hhtype', 'macro_area']
    stats = Counter()
    print(time.ctime())
    for _, fields in entries.values():
        stats['entry'] += 1
        stats.update(name for name in tracked if name in fields)
    print("# entries", stats['entry'])
    for name in tracked:
        print("with " + name, stats[name])

    # Update the CSV with the previous mappings for later reference
    announce('%s update_previous')
    database.to_csvfile(csv_path)

    announce('%s save_replacements')
    database.to_replacements(replacements_path)

    # Trickling back
    announce('%s trickle')
    database.trickle(collection)

    # Save
    announce('%s save as utf8')
    monster_bib.save(entries, verbose=False)

    announce('%s done.')
Beispiel #4
0
def compile(api, log=None, rebuild=False):
    """Compile the merged bibliography via ``api`` and save ``monster-utf8.bib``.

    The steps run in a fixed order: open (or rebuild) the bibfiles sqlite
    database, merge its entries, annotate them (macro_area, hhtype, lgcode,
    inlg), log per-field statistics, write the CSV and replacements files,
    trickle changes back and finally save the result. Every step is reported
    at INFO level, prefixed with ``time.ctime()``.

    :param api: object providing the path helpers, ``bibfiles``, ``hhtypes``,
        ``triggers`` and ``macroarea_map`` used below.
    :param log: logger to report to; falls back to the ``'pyglottolog'``
        logger when falsy.
    :param rebuild: forwarded unchanged to ``to_sqlite``.
    """
    logger = log or logging.getLogger('pyglottolog')
    emit = logger.info

    csv_path = api.references_path('monster.csv')
    replacements_path = api.references_path('replacements.json')
    monster_bib = BibFile(
        fname=api.build_path('monster-utf8.bib'),
        encoding='utf-8',
        sortkey='bibkey',
    )

    emit('%s open/rebuild bibfiles db' % time.ctime())
    database = api.bibfiles.to_sqlite(
        api.build_path('_bibfiles.sqlite3'),
        rebuild=rebuild,
    )

    emit('%s compile_monster' % time.ctime())
    entries = dict(database.merged())

    emit('%s load hh.bib' % time.ctime())
    hh_entries = api.bibfiles['hh.bib'].load()

    # Annotate with macro_area from lgcode when lgcode is assigned manually
    emit('%s macro_area_from_lgcode' % time.ctime())
    entries = macro_area_from_lgcode(entries, api.macroarea_map)

    # Annotate with hhtype
    emit('%s annotate hhtype' % time.ctime())
    entries = markconservative(
        entries,
        api.hhtypes.triggers,
        hh_entries,
        api.hhtypes,
        api.build_path('monstermark-hht.txt'),
        rank=lambda hhtype: api.hhtypes[hhtype],
    )

    # Annotate with lgcode
    emit('%s annotate lgcode' % time.ctime())
    entries = markconservative(
        entries,
        api.triggers['lgcode'],
        hh_entries,
        api.hhtypes,
        api.build_path('monstermark-lgc.txt'),
    )

    # Annotate with inlg
    emit('%s add_inlg_e' % time.ctime())
    entries = add_inlg_e(entries, api.triggers['inlg'])

    # Log some statistics: total entries and how many carry each field
    tracked = ['lgcode', 'hhtype', 'macro_area']
    stats = Counter()
    emit(time.ctime())
    for _, fields in entries.values():
        stats['entry'] += 1
        stats.update(name for name in tracked if name in fields)
    emit("# entries {0}".format(stats['entry']))
    for name in tracked:
        emit("with {0}: {1}".format(name, stats[name]))

    # Update the CSV with the previous mappings for later reference
    emit('%s update_previous' % time.ctime())
    database.to_csvfile(csv_path)

    emit('%s save_replacements' % time.ctime())
    database.to_replacements(replacements_path)

    # Trickling back
    emit('%s trickle' % time.ctime())
    database.trickle()

    # Save
    emit('%s save as utf8' % time.ctime())
    monster_bib.save(entries)

    emit('%s done.' % time.ctime())
Beispiel #5
0
def main(repos=DATA_DIR, rebuild=False):
    """Build the merged bibliography and write it to ``monster-utf8.bib``.

    Runs, in order: sqlite open/rebuild, merge, the macro_area / hhtype /
    lgcode / inlg annotation passes, a statistics printout, the CSV and
    replacements exports, the trickle-back into the bibfiles, and the final
    save. Each stage prints one stdout line prefixed with ``time.ctime()``.

    :param repos: repository location passed to every path helper and to
        ``HHTypes``.
    :param rebuild: forwarded unchanged to ``to_sqlite``.
    """
    def stage(template):
        # `template` holds exactly one %s, which receives the current time.
        print(template % time.ctime())

    bib_collection = _bibfiles.Collection(references_path('bibtex', repos=repos))
    previous_csv = references_path('monster.csv', repos=repos)
    replacements_json = build_path('monster-replacements.json', repos=repos)
    output_bib = _bibfiles.BibFile(
        build_path('monster-utf8.bib', repos=repos),
        encoding='utf-8',
        sortkey='bibkey',
    )
    languoid_tree = languoids_path('tree', repos=repos)
    hhtypes = HHTypes(repos=repos)

    stage('%s open/rebuild bibfiles db')
    db_filename = build_path('_bibfiles.sqlite3', repos=repos).as_posix()
    bibdb = bib_collection.to_sqlite(db_filename, rebuild=rebuild)

    stage('%s compile_monster')
    merged = dict(bibdb.merged())

    stage('%s load hh.bib')
    hh_loaded = bib_collection['hh.bib'].load()

    # Annotate with macro_area from lgcode when lgcode is assigned manually
    stage('%s macro_area_from_lgcode')
    merged = macro_area_from_lgcode(merged, languoid_tree)

    # Annotate with hhtype
    stage('%s annotate hhtype')
    merged = markconservative(
        merged,
        hhtypes.triggers,
        hh_loaded,
        hhtypes,
        build_path('monstermark-hht.txt', repos=repos),
        rank=lambda hhtype: hhtypes[hhtype],
    )

    triggers_by_field = languoids.load_triggers(tree=languoid_tree)

    # Annotate with lgcode
    stage('%s annotate lgcode')
    merged = markconservative(
        merged,
        triggers_by_field['lgcode'],
        hh_loaded,
        hhtypes,
        build_path('monstermark-lgc.txt', repos=repos),
    )

    # Annotate with inlg
    stage('%s add_inlg_e')
    merged = add_inlg_e(merged, triggers_by_field['inlg'])

    # Print some statistics: total entries and how many carry each field
    counted_fields = ['lgcode', 'hhtype', 'macro_area']
    stats = Counter()
    print(time.ctime())
    for _, entry_fields in merged.values():
        stats['entry'] += 1
        stats.update(key for key in counted_fields if key in entry_fields)
    print("# entries", stats['entry'])
    for key in counted_fields:
        print("with " + key, stats[key])

    # Update the CSV with the previous mappings for later reference
    stage('%s update_previous')
    bibdb.to_csvfile(previous_csv)

    stage('%s save_replacements')
    bibdb.to_replacements(replacements_json)

    # Trickling back
    stage('%s trickle')
    bibdb.trickle(bib_collection)

    # Save
    stage('%s save as utf8')
    output_bib.save(merged, verbose=False)

    stage('%s done.')