Exemple #1
0
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.

    :param args: passed through unchanged to ``gbs_func``.
    """
    compute_language_sources()
    # Commit the work done so far and open a fresh transaction before the
    # 'update' step below runs.
    transaction.commit()
    transaction.begin()

    gbs_func('update', args)
Exemple #2
0
def google_books(**kw):  # pragma: no cover
    """Parse the command line and run the selected ``gbs_func`` command.

    Two arguments are registered with ``parsed_args``: a positional
    ``command`` (help text: download|verify|update|cleanup) and an optional
    ``--api-key`` whose default is ``kw['key']`` if given, else the
    ``GBS_API_KEY`` environment variable.

    :param kw: forwarded to ``parsed_args``; ``key`` and ``sources`` are \
    additionally read here.
    :raises argparse.ArgumentError: if the command is ``download`` and no \
    API key is available.
    """
    env_key = os.environ.get("GBS_API_KEY")
    command_spec = (("command",), {"help": "download|verify|update|cleanup"})
    api_key_spec = (("--api-key",), {"default": kw.get("key", env_key)})

    args = parsed_args(command_spec, api_key_spec, **kw)

    # Only the download command requires an API key.
    is_download = args.command == "download"
    if is_download and not args.api_key:
        raise argparse.ArgumentError(None, "no API key found for download")

    sources = kw.get("sources")
    with transaction.manager:
        gbs_func(args.command, args, sources)
Exemple #3
0
def google_books(**kw):  # pragma: no cover
    """Parse the command line and run the selected ``gbs_func`` command.

    Two arguments are registered with ``parsed_args``: a positional
    ``command`` (help text: download|verify|update|cleanup) and an optional
    ``--api-key`` whose default is ``kw['key']`` if given, else the
    ``GBS_API_KEY`` environment variable.

    :param kw: forwarded to ``parsed_args``; ``key`` and ``sources`` are \
    additionally read here.
    :raises argparse.ArgumentError: if the command is ``download`` and no \
    API key is available.
    """
    env_key = os.environ.get('GBS_API_KEY')
    command_spec = (('command',), {'help': 'download|verify|update|cleanup'})
    api_key_spec = (('--api-key',), {'default': kw.get('key', env_key)})

    args = parsed_args(command_spec, api_key_spec, **kw)

    # Only the download command requires an API key.
    is_download = args.command == 'download'
    if is_download and not args.api_key:
        raise argparse.ArgumentError(None, 'no API key found for download')

    sources = kw.get('sources')
    with transaction.manager:
        gbs_func(args.command, args, sources)
Exemple #4
0
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.

    The following cached values are (re)computed:

    - per segment (a parameter joined to at least one valueset):
      ``frequency``, ``in_inventories`` and ``total_inventories``;
    - per inventory: the ``count_<segment_class>`` counters;
    - per genus: the ``ficon`` and ``gicon`` icon names;
    - per variety: ``count_inventories``.

    :param args: passed through unchanged to the optional update helpers.
    """
    q = DBSession.query(common.Parameter).join(common.ValueSet).distinct()
    n = q.count()
    m = DBSession.query(models.Inventory).count()
    for segment in q:
        #
        # TODO: this ratio (number of inventories a segment appears in by number of
        # distinct segment total) doesn't make much sense, does it?
        #
        segment.frequency = float(len(segment.valuesets)) / float(n)
        segment.in_inventories = len(segment.valuesets)
        segment.total_inventories = m

    for inventory in DBSession.query(models.Inventory).options(
            joinedload_all(common.Contribution.valuesets, common.ValueSet.parameter)
    ):
        # Only the tone counter is reset, and only for inventories whose name
        # does not contain '(UPSID)'.
        # NOTE(review): the other count_* attributes are not reset here, so a
        # repeated run adds to whatever value is already stored - confirm
        # that this is intended.
        if '(UPSID)' not in inventory.name:
            inventory.count_tone = 0

        for vs in inventory.valuesets:
            attr = 'count_' + vs.parameter.segment_class
            # Segment classes without a matching counter attribute are ignored.
            if hasattr(inventory, attr):
                val = getattr(inventory, attr) or 0
                setattr(inventory, attr, val + 1)

    # Two independent icon cycles: ``ficon`` advances once per group of genera
    # sharing a description, ``gicon`` advances once per genus.
    ficons = cycle(ORDERED_ICONS)
    gicons = cycle(ORDERED_ICONS)
    for _description, genera in groupby(
            DBSession.query(models.Genus).order_by(models.Genus.description),
            lambda g: g.description):
        # The builtin next() works on Python 2.6+ and Python 3, whereas the
        # iterator method ``.next()`` only exists on Python 2.
        ficon = next(ficons).name
        for g in genera:
            g.ficon = ficon
            g.gicon = next(gicons).name

    for variety in DBSession.query(models.Variety).options(
            joinedload(models.Variety.inventories)):
        variety.count_inventories = len(variety.inventories)

    # ``astroman`` is not defined in this function; presumably a module-level
    # switch for the slower external updates - confirm.
    if astroman:
        ia_func('update', args)
        gbs_func('update', args)
        print('added', add_wikipedia_urls(args), 'wikipedia urls')
Exemple #5
0
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.

    The following cached values are (re)computed:

    - per segment (a parameter joined to at least one valueset):
      ``frequency``, ``in_inventories`` and ``total_inventories``;
    - per inventory: the ``count_<segment_class>`` counters;
    - per genus: the ``ficon`` and ``gicon`` icon names;
    - per variety: ``count_inventories``.

    :param args: only used by the currently disabled update helpers at the \
    end of the function.
    """
    q = DBSession.query(common.Parameter).join(common.ValueSet).distinct()
    n = q.count()
    m = DBSession.query(models.Inventory).count()
    for segment in q:
        #
        # TODO: this ratio (number of inventories a segment appears in by number of
        # distinct segment total) doesn't make much sense, does it?
        #
        segment.frequency = float(len(segment.valuesets)) / float(n)
        segment.in_inventories = len(segment.valuesets)
        segment.total_inventories = m

    for inventory in DBSession.query(models.Inventory).options(
            joinedload_all(common.Contribution.valuesets,
                           common.ValueSet.parameter)):
        # Only the tone counter is reset, and only for inventories whose name
        # does not contain '(UPSID)'.
        # NOTE(review): the other count_* attributes are not reset here, so a
        # repeated run adds to whatever value is already stored - confirm
        # that this is intended.
        if '(UPSID)' not in inventory.name:
            inventory.count_tone = 0

        for vs in inventory.valuesets:
            attr = 'count_' + vs.parameter.segment_class
            # Segment classes without a matching counter attribute are ignored.
            if hasattr(inventory, attr):
                val = getattr(inventory, attr) or 0
                setattr(inventory, attr, val + 1)

    # Two independent icon cycles: ``ficon`` advances once per group of genera
    # sharing a description, ``gicon`` advances once per genus.
    ficons = cycle(ORDERED_ICONS)
    gicons = cycle(ORDERED_ICONS)
    for _description, genera in groupby(
            DBSession.query(models.Genus).order_by(models.Genus.description),
            lambda g: g.description):
        # The builtin next() works on Python 2.6+ and Python 3, whereas the
        # iterator method ``.next()`` only exists on Python 2.
        ficon = next(ficons).name
        for g in genera:
            g.ficon = ficon
            g.gicon = next(gicons).name

    for variety in DBSession.query(models.Variety).options(
            joinedload(models.Variety.inventories)):
        variety.count_inventories = len(variety.inventories)

    # Deliberately disabled: the constant-false guard keeps the external
    # update calls in the source without ever running them.
    if 0:
        ia_func('update', args)
        gbs_func('update', args)
        print('added', add_wikipedia_urls(args), 'wikipedia urls')
Exemple #6
0
def prime_cache(args):  # pragma: no cover
    """Apply recorded value changes, then recompute cached values.

    We use a versioned session to insert the changes in value assignment.

    Steps, in order:

    1. values whose 2008 value differs from the current ``datapoint`` row
       are switched to the new domain element and stamped with ``E2011``;
    2. rows of ``corrections_2013.tab`` are applied and stamped with
       ``E2013``;
    3. a fixed list of ``issues.issue<N>`` functions is run, committing
       after each one;
    4. ``representation`` per parameter and ``iso_codes`` per language are
       recomputed;
    5. language sources are recomputed and ``gbs_func('update', ...)`` runs.

    :param args: provides ``data_file`` and is passed to ``get_vs2008`` and \
    ``gbs_func``.
    """
    #
    # compute the changes from 2008 to 2011:
    #
    vs2008 = get_vs2008(args)
    for row in DB.execute("select * from datapoint"):
        key = (row["language_id"], row["feature_id"])
        old_value = vs2008.get(key)
        new_value = row["value_numeric"]
        # Datapoints without a (truthy) 2008 value are left untouched.
        if old_value and old_value != new_value:
            valueset = (
                VersionedDBSession.query(common.ValueSet)
                .join(common.Language)
                .join(common.Parameter)
                .filter(common.Parameter.id == row["feature_id"])
                .filter(common.Language.id == row["language_id"])
                .one()
            )
            value = valueset.values[0]
            assert value.domainelement.number == old_value
            for de in valueset.parameter.domain:
                if de.number == new_value:
                    value.domainelement = de
                    break
            # Fails if the parameter's domain has no element numbered new_value.
            assert value.domainelement.number == new_value
            valueset.updated = E2011
            value.updated = E2011
            VersionedDBSession.flush()

    for row in reader(args.data_file("corrections_2013.tab"), namedtuples=True, newline="\r"):
        valueset = (
            VersionedDBSession.query(common.ValueSet)
            .join(common.Language)
            .join(common.Parameter)
            .filter(common.Parameter.id == row.feature)
            .filter(common.Language.id == row.wals_code)
            .one()
        )
        value = valueset.values[0]

        # Correction has already been applied.
        if value.domainelement.number == int(row.new):
            print("**** old news", valueset.language.id, valueset.parameter.id)
            continue

        # The stored value matches neither old nor new; report it, but still
        # apply the correction.
        if value.domainelement.number != int(row.old):
            print("--->", valueset.language.id, valueset.parameter.id, value.domainelement.number)
        for de in valueset.parameter.domain:
            if de.number == int(row.new):
                value.domainelement = de
                break
        assert value.domainelement.number == int(row.new)
        valueset.updated = E2013
        value.updated = E2013
        VersionedDBSession.flush()
    print("corrections 2013 done")

    # A separate name is used for the looked-up function so the loop variable
    # is not rebound from a string to a callable.
    for issue_id in ["0", "9", "10", "11", "13", "14", "15", "16", "17", "19", "20", "24", "26", "27", "28"]:
        apply_issue = getattr(issues, "issue" + issue_id)
        apply_issue(VersionedDBSession, E2013)
        VersionedDBSession.flush()
        transaction.commit()
        transaction.begin()

    #
    # TODO: these must be recomputed as well, after migrations!
    #
    # cache number of languages for a parameter:
    # (the query is ordered by parameter_pk so that groupby sees each
    # parameter's valuesets as one consecutive run)
    for parameter, valuesets in groupby(
        DBSession.query(common.ValueSet).order_by(common.ValueSet.parameter_pk), lambda vs: vs.parameter
    ):
        parameter.representation = str(len({v.language_pk for v in valuesets}))
    print("recomputation of representation done")
    transaction.commit()
    transaction.begin()

    # cache iso codes for languages:
    for language in DBSession.query(common.Language).options(
        joinedload_all(common.Language.languageidentifier, common.LanguageIdentifier.identifier)
    ):
        iso_codes = {
            identifier.name
            for identifier in language.identifiers
            if identifier.type == common.IdentifierType.iso.value
        }
        language.iso_codes = ", ".join(sorted(iso_codes))
    # Message typo fixed ("ecomputation" -> "recomputation").
    print("recomputation of iso codes done")
    transaction.commit()
    transaction.begin()

    compute_language_sources()
    transaction.commit()
    transaction.begin()

    gbs_func("update", args)
def prime_cache(args):
    """Recompute cached values and generate the pie-chart icons.

    Steps, in order: per-feature ``representation`` and
    ``wals_representation``; language sources and number of values; the
    ``lexifier`` of languages (and their lects) from the valuesets of the
    parameter with id ``'0'``; frequency and pie icons for values and
    valuesets; a default icon for domain elements that have none; finally
    ``gbs_func('update', args)``.

    :param args: provides ``log`` and is passed to ``gbs_func``.
    """
    #
    # TODO: relate survey chapter reference with language!
    #
    # Both dicts only remember which icons were already drawn in this run.
    icons = {}
    frequencies = {}

    args.log.info('computing wals representation')
    for feature in DBSession.query(common.Parameter).options(
        joinedload(common.Parameter.valuesets)
    ):
        feature.representation = len(feature.valuesets)
        if feature.wals_id:
            # The JSON file is looked up under static/wals/ next to the apics
            # package; its 'features' lists are counted across all layers.
            with open(path(apics.__file__).dirname().joinpath(
                'static', 'wals', '%sA.json' % feature.wals_id
            ), 'r') as fp:
                data = json.load(fp)
            feature.wals_representation = sum([len(l['features']) for l in data['layers']])

    args.log.info('computing language sources')
    compute_language_sources((common.ContributionReference, 'contribution'))
    compute_number_of_values()

    # Derive the lexifier from the valuesets of the parameter with id '0'.
    # Languages with a truthy language_pk are skipped here and receive the
    # lexifier through the ``lects`` loop of the language that lists them.
    for valueset in DBSession.query(common.ValueSet)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == '0')\
            .options(joinedload(common.ValueSet.language)):
        if valueset.language.language_pk:
            continue
        if len(valueset.values) > 1:
            valueset.language.lexifier = 'Other'
        else:
            # NOTE(review): values[0] raises IndexError for a valueset
            # without values - confirm that this cannot occur.
            if valueset.values[0].domainelement.name == 'Other':
                valueset.language.lexifier = 'Other'
            else:
                valueset.language.lexifier = valueset.values[0].domainelement.name.replace('-based', '')
        for lect in valueset.language.lects:
            lect.lexifier = valueset.language.lexifier

    args.log.info('creating icons')
    for valueset in DBSession.query(common.ValueSet).options(
        joinedload(common.ValueSet.parameter),
        joinedload_all(common.ValueSet.values, common.Value.domainelement)
    ):
        values = sorted(list(valueset.values), key=lambda v: v.domainelement.number)
        # The frequencies of one valueset must add up to 100, within 1.
        assert abs(sum(v.frequency for v in values) - 100) < 1
        fracs = []
        colors = []

        for v in values:
            color = v.domainelement.jsondata['color']
            frequency = round(v.frequency)
            # A frequency that rounds to 0 is treated as an error.
            assert frequency

            # Draw the black/white frequency icon once per distinct rounded
            # frequency. figure/axes/pie/save are not defined in this
            # function; presumably plotting helpers - confirm.
            if frequency not in frequencies:
                figure(figsize=(0.4, 0.4))
                axes([0.1, 0.1, 0.8, 0.8])
                coll = pie((int(100 - frequency), frequency), colors=('w', 'k'))
                coll[0][0].set_linewidth(0.5)
                save('freq-%s' % frequency)
                frequencies[frequency] = True

            # Both assignments replace the whole jsondata dict.
            v.jsondata = {'frequency_icon': 'freq-%s.png' % frequency}
            fracs.append(frequency)
            colors.append(color)
            v.domainelement.jsondata = {
                'color': color, 'icon': 'pie-100-%s.png' % color}

        # Every value of a valueset must have a different color.
        assert len(colors) == len(set(colors))
        # Tuples are hashable and serve as the key into ``icons``.
        fracs, colors = tuple(fracs), tuple(colors)

        # Icon name: 'pie-' followed by '<frequency>-<color>' pairs joined
        # with '-'.
        basename = 'pie-'
        basename += '-'.join('%s-%s' % (f, c) for f, c in zip(fracs, colors))
        valueset.update_jsondata(icon=basename + '.png')
        # Draw each distinct combination of frequencies and colors only once.
        if (fracs, colors) not in icons:
            figure(figsize=(0.4, 0.4))
            axes([0.1, 0.1, 0.8, 0.8])
            coll = pie(
                tuple(reversed(fracs)),
                colors=['#' + color for color in reversed(colors)])
            for wedge in coll[0]:
                wedge.set_linewidth(0.5)
            save(basename)
            icons[(fracs, colors)] = True

            # An SVG version of the same icon is written into icons_dir.
            with open(str(icons_dir.joinpath(basename + '.svg')), 'w') as fp:
                fp.write(svg.pie(fracs, ['#' + color for color in colors], width=40))

    # Domain elements that got no icon in the loop above receive the
    # 'pie-100-<color>.png' name.
    for de in DBSession.query(common.DomainElement):
        if not de.jsondata.get('icon'):
            de.update_jsondata(icon='pie-100-%s.png' % de.jsondata['color'])

    gbs_func('update', args)
Exemple #8
0
def prime_cache(args):
    """Apply recorded value changes, then recompute cached values.

    We use a versioned session to insert the changes in value assignment.

    Steps, in order:

    1. values whose 2008 value differs from the current ``datapoint`` row
       are switched to the new domain element and stamped with ``E2011``;
    2. rows of ``corrections_2013.tab`` are applied and stamped with
       ``E2013``;
    3. a fixed list of ``issues.issue<N>`` functions is run, committing
       after each one;
    4. ``representation`` per parameter and ``iso_codes`` per language are
       recomputed;
    5. language sources are recomputed and ``gbs_func('update', ...)`` runs.

    Progress messages are printed with single-argument ``print(...)`` calls
    and ``%`` formatting, which produce the same output on Python 2 and
    Python 3 (the former ``print`` statements were Python 2 only).

    :param args: provides ``data_file`` and is passed to ``get_vs2008`` and \
    ``gbs_func``.
    """
    #
    # compute the changes from 2008 to 2011:
    #
    vs2008 = get_vs2008(args)
    for row in DB.execute("select * from datapoint"):
        key = (row['language_id'], row['feature_id'])
        old_value = vs2008.get(key)
        new_value = row['value_numeric']
        # Datapoints without a (truthy) 2008 value are left untouched.
        if old_value and old_value != new_value:
            valueset = VersionedDBSession.query(common.ValueSet)\
                .join(common.Language)\
                .join(common.Parameter)\
                .filter(common.Parameter.id == row['feature_id'])\
                .filter(common.Language.id == row['language_id'])\
                .one()
            value = valueset.values[0]
            assert value.domainelement.number == old_value
            for de in valueset.parameter.domain:
                if de.number == new_value:
                    value.domainelement = de
                    break
            # Fails if the parameter's domain has no element numbered new_value.
            assert value.domainelement.number == new_value
            valueset.updated = E2011
            value.updated = E2011
            VersionedDBSession.flush()

    for row in reader(args.data_file('corrections_2013.tab'), namedtuples=True, newline='\r'):
        valueset = VersionedDBSession.query(common.ValueSet)\
            .join(common.Language)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == row.feature)\
            .filter(common.Language.id == row.wals_code)\
            .one()
        value = valueset.values[0]

        # Correction has already been applied.
        if value.domainelement.number == int(row.new):
            print('**** old news %s %s' % (
                valueset.language.id, valueset.parameter.id))
            continue

        # The stored value matches neither old nor new; report it, but still
        # apply the correction.
        if value.domainelement.number != int(row.old):
            print('---> %s %s %s' % (
                valueset.language.id,
                valueset.parameter.id,
                value.domainelement.number))
        for de in valueset.parameter.domain:
            if de.number == int(row.new):
                value.domainelement = de
                break
        assert value.domainelement.number == int(row.new)
        valueset.updated = E2013
        value.updated = E2013
        VersionedDBSession.flush()
    print('corrections 2013 done')

    # A separate name is used for the looked-up function so the loop variable
    # is not rebound from a string to a callable.
    for issue_id in ['0', '9', '10', '11', '13', '14', '15', '16', '17', '19', '20', '24', '26', '27', '28']:
        apply_issue = getattr(issues, 'issue' + issue_id)
        apply_issue(VersionedDBSession, E2013)
        VersionedDBSession.flush()
        transaction.commit()
        transaction.begin()

    #
    # TODO: these must be recomputed as well, after migrations!
    #
    # cache number of languages for a parameter:
    # (the query is ordered by parameter_pk so that groupby sees each
    # parameter's valuesets as one consecutive run)
    for parameter, valuesets in groupby(
            DBSession.query(common.ValueSet).order_by(common.ValueSet.parameter_pk),
            lambda vs: vs.parameter):
        parameter.representation = str(len(set(v.language_pk for v in valuesets)))
    print('recomputation of representation done')
    transaction.commit()
    transaction.begin()

    # cache iso codes for languages:
    for language in DBSession.query(common.Language).options(joinedload_all(
        common.Language.languageidentifier, common.LanguageIdentifier.identifier
    )):
        iso_codes = []
        for identifier in language.identifiers:
            if identifier.type == common.IdentifierType.iso.value:
                iso_codes.append(identifier.name)
        language.iso_codes = ', '.join(sorted(set(iso_codes)))
    print('recomputation of iso codes done')
    transaction.commit()
    transaction.begin()

    compute_language_sources()
    transaction.commit()
    transaction.begin()

    gbs_func('update', args)
Exemple #9
0
def prime_cache(args):  # pragma: no cover
    """Apply recorded value changes, then recompute cached values.

    We use a versioned session to insert the changes in value assignment.

    Steps, in order:

    1. values whose 2008 value differs from the current ``datapoint`` row
       are switched to the new domain element and stamped with ``E2011``;
    2. rows of ``corrections_2013.tab`` are applied and stamped with
       ``E2013``;
    3. a fixed list of ``issues.issue<N>`` functions is run, committing
       after each one;
    4. ``representation`` per parameter and ``iso_codes`` per language are
       recomputed;
    5. language sources are recomputed and ``gbs_func('update', ...)`` runs.

    :param args: provides ``data_file`` and is passed to ``get_vs2008`` and \
    ``gbs_func``.
    """
    #
    # compute the changes from 2008 to 2011:
    #
    vs2008 = get_vs2008(args)
    for row in DB.execute("select * from datapoint"):
        key = (row['language_id'], row['feature_id'])
        old_value = vs2008.get(key)
        new_value = row['value_numeric']
        # Datapoints without a (truthy) 2008 value are left untouched.
        if old_value and old_value != new_value:
            valueset = VersionedDBSession.query(common.ValueSet)\
                .join(common.Language)\
                .join(common.Parameter)\
                .filter(common.Parameter.id == row['feature_id'])\
                .filter(common.Language.id == row['language_id'])\
                .one()
            value = valueset.values[0]
            assert value.domainelement.number == old_value
            for de in valueset.parameter.domain:
                if de.number == new_value:
                    value.domainelement = de
                    break
            # Fails if the parameter's domain has no element numbered new_value.
            assert value.domainelement.number == new_value
            valueset.updated = E2011
            value.updated = E2011
            VersionedDBSession.flush()

    for row in reader(args.data_file('corrections_2013.tab'),
                      namedtuples=True,
                      newline='\r'):
        valueset = VersionedDBSession.query(common.ValueSet)\
            .join(common.Language)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == row.feature)\
            .filter(common.Language.id == row.wals_code)\
            .one()
        value = valueset.values[0]

        # Correction has already been applied.
        if value.domainelement.number == int(row.new):
            print('**** old news', valueset.language.id, valueset.parameter.id)
            continue

        # The stored value matches neither old nor new; report it, but still
        # apply the correction.
        if value.domainelement.number != int(row.old):
            print('--->', valueset.language.id, valueset.parameter.id,
                  value.domainelement.number)
        for de in valueset.parameter.domain:
            if de.number == int(row.new):
                value.domainelement = de
                break
        assert value.domainelement.number == int(row.new)
        valueset.updated = E2013
        value.updated = E2013
        VersionedDBSession.flush()
    print('corrections 2013 done')

    # A separate name is used for the looked-up function so the loop variable
    # is not rebound from a string to a callable.
    for issue_id in [
            '0', '9', '10', '11', '13', '14', '15', '16', '17', '19', '20',
            '24', '26', '27', '28'
    ]:
        apply_issue = getattr(issues, 'issue' + issue_id)
        apply_issue(VersionedDBSession, E2013)
        VersionedDBSession.flush()
        transaction.commit()
        transaction.begin()

    #
    # TODO: these must be recomputed as well, after migrations!
    #
    # cache number of languages for a parameter:
    # (the query is ordered by parameter_pk so that groupby sees each
    # parameter's valuesets as one consecutive run)
    for parameter, valuesets in groupby(
            DBSession.query(common.ValueSet).order_by(
                common.ValueSet.parameter_pk), lambda vs: vs.parameter):
        parameter.representation = str(
            len({v.language_pk for v in valuesets}))
    print('recomputation of representation done')
    transaction.commit()
    transaction.begin()

    # cache iso codes for languages:
    for language in DBSession.query(common.Language).options(
            joinedload_all(common.Language.languageidentifier,
                           common.LanguageIdentifier.identifier)):
        iso_codes = {
            identifier.name
            for identifier in language.identifiers
            if identifier.type == common.IdentifierType.iso.value
        }
        language.iso_codes = ', '.join(sorted(iso_codes))
    # Message typo fixed ("ecomputation" -> "recomputation").
    print('recomputation of iso codes done')
    transaction.commit()
    transaction.begin()

    compute_language_sources()
    transaction.commit()
    transaction.begin()

    gbs_func('update', args)
Exemple #10
0
from clld.scripts.util import gbs_func, parsed_args


# Script entry point: when run directly (not imported), invoke the 'update'
# command of gbs_func with the arguments produced by
# parsed_args(bootstrap=True).
if __name__ == '__main__':
    gbs_func('update', parsed_args(bootstrap=True))
Exemple #11
0
def prime_cache(args):
    """Recompute cached values and check that the pie-chart icons exist.

    Steps, in order: per-feature ``representation`` and
    ``wals_representation``; language sources and number of values; the
    ``lexifier`` of languages (and their lects) from the valuesets of the
    parameter with id ``'0'``; icon names for values, domain elements and
    valuesets; a default icon for domain elements that have none; finally
    ``gbs_func('update', args)``.

    This variant does not write any icon file itself: after drawing a
    figure it only asserts that the matching file is present in
    ``icons_dir``.

    :param args: provides ``log`` and is passed to ``gbs_func``.
    """
    #
    # TODO: relate survey chapter reference with language!
    #
    # Both dicts only remember which icons were already handled in this run.
    icons = {}
    frequencies = {}

    args.log.info('computing wals representation')
    for feature in DBSession.query(common.Parameter).options(
            joinedload(common.Parameter.valuesets)):
        feature.representation = len(feature.valuesets)
        if feature.wals_id:
            # The JSON file is looked up under static/wals/ next to the apics
            # package; its 'features' lists are counted across all layers.
            data = jsonload(
                path(apics.__file__).dirname().joinpath(
                    'static', 'wals', '%sA.json' % feature.wals_id))
            feature.wals_representation = sum(
                [len(l['features']) for l in data['layers']])

    args.log.info('computing language sources')
    compute_language_sources((common.ContributionReference, 'contribution'))
    compute_number_of_values()

    # Derive the lexifier from the valuesets of the parameter with id '0'.
    # Languages with a truthy language_pk are skipped here and receive the
    # lexifier through the ``lects`` loop of the language that lists them.
    for valueset in DBSession.query(common.ValueSet)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == '0')\
            .options(joinedload(common.ValueSet.language)):
        if valueset.language.language_pk:
            continue
        if len(valueset.values) > 1:
            valueset.language.lexifier = 'Other'
        else:
            # NOTE(review): values[0] raises IndexError for a valueset
            # without values - confirm that this cannot occur.
            if valueset.values[0].domainelement.name == 'Other':
                valueset.language.lexifier = 'Other'
            else:
                valueset.language.lexifier \
                    = valueset.values[0].domainelement.name.replace('-based', '')
        for lect in valueset.language.lects:
            lect.lexifier = valueset.language.lexifier

    args.log.info('creating icons')
    for valueset in DBSession.query(common.ValueSet).options(
            joinedload(common.ValueSet.parameter),
            joinedload_all(common.ValueSet.values,
                           common.Value.domainelement)):
        values = sorted(list(valueset.values),
                        key=lambda v: v.domainelement.number)
        # The frequencies of one valueset must add up to 100, within 1.
        assert abs(sum(v.frequency for v in values) - 100) < 1
        fracs = []
        colors = []

        for v in values:
            color = v.domainelement.jsondata['color']
            frequency = round(v.frequency)
            # A frequency that rounds to 0 is treated as an error.
            assert frequency

            # Once per distinct rounded frequency: draw the black/white pie
            # and require that the icon file is already in icons_dir.
            # NOTE(review): the figure is neither saved nor closed in this
            # function - confirm that this is intended. figure/axes/pie are
            # not defined here; presumably plotting helpers.
            if frequency not in frequencies:
                figure(figsize=(0.4, 0.4))
                axes([0.1, 0.1, 0.8, 0.8])
                coll = pie((int(100 - frequency), frequency),
                           colors=('w', 'k'))
                coll[0][0].set_linewidth(0.5)
                assert icons_dir.joinpath('freq-%s.png' % frequency).exists()
                frequencies[frequency] = True

            # Both assignments replace the whole jsondata dict.
            v.jsondata = {'frequency_icon': 'freq-%s.png' % frequency}
            fracs.append(frequency)
            colors.append(color)
            v.domainelement.jsondata = {
                'color': color,
                'icon': 'pie-100-%s.png' % color
            }

        # Every value of a valueset must have a different color.
        assert len(colors) == len(set(colors))
        # Tuples are hashable and serve as the key into ``icons``.
        fracs, colors = tuple(fracs), tuple(colors)

        # Icon name: 'pie-' followed by '<frequency>-<color>' pairs joined
        # with '-'.
        basename = 'pie-'
        basename += '-'.join('%s-%s' % (f, c) for f, c in zip(fracs, colors))
        valueset.update_jsondata(icon=basename + '.png')
        # Handle each distinct combination of frequencies and colors once;
        # both the PNG and the SVG version must already exist.
        if (fracs, colors) not in icons:
            figure(figsize=(0.4, 0.4))
            axes([0.1, 0.1, 0.8, 0.8])
            coll = pie(tuple(reversed(fracs)),
                       colors=['#' + _color for _color in reversed(colors)])
            for wedge in coll[0]:
                wedge.set_linewidth(0.5)
            assert icons_dir.joinpath('%s.png' % basename).exists()
            icons[(fracs, colors)] = True
            assert icons_dir.joinpath(basename + '.svg').exists()

    # Domain elements that got no icon in the loop above receive the
    # 'pie-100-<color>.png' name.
    for de in DBSession.query(common.DomainElement):
        if not de.jsondata.get('icon'):
            de.update_jsondata(icon='pie-100-%s.png' % de.jsondata['color'])

    gbs_func('update', args)
Exemple #12
0
from clld.scripts.util import gbs_func, parsed_args

# Script entry point: when run directly (not imported), invoke the 'update'
# command of gbs_func with the arguments produced by
# parsed_args(bootstrap=True).
if __name__ == '__main__':
    gbs_func('update', parsed_args(bootstrap=True))