Example #1
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    compute_language_sources()
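    # Commit the computed data, then start a fresh transaction for the gbs_func update below.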
    transaction.commit()
    transaction.begin()

    gbs_func('update', args)
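
These examples are all clld prime_cache hooks. For context, a minimal sketch of how such a hook is typically wired up in a project's initializedb.py (assuming a main(args) function that creates and populates the database; the layout is illustrative, not taken from the examples above):

from clld.scripts.util import initializedb


def main(args):
    pass  # create and populate the database here


if __name__ == '__main__':
    # initializedb handles argument parsing and invokes main() and prime_cache()
    initializedb(create=main, prime_cache=prime_cache)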
Example #2
    def test_compute_language_sources(self):
        from clld.db.models.common import Source, Sentence, Language, SentenceReference
        from clld.db.meta import DBSession
        from clld.db.util import compute_language_sources

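        # Link a sentence in a new language to an existing source, then recompute the language-source mapping.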
        s = Sentence(id='sentenced', language=Language(id='newlang'))
        sr = SentenceReference(sentence=s, source=Source.first())
        DBSession.add(sr)
        DBSession.flush()
        compute_language_sources()
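
A follow-up assertion one might add here (a sketch, not part of the original test; it assumes compute_language_sources materializes common.LanguageSource links, as the clld implementation does):

        # hypothetical check: the new language should now be linked to the source
        from clld.db.models.common import LanguageSource
        assert DBSession.query(LanguageSource).filter(
            LanguageSource.language_pk == s.language_pk).count() == 1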
Example #3
def prime_cache(args):  # pragma: no cover
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    if 1:
        langs = {l.pk: l for l in DBSession.query(models.GrambankLanguage)}
        features = {f.pk: f for f in DBSession.query(models.Feature)}

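        # Per-language datapoint count: valuesets joined to their values, grouped by language.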
        for lpk, nf in DBSession.query(common.ValueSet.language_pk, func.count(common.ValueSet.pk)) \
                .join(common.Value, common.Value.valueset_pk == common.ValueSet.pk) \
                .group_by(common.ValueSet.language_pk):
            langs[lpk].representation = nf

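        # Analogous per-feature count, stored as the feature's representation.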
        for fpk, nl in DBSession.query(common.ValueSet.parameter_pk, func.count(common.ValueSet.pk))\
                .join(common.Value, common.Value.valueset_pk == common.ValueSet.pk)\
                .group_by(common.ValueSet.parameter_pk):
            features[fpk].representation = nl

        compute_language_sources()

    get_repos()

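    # Rebuild phylogenies from scratch: drop all existing tree-related objects first.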
    for obj in DBSession.query(LanguageTreeLabel).all():
        DBSession.delete(obj)
    for obj in DBSession.query(TreeLabel).all():
        DBSession.delete(obj)
    for obj in DBSession.query(Phylogeny).all():
        DBSession.delete(obj)
    DBSession.flush()

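    # Load each classification tree from Glottolog and attach tree labels to the matching languages.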
    for tree in tqdm(
            iter_trees([l.id for l in DBSession.query(common.Language)],
                       Glottolog(REPOS['glottolog']))):
        nodes = set(n.name for n in tree.traverse())
        phylo = Phylogeny(id=tree.name.split('_')[1],
                          name=tree.name,
                          newick=tree.write(format=9))
        for l in DBSession.query(common.Language).filter(
                common.Language.id.in_(nodes)):
            LanguageTreeLabel(language=l,
                              treelabel=TreeLabel(id=l.id,
                                                  name=l.id,
                                                  phylogeny=phylo))
        DBSession.add(phylo)
Example #4
def prime_cache(args):  # pragma: no cover
    """
    we use a versioned session to insert the changes in value assignment
    """
    #
    # compute the changes from 2008 to 2011:
    #
    vs2008 = get_vs2008(args)
    for row in DB.execute("select * from datapoint"):
        key = (row["language_id"], row["feature_id"])
        old_value = vs2008.get(key)
        new_value = row["value_numeric"]
        if old_value and old_value != new_value:
            valueset = (
                VersionedDBSession.query(common.ValueSet)
                .join(common.Language)
                .join(common.Parameter)
                .filter(common.Parameter.id == row["feature_id"])
                .filter(common.Language.id == row["language_id"])
                .one()
            )
            value = valueset.values[0]
            assert value.domainelement.number == old_value
            for de in valueset.parameter.domain:
                if de.number == new_value:
                    value.domainelement = de
                    break
            assert value.domainelement.number == new_value
            valueset.updated = E2011
            value.updated = E2011
            VersionedDBSession.flush()

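    # Apply manually curated corrections from the 2013 errata file.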
    for row in reader(args.data_file("corrections_2013.tab"), namedtuples=True, newline="\r"):
        valueset = (
            VersionedDBSession.query(common.ValueSet)
            .join(common.Language)
            .join(common.Parameter)
            .filter(common.Parameter.id == row.feature)
            .filter(common.Language.id == row.wals_code)
            .one()
        )
        value = valueset.values[0]

        if value.domainelement.number == int(row.new):
            print("**** old news", valueset.language.id, valueset.parameter.id)
            continue

        if value.domainelement.number != int(row.old):
            print("--->", valueset.language.id, valueset.parameter.id, value.domainelement.number)
        for de in valueset.parameter.domain:
            if de.number == int(row.new):
                value.domainelement = de
                break
        assert value.domainelement.number == int(row.new)
        valueset.updated = E2013
        value.updated = E2013
        VersionedDBSession.flush()
    print("corrections 2013 done")

    for issue in ["0", "9", "10", "11", "13", "14", "15", "16", "17", "19", "20", "24", "26", "27", "28"]:
        issue = getattr(issues, "issue" + issue)
        issue(VersionedDBSession, E2013)
        VersionedDBSession.flush()
        transaction.commit()
        transaction.begin()

    #
    # TODO: these must be recomputed as well, after migrations!
    #
    # cache number of languages for a parameter:
    for parameter, valuesets in groupby(
        DBSession.query(common.ValueSet).order_by(common.ValueSet.parameter_pk), lambda vs: vs.parameter
    ):
        parameter.representation = str(len(set(v.language_pk for v in valuesets)))
    print("recomputation of representation done")
    transaction.commit()
    transaction.begin()

    # cache iso codes for languages:
    for language in DBSession.query(common.Language).options(
        joinedload_all(common.Language.languageidentifier, common.LanguageIdentifier.identifier)
    ):
        iso_codes = []
        for identifier in language.identifiers:
            if identifier.type == common.IdentifierType.iso.value:
                iso_codes.append(identifier.name)
        language.iso_codes = ", ".join(sorted(set(iso_codes)))
    print("ecomputation of iso codes done")
    transaction.commit()
    transaction.begin()

    compute_language_sources()
    transaction.commit()
    transaction.begin()

    gbs_func("update", args)
Example #5
def prime_cache(args):
    #
    # TODO: relate survey chapter reference with language!
    #
    icons = {}
    frequencies = {}

    args.log.info('computing wals representation')
    for feature in DBSession.query(common.Parameter).options(
        joinedload(common.Parameter.valuesets)
    ):
        feature.representation = len(feature.valuesets)
        if feature.wals_id:
            with open(path(apics.__file__).dirname().joinpath(
                'static', 'wals', '%sA.json' % feature.wals_id
            ), 'r') as fp:
                data = json.load(fp)
            feature.wals_representation = sum([len(l['features']) for l in data['layers']])

    args.log.info('computing language sources')
    compute_language_sources((common.ContributionReference, 'contribution'))
    compute_number_of_values()

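    # Parameter '0' encodes the lexifier; copy it onto each language and its lects.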
    for valueset in DBSession.query(common.ValueSet)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == '0')\
            .options(joinedload(common.ValueSet.language)):
        if valueset.language.language_pk:
            continue
        if len(valueset.values) > 1:
            valueset.language.lexifier = 'Other'
        else:
            if valueset.values[0].domainelement.name == 'Other':
                valueset.language.lexifier = 'Other'
            else:
                valueset.language.lexifier = valueset.values[0].domainelement.name.replace('-based', '')
        for lect in valueset.language.lects:
            lect.lexifier = valueset.language.lexifier

    args.log.info('creating icons')
    for valueset in DBSession.query(common.ValueSet).options(
        joinedload(common.ValueSet.parameter),
        joinedload_all(common.ValueSet.values, common.Value.domainelement)
    ):
        values = sorted(list(valueset.values), key=lambda v: v.domainelement.number)
        assert abs(sum(v.frequency for v in values) - 100) < 1
        fracs = []
        colors = []

        for v in values:
            color = v.domainelement.jsondata['color']
            frequency = round(v.frequency)
            assert frequency

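            # Render a small black-and-white pie for each distinct frequency value, once.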
            if frequency not in frequencies:
                figure(figsize=(0.4, 0.4))
                axes([0.1, 0.1, 0.8, 0.8])
                coll = pie((int(100 - frequency), frequency), colors=('w', 'k'))
                coll[0][0].set_linewidth(0.5)
                save('freq-%s' % frequency)
                frequencies[frequency] = True

            v.jsondata = {'frequency_icon': 'freq-%s.png' % frequency}
            fracs.append(frequency)
            colors.append(color)
            v.domainelement.jsondata = {
                'color': color, 'icon': 'pie-100-%s.png' % color}

        assert len(colors) == len(set(colors))
        fracs, colors = tuple(fracs), tuple(colors)

        basename = 'pie-'
        basename += '-'.join('%s-%s' % (f, c) for f, c in zip(fracs, colors))
        valueset.update_jsondata(icon=basename + '.png')
        if (fracs, colors) not in icons:
            figure(figsize=(0.4, 0.4))
            axes([0.1, 0.1, 0.8, 0.8])
            coll = pie(
                tuple(reversed(fracs)),
                colors=['#' + color for color in reversed(colors)])
            for wedge in coll[0]:
                wedge.set_linewidth(0.5)
            save(basename)
            icons[(fracs, colors)] = True

            with open(str(icons_dir.joinpath(basename + '.svg')), 'w') as fp:
                fp.write(svg.pie(fracs, ['#' + color for color in colors], width=40))

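    # Default icon for any domain element that still lacks one: a solid 100% pie.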
    for de in DBSession.query(common.DomainElement):
        if not de.jsondata.get('icon'):
            de.update_jsondata(icon='pie-100-%s.png' % de.jsondata['color'])

    gbs_func('update', args)
Example #6
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.
    This procedure should be separate from the db initialization, because
    it will have to be run periodically whenever data has been updated.
    """
    compute_language_sources()
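    # NOTE: the early return below disables the expensive stability and dependency computations that follow.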
    return
    from time import time
    _s = time()

    def checkpoint(s, msg=None):
        n = time()
        print(n - s, msg or '')
        return n

    sql = """
select p.id, l.id, v.name from value as v, valueset as vs, language as l, parameter as p
where v.valueset_pk = vs.pk and vs.language_pk = l.pk and vs.parameter_pk = p.pk
    """
    datatriples = [(v[0], v[1], v[2]) for v in DBSession.execute(sql)]
    _s = checkpoint(_s, '%s values loaded' % len(datatriples))

    flv = dict([(feature, dict(lvs)) for (feature, lvs) in grp(datatriples).items()])
    _s = checkpoint(_s, 'triples grouped')

    clfps = list(get_clf_paths([row[0] for row in DBSession.execute("select id from language")]))
    _s = checkpoint(_s, '%s clfps loaded' % len(clfps))

    features = {f.id: f for f in DBSession.query(Feature)}
    for (f, lv) in flv.items():
        features[f].representation = len(lv)
    DBSession.flush()
    _s = checkpoint(_s, 'representation assigned')

    families = {f.id: f for f in DBSession.query(Family)}
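    # The parsimony-based stability computation below is toggled off via 'if False:'.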
    if False:
        fs = feature_stability(datatriples, clfps)
        _s = checkpoint(_s, 'feature_stability computed')

        for (f, (s, transitions, stationarity_p, synchronic_p)) in fs:
            print(f)
            stability = Stability(
                id=f.replace("GB", "S"),
                feature=features[f],
                parsimony_stability_value=s["stability"],
                parsimony_retentions=s["retentions"],
                parsimony_transitions=s["transitions"],
                jsondata={'diachronic_p': stationarity_p, "synchronic_p": synchronic_p})
            DBSession.add(stability)
            for (i, (fam, (fromnode, tonode), (ft, tt))) in enumerate(transitions):
                DBSession.add(Transition(
                    id="%s: %s->%s" % (f, fromnode, tonode),
                    stability=stability,
                    fromnode=get_name(fromnode),
                    tonode=get_name(tonode),
                    fromvalue=ft,
                    tovalue=tt,
                    family=families[fam],
                    retention_innovation="Retention" if ft == tt else "Innovation"))
        DBSession.flush()
        _s = checkpoint(_s, 'stability and transitions loaded')

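    # Pairwise feature dependencies; each one is stored as a Dependency row below.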
    imps = feature_dependencies(datatriples)
    _s = checkpoint(_s, 'feature_dependencies computed')
    if True:
        (H, V) = dependencies_graph([(v, f1, f2) for ((v, dstats), f1, f2) in imps])
        _s = checkpoint(_s, 'dependencies_graph written')

        for (i, ((v, dstats), f1, f2)) in enumerate(imps):
            combinatory_status = ("primary" if (f1, f2) in H else ("epiphenomenal" if v > 0.0 else None)) if H else "N/A"
            DBSession.add(Dependency(
                id="%s->%s" % (f1, f2),
                strength=v,
                feature1=features[f1],
                feature2=features[f2],
                representation=dstats["representation"],
                combinatory_status=combinatory_status,
                jsondata=dstats))
        DBSession.flush()
        _s = checkpoint(_s, 'dependencies loaded')

    coordinates = {
        lg.id: (lg.longitude, lg.latitude)
        for lg in DBSession.query(common.Language)
        .filter(common.Language.longitude != None)
        .filter(common.Language.latitude != None)}
    deepfams = deep_families(datatriples, clfps, coordinates=coordinates)
    _s = checkpoint(_s, '%s deep_families computed' % len(deepfams))

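    # Build DeepFamily objects with per-feature Support links; note families missing from the db.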
    missing_families = set()
    data = Data()
    for ((l1, l2), support_value, significance, supports, f1c, f2c) in deepfams:
        dname = "proto-%s x proto-%s" % (glottolog_names[l1], glottolog_names[l2])
        kmdistance = havdist(f1c, f2c)
        (f1lon, f1lat) = f1c if f1c else (None, None)
        (f2lon, f2lat) = f2c if f2c else (None, None)

        for li in [l1, l2]:
            if li not in families:
                missing_families.add(li)

        deepfam = DeepFamily(
            id=dname,
            support_value=support_value,
            significance=significance,
            family1=families.get(l1),
            family2=families.get(l2),
            family1_latitude=f1lat,
            family1_longitude=f1lon,
            family2_latitude=f2lat,
            family2_longitude=f2lon,
            geographic_plausibility=kmdistance)
        DBSession.add(deepfam)
        for (f, v1, v2, historical_score, independent_score, support_score) in supports:
            vid = ("%s: %s %s %s" % (f, v1, "==" if v1 == v2 else "!=", v2)).replace(".", "")
            #vname = ("%s|%s" % (v1, v2)).replace(".", "")
            #print vid, vname
            if vid not in data["Support"]:
                data.add(
                    Support, vid,
                    id=vid,
                    historical_score=historical_score,
                    independent_score=independent_score,
                    support_score=support_score,
                    value1=v1,
                    value2=v2,
                    feature=features[f])
            DBSession.add(HasSupport(
                id=dname + "-" + vid,
                deepfamily=deepfam,
                support=data["Support"][vid]))
    print('missing_families:')
    print(missing_families)
    DBSession.flush()
    _s = checkpoint(_s, 'deep_families loaded')

    compute_language_sources()
Example #7
def prime_cache(args):
    """
    we use a versioned session to insert the changes in value assignment
    """
    #
    # compute the changes from 2008 to 2011:
    #
    vs2008 = get_vs2008(args)
    for row in DB.execute("select * from datapoint"):
        key = (row['language_id'], row['feature_id'])
        old_value = vs2008.get(key)
        new_value = row['value_numeric']
        if old_value and old_value != new_value:
            valueset = VersionedDBSession.query(common.ValueSet)\
                .join(common.Language)\
                .join(common.Parameter)\
                .filter(common.Parameter.id == row['feature_id'])\
                .filter(common.Language.id == row['language_id'])\
                .one()
            value = valueset.values[0]
            assert value.domainelement.number == old_value
            for de in valueset.parameter.domain:
                if de.number == new_value:
                    value.domainelement = de
                    break
            assert value.domainelement.number == new_value
            valueset.updated = E2011
            value.updated = E2011
            VersionedDBSession.flush()

    for row in reader(args.data_file('corrections_2013.tab'), namedtuples=True, newline='\r'):
        valueset = VersionedDBSession.query(common.ValueSet)\
            .join(common.Language)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == row.feature)\
            .filter(common.Language.id == row.wals_code)\
            .one()
        value = valueset.values[0]

        if value.domainelement.number == int(row.new):
            print('**** old news', valueset.language.id, valueset.parameter.id)
            continue

        if value.domainelement.number != int(row.old):
            print('--->', valueset.language.id, valueset.parameter.id, value.domainelement.number)
        for de in valueset.parameter.domain:
            if de.number == int(row.new):
                value.domainelement = de
                break
        assert value.domainelement.number == int(row.new)
        valueset.updated = E2013
        value.updated = E2013
        VersionedDBSession.flush()
    print('corrections 2013 done')

    for issue in ['0', '9', '10', '11', '13', '14', '15', '16', '17', '19', '20', '24', '26', '27', '28']:
        issue = getattr(issues, 'issue' + issue)
        issue(VersionedDBSession, E2013)
        VersionedDBSession.flush()
        transaction.commit()
        transaction.begin()

    #
    # TODO: these must be recomputed as well, after migrations!
    #
    # cache number of languages for a parameter:
    for parameter, valuesets in groupby(
            DBSession.query(common.ValueSet).order_by(common.ValueSet.parameter_pk),
            lambda vs: vs.parameter):
        parameter.representation = str(len(set(v.language_pk for v in valuesets)))
    print('recomputation of representation done')
    transaction.commit()
    transaction.begin()

    # cache iso codes for languages:
    for language in DBSession.query(common.Language).options(joinedload_all(
        common.Language.languageidentifier, common.LanguageIdentifier.identifier
    )):
        iso_codes = []
        for identifier in language.identifiers:
            if identifier.type == common.IdentifierType.iso.value:
                iso_codes.append(identifier.name)
        language.iso_codes = ', '.join(sorted(set(iso_codes)))
    print('recomputation of iso codes done')
    transaction.commit()
    transaction.begin()

    compute_language_sources()
    transaction.commit()
    transaction.begin()

    gbs_func('update', args)
Example #8
def prime_cache(args):  # pragma: no cover
    """
    we use a versioned session to insert the changes in value assignment
    """
    #
    # compute the changes from 2008 to 2011:
    #
    vs2008 = get_vs2008(args)
    for row in DB.execute("select * from datapoint"):
        key = (row['language_id'], row['feature_id'])
        old_value = vs2008.get(key)
        new_value = row['value_numeric']
        if old_value and old_value != new_value:
            valueset = VersionedDBSession.query(common.ValueSet)\
                .join(common.Language)\
                .join(common.Parameter)\
                .filter(common.Parameter.id == row['feature_id'])\
                .filter(common.Language.id == row['language_id'])\
                .one()
            value = valueset.values[0]
            assert value.domainelement.number == old_value
            for de in valueset.parameter.domain:
                if de.number == new_value:
                    value.domainelement = de
                    break
            assert value.domainelement.number == new_value
            valueset.updated = E2011
            value.updated = E2011
            VersionedDBSession.flush()

    for row in reader(args.data_file('corrections_2013.tab'),
                      namedtuples=True,
                      newline='\r'):
        valueset = VersionedDBSession.query(common.ValueSet)\
            .join(common.Language)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == row.feature)\
            .filter(common.Language.id == row.wals_code)\
            .one()
        value = valueset.values[0]

        if value.domainelement.number == int(row.new):
            print('**** old news', valueset.language.id, valueset.parameter.id)
            continue

        if value.domainelement.number != int(row.old):
            print('--->', valueset.language.id, valueset.parameter.id,
                  value.domainelement.number)
        for de in valueset.parameter.domain:
            if de.number == int(row.new):
                value.domainelement = de
                break
        assert value.domainelement.number == int(row.new)
        valueset.updated = E2013
        value.updated = E2013
        VersionedDBSession.flush()
    print('corrections 2013 done')

    for issue in [
            '0', '9', '10', '11', '13', '14', '15', '16', '17', '19', '20',
            '24', '26', '27', '28'
    ]:
        issue = getattr(issues, 'issue' + issue)
        issue(VersionedDBSession, E2013)
        VersionedDBSession.flush()
        transaction.commit()
        transaction.begin()

    #
    # TODO: these must be recomputed as well, after migrations!
    #
    # cache number of languages for a parameter:
    for parameter, valuesets in groupby(
            DBSession.query(common.ValueSet).order_by(
                common.ValueSet.parameter_pk), lambda vs: vs.parameter):
        parameter.representation = str(
            len(set(v.language_pk for v in valuesets)))
    print('recomputation of representation done')
    transaction.commit()
    transaction.begin()

    # cache iso codes for languages:
    for language in DBSession.query(common.Language).options(
            joinedload_all(common.Language.languageidentifier,
                           common.LanguageIdentifier.identifier)):
        iso_codes = []
        for identifier in language.identifiers:
            if identifier.type == common.IdentifierType.iso.value:
                iso_codes.append(identifier.name)
        language.iso_codes = ', '.join(sorted(set(iso_codes)))
    print('recomputation of iso codes done')
    transaction.commit()
    transaction.begin()

    compute_language_sources()
    transaction.commit()
    transaction.begin()

    gbs_func('update', args)
Example #9
def prime_cache(args):
    #
    # TODO: relate survey chapter reference with language!
    #
    icons = {}
    frequencies = {}

    args.log.info('computing wals representation')
    for feature in DBSession.query(common.Parameter).options(
            joinedload(common.Parameter.valuesets)):
        feature.representation = len(feature.valuesets)
        if feature.wals_id:
            data = jsonload(
                path(apics.__file__).dirname().joinpath(
                    'static', 'wals', '%sA.json' % feature.wals_id))
            feature.wals_representation = sum(
                [len(l['features']) for l in data['layers']])

    args.log.info('computing language sources')
    compute_language_sources((common.ContributionReference, 'contribution'))
    compute_number_of_values()

    for valueset in DBSession.query(common.ValueSet)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == '0')\
            .options(joinedload(common.ValueSet.language)):
        if valueset.language.language_pk:
            continue
        if len(valueset.values) > 1:
            valueset.language.lexifier = 'Other'
        else:
            if valueset.values[0].domainelement.name == 'Other':
                valueset.language.lexifier = 'Other'
            else:
                valueset.language.lexifier \
                    = valueset.values[0].domainelement.name.replace('-based', '')
        for lect in valueset.language.lects:
            lect.lexifier = valueset.language.lexifier

    args.log.info('creating icons')
    for valueset in DBSession.query(common.ValueSet).options(
            joinedload(common.ValueSet.parameter),
            joinedload_all(common.ValueSet.values,
                           common.Value.domainelement)):
        values = sorted(list(valueset.values),
                        key=lambda v: v.domainelement.number)
        assert abs(sum(v.frequency for v in values) - 100) < 1
        fracs = []
        colors = []

        for v in values:
            color = v.domainelement.jsondata['color']
            frequency = round(v.frequency)
            assert frequency

            if frequency not in frequencies:
                figure(figsize=(0.4, 0.4))
                axes([0.1, 0.1, 0.8, 0.8])
                coll = pie((int(100 - frequency), frequency),
                           colors=('w', 'k'))
                coll[0][0].set_linewidth(0.5)
                assert icons_dir.joinpath('freq-%s.png' % frequency).exists()
                frequencies[frequency] = True

            v.jsondata = {'frequency_icon': 'freq-%s.png' % frequency}
            fracs.append(frequency)
            colors.append(color)
            v.domainelement.jsondata = {
                'color': color,
                'icon': 'pie-100-%s.png' % color
            }

        assert len(colors) == len(set(colors))
        fracs, colors = tuple(fracs), tuple(colors)

        basename = 'pie-'
        basename += '-'.join('%s-%s' % (f, c) for f, c in zip(fracs, colors))
        valueset.update_jsondata(icon=basename + '.png')
        if (fracs, colors) not in icons:
            figure(figsize=(0.4, 0.4))
            axes([0.1, 0.1, 0.8, 0.8])
            coll = pie(tuple(reversed(fracs)),
                       colors=['#' + _color for _color in reversed(colors)])
            for wedge in coll[0]:
                wedge.set_linewidth(0.5)
            assert icons_dir.joinpath('%s.png' % basename).exists()
            icons[(fracs, colors)] = True
            assert icons_dir.joinpath(basename + '.svg').exists()

    for de in DBSession.query(common.DomainElement):
        if not de.jsondata.get('icon'):
            de.update_jsondata(icon='pie-100-%s.png' % de.jsondata['color'])

    gbs_func('update', args)