def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.

    This procedure should be separate from the db initialization, because it
    will have to be run periodically whenever data has been updated.
    """
    compute_language_sources()
    transaction.commit()
    transaction.begin()

    gbs_func('update', args)

import argparse
import os

import transaction

from clld.scripts.util import gbs_func, parsed_args


def google_books(**kw):  # pragma: no cover
    add_args = [
        (('command',), dict(help='download|verify|update|cleanup')),
        (('--api-key',), dict(default=kw.get('key', os.environ.get('GBS_API_KEY')))),
    ]
    args = parsed_args(*add_args, **kw)
    if args.command == 'download' and not args.api_key:
        raise argparse.ArgumentError(None, 'no API key found for download')

    with transaction.manager:
        gbs_func(args.command, args, kw.get('sources'))

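# Hypothetical entry point, not part of the source: parsed_args reads the
# command and --api-key from sys.argv, so the function above can be exposed
# as a standalone script and invoked roughly as
#
#   python google_books.py download --api-key=$GBS_API_KEY
#
# (the script name and the exact CLI shape are assumptions about clld's
# parsed_args defaults, not taken from the source).
if __name__ == '__main__':
    google_books()
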
def prime_cache(args):
    """If data needs to be denormalized for lookup, do that here.

    This procedure should be separate from the db initialization, because it
    will have to be run periodically whenever data has been updated.
    """
    q = DBSession.query(common.Parameter).join(common.ValueSet).distinct()
    n = q.count()
    m = DBSession.query(models.Inventory).count()
    for segment in q:
        #
        # TODO: this ratio (number of inventories a segment appears in by number of
        # distinct segment total) doesn't make much sense, does it?
        #
        segment.frequency = float(len(segment.valuesets)) / float(n)
        segment.in_inventories = len(segment.valuesets)
        segment.total_inventories = m

    for inventory in DBSession.query(models.Inventory).options(
            joinedload_all(common.Contribution.valuesets, common.ValueSet.parameter)):
        if '(UPSID)' not in inventory.name:
            inventory.count_tone = 0

        # tally valuesets per segment class on the inventory's counter columns
        for vs in inventory.valuesets:
            attr = 'count_' + vs.parameter.segment_class
            if hasattr(inventory, attr):
                val = getattr(inventory, attr) or 0
                setattr(inventory, attr, val + 1)

    ficons = cycle(ORDERED_ICONS)
    gicons = cycle(ORDERED_ICONS)
    for root, genus in groupby(
            DBSession.query(models.Genus).order_by(models.Genus.description),
            lambda g: g.description):
        ficon = next(ficons).name
        for g in genus:
            g.ficon = ficon
            g.gicon = next(gicons).name

    for variety in DBSession.query(models.Variety).options(
            joinedload(models.Variety.inventories)):
        variety.count_inventories = len(variety.inventories)

    if astroman:
        ia_func('update', args)

    gbs_func('update', args)

    print('added', add_wikipedia_urls(args), 'wikipedia urls')

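# A worked instance of the TODO's concern above (illustrative numbers, not
# from the source): with n = 2000 distinct segments and a segment occurring
# in 500 of m = 2100 inventories, segment.frequency = 500 / 2000 = 0.25.
# The numerator counts inventories while the denominator counts segment
# types, so the two sides of the ratio come from different universes, which
# is presumably why the TODO questions it.
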
def prime_cache(args):
    #
    # TODO: relate survey chapter reference with language!
    #
    icons = {}
    frequencies = {}

    args.log.info('computing wals representation')
    for feature in DBSession.query(common.Parameter).options(
            joinedload(common.Parameter.valuesets)):
        feature.representation = len(feature.valuesets)
        if feature.wals_id:
            with open(path(apics.__file__).dirname().joinpath(
                    'static', 'wals', '%sA.json' % feature.wals_id), 'r') as fp:
                data = json.load(fp)
            feature.wals_representation = sum(
                [len(l['features']) for l in data['layers']])

    args.log.info('computing language sources')
    compute_language_sources((common.ContributionReference, 'contribution'))
    compute_number_of_values()

    for valueset in DBSession.query(common.ValueSet)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == '0')\
            .options(joinedload(common.ValueSet.language)):
        if valueset.language.language_pk:
            continue
        if len(valueset.values) > 1:
            valueset.language.lexifier = 'Other'
        else:
            if valueset.values[0].domainelement.name == 'Other':
                valueset.language.lexifier = 'Other'
            else:
                valueset.language.lexifier = \
                    valueset.values[0].domainelement.name.replace('-based', '')
        for lect in valueset.language.lects:
            lect.lexifier = valueset.language.lexifier

    args.log.info('creating icons')
    for valueset in DBSession.query(common.ValueSet).options(
            joinedload(common.ValueSet.parameter),
            joinedload_all(common.ValueSet.values, common.Value.domainelement)):
        values = sorted(list(valueset.values), key=lambda v: v.domainelement.number)
        assert abs(sum(v.frequency for v in values) - 100) < 1
        fracs = []
        colors = []

        for v in values:
            color = v.domainelement.jsondata['color']
            frequency = round(v.frequency)
            assert frequency

            if frequency not in frequencies:
                figure(figsize=(0.4, 0.4))
                axes([0.1, 0.1, 0.8, 0.8])
                coll = pie((int(100 - frequency), frequency), colors=('w', 'k'))
                coll[0][0].set_linewidth(0.5)
                save('freq-%s' % frequency)
                frequencies[frequency] = True
            v.jsondata = {'frequency_icon': 'freq-%s.png' % frequency}
            fracs.append(frequency)
            colors.append(color)
            v.domainelement.jsondata = {
                'color': color, 'icon': 'pie-100-%s.png' % color}

        assert len(colors) == len(set(colors))
        fracs, colors = tuple(fracs), tuple(colors)

        # icon file names encode the (frequency, color) pairs, e.g. fracs
        # (80, 20) with colors ('dd0000', '0000dd') yields
        # 'pie-80-dd0000-20-0000dd.png'
        basename = 'pie-'
        basename += '-'.join('%s-%s' % (f, c) for f, c in zip(fracs, colors))
        valueset.update_jsondata(icon=basename + '.png')
        if (fracs, colors) not in icons:
            figure(figsize=(0.4, 0.4))
            axes([0.1, 0.1, 0.8, 0.8])
            coll = pie(
                tuple(reversed(fracs)),
                colors=['#' + color for color in reversed(colors)])
            for wedge in coll[0]:
                wedge.set_linewidth(0.5)
            save(basename)
            icons[(fracs, colors)] = True

            with open(str(icons_dir.joinpath(basename + '.svg')), 'w') as fp:
                fp.write(svg.pie(
                    fracs, ['#' + color for color in colors], width=40))

    for de in DBSession.query(common.DomainElement):
        if not de.jsondata.get('icon'):
            de.update_jsondata(icon='pie-100-%s.png' % de.jsondata['color'])

    gbs_func('update', args)

def prime_cache(args):  # pragma: no cover
    """
    we use a versioned session to insert the changes in value assignment
    """
    #
    # compute the changes from 2008 to 2011:
    #
    vs2008 = get_vs2008(args)
    for row in DB.execute("select * from datapoint"):
        key = (row['language_id'], row['feature_id'])
        old_value = vs2008.get(key)
        new_value = row['value_numeric']
        if old_value and old_value != new_value:
            valueset = VersionedDBSession.query(common.ValueSet)\
                .join(common.Language)\
                .join(common.Parameter)\
                .filter(common.Parameter.id == row['feature_id'])\
                .filter(common.Language.id == row['language_id'])\
                .one()
            value = valueset.values[0]
            assert value.domainelement.number == old_value
            for de in valueset.parameter.domain:
                if de.number == new_value:
                    value.domainelement = de
                    break
            assert value.domainelement.number == new_value
            valueset.updated = E2011
            value.updated = E2011
            VersionedDBSession.flush()

    for row in reader(args.data_file('corrections_2013.tab'),
                      namedtuples=True, newline='\r'):
        valueset = VersionedDBSession.query(common.ValueSet)\
            .join(common.Language)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == row.feature)\
            .filter(common.Language.id == row.wals_code)\
            .one()
        value = valueset.values[0]

        if value.domainelement.number == int(row.new):
            print('**** old news', valueset.language.id, valueset.parameter.id)
            continue

        if value.domainelement.number != int(row.old):
            print('--->', valueset.language.id, valueset.parameter.id,
                  value.domainelement.number)
        for de in valueset.parameter.domain:
            if de.number == int(row.new):
                value.domainelement = de
                break
        assert value.domainelement.number == int(row.new)
        valueset.updated = E2013
        value.updated = E2013
        VersionedDBSession.flush()
    print('corrections 2013 done')

    for issue in ['0', '9', '10', '11', '13', '14', '15', '16', '17', '19',
                  '20', '24', '26', '27', '28']:
        issue = getattr(issues, 'issue' + issue)
        issue(VersionedDBSession, E2013)
        VersionedDBSession.flush()

    transaction.commit()
    transaction.begin()

    #
    # TODO: these must be recomputed as well, after migrations!
    #
    # cache number of languages for a parameter:
    for parameter, valuesets in groupby(
            DBSession.query(common.ValueSet).order_by(common.ValueSet.parameter_pk),
            lambda vs: vs.parameter):
        parameter.representation = str(len(set(v.language_pk for v in valuesets)))
    print('recomputation of representation done')
    transaction.commit()
    transaction.begin()

    # cache iso codes for languages:
    for language in DBSession.query(common.Language).options(joinedload_all(
            common.Language.languageidentifier,
            common.LanguageIdentifier.identifier)):
        iso_codes = []
        for identifier in language.identifiers:
            if identifier.type == common.IdentifierType.iso.value:
                iso_codes.append(identifier.name)
        language.iso_codes = ', '.join(sorted(set(iso_codes)))
    print('recomputation of iso codes done')
    transaction.commit()
    transaction.begin()

    compute_language_sources()
    transaction.commit()
    transaction.begin()

    gbs_func('update', args)

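# The corrections file above is read with namedtuples=True, so its header row
# must supply the attribute names the loop uses (wals_code, feature, old,
# new). A plausible shape, with illustrative values only (column order and
# data are assumptions, not from the source):
#
#   wals_code   feature   old   new
#   abk         81A       2     1
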
from clld.scripts.util import gbs_func, parsed_args


if __name__ == '__main__':
    gbs_func('update', parsed_args(bootstrap=True))

def prime_cache(args):
    #
    # TODO: relate survey chapter reference with language!
    #
    icons = {}
    frequencies = {}

    args.log.info('computing wals representation')
    for feature in DBSession.query(common.Parameter).options(
            joinedload(common.Parameter.valuesets)):
        feature.representation = len(feature.valuesets)
        if feature.wals_id:
            data = jsonload(path(apics.__file__).dirname().joinpath(
                'static', 'wals', '%sA.json' % feature.wals_id))
            feature.wals_representation = sum(
                [len(l['features']) for l in data['layers']])

    args.log.info('computing language sources')
    compute_language_sources((common.ContributionReference, 'contribution'))
    compute_number_of_values()

    for valueset in DBSession.query(common.ValueSet)\
            .join(common.Parameter)\
            .filter(common.Parameter.id == '0')\
            .options(joinedload(common.ValueSet.language)):
        if valueset.language.language_pk:
            continue
        if len(valueset.values) > 1:
            valueset.language.lexifier = 'Other'
        else:
            if valueset.values[0].domainelement.name == 'Other':
                valueset.language.lexifier = 'Other'
            else:
                valueset.language.lexifier \
                    = valueset.values[0].domainelement.name.replace('-based', '')
        for lect in valueset.language.lects:
            lect.lexifier = valueset.language.lexifier

    args.log.info('creating icons')
    for valueset in DBSession.query(common.ValueSet).options(
            joinedload(common.ValueSet.parameter),
            joinedload_all(common.ValueSet.values, common.Value.domainelement)):
        values = sorted(list(valueset.values), key=lambda v: v.domainelement.number)
        assert abs(sum(v.frequency for v in values) - 100) < 1
        fracs = []
        colors = []

        for v in values:
            color = v.domainelement.jsondata['color']
            frequency = round(v.frequency)
            assert frequency

            if frequency not in frequencies:
                figure(figsize=(0.4, 0.4))
                axes([0.1, 0.1, 0.8, 0.8])
                coll = pie((int(100 - frequency), frequency), colors=('w', 'k'))
                coll[0][0].set_linewidth(0.5)
                assert icons_dir.joinpath('freq-%s.png' % frequency).exists()
                frequencies[frequency] = True
            v.jsondata = {'frequency_icon': 'freq-%s.png' % frequency}
            fracs.append(frequency)
            colors.append(color)
            v.domainelement.jsondata = {
                'color': color, 'icon': 'pie-100-%s.png' % color}

        assert len(colors) == len(set(colors))
        fracs, colors = tuple(fracs), tuple(colors)
        basename = 'pie-'
        basename += '-'.join('%s-%s' % (f, c) for f, c in zip(fracs, colors))
        valueset.update_jsondata(icon=basename + '.png')
        if (fracs, colors) not in icons:
            figure(figsize=(0.4, 0.4))
            axes([0.1, 0.1, 0.8, 0.8])
            coll = pie(tuple(reversed(fracs)),
                       colors=['#' + _color for _color in reversed(colors)])
            for wedge in coll[0]:
                wedge.set_linewidth(0.5)
            assert icons_dir.joinpath('%s.png' % basename).exists()
            icons[(fracs, colors)] = True
            assert icons_dir.joinpath(basename + '.svg').exists()

    for de in DBSession.query(common.DomainElement):
        if not de.jsondata.get('icon'):
            de.update_jsondata(icon='pie-100-%s.png' % de.jsondata['color'])

    gbs_func('update', args)

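# For context, a hedged sketch (not from the source) of how prime_cache
# functions like the ones above are conventionally wired up in a clld app's
# initializedb script: clld's initializedb helper runs the create step and
# then prime_cache in one invocation. The `main` stub below stands in for the
# app's real create step.
import sys

from clld.scripts.util import initializedb


def main(args):
    """Populate the db; stubbed here, real apps fill this in."""


if __name__ == '__main__':
    initializedb(create=main, prime_cache=prime_cache)
    sys.exit(0)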