import collections

# Assumption: `slug` and `dump` are used below with the signatures of
# clldutils.misc.slug and clldutils.jsonlib.dump.
from clldutils.misc import slug
from clldutils.jsonlib import dump

# Each `run(args)` below is the entry point of a separate maintenance command,
# collected here in one place. `Dataset`, `iter_comments`, `recl`,
# `fixvids_run` and `languagesources_run` are project-local helpers.


# Recreate languagesource.csv from the sources actually referenced by valuesets.
def run(args):
    ds = Dataset()

    # Map each valueset to the set of sources it references.
    vsrefs = collections.defaultdict(set)
    for row in ds.iter_rows('valuesetreference.csv', lambda r: True):
        vsrefs[row['valueset_pk']].add(row['source_pk'])

    # Aggregate the referenced sources per language.
    lrefs = collections.defaultdict(set)
    for row in ds.iter_rows('valueset.csv', lambda r: r['pk'] in vsrefs):
        lrefs[row['language_pk']] |= vsrefs[row['pk']]

    def repl(r):
        # Keep existing language-source links, removing them from lrefs so
        # that only the missing links are appended below.
        if r['language_pk'] in lrefs:
            if r['source_pk'] in lrefs[r['language_pk']]:
                lrefs[r['language_pk']].remove(r['source_pk'])
        return r

    ds.rewrite('languagesource.csv', repl)

    rows = []
    pk = ds.maxpk('languagesource.csv')
    for lpk in sorted(lrefs):
        for spk in sorted(lrefs[lpk]):
            pk += 1
            rows.append([pk, '', lpk, spk, '1'])
    ds.add_rows('languagesource.csv', *rows)
# Repair valueset and value IDs: each ID is expected to end in
# '-<language id>', which goes stale when languages are renamed or
# datapoints are moved.
def run(args):
    ds = Dataset()

    # language pk -> language id
    lpk2id = {
        row['pk']: row['id']
        for row in ds.iter_rows('language.csv', lambda r: True)}

    # valueset pk -> language id, plus the set of valuesets with stale IDs.
    vspk2lid = {}
    vschange = set()
    for row in ds.iter_rows('valueset.csv', lambda r: True):
        vspk2lid[row['pk']] = lpk2id[row['language_pk']]
        if not row['id'].endswith('-' + lpk2id[row['language_pk']]):
            vschange.add(row['pk'])

    # Values with stale IDs.
    vchange = set()
    for row in ds.iter_rows('value.csv', lambda r: True):
        if not row['id'].endswith('-' + vspk2lid[row['valueset_pk']]):
            vchange.add(row['pk'])

    def rvs(row):
        if row['pk'] in vschange:
            row['id'] = row['id'].split('-')[0] + '-' + lpk2id[row['language_pk']]
        return row

    ds.rewrite('valueset.csv', rvs)

    def rv(row):
        if row['pk'] in vchange:
            row['id'] = row['id'].split('-')[0] + '-' + vspk2lid[row['valueset_pk']]
        return row

    ds.rewrite('value.csv', rv)
def run(args):
    ds = Dataset()
    comments = {}
    for p in ds.raw_dir.glob('blog_comments/comments*.html'):
        for c in iter_comments(p):
            comments[c['id']] = c
    comments = sorted(
        comments.values(), key=lambda c: int(c['id'].split('comment-')[-1]))
    dump(comments, ds.etc_dir / 'comments.json', indent=4)
    args.log.info('{} comments'.format(len(comments)))
# Add a new language, appending rows to language.csv and walslanguage.csv.
def run(args):
    ds = Dataset()
    # The new ID must not clash with an existing language:
    assert not list(
        ds.iter_rows('language.csv', lambda r: r['id'] == args.language_id))
    lpk = ds.maxpk('language.csv') + 1
    gpk = ds.get_row('genus.csv', lambda r: r['name'] == args.genus)['pk']

    # pk,jsondata,id,name,description,markup_description,latitude,longitude,version
    # e.g. 353,,ktz,Kati (in Afghanistan),,,35.5,70,1
    ds.add_rows('language.csv', [
        lpk, '', args.language_id, args.name, '', '',
        args.latitude or '', args.longitude or '', '1',
    ])

    # pk,ascii_name,genus_pk,samples_100,samples_200,iso_codes,macroarea
    # e.g. 910,amis,595,f,f,ami,Papunesia
    ds.add_rows(
        'walslanguage.csv',
        [
            lpk, slug(args.name, remove_whitespace=False), gpk,
            'f', 'f', '', args.macroarea,
        ],
    )
# Rename a language, optionally updating coordinates and countries, and
# optionally keeping the old name as an 'other' identifier.
def run(args):
    ds = Dataset()
    row = ds.get_row('language.csv', lambda r: r['id'] == args.language_id)
    lpk = row['pk']
    old_name = row['name']

    def _rename(r):
        # Shared between language.csv and walslanguage.csv, hence the
        # per-column existence checks.
        if r['pk'] == lpk:
            if 'name' in r:
                r['name'] = args.name
            if 'ascii_name' in r:
                r['ascii_name'] = slug(args.name, remove_whitespace=False)
            if args.latitude and 'latitude' in r:
                r['latitude'] = args.latitude
            if args.longitude and 'longitude' in r:
                r['longitude'] = args.longitude
        return r

    ds.rewrite('language.csv', _rename)
    ds.rewrite('walslanguage.csv', _rename)

    if args.countries:
        countries = args.countries.split(',')
        cpks = set()
        for row in ds.iter_rows(
                'country.csv',
                lambda r: r['id'] in countries or r['name'] in countries):
            cpks.add(row['pk'])
        # pk,jsondata,country_pk,language_pk
        ds.rewrite(
            'countrylanguage.csv',
            lambda r: r if r['language_pk'] != lpk else None)
        clpk = ds.maxpk('countrylanguage.csv') + 1
        ds.add_rows(
            'countrylanguage.csv',
            *[[clpk + i, '', cpk, lpk] for i, cpk in enumerate(sorted(cpks))])

    if args.keep_old_name:
        # Check whether an identifier for the old name already exists:
        ipk = None
        for row in ds.iter_rows(
                'identifier.csv',
                lambda r: r['type'] == 'name' and r['description'] == 'other'
                and r['name'] == old_name):
            ipk = row['pk']
            break
        if not ipk:
            # Create the identifier:
            ipk = ds.maxpk('identifier.csv') + 1
            ds.add_rows(
                'identifier.csv',
                [ipk, '', old_name, 'other', '', '', 'name', 'en', '1'])
        lipk = ds.maxpk('languageidentifier.csv') + 1
        ds.add_rows('languageidentifier.csv', [lipk, '', lpk, ipk, '', '1'])
# Apparently a stub: instantiates the dataset but does nothing with it.
def run(args):
    ds = Dataset()
# Copy the datapoints attested by a given source from one language to another.
def run(args):
    ds = Dataset()
    fpk = ds.pk_from_id('language.csv', args.from_language_id)
    tpk = ds.pk_from_id('language.csv', args.to_language_id)
    spk = ds.get_row('source.csv', cond=lambda r: r['name'] == args.ref)['pk']
    vspk_map = {}

    # Collect pks of all valuesets that are referenced by the specified source
    # and possibly need to be copied, plus the valuesetreference pks for later
    # updating.
    vspk2vsrefpk = collections.defaultdict(list)
    for row in ds.iter_rows(
            'valuesetreference.csv', lambda r: r['source_pk'] == spk):
        vspk2vsrefpk[row['valueset_pk']].append(row['pk'])

    # Now read valueset.csv and value.csv for these, limiting the valuesets to
    # the source language:
    valueset, value = {}, {}
    for row in ds.iter_rows('valueset.csv', lambda r: r['pk'] in vspk2vsrefpk):
        if row['language_pk'] == fpk:
            valueset[row['pk']] = row
        else:
            del vspk2vsrefpk[row['pk']]
    args.log.info('{} valuesets to copy'.format(len(vspk2vsrefpk)))

    for row in ds.iter_rows(
            'value.csv', lambda r: r['valueset_pk'] in vspk2vsrefpk):
        value[row['valueset_pk']] = row

    # Now write the copied valuesets:
    vspk = ds.maxpk('valueset.csv')
    rows = []
    for oldpk, row in valueset.items():
        # First check whether a valueset for this datapoint already exists for
        # the target language:
        expk = ds.pk_from_id(
            'valueset.csv',
            row['id'].replace(args.from_language_id, args.to_language_id))
        if expk:
            args.log.info('valueset {} exists'.format(row['id']))
            vspk_map[oldpk] = expk
        else:
            vspk += 1
            vspk_map[oldpk] = str(vspk)
            # Update pk, id and language_pk:
            row['pk'] = vspk
            row['id'] = row['id'].replace(
                args.from_language_id, args.to_language_id)
            row['language_pk'] = tpk
            rows.append(row.values())
    ds.add_rows('valueset.csv', *rows)

    # ... and the copied values:
    vpk = ds.maxpk('value.csv')
    rows = []
    for oldpk, row in value.items():
        # Again, skip values that already exist for the target language:
        expk = ds.pk_from_id(
            'value.csv',
            row['id'].replace(args.from_language_id, args.to_language_id))
        if not expk:
            vpk += 1
            row['pk'] = vpk
            row['id'] = row['id'].replace(
                args.from_language_id, args.to_language_id)
            row['valueset_pk'] = vspk_map[oldpk]
            rows.append(row.values())
    ds.add_rows('value.csv', *rows)

    # Point the collected valuesetreference rows at the copied valuesets:
    all_vsrefpks = set()
    for pks in vspk2vsrefpk.values():
        all_vsrefpks.update(pks)

    def upd(r):
        if r['pk'] in all_vsrefpks:
            r['valueset_pk'] = vspk_map[r['valueset_pk']]
        return r

    ds.rewrite('valuesetreference.csv', upd)

    fixvids_run(args)
    languagesources_run(args)
# Set the 'other'/'ethnologue' name identifiers of a language to exactly the
# given codes, creating missing identifiers and dropping stale links.
def run(args):
    ds = Dataset()
    lpk = ds.pk_from_id('language.csv', args.language_id)
    ipks = set()
    for code in args.codes:
        t, code = code.split('=', maxsplit=1)
        assert t in ['other', 'ethnologue']
        # Check whether the identifier exists:
        ipk = None
        for row in ds.iter_rows(
                'identifier.csv',
                lambda r: r['type'] == 'name' and r['description'] == t
                and r['name'] == code):
            ipk = row['pk']
            break
        if not ipk:
            # Create an identifier:
            ipk = ds.maxpk('identifier.csv') + 1
            ds.add_rows(
                'identifier.csv',
                [ipk, '', code, t, '', code, 'name', 'en', '1'])
        ipks.add(ipk)

    # Now rewrite languageidentifier.csv:
    keep = set()
    for row in ds.iter_rows(
            'languageidentifier.csv', lambda r: r['language_pk'] == lpk):
        if row['identifier_pk'] in ipks:
            # The link already exists; no need to recreate it below.
            keep.add(row['identifier_pk'])
            ipks.remove(row['identifier_pk'])
            continue
        # Leave links to identifiers of other types untouched:
        i = ds.get_row(
            'identifier.csv', lambda r: r['pk'] == row['identifier_pk'])
        if (i['type'] != 'name') or (i['description'] not in ('other', 'ethnologue')):
            keep.add(row['identifier_pk'])

    def repl(r):
        if r['language_pk'] != lpk or r['identifier_pk'] in keep:
            return r

    ds.rewrite('languageidentifier.csv', repl)

    if ipks:
        # We have to create additional languageidentifier links:
        lipk = ds.maxpk('languageidentifier.csv') + 1
        ds.add_rows(
            'languageidentifier.csv',
            *[[lipk + j, '', lpk, ipk, '', '1'] for j, ipk in enumerate(ipks)])
# Remove a language and its dependent rows; with --full also remove its
# valuesets and values, and re-attach or drop its sentences.
def run(args):
    ds = Dataset()
    lpk = ds.pk_from_id('language.csv', args.language_id)

    for t in ['languagesource', 'languageidentifier', 'countrylanguage']:
        ds.rewrite(t + '.csv', lambda r: r if r['language_pk'] != lpk else None)
    for t in ['language', 'walslanguage']:
        ds.rewrite(t + '.csv', lambda r: r if r['pk'] != lpk else None)

    if args.full:
        # Remove valuesets as well, together with their references, values
        # and value-sentence links:
        vspks = set(
            row['pk'] for row in ds.iter_rows(
                'valueset.csv', lambda r: r['language_pk'] == lpk))
        ds.rewrite(
            'valueset.csv', lambda r: r if r['pk'] not in vspks else None)
        ds.rewrite(
            'valuesetreference.csv',
            lambda r: r if r['valueset_pk'] not in vspks else None)
        vpks = set(
            row['pk'] for row in ds.iter_rows(
                'value.csv', lambda r: r['valueset_pk'] in vspks))
        ds.rewrite(
            'value.csv', lambda r: r if r['valueset_pk'] not in vspks else None)
        ds.rewrite(
            'valuesentence.csv',
            lambda r: r if r['value_pk'] not in vpks else None)

    # Sentences of the removed language are re-assigned to a replacement
    # language if one is given, and dropped otherwise:
    if args.replacement_id:
        rpk = ds.pk_from_id('language.csv', args.replacement_id)
    else:
        rpk = None

    def repl(r):
        if r['language_pk'] == lpk:
            if rpk:
                r['language_pk'] = rpk
            else:
                args.log.warning('removing sentence {0}'.format(r))
                return
        return r

    ds.rewrite('sentence.csv', repl)
# Set the ISO 639-3 and Glottolog codes of a language to exactly the given
# codes, and sync the iso_codes column in walslanguage.csv.
def run(args):
    ds = Dataset()
    lpk = ds.pk_from_id('language.csv', args.language_id)
    assert lpk

    iso_codes = set()
    ipks = set()
    for code in args.codes:
        t, code = code.split('=', maxsplit=1)
        assert t in ['iso', 'glottolog']
        if t == 'iso':
            iso_codes.add(code)
            t = 'iso639-3'
        # Check whether the identifier exists:
        ipk = None
        for row in ds.iter_rows(
                'identifier.csv',
                lambda r: r['type'] == t and r['name'] == code):
            ipk = row['pk']
            break
        if not ipk:
            # Create an identifier:
            ipk = ds.maxpk('identifier.csv') + 1
            ds.add_rows(
                'identifier.csv', [ipk, '', code, '', '', code, t, 'en', '1'])
        ipks.add(ipk)

    # Now rewrite languageidentifier.csv:
    keep = set()
    for row in ds.iter_rows(
            'languageidentifier.csv', lambda r: r['language_pk'] == lpk):
        if row['identifier_pk'] in ipks:
            # The link already exists; no need to recreate it below.
            keep.add(row['identifier_pk'])
            ipks.remove(row['identifier_pk'])
            continue
        # Leave links to identifiers of other types untouched:
        i = ds.get_row(
            'identifier.csv', lambda r: r['pk'] == row['identifier_pk'])
        if i['type'] not in ('iso639-3', 'glottolog'):
            keep.add(row['identifier_pk'])

    def repl(r):
        if r['language_pk'] != lpk or r['identifier_pk'] in keep:
            return r

    ds.rewrite('languageidentifier.csv', repl)

    if ipks:
        # We have to create additional languageidentifier links:
        lipk = ds.maxpk('languageidentifier.csv') + 1
        ds.add_rows(
            'languageidentifier.csv',
            *[[lipk + j, '', lpk, ipk, '', '1'] for j, ipk in enumerate(ipks)])

    # Rewrite the iso_codes column in walslanguage.csv:
    def adj(r):
        if r['pk'] == lpk:
            r['iso_codes'] = ', '.join(sorted(iso_codes))
        return r

    ds.rewrite('walslanguage.csv', adj)
# Move valuesets from one language to another - either all of them or, if
# --ref is given, only those referencing that source - together with the
# sentences related to their values.
def run(args):
    ds = Dataset()
    fpk = ds.pk_from_id('language.csv', args.from_language_id)
    tpk = ds.pk_from_id('language.csv', args.to_language_id)
    assert fpk and tpk
    spk = ds.get_row(
        'source.csv',
        cond=lambda r: r['name'] == args.ref)['pk'] if args.ref else None

    vsrpks = set()
    if args.ref:
        for row in ds.iter_rows(
                'valuesetreference.csv', lambda r: r['source_pk'] == spk):
            vsrpks.add(row['valueset_pk'])
    vspks = set()
    for row in ds.iter_rows(
            'valueset.csv',
            lambda r: r['language_pk'] == fpk
            and (r['pk'] in vsrpks or not args.ref)):
        vspks.add(row['pk'])
    args.log.info('{} valuesets to move'.format(len(vspks)))

    # Sentences related to values of the moved valuesets must change their
    # language_pk as well:
    vpks = set()
    for row in ds.iter_rows('value.csv', lambda r: r['valueset_pk'] in vspks):
        vpks.add(row['pk'])
    spks = set()
    for row in ds.iter_rows(
            'valuesentence.csv', lambda r: r['value_pk'] in vpks):
        spks.add(row['sentence_pk'])

    def repl_sentence(r):
        if r['language_pk'] == fpk and r['pk'] in spks:
            r['language_pk'] = tpk
        return r

    ds.rewrite('sentence.csv', repl_sentence)

    def repl_valueset(r):
        if r['language_pk'] == fpk and (r['pk'] in vspks or args.ref is None):
            r['language_pk'] = tpk
        return r

    ds.rewrite('valueset.csv', repl_valueset)

    fixvids_run(args)
    languagesources_run(args)
# Apply the project-local `recl` helper to each of a comma-separated list of
# language IDs.
def run(args):
    ds = Dataset()
    for lid in args.language_id.split(','):
        recl(ds, lid.strip(), args)
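
# ---------------------------------------------------------------------------
# None of the commands above define `Dataset`; they rely on a project-local
# helper. The class below is a minimal sketch of the interface those commands
# use (iter_rows/get_row/pk_from_id/maxpk/rewrite/add_rows plus raw_dir and
# etc_dir), assuming the tables are headered CSV files under raw/. It
# illustrates the assumed API; it is not the project's actual implementation.

import csv
import pathlib


class Dataset:

    def __init__(self, raw_dir='raw', etc_dir='etc'):
        self.raw_dir = pathlib.Path(raw_dir)
        self.etc_dir = pathlib.Path(etc_dir)

    def iter_rows(self, name, cond):
        # Yield rows of the named table (as dicts of strings) matching cond.
        with (self.raw_dir / name).open(newline='', encoding='utf8') as f:
            for row in csv.DictReader(f):
                if cond(row):
                    yield row

    def get_row(self, name, cond):
        # Return the first row matching cond, or None.
        return next(self.iter_rows(name, cond), None)

    def pk_from_id(self, name, id_):
        row = self.get_row(name, lambda r: r['id'] == id_)
        return row['pk'] if row else None

    def maxpk(self, name):
        return max(int(row['pk']) for row in self.iter_rows(name, lambda r: True))

    def rewrite(self, name, func):
        # Pass each row through func; rows for which func returns None are dropped.
        path = self.raw_dir / name
        with path.open(newline='', encoding='utf8') as f:
            reader = csv.DictReader(f)
            fieldnames = reader.fieldnames
            rows = [func(row) for row in reader]
        with path.open('w', newline='', encoding='utf8') as f:
            writer = csv.DictWriter(f, fieldnames)
            writer.writeheader()
            writer.writerows(row for row in rows if row is not None)

    def add_rows(self, name, *rows):
        # Append rows (sequences of column values) to the named table.
        with (self.raw_dir / name).open('a', newline='', encoding='utf8') as f:
            csv.writer(f).writerows(rows)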