def command(cls, options): from ckan import model if options.write: rev = model.repo.new_revision() rev.author = 'script_fix_secondary_themes_3.py' datasets = common.get_datasets(state='active', dataset_name=options.dataset) for package in datasets: if not 'theme-secondary' in package.extras: stats_outcome.add('Ignore - no secondary theme', package.name) continue secondary_theme = package.extras.get('theme-secondary') if secondary_theme.startswith('["["'): secondary_theme = LOOKUP[secondary_theme] secondary_theme = json.loads(secondary_theme) if isinstance(secondary_theme, list) and secondary_theme and len(secondary_theme[0]) == 1: secondary_theme = "".join(secondary_theme).replace('&', ' & ') if secondary_theme == 'GovernmentBusiness & Economy': secondary_theme = ['Government', 'Business & Economy'] elif secondary_theme == 'GovernmentSpending': secondary_theme = ['Government Spending'] elif secondary_theme == 'EnvironmentEducationGovernmentSpending': secondary_theme = ['Environment', 'Education', 'Government Spending'] elif secondary_theme == 'EnvironmentGovernment': secondary_theme = ['Environment', 'Government'] else: secondary_theme = [secondary_theme] if json.dumps(secondary_theme) != package.extras.get('theme-secondary'): stats_outcome.add('Fixing', package.name) package.extras['theme-secondary'] = json.dumps(secondary_theme) else: stats_outcome.add('Unchanged', package.name) print 'Formats:\n', stats_format.report() print 'Outcomes:\n', stats_outcome.report() if options.write: print 'Writing...' model.Session.commit() print '...done' stats_format.show_time_taken()
use_flickr=False, use_set5=False, use_urban100=False, patch_size=144, use_noise=False, valid_rate=0.1, inter='nearest', augment="default", kernel_dim=10, )) PinkBlack.io.set_seeds(args.seed) # --------------------------------------------------------- # Prepare training/validation/test data, and its dataloaders datasets = get_datasets(args) print(f"datasets are prepared.") train_dl = DataLoader(datasets['train_dataset'], batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, pin_memory=True) valid_dl = DataLoader(datasets['valid_dataset'], batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, pin_memory=True) test_dl = DataLoader(datasets['test_dataset'], batch_size=1, shuffle=False,
def command(cls, config_ini, options): common.load_config(config_ini) common.register_translator() from ckan import model from ckanext.dgu.lib.theme import (categorize_package, PRIMARY_THEME, SECONDARY_THEMES) rev = model.repo.new_revision() rev.author = 'script-fix_themes.py' datasets = common.get_datasets(state='active', dataset_name=options.dataset, organization_ref=options.organization) def fix_theme(theme_str): '''Returns (fixed_theme_str, outcome)''' if not theme_str: return '', 'Blank' elif theme_str == 'null': return '', '"null"->""' elif theme_str in THEMES: return theme_str, 'Ok' else: fixed_theme = THEME_MAP.get(theme_str) if fixed_theme is None: return theme_str, 'Unknown theme %s - recategorizing' % theme_str else: assert (fixed_theme != theme_str) return fixed_theme, 'Changed to long form' package.extras[PRIMARY_THEME] = new_primary def recategorize(pkg): themes = categorize_package(pkg, stats_recategorize) print 'Recategorize: %s' % themes if themes: pkg.extras[PRIMARY_THEME] = themes[0] elif PRIMARY_THEME in pkg.extras: pkg.extras[PRIMARY_THEME] = '' if len(themes) > 1: pkg.extras[SECONDARY_THEMES] = '["%s"]' % themes[1] elif SECONDARY_THEMES in pkg.extras: pkg.extras[SECONDARY_THEMES] = '[]' for package in datasets: if PRIMARY_THEME in package.extras: primary = package.extras.get(PRIMARY_THEME) new_primary, outcome = fix_theme(primary) if new_primary != primary: package.extras[PRIMARY_THEME] = new_primary output = stats_primary.add(outcome, package.name) if outcome != 'Ok': print output if outcome.startswith('Unknown theme'): recategorize(package) continue else: stats_primary.add('No theme', package.name) if SECONDARY_THEMES in package.extras: secondary = package.extras.get(SECONDARY_THEMES) try: secondary = json.loads(secondary) except ValueError: if secondary.startswith('{') and secondary.endswith('}'): # '{Crime}' -> 'Crime' secondary = secondary[1:-1].strip('\"') print stats_secondary.add('Tidied {}', package.name) else: print stats_secondary.add('Error decoding JSON', package.name) if secondary == {}: secondary = [] new_secondary = [] do_recategorize = False if not isinstance(secondary, list): secondary = [secondary] for theme_str in secondary: if not isinstance(theme_str, basestring): print stats_secondary.add( 'Not a list of strings %s' % type(theme_str), package.name) continue new_theme, outcome = fix_theme(theme_str) if new_theme: new_secondary.append(new_theme) if outcome != 'Ok': print stats_secondary.add(outcome, package.name) if outcome.startswith('Unknown theme'): do_recategorize = True if do_recategorize: recategorize(package) continue if json.dumps(new_secondary) != package.extras.get( SECONDARY_THEMES): stats_secondary.add('Fixed', package.name) package.extras[SECONDARY_THEMES] = json.dumps( new_secondary) else: stats_secondary.add('Ok', package.name) else: stats_secondary.add('No theme', package.name) if 'themes-secondary' in package.extras: print stats_secondary.add( 'Old key removed: themes-secondary', '%s %s' % (package.name, package.extras['themes-secondary'])) del package.extras['themes-secondary'] print "\nPrimary theme:" print stats_primary.report() print "\nSecondary theme:" print stats_secondary.report() print "\nRecategorizations:" print stats_recategorize.report() if options.write: print 'Writing' model.Session.commit()
def command(cls, config_ini, options): common.load_config(config_ini) common.register_translator() from ckan import model from ckanext.dgu.lib.theme import (categorize_package, PRIMARY_THEME, SECONDARY_THEMES) rev = model.repo.new_revision() rev.author = 'script-fix_themes.py' datasets = common.get_datasets(state='active', dataset_name=options.dataset, organization_ref=options.organization) def fix_theme(theme_str): '''Returns (fixed_theme_str, outcome)''' if not theme_str: return '', 'Blank' elif theme_str == 'null': return '', '"null"->""' elif theme_str in THEMES: return theme_str, 'Ok' else: fixed_theme = THEME_MAP.get(theme_str) if fixed_theme is None: return theme_str, 'Unknown theme %s - recategorizing' % theme_str else: assert(fixed_theme != theme_str) return fixed_theme, 'Changed to long form' package.extras[PRIMARY_THEME] = new_primary def recategorize(pkg): themes = categorize_package(pkg, stats_recategorize) print 'Recategorize: %s' % themes if themes: pkg.extras[PRIMARY_THEME] = themes[0] elif PRIMARY_THEME in pkg.extras: pkg.extras[PRIMARY_THEME] = '' if len(themes) > 1: pkg.extras[SECONDARY_THEMES] = '["%s"]' % themes[1] elif SECONDARY_THEMES in pkg.extras: pkg.extras[SECONDARY_THEMES] = '[]' for package in datasets: if PRIMARY_THEME in package.extras: primary = package.extras.get(PRIMARY_THEME) new_primary, outcome = fix_theme(primary) if new_primary != primary: package.extras[PRIMARY_THEME] = new_primary output = stats_primary.add(outcome, package.name) if outcome != 'Ok': print output if outcome.startswith('Unknown theme'): recategorize(package) continue else: stats_primary.add('No theme', package.name) if SECONDARY_THEMES in package.extras: secondary = package.extras.get(SECONDARY_THEMES) try: secondary = json.loads(secondary) except ValueError: if secondary.startswith('{') and secondary.endswith('}'): # '{Crime}' -> 'Crime' secondary = secondary[1:-1].strip('\"') print stats_secondary.add('Tidied {}', package.name) else: print stats_secondary.add('Error decoding JSON', package.name) if secondary == {}: secondary = [] new_secondary = [] do_recategorize = False if not isinstance(secondary, list): secondary = [secondary] for theme_str in secondary: if not isinstance(theme_str, basestring): print stats_secondary.add('Not a list of strings %s' % type(theme_str), package.name) continue new_theme, outcome = fix_theme(theme_str) if new_theme: new_secondary.append(new_theme) if outcome != 'Ok': print stats_secondary.add(outcome, package.name) if outcome.startswith('Unknown theme'): do_recategorize = True if do_recategorize: recategorize(package) continue if json.dumps(new_secondary) != package.extras.get(SECONDARY_THEMES): stats_secondary.add('Fixed', package.name) package.extras[SECONDARY_THEMES] = json.dumps(new_secondary) else: stats_secondary.add('Ok', package.name) else: stats_secondary.add('No theme', package.name) if 'themes-secondary' in package.extras: print stats_secondary.add('Old key removed: themes-secondary', '%s %s' % (package.name, package.extras['themes-secondary'])) del package.extras['themes-secondary'] print "\nPrimary theme:" print stats_primary.report() print "\nSecondary theme:" print stats_secondary.report() print "\nRecategorizations:" print stats_recategorize.report() if options.write: print 'Writing' model.Session.commit()
def command(cls, options): from ckan import model if options.write: rev = model.repo.new_revision() rev.author = 'script_fix_secondary_themes_2.py' datasets = common.get_datasets(state='active', dataset_name=options.dataset) for package in datasets: if not 'theme-secondary' in package.extras: stats_outcome.add('Ignore - no secondary theme', package.name) continue secondary_theme = package.extras.get('theme-secondary') # Convert from JSON to a list loop = 1 while isinstance(secondary_theme, basestring): try: secondary_theme = json.loads(secondary_theme) except ValueError: if secondary_theme == 'None': stats_format.add('"None" string', package.name) secondary_theme = [] elif ',' in secondary_theme: # e.g. '"Government, Society"' print stats_format.add('Non-JSON string, comma separated', package.name) secondary_theme = [t.strip() for t in secondary_theme.split(',')] else: # e.g. 'Towns & Cities' print stats_format.add('Non-JSON string', '%s %r' % (package.name, secondary_theme.strip())) secondary_theme = [secondary_theme.strip()] break loop = 1 if loop == 2: stats_format.add('JSON', package.name) elif loop == 3: # e.g. '"\\"Health\\""' print stats_format.add('Multiple JSON encoded', package.name) if secondary_theme in ('None', '', {}): print stats_format.add('Empty list', package.name) secondary_theme = [] assert isinstance(secondary_theme, list) # Filter out nulls in the list for filter_string in (None, 'None', ''): if filter_string in secondary_theme: print stats_format.add('%r in the list' % filter_string, package.name) secondary_theme = [theme for theme in secondary_theme if theme != filter_string] # Remove {} from strings e.g. ["{Government}"] if '{' in str(secondary_theme): print stats_format.add('{theme}', package.name) secondary_theme = [theme.strip('{}') for theme in secondary_theme] if json.dumps(secondary_theme) != package.extras.get('theme-secondary'): stats_outcome.add('Fixing', package.name) package.extras['theme-secondary'] = json.dumps(secondary_theme) else: stats_outcome.add('Unchanged', package.name) print 'Formats:\n', stats_format.report() print 'Outcomes:\n', stats_outcome.report() if options.write: print 'Writing...' model.Session.commit() print '...done' stats_format.show_time_taken()
def command(cls, options): from ckan import model if options.write: rev = model.repo.new_revision() rev.author = 'script_fix_secondary_themes_2.py' datasets = common.get_datasets(state='active', dataset_name=options.dataset) for package in datasets: if not 'theme-secondary' in package.extras: stats_outcome.add('Ignore - no secondary theme', package.name) continue secondary_theme = package.extras.get('theme-secondary') # Convert from JSON to a list loop = 1 while isinstance(secondary_theme, basestring): try: secondary_theme = json.loads(secondary_theme) except ValueError: if secondary_theme == 'None': stats_format.add('"None" string', package.name) secondary_theme = [] elif ',' in secondary_theme: # e.g. '"Government, Society"' print stats_format.add( 'Non-JSON string, comma separated', package.name) secondary_theme = [ t.strip() for t in secondary_theme.split(',') ] else: # e.g. 'Towns & Cities' print stats_format.add( 'Non-JSON string', '%s %r' % (package.name, secondary_theme.strip())) secondary_theme = [secondary_theme.strip()] break loop = 1 if loop == 2: stats_format.add('JSON', package.name) elif loop == 3: # e.g. '"\\"Health\\""' print stats_format.add('Multiple JSON encoded', package.name) if secondary_theme in ('None', '', {}): print stats_format.add('Empty list', package.name) secondary_theme = [] assert isinstance(secondary_theme, list) # Filter out nulls in the list for filter_string in (None, 'None', ''): if filter_string in secondary_theme: print stats_format.add('%r in the list' % filter_string, package.name) secondary_theme = [ theme for theme in secondary_theme if theme != filter_string ] # Remove {} from strings e.g. ["{Government}"] if '{' in str(secondary_theme): print stats_format.add('{theme}', package.name) secondary_theme = [ theme.strip('{}') for theme in secondary_theme ] if json.dumps(secondary_theme) != package.extras.get( 'theme-secondary'): stats_outcome.add('Fixing', package.name) package.extras['theme-secondary'] = json.dumps(secondary_theme) else: stats_outcome.add('Unchanged', package.name) print 'Formats:\n', stats_format.report() print 'Outcomes:\n', stats_outcome.report() if options.write: print 'Writing...' model.Session.commit() print '...done' stats_format.show_time_taken()