def recategorize(options):
    '''Suggest a new primary theme for each selected dataset and report it.

    The datasets examined are either the single one named by
    options.dataset, or whatever get_packages() returns for
    options.publisher, options.uncategorized and options.limit.

    options.theme is a comma-separated list of the themes we are willing
    to assign; when it is empty every known theme is acceptable.

    For each dataset the outcome (cannot decide / not interested /
    unchanged / recategorized) is recorded in a StatsList and printed.
    When options.write is set, the recategorized datasets and their
    suggested themes are handed to write_themes() at the end.
    '''
    from ckanext.dgu.lib.theme import (categorize_package2, PRIMARY_THEME,
                                       SECONDARY_THEMES, Themes)

    outcomes = StatsList()
    outcomes.report_value_limit = 1000

    # Work out which datasets to look at
    if not options.dataset:
        datasets = get_packages(publisher=options.publisher,
                                theme=None,
                                uncategorized=options.uncategorized,
                                limit=options.limit)
    else:
        single = model.Package.get(options.dataset)
        assert single
        datasets = [single]

    # Work out which themes we are interested in setting on datasets
    known_themes = Themes.instance()
    if not options.theme:
        wanted_themes = known_themes.data
    else:
        wanted_themes = set(options.theme.split(','))
        for requested in wanted_themes:
            assert requested in known_themes.data, '"%s" not in %r' % (
                requested, known_themes.data.keys())

    pending_writes = {}  # dataset name: suggested themes
    for dataset in datasets:
        print('Dataset: %s' % dataset.name)
        suggestions = categorize_package2(dataset)
        current_theme = dataset.extras.get(PRIMARY_THEME)
        identity = '%s (%s)' % (dataset.name, current_theme)

        # Classify the outcome first, then record it in one place
        recategorized = False
        if not suggestions:
            category = 'Cannot decide theme'
        else:
            # each suggestion is indexed by 'name'; the first one is the
            # theme that gets compared and reported
            top_theme = suggestions[0]['name']
            if top_theme not in wanted_themes:
                category = 'Not interested in theme'
            elif current_theme == top_theme:
                category = 'Theme unchanged %s' % top_theme
            else:
                category = 'Recategorized to %s' % top_theme
                recategorized = True
        print(outcomes.add(category, identity))

        if recategorized and options.write:
            pending_writes[dataset.name] = suggestions

    print('Recategorize summary:')
    print(outcomes.report())

    if options.write:
        write_themes(pending_writes)
def get_packages(publisher=None, theme=None, uncategorized=False, limit=None):
    '''Return the active packages, optionally filtered by publisher/theme.

    :param publisher: name or id of a publisher, as accepted by
        model.Group.get(). Only packages owned by it are returned.
        Falsy means no publisher filter.
    :param theme: selects packages by their primary theme extra:
        True - only packages with a non-empty theme;
        False - only packages with no theme (missing or empty);
        'uncategorized' - only packages whose theme is not a known one
        (including packages with no theme extra at all);
        any other non-empty string - only packages of that theme;
        None - all packages.
    :param uncategorized: when truthy, overrides theme with
        'uncategorized'.
    :param limit: maximum number of packages to return (converted with
        int()). None means no limit.
    :returns: list of model.Package objects
    :raises ValueError: if publisher is given but cannot be found

    Also prints how many packages are returned out of the total matching.
    '''
    from ckan import model
    from ckanext.dgu.lib.theme import PRIMARY_THEME, Themes

    def theme_extras():
        # Active primary-theme extras, as a subquery suitable for an
        # outer join against packages (so packages without the extra
        # still appear, with NULL columns).
        return model.Session.query(model.PackageExtra) \
            .filter_by(key=PRIMARY_THEME) \
            .filter_by(state='active') \
            .subquery()

    packages = model.Session.query(model.Package) \
        .filter_by(state='active')

    # Test the argument itself - there is no 'options' object in this
    # function's scope.
    if publisher:
        publisher_ = model.Group.get(publisher)
        if publisher_ is None:
            raise ValueError('Publisher not found: %s' % publisher)
        packages = packages.filter_by(owner_org=publisher_.id)

    if uncategorized:
        theme = 'uncategorized'

    if theme is True:
        # only packages with a theme
        packages = packages.join(model.PackageExtra) \
            .filter_by(key=PRIMARY_THEME) \
            .filter(model.PackageExtra.value != None) \
            .filter(model.PackageExtra.value != '') \
            .filter_by(state='active')
    elif theme == 'uncategorized':
        # only packages not of a known theme
        themes = theme_extras()
        valid_themes = Themes.instance().data.keys()
        # The NULL test is needed explicitly: for a package with no theme
        # extra the outer join yields NULL, and NOT (NULL IN (...)) is
        # NULL in SQL, which would silently drop the row.
        packages = packages \
            .outerjoin(themes, themes.c.package_id == model.Package.id) \
            .filter(or_(themes.c.value == None,
                        not_(themes.c.value.in_(valid_themes))))
    elif theme:
        # only packages of a particular theme
        packages = packages.join(model.PackageExtra) \
            .filter_by(key=PRIMARY_THEME) \
            .filter(model.PackageExtra.value == theme) \
            .filter_by(state='active')
    elif theme is False:
        # only packages without a theme
        themes = theme_extras()
        packages = packages \
            .outerjoin(themes, themes.c.package_id == model.Package.id) \
            .filter(or_(themes.c.value == None,
                        themes.c.value == ''))
    # otherwise (theme is None): all packages, no theme filter

    # Count before limiting so the printout shows "returned/total"
    total_count = packages.count()
    if limit is not None:
        packages = packages.limit(int(limit))
    packages = packages.all()
    print('Datasets: %s/%s' % (len(packages), total_count))
    return packages
def get_packages(publisher=None, theme=None, uncategorized=False, limit=None):
    '''Return the active packages, optionally filtered by publisher/theme.

    :param publisher: name or id of a publisher, as accepted by
        model.Group.get(). Only packages owned by it are returned.
        Falsy means no publisher filter.
    :param theme: selects packages by their primary theme extra:
        True - only packages with a non-empty theme;
        False - only packages with no theme (missing or empty);
        'uncategorized' - only packages whose theme is not a known one
        (including packages with no theme extra at all);
        any other non-empty string - only packages of that theme;
        None - all packages.
    :param uncategorized: when truthy, overrides theme with
        'uncategorized'.
    :param limit: maximum number of packages to return (converted with
        int()). None means no limit.
    :returns: list of model.Package objects
    :raises ValueError: if publisher is given but cannot be found

    Also prints how many packages are returned out of the total matching.
    '''
    from ckan import model
    from ckanext.dgu.lib.theme import PRIMARY_THEME, Themes

    def theme_extras():
        # Active primary-theme extras, as a subquery suitable for an
        # outer join against packages (so packages without the extra
        # still appear, with NULL columns).
        return model.Session.query(model.PackageExtra) \
            .filter_by(key=PRIMARY_THEME) \
            .filter_by(state='active') \
            .subquery()

    packages = model.Session.query(model.Package) \
        .filter_by(state='active')

    # Test the argument itself - there is no 'options' object in this
    # function's scope.
    if publisher:
        publisher_ = model.Group.get(publisher)
        if publisher_ is None:
            raise ValueError('Publisher not found: %s' % publisher)
        packages = packages.filter_by(owner_org=publisher_.id)

    if uncategorized:
        theme = 'uncategorized'

    if theme is True:
        # only packages with a theme
        packages = packages.join(model.PackageExtra) \
            .filter_by(key=PRIMARY_THEME) \
            .filter(model.PackageExtra.value != None) \
            .filter(model.PackageExtra.value != '') \
            .filter_by(state='active')
    elif theme == 'uncategorized':
        # only packages not of a known theme
        themes = theme_extras()
        valid_themes = Themes.instance().data.keys()
        # The NULL test is needed explicitly: for a package with no theme
        # extra the outer join yields NULL, and NOT (NULL IN (...)) is
        # NULL in SQL, which would silently drop the row.
        packages = packages \
            .outerjoin(themes, themes.c.package_id == model.Package.id) \
            .filter(or_(themes.c.value == None,
                        not_(themes.c.value.in_(valid_themes))))
    elif theme:
        # only packages of a particular theme
        packages = packages.join(model.PackageExtra) \
            .filter_by(key=PRIMARY_THEME) \
            .filter(model.PackageExtra.value == theme) \
            .filter_by(state='active')
    elif theme is False:
        # only packages without a theme
        themes = theme_extras()
        packages = packages \
            .outerjoin(themes, themes.c.package_id == model.Package.id) \
            .filter(or_(themes.c.value == None,
                        themes.c.value == ''))
    # otherwise (theme is None): all packages, no theme filter

    # Count before limiting so the printout shows "returned/total"
    total_count = packages.count()
    if limit is not None:
        packages = packages.limit(int(limit))
    packages = packages.all()
    print('Datasets: %s/%s' % (len(packages), total_count))
    return packages
def recategorize(options):
    '''Suggest a new primary theme for each selected dataset and report it.

    The datasets examined are either the single one named by
    options.dataset, or whatever get_packages() returns for
    options.publisher, options.uncategorized and options.limit.

    options.theme is a comma-separated list of the themes we are willing
    to assign; when it is empty every known theme is acceptable.

    For each dataset the outcome (cannot decide / not interested /
    unchanged / recategorized) is recorded in a StatsList and printed.
    When options.write is set, the recategorized datasets and their
    suggested themes are handed to write_themes() at the end.
    '''
    from ckanext.dgu.lib.theme import (categorize_package, PRIMARY_THEME,
                                       SECONDARY_THEMES, Themes)

    outcomes = StatsList()
    outcomes.report_value_limit = 1000

    # Work out which datasets to look at
    if not options.dataset:
        datasets = get_packages(publisher=options.publisher,
                                theme=None,
                                uncategorized=options.uncategorized,
                                limit=options.limit)
    else:
        single = model.Package.get(options.dataset)
        assert single
        datasets = [single]

    # Work out which themes we are interested in setting on datasets
    known_themes = Themes.instance()
    if not options.theme:
        wanted_themes = known_themes.data
    else:
        wanted_themes = set(options.theme.split(','))
        for requested in wanted_themes:
            assert requested in known_themes.data, '"%s" not in %r' % (
                requested, known_themes.data.keys())

    pending_writes = {}  # dataset name: suggested themes
    for dataset in datasets:
        print('Dataset: %s' % dataset.name)
        suggestions = categorize_package(dataset)
        current_theme = dataset.extras.get(PRIMARY_THEME)
        identity = '%s (%s)' % (dataset.name, current_theme)

        # Classify the outcome first, then record it in one place
        recategorized = False
        if not suggestions:
            category = 'Cannot decide theme'
        else:
            # the first suggestion is the one compared against the filter
            # and the existing theme
            top_theme = suggestions[0]
            if top_theme not in wanted_themes:
                category = 'Not interested in theme'
            elif current_theme == top_theme:
                category = 'Theme unchanged %s' % top_theme
            else:
                category = 'Recategorized to %s' % top_theme
                recategorized = True
        print(outcomes.add(category, identity))

        if recategorized and options.write:
            pending_writes[dataset.name] = suggestions

    print('Recategorize summary:')
    print(outcomes.report())

    if options.write:
        write_themes(pending_writes)
def learn(options):
    '''Analyse datasets that are already categorised to find out which
    words associate with which theme.

    For each theme in turn (stopping before the 30th) options.theme is set
    to that theme and get_freq_dist(options, 1) is called. Every word's
    frequency is expressed as a fraction of that theme's most frequent
    word. Words seen for only one theme are reported as 'unique'; for the
    others, the margin between the two highest fractions is reported.

    NB options.theme is overwritten and left set to the last theme
    processed.
    '''
    from ckanext.dgu.lib.theme import Themes

    level = 1
    min_fraction = 0.0
    # kept per theme, although nothing in this function reads it back
    dists_by_theme = {}
    occurrences = defaultdict(list)  # word: [(fraction, theme, freq), ...]

    for position, theme in enumerate(Themes.instance().data, 1):
        if position == 30:
            break
        options.theme = theme
        dist = get_freq_dist(options, level)
        print('%s: %r' % (theme, dist))
        dists_by_theme[theme] = dist
        if len(dist) == 0:
            continue
        # frequency of the most common word, used to normalise the rest
        top_freq = dist[dist.max()]
        for word, freq in dist.items():
            fraction = float(freq) / top_freq
            if fraction < min_fraction:
                break
            occurrences[word].append((fraction, theme, freq))

    stats = StatsList()
    stats.report_value_limit = 1000
    words_by_theme = defaultdict(list)  # theme: [word, ...]
    for word, entries in occurrences.items():
        if len(entries) == 1:
            # word only occurs for a single theme
            _, only_theme, only_freq = entries[0]
            print(stats.add('unique', '%s %s' % (word, only_theme)))
            words_by_theme[only_theme].append('%s (%s)' % (word, only_freq))
        else:
            # highest fraction first; ties keep their original order
            ranked = sorted(entries, key=lambda entry: entry[0],
                            reverse=True)
            winner, runner_up = ranked[0], ranked[1]
            margin = winner[0] - runner_up[0]
            print(stats.add('margin %.1f' % margin,
                            '%s %s-%s' % (word, winner[1], runner_up[1])))

    print('Unique words:')
    for theme, words in words_by_theme.items():
        print('%s: %s' % (theme, ' '.join(words)))
    print('Summary:')
    print(stats.report())
def learn(options):
    '''Analyse datasets that are already categorised to find out which
    words associate with which theme.

    For each theme in turn (stopping before the 30th) options.theme is set
    to that theme and get_freq_dist(options, 1) is called. Every word's
    frequency is expressed as a fraction of that theme's most frequent
    word. Words seen for only one theme are reported as 'unique'; for the
    others, the margin between the two highest fractions is reported.

    NB options.theme is overwritten and left set to the last theme
    processed.
    '''
    from ckanext.dgu.lib.theme import Themes

    level = 1
    min_fraction = 0.0
    # kept per theme, although nothing in this function reads it back
    dists_by_theme = {}
    occurrences = defaultdict(list)  # word: [(fraction, theme, freq), ...]

    for position, theme in enumerate(Themes.instance().data, 1):
        if position == 30:
            break
        options.theme = theme
        dist = get_freq_dist(options, level)
        print('%s: %r' % (theme, dist))
        dists_by_theme[theme] = dist
        if len(dist) == 0:
            continue
        # frequency of the most common word, used to normalise the rest
        top_freq = dist[dist.max()]
        for word, freq in dist.items():
            fraction = float(freq) / top_freq
            if fraction < min_fraction:
                break
            occurrences[word].append((fraction, theme, freq))

    stats = StatsList()
    stats.report_value_limit = 1000
    words_by_theme = defaultdict(list)  # theme: [word, ...]
    for word, entries in occurrences.items():
        if len(entries) == 1:
            # word only occurs for a single theme
            _, only_theme, only_freq = entries[0]
            print(stats.add('unique', '%s %s' % (word, only_theme)))
            words_by_theme[only_theme].append('%s (%s)' % (word, only_freq))
        else:
            # highest fraction first; ties keep their original order
            ranked = sorted(entries, key=lambda entry: entry[0],
                            reverse=True)
            winner, runner_up = ranked[0], ranked[1]
            margin = winner[0] - runner_up[0]
            print(stats.add('margin %.1f' % margin,
                            '%s %s-%s' % (word, winner[1], runner_up[1])))

    print('Unique words:')
    for theme, words in words_by_theme.items():
        print('%s: %s' % (theme, ' '.join(words)))
    print('Summary:')
    print(stats.report())