def populate_taxonomy(**kwargs):
    """
    Dump every class yielded by tdb.taxonomy() to '<out_dir>/taxonomy.json'.

    The output holds one JSON object per line. Each object pairs the
    field names in FIELDS['thesaurusclass'] with, in order, the class's
    id, label, wordclass, level, branch_size, sortcode and parent_id.

    Keyword arguments:
        out_dir -- directory in which 'taxonomy.json' is created
    """
    target_path = os.path.join(kwargs.get('out_dir'), 'taxonomy.json')
    with open(target_path, 'w') as sink:
        for node in tdb.taxonomy():
            values = (
                node.id,
                node.label,
                node.wordclass,
                node.level,
                node.branch_size,
                node.sortcode,
                node.parent_id,
            )
            # zip() stops at the shorter sequence, so any surplus field
            # names or values are dropped rather than raising.
            record = dict(zip(FIELDS['thesaurusclass'], values))
            sink.write(json.dumps(record) + '\n')
def options_list(**kwargs):
    """
    Build the list of thesaurus class IDs offered to the classifier
    as its set of candidate options.

    Keyword arguments:
        min_size -- smallest acceptable branch size (default 2500);
            not enforced for level-2 classes
        max_size -- largest acceptable branch size (default 50000)

    A class from tdb.taxonomy(level=5) is kept when its level is 2-5,
    its wordclass is None, and its branch size is within the bounds.
    """
    # Only branches whose size lies between these limits get a classifier.
    smallest = kwargs.get('min_size', 2500)
    largest = kwargs.get('max_size', 50000)

    selected = []
    for node in tdb.taxonomy(level=5):
        if node.level not in (2, 3, 4, 5):
            continue
        if node.wordclass is not None:
            continue
        within_cap = node.branch_size <= largest
        if not within_cap:
            continue
        # Level-2 classes are accepted even when below the minimum size.
        if node.level == 2 or node.branch_size >= smallest:
            selected.append(node.id)
    return selected