def write(fondz_dir):
    """
    render index.html for a fondz directory

    Reads topics.json, formats.json and bags.json from the "js"
    subdirectory of fondz_dir, summarizes the formats, and renders the
    'index.html' template to <fondz_dir>/index.html.
    """
    def _load(name):
        # every data file lives under <fondz_dir>/js
        return read_json(join(fondz_dir, "js", name))

    target = join(fondz_dir, "index.html")

    # keep the read order: topics, formats (then summarized), bags
    context = {}
    context["topics"] = _load("topics.json")
    context["format_summary"] = summarize(_load("formats.json"))
    context["bag_summary"] = _load("bags.json")

    render_to('index.html', target, **context)
def _add_topics(fondz_dir):
    """
    run topic modeling for fondz_dir and store the result in the fondz file

    The value returned by topics(fondz_dir) is saved under the
    'topic_model' key of the fondz description, which is then written
    back to the same file it was read from.
    """
    logging.getLogger("fondz").info("doing topic modeling for %s", fondz_dir)

    path = _fondz_file(fondz_dir)
    description = read_json(path)
    description['topic_model'] = topics(fondz_dir)
    write_json(description, path)
def write(fondz_dir):
    """
    write out the static assets and html pages for a fondz directory

    Sets up logging, loads the fondz description, writes the static
    files, and then writes the index, topics, bags and formats pages
    (in that order).
    """
    _setup_logging(fondz_dir)
    logging.getLogger("fondz").info(
        "writing fondz description for %s", fondz_dir)

    description = read_json(_fondz_file(fondz_dir))

    _write_static(fondz_dir)

    # each page writer takes the directory and the loaded description
    page_writers = (
        _write_index_html,
        _write_topics_html,
        _write_bags_html,
        _write_formats_html,
    )
    for write_page in page_writers:
        write_page(fondz_dir, description)
def test_topics(self):
    # Create a fondz description from the test bag and check the
    # 'topic_model' section that ends up in fondz.json.
    fondz_dir = tempfile.mkdtemp()
    try:
        create("test", fondz_dir, bag, overwrite=True)

        fondz_file = join(fondz_dir, 'fondz.json')
        fondz = read_json(fondz_file)
        results = fondz['topic_model']

        # make sure mallet details are present
        self.assertIn('mallet', results)
        self.assertEqual(len(results['mallet']), 2)

        # the actual topics
        topics = results['topics']
        self.assertGreater(len(topics), 0)
        self.assertEqual(len(topics[0]['words']), 15)
        for topic in topics:
            self.assertGreater(len(topic['files']), 0)
            self.assertGreater(len(topic['words']), 0)
            self.assertTrue(topic['score'])
    finally:
        # remove the temp directory even when an assertion above fails,
        # so failed runs do not leave directories behind
        shutil.rmtree(fondz_dir)
def test_add_bags(self):
    # Add two test bags to a fresh fondz directory and check what gets
    # recorded in fondz.json and written under derivatives/.

    # local import: only needed for the temp directory cleanup below
    import shutil

    # create a fondz directory, and add 2 test bags to it
    fondz_dir = tempfile.mkdtemp()
    try:
        init("test", fondz_dir)
        add_bag(fondz_dir, bag1)
        add_bag(fondz_dir, bag2)

        # load the generated fondz.json
        fondz_json = join(fondz_dir, "fondz.json")
        self.assertTrue(isfile(fondz_json))
        result = read_json(fondz_json)

        # check that bags are there
        self.assertEqual(len(result['bags']), 2)
        self.assertEqual(result['num_files'], 12)
        self.assertEqual(result['bytes'], 7772330)
        self.assertEqual(result['bags'][0]['path'], bag1)
        self.assertEqual(len(result['bags'][0]['manifest']), 4)
        self.assertEqual(result['bags'][1]['path'], bag2)
        self.assertEqual(len(result['bags'][1]['manifest']), 8)

        # look closer at bag1
        self.assertTrue(uuid.UUID(result['bags'][0]['id']))
        f = result['bags'][0]['manifest'][0]
        self.assertEqual(f['path'], 'data/newspaper.jpg')
        self.assertEqual(f['md5'], 'a0471d984e6e82f15da686cebdb38a36')
        self.assertEqual(f['bytes'], 7004510)
        self.assertTrue(f['modified'])
        self.assertTrue(f['created'])
        self.assertEqual(f['format'], 'fmt/43')

        # make sure formats are populated
        self.assertEqual(len(result['formats'].keys()), 5)

        # make sure at least one derivative is there
        bag_id = result['bags'][0]['id']
        deriv = join(fondz_dir, 'derivatives', bag_id, 'word.doc.html')
        self.assertTrue(isfile(deriv))
    finally:
        # always remove the temp directory, pass or fail; ignore_errors
        # keeps a cleanup problem from masking the real test outcome
        shutil.rmtree(fondz_dir, ignore_errors=True)
def test_create(self):
    # Create a fondz description from bag1 and check fondz.json, the
    # derivative files and the generated html pages.

    # local import: only needed for the temp directory cleanup below
    import shutil

    d = tempfile.mkdtemp()
    try:
        create("test", d, bag1, overwrite=True)
        self.assertTrue(isdir(d))

        # fondz.json there?
        fondz_file = join(d, "fondz.json")
        self.assertTrue(isfile(fondz_file))
        fondz = read_json(fondz_file)
        bag_id = fondz["bags"][0]["id"]

        # topics there
        self.assertTrue(len(fondz["topic_model"]["topics"]) > 0)

        # bag is there
        self.assertEqual(len(fondz['bags']), 1)
        self.assertEqual(fondz['bags'][0]['path'], bag1)
        self.assertEqual(len(fondz['bags'][0]['manifest']), 4)

        f = fondz['bags'][0]['manifest'][0]
        self.assertEqual(f['path'], 'data/newspaper.jpg')
        self.assertEqual(f['bytes'], 7004510)
        self.assertEqual(f['format'], 'fmt/43')

        f = fondz['bags'][0]['manifest'][1]
        self.assertEqual(f['path'], 'data/wordperfect.wp')
        self.assertEqual(f['bytes'], 672936)
        self.assertEqual(f['format'], 'x-fmt/394')

        # derivatives there?
        self.assertTrue(
            isfile(join(d, "derivatives", bag_id, "word.doc.html")))
        self.assertTrue(
            isfile(join(d, "derivatives", bag_id, "wordperfect.wp.html")))
        self.assertTrue(
            isfile(join(d, "derivatives", bag_id, "subdir",
                        "word.docx.html")))

        # html files there?
        for name in ["index.html", "topics.html", "formats.html",
                     "bags.html"]:
            html_file = join(d, name)
            self.assertTrue(isfile(html_file))
            # close the file promptly instead of leaking the handle
            with open(html_file) as fh:
                html = fh.read()
            self.assertTrue('<!doctype html>' in html)
    finally:
        # always remove the temp directory, pass or fail; ignore_errors
        # keeps a cleanup problem from masking the real test outcome
        shutil.rmtree(d, ignore_errors=True)
def add_bag(fondz_dir, bag_dir):
    """
    register a bagit directory in the fondz description

    The bag is looked up by its absolute path; an Exception is raised if
    a bag with that path is already listed. Otherwise the bag's formats
    are added, its files are converted, and the description's "bags",
    "bytes" and "num_files" entries are updated and written back.
    """
    # TODO: validate bag first?
    logging.getLogger("fondz").info("adding bag %s to %s", bag_dir, fondz_dir)

    # normalize only after logging, so the log shows the path as given
    bag_dir = abspath(bag_dir)
    description_path = _fondz_file(fondz_dir)
    description = read_json(description_path)

    already_added = any(
        existing["path"] == bag_dir for existing in description["bags"])
    if already_added:
        raise Exception("bag was already added: %s" % bag_dir)

    new_bag = _get_bag(bag_dir)
    _add_formats(new_bag, description, fondz_dir)
    _convert(new_bag, fondz_dir)

    description["bags"].append(new_bag)
    description["bytes"] += new_bag["bytes"]
    description["num_files"] += len(new_bag["manifest"])

    write_json(description, description_path)