def __init__(self, base, api_key, secret, token=None, interact=False, cache=0, pretty=False):
    """
    Set up the API client, the option flags and the working directories.

    @param base: Existing directory that holds all data files
    @param api_key: API key, passed through to SafeFlickrAPI
    @param secret: API secret, passed through to SafeFlickrAPI
    @param token: Optional token, passed through to SafeFlickrAPI
    @param interact: Stored as a bool in self.interact
    @param cache: Stored as an int in self.cache
    @param pretty: Stored as a bool in self.pretty
    @raise ValueError: if base is not an existing directory
    """
    self.base = base
    self.ff = SafeFlickrAPI(api_key, secret, token)

    # TODO NORM use a decorator to do this, somehow...
    # needs to be able to access previous stack frame's locals!
    self.interact = bool(interact)
    self.cache = int(cache)
    self.pretty = bool(pretty)

    if not os.path.isdir(str(base)):
        raise ValueError("not a directory: %s" % base)

    # Output sub-directories, all located directly under the base directory.
    self.dir_idx, self.dir_tgr, self.dir_res = [
        os.path.join(base, sub) for sub in ("idx", "tgr", "res")
    ]
    for directory in (self.base, self.dir_idx, self.dir_tgr, self.dir_res):
        if os.path.isdir(directory):
            continue
        os.mkdir(directory)

    # Registry of open resources, keyed by path.
    self.res = {}
class Evaluation(object):
    """
    Runs the individual rounds of an evaluation against a base directory.

    Every file or database handle opened through fp_o(), fp_i() or db() is
    registered in self.res (keyed by path) and is closed either when the same
    path is opened again or when the object is used as a context manager and
    the with-block exits.
    """

    # Ordered (name, Round) pairs.
    # NOTE(review): the Round arguments look like (description, names of
    # prerequisite rounds, files produced) -- confirm against Round itself.
    _rounds = [
        ("social", Round("Scraping social network", [], ["soc.graphml"])),
        ("group", Round("Scraping groups", ["social"], ["group-user.map"])),
        # round_photo rewrites group-user.map and soc.graphml after pruning.
        ("photo", Round("Scraping photos", ["group"], ["prod-doc.db"]+["group-user.map"]+["soc.graphml"])),
        ("inv_pd", Round("Inverting producer-document mapping", ["photo"], ["doc-prod.db"])),
        ("tag", Round("Scraping tags", ["photo"], ["doc-tag.db", "doc-tag.len"])),
        ("inv_dt", Round("Inverting document-tag mapping", ["tag"], ["tag-doc.db"])),
        ("cluster", Round("Scraping clusters", ["tag"], ["tag-cluster.db"])),
        ("generate", Round("Generating data", ["inv_pd", "cluster"], ["p_idx.db", "idx.graphml", "communities.map", "p_tgr.db", "tgr.graphml"])),
        ("writeall", Round("Writing objects", ["generate"], [])),
        ("examine", Round("Examine data", [], [])),
    ]
    rounds = dict(_rounds)
    roundlist = [k for k, r in _rounds]

    def __init__(self, base, api_key, secret, token=None, interact=False, cache=0, pretty=False):
        """
        Set up the API client, the option flags and the working directories.

        @param base: Existing directory that holds all data files
        @param api_key: API key, passed through to SafeFlickrAPI
        @param secret: API secret, passed through to SafeFlickrAPI
        @param token: Optional token, passed through to SafeFlickrAPI
        @param interact: If true, each round ends in an interactive console
        @param cache: LRU cache size used for the p_idx / p_tgr databases
        @param pretty: Passed to SampleStats.printReports by round_examine
        @raise ValueError: if base is not an existing directory
        """
        self.base = base
        self.ff = SafeFlickrAPI(api_key, secret, token)
        self.interact = bool(interact)
        # TODO NORM use a decorator to do this, somehow...
        # needs to be able to access previous stack frame's locals!
        self.cache = int(cache)
        self.pretty = bool(pretty)

        if not os.path.isdir(str(base)):
            raise ValueError("not a directory: %s" % base)

        self.dir_idx = os.path.join(base, "idx")
        self.dir_tgr = os.path.join(base, "tgr")
        self.dir_res = os.path.join(base, "res")

        for path in [self.base, self.dir_idx, self.dir_tgr, self.dir_res]:
            if not os.path.isdir(path):
                os.mkdir(path)

        # Registry of open resources, keyed by path.
        self.res = {}

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        """
        Close every registered resource.

        Entries are removed from the registry as they are closed, so a later
        respush() on the same path never closes a handle a second time.
        Returns None, so an exception raised in the with-block propagates.
        """
        # Iterate over a snapshot of the keys: _release() mutates self.res.
        for path in list(self.res):
            self._release(path)

    def _release(self, path):
        """
        Close and unregister the resource stored under path, if there is one.
        """
        if path in self.res:
            self.res.pop(path).close()
            LOG.info("%s closed" % (path))

    def _console(self, local):
        """
        Start an interactive console over the given namespace if
        self.interact is set; otherwise do nothing.

        @param local: Namespace for the console, normally the caller's locals()
        """
        if self.interact:
            code.interact(banner=self.banner(local), local=local)

    def banner(self, local):
        return "[Evaluation console]\n>>> locals().keys()\n%r\n>>> self.ff\n%r" % (sorted(local.keys()), self.ff)

    def fp_o(self, name):
        """
        Open a file under the base directory for writing and register it.

        @param name: File name relative to the base directory
        @return: The open file object; it is closed by the registry
        """
        fn = os.path.join(self.base, name)
        # Release any handle still open on this path *before* truncating the
        # file, so a stale buffered writer cannot flush into the new contents.
        self._release(fn)
        fp = open(fn, 'w')
        self.respush(fn, fp, 'w')
        return fp

    def fp_i(self, name):
        """
        Open a file under the base directory for reading and register it.

        @param name: File name relative to the base directory
        @return: The open file object; it is closed by the registry
        """
        fn = os.path.join(self.base, name)
        # Closing a pending writer first flushes its data before we read.
        self._release(fn)
        fp = open(fn)
        self.respush(fn, fp, 'r')
        return fp

    def fp_exists(self, name):
        """
        Tell whether a file with the given name exists under the base directory.
        """
        fn = os.path.join(self.base, name)
        return os.path.exists(fn)

    def df(self, name):
        """
        Return the path of the database file for the given database name.
        """
        return os.path.join(self.base, "%s.db" % name)

    def db(self, name, writeback=False, lrusize=0):
        """
        Open the named database under the base directory and register it.

        @param name: Database name; the file is "<name>.db"
        @param writeback: Passed through to db_open
        @param lrusize: If non-zero, attach a cache of this size to the database
        @return: The open database; it is closed by the registry
        """
        # Validate before opening so a bad value cannot leak an open handle.
        lrusize = int(lrusize)
        dbf = self.df(name)
        # Never hold two handles on the same database file at the same time.
        self._release(dbf)
        db = db_open(dbf, writeback)
        if lrusize:
            shelve_attach_cache(db, lrusize)
        self.respush(dbf, db, 'rw')
        return db

    def respush(self, path, res, mode):
        """
        Register an open resource, closing any resource previously registered
        under the same path.

        @param path: Registry key, the path of the resource
        @param res: Object with a close() method
        @param mode: Access mode, used only in the log message
        """
        self._release(path)
        self.res[path] = res
        LOG.info("%s opened (%s)" % (path, mode))

    def round_social(self, seed, size):
        """
        Scrape the social network using breadth-search.

        @param seed: Seed identity
        @param size: Number of identities to scrape
        """
        size = int(size)
        socgr = self.ff.scrapeIDs(seed, size).graph
        socgr.write_graphml(self.fp_o("soc.graphml"))

        self._console(locals())

    def round_group(self):
        """
        Scrape the group network from the social network.
        """
        users = Graph.Read(self.fp_i("soc.graphml")).vs["id"]
        gumap = self.ff.scrapeGroups(users)
        dict_save(gumap, self.fp_o("group-user.map"))

        self._console(locals())

    def round_photo(self):
        """
        Scrape photos of the collected producers.
        """
        socgr = Graph.Read(self.fp_i("soc.graphml"))
        gumap = dict_load(self.fp_i("group-user.map"))
        pddb = self.db("prod-doc")

        self.ff.commitUserPhotos(socgr.vs["id"], pddb)
        self.ff.commitGroupPhotos(gumap, pddb)
        self.ff.pruneProducers(socgr, gumap, pddb)

        # Write the graph and the group map back after pruning.
        socgr.write_graphml(self.fp_o("soc.graphml"))
        dict_save(gumap, self.fp_o("group-user.map"))

        self._console(locals())

    def round_inv_pd(self):
        """
        Invert the producer-photo mapping.
        """
        pddb = self.db("prod-doc")
        dppb = self.db("doc-prod", writeback=True)
        self.ff.invertMap(pddb, dppb, "context")

        self._console(locals())

    def round_tag(self):
        """
        Scrape tags of the collected photos.
        """
        pddb = self.db("prod-doc")
        dtdb = self.db("doc-tag")

        # from_iterable walks the stored values one at a time; chain(*...)
        # would first unpack every value into a single argument tuple.
        photos = chain.from_iterable(pddb.itervalues())
        self.ff.commitPhotoTags(photos, dtdb)
        # The handle stays registered; it is flushed when the registry closes it.
        self.fp_o("doc-tag.len").write("%s\n" % len(dtdb))

        self._console(locals())

    def round_inv_dt(self):
        """
        Invert the photo-tag mapping.
        """
        dtdb = self.db("doc-tag")
        tddb = self.db("tag-doc", writeback=True)
        self.ff.invertMap(dtdb, tddb, "tag-photo")

        self._console(locals())

    def round_cluster(self):
        """
        Scrape clusters of the collected tags.
        """
        dtdb = self.db("doc-tag")
        tcdb = self.db("tag-cluster")

        # Lazy flattening, see round_tag.
        tags = chain.from_iterable(dtdb.itervalues())
        self.ff.commitTagClusters(tags, tcdb)

        self._console(locals())

    def round_generate(self):
        """
        Generate objects from the scraped data.

        Each stage is skipped when its output file already exists; in that
        case the earlier result is loaded from the file instead.
        """
        socgr = Graph.Read(self.fp_i("soc.graphml"))
        gumap = dict_load(self.fp_i("group-user.map"))

        pddb = self.db("prod-doc")
        dppb = self.db("doc-prod")
        dtdb = self.db("doc-tag")
        tcdb = self.db("tag-cluster")

        phdb = self.db("p_idx", lrusize=self.cache)
        phsb = self.db("p_idx_s")
        pgdb = self.db("p_tgr", lrusize=self.cache)
        pgsb = self.db("p_tgr_s")

        FILE_IDX = "idx.graphml"
        FILE_CMM = "communities.map"
        FILE_TGR = "tgr.graphml"
        FILE_PTB = "ptb.graphml"
        FILE_PTB_U = "ptables.map"

        sg = SampleGenerator(socgr, gumap, pddb, dppb, dtdb, tcdb, phdb, phsb, pgdb, pgsb)

        # indexes
        if not self.fp_exists(FILE_IDX):
            sg.generateIndexes()
            sg.prodgr.write(self.fp_o(FILE_IDX))
        else:
            sg.prodgr = Graph.Read(self.fp_i(FILE_IDX))

        # communities
        if not self.fp_exists(FILE_CMM):
            sg.generateCommunities()
            dict_save(dict(enumerate(sg.comm)), self.fp_o(FILE_CMM))
        else:
            # NOTE(review): the order is recovered by sorting the loaded keys;
            # this assumes dict_load returns them as ints, not strings -- confirm.
            sg.comm = [v for k, v in sorted(dict_load(self.fp_i(FILE_CMM)).iteritems())]

        # tgraphs
        if not self.fp_exists(FILE_TGR):
            sg.generateTGraphs()
            sg.sprdgr.write(self.fp_o(FILE_TGR))
        else:
            sg.sprdgr = Graph.Read(self.fp_i(FILE_TGR))

        # ptables
        if not self.fp_exists(FILE_PTB):
            sg.generatePTables()
            sg.ptabgr.write(self.fp_o(FILE_PTB))
            dict_save(sg.ptbmap, self.fp_o(FILE_PTB_U))
        else:
            sg.ptabgr = Graph.Read(self.fp_i(FILE_PTB))
            sg.ptbmap = dict_load(self.fp_i(FILE_PTB_U))

        LOG.info("generation complete; don't forget to run `postgen -d %s`" % self.base)

        self._console(locals())

    def round_writeall(self):
        """
        Write objects from the generated data.
        """
        # socgr and gumap are not used below; they are loaded so that they
        # show up in the interactive console's namespace.
        socgr = Graph.Read(self.fp_i("soc.graphml"))
        gumap = dict_load(self.fp_i("group-user.map"))
        totalsize = int(self.fp_i("doc-tag.len").read())

        phdb = self.db("p_idx")
        pgdb = self.db("p_tgr")

        ss = SampleWriter(phdb, pgdb, totalsize)
        ss.writeIndexes(self.dir_idx)
        ss.writeTGraphs(self.dir_tgr)

        self._console(locals())

    def round_examine(self, *args):
        """
        Examine objects through the python interactive interpreter.

        @param args: Names of report files inside the "res" directory
        """
        # Several of these are only used from the interactive console.
        socgr = Graph.Read(self.fp_i("soc.graphml"))
        gumap = dict_load(self.fp_i("group-user.map"))

        pddb = self.db("prod-doc")
        dppb = self.db("doc-prod")
        dtdb = self.db("doc-tag")
        tddb = self.db("tag-doc")
        tcdb = self.db("tag-cluster")
        totalsize = int(self.fp_i("doc-tag.len").read())

        phdb = self.db("p_idx")
        phsb = self.db("p_idx_s")
        pgdb = self.db("p_tgr")
        pgsb = self.db("p_tgr_s")

        ptabgr = Graph.Read(self.fp_i("ptb.graphml"))
        prodgr = Graph.Read(self.fp_i("idx.graphml"))
        sprdgr = Graph.Read(self.fp_i("tgr.graphml"))

        stats = SampleStats(pddb, dppb, dtdb, tddb, totalsize, ptabgr, prodgr, sprdgr)

        reports = []
        for arg in args:
            with open(os.path.join(self.dir_res, arg)) as fp:
                chaps = read_chapters(fp)
            reports.append(QueryReport.from_chapters(chaps))
        stats.printReports(reports, pretty=self.pretty)

        self._console(locals())