def testFileList(self):
    """Optionally check all the filetype => category mappings for a list of paths provided in a file. """
    # Guard clause: without --filelist there is nothing to do.
    if not filelist_file:
        logging.info("Skipping filelist test. To run, specify --filelist=path, where path is a path to a list of files")
        return
    logging.info("Running filelist test using file %s" % filelist_file)
    # Read as bytes and decode explicitly so the file's UTF-8 paths
    # round-trip the same way on Python 2 and 3.
    with open(filelist_file, "rb") as f:
        for raw_line in f:
            candidate = raw_line.decode("utf-8").rstrip()
            (ftype, cat) = filetype_utils.get_file_description_and_category(candidate)
            self._check_category(candidate, ftype, cat)
def get_categories(self, log):
    """Classify every path carried by *log*.

    For each entry in log["path"], looks up the file description,
    category, and tags via filetype_utils.  Returns a tuple of three
    parallel lists: (filetypes, categories, tag_lists).
    """
    filetypes = []
    categories = []
    tag_lists = []
    for raw_path in log["path"]:
        filepath = p2f(raw_path)
        ftype, cat = filetype_utils.get_file_description_and_category(filepath)
        filetypes.append(ftype)
        categories.append(cat)
        tag_lists.append(filetype_utils.get_tags(filepath))
    return (filetypes, categories, tag_lists)
def _tc(self, path, filetype, category, indexable):
    """The work for running a single testcase"""
    # Compare the classifier's answer against the expected values.
    (actual_type, actual_cat) = filetype_utils.get_file_description_and_category(path)
    self.assertEqual(actual_type, filetype,
                     "Expecting filetype %s for %s, got %s" % (filetype, path, actual_type))
    self.assertEqual(actual_cat, category,
                     "Expecting category %s for %s, got %s" % (category, path, actual_cat))
    self._check_category(path, actual_type, actual_cat)
    # Indexability is checked separately from the type/category mapping.
    actual_idx = filetype_utils.is_indexable_file(path)
    self.assertEqual(actual_idx, indexable,
                     "Path %s is %sindexable, expecting %sindexable" %
                     (path, "" if actual_idx else "not ", "" if indexable else "not "))
def send_file(self, host, volume, path, stat):
    """Build a Log record describing one file and push it to "output".

    The record carries host-qualified path, size, permissions, owner,
    volume, filetype, and category.  Unless the 'only_metadata' config
    flag is set, the raw file content is attached base64-encoded.

    :param host: host name prefixed onto the path
    :param volume: volume identifier stored in the record
    :param path: local filesystem path of the file
    :param stat: os.stat() result for the file
    """
    listing = {}
    listing["path"] = [host + ":" + path]
    listing["size"] = [stat.st_size]
    listing["perm"] = [stat.st_mode]
    listing["owner"] = [stat.st_uid]
    (filetype, category) = filetype_utils.get_file_description_and_category(path)
    listing["volume"] = [volume,]
    listing["filetype"] = [filetype,]
    listing["category"] = [category,]
    # dict.get replaces the deprecated has_key (removed in Python 3);
    # "== False" keeps the original semantics: attach data when the flag
    # is absent or explicitly falsy-False.
    if self.config.get('only_metadata', False) == False and self.indexable_file(path):
        # Open in binary mode: b64encode expects raw bytes, and text mode
        # would corrupt binary content on Windows (and fail on Python 3).
        with open(path, "rb") as f:
            listing["data"] = [base64.b64encode(f.read())]
    log = Log()
    log.set_log(listing)
    self.buffered_push("output", log)
def crawl_files(directory_to_crawl): path = os.path.abspath(os.path.expanduser(directory_to_crawl)) assert os.path.isdir(path) print "Crawling directory %s" % path total_size = 0 total_cnt = 0 indexed_size = 0 indexed_cnt = 0 size_by_category = {} cnt_by_category = {} size_by_type = {} cnt_by_type = {} size_by_tag = {} cnt_by_tag = {} entries = [["Path", "Size", "Category", "Type", "Indexable?", "Tags"],] for root, dirnames, filenames in os.walk(path): for filename in filenames: entry = [] fpath = os.path.join(root, filename) entry.append(fpath) stat = os.stat(fpath) filesize = stat.st_size total_size += filesize total_cnt += 1 entry.append(str(filesize)) (filetype, category) = filetype_utils.get_file_description_and_category(fpath) entry.append(category) add_to_dict(size_by_category, category, filesize) add_to_dict(cnt_by_category, category, 1) add_to_dict(size_by_type, filetype, filesize) add_to_dict(cnt_by_type, filetype, 1) entry.append(filetype) if filetype_utils.is_indexable_file(fpath): entry.append("Yes") else: entry.append("No") tags = filetype_utils.get_tags(fpath) if len(tags)>0: entry.append(tags[0]) add_to_dict(size_by_tag, tags[0], filesize) add_to_dict(cnt_by_tag, tags[0], 1) else: entry.append("None") add_to_dict(size_by_tag, "untagged", filesize) add_to_dict(cnt_by_tag, "untagged", 1) entries.append(entry) print "Crawled %d files, for %3.2f MB total" % (total_cnt, float(total_size)/ float(total_cnt)/1000000.0) datafile = os.path.abspath("./file_data.csv") with open(datafile, "w") as fd: for entry in entries: fd.write(", ".join(entry) + "\n") print "Wrote data to file %s" % datafile aggfile = os.path.abspath("./aggregate_data.csv") with open(aggfile, "w") as fa: fa.write("Group, Subgroup, Value\n") fa.write("Total, Count, %d\n" % total_cnt) fa.write("Total, Size, %d\n" % total_size) dict_to_csv(fa, "Cnt by Category", cnt_by_category) dict_to_csv(fa, "Size by Category", size_by_category) dict_to_csv(fa, "Cnt by Type", cnt_by_type) dict_to_csv(fa, 
"Size by Type", size_by_type) dict_to_csv(fa, "Cnt by Tag", cnt_by_tag) dict_to_csv(fa, "Size by Tag", size_by_tag) print "Wrote aggregates to file %s" % aggfile