Esempio n. 1
0
 def indexable_file(self, path):
   """Tell whether the file at ``path`` should be sent to the indexing engine.

   When ``self.only_index`` is set (truthy), a path qualifies only if it
   ends with one of the suffixes listed there. Otherwise the decision is
   delegated to ``filetype_utils.is_indexable_file``. Data is only sent for
   paths where this returns True.
   """
   if not self.only_index:
     return filetype_utils.is_indexable_file(path)
   # An explicit suffix whitelist takes precedence over file-type detection.
   return any(path.endswith(suffix) for suffix in self.only_index)
Esempio n. 2
0
 def _tc(self, path, filetype, category, indexable):
     """Run a single testcase against ``filetype_utils`` for ``path``.

     Asserts that the description and category reported for ``path`` equal
     ``filetype`` and ``category``, hands the reported pair to
     ``self._check_category``, and finally asserts that the indexability
     reported for ``path`` equals ``indexable``.
     """
     actual_type, actual_category = \
         filetype_utils.get_file_description_and_category(path)

     type_msg = ("Expecting filetype %s for %s, got %s" %
                 (filetype, path, actual_type))
     self.assertEqual(actual_type, filetype, type_msg)

     category_msg = ("Expecting category %s for %s, got %s" %
                     (category, path, actual_category))
     self.assertEqual(actual_category, category, category_msg)

     self._check_category(path, actual_type, actual_category)

     actual_indexable = filetype_utils.is_indexable_file(path)
     # Each prefix turns "indexable" into "not indexable" where appropriate.
     got_prefix = "" if actual_indexable else "not "
     want_prefix = "" if indexable else "not "
     indexable_msg = ("Path %s is %sindexable, expecting %sindexable" %
                      (path, got_prefix, want_prefix))
     self.assertEqual(actual_indexable, indexable, indexable_msg)
 def include(record):
   """Decide whether ``record`` should be included.

   A record that carries a "token" key is always included. Any other record
   is included only if ``filetype_utils.is_indexable_file`` accepts the
   result of ``p2f(record["path"])``.
   """
   # The "in" operator replaces dict.has_key(), which no longer exists in
   # Python 3 and behaves identically for dicts in Python 2.
   if "token" in record:
     return True
   return filetype_utils.is_indexable_file(p2f(record["path"]))
 def indexable_file(self, path):
   """Tell whether the file at ``path`` should be sent to the indexing engine.

   The decision is delegated entirely to
   ``filetype_utils.is_indexable_file``; data is only sent for paths where
   this returns True.
   """
   can_index = filetype_utils.is_indexable_file(path)
   return can_index
Esempio n. 5
0
def crawl_files(directory_to_crawl):
    """Crawl a directory tree and write per-file and aggregate CSV reports.

    Every file found under ``directory_to_crawl`` is stat'ed and classified
    through ``filetype_utils`` (description, category, indexability, tags).
    Once the whole tree has been walked, two files are written to the
    current working directory:

    * ``file_data.csv`` -- a header row plus one row per file with its path,
      size, category, type, indexable flag and first tag ("None" if the
      file has no tags).
    * ``aggregate_data.csv`` -- the total count and size, followed by the
      per-category, per-type and per-tag dictionaries as emitted by
      ``dict_to_csv``.

    Args:
        directory_to_crawl: Directory to walk. ``~`` is expanded and the
            path is made absolute before use.

    Raises:
        AssertionError: If the resolved path is not a directory.
    """
    path = os.path.abspath(os.path.expanduser(directory_to_crawl))
    assert os.path.isdir(path), "%s is not a directory" % path
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Crawling directory %s" % path)

    total_size = 0
    total_cnt = 0
    size_by_category = {}
    cnt_by_category = {}
    size_by_type = {}
    cnt_by_type = {}
    size_by_tag = {}
    cnt_by_tag = {}

    entries = [["Path", "Size", "Category", "Type", "Indexable?", "Tags"]]
    for root, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            fpath = os.path.join(root, filename)
            filesize = os.stat(fpath).st_size
            total_size += filesize
            total_cnt += 1

            (filetype, category) = \
                filetype_utils.get_file_description_and_category(fpath)
            # NOTE(review): add_to_dict is defined elsewhere; presumably it
            # accumulates the value under the given key -- confirm.
            add_to_dict(size_by_category, category, filesize)
            add_to_dict(cnt_by_category, category, 1)
            add_to_dict(size_by_type, filetype, filesize)
            add_to_dict(cnt_by_type, filetype, 1)

            indexable = "Yes" if filetype_utils.is_indexable_file(fpath) else "No"

            # Only the first tag is reported. Untagged files show "None" in
            # the per-file report but are grouped under "untagged" in the
            # aggregates.
            tags = filetype_utils.get_tags(fpath)
            if tags:
                tag_label = tag_key = tags[0]
            else:
                tag_label, tag_key = "None", "untagged"
            add_to_dict(size_by_tag, tag_key, filesize)
            add_to_dict(cnt_by_tag, tag_key, 1)

            entries.append([fpath, str(filesize), category, filetype,
                            indexable, tag_label])

    # Reporting happens once, after the walk has finished. Running it per
    # directory rewrote both files repeatedly and divided by zero whenever
    # the first directory visited contained no files.
    print("Crawled %d files, for %3.2f MB total" %
          (total_cnt, float(total_size) / 1000000.0))

    datafile = os.path.abspath("./file_data.csv")
    with open(datafile, "w") as fd:
        fd.writelines(", ".join(entry) + "\n" for entry in entries)
    print("Wrote data to file %s" % datafile)

    aggfile = os.path.abspath("./aggregate_data.csv")
    with open(aggfile, "w") as fa:
        fa.write("Group, Subgroup, Value\n")
        fa.write("Total, Count, %d\n" % total_cnt)
        fa.write("Total, Size, %d\n" % total_size)
        dict_to_csv(fa, "Cnt by Category", cnt_by_category)
        dict_to_csv(fa, "Size by Category", size_by_category)
        dict_to_csv(fa, "Cnt by Type", cnt_by_type)
        dict_to_csv(fa, "Size by Type", size_by_type)
        dict_to_csv(fa, "Cnt by Tag", cnt_by_tag)
        dict_to_csv(fa, "Size by Tag", size_by_tag)
    print("Wrote aggregates to file %s" % aggfile)