def test_more_host_zero_path():
    """Host limits of 5 or 7 with a path limit of 0 yield the first four expected sub-URIs."""
    surt_key = sample[0]["surt"]
    expected = sample[0]["suburis"][:4]
    for host_limit in (5, 7):
        produced = generate_suburis(
            surt_key,
            max_host_segments=host_limit,
            max_path_segments=0,
        )
        assert produced == expected
def test_more_host_equal_path():
    """Host limits of 5 or 7 with a path limit of 3 yield the complete expected sub-URI list."""
    surt_key = sample[0]["surt"]
    expected = sample[0]["suburis"]
    for host_limit in (5, 7):
        produced = generate_suburis(
            surt_key,
            max_host_segments=host_limit,
            max_path_segments=3,
        )
        assert produced == expected
def test_more_host_middle_path():
    """Host limits of 5 or 7 with a path limit of 2 yield the first six expected sub-URIs."""
    surt_key = sample[0]["surt"]
    expected = sample[0]["suburis"][:6]
    for host_limit in (5, 7):
        produced = generate_suburis(
            surt_key,
            max_host_segments=host_limit,
            max_path_segments=2,
        )
        assert produced == expected
def test_zero_host_more_path():
    """A host limit of 0 yields no sub-URIs, whether the path limit is 4 or 6."""
    surt_key = sample[0]["surt"]
    for path_limit in (4, 6):
        produced = generate_suburis(
            surt_key,
            max_host_segments=0,
            max_path_segments=path_limit,
        )
        assert produced == []
def test_one_host_more_path():
    """A host limit of 1 yields only the first expected sub-URI for path limits of 4 or 6."""
    surt_key = sample[0]["surt"]
    expected = sample[0]["suburis"][:1]
    for path_limit in (4, 6):
        produced = generate_suburis(
            surt_key,
            max_host_segments=1,
            max_path_segments=path_limit,
        )
        assert produced == expected
def test_explicit_none_param():
    """Passing None for either limit, or for both, yields the complete expected sub-URI list."""
    surt_key = sample[0]["surt"]
    expected = sample[0]["suburis"]
    limit_sets = (
        {"max_host_segments": None},
        {"max_path_segments": None},
        {"max_host_segments": None, "max_path_segments": None},
    )
    for limits in limit_sets:
        assert generate_suburis(surt_key, **limits) == expected
def test_non_int_param():
    """Passing the string "all" for either limit, or for both, yields the complete expected sub-URI list."""
    surt_key = sample[0]["surt"]
    expected = sample[0]["suburis"]
    limit_sets = (
        {"max_host_segments": "all"},
        {"max_path_segments": "all"},
        {"max_host_segments": "all", "max_path_segments": "all"},
    )
    for limits in limit_sets:
        assert generate_suburis(surt_key, **limits) == expected
def _update_ds(self, entry):
    """Fold one processed CDX line into the accumulated records.

    Every sub-URI generated from ``entry.surt`` (bounded by
    ``self.max_host_segments`` and ``self.max_path_segments``) is recorded
    under "suburi". ``entry.time[0:6]`` is recorded under "time" and
    ``entry.mime`` under "mediatype". Each record is passed ``entry.surt``
    as its third argument.
    """
    expanded = generate_suburis(
        entry.surt,
        max_host_segments=self.max_host_segments,
        max_path_segments=self.max_path_segments,
    )
    for suburi in expanded:
        self._update_record("suburi", suburi, entry.surt)

    time_prefix = entry.time[:6]
    self._update_record("time", time_prefix, entry.surt)
    self._update_record("mediatype", entry.mime, entry.surt)
def _update_ds(self, count, entry): """Update data structure after processing a line from the CDX""" try: suburis = generate_suburis(surt(entry), max_host_segments=self.max_host_segments, max_path_segments=self.max_path_segments) for s in suburis: self._update_record("suburi", s, count) #self._update_record("time", entry.time[0:6], entry.surt) #self._update_record("mediatype", entry.mime, entry.surt) except: print("Something went wrong while processing " + entry)
def _update_ds(self, count, entry): """Update data structure after processing a line from the CDX""" try: suburis = generate_suburis( surt(entry), max_host_segments=self.max_host_segments, max_path_segments=self.max_path_segments) for s in suburis: self._update_record("suburi", s, count) #self._update_record("time", entry.time[0:6], entry.surt) #self._update_record("mediatype", entry.mime, entry.surt) except: print("Something went wrong while processing " + entry)
def generate_all_suburis(host, path):
    """Write the sub-URIs of every input entry to a single output file.

    Each file named in ``sys.argv[1:]`` is read line by line. A line is
    expected to split on whitespace into exactly two fields, of which the
    second is the entry. The entry is converted with ``surt()``, expanded
    with ``generate_suburis()`` using ``host`` and ``path`` as the
    ``max_host_segments`` / ``max_path_segments`` limits, and the resulting
    sub-URIs are written one per line.

    The output file is named ``<collection>-H<host>P<path>.suburi`` and is
    created inside ``opdir``; ``collection`` and ``opdir`` are names resolved
    from the enclosing scope, not parameters.

    Processing is best-effort per line: a line that cannot be split or
    expanded is reported on stdout and skipped.
    """
    print("Generating Sub-URIs of {0} with Host: {1}, Path: {2}".format(collection, host, path))
    filename = "{0}-H{1}P{2}.suburi".format(collection, host, path)
    # Context manager guarantees the output file is closed even if reading
    # one of the input files fails.
    with open(os.path.join(opdir, filename), "w") as opf:
        for extr in sys.argv[1:]:
            with open(extr) as f:
                for line in f:
                    try:
                        # The split lives inside the try so a malformed or
                        # blank line is reported instead of aborting the run.
                        _count, entry = line.split()
                        suburis = generate_suburis(
                            surt(entry),
                            max_host_segments=host,
                            max_path_segments=path,
                        )
                        opf.write("\n".join(suburis) + "\n")
                    except Exception:
                        # Exception, not a bare except, so Ctrl-C and
                        # SystemExit still stop the run.
                        print("Something went wrong while processing " + line)
def test_all_host_all_path():
    """With no limits, progressively truncated SURTs yield correspondingly shorter sub-URI lists."""
    surt_key = sample[0]["surt"]
    expected = sample[0]["suburis"]
    # (input SURT variant, expected sub-URI list) pairs, checked in order.
    cases = [
        (surt_key, expected),
        (surt_key[:-7], expected),
        (surt_key[:-8], expected),
        (surt_key[:-16], expected[:-1]),
        (surt_key[:-17], expected[:-1]),
        (surt_key[:-19], expected[:-2]),
        (surt_key[:-20], expected[:-2]),
        (surt_key[:-22], expected[:-3]),
        (surt_key[:-28] + ")/", expected[:-4]),
        (surt_key[:-35] + ")/", expected[:-5]),
        (surt_key[:-43] + ")/", expected[:-6]),
    ]
    for variant, wanted in cases:
        assert generate_suburis(variant) == wanted
def test_middle_host_all_path():
    """A host limit of 2 with no path limit yields the first two expected sub-URIs."""
    record = sample[0]
    produced = generate_suburis(record["surt"], max_host_segments=2)
    assert produced == record["suburis"][:2]
def test_all_host_one_path():
    """A path limit of 1 with no host limit yields the first five expected sub-URIs."""
    record = sample[0]
    produced = generate_suburis(record["surt"], max_path_segments=1)
    assert produced == record["suburis"][:5]
def test_one_host_all_path():
    """A host limit of 1 with no path limit yields only the first expected sub-URI."""
    record = sample[0]
    produced = generate_suburis(record["surt"], max_host_segments=1)
    assert produced == record["suburis"][:1]
def test_zero_host_all_path():
    """A host limit of 0 with no path limit yields an empty list."""
    surt_key = sample[0]["surt"]
    produced = generate_suburis(surt_key, max_host_segments=0)
    assert produced == []
def test_one_host_equal_path():
    """A host limit of 1 with a path limit of 3 yields only the first expected sub-URI."""
    record = sample[0]
    produced = generate_suburis(
        record["surt"],
        max_host_segments=1,
        max_path_segments=3,
    )
    assert produced == record["suburis"][:1]
def test_equal_host_all_path():
    """A host limit of 4 with no path limit yields the complete expected sub-URI list."""
    record = sample[0]
    produced = generate_suburis(record["surt"], max_host_segments=4)
    assert produced == record["suburis"]
def test_equal_host_middle_path():
    """A host limit of 4 with a path limit of 2 yields the first six expected sub-URIs."""
    record = sample[0]
    produced = generate_suburis(
        record["surt"],
        max_host_segments=4,
        max_path_segments=2,
    )
    assert produced == record["suburis"][:6]
def test_equal_host_zero_path():
    """A host limit of 4 with a path limit of 0 yields the first four expected sub-URIs."""
    record = sample[0]
    produced = generate_suburis(
        record["surt"],
        max_host_segments=4,
        max_path_segments=0,
    )
    assert produced == record["suburis"][:4]
def test_paren_in_path():
    """The second sample's SURT yields that sample's complete expected sub-URI list."""
    record = sample[1]
    produced = generate_suburis(record["surt"])
    assert produced == record["suburis"]
def test_all_host_all_path():
    """With no limits, progressively truncated SURTs yield correspondingly shorter sub-URI lists."""
    full_surt = sample[0]["surt"]
    all_suburis = sample[0]["suburis"]
    # Each pair is (input SURT variant, expected sub-URI list), checked in order.
    scenarios = (
        (full_surt, all_suburis),
        (full_surt[:-7], all_suburis),
        (full_surt[:-8], all_suburis),
        (full_surt[:-16], all_suburis[:-1]),
        (full_surt[:-17], all_suburis[:-1]),
        (full_surt[:-19], all_suburis[:-2]),
        (full_surt[:-20], all_suburis[:-2]),
        (full_surt[:-22], all_suburis[:-3]),
        (full_surt[:-28] + ")/", all_suburis[:-4]),
        (full_surt[:-35] + ")/", all_suburis[:-5]),
        (full_surt[:-43] + ")/", all_suburis[:-6]),
    )
    for surt_variant, wanted in scenarios:
        assert generate_suburis(surt_variant) == wanted
def test_all_host_more_path():
    """Path limits of 4 or 6 with no host limit yield the complete expected sub-URI list."""
    surt_key = sample[0]["surt"]
    expected = sample[0]["suburis"]
    for path_limit in (4, 6):
        produced = generate_suburis(surt_key, max_path_segments=path_limit)
        assert produced == expected
def test_middle_host_zero_path():
    """A host limit of 2 with a path limit of 0 yields the first two expected sub-URIs."""
    record = sample[0]
    produced = generate_suburis(
        record["surt"],
        max_host_segments=2,
        max_path_segments=0,
    )
    assert produced == record["suburis"][:2]
def test_more_host_all_path():
    """Host limits of 5 or 7 with no path limit yield the complete expected sub-URI list."""
    surt_key = sample[0]["surt"]
    expected = sample[0]["suburis"]
    for host_limit in (5, 7):
        produced = generate_suburis(surt_key, max_host_segments=host_limit)
        assert produced == expected
def test_equal_host_one_path():
    """A host limit of 4 with a path limit of 1 yields the first five expected sub-URIs."""
    record = sample[0]
    produced = generate_suburis(
        record["surt"],
        max_host_segments=4,
        max_path_segments=1,
    )
    assert produced == record["suburis"][:5]
def test_middle_host_more_path():
    """A host limit of 2 yields the first two expected sub-URIs for path limits of 4 or 6."""
    surt_key = sample[0]["surt"]
    expected = sample[0]["suburis"][:2]
    for path_limit in (4, 6):
        produced = generate_suburis(
            surt_key,
            max_host_segments=2,
            max_path_segments=path_limit,
        )
        assert produced == expected