def test_successful_login_reset(self):
    """Verify that a successful login arriving after failed attempts
    clears FindBlockedIPs's failure counter, so nothing gets blocked."""
    out_path = self.tmpdir.join("reset_output.txt")
    parsed_logs = gen_test_data(
        os.path.dirname(__file__) + '/../test/feature4_reset.txt')
    analyzer = FindBlockedIPs(parsed_logs, str(out_path))
    analyzer.parse()
    # The counter was reset, so no request ever crossed the block threshold.
    assert not analyzer.blocked_logs
def test_additional_unsuccessful_logins_are_blocked(self):
    """Verify that FindBlockedIPs starts blocking the moment the failed-
    request threshold is reached and keeps blocking subsequent failures."""
    out_path = self.tmpdir.join("immediate_output.txt")
    parsed_logs = gen_test_data(
        os.path.dirname(__file__) + '/../test/feature4_immediate.txt')
    analyzer = FindBlockedIPs(parsed_logs, str(out_path))
    analyzer.parse()
    # Exactly the two requests after the threshold must be blocked.
    assert len(analyzer.blocked_logs) == 2
def test_k_larger_than_data(self):
    """Verify that FindMostActive copes gracefully when asked for more
    top hosts (k) than there are unique hosts in the input."""
    out_path = self.tmpdir.join("large_k_output.txt")
    parsed_logs = gen_test_data(
        os.path.dirname(__file__) + '/../test/feature1_grouping.txt')
    analyzer = FindMostActive(parsed_logs, str(out_path), 10)
    analyzer.parse()
    # Only the three hosts that actually exist should be emitted.
    assert out_path.read() == 'google.com,3\nbing.com,2\naskjeeves.com,1\n'
def test_k_larger_than_data(self):
    """Verify that FindMostIntensiveResources copes gracefully when asked
    for more top resources (k) than there are unique resources."""
    out_path = self.tmpdir.join("large_k_output.txt")
    parsed_logs = gen_test_data(
        os.path.dirname(__file__) + '/../test/feature2_grouping.txt')
    analyzer = FindMostIntensiveResources(parsed_logs, str(out_path), 10)
    analyzer.parse()
    # Only the three resources that actually exist should be emitted.
    assert out_path.read() == '/\n/coolstuff.gif\n/lamestuff.gif\n'
def test_single_entry(self):
    """Verify that, for a one-entry log, FindHighestTrafficWindows reports
    the busiest window as starting at that single entry."""
    out_path = self.tmpdir.join("single_entry_output.txt")
    parsed_logs = gen_test_data(
        os.path.dirname(__file__) + '/../test/feature3_single_entry.txt')
    analyzer = FindHighestTrafficWindows(parsed_logs, str(out_path))
    analyzer.parse()
    assert out_path.read() == "01/Jul/1995:00:00:01 -0400,1\n"
def test_timezone(self):
    """Verify that FindHighestTrafficWindows picks up the timezone offset
    from the parsed input rather than assuming one."""
    out_path = self.tmpdir.join("single_entry_output.txt")
    parsed_logs = gen_test_data(
        os.path.dirname(__file__) + '/../test/feature3_single_entry.txt')
    analyzer = FindHighestTrafficWindows(parsed_logs, str(out_path))
    analyzer.parse()
    assert analyzer.timezone == "-0400"
def test_failed_logins_depend_on_host(self):
    """Verify that failed logins spread across different hosts are counted
    per host, so mixed-host failures never trigger a block."""
    out_path = self.tmpdir.join("multiple_hosts_output.txt")
    parsed_logs = gen_test_data(
        os.path.dirname(__file__) + '/../test/feature4_multiple_hosts.txt')
    analyzer = FindBlockedIPs(parsed_logs, str(out_path))
    analyzer.parse()
    # No single host reached the threshold, so nothing is blocked.
    assert not analyzer.blocked_logs
def test_dynamic_window(self):
    """Verify that the sliding-window size of FindHighestTrafficWindows is
    configurable via minutes_per_bucket."""
    out_path = self.tmpdir.join("boundary_output.txt")
    parsed_logs = gen_test_data(
        os.path.dirname(__file__) + '/../test/feature3_boundary.txt')
    analyzer = FindHighestTrafficWindows(parsed_logs,
                                         str(out_path),
                                         k=2,
                                         minutes_per_bucket=120)
    analyzer.parse()
    # A 120-minute window captures more events per bucket than the default.
    expected = ("01/Jul/1995:00:00:03 -0400,3\n"
                "01/Jul/1995:00:00:04 -0400,2\n")
    assert out_path.read() == expected
def test_breaking_ties(self):
    """Verify lexicographic tie-breaking in FindMostActive: bing.com and
    google.com both have three hits, so bing.com (b < g) is listed first."""
    out_path = self.tmpdir.join("ties_output.txt")
    parsed_logs = gen_test_data(
        os.path.dirname(__file__) + '/../test/feature1_tie.txt')
    analyzer = FindMostActive(parsed_logs, str(out_path), 3)
    analyzer.parse()
    # Both hosts carry the same hit count, forcing the alphabetical rule.
    assert analyzer.hosts_to_hits["google.com"] == 3
    assert analyzer.hosts_to_hits["bing.com"] == 3
    assert out_path.read() == 'bing.com,3\ngoogle.com,3\n'
def test_breaking_ties(self):
    """Verify lexicographic tie-breaking in FindMostIntensiveResources:
    /coolstuff.gif and /lamestuff.gif both total 2048 bytes, so
    /coolstuff.gif (c < l) is listed first."""
    out_path = self.tmpdir.join("ties_output.txt")
    parsed_logs = gen_test_data(
        os.path.dirname(__file__) + '/../test/feature2_tie.txt')
    analyzer = FindMostIntensiveResources(parsed_logs, str(out_path), 3)
    analyzer.parse()
    # Equal bandwidth totals force the alphabetical ordering rule.
    assert analyzer.resources_to_bandwidth["/coolstuff.gif"] == 2048
    assert analyzer.resources_to_bandwidth["/lamestuff.gif"] == 2048
    assert out_path.read() == '/coolstuff.gif\n/lamestuff.gif\n'
def test_in_between_values(self):
    """Verify that FindHighestTrafficWindows considers window start times
    that fall between logged events, not just the event timestamps."""
    out_path = self.tmpdir.join("multi_entry_output.txt")
    parsed_logs = gen_test_data(
        os.path.dirname(__file__) + '/../test/feature3_multi_entry.txt')
    analyzer = FindHighestTrafficWindows(parsed_logs, str(out_path))
    analyzer.parse()
    expected = ("01/Jul/1995:00:00:01 -0400,2\n"
                "01/Jul/1995:00:00:02 -0400,1\n"
                "01/Jul/1995:00:00:03 -0400,1\n"
                "01/Jul/1995:00:00:04 -0400,1\n")
    assert out_path.read() == expected
def test_handles_gaps(self):
    """Verify that GetHostActivityLog writes a 0 for bins with no traffic
    instead of silently skipping them."""
    out_path = self.tmpdir.join("gaps_output.txt")
    parsed_logs = gen_test_data(
        os.path.dirname(__file__) + '/../test/feature5_gaps.txt')
    analyzer = GetHostActivityLog(parsed_logs,
                                  str(out_path),
                                  host_to_search="199.72.81.55",
                                  minutes_per_bin=5)
    analyzer.parse()
    # The fourth 5-minute bin is empty and must appear with a count of 0.
    expected = ("01/Jul/1995:00:00:01,6\n"
                "01/Jul/1995:00:05:01,1\n"
                "01/Jul/1995:00:10:01,5\n"
                "01/Jul/1995:00:15:01,0\n"
                "01/Jul/1995:00:20:01,1\n")
    assert out_path.read() == expected
def test_grouping(self):
    """Verify that FindMostActive groups 6 server logs from 3 unique hosts
    into per-host hit counts and writes them in descending order."""
    out_path = self.tmpdir.join("grouping_output.txt")
    parsed_logs = gen_test_data(
        os.path.dirname(__file__) + '/../test/feature1_grouping.txt')
    analyzer = FindMostActive(parsed_logs, str(out_path), 3)
    analyzer.parse()
    # Internal tallies first, then the rendered output file.
    assert analyzer.hosts_to_hits["google.com"] == 3
    assert analyzer.hosts_to_hits["bing.com"] == 2
    assert analyzer.hosts_to_hits["askjeeves.com"] == 1
    assert out_path.read() == 'google.com,3\nbing.com,2\naskjeeves.com,1\n'
def test_block_window_minutes(self):
    """Verify that the duration of the block period is configurable via
    the block_minutes keyword."""
    out_path = self.tmpdir.join("configure_block_output.txt")
    parsed_logs = gen_test_data(
        os.path.dirname(__file__) + '/../test/feature4_configure_block.txt')
    analyzer = FindBlockedIPs(parsed_logs,
                              str(out_path),
                              block_minutes=30)
    analyzer.parse()
    # With a 30-minute block, two requests fall inside the window.
    assert len(analyzer.blocked_logs) == 2
def test_failed_attempts_counter(self):
    """Verify that the number of failed attempts required to trigger a
    block is configurable via the failed_attempts keyword."""
    out_path = self.tmpdir.join("configure_fail_output.txt")
    parsed_logs = gen_test_data(
        os.path.dirname(__file__) + '/../test/feature4_configure_fail.txt')
    analyzer = FindBlockedIPs(parsed_logs,
                              str(out_path),
                              failed_attempts=2)
    analyzer.parse()
    # A lower threshold of 2 failures blocks four requests in this data.
    assert len(analyzer.blocked_logs) == 4
def test_window_boundary(self):
    """Probe the sliding-window edges for off-by-one errors: two events
    exactly 60 minutes apart share a window; 60 minutes and one second
    apart do not; same clock time one day apart do not."""
    out_path = self.tmpdir.join("boundary_output.txt")
    parsed_logs = gen_test_data(
        os.path.dirname(__file__) + '/../test/feature3_boundary.txt')
    analyzer = FindHighestTrafficWindows(parsed_logs, str(out_path), k=2)
    analyzer.parse()
    expected = ("01/Jul/1995:00:00:03 -0400,2\n"
                "01/Jul/1995:00:00:04 -0400,2\n")
    assert out_path.read() == expected
def test_window_seconds(self):
    """Verify that the span over which failed logins are counted is
    configurable via the window_seconds keyword."""
    out_path = self.tmpdir.join("configure_window_output.txt")
    parsed_logs = gen_test_data(
        os.path.dirname(__file__) + '/../test/feature4_configure_window.txt')
    analyzer = FindBlockedIPs(parsed_logs,
                              str(out_path),
                              window_seconds=30)
    analyzer.parse()
    # A shorter 30-second window yields exactly one blocked request.
    assert len(analyzer.blocked_logs) == 1
def test_limited_to_k(self):
    """Verify that when more than k traffic windows exist, only the top k
    are written to the output file."""
    out_path = self.tmpdir.join("multi_entry_output.txt")
    parsed_logs = gen_test_data(
        os.path.dirname(__file__) + '/../test/feature3_multi_entry.txt')
    analyzer = FindHighestTrafficWindows(parsed_logs, str(out_path), k=2)
    analyzer.parse()
    expected = ("01/Jul/1995:00:00:01 -0400,2\n"
                "01/Jul/1995:00:00:04 -0400,1\n")
    assert out_path.read() == expected
def test_grouping(self):
    """Verify that FindMostIntensiveResources groups 6 server logs from 3
    unique resources into per-resource bandwidth totals."""
    out_path = self.tmpdir.join("grouping_output.txt")
    parsed_logs = gen_test_data(
        os.path.dirname(__file__) + '/../test/feature2_grouping.txt')
    analyzer = FindMostIntensiveResources(parsed_logs, str(out_path), 3)
    analyzer.parse()
    # Internal tallies first, then the rendered output file.
    assert analyzer.resources_to_bandwidth["/"] == 2048
    assert analyzer.resources_to_bandwidth["/coolstuff.gif"] == 256
    assert analyzer.resources_to_bandwidth["/lamestuff.gif"] == 112
    assert out_path.read() == '/\n/coolstuff.gif\n/lamestuff.gif\n'
def test_correct_assignment(self):
    """Verify that GetHostActivityLog bins traffic for the searched host
    correctly while ignoring traffic from every other host."""
    out_path = self.tmpdir.join("assignment_output.txt")
    parsed_logs = gen_test_data(
        os.path.dirname(__file__) + '/../test/feature5_assignment.txt')
    analyzer = GetHostActivityLog(parsed_logs,
                                  str(out_path),
                                  host_to_search="199.72.81.55",
                                  minutes_per_bin=5)
    analyzer.parse()
    expected = ("01/Jul/1995:00:00:01,6\n"
                "01/Jul/1995:00:05:01,1\n"
                "01/Jul/1995:00:10:01,5\n")
    assert out_path.read() == expected
def test_failed_requests_are_reconsidered(self):
    """Verify that a failed login which did not itself trigger a block is
    still considered as the possible start of a later failure window."""
    out_path = self.tmpdir.join("reconsider_output.txt")
    parsed_logs = gen_test_data(
        os.path.dirname(__file__) + '/../test/feature4_fail_reconsider.txt')
    analyzer = FindBlockedIPs(parsed_logs, str(out_path))
    analyzer.parse()
    assert len(analyzer.blocked_logs) == 1
    # The blocked request is the one at 00:04:05, proving the earlier
    # failure was re-used as the window's starting point.
    blocked_at = datetime.strptime("01/Jul/1995:00:04:05",
                                   '%d/%b/%Y:%H:%M:%S')
    assert analyzer.blocked_logs[0].timestamp == blocked_at
def build_and_train(): model = build_model() # generate training and test dataset training_set = gen_training_data.gen_training_data() test_set = gen_test_data.gen_test_data() # to view the training class indices # training_set.class_indices # TRAINING THE CLASSIFIER model.fit_generator(training_set, steps_per_epoch=4000, epochs=15, validation_data=test_set, validation_steps=1000) model.save('classifier.hd5')
def test_input_data_sort(self):
    """Verify that FindBlockedIPs sorts its input server logs first by
    host (lexicographically) and then by time within each host."""
    out_path = self.tmpdir.join("sort_output.txt")
    parsed_logs = gen_test_data(
        os.path.dirname(__file__) + '/../test/feature4_sort.txt')
    analyzer = FindBlockedIPs(parsed_logs, str(out_path))
    analyzer.parse()
    # Hosts come out in lexicographic order: bing.com before google.com.
    expected_hosts = ["bing.com", "bing.com", "google.com", "google.com"]
    assert [entry.host for entry in analyzer.server_log] == expected_hosts
    # Within each host, entries ascend in time; the byte counts are unique
    # per entry in this fixture, so they identify the chronological order.
    expected_bytes = [56, 1024, 256, 1024]
    assert [entry.bytes for entry in analyzer.server_log] == expected_bytes
def test_blocked_requests_not_reconsidered(self):
    """Verify that three failed logins falling inside an already-active
    block window do not open a fresh block window of their own."""
    out_path = self.tmpdir.join("not_reconsider_output.txt")
    parsed_logs = gen_test_data(
        os.path.dirname(__file__) +
        '/../test/feature4_block_not_reconsider.txt')
    analyzer = FindBlockedIPs(parsed_logs, str(out_path))
    analyzer.parse()
    assert len(analyzer.blocked_logs) == 3
    # The final request is NOT blocked: it lies outside the window opened
    # by the first three failures, and the failures that were themselves
    # blocked must not have started a second window covering it.
    last_blocked_at = datetime.strptime("01/Jul/1995:00:04:05",
                                        '%d/%b/%Y:%H:%M:%S')
    assert analyzer.blocked_logs[2].timestamp == last_blocked_at