def force_learn(self, text):
    """Parse a single raw log line and train the model on it.

    Args:
        text: one raw line from a CSV log file.  Lines that do not
            parse into log info are silently ignored.
    """
    # Internal-consistency checks on the model state.
    # NOTE(review): `assert` is stripped under `python -O`; raise
    # explicitly if these checks must survive optimized runs.
    assert (self.click_matrix.shape[0] == self.click_matrix.shape[1]), \
        "Something wrong with the dimensions of the click matrix!"
    assert (self.click_matrix.shape[0] == len(self.known_urls)), \
        "Something wrong with the number of known urls!"
    assert (len(self.spend_time) == len(self.known_urls)), \
        "Time/url mismatch: {}-{}".format(len(self.spend_time),
                                          len(self.known_urls))

    info = Util.parse_log_line(text)
    if info is None:
        # Unparseable line: nothing to learn from.
        return

    if Guesser.use_derived_urls:
        # Learn from every combination of the original urls and their
        # derived urls, not just the (url, url2) pair itself.
        all_urls = [info.url]
        all_urls.extend(Util.get_derived_urls(info.url))
        all_urls2 = [info.url2]
        all_urls2.extend(Util.get_derived_urls(info.url2))
        # reversed() gives the original url the highest index, so
        # idx + idx2 acts as a derivation-distance weight handed to
        # force_learn_from_info.
        for idx, url in enumerate(reversed(all_urls)):
            for idx2, url2 in enumerate(reversed(all_urls2)):
                info.url = url
                info.url2 = url2
                self.force_learn_from_info(info, idx + idx2)
    else:
        self.force_learn_from_info(info)
def do_per_user_time_tests():
    """Run one time-ordered train/test split per user.

    CSV file names encode ids as "u<user>_<file>"; files are grouped
    per user and ordered by the timestamp of their first parseable log
    line.  Users with fewer than three usable files are skipped.
    Aggregated results are logged via ``run_test_sets``.
    """
    filepaths = find_all_csv_names()

    # Group files with their user; ids look like "u<user>_<file>".
    filepaths_per_user = {}
    for filepath in filepaths:
        file_id = filepath.rsplit("u", 1)[-1].split(".", 1)[0]
        user_number, _file_number = file_id.split("_")
        filepaths_per_user.setdefault(user_number, []).append(filepath)

    test_sets = []
    for user, files in filepaths_per_user.items():
        # Timestamp each file by its first parseable line; files with
        # no parseable line are dropped.
        file_times = []
        proper_file_names = []
        removed_file_names = []
        for filename in files:
            with open(filename, 'r') as csv_file:
                info = None
                for line in csv_file:
                    info = Util.parse_log_line(line)
                    if info is not None:
                        break
            if info is not None:
                file_times.append(info.time)
                proper_file_names.append(filename)
            else:
                removed_file_names.append(filename)

        if len(proper_file_names) < 3:
            logging.info("Ignored user {} because "
                         "he has too little files".format(user))
            continue

        file_times, sorted_file_paths = zip(
            *sorted(zip(file_times, proper_file_names),
                    key=lambda x: x[0]))
        limiter = int(len(sorted_file_paths) / 3)
        # NOTE(review): the original names here were inverted -- the
        # slice [:limiter] is the chronologically *earliest* third and
        # it is used as the TEST set, with the later files as the
        # learn set.  That is the opposite of do_time_test(); confirm
        # the asymmetry is intentional.  Behavior is preserved.
        earliest_third = sorted_file_paths[:limiter]
        remainder = sorted_file_paths[limiter:]
        test_set = {}
        test_set['test'] = earliest_third
        test_set['learn'] = remainder
        test_set['id'] = "time-test-for-user-{}".format(user)
        test_sets.append(test_set)

    total_correct_guesses, total_missed_guesses, \
        total_correct_count, total_missed_count = run_test_sets(test_sets)
    logging.info("-> Per-user Time tests: {} total hits, {} total misses, "
                 "{} total hit count, {} total miss count"
                 .format(total_correct_guesses, total_missed_guesses,
                         total_correct_count, total_missed_count))
def __init__(self, filepath):
    """Read a CSV log file and keep the urls of its "load" events.

    Args:
        filepath: path of the CSV log file to read.
    """
    # (Dead `parsed_lines = []` initializer removed -- it was
    # immediately overwritten by the comprehension below.)
    with open(filepath, 'r') as lines:
        parsed_lines = [Util.parse_log_line(line) for line in lines]
    parsed_lines = [info for info in parsed_lines if info is not None]
    # get load urls as these are the ones we'll be testing on
    self.load_urls = [info.url for info in parsed_lines
                      if info.type == "load"]
def do_time_test():
    """Single time-ordered split over all CSV log files.

    Learns on the chronologically oldest 80% of the files and tests on
    the newest 20%, ordering files by the timestamp of their first
    parseable log line.  Results are logged via ``run_test_set``.
    """
    file_paths = find_all_csv_names()

    # Pair each file with the timestamp of its first parseable line;
    # files with no parseable line are dropped.
    timed_files = []
    removed_file_names = []
    for filename in file_paths:
        with open(filename, 'r') as csv_file:
            info = None
            for line in csv_file:
                info = Util.parse_log_line(line)
                if info is not None:
                    break
        if info is not None:
            timed_files.append((info.time, filename))
        else:
            removed_file_names.append(filename)

    # Fix: the original zip(*sorted(zip(...))) raised ValueError when
    # no file had a parseable line; sorting a list of pairs handles
    # the empty case gracefully.
    timed_files.sort(key=lambda pair: pair[0])
    sorted_file_paths = tuple(path for _, path in timed_files)

    limiter = int(len(sorted_file_paths) / 5 * 4)
    first_part = sorted_file_paths[:limiter]   # oldest 80% -> learn
    last_part = sorted_file_paths[limiter:]    # newest 20% -> test

    test_set = {}
    test_set['test'] = last_part
    test_set['learn'] = first_part
    test_set['id'] = "time-test"

    total_correct_guesses, total_missed_guesses, \
        total_correct_count, total_missed_count = run_test_set(test_set)
    logging.info("-> Time tests: {} total hits, {} total misses, "
                 "{} total hit count, {} total miss count"
                 .format(total_correct_guesses, total_missed_guesses,
                         total_correct_count, total_missed_count))
def learn_from_files(self, filenames):
    """Incrementally train the model on a set of CSV log files.

    Files are processed in chronological order of their first
    parseable log line; files without any parseable line are skipped.
    Finishes by recomputing the guesses from the click matrix.

    Args:
        filenames: iterable of CSV log file paths.
    """
    # Timestamp each file by its first parseable line.
    file_times = []
    proper_file_names = []
    removed_file_names = []
    for filename in filenames:
        with open(filename, 'r') as csv_file:
            info = None
            for line in csv_file:
                info = Util.parse_log_line(line)
                if info is not None:
                    break
        if info is not None:
            file_times.append(info.time)
            proper_file_names.append(filename)
        else:
            removed_file_names.append(filename)

    logging.debug(
        "Removed files (empty or crap): {}".format(removed_file_names))

    # Fix: zip(*sorted(zip([], []))) raised ValueError when every file
    # was unparseable; guard the empty case and iterate sorted pairs
    # directly instead of indexing two parallel tuples.
    if proper_file_names:
        ordered = sorted(zip(file_times, proper_file_names),
                         key=lambda x: x[0])
        for filetime, filename in ordered:
            with open(filename, 'r') as csv_file:
                # Incrementally train the model on this file's lines.
                logging.debug(
                    'Processing ({}) -> {}'.format(filetime, filename))
                for line in csv_file:
                    self.force_learn(line)

    logging.debug('Learned info:')
    logging.debug('size: {}'.format(
        sum(x is not None for x in self.known_urls)))
    self.calculate_guesses_click_matrix()