Example No. 1
 def force_learn(self, text):
     # sanity checks
     assert (self.click_matrix.shape[0] == self.click_matrix.shape[1]), \
             "Something wrong with the dimensions of the click matrix!"
     assert (self.click_matrix.shape[0] == len(self.known_urls)), \
             "Something wrong with the number of known urls!"
     assert (len(self.spend_time) == len(self.known_urls)), \
             "Time/url mismatch: {}-{}".format(len(self.spend_time), 
                                               len(self.known_urls))
     
     info = Util.parse_log_line(text)
     if info is not None:
         if Guesser.use_derived_urls:
             # learn not just the logged urls but also urls derived from them
             all_urls = [info.url]
             all_urls.extend(Util.get_derived_urls(info.url))
             all_urls2 = [info.url2]
             all_urls2.extend(Util.get_derived_urls(info.url2))
             
             # iterate with the original urls last, passing the combined
             # position in the reversed lists as the second argument
             for idx, url in enumerate(reversed(all_urls)):
                 for idx2, url2 in enumerate(reversed(all_urls2)):
                     info.url = url
                     info.url2 = url2
                     self.force_learn_from_info(info, idx + idx2)
         else:
             self.force_learn_from_info(info)
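
Note: every snippet on this page relies on Util.parse_log_line returning either None (for unparsable lines) or an object exposing time, type, url and url2, which force_learn mutates in place. The real parser isn't shown here; the sketch below is a hypothetical stand-in that only mirrors that interface, assuming a "time,type,url,url2" comma-separated layout (the layout is our assumption, not the project's).

from dataclasses import dataclass

@dataclass
class LogInfo:
    time: float
    type: str
    url: str
    url2: str

def parse_log_line(line):
    # Hypothetical "time,type,url,url2" layout -- the project's actual
    # Util.parse_log_line may differ. Returning None for bad lines is the
    # contract the callers above depend on.
    parts = line.strip().split(",")
    if len(parts) != 4:
        return None
    time, type_, url, url2 = parts
    try:
        return LogInfo(time=float(time), type=type_, url=url, url2=url2)
    except ValueError:
        return None

A dataclass rather than a namedtuple, because force_learn assigns to info.url and info.url2.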
Example No. 2
def do_per_user_time_tests():
    filepaths = find_all_csv_names()
    
    # group files by user
    filepaths_per_user = {}
    for filepath in filepaths:
        file_id = filepath.rsplit("u", 1)[-1].split(".", 1)[0]
        user_number, _file_number = file_id.split("_")
        
        if user_number in filepaths_per_user:
            filepaths_per_user[user_number].append(filepath)
        else:
            filepaths_per_user[user_number] = [filepath]
    
    test_sets = []
    for user, files in filepaths_per_user.items():
        # sort by first log
        file_times = []
        proper_file_names = []
        removed_file_names = []
        for filename in files:
            with open(filename, 'r') as csv_file:
                info = None
                for line in csv_file:
                    info = Util.parse_log_line(line)
                    if info is not None:
                        break
                if info is not None:
                    file_times.append(info.time)
                    proper_file_names.append(filename)
                else:
                    removed_file_names.append(filename)
        if len(proper_file_names) < 3:
            logging.info("Ignored user {} because "
                         "they have too few files".format(user))
        else:
            file_times, sorted_file_paths = zip(*sorted(zip(file_times, 
                        proper_file_names), key=lambda x: x[0]))
            
            number_of_files = len(sorted_file_paths)
            limiter = int(number_of_files / 3)
            # learn on the chronologically older files,
            # test on the most recent third
            last_part = sorted_file_paths[-limiter:]
            first_part = sorted_file_paths[:-limiter]
            
            #logging.warning("Last: {}".format(last_part))
            #logging.warning("First: {}".format(first_part))
            
            test_set = {}
            test_set['test'] = last_part
            test_set['learn'] = first_part
            test_set['id'] = "time-test-for-user-{}".format(user)
            test_sets.append(test_set)
    
    total_correct_guesses, total_missed_guesses, \
    total_correct_count, total_missed_count = run_test_sets(test_sets)
    
    logging.info("-> Per-user Time tests: {} total hits, {} total misses, "
                "{} total hit count, {} total miss count"
                .format(total_correct_guesses, total_missed_guesses, 
                        total_correct_count, total_missed_count))
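
The "find the first parsable timestamp in each file, then sort" loop in the middle of this function reappears verbatim in Examples No. 4 and No. 5. If refactoring were an option, it could live in one helper; a sketch, assuming Util is in scope (the name sort_files_by_first_log is ours, not the project's):

def sort_files_by_first_log(filenames):
    # Returns (times, names, removed): files sorted by the time of their
    # first parsable log line, plus the files where no line parsed at all.
    file_times, proper, removed = [], [], []
    for filename in filenames:
        with open(filename, 'r') as csv_file:
            info = None
            for line in csv_file:
                info = Util.parse_log_line(line)
                if info is not None:
                    break
            if info is not None:
                file_times.append(info.time)
                proper.append(filename)
            else:
                removed.append(filename)
    if not file_times:
        return (), (), removed
    times, names = zip(*sorted(zip(file_times, proper), key=lambda x: x[0]))
    return times, names, removed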
Example No. 3
 def __init__(self, filepath):
     # parse every line, dropping the ones that don't parse
     with open(filepath, 'r') as lines:
         parsed_lines = [Util.parse_log_line(line) for line in lines]
     parsed_lines = [info for info in parsed_lines if info is not None]
     # keep the "load" urls, as these are the ones we'll be testing on
     self.load_urls = [info.url for info in parsed_lines
                       if info.type == "load"]
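
This constructor keeps only the urls of "load"-type entries, the ones the tests guess against. A toy usage, where the class name Tester and the file name are purely illustrative (neither appears in the snippets):

tester = Tester("logs/u3_1.csv")
print(tester.load_urls)  # urls of every "load" entry, in file order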
Example No. 4
def do_time_test():
    file_paths = find_all_csv_names()
    
    # sort by first log
    file_times = []
    proper_file_names = []
    removed_file_names = []
    for filename in file_paths:
        with open(filename, 'r') as csv_file:
            info = None
            for line in csv_file:
                info = Util.parse_log_line(line)
                if info is not None:
                    break
            if info is not None:
                file_times.append(info.time)
                proper_file_names.append(filename)
            else:
                removed_file_names.append(filename)
    file_times, sorted_file_paths = zip(*sorted(zip(file_times,
                                                    proper_file_names),
                                                    key=lambda x: x[0]))
    
    number_of_files = len(sorted_file_paths)
    # learn on the chronologically first 80% of files, test on the last 20%
    limiter = int(number_of_files / 5 * 4)
    last_part = sorted_file_paths[limiter:]
    first_part = sorted_file_paths[:limiter]
    
    test_set = {}
    test_set['test'] = last_part
    test_set['learn'] = first_part
    test_set['id'] = "time-test"
    
    total_correct_guesses, total_missed_guesses, \
    total_correct_count, total_missed_count = run_test_set(test_set)
    
    logging.info("-> Time tests: {} total hits, {} total misses, "
                "{} total hit count, {} total miss count"
                .format(total_correct_guesses, total_missed_guesses, 
                        total_correct_count, total_missed_count))
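
Both time tests share the same recipe: sort files chronologically by their first log entry, learn on the older slice, test on the newer one. The split itself generalises to one line per part; the helper below is our parameterisation (test_fraction is not a project concept), mirroring this function's 4/5 cut:

def chronological_split(sorted_file_paths, test_fraction=0.2):
    # First (1 - test_fraction) of the files for learning,
    # most recent test_fraction for testing.
    limiter = int(len(sorted_file_paths) * (1 - test_fraction))
    return sorted_file_paths[:limiter], sorted_file_paths[limiter:]

With the default fraction, learn, test = chronological_split(sorted_file_paths) reproduces first_part and last_part above.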
Example No. 5
 def learn_from_files(self, filenames):
     file_times = []
     proper_file_names = []
     removed_file_names = []
     for filename in filenames:
         with open(filename, 'r') as csv_file:
             info = None
             for line in csv_file:
                 info = Util.parse_log_line(line)
                 if info is not None:
                     break
             if info is not None:
                 file_times.append(info.time)
                 proper_file_names.append(filename)
             else:
                 removed_file_names.append(filename)
     
     # guard: if no file had a parsable line, the zip below would be empty
     # and the unpacking would raise ValueError
     if not file_times:
         logging.debug("No parsable lines in any of: {}".format(filenames))
         return
     
     file_times, proper_file_names = zip(*sorted(zip(file_times, 
                                                     proper_file_names), 
                                                     key=lambda x: x[0]))
     
     logging.debug(
         "Removed files (empty or unparsable): {}".format(removed_file_names))
     for filetime, filename in zip(file_times, proper_file_names):
         with open(filename, 'r') as csv_file:
             # Incrementally train your model based on these files
             logging.debug(
                 'Processing ({}) -> {}'.format(filetime, filename))
             for line in csv_file:
                 self.force_learn(line)
     logging.debug('Learned info:')
     #logging.debug('urls (first 100): {}..'.format(self.known_urls[0:100]))
     #logging.debug('matrix:\n{}'.format(self.click_matrix))
     #logging.debug('times (first 100): {}'.format(self.spend_time[0:100]))
     logging.debug('size: {}'.format(
         sum(x is not None for x in self.known_urls)))
     self.calculate_guesses_click_matrix()
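
Taken together, training amounts to: find the csv files, learn them oldest-first, then build the guess structures (learn_from_files ends by calling calculate_guesses_click_matrix). A hedged driver, where the Guesser constructor arguments are assumed since no snippet shows them:

import logging

logging.basicConfig(level=logging.DEBUG)

guesser = Guesser()  # constructor signature assumed, not shown above
guesser.learn_from_files(find_all_csv_names())
# At this point click_matrix, known_urls and spend_time are populated
# and the guesses have been precomputed.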