Ejemplo n.º 1
0
 def force_learn(self, text):
     """Parse one log line and learn from the result.

     text: a raw log line, handed to Util.parse_log_line.

     Nothing is learned when the line cannot be parsed (the parser
     returned None). When Guesser.use_derived_urls is enabled, every
     combination of (url or one of its derived urls) with (url2 or one
     of its derived urls) is passed to force_learn_from_info; otherwise
     only the parsed info itself is passed.
     """
     # Consistency checks on the internal bookkeeping before learning.
     assert (self.click_matrix.shape[0] == self.click_matrix.shape[1]), \
             "Something wrong with the dimentions of the click matrix!"
     assert (self.click_matrix.shape[0] == len(self.known_urls)), \
             "Something wrong with the number of known urls!"
     assert (len(self.spend_time) == len(self.known_urls)), \
             "Time/url mismatch: {}-{}".format(len(self.spend_time), 
                                               len(self.known_urls))

     info = Util.parse_log_line(text)
     # Identity check: None is a singleton, and `!=` could be hijacked by
     # a custom __ne__ on the parsed object.
     if info is None:
         return

     if not Guesser.use_derived_urls:
         self.force_learn_from_info(info)
         return

     all_urls = [info.url]
     all_urls.extend(Util.get_derived_urls(info.url))
     all_urls2 = [info.url2]
     all_urls2.extend(Util.get_derived_urls(info.url2))

     # Both lists are walked back to front, so the original (url, url2)
     # pair comes last and receives the largest idx + idx2 value.
     # NOTE(review): the meaning of that second argument is defined by
     # force_learn_from_info, which is not visible here.
     for idx, url in enumerate(reversed(all_urls)):
         for idx2, url2 in enumerate(reversed(all_urls2)):
             # The same info object is reused and overwritten each round.
             info.url = url
             info.url2 = url2
             self.force_learn_from_info(info, idx + idx2)
Ejemplo n.º 2
0
 def number_of_urls_for_guesses(self, guesses, guessing_for_url, 
                                guessing_index = -1):
     """Count how many guesses occur among this log's urls.

     guesses: a list of guesses
     guessing_for_url: the url we're guessing for (not used in the
         computation itself)
     guessing_index: the index of the url, if it's one from this log
         file, otherwise -1

     Returns the number of entries of `guesses` that are found in the
     candidate urls. The candidates are self.load_urls, restricted to
     the entries after `guessing_index` when an index is given, and
     extended with derived urls when TesterLogFile.use_derivatives is
     set. A guess that appears several times in `guesses` is counted
     each time.
     """
     other_urls = self.load_urls
     # Index 0 is a valid position; only negative values (the -1
     # default) mean "not from this log file".
     if guessing_index >= 0:
         other_urls = other_urls[guessing_index + 1:]

     if TesterLogFile.use_derivatives:
         # Each url is followed directly by its derived urls.
         expanded = []
         for url in other_urls:
             expanded.append(url)
             expanded.extend(Util.get_derived_urls(url))
         other_urls = expanded

     # Size of the intersection, counted over the guesses.
     return sum(1 for guess in guesses if guess in other_urls)
Ejemplo n.º 3
0
    def get_guesses(self, url):
        """Return the highest-weighted known urls for `url`.

        url: the url to guess for; it is normalised with Util.clean_url.

        Returns a list of [url, weight] pairs ordered by descending
        weight, holding at most Guesser.max_number_of_guesses entries
        and only entries whose weight is positive. When no entry
        qualifies, the single placeholder pair ["Can't guess :(", 0] is
        returned instead.
        """
        url = Util.clean_url(url)

        # Build self.guesses_click_matrix on first use.
        if self.guesses_click_matrix is None:
            self.calculate_guesses_click_matrix()

        # Take the matrix row that belongs to the current url.
        index = self.get_index(url)
        unordered_weights = self.guesses_click_matrix[index,:].getA1()
        if Guesser.use_derived_urls:
            # Add the rows of the derived urls, scaled by the falloff
            # raised to the position of the derived url (starting at 1).
            for idx, derived_url in \
                    enumerate(Util.get_derived_urls(url), start=1):
                der_index = self.get_index(derived_url)
                der_weights = self.guesses_click_matrix[der_index,:].getA1()
                falloff = Guesser.devied_guess_falloff ** idx
                unordered_weights = [w + dw * falloff for w, dw in
                                     zip(unordered_weights, der_weights)]

        # Add time knowledge.
        unordered_weights = [w * self.make_time_robust(t) for w, t in
                             zip(unordered_weights, self.spend_time)]

        # Sort on the weight only, so ties keep their relative order and
        # urls are never compared with each other. Keeping the pairs
        # together (instead of unzipping them) also works when there is
        # nothing to rank.
        ranked = sorted(zip(unordered_weights, self.known_urls),
                        reverse=True, key=lambda pair: pair[0])

        # Debug info.
        logging.debug("Guessed for ({}) {}".format(index, url))

        # A non-positive limit selects nothing.
        limit = max(Guesser.max_number_of_guesses, 0)
        result = [[known_url, weight] for weight, known_url in ranked[:limit]
                  if weight > 0]

        if not result:
            result = [["Can't guess :(", 0]]

        return result