def force_learn(self, text):
    """Learn from a single log line.

    First asserts that the internal bookkeeping is consistent (square
    click matrix, one row and one spend-time entry per known url), then
    parses ``text`` with ``Util.parse_log_line`` and hands the parsed info
    to ``force_learn_from_info``.

    When ``Guesser.use_derived_urls`` is truthy, every combination of
    (``info.url`` or one of its derived urls) with (``info.url2`` or one of
    its derived urls) is learned. ``info`` is mutated in place for each
    combination, and the sum of the two positions in the reversed url lists
    is passed as the second argument to ``force_learn_from_info``.

    text: the raw log line to learn from. Lines for which
          ``Util.parse_log_line`` returns None are ignored.
    """
    # Internal invariants; a failure here means the learner state is corrupt.
    assert (self.click_matrix.shape[0] == self.click_matrix.shape[1]), \
        "Something wrong with the dimentions of the click matrix!"
    assert (self.click_matrix.shape[0] == len(self.known_urls)), \
        "Something wrong with the number of known urls!"
    assert (len(self.spend_time) == len(self.known_urls)), \
        "Time/url mismatch: {}-{}".format(len(self.spend_time),
                                          len(self.known_urls))

    info = Util.parse_log_line(text)
    if info is None:
        # Nothing usable on this line.
        return

    if not Guesser.use_derived_urls:
        self.force_learn_from_info(info)
        return

    all_urls = [info.url]
    all_urls.extend(Util.get_derived_urls(info.url))
    all_urls2 = [info.url2]
    all_urls2.extend(Util.get_derived_urls(info.url2))

    # The lists are walked in reverse, so the original url comes last and
    # therefore gets the highest index of its list.
    for idx, url in enumerate(reversed(all_urls)):
        for idx2, url2 in enumerate(reversed(all_urls2)):
            info.url = url
            info.url2 = url2
            self.force_learn_from_info(info, idx + idx2)
def number_of_urls_for_guesses(self, guesses, guessing_for_url, guessing_index = -1):
    '''
    Count how many of the guesses occur among the loaded urls.

    guesses: a list of guesses
    guessing_for_url: the url we're guessing for (currently not used by
                      the computation)
    guessing_index: the index of the url, if it's one from this log file,
                    otherwise -1

    Returns the number of entries in ``guesses`` that are present in the
    candidate urls. When ``guessing_index`` is a valid index (>= 0), only
    the urls after that position are candidates. When
    ``TesterLogFile.use_derivatives`` is truthy, the derived urls of every
    candidate count as candidates too.
    '''
    other_urls = self.load_urls.copy()
    # Index 0 is a valid position; only negative values mean
    # "not from this log file".
    if guessing_index >= 0:
        other_urls = other_urls[guessing_index + 1:]

    if TesterLogFile.use_derivatives:
        # Expand every url into itself followed by its derived urls.
        other_urls = [candidate
                      for url in other_urls
                      for candidate in [url] + Util.get_derived_urls(url)]

    # List membership (not a set) is kept on purpose: it relies on equality
    # only, so it also works if the guesses are not hashable.
    return sum(1 for guess in guesses if guess in other_urls)
def get_guesses(self, url):
    """Return the best guesses for ``url``.

    The row of ``self.guesses_click_matrix`` belonging to the cleaned url
    supplies the base weights. If ``Guesser.use_derived_urls`` is truthy,
    the rows of the derived urls are added, each scaled by
    ``Guesser.devied_guess_falloff ** idx`` (idx starting at 1). Every
    weight is then multiplied by ``self.make_time_robust`` of the matching
    ``self.spend_time`` entry.

    url: the url to guess for; it is passed through ``Util.clean_url``
         first.

    Returns a list of ``[url, weight]`` pairs sorted by descending weight,
    limited to ``Guesser.max_number_of_guesses`` candidates and to strictly
    positive weights. If nothing qualifies, returns
    ``[["Can't guess :(", 0]]``.
    """
    url = Util.clean_url(url)

    # Build the guesses click matrix on first use.
    # NOTE(review): presumably calculate_guesses_click_matrix() populates
    # self.guesses_click_matrix - confirm.
    if self.guesses_click_matrix is None:
        self.calculate_guesses_click_matrix()

    # Take the row of the current url.
    index = self.get_index(url)
    unordered_weights = self.guesses_click_matrix[index, :].getA1()

    if Guesser.use_derived_urls:
        for idx, derived_url in enumerate(Util.get_derived_urls(url),
                                          start=1):
            der_index = self.get_index(derived_url)
            der_weights = self.guesses_click_matrix[der_index, :].getA1()
            falloff = Guesser.devied_guess_falloff ** idx
            unordered_weights = [w + dw * falloff
                                 for w, dw in zip(unordered_weights,
                                                  der_weights)]

    # Add time knowledge.
    unordered_weights = [w * self.make_time_robust(t)
                         for w, t in zip(unordered_weights, self.spend_time)]

    # Sort on the weight only, so that ties never fall back to comparing
    # urls; the pairs are kept together so that an empty list of known urls
    # does not break tuple unpacking.
    ranked = sorted(zip(unordered_weights, self.known_urls),
                    reverse=True, key=lambda pair: pair[0])

    # Debug info.
    logging.debug("Guessed for ({}) {}".format(index, url))

    # max(..., 0) keeps a negative limit equivalent to "no candidates".
    url_limit = max(min(Guesser.max_number_of_guesses, len(ranked)), 0)
    result = [[known_url, weight]
              for weight, known_url in ranked[:url_limit]
              if weight > 0]

    if not result:
        result = [["Can't guess :(", 0]]
    return result