def test():
    fetcher = Fetcher("test/fetcher_test_data")
    urls = URLUtility.load_urls("test/data/urls.txt")
    sites = fetcher.fetch(urls)
    for site in sites:
        for page in site:
            print page.get_text('body')[:100].replace("\n", "")
def __init__(self, seed_file, result_file, data_dir):
    """
    Args:
        seed_file: contains the list of seed urls
        data_dir: stores crawled data
        result_file: stores urls and their scores
    """
    self.train_urls = URLUtility.load_urls(seed_file)
    # Note: Fetcher contains Bing Search but does not use it (just for website ranking evaluation)
    self.fetcher = Fetcher(data_dir, None, False)
    self.result_file = result_file
    self.ranked_result_file = result_file + ".rank"
    self.searcher = Search_APIs(data_dir, self.fetcher)
def search_site(url_file, out_file, keyword):
    """
    Write results as json line objects into out_file
    Format of each json object:
        list<str>: list of urls. The first url is the main site
    """
    urls = URLUtility.load_urls(url_file)
    site2urls = read_json(out_file)
    k = 10  # number of search results to request per site
    out = open(out_file, "a+")
    for i, url in enumerate(urls):
        site = URLUtility.get_host(url)
        if site not in site2urls:
            results = bing_search.search_site(keyword, url, k)
            results = [site, url] + results
            json.dump(results, out)
            out.write("\n")
    out.close()
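# A minimal sketch (an assumption, not necessarily the project's actual helper) of the
# read_json function that search_site relies on: it loads the existing JSON-lines output
# file into a dict keyed by site, so that re-running the script skips sites that were
# already searched.
import json
import os

def read_json(out_file):
    site2urls = {}
    if not os.path.exists(out_file):
        return site2urls
    with open(out_file) as f:
        for line in f:
            record = json.loads(line)  # record: [site, url, result_1, result_2, ...]
            site2urls[record[0]] = record[1:]
    return site2urls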
                page = Page(url)
                if len(res.text) < self.max_html_size:
                    page.add_html(res.text)
                    if extraction:
                        jspage = page.get_json_obj()
                    else:
                        jspage = {'url': url, 'html': res.text}
                    out.write(json.dumps(jspage) + '\n')
            else:
                print res.status_code, url
        except:
            print "Failed to fetch ", url
            traceback.print_exc()
    out.close()


def test():
    fetcher = Fetcher()
    urls = ['http://nyu.edu', 'http://mit.edu']
    out_file = 'test_fetcher.json'
    fetcher.fetch(urls, out_file)


if __name__ == "__main__":
    url_file = sys.argv[1]
    out_file = sys.argv[2]
    urls = URLUtility.load_urls(url_file)
    fetcher = Fetcher()
    fetcher.fetch(urls, out_file, True)
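# Example reader for the output written above. This is an assumption based only on the
# writer code: each line of out_file is one JSON object, either the extracted page
# (page.get_json_obj()) or {'url': ..., 'html': ...}. A consumer can stream it line by line:
import json

def iter_fetched_pages(out_file):
    with open(out_file) as f:
        for line in f:
            line = line.strip()
            if line:
                yield json.loads(line)

# Usage:
# for jspage in iter_fetched_pages('test_fetcher.json'):
#     print jspage['url']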
def evaluate_ranking(seed_file, candidate_file, negative_file, data_dir,
                     rankings, max_cand, representation, test_ratio, online,
                     selection=None, max_pages=1, prf=False, seednumbs=None):
    """
    test_ratio: fraction of the seed urls split off to be used as test urls
    """
    t = time.time()
    seed_urls = URLUtility.load_urls(seed_file)
    cand_urls = URLUtility.load_urls(candidate_file)
    neg_urls = URLUtility.load_urls(negative_file)

    # Split train and test urls
    split = int((1 - test_ratio) * len(seed_urls))
    test_urls = seed_urls[split:]
    train_urls = seed_urls[:split]

    # Fetch the train, test and candidate sites
    print "Loading the cache"
    fetcher = Fetcher(data_dir)
    if selection == "mix":
        # This setting is meant to show that the multi-page representation is not (yet) effective
        train_selection = test_selection = "search"
        cand_selection = "random"
    else:
        train_selection = test_selection = cand_selection = selection

    print "\nFetching train sites"
    train_sites = fetcher.fetch_sites(train_urls, max_pages, train_selection, online)
    print "Time to fetch train sites: ", time.time() - t
    t = time.time()

    if seednumbs:
        seednumbs = get_seednumbs(seednumbs[0], len(train_sites), seednumbs[1])
    else:
        seednumbs = [len(train_sites)]
    print "seednumbs", seednumbs

    for seednumb in seednumbs:
        train_sites = train_sites[:seednumb + 1]
        #for s in train_sites:
        #    for p in s:
        #        print p.get_url()

        print "\nFetching cand sites"
        cand_sites = fetcher.fetch_sites(cand_urls, max_pages, cand_selection, online)
        print "\nFetching test sites"
        test_sites = fetcher.fetch_sites(test_urls, max_pages, test_selection, online)
        print "\nFetching negative sites"
        neg_sites = fetcher.fetch_sites(neg_urls, 1, None, online)
        print "Time to fetch cand, test, neg sites: ", time.time() - t

        cand_sites = cand_sites[:max_cand]
        max_cand -= len(test_sites)
        cand_sites.extend(test_sites)
        print "Number of seed sites: ", len(train_sites)
        print "Number of test sites: ", len(test_sites)
        print "Number of candidate sites: ", len(cand_sites)
        print "Ranking methods: ", rankings
        if online:
            print "Running online mode"
        else:
            print "Running offline mode"

        # Initialize the ranking models
        for ranking in rankings:
            # Train
            print "Ranking..."
            t = time.time()
            # train_sites might be changed in the object initialization
            ranker = Ranker(copy.deepcopy(train_sites), representation, ranking, neg_sites)
            print "Time to initialize ranker: ", time.time() - t
            t = time.time()
            top_sites = ranker.rank(cand_sites, prf)
            print "Time to rank: ", time.time() - t

            # Evaluate
            print "Evaluating ranking results"
            site2rank = {}
            site2website = {}
            for i, site_score in enumerate(top_sites):
                site = site_score[0].get_host()
                if site not in site2rank:
                    site2rank[site] = i
                    site2website[site] = site_score[0]
            test_scores = []
            #test_count = 0
            for url in test_urls:
                site = URLUtility.get_host(url)
                if site in site2rank:
                    #test_count += 1
                    print site, site2rank[site]
                    print [p.get_url() for p in site2website[site]]
                    test_scores.append(site2rank[site])
            test_scores = sorted(test_scores)
            mean = sum(test_scores) / float(len(test_scores))
            mean = round(mean, 2)
            median = test_scores[(len(test_scores) - 1) / 2]
            #prec_at_k = round(len([s for s in test_scores if s <= len(test_urls)]) / float(test_count), 4) * 100
            prec_at_k = round(
                len([s for s in test_scores if s < len(test_scores)]) /
                float(len(test_scores)), 4) * 100
            precs = compute_prec(test_scores)
            print "RESULTS_SEEDNUMB", len(train_sites)
            print "RESULTS_RAW," + ranking + ',' + ','.join([str(s) for s in test_scores])
            print "RESULTS_AGGREGATION," + ranking + ',' + str(mean) + ',' + str(median) + ',' + str(prec_at_k)
            print "RESULTS_PRECS", ranking + ',' + ','.join([str(p) for p in precs])

            # Debug: print the top-ranked urls
            print "Top 20 urls: "
            for item in top_sites[:20]:
                print item[0].get_host(), item[1]
                print [p.get_url() for p in item[0]]

            # Clear the pre-computed vectorization from the previous run
            clear(train_sites)
            clear(cand_sites)
            clear(test_sites)
            clear(neg_sites)
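# A plausible sketch (an assumption, not the project's implementation) of the compute_prec
# helper used above: given the ranks of the held-out test sites, it returns the fraction of
# test sites recovered within increasing rank cutoffs (the cutoff values are hypothetical).
def compute_prec(test_scores, cutoffs=(10, 20, 50, 100, 200)):
    total = float(len(test_scores))
    return [round(len([r for r in test_scores if r < c]) / total, 4) for c in cutoffs]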
    counter = Counter()
    for site in sites:
        for p in site:
            text = p.get_text('meta')
            text = URLUtility.clean_text(text)
            words = word_tokenize(text)
            words = [word for word in words if word not in stop and len(word) > 2]
            counter += Counter(words)

    # Get the topk words
    counter = [(counter[w], w) for w in counter if counter[w] > 1]  # convert to array
    heapq.heapify(counter)
    topk = heapq.nlargest(k, counter)
    print "Top extracted keywords: ", topk
    return [w[1] for w in topk]


def make_output_filename(data_dir, seed_file):
    seed_filename = seed_file.split("/")[-1].split(".")[0]
    return data_dir + "/" + seed_filename + "_candidates.txt"


if __name__ == "__main__":
    seed_file = sys.argv[1]
    data_dir = sys.argv[2]
    seed_urls = URLUtility.load_urls(seed_file)
    collect_candidates(seed_urls, data_dir)
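# The keyword-extraction code above assumes module-level setup roughly like the following.
# The exact definitions of stop and k live elsewhere in the project; this is a sketch of one
# reasonable configuration (URLUtility and collect_candidates are project modules, not shown).
import sys
import heapq
from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords

stop = set(stopwords.words('english'))  # words filtered out of the meta text
k = 10                                  # number of top keywords to return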
def run_mix_search(self, ranking, selection=None, online=True, max_results=50,
                   seed_keyword="gun", search="kw", iters=5,
                   representation='body', negative_file=None):
    """
    seed_sites: urls that are used for search
    selected_urls: urls that were already used for search
    Only top-ranked urls become seed urls

    Important Args:
        ranking: a ranking method
        max_results: maximum number of results to return in related and keyword search
    """
    max_pages = 1  # Always use a single page to represent a website
    train_sites = self.fetcher.fetch_sites(self.train_urls, max_pages, selection, online)
    if negative_file:
        # (random) reliably negative examples
        neg_urls = URLUtility.load_urls(negative_file)
        neg_urls = neg_urls[:200]
    else:
        neg_urls = []
    print "neg_urls: ", len(neg_urls)
    neg_sites = self.fetcher.fetch_sites(neg_urls, 1, None, online)
    ranker = Ranker(train_sites, representation, ranking, neg_sites)

    # Data
    scores = []  # Avoid exception when iters=0
    #seed_sites = self.train_urls  # topk urls from each search batch
    seed_sites = train_sites  # topk urls from each search batch
    selected_urls = {}  # avoid searching with these urls again
    selected_urls['kw'] = set()
    selected_urls['bl'] = set()
    selected_urls['rl'] = set()
    selected_urls['fw'] = set()
    results = []  # Search results for ranking
    urls = set()  # Avoid fetching and ranking these urls again
    sites = set()  # used to compute the reward

    # Hyperparameters
    #max_numb_pages = 12000  # stop condition
    max_numb_pages = 51000  # stop condition
    #iters = 500
    iters = 2000  # Note: this overrides the iters argument
    k = 20  # number of newly discovered pages to be added to the seed list
    max_kw = 20  # maximum number of keywords to select from the seed pages
    self.searcher.set_max_keywords(max_kw)

    # Initialize the search operator selection strategy
    count = {}  # Count the number of results yielded by each search operator
    count['bl'] = count['kw'] = count['rl'] = count['fw'] = 0
    count['bl'] = 20000  # never choose this
    #ucb = UCB1(['rl', 'bl', 'kw'])
    ucb = UCB1(['rl', 'bl', 'kw', 'fw'])

    site_mode = False  # used in the get_top_ranked_urls function
    for i in xrange(iters):
        t = time.time()
        print "Searching... ", len(seed_sites), " seed urls"
        searchop = self.select_searchop(count, search, ucb)
        # Backlink search and related search only use the host name to form the query,
        # i.e. searchop != 'kw' <=> searchop == 'bl' or searchop == 'rl'
        if searchop == 'rl' or searchop == 'bl':
            site_mode = True
        else:
            site_mode = False
        print "\n Iteration ", i, searchop
        new_urls = self.searcher.search(seed_sites, searchop,
                                        seed_keyword=seed_keyword,
                                        max_results=max_results)
        new_urls = [url for url in new_urls if url not in urls]
        if len(new_urls) == 0:
            print "Searcher found 0 url"
            seed_sites = self.get_top_ranked_urls(scores, k, selected_urls[searchop], site_mode)
            if len(seed_sites) == 0:
                print "Stop. Running out of seeds"
                break
            else:
                continue
        urls.update(new_urls)
        print "Time to search ", i, ": ", time.time() - t

        t = time.time()
        new_sites = self.fetcher.fetch_sites(new_urls, max_pages, selection, online)
        print "Time to fetch ", i, ": ", time.time() - t

        t = time.time()
        temp = len(results)
        results.extend(new_sites)
        print "Size of candidates (after): ", len(results)
        print "Number of new candidates (after): ", len(results) - temp
        scores = ranker.rank(results)
        if len(scores) >= max_numb_pages:
            print "Stop. Retrieved ", max_numb_pages, " pages"
            break
        #seed_sites = self.get_top_ranked_urls(scores, k, selected_urls[searchop])
        seed_sites = self.get_top_ranked_urls(scores, k, selected_urls[searchop], site_mode)
        if len(seed_sites) == 0:
            print "Stop. Running out of seeds"
            break
        self.save_urls(new_sites, i)

        # Feed information from the search results back to the operator selector
        count[searchop] += len(new_urls)
        if (search == 'bandit') and new_sites:
            reward = self.get_reward(scores, new_sites, sites)
            print "UCB Rewards", searchop, reward
            ucb.update(searchop, reward, len(new_sites))
        sites.update([s.get_host() for s in new_sites])
        print "Time to rank ", i, ": ", time.time() - t

    self.save_scores(scores)
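# A minimal UCB1 sketch (an assumption; the project's UCB1 class may use a different
# interface) showing the bandit logic used above for search operator selection: select()
# picks the arm with the highest upper confidence bound, and update() feeds back the
# observed reward for the arm that was played.
import math

class UCB1(object):
    def __init__(self, arms):
        self.counts = dict((a, 0) for a in arms)
        self.values = dict((a, 0.0) for a in arms)
        self.total = 0

    def select(self):
        # Play each arm once before applying the UCB formula
        for arm, n in self.counts.items():
            if n == 0:
                return arm
        def ucb(arm):
            bonus = math.sqrt(2.0 * math.log(self.total) / self.counts[arm])
            return self.values[arm] + bonus
        return max(self.counts, key=ucb)

    def update(self, arm, reward, n=1):
        # reward is treated as the average reward per observation; keep an incremental mean
        self.counts[arm] += n
        self.total += n
        self.values[arm] += (reward - self.values[arm]) * float(n) / self.counts[arm]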
def run(self, ranking, selection=None, online=True, max_results=50,
        seed_keyword="gun", searchop="kw", iters=5,
        representation='body', negative_file=None):
    """
    seed_sites: urls that are used for search
    selected_urls: urls that were already used for search
    Only top-ranked urls become seed urls

    Important Args:
        ranking: a ranking method
        max_results: maximum number of results to return in related and keyword search
    """
    max_pages = 1  # Always use a single page to represent a website
    train_sites = self.fetcher.fetch_sites(self.train_urls, max_pages, selection, online)
    if negative_file:
        # (random) reliably negative examples
        neg_urls = URLUtility.load_urls(negative_file)
        neg_urls = neg_urls[:200]
    else:
        neg_urls = []
    print "neg_urls: ", len(neg_urls)
    neg_sites = self.fetcher.fetch_sites(neg_urls, 1, None, online)
    ranker = Ranker(train_sites, representation, ranking, neg_sites)

    # Data
    scores = []  # Avoid exception when iters=0
    #seed_sites = self.train_urls  # topk urls from each search batch
    seed_sites = train_sites  # topk urls from each search batch
    selected_urls = set()  # avoid searching with these urls again
    results = []  # Search results for ranking
    urls = set()  # Avoid fetching and ranking these urls again

    # Hyperparameters
    #max_numb_pages = 12000  # stop condition
    max_numb_pages = 51000  # stop condition
    #iters = 500
    iters = 2000  # Note: this overrides the iters argument
    k = 20  # number of newly discovered pages to be added to the seed list
    max_kw = 20  # maximum number of keywords to select from the seed pages
    self.searcher.set_max_keywords(max_kw)

    """
    # Search Strategy
    blsearch = kwsearch = rlsearch = fwsearch = False
    if search == 'bl':
        blsearch = True
        print "Backlink search enabled"
    elif search == 'rl':
        rlsearch = True
        print "Related search enabled"
    elif search == 'kw':
        kwsearch = True
        print "Keyword search enabled"
    """

    site_mode = False  # used in the get_top_ranked_urls function
    if searchop == 'rl' or searchop == 'bl':
        site_mode = True
    for i in xrange(iters):
        t = time.time()
        print "Searching... ", len(seed_sites), " seed urls"
        print "\n Iteration ", i, searchop
        new_urls = self.searcher.search(seed_sites, searchop,
                                        seed_keyword=seed_keyword,
                                        max_results=max_results)
        new_urls = [url for url in new_urls if url not in urls]
        if len(new_urls) == 0:
            print "Searcher found 0 url"
            seed_sites = self.get_top_ranked_urls(scores, k, selected_urls, site_mode)
            if len(seed_sites) == 0:
                print "Stop. Running out of seeds"
                break
            else:
                continue
        urls.update(new_urls)
        print "Time to search ", i, ": ", time.time() - t

        t = time.time()
        new_sites = self.fetcher.fetch_sites(new_urls, max_pages, selection, online)
        print "Time to fetch ", i, ": ", time.time() - t

        t = time.time()
        print "Size of candidates (before): ", len(results)
        results.extend(new_sites)
        print "Size of candidates (after): ", len(results)
        scores = ranker.rank(results)
        if len(scores) >= max_numb_pages:
            print "Stop. Retrieved ", max_numb_pages, " pages"
            break
        seed_sites = self.get_top_ranked_urls(scores, k, selected_urls, site_mode)
        if len(seed_sites) == 0:
            print "Stop. Running out of seeds"
            break
        self.save_urls(new_sites, i)
        print "Time to rank ", i, ": ", time.time() - t

    self.save_scores(scores)
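# A sketch (an assumption, not this repo's code) of what get_top_ranked_urls is expected to
# do in both run() and run_mix_search(): walk the ranked results from best to worst and
# return up to k sites that have not been used as search seeds yet, recording the chosen
# keys in selected_urls so they are not reused.
def get_top_ranked_urls(self, scores, k, selected_urls, site_mode):
    seeds = []
    for site, score in scores:  # scores are assumed to be sorted, best first
        # Backlink/related search ('bl'/'rl') query by host; keyword search queries by page url
        key = site.get_host() if site_mode else list(site)[0].get_url()
        if key in selected_urls:
            continue
        selected_urls.add(key)
        seeds.append(site)
        if len(seeds) == k:
            break
    return seeds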