def run(self):
    # Consult robots.txt before downloading (see the can_fetch sketch below).
    record = Recorder()
    while True:
        tbd_url = self.frontier.get_tbd_url()
        if not tbd_url:
            self.logger.info("Frontier is empty. Stopping Crawler.")
            break
        # Skip URLs that robots.txt disallows, and mark them complete so
        # they are not retried from the save file.
        if not self.can_fetch(tbd_url):
            self.frontier.mark_url_complete(tbd_url)
            continue
        resp = download(tbd_url, self.config, self.logger)
        self.logger.info(f"Downloaded {tbd_url}, status <{resp.status}>, "
                         f"using cache {self.config.cache_server}.")
        scraped_urls = scraper(tbd_url, resp)
        # Record crawl statistics for the report.
        record.add_url(tbd_url)
        if resp.raw_response is not None and is_valid(tbd_url):
            record.add_words(resp.raw_response.content, tbd_url)
            record.save()
        for scraped_url in scraped_urls:
            self.frontier.add_url(scraped_url)
        self.frontier.mark_url_complete(tbd_url)
        time.sleep(self.config.time_delay)
    record.finish_crawl_report()
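# `can_fetch` is not shown in these snippets. Below is a minimal sketch using
# the standard-library urllib.robotparser, assuming a self._robot_parsers
# cache dict (initialized as {} in __init__) and a config.user_agent field,
# both hypothetical here -- not the project's actual implementation.
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

def can_fetch(self, url):
    # Cache one parser per host so robots.txt is fetched only once per site.
    parts = urlparse(url)
    host = f"{parts.scheme}://{parts.netloc}"
    parser = self._robot_parsers.get(host)
    if parser is None:
        parser = RobotFileParser(f"{host}/robots.txt")
        try:
            parser.read()
        except OSError:
            return True  # if robots.txt is unreachable, assume allowed
        self._robot_parsers[host] = parser
    return parser.can_fetch(self.config.user_agent, url)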
def __init__(self, config, restart):
    self.logger = get_logger("Frontier")
    self.config = config
    self.to_be_downloaded = list()

    if not os.path.exists(self.config.save_file) and not restart:
        # Save file does not exist, but request to load save.
        self.logger.info(
            f"Did not find save file {self.config.save_file}, "
            f"starting from seed.")
    elif os.path.exists(self.config.save_file) and restart:
        # Save file does exist, but request to start from seed.
        self.logger.info(
            f"Found save file {self.config.save_file}, deleting it.")
        os.remove(self.config.save_file)

    # Load existing save file, or create one if it does not exist.
    self.save = shelve.open(self.config.save_file)
    if restart:
        for url in self.config.seed_urls:
            if is_valid(url, self.config):
                self.add_url(url)
    else:
        # Set the frontier state with contents of save file.
        self._parse_save_file()
        if not self.save:
            for url in self.config.seed_urls:
                self.add_url(url)
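# The add_url and get_tbd_url methods this constructor relies on are not
# shown. A minimal sketch, assuming the save file maps a URL hash to
# (url, completed) pairs (consistent with _parse_save_file below) and
# hypothetical normalize/get_urlhash helpers:
from utils import get_urlhash, normalize  # assumed helper module

def add_url(self, url):
    url = normalize(url)
    urlhash = get_urlhash(url)
    if urlhash not in self.save:
        # Record the URL as discovered but not yet completed, and queue it.
        self.save[urlhash] = (url, False)
        self.save.sync()
        self.to_be_downloaded.append(url)

def get_tbd_url(self):
    try:
        return self.to_be_downloaded.pop()
    except IndexError:
        return None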
def _parse_save_file(self):
    ''' This function can be overridden for alternate saving techniques. '''
    total_count = len(self.save)
    tbd_count = 0
    for url, completed in self.save.values():
        if not completed and is_valid(url):
            self.to_be_downloaded.append(url)
            tbd_count += 1
    self.logger.info(
        f"Found {tbd_count} urls to be downloaded from {total_count} "
        f"total urls discovered.")
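# mark_url_complete, called from both run loops, is also not shown; a sketch
# under the same {urlhash: (url, completed)} save-file layout assumed above:
def mark_url_complete(self, url):
    urlhash = get_urlhash(url)  # same assumed helper as in add_url
    if urlhash not in self.save:
        # Should not happen in normal operation.
        self.logger.error(
            f"Completed url {url}, but have not seen it before.")
    self.save[urlhash] = (url, True)
    self.save.sync()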
from urllib.parse import urljoin

from bs4 import BeautifulSoup


def get_link(document, base):
    # scraper.is_valid and normalize_url are defined elsewhere in the project.
    all_links = []
    soup = BeautifulSoup(document, 'html.parser')
    for link in soup.find_all('a'):
        curr_link = link.get('href')
        # Skip anchors with no href before resolving against the base URL.
        if curr_link is None:
            continue
        curr_link = urljoin(base, curr_link)
        if scraper.is_valid(curr_link):
            curr_link = normalize_url(curr_link)
            all_links.append(curr_link)
    return all_links
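# normalize_url is referenced but not defined here. A minimal sketch, assuming
# normalization means dropping the fragment, lowercasing scheme and host, and
# trimming a trailing slash -- the project's real rules may differ:
from urllib.parse import urldefrag, urlparse, urlunparse

def normalize_url(url):
    url, _fragment = urldefrag(url)        # strip "#..." fragments
    parts = urlparse(url)
    path = parts.path.rstrip('/') or '/'   # treat ".../a/" and ".../a" alike
    return urlunparse((parts.scheme.lower(), parts.netloc.lower(),
                       path, parts.params, parts.query, ''))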
def fix_token_hash(self, tokens_file, force_delete):
    print("fix_token_hash")
    with open("./removed_urls_tokenhash.txt", "a") as removed_url_file:
        removed_url_file.write("===" + tokens_file + "===\n")
        with shelve.open(tokens_file) as db:
            # Each value is a filtered_url from extract_next_links() in
            # scraper.py. Collect keys first: deleting from a shelve while
            # iterating over it is unsafe.
            to_delete = []
            for key, filtered_url in db.items():
                if not is_valid(filtered_url, self.config):
                    print("Found match to delete: " + filtered_url)
                    to_delete.append(key)
                    removed_url_file.write(filtered_url + "\n")
            if force_delete:
                for key in to_delete:
                    del db[key]
            removed_count = len(to_delete)
    print(f"{'' if force_delete else '(Preview) '}Removed {removed_count} "
          f"links from the tokenhash (saved in ./removed_urls_tokenhash.txt)")
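# is_valid(url, config) appears throughout these snippets but is defined
# elsewhere (presumably in scraper.py). A rough sketch of the kind of filter
# it likely is; the extension list and config handling are pure assumptions:
import re
from urllib.parse import urlparse

def is_valid(url, config=None):
    try:
        parsed = urlparse(url)
        if parsed.scheme not in {"http", "https"}:
            return False
        # Reject common non-HTML resources by file extension.
        return not re.search(
            r"\.(css|js|bmp|gif|jpe?g|ico|png|tiff?|pdf|zip|rar|gz"
            r"|mp[234]|avi|mov|docx?|pptx?|xlsx?)$",
            parsed.path.lower())
    except (TypeError, ValueError):
        return False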
def run(self):
    while True:
        tbd_url = self.frontier.get_tbd_url()
        if not tbd_url:
            self.logger.info("Frontier is empty. Stopping Crawler.")
            break

        # If a URL that was added to the frontier is invalid, do not download it.
        if not is_valid(tbd_url):
            self.frontier.mark_url_complete(tbd_url)
            time.sleep(self.config.time_delay)
            continue

        resp = download(tbd_url, self.config, self.logger)
        self.logger.info(f"Downloaded {tbd_url}, status <{resp.status}>, "
                         f"using cache {self.config.cache_server}.")

        # Check for any errors from the download.
        if str(resp.status).startswith('4') or resp.error:
            self.frontier.mark_url_complete(tbd_url)
            time.sleep(self.config.time_delay)
            continue

        # Text extraction.
        untokenized_text = self.extract_text(resp)
        tokenized_text = self.token.Tokenize(untokenized_text)

        # Determine if indexing is worthwhile.
        low_value_page = False
        if sum(tokenized_text.values()) < self.low_value_threshold:
            low_value_page = True
            self.logger.info(
                '[SKIPPING] URL found to be of low value: {0} tokens'
                .format(sum(tokenized_text.values())))

        # Compare similarity to the last 5 pages we crawled.
        similar = False
        for url_token_pair in self.cache:
            if self.token.Similarity(tokenized_text, url_token_pair[1]):
                similar = True
                self.logger.info(
                    '[SKIPPING] Similarity found between these two urls. '
                    'Skipping the second url...\n{0}\n{1}'
                    .format(url_token_pair[0], tbd_url))
                break

        # Add page into self.cache, keeping only the five most recent pages.
        self.cache.append((tbd_url, tokenized_text))
        if len(self.cache) > 5:
            self.cache.pop(0)

        if not similar and not low_value_page:
            # Insert the tokenized text into the TinyDB index.
            self.tiny.insert({tbd_url: tokenized_text})

        # Link extraction.
        scraped_urls = scraper(tbd_url, resp)
        for scraped_url in scraped_urls:
            self.frontier.add_url(scraped_url)

        # Mark as complete and sleep to be polite.
        self.frontier.mark_url_complete(tbd_url)
        time.sleep(self.config.time_delay)  # <-- politeness
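# self.token.Similarity is not shown. A minimal sketch, assuming the token
# maps are frequency dicts and near-duplicates are flagged by Jaccard overlap
# of the token vocabularies; the threshold is an arbitrary assumption:
def Similarity(self, tokens_a, tokens_b, threshold=0.9):
    a, b = set(tokens_a), set(tokens_b)
    if not a or not b:
        return False
    # Jaccard index: |intersection| / |union| of the two token sets.
    return len(a & b) / len(a | b) >= threshold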
if foundLink not in temp:
    temp.append(foundLink)

print(temp)
print("\n\n")

# Keep only the links that pass the validity filter.
valids = [link for link in temp if is_valid(link)]
print(valids)