Example #1
    def run(self):

        # Recorder collects per-URL crawl data; robots.txt permissions are
        # checked through can_fetch below.
        record = Recorder()
        while True:
            tbd_url = self.frontier.get_tbd_url()

            if not tbd_url:
                self.logger.info("Frontier is empty. Stopping Crawler.")
                break

            if not self.can_fetch(tbd_url):
                # Disallowed by robots.txt rules; skip without downloading.
                continue

            resp = download(tbd_url, self.config, self.logger)
            self.logger.info(f"Downloaded {tbd_url}, status <{resp.status}>, "
                             f"using cache {self.config.cache_server}.")

            scraped_urls = scraper(tbd_url, resp)

            # adding data to recorder
            record.add_url(tbd_url)

            if resp.raw_response is not None and is_valid(tbd_url):
                record.add_words(resp.raw_response.content, tbd_url)

            record.save()

            for scraped_url in scraped_urls:
                self.frontier.add_url(scraped_url)
            self.frontier.mark_url_complete(tbd_url)
            time.sleep(self.config.time_delay)

        record.finish_crawl_report()
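
Example #1 depends on a can_fetch helper (the comment above it mentions robots.txt), but that helper is not shown. A minimal sketch of what it could look like, assuming the standard-library urllib.robotparser and a hypothetical per-host cache dict self._robot_parsers (initialized to an empty dict in __init__); this is an illustration, not the original implementation:

import urllib.robotparser
from urllib.parse import urlparse

    def can_fetch(self, url, user_agent="*"):
        # Hypothetical sketch: keep one RobotFileParser per host so robots.txt
        # is downloaded at most once per domain.
        parsed = urlparse(url)
        host = parsed.netloc
        if host not in self._robot_parsers:
            rp = urllib.robotparser.RobotFileParser()
            rp.set_url(f"{parsed.scheme}://{host}/robots.txt")
            try:
                rp.read()  # fetch and parse robots.txt
            except OSError:
                # If robots.txt cannot be fetched, err on the permissive side;
                # a stricter crawler could skip the host instead.
                rp = None
            self._robot_parsers[host] = rp
        rp = self._robot_parsers[host]
        return True if rp is None else rp.can_fetch(user_agent, url)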
Example #2
    def __init__(self, config, restart):
        self.logger = get_logger("Frontier")
        self.config = config
        self.to_be_downloaded = list()

        if not os.path.exists(self.config.save_file) and not restart:
            # Save file does not exist, but request to load save.
            self.logger.info(
                f"Did not find save file {self.config.save_file}, "
                f"starting from seed.")
        elif os.path.exists(self.config.save_file) and restart:
            # Save file does exist, but request to start from seed.
            self.logger.info(
                f"Found save file {self.config.save_file}, deleting it.")
            os.remove(self.config.save_file)
        # Load existing save file, or create one if it does not exist.
        self.save = shelve.open(self.config.save_file)
        if restart:
            for url in self.config.seed_urls:
                if is_valid(url, self.config):
                    self.add_url(url)
        else:
            # Set the frontier state with contents of save file.
            self._parse_save_file()
            if not self.save:
                for url in self.config.seed_urls:
                    self.add_url(url)
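
Example #2 calls self.add_url, and Example #1 calls frontier.add_url and frontier.mark_url_complete, but none of those methods are shown. A minimal sketch, assuming the shelve maps a URL hash to a (url, completed) tuple, which is the layout _parse_save_file in Example #3 iterates over; the sha256 hashing is an illustrative assumption, not necessarily the original scheme:

import hashlib

    def add_url(self, url):
        urlhash = hashlib.sha256(url.encode("utf-8")).hexdigest()
        if urlhash not in self.save:
            # Store the URL as not-yet-completed and queue it for download.
            self.save[urlhash] = (url, False)
            self.save.sync()  # flush the shelve to disk
            self.to_be_downloaded.append(url)

    def mark_url_complete(self, url):
        urlhash = hashlib.sha256(url.encode("utf-8")).hexdigest()
        if urlhash in self.save:
            # Flip the completed flag so a restart will not re-download it.
            self.save[urlhash] = (url, True)
            self.save.sync()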
Example #3
    def _parse_save_file(self):
        ''' This function can be overridden for alternate saving techniques. '''
        total_count = len(self.save)
        tbd_count = 0
        for url, completed in self.save.values():
            if not completed and is_valid(url):
                self.to_be_downloaded.append(url)
                tbd_count += 1
        self.logger.info(
            f"Found {tbd_count} urls to be downloaded from {total_count} "
            f"total urls discovered.")
def get_link(document, base):
    ''' Extract all anchor hrefs from the document, resolve them against the
        base URL, and return the valid, normalized links. '''
    all_links = []
    soup = BeautifulSoup(document, 'html.parser')
    for link in soup.find_all('a'):
        curr_link = link.get('href')
        if curr_link is None:
            # <a> tags without an href return None; skip them before urljoin.
            continue
        curr_link = urljoin(base, curr_link)
        if scraper.is_valid(curr_link):
            curr_link = normalize_url(curr_link)
            all_links.append(curr_link)

    return all_links
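
A short usage sketch for get_link, assuming from bs4 import BeautifulSoup and from urllib.parse import urljoin are in scope along with the scraper module; the HTML is made up, and the exact result depends on what scraper.is_valid and normalize_url accept:

html = b"""
<html><body>
  <a href="/community/news/view_news?id=1906">News item</a>
  <a href="https://www.stat.uci.edu/minor-in-statistics">Stats minor</a>
  <a>no href here</a>
</body></html>
"""
# urljoin resolves the relative href against the base, so the first link
# becomes https://www.ics.uci.edu/community/news/view_news?id=1906, while
# the href-less anchor is skipped.
print(get_link(html, "https://www.ics.uci.edu/"))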
Example #5
    def fix_token_hash(self, tokens_file, force_delete):
        print("fix_token_hash")
        removed_count = 0
        with open("./removed_urls_tokenhash.txt", "a") as removed_url_file:
            removed_url_file.write("===" + tokens_file + "===\n")
            with shelve.open(tokens_file) as db:
                # { hash : (url, complete)}
                # Materialize the items so deleting keys mid-loop is safe
                # regardless of the underlying dbm backend.
                for key, value in list(db.items()):
                    # parsed = urlparse(db[key][0])
                    # filtered_url from extract_next_links() in scraper.py
                    filtered_url = value
                    if not is_valid(filtered_url, self.config):
                        print("Found match to delete: " + filtered_url)
                        if force_delete:
                            del db[key]
                        removed_count += 1
                        removed_url_file.write(filtered_url + "\n")
                        # input("(enter to continue)")
            mode = "" if force_delete else "(Preview) "
            print(f"{mode}Removed {removed_count} links from the tokenhash "
                  f"(saved in ./removed_urls_tokenhash.txt)")
Example #6
    def run(self):
        while True:
            tbd_url = self.frontier.get_tbd_url()
            if not tbd_url:
                self.logger.info("Frontier is empty. Stopping Crawler.")
                break

            # If the URL that was added to the frontier is invalid, do not download it
            if not is_valid(tbd_url):
                self.frontier.mark_url_complete(tbd_url)
                time.sleep(self.config.time_delay)
                continue

            resp = download(tbd_url, self.config, self.logger)
            self.logger.info(f"Downloaded {tbd_url}, status <{resp.status}>, "
                             f"using cache {self.config.cache_server}.")

            # Check for any errors from the download
            if str(resp.status).startswith('4') or resp.error:
                self.frontier.mark_url_complete(tbd_url)
                time.sleep(self.config.time_delay)
                continue

            # Text Extraction
            untokenized_text = self.extract_text(resp)
            tokenized_text = self.token.Tokenize(untokenized_text)

            # Determine if indexing is worthwhile
            low_value_page = False
            if sum(tokenized_text.values()) < self.low_value_threshold:
                low_value_page = True
                self.logger.info(
                    '[SKIPPING] URL found to be of low value: {0} tokens'.
                    format(sum(tokenized_text.values())))

            # Compare similarity to the last 5 pages we crawled
            similar = False
            for url_token_pair in self.cache:
                if self.token.Similarity(tokenized_text, url_token_pair[1]):
                    similar = True
                    self.logger.info(
                        '[SKIPPING] Similarity found between these two urls. Skipping the second url...\n{0}\n{1}'
                        .format(url_token_pair[0], tbd_url))
                    break

            # Add page into self.cache
            self.cache.append((tbd_url, tokenized_text))

            if not similar and not low_value_page:
                # Insert into DB
                self.tiny.insert({tbd_url: tokenized_text})  # insert the tokenized text into TinyDB

                # Link Extraction
                scraped_urls = scraper(tbd_url, resp)
                for scraped_url in scraped_urls:
                    self.frontier.add_url(scraped_url)

            # Mark as complete and sleep to be patient
            self.frontier.mark_url_complete(tbd_url)
            time.sleep(self.config.time_delay)  # <-- politeness
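
Example #6 relies on self.token.Similarity to flag near-duplicate pages, but that class is not shown. One way such a check could work, sketched as cosine similarity over the token-count dicts that Tokenize appears to return; both the metric and the 0.95 cutoff are assumptions rather than the original implementation:

import math

def similar_counts(counts_a, counts_b, threshold=0.95):
    # Cosine similarity between two {token: count} dicts; returns True when
    # the pages look close enough to be treated as duplicates.
    if not counts_a or not counts_b:
        return False
    dot = sum(counts_a[t] * counts_b[t] for t in counts_a.keys() & counts_b.keys())
    norm_a = math.sqrt(sum(c * c for c in counts_a.values()))
    norm_b = math.sqrt(sum(c * c for c in counts_b.values()))
    return dot / (norm_a * norm_b) >= threshold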
Example #7
        if foundLink not in temp:
            temp.append(foundLink)

    # for link in soup.find_all('a'):
    # 	foundLink = str(urldefrag(link.get('href'))[0])
    # 	if "http" not in foundLink:
    # 		newLink = urljoin(url, foundLink)
    # 		if newLink not in temp:
    # 			temp.append(newLink)

    print(temp)
    print("\n\n")

    valids = []
    for link in temp:
        if is_valid(link):
            valids.append(link)

    print(valids)

    # parsed = urlparse("//www.ics.uci.edu/community/news/view_news?id=1906")

    # print(parsed)

    # result = urljoin("https://www.ics.uci.edu/community", "//stats/news/view_news?id=1906")
    # print(result)

    # page = urlopen('https://www.stat.uci.edu/minor-in-statistics')

    # soup = BeautifulSoup(page,'html.parser')
    # l = len(soup.getText(strip = True).split())
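
The commented-out experiments at the end of Example #7 probe how urljoin and urldefrag behave on the kinds of links the crawler sees. A small self-contained demo using the same URLs from those comments:

from urllib.parse import urljoin, urldefrag

# A scheme-relative reference ("//host/...") keeps the base scheme but
# replaces the host, which is why the commented experiment above does not
# stay on www.ics.uci.edu.
print(urljoin("https://www.ics.uci.edu/community",
              "//stats/news/view_news?id=1906"))
# https://stats/news/view_news?id=1906

# urldefrag(...)[0], as used in the commented loop above, strips the fragment
# so the same page is not queued twice under different anchors.
print(urldefrag("https://www.ics.uci.edu/community/news/view_news?id=1906#news")[0])
# https://www.ics.uci.edu/community/news/view_news?id=1906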