Example #1
0
def is_valid(url, saved):
    """Return True when *url* is eligible for crawling.

    The fragment is stripped first (via ``defrag``). The URL is then rejected
    when any of the following holds:

    * its scheme is not http/https;
    * ``saved`` already holds a truthy entry under the URL's hash;
    * its host does not end in ``.ics``/``.cs``/``.informatics``/``.stat``
      ``.uci.edu`` or ``today.uci.edu`` (an optional ``:port`` is allowed);
    * its path ends in one of the listed non-HTML file extensions;
    * it is on ``today.uci.edu`` but its path does not start with
      ``/department/information_computer_sciences``.

    Args:
        url: URL string to check.
        saved: Mapping keyed by URL hash; only ``.get`` is used.

    Returns:
        bool: True if the URL passes every filter, False otherwise.

    Raises:
        TypeError: Re-raised after printing the offending URL.
    """
    try:
        url = defrag(url)
        parsed = urlparse(url)

        if parsed.scheme not in {"http", "https"}:
            return False

        if saved.get(get_urlhash(url)):
            return False

        # Anchored at the end so look-alike hosts such as
        # "www.ics.uci.edu.evil.com" are not mistaken for an allowed domain.
        if not re.match(
                r".*(\.ics"
                r"|\.cs"
                r"|\.informatics"
                r"|\.stat"
                r"|today)"
                r"\.uci\.edu(:\d+)?$", parsed.netloc):
            return False

        if re.match(
                r".*\.(css|js|bmp|gif|jpe?g|ico"
                + r"|png|tiff?|mid|mp2|mp3|mp4"
                + r"|wav|avi|mov|mpeg|ram|m4v|mkv|ogg|ogv|pdf"
                + r"|ps|eps|tex|ppt|pptx|ppsx|diff|doc|docx|xls|xlsx|names"
                + r"|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso"
                + r"|epub|dll|cnf|tgz|sha1"
                + r"|thmx|mso|arff|rtf|jar|csv"
                + r"|rm|smil|wmv|swf|wma|zip|rar|gz)$", parsed.path.lower()):
            return False

        # urlparse() yields paths that start with a single "/", so the
        # allowed section of today.uci.edu is matched as a plain path prefix.
        if re.match(r"today\.uci\.edu", parsed.netloc) and not parsed.path.startswith(
                "/department/information_computer_sciences"):
            return False

        return True
    except TypeError:
        print("TypeError for ", url)
        raise
Example #2
0
 def add_words(self, url, words):
     """Store the words associated with *url* in ``self.save``.

     Does nothing when ``words`` is None. Otherwise ``(url, tuple(words))``
     is written under the URL's hash and the store is synced, both while
     holding ``self.save_lock``.
     """
     if words is not None:
         key = get_urlhash(url)
         with self.save_lock:
             self.save[key] = (url, tuple(words))
             self.save.sync()
 def add_url(self, url):
     """Record *url* as seen-but-not-downloaded and hand it to ``add_to_tbd``.

     The URL is normalized first. If its hash is already present in
     ``self.save`` the call is a no-op.
     """
     normalized = normalize(url)
     key = get_urlhash(normalized)
     if key in self.save:
         return
     # False marks the entry as not yet completed.
     self.save[key] = (normalized, False)
     self.save.sync()
     self.add_to_tbd(normalized)
 def add_url(self, url):
     """Queue *url* for download the first time it is seen.

     The normalized URL is stored in ``self.save`` with a False completion
     flag, the store is synced, and the URL is appended to
     ``self.to_be_downloaded``. URLs whose hash is already stored are ignored.
     """
     normalized = normalize(url)
     key = get_urlhash(normalized)
     already_known = key in self.save
     if not already_known:
         self.save[key] = (normalized, False)
         self.save.sync()
         self.to_be_downloaded.append(normalized)
Example #5
0
 def mark_url_complete(self, url):
     """Flag *url* as completed in ``self.save`` and sync the store.

     An error is logged when the URL's hash was never recorded, but the
     completed entry is written regardless.
     """
     key = get_urlhash(url)
     previously_seen = key in self.save
     if not previously_seen:
         # Unexpected: a completed URL should have been added beforehand.
         self.logger.error(
             f"Completed url {url}, but have not seen it before.")
     self.save[key] = (url, True)
     self.save.sync()
Example #6
0
 def add_url(self, parent_url, scraped_url):
     """Queue ``scraped_url`` (together with ``parent_url``) if it is new.

     Novelty is decided on the hash of the normalized URL. New URLs are
     stored in ``self.save`` with a False completion flag and the store is
     synced.
     """
     normalized = normalize(scraped_url)
     key = get_urlhash(normalized)
     if key in self.save:
         return
     self.save[key] = (normalized, False)
     self.save.sync()
     # The queue entry carries the URL as scraped, not the normalized form.
     self.to_be_downloaded.append((parent_url, scraped_url))
Example #7
0
    def add_url(self, url):
        """Register *url* and pass it to ``add_to_backQueue`` if unseen.

        The whole operation, including normalization and hashing, runs while
        holding ``self.lock``.
        """
        with self.lock:
            normalized = normalize(url)
            key = get_urlhash(normalized)
            is_new = key not in self.save
            if is_new:
                self.save[key] = (normalized, False)
                self.save.sync()
                self.add_to_backQueue(normalized)
Example #8
0
 def add_url(self, url):
     """Queue *url* on its domain's download queue the first time it is seen.

     The URL is normalized and hashed. If the hash is not yet in
     ``self.save``, the entry ``(url, False)`` is stored, the store is
     synced, and the URL is put on ``self.to_be_downloaded[domain]``, where
     ``domain`` comes from ``Frontier.place_url_in_dom``.
     """
     url = normalize(url)
     urlhash = get_urlhash(url)
     if urlhash in self.save:
         return
     self.save[urlhash] = (url, False)
     self.save.sync()
     # A bare `self.delay_tracker` expression used to sit here; it had no
     # effect and has been removed.
     domain = Frontier.place_url_in_dom(url)
     self.to_be_downloaded[domain].put(url)
Example #9
0
 def add_url(self, url):
     """Add *url* to its to-be-downloaded bucket if it has not been seen.

     Returns:
         bool: True when the URL was new and has been queued, False when
         its hash was already present in ``self.save``.
     """
     normalized = normalize(url)
     key = get_urlhash(normalized)
     if key in self.save:
         return False
     self.save[key] = (normalized, False)
     self.save.sync()
     bucket = self._get_tbd_bucket(normalized)
     bucket.append(normalized)
     return True
Example #10
0
 def check_url_completed(self, url):
     """Report whether *url* is stored in ``self.save`` with a truthy flag.

     Returns:
         bool: True only when the normalized URL's hash is in ``self.save``
         and the second element of its stored entry is truthy.
     """
     key = get_urlhash(normalize(url))
     if key in self.save and self.save[key][1]:
         return True
     return False
Example #11
0
 def add_url(self, url):
     """Queue *url* for download the first time it is seen.

     The URL is normalized and hashed. The membership check and the write to
     ``self.save`` both happen under ``self.lock``, so two callers cannot
     both decide the same URL is new. Using ``with`` also guarantees the
     lock is released if the write or ``sync()`` raises.
     """
     url = normalize(url)
     urlhash = get_urlhash(url)
     with self.lock:
         if urlhash in self.save:
             return
         self.save[urlhash] = (url, False)
         self.save.sync()
     # Queueing happens after the lock is released, as before.
     self.to_be_downloaded.put(url)
 def add_url(self, url):
     """Queue *url* for download unless it has been seen before.

     The URL is normalized and then passed through ``defragURL`` before
     hashing, so the stored and queued value is the result of both steps.
     """
     cleaned = defragURL(normalize(url))
     key = get_urlhash(cleaned)
     if key in self.save:
         return
     self.save[key] = (cleaned, False)
     self.save.sync()
     self.to_be_downloaded.append(cleaned)
Example #13
0
    def mark_url_complete(self, url, urlInfo):
        """Store ``urlInfo`` under the hash of *url* and sync the store.

        Runs under ``self.save_lock``. An error is logged if the hash was
        not already present, but ``urlInfo`` is written either way.
        """
        key = get_urlhash(url)
        with self.save_lock:
            unseen = key not in self.save
            if unseen:
                # Unexpected: completion reported for a URL never added.
                self.logger.error(
                    f"Completed url {url}, but have not seen it before.")
            self.save[key] = urlInfo
            self.save.sync()
Example #14
0
 def add_url(self, url):
     """Append *url* to ``self.to_be_downloaded`` if it is new.

     The normalized URL is recorded in ``self.save`` with a False completion
     flag and the store is synced before queueing. Known URLs are skipped.
     """
     normalized = normalize(url)
     key = get_urlhash(normalized)
     if key in self.save:
         return
     self.save[key] = (normalized, False)
     self.save.sync()
     self.to_be_downloaded.append(normalized)
Example #15
0
    def add_url(self, url):
        """Record a newly seen *url*, count it, and queue it on first sighting.

        If the normalized URL's hash is not in ``self.save`` it is stored
        with a False completion flag and its ``self.discovered_urls`` counter
        is incremented. When that counter is exactly 1 the URL is written to
        ``self.discovered_urls_text_file`` and appended to
        ``self.to_be_downloaded``.
        """
        normalized = normalize(url)
        key = get_urlhash(normalized)
        if key in self.save:
            return

        self.save[key] = (normalized, False)
        self.save.sync()
        self.discovered_urls[normalized] += 1

        first_sighting = self.discovered_urls[normalized] == 1
        if first_sighting:
            self.discovered_urls_text_file.write(normalized + '\n')
            self.to_be_downloaded.append(normalized)
Example #16
0
 def mark_url_complete(self, url):
     """Mark *url* as completed in ``self.save`` and sync the store.

     ``task_done()`` is called on ``self.to_be_downloaded`` first. The read
     of and write to ``self.save`` then happen under ``self.lock``; using
     ``with`` guarantees the lock is released even if the write or
     ``sync()`` raises. An error is logged when the URL's hash was never
     recorded, but the completed entry is written regardless.
     """
     self.to_be_downloaded.task_done()
     urlhash = get_urlhash(url)
     with self.lock:
         if urlhash not in self.save:
             # This should not happen.
             self.logger.error(
                 f"Completed url {url}, but have not seen it before.")
         self.save[urlhash] = (url, True)
         self.save.sync()
Example #17
0
    def mark_url_complete(self, url):
        """Flag the normalized form of *url* as completed in ``self.save``.

        An error mentioning the normalized URL is logged when its hash was
        never recorded. The completed entry is written and the store synced
        in either case.
        """
        url = normalize(url)
        key = get_urlhash(url)
        known = key in self.save
        if not known:
            # Unexpected: completion reported for a URL never added.
            self.logger.error(
                f"Completed url {url}, but have not seen it before.")
        self.save[key] = (url, True)
        self.save.sync()
    def add_url(self, url):
        """Queue *url* for download if it has not been seen before.

        Returns:
            bool: True when the URL was new and has been appended to
            ``self.to_be_downloaded``, False otherwise. The original note
            says worker.py uses this to count unique pages.
        """
        normalized = normalize(url)
        key = get_urlhash(normalized)
        if key in self.save:
            return False
        self.save[key] = (normalized, False)
        self.save.sync()
        self.to_be_downloaded.append(normalized)
        return True
Example #19
0
    def add_url(self, url):
        """Record *url* as a new ``UrlInfo`` entry and queue it if unseen.

        The membership check and the write to ``self.save`` happen under
        ``self.save_lock``. The ``put`` on ``self.to_be_downloaded`` is done
        after the lock has been released.
        """
        normalized = normalize(url)
        key = get_urlhash(normalized)

        with self.save_lock:
            is_new = key not in self.save
            if is_new:
                self.save[key] = UrlInfo(normalized)
                self.save.sync()

        if is_new:
            self.to_be_downloaded.put(normalized)