def is_valid(url, saved):
    """Decide whether *url* is worth crawling.

    Rejects non-http(s) schemes, URLs already present in *saved*,
    hosts outside the allowed *.uci.edu subdomains, resource-file
    extensions, and today.uci.edu paths outside the ICS department
    section. Returns True only when every filter passes.
    """
    try:
        url = defrag(url)
        parts = urlparse(url)
        # Only plain web pages are crawlable.
        if parts.scheme not in {"http", "https"}:
            return False
        # Skip anything already recorded in the save store.
        if saved.get(get_urlhash(url)):
            return False
        # Host must belong to one of the allowed subdomains.
        if not re.match(
                r".*(\.ics|\.cs|\.informatics|\.stat|today)\.uci\.edu",
                parts.netloc):
            return False
        # Reject non-page resources by file extension.
        if re.match(
                r".*\.(css|js|bmp|gif|jpe?g|ico"
                r"|png|tiff?|mid|mp2|mp3|mp4"
                r"|wav|avi|mov|mpeg|ram|m4v|mkv|ogg|ogv|pdf"
                r"|ps|eps|tex|ppt|pptx|ppsx|diff|doc|docx|xls|xlsx|names"
                r"|data|dat|exe|bz2|tar|msi|bin|7z|psd|dmg|iso"
                r"|epub|dll|cnf|tgz|sha1"
                r"|thmx|mso|arff|rtf|jar|csv"
                r"|rm|smil|wmv|swf|wma|zip|rar|gz)$",
                parts.path.lower()):
            return False
        # today.uci.edu is only allowed inside the ICS department section.
        if re.match(r"today\.uci\.edu", parts.netloc):
            if not re.match(r"//department/information_computer_sciences/*",
                            parts.path):
                return False
        return True
    except TypeError:
        print("TypeError for ", url)
        raise
def add_words(self, url, words):
    """Persist the word list scraped from *url* under its hash key."""
    if words is None:
        # Nothing was scraped for this page; leave the store untouched.
        return
    key = get_urlhash(url)
    record = (url, tuple(words))
    with self.save_lock:
        self.save[key] = record
        self.save.sync()
def add_url(self, url):
    """Register *url* and hand it to the to-be-downloaded set if unseen."""
    url = normalize(url)
    key = get_urlhash(url)
    if key in self.save:
        # Already discovered; nothing to do.
        return
    self.save[key] = (url, False)
    self.save.sync()
    self.add_to_tbd(url)
def add_url(self, url):
    """Record *url* and append it to the download queue if new."""
    url = normalize(url)
    key = get_urlhash(url)
    if key in self.save:
        # Duplicate discovery; skip.
        return
    self.save[key] = (url, False)
    self.save.sync()
    self.to_be_downloaded.append(url)
def mark_url_complete(self, url):
    """Flag *url* as downloaded in the persistent store."""
    key = get_urlhash(url)
    if key not in self.save:
        # Completing an unknown URL indicates a bookkeeping bug upstream.
        self.logger.error(
            f"Completed url {url}, but have not seen it before.")
    # Record completion regardless, so the store converges.
    self.save[key] = (url, True)
    self.save.sync()
def add_url(self, parent_url, scraped_url):
    """Queue *scraped_url* (paired with its parent) if it is unseen."""
    normalized = normalize(scraped_url)
    key = get_urlhash(normalized)
    if key in self.save:
        return
    self.save[key] = (normalized, False)
    self.save.sync()
    # NOTE(review): the queue deliberately keeps the raw scraped_url,
    # not the normalized form — confirm downstream consumers expect this.
    self.to_be_downloaded.append((parent_url, scraped_url))
def add_url(self, url):
    """Thread-safely register *url* and push it onto the back queue."""
    with self.lock:
        canonical = normalize(url)
        key = get_urlhash(canonical)
        if key not in self.save:
            self.save[key] = (canonical, False)
            self.save.sync()
            self.add_to_backQueue(canonical)
def add_url(self, url):
    """Register *url* and enqueue it in its per-domain download queue.

    Fix: the original contained a bare ``self.delay_tracker`` expression
    statement whose result was discarded — a no-op left over from an
    edit. It has been removed; all other behavior is unchanged.
    """
    url = normalize(url)
    urlhash = get_urlhash(url)
    if urlhash not in self.save:
        self.save[urlhash] = (url, False)
        self.save.sync()
        # Route the URL into the queue for its domain so per-host
        # politeness delays can be enforced downstream.
        domain = Frontier.place_url_in_dom(url)
        self.to_be_downloaded[domain].put(url)
def add_url(self, url):
    """Add *url* if unseen; return True when it was newly queued."""
    url = normalize(url)
    key = get_urlhash(url)
    if key in self.save:
        return False
    self.save[key] = (url, False)
    self.save.sync()
    bucket = self._get_tbd_bucket(url)
    bucket.append(url)
    return True
def check_url_completed(self, url):
    """Return True if *url* is recorded as fully downloaded."""
    key = get_urlhash(normalize(url))
    entry = self.save.get(key)
    # Unknown URLs and known-but-pending ones both count as incomplete.
    return bool(entry and entry[1])
def add_url(self, url):
    """Register *url* and enqueue it for download if it is new.

    Fix: the save-store update now uses ``with self.lock`` so the lock
    is released even if ``sync()`` raises. The original manual
    acquire()/release() pair would have left the lock held forever on
    an exception, deadlocking every later caller.
    """
    url = normalize(url)
    urlhash = get_urlhash(url)
    if urlhash not in self.save:
        # `with` guarantees release on exception (sync() does disk I/O).
        with self.lock:
            self.save[urlhash] = (url, False)
            self.save.sync()
        # Queue put stays outside the lock, as in the original.
        self.to_be_downloaded.put(url)
def add_url(self, url):
    """Normalize and defragment *url*, then queue it if unseen."""
    # Doug - fragments are stripped so equivalent URLs hash identically.
    canonical = defragURL(normalize(url))
    key = get_urlhash(canonical)
    if key in self.save:
        return
    self.save[key] = (canonical, False)
    self.save.sync()
    self.to_be_downloaded.append(canonical)
def mark_url_complete(self, url, urlInfo):
    """Store *urlInfo* for *url*, logging if the URL was never seen."""
    key = get_urlhash(url)
    with self.save_lock:
        if key not in self.save:
            # Indicates a bookkeeping inconsistency upstream.
            self.logger.error(
                f"Completed url {url}, but have not seen it before.")
        self.save[key] = urlInfo
        self.save.sync()
def add_url(self, url):
    """Queue *url* for download unless it was already discovered."""
    canonical = normalize(url)
    key = get_urlhash(canonical)
    if key in self.save:
        return
    self.save[key] = (canonical, False)
    self.save.sync()
    self.to_be_downloaded.append(canonical)
def add_url(self, url):
    """Queue *url* if new, tallying it in the discovered-URL log."""
    url = normalize(url)
    key = get_urlhash(url)
    if key in self.save:
        return
    self.save[key] = (url, False)
    self.save.sync()
    self.discovered_urls[url] += 1
    # First sighting of this exact URL string also gets logged to disk.
    if self.discovered_urls[url] == 1:
        self.discovered_urls_text_file.write(url + '\n')
    self.to_be_downloaded.append(url)
def mark_url_complete(self, url):
    """Mark *url* done: acknowledge the queue item and persist the flag.

    Fix: the save-store update now uses ``with self.lock`` so the lock
    is released even if ``sync()`` raises. The original manual
    acquire()/release() pair would have left the lock held on an
    exception, deadlocking every later caller.
    """
    # Acknowledge the queue item first, as the original did.
    self.to_be_downloaded.task_done()
    urlhash = get_urlhash(url)
    if urlhash not in self.save:
        # This should not happen.
        self.logger.error(
            f"Completed url {url}, but have not seen it before.")
    with self.lock:
        self.save[urlhash] = (url, True)
        self.save.sync()
def mark_url_complete(self, url):
    """Persist the completed flag for the normalized form of *url*."""
    url = normalize(url)
    key = get_urlhash(url)
    if key not in self.save:
        # An unseen URL completing points to a tracking bug upstream.
        self.logger.error(
            f"Completed url {url}, but have not seen it before.")
    self.save[key] = (url, True)
    self.save.sync()
def add_url(self, url):
    """Queue *url* if unseen.

    Returns True for a newly discovered URL (worker.py uses this to
    count unique pages), False for a duplicate.
    """
    url = normalize(url)
    key = get_urlhash(url)
    if key in self.save:
        return False
    self.save[key] = (url, False)
    self.save.sync()
    self.to_be_downloaded.append(url)
    return True
def add_url(self, url):
    """Register *url* under the save lock and enqueue it if new."""
    url = normalize(url)
    key = get_urlhash(url)
    is_new = False
    # Keep the critical section minimal: only the store mutation is
    # locked; the queue put happens after the lock is released.
    with self.save_lock:
        if key not in self.save:
            self.save[key] = UrlInfo(url)
            self.save.sync()
            is_new = True
    if is_new:
        self.to_be_downloaded.put(url)