def __init__(self, file, encoding=settings.default_encoding, mode='wb', unique=False, unique_by=None, quoting=csv.QUOTE_ALL, utf8_bom=False, auto_repair=False, **argv):
    self.encoding = encoding
    self.unique = unique
    self.unique_by = unique_by
    if hasattr(file, 'write'):
        self.fp = file
    else:
        if auto_repair:
            self._remove_invalid_rows(file=file, quoting=quoting, **argv)
        if utf8_bom:
            # write the UTF-8 byte order mark first, then reopen for appending
            self.fp = open(file, 'wb')
            self.fp.write('\xef\xbb\xbf')
            self.fp.close()
            self.fp = open(file, mode=mode.replace('w', 'a'))
        else:
            self.fp = open(file, mode)
    if self.unique:
        # cache the rows that have already been written
        self.rows = adt.HashDict()
        for row in csv.reader(open(self.fp.name)):
            self.rows[self._unique_key(row)] = True
    self.writer = csv.writer(self.fp, quoting=quoting, **argv)
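
# Illustrative usage sketch (not library code), assuming this constructor belongs to
# common.UnicodeWriter as the callbacks below suggest. With unique=True the existing
# rows are cached first, so re-running a crawl appends only rows not already present.
writer = common.UnicodeWriter('output.csv', mode='ab', unique=True)
writer.writerow(['name', 'email'])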
def __init__(self, output_file=None, max_links=100, max_depth=1, allowed_urls='', banned_urls='^$', robots=None, crawl_existing=True):
    """
    output_file: where to save scraped data
    max_links: the maximum number of links to follow per page
    max_depth: the maximum depth to follow links into the website (use None for no limit)
    allowed_urls: a regex for allowed urls, defaults to all urls
    banned_urls: a regex for banned urls, defaults to no urls
    robots: a RobotFileParser object used to determine which urls are allowed to be crawled
    crawl_existing: whether to crawl content already downloaded to the cache
    """
    self.found = adt.HashDict(int) # track depth of found URLs
    if output_file:
        self.writer = common.UnicodeWriter(output_file)
    else:
        self.writer = None
    self.max_links = max_links
    self.max_depth = max_depth
    self.allowed_urls = re.compile(allowed_urls)
    self.banned_urls = re.compile(banned_urls)
    self.robots = robots
    self.crawl_existing = crawl_existing
def get_emails(self, website, max_depth=1, max_urls=10, max_emails=1):
    """Crawl this website and return all emails found

    website: the URL of the website to crawl
    max_depth: how many links deep to follow before stopping the crawl
    max_urls: how many URLs to download before stopping the crawl
    max_emails: the maximum number of emails to extract before stopping the crawl.
        If None then extract all emails found in the crawl.
    """
    def score(link):
        """Return how valuable this link is for ordering the crawl (lower is better)
        """
        link = link.lower()
        total = 0
        if 'contact' in link:
            pass # this page is top priority
        elif 'about' in link:
            total += 10
        elif 'help' in link:
            total += 20
        else:
            # generic page
            total += 100
        # bias towards shorter links
        total += len(link)
        return total

    domain = urlparse.urlparse(website).netloc
    scraped = adt.HashDict()
    c = CrawlerCallback(max_depth=max_depth)
    outstanding = [(0, website)] # list of URLs and their score
    emails = []
    while outstanding and (max_urls is None or len(scraped) < max_urls) \
            and (max_emails is None or len(emails) < max_emails):
        _, url = outstanding.pop(0)
        scraped[url] = True
        html = self.get(url)
        if html:
            for email in alg.extract_emails(html):
                if email not in emails:
                    emails.append(email)
                    if len(emails) == max_emails:
                        break
            # crawl the linked URLs
            for link in c.crawl(self, url, html):
                if urlparse.urlparse(link).netloc == domain:
                    if link not in scraped:
                        outstanding.append((score(link), link))
            # sort based on score to crawl the most promising first
            outstanding.sort()
    return list(emails)
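
# Illustrative usage sketch (not library code): calling the scored crawl above,
# assuming the method lives on the Download class referenced elsewhere in these
# snippets. The scoring means URLs containing 'contact' are fetched before 'about'
# and 'help' pages, with generic pages last.
D = download.Download()
found_emails = D.get_emails('http://example.com', max_depth=1, max_urls=10, max_emails=1)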
def find(self, website, max_depth, max_urls, max_results):
    """
    website: the URL of the website to crawl
    max_depth: how many links deep to follow before stopping the crawl
    max_urls: how many URLs to download before stopping the crawl
    max_results: the maximum number of results to extract before stopping the crawl.
        If None then extract all results found in the crawl.
    """
    # check for redirect URL
    self.D.get(website)
    redirect_url = self.D.cache.meta(website).get('url') if self.D.cache else self.final_url
    website = redirect_url or website
    domain = urlparse.urlparse(website).netloc
    scraped = adt.HashDict()
    c = CrawlerCallback(max_depth=max_depth)
    outstanding = [(0, website)] # list of URLs and their score
    results = []
    while outstanding and (max_urls is None or len(scraped) < max_urls) \
            and (max_results is None or len(results) < max_results):
        _, url = outstanding.pop(0)
        scraped[url] = True
        html = self.D.get(url, num_retries=0)
        if html:
            for result in self.extract_fn(html):
                if result not in results:
                    results.append(result)
                    if len(results) == max_results:
                        break
            # crawl the linked URLs
            for link in c.crawl(self, url, html):
                if urlparse.urlparse(link).netloc == domain:
                    if link not in scraped:
                        # insert sort this new record so the most promising is crawled first
                        score = self.link_score(link)
                        for i, (other_score, other_link) in enumerate(outstanding):
                            if score < other_score:
                                outstanding.insert(i, (score, link))
                                break
                        else:
                            outstanding.append((score, link))
    return results
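
# Design note: the loop above keeps `outstanding` sorted by doing a linear insertion
# for each new link. A heap gives the same "crawl most promising first" behaviour in
# O(log n) per insert; this is only an alternative sketch, not the library's code.
import heapq

def push_link(outstanding, score, link):
    # heapq keeps the smallest (best) score at outstanding[0]
    heapq.heappush(outstanding, (score, link))

def pop_link(outstanding):
    # replaces outstanding.pop(0) in the crawl loop
    score, link = heapq.heappop(outstanding)
    return link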
def __init__(self, url=None, urls=None, url_iter=None, num_threads=20, cb=None, depth=True, max_errors=None, pattern=None, **kwargs):
    self.settings = adt.Bag(
        read_cache=True,
        write_cache=True,
        num_redirects=5,
        num_retries=2,
        timeout=20,
        headers={},
        num_threads=num_threads,
        cb=cb,
        url_iter=url_iter,
        depth=depth,
        pattern=pattern
    )
    self.settings.update(**kwargs)
    self.D = download.Download(**kwargs)
    self.kwargs = kwargs
    # queue of html to be written to cache
    self.cache_queue = []
    # URLs that are waiting to download
    self.download_queue = collections.deque()
    if urls:
        self.download_queue.extend(urls)
    if url:
        self.download_queue.append(url) # XXX create compressed dict data type for large in memory?
    # URLs currently downloading
    self.processing = {}
    # deferreds that are downloading
    self.downloading = []
    # URLs that have been found before
    self.found = adt.HashDict()
    for url in self.download_queue:
        self.found[url] = True
    self.state = download.State()
    self.max_errors = max_errors
    self.num_errors = 0 # counter for the number of consecutive errors
def get_emails(self, website, max_depth=1, max_urls=None, max_emails=None):
    """Crawl this website and return all emails found
    """
    scraped = adt.HashDict()
    c = CrawlerCallback(max_depth=max_depth)
    outstanding = collections.deque([website])
    emails = []
    while outstanding and (max_urls is None or len(scraped) < max_urls) \
            and (max_emails is None or len(emails) < max_emails):
        url = outstanding.popleft()
        scraped[url] = True
        html = self.get(url, delay=1)
        if html:
            for email in alg.extract_emails(html):
                if email not in emails:
                    emails.append(email)
                    if len(emails) == max_emails:
                        break
            outstanding.extend(c.crawl(self, url, html))
    return list(emails)
def get_tag(html):
    """Find tag type at this location

    >>> get_tag('<div>abc</div>')
    'div'
    >>> get_tag(' <div>')
    >>> get_tag('div')
    """
    match = tag_regex.match(html)
    if match:
        return match.groups()[0]
    else:
        return None


splits = adt.HashDict()

def split_tag(html):
    """Extract starting tag and contents from HTML

    >>> [str(s) for s in split_tag('<div>abc<div>def</div>abc</div>ghi<div>jkl</div>')]
    ['<div>abc<div>def</div>abc</div>', 'ghi<div>jkl</div>']
    >>> [str(s) for s in split_tag('<br /><div>abc</div>')]
    ['<br />', '<div>abc</div>']
    >>> [str(s) for s in split_tag('<div>abc<div>def</div>abc</span>')]
    ['<div>abc<div>def</div>abc</span></div>', '']
    """
    if html in splits:
        i, tag = splits[html]
    else:
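
# `tag_regex` is referenced above but not defined in this excerpt. A minimal pattern
# consistent with the doctests (matches '<div>' -> 'div', returns None for leading
# whitespace or bare text) could look like this; the library's actual regex may differ.
import re
tag_regex = re.compile(r'<([\w:]+)')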
if hasattr(e, 'code'):
    self.response_code = str(e.code)
if hasattr(e, 'read'):
    try:
        self.error_content = e.read()
    except Exception, e:
        # so many kinds of errors are possible here so just catch them all
        self.error_content = ''
common.logger.warning('Download error: %s %s' % (url, e))
if self.settings.acceptable_errors and self.response_code in self.settings.acceptable_errors:
    content, self.final_url = self.settings.default, url
else:
    content, self.final_url = None, url
return content


_domains = adt.HashDict()

def throttle(self, url, delay, proxy=None, variance=0.5):
    """Delay a minimum time for each domain per proxy by storing the last access time

    url: what is intended to be downloaded
    delay: the minimum amount of time (in seconds) to wait after downloading content from this domain
    proxy: the proxy to download through
    variance: the amount of randomness in the delay, 0-1
    """
    if delay > 0:
        key = ':'.join([
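
# The throttle body is cut off above. A minimal standalone sketch of the same idea,
# assuming the key joins the proxy with the URL's domain and that last access times
# are kept per key (all names below are illustrative, not the library's):
import random
import time
import urlparse

def simple_throttle(last_access, url, delay, proxy=None, variance=0.5):
    """Sleep so requests to one domain (per proxy) are at least `delay` seconds apart."""
    if delay > 0:
        key = ':'.join([str(proxy), urlparse.urlparse(url).netloc])
        # randomise the wait within +/- variance so the crawl looks less robotic
        wait = delay * (1 + variance * (random.random() * 2 - 1))
        if key in last_access:
            elapsed = time.time() - last_access[key]
            if elapsed < wait:
                time.sleep(wait - elapsed)
        last_access[key] = time.time()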
class CrawlerCallback(ThreadedCallback):
    """Example callback to crawl the website
    """
    found = adt.HashDict(int) # track depth of found URLs

    def __init__(self, output_file=None, max_links=100, max_depth=1, allowed_urls='', banned_urls='^$', robots=None, crawl_existing=True):
        """
        `output_file' is where to save scraped data
        `max_links' is the maximum number of links to follow per page
        `max_depth' is the maximum depth to follow links into the website (use None for no limit)
        `allowed_urls' is a regex for allowed urls, defaults to all urls
        `banned_urls' is a regex for banned urls, defaults to no urls
        `robots' is a RobotFileParser object used to determine which urls are allowed to be crawled
        `crawl_existing' sets whether to crawl content already downloaded to the cache
        """
        if output_file:
            self.writer = common.UnicodeWriter(output_file)
        else:
            self.writer = None
        self.max_links = max_links
        self.max_depth = max_depth
        self.allowed_urls = re.compile(allowed_urls)
        self.banned_urls = re.compile(banned_urls)
        self.robots = robots
        self.crawl_existing = crawl_existing

    def crawl(self, D, url, html):
        """Crawl website html and return list of URLs crawled
        """
        def normalize(link):
            """Normalize the link to avoid duplicates
            """
            if '#' in link:
                # remove internal links to avoid duplicates
                link = link[:link.index('#')]
            link = common.unescape(link) # remove & from link
            return urlparse.urljoin(url, link) # support relative links

        def valid(link):
            """Check if should crawl this link
            """
            # check if a media file
            if common.get_extension(link) not in common.MEDIA_EXTENSIONS:
                # check if a proper HTTP link
                if link.lower().startswith('http'):
                    # only crawl within website
                    if common.same_domain(domain, link):
                        # passes regex
                        if self.allowed_urls.match(link) and not self.banned_urls.match(link):
                            # not blocked by robots.txt
                            if not self.robots or self.robots.can_fetch(settings.user_agent, link):
                                # allowed to recrawl
                                if self.crawl_existing or (D.cache and link not in D.cache):
                                    return True
            return False

        domain = common.get_domain(url)
        depth = CrawlerCallback.found[url]
        outstanding = []
        if depth != self.max_depth:
            # extract links to continue crawling
            links_re = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
            for link in links_re.findall(html):
                link = normalize(link)
                if link not in CrawlerCallback.found:
                    CrawlerCallback.found[link] = depth + 1
                    if valid(link):
                        # is a new link
                        outstanding.append(link)
                        if len(outstanding) == self.max_links:
                            break
        return outstanding
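
# Illustrative usage sketch (not library code): driving CrawlerCallback by hand with
# a Download object from the snippets above. The breadth-first loop is an assumption
# about intended use, not the library's own crawl driver.
D = download.Download()
cb = CrawlerCallback(max_depth=2, max_links=50)
queue = collections.deque(['http://example.com'])
while queue:
    url = queue.popleft()
    html = D.get(url)
    if html:
        # crawl() returns the new in-domain links found on this page
        queue.extend(cb.crawl(D, url, html))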
class StateCallback(ThreadedCallback):
    """Example callback that saves state
    """
    active_urls = set()
    found = adt.HashDict() # track found URLs

    def __init__(self, output_file, header):
        # load state from previous run, if exists
        state = self.load_state()
        # settings to start crawl from beginning
        self.new_urls = False
        write_header = True
        mode = 'wb'
        if StateCallback.active_urls:
            # incomplete crawl
            common.logger.info('Loading previous crawl state')
            self.new_urls = True
            if os.path.exists(output_file):
                mode = 'ab'
                write_header = False
        self.writer = common.UnicodeWriter(output_file, mode=mode)
        if write_header:
            self.writer.writerow(header)

    def __call__(self, D, url, html):
        if self.new_urls:
            # restoring state so can ignore the starting url
            # instead return urls previously in queue
            self.new_urls = False
            new_urls = StateCallback.active_urls
        else:
            self.scrape(D, url, html)
            new_urls = self.crawl(D, url, html)
        # add newly scraped urls
        StateCallback.active_urls.update(new_urls)
        # this url has already been processed
        StateCallback.active_urls.discard(url)
        # save state in thread
        thread.start_new_thread(self.save_state, tuple())
        return new_urls

    def save_state(self, output_file='.state.pickle'):
        """Save state of current crawl to pickle file
        """
        # to ensure atomic write save state to temporary file first and then rename
        pickled_data = pickle.dumps(dict(urls=StateCallback.active_urls, found=StateCallback.found))
        tmp_file = tempfile.NamedTemporaryFile(prefix=output_file + '.').name
        fp = open(tmp_file, 'wb')
        fp.write(pickled_data)
        # ensure all content is written to disk
        fp.flush()
        os.fsync(fp.fileno())
        fp.close()
        # XXX error on Windows if dest exists
        os.rename(tmp_file, output_file)

    def load_state(self, input_file='.state.pickle'):
        """Load previous state from pickle file
        """
        if os.path.exists(input_file):
            data = pickle.load(open(input_file))
            StateCallback.active_urls.update(data.get('urls', []))
            StateCallback.found = data.get('found', StateCallback.found)
        else:
            data = {}
        return data
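
# The XXX comment in save_state() notes that os.rename() fails on Windows when the
# destination already exists. A minimal sketch of one common Python 2 workaround
# (remove the destination first, accepting a brief window with no state file);
# names here are illustrative only:
import os

def rename_over(tmp_file, output_file):
    try:
        os.rename(tmp_file, output_file)
    except OSError:
        # Windows cannot rename over an existing file
        os.remove(output_file)
        os.rename(tmp_file, output_file)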