def crawl(self):
    if self.sitemap:
        page = http.get(self.sitemap)
        xmldoc = minidom.parseString(page.data.decode('utf-8'))
        sitemap_urls = xmldoc.getElementsByTagName('loc')
        for url in sitemap_urls:
            self.page_queue.append(self.get_text_from_xml(url.childNodes))

    self.page_queue.append(self.base_url)

    for url in self.page_queue:
        if url in self.crawled_urls:
            continue

        page = Page(url=url, base_domain=self.base_url)

        if page.parsed_url.netloc != page.base_domain.netloc:
            continue

        page.analyze()

        for w in page.wordcount:
            self.wordcount[w] += page.wordcount[w]

        for b in page.bigrams:
            self.bigrams[b] += page.bigrams[b]

        for t in page.trigrams:
            self.trigrams[t] += page.trigrams[t]

        self.page_queue.extend(page.links)
        self.crawled_pages.append(page)
        self.crawled_urls.add(page.url)
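The sitemap handling leans on xml.dom.minidom: each URL in an XML sitemap sits in a <loc> element, and its text has to be pulled out of that element's child nodes, which appears to be what get_text_from_xml does above. A standalone sketch of that extraction, using a hard-coded sitemap string instead of an HTTP response (not part of the original code):

from xml.dom import minidom

sitemap_xml = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/</loc></url>
  <url><loc>https://example.com/about</loc></url>
</urlset>"""

xmldoc = minidom.parseString(sitemap_xml)

# join the text nodes inside each <loc> element
urls = [
    ''.join(node.data for node in loc.childNodes if node.nodeType == node.TEXT_NODE)
    for loc in xmldoc.getElementsByTagName('loc')
]
print(urls)  # ['https://example.com/', 'https://example.com/about']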
def crawl(self):
    if self.sitemap:
        page = http.get(self.sitemap)

        # XML sitemaps list URLs in <loc> elements; plain-text sitemaps
        # list one URL per line.
        if self.sitemap.endswith('xml'):
            xmldoc = minidom.parseString(page.data.decode('utf-8'))
            sitemap_urls = xmldoc.getElementsByTagName('loc')
            for url in sitemap_urls:
                self.page_queue.append(self.get_text_from_xml(url.childNodes))
        elif self.sitemap.endswith('txt'):
            sitemap_urls = page.data.decode('utf-8').split('\n')
            for url in sitemap_urls:
                self.page_queue.append(url)

    self.page_queue.append(self.base_url)

    for url in self.page_queue:
        if url in self.crawled_urls:
            continue

        page = Page(
            url=url,
            base_domain=self.base_url,
            analyze_headings=self.analyze_headings,
            analyze_extra_tags=self.analyze_extra_tags,
        )

        # skip pages outside the base domain
        if page.parsed_url.netloc != page.base_domain.netloc:
            continue

        page.analyze()

        # group URLs by content hash so duplicate pages can be detected
        self.content_hashes[page.content_hash].add(page.url)

        # merge this page's token counts into the site-wide totals
        for w in page.wordcount:
            self.wordcount[w] += page.wordcount[w]

        for b in page.bigrams:
            self.bigrams[b] += page.bigrams[b]

        for t in page.trigrams:
            self.trigrams[t] += page.trigrams[t]

        self.page_queue.extend(page.links)
        self.crawled_pages.append(page)
        self.crawled_urls.add(page.url)

        # when follow_links is disabled, only the first page is analyzed
        if not self.follow_links:
            break
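Because crawl() records every URL under its page's content_hash, duplicate content falls out almost for free: any hash that maps to more than one URL points at pages serving identical markup. A minimal sketch of such a report, assuming content_hashes maps hashes to sets of URLs as the .add() call above implies; the duplicate_content helper is hypothetical, not part of the library:

def duplicate_content(content_hashes):
    """Return groups of URLs whose pages hashed to identical markup."""
    return [sorted(urls) for urls in content_hashes.values() if len(urls) > 1]

# example: two URLs sharing a hash are reported together
hashes = {
    'a3f1...': {'https://example.com/', 'https://example.com/index.html'},
    '9b2c...': {'https://example.com/about'},
}
print(duplicate_content(hashes))
# [['https://example.com/', 'https://example.com/index.html']]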
def analyze(self, raw_html=None):
    """
    Analyze the page and populate the warnings list
    """
    if not raw_html:
        valid_prefixes = []

        # only allow http://, https:// and protocol-relative // URLs
        for s in ['http://', 'https://', '//']:
            valid_prefixes.append(self.url.startswith(s))

        if True not in valid_prefixes:
            self.warn(f'{self.url} does not appear to have a valid protocol.')
            return

        if self.url.startswith('//'):
            self.url = f'{self.base_domain.scheme}:{self.url}'

        if self.parsed_url.netloc != self.base_domain.netloc:
            self.warn(f'{self.url} is not part of {self.base_domain.netloc}.')
            return

        try:
            page = http.get(self.url)
        except HTTPError as e:
            self.warn(f'Returned {e}')
            return

        encoding = 'ascii'

        if 'content-type' in page.headers:
            encoding = page.headers['content-type'].split('charset=')[-1]

        if encoding.lower() not in ('text/html', 'text/plain', 'utf-8'):
            # there is no unicode function in Python 3
            # try:
            #     raw_html = unicode(page.read(), encoding)
            # except:
            self.warn(f'Can not read {encoding}')
            return
        else:
            raw_html = page.data.decode('utf-8')

    # hash the raw markup so the crawler can spot duplicate content
    self.content_hash = hashlib.sha1(raw_html.encode('utf-8')).hexdigest()

    # remove comments, they screw with BeautifulSoup
    clean_html = re.sub(r'<!--.*?-->', r'', raw_html, flags=re.DOTALL)

    # one lowercased soup for case-insensitive lookups, one unmodified
    # soup so anchor hrefs and text keep their original casing
    soup_lower = BeautifulSoup(clean_html.lower(), 'html.parser')  # .encode('utf-8')
    soup_unmodified = BeautifulSoup(clean_html, 'html.parser')  # .encode('utf-8')

    texts = soup_lower.findAll(text=True)
    visible_text = [w for w in filter(self.visible_tags, texts)]

    self.process_text(visible_text)

    self.populate(soup_lower)

    self.analyze_title()
    self.analyze_description()
    self.analyze_og(soup_lower)
    self.analyze_a_tags(soup_unmodified)
    self.analyze_img_tags(soup_lower)
    self.analyze_h1_tags(soup_lower)

    return True
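The preprocessing step is the part worth noting: HTML comments are stripped up front because they interfere with BeautifulSoup's text extraction, and the page is parsed twice, once lowercased for case-insensitive tag lookups and once unmodified so link targets and anchor text keep their original casing. A standalone sketch of that preprocessing outside the Page class (variable names here are illustrative, not from the library):

import re
from bs4 import BeautifulSoup

html = """<html><head><title>Example</title></head>
<body><!-- navigation TODO --><H1>Hello</H1>
<a href="/About">About Us</a></body></html>"""

# strip HTML comments before parsing, as analyze() does
clean_html = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL)

soup_lower = BeautifulSoup(clean_html.lower(), 'html.parser')   # for lookups
soup_unmodified = BeautifulSoup(clean_html, 'html.parser')      # keeps original case

print(soup_lower.find('h1').text)         # 'hello'
print(soup_unmodified.find('a')['href'])  # '/About'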