def test_safe_url_domain(self):
    """Tests the methods to return request safe URLs and domains."""
    http_string = "http://mydomain.com"
    https_string = "https://mydomain.com"
    no_protocol_string = "mydomain.com"

    self.assertEqual(h.safe_url(http_string), http_string)
    self.assertEqual(h.safe_url(https_string), https_string)
    self.assertEqual(h.safe_url(no_protocol_string), https_string)

    self.assertEqual(h.get_domain(http_string), no_protocol_string)
    self.assertEqual(h.get_domain(https_string), no_protocol_string)
    self.assertEqual(h.get_domain(no_protocol_string), no_protocol_string)
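# For reference, a minimal sketch of `safe_url` and `get_domain` helpers that
# would satisfy the assertions above, assuming they live in the module imported
# as `h`; the repo's actual implementation may differ.
from urllib.parse import urlparse


def safe_url(url):
    """Return the URL with an explicit scheme, defaulting to HTTPS."""
    if url.startswith(("http://", "https://")):
        return url
    return "https://" + url


def get_domain(url):
    """Return the bare domain, stripping any http:// or https:// prefix."""
    parsed = urlparse(url if "//" in url else "//" + url)
    return parsed.netloc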
def insert_thumbs(self, rssid):
    # Clear any previously stored thumbnail data for this feed before re-inserting.
    self.db[self.COLLECTION].remove({'rssid': rssid}, safe=True)
    Site.objects(rssid=rssid).delete()

    domain = get_domain(rssid, connection=self.connection)
    site = Site(rssid=rssid, domain=domain)

    thumbs = self.get_thumbs(rssid, domain)
    try:
        colorific.color_mt(thumbs.items(), rssid, n=8)
    except Exception as e:
        raise e
def child_urls(self):
    """set: return the child page URLs originating from the same domain,
    without duplicates. Excludes the parent page.
    """
    urls = [url["href"] for url in self.soup.find_all("a", href=True)]
    for url in urls:
        domain = helpers.get_domain(url)
        if domain == self.domain:
            # Absolute link within the same domain.
            self._child_urls.add(helpers.safe_url(url))
        elif domain == "":
            # Relative link; prepend the parent domain.
            self._child_urls.add(helpers.safe_url(self.domain + url))
    return self._child_urls
def update_thumbs(self, rssid):
    domain = get_domain(rssid, connection=self.connection)
    site, created = Site.objects.get_or_create(rssid=rssid,
                                               defaults={'domain': domain})
    # Only fetch thumbnails newer than the last update for existing sites.
    if created:
        last_updated = None
    else:
        last_updated = site.last_updated

    thumbs = self.get_thumbs(rssid, domain, last_updated=last_updated)
    try:
        colorific.color_mt(thumbs.items(), rssid, n=8)
    except Exception as e:
        raise e
def __init__(self, url, parents_list, node_depth, id=None,
             parent_node=None, keyword=None):
    self.url = url
    self.parents_list = parents_list
    self.domain = get_domain(self.url)
    self.node_depth = int(node_depth)
    self.id = id
    self.parent_node = parent_node
    self.keyword = keyword
    self.found = False
def test_valid_domain_page(self):
    """Tests if the domain page follows the expected standards."""
    page = self.domain_page
    self.assertTrue(page.child_urls)
    self.assertEqual(page.domain, self.domain)

    # Verifies that there are no duplicate child pages
    self.assertEqual(len(page.child_urls), len(set(page.child_urls)))

    # Verifies that there are no child pages from different domains
    diff_domain_children = [
        child for child in page.child_urls
        if helpers.get_domain(child) != page.domain
    ]
    self.assertFalse(diff_domain_children)
def __init__(self, url, target_tag="div", target_class="productName", timeout=2):
    self.url = helpers.safe_url(url)
    self.target_tag = target_tag
    self.target_class = target_class
    self.timeout = timeout
    self._soup = ""
    self._child_urls = set()

    div = self.soup.find(self.target_tag, {"class": self.target_class})
    self.title = self.soup.find("title").text
    self.domain = helpers.get_domain(self.url)
    self.target_name = self._INVALID_TARGET if div is None else div.text

    # Dump the parsed tag once the target name has been extracted.
    del div
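# Since __init__ reads `self.soup` right after setting `self._soup` to an empty
# string, `soup` is presumably a lazily-built property. A minimal sketch of that
# pattern, assuming requests and BeautifulSoup are used to fetch and parse the
# page; the class name, parser choice, and fetching details are assumptions.
import requests
from bs4 import BeautifulSoup


class LazyPage:
    """Hypothetical illustration of a lazily-parsed page."""

    def __init__(self, url, timeout=2):
        self.url = url
        self.timeout = timeout
        self._soup = ""  # same sentinel used in the constructor above

    @property
    def soup(self):
        # Fetch and parse the page on first access, then cache the result.
        if not self._soup:
            response = requests.get(self.url, timeout=self.timeout)
            self._soup = BeautifulSoup(response.text, "html.parser")
        return self._soup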
def main():
    link = session.query(Link)\
        .filter(Link.visited_at == None)\
        .order_by(asc(Link.id))\
        .first()

    if link is None:
        print('Nothing to visit right now')
        return

    try:
        print('Trying to visit: {}'.format(link))
        r = requests.get(link.url, timeout=5)
        soup = BeautifulSoup(r.text, 'html.parser')

        # The request may have been redirected to a different domain.
        domain_on_redirect = get_domain(r.url)
        if not domain_exists(session, domain_on_redirect):
            print('Found new domain: {}'.format(domain_on_redirect))
            save(session, Domain(url=domain_on_redirect))
            print('Saved that new domain.')

        for site_url in set([o.get('href') for o in soup.find_all('a')]):
            if site_url is None:
                continue

            url = site_url
            if not is_url(site_url):
                # Relative link: resolve it against the link's domain.
                url = urljoin(get_domain(link.url), site_url)
            print('Found: {}'.format(url))

            l = session.query(Link)\
                .filter(Link.url == url).first()
            if l is not None:
                continue

            l = Link(url=url)
            domain = get_domain(l.url)
            domain_in_db = session.query(Domain)\
                .filter(Domain.url == domain)\
                .first()
            if domain_in_db is None:
                print('Found new domain: {}'.format(domain))
                domain_in_db = Domain(url=domain)
                save(session, domain_in_db)

            l.domain = domain_in_db
            save(session, l)
    except Exception as e:
        print('Something went wrong')
        print(e)
    finally:
        link.visited_at = datetime.now()
        save(session, link)
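# main() relies on a few helpers that are not shown here (is_url, domain_exists,
# save). One plausible shape for them, consistent with how they are called above;
# the bodies below are assumptions, not the repo's actual code, and `Domain` and
# `session` come from the repo's SQLAlchemy models/setup.
from urllib.parse import urlparse


def is_url(value):
    """Assumed check: True when the string looks like an absolute http(s) URL."""
    parsed = urlparse(value)
    return parsed.scheme in ("http", "https") and bool(parsed.netloc)


def domain_exists(session, domain_url):
    """Assumed query: True if a Domain row with this URL is already stored."""
    return session.query(Domain).filter(Domain.url == domain_url).first() is not None


def save(session, obj):
    """Assumed behaviour: add the object to the session and commit immediately."""
    session.add(obj)
    session.commit()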