Code example #1
    def test_safe_url_domain(self):
        """Tests the methods to return request safe URLs and domains."""
        http_string = "http://mydomain.com"
        https_string = "https://mydomain.com"
        no_protocol_string = "mydomain.com"

        self.assertEqual(h.safe_url(http_string), http_string)
        self.assertEqual(h.safe_url(https_string), https_string)
        self.assertEqual(h.safe_url(no_protocol_string), https_string)

        self.assertEqual(h.get_domain(http_string), no_protocol_string)
        self.assertEqual(h.get_domain(https_string), no_protocol_string)
        self.assertEqual(h.get_domain(no_protocol_string), no_protocol_string)
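
Read together, these assertions pin down the two helpers' contract: safe_url returns http:// and https:// URLs unchanged and prefixes bare domains with https://, while get_domain strips the scheme and keeps the host. A minimal sketch consistent with the assertions (hypothetical, not the project's actual implementation) could look like this:

from urllib.parse import urlparse

def safe_url(url):
    # Hypothetical: default scheme-less URLs to HTTPS, pass the rest through.
    if url.startswith(('http://', 'https://')):
        return url
    return 'https://' + url

def get_domain(url):
    # Hypothetical: normalize first, then keep only the host part.
    # Returns "" for relative paths such as "/about", matching the
    # domain == "" branch used in the crawler examples below.
    return urlparse(safe_url(url)).netloc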
Code example #2
File: fetcher.py Project: brishin/nSquared
 def insert_thumbs(self, rssid):
     # Clear any thumbs already stored for this feed, then rebuild them.
     self.db[self.COLLECTION].remove({'rssid': rssid}, safe=True)  # legacy pymongo write API
     Site.objects(rssid=rssid).delete()
     domain = get_domain(rssid, connection=self.connection)
     site = Site(rssid=rssid, domain=domain)
     thumbs = self.get_thumbs(rssid, domain)
     try:
         colorific.color_mt(thumbs.items(), rssid, n=8)
     except Exception as e:
         raise e
Code example #3
File: webpage.py Project: carlosfem/crawler
 def child_urls(self):
     """set: all child URLs originating from the same domain, with
     duplicates removed. Excludes the parent page.
     """
     urls = [url["href"] for url in self.soup.find_all("a", href=True)]
     for url in urls:
         domain = helpers.get_domain(url)
         if domain == self.domain:
             self._child_urls.add(helpers.safe_url(url))
         elif domain == "":
             # Relative link: prepend this page's domain to make it absolute.
             self._child_urls.add(helpers.safe_url(self.domain + url))
     return self._child_urls
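
The string concatenation in the relative-link branch assumes root-relative hrefs such as "/page". A sketch using the standard library's urljoin, which also resolves forms like "./page" and "../page", might look like this (resolve_relative is a hypothetical helper, not part of the project):

from urllib.parse import urljoin

def resolve_relative(domain, href):
    # Hypothetical helper mirroring the https-prefixing behavior of safe_url.
    base = domain if domain.startswith(('http://', 'https://')) else 'https://' + domain
    return urljoin(base, href)

# resolve_relative('mydomain.com', '/about') -> 'https://mydomain.com/about'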
Code example #4
File: fetcher.py Project: brishin/nSquared
 def update_thumbs(self, rssid):
     domain = get_domain(rssid, connection=self.connection)
     # Reuse the stored Site if it exists; otherwise create it.
     site, created = Site.objects.get_or_create(rssid=rssid,
                                                defaults={'domain': domain})
     if created:
         last_updated = None
     else:
         last_updated = site.last_updated
     # Only fetch thumbs newer than the last recorded update.
     thumbs = self.get_thumbs(rssid, domain, last_updated=last_updated)
     try:
         colorific.color_mt(thumbs.items(), rssid, n=8)
     except Exception as e:
         raise e
Code example #5
File: Graph.py Project: ngntrn/visual_web_crawler
 def __init__(self,
              url,
              parents_list,
              node_depth,
              id=None,
              parent_node=None,
              keyword=None):
     self.url = url
     self.parents_list = parents_list
     self.domain = get_domain(self.url)
     self.node_depth = int(node_depth)
     self.id = id
     self.parent_node = parent_node
     self.keyword = keyword
     self.found = False
Code example #6
File: core_unit_tests.py Project: carlosfem/crawler
    def test_valid_domain_page(self):
        """Tests if the domain page follows the expected standards"""
        page = self.domain_page

        self.assertTrue(page.child_urls)
        self.assertEqual(page.domain, self.domain)

        # Verifies that there are no duplicate child pages
        self.assertEqual(len(page.child_urls), len(set(page.child_urls)))

        # Verifies that there are no child pages from different domains
        diff_domain_children = [
            child for child in page.child_urls
            if helpers.get_domain(child) != page.domain
        ]
        self.assertFalse(diff_domain_children)
Code example #7
File: webpage.py Project: carlosfem/crawler
    def __init__(self,
                 url,
                 target_tag="div",
                 target_class="productName",
                 timeout=2):

        self.url = helpers.safe_url(url)
        self.target_tag = target_tag
        self.target_class = target_class
        self.timeout = timeout

        self._soup = ""
        self._child_urls = set()

        div = self.soup.find(self.target_tag, {"class": self.target_class})

        self.title = self.soup.find("title").text
        self.domain = helpers.get_domain(self.url)
        self.target_name = self._INVALID_TARGET if div is None else div.text

        # dump
        del div
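
The constructor assigns an empty _soup but immediately reads self.soup, which implies soup is a lazy property that downloads and parses the page on first access. A minimal sketch of what that property might look like, assuming requests and BeautifulSoup (the project's actual code may differ):

import requests
from bs4 import BeautifulSoup

# Inside the WebPage class:
@property
def soup(self):
    # Hypothetical lazy parser: fetch the page once, then cache the result.
    if not self._soup:
        response = requests.get(self.url, timeout=self.timeout)
        self._soup = BeautifulSoup(response.text, "html.parser")
    return self._soup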
Code example #8
def main():
    link = session.query(Link)\
            .filter(Link.visited_at == None)\
            .order_by(asc(Link.id))\
            .first()

    if link is None:
        print('Nothing to visit right now')
        return

    try:
        print('Trying to visit: {}'.format(link))

        r = requests.get(link.url, timeout=5)
        soup = BeautifulSoup(r.text, 'html.parser')

        domain_on_redirect = get_domain(r.url)

        if not domain_exists(session, domain_on_redirect):
            print('Found new domain: {}'.format(domain_on_redirect))
            save(session, Domain(url=domain_on_redirect))
            print('Saved that new domain.')

        # Deduplicate hrefs before processing.
        for site_url in {o.get('href') for o in soup.find_all('a')}:

            if site_url is None:
                continue

            url = site_url

            if not is_url(site_url):
                # Relative link: resolve it against the source page's domain.
                url = urljoin(get_domain(link.url), site_url)

            print('Found: {}'.format(url))

            # Skip links that are already stored.
            l = session.query(Link)\
                       .filter(Link.url == url).first()

            if l is not None:
                continue

            l = Link(url=url)
            domain = get_domain(l.url)

            domain_in_db = session.query(Domain)\
                                  .filter(Domain.url == domain)\
                                  .first()

            if domain_in_db is None:
                print('Found new domain: {}'.format(domain))
                domain_in_db = Domain(url=domain)
                save(session, domain_in_db)

            l.domain = domain_in_db
            save(session, l)
    except Exception as e:
        print('Something went wrong')
        print(e)
    finally:
        link.visited_at = datetime.now()
        save(session, link)
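
The function relies on module-level names defined elsewhere in the project: session, Link, Domain, save, domain_exists, and is_url. A minimal sketch of the assumed setup (all definitions below are hypothetical reconstructions, assuming SQLAlchemy 1.4+, not the project's actual code) that would let the snippet run:

from sqlalchemy import Column, DateTime, ForeignKey, Integer, String, create_engine
from sqlalchemy.orm import declarative_base, relationship, sessionmaker

Base = declarative_base()

class Domain(Base):
    __tablename__ = 'domains'
    id = Column(Integer, primary_key=True)
    url = Column(String, unique=True)

class Link(Base):
    __tablename__ = 'links'
    id = Column(Integer, primary_key=True)
    url = Column(String, unique=True)
    visited_at = Column(DateTime, nullable=True)
    domain_id = Column(Integer, ForeignKey('domains.id'))
    domain = relationship(Domain)

engine = create_engine('sqlite:///crawler.db')
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()

def save(session, obj):
    # Hypothetical helper: persist a single object immediately.
    session.add(obj)
    session.commit()

def domain_exists(session, url):
    # Hypothetical helper: is this domain URL already stored?
    return session.query(Domain).filter(Domain.url == url).first() is not None

def is_url(value):
    # Hypothetical helper: treat only scheme-qualified strings as absolute URLs.
    return value.startswith(('http://', 'https://'))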