Example #2
    def whois(self, url, timeout=10):
        """Return text of this whois query
        """
        domain = common.get_domain(url)
        if domain:
            text = ''
            key = 'whois_%s' % domain
            try:
                if self.cache:
                    text = self.cache[key]
                else:
                    raise KeyError()
            except KeyError:
                # try local whois command
                r = subprocess.Popen(['whois', domain], stdout=subprocess.PIPE)
                start = time.time()
                while r.poll() is None:
                    time.sleep(0.5)
                    if time.time() - start > timeout:
                        try:
                            r.kill()
                        except Exception:
                            pass
                        break
                if r.poll() != 1:
                    text = r.communicate()[0]

                if '@' in text:
                    if self.cache:
                        self.cache[key] = text
            return text
    def throttle(self, url, delay, proxy=None, variance=0.5):
        """Delay a minimum time for each domain per proxy by storing last access time

        `url' is what intend to download
        `delay' is the minimum amount of time (in seconds) to wait after downloading content from this domain
        `proxy' is the proxy to download through
        `variance' is the amount of randomness in delay, 0-1
        """
        key = str(proxy) + ":" + common.get_domain(url)
        start = datetime.now()
        while datetime.now() < Download.domains.get(key, start):
            time.sleep(SLEEP_TIME)
        # update domain timestamp to when can query next
        Download.domains[key] = datetime.now() + timedelta(seconds=delay * (1 + variance * (random.random() - 0.5)))
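
The jitter in the last line above keeps requests from firing on an exact cadence: delay * (1 + variance * (random.random() - 0.5)) spreads each wait uniformly between delay * (1 - variance / 2) and delay * (1 + variance / 2). A minimal standalone sketch of just that calculation (the helper name and the 5-second delay are illustrative, not taken from the examples):

import random

def jittered_delay(delay, variance=0.5):
    """Return a wait time spread uniformly around `delay'."""
    # random.random() - 0.5 is uniform on [-0.5, 0.5), so the result lies in
    # [delay * (1 - variance / 2), delay * (1 + variance / 2))
    return delay * (1 + variance * (random.random() - 0.5))

# e.g. delay=5, variance=0.5 gives a wait between 3.75 and 6.25 seconds
print(jittered_delay(5))
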
Example #4
    def crawl(self, D, url, html):
        """Crawl website html and return list of URLs crawled
        """
        def normalize(link):
            """Normalize the link to avoid duplicates
            """
            if '#' in link:
                # remove internal links to avoid duplicates
                link = link[:link.index('#')]
            link = common.unescape(link)  # remove &amp; from link
            return urlparse.urljoin(url, link)  # support relative links

        def valid(link):
            """Check if should crawl this link
            """
            # check if a media file
            if common.get_extension(link) not in common.MEDIA_EXTENSIONS:
                # check if a proper HTTP link
                if link.lower().startswith('http'):
                    # only crawl within website
                    if common.same_domain(domain, link):
                        # passes regex
                        if self.allowed_urls.match(link) and not self.banned_urls.match(link):
                            # not blocked by robots.txt
                            if not self.robots or self.robots.can_fetch(settings.user_agent, link):
                                # allowed to recrawl
                                if self.crawl_existing or (D.cache and link not in D.cache):
                                    return True
            return False

        domain = common.get_domain(url)
        depth = CrawlerCallback.found[url]
        outstanding = []
        if depth != self.max_depth:
            # extract links to continue crawling
            links_re = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
            for link in links_re.findall(html):
                link = normalize(link)
                if link not in CrawlerCallback.found:
                    CrawlerCallback.found[link] = depth + 1
                    if valid(link):
                        # is a new link
                        outstanding.append(link)
                        if len(outstanding) == self.max_links:
                            break
        return outstanding
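
Link extraction above is a single regular expression over the raw HTML followed by normalization (drop the #fragment, unescape entities, resolve relative paths). A minimal sketch of just the regex and fragment-stripping steps on a made-up snippet of markup (common.unescape and the relative-link resolution are left out to keep it self-contained):

import re

LINKS_RE = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)

html = '<a href="/about">About</a> <A HREF="http://example.com/page#intro">Page</A>'
for link in LINKS_RE.findall(html):
    # strip internal fragments so /page#a and /page#b count as the same URL
    if '#' in link:
        link = link[:link.index('#')]
    print(link)
# prints /about, then http://example.com/page
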
Example #5
    def throttle(self, url, delay, proxy=None, variance=0.5):
        """Delay a minimum time for each domain per proxy by storing last access time

        `url' is what intend to download
        `delay' is the minimum amount of time (in seconds) to wait after downloading content from this domain
        `proxy' is the proxy to download through
        `variance' is the amount of randomness in delay, 0-1
        """
        if delay > 0:
            key = str(proxy) + ':' + common.get_domain(url)
            start = datetime.datetime.now()
            while datetime.datetime.now() < Download.domains.get(key, start):
                time.sleep(SLEEP_TIME)
            # update domain timestamp to when can query next
            Download.domains[key] = datetime.datetime.now() + datetime.timedelta(seconds=delay * (1 + variance * (random.random() - 0.5)))
Example #6
    def crawl(self, D, url, html): 
        """Crawl website html and return list of URLs crawled
        """
        def normalize(link):
            """Normalize the link to avoid duplicates
            """
            if '#' in link:
                # remove internal links to avoid duplicates
                link = link[:link.index('#')] 
            link = common.unescape(link) # remove &amp; from link
            return urlparse.urljoin(url, link) # support relative links

        def valid(link):
            """Check if should crawl this link
            """
            # check if a media file
            if common.get_extension(link) not in common.MEDIA_EXTENSIONS:
                # check if a proper HTTP link
                if link.lower().startswith('http'):
                    # only crawl within website
                    if common.same_domain(domain, link):
                        # passes regex
                        if self.allowed_urls.match(link) and not self.banned_urls.match(link):
                            # not blocked by robots.txt
                            if not self.robots or self.robots.can_fetch(settings.user_agent, link):
                                # allowed to recrawl
                                if self.crawl_existing or (D.cache and link not in D.cache):
                                    return True
            return False


        domain = common.get_domain(url)
        depth = CrawlerCallback.found[url]
        outstanding = []
        if depth != self.max_depth: 
            # extract links to continue crawling
            links_re = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
            for link in links_re.findall(html):
                link = normalize(link)
                if link not in CrawlerCallback.found:
                    CrawlerCallback.found[link] = depth + 1
                    if valid(link):
                        # is a new link
                        outstanding.append(link)
                        if len(outstanding) == self.max_links:
                            break
        return outstanding
Example #7
    def crawl(self, D, url, html):
        """Crawl website html and return list of URLs crawled
        """
        def valid(link):
            """Check if should crawl this link
            """
            # check if a media file
            if common.get_extension(link) not in common.MEDIA_EXTENSIONS:
                # check if a proper HTTP link
                if link.lower().startswith('http'):
                    # only crawl within website
                    if common.same_domain(domain, link):
                        # passes regex
                        if self.allowed_urls.match(link) and not self.banned_urls.match(link):
                            # not blocked by robots.txt
                            if not self.robots or self.robots.can_fetch(settings.user_agent, link):
                                # allowed to recrawl
                                if self.crawl_existing or (D.cache and link not in D.cache):
                                    return True
            return False

        domain = common.get_domain(url)
        depth = self.found[url]
        outstanding = []
        if depth != self.max_depth:
            # extract links to continue crawling
            links_re = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
            for link in links_re.findall(html):
                try:
                    link = self.normalize(url, link)
                except UnicodeDecodeError as e:
                    # unicode error when joining url
                    common.logger.info(e)
                else:
                    if link not in self.found:
                        self.found[link] = depth + 1
                        if valid(link):
                            # is a new link
                            outstanding.append(link)
                            if len(outstanding) == self.max_links:
                                break
        return outstanding
Example #8
    def whois(self, url, timeout=10):
        """Query whois info
        """
        domain = common.get_domain(url)
        if domain:
            text = ''
            key = 'whois_%s' % domain
            try:
                if self.cache:
                    text = self.cache[key]
                else:
                    raise KeyError()
            except KeyError:
                # try online whois app
                query_url = 'http://whois.chinaz.com/%s' % domain
                html = self.get(query_url)
                match = re.compile(
                    "<script src='(request.aspx\?domain=.*?)'></script>"
                ).search(html)
                if match:
                    script_url = urlparse.urljoin(query_url, match.groups()[0])
                    text = self.get(script_url, read_cache=False)

                if '@' not in text:
                    if self.cache:
                        del self.cache[query_url]
                    # failed, so try local whois command
                    r = subprocess.Popen(['whois', domain],
                                         stdout=subprocess.PIPE)
                    start = time.time()
                    while r.poll() is None:
                        time.sleep(0.5)
                        if time.time() - start > timeout:
                            try:
                                r.kill()
                            except Exception:
                                pass
                            break
                    if r.poll() != 1:
                        text = r.communicate()[0]

                if '@' in text:
                    if self.cache:
                        self.cache[key] = text
            return text
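
Both whois variants fall back on the local whois binary and implement the timeout by hand: poll the child process every half second and kill it once timeout seconds have elapsed. A standalone sketch of that polling pattern (the helper name is mine, and the command is ls purely as a stand-in, since whois may not be installed):

import subprocess
import time

def run_with_timeout(args, timeout=10, poll_interval=0.5):
    """Run a command and return its stdout, killing it after `timeout' seconds."""
    proc = subprocess.Popen(args, stdout=subprocess.PIPE)
    start = time.time()
    while proc.poll() is None:  # still running
        time.sleep(poll_interval)
        if time.time() - start > timeout:
            try:
                proc.kill()  # give up on a hung command
            except Exception:
                pass
            break
    if proc.poll() != 1:  # treat exit status 1 as failure, as the examples do
        return proc.communicate()[0]
    return ''

print(run_with_timeout(['ls']))
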
Example #9
    def throttle(self, url, delay, proxy=None, variance=0.5):
        """Delay a minimum time for each domain per proxy by storing last access time

        url
            what intend to download
        delay
            the minimum amount of time (in seconds) to wait after downloading content from this domain
        proxy
            the proxy to download through
        variance
            the amount of randomness in delay, 0-1
        """
        if delay > 0:
            key = str(proxy) + ':' + common.get_domain(url)
            if key in Download._domains:
                while datetime.datetime.now() < Download._domains.get(key):
                    time.sleep(SLEEP_TIME)
            # update domain timestamp to when can query next
            Download._domains[key] = datetime.datetime.now() + datetime.timedelta(seconds=delay * (1 + variance * (random.random() - 0.5)))
Example #10
    def throttle(self, url, delay, proxy=None, variance=0.5):
        """Delay a minimum time for each domain per proxy by storing last access time

        url
            what intend to download
        delay
            the minimum amount of time (in seconds) to wait after downloading content from this domain
        proxy
            the proxy to download through
        variance
            the amount of randomness in delay, 0-1
        """
        if delay > 0:
            key = ':'.join([str(proxy), self.throttle_additional_key or '', common.get_domain(url)])
            if key in Download._domains:
                while datetime.datetime.now() < Download._domains.get(key):
                    time.sleep(SLEEP_TIME)
            # update domain timestamp to when can query next
            Download._domains[key] = datetime.datetime.now() + datetime.timedelta(seconds=delay * (1 + variance * (random.random() - 0.5)))
Example #11
    def whois(self, url, timeout=10):
        """Return text of this whois query
        """
        domain = common.get_domain(url)
        if domain:
            text = ''
            key = 'whois_%s' % domain
            try:
                if self.cache:
                    text = self.cache[key]
                else:
                    raise KeyError()
            except KeyError:
                # try online whois app
                query_url = 'http://whois.chinaz.com/%s' % domain
                html = self.get(query_url)
                match = re.compile(r"<script src='(request.aspx\?domain=.*?)'></script>").search(html)
                if match:
                    script_url = urlparse.urljoin(query_url, match.groups()[0])
                    text = self.get(script_url, read_cache=False)

                if '@' not in text:
                    if self.cache:
                        del self.cache[query_url]
                    # failed, so try local whois command
                    r = subprocess.Popen(['whois', domain], stdout=subprocess.PIPE)
                    start = time.time()
                    while r.poll() is None:
                        time.sleep(0.5)
                        if time.time() - start > timeout:
                            try:
                                r.kill()
                            except Exception:
                                pass
                            break
                    if r.poll() != 1:
                        text = r.communicate()[0]
                
                if '@' in text:
                    if self.cache:
                        self.cache[key] = text
            return text
Example #12
    def crawl(self, D, url, html): 
        """Crawl website html and return list of URLs crawled
        """
        def valid(link):
            """Check if should crawl this link
            """
            # check if a media file
            if common.get_extension(link) not in common.MEDIA_EXTENSIONS:
                # check if a proper HTTP link
                if link.lower().startswith('http'):
                    # only crawl within website
                    if common.same_domain(domain, link):
                        # passes regex
                        if self.allowed_urls.match(link) and not self.banned_urls.match(link):
                            # not blocked by robots.txt
                            if not self.robots or self.robots.can_fetch(settings.user_agent, link):
                                # allowed to recrawl
                                if self.crawl_existing or (D.cache and link not in D.cache):
                                    return True
            return False

        domain = common.get_domain(url)
        depth = self.found[url]
        outstanding = []
        if depth != self.max_depth: 
            # extract links to continue crawling
            links_re = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
            for link in links_re.findall(html):
                try:
                    link = self.normalize(url, link)
                except UnicodeDecodeError as e:
                    # unicode error when joining url
                    common.logger.info(e)
                else:
                    if link not in self.found:
                        self.found[link] = depth + 1
                        if valid(link):
                            # is a new link
                            outstanding.append(link)
                            if len(outstanding) == self.max_links:
                                break
        return outstanding
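
The robots.txt test in valid() only assumes self.robots is an object exposing can_fetch(user_agent, url); one way to build such an object is the standard library robotparser module (urllib.robotparser in Python 3). A minimal sketch, with a placeholder site and user agent string:

try:
    import robotparser  # Python 2 module name, matching the urlparse usage in these examples
except ImportError:
    from urllib import robotparser  # Python 3

robots = robotparser.RobotFileParser()
robots.set_url('http://example.com/robots.txt')
robots.read()  # fetch and parse robots.txt

# same call shape as valid() uses: may this user agent fetch this link?
print(robots.can_fetch('MyCrawlerUserAgent', 'http://example.com/some/page.html'))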