Code Example #1
File: common.py Project: akhdir/price-extraction
 def __init__(self,
              file,
              encoding=settings.default_encoding,
              mode='wb',
              unique=False,
              unique_by=None,
              quoting=csv.QUOTE_ALL,
              utf8_bom=False,
              auto_repair=False,
              **argv):
     self.encoding = encoding
     self.unique = unique
     self.unique_by = unique_by
     if hasattr(file, 'write'):
         self.fp = file
     else:
         if auto_repair:
             self._remove_invalid_rows(file=file, quoting=quoting, **argv)
         if utf8_bom:
             self.fp = open(file, 'wb')
             self.fp.write('\xef\xbb\xbf')
             self.fp.close()
             self.fp = open(file, mode=mode.replace('w', 'a'))
         else:
             self.fp = open(file, mode)
     if self.unique:
         self.rows = adt.HashDict()  # cache the rows that have already been written
         for row in csv.reader(open(self.fp.name)):
             self.rows[self._unique_key(row)] = True
     self.writer = csv.writer(self.fp, quoting=quoting, **argv)
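
The constructor above uses adt.HashDict as a lightweight "have I written this row already?" lookup when unique=True. A minimal sketch of that pattern, assuming only the dict-style operations shown in these examples; the import path, the sample data and the key function are illustrative, not part of the original code:

from webscraping import adt  # assumed import path for the adt module used above

seen = adt.HashDict()  # plays the role of self.rows in the constructor above
rows = [('a', 1), ('b', 2), ('a', 1)]  # hypothetical data containing a duplicate
unique_rows = []
for row in rows:
    key = str(row)  # stand-in for the _unique_key() helper in the example
    if key not in seen:
        seen[key] = True          # remember that this row has been seen
        unique_rows.append(row)   # only the first occurrence survives
# unique_rows == [('a', 1), ('b', 2)]
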
Code Example #2
 def __init__(self,
              output_file=None,
              max_links=100,
              max_depth=1,
              allowed_urls='',
              banned_urls='^$',
              robots=None,
              crawl_existing=True):
     """
     output_file:
         where to save scraped data
     max_links:
         the maximum number of links to follow per page
     max_depth:
         the maximum depth to follow links into the website (use None for no limit)
     allowed_urls:
         a regex for allowed urls, defaults to all urls
     banned_urls:
         a regex for banned urls, defaults to no urls
     robots:
         a RobotFileParser object used to determine which urls are allowed to be crawled
     crawl_existing:
         sets whether to crawl content that has already been downloaded to the cache
     """
     self.found = adt.HashDict(int)  # track depth of found URLs
     if output_file:
         self.writer = common.UnicodeWriter(output_file)
     else:
         self.writer = None
     self.max_links = max_links
     self.max_depth = max_depth
     self.allowed_urls = re.compile(allowed_urls)
     self.banned_urls = re.compile(banned_urls)
     self.robots = robots
     self.crawl_existing = crawl_existing
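
For reference, a construction sketch based only on the docstring above; the import path is an assumption (examples #3, #9 and #10 show CrawlerCallback defined in download.py), and the regexes and limits are illustrative values:

from webscraping.download import CrawlerCallback  # assumed import path

# follow at most 50 links per page, two levels deep, and skip PDF/ZIP links
callback = CrawlerCallback(max_links=50,
                           max_depth=2,
                           allowed_urls=r'https?://(www\.)?example\.com/',
                           banned_urls=r'.*\.(pdf|zip)$')
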
Code Example #3
File: download.py Project: Hack42/BusNotifier
    def get_emails(self, website, max_depth=1, max_urls=10, max_emails=1):
        """Crawl this website and return all emails found

        website:
            the URL of the website to crawl
        max_depth:
            how many links deep to follow before stopping the crawl
        max_urls:
            how many URLs to download before stopping the crawl
        max_emails:
            the maximum number of emails to extract before stopping the crawl.
            If None then extract all emails found in the crawl.
        """
        def score(link):
            """Return how valuable this link is for ordering the crawl.
            The lower the score, the better."""
            link = link.lower()
            total = 0
            if 'contact' in link:
                pass  # this page is top priority
            elif 'about' in link:
                total += 10
            elif 'help' in link:
                total += 20
            else:
                # generic page
                total += 100
            # bias towards shorter links
            total += len(link)
            return total

        domain = urlparse.urlparse(website).netloc
        scraped = adt.HashDict()
        c = CrawlerCallback(max_depth=max_depth)
        outstanding = [(0, website)]  # list of URLs and their score
        emails = []
        while outstanding and (max_urls is None or len(scraped) < max_urls) \
                          and (max_emails is None or len(emails) < max_emails):
            _, url = outstanding.pop(0)
            scraped[url] = True
            html = self.get(url)
            if html:
                for email in alg.extract_emails(html):
                    if email not in emails:
                        emails.append(email)
                        if len(emails) == max_emails:
                            break
                # crawl the linked URLs
                for link in c.crawl(self, url, html):
                    if urlparse.urlparse(link).netloc == domain:
                        if link not in scraped:
                            outstanding.append((score(link), link))
                # sort based on score to crawl most promising first
                outstanding.sort()
        return list(emails)
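
A usage sketch for the method above. It assumes get_emails() is a method of the Download class that example #5 instantiates as download.Download(); the URL and the limits are illustrative:

from webscraping import download  # assumed import path

D = download.Download()
# crawl at most 10 pages, two links deep, stopping after the first 5 addresses
emails = D.get_emails('http://example.com', max_depth=2, max_urls=10, max_emails=5)
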
Code Example #4
    def find(self, website, max_depth, max_urls, max_results):
        """
        website:
            the URL of the website to crawl
        max_depth:
            how many links deep to follow before stopping the crawl
        max_urls:
            how many URLs to download before stopping the crawl
        max_results:
            the maximum number of results to extract before stopping the crawl.
            If None then extract all results found in the crawl.
        """
        # check for redirect URL
        self.D.get(website)
        redirect_url = self.D.cache.meta(website).get('url') if self.D.cache else self.final_url
        website = redirect_url or website

        domain = urlparse.urlparse(website).netloc
        scraped = adt.HashDict()
        c = CrawlerCallback(max_depth=max_depth)
        outstanding = [(0, website)]  # list of URLs and their score
        results = []
        while outstanding and (max_urls is None or len(scraped) < max_urls) \
                          and (max_results is None or len(results) < max_results):
            _, url = outstanding.pop(0)
            scraped[url] = True
            html = self.D.get(url, num_retries=0)

            if html:
                for result in self.extract_fn(html):
                    if result not in results:
                        results.append(result)
                        if len(results) == max_results:
                            break
                # crawl the linked URLs
                for link in c.crawl(self, url, html):
                    if urlparse.urlparse(link).netloc == domain:
                        if link not in scraped:
                            # insert sort this new record so crawl most promising first
                            score = self.link_score(link)
                            for i, (other_score, other_link) in enumerate(outstanding):
                                if score < other_score:
                                    outstanding.insert(i, (score, link))
                                    break
                            else:
                                outstanding.append((score, link))
        return results
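
The for/else loop above keeps outstanding sorted by score so that pop(0) always returns the most promising link. The standard library's bisect module gives the same effect; a sketch, under the assumption that ordering by whole (score, url) tuples is acceptable (ties then break on the URL string):

import bisect

outstanding = [(0, 'http://example.com/')]  # (score, url) pairs, kept sorted by score
new_links = [(120, 'http://example.com/help'), (10, 'http://example.com/contact')]
for item in new_links:
    bisect.insort(outstanding, item)  # insert at the sorted position
# outstanding == [(0, 'http://example.com/'),
#                 (10, 'http://example.com/contact'),
#                 (120, 'http://example.com/help')]
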
Code Example #5
File: async.py Project: yuzi3150/SeatPJ2
 def __init__(self,
              url=None,
              urls=None,
              url_iter=None,
              num_threads=20,
              cb=None,
              depth=True,
              max_errors=None,
              pattern=None,
              **kwargs):
     self.settings = adt.Bag(read_cache=True,
                             write_cache=True,
                             num_redirects=5,
                             num_retries=2,
                             timeout=20,
                             headers={},
                             num_threads=num_threads,
                             cb=cb,
                             url_iter=url_iter,
                             depth=depth,
                             pattern=pattern)
     self.settings.update(**kwargs)
     self.D = download.Download(**kwargs)
     self.kwargs = kwargs
     # queue of html to be written to cache
     self.cache_queue = []
     # URLs that are waiting to download
     self.download_queue = collections.deque()
     if urls:
         self.download_queue.extend(urls)
     if url:
         self.download_queue.append(url)  # XXX create compressed dict data type for large in memory?
     # URLs currently downloading
     self.processing = {}
     # deferreds that are downloading
     self.downloading = []
     # URLs that have been found before
     self.found = adt.HashDict()
     for url in self.download_queue:
         self.found[url] = True
     self.state = download.State()
     self.max_errors = max_errors
     self.num_errors = 0  # counter for the number of consecutive errors
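
Here the found HashDict is seeded from the initial download queue so that later discoveries can be deduplicated with a cheap membership test. A minimal sketch of that step, using only the dict-style operations shown in these examples; the URLs and the import path are assumptions:

import collections
from webscraping import adt  # assumed import path

download_queue = collections.deque(['http://example.com/', 'http://example.com/about'])
found = adt.HashDict()
for url in download_queue:
    found[url] = True  # mark as already queued

candidate = 'http://example.com/about'
if candidate not in found:  # already queued above, so nothing is re-added
    found[candidate] = True
    download_queue.append(candidate)
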
Code Example #6
File: download.py Project: w4lker/Antix
 def get_emails(self, website, max_depth=1, max_urls=None, max_emails=None):
     """Crawl this website and return all emails found
     """
     scraped = adt.HashDict()
     c = CrawlerCallback(max_depth=max_depth)
     outstanding = collections.deque([website])
     emails = []
     while outstanding and (max_urls is None or len(scraped) < max_urls) \
                       and (max_emails is None or len(emails) < max_emails):
         url = outstanding.popleft()
         scraped[url] = True
         html = self.get(url, delay=1)
         if html:
             for email in alg.extract_emails(html):
                 if email not in emails:
                     emails.append(email)
                     if len(emails) == max_emails:
                         break
             outstanding.extend(c.crawl(self, url, html))
     return list(emails)
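
Stripped of the download and email-extraction calls, the loop above is a breadth-first traversal that uses a HashDict as its visited set and len() as the page budget. A distilled sketch with an in-memory link graph standing in for the network; everything here is hypothetical:

import collections
from webscraping import adt  # assumed import path

graph = {'a': ['b', 'c'], 'b': ['a', 'c'], 'c': ['d'], 'd': []}  # toy "website"
max_urls = 3
scraped = adt.HashDict()
outstanding = collections.deque(['a'])
while outstanding and len(scraped) < max_urls:
    url = outstanding.popleft()
    scraped[url] = True
    outstanding.extend(link for link in graph[url] if link not in scraped)
# only the first max_urls pages ('a', 'b' and 'c') are visited
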
Code Example #7
File: xpath.py Project: huligong1234/python-study
def get_tag(html):
    """Find tag type at this location

    >>> get_tag('<div>abc</div>')
    'div'
    >>> get_tag(' <div>')
    >>> get_tag('div')
    """
    match = tag_regex.match(html)
    if match:
        return match.groups()[0]
    else:
        return None


splits = adt.HashDict()


def split_tag(html):
    """Extract starting tag and contents from HTML

    >>> [str(s) for s in split_tag('<div>abc<div>def</div>abc</div>ghi<div>jkl</div>')]
    ['<div>abc<div>def</div>abc</div>', 'ghi<div>jkl</div>']
    >>> [str(s) for s in split_tag('<br /><div>abc</div>')]
    ['<br />', '<div>abc</div>']
    >>> [str(s) for s in split_tag('<div>abc<div>def</div>abc</span>')]
    ['<div>abc<div>def</div>abc</span></div>', '']
    """
    if html in splits:
        i, tag = splits[html]
    else:
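
The module-level splits HashDict memoises split_tag() results keyed by the raw HTML string (the excerpt is cut off before the cache is filled). A minimal sketch of the same memoisation pattern, with a hypothetical stand-in for the expensive parsing work:

from webscraping import adt  # assumed import path

cache = adt.HashDict()  # module-level cache, mirroring the splits dict above

def expensive_parse(html):
    """Hypothetical stand-in for the work split_tag() does on a cache miss."""
    return len(html), html[:3]

def cached_parse(html):
    if html in cache:
        return cache[html]          # reuse the earlier result
    result = expensive_parse(html)
    cache[html] = result            # remember it for next time
    return result
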
Code Example #8
            if hasattr(e, 'code'):
                self.response_code = str(e.code)
            if hasattr(e, 'read'):
                try:
                    self.error_content = e.read()
                except Exception, e:
                    self.error_content = ''
            # so many kinds of errors are possible here so just catch them all
            common.logger.warning('Download error: %s %s' % (url, e))
            if self.settings.acceptable_errors and self.response_code in self.settings.acceptable_errors:
                content, self.final_url = self.settings.default, url
            else:
                content, self.final_url = None, url
        return content

    _domains = adt.HashDict()

    def throttle(self, url, delay, proxy=None, variance=0.5):
        """Delay a minimum time for each domain per proxy by storing last access time

        url
            the URL you intend to download
        delay
            the minimum amount of time (in seconds) to wait after downloading content from this domain
        proxy
            the proxy to download through
        variance
            the amount of randomness in delay, 0-1
        """
        if delay > 0:
            key = ':'.join([
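
The excerpt stops mid-statement, but the docstring and the class-level _domains HashDict make the intent clear: remember the last access time per domain (and proxy) and sleep until delay seconds have passed. A hedged sketch of that idea; the key format and timing details are illustrative rather than the library's exact code, and the variance parameter is omitted for brevity:

import time
import urlparse  # Python 2, matching the examples above
from webscraping import adt  # assumed import path

_domains = adt.HashDict()  # last access time, keyed by proxy and domain

def throttle(url, delay, proxy=None):
    """Sleep so the same domain is not hit more often than every `delay` seconds."""
    if delay > 0:
        key = ':'.join([str(proxy), urlparse.urlparse(url).netloc])  # illustrative key
        if key in _domains:
            wait = delay - (time.time() - _domains[key])
            if wait > 0:
                time.sleep(wait)
        _domains[key] = time.time()
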
Code Example #9
File: download.py Project: w4lker/Antix
class CrawlerCallback(ThreadedCallback):
    """Example callback to crawl the website
    """
    found = adt.HashDict(int)  # track depth of found URLs

    def __init__(self,
                 output_file=None,
                 max_links=100,
                 max_depth=1,
                 allowed_urls='',
                 banned_urls='^$',
                 robots=None,
                 crawl_existing=True):
        """
        `output_file' is where to save scraped data
        `max_links' is the maximum number of links to follow per page
        `max_depth' is the maximum depth to follow links into the website (use None for no limit)
        `allowed_urls' is a regex for allowed urls, defaults to all urls
        `banned_urls' is a regex for banned urls, defaults to no urls
        `robots' is a RobotFileParser object used to determine which urls are allowed to be crawled
        `crawl_existing' sets whether to crawl content that has already been downloaded to the cache
        """
        if output_file:
            self.writer = common.UnicodeWriter(output_file)
        else:
            self.writer = None
        self.max_links = max_links
        self.max_depth = max_depth
        self.allowed_urls = re.compile(allowed_urls)
        self.banned_urls = re.compile(banned_urls)
        self.robots = robots
        self.crawl_existing = crawl_existing

    def crawl(self, D, url, html):
        """Crawl website html and return list of URLs crawled
        """
        def normalize(link):
            """Normalize the link to avoid duplicates
            """
            if '#' in link:
                # strip the fragment to avoid treating anchors as duplicate URLs
                link = link[:link.index('#')]
            link = common.unescape(link)  # remove &amp; from link
            return urlparse.urljoin(url, link)  # support relative links

        def valid(link):
            """Check if should crawl this link
            """
            # check if a media file
            if common.get_extension(link) not in common.MEDIA_EXTENSIONS:
                # check if a proper HTTP link
                if link.lower().startswith('http'):
                    # only crawl within website
                    if common.same_domain(domain, link):
                        # passes regex
                        if self.allowed_urls.match(link) and not self.banned_urls.match(link):
                            # not blocked by robots.txt
                            if not self.robots or self.robots.can_fetch(settings.user_agent, link):
                                # allowed to recrawl
                                if self.crawl_existing or (D.cache and link not in D.cache):
                                    return True
            return False

        domain = common.get_domain(url)
        depth = CrawlerCallback.found[url]
        outstanding = []
        if depth != self.max_depth:
            # extract links to continue crawling
            links_re = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
            for link in links_re.findall(html):
                link = normalize(link)
                if link not in CrawlerCallback.found:
                    CrawlerCallback.found[link] = depth + 1
                    if valid(link):
                        # is a new link
                        outstanding.append(link)
                        if len(outstanding) == self.max_links:
                            break
        return outstanding
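
CrawlerCallback.found maps every discovered URL to the depth at which it was first seen, and crawl() relies on HashDict(int) handing back a default depth of 0 for the seed URL, much like collections.defaultdict(int). A small sketch of that depth tracking under the same assumption; the URLs are illustrative:

from webscraping import adt  # assumed import path

found = adt.HashDict(int)        # assumed defaultdict-like: unseen URLs read back as 0
seed = 'http://example.com/'
depth = found[seed]              # 0, even though the seed was never stored
for link in ['http://example.com/a', 'http://example.com/b']:
    if link not in found:
        found[link] = depth + 1  # children of the seed sit at depth 1
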
Code Example #10
File: download.py Project: w4lker/Antix
class StateCallback(ThreadedCallback):
    """Example callback that saves state
    """
    active_urls = set()
    found = adt.HashDict()  # track found URLs

    def __init__(self, output_file, header):
        # load state from previous run, if exists
        state = self.load_state()
        # settings to start crawl from beginning
        self.new_urls = False
        write_header = True
        mode = 'wb'
        if StateCallback.active_urls:
            # incomplete crawl
            common.logger.info('Loading previous crawl state')
            self.new_urls = True
            if os.path.exists(output_file):
                mode = 'ab'
                write_header = False

        self.writer = common.UnicodeWriter(output_file, mode=mode)
        if write_header:
            self.writer.writerow(header)

    def __call__(self, D, url, html):
        if self.new_urls:
            # restoring state so can ignore the starting url
            # instead return urls previously in queue
            self.new_urls = False
            new_urls = StateCallback.active_urls
        else:
            self.scrape(D, url, html)
            new_urls = self.crawl(D, url, html)
            # add newly scraped urls
            StateCallback.active_urls.update(new_urls)
            # this url has already been processed
            StateCallback.active_urls.discard(url)
            # save state in thread
            thread.start_new_thread(self.save_state, tuple())
        return new_urls

    def save_state(self, output_file='.state.pickle'):
        """Save state of current crawl to pickle file
        """
        # to ensure atomic write save state to temporary file first and then rename
        pickled_data = pickle.dumps(
            dict(urls=StateCallback.active_urls, found=StateCallback.found))
        tmp_file = tempfile.NamedTemporaryFile(prefix=output_file + '.').name
        fp = open(tmp_file, 'wb')
        fp.write(pickled_data)
        # ensure all content is written to disk
        fp.flush()
        os.fsync(fp.fileno())
        fp.close()
        # XXX error on Windows if dest exists
        os.rename(tmp_file, output_file)

    def load_state(self, input_file='.state.pickle'):
        """Load previous state from pickle file
        """
        if os.path.exists(input_file):
            data = pickle.load(open(input_file, 'rb'))
            StateCallback.active_urls.update(data.get('urls', []))
            StateCallback.found = data.get('found', StateCallback.found)
        else:
            data = {}
        return data
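
save_state() above pickles the crawl state to a temporary file and renames it into place so that a crash cannot leave a half-written state file behind. A standard-library-only sketch of the same atomic-write idea (the function name is hypothetical); creating the temporary file next to the target keeps the final os.rename() on a single filesystem:

import os
import pickle
import tempfile

def save_atomically(data, output_file='.state.pickle'):
    """Write pickled data to a temporary file beside the target, then rename it."""
    directory = os.path.dirname(os.path.abspath(output_file))
    fd, tmp_path = tempfile.mkstemp(prefix=os.path.basename(output_file) + '.',
                                    dir=directory)
    with os.fdopen(fd, 'wb') as fp:
        fp.write(pickle.dumps(data))
        fp.flush()
        os.fsync(fp.fileno())  # make sure the bytes reach disk before the rename
    os.rename(tmp_path, output_file)  # atomic on POSIX; on Windows it fails if the target exists
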