import re


def crawl_sitemap(url):
    # download the sitemap file
    sitemap = Download(url)
    # extract the sitemap links from the <loc> tags
    links = re.findall('<loc>(.*?)</loc>', sitemap.decode("utf-8"))
    # download each link
    for link in links:
        html = Download(link)
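These snippets all rely on a Download helper that is not defined anywhere in the listing. A minimal sketch of what it might look like, assuming it should fetch a URL, return the raw response body, and return None on failure (the retry behaviour is an assumption, not the original implementation):

import urllib.request
import urllib.error


def Download(url, num_retries=2):
    # hypothetical stand-in for the Download helper used by the snippets;
    # returns the raw response body, or None if the request fails
    print('Downloading:', url)
    try:
        html = urllib.request.urlopen(url).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            # retry on 5xx server errors
            return Download(url, num_retries - 1)
    return html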
Example #2
import datetime

# URLManager, Download, HtmlParser and DataOutput are the crawler's own
# components; their imports are omitted in the original snippet.


class SpiderMain(object):

    def __init__(self):
        self.manager = URLManager()
        self.down = Download()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.down.download(root_url)
        movie_ids = self.parser.parse_urls(content)
        count = 0

        for mid in movie_ids:
            if count > 10:  # only crawl a handful of movies
                break
            movie_link = '''http://service.library.mtime.com/Movie.api?\
            Ajax_CallBack=true\
            &Ajax_CallBackType=Mtime.Library.Services\
            &Ajax_CallBackMethod=GetMovieOverviewRating\
            &Ajax_CrossDomain=1\
            &Ajax_RequestUrl=http%3A%2F%2Fmovie.mtime.com%2F{0}%2F\
            &t={1}\
            &Ajax_CallBackArgument0={2}\
            '''.format(mid, datetime.datetime.now().strftime("%Y%m%d%H%M%S%f"), mid)

            # strip the indentation whitespace left in the multi-line URL literal
            res = self.down.download(movie_link.replace(' ', ''))
            self.parser.parser_json(res)
            count += 1

        self.output.store_data(self.parser.items)
        self.output.close_connect()
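A minimal way to run this spider might look as follows; the seed URL is borrowed from Example #8 below and is only illustrative, since the real project's entry point is not shown here:

if __name__ == '__main__':
    spider = SpiderMain()
    spider.crawl('http://theater.mtime.com/China_Beijing/')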
def crawl_sitemap(url):
    # download the sitemap file
    sitemap = Download(url)
    # expected output: Downloading: http://example.webscraping.com/sitemap.xml
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = Download(link)
def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex
    """
    crawl_queue = [seed_url]  # the queue of URLs to download
    while crawl_queue:
        url = crawl_queue.pop()
        html = Download(url)
        # filter for links matching our regular expression
        for link in get_links(html):
            if re.match(link_regex, link):
                # add this link to the crawl queue
                crawl_queue.append(link)
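Both link_crawler variants call a get_links helper that is not defined in these snippets. A minimal sketch, assuming it should pull every href value out of the downloaded page (the regular expression below is an assumption):

import re


def get_links(html):
    # hypothetical helper: return a list of links found in the page's <a> tags
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)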
import itertools


def iteration():
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/view/-%d' % page
        #url = 'http://example.webscraping.com/view/-{}'.format(page)
        html = Download(url)
        if html is None:
            # received an error trying to download this webpage
            # so assume have reached the last country ID and can stop downloading
            break
        else:
            # success - can scrape the result
            # ...
            pass
import re
import urlparse  # Python 2 module; in Python 3 use urllib.parse instead


def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL following links matched by link_regex
    """
    crawl_queue = [seed_url]
    seen = set(crawl_queue)  # keep track of which URLs have been seen before
    while crawl_queue:
        url = crawl_queue.pop()
        html = Download(url)
        for link in get_links(html):
            # check if link matches expected regex
            if re.match(link_regex, link):
                # form absolute link
                link = urlparse.urljoin(seed_url, link)
                # only queue links that have not been seen before
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)
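As a usage sketch, the crawler could be started with a regular expression that limits which links are followed; the seed URL comes from the other snippets in this listing, while the pattern itself is only an illustration of the link_regex parameter:

link_crawler('http://example.webscraping.com', '/(index|view)')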
Example #7
import itertools


def iteration():
    """Exit the program after N consecutive download errors."""
    max_errors = 5  # maximum number of consecutive download errors allowed
    num_errors = 0  # current number of consecutive download errors
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/places/default/view/-{}'.format(
            page)
        html = Download(url)
        if html is None:
            # received an error trying to download this webpage
            num_errors += 1
            if num_errors == max_errors:
                # reached the maximum number of errors in a row, so assume we
                # have passed the last country ID and can stop downloading
                break
        else:
            # success - can scrape the result
            # ...
            num_errors = 0
Example #8
from common import Download
from parsers import HtmlParser

dl = Download()
parse = HtmlParser()

# download the Beijing theater listing page, then parse the movies it lists
content = dl.download('http://theater.mtime.com/China_Beijing/')
res = parse._parse_movies(content)
print(res)
Example #9
def __init__(self):
    # constructor of the spider's main class: wire up the URL manager,
    # downloader, HTML parser and data output components
    self.manager = URLManager()
    self.down = Download()
    self.parser = HtmlParser()
    self.output = DataOutput()