Example #1
def crawpages(urlp, selector, start=1, limit=-1):
    """Crawl numbered pages built from the URL template `urlp` (it must
    contain a '%d' placeholder) and collect the links matched by `selector`.
    Relies on urlopen, urljoin, BeautifulSoup and the project helpers
    select() and getlisttext()."""
    if urlp.find('%d') == -1:
        return []
    soup = ''   # matches from the previous page, used to detect repeats
    links = []
    index = start

    while True:
        page = urlp % index
        print '-->', page
        new_soup = select(BeautifulSoup(urlopen(page).read()), selector)
        # stop when the selector matches nothing or the page repeats itself
        if new_soup is None or len(new_soup) == 0 or new_soup == soup:
            print 'repeat or none'
            print 'End at %s' % index
            break
        # stop once the optional page limit has been reached
        if limit > 0 and index > limit:
            print 'limit ', limit
            print 'End at %s' % index
            break
        print getlisttext(new_soup)
        if new_soup[0].name == 'a':
            # the selector matched anchors: collect their absolute URLs
            for link in new_soup:
                if 'href' in dict(link.attrs):
                    url = urljoin(page, link['href'])
                    url = url.split('#')[0]  # drop the fragment portion
                    if url[0:4] == 'http':
                        links.append(url)
        else:
            # otherwise record the page URL itself
            links.append(page)

        soup = new_soup
        index += 1

    return links
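
A minimal sketch of how crawpages might be called; the URL template and selector below are made up for illustration and are not part of the original project:

# Hypothetical usage: crawpages() substitutes 1, 2, 3, ... into the '%d'
# slot and stops when a page repeats, matches nothing, or the limit is hit.
article_links = crawpages('http://example.com/archive/page/%d',
                          'div.post-list a', start=1, limit=5)
for link in article_links:
    print link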
Example #2
def get_page_links(url):
    """Return the normalized article links ('a.post-article') found on `url`.
    Soup, load_page, select and nomalize_url are project helpers."""
    soup = Soup(load_page(url))
    print 'open %s %s' % (soup.title.string.encode('utf-8'), url)
    links = select(soup, 'a.post-article')
    # earlier regex-based variant:
    #links = soup.findAll('a', href=re.compile(r'^http://duoduovision.diandian.com/post/\d{4}-\d{2}-\d{2}/'))
    #return list(set([nomalize_url(a['href']) for a in links]))
    return [nomalize_url(a['href']) for a in links]
Example #3
def save_pagenates(leaf, limit=-1):
    """Download a paginated URL (leaf.url must contain a '%d' placeholder);
    the page URLs differ only in the page index.  Pages already recorded in
    the log file are skipped.  Relies on urlopen, BeautifulSoup and the
    project helpers select(), getlisttext(), file_exist() plus the globals
    config, encode and logfile."""
    url_param = leaf.url
    index = 1
    url = url_param % (index)
    filename = os.path.join(leaf.dir, "%s.html" % (index))
    # skip pages that are already listed in the log
    while file_exist("%s\t%s" % (filename, url)):
        print '%s is already downloaded.' % (url)
        index += 1
        url = url_param % (index)
        filename = os.path.join(leaf.dir, "%s.html" % (index))
    page = urlopen(url).read().decode(encode, 'ignore')
    soup = select(BeautifulSoup(page), config[leaf.level]['selector'])
    if soup is None or len(soup) == 0:
        print 'break 1'
        return
    else:
        print 'page: ', index, getlisttext(soup)
    try:
        open(filename, 'w').write(page.encode(encode, 'ignore'))
    except IOError as err:
        print 'error: ', str(err), filename
    open(logfile, 'a').write(filename + '\t' + url + '\r\n')
    index += 1
    url = url_param % (index)
    filename = os.path.join(leaf.dir, "%s.html" % (index))
    while True:
        while file_exist("%s\t%s" % (filename, url)):
            print '%s is already downloaded.' % (url)
            index += 1
            url = url_param % (index)
            filename = os.path.join(leaf.dir, "%s.html" % (index))
        tmp_page = urlopen(url).read().decode(encode, 'ignore')
        tmp_soup = select(BeautifulSoup(tmp_page), config[leaf.level]['selector'])
        print tmp_soup
        print tmp_page == page, tmp_soup == soup
        # stop on a repeated or empty page
        if tmp_page == page or tmp_soup == soup or tmp_soup is None or len(tmp_soup) == 0:
            print 'break 1'
            break
        else:
            print 'page: ', index, getlisttext(tmp_soup[0])
        try:
            open(filename, 'w').write(tmp_page.encode(encode, 'ignore'))
        except IOError as err:
            print 'error: ', str(err), filename
        open(logfile, 'a').write(filename + '\t' + url + '\r\n')
        page = tmp_page
        soup = tmp_soup
        index += 1
        if limit > 0 and limit < index:
            break
        url = url_param % (index)
        filename = os.path.join(leaf.dir, "%s.html" % (index))
Example #4
def create_children(map):
    """Recursively build the LinkMap tree: for each node, select the elements
    described by config[map.level] and turn them into child nodes.  Relies on
    urlopen, BeautifulSoup, the project helpers select(), keyword_filter(),
    utils.formatname() and the globals config and encode."""
    if map.level >= len(config) - 1:
        return
    current_config = config[map.level]

    # build the soup either from the node's url or from the soup carried over
    # from the parent 'map'
    if map.url is not None:
        soup = BeautifulSoup(urlopen(map.url).read().decode(encode, 'ignore'))
    elif map.soup is not None:
        soup = map.soup
    # use the 'selector' to extract the html fragments we call soups
    soups = select(soup, current_config['selector'])
    # use keywords as a filter
    if 'keywords' in current_config:
        soups = keyword_filter(soups, current_config['keywords'])

    children = []
    for item in soups:
        # if the element is an 'a', record its url and leave the child soup
        # None; for any other element, leave the url None and keep the element
        # itself as the child soup.  Separate names are used so the page-level
        # 'soup' above is not overwritten -- the default_selector fallback
        # below still needs it.
        if item.name == 'a':
            child_url = item['href']
            # use the anchor text as the title
            title = item.string
            # if 'process' is in the config, use it to transform the url
            if 'process' in current_config:
                child_url = current_config['process'](child_url)
            child_soup = None
        else:
            # the element is not an 'a', so take the title from the config:
            # a 'css:' prefix means "use the matched element's text",
            # anything else is used literally.
            if current_config['title'].startswith('css:'):
                title_css = current_config['title'].split(':', 1)[1]
                title = select(item, title_css)[0].string
            else:
                title = current_config['title']
            child_url = None
            child_soup = item

        # unless 'collapse' is True, create a new directory named after the
        # title under the parent's directory
        if 'collapse' not in current_config or current_config['collapse'] == False:
            if map.dir is None or map.dir == '':
                dir = title
            else:
                dir = os.path.join(map.dir, title)
        else:
            dir = map.dir

        children.append(LinkMap(parent=map,
                                url=child_url,
                                soup=child_soup,
                                seletor=config[map.level + 1]['selector'],
                                title=title,
                                level=map.level + 1,
                                dir=utils.formatname(dir)))

    # when the selector matched nothing, fall back to 'default_selector' so
    # there is at least one link to follow
    if len(children) == 0 and 'default_selector' in current_config:
        soups = select(soup, current_config['default_selector'])
        url = soups[0]['href']
        title = soups[0].string
        if 'process' in current_config:
            url = current_config['process'](url)
        if 'collapse' not in current_config or current_config['collapse'] == False:
            dir = os.path.join(map.dir, title)
        else:
            dir = map.dir
        children.append(LinkMap(parent=map,
                                url=url,
                                seletor=config[map.level + 1]['selector'],
                                title=title,
                                level=map.level + 1,
                                dir=utils.formatname(dir)))
    map.children = children

    for child in map.children:
        create_children(child)
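
A sketch of what one level of the global config list might look like, inferred only from the keys create_children() reads; the selectors and values below are illustrative, not the project's actual configuration:

config_example = [
    {
        'selector': 'div.category a',        # elements that become children
        'keywords': ['news'],                # optional input to keyword_filter()
        'title': 'css:h2.title',             # or a literal title string
        'process': lambda url: url.strip(),  # optional URL transformation
        'collapse': False,                   # True: reuse the parent directory
        'default_selector': 'a.next',        # fallback when nothing matched
    },
    # ... one dict per level of the crawl ...
]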
Example #5
def get_img_links(url):
    """Return the src of every image inside 'a.post-meidaurl' anchors on `url`.
    Soup, load_page and select are project helpers."""
    soup = Soup(load_page(url))
    imgs = select(soup, 'a.post-meidaurl img')
    return [img['src'] for img in imgs]
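
A hypothetical pipeline combining the two helpers above: list the article pages first, then collect the image URLs from each one. The index_url is a placeholder, and urlretrieve (with naive filename handling) is only one way to fetch the images:

from urllib import urlretrieve

index_url = 'http://example.com/'   # illustrative only
for page_url in get_page_links(index_url):
    for img_src in get_img_links(page_url):
        # save under the last path segment of the image URL
        urlretrieve(img_src, img_src.split('/')[-1])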