Ejemplo n.º 1
0
def _pick_page_title(soup, fallback):
	"""Return the best available title for a parsed page.

	Preference order: the ``og:title`` meta tag, the ``twitter:title`` meta
	tag, the document's ``<title>`` element, and finally *fallback*.

	Args:
		soup: Parsed document exposing ``find`` and ``title``.
		fallback: Value returned when the page offers no usable title.

	Returns:
		The first non-empty title found, or *fallback*.
	"""
	for attrs in ({"property": "og:title"}, {"name": "twitter:title"}):
		tag = soup.find("meta", attrs)
		# ``get`` avoids a KeyError on a meta tag that has no ``content``.
		if tag is not None and tag.get('content'):
			return tag.get('content')

	title_tag = soup.title
	if title_tag is not None and title_tag.string:
		return title_tag.string
	return fallback


def _image_file_name(image_ref):
	"""Return the last path segment of *image_ref* without query/fragment."""
	return image_ref.split('#', 1)[0].split('?', 1)[0].split('/')[-1]


def _store_site_image(site, image_ref, base_url):
	"""Download one image and attach it to a new ``site_image`` for *site*.

	Args:
		site: The ``site_url`` instance the image belongs to.
		image_ref: Image URL as found in the page (may be relative).
		base_url: URL that relative references are resolved against.

	Raises:
		Exception: Whatever the download or the model/storage save raises;
			the caller decides how to report it.
	"""
	response = urllib2.urlopen(urlparse.urljoin(base_url, image_ref))
	try:
		payload = response.read()
	finally:
		response.close()

	db_img = site_image(
		url=site
	)
	# The system temp directory is used instead of a machine-specific path;
	# the file is only a staging buffer handed to the field's save().
	img_temp = NamedTemporaryFile(delete=True)
	try:
		img_temp.write(payload)
		img_temp.flush()
		db_img.image_url.save(_image_file_name(image_ref), File(img_temp))
		db_img.save()
	finally:
		# Closing removes the temp file (delete=True) instead of waiting
		# for garbage collection.
		img_temp.close()


def crawl(request):
	"""Fetch the posted URL, record it as a site and store its og:image files.

	On a POST with a valid ``UrlForm`` the page at ``url`` is downloaded and
	parsed, its title is chosen (og:title, then twitter:title, then
	``<title>``, then the URL itself), a ``site_url`` row for the current
	user is updated or created, and every ``og:image`` with an allowed
	extension is downloaded and saved as a ``site_image``.

	Args:
		request: The incoming HTTP request.

	Returns:
		``HttpResponseRedirect('/my_account')`` when the page cannot be
		fetched or the site row cannot be stored. The portion of the view
		shown here returns nothing on the other paths.
	"""
	if request.method == 'POST':
		form = UrlForm(request.POST)
		if form.is_valid():
			post_url = request.POST.get('url', '')
			domain = urlparse.urlsplit(post_url)
			domain_url = domain.scheme + '://' + domain.netloc + '/'

			# NOTE(review): post_url is user input and urlopen also accepts
			# non-HTTP schemes such as file:// - confirm that UrlForm
			# restricts the scheme before this point.
			try:
				source = urllib2.urlopen(post_url)
			except Exception:
				logger.exception('URLopen can`t open the url')
				return HttpResponseRedirect('/my_account')

			logger.debug('Fetching images from url: %s', post_url)

			try:
				soup = BeautifulSoup(source)
			finally:
				source.close()

			page_title = _pick_page_title(soup, post_url)
			page_title = page_title.encode('utf-8')

			try:
				site = site_url.objects.get(
					user=request.user,
					url=post_url
				)
				# presumably clears what was stored for this site on an
				# earlier crawl - confirm against delete_site_by_id
				delete_site_by_id(site.id)
				site.user = request.user
				site.title = page_title
				site.url = post_url
			except site_url.DoesNotExist:
				site = site_url(
					user=request.user,
					title=page_title,
					url=post_url
				)
			except Exception:
				# Any other lookup/cleanup failure is reported instead of
				# being mistaken for "not found" and creating a duplicate.
				logger.exception('Site url problem')
				return HttpResponseRedirect('/my_account')

			try:
				site.save()
			except Exception:
				logger.exception('Site url problem')
				return HttpResponseRedirect('/my_account')

			allowed_exts = ('png', 'jpg', 'gif', 'bmp')
			images_fb = soup.findAll("meta",
				{"property":"og:image"})
			meta_images_urls = []
			for img in images_fb:
				image_ref = img.get('content')
				if not image_ref:
					continue

				# Judge the extension on the file name only, ignoring case
				# and any query string or fragment.
				_, dot, extension = _image_file_name(image_ref).rpartition('.')
				if not dot or extension.lower() not in allowed_exts:
					continue

				try:
					_store_site_image(site, image_ref, domain_url)
				except Exception:
					# Best effort: one broken image must not abort the rest.
					logger.exception('Saving images from facebook')
					continue

				meta_images_urls.append(
					image_ref
				)
Ejemplo n.º 2
0
								img.get('content')
							)

			images_tw = soup.find("meta",
				{"name":"twitter:image"}
			)
			if images_tw:
				for img in images_tw:
					if meta_images_urls and img.get('content'):
						if any(img.get('content') in s for s in meta_images_urls):
							continue

					if img.get('content') is not None:
						if img.get('content').rsplit('.')[-1] in allowed_exts:
							db_img = site_image(
								url=site
							)

							try:
								img_filename = img.get('content').split('/')[-1]

								img_temp = NamedTemporaryFile(delete=True)

								img_temp.write(urllib2.urlopen(urlparse.urljoin(
									domain_url, 
									img.get('content')
								)).read())

								img_temp.flush()

								db_img.image_url.save(img_filename, File(img_temp))