def crawl_webpage(self, url):
    """Fetch a webpage and return its (decompressed) HTML, or 'error' on failure."""
    try:
        maxlength = 5242880  # 5 * 1024 * 1024 bytes = 5 MB
        url = url_normalize(url, charset='utf-8')
        redirect_return_value = process_redirect(url, maxlength)
        if 'url' in redirect_return_value:
            url = redirect_return_value['url']
        else:
            return 'error'

        # The URL is not yet in the database, so retrieve the page.
        # Browser-like headers are needed to connect to certain sites (e.g. Wikipedia).
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            # 'Accept-Language': 'en-us,en;q=0.5',
            # 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
            'Keep-Alive': '115',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.797.0 Safari/535.1',
            # 'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11',
        }
        req = urllib2.Request(url, headers=headers)
        # Timeout is in seconds; if not set, the system-wide default is used.
        response = urllib2.urlopen(req, timeout=30)

        # The response buffer is cleared after the first read; some servers return
        # a placeholder HTML page even when the response is not valid.
        # Read one byte past the limit so an oversized page can be detected.
        webpage = response.read(maxlength + 1)
        if len(webpage) == maxlength + 1:
            return 'error'

        # Decompress the page if it is gzip-compressed (e.g. sohu.com).
        if response.info().get('Content-Encoding') == 'gzip':
            buf = StringIO(webpage)
            f = gzip.GzipFile(fileobj=buf)
            webpage = f.read()

        return webpage
    except Exception:
        return 'error'
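# Usage sketch (illustrative only): the class name NewsService is a placeholder,
# and it is assumed that urllib2, gzip, StringIO, url_normalize and
# process_redirect are imported at module level as the code above expects.
#
#     service = NewsService()
#     html = service.crawl_webpage('http://www.example.com/story.html')
#     if html == 'error':
#         print('fetch failed, URL rejected by process_redirect, or page over 5 MB')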
def daemon_add_item(self, url, title, snippet, published_time, channel_id):
    """
    Return 0 (int) on success; otherwise return an error message.
    If the item already exists, add channel_id when it has none, or append
    channel_id to the existing list; for a new item, set channel_id directly.
    """
    try:
        url = url.strip()
        title = title.strip()
        snippet = snippet.strip()
        if not url or not title:
            return 'Error: url or title is null'

        maxlength = 5242880  # 5 * 1024 * 1024 bytes = 5 MB
        # redirect_return_value = process_redirect(url, maxlength)
        # if 'url' in redirect_return_value:
        #     url = redirect_return_value['url']
        # else:
        #     return redirect_return_value['error']
        url = url_normalize(url, charset='utf-8')
        parsed = urlparse.urlparse(url)  # can be used to judge the source of the item

        # Build the "www." / non-"www." alias so both forms are checked for duplicates.
        if url.find('www.') == -1:
            url_alias = 'http://' + 'www.' + url.replace('http://', '')
        else:
            url_alias = 'http://' + url.replace('http://www.', '')
        url_set = Item.objects.filter(url=url)
        url_alias_set = Item.objects.filter(url=url_alias)
        if len(url_set) > 0 or len(url_alias_set) > 0:
            if channel_id:
                # Update the channels of the existing news item.
                if len(url_set) > 0:
                    item = url_set[0]
                else:
                    item = url_alias_set[0]
                if item.channels:
                    id_list = item.channels.split(',')
                    if not str(channel_id) in id_list:
                        id_list.append(str(channel_id))
                        item.channels = ','.join(id_list)
                        item.save()
                else:
                    item.channels = str(channel_id)
                    item.save()
            return 'Error: The item already exists.'

        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            # 'Accept-Language': 'en-us,en;q=0.5',
            # 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
            'Keep-Alive': '115',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.797.0 Safari/535.1',
        }
        req = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(req, timeout=20)
        url = response.url
        url = url_normalize(url, charset='utf-8')  # ensure the url is normalized after redirects

        webpage = response.read(maxlength + 1)
        if len(webpage) == maxlength + 1:
            return 'Error: The webpage is more than max length'
        # Decompress the page if it is gzip-compressed.
        if response.info().get('Content-Encoding') == 'gzip':
            buf = StringIO(webpage)
            f = gzip.GzipFile(fileobj=buf)
            webpage = f.read()

        # Save the raw HTML under a random file name.
        webpage_name = generate_random_file_name(32) + '.html'
        f = open(settings.MEDIA_ROOT + 'webpage/' + webpage_name, 'w+')
        f.write(webpage)  # Do not use unicode(soup) here; it raises an error on the Linux server.
        f.close()

        item = Item(name=title[:200], url=url[:255], snippet=snippet[:600],
                    share_count=0, file='webpage/' + webpage_name)
        item.creator_id = None
        item.create_date = published_time
        if channel_id:
            item.channels = str(channel_id)
        item.save()
        return 0
    except Exception:
        return 'Error in getting webpage'
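# Usage sketch (illustrative only; the values below are made up, and it is
# assumed the caller passes fields parsed from a feed entry, with the datetime
# module imported, published_time a datetime, and channel_id an existing
# Channel primary key):
#
#     result = service.daemon_add_item(
#         url='http://www.example.com/story.html',
#         title='Example story title',
#         snippet='The first paragraph of the story...',
#         published_time=datetime.datetime.now(),
#         channel_id=3,
#     )
#     if result != 0:
#         print(result)  # result is an error message string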