Example #1
 def crawl_webpage(self, url):
     try:
         maxlength = 5242880  #5 * 1024 * 1024 bytes = 5 MB
         url = url_normalize(url, charset='utf-8')
         redirect_return_value = process_redirect(url, maxlength)
         if 'url' in redirect_return_value:
             url = redirect_return_value['url']
         else:
             return 'error'
     
         #the url does not yet exist in the database, so retrieve the page
         #headers simulate a browser so that certain websites (like Wikipedia) accept the connection
         headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                 #Accept-Language: en-us,en;q=0.5
                 #Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7
                 'Keep-Alive': '115' ,
                 'Connection': 'keep-alive' ,
                 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.797.0 Safari/535.1'
                 #'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11' ,
                 }
         req = urllib2.Request(url, headers=headers)
         response = urllib2.urlopen(req, timeout=30)  #timeout in seconds; if not set, the system-wide default is used
         
         #the response buffer is cleared after the first read; a placeholder html is returned even if the response is not valid
         webpage = response.read(maxlength + 1)  #read one extra byte so oversized pages can be detected
         if len(webpage) == maxlength + 1:
             return 'error'
         
         #decompress the webpage if it is gzip-compressed (e.g. sohu.com)
         if response.info().get('Content-Encoding') == 'gzip':
             buf = StringIO(webpage)
             f = gzip.GzipFile(fileobj=buf)
             webpage = f.read()
         
         return webpage
     except:
         return 'error'
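
The key technique in crawl_webpage is the size-capped read: request at most maxlength + 1 bytes, treat a read of exactly maxlength + 1 bytes as "page too large", and gunzip the body only when the server reports Content-Encoding: gzip. Below is a minimal standalone sketch of that pattern, using the same urllib2/StringIO/gzip modules as the example; the function name fetch_limited and the 1 MB default cap are illustrative, not part of the original code.

import gzip
import urllib2
from StringIO import StringIO

def fetch_limited(url, maxlength=1048576, timeout=30):
    #sketch only: fetch url but refuse bodies larger than maxlength bytes
    req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    response = urllib2.urlopen(req, timeout=timeout)
    body = response.read(maxlength + 1)  #read one extra byte to detect oversized pages
    if len(body) == maxlength + 1:
        return None  #body exceeded the cap
    if response.info().get('Content-Encoding') == 'gzip':
        body = gzip.GzipFile(fileobj=StringIO(body)).read()
    return body
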
Example #2
 def daemon_add_item(self, url, title, snippet, published_time, channel_id):
     """
     Return 0 (int) on success; otherwise return an error message.
     If the item already exists, add channel_id when the item has none, or append channel_id to its existing list.
     For a new item, set channel_id directly.
     """
     try:
         url = url.strip()
         title = title.strip()
         snippet = snippet.strip()
         if not url or not title:
             return 'Error: url or title is null'
         
         maxlength = 5242880  #5 * 1024 * 1024 bytes = 5 MB
 #        redirect_return_value = process_redirect(url, maxlength)
 #        if 'url' in redirect_return_value:
 #            url = redirect_return_value['url']
 #        else:
 #            return redirect_return_value['error']
         
         url = url_normalize(url, charset='utf-8')
         parsed = urlparse.urlparse(url)  #can be used to determine the source of the item
         
         #build an alias with/without the 'www.' prefix so duplicates are found under either form
         if url.find('www.') == -1:
             url_alias = 'http://' + 'www.' + url.replace('http://', '')
         else:
             url_alias = 'http://' + url.replace('http://www.', '')
         
         url_set = Item.objects.filter(url=url)
         url_alias_set = Item.objects.filter(url=url_alias)
         if len(url_set) > 0 or len(url_alias_set) > 0:
             if channel_id: #we need to update the channels for the existing news
                 if len(url_set) > 0:
                     item = url_set[0]
                 else:
                     item = url_alias_set[0]
                 if item.channels:
                     id_list = item.channels.split(',')
                     if str(channel_id) not in id_list:
                         id_list.append(str(channel_id))
                         item.channels = ','.join(id_list)
                         item.save()
                 else:
                     item.channels = str(channel_id)
                     item.save()

             #the item already exists, so report a duplicate even after its channels were updated
             return 'Error: The item already exists.'
         
         headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                 #Accept-Language: en-us,en;q=0.5
                 #Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7
                 'Keep-Alive': '115' ,
                 'Connection': 'keep-alive' ,
                 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.797.0 Safari/535.1'
                 }
         req = urllib2.Request(url, headers=headers)
         response = urllib2.urlopen(req, timeout=20)
         url = response.url
         url = url_normalize(url, charset='utf-8') #ensure url is normalized after redirect
         webpage = response.read(maxlength + 1)  #read one extra byte so oversized pages can be detected
         if len(webpage) == maxlength + 1:
             return 'Error: The webpage exceeds the maximum length'
         if response.info().get('Content-Encoding') == 'gzip':  #decompress if the response is gzip-encoded
             buf = StringIO(webpage)
             f = gzip.GzipFile(fileobj=buf)
             webpage = f.read()
         
         webpage_name = generate_random_file_name(32) + '.html'
         f = open(settings.MEDIA_ROOT + 'webpage/' + webpage_name, 'w+')
         f.write(webpage)  #Do not use unicode(soup) here, it will produce an error on the Linux server
         f.close()
         
         item = Item(name=title[:200], url=url[:255], snippet=snippet[:600], share_count=0, file='webpage/' + webpage_name)
         item.creator_id = None
         item.create_date = published_time
         if channel_id:
             item.channels = str(channel_id)
         item.save()
         return 0
     except:
         return 'Error in getting webpage'
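
The duplicate-handling branch in daemon_add_item keeps Item.channels as a comma-separated string of channel ids and appends an id only if it is not already present. The same bookkeeping can be isolated into a small standalone helper; a minimal sketch follows, where the helper name append_channel is illustrative and not part of the original model.

def append_channel(channels, channel_id):
    #channels is a comma-separated string of ids (possibly empty);
    #add channel_id only if it is not already in the list
    if not channels:
        return str(channel_id)
    id_list = channels.split(',')
    if str(channel_id) not in id_list:
        id_list.append(str(channel_id))
    return ','.join(id_list)

#expected behaviour of the sketch
assert append_channel('', 7) == '7'
assert append_channel('3,5', 7) == '3,5,7'
assert append_channel('3,5,7', 7) == '3,5,7'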