def store(self, feed, entries):
    feed_name = feed['name']
    # prefer the in-memory refresh time, fall back to the one stored in the database
    feed_refresh_time = feed.get('refresh')
    if feed_refresh_time is None:
        feed_refresh_time = self.get_feed_refresh_time(feed_name)
    c = self.conn.cursor()
    # track the newest entry time across all entries so it can serve as the next refresh threshold
    max_time = None
    for entry in entries:
        # feed['type'] holds the first 3 characters of feedparser's version string: 'ato' or 'rss'
        if feed['type'] == 'ato':
            entry_time = datetime.fromtimestamp(mktime(entry.updated_parsed))
        elif feed['type'] == 'rss':
            entry_time = datetime.fromtimestamp(mktime(entry.published_parsed))
        else:
            # unknown feed type: skip the entry instead of reusing a stale entry_time
            continue
        if feed_refresh_time is None or entry_time > feed_refresh_time:
            if max_time is None or max_time < entry_time:
                max_time = entry_time
            logger.info('find new article \'%s\' from feed \'%s\'', entry.title, feed_name)
            # atom entries keep the body in content, rss entries in description
            if feed['type'] == 'ato':
                self.save_doc(c, feed_name, entry.id, entry.title, entry.content[0].value, entry.link, entry_time)
            elif feed['type'] == 'rss':
                self.save_doc(c, feed_name, entry.id, entry.title, entry.description, entry.link, entry_time)
    feed['refresh'] = max_time
    self.conn.commit()
def download(self, feed, url, filename, extname=None):
    # csdn image urls carry a query string that must be stripped before download
    if feed.startswith('csdn'):
        true_url = url.split('?')[0]
    else:
        true_url = url
    logger.info('downloading image \'%s\'', url)
    try:
        req = urllib2.Request(true_url)
        req.add_header('User-Agent', USER_AGENT)
        resp = urllib2.urlopen(req, None, DOWNLOAD_TIMEOUT)
        data = resp.read(-1)
        resp.close()
        if extname is None:
            # in python3 this would be resp.getheader('Content-Type')
            content_type = resp.info().getheader('Content-Type').lower()
            extname = self.get_extname(content_type)
            if extname is None:
                logger.warning('unsupported content type \'%s\'', content_type)
                return (None, STATE_BADURL, 'unsupported content type \'' + content_type + '\'')
        file_fullname = filename + extname
        # shard files into two levels of sub-directories keyed by the first two characters of the name
        fn = os.path.join(self.image_dir, file_fullname[:1], file_fullname[1:2], file_fullname)
        with open(fn, 'wb') as f:
            f.write(data)
        return (file_fullname, STATE_SUCCESS, 'success')
    except Exception as e:
        logger.error('an error occurred while downloading %s', url)
        logger.exception(e)
        return (None, STATE_NETWORK_ERROR, 'network error')
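# get_extname() is defined elsewhere in this class; a rough sketch of what it is
# expected to do, based on how download() uses it (the concrete mapping below is an
# assumption, not the original implementation):
#
#   def get_extname(self, content_type):
#       mapping = {'image/jpeg': '.jpg', 'image/png': '.png',
#                  'image/gif': '.gif', 'image/bmp': '.bmp'}
#       return mapping.get(content_type)  # None for unsupported content types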
def save_doc(self, cursor, feed_name, doc_id, doc_title, doc_content, doc_url, update_time):
    # if the article was saved before, drop the old row so the fresh content replaces it
    sql = 'select entry_time from contents where entry_url = ?'
    cursor.execute(sql, (doc_url,))
    rows = cursor.fetchall()
    if len(rows) > 0:
        sql = 'delete from contents where entry_url = ?'
        cursor.execute(sql, (doc_url,))
    sql = 'insert into contents(feed_name,entry_id,entry_title,entry_desc,entry_url,entry_time,state) values(?,?,?,?,?,?,0)'
    params = (feed_name, doc_id, doc_title, doc_content, doc_url, update_time)
    cursor.execute(sql, params)
    logger.info('save new article \'%s\' to database', doc_title)
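# A minimal sketch of the 'contents' table this method writes to, inferred from the
# queries above and in extract_url(); the actual DDL lives elsewhere in the project,
# so the column types and defaults below are assumptions:
#
#   create table if not exists contents (
#       id          integer primary key autoincrement,
#       feed_name   text,
#       entry_id    text,
#       entry_title text,
#       entry_desc  text,
#       entry_url   text,
#       entry_time  timestamp,
#       state       integer default 0
#   );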
def check_images(self):
    cursor = self.conn.cursor()
    # new images
    logger.info('start list all new images for download')
    sql = 'select id, feed_name, url from images where state = 0'
    cursor.execute(sql)
    rows = cursor.fetchall()
    self.download_images(rows, cursor)
    # recent error images: success bit not set, created within the last 24 hours
    logger.info('start list recent error images for download')
    sql = 'select id, feed_name, url from images where state&? = ? and crtime > datetime(\'now\',\'-24 hour\')'
    cursor.execute(sql, (STATE_SUCCESS, 0))
    rows = cursor.fetchall()
    self.download_images(rows, cursor)
    cursor.close()
    self.conn.commit()
def fetch_feed(self, feed):
    feed_name = feed['name']
    try:
        logger.info('start fetch feed \'%s\'', feed_name)
        fp = feedparser.parse(feed['url'])
        if len(fp.entries) > 0:
            # feedparser reports versions such as 'atom10' or 'rss20';
            # the first 3 characters ('ato'/'rss') are used as the feed type
            feed['type'] = fp.version[0:3]
            logger.info('get %d articles from feed \'%s\'', len(fp.entries), feed_name)
            self.store(feed, fp.entries)
        else:
            logger.info('get nothing from feed \'%s\'', feed_name)
    except Exception as e:
        logger.error('get error while fetching feed \'%s\'', feed_name)
        logger.exception(e)
def download_images(self, rows, cursor):
    for row in rows:
        imgid = row[0]
        feed = row[1]
        url = row[2]
        logger.info('prepare download image#%d \'%s\'', imgid, url)
        (filename, state, message) = self.download(feed, url, uuid.uuid1().hex)
        if filename is not None:
            logger.info('download image success, local filename is \'%s\'', filename)
            cursor.execute('update images set filename = ?, state = state|? where id = ?', (filename, state, imgid))
        else:
            logger.info('download image fail: %s', message)
            cursor.execute('update images set state = state|? where id = ?', (state, imgid))
        # commit after each image so earlier results survive a later failure
        self.conn.commit()
def fetch(self):
    logger.info('start fetch documents')
    for feed in config.feeds:
        self.fetch_feed(feed)
    logger.info('documents fetch complete')
def extract_url(self):
    logger.info('start search images from document')
    cursor = self.conn.cursor()
    # bit 0 of state marks a document whose images have already been extracted
    sql = 'select id, feed_name, entry_url, entry_desc from contents where state&1 = 0'
    cursor.execute(sql)
    rows = cursor.fetchall()
    logger.info('get %d new documents', len(rows))
    for row in rows:
        doc_id = row[0]
        feed = row[1]
        docurl = row[2]
        content = row[3]
        logger.info('extract image from doc \'%s\'', docurl)
        for img in BeautifulSoup(content).findAll('img'):
            if img.has_key('src'):
                imgurl = img['src']
                logger.info('find image \'%s\'', imgurl)
                # resolve relative image urls against the document url
                if not imgurl.startswith('http'):
                    imgurl = urljoin(docurl, imgurl)
                    logger.info('image\'s full url is \'%s\'', imgurl)
                cursor.execute('select count(*) from images where url = ?', (imgurl,))
                img_count = cursor.fetchall()[0][0]
                if img_count == 0:
                    logger.info('it is a new image')
                    cursor.execute('insert into images (feed_name,url,state) values (?,?,0)', (feed, imgurl))
                    logger.info('insert this image url to database')
                else:
                    logger.info('this image already exists')
        # mark the document as processed
        cursor.execute('update contents set state = state|1 where id = ?', (doc_id,))
        self.conn.commit()
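# A minimal sketch of the 'images' table used above, inferred from the queries in
# extract_url, check_images and download_images; the real DDL and the exact values
# of the STATE_* bit flags are defined elsewhere, so the types and defaults below
# are assumptions:
#
#   create table if not exists images (
#       id        integer primary key autoincrement,
#       feed_name text,
#       url       text,
#       filename  text,
#       state     integer default 0,                     -- bit flags or-ed in by download_images
#       crtime    timestamp default current_timestamp
#   );
#
# state = 0 means the image has not been tried yet; STATE_SUCCESS, STATE_BADURL and
# STATE_NETWORK_ERROR are or-ed into state after a download attempt, and check_images
# retries rows whose STATE_SUCCESS bit is still clear and whose crtime is within the
# last 24 hours.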
def fetch(self):
    logger.info('start fetch images')
    self.extract_url()
    self.check_images()
    logger.info('images fetch complete')