Example #1
    def store(self, feed, entries):
        # requires: from datetime import datetime; from time import mktime
        feed_name = feed['name']
        feed_refresh_time = feed.get('refresh')
        if feed_refresh_time is None:
            feed_refresh_time = self.get_feed_refresh_time(feed_name)

        c = self.conn.cursor()
        max_time = None
        for entry in entries:
            # feed['type'] is fp.version[0:3]: 'ato' for Atom feeds, 'rss' for RSS
            if feed['type'] == 'ato':
                entry_time = datetime.fromtimestamp(mktime(entry.updated_parsed))
            elif feed['type'] == 'rss':
                entry_time = datetime.fromtimestamp(mktime(entry.published_parsed))
            else:
                continue

            # only store entries newer than the last refresh, tracking the
            # newest timestamp seen across the whole batch
            if feed_refresh_time is None or entry_time > feed_refresh_time:
                if max_time is None or max_time < entry_time:
                    max_time = entry_time

                logger.info('found new article \'%s\' in feed \'%s\'', entry.title, feed_name)
                if feed['type'] == 'ato':
                    self.save_doc(c, feed_name, entry.id, entry.title, entry.content[0].value, entry.link, entry_time)
                else:
                    self.save_doc(c, feed_name, entry.id, entry.title, entry.description, entry.link, entry_time)

        # remember the newest entry time as the feed's new refresh point
        if max_time is not None:
            feed['refresh'] = max_time

        self.conn.commit()
Example #2
    def download(self, feed, url, filename, extname=None):
        # requires: import os, urllib2
        # csdn image URLs carry query strings that must be stripped first
        if feed.startswith('csdn'):
            true_url = url.split('?')[0]
        else:
            true_url = url
        logger.info('downloading image \'%s\'', url)
        try:
            req = urllib2.Request(true_url)
            req.add_header('User-Agent', USER_AGENT)
            resp = urllib2.urlopen(req, None, DOWNLOAD_TIMEOUT)
            data = resp.read()
            resp.close()
            if extname is None:
                # in Python 3 this would be resp.getheader('Content-Type')
                content_type = resp.info().getheader('Content-Type').lower()
                extname = self.get_extname(content_type)
                if extname is None:
                    logger.warning('unsupported content type \'%s\'', content_type)
                    return (None, STATE_BADURL, 'unsupported content type \'' + content_type + '\'')
            file_fullname = filename + extname
            # shard files into two directory levels keyed by the first two characters
            fn = os.path.join(self.image_dir, file_fullname[:1], file_fullname[1:2], file_fullname)
            with open(fn, 'wb') as f:
                f.write(data)
            return (file_fullname, STATE_SUCCESS, 'success')
        except Exception as e:
            logger.error('an error occurred while downloading %s', url)
            logger.exception(e)
            return (None, STATE_NETWORK_ERROR, 'network error')
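The get_extname helper is not shown among these examples; a minimal sketch of what it might look like, assuming it simply maps the response's MIME type to a file extension and returns None for anything unsupported (matching the caller's check above):

    def get_extname(self, content_type):
        # hypothetical helper, not part of the original examples
        # Content-Type may carry parameters, e.g. 'image/png; charset=binary'
        mime = content_type.split(';')[0].strip()
        return {
            'image/jpeg': '.jpg',
            'image/png': '.png',
            'image/gif': '.gif',
        }.get(mime)  # None signals an unsupported type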
Example #3
    def save_doc(self, cursor, feed_name, doc_id, doc_title, doc_content, doc_url, update_time):
        # replace any previously saved copy of this article
        sql = 'delete from contents where entry_url = ?'
        cursor.execute(sql, (doc_url,))

        sql = 'insert into contents(feed_name,entry_id,entry_title,entry_desc,entry_url,entry_time,state) values(?,?,?,?,?,?,0)'
        params = (feed_name, doc_id, doc_title, doc_content, doc_url, update_time)
        cursor.execute(sql, params)
        logger.info('saved new article \'%s\' to database', doc_title)
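The contents table itself is never created in these examples. A plausible schema, inferred from the columns the queries read and write (the column types are assumptions):

    # hypothetical DDL inferred from the queries above; types are guesses
    conn.execute('''
        create table if not exists contents (
            id          integer primary key autoincrement,
            feed_name   text,
            entry_id    text,
            entry_title text,
            entry_desc  text,
            entry_url   text,
            entry_time  timestamp,
            state       integer default 0
        )
    ''')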
Example #4
    def check_images(self):
        cursor = self.conn.cursor()

        # new images: state = 0 means download has never been attempted
        logger.info('listing all new images for download')
        sql = 'select id, feed_name, url from images where state = 0'
        cursor.execute(sql)
        rows = cursor.fetchall()
        self.download_images(rows, cursor)

        # recent failures: the success bit is unset and the image was
        # created within the last 24 hours, so retry it
        logger.info('listing recent failed images for download')
        sql = "select id, feed_name, url from images where state&? = ? and crtime > datetime('now','-24 hour')"
        cursor.execute(sql, (STATE_SUCCESS, 0))
        rows = cursor.fetchall()
        self.download_images(rows, cursor)

        cursor.close()
        self.conn.commit()
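Likewise, the images table is only queried, never defined, in these examples. A sketch inferred from the columns used here and in download_images (types and the crtime default are assumptions):

    # hypothetical DDL inferred from the queries above; types are guesses
    conn.execute('''
        create table if not exists images (
            id        integer primary key autoincrement,
            feed_name text,
            url       text,
            filename  text,
            state     integer default 0,
            crtime    timestamp default current_timestamp
        )
    ''')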
Example #5
    def fetch_feed(self, feed):
        # requires: import feedparser
        feed_name = feed['name']
        try:
            logger.info('start fetching feed \'%s\'', feed_name)
            fp = feedparser.parse(feed['url'])
            if len(fp.entries) > 0:
                # fp.version is e.g. 'atom10' or 'rss20'; its first three
                # characters ('ato'/'rss') serve as the feed type
                feed['type'] = fp.version[0:3]
                logger.info('got %d articles from feed \'%s\'', len(fp.entries), feed_name)
                self.store(feed, fp.entries)
            else:
                logger.info('got nothing from feed \'%s\'', feed_name)
        except Exception as e:
            logger.error('error while fetching feed \'%s\'', feed_name)
            logger.exception(e)
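The 'ato' and 'rss' strings checked in store() come from feedparser's version attribute, which encodes the detected format and version as a single string. A quick illustration (the URL is a placeholder):

    import feedparser

    fp = feedparser.parse('https://example.com/feed.xml')  # placeholder URL
    print(fp.version)        # e.g. 'atom10' or 'rss20'
    print(fp.version[0:3])   # 'ato' or 'rss', as used by store()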
Example #6
    def download_images(self, rows, cursor):
        # requires: import uuid
        for row in rows:
            imgid = row[0]
            feed = row[1]
            url = row[2]
            logger.info('preparing to download image#%d \'%s\'', imgid, url)
            # a uuid1 hex string gives each image a unique local filename
            (filename, state, message) = self.download(feed, url, uuid.uuid1().hex)
            if filename is not None:
                logger.info('image downloaded, local filename is \'%s\'', filename)
                cursor.execute('update images set filename = ?, state = state|? where id = ?', (filename, state, imgid))
            else:
                logger.info('image download failed: %s', message)
                cursor.execute('update images set state = state|? where id = ?', (state, imgid))
            self.conn.commit()
Example #7
    def fetch(self):
        logger.info('start fetching documents')
        for feed in config.feeds:
            self.fetch_feed(feed)
        logger.info('document fetch complete')
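config.feeds is not shown either; judging from the keys read in store() and fetch_feed(), each entry is a dict with at least 'name' and 'url' ('type' and 'refresh' are filled in at runtime). A hypothetical configuration module:

    # hypothetical config module; the feed names and URLs are made up
    feeds = [
        {'name': 'csdn-blog', 'url': 'http://blog.csdn.net/example/rss'},
        {'name': 'sample-atom', 'url': 'https://example.com/atom.xml'},
    ]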
Example #8
    def extract_url(self):
        # requires: from BeautifulSoup import BeautifulSoup; from urlparse import urljoin
        logger.info('start searching documents for images')
        cursor = self.conn.cursor()
        # state bit 0 marks documents whose images were already extracted
        sql = 'select id, feed_name, entry_url, entry_desc from contents where state&1 = 0'
        cursor.execute(sql)
        rows = cursor.fetchall()
        logger.info('got %d new documents', len(rows))
        for row in rows:
            doc_id = row[0]
            feed = row[1]
            docurl = row[2]
            content = row[3]
            logger.info('extracting images from doc \'%s\'', docurl)
            for img in BeautifulSoup(content).findAll('img'):
                if img.has_key('src'):  # BeautifulSoup 4 spells this img.has_attr('src')
                    imgurl = img['src']
                    logger.info('found image \'%s\'', imgurl)
                    if not imgurl.startswith('http'):
                        # resolve relative image URLs against the document URL
                        imgurl = urljoin(docurl, imgurl)
                    logger.info('image\'s full url is \'%s\'', imgurl)

                    cursor.execute('select count(*) from images where url = ?', (imgurl,))
                    img_count = cursor.fetchall()[0][0]
                    if img_count == 0:
                        logger.info('it is a new image')
                        cursor.execute('insert into images (feed_name,url,state) values (?,?,0)', (feed, imgurl))
                        logger.info('inserted this image url into the database')
                    else:
                        logger.info('this image already exists')
            cursor.execute('update contents set state = state|1 where id = ?', (doc_id,))
            self.conn.commit()
Example #9
    def fetch(self):
        logger.info('start fetching images')
        self.extract_url()
        self.check_images()
        logger.info('image fetch complete')