def fetchFeed(feed):
    """Parse ``feed`` and store every entry whose URL is not saved for it yet.

    The feed is read with ``bloghu_parse`` when its URL matches ``bloghu_re``
    and with ``parse`` otherwise; ``parse`` receives the stored etag and
    modification time.  The etag and modification time found on the parse
    result are copied back onto ``feed`` when they are present.

    Returns the number of items created.  A feed that cannot be parsed
    yields 0, so every code path returns an integer.
    """
    counter = 0
    modified = feed.modified.timetuple() if feed.modified else None
    if bloghu_re.match(feed.url):
        f = bloghu_parse(feed)
    else:
        f = parse(feed.url, etag=feed.etag, modified=modified)
    if not f:
        print('[!] cannot parse %s - %s' % (feed.name, feed.url))
        return counter

    # Remember the validators of this fetch; a result that lacks them leaves
    # the previously stored values untouched.
    try:
        feed.etag = f.etag
    except AttributeError:
        pass
    try:
        feed.modified = datetime(*f.modified[:6])
    except (TypeError, AttributeError):
        pass

    # NOTE(review): `updated` is put back right before the final save();
    # presumably saving items can change it -- confirm against the models.
    d = feed.updated
    for item in reversed(f['entries']):
        try:
            tmp_date = datetime(*item['updated_parsed'][:6])
        except Exception:
            # No usable timestamp on the entry: use the time of the import.
            tmp_date = datetime.now()

        try:
            c = cleaner.clean_html(clean(unicode(''.join([x.value for x in item.content]))))
        except Exception:
            # No (cleanable) content element: take the first alternative
            # field the entry has, or keep the placeholder text.
            c = u'No content found, plz check the feed and fix me =)'
            for key in ['media_text', 'summary', 'description', 'media:description']:
                if item.has_key(key):
                    c = try_clean(item[key])
                    break

        t = unicode(item.get('title', ''))
        try:
            u = urlSanitize(item['links'][0]['href'])
        except Exception:
            u = ''

        # Skip entries already stored for this feed; count() asks the
        # database for a number instead of loading the matching rows.
        if feed.item_set.filter(url=u, feed=feed).count() > 0:
            continue

        new_item = Item(url=u, title=t, content=c, feed=feed, date=tmp_date)
        new_item.save()
        counter += 1

    feed.updated = d
    feed.save()
    return counter
def fetchFeed(feed):
    """Download ``feed``, store its new entries and return how many were added.

    The feed is fetched with ``requests`` as a conditional GET built from the
    etag and modification time stored on ``feed``, then handed to ``parse``.
    An entry is skipped when it has no link, when its URL is already stored
    for this feed, or when an item with the same title and content exists.
    Tags of a stored entry are created on demand and linked to the new item.

    Returns the number of items created; 0 when the feed could not be
    fetched, came back empty or could not be parsed.
    """
    if verbose:
        print(u'[!] parsing %s - %s' % (feed.name, feed.url))
    counter = 0

    # Conditional GET: the validators saved from the previous response are
    # sent as If-None-Match / If-Modified-Since.  (ETag and Last-Modified are
    # the *response* header names and are ignored when sent in a request.)
    headers = {}
    if feed.etag:
        headers['If-None-Match'] = feed.etag
    if feed.modified:
        # feed.modified is stored without its zone token (see the parsing
        # further down); HTTP dates are expressed in GMT.
        headers['If-Modified-Since'] = feed.modified.strftime("%a, %d %b %Y %H:%M:%S GMT")

    try:
        resp = requests.get(feed.url, headers=headers, timeout=4)
    except Exception:
        # Whatever went wrong while fetching, report it and skip this feed;
        # KeyboardInterrupt/SystemExit still propagate.
        print >>sys.stderr, u"[!] couldn't fetch feed, skip", feed.name
        print >>sys.stderr, traceback.format_exc()
        return counter

    if not resp.text.strip():
        # Empty or whitespace-only body (e.g. a "not modified" answer).
        return counter

    f = parse(resp.text)
    if not f:
        print >>sys.stderr, u'[!] cannot parse %s - %s' % (feed.name, feed.url)
        return counter

    # Store the validators for the next run.  A missing header keeps the old
    # value; .get() is used because a missing key raises KeyError, which the
    # former `except AttributeError` never caught.
    etag = resp.headers.get('etag')
    if etag is not None:
        feed.etag = etag
    last_modified = resp.headers.get('last-modified')
    if last_modified:
        try:
            # Drop the trailing zone token before parsing.
            feed.modified = datetime.strptime(' '.join(last_modified.split()[:-1]),
                                              "%a, %d %b %Y %H:%M:%S")
        except ValueError:
            # Unparseable date: keep the previously stored value.
            pass

    # NOTE(review): `updated` is put back right before the final save();
    # presumably saving items can change it -- confirm against the models.
    d = feed.updated
    for item in reversed(f['entries']):
        links = item.get('links') or []
        href = links[0].get('href') if links else None
        if not href:
            # An entry without a link cannot be stored or de-duplicated.
            continue
        try:
            u = urlSanitize(href)
        except Exception:
            # Sanitizing failed: fall back to the URL exactly as published.
            u = href
        if not u or feed.item_set.filter(url=u).count() > 0:
            continue
        if verbose:
            print(' '.join(('adding', u.encode('utf8'))))

        try:
            tmp_date = datetime(*item['updated_parsed'][:6])
        except Exception:
            # No usable timestamp on the entry: use the time of the import.
            tmp_date = datetime.now()

        try:
            c = cleaner.clean_html(clean(unicode(''.join([x.value for x in item.content]))))
        except Exception:
            # No (cleanable) content element: try the first alternative
            # field the entry has, otherwise keep the placeholder text.
            c = u'No content found, plz check the feed and fix me =)'
            for key in ['media_text', 'summary', 'description', 'media:description']:
                if item.has_key(key):
                    try:
                        c = cleaner.clean_html(clean(unicode(item[key])))
                    except Exception:
                        # Cleaning failed: the placeholder stays and the
                        # offending text is printed for inspection.
                        print(clean(unicode(item.get(key))))
                    break

        t = unicode(item.get('title', ''))
        # Same title and content already stored for this feed: a duplicate
        # published under a different URL.
        if feed.item_set.filter(title=t, content=c).count() > 0:
            continue

        new_item = Item(url=u, title=t, content=c, feed=feed, date=tmp_date)
        new_item.save()
        for tag in item.get('tags', []):
            term = tag.get('term')
            if not term:
                continue
            stored_tag = Tag.objects.get_or_create(tag=term, scheme=tag.get('scheme'))[0]
            # Ask the database about this one item instead of loading every
            # item of the tag just to test membership.
            if stored_tag.items.filter(pk=new_item.pk).count() == 0:
                stored_tag.items.add(new_item)
        counter += 1

    feed.updated = d
    feed.save()
    return counter
# Walk over every feed, store the entries whose title is not saved for that
# feed yet, and finally report how many items were created in total.
counter = 0
for feed in feeds:
    parsed = parse(feed.url)
    if not parsed:
        print('[!] cannot parse %s - %s' % (feed.name, feed.url))
        continue
    print('[!] parsing %s - %s' % (feed.name, feed.url))

    # NOTE(review): `updated` is put back right before save(); presumably
    # saving items can change it -- confirm against the models.
    previous_update = feed.updated
    for entry in reversed(parsed['entries']):
        # Body text: the first content element when there is one, otherwise
        # the first alternative field present, otherwise a placeholder.
        try:
            body = unicode(entry.content[0].value)
        except:
            for fallback_key in ('media_text', 'summary'):
                if entry.has_key(fallback_key):
                    body = unicode(entry[fallback_key])
                    break
            else:
                body = u'Not found any content, plz check the feed and fix me =)'

        title = unicode(entry['title'])
        link = entry['links'][0]['href']

        # Only entries whose title is unknown for this feed are stored.
        if not feed.item_set.filter(title=title).all():
            stored = Item(url=link, title=title, content=body, feed=feed)
            stored.save()
            counter += 1

    feed.updated = previous_update
    feed.save()
print('[!] %d item added' % counter)