Example #1
# assumed imports: parse comes from feedparser; bloghu_re, bloghu_parse,
# clean, cleaner, try_clean, urlSanitize and the Item model are project helpers
from datetime import datetime
from feedparser import parse

def fetchFeed(feed):
    counter = 0
    modified = feed.modified.timetuple() if feed.modified else None
    if bloghu_re.match(feed.url):
        # blog.hu feeds need special handling
        f = bloghu_parse(feed)
    else:
        # feedparser does a conditional fetch when etag/modified are given
        f = parse(feed.url, etag=feed.etag, modified=modified)
    if not f:
        print '[!] cannot parse %s - %s' % (feed.name, feed.url)
        return counter
    # remember the validators for the next conditional fetch
    try:
        feed.etag = f.etag
    except AttributeError:
        pass
    try:
        feed.modified = datetime(*f.modified[:6])
    except (TypeError, AttributeError):
        pass
    d = feed.updated
    for item in reversed(f['entries']):
        try:
            tmp_date = datetime(*item['updated_parsed'][:6])
        except (KeyError, TypeError, ValueError):
            tmp_date = datetime.now()
        # prefer the full content; fall back to summary-like fields
        try:
            c = cleaner.clean_html(clean(unicode(''.join(x.value for x in item.content))))
        except Exception:
            c = u'No content found, please check the feed and fix me =)'
            for key in ('media_text', 'summary', 'description', 'media:description'):
                if key in item:
                    c = try_clean(item[key])
                    break
        t = unicode(item.get('title', ''))
        try:
            u = urlSanitize(item['links'][0]['href'])
        except Exception:
            u = ''
        # skip entries we have already stored (item_set is already scoped to this feed)
        if feed.item_set.filter(url=u).exists():
            continue
        new_item = Item(url=u, title=t, content=c, feed=feed, date=tmp_date)
        new_item.save()
        counter += 1
    # restore the original updated timestamp before saving
    feed.updated = d
    feed.save()
    return counter
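
Example #1 calls a try_clean helper that isn't shown here. Based on the inlined fallback logic in Example #2, it presumably wraps the same cleaning pipeline in a try/except; a minimal sketch, assuming clean and cleaner are the project helpers the examples already use:

def try_clean(raw):
    # hypothetical reconstruction of the try_clean helper, inferred from
    # the inlined cleaning fallback in Example #2; the real helper may differ
    try:
        return cleaner.clean_html(clean(unicode(raw)))
    except Exception:
        # if HTML cleaning fails, fall back to the lightly normalized text
        return clean(unicode(raw))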
Example #2
# assumed imports: parse comes from feedparser; urlSanitize, clean, cleaner,
# verbose and the Item/Tag models are project helpers
import sys
import traceback
from datetime import datetime

import requests
from feedparser import parse

def fetchFeed(feed):
    if verbose: print u'[!] parsing %s - %s' % (feed.name, feed.url)
    counter = 0
    # conditional GET: send the validators from the last successful fetch
    # (the request headers are If-None-Match/If-Modified-Since, not ETag/Last-Modified)
    headers = {'If-None-Match': feed.etag}
    if feed.modified:
        headers['If-Modified-Since'] = feed.modified.strftime("%a, %d %b %Y %H:%M:%S GMT")
    try:
        resp = requests.get(feed.url, headers=headers, timeout=4)
    except Exception:
        print >>sys.stderr, u"[!] couldn't fetch feed, skip", feed.name
        print >>sys.stderr, traceback.format_exc()
        return counter
    # 304 Not Modified or an empty body means there is nothing new
    if resp.status_code == 304 or not resp.text.strip():
        return counter
    f = parse(resp.text)
    if not f:
        print >>sys.stderr, u'[!] cannot parse %s - %s' % (feed.name, feed.url)
        return counter
    # remember the validators for the next conditional fetch
    # (a missing key in resp.headers raises KeyError, not AttributeError)
    if 'etag' in resp.headers:
        feed.etag = resp.headers['etag']
    try:
        # drop the trailing "GMT" token before parsing
        feed.modified = datetime.strptime(' '.join(resp.headers['last-modified'].split()[:-1]),
                                          "%a, %d %b %Y %H:%M:%S")
    except (KeyError, ValueError):
        pass
    d = feed.updated
    for item in reversed(f['entries']):
        try:
            u = urlSanitize(item['links'][0]['href'])
        except Exception:
            # couldn't sanitize the url, leave it as is
            u = item['links'][0].get('href')
        if not u or feed.item_set.filter(url=u).exists():
            continue

        if verbose: print 'adding', u.encode('utf8')
        try:
            tmp_date = datetime(*item['updated_parsed'][:6])
        except (KeyError, TypeError, ValueError):
            tmp_date = datetime.now()

        # prefer the full content; fall back to summary-like fields
        try:
            c = cleaner.clean_html(clean(unicode(''.join(x.value for x in item.content))))
        except Exception:
            c = u'No content found, please check the feed and fix me =)'
            for key in ('media_text', 'summary', 'description', 'media:description'):
                if key in item:
                    try:
                        c = cleaner.clean_html(clean(unicode(item[key])))
                    except Exception:
                        # cleaning failed: dump the raw value for debugging
                        # and keep the placeholder content
                        print clean(unicode(item.get(key)))
                    break
        t = unicode(item.get('title', ''))
        # also skip entries that changed url but carry the same payload
        if feed.item_set.filter(title=t, content=c).exists():
            continue
        new_item = Item(url=u, title=t, content=c, feed=feed, date=tmp_date)
        new_item.save()
        for tag in item.get('tags', []):
            if tag.get('term'):
                tag_obj = Tag.objects.get_or_create(tag=tag['term'], scheme=tag.get('scheme'))[0]
                # M2M add() is a no-op if the relation already exists
                tag_obj.items.add(new_item)
        counter += 1
    # restore the original updated timestamp before saving
    feed.updated = d
    feed.save()
    return counter
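
Both variants write to Django models that aren't part of the snippet. Below is a minimal sketch of the fields the code actually touches; the field names are taken from the examples, while the types and options are assumptions:

from django.db import models

# hypothetical model definitions inferred from the attributes used above;
# the real project may use different field types and options
class Feed(models.Model):
    name = models.CharField(max_length=255)
    url = models.URLField()
    etag = models.CharField(max_length=255, blank=True, null=True)
    modified = models.DateTimeField(blank=True, null=True)
    updated = models.DateTimeField(blank=True, null=True)

class Item(models.Model):
    feed = models.ForeignKey(Feed, on_delete=models.CASCADE)  # provides feed.item_set
    url = models.URLField()
    title = models.TextField()
    content = models.TextField()
    date = models.DateTimeField()

class Tag(models.Model):
    tag = models.CharField(max_length=255)
    scheme = models.CharField(max_length=255, blank=True, null=True)
    items = models.ManyToManyField(Item)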
Example #3
# the enclosing function is not part of the snippet; a wrapper is assumed here
# so the fragment runs; parse and Item are the same helpers as in the examples above
def fetchFeeds(feeds):
    counter = 0
    for feed in feeds:
        f = parse(feed.url)
        if not f:
            print '[!] cannot parse %s - %s' % (feed.name, feed.url)
            continue
        print '[!] parsing %s - %s' % (feed.name, feed.url)
        d = feed.updated
        for item in reversed(f['entries']):
            # prefer the full content; fall back to summary-like fields
            try:
                c = unicode(item.content[0].value)
            except AttributeError:
                if 'media_text' in item:
                    c = unicode(item['media_text'])
                elif 'summary' in item:
                    c = unicode(item['summary'])
                else:
                    c = u'No content found, please check the feed and fix me =)'
            t = unicode(item.get('title', ''))
            u = item['links'][0]['href']
            # deduplicate by title only
            if feed.item_set.filter(title=t).exists():
                continue
            new_item = Item(url=u, title=t, content=c, feed=feed)
            new_item.save()
            counter += 1
        feed.updated = d
        feed.save()
    print '[!] %d items added' % counter
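
All three examples run entry content through cleaner.clean_html, which matches the API of lxml's Cleaner. A sketch of how the shared cleaner and clean helpers might be defined; the Cleaner options and the whitespace-normalizing clean are assumptions, not the project's actual configuration:

from lxml.html.clean import Cleaner

# lxml's Cleaner strips scripts, styles and other unwanted markup;
# clean_html() accepts a string and returns the sanitized markup as a string
cleaner = Cleaner(scripts=True, javascript=True, style=True,
                  page_structure=False, links=False)

def clean(text):
    # hypothetical pre-cleaning step: collapse runs of whitespace
    return ' '.join(text.split())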