Example #1
def feed(request):
    """
    Fetches feeds and looks for new posts;
    there is not a lot in the way of error catching, which may have to be fixed.

    This is theoretically the page that causes the most CPU usage
    in the entire application, so optimization is needed.

    TODO: Add more error catching.

    No input sanitizing is done here; this is a background task anyway.

    @param request: Django's request object, GET is used unfiltered.
    """
    import logging                      # used below; not imported in this excerpt
    from django.conf import settings    # settings.TQWOKEY is used for task names below
    from google.appengine.api import urlfetch
    from sfdr.models import Feed, Entry
    feedobj = Feed.objects.get(id=request.GET['feed'])

    result = urlfetch.fetch(feedobj.url)
    if result.status_code == 200:
        from sfdr.modules.feedparse import process, itemTester
        items = process(result.content.lstrip())
        logging.debug("loaded %s items from RSS-feed %s" % (len(items), feedobj.name))
        tester = itemTester()
        u = tester.process(feedobj, items)
        del items
        from datetime import datetime, timedelta
        from google.appengine.api.taskqueue import add, TombstonedTaskError, TaskAlreadyExistsError #@UnresolvedImport
        now = datetime.now()
        if u:
            from sfdr.modules.feedparse import cfWrapper
            logging.debug("U loaded %s" % len(u))
            logging.debug("Strange %s " % str(u))
            cf = cfWrapper()
            from pickle import dumps
            for i in u:
                (i, cfo) = cf.parse(i, feedobj)
                if not i['date']:
                    now = now + timedelta(seconds=1)
                    i['date'] = now
                    logging.debug("Added a second to : \"%s\"" % i['guid'])
                try:
                    cfd = cfo["cfd"]
                except (TypeError, KeyError):
                    # cfo may be None or lack a "cfd" key
                    cfd = None
                e = Entry(
                          feed=feedobj,
                          key="%s->%s" % (feedobj.id, i['guid']),
                          date=i['date'],
                          guid=i['guid'],
                          title=i['title'],
                          url=i['link'],
                          summary=i['summary'],
                          other=dumps({"cf":cfd})
                         )
                e.save()
                logging.debug("Added post \"%s\"" % i['guid'])
                try:
                    from hashlib import sha1
                    add(
                        url="/tasks/post.py",
                        name=sha1(e.key).hexdigest() + settings.TQWOKEY,
                        params={"post":str(e.key)},
                        method="GET",
                        queue_name="posts"
                       )
                    if cfo and "call" in cfo:
                        logging.debug("Adding OWL call")
                        add(
                            url=cfo["call"],
                            params={"entry":e.key},
                            method="GET",
                            queue_name="longterm",
                            eta=datetime.now() + timedelta(days=7)
                           )
                except TaskAlreadyExistsError: #@UnusedVariable
                    logging.error("Existing task encountered: " + str(i))
                except TombstonedTaskError:
                    logging.error("Error occurred, TombstonedTask: " + str(i))
        feedobj.lastCk = datetime.now()
        if feedobj.nextCk < datetime.now() + timedelta(minutes=5):
            logging.debug("Resceduling now")
            feedobj.nextCk = datetime.now() + timedelta(minutes=30)
            add(
                url="/tasks/feeds/fetch.py",
                params={"feed":str(request.GET['feed'])},
                queue_name="feed-burns",
                method="GET",
                eta=datetime.now() + timedelta(minutes=30)
               )
        else:
            logging.debug("Not Resceduling Now: %s, nextCheck: %s  %s" %
                          (
                           str(datetime.now() + timedelta(minutes=5)),
                           str(feedobj.nextCk),
                           (feedobj.nextCk < datetime.now() + timedelta(minutes=5)))
                          )
        feedobj.save()
        from django.http import HttpResponse
        return HttpResponse()
    else:
        logging.error("Unknown Error!")
        logging.error("Debug: %s" % result)
        from django.http import HttpResponseServerError
        return HttpResponseServerError()
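
For context, a minimal sketch (not part of the original module) of how the fetch task above might be enqueued in the first place. The handler URL, queue name, and "feed" parameter are taken from the self-rescheduling call inside feed(); the wrapper function schedule_fetch is hypothetical.

from datetime import datetime, timedelta
from google.appengine.api.taskqueue import add

def schedule_fetch(feed_id, delay_minutes=30):
    # Enqueue a GET to the feed view above on the "feed-burns" queue,
    # mirroring the call feed() makes when it reschedules itself.
    add(
        url="/tasks/feeds/fetch.py",
        params={"feed": str(feed_id)},
        method="GET",
        queue_name="feed-burns",
        eta=datetime.now() + timedelta(minutes=delay_minutes),
       )
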
Example #2
def EncR_Rs(request):
    """
    EncR rescanner.
    This page simply rescans the feed, extracts the given post,
    and then checks whether or not there are any new enclosed files.

    @param request: Django's request object, GET is used unfiltered.
    """
    import logging                      # used below; not imported in this excerpt
    from google.appengine.api import urlfetch
    from sfdr.models import Entry
    enobj = Entry.objects.get(key=request.GET['entry'])
    feedobj = enobj.feed
    result = urlfetch.fetch(feedobj.url)
    #===========================================================================
    # RSS fetched; did it return correctly?
    # In that case, let's try to make some sense out of it.
    #===========================================================================
    if result.status_code == 200:
        from sfdr.modules.feedparse import process
        items = process(result.content.lstrip())
        o = None
        #=======================================================================
        # For some reason, this is the best way I could find to do this.
        #
        # In theory the entry should be caught in this next loop,
        # but if it has dropped out of the feed it won't be.
        #
        # Another scenario is that the entry changed its GUID,
        # but that would be asking for trouble.
        #=======================================================================
        for i in items:
            if enobj.key == "%s->%s" % (feedobj.id, i['guid']):
                o = i
                break
        if o:
            from pickle import loads
            c = loads(feedobj.cfConf)
            #===================================================================
            # For some reason, usually upgrades, the extra data the parser gave
            # us did not make it to this point; the fix below adds empty data,
            # clean and simple.
            #===================================================================
            try:
                ose = loads(enobj.other)
                ol = ose["cf"]["l"]
            except Exception:
                try:
                    # Older entries stored the list at the top level; rewrap it.
                    ose = loads(enobj.other)
                    ol = ose["l"]
                    ose = {"cf":{"l":ol}}
                except Exception:
                    logging.info("Error loading old entry data: %s" % enobj.other)
                    ol = []
                    ose = {"cf":{}}

            #===================================================================
            # Applying the EncR parser. This should be moved into its own
            # subroutine to make it easier to maintain.
            #
            # API: a function returning the string list.
            #
            # TODO: Move the EncR parser into its own subroutine.
            #===================================================================
            logging.debug("EncR parser applied")
            if c['type'] == "EncR" and c['owl']:
                l = []
                e = []
                for i in c['Fs']:
                    for j in o['enc']:
                        if i["query"] in j:
                            l.append(i["set"])
                            e.append(j)
                #===============================================================
                # OK! So far so good; now we need to do some simple stuff:
                # 1. Did we get all of them?
                #    - If not, tell someone.
                # 2. Join the lists (for later storage).
                # 3. Diff the lists (did any new ones show up?).
                #
                # TODO: "Tell someone" should email the owner, not just log it.
                #===============================================================
                l = set(l)
                logging.debug("l %s" % str(list(l)))
                ol = set(ol)
                logging.debug("ol %s" % str(list(ol)))
                if len(l) != len(o['enc']):
                    logging.warning("Didn't get %s" % str(list(set(o['enc']) - set(e))))
                lt = list(l | ol)
                logging.debug("%s" % str(list(lt)))
                l = list(l - ol)
                from datetime import datetime, timedelta
                from google.appengine.api.taskqueue import add
                if len(l) > 0:
                    #===========================================================
                    # Something new was found, let us do something about it.
                    #
                    # TODO: this assembler really should be its own function
                    # (a sketch of that extraction follows this example).
                    #===========================================================
                    s = c['owlPre']
                    for i in l:
                        if i == l[0]:
                            s = s + i
                        elif i == l[-1]:
                            s = s + c['Sl'] + i
                        else:
                            s = s + c['So'] + i
                    s = s + c['owlPost'] + o["title"]
                    st = s
                    logging.debug("owl: %s" % (st))
                    svs = feedobj.postTo
                    if svs:
                        from sfdr.modules.post import push
                        from sfdr.models import Service
                        for skey in svs:
                            service = Service.objects.get(key=skey)
                            push(service, st, enobj.shortURL)
                    #===========================================================
                    # Now that we found one new entry, we reschedule the task
                    # to run next week.
                    #===========================================================
                    add(
                       url="/tasks/pin/encr_rs.py",
                       params={"entry":enobj.key, "count":1},
                       method="GET",
                       queue_name="longterm",
                       eta=datetime.now() + timedelta(days=7)
                      )
                    #===========================================================
                    # Important! Store the data back or we'll get
                    # an infinite loop.
                    #===========================================================
                    from pickle import dumps
                    ose["cf"]["l"] = lt
                    enobj.other = dumps(ose)
                    enobj.save()
                else:
                    #===========================================================
                    # Reschedule up to 3 times; if 3 weeks go by without any
                    # new enclosed files, stop rescheduling.
                    #===========================================================
                    if ("count" not in request.GET) or (int(request.GET["count"]) < 4):
                        if "count" not in request.GET:
                            count = 1
                        else:
                            count = int(request.GET["count"]) + 1
                        add(
                            url="/tasks/pin/encr_rs.py",
                            params={"entry":enobj.key, "count":count},
                            method="GET",
                            queue_name="longterm",
                            eta=datetime.now() + timedelta(days=7)
                        )
                        logging.debug("No new enclosures, trying again next week!")
                    else:
                        logging.debug("No new enclosures, and timer ran out, Quiting!")
            else:
                logging.warning("No config on EncR cf")
    #===========================================================================
    # In any case, send success to the task queue API.
    #===========================================================================
    from django.http import HttpResponse
    return HttpResponse()
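
The TODOs above ask for the enclosure "assembler" to be pulled out into its own function. A minimal sketch of that extraction, assuming the same config keys (owlPre, owlPost, Sl, So) used in EncR_Rs; the name assemble_owl is hypothetical.

def assemble_owl(c, labels, title):
    # Join the enclosure labels with the configured separators: "So" between
    # items, "Sl" before the last one, then append the "owlPost" suffix and
    # the post title. Mirrors the inline loop in EncR_Rs above, which could
    # call it as assemble_owl(c, l, o["title"]).
    s = c['owlPre']
    for idx, label in enumerate(labels):
        if idx == 0:
            s = s + label
        elif idx == len(labels) - 1:
            s = s + c['Sl'] + label
        else:
            s = s + c['So'] + label
    return s + c['owlPost'] + title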