Example #1
0
def loadHansard(hansard=None, url=None, session=None):
    if hansard:
        try:
            return HansardCache.objects.get(hansard=hansard)
        except HansardCache.DoesNotExist:
            if hansard.url:
                return loadHansard(url=hansard.url, session=hansard.session)
    elif url and session:
        normurl = parsetools.normalizeHansardURL(url)
        if normurl != url:
            print "WARNING: Normalized URL %s to %s" % (url, normurl)
        try:
            cached = HansardCache.objects.get(hansard__url=normurl)
            if cached.hansard.session != session:
                raise Exception(
                    "Found cached Hansard, but session doesn't match...")
            return cached
        except HansardCache.DoesNotExist:
            print "Downloading Hansard from %s" % normurl
            req = urllib2.Request(normurl)
            page = urllib2.urlopen(req).read()
            #try:
            number = _getHansardNumber(page)
            #except Exception, e:
            #    print e
            #    print "Couldn't get Hansard number for"
            #    print url
            #    print "Please enter: ",
            #    number = sys.stdin.readline().strip()
            try:
                hansard = Hansard.objects.get(session=session, number=number)
            except Hansard.DoesNotExist:
                hansard = Hansard(session=session, number=number, url=normurl)
                hansard.save()
            else:
                if hansard.url != normurl:
                    raise Exception(
                        "Hansard exists, with a different url: %s %s" %
                        (normurl, hansard.url))
            cache = HansardCache(hansard=hansard)
            cache.saveHTML(page)
            cache.save()
            return cache
    else:
        raise Exception("Either url/session or hansard are required")
def loadHansard(hansard=None, url=None, session=None):
    if hansard:
        try:
            return HansardCache.objects.get(hansard=hansard)
        except HansardCache.DoesNotExist:
            if hansard.url:
                return loadHansard(url=hansard.url, session=hansard.session)
    elif url and session:
        normurl = parsetools.normalizeHansardURL(url)
        if normurl != url:
            print "WARNING: Normalized URL %s to %s" % (url, normurl)
        try:
            cached = HansardCache.objects.get(hansard__url=normurl)
            if cached.hansard.session != session:
                raise Exception("Found cached Hansard, but session doesn't match...")
            return cached
        except HansardCache.DoesNotExist:
            print "Downloading Hansard from %s" % normurl
            req = urllib2.Request(normurl)
            page = urllib2.urlopen(req).read()
            #try:
            number = _getHansardNumber(page)
            #except Exception, e:
            #    print e
            #    print "Couldn't get Hansard number for"
            #    print url
            #    print "Please enter: ",
            #    number = sys.stdin.readline().strip()
            try:
                hansard = Hansard.objects.get(session=session, number=number)
            except Hansard.DoesNotExist:
                hansard = Hansard(session=session, number=number, url=normurl)
                hansard.save()
            else:
                if hansard.url != normurl:
                    raise Exception("Hansard exists, with a different url: %s %s" % (normurl, hansard.url))
            cache = HansardCache(hansard=hansard)
            cache.saveHTML(page)
            cache.save()
            return cache
    else:
        raise Exception("Either url/session or hansard are required")
Example #3
0
def normalize_hansard_urls():
    """Rewrite every stored Hansard URL into its normalized canonical form."""
    for hansard in Hansard.objects.all():
        canonical = parsetools.normalizeHansardURL(hansard.url)
        if canonical == hansard.url:
            # Already canonical -- skip the write.
            continue
        hansard.url = canonical
        hansard.save()