def get_application_links(page_url):
    """Collect development-application detail links reachable from *page_url*.

    Walks every paginated listing page up front, records each
    application's council reference together with a direct detail-page
    URL, then blanks out the entries already present in the database so
    the caller can skip re-fetching them.

    Returns an iterable of detail URLs (``None`` for already-known entries).
    """
    response = requests.get(page_url)
    content = lxml.html.fromstring(response.text)
    pages = get_pagination_links(page_url, content)

    applications = OrderedDict()

    # NOTE: Every listing page is visited here, in a single pass, because
    #   the site drives pagination through javascript form posts on one
    #   shared URL -- once we leave for a detail page there is no way back
    #   to a specific listing page.
    while len(content):
        anchors = content.xpath(
            "//tr[@class!='headerRow' and @class!='pagerRow']/td[1]/a"
        )

        for anchor in anchors:
            # The href on each anchor is a javascript postback, so instead
            # build the detail-page URL that the normal form submit +
            # redirect would eventually land on.
            applications[anchor.text] = (
                "https://noosa-eproperty.t1cloud.com/NOOEPRPROD/P1/eTrack/eTrackApplicationDetails.aspx?r=P1.WEBGUEST&f=$P1.ETR.APPDET.VIW&ApplicationId={}".format(
                    anchor.text
                )
            )

        if not pages:
            break

        # Fetch the next page of application links.
        response = pages.pop(0).click()
        content = lxml.html.fromstring(response)

    # The anchor text is the council reference; anything already stored
    # can be skipped later without fetching its detail page.
    known_applications = DevelopmentApplication.select().where(
        DevelopmentApplication.council_reference.in_(applications.keys())
    )

    for known in known_applications:
        applications[known.council_reference] = None
    return applications.values()
# Example #2 -- separator text pasted in from the code-sharing page this
# snippet was copied from; commented out so it no longer raises NameError.
def main(url):
    """Scrape development applications starting at *url* and store them.

    Initialises the scraper database, fetches the list of application
    detail URLs, pulls the details for each one that is not already
    stored, and prints a summary of how many new records were added.

    Fixes over the previous version: the leftover debug short-circuit
    (``print(tree); sys.exit(0)``) made everything after it unreachable,
    and the scraperwiki-template leftovers (the ``buyers``/``prices``
    xpath queries against the stale first-page ``tree``) were dead code;
    both have been removed.
    """
    MorphDatabase.init()

    count_new = total = 0
    for application_url in get_application_links(url):

        if not application_url:
            # Already in the database -- counted, but not re-fetched.
            total += 1
            continue

        data = extract_application_details(application_url)

        # get_or_create tells us whether this reference was new.
        application, created = DevelopmentApplication.get_or_create(**data)

        total += 1

        if not created:
            print("* Skipping {0.council_reference}".format(application))
        else:
            print("Saved {0.council_reference}".format(application))
            count_new += 1

    print("Added {0} records out of {1} processed.".format(count_new, total))
def main(start_url):
    """Run the scraper: store any new development applications found via
    *start_url* and report how many records were added."""
    MorphDatabase.init()
    saved = seen = 0

    for detail_url in get_application_links(start_url):

        if not detail_url:
            # Entry already known -- count it and move on.
            seen += 1
            continue

        details = extract_application_details(detail_url)

        application, created = DevelopmentApplication.get_or_create(**details)

        seen += 1

        if created:
            print("Saved {0.council_reference}".format(application))
            saved += 1
        else:
            print("* Skipping {0.council_reference}".format(application))

    print("Added {0} records out of {1} processed.".format(saved, seen))