def getResultsByDayMonthYear(self, day, month, year):
    """Scrape planning applications received on the given date.

    Walks the paginated search results, following the "next" link on
    each page until it is no longer an anchor, and adds one
    PlanningApplication per result row to self._results.

    Returns self._results.
    """
    search_day = datetime.date(year, month, day)
    # Renamed from "next" - the original shadowed the builtin.
    next_url = self.base_url %{"day": day, "month": month, "year": year, }
    while next_url:
        # Now get the search page
        response = urllib2.urlopen(next_url)
        soup = BeautifulSoup.BeautifulSoup(response.read())
        trs = soup.table.findAll("tr")[1:] # First one is just headers
        for tr in trs:
            application = PlanningApplication()
            application.date_received = search_day
            application.council_reference = tr.a.string
            application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
            tds = tr.findAll("td")
            # isinstance rather than type() ==, so NavigableString
            # subclasses are also accepted.
            application.address = ' '.join(
                [x.replace(" ", " ").strip()
                 for x in tds[2].contents
                 if isinstance(x, BeautifulSoup.NavigableString) and x.strip()])
            application.postcode = getPostcodeFromText(application.address)
            application.description = tds[4].string.replace(" ", " ").strip()

            # Get the info page in order to find the comment url
            # we could do this without a download if it wasn't for the
            # sector parameter - I wonder what that is?
            info_response = urllib2.urlopen(application.info_url)
            info_soup = BeautifulSoup.BeautifulSoup(info_response.read())

            comment_navstring = info_soup.find(text=comment_re)
            if comment_navstring:
                # Reuse the node we already found rather than searching again.
                application.comment_url = urlparse.urljoin(
                    self.base_url, comment_navstring.parent['href'])
            else:
                application.comment_url = "No Comments"

            # While we're at it, let's get the OSGB
            application.osgb_x, application.osgb_y = [
                x.strip()
                for x in info_soup.find(text=mapref_re).findNext("a").string.strip().split(",")]

            self._results.addApplication(application)

        # Follow the "next" link only while it is a real anchor.
        # NOTE(review): this assumes every results page contains a "next"
        # NavigableString - confirm against the live site.
        next_element = soup.find(text="next").parent
        if next_element.name == 'a':
            next_url = urlparse.urljoin(self.base_url, next_element['href'])
        else:
            next_url = None

    return self._results
def getResultsByDayMonthYear(self, day, month, year):
    """Scrape planning applications for the given date.

    Searches a one-day window (start date is a day earlier to catch the
    first result - see TODO list), then visits each application's info
    page for the OSGB grid reference and the date received. Adds each
    PlanningApplication to self._results and returns self._results.
    """
    search_date = datetime.date(year, month, day)

    # We'll set the start date to be one day earlier in order to catch
    # the first result on every day at some point - see TODO list
    response = urllib2.urlopen(self.base_url %{
        "end_date": search_date.strftime(date_format),
        "start_date": (search_date - datetime.timedelta(1)).strftime(date_format),
    })
    soup = BeautifulSoup(response.read())

    # Each app is stored in its own table
    result_tables = soup.findAll("table", border="1")

    # For the moment, we'll have to ignore the first result (see TODO list).
    for table in result_tables[1:]:
        application = PlanningApplication()

        # It's not clear to me why this next one isn't the string of the
        # next sibling. This works though!
        application.council_reference = table.find(text=re.compile("Reference")).parent.findNextSibling().contents[0]
        application.address = table.find(text="Location").parent.findNextSibling().string.strip()
        application.postcode = getPostcodeFromText(application.address)
        application.info_url = urlparse.urljoin(self.base_url, table.a['href'])

        # Let's go to the info_page and get the OSGB and the date_received
        info_request = urllib2.Request(application.info_url)
        # We need to add the language header in order to get UK style dates
        info_request.add_header("Accept-Language", "en-gb,en")
        info_response = urllib2.urlopen(info_request)
        info_soup = BeautifulSoup(info_response.read())

        grid_reference_td = info_soup.find(text="Grid Reference").findNext("td")
        x_element = grid_reference_td.font
        application.osgb_x = x_element.string.strip()
        application.osgb_y = x_element.nextSibling.nextSibling.string.strip()

        date_string = info_soup.find(text="Date Valid").findNext("td").string.strip()
        # datetime.strptime parses in one step - equivalent to the old
        # time.strptime()[0:6] round-trip, and consistent with the other
        # scrapers in this file.
        application.date_received = datetime.datetime.strptime(date_string, date_format)

        application.description = table.find(text=re.compile("Description of Proposal")).parent.nextSibling.string.strip()

        # There is a link to comment from the info page, though I can't
        # click it.
        application.comment_url = application.info_url

        self._results.addApplication(application)

    return self._results
def getResultsByDayMonthYear(self, day, month, year):
    """Scrape planning applications received on the given date.

    Every other <tr> in the results table is a spacer; the real rows are
    processed and each application's info page is fetched for the
    description and OSGB grid reference. Returns self._results.
    """
    search_day = datetime.date(year, month, day)

    # Now get the search page
    response = urllib2.urlopen(self.base_url %{"day": day, "month": month, "year": year, })
    soup = BeautifulSoup(response.read())

    trs = soup.findAll("tr", valign="middle")

    # The odd-indexed trs are just spacers, so slice to every other row
    # instead of keeping a manual counter.
    for tr in trs[::2]:
        application = PlanningApplication()
        tds = tr.findAll("td")

        application.date_received = search_day
        application.council_reference = tds[1].a.string
        application.address = tds[3].a.string
        application.postcode = getPostcodeFromText(application.address)
        # All the links in this <tr> go to the same place...
        application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])

        # For the description, we'll need the info page
        info_soup = BeautifulSoup(urllib2.urlopen(application.info_url).read())
        application.description = info_soup.find(text="Description").findNext("td").findNext("td").font.string

        # While we're here, lets get the OSGB grid ref
        application.osgb_x, application.osgb_y = info_soup.find(text="Grid Reference").findNext("td").font.string.split("-")

        # We'll have to use an email address for comments
        application.comment_url = self.comments_email_address

        self._results.addApplication(application)

    return self._results
def getResultsByDayMonthYear(self, day, month, year):
    """Scrape the weekly list of planning applications covering a date.

    Lists are published for the Thursday of each week; a date falling on
    Friday, Saturday or Sunday uses the following week's list. Adds each
    PlanningApplication found to self._results and returns self._results.
    """
    search_day = datetime.date(year, month, day)
    monday_before = search_day - datetime.timedelta(search_day.weekday())
    thursday = monday_before + datetime.timedelta(3)
    if search_day.weekday() > 3:
        # i.e. It is friday, saturday, or sunday
        # We need to add a week
        thursday = thursday + datetime.timedelta(7)

    this_url = self.base_url %(thursday.strftime(search_date_format))

    # Now get the search page
    soup = BeautifulSoup(urllib2.urlopen(this_url).read())

    # Each app is stored in a table of its own. The tables don't have
    # any useful attributes, so we find the NavigableStrings which look
    # like " Application Number:" and work outwards from each of them.
    for nav_string in soup.findAll(text=" Application Number:"):
        result_table = nav_string.findPrevious("table")

        def labelled_text(label):
            # String content of the <p> following the given label cell.
            return result_table.find(text=label).findNext("p").string.strip()

        application = PlanningApplication()
        application.council_reference = nav_string.findNext("p").string.strip()
        application.date_received = datetime.datetime.strptime(
            result_table.find(text=" Registration Date: ").findNext("p").contents[0].strip(),
            reg_date_format)
        application.osgb_x = labelled_text(" Easting:")
        application.osgb_y = labelled_text(" Northing:")
        application.description = labelled_text(" Proposed Development:")
        application.address = labelled_text(" Location:")
        application.postcode = getPostcodeFromText(application.address)
        application.info_url = this_url
        application.comment_url = self.comments_email_address

        self._results.addApplication(application)

    return self._results
def getResultsByDayMonthYear(self, day, month, year):
    """Scrape planning applications received on the given date.

    Walks the paginated results, following the "Next" link while one is
    present. An HTTPError on a page fetch is treated as "no applications"
    and ends the walk. Returns self._results.
    """
    search_date = datetime.date(year, month, day)

    next_page_url = self.base_url %{"date": search_date.strftime(date_format)}

    while next_page_url:
        try:
            response = urllib2.urlopen(next_page_url)
        except urllib2.HTTPError:
            # This is what seems to happen if there are no apps
            break

        soup = BeautifulSoup(response.read())

        # Renamed from "next" - the original shadowed the builtin.
        next_link = soup.find(text="Next")
        if next_link:
            next_page_url = urlparse.urljoin(self.base_url, next_link.parent['href'])
        else:
            next_page_url = None

        # There is an <h3> for each app that we can use
        for h3 in soup.findAll("h3", {"class": "resultsnavbar"}):
            application = PlanningApplication()

            application.date_received = search_date
            application.council_reference = h3.string.split(": ")[1]
            application.description = h3.findNext("div").find(text="Proposal:").parent.nextSibling.strip()
            application.address = ', '.join(h3.findNext("div").find(text="Address of proposal:").parent.nextSibling.strip().split("\r"))
            application.postcode = getPostcodeFromText(application.address)
            application.comment_url = urlparse.urljoin(self.base_url, h3.findNext("div").find(text=re.compile("Comment on Application")).parent['href'])
            application.info_url = self.info_url %(urllib.quote(application.council_reference))
            application.osgb_x, application.osgb_y = h3.findNext("div").find(text="Grid Reference:").parent.nextSibling.strip().split()

            self._results.addApplication(application)

    return self._results
def getResultsByDayMonthYear(self, day, month, year):
    """Scrape planning applications received on the given date.

    Each result sits in its own attribute-less table; tables are located
    via the NavigableString "Application", which nothing else contains.
    OSGB coordinates are recovered from the streetmap link's query
    string. Returns self._results.
    """
    search_day = datetime.date(year, month, day)

    # Now get the search page
    response = urllib2.urlopen(self.base_url % {"date": search_day.strftime(date_format)})
    soup = BeautifulSoup(response.read())

    # Results are shown in a table each. The tables don't have any nice
    # attributes, but they do all contain a NavString "Application",
    # and nothing else does...
    nav_strings = soup.findAll(text="Application")
    for nav_string in nav_strings:
        result_table = nav_string.findPrevious("table")

        application = PlanningApplication()
        application.date_received = search_day

        links = result_table.findAll("a")

        # We can get OSGB coordinates from the link to streetmap.
        # Direct indexing: .get() with no default would return None and
        # then fail with a confusing TypeError on the [0] subscript.
        map_qs_dict = cgi.parse_qs(urlparse.urlsplit(links[0]["href"])[3])
        application.osgb_x = map_qs_dict["x"][0]
        application.osgb_y = map_qs_dict["y"][0]

        application.council_reference = links[1].string.strip()
        application.info_url = urlparse.urljoin(self.base_url, links[1]["href"])
        application.comment_url = urlparse.urljoin(self.base_url, links[2]["href"])

        application.address = " ".join(links[0].previous.strip().split())
        application.postcode = getPostcodeFromText(application.address)

        application.description = links[2].previous.strip()

        self._results.addApplication(application)

    return self._results
def getResultsByDayMonthYear(self, day, month, year):
    # Fetch planning applications for a single day via pycurl: first a
    # plain POST (mimicking the site's AJAX count query) to learn how
    # many applications exist, then - only if there are any - a
    # multipart form POST for the results table, plus one GET per
    # application for its description and grid reference.
    # Returns self._results.
    #
    # NOTE(review): the single Curl handle is reused for every request,
    # so options set earlier (FOLLOWLOCATION, POST, REFERER, ...)
    # persist until overridden - the setopt ordering here is load-bearing.
    search_date = datetime.date(year, month, day)

    # Multipart form fields for the main search POST. "%" appears to act
    # as a wildcard for the select-style fields.
    post_data = [
        ("CNPA_ref", ""),
        ("application_number", ""),
        ("LA_id", "%"),
        ("applicant_type", "%"),
        ("applicant_name", ""),
        ("development_address", ""),
        ("agent_name", ""),
        ("status", "%"),
        ("startDay", "%02d" %day),
        ("startMonth", "%02d" %month),
        ("startYear", "%d" %year),
        ("endDay", "%02d" %day),
        ("endMonth", "%02d" %month),
        ("endYear", "%d" %year),
    ]

    # URL-encoded body for the count request (same search window).
    first_post_data = "CNPA_ref=&application_number=&applicant_name=&development_address=&agent_name=&applicant_type=%%&LA_id=%%&status=%%&startYear=%(year)d&startMonth=%(month)02d&startDay=%(day)02d&endYear=%(year)d&endMonth=%(month)02d&endDay=%(day)02d" %{"day": day, "month": month, "year": year}

    curlobj = pycurl.Curl()
    curlobj.setopt(pycurl.FOLLOWLOCATION, True)
    curlobj.setopt(pycurl.MAXREDIRS, 10)

    # First we do a normal post, this would happen as an AJAX query
    # from the browser and just returns the number of applications found.
    fakefile = StringIO.StringIO()
    # The URL takes a millisecond timestamp, presumably as a cache-buster.
    curlobj.setopt(pycurl.URL, self.first_post_url %(int(time.time()*1000)))
    curlobj.setopt(pycurl.POST, True)
    curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
    curlobj.setopt(pycurl.POSTFIELDS, first_post_data)
    curlobj.perform()
    app_count = int(fakefile.getvalue())
    fakefile.close()

    if app_count:
        # Now we do another multipart form post
        # This gives us something to use as the callback
        fakefile = StringIO.StringIO()
        curlobj.setopt(pycurl.URL, self.base_url)
        curlobj.setopt(pycurl.HTTPPOST, post_data)
        curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
        curlobj.setopt(pycurl.REFERER, self.referer)
        curlobj.perform()

        soup = BeautifulSoup(fakefile.getvalue())
        # We may as well free up the memory used by fakefile
        fakefile.close()

        # First <tr> is the header row.
        for tr in soup.table.findAll("tr")[1:]:
            application = PlanningApplication()
            application.date_received = search_date
            application.comment_url = self.comments_email_address

            tds = tr.findAll("td")
            application.council_reference = tds[1].string.strip()
            application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])
            application.address = tds[2].string.strip()
            application.postcode = getPostcodeFromText(application.address)

            # We're going to need to get the info page in order to get the description
            # We can't pass a unicode string to pycurl, so we'll have to encode it.
            curlobj.setopt(pycurl.URL, application.info_url.encode())
            # HTTPGET switches the handle back from POST mode.
            curlobj.setopt(pycurl.HTTPGET, True)

            # This gives us something to use as the callback
            fakefile = StringIO.StringIO()
            curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
            curlobj.perform()
            info_soup = BeautifulSoup(fakefile.getvalue())
            fakefile.close()

            application.description = info_soup.find(text="Development Details").findNext("td").string.strip()
            application.osgb_x = info_soup.find(text="Grid Ref East").findNext("td").string.strip()
            application.osgb_y = info_soup.find(text="Grid Ref North").findNext("td").string.strip()

            self._results.addApplication(application)

    return self._results