def getResultsByDayMonthYear(self, day, month, year):
    # Now get the search page
    response = urllib2.urlopen(self.base_url)
    soup = BeautifulSoup(response.read())

    # Each application contains the nav string "Application: "
    nav_strings = soup.findAll(text="Application: ")

    for nav_string in nav_strings:
        application = PlanningApplication()

        application.council_reference = nav_string.findPrevious("tr").findAll("td", limit=2)[1].string.strip()
        application.address = nav_string.findNext(text=location_re).split(":")[1].strip()
        application.postcode = getPostcodeFromText(application.address)
        application.description = nav_string.findNext(text="Proposal: ").findNext("td").string.strip()
        application.comment_url = urlparse.urljoin(self.base_url, nav_string.findNext(text="Proposal: ").findNext("a")['href'])
        application.date_received = datetime.datetime.strptime(nav_string.findNext(text=date_received_re).split(": ")[1], date_format).date()

        # FIXME: There is no appropriate info_url for the Harrow apps.
        # I'll put the base url for the moment, but as that is a list of apps
        # from the last 7 days, it will quickly be out of date.
        application.info_url = self.base_url

        self._results.addApplication(application)

    return self._results
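# Note: location_re and date_received_re (used above) are module-level regexes
# that this section assumes are defined elsewhere in the scraper. A minimal
# sketch of plausible definitions follows - the exact patterns are assumptions
# inferred from the call sites, not taken from the original source:
import re

location_re = re.compile("Location: .*")            # hypothetical pattern
date_received_re = re.compile("Date Received: .*")  # hypothetical pattern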
def getResultsByDayMonthYear(self, day, month, year):
    search_day = datetime.date(year, month, day)

    # What we actually need is the monday before the date searched for:
    monday_before = search_day - datetime.timedelta(search_day.weekday())

    # Now get the search page
    response = urllib2.urlopen(self.base_url %(monday_before.strftime(date_format)))
    soup = BeautifulSoup(response.read())

    result_tables = soup.findAll("table", align="Center", cellpadding="3")

    for table in result_tables:
        application = PlanningApplication()

        # We can set the date received and the comment url straight away.
        application.comment_url = self.comments_email_address

        trs = table.findAll("tr")

        application.council_reference = trs[0].a.string.strip()

        relative_info_url = trs[0].a['href']
        application.info_url = urlparse.urljoin(self.base_url, relative_info_url)

        application.date_received = monday_before

        application.address = trs[1].findAll("td")[1].string.strip()
        application.postcode = getPostcodeFromText(application.address)
        application.description = trs[2].findAll("td")[1].string.strip()

        self._results.addApplication(application)

    return self._results
def getResultsByDayMonthYear(self, day, month, year):
    search_day = datetime.date(year, month, day)

    # Now get the search page
    response = urllib2.urlopen(self.base_url %{"date": search_day.strftime(date_format)})
    soup = BeautifulSoup(response.read())

    # Each app is stored in a table of its own.
    # These tables don't have any nice distinguishing features,
    # but they do all contain a NavigableString "Application",
    # and nothing else in the page does.
    nav_strings = soup.findAll(text="Application")

    for nav_string in nav_strings:
        results_table = nav_string.findPrevious("table")

        application = PlanningApplication()
        application.date_received = search_day

        application.council_reference = results_table.a.string.strip()
        application.info_url = urlparse.urljoin(self.base_url, results_table.a['href'])
        application.address = results_table.findAll("td")[7].a.string.strip()
        application.postcode = getPostcodeFromText(application.address)
        application.description = results_table.findAll("td")[-1].contents[0].strip()

        # A few applications have comment urls, but most don't.
        # When they do, they have a case officer - I don't think we can
        # work out the other urls, even if they exist.
        # Best to use the email address.
        application.comment_url = self.comments_email_address

        self._results.addApplication(application)

    return self._results
def getResultsByDayMonthYear(self, day, month, year):
    response = urllib2.urlopen(self.base_url)
    soup = BeautifulSoup.BeautifulSoup(response.read())

    trs = soup.find("table", {"class": "planningtable"}).tbody.findAll("tr")

    for tr in trs:
        application = PlanningApplication()

        tds = tr.findAll("td")

        application.council_reference = tds[0].a.string.strip()
        application.info_url = urlparse.urljoin(self.base_url, tds[0].a["href"])
        application.address = " ".join(tds[1].contents[1].strip().split()[1:])
        application.postcode = getPostcodeFromText(application.address)

        # We're going to need to download the info page in order to get
        # the comment link, the date received, and the description.
        info_response = urllib2.urlopen(application.info_url)
        info_soup = BeautifulSoup.BeautifulSoup(info_response.read())

        application.description = info_soup.find(text="Proposal").findNext(text=True).strip()

        date_received_str = info_soup.find(text="Date Application Valid").findNext(text=True).split(",")[1].strip()

        # This is a nasty botch, but the easiest way I can see to get a date
        # out of this is to build another string and use strptime.
        better_date_str = "%s %s %s" %date_received_re.match(date_received_str).groups()
        application.date_received = datetime.datetime.strptime(better_date_str, "%d %B %Y").date()

        application.comment_url = info_soup.find("a", title="Comment on this planning application.")["href"]

        self._results.addApplication(application)

    return self._results
def getResultsByDayMonthYear(self, day, month, year):
    # Now get the search page
    response = urllib2.urlopen(self.base_url)
    soup = BeautifulSoup(response.read())

    trs = soup.table.table.findAll("tr", {"class": re.compile("(?:odd)|(?:even)")})

    for tr in trs:
        application = PlanningApplication()

        tds = tr.findAll("td")

        application.council_reference = tds[0].a.string.strip()
        application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])
        application.address = tds[2].string.strip()
        application.postcode = getPostcodeFromText(application.address)
        application.description = tds[3].string.strip()

        # Fetch the info url in order to get the date received and the comment url
        info_response = urllib2.urlopen(application.info_url)
        info_soup = BeautifulSoup(info_response.read())

        application.date_received = datetime.datetime.strptime(info_soup.find(text=re.compile(r"\s*Received:\s*")).findNext("td").string.strip(), date_format).date()
        application.comment_url = urlparse.urljoin(self.base_url, info_soup.find("input", value="Comment on this application").parent['action'])

        self._results.addApplication(application)

    return self._results
def getResultsByDayMonthYear(self, day, month, year):
    search_date = datetime.date(year, month, day)

    get_response = urllib2.urlopen(self.base_url)
    get_soup = BeautifulSoup(get_response.read())

    post_data = (
        ("__VIEWSTATE", get_soup.find("input", id="__VIEWSTATE")["value"]),
#        ("QuickSearchApplicationNumber$TextBox_ApplicationNumber", ""),
#        ("QuickSearchThisWeek$DropDownList_PastWeek", ""),
#        ("DetailedSearch$TextBox_PropertyNameNumber", ""),
#        ("DetailedSearch$Textbox_StreetName", ""),
#        ("DetailedSearch$Textbox_TownVillage", ""),
#        ("DetailedSearch$Textbox_Postcode", ""),
#        ("DetailedSearch$Textbox_Parish", ""),
#        ("DetailedSearch$Textbox_ApplicantSurname", ""),
#        ("DetailedSearch$TextBox_AgentName", ""),
        ("DetailedSearch$TextBox_DateRaisedFrom", search_date.strftime(date_format)),
        ("DetailedSearch$TextBox_DateRaisedTo", search_date.strftime(date_format)),
#        ("DetailedSearch$TextBox_DecisionFrom", "dd%2Fmm%2Fyyyy"),
#        ("DetailedSearch$TextBox_DecisionTo", "dd%2Fmm%2Fyyyy"),
        ("DetailedSearch$Button_DetailedSearch", "Search"),
        ("__EVENTVALIDATION", get_soup.find("input", id="__EVENTVALIDATION")["value"]),
        )

    # The response to the GET is a redirect. We'll need to POST to the new url.
    post_response = urllib2.urlopen(get_response.url, urllib.urlencode(post_data))
    post_soup = BeautifulSoup(post_response.read())

    if not post_soup.find(text=re.compile("No matching record")):
        # The first row contains headers.
        trs = post_soup.find("table", {"class": "searchresults"}).findAll("tr")[1:]

        for tr in trs:
            application = PlanningApplication()

            # We can fill the date received in straight away from the date we searched for.
            application.date_received = search_date

            tds = tr.findAll("td")

            application.council_reference = tds[0].font.string.strip()
            application.address = tds[2].font.string.strip()
            application.postcode = getPostcodeFromText(application.address)
            application.description = tds[3].font.string.strip()

            # Set the info url and the comment url to be the same - can't get
            # to the comment one directly without javascript.
            application.info_url = self.info_url %(application.council_reference)
            application.comment_url = application.info_url

            self._results.addApplication(application)

    return self._results
def getResultsByDayMonthYear(self, day, month, year):
    search_day = datetime.date(year, month, day)

    next = self.base_url %{"day": day,
                           "month": month,
                           "year": year,
                           }

    while next:
        # Now get the search page
        response = urllib2.urlopen(next)
        soup = BeautifulSoup.BeautifulSoup(response.read())

        trs = soup.table.findAll("tr")[1:] # First one is just headers

        for tr in trs:
            application = PlanningApplication()
            application.date_received = search_day
            application.council_reference = tr.a.string
            application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])

            tds = tr.findAll("td")

            # The u"\xa0" here is a non-breaking space, which we collapse to a plain space.
            application.address = ' '.join([x.replace(u"\xa0", " ").strip() for x in tds[2].contents if isinstance(x, BeautifulSoup.NavigableString) and x.strip()])
            application.postcode = getPostcodeFromText(application.address)
            application.description = tds[4].string.replace(u"\xa0", " ").strip()

            # Get the info page in order to find the comment url.
            # We could do this without a download if it weren't for the
            # sector parameter - I wonder what that is?
            info_response = urllib2.urlopen(application.info_url)
            info_soup = BeautifulSoup.BeautifulSoup(info_response.read())

            comment_navstring = info_soup.find(text=comment_re)

            if comment_navstring:
                application.comment_url = urlparse.urljoin(self.base_url, info_soup.find(text=comment_re).parent['href'])
            else:
                application.comment_url = "No Comments"

            # While we're at it, let's get the OSGB grid reference.
            application.osgb_x, application.osgb_y = [x.strip() for x in info_soup.find(text=mapref_re).findNext("a").string.strip().split(",")]

            self._results.addApplication(application)

        next_element = soup.find(text="next").parent

        if next_element.name == 'a':
            next = urlparse.urljoin(self.base_url, next_element['href'])
        else:
            next = None

    return self._results
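# Note: comment_re and mapref_re (used above) are module-level regexes this
# scraper assumes are defined elsewhere in the file. A minimal sketch of
# plausible definitions - the exact patterns are assumptions inferred from the
# call sites, not taken from the original source:
import re

comment_re = re.compile("Comment on this .*application", re.I)  # hypothetical pattern
mapref_re = re.compile("Map Ref", re.I)                         # hypothetical pattern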
def getResultsByDayMonthYear(self, day, month, year):
    search_date = datetime.date(year, month, day)
    search_date_string = search_date.strftime(date_format)

    # An example of the query string this builds:
    # appstat=&decision=&appdec=&ward=&parish=&dadfrom=&dadto=&davfrom=01%2F06%2F2008&davto=02%2F06%2F2008&searchbut=Search
    search_data = urllib.urlencode(
        [("searchtype", "1"),
         ("appstat", ""),
         ("decision", ""),
         ("appdec", ""),
         ("ward", ""),
         ("parish", ""),
         ("dadfrom", ""),
         ("dadto", ""),
         ("davfrom", search_date_string),
         ("davto", search_date_string),
         ("searchbut", "Search"),
         ]
        )

    split_search_url = self._split_base_url[:3] + (search_data, '')
    search_url = urlparse.urlunsplit(split_search_url)

    response = urllib2.urlopen(search_url)
    soup = BeautifulSoup(response.read())

    results_table = soup.find(text="Application No").parent.parent.parent
    trs = results_table.findAll("tr")[1:]

    tr_counter = 0

    while tr_counter < len(trs):
        tr = trs[tr_counter]

        if tr_counter % 2 == 0:
            application = PlanningApplication()
            application.date_received = search_date
            application.comment_url = self.comment_email_address

            tds = tr.findAll("td")

            application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
            application.council_reference = tr.a.string.strip()
            application.address = tds[1].string.strip()
            application.postcode = getPostcodeFromText(application.address)
            application.description = tds[2].string.strip()

            self._results.addApplication(application)

        tr_counter += 1

    return self._results
def getResultsByDayMonthYear(self, day, month, year): search_date = datetime.date(year, month, day) search_data = urllib.urlencode( [("fuseaction", "planapps.list"), ("SUBJECTID", "104CC166-3ED1-4D22-B9F1E2FB8438478A"), ("src_fromdayRec", day), ("src_frommonthRec", month), ("src_fromyearRec", year), ("src_todayRec", day), ("src_tomonthRec", month), ("src_toyearRec", year), ("findroadworks", "GO"), ] ) search_url = self.base_url + "?" + search_data response = urllib2.urlopen(search_url) soup = BeautifulSoup(response.read()) results_table = soup.find("table", id="results") # Apart from the first tr, which contains headers, the trs come in pairs for each application trs = results_table.findAll("tr")[1:] tr_count = 0 while tr_count < len(trs): tr = trs[tr_count] if tr_count % 2 == 0: application = PlanningApplication() application.date_received = search_date tds = tr.findAll("td") application.council_reference = tds[0].a.string.strip() application.comment_url = self.comment_url %(application.council_reference) application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href']) application.address = ', '.join([x.strip() for x in tds[1].findAll(text=True)]) application.postcode = getPostcodeFromText(application.address) else: # Get rid of the "Details: " at the beginning. application.description = tr.td.string.strip()[9:] self._results.addApplication(application) tr_count += 1 return self._results
def getResultsByDayMonthYear(self, day, month, year):
    search_date = datetime.date(year, month, day)

    get_request = urllib2.Request(self.base_url)
    get_response = urllib2.urlopen(get_request)

    cookie_jar.extract_cookies(get_response, get_request)

    get_soup = BeautifulSoup(get_response.read())

    post_data = (
        ("__VIEWSTATE", get_soup.find("input", {"name": "__VIEWSTATE"})["value"]),
        ("pgid", get_soup.find("input", {"name": "pgid"})["value"]),
        ("action", "Search"),
#        ("ApplicationSearch21%3AtbDevAddress", ""),
#        ("ApplicationSearch21%3AtbApplicantName", ""),
#        ("ApplicationSearch21%3AtbAgentName", ""),
        ("ApplicationSearch21:tbDateSubmitted", search_date.strftime(search_date_format)),
        ("ApplicationSearch21:btnDateSubmitted", "Search"),
#        ("ApplicationSearch21%3AtbDateDetermined", ""),
        )

    post_request = urllib2.Request(self.base_url, urllib.urlencode(post_data))
    cookie_jar.add_cookie_header(post_request)
    post_response = cookie_handling_opener.open(post_request)

    post_soup = BeautifulSoup(post_response.read())

    # Discard the first <tr>, which contains headers
    trs = post_soup.find("table", id="SearchResults1_dgSearchResults").findAll("tr")[1:]

    for tr in trs:
        application = PlanningApplication()

        tds = tr.findAll("td")

        application.council_reference = tds[0].string.strip()
        application.address = tds[1].string.strip()
        application.postcode = getPostcodeFromText(application.address)
        application.description = tds[2].string.strip()
        application.date_received = datetime.datetime(*(time.strptime(tds[3].string.strip(), info_page_date_format)[0:6]))
        application.info_url = self.info_url %(application.council_reference)

        # The comment url must be accessed by a POST, so we'll just use the info url for that as well
        application.comment_url = application.info_url

        self._results.addApplication(application)

    return self._results
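# Note: cookie_jar and cookie_handling_opener are module-level objects that
# the cookie-using scrapers in this section (here and in the T&Cs scraper
# further down) assume exist but never define. A minimal sketch of the
# presumed setup, using the Python 2 standard library:
import cookielib
import urllib2

cookie_jar = cookielib.CookieJar()
cookie_handling_opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))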
def getResultsByDayMonthYear(self, day, month, year):
    search_date = datetime.date(year, month, day)

    # We'll set the start date to be one day earlier in order to catch the
    # first result on every day at some point - see TODO list
    response = urllib2.urlopen(self.base_url %{"end_date": search_date.strftime(date_format),
                                               "start_date": (search_date - datetime.timedelta(1)).strftime(date_format)})
    soup = BeautifulSoup(response.read())

    # Each app is stored in a table of its own
    result_tables = soup.findAll("table", border="1")

    # For the moment, we'll have to ignore the first result (see TODO list).
    for table in result_tables[1:]:
        application = PlanningApplication()

        # It's not clear to me why this next one isn't the string of the next sibling. This works though!
        application.council_reference = table.find(text=re.compile("Reference")).parent.findNextSibling().contents[0]

        application.address = table.find(text="Location").parent.findNextSibling().string.strip()
        application.postcode = getPostcodeFromText(application.address)

        application.info_url = urlparse.urljoin(self.base_url, table.a['href'])

        # Let's go to the info page and get the OSGB and the date_received
        info_request = urllib2.Request(application.info_url)

        # We need to add the language header in order to get UK style dates
        info_request.add_header("Accept-Language", "en-gb,en")
        info_response = urllib2.urlopen(info_request)
        info_soup = BeautifulSoup(info_response.read())

        grid_reference_td = info_soup.find(text="Grid Reference").findNext("td")
        x_element = grid_reference_td.font

        application.osgb_x = x_element.string.strip()
        application.osgb_y = x_element.nextSibling.nextSibling.string.strip()

        date_string = info_soup.find(text="Date Valid").findNext("td").string.strip()
        application.date_received = datetime.datetime(*(time.strptime(date_string, date_format)[0:6]))

        application.description = table.find(text=re.compile("Description of Proposal")).parent.nextSibling.string.strip()

        # There is a link to comment from the info page, though I can't click it.
        application.comment_url = application.info_url

        self._results.addApplication(application)

    return self._results
def getResultsByDayMonthYear(self, day, month, year): search_date = datetime.date(year, month, day) date_string = search_date.strftime(date_format) search_data = urllib.urlencode({"reference": "", "undecided": "yes", "dateFrom": date_string, "dateTo": date_string, "Address": "", "validate": "true", }) request = urllib2.Request(self.base_url, search_data) response = urllib2.urlopen(request) html = response.read() soup = BeautifulSoup(html) tables = soup.findAll("table", {"style": "width:auto;"}) if not tables: return self._results # We don't want the first or last tr trs = tables[0].findAll("tr")[1:-1] for tr in trs: app = PlanningApplication() tds = tr.findAll("td") if len(tds) == 4: local_info_url = tds[0].a['href'] app.info_url = urlparse.urljoin(self.base_url, local_info_url) app.council_reference = tds[0].a.string app.address = tds[1].string app.postcode = getPostcodeFromText(app.address) app.description = tds[2].string app.comment_url = urlparse.urljoin(self.base_url, comment_url_end %app.council_reference) app.date_received = search_date self._results.addApplication(app) return self._results
def getResultsByDayMonthYear(self, day=None, month=None, year=None):
    # Note that we ignore the day, month, and year parameters here:
    # the search page only ever lists the current applications.

    # First get the search page
    request = urllib2.Request(self.base_url)
    response = urllib2.urlopen(request)
    soup = BeautifulSoup(response.read())

    trs = soup.findAll("tr", {"class": "dbResults"})

    for tr in trs:
        application = PlanningApplication()

        tds = tr.findAll("td")

        application.council_reference = tds[0].a.contents[0].strip()
        application.address = tds[1].string.strip()
        application.postcode = getPostcodeFromText(application.address)
        application.description = tds[2].string.strip()
        application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])

        # These bits have been commented out for performance reasons. We
        # can't afford to go to every application's details page ten times
        # a day while it is open. Instead, we'll just set the date_received
        # to be the scrape date. The comment url can be got by using the id
        # in the info url.
        application.date_received = datetime.datetime.today()

        relative_comment_url_template = "PlanAppComment.aspx?appId=%d"

        # Get the appId from the info_url
        app_id = int(cgi.parse_qs(urlparse.urlsplit(application.info_url)[3])['frmId'][0])
        application.comment_url = urlparse.urljoin(self.base_url, relative_comment_url_template %(app_id))

#        # I'm afraid we're going to have to get each info url...
#        this_app_response = urllib2.urlopen(application.info_url)
#        this_app_soup = BeautifulSoup(this_app_response.read())

#        # If there is no received date, for some reason, we'll use the publicity date instead.
#        date_string = (this_app_soup.find("span", id="lblTrackRecievedDate") or this_app_soup.find("span", id="lblPubDate")).string
#        application.date_received = datetime.datetime(*(time.strptime(date_string, date_format)[0:6]))
#        application.comment_url = urlparse.urljoin(self.base_url, this_app_soup.find("a", id="lnkMakeComment")['href'])

        self._results.addApplication(application)

    return self._results
def getResultsByDayMonthYear(self, day, month, year):
    search_date = datetime.date(year, month, day)

    pagenum = 1

    while pagenum:
        response = urllib2.urlopen(self.base_url %{"date": search_date.strftime(search_date_format),
                                                   "pagenum": pagenum})
        soup = BeautifulSoup.BeautifulSoup(response.read())

        # This is not a nice way to find the results table, but I can't
        # see anything good to use, and it works...
        # There are two trs with style attributes per app. This will find
        # all the first ones of the pairs.
        trs = soup.find("table", border="0", cellpadding="0", cellspacing="2", width="100%", summary="").findAll("tr", style=True)[::2]

        for tr in trs:
            tds = tr.findAll("td")
            date_received = datetime.datetime.strptime(tds[3].string.strip(), received_date_format).date()

            # Stop looking through the list if we have found one which is
            # earlier than the date searched for.
            if date_received < search_date:
                # If we break out, then we won't want the next page
                pagenum = None
                break

            application = PlanningApplication()
            application.date_received = date_received

            application.council_reference = tds[0].small.string.strip()

            # The second <td> contains the address, split up with <br/>s
            application.address = ' '.join([x for x in tds[1].contents if isinstance(x, BeautifulSoup.NavigableString)])
            application.postcode = getPostcodeFromText(application.address)

            application.description = tds[2].string.strip()

            application.info_url = urlparse.urljoin(self.base_url, tr.findNext("a")['href'])
            application.comment_url = self.comments_email_address

            self._results.addApplication(application)
        else:
            # If we got through the whole list without breaking out,
            # then we'll want to get the next page.
            pagenum += 1

    return self._results
def getResultsByDayMonthYear(self, day, month, year): search_day = datetime.date(year, month, day) # Now get the search page response = urllib2.urlopen(self.base_url %{"day": day, "month": month, "year": year, }) soup = BeautifulSoup(response.read()) trs = soup.findAll("tr", valign="middle") count = 0 for tr in trs: # The odd trs are just spacers if count % 2 == 0: application = PlanningApplication() tds = tr.findAll("td") application.date_received = search_day application.council_reference = tds[1].a.string application.address = tds[3].a.string application.postcode = getPostcodeFromText(application.address) # All the links in this <tr> go to the same place... application.info_url = urlparse.urljoin(self.base_url, tr.a['href']) # Still looking for description and comment url # For the description, we'll need the info page info_soup = BeautifulSoup(urllib2.urlopen(application.info_url).read()) application.description = info_soup.find(text="Description").findNext("td").findNext("td").font.string # While we're here, lets get the OSGB grid ref application.osgb_x, application.osgb_y = info_soup.find(text="Grid Reference").findNext("td").font.string.split("-") # We'll have to use an email address for comments application.comment_url = self.comments_email_address self._results.addApplication(application) count += 1 return self._results
def getResultsByDayMonthYear(self, day, month, year):
    search_day = datetime.date(year, month, day)

    monday_before = search_day - datetime.timedelta(search_day.weekday())
    thursday = monday_before + datetime.timedelta(3)

    if search_day.weekday() > 3: # i.e. it is Friday, Saturday, or Sunday
        # We need to add a week
        thursday = thursday + datetime.timedelta(7)

    this_url = self.base_url %(thursday.strftime(search_date_format))

    # Now get the search page
    response = urllib2.urlopen(this_url)
    soup = BeautifulSoup(response.read())

    # Each app is stored in a table of its own. The tables don't have
    # any useful attributes, so we'll find all the NavigableString objects
    # which look like " Application Number:" and then look at the
    # tables they are in.
    nav_strings = soup.findAll(text=" Application Number:")

    for nav_string in nav_strings:
        application = PlanningApplication()

        application.council_reference = nav_string.findNext("p").string.strip()

        result_table = nav_string.findPrevious("table")

        application.date_received = datetime.datetime.strptime(result_table.find(text=" Registration Date: ").findNext("p").contents[0].strip(), reg_date_format)

        application.osgb_x = result_table.find(text=" Easting:").findNext("p").string.strip()
        application.osgb_y = result_table.find(text=" Northing:").findNext("p").string.strip()

        application.description = result_table.find(text=" Proposed Development:").findNext("p").string.strip()
        application.address = result_table.find(text=" Location:").findNext("p").string.strip()
        application.postcode = getPostcodeFromText(application.address)

        application.info_url = this_url
        application.comment_url = self.comments_email_address

        self._results.addApplication(application)

    return self._results
def getResultsByDayMonthYear(self, day, month, year):
    search_date = datetime.date(year, month, day)

    contents = self.get_contents(day, month, year)
    soup = BeautifulSoup.BeautifulSoup(contents)

    results_table = soup.find("table", {"class": "cResultsForm"})

    # First, we work out what column each thing of interest is in from the headings
    headings = [x.string for x in results_table.findAll("th")]

    ref_col = index_or_none(headings, "Application Ref.") or \
              index_or_none(headings, "Case Number") or \
              index_or_none(headings, "Application Number")

    address_col = headings.index("Address")
    description_col = headings.index("Proposal")

    comments_url = urlparse.urljoin(self.base_url, self.comments_url_end)

    for tr in results_table.findAll("tr")[1:]:
        application = PlanningApplication()
        application.date_received = search_date

        tds = tr.findAll(re.compile("t[dh]"))

        application.council_reference = tds[ref_col].string.strip()
        application.address = tds[address_col].string.strip()
        application.description = tds[description_col].string.strip()

        application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])

        # We need the query string from this url to make the comments_url
        query_string = urlparse.urlsplit(application.info_url)[3]

        # This is probably slightly naughty, but I'm just going to add the
        # querystring on to the end manually
        application.comment_url = "%s?%s" %(comments_url, query_string)

        self._results.addApplication(application)

    return self._results
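# Note: index_or_none is a helper this scraper assumes is defined at module
# level but which isn't shown in this section. A minimal sketch of what it
# presumably does, inferred from the call site (the name and behaviour are
# assumptions, not taken from the original source):
def index_or_none(a_list, item):
    # Return the index of item in a_list, or None if it isn't present.
    # Caveat: chaining calls with "or", as above, treats a genuine index of 0
    # as a miss, so that usage relies on the reference column never being first.
    if item in a_list:
        return a_list.index(item)
    return None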
def getResultsByDayMonthYear(self, day, month, year):
    search_day = datetime.date(year, month, day)

    post_data = urllib.urlencode((
            ("type", "app"),
            ("time", "0"),
            ))

    # Now get the search page
    response = urllib2.urlopen(self.base_url, post_data)
    soup = BeautifulSoup(response.read())

    caseno_strings = soup.findAll(text="Case No:")

    for caseno_string in caseno_strings:
        application = PlanningApplication()

        application.council_reference = caseno_string.findNext("a").string.strip()
        info_url = urlparse.urljoin(self.base_url, caseno_string.findNext("a")['href'])

        # See above for why we can't use the proper info url.
        application.info_url = self.info_url

        # In order to avoid doing a download to find the comment page, we'll
        # get the system key from this url
        syskey = cgi.parse_qs(urlparse.urlsplit(info_url)[3])['id'][0]

        application.date_received = datetime.datetime.strptime(caseno_string.findNext(text="Registration Date:").findNext("p").string.strip(), date_format).date()
        application.address = caseno_string.findNext(text="Location:").findNext("p").string.strip()
        application.postcode = getPostcodeFromText(application.address)
        application.description = caseno_string.findNext(text="Proposal:").findNext("p").string.strip()

        # e.g. http://www.hastings.gov.uk/planning/planningapp_comments.aspx?appNumber=HS/FA/08/00631&syskey=95642
        application.comment_url = self.comment_url_template %(application.council_reference, syskey)

        self._results.addApplication(application)

    return self._results
def getResultsByDayMonthYear(self, day, month, year): search_date = datetime.date(year, month, day) search_data = urllib.urlencode( [ ("parent_directory_id", "200"), ("nav", "679"), ("id", "13266"), ("RecStart", "1"), ("RecCount", "100"), ("SDate", search_date.strftime(date_format)), ("EDate", search_date.strftime(date_format)), ] ) search_url = self.base_url + "?" + search_data response = urllib2.urlopen(search_url) soup = BeautifulSoup(response.read()) results_table = soup.find("table", summary="List of planning applications that match your query") for tr in results_table.findAll("tr")[1:]: application = PlanningApplication() application.date_received = search_date tds = tr.findAll("td") application.council_reference = tds[0].a.string.strip() application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href']) application.comment_url = application.info_url application.address = ' '.join(tds[1].string.strip().split()) application.postcode = getPostcodeFromText(application.address) application.description = tds[2].string.strip() self._results.addApplication(application) return self._results
def getResultsByDayMonthYear(self, day, month, year):
    search_day = datetime.date(year, month, day)

    #- Crawley only allows searches from-to, so:
    next = self.base_url %{
        "dayFrom": day,
        "monthFrom": month,
        "yearFrom": year,
        "dayTo": day,
        "monthTo": month,
        "yearTo": year,
        }

    # Now get the search page
    response = urllib2.urlopen(next)
    soup = BeautifulSoup.BeautifulSoup(response.read())

    if soup.table: #- Empty result set has no table
        trs = soup.table.findAll("tr")[1:] # First one is just headers

        for tr in trs:
            tds = tr.findAll("td")

            application = PlanningApplication()

            # The reference contains HTML-encoded slashes; convert them back.
            application.council_reference = tds[0].a.contents[0].strip().replace("&#47;", "/")
            application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])

            info_qs = cgi.parse_qs(urlparse.urlsplit(application.info_url)[3])

            comment_qs = {
                "pAppNo": application.council_reference,
                "pAppDocName": info_qs["ssDocName"][0],
                }
            application.comment_url = self.comment_url_template %comment_qs

            application.address = tds[1].string.strip()
            if tds[2].string: #- if postcode present, append it to the address too
                application.postcode = tds[2].string.replace("&nbsp;", " ").strip()
                application.address += ", " + application.postcode

            application.description = tds[3].string.strip()
            application.date_received = datetime.datetime(*(time.strptime(tds[4].string.strip(), date_format)[0:6]))

            self._results.addApplication(application)

    return self._results
def getResultsByDayMonthYear(self, day, month, year): search_day = datetime.date(year, month, day) # What we actually need is the monday before the date searched for: monday_before = search_day - datetime.timedelta(search_day.weekday()) # Now get the search page response = urllib2.urlopen(self.base_url %(monday_before.strftime(date_format))) soup = BeautifulSoup(response.read()) result_tables = soup.findAll("table", width="98%", cellpadding="2") for table in result_tables: application = PlanningApplication() trs = table.findAll("tr") application.council_reference = trs[0].strong.string.strip() relative_info_url = trs[0].a['href'] application.info_url = urlparse.urljoin(self.base_url, relative_info_url) application.address = trs[1].findAll("td")[1].string.strip() application.postcode = getPostcodeFromText(application.address) application.description = trs[2].findAll("td")[1].string.strip() #There's probably a prettier way to get the date, but with Python, it's easier for me to reinvent the wheel than to find an existing wheel! raw_date_recv = trs[3].findAll("td")[3].string.strip().split("/") #Check whether the application is on the target day. If not, discard it and move on. if int(raw_date_recv[0]) != day: continue application.date_received = datetime.date(int(raw_date_recv[2]), int(raw_date_recv[1]), int(raw_date_recv[0])) try: relative_comment_url = trs[5].findAll("td")[1].a['href'] application.comment_url = urlparse.urljoin(self.base_url, relative_comment_url) except: application.comment_url = "No Comment URL." self._results.addApplication(application) return self._results
def getResultsByDayMonthYear(self, day, month, year):
    search_date = datetime.date(year, month, day)

    next_page_url = self.base_url %{"date": search_date.strftime(date_format)}

    while next_page_url:
        try:
            response = urllib2.urlopen(next_page_url)
        except urllib2.HTTPError:
            # This is what seems to happen if there are no apps
            break

        soup = BeautifulSoup(response.read())

        next = soup.find(text="Next")
        if next:
            next_page_url = urlparse.urljoin(self.base_url, next.parent['href'])
        else:
            next_page_url = None

        # There is an <h3> for each app that we can use
        for h3 in soup.findAll("h3", {"class": "resultsnavbar"}):
            application = PlanningApplication()

            application.date_received = search_date
            application.council_reference = h3.string.split(": ")[1]
            application.description = h3.findNext("div").find(text="Proposal:").parent.nextSibling.strip()
            application.address = ', '.join(h3.findNext("div").find(text="Address of proposal:").parent.nextSibling.strip().split("\r"))
            application.postcode = getPostcodeFromText(application.address)

            application.comment_url = urlparse.urljoin(self.base_url, h3.findNext("div").find(text=re.compile("Comment on Application")).parent['href'])
            application.info_url = self.info_url %(urllib.quote(application.council_reference))

            application.osgb_x, application.osgb_y = h3.findNext("div").find(text="Grid Reference:").parent.nextSibling.strip().split()

            self._results.addApplication(application)

    return self._results
def getResultsByDayMonthYear(self, day, month, year): search_day = datetime.date(year, month, day) # Now get the search page response = urllib2.urlopen(self.base_url % {"date": search_day.strftime(date_format)}) soup = BeautifulSoup(response.read()) # Results are shown in a table each. The tables don't have any nice # attributes, but they do all contain a NavString "Application", # and nothing else does... nav_strings = soup.findAll(text="Application") for nav_string in nav_strings: result_table = nav_string.findPrevious("table") application = PlanningApplication() application.date_received = search_day links = result_table.findAll("a") # We can get OSGB coordinates from the link to streetmap map_qs_dict = cgi.parse_qs(urlparse.urlsplit(links[0]["href"])[3]) application.osgb_x = map_qs_dict.get("x")[0] application.osgb_y = map_qs_dict.get("y")[0] application.council_reference = links[1].string.strip() application.info_url = urlparse.urljoin(self.base_url, links[1]["href"]) application.comment_url = urlparse.urljoin(self.base_url, links[2]["href"]) application.address = " ".join(links[0].previous.strip().split()) application.postcode = getPostcodeFromText(application.address) application.description = links[2].previous.strip() self._results.addApplication(application) return self._results
def getResultsByDayMonthYear(self, day, month, year): search_date = datetime.date(year, month, day) response = urllib2.urlopen(self.base_url %{"date": search_date.strftime(search_date_format)}) soup = BeautifulSoup.BeautifulSoup(response.read()) if not soup.find(text=re.compile("No Results Found")): trs = soup.findAll("table", {"class": "dataTable"})[1].findAll("tr")[1:] for tr in trs: tds = tr.findAll("td") application = PlanningApplication() # We can fill in the date received without actually looking at the data application.date_received = search_date application.council_reference = tds[0].a.string.strip() application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href']) application.address = ', '.join([x for x in tds[1].contents if isinstance(x, BeautifulSoup.NavigableString)]) application.postcode = getPostcodeFromText(application.address) application.description = tds[2].string.strip() # To get the comment link we need to fetch the info page info_response = urllib2.urlopen(application.info_url) info_soup = BeautifulSoup.BeautifulSoup(info_response.read()) base = info_soup.base['href'] application.comment_url = urlparse.urljoin(base, info_soup.find("a", target="Planning Application Consultation Form")['href']) self._results.addApplication(application) return self._results
def getResultsByDayMonthYear(self, day, month, year):
    response = urllib2.urlopen(self.base_url)
    soup = BeautifulSoup(response.read())

    for details_input in soup.find("table", summary="Planning Applications Received in the last 7 days").findAll("input", alt="Planning Details"):
        application = PlanningApplication()

        first_tr = details_input.findPrevious("tr")
        other_trs = first_tr.findNextSiblings("tr", limit=8)

        application.council_reference = first_tr.find("input", {"name": "refval"})['value']

        application.address = other_trs[0].findAll("td")[1].string.strip()
        application.description = other_trs[1].findAll("td")[1].string.strip()
        application.date_received = datetime.datetime.strptime(other_trs[3].findAll("td")[1].string.strip(), date_format).date()

        # Both the info page and the comment page can only be got to
        # by a POST. The best we can do is give the url of the search page.
        application.info_url = application.comment_url = self.search_url

        self._results.addApplication(application)

    return self._results
def getResultsByDayMonthYear(self, day, month, year):
    search_day = datetime.date(year, month, day)

    # We want the friday at the end of the week being searched for
    # (this is the date the search form calls the week end date).
    friday = search_day - datetime.timedelta(search_day.weekday()) + datetime.timedelta(4)

    # Not using urllib.urlencode as it insists on turning the "+" into "%2B"
    post_data = "WeekEndDate=%d%%2F%d%%2F%d&order=Received+Date&submit=search" %(friday.day, friday.month, friday.year)

    # Now get the search page
    response = urllib2.urlopen(self.base_url, post_data)
    soup = BeautifulSoup(response.read())

    trs = soup.find("table", summary="Planning Application search results table").findAll("tr")[1:]

    for tr in trs:
        application = PlanningApplication()

        tds = tr.findAll("td")

        # Not sure why these are entities. We'll convert them back.
        application.council_reference = tds[0].a.contents[1].strip().replace("&#47;", "/")
        application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])
        application.comment_url = application.info_url

        application.date_received = datetime.datetime(*(time.strptime(tds[1].string.strip(), date_format)[0:6]))
        application.address = tds[2].string.strip()
        application.postcode = getPostcodeFromText(application.address)
        application.description = tds[3].string.strip()

        self._results.addApplication(application)

    return self._results
def getResultsByDayMonthYear(self, day, month, year):
    search_day = datetime.date(year, month, day)

    response = urllib2.urlopen(self.base_url %(search_day.strftime(search_date_format)))
    soup = BeautifulSoup(response.read())

    # The first <tr> contains headers
    trs = soup.table.findAll("tr")[1:]

    for tr in trs:
        application = PlanningApplication()

        tds = tr.findAll("td")

        application.date_received = datetime.datetime.strptime(tds[0].string, received_date_format).date()
        application.info_url = urllib.unquote(urllib.quote_plus(urlparse.urljoin(self.base_url, tds[1].a['href'])))
        application.council_reference = tds[1].a.string.strip()

        application.address = tds[2].a.string.strip()
        application.postcode = getPostcodeFromText(application.address)

        # Now fetch the info url
        info_response = urllib.urlopen(application.info_url)
        info_soup = BeautifulSoup(info_response.read())

        application.description = info_soup.find(text="Proposal:").findNext("td").string.strip()

        try:
            application.comment_url = urlparse.urljoin(self.base_url, info_soup.find(text="Comment").parent['href'])
        except:
            application.comment_url = "No Comments"

        self._results.addApplication(application)

    return self._results
def getResultsByDayMonthYear(self, day, month, year):
    search_day = datetime.date(year, month, day)

#    post_data = [
#        ("REFNO", ""),
#        ("STName", ""),
#        ("STNUMB", ""),
#        ("ADRSNO", ""),
#        ("WARD", "AllWards"),
#        ("AGT", ""),
#        ("ATCDE", "AllApps"),
#        ("DECDE", "AllDecs"),
#        ("DTErec", search_day.strftime(date_format)),
#        ("DTErecTo", search_day.strftime(date_format)),
#        ("DTEvalid", ""),
#        ("DTEvalidTo", ""),
#        ("APDECDE", "AllAppDecs"),
#        ("submit", "Start+Search"),
#        ]
    post_data = "REFNO=&STName=&STNUMB=&ADRSNO=&WARD=AllWards&AGT=&ATCDE=AllApps&DECDE=AllDecs&DTErec=%(date)s&DTErecTo=%(date)s&DTEvalid=&DTEvalidTo=&APDECDE=AllAppDecs&submit=Start+Search" %{"date": search_day.strftime(date_format)}

    while post_data:
        # Now get the search page
        response = urllib2.urlopen(self.base_url, post_data)
        soup = BeautifulSoup(response.read())

        results_form = soup.find("form", {"name": "currentsearchresultsNext"})

        # Sort out the post_data for the next page, if there is one.
        # If there is no next page then there will be no inputs in the form.
        # In this case, post_data will be '', which is false.
        post_data = urllib.urlencode([(x['name'], x['value']) for x in results_form.findAll("input")])

        # Each result has one link, and they are the only links in the form
        links = results_form.findAll("a")

        for link in links:
            application = PlanningApplication()

            application.date_received = search_day
            application.info_url = urlparse.urljoin(self.base_url, link['href'])
            application.council_reference = link.string.strip()

            application.address = link.findNext("td").string.strip()
            application.postcode = getPostcodeFromText(application.address)

            application.description = link.findNext("tr").findAll("td")[-1].string.strip()

            # To get the comment url, we're going to have to go to each info url :-(
            info_response = urllib2.urlopen(application.info_url)
            info_soup = BeautifulSoup(info_response)

            comment_nav_string = info_soup.find(text="Comment on this case")
            if comment_nav_string:
                application.comment_url = comment_nav_string.parent['href']
            else:
                application.comment_url = "No Comments"

            # An example comment url:
            # http://publicaccess.westminster.gov.uk/publicaccess/tdc/dcapplication/application_comments_entryform.aspx?caseno=K586GHRP03500

            self._results.addApplication(application)

    return self._results
def getResultsByDayMonthYear(self, day, month, year):
    search_day = datetime.date(year, month, day)

    # Now get the search page
    get_response = urllib2.urlopen(self.base_url)
    get_soup = BeautifulSoup(get_response.read())

    # These are the inputs with a default value
    inputs_needed = [(x['id'], x['value']) for x in get_soup.form.findAll("input", value=True, type=lambda x: x != "submit")]

    # Add the submit button
    inputs_needed.append(('cmdWeeklyList', 'Search Database'))

    # We also need to add the date we want to search for.
    # This is the friday after the date searched for.
    # At weekends this will get you the friday before, but that isn't
    # a problem as there are no apps then.
    friday = search_day + datetime.timedelta(4 - search_day.weekday())
    inputs_needed.append(("ddlWeeklyList", friday.strftime(date_format)))

    # We'd like as many results as we can get away with on one page.
    # 50 is the largest option offered.
    inputs_needed.append(("ddlResultsPerPageWeeklyList", "50"))

    post_data = dict(inputs_needed)
    post_url = get_response.url

    # In case something goes wrong here, let's break out of the loop after at most 10 passes
    passes = 0
    while True:
        passes += 1

        post_response = urllib2.urlopen(post_url, urllib.urlencode(post_data))
        post_soup = BeautifulSoup(post_response.read())

        result_tables = post_soup.table.findAll("table")

        for result_table in result_tables:
            application = PlanningApplication()

            application.address = ', '.join(result_table.findPrevious("b").string.strip().split("\r"))
            application.postcode = getPostcodeFromText(application.address)

            trs = result_table.findAll("tr")

            application.council_reference = trs[0].findAll("td")[1].string.strip()
            application.date_received = datetime.datetime.strptime(trs[1].findAll("td")[1].string.strip(), date_format).date()
            application.description = trs[3].findAll("td")[1].string.strip()

            application.info_url = self.info_url %(urllib.quote(application.council_reference))

            # In order to avoid having to do a download for every app,
            # I'm setting the comment url to be the same as the info_url.
            # There is a comment page which can be got to by pressing the button.
            application.comment_url = application.info_url

            self._results.addApplication(application)

        # Which page are we on?
        page_no = int(post_soup.find("span", id="lblPageNo").b.string)
        total_pages = int(post_soup.find("span", id="lblTotalPages").b.string)

        if passes > 10 or not page_no < total_pages:
            break

        post_data = [
            ("__EVENTTARGET", "hlbNext"),
            ("__EVENTARGUMENT", ""),
            ("__VIEWSTATE", post_soup.find("input", id="__VIEWSTATE")['value']),
            ("__EVENTVALIDATION", post_soup.find("input", id="__EVENTVALIDATION")['value']),
            ]

        post_url = urlparse.urljoin(post_response.url, post_soup.find("form")['action'])

    return self._results
def getResultsByDayMonthYear(self, day, month, year):
    search_day = datetime.date(year, month, day)

    # There's going to be some faffing around here. We need a cookie to say
    # we have agreed to some T&Cs.

    # First get the search page - we'll be redirected somewhere else for
    # not having the cookie.
    first_request = urllib2.Request(self.first_url)
    first_response = urllib2.urlopen(first_request)
    cookie_jar.extract_cookies(first_response, first_request)

    first_page_soup = BeautifulSoup.BeautifulSoup(first_response.read())
    first_page_action = urlparse.urljoin(self.first_url, first_page_soup.form['action'])

    the_input = first_page_soup.form.input

    second_page_post_data = urllib.urlencode(
        (
            (the_input['name'], the_input['value']),
            )
        )

    second_request = urllib2.Request(first_page_action, second_page_post_data)
    cookie_jar.add_cookie_header(second_request)
    second_response = urllib2.urlopen(second_request)
    cookie_jar.extract_cookies(second_response, second_request)

    # Now (finally) get the search page. An example of the query string we are posting:
    # ApplicationNumber=&AddressPrefix=&Postcode=&CaseOfficer=&WardMember=&DateReceivedStart=31%2F08%2F2008&DateReceivedEnd=31%2F08%2F2008&DateDecidedStart=&DateDecidedEnd=&Locality=&AgentName=&ApplicantName=&ShowDecided=&DecisionLevel=&Sort1=FullAddressPrefix&Sort2=DateReceived+DESC&Submit=Search
    post_data = urllib.urlencode(
        (
            ("ApplicationNumber", ""),
            ("AddressPrefix", ""),
            ("Postcode", ""),
            ("CaseOfficer", ""),
            ("WardMember", ""),
            ("DateReceivedStart", search_day.strftime(date_format)),
            ("DateReceivedEnd", search_day.strftime(date_format)),
            ("DateDecidedStart", ""),
            ("DateDecidedEnd", ""),
            ("Locality", ""),
            ("AgentName", ""),
            ("ApplicantName", ""),
            ("ShowDecided", ""),
            ("DecisionLevel", ""),
            ("Sort1", "FullAddressPrefix"),
            ("Sort2", "DateReceived DESC"),
            ("Submit", "Search"),
            )
        )

    search_request = urllib2.Request(self.base_url)
    cookie_jar.add_cookie_header(search_request)
    search_response = urllib2.urlopen(search_request, post_data)

    soup = BeautifulSoup.BeautifulSoup(search_response.read())

    app_no_strings = soup.findAll(text="App. No.:")

    for app_no_string in app_no_strings:
        application = PlanningApplication()

        application.date_received = search_day
        application.council_reference = app_no_string.findNext("a").string.strip()
        application.info_url = urlparse.urljoin(self.base_url, app_no_string.findNext("a")['href'])

        application.address = ' '.join([x.strip() for x in app_no_string.findNext(text="Site Address:").findNext("td").contents if isinstance(x, BeautifulSoup.NavigableString)])
        application.postcode = getPostcodeFromText(application.address)

        application.comment_url = urlparse.urljoin(self.base_url, app_no_string.findNext(text="Comment on application").parent['href'])

        application.description = app_no_string.findNext(text="Description:").findNext("td").string.strip()

        self._results.addApplication(application)

    return self._results