def get_dev_ids(since_date, timeout=30):
    """Scrape the OttWatch listing of development applications.

    Requests ``http://ottwatch.ca/devapps?since=<since_date>`` and walks every
    table row found under a ``div.span5`` inside a ``div.row-fluid``.

    Args:
        since_date: Value appended (via ``str()``) to the ``since`` query
            parameter.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the caller forever.

    Returns:
        A dict keyed by consecutive integers starting at 0.  Each value has
        the shape ``{'allInfo': {'lat': ..., 'lon': ...}, 'devID': ...}``.
        ``lat``/``lon`` are items 0 and 1 of whatever ``CU.findLatLon``
        returns for the href of the link in the row's third cell; ``devID``
        is the text of the link nested in the row's first cell.
    """
    url = 'http://ottwatch.ca/devapps?since=' + str(since_date)
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    dev_ids = {}
    counter = 0
    for row_fluid in soup.find_all('div', {'class': 'row-fluid'}):
        for span_5 in row_fluid.find_all('div', {'class': 'span5'}):
            table = span_5.div.table if span_5.div is not None else None
            if table is None:
                continue
            for app_row in table.find_all('tr'):
                # Look the cells up once; header rows (no <td>) and short
                # rows are skipped instead of raising IndexError.
                cells = app_row.find_all('td')
                if len(cells) < 3:
                    continue
                map_link = cells[2].a
                if map_link is None:
                    # No map link means no coordinates; the row is skipped so
                    # every returned entry is complete.
                    continue
                lat_lon = CU.findLatLon(map_link.get('href'))
                # Build the whole entry in one place so 'devID' can never be
                # written to an entry that was not created.
                dev_ids[counter] = {
                    'allInfo': {'lat': lat_lon[0], 'lon': lat_lon[1]},
                    'devID': cells[0].b.nobr.a.string,
                }
                counter += 1
    return dev_ids
def _parse_summary(info, summary):
    """Copy application type, description and city link from the summary <p>."""
    parts = summary.contents
    # Development application type
    info['appType'] = parts[1].string
    # Development application description
    info['description'] = CU.clean_description(parts[2].string)
    # Application on the City of Ottawa website
    info['city_url'] = parts[3].a.get('href')


def _collect_related_documents(doc_table):
    """Pair each document link in ``doc_table`` with the date cell after it.

    Returns a dict keyed by consecutive integers; each value holds 'name' and
    'link', plus 'date' when a date cell follows the link cell.
    """
    documents = {}
    for cell in doc_table.find_all('td'):
        if cell.a is not None:
            # Entries are created only for link cells, so no empty
            # placeholder entries are left behind.
            documents[len(documents)] = {
                'name': cell.a.text,
                'link': cell.a.get('href'),
            }
        elif cell.nobr is not None and documents:
            # A date cell belongs to the most recently seen document; one
            # that precedes every link has nothing to attach to.
            documents[len(documents) - 1]['date'] = cell.nobr.text
    return documents


def _parse_main_table(info, table):
    """Fill ward, dates, address and related documents from the main table."""
    # Only odd child indexes are read; presumably the even ones are
    # whitespace text nodes between rows -- TODO confirm against the page.
    rows = table.contents

    # Ward
    (info['wardNum'],
     info['wardName'],
     info['wardCouncillor']) = CU.getWard(rows[1].text)
    # Received date.  The key spelling is kept as-is because other code may
    # already read 'receievedDate'.
    info['receievedDate'] = CU.getReceivedDate(rows[3].text)
    # Last updated date
    info['lastUpdated'] = CU.getLastUpdated(rows[5].text)
    # Address
    info['address'] = CU.getAddress(rows[7].text)
    # Related documents (nested table, when present)
    doc_table = rows[9].table
    if doc_table is not None:
        info['relatedDocuments'] = _collect_related_documents(doc_table)


def get_each_dev_app_info(dev_ids, timeout=30):
    """Add per-application details to every entry of ``dev_ids`` in place.

    For each entry, requests ``http://ottwatch.ca/devapps/<devID>`` and
    stores what it finds under ``entry['allInfo']``: ``appType``,
    ``description``, ``city_url``, ``wardNum``, ``wardName``,
    ``wardCouncillor``, ``receievedDate``, ``lastUpdated``, ``address``,
    ``relatedDocuments`` (when the page has a documents table) and
    ``statuses``.  The completed mapping is printed at the end.

    Args:
        dev_ids: Mapping whose values each contain a ``'devID'`` and an
            ``'allInfo'`` dict, as produced by ``get_dev_ids``.
        timeout: Seconds passed to ``requests.get`` for each page.

    Returns:
        None. ``dev_ids`` is mutated in place.
    """
    for dev_id in dev_ids:
        entry = dev_ids[dev_id]
        info = entry['allInfo']

        url = 'http://ottwatch.ca/devapps/' + str(entry['devID'])
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Walk every span6 column of every row-fluid container on the page.
        for row_fluid in soup.find_all('div', {'class': 'row-fluid'}):
            for span_6 in row_fluid.find_all('div', {'class': 'span6'}):
                summary = span_6.p
                if summary is not None:
                    _parse_summary(info, summary)

                table = span_6.table
                if table is not None:
                    _parse_main_table(info, table)
                    # Application lifecycle statuses
                    info['statuses'] = CU.getStatuses(entry['devID'])

    print(dev_ids)