# These parsing helpers assume the module-level imports (os, pandas as pd, time, urllib2,
# BeautifulSoup) and configuration (houseDataLocation, ext, truliaKey), plus the lookup
# helpers townExists, taxRateLookup, zipLookup, and countyLookup, defined elsewhere in the script.
def parseTownAdminData():
    print " Parsing House Admin Data"
    currTown = ""
    zips = ""
    data = []
    county = ""
    columns = ["Town", "Zip Codes", "County", "Tax Rate"]
    fileName = "town_zips"
    ws = pd.ExcelFile(os.path.join(houseDataLocation, fileName + ext)).parse("Sheet1")
    ws.sort_values(by="Town", inplace=True)
    for row in range(len(ws)):
        zipCode = str(ws.iloc[row, 0]).strip()
        # leading zeros get stripped (the zip column is read as numeric), so re-add them
        if len(zipCode) == 4:
            zipCode = "0" + zipCode
        zips = zips + zipCode + ","
        town = ws.iloc[row, 1]
        if not townExists(town):
            print town, " is not in the list"
        taxRate = taxRateLookup(town)
        # new town: remember it and pick up its county
        if town != currTown:
            currTown = town
            county = ws.iloc[row, 2]
        # towns can span multiple zips: flush the accumulated zips once the next row is a
        # different town, or this is the last row
        if row == (len(ws) - 1) or town != ws.iloc[row + 1, 1]:
            data.append([town, zips.rstrip(","), county, taxRate])
            zips = ""
    df = pd.DataFrame(data, columns=columns)
    writer = pd.ExcelWriter(os.path.join(houseDataLocation, "Town_Admin-2015" + ext), engine="openpyxl")
    df.to_excel(writer, "Sheet1")
    writer.save()
    return df
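
# The leading-zero fix above covers the common 4-digit case; a more general approach
# (a hedged sketch, not part of the original script) is to zero-pad any numeric zip read
# from Excel to 5 digits. The helper name normalizeZip is hypothetical.
def normalizeZip(rawZip):
    # Excel often stores zip codes as numbers (e.g. 2139.0), dropping leading zeros;
    # cast to int first to drop any trailing ".0", then left-pad to 5 characters.
    return str(int(float(rawZip))).zfill(5)

# Example: normalizeZip(2139) -> "02139", normalizeZip("6105") -> "06105"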
def parseMLSHouseData(MLS_DATA_FILE):
    fileName = MLS_DATA_FILE
    columns = ["Town", "Tax Rate", "Median Sales Price", "Tax Cost"]
    data = []
    houseData = pd.read_excel(os.path.join(houseDataLocation, fileName + ext), header=0)
    houseData.sort_values(by="Town", inplace=True)
    for row in range(len(houseData)):
        town = houseData.iloc[row, 0]
        medSalePrice = houseData.iloc[row, 3]
        taxRate = taxRateLookup(town)
        # the tax rate is applied per $1,000 of value; divide by 1000.0 to avoid integer division
        taxCost = round(taxRate * (medSalePrice / 1000.0), 2)
        data.append([town, taxRate, medSalePrice, taxCost])
        # debugging check, can be removed
        if not townExists(town):
            print "MLS Town", town, "not in Town List"
    df = pd.DataFrame(data, columns=columns)
    return df
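
# A hedged worked example of the tax-cost formula used above, assuming the tax rate is a
# mill rate (dollars of tax per $1,000 of value). The function and numbers are illustrative
# only and are not part of the original script.
def exampleTaxCost():
    taxRate = 17.50            # hypothetical: $17.50 per $1,000 of value
    medSalePrice = 350000.0    # hypothetical median sales price
    return round(taxRate * (medSalePrice / 1000.0), 2)   # -> 6125.0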
def getTruliaZipCodeStats(zips, startDate, endDate):
    print " Downloading Trulia Housing Data"
    fileName = 'House_Data-2015' + ext
    data = []
    retry = []
    columns = pd.MultiIndex.from_tuples([
        ('Zip', ''), ('House', ''), ('County', ''), ('Tax Rate', ''),
        ('All Properties', 'Avg List Price'), ('All Properties', 'Median List Price'), ('All Properties', 'Median Tax Cost'),
        ('3 Bedroom', 'Avg List Price'), ('3 Bedroom', 'Median List Price'), ('3 Bedroom', 'Median Tax Cost'),
        ('2 Bedroom', 'Avg List Price'), ('2 Bedroom', 'Median List Price'), ('2 Bedroom', 'Median Tax Cost'),
        ('1 Bedroom', 'Avg List Price'), ('1 Bedroom', 'Median List Price'), ('1 Bedroom', 'Median Tax Cost')])
    url_base = 'http://api.trulia.com/webservices.php?library=TruliaStats&function=getZipCodeStats&'
    # Iterate through all zips passed in
    for row in range(len(zips)):
        # pause every other request to throttle calls to the Trulia API
        if row % 2 == 0:
            time.sleep(1)
        zipCode = zips[row]
        town = zipLookup(zipCode)
        if not townExists(town):
            print town, ' is not in the list'
        taxRate = float(taxRateLookup(town))
        county = countyLookup(zipCode)
        # pre-fill the stats columns with 'NA' because not every zip returns data for every subcategory
        data.append([zipCode, town, county, taxRate,
                     'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA', 'NA'])
        url = url_base + 'zipCode=' + zipCode + '&startDate=' + startDate + '&endDate=' + endDate + '&apikey=' + truliaKey
        try:
            soup = BeautifulSoup(urllib2.urlopen(url).read(), 'lxml')
            for each in soup.findAll('subcategory'):
                avgListPrice = float(each.averagelistingprice.text)
                medianListPrice = float(each.medianlistingprice.text)
                medianTaxCost = round(taxRate * (medianListPrice / 1000), 2)
                if each.type.text == 'All Properties':
                    data[row][4] = avgListPrice
                    data[row][5] = medianListPrice
                    data[row][6] = medianTaxCost
                elif each.type.text == '3 Bedroom Properties':
                    data[row][7] = avgListPrice
                    data[row][8] = medianListPrice
                    data[row][9] = medianTaxCost
                elif each.type.text == '2 Bedroom Properties':
                    data[row][10] = avgListPrice
                    data[row][11] = medianListPrice
                    data[row][12] = medianTaxCost
                elif each.type.text == '1 Bedroom Properties':
                    data[row][13] = avgListPrice
                    data[row][14] = medianListPrice
                    data[row][15] = medianTaxCost
        except urllib2.HTTPError:
            print "HTTP Error, skipping zip ", zipCode
            retry.append(zipCode)
    print " Skipped ", len(retry), "zip codes"
    df = pd.DataFrame(data, columns=columns)
    # writer = pd.ExcelWriter(dataLocation + fileName, engine="openpyxl")
    # df.to_excel(writer, "Sheet1")
    # writer.save()
    return df
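
# getTruliaZipCodeStats records zip codes that failed with an HTTP error in `retry` but never
# re-requests them (the list is only counted in the "Skipped" message). One hedged sketch of a
# follow-up pass is below: it assumes the function were also modified to return the retry list,
# and simply re-runs the failed zips once after a pause. retryTruliaZipCodes is a hypothetical
# helper, not part of the original script, and reuses the module-level time import.
def retryTruliaZipCodes(failedZips, startDate, endDate, pause=5):
    if not failedZips:
        return None
    time.sleep(pause)  # give transient API errors a chance to clear before retrying
    return getTruliaZipCodeStats(failedZips, startDate, endDate)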