Example #1
def parseTownAdminData():
    print "        Parsing House Admin Data"
    currTown = ""
    zips = ""
    data = []
    county = ""
    columns = ["Town", "Zip Codes", "County", "Tax Rate"]

    fileName = "town_zips"
    ws = pd.ExcelFile(os.path.join(houseDataLocation, fileName + ext)).parse("Sheet1")
    ws.sort_values(by="Town", inplace=True)

    for row in range(len(ws)):
        zipCode = str(ws.iloc[row, 0]).strip()
        # leading zeros are stripped because the zip column is read as numeric, so re-add them
        if len(zipCode) == 4:
            zipCode = "0" + zipCode
        zips = zips + zipCode + ","
        town = ws.iloc[row, 1]

        if not townExists(town):
            print(town, " is not in the list")

        taxRate = taxRateLookup(town)
        # A town can span multiple zips; keep accumulating until the town changes.
        if town != currTown:
            currTown = town
            county = ws.iloc[row, 2]

        # Flush the row when the next entry is a different town, or this is the last row.
        if row == len(ws) - 1 or town != ws.iloc[row + 1, 1]:
            data.append([town, zips, county, taxRate])
            zips = ""

    df = pd.DataFrame(data, columns=columns)
    with pd.ExcelWriter(os.path.join(houseDataLocation, "Town_Admin-2015" + ext), engine="openpyxl") as writer:
        df.to_excel(writer, sheet_name="Sheet1")

    return df
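
The function above depends on several module-level names (pd, os, houseDataLocation, ext, townExists, taxRateLookup) that are defined elsewhere in the source module. A minimal sketch of that assumed setup, with hypothetical stub helpers and illustrative values only, might look like this:

# Hypothetical module-level setup assumed by parseTownAdminData (not from the original source).
import os
import pandas as pd

houseDataLocation = "data/house"   # assumed folder holding the Excel files
ext = ".xlsx"                      # assumed spreadsheet extension

# Illustrative tax-rate table keyed by town; the real module loads this from its own data.
_taxRates = {"Acton": 19.05, "Boxborough": 16.66}

def townExists(town):
    # True if the town is present in the known tax-rate table.
    return town in _taxRates

def taxRateLookup(town):
    # Tax rate per $1,000 of assessed value, or 0.0 if the town is unknown.
    return _taxRates.get(town, 0.0)

# With that in place the parser can be invoked directly:
# adminDf = parseTownAdminData()
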
Example #2
def parseMLSHouseData(MLS_DATA_FILE):
    fileName = MLS_DATA_FILE
    columns = ["Town", "Tax Rate", "Median Sales Price", "Tax Cost"]
    data = []

    houseData = pd.read_excel(os.path.join(houseDataLocation, fileName + ext), header=0)
    houseData.sort_values(by="Town", inplace=True)

    for row in range(len(houseData)):
        town = houseData.iloc[row, 0]
        medSalePrice = houseData.iloc[row, 3]
        taxRate = taxRateLookup(town)
        taxCost = round(taxRate * (medSalePrice / 1000), 2)

        data.append([town, taxRate, medSalePrice, taxCost])

        # In for debugging, can be removed
        if not townExists(town):
            print "MLS Town", town, "not in Town List"

    df = pd.DataFrame(data, columns=columns)

    return df
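
The Tax Cost column applies a standard mill-rate calculation: the rate is expressed in dollars per $1,000 of value, so the annual cost is taxRate * (medSalePrice / 1000). A quick worked example with illustrative numbers, plus a hypothetical call assuming the setup sketched above:

# Worked example of the tax-cost formula (illustrative numbers only).
taxRate = 15.50          # dollars per $1,000 of assessed value
medSalePrice = 420000    # median sales price in dollars
taxCost = round(taxRate * (medSalePrice / 1000), 2)
print(taxCost)           # 6510.0

# Hypothetical invocation; the file name is a placeholder, and houseDataLocation/ext
# are assumed to be defined as in the earlier sketch:
# mlsDf = parseMLSHouseData("MLS_Median_Sales-2015")
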
Example #3
def getTruliaZipCodeStats(zips, startDate, endDate):
    print "        Downloading Trulia Housing Data"
    fileName = 'House_Data-2015'+ext
    data = []
    retry = []
    
    medianListPrice = ''
    avgListPrice = ''
    columns = pd.MultiIndex.from_tuples([('Zip',''),('House',''),('County',''),('Tax Rate', ''),
                                         ('All Properties', 'Avg List Price'),('All Properties', 'Median List Price'),('All Properties', 'Median Tax Cost'),
                                         ('3 Bedroom', 'Avg List Price'),('3 Bedroom', 'Median List Price'),('3 Bedroom', 'Median Tax Cost'),
                                         ('2 Bedroom', 'Avg List Price'),('2 Bedroom', 'Median List Price'),('2 Bedroom', 'Median Tax Cost'),
                                         ('1 Bedroom', 'Avg List Price'),('1 Bedroom', 'Median List Price'),('1 Bedroom', 'Median Tax Cost'),])
    
    url_base = 'http://api.trulia.com/webservices.php?library=TruliaStats&function=getZipCodeStats&'
        
    # Iterate through all zips passed in
    for row in range(len(zips)):
        if row % 2 == 0:
            time.sleep(1)

        zipCode = zips[row]
        town = zipLookup(zipCode)
        
        if not townExists(town):
            print(town, ' is not in the list')
            
        taxRate = float(taxRateLookup(town))
        county = countyLookup(zipCode)
        
        # Pre-fill all 16 columns with 'NA' because not every zip has data for every property type
        data.append([zipCode, town, county, taxRate, 'NA','NA','NA','NA','NA','NA','NA','NA','NA','NA','NA','NA'])
        
        url = url_base+'zipCode='+zipCode+'&startDate='+startDate+'&endDate='+endDate+'&apikey='+truliaKey
        try:
            soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'lxml')

            for each in soup.find_all('subcategory'):
                avgListPrice = float(each.averagelistingprice.text)
                medianListPrice = float(each.medianlistingprice.text)
                medianTaxCost = round(taxRate*(medianListPrice/1000), 2)

                if each.type.text == 'All Properties':
                    data[row][4] = avgListPrice
                    data[row][5] = medianListPrice
                    data[row][6] = medianTaxCost
                elif each.type.text == '3 Bedroom Properties':
                    data[row][7] = avgListPrice
                    data[row][8] = medianListPrice
                    data[row][9] = medianTaxCost
                elif each.type.text == '2 Bedroom Properties':
                    data[row][10] = avgListPrice
                    data[row][11] = medianListPrice
                    data[row][12] = medianTaxCost
                elif each.type.text == '1 Bedroom Properties':
                    data[row][13] = avgListPrice
                    data[row][14] = medianListPrice
                    data[row][15] = medianTaxCost
            
        except urllib.error.HTTPError:
            print("HTTP Error, skipping zip ", zipCode)
            retry.append(zipCode)
            
    print "            Skipped ", len(retry), "zip codes"
    df = pd.DataFrame(data, columns=columns)
    
#     writer = pd.ExcelWriter(dataLocation+fileName, engine="openpyxl")
#     df.to_excel(writer,"Sheet1")
#     writer.save()
    
    return df
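
In addition to the helpers above, this function assumes truliaKey, zipLookup, and countyLookup at module level, along with time, BeautifulSoup, and the urllib request/error modules. A hedged usage sketch with hypothetical stubs (the key, the lookup tables, and the date format are all placeholders):

# Hypothetical setup assumed by getTruliaZipCodeStats (not from the original source).
import time
import urllib.request
import urllib.error
from bs4 import BeautifulSoup

truliaKey = "YOUR_TRULIA_API_KEY"   # placeholder; the real key is defined elsewhere

# Illustrative zip-to-town and zip-to-county tables; the real module derives these
# from the Town_Admin spreadsheet built by parseTownAdminData.
_zipToTown = {"01720": "Acton", "01719": "Boxborough"}
_zipToCounty = {"01720": "Middlesex", "01719": "Middlesex"}

def zipLookup(zipCode):
    return _zipToTown.get(zipCode, "Unknown")

def countyLookup(zipCode):
    return _zipToCounty.get(zipCode, "Unknown")

# Hypothetical call covering calendar year 2015 (date format assumed):
# statsDf = getTruliaZipCodeStats(["01720", "01719"], "2015-01-01", "2015-12-31")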