def read_csv(source_csv, city_name, city_tag, feed_date="2013-07-31"):
    """
    read one city's CSV export, geocode every address in it
    and store the matching buildings (and permit applicants)

    source_csv: path of the CSV file to read; the geocoding cache
        ("<city.tag>.json") is kept in the same directory
    city_name: city name appended to every geocoding search
    city_tag: tag used to look up the existing City;
        also names the "<city_tag>.tsv" file written at the end
    feed_date: "added" date of the FeedInfo the buildings are attributed to
        (defaults to the value that used to be hard-coded here)

    columns used from each row:
        0: address (with the street number after the last comma)
        1: building id
        6: invoice note (owner, normally prefixed with "Sent to:")
        12: number of units

    raises ValueError when no City with city_tag exists yet
    """
    city_options = City.objects.filter(tag=city_tag)
    print("Number of cities available: %s" % len(city_options))
    if not len(city_options):
        raise ValueError("CITY NOT FOUND! run make_cities.py first")
    city = city_options[0]

    print(city)

    #re-use the feed for this city and date if it was already created
    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print("Already had feed: %s, %s" % (feed.city, feed.added))
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print("Created new feed: %s" % feed.city.name)

    #placeholder person, only needed when a new Source is created below
    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print("Already had person: %s" % (person.name))
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print("Created new person: %s" % person.name)

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print("Already had source: %s, %s" % (feed_source.feed.city, feed_source.feed.added))
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print("Created new source: %s" % feed_source.feed.city.name)

    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if 'buildings' not in local_cache:
        local_cache['buildings'] = {}
    if 'parcels' not in local_cache:
        local_cache['parcels'] = {}

    #rebuild Location objects from what was cached on a previous run
    locations = {}
    for key, value in local_cache['buildings'].items():
        locations[key] = Location(value)

    #geocoder helper:
    geo = Geo()

    with open(source_csv) as csvfile:
        reader = unicode_csv_reader(csvfile)

        #just print the first row:
        print('>, <'.join(next(reader)))

        for count, row in enumerate(reader, 1):
            print("Looking at row: %s" % count)

            #need to fix the number being at the end of the address
            parts = row[0].split(',')
            anumber = parts[-1]
            street = ",".join(parts[:-1])
            address = "%s %s" % (anumber, street)

            bldg_id = row[1]
            print(bldg_id)

            #this is where owner is stored
            invoice_note = row[6]
            print(invoice_note)
            if re.match('Sent to:', invoice_note):
                print("changing invoice note from: %s" % invoice_note)
                #drop the 8 character "Sent to:" prefix
                invoice_note = invoice_note[8:]
                print("to: %s" % invoice_note)
            else:
                print("!!!!!invoice note does not start with Sent to!!!!!")
                print("")
                print("")

            no_units = row[12]

            #manual fixes: conversions replaces known addresses;
            #an empty replacement means the row should be skipped
            if address.upper() in conversions:
                address = conversions[address.upper()]

            #make sure it's not one we're skipping:
            if not address:
                print("SKIPPING ITEM: %s" % row[1])
                #without this the row fell through to the geocoding below
                #and re-used (and overwrote) the previous row's location
                continue

            address_key = address.upper()

            #check if we've started processing any results for this row
            if address_key in locations:
                location = locations[address_key]
            else:
                location = Location()

            #temporarily just want to look at google and bing again
            location.sources = ["google", "bing"]

            #do some geocoding, as needed:
            search = "%s, %s, %s" % (address_key, city_name, city.state)

            any_updated = False
            for geo_source in location.sources:
                #force=True asks for a lookup even if one was done before
                update = geo.lookup(search, geo_source, location, force=True)
                if update:
                    any_updated = True

            #restore the complete list of sources once the lookups are done
            location.sources = ["google", "bing", "usgeo", "geonames", "openmq", "mq"]

            #this is the case for brand new searches
            #(which are updated in a different sense)
            if not hasattr(location, "address_alt") or not location.address_alt:
                any_updated = True

            location.address_alt = search
            locations[address_key] = location

            #handle the database storage
            bldg = make_building(location, bldg_id, city, feed_source, no_units=no_units)

            if invoice_note:
                make_person(invoice_note, bldg, "Permit Applicant")

            if any_updated:
                #back it up for later
                #this is needed when downloading GPS coordinates...
                #the rest of the time it slows things down
                local_cache['buildings'] = {}
                for key, value in locations.items():
                    local_cache['buildings'][key] = value.to_dict()
                save_json(cache_destination, local_cache)

            print("")

    destination = '%s.tsv' % city_tag
    save_results(locations, destination)
def process_row(current, row, keys, local_cache, city, feed_source, count):
    """
    work on adding all of the details from one row
    to the matching building address

    current: dict that gets filled in place with keys[i] -> row[i];
        the search results are also stored under current['results']
    row: values for one spreadsheet row
    keys: column names, in the same order as row
    local_cache: dict keyed by address; when the address is already there
        the cached entry's 'results' are used instead of a new search
    city: its name and state are appended to the address that is searched
    feed_source: stored as the source of the building
    count: not used here; kept so existing callers keep working

    returns the address for the row (after any manual conversion)

    raises ValueError when the unit found in the address disagrees with the
    unit column, or for unknown parking or "who pays" values
    """
    for index, key in enumerate(keys):
        current[key] = row[index]

    results = None

    address = current['street_address']
    #if unit is in second column, need it here...
    #otherwise everything gets over-written for address in local_cache:
    if current['unit_if_applicable']:
        address = ', '.join([address, current['unit_if_applicable']])

    if address in conversions:
        address = conversions[address]
        print("Using manually fixed address: %s" % address)

    if address in local_cache:
        print("local_cache matched  %s" % address)
        results = local_cache[address]['results']
    else:
        #do the search for the first time

        #get rid of any '*' characters...
        #these are not really part of the address:
        addy = address.replace('*', '').strip()

        #seeing units in street address with no '#' or other prefix
        #but it is separated by a comma...
        #extract that here (and leave out ',')
        parts = addy.split(',')
        if len(parts) > 1:
            #treat last part as a unit
            unit = parts[-1].strip()
            addy = ",".join(parts[:-1]).strip()

            #if the spreadsheet has a unit column too, both must agree
            if unit and current['unit_if_applicable'] and unit != current['unit_if_applicable']:
                raise ValueError("Found both unit: %s and unit from spreadsheet: %s" % (unit, current['unit_if_applicable']))
        else:
            unit = current['unit_if_applicable']

        #also need to add in city, state, here to help limit matches
        addy = ", ".join([addy, city.name, city.state])
        print(addy)

        results = address_search(addy, unit)

    assert results

    lookup_building_with_geo(results, make=True)
    current['results'] = results

    print(results)

    if results.errors:
        #keep a list of the addresses that could not be processed;
        #the with block closes the file even if a write fails
        with codecs.open("skips.txt", 'a', encoding='utf-8') as skips:
            skips.write(address)
            skips.write('\n')
    else:
        bldg = results.building
        assert bldg
        unit = results.unit
        assert unit

        #Now update the unit and building details as necessary:
        #building (spreadsheet column -> building attribute)
        bldg_map = {
            "unit_type": "type",
            "laundry": "laundry",
            "parking": "parking_options",
            "pets": "pets",
            "gym_fitness_center": "gym",
            "game_room_rec_center_community_center": "game_room",
            "pool": "pool",
            "other_amenities": "amenities",
            "bike_friendly": "bike_friendly_details",
            "recycling": "recycling",
            "composting": "composting",
            "gardening": "garden_details",
            "public_transit": "transit_friendly_details",
            "walk_friendly": "walk_friendly_details",
            "other_smartliving_features": "energy_saving_details",
            "air_conditioning": "air_conditioning",
            "energy_saving_features": "energy_saving_other",
        }

        #columns that only need a boolean conversion
        boolean_columns = (
            "recycling",
            "pets",
            "gym_fitness_center",
            "pool",
            "game_room_rec_center_community_center",
        )

        for sk, model_attribute in bldg_map.items():
            value = current[sk]

            #now need to do any field specific conversions...
            #(but only for those that actually need it;
            #any other column is stored the way it is in the spreadsheet)
            if sk == "laundry":
                (clean, rest) = check_choices(bldg.LAUNDRY_CHOICES, value)
                if re.search(r'W/D incl\. in unit', rest):
                    clean.append('in_unit')
                value = ','.join(clean)

            elif sk == "air_conditioning":
                (clean, rest) = check_choices(bldg.AC_CHOICES, value)
                value = ','.join(clean)

            elif sk == "unit_type":
                (clean, rest) = check_choices(bldg.TYPE_CHOICES, value)
                #should only have one building type!
                value = ','.join(clean)

            elif sk in boolean_columns:
                value = check_boolean(value)

            elif sk == "bike_friendly":
                (clean, rest) = check_choices(bldg.BIKE_CHOICES, value)
                value = clean
                if rest:
                    bldg.bike_friendly_other = rest

            elif sk == "public_transit":
                (clean, rest) = check_choices(bldg.TRANSIT_CHOICES, value)
                print(clean)
                print(rest)
                value = clean
                if rest:
                    bldg.transit_friendly_other = rest

            elif sk == "parking":
                print(value)
                (clean, rest) = check_choices(bldg.PARKING_CHOICES, value)
                print(clean)
                print(rest)
                value = clean
                if rest:
                    raise ValueError("Unknown parking option: %s" % (rest,))

            elif sk == "other_smartliving_features":
                (clean, rest) = check_choices(bldg.ENERGY_SAVING_CHOICES, value)
                value = clean
                if rest:
                    bldg.energy_saving_other = rest

            elif sk == "energy_saving_features":
                #this value shows up here in the spreadsheet
                #that is incorrect... bad data...
                #this is a fix for that
                if "Near Bus Route" == value:
                    #multiselectfield returns a list automatically:
                    cur_values = bldg.transit_friendly_details
                    if 'access' not in cur_values:
                        print("Adding access to transit friendly details")
                        cur_values.append('access')
                    bldg.transit_friendly_details = cur_values

                (clean, rest) = check_choices(bldg.ENERGY_SAVING_CHOICES, value)
                value = clean
                if rest:
                    #might lose some data here if both
                    #other_smartliving_features and this are set with different data
                    #NOTE(review): this column maps to energy_saving_other too,
                    #so the setattr below replaces rest with the clean list
                    #-- confirm that is intended
                    bldg.energy_saving_other = rest

            elif sk == "gardening":
                (clean, rest) = check_choices(bldg.GARDEN_CHOICES, value)
                value = clean
                if rest:
                    bldg.garden_other = rest

            elif sk == "walk_friendly":
                (clean, rest) = check_choices(bldg.WALK_CHOICES, value)
                print(clean)
                print(rest)
                value = clean
                if rest:
                    bldg.walk_friendly_other = rest

            print("Setting %s (currently: %s) to: %s" % (model_attribute, getattr(bldg, model_attribute), value))
            setattr(bldg, model_attribute, value)

            #update values based on anything that was added here
            bldg.set_booleans()

        who_pays = {
            "who_pays_for_electricity": "who_pays_electricity",
            "who_pays_for_natural_gas": "who_pays_gas",
            "who_pays_for_water": "who_pays_water",
            "who_pays_for_trash_recycling_pickup": "who_pays_trash",
            "who_pays_for_cable": "who_pays_cable",
            "who_pays_for_internet": "who_pays_internet",
        }

        for sk, model_attribute in who_pays.items():
            (value, rest) = check_who_pays(current[sk], bldg.WHO_PAYS_CHOICES)
            if rest:
                #report the part that was not recognized
                #(this used to show the cleaned up value instead)
                raise ValueError("Unknown who pays value: %s" % (rest,))

            print("Setting %s (currently: %s) to: %s" % (model_attribute, getattr(bldg, model_attribute), value))
            setattr(bldg, model_attribute, value)

        #unit (spreadsheet column -> unit attribute):
        numbers = {
            "rent": "rent",
            "security_deposit": "deposit",
            "sq_feet_per_unit": "sqft",
            "num_bedrooms": "bedrooms",
            "num_bathrooms": "bathrooms",
            "maximum_occupancy_per_unit": "max_occupants",
            "electric_utility_cost_average_per_mo": "average_electricity",
            "electric_utility_cost_low": "electricity_min",
            "electric_utility_cost_high": "electricity_max",
            "natural_gas_utility_cost_average_per_mo": "average_gas",
            "natural_gas_utility_cost_low": "gas_min",
            "natural_gas_utility_cost_high": "gas_max",
        }

        for sk, model_attribute in numbers.items():
            value = check_number(current[sk])
            #anything check_number leaves empty is not stored
            if value:
                print("Setting %s (currently: %s) to: %s" % (model_attribute, getattr(unit, model_attribute), value))
                setattr(unit, model_attribute, float(value))

        agent_name = current["agent_property_manager"].strip()
        agent_site = current["property_website_url"].strip()
        #special case:
        #a deeper parkermgt link is kept as the building's website
        #and the agent gets the main site instead
        if agent_site != "http://parkermgt.com/" and 'parkermgt' in agent_site:
            bldg.website = agent_site
            agent_site = "http://parkermgt.com/"

        agent_address = current["agent_property_manager_address"].strip()
        agent_phone = current["agent_property_manager_phone"].strip()

        owner = current["owner"].strip()

        if agent_name or agent_site or agent_address:
            (person, bldg_person) = make_person(agent_name, bldg, "Agent", address=agent_address, website=agent_site, phone=agent_phone)
            print("created/matched agent: %s" % person.name)
            print(person)

        if owner:
            make_person(owner, bldg, "Owner")

        #columns that are not imported:
        #heat_source, renewable_energy
        #for listing: lease_period, availability
        #for utility: electricity_provider, natural_gas_provider,
        #utility_info_source, who_pays_for_telephone_land_line
        #comments

        bldg.source = feed_source
        bldg.geocoder = "google"

        bldg.save()
        unit.save()
        bldg.update_utility_averages()
        bldg.update_rent_details()

    return address
# Example no. 3
# 0
def read_csv(source_csv, city_name, city_tag, feed_date="2013-07-31"):
    """
    read one city's CSV export, geocode every address in it
    and store the matching buildings (and permit applicants)

    source_csv: path of the CSV file to read; the geocoding cache
        ("<city.tag>.json") is kept in the same directory
    city_name: city name appended to every geocoding search
    city_tag: tag used to look up the existing City;
        also names the "<city_tag>.tsv" file written at the end
    feed_date: "added" date of the FeedInfo the buildings are attributed to
        (defaults to the value that used to be hard-coded here)

    columns used from each row:
        0: address (with the street number after the last comma)
        1: building id
        6: invoice note (owner, normally prefixed with "Sent to:")
        12: number of units

    raises ValueError when no City with city_tag exists yet
    """
    city_options = City.objects.filter(tag=city_tag)
    print("Number of cities available: %s" % len(city_options))
    if not len(city_options):
        raise ValueError("CITY NOT FOUND! run make_cities.py first")
    city = city_options[0]

    print(city)

    #re-use the feed for this city and date if it was already created
    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print("Already had feed: %s, %s" % (feed.city, feed.added))
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print("Created new feed: %s" % feed.city.name)

    #placeholder person, only needed when a new Source is created below
    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print("Already had person: %s" % (person.name))
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print("Created new person: %s" % person.name)

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print("Already had source: %s, %s" % (feed_source.feed.city,
                                              feed_source.feed.added))
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print("Created new source: %s" % feed_source.feed.city.name)

    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if 'buildings' not in local_cache:
        local_cache['buildings'] = {}
    if 'parcels' not in local_cache:
        local_cache['parcels'] = {}

    #rebuild Location objects from what was cached on a previous run
    locations = {}
    for key, value in local_cache['buildings'].items():
        locations[key] = Location(value)

    #geocoder helper:
    geo = Geo()

    with open(source_csv) as csvfile:
        reader = unicode_csv_reader(csvfile)

        #just print the first row:
        print('>, <'.join(next(reader)))

        for count, row in enumerate(reader, 1):
            print("Looking at row: %s" % count)

            #need to fix the number being at the end of the address
            parts = row[0].split(',')
            anumber = parts[-1]
            street = ",".join(parts[:-1])
            address = "%s %s" % (anumber, street)

            bldg_id = row[1]
            print(bldg_id)

            #this is where owner is stored
            invoice_note = row[6]
            print(invoice_note)
            if re.match('Sent to:', invoice_note):
                print("changing invoice note from: %s" % invoice_note)
                #drop the 8 character "Sent to:" prefix
                invoice_note = invoice_note[8:]
                print("to: %s" % invoice_note)
            else:
                print("!!!!!invoice note does not start with Sent to!!!!!")
                print("")
                print("")

            no_units = row[12]

            #manual fixes: conversions replaces known addresses;
            #an empty replacement means the row should be skipped
            if address.upper() in conversions:
                address = conversions[address.upper()]

            #make sure it's not one we're skipping:
            if not address:
                print("SKIPPING ITEM: %s" % row[1])
                #without this the row fell through to the geocoding below
                #and re-used (and overwrote) the previous row's location
                continue

            address_key = address.upper()

            #check if we've started processing any results for this row
            if address_key in locations:
                location = locations[address_key]
            else:
                location = Location()

            #temporarily just want to look at google and bing again
            location.sources = ["google", "bing"]

            #do some geocoding, as needed:
            search = "%s, %s, %s" % (address_key, city_name, city.state)

            any_updated = False
            for geo_source in location.sources:
                #force=True asks for a lookup even if one was done before
                update = geo.lookup(search, geo_source, location, force=True)
                if update:
                    any_updated = True

            #restore the complete list of sources once the lookups are done
            location.sources = [
                "google", "bing", "usgeo", "geonames", "openmq", "mq"
            ]

            #this is the case for brand new searches
            #(which are updated in a different sense)
            if not hasattr(location,
                           "address_alt") or not location.address_alt:
                any_updated = True

            location.address_alt = search
            locations[address_key] = location

            #handle the database storage
            bldg = make_building(location,
                                 bldg_id,
                                 city,
                                 feed_source,
                                 no_units=no_units)

            if invoice_note:
                make_person(invoice_note, bldg, "Permit Applicant")

            if any_updated:
                #back it up for later
                #this is needed when downloading GPS coordinates...
                #the rest of the time it slows things down
                local_cache['buildings'] = {}
                for key, value in locations.items():
                    local_cache['buildings'][key] = value.to_dict()
                save_json(cache_destination, local_cache)

            print("")

    destination = '%s.tsv' % city_tag
    save_results(locations, destination)
# Example no. 4
# 0
def read_csv(source_csv, city_name, city_tag):
    """Import one city's rental CSV export into the database and local cache.

    Looks up the City tagged ``city_tag`` (it must already exist), then finds
    or creates the FeedInfo, the placeholder "Blank" Person and the Source
    used for this import.  Every data row of ``source_csv`` is turned into a
    building (``make_building``), optionally a unit (``make_unit``) and an
    "Agent" person (``make_person``).  Coordinates are taken directly from
    the CSV; the geocoder source list is empty, so ``geo.lookup`` is never
    called here.

    Processed locations are mirrored to ``<city.tag>.json`` in the same
    directory as ``source_csv`` and finally passed to ``save_results`` with
    the destination ``<city_tag>.tsv``.

    Args:
        source_csv: path of the CSV file; its first row is printed and not
            imported.
        city_name: city name used when building the search string.
        city_tag: tag of an existing City; also names the .tsv output.

    Raises:
        ValueError: if no City with ``city_tag`` exists.
    """
    city_options = City.objects.filter(tag=city_tag)
    print("Number of cities available: %s" % len(city_options))
    if not len(city_options):
        raise ValueError("CITY NOT FOUND! run make_cities.py first")
    city = city_options[0]

    print(city)

    feed_date = "2013-10-16"

    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print("Already had feed: %s, %s" % (feed.city, feed.added))
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print("Created new feed: %s" % feed.city.name)

    #placeholder person, only attached to a Source created below
    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print("Already had person: %s" % (person.name))
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print("Created new person: %s" % person.name)

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print("Already had source: %s, %s" % (feed_source.feed.city, feed_source.feed.added))
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print("Created new source: %s" % feed_source.feed.city.name)

    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if 'buildings' not in local_cache:
        local_cache['buildings'] = {}
    if 'parcels' not in local_cache:
        local_cache['parcels'] = {}

    locations = {}
    for key, value in local_cache['buildings'].items():
        locations[key] = Location(value)

    #geocoder helper:
    geo = Geo()

    #number of rows dropped because their address converted to an empty value
    skips = 0
    with open(source_csv) as csvfile:
        reader = unicode_csv_reader(csvfile)

        #just print the first row:
        print('>, <'.join(next(reader)))

        #to randomize processing order, randomize the rows in the csv itself
        for count, row in enumerate(reader, 1):
            print("Looking at row: %s" % count)
            print(row)

            #columns that are not imported (per the sheet):
            # 0 address as given, 9 in/out, 10 zip, 11 assessor id,
            # 12 address num, 13 x, 14 y, 17 entry floor ('z'), 18 strcid,
            # 19 parent, 20 app_, 21 hteloc, 22 zone, 24 number of buildings,
            # 26 inspection type, 27 app number, 28 date received,
            # 29 application type, 30 owner id, 31 operator id, 32 agent id,
            # 33 mail to, 34 central heat (Y/N), 35 heat mech,
            # 36-40 agent details, 41 business name

            #parcel id doubles as bldg_id for make_building
            #(that gets used for parcel too)
            bldg_id = row[1]

            street_num = row[2]
            street_dir = row[3]
            street_name = row[4]
            street_sfx = row[5]
            #eg building number
            qualifier_pre = row[6]
            #eg "UNIT" or "APT"
            qualifier_post = row[7]
            apt_num = row[8]

            #xcoord == lng
            lng = row[15]
            lat = row[16]

            bldg_type = row[23]
            no_units = row[25]

            #could be owner, could be agent
            owner_name = row[42]
            owner_address = ", ".join([row[43], row[44], row[45], row[46], row[47]])

            address_main = " ".join([street_num, street_dir, street_name, street_sfx, qualifier_pre])
            address_main = address_main.strip()
            #get rid of any double spaces; repeat because several adjacent
            #empty fields leave runs longer than two that one pass would miss
            while "  " in address_main:
                address_main = address_main.replace("  ", " ")

            apt_main = " ".join([qualifier_post, apt_num])
            apt_main = apt_main.strip()

            address = address_main
            print(address)

            #check if this is one we want to convert (or skip)
            if address.upper() in conversions:
                address = conversions[address.upper()]

            #make sure it's not one we're skipping.  Without the continue a
            #skipped row would fall through and reuse the previous row's
            #location (or fail outright on the first row).
            if not address:
                print("SKIPPING ITEM: %s" % row[1])
                skips += 1
                continue

            #check if we've started processing any results for this row
            location_key = address.upper()
            if location_key in locations:
                location = locations[location_key]
            else:
                location = Location()

            #skip geocoding for columbia
            location.sources = []

            #do some geocoding, as needed:
            search = "%s, %s, %s" % (location_key, city_name, city.state)

            any_updated = False
            for geo_source in location.sources:
                update = geo.lookup(search, geo_source, location, force=True)
                if update:
                    any_updated = True

            location.sources = ['csv', "google", "bing", "usgeo", "geonames", "openmq", "mq"]

            #manually add data from csv here:
            location.csv = [{'place': address, 'lat': lat, 'lng': lng}]

            #this is the case for brand new searches
            #(which are updated in a different sense)
            if not hasattr(location, "address_alt") or not location.address_alt:
                any_updated = True

            location.address_alt = search
            locations[location_key] = location

            #handle the database storage
            bldg = make_building(location, bldg_id, city, feed_source, no_units=no_units, bldg_type=bldg_type)

            if apt_main:
                make_unit(apt_main, bldg)

            make_person(owner_name, bldg, "Agent", address=owner_address)

            if any_updated:
                #back it up for later
                #enable this when downloading GPS coordinates...
                #the rest of the time it slows things down
                local_cache['buildings'] = dict(
                    (key, value.to_dict()) for key, value in locations.items())
                save_json(cache_destination, local_cache)

            print("")

    destination = '%s.tsv' % city_tag
    save_results(locations, destination)
Ejemplo n.º 5
0
def read_csv(source_csv, city_name, city_tag):
    """Import one city's rental CSV export into the database and local cache.

    Looks up the City tagged ``city_tag`` (it must already exist), then finds
    or creates the FeedInfo, the placeholder "Blank" Person and the Source
    used for this import.  Every data row of ``source_csv`` is turned into a
    building (``make_building``), optionally a unit (``make_unit``) and an
    "Agent" person (``make_person``).  Coordinates are taken directly from
    the CSV; the geocoder source list is empty, so ``geo.lookup`` is never
    called here.

    Processed locations are mirrored to ``<city.tag>.json`` in the same
    directory as ``source_csv`` and finally passed to ``save_results`` with
    the destination ``<city_tag>.tsv``.

    Args:
        source_csv: path of the CSV file; its first row is printed and not
            imported.
        city_name: city name used when building the search string.
        city_tag: tag of an existing City; also names the .tsv output.

    Raises:
        ValueError: if no City with ``city_tag`` exists.
    """
    city_options = City.objects.filter(tag=city_tag)
    print("Number of cities available: %s" % len(city_options))
    if not len(city_options):
        raise ValueError("CITY NOT FOUND! run make_cities.py first")
    city = city_options[0]

    print(city)

    feed_date = "2013-10-16"

    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print("Already had feed: %s, %s" % (feed.city, feed.added))
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print("Created new feed: %s" % feed.city.name)

    #placeholder person, only attached to a Source created below
    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print("Already had person: %s" % (person.name))
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print("Created new person: %s" % person.name)

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print("Already had source: %s, %s" % (feed_source.feed.city,
                                              feed_source.feed.added))
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print("Created new source: %s" % feed_source.feed.city.name)

    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    #keep a local copy of data we've processed...
    #this should help with subsequent calls
    #to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if 'buildings' not in local_cache:
        local_cache['buildings'] = {}
    if 'parcels' not in local_cache:
        local_cache['parcels'] = {}

    locations = {}
    for key, value in local_cache['buildings'].items():
        locations[key] = Location(value)

    #geocoder helper:
    geo = Geo()

    #number of rows dropped because their address converted to an empty value
    skips = 0
    with open(source_csv) as csvfile:
        reader = unicode_csv_reader(csvfile)

        #just print the first row:
        print('>, <'.join(next(reader)))

        #to randomize processing order, randomize the rows in the csv itself
        for count, row in enumerate(reader, 1):
            print("Looking at row: %s" % count)
            print(row)

            #columns that are not imported (per the sheet):
            # 0 address as given, 9 in/out, 10 zip, 11 assessor id,
            # 12 address num, 13 x, 14 y, 17 entry floor ('z'), 18 strcid,
            # 19 parent, 20 app_, 21 hteloc, 22 zone, 24 number of buildings,
            # 26 inspection type, 27 app number, 28 date received,
            # 29 application type, 30 owner id, 31 operator id, 32 agent id,
            # 33 mail to, 34 central heat (Y/N), 35 heat mech,
            # 36-40 agent details, 41 business name

            #parcel id doubles as bldg_id for make_building
            #(that gets used for parcel too)
            bldg_id = row[1]

            street_num = row[2]
            street_dir = row[3]
            street_name = row[4]
            street_sfx = row[5]
            #eg building number
            qualifier_pre = row[6]
            #eg "UNIT" or "APT"
            qualifier_post = row[7]
            apt_num = row[8]

            #xcoord == lng
            lng = row[15]
            lat = row[16]

            bldg_type = row[23]
            no_units = row[25]

            #could be owner, could be agent
            owner_name = row[42]
            owner_address = ", ".join([
                row[43], row[44], row[45], row[46], row[47]
            ])

            address_main = " ".join([
                street_num, street_dir, street_name, street_sfx, qualifier_pre
            ])
            address_main = address_main.strip()
            #get rid of any double spaces; repeat because several adjacent
            #empty fields leave runs longer than two that one pass would miss
            while "  " in address_main:
                address_main = address_main.replace("  ", " ")

            apt_main = " ".join([qualifier_post, apt_num])
            apt_main = apt_main.strip()

            address = address_main
            print(address)

            #check if this is one we want to convert (or skip)
            if address.upper() in conversions:
                address = conversions[address.upper()]

            #make sure it's not one we're skipping.  Without the continue a
            #skipped row would fall through and reuse the previous row's
            #location (or fail outright on the first row).
            if not address:
                print("SKIPPING ITEM: %s" % row[1])
                skips += 1
                continue

            #check if we've started processing any results for this row
            location_key = address.upper()
            if location_key in locations:
                location = locations[location_key]
            else:
                location = Location()

            #skip geocoding for columbia
            location.sources = []

            #do some geocoding, as needed:
            search = "%s, %s, %s" % (location_key, city_name, city.state)

            any_updated = False
            for geo_source in location.sources:
                update = geo.lookup(search, geo_source, location, force=True)
                if update:
                    any_updated = True

            location.sources = [
                'csv', "google", "bing", "usgeo", "geonames", "openmq", "mq"
            ]

            #manually add data from csv here:
            location.csv = [{'place': address, 'lat': lat, 'lng': lng}]

            #this is the case for brand new searches
            #(which are updated in a different sense)
            if not hasattr(location,
                           "address_alt") or not location.address_alt:
                any_updated = True

            location.address_alt = search
            locations[location_key] = location

            #handle the database storage
            bldg = make_building(location,
                                 bldg_id,
                                 city,
                                 feed_source,
                                 no_units=no_units,
                                 bldg_type=bldg_type)

            if apt_main:
                make_unit(apt_main, bldg)

            make_person(owner_name,
                        bldg,
                        "Agent",
                        address=owner_address)

            if any_updated:
                #back it up for later
                #enable this when downloading GPS coordinates...
                #the rest of the time it slows things down
                local_cache['buildings'] = dict(
                    (key, value.to_dict()) for key, value in locations.items())
                save_json(cache_destination, local_cache)

            print("")

    destination = '%s.tsv' % city_tag
    save_results(locations, destination)
Ejemplo n.º 6
0
def read_csv(source_csv):
    """Import the Bloomington rental CSV into the database and local cache.

    Looks up the City tagged "bloomington_in" (it must already exist), then
    finds or creates the FeedInfo, the placeholder "Blank" Person and the
    Source used for this import.  Every data row of ``source_csv`` is
    geocoded through ``geo.lookup`` (Google only, forced), stored with
    ``make_building``, and its owner and agent are stored with
    ``make_person``.

    Processed locations are mirrored to ``<city.tag>.json`` in the same
    directory as ``source_csv`` and finally passed to ``save_results`` with
    the destination "bloomington-filtered.tsv".

    Args:
        source_csv: path of the UTF-8 CSV file; its first row is printed and
            not imported.

    Raises:
        ValueError: if the "bloomington_in" City does not exist.
    """
    city_options = City.objects.filter(tag="bloomington_in")
    print("Number of cities available: %s" % len(city_options))
    if not len(city_options):
        raise ValueError("CITY NOT FOUND! run make_cities.py first")
    city = city_options[0]

    print(city)

    feed_date = "2013-08-29"

    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print("Already had feed: %s, %s" % (feed.city, feed.added))
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print("Created new feed: %s" % feed.city)

    # placeholder person, only attached to a Source created below
    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print("Already had person: %s" % (person.name))
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print("Created new person: %s" % person.name)

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print("Already had source: %s, %s" % (feed_source.feed.city, feed_source.feed.added))
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print("Created new source: %s" % feed_source.feed.city)

    cache_file = "%s.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    # keep a local copy of data we've processed...
    # this should help with subsequent calls
    # to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if "buildings" not in local_cache:
        local_cache["buildings"] = {}
    if "parcels" not in local_cache:
        local_cache["parcels"] = {}

    locations = {}
    for key, value in local_cache["buildings"].items():
        locations[key] = Location(value)

    # geocoder helper:
    geo = Geo()

    # number of rows dropped because their address converted to an empty value
    skips = 0
    with codecs.open(source_csv, "rb", encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile)

        # just print the first row:
        print(">, <".join(next(reader)))

        for count, row in enumerate(reader, 1):
            print("Looking at row: %s" % count)

            bldg_id = row[0]
            print(bldg_id)

            address = row[1]
            print(address)

            owner = row[2]
            # row[3] (owner contact) is not imported
            agent = row[4]

            bldg_units = row[9]
            print(bldg_units)

            units_bdrms = row[10]
            print(units_bdrms)

            # check if this is one we want to convert (or skip)
            if address.upper() in conversions:
                address = conversions[address.upper()]

            # make sure it's not one we're skipping:
            if not address:
                print("SKIPPING ITEM: %s" % row[1])
                skips += 1
                continue

            location_key = address.upper()
            if location_key in locations:
                location = locations[location_key]
            else:
                location = Location()

            # temporarily just want to look at google again
            location.sources = ["google"]

            # do some geocoding, as needed:
            search = "%s, Bloomington IN" % location_key

            any_updated = False
            for geo_source in location.sources:
                update = geo.lookup(search, geo_source, location, force=True)
                if update:
                    any_updated = True

            location.sources = ["google", "bing", "usgeo", "geonames", "openmq", "mq"]

            # brand new locations have no address_alt yet and must be saved
            if not hasattr(location, "address_alt") or not location.address_alt:
                any_updated = True

            location.address_alt = search
            location.bldg_units = bldg_units
            location.units_bdrms = units_bdrms
            locations[location_key] = location

            # handle the database storage
            bldg = make_building(location, bldg_id, city, feed_source)

            if owner:
                result = special_cases(owner)
                if result:
                    (owner_name, owner_address) = result
                else:
                    (owner_name, owner_address, _owner_phone, _remainder) = parse_person(owner)

                # store the owner for both branches; previously this call was
                # only reachable after parse_person, so owners resolved by
                # special_cases were unpacked and then never saved
                # NOTE(review): the agent call below passes city=city and this
                # one does not -- confirm whether that is intentional
                if owner_name:
                    make_person(owner_name, bldg, "Owner", address=owner_address)

            if agent and agent != "No Agent":
                (agent_name, agent_address, _agent_phone, _remainder) = parse_person(agent)

                if agent_name:
                    make_person(agent_name, bldg, "Agent", address=agent_address, city=city)

            if any_updated:
                # back it up for later
                # enable this when downloading GPS coordinates...
                # the rest of the time it slows things down
                local_cache["buildings"] = dict(
                    (key, value.to_dict()) for key, value in locations.items())
                save_json(cache_destination, local_cache)

            print("")

    save_results(locations, "bloomington-filtered.tsv")
Ejemplo n.º 7
0
def read_csv(source_csv, city_name, city_tag):
    city_options = City.objects.filter(tag=city_tag)
    print "Number of cities available: %s" % len(city_options)
    if not len(city_options):
        raise ValueError, "CITY NOT FOUND! run make_cities.py first"
        ## city = City()
        ## city.name = city_name
        ## city.tag = to_tag(city.name)
        ## city.save()
    else:
        city = city_options[0]

    print city

    feed_date = "2013-10-16"

    feeds = FeedInfo.objects.filter(city=city).filter(added=feed_date)
    if feeds.exists():
        feed = feeds[0]
        print "Already had feed: %s, %s" % (feed.city, feed.added)
    else:
        feed = FeedInfo()
        feed.city = city
        feed.added = feed_date
        feed.version = "0.1"
        feed.save()
        print "Created new feed: %s" % feed.city.name

    people = Person.objects.filter(name="Blank")
    if people.exists():
        person = people[0]
        print "Already had person: %s" % (person.name)
    else:
        person = Person()
        person.name = "Blank"
        person.save()
        print "Created new person: %s" % person.name

    sources = Source.objects.filter(feed=feed)
    if sources.exists():
        feed_source = sources[0]
        print "Already had source: %s, %s" % (feed_source.feed.city, feed_source.feed.added)
    else:
        feed_source = Source()
        feed_source.feed = feed
        feed_source.person = person
        feed_source.save()
        print "Created new source: %s" % feed_source.feed.city.name

    cache_file = "%s-20150525.json" % city.tag
    cache_destination = os.path.join(os.path.dirname(source_csv), cache_file)
    # keep a local copy of data we've processed...
    # this should help with subsequent calls
    # to make sure we don't need to duplicate calls to remote geolocation APIs:
    local_cache = load_json(cache_destination, create=True)
    if not local_cache.has_key("buildings"):
        local_cache["buildings"] = {}

    search_results = {}
    for key, value in local_cache["buildings"].items():
        # search_results[key] = Location(value)
        sr = SearchResults()
        sr.from_dict(value)
        # print
        # print sr
        # print
        search_results[key] = sr

    # geocoder helper:
    # geo = Geo()

    skips = 0
    with open(source_csv) as csvfile:

        reader = unicode_csv_reader(csvfile)

        # just print the first row:
        print ">, <".join(reader.next())

        count = 0

        # want to randomize the order... distribute options more evenly
        # print len(reader)
        # exit()
        # in order to randomize, should randomize the order in the csv
        for row in reader:
            count += 1
            print "Looking at row: %s" % count

            any_updated = False

            # could exit out early here, if needed
            if count > 10:
                # exit()
                pass

            # if you want to skip ahead more quickly:
            if count < 27187:
                pass
            else:

                # print row
                objectid = row[0]

                ## no_units = row[12]

                # can pass this in as bldg_id to make_building
                # that gets used for parcel too
                parcel_id = row[1]
                bldg_id = parcel_id

                street_num = row[2]
                street_dir = row[3]
                street_name = row[4]
                street_sfx = row[5]
                # eg building number
                qualifier_pre = row[6]
                # eg "UNIT" or "APT"
                qualifier_post = row[7]
                apt_num = row[8]
                # skip row9 (in/out... whatever that means)
                zip_code = row[10]
                # skip row11, assessor id
                # skip row12, address num
                # skip row13, x
                # skip row14, y
                # xcoord == lng
                lng = row[15]
                lat = row[16]

                # entry floor number: (named 'z' in sheet)
                floor = row[17]

                # skip row18, strcid... not sure
                # skip row19, parent
                # skip row20, app_
                # skip row21, hteloc
                zone = row[22]
                bldg_type = row[23]
                # number of buildings
                bldg_num = row[24]
                no_units = row[25]

                # skip row[26], inspection type
                # skip row27, app number
                # skip row28, date received
                # skip row29, application type
                # skip row30, ownerid
                # skip row31, operator id
                # skip row32, agent_id
                # skip row33, mail to
                central_heat = row[34]
                if central_heat == "Y":
                    central_heat = True
                else:
                    central_heat = False

                # heat mechanism? heat mechanic??? not sure
                heat_mech = row[35]
                # skip row36, agent id (2)
                # skip row37, agent last name
                # skip row38 agent first name
                # skip row39 agent middle initial
                # skip row40, agent title
                # skip row41, business name

                # could be owner, could be agent
                owner_name = row[42]
                owner_address1 = row[43]
                owner_address2 = row[44]
                owner_city = row[45]
                owner_state = row[46]
                owner_zip = row[47]

                # address = " ".join([street_num, street_dir, street_name, street_sfx, qualifier_pre, qualifier_post, apt_num])

                # this is causing problems with lookups in google
                if (
                    qualifier_pre == "DUP"
                    or qualifier_pre == "DUPE"
                    or qualifier_pre == "2-Jan"
                    or qualifier_pre == "HM"
                    or qualifier_pre == "DWN"
                ):
                    qualifier_pre = ""

                address_main = " ".join([street_num, street_dir, street_name, street_sfx, qualifier_pre])
                address_main = address_main.strip()
                # get rid of any double spaces
                address_main = address_main.replace("  ", " ")

                # similar to conversions,
                # but there are too many of these to list there
                if re.search("HOLLY RIDGE LN", address_main):
                    address_main = address_main.replace("HOLLY RIDGE LN", "HOLLYRIDGE LN")
                if re.search("BERKSHIRE CT", address_main):
                    address_main = address_main.replace("BERKSHIRE CT", "BERKSHIRE")
                    # address_main = ''
                if re.search("CAMERON CT", address_main):
                    address_main = address_main.replace("CAMERON CT", "CAMERON")
                    # address_main = ''
                if re.search("ATHENS CT", address_main):
                    address_main = address_main.replace("ATHENS CT", "ATHENS")
                    # address_main = ''
                if re.search("LAMAR CT", address_main):
                    address_main = address_main.replace("LAMAR CT", "LAMAR")
                    # address_main = ''
                if re.search("MONITEAU CT", address_main):
                    address_main = address_main.replace("MONITEAU CT", "MONITEAU")
                    # address_main = ''
                if re.search("IMPERIAL CT", address_main):
                    address_main = ""
                if re.search("PERKINS DR", address_main):
                    address_main = ""
                if re.search("GRANITE OAKS CT", address_main):
                    address_main = ""

                # sometimes the 'BLDG' data is added in the wrong place
                # then it gets treated as a unit item
                # (but it's not *always* a unit item, so can't generalize it that way)
                if qualifier_post == "BLDG" or qualifier_post == "LOT":
                    address_main = " ".join([address_main, qualifier_post, apt_main])
                    address_main = address_main.strip()
                    apt_main = ""
                else:
                    apt_main = " ".join([qualifier_post, apt_num])
                    apt_main = apt_main.strip()

                # check if this is one we want to skip
                if conversions.has_key(address_main.upper()):
                    address_main = conversions[address_main.upper()]

                if address_main:
                    print "APT_MAIN: ", apt_main
                    address = ", ".join([address_main, apt_main])
                else:
                    address = ""

                owner_address = ", ".join([owner_address1, owner_address2, owner_city, owner_state, owner_zip])

                ## if (not status in ['EXPIRED', 'CLOSED']) and (permit_type in ['RENTAL']):

                print "Parcel ID:", parcel_id
                print address

                results = None

                # make sure it's not one we're skipping:
                if not address:
                    print "SKIPPING ITEM: %s" % row[1]
                    skips += 1

                    skipf = codecs.open("skips.txt", "a", encoding="utf-8")
                    original = " ".join([street_num, street_dir, street_name, street_sfx, qualifier_pre])
                    skipf.write(original)
                    skipf.write("\n")
                    skipf.close()

                else:
                    # check if we've started processing any results for this row
                    if search_results.has_key(address.upper()):
                        print "Already had building: %s" % address
                        results = search_results[address.upper()]
                        # print results
                    else:

                        addy = ", ".join([address_main, city.name, city.state])
                        addy += " " + zip_code
                        # addy += ", USA"
                        print addy

                        # toggle betweeen an actual google query
                        results = address_search(addy, apt_main)

                        # print dir(results)

                        if len(results.matches) > 1:
                            print results
                            for option in results.matches:
                                print "%s: %s, %s" % (option["place"], option["lat"], option["lng"])
                            print
                            print "Source Lat: %s, Lng: %s" % (lat, lng)
                            src_lat = int(float(lat) * 100)
                            src_lng = int(float(lng) * 100)

                            matched = False
                            for current in results.matches:
                                # current = results.matches[0]
                                print current["lat"]
                                print current["lng"]
                                # only want to look at the first 2 decimal places:
                                comp_lat = int(float(current["lat"]) * 100)
                                comp_lng = int(float(current["lng"]) * 100)
                                print comp_lat
                                print comp_lng

                                if (src_lat == comp_lat) and (src_lng == comp_lng):
                                    # results.matches = results.matches[:1]
                                    results.matches = [current]
                                    matched = True

                            if not matched:
                                print "DIDN'T MATCH!"
                                exit()

                        any_updated = True

                        # or just using results as specified in csv
                        # (THIS DOES NOT NORMALIZE THE ADDRESS VIA GOOGLE)
                        # results = SearchResults()
                        # results.unit_text = apt_main
                        # handle_place(results, addy, lat, lng, apt_main)

                    assert results
                    # print results

                    lookup_building_with_geo(results, make=True, parcel_id=parcel_id)
                    # print results
                    # current['results'] = results

                    # print results

                    if results.errors:
                        print results
                        raise ValueError, results.errors
                    else:

                        search_results[address.upper()] = results

                        bldg = results.building
                        assert bldg
                        unit = results.unit

                        # may be a case where the unit is blank
                        # and another unit with an number/letter was created earlier
                        # in that case, we won't be creating one here
                        # and the building will already exist...
                        # not necessarily an error though
                        # just redundant data
                        # assert unit

                        (person, bldg_person) = make_person(owner_name, bldg, "Agent", address=owner_address)

                    # time.sleep(1)

            if any_updated:
                # back it up for later
                # enable this when downloading GPS coordinates...
                # the rest of the time it slows things down
                local_cache["buildings"] = {}
                for key, value in search_results.items():
                    # search_results[key] = SearchResults().from_dict(value)
                    local_cache["buildings"][key] = value.to_dict()
                save_json(cache_destination, local_cache)

            print