def test_normalize_state_simple():
    # A spelled-out, upper-case state name comes back as the lower-case
    # two-letter abbreviation.
    normalized = address.normalize_state('CALIFORNIA')

    eq(normalized, 'ca')
def test_normalize_state_invalid():
    # Input that is not a known state is only lower-cased.
    normalized = address.normalize_state('FOO BAR')

    eq(normalized, 'foo bar')
def test_normalize_state_empty():
    # Input made up solely of punctuation normalizes to the empty string.
    normalized = address.normalize_state('.,')

    eq(normalized, '')
def test_normalize_state_other():
    # 'Quebec' is expected to come back lower-cased, not abbreviated.
    normalized = address.normalize_state('Quebec')

    eq(normalized, 'quebec')
def test_normalize_state_multiple():
    # An abbreviation followed by a full state name is lower-cased as a
    # whole; the two tokens are not collapsed into one.
    normalized = address.normalize_state('CA California')

    eq(normalized, 'ca california')
def _match_with_venue(event, _log=None):
    """Build a place match from the Facebook venue attached to ``event``.

    The venue's street and city are normalized and title-cased; the
    optional state and country are normalized and upper-cased. Events
    whose venue data is judged unreliable are skipped with a debug log
    message.

    Args:
        event: Mapping with an ``"_id"`` and a ``"facebook"`` mapping. The
            latter must hold a ``"venue"`` with ``"street"``, ``"city"``,
            ``"latitude"`` and ``"longitude"``, and either a
            ``"location"`` or an ``"owner"`` with a ``"name"``.
        _log: Logger used for the skip messages; defaults to the
            module-level ``log``.

    Returns:
        An ``OrderedDict`` with an ``"ubernear"`` entry (place id, source
        and ``[longitude, latitude]`` location) and a ``"place"`` entry
        (address, locality, name, coordinates, plus region/country when
        present), or ``None`` when the event is skipped.

    Raises:
        KeyError: If one of the required keys listed above is missing.
    """
    if _log is None:
        _log = log

    facebook = event["facebook"]
    name = facebook.get("location")
    if name is None:
        # Fall back to the name of whoever owns the event.
        name = facebook["owner"]["name"]

    venue = facebook["venue"]
    street = normalize_street(venue["street"])
    locality = normalize_city(venue["city"])
    region = normalize_state(venue.get("state", ""))
    country = normalize_country(venue.get("country", ""))

    if not street or not locality:
        _log.debug("Event {event_id} has invalid address or locality. "
                   "Skipping.".format(event_id=event["_id"]))
        return None

    street = street.title()
    locality = locality.title()

    latitude = venue["latitude"]
    longitude = venue["longitude"]

    # Coordinates of type int are too ambiguous to be considered good.
    # The exact type check is deliberate: anything that is not a plain
    # float is rejected.
    if type(latitude) is not float or type(longitude) is not float:
        _log.debug("Event {event_id} has invalid latitude or longitude. "
                   "Skipping.".format(event_id=event["_id"]))
        return None

    # Coordinates with little precision are too ambiguous to be considered
    # good: require at least five decimal places on both axes.
    lat_exponent = Decimal(repr(latitude)).as_tuple().exponent
    lng_exponent = Decimal(repr(longitude)).as_tuple().exponent
    # For NaN and infinity the exponent is a string ('n', 'N' or 'F');
    # comparing that with an int raises TypeError on Python 3, so treat
    # such values as having no usable precision.
    has_little_precision = any(
        isinstance(exponent, str) or exponent > -5
        for exponent in (lat_exponent, lng_exponent)
    )
    if has_little_precision:
        _log.debug(
            "Event {event_id} has latitude or longitude with "
            "little precision. Skipping.".format(event_id=event["_id"])
        )
        return None

    ubernear = OrderedDict([
        ("place_id", event["_id"]),
        ("source", "facebook"),
        ("location", [longitude, latitude]),
    ])
    place = OrderedDict([
        ("address", street),
        ("locality", locality),
        ("name", name),
        ("latitude", latitude),
        ("longitude", longitude),
    ])

    # Region and country are optional and only recorded when non-empty.
    if region:
        place["region"] = region.upper()
    if country:
        place["country"] = country.upper()

    return OrderedDict([
        ("ubernear", ubernear),
        ("place", place),
    ])
def update_venue(
    events_coll,
    usps_id,
    process_all,
):
    """Send events with usable venues to ``_save_venues`` in batches.

    Events without a venue, without a street/city/state, or with a state
    that does not normalize to a known abbreviation are recorded through
    ``_mark_as_failed`` (field ``normalization_failed``) and left out of
    the batches.

    Args:
        events_coll: Collection queried through ``find()``; also handed to
            ``_mark_as_failed`` and ``_save_venues``.
        usps_id: Passed through unchanged to ``_save_venues``.
        process_all: When true, every event in the collection is visited.
            Otherwise only events that have ``ubernear.lookup_completed``
            and lack both ``ubernear.normalization_completed`` and
            ``ubernear.normalization_failed`` are visited, ordered by
            ``ubernear.fetched`` ascending.

    Returns:
        True if the query yielded at least one event, False otherwise.
    """
    def _exists_clause(field, exists):
        # Single-field ``$exists`` filter.
        return OrderedDict([
            (field, OrderedDict([('$exists', exists)])),
        ])

    now = datetime.utcnow()

    if process_all:
        events = events_coll.find()
    else:
        pending_query = OrderedDict([
            ('$and', [
                _exists_clause('ubernear.normalization_completed', False),
                _exists_clause('ubernear.normalization_failed', False),
                _exists_clause('ubernear.lookup_completed', True),
            ]),
        ])
        events = events_coll.find(
            pending_query,
            sort=[('ubernear.fetched', pymongo.ASCENDING)],
        )

    # NOTE(review): Cursor.count() is deprecated in newer PyMongo releases
    # and removed in PyMongo 4 - confirm the pinned driver version.
    count = events.count()
    if count != 0:
        log.info(
            'Normalizing {count} event{s}'.format(
                count=count,
                s='' if count == 1 else 's',
            ),
        )

    event_batch = []
    found_work = False
    # TODO This cursor may timeout if there are too many results
    for event in events:
        found_work = True

        # Don't send venues in the batch that can't be used.
        # Check for missing values here instead of in the query
        # so it is explicitly known which events are not
        # eligible for normalization.
        if 'venue' not in event['facebook']:
            _mark_as_failed(
                events_coll=events_coll,
                event_id=event['_id'],
                now=now,
                field='normalization_failed',
                reason='No venue',
            )
            continue

        venue = event['facebook']['venue']
        # The minimal requirements for the USPS API
        if (
            'street' not in venue
            or 'city' not in venue
            or 'state' not in venue
        ):
            _mark_as_failed(
                events_coll=events_coll,
                event_id=event['_id'],
                now=now,
                field='normalization_failed',
                reason='No street, city or state',
            )
            continue

        # USPS doesn't take long names for states. The normalized value is
        # written back onto the in-memory event that gets batched.
        venue['state'] = addr_util.normalize_state(venue['state'])

        # Make sure it's a valid state abbreviation
        if venue['state'] not in addr_util.state_abbrev:
            _mark_as_failed(
                events_coll=events_coll,
                event_id=event['_id'],
                now=now,
                field='normalization_failed',
                reason='Invalid state',
            )
            continue

        event_batch.append(event)
        if len(event_batch) == usps_batch_size:
            _save_venues(
                events=event_batch,
                events_coll=events_coll,
                usps_id=usps_id,
                now=now,
            )
            event_batch = []

    # Flush the final, partially filled batch. Nothing is sent when it is
    # empty (no eligible events, or the last full batch was just flushed).
    if event_batch:
        _save_venues(
            events=event_batch,
            events_coll=events_coll,
            usps_id=usps_id,
            now=now,
        )

    return found_work