def get_match_quality(import_config, location, values,
                      values_location_group_id):
    """Score how strongly an existing location resembles an imported row.

    The score is the sum of three independent contributions:

    * 0.7 - the two points are closer than ``distance_threshold_km`` and
      ``is_name_very_similar`` accepts the two names.
    * 0.2 - ``values_location_group_id`` is not None, equals
      ``location['location_group_id']`` and the points are closer than
      ``distance_threshold_km``.
    * 0.1 - the points are closer than
      ``distance_threshold_for_very_similar_information`` and
      ``is_very_similar_information`` accepts the pair.

    Side effect: ``location['longitude']`` and ``location['latitude']`` are
    converted to floats in place.

    Returns the accumulated score; the integer 0 when nothing matched.
    """
    imported_longitude = float(
        get_location_field(import_config, 'longitude', values).strip())
    imported_latitude = float(
        get_location_field(import_config, 'latitude', values).strip())

    # Normalise the stored coordinates on the location record itself.
    location['longitude'] = float(location['longitude'])
    location['latitude'] = float(location['latitude'])

    imported_name = get_location_field(import_config, 'name', values)

    distance = get_direct_distance(
        location['latitude'], location['longitude'],
        imported_latitude, imported_longitude)
    # Shared by the name check and the location-group check below.
    is_nearby = distance < distance_threshold_km

    score = 0
    if is_nearby and is_name_very_similar(imported_name, location['name']):
        score += 0.7

    if (values_location_group_id is not None
            and values_location_group_id == location['location_group_id']
            and is_nearby):
        score += 0.2

    if (distance < distance_threshold_for_very_similar_information
            and is_very_similar_information(import_config, values,
                                            location)):
        score += 0.1

    return score
def is_very_similar_information(import_config, values, location):
    """Tell whether an imported row and a location share name and phone.

    Returns True only when ``is_name_at_least_vaguely_similar`` accepts the
    two names and the last seven digits of both phone numbers are equal,
    with the imported number supplying at least seven digits.
    """
    known_name = location['name']
    imported_name = get_location_field(import_config, 'name', values)
    if not is_name_at_least_vaguely_similar(known_name, imported_name):
        return False

    # Only the trailing seven digits are compared.
    imported_digits = strip_to_digits(
        get_location_field(import_config, 'phone_number', values))[-7:]
    known_digits = strip_to_digits(location['phone_number'])[-7:]

    return len(imported_digits) >= 7 and imported_digits == known_digits
def get_id_of_matching_location(import_config, locations, values,
                                location_duplicates):
    """
    Tries to find an existing location that the imported row duplicates.

    Recorded duplicates with the same (stripped, lower-cased) name are
    checked first: if the location such a record points at lies closer than
    distance_threshold_km_for_recorded_duplicate, its id is returned.
    Otherwise every location near the imported coordinates is scored with
    get_match_quality and the id of the best one scoring above 0.01 is
    returned.  Returns None when no candidate qualifies.
    """
    imported_longitude = float(
        get_location_field(import_config, 'longitude', values).strip())
    imported_latitude = float(
        get_location_field(import_config, 'latitude', values).strip())
    imported_name = get_location_field(
        import_config, 'name', values).strip().lower()

    # Pass 1: duplicates that were explicitly recorded under this name.
    recorded_duplicates = location_duplicates.get_location_duplicates_by_name(
        imported_name)
    for recorded in recorded_duplicates:
        nearby_originals = [
            candidate
            for candidate in locations.locations_near(
                imported_longitude, imported_latitude,
                distance_threshold_km_for_recorded_duplicate)
            if candidate['id'] == recorded['location_id']
        ]
        if not nearby_originals:
            continue

        original = nearby_originals[0]
        original['longitude'] = float(original['longitude'])
        original['latitude'] = float(original['latitude'])

        # if not close enough, skip.
        distance = get_direct_distance(
            original['latitude'], original['longitude'],
            imported_latitude, imported_longitude)
        if distance < distance_threshold_km_for_recorded_duplicate:
            # The id of the location that this row is a duplicate of.
            return recorded['location_id']

    # Pass 2: score every nearby location and keep the strongest match.
    group_id = get_location_group_id(import_config, values)
    if not group_id:
        group_id = location_groups.get_location_group_for(imported_name)

    best_location = None
    best_quality = 0.01  # candidates must score strictly above this
    for candidate in locations.locations_near(
            imported_longitude, imported_latitude,
            distance_threshold_for_very_similar_information):
        quality = get_match_quality(import_config, candidate, values,
                                    group_id)
        # Strict comparison keeps the earliest candidate on ties.
        if quality > best_quality:
            best_location = candidate
            best_quality = quality

    if best_location is None:
        return None
    return best_location['id']
def is_very_similar_information(import_config, values, location):
    """Check that names are roughly alike and phone numbers end the same.

    The names are compared with ``is_name_at_least_vaguely_similar``; the
    phone numbers are reduced with ``strip_to_digits`` and only their last
    seven digits are compared.  An imported number with fewer than seven
    digits never counts as a match.
    """
    names_are_related = is_name_at_least_vaguely_similar(
        location['name'],
        get_location_field(import_config, 'name', values))
    if not names_are_related:
        return False

    last_seven = slice(-7, None)
    imported_phone = strip_to_digits(
        get_location_field(import_config, 'phone_number', values))[last_seven]
    existing_phone = strip_to_digits(location['phone_number'])[last_seven]

    if len(imported_phone) < 7:
        return False
    return imported_phone == existing_phone
def merge_location_information(import_config, location, user_answers, values,
                               location_groups):
    """Fill gaps in an existing location from an imported row.

    For each of ``location_group_id``, ``address``, ``phone_number`` and
    ``external_web_url`` the imported value is copied onto ``location`` when
    the imported value is truthy and the existing one is falsy.  For
    ``external_web_url`` the existing value is taken from
    ``get_sanitized_external_web_url(location, location_groups)``.

    When ``import_config`` has an ``import_user_id`` and ``user_answers``
    holds no answer by that user for this location, the answers produced by
    ``get_user_answers_from`` are appended to ``user_answers``.

    Both ``location`` and ``user_answers`` are modified in place; nothing is
    returned.
    """
    for field in ('location_group_id', 'address', 'phone_number',
                  'external_web_url'):
        imported_value = get_location_field(import_config, field, values)
        current_value = location[field]
        if field == 'external_web_url':
            current_value = get_sanitized_external_web_url(
                location, location_groups)
        if imported_value and not current_value:
            location[field] = imported_value

    # Look into merging answers into the location.
    if 'import_user_id' not in import_config:
        return

    existing_answers = [
        answer for answer in user_answers
        if answer['answered_by_user_id'] == import_config['import_user_id']
        and answer['location_id'] == location['id']
    ]
    if existing_answers:
        # The import user already answered for this location.
        return

    for answer in get_user_answers_from(import_config, location['id'],
                                        values):
        user_answers.append(answer)
def get_id_of_matching_location(import_config, locations, values,
                                location_duplicates):
    """
    Tries to find a location matching the imported row's position and name.

    First consults the recorded duplicates filed under the row's stripped,
    lower-cased name and returns the referenced location id when that
    location is closer than distance_threshold_km_for_recorded_duplicate.
    Failing that, scores the nearby locations with get_match_quality and
    returns the id of the highest-scoring one above 0.01, or None.
    """
    row_longitude = float(
        get_location_field(import_config, 'longitude', values).strip())
    row_latitude = float(
        get_location_field(import_config, 'latitude', values).strip())
    row_name = get_location_field(import_config, 'name', values).strip().lower()

    same_name_duplicates = location_duplicates.get_location_duplicates_by_name(
        row_name)
    for duplicate in same_name_duplicates:
        referenced = [
            loc
            for loc in locations.locations_near(
                row_longitude, row_latitude,
                distance_threshold_km_for_recorded_duplicate)
            if loc['id'] == duplicate['location_id']
        ]
        if referenced:
            target = referenced[0]
            target['longitude'] = float(target['longitude'])
            target['latitude'] = float(target['latitude'])

            # if not close enough, skip.
            separation = get_direct_distance(
                target['latitude'], target['longitude'],
                row_latitude, row_longitude)
            if separation < distance_threshold_km_for_recorded_duplicate:
                # return the id of the location that this is a duplicate of
                return duplicate['location_id']

    row_group_id = (get_location_group_id(import_config, values)
                    or location_groups.get_location_group_for(row_name))

    scored_candidates = []
    for nearby in locations.locations_near(
            row_longitude, row_latitude,
            distance_threshold_for_very_similar_information):
        quality = get_match_quality(import_config, nearby, values,
                                    row_group_id)
        if quality > 0.01:
            scored_candidates.append((nearby, quality))

    if not scored_candidates:
        return None

    # max() yields the first of several equally good candidates.
    best_location, _ = max(scored_candidates, key=lambda pair: pair[1])
    return best_location['id']
def get_match_quality(import_config, location, values,
                      values_location_group_id):
    """Rate the similarity between ``location`` and the imported ``values``.

    Contributions, added in this order:

    * 0.7 when the distance is below ``distance_threshold_km`` and
      ``is_name_very_similar`` holds for the two names;
    * 0.2 when ``values_location_group_id`` is not None, equals the
      location's ``location_group_id`` and the distance is below
      ``distance_threshold_km``;
    * 0.1 when the distance is below
      ``distance_threshold_for_very_similar_information`` and
      ``is_very_similar_information`` holds.

    ``location['longitude']`` and ``location['latitude']`` are overwritten
    with their float values as a side effect.  Returns the integer 0 when no
    rule applies.
    """
    row_longitude = float(
        get_location_field(import_config, 'longitude', values).strip())
    row_latitude = float(
        get_location_field(import_config, 'latitude', values).strip())
    location['longitude'] = float(location['longitude'])
    location['latitude'] = float(location['latitude'])
    row_name = get_location_field(import_config, 'name', values)

    separation = get_direct_distance(
        location['latitude'], location['longitude'],
        row_latitude, row_longitude)

    same_name_nearby = (
        separation < distance_threshold_km
        and is_name_very_similar(row_name, location['name'])
    )
    same_group_nearby = (
        values_location_group_id is not None
        and values_location_group_id == location['location_group_id']
        and separation < distance_threshold_km
    )
    same_details_nearby = (
        separation < distance_threshold_for_very_similar_information
        and is_very_similar_information(import_config, values, location)
    )

    quality = 0
    if same_name_nearby:
        quality += 0.7
    if same_group_nearby:
        quality += 0.2
    if same_details_nearby:
        quality += 0.1
    return quality
def get_location_group_id(import_config, values):
    """Return the location group id for an imported row.

    A ``location_group_id`` entry in ``import_config`` wins and is returned
    unchanged.  Otherwise the value is read from the row with
    ``get_location_field``; strings are stripped, a falsy value becomes
    None and anything else is converted with ``int``.
    """
    if 'location_group_id' in import_config:
        return import_config['location_group_id']

    raw_value = get_location_field(import_config, 'location_group_id', values)
    # NOTE(review): `basestring` is a Python 2 builtin - confirm the target
    # interpreter, or that the module defines a compatible alias.
    if isinstance(raw_value, basestring):
        raw_value = raw_value.strip()

    return int(raw_value) if raw_value else None
def merge_location(import_config, locations, location_tags,
                   location_location_tags, user_answers, values,
                   location_duplicates):
    """Merge one imported row into the data set or add it as a new location.

    * Rows whose name fails ``is_location_of_interest`` are reported and
      skipped.
    * When ``duplicate_detection.get_id_of_matching_location`` finds an
      existing location, the row is merged into it with
      ``merge_location_information``.
    * Otherwise a new location is built from the configured columns and
      appended to ``locations``, its user answers are appended to
      ``user_answers`` and one entry per tag is appended to
      ``location_location_tags``.

    Nothing is returned; the list arguments are modified in place.
    """
    name = get_location_field(import_config, 'name', values)
    if not is_location_of_interest(name):
        print('location is not of interest: ' + name)
        return

    existing_id = duplicate_detection.get_id_of_matching_location(
        import_config, locations, values, location_duplicates)
    if existing_id is not None:
        print('matching location found for ' + name + ' id ' +
              str(existing_id))
        existing_location = find_by_id(locations, existing_id)
        # NOTE(review): four arguments are passed here; confirm this matches
        # the signature of the merge_location_information in scope (a
        # definition taking an additional `location_groups` parameter would
        # reject this call).
        merge_location_information(import_config, existing_location,
                                   user_answers, values)
        return

    created = {
        'id': guid_generator.get_guid(),
        'data_source_id': import_config['data_source_id']
    }
    for coordinate in ('latitude', 'longitude'):
        created[coordinate] = get_location_field(import_config, coordinate,
                                                 values)
    if 'location_group_id' in import_config:
        created['location_group_id'] = import_config['location_group_id']
    created = set_every_key(locations, created)

    # include any user answers that might be extractable from values.
    for answer in get_user_answers_from(import_config, created['id'],
                                        values):
        user_answers.append(answer)

    # Tags configured for the whole import come first.
    if 'location_tag_names' in import_config:
        tag_ids = [
            get_id_for_location_tag(location_tags, tag_name)
            for tag_name in import_config['location_tag_names']
        ]
    else:
        tag_ids = []

    # Each column either feeds a location field or switches a tag on.
    for index, column in enumerate(import_config['columns']):
        if 'location_field' in column:
            field = column['location_field']
            created[field] = sanitize(field, values[index])
        elif 'location_tag_name' in column:
            if matches_true(values[index]):
                tag_ids.append(get_id_for_location_tag(
                    location_tags, column['location_tag_name']))

    has_group = ('location_group_id' in created
                 and created['location_group_id'])
    if not has_group:
        # Fall back to a group derived from the location's name.
        created['location_group_id'] = (
            location_groups.get_location_group_for(created['name']))

    locations.append(created)
    for tag_id in tag_ids:
        location_location_tags.append({
            'id': guid_generator.get_guid(),
            'location_tag_id': tag_id,
            'location_id': created['id']
        })