Ejemplo n.º 1
0
def get_match_quality(import_config, location, values,
                      values_location_group_id):
    """Score how well an existing location matches an imported row.

    The score is the sum of up to three independent contributions:

    * 0.7 when the two points are closer than ``distance_threshold_km`` and
      ``is_name_very_similar`` accepts the two names;
    * 0.2 when ``values_location_group_id`` is not None, equals the
      location's ``location_group_id`` and the points are closer than
      ``distance_threshold_km``;
    * 0.1 when the points are closer than
      ``distance_threshold_for_very_similar_information`` and
      ``is_very_similar_information`` accepts the pair.

    Side effect: ``location['longitude']`` and ``location['latitude']`` are
    overwritten with their ``float`` conversions.

    Returns the accumulated score, or the integer 0 when nothing applies.
    """
    row_longitude = float(
        get_location_field(import_config, 'longitude', values).strip())
    row_latitude = float(
        get_location_field(import_config, 'latitude', values).strip())

    # Normalise the stored coordinates in place before measuring.
    location['longitude'] = float(location['longitude'])
    location['latitude'] = float(location['latitude'])

    row_name = get_location_field(import_config, 'name', values)
    separation = get_direct_distance(location['latitude'],
                                     location['longitude'], row_latitude,
                                     row_longitude)

    # Both the name check and the group check share this radius.
    within_match_radius = separation < distance_threshold_km

    score = 0
    if within_match_radius and is_name_very_similar(row_name,
                                                    location['name']):
        score += 0.7

    if (values_location_group_id is not None
            and values_location_group_id == location['location_group_id']
            and within_match_radius):
        score += 0.2

    if (separation < distance_threshold_for_very_similar_information
            and is_very_similar_information(import_config, values, location)):
        score += 0.1

    return score
def is_very_similar_information(import_config, values, location):
    """Report whether an imported row and a location share name and phone.

    The names must pass ``is_name_at_least_vaguely_similar``. The phone
    numbers are compared on the last seven characters that
    ``strip_to_digits`` yields for each; a row whose stripped phone number
    is shorter than seven characters never matches.

    Returns True when both checks pass, False otherwise.
    """
    existing_name = location['name']
    row_name = get_location_field(import_config, 'name', values)
    if not is_name_at_least_vaguely_similar(existing_name, row_name):
        return False

    row_phone_tail = strip_to_digits(
        get_location_field(import_config, 'phone_number', values))[-7:]
    existing_phone_tail = strip_to_digits(location['phone_number'])[-7:]

    # Require a full seven-character tail so short fragments cannot match.
    return len(row_phone_tail) >= 7 and row_phone_tail == existing_phone_tail
Ejemplo n.º 3
0
def get_id_of_matching_location(import_config, locations, values,
                                location_duplicates):
    """Find the existing location that an imported row most likely duplicates.

    Two passes are made:

    1. Recorded duplicates: for every entry that
       ``location_duplicates.get_location_duplicates_by_name`` returns for
       the stripped, lower-cased row name, look for the referenced location
       among ``locations.locations_near`` the row's coordinates. If it is
       closer than ``distance_threshold_km_for_recorded_duplicate``, its id
       is returned immediately.
    2. Scored candidates: every location near the row (within
       ``distance_threshold_for_very_similar_information``) is scored with
       ``get_match_quality``; the id of the highest-scoring candidate above
       0.01 is returned. On ties the candidate seen first wins.

    Side effect: coordinates of inspected locations are converted to float
    in place.

    Returns the matching location id, or None when nothing matches.
    """
    row_longitude = float(
        get_location_field(import_config, 'longitude', values).strip())
    row_latitude = float(
        get_location_field(import_config, 'latitude', values).strip())
    row_name = get_location_field(import_config, 'name',
                                  values).strip().lower()

    # Pass 1: duplicates that were explicitly recorded under this name.
    recorded_duplicates = location_duplicates.get_location_duplicates_by_name(
        row_name)
    if len(recorded_duplicates) != 0:
        for recorded in recorded_duplicates:
            candidates = [
                nearby for nearby in locations.locations_near(
                    row_longitude, row_latitude,
                    distance_threshold_km_for_recorded_duplicate)
                if nearby['id'] == recorded['location_id']
            ]
            if not candidates:
                continue

            candidate = candidates[0]
            candidate['longitude'] = float(candidate['longitude'])
            candidate['latitude'] = float(candidate['latitude'])

            separation = get_direct_distance(candidate['latitude'],
                                             candidate['longitude'],
                                             row_latitude, row_longitude)
            if separation < distance_threshold_km_for_recorded_duplicate:
                # The id of the location this row is a duplicate of.
                return recorded['location_id']

    # Pass 2: score everything nearby and keep the best candidate.
    group_id = get_location_group_id(import_config, values)
    if not group_id:
        group_id = location_groups.get_location_group_for(row_name)

    scored = []
    for nearby in locations.locations_near(
            row_longitude, row_latitude,
            distance_threshold_for_very_similar_information):
        quality = get_match_quality(import_config, nearby, values, group_id)
        if quality > 0.01:
            scored.append((nearby, quality))

    if not scored:
        return None

    # max() keeps the first of equally good candidates, exactly like taking
    # index 0 after a stable descending sort.
    best_location, _ = max(scored, key=lambda pair: pair[1])
    return best_location['id']
Ejemplo n.º 4
0
def is_very_similar_information(import_config, values, location):
    """Check that a row and a location agree on both name and phone number.

    The name comparison is delegated to ``is_name_at_least_vaguely_similar``.
    Phone numbers are reduced with ``strip_to_digits`` and only their last
    seven characters are compared; the row's tail must be a full seven
    characters long for a match to count.

    Returns True on a match, False otherwise.
    """
    names_plausible = is_name_at_least_vaguely_similar(
        location['name'], get_location_field(import_config, 'name', values))

    if names_plausible:
        incoming_tail = strip_to_digits(
            get_location_field(import_config, 'phone_number', values))[-7:]
        stored_tail = strip_to_digits(location['phone_number'])[-7:]
        if len(incoming_tail) >= 7 and incoming_tail == stored_tail:
            return True

    return False
def merge_location_information(import_config, location, user_answers, values,
                               location_groups):
    """Fill gaps in an existing location using data from an imported row.

    For each of ``location_group_id``, ``address``, ``phone_number`` and
    ``external_web_url`` the row's value is copied onto ``location`` only
    when the row has a truthy value and the location's current value is
    falsy. For ``external_web_url`` the "current value" is what
    ``get_sanitized_external_web_url(location, location_groups)`` returns.

    When ``import_config`` has an ``import_user_id`` and ``user_answers``
    holds no answer by that user for this location, the answers produced by
    ``get_user_answers_from`` are appended to ``user_answers``.

    Mutates ``location`` and ``user_answers`` in place; returns None.
    """
    mergeable_fields = ('location_group_id', 'address', 'phone_number',
                        'external_web_url')
    for field_name in mergeable_fields:
        incoming = get_location_field(import_config, field_name, values)
        current = location[field_name]
        if field_name == 'external_web_url':
            current = get_sanitized_external_web_url(location, location_groups)
        if incoming and not current:
            location[field_name] = incoming

    # Answers are only merged when the import is attributed to a user.
    if 'import_user_id' not in import_config:
        return

    importer_id = import_config['import_user_id']
    existing_answers = [
        answer for answer in user_answers
        if answer['answered_by_user_id'] == importer_id
        and answer['location_id'] == location['id']
    ]
    if existing_answers:
        # This user already answered for this location; keep those answers.
        return

    for answer in get_user_answers_from(import_config, location['id'],
                                        values):
        user_answers.append(answer)
def get_id_of_matching_location(import_config, locations, values, location_duplicates):
    """Return the id of the location an imported row duplicates, if any.

    Recorded duplicates registered under the row's stripped, lower-cased
    name are checked first: when the location such a record points at lies
    closer to the row than ``distance_threshold_km_for_recorded_duplicate``,
    its id is returned. Otherwise every location near the row is scored with
    ``get_match_quality`` and the id of the best one scoring above 0.01 is
    returned (the earliest candidate wins a tie).

    Coordinates of the inspected locations are converted to float in place.

    Returns None when no candidate qualifies.
    """
    lon = float(get_location_field(import_config, 'longitude', values).strip())
    lat = float(get_location_field(import_config, 'latitude', values).strip())
    lowered_name = get_location_field(import_config, 'name', values).strip().lower()

    same_name_records = location_duplicates.get_location_duplicates_by_name(lowered_name)
    if len(same_name_records) != 0:
        for record in same_name_records:
            referenced = [
                near for near in locations.locations_near(
                    lon, lat, distance_threshold_km_for_recorded_duplicate)
                if near['id'] == record['location_id']
            ]
            if len(referenced) == 0:
                continue

            target = referenced[0]
            target['longitude'] = float(target['longitude'])
            target['latitude'] = float(target['latitude'])

            # Only accept the recorded duplicate when it is close enough.
            gap = get_direct_distance(
                target['latitude'], target['longitude'], lat, lon)
            if gap < distance_threshold_km_for_recorded_duplicate:
                return record['location_id']

    group_id = get_location_group_id(import_config, values)
    if not group_id:
        group_id = location_groups.get_location_group_for(lowered_name)

    scored = [
        (near, get_match_quality(import_config, near, values, group_id))
        for near in locations.locations_near(
            lon, lat, distance_threshold_for_very_similar_information)
    ]
    likely = [pair for pair in scored if pair[1] > 0.01]
    if likely:
        # Stable descending sort: the best match ends up at index 0.
        ranked = sorted(likely, key=lambda pair: pair[1], reverse=True)
        return ranked[0][0]['id']
    return None
def get_match_quality(import_config, location, values, values_location_group_id):
    """Compute a match score between an imported row and a location.

    Three weighted checks are evaluated in order and the weights of the ones
    that pass are summed:

    * 0.7: closer than ``distance_threshold_km`` and names accepted by
      ``is_name_very_similar``;
    * 0.2: ``values_location_group_id`` is not None, equals the location's
      ``location_group_id``, and closer than ``distance_threshold_km``;
    * 0.1: closer than ``distance_threshold_for_very_similar_information``
      and accepted by ``is_very_similar_information``.

    ``location['longitude']`` and ``location['latitude']`` are replaced by
    their float conversions as a side effect.

    Returns the summed score (integer 0 when no check passes).
    """
    row_lon = float(get_location_field(import_config, 'longitude', values).strip())
    row_lat = float(get_location_field(import_config, 'latitude', values).strip())
    location['longitude'] = float(location['longitude'])
    location['latitude'] = float(location['latitude'])
    row_name = get_location_field(import_config, 'name', values)
    separation = get_direct_distance(
        location['latitude'], location['longitude'], row_lat, row_lon)

    # Each check is wrapped in a lambda so it is evaluated lazily and keeps
    # its internal short-circuit order.
    weighted_checks = (
        (0.7, lambda: (separation < distance_threshold_km
                       and is_name_very_similar(row_name, location['name']))),
        (0.2, lambda: (values_location_group_id is not None
                       and values_location_group_id == location['location_group_id']
                       and separation < distance_threshold_km)),
        (0.1, lambda: (separation < distance_threshold_for_very_similar_information
                       and is_very_similar_information(import_config, values, location))),
    )

    quality = 0
    for weight, passes in weighted_checks:
        if passes():
            quality += weight
    return quality
def get_location_group_id(import_config, values):
    """Determine the location group id for an imported row.

    A ``location_group_id`` entry in ``import_config`` takes precedence and
    is returned unchanged. Otherwise the row's ``location_group_id`` field
    is read; a string value is stripped and converted with ``int``, with a
    blank string becoming None. Non-string values are returned as they are.
    """
    if 'location_group_id' in import_config:
        return import_config['location_group_id']

    raw = get_location_field(import_config, 'location_group_id', values)
    if not isinstance(raw, basestring):
        return raw

    trimmed = raw.strip()
    if not trimmed:
        return None
    return int(trimmed)
Ejemplo n.º 9
0
def get_location_group_id(import_config, values):
    """Return the location group id that applies to an imported row.

    When ``import_config`` does not define ``location_group_id`` the value
    is taken from the row: string values are stripped, a blank result maps
    to None and anything else is parsed with ``int``; non-string values pass
    through untouched. When ``import_config`` does define it, that value is
    returned as-is.
    """
    if 'location_group_id' not in import_config:
        group_id = get_location_field(import_config, 'location_group_id',
                                      values)
        if isinstance(group_id, basestring):
            group_id = group_id.strip()
            group_id = int(group_id) if group_id else None
        return group_id

    # A configured group id overrides whatever the row contains.
    return import_config['location_group_id']
Ejemplo n.º 10
0
def merge_location(import_config, locations, location_tags,
                   location_location_tags, user_answers, values,
                   location_duplicates):
    """Import one row: merge it into a matching location or add a new one.

    Steps:

    1. Rows whose name fails ``is_location_of_interest`` are reported and
       skipped.
    2. If ``duplicate_detection.get_id_of_matching_location`` finds an
       existing location, the row is merged into it via
       ``merge_location_information`` and nothing is added.
    3. Otherwise a new location is built from the configured columns,
       appended to ``locations``, its user answers are appended to
       ``user_answers`` and one link per collected tag id is appended to
       ``location_location_tags``.

    Mutates ``locations``, ``user_answers`` and ``location_location_tags``;
    returns None.
    """
    row_name = get_location_field(import_config, 'name', values)
    if not is_location_of_interest(row_name):
        print('location is not of interest: ' + row_name)
        return

    existing_id = duplicate_detection.get_id_of_matching_location(
        import_config, locations, values, location_duplicates)
    if existing_id is not None:
        print('matching location found for ' + row_name + ' id ' +
              str(existing_id))
        # NOTE(review): merge_location_information as defined elsewhere in
        # this file takes a fifth ``location_groups`` argument; confirm the
        # intended value and pass it here.
        merge_location_information(import_config,
                                   find_by_id(locations, existing_id),
                                   user_answers, values)
        return

    # No match: start a fresh record with identity and coordinates.
    created = {
        'id': guid_generator.get_guid(),
        'data_source_id': import_config['data_source_id']
    }
    for coordinate in ('latitude', 'longitude'):
        created[coordinate] = get_location_field(import_config, coordinate,
                                                 values)

    if 'location_group_id' in import_config:
        created['location_group_id'] = import_config['location_group_id']

    created = set_every_key(locations, created)

    # Include any user answers that can be extracted from the row.
    for answer in get_user_answers_from(import_config, created['id'], values):
        user_answers.append(answer)

    # Tags configured for the whole import come first.
    tag_ids = []
    if 'location_tag_names' in import_config:
        tag_ids = [
            get_id_for_location_tag(location_tags, tag_name)
            for tag_name in import_config['location_tag_names']
        ]

    # Columns either map onto a location field or toggle a tag per row.
    for index, column in enumerate(import_config['columns']):
        if 'location_field' in column:
            target_field = column['location_field']
            created[target_field] = sanitize(target_field, values[index])
        elif 'location_tag_name' in column:
            if matches_true(values[index]):
                tag_ids.append(
                    get_id_for_location_tag(location_tags,
                                            column['location_tag_name']))

    # Fall back to a group derived from the name when none was provided.
    if ('location_group_id' not in created
            or not created['location_group_id']):
        created['location_group_id'] = (
            location_groups.get_location_group_for(created['name']))

    locations.append(created)
    for tag_id in tag_ids:
        location_location_tags.append({
            'id': guid_generator.get_guid(),
            'location_tag_id': tag_id,
            'location_id': created['id']
        })