def clean(file_path=util.restaurants_file, collection_lane=util.standard_collection):
    """Run the full cleaning pipeline on the restaurant data.

    Imports/initializes the raw data, prints the predictability metric
    before and after, and runs every standardization pass in order.

    :param file_path: path of the data file to import (defaults to the
        project-wide restaurants file).
    :param collection_lane: name of the collection lane the cleaning
        stages operate on.
    """
    import_export.init_cleaning(file_path, collection_lane)
    # Before/after metric around the standardization passes.
    _report_predictabilities(collection_lane)
    print("Standardizing addresses")
    standardize_restaurant_data.standardize_addresses(collection_lane)
    print("Standardizing cities")
    standardize_restaurant_data.standardize_cities(collection_lane)
    print("Standardizing phone numbers")
    standardize_restaurant_data.standardize_phone_numbers(collection_lane)
    print("Standardizing restaurant types")
    standardize_restaurant_data.standardize_restaurant_types(collection_lane)
    _report_predictabilities(collection_lane)


def _report_predictabilities(collection_lane):
    """Print the average number of different entries in the other fields
    for a given entry — the quality metric used before and after cleaning.
    (Extracted: the original duplicated this code verbatim twice.)"""
    print("Average of different entries in other fields for a given entry")
    practicabilities = predictability_checker.check_indications(
        util.field_names,
        util.current_collection(collection_lane).find(
            {}, util.get_fields_projection()))
    pprint.pprint(practicabilities)
def standardize_restaurant_types(collection_lane):
    """Normalize the restaurant-type field of every entry and write the
    result to the next pipeline stage.

    Per entry: drops a leading segment that ends in a digit span
    (e.g. an address prefix), splits compound types on " and " or "/",
    and maps known aliases ("bbq" -> "barbecue") to canonical names.
    The type field becomes a list of type strings.

    :param collection_lane: name of the collection lane to process.
    """
    current_collection = util.current_collection(collection_lane)
    data = current_collection.find({})
    next_collection = util.go_to_next_stage(collection_lane)
    new_data = []
    containing_numbers = re.compile(r" \d.*\d ")
    split_points = re.compile(r"(?: and |/)")
    replace_dict = {"bbq": "barbecue"}
    for entry in data:
        type_content = entry[util.type_field]
        match = containing_numbers.search(type_content)
        if match:
            # Reuse the match object; the original searched a second
            # time just to read .end().
            type_content = type_content[match.end():]
        parts = split_points.split(type_content)
        entry[util.type_field] = [replace_dict.get(part, part)
                                  for part in parts]
        new_data.append(entry)
    # insert_many raises InvalidOperation on an empty document list.
    if new_data:
        next_collection.insert_many(new_data)
def standardize_cities(collection_lane):
    """Normalize the city field of every entry and write the result to
    the next pipeline stage.

    Known spelling variants are mapped to canonical names, then district
    names are folded into their parent city via util.districts.
    Entries with a missing city field are copied through unchanged.

    :param collection_lane: name of the collection lane to process.
    """
    current_collection = util.current_collection(collection_lane)
    data = current_collection.find({})
    next_collection = util.go_to_next_stage(collection_lane)
    new_data = []
    # Spelling/abbreviation variants -> canonical city name.
    replace_dict = {
        'la': 'los angeles',
        'west la': 'los angeles',
        'w. hollywood': 'west hollywood',
        'new york': 'new york city',
        'st. boyle hts.': 'boyle heights',
    }
    # district name -> parent city.
    district_dict = util.invert_dictionary_lists(util.districts)
    for entry in data:
        std_city = entry.get(util.city_field)
        if not std_city:
            print("entry with missing '{}' field".format(util.city_field))
            # Collection.save() was removed in PyMongo 4; insert the
            # unmodified entry into the next stage directly.
            next_collection.insert_one(entry)
            continue
        std_city = replace_dict.get(std_city, std_city)
        std_city = district_dict.get(std_city, std_city)
        entry[util.city_field] = std_city
        new_data.append(entry)
    # insert_many raises InvalidOperation on an empty document list.
    if new_data:
        next_collection.insert_many(new_data)
def import_restaurants_data(file_path, collection_lane):
    """Load restaurant records from a TSV file into the current collection.

    If the collection already holds data, every existing entry is deleted
    first so the import always starts from an empty collection.

    :param file_path: path of the TSV file to import.
    :param collection_lane: name of the collection lane to import into.
    """
    collection = util.current_collection(collection_lane)
    existing = collection.estimated_document_count()
    if existing > 0:
        print("Data already imported. Deleting {} entries.".format(existing))
        collection.delete_many({})
    rows = import_tsv(file_path)["data"]
    print("Importing data...")
    collection.insert_many(rows)
    print("done ({} entries)".format(collection.estimated_document_count()))
def aggregate_street_types():
    """Extract the street-type token from every address, group identical
    tokens in a temp collection, and print each group with its count.

    Per address: cut at the first direction word (" at ", " near ", ...),
    keep everything up to and including the first ".", then take the
    trailing word (optionally followed by a single letter) as the
    street type.
    """
    projection = {util.id_pm: 0, util.address_field: 1}
    addresses = [entry.get(util.address_field)
                 for entry in util.current_collection().find({}, projection)]
    directions = re.compile(r"( (at|near|between|off|in) )")
    street_type = re.compile(r'\b\S+\.?(?: [a-z]\.?)?$', re.IGNORECASE)
    docs = []
    for addr in addresses:
        # Search once per step; the original ran each search twice per
        # element inside map()/lambda conditionals.
        hit = directions.search(addr)
        if hit:
            addr = addr[:hit.start()]
        addr = addr.strip()
        dot = addr.find(".")
        if dot != -1:
            addr = addr[:dot + 1]
        addr = addr.strip()
        # NOTE(review): as in the original, an address with no
        # street-type match raises AttributeError here — confirm the
        # data guarantees a match.
        match = street_type.search(addr)
        docs.append({util.address_field: match.group(0)})
    temp_collection = util.get_temp_collection()
    # insert_many raises InvalidOperation on an empty document list.
    if docs:
        temp_collection.insert_many(docs)
    aggregates = group_and_count(temp_collection.name, util.address_field)
    for entry in aggregates:
        print(entry)
def standardize_phone_numbers(collection_lane):
    """Normalize the phone-number field of every entry and write the
    result to the next pipeline stage.

    Non-digit characters are stripped from both ends, then every inner
    run of non-digits is collapsed to a single "-", e.g.
    "(213) 555 0199" -> "213-555-0199".
    Entries with a missing phone field are copied through unchanged.

    :param collection_lane: name of the collection lane to process.
    """
    current_collection = util.current_collection(collection_lane)
    data = current_collection.find({})
    next_collection = util.go_to_next_stage(collection_lane)
    new_data = []
    non_number = re.compile(r"\D+")
    non_number_start_end = re.compile(r"(^\D+)|(\D+$)")
    for entry in data:
        phone: str = entry.get(util.phone_field)
        if not phone:
            # BUG FIX: the original read util.phone_field_field, which
            # raised AttributeError whenever this branch was taken.
            print("entry with missing '{}' field".format(util.phone_field))
            # Collection.save() was removed in PyMongo 4; insert the
            # unmodified entry into the next stage directly.
            next_collection.insert_one(entry)
            continue
        phone = non_number_start_end.sub("", phone)
        phone = non_number.sub("-", phone)
        entry[util.phone_field] = phone
        new_data.append(entry)
    # insert_many raises InvalidOperation on an empty document list.
    if new_data:
        next_collection.insert_many(new_data)
def get_similarity_values(collection_lane):
    """Compute pairwise soft-TF-IDF similarity scores for likely
    duplicate entries.

    For each measured field the data is sorted by that field, and each
    entry is compared with its next 4 neighbors in sort order (a
    sorted-neighborhood blocking scheme), so only near entries are
    scored.

    :param collection_lane: name of the collection lane to read from.
    :returns: nested dict ``{id_a: {id_b: {field: raw_score}}}``.
    """
    tokenized_data = get_tokenized_data(
        list(util.current_collection(collection_lane).find({})))
    num_entries = len(tokenized_data)
    entry_comparisons = 4  # neighbors compared per entry per sort order
    similarity_values = {}
    measured_fields = [util.phone_field, util.address_field, util.name_field]
    # Only the measured fields are ever scored, so build matchers for
    # those alone; the original built a corpus-backed matcher for every
    # field in util.field_names and discarded most of them.
    string_matchers = {
        field: string_matching.SoftTfIdf(
            get_corpus_list(tokenized_data, field), threshold=0.9)
        for field in measured_fields
    }
    for field in measured_fields:
        if field == util.phone_field:
            # Phone tokens are kept in order; for the other fields the
            # tokens are sorted so word order doesn't affect blocking.
            tokenized_data.sort(key=lambda x: "".join(x[field]))
        else:
            tokenized_data.sort(key=lambda x: "".join(sorted(x[field])))
        for i in range(num_entries):
            i_id = tokenized_data[i][util.id_field][0]
            pairs = similarity_values.setdefault(i_id, {})
            for j in range(i + 1, min(i + 1 + entry_comparisons, num_entries)):
                j_id = tokenized_data[j][util.id_field][0]
                scores = pairs.setdefault(j_id, {})
                for field_to_check in measured_fields:
                    # A pair may be neighbors under several sort orders;
                    # score each field only once.
                    if field_to_check not in scores:
                        scores[field_to_check] = string_matchers[
                            field_to_check].get_raw_score(
                                tokenized_data[i][field_to_check],
                                tokenized_data[j][field_to_check])
    return similarity_values
def standardize_addresses(collection_lane):
    """Standardize the address field of every entry, write the result to
    the next pipeline stage, and print audit statistics.

    Per address:
      * cut at the first direction word (" at ", " near ", ...)
      * expand street-suffix abbreviations via util.street_suffix_abbreviations
      * replace written ordinals ("first" -> "1st", ...)
      * truncate at the first double space, then strip
    Entries with a missing address are copied through unchanged.

    :param collection_lane: name of the collection lane to process.
    :returns: the street-type audit structure filled by audit_street_type
        (defaultdict of sets keyed by street type).
    """
    current_collection = util.current_collection(collection_lane)
    data = list(current_collection.find({}))
    next_collection = util.go_to_next_stage(collection_lane)
    new_data = []
    street_types = defaultdict(set)
    directions = re.compile(r"( (at|near|between|off|in) )")
    written_numbers = {
        "first": "1st", "second": "2nd", "third": "3rd", "fourth": "4th",
        "fifth": "5th", "sixth": "6th", "seventh": "7th", "eighth": "8th",
        "ninth": "9th", "tenth": "10th", "eleventh": "11th",
        "twelfth": "12th",
    }
    written_numbers_re = re.compile(
        r"(?P<num>{})".format("|".join(written_numbers.keys())))
    # Invert the abbreviation table once; the original computed it twice.
    abbr_lookup = util.invert_dictionary_lists(
        util.street_suffix_abbreviations)
    abbr_replacement = re.compile(
        r" (?P<abbr>{})\.?( |$)".format("|".join(abbr_lookup)))
    # BUG FIX: the original compiled a SINGLE space here, so the
    # truncation step below cut every multi-word address at its first
    # space; the variable name and the strip() that follows point at a
    # double space being intended.
    double_space = re.compile("  ")
    for entry in data:
        address = entry.get(util.address_field)
        original = address
        if not address:
            print("entry with missing '{}' field".format(util.address_field))
            # Collection.save() was removed in PyMongo 4; insert the
            # unmodified entry into the next stage directly.
            next_collection.insert_one(entry)
            continue
        result = directions.search(address)
        if result:
            address = address[:result.start()]
        # NOTE(review): both finditer loops below take their offsets from
        # the string as it was when finditer was called, while slicing the
        # mutated string — with more than one match per address the
        # offsets drift. Kept as-is; confirm against the data before
        # changing.
        for result in abbr_replacement.finditer(address):
            head = address[:result.start()].strip() + " "
            expansion = abbr_lookup[result.group('abbr')]
            tail = address[result.end():]
            tail = " " + tail if tail != "" else ""
            address = head + expansion + tail
        for result in written_numbers_re.finditer(address):
            address = (address[:result.start()]
                       + written_numbers[result.group('num')]
                       + address[result.end():])
        result = double_space.search(address)
        if result:
            address = address[:result.start()]
        address = address.strip()
        audit_street_type(street_types, address, original)
        entry[util.address_field] = address
        new_data.append(entry)
    # insert_many raises InvalidOperation on an empty document list.
    if new_data:
        next_collection.insert_many(new_data)
    not_expected_count = sum(len(v) for v in street_types.values())
    total_count = len(data)
    # Guard the percentage math against an empty collection
    # (the original raised ZeroDivisionError).
    ratio_not_expected = (
        not_expected_count / total_count * 100 if total_count else 0.0)
    ratio_expected = 100 - ratio_not_expected
    print("Not expected: {}/{}".format(not_expected_count, total_count))
    print("Ratio not expected: {:5.1f}%".format(ratio_not_expected))
    print("Ratio expected: {:5.1f}%".format(ratio_expected))
    return street_types