def get_canonical_items():
    """
    Fetch all `Facility` items and create a dictionary suitable for use by a
    Dedupe model.

    Returns:
    A dictionary. The key is the `Facility` OAR ID. The value is a dictionary
    of clean field values keyed by field name (country, name, address). A
    "clean" value is one which has been passed through the `clean` function.
    """
    facility_set = Facility.objects.all().extra(
        select={'country': 'country_code'}).values(
            'id', 'country', 'name', 'address')

    items = {str(i['id']): {k: clean(i[k]) for k in i if k != 'id'}
             for i in facility_set}

    confirmed_items = {
        match_to_extended_facility_id(m): {
            'country': clean(m.facility_list_item.country_code),
            'name': clean(m.facility_list_item.name),
            'address': clean(m.facility_list_item.address),
        }
        for m in FacilityMatch.objects.filter(
            status=FacilityMatch.CONFIRMED)
    }

    items.update(confirmed_items)

    return items
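
# Illustrative sketch (not part of the original module): the dictionary built
# by get_canonical_items has the record-ID-to-field-dict shape that the
# dedupe library expects, so it can be fed directly to a trained
# `dedupe.Gazetteer`. The `gazetteer` argument is an assumption: an
# already-trained Gazetteer instance supplied by the caller.
def _example_index_canonical(gazetteer):
    canonical = get_canonical_items()
    # Gazetteer.index accepts a dict mapping record IDs to field dicts.
    gazetteer.index(canonical)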
def populate_cleaned_fields(apps, schema_editor):
    """
    Data migration helper that backfills `clean_name` and `clean_address`
    on existing `FacilityListItem` rows, printing progress every 1,000 rows.
    """
    count = 0
    FacilityListItem = apps.get_model('api', 'FacilityListItem')
    for list_item in FacilityListItem.objects.exclude(
            name='', address='').iterator():
        list_item.clean_name = clean(list_item.name) or ''
        list_item.clean_address = clean(list_item.address) or ''
        list_item.save()
        count += 1
        if count % 1000 == 0:
            print('Filled ' + str(count))
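
# Illustrative sketch (assumption: this helper is invoked from a Django data
# migration, where its (apps, schema_editor) signature fits RunPython). The
# migration itself lives in a separate migrations file, roughly like the
# commented sketch below; the dependency name is hypothetical.
#
#     class Migration(migrations.Migration):
#         dependencies = [('api', '0042_previous_migration')]  # hypothetical
#         operations = [
#             migrations.RunPython(populate_cleaned_fields,
#                                  migrations.RunPython.noop),
#         ]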
def get_messy_items_for_training(mod_factor=5):
    """
    Fetch a subset of `FacilityListItem` objects that have been parsed and
    are not in an error state.

    Arguments:
    mod_factor -- Used to partition a subset of `FacilityListItem` records.
                  The larger the value, the fewer records will be contained
                  in the subset.

    Returns:
    A dictionary. The key is the `FacilityListItem` ID. The value is a
    dictionary of clean field values keyed by field name (country, name,
    address). A "clean" value is one which has been passed through the
    `clean` function.
    """
    facility_list_item_set = FacilityListItem.objects.exclude(
        Q(status=FacilityListItem.UPLOADED)
        | Q(status=FacilityListItem.ERROR)
        | Q(status=FacilityListItem.ERROR_PARSING)
        | Q(status=FacilityListItem.ERROR_GEOCODING)
        | Q(status=FacilityListItem.ERROR_MATCHING)).extra(
            select={'country': 'country_code'}).values(
                'id', 'country', 'name', 'address')
    records = [record for (i, record) in enumerate(facility_list_item_set)
               if i % mod_factor == 0]
    return {str(i['id']): {k: clean(i[k]) for k in i if k != 'id'}
            for i in records}
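
# Illustrative sketch: mod_factor keeps every mod_factor-th record, so the
# training subset is roughly len(all_records) / mod_factor in size. For
# example, mod_factor=10 keeps the records at positions 0, 10, 20, ...
def _example_training_sample_size():
    messy = get_messy_items_for_training(mod_factor=10)  # ~10% of records
    return len(messy)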
def get_messy_items_from_facility_list(facility_list):
    """
    Fetch all `FacilityListItem` objects that belong to the specified
    `FacilityList` and create a dictionary suitable for use by a Dedupe
    model.

    Arguments:
    facility_list -- A `FacilityList`.

    Returns:
    A dictionary. The key is the `FacilityListItem` ID. The value is a
    dictionary of clean field values keyed by field name (country, name,
    address). A "clean" value is one which has been passed through the
    `clean` function.
    """
    facility_list_item_set = facility_list.source.facilitylistitem_set.filter(
        Q(status=FacilityListItem.GEOCODED)
        | Q(status=FacilityListItem.GEOCODED_NO_RESULTS)).extra(
            select={'country': 'country_code'}).values(
                'id', 'country', 'name', 'address')
    return {str(i['id']): {k: clean(i[k]) for k in i if k != 'id'}
            for i in facility_list_item_set}
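
# Illustrative sketch (hypothetical helper, not in the original module): the
# messy-items dict produced above has the same shape that exact_match_items
# below expects, so the two can be chained to exact-match every geocoded
# item in a list.
def _example_exact_match_list(facility_list, contributor):
    messy = get_messy_items_from_facility_list(facility_list)
    return exact_match_items(messy, contributor)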
def exact_match_items(messy, contributor):
    """
    Find existing matched `FacilityListItem` records whose cleaned name,
    cleaned address, and country code exactly equal those of each messy item.

    Arguments:
    messy -- A dictionary of items keyed by ID, as returned by
             `get_messy_items_from_facility_list`.
    contributor -- The contributor used when sorting multiple exact matches.

    Returns:
    A dictionary with the processed item IDs, the matches found for each
    item, and start/finish timestamps.
    """
    started = str(datetime.utcnow())

    matched_items = FacilityListItem.objects \
        .filter(status__in=[FacilityListItem.MATCHED,
                            FacilityListItem.CONFIRMED_MATCH]) \
        .exclude(facility_id=None)

    active_item_ids = FacilityMatch.objects \
        .filter(status__in=[FacilityMatch.AUTOMATIC,
                            FacilityMatch.CONFIRMED,
                            FacilityMatch.MERGED],
                is_active=True,
                facility_list_item__source__is_active=True) \
        .values_list('facility_list_item', flat=True)

    results = dict()

    for messy_id, item in messy.items():
        clean_name = clean(item.get('name', ''))
        clean_address = clean(item.get('address', ''))
        country_code = item.get('country', '').upper()

        empty_text_fields = Q(
            Q(clean_name__isnull=True)
            | Q(clean_name__exact='')
            | Q(clean_address__isnull=True)
            | Q(clean_address__exact=''))

        exact_matches = matched_items.filter(
            clean_name=clean_name,
            clean_address=clean_address,
            country_code=country_code) \
            .exclude(empty_text_fields) \
            .values('id', 'facility_id', 'source__contributor_id',
                    'updated_at')

        if len(exact_matches) > 0:
            if len(exact_matches) > 1:
                exact_matches = sort_exact_matches(
                    exact_matches, active_item_ids, contributor)

            results[messy_id] = exact_matches

    finished = str(datetime.utcnow())

    return {
        'processed_list_item_ids': list(results.keys()),
        'item_matches': results,
        'started': started,
        'finished': finished,
    }
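
# Illustrative sketch of the structure returned above. The IDs, timestamps,
# and field values are invented; each item's matches are the rows selected
# by .values() (or a sorted sequence when there was more than one match):
#
#     {
#         'processed_list_item_ids': ['8675'],
#         'item_matches': {
#             '8675': [{'id': 42,
#                       'facility_id': 'US2021123ABC456',
#                       'source__contributor_id': 7,
#                       'updated_at': '...'}],
#         },
#         'started': '2021-01-01 00:00:00.000000',
#         'finished': '2021-01-01 00:00:01.000000',
#     }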
def is_string_match(item, facility):
    """
    Check if a list item is an exact string match to a facility, after
    processing both through the same string cleaning operations used by the
    matcher.

    Arguments:
    item -- A `FacilityListItem` instance being considered as a potential
            match to the specified facility.
    facility -- A `Facility` instance.

    Returns:
    True if the item is a string match to the facility.
    """
    return (item.country_code == facility.country_code
            and clean(item.name) == clean(facility.name)
            and clean(item.address) == clean(facility.address))
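
# Illustrative usage sketch (hypothetical helper): a cheap exact-string
# pre-check that can run before the probabilistic Dedupe scoring. The
# early-out confidence value is an assumption, not behavior taken from this
# module.
def _example_string_match_first(item, facility):
    if is_string_match(item, facility):
        return 1.0  # assumed: treat an exact string match as top confidence
    return None  # assumed: caller falls back to the Dedupe matcher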
def facility_values_to_dedupe_record(facility_dict):
    """
    Convert a dictionary with id, country, name, and address keys into a
    dictionary suitable for training and indexing a Dedupe model.

    Arguments:
    facility_dict -- A dict with id, country, name, and address keys created
                     from a `Facility` values query.

    Returns:
    A dictionary with the id as the key and a dictionary of fields as the
    value.
    """
    return {
        str(facility_dict['id']): {
            "country": clean(facility_dict['country']),
            "name": clean(facility_dict['name']),
            "address": clean(facility_dict['address']),
        }
    }
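
# Illustrative sketch: a values() row goes in, a single-entry Dedupe record
# dict comes out. The sample values are invented, and the expected output in
# the comment assumes `clean` lower-cases and strips punctuation.
def _example_dedupe_record():
    row = {'id': 'US2020001ABC123', 'country': 'US',
           'name': 'Example Facility', 'address': '123 Main St'}
    return facility_values_to_dedupe_record(row)
    # -> {'US2020001ABC123': {'country': 'us', 'name': 'example facility',
    #                         'address': '123 main st'}}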
def match_item(country,
               name,
               address,
               id='id',
               automatic_threshold=MatchDefaults.AUTOMATIC_THRESHOLD,
               gazetteer_threshold=MatchDefaults.GAZETTEER_THRESHOLD,
               recall_weight=MatchDefaults.RECALL_WEIGHT):
    """
    Match the details of a single facility to the list of existing
    facilities.

    Arguments:
    country -- A valid country name or 2-character ISO code.
    name -- The name of the facility.
    address -- The address of the facility.
    id -- The key value in the returned match results.
    automatic_threshold -- A number from 0.0 to 1.0. A match with a
                           confidence score greater than this value will be
                           assigned automatically.
    gazetteer_threshold -- A number from 0.0 to 1.0. A match with a
                           confidence score between this value and the
                           `automatic_threshold` will be considered a match
                           that requires confirmation.
    recall_weight -- Sets the tradeoff between precision and recall. A value
                     of 1.0 gives equal weight to precision and recall.
                     https://en.wikipedia.org/wiki/Precision_and_recall
                     https://docs.dedupe.io/en/latest/Choosing-a-good-threshold.html

    Returns:
    See `match_items`.
    """
    return match_items(
        {
            str(id): {
                "country": clean(country),
                "name": clean(name),
                "address": clean(address),
            }
        },
        automatic_threshold=automatic_threshold,
        gazetteer_threshold=gazetteer_threshold,
        recall_weight=recall_weight)
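
# Illustrative usage sketch: match a single ad-hoc facility record with a
# tightened automatic threshold. The country, name, and address values are
# invented.
def _example_match_single():
    return match_item('US',
                      'Example Facility',
                      '123 Main St, Springfield',
                      automatic_threshold=0.9)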
def process_facility_and_processing_type_claim_values(claim, apps):
    """
    Migration helper that normalizes a claim's `facility_type` and
    `facility_production_types` values and records the raw and matched
    values as fields via `create_field`.
    """
    facility_type = getattr(claim, 'facility_type')

    facility_value = {
        'raw_values': [],
        'matched_values': [],
    }

    claim.facility_type = None
    facility_value['raw_values'].append(facility_type)

    if value_is_valid(facility_type):
        result = get_facility_and_processing_type(facility_type)
        if result[0] is not None:
            claim.facility_type = clean(result[2])
            facility_value['matched_values'].append(result)
            create_field('facility_type', facility_value, claim, apps)

    processing_types = getattr(claim, 'facility_production_types')
    if processing_types is not None and isinstance(processing_types, str):
        processing_types = (processing_types.split('|')
                            if '|' in processing_types
                            else [processing_types])

    if processing_types is not None:
        processing_value = {
            'raw_values': list(processing_types),
            'matched_values': [],
        }
        claim_values = []
        for value in list(processing_types):
            if value_is_valid(value):
                result = get_facility_and_processing_type(value)
                if result[0] is not None:
                    processing_value['matched_values'].append(result)
                    claim_values.append(clean(result[3]))
        if len(processing_value['matched_values']) > 0:
            create_field('processing_type', processing_value, claim, apps)
        claim.facility_production_types = claim_values

    claim.save()
def parse_facility_list_item(item):
    started = str(datetime.utcnow())
    if type(item) != FacilityListItem:
        raise ValueError('Argument must be a FacilityListItem')
    if item.status != FacilityListItem.UPLOADED:
        raise ValueError('Items to be parsed must be in the UPLOADED status')
    try:
        is_geocoded = False
        fields = [f.lower() for f in
                  parse_csv_line(item.source.facility_list.header)]
        values = parse_csv_line(item.raw_data)

        # facility_type_processing_type is a special "meta" field that
        # attempts to simplify the submission process for contributors.
        if 'facility_type_processing_type' in fields:
            if 'facility_type' not in fields:
                fields.append('facility_type')
                values.append(
                    values[fields.index('facility_type_processing_type')])
            if 'processing_type' not in fields:
                fields.append('processing_type')
                values.append(
                    values[fields.index('facility_type_processing_type')])
        if CsvHeaderField.COUNTRY in fields:
            item.country_code = get_country_code(
                values[fields.index(CsvHeaderField.COUNTRY)])
        if CsvHeaderField.NAME in fields:
            item.name = values[fields.index(CsvHeaderField.NAME)]
            item.clean_name = clean(item.name)
            if item.clean_name is None:
                item.clean_name = ''
        if CsvHeaderField.ADDRESS in fields:
            item.address = values[fields.index(CsvHeaderField.ADDRESS)]
            item.clean_address = clean(item.address)
            if item.clean_address is None:
                item.clean_address = ''
        if CsvHeaderField.LAT in fields and CsvHeaderField.LNG in fields:
            lat = float(values[fields.index(CsvHeaderField.LAT)])
            lng = float(values[fields.index(CsvHeaderField.LNG)])
            item.geocoded_point = Point(lng, lat)
            is_geocoded = True
        if CsvHeaderField.PPE_PRODUCT_TYPES in fields:
            product_types = values[fields.index(
                CsvHeaderField.PPE_PRODUCT_TYPES)]
            # The nested list comprehension ensures that we filter out
            # whitespace-only values
            item.ppe_product_types = \
                [s for s in [s.strip() for s in product_types.split('|')]
                 if s]
        if CsvHeaderField.PPE_CONTACT_PHONE in fields:
            item.ppe_contact_phone = values[fields.index(
                CsvHeaderField.PPE_CONTACT_PHONE)]
        if CsvHeaderField.PPE_CONTACT_EMAIL in fields:
            item.ppe_contact_email = values[fields.index(
                CsvHeaderField.PPE_CONTACT_EMAIL)]
        if CsvHeaderField.PPE_WEBSITE in fields:
            item.ppe_website = values[fields.index(
                CsvHeaderField.PPE_WEBSITE)]

        create_extendedfields_for_listitem(item, fields, values)

        try:
            item.full_clean(exclude=('processing_started_at',
                                     'processing_completed_at',
                                     'processing_results', 'geocoded_point',
                                     'facility'))
            item.status = FacilityListItem.PARSED
            item.processing_results.append({
                'action': ProcessingAction.PARSE,
                'started_at': started,
                'error': False,
                'finished_at': str(datetime.utcnow()),
                'is_geocoded': is_geocoded,
            })
        except ValidationError as ve:
            messages = []
            for name, errors in ve.error_dict.items():
                # We need to clear the invalid value so we can save the row
                setattr(item, name, '')
                error_str = ''.join(''.join(e.messages) for e in errors)
                messages.append(
                    'There is a problem with the {0}: {1}'.format(
                        name, error_str))

            # If there is a validation error on the `ppe_product_types`
            # array field, `full_clean` appears to set it to an empty string
            # which then causes `save` to raise an exception.
            ppe_product_types_is_valid = (
                item.ppe_product_types is None
                or isinstance(item.ppe_product_types, list))
            if not ppe_product_types_is_valid:
                item.ppe_product_types = []

            item.status = FacilityListItem.ERROR_PARSING
            item.processing_results.append({
                'action': ProcessingAction.PARSE,
                'started_at': started,
                'error': True,
                'message': '\n'.join(messages),
                'trace': traceback.format_exc(),
                'finished_at': str(datetime.utcnow()),
            })
    except Exception as e:
        item.status = FacilityListItem.ERROR_PARSING
        item.processing_results.append({
            'action': ProcessingAction.PARSE,
            'started_at': started,
            'error': True,
            'message': str(e),
            'trace': traceback.format_exc(),
            'finished_at': str(datetime.utcnow()),
        })
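
# Illustrative sketch (hypothetical helper): parse an UPLOADED item and
# report whether parsing succeeded, based on the status transitions set in
# parse_facility_list_item above. On success the item gains a
# processing_results entry with 'action': ProcessingAction.PARSE and
# 'error': False.
def _example_parse(item):
    parse_facility_list_item(item)
    return item.status == FacilityListItem.PARSED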