def _delete_organization_buildings(org_pk, chunk_size=100, *args, **kwargs):
    """Deletes all BuildingSnapshot instances within an organization.

    :param org_pk: int, str, the organization pk
    """
    qs = BuildingSnapshot.objects.filter(super_organization=org_pk)
    ids = qs.values_list('id', flat=True)
    deleting_cache_key = get_prog_key(
        'delete_organization_buildings', org_pk
    )
    if not ids:
        cache.set(deleting_cache_key, 100)
        return

    # Delete the canonical buildings.
    can_ids = CanonicalBuilding.objects.filter(
        canonical_snapshot__super_organization=org_pk
    ).values_list('id', flat=True)
    _delete_canonical_buildings.delay(can_ids)

    step = float(chunk_size) / len(ids)
    cache.set(deleting_cache_key, 0)
    tasks = []
    for del_ids in batch(ids, chunk_size):
        # We could also use .s instead of .subtask and not wrap the *args.
        tasks.append(
            _delete_organization_buildings_chunk.subtask(
                (del_ids, deleting_cache_key, step, org_pk)
            )
        )
    chord(tasks, interval=15)(finish_delete.subtask([org_pk]))
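These delete and import tasks all chunk their work with a `batch` helper that isn't included in this collection. A minimal sketch of what the callers assume (fixed-size chunks from any iterable, last chunk possibly short); the project's actual helper may differ:

def batch(iterable, size):
    """Yield successive lists of at most `size` items from `iterable`.

    Sketch only: the contract the callers above rely on is just
    "fixed-size chunks, last one possibly short".
    """
    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk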
def destroy(self, request, pk=None):
    """
    Starts a background task to delete an organization and all related data.
    ---
    parameter_strategy: replace
    parameters:
        - name: pk
          type: integer
          description: Organization ID (primary key)
          required: true
          paramType: path
    type:
        status:
            description: success or error
            type: string
            required: true
        progress_key:
            description: ID of background job, for retrieving job progress
            type: string
            required: true
    """
    org_id = pk
    deleting_cache_key = get_prog_key(
        'delete_organization_buildings', org_id
    )
    tasks.delete_organization.delay(org_id, deleting_cache_key)
    return JsonResponse({
        'status': 'success',
        'progress': 0,
        'progress_key': deleting_cache_key
    })
def _save_raw_data(file_pk, *args, **kwargs):
    """Chunk up the CSV and save data into the DB raw."""
    import_file = ImportFile.objects.get(pk=file_pk)
    if import_file.raw_save_done:
        return {'status': 'warning', 'message': 'raw data already saved'}

    if import_file.source_type == "Green Button Raw":
        return _save_raw_green_button_data(file_pk, *args, **kwargs)

    parser = reader.MCMParser(import_file.local_file)
    cache_first_rows(import_file, parser)
    rows = parser.next()
    import_file.num_rows = 0

    prog_key = get_prog_key('save_raw_data', file_pk)

    tasks = []
    for chunk in batch(rows, 100):
        import_file.num_rows += len(chunk)
        tasks.append(_save_raw_data_chunk.subtask((chunk, file_pk, prog_key)))

    tasks = add_cache_increment_parameter(tasks)
    import_file.num_columns = parser.num_columns()
    import_file.save()

    if tasks:
        chord(tasks, interval=15)(finish_raw_save.subtask([file_pk]))
    else:
        # No chunks were queued; Celery tasks have no ``.task`` attribute,
        # so run the callback synchronously instead.
        finish_raw_save(None, file_pk)

    return {'status': 'success'}
def finish_mapping(results, file_pk):
    import_file = ImportFile.objects.get(pk=file_pk)
    import_file.mapping_done = True
    import_file.save()
    finish_import_record(import_file.import_record.pk)
    prog_key = get_prog_key('map_data', file_pk)
    cache.set(prog_key, 100)
def get_progress(request):
    """Return the progress of the cleansing."""
    import_file_id = request.GET.get('import_file_id')
    return get_cache(get_prog_key('get_progress', import_file_id))['progress']
def _map_data(file_pk, *args, **kwargs):
    """Get all of the raw data and process it using appropriate mapping.
    @lock_and_track returns a progress_key

    :param file_pk: int, the id of the import_file we're working with.
    """
    import_file = ImportFile.objects.get(pk=file_pk)
    # Don't perform this task if it's already been completed.
    if import_file.mapping_done:
        prog_key = get_prog_key('map_data', file_pk)
        cache.set(prog_key, 100)
        return {'status': 'warning', 'message': 'mapping already complete'}

    # If we haven't finished saving, we shouldn't proceed with mapping.
    # Re-queue this task.
    if not import_file.raw_save_done:
        map_data.apply_async(args=[file_pk], countdown=60, expires=120)
        return {'status': 'error', 'message': 'waiting for raw data save.'}

    source_type_dict = {
        'Portfolio Raw': PORTFOLIO_RAW,
        'Assessed Raw': ASSESSED_RAW,
        'Green Button Raw': GREEN_BUTTON_RAW,
    }
    source_type = source_type_dict.get(import_file.source_type, ASSESSED_RAW)

    qs = BuildingSnapshot.objects.filter(
        import_file=import_file,
        source_type=source_type,
    ).iterator()

    prog_key = get_prog_key('map_data', file_pk)
    tasks = []
    for chunk in batch(qs, 100):
        serialized_data = [obj.extra_data for obj in chunk]
        tasks.append(map_row_chunk.subtask(
            (serialized_data, file_pk, source_type, prog_key)
        ))

    tasks = add_cache_increment_parameter(tasks)
    if tasks:
        chord(tasks, interval=15)(finish_mapping.subtask([file_pk]))
    else:
        # No chunks were queued; Celery tasks have no ``.task`` attribute,
        # so run the callback synchronously instead.
        finish_mapping(None, file_pk)

    return {'status': 'success'}
def _save_raw_data(file_pk, *args, **kwargs):
    """Chunk up the CSV and save data into the DB raw."""
    result = {'status': 'success', 'progress': 100}
    prog_key = get_prog_key('save_raw_data', file_pk)
    try:
        import_file = ImportFile.objects.get(pk=file_pk)
        if import_file.raw_save_done:
            result['status'] = 'warning'
            result['message'] = 'Raw data already saved'
            cache.set(prog_key, result)
            return result

        if import_file.source_type == "Green Button Raw":
            return _save_raw_green_button_data(file_pk, *args, **kwargs)

        parser = reader.MCMParser(import_file.local_file)
        cache_first_rows(import_file, parser)
        rows = parser.next()
        import_file.num_rows = 0

        tasks = []
        for chunk in batch(rows, 100):
            import_file.num_rows += len(chunk)
            tasks.append(
                _save_raw_data_chunk.subtask((chunk, file_pk, prog_key))
            )

        tasks = add_cache_increment_parameter(tasks)
        import_file.num_columns = parser.num_columns()
        import_file.save()

        if tasks:
            chord(tasks, interval=15)(finish_raw_save.subtask([file_pk]))
        else:
            # No chunks were queued; Celery tasks have no ``.task``
            # attribute, so run the callback synchronously instead.
            finish_raw_save(None, file_pk)
    except StopIteration:
        result['status'] = 'error'
        result['message'] = 'StopIteration Exception'
        result['stacktrace'] = traceback.format_exc()
    except Error as e:
        result['status'] = 'error'
        result['message'] = 'File Content Error: ' + e.message
        result['stacktrace'] = traceback.format_exc()
    except KeyError as e:
        result['status'] = 'error'
        result['message'] = 'Invalid Column Name: "' + e.message + '"'
        result['stacktrace'] = traceback.format_exc()
    except Exception as e:
        result['status'] = 'error'
        result['message'] = 'Unhandled Error: ' + e.message
        result['stacktrace'] = traceback.format_exc()

    cache.set(prog_key, result)
    return result
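The save and map pipelines pass their subtasks through `add_cache_increment_parameter` before building the chord. Its body isn't included here; a hedged sketch, assuming it appends an even per-task progress increment to each subtask's arguments so every completed chunk can advance the shared progress key (with 4 chunks, each would bump it by 25 points):

def add_cache_increment_parameter(tasks):
    """Append a per-task progress increment to each subtask's args.

    Hypothetical sketch: with N tasks, each should advance the shared
    progress key by 100/N percent on completion. The real helper's
    behavior may differ in detail.
    """
    denom = len(tasks) or 1
    increment = 1.0 / denom * 100
    for t in tasks:
        # Celery signatures expose positional args as t.args; append the
        # increment so the chunk task receives it as its last argument.
        t.args = t.args + (increment,)
    return tasks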
def finish_cleansing(results, file_pk):
    """Chord callback that runs after the cleansing is complete.

    :param results: results from the completed cleansing chord
    :param file_pk: import file primary key
    :return:
    """
    prog_key = get_prog_key("cleanse_data", file_pk)
    cache.set(prog_key, 100)
def __init__(self, func_name, unique_id, init_data=None):
    self.func_name = func_name
    self.unique_id = unique_id
    self.key = get_prog_key(func_name, unique_id)
    self.total = None
    self.increment_by = None

    # Load in the initialized data; some of this may be overloaded based
    # on the contents in the cache.
    self.initialize(init_data)

    # Read the data from the cache, if there is any.
    self.load()
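The `initialize` and `load` methods this constructor calls are not part of the collection. A speculative sketch of the contract it implies; the `ProgressData` class name and both method bodies are assumptions for illustration only:

from django.core.cache import cache


class ProgressData(object):
    """Hypothetical host class for the constructor above (name assumed)."""

    def initialize(self, init_data=None):
        # Apply caller-supplied fields such as 'total' or 'increment_by'.
        for field, value in (init_data or {}).items():
            setattr(self, field, value)

    def load(self):
        # Merge any state already cached under self.key so a restarted
        # worker resumes rather than resets the progress record.
        cached = cache.get(self.key)
        if isinstance(cached, dict):
            self.total = cached.get('total', self.total)
            self.increment_by = cached.get('increment_by', self.increment_by)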
def test_progress(self):
    """When a task finishes, it increments the progress counter properly."""
    increment = expected = 25.0
    key = decorators.get_prog_key('fake_func', self.pk)
    self.assertEqual(float(get_cache(key, 0.0)['progress']), 0.0)

    @decorators.lock_and_track
    def fake_func(import_file_pk):
        increment_cache(key, increment)

    fake_func(self.pk)
    self.assertEqual(float(get_cache(key, 0.0)['progress']), expected)
def test_progress(self):
    """When a task finishes, it increments the progress counter properly."""
    increment = expected = 25.0
    key = decorators.get_prog_key('fake_func', self.pk)
    self.assertEqual(float(cache.get(key, 0.0)), 0.0)

    @decorators.lock_and_track
    def fake_func(import_file_pk):
        decorators.increment_cache(key, increment)

    fake_func(self.pk)
    self.assertEqual(float(cache.get(key, 0.0)), expected)
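Both test variants lean on an `increment_cache` helper that isn't defined in these snippets. A hedged sketch that tolerates the two storage shapes seen above (a bare number stored via `cache.set`, or a dict with a 'progress' field); the project's real helper may differ:

from django.core.cache import cache


def increment_cache(key, increment):
    """Add `increment` percentage points to the progress stored at `key`.

    Sketch only: normalizes bare-number payloads into the dict shape and
    caps progress at 100.
    """
    data = cache.get(key)
    if not isinstance(data, dict):
        data = {'status': 'parsing', 'progress': float(data or 0.0)}
    data['progress'] = min(float(data['progress']) + float(increment), 100.0)
    cache.set(key, data)
    return data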
def finish_cleansing(file_pk):
    """Chord callback that runs after the cleansing is complete.

    :param file_pk: import file primary key
    :return:
    """
    prog_key = get_prog_key('cleanse_data', file_pk)
    result = {
        'status': 'success',
        'progress': 100,
        'message': 'cleansing complete'
    }
    set_cache(prog_key, result['status'], result)
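The newer snippets store structured progress through `set_cache` and read it back with `get_cache`, instead of putting a bare number under the key. A plausible sketch of that wrapper pair, assuming Django's cache underneath; the real helpers may normalize payloads differently:

from django.core.cache import cache


def set_cache(progress_key, status, data):
    """Store a progress payload plus its status under one cache key.

    Sketch only, mirroring the call shape set_cache(key, status, dict).
    """
    if not isinstance(data, dict):
        data = {'progress': data}
    data['status'] = status
    cache.set(progress_key, data)


def get_cache(progress_key, default=None):
    """Fetch the progress payload, falling back to a default progress."""
    result = cache.get(progress_key)
    if result is None:
        result = {'status': 'unknown', 'progress': default}
    return result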
def test_progress(self):
    """Make sure we retrieve data from cache properly."""
    progress_key = decorators.get_prog_key('fun_func', 23)
    expected = 50.0
    cache.set(progress_key, expected)
    resp = self.client.post(
        reverse_lazy("seed:progress"),
        data=json.dumps({
            'progress_key': progress_key,
        }),
        content_type='application/json'
    )
    self.assertEqual(resp.status_code, 200)
    body = json.loads(resp.content)
    self.assertEqual(body.get('progress', 0), expected)
    self.assertEqual(body.get('progress_key', ''), progress_key)
def remap_data(import_file_pk):
    """Delete mapped buildings for the current import file, then re-map them."""
    import_file = ImportFile.objects.get(pk=import_file_pk)
    # Check to ensure that the buildings have not already been merged.
    mapping_cache_key = get_prog_key('map_data', import_file.pk)
    if import_file.matching_done or import_file.matching_completion:
        cache.set(mapping_cache_key, 100)
        return {
            'status': 'warning',
            'message': 'Mapped buildings already merged'
        }

    _remap_data.delay(import_file_pk)

    # Make sure that our mapping cache progress is reset.
    cache.set(mapping_cache_key, 0)

    # Here we also return the mapping_cache_key so that the front end can
    # follow the progress.
    return {'status': 'success', 'progress_key': mapping_cache_key}
def test_remap_buildings(self):
    """Test good case for resetting mapping."""
    # Make raw BSes; these should stick around.
    for x in range(10):
        test_util.make_fake_snapshot(self.import_file, {}, ASSESSED_RAW)
    # Make "mapped" BSes; these should get removed.
    for x in range(10):
        test_util.make_fake_snapshot(self.import_file, {}, ASSESSED_BS)

    # Set import file like we're done mapping.
    self.import_file.mapping_done = True
    self.import_file.mapping_progress = 100
    self.import_file.save()

    # Set cache like we're done mapping.
    cache_key = decorators.get_prog_key('map_data', self.import_file.pk)
    cache.set(cache_key, 100)

    resp = self.client.post(
        reverse_lazy("seed:remap_buildings"),
        data=json.dumps({
            'file_id': self.import_file.pk,
        }),
        content_type='application/json'
    )
    self.assertEqual(resp.status_code, 200)

    self.assertEqual(
        BuildingSnapshot.objects.filter(
            import_file=self.import_file,
            source_type__in=(ASSESSED_BS, PORTFOLIO_BS)
        ).count(),
        0
    )
    self.assertEqual(
        BuildingSnapshot.objects.filter(
            import_file=self.import_file,
        ).count(),
        10
    )
    self.assertEqual(cache.get(cache_key), 0)
def match_buildings(file_pk):
    """Kicks off system matching; returns a progress key."""
    import_file = ImportFile.objects.get(pk=file_pk)
    if import_file.matching_done:
        prog_key = get_prog_key('match_buildings', file_pk)
        cache.set(prog_key, 100)
        return {'status': 'warning', 'message': 'matching already complete'}

    if not import_file.mapping_done:
        # Re-add to the queue; hopefully our mapping will be done by then.
        match_buildings.apply_async(args=[file_pk], countdown=10, expires=20)
        return {
            'status': 'error',
            'message': 'waiting for mapping to complete'
        }

    _match_buildings.delay(file_pk)

    return {'status': 'success'}
def match_buildings(file_pk, user_pk):
    """Kicks off system matching; returns a progress key."""
    import_file = ImportFile.objects.get(pk=file_pk)
    if import_file.matching_done:
        prog_key = get_prog_key('match_buildings', file_pk)
        cache.set(prog_key, 100)
        return {'status': 'warning', 'message': 'matching already complete'}

    if not import_file.mapping_done:
        # Re-add to the queue; hopefully our mapping will be done by then.
        match_buildings.apply_async(
            args=[file_pk, user_pk], countdown=10, expires=20
        )
        return {
            'status': 'error',
            'message': 'waiting for mapping to complete'
        }

    _match_buildings.delay(file_pk, user_pk)

    return {'status': 'success'}
def _save_raw_green_button_data(file_pk, *args, **kwargs):
    """
    Pulls identifying information out of the XML data, find_or_creates
    a building_snapshot for the data, parses and stores the timeseries
    meter data, and associates it with the building snapshot.
    """
    import_file = ImportFile.objects.get(pk=file_pk)
    import_file.raw_save_done = True
    import_file.save()

    res = xml_importer.import_xml(import_file)

    prog_key = get_prog_key('save_raw_data', file_pk)
    cache.set(prog_key, 100)

    if res:
        return {'status': 'success'}
    return {'status': 'error', 'message': 'data failed to import'}
def delete_organization_inventory(request):
    """
    Starts a background task to delete all properties & taxlots in an org.

    :DELETE: Expects 'organization_id' as a query parameter.

    Returns::

        {
            'status': 'success' or 'error',
            'progress_key': ID of background job, for retrieving job progress
        }
    """
    org_id = request.query_params.get('organization_id', None)
    deleting_cache_key = get_prog_key('delete_organization_inventory', org_id)
    tasks.delete_organization_inventory.delay(org_id, deleting_cache_key)
    return JsonResponse({
        'status': 'success',
        'progress': 0,
        'progress_key': deleting_cache_key
    })
def cleansing_progress(self, request, pk=None):
    """
    Return the progress of the cleansing.
    ---
    type:
        status:
            required: true
            type: string
            description: either success or error
        progress:
            type: integer
            description: status of background cleansing task
    parameter_strategy: replace
    parameters:
        - name: pk
          description: Import file ID
          required: true
          paramType: path
    """
    import_file_id = pk
    prog_key = get_prog_key('get_progress', import_file_id)
    # Avoid shadowing Django's `cache` module with the cached payload.
    progress_data = get_cache(prog_key)
    return HttpResponse(progress_data['progress'])
def test_get_prog_key(self):
    """We format our cache key properly."""
    expected = cache.make_key('SEED:fun_func:PROG:34')
    self.assertEqual(decorators.get_prog_key('fun_func', 34), expected)
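Both key-format tests pin `get_prog_key` to the shape `SEED:<func_name>:PROG:<unique_id>`, run through the cache's `make_key`. A minimal implementation consistent with those assertions (still a sketch; the project's body may differ in detail):

from django.core.cache import cache


def get_prog_key(func_name, unique_id):
    """Build the namespaced cache key used to track a task's progress.

    Matches the format the tests assert, passed through make_key so it
    picks up the cache backend's own key prefixing.
    """
    return cache.make_key('SEED:{0}:PROG:{1}'.format(func_name, unique_id))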
def _match_buildings(file_pk):
    """ngram search against all of the canonical_building snapshots for org."""
    min_threshold = settings.MATCH_MIN_THRESHOLD
    import_file = ImportFile.objects.get(pk=file_pk)
    prog_key = get_prog_key('match_buildings', file_pk)
    org = Organization.objects.filter(
        users=import_file.import_record.owner
    )[0]
    unmatched_buildings = find_unmatched_building_values(import_file)

    # If we don't find any unmatched buildings, there's nothing left to do.
    if not unmatched_buildings:
        _finish_matching(import_file, prog_key)
        return

    # Here we want all the values not related to the BS id for doing comps.
    unmatched_ngrams = [
        _stringify(list(values)[1:]) for values in unmatched_buildings
    ]

    canonical_buildings = find_canonical_building_values(org)
    if not canonical_buildings:
        # There are no canonical_buildings for this organization, so all
        # unmatched buildings will then become canonicalized.
        hydrated_unmatched_buildings = BuildingSnapshot.objects.filter(
            pk__in=[item[0] for item in unmatched_buildings]
        )
        num_unmatched = len(unmatched_ngrams) or 1
        increment = 1.0 / num_unmatched * 100
        for (i, unmatched) in enumerate(hydrated_unmatched_buildings):
            initialize_canonical_building(unmatched)
            if i % 100 == 0:
                increment_cache(prog_key, increment * 100)
        _finish_matching(import_file, prog_key)
        return

    # This allows us to retrieve the PK for a given NGram after a match.
    can_rev_idx = {
        _stringify(value[1:]): value[0] for value in canonical_buildings
    }
    n = ngram.NGram(
        [_stringify(values[1:]) for values in canonical_buildings]
    )

    # For progress tracking.
    num_unmatched = len(unmatched_ngrams) or 1
    increment = 1.0 / num_unmatched * 100

    # PKs when we have a match.
    import_file.mapping_completion = 0
    import_file.save()
    for i, building in enumerate(unmatched_ngrams):
        results = n.search(building, min_threshold)
        if results:
            handle_results(results, i, can_rev_idx, unmatched_buildings)
        else:
            hydrated_building = BuildingSnapshot.objects.get(
                pk=unmatched_buildings[i][0]
            )
            initialize_canonical_building(hydrated_building)

        if i % 100 == 0:
            increment_cache(prog_key, increment * 100)
            import_file.mapping_completion += int(increment * 100)
            import_file.save()

    _finish_matching(import_file, prog_key)
    return {'status': 'success'}
def test_get_prog_key(self):
    """We format our cache key properly."""
    expected = make_key('SEED:fun_func:PROG:' + str(self.pk))
    self.assertEqual(decorators.get_prog_key('fun_func', self.pk), expected)
def _match_buildings(file_pk, user_pk):
    """Match unmatched buildings against the org's canonical snapshots by
    normalized address (formerly an ngram search)."""
    min_threshold = settings.MATCH_MIN_THRESHOLD
    import_file = ImportFile.objects.get(pk=file_pk)
    prog_key = get_prog_key('match_buildings', file_pk)
    org = Organization.objects.filter(
        users=import_file.import_record.owner
    )[0]
    unmatched_buildings = find_unmatched_buildings(import_file)

    duplicates = []
    newly_matched_building_pks = []

    # Filter out matches based on ID. If the match is a duplicate of other
    # existing data, add it to a list and indicate which existing record it
    # is a duplicate of.
    for unmatched in unmatched_buildings:
        try:
            match = handle_id_matches(unmatched, import_file, user_pk)
        except DuplicateDataError as e:
            duplicates.append(unmatched.pk)
            unmatched.duplicate_id = e.id
            unmatched.save()
            continue

        if match:
            newly_matched_building_pks.extend([match.pk, unmatched.pk])

    # Remove any buildings we just did exact ID matches with.
    unmatched_buildings = unmatched_buildings.exclude(
        pk__in=newly_matched_building_pks
    ).values_list(*BS_VALUES_LIST)

    # If we don't find any unmatched buildings, there's nothing left to do.
    if not unmatched_buildings:
        _finish_matching(import_file, prog_key)
        return

    # Here we deal with duplicates.
    unmatched_buildings = unmatched_buildings.exclude(
        pk__in=duplicates
    ).values_list(*BS_VALUES_LIST)
    if not unmatched_buildings:
        _finish_matching(import_file, prog_key)
        return

    # Normalize the addresses to match on the address_1 field. This is not
    # ideal, because two locations could share address_1 but differ by city.
    unmatched_normalized_addresses = [
        _normalize_address_str(unmatched[4])
        for unmatched in unmatched_buildings
    ]

    canonical_buildings = find_canonical_building_values(org)
    if not canonical_buildings:
        # There are no canonical_buildings for this organization, so all
        # unmatched buildings will then become canonicalized.
        hydrated_unmatched_buildings = BuildingSnapshot.objects.filter(
            pk__in=[item[0] for item in unmatched_buildings]
        )
        num_unmatched = len(unmatched_normalized_addresses) or 1
        increment = 1.0 / num_unmatched * 100
        for (i, unmatched) in enumerate(hydrated_unmatched_buildings):
            initialize_canonical_building(unmatched, user_pk)
            if i % 100 == 0:
                increment_cache(prog_key, increment * 100)
        _finish_matching(import_file, prog_key)
        return

    # This allows us to retrieve the PK for a given normalized address
    # after a match.
    can_rev_idx = {
        _normalize_address_str(value[4]): value[0]
        for value in canonical_buildings
    }

    # Matching previously built an ngram.NGram index over _stringify'd
    # canonical values (_stringify joins everything after the PK with
    # single spaces); it now compares normalized address_1 values directly,
    # with the same shared-address caveat as above.
    canonical_buildings_addresses = [
        _normalize_address_str(values[4]) for values in canonical_buildings
    ]

    # For progress tracking we now use the addresses.
    num_unmatched = len(unmatched_normalized_addresses) or 1
    increment = (1.0 / num_unmatched) * 100

    import_file.mapping_completion = 0
    import_file.save()
    # Exact matching on normalized addresses.
    for i, un_m_address in enumerate(unmatched_normalized_addresses):
        results = _findMatches(un_m_address, canonical_buildings_addresses)
        if results:
            handle_results(
                results, i, can_rev_idx, unmatched_buildings, user_pk
            )
        else:
            hydrated_building = BuildingSnapshot.objects.get(
                pk=unmatched_buildings[i][0]
            )
            initialize_canonical_building(hydrated_building, user_pk)

        if i % 100 == 0:
            increment_cache(prog_key, increment * 100)
            import_file.mapping_completion += int(increment * 100)
            import_file.save()

    _finish_matching(import_file, prog_key)
    return {'status': 'success'}
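Neither `_normalize_address_str` nor `_findMatches` appears in this collection. A hedged sketch of the pieces the address-matching variant relies on, assuming simple lowercase/punctuation normalization and an exact-equality scan; real address normalization (street-type abbreviations, unit numbers, etc.) is considerably more involved:

import re


def _normalize_address_str(address):
    """Lowercase, strip punctuation, and collapse whitespace in an address.

    Sketch only; shown to make the matching loops above self-explanatory.
    """
    if address is None:
        return None
    normalized = re.sub(r'[^\w\s]', '', address.lower())
    return ' '.join(normalized.split())


def _findMatches(un_m_address, canonical_buildings_addresses):
    """Return the canonical addresses exactly equal to the unmatched one."""
    if un_m_address is None:
        return []
    return [
        addr for addr in canonical_buildings_addresses
        if addr == un_m_address
    ]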
def finish_raw_save(results, file_pk):
    import_file = ImportFile.objects.get(pk=file_pk)
    import_file.raw_save_done = True
    import_file.save()
    prog_key = get_prog_key('save_raw_data', file_pk)
    cache.set(prog_key, 100)
def finish_delete(results, org_pk):
    prog_key = get_prog_key('delete_organization_buildings', org_pk)
    cache.set(prog_key, 100)
def finish_raw_save(results, file_pk):
    import_file = ImportFile.objects.get(pk=file_pk)
    import_file.raw_save_done = True
    import_file.save()
    prog_key = get_prog_key('save_raw_data', file_pk)
    cache.set(prog_key, {'status': 'success', 'progress': 100})
def _match_buildings(file_pk, user_pk):
    """Match unmatched buildings against the org's canonical snapshots by
    normalized address (formerly an ngram search)."""
    min_threshold = settings.MATCH_MIN_THRESHOLD
    import_file = ImportFile.objects.get(pk=file_pk)
    prog_key = get_prog_key('match_buildings', file_pk)
    org = Organization.objects.filter(
        users=import_file.import_record.owner
    )[0]
    unmatched_buildings = find_unmatched_buildings(import_file)

    newly_matched_building_pks = []
    for unmatched in unmatched_buildings:
        match = handle_id_matches(unmatched, import_file, user_pk)
        if match:
            newly_matched_building_pks.extend([match.pk, unmatched.pk])

    # Remove any buildings we just did exact ID matches with.
    unmatched_buildings = unmatched_buildings.exclude(
        pk__in=newly_matched_building_pks
    ).values_list(*BS_VALUES_LIST)

    # If we don't find any unmatched buildings, there's nothing left to do.
    if not unmatched_buildings:
        _finish_matching(import_file, prog_key)
        return

    # Normalize the addresses to match on the address_1 field. This is not
    # ideal, because two locations could share address_1 but differ by city.
    unmatched_normalized_addresses = [
        _normalize_address_str(unmatched[4])
        for unmatched in unmatched_buildings
    ]

    canonical_buildings = find_canonical_building_values(org)
    if not canonical_buildings:
        # There are no canonical_buildings for this organization, so all
        # unmatched buildings will then become canonicalized.
        hydrated_unmatched_buildings = BuildingSnapshot.objects.filter(
            pk__in=[item[0] for item in unmatched_buildings]
        )
        num_unmatched = len(unmatched_normalized_addresses) or 1
        increment = 1.0 / num_unmatched * 100
        for (i, unmatched) in enumerate(hydrated_unmatched_buildings):
            initialize_canonical_building(unmatched, user_pk)
            if i % 100 == 0:
                increment_cache(prog_key, increment * 100)
        _finish_matching(import_file, prog_key)
        return

    # This allows us to retrieve the PK for a given normalized address
    # after a match.
    can_rev_idx = {
        _normalize_address_str(value[4]): value[0]
        for value in canonical_buildings
    }

    # Matching previously built an ngram.NGram index over _stringify'd
    # canonical values (_stringify joins everything after the PK with
    # single spaces); it now compares normalized address_1 values directly,
    # with the same shared-address caveat as above.
    canonical_buildings_addresses = [
        _normalize_address_str(values[4]) for values in canonical_buildings
    ]

    # For progress tracking we now use the addresses.
    num_unmatched = len(unmatched_normalized_addresses) or 1
    increment = (1.0 / num_unmatched) * 100

    # PKs when we have a match.
    import_file.mapping_completion = 0
    import_file.save()
    # Exact matching on normalized addresses.
    for i, un_m_address in enumerate(unmatched_normalized_addresses):
        results = _findMatches(un_m_address, canonical_buildings_addresses)
        if results:
            handle_results(
                results, i, can_rev_idx, unmatched_buildings, user_pk
            )
        else:
            hydrated_building = BuildingSnapshot.objects.get(
                pk=unmatched_buildings[i][0]
            )
            initialize_canonical_building(hydrated_building, user_pk)

        if i % 100 == 0:
            increment_cache(prog_key, increment * 100)
            import_file.mapping_completion += int(increment * 100)
            import_file.save()

    _finish_matching(import_file, prog_key)
    return {'status': 'success'}
def _match_buildings(file_pk, user_pk):
    """ngram search against all of the canonical_building snapshots for org."""
    min_threshold = settings.MATCH_MIN_THRESHOLD
    import_file = ImportFile.objects.get(pk=file_pk)
    prog_key = get_prog_key('match_buildings', file_pk)
    org = Organization.objects.filter(
        users=import_file.import_record.owner
    )[0]
    unmatched_buildings = find_unmatched_buildings(import_file)

    newly_matched_building_pks = []
    for unmatched in unmatched_buildings:
        match = handle_id_matches(unmatched, import_file, user_pk)
        if match:
            newly_matched_building_pks.extend([match.pk, unmatched.pk])

    # Remove any buildings we just did exact ID matches with.
    unmatched_buildings = unmatched_buildings.exclude(
        pk__in=newly_matched_building_pks
    ).values_list(*BS_VALUES_LIST)

    # If we don't find any unmatched buildings, there's nothing left to do.
    if not unmatched_buildings:
        _finish_matching(import_file, prog_key)
        return

    # Here we want all the values not related to the BS id for doing comps.
    unmatched_ngrams = [
        _stringify(list(values)[1:]) for values in unmatched_buildings
    ]

    canonical_buildings = find_canonical_building_values(org)
    if not canonical_buildings:
        # There are no canonical_buildings for this organization, so all
        # unmatched buildings will then become canonicalized.
        hydrated_unmatched_buildings = BuildingSnapshot.objects.filter(
            pk__in=[item[0] for item in unmatched_buildings]
        )
        num_unmatched = len(unmatched_ngrams) or 1
        increment = 1.0 / num_unmatched * 100
        for (i, unmatched) in enumerate(hydrated_unmatched_buildings):
            initialize_canonical_building(unmatched, user_pk)
            if i % 100 == 0:
                increment_cache(prog_key, increment * 100)
        _finish_matching(import_file, prog_key)
        return

    # This allows us to retrieve the PK for a given NGram after a match.
    can_rev_idx = {
        _stringify(value[1:]): value[0] for value in canonical_buildings
    }
    n = ngram.NGram(
        [_stringify(values[1:]) for values in canonical_buildings]
    )

    # For progress tracking.
    num_unmatched = len(unmatched_ngrams) or 1
    increment = 1.0 / num_unmatched * 100

    # PKs when we have a match.
    import_file.mapping_completion = 0
    import_file.save()
    for i, building in enumerate(unmatched_ngrams):
        results = n.search(building, min_threshold)
        if results:
            handle_results(
                results, i, can_rev_idx, unmatched_buildings, user_pk
            )
        else:
            hydrated_building = BuildingSnapshot.objects.get(
                pk=unmatched_buildings[i][0]
            )
            initialize_canonical_building(hydrated_building, user_pk)

        if i % 100 == 0:
            increment_cache(prog_key, increment * 100)
            import_file.mapping_completion += int(increment * 100)
            import_file.save()

    _finish_matching(import_file, prog_key)
    return {'status': 'success'}
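`_stringify` is only described in the comments of the address-matching variants: it takes everything after the leading PK and joins the values with single spaces, so each building becomes one searchable string for the ngram index. A sketch matching that description (Python 2, like the surrounding code); the real helper may handle types differently:

def _stringify(values):
    """Concatenate a building's identifying values with single spaces.

    Sketch only: skips None values and coerces the rest to unicode so the
    result can feed ngram.NGram construction and search.
    """
    return u' '.join([unicode(v) for v in values if v is not None])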