Example #1
def map_row_chunk(chunk, file_pk, source_type, prog_key, increment, *args,
                  **kwargs):
    """Does the work of matching a mapping to a source type and saving

    :param chunk: list of dict of str. One row's worth of parse data.
    :param file_pk: int, the PK for an ImportFile obj.
    :param source_type: int, represented by either ASSESSED_RAW, or
        PORTFOLIO_RAW.
    :param cleaner: (optional), the cleaner class you want to send
    to mapper.map_row. (e.g. turn numbers into floats.).
    :param raw_ids: (optional kwarg), the list of ids in chunk order.

    """
    import_file = ImportFile.objects.get(pk=file_pk)
    save_type = PORTFOLIO_BS
    if source_type == ASSESSED_RAW:
        save_type = ASSESSED_BS

    concats = []

    org = Organization.objects.get(
        pk=import_file.import_record.super_organization.pk)

    mapping, concats = get_column_mappings(org)
    map_cleaner = _build_cleaner(org)

    # For those column mappings which are not db columns, we need to
    # let MCM know to apply our mapping function to them.
    apply_columns = []

    mappable_columns = get_mappable_columns()
    for item in mapping:
        if mapping[item] not in mappable_columns:
            apply_columns.append(item)

    apply_func = apply_data_func(mappable_columns)

    model = None  # guard against an empty chunk for the ``if model`` check below
    for row in chunk:
        model = mapper.map_row(row,
                               mapping,
                               BuildingSnapshot,
                               cleaner=map_cleaner,
                               concat=concats,
                               apply_columns=apply_columns,
                               apply_func=apply_func,
                               *args,
                               **kwargs)

        model.import_file = import_file
        model.source_type = save_type
        model.clean()
        model.super_organization = import_file.import_record.super_organization
        model.save()
    if model:
        # Make sure that we've saved all of the extra_data column names
        save_column_names(model, mapping=mapping)

    increment_cache(prog_key, increment)
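Callers of map_row_chunk are expected to split the parsed file into chunks and pre-compute prog_key and increment so each chunk advances the shared progress cache by an equal share. A minimal sketch of such a driver, assuming a hypothetical map_all_rows entry point, a chunk size of 100, and a 'map_data' progress label (none of which appear in the examples here):

def map_all_rows(parsed_rows, file_pk, source_type):
    # Hypothetical driver: give every chunk an equal slice of the 0-100 range.
    chunk_size = 100  # assumed batch size
    chunks = [
        parsed_rows[i:i + chunk_size]
        for i in range(0, len(parsed_rows), chunk_size)
    ]
    prog_key = get_prog_key('map_data', file_pk)
    increment = (1.0 / (len(chunks) or 1)) * 100
    for chunk in chunks:
        map_row_chunk(chunk, file_pk, source_type, prog_key, increment)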
Example #2
def map_row_chunk(
    chunk, file_pk, source_type, prog_key, increment, *args, **kwargs
):
    """Does the work of matching a mapping to a source type and saving

    :param chunk: list of dict of str. One row's worth of parse data.
    :param file_pk: int, the PK for an ImportFile obj.
    :param source_type: int, represented by either ASSESSED_RAW, or
        PORTFOLIO_RAW.
    :param cleaner: (optional), the cleaner class you want to send
    to mapper.map_row. (e.g. turn numbers into floats.).
    :param raw_ids: (optional kwarg), the list of ids in chunk order.

    """
    import_file = ImportFile.objects.get(pk=file_pk)
    save_type = PORTFOLIO_BS
    mapping = espm_mapping.MAP
    map_cleaner = PORTFOLIO_CLEANER

    # Default to PM so we don't unnecessarily query for mapping
    if source_type == ASSESSED_RAW:
        org = Organization.objects.filter(
            users=import_file.import_record.owner
        )[0]
        mapping = get_column_mappings(org)
        save_type = ASSESSED_BS
        map_cleaner = ASSESSED_CLEANER

    # Pull out any columns meant to be concatenated together.
    mapping, concats = filter_concat_configs(mapping)

    for row in chunk:
        model = mapper.map_row(
            row,
            mapping,
            BuildingSnapshot,
            cleaner=map_cleaner,
            concat=concats,
            *args,
            **kwargs
        )

        model.import_file = import_file
        model.source_type = save_type
        model.clean()
        model.super_organization = import_file.import_record.super_organization
        model.save()

    increment_cache(prog_key, increment)
Example #3
def _save_raw_data_chunk(chunk, file_pk, prog_key, increment, *args, **kwargs):
    """Save the raw data to the database."""
    import_file = ImportFile.objects.get(pk=file_pk)
    # Save our "column headers" and sample rows for F/E.
    source_type = get_source_type(import_file)
    for c in chunk:
        raw_bs = BuildingSnapshot()
        raw_bs.import_file = import_file
        raw_bs.extra_data = c
        raw_bs.source_type = source_type

        # We require a save to get our PK
        # We save here to set our initial source PKs.
        raw_bs.save()
        super_org = import_file.import_record.super_organization
        raw_bs.super_organization = super_org

        set_initial_sources(raw_bs)
        raw_bs.save()

    # Indicate progress
    increment_cache(prog_key, increment)
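get_source_type is not shown in these examples; all that matters here is that it returns the raw source type (ASSESSED_RAW or PORTFOLIO_RAW) for the file. A purely hypothetical sketch, assuming the ImportFile exposes a from_portfolio_manager flag (the real helper may decide differently):

def get_source_type(import_file):
    # Hypothetical: choose the raw source type from a flag on the import file.
    if getattr(import_file, 'from_portfolio_manager', False):
        return PORTFOLIO_RAW
    return ASSESSED_RAW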
Example #4
    def test_increment_cache(self):
        """Sum our progress by increments properly."""
        expected = 25.0
        test_key = cache.make_key('increment_test')
        increment = 25.0
        # Fresh increment, this initializes the value.
        decorators.increment_cache(test_key, increment)
        self.assertEqual(float(cache.get(test_key)), expected)

        # Increment an existing key
        decorators.increment_cache(test_key, increment)
        expected = 50.0
        self.assertEqual(float(cache.get(test_key)), expected)

        # These increments would push the total well over 100.0 if there were
        # no bounds check; increment_cache should cap it at 100.0.
        for i in range(10):
            decorators.increment_cache(test_key, increment)

        expected = 100.0
        self.assertEqual(float(cache.get(test_key)), expected)
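The test above pins down the behavior an increment_cache helper needs: initialize the key on first use, add on subsequent calls, and clamp the running total at 100.0. A minimal sketch consistent with those assertions (the real helper may store richer progress data alongside the number):

def increment_cache(key, increment):
    # Add ``increment`` to the cached progress value, capping it at 100.0
    # so repeated increments cannot report more than complete.
    current = float(cache.get(key) or 0.0)
    cache.set(key, min(current + float(increment), 100.0))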
Example #5
def fake_func(import_file_pk):
    decorators.increment_cache(key, increment)
Example #6
def _match_buildings(file_pk, user_pk):
    """Match unmatched buildings against the org's canonical_building
    snapshots by normalized address_1."""
    min_threshold = settings.MATCH_MIN_THRESHOLD
    import_file = ImportFile.objects.get(pk=file_pk)
    prog_key = get_prog_key('match_buildings', file_pk)
    org = Organization.objects.filter(
        users=import_file.import_record.owner
    )[0]
    unmatched_buildings = find_unmatched_buildings(import_file)

    newly_matched_building_pks = []
    for unmatched in unmatched_buildings:
        match = handle_id_matches(unmatched, import_file, user_pk)
        if match:
            newly_matched_building_pks.extend([match.pk, unmatched.pk])

    # Remove any buildings we just did exact ID matches with.
    unmatched_buildings = unmatched_buildings.exclude(
        pk__in=newly_matched_building_pks
    ).values_list(*BS_VALUES_LIST)

    # If we don't find any unmatched buildings, there's nothing left to do.
    if not unmatched_buildings:
        _finish_matching(import_file, prog_key)
        return
    # Normalize the addresses so we can match on the address_1 field. This is
    # not ideal because two locations could share address_1 but differ by city.
    unmatched_normalized_addresses = [
        _normalize_address_str(unmatched[4]) for unmatched in unmatched_buildings
    ]

    canonical_buildings = find_canonical_building_values(org)
    if not canonical_buildings:
        # There are no canonical_buildings for this organization, all unmatched
        # buildings will then become canonicalized.
        hydrated_unmatched_buildings = BuildingSnapshot.objects.filter(
            pk__in=[item[0] for item in unmatched_buildings]
        )
        num_unmatched = len(unmatched_normalized_addresses) or 1
        increment = 1.0 / num_unmatched * 100
        for (i, unmatched) in enumerate(hydrated_unmatched_buildings):
            initialize_canonical_building(unmatched, user_pk)
            if i % 100 == 0:
                increment_cache(prog_key, increment * 100)

        _finish_matching(import_file, prog_key)
        return
    # This allows us to retrieve the PK for a given address after a match.
    can_rev_idx = {
        _normalize_address_str(value[4]): value[0] for value in canonical_buildings
    }
    # Normalize the canonical addresses the same way so we can match on the
    # address_1 field. This is not ideal because two locations could share
    # address_1 but differ by city.
    canonical_buildings_addresses = [
        _normalize_address_str(values[4]) for values in canonical_buildings
    ]
    # For progress tracking we now key off the normalized addresses.
    num_unmatched = len(unmatched_normalized_addresses) or 1
    increment = (1.0 / num_unmatched) * 100

    # PKs when we have a match.
    import_file.mapping_completion = 0
    import_file.save()
    # This section was changed to do exact matching on normalized addresses.
    for i, un_m_address in enumerate(unmatched_normalized_addresses):
        results = _findMatches(un_m_address, canonical_buildings_addresses)
        if results:
            handle_results(
                results, i, can_rev_idx, unmatched_buildings, user_pk
            )
        else:
            hydrated_building = BuildingSnapshot.objects.get(
                pk=unmatched_buildings[i][0]
            )
            initialize_canonical_building(hydrated_building, user_pk)

        if i % 100 == 0:
            increment_cache(prog_key, increment * 100)
            import_file.mapping_completion += int(increment * 100)
            import_file.save()

    _finish_matching(import_file, prog_key)

    return {'status': 'success'}
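The exact-match path above relies on two helpers that are not shown, _normalize_address_str and _findMatches. A rough sketch of what they could look like, assuming `re` is imported; the normalization rules and the exact shape of the result list handed to handle_results are assumptions:

def _normalize_address_str(address):
    # Hypothetical normalization: lower-case, drop punctuation, and collapse
    # whitespace so trivially different spellings of address_1 compare equal.
    if address is None:
        return None
    cleaned = re.sub(r'[^\w\s]', '', str(address).lower())
    return ' '.join(cleaned.split())


def _findMatches(un_m_address, canonical_buildings_addresses):
    # Hypothetical exact matcher: return the canonical addresses that equal
    # the normalized unmatched address (an empty list means no match).
    return [
        address for address in canonical_buildings_addresses
        if address is not None and address == un_m_address
    ]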
Example #7
def map_row_chunk(
    chunk, file_pk, source_type, prog_key, increment, *args, **kwargs
):
    """Does the work of matching a mapping to a source type and saving

    :param chunk: list of dict of str. One row's worth of parse data.
    :param file_pk: int, the PK for an ImportFile obj.
    :param source_type: int, represented by either ASSESSED_RAW, or
        PORTFOLIO_RAW.
    :param cleaner: (optional), the cleaner class you want to send
    to mapper.map_row. (e.g. turn numbers into floats.).
    :param raw_ids: (optional kwarg), the list of ids in chunk order.

    """
    import_file = ImportFile.objects.get(pk=file_pk)
    save_type = PORTFOLIO_BS
    if source_type == ASSESSED_RAW:
        save_type = ASSESSED_BS

    concats = []

    org = Organization.objects.get(
        pk=import_file.import_record.super_organization.pk
    )

    mapping, concats = get_column_mappings(org)
    map_cleaner = _build_cleaner(org)

    # For those column mappings which are not db columns, we need to
    # let MCM know to apply our mapping function to them.
    apply_columns = []

    mappable_columns = get_mappable_columns()
    for item in mapping:
        if mapping[item] not in mappable_columns:
            apply_columns.append(item)

    apply_func = apply_data_func(mappable_columns)

    model = None  # guard against an empty chunk for the ``if model`` check below
    for row in chunk:
        model = mapper.map_row(
            row,
            mapping,
            BuildingSnapshot,
            cleaner=map_cleaner,
            concat=concats,
            apply_columns=apply_columns,
            apply_func=apply_func,
            *args,
            **kwargs
        )

        model.import_file = import_file
        model.source_type = save_type
        model.clean()
        model.super_organization = import_file.import_record.super_organization
        model.save()
    if model:
        # Make sure that we've saved all of the extra_data column names
        save_column_names(model, mapping=mapping)

    increment_cache(prog_key, increment)
Example #8
def _delete_organization_buildings_chunk(del_ids, prog_key, increment,
                                         org_pk, *args, **kwargs):
    """deletes a list of ``del_ids`` and increments the cache"""
    qs = BuildingSnapshot.objects.filter(super_organization=org_pk)
    qs.filter(pk__in=del_ids).delete()
    increment_cache(prog_key, increment * 100)
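The deletion chunk above expects its caller to have computed increment as a fraction of the whole job (note the increment * 100 when the cache is bumped). A minimal sketch of such a caller, with the function name, chunk size, and 'delete_organization_buildings' progress label assumed for illustration:

def delete_organization_buildings(org_pk):
    # Hypothetical driver: chunk the org's snapshot PKs and give each chunk
    # an equal fractional share of the progress bar.
    ids = list(
        BuildingSnapshot.objects.filter(
            super_organization=org_pk
        ).values_list('pk', flat=True)
    )
    prog_key = get_prog_key('delete_organization_buildings', org_pk)
    chunk_size = 100  # assumed batch size
    chunks = [ids[i:i + chunk_size] for i in range(0, len(ids), chunk_size)]
    increment = 1.0 / (len(chunks) or 1)
    for del_ids in chunks:
        _delete_organization_buildings_chunk(del_ids, prog_key, increment, org_pk)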
Example #9
def _match_buildings(file_pk, user_pk):
    """ngram search against all of the canonical_building snapshots for org."""
    min_threshold = settings.MATCH_MIN_THRESHOLD
    import_file = ImportFile.objects.get(pk=file_pk)
    prog_key = get_prog_key('match_buildings', file_pk)
    org = Organization.objects.filter(
        users=import_file.import_record.owner
    )[0]

    unmatched_buildings = find_unmatched_buildings(import_file)

    newly_matched_building_pks = []
    for unmatched in unmatched_buildings:
        match = handle_id_matches(unmatched, import_file, user_pk)
        if match:
            newly_matched_building_pks.extend([match.pk, unmatched.pk])

    # Remove any buildings we just did exact ID matches with.
    unmatched_buildings = unmatched_buildings.exclude(
        pk__in=newly_matched_building_pks
    ).values_list(*BS_VALUES_LIST)

    # If we don't find any unmatched buildings, there's nothing left to do.
    if not unmatched_buildings:
        _finish_matching(import_file, prog_key)
        return

    # Here we want all the values not related to the BS id for doing comps.
    unmatched_ngrams = [
        _stringify(list(values)[1:]) for values in unmatched_buildings
    ]

    canonical_buildings = find_canonical_building_values(org)
    if not canonical_buildings:
        # There are no canonical_buildings for this organization, all unmatched
        # buildings will then become canonicalized.
        hydrated_unmatched_buildings = BuildingSnapshot.objects.filter(
            pk__in=[item[0] for item in unmatched_buildings]
        )
        num_unmatched = len(unmatched_ngrams) or 1
        increment = 1.0 / num_unmatched * 100
        for (i, unmatched) in enumerate(hydrated_unmatched_buildings):
            initialize_canonical_building(unmatched, user_pk)
            if i % 100 == 0:
                increment_cache(prog_key, increment * 100)

        _finish_matching(import_file, prog_key)
        return

    # This allows us to retrieve the PK for a given NGram after a match.
    can_rev_idx = {
        _stringify(value[1:]): value[0] for value in canonical_buildings
    }
    n = ngram.NGram(
        [_stringify(values[1:]) for values in canonical_buildings]
    )

    # For progress tracking

    num_unmatched = len(unmatched_ngrams) or 1
    increment = 1.0 / num_unmatched * 100

    # PKs when we have a match.
    import_file.mapping_completion = 0
    import_file.save()
    for i, building in enumerate(unmatched_ngrams):
        results = n.search(building, min_threshold)
        if results:
            handle_results(
                results, i, can_rev_idx, unmatched_buildings, user_pk
            )
        else:
            hydrated_building = BuildingSnapshot.objects.get(
                pk=unmatched_buildings[i][0]
            )
            initialize_canonical_building(hydrated_building, user_pk)

        if i % 100 == 0:
            increment_cache(prog_key, increment * 100)
            import_file.mapping_completion += int(increment * 100)
            import_file.save()

    _finish_matching(import_file, prog_key)
    return {'status': 'success'}
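The ngram variant above indexes and searches stringified building values via _stringify, which is not shown. A minimal sketch consistent with how it is used (everything after the PK joined with spaces); skipping None values is an assumption:

def _stringify(values):
    # Concatenate the identifying values (everything after the PK) into one
    # space-separated string for the ngram index and search.
    return ' '.join(str(value) for value in values if value is not None)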
Example #10
def _match_buildings(file_pk):
    """ngram search against all of the canonical_building snapshots for org."""
    min_threshold = settings.MATCH_MIN_THRESHOLD
    import_file = ImportFile.objects.get(pk=file_pk)
    prog_key = get_prog_key('match_buildings', file_pk)
    org = Organization.objects.filter(
        users=import_file.import_record.owner
    )[0]

    unmatched_buildings = find_unmatched_building_values(import_file)

    # If we don't find any unmatched buildings, there's nothing left to do.
    if not unmatched_buildings:
        _finish_matching(import_file, prog_key)
        return

    # Here we want all the values not related to the BS id for doing comps.
    unmatched_ngrams = [
        _stringify(list(values)[1:]) for values in unmatched_buildings
    ]

    canonical_buildings = find_canonical_building_values(org)
    if not canonical_buildings:
        # There are no canonical_buildings for this organization, all unmatched
        # buildings will then become canonicalized.
        hydrated_unmatched_buildings = BuildingSnapshot.objects.filter(
            pk__in=[item[0] for item in unmatched_buildings]
        )
        num_unmatched = len(unmatched_ngrams) or 1
        increment = 1.0 / num_unmatched * 100
        for (i, unmatched) in enumerate(hydrated_unmatched_buildings):
            initialize_canonical_building(unmatched)
            if i % 100 == 0:
                increment_cache(prog_key, increment * 100)

        _finish_matching(import_file, prog_key)
        return

    # This allows us to retrieve the PK for a given NGram after a match.
    can_rev_idx = {
        _stringify(value[1:]): value[0] for value in canonical_buildings
    }
    n = ngram.NGram(
        [_stringify(values[1:]) for values in canonical_buildings]
    )

    # For progress tracking

    num_unmatched = len(unmatched_ngrams) or 1
    increment = 1.0 / num_unmatched * 100

    # PKs when we have a match.
    import_file.mapping_completion = 0
    import_file.save()
    for i, building in enumerate(unmatched_ngrams):
        results = n.search(building, min_threshold)
        if results:
            handle_results(results, i, can_rev_idx, unmatched_buildings)

        else:
            hydrated_building = BuildingSnapshot.objects.get(
                pk=unmatched_buildings[i][0]
            )
            initialize_canonical_building(hydrated_building)

        if i % 100 == 0:
            increment_cache(prog_key, increment * 100)
            import_file.mapping_completion += int(increment * 100)
            import_file.save()

    _finish_matching(import_file, prog_key)
    return {'status': 'success'}
Example #11
def _match_buildings(file_pk, user_pk):
    """Match unmatched buildings against the org's canonical_building
    snapshots by normalized address_1."""
    min_threshold = settings.MATCH_MIN_THRESHOLD
    import_file = ImportFile.objects.get(pk=file_pk)
    prog_key = get_prog_key('match_buildings', file_pk)
    org = Organization.objects.filter(users=import_file.import_record.owner)[0]
    unmatched_buildings = find_unmatched_buildings(import_file)

    duplicates = []
    newly_matched_building_pks = []

    # Filter out matches based on ID. If a match is a duplicate of existing
    # data, add it to a list and record which existing record it duplicates.
    for unmatched in unmatched_buildings:
        try:
            match = handle_id_matches(unmatched, import_file, user_pk)
        except DuplicateDataError as e:
            duplicates.append(unmatched.pk)
            unmatched.duplicate_id = e.id
            unmatched.save()
            continue
        if match:
            newly_matched_building_pks.extend([match.pk, unmatched.pk])

    # Remove any buildings we just did exact ID matches with.
    unmatched_buildings = unmatched_buildings.exclude(
        pk__in=newly_matched_building_pks).values_list(*BS_VALUES_LIST)

    # If we don't find any unmatched buildings, there's nothing left to do.
    if not unmatched_buildings:
        _finish_matching(import_file, prog_key)
        return

    # Remove any buildings flagged as duplicates of existing data.
    unmatched_buildings = unmatched_buildings.exclude(
        pk__in=duplicates).values_list(*BS_VALUES_LIST)
    if not unmatched_buildings:
        _finish_matching(import_file, prog_key)
        return

    # Normalize the addresses so we can match on the address_1 field. This is
    # not ideal because two locations could share address_1 but differ by city.
    unmatched_normalized_addresses = [
        _normalize_address_str(unmatched[4])
        for unmatched in unmatched_buildings
    ]

    canonical_buildings = find_canonical_building_values(org)
    if not canonical_buildings:
        # There are no canonical_buildings for this organization, all unmatched
        # buildings will then become canonicalized.
        hydrated_unmatched_buildings = BuildingSnapshot.objects.filter(
            pk__in=[item[0] for item in unmatched_buildings])
        num_unmatched = len(unmatched_normalized_addresses) or 1
        increment = 1.0 / num_unmatched * 100
        for (i, unmatched) in enumerate(hydrated_unmatched_buildings):
            initialize_canonical_building(unmatched, user_pk)
            if i % 100 == 0:
                increment_cache(prog_key, increment * 100)

        _finish_matching(import_file, prog_key)
        return

    # This allows us to retrieve the PK for a given address after a match.
    can_rev_idx = {
        _normalize_address_str(value[4]): value[0]
        for value in canonical_buildings
    }
    # Normalize the canonical addresses the same way so we can match on the
    # address_1 field. This is not ideal because two locations could share
    # address_1 but differ by city.
    canonical_buildings_addresses = [
        _normalize_address_str(values[4]) for values in canonical_buildings
    ]
    # For progress tracking we now key off the normalized addresses.
    num_unmatched = len(unmatched_normalized_addresses) or 1
    increment = (1.0 / num_unmatched) * 100

    # PKs when we have a match.
    import_file.mapping_completion = 0
    import_file.save()
    # This section was changed to do exact matching on normalized addresses.
    for i, un_m_address in enumerate(unmatched_normalized_addresses):
        results = _findMatches(un_m_address, canonical_buildings_addresses)
        if results:
            handle_results(results, i, can_rev_idx, unmatched_buildings,
                           user_pk)
        else:
            hydrated_building = BuildingSnapshot.objects.get(
                pk=unmatched_buildings[i][0])
            initialize_canonical_building(hydrated_building, user_pk)

        if i % 100 == 0:
            increment_cache(prog_key, increment * 100)
            import_file.mapping_completion += int(increment * 100)
            import_file.save()

    _finish_matching(import_file, prog_key)

    return {'status': 'success'}
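In the last example handle_id_matches signals exact duplicates by raising DuplicateDataError, and the caller records e.id on the unmatched snapshot. A minimal sketch of an exception class consistent with that usage (the constructor signature is an assumption):

class DuplicateDataError(Exception):
    # Carries the PK of the existing record that already holds this data.
    def __init__(self, id):
        super(DuplicateDataError, self).__init__(id)
        self.id = id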