def load_values(return_fips=False):
    """
    Drop and reload the CountyMortgageData table, or just return a FIPS list.

    This is not used in the data pipeline and is mainly for local testing.
    The script assumes that `starting_date` and `through_date`
    have been set in constants.

    Passing `return_fips=True` skips the reload and returns a sorted list
    of the distinct valid FIPS values found in the source file. Values that
    `validate_fips` rejects are left out: it returns a falsy value for them,
    and a `None` mixed in with strings cannot be sorted on Python 3.

    Otherwise every existing CountyMortgageData row is deleted and one new
    row is saved for each source row whose date falls between
    `starting_date` and `through_date` (inclusive) and whose FIPS is valid.
    Nothing is returned in that case.
    """

    counter = 0
    source_url = "{}/{}".format(S3_SOURCE_BUCKET, S3_SOURCE_FILE)
    starting_date = MortgageDataConstant.objects.get(
        name='starting_date').date_value
    through_date = MortgageDataConstant.objects.get(
        name='through_date').date_value
    raw_data = read_in_s3_csv(source_url)
    # raw_data is a generator delivering data dicts, each representing a row
    if return_fips is True:
        validated = (validate_fips(row.get('fips')) for row in raw_data)
        # Drop rejected values before sorting so the result is
        # a homogeneous, sortable collection of FIPS strings.
        return sorted({fips for fips in validated if fips})
    logger.info("Deleting CountyMortgageData objects.")
    CountyMortgageData.objects.all().delete()
    logger.info("CountyMorgtgageData count is now {}".format(
        CountyMortgageData.objects.count()))
    for row in raw_data:
        sampling_date = parser.parse(row.get('date')).date()
        if starting_date <= sampling_date <= through_date:
            valid_fips = validate_fips(row.get('fips'))
            if valid_fips:
                county = County.objects.get(fips=valid_fips)
                obj = CountyMortgageData(
                    fips=valid_fips,
                    county=county,
                    date=sampling_date,
                    total=int(row.get('open')),
                    current=int(row.get('current')),
                    thirty=int(row.get('thirty')),
                    sixty=int(row.get('sixty')),
                    ninety=int(row.get('ninety')),
                    other=int(row.get('other')))
                obj.save()
                counter += 1
                # Progress feedback: a dot every 10,000 rows and
                # a running total every 100,000.
                if counter % 10000 == 0:  # pragma: no cover
                    sys.stdout.write('.')
                    sys.stdout.flush()
                if counter % 100000 == 0:  # pragma: no cover
                    logger.info("\n{}".format(counter))
    logger.info("\nCreated {} CountyMortgageData objects".format(
        CountyMortgageData.objects.count()))
def load_values(return_fips=False):
    """
    Drop and reload the CountyMortgageData table, or just return a FIPS list.

    This is not used in the data pipeline and is mainly for local testing.
    The script assumes that `starting_date` and `through_date`
    have been set in constants.

    Passing `return_fips=True` skips the reload and returns a sorted list
    of the distinct valid FIPS values found in the source file. Values that
    `validate_fips` rejects are left out: it returns a falsy value for them,
    and a `None` mixed in with strings cannot be sorted on Python 3.

    Otherwise every existing CountyMortgageData row is deleted and one new
    row is saved for each source row whose date falls between
    `starting_date` and `through_date` (inclusive) and whose FIPS is valid.
    Nothing is returned in that case.
    """

    counter = 0
    source_url = "{}/{}".format(S3_SOURCE_BUCKET, S3_SOURCE_FILE)
    starting_date = MortgageDataConstant.objects.get(
        name='starting_date').date_value
    through_date = MortgageDataConstant.objects.get(
        name='through_date').date_value
    raw_data = read_in_s3_csv(source_url)
    # raw_data is a generator delivering data dicts, each representing a row
    if return_fips is True:
        validated = (validate_fips(row.get('fips')) for row in raw_data)
        # Drop rejected values before sorting so the result is
        # a homogeneous, sortable collection of FIPS strings.
        return sorted({fips for fips in validated if fips})
    logger.info("Deleting CountyMortgageData objects.")
    CountyMortgageData.objects.all().delete()
    logger.info("CountyMorgtgageData count is now {}".format(
        CountyMortgageData.objects.count()))
    for row in raw_data:
        sampling_date = parser.parse(row.get('date')).date()
        if starting_date <= sampling_date <= through_date:
            valid_fips = validate_fips(row.get('fips'))
            if valid_fips:
                county = County.objects.get(fips=valid_fips)
                obj = CountyMortgageData(
                    fips=valid_fips,
                    county=county,
                    date=sampling_date,
                    total=int(row.get('open')),
                    current=int(row.get('current')),
                    thirty=int(row.get('thirty')),
                    sixty=int(row.get('sixty')),
                    ninety=int(row.get('ninety')),
                    other=int(row.get('other')))
                obj.save()
                counter += 1
                # Progress feedback: a dot every 10,000 rows and
                # a running total every 100,000.
                if counter % 10000 == 0:  # pragma: no cover
                    sys.stdout.write('.')
                    sys.stdout.flush()
                if counter % 100000 == 0:  # pragma: no cover
                    logger.info("\n{}".format(counter))
    logger.info("\nCreated {} CountyMortgageData objects".format(
        CountyMortgageData.objects.count()))
Ejemplo n.º 3
0
def create_dump(
        starting_date, through_date, dump_slug, sql=True):
    """
    Dump the in-range, valid-FIPS rows of the S3 source CSV to a file.

    Sample input CSV field_names and row:
    date,fips,open,current,thirty,sixty,ninety,other
    01/01/08,1001,268,260,4,1,0,3

    Rows are kept when their date lies between `starting_date` and
    `through_date` (inclusive) and `validate_fips` accepts their FIPS.
    Each output row gets a sequential pk starting at 1 and ends with
    the pk of the matching County.

    Default is to dump SQL for mysql loading. Alternative is to dump CSV.
    CSV is portable and less brittle, but our mysql setup doesn't allow it.
    If we switch to Postgres, we can make CSV the default.
    """

    began = datetime.datetime.now()
    rows_out = []
    source_url = "{}/{}".format(S3_SOURCE_BUCKET, S3_SOURCE_FILE)
    for row in read_in_s3_csv(source_url):
        sampling_date = parser.parse(row.get('date')).date()
        # Skip rows that fall outside the requested date window.
        if sampling_date < starting_date or sampling_date > through_date:
            continue
        valid_fips = validate_fips(row.get('fips'))
        if not valid_fips:
            continue
        county_pk = County.objects.get(fips=valid_fips).pk
        rows_out.append([
            len(rows_out) + 1,  # sequential pk, starting at 1
            valid_fips,
            "{}".format(sampling_date),
            row.get('open'),
            row.get('current'),
            row.get('thirty'),
            row.get('sixty'),
            row.get('ninety'),
            row.get('other'),
            county_pk])
        collected = len(rows_out)
        # Progress feedback: a dot every 10,000 rows and
        # a running total every 100,000.
        if collected % 10000 == 0:  # pragma: no cover
            sys.stdout.write('.')
            sys.stdout.flush()
        if collected % 100000 == 0:  # pragma: no cover
            logger.info("\n{}".format(collected))
    # Only the literal True selects SQL; any other value means CSV.
    write_dump = dump_as_sql if sql is True else dump_as_csv
    write_dump(rows_out, dump_slug)
    elapsed = datetime.datetime.now() - began
    logger.info('\nceate_dump took {} to create a file with {} rows'.format(
        elapsed, len(rows_out)))
Ejemplo n.º 4
0
 def test_validate_fips_too_long(self):
     # A six-character value is expected to be rejected with None.
     self.assertEqual(validate_fips('123456'), None)
Ejemplo n.º 5
0
 def test_validate_fips_too_short(self):
     # A two-character value is expected to be rejected with None.
     self.assertEqual(validate_fips('12'), None)
Ejemplo n.º 6
0
 def test_validate_fips_keep_outdated(self):
     # '02201' is an outdated FIPS code that is normally excluded;
     # with keep_outdated=True it is expected to come back unchanged.
     result = validate_fips('02201', keep_outdated=True)
     self.assertEqual(result, '02201')
Ejemplo n.º 7
0
 def test_validate_fips_outdated_fips(self):
     # '02201' is an outdated FIPS code, which is excluded by default.
     result = validate_fips('02201')
     self.assertIs(result, None)
Ejemplo n.º 8
0
 def test_validate_fips_outdated_fips(self):
     # Without keep_outdated, the outdated code '02201' is expected
     # to be excluded, i.e. validate_fips gives back None.
     self.assertIs(validate_fips('02201'), None)
Ejemplo n.º 9
0
 def test_validate_fips_invalid_5_digit(self):
     # A value can have the right length and still be rejected:
     # '02201' is expected to yield None.
     result = validate_fips('02201')
     self.assertEqual(result, None)
Ejemplo n.º 10
0
 def test_validate_fips_edge_case(self):
     # Edge case: '46113' is expected to be mapped to '46102'.
     self.assertEqual(validate_fips('46113'), '46102')
Ejemplo n.º 11
0
 def test_validate_fips_too_long(self):
     # Six characters is too long, so the expected result is None.
     result = validate_fips('123456')
     self.assertEqual(result, None)
Ejemplo n.º 12
0
 def test_validate_fips_too_short(self):
     # Two characters is too short, so the expected result is None.
     result = validate_fips('12')
     self.assertEqual(result, None)
Ejemplo n.º 13
0
def process_source(
        starting_date, through_date, dump_slug=None):
    """
    Rebuild the base county_mortgage_data table from the source CSV on S3.

    The table is wiped, then every source row whose date falls between
    `starting_date` and `through_date` (inclusive) and whose FIPS passes
    `validate_fips` becomes a CountyMortgageData record. Records get
    sequential primary keys starting at 1 and are inserted with a single
    `bulk_create` call.

    If `dump_slug` is provided, the created records are also passed to
    `dump_as_csv` as rows, each ending with the pk of that record's county.

    NOTE(review): earlier documentation described this as one stage of a
    larger run that also regenerates aggregated data (MSAs, non-MSAs,
    states, national), updates metadata and exports public CSV files.
    None of that happens in this function -- presumably the caller handles
    it; confirm.

    The input CSV has the following field_names and row form:
    date,fips,open,current,thirty,sixty,ninety,other
    01/01/08,1001,268,260,4,1,0,3

    """
    starter = datetime.datetime.now()
    counter = 0
    pk = 1
    new_objects = []
    # truncate table
    CountyMortgageData.objects.all().delete()
    source_url = "{}/{}".format(S3_SOURCE_BUCKET, S3_SOURCE_FILE)
    raw_data = read_in_s3_csv(source_url)
    for row in raw_data:
        sampling_date = parser.parse(row.get('date')).date()
        if starting_date <= sampling_date <= through_date:
            valid_fips = validate_fips(row.get('fips'))
            if valid_fips:
                county = County.objects.get(fips=valid_fips)
                new_objects.append(
                    CountyMortgageData(
                        pk=pk,
                        fips=valid_fips,
                        date=sampling_date,
                        total=row.get('open'),
                        current=row.get('current'),
                        thirty=row.get('thirty'),
                        sixty=row.get('sixty'),
                        ninety=row.get('ninety'),
                        other=row.get('other'),
                        county=county
                    ))
                pk += 1
                counter += 1
                # Progress feedback: a dot every 10,000 rows and
                # a running total every 100,000.
                if counter % 10000 == 0:  # pragma: no cover
                    sys.stdout.write('.')
                    sys.stdout.flush()
                if counter % 100000 == 0:  # pragma: no cover
                    logger.info("\n{}".format(counter))
    CountyMortgageData.objects.bulk_create(new_objects)
    logger.info('\n{} took {} '
                'to create {} countymortgage records'.format(
                    SCRIPT_NAME,
                    (datetime.datetime.now() - starter),
                    len(new_objects)))
    if dump_slug:
        # Take the county pk from each object. The loop variable `county`
        # only holds the county of the last row processed, so using it
        # here would stamp every dumped row with that one county.
        rows = [
            [
                obj.pk,
                obj.fips,
                "{}".format(obj.date),
                obj.total,
                obj.current,
                obj.thirty,
                obj.sixty,
                obj.ninety,
                obj.other,
                obj.county.pk,
            ]
            for obj in new_objects
        ]
        dump_as_csv(rows, dump_slug)
Ejemplo n.º 14
0
def process_source(starting_date, through_date, dump_slug=None):
    """
    Rebuild the base county_mortgage_data table from the source CSV on S3.

    The table is wiped, then every source row whose date falls between
    `starting_date` and `through_date` (inclusive) and whose FIPS passes
    `validate_fips` becomes a CountyMortgageData record. Records get
    sequential primary keys starting at 1 and are inserted with a single
    `bulk_create` call.

    If `dump_slug` is provided, the created records are also passed to
    `dump_as_csv` as rows, each ending with the pk of that record's county.

    NOTE(review): earlier documentation described this as one stage of a
    larger run that also regenerates aggregated data (MSAs, non-MSAs,
    states, national), updates metadata and exports public CSV files.
    None of that happens in this function -- presumably the caller handles
    it; confirm.

    The input CSV has the following field_names and row form:
    date,fips,open,current,thirty,sixty,ninety,other
    01/01/08,1001,268,260,4,1,0,3

    """
    began = datetime.datetime.now()
    records = []
    # truncate table
    CountyMortgageData.objects.all().delete()
    source_url = "{}/{}".format(S3_SOURCE_BUCKET, S3_SOURCE_FILE)
    for row in read_in_s3_csv(source_url):
        sampling_date = parser.parse(row.get('date')).date()
        # Skip rows that fall outside the requested date window.
        if sampling_date < starting_date or sampling_date > through_date:
            continue
        valid_fips = validate_fips(row.get('fips'))
        if not valid_fips:
            continue
        county = County.objects.get(fips=valid_fips)
        record = CountyMortgageData(
            pk=len(records) + 1,  # sequential pk, starting at 1
            fips=valid_fips,
            date=sampling_date,
            total=row.get('open'),
            current=row.get('current'),
            thirty=row.get('thirty'),
            sixty=row.get('sixty'),
            ninety=row.get('ninety'),
            other=row.get('other'),
            county=county)
        records.append(record)
        built = len(records)
        # Progress feedback: a dot every 10,000 rows and
        # a running total every 100,000.
        if built % 10000 == 0:  # pragma: no cover
            sys.stdout.write('.')
            sys.stdout.flush()
        if built % 100000 == 0:  # pragma: no cover
            logger.info("\n{}".format(built))
    CountyMortgageData.objects.bulk_create(records)
    message = ('\n{} took {} '
               'to create {} countymortgage records')
    logger.info(message.format(
        SCRIPT_NAME, (datetime.datetime.now() - began), len(records)))
    if not dump_slug:
        return
    # Rows are produced lazily, one tuple per created record.
    dump_rows = (
        (
            obj.pk,
            obj.fips,
            "{}".format(obj.date),
            obj.total,
            obj.current,
            obj.thirty,
            obj.sixty,
            obj.ninety,
            obj.other,
            obj.county.pk,
        )
        for obj in records
    )
    dump_as_csv(dump_rows, dump_slug)
Ejemplo n.º 15
0
 def test_validate_fips_edge_case(self):
     # Edge case: the input '46113' is expected to come back as '46102'.
     result = validate_fips('46113')
     self.assertEqual(result, '46102')
Ejemplo n.º 16
0
 def test_validate_fips_4_digit(self):
     # A 4-digit value is expected to come back with a leading zero.
     self.assertEqual(validate_fips('1015'), '01015')
Ejemplo n.º 17
0
 def test_validate_fips_4_digit(self):
     # '1015' has only four digits; the expected result is the
     # five-character '01015'.
     result = validate_fips('1015')
     self.assertEqual(result, '01015')
Ejemplo n.º 18
0
 def test_validate_fips_valid_5_digit(self):
     # A valid 5-digit value is expected to pass through unchanged.
     self.assertEqual(validate_fips('34041'), '34041')
Ejemplo n.º 19
0
 def test_validate_fips_invalid_5_digit(self):
     # Five digits alone do not make a value valid: '02201' is
     # expected to be rejected with None.
     self.assertEqual(validate_fips('02201'), None)
Ejemplo n.º 20
0
 def test_validate_fips_keep_outdated(self):
     # a normally excluded outdated FIPS code; keep_outdated=True is
     # expected to return it as-is
     outdated_fips = '02201'
     kept = validate_fips(outdated_fips, keep_outdated=True)
     self.assertEqual(kept, '02201')
Ejemplo n.º 21
0
 def test_validate_fips_valid_5_digit(self):
     # '34041' is expected to be accepted and returned unchanged.
     result = validate_fips('34041')
     self.assertEqual(result, '34041')