Python clean_street Examples, tx_lobbying.libs.normalizers.clean_street Python Examples

Example #1

0

Show file

File: registration.py Project: texastribune/tx_lobbying

def get_or_create_interest(row):
    """
    Fetch the `Interest` named in a CSV row, creating it when it is missing.

    The row's `EC_*` columns are normalized into an `Address`, which is itself
    fetched or created first. The `Interest` is then looked up by its
    `CONCERNAME`; the address and a slug of the name are passed as `defaults`.

    NOTE(review): earlier notes said name and state are used for uniqueness,
    but only the name is part of the lookup here -- confirm which is intended.

    Returns a tuple of `(interest, address, created)`.
    """
    zip_code = clean_zipcode(row['EC_ZIP4'])
    street = clean_street(row['EC_ADR1'], row['EC_ADR2'], zipcode=zip_code)
    address, __ = Address.objects.get_or_create(
        address1=street,
        city=row['EC_CITY'],
        state=row['EC_STCD'],
        zipcode=zip_code,
    )
    # TODO get other info from the csv
    interest_defaults = {
        'address': address,
        'slug': slugify(unicode(row['CONCERNAME'])),
    }
    interest, created = Interest.objects.get_or_create(
        name=row['CONCERNAME'],
        defaults=interest_defaults,
    )
    return interest, address, created

Example #2

0

Show file

def get_or_create_interest(row):
    """
    Fetch the `Interest` named in a CSV row, creating it when it is missing.

    The row's `EC_*` columns are normalized into an `Address`, which is itself
    fetched or created first. The `Interest` is then looked up by its
    `CONCERNAME`; the address and a slug of the name are passed as `defaults`.

    NOTE(review): earlier notes said name and state are used for uniqueness,
    but only the name is part of the lookup here -- confirm which is intended.

    Returns a tuple of `(interest, address, created)`.
    """
    zip_code = clean_zipcode(row['EC_ZIP4'])
    street = clean_street(row['EC_ADR1'], row['EC_ADR2'], zipcode=zip_code)
    address, __ = Address.objects.get_or_create(
        address1=street,
        city=row['EC_CITY'],
        state=row['EC_STCD'],
        zipcode=zip_code,
    )
    # TODO get other info from the csv
    interest_defaults = {
        'address': address,
        'slug': slugify(unicode(row['CONCERNAME'])),
    }
    interest, created = Interest.objects.get_or_create(
        name=row['CONCERNAME'],
        defaults=interest_defaults,
    )
    return interest, address, created

Example #3

0

Show file

File: registration.py Project: texastribune/tx_lobbying

def process_row(row, prev_pass=None):
    """
    Process a row of the CSV.

    `row` is a dict-like CSV row keyed by column name. `prev_pass` is the
    `ProcessedRow` returned for the previous row, if any; when it is given,
    its address, lobbyist and report are reused whenever this row refers to
    the same ones. There is a lot of duplication in the raw data that can get
    skipped this way.

    You'd think you could just see if the report id changes between rows to see
    if the lobbyist changes, but it turns out that isn't always true. So every
    feature is checked individually to squeeze out reusing the previous pass
    as much as possible.

    The `Compensation` is only instantiated here, not saved: Compensation
    objects get created outside in a bulk_create for performance. It is `None`
    when the row has no `CONCERNAME`.

    Returns `ProcessedRow(reg_address, lobbyist, report, compensation)`.
    """
    report_date = convert_date_format_YMD(row['RPT_DATE'])
    year = row['YEAR_APPL']

    zipcode = clean_zipcode(row['ZIPCODE'])
    address_fields = dict(
        address1=clean_street(row['ADDRESS1'], row['ADDRESS2'],
                              zipcode=zipcode),
        city=row['CITY'],
        state=row['STATE'],
        zipcode=zipcode,
    )
    # Reuse the previous row's address only when every single field matches;
    # otherwise fall back to a database lookup.
    if (prev_pass
            and prev_pass.address.address1 == address_fields['address1']
            and prev_pass.address.city == address_fields['city']
            and prev_pass.address.state == address_fields['state']
            and prev_pass.address.zipcode == address_fields['zipcode']):
        reg_address = prev_pass.address
    else:
        reg_address, __ = Address.objects.get_or_create(**address_fields)

    # Very basic `Lobbyist` info here, most of it actually comes from the
    # coversheets.
    if prev_pass and prev_pass.lobbyist.filer_id == int(row['FILER_ID']):
        lobbyist = prev_pass.lobbyist
    else:
        lobbyist_defaults = dict(
            name=row['LOBBYNAME'],
            sort_name=row['SORTNAME'],  # not LOB_SORT like in coversheets
            updated_at=report_date,
            address=reg_address,
            slug=slugify(unicode(row['LOBBYNAME'])),
        )
        lobbyist, created = Lobbyist.objects.update_or_create(
            filer_id=row['FILER_ID'],
            defaults=lobbyist_defaults)
        if created:
            # Pass the object as a logging argument so the message is only
            # formatted when debug output is actually enabled.
            logger.debug("LOBBYIST: %s", lobbyist)

    if row['CONCERNAME']:
        # interest/concern/client
        interest, interest_address, __ = get_or_create_interest(row)
    else:
        interest_address = None
        interest = None

    # registration report
    if prev_pass and prev_pass.report.report_id == int(row['REPNO']):
        report = prev_pass.report
    else:
        report_defaults = dict(
            raw=json.dumps(row),
            report_date=report_date,
            year=year,
            address=reg_address,
        )
        report, created = RegistrationReport.objects.update_or_create(
            lobbyist=lobbyist,
            report_id=row['REPNO'],
            defaults=report_defaults)
        if created:
            logger.debug("REPORT: %s", report)

    if interest:
        # lobbyist M2M to `Interest` through `Compensation`
        annum, __ = LobbyistAnnum.objects.update_or_create(
            lobbyist=lobbyist,
            year=year)
        # compensation
        compensation_fields = dict(
            # Blank amounts count as 0; values are parsed as floats and then
            # rounded to whole numbers.
            amount_high=int(round(float(row['NHIGH'] or "0"))),
            amount_low=int(round(float(row['NLOW'] or "0"))),
            compensation_type=row['TYPECOPM'],
            address=interest_address,
            raw=json.dumps(row),
            updated_at=report_date,
            report=report,
            client_num=row['CLIENT_NUM'],
        )
        # Dates are only set when the row actually provides them.
        if row['STARTDT']:
            compensation_fields['start_date'] = row['STARTDT']
        if row['TERMDATE']:
            compensation_fields['end_date'] = row['TERMDATE']
        # WISHLIST move this amount_guess logic into the model
        # NOTE(review): both operands are ints, so `/` is integer division
        # under Python 2 but would yield a float under Python 3 -- confirm
        # before porting.
        compensation_fields['amount_guess'] = (
            compensation_fields['amount_high'] +
            compensation_fields['amount_low']) / 2
        compensation = Compensation(
            annum=annum,
            interest=interest,
            **compensation_fields)
    else:
        compensation = None
    return ProcessedRow(reg_address, lobbyist, report, compensation)

Example #4

0

Show file

def process_row(row, prev_pass=None):
    """
    Process a row of the CSV.

    `row` is a dict-like CSV row keyed by column name. `prev_pass` is the
    `ProcessedRow` returned for the previous row, if any; when it is given,
    its address, lobbyist and report are reused whenever this row refers to
    the same ones. There is a lot of duplication in the raw data that can get
    skipped this way.

    You'd think you could just see if the report id changes between rows to see
    if the lobbyist changes, but it turns out that isn't always true. So every
    feature is checked individually to squeeze out reusing the previous pass
    as much as possible.

    The `Compensation` is only instantiated here, not saved: Compensation
    objects get created outside in a bulk_create for performance. It is `None`
    when the row has no `CONCERNAME`.

    Returns `ProcessedRow(reg_address, lobbyist, report, compensation)`.
    """
    report_date = convert_date_format_YMD(row['RPT_DATE'])
    year = row['YEAR_APPL']

    zipcode = clean_zipcode(row['ZIPCODE'])
    address_fields = dict(
        address1=clean_street(row['ADDRESS1'],
                              row['ADDRESS2'],
                              zipcode=zipcode),
        city=row['CITY'],
        state=row['STATE'],
        zipcode=zipcode,
    )
    # Reuse the previous row's address only when every single field matches;
    # otherwise fall back to a database lookup.
    if (prev_pass
            and prev_pass.address.address1 == address_fields['address1']
            and prev_pass.address.city == address_fields['city']
            and prev_pass.address.state == address_fields['state']
            and prev_pass.address.zipcode == address_fields['zipcode']):
        reg_address = prev_pass.address
    else:
        reg_address, __ = Address.objects.get_or_create(**address_fields)

    # Very basic `Lobbyist` info here, most of it actually comes from the
    # coversheets.
    if prev_pass and prev_pass.lobbyist.filer_id == int(row['FILER_ID']):
        lobbyist = prev_pass.lobbyist
    else:
        lobbyist_defaults = dict(
            name=row['LOBBYNAME'],
            sort_name=row['SORTNAME'],  # not LOB_SORT like in coversheets
            updated_at=report_date,
            address=reg_address,
            slug=slugify(unicode(row['LOBBYNAME'])),
        )
        lobbyist, created = Lobbyist.objects.update_or_create(
            filer_id=row['FILER_ID'], defaults=lobbyist_defaults)
        if created:
            # Pass the object as a logging argument so the message is only
            # formatted when debug output is actually enabled.
            logger.debug("LOBBYIST: %s", lobbyist)

    if row['CONCERNAME']:
        # interest/concern/client
        interest, interest_address, __ = get_or_create_interest(row)
    else:
        interest_address = None
        interest = None

    # registration report
    if prev_pass and prev_pass.report.report_id == int(row['REPNO']):
        report = prev_pass.report
    else:
        report_defaults = dict(
            raw=json.dumps(row),
            report_date=report_date,
            year=year,
            address=reg_address,
        )
        report, created = RegistrationReport.objects.update_or_create(
            lobbyist=lobbyist, report_id=row['REPNO'],
            defaults=report_defaults)
        if created:
            logger.debug("REPORT: %s", report)

    if interest:
        # lobbyist M2M to `Interest` through `Compensation`
        annum, __ = LobbyistAnnum.objects.update_or_create(lobbyist=lobbyist,
                                                           year=year)
        # compensation
        compensation_fields = dict(
            # Blank amounts count as 0; values are parsed as floats and then
            # rounded to whole numbers.
            amount_high=int(round(float(row['NHIGH'] or "0"))),
            amount_low=int(round(float(row['NLOW'] or "0"))),
            compensation_type=row['TYPECOPM'],
            address=interest_address,
            raw=json.dumps(row),
            updated_at=report_date,
            report=report,
            client_num=row['CLIENT_NUM'],
        )
        # Dates are only set when the row actually provides them.
        if row['STARTDT']:
            compensation_fields['start_date'] = row['STARTDT']
        if row['TERMDATE']:
            compensation_fields['end_date'] = row['TERMDATE']
        # WISHLIST move this amount_guess logic into the model
        # NOTE(review): both operands are ints, so `/` is integer division
        # under Python 2 but would yield a float under Python 3 -- confirm
        # before porting.
        compensation_fields['amount_guess'] = (
            compensation_fields['amount_high'] +
            compensation_fields['amount_low']) / 2
        compensation = Compensation(annum=annum, interest=interest,
                                    **compensation_fields)
    else:
        compensation = None
    return ProcessedRow(reg_address, lobbyist, report, compensation)