def get_or_create_interest(row):
    """
    Fetch the `Interest` named by this row, creating it when it is missing.

    An `Address` built from the row's `EC_*` columns is fetched or created
    first. The `Interest` lookup itself is keyed on `CONCERNAME`; the address
    and slug are only supplied as defaults for a newly created `Interest`.

    NOTE(review): only the name is passed as the lookup key, so two concerns
    that share a name but sit in different states are not told apart by this
    function -- confirm that is the intended uniqueness rule.

    Returns an `(interest, address, created)` tuple.
    """
    interest_zip = clean_zipcode(row['EC_ZIP4'])
    street = clean_street(row['EC_ADR1'], row['EC_ADR2'],
                          zipcode=interest_zip)
    address, __ = Address.objects.get_or_create(
        address1=street,
        city=row['EC_CITY'],
        state=row['EC_STCD'],
        zipcode=interest_zip,
    )

    # TODO get other info from the csv
    new_interest_fields = {
        'address': address,
        'slug': slugify(unicode(row['CONCERNAME'])),
    }
    interest, created = Interest.objects.get_or_create(
        name=row['CONCERNAME'],
        defaults=new_interest_fields,
    )
    return interest, address, created
def process_row(row, prev_pass=None):
    """
    Process a row of the CSV.

    If you pass in the previous output, some optimization will take place to
    process things faster. There is a lot of duplication in the raw data that
    can get skipped.

    You'd think you could just see if the report id changes between rows to
    see if the lobbyist changes, but it turns out that isn't always true. So
    do a manual check of every feature to squeeze out reusing the previous
    pass as much as possible.

    Compensation objects get created outside in a bulk_create for
    performance; the one built here is never saved by this function.

    Arguments:
        row: mapping of CSV column name to the raw value for one line.
        prev_pass: the `ProcessedRow` returned for the previous row, or
            `None` to look everything up from scratch.

    Returns a `ProcessedRow` built from the registrant address, the
    lobbyist, the registration report and the compensation. The
    compensation is `None` when the row has an empty `CONCERNAME`.
    """
    report_date = convert_date_format_YMD(row['RPT_DATE'])
    year = row['YEAR_APPL']

    # Registrant address: reuse the previous row's `Address` when every
    # cleaned field matches, otherwise hit the database.
    zipcode = clean_zipcode(row['ZIPCODE'])
    address_fields = dict(
        address1=clean_street(row['ADDRESS1'], row['ADDRESS2'],
                              zipcode=zipcode),
        city=row['CITY'],
        state=row['STATE'],
        zipcode=zipcode,
    )
    # HAHAHAHAHAHA
    if (prev_pass
            and prev_pass.address.address1 == address_fields['address1']
            and prev_pass.address.city == address_fields['city']
            and prev_pass.address.state == address_fields['state']
            and prev_pass.address.zipcode == address_fields['zipcode']):
        reg_address = prev_pass.address
    else:
        reg_address, __ = Address.objects.get_or_create(**address_fields)

    # Very basic `Lobbyist` info here, most of it actually comes from the
    # coversheets.
    if prev_pass and prev_pass.lobbyist.filer_id == int(row['FILER_ID']):
        lobbyist = prev_pass.lobbyist
    else:
        default_data = dict(
            name=row['LOBBYNAME'],
            sort_name=row['SORTNAME'],  # not LOB_SORT like in coversheets
            updated_at=report_date,
            address=reg_address,
            slug=slugify(unicode(row['LOBBYNAME'])),
        )
        lobbyist, created = Lobbyist.objects.update_or_create(
            filer_id=row['FILER_ID'],
            defaults=default_data)
        if created:
            # Pass the object as a logger argument so it is only rendered
            # when debug logging is actually enabled.
            logger.debug("LOBBYIST: %s", lobbyist)

    if row['CONCERNAME']:
        # interest/concern/client
        interest, interest_address, __ = get_or_create_interest(row)
    else:
        interest_address = None
        interest = None

    # registration report
    # NOTE(review): reuse is decided on the report id alone, while the
    # database lookup below also keys on the lobbyist -- confirm a report id
    # can never be shared by two lobbyists on consecutive rows.
    if prev_pass and prev_pass.report.report_id == int(row['REPNO']):
        report = prev_pass.report
    else:
        default_data = dict(
            raw=json.dumps(row),
            report_date=report_date,
            year=year,
            address=reg_address,
        )
        report, created = RegistrationReport.objects.update_or_create(
            lobbyist=lobbyist,
            report_id=row['REPNO'],
            defaults=default_data)
        if created:
            logger.debug("REPORT: %s", report)

    if not interest:
        # No client on this row, so there is nothing to compensate.
        return ProcessedRow(reg_address, lobbyist, report, None)

    # lobbyist M2M to `Interest` through `Compensation`
    annum, __ = LobbyistAnnum.objects.update_or_create(
        lobbyist=lobbyist,
        year=year)

    # compensation
    compensation_fields = dict(
        # Amounts may be blank or decimal strings; blank counts as zero.
        amount_high=int(round(float(row['NHIGH'] or "0"))),  # I hate myself
        amount_low=int(round(float(row['NLOW'] or "0"))),
        compensation_type=row['TYPECOPM'],
        address=interest_address,
        raw=json.dumps(row),
        updated_at=report_date,
        report=report,
        client_num=row['CLIENT_NUM'],
    )
    # Dates are only set when present so blank strings never reach the model.
    if row['STARTDT']:
        compensation_fields['start_date'] = row['STARTDT']
    if row['TERMDATE']:
        compensation_fields['end_date'] = row['TERMDATE']
    # WISHLIST move this amount_guess logic into the model
    compensation_fields['amount_guess'] = (
        compensation_fields['amount_high'] +
        compensation_fields['amount_low']) / 2
    compensation = Compensation(
        annum=annum,
        interest=interest,
        **compensation_fields)
    return ProcessedRow(reg_address, lobbyist, report, compensation)
def process_row(row, prev_pass=None):
    """
    Process a row of the CSV.

    If you pass in the previous output, some optimization will take place to
    process things faster. There is a lot of duplication in the raw data that
    can get skipped.

    You'd think you could just see if the report id changes between rows to
    see if the lobbyist changes, but it turns out that isn't always true. So
    do a manual check of every feature to squeeze out reusing the previous
    pass as much as possible.

    Compensation objects get created outside in a bulk_create for
    performance; the one built here is never saved by this function.

    Arguments:
        row: mapping of CSV column name to the raw value for one line.
        prev_pass: the `ProcessedRow` returned for the previous row, or
            `None` to look everything up from scratch.

    Returns a `ProcessedRow` built from the registrant address, the
    lobbyist, the registration report and the compensation. The
    compensation is `None` when the row has an empty `CONCERNAME`.
    """
    report_date = convert_date_format_YMD(row['RPT_DATE'])
    year = row['YEAR_APPL']

    # Registrant address: reuse the previous row's `Address` when every
    # cleaned field matches, otherwise hit the database.
    zipcode = clean_zipcode(row['ZIPCODE'])
    address_fields = dict(
        address1=clean_street(row['ADDRESS1'], row['ADDRESS2'],
                              zipcode=zipcode),
        city=row['CITY'],
        state=row['STATE'],
        zipcode=zipcode,
    )
    # HAHAHAHAHAHA
    if (prev_pass
            and prev_pass.address.address1 == address_fields['address1']
            and prev_pass.address.city == address_fields['city']
            and prev_pass.address.state == address_fields['state']
            and prev_pass.address.zipcode == address_fields['zipcode']):
        reg_address = prev_pass.address
    else:
        reg_address, __ = Address.objects.get_or_create(**address_fields)

    # Very basic `Lobbyist` info here, most of it actually comes from the
    # coversheets.
    if prev_pass and prev_pass.lobbyist.filer_id == int(row['FILER_ID']):
        lobbyist = prev_pass.lobbyist
    else:
        default_data = dict(
            name=row['LOBBYNAME'],
            sort_name=row['SORTNAME'],  # not LOB_SORT like in coversheets
            updated_at=report_date,
            address=reg_address,
            slug=slugify(unicode(row['LOBBYNAME'])),
        )
        lobbyist, created = Lobbyist.objects.update_or_create(
            filer_id=row['FILER_ID'],
            defaults=default_data)
        if created:
            # Pass the object as a logger argument so it is only rendered
            # when debug logging is actually enabled.
            logger.debug("LOBBYIST: %s", lobbyist)

    if row['CONCERNAME']:
        # interest/concern/client
        interest, interest_address, __ = get_or_create_interest(row)
    else:
        interest_address = None
        interest = None

    # registration report
    # NOTE(review): reuse is decided on the report id alone, while the
    # database lookup below also keys on the lobbyist -- confirm a report id
    # can never be shared by two lobbyists on consecutive rows.
    if prev_pass and prev_pass.report.report_id == int(row['REPNO']):
        report = prev_pass.report
    else:
        default_data = dict(
            raw=json.dumps(row),
            report_date=report_date,
            year=year,
            address=reg_address,
        )
        report, created = RegistrationReport.objects.update_or_create(
            lobbyist=lobbyist,
            report_id=row['REPNO'],
            defaults=default_data)
        if created:
            logger.debug("REPORT: %s", report)

    if not interest:
        # No client on this row, so there is nothing to compensate.
        return ProcessedRow(reg_address, lobbyist, report, None)

    # lobbyist M2M to `Interest` through `Compensation`
    annum, __ = LobbyistAnnum.objects.update_or_create(
        lobbyist=lobbyist,
        year=year)

    # compensation
    compensation_fields = dict(
        # Amounts may be blank or decimal strings; blank counts as zero.
        amount_high=int(round(float(row['NHIGH'] or "0"))),  # I hate myself
        amount_low=int(round(float(row['NLOW'] or "0"))),
        compensation_type=row['TYPECOPM'],
        address=interest_address,
        raw=json.dumps(row),
        updated_at=report_date,
        report=report,
        client_num=row['CLIENT_NUM'],
    )
    # Dates are only set when present so blank strings never reach the model.
    if row['STARTDT']:
        compensation_fields['start_date'] = row['STARTDT']
    if row['TERMDATE']:
        compensation_fields['end_date'] = row['TERMDATE']
    # WISHLIST move this amount_guess logic into the model
    compensation_fields['amount_guess'] = (
        compensation_fields['amount_high'] +
        compensation_fields['amount_low']) / 2
    compensation = Compensation(
        annum=annum,
        interest=interest,
        **compensation_fields)
    return ProcessedRow(reg_address, lobbyist, report, compensation)