def match_exists(prefix_list, offset):
    """Look for an ocdid match for the district found at position `offset`.

    The first `offset` entries of `prefix_list` are passed to `is_exact`.
    When that yields a truthy prefix, the entry at index `offset` is split
    on ':' into a district type and a district name, and those are handed
    to `ocdidlib.match_name` together with the prefix.

    Keyword Arguments:
    prefix_list -- list of 'type:name' district values for the ocdid
    offset -- district level to check against (ex. 1 - one additional
              level, 2 - two levels, etc.)

    Returns:
    ocdid -- matching full ocdid if match found, otherwise None
    ratio -- ratio of that exact match (1-100), -1 if not found

    """
    parent_prefix = is_exact(prefix_list[:offset])
    if not parent_prefix:
        # Nothing to anchor the lookup on: report "no match".
        return None, -1

    # Each entry is expected to hold exactly one ':' separator.
    dist_type, dist_name = prefix_list[offset].split(':')
    return ocdidlib.match_name(parent_prefix, dist_type, dist_name)
def match_exists(prefix_list, offset):
    """Look for an ocdid match for the district found at position `offset`.

    The first `offset` entries of `prefix_list` are passed to `is_exact`.
    When that yields a truthy prefix, the entry at index `offset` is split
    on ':' into a district type and a district name, and those are handed
    to `ocdid.match_name` together with the prefix.

    Keyword Arguments:
    prefix_list -- list of 'type:name' district values for the ocdid
    offset -- district level to check against (ex. 1 - one additional
              level, 2 - two levels, etc.)

    Returns:
    ocdid -- matching full ocdid if match found, otherwise None
    ratio -- ratio of that exact match (1-100), -1 if not found

    """
    parent_prefix = is_exact(prefix_list[:offset])
    if not parent_prefix:
        # Nothing to anchor the lookup on: report "no match".
        return None, -1

    # Each entry is expected to hold exactly one ':' separator.
    dist_type, dist_name = prefix_list[offset].split(':')
    return ocdid.match_name(parent_prefix, dist_type, dist_name)
def _build_prefix_list(state, county, muni):
    """Return the ordered 'type:name' district parts: state, county, muni.

    Falsy values are skipped, so the result may be shorter than three
    entries (or empty).
    """
    prefix_list = []
    if state:
        prefix_list.append('state:{}'.format(state))
    if county:
        if state in Assign.ALT_COUNTIES:
            # Some states use a different district type in place of 'county'.
            prefix_list.append(
                '{}:{}'.format(Assign.ALT_COUNTIES[state], county))
        else:
            # Any NH county starting with 'co' is forced to 'coos'
            # (original note: ASCII issue with Coos county in the data).
            if state == 'nh' and county.startswith('co'):
                county = 'coos'
            prefix_list.append('county:{}'.format(county))
    if muni:
        # exception for dc
        if muni == 'dc':
            prefix_list.append('district:{}'.format(muni))
        else:
            prefix_list.append('place:{}'.format(muni))
    return prefix_list


def _match_sub_districts(unmatched, matched):
    """Resolve grouped sub-district rows and append them to `matched`.

    `unmatched` maps a '<prefix>:<type>' key to a dict holding 'prefix',
    'dist_type' and 'districts' (district name -> list of rows). Rows are
    updated in place.
    """
    for key, group in unmatched.iteritems():
        print('KEY: {}\nVALUE: {}'.format(key, group))
        full_prefix = group['prefix']
        d_type = group['dist_type']
        districts = group['districts']

        print('------------------------------------------------------------------------')
        print('FULL: {}\nD_TYPE: {}\nLEN: {}\n'.format(
            full_prefix, d_type, len(districts)))

        # The number of distinct district names is part of the type lookup.
        type_val = ocdid.match_type(full_prefix, d_type, len(districts))
        if not type_val:
            # No usable type: flag the rows in the report and leave any
            # existing 'ocdid' value on the row untouched.
            for d_name, rows in districts.iteritems():
                for row in rows:
                    row['ocdid_report'] = Assign.REPORT_TEMPLATE.format(
                        row['Electoral District'], 'xxx', -1)
                    matched.append(row)
        else:
            for d_name, rows in districts.iteritems():
                id_val, ratio = ocdid.match_name(full_prefix, type_val, d_name)
                if id_val is None:
                    id_val = ''
                for row in rows:
                    row['ocdid_report'] = Assign.REPORT_TEMPLATE.format(
                        row['Electoral District'], id_val, ratio)
                    row['ocdid'] = id_val
                    matched.append(row)


def assign_ids(f):
    """Assign ocdids to every row of a CSV file and write a staging copy.

    Reads `Dirs.TEST_DIR + f`, uses the 'State' and 'Electoral District'
    columns of each row to find an ocdid, stores the result in the 'ocdid'
    and 'ocdid_report' columns and writes all rows to
    `Dirs.STAGING_DIR + f`. Rows whose electoral district is a sub-district
    are grouped first and matched once the whole file has been read, so
    they are written after the directly matched rows.

    Written for Python 2 (byte-oriented csv files, dict.iteritems).

    Keyword Arguments:
    f -- name of the file to process

    """
    # The Python 2 csv module wants both files opened in binary mode.
    with open(Dirs.TEST_DIR + f, 'rb') as r, \
            open(Dirs.STAGING_DIR + f, 'wb') as w:
        reader = DictReader(r)
        print('FIELDS: {}'.format(reader.fieldnames))

        # Work on a copy of the header: appending to reader.fieldnames
        # itself would make the reader pad every row with None for the
        # added columns.
        fields = list(reader.fieldnames)
        # ocdid_report is not included sometimes, and additional fields are
        # occasionally added. Both output columns must be in the header or
        # DictWriter rejects the rows that carry them.
        for required in ('ocdid', 'ocdid_report'):
            if required not in fields:
                fields.append(required)

        writer = DictWriter(w, fieldnames=fields)
        writer.writeheader()

        prefix_cache = {}
        unmatched = {}
        matched = []

        for row in reader:
            # Clean district names for ocdid matching.
            state = row['State'].lower()
            # County ('Body Represents - County') and muni
            # ('Body Represents - Muni') were removed for the first
            # deliverable, so only the state feeds the prefix for now.
            county = None
            muni = None
            ed = row['Electoral District'].lower()

            prefix_list = _build_prefix_list(state, county, muni)
            print('PREFIX LIST: {}'.format(prefix_list))

            # ocdid_key is a tuple of the prefix list, so matching for a
            # specific group of district values only happens once.
            ocdid_key = tuple(prefix_list)
            print('OCDID_KEY: {}'.format(ocdid_key))
            if ocdid_key in prefix_cache:
                full_prefix, ratio = prefix_cache[ocdid_key]
            else:
                full_prefix, ratio = get_full_prefix(prefix_list)
                prefix_cache[ocdid_key] = (full_prefix, ratio)

            # If sub-body district (sub-county, sub-muni, etc.), add to
            # unmatched list to perform matching based on district name
            # identifiers and district count.
            if is_sub_district(ed):
                d_type, d_name = get_sub_district(ed)
                unmatched_key = u'{}:{}'.format(full_prefix, d_type)
                group = unmatched.setdefault(unmatched_key, {
                    'prefix': full_prefix,
                    'districts': {},
                    'dist_type': d_type,
                })
                group['districts'].setdefault(d_name, []).append(row)
            else:
                if full_prefix is None:
                    full_prefix = ''
                row['ocdid_report'] = Assign.REPORT_TEMPLATE.format(
                    row['Electoral District'], full_prefix, ratio)
                row['ocdid'] = full_prefix
                matched.append(row)

        # Match unmatched items by type and count, finding closest matches.
        _match_sub_districts(unmatched, matched)

        for row in matched:
            writer.writerow(
                dict((k, v.encode('utf-8')) for k, v in row.iteritems()))
def testMatchName(self):
    """Check ocdid.match_name against every case in test_match_name.

    Each case is a (prefix, dist_type, dist_name, result) tuple; the call
    must return exactly `result`.
    """
    for case in test_match_name:
        prefix, dist_type, dist_name, expected = case
        actual = ocdid.match_name(prefix, dist_type, dist_name)
        self.assertEqual(expected, actual)
def _build_prefix_list(state, county, muni):
    """Return the ordered 'type:name' district parts: state, county, muni.

    Falsy values are skipped, so the result may be shorter than three
    entries (or empty).
    """
    prefix_list = []
    if state:
        prefix_list.append('state:{}'.format(state))
    if county:
        if state in Assign.ALT_COUNTIES:
            # Some states use a different district type in place of 'county'.
            prefix_list.append(
                '{}:{}'.format(Assign.ALT_COUNTIES[state], county))
        else:
            # Any NH county starting with 'co' is forced to 'coos'
            # (original note: ASCII issue with Coos county in the data).
            if state == 'nh' and county.startswith('co'):
                county = 'coos'
            prefix_list.append('county:{}'.format(county))
    if muni:
        # exception for dc
        if muni == 'dc':
            prefix_list.append('district:{}'.format(muni))
        else:
            prefix_list.append('place:{}'.format(muni))
    return prefix_list


def _match_sub_districts(unmatched, matched):
    """Resolve grouped sub-district rows and append them to `matched`.

    `unmatched` maps a '<prefix>:<type>' key to a dict holding 'prefix',
    'dist_type' and 'districts' (district name -> list of rows). Rows are
    updated in place.
    """
    for key, group in unmatched.iteritems():
        print('KEY: {}\nVALUE: {}'.format(key, group))
        full_prefix = group['prefix']
        d_type = group['dist_type']
        districts = group['districts']

        print('------------------------------------------------------------------------')
        print('FULL: {}\nD_TYPE: {}\nLEN: {}\n'.format(
            full_prefix, d_type, len(districts)))

        # The number of distinct district names is part of the type lookup.
        type_val = ocdid.match_type(full_prefix, d_type, len(districts))
        if not type_val:
            # No usable type: flag the rows in the report and leave any
            # existing 'ocdid' value on the row untouched.
            for d_name, rows in districts.iteritems():
                for row in rows:
                    row['ocdid_report'] = Assign.REPORT_TEMPLATE.format(
                        row['Electoral District'], 'xxx', -1)
                    matched.append(row)
        else:
            for d_name, rows in districts.iteritems():
                id_val, ratio = ocdid.match_name(full_prefix, type_val, d_name)
                if id_val is None:
                    id_val = ''
                for row in rows:
                    row['ocdid_report'] = Assign.REPORT_TEMPLATE.format(
                        row['Electoral District'], id_val, ratio)
                    row['ocdid'] = id_val
                    matched.append(row)


def assign_ids(f):
    """Assign ocdids to every row of a CSV file and write a staging copy.

    Reads `Dirs.TEST_DIR + f`, uses the 'State' and 'Electoral District'
    columns of each row to find an ocdid, stores the result in the 'ocdid'
    and 'ocdid_report' columns and writes all rows to
    `Dirs.STAGING_DIR + f`. Rows whose electoral district is a sub-district
    are grouped first and matched once the whole file has been read, so
    they are written after the directly matched rows.

    Written for Python 2 (byte-oriented csv files, dict.iteritems).

    Keyword Arguments:
    f -- name of the file to process

    """
    # The Python 2 csv module wants both files opened in binary mode.
    with open(Dirs.TEST_DIR + f, 'rb') as r, \
            open(Dirs.STAGING_DIR + f, 'wb') as w:
        reader = DictReader(r)
        print('FIELDS: {}'.format(reader.fieldnames))

        # Work on a copy of the header: appending to reader.fieldnames
        # itself would make the reader pad every row with None for the
        # added columns.
        fields = list(reader.fieldnames)
        # ocdid_report is not included sometimes, and additional fields are
        # occasionally added. Both output columns must be in the header or
        # DictWriter rejects the rows that carry them.
        for required in ('ocdid', 'ocdid_report'):
            if required not in fields:
                fields.append(required)

        writer = DictWriter(w, fieldnames=fields)
        writer.writeheader()

        prefix_cache = {}
        unmatched = {}
        matched = []

        for row in reader:
            # Clean district names for ocdid matching.
            state = row['State'].lower()
            # County ('Body Represents - County') and muni
            # ('Body Represents - Muni') were removed for the first
            # deliverable, so only the state feeds the prefix for now.
            county = None
            muni = None
            ed = row['Electoral District'].lower()

            prefix_list = _build_prefix_list(state, county, muni)
            print('PREFIX LIST: {}'.format(prefix_list))

            # ocdid_key is a tuple of the prefix list, so matching for a
            # specific group of district values only happens once.
            ocdid_key = tuple(prefix_list)
            print('OCDID_KEY: {}'.format(ocdid_key))
            if ocdid_key in prefix_cache:
                full_prefix, ratio = prefix_cache[ocdid_key]
            else:
                full_prefix, ratio = get_full_prefix(prefix_list)
                prefix_cache[ocdid_key] = (full_prefix, ratio)

            # If sub-body district (sub-county, sub-muni, etc.), add to
            # unmatched list to perform matching based on district name
            # identifiers and district count.
            if is_sub_district(ed):
                d_type, d_name = get_sub_district(ed)
                unmatched_key = u'{}:{}'.format(full_prefix, d_type)
                group = unmatched.setdefault(unmatched_key, {
                    'prefix': full_prefix,
                    'districts': {},
                    'dist_type': d_type,
                })
                group['districts'].setdefault(d_name, []).append(row)
            else:
                if full_prefix is None:
                    full_prefix = ''
                row['ocdid_report'] = Assign.REPORT_TEMPLATE.format(
                    row['Electoral District'], full_prefix, ratio)
                row['ocdid'] = full_prefix
                matched.append(row)

        # Match unmatched items by type and count, finding closest matches.
        _match_sub_districts(unmatched, matched)

        for row in matched:
            writer.writerow(
                dict((k, v.encode('utf-8')) for k, v in row.iteritems()))
def testMatchName(self):
    """Check ocdid.match_name against every case in test_match_name.

    Each case is a (prefix, dist_type, dist_name, result) tuple; the call
    must return exactly `result`.
    """
    for case in test_match_name:
        prefix, dist_type, dist_name, expected = case
        actual = ocdid.match_name(prefix, dist_type, dist_name)
        self.assertEqual(expected, actual)