Example #1
0
def match_exists(prefix_list, offset):
    """Given a district list and offset value, determines if a match exists
    for that ocdid

    The entries before `offset` are resolved with is_exact(). Only when that
    yields a prefix is the entry at `offset` split into its district type
    and name and passed, with the prefix, to ocdidlib.match_name().

    Keyword Arguments:
        prefix_list -- list of district values for the ocdid, each written
                       as 'type:name'
        offset -- district level to check against
                  (ex. 1 - one additional level, 2 - two levels, etc.)

    Returns:
        ocdid -- matching full ocdid if match found, otherwise None
        ratio -- ratio of that exact match (1-100), returns -1 if not found
    """
    new_prefix = is_exact(prefix_list[:offset])
    if not new_prefix:
        return None, -1

    # Split on the first colon only: a district name that itself contains
    # ':' would otherwise yield more than two parts and break the unpacking.
    dist_type, dist_name = prefix_list[offset].split(':', 1)
    return ocdidlib.match_name(new_prefix, dist_type, dist_name)
Example #2
0
def match_exists(prefix_list, offset):
    """Given a district list and offset value, determines if a match exists
    for that ocdid

    The entries before `offset` are resolved with is_exact(). Only when that
    yields a prefix is the entry at `offset` split into its district type
    and name and passed, with the prefix, to ocdid.match_name().

    Keyword Arguments:
        prefix_list -- list of district values for the ocdid, each written
                       as 'type:name'
        offset -- district level to check against
                  (ex. 1 - one additional level, 2 - two levels, etc.)

    Returns:
        ocdid -- matching full ocdid if match found, otherwise None
        ratio -- ratio of that exact match (1-100), returns -1 if not found
    """
    new_prefix = is_exact(prefix_list[:offset])
    if not new_prefix:
        return None, -1

    # Split on the first colon only: a district name that itself contains
    # ':' would otherwise yield more than two parts and break the unpacking.
    dist_type, dist_name = prefix_list[offset].split(':', 1)
    return ocdid.match_name(new_prefix, dist_type, dist_name)
Example #3
0
def _build_prefix_list(state, county, muni):
    """Build the ordered list of 'type:name' district values.

    Entries are added in the order state, county, muni; any level whose
    value is falsy is skipped.
    """
    prefix_list = []
    if state:
        prefix_list.append('state:{}'.format(state))
    if county:
        if state in Assign.ALT_COUNTIES:
            # ALT_COUNTIES supplies the district type used instead of
            # 'county' for this state.
            prefix_list.append('{}:{}'.format(Assign.ALT_COUNTIES[state],
                                              county))
        else:
            # issue with coos county in NH, damn ascii
            if state == 'nh' and county.startswith('co'):
                county = 'coos'
            prefix_list.append('county:{}'.format(county))
    if muni:
        # exception for dc
        if muni == 'dc':
            prefix_list.append('district:{}'.format(muni))
        else:
            prefix_list.append('place:{}'.format(muni))
    return prefix_list


def _encode_row(row):
    """Return a copy of row with unicode values encoded as UTF-8 byte strings.

    Only unicode objects are encoded. Byte strings read from the csv file
    are passed through untouched, because calling .encode() on a Python 2
    byte string first decodes it as ASCII and fails on non-ASCII data.
    Other values (such as None for a short input row) are left as they are
    for the csv writer to handle; it writes None as an empty string.
    """
    encoded = {}
    for key, value in row.iteritems():
        if isinstance(value, unicode):
            value = value.encode('utf-8')
        encoded[key] = value
    return encoded


def assign_ids(f):
    """Assign ocdids to every row of a csv file and write the result to the
    staging folder.

    Each row's district prefix is resolved with get_full_prefix() (cached per
    distinct prefix list). Rows whose electoral district is not a
    sub-district are reported against that prefix directly. Sub-district rows
    are grouped by prefix and district type, then matched with
    ocdid.match_type() and ocdid.match_name() once all rows have been read.
    Every output row gets an 'ocdid_report' value; rows that could be
    processed past the type match also get an 'ocdid' value.

    Keyword Arguments:
        f -- name of the file to process; read from Dirs.TEST_DIR and
             written under the same name to Dirs.STAGING_DIR
    """
    # The Python 2 csv module expects its files opened in binary mode, for
    # writing as well as reading.
    with open(Dirs.TEST_DIR + f, 'rb') as r, \
            open(Dirs.STAGING_DIR + f, 'wb') as w:
        reader = DictReader(r)
        fields = reader.fieldnames
        print('FIELDS: {}'.format(fields))
        # Both columns are written below but are not always present in the
        # input; DictWriter rejects row keys missing from its fieldnames.
        for column in ('ocdid', 'ocdid_report'):
            if column not in fields:
                fields.append(column)
        writer = DictWriter(w, fieldnames=fields)
        writer.writeheader()

        ocdid_vals = {}
        unmatched = {}
        matched = []

        for row in reader:
            # Clean district names for ocdid matching
            state = row['State'].lower()
            # County and muni were removed for the first deliverable. To
            # restore them, read 'Body Represents - County' and
            # 'Body Represents - Muni', lowercased with ' ' replaced by '_'.
            county = None
            muni = None
            ed = row['Electoral District'].lower()

            prefix_list = _build_prefix_list(state, county, muni)
            print('PREFIX LIST: {}'.format(prefix_list))

            # ocdid_key is a tuple of the prefix list, so matching a
            # specific group of district values only happens once
            ocdid_key = tuple(prefix_list)
            print('OCDID_KEY: {}'.format(ocdid_key))
            if ocdid_key not in ocdid_vals:
                full_prefix, ratio = get_full_prefix(prefix_list)
                ocdid_vals[ocdid_key] = {'ocdid': full_prefix, 'ratio': ratio}
            full_prefix = ocdid_vals[ocdid_key]['ocdid']
            ratio = ocdid_vals[ocdid_key]['ratio']

            # If sub-body district (sub-county, sub-muni, etc.), add to
            # unmatched list to perform matching based on district name
            # identifiers and district count
            if is_sub_district(ed):
                d_type, d_name = get_sub_district(ed)
                unmatched_key = u'{}:{}'.format(full_prefix, d_type)
                group = unmatched.setdefault(unmatched_key,
                                             {'prefix': full_prefix,
                                              'districts': {},
                                              'dist_type': d_type})
                group['districts'].setdefault(d_name, []).append(row)
            else:
                if full_prefix is None:
                    full_prefix = ''
                row['ocdid_report'] = Assign.REPORT_TEMPLATE.format(
                    row['Electoral District'], full_prefix, ratio)
                row['ocdid'] = full_prefix
                matched.append(row)

        # Match unmatched items by type and count, finding closest matches
        for k, v in unmatched.iteritems():
            print('KEY: {}\nVALUE: {}'.format(k, v))
            full_prefix = v['prefix']
            d_type = v['dist_type']
            districts = v['districts']

            print('------------------------------------------------------------------------')
            print('FULL: {}\nD_TYPE: {}\nLEN: {}\n'.format(
                full_prefix, d_type, len(districts)))
            type_val = ocdid.match_type(full_prefix, d_type, len(districts))

            if not type_val:
                # No usable district type: report a placeholder id and a
                # ratio of -1, leaving 'ocdid' unset for these rows.
                for d_name, rows in districts.iteritems():
                    for row in rows:
                        row['ocdid_report'] = Assign.REPORT_TEMPLATE.format(
                            row['Electoral District'], 'xxx', -1)
                        matched.append(row)
            else:
                for d_name, rows in districts.iteritems():
                    id_val, ratio = ocdid.match_name(full_prefix,
                                                     type_val,
                                                     d_name)
                    if id_val is None:
                        id_val = ''
                    for row in rows:
                        row['ocdid_report'] = Assign.REPORT_TEMPLATE.format(
                            row['Electoral District'], id_val, ratio)
                        row['ocdid'] = id_val
                        matched.append(row)

        for row in matched:
            writer.writerow(_encode_row(row))
 def testMatchName(self):
     """Check ocdid.match_name against every case in test_match_name.

     Each case unpacks to (prefix, dist_type, dist_name, expected result).
     """
     for case in test_match_name:
         prefix, dist_type, dist_name, expected = case
         actual = ocdid.match_name(prefix, dist_type, dist_name)
         self.assertEqual(expected, actual)
Example #5
0
def _build_prefix_list(state, county, muni):
    """Build the ordered list of 'type:name' district values.

    Entries are added in the order state, county, muni; any level whose
    value is falsy is skipped.
    """
    prefix_list = []
    if state:
        prefix_list.append('state:{}'.format(state))
    if county:
        if state in Assign.ALT_COUNTIES:
            # ALT_COUNTIES supplies the district type used instead of
            # 'county' for this state.
            prefix_list.append('{}:{}'.format(
                Assign.ALT_COUNTIES[state], county))
        else:
            # issue with coos county in NH, damn ascii
            if state == 'nh' and county.startswith('co'):
                county = 'coos'
            prefix_list.append('county:{}'.format(county))
    if muni:
        # exception for dc
        if muni == 'dc':
            prefix_list.append('district:{}'.format(muni))
        else:
            prefix_list.append('place:{}'.format(muni))
    return prefix_list


def _encode_row(row):
    """Return a copy of row with unicode values encoded as UTF-8 byte strings.

    Only unicode objects are encoded. Byte strings read from the csv file
    are passed through untouched, because calling .encode() on a Python 2
    byte string first decodes it as ASCII and fails on non-ASCII data.
    Other values (such as None for a short input row) are left as they are
    for the csv writer to handle; it writes None as an empty string.
    """
    encoded = {}
    for key, value in row.iteritems():
        if isinstance(value, unicode):
            value = value.encode('utf-8')
        encoded[key] = value
    return encoded


def assign_ids(f):
    """Assign ocdids to every row of a csv file and write the result to the
    staging folder.

    Each row's district prefix is resolved with get_full_prefix() (cached per
    distinct prefix list). Rows whose electoral district is not a
    sub-district are reported against that prefix directly. Sub-district rows
    are grouped by prefix and district type, then matched with
    ocdid.match_type() and ocdid.match_name() once all rows have been read.
    Every output row gets an 'ocdid_report' value; rows that could be
    processed past the type match also get an 'ocdid' value.

    Keyword Arguments:
        f -- name of the file to process; read from Dirs.TEST_DIR and
             written under the same name to Dirs.STAGING_DIR
    """
    # The Python 2 csv module expects its files opened in binary mode, for
    # writing as well as reading.
    with open(Dirs.TEST_DIR + f, 'rb') as r, open(Dirs.STAGING_DIR + f,
                                                  'wb') as w:
        reader = DictReader(r)
        fields = reader.fieldnames
        print('FIELDS: {}'.format(fields))
        # Both columns are written below but are not always present in the
        # input; DictWriter rejects row keys missing from its fieldnames.
        for column in ('ocdid', 'ocdid_report'):
            if column not in fields:
                fields.append(column)
        writer = DictWriter(w, fieldnames=fields)
        writer.writeheader()

        ocdid_vals = {}
        unmatched = {}
        matched = []

        for row in reader:
            # Clean district names for ocdid matching
            state = row['State'].lower()
            # County and muni were removed for the first deliverable. To
            # restore them, read 'Body Represents - County' and
            # 'Body Represents - Muni', lowercased with ' ' replaced by '_'.
            county = None
            muni = None
            ed = row['Electoral District'].lower()

            prefix_list = _build_prefix_list(state, county, muni)
            print('PREFIX LIST: {}'.format(prefix_list))

            # ocdid_key is a tuple of the prefix list, so matching a
            # specific group of district values only happens once
            ocdid_key = tuple(prefix_list)
            print('OCDID_KEY: {}'.format(ocdid_key))
            if ocdid_key not in ocdid_vals:
                full_prefix, ratio = get_full_prefix(prefix_list)
                ocdid_vals[ocdid_key] = {'ocdid': full_prefix, 'ratio': ratio}
            full_prefix = ocdid_vals[ocdid_key]['ocdid']
            ratio = ocdid_vals[ocdid_key]['ratio']

            # If sub-body district (sub-county, sub-muni, etc.), add to
            # unmatched list to perform matching based on district name
            # identifiers and district count
            if is_sub_district(ed):
                d_type, d_name = get_sub_district(ed)
                unmatched_key = u'{}:{}'.format(full_prefix, d_type)
                group = unmatched.setdefault(
                    unmatched_key, {
                        'prefix': full_prefix,
                        'districts': {},
                        'dist_type': d_type
                    })
                group['districts'].setdefault(d_name, []).append(row)
            else:
                if full_prefix is None:
                    full_prefix = ''
                row['ocdid_report'] = Assign.REPORT_TEMPLATE.format(
                    row['Electoral District'], full_prefix, ratio)
                row['ocdid'] = full_prefix
                matched.append(row)

        # Match unmatched items by type and count, finding closest matches
        for k, v in unmatched.iteritems():
            print('KEY: {}\nVALUE: {}'.format(k, v))
            full_prefix = v['prefix']
            d_type = v['dist_type']
            districts = v['districts']

            print('------------------------------------------------------------------------')
            print('FULL: {}\nD_TYPE: {}\nLEN: {}\n'.format(
                full_prefix, d_type, len(districts)))
            type_val = ocdid.match_type(full_prefix, d_type, len(districts))

            if not type_val:
                # No usable district type: report a placeholder id and a
                # ratio of -1, leaving 'ocdid' unset for these rows.
                for d_name, rows in districts.iteritems():
                    for row in rows:
                        row['ocdid_report'] = Assign.REPORT_TEMPLATE.format(
                            row['Electoral District'], 'xxx', -1)
                        matched.append(row)
            else:
                for d_name, rows in districts.iteritems():
                    id_val, ratio = ocdid.match_name(full_prefix, type_val,
                                                     d_name)
                    if id_val is None:
                        id_val = ''
                    for row in rows:
                        row['ocdid_report'] = Assign.REPORT_TEMPLATE.format(
                            row['Electoral District'], id_val, ratio)
                        row['ocdid'] = id_val
                        matched.append(row)

        for row in matched:
            writer.writerow(_encode_row(row))
Example #6
0
 def testMatchName(self):
     """Run ocdid.match_name over the test_match_name cases.

     Every case is a (prefix, dist_type, dist_name, expected result) group;
     the call's return value must equal the expected result.
     """
     for prefix, dist_type, dist_name, expected in test_match_name:
         observed = ocdid.match_name(prefix, dist_type, dist_name)
         self.assertEqual(expected, observed)