def get_all_values(filename, column, max_num_values = 100):
    """Return all values of the column named <column> in the file named <filename>.
        Example outputs:
        CharlsonIndex  ['0', '1-2', '3-4', '5+']
        ProcedureGroup ['', 'ANES', 'EM', 'MED', 'PL', 'RAD', 'SAS', 'SCS', 'SDS', 'SEOA',
            'SGS', 'SIS', 'SMCD', 'SMS', 'SNS', 'SO', 'SRS', 'SUS']
        Specialty      ['', 'Anesthesiology', 'Diagnostic Imaging', 'Emergency', 'General Practice',
            'Internal', 'Laboratory', 'Obstetrics and Gynecology', 'Other', 'Pathology',
            'Pediatrics', 'Rehabilitation', 'Surgery']
        PlaceSvc       ['', 'Ambulance', 'Home', 'Independent Lab', 'Inpatient Hospital', 'Office',
            'Other', 'Outpatient Hospital', 'Urgent Care']
    """
    print 'get_all_values(filename=%s, column=%s)' % (filename, column)
    column_keys, get_data = common.get_csv(filename)
    assert column in column_keys[1:], 'column %s not in %s' % (column, filename)
    column_index = column_keys[1:].index(column)
    print 'column_index=%d' % column_index
    values = set([])
    for i, (k, v) in enumerate(get_data()):
        #print '%4d:%s %d' % (i, v, len(v))
        values.add(v[column_index])
        # Stop early once more than max_num_values distinct values have been seen
        if max_num_values > 0 and len(values) > max_num_values:
            print 'max_num_values = %d reached' % max_num_values
            break
    return sorted(values)

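# Hedged usage sketch (not in the original file): the docstring above lists the
# distinct values this returns for several Claims.csv columns, e.g.
#
#   values = get_all_values('Claims.csv', 'PlaceSvc')
#   # -> ['', 'Ambulance', 'Home', 'Independent Lab', 'Inpatient Hospital',
#   #     'Office', 'Other', 'Outpatient Hospital', 'Urgent Care']
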
def make_charlson_table(filename):
    """CharlsonIndex ['0', '1-2', '3-4', '5+']
        Uses the highest Charlson index on any claim for a patient in the period;
        not sure if this is the best thing to do. Needs process_multi_pass()
        because the file is too large to aggregate in one in-memory pass.
    """
    print 'make_charlson_table(filename=%s)' % filename
    prefix = 'charlson'
    column_keys, get_data = common.get_csv(filename)
    charlson_column = column_keys[1:].index('CharlsonIndex')
    print 'charlson_column=%d' % charlson_column
    charlson_func = common.get_int_part

    def init_func():
        return [-1]

    def update_func(derived_list, input_row):
        derived_list[0] = max(derived_list[0], charlson_func(input_row[charlson_column]))

    # NUM_GROUPS = 50 is 1.6x faster than 100, which is much faster than 10, for Claims.csv
    process_multi_pass(filename, init_func, update_func, prefix, ['CharlsonIndex'], NUM_GROUPS = 50)

    if False:
        # Superseded single-pass implementation, kept for reference. It builds the
        # whole derived_dict in memory, which is slow for large files.
        derived_dict = {'Y1':{}, 'Y2':{}, 'Y3':{}}
        year_column = column_keys[1:].index('Year')
        charlson_column = column_keys[1:].index('CharlsonIndex')
        print 'year_column=%d' % year_column
        print 'charlson_column=%d' % charlson_column
        charlson_func = common.get_int_part
        for i, (k, v) in enumerate(get_data()):
            year = v[year_column]
            if not k in derived_dict[year].keys():
                derived_dict[year][k] = -1
            derived_dict[year][k] = max(derived_dict[year][k], charlson_func(v[charlson_column]))
            if (i % 10000) == 0:
                print 'Processed row %d derived_dict = %s' % (i, [(kk, len(vv)) for kk, vv in derived_dict.items()])
        print 'Read all rows %d' % i
        for year in sorted(derived_dict.keys()):
            derived_filename = '%s%s_%s' % (DERIVED_PREFIX, year, filename)
            print 'derived_filename=%s' % derived_filename
            f = open(derived_filename, 'wb')
            data_writer = csv.writer(f, delimiter=',', quotechar='"')
            data_writer.writerow(['MemberID', 'CharlsonIndex'])
            for k in sorted(derived_dict[year].keys()):
                x = derived_dict[year][k]
                data_writer.writerow([k] + [str(x)])
            f.close()

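# Hedged sketch (assumption; the real implementation lives in the common
# module): the way charlson_func is used above implies common.get_int_part
# maps a bucket string to its leading integer, e.g. '0' -> 0, '1-2' -> 1,
# '3-4' -> 3, '5+' -> 5. A minimal version consistent with that usage:
import re

def _get_int_part_sketch(s):
    """Return the leading integer of a string such as '1-2' or '5+', else 0."""
    m = re.match(r'\d+', s.strip())
    return int(m.group(0)) if m else 0
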
def make_primary_condition_group_counts_table(filename):
    """Make the primary condition group (PrimaryConditionGroup) counts table."""
    prefix = 'pcg'
    derived_column_keys = ['None'] + sorted(PCG_LUT.keys(), key = lambda x: PCG_LUT[x])
    column_keys, _ = common.get_csv(filename)
    pcg_column = column_keys[1:].index('PrimaryConditionGroup')

    def init_func():
        return [0 for i in range(len(derived_column_keys))]

    def update_func(derived_list, input_row):
        derived_list[get_pcg_index(input_row[pcg_column])] += 1

    process_multi_pass(filename, init_func, update_func, prefix, derived_column_keys)

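# Hedged sketch (assumption; PCG_LUT and get_pcg_index are defined elsewhere
# in this module): derived_column_keys above is ['None'] plus the PCG codes in
# PCG_LUT-value order, so if PCG_LUT values are 0-based ranks, get_pcg_index
# presumably returns PCG_LUT[pcg] + 1 for known codes and 0 (the 'None' slot)
# otherwise:
def _get_pcg_index_sketch(pcg):
    return PCG_LUT[pcg] + 1 if pcg in PCG_LUT else 0
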
def make_lab_counts_table(filename, title):
    """Used for LabCount.csv and DrugCount.csv, which have similar formats."""
    print 'make_lab_counts_table(filename=%s, title=%s)' % (filename, title)
    derived_dict = {'Y1':{}, 'Y2':{}, 'Y3':{}}
    column_keys, get_data = common.get_csv(filename)
    year_column = column_keys[1:].index('Year')
    dsfs_column = column_keys[1:].index('DSFS')
    # list.index() raises ValueError on a miss, so test membership explicitly
    if title not in column_keys[1:]:
        print 'title not matched'
        exit()
    labcount_column = column_keys[1:].index(title)
    print 'year_column=%d' % year_column
    print 'dsfs_column=%d' % dsfs_column
    print 'labcount_column=%d' % labcount_column
    dsfs_func = common.get_int_part
    labcount_func = common.get_int_part
    for i, (k, v) in enumerate(get_data()):
        year = v[year_column]
        if not k in derived_dict[year].keys():
            derived_dict[year][k] = [0, 0]
        derived_dict[year][k][0] += dsfs_func(v[dsfs_column])
        derived_dict[year][k][1] += labcount_func(v[labcount_column])
        if (i % 10000) == 0:
            print 'Processed row %d derived_dict = %s' % (i, [(kk, len(vv)) for kk, vv in derived_dict.items()])
    print 'Read all rows %d' % i
    for year in sorted(derived_dict.keys()):
        derived_filename = '%s%s_%s' % (DERIVED_PREFIX, year, filename)
        print 'derived_filename=%s' % derived_filename
        f = open(derived_filename, 'wb')
        data_writer = csv.writer(f, delimiter=',', quotechar='"')
        data_writer.writerow(['MemberID', '%s_DSFS' % title, title])
        for k in sorted(derived_dict[year].keys()):
            row = derived_dict[year][k]
            data_writer.writerow([k] + [str(v) for v in row])
        f.close()

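# Usage (hedged; per the docstring the two callers are the lab and drug count
# files, where title names the count column in each):
#
#   make_lab_counts_table('LabCount.csv', 'LabCount')
#   make_lab_counts_table('DrugCount.csv', 'DrugCount')
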
def make_place_service_counts_table(filename):
    """PlaceSvc ['', 'Ambulance', 'Home', 'Independent Lab', 'Inpatient Hospital',
        'Office', 'Other', 'Outpatient Hospital', 'Urgent Care']
    """
    PLACE_SVC_KEYS = ['Ambulance', 'Home', 'Independent Lab', 'Inpatient Hospital',
        'Office', 'Other', 'Outpatient Hospital', 'Urgent Care']
    prefix = 'place_svc'
    derived_column_keys = ['None'] + sorted(PLACE_SVC_KEYS)
    key_lut = make_key_lut(PLACE_SVC_KEYS)
    column_keys, _ = common.get_csv(filename)
    column = column_keys[1:].index('PlaceSvc')

    def init_func():
        return [0 for i in range(len(derived_column_keys))]

    def update_func(derived_list, input_row):
        derived_list[get_key_index(key_lut, input_row[column])] += 1

    process_multi_pass(filename, init_func, update_func, prefix, derived_column_keys)

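# Hedged sketch (assumption; make_key_lut and get_key_index are defined
# elsewhere): the counting pattern here, and in the procedure-group and
# specialty builders below, implies a lookup from key string to 1-based slot
# in derived_column_keys, with unknown or empty keys falling into slot 0
# ('None'). A minimal pair consistent with that usage:
def _make_key_lut_sketch(keys):
    return dict((k, i + 1) for i, k in enumerate(sorted(keys)))

def _get_key_index_sketch(key_lut, key):
    return key_lut.get(key, 0)
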
def make_procedure_group_counts_table(filename):
    """ProcedureGroup ['', 'ANES', 'EM', 'MED', 'PL', 'RAD', 'SAS', 'SCS', 'SDS', 'SEOA',
        'SGS', 'SIS', 'SMCD', 'SMS', 'SNS', 'SO', 'SRS', 'SUS']
    """
    PROCEDURE_GROUP_KEYS = ['ANES', 'EM', 'MED', 'PL', 'RAD', 'SAS', 'SCS', 'SDS', 'SEOA',
        'SGS', 'SIS', 'SMCD', 'SMS', 'SNS', 'SO', 'SRS', 'SUS']
    prefix = 'proc_group'
    derived_column_keys = ['None'] + sorted(PROCEDURE_GROUP_KEYS)
    key_lut = make_key_lut(PROCEDURE_GROUP_KEYS)
    column_keys, _ = common.get_csv(filename)
    column = column_keys[1:].index('ProcedureGroup')

    def init_func():
        return [0 for i in range(len(derived_column_keys))]

    def update_func(derived_list, input_row):
        derived_list[get_key_index(key_lut, input_row[column])] += 1

    process_multi_pass(filename, init_func, update_func, prefix, derived_column_keys)

def show_dih_counts(year):
    print 'show_dih_counts(year=%d)' % year
    pcg_filename = get_pcg_filename(year - 1)
    print 'pcg_filename=%s' % pcg_filename
    dih_dict = get_dih(year)
    dih_dict_keys = set(dih_dict.keys())
    member_ids = common.get_member_ids(pcg_filename)
    print '%d claims' % len(member_ids)
    pcg_keys, pcg_counts_dict = get_pcg_counts_dict(year - 1)
    print 'got dicts %d x %d' % (len(pcg_counts_dict), len(pcg_keys))
    user_keys = sorted(pcg_counts_dict.keys())
    # Binary label per member: 1 if they had any days in hospital in <year>
    has_dih_keys = np.zeros(len(pcg_counts_dict))
    has_no_dih_keys = np.zeros(len(pcg_counts_dict))
    for i in range(len(has_dih_keys)):
        k = user_keys[i]
        if k in dih_dict_keys:
            if dih_dict[k] > 0:
                has_dih_keys[i] = 1
            else:
                has_no_dih_keys[i] = 1
    pcg_counts_a = np.array([pcg_counts_dict[k] for k in user_keys])
    pcg_counts_a = pcg_counts_a.astype(float)
    print 'converted to numpy array'
    print 'pcg_counts_a.shape', pcg_counts_a.shape
    column_keys, _ = common.get_csv(get_pcg_filename(year))
    for num_keys in range(1, len(TOP_PCG_KEYS) + 1):
        for key0 in range((num_keys + 1)//2):
            common.HEADING()
            print 'Testing keys %s %d %d' % (TOP_PCG_KEYS[key0:num_keys], key0, num_keys)
            # Use the same key slice that the message above reports
            idxs = [column_keys[1:].index(key) for key in TOP_PCG_KEYS[key0:num_keys]]
            X = pcg_counts_a[:, idxs]
            Y = has_dih_keys
            classify(X, Y)

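# Hedged sketch (assumption; classify() is defined elsewhere): given the
# per-member PCG count matrix X and the binary days-in-hospital labels Y
# built above, it presumably fits a classifier and reports how well the
# selected PCG columns predict hospitalization. One plausible version,
# assuming scikit-learn is available:
from sklearn.linear_model import LogisticRegression

def _classify_sketch(X, Y):
    clf = LogisticRegression()
    clf.fit(X, Y)
    print 'training accuracy = %.3f' % clf.score(X, Y)
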
def make_specialty_counts_table(filename):
    """Specialty ['', 'Anesthesiology', 'Diagnostic Imaging', 'Emergency', 'General Practice',
        'Internal', 'Laboratory', 'Obstetrics and Gynecology', 'Other', 'Pathology',
        'Pediatrics', 'Rehabilitation', 'Surgery']
    """
    SPECIALTY_KEYS = ['Anesthesiology', 'Diagnostic Imaging', 'Emergency', 'General Practice',
        'Internal', 'Laboratory', 'Obstetrics and Gynecology', 'Other', 'Pathology',
        'Pediatrics', 'Rehabilitation', 'Surgery']
    prefix = 'specialty'
    derived_column_keys = ['None'] + sorted(SPECIALTY_KEYS)
    key_lut = make_key_lut(SPECIALTY_KEYS)
    column_keys, _ = common.get_csv(filename)
    column = column_keys[1:].index('Specialty')

    def init_func():
        return [0 for i in range(len(derived_column_keys))]

    def update_func(derived_list, input_row):
        derived_list[get_key_index(key_lut, input_row[column])] += 1

    process_multi_pass(filename, init_func, update_func, prefix, derived_column_keys)

def process_multi_pass(filename, init_func, update_func, prefix, DERIVED_COLUMN_KEYS, NUM_GROUPS = 50):
    """This has got complicated because Python runs slowly with very large dicts.
        Passes through the input file multiple times, processing one group of
        members per pass (members are grouped by MemberID % NUM_GROUPS), and
        writes each group's partial results to disk.
        init_func() returns the initial value of derived_list
        update_func(derived_list, input_row) updates derived_list based on input_row
        NUM_GROUPS = 50 seems to be a good default (see make_charlson_table())
    """
    print 'process_multi_pass(filename=%s,prefix=%s,DERIVED_COLUMN_KEYS=%s,NUM_GROUPS=%d)' % (
        filename, prefix, DERIVED_COLUMN_KEYS, NUM_GROUPS)
    assert prefix, 'Need a prefix to avoid over-writing existing files'
    column_keys, get_data = common.get_csv(filename)
    year_column = column_keys[1:].index('Year')
    TOTAL_ROWS = 2668990  # rows in Claims.csv, used only for the progress/ETA display
    t0 = time.clock()
    num_rows = 0
    for group in range(NUM_GROUPS):
        derived_dict = {'ALL':{}, 'Y1':{}, 'Y2':{}, 'Y3':{}}
        print 'group=%d of %d' % (group, NUM_GROUPS)
        _, get_data = common.get_csv(filename)
        for k, v in get_data():
            # Process only the members belonging to the current group
            if (int(k) % NUM_GROUPS) != group:
                continue
            year = v[year_column]
            if num_rows and num_rows % 10000 == 0:
                t = time.clock() - t0
                eta = int(t * (TOTAL_ROWS - num_rows)/num_rows)
                print ' %8d row (%4.1f%%) %7.1f sec, %4d rows/sec, eta = %6d sec' % (
                    num_rows, 100.0 * num_rows/TOTAL_ROWS, t, int(num_rows/t), eta)
            for y in (year, 'ALL'):
                if not k in derived_dict[y].keys():
                    derived_dict[y][k] = init_func()
                update_func(derived_dict[y][k], v)
            num_rows += 1
        print ' saving: %d entries' % sum([len(v) for v in derived_dict.values()])
        pickled_path = make_group_name(group)
        pkl_file = open(pickled_path, 'wb')
        pickle.dump(derived_dict, pkl_file, -1)  # Pickle using the highest protocol available
        pkl_file.close()

    print 'Writing to file'
    # derived_dict still holds the last group's dict; only its year keys are used here
    for year in sorted(derived_dict.keys()):
        rows_per_year = 0
        derived_filename = '%s%s_%s_%s' % (DERIVED_PREFIX, prefix, year, os.path.basename(filename))
        print 'year=%4s: file=%s' % (year, derived_filename)
        f = open(derived_filename, 'wb')
        data_writer = csv.writer(f, delimiter=',', quotechar='"')
        data_writer.writerow(['MemberID'] + DERIVED_COLUMN_KEYS)
        for group in range(NUM_GROUPS):
            pickled_path = make_group_name(group)
            pkl_file = open(pickled_path, 'rb')
            derived_dict = pickle.load(pkl_file)
            pkl_file.close()
            for k in sorted(derived_dict[year].keys()):
                row = derived_dict[year][k]
                data_writer.writerow([k] + [str(v) for v in row])
                rows_per_year += 1
        f.close()
        print ' rows=%d' % rows_per_year

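# Hedged usage sketch (not in the original file; the 'claim_count' prefix and
# 'ClaimCount' column name are hypothetical): the simplest possible client of
# process_multi_pass just counts claims per member per year:
def _make_claim_counts_table_sketch(filename):
    def init_func():
        return [0]

    def update_func(derived_list, input_row):
        derived_list[0] += 1

    process_multi_pass(filename, init_func, update_func, 'claim_count', ['ClaimCount'])
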