def save_svmlight(patient_features, mortality, op_file, op_deliverable):
    '''
    Write per-patient features to two svmlight-style text files.

    Create two files:
    1. op_file - one line per patient in svmlight format:
       label feature_id:feature_value feature_id:feature_value ...
    2. op_deliverable - the same line prefixed with the patient id:
       patient_id1 label feature_id:feature_value feature_id:feature_value ...
       patient_id2 label feature_id:feature_value feature_id:feature_value ...

    Patients are written in ascending patient-id order and each patient's
    features in ascending order of their first element (the feature id).
    A patient that has no entry in ``mortality`` is written with label 0.
    Every line ends with a space followed by a newline.

    patient_features: dict of patient id -> sequence of
        (feature_id, feature_value) pairs.
    mortality: dict of patient id -> label.
    op_file, op_deliverable: paths of the two files to (over)write.
    '''
    # Binary mode keeps "\n" byte-for-byte on every platform, so each line is
    # encoded explicitly before it is written.
    with open(op_file, 'wb') as deliverable1, \
            open(op_deliverable, 'wb') as deliverable2:
        for patient in sorted(patient_features):
            features = sorted(patient_features[patient],
                              key=lambda pair: pair[0])
            label = mortality.get(patient, 0)
            row = utils.bag_to_svmlight(features)

            deliverable1.write("{} {} \n".format(label, row).encode('utf-8'))
            deliverable2.write(
                "{} {} {} \n".format(int(patient), label,
                                     row).encode('utf-8'))
Ejemplo n.º 2
0
def save_svmlight(patient_features, mortality, op_file, op_deliverable):
    '''
    Write per-patient features to two svmlight-style text files.

    Create two files:
    1. op_file - which saves the features in svmlight format:
       label feature_id:feature_value feature_id:feature_value ...
    2. op_deliverable - which saves the features in following format:
       patient_id1 label feature_id:feature_value feature_id:feature_value feature_id:feature_value ...
       patient_id2 label feature_id:feature_value feature_id:feature_value feature_id:feature_value ...

    The features are ordered in ascending order, and patients are stored in
    ascending order.

    patient_features: dict of patient id -> sequence of
        (feature_id, feature_value) pairs.
    mortality: dict of patient id -> label; must contain every patient id in
        ``patient_features`` (a missing id raises KeyError).
    op_file, op_deliverable: paths of the two files to (over)write.
    '''
    keys = sorted(patient_features.keys())

    # Binary mode keeps "\n" byte-for-byte on every platform, so each line is
    # encoded explicitly; the context manager closes both files even when a
    # row fails part-way through.
    with open(op_file, 'wb') as deliverable1, \
            open(op_deliverable, 'wb') as deliverable2:
        for key in keys:
            ordered = sorted(patient_features[key], key=lambda x: x[0])
            svmlight_row = (str(mortality[key]) + ' ' +
                            utils.bag_to_svmlight(ordered))
            deliverable_row = str(int(key)) + ' ' + svmlight_row

            # Existing output format: only the final line carries a trailing
            # space before its newline.
            line_end = ' \n' if key == keys[-1] else '\n'

            deliverable1.write((svmlight_row + line_end).encode('utf-8'))
            deliverable2.write((deliverable_row + line_end).encode('utf-8'))
Ejemplo n.º 3
0
def save_test_features(patient_features, test_features_deliverable):
    '''
    Write test-set features to a text file, one patient per line.

    Each line has the form
       patient_id feature_id:feature_value feature_id:feature_value ...
    followed by a space and a newline. Patients are written in ascending
    patient-id order and each patient's features in ascending feature-id
    order. An empty ``patient_features`` produces an empty file.

    patient_features: dict of patient id -> sequence of
        (feature_id, feature_value) pairs. The dict is not modified.
    test_features_deliverable: path of the file to (over)write.
    '''
    rows = []
    for patient_id in sorted(patient_features.keys()):
        # Sort a copy so the caller's feature lists are left untouched.
        ordered = sorted(patient_features[patient_id], key=lambda x: x[0])
        rows.append(
            str(int(patient_id)) + " " + utils.bag_to_svmlight(ordered) +
            " \n")

    # Join once instead of concatenating row by row.
    deliverable_feature_data = "".join(rows)

    # Binary mode keeps "\n" byte-for-byte on every platform, hence the
    # explicit encode.
    with open(test_features_deliverable, 'wb') as deliverable:
        deliverable.write(deliverable_feature_data.encode('utf-8'))
Ejemplo n.º 4
0
def save_test_features(patient_features, test_features_deliverable):
    '''
    Write test-set features to a text file, one patient per line.

    Line format (each line ends with a space and a newline):
       patient_id feature_id:feature_value feature_id:feature_value ...

    Patients appear in ascending patient-id order; within a line the
    features appear in ascending feature-id order. When ``patient_features``
    is empty the file is created empty.

    patient_features: dict of patient id -> sequence of
        (feature_id, feature_value) pairs. Left unmodified.
    test_features_deliverable: path of the file to (over)write.
    '''
    # Build every output line first so the file is only touched once all
    # rows have been formatted successfully.
    deliverable_format_rows = [
        str(int(patient_id)) + " " + utils.bag_to_svmlight(
            sorted(patient_features[patient_id], key=lambda x: x[0])) + " "
        for patient_id in sorted(patient_features.keys())
    ]

    # One row per line, each terminated by a newline.
    deliverable_feature_data = "".join(
        row + "\n" for row in deliverable_format_rows)

    # Binary mode writes "\n" verbatim; encode the text explicitly.
    with open(test_features_deliverable, 'wb') as deliverable:
        deliverable.write(deliverable_feature_data.encode('utf-8'))
Ejemplo n.º 5
0
def my_features():
    '''
    Build train/test feature matrices and write the test-feature deliverable.

    Reads the train and test data with ``read_from_csv``, derives features
    with ``process_training_data`` and ``process_features``, and writes
    '../deliverables/test_features.txt' with one line per test patient:
       patient_id feature_id:feature_value feature_id:feature_value ...
    (each line ends with a space and a newline; patients ascending, features
    ascending by feature id).

    output: X_train, Y_train, X_test
        X_train - the training feature table from ``process_features``.
        Y_train - boolean numpy array, True where the patient id (the index
                  of X_train) appears in ``train_mortality.patient_id``.
        X_test  - the test feature table sorted by its index.
    '''
    train_events, train_mortality, _feature_map = read_from_csv(
        '../data/train/')
    test_events, _, _ = read_from_csv('../data/test/')
    train_events = process_training_data(train_events.iloc[:, :],
                                         train_mortality)
    train_features, test_features = process_features(train_events, test_events)

    # A set gives constant-time membership checks per patient.
    dead_ids = set(train_mortality.patient_id)
    train_labels = np.array(
        [patient_id in dead_ids for patient_id in train_features.index])

    X_train = train_features
    Y_train = train_labels
    X_test = test_features.sort_index()

    # Reshape the wide test table into (patient_id, feature_id, feature_value)
    # rows so that it can be grouped per patient.
    test_features.index.name = 'patient_id'
    test_features_long = pd.melt(test_features.reset_index(),
                                 id_vars=['patient_id'])
    test_features_long.columns = ['patient_id', 'feature_id', 'feature_value']
    test_features_long = test_features_long.sort_values('patient_id')
    tuple_temp = test_features_long.groupby('patient_id').apply(lambda x: list(
        x.sort_values('feature_id').apply(
            lambda y: (y.feature_id, y.feature_value), axis=1)))
    patient_features_dict = tuple_temp.to_dict()

    # The context manager guarantees the file is flushed and closed before
    # the function returns.
    with open('../deliverables/test_features.txt', 'wb') as deliverable1:
        for patient in sorted(patient_features_dict.keys()):
            deliverable1.write(
                "{} {} \n".format(
                    patient,
                    utils.bag_to_svmlight(patient_features_dict[patient]),
                ).encode('utf-8'))
    return X_train, Y_train, X_test
def save_test_features(test_features_long):
    '''
    Write '../deliverables/test_features.txt' from long-format test features.

    Each output line has the form
       patient_id feature_id:feature_value feature_id:feature_value ...
    and ends with a space and a newline. Patients are written in ascending
    order; each patient's features are sorted by feature_id.

    test_features_long: DataFrame with the columns 'patient_id',
        'feature_id' and 'feature_value' (one row per patient/feature pair).
    '''
    # Collapse the rows of each patient into a list of
    # (feature_id, feature_value) tuples ordered by feature_id.
    tuple_dict = test_features_long.groupby('patient_id').apply(
        lambda x: list(
            x.sort_values('feature_id').apply(
                lambda y: (y.feature_id, y.feature_value), axis=1)))
    patient_features = tuple_dict.to_dict()

    # Binary mode writes "\n" verbatim, so the text is encoded explicitly;
    # the context manager flushes and closes the file on exit.
    with open('../deliverables/test_features.txt', 'wb') as deliverable1:
        for patient in sorted(patient_features.keys()):
            line = "{} {} \n".format(
                patient, utils.bag_to_svmlight(patient_features[patient]))
            deliverable1.write(line.encode('utf-8'))
Ejemplo n.º 7
0
def save_svmlight(patient_features, mortality, op_file, op_deliverable):
    '''
    Write per-patient features to two svmlight-style text files.

    1. op_file - one line per patient:
       label feature_id:feature_value feature_id:feature_value ...
       where label is written as an integer.
    2. op_deliverable - one line per patient:
       patient_id label feature_id:feature_value feature_id:feature_value ...
       where label is written as a float.

    Every line ends with a space and a newline. Patients are written in
    ascending patient-id order and features in ascending feature-id order.
    An empty ``patient_features`` produces two empty files.

    patient_features: dict of patient id -> sequence of
        (feature_id, feature_value) pairs. The dict is not modified.
    mortality: dict of patient id -> label; must contain every patient id in
        ``patient_features`` (a missing id raises KeyError).
    op_file, op_deliverable: paths of the two files to (over)write.
    '''
    svmlight_lines = []
    deliverable_lines = []

    for key in sorted(patient_features.keys()):
        # Sort a copy so the caller's feature lists are left untouched.
        ordered = sorted(patient_features[key], key=lambda x: x[0])
        feature_row = bag_to_svmlight(ordered) + " "
        label = mortality[key]

        svmlight_lines.append(str(int(label)) + " " + feature_row + "\n")
        deliverable_lines.append(
            str(int(key)) + " " + str(float(label)) + " " + feature_row +
            "\n")

    # Join once instead of concatenating row by row.
    svmlight_feature_data = "".join(svmlight_lines)
    deliverable_feature_data = "".join(deliverable_lines)

    # Binary mode writes "\n" verbatim, hence the explicit encode; the
    # context manager closes both files even if a write fails.
    with open(op_file, 'wb') as deliverable1, \
            open(op_deliverable, 'wb') as deliverable2:
        deliverable1.write(svmlight_feature_data.encode('utf-8'))
        deliverable2.write(deliverable_feature_data.encode('utf-8'))
Ejemplo n.º 8
0
def save_svmlight(patient_features, mortality, op_file, op_deliverable):
    '''
    Write per-patient features to two svmlight-style text files.

    op_file receives one line per patient,
       label feature_id:feature_value feature_id:feature_value ...
    with the label formatted as an integer; op_deliverable receives
       patient_id label feature_id:feature_value feature_id:feature_value ...
    with the label formatted as a float. Each line ends with a space and a
    newline.

    Patients are stored in ascending patient-id order and features in
    ascending feature-id order. With no patients, both files are created
    empty.

    patient_features: dict of patient id -> sequence of
        (feature_id, feature_value) pairs. Left unmodified.
    mortality: dict of patient id -> label; a patient id missing from it
        raises KeyError.
    op_file, op_deliverable: paths of the two files to (over)write.
    '''
    patient_ids = sorted(patient_features.keys())

    # Format the shared "feature_id:value ..." part once per patient; both
    # output files reuse it.
    all_feature_rows = [
        bag_to_svmlight(sorted(patient_features[pid], key=lambda x: x[0])) +
        " " for pid in patient_ids
    ]

    svmlight_feature_data = "".join(
        str(int(mortality[pid])) + " " + row + "\n"
        for pid, row in zip(patient_ids, all_feature_rows))

    deliverable_feature_data = "".join(
        str(int(pid)) + " " + str(float(mortality[pid])) + " " + row + "\n"
        for pid, row in zip(patient_ids, all_feature_rows))

    # Binary mode writes "\n" verbatim, so the text is encoded explicitly;
    # both handles are released by the context manager.
    with open(op_file, 'wb') as deliverable1, \
            open(op_deliverable, 'wb') as deliverable2:
        deliverable1.write(svmlight_feature_data.encode('utf-8'))
        deliverable2.write(deliverable_feature_data.encode('utf-8'))
Ejemplo n.º 9
0
def save_svmlight(patient_features, mortality, op_file, op_deliverable):
    '''
    Write per-patient features to two svmlight-style text files.

    Refer to instructions in Q3 d

    Create two files:
    1. op_file - which saves the features in svmlight format. (See instructions in Q3d for detailed explanation)
    2. op_deliverable - which saves the features in following format:
       patient_id1 label feature_id:feature_value feature_id:feature_value feature_id:feature_value ...
       patient_id2 label feature_id:feature_value feature_id:feature_value feature_id:feature_value ...

    Features are ordered in ascending order, and patients are stored in
    ascending order as well. Every line ends with a space and a newline.

    patient_features: dict of patient id -> sequence of
        (feature_id, feature_value) pairs; must contain every key of
        ``mortality`` (a missing id raises KeyError).
    mortality: dict of patient id -> label; its keys decide which patients
        are written.
    op_file, op_deliverable: paths of the two files to (over)write.
    '''
    with open(op_file, 'wb') as deliverable1, \
            open(op_deliverable, 'wb') as deliverable2:
        # Sorted so patients come out in ascending order, as required above.
        for k in sorted(mortality.keys()):
            f_k = sorted(patient_features[k], key=lambda tup: tup[0])
            line = (str(mortality[k]) + " " + utils.bag_to_svmlight(f_k) +
                    " " + "\n")

            # Float ids such as 12.0 are written as "12"; only a trailing
            # ".0" is removed so other digits of the id are never altered.
            patient_id = str(k)
            if patient_id.endswith('.0'):
                patient_id = patient_id[:-2]
            line_with_id = patient_id + " " + line

            deliverable1.write(line.encode('utf-8'))
            deliverable2.write(line_with_id.encode('utf-8'))
Ejemplo n.º 10
0
def my_features():
    '''
    Build the train/test feature matrices and write the test deliverable.

    You may generate your own features over here.
    Note that for the test data, all events are already filtered such that they fall in the observation window of their respective patients. Thus, if you were to generate features similar to those you constructed in code/etl.py for the test data, all you have to do is aggregate events for each patient.
    IMPORTANT: Store your test data features in a file called "test_features.txt" where each line has the
    patient_id followed by a space and the corresponding feature in sparse format.
    Eg of a line:
    60 971:1.000000 988:1.000000 1648:1.000000 1717:1.000000 2798:0.364078 3005:0.367953 3049:0.013514
    Here, 60 is the patient id and 971:1.000000 988:1.000000 1648:1.000000 1717:1.000000 2798:0.364078 3005:0.367953 3049:0.013514 is the feature for the patient with id 60.

    Save the file as "test_features.txt" and save it inside the folder deliverables

    Files written:
      ../deliverables/features_svmlight.train and ../deliverables/features.train
          (via etl.save_svmlight)
      ../features_svmlight.test - svmlight rows for the test patients, all
          with the placeholder label 1
      ../deliverables/test_features.txt - "patient_id features" rows

    input:
    output: X_train,Y_train,X_test
    '''

    events_train, mortality_train, feature_map = etl.read_csv('../data/train/')

    patient_features_train, mortality = etl.create_features(
        events_train, mortality_train, feature_map)
    etl.save_svmlight(patient_features_train, mortality,
                      '../deliverables/features_svmlight.train',
                      '../deliverables/features.train')
    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")

    try:
        events_test = pd.read_csv('../data/test/' + 'events.csv',
                                  parse_dates=['timestamp'])
        events_test = events_test.sort_values('timestamp')
    except IOError:
        # NOTE(review): None is passed on to aggregate_test_events below;
        # confirm that helper accepts it, otherwise a missing file surfaces
        # there as a less obvious error.
        events_test = None

    aggregated_events_test = aggregate_test_events(events_test, feature_map)

    # Collapse each patient's rows into a list of
    # (feature_id, feature_value) tuples ordered by feature_id.
    patientID_test_tuple = aggregated_events_test.groupby('patient_id').apply(
        lambda x: list(
            x.sort_values('feature_id').apply(
                lambda y: (y['feature_id'], y['feature_value']), axis=1)))
    patient_features_test = patientID_test_tuple.to_dict()

    # Both files are written in the same loop so that row i of the svmlight
    # file and line i of the deliverable describe the same patient. The
    # context manager closes them even if formatting a row raises.
    with open("../features_svmlight.test", 'wb') as deliverable1, \
            open("../deliverables/test_features.txt", 'wb') as deliverable2:
        for patients in patient_features_test:
            features = pd.DataFrame(
                patient_features_test[patients]).sort_values(0)
            features = features.values.tolist()
            row = utils.bag_to_svmlight(features)

            # The test set has no labels; 1 is a placeholder so the file can
            # be read back with get_data_from_svmlight.
            deliverable1.write("{} {} \n".format(1, row).encode('utf-8'))
            deliverable2.write(
                "{} {} \n".format(int(patients), row).encode('utf-8'))

    print("Number of Patients in patient_features_test")
    print(len(patient_features_test))

    X_test, _ = utils.get_data_from_svmlight("../features_svmlight.test")

    print("Dim of X_test")
    print(X_test.shape)

    return X_train, Y_train, X_test
Ejemplo n.º 11
0
def my_features(
        filepath_train,
        filepath_test,
        test_features_path='C:/Users/yyan/Downloads/homework1/deliverables/test_features.txt'):
    '''
    Build train/test feature tables and write the test-feature deliverable.

    Training side: events are read with ``etl.read_csv``, filtered and
    aggregated with the ``etl`` helpers, restricted to feature ids that occur
    in at least 600 rows of the aggregated events, and pivoted into a
    patient x feature table (missing values filled with 0).

    Test side: events are joined with the test feature map; DIAG/DRUG events
    are summed and LAB events counted per (patient, feature), and every value
    is divided by the maximum value of its feature.

    An L1-penalised logistic regression is fitted on the training table; the
    retained feature set is then every training column except the first one
    (the fitted coefficients themselves are not used for the selection).

    filepath_train: directory prefix passed to ``etl.read_csv``.
    filepath_test: directory prefix containing 'events.csv' and
        'event_feature_map.csv'.
    test_features_path: where the "patient_id feature_id:value ..." lines
        are written; defaults to the path the function has always used.

    output: X_train, Y_train, X_test
    '''
    events, mortality, feature_map = etl.read_csv(filepath_train)
    events_test = pd.read_csv(filepath_test + 'events.csv')
    feature_map_test = pd.read_csv(filepath_test + 'event_feature_map.csv')

    indx_date = etl.calculate_index_date(events, mortality, '')
    filtered_events = etl.filter_events(events, indx_date, '')
    # NOTE(review): deliverables_path is not assigned inside this function
    # (a local assignment, 'C:/Users/yyan/Downloads/', had been commented
    # out). Presumably it is a module-level name - confirm, otherwise this
    # call raises NameError.
    aggregated_events = etl.aggregate_events(filtered_events, mortality,
                                             feature_map, deliverables_path)

    # Keep only features that occur in at least n aggregated rows.
    feature_count = aggregated_events.groupby(by=['feature_id']).count()
    n = 600
    selected_features = list(
        feature_count[feature_count['patient_id'] >= n].index)
    aggregated_events = aggregated_events[aggregated_events['feature_id'].isin(
        selected_features)]

    df = aggregated_events.join(mortality.set_index('patient_id'),
                                on='patient_id',
                                lsuffix='',
                                rsuffix='_r')
    # patient_features and the mortality dict below are not used further in
    # this function; they are kept so the sequence of computations is
    # unchanged.
    patient_features = df.set_index('patient_id')[[
        'feature_id', 'feature_value'
    ]].T.apply(tuple).to_frame()
    patient_features.columns = ['features']
    patient_features = patient_features.groupby(
        by=['patient_id'])['features'].apply(np.array)
    mortality = df.fillna(0).drop_duplicates().set_index(
        'patient_id')['label'].to_dict()
    train_table = aggregated_events.pivot_table(index='patient_id',
                                                columns='feature_id',
                                                values='feature_value').fillna(0)
    labels = df[['patient_id', 'label']].fillna(0).drop_duplicates()

    # Aggregate the test events: sum for DIAG/DRUG, count for LAB.
    df_test = events_test.join(feature_map_test.set_index('event_id'),
                               on='event_id',
                               lsuffix='',
                               rsuffix='_r')
    sub_sum = df_test[df_test['event_id'].str.startswith((
        'DIAG', 'DRUG')) == True].groupby(by=['patient_id', 'idx']).sum()
    sub_count = df_test[df_test['event_id'].str.startswith((
        'LAB')) == True].groupby(by=['patient_id', 'idx']).count()
    sub_count = sub_count[['value']]
    columns = ['patient_id', 'feature_id', 'feature_value']
    agg_events = pd.concat([sub_sum, sub_count]).reset_index()
    agg_events.columns = columns
    # Scale every feature by its maximum value.
    agg_events[
        'feature_value'] = agg_events['feature_value'] / agg_events.groupby(
            ['feature_id'])['feature_value'].transform('max')

    #agg_events = agg_events[agg_events['feature_id'].isin(selected_features)]

    X_train = train_table
    Y_train = labels.set_index('patient_id')
    # liblinear is named explicitly because it supports the L1 penalty;
    # newer scikit-learn releases default to a solver that rejects it.
    clf = LogisticRegression(penalty='l1', solver='liblinear')
    clf.fit(X_train, Y_train)
    coef = clf.coef_
    selected_features = pd.DataFrame(coef,
                                     columns=X_train.columns).columns.delete(0)

    X_train = X_train[selected_features]

    agg_events = agg_events[agg_events['feature_id'].isin(
        selected_features)].fillna(0)
    patient_features_test = agg_events.set_index('patient_id')[[
        'feature_id', 'feature_value'
    ]].T.apply(tuple).to_frame()
    patient_features_test.columns = ['features']
    patient_features_test = patient_features_test.groupby(
        by=['patient_id'])['features'].apply(np.array)
    X_test = agg_events.pivot_table(index='patient_id',
                                    columns='feature_id',
                                    values='feature_value').fillna(0)
    #X_test = X_test[selected_features]

    # The context manager flushes and closes the deliverable on exit.
    with open(test_features_path, 'wb') as deliverable:
        for k in patient_features_test.keys():
            f_k = sorted(patient_features_test[k], key=lambda tup: tup[0])
            row = utils.bag_to_svmlight(f_k) + " " + "\n"

            # Float ids such as 12.0 are written as "12"; only a trailing
            # ".0" is removed so other digits of the id are never altered.
            patient_id = str(k)
            if patient_id.endswith('.0'):
                patient_id = patient_id[:-2]

            deliverable.write((patient_id + " " + row).encode('utf-8'))

    return X_train, Y_train, X_test