Ejemplo n.º 1
0
def my_features():
    """Build train/test feature matrices from SVMLight files and raw test CSVs."""
    # Training data were already written in SVMLight format by the ETL step.
    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")

    # Raw test events plus the event -> feature-id mapping.
    test_events = pd.read_csv("../data/test/events.csv")
    event_map = pd.read_csv("../data/test/event_feature_map.csv")

    # Aggregate per-patient event values (no mortality frame for test data).
    agg = aggregate_events(
        test_events, None, event_map, "../data/test/test_aggregated_events.csv")

    # Convert the aggregated rows to per-patient feature tuples and persist.
    features = create_test_features(agg)
    save_test_features(features, "../deliverables/test_features.txt")

    # Read the freshly written file back as the test design matrix.
    X_test, patient_ids = utils.get_data_from_svmlight(
        "../deliverables/test_features.txt")

    return X_train, Y_train, X_test
Ejemplo n.º 2
0
def my_features():
    """Construct train/test matrices with GradientBoosting-based feature selection.

    Returns dense train and test matrices (after selection) plus training labels.
    """
    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")

    filepath = '../data/test/'
    # Keep only the columns the aggregation step needs.
    filtered_events = pd.read_csv(filepath + 'events.csv')[[
        'patient_id', 'event_id', 'value'
    ]]
    feature_map = pd.read_csv(filepath + 'event_feature_map.csv')
    aggregated_events = etl.aggregate_events(filtered_events, None,
                                             feature_map, '')

    # patient_id -> [(feature_id, feature_value), ...]
    patient_features = aggregated_events.groupby('patient_id')[[
        'feature_id', 'feature_value'
    ]].apply(lambda grp: [tuple(row) for row in grp.values]).to_dict()

    # save_svmlight requires a mortality dict; map each patient id onto
    # itself as a placeholder "label" for the unlabeled test set.
    events_mortality = pd.DataFrame(aggregated_events['patient_id'])
    events_mortality['label'] = aggregated_events['patient_id']
    mortality = events_mortality.set_index('patient_id')['label'].to_dict()

    etl.save_svmlight(patient_features, mortality,
                      '../deliverables/test_features.txt',
                      '../deliverables/features.txt')
    X_test = load_svmlight_file('../deliverables/test_features.txt',
                                n_features=3190)[0]

    # Fit a gradient-boosted model and keep only the features it deems
    # important, applying the same mask to both train and test.
    clf = GradientBoostingClassifier().fit(X_train, Y_train)
    selector = SelectFromModel(clf, prefit=True)
    X_train_sel = selector.transform(X_train)
    X_test_sel = selector.transform(X_test)

    return X_train_sel.todense(), Y_train, X_test_sel.todense()
Ejemplo n.º 3
0
def my_features():
    """Create the test feature files and return train/test matrices.

    Returns:
        X_train, Y_train: training design matrix and labels.
        X_test: sparse test design matrix (n_features fixed at 3190).
    """
    train_path = '../data/test/'
    deliverables_path = '../deliverables/'

    # Read raw test events and the event -> feature-id mapping.
    events = pd.read_csv(train_path + 'events.csv')
    feature_map = pd.read_csv(train_path + 'event_feature_map.csv')

    # Aggregate the event values for each patient (no mortality for test data).
    aggregated_events = etl.aggregate_events(events, None, feature_map,
                                             deliverables_path)

    # patient_features: patient_id -> list of (feature_id, feature_value).
    patient_features = {}
    for _, row in aggregated_events.iterrows():
        patient_features.setdefault(row['patient_id'], []).append(
            (row['feature_id'], row['feature_value']))

    # Build both output texts:
    #   line     -> "<patient_id> fid:value ..." (deliverable)
    #   line_svm -> "1 fid:value ..."            (SVMLight, dummy label 1)
    # BUGFIX: dict.iteritems() does not exist in Python 3; items() works.
    line = ''
    line_svm = ''
    for key, value in sorted(patient_features.items()):
        line += str(int(key)) + ' '
        line_svm += str(1) + ' '
        for feature_id, feature_value in sorted(value):
            pair = str(int(feature_id)) + ":" + format(feature_value,
                                                       '.6f') + ' '
            line += pair
            line_svm += pair
        line += '\n'
        line_svm += '\n'

    # BUGFIX: open in text mode ('w', not 'wb') — writing str to a
    # binary-mode file raises TypeError on Python 3.  'with' also guarantees
    # the data is flushed before load_svmlight_file reads it back.
    with open(deliverables_path + 'test_features.txt', 'w') as deliverable2:
        deliverable2.write(line)

    with open(deliverables_path + 'test_mymodel_features.train',
              'w') as svm_file:
        svm_file.write(line_svm)

    data_train = load_svmlight_file(deliverables_path +
                                    'test_mymodel_features.train',
                                    n_features=3190)
    X_test = data_train[0]
    print(X_test.shape)

    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")

    return X_train, Y_train, X_test
def my_features():
    """Build train features from the train split and test features from the test split.

    Returns:
        X_train, Y_train: training design matrix and labels.
        X_test: test design matrix loaded from the generated SVMLight file.
    """
    train_path = '../data/train/'
    test_path = '../data/test/'
    train_events = pd.read_csv(train_path + 'events.csv')
    train_mortality = pd.read_csv(train_path + 'mortality_events.csv')
    train_feature_map = pd.read_csv(train_path + 'event_feature_map.csv')

    test_events = pd.read_csv(test_path + 'events.csv')
    test_feature_map = pd.read_csv(test_path + 'event_feature_map.csv')

    # Training pipeline: full ETL, then save/reload through SVMLight format.
    patient_features, mortality = etl.create_features(train_events,
                                                      train_mortality,
                                                      train_feature_map)
    etl.save_svmlight(patient_features, mortality,
                      '../others/features_svmlight.train',
                      '../others/features.train')
    X_train, Y_train = utils.get_data_from_svmlight(
        "../others/features_svmlight.train")

    deliverables_path = '../others/'

    # NOTE(review): train_mortality is passed for the *test* aggregation and
    # merged onto test events below; test patients missing from the train
    # mortality file get label 0 via fillna — confirm this is intended.
    aggregated_events = etl.aggregate_events(
        test_events[['patient_id', 'event_id', 'value']], train_mortality,
        test_feature_map, deliverables_path)
    merged = pd.merge(test_events,
                      train_mortality,
                      on='patient_id',
                      suffixes=['_x', '_y'],
                      how='left')
    merged.fillna(0, inplace=True)
    test_patient_features = aggregated_events.groupby('patient_id')[[
        'feature_id', 'feature_value'
    ]].apply(lambda x: [tuple(x) for x in x.values]).to_dict()
    test_mortality = merged.groupby('patient_id')['label'].apply(
        lambda x: x.unique()[0]).to_dict()
    etl.save_svmlight(test_patient_features, test_mortality,
                      '../others/features_svmlight.test',
                      '../others/features.test')

    # Write the deliverable "<patient_id> fid:value ..." file.
    # BUGFIX: the original opened in 'wb' but wrote str objects, which raises
    # TypeError on Python 3, and never closed the handle.  Text mode plus
    # 'with' fixes both.  (Also removed the unused 'd1' accumulator.)
    with open('../deliverables/test_features.txt', 'w') as deliverable1:
        for pid in sorted(test_patient_features.keys()):
            deliverable1.write(str(int(pid)))
            for fid, fval in sorted(test_patient_features[pid]):
                deliverable1.write(' ' + str(int(fid)) + ':' + '%.6f' % (fval))
            deliverable1.write(' \n')

    X_test, Y_test = utils.get_data_from_svmlight(
        '../others/features_svmlight.test')

    return X_train, Y_train, X_test
Ejemplo n.º 5
0
def my_features():
    """Load training data and build the test feature matrix from test CSVs.

    Returns:
        X_train, Y_train: training design matrix and labels.
        X_test: sparse test design matrix (n_features fixed at 3190).
    """
    X_train, Y_train = utils.get_data_from_svmlight(
        '../deliverables/features_svmlight.train')

    deliverables_path = '../deliverables/'
    test_events = pd.read_csv('../data/test/events.csv')
    test_events_map = pd.read_csv('../data/test/event_feature_map.csv')

    # Aggregate per-patient feature values (no mortality info for test data).
    test_aggregated_events = etl.aggregate_events(test_events, None,
                                                  test_events_map,
                                                  deliverables_path)

    # patient_id -> [(feature_id, feature_value), ...]
    test_patient_features = test_aggregated_events.groupby('patient_id')[[
        'feature_id', 'feature_value'
    ]].apply(lambda g: list(map(tuple, g.values.tolist()))).to_dict()

    # Build both output texts:
    #   line_test -> "<patient_id> fid:value ..." (deliverable)
    #   line_svm  -> "1 fid:value ..."            (SVMLight, dummy label 1)
    line_svm = ''
    line_test = ''
    for key in sorted(test_patient_features):
        line_svm += '1 '
        line_test += str(int(key)) + ' '
        for tup in sorted(test_patient_features[key]):
            pair = str(int(tup[0])) + ':' + "{:.6f}".format(tup[1]) + ' '
            line_svm += pair
            line_test += pair
        line_svm += '\n'
        line_test += '\n'

    # BUGFIX: the original never closed these handles, so the SVMLight file
    # could still be buffered (empty/truncated) when load_svmlight_file read
    # it back.  'with' flushes and closes before the read below.
    with open(deliverables_path + 'test_features.txt',
              'wb') as test_featuresfile:
        test_featuresfile.write(bytes(line_test, 'UTF-8'))
    with open(deliverables_path + 'test_mymodel_svm.train',
              'wb') as test_svmlightfile:
        test_svmlightfile.write(bytes(line_svm, 'UTF-8'))  # Use 'UTF-8'

    test_data = load_svmlight_file(deliverables_path +
                                   'test_mymodel_svm.train',
                                   n_features=3190)
    X_test = test_data[0]

    return X_train, Y_train, X_test
Ejemplo n.º 6
0
def my_features():
    """Load training data; build, persist, and reload the test feature matrix.

    Returns:
        X_train, Y_train: training design matrix and labels.
        X_test: sparse test design matrix (n_features fixed at 3190).
    """
    X_train, Y_train = utils.get_data_from_svmlight(
        '../deliverables/features_svmlight.train')

    events_test = pd.read_csv('../data/test/events.csv')
    feature_map_test = pd.read_csv('../data/test/event_feature_map.csv')

    deliverables_path = '../deliverables/'
    # Aggregate per-patient feature values (no mortality info for test data).
    aggregated_events_test = etl.aggregate_events(events_test, None,
                                                  feature_map_test,
                                                  deliverables_path)

    # patient_id -> [(feature_id, feature_value), ...]
    patient_features_test = aggregated_events_test.groupby('patient_id')[[
        'feature_id', 'feature_value'
    ]]
    patient_features_test = patient_features_test.apply(
        lambda g: list(map(tuple, g.values.tolist()))).to_dict()

    op_file = deliverables_path + 'features_svmlight.test'
    op_deliverable = deliverables_path + 'test_features.txt'

    # line1 -> SVMLight rows with dummy label 1; line2 -> deliverable rows
    # keyed by patient id.
    line1 = line2 = ''
    for key in sorted(patient_features_test.keys()):
        line1 += '1 '
        line2 += str(int(key)) + ' '
        for value in sorted(patient_features_test[key]):
            pair = str(int(value[0])) + ':' + "{:.6f}".format(value[1]) + ' '
            line1 += pair
            line2 += pair
        line1 += '\n'
        line2 += '\n'

    # BUGFIX: the original left both handles open, so features_svmlight.test
    # could still be buffered (empty/truncated) when load_svmlight_file read
    # it back below.  'with' flushes and closes first.
    with open(op_file, 'wb') as deliverable1:
        deliverable1.write(bytes(line1, 'UTF-8'))  # Use 'UTF-8'
    with open(op_deliverable, 'wb') as deliverable2:
        deliverable2.write(bytes(line2, 'UTF-8'))

    X_test = load_svmlight_file(deliverables_path + 'features_svmlight.test',
                                n_features=3190)[0]
    return X_train, Y_train, X_test
Ejemplo n.º 7
0
def my_features():
    """Return training matrices plus the test matrix generated from test CSVs."""
    # Training features/labels come from the SVMLight file written by ETL.
    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")

    # Read the raw test events and the event -> feature-id map.
    events = pd.read_csv("../data/test/events.csv")
    feature_map = pd.read_csv("../data/test/event_feature_map.csv")

    # Aggregate test events (no mortality frame for the test split).
    aggregated_events = aggregate_events(
        events, None, feature_map, "../data/test/test_aggregated_events.csv")

    # Build and persist the per-patient feature file.
    patient_features = create_test_features(aggregated_events)
    save_test_features(patient_features, "../deliverables/test_features.txt")

    # Reload the file just written to obtain the test design matrix.
    X_test, patient_ids = utils.get_data_from_svmlight(
        "../deliverables/test_features.txt")

    return X_train, Y_train, X_test
Ejemplo n.º 8
0
def my_features(filepath_train, filepath_test):
    """Build train/test feature matrices with L1-based feature selection.

    Args:
        filepath_train: directory holding train events/mortality/feature map.
        filepath_test: directory holding test events and feature map.

    Returns:
        X_train (wide DataFrame, selected features), Y_train (labels indexed
        by patient_id), X_test (wide DataFrame of normalised test features).
    """
    events, mortality, feature_map = etl.read_csv(filepath_train)
    events_test = pd.read_csv(filepath_test + 'events.csv')
    feature_map_test = pd.read_csv(filepath_test + 'event_feature_map.csv')

    # BUGFIX: deliverables_path was referenced below but never defined
    # (NameError at runtime).  Use an empty path, consistent with the
    # calculate_index_date / filter_events calls that follow.
    deliverables_path = ''

    indx_date = etl.calculate_index_date(events, mortality, '')
    filtered_events = etl.filter_events(events, indx_date, '')
    aggregated_events = etl.aggregate_events(filtered_events, mortality,
                                             feature_map, deliverables_path)

    # Keep only features observed for at least `min_patients` patients.
    min_patients = 600
    feature_count = aggregated_events.groupby(by=['feature_id']).count()
    selected_features = list(
        feature_count[feature_count['patient_id'] >= min_patients].index)
    aggregated_events = aggregated_events[aggregated_events['feature_id'].isin(
        selected_features)]

    # Attach mortality labels to the aggregated training rows.
    df = aggregated_events.join(mortality.set_index('patient_id'),
                                on='patient_id',
                                lsuffix='',
                                rsuffix='_r')
    patient_features = df.set_index('patient_id')[[
        'feature_id', 'feature_value'
    ]].T.apply(tuple).to_frame()
    patient_features.columns = ['features']
    patient_features = patient_features.groupby(
        by=['patient_id'])['features'].apply(np.array)
    mortality = df.fillna(0).drop_duplicates().set_index(
        'patient_id')['label'].to_dict()
    # Wide matrix: one row per patient, one column per feature id.
    train_matrix = aggregated_events.pivot_table(
        index='patient_id', columns='feature_id',
        values='feature_value').fillna(0)
    train_labels = df[['patient_id', 'label']].fillna(0).drop_duplicates()

    # Test aggregation: sum DIAG/DRUG values, count LAB events, then
    # max-normalise per feature id.
    df_test = events_test.join(feature_map_test.set_index('event_id'),
                               on='event_id',
                               lsuffix='',
                               rsuffix='_r')
    sub_sum = df_test[df_test['event_id'].str.startswith((
        'DIAG', 'DRUG')) == True].groupby(by=['patient_id', 'idx']).sum()
    sub_count = df_test[df_test['event_id'].str.startswith((
        'LAB')) == True].groupby(by=['patient_id', 'idx']).count()
    sub_count = sub_count[['value']]
    agg_events = pd.concat([sub_sum, sub_count]).reset_index()
    agg_events.columns = ['patient_id', 'feature_id', 'feature_value']
    agg_events['feature_value'] = (
        agg_events['feature_value'] /
        agg_events.groupby(['feature_id'])['feature_value'].transform('max'))

    # L1-regularised logistic regression keeps only informative features.
    # NOTE(review): penalty='l1' needs a compatible solver ('liblinear' or
    # 'saga') on modern scikit-learn — confirm against the pinned version.
    X_train = train_matrix
    Y_train = train_labels.set_index('patient_id')
    clf = LogisticRegression(penalty='l1')
    clf.fit(X_train, Y_train)
    coef = clf.coef_
    selected_features = pd.DataFrame(coef,
                                     columns=X_train.columns).columns.delete(0)

    X_train = X_train[selected_features]

    agg_events = agg_events[agg_events['feature_id'].isin(
        selected_features)].fillna(0)
    patient_features_test = agg_events.set_index('patient_id')[[
        'feature_id', 'feature_value'
    ]].T.apply(tuple).to_frame()
    patient_features_test.columns = ['features']
    patient_features_test = patient_features_test.groupby(
        by=['patient_id'])['features'].apply(np.array)
    X_test = agg_events.pivot_table(index='patient_id',
                                    columns='feature_id',
                                    values='feature_value').fillna(0)

    # Write the deliverable file.  BUGFIX: the original leaked the handle;
    # 'with' guarantees flush + close.
    with open(
            'C:/Users/yyan/Downloads/homework1/deliverables/test_features.txt',
            'wb') as deliverable:
        for k in patient_features_test.keys():
            f_k = sorted(patient_features_test[k], key=lambda tup: tup[0])
            row = utils.bag_to_svmlight(f_k) + " " + "\n"
            row_with_id = str(k).replace('.0', "") + " " + row
            deliverable.write(bytes((row_with_id), 'UTF-8'))

    return X_train, Y_train, X_test