Esempio n. 1
0
def main():
    """Load the train/validation splits and report metrics for each model."""
    train_file = "../deliverables/features_svmlight.train"
    validate_file = "../data/features_svmlight.validate"
    X_train, Y_train = utils.get_data_from_svmlight(train_file)
    X_test, Y_test = utils.get_data_from_svmlight(validate_file)

    # Evaluate every classifier against the validation labels.
    models = (
        ("Logistic Regression", logistic_regression_pred),
        ("SVM", svm_pred),
        ("Decision Tree", decisionTree_pred),
    )
    for label, predictor in models:
        display_metrics(label, predictor(X_train, Y_train, X_test), Y_test)
Esempio n. 2
0
def my_features():
    """Build train/test matrices and reduce them with a feature-selection model.

    Loads the pre-built training features, regenerates test-set features from
    the raw test CSVs via etl helpers, then keeps only features selected by a
    GradientBoostingClassifier and returns dense matrices.
    """
    # Pre-computed training features/labels in svmlight format.
    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")
    filepath = '../data/test/'
    # Only the three columns the aggregation step needs.
    filtered_events = pd.read_csv(filepath + 'events.csv')[[
        'patient_id', 'event_id', 'value'
    ]]
    feature_map = pd.read_csv(filepath + 'event_feature_map.csv')
    # No mortality table for test data, hence None; '' disables intermediate output.
    aggregated_events = etl.aggregate_events(filtered_events, None,
                                             feature_map, '')
    # patient_id -> list of (feature_id, feature_value) tuples.
    patient_features = aggregated_events.groupby('patient_id')[[
        'feature_id', 'feature_value'
    ]].apply(lambda x: [tuple(x) for x in x.values]).to_dict()
    # NOTE(review): the "label" column is filled with the patient_id itself --
    # presumably a placeholder since test labels are unknown; confirm that
    # save_svmlight only uses it as a dummy.
    events_mortality = pd.DataFrame(aggregated_events['patient_id'])
    events_mortality['label'] = aggregated_events['patient_id']
    mortality = events_mortality.set_index('patient_id')['label'].to_dict()
    etl.save_svmlight(patient_features, mortality,
                      '../deliverables/test_features.txt',
                      '../deliverables/features.txt')
    # Reload test features with a fixed dimensionality to match training.
    X_test = load_svmlight_file('../deliverables/test_features.txt',
                                n_features=3190)[0]
    # Separate validation split (kept for the final return value).
    X_testt, Y_testt = utils.get_data_from_svmlight(
        "../data/features_svmlight.validate")
    # Fit a gradient-boosting model solely to rank feature importances,
    # then keep only the selected columns in all three matrices.
    clf = GradientBoostingClassifier()
    clf = clf.fit(X_train, Y_train)
    model = SelectFromModel(clf, prefit=True)
    X_train_n = model.transform(X_train)
    X_test_n = model.transform(X_test)
    X_testt_n = model.transform(X_testt)

    return X_train_n.todense(), Y_train, X_test_n.todense(), X_testt_n.todense(
    ), Y_testt
def main():
    """Train on the svmlight training split and score on the validation split."""
    X_train, Y_train = utils.get_data_from_svmlight("../deliverables/features_svmlight.train")
    X_test, Y_test = utils.get_data_from_svmlight("../data/features_svmlight.validate")

    for name, fn in (("Logistic Regression", logistic_regression_pred),
                     ("SVM", svm_pred),
                     ("Decision Tree", decisionTree_pred)):
        display_metrics(name, fn(X_train, Y_train, X_test), Y_test)
Esempio n. 4
0
def my_features():
    """Assemble the train and test feature matrices from svmlight files.

    Training features are read directly; test features are rebuilt from the
    raw events/feature-map CSVs, persisted, and reloaded.
    """
    # Pre-computed training features (labels come from the same file).
    X_train, Y_train = utils.get_data_from_svmlight(
        "C:/Users/Xiaojun/Desktop/omscs/CSE6250/hw1/deliverables/features_svmlight.train"
    )

    # Raw test events; timestamps parsed for any downstream date arithmetic.
    raw_events = pd.read_csv(
        r'C:\Users\Xiaojun\Desktop\omscs\CSE6250\hw1\data\test\events.csv')
    raw_events['timestamp'] = pd.to_datetime(raw_events['timestamp'])

    # event_id -> feature index mapping (columns: idx, event_id).
    fmap = pd.read_csv(
        r'C:\Users\Xiaojun\Desktop\omscs\CSE6250\hw1\data\test\event_feature_map.csv'
    )

    # Build, persist, then reload the test features as a sparse matrix.
    save_svmlight(create_features(raw_events, fmap))
    X_test, _unused_labels = utils.get_data_from_svmlight(
        'C:/Users/Xiaojun/Desktop/omscs/CSE6250/hw1/deliverables/test_features.txt'
    )

    return X_train, Y_train, X_test
Esempio n. 5
0
def main():

    for algo in [
            "logistic regression", "linear_svm", "decision_tree", "ada boost",
            "bagging logistic", "bagging_svm", "neural_network"
    ]:
        # for algo in ["logistic regression"]:
        X, Y = utils.get_data_from_svmlight(
            "../scala/output/withoutUPDRS.train", n_features=350)
        print
        print "Without UPDRS"
        print "Classifier:", algo, "__________"
        acc_k, auc_k, precision_k, recall_k = get_acc_auc_kfold(X, Y, algo)
        print acc_k
        print auc_k
        print precision_k
        print recall_k

        #        print "Average Accuracy in KFold CV: "+str(acc_k)
        #        print "Average AUC in KFold CV: "+str(auc_k)
        #        print "Average Precision Score in KFold CV: "+str(precision_k)
        #        print "Average Recall Score in KFold CV: "+str(recall_k)

        X, Y = utils.get_data_from_svmlight("../scala/output/withUPDRS.train",
                                            n_features=350)
        print "With UPDRS"
        print "Classifier:", algo, "__________"
        acc_k, auc_k, precision_k, recall_k = get_acc_auc_kfold(X, Y, algo)
        print acc_k
        print auc_k
        print precision_k
        print recall_k
Esempio n. 6
0
def my_features():
    """Load training features and regenerate test features from raw CSVs."""
    # Training matrix/labels from the pre-built svmlight file.
    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")

    # Test side: raw events -> aggregated events -> per-patient features.
    raw_events = pd.read_csv("../data/test/events.csv")
    event_map = pd.read_csv("../data/test/event_feature_map.csv")
    aggregated = aggregate_events(
        raw_events, None, event_map, "../data/test/test_aggregated_events.csv")
    features = create_test_features(aggregated)
    save_test_features(features, "../deliverables/test_features.txt")

    # Reload the freshly written file; its "labels" are the patient ids.
    X_test, _patient_ids = utils.get_data_from_svmlight(
        "../deliverables/test_features.txt")

    return X_train, Y_train, X_test
Esempio n. 7
0
def features():
    """Build post features (filter label, engagement, time buckets, LDA topics)
    and dump them to an svmlight training file.

    Returns:
        (X_train, Y_train, X_test) -- train and test are currently loaded from
        the same output file, so X_test duplicates the training matrix.
    """
    events = pd.read_csv('../data/' + 'LosAngeles.csv')
    feature_map = pd.read_csv('../data/' + 'filter_map.csv')
    result = pd.merge(left=events, right=feature_map, on='filter')
    # Positional `axis` in DataFrame.drop was removed in pandas 2.0; use the
    # explicit keyword form.
    result = result.drop('filter', axis=1)
    result = result.rename(columns={'idx': 'filter'})
    result = result[['filter', 'likes', 'comments', 'createdtime']]
    result = result.dropna()
    # Bucket the unix timestamp into time-of-day and season categories via the
    # module-level `day` and `month` lookup tables.
    result['timeofday'] = result['createdtime'].apply(lambda x: day[int(
        datetime.datetime.fromtimestamp(int(x)).strftime('%H'))])
    result['season'] = result['createdtime'].apply(lambda x: month[int(
        datetime.datetime.fromtimestamp(int(x)).strftime('%m'))])
    result = result.drop('createdtime', axis=1)

    print(result)

    # LDA topic features derived from the post tags.
    train = pd.read_csv("../data/LosAngeles.csv", delimiter=",")
    data_samples = train.tags
    topics = topic_modelling.topic_LDA(data_samples)

    agg = pd.concat([result, topics], axis=1, join='inner')
    agg = agg[-agg['filter'].isin([26])]  # drop posts with filter id 26
    agg = agg.sample(frac=0.2, replace=True)  # bootstrap a 20% sample

    # `.ix` was removed from pandas; the first column is the 'filter' label.
    fil = agg.iloc[:, 0]
    agg = agg.drop('filter', axis=1)
    dump_svmlight_file(agg, fil, "../data/output.train")

    X_train, Y_train = utils.get_data_from_svmlight("../data/output.train")
    X_test, Y_test = utils.get_data_from_svmlight("../data/output.train")

    return X_train, Y_train, X_test
Esempio n. 8
0
def main():
    """Train three classifiers and report their validation metrics."""
    X_train, Y_train = utils.get_data_from_svmlight("../deliverables/features_svmlight.train")
    X_test, Y_test = utils.get_data_from_svmlight("../data/features_svmlight.validate")

    classifiers = (
        ("Logistic Regression", logistic_regression_pred),
        ("SVM", svm_pred),
        ("Decision Tree", decisionTree_pred),
    )
    for title, predict in classifiers:
        display_metrics(title, predict(X_train, Y_train, X_test), Y_test)
Esempio n. 9
0
def main():
    """Generate test features, train a classifier, and write the submission."""
    test_events = pd.read_csv('../data/test/events.csv')
    feature_map = pd.read_csv('../data/test/event_feature_map.csv')
    my_features(test_events, feature_map)

    X_train, Y_train = utils.get_data_from_svmlight("../deliverables/features_svmlight.train")
    X_test, Y_test = utils.get_data_from_svmlight("../deliverables/test_features.train")

    predictions = my_classifier_predictions(X_train, Y_train, X_test)
    utils.generate_submission("../deliverables/test_features.txt", predictions)
def main():
    """Load the train/validation svmlight splits and score each model."""
    train_path = "../deliverables/features_svmlight.train"
    validate_path = "../data/features_svmlight.validate"
    X_train, Y_train = utils.get_data_from_svmlight(train_path)
    X_test, Y_test = utils.get_data_from_svmlight(validate_path)

    display_metrics("Logistic Regression",
                    logistic_regression_pred(X_train, Y_train, X_test), Y_test)
    display_metrics("SVM", svm_pred(X_train, Y_train, X_test), Y_test)
    display_metrics("Decision Tree",
                    decisionTree_pred(X_train, Y_train, X_test), Y_test)
Esempio n. 11
0
def main():
    X_train, Y_train = utils.get_data_from_svmlight("../deliverables/features_svmlight.train")
    X_test, Y_test = utils.get_data_from_svmlight("../data/test/feature_svmlight.test")

    X_train, Y_train, X_test = my_features()
    Y_pred = my_classifier_predictions(X_train, Y_train, X_test)

    display_metrics("LinearSVC", Y_pred, Y_test)
    print Y_pred
Esempio n. 12
0
def my_features():
    """Build train/test matrices, re-deriving test features from raw events.

    NOTE(review): uses DataFrame.ix (removed in modern pandas) and writes str
    objects to a file opened in 'wb' mode, so this targets Python 2 with an
    old pandas -- confirm before porting.
    """
    # Training matrix/labels from the pre-built svmlight file.
    X_train, Y_train = utils.get_data_from_svmlight("../deliverables/features_svmlight.train")
    test_events = pd.read_csv("../data/test/events.csv")
    feature_map = pd.read_csv("../data/test/event_feature_map.csv")

    # Join events to the feature map and keep only rows with a value.
    event = pd.merge(test_events, feature_map,on = "event_id",how="left").dropna(subset=["value"])
    # Swap columns 1 and 5 so the frame is ordered as the pipeline expects.
    cols = list(event)
    cols[5], cols[1] = cols[1], cols[5]
    event = event.ix[:,cols]

    event = event.drop(["event_id"],axis=1)

    # Count events per (patient, feature index).
    aggregated_events = event.groupby(["patient_id", "idx"])[["value"]].count().reset_index()

    # Normalize each count by the per-feature maximum (min-max to (0, 1]).
    aggregated_events["max_value_by_feature"] = aggregated_events.groupby(
        "idx")["value"].transform(max)
    aggregated_events["normalized_value"] = aggregated_events["value"].divide(
        aggregated_events["max_value_by_feature"], 0)

    aggregated_events = aggregated_events.rename(columns={"normalized_value": "feature_value", "idx": "feature_id"})

    # patient_id -> list of (feature_id, feature_value) tuples.
    patient_features = {k: list(map(tuple, aggregated_events[["feature_id", "feature_value"]].values)) for k , aggregated_events in aggregated_events.groupby('patient_id')}

    # File 1: svmlight-format test file with dummy label 0 on every row.
    deliverable1 = open('../data/test/feature_svmlight.test', 'wb')

    for key in patient_features:
        w_str = '0 '

        for tuples in patient_features[key]:
            w_str = w_str + str(int(tuples[0])) + ':' + str("{:.4f}".format(tuples[1])) + ' '

        deliverable1.write(w_str)
        deliverable1.write('\n')

    # File 2: deliverable keyed by patient id instead of a dummy label.
    deliverable2 = open('../deliverables/test_features.txt', 'w')

    for key in patient_features:
        w_str = str(int(key)) + ' '

        for tuples in patient_features[key]:
            w_str = w_str + str(int(tuples[0])) + ':' + str("{:.4f}".format(tuples[1])) + ' '

        deliverable2.write(w_str)
        deliverable2.write('\n')

    deliverable1.close()
    deliverable2.close()

    # Reload the svmlight file just written (labels are the dummy zeros).
    X_test, Y_test = utils.get_data_from_svmlight("../data/test/feature_svmlight.test")

    return X_train, Y_train, X_test
def my_features():
    """Rebuild features for both train and test data via the etl helpers.

    NOTE(review): writes str objects to a file opened in 'wb' mode, so this
    path only works on Python 2 -- confirm before porting.
    """
    train_path = '../data/train/'
    test_path = '../data/test/'
    train_events = pd.read_csv(train_path + 'events.csv')
    train_mortality = pd.read_csv(train_path + 'mortality_events.csv')
    train_feature_map = pd.read_csv(train_path + 'event_feature_map.csv')

    test_events = pd.read_csv(test_path + 'events.csv')
    test_feature_map = pd.read_csv(test_path + 'event_feature_map.csv')

    # Training side: full etl pipeline, persisted to ../others/ and reloaded.
    patient_features, mortality = etl.create_features(train_events,
                                                      train_mortality,
                                                      train_feature_map)
    etl.save_svmlight(patient_features, mortality,
                      '../others/features_svmlight.train',
                      '../others/features.train')
    X_train, Y_train = utils.get_data_from_svmlight(
        "../others/features_svmlight.train")

    deliverables_path = '../others/'

    # Test side: aggregate the raw events (train mortality is passed through
    # but test patients have no labels of their own).
    aggregated_events = etl.aggregate_events(
        test_events[['patient_id', 'event_id', 'value']], train_mortality,
        test_feature_map, deliverables_path)
    # Left-join mortality onto test events; patients with no match get label 0.
    merged = pd.merge(test_events,
                      train_mortality,
                      on='patient_id',
                      suffixes=['_x', '_y'],
                      how='left')
    merged.fillna(0, inplace=True)
    # patient_id -> list of (feature_id, feature_value) tuples.
    test_patient_features = aggregated_events.groupby('patient_id')[[
        'feature_id', 'feature_value'
    ]].apply(lambda x: [tuple(x) for x in x.values]).to_dict()
    # One label per patient (labels are constant within a patient group).
    test_mortality = merged.groupby('patient_id')['label'].apply(
        lambda x: x.unique()[0]).to_dict()
    etl.save_svmlight(test_patient_features, test_mortality,
                      '../others/features_svmlight.test',
                      '../others/features.test')

    # Deliverable: one line per patient, features sorted by id, %.6f values.
    deliverable1 = open('../deliverables/test_features.txt', 'wb')

    sorted_keys = sorted(test_patient_features.keys())
    d1 = ''  # NOTE(review): unused accumulator, kept for byte-compatibility
    for i in sorted_keys:
        deliverable1.write(str(int(i)))
        others = sorted(test_patient_features[i])
        for j in others:
            deliverable1.write(' ' + str(int(j[0])) + ':' + '%.6f' % (j[1]))
        deliverable1.write(' \n')

    X_test, Y_test = utils.get_data_from_svmlight(
        '../others/features_svmlight.test')

    return X_train, Y_train, X_test
Esempio n. 14
0
def main():
    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")
    X_test, Y_test = utils.get_data_from_svmlight(
        "../data/test/feature_svmlight.test")

    X_train, Y_train, X_test = my_features()
    Y_pred = my_classifier_predictions(X_train, Y_train, X_test)

    display_metrics("LinearSVC", Y_pred, Y_test)
    print Y_pred
Esempio n. 15
0
def my_features():
    """Load training features; rebuild and reload the test-set features."""
    X_train, Y_train = utils.get_data_from_svmlight("../deliverables/features_svmlight.train")

    test_dir = '../data/test/'
    events_test = pd.read_csv(test_dir + 'events.csv')
    feature_map_test = pd.read_csv(test_dir + 'event_feature_map.csv')

    # Modified etl.py helpers construct and persist X_test in svmlight form.
    save_svmlight(create_features(events_test, feature_map_test))
    X_test, _ = utils.get_data_from_svmlight("../data/test/features_svmlight.test")

    return X_train, Y_train, X_test
Esempio n. 16
0
def load_SVMLight_data(train_data, test_data):
    """Read train and test sets stored in SVMLight format.

    Args:
        train_data: path to the training svmlight file.
        test_data: path to the testing svmlight file.

    Returns:
        Tuple of (X_train, Y_train, X_test, Y_test).
    """
    train = utils.get_data_from_svmlight(train_data)
    test = utils.get_data_from_svmlight(test_data)
    return train[0], train[1], test[0], test[1]
def my_features():
    """Load train features and regenerate the test-set svmlight file."""
    X_train, Y_train = utils.get_data_from_svmlight("../deliverables/features_svmlight.train")

    # Rebuild the test features from the raw CSV inputs.
    raw_events = pd.read_csv('../data/test/events.csv')
    fmap = pd.read_csv('../data/test/event_feature_map.csv')
    aggregate_test_events(raw_events, fmap)

    X_test, _unused_labels = utils.get_data_from_svmlight("../data/test/feature_svmlight.test")

    return X_train, Y_train, X_test
def main():
    """Train, write the submission file, and report CV metrics."""
    X_train, Y_train = utils.get_data_from_svmlight("../deliverables/features_svmlight.train")
    Y_pred = my_classifier_predictions(X_train, Y_train)
    utils.generate_submission("../deliverables/features.train", Y_pred)
    # generate_submission saves (patient_id, predicted label) rows as
    # "my_predictions.csv" in the deliverables folder.

    X, Y = utils.get_data_from_svmlight("../deliverables/features_svmlight.train")
    print("Classifier: Decision Tree Regressor__________")
    kfold_acc, kfold_auc = get_acc_auc_kfold(X, Y)
    print("Average Accuracy in KFold CV: " + str(kfold_acc))
    print("Average AUC in KFold CV: " + str(kfold_auc))
    rand_acc, rand_auc = get_acc_auc_randomisedCV(X, Y)
    print("Average Accuracy in Randomised CV: " + str(rand_acc))
    print("Average AUC in Randomised CV: " + str(rand_auc))
Esempio n. 19
0
def features():
    """Load the pre-generated train/test svmlight splits.

    Returns:
        Tuple of (X_train, Y_train, X_test, Y_test).
    """
    train = utils.get_data_from_svmlight(
        "../data/training/part-r-00000")
    test = utils.get_data_from_svmlight(
        "../data/testing/part-r-00000")
    return train[0], train[1], test[0], test[1]
Esempio n. 20
0
def my_features():
    """Build the train/test matrices, regenerating test-side features first."""
    train_file = "../deliverables/features_svmlight.train"
    X_train, Y_train = utils.get_data_from_svmlight(train_file)

    # Derive the test svmlight file from the raw events and feature map.
    aggregate_test_events(pd.read_csv('../data/test/events.csv'),
                          pd.read_csv('../data/test/event_feature_map.csv'))

    X_test, _ = utils.get_data_from_svmlight(
        "../data/test/feature_svmlight.test")

    return X_train, Y_train, X_test
Esempio n. 21
0
def my_features():
    """Replicate the etl.py pipeline on the test data (first pass)."""
    test_dir = '../data/test'
    raw_events = pd.read_csv(os.path.join(test_dir, 'events.csv'))
    fmap = pd.read_csv(os.path.join(test_dir, 'event_feature_map.csv'))
    write_features(raw_events, fmap)

    X_train, Y_train = utils.get_data_from_svmlight("../deliverables/features_svmlight.train")
    # Validation split intentionally skipped here; only test features needed.
    X_test, _ = utils.get_data_from_svmlight("../deliverables/test_features.txt")

    return X_train, Y_train, X_test
def main():
	X,Y = utils.get_data_from_svmlight("../data/output.train")
	print "Classifier: Logistic Regression__________"
	acc_k = get_acc_auc_kfold(X,Y)
	print "Average Accuracy in KFold CV: "+str(acc_k)
	acc_r = get_acc_auc_randomisedCV(X,Y)
	print "Average Accuracy in Randomised CV: "+str(acc_r)
Esempio n. 23
0
def main():
    """Baseline evaluation on the full train/test splits (no tuning)."""
    X_train, Y_train = utils.get_data_from_svmlight(
        "../data/training/part-r-00000")
    X_test, Y_test = utils.get_data_from_svmlight(
        "../data/testing/part-r-00000")

    print(
        "Baseline Using Full Training/Test Sets and No Parameter Optimization")
    print("_____________________________________________________________")
    print("Dimensions of Training Set", X_train.shape)
    print("Dimensions of Testing Set", X_test.shape)

    for name, fn in (("Logistic Regression", logistic_regression_pred),
                     ("SVM", svm_pred),
                     ("Decision Tree", decisionTree_pred)):
        display_metrics(name, fn(X_train, Y_train, X_test), Y_test)
Esempio n. 24
0
def main():
	X,Y = utils.get_data_from_svmlight('../deliverables/features_svmlight.train')
	print "Classifier: Logistic Regression__________"
	acc_k,auc_k = get_acc_auc_kfold(X,Y)
	print "Average Accuracy in KFold CV: "+str(acc_k)
	print "Average AUC in KFold CV: "+str(auc_k)
	acc_r,auc_r = get_acc_auc_randomisedCV(X,Y)
	print "Average Accuracy in Randomised CV: "+str(acc_r)
	print "Average AUC in Randomised CV: "+str(auc_r)
Esempio n. 25
0
def main():
    """Report K-fold and randomised CV metrics for logistic regression."""
    X, Y = utils.get_data_from_svmlight("../deliverables/features_svmlight.train")
    print("Classifier: Logistic Regression__________")
    kfold_acc, kfold_auc = get_acc_auc_kfold(X, Y)
    print("Average Accuracy in KFold CV: " + str(kfold_acc))
    print("Average AUC in KFold CV: " + str(kfold_auc))
    rand_acc, rand_auc = get_acc_auc_randomisedCV(X, Y)
    print("Average Accuracy in Randomised CV: " + str(rand_acc))
    print("Average AUC in Randomised CV: " + str(rand_auc))
Esempio n. 26
0
def my_features():
    """Build train/test matrices, faking a mortality table so the labeled-data
    etl helpers can be reused on the unlabeled test set.

    Returns:
        (X_train, Y_train, X_test)
    """
    X_train, Y_train = utils.get_data_from_svmlight("../deliverables/features_svmlight.train")
    test_path = '../data/test/'

    events = pd.read_csv(os.path.join(test_path) + 'events.csv', parse_dates=['timestamp'])
    feature_map = pd.read_csv(os.path.join(test_path) + 'event_feature_map.csv')

    # Fake mortality frame: two rows with a dummy patient id, so etl helpers
    # that require a mortality table still run.  DataFrame.ix and set_value
    # were removed from pandas; loc/at are the supported equivalents
    # (.loc[0:1] keeps rows labeled 0 and 1, matching the old .ix[0:1, :]).
    dead = pd.read_csv(os.path.join('../data/train/') + 'mortality_events.csv',
                       parse_dates=['timestamp']).loc[0:1, :]
    dead.at[0, 'patient_id'] = 123456789
    dead.at[1, 'patient_id'] = 123456789

    # Run the feature pipeline on the test events.
    deliverable_path = '../data/test/'
    indx_date = etl.calculate_index_date(events, dead, deliverable_path)
    filtered_events = etl.filter_events(events, indx_date, deliverable_path)
    patient_features, mortality_fake = etl.create_features(events, dead, feature_map)

    # Real test labels are unknown: give every patient label 0.
    mortality = {key: 0 for key in patient_features}

    op_file = '../deliverables/features_svmlight.test'
    op_deliverable = '../deliverables/test_features.txt'
    etl.save_svmlight(patient_features, mortality, op_file, op_deliverable)

    X_test, Y_test = utils.get_data_from_svmlight('../deliverables/features_svmlight.test')

    return X_train, Y_train, X_test
Esempio n. 27
0
def my_features():
    """Aggregate test events into per-patient features and persist them.

    NOTE(review): uses dict.iteritems() and writes str to files opened in
    'wb' mode, so this function only runs on Python 2 -- confirm before
    porting.
    """
    train_path = '../data/test/'
    deliverables_path = '../deliverables/'
    # Load the raw test events and the event -> feature-index map.
    events = pd.read_csv(train_path + 'events.csv')
    feature_map = pd.read_csv(train_path + 'event_feature_map.csv')
    # Aggregate the event values for each patient (no mortality table).
    aggregated_events = etl.aggregate_events(events, None, feature_map,
                                             deliverables_path)
    '''
    TODO: Complete the code below by creating two dictionaries -
    1. patient_features :  Key - patient_id and value is array of tuples(feature_id, feature_value)
    2. mortality : Key - patient_id and value is mortality label
    '''
    # patient_id -> list of (feature_id, feature_value), built row by row.
    patient_features = {}
    for index, row in aggregated_events.iterrows():
        if not patient_features.get(row['patient_id']):

            patient_features[row['patient_id']] = [(row['feature_id'],
                                                    row['feature_value'])]
        else:
            patient_features[row['patient_id']].append(
                (row['feature_id'], row['feature_value']))

    # Two parallel text bodies: `line` keyed by patient id (deliverable) and
    # `line_svm` with a constant dummy label 1 (svmlight input).
    line = ''
    line_svm = ''
    for key, value in sorted(patient_features.iteritems()):
        line += str(int(key)) + ' '
        line_svm += str(1) + ' '
        value = sorted(value)
        for item in value:
            line += str(int(item[0])) + ":" + str(format(item[1], '.6f')) + ' '
            line_svm += str(int(item[0])) + ":" + str(format(item[1],
                                                             '.6f')) + ' '
        line += '\n'
        line_svm += '\n'

    deliverable2 = open(deliverables_path + 'test_features.txt', 'wb')
    deliverable2.write(line)
    deliverable2.close()

    svm_file = open(deliverables_path + 'test_mymodel_features.train', 'wb')
    svm_file.write(line_svm)
    svm_file.close()

    # Reload the svmlight file with fixed dimensionality to match training.
    data_train = load_svmlight_file(deliverables_path +
                                    'test_mymodel_features.train',
                                    n_features=3190)
    X_test = data_train[0]
    print(X_test.shape)

    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")

    return X_train, Y_train, X_test
Esempio n. 28
0
def main():
    """Train each classifier and report metrics on the training labels."""
    X_train, Y_train = utils.get_data_from_svmlight(
        "C:/users/yyan/Downloads/homework1/deliverables/features_svmlight.train"
    )

    # NOTE: metrics are computed in-sample (train == evaluation set).
    for name, fn in (("Logistic Regression", logistic_regression_pred),
                     ("SVM", svm_pred),
                     ("Decision Tree", decisionTree_pred)):
        display_metrics(name, fn(X_train, Y_train), Y_train)
Esempio n. 29
0
def main():
    """Report K-fold and randomised CV metrics for logistic regression."""
    X, Y = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")
    print("Classifier: Logistic Regression__________")
    kfold_acc, kfold_auc = get_acc_auc_kfold(X, Y)
    print("Average Accuracy in KFold CV: " + str(kfold_acc))
    print("Average AUC in KFold CV: " + str(kfold_auc))
    rand_acc, rand_auc = get_acc_auc_randomisedCV(X, Y)
    print("Average Accuracy in Randomised CV: " + str(rand_acc))
    print("Average AUC in Randomised CV: " + str(rand_auc))
def main():
    """Train, write the submission file, and report validation AUC."""
    X_train, Y_train, X_test = my_features()
    Y_pred = my_classifier_predictions(X_train, Y_train, X_test)
    utils.generate_submission("../deliverables/test_features.txt", Y_pred)
    # generate_submission writes (patient_id, predicted label) rows to
    # "my_predictions.csv" in the deliverables folder.

    X_traintest, Y_traintest = utils.get_data_from_svmlight(
        "../data/features_svmlight.validate")
    Y_trainpred = my_classifier_predictions(X_train, Y_train, X_traintest)
    auc = roc_auc_score(Y_traintest, Y_trainpred)
    # BUG FIX: the AUC was computed and then silently discarded; report it.
    print("Validation AUC: " + str(auc))
Esempio n. 31
0
def main():
    """Evaluate several classifiers on the feature data and draw ROC curves."""
    X, Y = utils.get_data_from_svmlight("data/allfeature.data")

    # BUG FIX: the original list literal contained a stray double comma
    # (`'SVM',,'SGDClassifier'`), which is a SyntaxError.
    # NOTE(review): 'RFC' appears twice -- confirm whether the second entry
    # was meant to be a different classifier.
    classifierName = ['Logistic Regression', 'SVM', 'SGDClassifier',
                      'adaboost', 'Decision Tree', 'RFC', 'GBC', 'RFC']
    for clfname in classifierName:
        display_metrics(clfname, X, Y)
    drawrocdt(X, Y)
    drawroclr(X, Y)
    drawrocada(X, Y)
    drawrocrfc(X, Y)
Esempio n. 32
0
def main():
    """Check model performance in-sample on the training set."""
    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")

    # Each model is trained and evaluated on the same training split.
    for name, fn in (("Logistic Regression", logistic_regression_pred),
                     ("SVM", svm_pred),
                     ("Decision Tree", decisionTree_pred)):
        display_metrics(name, fn(X_train, Y_train, X_train), Y_train)
Esempio n. 33
0
def my_features():
    """Return the training matrix/labels plus a rebuilt test matrix."""
    # Training side: already materialised as an svmlight file.
    X_train, Y_train = utils.get_data_from_svmlight("../deliverables/features_svmlight.train")

    # Test side: aggregate raw events, derive per-patient features, persist.
    events = pd.read_csv("../data/test/events.csv")
    feature_map = pd.read_csv("../data/test/event_feature_map.csv")
    aggregated = aggregate_events(events, None, feature_map,
                                  "../data/test/test_aggregated_events.csv")
    save_test_features(create_test_features(aggregated),
                       "../deliverables/test_features.txt")

    # Reload what was just written; the "labels" are really patient ids.
    X_test, patient_ids = utils.get_data_from_svmlight("../deliverables/test_features.txt")

    return X_train, Y_train, X_test
Esempio n. 34
0
def my_features():
    """Build the train matrix and a regenerated test matrix.

    Test features are aggregated from the raw CSVs, written both as the
    deliverable (keyed by patient id) and as an svmlight file (dummy label 1),
    then reloaded with a fixed dimensionality.
    """
    X_train, Y_train = utils.get_data_from_svmlight(
        '../deliverables/features_svmlight.train')

    deliverables_path = '../deliverables/'
    test_events = pd.read_csv('../data/test/events.csv')
    test_events_map = pd.read_csv('../data/test/event_feature_map.csv')

    test_aggregated_events = etl.aggregate_events(test_events, None,
                                                  test_events_map,
                                                  deliverables_path)

    # patient_id -> list of (feature_id, feature_value) tuples.
    test_patient_features = test_aggregated_events.groupby('patient_id')[[
        'feature_id', 'feature_value'
    ]].apply(lambda g: list(map(tuple, g.values.tolist()))).to_dict()

    # Assemble both file bodies in memory, patients and features sorted.
    line_svm = ''
    line_test = ''
    for key in sorted(test_patient_features):
        line_svm += '1 '
        line_test += str(int(key)) + ' '
        for tup in sorted(test_patient_features[key]):
            entry = str(int(tup[0])) + ':' + str("{:.6f}".format(tup[1])) + ' '
            line_svm += entry
            line_test += entry
        line_svm += '\n'
        line_test += '\n'

    # BUG FIX: the files were opened but never closed, so buffered bytes
    # could still be unflushed when load_svmlight_file read the file below.
    # Context managers guarantee the data is on disk first.
    with open(deliverables_path + 'test_features.txt', 'wb') as test_featuresfile:
        test_featuresfile.write(bytes(line_test, 'UTF-8'))  # Use 'UTF-8'
    with open(deliverables_path + 'test_mymodel_svm.train', 'wb') as test_svmlightfile:
        test_svmlightfile.write(bytes(line_svm, 'UTF-8'))

    test_data = load_svmlight_file(deliverables_path +
                                   'test_mymodel_svm.train',
                                   n_features=3190)
    X_test = test_data[0]

    return X_train, Y_train, X_test
Esempio n. 35
0
def my_features():
    """Build the train matrix and a regenerated test matrix.

    Writes the test features both as an svmlight file (dummy label 1) and as
    the deliverable keyed by patient id, then reloads the svmlight file with
    a fixed dimensionality of 3190.
    """
    X_train, Y_train = utils.get_data_from_svmlight(
        '../deliverables/features_svmlight.train')

    events_test = pd.read_csv('../data/test/events.csv')
    feature_map_test = pd.read_csv('../data/test/event_feature_map.csv')

    deliverables_path = '../deliverables/'
    aggregated_events_test = etl.aggregate_events(events_test, None,
                                                  feature_map_test,
                                                  deliverables_path)

    # patient_id -> list of (feature_id, feature_value) tuples.
    patient_features_test = aggregated_events_test.groupby('patient_id')[[
        'feature_id', 'feature_value'
    ]]
    patient_features_test = patient_features_test.apply(
        lambda g: list(map(tuple, g.values.tolist()))).to_dict()

    op_file = deliverables_path + 'features_svmlight.test'
    op_deliverable = deliverables_path + 'test_features.txt'

    # Assemble both bodies in memory, patients and features sorted.
    line1 = line2 = ''
    for key in sorted(patient_features_test.keys()):
        line1 += '1 '
        line2 += str(int(key)) + ' '
        for value in sorted(patient_features_test[key]):
            line1 += str(int(value[0])) + ':' + str("{:.6f}".format(
                value[1])) + ' '
            line2 += str(int(value[0])) + ':' + str("{:.6f}".format(
                value[1])) + ' '
        line1 += '\n'
        line2 += '\n'

    # BUG FIX: the files were opened but never closed, so buffered bytes
    # could still be unflushed when load_svmlight_file read
    # features_svmlight.test below.  Context managers flush and close first.
    with open(op_file, 'wb') as deliverable1, \
            open(op_deliverable, 'wb') as deliverable2:
        deliverable1.write(bytes(line1, 'UTF-8'))  # Use 'UTF-8'
        deliverable2.write(bytes(line2, 'UTF-8'))

    X_test = load_svmlight_file(deliverables_path + 'features_svmlight.test',
                                n_features=3190)[0]
    return X_train, Y_train, X_test