def main():
    """Load train/validation SVMLight data and report metrics for three classifiers."""
    X_train, Y_train = utils.get_data_from_svmlight("../deliverables/features_svmlight.train")
    X_test, Y_test = utils.get_data_from_svmlight("../data/features_svmlight.validate")
    classifiers = [
        ("Logistic Regression", logistic_regression_pred),
        ("SVM", svm_pred),
        ("Decision Tree", decisionTree_pred),
    ]
    for label, predictor in classifiers:
        display_metrics(label, predictor(X_train, Y_train, X_test), Y_test)
def my_features():
    """Build test features from raw event CSVs, select features with a GBM,
    and return dense matrices.

    Returns:
        (X_train, Y_train, X_test, X_validate, Y_validate) where all X
        matrices have been reduced to the features kept by SelectFromModel.
    """
    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")

    test_dir = '../data/test/'
    filtered_events = pd.read_csv(test_dir + 'events.csv')[
        ['patient_id', 'event_id', 'value']]
    feature_map = pd.read_csv(test_dir + 'event_feature_map.csv')

    aggregated_events = etl.aggregate_events(filtered_events, None,
                                             feature_map, '')

    # patient_id -> [(feature_id, feature_value), ...]
    grouped = aggregated_events.groupby('patient_id')[['feature_id',
                                                       'feature_value']]
    patient_features = grouped.apply(
        lambda g: [tuple(row) for row in g.values]).to_dict()

    # Test data has no real labels: the patient_id itself is stored in the
    # "label" slot as a placeholder before dumping to SVMLight format.
    events_mortality = pd.DataFrame(aggregated_events['patient_id'])
    events_mortality['label'] = aggregated_events['patient_id']
    mortality = events_mortality.set_index('patient_id')['label'].to_dict()

    etl.save_svmlight(patient_features, mortality,
                      '../deliverables/test_features.txt',
                      '../deliverables/features.txt')

    X_test = load_svmlight_file('../deliverables/test_features.txt',
                                n_features=3190)[0]
    X_testt, Y_testt = utils.get_data_from_svmlight(
        "../data/features_svmlight.validate")

    # Fit a gradient-boosting model on the training data and keep only the
    # features it ranks as important.
    clf = GradientBoostingClassifier()
    clf = clf.fit(X_train, Y_train)
    selector = SelectFromModel(clf, prefit=True)
    X_train_n = selector.transform(X_train)
    X_test_n = selector.transform(X_test)
    X_testt_n = selector.transform(X_testt)

    return (X_train_n.todense(), Y_train, X_test_n.todense(),
            X_testt_n.todense(), Y_testt)
def main():
    """Compare three classifiers trained on the SVMLight training split."""
    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")
    X_test, Y_test = utils.get_data_from_svmlight(
        "../data/features_svmlight.validate")

    lr_predictions = logistic_regression_pred(X_train, Y_train, X_test)
    display_metrics("Logistic Regression", lr_predictions, Y_test)

    svm_predictions = svm_pred(X_train, Y_train, X_test)
    display_metrics("SVM", svm_predictions, Y_test)

    dt_predictions = decisionTree_pred(X_train, Y_train, X_test)
    display_metrics("Decision Tree", dt_predictions, Y_test)
def my_features():
    """Load training data and build test features from raw CSV events.

    Returns:
        (X_train, Y_train, X_test).
    """
    X_train, Y_train = utils.get_data_from_svmlight(
        "C:/Users/Xiaojun/Desktop/omscs/CSE6250/hw1/deliverables/features_svmlight.train"
    )
    events = pd.read_csv(
        r'C:\Users\Xiaojun\Desktop\omscs\CSE6250\hw1\data\test\events.csv')
    events['timestamp'] = pd.to_datetime(events['timestamp'])

    # Columns in event_feature_map.csv: idx, event_id
    feature_map = pd.read_csv(
        r'C:\Users\Xiaojun\Desktop\omscs\CSE6250\hw1\data\test\event_feature_map.csv'
    )

    # Side effect: writes the test_features.txt file read back below.
    patient_features = create_features(events, feature_map)
    save_svmlight(patient_features)

    X_test, _ = utils.get_data_from_svmlight(
        'C:/Users/Xiaojun/Desktop/omscs/CSE6250/hw1/deliverables/test_features.txt'
    )
    return X_train, Y_train, X_test
def main():
    """Run KFold CV for several classifiers on data with and without UPDRS.

    Fix: Python-2 print statements (a SyntaxError under Python 3, which the
    rest of this file's print() calls target) converted to print() calls.
    """
    algos = ["logistic regression", "linear_svm", "decision_tree",
             "ada boost", "bagging logistic", "bagging_svm", "neural_network"]
    for algo in algos:
        X, Y = utils.get_data_from_svmlight(
            "../scala/output/withoutUPDRS.train", n_features=350)
        print()
        print("Without UPDRS")
        print("Classifier:", algo, "__________")
        acc_k, auc_k, precision_k, recall_k = get_acc_auc_kfold(X, Y, algo)
        print(acc_k)
        print(auc_k)
        print(precision_k)
        print(recall_k)

        X, Y = utils.get_data_from_svmlight("../scala/output/withUPDRS.train",
                                            n_features=350)
        print("With UPDRS")
        print("Classifier:", algo, "__________")
        acc_k, auc_k, precision_k, recall_k = get_acc_auc_kfold(X, Y, algo)
        print(acc_k)
        print(auc_k)
        print(precision_k)
        print(recall_k)
def my_features():
    """Return (X_train, Y_train, X_test) built from SVMLight and test CSVs."""
    # Training data comes straight from the pre-built SVMLight file.
    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")

    # Raw test events plus the event-id -> feature-id map.
    events = pd.read_csv("../data/test/events.csv")
    feature_map = pd.read_csv("../data/test/event_feature_map.csv")

    # No mortality information exists for the test split, hence None.
    aggregated_events = aggregate_events(
        events, None, feature_map, "../data/test/test_aggregated_events.csv")

    # Build and persist the per-patient feature representation.
    patient_feautures = create_test_features(aggregated_events)
    save_test_features(patient_feautures, "../deliverables/test_features.txt")

    # Reload the file just written to obtain the sparse test matrix.
    X_test, patient_ids = utils.get_data_from_svmlight(
        "../deliverables/test_features.txt")
    return X_train, Y_train, X_test
def features():
    """Build an SVMLight training file from post data plus LDA topics.

    Fixes for APIs removed from modern pandas:
      * DataFrame.ix -> DataFrame.iloc (positional column access),
      * positional drop(label, 1) -> drop(columns=[...]),
      * unary minus on a boolean Series -> the supported ~ operator.
    """
    events = pd.read_csv('../data/' + 'LosAngeles.csv')
    feature_map = pd.read_csv('../data/' + 'filter_map.csv')
    result = pd.merge(left=events, right=feature_map, on='filter')
    result = result.drop(columns=['filter'])
    result = result.rename(columns={'idx': 'filter'})
    result = result[['filter', 'likes', 'comments', 'createdtime']]
    result = result.dropna()
    # Derive coarse time-of-day / season buckets from the unix timestamp.
    result['timeofday'] = result['createdtime'].apply(lambda x: day[int(
        datetime.datetime.fromtimestamp(int(x)).strftime('%H'))])
    result['season'] = result['createdtime'].apply(lambda x: month[int(
        datetime.datetime.fromtimestamp(int(x)).strftime('%m'))])
    result = result.drop(columns=['createdtime'])
    print(result)

    train = pd.read_csv("../data/LosAngeles.csv", delimiter=",")
    data_samples = train.tags
    topics = topic_modelling.topic_LDA(data_samples)

    agg = pd.concat([result, topics], axis=1, join='inner')
    agg = agg[~agg['filter'].isin([26])]  # exclude filter id 26
    agg = agg.sample(frac=0.2, replace=True)
    fil = agg.iloc[:, 0]  # label column (the filter id)
    agg = agg.drop(columns=['filter'])
    dump_svmlight_file(agg, fil, "../data/output.train")
    X_test, Y_test = utils.get_data_from_svmlight("../data/output.train")
def main():
    """Evaluate three baseline classifiers against the validation split."""
    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")
    X_test, Y_test = utils.get_data_from_svmlight(
        "../data/features_svmlight.validate")
    for name, pred_fn in (("Logistic Regression", logistic_regression_pred),
                          ("SVM", svm_pred),
                          ("Decision Tree", decisionTree_pred)):
        display_metrics(name, pred_fn(X_train, Y_train, X_test), Y_test)
def main():
    """Generate test features, train a classifier, and write the submission."""
    test_events = pd.read_csv('../data/test/events.csv')
    feature_map = pd.read_csv('../data/test/event_feature_map.csv')
    # Side effect: materializes the test feature files read back below.
    my_features(test_events, feature_map)

    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")
    X_test, Y_test = utils.get_data_from_svmlight(
        "../deliverables/test_features.train")

    Y_pred = my_classifier_predictions(X_train, Y_train, X_test)
    # Writes my_predictions.csv (patient_id, predicted label) in deliverables.
    utils.generate_submission("../deliverables/test_features.txt", Y_pred)
def main():
    """Report baseline metrics for three classifiers on the validation set."""
    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")
    X_test, Y_test = utils.get_data_from_svmlight(
        "../data/features_svmlight.validate")

    lr = logistic_regression_pred(X_train, Y_train, X_test)
    display_metrics("Logistic Regression", lr, Y_test)

    sv = svm_pred(X_train, Y_train, X_test)
    display_metrics("SVM", sv, Y_test)

    dt = decisionTree_pred(X_train, Y_train, X_test)
    display_metrics("Decision Tree", dt, Y_test)
def main():
    """Train on features from my_features() and report LinearSVC metrics.

    Fix: Python-2 `print Y_pred` statement (a SyntaxError under Python 3)
    converted to a print() call.
    """
    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")
    X_test, Y_test = utils.get_data_from_svmlight(
        "../data/test/feature_svmlight.test")
    # my_features() rebuilds the test feature file and reloads everything.
    X_train, Y_train, X_test = my_features()
    Y_pred = my_classifier_predictions(X_train, Y_train, X_test)
    display_metrics("LinearSVC", Y_pred, Y_test)
    print(Y_pred)
def my_features():
    """Construct test features from raw events and return train/test data.

    Fixes:
      * DataFrame.ix (removed from pandas) replaced by plain label-based
        column selection,
      * the first output file was opened 'wb' but written str (TypeError on
        Python 3) while the second was opened 'w' — both now use text mode
        via context managers, so they are also flushed/closed before the
        SVMLight file is read back.

    Returns:
        (X_train, Y_train, X_test).
    """
    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")
    test_events = pd.read_csv("../data/test/events.csv")
    feature_map = pd.read_csv("../data/test/event_feature_map.csv")

    # Join events to their feature ids; drop rows with missing values.
    event = pd.merge(test_events, feature_map, on="event_id",
                     how="left").dropna(subset=["value"])
    # Swap columns 1 and 5 so 'idx' takes the position of 'event_id'.
    cols = list(event)
    cols[5], cols[1] = cols[1], cols[5]
    event = event[cols]
    event = event.drop(["event_id"], axis=1)

    # Count events per (patient, feature), then normalize by the per-feature
    # maximum count.
    aggregated_events = event.groupby(
        ["patient_id", "idx"])[["value"]].count().reset_index()
    aggregated_events["max_value_by_feature"] = aggregated_events.groupby(
        "idx")["value"].transform(max)
    aggregated_events["normalized_value"] = aggregated_events["value"].divide(
        aggregated_events["max_value_by_feature"], 0)
    aggregated_events = aggregated_events.rename(
        columns={"normalized_value": "feature_value", "idx": "feature_id"})

    # patient_id -> [(feature_id, feature_value), ...]
    patient_features = {
        k: list(map(tuple, g[["feature_id", "feature_value"]].values))
        for k, g in aggregated_events.groupby('patient_id')}

    # deliverable1 lines carry a dummy '0' label; deliverable2 lines carry
    # the patient id. Feature pairs are identical in both files.
    with open('../data/test/feature_svmlight.test', 'w') as deliverable1, \
            open('../deliverables/test_features.txt', 'w') as deliverable2:
        for key in patient_features:
            pairs = ''.join(
                str(int(t[0])) + ':' + "{:.4f}".format(t[1]) + ' '
                for t in patient_features[key])
            deliverable1.write('0 ' + pairs + '\n')
            deliverable2.write(str(int(key)) + ' ' + pairs + '\n')

    X_test, Y_test = utils.get_data_from_svmlight(
        "../data/test/feature_svmlight.test")
    return X_train, Y_train, X_test
def my_features():
    """Build train/test SVMLight files from raw CSVs and return matrices.

    Fixes: the deliverable file was opened 'wb' but written str (TypeError
    on Python 3) and never closed; it is now written in text mode through a
    context manager so it is flushed and closed. The unused accumulator
    variable d1 is removed.

    Returns:
        (X_train, Y_train, X_test).
    """
    train_path = '../data/train/'
    test_path = '../data/test/'
    train_events = pd.read_csv(train_path + 'events.csv')
    train_mortality = pd.read_csv(train_path + 'mortality_events.csv')
    train_feature_map = pd.read_csv(train_path + 'event_feature_map.csv')
    test_events = pd.read_csv(test_path + 'events.csv')
    test_feature_map = pd.read_csv(test_path + 'event_feature_map.csv')

    patient_features, mortality = etl.create_features(train_events,
                                                      train_mortality,
                                                      train_feature_map)
    etl.save_svmlight(patient_features, mortality,
                      '../others/features_svmlight.train',
                      '../others/features.train')
    X_train, Y_train = utils.get_data_from_svmlight(
        "../others/features_svmlight.train")

    deliverables_path = '../others/'
    aggregated_events = etl.aggregate_events(
        test_events[['patient_id', 'event_id', 'value']], train_mortality,
        test_feature_map, deliverables_path)

    # Left-join mortality onto test events; patients without a mortality
    # record get label 0.
    merged = pd.merge(test_events, train_mortality, on='patient_id',
                      suffixes=['_x', '_y'], how='left')
    merged.fillna(0, inplace=True)

    test_patient_features = aggregated_events.groupby('patient_id')[[
        'feature_id', 'feature_value'
    ]].apply(lambda x: [tuple(x) for x in x.values]).to_dict()
    test_mortality = merged.groupby('patient_id')['label'].apply(
        lambda x: x.unique()[0]).to_dict()

    etl.save_svmlight(test_patient_features, test_mortality,
                      '../others/features_svmlight.test',
                      '../others/features.test')

    # Deliverable format: "<patient_id> <fid>:<value> ... \n" per patient.
    with open('../deliverables/test_features.txt', 'w') as deliverable1:
        for i in sorted(test_patient_features.keys()):
            deliverable1.write(str(int(i)))
            for j in sorted(test_patient_features[i]):
                deliverable1.write(' ' + str(int(j[0])) + ':' + '%.6f' % (j[1]))
            deliverable1.write(' \n')

    X_test, Y_test = utils.get_data_from_svmlight(
        '../others/features_svmlight.test')
    return X_train, Y_train, X_test
def main():
    """Evaluate a LinearSVC trained on my_features() output.

    Fix: Python-2 `print Y_pred` statement (a SyntaxError under Python 3)
    converted to a print() call.
    """
    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")
    X_test, Y_test = utils.get_data_from_svmlight(
        "../data/test/feature_svmlight.test")
    # my_features() rebuilds the test feature file and reloads everything.
    X_train, Y_train, X_test = my_features()
    Y_pred = my_classifier_predictions(X_train, Y_train, X_test)
    display_metrics("LinearSVC", Y_pred, Y_test)
    print(Y_pred)
def my_features():
    """Load training data and build test features via the etl helpers.

    Returns:
        (X_train, Y_train, X_test).
    """
    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")

    test_dir = '../data/test/'
    events_test = pd.read_csv(test_dir + 'events.csv')
    feature_map_test = pd.read_csv(test_dir + 'event_feature_map.csv')

    # Reuse the modified etl.py helpers to materialize the test SVMLight file.
    patient_features = create_features(events_test, feature_map_test)
    save_svmlight(patient_features)

    X_test, _ = utils.get_data_from_svmlight(
        "../data/test/features_svmlight.test")
    return X_train, Y_train, X_test
def load_SVMLight_data(train_data, test_data):
    """Load SVMLight-formatted training and testing data.

    Args:
        train_data: path to the training SVMLight file.
        test_data: path to the testing SVMLight file.

    Returns:
        Tuple (X_train, Y_train, X_test, Y_test).
    """
    X_train, Y_train = utils.get_data_from_svmlight(train_data)
    X_test, Y_test = utils.get_data_from_svmlight(test_data)
    return X_train, Y_train, X_test, Y_test
def my_features():
    """Return (X_train, Y_train, X_test), building the test SVMLight file first."""
    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")
    test_events = pd.read_csv('../data/test/events.csv')
    feature_map = pd.read_csv('../data/test/event_feature_map.csv')
    # Side effect: writes ../data/test/feature_svmlight.test for reload below.
    aggregate_test_events(test_events, feature_map)
    X_test, Y_test = utils.get_data_from_svmlight(
        "../data/test/feature_svmlight.test")
    return X_train, Y_train, X_test
def main():
    """Train the submission classifier, then report KFold / randomised CV."""
    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")
    Y_pred = my_classifier_predictions(X_train, Y_train)
    # Writes my_predictions.csv (patient_id, predicted label) in deliverables.
    utils.generate_submission("../deliverables/features.train", Y_pred)

    X, Y = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")
    print("Classifier: Decision Tree Regressor__________")
    acc_k, auc_k = get_acc_auc_kfold(X, Y)
    print("Average Accuracy in KFold CV: " + str(acc_k))
    print("Average AUC in KFold CV: " + str(auc_k))
    acc_r, auc_r = get_acc_auc_randomisedCV(X, Y)
    print("Average Accuracy in Randomised CV: " + str(acc_r))
    print("Average AUC in Randomised CV: " + str(auc_r))
def features():
    """Load training and testing data from the part-r-00000 SVMLight files.

    Returns:
        Tuple (X_train, Y_train, X_test, Y_test).
    """
    X_train, Y_train = utils.get_data_from_svmlight(
        "../data/training/part-r-00000")
    X_test, Y_test = utils.get_data_from_svmlight(
        "../data/testing/part-r-00000")
    return X_train, Y_train, X_test, Y_test
def my_features():
    """Create the test SVMLight file, then load train and test matrices."""
    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")

    test_events = pd.read_csv('../data/test/events.csv')
    feature_map = pd.read_csv('../data/test/event_feature_map.csv')
    # Writes ../data/test/feature_svmlight.test as a side effect.
    aggregate_test_events(test_events, feature_map)

    X_test, Y_test = utils.get_data_from_svmlight(
        "../data/test/feature_svmlight.test")
    return X_train, Y_train, X_test
def my_features():
    """Replicate the etl.py feature pipeline for the test split.

    Returns:
        (X_train, Y_train, X_test).
    """
    data_dir = '../data/test'
    events = pd.read_csv(os.path.join(data_dir, 'events.csv'))
    feature_map = pd.read_csv(os.path.join(data_dir, 'event_feature_map.csv'))
    # Side effect: writes ../deliverables/test_features.txt.
    write_features(events, feature_map)

    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")
    X_test, _ = utils.get_data_from_svmlight(
        "../deliverables/test_features.txt")
    return X_train, Y_train, X_test
def main():
    """Report KFold and randomised-CV accuracy for logistic regression.

    Fix: Python-2 print statements (a SyntaxError under Python 3) converted
    to print() calls.
    """
    X, Y = utils.get_data_from_svmlight("../data/output.train")
    print("Classifier: Logistic Regression__________")
    acc_k = get_acc_auc_kfold(X, Y)
    print("Average Accuracy in KFold CV: " + str(acc_k))
    acc_r = get_acc_auc_randomisedCV(X, Y)
    print("Average Accuracy in Randomised CV: " + str(acc_r))
def main():
    """Print baseline metrics for three classifiers on the full train/test sets."""
    X_train, Y_train = utils.get_data_from_svmlight(
        "../data/training/part-r-00000")
    X_test, Y_test = utils.get_data_from_svmlight(
        "../data/testing/part-r-00000")

    print("Baseline Using Full Training/Test Sets and No Parameter Optimization")
    print("_____________________________________________________________")
    print("Dimensions of Training Set", X_train.shape)
    print("Dimensions of Testing Set", X_test.shape)

    for title, pred_fn in (("Logistic Regression", logistic_regression_pred),
                           ("SVM", svm_pred),
                           ("Decision Tree", decisionTree_pred)):
        display_metrics(title, pred_fn(X_train, Y_train, X_test), Y_test)
def main():
    """Report KFold and randomised-CV metrics for logistic regression.

    Fix: Python-2 print statements (a SyntaxError under Python 3) converted
    to print() calls.
    """
    X, Y = utils.get_data_from_svmlight('../deliverables/features_svmlight.train')
    print("Classifier: Logistic Regression__________")
    acc_k, auc_k = get_acc_auc_kfold(X, Y)
    print("Average Accuracy in KFold CV: " + str(acc_k))
    print("Average AUC in KFold CV: " + str(auc_k))
    acc_r, auc_r = get_acc_auc_randomisedCV(X, Y)
    print("Average Accuracy in Randomised CV: " + str(acc_r))
    print("Average AUC in Randomised CV: " + str(auc_r))
def main():
    """Print KFold and randomised-CV metrics for logistic regression."""
    X, Y = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")
    print("Classifier: Logistic Regression__________")

    acc_k, auc_k = get_acc_auc_kfold(X, Y)
    print("Average Accuracy in KFold CV: " + str(acc_k))
    print("Average AUC in KFold CV: " + str(auc_k))

    acc_r, auc_r = get_acc_auc_randomisedCV(X, Y)
    print("Average Accuracy in Randomised CV: " + str(acc_r))
    print("Average AUC in Randomised CV: " + str(auc_r))
def my_features():
    """Build test features (with a fake mortality record) and return data.

    Fixes for APIs removed from modern pandas:
      * DataFrame.ix[0:1, :] -> iloc[0:2] (assumes read_csv's default
        RangeIndex, where label slice 0:1 is inclusive of row 1 — confirm),
      * DataFrame.set_value -> .at; a .copy() is added so the two-row slice
        is written without touching the parent frame.

    Returns:
        (X_train, Y_train, X_test).
    """
    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")
    test_path = '../data/test/'
    events = pd.read_csv(os.path.join(test_path) + 'events.csv',
                         parse_dates=['timestamp'])
    feature_map = pd.read_csv(os.path.join(test_path) + 'event_feature_map.csv')

    # Fake mortality frame: take two training rows and point them at a
    # sentinel patient id so no real test patient is treated as deceased.
    dead = pd.read_csv(os.path.join('../data/train/') + 'mortality_events.csv',
                       parse_dates=['timestamp']).iloc[0:2].copy()
    dead.at[0, 'patient_id'] = 123456789
    dead.at[1, 'patient_id'] = 123456789

    # Run the etl pipeline against the fake mortality data.
    deliverable_path = '../data/test/'
    indx_date = etl.calculate_index_date(events, dead, deliverable_path)
    filtered_events = etl.filter_events(events, indx_date, deliverable_path)
    patient_features, mortality_fake = etl.create_features(events, dead,
                                                           feature_map)

    # All test patients get label 0.
    mortality = {key: 0 for key in patient_features}

    op_file = '../deliverables/features_svmlight.test'
    op_deliverable = '../deliverables/test_features.txt'
    etl.save_svmlight(patient_features, mortality, op_file, op_deliverable)
    X_test, Y_test = utils.get_data_from_svmlight(
        '../deliverables/features_svmlight.test')
    return X_train, Y_train, X_test
def my_features():
    """Aggregate test events, write feature files, and return train/test data.

    Fixes:
      * dict.iteritems() (Python 2 only) replaced with items(),
      * files were opened 'wb' but written str (TypeError on Python 3);
        they are now written in text mode via context managers, which also
        guarantees the data is flushed before test_mymodel_features.train is
        read back by load_svmlight_file.

    Returns:
        (X_train, Y_train, X_test).
    """
    train_path = '../data/test/'
    deliverables_path = '../deliverables/'
    events = pd.read_csv(train_path + 'events.csv')
    feature_map = pd.read_csv(train_path + 'event_feature_map.csv')

    # Aggregate the event values for each patient (no mortality -> None).
    aggregated_events = etl.aggregate_events(events, None, feature_map,
                                             deliverables_path)

    # patient_features: patient_id -> [(feature_id, feature_value), ...]
    patient_features = {}
    for index, row in aggregated_events.iterrows():
        patient_features.setdefault(row['patient_id'], []).append(
            (row['feature_id'], row['feature_value']))

    # Build both outputs in one pass: the deliverable keyed by patient id and
    # the SVMLight file with a dummy label of 1.
    line = ''
    line_svm = ''
    for key, value in sorted(patient_features.items()):
        line += str(int(key)) + ' '
        line_svm += str(1) + ' '
        for item in sorted(value):
            pair = str(int(item[0])) + ":" + str(format(item[1], '.6f')) + ' '
            line += pair
            line_svm += pair
        line += '\n'
        line_svm += '\n'

    with open(deliverables_path + 'test_features.txt', 'w') as deliverable2:
        deliverable2.write(line)
    with open(deliverables_path + 'test_mymodel_features.train', 'w') as svm_file:
        svm_file.write(line_svm)

    data_train = load_svmlight_file(
        deliverables_path + 'test_mymodel_features.train', n_features=3190)
    X_test = data_train[0]
    print(X_test.shape)
    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")
    return X_train, Y_train, X_test
def main():
    """Evaluate three classifiers on the training set itself (no holdout)."""
    X_train, Y_train = utils.get_data_from_svmlight(
        "C:/users/yyan/Downloads/homework1/deliverables/features_svmlight.train"
    )
    # Predictions are scored against the training labels themselves.
    for name, pred_fn in (("Logistic Regression", logistic_regression_pred),
                          ("SVM", svm_pred),
                          ("Decision Tree", decisionTree_pred)):
        display_metrics(name, pred_fn(X_train, Y_train), Y_train)
def main():
    """Run KFold and randomised CV for logistic regression and print results."""
    X, Y = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")
    print("Classifier: Logistic Regression__________")

    acc_k, auc_k = get_acc_auc_kfold(X, Y)
    print("Average Accuracy in KFold CV: " + str(acc_k))
    print("Average AUC in KFold CV: " + str(auc_k))

    acc_r, auc_r = get_acc_auc_randomisedCV(X, Y)
    print("Average Accuracy in Randomised CV: " + str(acc_r))
    print("Average AUC in Randomised CV: " + str(auc_r))
def main():
    """Generate submission predictions and report validation AUC.

    Fix: the validation AUC was computed and then silently discarded (dead
    assignment); it is now printed so the metric is actually reported.
    """
    X_train, Y_train, X_test = my_features()
    Y_pred = my_classifier_predictions(X_train, Y_train, X_test)
    # Writes my_predictions.csv (patient_id, predicted label) in deliverables.
    utils.generate_submission("../deliverables/test_features.txt", Y_pred)

    X_traintest, Y_traintest = utils.get_data_from_svmlight(
        "../data/features_svmlight.validate")
    Y_trainpred = my_classifier_predictions(X_train, Y_train, X_traintest)
    auc = roc_auc_score(Y_traintest, Y_trainpred)
    print("Validation AUC: " + str(auc))
def main():
    """Evaluate several classifiers and draw ROC curves.

    Fix: the classifier-name list contained a stray double comma
    (`'SVM',,'SGDClassifier'`), which is a SyntaxError. The duplicate 'RFC'
    entry is preserved as-is — NOTE(review): looks unintentional; confirm.
    """
    X, Y = utils.get_data_from_svmlight("data/allfeature.data")
    classifierName = ['Logistic Regression', 'SVM', 'SGDClassifier',
                      'adaboost', 'Decision Tree', 'RFC', 'GBC', 'RFC']
    for clfname in classifierName:
        display_metrics(clfname, X, Y)
    drawrocdt(X, Y)
    drawroclr(X, Y)
    drawrocada(X, Y)
    drawrocrfc(X, Y)
def main():
    """Report training-set performance for three classifiers."""
    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")

    # Each model predicts on the training set itself to gauge fit.
    lr_pred = logistic_regression_pred(X_train, Y_train, X_train)
    display_metrics("Logistic Regression", lr_pred, Y_train)

    sv_pred = svm_pred(X_train, Y_train, X_train)
    display_metrics("SVM", sv_pred, Y_train)

    dt_pred = decisionTree_pred(X_train, Y_train, X_train)
    display_metrics("Decision Tree", dt_pred, Y_train)
def my_features():
    """Build test features from CSVs and return (X_train, Y_train, X_test)."""
    X_train, Y_train = utils.get_data_from_svmlight(
        "../deliverables/features_svmlight.train")

    events = pd.read_csv("../data/test/events.csv")
    feature_map = pd.read_csv("../data/test/event_feature_map.csv")

    # The test split carries no mortality data, hence the None argument.
    aggregated_events = aggregate_events(
        events, None, feature_map, "../data/test/test_aggregated_events.csv")

    patient_feautures = create_test_features(aggregated_events)
    save_test_features(patient_feautures, "../deliverables/test_features.txt")

    X_test, patient_ids = utils.get_data_from_svmlight(
        "../deliverables/test_features.txt")
    return X_train, Y_train, X_test
def my_features():
    """Build test feature files and return (X_train, Y_train, X_test).

    Fix: the two output files were opened but never closed, so buffered
    data could still be unflushed when test_mymodel_svm.train was
    immediately read back by load_svmlight_file; both writes now use
    context managers.
    """
    X_train, Y_train = utils.get_data_from_svmlight(
        '../deliverables/features_svmlight.train')
    deliverables_path = '../deliverables/'
    test_events = pd.read_csv('../data/test/events.csv')
    test_events_map = pd.read_csv('../data/test/event_feature_map.csv')
    test_aggregated_events = etl.aggregate_events(test_events, None,
                                                  test_events_map,
                                                  deliverables_path)

    # patient_id -> [(feature_id, feature_value), ...]
    test_patient_features = test_aggregated_events.groupby('patient_id')[[
        'feature_id', 'feature_value'
    ]].apply(lambda g: list(map(tuple, g.values.tolist()))).to_dict()

    # Build both outputs: SVMLight lines with a dummy label of 1, and the
    # deliverable lines keyed by patient id. Feature pairs are identical.
    line_svm = ''
    line_test = ''
    for key in sorted(test_patient_features):
        line_svm += '1 '
        line_test += str(int(key)) + ' '
        for tup in sorted(test_patient_features[key]):
            pair = str(int(tup[0])) + ':' + str("{:.6f}".format(tup[1])) + ' '
            line_svm += pair
            line_test += pair
        line_svm += '\n'
        line_test += '\n'

    with open(deliverables_path + 'test_features.txt', 'wb') as test_featuresfile:
        test_featuresfile.write(bytes(line_test, 'UTF-8'))
    with open(deliverables_path + 'test_mymodel_svm.train', 'wb') as test_svmlightfile:
        test_svmlightfile.write(bytes(line_svm, 'UTF-8'))

    test_data = load_svmlight_file(deliverables_path + 'test_mymodel_svm.train',
                                   n_features=3190)
    X_test = test_data[0]
    return X_train, Y_train, X_test
def my_features():
    """Aggregate test events, write SVMLight + deliverable files, return data.

    Fix: both output files were opened but never closed, so buffered data
    could be unflushed when features_svmlight.test was immediately read back
    by load_svmlight_file; writes now use context managers.
    """
    X_train, Y_train = utils.get_data_from_svmlight(
        '../deliverables/features_svmlight.train')
    events_test = pd.read_csv('../data/test/events.csv')
    feature_map_test = pd.read_csv('../data/test/event_feature_map.csv')
    deliverables_path = '../deliverables/'
    aggregated_events_test = etl.aggregate_events(events_test, None,
                                                  feature_map_test,
                                                  deliverables_path)

    # patient_id -> [(feature_id, feature_value), ...]
    patient_features_test = aggregated_events_test.groupby('patient_id')[[
        'feature_id', 'feature_value'
    ]].apply(lambda g: list(map(tuple, g.values.tolist()))).to_dict()

    op_file = deliverables_path + 'features_svmlight.test'
    op_deliverable = deliverables_path + 'test_features.txt'

    # line1: SVMLight lines with dummy label 1; line2: deliverable lines
    # keyed by patient id. Feature pairs are identical in both.
    line1 = line2 = ''
    for key in sorted(patient_features_test.keys()):
        line1 += '1 '
        line2 += str(int(key)) + ' '
        for value in sorted(patient_features_test[key]):
            pair = str(int(value[0])) + ':' + str("{:.6f}".format(value[1])) + ' '
            line1 += pair
            line2 += pair
        line1 += '\n'
        line2 += '\n'

    with open(op_file, 'wb') as deliverable1:
        deliverable1.write(bytes(line1, 'UTF-8'))
    with open(op_deliverable, 'wb') as deliverable2:
        deliverable2.write(bytes(line2, 'UTF-8'))

    X_test = load_svmlight_file(op_file, n_features=3190)[0]
    return X_train, Y_train, X_test