def runExperiments(features, es, logFile):
    """Cross-validate each feature-set combination and log the scores.

    Args:
        features: list of feature-set combinations, e.g. [["DSM+1"], ["BOW"]].
        es: ExperimentSettings instance that configures preprocessing,
            feature selection and the learning algorithm.
        logFile: open, writable file object that receives progress lines and
            per-metric results.
    """
    # Load the annotated training corpus.
    dataset = utils.readData(cfg.PATH_TRAIN, cfg.PATH_PREPROCESSED_TRAIN)

    # Concept segmentation/filtering — must be applied exactly once per dataset.
    dataset = m.conceptPreprocessing(
        dataset,
        es.removeDeniedConcepts, es.splitDeniedConcepts,
        es.removeUncertainConcepts, es.splitUncertainConcepts,
        es.removeFamilyConcepts, es.splitFamilyConcepts)

    # Unannotated data is only needed when bootstrapping is enabled.
    if es.bootstrap:
        bootstrap_data = utils.readData(cfg.PATH_UNANNOTATED,
                                        cfg.PATH_PREPROCESSED_UNANNOTATED)
        bootstrap_data = m.conceptPreprocessing(
            bootstrap_data,
            es.removeDeniedConcepts, es.splitDeniedConcepts,
            es.removeUncertainConcepts, es.splitUncertainConcepts,
            es.removeFamilyConcepts, es.splitFamilyConcepts)

    # One evaluation run per feature-set combination.
    for featTypes in features:
        logFile.write('Executing for ' + ','.join(featTypes) + ' model.\n')
        es.featTypes = featTypes

        if es.svmParamSweep:
            # Hyper-parameter sweep instead of a regular evaluation.
            sweep_scores = m.param_sweep_svm(dataset, es, gammaSweep=False,
                                             nFolds=10, verbose=False,
                                             random_seed=44)
            for key in sweep_scores:
                logFile.write(str(key) + ": " + str(sweep_scores[key]) + '\n')
        else:
            estimator = m.getEstimator(es)
            if es.bootstrap:
                scores = m.eval_bootstrapped_crossVal(
                    estimator, dataset, bootstrap_data, es, 10,
                    printTree=False)
            else:
                scores = m.evalCrossval(estimator, dataset, es, 10,
                                        printTree=False)
            for key in scores:
                logFile.write(str(key) + ": " + str(scores[key]) + '\n')
def main(useAnnotatorWeighing=True):
    '''
    This script runs the experiments by training on a trainset and testing
    on a test set. Also allows bootstrapping (hard coded in this script).

    Configure your model settings by modifying the ExperimentSettings
    object in the script.

    The output of these models are annotated files in the output folder,
    which can be evaluated (in metrics) using testEval.py
    '''
    # Feature sets to run; arranged as an array of arrays to enable
    # combinations of feature types in a single model.
    features = [["DSM+1"]]

    # Anything not set explicitly keeps its ExperimentSettings default.
    es = ExperimentSettings()

    # Reading the train/test data into arrays.
    train_data = utils.readData(cfg.PATH_TRAIN, cfg.PATH_PREPROCESSED_TRAIN)
    test_data = utils.readData(cfg.PATH_TEST, cfg.PATH_PREPROCESSED_TEST)

    # Concept modifications based on the segmentation settings
    # (ONLY PERFORM ONCE per dataset).
    train_data = m.conceptPreprocessing(
        train_data,
        es.removeDeniedConcepts, es.splitDeniedConcepts,
        es.removeUncertainConcepts, es.splitUncertainConcepts,
        es.removeFamilyConcepts, es.splitFamilyConcepts)
    test_data = m.conceptPreprocessing(
        test_data,
        es.removeDeniedConcepts, es.splitDeniedConcepts,
        es.removeUncertainConcepts, es.splitUncertainConcepts,
        es.removeFamilyConcepts, es.splitFamilyConcepts)

    # Reading in bootstrap (unannotated) data as well when enabled.
    if es.bootstrap:
        bootstrap_data = utils.readData(cfg.PATH_UNANNOTATED,
                                        cfg.PATH_PREPROCESSED_UNANNOTATED)
        bootstrap_data = m.conceptPreprocessing(
            bootstrap_data,
            es.removeDeniedConcepts, es.splitDeniedConcepts,
            es.removeUncertainConcepts, es.splitUncertainConcepts,
            es.removeFamilyConcepts, es.splitFamilyConcepts)

    vectorizer = DictVectorizer()
    min_max_scalar = MinMaxScaler()

    # Looping over the different feature parameters.
    for featTypes in features:
        utils.out('Executing for ' + ','.join(featTypes) + ' model.')
        es.featTypes = featTypes
        estimator = m.getEstimator(es)

        m.generatePrimaryFeats(train_data, es)
        m.generatePrimaryFeats(test_data, es)
        utils.out('Generated primary features for train and test_data!')

        y_train = [d.severity for d in train_data]

        # BUG FIX: the bootstrap branch used to reassign train_data/y_train,
        # so every feature set after the first trained on an already-augmented
        # set. Use per-iteration locals instead (same fix as in
        # runForExperimentSettings), which also makes the non-bootstrap path
        # uniform.
        if es.bootstrap:
            m.generatePrimaryFeats(bootstrap_data, es)
            (train_datac, y_trainc) = m.get_bootstrapped_trainset(
                train_data, y_train, bootstrap_data, es, estimator, th_bs=0.6)
        else:
            train_datac = train_data
            y_trainc = y_train

        # Featurize train + test together so data-driven features are built
        # over a shared vocabulary, then split back by position.
        concatenated_data = []
        concatenated_data.extend(train_datac)
        concatenated_data.extend(test_data)

        m.generateDataDrivenFeats(train_datac, concatenated_data, es)
        featurized = m.featurize(concatenated_data)
        train_feats = featurized[0:len(train_datac)]
        test_feats = featurized[len(train_datac):len(featurized)]

        # Feature selection is fitted on the training portion only.
        train_feats = fs.runFeatureSelection(train_feats, y_trainc, es)
        train_feats, y_trainc, train_bucket = ss.runSampleSelection(
            train_feats, y_trainc, [i for i in range(len(train_datac))], es)

        x_train = vectorizer.fit_transform(train_feats)
        x_test = vectorizer.transform(test_feats)

        if es.scaleData:
            # Scaler is fitted on train only; test reuses its parameters.
            x_train = min_max_scalar.fit_transform(x_train.toarray())
            x_test = min_max_scalar.transform(x_test.toarray())

        weights_train = m.getWeights(train_datac, train_bucket,
                                     es.weighInterAnnot)
        model = m.train(estimator, x_train, y_trainc, weights_train,
                        model=None)
        y_pred = m.test(x_test, estimator=model)

        # Attach predictions to the test documents and write annotated output.
        for i, cur_data in enumerate(test_data):
            cur_data.predSev = y_pred[i]

        out_dir = cfg.PATH_OUTPUT + ','.join(featTypes) + '/'
        os.makedirs(out_dir, exist_ok=True)
        utils.genOutput(data=test_data, outDir=out_dir,
                        dtd=cfg.PATH_OUTPUT + '2016_CEGS_N-GRID_TRACK2.dtd/')
def main(useAnnotatorWeighing=True):
    """
    Run 10-fold cross validation over the data in the training set.

    Experiments only yield results (printed to stdout); they don't produce
    annotated files. The standard deviation seen over the different folds is
    reported for each metric as well.

    Configure your model settings by modifying the ExperimentSettings object
    in the script.
    """
    # Feature sets to evaluate; each inner list is one (possibly combined)
    # set. Other available options include: 'CONCEPTS', 'DSM+1', 'DSM',
    # 'DSM_HIER', 'MED', 'BOW', 'BOW_ANSWERS', 'CATEGORICAL_QUESTIONSET',
    # 'QUESTIONSET', 'WORD_VECTOR', 'WORD_VECTOR_ANSWERS', 'CONCEPT_VECTOR',
    # 'DIST_WORDVECTOR', 'DIST_CONCEPTVECTOR', 'CONCEPT_CLUSTERS',
    # 'PREAMBLE_CLUSTERS'.
    features = [["DSM+2"], ["BOW"], ["DSM+1"], ["DSM"], ["SNOMED"],
                ["SNOMED+1"], ["DSM+2"], ["CONCEPTS"]]

    # Anything not set here keeps its ExperimentSettings default.
    es = ExperimentSettings()
    es.fs_varianceFilter = True
    es.bootstrap = False
    es.ss_prototyping = False
    es.weighInterAnnot = False
    es.removeDeniedConcepts = False
    es.splitDeniedConcepts = False
    es.splitUncertainConcepts = False
    es.splitFamilyConcepts = False

    # Load the annotated training corpus.
    dataset = utils.readData(cfg.PATH_TRAIN, cfg.PATH_PREPROCESSED_TRAIN)

    # Concept-level preprocessing must run exactly once per dataset.
    dataset = m.conceptPreprocessing(
        dataset,
        es.removeDeniedConcepts, es.splitDeniedConcepts,
        es.removeUncertainConcepts, es.splitUncertainConcepts,
        es.removeFamilyConcepts, es.splitFamilyConcepts)

    # Unannotated data is only needed when bootstrapping is enabled.
    if es.bootstrap:
        bootstrap_data = utils.readData(cfg.PATH_UNANNOTATED,
                                        cfg.PATH_PREPROCESSED_UNANNOTATED)
        bootstrap_data = m.conceptPreprocessing(
            bootstrap_data,
            es.removeDeniedConcepts, es.splitDeniedConcepts,
            es.removeUncertainConcepts, es.splitUncertainConcepts,
            es.removeFamilyConcepts, es.splitFamilyConcepts)

    # One evaluation run per feature-set combination.
    for featTypes in features:
        utils.out('Executing for ' + ','.join(featTypes) + ' model.')
        es.featTypes = featTypes

        if es.svmParamSweep:
            # Hyper-parameter sweep instead of a regular evaluation.
            sweep_scores = m.param_sweep_svm(dataset, es, gammaSweep=False,
                                             nFolds=10, verbose=False,
                                             random_seed=44)
            for key in sweep_scores:
                print(str(key) + ":", sweep_scores[key])
        else:
            estimator = m.getEstimator(es)
            if es.bootstrap:
                scores = m.eval_bootstrapped_crossVal(
                    estimator, dataset, bootstrap_data, es, 10,
                    printTree=False)
            else:
                scores = m.evalCrossval(estimator, dataset, es, 10,
                                        printTree=False)
            for key in scores:
                print(str(key) + ":", scores[key])
def runForExperimentSettings(features, es):
    """Train on the train set and write annotated predictions for the test set.

    For every feature-set combination in ``features`` a model is trained
    (optionally augmented through bootstrapping) and its predictions are
    written to a per-feature-set output folder.

    Args:
        features: list of feature-set combinations, e.g. [["DSM+1"]].
        es: ExperimentSettings instance that configures the whole pipeline.
    """
    # Read the annotated train and test corpora.
    train_data = utils.readData(cfg.PATH_TRAIN, cfg.PATH_PREPROCESSED_TRAIN)
    test_data = utils.readData(cfg.PATH_TEST, cfg.PATH_PREPROCESSED_TEST)

    # Concept-level preprocessing — apply exactly once per dataset.
    train_data = m.conceptPreprocessing(
        train_data,
        es.removeDeniedConcepts, es.splitDeniedConcepts,
        es.removeUncertainConcepts, es.splitUncertainConcepts,
        es.removeFamilyConcepts, es.splitFamilyConcepts)
    test_data = m.conceptPreprocessing(
        test_data,
        es.removeDeniedConcepts, es.splitDeniedConcepts,
        es.removeUncertainConcepts, es.splitUncertainConcepts,
        es.removeFamilyConcepts, es.splitFamilyConcepts)

    # Unannotated data only matters when bootstrapping is switched on.
    if es.bootstrap:
        bootstrap_data = utils.readData(cfg.PATH_UNANNOTATED,
                                        cfg.PATH_PREPROCESSED_UNANNOTATED)
        bootstrap_data = m.conceptPreprocessing(
            bootstrap_data,
            es.removeDeniedConcepts, es.splitDeniedConcepts,
            es.removeUncertainConcepts, es.splitUncertainConcepts,
            es.removeFamilyConcepts, es.splitFamilyConcepts)

    vectorizer = DictVectorizer()
    min_max_scalar = MinMaxScaler()

    # One train/predict cycle per feature-set combination.
    for featTypes in features:
        utils.out('Executing for ' + ','.join(featTypes) + ' model.')
        es.featTypes = featTypes
        estimator = m.getEstimator(es)

        m.generatePrimaryFeats(train_data, es)
        m.generatePrimaryFeats(test_data, es)
        utils.out('Generated primary features for train and test_data!')

        y_train = [d.severity for d in train_data]

        # Bind the working set to separate names so train_data/y_train are
        # never overwritten — otherwise only one featType could run per call.
        if es.bootstrap:
            m.generatePrimaryFeats(bootstrap_data, es)
            cur_train, cur_labels = m.get_bootstrapped_trainset(
                train_data, y_train, bootstrap_data, es, estimator, th_bs=0.6)
        else:
            cur_train = train_data
            cur_labels = y_train

        # Featurize train + test together so data-driven features share one
        # vocabulary, then split the result back by position.
        combined = []
        combined.extend(cur_train)
        combined.extend(test_data)

        m.generateDataDrivenFeats(cur_train, combined, es)
        featurized = m.featurize(combined)
        train_feats = featurized[0:len(cur_train)]
        test_feats = featurized[len(cur_train):len(featurized)]

        # Feature selection is fitted on the training portion only.
        train_feats = fs.runFeatureSelection(train_feats, cur_labels, es)
        train_feats, cur_labels, train_bucket = ss.runSampleSelection(
            train_feats, cur_labels, list(range(len(cur_train))), es)

        x_train = vectorizer.fit_transform(train_feats)
        x_test = vectorizer.transform(test_feats)

        if es.scaleData:
            # Scaler parameters come from the train split only.
            x_train = min_max_scalar.fit_transform(x_train.toarray())
            x_test = min_max_scalar.transform(x_test.toarray())

        weights_train = m.getWeights(cur_train, train_bucket,
                                     es.weighInterAnnot)
        model = m.train(estimator, x_train, cur_labels, weights_train,
                        model=None)
        y_pred = m.test(x_test, estimator=model)

        # Attach predictions to the test documents and emit annotated files.
        for i, doc in enumerate(test_data):
            doc.predSev = y_pred[i]

        out_dir = cfg.PATH_OUTPUT + ','.join(featTypes) + '/'
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        utils.genOutput(data=test_data, outDir=out_dir,
                        dtd=cfg.PATH_OUTPUT + '2016_CEGS_N-GRID_TRACK2.dtd/')