import shutil
import time
from pathlib import Path

import ffmpeg
import numpy as np

# The I3D ResNet-50 model definition; the exact import path depends on the project layout.
from resnet import i3_res50


def generate(datasetpath, outputpath, pretrainedpath, frequency, batch_size, sample_mode):
    Path(outputpath).mkdir(parents=True, exist_ok=True)
    temppath = outputpath + "/temp/"
    rootdir = Path(datasetpath)
    videos = [str(f) for f in rootdir.glob('**/*.mp4')]

    # Set up the model
    i3d = i3_res50(400, pretrainedpath)
    i3d.cuda()
    i3d.train(False)  # Set model to evaluate mode

    for video in videos:
        videoname = video.split("/")[-1].split(".")[0]
        startime = time.time()
        print("Generating for {0}".format(video))
        Path(temppath).mkdir(parents=True, exist_ok=True)
        # Decode the video into numbered JPEG frames in the temp directory
        ffmpeg.input(video).output('{}%d.jpg'.format(temppath), start_number=0).global_args(
            '-loglevel', 'quiet').run()
        print("Preprocessing done..")
        # run() extracts I3D features from the frames; it is defined elsewhere in this project
        features = run(i3d, frequency, temppath, batch_size, sample_mode)
        np.save(outputpath + "/" + videoname, features)
        print("Obtained features of size: ", features.shape)
        shutil.rmtree(temppath)
        print("done in {0}.".format(time.time() - startime))
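# A minimal usage sketch for generate(); the paths, weights filename, and parameter
# values below are hypothetical placeholders, not part of the original script.
if __name__ == '__main__':
    generate(datasetpath="samplevideos/",
             outputpath="output/",
             pretrainedpath="pretrained/i3d_r50_kinetics.pth",
             frequency=16,
             batch_size=20,
             sample_mode="oversample")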
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__author__ = "Miquel Ferrarons, David Vazquez"
__copyright__ = "Copyright 2015, CVC-UAB"
__credits__ = ["Miquel Ferrarons", "David Vazquez"]
__license__ = "GPL"
__version__ = "1.0"
__maintainer__ = "Miquel Ferrarons"

import extract_features
import train_model
import test_image
import test_folder
import show_results
import evaluate_results

extract_features.run()  # Extracts the features for all the images
train_model.run()       # Train the classifier

# Tests a single image and shows the results
# test_image.run()

# Test a whole folder
test_folder.run()
show_results.run()      # Saves the resulting images
evaluate_results.run()  # Runs the evaluation
import numpy
import os
import time
from collections import Counter

# Project-local modules of the verbal-autopsy pipeline; load_model is assumed to be
# keras.models.load_model, matching the .save()/load usage below. preprocess() is
# defined elsewhere in this module.
from keras.models import load_model
import cluster_keywords
import data_util
import extract_features
import model_library
import model_library_torch
import model_seq


def run(trainfile, testfile, outfile, num_clusters, model='seq', clusterfile=None, vecfile=None):
    starttime = time.time()

    # Setup
    train_feat_file = trainfile + ".feats"
    train_kw_file = trainfile + ".kw_clusters"
    test_feat_file = testfile + ".feats"
    test_kw_file = testfile + ".kw_clusters"
    if vecfile is None:
        vecfile = "/u/sjeblee/research/va/data/datasets/mds+rct/narr+ice+medhelp.vectors.100"
    #clusterfile = "/u/sjeblee/research/va/data/datasets/mds+rct/train_adult_cat_spell.clusters_km50"
    max_label = int(num_clusters) - 1

    # Extract word vector features and keyword vectors
    if not (os.path.exists(train_feat_file) and os.path.exists(test_feat_file)):
        extract_features.run(trainfile, train_feat_file, testfile, test_feat_file,
                             arg_featurenames="narr_vec", arg_vecfile=vecfile)
    trainids, trainx = preprocess(train_feat_file, [], [], ["narr_vec"])
    testids, testx = preprocess(test_feat_file, [], [], ["narr_vec"])

    if not (os.path.exists(train_kw_file) and os.path.exists(test_kw_file)):
        extract_features.run(trainfile, train_kw_file, testfile, test_kw_file,
                             arg_featurenames="kw_clusters", arg_vecfile=vecfile)
    trainyids, trainkws = preprocess(train_kw_file, [], [], ["keyword_clusters"])
    train_kw_dict = dict(zip(trainyids, trainkws))
    testyids, testkws = preprocess(test_kw_file, [], [], ["keyword_clusters"])
    test_kw_dict = dict(zip(testyids, testkws))

    # Calculate majority class
    kw_list = []
    for kw_items in trainkws:
        for item in kw_items:
            if item != "":
                kw_list.append(int(item))
    counter = Counter(kw_list)
    majority_kw = counter.most_common(1)[0][0]
    print "majority kw: " + str(majority_kw)
    majority_vec = data_util.multi_hot_encoding([[majority_kw]], max_label)[0]
    print "majority_vec: " + str(majority_vec)

    # Match up the kw vectors with the trainids and testids
    trainy = []
    testy = []
    for x in range(len(trainids)):
        rec_id = trainids[x]
        trainy.append(train_kw_dict[rec_id])
    for y in range(len(testids)):
        rec_id = testids[y]
        testy.append(list(test_kw_dict[rec_id]))
    print "trainy[0]: " + str(trainy[0])

    trainy_vecs = None
    trainy_phrases = None
    testy_vecs = None
    testy_phrases = None
    y_pred = []

    # Keyword one-hot encoding
    if model == 'seq':
        keywords = []
        for item in trainy:
            for kw in item:
                keywords.append(kw)
        labelencoder = model_seq.create_labelencoder(keywords)
        trainy_vecs = model_seq.encode_labels(trainy, labelencoder)
        testy_vecs = model_seq.encode_labels(testy, labelencoder)
        print "trainy_vecs[0]: " + str(trainy_vecs[0])

    # Keyword multi-hot encoding
    if model == 'cnn':
        trainy = data_util.multi_hot_encoding(trainy, max_label)
        testy = data_util.multi_hot_encoding(testy, max_label)

    if model == 'encoder-decoder':
        trainy_vecs, trainy_phrases = cluster_keywords.cluster_embeddings(trainy, clusterfile, vecfile, return_phrases=True)
        testy_vecs, testy_phrases = cluster_keywords.cluster_embeddings(testy, clusterfile, vecfile, return_phrases=True)

    # Train and test the model
    print "trainy size: " + str(len(trainy))
    #output_seq_len = numpy.asarray(trainy).shape[1]
    #print "output_seq_len: " + str(output_seq_len)
    #print "trainx[0]: " + str(trainx[0])
    print "trainy[0]: " + str(trainy[0])
    #print "trainy_phrases[0]: " + str(trainy_phrases[0])

    # Seq2seq - with sequences of one-hot encodings
    if model == 'seq':
        print "seq model"
        output_seq_len = 10
        nodes = 128
        model, encoder, decoder, output_dim = model_seq.train_seq2seq(trainx, trainy_vecs, nodes, True)
        y_pred = model_seq.predict_seqs(encoder, decoder, testx, output_seq_len, output_dim, True)
        testy_pred_labs = model_seq.decode_all_labels(y_pred, labelencoder)
        testy_pred_labels = [','.join(row) for row in testy_pred_labs]
        #testy_pred_labels = cluster_keywords.embeddings_to_clusters(y_pred, clusterfile)
        #kw_true_text = testy_phrases
        testy = data_util.multi_hot_encoding(testy, max_label)
        testy_pred = data_util.multi_hot_encoding(testy_pred_labs, max_label)
        #testy_pred = data_util.map_to_multi_hot(y_pred)
        #testy_pred_labels = data_util.decode_multi_hot(testy_pred)
        print "testy_pred_labels[0]: " + str(testy_pred_labels[0])

    # Torch encoder-decoder
    elif model == 'encoder-decoder':
        print "torch encoder-decoder"
        output_seq_len = 10
        nodes = 100  # TODO: does this have to be the same as the word vector dim?
        encoder, decoder, output_dim = model_library_torch.encoder_decoder_model(trainx, trainy_vecs, nodes, num_epochs=1)
        y_pred = model_library_torch.test_encoder_decoder(encoder, decoder, testx, output_seq_len, output_dim)
        testy_pred_labels = cluster_keywords.embeddings_to_clusters(y_pred, clusterfile)
        kw_true_text = testy_phrases
        testy = data_util.multi_hot_encoding(testy, max_label)
        testy_pred = data_util.multi_hot_encoding(testy_pred_labels, max_label)

    # CNN
    elif model == 'cnn':
        #modelfile = "keyword_cnn_kwkm" + str(num_clusters) + ".model"
        modelfile = "/u/sjeblee/research/va/data/crossval_kw/gru_cnn_1/gru_cnn_1_adult.model"
        nodes = 100
        if os.path.exists(modelfile):
            print "Using existing model"
            cnn = load_model(modelfile)
        else:
            print "Training new model..."
            #cnn, x, y = model_library.rnn_model(trainx, trainy, 100, modelname='gru', num_epochs=15)
            #cnn, x, y = model_library.cnn_model(trainx, numpy.asarray(trainy), num_epochs=10, loss_func='mean_squared_error')
            cnn, x, y = model_library.stacked_model(trainx, [numpy.asarray(trainy)], nodes, models='gru_cnn',
                                                    num_epochs=15, loss_func='mean_squared_error')
            cnn.save(modelfile)

        # Test
        #y_pred = cnn.predict(numpy.asarray(testx))
        # TEMP for multi model
        y_pred = cnn.predict(numpy.asarray(testx))[1].tolist()
        print "y_pred: " + str(len(y_pred))
        #testy_pred_0 = data_util.map_to_multi_hot(testy_pred, 0.5)
        testy_pred = data_util.map_to_multi_hot(y_pred)

        # Decode labels
        testy_pred_labels = data_util.decode_multi_hot(testy_pred)
        print "testy_pred_labels[0]: " + str(testy_pred_labels[0])
        kw_pred = [thing.split(',') for thing in testy_pred_labels]
        kw_true = [thing.split(',') for thing in data_util.decode_multi_hot(testy)]
        kw_emb, kw_pred_text = cluster_keywords.cluster_embeddings(kw_pred, clusterfile, vecfile, True)
        kw_true_emb, kw_true_text = cluster_keywords.cluster_embeddings(kw_true, clusterfile, vecfile, True)
        #testy = testy.tolist()
        #print "testx[0]: " + str(testx[0])
        print "testy[0]: " + str(len(testy[0])) + " " + str(testy[0])
        print "testy_pred[0]: " + str(len(testy_pred[0])) + " " + str(testy_pred[0])
        testy_pred_2 = data_util.map_to_multi_hot(y_pred, 0.2)
        testy_pred_3 = data_util.map_to_multi_hot(y_pred, 0.3)
        #kw_pred_text = cluster_keywords.interpret_clusters(testy_pred_labels, clusterfile)
        #kw_true_text = cluster_keywords.interpret_clusters(data_util.decode_multi_hot(testy), clusterfile)
        print "kw_pred_text[0]: " + str(kw_pred_text[0])
        print "kw_true_text[0]: " + str(kw_true_text[0])

    # Majority baseline: predict the majority keyword vector for every test record
    testy_pred_majority = []
    for x in range(len(testy_pred)):
        testy_pred_majority.append(majority_vec)
    #print "testy_pred: " + str(testy_pred)

    # Score results against nearest neighbor classifier
    print "Scores for 1 class (0.1 cutoff):"
    precision, recall, f1, micro_p, micro_r, micro_f1 = data_util.score_vec_labels(testy, testy_pred)
    print "Macro KW scores:"
    print "F1: " + str(f1)
    print "precision: " + str(precision)
    print "recall: " + str(recall)
    print "Micro KW scores:"
    print "F1: " + str(micro_f1)
    print "precision: " + str(micro_p)
    print "recall: " + str(micro_r)

    # Score results against nearest neighbor classifier
    print "Scores for 1 class (0.2 cutoff):"
    precision, recall, f1, micro_p, micro_r, micro_f1 = data_util.score_vec_labels(testy, testy_pred_2)
    print "Macro KW scores:"
    print "F1: " + str(f1)
    print "precision: " + str(precision)
    print "recall: " + str(recall)
    print "Micro KW scores:"
    print "F1: " + str(micro_f1)
    print "precision: " + str(micro_p)
    print "recall: " + str(micro_r)

    # Score results against the majority baseline
    print "Scores for 1 class (majority baseline):"
    precision, recall, f1, micro_p, micro_r, micro_f1 = data_util.score_vec_labels(testy, testy_pred_majority)
    print "Macro KW scores:"
    print "F1: " + str(f1)
    print "precision: " + str(precision)
    print "recall: " + str(recall)
    print "Micro KW scores:"
    print "F1: " + str(micro_f1)
    print "precision: " + str(micro_p)
    print "recall: " + str(micro_r)

    # Save output to file. Note: kw_pred and kw_pred_text are only set in the 'cnn' branch above.
    #pred_dict = dict(zip(testids, testy_pred_labels))
    #output = open(outfile, 'w')
    #output.write(str(pred_dict))
    #output.close()
    cluster_keywords.write_clusters_to_xml(testfile, outfile, testids, kw_pred, kw_pred_text)

    endtime = time.time()
    print "run took " + str(endtime - starttime) + " s"
import os

# Project-local modules of the verbal-autopsy pipeline; cd (a working-directory
# context manager) and fixtags() are assumed to be defined elsewhere in this module.
import cluster_keywords
import extract_features
import heidel_tag
import medttk_tag
import spellcorrect
import tag_symptoms


def setup(arg_modelname, arg_train, arg_test, arg_features, arg_featurename, arg_name,
          arg_preprocess, arg_labels, arg_dev, arg_dataloc, arg_vecfile, crossval_num=None,
          arg_prefix="/u/sjeblee/research/va/data", arg_sympfile=None, arg_chvfile=None):
    print "setup prefix: " + arg_prefix
    if crossval_num is not None:
        trainname = arg_train + "_" + str(crossval_num)
        devname = arg_test + "_" + str(crossval_num)
    else:
        trainname = arg_train
        devname = arg_test
    trainname = trainname + "_cat"  # all, adult, child, or neonate
    devname = devname + "_cat"
    pre = arg_preprocess
    print "pre: " + pre
    labels = arg_labels
    featureset = arg_featurename  # Name of the feature set for the feature file
    features = arg_features  # type, checklist, narr_bow, narr_tfidf, narr_count, narr_vec, kw_bow, kw_tfidf, symp_train
    #modeltype = arg_model  # svm, knn, nn, lstm, nb, rf
    modelname = arg_modelname
    query_vectors = None

    # Location of data files
    dataloc = arg_dataloc
    resultsloc = arg_prefix + "/" + arg_name
    heideldir = "/u/sjeblee/tools/heideltime/heideltime-standalone"
    #scriptdir = "/u/sjeblee/research/va/git/verbal-autopsy"

    # Setup
    if not os.path.exists(resultsloc):
        os.mkdir(resultsloc)
    # Build the spell/symp files once for the whole dataset to make cross-validation faster
    trainset = dataloc + "/train_" + trainname + ".xml"
    devset = ""
    devfeatures = ""
    devresults = ""
    if arg_dev:
        devset = dataloc + "/dev_" + devname + ".xml"
        devfeatures = dataloc + "/dev_" + devname + ".features." + featureset
        devresults = resultsloc + "/dev_" + devname + ".results." + modelname + "." + featureset
    else:
        devset = dataloc + "/test_" + devname + ".xml"
        devfeatures = dataloc + "/test_" + devname + ".features." + featureset
        devresults = resultsloc + "/test_" + devname + ".results." + modelname + "." + featureset
    trainfeatures = dataloc + "/train_" + trainname + ".features." + featureset

    element = "narrative"
    # Edit by Yoona: track narrative elements as a list instead of a single string
    element = []
    element.append("narrative")

    # Preprocessing
    spname = "spell"
    print "Preprocessing..."
    if "spell" in pre:
        print "Running spelling correction..."
        trainsp = dataloc + "/train_" + trainname + "_" + spname + ".xml"
        devsp = ""
        if arg_dev:
            devsp = dataloc + "/dev_" + devname + "_" + spname + ".xml"
        else:
            devsp = dataloc + "/test_" + devname + "_" + spname + ".xml"
        if not os.path.exists(trainsp):
            print "spellcorrect on train data..."
            spellcorrect.run(trainset, trainsp)
        if not os.path.exists(devsp):
            print "spellcorrect on test data..."
            spellcorrect.run(devset, devsp)
        trainset = trainsp
        devset = devsp
        devname = devname + "_" + spname
        trainname = trainname + "_" + spname

    if "heidel" in pre:
        print "Running Heideltime..."
        with cd(heideldir):
            trainh = dataloc + "/train_" + trainname + "_ht.xml"
            if not os.path.exists(trainh):
                heidel_tag.run(trainset, trainh)
                fixtags(trainh)
            trainset = trainh
            devh = ""
            if arg_dev:
                devh = dataloc + "/dev_" + devname + "_ht.xml"
            else:
                devh = dataloc + "/test_" + devname + "_ht.xml"
            if not os.path.exists(devh):
                heidel_tag.run(devset, devh)
                fixtags(devh)
            devset = devh
        devname = devname + "_ht"
        trainname = trainname + "_ht"

    if "medttk" in pre:
        print "Running medttk..."
        trainh = dataloc + "/train_" + trainname + "_medttk.xml"
        if not os.path.exists(trainh):
            medttk_tag.run(trainset, trainh)
            fixtags(trainh)
        trainset = trainh
        devh = ""
        if arg_dev:
            devh = dataloc + "/dev_" + devname + "_medttk.xml"
        else:
            devh = dataloc + "/test_" + devname + "_medttk.xml"
        if not os.path.exists(devh):
            medttk_tag.run(devset, devh)
            fixtags(devh)
        devset = devh
        devname = devname + "_medttk"
        trainname = trainname + "_medttk"
        # Bug fix: was element = "narr_medttk", which replaced the list and broke later appends
        element.append("narr_medttk")

    if "symp" in pre:
        if arg_sympfile is None or arg_chvfile is None:
            print "Symptom files are not provided."
        print "Tagging symptoms..."
        sympname = "symp"
        tagger_name = "tag_symptoms"
        trainsp = dataloc + "/train_" + trainname + "_" + sympname + ".xml"
        devsp = ""
        if arg_dev:
            devsp = dataloc + "/dev_" + devname + "_" + sympname + ".xml"
        else:
            devsp = dataloc + "/test_" + devname + "_" + sympname + ".xml"
        if not os.path.exists(trainsp):
            print "tag_symptoms on train data..."
            tag_symptoms.run(trainset, trainsp, tagger_name, arg_sympfile, arg_chvfile)
            #fixtags(trainsp)
        if not os.path.exists(devsp):
            print "tag_symptoms on test data..."
            tag_symptoms.run(devset, devsp, tagger_name, arg_sympfile, arg_chvfile)
            #fixtags(devsp)
        trainset = trainsp
        devset = devsp
        devname = devname + "_" + sympname
        trainname = trainname + "_" + sympname
        #element = "narr_symp"
        element.append("narr_symp")

    if "textrank" in pre:
        print "Extracting keywords using textrank..."
        textrankname = "textrank"
        tagger_name = "textrank"
        trainsp = dataloc + "/train_" + trainname + "_" + textrankname + ".xml"
        devsp = ""
        if arg_dev:
            devsp = dataloc + "/dev_" + devname + "_" + textrankname + ".xml"
        else:
            devsp = dataloc + "/test_" + devname + "_" + textrankname + ".xml"
        if not os.path.exists(trainsp):
            print "Keyword extraction using textrank on train data..."
            tag_symptoms.run(trainset, trainsp, tagger_name)
        if not os.path.exists(devsp):
            print "Keyword extraction using textrank on test data..."
            tag_symptoms.run(devset, devsp, tagger_name)
        trainset = trainsp
        devset = devsp
        devname = devname + "_" + textrankname
        trainname = trainname + "_" + textrankname
        element.append("narr_textrank")

    if "kwc" in pre:
        numc = "50"
        kwname = "kwkm" + str(numc)
        # TODO: move this setup to a function
        trainkw = dataloc + "/train_" + trainname + "_" + kwname + ".xml"
        devkw = ""
        if arg_dev:
            devkw = dataloc + "/dev_" + devname + "_" + kwname + ".xml"
        else:
            devkw = dataloc + "/test_" + devname + "_" + kwname + ".xml"
        # Bug fix: was os.path.exists(trainkw and devkw), which only checked devkw
        if not (os.path.exists(trainkw) and os.path.exists(devkw)):
            print "Keyword clustering..."
            #clusterfile = trainkw + ".clusters"
            clusterfile = dataloc + "/train_" + trainname + "_" + kwname + ".clusters"
            cluster_keywords.run(trainkw, clusterfile, arg_vecfile, trainset, devset, devkw, num_clusters=numc)
        trainset = trainkw
        devset = devkw
        devname = devname + "_" + kwname
        trainname = trainname + "_" + kwname
        #query_vectors = eval(open(clusterfile + '.centers', 'r').read())
        #print('Loaded query vectors:', type(query_vectors))

    print("Elements: ")
    print(element)

    # Feature extraction
    if arg_dev:
        devset = dataloc + "/dev_" + devname + ".xml"
        devfeatures = dataloc + "/dev_" + devname + ".features." + featureset
        devresults = resultsloc + "/dev_" + devname + ".results." + modelname + "." + featureset
    else:
        devset = dataloc + "/test_" + devname + ".xml"
        devfeatures = dataloc + "/test_" + devname + ".features." + featureset
        devresults = resultsloc + "/test_" + devname + ".results." + modelname + "." + featureset
    trainfeatures = dataloc + "/train_" + trainname + ".features." + featureset
    print "trainfeatures: " + trainfeatures
    print "devfeatures: " + devfeatures

    stem = False
    lemma = False
    if "stem" in pre:
        stem = True
    if "lemma" in pre:
        lemma = True
    print "stem: " + str(stem) + " lemma: " + str(lemma)
    if not (os.path.exists(trainfeatures) and os.path.exists(devfeatures)):
        print "Extracting features..."
        if arg_vecfile is not None:
            extract_features.run(trainset, trainfeatures, devset, devfeatures, features, labels,
                                 stem, lemma, element, arg_vecfile=arg_vecfile)
        else:
            extract_features.run(trainset, trainfeatures, devset, devfeatures, features, labels,
                                 stem, lemma, element)
    return trainfeatures, devfeatures, devresults
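# A minimal usage sketch for setup(); every argument value below is a hypothetical
# placeholder chosen to match the feature and preprocessing names listed in the
# comments above, not a configuration from the original project.
trainfeatures, devfeatures, devresults = setup(
    arg_modelname="nn", arg_train="mds+rct/train", arg_test="mds+rct/dev",
    arg_features="type,narr_vec", arg_featurename="narrv", arg_name="nn_narrv",
    arg_preprocess="spell,symp", arg_labels="cat", arg_dev=True,
    arg_dataloc="/u/sjeblee/research/va/data/datasets",
    arg_vecfile="narr+ice+medhelp.vectors.100")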
__author__ = "Sergi Sancho, Adriana Fernandez, Eric Lopez y Gerard Marti" __credits__ = ['Sergi Sancho', 'Adriana Fernandez', 'Eric Lopez', 'Gerard Marti'] __license__ = "GPL" __version__ = "1.0" import extract_features import train import test import draw_results from MatlabCode import detect_signals #DETECTION #Color segmentation detect_signals.run() #CLASSIFICATION extract_features.run() # Extracts the features for all the images to train the classifier train.run() # Train the classifier #TESTING AND EVALUATION test.run() # Test a whole folder draw_results.run() # Draw the detected boxes in the images