def generate(datasetpath, outputpath, pretrainedpath, frequency, batch_size,
             sample_mode):
    Path(outputpath).mkdir(parents=True, exist_ok=True)
    temppath = outputpath + "/temp/"
    rootdir = Path(datasetpath)
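    # Recursively collect all .mp4 files under the dataset root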
    videos = [str(f) for f in rootdir.glob('**/*.mp4')]
    # setup the model
    i3d = i3_res50(400, pretrainedpath)
    i3d.cuda()
    i3d.train(False)  # Set model to evaluate mode
    for video in videos:
        videoname = video.split("/")[-1].split(".")[0]
        starttime = time.time()
        print("Generating for {0}".format(video))
        Path(temppath).mkdir(parents=True, exist_ok=True)
        ffmpeg.input(video).output('{}%d.jpg'.format(temppath),
                                   start_number=0).global_args(
                                       '-loglevel', 'quiet').run()
        print("Preprocessing done..")
        features = run(i3d, frequency, temppath, batch_size, sample_mode)
        np.save(outputpath + "/" + videoname, features)
        print("Obtained features of size: ", features.shape)
        shutil.rmtree(temppath)
        print("done in {0}.".format(time.time() - startime))
Example #3
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

__author__ = "Miquel Ferrarons, David Vazquez"
__copyright__ = "Copyright 2015, CVC-UAB"
__credits__ = ["Miquel Ferrarons", "David Vazquez"]
__license__ = "GPL"
__version__ = "1.0"
__maintainer__ = "Miquel Ferrarons"

import extract_features
import train_model
import test_image
import test_folder
import show_results
import evaluate_results

extract_features.run()  # Extracts the features for all the images
train_model.run()  # Train the classifier

# Tests a single image and shows the results
# test_image.run()

# Test a whole folder
test_folder.run()
show_results.run()  # Saves the resulting images
evaluate_results.run()  # Runs the evaluation
Example #4
def run(trainfile, testfile, outfile, num_clusters, model='seq', clusterfile=None, vecfile=None):
    starttime = time.time()

    # Setup
    train_feat_file = trainfile + ".feats"
    train_kw_file = trainfile + ".kw_clusters"
    test_feat_file = testfile + ".feats"
    test_kw_file = testfile + ".kw_clusters"
    if vecfile is None:
        vecfile = "/u/sjeblee/research/va/data/datasets/mds+rct/narr+ice+medhelp.vectors.100"
    #clusterfile = "/u/sjeblee/research/va/data/datasets/mds+rct/train_adult_cat_spell.clusters_km50"
    max_label = int(num_clusters) - 1

    # Extract word vector features and keyword vectors
    if not (os.path.exists(train_feat_file) and os.path.exists(test_feat_file)):
        extract_features.run(trainfile, train_feat_file, testfile, test_feat_file, arg_featurenames="narr_vec", arg_vecfile=vecfile)
    trainids, trainx = preprocess(train_feat_file, [], [], ["narr_vec"])
    testids, testx = preprocess(test_feat_file, [], [], ["narr_vec"])
    if not (os.path.exists(train_kw_file) and os.path.exists(test_kw_file)):
        extract_features.run(trainfile, train_kw_file, testfile, test_kw_file, arg_featurenames="kw_clusters", arg_vecfile=vecfile)
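    # Index the keyword-cluster labels by record id so they can be matched to the feature ids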
    trainyids, trainkws = preprocess(train_kw_file, [], [], ["keyword_clusters"])
    train_kw_dict = dict(zip(trainyids, trainkws))
    
    testyids, testkws = preprocess(test_kw_file, [], [], ["keyword_clusters"])
    test_kw_dict = dict(zip(testyids, testkws))

    # Calculate majority class
    kw_list = []
    for kw_items in trainkws:
        for item in kw_items:
            if item != "":
                kw_list.append(int(item))
    counter = Counter(kw_list)
    majority_kw = counter.most_common(1)[0][0]
    print "majority kw: " + str(majority_kw)
    majority_vec = data_util.multi_hot_encoding([[majority_kw]], max_label)[0]
    print "majority_vec: " + str(majority_vec)

    # Match up the kw vectors with the trainids and testids
    trainy = []
    testy = []
    for rec_id in trainids:
        trainy.append(train_kw_dict[rec_id])
    for rec_id in testids:
        testy.append(list(test_kw_dict[rec_id]))

    print "trainy[0]: " + str(trainy[0])
    trainy_vecs = None
    trainy_phrases = None
    testy_vecs = None
    testy_phrases = None
    y_pred = []

    # keyword one-hot encoding
    if model == 'seq':
        keywords = []
        for item in trainy:
            for kw in item:
                keywords.append(kw)
        labelencoder = model_seq.create_labelencoder(keywords)
        trainy_vecs = model_seq.encode_labels(trainy, labelencoder)
        testy_vecs = model_seq.encode_labels(testy, labelencoder)
        print "trainy_vecs[0]: " + str(trainy_vecs[0])
    
    # keyword multi-hot encoding
    if model == 'cnn':
        trainy = data_util.multi_hot_encoding(trainy, max_label)
        testy = data_util.multi_hot_encoding(testy, max_label)

    if model == 'encoder-decoder':
        trainy_vecs, trainy_phrases = cluster_keywords.cluster_embeddings(trainy, clusterfile, vecfile, return_phrases=True)
        testy_vecs, testy_phrases = cluster_keywords.cluster_embeddings(testy, clusterfile, vecfile, return_phrases=True)

    # Train and test the model
    print "trainy size: " + str(len(trainy))
    #output_seq_len = numpy.asarray(trainy).shape[1]
    #print "output_seq_len: " + str(output_seq_len)
    #print "trainx[0]: " + str(trainx[0])
    print "trainy[0]: " + str(trainy[0])
    #print "trainy_phrases[0]: " + str(trainy_phrases[0])

    # Seq2seq - with sequences of one-hot encodings
    if model == 'seq':
        print "seq model"
        output_seq_len = 10
        nodes = 128
        model, encoder, decoder, output_dim = model_seq.train_seq2seq(trainx, trainy_vecs, nodes, True)
        y_pred = model_seq.predict_seqs(encoder, decoder, testx, output_seq_len, output_dim, True)
        testy_pred_labs = model_seq.decode_all_labels(y_pred, labelencoder)
        testy_pred_labels = [','.join(row) for row in testy_pred_labs] 
        #testy_pred_labels = cluster_keywords.embeddings_to_clusters(y_pred, clusterfile)
        #kw_true_text = testy_phrases
        testy = data_util.multi_hot_encoding(testy, max_label)
        testy_pred = data_util.multi_hot_encoding(testy_pred_labs, max_label)
        #testy_pred = data_util.map_to_multi_hot(y_pred)
        #testy_pred_labels = data_util.decode_multi_hot(testy_pred)
        print "testy_pred_labels[0]: " + str(testy_pred_labels[0])

    # Torch encoder-decoder
    elif model == 'encoder-decoder':
        print "torch encoder-decoder"
        output_seq_len = 10
        nodes = 100 # TODO: does this have to be the same as the word vector dim?
        encoder, decoder, output_dim = model_library_torch.encoder_decoder_model(trainx, trainy_vecs, nodes, num_epochs=1)
        y_pred = model_library_torch.test_encoder_decoder(encoder, decoder, testx, output_seq_len, output_dim)
        testy_pred_labels = cluster_keywords.embeddings_to_clusters(y_pred, clusterfile)
        kw_true_text = testy_phrases
        testy = data_util.multi_hot_encoding(testy, max_label)
        testy_pred = data_util.multi_hot_encoding(testy_pred_labels, max_label)

    # CNN
    elif model == 'cnn':
        #modelfile = "keyword_cnn_kwkm" + str(num_clusters) + ".model"
        modelfile = "/u/sjeblee/research/va/data/crossval_kw/gru_cnn_1/gru_cnn_1_adult.model"
        nodes = 100
        if os.path.exists(modelfile):
            print "Using existing model"
            cnn = load_model(modelfile)
        else:
            print "Training new model..."
            #cnn, x, y = model_library.rnn_model(trainx, trainy, 100, modelname='gru', num_epochs=15)
            #cnn, x, y = model_library.cnn_model(trainx, numpy.asarray(trainy), num_epochs=10, loss_func='mean_squared_error')
            cnn, x, y = model_library.stacked_model(trainx, [numpy.asarray(trainy)], nodes, models='gru_cnn', num_epochs=15, loss_func='mean_squared_error')
            cnn.save(modelfile)
        # Test
        #y_pred = cnn.predict(numpy.asarray(testx))
        
        # TEMP for multi model
        y_pred = cnn.predict(numpy.asarray(testx))[1].tolist()
        print "y_pred: " + str(len(y_pred))
        
        #testy_pred_0 = data_util.map_to_multi_hot(testy_pred, 0.5)
        testy_pred = data_util.map_to_multi_hot(y_pred)
        # Decode labels
        testy_pred_labels = data_util.decode_multi_hot(testy_pred)
        print "testy_pred_labels[0]: " + str(testy_pred_labels[0])
        kw_pred = [thing.split(',') for thing in testy_pred_labels]
        kw_true = [thing.split(',') for thing in data_util.decode_multi_hot(testy)]
        kw_emb, kw_pred_text = cluster_keywords.cluster_embeddings(kw_pred, clusterfile, vecfile, True)
        kw_true_emb, kw_true_text = cluster_keywords.cluster_embeddings(kw_true, clusterfile, vecfile, True)

        #testy = testy.tolist()
        #print "testx[0]: " + str(testx[0])
        print "testy[0]: " + str(len(testy[0])) + " " + str(testy[0])
        print "testy_pred[0]: " + str(len(testy_pred[0])) + " " + str(testy_pred[0])

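    # Re-threshold the raw predictions at 0.2 and 0.3 cutoffs for comparison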
    testy_pred_2 = data_util.map_to_multi_hot(y_pred, 0.2)
    testy_pred_3 = data_util.map_to_multi_hot(y_pred, 0.3)
    #kw_pred_text = cluster_keywords.interpret_clusters(testy_pred_labels, clusterfile)
    #kw_true_text = cluster_keywords.interpret_clusters(data_util.decode_multi_hot(testy), clusterfile)
    print "kw_pred_text[0]: " + str(kw_pred_text[0])
    print "kw_true_text[0]: " + str(kw_true_text[0])

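    # Majority-class baseline: predict the most frequent keyword cluster for every test record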
    testy_pred_majority = []
    for x in range(len(testy_pred)):
        testy_pred_majority.append(majority_vec)

    #print "testy_pred: " + str(testy_pred)

    # Score the predicted keyword vectors against the gold labels
    print "Scores for 1 class (0.1 cutoff):"
    precision, recall, f1, micro_p, micro_r, micro_f1 = data_util.score_vec_labels(testy, testy_pred)
    print "Macro KW scores:"
    print "F1: " + str(f1)
    print "precision: " + str(precision)
    print "recall: " + str(recall)
    print "Micro KW scores:"
    print "F1: " + str(micro_f1)
    print "precision: " + str(micro_p)
    print "recall: " + str(micro_r)

    # Score the re-thresholded predictions against the gold labels
    print "Scores for 1 class (0.2 cutoff):"
    precision, recall, f1, micro_p, micro_r, micro_f1 = data_util.score_vec_labels(testy, testy_pred_2)
    print "Macro KW scores:"
    print "F1: " + str(f1)
    print "precision: " + str(precision)
    print "recall: " + str(recall)
    print "Micro KW scores:"
    print "F1: " + str(micro_f1)
    print "precision: " + str(micro_p)
    print "recall: " + str(micro_r)

    # Score the majority-class baseline against the gold labels
    print "Scores for 1 class (majority baseline):"
    precision, recall, f1, micro_p, micro_r, micro_f1 = data_util.score_vec_labels(testy, testy_pred_majority)
    print "Macro KW scores:"
    print "F1: " + str(f1)
    print "precision: " + str(precision)
    print "recall: " + str(recall)
    print "Micro KW scores:"
    print "F1: " + str(micro_f1)
    print "precision: " + str(micro_p)
    print "recall: " + str(micro_r)

    # Save output to file
    #pred_dict = dict(zip(testids, testy_pred_labels))
    #output = open(outfile, 'w')
    #output.write(str(pred_dict))
    #output.close()
    cluster_keywords.write_clusters_to_xml(testfile, outfile, testids, kw_pred, kw_pred_text)

    endtime = time.time()
    print "preprocessing took " + str(endtime - starttime) + " s"
Example #5
def setup(arg_modelname,
          arg_train,
          arg_test,
          arg_features,
          arg_featurename,
          arg_name,
          arg_preprocess,
          arg_labels,
          arg_dev,
          arg_dataloc,
          arg_vecfile,
          crossval_num=None,
          arg_prefix="/u/sjeblee/research/va/data",
          arg_sympfile=None,
          arg_chvfile=None):
    print "setup prefix: " + arg_prefix
    if crossval_num is not None:
        trainname = arg_train + "_" + str(crossval_num)
        devname = arg_test + "_" + str(crossval_num)
    else:
        trainname = arg_train
        devname = arg_test
    trainname = trainname + "_cat"  # all, adult, child, or neonate
    devname = devname + "_cat"
    pre = arg_preprocess
    print "pre: " + pre
    labels = arg_labels
    featureset = arg_featurename  # Name of the feature set for feature file
    features = arg_features  # type, checklist, narr_bow, narr_tfidf, narr_count, narr_vec, kw_bow, kw_tfidf, symp_train
    #modeltype = arg_model # svm, knn, nn, lstm, nb, rf
    modelname = arg_modelname
    query_vectors = None

    # Location of data files
    dataloc = arg_dataloc
    resultsloc = arg_prefix + "/" + arg_name
    heideldir = "/u/sjeblee/tools/heideltime/heideltime-standalone"
    #scriptdir="/u/sjeblee/research/va/git/verbal-autopsy"

    # Setup
    if not os.path.exists(resultsloc):
        os.mkdir(resultsloc)
    trainset = dataloc + "/train_" + trainname + ".xml"  # preprocess the full dataset once (spell/symp) so cross-validation folds can reuse it
    devset = ""
    devfeatures = ""
    devresults = ""
    if arg_dev:
        devset = dataloc + "/dev_" + devname + ".xml"
        devfeatures = dataloc + "/dev_" + devname + ".features." + featureset
        devresults = resultsloc + "/dev_" + devname + ".results." + modelname + "." + featureset
    else:
        devset = dataloc + "/test_" + devname + ".xml"
        devfeatures = dataloc + "/test_" + devname + ".features." + featureset
        devresults = resultsloc + "/test_" + devname + ".results." + modelname + "." + featureset
    trainfeatures = dataloc + "/train_" + trainname + ".features." + featureset
    element = "narrative"

    # Edit by Yoona
    element = []
    element.append("narrative")

    # Preprocessing
    spname = "spell"
    print "Preprocessing..."
    if "spell" in pre:
        print "Running spelling correction..."
        trainsp = dataloc + "/train_" + trainname + "_" + spname + ".xml"
        devsp = ""
        if arg_dev:
            devsp = dataloc + "/dev_" + devname + "_" + spname + ".xml"
        else:
            devsp = dataloc + "/test_" + devname + "_" + spname + ".xml"
        if not os.path.exists(trainsp):
            print "spellcorrect on train data..."
            spellcorrect.run(trainset, trainsp)
        if not os.path.exists(devsp):
            print "spellcorrect on test data..."
            spellcorrect.run(devset, devsp)

        trainset = trainsp
        devset = devsp
        devname = devname + "_" + spname
        trainname = trainname + "_" + spname

    if "heidel" in pre:
        print "Running Heideltime..."
        with cd(heideldir):
            trainh = dataloc + "/train_" + trainname + "_ht.xml"
            if not os.path.exists(trainh):
                heidel_tag.run(trainset, trainh)
                fixtags(trainh)
            trainset = trainh
            devh = ""
            if arg_dev:
                devh = dataloc + "/dev_" + devname + "_ht.xml"
            else:
                devh = dataloc + "/test_" + devname + "_ht.xml"
            if not os.path.exists(devh):
                heidel_tag.run(devset, devh)
                fixtags(devh)
            devset = devh
        devname = devname + "_ht"
        trainname = trainname + "_ht"

    if "medttk" in pre:
        print "Running medttk..."
        trainh = dataloc + "/train_" + trainname + "_medttk.xml"
        if not os.path.exists(trainh):
            medttk_tag.run(trainset, trainh)
            fixtags(trainh)
        trainset = trainh
        devh = ""
        if arg_dev:
            devh = dataloc + "/dev_" + devname + "_medttk.xml"
        else:
            devh = dataloc + "/test_" + devname + "_medttk.xml"
        if not os.path.exists(devh):
            medttk_tag.run(devset, devh)
            fixtags(devh)
        devset = devh
        devname = devname + "_medttk"
        trainname = trainname + "_medttk"
        element = "narr_medttk"

    if "symp" in pre:
        if (arg_sympfile is None or arg_chvfile is None):
            print "Symptom files are not provided."
        print "Tagging symptoms..."
        sympname = "symp"
        tagger_name = "tag_symptoms"
        trainsp = dataloc + "/train_" + trainname + "_" + sympname + ".xml"
        devsp = ""
        if arg_dev:
            devsp = dataloc + "/dev_" + devname + "_" + sympname + ".xml"
        else:
            devsp = dataloc + "/test_" + devname + "_" + sympname + ".xml"
        if not os.path.exists(trainsp):
            print "tag_symptoms on train data..."
            tag_symptoms.run(trainset, trainsp, tagger_name, arg_sympfile,
                             arg_chvfile)
            #fixtags(trainsp)
        if not os.path.exists(devsp):
            print "tag_symptoms on test data..."
            tag_symptoms.run(devset, devsp, tagger_name, arg_sympfile,
                             arg_chvfile)
        #    fixtags(devsp)

        trainset = trainsp
        devset = devsp
        devname = devname + "_" + sympname
        trainname = trainname + "_" + sympname
        #element = "narr_symp"
        element.append("narr_symp")

    if "textrank" in pre:
        print "Extract keywords using textrank"
        textrankname = "textrank"
        tagger_name = "textrank"

        trainsp = dataloc + "/train_" + trainname + "_" + textrankname + ".xml"
        devsp = ""
        if arg_dev:
            devsp = dataloc + "/dev_" + devname + "_" + textrankname + ".xml"
        else:
            devsp = dataloc + "/test_" + devname + "_" + textrankname + ".xml"
        if not os.path.exists(trainsp):
            print "Keyword extraction using textrank on train data..."
            tag_symptoms.run(trainset, trainsp, tagger_name)
        if not os.path.exists(devsp):
            print "Keyword extraction using textrank on test data..."
            tag_symptoms.run(devset, devsp, tagger_name)

        trainset = trainsp
        devset = devsp
        devname = devname + "_" + textrankname
        trainname = trainname + "_" + textrankname
        element.append("narr_textrank")

    if "kwc" in pre:
        numc = "50"
        kwname = "kwkm" + str(numc)
        # TODO: move this setup to a function
        trainkw = dataloc + "/train_" + trainname + "_" + kwname + ".xml"
        devkw = ""
        if arg_dev:
            devkw = dataloc + "/dev_" + devname + "_" + kwname + ".xml"
        else:
            devkw = dataloc + "/test_" + devname + "_" + kwname + ".xml"
        if not (os.path.exists(trainkw) and os.path.exists(devkw)):
            print "Keyword clustering..."
            #clusterfile = trainkw + ".clusters"
            clusterfile = dataloc + "/train_" + trainname + "_" + kwname + ".clusters"
            cluster_keywords.run(trainkw,
                                 clusterfile,
                                 arg_vecfile,
                                 trainset,
                                 devset,
                                 devkw,
                                 num_clusters=numc)
        trainset = trainkw
        devset = devkw
        devname = devname + "_" + kwname
        trainname = trainname + "_" + kwname
        #query_vectors = eval(open(clusterfile + '.centers', 'r').read())
        #print('Loaded query vectors:', type(query_vectors))

    print("Elements: ")
    print(element)
    # Feature Extraction
    if arg_dev:
        devset = dataloc + "/dev_" + devname + ".xml"
        devfeatures = dataloc + "/dev_" + devname + ".features." + featureset
        devresults = resultsloc + "/dev_" + devname + ".results." + modelname + "." + featureset
    else:
        devset = dataloc + "/test_" + devname + ".xml"
        devfeatures = dataloc + "/test_" + devname + ".features." + featureset
        devresults = resultsloc + "/test_" + devname + ".results." + modelname + "." + featureset
    trainfeatures = dataloc + "/train_" + trainname + ".features." + featureset
    print "trainfeatures: " + trainfeatures
    print "devfeatures: " + devfeatures
    stem = False
    lemma = False
    if "stem" in pre:
        stem = True
    if "lemma" in pre:
        lemma = True
    print "stem: " + str(stem) + " lemma: " + str(lemma)
    if not (os.path.exists(trainfeatures) and os.path.exists(devfeatures)):
        print "Extracting features..."
        if arg_vecfile is not None:
            extract_features.run(trainset,
                                 trainfeatures,
                                 devset,
                                 devfeatures,
                                 features,
                                 labels,
                                 stem,
                                 lemma,
                                 element,
                                 arg_vecfile=arg_vecfile)
        else:
            extract_features.run(trainset, trainfeatures, devset, devfeatures,
                                 features, labels, stem, lemma, element)
    return trainfeatures, devfeatures, devresults
Example #6
__author__ = "Sergi Sancho, Adriana Fernandez, Eric Lopez y Gerard Marti"
__credits__ = ['Sergi Sancho', 'Adriana Fernandez', 'Eric Lopez', 'Gerard Marti']
__license__ = "GPL"
__version__ = "1.0"

import extract_features
import train
import test
import draw_results
from MatlabCode import detect_signals

#DETECTION

#Color segmentation
detect_signals.run()

#CLASSIFICATION
extract_features.run() # Extracts the features for all the images to train the classifier
train.run() # Train the classifier

#TESTING AND EVALUATION
test.run() # Test a whole folder
draw_results.run() # Draw the detected boxes in the images