Example #1
def get_lowerbound_f1_score(test_y_file_path):
    labelof = commons.get_label()

    with open(test_y_file_path, 'rb') as f:
        test_y = pickle.load(f)
    test_y = [LABEL_TO_DIGIT[item] for item in test_y]

    random_y = [
        random.randint(min(DIGITS), max(DIGITS)) for i in range(len(test_y))
    ]
    all_good_y = [LABEL_TO_DIGIT['GOOD'] for i in range(len(test_y))]

    count = Counter(test_y)
    count = [count[i] for i in DIGITS]
    weighted_random_y = random.choices(DIGITS, weights=count, k=len(test_y))

    print("random y: {}".format(random_y))
    print("all good y: {}".format(all_good_y))
    print("weighted random y: {}".format(weighted_random_y))

    random_score = sklearn.metrics.f1_score(test_y,
                                            random_y,
                                            average='weighted')
    all_good_score = sklearn.metrics.f1_score(test_y,
                                              all_good_y,
                                              average='weighted')
    weighted_random_score = sklearn.metrics.f1_score(test_y,
                                                     weighted_random_y,
                                                     average='weighted')

    print("random score: {}".format(random_score))
    print("all good score: {}".format(all_good_score))
    print("weighted random score: {}".format(weighted_random_score))
Example #2
def test(data=None, model_path=commons.BASELINE_MODEL):
    import joblib  # sklearn.externals.joblib was removed in newer scikit-learn releases
    svm = joblib.load(model_path)

    if data is None:
        test_x, test_y, test_who = load_test_data()
    else:
        test_x, test_y, test_who = data
    test_x, test_y = transform_x_y(test_x, test_y)

    result = svm.predict(test_x)
    assert (len(result) == len(test_y))

    # 1. naive accuracy
    print("\n==============================")
    print("Test accuracy: ")
    naive_accuracy(result, test_y)  # reuse the predictions computed above
    f1_score(result, test_y)

    # 2. majority voting accuracy
    resultof = defaultdict(lambda: defaultdict(list))
    label = commons.get_label()

    # 2.1 collect all the votes
    for i in range(len(result)):
        imdb_id, character_id, _ = test_who[i].split('-')
        character_id = int(character_id)
        resultof[imdb_id][character_id].append(result[i])

    # 2.2 majority vote
    correct = 0
    correct_by_type = defaultdict(int)
    count_by_type = defaultdict(int)
    for imdb_id in resultof:
        for character_id in resultof[imdb_id]:
            true_label = LABEL_TO_DIGIT[label[imdb_id][character_id]]
            if decide_y(resultof[imdb_id][character_id]) == true_label:
                correct += 1
                correct_by_type[true_label] += 1

    # 2.3 how many characters
    count = 0
    for imdb_id in resultof:
        count += len(resultof[imdb_id])

        for character_id in resultof[imdb_id]:
            count_by_type[LABEL_TO_DIGIT[label[imdb_id][character_id]]] += 1

    print("accuracy by actor: {}/{}".format(correct, count))
    for i in DIGITS:
        print("accuracy by actor type {}: {}/{}".format(
            i, correct_by_type[i], count_by_type[i]))
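decide_y() is referenced above but not defined in these examples; a plausible majority-vote sketch (an assumption, not the project's actual helper):

from collections import Counter

def decide_y(votes):
    # Majority vote over all per-frame predictions for one character:
    # return the most frequent predicted digit.
    return Counter(votes).most_common(1)[0][0]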
Example #3
def character_statistics():
	labelof = commons.get_label()

	count = []
	labels = ['GOOD','BAD','N','NA']
	label_count = defaultdict(list)

	for imdb_id in labelof:
		count.append(len(labelof[imdb_id]))
		counter = Counter(labelof[imdb_id].values())
		for label in labels:
			label_count[label].append(counter[label])

	mean = numpy.mean(count)
	std = numpy.std(count)
	summ = sum(count)
	print("mean = {} , std = {} , sum = {}".format(mean, std, summ))

	label_mean, label_std = [], []
	for label in labels:
		label_mean.append(numpy.mean(label_count[label]))
		label_std.append(numpy.std(label_count[label]))

	# plot label distribution
	fig, ax = plt.subplots()
	indent, width = numpy.arange(len(labels)), 0.5
	rect = ax.bar(indent, label_mean, width, color='w', yerr=label_std, edgecolor='black', 
			error_kw={'linestyle':':', 'markersize':2,'capsize':4})
	ax.set_title('Distribution of classes per movie')
	ax.set_xticks(indent)
	ax.set_xticklabels(labels)

	for r in rect:
		height = r.get_height()
		ax.text(r.get_x() + r.get_width()/2., 1.05*height,
			"{0:.2f}".format(height),
			ha='center', va='bottom')

	plt.show()
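character_statistics() relies on the nested structure returned by commons.get_label(); a small, purely illustrative example of that shape (the IMDb ids and labels below are made up):

# imdb_id -> character_id -> label string
example_labelof = {
    'tt0000001': {0: 'GOOD', 1: 'BAD', 2: 'N'},
    'tt0000002': {0: 'GOOD', 1: 'NA'},
}
# len(example_labelof[imdb_id]) counts characters per movie;
# Counter(example_labelof[imdb_id].values()) counts characters per class.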
Example #4
def generate_baseline_data():
    if not os.path.exists(commons.BASELINE_DIR):
        os.mkdir(commons.BASELINE_DIR)

    # if we already got them...
    generated = (os.path.exists(commons.BASELINE_TRAIN_X)
                 and os.path.exists(commons.BASELINE_TRAIN_Y)
                 and os.path.exists(commons.BASELINE_TEST_X)
                 and os.path.exists(commons.BASELINE_TEST_Y)
                 and os.path.exists(commons.BASELINE_TEST_WHO))

    if generated:
        return

    # generate them, save, and return them!
    train_ids = set([])
    with open(commons.TRAIN_FILE, 'r') as f:
        for line in f.readlines():
            train_ids.add(line.strip())

    test_ids = set([])
    with open(commons.TEST_FILE, 'r') as f:
        for line in f.readlines():
            test_ids.add(line.strip())

    # encoding and separate (data, label)
    train_x, train_y, test_x, test_y, test_who = [], [], [], [], []
    labelof = commons.get_label()

    IMAGE_SIZE = 224
    cnn_model = vgg16.VGG16(weights='imagenet', include_top=False)
    print("Loading baseline data...", datetime.now())

    image_filenames = [
        x for x in sorted(os.listdir(commons.TRAIN_IMAGES_DIR))
        if len(x) > 4 and x[-4:] == '.jpg'
    ]
    for i in range(len(image_filenames)):
        print("{}/{}...".format(i, len(image_filenames)), end='\r')

        image_filename = image_filenames[i]
        imdb_id, character_id, timestamp = image_filename[:-4].split('-')
        character_id = int(character_id)

        # if this movie is not yet labeled
        if imdb_id not in labelof:
            continue

        image = keras.preprocessing.image.load_img(
            os.path.join(commons.TRAIN_IMAGES_DIR, image_filename),
            target_size=(IMAGE_SIZE, IMAGE_SIZE))

        x = keras.preprocessing.image.img_to_array(image)
        x = numpy.expand_dims(x, axis=0)
        x = vgg16.preprocess_input(x)
        x = cnn_model.predict(x)

        y = labelof[imdb_id][character_id]

        if imdb_id in train_ids:
            train_x.append(x)
            train_y.append(y)
        else:
            test_x.append(x)
            test_y.append(y)
            test_who.append(image_filename[:-4])

    print("Done. ", datetime.now())

    with open(commons.BASELINE_TRAIN_X, 'wb') as f:
        pickle.dump(train_x, f)
    with open(commons.BASELINE_TRAIN_Y, 'wb') as f:
        pickle.dump(train_y, f)
    with open(commons.BASELINE_TEST_X, 'wb') as f:
        pickle.dump(test_x, f)
    with open(commons.BASELINE_TEST_Y, 'wb') as f:
        pickle.dump(test_y, f)
    with open(commons.BASELINE_TEST_WHO, 'w') as f:
        for who in test_who:
            f.write("{}\n".format(who))
Example #5
def generate_fusion_data():
    if not os.path.exists(FUSION_DIR):
        os.mkdir(FUSION_DIR)

    if (os.path.exists(FUSION_TRAIN_X) and os.path.exists(FUSION_TRAIN_Y)
            and os.path.exists(FUSION_TRAIN_WHO)
            and os.path.exists(FUSION_TEST_X) and os.path.exists(FUSION_TEST_Y)
            and os.path.exists(FUSION_TEST_WHO)):
        with open(FUSION_TRAIN_X, 'rb') as f:
            train_x = pickle.load(f)
        with open(FUSION_TRAIN_Y, 'rb') as f:
            train_y = pickle.load(f)
        with open(FUSION_TRAIN_WHO, 'r') as f:
            train_who = []
            for line in f.readlines():
                train_who.append(line.strip())
        with open(FUSION_TEST_X, 'rb') as f:
            test_x = pickle.load(f)
        with open(FUSION_TEST_Y, 'rb') as f:
            test_y = pickle.load(f)
        with open(FUSION_TEST_WHO, 'r') as f:
            test_who = []
            for line in f.readlines():
                test_who.append(line.strip())
        return (train_x, train_y, train_who), (test_x, test_y, test_who)

    audio_train_data = audio_model.load_training_data()
    audio_test_data = audio_model.load_test_data()

    train_ids, test_ids = commons.get_train_and_test_imbd_ids()
    movies, labelof = commons.load_movies(), commons.get_label()

    train_x, train_y, test_x, test_y = [], [], [], []
    train_who, test_who = audio_train_data[2], audio_test_data[2]

    IMAGE_SIZE = 224
    cnn_model = VGGFace(include_top=False,
                        input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3),
                        pooling=None)
    vgg19_model = scene_model.VGG_19()

    print("Generating & merging features...", datetime.now())

    for (xs, ys, whos) in [audio_train_data, audio_test_data]:
        assert (len(xs) == len(ys) and len(ys) == len(whos))

        for i in range(len(whos)):
            print("\r{}/{}".format(i, len(whos)), end="")

            imdb_id, c_id, scene = whos[i].strip().split('-')
            start_time, end_time = scene.split('~')
            c_id = int(c_id)
            start_time, end_time = float(start_time), float(end_time)

            audio_x, y = numpy.asarray(xs[i]).flatten(), ys[i]

            # find 2 random images belonging to (imdb_id, c_id) scene
            images = [
                x for x in os.listdir(commons.TRAIN_IMAGES_DIR)
                if x.startswith("{}-{}".format(imdb_id, c_id))
            ]
            temp_images = []

            for image in images:
                assert (image[-4:] == '.jpg')
                _, _, timestamp = image[:-4].split('-')
                timestamp = float(timestamp)
                if start_time <= timestamp and timestamp <= end_time:
                    temp_images.append(image)

            # a scene is guaranteed to have >= 5 images
            assert len(temp_images) >= 5
            images = random.sample(temp_images, NUM_IMAGES_USED)

            image_x = []
            for image in images:
                image_filename = os.path.join(commons.TRAIN_IMAGES_DIR, image)
                image = keras.preprocessing.image.load_img(
                    image_filename, target_size=(IMAGE_SIZE, IMAGE_SIZE))

                # face features
                x = keras.preprocessing.image.img_to_array(image)
                x = numpy.expand_dims(x, axis=0)
                x = keras_vggface.utils.preprocess_input(x, version=1)
                x = cnn_model.predict(x)
                x = x.flatten()
                image_x.append(x)

                # scene features
                x = scene_model.extract_scene_feature(vgg19_model, image)
                x = x.flatten()
                image_x.append(x)

            x = numpy.concatenate(tuple(image_x + [audio_x]))

            if imdb_id in train_ids:
                train_x.append(x)
                train_y.append(y)
            elif imdb_id in test_ids:
                test_x.append(x)
                test_y.append(y)

    print("\rDone. ", datetime.now())

    with open(FUSION_TRAIN_X, 'wb') as f:
        pickle.dump(train_x, f)
    with open(FUSION_TRAIN_Y, 'wb') as f:
        pickle.dump(train_y, f)
    with open(FUSION_TRAIN_WHO, 'w') as f:
        for who in train_who:
            f.write("{}\n".format(who))
    with open(FUSION_TEST_X, 'wb') as f:
        pickle.dump(test_x, f)
    with open(FUSION_TEST_Y, 'wb') as f:
        pickle.dump(test_y, f)
    with open(FUSION_TEST_WHO, 'w') as f:
        for who in test_who:
            f.write("{}\n".format(who))

    return (train_x, train_y, train_who), (test_x, test_y, test_who)
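A hypothetical usage sketch for generate_fusion_data(): load (or build) the fused features and sanity-check that data and labels line up before training a downstream model.

(train_data, test_data) = generate_fusion_data()
train_x, train_y, train_who = train_data
test_x, test_y, test_who = test_data

assert len(train_x) == len(train_y)
assert len(test_x) == len(test_y)
print("fusion train size: {}, test size: {}".format(len(train_x), len(test_x)))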
Example #6
def generate_audio_features():
    movies = commons.load_movies()
    print("Extract audio pieces...")

    # if not os.path.exists(commons.AUDIO_UNIT_DIR):
    #       os.mkdir(commons.AUDIO_UNIT_DIR)
    # if not os.path.exists(commons.AUDIO_UNIT_DONE_DIR):
    #       os.mkdir(commons.AUDIO_UNIT_DONE_DIR)

    labelof = commons.get_label()
    train_ids, test_ids = commons.get_train_and_test_imbd_ids()
    train_x, train_y, train_who, test_x, test_y, test_who = [], [], [], [], [], []

    if not os.path.exists(commons.AUDIO_BASELINE_DIR):
        os.mkdir(commons.AUDIO_BASELINE_DIR)
    all_scenes = {}  # imdb_id -> scenes

    for imdb_id in movies:
        if imdb_id not in labelof:
            continue
        temp_audio_unit_path = os.path.join(commons.AUDIO_UNIT_DIR, imdb_id)

        # 0. clean up work if exists
        if os.path.exists(os.path.join(commons.AUDIO_UNIT_DONE_DIR, imdb_id)):
            print("<{}> already completed.".format(imdb_id))
            continue  # skip this movie instead of aborting the whole loop
        if os.path.exists(os.path.join(commons.AUDIO_UNIT_DIR, imdb_id)):
            shutil.rmtree(temp_audio_unit_path)

        # 1. create temp path and start working!
        os.mkdir(temp_audio_unit_path)
        frames = [
            x for x in os.listdir(commons.TRAIN_IMAGES_DIR)
            if x.startswith(imdb_id)
        ]

        timestampsof = defaultdict(list)  # character_id -> timestamps
        for frame in frames:
            _, character_id, timestamp = frame[:-4].split('-')
            character_id = int(character_id)
            timestamp = float(timestamp)
            timestampsof[character_id].append(timestamp)
        for character_id in timestampsof:
            timestampsof[character_id].sort()

        # character_id -> list of scenes, each scene a list of timestamps
        # (floats in milliseconds, exactly as they appear in the train image filenames)
        scenes = defaultdict(list)
        curr_scene = []
        for character_id in timestampsof:
            for timestamp in timestampsof[character_id]:
                if not is_the_same_scene(curr_scene, timestamp):
                    scenes[character_id].append(curr_scene)
                    curr_scene = []
                curr_scene.append(timestamp)

            scenes[character_id].append(curr_scene)
            curr_scene = []

        for character_id in scenes:
            scenes[character_id] = filter_scene(scenes[character_id])
        all_scenes[imdb_id] = scenes

        # 2. from character_id -> list of scenes (each a list of timestamps),
        #    extract the audio for each scene and assign it to the train/test split
        #    NOTE: we need to write the audio out because we don't know how to convert
        #          a scipy wavfile array into a format known to pyAudioAnalysis
        partial_xs = []
        rate, data = wavfile.read(
            os.path.join(commons.AUDIO_DIR, "{}.wav".format(imdb_id)))
        for c_id in scenes:
            for scene in scenes[c_id]:
                # clip audio of this scene
                start, end = scene[0], scene[-1]
                start_sec, end_sec = start / 1000, end / 1000  # convert to seconds
                start_frame, end_frame = int(start_sec * rate), int(end_sec * rate)
                x = data[start_frame:end_frame + 1]

                # get audio features of this scene
                from pyAudioAnalysis import audioFeatureExtraction

                if len(x.shape) > 1 and x.shape[1] == 2:  # stereo to mono
                    x = x.sum(axis=1) / 2
                features, _ = audioFeatureExtraction.stFeatureExtraction(
                    x,
                    rate,
                    0.05 * rate,  # frame size: 50 ms windows
                    0.025 * (end_frame + 1 - start_frame))  # frame step: 2.5% of the clip, ~40 frames

                # catch edge case: nothing in audio
                if not numpy.isfinite(features).all():
                    features = numpy.nan_to_num(features)
                    print("Catched error: <{}> has nothing in audio.".format(
                        imdb_id))
                features = features[:, :38]  # drop extra frame data ... sign

                # add (x, y)
                label = labelof[imdb_id][c_id]
                if imdb_id in train_ids:
                    train_x.append(features)
                    train_y.append(label)
                    train_who.append("{}-{}-{}~{}".format(
                        imdb_id, c_id, start, end))
                else:
                    test_x.append(features)
                    test_y.append(label)
                    test_who.append("{}-{}-{}~{}".format(
                        imdb_id, c_id, start, end))

        print("<{}> Finished".format(imdb_id))

    with open(commons.AUDIO_BASELINE_TRAIN_X, 'wb') as f:
        pickle.dump(train_x, f)
    with open(commons.AUDIO_BASELINE_TRAIN_Y, 'wb') as f:
        pickle.dump(train_y, f)
    with open(commons.AUDIO_BASELINE_TRAIN_WHO, 'w') as f:
        for who in train_who:
            f.write(who + '\n')

    with open(commons.AUDIO_BASELINE_TEST_X, 'wb') as f:
        pickle.dump(test_x, f)
    with open(commons.AUDIO_BASELINE_TEST_Y, 'wb') as f:
        pickle.dump(test_y, f)
    with open(commons.AUDIO_BASELINE_TEST_WHO, 'w') as f:
        for who in test_who:
            f.write(who + '\n')

    with open(commons.SCENES, 'wb') as f:
        pickle.dump(all_scenes, f)
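is_the_same_scene() and filter_scene() are used above but not shown in these examples; a minimal sketch of what the scene-grouping loop assumes, with the gap threshold and minimum scene length as explicit assumptions (the >= 5 frames per scene figure comes from the assertion in Example #5):

SCENE_GAP_MS = 10 * 1000  # assumption: frames more than 10 s apart start a new scene

def is_the_same_scene(curr_scene, timestamp):
    # An empty scene accepts any timestamp; otherwise the new frame must be
    # close enough (in milliseconds) to the last frame already in the scene.
    return not curr_scene or timestamp - curr_scene[-1] <= SCENE_GAP_MS

def filter_scene(scene_list):
    # Drop scenes too short to clip useful audio from (minimum length assumed).
    return [scene for scene in scene_list if len(scene) >= 5]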