def get_lowerbound_f1_score(test_y_file_path):
    labelof = commons.get_label()
    with open(test_y_file_path, 'rb') as f:
        test_y = pickle.load(f)
    test_y = [LABEL_TO_DIGIT[item] for item in test_y]

    random_y = [
        random.randint(min(DIGITS), max(DIGITS)) for i in range(len(test_y))
    ]
    all_good_y = [LABEL_TO_DIGIT['GOOD'] for i in range(len(test_y))]
    count = Counter(test_y)
    count = [count[i] for i in DIGITS]
    weighted_random_y = random.choices(DIGITS, weights=count, k=len(test_y))

    print("random y: {}".format(random_y))
    print("all good y: {}".format(all_good_y))
    print("weighted random y: {}".format(weighted_random_y))

    random_score = sklearn.metrics.f1_score(test_y, random_y, average='weighted')
    all_good_score = sklearn.metrics.f1_score(test_y, all_good_y, average='weighted')
    weighted_random_score = sklearn.metrics.f1_score(
        test_y, weighted_random_y, average='weighted')

    print("random score: {}".format(random_score))
    print("all good score: {}".format(all_good_score))
    print("weighted random score: {}".format(weighted_random_score))
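# Note: LABEL_TO_DIGIT and DIGITS are assumed to be module-level constants defined
# elsewhere in this repo. A plausible (hypothetical) mapping over the four classes
# used in character_statistics() below would be:
#
#   LABEL_TO_DIGIT = {'GOOD': 0, 'BAD': 1, 'N': 2, 'NA': 3}
#   DIGITS = sorted(LABEL_TO_DIGIT.values())
#
# With that mapping, the weighted-random baseline above samples digits with the same
# class frequencies as test_y, giving the lower-bound F1 score models are compared against.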
def test(data=None, model_path=commons.BASELINE_MODEL):
    from sklearn.externals import joblib
    svm = joblib.load(model_path)

    if data is None:
        test_x, test_y, test_who = load_test_data()
    else:
        test_x, test_y, test_who = data
    test_x, test_y = transform_x_y(test_x, test_y)

    result = svm.predict(test_x)
    assert (len(result) == len(test_y))

    # 1. naive accuracy
    print("\n==============================")
    print("Test accuracy: ")
    naive_accuracy(svm.predict(test_x), test_y)
    f1_score(result, test_y)

    # 2. majority voting accuracy
    resultof = defaultdict(lambda: defaultdict(list))
    label = commons.get_label()

    # 2.1 collect all the votings
    for i in range(len(result)):
        imdb_id, character_id, _ = test_who[i].split('-')
        character_id = int(character_id)
        resultof[imdb_id][character_id].append(result[i])

    # 2.2 majority vote
    correct = 0
    correct_by_type = defaultdict(int)
    count_by_type = defaultdict(int)
    for imdb_id in resultof:
        for character_id in resultof[imdb_id]:
            true_label = LABEL_TO_DIGIT[label[imdb_id][character_id]]
            if decide_y(resultof[imdb_id][character_id]) == true_label:
                correct += 1
                correct_by_type[true_label] += 1

    # 2.3 how many characters
    count = 0
    for imdb_id in resultof:
        count += len(resultof[imdb_id])
        for character_id in resultof[imdb_id]:
            count_by_type[LABEL_TO_DIGIT[label[imdb_id][character_id]]] += 1

    print("accuracy by actor: {}/{}".format(correct, count))
    for i in DIGITS:
        print("accuracy by actor type {}: {}/{}".format(
            i, correct_by_type[i], count_by_type[i]))
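# decide_y() used above is defined elsewhere in this module. The sketch below shows the
# simple majority-vote behaviour it is assumed to implement (hypothetical name
# _majority_vote_sketch so it does not shadow the real helper). Ties are broken by
# Counter.most_common() order, which is an assumption rather than the repo's rule.
def _majority_vote_sketch(votes):
    """Return the most common predicted digit among a character's frame-level votes."""
    return Counter(votes).most_common(1)[0][0]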
def character_statistics():
    labelof = commons.get_label()
    count = []
    labels = ['GOOD', 'BAD', 'N', 'NA']
    label_count = defaultdict(list)
    for imdb_id in labelof:
        count.append(len(labelof[imdb_id]))
        counter = Counter(labelof[imdb_id].values())
        for label in labels:
            label_count[label].append(counter[label])

    mean = numpy.mean(count)
    std = numpy.std(count)
    summ = sum(count)
    print("mean = {} , std = {} , sum = {}".format(mean, std, summ))

    label_mean, label_std = [], []
    for label in labels:
        label_mean.append(numpy.mean(label_count[label]))
        label_std.append(numpy.std(label_count[label]))

    # plot label distribution
    fig, ax = plt.subplots()
    indent, width = numpy.arange(len(labels)), 0.5
    rect = ax.bar(indent, label_mean, width, color='w', yerr=label_std,
                  edgecolor='black',
                  error_kw={'linestyle': ':', 'markersize': 2, 'capsize': 4})
    ax.set_title('Distribution of classes per movie')
    ax.set_xticks(indent)
    ax.set_xticklabels(labels)
    for r in rect:
        height = r.get_height()
        ax.text(r.get_x() + r.get_width() / 2., 1.05 * height,
                "{0:.2f}".format(height), ha='center', va='bottom')
    plt.show()
def generate_baseline_data():
    if not os.path.exists(commons.BASELINE_DIR):
        os.mkdir(commons.BASELINE_DIR)

    # if we already got them...
    generated = (os.path.exists(commons.BASELINE_TRAIN_X)
                 and os.path.exists(commons.BASELINE_TRAIN_Y)
                 and os.path.exists(commons.BASELINE_TEST_X)
                 and os.path.exists(commons.BASELINE_TEST_Y)
                 and os.path.exists(commons.BASELINE_TEST_WHO))
    if generated:
        return

    # generate them, save, and return them!
    train_ids = set([])
    with open(commons.TRAIN_FILE, 'r') as f:
        for line in f.readlines():
            train_ids.add(line.strip())
    test_ids = set([])
    with open(commons.TEST_FILE, 'r') as f:
        for line in f.readlines():
            test_ids.add(line.strip())

    # encoding and separate (data, label)
    train_x, train_y, test_x, test_y, test_who = [], [], [], [], []
    labelof = commons.get_label()

    IMAGE_SIZE = 224
    cnn_model = vgg16.VGG16(weights='imagenet', include_top=False)

    print("Loading baseline data...", datetime.now())
    image_filenames = [
        x for x in sorted(os.listdir(commons.TRAIN_IMAGES_DIR))
        if len(x) > 4 and x[-4:] == '.jpg'
    ]
    for i in range(len(image_filenames)):
        print("{}/{}...".format(i, len(image_filenames)), end='\r')
        image_filename = image_filenames[i]
        imdb_id, character_id, timestamp = image_filename[:-4].split('-')
        character_id = int(character_id)

        # if this movie is not yet labeled
        if imdb_id not in labelof:
            continue

        image = keras.preprocessing.image.load_img(
            os.path.join(commons.TRAIN_IMAGES_DIR, image_filename),
            target_size=(IMAGE_SIZE, IMAGE_SIZE))
        x = keras.preprocessing.image.img_to_array(image)
        x = numpy.expand_dims(x, axis=0)
        x = vgg16.preprocess_input(x)
        x = cnn_model.predict(x)
        y = labelof[imdb_id][character_id]

        if imdb_id in train_ids:
            train_x.append(x)
            train_y.append(y)
        else:
            test_x.append(x)
            test_y.append(y)
            test_who.append(image_filename[:-4])
    print("Done. ", datetime.now())

    with open(commons.BASELINE_TRAIN_X, 'wb') as f:
        pickle.dump(train_x, f)
    with open(commons.BASELINE_TRAIN_Y, 'wb') as f:
        pickle.dump(train_y, f)
    with open(commons.BASELINE_TEST_X, 'wb') as f:
        pickle.dump(test_x, f)
    with open(commons.BASELINE_TEST_Y, 'wb') as f:
        pickle.dump(test_y, f)
    with open(commons.BASELINE_TEST_WHO, 'w') as f:
        for who in test_who:
            f.write("{}\n".format(who))
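# The training step that produces commons.BASELINE_MODEL (loaded by test() above with
# joblib) is not shown in this section. A minimal sketch, assuming the pickled VGG16
# features only need to be flattened and the labels mapped through LABEL_TO_DIGIT,
# might look like the hypothetical helper below; the kernel choice is an assumption.
def _train_baseline_svm_sketch():
    from sklearn.svm import SVC
    from sklearn.externals import joblib  # same (deprecated) import used in test()
    with open(commons.BASELINE_TRAIN_X, 'rb') as f:
        train_x = pickle.load(f)
    with open(commons.BASELINE_TRAIN_Y, 'rb') as f:
        train_y = pickle.load(f)
    train_x = [numpy.asarray(x).flatten() for x in train_x]
    train_y = [LABEL_TO_DIGIT[y] for y in train_y]
    svm = SVC(kernel='linear')
    svm.fit(train_x, train_y)
    joblib.dump(svm, commons.BASELINE_MODEL)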
def generate_fusion_data():
    if not os.path.exists(FUSION_DIR):
        os.mkdir(FUSION_DIR)

    if (os.path.exists(FUSION_TRAIN_X) and os.path.exists(FUSION_TRAIN_Y)
            and os.path.exists(FUSION_TRAIN_WHO)
            and os.path.exists(FUSION_TEST_X)
            and os.path.exists(FUSION_TEST_Y)
            and os.path.exists(FUSION_TEST_WHO)):
        with open(FUSION_TRAIN_X, 'rb') as f:
            train_x = pickle.load(f)
        with open(FUSION_TRAIN_Y, 'rb') as f:
            train_y = pickle.load(f)
        with open(FUSION_TRAIN_WHO, 'r') as f:
            train_who = []
            for line in f.readlines():
                train_who.append(line.strip())
        with open(FUSION_TEST_X, 'rb') as f:
            test_x = pickle.load(f)
        with open(FUSION_TEST_Y, 'rb') as f:
            test_y = pickle.load(f)
        with open(FUSION_TEST_WHO, 'r') as f:
            test_who = []
            for line in f.readlines():
                test_who.append(line.strip())
        return (train_x, train_y, train_who), (test_x, test_y, test_who)

    audio_train_data = audio_model.load_training_data()
    audio_test_data = audio_model.load_test_data()
    train_ids, test_ids = commons.get_train_and_test_imbd_ids()
    movies, labelof = commons.load_movies(), commons.get_label()

    train_x, train_y, test_x, test_y = [], [], [], []
    train_who, test_who = audio_train_data[2], audio_test_data[2]

    IMAGE_SIZE = 224
    cnn_model = VGGFace(include_top=False,
                        input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3),
                        pooling='None')
    vgg19_model = scene_model.VGG_19()

    print("Generating & merging features...", datetime.now())
    for (xs, ys, whos) in [audio_train_data, audio_test_data]:
        assert (len(xs) == len(ys) and len(ys) == len(whos))
        for i in range(len(whos)):
            print("\r{}/{}".format(i, len(whos)), end="")
            imdb_id, c_id, scene = whos[i].strip().split('-')
            start_time, end_time = scene.split('~')
            c_id, start_time, end_time = int(c_id), float(start_time), float(end_time)
            audio_x, y = numpy.asarray(xs[i]).flatten(), ys[i]

            # find 2 random images belonging to (imdb_id, c_id) scene
            images = [
                x for x in os.listdir(commons.TRAIN_IMAGES_DIR)
                if x.startswith("{}-{}".format(imdb_id, c_id))
            ]
            temp_images = []
            for image in images:
                assert (image[-4:] == '.jpg')
                _, _, timestamp = image[:-4].split('-')
                timestamp = float(timestamp)
                if start_time <= timestamp and timestamp <= end_time:
                    temp_images.append(image)
            assert (len(temp_images) >= 5)  # a scene is guaranteed to have >= 5 images
            images = random.sample(temp_images, NUM_IMAGES_USED)

            image_x = []
            for image in images:
                image_filename = os.path.join(commons.TRAIN_IMAGES_DIR, image)
                image = keras.preprocessing.image.load_img(
                    image_filename, target_size=(IMAGE_SIZE, IMAGE_SIZE))

                # face features
                x = keras.preprocessing.image.img_to_array(image)
                x = numpy.expand_dims(x, axis=0)
                x = keras_vggface.utils.preprocess_input(x, version=1)
                x = cnn_model.predict(x)
                x = x.flatten()
                image_x.append(x)

                # scene features
                x = scene_model.extract_scene_feature(vgg19_model, image)
                x = x.flatten()
                image_x.append(x)

            x = numpy.concatenate(tuple(image_x + [audio_x]))
            if imdb_id in train_ids:
                train_x.append(x)
                train_y.append(y)
            elif imdb_id in test_ids:
                test_x.append(x)
                test_y.append(y)
    print("\rDone. ", datetime.now())

    with open(FUSION_TRAIN_X, 'wb') as f:
        pickle.dump(train_x, f)
    with open(FUSION_TRAIN_Y, 'wb') as f:
        pickle.dump(train_y, f)
    with open(FUSION_TRAIN_WHO, 'w') as f:
        for who in train_who:
            f.write("{}\n".format(who))
    with open(FUSION_TEST_X, 'wb') as f:
        pickle.dump(test_x, f)
    with open(FUSION_TEST_Y, 'wb') as f:
        pickle.dump(test_y, f)
    with open(FUSION_TEST_WHO, 'w') as f:
        for who in test_who:
            f.write("{}\n".format(who))
    return (train_x, train_y, train_who), (test_x, test_y, test_who)
def generate_audio_features():
    movies = commons.load_movies()
    print("Extract audio pieces...")
    # if not os.path.exists(commons.AUDIO_UNIT_DIR):
    #     os.mkdir(commons.AUDIO_UNIT_DIR)
    # if not os.path.exists(commons.AUDIO_UNIT_DONE_DIR):
    #     os.mkdir(commons.AUDIO_UNIT_DONE_DIR)

    labelof = commons.get_label()
    train_ids, test_ids = commons.get_train_and_test_imbd_ids()
    train_x, train_y, train_who, test_x, test_y, test_who = [], [], [], [], [], []

    if not os.path.exists(commons.AUDIO_BASELINE_DIR):
        os.mkdir(commons.AUDIO_BASELINE_DIR)

    all_scenes = {}  # imdb_id -> scenes
    for imdb_id in movies:
        if imdb_id not in labelof:
            continue

        temp_audio_unit_path = os.path.join(commons.AUDIO_UNIT_DIR, imdb_id)

        # 0. clean up work if exists
        if os.path.exists(os.path.join(commons.AUDIO_UNIT_DONE_DIR, imdb_id)):
            print("<{}> already completed.".format(imdb_id))
            return
        if os.path.exists(os.path.join(commons.AUDIO_UNIT_DIR, imdb_id)):
            shutil.rmtree(temp_audio_unit_path)

        # 1. create temp path and start working!
        os.mkdir(temp_audio_unit_path)
        frames = [
            x for x in os.listdir(commons.TRAIN_IMAGES_DIR)
            if x.startswith(imdb_id)
        ]

        timestampsof = defaultdict(list)  # character_id -> timestamps
        for frame in frames:
            _, character_id, timestamp = frame[:-4].split('-')
            character_id = int(character_id)
            timestamp = float(timestamp)
            timestampsof[character_id].append(timestamp)
        for character_id in timestampsof:
            timestampsof[character_id].sort()

        scenes = defaultdict(list)  # character_id -> [ scene0 = [ timestamp0 ... ] ]
        # timestamp is a float exactly what in train_image, in milisecond
        curr_scene = []
        for character_id in timestampsof:
            for timestamp in timestampsof[character_id]:
                if not is_the_same_scene(curr_scene, timestamp):
                    scenes[character_id].append(curr_scene)
                    curr_scene = []
                curr_scene.append(timestamp)
            scenes[character_id].append(curr_scene)
            curr_scene = []
        for character_id in scenes:
            scenes[character_id] = filter_scene(scenes[character_id])
        all_scenes[imdb_id] = scenes

        # 2. from character_id -> a list of scene [timestamps],
        #    extract audio file and save to train/test
        # NOTE: we need to write because we don't know how to convert scipy wavfile to
        #       format known to pyAudioAnalysis
        partial_xs = []
        rate, data = wavfile.read(
            os.path.join(commons.AUDIO_DIR, "{}.wav".format(imdb_id)))
        for c_id in scenes:
            for scene in scenes[c_id]:
                # clip audio of this scene
                start, end = scene[0], scene[-1]
                start_sec, end_sec = start / 1000, end / 1000  # convert to seconds
                start_frame, end_frame = int(start_sec * rate), int(end_sec * rate)
                x = data[start_frame:end_frame + 1]

                # get audio features of this scene
                from pyAudioAnalysis import audioFeatureExtraction
                if len(x.shape) > 1 and x.shape[1] == 2:  # stereo to mono
                    x = x.sum(axis=1) / 2
                features, _ = audioFeatureExtraction.stFeatureExtraction(
                    x, rate,
                    0.05 * rate,  # frame size
                    0.025 * (end_frame + 1 - start_frame))  # frame step

                # catch edge case: nothing in audio
                if not numpy.isfinite(features).all():
                    features = numpy.nan_to_num(features)
                    print("Catched error: <{}> has nothing in audio.".format(imdb_id))

                features = features[:, :38]  # drop extra frame data
                # add (x, y)
                label = labelof[imdb_id][c_id]
                if imdb_id in train_ids:
                    train_x.append(features)
                    train_y.append(label)
                    train_who.append("{}-{}-{}~{}".format(imdb_id, c_id, start, end))
                else:
                    test_x.append(features)
                    test_y.append(label)
                    test_who.append("{}-{}-{}~{}".format(imdb_id, c_id, start, end))
        print("<{}> Finished".format(imdb_id))

    with open(commons.AUDIO_BASELINE_TRAIN_X, 'wb') as f:
        pickle.dump(train_x, f)
    with open(commons.AUDIO_BASELINE_TRAIN_Y, 'wb') as f:
        pickle.dump(train_y, f)
    with open(commons.AUDIO_BASELINE_TRAIN_WHO, 'w') as f:
        for who in train_who:
            f.write(who + '\n')
    with open(commons.AUDIO_BASELINE_TEST_X, 'wb') as f:
        pickle.dump(test_x, f)
    with open(commons.AUDIO_BASELINE_TEST_Y, 'wb') as f:
        pickle.dump(test_y, f)
    with open(commons.AUDIO_BASELINE_TEST_WHO, 'w') as f:
        for who in test_who:
            f.write(who + '\n')
    with open(commons.SCENES, 'wb') as f:
        pickle.dump(all_scenes, f)
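# is_the_same_scene() and filter_scene() used above are helpers defined elsewhere in
# this repo. The sketches below show the behaviour generate_audio_features() assumes:
# consecutive frame timestamps (in milliseconds) are grouped into one scene while the
# gap between them stays small, and only scenes with at least 5 frames are kept
# (generate_fusion_data() asserts len(temp_images) >= 5). The 10-second gap threshold
# here is an assumption for illustration, not the repo's actual value.
def _is_the_same_scene_sketch(curr_scene, timestamp, max_gap_ms=10 * 1000):
    """True if timestamp belongs to the scene currently being built."""
    return len(curr_scene) == 0 or timestamp - curr_scene[-1] <= max_gap_ms

def _filter_scene_sketch(scenes, min_frames=5):
    """Drop scenes with too few frames to be useful."""
    return [scene for scene in scenes if len(scene) >= min_frames]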