# Tail of a duration-only baseline evaluation (fragment; the loops that bind
# wp30, re30, duration, is_re, test_vids, test_duration, etc. begin outside this view).
# Per record: the true target is wp30 or re30, selected by indexing with is_re, and
# the guess is the constant 0.5. When is_re is falsy the guess is first passed through
# to_watch_percentage(engagement_map, duration, ..., lookup_keys=split_keys).
# Afterwards: prints MAE and R2 of the guesses against the true values, stops the
# timer, and writes vid-keyed dicts of true values and guesses as pickles under
# output_dir ('wp'/'re' prefix chosen by is_re). test_duration.p is only written
# when that file does not exist yet.
# NOTE(review): the original indentation was lost, so the exact nesting (e.g. whether
# the test_duration.p check sits inside `if to_write:`) cannot be confirmed from here.
target = float([wp30, re30][is_re]) true_engagement.append(target) random_guess = 0.5 if is_re: guess_engagement.append(random_guess) else: guess_engagement.append(to_watch_percentage(engagement_map, duration, random_guess, lookup_keys=split_keys)) print('>>> Predict {0} on duration...'.format(['watch percentage', 'relative engagement'][is_re])) print('>>> MAE on test set: {0:.4f}'.format(mean_absolute_error(true_engagement, guess_engagement))) print('>>> R2 on test set: {0:.4f}'.format(r2_score(true_engagement, guess_engagement))) print('=' * 79) timer.stop() # write to pickle file to_write = True true_result_dict = {vid: true for vid, true in zip(test_vids, true_engagement)} predict_result_dict = {vid: pred for vid, pred in zip(test_vids, guess_engagement)} test_duration_dict = {vid: duration for vid, duration in zip(test_vids, test_duration)} if to_write: print('>>> Prepare to write to pickle file...') print('>>> Number of videos in final test result dict: {0}'.format(len(test_vids))) write_dict_to_pickle(dict=true_result_dict, path=os.path.join(output_dir, '{0}_true_predictor.p'.format(['wp', 're'][is_re]))) write_dict_to_pickle(dict=predict_result_dict, path=os.path.join(output_dir, '{0}_duration_predictor.p'.format(['wp', 're'][is_re]))) if not os.path.exists(os.path.join(output_dir, 'test_duration.p')): write_dict_to_pickle(dict=test_duration_dict, path=os.path.join(output_dir, 'test_duration.p'))
# Load every training file found (recursively) under train_loc.
print('>>> Start to load training dataset...')
train_matrix = []
train_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk(train_loc)
    for name in names
)
for path in train_paths:
    train_matrix.extend(_load_data(path))
train_matrix = np.array(train_matrix)

# Load every test file found (recursively) under test_loc.
print('>>> Start to load test dataset...')
test_matrix = []
test_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk(test_loc)
    for name in names
)
for path in test_paths:
    test_matrix.extend(_load_data(path))
test_matrix = np.array(test_matrix)
print('>>> Finish loading all data!\n')

# Predict the test targets with the customized ridge regressor, which
# vectorizes the sparse rows through the two callbacks supplied here.
regressor = RidgeRegressor(train_matrix, test_matrix)
test_yhat, test_vids = regressor.predict_from_sparse(vectorize_train_data,
                                                     vectorize_test_data)

# Report the running time; the trailing three characters of the message
# are cut off before printing.
elapsed = datetime.timedelta(seconds=time.time() - start_time)
running_time_msg = '\n>>> Total running time: {0}'.format(str(elapsed))
print(running_time_msg[:-3])

# Persist the {vid: prediction} mapping as a pickle file.
to_write = True
predict_result_dict = dict(zip(test_vids, test_yhat))
if to_write:
    print('>>> Prepare to write to pickle file...')
    n_results = len(predict_result_dict)
    print('>>> Number of videos in final test result dict: {0}'.format(n_results))
    write_dict_to_pickle(dict=predict_result_dict,
                         path='./output/sparse_topic_predictor.p')
# Collect the test rows from every file found (recursively) under test_loc;
# the rows are kept as a plain list and handed to the regressor as-is.
test_matrix = []
test_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk(test_loc)
    for name in names
)
for path in test_paths:
    test_matrix.extend(_load_data(path, is_re))
print('>>> Finish loading all data!')

# Predict the test targets with the customized ridge regressor, which
# vectorizes the sparse rows through the two callbacks supplied here.
regressor = RidgeRegressor(train_matrix, test_matrix)
test_yhat, test_vids = regressor.predict_from_sparse(vectorize_train_data,
                                                     vectorize_test_data)

timer.stop()

# Persist the {vid: prediction} mapping as a pickle file whose name is
# prefixed with 'wp' or 're', selected by is_re.
to_write = True
predict_result_dict = dict(zip(test_vids, test_yhat))
if to_write:
    print('>>> Prepare to write to pickle file...')
    n_results = len(predict_result_dict)
    print('>>> Number of videos in final test result dict: {0}'.format(n_results))
    target_prefix = ('wp', 're')[is_re]
    output_name = '{0}_sparse_context_topic_predictor.p'.format(target_prefix)
    write_dict_to_pickle(dict=predict_result_dict,
                         path=os.path.join(output_dir, output_name))
# Tail of a duration-only watch-percentage baseline (fragment; the loop that binds
# subdir/files and the setup of test_vids, true_wp, guess_wp, engagement_map,
# lookup_durations and start_time begin outside this view).
# Per file: skips the header line, then splits each row on tabs into
# vid, <ignored>, duration and a remainder; field index 7 of the remainder is read
# as wp30 (the true value). The guess is to_watch_percentage(engagement_map,
# duration, 0.5, lookup_keys=lookup_durations).
# Afterwards: prints MAE and R2 of the guesses against the true values, prints the
# elapsed time since start_time, and writes vid-keyed dicts of true values and
# guesses to ./output/true_predictor.p and ./output/duration_predictor.p.
# NOTE(review): the original indentation was lost; the nesting described above is the
# most plausible reading and should be confirmed against the source file.
for f in files: with open(os.path.join(subdir, f), 'r') as fin: # read header fin.readline() for line in fin: vid, _, duration, dump = line.rstrip().split('\t', 3) test_vids.append(vid) duration = int(duration) wp30 = float(dump.split('\t')[7]) true_wp.append(wp30) random_guess = 0.5 guess_wp.append(to_watch_percentage(engagement_map, duration, random_guess, lookup_keys=lookup_durations)) print('>>> Predict watch percentage on duration...') print('>>> MAE on test set: {0:.4f}'.format(mean_absolute_error(true_wp, guess_wp))) print('>>> R2 on test set: {0:.4f}'.format(r2_score(true_wp, guess_wp))) print('=' * 79) # get running time print('\n>>> Total running time: {0}'.format(str(datetime.timedelta(seconds=time.time() - start_time)))[:-3]) # write to pickle file to_write = True true_result_dict = {vid: true for vid, true in zip(test_vids, true_wp)} predict_result_dict = {vid: pred for vid, pred in zip(test_vids, guess_wp)} if to_write: print('>>> Prepare to write to pickle file...') print('>>> Number of videos in final test result dict: {0}'.format(len(test_vids))) write_dict_to_pickle(dict=true_result_dict, path='./output/true_predictor.p') write_dict_to_pickle(dict=predict_result_dict, path='./output/duration_predictor.p')
# Tail of a predictor script that builds one-hot feature rows (fragment; the loops
# that bind row, vid, detect_lang, topics, re30, k and the train matrix begin outside this view).
# Row layout visible here: language flags start at offset 3 + category_cnt, topic flags
# at offset 3 + category_cnt + lang_cnt; the last cell holds float(re30) as the target.
# Empty or 'NA' topics set the topic_dict['NA'] flag; otherwise topics is split on ','
# and each topic found in topic_dict sets its own flag.
# NOTE(review): the indentation was lost, so it is unclear whether the second `else:`
# (which also sets the 'NA' flag) pairs with `if topic in topic_dict` or with the
# `for` loop -- confirm against the source file.
# Afterwards: converts test_matrix to a numpy array, predicts with
# RidgeRegressor(train_matrix, test_matrix, verbose=False).predict(), merges the
# {vid: prediction} pairs into predict_result_dict, prints the elapsed time since
# start_time, and writes the dict to ./output/csp_predictor_<k>.p.
if detect_lang in lang_dict: row[3+category_cnt + lang_dict[detect_lang]] = 1 if topics == '' or topics == 'NA': row[3 + category_cnt + lang_cnt + topic_dict['NA']] = 1 else: topics = topics.split(',') for topic in topics: if topic in topic_dict: row[3 + category_cnt + lang_cnt + topic_dict[topic]] = 1 else: row[3 + category_cnt + lang_cnt + topic_dict['NA']] = 1 row[-1] = float(re30) test_matrix.append(row) test_vids.append(vid) test_matrix = np.array(test_matrix) # predict test data from customized ridge regressor test_yhat = RidgeRegressor(train_matrix, test_matrix, verbose=False).predict() predict_result_dict.update({vid: pred for vid, pred in zip(test_vids, test_yhat)}) # get running time print('\n>>> Total running time: {0}'.format(str(datetime.timedelta(seconds=time.time() - start_time)))[:-3]) # write to pickle file to_write = True if to_write: print('>>> Prepare to write to pickle file...') print('>>> Number of videos in final test result dict: {0}'.format(len(predict_result_dict))) write_dict_to_pickle(dict=predict_result_dict, path='./output/csp_predictor_{0}.p'.format(k))
# Tail of a predictor script that builds one-hot feature rows (fragment; the loops
# that bind row, vid, category, detect_lang, topics, wp30, re30, is_re, k and the
# train matrix begin outside this view).
# Row layout visible here: cell 1 is set to 1, category flags start at offset 2,
# language flags at 2 + category_cnt, topic flags at 2 + category_cnt + lang_cnt;
# the last cell holds the target, wp30 or re30 selected by indexing with is_re.
# Topics are split on ',' unless empty or 'NA'; only topics present in topic_dict set a flag.
# Afterwards: converts test_matrix to a numpy array, predicts with
# RidgeRegressor(train_matrix, test_matrix, verbose=False).predict(), merges the
# {vid: prediction} pairs into predict_result_dict, stops the timer, and writes the
# dict to <output_dir>/<wp|re>_csp_predictor_<k>.p.
# NOTE(review): the original indentation was lost; the nesting described above is the
# most plausible reading and should be confirmed against the source file.
row[1] = 1 if category in category_dict: row[2 + category_dict[category]] = 1 if detect_lang in lang_dict: row[2 + category_cnt + lang_dict[detect_lang]] = 1 if not (topics == '' or topics == 'NA'): topics = topics.split(',') for topic in topics: if topic in topic_dict: row[2 + category_cnt + lang_cnt + topic_dict[topic]] = 1 target = [wp30, re30][is_re] row[-1] = float(target) test_matrix.append(row) test_vids.append(vid) test_matrix = np.array(test_matrix) # predict test data from customized ridge regressor test_yhat = RidgeRegressor(train_matrix, test_matrix, verbose=False).predict() predict_result_dict.update({vid: pred for vid, pred in zip(test_vids, test_yhat)}) timer.stop() # write to pickle file to_write = True if to_write: print('>>> Prepare to write to pickle file...') print('>>> Number of videos in final test result dict: {0}'.format(len(predict_result_dict))) write_dict_to_pickle(dict=predict_result_dict, path=os.path.join(output_dir, '{0}_csp_predictor_{1}.p'.format(['wp', 're'][is_re], k)))
# Append the feature rows (first element of what _load_data returns) of every
# training file found (recursively) under train_loc.
train_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk(train_loc)
    for name in names
)
for path in train_paths:
    train_matrix.extend(_load_data(path)[0])
train_matrix = np.array(train_matrix)

# Load the test rows together with their video ids, file by file.
print('>>> Start to load test dataset...')
test_matrix = []
test_vids = []
test_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk(test_loc)
    for name in names
)
for path in test_paths:
    file_rows, file_vids = _load_data(path)
    test_matrix.extend(file_rows)
    test_vids.extend(file_vids)
test_matrix = np.array(test_matrix)
print('>>> Finish loading all data!\n')

# Predict the test targets with the customized ridge regressor.
regressor = RidgeRegressor(train_matrix, test_matrix)
test_yhat = regressor.predict()

# Report the running time; the trailing three characters of the message
# are cut off before printing.
elapsed = datetime.timedelta(seconds=time.time() - start_time)
running_time_msg = '\n>>> Total running time: {0}'.format(str(elapsed))
print(running_time_msg[:-3])

# Persist the {vid: prediction} mapping as a pickle file.
to_write = True
predict_result_dict = dict(zip(test_vids, test_yhat))
if to_write:
    print('>>> Prepare to write to pickle file...')
    n_results = len(predict_result_dict)
    print('>>> Number of videos in final test result dict: {0}'.format(n_results))
    write_dict_to_pickle(dict=predict_result_dict,
                         path='./output/content_predictor.p')
# Load the test rows together with their video ids from every file found
# (recursively) under test_loc.
test_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk(test_loc)
    for name in names
)
for path in test_paths:
    file_rows, file_vids = _load_data(path, is_re)
    test_matrix.extend(file_rows)
    test_vids.extend(file_vids)
test_matrix = np.array(test_matrix)
print('>>> Finish loading all data!')

# Predict the test targets with the customized ridge regressor
# (show_params=True is forwarded to predict()).
regressor = RidgeRegressor(train_matrix, test_matrix)
test_yhat = regressor.predict(show_params=True)

timer.stop()

# Persist the {vid: prediction} mapping as a pickle file whose name is
# prefixed with 'wp' or 're', selected by is_re.
to_write = True
predict_result_dict = dict(zip(test_vids, test_yhat))
if to_write:
    print('>>> Prepare to write to pickle file...')
    n_results = len(predict_result_dict)
    print('>>> Number of videos in final test result dict: {0}'.format(n_results))
    target_prefix = ('wp', 're')[is_re]
    output_name = '{0}_reputation_predictor.p'.format(target_prefix)
    write_dict_to_pickle(dict=predict_result_dict,
                         path=os.path.join(output_dir, output_name))