# Main Function if __name__ == '__main__': # Get set of all movies all_movies = list(load_all_movies("plot.list.gz")) random.shuffle(all_movies) years, plots, titles = [], [], [] for movie in all_movies: years.append(movie['year']) plots.append(movie['summary']) titles.append(movie['title']) min_year, max_year, bin_num = P2.year_stats(years) # Get uniform subset of movies years_train, plots_train, titles_train = [], [], [] years_test, plots_test, titles_test = [], [], [] year_count_train = [0]*bin_num year_count_test = [0]*bin_num train_sample_size = 5000 test_sample_size = 1000 # Create uniformly distributed training and test sets for i, year in enumerate(years): bin = int((year - min_year)/10) if year_count_train[bin] < train_sample_size: year_count_train[bin] += 1 years_train.append(year)