def unite_imdb_profiles(verbose):
    """Unite all movie profiles in the IMDB profile directory."""
    if verbose:
        print("Uniting IMDB movie profiles to one csv file...")
    if not os.path.exists(_IMDB_DIR_PATH):
        print("No IMDB profiles to unite!")
        return
    # read each JSON profile into a dict
    profiles = []
    profile_files = os.listdir(_IMDB_DIR_PATH)
    if verbose:
        profile_files = tqdm(profile_files)
    for profile_file in profile_files:
        if verbose:
            profile_files.set_description('Reading {}'.format(profile_file))
        file_path = os.path.join(_IMDB_DIR_PATH, profile_file)
        _, ext = os.path.splitext(file_path)
        if ext == '.json':
            with open(file_path, 'r') as json_file:
                profiles.append(json.load(json_file))
    # flatten dict-valued and list-valued columns, then write the csv
    df = pd.DataFrame(profiles)
    df = _decompose_dict_column(df, 'avg_rating_per_demo', _DEMOGRAPHICS)
    df = _decompose_dict_column(df, 'votes_per_demo', _DEMOGRAPHICS)
    df = _decompose_dict_column(
        df, 'rating_freq', [str(i) for i in range(1, 11)])
    df = _dummy_list_column(df, 'genres')
    unison_fpath = os.path.join(_get_dataset_dir_path(), 'imdb_dataset.csv')
    df.to_csv(unison_fpath, index=False)
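# _decompose_dict_column and _dummy_list_column are used above but not shown
# in this excerpt. The implementations below are a minimal sketch inferred
# only from how they are called (each takes a DataFrame and returns a new
# one); treat them as assumptions, not the project's actual code.

def _decompose_dict_column(df, colname, allowed_keys):
    """Replace a dict-valued column with one scalar column per allowed key."""
    for key in allowed_keys:
        # pandas map() runs eagerly here, so key is bound per iteration
        df['{}.{}'.format(colname, key)] = df[colname].map(
            lambda mapping: mapping.get(key)
            if isinstance(mapping, dict) else None)
    return df.drop(colname, axis=1)


def _dummy_list_column(df, colname):
    """Replace a list-valued column with one boolean dummy column per value."""
    all_values = set(val for lst in df[colname].dropna() for val in lst)
    for value in sorted(all_values):
        df['{}.{}'.format(colname, value)] = df[colname].map(
            lambda lst: value in lst if isinstance(lst, list) else False)
    return df.drop(colname, axis=1)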
def build_united_profiles(verbose):
    """Build movie profiles with data from all resources."""
    os.makedirs(_UNITED_DIR_PATH, exist_ok=True)
    prof_names = sorted(_prof_names_in_all_resources())
    if verbose:
        print("Building movie profiles with data from all resources.")
        prof_names = tqdm(prof_names)
    for prof_name in prof_names:
        file_name = prof_name + '.json'
        imdb_prof_path = os.path.join(_IMDB_DIR_PATH, file_name)
        with open(imdb_prof_path, 'r') as imdb_prof_file:
            imdb_prof = json.load(imdb_prof_file)
        meta_prof_path = os.path.join(_METACRITIC_DIR_PATH, file_name)
        with open(meta_prof_path, 'r') as meta_prof_file:
            meta_prof = json.load(meta_prof_file)
        # Metacritic values win on key collisions, being unpacked last
        united_prof = {**imdb_prof, **meta_prof}
        united_prof_fpath = os.path.join(_UNITED_DIR_PATH, file_name)
        with open(united_prof_fpath, 'w+') as united_prof_file:
            json.dump(united_prof, united_prof_file, indent=2, sort_keys=True)
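# _prof_names_in_all_resources is referenced above but not defined in this
# excerpt. A minimal sketch, assuming it returns the profile names (file
# basenames) present in both the IMDB and Metacritic profile directories:

def _prof_names_in_all_resources():
    """Return names of profiles present in every resource directory."""
    def _names_in_dir(dir_path):
        return set(
            os.path.splitext(fname)[0]
            for fname in os.listdir(dir_path)
            if fname.endswith('.json'))
    return _names_in_dir(_IMDB_DIR_PATH) & _names_in_dir(_METACRITIC_DIR_PATH)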
def test_load_unsupported_type(self):
    """Testing load of unsupported types."""
    expected = {
        "name": "Kevin",
        "age": 21,
        "pet": {
            "name": "Trippy Jack",
            "age": 20762,
            "__type__": "hyperdimensional.hamster"
        }
    }
    # an unrecognized __type__ marker should be left as a plain dict
    with open('tests/unsupported_type.json', 'r') as json_file:
        self.assertEqual(expected, morejson.load(json_file))
def test_load_bad_datetime_arg(self):
    """Testing load of datetime-tagged dicts with bad arguments."""
    expected = {
        "release_day": 2,
        "closing_date": {
            "bad_arg": 12,
            "month": 10,
            "year": 2013,
            "day": 18,
            "__type__": "datetime.date"
        }
    }
    # a datetime.date tag with an unexpected argument should be left undecoded
    with open('tests/bad_datetime_arg.json', 'r') as json_file:
        self.assertEqual(expected, morejson.load(json_file))
def test_dump_monkey(self):
    """Testing dump and load of a custom type via encoder and object hook."""
    try:
        _build_test_dirs()
        johnny = TestDump._Monkey("Johnny", 54)
        dicti = {"my_pet": johnny}
        with open(_TEST_FILE, 'w+') as fileobj:
            morejson.dump(
                dicti, fileobj, default=TestDump._monkey_default_encoder)
        with open(_TEST_FILE, 'r') as fileobj:
            res = morejson.load(
                fileobj, object_hook=TestDump._monkey_object_hook)
        self.assertEqual(dicti, res)
    finally:
        _dismantle_test_dirs()
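# _Monkey, _monkey_default_encoder and _monkey_object_hook are attributes of
# TestDump in the test above but are not shown in this excerpt. A minimal
# sketch of the usual pattern (the implementations here are assumptions): the
# default encoder maps an unsupported object to a tagged dict, and the object
# hook reverses it, passing every other dict through unchanged.

class _Monkey(object):
    def __init__(self, name, age):
        self.name = name
        self.age = age

    def __eq__(self, other):
        return (isinstance(other, _Monkey) and
                self.name == other.name and self.age == other.age)


def _monkey_default_encoder(obj):
    """Called by json for objects it cannot encode natively."""
    if isinstance(obj, _Monkey):
        return {'__type__': 'Monkey', 'name': obj.name, 'age': obj.age}
    raise TypeError("Object of type {} is not JSON serializable".format(
        type(obj).__name__))


def _monkey_object_hook(dicti):
    """Called by json for every decoded dict; rebuilds tagged Monkeys."""
    if dicti.get('__type__') == 'Monkey':
        return _Monkey(dicti['name'], dicti['age'])
    return dicti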
def test_dumps_date(self):
    """Testing dump and load of date types."""
    try:
        _build_test_dirs()
        dicti = {
            'date': datetime.date.today(),
            'array': [1, 2, 3],
            'string': 'trololo',
            'int': 1,
            'float': 4.32,
            'true': True,
            'false': False,
            'null': None
        }
        with open(_TEST_FILE, 'w+') as fileobj:
            morejson.dump(dicti, fileobj)
        with open(_TEST_FILE, 'r') as fileobj:
            self.assertEqual(dicti, morejson.load(fileobj))
    finally:
        _dismantle_test_dirs()
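# _TEST_FILE, _build_test_dirs and _dismantle_test_dirs are shared fixtures
# used throughout these tests but not shown in this excerpt. A minimal sketch
# of what they might look like (the paths and bodies are assumptions):

import os
import shutil

_TEST_DIR = os.path.join(os.getcwd(), 'morejson_test_dir')
_TEST_FILE = os.path.join(_TEST_DIR, 'test.json')


def _build_test_dirs():
    """Create the directory the dump/load tests write into."""
    os.makedirs(_TEST_DIR, exist_ok=True)


def _dismantle_test_dirs():
    """Remove the test directory and anything written into it."""
    shutil.rmtree(_TEST_DIR, ignore_errors=True)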
def test_dumps_complex(self):
    """Testing dump and load of complex types."""
    try:
        _build_test_dirs()
        dicti = {
            'complex1': complex(1, 34.2),
            'complex2': complex(-98.213, 91823),
            'array': [1, 2, 3],
            'string': 'trololo',
            'int': 1,
            'float': 4.32,
            'true': True,
            'false': False,
            'null': None
        }
        with open(_TEST_FILE, 'w+') as fileobj:
            morejson.dump(dicti, fileobj)
        with open(_TEST_FILE, 'r') as fileobj:
            self.assertEqual(dicti, morejson.load(fileobj))
    finally:
        _dismantle_test_dirs()
def test_dumps_timedelta(self):
    """Testing dump and load of timedelta types."""
    try:
        _build_test_dirs()
        dicti = {
            'timedelta1': datetime.timedelta(days=392),
            'timedelta2': datetime.timedelta(weeks=2, hours=23),
            'timedelta3': datetime.timedelta(microseconds=27836),
            'array': [1, 2, 3],
            'string': 'trololo',
            'int': 1,
            'float': 4.32,
            'true': True,
            'false': False,
            'null': None
        }
        with open(_TEST_FILE, 'w+') as fileobj:
            morejson.dump(dicti, fileobj)
        with open(_TEST_FILE, 'r') as fileobj:
            self.assertEqual(dicti, morejson.load(fileobj))
    finally:
        _dismantle_test_dirs()
def test_dumps_datetime_with_fold(self):
    """Testing dump and load of datetime types with a fold value."""
    if sys.version_info < (3, 6):
        return  # the fold attribute was only added in Python 3.6 (PEP 495)
    try:
        _build_test_dirs()
        dt = datetime.datetime(year=2012, month=10, day=10, fold=1)
        dicti = {
            'datetime': dt,
            'array': [1, 2, 3],
            'string': 'trololo',
            'int': 1,
            'float': 4.32,
            'true': True,
            'false': False,
            'null': None
        }
        with open(_TEST_FILE, 'w+') as fileobj:
            morejson.dump(dicti, fileobj)
        with open(_TEST_FILE, 'r') as fileobj:
            self.assertEqual(dicti, morejson.load(fileobj))
    finally:
        _dismantle_test_dirs()
def build_csv(verbose):
    """Build movie dataset from united profiles."""
    # build profiles array
    profiles = []
    profile_files = os.listdir(_UNITED_DIR_PATH)
    if verbose:
        profile_files = tqdm(profile_files)
    for profile_file in profile_files:
        if verbose:
            profile_files.set_description('Reading {}'.format(profile_file))
        file_path = os.path.join(_UNITED_DIR_PATH, profile_file)
        _, ext = os.path.splitext(file_path)
        if ext == '.json':
            with open(file_path, 'r') as json_file:
                profiles.append(json.load(json_file))

    # flatten some dict or array columns
    df = pd.DataFrame(profiles)
    df = df[df['opening_weekend_date'].notnull()]
    df = holcrawl.imdb_crawl._decompose_dict_column(
        df, 'avg_rating_per_demo', _DEMOGRAPHICS)
    df = holcrawl.imdb_crawl._decompose_dict_column(
        df, 'votes_per_demo', _DEMOGRAPHICS)
    df = holcrawl.imdb_crawl._decompose_dict_column(
        df, 'rating_freq', [str(i) for i in range(1, 11)])
    df = holcrawl.imdb_crawl._dummy_list_column(df, 'genres')

    # review-based features, overall and restricted to before opening weekend
    df['num_mc_critic'] = df.apply(
        lambda row: len(row['mc_pro_critic_reviews']), axis=1)
    df['avg_mc_critic'] = df.apply(
        _avg_review_generator('mc_pro_critic_reviews'), axis=1)
    df['num_mc_critic_by_opening'] = df.apply(
        _num_reviews_by_opening_generator('mc_pro_critic_reviews'), axis=1)
    df['avg_mc_critic_by_opening'] = df.apply(
        _avg_review_by_opening_generator('mc_pro_critic_reviews'), axis=1)
    df['num_mc_user'] = df.apply(
        lambda row: len(row['mc_user_reviews']), axis=1)
    df['avg_mc_user'] = df.apply(
        _avg_review_generator('mc_user_reviews'), axis=1)
    df['num_mc_user_by_opening'] = df.apply(
        _num_reviews_by_opening_generator('mc_user_reviews'), axis=1)
    df['avg_mc_user_by_opening'] = df.apply(
        _avg_review_by_opening_generator('mc_user_reviews'), axis=1)
    df['num_imdb_user'] = df.apply(
        lambda row: len(row['imdb_user_reviews']), axis=1)
    df['avg_imdb_user'] = df.apply(
        _avg_review_generator('imdb_user_reviews'), axis=1)
    df['num_imdb_user_by_opening'] = df.apply(
        _num_reviews_by_opening_generator('imdb_user_reviews'), axis=1)
    df['avg_imdb_user_by_opening'] = df.apply(
        _avg_review_by_opening_generator('imdb_user_reviews'), axis=1)

    # calendar features derived from the opening weekend date
    df['opening_month'] = df['opening_weekend_date'].map(
        lambda opendate: opendate.month)
    df['opening_day'] = df['opening_weekend_date'].map(
        lambda opendate: opendate.day)
    df['opening_day_of_year'] = df['opening_weekend_date'].map(
        lambda opendate: opendate.timetuple().tm_yday)

    # save to file
    dataset_dir = holcrawl.shared._get_dataset_dir_path()
    os.makedirs(dataset_dir, exist_ok=True)
    csv_fpath = os.path.join(dataset_dir, 'movies_dataset.csv')
    df.to_csv(csv_fpath, index=False)
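# _avg_review_generator, _num_reviews_by_opening_generator and
# _avg_review_by_opening_generator are used above but defined elsewhere in the
# module. A minimal sketch of what they might look like, assuming each review
# is a dict with 'score' and 'review_date' keys (that schema is an assumption):

def _avg_review_generator(review_col):
    """Return a row function computing the mean score of a review column."""
    def _avg(row):
        reviews = row[review_col]
        if not reviews:
            return None
        return sum(rev['score'] for rev in reviews) / len(reviews)
    return _avg


def _num_reviews_by_opening_generator(review_col):
    """Return a row function counting reviews dated before opening weekend."""
    def _num(row):
        return len([
            rev for rev in row[review_col]
            if rev['review_date'] <= row['opening_weekend_date']])
    return _num


def _avg_review_by_opening_generator(review_col):
    """Return a row function averaging scores of pre-opening reviews."""
    def _avg(row):
        scores = [
            rev['score'] for rev in row[review_col]
            if rev['review_date'] <= row['opening_weekend_date']]
        if not scores:
            return None
        return sum(scores) / len(scores)
    return _avg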