def pickle_simple():
    """Pickle every movie as a plain dict into ``../data/movies.pkl``.

    Movie ids are selected from the join of ``stars`` and ``title`` and
    ordered by ascending production year. Each id is wrapped in a ``Movie``
    and converted with ``dict()``; the resulting list is pickled as one object.
    """
    db = IMDB()
    query = ("SELECT DISTINCT movie_id FROM stars,title "
             "WHERE movie_id= title.id ORDER BY production_year ASC")
    # presumably fetch_vec returns a flat iterable of ids -- verify against IMDB
    movie_ids = db.fetch_vec(query)

    records = []
    for movie_id in movie_ids:
        records.append(dict(Movie(db, movie_id)))

    with open('../data/movies.pkl', 'wb') as out_file:
        pck.dump(records, out_file)
def main():
    """Command-line entry point: search IMDB by title and print a result table.

    Parses ``-t/--title`` (required, one or more words) and the optional
    ``--title_type`` filter, builds the query URL, executes it, extracts the
    data from the response and finally renders the table.
    """
    title_type_choices = [
        'feature',
        'tv_movie',
        'tv_series',
        'tv_episode',
        'tv_special',
        'tv_miniseries',
        'documentary',
        'video_game',
        'short',
        'video',
        'tv_short'
    ]

    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-t', '--title',
        nargs='+',
        type=str,
        required=True,
        help='Title of the movie')
    cli.add_argument(
        '--title_type',
        type=str,
        nargs='+',
        choices=title_type_choices,
        help='The Title type of the movie')
    options = cli.parse_args()

    client = IMDB()
    if args_given(options.title_type) if False else options.title_type:
        # NOTE(review): this assigns on the IMDB class, not on the instance
        # created above -- confirm that is intended.
        IMDB.title_type = options.title_type

    query_url = client.build_query_string(options.title, options.title_type)
    client.extract_data(client.execute_query(query_url))
    client.build_table()
def create_movie_db():
    """Export every movie to ``../data/movies.csv`` and ``../data/movies.pkl``.

    Movie ids are selected from the join of ``stars`` and ``title``, ordered
    by ascending production year. For each id a ``Movie`` is built and its
    values for every key in ``Movie._keys`` are collected column-wise into a
    pandas DataFrame, which is indexed by the ``id`` column and then written
    out as both CSV and pickle.
    """
    conn = IMDB()
    ids = conn.fetch_vec("SELECT DISTINCT movie_id FROM stars,title "
                         "WHERE movie_id= title.id ORDER BY production_year ASC")

    # One list per column, filled in the same row order as ``ids``.
    table = {key: [] for key in Movie._keys}
    for mov_id in ids:
        mov = Movie(conn, mov_id)
        for key in Movie._keys:
            table[key].append(mov[key])

    # Re-select with Movie._keys to pin the column order.
    df = pd.DataFrame(table)[Movie._keys]
    # set_index returns a new frame (inplace defaults to False); the result
    # must be re-bound, otherwise the index is silently left unchanged.
    df = df.set_index('id')
    df.to_csv('../data/movies.csv')
    df.to_pickle('../data/movies.pkl')
# Remaining command-line options: title, release year and genre.
parser.add_argument("-t", "--title", type=str)
parser.add_argument("-y", "--year", type=int)
parser.add_argument(
    "-g", "--genre",
    type=str,
    choices=[
        'Film-Noir', 'History', 'Biography', 'Fantasy',
        'Thriller', 'Comedy', 'Horror', 'Musical',
        'Drama', 'Mystery', 'Western', 'Music',
        'Animation', 'Sport', 'Crime', 'War', 'Family',
        'Sci-Fi', 'Action', 'Adventure', 'Romance'
    ]
)
args = parser.parse_args()

# Search terms are assembled in the order: title, director(s), actor(s),
# year, genre.
# NOTE(review): search_terms is the same object as args.director, so the
# in-place edits below also change args.director -- confirm that is fine.
search_terms = args.director
if args.title:
    search_terms.insert(0, args.title)
search_terms += args.actor
if args.year is not None:
    search_terms.append(str(args.year))
if args.genre is not None:
    search_terms.append(args.genre)

# Nothing to search for: explain the usage and stop.
if not search_terms:
    print(
        "Please provide at least one search term (movie title, director, actor, genre, or year)"
    )
    print("Use --help for more info")
    sys.exit()

data = IMDB()
data.search(search_terms)
def __init__(self, imdb_conn=None):
    """Store the IMDB connection to use.

    :param imdb_conn: connection object to reuse; when ``None`` a new
        ``IMDB()`` is created.
    """
    if imdb_conn is None:
        imdb_conn = IMDB()
    self.imdb_conn = imdb_conn
class DataFileGenerator(object):
    """Writes a versioned raw CSV data set of movie feature vectors."""

    def __init__(self, imdb_conn=None):
        """Store the IMDB connection to use.

        :param imdb_conn: connection object to reuse; when ``None`` a new
            ``IMDB()`` is created.
        """
        self.imdb_conn = IMDB() if imdb_conn is None else imdb_conn

    def fix(self, data_version_num=None):
        """Not implemented; always raises ``NotImplementedError``."""
        raise NotImplementedError

    def generate_csv(self, movie_vectorizer, movie_generator=None, limit=None):
        """Vectorize movies and write them to ``data_raw.csv``.

        A new version directory is requested from ``DataDirControl`` (keyed
        by ``str(movie_vectorizer)``) and four files are written into it:

        * ``about.txt``    -- the database name, then a run summary
        * ``data_raw.csv`` -- header row, then one ``[id] + vector`` row per
          successfully vectorized movie
        * ``log.txt``      -- id, exception and traceback of every failure
        * ``failed.txt``   -- one id per line for every failure

        A movie whose vectorization raises is logged and skipped; it does not
        abort the run.

        :param movie_vectorizer: Instance of MovieVectorGenerator; must
            provide ``header`` and ``get_vector(movie)``
        :param movie_generator: Generator of Movies; defaults to
            ``self.imdb_conn.get_all_movies()``
        :param limit: maximum number of movies to process; ``None`` means
            no limit, zero or a negative value processes nothing
        :return: path to data_raw file
        """
        data_dir_ctrl = DataDirControl(str(movie_vectorizer))
        start_time = monotonic()
        # Paths are built by plain concatenation, as in the original.
        # presumably create_version() returns a path ending in a separator
        # -- verify against DataDirControl.
        data_dir = data_dir_ctrl.create_version()

        with open(data_dir + "about.txt", 'wb') as about_fp:
            about_fp.write("db : {}\n".format(self.imdb_conn.db))

        if movie_generator is None:
            movie_generator = self.imdb_conn.get_all_movies()

        succ_num = 0
        fail_num = 0
        total = 0

        # Buffering 0 keeps the files unbuffered so progress is visible on
        # disk while the run is in flight.
        # NOTE(review): binary modes with str payloads follow Python 2
        # conventions; a Python 3 port would need text mode here.
        with open(data_dir + "log.txt", 'wb', 0) as log_fp, \
                open(data_dir + "failed.txt", 'wb', 0) as fail_fp, \
                open(data_dir + "data_raw.csv", 'wb', 0) as data_fp:
            csv_writer = csv.writer(data_fp)
            csv_writer.writerow(['id'] + movie_vectorizer.header)

            for movie in movie_generator:
                # ``None`` means unlimited; previously it was silently
                # capped at 999999 movies.
                if limit is not None and total >= limit:
                    break
                total += 1
                try:
                    movie_vec = movie_vectorizer.get_vector(movie)
                    csv_writer.writerow([movie['id']] + movie_vec)
                    succ_num += 1
                except Exception as e:
                    # Best effort: record the failure and keep going.
                    fail_num += 1
                    log_fp.write(" {} : <{}> \n {} \n".format(
                        movie['id'], e, traceback.format_exc()))
                    fail_fp.write(str(movie['id']) + "\n")

        end_time = monotonic()

        # Append the run summary below the db line written earlier.
        with open(data_dir + "about.txt", 'ab') as res_fp:
            res_fp.writelines([
                "\n",
                "Runtime : {}\n".format(
                    timedelta(seconds=end_time - start_time)),
                "total movies : {}\n".format(total),
                "Success count : {} \n".format(succ_num),
                "Fail count : {} \n".format(fail_num)])

        # Parenthesized form prints the same text on Python 2 and 3.
        print("DONE")
        return data_dir + "data_raw.csv"