def load_basics():
    """Load the ``title.basics`` table from the data directory.

    The file is read as a gzip-compressed, tab-separated table.
    ``low_memory=False`` turns off pandas' chunked parsing for this file.

    Returns:
        The parsed table, as returned by ``pandas.read_csv``.
    """
    basics_path = directories.data("title.basics.tsv.gz")
    table = pd.read_csv(
        basics_path,
        sep="\t",
        compression="gzip",
        low_memory=False,
    )
    return table
def run():
    """Fetch the IMDb dataset files, build the professional graph, and pickle it.

    Each dataset is retrieved via ``retrieve_imdb_data``; the graph produced by
    ``make_professional_graph`` is then written to ``professional.pkl`` in the
    data directory.
    """
    print("Downloading data")
    # Same files, same order as they are needed by the graph builder's inputs.
    for dataset_name in (
        "name.basics.tsv.gz",
        "title.basics.tsv.gz",
        "title.ratings.tsv.gz",
        "title.principals.tsv.gz",
    ):
        retrieve_imdb_data(dataset_name)

    print("Creating graph")
    graph = make_professional_graph()

    with open(directories.data("professional.pkl"), "wb") as sink:
        pickle.dump(graph, sink)
def retrieve_imdb_data(filename):
    """Download one dataset file into the data directory unless already present.

    The file is fetched from ``https://datasets.imdbws.com/<filename>`` and
    streamed to disk in small blocks while a progress bar is updated. The
    payload is first written to a sibling ``.part`` file and only moved to its
    final path once the transfer has finished, so a failed or interrupted
    download is never picked up by the "already exists" check on a later run.

    Args:
        filename: Name of the dataset file, e.g. ``"title.basics.tsv.gz"``.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.RequestException: On connection problems or a timeout.
    """
    path = directories.data(filename)
    print(path)
    if os.path.exists(path):
        print("{} already exists".format(path))
        return

    url = "https://datasets.imdbws.com/{}".format(filename)
    print("Down loading {}".format(url))

    block_size = 1024
    partial_path = "{}.part".format(path)
    try:
        # The timeout bounds connection setup and each socket read (not the
        # whole transfer), so a stalled connection cannot hang forever.
        with requests.get(url, stream=True, timeout=60) as response:
            # Without this, an HTTP error page would be saved as the dataset.
            response.raise_for_status()
            total_size_in_bytes = int(response.headers.get("content-length", 0))
            with open(partial_path, "wb") as f:
                with tqdm(
                    total=total_size_in_bytes, unit="iB", unit_scale=True
                ) as progress_bar:
                    for data in response.iter_content(block_size):
                        progress_bar.update(len(data))
                        f.write(data)
    except BaseException:
        # Remove the incomplete file (also on Ctrl-C) and let the error surface.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

    # Publish the finished download under its real name in a single step.
    os.replace(partial_path, path)
def load_principals():
    """Load the ``title.principals`` table from the data directory.

    The file is read as a gzip-compressed, tab-separated table.

    Returns:
        The parsed table, as returned by ``pandas.read_csv``.
    """
    principals_path = directories.data("title.principals.tsv.gz")
    return pd.read_csv(principals_path, sep="\t", compression="gzip")
def load_ratings():
    """Load the ``title.ratings`` table from the data directory.

    The file is read as a gzip-compressed, tab-separated table.

    Returns:
        The parsed table, as returned by ``pandas.read_csv``.
    """
    ratings_path = directories.data("title.ratings.tsv.gz")
    return pd.read_csv(ratings_path, sep="\t", compression="gzip")
def load_names():
    """Load the ``name.basics`` table from the data directory.

    The file is read as a gzip-compressed, tab-separated table.

    Returns:
        The parsed table, as returned by ``pandas.read_csv``.
    """
    names_path = directories.data("name.basics.tsv.gz")
    return pd.read_csv(names_path, sep="\t", compression="gzip")