from boo import download, read_dataframe

# Reporting year used for both the download and the read step.
year = 2012

# Fetch the data file for the year, load it as a dataframe and preview it.
# NOTE(review): the longer example in this file wraps download() in
# `except FileExistsError`; confirm whether a rerun of this snippet needs
# the same guard.
download(year)
df = read_dataframe(year)
print(df.head())
"""Download and truncate Rosstat corporate dataset."""
from boo import download, build, read_dataframe, files

print(
    "Please be prepared: "
    "download and build operations "
    "can take long time!"
)

year = 2012

# Preparation steps, run in order. A FileExistsError from a step is treated
# as "this step was already done": the paired message is printed and the
# script carries on.
steps = (
    # Download raw file from Rosstat
    (download, "Raw file already downloaded"),
    # Select fewer columns and assign short column names; saves to a new file
    (build, "Work file already created"),
)
for step, already_done in steps:
    try:
        step(year)
    except FileExistsError:
        print(already_done)

# Read data as dataframe and report its size and the file locations
df = read_dataframe(year)
n_rows, n_cols = df.shape
print(year, "dataset:", n_rows, "rows and", n_cols, "columns")
print("File locations:", files(year))
# convert from thousand to billion rub and round to 3 digits cols = numeric_columns(df) zf = df.copy() zf.loc[:, cols] = zf.loc[:, cols].divide(divide_by).round(digits) return zf # save as CSV and Excel def locate(filename): return os.path.join("assets", filename) if __name__ == "__main__": must_overwrite = True boo.download(2018) source_df = boo.read_dataframe(2018) print("Finished reading file, querying...") # Has some profit or loss, but not exactly zero thousand RUB, # (protects from ghost firms) ix = source_df.profit_before_tax != 0 # Not a financial firm ix = ix & (~source_df.ok1.isin([64, 65])) # Gazprom will be on top of list df = source_df[ix].sort_values("ta", ascending=False).dropna() if must_overwrite: print("Saving files...")