"""Download and truncate Rosstat corporate dataset."""

from boo import download, build, read_dataframe, files


def _run_unless_done(step, year, done_message):
    """Call ``step(year)``; on FileExistsError print *done_message* instead."""
    try:
        step(year)
    except FileExistsError:
        print(done_message)


print("Please be prepared: download and build operations can take long time!")

year = 2012

# Download raw file from Rosstat
_run_unless_done(download, year, "Raw file already downloaded")

# Select fewer columns and assign short column names,
# the result is saved to a new file
_run_unless_done(build, year, "Work file already created")

# Read data as dataframe and report its size
df = read_dataframe(year)
row_count, column_count = df.shape
print(year, "dataset:", row_count, "rows and", column_count, "columns")
print("File locations:", files(year))
from boo import download, read_dataframe

# Single place to change the reporting year used below.
year = 2012

# Fetch the raw file. A FileExistsError here is taken to mean that the file
# is already on disk, so the script can be re-run without crashing.
# NOTE(review): assumes download() signals an existing file with
# FileExistsError - confirm against the installed boo version.
try:
    download(year)
except FileExistsError:
    print("Raw file already downloaded")

# Load the dataset as a dataframe and show its first rows.
df = read_dataframe(year)
print(df.head())
import matplotlib.pyplot as plt
from boo import read_dataframe
import pick

# Each try/except below computes a value only when its name is not defined
# yet, so a name that already exists in the session is reused as is.
try:
    df
except NameError:
    df = read_dataframe(2017)

try:
    df0
except NameError:
    df0 = pick.filter0(df)

try:
    bs
except NameError:
    # 500 rows of df0 selected by pick.nlargest on the 'sales' column
    bs = pick.nlargest(df0, 'sales', 500)


def ab(t, n=20):
    """Return the first *n* rows of the module-level ``df`` where ``ok1 == t``."""
    return df[df.ok1 == t].head(n)


def bln(x):
    """Return ``x / 10**6`` rounded to 0 decimals as a string padded to width 5.

    NOTE(review): the name suggests a conversion to billions - confirm the
    unit of the input columns.
    """
    return str(round(x / 10**6, 0)).rjust(5)


# print as tables
for b in bs.itertuples():
    # NOTE(review): this call is cut off at the end of the visible chunk;
    # its remaining arguments are outside this view.
    print(b.inn, str(b.ok1).rjust(2), bln(b.sales), bln(b.cf_oper),
# NOTE(review): the next four statements are the tail of a function whose
# `def` line is above this chunk (presumably change_unit - confirm).
    # Columns reported by numeric_columns() are the only ones rescaled.
    cols = numeric_columns(df)
    # Work on a copy so the frame passed in is not modified.
    zf = df.copy()
    # Divide the selected columns by `divide_by`, then round to `digits`.
    zf.loc[:, cols] = zf.loc[:, cols].divide(divide_by).round(digits)
    return zf


# save as CSV and Excel
def locate(filename):
    """Return *filename* joined onto the "assets" directory name."""
    return os.path.join("assets", filename)


if __name__ == "__main__":
    must_overwrite = True
    # NOTE(review): other scripts built on this package guard download()
    # with `except FileExistsError` - confirm whether that is needed here.
    boo.download(2018)
    source_df = boo.read_dataframe(2018)
    print("Finished reading file, querying...")
    # Has some profit or loss, but not exactly zero thousand RUB,
    # (protects from ghost firms)
    ix = source_df.profit_before_tax != 0
    # Not a financial firm
    ix = ix & (~source_df.ok1.isin([64, 65]))
    # Gazprom will be on top of list
    # (rows are ordered by "ta" descending; dropna() then removes every row
    # that has at least one missing value)
    df = source_df[ix].sort_values("ta", ascending=False).dropna()
    if must_overwrite:
        print("Saving files...")
        # Rescale by 1_000_000 with 3 decimals, then keep rows whose
        # rescaled "ta" is greater than 1.
        df1 = change_unit(df, divide_by=1_000_000, digits=3).query("ta>1")
def make_df0(year):
    """Read the dataset for *year* and return it indexed by the 'inn' column.

    The value returned by ``inn_duplicates`` for the freshly read frame is
    printed as the number of duplicates before the frame is re-indexed.
    """
    frame = read_dataframe(year)
    # NOTE(review): no rows are removed in this function itself; the message
    # is accurate only if inn_duplicates() drops duplicates in place - confirm.
    duplicate_count = inn_duplicates(frame)
    message = f"Cleared {duplicate_count} duplicates from dataset. All rows unique."
    print(message)
    indexed = frame.set_index('inn')
    return indexed
return df[(df.ok1 == ok1) & (df.ok2 == ok2)] def sales_df(df): return sort_sales(df)[SMALL_SHOW] def ta_df(df): return sort_sales(df)[SMALL_SHOW] if __name__ == "__main__": try: df except NameError: df = read_dataframe(2017).set_index('inn') # 1. Показать крупнейшие компании по продажам и объему активов # ============================================================ base_df = base_report(df) sf = sort_sales(base_df)[SMALL_SHOW] af = sort_ta(base_df)[SMALL_SHOW] n = 5 print("\nКрупнейшие компании по выручке:") print(sf.head(n)) print("\nКрупнейшие компании по активам:") print(af.head(n)) # - не вычищены финановые компании, у которых большая выручка # - есть компании-призраки