コード例 #1
0
ファイル: example_start.py プロジェクト: Witold1/sandbox
"""Download and truncate Rosstat corporate dataset."""

from boo import download, build, read_dataframe, files

print("Please be prepared: download and build operations "
      "can take long time!")

year = 2012

# Two preparation steps, run in order:
#   1. download - fetch the raw file from Rosstat;
#   2. build    - select fewer columns, assign short column names and
#                 save the result to a new file.
# A step that raises FileExistsError is reported and otherwise ignored.
for step, already_done in (
    (download, "Raw file already downloaded"),
    (build, "Work file already created"),
):
    try:
        step(year)
    except FileExistsError:
        print(already_done)

# Read data as dataframe
df = read_dataframe(year)

n_rows, n_cols = df.shape
print(year, "dataset:", n_rows, "rows and", n_cols, "columns")
print("File locations:", files(year))
コード例 #2
0
ファイル: example.py プロジェクト: vishalbelsare/boo
from boo import download, read_dataframe

# Download the dataset for one year, load it and show the first rows.
year = 2012
download(year)
df = read_dataframe(year)
print(df.head())
コード例 #3
0
ファイル: play.py プロジェクト: Witold1/sandbox
import matplotlib.pyplot as plt
from boo import read_dataframe
import pick

# Each object below is created only when its name is not already bound
# in the module namespace, so existing values are left untouched.
if "df" not in globals():
    df = read_dataframe(2017)

if "df0" not in globals():
    df0 = pick.filter0(df)

if "bs" not in globals():
    bs = pick.nlargest(df0, 'sales', 500)


def ab(t, n=20):
    """Return the first *n* rows of the global ``df`` whose ``ok1`` equals *t*."""
    matches = df.ok1 == t
    selected = df[matches]
    return selected.head(n)


def bln(x):
    """Format *x* scaled down by 10**6 as a right-aligned string.

    The value is divided by one million, rounded to zero decimals and
    converted with ``str``; the text is then padded on the left with
    spaces to a minimum width of 5 (longer text is not truncated).
    """
    # NOTE(review): the name suggests "billions" - presumably x is in
    # thousands; confirm the unit against the data source.
    scaled = round(x / 10**6, 0)
    return format(str(scaled), ">5")


# print as tables
for b in bs.itertuples():
    print(b.inn, str(b.ok1).rjust(2), bln(b.sales), bln(b.cf_oper),
コード例 #4
0
    cols = numeric_columns(df)
    zf = df.copy()
    zf.loc[:, cols] = zf.loc[:, cols].divide(divide_by).round(digits)
    return zf


# save as CSV and Excel
def locate(filename):
    """Return the path of *filename* inside the ``assets`` directory."""
    assets_dir = "assets"
    return os.path.join(assets_dir, filename)


if __name__ == "__main__":
    must_overwrite = True

    boo.download(2018)
    source_df = boo.read_dataframe(2018)
    print("Finished reading file, querying...")

    # Has some profit or loss, but not exactly zero thousand RUB,
    # (protects from ghost firms)
    ix = source_df.profit_before_tax != 0

    # Not a financial firm
    ix = ix & (~source_df.ok1.isin([64, 65]))

    # Gazprom will be on top of list
    df = source_df[ix].sort_values("ta", ascending=False).dropna()

    if must_overwrite:
        print("Saving files...")
        df1 = change_unit(df, divide_by=1_000_000, digits=3).query("ta>1")
コード例 #5
0
ファイル: pick.py プロジェクト: Witold1/sandbox
def make_df0(year):
    """Read the dataset for *year* and return it indexed by ``inn``.

    Also prints the value returned by ``inn_duplicates`` for the data.
    """
    raw = read_dataframe(year)
    # NOTE(review): the message says duplicates were "cleared"; whether
    # inn_duplicates actually removes rows is not visible here - confirm.
    n_dups = inn_duplicates(raw)
    print(f"Cleared {n_dups} duplicates from dataset. All rows unique.")
    indexed = raw.set_index('inn')
    return indexed
コード例 #6
0
ファイル: pick.py プロジェクト: Witold1/sandbox
    return df[(df.ok1 == ok1) & (df.ok2 == ok2)]


def sales_df(df):
    """Return the ``SMALL_SHOW`` columns of *df* after ``sort_sales``."""
    ordered = sort_sales(df)
    return ordered[SMALL_SHOW]


def ta_df(df):
    """Return the ``SMALL_SHOW`` columns of *df* after ``sort_ta``.

    Counterpart of ``sales_df`` for the assets-ordered view. This used
    to call ``sort_sales``, which made it an exact duplicate of
    ``sales_df``; it now uses ``sort_ta``, the same helper the script
    section of this module applies for its largest-by-assets report.
    """
    return sort_ta(df)[SMALL_SHOW]


if __name__ == "__main__":
    try:
        df
    except NameError:
        df = read_dataframe(2017).set_index('inn')

    # 1. Show the largest companies by sales and by total assets
    # ============================================================

    base_df = base_report(df)
    sf = sort_sales(base_df)[SMALL_SHOW]
    af = sort_ta(base_df)[SMALL_SHOW]
    n = 5
    print("\nКрупнейшие компании по выручке:")
    print(sf.head(n))
    print("\nКрупнейшие компании по активам:")
    print(af.head(n))

    # - financial companies, which have large revenue, are not filtered out
    # - there are ghost companies