Example #1
# Start with rosters CSV
rosters_url = url_base + roster_path
rosters_df = pd.read_csv(rosters_url, compression='gzip', low_memory=False)

# Grab the teams table from the rosters dataframe: keep only the 'team.*'
# columns, de-duplicate, and strip the 'team.' prefix from the column names.
team_cols = [col for col in rosters_df if col.startswith('team.')]

teams_df = (rosters_df[team_cols]
            .drop_duplicates()
            .rename(columns=lambda x: x.replace('team.', '')))

teams_csv_file = 'teams.csv'  # NOTE(review): unused in this snippet — confirm it is needed downstream
teams_pks = ['season', 'teamId']  # primary key: one row per team per season

repo.import_df('teams', teams_df, teams_pks, 'update')

# Grab the players table from the rosters dataframe. Carry along the
# 'team.season'/'team.teamId' columns as foreign keys back to teams.
player_cols = [col for col in rosters_df if col.startswith('teamPlayers.')]
player_cols = player_cols + ['team.season', 'team.teamId']

# .copy() so the column assignments below operate on an independent frame
# rather than a slice of rosters_df (avoids SettingWithCopyWarning and
# silently-lost writes).
players_df = rosters_df[player_cols].copy()
# Both prefixes never overlap ('teamPlayers.' does not contain 'team.'),
# so a single rename pass handles both.
players_df = players_df.rename(
    columns=lambda x: x.replace('team.', '').replace('teamPlayers.', ''))

# jerseyNumber arrives as a float (NaN when missing); normalize it to a
# plain digit string with the trailing '.0' stripped ('' when missing).
players_df['jerseyNumber'] = (players_df['jerseyNumber']
                              .fillna('')
                              .astype(str)
                              .str.split('.')
                              .str[0])
players_df['birthDate'] = pd.to_datetime(players_df['birthDate'])
#
# Start main
#
# Parse every case file into a flat dict, collecting transcript utterances
# along the way, then bulk-import both tables into Dolt.
cases = []
transcripts = []
for file in glob.glob('supreme-court-cases/cases/*/*.js'):
    # Skip files with an empty stem (literally named '.js').
    # NOTE(review): the original compared the full glob path to '.js',
    # which can never match since glob results include the directory
    # prefix; compare the basename instead.
    if file.split('/')[-1] == '.js':
        continue

    case = {}
    import_case_file(file, case, transcripts)
    convert_dates(case, date_convert)

    cases.append(case)

#
# Import to Dolt
#
cases_df = pandas.DataFrame(cases)
transcripts_df = pandas.DataFrame(transcripts)

# Composite key: one row per utterance within a case transcript.
transcripts_pks = ['case_name', 'title', 'speaker', 'start']

repo = Dolt('./')
repo.import_df('cases', cases_df, ['case_name'], import_mode='replace')
repo.import_df('transcripts',
               transcripts_df,
               transcripts_pks,
               import_mode='replace')
        else:
            # NOTE(review): orphaned fragment — the enclosing function and
            # the matching `if` branch are not visible in this chunk.
            # Presumably the fallback path of a date-conversion helper that
            # parses a '<Mon> <day>, <year>' string (e.g. 'Jan 2, 1970')
            # into a datetime; confirm against the full file.
            data[col] = datetime.datetime.strptime(f'{month} {day}, {year}',
                                                   '%b %d, %Y')


#
# Coerce the JSON file into a flat dictionary
#
with open("supreme-court-cases/justices.js") as file:
    justice_dict = json.load(file)

justices = []
for justice_name in dict.keys(justice_dict):
    justice = justice_dict[justice_name]
    output = {}
    normalize(justice, column_map, output)
    convert_dates(output, date_cols)
    convert_dates_3col(output, date_3cols)
    justices.append(output)

#
# Import into Dolt
#
justices_df = pandas.DataFrame(justices)

# Convert boolean columns using pandas
justices_df[["ethnic"]] *= 1

repo = Dolt('./')
repo.import_df('justices', justices_df, ['name'], import_mode='replace')
#!/usr/local/bin/python3
"""Load nflfastR play-by-play data for the 2000-2019 seasons into Dolt."""

import pandas as pd

from doltpy.core import Dolt

from pprint import pprint

url_base = 'https://raw.githubusercontent.com/guga31bb/nflfastR-data/master/data/'

repo = Dolt('.')

# Download one gzipped CSV per season and concatenate once at the end.
# (Collect-then-concat is linear; DataFrame.append in a loop re-copies
# the accumulated frame each iteration and was removed in pandas 2.0.)
season_frames = []
for year in range(2000, 2020):
    url = url_base + 'play_by_play_' + str(year) + '.csv.gz?raw=True'
    season_frames.append(pd.read_csv(url, compression='gzip', low_memory=False))

pbp_df = pd.concat(season_frames, sort=True)

# Give each row a unique index
pbp_df.reset_index(drop=True, inplace=True)

plays_pks = ['game_id', 'play_id']  # composite primary key
plays_csv_file = 'plays.csv'  # NOTE(review): unused here — confirm it is needed downstream

repo.import_df('plays', pbp_df, plays_pks, 'update')