import os

import pandas as pd
import toml

import src.utils as u


def main():
    this_dir = os.path.dirname(os.path.realpath(__file__))
    config = toml.load(os.path.join(this_dir, 'config.toml'))
    u.set_full_paths(config, this_dir)

    csv_loc = config['file_locations']['case_study']
    df: pd.DataFrame = pd.read_csv(csv_loc)  # type: ignore
    upper_line_count_limit = 500
    # bins = np.linspace(0, upper_line_count_limit, 10)
    # df['line_count_cat'] = pd.cut(df['line_count'], bins=bins)
    df['file_type'] = df['file_name'].apply(file_type_binner)
    # basename, special-case Dockerfile, then keep what follows the last dot
    df['file_ext'] = df['file_name']\
        .str.replace('.*/(.*?)', '\\1', regex=True)\
        .str.replace('.*/(Dockerfile).*?', '\\1', regex=True)\
        .str.replace('.*\\.(.*?)', r'\1', regex=True)

    pd.set_option('display.max_rows', df.shape[0] + 1)
    pd.set_option('display.max_columns', 10000)
    pd.set_option('display.max_colwidth', 200)
    pd.set_option('display.width', 500)
    pd.set_option("expand_frame_repr", True)
    # pd.set_option("large_repr", "truncate")

    # print(df.sample(5))
    # print(df)
    # print(df['file_type'].value_counts())
    df.drop(columns=['net_line_changes'], inplace=True)
    print(df.groupby(['author', 'file_type']).agg(['count', 'mean', 'sum']))
    # print(df[df['file_type'] == 'config'])

    # Limit the data for charting purposes
    df = df[~(df['lines_added'] > upper_line_count_limit)]
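# `file_type_binner` is referenced above but defined elsewhere in this
# module. A minimal sketch of what such a binner might look like; the
# categories and extension lists are illustrative assumptions (only
# 'config' is hinted at by the commented-out filter above), not the
# original implementation.
def file_type_binner(file_name: str) -> str:
    name = file_name.lower()
    if name.endswith(('.toml', '.yml', '.yaml', '.ini')) or 'dockerfile' in name:
        return 'config'
    if name.endswith(('.py', '.js', '.ts')):
        return 'code'      # assumed category
    if name.endswith(('.md', '.rst', '.txt')):
        return 'docs'      # assumed category
    return 'other'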
import os

import pandas as pd
import toml

import src.utils as u


def main():
    this_dir = os.path.dirname(os.path.realpath(__file__))
    config = toml.load(os.path.join(this_dir, 'config.toml'))
    u.set_full_paths(config, this_dir)

    df: pd.DataFrame = pd.read_csv(
        config['file_locations']['file_changes_raw'])  # type: ignore
    for col in ['lines_added', 'lines_deleted']:
        df = df[df[col] != '-']
        df[col] = pd.to_numeric(df[col], errors='coerce')
    df = df.convert_dtypes()  # convert_dtypes returns a new frame

    # sum up all line adds and deletes per author per file
    summed_gdf = df.groupby(['author', 'file_name']).agg('sum')
    summed_df: pd.DataFrame = summed_gdf.unstack().unstack().reset_index()  # type: ignore
    summed_df.rename(columns={
        'level_0': 'line_action',
        0: 'line_count'
    }, inplace=True)

    # if an author didn't contribute to a file, don't show an entry for
    # them on that file
    summed_df['line_count'] = summed_df['line_count'].astype('str')
    # la_df = summed_df.groupby(['author', 'file_name'])['line_action'].apply(';'.join).reset_index()
    lc_df: pd.DataFrame = summed_df.groupby([
        'author', 'file_name'
    ])['line_count'].apply(';'.join).reset_index()  # type: ignore
    # compressed_df = pd.merge(la_df, lc_df, on=['author', 'file_name'])  # type: ignore

    lc_df[['lines_added', 'lines_deleted']] = \
        lc_df['line_count'].str.split(';', expand=True)
    lc_df.drop('line_count', axis=1, inplace=True)
    lc_df.dropna(subset=['lines_added', 'lines_deleted'], inplace=True)
    # drop author/file pairs with no recorded changes on either side
    lc_df = lc_df[(lc_df['lines_added'] != 'nan')
                  & (lc_df['lines_deleted'] != 'nan')]
    for col in ['lines_added', 'lines_deleted']:
        lc_df = lc_df[lc_df[col] != '-']
        lc_df[col] = pd.to_numeric(lc_df[col], errors='coerce')
    lc_df['net_line_changes'] = lc_df['lines_added'] - lc_df['lines_deleted']
    lc_df.to_csv(config['file_locations']['file_changes_cleaned'], index=False)
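# A minimal, self-contained sketch of the double-unstack trick used in
# main() above, on toy data. The author and file names here are made up
# for illustration.
import pandas as pd

toy = pd.DataFrame({
    'author': ['alice', 'alice', 'bob'],
    'file_name': ['a.py', 'b.py', 'a.py'],
    'lines_added': [10, 5, 3],
    'lines_deleted': [2, 1, 0],
})
summed = toy.groupby(['author', 'file_name']).agg('sum')
# The first unstack pivots file_name into the columns; the second flattens
# the frame into a Series keyed by (measure, file_name, author). Author/file
# pairs with no commits (e.g. bob on b.py) surface as NaN, which is what the
# 'nan' string filtering in main() later removes.
flat = summed.unstack().unstack().reset_index()
flat.rename(columns={'level_0': 'line_action', 0: 'line_count'}, inplace=True)
print(flat)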
import pdfplumber
from py_linq import Enumerable
import toml
import os
import src.utils as u
from pandas_data_analytics.text_parser.parser import Parser
import re
import pandas_data_analytics.text_parser.utils as tu
import time

start_time = time.time()

this_dir = os.path.dirname(os.path.realpath(__file__))
config = toml.load(os.path.join(this_dir, 'config.toml'))
u.set_full_paths(config, this_dir)

pdf_loc = config['file_locations']['data']
parser_settings = toml.load(config['file_locations']['parser_settings'])
result = tu.from_pdf_per_page_prom(pdf_loc, parser_settings)
print(result)
print("--- %s seconds ---" % (time.time() - start_time))
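# `tu.from_pdf_per_page_prom` is project-internal, so its behavior isn't
# shown here. A minimal sketch of plain per-page text extraction with
# pdfplumber itself, which a helper like that presumably wraps:
import pdfplumber

def extract_pages(pdf_path):
    """Return the extracted text of each page ('' when a page has none)."""
    with pdfplumber.open(pdf_path) as pdf:
        return [page.extract_text() or '' for page in pdf.pages]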
import json
import os
import re

import pandas as pd
import toml
from py_linq import Enumerable

import src.utils as u


def main():
    def view_data(pdf):
        # pdf.style.set_properties(**{'width': '300px'})
        field = 'moves_learnt_by_tr' + '_'
        pdf = pdf.filter(regex=f'{field}|name', axis=1)
        # pdf.columns[pdf.columns.str.contains(f'({field}.*|name|generation)', flags=re.I, regex=True)]
        total_rows = len(pdf.index.value_counts())
        unique_rows = len(pdf.drop_duplicates().index.value_counts())
        dup_rows = len(pdf[pdf.duplicated()].index.value_counts())
        percent_duped = (dup_rows / total_rows) * 100
        ps = Enumerable([
            # lambda: pdf.isna().mean().sort_values(ascending=False),
            lambda: pdf.columns,
            lambda: total_rows,
            lambda: unique_rows,
            lambda: dup_rows,
            lambda: percent_duped,
            lambda: pdf.dropna().sample(7),
            # lambda: pdf.sort_values(['name', 'generation', 'moves_learnt_by_level_up_lvl']).drop([], axis=1).sample(5),
            # lambda: pdf.sort_values(['name', 'generation', 'moves_learnt_by_level_up_lvl']).sample(5),
            # lambda: pdf[pdf.duplicated()].sort_values(['name', 'generation']),
        ])
        u.foreach(lambda f: print(f()), ps)

    def implode_move_sets(movesdf, moves):
        # describes what to do with each move set column during aggregation
        agg_dict = {}
        for m in moves:
            agg_dict[f'{m}_json'] = lambda x: x.tolist()

        # 'implodes' the data frame's move sets into lists
        csv_move_df = movesdf\
            .groupby(['name', 'generation'])\
            .agg(agg_dict)\
            .reset_index()

        def move_list_filter(j):
            d = json.loads(j)
            return d['move'] is not None

        # Ex: if on Golem, we drop the Alolan-form Golem moves
        def drop_alternate_form_moves(l):
            current_highest_number = -1
            move_list = []
            for d in l:
                j = json.loads(d)
                if j['lvl'].isnumeric():
                    num = int(j['lvl'])
                    if num < current_highest_number:
                        break
                    else:
                        current_highest_number = num
                move_list.append(j)
            return move_list

        for field in Enumerable(moves).select(lambda m: m + '_json'):
            csv_move_df[field] = csv_move_df[field]\
                .apply(lambda l: Enumerable(l)
                       .where(move_list_filter))\
                .apply(drop_alternate_form_moves)

        # dups are due to Alolan-form / alternate-form tabs in the move sets;
        # try to filter them out by checking whether the level numbers in a
        # move set restart. For each generation: if the levels run
        # 1..23..88 and then 2..22..88, drop the entries from the 2 onward,
        # since those are the Alolan-form move sets.
        return csv_move_df

    this_dir = os.path.dirname(os.path.realpath(__file__))
    config = toml.load(os.path.join(this_dir, 'config.toml'))
    u.set_full_paths(config, this_dir)

    csv_loc = config['file_locations']['data']
    df: pd.DataFrame = pd.read_csv(csv_loc)  # type: ignore
    df: pd.DataFrame = df.drop([
        'web-scraper-order', 'web-scraper-start-url', 'name_link-href',
        'generation-href'
    ], axis=1)  # type: ignore
    df.columns = df.columns.str.replace(' ', '_', regex=True)
    df.sort_values(by="national_no", inplace=True)
    df.reset_index(inplace=True)

    pd.set_option('display.max_rows', df.shape[0] + 1)
    pd.set_option('display.max_columns', 175)

    # name field is unique per pokemon
    # movesdf: pd.DataFrame = df[['name', 'moves_learnt_by_level_up', 'generation']].dropna()  # type: ignore
    movesdf: pd.DataFrame = df.drop(['name_link'], axis=1)  # type: ignore

    types = [
        'ground', 'electric', 'bug', 'ghost', 'normal', 'psychic', 'fire',
        'fairy', 'dark', 'grass', 'fighting', 'water', 'ice', 'dragon',
        'poison', 'rock', 'flying', 'steel'
    ]

    # def get_thing(phrase, s):
    #     m = re.search(phrase, s)
    #     return m.group(1) if m is not None else np.nan

    # def parse_moves_lvl_up(s):
    #     lvl = get_thing('^(\d+)', s)
    #     return lvl

    # movesdf['moves_learnt_by_level_up_lvl'] = movesdf.moves_learnt_by_level_up.apply(parse_moves_lvl_up)

    def parse_moves(df: pd.DataFrame, field: str):
        ldf = pd.DataFrame()
        ldf[f'{field}_lvl'] = df[field]\
            .str.replace(r'(^\d+)(.*)', r'\1', regex=True)
        ldf[f'{field}_move'] = df[field]\
            .str.replace('(^.*?)(\\D+(?:' + '|'.join(types) + '))(.*)', r'\2', regex=True, flags=re.I)\
            .str.replace('(.*?)(' + '|'.join(types) + ')' + '$', r'\1', regex=True, flags=re.I)
        ldf[f'{field}_type'] = df[field]\
            .str.replace('(^.*?)(\\D+(?:' + '|'.join(types) + '))(.*)', r'\2', regex=True, flags=re.I)\
            .str.replace('(.*?)(' + '|'.join(types) + ')' + '$', r'\2', regex=True, flags=re.I)
        ldf[f'{field}_power'] = df[field]\
            .str.strip()\
            .str.replace('(^\\d+)(\\D+(?:' + '|'.join(types) + '))(.*)', r'\3', regex=True, flags=re.I)\
            .str.replace(r'(.*?)(\d{,4}|—|∞)\s{,4}(\d{,4}|—|∞)$', r'\2', regex=True, flags=re.I)\
            .str.replace(r'∞', 'inf', regex=True)\
            .str.replace(r'—', '', regex=True)
        ldf[f'{field}_acc'] = df[field]\
            .str.strip()\
            .str.replace('(^\\d+)(\\D+(?:' + '|'.join(types) + '))(.*)', r'\3', regex=True, flags=re.I)\
            .str.replace(r'(.*?)(\d{,4}|—|∞)\s{,4}(\d{,4}|—|∞)$', r'\3', regex=True, flags=re.I)\
            .str.replace(r'∞', 'inf', regex=True)\
            .str.replace(r'—', '', regex=True)
        return ldf

    # print(df.columns)
    moves = [
        'moves_learnt_by_level_up', 'moves_learnt_by_tm',
        'moves_learnt_by_tr', 'moves_learnt_by_move_tutor',
        'moves_learnt_by_egg', 'moves_learnt_by_hm',
        'moves_learnt_by_transfer', 'moves_learnt_by_evolution'
    ]
    for f in moves:
        movesdf[f] = movesdf[f].astype(str)  # type: ignore
        ldf = parse_moves(movesdf, f)
        # to have move data in columns
        # movesdf = pd.concat([movesdf, ldf], axis=1)
        # to have move data in json format in 1 column;
        # shorten the json keys to lvl/move/type/power/acc
        ldf.columns = ldf.columns\
            .str.replace(f, '', regex=True)\
            .str.replace('_', '', regex=True)
        movesdf[f'{f}_json'] = ldf.to_json(
            orient='records', lines=True).splitlines()  # type: ignore
        # print(f)
        # print(movesdf[f'{f}_json'].tolist()[0:10])

    csv_move_df = implode_move_sets(movesdf, moves)
    # print(type(csv_move_df))
    print(csv_move_df.columns)  # type: ignore
    print(len(csv_move_df))  # type: ignore
    # print(csv_move_df.sample(6))
    # csv_move_df.to_csv(config['file_locations']['cleaned_data'], index=False)

    general_df: pd.DataFrame = df.drop(
        [
            'moves_learnt_by_level_up', 'moves_learnt_by_tm',
            'moves_learnt_by_tr', 'moves_learnt_by_move_tutor',
            'moves_learnt_by_egg', 'moves_learnt_by_hm',
            'moves_learnt_by_transfer', 'moves_learnt_by_evolution'
        ],
        axis=1).drop_duplicates()  # type: ignore
    print(len(df))
    print(len(general_df))
    print(general_df.columns)
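# A minimal, self-contained sketch of the regex strategy `parse_moves` uses
# above. '1Tackle Normal40 100' is a guess at the scraped cell format
# (level, move name, type, power, accuracy run together), not real data,
# and the type list is abbreviated.
import re
import pandas as pd

types_alt = '|'.join(['normal', 'fire', 'water'])
s = pd.Series(['1Tackle Normal40 100'])
# leading digits are the level
lvl = s.str.replace(r'(^\d+)(.*)', r'\1', regex=True)
# the non-digit run ending in a type name is the move-plus-type chunk ...
body = s.str.replace('(^.*?)(\\D+(?:' + types_alt + '))(.*)', r'\2',
                     regex=True, flags=re.I)
# ... which a second anchored pass splits into move and type
move = body.str.replace('(.*?)(' + types_alt + ')$', r'\1',
                        regex=True, flags=re.I)
mtype = body.str.replace('(.*?)(' + types_alt + ')$', r'\2',
                         regex=True, flags=re.I)
print(lvl[0], move[0].strip(), mtype[0])  # -> 1 Tackle Normal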