def test_identical_stubnames(self):
    df = pd.DataFrame({'A2010': [1.0, 2.0], 'A2011': [3.0, 4.0],
                       'B2010': [5.0, 6.0], 'A': ['X1', 'X2']})
    msg = "stubname can't be identical to a column name"
    with pytest.raises(ValueError, match=msg):
        wide_to_long(df, ['A', 'B'], i='A', j='colname')
def test_non_unique_idvars(self):
    # GH16382
    # Raise an error message if non unique id vars (i) are passed
    df = pd.DataFrame({
        'A_A1': [1, 2, 3, 4, 5],
        'B_B1': [1, 2, 3, 4, 5],
        'x': [1, 1, 1, 1, 1]
    })
    msg = "the id variables need to uniquely identify each row"
    with pytest.raises(ValueError, match=msg):
        wide_to_long(df, ['A_A', 'B_B'], i='x', j='colname')
def test_separating_character(self):
    # GH14779
    np.random.seed(123)
    x = np.random.randn(3)
    df = pd.DataFrame({"A.1970": {0: "a", 1: "b", 2: "c"},
                       "A.1980": {0: "d", 1: "e", 2: "f"},
                       "B.1970": {0: 2.5, 1: 1.2, 2: .7},
                       "B.1980": {0: 3.2, 1: 1.3, 2: .1},
                       "X": dict(zip(range(3), x))})
    df["id"] = df.index
    exp_data = {"X": x.tolist() + x.tolist(),
                "A": ['a', 'b', 'c', 'd', 'e', 'f'],
                "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
                "year": ['1970', '1970', '1970', '1980', '1980', '1980'],
                "id": [0, 1, 2, 0, 1, 2]}
    exp_frame = DataFrame(exp_data)
    exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]]
    long_frame = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".")
    tm.assert_frame_equal(long_frame, exp_frame)
def test_escapable_characters(self):
    np.random.seed(123)
    x = np.random.randn(3)
    df = pd.DataFrame({"A(quarterly)1970": {0: "a", 1: "b", 2: "c"},
                       "A(quarterly)1980": {0: "d", 1: "e", 2: "f"},
                       "B(quarterly)1970": {0: 2.5, 1: 1.2, 2: .7},
                       "B(quarterly)1980": {0: 3.2, 1: 1.3, 2: .1},
                       "X": dict(zip(range(3), x))})
    df["id"] = df.index
    exp_data = {"X": x.tolist() + x.tolist(),
                "A(quarterly)": ['a', 'b', 'c', 'd', 'e', 'f'],
                "B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
                "year": [1970, 1970, 1970, 1980, 1980, 1980],
                "id": [0, 1, 2, 0, 1, 2]}
    expected = DataFrame(exp_data)
    expected = expected.set_index(
        ['id', 'year'])[["X", "A(quarterly)", "B(quarterly)"]]
    result = wide_to_long(df, ["A(quarterly)", "B(quarterly)"],
                          i="id", j="year")
    tm.assert_frame_equal(result, expected)
def test_character_overlap(self):
    # Test we handle overlapping characters in both id_vars and value_vars
    df = pd.DataFrame({
        'A11': ['a11', 'a22', 'a33'],
        'A12': ['a21', 'a22', 'a23'],
        'B11': ['b11', 'b12', 'b13'],
        'B12': ['b21', 'b22', 'b23'],
        'BB11': [1, 2, 3],
        'BB12': [4, 5, 6],
        'BBBX': [91, 92, 93],
        'BBBZ': [91, 92, 93]
    })
    df['id'] = df.index
    expected = pd.DataFrame({
        'BBBX': [91, 92, 93, 91, 92, 93],
        'BBBZ': [91, 92, 93, 91, 92, 93],
        'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'],
        'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'],
        'BB': [1, 2, 3, 4, 5, 6],
        'id': [0, 1, 2, 0, 1, 2],
        'year': [11, 11, 11, 12, 12, 12]})
    expected = expected.set_index(['id', 'year'])[
        ['BBBX', 'BBBZ', 'A', 'B', 'BB']]
    result = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
    tm.assert_frame_equal(result.sort_index(axis=1),
                          expected.sort_index(axis=1))
def test_num_string_disambiguation(self):
    # Test that we can disambiguate number value_vars from
    # string value_vars
    df = pd.DataFrame({
        'A11': ['a11', 'a22', 'a33'],
        'A12': ['a21', 'a22', 'a23'],
        'B11': ['b11', 'b12', 'b13'],
        'B12': ['b21', 'b22', 'b23'],
        'BB11': [1, 2, 3],
        'BB12': [4, 5, 6],
        'Arating': [91, 92, 93],
        'Arating_old': [91, 92, 93]
    })
    df['id'] = df.index
    expected = pd.DataFrame({
        'Arating': [91, 92, 93, 91, 92, 93],
        'Arating_old': [91, 92, 93, 91, 92, 93],
        'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'],
        'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'],
        'BB': [1, 2, 3, 4, 5, 6],
        'id': [0, 1, 2, 0, 1, 2],
        'year': [11, 11, 11, 12, 12, 12]})
    expected = expected.set_index(['id', 'year'])[
        ['Arating', 'Arating_old', 'A', 'B', 'BB']]
    result = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
    tm.assert_frame_equal(result.sort_index(axis=1),
                          expected.sort_index(axis=1))
def test_cast_j_int(self):
    df = pd.DataFrame({
        'actor_1': ['CCH Pounder', 'Johnny Depp', 'Christoph Waltz'],
        'actor_2': ['Joel David Moore', 'Orlando Bloom', 'Rory Kinnear'],
        'actor_fb_likes_1': [1000.0, 40000.0, 11000.0],
        'actor_fb_likes_2': [936.0, 5000.0, 393.0],
        'title': ['Avatar', "Pirates of the Caribbean", 'Spectre']})
    expected = pd.DataFrame({
        'actor': ['CCH Pounder', 'Johnny Depp', 'Christoph Waltz',
                  'Joel David Moore', 'Orlando Bloom', 'Rory Kinnear'],
        'actor_fb_likes': [1000.0, 40000.0, 11000.0, 936.0, 5000.0, 393.0],
        'num': [1, 1, 1, 2, 2, 2],
        'title': ['Avatar', 'Pirates of the Caribbean', 'Spectre',
                  'Avatar', 'Pirates of the Caribbean',
                  'Spectre']}).set_index(['title', 'num'])
    result = wide_to_long(df, ['actor', 'actor_fb_likes'],
                          i='title', j='num', sep='_')
    tm.assert_frame_equal(result, expected)
def wide_to_long(df, stubs):
    # Only operate on necessary columns to keep memory usage down.
    cols_to_keep = [col for col in df.columns
                    for stub in stubs if col.startswith(stub)]
    cols_to_keep.extend(['cin', 'source_date', 'eligibility_date'])
    dw = df[cols_to_keep].copy()
    dw['id'] = dw.index
    dw = pd.wide_to_long(dw, stubs, 'cin', 'j')
    dw['cardinal'] = dw.index.get_level_values('j').str[-1]
    dw = dw.reset_index()
    return dw
def test_stubs(self):
    # GH9204
    df = pd.DataFrame([[0, 1, 2, 3, 8], [4, 5, 6, 7, 9]])
    df.columns = ['id', 'inc1', 'inc2', 'edu1', 'edu2']
    stubs = ['inc', 'edu']

    # TODO: unused?
    df_long = pd.wide_to_long(df, stubs, i='id', j='age')  # noqa

    assert stubs == ['inc', 'edu']
def wide_to_long_by_aidcode(df):
    aidcode_stubs = ['aidcode', 'respcounty', 'eligibilitystatus', 'full',
                     'fosterx', 'disabledx', 'ffp']
    cols_to_keep = [col for col in df.columns
                    for stub in aidcode_stubs if col.startswith(stub)]
    cols_to_keep.extend(['cin', 'calendar'])
    dw = df[cols_to_keep].copy()
    dw['id'] = dw.index
    dw = pd.wide_to_long(dw, aidcode_stubs, 'cin', 'j')
    dw = dw.reset_index()
    return dw
def txt_to_df(path):
    df = pd.read_csv(path, header=None)
    df.columns = ['name'] + [str(j) + str(i)
                             for i in ['1', '2', '3', '4', '5']
                             for j in ['q', 'a1', 'a2', 'a3']]
    file_names = df['name']
    df = pd.wide_to_long(df, stubnames=['q', 'a1', 'a2', 'a3'],
                         i='name', j='qa')
    df['index'] = list(map(lambda x: x[0], df.index))
    df['qa'] = list(map(lambda x: x[1], df.index))
    df['index'] = df['index'].astype('category')
    df['index'].cat.set_categories(file_names, inplace=True)
    df.sort_values(['index', 'qa'], ascending=True, inplace=True)
    return df, file_names
def test_col_substring_of_stubname(self):
    # GH22468
    # Don't raise ValueError when a column name is a substring
    # of a stubname that's been passed as a string
    wide_data = {'node_id': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4},
                 'A': {0: 0.80, 1: 0.0, 2: 0.25, 3: 1.0, 4: 0.81},
                 'PA0': {0: 0.74, 1: 0.56, 2: 0.56, 3: 0.98, 4: 0.6},
                 'PA1': {0: 0.77, 1: 0.64, 2: 0.52, 3: 0.98, 4: 0.67},
                 'PA3': {0: 0.34, 1: 0.70, 2: 0.52, 3: 0.98, 4: 0.67}}
    wide_df = pd.DataFrame.from_dict(wide_data)
    expected = pd.wide_to_long(wide_df, stubnames=['PA'],
                               i=['node_id', 'A'], j='time')
    result = pd.wide_to_long(wide_df, stubnames='PA',
                             i=['node_id', 'A'], j='time')
    tm.assert_frame_equal(result, expected)
def wide_to_long(df, thresh):
    """Reshape dataframe from wide to long format."""
    fields = ["loss"]
    stubs = ["%s_%d_" % (prefix, thresh) for prefix in fields]
    new_names = dict(zip(stubs, fields))
    df = pd.wide_to_long(df, stubs, i="id", j="year")
    df = df.reset_index()
    df = df.rename(columns=new_names)
    return df
def l2w_pre(df, varname):
    """
    Changes main dataframe from wide to long. Requires pre-processed full years

    df: DataFrame to be transformed from wide to long
    varname: Column name of the transformed variable
    """
    long_df = pd.wide_to_long(df, [varname], i="DunsNumber", j="year")
    long_df.sort_index(inplace=True)
    long_df.dropna(inplace=True)
    return long_df
def test_nonnumeric_suffix(self):
    df = pd.DataFrame({'treatment_placebo': [1.0, 2.0],
                       'treatment_test': [3.0, 4.0],
                       'result_placebo': [5.0, 6.0],
                       'A': ['X1', 'X2']})
    expected = pd.DataFrame({
        'A': ['X1', 'X1', 'X2', 'X2'],
        'colname': ['placebo', 'test', 'placebo', 'test'],
        'result': [5.0, np.nan, 6.0, np.nan],
        'treatment': [1.0, 3.0, 2.0, 4.0]})
    expected = expected.set_index(['A', 'colname'])
    result = wide_to_long(df, ['result', 'treatment'],
                          i='A', j='colname', suffix='[a-z]+', sep='_')
    tm.assert_frame_equal(result, expected)
def test_mixed_type_suffix(self):
    df = pd.DataFrame({
        'treatment_1': [1.0, 2.0],
        'treatment_foo': [3.0, 4.0],
        'result_foo': [5.0, 6.0],
        'result_1': [0, 9],
        'A': ['X1', 'X2']})
    expected = pd.DataFrame({
        'A': ['X1', 'X2', 'X1', 'X2'],
        'colname': ['1', '1', 'foo', 'foo'],
        'result': [0.0, 9.0, 5.0, 6.0],
        'treatment': [1.0, 2.0, 3.0, 4.0]}).set_index(['A', 'colname'])
    result = wide_to_long(df, ['result', 'treatment'],
                          i='A', j='colname', suffix='.+', sep='_')
    tm.assert_frame_equal(result, expected)
def test_float_suffix(self):
    df = pd.DataFrame({
        'treatment_1.1': [1.0, 2.0],
        'treatment_2.1': [3.0, 4.0],
        'result_1.2': [5.0, 6.0],
        'result_1': [0, 9],
        'A': ['X1', 'X2']})
    expected = pd.DataFrame({
        'A': ['X1', 'X1', 'X1', 'X1', 'X2', 'X2', 'X2', 'X2'],
        'colname': [1, 1.1, 1.2, 2.1, 1, 1.1, 1.2, 2.1],
        'result': [0.0, np.nan, 5.0, np.nan, 9.0, np.nan, 6.0, np.nan],
        'treatment': [np.nan, 1.0, np.nan, 3.0,
                      np.nan, 2.0, np.nan, 4.0]})
    expected = expected.set_index(['A', 'colname'])
    result = wide_to_long(df, ['result', 'treatment'],
                          i='A', j='colname', suffix='[0-9.]+', sep='_')
    tm.assert_frame_equal(result, expected)
def test_unbalanced(self):
    # test that we can have a varying amount of time variables
    df = pd.DataFrame({'A2010': [1.0, 2.0],
                       'A2011': [3.0, 4.0],
                       'B2010': [5.0, 6.0],
                       'X': ['X1', 'X2']})
    df['id'] = df.index
    exp_data = {'X': ['X1', 'X1', 'X2', 'X2'],
                'A': [1.0, 3.0, 2.0, 4.0],
                'B': [5.0, np.nan, 6.0, np.nan],
                'id': [0, 0, 1, 1],
                'year': [2010, 2011, 2010, 2011]}
    expected = pd.DataFrame(exp_data)
    expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
    result = wide_to_long(df, ['A', 'B'], i='id', j='year')
    tm.assert_frame_equal(result, expected)
def test_multiple_id_columns(self):
    # Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm
    df = pd.DataFrame({
        'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
        'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
        'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
        'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
    })
    expected = pd.DataFrame({
        'ht': [2.8, 3.4, 2.9, 3.8, 2.2, 2.9, 2.0, 3.2, 1.8, 2.8,
               1.9, 2.4, 2.2, 3.3, 2.3, 3.4, 2.1, 2.9],
        'famid': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3],
        'birth': [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3],
        'age': [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2]
    })
    expected = expected.set_index(['famid', 'birth', 'age'])[['ht']]
    result = wide_to_long(df, 'ht', i=['famid', 'birth'], j='age')
    tm.assert_frame_equal(result, expected)
def load_old_data(self, file_path):
    """
    Load the datafile of the old format.
    """
    col_names = ("C/A,UNIT,SCP,"
                 "DATE1,TIME1,DESC1,ENTRIES1,EXITS1,"
                 "DATE2,TIME2,DESC2,ENTRIES2,EXITS2,"
                 "DATE3,TIME3,DESC3,ENTRIES3,EXITS3,"
                 "DATE4,TIME4,DESC4,ENTRIES4,EXITS4,"
                 "DATE5,TIME5,DESC5,ENTRIES5,EXITS5,"
                 "DATE6,TIME6,DESC6,ENTRIES6,EXITS6,"
                 "DATE7,TIME7,DESC7,ENTRIES7,EXITS7,"
                 "DATE8,TIME8,DESC8,ENTRIES8,EXITS8").split(',')
    df_wide = pd.read_csv(file_path, header=None, names=col_names,
                          dtype={'ENTRIES': np.float64, 'EXITS': np.float64})
    df_wide['ind'] = df_wide.index
    df_long = pd.wide_to_long(df_wide,
                              ["DATE", "TIME", "DESC", "ENTRIES", "EXITS"],
                              i='ind', j='loc')
    df_long = df_long.sort_index().reset_index(drop=True)
    # Merge with the station table
    df_long = df_long.merge(self.station_table, on=["C/A", "UNIT"], how="left")
    # Change the format of the DATE
    # Drop a few data entry errors that wrongly put TIME into DATE
    # (.ix was removed from pandas; .loc is the equivalent here)
    df_long = df_long.loc[~df_long.DATE.str.contains(":").fillna(False), :]
    df_long["DATE"] = pd.DatetimeIndex(df_long.DATE).format()
    # Create datetime
    df_long["DATETIME"] = pd.to_datetime(df_long["DATE"] + " " + df_long["TIME"])
    return df_long
def test_invalid_separator(self):
    # if an invalid separator is supplied an empty data frame is returned
    sep = 'nope!'
    df = pd.DataFrame({'A2010': [1.0, 2.0],
                       'A2011': [3.0, 4.0],
                       'B2010': [5.0, 6.0],
                       'X': ['X1', 'X2']})
    df['id'] = df.index
    exp_data = {'X': '',
                'A2010': [],
                'A2011': [],
                'B2010': [],
                'id': [],
                'year': [],
                'A': [],
                'B': []}
    expected = pd.DataFrame(exp_data).astype({'year': 'int'})
    expected = expected.set_index(['id', 'year'])[[
        'X', 'A2010', 'A2011', 'B2010', 'A', 'B']]
    expected.index.set_levels([0, 1], level=0, inplace=True)
    result = wide_to_long(df, ['A', 'B'], i='id', j='year', sep=sep)
    tm.assert_frame_equal(result.sort_index(axis=1),
                          expected.sort_index(axis=1))
def test_subclassed_wide_to_long(self):
    # GH 9762
    np.random.seed(123)
    x = np.random.randn(3)
    df = tm.SubclassedDataFrame({
        "A1970": {0: "a", 1: "b", 2: "c"},
        "A1980": {0: "d", 1: "e", 2: "f"},
        "B1970": {0: 2.5, 1: 1.2, 2: .7},
        "B1980": {0: 3.2, 1: 1.3, 2: .1},
        "X": dict(zip(range(3), x))})
    df["id"] = df.index
    exp_data = {"X": x.tolist() + x.tolist(),
                "A": ['a', 'b', 'c', 'd', 'e', 'f'],
                "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
                "year": [1970, 1970, 1970, 1980, 1980, 1980],
                "id": [0, 1, 2, 0, 1, 2]}
    expected = tm.SubclassedDataFrame(exp_data)
    expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
    long_frame = pd.wide_to_long(df, ["A", "B"], i="id", j="year")
    tm.assert_frame_equal(long_frame, expected)
def test_invalid_suffixtype(self):
    # If all stub names end with a string, but a numeric suffix is
    # assumed, an empty data frame is returned
    df = pd.DataFrame({'Aone': [1.0, 2.0],
                       'Atwo': [3.0, 4.0],
                       'Bone': [5.0, 6.0],
                       'X': ['X1', 'X2']})
    df['id'] = df.index
    exp_data = {'X': '',
                'Aone': [],
                'Atwo': [],
                'Bone': [],
                'id': [],
                'year': [],
                'A': [],
                'B': []}
    expected = pd.DataFrame(exp_data).astype({'year': 'int'})
    expected = expected.set_index(['id', 'year'])
    expected.index.set_levels([0, 1], level=0, inplace=True)
    result = wide_to_long(df, ['A', 'B'], i='id', j='year')
    tm.assert_frame_equal(result.sort_index(axis=1),
                          expected.sort_index(axis=1))
df_seg = pd.read_excel(path_to_seg)
df_seg.session = pd.to_datetime(df_seg.session)

# -- read car-t dates
df_dates = pd.read_excel(config.coh_dates_upn_map, header=0)

# clean column headers
df_dates.columns = [str(col).strip() for col in df_dates.columns]

# convert dates to datetime
df_dates.MRI1 = pd.to_datetime(df_dates.MRI1)
df_dates.MRI2 = pd.to_datetime(df_dates.MRI2)
df_dates.MRI3 = pd.to_datetime(df_dates.MRI3)
df_dates.MRI4 = pd.to_datetime(df_dates.MRI4)

df_dates_wide = pd.wide_to_long(df_dates, stubnames=['MRI', 'RANO'],
                                i=['UPN'],
                                j='cart_administration').reset_index()
df_dates_wide = df_dates_wide[[
    'UPN', 'DCE', 'ARM', 'MRI', 'RANO', 'cart_administration'
]]
df_dates_wide.columns = [
    'subject_id', 'has_dce', 'trial_arm', 'session', 'rano',
    'cart_administration'
]
df_dates_wide.to_excel('/Volumes/BRAIN/CART/IRB13384_wide.xls')

# merge both dataframes
# merge and keep keys from both dfs
df_merged = pd.merge(df_seg, df_dates_wide,
def time_wide_to_long_big(self):
    wide_to_long(self.df, self.letters, i='id', j='year')
# now transposing the variables which include months
df['ind'] = df.index

# need to add prefix to variables with similar names.
rename_dict = {'PD_per_hh_preyr': 'XX_PD_per_hh_preyr',
               'delay_amt': 'XX_delay_amt',
               'hh_P_emp_pre': 'XX_hh_P_emp_pre',
               'lab_pre': 'XX_lab_pre',
               'unemp_amt': 'XX_unemp_amt',
               'unpaid_delay_amt': 'XX_unpaid_delay_amt'}

# now reshape
# (dict.iteritems() is Python 2 only; items() works under Python 3)
for k, v in rename_dict.items():
    df.rename(columns=lambda x: x.replace(k, v) if x.find(k) == 0 else x,
              inplace=True)

for m in range(1, 13):
    reshape_cols = [col.replace(str(m).zfill(2), "")
                    for col in df.columns.values if str(m).zfill(2) in col]

dfT = pd.wide_to_long(df, reshape_cols, i='ind', j='month')
dfT.columns = [x.replace("XX_", "") for x in dfT.columns]
dfT.reset_index(level=1, inplace=True)  # get rid of month as the index for now.

# create a calendar year variable.
dfT['cal_year'] = np.where((dfT['month'] >= 4),
                           dfT.year.str[0:4], dfT.year.str[-4:])

# create pandas date object
dfT['cal_date'] = pd.to_datetime(dfT.month.astype(str) + dfT.cal_year.astype(str),
                                 format="%m%Y")

"""
Analysis/graph making begins below
"""

### Delay days (per household employed)
def read_ncdb(filepath):
    """
    Read data from Geolytics's Neighborhood Change Database (NCDB) and
    store it for later use.

    Parameters
    ----------
    filepath : str
        location of the input CSV file extracted from your Geolytics DVD

    Returns
    -------
    DataFrame
    """
    ncdb_vars = variables["ncdb"].dropna()[1:].values

    df = pd.read_csv(
        filepath,
        low_memory=False,
        na_values=["", " ", 99999, -999],
        converters={
            "GEO2010": str,
            "COUNTY": str,
            "COUSUB": str,
            "DIVISION": str,
            "REGION": str,
            "STATE": str,
        },
    )

    cols = df.columns
    fixed = []
    for col in cols:
        if col.endswith("D"):
            fixed.append("D" + col[:-1])
        elif col.endswith("N"):
            fixed.append("N" + col[:-1])
        elif col.endswith("1A"):
            fixed.append(col[:-2] + "2")

    orig = []
    for col in cols:
        if col.endswith("D"):
            orig.append(col)
        elif col.endswith("N"):
            orig.append(col)
        elif col.endswith("1A"):
            orig.append(col)

    df.rename(dict(zip(orig, fixed)), axis="columns", inplace=True)

    df = pd.wide_to_long(df, stubnames=ncdb_vars, i="GEO2010", j="year",
                         suffix="(7|8|9|0|1|2)").reset_index()

    df["year"] = df["year"].replace({
        7: 1970,
        8: 1980,
        9: 1990,
        0: 2000,
        1: 2010,
        2: 2010
    })

    df = df.groupby(["GEO2010", "year"]).first()

    mapper = dict(zip(variables.ncdb, variables.ltdb))

    df.reset_index(inplace=True)
    df = df.rename(mapper, axis="columns")
    df = df.set_index("geoid")

    store = pd.HDFStore(os.path.join(package_directory, "data.h5"), "w")
    store["ncdb"] = df
    store.close()

    return df
def store_ncdb(filepath):
    """
    Read & store data from Geolytics's Neighborhood Change Database.

    Parameters
    ----------
    filepath : str
        location of the input CSV file extracted from your Geolytics DVD
    """
    ncdb_vars = datasets.codebook()["ncdb"].dropna()[1:].values

    names = []
    for name in ncdb_vars:
        for suffix in ["7", "8", "9", "0", "1", "2"]:
            names.append(name + suffix)
    names.append("GEO2010")

    c = pd.read_csv(filepath, nrows=1).columns
    c = pd.Series(c.values)

    keep = []
    for _, col in c.items():
        for name in names:
            if col.startswith(name):
                keep.append(col)

    df = pd.read_csv(
        filepath,
        usecols=keep,
        engine="c",
        na_values=["", " ", 99999, -999],
        converters={
            "GEO2010": str,
            "COUNTY": str,
            "COUSUB": str,
            "DIVISION": str,
            "REGION": str,
            "STATE": str,
        },
    )

    cols = df.columns
    fixed = []
    for col in cols:
        if col.endswith("D"):
            fixed.append("D" + col[:-1])
        elif col.endswith("N"):
            fixed.append("N" + col[:-1])
        elif col.endswith("1A"):
            fixed.append(col[:-2] + "2")

    orig = []
    for col in cols:
        if col.endswith("D"):
            orig.append(col)
        elif col.endswith("N"):
            orig.append(col)
        elif col.endswith("1A"):
            orig.append(col)

    renamer = dict(zip(orig, fixed))
    df.rename(renamer, axis="columns", inplace=True)

    df = df[df.columns[df.columns.isin(names)]]

    df = pd.wide_to_long(df, stubnames=ncdb_vars, i="GEO2010", j="year",
                         suffix="(7|8|9|0|1|2)").reset_index()

    df["year"] = df["year"].replace({
        7: 1970,
        8: 1980,
        9: 1990,
        0: 2000,
        1: 2010,
        2: 2010
    })

    df = df.groupby(["GEO2010", "year"]).first()

    mapper = dict(zip(datasets.codebook().ncdb, datasets.codebook().variable))

    df.reset_index(inplace=True)
    df = df.rename(mapper, axis="columns")
    df = df.set_index("geoid")

    for row in datasets.codebook()["formula"].dropna().tolist():
        try:
            df.eval(row, inplace=True)
        except:
            warn("Unable to compute " + str(row))

    keeps = df.columns[df.columns.isin(
        datasets.codebook()["variable"].tolist() + ["year"])]
    df = df[keeps]
    df = df.loc[df.n_total_pop != 0]

    df.to_parquet(os.path.join(data_dir, "ncdb.parquet"), compression="brotli")
    storage.set("ncdb", os.path.join(data_dir, "ncdb.parquet"))
    storage.build("geosnap_data/storage")
def generate_tstats_classes(df, dest_dir, params):
    """Computes t-test for each class.

    This function computes a t-test for each class in the dataset. The t-test
    is computed by comparing class level metrics for a set of sparse model
    checkpoints to non-sparse model checkpoints.

    Args:
      df: input dataframe with class level metrics.
      dest_dir: pathway to output directory.
      params: dataset specific params.
    """
    human_label_lookup = class_level_metrics.HumanLabelLookup()
    label_dict = human_label_lookup.create_library()
    class_names = list(label_dict.values())
    df.drop(columns='Unnamed: 0')
    df.reset_index(inplace=True, drop=True)
    df['id'] = df.index
    df_ = pd.wide_to_long(df, stubnames=['precision', 'recall'],
                          i='id', j='class', sep='/',
                          suffix=r'\w+').reset_index()
    data = pd.DataFrame([])
    num_classes = params['num_classes']
    mean_accuracy_dict = params['accuracy']
    long_df_all = df_
    for i in range(num_classes):
        # adding label id ensures unique naming of classes
        c = class_names[i] + '_' + str(i)
        for p in [0.1, 0.3, 0.5, 0.7, 0.9]:
            variant_mean_recall = long_df_all[(
                (long_df_all['fraction_pruned'] == p)
                & (long_df_all['class'] == c))]['recall'].mean()
            baseline_mean_recall = long_df_all[(
                (long_df_all['fraction_pruned'] == 0.0)
                & (long_df_all['class'] == c))]['recall'].mean()
            # normalize recall by model accuracy
            baseline_set = long_df_all[(
                (long_df_all['fraction_pruned'] == 0.0)
                & (long_df_all['class'] == c))]['recall'] - mean_accuracy_dict[0.0]
            variant_set = long_df_all[(
                (long_df_all['fraction_pruned'] == p)
                & (long_df_all['class'] == c))]['recall'] - mean_accuracy_dict[p]
            t_stat = ttest_ind(baseline_set, variant_set, equal_var=False)
            data = data.append(pd.DataFrame(
                {
                    'class': c,
                    'pruning_fraction': p,
                    'baseline_mean_recall': baseline_mean_recall,
                    'variant_mean_recall': variant_mean_recall,
                    'pvalue_recall_norm': t_stat[1],
                    'statistic_recall_norm': t_stat[0],
                },
                index=[0]), ignore_index=True)
    time_ = str(time.time())
    output_file = 'recall_t_statistic'
    file_name = '_' + time_ + '_' + output_file + '.csv'
    file_path = os.path.join(dest_dir, file_name)
    with tf.gfile.Open(file_path, 'w') as f:
        data.to_csv(f)
import gurobipy as grb
import itertools
import pickle
import pandas as pd

# Read CSV --------------------------------------------------------
family_data = pd.read_csv('attempt_01/inputs/family_data.csv')

# Define indices and data -----------------------------------------
family = list(range(0, 5000))
days = list(range(1, 101))

family_days = pd.DataFrame(list(itertools.product(family, days)),
                           columns=['family_id', 'day'])

family_people = family_data[['family_id', 'n_people']]

family_choices = pd.wide_to_long(family_data, stubnames='choice_',
                                 i='family_id', j='day') \
    .reset_index() \
    .rename(columns={'day': 'choice', 'choice_': 'day'}) \
    .drop('n_people', axis=1)

family_costs = family_days \
    .merge(family_choices, how='left', on=['family_id', 'day']) \
    .merge(family_people, how='left', on='family_id')

conditions = [(family_costs['choice'] == 0),
              (family_costs['choice'] == 1),
              (family_costs['choice'] == 2),
              (family_costs['choice'] == 3),
              (family_costs['choice'] == 4),
              (family_costs['choice'] == 5),
              (family_costs['choice'] == 6),
              (family_costs['choice'] == 7),
              (family_costs['choice'] == 8),
              (family_costs['choice'] == 9)]

choices = [
    0,
    50,
    50 + 9 * family_costs['n_people'],
    100 + 9 * family_costs['n_people'],
    200 + 9 * family_costs['n_people'],
    200 + 18 * family_costs['n_people'],
    300 + 18 * family_costs['n_people'],
categories= ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday",'Sunday']) dat_Norm_N_1.sort_index(level=0, inplace=True) colorBar_ = 'viridis_r' vmax_ = 4500; vmin_ = 0 g = sns.heatmap(dat_Norm_N_1,cmap = colorBar_, vmin=vmin_, vmax=vmax_,linewidths=.5) g.set_title('{}—{}'.format(x,j)) fig = g.get_figure() fig.savefig('Heatmap_{}_{}'.format(x,j)) # Get Summary data for Normal days dat_Norm = dat[(dat.TYPE == 'N')].reset_index() dat_Norm_Long = pd.wide_to_long(dat_Norm, stubnames ="HR", i = ['BEGDATE','DIR'], j = "Hour") dat_Norm_Long.rename(columns = {"HR":"Volume"},inplace=True) dat_Norm_wide = dat_Norm_Long.swaplevel(1,2).unstack().reset_index() dat_Norm_wide = dat_Norm_wide.drop(columns =["day","TYPE"]) dat_Norm_wide.loc[:,'day'] = dat_Norm_wide.BEGDATE.dt.weekday_name mask = ~dat_Norm_wide.day.isin(["Saturday","Sunday"]) dat_Norm_wide_Weekday = dat_Norm_wide[mask] dat_Norm_wide_Weekday.columns = [''.join(col).strip() for col in dat_Norm_wide_Weekday.columns.values] dat_Norm_wide_Weekday_Sum = dat_Norm_wide_Weekday.groupby("Hour")[["VolumeN","VolumeS"]].mean() idx = pd.index dat_Norm_wide.day
import pandas as pd
import numpy as np

df = pd.DataFrame({
    "A1970": {0: "a", 1: "b", 2: "c"},
    "A1980": {0: "d", 1: "e", 2: "f"},
    "B1970": {0: 2.5, 1: 1.2, 2: .7},
    "B1980": {0: 3.2, 1: 1.3, 2: .1},
    "X": dict(zip(range(3), np.random.randn(3)))
})
df["id"] = df.index

print(pd.wide_to_long(df, ["A", "B"], i="id", j="year"))
import pandas as pd
from google.colab import files  # Colab download helper used below

df_alunos = pd.read_csv(
    "https://raw.githubusercontent.com/elasComputacao/raio-x-dados/main/data/dados-brutos/alunos_raiox.csv"
)

geral = df_alunos.query("periodo_ingresso >= 2000.1").query(
    "apv_media_geral != 'NaN'").query("apv_media_geral != '0.0'")

mulheres = geral.query("sexo == 'Feminino'")
mulheres = mulheres.groupby(
    ['periodo_ingresso'])['apv_media_geral'].mean().round(2)
mulheres = mulheres.to_frame('media_mulheres').reset_index()

geral = geral.groupby(['periodo_ingresso'])['apv_media_geral'].mean().round(2)
geral = geral.to_frame('media_geral').reset_index()

nota_geral = geral.join(mulheres['media_mulheres'])
nota_geral['periodo_ingresso'].astype(str)

nota_geral = pd.wide_to_long(nota_geral, stubnames='media',
                             i=['periodo_ingresso'], j='classe',
                             sep='_', suffix=r'\w+').reset_index()

nota_geral.to_csv('notas.csv')
files.download('notas.csv')
"Intersection geometry/\n traffic control":"Int traffic control- geometry", "Intersection traffic control/\nroadway":"Int traffic control- roadway"},inplace=True) data1.District = data1.District.astype("Int32").astype(str) data1.District = "District "+data1.District data1.groupby('District').count() data1.District = pd.Categorical(data1.District, ["District 1","District 2","District 3","District 4","District 5", "District 6","District 8","District 9","District 10","District 11", "District 12"],ordered=True) # data1.District.replace("District nan","NoData",inplace=True) data1.Year= pd.Categorical(data1.Year,['2002','2003','2004-2007','2008','2009','2010','2011','2012','2013','2014','2015'],ordered=True) #2.1 Reshape Data ###################################################################################################### data2 = pd.wide_to_long(data1,["FatalSSI",'Fatal','SSI',"Type1","Type2","Type3"], sep="_",suffix="\w+",i=['TempIndex'],j="B_A") data2.reset_index(inplace=True) data2.loc[:,'DollorSpentMil'] = data2.CostPerRow/10**6 data2.B_A = pd.Categorical(data2.B_A,["Before","After"],ordered=True) data1.columns data1.ImpCat.value_counts() data1["Roadway Type"].value_counts() data1.District.value_counts() data1["Urban-Rural"].value_counts() data1['Cost Distribution'].value_counts() data1["Method for Site Selection"].value_counts() data1.EmphArea.value_counts() data1_TotalCrash =data1.copy() data1_TotalCrash.loc[:,RenameColumnMap.values()] = data1_TotalCrash[RenameColumnMap.values()].multiply(data1_TotalCrash.AnalysisPeriod,axis=0)