Example #1
 def test_identical_stubnames(self):
     df = pd.DataFrame({'A2010': [1.0, 2.0],
                        'A2011': [3.0, 4.0],
                        'B2010': [5.0, 6.0],
                        'A': ['X1', 'X2']})
     with pytest.raises(ValueError):
         wide_to_long(df, ['A', 'B'], i='A', j='colname')
Example #2
 def test_identical_stubnames(self):
     df = pd.DataFrame({'A2010': [1.0, 2.0],
                        'A2011': [3.0, 4.0],
                        'B2010': [5.0, 6.0],
                        'A': ['X1', 'X2']})
     msg = "stubname can't be identical to a column name"
     with pytest.raises(ValueError, match=msg):
         wide_to_long(df, ['A', 'B'], i='A', j='colname')
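
Run outside the test harness, the same guard fires with the message the second test matches; a minimal sketch (this small frame is hypothetical):

import pandas as pd

df = pd.DataFrame({'A2010': [1.0, 2.0], 'A': ['X1', 'X2']})
try:
    pd.wide_to_long(df, stubnames=['A'], i='A', j='colname')
except ValueError as err:
    print(err)  # stubname can't be identical to a column name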
Example #3
 def test_non_unique_idvars(self):
     # GH16382
     # Raise an error if non-unique id vars (i) are passed
     df = pd.DataFrame({
         'A_A1': [1, 2, 3, 4, 5],
         'B_B1': [1, 2, 3, 4, 5],
         'x': [1, 1, 1, 1, 1]
     })
     with pytest.raises(ValueError):
         wide_to_long(df, ['A_A', 'B_B'], i='x', j='colname')
Example #4
 def test_non_unique_idvars(self):
     # GH16382
     # Raise an error if non-unique id vars (i) are passed
     df = pd.DataFrame({
         'A_A1': [1, 2, 3, 4, 5],
         'B_B1': [1, 2, 3, 4, 5],
         'x': [1, 1, 1, 1, 1]
     })
     msg = "the id variables need to uniquely identify each row"
     with pytest.raises(ValueError, match=msg):
         wide_to_long(df, ['A_A', 'B_B'], i='x', j='colname')
Example #5
 def test_separating_character(self):
     # GH14779
     np.random.seed(123)
     x = np.random.randn(3)
     df = pd.DataFrame({"A.1970": {0: "a",
                                   1: "b",
                                   2: "c"},
                        "A.1980": {0: "d",
                                   1: "e",
                                   2: "f"},
                        "B.1970": {0: 2.5,
                                   1: 1.2,
                                   2: .7},
                        "B.1980": {0: 3.2,
                                   1: 1.3,
                                   2: .1},
                        "X": dict(zip(
                            range(3), x))})
     df["id"] = df.index
     exp_data = {"X": x.tolist() + x.tolist(),
                 "A": ['a', 'b', 'c', 'd', 'e', 'f'],
                 "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
                 "year": ['1970', '1970', '1970', '1980', '1980', '1980'],
                 "id": [0, 1, 2, 0, 1, 2]}
     exp_frame = DataFrame(exp_data)
     exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]]
     long_frame = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".")
     tm.assert_frame_equal(long_frame, exp_frame)
Example #6
 def test_escapable_characters(self):
     np.random.seed(123)
     x = np.random.randn(3)
     df = pd.DataFrame({"A(quarterly)1970": {0: "a",
                                             1: "b",
                                             2: "c"},
                        "A(quarterly)1980": {0: "d",
                                             1: "e",
                                             2: "f"},
                        "B(quarterly)1970": {0: 2.5,
                                             1: 1.2,
                                             2: .7},
                        "B(quarterly)1980": {0: 3.2,
                                             1: 1.3,
                                             2: .1},
                        "X": dict(zip(
                            range(3), x))})
     df["id"] = df.index
     exp_data = {"X": x.tolist() + x.tolist(),
                 "A(quarterly)": ['a', 'b', 'c', 'd', 'e', 'f'],
                 "B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
                 "year": [1970, 1970, 1970, 1980, 1980, 1980],
                 "id": [0, 1, 2, 0, 1, 2]}
     expected = DataFrame(exp_data)
     expected = expected.set_index(
         ['id', 'year'])[["X", "A(quarterly)", "B(quarterly)"]]
     result = wide_to_long(df, ["A(quarterly)", "B(quarterly)"],
                           i="id", j="year")
     tm.assert_frame_equal(result, expected)
Example #7
 def test_character_overlap(self):
     # Test we handle overlapping characters in both id_vars and value_vars
     df = pd.DataFrame({
         'A11': ['a11', 'a22', 'a33'],
         'A12': ['a21', 'a22', 'a23'],
         'B11': ['b11', 'b12', 'b13'],
         'B12': ['b21', 'b22', 'b23'],
         'BB11': [1, 2, 3],
         'BB12': [4, 5, 6],
         'BBBX': [91, 92, 93],
         'BBBZ': [91, 92, 93]
     })
     df['id'] = df.index
     expected = pd.DataFrame({
         'BBBX': [91, 92, 93, 91, 92, 93],
         'BBBZ': [91, 92, 93, 91, 92, 93],
         'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'],
         'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'],
         'BB': [1, 2, 3, 4, 5, 6],
         'id': [0, 1, 2, 0, 1, 2],
         'year': [11, 11, 11, 12, 12, 12]})
     expected = expected.set_index(['id', 'year'])[
         ['BBBX', 'BBBZ', 'A', 'B', 'BB']]
     result = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
     tm.assert_frame_equal(result.sort_index(axis=1),
                           expected.sort_index(axis=1))
Example #8
 def test_num_string_disambiguation(self):
     # Test that we can disambiguate number value_vars from
     # string value_vars
     df = pd.DataFrame({
         'A11': ['a11', 'a22', 'a33'],
         'A12': ['a21', 'a22', 'a23'],
         'B11': ['b11', 'b12', 'b13'],
         'B12': ['b21', 'b22', 'b23'],
         'BB11': [1, 2, 3],
         'BB12': [4, 5, 6],
         'Arating': [91, 92, 93],
         'Arating_old': [91, 92, 93]
     })
     df['id'] = df.index
     expected = pd.DataFrame({
         'Arating': [91, 92, 93, 91, 92, 93],
         'Arating_old': [91, 92, 93, 91, 92, 93],
         'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'],
         'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'],
         'BB': [1, 2, 3, 4, 5, 6],
         'id': [0, 1, 2, 0, 1, 2],
         'year': [11, 11, 11, 12, 12, 12]})
     expected = expected.set_index(['id', 'year'])[
         ['Arating', 'Arating_old', 'A', 'B', 'BB']]
     result = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
     tm.assert_frame_equal(result.sort_index(axis=1),
                           expected.sort_index(axis=1))
Example #9
    def test_cast_j_int(self):
        df = pd.DataFrame({
            'actor_1': ['CCH Pounder', 'Johnny Depp', 'Christoph Waltz'],
            'actor_2': ['Joel David Moore', 'Orlando Bloom', 'Rory Kinnear'],
            'actor_fb_likes_1': [1000.0, 40000.0, 11000.0],
            'actor_fb_likes_2': [936.0, 5000.0, 393.0],
            'title': ['Avatar', "Pirates of the Caribbean", 'Spectre']})

        expected = pd.DataFrame({
            'actor': ['CCH Pounder',
                      'Johnny Depp',
                      'Christoph Waltz',
                      'Joel David Moore',
                      'Orlando Bloom',
                      'Rory Kinnear'],
            'actor_fb_likes': [1000.0, 40000.0, 11000.0, 936.0, 5000.0, 393.0],
            'num': [1, 1, 1, 2, 2, 2],
            'title': ['Avatar',
                      'Pirates of the Caribbean',
                      'Spectre',
                      'Avatar',
                      'Pirates of the Caribbean',
                      'Spectre']}).set_index(['title', 'num'])
        result = wide_to_long(df, ['actor', 'actor_fb_likes'],
                              i='title', j='num', sep='_')

        tm.assert_frame_equal(result, expected)
Example #10
    def test_stubs(self):
        # GH9204
        df = pd.DataFrame([[0, 1, 2, 3, 8], [4, 5, 6, 7, 9]])
        df.columns = ["id", "inc1", "inc2", "edu1", "edu2"]
        stubs = ["inc", "edu"]
        df_long = pd.wide_to_long(df, stubs, i="id", j="age")

        self.assertEqual(stubs, ["inc", "edu"])
Example #11
    def test_stubs(self):
        # GH9204
        df = pd.DataFrame([[0, 1, 2, 3, 8], [4, 5, 6, 7, 9]])
        df.columns = ['id', 'inc1', 'inc2', 'edu1', 'edu2']
        stubs = ['inc', 'edu']
        df_long = pd.wide_to_long(df, stubs, i='id', j='age')

        self.assertEqual(stubs, ['inc', 'edu'])
Example #12
def wide_to_long(df, stubs):
    # Only operate on necessary columns to keep memory usage down.
    cols_to_keep = [col for col in df.columns
                    for stub in stubs if col.startswith(stub)]
    cols_to_keep.extend(['cin', 'source_date', 'eligibility_date'])
    dw = df[cols_to_keep].copy()
    dw['id'] = dw.index
    dw = pd.wide_to_long(dw, stubs, 'cin', 'j')
    dw['cardinal'] = dw.index.get_level_values('j').str[-1]
    dw = dw.reset_index()
    return dw
Example #13
    def test_stubs(self):
        # GH9204
        df = pd.DataFrame([[0, 1, 2, 3, 8], [4, 5, 6, 7, 9]])
        df.columns = ['id', 'inc1', 'inc2', 'edu1', 'edu2']
        stubs = ['inc', 'edu']

        # TODO: unused?
        df_long = pd.wide_to_long(df, stubs, i='id', j='age')  # noqa

        assert stubs == ['inc', 'edu']
Example #14
def wide_to_long_by_aidcode(df):
    aidcode_stubs = ['aidcode', 'respcounty', 'eligibilitystatus', 'full',
                     'fosterx', 'disabledx', 'ffp']
    cols_to_keep = [col for col in df.columns
                    for stub in aidcode_stubs if col.startswith(stub)]
    cols_to_keep.extend(['cin', 'calendar'])
    dw = df[cols_to_keep].copy()
    dw['id'] = dw.index
    dw = pd.wide_to_long(dw, aidcode_stubs, 'cin', 'j')
    dw = dw.reset_index()
    return dw
Example #15
def txt_to_df(path):
    df = pd.read_csv(path, header=None)
    df.columns = ['name'] + [str(j) + str(i)
                             for i in ['1', '2', '3', '4', '5']
                             for j in ['q', 'a1', 'a2', 'a3']]
    file_names = df['name']
    df = pd.wide_to_long(df, stubnames=['q', 'a1', 'a2', 'a3'], i='name', j='qa')
    df['index'] = list(map(lambda x: x[0], df.index))
    df['qa'] = list(map(lambda x: x[1], df.index))
    df['index'] = df['index'].astype('category')
    df['index'] = df['index'].cat.set_categories(file_names)  # keep original file order
    df.sort_values(['index', 'qa'], ascending=True, inplace=True)
    return df, file_names
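
A usage sketch for this helper, assuming the input is a headerless CSV whose rows are a name followed by five (q, a1, a2, a3) groups; the two-row buffer below is hypothetical stand-in data:

import io
import pandas as pd

buf = io.StringIO("f1," + ",".join(str(n) for n in range(20)) + "\n"
                  "f2," + ",".join(str(n) for n in range(20, 40)) + "\n")
long_df, names = txt_to_df(buf)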
Example #16
 def test_col_substring_of_stubname(self):
     # GH22468
     # Don't raise ValueError when a column name is a substring
     # of a stubname that's been passed as a string
     wide_data = {'node_id': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4},
                  'A': {0: 0.80, 1: 0.0, 2: 0.25, 3: 1.0, 4: 0.81},
                  'PA0': {0: 0.74, 1: 0.56, 2: 0.56, 3: 0.98, 4: 0.6},
                  'PA1': {0: 0.77, 1: 0.64, 2: 0.52, 3: 0.98, 4: 0.67},
                  'PA3': {0: 0.34, 1: 0.70, 2: 0.52, 3: 0.98, 4: 0.67}
                  }
     wide_df = pd.DataFrame.from_dict(wide_data)
     expected = pd.wide_to_long(wide_df,
                                stubnames=['PA'],
                                i=['node_id', 'A'],
                                j='time')
     result = pd.wide_to_long(wide_df,
                              stubnames='PA',
                              i=['node_id', 'A'],
                              j='time')
     tm.assert_frame_equal(result, expected)
Example #17
def wide_to_long(df, thresh):
    """Reshape dataframe from wide to long format."""

    fields = ["loss"]
    stubs = ["%s_%d_" % (prefix, thresh) for prefix in fields]
    new_names = dict(zip(stubs, fields))

    df = pd.wide_to_long(df, stubs, i="id", j="year")
    df = df.reset_index()
    df = df.rename(columns=new_names)

    return df
Example #18
def wide_to_long(df, thresh):
    """Reshape dataframe from wide to long format."""

    fields = ['loss']
    stubs = ['%s_%d_' % (prefix, thresh) for prefix in fields]
    new_names = dict(zip(stubs, fields))

    df = pd.wide_to_long(df, stubs, i='id', j='year')
    df = df.reset_index()
    df = df.rename(columns=new_names)

    return df
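
A minimal usage sketch for this helper, with a hypothetical frame whose columns follow the loss_<thresh>_<year> pattern implied by the stub construction:

import pandas as pd

df = pd.DataFrame({
    'id': [1, 2],
    'loss_10_2000': [0.1, 0.2],
    'loss_10_2001': [0.3, 0.4],
})
print(wide_to_long(df, thresh=10))  # columns: id, year, loss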
Example #19
def l2w_pre(df, varname):
    """
    Reshape the main dataframe from wide to long. Requires pre-processed full years.

    df: DataFrame to be transformed from wide to long
    varname: column name of the transformed variable
    """
    long_df = pd.wide_to_long(df, [varname], i="DunsNumber", j="year")
    long_df.sort_index(inplace=True)
    long_df.dropna(inplace=True)

    return long_df
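
A usage sketch, assuming a wide frame keyed by DunsNumber with one <varname><year> column per year (hypothetical data):

import pandas as pd

df = pd.DataFrame({
    'DunsNumber': [100, 200],
    'Sales1990': [5.0, None],
    'Sales1991': [6.0, 7.0],
})
print(l2w_pre(df, 'Sales'))  # the NaN row for DunsNumber 200 in 1990 is dropped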
Example #20
 def test_nonnumeric_suffix(self):
     df = pd.DataFrame({'treatment_placebo': [1.0, 2.0],
                        'treatment_test': [3.0, 4.0],
                        'result_placebo': [5.0, 6.0],
                        'A': ['X1', 'X2']})
     expected = pd.DataFrame({
         'A': ['X1', 'X1', 'X2', 'X2'],
         'colname': ['placebo', 'test', 'placebo', 'test'],
         'result': [5.0, np.nan, 6.0, np.nan],
         'treatment': [1.0, 3.0, 2.0, 4.0]})
     expected = expected.set_index(['A', 'colname'])
     result = wide_to_long(df, ['result', 'treatment'],
                           i='A', j='colname', suffix='[a-z]+', sep='_')
     tm.assert_frame_equal(result, expected)
Example #21
 def test_mixed_type_suffix(self):
     df = pd.DataFrame({
         'treatment_1': [1.0, 2.0],
         'treatment_foo': [3.0, 4.0],
         'result_foo': [5.0, 6.0],
         'result_1': [0, 9],
         'A': ['X1', 'X2']})
     expected = pd.DataFrame({
         'A': ['X1', 'X2', 'X1', 'X2'],
         'colname': ['1', '1', 'foo', 'foo'],
         'result': [0.0, 9.0, 5.0, 6.0],
         'treatment': [1.0, 2.0, 3.0, 4.0]}).set_index(['A', 'colname'])
     result = wide_to_long(df, ['result', 'treatment'],
                           i='A', j='colname', suffix='.+', sep='_')
     tm.assert_frame_equal(result, expected)
Example #22
 def test_float_suffix(self):
     df = pd.DataFrame({
         'treatment_1.1': [1.0, 2.0],
         'treatment_2.1': [3.0, 4.0],
         'result_1.2': [5.0, 6.0],
         'result_1': [0, 9],
         'A': ['X1', 'X2']})
     expected = pd.DataFrame({
         'A': ['X1', 'X1', 'X1', 'X1', 'X2', 'X2', 'X2', 'X2'],
         'colname': [1, 1.1, 1.2, 2.1, 1, 1.1, 1.2, 2.1],
         'result': [0.0, np.nan, 5.0, np.nan, 9.0, np.nan, 6.0, np.nan],
         'treatment': [np.nan, 1.0, np.nan, 3.0, np.nan, 2.0, np.nan, 4.0]})
     expected = expected.set_index(['A', 'colname'])
     result = wide_to_long(df, ['result', 'treatment'],
                           i='A', j='colname', suffix='[0-9.]+', sep='_')
     tm.assert_frame_equal(result, expected)
Example #23
 def test_unbalanced(self):
     # test that we can have a varying number of time variables
     df = pd.DataFrame({'A2010': [1.0, 2.0],
                        'A2011': [3.0, 4.0],
                        'B2010': [5.0, 6.0],
                        'X': ['X1', 'X2']})
     df['id'] = df.index
     exp_data = {'X': ['X1', 'X1', 'X2', 'X2'],
                 'A': [1.0, 3.0, 2.0, 4.0],
                 'B': [5.0, np.nan, 6.0, np.nan],
                 'id': [0, 0, 1, 1],
                 'year': [2010, 2011, 2010, 2011]}
     expected = pd.DataFrame(exp_data)
     expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
     result = wide_to_long(df, ['A', 'B'], i='id', j='year')
     tm.assert_frame_equal(result, expected)
Example #24
 def test_multiple_id_columns(self):
     # Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm
     df = pd.DataFrame({
         'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
         'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
         'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
         'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
     })
     expected = pd.DataFrame({
         'ht': [2.8, 3.4, 2.9, 3.8, 2.2, 2.9, 2.0, 3.2, 1.8,
                2.8, 1.9, 2.4, 2.2, 3.3, 2.3, 3.4, 2.1, 2.9],
         'famid': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3],
         'birth': [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3],
         'age': [1, 2, 1, 2, 1, 2, 1, 2, 1,
                 2, 1, 2, 1, 2, 1, 2, 1, 2]
     })
     expected = expected.set_index(['famid', 'birth', 'age'])[['ht']]
     result = wide_to_long(df, 'ht', i=['famid', 'birth'], j='age')
     tm.assert_frame_equal(result, expected)
Example #25
    def load_old_data(self, file_path):
        """
        Load the datafile of the old format.
        """

        names = ("C/A,UNIT,SCP,"
                 "DATE1,TIME1,DESC1,ENTRIES1,EXITS1,"
                 "DATE2,TIME2,DESC2,ENTRIES2,EXITS2,"
                 "DATE3,TIME3,DESC3,ENTRIES3,EXITS3,"
                 "DATE4,TIME4,DESC4,ENTRIES4,EXITS4,"
                 "DATE5,TIME5,DESC5,ENTRIES5,EXITS5,"
                 "DATE6,TIME6,DESC6,ENTRIES6,EXITS6,"
                 "DATE7,TIME7,DESC7,ENTRIES7,EXITS7,"
                 "DATE8,TIME8,DESC8,ENTRIES8,EXITS8").split(',')
        df_wide = pd.read_csv(file_path, header=None, names=names,
                              dtype={'ENTRIES': np.float64, 'EXITS': np.float64})

        df_wide['ind'] = df_wide.index
        df_long = pd.wide_to_long(df_wide, ["DATE", "TIME", "DESC", "ENTRIES", "EXITS"], i='ind', j='loc')
        df_long = df_long.sort_index().reset_index(drop=True)

        # Merge with the station table
        df_long = df_long.merge(self.station_table, on=["C/A", "UNIT"], how="left")

        # Change the format of the DATE
        # Drop a few data-entry errors that wrongly put TIME into DATE
        df_long = df_long.loc[~df_long.DATE.str.contains(":").fillna(False), :]
        df_long["DATE"] = pd.DatetimeIndex(df_long.DATE).format()

        # Create datetime
        df_long["DATETIME"] = pd.to_datetime(df_long["DATE"] + " " + df_long["TIME"])

        return df_long
Example #26
 def test_invalid_separator(self):
     # if an invalid separator is supplied, an empty data frame is returned
     sep = 'nope!'
     df = pd.DataFrame({'A2010': [1.0, 2.0],
                        'A2011': [3.0, 4.0],
                        'B2010': [5.0, 6.0],
                        'X': ['X1', 'X2']})
     df['id'] = df.index
     exp_data = {'X': '',
                 'A2010': [],
                 'A2011': [],
                 'B2010': [],
                 'id': [],
                 'year': [],
                 'A': [],
                 'B': []}
     expected = pd.DataFrame(exp_data).astype({'year': 'int'})
     expected = expected.set_index(['id', 'year'])[[
         'X', 'A2010', 'A2011', 'B2010', 'A', 'B']]
     expected.index.set_levels([0, 1], level=0, inplace=True)
     result = wide_to_long(df, ['A', 'B'], i='id', j='year', sep=sep)
     tm.assert_frame_equal(result.sort_index(axis=1),
                           expected.sort_index(axis=1))
Example #27
    def test_subclassed_wide_to_long(self):
        # GH 9762

        np.random.seed(123)
        x = np.random.randn(3)
        df = tm.SubclassedDataFrame({
            "A1970": {0: "a", 1: "b", 2: "c"},
            "A1980": {0: "d", 1: "e", 2: "f"},
            "B1970": {0: 2.5, 1: 1.2, 2: .7},
            "B1980": {0: 3.2, 1: 1.3, 2: .1},
            "X": dict(zip(range(3), x))})

        df["id"] = df.index
        exp_data = {"X": x.tolist() + x.tolist(),
                    "A": ['a', 'b', 'c', 'd', 'e', 'f'],
                    "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
                    "year": [1970, 1970, 1970, 1980, 1980, 1980],
                    "id": [0, 1, 2, 0, 1, 2]}
        expected = tm.SubclassedDataFrame(exp_data)
        expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
        long_frame = pd.wide_to_long(df, ["A", "B"], i="id", j="year")

        tm.assert_frame_equal(long_frame, expected)
Example #28
    def test_invalid_suffixtype(self):
        # If all stub names end in a string, but a numeric suffix is
        # assumed, an empty data frame is returned
        df = pd.DataFrame({'Aone': [1.0, 2.0],
                           'Atwo': [3.0, 4.0],
                           'Bone': [5.0, 6.0],
                           'X': ['X1', 'X2']})
        df['id'] = df.index
        exp_data = {'X': '',
                    'Aone': [],
                    'Atwo': [],
                    'Bone': [],
                    'id': [],
                    'year': [],
                    'A': [],
                    'B': []}
        expected = pd.DataFrame(exp_data).astype({'year': 'int'})

        expected = expected.set_index(['id', 'year'])
        expected.index.set_levels([0, 1], level=0, inplace=True)
        result = wide_to_long(df, ['A', 'B'], i='id', j='year')
        tm.assert_frame_equal(result.sort_index(axis=1),
                              expected.sort_index(axis=1))
Example #29
df_seg = pd.read_excel(path_to_seg)
df_seg.session = pd.to_datetime(df_seg.session)

# -- read CAR-T dates
df_dates = pd.read_excel(config.coh_dates_upn_map, header=0)

# clean column headers
df_dates.columns = [str(col).strip() for col in df_dates.columns]
# convert dates to datetime
df_dates.MRI1 = pd.to_datetime(df_dates.MRI1)
df_dates.MRI2 = pd.to_datetime(df_dates.MRI2)
df_dates.MRI3 = pd.to_datetime(df_dates.MRI3)
df_dates.MRI4 = pd.to_datetime(df_dates.MRI4)

df_dates_wide = pd.wide_to_long(df_dates,
                                stubnames=['MRI', 'RANO'],
                                i=['UPN'],
                                j='cart_administration').reset_index()
df_dates_wide = df_dates_wide[[
    'UPN', 'DCE', 'ARM', 'MRI', 'RANO', 'cart_administration'
]]
df_dates_wide.columns = [
    'subject_id', 'has_dce', 'trial_arm', 'session', 'rano',
    'cart_administration'
]
df_dates_wide.to_excel('/Volumes/BRAIN/CART/IRB13384_wide.xls')

# merge both dataframes

# merge and keep keys from both dfs
df_merged = pd.merge(df_seg,
                     df_dates_wide,
Example #30
 def time_wide_to_long_big(self):
     wide_to_long(self.df, self.letters, i='id', j='year')
Example #31
# Now reshape the variables that include months
df['ind'] = df.index

# Need to add a prefix to variables with similar names.
rename_dict = {'PD_per_hh_preyr': 'XX_PD_per_hh_preyr', 'delay_amt': 'XX_delay_amt',
               'hh_P_emp_pre': 'XX_hh_P_emp_pre', 'lab_pre': 'XX_lab_pre',
               'unemp_amt': 'XX_unemp_amt', 'unpaid_delay_amt': 'XX_unpaid_delay_amt'}

# Now reshape
for k, v in rename_dict.items():
    df.rename(columns=lambda x: x.replace(k, v) if x.startswith(k) else x, inplace=True)

for m in range(1, 13):
    reshape_cols = [col.replace(str(m).zfill(2), "")
                    for col in df.columns.values if str(m).zfill(2) in col]

dfT = pd.wide_to_long(df, reshape_cols, i='ind', j='month')
dfT.columns = [x.replace("XX_", "") for x in dfT.columns]

dfT.reset_index(level=1, inplace=True) # get rid of month as the index for now.

# Create a calendar year variable.
dfT['cal_year'] = np.where(dfT['month'] >= 4, dfT.year.str[0:4], dfT.year.str[-4:])

# Create a pandas date object
dfT['cal_date'] = pd.to_datetime(dfT.month.astype(str) + dfT.cal_year.astype(str), format="%m%Y")

"""
Analysis/graph making begins below
"""

### Delay days (per household employed)
Example #32
def read_ncdb(filepath):
    """
    Read data from Geolytics's Neighborhood Change Database (NCDB) and store it for later use.

    Parameters
    ----------
    filepath : str
        location of the input CSV file extracted from your Geolytics DVD

    Returns
    -------
    DataFrame

    """

    ncdb_vars = variables["ncdb"].dropna()[1:].values

    df = pd.read_csv(
        filepath,
        low_memory=False,
        na_values=["", " ", 99999, -999],
        converters={
            "GEO2010": str,
            "COUNTY": str,
            "COUSUB": str,
            "DIVISION": str,
            "REGION": str,
            "STATE": str,
        },
    )

    cols = df.columns
    fixed = []
    for col in cols:
        if col.endswith("D"):
            fixed.append("D" + col[:-1])
        elif col.endswith("N"):
            fixed.append("N" + col[:-1])
        elif col.endswith("1A"):
            fixed.append(col[:-2] + "2")

    orig = []
    for col in cols:
        if col.endswith("D"):
            orig.append(col)
        elif col.endswith("N"):
            orig.append(col)
        elif col.endswith("1A"):
            orig.append(col)

    df.rename(dict(zip(orig, fixed)), axis="columns", inplace=True)

    df = pd.wide_to_long(df,
                         stubnames=ncdb_vars,
                         i="GEO2010",
                         j="year",
                         suffix="(7|8|9|0|1|2)").reset_index()

    df["year"] = df["year"].replace({
        7: 1970,
        8: 1980,
        9: 1990,
        0: 2000,
        1: 2010,
        2: 2010
    })
    df = df.groupby(["GEO2010", "year"]).first()

    mapper = dict(zip(variables.ncdb, variables.ltdb))

    df.reset_index(inplace=True)

    df = df.rename(mapper, axis="columns")

    df = df.set_index("geoid")

    store = pd.HDFStore(os.path.join(package_directory, "data.h5"), "w")
    store["ncdb"] = df

    store.close()

    return df
Example #33
def store_ncdb(filepath):
    """
    Read & store data from Geolytics's Neighborhood Change Database.

    Parameters
    ----------
    filepath : str
        location of the input CSV file extracted from your Geolytics DVD

    """
    ncdb_vars = datasets.codebook()["ncdb"].dropna()[1:].values

    names = []
    for name in ncdb_vars:
        for suffix in ["7", "8", "9", "0", "1", "2"]:
            names.append(name + suffix)
    names.append("GEO2010")

    c = pd.read_csv(filepath, nrows=1).columns
    c = pd.Series(c.values)

    keep = []
    for _, col in c.items():
        for name in names:
            if col.startswith(name):
                keep.append(col)

    df = pd.read_csv(
        filepath,
        usecols=keep,
        engine="c",
        na_values=["", " ", 99999, -999],
        converters={
            "GEO2010": str,
            "COUNTY": str,
            "COUSUB": str,
            "DIVISION": str,
            "REGION": str,
            "STATE": str,
        },
    )

    cols = df.columns
    fixed = []
    for col in cols:
        if col.endswith("D"):
            fixed.append("D" + col[:-1])
        elif col.endswith("N"):
            fixed.append("N" + col[:-1])
        elif col.endswith("1A"):
            fixed.append(col[:-2] + "2")

    orig = []
    for col in cols:
        if col.endswith("D"):
            orig.append(col)
        elif col.endswith("N"):
            orig.append(col)
        elif col.endswith("1A"):
            orig.append(col)

    renamer = dict(zip(orig, fixed))
    df.rename(renamer, axis="columns", inplace=True)

    df = df[df.columns[df.columns.isin(names)]]

    df = pd.wide_to_long(df,
                         stubnames=ncdb_vars,
                         i="GEO2010",
                         j="year",
                         suffix="(7|8|9|0|1|2)").reset_index()

    df["year"] = df["year"].replace({
        7: 1970,
        8: 1980,
        9: 1990,
        0: 2000,
        1: 2010,
        2: 2010
    })
    df = df.groupby(["GEO2010", "year"]).first()

    mapper = dict(zip(datasets.codebook().ncdb, datasets.codebook().variable))

    df.reset_index(inplace=True)

    df = df.rename(mapper, axis="columns")

    df = df.set_index("geoid")

    for row in datasets.codebook()["formula"].dropna().tolist():
        try:
            df.eval(row, inplace=True)
        except Exception:
            warn("Unable to compute " + str(row))

    keeps = df.columns[df.columns.isin(
        datasets.codebook()["variable"].tolist() + ["year"])]

    df = df[keeps]

    df = df.loc[df.n_total_pop != 0]

    df.to_parquet(os.path.join(data_dir, "ncdb.parquet"), compression="brotli")
    storage.set("ncdb", os.path.join(data_dir, "ncdb.parquet"))
    storage.build("geosnap_data/storage")
Example #34
def generate_tstats_classes(df, dest_dir, params):
    """Computes t-test for each class.

    This function computes a t-test for each class in the dataset.
    The t-test is computed by comparing class-level metrics for a set of
    sparse model checkpoints to non-sparse model checkpoints.

    Args:
      df: input dataframe with class level metrics.
      dest_dir: path to the output directory.
      params: dataset specific params.
    """

    human_label_lookup = class_level_metrics.HumanLabelLookup()
    label_dict = human_label_lookup.create_library()
    class_names = list(label_dict.values())

    df = df.drop(columns='Unnamed: 0')  # drop the stray index column from the CSV
    df.reset_index(inplace=True, drop=True)
    df['id'] = df.index

    df_ = pd.wide_to_long(df,
                          stubnames=['precision', 'recall'],
                          i='id',
                          j='class',
                          sep='/',
                          suffix=r'\w+').reset_index()

    data = pd.DataFrame([])

    num_classes = params['num_classes']
    mean_accuracy_dict = params['accuracy']

    long_df_all = df_
    for i in range(num_classes):

        # adding label id ensures unique naming of classes
        c = class_names[i] + '_' + str(i)
        for p in [0.1, 0.3, 0.5, 0.7, 0.9]:

            variant_mean_recall = long_df_all[(
                (long_df_all['fraction_pruned'] == p) &
                (long_df_all['class'] == c))]['recall'].mean()

            baseline_mean_recall = long_df_all[(
                (long_df_all['fraction_pruned'] == 0.0) &
                (long_df_all['class'] == c))]['recall'].mean()

            # normalize recall by model accuracy
            baseline_set = long_df_all[(
                (long_df_all['fraction_pruned'] == 0.0) &
                (long_df_all['class']
                 == c))]['recall'] - mean_accuracy_dict[0.0]
            variant_set = long_df_all[(
                (long_df_all['fraction_pruned'] == p) &
                (long_df_all['class'] == c))]['recall'] - mean_accuracy_dict[p]

            t_stat = ttest_ind(baseline_set, variant_set, equal_var=False)

            # accumulate one row per (class, pruning fraction)
            data = pd.concat(
                [data,
                 pd.DataFrame(
                     {
                         'class': c,
                         'pruning_fraction': p,
                         'baseline_mean_recall': baseline_mean_recall,
                         'variant_mean_recall': variant_mean_recall,
                         'pvalue_recall_norm': t_stat[1],
                         'statistic_recall_norm': t_stat[0],
                     },
                     index=[0])],
                ignore_index=True)

    time_ = str(time.time())
    output_file = 'recall_t_statistic'
    file_name = '_' + time_ + '_' + output_file + '.csv'
    file_path = os.path.join(dest_dir, file_name)
    with tf.gfile.Open(file_path, 'w') as f:
        data.to_csv(f)
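
For reference, a hedged sketch of the params argument this function reads: it looks up params['num_classes'] and indexes params['accuracy'] by pruning fraction (0.0 plus each value of p). The numbers below are placeholders, not real results:

params = {
    'num_classes': 1000,
    'accuracy': {0.0: 0.76, 0.1: 0.76, 0.3: 0.75, 0.5: 0.74, 0.7: 0.71, 0.9: 0.63},
}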
Example #35
import gurobipy as grb
import itertools
import pickle

import pandas as pd

# Read CSV --------------------------------------------------------
family_data = pd.read_csv('attempt_01/inputs/family_data.csv')

# Define indices and data -----------------------------------------
family = list(range(0, 5000))
days = list(range(1, 101))

family_days = pd.DataFrame(list(itertools.product(family, days)),
                           columns=['family_id', 'day'])
family_people = family_data[['family_id', 'n_people']]
family_choices = pd.wide_to_long(family_data, stubnames='choice_', i='family_id', j='day') \
    .reset_index() \
    .rename(columns={'day': 'choice', 'choice_': 'day'}) \
    .drop('n_people', axis=1)

family_costs = family_days \
    .merge(family_choices, how='left', on=['family_id', 'day']) \
    .merge(family_people, how='left', on='family_id')

conditions = [(family_costs['choice'] == 0), (family_costs['choice'] == 1),
              (family_costs['choice'] == 2), (family_costs['choice'] == 3),
              (family_costs['choice'] == 4), (family_costs['choice'] == 5),
              (family_costs['choice'] == 6), (family_costs['choice'] == 7),
              (family_costs['choice'] == 8), (family_costs['choice'] == 9)]
choices = [
    0, 50, 50 + 9 * family_costs['n_people'],
    100 + 9 * family_costs['n_people'], 200 + 9 * family_costs['n_people'],
    200 + 18 * family_costs['n_people'], 300 + 18 * family_costs['n_people'],
Example #36
          categories=["Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
                      "Saturday", "Sunday"])
        dat_Norm_N_1.sort_index(level=0, inplace=True)
        colorBar_ = 'viridis_r'
        vmax_ = 4500; vmin_ = 0
        g = sns.heatmap(dat_Norm_N_1, cmap=colorBar_, vmin=vmin_, vmax=vmax_, linewidths=.5)
        g.set_title('{}—{}'.format(x, j))
        fig = g.get_figure()
        fig.savefig('Heatmap_{}_{}'.format(x, j))

# Get Summary data for Normal days
dat_Norm = dat[(dat.TYPE == 'N')].reset_index()
dat_Norm_Long = pd.wide_to_long(dat_Norm, stubnames="HR", i=['BEGDATE', 'DIR'], j="Hour")
dat_Norm_Long.rename(columns={"HR": "Volume"}, inplace=True)
dat_Norm_wide = dat_Norm_Long.swaplevel(1, 2).unstack().reset_index()

dat_Norm_wide = dat_Norm_wide.drop(columns=["day", "TYPE"])
dat_Norm_wide.loc[:, 'day'] = dat_Norm_wide.BEGDATE.dt.day_name()
mask = ~dat_Norm_wide.day.isin(["Saturday", "Sunday"])
dat_Norm_wide_Weekday = dat_Norm_wide[mask]
dat_Norm_wide_Weekday.columns = [''.join(col).strip() for col in dat_Norm_wide_Weekday.columns.values]
dat_Norm_wide_Weekday_Sum = dat_Norm_wide_Weekday.groupby("Hour")[["VolumeN", "VolumeS"]].mean()
Example #37
import pandas as pd
import numpy as np

df = pd.DataFrame({
    "A1970": {
        0: "a",
        1: "b",
        2: "c"
    },
    "A1980": {
        0: "d",
        1: "e",
        2: "f"
    },
    "B1970": {
        0: 2.5,
        1: 1.2,
        2: .7
    },
    "B1980": {
        0: 3.2,
        1: 1.3,
        2: .1
    },
    "X": dict(zip(range(3), np.random.randn(3)))
})
df["id"] = df.index
print(pd.wide_to_long(df, ["A", "B"], i="id", j="year"))
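
The reshaped frame is indexed by ('id', 'year'), with the non-stub column X carried along; a quick structural check, continuing from the snippet above:

long_df = pd.wide_to_long(df, ["A", "B"], i="id", j="year")
print(long_df.index.names)    # ['id', 'year']
print(list(long_df.columns))  # ['X', 'A', 'B']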
Example #38
import pandas as pd
from google.colab import files  # provides files.download used below

df_alunos = pd.read_csv(
    "https://raw.githubusercontent.com/elasComputacao/raio-x-dados/main/data/dados-brutos/alunos_raiox.csv"
)

geral = df_alunos.query("periodo_ingresso >= 2000.1").query(
    "apv_media_geral != 'NaN'").query("apv_media_geral != '0.0'")
mulheres = geral.query("sexo == 'Feminino'")
mulheres = mulheres.groupby(['periodo_ingresso'])['apv_media_geral'].mean().round(2)

mulheres = mulheres.to_frame('media_mulheres').reset_index()

geral = geral.groupby(['periodo_ingresso'])['apv_media_geral'].mean().round(2)
geral = geral.to_frame('media_geral').reset_index()

nota_geral = geral.join(mulheres['media_mulheres'])
nota_geral['periodo_ingresso'] = nota_geral['periodo_ingresso'].astype(str)
nota_geral = pd.wide_to_long(nota_geral,
                             stubnames='media',
                             i=['periodo_ingresso'],
                             j='classe',
                             sep='_',
                             suffix=r'\w+').reset_index()

nota_geral.to_csv('notas.csv')
files.download('notas.csv')
Example #39
0
                      "Intersection geometry/\n traffic control":"Int traffic control- geometry",
                      "Intersection geometry/\n traffic control":"Int traffic control- geometry",
                      "Intersection traffic control/\nroadway":"Int traffic control- roadway"},inplace=True)
data1.District = data1.District.astype("Int32").astype(str)
data1.District = "District "+data1.District
data1.groupby('District').count()
data1.District = pd.Categorical(data1.District,
                                             ["District 1","District 2","District 3","District 4","District 5",
                                              "District 6","District 8","District 9","District 10","District 11",
                                              "District 12"],ordered=True)

# data1.District.replace("District nan","NoData",inplace=True)
data1.Year = pd.Categorical(data1.Year,
                            ['2002', '2003', '2004-2007', '2008', '2009', '2010',
                             '2011', '2012', '2013', '2014', '2015'], ordered=True)

#2.1 Reshape Data
######################################################################################################
data2 = pd.wide_to_long(data1, ["FatalSSI", "Fatal", "SSI", "Type1", "Type2", "Type3"],
                        sep="_", suffix=r"\w+", i=['TempIndex'], j="B_A")
data2.reset_index(inplace=True)
data2.loc[:, 'DollorSpentMil'] = data2.CostPerRow / 10**6
data2.B_A = pd.Categorical(data2.B_A, ["Before", "After"], ordered=True)

data1.columns
data1.ImpCat.value_counts()
data1["Roadway Type"].value_counts()
data1.District.value_counts()
data1["Urban-Rural"].value_counts()
data1['Cost Distribution'].value_counts()
data1["Method for Site Selection"].value_counts()
data1.EmphArea.value_counts()

data1_TotalCrash =data1.copy()
data1_TotalCrash.loc[:,RenameColumnMap.values()] = data1_TotalCrash[RenameColumnMap.values()].multiply(data1_TotalCrash.AnalysisPeriod,axis=0)