Example #1
    def test_column_dups2(self):

        # drop buggy GH 6240
        df = DataFrame({'A': np.random.randn(5),
                        'B': np.random.randn(5),
                        'C': np.random.randn(5),
                        'D': ['a', 'b', 'c', 'd', 'e']})

        expected = df.take([0, 1, 1], axis=1)
        df2 = df.take([2, 0, 1, 2, 1], axis=1)
        result = df2.drop('C', axis=1)
        assert_frame_equal(result, expected)

        # dropna
        df = DataFrame({'A': np.random.randn(5),
                        'B': np.random.randn(5),
                        'C': np.random.randn(5),
                        'D': ['a', 'b', 'c', 'd', 'e']})
        df.iloc[2, [0, 1, 2]] = np.nan
        df.iloc[0, 0] = np.nan
        df.iloc[1, 1] = np.nan
        df.iloc[:, 3] = np.nan
        expected = df.dropna(subset=['A', 'B', 'C'], how='all')
        expected.columns = ['A', 'A', 'B', 'C']

        df.columns = ['A', 'A', 'B', 'C']

        result = df.dropna(subset=['A', 'C'], how='all')
        assert_frame_equal(result, expected)
def training(iden, Charg, Temps, use_cache_trainingset, test, verbose):
Example #2
    ''' Return the prediction function, 
    for a given site iden, history Charg and temperature Temps'''
    if use_cache_trainingset:
        if test:
            X = pickle.load(open(CACHE_DIR+"X_test_"+iden+".p", "rb"))
        else:
            X = pickle.load(open(CACHE_DIR+"X_"+iden+".p", "rb"))
    else:
        X = DataFrame(Charg[iden])
        X = X.dropna(how='any')
        X['dayofweek'] = X.index.dayofweek
        X['Temps'] = Temps[iden].ix[X.index]
        X['fracday'] = X.index.minute/60.+X.index.hour
        X['lastminutes'] = X[iden].ix[X.index-10*Minute()].values
        X['yesterday'] = X[iden].ix[X.index-Day()].values
        X['yesterdaybis'] = X[iden].ix[X.index-Day()-10*Minute()].values
        X['lastweek'] = X[iden].ix[X.index-Week()].values
        X['lastweekbis'] = X[iden].ix[X.index-Week()-10*Minute()].values
        if test:
            pickle.dump(X, open(CACHE_DIR+"X_test_"+iden+".p", "wb" ) )
        else:
            pickle.dump(X, open(CACHE_DIR+"X_"+iden+".p", "wb" ) )
    X = X.dropna(how='any')
    y = X[iden]
    X = X.drop(iden, 1)
    scalerX = preprocessing.StandardScaler().fit(X)
    ##############################
    clf = linear_model.SGDRegressor(alpha = 0.000001,n_iter=3000)
    ##############################
    clf.fit(scalerX.transform(X), y)
    if verbose:
        print('Function for '+iden+' computed.')
    return(lambda x :clf.predict(scalerX.transform(x)))
Example #3
def checkFile(fn,satdata,beamisr,maxdtsec):
    """
    we need to find matching ISR beam IDs very near the time the satellite
    passes through the ISR beam.
    for speed, use Unix epoch time (seconds since Jan 1, 1970) for comparisons

    Note: the Madrigal HDF5 data is read in as a Numpy structured array

    Algorithm (not optimized):
    1) knowing what satellites will eventually intersect beams, are any of those beamids in this file?
    2) knowing what times intersections will occur, do those times exist in this file for those beams?
    3) For the beams that meet conditions 1 and 2, compute TEC by numerical integration of NE

    output:
    tecisr: 2-D DataFrame, beamid x time

    """
    h5p = '/Data/Table Layout'
    #rows: satellite.  cols: time
    intersections = satdata.loc[:,:,'intersect']
    intersections.dropna(axis=1,how='all',inplace=True)

    beamlist = beamisr['BEAMID'].values # have to make a copy to sort
    beamlist.sort()

    tecisr = DataFrame(index=beamlist, columns=intersections.columns)

    try:
        with h5py.File(fn,'r',libver='latest') as f:
            for t in intersections: #for each time...
                #mask for matching beam ids (not necessarily matching in time yet...)
                intmask = np.in1d(f[h5p]['beamid'].astype(int),intersections[t].dropna().astype(int))
                if not intmask.any(): #no overlap, no point in evaluating times
                    continue
                #mask for matching times (not necessarily matching beamids)
                timemask =np.absolute(f[h5p]['ut1_unix'] - (t.to_pydatetime()-datetime(1970,1,1)).total_seconds()) < maxdtsec

                #mask for where beamid and times "match"
                inttimemask = intmask & timemask
                #retrieve "good" rows of HDF5 that are the correct Beam ID(s) and time(s)
                intdata = f[h5p][inttimemask]

                #TODO not tested past this point
                #TODO account for the case where there are two times and one beam that overlap with the satellite.
                """
                intdata will have numerous rows corresponding to each matching time & beam id
                each row is a range cell. These rows will be numerically integrated over Ne.
                """
                uniqbeamid = np.unique(intdata['beamid']).astype(int)
                for b in uniqbeamid:
                    mask = np.isclose(intdata['beamid'],b) #this is one beam's rows, all range bins
                    mask &= np.isfinite(intdata['nel'][mask]) #dropna
                    tecisr.loc[b,t] = comptecisr(10**intdata['nel'][mask],
                                                 intdata['range'][mask])

    except ValueError as e:
        warn('{} does not seem to have the needed data fields.   {}'.format(fn,e))

    tecisr.dropna(axis=1,how='all',inplace=True) #only retain times with TEC data (vast majority don't have)
    return tecisr
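The helper comptecisr is not shown in this example; a minimal sketch of the numerical integration it presumably performs (trapezoidal rule over the range cells; names and behaviour assumed, not taken from the source):

import numpy as np

def comptecisr(ne, srng):
    # integrate electron density over slant range to approximate TEC (assumed behaviour)
    return np.trapz(ne, srng)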
Example #4
 def __read_data_values(self):
     """
     Reads the `Data Values` worksheet in a Time-Series excel file.
     :return:
     """
     sheet = self.workbook.get_sheet_by_name('Data Values')  # type: Worksheet
     dvs = self.__dv_row_generator(sheet.iter_rows())
     headers = next(dvs)
     df = DataFrame([dv for dv in dvs], columns=headers)
     df.dropna(how='all', inplace=True)
     self.tables['DataValues'] = df
Example #5
    def test_dropna(self):
        df = DataFrame(np.random.randn(6, 4))
        df[2][:2] = nan

        dropped = df.dropna(axis=1)
        expected = df.loc[:, [0, 1, 3]]
        inp = df.copy()
        inp.dropna(axis=1, inplace=True)
        assert_frame_equal(dropped, expected)
        assert_frame_equal(inp, expected)

        dropped = df.dropna(axis=0)
        expected = df.loc[lrange(2, 6)]
        inp = df.copy()
        inp.dropna(axis=0, inplace=True)
        assert_frame_equal(dropped, expected)
        assert_frame_equal(inp, expected)

        # threshold
        dropped = df.dropna(axis=1, thresh=5)
        expected = df.loc[:, [0, 1, 3]]
        inp = df.copy()
        inp.dropna(axis=1, thresh=5, inplace=True)
        assert_frame_equal(dropped, expected)
        assert_frame_equal(inp, expected)

        dropped = df.dropna(axis=0, thresh=4)
        expected = df.loc[lrange(2, 6)]
        inp = df.copy()
        inp.dropna(axis=0, thresh=4, inplace=True)
        assert_frame_equal(dropped, expected)
        assert_frame_equal(inp, expected)

        dropped = df.dropna(axis=1, thresh=4)
        assert_frame_equal(dropped, df)

        dropped = df.dropna(axis=1, thresh=3)
        assert_frame_equal(dropped, df)

        # subset
        dropped = df.dropna(axis=0, subset=[0, 1, 3])
        inp = df.copy()
        inp.dropna(axis=0, subset=[0, 1, 3], inplace=True)
        assert_frame_equal(dropped, df)
        assert_frame_equal(inp, df)

        # all
        dropped = df.dropna(axis=1, how='all')
        assert_frame_equal(dropped, df)

        df[2] = nan
        dropped = df.dropna(axis=1, how='all')
        expected = df.loc[:, [0, 1, 3]]
        assert_frame_equal(dropped, expected)

        # bad input
        pytest.raises(ValueError, df.dropna, axis=3)
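The thresh argument exercised above keeps a row or column only if it has at least that many non-NaN values; a minimal sketch on a toy frame (column names hypothetical):

import numpy as np
from pandas import DataFrame

df = DataFrame({'a': [1.0, np.nan, 3.0], 'b': [np.nan, np.nan, 6.0]})
df.dropna(axis=1, thresh=2)  # keeps only column 'a', the one with two non-NaN values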
Example #6
def parse_essentials(essentials_file, samples, normalization=None, cutoff=100):
    data = DataFrame.from_csv(essentials_file, sep="\t", index_col=False)
    data = data[["Position"] + samples]
    data["sum"] = data[samples].apply(sum, axis=1)
    data = data[data["sum"] < cutoff]
    data = data.groupby("Position").sum()
    for sample in samples:
        sample_data = DataFrame(None, index=data.index)
        if normalization is not None:
            sample_data["insertions"] = data[sample].apply(normalization)
        else:
            sample_data["insertions"] = data[sample]

        sample_data.dropna(inplace=True)
        yield sample_data
Example #7
def pd_02():
    string_data=Series(['a','b','c',np.nan,'e',None])
    print string_data
    print string_data.isnull()
    print string_data.dropna()
    df=DataFrame(np.random.randn(7,3))
    df.ix[:4,1]=np.nan
    df.ix[:2,2]=np.nan
    print df
    print df.dropna()
    print df.fillna(0)
    print df.fillna({1:0.5,3:-1})
    print df
    df.fillna(0,inplace=True)
    print df
Example #8
def get_flights_from_route(cur, origin, destination):
    """
    Returns a dataframe for all flights matching origin, destination.
    """

    import time
    
    ### MySQL query
    time0 = time.time()
    cur.execute("SELECT Year, Month, DayofMonth, DayOfWeek, CRSDepTime, UniqueCarrier, ArrDelay FROM flights_100000 WHERE Origin = %s and Dest = %s;", (origin, destination))
    rows = cur.fetchall()
    td = time.time() - time0
    print 'Database query took %.2f seconds.' % td
    
    ### Convert to dataframe
    df = DataFrame(list(rows), columns=['Year', 'Month', 'DayOfMonth', 'DayOfWeek', 'CRSDepTime', 'Carrier', 'ArrDelay'])

    ### Drop rows with missing ArrDelay (cancellations)
    df = df.dropna()
    
    ### Create some auxiliary columns
    df['DayOfYear'] = df.apply( lambda x: datetime.datetime(x['Year'],x['Month'],x['DayOfMonth']).timetuple().tm_yday, axis=1)
    df['Week'] = df['DayOfYear'] / 7 + 1
    df['DepHour'] = df['CRSDepTime']/100

    ### Drop unused columns
    df = df.drop(['DayOfMonth','CRSDepTime'],axis=1).sort_index(axis=1)

    ## df.head()
    
    return df
Example #9
 def _to_frame_build_data_frame(self, tfp, hasna, usecols):
     # build data frame
     if usecols is None:
         usecols = ['node', 'kind', 'level', 'msg']
     dfinfo = {}
     dfcols = []
     if 'node' in usecols:
         dfinfo['node'] = tfp.nodes
         dfcols.append('node')
     if 'kind' in usecols:
         dfinfo['kind'] = tfp.kinds
         dfcols.append('kind')
     if tfp.get_line_type is not None and 'level' in usecols:
         dfinfo['level'] = tfp.levels
         dfcols.append('level')
     if 'msg' in usecols:
         dfinfo['msg'] = tfp.msgs
         dfcols.append('msg')
     df = DataFrame(dfinfo, index=tfp.dates, columns=dfcols)
     if hasna:
         df = df.dropna()
     df.index.name = 'dtime'
     # pytables does not support unicode for now
     if 'node' in df.columns:
         df['node'] = df['node'].astype(str)
     if 'kind' in df.columns:
         df['kind'] = df['kind'].astype(str)
     if 'level' in df.columns:
         df['level'] = df['level'].astype(str)
     return df
Example #10
def proportion_error_per_appliance_df(mains_values, gt_values, predicted_values):
    gt_proportion = {}
    pr_proportion = {}
    proportion_error = {}

    for app in predicted_values:
        p_gt  = gt_values[app]/mains_values
        p_pr  = predicted_values[app]/mains_values
        
        fr = DataFrame(p_gt, columns=['p_gt'])
        fr['p_gt'] = p_gt
        fr['p_pr'] = p_pr
        
#        fr['01. mains'] = mains_values
#        fr['02. gt'] = gt_values[app]
#        fr['03. pr'] = predicted_values[app]
        
        fr = fr.dropna()
        
        summ_gt = fr['p_gt'].sum()
        summ_pr = fr['p_pr'].sum()

        T = len(fr)
        tru = float(summ_gt)/float(T)
        dis = float(summ_pr)/float(T)
                
        gt_proportion[app] = tru
        pr_proportion[app] = dis
        
        diff = abs(tru - dis)
        proportion_error[app] = diff
    return proportion_error, gt_proportion, pr_proportion
Example #11
    def __init__(self, train: pd.DataFrame, test: pd.DataFrame, params: dict,
                 categorical_splits=None):
        """
        :param train: train DF
        :param test: test DF
        :param params: dict with the following structure
        Template for params:
        params = {
            'uuuu': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes},
            'uuku': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes},
            'ukuu': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes},
            'ukku': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes},
            'kuuu': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes},
            'kuuk': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes},
            'kuku': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes},
            'kukk': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes},
            'kkuu': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes},
            'kkuk': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes},
            'kkku': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes},
            'kkkk': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes}
        }
        u = unknown, k = known. scaler is None for tree models and otherwise something like
        scale_features from dmc.transformation; ignore_features are the features that should be
        ignored for the split.

        :return:
        """
        if categorical_splits is None:
            categorical_splits = ['articleID', 'customerID', 'voucherID', 'productGroup']
        self.processes = 8
        self.test = test.copy()
        test = test.dropna(subset=['rrp'])
        self.test_size = len(test)
        self.splits = split(train, test, categorical_splits)
        self._enrich_splits(params)
Example #12
def cor_exp_ess(exp, ess):
    cor = DataFrame(np.nan, index=ess.columns, columns=['cor', 'pvalue'])

    for gene in ess.columns:
        if gene in exp.columns:
            cor.loc[gene] = spearmanr(ess[gene], exp[gene])

    return cor.dropna()
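A small usage sketch for cor_exp_ess (gene names hypothetical; it assumes spearmanr is imported from scipy.stats, as the snippet above does); genes present in ess but absent from exp stay NaN and are removed by the final dropna():

import numpy as np
from pandas import DataFrame
from scipy.stats import spearmanr

ess = DataFrame({'BRAF': [0.1, 0.4, 0.3], 'TP53': [0.2, 0.1, 0.5]})
exp = DataFrame({'BRAF': [1.0, 2.0, 3.0]})
cor_exp_ess(exp, ess)  # one row for BRAF; the all-NaN TP53 row is dropped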
Example #13
    def test_dropna_multiple_axes(self):
        df = DataFrame([[1, np.nan, 2, 3],
                        [4, np.nan, 5, 6],
                        [np.nan, np.nan, np.nan, np.nan],
                        [7, np.nan, 8, 9]])
        cp = df.copy()
        result = df.dropna(how='all', axis=[0, 1])
        result2 = df.dropna(how='all', axis=(0, 1))
        expected = df.dropna(how='all').dropna(how='all', axis=1)

        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)
        assert_frame_equal(df, cp)

        inp = df.copy()
        inp.dropna(how='all', axis=(0, 1), inplace=True)
        assert_frame_equal(inp, expected)
Example #14
class Dropna(object):

    params = (['all', 'any'], [0, 1])
    param_names = ['how', 'axis']

    def setup(self, how, axis):
        self.df = DataFrame(np.random.randn(10000, 1000))
        self.df.ix[50:1000, 20:50] = np.nan
        self.df.ix[2000:3000] = np.nan
        self.df.ix[:, 60:70] = np.nan
        self.df_mixed = self.df.copy()
        self.df_mixed['foo'] = 'bar'

    def time_dropna(self, how, axis):
        self.df.dropna(how=how, axis=axis)

    def time_dropna_axis_mixed_dtypes(self, how, axis):
        self.df_mixed.dropna(how=how, axis=axis)
Example #15
    def test_load_raw_arrays(self):
        reindex_reader = ReindexMinuteBarReader(
            self.trading_calendar,
            self.bcolz_equity_minute_bar_reader,
            self.START_DATE,
            self.END_DATE,
        )
        m_open, m_close = self.trading_calendar.open_and_close_for_session(
            self.START_DATE)
        outer_minutes = self.trading_calendar.minutes_in_range(m_open, m_close)
        result = reindex_reader.load_raw_arrays(
            OHLCV, m_open, m_close, [1, 2])

        opens = DataFrame(data=result[0], index=outer_minutes,
                          columns=[1, 2])
        opens_with_price = opens.dropna()

        self.assertEqual(
            1440,
            len(opens),
            "The result should have 1440 bars, the number of minutes in a "
            "trading session on the target calendar."
        )

        self.assertEqual(
            390,
            len(opens_with_price),
            "The result, after dropping nans, should have 390 bars, the "
            " number of bars in a trading session in the reader's calendar."
        )

        slicer = outer_minutes.slice_indexer(
            end=pd.Timestamp('2015-12-01 14:30', tz='UTC'))

        assert_almost_equal(
            opens[1][slicer],
            full(slicer.stop, nan),
            err_msg="All values before the NYSE market open should be nan.")

        slicer = outer_minutes.slice_indexer(
            start=pd.Timestamp('2015-12-01 21:01', tz='UTC'))

        assert_almost_equal(
            opens[1][slicer],
            full(slicer.stop - slicer.start, nan),
            err_msg="All values after the NYSE market close should be nan.")

        first_minute_loc = outer_minutes.get_loc(pd.Timestamp(
            '2015-12-01 14:31', tz='UTC'))

        # Spot check a value.
        # The value is the autogenerated value from test fixtures.
        assert_almost_equal(
            10.0,
            opens[1][first_minute_loc],
            err_msg="The value for Equity 1, should be 10.0, at NYSE open.")
def PolyEq(x, y, order=1):
Example #16
    try:
        df=DataFrame({'x':x,'y':y},index=x.index)
        df = df.dropna()
        PolyCoeffs = np.polyfit(df['x'], df['y'], order) ## calculates polynomial coeffs
        PolyEq = np.poly1d(PolyCoeffs) ## turns the coeffs into an equation
    except:
        print 'No regression equation possible'
        PolyEq = np.poly1d([0])
    return PolyEq
Example #17
    def gatherGenDataData(ert, case, key):
        """ :rtype: pandas.DataFrame """
        key, report_step = key.split("@", 1)
        report_step = int(report_step)
        try:
            data = GenDataCollector.loadGenData(ert, case, key, report_step)
        except ValueError:
            data = DataFrame()

        return data.dropna() # removes all rows that has a NaN
Example #18
    def test_dropna_tz_aware_datetime(self):
        # GH13407
        df = DataFrame()
        dt1 = datetime.datetime(2015, 1, 1,
                                tzinfo=dateutil.tz.tzutc())
        dt2 = datetime.datetime(2015, 2, 2,
                                tzinfo=dateutil.tz.tzutc())
        df['Time'] = [dt1]
        result = df.dropna(axis=0)
        expected = DataFrame({'Time': [dt1]})
        assert_frame_equal(result, expected)

        # Ex2
        df = DataFrame({'Time': [dt1, None, np.nan, dt2]})
        result = df.dropna(axis=0)
        expected = DataFrame([dt1, dt2],
                             columns=['Time'],
                             index=[0, 3])
        assert_frame_equal(result, expected)
Example #19
    def agg(self):
        dframe = DataFrame(index=self.column.index)

        dframe = self._build_dframe(dframe, self.columns)
        column_names = [self._name_for_idx(i) for i in xrange(0, 2)]
        dframe = dframe.dropna(subset=column_names)

        dframe = DataFrame([dframe.sum().to_dict()])

        return self._add_calculated_column(dframe)
Example #20
    def test_na_actions_categorical(self):

        cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
        vals = ["a", "b", np.nan, "d"]
        df = DataFrame({"cats": cat, "vals": vals})
        cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3])
        vals2 = ["a", "b", "b", "d"]
        df_exp_fill = DataFrame({"cats": cat2, "vals": vals2})
        cat3 = Categorical([1, 2, 3], categories=[1, 2, 3])
        vals3 = ["a", "b", np.nan]
        df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3})
        cat4 = Categorical([1, 2], categories=[1, 2, 3])
        vals4 = ["a", "b"]
        df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4})

        # fillna
        res = df.fillna(value={"cats": 3, "vals": "b"})
        tm.assert_frame_equal(res, df_exp_fill)

        with pytest.raises(ValueError, match=("fill value must "
                                              "be in categories")):
            df.fillna(value={"cats": 4, "vals": "c"})

        res = df.fillna(method='pad')
        tm.assert_frame_equal(res, df_exp_fill)

        # dropna
        res = df.dropna(subset=["cats"])
        tm.assert_frame_equal(res, df_exp_drop_cats)

        res = df.dropna()
        tm.assert_frame_equal(res, df_exp_drop_all)

        # make sure that fillna takes missing values into account
        c = Categorical([np.nan, "b", np.nan], categories=["a", "b"])
        df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]})

        cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"])
        df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]})

        res = df.fillna("a")
        tm.assert_frame_equal(res, df_exp)
Example #21
def combine_spread(file_set, shift, drop_return_data=False):
    """
    Combine the spread of input files, return with mean and standard
    deviation calculated.

    """

    data = []
    values = {}
    for val in ('left', 'right', 'com', 'dist', 'radius', 'diameter'):
        values[val] = {}

    # Collect data from all files into dictionaries
    for i, _file in enumerate(file_set):
        data.append(Spread().read(_file))
        for val in values.keys():
            values[val][i] = Series(
                    data=data[i].spread[val]['val'],
                    index=data[i].times
                    )
        data[i].times = (np.array(data[i].times) - shift[i])

    spread = Spread()
    spread.spread['num'] = len(file_set)

    for val in values.keys():

        # Shift time as per synchronisation
        for i in values[val]:
            values[val][i].index = np.array(values[val][i].index) - shift[i]

        # Convert to DataFrame
        df = DataFrame(data=values[val])

        # If not a single file, keep only indices where every file has a value (no NaN)
        if len(file_set) > 1:
            df = df.dropna()

        # If return data dropped, fill data here
        if drop_return_data:
            for i in df.columns:
                data[i].spread[val]['val'] = df[i].tolist()

        # Get times, mean and standard error as lists
        mean = list(df.mean(axis=1))
        std_error = list(df.std(axis=1))
        times = list(df.index)

        # Add to Spread object
        spread.spread[val]['val'] = mean
        spread.spread[val]['std'] = std_error
        spread.spread['times'] = times

    return spread, data
Example #22
def createDataset():
    data=read_csv('data/data2.csv',parse_dates=['DATE'],index_col='DATE')
    data.drop('DY', axis=1, inplace=True)
    data=data.dropna()
    data['RETURNS']=data['Price'].pct_change()
    rets=DataFrame(data['RETURNS'])*100
    rets['MA10']=fun.sampleMovingAverage(rets,10)
    rets['MA30']=fun.sampleMovingAverage(rets['RETURNS'],50)
    rets['VAR10']=fun.movingVariance(rets['RETURNS'],30)
    rets=rets.dropna()
    return rets
def Polyfit(x, y, order=1, color='k--',lab='label',Xvals=10,subplot=plot):
Example #23
    df=DataFrame({'x':x.values,'y':y.values},index=x.index)
    df = df.dropna()
    print df
    PolyCoeffs = np.polyfit(df['x'], df['y'], order) ## calculates polynomial coeffs
    PolyEq = np.poly1d(PolyCoeffs) ## turns the coeffs into an equation
    #print PolyEq
    PolyXvals = np.linspace(min(x), max(x)+Xvals) ## creates x-values for trendline
    #print PolyXvals
    Polyplot = subplot.plot(PolyXvals, PolyEq(PolyXvals),color,label=lab) ## plots the trendline
    return Polyplot
Example #24
def get_plate_data(path,c):
    """ Get plate data, drop empty columns, drop selected columns, 
        rename columns, add normalized columns. """
    return thread_first(path,
                        from_file,
                        (str.replace,'\r',''),
                        StringIO,
                        lambda f: pd.read_csv(f, delimiter=c['delimiter'], skiprows=c['skiprows']),
                        lambda df: df.dropna(axis=1, how='all'),
                        (drop_matching_columns,c['dropcols']),
                        lambda df: df.rename(columns=c['colrename']),
                        (add_normalized_columns,c['normcols']))
Example #25
def append_2013_gva(dfin, csv_file_path):
    df = dfin.copy()
    gva = pd.read_csv(csv_file_path)
    gvasub = DataFrame(columns=['nuts3id', 'gva2013'])
    gvasub['nuts3id'], gvasub['gva2013'] = gva['nutsid'], gva['2013']
    df_gva = pd.merge(
        left=df,
        right=gvasub.dropna(),
        how='left',
        left_on='nuts3id',
        right_on='nuts3id')
    return df_gva
Example #26
 def detect_objects(self, img, template, thres):
     #Conver to gray scale
     img_grey = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
     #Match Template
     temp_match = cv2.matchTemplate(img_grey, template, 1)
     #Normalize scores
     temp_match = temp_match-temp_match.min()
     temp_match = 1 - temp_match/temp_match.max()
     #Apply Threshold
     retval, dst = cv2.threshold(temp_match, thres, 1, 0)
     dst = dst.astype(np.uint8)
     #Extract centroid of connected components
     contours, hierarchy = cv2.findContours(dst, 1, 2)
     if len(contours) > 0:
         m_df = DataFrame([cv2.moments(cont) for cont in contours])
         m_df['x'] = (m_df['m10']/m_df['m00']) + int(template.shape[0]/2)
         m_df['y'] = (m_df['m01']/m_df['m00']) + int(template.shape[1]/2)
         m_df.dropna(subset=['x', 'y'], inplace=True)
         return m_df
     else:
         return DataFrame()
def gettraining(data,XColumns,YColumn,dropNA=True,shuffle=True):
Example #27
    #make a copy
    trainingData = DataFrame(data)
    #drop a few columns which we won't use for ML models
    trainingData = trainingData[XColumns+YColumn]
    if dropNA == True:
        #drop Na values if necessary
        trainingData = trainingData.dropna()
    if shuffle == True:
        #shuffle data if necessary
        trainingData = trainingData.reindex(np.random.permutation(trainingData.index))
    return(trainingData)
Example #28
def parse_region(df: pd.DataFrame, min_row: int, max_row: int, cols: list) -> pd.DataFrame:
    df = df.loc[min_row:max_row, cols]

    # Region is in either 0,0 or 0,1 of the sliced DataFrame, with the data starting from the 3rd row
    region = df.iloc[0, 0]
    if type(region) == float:
        region = df.iloc[0, 1]

    df = df.dropna(axis=0, how='all').iloc[2:, 1:]
    df.columns = ['Name', 'Count']
    df['Region'] = region
    return df
Example #29
    def test_dropEmptyRows(self):
        N = len(self.frame.index)
        mat = random.randn(N)
        mat[:5] = nan

        frame = DataFrame({'foo': mat}, index=self.frame.index)
        original = Series(mat, index=self.frame.index, name='foo')
        expected = original.dropna()
        inplace_frame1, inplace_frame2 = frame.copy(), frame.copy()

        smaller_frame = frame.dropna(how='all')
        # check that original was preserved
        assert_series_equal(frame['foo'], original)
        inplace_frame1.dropna(how='all', inplace=True)
        assert_series_equal(smaller_frame['foo'], expected)
        assert_series_equal(inplace_frame1['foo'], expected)

        smaller_frame = frame.dropna(how='all', subset=['foo'])
        inplace_frame2.dropna(how='all', subset=['foo'], inplace=True)
        assert_series_equal(smaller_frame['foo'], expected)
        assert_series_equal(inplace_frame2['foo'], expected)
Example #30
    def test_dropna_multiple_axes(self):
        df = DataFrame([[1, np.nan, 2, 3],
                        [4, np.nan, 5, 6],
                        [np.nan, np.nan, np.nan, np.nan],
                        [7, np.nan, 8, 9]])
        cp = df.copy()

        # GH20987
        with tm.assert_produces_warning(FutureWarning):
            result = df.dropna(how='all', axis=[0, 1])
        with tm.assert_produces_warning(FutureWarning):
            result2 = df.dropna(how='all', axis=(0, 1))
        expected = df.dropna(how='all').dropna(how='all', axis=1)

        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)
        assert_frame_equal(df, cp)

        inp = df.copy()
        with tm.assert_produces_warning(FutureWarning):
            inp.dropna(how='all', axis=(0, 1), inplace=True)
        assert_frame_equal(inp, expected)
Example #31
    def volcano_plot(self,
                     df: pd.DataFrame,
                     p_value: float = 0.05,
                     fc=2,
                     x_colname='logFC',
                     y_colname='-log10p',
                     cutoff_lines=True,
                     top_n=None,
                     top_by='-log10p',
                     show_labels=False,
                     legend=True,
                     **kwargs):

        # Get rid of NaN data
        df = df.dropna()

        # Convert cutoffs to logspace
        log2_fc = np.log2(fc)
        log10_pval = -np.log10(p_value)

        # Split data into above and below cutoff dataframes
        sig = df[(df[y_colname] >= log10_pval)
                 & (np.abs(df[x_colname]) >= log2_fc)]
        insig = df[~(df[y_colname] >= log10_pval)
                   | ~(np.abs(df[x_colname]) >= log2_fc)]

        # Get maximum values for formatting later
        max_y = np.ceil(np.max(sig[y_colname]))
        max_x = np.ceil(np.max(np.abs(sig[x_colname])))

        fig, ax = plt.subplots(**kwargs)

        # Split top data points if requested
        if top_n:
            # Find points to highlight
            sort = set()
            if isinstance(top_by, list):
                for col in top_by:
                    sort = sort.union(
                        set(sig.index[np.argsort(np.abs(
                            sig[col]))[::-1]][:top_n].values))
            elif isinstance(top_by, str):
                sort = sort.union(
                    set(sig.index[np.argsort(np.abs(
                        sig[top_by]))[::-1]][:top_n].values))
            else:
                raise ValueError(
                    'top_by must be a string or list of values found in the DataFrame used for the plot'
                )

            top_sig = sig.loc[sort]
            sig = sig.drop(sort)
            ax.plot(top_sig[x_colname],
                    top_sig[y_colname],
                    'o',
                    c=_colors[0],
                    ms=10,
                    zorder=2,
                    label='Top Genes')

            if show_labels:
                fs = mpl.rcParams['legend.fontsize']
                for row in top_sig.iterrows():
                    ax.annotate(row[0],
                                xy=(row[1][x_colname], row[1][y_colname]),
                                fontsize=fs,
                                style='italic')

        # Make plot
        ax.plot(sig[x_colname],
                sig[y_colname],
                'o',
                c=_colors[2],
                ms=10,
                zorder=1,
                label='Diff Exp')
        ax.plot(insig[x_colname],
                insig[y_colname],
                'o',
                c=_colors[-1],
                ms=10,
                zorder=0,
                mew=0,
                label='')

        # Adjust axes
        ax.set_xlim([-max_x, max_x])
        ax.set_ylim([0, max_y])

        # Add cutoff lines
        if cutoff_lines:
            color = _colors[1]
            # P value line

            ax.plot([-max_x, max_x], [log10_pval, log10_pval],
                    '--',
                    c=color,
                    lw=3,
                    label='Threshold')

            # log fold change lines
            ax.plot([-log2_fc, -log2_fc], [0, max_y], '--', c=color, lw=3)
            ax.plot([log2_fc, log2_fc], [0, max_y], '--', c=color, lw=3)

        if legend:
            ax.legend(loc='best', numpoints=1)

        # Adjust labels
        ax.tick_params(axis='both', which='major')
        ax.set_xlabel(r'$log_2(\frac{KO}{WT})$')
        ax.set_ylabel(r'$-log_{10}$(corrected p-value)')
        return ax
Example #32
def dropna(df:pd.DataFrame):
    """Drop rows with 'Nan' values"""
    df = df[df < math.exp(709)] # big number
    df = df[df != 0.0]
    df = df.dropna()
    return df
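A brief usage sketch of the helper above (data hypothetical; the helper itself assumes math and pandas are imported): values at or above exp(709), exact zeros, and pre-existing NaNs all cause their row to be removed:

import numpy as np
import pandas as pd

df = pd.DataFrame({'x': [1.0, 0.0, np.nan, 2.0], 'y': [1.0, 2.0, 3.0, 4.0]})
dropna(df)  # keeps only the first and last rows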
Example #33
def run_model(model_data: pd.DataFrame,
              pred_data: pd.DataFrame,
              hierarchy: pd.DataFrame,
              gbd_hierarchy: pd.DataFrame,
              covariate_list: List[str],
              verbose: bool = True,
              **kwargs) -> Tuple[Dict, Dict, pd.Series, pd.Series, pd.Series]:
    model_data['logit_idr'] = logit(model_data['idr'])
    model_data['logit_idr'] = model_data['logit_idr'].replace(
        (-np.inf, np.inf), np.nan)
    model_data['idr_se'] = 1
    model_data['logit_idr_se'] = 1
    model_data['intercept'] = 1

    # lose 0s and 1s
    model_data = model_data.loc[model_data['logit_idr'].notnull()]

    covariate_priors = get_covariate_priors(1, 'idr')
    covariate_priors = {
        covariate: covariate_priors[covariate]
        for covariate in covariate_list
    }
    covariate_constraints = get_covariate_constraints('idr')
    covariate_constraints = {
        covariate: covariate_constraints[covariate]
        for covariate in covariate_list
    }
    covariate_lambdas_sr_r = {covariate: 3. for covariate in covariate_list}
    covariate_lambdas_admin = {covariate: 100. for covariate in covariate_list}

    var_args = {
        'dep_var': 'logit_idr',
        'dep_var_se': 'logit_idr_se',
        'fe_vars': [
            'intercept',
            'log_infwavg_testing_rate_capacity',
        ] + covariate_list,
        'prior_dict': {
            'log_infwavg_testing_rate_capacity': {
                'prior_beta_uniform': np.array([1e-6, np.inf])
            },
        },
        're_vars': [],
        'group_var': 'location_id',
    }
    global_prior_dict = covariate_priors
    location_prior_dict = {}
    pred_replace_dict = {
        'log_testing_rate_capacity': 'log_infwavg_testing_rate_capacity',
    }
    pred_exclude_vars = []
    level_lambdas = {
        0: {
            'intercept': 3.,
            'log_infwavg_testing_rate_capacity': 3.,
            **covariate_lambdas_sr_r,
        },  # G->SR
        1: {
            'intercept': 3.,
            'log_infwavg_testing_rate_capacity': 3.,
            **covariate_lambdas_sr_r,
        },  # SR->R
        2: {
            'intercept': 100.,
            'log_infwavg_testing_rate_capacity': 100.,
            **covariate_lambdas_admin,
        },  # R->A0
        3: {
            'intercept': 100.,
            'log_infwavg_testing_rate_capacity': 100.,
            **covariate_lambdas_admin,
        },  # A0->A1
        4: {
            'intercept': 100.,
            'log_infwavg_testing_rate_capacity': 100.,
            **covariate_lambdas_admin,
        },  # A1->A2
        5: {
            'intercept': 100.,
            'log_infwavg_testing_rate_capacity': 100.,
            **covariate_lambdas_admin,
        },  # A2->A3
    }

    if var_args['group_var'] != 'location_id':
        raise ValueError(
            'NRMSE data assignment assumes `study_id` == `location_id` (`location_id` must be group_var).'
        )

    model_data_cols = [
        'location_id', 'date', var_args['dep_var'], var_args['dep_var_se']
    ] + var_args['fe_vars']
    model_data = model_data.loc[:, model_data_cols]
    model_data = model_data.dropna()
    mr_model_dict, prior_dicts = cascade.run_cascade(
        model_name='idr',
        model_data=model_data.copy(),
        hierarchy=hierarchy.copy(),  # run w/ modeling hierarchy
        var_args=var_args.copy(),
        global_prior_dict=global_prior_dict.copy(),
        location_prior_dict=location_prior_dict.copy(),
        level_lambdas=level_lambdas.copy(),
        verbose=False,
    )
    adj_gbd_hierarchy = model_inputs.validate_hierarchies(
        hierarchy.copy(), gbd_hierarchy.copy())
    pred_data = pred_data.dropna()
    pred, pred_fe, pred_location_map = cascade.predict_cascade(
        pred_data=pred_data.copy(),
        hierarchy=adj_gbd_hierarchy.copy(),  # predict w/ gbd hierarchy
        mr_model_dict=mr_model_dict.copy(),
        pred_replace_dict=pred_replace_dict.copy(),
        pred_exclude_vars=pred_exclude_vars.copy(),
        var_args=var_args.copy(),
        verbose=False,
    )

    pred = expit(pred).rename(pred.name.replace('logit_', ''))
    pred_fe = expit(pred_fe).rename(pred_fe.name.replace('logit_', ''))

    return mr_model_dict, prior_dicts, pred.dropna(), pred_fe.dropna(
    ), pred_location_map, level_lambdas
Example #34
    def test_sort_index_nan_multiindex(self):
        # GH#14784
        # incorrect sorting w.r.t. nans
        tuples = [[12, 13], [np.nan, np.nan], [np.nan, 3], [1, 2]]
        mi = MultiIndex.from_tuples(tuples)

        df = DataFrame(np.arange(16).reshape(4, 4), index=mi, columns=list("ABCD"))
        s = Series(np.arange(4), index=mi)

        df2 = DataFrame(
            {
                "date": pd.DatetimeIndex(
                    [
                        "20121002",
                        "20121007",
                        "20130130",
                        "20130202",
                        "20130305",
                        "20121002",
                        "20121207",
                        "20130130",
                        "20130202",
                        "20130305",
                        "20130202",
                        "20130305",
                    ]
                ),
                "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
                "whole_cost": [
                    1790,
                    np.nan,
                    280,
                    259,
                    np.nan,
                    623,
                    90,
                    312,
                    np.nan,
                    301,
                    359,
                    801,
                ],
                "cost": [12, 15, 10, 24, 39, 1, 0, np.nan, 45, 34, 1, 12],
            }
        ).set_index(["date", "user_id"])

        # sorting frame, default nan position is last
        result = df.sort_index()
        expected = df.iloc[[3, 0, 2, 1], :]
        tm.assert_frame_equal(result, expected)

        # sorting frame, nan position last
        result = df.sort_index(na_position="last")
        expected = df.iloc[[3, 0, 2, 1], :]
        tm.assert_frame_equal(result, expected)

        # sorting frame, nan position first
        result = df.sort_index(na_position="first")
        expected = df.iloc[[1, 2, 3, 0], :]
        tm.assert_frame_equal(result, expected)

        # sorting frame with removed rows
        result = df2.dropna().sort_index()
        expected = df2.sort_index().dropna()
        tm.assert_frame_equal(result, expected)

        # sorting series, default nan position is last
        result = s.sort_index()
        expected = s.iloc[[3, 0, 2, 1]]
        tm.assert_series_equal(result, expected)

        # sorting series, nan position last
        result = s.sort_index(na_position="last")
        expected = s.iloc[[3, 0, 2, 1]]
        tm.assert_series_equal(result, expected)

        # sorting series, nan position first
        result = s.sort_index(na_position="first")
        expected = s.iloc[[1, 2, 3, 0]]
        tm.assert_series_equal(result, expected)
Example #35
 def table_note(self):
     t = DataFrame(self.get_data('notes'))
     t = self.table_filter_modules(t, 'parent_type')
     t = t.dropna(subset=['filename'])
     # t = t[:10] # for debug
     return t
Example #36
 def _has_missing_feature(self, features):
     features = [features]
     df = DataFrame(features)
     return len(df.dropna()) == 0
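A minimal illustration of the check above (feature values hypothetical): a one-row frame is emptied by dropna() as soon as any feature is NaN:

import numpy as np
from pandas import DataFrame

len(DataFrame([[1.0, np.nan, 3.0]]).dropna()) == 0  # True: the row has a missing feature
len(DataFrame([[1.0, 2.0, 3.0]]).dropna()) == 0     # False: no feature is missing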
Example #37
def drop_missing(df: DataFrame, cols: list) -> DataFrame:
    df.dropna(subset=cols, inplace=True)
    return df
def drop_df_nan_rows_according2cols(df: pd.DataFrame,
Example #38
                                    cols: list) -> pd.DataFrame:
    df = df.dropna(subset=cols)
    return df
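A quick usage sketch for the two helpers above (column names hypothetical); note that drop_missing mutates its argument via inplace=True while drop_df_nan_rows_according2cols returns a new frame:

import numpy as np
import pandas as pd

df = pd.DataFrame({'id': [1, 2, 3], 'value': [0.5, np.nan, 1.5]})
drop_df_nan_rows_according2cols(df, ['value'])  # rows with id 1 and 3; df unchanged
drop_missing(df, ['value'])                     # same rows, but df itself is modified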
Example #39
# pandas: functions for descriptive statistics and NaN handling
from pandas import Series, DataFrame
import numpy as np

df = DataFrame([[1.4, np.nan], [7, -4.5], [np.NaN, np.NaN], [0.5, -1]],
               columns=['one', 'two'])

print(df)
print(df.drop(1), '\n')  # drop row 1
print(df.dropna(), '\n')  # drop rows containing NaN
print(df.dropna(how='any'), '\n')  # drop a row if any of its values is NaN
print(df.dropna(how='all'), '\n')  # drop a row only if all of its values are NaN
print(df.dropna(subset=['one']), '\n')  # drop rows that have NaN in the given column
print(df.fillna(0), '\n')  # fill NaN with 0; for mean imputation see sklearn's SimpleImputer

# functions for descriptive statistics
print('**' * 10)
print(df.sum(), '\n')  # column-wise sum, NaN excluded
print(df.sum(axis=0), '\n')

print(df.sum(axis=1), '\n')  # row-wise sum
print(df.mean(axis=1), '\n')  # row-wise mean
print(df.mean(axis=1, skipna=True), '\n')  # NaN skipped, so a value is still computed
print(df.mean(axis=1, skipna=False), '\n')  # result is NaN where a row contains NaN

print(df.mean(axis=0, skipna=True), '\n')  # computed even when a column contains NaN
print(df.mean(axis=0, skipna=False), '\n')  # not computed for columns that contain NaN

print(df.max(), '\n')
print(df.max(axis=0), '\n')  # largest value in each column
print(df.idxmax(), '\n')
Example #40
    def gather_gen_data_data(self, case, key):
        """ :rtype: pandas.DataFrame """
        key_parts = key.split("@")
        key = key_parts[0]
        if len(key_parts) > 1:
            report_step = int(key_parts[1])
        else:
            report_step = 0

        try:
            data = GenDataCollector.loadGenData(self._enkf_main, case, key, report_step)
        except (ValueError, KeyError):
            data = DataFrame()

        return data.dropna() # removes all rows that has a NaN

    def gather_custom_kw_data(self, case, key):
        """ :rtype: pandas.DataFrame """
        data = CustomKWCollector.loadAllCustomKWData(self._enkf_main, case, [key])

        if key in data:
            return data[key]
        else:
            return data

    def is_summary_key(self, key):
        """ :rtype: bool """
        return key in self._enkf_main.getKeyManager().summaryKeys()

    def is_gen_kw_key(self, key):
merge_data = pd.merge(data2, dat3, left_on = "COUNTY", right_on="COUNTY_NAME")
Example #41
#%%
data_final = merge_data.groupby(['Year','REGION_NAME'])['PK (FULL DAY)'].sum()
#%%

data_final = data_final.reset_index()
# #%%
main_list = []

#%%
for county in data_final.REGION_NAME.unique():
    local_list = []
    albany = data_final[data_final.REGION_NAME == county]
    series = pd.Series(albany['PK (FULL DAY)'].to_list(), index=albany['Year'].to_list())# create lagged dataset
    values = DataFrame(series.values)
    values.dropna(inplace = True)
    dataframe = concat([values.shift(1), values], axis=1)
    dataframe.columns = ['t', 't+1']
    dataframe.dropna(inplace = True)
    X = dataframe.values
    dict1= {}
    for size in [0.50,0.55, 0.60,0.66,0.70,0.75,0.80]:
        train_size = int(len(X) * size)
        train, test = X[1:train_size], X[train_size:]
        train_X, train_y = train[:,0], train[:,1]
        test_X, test_y = test[:,0], test[:,1]
        # persistence model on training set
        train_pred = [x for x in train_X]
        # calculate residuals
        train_resid = [train_y[i]-train_pred[i] for i in range(len(train_pred))]
        # model the training set residuals
Example #42
    def test_dropna(self):
        df = DataFrame(np.random.randn(6, 4))
        df[2][:2] = np.nan

        dropped = df.dropna(axis=1)
        expected = df.loc[:, [0, 1, 3]]
        inp = df.copy()
        inp.dropna(axis=1, inplace=True)
        assert_frame_equal(dropped, expected)
        assert_frame_equal(inp, expected)

        dropped = df.dropna(axis=0)
        expected = df.loc[list(range(2, 6))]
        inp = df.copy()
        inp.dropna(axis=0, inplace=True)
        assert_frame_equal(dropped, expected)
        assert_frame_equal(inp, expected)

        # threshold
        dropped = df.dropna(axis=1, thresh=5)
        expected = df.loc[:, [0, 1, 3]]
        inp = df.copy()
        inp.dropna(axis=1, thresh=5, inplace=True)
        assert_frame_equal(dropped, expected)
        assert_frame_equal(inp, expected)

        dropped = df.dropna(axis=0, thresh=4)
        expected = df.loc[range(2, 6)]
        inp = df.copy()
        inp.dropna(axis=0, thresh=4, inplace=True)
        assert_frame_equal(dropped, expected)
        assert_frame_equal(inp, expected)

        dropped = df.dropna(axis=1, thresh=4)
        assert_frame_equal(dropped, df)

        dropped = df.dropna(axis=1, thresh=3)
        assert_frame_equal(dropped, df)

        # subset
        dropped = df.dropna(axis=0, subset=[0, 1, 3])
        inp = df.copy()
        inp.dropna(axis=0, subset=[0, 1, 3], inplace=True)
        assert_frame_equal(dropped, df)
        assert_frame_equal(inp, df)

        # all
        dropped = df.dropna(axis=1, how="all")
        assert_frame_equal(dropped, df)

        df[2] = np.nan
        dropped = df.dropna(axis=1, how="all")
        expected = df.loc[:, [0, 1, 3]]
        assert_frame_equal(dropped, expected)

        # bad input
        msg = "No axis named 3 for object type <class 'pandas.core.frame.DataFrame'>"
        with pytest.raises(ValueError, match=msg):
            df.dropna(axis=3)
Example #43
def data_clean(joined: pd.DataFrame) -> pd.DataFrame:
    """[function currently does basic na forward
    filling and conversion of variables to useful types.
    I also drop a bunch of columns that either are entirely null or
    duplicate columns; the data source seems to be weirdly processed]

    Arguments:
        joined {df} -- [original df from kaggle download
        https://www.kaggle.com/init27/fastai-v3-rossman-data-clean]

    Returns:
        [df] -- [cleaned df]
    """
    joined.loc[:, weather_vars] = joined.loc[:, weather_vars].fillna(
        method="ffill"
    )

    weather_vars.append("Events")

    # some of the initial Max_Gust_Speed Data was missing
    # so I filled with the Max_wind Speed.
    joined.loc[
        joined["Max_Gust_SpeedKm_h"].isna(), "Max_Gust_SpeedKm_h"
    ] = joined.loc[joined["Max_Gust_SpeedKm_h"].isna(), "Max_Wind_SpeedKm_h"]

    #  change text data into categories, as codes.
    joined["Events"] = joined["Events"].astype("category").cat.codes + 1
    joined["Store"] = joined["Store"] - 1
    joined["DayOfWeek"] = joined["DayOfWeek"] - 1
    joined["Week"] = joined["Week"] - 1
    joined["Assortment"] = joined["Assortment"].astype("category").cat.codes
    joined["State"] = joined["State"].astype("category").cat.codes
    joined["WindDirDegrees"] = (
        joined["WindDirDegrees"].astype("category").cat.codes
    )
    joined["StoreType"] = joined["StoreType"].astype("category").cat.codes

    # Drop variables that didn't look useful.
    joined.drop(
        [
            "Promo2Since",
            "Year",
            "Month",
            "Day",
            "PromoInterval",
            "StateName",
            "file_DE",
            "State_DE",
            "Dayofweek_DE",
            "Day_DE",
            "Date",
            "Is_quarter_end",
            "Is_month_end_DE",
            "Is_year_start",
            "week",
            "file",
            "Month_DE",
            "week_DE",
            "Dayofyear_DE",
            "CompetitionOpenSince",
            "Date_DE",
            "Elapsed_DE",
            "CompetitionDistance",
        ],
        axis=1,
        inplace=True,
    )
    if "Id" in joined.keys():
        joined.drop("Id", axis=1, inplace=True)

    # check the keys. Make sure that we don't have a miss match
    # between keys in list and dataframe.
    a = set(joined.keys())
    total_keys = cat_vars.copy()
    total_keys.extend(cont_vars)
    b = set(total_keys)
    c = a.difference(b)
    assert not c

    # convert booleans to ints.
    joined[joined.select_dtypes(include="bool").keys()] = joined.select_dtypes(
        include="bool"
    ).astype("int")

    # change to floats.
    joined[cont_vars] = joined[cont_vars].astype("float")
    joined.dropna(axis=0, inplace=True)
    return joined
Example #44
def drop_na_records(table: DataFrame, keys: List[str]) -> DataFrame:
    """ Drops all records which have no data outside of the provided keys """
    value_columns = [col for col in table.columns if not col in keys]
    return table.dropna(subset=value_columns, how="all")
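A brief usage sketch of drop_na_records (data hypothetical): a record whose only values sit in the key columns is dropped:

import numpy as np
from pandas import DataFrame

table = DataFrame({
    'key': ['a', 'b'],
    'date': ['2020-01-01', '2020-01-02'],
    'cases': [10.0, np.nan],
    'deaths': [1.0, np.nan],
})
drop_na_records(table, ['key', 'date'])  # keeps only the 'a' row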
Example #45
def describe(title, df: pd.DataFrame) -> dict:
    """Calculate the statistics for each series in this DataFrame.

    Args:
        df: DataFrame.

    Returns:
        This function returns a dictionary containing:
            - table: overall statistics.
            - variables: descriptions per series.
            - correlations: correlation matrices.
            - missing: missing value diagrams.
            - messages: direct special attention to these patterns in your data.
            - package: package details.
            :param title:
    """

    if df is None:
        raise ValueError("Can not describe a `lazy` ProfileReport without a DataFrame.")

    if not isinstance(df, pd.DataFrame):
        warnings.warn("df is not of type pandas.DataFrame")

    if df.empty:
        raise ValueError("df can not be empty")

    disable_progress_bar = not config["progress_bar"].get(bool)

    date_start = datetime.utcnow()

    correlation_names = [
        correlation_name
        for correlation_name in ["pearson", "spearman", "kendall", "phi_k", "cramers",]
        if config["correlations"][correlation_name]["calculate"].get(bool)
    ]

    number_of_tasks = 9 + len(df.columns) + len(correlation_names)

    with tqdm(
        total=number_of_tasks, desc="Summarize dataset", disable=disable_progress_bar
    ) as pbar:
        series_description = get_series_descriptions(df, pbar)

        pbar.set_postfix_str("Get variable types")
        variables = {
            column: description["type"]
            for column, description in series_description.items()
        }
        pbar.update()

        # Transform the series_description in a DataFrame
        pbar.set_postfix_str("Get variable statistics")
        variable_stats = pd.DataFrame(series_description)
        pbar.update()

        # Get correlations
        correlations = {}
        for correlation_name in correlation_names:
            pbar.set_postfix_str(f"Calculate {correlation_name} correlation")
            correlations[correlation_name] = calculate_correlation(
                df, variables, correlation_name
            )
            pbar.update()

        # Make sure correlations is not None
        correlations = {
            key: value for key, value in correlations.items() if value is not None
        }

        # Scatter matrix
        pbar.set_postfix_str("Get scatter matrix")
        scatter_matrix = get_scatter_matrix(df, variables)
        pbar.update()

        # Table statistics
        pbar.set_postfix_str("Get table statistics")
        table_stats = get_table_stats(df, variable_stats)
        pbar.update()

        # Missing diagrams
        pbar.set_postfix_str("Get missing diagrams")
        missing = get_missing_diagrams(df, table_stats)
        pbar.update()

        # Sample
        pbar.set_postfix_str("Take sample")
        sample = get_sample(df)
        pbar.update()

        # Duplicates
        pbar.set_postfix_str("Locating duplicates")
        supported_columns = [
            key
            for key, value in series_description.items()
            if value["type"] != Variable.S_TYPE_UNSUPPORTED
        ]
        duplicates = get_duplicates(df, supported_columns)
        pbar.update()

        # Clusters
        pbar.set_postfix_str("Searching for clusters")
        categoricals = [column_name for column_name, variable_type in variables.items() if variable_type == Variable.TYPE_CAT]
        df_without_missing = df.dropna()
        df_ohe = pd.concat([df_without_missing.drop(categoricals, axis=1), pd.get_dummies(df_without_missing[categoricals])], axis=1).reset_index()
        clusters = {
            name: pd.concat([df_ohe, pd.DataFrame({"Cluster": eval(clustering).fit(df_ohe).labels_})], axis=1)
            for name, clustering in config["clusters"]["clusterings"].get()
        }

        # Outliers
        pbar.set_postfix_str("Detecting outliers")
        outliers = {
            name: pd.concat([df_ohe, pd.DataFrame({"Outlier": eval(detector).fit_predict(df_ohe)})], axis=1)
            for name, detector in config["outliers"]["detectors"].get()
        }

        # Messages
        pbar.set_postfix_str("Get messages/warnings")
        messages = get_messages(table_stats, series_description, correlations)
        pbar.update()

        pbar.set_postfix_str("Get reproduction details")
        package = {
            "pandas_profiling_version": VERSION,
            "pandas_profiling_config": config.dump(),
        }
        pbar.update()

        pbar.set_postfix_str("Completed")

    date_end = datetime.utcnow()

    analysis = {
        "title": title,
        "date_start": date_start,
        "date_end": date_end,
        "duration": date_end - date_start,
    }

    return {
        # Analysis metadata
        "analysis": analysis,
        # Overall dataset description
        "table": table_stats,
        # Per variable descriptions
        "variables": series_description,
        # Bivariate relations
        "scatter": scatter_matrix,
        # Correlation matrices
        "correlations": correlations,
        # Missing values
        "missing": missing,
        # Warnings
        "messages": messages,
        # Package
        "package": package,
        # Sample
        "sample": sample,
        # Duplicates
        "duplicates": duplicates,
        # Clusters
        "clusters": clusters,
        # Outliers
        "outliers": outliers
    }
Example #46
    def test_dropna(self):
        df = DataFrame(np.random.randn(6, 4))
        df.iloc[:2, 2] = np.nan

        dropped = df.dropna(axis=1)
        expected = df.loc[:, [0, 1, 3]]
        inp = df.copy()
        return_value = inp.dropna(axis=1, inplace=True)
        tm.assert_frame_equal(dropped, expected)
        tm.assert_frame_equal(inp, expected)
        assert return_value is None

        dropped = df.dropna(axis=0)
        expected = df.loc[list(range(2, 6))]
        inp = df.copy()
        return_value = inp.dropna(axis=0, inplace=True)
        tm.assert_frame_equal(dropped, expected)
        tm.assert_frame_equal(inp, expected)
        assert return_value is None

        # threshold
        dropped = df.dropna(axis=1, thresh=5)
        expected = df.loc[:, [0, 1, 3]]
        inp = df.copy()
        return_value = inp.dropna(axis=1, thresh=5, inplace=True)
        tm.assert_frame_equal(dropped, expected)
        tm.assert_frame_equal(inp, expected)
        assert return_value is None

        dropped = df.dropna(axis=0, thresh=4)
        expected = df.loc[range(2, 6)]
        inp = df.copy()
        return_value = inp.dropna(axis=0, thresh=4, inplace=True)
        tm.assert_frame_equal(dropped, expected)
        tm.assert_frame_equal(inp, expected)
        assert return_value is None

        dropped = df.dropna(axis=1, thresh=4)
        tm.assert_frame_equal(dropped, df)

        dropped = df.dropna(axis=1, thresh=3)
        tm.assert_frame_equal(dropped, df)

        # subset
        dropped = df.dropna(axis=0, subset=[0, 1, 3])
        inp = df.copy()
        return_value = inp.dropna(axis=0, subset=[0, 1, 3], inplace=True)
        tm.assert_frame_equal(dropped, df)
        tm.assert_frame_equal(inp, df)
        assert return_value is None

        # all
        dropped = df.dropna(axis=1, how="all")
        tm.assert_frame_equal(dropped, df)

        df[2] = np.nan
        dropped = df.dropna(axis=1, how="all")
        expected = df.loc[:, [0, 1, 3]]
        tm.assert_frame_equal(dropped, expected)

        # bad input
        msg = "No axis named 3 for object type DataFrame"
        with pytest.raises(ValueError, match=msg):
            df.dropna(axis=3)
    def predict(self, user_score: pd.DataFrame, course_need_pre: list = None):
        #   clear user score
        if len(user_score) > 1:
            user_score = user_score[0:1]

        # clear input data
        user_score = user_score.dropna(axis=1)
        valid_course = list(set(user_score.columns) & set(self.data.columns))
        user_score = user_score[valid_course]
        del valid_course

        # course_need_pre == none -> all course
        if course_need_pre is None or len(course_need_pre) < 1:
            course_need_pre = set(self.data.columns)
        else:
            course_need_pre = [
                c for c in course_need_pre if c in self.data.columns
            ]

        course_need_pre = list(set(course_need_pre) - set(user_score.columns))
        all_course = list(
            set(list(user_score.columns) + list(course_need_pre)))

        train_data = self.data.loc[:, all_course].dropna(axis=0, thresh=2)
        # end of data cleaning

        # preprocessing: normalize and fill nan
        train_for_sim, filled_matrix = process.fillNan(train_data,
                                                       type=self.nor_type)

        # compute the similarity between the user and the training students
        similarity_df = self.sim(train_for_sim.loc[:, user_score.columns],
                                 user_score)
        del train_for_sim

        # normalize
        train_nor, train_avg = process.normalize(train_data,
                                                 'row_avg')  # row_avg
        user_score_nor, user_avg = process.normalize(user_score, 'row_avg')

        # predict scores: initialize pre_score_nor
        pre_score_nor = pd.DataFrame(columns=course_need_pre,
                                     index=user_score.index)

        for col in course_need_pre:
            # take the real (non-NaN) scores for this course
            score_series = train_nor.loc[:, col].dropna()

            # similarities with the students who have a score in score_series
            k_sim = similarity_df.loc[score_series.index, user_score.index[0]]

            if self.k is not None:
                k_sim = k_sim.nlargest(self.k)
                score_series = score_series.loc[k_sim.index.tolist()]

            # compute the predicted score
            pre_score_nor.loc[
                user_score.index,
                col] = k_sim.mul(score_series).sum() / k_sim.sum()

        # unnormalize pre_score_nor
        pre_score = process.unnormalize(pre_score_nor, user_avg, 'row_avg')

        # fix up the predicted scores
        return process.formal_score(pre_score)
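
A hypothetical call of this collaborative-filtering predict method might look like the following; the instance name `recommender`, its training data and the course names are assumptions, only the argument shapes follow from the code above:

import pandas as pd

# one-row frame indexed by the target student; columns are courses already taken
user_score = pd.DataFrame({"Calculus": [8.5], "Physics": [7.0]}, index=["student_42"])

# `recommender` is assumed to be an already-constructed instance of the class above
predicted = recommender.predict(user_score, course_need_pre=["Statistics", "Algebra"])
print(predicted)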
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """
    dtype df: dataframe
    rtype df: dataframe
    """
    df = df.dropna(subset=[
        'FOB.VALUE', 'TOTAL.TAXES'
    ])  # Remove the 170 rows that do not have FOB/CIF values.
    df.loc[:, 'Unitprice'] = df['CIF.VALUE'] / df['QUANTITY']
    df.loc[:, 'WUnitprice'] = df['CIF.VALUE'] / df['GROSS.WEIGHT']
    df.loc[:, 'TaxRatio'] = df['TOTAL.TAXES'] / df['CIF.VALUE']
    df.loc[:, 'TaxUnitquantity'] = df['TOTAL.TAXES'] / df['QUANTITY']
    df.loc[:, 'FOBCIFRatio'] = df['FOB.VALUE'] / df['CIF.VALUE']
    df.loc[:, 'HS6'] = df['TARIFF.CODE'].apply(lambda x: int(x // 10000))
    df.loc[:, 'HS4'] = df['HS6'].apply(lambda x: int(x // 100))
    df.loc[:, 'HS2'] = df['HS4'].apply(lambda x: int(x // 100))
    # Build combined categorical features
    df.loc[:, 'HS6.Origin'] = [
        str(i) + '&' + j for i, j in zip(df['HS6'], df['ISO3'])
    ]

    # A general helper "merge_attributes" supports any combination of features.
    # Generating all possible combinations was also tried, but the final AUC was
    # lower than with just the three combinations activated below.
    #     candFeaturesCombine = ['OFFICE','IMPORTER.TIN','ISO3','HS6','DECLARANT.CODE']
    #     for subset in combinations(candFeaturesCombine, 2):
    #         merge_attributes(df, *subset)
    #
    #     for subset in combinations(candFeaturesCombine, 3):
    #         merge_attributes(df, *subset)

    merge_attributes(df, 'OFFICE', 'IMPORTER.TIN')
    merge_attributes(df, 'OFFICE', 'HS6')
    merge_attributes(df, 'OFFICE', 'ISO3')

    # Day of Year of SGD.DATE
    tmp2 = {}
    for date in set(df['SGD.DATE']):
        tmp2[date] = dt.strptime(date, '%y-%m-%d')
    tmp_day = {}
    tmp_week = {}
    tmp_month = {}
    # NOTE: `date` is the last key from the loop above, so all dates are assumed to fall in the same year
    yearStart = dt(tmp2[date].date().year, 1, 1)
    for item in tmp2:
        tmp_day[item] = (tmp2[item] - yearStart).days
        tmp_week[item] = int(tmp_day[item] / 7)
        tmp_month[item] = int(tmp_day[item] / 30)

    df.loc[:, 'SGD.DayofYear'] = df['SGD.DATE'].apply(lambda x: tmp_day[x])
    df.loc[:, 'SGD.WeekofYear'] = df['SGD.DATE'].apply(lambda x: tmp_week[x])
    df.loc[:, 'SGD.MonthofYear'] = df['SGD.DATE'].apply(lambda x: tmp_month[x])

    # RECEIPT-SGD time  # To-Do: handle rows where the receipt date is missing.
    tmp = {}
    for date in set(df['SGD.DATE']).union(set(df['RECEIPT.DATE'])):
        tmp[date] = dt.strptime(date, '%y-%m-%d')
    df.loc[:, 'RECEIPT.DATE-SGD.DATE'] = df['RECEIPT.DATE'].apply(
        lambda x: tmp[x]) - df['SGD.DATE'].apply(lambda x: tmp[x])
    df.loc[:, 'RECEIPT.DATE-SGD.DATE'] = df['RECEIPT.DATE-SGD.DATE'].apply(
        lambda x: x.days)

    return df
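
merge_attributes itself is not part of this snippet; a minimal sketch of what it presumably does, modeled on the HS6.Origin construction above (the combined-column naming is an assumption):

import pandas as pd

def merge_attributes(df: pd.DataFrame, *columns: str) -> None:
    # assumed behaviour: add one combined categorical column named "A&B&...",
    # whose values are the '&'-joined string values of the source columns
    name = '&'.join(columns)
    df.loc[:, name] = df[list(columns)].astype(str).apply('&'.join, axis=1)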
Example #49
 def pipe_filter_rows(self, df: pd.DataFrame) -> pd.DataFrame:
     return df.dropna(subset=["Daily change in cumulative total"])
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            #target_names = column_mapping.get('target_names')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [
                    name for name in num_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [
                    name for name in cat_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [
                date_column, id_column, target_column, prediction_column
            ]

            num_feature_names = list(
                set(reference_data.select_dtypes([np.number]).columns) -
                set(utility_columns))
            cat_feature_names = list(
                set(reference_data.select_dtypes([object]).columns) -  # np.object is deprecated
                set(utility_columns))

            #target_names = None

        if production_data is not None and target_column is not None and prediction_column is not None:
            production_data.replace([np.inf, -np.inf], np.nan, inplace=True)
            production_data.dropna(axis=0, how='any', inplace=True)

            if len(prediction_column) <= 2:
                binaraizer = preprocessing.LabelBinarizer()
                binaraizer.fit(production_data[target_column])
                binaraized_target = pd.DataFrame(
                    binaraizer.transform(production_data[target_column]))
                binaraized_target.columns = ['target']

                fpr, tpr, thrs = metrics.roc_curve(
                    binaraized_target, production_data[prediction_column[0]])
                fig = go.Figure()

                fig.add_trace(
                    go.Scatter(x=fpr,
                               y=tpr,
                               mode='lines',
                               name='ROC',
                               marker=dict(
                                   size=6,
                                   color=red,
                               )))

                fig.update_layout(yaxis_title="True Positive Rate",
                                  xaxis_title="False Positive Rate",
                                  showlegend=True)

                fig_json = json.loads(fig.to_json())

                self.wi = BaseWidgetInfo(
                    title=self.title,
                    type="big_graph",
                    details="",
                    alertStats=AlertStats(),
                    alerts=[],
                    alertsPosition="row",
                    insights=[],
                    size=1,
                    params={
                        "data": fig_json['data'],
                        "layout": fig_json['layout']
                    },
                    additionalGraphs=[],
                )

            else:
                binaraizer = preprocessing.LabelBinarizer()
                binaraizer.fit(production_data[target_column])
                binaraized_target = pd.DataFrame(
                    binaraizer.transform(production_data[target_column]))
                binaraized_target.columns = prediction_column
                #plot support bar
                graphs = []

                for label in prediction_column:
                    fpr, tpr, thrs = metrics.roc_curve(
                        binaraized_target[label], production_data[label])
                    fig = go.Figure()

                    fig.add_trace(
                        go.Scatter(x=fpr,
                                   y=tpr,
                                   mode='lines',
                                   name='ROC',
                                   marker=dict(
                                       size=6,
                                       color=red,
                                   )))

                    fig.update_layout(yaxis_title="True Positive Rate",
                                      xaxis_title="False Positive Rate",
                                      showlegend=True)

                    fig_json = json.loads(fig.to_json())

                    graphs.append({
                        "id": "tab_" + str(label),
                        "title": str(label),
                        "graph": {
                            "data": fig_json["data"],
                            "layout": fig_json["layout"],
                        }
                    })

                self.wi = BaseWidgetInfo(
                    title=self.title,
                    type="tabbed_graph",
                    details="",
                    alertStats=AlertStats(),
                    alerts=[],
                    alertsPosition="row",
                    insights=[],
                    size=1,
                    params={"graphs": graphs},
                    additionalGraphs=[],
                )
        else:
            self.wi = None
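
For reference, the ROC points plotted above come straight from sklearn's metrics.roc_curve; a self-contained sketch with made-up labels and scores:

import numpy as np
from sklearn import metrics

y_true = np.array([0, 0, 1, 1])
y_score = np.array([0.1, 0.4, 0.35, 0.8])
fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score)
print(fpr, tpr, thresholds)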
Example #51
def courses_computed(
    courses: pd.DataFrame,
    listings: pd.DataFrame,
    evaluation_statistics: pd.DataFrame,
    course_professors: pd.DataFrame,
) -> pd.DataFrame:
    """
    Populates computed course rating fields:
        average_rating:
            Average course rating over all past instances.
        average_workload:
            Average course workload over all past instances.

    Also populates last-offered course fields:
        last_offered_course_id:
            course_id of the most recent previous offering.
        last_enrollment_course_id:
            course_id of the most recent previous offering with enrollment statistics.
        last_enrollment:
            Number of students in most recent previous offering with enrollment statistics.
        last_enrollment_season_code:
            Season of recent previous offering with enrollment statistics.
        last_enrollment_same_professors:
            If recent previous offering with enrollment statistics was with same professors.

    Parameters
    ----------
    Pandas tables post-import:
        courses
        listings
        evaluation_statistics
        course_professors

    Returns
    -------
    courses:
        Table with computed fields.
    """
    listings = listings.copy(deep=True)
    evaluation_statistics = evaluation_statistics.copy(deep=True)
    course_professors = course_professors.copy(deep=True)

    (
        course_to_same_course,
        same_course_to_courses,
        course_to_same_course_filtered,
        same_course_to_courses_filtered,
    ) = resolve_historical_courses(courses, listings)

    # partition ID of same-codes courses (not used anymore, useful for debugging)
    courses["shared_code_id"] = courses["course_id"].apply(
        course_to_same_course.get)
    # connected courses with the same code (not used anymore, useful for debugging)
    courses["shared_code_courses"] = courses["shared_code_id"].apply(
        same_course_to_courses.get)

    # unique ID for each partition of the same courses
    courses["same_course_id"] = courses["course_id"].apply(
        course_to_same_course_filtered.get)

    # list of course_ids that are the same course per course_id
    courses["same_courses"] = courses["same_course_id"].apply(
        same_course_to_courses_filtered.get)

    # split same-course partition by same-professors
    course_to_same_prof_course, same_prof_course_to_courses = split_same_professors(
        course_to_same_course_filtered, course_professors)

    # unique ID for each partition of the same courses taught by the same set of profs
    courses["same_course_and_profs_id"] = courses["course_id"].apply(
        course_to_same_prof_course.get)

    # list of course_ids that are the same course and taught by same profs per course_id
    courses["same_courses_and_profs"] = courses[
        "same_course_and_profs_id"].apply(same_prof_course_to_courses.get)

    # map course_id to professor_ids
    # use frozenset because it is hashable (set is not), needed for groupby
    course_to_professors = course_professors.groupby(
        "course_id")[  # type: ignore
            "professor_id"].apply(frozenset)

    # get historical offerings with same professors
    listings["professors"] = listings["course_id"].apply(
        course_to_professors.get)
    courses["professors"] = courses["course_id"].apply(
        course_to_professors.get)

    print("Computing last offering statistics")

    # course_id for all evaluated courses
    evaluated_courses = set(
        evaluation_statistics.dropna(subset=["enrolled"], axis=0)["course_id"])

    # map course_id to season
    course_to_season = dict(zip(courses["course_id"], courses["season_code"]))

    # map course_id to number enrolled
    course_to_enrollment = dict(
        zip(evaluation_statistics["course_id"],
            evaluation_statistics["enrolled"]))

    # get last course offering in general (with or without enrollment)
    def get_last_offered(course_row):
        same_courses = course_row["same_courses"]

        same_courses = [
            x for x in same_courses
            if course_to_season[x] < course_row["season_code"]
        ]

        if len(same_courses) == 0:
            return None

        same_courses = [
            x for x in same_courses if x != course_row["course_id"]
        ]
        if len(same_courses) == 0:
            return None

        last_offered_course = max(same_courses,
                                  key=lambda x: course_to_season[x])

        return last_offered_course

    # helper function for getting enrollment fields of last-offered course
    def get_last_offered_enrollment(course_row):
        same_courses = course_row["same_courses"]

        # keep course only if distinct, has enrollment statistics, and is before current
        same_courses = [
            x for x in same_courses if x in evaluated_courses
            and course_to_season[x] < course_row["season_code"]
        ]
        if len(same_courses) == 0:
            return [None, None, None, None]
        same_courses = [
            x for x in same_courses if x != course_row["course_id"]
        ]
        if len(same_courses) == 0:
            return [None, None, None, None]

        current_professors = course_to_professors.get(course_row["course_id"],
                                                      set())

        # sort courses newest-first
        same_courses = sorted(same_courses,
                              key=lambda x: course_to_season[x],
                              reverse=True)

        # get the newest course with the same professors, otherwise just the newest course
        last_enrollment_course = next(
            (prev_course
             for prev_course in same_courses if course_to_professors.get(
                 prev_course, set()) == current_professors),
            # default to newest course if no previous course has same profs
            same_courses[0],
        )

        # number of students last taking course
        last_enrollment = course_to_enrollment[last_enrollment_course]
        # season for last enrollment
        last_enrollment_season = course_to_season[last_enrollment_course]
        # professors for last enrollment
        last_enrollment_professors = course_to_professors.get(
            last_enrollment_course, set())

        # if last enrollment is with same professors
        last_enrollment_same_professors = (
            last_enrollment_professors == current_professors)

        return (
            last_enrollment_course,
            last_enrollment,
            last_enrollment_season,
            last_enrollment_same_professors,
        )

    tqdm.pandas(desc="Finding last-offered course")
    courses["last_offered_course_id"] = courses.progress_apply(  # type: ignore
        get_last_offered, axis=1)

    tqdm.pandas(desc="Finding last-offered enrollment")
    # getting last-offered enrollment
    (
        courses["last_enrollment_course_id"],
        courses["last_enrollment"],
        courses["last_enrollment_season_code"],
        courses["last_enrollment_same_professors"],
    ) = zip(*courses.progress_apply(get_last_offered_enrollment,
                                    axis=1)  # type: ignore
            )

    print("Computing historical ratings for courses")

    # map courses to ratings
    course_to_overall = dict(
        zip(evaluation_statistics["course_id"],
            evaluation_statistics["avg_rating"]))
    course_to_workload = dict(
        zip(evaluation_statistics["course_id"],
            evaluation_statistics["avg_workload"]))

    # get ratings
    courses["average_rating"] = courses["same_courses"].apply(
        lambda courses: [course_to_overall.get(x) for x in courses])
    courses["average_workload"] = courses["same_courses"].apply(
        lambda courses: [course_to_workload.get(x) for x in courses])

    courses["average_rating_same_professors"] = courses[
        "same_courses_and_profs"].apply(
            lambda courses: [course_to_overall.get(x) for x in courses])
    courses["average_workload_same_professors"] = courses[
        "same_courses_and_profs"].apply(
            lambda courses: [course_to_workload.get(x) for x in courses])

    # calculate the average of an array
    def average(nums):
        nums = list(filter(lambda x: x is not None, nums))
        nums = list(filter(lambda x: not math.isnan(x), nums))
        if not nums:
            return [None, None]
        num_obs = len(nums)
        return (sum(nums) / num_obs, num_obs)

    # calculate averages over past offerings
    for average_col, num_col in [
        ("average_rating", "average_rating_n"),
        ("average_workload", "average_workload_n"),
        ("average_rating_same_professors", "average_rating_same_professors_n"),
        ("average_workload_same_professors",
         "average_workload_same_professors_n"),
    ]:
        courses[average_col], courses[num_col] = zip(
            *courses[average_col].apply(average))

    # remove intermediate columns
    courses = courses.loc[:, get_table_columns(database.models.Course)]

    return courses
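
The zip(*Series.apply(...)) idiom used above to unpack a tuple-returning function into several columns also works on its own; a small sketch with a made-up frame:

import pandas as pd

toy = pd.DataFrame({"ratings": [[4.2, 3.8], [5.0], []]})

def average(nums):
    nums = [x for x in nums if x is not None]
    if not nums:
        return (None, 0)
    return (sum(nums) / len(nums), len(nums))

toy["average_rating"], toy["average_rating_n"] = zip(*toy["ratings"].apply(average))
print(toy)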
Example #52
import numpy as np
from pandas import DataFrame

raw_data = {'first_name': ['Jason', np.nan, 'Tina', 'Jake', 'Amy'],
        'last_name': ['Miller', np.nan, 'Ali', 'Milner', 'Cooze'],
        'age': [42, np.nan, 36, 24, 73],
        'sex': ['m', np.nan, 'f', 'm', 'f'],
        'preTestScore': [4, np.nan, np.nan, 2, 3],
        'postTestScore': [25, np.nan, np.nan, 62, 70]}
1. Create a DataFrame df from raw_data with the columns ['first_name', 'last_name', 'age', 'sex', 'preTestScore', 'postTestScore']

df = DataFrame(data=raw_data, columns=['first_name', 'last_name', 'age', 'sex', 'preTestScore', 'postTestScore'])
df
2. Count the NaN values in each column
df.isnull().sum()
3. Using step 2, compute the proportion of NaN values in each column
df.isnull().sum()/len(df)

4. Store in df_no_missing only the rows that contain no NaN at all
df_no_missing = df.dropna()
df_no_missing
5. Store in df_cleaned the frame with the all-NaN rows removed
df_cleaned = df.dropna(how='all')
df_cleaned
6. Drop only the rows that contain 3 or more NaNs (note: thresh=3 keeps rows with at least 3 non-NaN values).
df.dropna(thresh=3)
7. Replace the NaNs in df with 0
df.fillna(0)
8. Compute the mean of the preTestScore column
pre_mean = df.preTestScore.mean()
9. Fill the NaNs in preTestScore with that mean, while leaving df itself unchanged
df.preTestScore.fillna(pre_mean)
df
10. Fill the NaNs in preTestScore with that mean, this time storing the result in df itself
df["preTestScore"].fillna(pre_mean, inplace=True)
import pandas as pd
import numpy as np
from pandas import DataFrame, get_dummies
import keras
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.constraints import max_norm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

f = pd.read_csv('presidents-data-words-january-3-2018.csv')
df = DataFrame(f)
df = df.dropna(subset=['dagalb','nseg','nsyll','nstress','mean'])

early_stop = EarlyStopping(patience=5)

X_cols = ['widx','lexstress','nseg','nsyll','nstress','pos','dep','doc.freq','d.inform.3','corpus.freq','c.inform.3','category']
X = df[X_cols]
y = np.array(to_categorical(df.dagalb))

cat_cols = ['lexstress','pos','dep','category']
scale_cols = ['widx','nseg','nsyll','nstress','doc.freq','d.inform.3','corpus.freq','c.inform.3']

for c in cat_cols:
    dum = pd.get_dummies(X[c], columns=[c], prefix=c)
    X = pd.concat([dum, X], axis=1)
    del(X[c])
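
The snippet above stops before the scaling and model-definition steps its imports suggest; a plausible continuation, with layer sizes and hyperparameters that are guesses rather than part of the original:

# hypothetical continuation; the architecture and hyperparameters are assumptions
scaler = MinMaxScaler()
X[scale_cols] = scaler.fit_transform(X[scale_cols])

X_train, X_test, y_train, y_test = train_test_split(X.values, y, test_size=0.2)

model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],),
                kernel_constraint=max_norm(3)))
model.add(Dropout(0.3))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, validation_data=(X_test, y_test),
          epochs=100, callbacks=[early_stop])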
Example #54
rets.head()
prices.plot()
plt.show()
sns.heatmap(rets.corr())
plt.show()
rets.corr()

data = Series(['one', 'two', np.nan, 'four'])
data
dframe = DataFrame([[1, 2, 3], [np.nan, 5, 6], [7, np.nan, 9],
                    [np.nan, np.nan, np.nan]])
dframe
dframe2 = DataFrame([[1, 2, 3, nan], [2, nan, 5, 6], [nan, 7, nan, 9],
                     [1, nan, nan, nan]])
dframe2
dframe2.dropna(thresh=2)
dframe2.fillna({0: 'a', 1: 'b', 2: 'c', 3: 'd'})
ser = Series(np.random.randn(6),
             index=[[1, 1, 1, 2, 2, 2], ['a', 'b', 'c', 'a', 'b', 'c']])
ser
ser.index
ser[1]
ser[2]
ser[:, 'a']
dframe = ser.unstack()
dframe
dframe
dframe.T.unstack()
dframe2 = DataFrame(np.arange(16).reshape(4, 4),
                    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                    columns=[['NY', 'NY', 'LA', 'SF'],
    def calculate(self, reference_data: pd.DataFrame,
                  production_data: pd.DataFrame, column_mapping):
        if column_mapping:
            date_column = column_mapping.get('datetime')
            id_column = column_mapping.get('id')
            target_column = column_mapping.get('target')
            prediction_column = column_mapping.get('prediction')
            num_feature_names = column_mapping.get('numerical_features')
            if num_feature_names is None:
                num_feature_names = []
            else:
                num_feature_names = [
                    name for name in num_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

            cat_feature_names = column_mapping.get('categorical_features')
            if cat_feature_names is None:
                cat_feature_names = []
            else:
                cat_feature_names = [
                    name for name in cat_feature_names
                    if is_numeric_dtype(reference_data[name])
                ]

        else:
            date_column = 'datetime' if 'datetime' in reference_data.columns else None
            id_column = None
            target_column = 'target' if 'target' in reference_data.columns else None
            prediction_column = 'prediction' if 'prediction' in reference_data.columns else None

            utility_columns = [
                date_column, id_column, target_column, prediction_column
            ]

            num_feature_names = list(
                set(reference_data.select_dtypes([np.number]).columns) -
                set(utility_columns))
            cat_feature_names = list(
                set(reference_data.select_dtypes([object]).columns) -  # np.object is deprecated
                set(utility_columns))

        if production_data is not None:
            if target_column is not None and prediction_column is not None:
                production_data.replace([np.inf, -np.inf],
                                        np.nan,
                                        inplace=True)
                production_data.dropna(axis=0, how='any', inplace=True)

                binaraizer = preprocessing.LabelBinarizer()
                binaraizer.fit(reference_data[target_column])
                binaraized_target = binaraizer.transform(
                    production_data[target_column])

                array_prediction = production_data[prediction_column].to_numpy(
                )

                prediction_ids = np.argmax(array_prediction, axis=-1)
                prediction_labels = [
                    prediction_column[x] for x in prediction_ids
                ]

                #calculate quality metrics
                if len(prediction_column) > 2:
                    roc_auc = metrics.roc_auc_score(binaraized_target,
                                                    array_prediction,
                                                    average='macro')
                    log_loss = metrics.log_loss(binaraized_target,
                                                array_prediction)
                else:
                    roc_auc = metrics.roc_auc_score(
                        binaraized_target,
                        production_data[prediction_column[0]])  #problem!!!
                    log_loss = metrics.log_loss(
                        binaraized_target,
                        production_data[prediction_column[0]])  #problem!!!

                accuracy_score = metrics.accuracy_score(
                    production_data[target_column], prediction_labels)
                avg_precision = metrics.precision_score(
                    production_data[target_column],
                    prediction_labels,
                    average='macro')
                avg_recall = metrics.recall_score(
                    production_data[target_column],
                    prediction_labels,
                    average='macro')
                avg_f1 = metrics.f1_score(production_data[target_column],
                                          prediction_labels,
                                          average='macro')

                self.wi = BaseWidgetInfo(
                    title=self.title,
                    type="counter",
                    details="",
                    alertStats=AlertStats(),
                    alerts=[],
                    alertsPosition="row",
                    insights=[],
                    size=2,
                    params={
                        "counters": [{
                            "value": str(round(accuracy_score, 3)),
                            "label": "Accuracy"
                        }, {
                            "value": str(round(avg_precision, 3)),
                            "label": "Precision"
                        }, {
                            "value": str(round(avg_recall, 3)),
                            "label": "Recall"
                        }, {
                            "value": str(round(avg_f1, 3)),
                            "label": "F1"
                        }, {
                            "value": str(round(roc_auc, 3)),
                            "label": "ROC AUC"
                        }, {
                            "value": str(round(log_loss, 3)),
                            "label": "LogLoss"
                        }]
                    },
                    additionalGraphs=[],
                )
            else:
                self.wi = None
        else:
            self.wi = None
Example #56
def df2ecl(
    grid_df: pd.DataFrame,
    keywords: Union[str, List[str]],
    eclfiles: Optional[EclFiles] = None,
    dtype: Optional[Type] = None,
    filename: Optional[str] = None,
    nocomments: bool = False,
) -> str:
    """
    Write an include file with grid data keyword, like PERMX, PORO,
    FIPNUM etc, for the GRID section of the Eclipse deck.

    Output (returned as string and optionally written to file) will then
    contain f.ex::

        PERMX
           3.3 4.1 500.1 8543.0 1223.0 5022.0
           411.455 4433.9
        /

    if the grid contains 8 cells (inactive and active).

    Args:
        grid_df: Dataframe with the keyword for which
            we want to export data, and also a column with GLOBAL_INDEX.
            Without GLOBAL_INDEX, the output will likely be invalid.
            The grid can contain both active and inactive cells.
        keywords: The keyword(s) to export, with one
            value for every cell.
        eclfiles: If provided, the total cell count for the grid
            will be requested from this object. If not, it will be *guessed*
            from the maximum number of GLOBAL_INDEX, which can be under-estimated
            in the corner-case that the last cells are inactive.
        dtype: If provided, the columns which are
            outputted are converted to int or float. Dataframe columns
            read from CSV files easily get the wrong type, while Eclipse
            might require some data to be strictly integer.
        filename: If provided, the string produced will also be
            written to this filename.
        nocomments: Set to True to avoid any comments being written. Defaults
            to False.
    """
    if isinstance(keywords, str):
        keywords = [keywords]

    if isinstance(dtype, str):
        if dtype.startswith("int"):
            dtype = int
        elif dtype.startswith("float"):
            dtype = float
        else:
            raise ValueError(f"Wrong dtype argument {dtype}")

    # Figure out the total number of cells we need to export data for:
    global_size = None
    active_cells = None
    if eclfiles is not None:
        if eclfiles.get_egrid() is not None:
            global_size = eclfiles.get_egrid().get_global_size()
            active_cells = eclfiles.get_egrid().getNumActive()

    if "GLOBAL_INDEX" not in grid_df:
        logger.warning(("Global index not found in grid dataframe. "
                        "Assumes all cells are active"))
        # Drop NaN rows for columns to be used (triggered by stacked
        # dates and no global index, unlikely)
        # Also copy dataframe to avoid side-effects on incoming data.
        grid_df = grid_df.dropna(
            axis="rows",
            subset=[keyword for keyword in keywords if keyword in grid_df])
        grid_df["GLOBAL_INDEX"] = grid_df.index

    if global_size is None:
        global_size = int(grid_df["GLOBAL_INDEX"].max() + 1)
        active_cells = len(grid_df[grid_df.index >= 0])
        logger.warning("Global grid size estimated to %s", str(global_size))

    ecl2df_header = ("Output file printed by " + "ecl2df.grid " + __version__ +
                     "\n" + " at " + str(datetime.datetime.now()))

    string = ""
    if not nocomments:
        string += common.comment_formatter(ecl2df_header)
    string += "\n"

    # If we have NaNs in the dataframe, we will be more careful (costs memory)
    if grid_df.isna().any().any():
        grid_df = grid_df.dropna(
            axis="rows",
            subset=[keyword for keyword in keywords if keyword in grid_df])

    for keyword in keywords:
        if keyword not in grid_df.columns:
            raise ValueError(f"Keyword {keyword} not found in grid dataframe")
        vector = np.zeros(global_size)
        vector[grid_df["GLOBAL_INDEX"].astype(int).values] = grid_df[keyword]
        if dtype == int:
            vector = vector.astype(int)
        if dtype == float:
            vector = vector.astype(float)
        if len(vector) != global_size:
            logger.warning(
                ("Mismatch between dumped vector length "
                 "%d from df2ecl and assumed grid size %d"),
                len(vector),
                global_size,
            )
            logger.warning("Data will be dumped, but may error in simulator")
        strvector = "  ".join([str(x) for x in vector])
        strvector = common.runlength_eclcompress(strvector)

        string += keyword + "\n"
        indent = " " * 5
        string += "\n".join(
            textwrap.wrap(strvector,
                          initial_indent=indent,
                          subsequent_indent=indent,
                          width=70))
        string += "\n/"
        if not nocomments:
            string += (f" -- {keyword}: {active_cells} active cells, "
                       f"{global_size} total cell count\n")
        string += "\n"

    if filename is not None:
        Path(filename).parent.mkdir(parents=True, exist_ok=True)
        Path(filename).write_text(string, encoding="utf-8")
    return string
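
A minimal call might look like the following; the toy dataframe and keyword choice are made up, and real input would normally come from ecl2df's grid dataframe with a GLOBAL_INDEX column:

import pandas as pd

toy_grid = pd.DataFrame({
    "GLOBAL_INDEX": [0, 1, 2, 3],
    "PORO": [0.10, 0.25, 0.20, 0.15],
})
print(df2ecl(toy_grid, keywords="PORO", nocomments=True))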
Example #57
 def process_merged(self, data: pd.DataFrame) -> DataType:
     data.dropna(inplace=True)
     return data
Example #58
import pandas as pd
from pandas import DataFrame

#import data from Excel
ReadExcel = pd.read_excel(
    r'C:\Users\asus\Documents\Python-Vaje\ts-lubatruu.xlsx')
df = DataFrame(ReadExcel, columns=['Date', 'event_horizon', 'LUBATRUU_AR'])
df.dropna(inplace=True)

#event_horizon = -7
timeminus7 = df[df.event_horizon == -7]
#print (timeminus7)
listLUBATRUU_ARminus7 = timeminus7['LUBATRUU_AR']
#print (listLUBATRUU_ARminus7)
a = sum(listLUBATRUU_ARminus7)
#print (a)
b = len(listLUBATRUU_ARminus7)
#print (b)
c = a / b
#print (c) #AAR(-7)

#event_horizon = -6
timeminus6 = df[df.event_horizon == -6]
#print (timeminus6)
listLUBATRUU_ARminus6 = timeminus6['LUBATRUU_AR']
#print (listLUBATRUU_ARminus6)
d = sum(listLUBATRUU_ARminus6)
#print (d)
e = len(listLUBATRUU_ARminus6)
#print (e)
f = d / e
Example #59
        data[i] += noisyCount(sensitivety, epsilon)
    return data


if __name__ == '__main__':
    data = [[2, 2, 0, 0, 3, 0], [2, 0, 2, 2, 0, 1], [2, 3, 1, 3, 0, 0],
            [1, 1, 1, 0, 1, 0], [0, 1, 1, 3, 2, 1], [3, 3, 0, 1, 3, 0],
            [2, 2, 1, 1, 3, 0], [2, 0, 1, 1, 3, 0], [0, 0, 1, 3, 3, 0],
            [1, 0, 1, 0, 1, 0], [3, 2, 1, 3, 0, 2], [2, 3, 1, 0, 3, 0]]

    df = DataFrame(
        data,
        index=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12'],
        columns=['1', '2', '3', '4', '5', '6'])
    # print(df)
    df.dropna()  # NOTE: the result is discarded; dropna returns a new frame unless inplace=True is used
    x = df[0:1]
    print(x)
    w = cal_weight(x)

# if __name__ == '__main__':
# df = pd.read_csv('/Users/qiaoyanming/Desktop/工作簿2.csv', encoding='gb2312')
# # 2. Data preprocessing: remove records with missing values
# df.dropna()
# print(df)
# w = cal_weight(df)

# x = [1., 1., 0.]
# sensitivety = 1
# epsilon = 1
# data = laplace_mech(x, sensitivety, epsilon)
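
noisyCount and laplace_mech are not shown in this fragment; the standard Laplace-mechanism versions they presumably correspond to look like this (a sketch, not the original code; the misspelled parameter name is kept to match the calls above):

import numpy as np

def noisyCount(sensitivety, epsilon):
    # Laplace mechanism: noise scale = sensitivity / epsilon
    return np.random.laplace(loc=0.0, scale=sensitivety / epsilon)

def laplace_mech(data, sensitivety, epsilon):
    # add independent Laplace noise to every entry
    return [value + noisyCount(sensitivety, epsilon) for value in data]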
    data = data.replace(".", "")
    data = data.replace("m²", "")
    data = re.sub(re.compile(r" \D.*"), "", data)
    data = data.strip()
    return data


def get_firstlayer(data):
    fist_layer = data.split(",")[0]
    return fist_layer.strip()


def get_lastlayer(data):
    last_layer = data.split(",")[-1]
    return last_layer.strip()


wohnung_data_clean = wohnung_data.dropna(axis=0)
wohnung_data_clean["price"] = wohnung_data_clean["price"].apply(
    clean_pricesize)
wohnung_data_clean["size"] = wohnung_data_clean["size"].apply(clean_pricesize)
wohnung_data_clean["location_first"] = wohnung_data_clean["location"].apply(
    get_firstlayer)
wohnung_data_clean["location_last"] = wohnung_data_clean["location"].apply(
    get_lastlayer)

# the date uses "-" instead of "/" so the file name contains no path separators
wohnung_data_clean.to_csv("~/wohnung_data_clean_" + time.strftime("%d-%m-%Y") +
                          ".csv",
                          sep=";",
                          index=False)