Example #1
from datetime import datetime
from warnings import warn

import h5py
import numpy as np
from pandas import DataFrame

def checkFile(fn, satdata, beamisr, maxdtsec):
    """
    we need to find matching ISR beam IDs very near the time the satellite
    passes through the ISR beam.
    for speed, use Unix epoch time (seconds since Jan 1, 1970) for comparisons

    Note: the Madrigal HDF5 data is read in as a Numpy structured array

    Algorithm (not optimized):
    1) knowing what satellites will eventually intersect beams, are any of those beamids in this file?
    2) knowing what times intersections will occur, do those times exist in this file for those beams?
    3) For the beams that meet conditions 1 and 2, compute TEC by numerical integration of NE

    output:
    tecisr: 2-D DataFrame, beamid x time

    """
    h5p = '/Data/Table Layout'
    #rows: satellite.  cols: time
    intersections = satdata.loc[:,:,'intersect']
    intersections.dropna(axis=1,how='all',inplace=True)

    beamlist = beamisr['BEAMID'].values # have to make a copy to sort
    beamlist.sort()

    tecisr = DataFrame(index=beamlist, columns=intersections.columns)

    try:
        with h5py.File(fn,'r',libver='latest') as f:
            for t in intersections: #for each time...
                #mask for matching beam ids (not necessarily matching in time yet...)
                intmask = np.in1d(f[h5p]['beamid'].astype(int),intersections[t].dropna().astype(int))
                if not intmask.any(): #no overlap, no point in evaluating times
                    continue
                #mask for matching times (not necessarily matching beamids)
                timemask = np.absolute(f[h5p]['ut1_unix'] - (t.to_pydatetime()-datetime(1970,1,1)).total_seconds()) < maxdtsec

                #mask for where beamid and times "match"
                inttimemask = intmask & timemask
                #retrieve "good" rows of HDF5 that are the correct Beam ID(s) and time(s)
                intdata = f[h5p][inttimemask]

                #TODO not tested past this point
                #TODO account for the case where there are two times and one beam that overlap with the satellite.
                """
                intdata will have numerous rows corresponding to each matching time & beam id
                each row is a range cell. These rows will be numerically integrated over Ne.
                """
                uniqbeamid = np.unique(intdata['beamid']).astype(int)
                for b in uniqbeamid:
                    mask = np.isclose(intdata['beamid'], b) #this beam's rows, all range bins
                    mask &= np.isfinite(intdata['nel']) #dropna: keep only rows with finite Ne
                    tecisr.loc[b,t] = comptecisr(10**intdata['nel'][mask],
                                                 intdata['range'][mask])

    except ValueError as e:
        warn('{} does not seem to have the needed data fields.   {}'.format(fn,e))

    tecisr.dropna(axis=1,how='all',inplace=True) #only retain times with TEC data (most times have none)
    return tecisr
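
comptecisr is not shown in this example; a minimal sketch of what such a helper might look like, assuming Ne in m^-3, slant range in km, and output in TEC units (all three unit conventions are my assumptions, not the source's):

import numpy as np

def comptecisr(ne, srng):
    # Hypothetical implementation: trapezoidal integration of electron density
    # over slant range. Assumes ne in m^-3 and srng in km; returns TEC in
    # TECU (1 TECU = 1e16 electrons/m^2). These units are assumptions.
    order = np.argsort(srng)  # integrate along monotonically increasing range
    return np.trapz(ne[order], srng[order] * 1e3) / 1e16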
Example #2
    def test_column_dups2(self):

        # drop buggy GH 6240
        df = DataFrame({'A': np.random.randn(5),
                        'B': np.random.randn(5),
                        'C': np.random.randn(5),
                        'D': ['a', 'b', 'c', 'd', 'e']})

        expected = df.take([0, 1, 1], axis=1)
        df2 = df.take([2, 0, 1, 2, 1], axis=1)
        result = df2.drop('C', axis=1)
        assert_frame_equal(result, expected)

        # dropna
        df = DataFrame({'A': np.random.randn(5),
                        'B': np.random.randn(5),
                        'C': np.random.randn(5),
                        'D': ['a', 'b', 'c', 'd', 'e']})
        df.iloc[2, [0, 1, 2]] = np.nan
        df.iloc[0, 0] = np.nan
        df.iloc[1, 1] = np.nan
        df.iloc[:, 3] = np.nan
        expected = df.dropna(subset=['A', 'B', 'C'], how='all')
        expected.columns = ['A', 'A', 'B', 'C']

        df.columns = ['A', 'A', 'B', 'C']

        result = df.dropna(subset=['A', 'C'], how='all')
        assert_frame_equal(result, expected)
Example #3
def training(iden, Charg, Temps, use_cache_trainingset, test, verbose):
    '''Return the prediction function for a given site iden,
    consumption history Charg and temperatures Temps.'''
    if use_cache_trainingset:
        if test:
            X = pickle.load(open(CACHE_DIR+"X_test_"+iden+".p", "rb"))
        else:
            X = pickle.load(open(CACHE_DIR+"X_"+iden+".p", "rb"))
    else:
        X = DataFrame(Charg[iden])
        X = X.dropna(how='any')
        X['dayofweek'] = X.index.dayofweek
        X['Temps'] = Temps[iden].reindex(X.index)
        X['fracday'] = X.index.minute/60. + X.index.hour
        X['lastminutes'] = X[iden].reindex(X.index - 10*Minute()).values
        X['yesterday'] = X[iden].reindex(X.index - Day()).values
        X['yesterdaybis'] = X[iden].reindex(X.index - Day() - 10*Minute()).values
        X['lastweek'] = X[iden].reindex(X.index - Week()).values
        X['lastweekbis'] = X[iden].reindex(X.index - Week() - 10*Minute()).values
        if test:
            pickle.dump(X, open(CACHE_DIR+"X_test_"+iden+".p", "wb"))
        else:
            pickle.dump(X, open(CACHE_DIR+"X_"+iden+".p", "wb"))
    X = X.dropna(how='any')
    y = X[iden]
    X = X.drop(iden, axis=1)
    scalerX = preprocessing.StandardScaler().fit(X)
    ##############################
    clf = linear_model.SGDRegressor(alpha=0.000001, n_iter=3000)
    ##############################
    clf.fit(scalerX.transform(X), y)
    if verbose:
        print('Function for '+iden+' computed.')
    return lambda x: clf.predict(scalerX.transform(x))
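
A hypothetical call site for the function above (the site id and input frames are placeholders):

# Illustrative only: Charg and Temps are DataFrames keyed by site id.
predict = training('site42', Charg, Temps,
                   use_cache_trainingset=False, test=False, verbose=True)
yhat = predict(X_new)  # X_new needs the same feature columns as the training X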
Example #4
    def __read_data_values(self):
        """
        Reads the `Data Values` worksheet in a Time-Series excel file.
        :return:
        """
        sheet = self.workbook.get_sheet_by_name('Data Values')  # type: Worksheet
        dvs = self.__dv_row_generator(sheet.iter_rows())
        headers = next(dvs)
        df = DataFrame([dv for dv in dvs], columns=headers)
        df.dropna(how='all', inplace=True)
        self.tables['DataValues'] = df
Example #5
    def test_dropna(self):
        df = DataFrame(np.random.randn(6, 4))
        df[2][:2] = nan

        dropped = df.dropna(axis=1)
        expected = df.loc[:, [0, 1, 3]]
        inp = df.copy()
        inp.dropna(axis=1, inplace=True)
        assert_frame_equal(dropped, expected)
        assert_frame_equal(inp, expected)

        dropped = df.dropna(axis=0)
        expected = df.loc[lrange(2, 6)]
        inp = df.copy()
        inp.dropna(axis=0, inplace=True)
        assert_frame_equal(dropped, expected)
        assert_frame_equal(inp, expected)

        # threshold
        dropped = df.dropna(axis=1, thresh=5)
        expected = df.loc[:, [0, 1, 3]]
        inp = df.copy()
        inp.dropna(axis=1, thresh=5, inplace=True)
        assert_frame_equal(dropped, expected)
        assert_frame_equal(inp, expected)

        dropped = df.dropna(axis=0, thresh=4)
        expected = df.loc[lrange(2, 6)]
        inp = df.copy()
        inp.dropna(axis=0, thresh=4, inplace=True)
        assert_frame_equal(dropped, expected)
        assert_frame_equal(inp, expected)

        dropped = df.dropna(axis=1, thresh=4)
        assert_frame_equal(dropped, df)

        dropped = df.dropna(axis=1, thresh=3)
        assert_frame_equal(dropped, df)

        # subset
        dropped = df.dropna(axis=0, subset=[0, 1, 3])
        inp = df.copy()
        inp.dropna(axis=0, subset=[0, 1, 3], inplace=True)
        assert_frame_equal(dropped, df)
        assert_frame_equal(inp, df)

        # all
        dropped = df.dropna(axis=1, how='all')
        assert_frame_equal(dropped, df)

        df[2] = nan
        dropped = df.dropna(axis=1, how='all')
        expected = df.loc[:, [0, 1, 3]]
        assert_frame_equal(dropped, expected)

        # bad input
        pytest.raises(ValueError, df.dropna, axis=3)
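
For reference, the thresh semantics exercised by the test above (thresh is the minimum number of non-NaN values a row or column needs to survive), on a tiny frame:

import numpy as np
from pandas import DataFrame

df = DataFrame({'a': [1.0, np.nan, 3.0], 'b': [np.nan, np.nan, 6.0]})
print(df.dropna(axis=0, thresh=2))  # keeps only the last row (2 non-NaN values)
print(df.dropna(axis=1, thresh=2))  # keeps only column 'a'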
Example #6
import pandas as pd
from pandas import DataFrame

def parse_essentials(essentials_file, samples, normalization=None, cutoff=100):
    data = pd.read_csv(essentials_file, sep="\t", index_col=False)
    data = data[["Position"] + samples]
    data["sum"] = data[samples].sum(axis=1)
    data = data[data["sum"] < cutoff]
    data = data.groupby("Position").sum()
    for sample in samples:
        sample_data = DataFrame(None, index=data.index)
        if normalization is not None:
            sample_data["insertions"] = data[sample].apply(normalization)
        else:
            sample_data["insertions"] = data[sample]

        sample_data.dropna(inplace=True)
        yield sample_data
Example #7
import numpy as np
from pandas import Series, DataFrame

def pd_02():
    string_data = Series(['a', 'b', 'c', np.nan, 'e', None])
    print(string_data)
    print(string_data.isnull())
    print(string_data.dropna())
    df = DataFrame(np.random.randn(7, 3))
    df.loc[:4, 1] = np.nan
    df.loc[:2, 2] = np.nan
    print(df)
    print(df.dropna())
    print(df.fillna(0))
    print(df.fillna({1: 0.5, 2: -1}))
    print(df)
    df.fillna(0, inplace=True)
    print(df)
Example #8
def get_flights_from_route(cur, origin, destination):
    """
    Returns a dataframe for all flights matching origin, destination.
    """

    import datetime
    import time

    ### MySQL query
    time0 = time.time()
    cur.execute("SELECT Year, Month, DayofMonth, DayOfWeek, CRSDepTime, UniqueCarrier, ArrDelay FROM flights_100000 WHERE Origin = %s and Dest = %s;", (origin, destination))
    rows = cur.fetchall()
    td = time.time() - time0
    print('Database query took %.2f seconds.' % td)
    
    ### Convert to dataframe
    df = DataFrame(list(rows), columns=['Year', 'Month', 'DayOfMonth', 'DayOfWeek', 'CRSDepTime', 'Carrier', 'ArrDelay'])

    ### Drop rows without delays (cancellations)
    df = df.dropna()
    
    ### Create some auxiliary columns
    df['DayOfYear'] = df.apply( lambda x: datetime.datetime(x['Year'],x['Month'],x['DayOfMonth']).timetuple().tm_yday, axis=1)
    df['Week'] = df['DayOfYear'] // 7 + 1
    df['DepHour'] = df['CRSDepTime'] // 100

    ### Drop unused columns
    df = df.drop(['DayOfMonth','CRSDepTime'],axis=1).sort_index(axis=1)

    ## df.head()
    
    return df
Example #9
def proportion_error_per_appliance_df(mains_values, gt_values, predicted_values):
    gt_proportion = {}
    pr_proportion = {}
    proportion_error = {}

    for app in predicted_values:
        p_gt = gt_values[app] / mains_values
        p_pr = predicted_values[app] / mains_values

        fr = DataFrame({'p_gt': p_gt, 'p_pr': p_pr})

#        fr['01. mains'] = mains_values
#        fr['02. gt'] = gt_values[app]
#        fr['03. pr'] = predicted_values[app]

        fr = fr.dropna()
        
        summ_gt = fr['p_gt'].sum()
        summ_pr = fr['p_pr'].sum()

        T = len(fr)
        tru = float(summ_gt)/float(T)
        dis = float(summ_pr)/float(T)
                
        gt_proportion[app] = tru
        pr_proportion[app] = dis
        
        diff = abs(tru - dis)
        proportion_error[app] = diff
    return proportion_error, gt_proportion, pr_proportion
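
A quick numeric check of the metric above (appliance name and readings are made up): after dropna, the error is |mean(gt/mains) - mean(pred/mains)|.

import numpy as np
from pandas import Series

mains = Series([10.0, 10.0, 10.0, 10.0])
gt = {'fridge': Series([2.0, 4.0, np.nan, 4.0])}
pr = {'fridge': Series([1.0, 5.0, np.nan, 2.0])}
err, gt_p, pr_p = proportion_error_per_appliance_df(mains, gt, pr)
# The NaN timestamp is dropped, so gt_p['fridge'] = 1.0/3,
# pr_p['fridge'] = 0.8/3 and err['fridge'] = 0.2/3 (about 0.067).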
Example #10
    def _to_frame_build_data_frame(self, tfp, hasna, usecols):
        # build data frame
        if usecols is None:
            usecols = ['node', 'kind', 'level', 'msg']
        dfinfo = {}
        dfcols = []
        if 'node' in usecols:
            dfinfo['node'] = tfp.nodes
            dfcols.append('node')
        if 'kind' in usecols:
            dfinfo['kind'] = tfp.kinds
            dfcols.append('kind')
        if tfp.get_line_type is not None and 'level' in usecols:
            dfinfo['level'] = tfp.levels
            dfcols.append('level')
        if 'msg' in usecols:
            dfinfo['msg'] = tfp.msgs
            dfcols.append('msg')
        df = DataFrame(dfinfo, index=tfp.dates, columns=dfcols)
        if hasna:
            df = df.dropna()
        df.index.name = 'dtime'
        # PyTables does not support unicode for now
        if 'node' in df.columns:
            df['node'] = df['node'].astype(str)
        if 'kind' in df.columns:
            df['kind'] = df['kind'].astype(str)
        if 'level' in df.columns:
            df['level'] = df['level'].astype(str)
        return df
Example #11
    def __init__(self, train: pd.DataFrame, test: pd.DataFrame, params: dict,
                 categorical_splits=None):
        """
        :param train: train DF
        :param test: test DF
        :param params: dict with the following structure
        Template for params:
        params = {
            'uuuu': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes},
            'uuku': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes},
            'ukuu': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes},
            'ukku': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes},
            'kuuu': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes},
            'kuuk': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes},
            'kuku': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes},
            'kukk': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes},
            'kkuu': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes},
            'kkuk': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes},
            'kkku': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes},
            'kkkk': {'sample': None, 'scaler': sc, 'ignore_features': None, 'classifier': Bayes}
        }
        u = unknown, k = known. scaler is None for trees, otherwise something like
        scale_features from dmc.transformation; ignore_features lists the features
        to ignore for that split.

        :return:
        """
        if categorical_splits is None:
            categorical_splits = ['articleID', 'customerID', 'voucherID', 'productGroup']
        self.processes = 8
        self.test = test.copy()
        test = test.dropna(subset=['rrp'])
        self.test_size = len(test)
        self.splits = split(train, test, categorical_splits)
        self._enrich_splits(params)
Example #12
import numpy as np
from pandas import DataFrame
from scipy.stats import spearmanr

def cor_exp_ess(exp, ess):
    cor = DataFrame(np.nan, index=ess.columns, columns=['cor', 'pvalue'])

    for gene in ess.columns:
        if gene in exp.columns:
            cor.loc[gene] = spearmanr(ess[gene], exp[gene])

    return cor.dropna()
Example #13
    def test_dropna_multiple_axes(self):
        df = DataFrame([[1, np.nan, 2, 3],
                        [4, np.nan, 5, 6],
                        [np.nan, np.nan, np.nan, np.nan],
                        [7, np.nan, 8, 9]])
        cp = df.copy()
        result = df.dropna(how='all', axis=[0, 1])
        result2 = df.dropna(how='all', axis=(0, 1))
        expected = df.dropna(how='all').dropna(how='all', axis=1)

        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)
        assert_frame_equal(df, cp)

        inp = df.copy()
        inp.dropna(how='all', axis=(0, 1), inplace=True)
        assert_frame_equal(inp, expected)
Example #14
class Dropna(object):

    params = (['all', 'any'], [0, 1])
    param_names = ['how', 'axis']

    def setup(self, how, axis):
        self.df = DataFrame(np.random.randn(10000, 1000))
        self.df.loc[50:1000, 20:50] = np.nan
        self.df.loc[2000:3000] = np.nan
        self.df.loc[:, 60:70] = np.nan
        self.df_mixed = self.df.copy()
        self.df_mixed['foo'] = 'bar'

    def time_dropna(self, how, axis):
        self.df.dropna(how=how, axis=axis)

    def time_dropna_axis_mixed_dtypes(self, how, axis):
        self.df_mixed.dropna(how=how, axis=axis)
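
This is an asv (airspeed velocity) benchmark class; outside the asv harness it can be smoke-tested by hand:

bench = Dropna()
bench.setup(how='any', axis=0)  # asv would sweep all (how, axis) combinations
bench.time_dropna('any', 0)
bench.time_dropna_axis_mixed_dtypes('any', 0)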
Example #15
    def test_load_raw_arrays(self):
        reindex_reader = ReindexMinuteBarReader(
            self.trading_calendar,
            self.bcolz_equity_minute_bar_reader,
            self.START_DATE,
            self.END_DATE,
        )
        m_open, m_close = self.trading_calendar.open_and_close_for_session(
            self.START_DATE)
        outer_minutes = self.trading_calendar.minutes_in_range(m_open, m_close)
        result = reindex_reader.load_raw_arrays(
            OHLCV, m_open, m_close, [1, 2])

        opens = DataFrame(data=result[0], index=outer_minutes,
                          columns=[1, 2])
        opens_with_price = opens.dropna()

        self.assertEqual(
            1440,
            len(opens),
            "The result should have 1440 bars, the number of minutes in a "
            "trading session on the target calendar."
        )

        self.assertEqual(
            390,
            len(opens_with_price),
            "The result, after dropping nans, should have 390 bars, the "
            " number of bars in a trading session in the reader's calendar."
        )

        slicer = outer_minutes.slice_indexer(
            end=pd.Timestamp('2015-12-01 14:30', tz='UTC'))

        assert_almost_equal(
            opens[1][slicer],
            full(slicer.stop, nan),
            err_msg="All values before the NYSE market open should be nan.")

        slicer = outer_minutes.slice_indexer(
            start=pd.Timestamp('2015-12-01 21:01', tz='UTC'))

        assert_almost_equal(
            opens[1][slicer],
            full(slicer.stop - slicer.start, nan),
            err_msg="All values after the NYSE market close should be nan.")

        first_minute_loc = outer_minutes.get_loc(pd.Timestamp(
            '2015-12-01 14:31', tz='UTC'))

        # Spot check a value.
        # The value is the autogenerated value from test fixtures.
        assert_almost_equal(
            10.0,
            opens[1][first_minute_loc],
            err_msg="The value for Equity 1, should be 10.0, at NYSE open.")
Example #16
    def test_dropna_tz_aware_datetime(self):
        # GH13407
        df = DataFrame()
        dt1 = datetime.datetime(2015, 1, 1,
                                tzinfo=dateutil.tz.tzutc())
        dt2 = datetime.datetime(2015, 2, 2,
                                tzinfo=dateutil.tz.tzutc())
        df['Time'] = [dt1]
        result = df.dropna(axis=0)
        expected = DataFrame({'Time': [dt1]})
        assert_frame_equal(result, expected)

        # Ex2
        df = DataFrame({'Time': [dt1, None, np.nan, dt2]})
        result = df.dropna(axis=0)
        expected = DataFrame([dt1, dt2],
                             columns=['Time'],
                             index=[0, 3])
        assert_frame_equal(result, expected)
Example #17
    def gatherGenDataData(ert, case, key):
        """ :rtype: pandas.DataFrame """
        key, report_step = key.split("@", 1)
        report_step = int(report_step)
        try:
            data = GenDataCollector.loadGenData(ert, case, key, report_step)
        except ValueError:
            data = DataFrame()

        return data.dropna() # removes all rows that have a NaN
Example #18
    def agg(self):
        dframe = DataFrame(index=self.column.index)

        dframe = self._build_dframe(dframe, self.columns)
        column_names = [self._name_for_idx(i) for i in range(0, 2)]
        dframe = dframe.dropna(subset=column_names)

        dframe = DataFrame([dframe.sum().to_dict()])

        return self._add_calculated_column(dframe)
Example #19
def PolyEq(x, y, order=1):
    try:
        df = DataFrame({'x': x, 'y': y}, index=x.index)
        df = df.dropna()
        PolyCoeffs = np.polyfit(df['x'], df['y'], order) ## calculates polynomial coeffs
        PolyEq = np.poly1d(PolyCoeffs) ## turns the coeffs into an equation
    except Exception:
        print('No regression equation possible')
        PolyEq = np.poly1d([0])
    return PolyEq
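
A hypothetical call, showing how the shared index lets dropna discard timestamps missing from either series before the fit:

import numpy as np
from pandas import Series, date_range

idx = date_range('2014-01-01', periods=5)
x = Series([1.0, 2.0, np.nan, 4.0, 5.0], index=idx)
y = Series([2.1, 3.9, 6.2, np.nan, 10.1], index=idx)
fit = PolyEq(x, y, order=1)  # the two rows containing a NaN are dropped
print(fit(3.0))  # evaluate the fitted line at x = 3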
Example #20
    def test_na_actions_categorical(self):

        cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
        vals = ["a", "b", np.nan, "d"]
        df = DataFrame({"cats": cat, "vals": vals})
        cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3])
        vals2 = ["a", "b", "b", "d"]
        df_exp_fill = DataFrame({"cats": cat2, "vals": vals2})
        cat3 = Categorical([1, 2, 3], categories=[1, 2, 3])
        vals3 = ["a", "b", np.nan]
        df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3})
        cat4 = Categorical([1, 2], categories=[1, 2, 3])
        vals4 = ["a", "b"]
        df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4})

        # fillna
        res = df.fillna(value={"cats": 3, "vals": "b"})
        tm.assert_frame_equal(res, df_exp_fill)

        with pytest.raises(ValueError, match=("fill value must "
                                              "be in categories")):
            df.fillna(value={"cats": 4, "vals": "c"})

        res = df.fillna(method='pad')
        tm.assert_frame_equal(res, df_exp_fill)

        # dropna
        res = df.dropna(subset=["cats"])
        tm.assert_frame_equal(res, df_exp_drop_cats)

        res = df.dropna()
        tm.assert_frame_equal(res, df_exp_drop_all)

        # make sure that fillna takes missing values into account
        c = Categorical([np.nan, "b", np.nan], categories=["a", "b"])
        df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]})

        cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"])
        df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]})

        res = df.fillna("a")
        tm.assert_frame_equal(res, df_exp)
Example #21
def combine_spread(file_set, shift, drop_return_data=False):
    """
    Combine the spread of input files, return with mean and standard
    deviation calculated.

    """

    data = []
    values = {}
    for val in ('left', 'right', 'com', 'dist', 'radius', 'diameter'):
        values[val] = {}

    # Collect data from all files into dictionaries
    for i, _file in enumerate(file_set):
        data.append(Spread().read(_file))
        for val in values.keys():
            values[val][i] = Series(
                    data=data[i].spread[val]['val'],
                    index=data[i].times
                    )
        data[i].times = (np.array(data[i].times) - shift[i])

    spread = Spread()
    spread.spread['num'] = len(file_set)

    for val in values.keys():

        # Shift time as per synchronisation
        for i in values[val]:
            values[val][i].index = np.array(values[val][i].index) - shift[i]

        # Convert to DataFrame
        df = DataFrame(data=values[val])

        # If more than a single file, keep only times where every file has data
        if len(file_set) > 1:
            df = df.dropna()

        # If return data dropped, fill data here
        if drop_return_data:
            for i in df.columns:
                data[i].spread[val]['val'] = df[i].tolist()

        # Get times, mean and standard deviation as lists
        mean = list(df.mean(axis=1))
        std_dev = list(df.std(axis=1))
        times = list(df.index)

        # Add to Spread object
        spread.spread[val]['val'] = mean
        spread.spread[val]['std'] = std_dev
        spread.spread['times'] = times

    return spread, data
Example #22
def createDataset():
    data=read_csv('data/data2.csv',parse_dates=['DATE'],index_col='DATE')
    data.drop('DY', axis=1, inplace=True)
    data=data.dropna()
    data['RETURNS'] = data['Price'].pct_change()
    rets = DataFrame(data['RETURNS'])*100
    rets['MA10'] = fun.sampleMovingAverage(rets['RETURNS'], 10)
    rets['MA30'] = fun.sampleMovingAverage(rets['RETURNS'], 30)
    rets['VAR10'] = fun.movingVariance(rets['RETURNS'], 10)
    rets = rets.dropna()
    return rets
Example #23
def Polyfit(x, y, order=1, color='k--', lab='label', Xvals=10, subplot=plot):
    df = DataFrame({'x': x.values, 'y': y.values}, index=x.index)
    df = df.dropna()
    print(df)
    PolyCoeffs = np.polyfit(df['x'], df['y'], order) ## calculates polynomial coeffs
    PolyEq = np.poly1d(PolyCoeffs) ## turns the coeffs into an equation
    #print(PolyEq)
    PolyXvals = np.linspace(min(x), max(x) + Xvals) ## creates x-values for trendline
    #print(PolyXvals)
    Polyplot = subplot.plot(PolyXvals, PolyEq(PolyXvals), color, label=lab) ## plots the trendline
    return Polyplot
Example #24
def append_2013_gva(dfin, csv_file_path):
    df = dfin.copy()
    gva = pd.read_csv(csv_file_path)
    gvasub = DataFrame(columns=['nuts3id', 'gva2013'])
    gvasub['nuts3id'], gvasub['gva2013'] = gva['nutsid'], gva['2013']
    df_gva = pd.merge(
        left=df,
        right=gvasub.dropna(),
        how='left',
        left_on='nuts3id',
        right_on='nuts3id')
    return df_gva
Example #25
    def detect_objects(self, img, template, thres):
        # Convert to grayscale
        img_grey = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # Match template
        temp_match = cv2.matchTemplate(img_grey, template, 1)
        # Normalize scores
        temp_match = temp_match - temp_match.min()
        temp_match = 1 - temp_match/temp_match.max()
        # Apply threshold
        retval, dst = cv2.threshold(temp_match, thres, 1, 0)
        dst = dst.astype(np.uint8)
        # Extract centroid of connected components
        contours, hierarchy = cv2.findContours(dst, 1, 2)
        if len(contours) > 0:
            m_df = DataFrame([cv2.moments(cont) for cont in contours])
            m_df['x'] = (m_df['m10']/m_df['m00']) + int(template.shape[0]/2)
            m_df['y'] = (m_df['m01']/m_df['m00']) + int(template.shape[1]/2)
            m_df.dropna(subset=['x', 'y'], inplace=True)
            return m_df
        else:
            return DataFrame()
Example #26
from functools import partial

import pandas as pd

def get_plate_data(path, c):
    """ Get plate data, drop empty columns, drop selected columns,
        rename columns, add normalized columns. """
    return thread_first(path,
                        from_file,
                        (str.replace, '\r', ''),
                        StringIO,
                        partial(pd.read_csv, delimiter=c['delimiter'], skiprows=c['skiprows']),
                        partial(pd.DataFrame.dropna, axis=1, how='all'),
                        (drop_matching_columns, c['dropcols']),
                        partial(pd.DataFrame.rename, columns=c['colrename']),
                        (add_normalized_columns, c['normcols']))
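
The config dict c consumed above needs at least the keys the pipeline reads; an illustrative shape (all values are placeholders):

c = {
    'delimiter': '\t',
    'skiprows': 2,
    'dropcols': ['Barcode', 'Well ID'],    # passed to drop_matching_columns
    'colrename': {'Raw Signal': 'signal'},
    'normcols': ['signal'],                # passed to add_normalized_columns
}
df_plate = get_plate_data('plate01.csv', c)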
Example #27
def parse_region(df: pd.DataFrame, min_row: int, max_row: int, cols: list) -> pd.DataFrame:
    df = df.loc[min_row:max_row, cols]

    # Region is in either 0,0 or 0,1 of the sliced DataFrame, with the data starting from the 3rd row
    region = df.iloc[0, 0]
    if isinstance(region, float):  # a float here is NaN, so the label sits in 0,1
        region = df.iloc[0, 1]

    df = df.dropna(axis=0, how='all').iloc[2:, 1:]
    df.columns = ['Name', 'Count']
    df['Region'] = region
    return df
Example #28
    def test_dropEmptyRows(self):
        N = len(self.frame.index)
        mat = random.randn(N)
        mat[:5] = nan

        frame = DataFrame({'foo': mat}, index=self.frame.index)
        original = Series(mat, index=self.frame.index, name='foo')
        expected = original.dropna()
        inplace_frame1, inplace_frame2 = frame.copy(), frame.copy()

        smaller_frame = frame.dropna(how='all')
        # check that original was preserved
        assert_series_equal(frame['foo'], original)
        inplace_frame1.dropna(how='all', inplace=True)
        assert_series_equal(smaller_frame['foo'], expected)
        assert_series_equal(inplace_frame1['foo'], expected)

        smaller_frame = frame.dropna(how='all', subset=['foo'])
        inplace_frame2.dropna(how='all', subset=['foo'], inplace=True)
        assert_series_equal(smaller_frame['foo'], expected)
        assert_series_equal(inplace_frame2['foo'], expected)
Example #29
def gettraining(data, XColumns, YColumn, dropNA=True, shuffle=True):
    # make a copy
    trainingData = DataFrame(data)
    # keep only the columns used by the ML models
    trainingData = trainingData[XColumns + YColumn]
    if dropNA:
        # drop NA values if necessary
        trainingData = trainingData.dropna()
    if shuffle:
        # shuffle data if necessary
        trainingData = trainingData.reindex(np.random.permutation(trainingData.index))
    return trainingData
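
For instance, with made-up column names:

import numpy as np
from pandas import DataFrame

data = DataFrame({'f1': np.random.randn(100),
                  'f2': np.random.randn(100),
                  'target': np.random.randn(100)})
train = gettraining(data, XColumns=['f1', 'f2'], YColumn=['target'])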
Example #30
    def test_dropna_multiple_axes(self):
        df = DataFrame([[1, np.nan, 2, 3],
                        [4, np.nan, 5, 6],
                        [np.nan, np.nan, np.nan, np.nan],
                        [7, np.nan, 8, 9]])
        cp = df.copy()

        # GH20987
        with tm.assert_produces_warning(FutureWarning):
            result = df.dropna(how='all', axis=[0, 1])
        with tm.assert_produces_warning(FutureWarning):
            result2 = df.dropna(how='all', axis=(0, 1))
        expected = df.dropna(how='all').dropna(how='all', axis=1)

        assert_frame_equal(result, expected)
        assert_frame_equal(result2, expected)
        assert_frame_equal(df, cp)

        inp = df.copy()
        with tm.assert_produces_warning(FutureWarning):
            inp.dropna(how='all', axis=(0, 1), inplace=True)
        assert_frame_equal(inp, expected)
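
For context, this list/tuple form of axis was deprecated in pandas 0.23 (the GH20987 warning asserted above) and removed later, so on current pandas the equivalent is the chained call the test itself uses as its expected value:

# Equivalent on modern pandas, where axis must be a single value:
result = df.dropna(how='all').dropna(how='all', axis=1)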