def _clean_description(self, element):
		'''
		cleans up aggregated descriptions
		'''
		data = [x.getText() for x in element]
		data = Series(data)
		data = data.apply(lambda x: re.sub('\n', ' ', x))
		mask = data.apply(lambda x:
			False if re.search(r'\(sanded|\(sealed|\(endgrain|\(curl|\(burl|^$', x) else True)
		data = data[mask]
		
		def func(item):
			try:
				return list(re.search(r'(.*?):(.*)', item).groups())
			except AttributeError:
				return [item, None]
		
		data = data.apply(func).tolist()
		data = DataFrame(data, columns=['heading', 'content'])
		
		mask = data.content.notnull()
		if mask.shape[0] > 0:
			mask.iloc[0] = True
		data = data[mask]
		return data
Example #2
    def test_apply(self, datetime_series):
        with np.errstate(all='ignore'):
            tm.assert_series_equal(datetime_series.apply(np.sqrt),
                                   np.sqrt(datetime_series))

            # element-wise apply
            import math
            tm.assert_series_equal(datetime_series.apply(math.exp),
                                   np.exp(datetime_series))

        # empty series
        s = Series(dtype=object, name='foo', index=pd.Index([], name='bar'))
        rs = s.apply(lambda x: x)
        tm.assert_series_equal(s, rs)

        # check all metadata (GH 9322)
        assert s is not rs
        assert s.index is rs.index
        assert s.dtype == rs.dtype
        assert s.name == rs.name

        # index but no data
        s = Series(index=[1, 2, 3])
        rs = s.apply(lambda x: x)
        tm.assert_series_equal(s, rs)
Example #3
    def test_apply(self):
        assert_series_equal(self.ts.apply(np.sqrt), np.sqrt(self.ts))

        # elementwise-apply
        import math
        assert_series_equal(self.ts.apply(math.exp), np.exp(self.ts))

        # how to handle Series result, #2316
        result = self.ts.apply(lambda x: Series(
            [x, x ** 2], index=['x', 'x^2']))
        expected = DataFrame({'x': self.ts, 'x^2': self.ts ** 2})
        tm.assert_frame_equal(result, expected)

        # empty series
        s = Series(dtype=object, name='foo', index=pd.Index([], name='bar'))
        rs = s.apply(lambda x: x)
        tm.assert_series_equal(s, rs)
        # check all metadata (GH 9322)
        self.assertIsNot(s, rs)
        self.assertIs(s.index, rs.index)
        self.assertEqual(s.dtype, rs.dtype)
        self.assertEqual(s.name, rs.name)

        # index but no data
        s = Series(index=[1, 2, 3])
        rs = s.apply(lambda x: x)
        tm.assert_series_equal(s, rs)
Example #4
    def test_series_map_box_timestamps(self):
        # GH#2689, GH#2627
        ser = Series(pd.date_range('1/1/2000', periods=10))

        def func(x):
            return (x.hour, x.day, x.month)

        # it works!
        ser.map(func)
        ser.apply(func)
Example #5
    def test_series_map_box_timedelta(self):
        # GH 11349
        s = Series(timedelta_range('1 day 1 s', periods=5, freq='h'))

        def f(x):
            return x.total_seconds()

        s.map(f)
        s.apply(f)
        DataFrame(s).applymap(f)
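# Note: the same result is available without apply through the .dt accessor
# (a standard pandas API); a minimal self-contained check:
import pandas as pd
s2 = pd.Series(pd.timedelta_range('1 day 1 s', periods=5, freq='h'))
pd.testing.assert_series_equal(s2.dt.total_seconds(),
                               s2.apply(lambda x: x.total_seconds()))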
Example #6
    def test_apply_same_length_inference_bug(self):
        s = Series([1, 2])
        f = lambda x: (x, x + 1)

        result = s.apply(f)
        expected = s.map(f)
        assert_series_equal(result, expected)

        s = Series([1, 2, 3])
        result = s.apply(f)
        expected = s.map(f)
        assert_series_equal(result, expected)
Example #7
def test_filter_against_workaround():
    np.random.seed(0)
    # Series of ints
    s = Series(np.random.randint(0, 100, 1000))
    grouper = s.apply(lambda x: np.round(x, -1))
    grouped = s.groupby(grouper)
    f = lambda x: x.mean() > 10

    old_way = s[grouped.transform(f).astype('bool')]
    new_way = grouped.filter(f)
    tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())

    # Series of floats
    s = 100 * Series(np.random.random(1000))
    grouper = s.apply(lambda x: np.round(x, -1))
    grouped = s.groupby(grouper)
    f = lambda x: x.mean() > 10
    old_way = s[grouped.transform(f).astype('bool')]
    new_way = grouped.filter(f)
    tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())

    # Set up DataFrame of ints, floats, strings.
    from string import ascii_lowercase
    letters = np.array(list(ascii_lowercase))
    N = 1000
    random_letters = letters.take(np.random.randint(0, 26, N))
    df = DataFrame({'ints': Series(np.random.randint(0, 100, N)),
                    'floats': N / 10 * Series(np.random.random(N)),
                    'letters': Series(random_letters)})

    # Group by ints; filter on floats.
    grouped = df.groupby('ints')
    old_way = df[grouped.floats.
                 transform(lambda x: x.mean() > N / 20).astype('bool')]
    new_way = grouped.filter(lambda x: x['floats'].mean() > N / 20)
    tm.assert_frame_equal(new_way, old_way)

    # Group by floats (rounded); filter on strings.
    grouper = df.floats.apply(lambda x: np.round(x, -1))
    grouped = df.groupby(grouper)
    old_way = df[grouped.letters.
                 transform(lambda x: len(x) < N / 10).astype('bool')]
    new_way = grouped.filter(lambda x: len(x.letters) < N / 10)
    tm.assert_frame_equal(new_way, old_way)

    # Group by strings; filter on ints.
    grouped = df.groupby('letters')
    old_way = df[grouped.ints.
                 transform(lambda x: x.mean() > N / 20).astype('bool')]
    new_way = grouped.filter(lambda x: x['ints'].mean() > N / 20)
    tm.assert_frame_equal(new_way, old_way)
def normalize_lengths(iterable, shift_right=False):
    '''
    Given an iterable of sequences as strings, returns a pandas Series of sequences
    with Ns added the end to make them all the same length.
    
    If shift_right, adds Ns to the beginning instead of the end.
    '''
    series = Series(iterable)
    max_length = series.apply(len).max()
    def normalize_seq(seq):
        ns = 'N'*(max_length - len(seq))
        if shift_right:
            return ns + seq
        else:
            return seq + ns
    return series.apply(normalize_seq)
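# Usage sketch for normalize_lengths defined above: pad sequences with Ns so
# they all reach the length of the longest one.
print(normalize_lengths(['ACGT', 'AC', 'ACGTTT']).tolist())
# ['ACGTNN', 'ACNNNN', 'ACGTTT']
print(normalize_lengths(['ACGT', 'AC', 'ACGTTT'], shift_right=True).tolist())
# ['NNACGT', 'NNNNAC', 'ACGTTT']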
Example #9
 def select_known_events(gauge_dict, base, precipitation, precip=True, date='2006-01-01', buffer_days=25):
     s = data[base]['Q_cfs']
     name = list(gauge_dict.keys())[0]
     date_obj = to_datetime(date)
     rng = s[date_obj - Timedelta(days=buffer_days): date_obj + Timedelta(days=buffer_days)]
     if precip:
         fig, ax1 = plt.subplots()
         ax1.plot(rng, 'k', label='Discharge [cfs]')
         ax1.legend()
         plt.title('Discharge at {} Event Peak: {}'.format(name, date))
         ppt_s = Series(array(precipitation)[:, 1], index=array(precipitation)[:, 0])
         ppt_s = ppt_s.reindex(index=rng.index, method=None)
         ppt_s = ppt_s.apply(to_numeric)
         ppt_s[ppt_s < 0] = 0.0
         ppt_s = ppt_s[date_obj - Timedelta(days=buffer_days): date_obj + Timedelta(days=buffer_days)]
         ax1.set_xlabel('Date')
         ax1.set_ylabel('[cfs]')
         for tl in ax1.get_yticklabels():
             tl.set_color('k')
         ax2 = ax1.twinx()
         ax2.bar(ppt_s.index, ppt_s, width=0.1, label='Precipitation [mm/hr]')
         plt.gca().invert_yaxis()
         ax2.set_ylabel('[mm]')
         for tl in ax2.get_yticklabels():
             tl.set_color('b')
         ax2.legend()
     if not precip:
         plt.plot(rng)
         plt.plot(rng, 'k')
         plt.legend()
         plt.title('Discharge at {} Event Peak: {}'.format(name, date))
         plt.xlabel('Date')
         plt.ylabel('[cfs]')
Example #10
    def test_end_time_timevalues(self, input_vals):
        # GH 17157
        # Check that the time part of the Period is adjusted by end_time
        # when using the dt accessor on a Series
        input_vals = PeriodArray._from_sequence(np.asarray(input_vals))

        s = Series(input_vals)
        result = s.dt.end_time
        expected = s.apply(lambda x: x.end_time)
        tm.assert_series_equal(result, expected)
Example #11
 def test_date_tz(self):
     # GH11757
     rng = pd.DatetimeIndex(['2014-04-04 23:56',
                             '2014-07-18 21:24',
                             '2015-11-22 22:14'], tz="US/Eastern")
     s = Series(rng)
     expected = Series([date(2014, 4, 4),
                        date(2014, 7, 18),
                        date(2015, 11, 22)])
     assert_series_equal(s.dt.date, expected)
     assert_series_equal(s.apply(lambda x: x.date()), expected)
def rolling_mean(data, window, min_periods=1, center=False):
    ''' Function that computes a rolling mean

    Parameters
    ----------
    data : DataFrame or Series
           If a DataFrame is passed, the rolling_mean is computed for all columns.
    window : int or string
             If int is passed, window is the number of observations used for calculating
             the statistic, as in DataFrame.rolling(window).mean().
             If a string is passed, it must be a frequency string, e.g. '90S'. This is
             internally converted into a DateOffset object, representing the window size.
    min_periods : int
                  Minimum number of observations in window required to have a value.

    Returns
    -------
    Series or DataFrame, if more than one column
    '''
    if len(data) < 2:
        return data
    def f(x):
        '''Function to apply that actually computes the rolling mean'''
        offset = pd.tseries.frequencies.to_offset(window)

        if not center:
            # adding a microsecond because when slicing with labels, start and
            # endpoint are inclusive
            dslice = col[x - offset.delta + timedelta(0, 0, 1):x]
        else:
            dslice = col[x - offset.delta / 2 + timedelta(0, 0, 1):
                         x + offset.delta / 2]
        if dslice.size < min_periods:
            return np.nan
        else:
            return dslice.mean()

    data = DataFrame(data.copy())
    dfout = DataFrame()
    if isinstance(window, int):
        dfout = data.rolling(window, min_periods=min_periods, center=center).mean()
    elif isinstance(window, str):
        idx = Series(pd.to_datetime(data.index), index=data.index)
        for colname, col in data.items():
            result = idx.apply(f)
            result.name = colname
            dfout = dfout.join(result, how='outer')
    if dfout.columns.size == 1:
        dfout = dfout.iloc[:, 0]
    return dfout
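# Modern pandas handles the offset-window case above natively, so the apply()
# workaround is only needed on very old versions; a minimal sketch (standard
# pandas API, assuming a DatetimeIndex):
import numpy as np
import pandas as pd
idx = pd.date_range('2020-01-01', periods=10, freq='30s')
df = pd.DataFrame({'a': np.arange(10.0)}, index=idx)
print(df.rolling('90s', min_periods=1).mean())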
Example #13
    def schedule(self, positions: List[Position], data_ser: pd.Series, context: BacktestContext):
        avg: pd.Series = data_ser.apply(lambda x: x.open).rolling(self.lookback).mean()
        todays_price_ = data_ser[-1].open
        instruction = None
        if data_ser[-2].open <= avg[-2] and todays_price_ > avg[-1]:
            instruction = TradeInstruction(todays_price_, todays_price_ - as_price(self.stop_value, data_ser.name),
                                           self.risk_per_trade,
                                           data_ser.name, data_ser.index[-1])


        if data_ser[-2].open >= avg[-2] and todays_price_ < avg[-1]:
            instruction = TradeInstruction(todays_price_, todays_price_ + as_price(self.stop_value, data_ser.name),
                                           -self.risk_per_trade,
                                           data_ser.name, data_ser.index[-1])

        return instruction
def adjustDays(normedDays, period):
    n = reduce(Series.append, normedDays)
    idx = array(n.index.copy(),dtype=float) + arange(len(n))*10e-5
    n = Series(array(n), idx % period)
    idx_rel = array(n.index.copy(), dtype=float)
    n = n.sort_index()
    n = n.apply(cycle_adjust)
    n = Series(array(n[idx_rel]), index = idx)
    adj = split(arange(len(idx))*10e-5, cumsum([len(d) for d in normedDays]))
    normedDaysAdj = [n[array(d.index, dtype=float) + adj[i]] for (i,d) in 
                     enumerate(normedDays)]
    for i,day in enumerate(normedDaysAdj):
        normedDaysAdj[i] = Series(day, index=array(day.index, dtype=float) - 
                                  adj[i])
    return normedDaysAdj
Example #15
def convert_series_of_lists_to_df(column: pd.Series, prefix='', prefix_sep=''):
    """
    input:

     index    groups
        0     ['a','b','c']
        1     ['c']
        2     ['b','c','e']
        3     ['a','c']
        4     ['b','e']

    output:

    index   a   b   c   d   e
        0   1   1   1   0   0
        1   0   0   1   0   0
        2   0   1   1   0   1
        3   1   0   1   0   0
        4   0   1   0   0   0
    """

    return pd.get_dummies(column.apply(pd.Series), prefix=prefix, prefix_sep=prefix_sep).sum(level=0, axis=1)
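# Usage sketch for convert_series_of_lists_to_df, mirroring the docstring
# (note: sum(level=..., axis=1) requires an older pandas; newer versions need
# a groupby over the column labels instead):
groups = pd.Series([['a', 'b', 'c'], ['c'], ['b', 'c', 'e'], ['a', 'c'], ['b', 'e']])
print(convert_series_of_lists_to_df(groups))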
Example #16
    def test_apply_empty(self, float_frame, empty_frame):
        # empty
        applied = empty_frame.apply(np.sqrt)
        assert applied.empty

        applied = empty_frame.apply(np.mean)
        assert applied.empty

        no_rows = float_frame[:0]
        result = no_rows.apply(lambda x: x.mean())
        expected = Series(np.nan, index=float_frame.columns)
        assert_series_equal(result, expected)

        no_cols = float_frame.loc[:, []]
        result = no_cols.apply(lambda x: x.mean(), axis=1)
        expected = Series(np.nan, index=float_frame.index)
        assert_series_equal(result, expected)

        # GH 2476
        expected = DataFrame(index=['a'])
        result = expected.apply(lambda x: x['a'], axis=1)
        assert_frame_equal(expected, result)
Example #17
def getPointArray(conshpfn):
    conShp = shapefile.Reader(conshpfn)
    conShapes = conShp.shapes()
    conShapeArray = []
    for conShape in conShapes:
        numOfShapePoints = len(conShape.points)
        conShapePartArray = copy.deepcopy(conShape.parts)
        conShapePartArray.append(numOfShapePoints)
        partPointsArray = []
        for partIndex in range(len(conShape.parts)):
            partPointsArray.append(conShape.points[conShapePartArray[partIndex]:conShapePartArray[partIndex+1]])
        partPointsSeries = Series(partPointsArray)
        numOfPartPointsSeries = partPointsSeries.apply(lambda x: len(x))
        numOfPartPointsSeries = numOfPartPointsSeries.rank(method = 'first')
        rankDic = {}
        for i,numOfPartPointsSeriesItem in enumerate(numOfPartPointsSeries):
            rankDic[numOfPartPointsSeriesItem] = partPointsSeries[i]
        rankDicKeys = sorted(rankDic.keys(), reverse=True)
        sortedPartPointsArray = []
        for rankDicKey in rankDicKeys:
            sortedPartPointsArray.append(rankDic[rankDicKey])
        conShapeArray.append(sortedPartPointsArray)
    return conShapeArray
def rolling_timeslice_apply(data, window, func,min_periods=1,direc='F'):

    def f(x):
        '''Function to apply that actually computes the rolling aggregate'''

        delta = pd.tseries.frequencies.to_offset(window).delta
        time_slice = col[str(x):str(x + delta)]
        if direc == 'B':
            time_slice = col[str(x - delta):str(x)][::-1]
        if time_slice.size < min_periods:
            return np.nan
        else:
            return func(time_slice)
    
    data = DataFrame(data.copy())
    dfout = DataFrame()
    if isinstance(window, str):
        idx = Series(data.index.to_pydatetime(), index=data.index)
        for colname, col in data.items():
            result = idx.apply(f)
            result.name = colname
            dfout = dfout.join(result, how='outer')
    if dfout.columns.size == 1:
        dfout = dfout.iloc[:, 0]
    return dfout
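# A minimal modern-pandas sketch of the same pattern: an offset-sized trailing
# window with an arbitrary aggregation, via rolling().apply() instead of Series.apply.
import numpy as np
import pandas as pd
idx = pd.date_range('2020-01-01', periods=8, freq='15min')
s = pd.Series(np.arange(8.0), index=idx)
print(s.rolling('1h', min_periods=1).apply(np.ptp, raw=True))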
Example #19
 def stem(self, raw: pd.Series):
     return raw.apply(
         lambda x: [
             self.nlp.stem(word) for word in self.nlp.tokenize(x) if self.nlp.stem(word) is not None
         ]
     )
Example #20
def test_apply_args():
    s = Series(["foo,bar"])

    result = s.apply(str.split, args=(",",))
    assert result[0] == ["foo", "bar"]
    assert isinstance(result[0], list)
    def MLE_fast_func_with_p0t_close_to_mean_of_prev_week(
            self, p0t: list, num_of_prev_dayes=7):
        """
        Can only work if beta is not 0

        substritutes p0
        :param p0t: a dict= variables for substitute
        :return: list of the results after substituting of p0
        """
        if self.beta == 0:
            raise KeyError

        #####
        #   p0t = [min(i, 0.999) for i in p0t]
        #####

        series_p0 = Series(p0t, self.N_time)

        M = self.Mi(series_p0)

        fixed_k = self.K.swaplevel(0, 1)
        # fixed_n = self.N.swaplevel(0, 1)

        part_a = DataFrame(index=self.N_time,
                           columns=self.N_feature,
                           dtype=float)
        part_c = DataFrame(index=self.N_time,
                           columns=self.N_feature,
                           dtype=float)
        upper_part_for_first_argument = DataFrame(index=self.N_time,
                                                  columns=self.N_feature)
        for i in self.N_feature:
            upper_part_for_first_argument[i] = fixed_k[i].apply(lambda x: (x + self.alpha) * M[i]) - \
                                               self.fixed_n_without_sigma[i]
            part_a[i] = fixed_k[i].apply(lambda x: self.sigma**2 * (
                x + self.alpha) * M[i]) - self.fixed_n_with_sigma[i]
            part_c[i] = series_p0.apply(lambda x: M[i] - x * self.teta[i])
        # part_a = part_a.swapaxes(0, 1)
        part_c = part_c.swapaxes(0, 1)
        upper_part_for_first_argument = upper_part_for_first_argument.swapaxes(
            0, 1)

        return_list = np.array([
            sum(upper_part_for_first_argument[self.N_time[0]] /
                (series_p0[self.N_time[0]] * part_c[self.N_time[0]]))
        ])
        for t in self.N_time[1:num_of_prev_dayes]:
            part_b = series_p0[t] * (
                series_p0[t] -
                series_p0[self.N_time[self.N_time.index(t) - 1]])
            # With many days this can exhaust memory; collect garbage to avoid memory errors
            # gc.collect()
            return_list = np.append(
                return_list,
                sum((part_a.swapaxes(0, 1)[t] - (part_b * part_c[t])) /
                    part_c[t]))
        for t in self.N_time[num_of_prev_dayes:]:
            part_b = series_p0[t] * (series_p0[t] - statistics.mean(
                series_p0[self.N_time[self.N_time.index(t) -
                                      num_of_prev_dayes]:self.
                          N_time[self.N_time.index(t)]]))
            # With many days this can exhaust memory; collect garbage to avoid memory errors
            # gc.collect()
            return_list = np.append(
                return_list,
                sum((part_a.swapaxes(0, 1)[t] - (part_b * part_c[t])) /
                    part_c[t]))

        return return_list
Example #22
    def test_apply_dont_convert_dtype(self):
        s = Series(np.random.randn(10))

        f = lambda x: x if x > 0 else np.nan
        result = s.apply(f, convert_dtype=False)
        assert result.dtype == object
Example #23
def to_ip(series: pd.Series) -> pd.Series:
    return series.apply(ip_address)
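# Usage sketch, assuming the module imports ip_address from the standard
# ipaddress library; the result is a Series of IPv4Address/IPv6Address objects.
import pandas as pd
print(to_ip(pd.Series(['192.168.0.1', '::1'])))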
Example #24
 def validate(self, series: pd.Series) -> pd.Series:
     return series.apply(self.can_call)
Example #25
 def convert_time_steps_to_year(self, time_steps: pd.Series) -> pd.Series:
     """converts a number of time steps relative to reference date into absolute dates"""
     return time_steps.apply(lambda x: np.round(
         x * self.time_step + self.reference_year, SIG_FIGS))
Example #26
emp = pd.read_csv("c:/r/emp.csv",names = ["empid","name","job","mgr","hire_date","sal","comm","deptno"])

emp[emp['comm'].notnull()][['name','comm']]



# Studying the apply method
s1 = Series([1,2,3])

s1**2
def square(x):
    return x**2
square(2)

# The apply function takes each row/column value as an argument and applies the function to it repeatedly (element-wise).

s1.apply(square)
s1.apply(lambda x: x**2)

df = DataFrame([[1,2,3],[4,5,6]])

df.apply(square,axis = 0)
# 0: the function is applied to each column, 1: the function is applied to each row
df[0].apply(square)

import numpy as np

df.apply(np.sum, axis = 0)
df.apply(np.sum, axis = 1)

df.apply(lambda x: x**2)
Example #27
def reconstruct_taxonomy(reconstruction_map: pd.Series, 
                         taxonomy: pd.Series, 
                         database: str='none', 
                         define_missing: str='merge',
                         ambiguity_handling: str='ignore',
                         ) -> pd.Series:
    """
    Reconstructs the taxonomic annotation based on a sidle database by 
    identifying the lowest taxonomic level  where the taxonomic annotation 
    diverges for two sequences

    Parameters
    ----------
    reconstruction_map : pd.Series
        The relationship between raw  sequences and the reconstructed sidle
        database
    taxonomy: pd.Series
        A taxonomic description of each sequence
    database: {'greengenes', 'silva', 'none'}
        The database used taxonomy. This is important for selecting the 
        correct taxonomic delimiter and for removing missing sequences.
    define_missing: {'merge'; 'inherit'; 'ignore'}
        Taxonomic strings may be missing information (for example  `g__` in 
        greengenes  or `D_5__uncultured bacteria` in Silva). These can be 
        ignored  (`"ignore"`) and treated like any other taxonomic 
        designation; they can be first inherited in merged sequences 
        (`"merge`"), where, when there are two strings being merged and 
        one has a missing level, the missing level is taken form the 
        defined one, or they can be inherited from the previous level 
        (`"inherit"`) first, and then merged.
    ambiguity_handling: {'missing', 'ignore'}
        whether "ambigious taxa" (Silva-specific) should be treated as 
        missing values (`"missing`") or ignored (`"ignore"`)

    Returns
    -------
    pd.Series
        A series describing the new taxonomy 
    """
    if (database == 'none') & (define_missing != 'ignore'):
        warnings.warn('When no database is specified, '
                      'missing values are ignored by default', UserWarning)
    if (database == 'none') & (ambiguity_handling != 'ignore'):
        warnings.warn('When no database is specified, '
                      'ambiguious values are ignored by default', UserWarning)
    if (database == 'greengenes') and (ambiguity_handling != 'ignore'):
        warnings.warn('Greengenes does not include ambigious taxa. The '
                       'ambiguity handling will be ignored.', UserWarning)

    # Filters the taxonomy and converts to levels
    db_lookup = database_params.get(database, 'none')
    delim = db_lookup['delim']
    def split_taxonomy(x):
        return pd.Series([s.strip(' ') for s in  x.split(delim)])
    taxonomy = taxonomy.loc[reconstruction_map.index]
    taxonomy = taxonomy.apply(split_taxonomy)
    taxonomy.index.set_names('Feature ID', inplace=True)

    if len(taxonomy.columns) == 1:
        raise ValueError('Only one taxonomic level was found. Please check '
                         'your database and delimiter.')

    # Finds the undefined levels
    defined_f = db_lookup['defined']
    undefined_levels = ~pd.concat(axis=1, objs=[
        taxonomy[c].apply(defined_f) for c in taxonomy.columns
    ])
    ambigious_levels = pd.concat(axis=1, objs=[
        taxonomy[c].apply(lambda x: 'ambig' in x) for c in taxonomy.columns
        ])
    ambigious_levels = ambigious_levels.cummax(axis=1)
    undefined = (undefined_levels | 
                 (ambigious_levels & (ambiguity_handling == 'missing'))
                 ).astype(bool)
    

    # Filters missing taxa and  hanldes initial inherietence.
    if define_missing != 'ignore':
        taxonomy.mask(undefined, np.nan, inplace=True)
    
    if define_missing == 'inherit':
        taxonomy.fillna(method='ffill', axis=1, inplace=True)

    # Combines the taxonomy across multiple  levels
    def _combine_f(x):
        if pd.isnull(x).all():
            return np.nan
        else:
            return '|'.join(np.sort(x.dropna().unique()))

    def _combine_taxa(g):
        """Help function ot tidy taxonomy"""
        if len(g) == 1:
            return g.iloc[0]
        else:
            return g.apply(_combine_f)

    taxonomy['clean_name'] = reconstruction_map
    collapsed  = taxonomy.groupby('clean_name').apply(_combine_taxa)
    collapsed.drop(columns=['clean_name'], inplace=True)

    # Finds splits in the data
    disjoint = pd.concat(axis=1, objs=[
        collapsed[c].apply(lambda x: True if pd.isnull(x) else '|' in x) 
        for c in collapsed.columns
        ]).cummax(axis=1)
    # Set up inherietence so you inheriet the first split in each row 
    # of the data
    disjoint_inheriet = (disjoint.cummax(axis=1) & 
                         ~((disjoint.cumsum(axis=1) == 1) & disjoint))
    collapsed.mask(disjoint_inheriet, np.nan, inplace=True)

    # Does nan inherietence
    collapsed.fillna(method='ffill', axis=1,  inplace=True)
    # Returns  the summarized taxonomy
    new_taxa = collapsed.apply(lambda x: delim.join(list(x.values)), axis=1)

    new_taxa.name = 'Taxon'
    new_taxa.index.set_names('Feature ID', inplace=True)

    return new_taxa
def map_dataframe_with_dict_and_default(default: Any, mapping_dict: dict,
                                        x: pd.Series):
    col_dict = mapping_dict[x.name]
    return x.apply(lambda y: col_dict.get(str(y), default))
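# Usage sketch (assumed calling convention): apply the helper column-wise with
# DataFrame.apply, so x.name picks the per-column mapping dict.
from functools import partial
import pandas as pd
df = pd.DataFrame({'color': [1, 2, 9], 'size': [3, 3, 1]})
mapping = {'color': {'1': 'red', '2': 'blue'}, 'size': {'3': 'L', '1': 'S'}}
print(df.apply(partial(map_dataframe_with_dict_and_default, 'unknown', mapping)))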
Example #29
def is_column_string(se: pd.Series) -> bool:
    """ dataframe では str と object の区別がつかない """
    if se.apply(lambda x: type(x) == str).sum() == se.shape[0]:
        return True
    else:
        return False
Example #30
def prices(series: pd.Series,
           initial: int = 1,
           type: Returns = Returns.SIMPLE) -> pd.Series:
    """
    Calculate price levels from returns series

    :param series: time series of returns
    :param initial: initial price level
    :param type: returns type: simple, logarithmic or absolute
    :return: date-based time series of return

    **Usage**

    Compute price levels from returns series, based on the value of *type*:

    ===========   =============================
    Type          Description
    ===========   =============================
    simple        Simple arithmetic returns
    logarithmic   Logarithmic returns
    absolute      Absolute returns
    ===========   =============================

    *Simple*

    Compute asset price series from simple returns:

    :math:`Y_t = (1 + X_{t-1}) Y_{t-1}`

    where :math:`X_t` is the asset price at time :math:`t` and :math:`Y_0 = initial`

    *Logarithmic*

    Compute asset price series from logarithmic returns:

    :math:`Y_t = e^{X_{t-1}} Y_{t-1}`

    where :math:`X_t` is the asset price at time :math:`t` and :math:`Y_0 = initial`

    *Absolute*

    Compute asset price series from absolute returns:

    :math:`Y_t = X_{t-1} + Y_{t-1}`

    where :math:`X_t` is the asset price at time :math:`t` and :math:`Y_0 = initial`

    **Examples**

    Generate price series and take compute returns

    >>> series = generate_series(100)
    >>> returns = prices(returns(series))

    **See also**

    :func:`returns` :func:`product` :func:`exp`
    """

    if series.size < 1:
        return series

    if type == Returns.SIMPLE:
        return product(1 + series) * initial
    elif type == Returns.LOGARITHMIC:
        return product(series.apply(math.exp)) * initial
    elif type == Returns.ABSOLUTE:
        return sum_(series) + initial
    else:
        raise MqValueError(
            'Unknown returns type (use simple / Logarithmic / absolute)')
Example #31
def returns(series: pd.Series,
            obs: int = 1,
            type: Returns = Returns.SIMPLE) -> pd.Series:
    """
    Calculate returns from price series

    :param series: time series of prices
    :param obs: number of observations
    :param type: returns type: simple, logarithmic or absolute
    :return: date-based time series of return

    **Usage**

    Compute returns series from price levels, based on the value of *type*:

    ===========   =============================
    Type          Description
    ===========   =============================
    simple        Simple arithmetic returns
    logarithmic   Logarithmic returns
    absolute      Absolute returns
    ===========   =============================

    *Simple*

    Simple geometric change in asset prices, which can be aggregated across assets

    :math:`Y_t = \\frac{X_t}{X_{t-obs}} - 1`

    where :math:`X_t` is the asset price at time :math:`t`

    *Logarithmic*

    Natural logarithm of asset price changes, which can be aggregated through time

    :math:`Y_t = log(X_t) - log(X_{t-obs})`

    where :math:`X_t` is the asset price at time :math:`t`

    *Absolute*

    Absolute change in asset prices

    :math:`Y_t = X_t - X_{t-obs}`

    where :math:`X_t` is the asset price at time :math:`t`

    **Examples**

    Generate price series and take compute returns

    >>> prices = generate_series(100)
    >>> returns = returns(prices)

    **See also**

    :func:`prices`
    """

    if series.size < 1:
        return series

    if type == Returns.SIMPLE:
        ret_series = series / series.shift(obs) - 1
    elif type == Returns.LOGARITHMIC:
        log_s = series.apply(math.log)
        ret_series = log_s - log_s.shift(obs)
    elif type == Returns.ABSOLUTE:
        ret_series = series - series.shift(obs)
    else:
        raise MqValueError(
            'Unknown returns type (use simple / logarithmic / absolute)')

    return ret_series
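# A worked check of the simple-returns relationship above using plain pandas
# (generate_series/product are library helpers, so this sketch builds its own data):
# cumulating (1 + r) from the initial price recovers the price series.
import pandas as pd
p = pd.Series([100.0, 102.0, 99.0, 105.0])
r = p / p.shift(1) - 1                        # Returns.SIMPLE
rebuilt = (1 + r.fillna(0)).cumprod() * p.iloc[0]
assert (rebuilt - p).abs().max() < 1e-9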
Example #32
emp[~pd.isnull(emp['comm_pct'])][['name','comm_pct']]



s1 = Series([1,2,3])
s1

s1**2  # vectorized arithmetic works, like in R

def square(x):
    return x**2

square(s1)

# apply takes each row/column value as an argument and applies the function to it repeatedly
s1.apply(square)
s1.apply(lambda x : x**2)

df = DataFrame([[1,2,3],[4,5,6]])
df

df.apply(square)
df.apply(lambda x : x**2)
df[0].apply(square)
df[0].apply(lambda x : x**3)


df.apply(square, axis = 0)  # 0 : apply the function to each column
df.apply(square, axis = 1)  # 1 : apply the function to each row

df.iloc[0, :]
#    
#     Series.apply(func, convert_dtype=True, args=(), **kwds)
# 
#     Invoke function on values of Series.

# <codecell>

# Let's start by using Series.apply
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.apply.html

# first of all, it's useful to find a way to use apply to return the exact same Series

def identity(s):
    return s

lower.apply(identity)

# <codecell>

# show that identity yields the same Series -- first on element by element basis

lower.apply(identity) == lower

# <codecell>

# Check that match happens for every element in the Series using numpy.all
# http://docs.scipy.org/doc/numpy/reference/generated/numpy.all.html

np.all(lower.apply(identity) == lower)

# <headingcell level=2>
def validateType(s: pd.Series, t, text=''):
    validTypeCount = s.apply(type).value_counts().to_dict()[t]
    totalTypes = len(s)
    print(text, str(totalTypes - validTypeCount), ' of ', str(totalTypes),
          ' types are invalid.')
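# Usage sketch for validateType: count values that are not of the expected type
# (note it raises KeyError if no value has type t at all).
import pandas as pd
s_mixed = pd.Series([1, 2, 'three', 4])
validateType(s_mixed, int, text='int check:')   # -> int check: 1  of  4  types are invalid.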
un_rate['ZIP_county']=un_rate['ZIP']
result1 = pd.merge(data,un_rate, how='inner', on ='ZIP_county')
result2 = pd.merge(result1, Unemp, how = 'inner', on = 'county')

result11=result2[["county","INSTNM","CrimeRate","Unemp_11","ZIP_county"]].dropna()
avg_crimerate = result11['CrimeRate'].mean()
avg_unem = result11['Unemp_11'].mean()


# calculate high or low crime rate and high or low unemployment
result11['CH'] = result11['CrimeRate'] > avg_crimerate
result11['CH'] = result11['CH'].astype(int)
result11['CL'] = result11['CrimeRate'] < avg_crimerate
result11['CL'] = result11['CL'].astype(int)
result11['UH'] = result11['Unemp_11'] > avg_unem
result11['UH'] = result11['UH'].astype(int)
result11['UL'] = result11['Unemp_11'] < avg_unem
result11['UL'] = result11['UL'].astype(int)

c=DataFrame(result11)
c=c.groupby(['UH','UL']).sum()

#calculating z score and p value
z_scores=[-4.76759,4.907776,4.76759,-4.907776]
z_scores=Series(z_scores)
p_values_h0 = scipy.stats.norm.sf(abs(z_scores[0]))*2
z_scores=DataFrame(z_scores)
z_scores.columns=['z score']
z_scores['p value'] = z_scores['z score'].apply(lambda x: scipy.stats.norm.sf(abs(x))*2)

print(z_scores)
Example #36
def cortical_thickness(xfms   : pd.Series,  # nlin avg -> subject XfmHandler (iirc)...
                       atlas  : MincAtom,   # nlin avg
                       label_mapping : FileAtom,
                       atlas_fwhm : float,
                       thickness_fwhm : float):

    try:
        import vtk
    except ImportError:
        warnings.warn("couldn't `import vtk`, without which `decimate.py` is unable to run ...")
        raise

    s = Stages()

    # generate thickness maps for the average:
    left_grid, right_grid = [s.defer(make_laplace_grid(input_labels=atlas.labels,
                                                       label_mapping=label_mapping,
                                                       binary_closing=True, side=side))
                             for side in (Side.left, Side.right)]

    atlas_left_thickness, atlas_right_thickness = (
        [s.defer(decimate(
            s.defer(minclaplace(input_grid=grid,
                                extra_args=["--create-surface-range", "0", "10"])).surface,  # enclose entire cortex
            reduction=0.8,  # FIXME: magic number ... implement a way to specify number rather than fraction instead?
            smoothing_method=Smoothing.laplace))
         for grid in (left_grid, right_grid)])

    # as per comment in MICe_thickness, blur atlas instead of transformed object files ... ?
    # (maybe this workaround is now obsolete)
    blurred_atlas = s.defer(mincblur(img=atlas, fwhm=atlas_fwhm)).img

    # TODO rename this dataframe
    resampled = (pd.DataFrame(
      {
        'xfm' : xfms,
        # resample the atlas files to each subject:
        'blurred_atlas_grid_resampled'  :
            xfms.apply(lambda xfm: s.defer(mincresample_new(img=blurred_atlas, xfm=xfm.xfm, like=xfm.target))),
        'atlas_left_resampled'    :
            xfms.apply(lambda xfm: s.defer(transform_objects(input_obj=atlas_left_thickness, xfm=xfm.xfm))),
        'atlas_right_resampled'   :
            xfms.apply(lambda xfm: s.defer(transform_objects(input_obj=atlas_right_thickness, xfm=xfm.xfm))),
      })
        .assign(left_grid=lambda df: df.xfm.map(lambda xfm: s.defer(
                    make_laplace_grid(input_labels=xfm.target,
                                      label_mapping=label_mapping,
                                      binary_closing=True,
                                      side=Side.left))),
                right_grid=lambda df: df.xfm.map(lambda xfm: s.defer(
                    make_laplace_grid(input_labels=xfm.target,
                                      label_mapping=label_mapping,
                                      binary_closing=True,
                                      side=Side.right))))
        .assign(left_thickness=lambda df: df.apply(axis=1, func=lambda row:
                  s.defer(minclaplace(input_grid=row.left_grid,
                                      solution_vertices=row.atlas_left_resampled))),
                right_thickness=lambda df: df.apply(axis=1, func=lambda row:
                  s.defer(minclaplace(input_grid=row.right_grid,
                                      solution_vertices=row.atlas_right_resampled))))
        .assign(smooth_left_fwhm=lambda df: df.apply(axis=1, func=lambda row:
                  s.defer(diffuse(obj_file=row.atlas_left_resampled,
                                  input_signal=row.left_thickness.solved,
                                  kernel=thickness_fwhm,
                                  iterations=1000))),
                smooth_right_fwhm=lambda df: df.apply(axis=1, func=lambda row:
                  s.defer(diffuse(obj_file=row.atlas_right_resampled,
                                  input_signal=row.right_thickness.solved,
                                  kernel=thickness_fwhm,
                                  iterations=1000)))))
    return Result(stages=s, output=resampled)
Example #37
def threshold(column: pd.Series, threshold: float) -> pd.Series:
    print(column)
    column = column.apply(lambda x: 'e' if x <= threshold else 'p')
    print(column)
    return column
Example #38
    result = requests.get(url)
    if result.status_code == 200:
        #print('Request successful')
        return BeautifulSoup(result.text,"html.parser")
    else:
        print('Request failed', url)
        return None


result = requests.post(link, data=pageLoad)
soupMed = BeautifulSoup(result.text, "html.parser")
#soupMed = BeautifulSoup(result.text)
#print soupMed

#print soupMed.find("a", {"class": "standart"})
#print soupMed.find("tr td + a")
names = [ x.text for x in soupMed.find_all(class_="standart")]

#print names
#names = [ x.text for x in soupMed.find_all("a", {'clasx_': 'standart'})
names = Series(names)
print(names.str.strip())

#regex_dosage = re.compile(r'\d+')
#regex_


"""
names.str.strip()
names.apply(lambda x : regex.findall(x))
"""
Example #39
 def get_status_coluna(self, serie: pd.Series):
     return serie.apply(self.get_status_individuo)
Example #40
def euler(max_num):
    # Series max takes the first element in the tuple. how to use a custom fn?
    indx = Series(np.arange(max_num + 1))
    cols = indx.apply(collatz_length)
    (maxl, maxi) = cols.max()
    print "The longest Collatz sequence is for %d = %d" % (maxi, maxl)
Example #41
    def test_apply_args(self):
        s = Series(['foo,bar'])

        result = s.apply(str.split, args=(',', ))
        assert result[0] == ['foo', 'bar']
        assert isinstance(result[0], list)
Example #43
def test_apply_empty_integer_series_with_datetime_index():
    # GH 21245
    s = Series([], index=pd.date_range(start="2018-01-01", periods=0), dtype=int)
    result = s.apply(lambda x: x)
    tm.assert_series_equal(result, s)
Example #44
def simulate(rates, significances, impressions, numTrials, firstLook=None,
             estimateFunction=WaldEstimate, seed=None):
    """ simulate a single-proportion Z-test

    Args:
        rates (list): success rates
        significances (list): significance values (1 - confidence)
        impressions (int or list): maximum impressions or list of number of
            impressions
        numTrials (int): number of independent simulations to aggregate over
        firstLook (int): first impression at which experiment is evaluated for
            continuous evaluation
            (defaults to 1)
        estimateFunction (function): binomial approximation to use
            (defaults to Wald)
        seed (int, optional): seed for random number generation
            (defaults to current time)

    Returns:
       avgRejects (DataFrame): simulate single test read at end
       avgAnyRejects (DataFrame): simulate continuous test read after every impression
       Both DataFrames contain the estimate and uncertainty on the type I error
       (incorrect rejection of null hypothesis) for each rate, significance, and
       impression value. Results are aggregated across numTrials independent
       experiments.
    """

    trials = range(numTrials)
    base = [rates, significances, trials]
    mi = pandas.MultiIndex.from_product(base, names=['rate', 'significance',
                                                     'trial'])

    if seed is None:
        numpy.random.seed(int(time.time()))
    else:
        numpy.random.seed(seed)

    if type(impressions) == int:
        points = range(1, impressions + 1)
    else:
        points = impressions

    avgRejects = None
    avgAnyRejects = None

    for n in points:
        if n <= 0:
            raise ValueError("All values in impressions must be positive.")
        draws = DataFrame(numpy.random.random([n, len(rates) *
                                                  len(significances) *
                                                  len(trials)]),
                          columns=mi)
        draws.index = range(1, n + 1)

        successes = draws.copy()
        rejects = draws.copy()

        for rate in rates:
            successes[rate] = draws[rate].applymap(lambda x: int(x < rate))
        cumSuccesses = successes.apply(numpy.core.fromnumeric.cumsum, raw=True)
        cumImpressions = successes.index.values
        for rate in rates:
            for sig in significances:
                for trial in trials:
                    vals = Series(list(zip(cumSuccesses.loc[:, (rate, sig, trial)].values,
                                           cumImpressions)))
                    vals.index = cumImpressions
                    rejects.loc[:, (rate, sig, trial)] = vals.apply(lambda x: \
                        int(rejectNull(estimateFunction(x[0], x[1], sig), rate)))

        if firstLook is not None:
            anyRejects = rejects.loc[firstLook:].max()

        # apply binomial approximation to estimate type I error rate
        if avgRejects is None:
            avgRejects = rejects[-1:]. \
                         groupby(axis=1, level=['rate', 'significance']). \
                         sum(). \
                         applymap(lambda x: estimateFunction(x, numTrials))
        else:
            avgRejects.loc[n] = rejects[-1:]. \
                               groupby(axis=1, level=['rate', 'significance']). \
                               sum(). \
                               applymap(lambda x: estimateFunction(x, numTrials)). \
                               values[0]

        # apply binomial approximation to estimate cumulative type I error rate
        if firstLook is not None:
            if avgAnyRejects is None:
                avgAnyRejects = DataFrame(anyRejects. \
                                          groupby(level=['rate', 'significance']). \
                                          sum(). \
                                          map(lambda x: estimateFunction(x, numTrials))). \
                                transpose()
                avgAnyRejects.index = avgRejects.index.copy()
            else:
                avgAnyRejects.loc[n] = anyRejects. \
                                      groupby(level=['rate', 'significance']). \
                                      sum(). \
                                      map(lambda x: estimateFunction(x, numTrials)). \
                                      values

    return avgRejects, avgAnyRejects
Example #45
def nom_comps(srs: dd.Series, head: pd.Series, cfg: Config) -> Dict[str, Any]:
    """
    All computations required for plot(df, Nominal)
    """
    # pylint: disable=too-many-branches
    data: Dict[str, Any] = dict()

    data["nrows"] = srs.shape[0]  # total rows
    srs = srs.dropna()  # drop null values
    grps = srs.value_counts(
        sort=False)  # counts of unique values in the series
    data["geo"] = grps

    if cfg.stats.enable or cfg.bar.enable or cfg.pie.enable:
        data["nuniq"] = grps.shape[0]  # total number of groups

    # compute bar and pie together unless the parameters are different
    if cfg.bar.enable or cfg.pie.enable:
        # select the largest or smallest groups
        data["bar"] = (grps.nlargest(cfg.bar.bars) if cfg.bar.sort_descending
                       else grps.nsmallest(cfg.bar.bars))

        if cfg.bar.bars == cfg.pie.slices and cfg.bar.sort_descending == cfg.pie.sort_descending:
            data["pie"] = data["bar"]
        else:
            data["pie"] = (grps.nlargest(cfg.pie.slices)
                           if cfg.pie.sort_descending else grps.nsmallest(
                               cfg.pie.slices))

        if cfg.bar.bars == cfg.value_table.ngroups and cfg.bar.sort_descending:
            data["value_table"] = data["bar"]
        elif cfg.pie.slices == cfg.value_table.ngroups and cfg.pie.sort_descending:
            data["value_table"] = data["pie"]
        else:
            data["value_table"] = grps.nlargest(cfg.value_table.ngroups)

        if cfg.insight.enable:
            data["chisq"] = chisquare(grps.values)

    df = grps.reset_index()  # dataframe with group names and counts

    if cfg.stats.enable or cfg.wordlen.enable:
        if not head.apply(lambda x: isinstance(x, str)).all():
            srs = srs.astype(
                str)  # srs must be a string to compute the value lengths
    if cfg.stats.enable or cfg.wordcloud.enable or cfg.wordfreq.enable:
        if not head.apply(lambda x: isinstance(x, str)).all():
            df[df.columns[0]] = df[df.columns[0]].astype(str)

    if cfg.stats.enable:
        data.update(_calc_nom_stats(srs, df, data["nrows"], data["nuniq"]))
    elif cfg.wordfreq.enable and cfg.insight.enable:
        data["len_stats"] = {
            "Minimum": srs.str.len().min(),
            "Maximum": srs.str.len().max()
        }
    if cfg.wordlen.enable:
        lens = srs.str.len()
        data["len_hist"] = da.histogram(lens, cfg.wordlen.bins,
                                        (lens.min(), lens.max()))
    if cfg.wordcloud.enable or cfg.wordfreq.enable:
        if all(
                getattr(cfg.wordcloud, att) == getattr(cfg.wordfreq, att)
                for att in ("top_words", "stopword", "stem", "lemmatize")):
            word_freqs = _calc_word_freq(
                df,
                cfg.wordfreq.top_words,
                cfg.wordfreq.stopword,
                cfg.wordfreq.lemmatize,
                cfg.wordfreq.stem,
            )
            data["word_cnts_cloud"] = word_freqs["word_cnts"]
            data["nuniq_words_cloud"] = word_freqs["nuniq_words"]
        else:
            word_freqs = _calc_word_freq(
                df.copy(),
                cfg.wordfreq.top_words,
                cfg.wordfreq.stopword,
                cfg.wordfreq.lemmatize,
                cfg.wordfreq.stem,
            )
            word_freqs_cloud = _calc_word_freq(
                df,
                cfg.wordcloud.top_words,
                cfg.wordcloud.stopword,
                cfg.wordcloud.lemmatize,
                cfg.wordcloud.stem,
            )
            data["word_cnts_cloud"] = word_freqs_cloud["word_cnts"]
            data["nuniq_words_cloud"] = word_freqs["nuniq_words"]

        data["word_cnts_freq"] = word_freqs["word_cnts"]
        data["nwords_freq"] = word_freqs["nwords"]

    return data
Example #46
dataframe1 = DataFrame(([1,2,3,4,7,11],[4,5,6,9,5,0],[7,5,8,12,1,11]), columns=['col1','col2','col3','col4','col5','col6'])

ser_from_df1 = dataframe1.iloc[:, 0:1]

# convert series to an array 
# NOTE: series does not have a to array method.
series10 = Series(['100', '200', 'python', '300.12', '400'])
series_to_array = np.array(series10.tolist())

"""
Keep this problem in mind
"""

# convert series of lists to one series
ser1 = Series([['Red', 'Green', 'White'], ['Red', 'Black'], ['Yellow']])
ser1_one = ser1.apply(Series).stack().reset_index(drop=True)

# Sort a series
arr = ['100', '200', 'Python',  '300.12', '400']
arr = Series(arr)
arr_sort = arr.sort_values()
new_arr = arr.append(Series(['500', 'php']))

# New subset of a series based on a given condition

ser2 = Series(np.arange(10))
ser2_condition = ser2[ser2 < 6]

# change the order of the index
ser3 = Series(np.arange(1,6,1), index=['A','B','C','D','E'])
ser3 = ser3.reindex(index=['B','A','C','D','E'])
Example #47
def cut_and_align_helper(base_line:pd.Series, compared_line:pd.Series):
    """cut and align the two input line

    Args:
        base_line (pd.Series): the sequence of coordination
        compared_line (pd.Series): the sequence of coordination
        vis_origin_data (bool, optional): [description]. Defaults to True.

    Returns:
        [LineString]: segements of compared_line
    """
    # base_line, compared_line = line2_, line1_ 
    # base_line, compared_line = line1_, line2_ 

    mask = compared_line.apply( lambda x: the_foot_point_on_line(x, base_line))

    # Special case: when one segment covers an entire panos road section
    if mask.sum() == 0:
        # here base_line is the panos route and compared_line is the OSM route
        road_segment = compared_line
        foot0 = get_foot_point( base_line.iloc[0], road_segment.iloc[0], road_segment.iloc[-1] )
        foot1 = get_foot_point( base_line.iloc[-1], road_segment.iloc[0], road_segment.iloc[-1] )

        l0 = {'x0': road_segment.iloc[0][0],
              'x1': road_segment.iloc[-1][0],
              'y0': road_segment.iloc[0][1],
              'y1': road_segment.iloc[-1][1],
            }

        l1 = {'x0': foot0[0],
              'x1': foot1[0],
              'y0': foot0[1],
              'y1': foot1[1],
            }

        coords_new = [foot0, foot1] if -30 <= angle_bet_two_line(l0, l1) <= 30 else [foot1, foot0]

        return LineString(coords_new)

    left, right = 0, len(mask)-1
    start, end = left, right
    while not mask.iloc[left]:
        left += 1 
    while not mask.iloc[right]:
        right -= 1 

    left_foot_point, right_foot_point  = [],[]
    if left != start:
        panos_segment = compared_line.iloc[left-1: left+1]
        if the_foot_point_on_line( base_line.iloc[0], panos_segment, ratio_thres=0 ):
            left_foot_point = get_foot_point( base_line.iloc[0], compared_line.iloc[left-1], compared_line.iloc[left] )
        if the_foot_point_on_line( base_line.iloc[-1], panos_segment, ratio_thres=0 ):
            left_foot_point = get_foot_point( base_line.iloc[-1], compared_line.iloc[left-1], compared_line.iloc[left] )
        # assert( len(left_foot_point)!=0 )

    if right != end:
        panos_segment = compared_line.iloc[right: right+2]
        if the_foot_point_on_line( base_line.iloc[0], panos_segment, ratio_thres=0 ):
            right_foot_point = get_foot_point( base_line.iloc[0], compared_line.iloc[right], compared_line.iloc[right+1] )
        if the_foot_point_on_line( base_line.iloc[-1], panos_segment, ratio_thres=0 ):
            right_foot_point = get_foot_point( base_line.iloc[-1], compared_line.iloc[right], compared_line.iloc[right+1] )
        # assert( len(right_foot_point)!=0 )

    # add the foot point to the new coords
    coords_new = compared_line.iloc[ left: right+1 ].values.tolist()
    if len(left_foot_point) > 0:
        coords_new = [left_foot_point] + coords_new
    if len(right_foot_point) > 0:
        coords_new = coords_new + [right_foot_point]

    return LineString( coords_new if len(coords_new) > 1 else coords_new *2 )
Example #48
def to_uuid(series: pd.Series) -> pd.Series:
    return series.apply(uuid.UUID)
Example #49
    }

url = 'http://base-donnees-publique.medicaments.gouv.fr/index.php#result'

result = requests.post(url, data=payload)
soup = BeautifulSoup(result.text)

names = [ x.text for x in soup.find_all(class_="standart")]
#this gives us the text area of the search results

names = Series(names)
# str => treat as string; strip() => remove surrounding whitespace
names = names.str.strip()

#with a regular expression, find the digits (the dosage)
regex_dosage = re.compile(r'\d+')
names.apply(lambda x : regex_dosage.findall(x))

#with a regular expression, find the dosage unit (microgrammes, µg, grammes, ...)
regex_unite = re.compile(r'(microgrammes|µg|grammes|gL)')
names.apply(lambda x : regex_unite.findall(x))

regex_form = re.compile(r'comprim\xe9 s\xe9cable')
names.apply(lambda x : regex_form.findall(x))

a = names.apply(lambda x : regex_dosage.findall(x))
b = names.apply(lambda x : regex_unite.findall(x))
c = names.apply(lambda x : regex_form.findall(x))

d = {'dosage':a, 'unite':b,'forme':c}
e = DataFrame(d)
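# Note: pandas' vectorized string methods avoid the apply() calls above;
# Series.str.findall accepts the same compiled patterns:
a = names.str.findall(regex_dosage)
b = names.str.findall(regex_unite)
c = names.str.findall(regex_form)
e = DataFrame({'dosage': a, 'unite': b, 'forme': c})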
Example #50
 def problem_predict(pred: pd.Series) -> np.ndarray:
     problem_mask = pred.apply(lambda x: x.size == 0)
     if np.any(problem_mask):
         warnings.warn(f"Empty prediction for {problem_mask.sum()} objects. Replaced with highest prob class.")
     problem_idx = np.where(problem_mask)[0]
     return problem_idx
Example #51
    def test_apply_args(self):
        s = Series(['foo,bar'])

        result = s.apply(str.split, args=(',', ))
        self.assertEqual(result[0], ['foo', 'bar'])
        tm.assertIsInstance(result[0], list)
Example #52
    def _clean_conjugation_suffixes(self, pool: pd.Series) -> pd.Series:
        """Clean suffix that indicates how to conjugate the words."""

        pool_clean: pd.Series = pool.apply(self._remove_conjugation_suffix_from_word)

        return pool_clean
Example #54
 def validate(self, series: pd.Series) -> pd.Series:
     return series.apply(self._validation)
def seriesTypes(s: pd.Series):
    return s.apply(type).value_counts()
 def look_for_date(column_i: pd.Series):
     dates = {date: pd.to_datetime(date) for date in column_i.unique()}
     return column_i.apply(lambda x: dates[x])
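# Usage sketch, treating look_for_date as a plain function: the dict comprehension
# memoizes pd.to_datetime per unique string, which is much cheaper than parsing
# every row when a column contains many repeated dates.
import pandas as pd
col = pd.Series(['2021-01-01', '2021-01-02', '2021-01-01'])
print(look_for_date(col))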
# ```

# <markdowncell>

# **A9**:
# <pre>
# 0    0
# 1    1
# 2    2
# 3    3
# 4    4
# </pre>

# <codecell>

s1.apply(lambda k: 2*k).sum()

# <markdowncell>

# **Q10**: What is
# 
# ```Python
# s1.apply(lambda k: 2*k).sum()
# ```

# <markdowncell>

# **A10**:
# <pre>
# 10
# </pre>
Example #58
def to_complex(series: pd.Series) -> pd.Series:
    return series.apply(complex)
Example #59
    'isAlphabet':0,
    'inClauseSubst':0,
    'nomSubstances':'',
    'typeRecherche':0,
    'choixRecherche':'medicament',
    'txtCaracteres':'levothyroxine',
    'btnMedic.x':9,
    'btnMedic.y':15,
    'btnMedic':'Rechercher',
    'radLibelle':2,
    'txtCaracteresSub': '',
    'radLibelleSub':4
    }

raw_data = requests.post('http://base-donnees-publique.medicaments.gouv.fr/index.php#result',data=payload).text
html = BeautifulSoup(raw_data)

drugss = html.findAll('a',class_="standart")

drugs = [drug.text for drug in drugss]

names = Series(drugs)

names = names.str.strip()

regex_dosage = re.compile(r'\d+')
regex_units = re.compile(r'(microgrammes|µg|grammes)')

dosage = names.apply(lambda x: regex_dosage.findall(x))
units = names.apply(lambda x: regex_units.findall(x))
Example #60
def unlist_series(s: pd.Series) -> pd.Series:
    return s.apply(lambda x: x[0])
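# Usage sketch for unlist_series: pull the single element out of each one-item list.
import pandas as pd
print(unlist_series(pd.Series([['a'], ['b'], ['c']])))   # -> a, b, c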