def _prepare_one_phenotype(C: NDArray[(Any, Any), Float], row: pd.Series, correction: str,
                           includes_intercept: bool) -> pd.Series:
    '''
    Creates the broadcasted information for one (phenotype, offset) pair. The returned series
    contains the information eventually stored in a LogRegState.

    This function accepts and returns a pandas series for integration with Pandas UDFs and
    pd.DataFrame.apply.
    '''
    y = row['values']
    mask = ~np.isnan(y)
    offset = row.get('offset')
    y_pred = _logistic_null_model_predictions(y, C, mask, offset)
    y_res = np.nan_to_num(y - y_pred)
    gamma = y_pred * (1 - y_pred)
    CtGammaC = C.T @ (gamma[:, None] * C)
    inv_CtGammaC = np.linalg.inv(CtGammaC)
    row.label = str(row.label)  # Ensure that the phenotype name is a string
    row.drop(['values', 'offset'], inplace=True, errors='ignore')
    row['y_res'], row['gamma'], row['inv_CtGammaC'] = \
        np.ravel(y_res), np.ravel(gamma), np.ravel(inv_CtGammaC)
    if correction == correction_approx_firth:
        row['firth_offset'] = np.ravel(
            af.perform_null_firth_fit(y, C, mask, offset, includes_intercept))
    return row
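The core linear algebra above is the weighted Gram matrix C^T diag(gamma) C of the null logistic model. A minimal standalone numpy sketch of that step on toy data (all names here are illustrative, not part of the original module):

import numpy as np

rng = np.random.default_rng(0)
C = rng.normal(size=(5, 2))                  # (n_samples, n_covariates) covariate matrix
y_pred = rng.uniform(0.1, 0.9, size=5)       # fitted null-model probabilities
gamma = y_pred * (1 - y_pred)                # per-sample binomial variances

CtGammaC = C.T @ (gamma[:, None] * C)        # equivalent to C.T @ np.diag(gamma) @ C, without the dense diag
inv_CtGammaC = np.linalg.inv(CtGammaC)       # broadcast to workers alongside y_res and gamma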
def __clean_artifacts(data: pd.Series, threshold=0.2) -> pd.Series:
    """
    Cleans obviously illegal IBI values (artefacts) from a list

    Parameters
    ----------
    data : pd.Series
        the IBI list
    threshold : float, optional
        the maximum relative deviation between subsequent intervals, by default 0.2

    Returns
    -------
    pd.Series
        the cleaned IBIs
    """
    # Artifact detection - Statistical
    # for index in trange(data.shape[0]):
    #     # Remove RR intervals that differ more than 20% from the previous one
    #     if np.abs(data.iloc[index] - data.iloc[index - 1]) > 0.2 * data.iloc[index]:
    #         data.iloc[index] = np.nan

    # vectorised for efficiency instead of the loop above ;-)
    diff = data.diff().abs()
    drop_indices = diff > threshold * data
    if drop_indices.any():
        data.drop(data[drop_indices].index, inplace=True)
    drop_indices = (data < 250) | (data > 2000)
    if drop_indices.any():
        data.drop(data[drop_indices].index, inplace=True)  # drop by bpm > 240 or bpm < 30
    data.dropna(inplace=True)  # just to be sure
    return data
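A minimal usage sketch for the cleaner above, assuming IBIs are given in milliseconds (consistent with the 250–2000 ms bounds). Note that because the deviation test is vectorised over the raw diff, the beat immediately after a spike also deviates strongly from the spike and gets flagged along with it:

import numpy as np
import pandas as pd

ibis = pd.Series([810.0, 795.0, 802.0, 1900.0, 805.0])  # hypothetical trace with one artefact
cleaned = __clean_artifacts(ibis.copy(), threshold=0.2)
print(cleaned.tolist())  # [810.0, 795.0, 802.0]: the spike and its successor are both removed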
def test_plot_accessor_updates_on_inplace(self):
    s = Series([1, 2, 3, 4])
    _, ax = self.plt.subplots()
    ax = s.plot(ax=ax)
    before = ax.xaxis.get_ticklocs()

    s.drop([0, 1], inplace=True)
    _, ax = self.plt.subplots()
    after = ax.xaxis.get_ticklocs()

    tm.assert_numpy_array_equal(before, after)
def test_drop_raise_exception_if_labels_not_in_level(msg, labels, level):
    # GH 8594
    mi = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"])
    s = Series([10, 20, 30], index=mi)
    df = DataFrame([10, 20, 30], index=mi)

    with pytest.raises(KeyError, match=msg):
        s.drop(labels, level=level)
    with pytest.raises(KeyError, match=msg):
        df.drop(labels, level=level)
def extractJSONData(dict1):
    ser1 = Series(dict1['MonitoredVehicleJourney'])
    ser1 = ser1.append(Series({'RecordedAtTime': dict1['RecordedAtTime']}))
    nextStops = ser1['OnwardCalls']['OnwardCall'][0]
    ser1.drop('OnwardCalls', inplace=True)
    ser1 = unnest(ser1)
    nextStops = unnest(Series(nextStops))
    nextStops.index = 'NextStop' + nextStops.index.values
    ser1 = pd.concat((ser1, nextStops))
    df_row = DataFrame(ser1).transpose()
    return df_row
def get_ica_components(X, contribution=0.85):
    X_ica = FastICA(n_components=len(X.columns)).fit(X)
    L2 = Series(np.sum(X_ica.mixing_ ** 2, axis=0))
    L2.sort_values(ascending=False, inplace=True)
    X_S = DataFrame(X_ica.transform(X))
    X_ica_mixing_ = DataFrame(X_ica.mixing_)
    L2.drop(L2.index[L2.cumsum() / L2.sum() >= contribution][1:], inplace=True)
    return (X_S.reindex(columns=L2.index).values,
            X_ica_mixing_.reindex(columns=L2.index).values,
            X_ica.mean_, len(L2))
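A minimal usage sketch, assuming the snippet's implied imports (numpy, pandas, and sklearn's FastICA); the data here is illustrative only:

import numpy as np
from pandas import DataFrame
from sklearn.decomposition import FastICA

rng = np.random.RandomState(0)
X = DataFrame(rng.normal(size=(200, 4)), columns=list('abcd'))

# keep the components whose squared mixing weights cover 85% of the total
sources, mixing, mean_, n_kept = get_ica_components(X, contribution=0.85)
print(sources.shape, n_kept)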
def test_drop_with_ignore_errors():
    # errors='ignore'
    s = Series(range(3), index=list("abc"))
    result = s.drop("bc", errors="ignore")
    tm.assert_series_equal(result, s)
    result = s.drop(["a", "d"], errors="ignore")
    expected = s.iloc[1:]
    tm.assert_series_equal(result, expected)

    # GH 8522
    s = Series([2, 3], index=[True, False])
    assert s.index.is_object()
    result = s.drop(True)
    expected = Series([3], index=[False])
    tm.assert_series_equal(result, expected)
def _fit_strcols_to_terminal_width(self, strcols: list[list[str]]) -> str:
    from pandas import Series

    lines = self.adj.adjoin(1, *strcols).split("\n")
    max_len = Series(lines).str.len().max()
    # plus truncate dot col
    width, _ = get_terminal_size()
    dif = max_len - width
    # '+ 1' to avoid too wide repr (GH PR #17023)
    adj_dif = dif + 1
    col_lens = Series([Series(ele).apply(len).max() for ele in strcols])
    n_cols = len(col_lens)
    counter = 0
    while adj_dif > 0 and n_cols > 1:
        counter += 1
        mid = round(n_cols / 2)
        mid_ix = col_lens.index[mid]
        col_len = col_lens[mid_ix]
        # adjoin adds one
        adj_dif -= col_len + 1
        col_lens = col_lens.drop(mid_ix)
        n_cols = len(col_lens)

    # subtract index column
    max_cols_fitted = n_cols - self.fmt.index
    # GH-21180. Ensure that we print at least two.
    max_cols_fitted = max(max_cols_fitted, 2)
    self.fmt.max_cols_fitted = max_cols_fitted

    # Call again _truncate to cut frame appropriately
    # and then generate string representation
    self.fmt.truncate()
    strcols = self._get_strcols()
    return self.adj.adjoin(1, *strcols)
def generate_text_features(self, X: Series, feature: str) -> DataFrame:
    X: DataFrame = X.to_frame(name=feature)
    X[feature + '.char_count'] = [self.char_count(value) for value in X[feature]]
    X[feature + '.word_count'] = [self.word_count(value) for value in X[feature]]
    X[feature + '.capital_ratio'] = [self.capital_ratio(value) for value in X[feature]]
    X[feature + '.lower_ratio'] = [self.lower_ratio(value) for value in X[feature]]
    X[feature + '.digit_ratio'] = [self.digit_ratio(value) for value in X[feature]]
    X[feature + '.special_ratio'] = [self.special_ratio(value) for value in X[feature]]
    symbols = ['!', '?', '@', '%', '$', '*', '&', '#', '^', '.', ':',
               ' ', '/', ';', '-', '=']
    for symbol in symbols:
        X[feature + '.symbol_count.' + symbol] = [
            self.symbol_in_string_count(value, symbol) for value in X[feature]
        ]
        X[feature + '.symbol_ratio.' + symbol] = (
            X[feature + '.symbol_count.' + symbol] / X[feature + '.char_count']
        )
        X[feature + '.symbol_ratio.' + symbol].fillna(0, inplace=True)
    X = X.drop(feature, axis=1)
    return X
def apply_dcg_to_series(movie_id: int, data: pd.Series):
    # create lookup table (look up index in list given the item)
    lookup = index_reverse_lookup_dict(data.loc[movie_id])
    # remove base movie from results
    data = data.drop(index=movie_id)
    # apply dcg similarity over the data
    return data.apply(dcg_similarity, args=(lookup,))
def make_line_chart_popup(data_row: pd.Series, title: str) -> folium.Popup:
    '''Create a line chart popup from a temporal Series for departements.

    The index of the Series has to be in {year}_median, {year}_decile_1, {year}_decile_9,
    {year+1}_median, {year+1}_decile_1... format.
    This popup can be added to map layers.'''
    # filter index names and build 3 columns from one (series)
    data = {
        'decile_1': data_row.filter(regex=".*decile_1$").values,
        'decile_9': data_row.filter(regex=".*decile_9$").values,
        'median': data_row.filter(like="median").values,
    }
    df_to_display = pd.DataFrame.from_dict(data)
    data_row = data_row.drop("color")
    # create the index of the dataframe from the initial data_row Series.index
    df_to_display.index = pd.to_datetime(
        list(dict.fromkeys([int(annee_c[:4]) for annee_c in data_row.index.tolist()])),
        format="%Y")
    line_chart = vincent.Line(df_to_display, width=300, height=200)
    line_chart.axis_titles(x='Année', y='prix m2')
    line_chart.legend(title=title)
    popup = folium.Popup()
    folium.Vega(line_chart, width=400, height=250).add_to(popup)
    return popup
def test_drop_unique_and_non_unique_index(data, index, axis, drop_labels,
                                          expected_data, expected_index):
    s = Series(data=data, index=index)
    result = s.drop(drop_labels, axis=axis)
    expected = Series(data=expected_data, index=expected_index)
    tm.assert_series_equal(result, expected)
def test_drop_index_ea_dtype(any_numeric_ea_dtype):
    # GH#45860
    df = Series(100, index=Index([1, 2, 2], dtype=any_numeric_ea_dtype))
    idx = Index([df.index[1]])
    result = df.drop(idx)
    expected = Series(100, index=Index([1], dtype=any_numeric_ea_dtype))
    tm.assert_series_equal(result, expected)
def create_item_mod(possible_mod_ids: list, item_mod: pd.Series,
                    rare_mods: pd.DataFrame, mod_value: int) -> dict:
    logger.info('Creating item mods dict')
    for mod_id in possible_mod_ids:
        # Comparing mod with rare mods multiple times is expensive
        possible_mods = rare_mods[rare_mods.mod_id == mod_id]
        if len(possible_mods) > 0:
            for idx, row in possible_mods.iterrows():
                min_value = row['min']
                max_value = row['max']
                if min_value < 0:
                    mod_value = -mod_value
                if min_value <= mod_value <= max_value + 1:
                    item_mod = possible_mods.loc[idx, :]
                    break
            else:
                continue
            if isinstance(item_mod, dict):
                if not item_mod:
                    item_mod = possible_mods.loc[idx, :]
            else:
                if item_mod.empty:
                    item_mod = possible_mods.loc[idx, :]
            item_mod['value'] = mod_value
            item_mod = item_mod.drop(['min', 'max', 'required_level'])
            item_mod = item_mod.to_dict()
            break
    return item_mod
def remover_resultado_concursos(possibilidades, resultado_concursos):
    """
    Removes the already-drawn results from the list of possibilities.

    :param possibilidades: Possible combinations of the Lotofácil lottery
    :param resultado_concursos: Results of all draws
    :return: The list of possibilities without the already-drawn results.
    """
    from pandas import Series

    elem_ini = 0
    elem_fin = len(possibilidades) - 1
    indices = [buscar(possibilidades, elem_ini, elem_fin, valor_busca)
               for valor_busca in resultado_concursos]
    s_possibilidades = Series(possibilidades)
    removidos = s_possibilidades.drop(indices)
    lista_possibilidades_atualizada = removidos.values
    return lista_possibilidades_atualizada.tolist()
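buscar is not defined in this snippet. From its call shape (list, first index, last index, search value) it looks like a binary search over the sorted possibilities that returns the matching position; a minimal sketch under that assumption, purely illustrative:

def buscar(lista, elem_ini, elem_fin, valor_busca):
    # Recursive binary search over a sorted list; returns the index of
    # valor_busca, or None if it is absent (assumed behaviour).
    if elem_ini > elem_fin:
        return None
    meio = (elem_ini + elem_fin) // 2
    if lista[meio] == valor_busca:
        return meio
    if lista[meio] < valor_busca:
        return buscar(lista, meio + 1, elem_fin, valor_busca)
    return buscar(lista, elem_ini, meio - 1, valor_busca)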
def test_cat_accessor_updates_on_inplace(self):
    s = Series(list("abc")).astype("category")
    return_value = s.drop(0, inplace=True)
    assert return_value is None
    return_value = s.cat.remove_unused_categories(inplace=True)
    assert return_value is None
    assert len(s.cat.categories) == 2
def update_series():
    ser = Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
    print('Created from a list plus an index:\n', ser)
    s = ser.drop('c')
    # print('Result after dropping one label:\n', s)
    s = ser.drop(['a', 'c'])
    # print('Result after dropping several labels:\n', s)
    ser.pop('d')
    # ser.pop(0)       # deleting by position raises: invalid key
    # ser.pop([0, 1])  # deleting several at once raises: invalid key
    print('pop removes in place, modifying the source data:\n', ser)
    ser[0] = 1000
    ser['f'] = 2000
    ser2 = Series([100, 200], index=['x', 'y'])
    ser.append(ser2)
    print('Modified Series:\n', ser)
    print('Series append:\n', ser.append(ser2))
def perf_per_month(df: pd.Series) -> json:
    """Returns monthly performance of strategy

    Arguments:
        df -- Series of NAV with datetime as the index

    Returns:
        Monthly returns and datetime as the index
    """
    df = df.to_frame()
    df.index = pd.to_datetime(df.index, format="%Y-%m-%d %H:%M:%S").date
    df['eom'] = df.index + MonthEnd(0)
    df.drop_duplicates('eom', keep='last', inplace=True)
    df = df.loc[df.index == df['eom']]
    df['m_rets'] = df['nav'] / df['nav'].shift(1) - 1
    df.drop(columns=['eom', 'nav'], inplace=True)
    return df.to_json(orient='index')
def __calc_line_parameters(self, lines: pd.Series) -> Tuple[np.ndarray, np.uint64]:
    """ Generate general parameters of the given acquisition """
    rel_idx = np.where(
        np.abs(lines.diff().pct_change(periods=1)) > self.change_thresh)[0]
    delta = np.uint64(
        lines.drop(rel_idx).reindex(np.arange(len(lines))).interpolate().diff().mean())
    return rel_idx[::2], delta
def test_drop_pos_args_deprecation():
    # https://github.com/pandas-dev/pandas/issues/41485
    ser = Series([1, 2, 3])
    msg = (r"In a future version of pandas all arguments of Series\.drop "
           r"except for the argument 'labels' will be keyword-only")
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = ser.drop(1, 0)
    expected = Series([1, 3], index=[0, 2])
    tm.assert_series_equal(result, expected)
def test_closed_uneven():
    # see gh-21704
    ser = Series(data=np.arange(10), index=pd.date_range("2000", periods=10))

    # uneven
    ser = ser.drop(index=ser.index[[1, 5]])
    result = ser.rolling("3D", closed="left").min()
    expected = Series([np.nan, 0, 0, 2, 3, 4, 6, 6], index=ser.index)
    tm.assert_series_equal(result, expected)
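For reference, a standalone sketch of what closed="left" means for a time-based window, independent of the test fixtures above:

import numpy as np
import pandas as pd

ser = pd.Series(np.arange(5), index=pd.date_range("2000", periods=5))
# With closed="left" the window excludes its right endpoint, so each value is
# the min over earlier observations within the past 3 days; the first window
# is empty and yields NaN.
print(ser.rolling("3D", closed="left").min())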
def series_test():
    print("==== Build a Series from a list ====")
    data_list = list(i for i in range(1, 10))
    index_list = list("index_%d" % i for i in range(1, 10))
    # The default index column starts from 0
    series = Series(data_list)
    print(series)
    series = Series(data_list, index_list)
    print(series)

    print("==== Build a Series from a dict ====")
    data_dict_1 = {"name": "hujiang", "age": 18, "sex": "man", "index_1": "balabala"}
    data_dict_2 = {"name": "hymanHu", "age": 19, "sex": "man", "index_1": "balabala"}
    # The dict keys are automatically used as the index
    series_1 = Series(data_dict_1)
    print(series_1)
    # Where an element of the passed-in index matches a dict key, the dict value
    # is shown; otherwise NaN is shown
    series_1 = Series(data_dict_1, index_list)
    print(series_1)
    series_2 = Series(data_dict_2, index_list)
    print(series_1 + series_2)  # combining several Series

    print("==== Insert ====")
    series["index_10"] = 100  # add one value
    print(series)
    print("==== Delete ====")
    print(series.drop("index_10"))
    print(series.drop(["index_10", "index_9"]))
    print("==== Update ====")
    series["index_9"] = 99
    print(series)
    print("==== Query ====")
    print(series.index)                              # the Index object
    print(series.index.values)                       # the index as a list
    print(series.values)                             # the values as a list
    print(series.get("index_9"), series["index_9"])  # value for an index label
    print(series[["index_8", "index_9"]])            # several labels at once
    print(series["index_1":"index_3"])               # label slicing
    print(series[[1, 3]])                            # positional fancy indexing
    print(series[0:4])                               # positional slicing
    print(np.asarray(series))                        # Series to array
def load_sinks(self):
    """ Load particle by particle sink flag values. """
    sinks = self._sink_value.value
    if self._drop_ids is not None:
        sinks = Series(sinks, index=self._particleIDs.value)
        self['sink_value'] = sinks.drop(self._drop_ids)
    else:
        self['sink_value'] = sinks
def turn_row_into_price_action(row: pd.Series, fiat_currency: str) -> typing.Dict:
    price_action = {
        "timestamp": row.timestamp,
        "fiat_currency": fiat_currency,
        "action": {
            "type": "external_price_update",
            "tokens": row.drop("timestamp").to_dict()
        }
    }
    return price_action
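The converter above needs only a Series with a "timestamp" entry; every other entry is treated as a token price. A minimal sketch with hypothetical token columns:

import pandas as pd

row = pd.Series({"timestamp": 1700000000, "ETH": 2000.0, "BTC": 35000.0})
print(turn_row_into_price_action(row, fiat_currency="USD"))
# {'timestamp': 1700000000, 'fiat_currency': 'USD',
#  'action': {'type': 'external_price_update', 'tokens': {'ETH': 2000.0, 'BTC': 35000.0}}}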
def load_PIDs(self):
    """ Load Particle ID numbers """
    particleIDs = self._particleIDs.value
    if self._drop_ids is not None:
        particleIDs = Series(particleIDs, index=self._particleIDs.value)
        self['particleIDs'] = particleIDs.drop(self._drop_ids)
    else:
        self['particleIDs'] = particleIDs
def _loocv_loess(x: pd.Series, y: pd.Series, interpolator: Callable,
                 frac: Optional[float] = None) -> tuple:
    """
    Helper function for batch_correction. Computes loess correction with LOOCV.

    Parameters
    ----------
    x : pd.Series
    y : pd.Series
    frac : float, optional
        fraction of sample to use in LOESS correction. If None, determines the
        best value using LOOCV.
    interpolator : Callable
        interpolator function used to predict new values.

    Returns
    -------
    corrected : pd.Series
        LOESS corrected data
    """
    if frac is None:
        # candidate frac values, from 4/N up to 1, where N is the number of
        # corrector samples.
        frac_list = [k / x.size for k in range(4, x.size + 1)]
        rms = np.inf  # initial value for root mean square error
        best_frac = 1
        for frac in frac_list:
            curr_rms = 0
            for loocv_index in x.index[1:-1]:
                y_temp = y.drop(loocv_index)
                x_temp = x.drop(loocv_index)
                y_loess = lowess(y_temp, x_temp, return_sorted=False, frac=frac)
                interp = interpolator(x_temp, y_loess)
                curr_rms += (y[loocv_index] - interp(x[loocv_index])) ** 2
            if rms > curr_rms:
                best_frac = frac
                rms = curr_rms
        frac = best_frac
    return lowess(y, x, return_sorted=False, frac=frac)
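A minimal usage sketch, assuming statsmodels' lowess and scipy's interp1d as the interpolator (both consistent with how the helper calls them); the drift data is illustrative:

import numpy as np
import pandas as pd
from scipy.interpolate import interp1d
from statsmodels.nonparametric.smoothers_lowess import lowess

rng = np.random.default_rng(1)
x = pd.Series(np.arange(1.0, 21.0))                            # e.g. injection order
y = pd.Series(1.0 + 0.05 * x.values + rng.normal(0, 0.1, 20))  # drifting signal

smoothed = _loocv_loess(x, y, interpolator=interp1d)  # frac chosen by LOOCV
corrected = y / smoothed                              # one common correction scheme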
def predict(self, x: Series):
    condition = x[self.sub_attribute]
    if condition in self.sub_trees.keys():
        if isinstance(self.sub_trees[condition], str):
            return self.sub_trees[condition]
        else:
            return self.sub_trees[condition].predict(x.drop(self.sub_attribute))
    else:
        return self.data[self.label].value_counts().keys()[0]
def test_cat_accessor_updates_on_inplace(self):
    ser = Series(list("abc")).astype("category")
    return_value = ser.drop(0, inplace=True)
    assert return_value is None

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        return_value = ser.cat.remove_unused_categories(inplace=True)

    assert return_value is None
    assert len(ser.cat.categories) == 2
def load_adiabatic_index(self):
    """ Load particle adiabatic index. """
    gamma = self._Adiabatic_index.value
    if self._drop_ids is not None:
        gamma = Series(gamma, index=self._particleIDs.value)
        self['adiabatic_index'] = gamma.drop(self._drop_ids)
    else:
        self['adiabatic_index'] = gamma
def load_internal_energy(self, unit=None):
    """ Load internal particle energies per unit mass in cgs units
    (default set in units class)

    unit: unit conversion from code units
    """
    if unit:
        self.units.set_energy(unit)
    energy = self._internal_energy.value * self.units.energy_conv
    if self._drop_ids is not None:
        energy = Series(energy, index=self._particleIDs.value)
        self['internal_energy'] = energy.drop(self._drop_ids)
    else:
        self['internal_energy'] = energy
def test_drop_and_dropna_caching(self):
    # test that cacher updates
    original = Series([1, 2, np.nan], name='A')
    expected = Series([1, 2], dtype=original.dtype, name='A')
    df = pd.DataFrame({'A': original.values.copy()})
    df2 = df.copy()

    df['A'].dropna()
    assert_series_equal(df['A'], original)
    df['A'].dropna(inplace=True)
    assert_series_equal(df['A'], expected)

    df2['A'].drop([1])
    assert_series_equal(df2['A'], original)
    df2['A'].drop([1], inplace=True)
    assert_series_equal(df2['A'], original.drop([1]))
def load_masses(self, unit=None):
    """ Load Particle Masses in units of M_sun (default set in units class)

    unit: unit conversion from code units
    """
    if unit:
        self.units.set_mass(unit)
    masses = self._masses.value * self.units.mass_conv
    if self.units.remove_h:
        h = self._header.HubbleParam
        masses /= h
    if self._drop_ids is not None:
        masses = Series(masses, index=self._particleIDs.value)
        self['masses'] = masses.drop(self._drop_ids)
    else:
        self['masses'] = masses
def load_density(self, unit=None):
    """ Load Particle Densities in cgs units (default set in units class)

    unit: unit conversion from code units
    """
    if unit:
        self.units.set_density(unit)
    density = self._density.value * self.units.density_conv
    if self.units.remove_h:
        h = self._header.HubbleParam
        density *= h ** 2
    if self.units.coordinate_system == 'physical':
        ainv = self._header.Redshift + 1  # 1/(scale factor)
        density *= ainv ** 3
    if self._drop_ids is not None:
        density = Series(density, index=self._particleIDs.value)
        self['density'] = density.drop(self._drop_ids)
    else:
        self['density'] = density
def load_smoothing_length(self, unit=None):
    """ Load Particle Smoothing Lengths in units of kpc
    (default set in units class)

    unit: unit conversion from code units
    """
    if unit:
        self.units._set_smoothing_length(unit)
    hsml = self._smoothing_length.value * self.units.length_conv
    if self.units.remove_h:
        h = self._header.HubbleParam
        hsml /= h
    if self.units.coordinate_system == 'physical':
        a = self._header.ScaleFactor
        hsml *= a
    if self._drop_ids is not None:
        hsml = Series(hsml, index=self._particleIDs.value)
        self['smoothing_length'] = hsml.drop(self._drop_ids)
    else:
        self['smoothing_length'] = hsml
def _esd(x, max_outlier, alpha, direction):
    """
    The ESD test using median and MAD in the calculation of the test statistic.
    """
    x = Series(x)
    n = len(x)
    outlier_index = []
    for i in range(1, max_outlier + 1):
        median = x.median()
        mad = np.median([abs(value - median) for value in x]) * _MAD_CONSTANT
        if mad == 0:
            break
        if direction == 'both':
            ares = x.map(lambda value: abs(value - median) / mad)
        elif direction == 'pos':
            ares = x.map(lambda value: (value - median) / mad)
        elif direction == 'neg':
            ares = x.map(lambda value: (median - value) / mad)
        r_idx = ares.idxmax()
        r = ares[r_idx]
        if direction == 'both':
            p = 1.0 - alpha / (2 * (n - i + 1))
        else:
            p = 1.0 - alpha / (n - i + 1)
        crit = t.ppf(p, n - i - 1)
        lam = (n - i) * crit / np.sqrt((n - i - 1 + crit ** 2) * (n - i + 1))
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug("%s/%s outlier. median=%s, mad=%s, r_idx=%s, r=%s, crit=%s, lam=%s"
                         % (i, max_outlier, median, mad, r_idx, r, crit, lam))
        if r > lam:
            outlier_index.append(r_idx)
            x = x.drop(r_idx)
        else:
            # The r keeps decreasing while lam keeps increasing. Therefore, when r
            # is less than lam for the first time, we can stop.
            break
    return outlier_index
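A minimal usage sketch. The snippet does not define _MAD_CONSTANT; the value below assumes the usual scale factor that makes the MAD estimate the standard deviation for normal data:

import logging
import numpy as np
from pandas import Series
from scipy.stats import t

logger = logging.getLogger(__name__)
_MAD_CONSTANT = 1.4826  # assumed MAD-to-sigma scale factor

data = np.concatenate([np.random.default_rng(2).normal(0, 1, 50), [9.0, -8.5]])
print(_esd(data, max_outlier=5, alpha=0.05, direction='both'))
# expected: the positions of the two injected extremes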
print x.ix[['a', 'b', 'd', 'c'], states]
'''
   A  B  C  D
a  0  1  2  0
b  3  4  5  0
d  6  7  8  0
c  3  4  5  0
'''

print 'Drop rows from a Series by row index'
x = Series(numpy.arange(4), index=['a', 'b', 'c', 'd'])
print x.drop('c')
'''
a    0
b    1
d    3
dtype: int32
'''
print x.drop(['a', 'b'])  # fancy deletion
'''
c    2
d    3
dtype: int32
'''
print
obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3.reindex(range(6), method='ffill')

#*********************************************************
frame = DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'],
                  columns=['Ohio', 'Texas', 'California'])
frame2 = frame.reindex(['a', 'b', 'c', 'd'])

# discard: drop deletes index values
obj = Series(np.arange(5), index=['a', 'b', 'c', 'd', 'e'])
new_obj = obj.drop('c')

data = DataFrame(np.arange(16).reshape((4, 4)),
                 index=['Ohio', 'Colorado', 'Utah', 'New York'],
                 columns=['one', 'two', 'three', 'four'])
# delete a column (axis=1)
data.drop('three', axis=1)
# locate an element directly with ix
data.ix['Colorado', ['one', 'four']]

#***************************
# Adding two Series adds the data at matching indexes; labels missing from one
# side end up as NaN in the sum
# list('abcd') is equivalent to ['a', 'b', 'c', 'd']
[4 rows x 3 columns]
'''

###################################################################
# Dropping entries from an axis
###################################################################
'''
Dropping one or more entries from an axis is easy if you have an index array or
list without those entries. As that can require a bit of munging and set logic,
the drop method will return a new object with the indicated value or values
deleted from an axis:
'''
obj = Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
new_obj = obj.drop('c')
print(new_obj)
'''
a    0
b    1
d    3
e    4
dtype: float64
'''
print(obj.drop(['d', 'c']))
'''
a    0
b    1
e    4
dtype: float64
'''
def main():
    # reindex
    obj = Series(range(4), index="a b c d".split(" ")[::-1])
    print obj
    obj2 = obj.reindex("a b c d e".split(" "))
    print obj2
    # Change NaN
    print obj.reindex("a b c d e".split(" "), fill_value=0)

    colors = ["blue", "purple", "yellow"]
    index = [0, 2, 4]
    obj3 = Series(colors, index=index)
    print obj3.reindex(range(6))
    print obj3.reindex(range(6), method="ffill")  # not found forward fill
    print obj3.reindex(range(6), method="backfill")  # bfill

    # DataFrame
    states = ["Ohio", "Texas", "California"]
    frame = DataFrame(np.arange(9).reshape((3, 3)), index="a b c".split(" "),
                      columns=["Ohio", "Texas", "California"])
    print frame
    frame2 = frame.reindex("a b c d".split(" "))
    print frame2
    states[0] = "Utah"
    states[1], states[0] = states[:2]
    print frame.reindex(columns=states)
    # fill
    print frame.reindex("a b c d".split(" "), method="ffill", columns=states)
    print frame.ix["a b c d".split(" ")]
    print frame.ix["a b c d".split(" "), states]

    # Delete column
    print "", ""
    obj = Series(range(5), index="a b c d e".split(" "))
    new_obj = obj.drop("c")
    print new_obj
    print obj

    # Index reference
    print "", ""
    obj = Series(np.arange(4.0), index="a b c d".split(" "))
    print obj["b"]
    print obj[1]  # same
    print obj[2:4]
    print obj[["b", "a", "c"]]
    print obj[[1, 3]]
    print obj[obj < 2]
    # Slice with label
    print obj["b":"c"]  # include 'c'
    obj["b":"c"] = 5
    print obj

    data = DataFrame(
        np.arange(16).reshape((4, 4)),
        index=["Ohio", "Colorado", "Utah", "New York"],
        columns=["one", "two", "three", "four"],
    )
    print data
    # column
    print data["two"]
    print data[["three", "one"]]
    # row
    print data[:2]
    print data[data["three"] > 5]
    # all values
    print data < 5
    data[data < 5] = 0
    print data
    # row and column
    print data.ix[["Colorado"], ["two", "three"]]
    print data.ix[["Colorado", "Utah"], [3, 0, 1]]
    # row
    print data.ix[2]
    # label row and column, return column
    print data.ix[:"Utah", "two"]

    # xs
    # row
    print data.xs("Utah")
    print data.xs("Utah", axis=0)
    # rows
    print data.xs("two", axis=1)
    # icol/irow, i is index
    print data.icol(1)
    print data.irow(1)

    # Union
    print "", ""
    s1 = Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
    s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
    print s1
    print s2
    # index is union, but d, f, g are NaN
    print s1 + s2

    df1 = DataFrame(np.arange(9.0).reshape((3, 3)), columns=list("bcd"),
                    index=["Ohio", "Texas", "Colorado"])
    df2 = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"),
                    index=["Utah", "Ohio", "Texas", "Oregon"])
    print df1
    print df2
    print df1 + df2

    # arithmetic method
    print "", ""
    df1 = DataFrame(np.arange(12.0).reshape((3, 4)), columns=list("abcd"))
    df2 = DataFrame(np.arange(20.0).reshape((4, 5)), columns=list("abcde"))
    print df1
    print df2
    print df1.add(df2, fill_value=0)
    # reindex has fill_value argument
    # other arithmetic methods are sub/div/mul(ti)

    # Calculation in a DataFrame and Series
    print "", ""
    # subtract from each row: broadcast
    arr = np.arange(12.0).reshape((3, 4))
    print arr
    print arr[0]
    print arr - arr[0]

    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"),
                      index=["Utah", "Ohio", "Texas", "Oregon"])
    series = frame.ix[0]
    print frame
    print series
    print frame - series

    series2 = Series(range(3), index=list("bef"))
    print frame + series2

    series3 = frame["d"]
    series4 = frame.ix[0]
    print frame
    print series3
    print series4
    print frame.sub(series3, axis=0)
    print frame.sub(series4, axis=1)

    # apply function and mapping
    print "", ""
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"),
                      index=["Utah", "Ohio", "Texas", "Oregon"])
    print frame
    f = lambda x: x.max() - x.min()
    print frame.apply(f)
    print frame.apply(f, axis=1)
    f = lambda x: Series([x.min(), x.max()], index=["min", "max"])
    print frame.apply(f)
    format = lambda x: "{0:.2f}".format(x)
    print frame.applymap(format)  # frame
    print frame["e"].map(format)  # series

    # sort and rank
    print "", ""
    obj = Series(range(4), index=list("dabc"))
    print obj
    print obj.sort_index()
    frame = DataFrame(np.arange(8).reshape((2, 4)), index=["three", "one"],
                      columns=list("dabc"))
    print frame
    print frame.sort_index()
    print frame.sort_index(axis=1)
    print frame.sort_index(axis=1, ascending=False)

    # Sorting series
    print "", ""
    obj = Series([4, 7, -3, 2])
    print obj.order()
    obj = Series([4, np.nan, 7, np.nan, -3, 2])
    print obj.order()
    print obj.order(ascending=False)

    # order by multi columns
    print "", ""
    frame = DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
    print frame.sort_index(by=["a", "b"])

    # rank
    print "", ""
    obj = Series([7, -5, 7, 4, 2, 0, 4])
    print obj.rank()  # method is average
    print obj.rank(method="first")  # No Duplicates
    print obj.rank(ascending=False, method="min")
    print obj.rank(ascending=False, method="max")
    f1 = DataFrame(obj, columns=["data"])
    f2 = DataFrame(obj.rank(), columns=["rank"])
    # merge by each index
    print pd.merge(f1, f2, left_index=True, right_index=True)

    # Index of the axis with duplicate values
    print "", ""
    obj = Series(range(5), index=list("aaabc"))
    print obj
    print obj.index.is_unique
    print obj["a"]
    print obj["c"]
    df = DataFrame(np.arange(12.0).reshape((4, 3)), index=list("aabb"),
                   columns=list("ccd"))
    print df
    print df.ix["b"]
    print df["c"]
# Assign indexes
mySeries = Series([4, 5, 2], index=['Apples', 'Oranges', 'Grapes'])
mySeries

# Data filtering
mySeries['Oranges']
'Apples' in mySeries
mySeries

# Add a value
mySeries['Pears'] = 6
mySeries

# Delete a value
mySeries.drop(['Oranges'])

# Create series from a dictionary
myDict = {'USA': 75, 'Canada': 20}
dictSeries = Series(myDict)
dictSeries

# Creating a data frame from dictionary
empDict = {'id': [1, 2, 3, 4], 'name': ['Mark', 'Ian', 'Sam', 'Rich'],
           'isManager': [False, True, False, True]}
## Data Structure : Data Frame from a dictionary
empDict = {'id': [1, 2, 3, 4]}
empDf = DataFrame(empDict)

# Access rows and columns
empDf.name
def test_drop():
    # unique
    s = Series([1, 2], index=['one', 'two'])
    expected = Series([1], index=['one'])
    result = s.drop(['two'])
    assert_series_equal(result, expected)
    result = s.drop('two', axis='rows')
    assert_series_equal(result, expected)

    # non-unique
    # GH 5248
    s = Series([1, 1, 2], index=['one', 'two', 'one'])
    expected = Series([1, 2], index=['one', 'one'])
    result = s.drop(['two'], axis=0)
    assert_series_equal(result, expected)
    result = s.drop('two')
    assert_series_equal(result, expected)
    expected = Series([1], index=['two'])
    result = s.drop(['one'])
    assert_series_equal(result, expected)
    result = s.drop('one')
    assert_series_equal(result, expected)

    # single string/tuple-like
    s = Series(range(3), index=list('abc'))
    pytest.raises(KeyError, s.drop, 'bc')
    pytest.raises(KeyError, s.drop, ('a',))

    # errors='ignore'
    s = Series(range(3), index=list('abc'))
    result = s.drop('bc', errors='ignore')
    assert_series_equal(result, s)
    result = s.drop(['a', 'd'], errors='ignore')
    expected = s.iloc[1:]
    assert_series_equal(result, expected)

    # bad axis
    pytest.raises(ValueError, s.drop, 'one', axis='columns')

    # GH 8522
    s = Series([2, 3], index=[True, False])
    assert s.index.is_object()
    result = s.drop(True)
    expected = Series([3], index=[False])
    assert_series_equal(result, expected)

    # GH 16877
    s = Series([2, 3], index=[0, 1])
    with tm.assert_raises_regex(KeyError, 'not contained in axis'):
        s.drop([False, True])
def editor(interrogation, operation=None, denominator=False, sort_by=False, keep_stats=False, keep_top=False, just_totals=False, threshold='medium', just_entries=False, skip_entries=False, merge_entries=False, just_subcorpora=False, skip_subcorpora=False, span_subcorpora=False, merge_subcorpora=False, replace_names=False, replace_subcorpus_names=False, projection=False, remove_above_p=False, p=0.05, print_info=False, spelling=False, selfdrop=True, calc_all=True, keyword_measure='ll', **kwargs ): """ See corpkit.interrogation.Interrogation.edit() for docstring """ # grab arguments, in case we get dict input and have to iterate locs = locals() import corpkit import re import collections import pandas as pd import numpy as np from pandas import DataFrame, Series from time import localtime, strftime try: get_ipython().getoutput() except TypeError: have_ipython = True except NameError: have_ipython = False try: from IPython.display import display, clear_output except ImportError: pass # to use if we also need to worry about concordance lines return_conc = False from corpkit.interrogation import Interrodict, Interrogation, Concordance if interrogation.__class__ == Interrodict: locs.pop('interrogation', None) from collections import OrderedDict outdict = OrderedDict() for i, (k, v) in enumerate(interrogation.items()): # only print the first time around if i != 0: locs['print_info'] = False if isinstance(denominator, STRINGTYPE) and denominator.lower() == 'self': denominator = interrogation # if df2 is also a dict, get the relevant entry if isinstance(denominator, (dict, Interrodict)): #if sorted(set([i.lower() for i in list(dataframe1.keys())])) == \ # sorted(set([i.lower() for i in list(denominator.keys())])): # locs['denominator'] = denominator[k] # fix: this repeats itself for every key, when it doesn't need to # denominator_sum: if kwargs.get('denominator_sum'): locs['denominator'] = denominator.collapse(axis='key') if kwargs.get('denominator_totals'): locs['denominator'] = denominator[k].totals else: locs['denominator'] = denominator[k].results outdict[k] = v.results.edit(**locs) if print_info: thetime = strftime("%H:%M:%S", localtime()) print("\n%s: Finished! 
Output is a dictionary with keys:\n\n '%s'\n" % (thetime, "'\n '".join(sorted(outdict.keys())))) return Interrodict(outdict) elif isinstance(interrogation, (DataFrame, Series)): dataframe1 = interrogation elif isinstance(interrogation, Interrogation): #if interrogation.__dict__.get('concordance', None) is not None: # concordances = interrogation.concordance branch = kwargs.pop('branch', 'results') if branch.lower().startswith('r') : dataframe1 = interrogation.results elif branch.lower().startswith('t'): dataframe1 = interrogation.totals elif branch.lower().startswith('c'): dataframe1 = interrogation.concordance return_conc = True else: dataframe1 = interrogation.results elif isinstance(interrogation, Concordance) or \ all(x in list(dataframe1.columns) for x in [ 'l', 'm', 'r']): return_conc = True print('heree') dataframe1 = interrogation # hope for the best else: dataframe1 = interrogation the_time_started = strftime("%Y-%m-%d %H:%M:%S") pd.options.mode.chained_assignment = None try: from process import checkstack except ImportError: from corpkit.process import checkstack if checkstack('pythontex'): print_info=False def combiney(df, df2, operation='%', threshold='medium', prinf=True): """mash df and df2 together in appropriate way""" totals = False # delete under threshold if just_totals: if using_totals: if not single_totals: to_drop = list(df2[df2['Combined total'] < threshold].index) df = df.drop([e for e in to_drop if e in list(df.index)]) if prinf: to_show = [] [to_show.append(w) for w in to_drop[:5]] if len(to_drop) > 10: to_show.append('...') [to_show.append(w) for w in to_drop[-5:]] if len(to_drop) > 0: print('Removing %d entries below threshold:\n %s' % (len(to_drop), '\n '.join(to_show))) if len(to_drop) > 10: print('... and %d more ... \n' % (len(to_drop) - len(to_show) + 1)) else: print('') else: denom = df2 else: denom = list(df2) if single_totals: if operation == '%': totals = df.sum() * 100.0 / float(df.sum().sum()) df = df * 100.0 try: df = df.div(denom, axis=0) except ValueError: thetime = strftime("%H:%M:%S", localtime()) print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime) elif operation == '+': try: df = df.add(denom, axis=0) except ValueError: thetime = strftime("%H:%M:%S", localtime()) print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime) elif operation == '-': try: df = df.sub(denom, axis=0) except ValueError: thetime = strftime("%H:%M:%S", localtime()) print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime) elif operation == '*': totals = df.sum() * float(df.sum().sum()) try: df = df.mul(denom, axis=0) except ValueError: thetime = strftime("%H:%M:%S", localtime()) print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime) elif operation == '/': try: totals = df.sum() / float(df.sum().sum()) df = df.div(denom, axis=0) except ValueError: thetime = strftime("%H:%M:%S", localtime()) print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime) elif operation == 'a': for c in [c for c in list(df.columns) if int(c) > 1]: df[c] = df[c] * (1.0 / int(c)) df = df.sum(axis=1) / df2 elif operation.startswith('c'): import warnings with warnings.catch_warnings(): warnings.simplefilter("ignore") df = pandas.concat([df, df2], axis=1) return df, totals elif not single_totals: if not operation.startswith('a'): # generate totals if operation == '%': totals = df.sum() * 100.0 / float(df2.sum().sum()) if operation == '*': totals = df.sum() * float(df2.sum().sum()) if operation == '/': totals = 
df.sum() / float(df2.sum().sum()) if operation.startswith('c'): # add here the info that merging will not work # with identical colnames import warnings with warnings.catch_warnings(): warnings.simplefilter("ignore") d = pd.concat([df.T, df2.T]) # make index nums d = d.reset_index() # sum and remove duplicates d = d.groupby('index').sum() dx = d.reset_index('index') dx.index = list(dx['index']) df = dx.drop('index', axis=1).T def editf(datum): meth = {'%': datum.div, '*': datum.mul, '/': datum.div, '+': datum.add, '-': datum.sub} if datum.name in list(df2.columns): method = meth[operation] mathed = method(df2[datum.name], fill_value=0.0) if operation == '%': return mathed * 100.0 else: return mathed else: return datum * 0.0 df = df.apply(editf) else: for c in [c for c in list(df.columns) if int(c) > 1]: df[c] = df[c] * (1.0 / int(c)) df = df.sum(axis=1) / df2.T.sum() return df, totals def parse_input(df, the_input): """turn whatever has been passed in into list of words that can be used as pandas indices---maybe a bad way to go about it""" parsed_input = False import re if the_input == 'all': the_input = r'.*' if isinstance(the_input, int): try: the_input = str(the_input) except: pass the_input = [the_input] elif isinstance(the_input, STRINGTYPE): regex = re.compile(the_input) parsed_input = [w for w in list(df) if re.search(regex, w)] return parsed_input from corpkit.dictionaries.process_types import Wordlist if isinstance(the_input, Wordlist) or the_input.__class__ == Wordlist: the_input = list(the_input) if isinstance(the_input, list): if isinstance(the_input[0], int): parsed_input = [word for index, word in enumerate(list(df)) if index in the_input] elif isinstance(the_input[0], STRINGTYPE): try: parsed_input = [word for word in the_input if word in df.columns] except AttributeError: # if series parsed_input = [word for word in the_input if word in df.index] return parsed_input def synonymise(df, pos='n'): """pass a df and a pos and convert df columns to most common synonyms""" from nltk.corpus import wordnet as wn #from dictionaries.taxonomies import taxonomies from collections import Counter fixed = [] for w in list(df.columns): try: syns = [] for syns in wn.synsets(w, pos=pos): for w in syns: synonyms.append(w) top_syn = Counter(syns).most_common(1)[0][0] fixed.append(top_syn) except: fixed.append(w) df.columns = fixed return df def convert_spell(df, convert_to='US', print_info=print_info): """turn dataframes into us/uk spelling""" from dictionaries.word_transforms import usa_convert if print_info: print('Converting spelling ... \n') if convert_to == 'UK': usa_convert = {v: k for k, v in list(usa_convert.items())} fixed = [] for val in list(df.columns): try: fixed.append(usa_convert[val]) except: fixed.append(val) df.columns = fixed return df def merge_duplicates(df, print_info=print_info): if print_info: print('Merging duplicate entries ... 
\n') # now we have to merge all duplicates for dup in df.columns.get_duplicates(): #num_dupes = len(list(df[dup].columns)) temp = df[dup].sum(axis=1) #df = df.drop([dup for d in range(num_dupes)], axis=1) df = df.drop(dup, axis=1) df[dup] = temp return df def name_replacer(df, replace_names, print_info=print_info): """replace entry names and merge""" import re # get input into list of tuples # if it's a string, we want to delete it if isinstance(replace_names, STRINGTYPE): replace_names = [(replace_names, '')] # this is for some malformed list if not isinstance(replace_names, dict): if isinstance(replace_names[0], STRINGTYPE): replace_names = [replace_names] # if dict, make into list of tupes if isinstance(replace_names, dict): replace_names = [(v, k) for k, v in replace_names.items()] for to_find, replacement in replace_names: if print_info: if replacement: print('Replacing "%s" with "%s" ...\n' % (to_find, replacement)) else: print('Deleting "%s" from entry names ...\n' % to_find) to_find = re.compile(to_find) if not replacement: replacement = '' df.columns = [re.sub(to_find, replacement, l) for l in list(df.columns)] df = merge_duplicates(df, print_info=False) return df def just_these_entries(df, parsed_input, prinf=True): entries = [word for word in list(df) if word not in parsed_input] if prinf: print('Keeping %d entries:\n %s' % \ (len(parsed_input), '\n '.join(parsed_input[:10]))) if len(parsed_input) > 10: print('... and %d more ... \n' % (len(parsed_input) - 10)) else: print('') df = df.drop(entries, axis=1) return df def skip_these_entries(df, parsed_input, prinf=True): if prinf: print('Skipping %d entries:\n %s' % \ (len(parsed_input), '\n '.join(parsed_input[:10]))) if len(parsed_input) > 10: print('... and %d more ... \n' % (len(parsed_input) - 10)) else: print('') df = df.drop(parsed_input, axis=1) return df def newname_getter(df, parsed_input, newname='combine', prinf=True, merging_subcorpora=False): """makes appropriate name for merged entries""" if merging_subcorpora: if newname is False: newname = 'combine' if isinstance(newname, int): the_newname = list(df.columns)[newname] elif isinstance(newname, STRINGTYPE): if newname == 'combine': if len(parsed_input) <= 3: the_newname = '/'.join(parsed_input) elif len(parsed_input) > 3: the_newname = '/'.join(parsed_input[:3]) + '...' else: the_newname = newname if not newname: # revise this code import operator sumdict = {} for item in parsed_input: summed = sum(list(df[item])) sumdict[item] = summed the_newname = max(iter(sumdict.items()), key=operator.itemgetter(1))[0] if not isinstance(the_newname, STRINGTYPE): the_newname = str(the_newname, errors='ignore') return the_newname def merge_these_entries(df, parsed_input, the_newname, prinf=True, merging='entries'): # make new entry with sum of parsed input if len(parsed_input) == 0: import warnings warnings.warn('No %s could be automatically merged.\n' % merging) else: if prinf: print('Merging %d %s as "%s":\n %s' % \ (len(parsed_input), merging, the_newname, '\n '.join(parsed_input[:10]))) if len(parsed_input) > 10: print('... and %d more ... 
\n' % (len(parsed_input) - 10)) else: print('') # remove old entries temp = sum([df[i] for i in parsed_input]) if isinstance(df, Series): df = df.drop(parsed_input, errors='ignore') nms = list(df.index) else: df = df.drop(parsed_input, axis=1, errors='ignore') nms = list(df.columns) if the_newname in nms: df[the_newname] = df[the_newname] + temp else: df[the_newname] = temp return df def just_these_subcorpora(df, lst_of_subcorpora, prinf=True): if isinstance(lst_of_subcorpora[0], int): lst_of_subcorpora = [str(l) for l in lst_of_subcorpora] good_years = [subcorpus for subcorpus in list(df.index) if subcorpus in lst_of_subcorpora] if prinf: print('Keeping %d subcorpora:\n %s' % (len(good_years), '\n '.join(good_years[:10]))) if len(good_years) > 10: print('... and %d more ... \n' % (len(good_years) - 10)) else: print('') df = df.drop([subcorpus for subcorpus in list(df.index) if subcorpus not in good_years], axis=0) return df def skip_these_subcorpora(df, lst_of_subcorpora, prinf=True): if isinstance(lst_of_subcorpora, int): lst_of_subcorpora = [lst_of_subcorpora] if isinstance(lst_of_subcorpora[0], int): lst_of_subcorpora = [str(l) for l in lst_of_subcorpora] bad_years = [subcorpus for subcorpus in list(df.index) if subcorpus in lst_of_subcorpora] if len(bad_years) == 0: import warnings warnings.warn('No subcorpora skipped.\n') else: if prinf: print('Skipping %d subcorpora:\n %s' % (len(bad_years), '\n '.join([str(i) for i in bad_years[:10]]))) if len(bad_years) > 10: print('... and %d more ... \n' % (len(bad_years) - 10)) else: print('') df = df.drop([subcorpus for subcorpus in list(df.index) if subcorpus in bad_years], axis=0) return df def span_these_subcorpora(df, lst_of_subcorpora, prinf=True): """select only a span of suborpora (first, last)""" fir, sec = lst_of_subcorpora if len(lst_of_subcorpora) == 0: import warnings warnings.warn('Span not identified.\n') else: if prinf: print('Keeping subcorpora:\n %d--%d\n' % (int(fir), int(sec))) sbs = list(df.index) df = df.ix[sbs.index(fir):sbs.index(sec) + 1] return df def projector(df, list_of_tuples, prinf=True): """project abs values""" if isinstance(list_of_tuples, list): tdict = {} for a, b in list_of_tuples: tdict[a] = b list_of_tuples = tdict for subcorpus, projection_value in list(list_of_tuples.items()): if isinstance(subcorpus, int): subcorpus = str(subcorpus) df.ix[subcorpus] = df.ix[subcorpus] * projection_value if prinf: if isinstance(projection_value, float): print('Projection: %s * %s' % (subcorpus, projection_value)) if isinstance(projection_value, int): print('Projection: %s * %d' % (subcorpus, projection_value)) if prinf: print('') return df def do_stats(df): """do linregress and add to df""" try: from scipy.stats import linregress except ImportError: thetime = strftime("%H:%M:%S", localtime()) print('%s: sort type not available in this verion of corpkit.' 
% thetime) return False indices = list(df.index) first_year = list(df.index)[0] try: x = [int(y) - int(first_year) for y in indices] except ValueError: x = list(range(len(indices))) statfields = ['slope', 'intercept', 'r', 'p', 'stderr'] stats = [] if isinstance(df, Series): y = list(df.values) sl = Series(list(linregress(x, y)), index=statfields) else: for entry in list(df.columns): y = list(df[entry]) stats.append(list(linregress(x, y))) sl = DataFrame(zip(*stats), index=statfields, columns=list(df.columns)) df = df.append(sl) # drop infinites and nans df = df.replace([np.inf, -np.inf], np.nan) df = df.fillna(0.0) return df def resort(df, sort_by = False, keep_stats = False): """sort results, potentially using scipy's linregress""" # translate options and make sure they are parseable stat_field = ['slope', 'intercept', 'r', 'p', 'stderr'] easy_sorts = ['total', 'infreq', 'name', 'most', 'least'] stat_sorts = ['increase', 'decrease', 'static', 'turbulent'] options = stat_field + easy_sorts + stat_sorts sort_by_convert = {'most': 'total', True: 'total', 'least': 'infreq'} sort_by = sort_by_convert.get(sort_by, sort_by) # probably broken :( if just_totals: if sort_by == 'name': return df.sort_index() else: return df.sort_values(by='Combined total', ascending=sort_by != 'total', axis=1) stats_done = False if keep_stats or sort_by in stat_field + stat_sorts: df = do_stats(df) stats_done = True if isinstance(df, bool): if df is False: return False if isinstance(df, Series): if stats_done: stats = df.ix[range(-5, 0)] df = df.drop(list(stats.index)) if sort_by == 'name': df = df.sort_index() else: df = df.sort_values(ascending=sort_by != 'total') if stats_done: df = df.append(stats) return df if sort_by == 'name': # currently case sensitive df = df.reindex_axis(sorted(df.columns), axis=1) elif sort_by in ['total', 'infreq']: if df1_istotals: df = df.T df = df[list(df.sum().sort_values(ascending=sort_by != 'total').index)] # sort by slope etc., or search by subcorpus name if sort_by in stat_field or sort_by not in options: asc = kwargs.get('reverse', False) df = df.T.sort_values(by=sort_by, ascending=asc).T if sort_by in ['increase', 'decrease', 'static', 'turbulent']: slopes = df.ix['slope'] if sort_by == 'increase': df = df[slopes.argsort()[::-1]] elif sort_by == 'decrease': df = df[slopes.argsort()] elif sort_by == 'static': df = df[slopes.abs().argsort()] elif sort_by == 'turbulent': df = df[slopes.abs().argsort()[::-1]] if remove_above_p: df = df.T df = df[df['p'] <= p] df = df.T # remove stats field by default if not keep_stats: df = df.drop(stat_field, axis=0, errors='ignore') return df def set_threshold(big_list, threshold, prinf=True): if isinstance(threshold, STRINGTYPE): if threshold.startswith('l'): denominator = 10000 if threshold.startswith('m'): denominator = 5000 if threshold.startswith('h'): denominator = 2500 if isinstance(big_list, DataFrame): tot = big_list.sum().sum() if isinstance(big_list, Series): tot = big_list.sum() tshld = float(tot) / float(denominator) else: tshld = threshold if prinf: print('Threshold: %d\n' % tshld) return tshld # copy dataframe to be very safe df = dataframe1.copy() # make cols into strings try: df.columns = [str(c) for c in list(df.columns)] except: pass if operation is None: operation = 'None' if isinstance(interrogation, Concordance): return_conc = True # do concordance work if return_conc: if just_entries: if isinstance(just_entries, int): just_entries = [just_entries] if isinstance(just_entries, STRINGTYPE): df = 
df[df['m'].str.contains(just_entries)] if isinstance(just_entries, list): if all(isinstance(e, STRINGTYPE) for e in just_entries): mp = df['m'].map(lambda x: x in just_entries) df = df[mp] else: df = df.ix[just_entries] if skip_entries: if isinstance(skip_entries, int): skip_entries = [skip_entries] if isinstance(skip_entries, STRINGTYPE): df = df[~df['m'].str.contains(skip_entries)] if isinstance(skip_entries, list): if all(isinstance(e, STRINGTYPE) for e in skip_entries): mp = df['m'].map(lambda x: x not in skip_entries) df = df[mp] else: df = df.drop(skip_entries, axis=0) if just_subcorpora: if isinstance(just_subcorpora, int): just_subcorpora = [just_subcorpora] if isinstance(just_subcorpora, STRINGTYPE): df = df[df['c'].str.contains(just_subcorpora)] if isinstance(just_subcorpora, list): if all(isinstance(e, STRINGTYPE) for e in just_subcorpora): mp = df['c'].map(lambda x: x in just_subcorpora) df = df[mp] else: df = df.ix[just_subcorpora] if skip_subcorpora: if isinstance(skip_subcorpora, int): skip_subcorpora = [skip_subcorpora] if isinstance(skip_subcorpora, STRINGTYPE): df = df[~df['c'].str.contains(skip_subcorpora)] if isinstance(skip_subcorpora, list): if all(isinstance(e, STRINGTYPE) for e in skip_subcorpora): mp = df['c'].map(lambda x: x not in skip_subcorpora) df = df[mp] else: df = df.drop(skip_subcorpora, axis=0) return Concordance(df) if print_info: print('\n***Processing results***\n========================\n') df1_istotals = False if isinstance(df, Series): df1_istotals = True df = DataFrame(df) # if just a single result else: df = DataFrame(df) if operation.startswith('k'): if sort_by is False: if not df1_istotals: sort_by = 'turbulent' if df1_istotals: df = df.T # figure out if there's a second list # copy and remove totals if there is single_totals = True using_totals = False outputmode = False if denominator.__class__ == Interrogation: try: denominator = denominator.results except AttributeError: denominator = denominator.totals if denominator is not False and not isinstance(denominator, STRINGTYPE): df2 = denominator.copy() using_totals = True if isinstance(df2, DataFrame): if len(df2.columns) > 1: single_totals = False else: df2 = Series(df2) elif isinstance(df2, Series): single_totals = True #if operation == 'k': #raise ValueError('Keywording requires a DataFrame for denominator. Use "self"?') else: if operation in ['k', 'a', '%', '/', '*', '-', '+']: denominator = 'self' if denominator == 'self': outputmode = True if operation.startswith('a') or operation.startswith('A'): if list(df.columns)[0] != '0' and list(df.columns)[0] != 0: df = df.T if using_totals: if not single_totals: df2 = df2.T if projection: # projection shouldn't do anything when working with '%', remember. 
df = projector(df, projection)
if using_totals:
    df2 = projector(df2, projection)

if spelling:
    df = convert_spell(df, convert_to=spelling)
    df = merge_duplicates(df, print_info=False)
    if not single_totals:
        df2 = convert_spell(df2, convert_to=spelling, print_info=False)
        df2 = merge_duplicates(df2, print_info=False)
    if not df1_istotals:
        sort_by = 'total'

if replace_names:
    df = name_replacer(df, replace_names)
    df = merge_duplicates(df)
    if not single_totals:
        df2 = name_replacer(df2, replace_names, print_info=False)
        df2 = merge_duplicates(df2, print_info=False)
    if not sort_by:
        sort_by = 'total'

if replace_subcorpus_names:
    df = name_replacer(df.T, replace_subcorpus_names)
    df = merge_duplicates(df).T
    df = df.sort_index()
    if not single_totals:
        if isinstance(df2, DataFrame):
            df2 = df2.T
        df2 = name_replacer(df2, replace_subcorpus_names, print_info=False)
        df2 = merge_duplicates(df2, print_info=False)
        if isinstance(df2, DataFrame):
            df2 = df2.T
        df2 = df2.sort_index()
    if not sort_by:
        sort_by = 'total'

# remove old stats if they're there:
statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
try:
    df = df.drop(statfields, axis=0)
except:
    pass
if using_totals:
    try:
        df2 = df2.drop(statfields, axis=0)
    except:
        pass

# remove totals and tkinter order
for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]):
    if name == 'Total' and df1_istotals:
        continue
    try:
        df = df.drop(name, axis=ax, errors='ignore')
    except:
        pass
for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]):
    if name == 'Total' and single_totals:
        continue
    try:
        df2 = df2.drop(name, axis=ax, errors='ignore')
    except:
        pass

# merging: make dicts if they aren't already, so we can iterate
if merge_entries:
    if not isinstance(merge_entries, list):
        if isinstance(merge_entries, STRINGTYPE):
            merge_entries = {'combine': merge_entries}
        # for newname, criteria
        for name, the_input in sorted(merge_entries.items()):
            pin = parse_input(df, the_input)
            the_newname = newname_getter(df, pin, newname=name, prinf=print_info)
            df = merge_these_entries(df, pin, the_newname, prinf=print_info)
            if not single_totals:
                pin2 = parse_input(df2, the_input)
                df2 = merge_these_entries(df2, pin2, the_newname, prinf=False)
    else:
        for i in merge_entries:
            pin = parse_input(df, merge_entries)
            the_newname = newname_getter(df, pin, prinf=print_info)
            df = merge_these_entries(df, pin, the_newname, prinf=print_info)
            if not single_totals:
                pin2 = parse_input(df2, merge_entries)
                df2 = merge_these_entries(df2, pin2, the_newname, prinf=False)

if merge_subcorpora:
    if not isinstance(merge_subcorpora, dict):
        if isinstance(merge_subcorpora, list):
            if isinstance(merge_subcorpora[0], tuple):
                merge_subcorpora = {x: y for x, y in merge_subcorpora}
            elif isinstance(merge_subcorpora[0], STRINGTYPE):
                merge_subcorpora = {'combine': [x for x in merge_subcorpora]}
            elif isinstance(merge_subcorpora[0], int):
                merge_subcorpora = {'combine': [str(x) for x in merge_subcorpora]}
        else:
            merge_subcorpora = {'combine': merge_subcorpora}
    for name, the_input in sorted(merge_subcorpora.items()):
        pin = parse_input(df.T, the_input)
        the_newname = newname_getter(df.T, pin, newname=name,
                                     merging_subcorpora=True, prinf=print_info)
        df = merge_these_entries(df.T, pin, the_newname,
                                 merging='subcorpora', prinf=print_info).T
        if using_totals:
            pin2 = parse_input(df2.T, the_input)
            df2 = merge_these_entries(df2.T, pin2, the_newname,
                                      merging='subcorpora', prinf=False).T

if just_subcorpora:
    df = just_these_subcorpora(df, just_subcorpora, prinf=print_info)
    if using_totals:
        df2 = just_these_subcorpora(df2, just_subcorpora, prinf=False)

if skip_subcorpora:
    df = skip_these_subcorpora(df, skip_subcorpora, prinf=print_info)
    if using_totals:
        df2 = skip_these_subcorpora(df2, skip_subcorpora, prinf=False)

if span_subcorpora:
    df = span_these_subcorpora(df, span_subcorpora, prinf=print_info)
    if using_totals:
        df2 = span_these_subcorpora(df2, span_subcorpora, prinf=False)

if just_entries:
    df = just_these_entries(df, parse_input(df, just_entries), prinf=print_info)
    if not single_totals:
        df2 = just_these_entries(df2, parse_input(df2, just_entries), prinf=False)

if skip_entries:
    df = skip_these_entries(df, parse_input(df, skip_entries), prinf=print_info)
    if not single_totals:
        df2 = skip_these_entries(df2, parse_input(df2, skip_entries), prinf=False)

# drop infinites and nans
df = df.replace([np.inf, -np.inf], np.nan)
df = df.fillna(0.0)

if just_totals:
    df = DataFrame(df.sum(), columns=['Combined total'])
    if using_totals:
        if not single_totals:
            df2 = DataFrame(df2.sum(), columns=['Combined total'])
        else:
            df2 = df2.sum()

tots = df.sum(axis=1)

if using_totals or outputmode:
    if not operation.startswith('k'):
        tshld = 0
        # set a threshold if just_totals
        if outputmode is True:
            df2 = df.T.sum()
            if not just_totals:
                df2.name = 'Total'
            else:
                df2.name = 'Combined total'
            using_totals = True
            single_totals = True
        if just_totals:
            if not single_totals:
                tshld = set_threshold(df2, threshold, prinf=print_info)
        df, tots = combiney(df, df2, operation=operation,
                            threshold=tshld, prinf=print_info)

# if doing keywording...
if operation.startswith('k'):
    if isinstance(denominator, STRINGTYPE):
        if denominator == 'self':
            df2 = df.copy()
    else:
        df2 = denominator
    from corpkit.keys import keywords
    df = keywords(df, df2, selfdrop=selfdrop, threshold=threshold,
                  print_info=print_info, editing=True, calc_all=calc_all,
                  sort_by=sort_by, measure=keyword_measure, **kwargs)

# drop infinites and nans
df = df.replace([np.inf, -np.inf], np.nan)
df = df.fillna(0.0)

# resort data
if sort_by or keep_stats:
    df = resort(df, keep_stats=keep_stats, sort_by=sort_by)
    if isinstance(df, bool):
        if df is False:
            return 'linregress'

if keep_top:
    if not just_totals:
        df = df[list(df.columns)[:keep_top]]
    else:
        df = df.head(keep_top)

if just_totals:
    # turn just_totals into series:
    df = Series(df['Combined total'], name='Combined total')

if df1_istotals:
    if operation.startswith('k'):
        try:
            df = Series(df.loc[dataframe1.name])
            df.name = '%s: keyness' % df.name
        except:
            df = df.iloc[0, :]
            df.name = 'keyness'

# generate totals branch if not percentage results:
# fix me
if df1_istotals or operation.startswith('k'):
    if not just_totals:
        try:
            total = Series(df['Total'], name='Total')
        except:
            total = 'none'
            #total = df.copy()
    else:
        total = 'none'
else:
    # might be wrong if using division or something...
    try:
        total = df.T.sum(axis=1)
    except:
        total = 'none'

if not isinstance(tots, DataFrame) and not isinstance(tots, Series):
    total = df.sum(axis=1)
else:
    total = tots

if isinstance(df, DataFrame):
    datatype = df.iloc[0].dtype
else:
    datatype = df.dtype
locs['datatype'] = datatype

# TURN INT COL NAMES INTO STR
try:
    df.results.columns = [str(d) for d in list(df.results.columns)]
except:
    pass

def add_tkt_index(df):
    """add an order for tkintertable if using gui"""
    if isinstance(df, Series):
        df = df.T
    df = df.drop('tkintertable-order', errors='ignore', axis=0)
    df = df.drop('tkintertable-order', errors='ignore', axis=1)
    dat = [i for i in range(len(df.index))]
    df['tkintertable-order'] = Series(dat, index=list(df.index))
    df = df.T
    return df

# while tkintertable can't sort rows
if checkstack('tkinter'):
    df = add_tkt_index(df)

if kwargs.get('df1_always_df'):
    if isinstance(df, Series):
        df = DataFrame(df)

# delete non-appearing conc lines
if not hasattr(interrogation, 'concordance'):
    lns = None
elif hasattr(interrogation, 'concordance') and interrogation.concordance is None:
    lns = None
else:
    col_crit = interrogation.concordance['m'].map(lambda x: x in list(df.columns))
    ind_crit = interrogation.concordance['c'].map(lambda x: x in list(df.index))
    lns = interrogation.concordance[col_crit]
    lns = lns.loc[ind_crit]
    lns = Concordance(lns)

output = Interrogation(results=df, totals=total, query=locs, concordance=lns)

if print_info:
    print('***Done!***\n========================\n')

return output
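# Note on the cleanup above: it wraps df.drop(...) in try/except while also
# passing errors='ignore', but the keyword alone already suppresses
# missing-label KeyErrors. A minimal standalone sketch of that behaviour
# (the toy frame and its labels are invented for illustration):
import pandas as pd
toy = pd.DataFrame({'a': [1, 2]}, index=['slope', 'kept'])
toy = toy.drop(['slope', 'intercept'], axis=0, errors='ignore')  # no KeyError
print(toy)   # only the 'kept' row remains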
summ = s1.describe()
summ["mean"]

########################################################
#### unique and nunique
## unique() returns the unique elements of a Series.
## nunique() returns the number of unique values in a Series.

########################################################
#### drop and dropna
## drop(labels) drops elements with the selected labels from a Series.
s1 = Series(arange(1.0,6),index=["a","a","b","c","d"])
s1
s1.drop("a")
################
## dropna() is similar to drop() except that it only drops null values – NaN or similar.
s1 = Series(arange(1.0,4.0),index=["a","b","c"])
s2 = Series(arange(1.0,4.0),index=["c","d","e"])
s3 = s1 + s2
s3
s3.dropna()
##############################################################################
#### fillna
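## fillna(value) replaces missing (NaN) values rather than dropping them.
## A minimal sketch, assuming the Series s3 built in the dropna section above:
s3.fillna(0.0)
s3.fillna(s3.mean())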
def interrogator(corpus, search, query='any', show='w', exclude=False,
                 excludemode='any', searchmode='all',
                 dep_type='collapsed-ccprocessed-dependencies',
                 case_sensitive=False, quicksave=False, just_speakers=False,
                 preserve_case=False, lemmatag=False, files_as_subcorpora=False,
                 only_unique=False, random=False, only_format_match=False,
                 multiprocess=False, spelling=False,
                 regex_nonword_filter=r'[A-Za-z0-9:_]', gramsize=2,
                 split_contractions=False, do_concordancing=False,
                 maxconc=9999, **kwargs):
    """interrogate corpus, corpora, subcorpus and file objects

    see corpkit.interrogation.interrogate() for docstring"""

    only_conc = False
    no_conc = False
    if do_concordancing is False:
        no_conc = True
    if isinstance(do_concordancing, str) and do_concordancing.lower() == 'only':
        only_conc = True
        no_conc = False
    # iteratively count conc lines
    numconc = 0

    # store kwargs
    locs = locals()
    if kwargs:
        for k, v in kwargs.items():
            locs[k] = v
    locs.pop('kwargs', None)

    import corpkit
    from interrogation import Interrogation
    from process import tregex_engine
    import pandas as pd
    from pandas import DataFrame, Series
    from collections import Counter
    from other import as_regex
    from process import get_deps
    from time import localtime, strftime
    from textprogressbar import TextProgressBar
    from process import animator
    from dictionaries.word_transforms import wordlist, taglemma
    import corenlp_xml
    import codecs
    import signal

    original_sigint = signal.getsignal(signal.SIGINT)

    if kwargs.get('paralleling', None) is None:

        def signal_handler(signal, frame):
            """pause on ctrl+c, rather than just stop loop"""
            import signal
            import sys
            from time import localtime, strftime
            signal.signal(signal.SIGINT, original_sigint)
            thetime = strftime("%H:%M:%S", localtime())
            prompt = '\n\n%s: Paused. Press any key to resume, or ctrl+c to quit.\n' % thetime
            try:
                sel = raw_input(prompt)
            except NameError:
                sel = input(prompt)
            time = strftime("%H:%M:%S", localtime())
            print('%s: Interrogation resumed.\n' % time)
            signal.signal(signal.SIGINT, signal_handler)

        signal.signal(signal.SIGINT, signal_handler)

    # find out if using gui
    root = kwargs.get('root')
    note = kwargs.get('note')

    # convert path to corpus object
    if isinstance(corpus, str):
        from corpus import Corpus
        corpus = Corpus(corpus)

    # figure out how the user has entered the query and normalise
    from process import searchfixer
    search, search_iterable = searchfixer(search, query)

    # for better printing of query, esp during multiprocess
    # can remove if multiprocess printing improved
    if len(list(search.keys())) == 1:
        query = list(search.values())[0]

    if 'l' in show and search.get('t'):
        from nltk.stem.wordnet import WordNetLemmatizer
        lmtzr = WordNetLemmatizer()

    if isinstance(show, str):
        show = [show]

    def is_multiquery(corpus, search, query, just_speakers):
        """determine if multiprocessing is needed

        do some retyping if need be as well"""
        im = False
        from collections import OrderedDict
        if hasattr(corpus, '__iter__'):
            im = True
        # so we can do search = 't', query = ['NP', 'VP']:
        if isinstance(query, list):
            if query != list(search.values())[0] or len(list(search.keys())) > 1:
                query = {c.title(): c for c in query}
        if isinstance(query, (dict, OrderedDict)):
            im = True
        if just_speakers:
            if just_speakers == 'each':
                im = True
                just_speakers = ['each']
            if just_speakers == ['each']:
                im = True
            if isinstance(just_speakers, str):
                im = False
                just_speakers = [just_speakers]
            if isinstance(just_speakers, list):
                if len(just_speakers) > 1:
                    im = True
        if isinstance(search, dict):
            if all(isinstance(i, dict) for i in list(search.values())):
                im = True
        return im, corpus, search, query, just_speakers

    def slow_tregex(sents, **dummy_args):
        """do the speaker-specific version of tregex queries"""
        speakr = dummy_args.get('speaker', False)
        import os
        from process import tregex_engine
        # first, put the relevant trees into temp file
        if kwargs.get('outname'):
            to_open = 'tmp-%s.txt' % kwargs['outname']
        else:
            to_open = 'tmp.txt'
        to_write = '\n'.join([sent._parse_string.strip() for sent in sents
                              if sent.parse_string is not None])
        with open(to_open, "w") as fo:
            encd = to_write.encode('utf-8', errors='ignore') + '\n'
            fo.write(encd)
        q = list(search.values())[0]
        ops = ['-o', '-%s' % translated_option]
        concs = []
        res = tregex_engine(query=q, options=ops, corpus=to_open,
                            root=root, preserve_case=True)
        if not no_conc:
            ops += ['-w', '-f']
            whole_res = tregex_engine(query=q, options=ops, corpus=to_open,
                                      root=root, preserve_case=True)
            res = format_tregex(res)
            whole_res = format_tregex(whole_res, whole=True)
            concs = make_conc_lines_from_whole_mid(whole_res, res, speakr)
        if root:
            root.update()
        try:
            os.remove(to_open)
        except OSError:
            pass
        if countmode:
            return len(res)
        else:
            return res, concs

    def get_stats(sents, **dummy_args):
        """get a bunch of frequencies on interpersonal phenomena"""
        import os
        import re
        from collections import Counter
        global numdone
        statsmode_results = Counter()
        # first, put the relevant trees into temp file
        if kwargs.get('outname'):
            to_open = 'tmp-%s.txt' % kwargs['outname']
        else:
            to_open = 'tmp.txt'
        with open(to_open, "w") as fo:
            for sent in sents:
                statsmode_results['Sentences'] += 1
                sts = sent.parse_string.rstrip()
                encd = sts.encode('utf-8', errors='ignore') + '\n'
                fo.write(encd)
                deps = get_deps(sent, dep_type)
                numpass = len([x for x in deps.links if x.type.endswith('pass')])
                statsmode_results['Passives'] += numpass
                statsmode_results['Tokens'] += len(sent.tokens)
                words = [w.word for w in sent.tokens if w.word.isalnum()]
                statsmode_results['Words'] += len(words)
                statsmode_results['Characters'] += len(''.join(words))

        # count moods via trees (/\?/ !< __)
        from dictionaries.process_types import processes
        from other import as_regex
        tregex_qs = {
            'Imperative': r'ROOT < (/(S|SBAR)/ < (VP !< VBD !< VBG !$ NP !$ SBAR < NP !$-- S !$-- VP !$ VP)) !<< (/\?/ !< __) !<<- /-R.B-/ !<<, /(?i)^(-l.b-|hi|hey|hello|oh|wow|thank|thankyou|thanks|welcome)$/',
            'Open interrogative': r'ROOT < SBARQ <<- (/\?/ !< __)',
            'Closed interrogative': r'ROOT ( < (SQ < (NP $+ VP)) << (/\?/ !< __) | < (/(S|SBAR)/ < (VP $+ NP)) <<- (/\?/ !< __))',
            'Unmodalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP !< MD)))',
            'Modalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP < MD)))',
            'Open class words': r'/^(NN|JJ|VB|RB)/ < __',
            'Closed class words': r'__ !< __ !> /^(NN|JJ|VB|RB)/',
            'Clauses': r'/^S/ < __',
            'Interrogative': r'ROOT << (/\?/ !< __)',
            'Mental processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.mental, boundaries='w'),
            'Verbal processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.verbal, boundaries='w'),
            'Relational processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.relational, boundaries='w')}

        for name, q in sorted(tregex_qs.items()):
            res = tregex_engine(query=q, options=['-o', '-C'],
                                corpus=to_open, root=root)
            statsmode_results[name] += int(res)
            numdone += 1
            if root:
                root.update()
            else:
                tot_string = str(numdone + 1) + '/' + str(total_files)
                if kwargs.get('outname'):
                    tot_string = '%s: %s' % (kwargs['outname'], tot_string)
                animator(p, numdone, tot_string, **par_args)
            if kwargs.get('note', False):
                kwargs['note'].progvar.set((numdone * 100.0 / total_files / denom) + startnum)
        os.remove(to_open)
        return statsmode_results, []

    def make_conc_lines_from_whole_mid(wholes, middle_column_result, speakr=False):
        import re, os
        if speakr is False:
            speakr = ''
        conc_lines = []
        # remove duplicates from results
        unique_wholes = []
        unique_middle_column_result = []
        duplicates = []
        for index, ((f, whole), mid) in enumerate(zip(wholes, middle_column_result)):
            if '-join-'.join([f, whole, mid]) not in duplicates:
                duplicates.append('-join-'.join([f, whole, mid]))
                unique_wholes.append([f, whole])
                unique_middle_column_result.append(mid)
        # split into start, middle and end, dealing with multiple occurrences
        for index, ((f, whole), mid) in enumerate(zip(unique_wholes, unique_middle_column_result)):
            reg = re.compile(r'([^a-zA-Z0-9-]|^)(' + re.escape(mid) + r')([^a-zA-Z0-9-]|$)',
                             re.IGNORECASE | re.UNICODE)
            offsets = [(m.start(), m.end()) for m in re.finditer(reg, whole)]
            for offstart, offend in offsets:
                start = whole[0:offstart].strip()
                middle = whole[offstart:offend].strip()
                end = whole[offend:].strip()
                conc_lines.append([os.path.basename(f), speakr, start, middle, end])
        return conc_lines

    def uniquify(conc_lines):
        from collections import OrderedDict
        unique_lines = []
        checking = []
        for index, (f, speakr, start, middle, end) in enumerate(conc_lines):
            joined = ' '.join([speakr, start, 'MIDDLEHERE:', middle, ':MIDDLEHERE', end])
            if joined not in checking:
                unique_lines.append(conc_lines[index])
            checking.append(joined)
        return unique_lines

    def lemmatiser(list_of_words, tag):
        """take a list of unicode words and a tag and return a lemmatised list."""
        output = []
        for word in list_of_words:
            if translated_option.startswith('u'):
                if word.lower() in list(taglemma.keys()):
                    word = taglemma[word.lower()]
                else:
                    if word == 'x':
                        word = 'Other'
            # only use wordnet lemmatiser when appropriate
            else:
                if word in wordlist:
                    word = wordlist[word]
                word = lmtzr.lemmatize(word, tag)
            output.append(word)
        return output

    def gettag(query, lemmatag=False):
        """Find tag for WordNet lemmatisation"""
        import re
        tagdict = {'N': 'n', 'A': 'a', 'V': 'v', 'R': 'r',
                   'None': False, '': False, 'Off': False}
        if lemmatag is False:
            tag = 'n'  # same default as wordnet
            # attempt to find tag from tregex query
            tagfinder = re.compile(r'^[^A-Za-z]*([A-Za-z]*)')
            tagchecker = re.compile(r'^[A-Z]{1,4}$')
            qr = query.replace(r'\w', '').replace(r'\s', '').replace(r'\b', '')
            treebank_tag = re.findall(tagfinder, qr)
            if re.match(tagchecker, treebank_tag[0]):
                tag = tagdict.get(treebank_tag[0], 'n')
        elif lemmatag:
            tag = lemmatag
        return tag

    def format_tregex(results, whole=False):
        """format tregex by show list"""
        if countmode:
            return results
        import re
        done = []
        if whole:
            fnames = [x for x, y in results]
            results = [y for x, y in results]
        if 'l' in show or 'pl' in show:
            lemmata = lemmatiser(results, gettag(search.get('t'), lemmatag))
        else:
            lemmata = [None for i in results]
        for word, lemma in zip(results, lemmata):
            bits = []
            if exclude and exclude.get('w'):
                if len(list(exclude.keys())) == 1 or excludemode == 'any':
                    if re.search(exclude.get('w'), word):
                        continue
            if exclude and exclude.get('l'):
                if len(list(exclude.keys())) == 1 or excludemode == 'any':
                    if re.search(exclude.get('l'), lemma):
                        continue
            if exclude and exclude.get('p'):
                if len(list(exclude.keys())) == 1 or excludemode == 'any':
                    if re.search(exclude.get('p'), word):
                        continue
            if exclude and exclude.get('pl'):
                if len(list(exclude.keys())) == 1 or excludemode == 'any':
                    if re.search(exclude.get('pl'), lemma):
                        continue
            if exclude and excludemode == 'all':
                num_to_cause_exclude = len(list(exclude.keys()))
                current_num = 0
                if exclude.get('w'):
                    if re.search(exclude.get('w'), word):
                        current_num += 1
                if exclude.get('l'):
                    if re.search(exclude.get('l'), lemma):
                        current_num += 1
                if exclude.get('p'):
                    if re.search(exclude.get('p'), word):
                        current_num += 1
                if exclude.get('pl'):
                    if re.search(exclude.get('pl'), lemma):
                        current_num += 1
                if current_num == num_to_cause_exclude:
                    continue
            for i in show:
                if i == 't':
                    bits.append(word)
                if i == 'l':
                    bits.append(lemma)
                elif i == 'w':
                    bits.append(word)
                elif i == 'p':
                    bits.append(word)
                elif i == 'pl':
                    bits.append(lemma)
            joined = '/'.join(bits)
            done.append(joined)
        if whole:
            done = zip(fnames, done)
        return done

    def tok_by_list(pattern, list_of_toks, concordancing=False, **kwargs):
        """search for a token list in tokenised corpora"""
        import re
        if isinstance(pattern, str):
            pattern = [pattern]
        if not case_sensitive:
            pattern = [p.lower() for p in pattern]
        if not concordancing:
            if case_sensitive:
                matches = [m for m in list_of_toks if m in pattern]
            else:
                matches = [m for m in list_of_toks if m.lower() in pattern]
        else:
            matches = []
            for index, token in enumerate(list_of_toks):
                if token in pattern:
                    match = [' '.join([t for t in unsplitter(list_of_toks[:index])])[-140:]]
                    match.append(token)
                    match.append(' '.join([t for t in unsplitter(list_of_toks[index + 1:])])[:140])
                    matches.append(match)
        if countmode:
            return len(matches)
        else:
            return matches

    def unsplitter(lst):
        """unsplit contractions and apostrophes from tokenised text"""
        if split_contractions:
            return lst
        unsplit = []
        for index, t in enumerate(lst):
            if index == 0 or index == len(lst) - 1:
                unsplit.append(t)
                continue
            if "'" in t and not t.endswith("'"):
                rejoined = ''.join([lst[index - 1], t])
                unsplit.append(rejoined)
            else:
                if "'" not in lst[index + 1]:
                    unsplit.append(t)
        return unsplit

    def tok_ngrams(pattern, list_of_toks, concordancing=False, split_contractions=True):
        from collections import Counter
        import re
        ngrams = Counter()
        result = []
        # if it's not a compiled regex
        list_of_toks = [x for x in list_of_toks if re.search(regex_nonword_filter, x)]
        if pattern.lower() == 'any':
            pattern = r'.*'
        if not split_contractions:
            list_of_toks = unsplitter(list_of_toks)
            #list_of_toks = [x for x in list_of_toks if "'" not in x]
        for index, w in enumerate(list_of_toks):
            try:
                the_gram = [list_of_toks[index + x] for x in range(gramsize)]
                if not any(re.search(pattern, x) for x in the_gram):
                    continue
                ngrams[' '.join(the_gram)] += 1
            except IndexError:
                pass
        # turn counter into list of results
        for k, v in list(ngrams.items()):
            if v > 1:
                for i in range(v):
                    result.append(k)
        if countmode:
            return len(result)
        else:
            return result

    def compiler(pattern):
        """compile regex or fail gracefully"""
        import re
        try:
            if case_sensitive:
                comped = re.compile(pattern)
            else:
                comped = re.compile(pattern, re.IGNORECASE)
            return comped
        except:
            import traceback
            import sys
            from time import localtime, strftime
            exc_type, exc_value, exc_traceback = sys.exc_info()
            lst = traceback.format_exception(exc_type, exc_value, exc_traceback)
            error_message = lst[-1]
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: Query %s' % (thetime, error_message))
            if root:
                return 'Bad query'
            else:
                raise ValueError('%s: Query %s' % (thetime, error_message))

    def tok_by_reg(pattern, list_of_toks, concordancing=False, **kwargs):
        """search for regex in tokenised corpora"""
        import re
        comped = compiler(pattern)
        if comped == 'Bad query':
            return 'Bad query'
        if not concordancing:
            matches = [m for m in list_of_toks if re.search(comped, m)]
        else:
            matches = []
            for index, token in enumerate(list_of_toks):
                if re.search(comped, token):
                    match = [' '.join([t for t in unsplitter(list_of_toks[:index])])[-140:]]
                    match.append(re.search(comped, token).group(0))
                    match.append(' '.join([t for t in unsplitter(list_of_toks[index + 1:])])[:140])
                    matches.append(match)
        if countmode:
            return len(matches)
        else:
            return matches

    def plaintext_regex_search(pattern, plaintext_data, concordancing=False, **kwargs):
        """search for regex in plaintext corpora

        it searches over lines, so the user needs to be careful."""
        import re
        if concordancing:
            pattern = r'(.{,140})\b(' + pattern + r')\b(.{,140})'
        compiled_pattern = compiler(pattern)
        if compiled_pattern == 'Bad query':
            return 'Bad query'
        matches = re.findall(compiled_pattern, plaintext_data)
        if concordancing:
            matches = [list(m) for m in matches]
        if not concordancing:
            for index, i in enumerate(matches):
                if isinstance(i, tuple):
                    matches[index] = i[0]
        if countmode:
            return len(matches)
        else:
            return matches

    def correct_spelling(a_string):
        if not spelling:
            return a_string
        from dictionaries.word_transforms import usa_convert
        if spelling.lower() == 'uk':
            usa_convert = {v: k for k, v in list(usa_convert.items())}
        bits = a_string.split('/')
        for index, i in enumerate(bits):
            converted = usa_convert.get(i.lower(), i)
            if i.islower() or preserve_case is False:
                converted = converted.lower()
            elif i.isupper() and preserve_case:
                converted = converted.upper()
            elif i.istitle() and preserve_case:
                converted = converted.title()
            bits[index] = converted
        r = '/'.join(bits)
        return r

    def plaintext_simple_search(pattern, plaintext_data, concordancing=False, **kwargs):
        """search for tokens in plaintext corpora"""
        import re
        result = []
        if isinstance(pattern, str):
            pattern = [pattern]
        for p in pattern:
            if concordancing:
                pat = r'(.{0,140})\b(' + re.escape(p) + r')\b(.{0,140})'
            else:
                pat = r'\b' + re.escape(p) + r'\b'
            pat = compiler(pat)
            if pat == 'Bad query':
                return 'Bad query'
            matches = re.findall(pat, plaintext_data)
            if concordancing:
                matches = [list(m) for m in matches]
                for i in matches:
                    result.append(i)
            else:
                for m in range(len(matches)):
                    result.append(p)
        return result

    # do multiprocessing if need be
    im, corpus, search, query, just_speakers = is_multiquery(corpus, search,
                                                             query, just_speakers)
    locs['search'] = search
    locs['query'] = query
    locs['just_speakers'] = just_speakers
    locs['corpus'] = corpus
    locs['multiprocess'] = multiprocess
    if im:
        signal.signal(signal.SIGINT, original_sigint)
        from multiprocess import pmultiquery
        return pmultiquery(**locs)

    datatype = corpus.datatype
    singlefile = corpus.singlefile

    # store all results in here
    results = {}
    count_results = {}
    conc_results = {}
    # check if just counting
    countmode = 'c' in show
    if countmode:
        no_conc = True
        only_conc = False
    # where we are at in interrogation
    current_iter = 0
    # multiprocessing progress bar
    denom = kwargs.get('denominator', 1)
    startnum = kwargs.get('startnum', 0)

    ############################################
    # Determine the search function to be used #
    ############################################

    # simple tregex is tregex over whole dirs
    simple_tregex_mode = False
    statsmode = False
    if not just_speakers and 't' in list(search.keys()):
        simple_tregex_mode = True
    else:
        if corpus.datatype == 'plaintext':
            if search.get('n'):
                raise NotImplementedError('Use a tokenised corpus for n-gramming.')
                #searcher = plaintext_ngram
                optiontext = 'n-grams via plaintext'
            if search.get('w'):
                if kwargs.get('regex', True):
                    searcher = plaintext_regex_search
                else:
                    searcher = plaintext_simple_search
                optiontext = 'Searching plaintext'
        elif corpus.datatype == 'tokens':
            if search.get('n'):
                searcher = tok_ngrams
                optiontext = 'n-grams via tokens'
            elif search.get('w'):
                if kwargs.get('regex', True):
                    searcher = tok_by_reg
                else:
                    searcher = tok_by_list
                if isinstance(search.get('w'), list):
                    searcher = tok_by_list
                optiontext = 'Searching tokens'
        only_parse = ['r', 'd', 'g', 'dl', 'gl', 'df', 'gf', 'dp', 'gp', 'f',
                      'd2', 'd2f', 'd2p', 'd2l']
        if corpus.datatype != 'parse' and any(i in only_parse for i in list(search.keys())):
            raise ValueError('Need parsed corpus to search with "%s" option(s).'
                             % ', '.join([i for i in list(search.keys()) if i in only_parse]))
        elif corpus.datatype == 'parse':
            if search.get('t'):
                searcher = slow_tregex
            elif search.get('s'):
                searcher = get_stats
                statsmode = True
                optiontext = 'General statistics'
                global numdone
                numdone = 0
                no_conc = True
                only_conc = False
                do_concordancing = False
            else:
                from depsearch import dep_searcher
                searcher = dep_searcher
                optiontext = 'Dependency querying'

    ############################################
    #      Set some Tregex-related values      #
    ############################################

    if search.get('t'):
        translated_option = 't'
        query = search.get('t')

        # check the query
        q = tregex_engine(corpus=False, query=search.get('t'),
                          options=['-t'], check_query=True, root=root)
        if query is False:
            if root:
                return 'Bad query'
            else:
                return

        optiontext = 'Searching parse trees'
        if 'p' in show or 'pl' in show:
            translated_option = 'u'
            if isinstance(search['t'], list):
                search['t'] = r'__ < (/%s/ !< __)' % as_regex(search['t'],
                              boundaries='line', case_sensitive=case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'__ < (/.?[A-Za-z0-9].?/ !< __)'
        elif 't' in show:
            translated_option = 'o'
            if isinstance(search['t'], list):
                search['t'] = r'__ < (/%s/ !< __)' % as_regex(search['t'],
                              boundaries='line', case_sensitive=case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'__ < (/.?[A-Za-z0-9].?/ !< __)'
        elif 'w' in show:
            translated_option = 't'
            if isinstance(search['t'], list):
                search['t'] = r'/%s/ !< __' % as_regex(search['t'],
                              boundaries='line', case_sensitive=case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'/.?[A-Za-z0-9].?/ !< __'
        elif 'c' in show:
            only_count = True
            translated_option = 'C'
            if isinstance(search['t'], list):
                search['t'] = r'/%s/ !< __' % as_regex(search['t'],
                              boundaries='line', case_sensitive=case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'/.?[A-Za-z0-9].?/ !< __'
        elif 'l' in show:
            translated_option = 't'
            if isinstance(search['t'], list):
                search['t'] = r'/%s/ !< __' % as_regex(search['t'],
                              boundaries='line', case_sensitive=case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'/.?[A-Za-z0-9].?/ !< __'
        query = search['t']

    ############################################
    # Make iterable for corpus/subcorpus/file  #
    ############################################

    if corpus.singlefile:
        to_iterate_over = {(corpus.name, corpus.path): [corpus]}
    elif not corpus.subcorpora:
        to_iterate_over = {(corpus.name, corpus.path): corpus.files}
    else:
        to_iterate_over = {}
        for subcorpus in corpus.subcorpora:
            to_iterate_over[(subcorpus.name, subcorpus.path)] = subcorpus.files
        #for k, v in sorted(corpus.structure.items(), key=lambda obj: obj[0].name):
        #    to_iterate_over[(k.name, k.path)] = v
    if files_as_subcorpora:
        to_iterate_over = {}
        for f in corpus.files:
            to_iterate_over[(f.name, f.path)] = [f]

    ############################################
    #           Print welcome message          #
    ############################################

    if no_conc:
        message = 'Interrogating'
    else:
        message = 'Interrogating and concordancing'
    if kwargs.get('printstatus', True):
        thetime = strftime("%H:%M:%S", localtime())
        sformat = '\n '.join(['%s: %s' % (k.rjust(3), v) for k, v in list(search.items())])
        if search == {'s': r'.*'}:
            sformat = 'features'
        welcome = '\n%s: %s %s ...\n %s\n Query: %s\n %s corpus ... \n' % \
                  (thetime, message, corpus.name, optiontext, sformat, message)
        print(welcome)

    ############################################
    #            Make progress bar             #
    ############################################

    if simple_tregex_mode:
        total_files = len(list(to_iterate_over.keys()))
    else:
        if search.get('s'):
            total_files = sum([len(x) for x in list(to_iterate_over.values())]) * 12
        else:
            total_files = sum([len(x) for x in list(to_iterate_over.values())])

    par_args = {'printstatus': kwargs.get('printstatus', True),
                'root': root,
                'note': note,
                'length': total_files,
                'startnum': kwargs.get('startnum'),
                'denom': kwargs.get('denominator', 1)}

    term = None
    if kwargs.get('paralleling', None) is not None:
        from blessings import Terminal
        term = Terminal()
        par_args['terminal'] = term
        par_args['linenum'] = kwargs.get('paralleling')

    outn = kwargs.get('outname', '')
    if outn:
        outn = outn + ': '
    tstr = '%s%d/%d' % (outn, current_iter, total_files)
    p = animator(None, None, init=True, tot_string=tstr, **par_args)
    tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
    animator(p, current_iter, tstr, **par_args)

    ############################################
    # Iterate over data, doing interrogations  #
    ############################################

    for (subcorpus_name, subcorpus_path), files in sorted(to_iterate_over.items()):
        conc_results[subcorpus_name] = []
        count_results[subcorpus_name] = []
        results[subcorpus_name] = Counter()

        # tregex over subcorpora, not files
        if simple_tregex_mode:
            op = ['-o', '-' + translated_option]
            result = tregex_engine(query=search['t'], options=op,
                                   corpus=subcorpus_path, root=root,
                                   preserve_case=preserve_case)
            if not countmode:
                result = format_tregex(result)
            if not no_conc:
                op += ['-w', '-f']
                whole_result = tregex_engine(query=search['t'], options=op,
                                             corpus=subcorpus_path, root=root,
                                             preserve_case=preserve_case)
                if not only_format_match:
                    whole_result = format_tregex(whole_result, whole=True)
                conc_result = make_conc_lines_from_whole_mid(whole_result, result,
                                                             speakr=False)
            if countmode:
                count_results[subcorpus_name] += [result]
            else:
                result = Counter(result)
                results[subcorpus_name] += result
                if not no_conc:
                    for lin in conc_result:
                        if numconc < maxconc or not maxconc:
                            conc_results[subcorpus_name].append(lin)
                            numconc += 1
            current_iter += 1
            tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
            animator(p, current_iter, tstr, **par_args)

        # dependencies, plaintext, tokens or slow_tregex
        else:
            for f in files:
                slow_treg_speaker_guess = kwargs.get('outname', False)
                if corpus.datatype == 'parse':
                    with open(f.path, 'r') as data:
                        data = data.read()
                    from corenlp_xml.document import Document
                    try:
                        corenlp_xml = Document(data)
                    except:
                        print('Could not read file: %s' % f.path)
                        continue
                    if just_speakers:
                        sents = [s for s in corenlp_xml.sentences
                                 if s.speakername in just_speakers]
                        if len(just_speakers) == 1:
                            slow_treg_speaker_guess = just_speakers[0]
                        if not sents:
                            continue
                    else:
                        sents = corenlp_xml.sentences
                    res, conc_res = searcher(sents, search=search, show=show,
                                             dep_type=dep_type,
                                             exclude=exclude,
                                             excludemode=excludemode,
                                             searchmode=searchmode,
                                             lemmatise=False,
                                             case_sensitive=case_sensitive,
                                             do_concordancing=do_concordancing,
                                             only_format_match=only_format_match,
                                             speaker=slow_treg_speaker_guess)
                    if res == 'Bad query':
                        return 'Bad query'
                elif corpus.datatype == 'tokens':
                    import pickle
                    with codecs.open(f.path, "rb") as fo:
                        data = pickle.load(fo)
                    if not only_conc:
                        res = searcher(list(search.values())[0], data,
                                       split_contractions=split_contractions,
                                       concordancing=False)
                    if not no_conc:
                        conc_res = searcher(list(search.values())[0], data,
                                            split_contractions=split_contractions,
                                            concordancing=True)
                        for index, line in enumerate(conc_res):
                            line.insert(0, '')
                elif corpus.datatype == 'plaintext':
                    with codecs.open(f.path, 'rb', encoding='utf-8') as data:
                        data = data.read()
                    if not only_conc:
                        res = searcher(list(search.values())[0], data,
                                       concordancing=False)
                    if not no_conc:
                        conc_res = searcher(list(search.values())[0], data,
                                            concordancing=True)
                        for index, line in enumerate(conc_res):
                            line.insert(0, '')

                if countmode:
                    count_results[subcorpus_name] += [res]
                else:
                    # add filename and do lowercasing for conc
                    if not no_conc:
                        for index, line in enumerate(conc_res):
                            if searcher != slow_tregex:
                                line.insert(0, f.name)
                            else:
                                line[0] = f.name
                            if not preserve_case:
                                line[3:] = [x.lower() for x in line[3:]]
                            if spelling:
                                line = [correct_spelling(b) for b in line]
                            if numconc < maxconc or not maxconc:
                                conc_results[subcorpus_name].append(line)
                                numconc += 1
                    # do lowercasing and spelling
                    if not only_conc:
                        if not preserve_case:
                            if not statsmode:
                                res = [i.lower() for i in res]
                        if spelling:
                            if not statsmode:
                                res = [correct_spelling(r) for r in res]
                        #if not statsmode:
                        results[subcorpus_name] += Counter(res)
                        #else:
                        #results[subcorpus_name] += res

                if not statsmode:
                    current_iter += 1
                    tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
                    animator(p, current_iter, tstr, **par_args)

    # delete temp file if there
    import os
    if os.path.isfile('tmp.txt'):
        os.remove('tmp.txt')

    ############################################
    #     Get concordances into DataFrame      #
    ############################################

    if not no_conc:
        all_conc_lines = []
        for sc_name, resu in sorted(conc_results.items()):
            if only_unique:
                unique_results = uniquify(resu)
            else:
                unique_results = resu
            # make into series
            pindex = 'c f s l m r'.encode('utf-8').split()
            for fname, spkr, start, word, end in unique_results:
                #spkr = str(spkr, errors = 'ignore')
                fname = os.path.basename(fname)
                all_conc_lines.append(Series([sc_name, fname, spkr, start, word, end],
                                             index=pindex))

        # randomise results...
        if random:
            from random import shuffle
            shuffle(all_conc_lines)

        conc_df = pd.concat(all_conc_lines, axis=1).T

        # not doing anything yet --- this is for multimodal concordancing
        add_links = False
        if not add_links:
            conc_df.columns = ['c', 'f', 's', 'l', 'm', 'r']
        else:
            conc_df.columns = ['c', 'f', 's', 'l', 'm', 'r', 'link']

        if all(x == '' for x in list(conc_df['s'].values)):
            conc_df.drop('s', axis=1, inplace=True)

        #if kwargs.get('note'):
        #    kwargs['note'].progvar.set(100)
        #if kwargs.get('printstatus', True):
        #    thetime = strftime("%H:%M:%S", localtime())
        #    finalstring = '\n\n%s: Concordancing finished! %d matches.\n' % (thetime, len(conc_df.index))
        #    print(finalstring)

        from interrogation import Concordance
        output = Concordance(conc_df)
        if only_conc:
            output.query = locs
            if quicksave:
                output.save()
            if kwargs.get('printstatus', True):
                thetime = strftime("%H:%M:%S", localtime())
                finalstring = '\n\n%s: Concordancing finished! %d results.' % (thetime, len(conc_df))
                print(finalstring)
            return output

    ############################################
    #     Get interrogation into DataFrame     #
    ############################################

    if not only_conc:
        if countmode:
            df = Series({k: sum(v) for k, v in sorted(count_results.items())})
            tot = df.sum()
        else:
            the_big_dict = {}
            unique_results = set([item for sublist in list(results.values())
                                  for item in sublist])
            for word in unique_results:
                the_big_dict[word] = [subcorp_result[word] for name, subcorp_result
                                      in sorted(results.items(), key=lambda x: x[0])]
            # turn master dict into dataframe, sorted
            df = DataFrame(the_big_dict, index=sorted(results.keys()))
            numentries = len(df.columns)
            tot = df.sum(axis=1)
            total_total = df.sum().sum()

        ############################################
        # Format, output as Interrogation object   #
        ############################################

        if not countmode:
            if not corpus.subcorpora or singlefile:
                if not files_as_subcorpora:
                    if not kwargs.get('df1_always_df'):
                        df = Series(df.iloc[0])
                        df.sort_values(ascending=False, inplace=True)
                        tot = df.sum()
                        numentries = len(df.index)
                        total_total = tot

        # sort by total
        if isinstance(df, pd.core.frame.DataFrame):
            if not df.empty:
                df.loc['Total-tmp'] = df.sum()
                the_tot = df.loc['Total-tmp']
                df = df[the_tot.argsort()[::-1]]
                df = df.drop('Total-tmp', axis=0)

        # format final string
        if kwargs.get('printstatus', True):
            thetime = strftime("%H:%M:%S", localtime())
            finalstring = '\n\n%s: Interrogation finished!' % thetime
            if countmode:
                finalstring += ' %d matches.' % tot
            else:
                finalstring += ' %d unique results, %d total occurrences.' % (numentries, total_total)
            print(finalstring)

        if not no_conc:
            interro = Interrogation(results=df, totals=tot, query=locs,
                                    concordance=output)
        else:
            interro = Interrogation(results=df, totals=tot, query=locs)

        if quicksave:
            interro.save()

        return interro
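# A rough usage sketch for interrogator(), based only on the signature and body
# above. It assumes a parsed corpus directory exists; the path and the tregex
# query are invented for illustration.
result = interrogator('data/example-parsed',   # a str is converted to a Corpus
                      {'t': r'/NN.?/ < __'},   # search parse trees for nouns
                      show='w')
result.results.head()    # per-subcorpus counts, as built in the final section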
obj2 = obj.reindex(['a','b','c','d','e'])
obj2
obj.reindex(['a','b','c','d','e'], fill_value=0)

obj3 = Series(['blue','purple','yellow'], index=[0,2,4])
obj3.reindex(range(6), method='ffill')

frame = DataFrame(np.arange(9).reshape((3,3)), index=['a','c','d'],
                  columns=['Ohio','Texas','California'])
frame2 = frame.reindex(['a','b','c','d'])
states = ['Texas','Utah','California']
frame.reindex(columns=states)
frame2 = frame.reindex(['a','b','c','d'], method='ffill', columns=states)
frame.reindex(index=['a','b','c','d'], columns=states)   # .ix is deprecated; reindex gives the old label-or-NaN lookup

obj = Series(np.arange(5.), index=['a','b','c','d','e'])
new_obj = obj.drop('c')
new_obj
obj.drop(['d','c'])

data = DataFrame(np.arange(16).reshape((4,4)),
                 index=['Ohio','Colorado','Utah','New York'],
                 columns=['one','two','three','four'])
data.drop(['Colorado','Ohio'])
data.drop('two', axis=1)
data.drop(['two','four'], axis=1)

obj = Series(np.arange(4.), index=['a','b','c','d'])
obj['b']
obj[1]
obj
obj[2:4]
obj[['b','a','d']]
obj[[1,3]]
obj[obj < 2]
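# By default drop returns a new object and leaves the original untouched;
# pass inplace=True to mutate instead. A brief sketch reusing the data frame
# defined just above:
data2 = data.copy()
data2.drop('Ohio', inplace=True)   # returns None; data2 itself is modified
data2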
print('#========== membership tests ===========#')
print('Ohio' in frame3.columns)   # frame3 comes from an earlier example

print('#------ reindex -----#')
obj = Series([4.5,7.2,-5.3,3.6], index=['a','d','b','c'])
print(obj)
obj2 = obj.reindex(['a','b','c','d','e','f'], fill_value=0)
print(obj2)

print('#------- dropping values from a given axis ------------#')
obj = Series(np.arange(5.), index=['a','b','c','d','e'])
print(obj)
new_obj = obj.drop('a')
print(new_obj)
data = DataFrame(np.arange(16.).reshape((4,4)),
                 index=['Ohio','Colorado','Utah','New York'],
                 columns=['one','two','three','four'])
print(data)
print(data.drop('Ohio'))          # drop a row
print(data.drop('one', axis=1))   # drop a column; axis=0 means rows, 1 means columns

print('#-------- indexing, selection and filtering ---------#')
obj = Series(np.arange(4.), index=['a','b','c','d'])
print(obj)
obj2 = obj.reindex(['a','b','c','e'])   # a missing index value becomes NaN by default

# set a fill value instead
obj2 = obj.reindex(['a','b','c','d','e'], fill_value=0)

# forward fill: each hole takes the previous value
obj3 = Series(['blue','purple','yellow'], index=[0,2,4])
obj3.reindex(range(6), method='ffill')

# reindex works on rows by default, but columns can be reindexed too
frame = DataFrame(np.arange(9).reshape((3,3)), index=['a','c','d'],
                  columns=['Ohio','Texas','California'])
states = ['Texas','Utah','California']
frame2 = frame.reindex(columns=states)

## dropping entries from an axis
obj = Series(np.arange(5.), index=['a','b','c','d','e'])
obj.drop('c')
obj.drop(['d','c'])

# dropping columns
data = DataFrame(np.arange(16).reshape(4,4),
                 index=['Ohio','Colorado','Utah','New York'],
                 columns=['one','two','three','four'])
data.drop(['one','four'], axis=1)

## indexing, selection and filtering
data.loc['Colorado', ['two','three']]   # .ix is deprecated; label lookup uses .loc
data.iloc[1, [3,0,1]]                   # positional lookup uses .iloc ('Colorado' is row 1)
data.loc['Colorado']
data['two']
data.two

## arithmetic and data alignment
s1 + s2

s1 = Series([1.0,2,3])
s1.values
s1.index
s1.index = ['cat','dog','elephant']
s1.index

s1 = Series(arange(10.0,20.0))
s1.describe()
summ = s1.describe()
summ['mean']

s1 = Series(arange(1.0,6),index=['a','a','b','c','d'])
s1
s1.drop('a')

s1 = Series(arange(1.0,4.0),index=['a','b','c'])
s2 = Series(arange(1.0,4.0),index=['c','d','e'])
s3 = s1 + s2
s3
s3.dropna()

s1 = Series(arange(1.0,4.0),index=['a','b','c'])
s2 = Series(arange(1.0,4.0),index=['c','d','e'])
s3 = s1 + s2
s3.fillna(-1.0)

df = DataFrame(array([[1,2],[3,4]]))
df
# create an index
index = pd.Index(np.arange(0,200))
s = Series(data, index=index)   # 'data' comes from earlier in the original script
# print s

# reindex
s2 = s.reindex(np.arange(100,300))
# print s2

# choose how values missing from the old index are filled
s3 = s.reindex(np.arange(180,220), method='ffill')
# print s3

# drop the specified index entries
s4 = s.drop(np.arange(0,20))
# print s4

dic = {
    'Nevada': {2001: 2.4, 2002: 4.2, 2003: 1.2},
    'Ohio':   {2000: 1.1, 2002: 4.4, 2001: 5.2}
}
f = DataFrame(dic)
# print f

# column names
cols = f.columns
# row names
index = f.index
# print cols
# print index
def test_cat_accessor_updates_on_inplace(self):
    s = Series(list('abc')).astype('category')
    s.drop(0, inplace=True)
    s.cat.remove_unused_categories(inplace=True)
    assert len(s.cat.categories) == 2
# Lecture 18: Drop Entry
import numpy as np
from pandas import Series, DataFrame
import pandas as pd

ser1 = Series(np.arange(3), index=['a','b','c'])
ser1
ser1.drop('b')

# data frame
dframe1 = DataFrame(np.arange(9).reshape(3,3),
                    index=['SF','LA','NY'],
                    columns=['pop','size','year'])
dframe1

# drop a row
dframe1.drop('LA')

# drop a column
dframe1.drop('year', axis=1)   # axis = 0 is rows (default)
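# drop raises a KeyError for unknown labels unless errors='ignore' is passed.
# A minimal sketch using dframe1 from above; the 'Boston' label is deliberately
# absent from its index:
dframe1.drop('Boston', errors='ignore')          # returned unchanged, no error
dframe1.drop(['LA', 'Boston'], errors='ignore')  # known labels are still dropped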
# encoding: UTF-8
import numpy as np
from pandas import Series

r = Series([5,10,20,25,30,25,21,23,45,62])
s = [30,40,50,60]
h = r.drop([len(r)-1])   # drop the last label (9)
print(h)