def test_sum_overflow(self, use_bottleneck): with pd.option_context("use_bottleneck", use_bottleneck): # GH#6915 # overflowing on the smaller int dtypes for dtype in ["int32", "int64"]: v = np.arange(5000000, dtype=dtype) s = Series(v) result = s.sum(skipna=False) assert int(result) == v.sum(dtype="int64") result = s.min(skipna=False) assert int(result) == 0 result = s.max(skipna=False) assert int(result) == v[-1] for dtype in ["float32", "float64"]: v = np.arange(5000000, dtype=dtype) s = Series(v) result = s.sum(skipna=False) assert result == v.sum(dtype=dtype) result = s.min(skipna=False) assert np.allclose(float(result), 0.0) result = s.max(skipna=False) assert np.allclose(float(result), v[-1])
def combine_subjects_lr(df):
    weights = Series(0.0, index=df.columns)
    df = df.loc[:, ~df.isnull().all()]
    df = df.fillna(df.mean())
    lr = LinearRegression(fit_intercept=False)
    for col in df.columns:
        use_df = df.drop([col], axis=1)
        _ = lr.fit(use_df, df[[col]])
        corr = Series(lr.predict(use_df).ravel(), index=df.index).corr(df[col])
        corr = 0.0 if isnan(corr) else corr
        coefs = Series(lr.coef_.ravel(), index=use_df.columns).reindex(weights.index).fillna(0.0)
        weights += (corr * coefs)
    weights = weights.where(weights.ge(0.0), 0.0).div(float(df.shape[1] - 1))
    # weights = weights.where(weights.gt(0.1) | weights.eq(0.0), 0.1)
    print('---------------')
    print(df.index.get_level_values('grade').values[0])
    print(weights / weights.sum())
    # other_weights = df.corr().where(df.corr().ne(1.0)).mean()
    # print(other_weights / other_weights.sum())
    output = df.reindex(columns=weights.index).multiply(weights).sum(
        axis=1).div(weights.sum())
    return output
def _get_binning_threshold(self, df: DataFrame, y: Series) -> Dict:
    """
    Compute binning thresholds for every feature.
    :param df: data for all features
    :param y: label data
    :return: dict mapping each feature to its bin intervals
    """
    params = {
        "criterion": self.criterion,
        "max_depth": self.max_depth,
        "min_samples_split": self.min_samples_split,
        "min_samples_leaf": max(int(np.ceil(y.size * self.min_samples_leaf)), 50),
        "max_leaf_nodes": self.max_leaf_nodes,
        "random_state": self.random_state
    }
    self.B_G_rate = y.sum() / (y.size - y.sum())  # bad/good ratio
    for col in df.columns:
        feat_type = self.features_info.get(col)
        nan_value = self.features_nan_value.get(col)
        bins, flag = self._bin_threshold(df[col], y, is_num=feat_type,
                                         nan_value=nan_value, **params)
        self.features_bins[col] = {'bins': bins, 'flag': flag}
def calc_square_model_params(x: pd.Series, y: pd.Series): print('[quadratic regression]') # calculate the matrix of linear equations sx = x.sum() sx2 = (x**2).sum() sx3 = (x**3).sum() sx4 = (x**4).sum() sy = y.sum() sxy = (x * y).sum() sx2y = ((x**2) * y).sum() A = np.array([[len(x), sx, sx2], [sx, sx2, sx3], [sx2, sx3, sx4]]) b = np.array([sy, sxy, sx2y]) print('solve following linear equations to get w0, w1 and w2...') print('\tn\t* w0 + S(x)\t* w1 + S(x^2)\t* w2 = S(y)') print('\tS(x)\t* w0 + S(x^2)\t* w1 + S(x^3)\t* w2 = S(xy)') print('\tS(x^2)\t* w0 + S(x^3)\t* w1 + S(x^4)\t* w2 = S(x^2*y)') print('substitute values...') print('\t%f\t* w0 + %f\t* w1 + %f\t* w2 = %f' % (A[0][0], A[0][1], A[0][2], b[0])) print('\t%f\t* w0 + %f\t* w1 + %f\t* w2 = %f' % (A[1][0], A[1][1], A[1][2], b[1])) print('\t%f\t* w0 + %f\t* w1 + %f\t* w2 = %f' % (A[2][0], A[2][1], A[2][2], b[2])) # solve equations w = np.linalg.solve(A, b) print('solution: w0 = %f, w1 = %f, w2 = %f\n' % (w[0], w[1], w[2])) return w[0], w[1], w[2]
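# A minimal sanity check of the normal-equation fit above, assuming calc_square_model_params
# is in scope; on noiseless quadratic data it should agree with numpy.polyfit (which returns
# the highest-degree coefficient first).
import numpy as np
import pandas as pd

x = pd.Series(np.linspace(-5, 5, 50))
y = 2.0 + 3.0 * x + 0.5 * x ** 2

w0, w1, w2 = calc_square_model_params(x, y)        # solves the 3x3 normal equations
w2_ref, w1_ref, w0_ref = np.polyfit(x, y, deg=2)   # reference fit
assert np.allclose([w0, w1, w2], [w0_ref, w1_ref, w2_ref])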
def _linear_regression_np(x: Series, y: Series) -> dict:
    """Simple Linear Regression in Numpy for two 1d arrays for environments without the sklearn package."""
    result = {"a": npNaN, "b": npNaN, "r": npNaN, "t": npNaN, "line": npNaN}
    x_sum = x.sum()
    y_sum = y.sum()

    if int(x_sum) != 0:
        # 1st row, 2nd col value is corr(x, y)
        r = npCorrcoef(x, y)[0, 1]

        m = x.size
        r_mix = m * (x * y).sum() - x_sum * y_sum
        b = r_mix / (m * (x * x).sum() - x_sum * x_sum)  # true division; floor division would truncate the slope
        a = y.mean() - b * x.mean()
        line = a + b * x

        _np_err = seterr()
        seterr(divide="ignore", invalid="ignore")
        result = {
            "a": a, "b": b, "r": r,
            "t": r / npSqrt((1 - r * r) / (m - 2)),
            "line": line,
        }
        seterr(divide=_np_err["divide"], invalid=_np_err["invalid"])

    return result
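# Self-contained sketch of the same closed-form fit written with plain numpy names
# (the snippet above relies on pandas_ta aliases such as npCorrcoef and npSqrt),
# cross-checked against numpy.polyfit.
import numpy as np
import pandas as pd

x = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
y = pd.Series([2.1, 3.9, 6.2, 8.1, 9.8])

m = x.size
b = (m * (x * y).sum() - x.sum() * y.sum()) / (m * (x * x).sum() - x.sum() ** 2)  # slope
a = y.mean() - b * x.mean()                                                       # intercept
r = np.corrcoef(x, y)[0, 1]                                                       # Pearson r
t = r / np.sqrt((1 - r * r) / (m - 2))                                            # t-statistic of r

b_ref, a_ref = np.polyfit(x, y, deg=1)   # numpy returns slope first, then intercept
assert np.allclose([a, b], [a_ref, b_ref])
print({"a": a, "b": b, "r": r, "t": t})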
def update(self, feature_x: pd.Series, feature_y: pd.Series) -> None: """Updates partial cross feature statistics.""" self.sum_x += feature_x.sum() self.sum_y += feature_y.sum() self.sum_square_x += (feature_x**2).sum() self.sum_square_y += (feature_y**2).sum() self.sum_xy += (feature_x * feature_y).sum() self.count += len(feature_x)
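# The accumulated sums above are sufficient statistics: the Pearson correlation of the two
# features can be recovered from them without keeping the raw data. A standalone sketch
# (the function name is illustrative, not part of the original class):
import numpy as np

def correlation_from_stats(sum_x, sum_y, sum_square_x, sum_square_y, sum_xy, count):
    """Pearson r from accumulated sums: cov(x, y) / (std(x) * std(y))."""
    cov = sum_xy / count - (sum_x / count) * (sum_y / count)
    var_x = sum_square_x / count - (sum_x / count) ** 2
    var_y = sum_square_y / count - (sum_y / count) ** 2
    return cov / np.sqrt(var_x * var_y)

x = np.array([1.0, 2.0, 3.0, 4.0])
y = np.array([2.0, 4.1, 5.9, 8.2])
r = correlation_from_stats(x.sum(), y.sum(), (x ** 2).sum(), (y ** 2).sum(),
                           (x * y).sum(), len(x))
assert np.isclose(r, np.corrcoef(x, y)[0, 1])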
def chi(q: pd.Series, p: pd.Series): # Eqn (4) in # https://www.cse.huji.ac.il/~werman/Papers/ECCV2010.pdf p = p / p.sum() q = q / q.sum() x = np.sqrt(0.5 * (((p - q)**2) / (p + q)).sum()) assert (0 <= x <= 1) return x
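# A quick usage sketch, assuming the chi() function above is in scope: the distance is 0 for
# identical histograms and stays strictly between 0 and 1 for partially overlapping ones.
import pandas as pd

p = pd.Series([4, 3, 2, 1], index=list("abcd"))
q = pd.Series([1, 2, 3, 4], index=list("abcd"))

print(chi(p, p))  # 0.0 for identical histograms
print(chi(q, p))  # symmetric; about 0.447 for these two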
def _exit_transaction( df: pd.DataFrame, trade: pd.Series, exit_condition: Callable[[pd.DataFrame, pd.Series, pd.Timestamp], bool], ) -> Transaction: for index in df.index: if exit_condition(df, trade, index): return Transaction(timestamp=index, amount=-trade.sum()) return Transaction(timestamp=df.index[-1], amount=-trade.sum())
def bootstrap_series(ser: pd.Series) -> pd.Series: """ Creates a bootstrapped time series of same length and number of observations as original time series :param ser: (pd.Series): time series (not necessarily stationary, but observations are assumed to be weakly correlated.) """ resampled = np.random.multinomial(ser.sum(), ser / ser.sum()) return pd.Series(resampled, index=ser.index)
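# Usage sketch, assuming bootstrap_series above is in scope: the resample keeps the total
# observation count and the index, redistributing counts multinomially according to the
# original proportions.
import numpy as np
import pandas as pd

np.random.seed(0)
counts = pd.Series([30, 10, 50, 10], index=["a", "b", "c", "d"])
resampled = bootstrap_series(counts)

assert resampled.sum() == counts.sum()        # total number of observations preserved
assert resampled.index.equals(counts.index)   # same support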
def get_precision_recall( data: pd.Series, ties: Optional[np.ndarray] = None) -> Tuple[pd.Series, pd.Series]: r = np.arange(1, data.shape[0] + 1) c = data.cumsum() if ties is not None: return fix_tied(ties, c / r), fix_tied(ties, c / data.sum()) return (c / r), (c / data.sum())
def get_fi_gain(model_name, reg, X_train): if model_name == 'ours': fi_gain = Series(reg.compute_feature_importance(method='gain')) elif model_name == 'sklearn': fi_gain = Series(reg.feature_importances_, index=X_train.columns) elif model_name == 'xgboost': fi_gain = Series(reg.get_score(importance_type='gain')) else: # model_name == 'catboost' fi_gain = Series(reg.feature_importances_, index=reg.feature_names_) if fi_gain.sum() != 0: fi_gain /= fi_gain.sum() return fi_gain
def compute_weighted_avg(series): """ method for computing weighted average by duration :return: feature value """ if len(series) == 1: return series.mean() values = series[:-1].values weights = Series(map(lambda x: float(x) / 10e8, series.index.values[1:] - series.index.values[:-1])) weights = weights.values avg = values * weights return avg.sum() / weights.sum() if weights.sum() > 0 else np.nan
def main(args): generators, loads, _, times, _, data = parsedir() generators = [gen for gen in generators if gen.is_controllable] gen_data = data["generators"] if args["min"] == 0: args["min"] = 1.1 * gen_data.pmin.sum() if args["max"] == 0: args["max"] = 0.99 * gen_data.pmax.sum() load_values = np.arange(args["min"], args["max"], args["interval"]) results = DataFrame(columns=["prices", "committed", "last_committed"], index=load_values) committed_gen_names = Index([]) for load_val in load_values: print(load_val) loads_times = make_loads_times(Pd=load_val) power_system, times = solve_problem(generators, do_reset_config=False, **loads_times) t = times[0] results.loc[load_val, "prices"] = power_system.buses[0].price(t) statuses = Series( dict([(gen.name, gen.status(t).value) for gen in power_system.generators()])) results.loc[load_val, "committed"] = statuses.sum() results.loc[load_val, "last_committed"] = statuses[ statuses == 1].index.difference(committed_gen_names) committed_gen_names = statuses[statuses == 1].index if (load_values[-1] == 0.99 * gen_data.pmax.sum()) and (statuses.sum() != len(generators)): print("warning: uncommitted generation:") print((gen_data.set_index("name").loc[statuses[statuses == 0].index])) results.to_csv(joindir(user_config.directory, "ed_sweep.csv")) if args["hide_units_committed"]: ax = results.prices.plot(drawstyle="steps") else: ax = results[["prices", "committed"]].plot(drawstyle="steps", secondary_y=["committed"]) ax.right_ax.set_ylabel("Units committed") ax.set_xlabel("System Load (MW)") ax.set_ylabel("Estimated System Price ($/MWh)") plt.savefig(joindir(user_config.directory, "ed_sweep.png"))
def batch_buy(self, codedf: pd.Series, datetime: str, totalamount: float = 1000000,
              model: str = 'avg_money'):
    """
    Batch order/rebalancing interface.

    codedf: pd.Series
        Series.index -> code
        Series.value -> price
    totalamount: total cash amount to buy with
    model
        'avg_money': buy an equal market value of each code
        'avg_amount': buy an equal number of shares of each code
                      (total spend == totalamount)
    """
    if model == 'avg_money':
        moneyper = totalamount / len(codedf)
        amount = (moneyper / codedf).apply(
            lambda x: (int(100 / x) * 100) if int(100 / x) > 0 else 100)
    elif model == 'avg_amount':
        amountx = int(totalamount / (100 * codedf.sum()))
        if amountx == 0:
            return False
        else:
            amount = codedf.apply(lambda x: amountx * 100)
    orderres = pd.concat([codedf, amount], axis=1)
    orderres.columns = ['price', 'amount']
    # apply row-wise so each order gets its own code (the row label), price and amount
    res = orderres.assign(datetime=datetime).apply(
        lambda x: self.send_order(code=x.name, amount=x.amount, price=x.price,
                                  towards=1, datetime=x.datetime),
        axis=1)
    return res
def test_sum_with_level(): obj = Series([10.0], index=MultiIndex.from_tuples([(2, 3)])) with tm.assert_produces_warning(FutureWarning): result = obj.sum(level=0) expected = Series([10.0], index=[2]) tm.assert_series_equal(result, expected)
def tags_distance(distribution: pd.Series, other: pd.Series, tags: pd.Index, p=1): """Compute the Optimal Transport Distance between histograms (see https://arxiv.org/pdf/1803.00567.pdf p.30-33) """ if p < 1: raise ValueError('p must be greater or equal that 1') if p != 1: raise NotImplementedError('Only wasserstein 1 is currently supported') # Make the tag distributions have the same support distrib = distribution.reindex(index=tags, fill_value=0) other = other.reindex(index=tags, fill_value=0) # Sort by tag (in the lexicographic order) and normalize the distributions # This is important because in the distance we implicitly associate a tag to # a point in N. distrib = distrib.sort_index() distrib = distrib / distrib.sum() other = other.sort_index() other = other / other.sum() # print(distrib, other, sep='\n') return wasserstein_1d(distrib.to_numpy(), other.to_numpy())
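# The wasserstein_1d helper is not shown in this snippet. For normalized histograms supported
# on consecutive integer points (which is how the sorted tags are implicitly embedded in N),
# the 1-Wasserstein distance reduces to the L1 distance between the cumulative distributions;
# a minimal sketch of a compatible helper:
import numpy as np

def wasserstein_1d_sketch(p: np.ndarray, q: np.ndarray) -> float:
    """W1 between two normalized histograms on the support {0, 1, ..., n-1}."""
    return float(np.abs(np.cumsum(p) - np.cumsum(q)).sum())

p = np.array([0.5, 0.5, 0.0])
q = np.array([0.0, 0.5, 0.5])
print(wasserstein_1d_sketch(p, q))  # 1.0 -- each half unit of mass moves one step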
def chi(self, customattribute):
    """Compute the chi-square statistic for the given attribute."""
    attributeDict = dict()
    classAttributeDict = dict()
    for piece in self.chunks:
        for (attribute, classAttribute), arrays in piece.groupby(
                [customattribute, self.classAttribute]).studentID.unique().items():
            attributeDict.setdefault((attribute, classAttribute), np.array([]))
            attributeDict[(attribute, classAttribute)] = np.union1d(
                attributeDict[(attribute, classAttribute)], arrays)
        for classAttribute, arrays in piece.groupby(
                self.classAttribute).studentID.unique().items():
            classAttributeDict.setdefault(classAttribute, np.array([]))
            classAttributeDict[classAttribute] = np.union1d(
                classAttributeDict[classAttribute], arrays)
    # share of each class (graduation destination) in the overall population
    classSeries = Series(classAttributeDict).apply(lambda x: len(x))
    classSeries /= classSeries.sum()
    # observed counts for each attribute value
    attributeObs = Series(attributeDict).apply(lambda x: len(x)).unstack(fill_value=0)
    attributeExp = DataFrame(index=attributeObs.index, columns=attributeObs.columns)
    # initialise each row of the expected table with that row's observed total
    for index in attributeExp.index:
        attributeExp.loc[index] = attributeObs.loc[index].sum()
    # expected counts follow the class proportions
    attributeExp = attributeExp.mul(classSeries).fillna(0)
    # chi-square of observed vs expected counts; returns the statistic and p-value
    return chisquare(attributeObs.stack(), attributeExp.stack()), attributeObs
def create_report_new(type_cleaned: str, stats: pd.Series, errors: str) -> None: """ Describe what was done in the cleaning process The stats series contains the following codes in its index 0 := the number of null values 1 := the number of values that could not be parsed 2 := the number of values that were transformed during cleaning 3 := the number of values that were already in the correct format """ print(f"{type_cleaned} Cleaning Report:") nrows = stats.sum() nclnd = stats.loc[2] if 2 in stats.index else 0 if nclnd > 0: pclnd = round(nclnd / nrows * 100, 2) print(f"\t{nclnd} values cleaned ({pclnd}%)") nunknown = stats.loc[1] if 1 in stats.index else 0 if nunknown > 0: punknown = round(nunknown / nrows * 100, 2) expl = "set to NaN" if errors == "coerce" else "left unchanged" print(f"\t{nunknown} values unable to be parsed ({punknown}%), {expl}") nnull = stats.loc[0] if 0 in stats.index else 0 if errors == "coerce": nnull += stats.loc[1] if 1 in stats.index else 0 pnull = round(nnull / nrows * 100, 2) ncorrect = nclnd + (stats.loc[3] if 3 in stats.index else 0) pcorrect = round(ncorrect / nrows * 100, 2) print( f"Result contains {ncorrect} ({pcorrect}%) values in the correct format " f"and {nnull} null values ({pnull}%)")
class Player: def __init__(self, first_name, last_name, id): self.first_name = first_name self.last_name = last_name self.id = id self.hrs = [0, 0, 0, 0, 0, 0] #One for each month of the game self.hr_total = 0 self.hr_series = Series() self.hr_total_series = Series() def __str__(self): return str.format('{0} : {1}', self.id, self.last_name) def __repr__(self): return self.__str__() def add_hrs(self, count, date): self.hr_total += count self.hr_total_series[date] = self.hr_series.sum() + count if (self.hr_series.last_valid_index() == date): self.hr_series[date] = count + self.hr_series[date] else: self.hr_series[date] = count def name(self): return self.first_name + " " + self.last_name def get_player_hr_dataframe(self): return self.hr_series.to_frame(self.name()) def get_player_hr_total_dataframe(self): return self.hr_total_series.to_frame(self.name())
def calculate_pfe(trades: List[Trade], ca: CollateralAgreement): ac_bucketing = {} addOns = Series() for t in trades: if t.assetClass in ac_bucketing: ac_bucketing[t.assetClass].append(t) else: ac_bucketing[t.assetClass] = [t] for ac, ac_trades in ac_bucketing.items(): if ac == AssetClass.EQ: addOns[ac] = SA_CCR.equity_addOn(ac_trades, ca) elif ac == AssetClass.IR: addOns[ac] = SA_CCR.interest_rate_addOn(ac_trades, ca) elif ac == AssetClass.FX: addOns[ac] = SA_CCR.fx_addOn(ac_trades, ca) V = ca.get_V() C = ca.get_C() aggregate_addOn = addOns.sum() multiplier_var = SA_CCR.multiplier(V, C, aggregate_addOn) PFE = multiplier_var * aggregate_addOn return { 'PFE': PFE, 'multiplier': multiplier_var, 'AddOn_agg': aggregate_addOn }
def infer_posterior(data: Series, mu_0: float, sigma_0_sq: Optional[float] = None, tau_0: Optional[float] = None) -> Normal: """ Return a new Normal distribution of the posterior most likely to generate the given data. :param data: Series of float observations. :param mu_0: Value for the μ₀ (mean) hyper-parameter of the prior Normal distribution describing the mean. :param sigma_0_sq: Value for the σ₀² (variance) hyper-parameter of the prior Normal distribution describing the mean. :param tau_0: Value for the τ₀ (precision) hyper-parameter of the prior Normal distribution describing the mean. """ if not one_is_none(sigma_0_sq, tau_0): raise ValueError('Give either σ₀² or τ₀') n = len(data) x_sum = data.sum() if sigma_0_sq is None: tau = 1 / data.var() return NormalNormalConjugate(n=n, x_sum=x_sum, mu_0=mu_0, tau=tau, tau_0=tau_0).posterior() else: sigma_sq = data.var() return NormalNormalConjugate(n=n, x_sum=x_sum, mu_0=mu_0, sigma_sq=sigma_sq, sigma_0_sq=sigma_0_sq).posterior()
def count_fraction_of_true(series: pd.Series):
    # We are assuming this is called with a Boolean series
    if series.dtype != bool:
        raise ValueError
    num_true = series.sum()
    total = float(series.count())
    return num_true / total if total > 0.0 else 0.0, num_true
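# Usage sketch, assuming count_fraction_of_true above is in scope; note the function
# returns a (fraction, count) tuple.
import pandas as pd

flags = pd.Series([True, False, True, True])
fraction, n_true = count_fraction_of_true(flags)
assert (fraction, n_true) == (0.75, 3)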
def single_proportion_test(sample: pd.Series, category: str, p_0: float,
                           alternative: str) -> Dict[str, float]:
    """Performs a single proportion test

    Args:
        sample: Series with the count of two categorical variables. Check the
            example below for details.
        category: The name of the category we want to use for the test.
        p_0: The proportion of the Null Hypothesis
        alternative: Defines the alternative hypothesis. Possible values:
            'less', 'greater', or 'two-sided'.

    Returns:
        Dict with the calculated "z" parameter and the p-value

    Example:
        The following is an example of the format required for the `sample`
        parameter. The index values (yes, no) are the categories, and the
        values are the count of elements in each category::

            >>> sample
            Out[1]:
            Relapse
            no      4
            yes    20
            Name: Drug, dtype: int64
    """
    n = sample.sum()
    p_hat = sample[category] / n
    _SE = np.sqrt(p_0 * (1 - p_0) / n)
    z = (p_hat - p_0) / _SE  # parenthesised: the whole difference is divided by the standard error
    validate_conditions_for_theoretical_distns(
        inference_type='single-proportion', n=n, p=p_hat)
    return {'z': z, 'p-value': get_p_value(z, alternative=alternative)}
def sinwma(close, length=None, offset=None, **kwargs): """Indicator: Sine Weighted Moving Average (SINWMA) by Everget of TradingView""" # Validate Arguments length = int(length) if length and length > 0 else 14 close = verify_series(close, length) offset = get_offset(offset) if close is None: return # Calculate Result sines = Series([npSin((i + 1) * npPi / (length + 1)) for i in range(0, length)]) w = sines / sines.sum() sinwma = close.rolling(length, min_periods=length).apply(weights(w), raw=True) # Offset if offset != 0: sinwma = sinwma.shift(offset) # Handle fills if "fillna" in kwargs: sinwma.fillna(kwargs["fillna"], inplace=True) if "fill_method" in kwargs: sinwma.fillna(method=kwargs["fill_method"], inplace=True) # Name & Category sinwma.name = f"SINWMA_{length}" sinwma.category = "overlap" return sinwma
def _getSeriesScoreMultipliedByCount(self, targetSeries: pd.Series) -> float: totalCount = targetSeries.count() trueCount = targetSeries.sum() falseCount = totalCount - trueCount return totalCount - (trueCount * trueCount + falseCount * falseCount) / totalCount
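# The score above simplifies: with t + f = n, n - (t**2 + f**2) / n == 2 * t * f / n,
# i.e. the binary Gini impurity 2 * p * (1 - p) scaled by the number of rows.
# A quick numeric check of that identity:
import pandas as pd

mask = pd.Series([True] * 3 + [False] * 7)    # n = 10, p = 0.3
n, t = len(mask), mask.sum()
score = n - (t ** 2 + (n - t) ** 2) / n       # 10 - (9 + 49) / 10 == 4.2
assert abs(score - 2 * n * 0.3 * 0.7) < 1e-12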
def permutation_feature_importance(model_name, model, X, y, categorical_features=None): results = {} mse = compute_mse(model_name, model, X, y, categorical_features=categorical_features) for col in X.columns: permutated_x = X.copy() random_feature_mse = [] for i in range(N_PERMUTATIONS): permutated_x[col] = permutation(permutated_x[col]) if model_name == 'xgboost': temp_x = xgb.DMatrix(permutated_x) elif model_name == 'catboost': temp_x = Pool(permutated_x, cat_features=categorical_features ) if categorical_features else Pool(permutated_x) else: temp_x = permutated_x random_feature_mse.append( compute_mse(model_name, model, temp_x, y, transform_x=False, categorical_features=categorical_features)) results[col] = mean(array(random_feature_mse)) - mse results = Series(results) return results / results.sum()
def make_equal(series: pd.Series, matched: float) -> pd.Series: """ Equally distrubute a series considering a matched value. Lower values than matched value are filtered. :param series: A series which has one or more rows. :param matched: A positive float number. :return: A series that is equally distrubuted. """ check_negative = series.sum() < 0 if check_negative: sorted_ = series.abs().sort_values() else: sorted_ = series.sort_values() per_ = matched / series.size for i, v in enumerate(sorted_): if not v > per_: per_ = (matched - sorted_.iloc[:i + 1].sum()) / (series.size - (i + 1)) else: break sorted_.iloc[i:] = per_ if check_negative: return series.mul(0).add(sorted_).mul(-1) else: return series.mul(0).add(sorted_)
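# Usage sketch, assuming make_equal above is in scope: entries already at or below their
# equal share are kept, and the remainder of `matched` is spread over the larger entries,
# so the result sums to `matched`.
import pandas as pd

series = pd.Series([1.0, 5.0, 10.0])
result = make_equal(series, matched=12.0)

print(result.tolist())      # [1.0, 5.0, 6.0]
assert result.sum() == 12.0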
def plotCosts(series: pd.DataFrame, folder, suffix, xLabel=''):
    # note: despite the name, `series` is a DataFrame of cost components over the index
    f = plt.figure(figsize=(10, 4))
    ax = f.add_subplot(121)
    fig = plt.bar(series.columns, series.iloc[0, :].values, color=colors)
    plt.xticks(rotation=45)
    plt.ylabel('Social Cost')
    plt.title('Base cost contribution')
    ax2 = f.add_subplot(122)
    lines = [
        plt.plot(series.index, series.iloc[:, i] - series.iloc[0, i],
                 color=colors[i], label=series.columns[i])
        for i in range(series.shape[1])
    ]
    total = plt.plot(series.index,
                     series.sum(axis=1).values - series.iloc[0, :].sum(),
                     color='k', linewidth=2.0, label="Total")
    plt.legend(list(series.columns) + ['Total'])
    plt.xlabel(xLabel)
    plt.ylabel('Change in social cost')
    plt.title('Variation')
    f.tight_layout()
    plt.savefig(folder + '/costs' + suffix + '.png')
    return f
def _trapezium_integration_variable(d_ti: pd.Series) -> Optional[float]: """Gapfill version of trap int - will fill out""" # Clear no numbers d_ti = d_ti.dropna() if d_ti.count() == 0: return None # One entry if d_ti.count() == 1: return d_ti[0] * 0.5 # Fall back on average but warn to check data if d_ti.count() <= 3: d_sum = d_ti.sum() if d_sum == 0: return 0 if d_ti.count() == 0: return 0 return 0.5 * d_sum / d_ti.count() bucket_middle = d_ti.count() - 2 bucket_middle_weights = [1] + [2] * bucket_middle + [1] weights = d_ti.values * bucket_middle_weights weights_sum = weights.sum() bucket_energy = 0.5 * weights_sum / ((d_ti.count() - 1) * 2) return bucket_energy
def test_td64_sum_empty(skipna): # GH#37151 ser = Series([], dtype="timedelta64[ns]") result = ser.sum(skipna=skipna) assert isinstance(result, pd.Timedelta) assert result == pd.Timedelta(0)
def test_sum_inf(self): s = Series(np.random.randn(10)) s2 = s.copy() s[5:8] = np.inf s2[5:8] = np.nan assert np.isinf(s.sum()) arr = np.random.randn(100, 100).astype('f4') arr[:, 2] = np.inf with pd.option_context("mode.use_inf_as_na", True): tm.assert_almost_equal(s.sum(), s2.sum()) res = nanops.nansum(arr, axis=1) assert np.isinf(res).all()
def compute_weighted_std(series): """ method for computing weighted standard deviation by duration :return: feature value """ if len(series) <= 1: return 0.0 values = series[:-1].values weights = Series(map(lambda x: float(x) / 10e8, series.index.values[1:] - series.index.values[:-1])) weights = weights.values w_avg = values * weights w_avg = w_avg.sum() / weights.sum() if weights.sum() > 0 else np.nan nonzero_w_num = float(len(weights[weights != 0])) std = (weights * (values - w_avg) ** 2).sum() / (((nonzero_w_num - 1) / float( nonzero_w_num) if nonzero_w_num > 0 else np.nan) * weights.sum()) if weights.sum() > 0 else np.nan return np.sqrt(std)
def Calls(self):
    rows = []
    for name, callTimes in self.times['call'].items():
        s = Series(callTimes)
        func, loc = formatName(name)
        callCount = s.count()
        meanTime = s.mean()
        totalTime = s.sum()
        rows.append((func, loc, callCount, meanTime, totalTime))
    columns = ('FUNCTION', 'SOURCE', 'COUNT', 'MEAN', 'TOTAL')
    return DataFrame.from_records(rows, columns=columns, index=('FUNCTION', 'SOURCE'))
def Phases(self):
    rows = []
    for prefix in ('parse', 'compile', 'run'):
        for name, callTimes in self.times[prefix].items():
            s = Series(callTimes)
            callCount = s.count()
            meanTime = s.mean()
            totalTime = s.sum()
            rows.append(("%s:%s" % (prefix, name), callCount, meanTime, totalTime))
    columns = ('PHASE', 'COUNT', 'MEAN', 'TOTAL')
    return DataFrame.from_records(rows, columns=columns, index='PHASE')
def test_pie_series(self): # if sum of values is less than 1.0, pie handle them as rate and draw # semicircle. series = Series(np.random.randint(1, 5), index=['a', 'b', 'c', 'd', 'e'], name='YLABEL') ax = _check_plot_works(series.plot.pie) self._check_text_labels(ax.texts, series.index) assert ax.get_ylabel() == 'YLABEL' # without wedge labels ax = _check_plot_works(series.plot.pie, labels=None) self._check_text_labels(ax.texts, [''] * 5) # with less colors than elements color_args = ['r', 'g', 'b'] ax = _check_plot_works(series.plot.pie, colors=color_args) color_expected = ['r', 'g', 'b', 'r', 'g'] self._check_colors(ax.patches, facecolors=color_expected) # with labels and colors labels = ['A', 'B', 'C', 'D', 'E'] color_args = ['r', 'g', 'b', 'c', 'm'] ax = _check_plot_works(series.plot.pie, labels=labels, colors=color_args) self._check_text_labels(ax.texts, labels) self._check_colors(ax.patches, facecolors=color_args) # with autopct and fontsize ax = _check_plot_works(series.plot.pie, colors=color_args, autopct='%.2f', fontsize=7) pcts = ['{0:.2f}'.format(s * 100) for s in series.values / float(series.sum())] iters = [iter(series.index), iter(pcts)] expected_texts = list(next(it) for it in itertools.cycle(iters)) self._check_text_labels(ax.texts, expected_texts) for t in ax.texts: assert t.get_fontsize() == 7 # includes negative value with pytest.raises(ValueError): series = Series([1, 2, 0, 4, -1], index=['a', 'b', 'c', 'd', 'e']) series.plot.pie() # includes nan series = Series([1, 2, np.nan, 4], index=['a', 'b', 'c', 'd'], name='YLABEL') ax = _check_plot_works(series.plot.pie) self._check_text_labels(ax.texts, ['a', 'b', '', 'd'])
def interrogator( corpus, search, query="any", show="w", exclude=False, excludemode="any", searchmode="all", dep_type="collapsed-ccprocessed-dependencies", case_sensitive=False, quicksave=False, just_speakers=False, preserve_case=False, lemmatag=False, files_as_subcorpora=False, conc=False, only_unique=False, random=False, only_format_match=False, multiprocess=False, spelling=False, regex_nonword_filter=r"[A-Za-z0-9:_]", gramsize=2, split_contractions=False, **kwargs ): """interrogate corpus, corpora, subcorpus and file objects see corpkit.interrogation.interrogate() for docstring""" # store kwargs locs = locals() from corpkit.interrogation import Interrogation from corpkit.process import tregex_engine import pandas as pd from pandas import DataFrame, Series from collections import Counter from corpkit.other import as_regex from corpkit.process import get_deps from time import localtime, strftime thetime = strftime("%H:%M:%S", localtime()) from corpkit.textprogressbar import TextProgressBar from corpkit.process import animator from corpkit.dictionaries.word_transforms import wordlist, taglemma # find out if using gui root = kwargs.get("root") note = kwargs.get("note") # convert path to corpus object if type(corpus) == str: from corpkit.corpus import Corpus corpus = Corpus(corpus) # figure out how the user has entered the query and normalise from corpkit.process import searchfixer search, search_iterable = searchfixer(search, query) # for better printing of query, esp during multiprocess # can remove if multiprocess printing improved if len(search.keys()) == 1: query = search.values()[0] if "l" in show and search.get("t"): from nltk.stem.wordnet import WordNetLemmatizer lmtzr = WordNetLemmatizer() if type(show) == str: show = [show] def is_multiquery(corpus, search, query, just_speakers): """determine if multiprocessing is needed do some retyping if need be as well""" im = False from collections import OrderedDict if hasattr(corpus, "__iter__"): im = True # so we can do search = 't', query = ['NP', 'VP']: if type(query) == list: if query != search.values()[0] or len(search.keys()) > 1: query = {c.title(): c for c in query} if type(query) == dict or type(query) == OrderedDict: im = True if just_speakers: if just_speakers == "each": im = True just_speakers = ["each"] if just_speakers == ["each"]: im = True if type(just_speakers) == str: im = False just_speakers = [just_speakers] if type(just_speakers) == list: if len(just_speakers) > 1: im = True if type(search) == dict: if all(type(i) == dict for i in search.values()): im = True return im, corpus, search, query, just_speakers def slow_tregex(sents, **dummy_args): """do the speaker-specific version of tregex queries""" import os from corpkit.process import tregex_engine # first, put the relevant trees into temp file if kwargs.get("outname"): to_open = "tmp-%s.txt" % kwargs["outname"] else: to_open = "tmp.txt" to_write = "\n".join([sent._parse_string.strip() for sent in sents if sent.parse_string is not None]) to_write.encode("utf-8", errors="ignore") with open(to_open, "w") as fo: fo.write(to_write) q = search.values()[0] res = tregex_engine( query=q, options=["-o", "-%s" % translated_option], corpus=to_open, root=root, preserve_case=True ) if root: root.update() os.remove(to_open) if countmode: return len(res) else: return res def get_stats(sents, **dummy_args): """get a bunch of frequencies on interpersonal phenomena""" import os import re from collections import Counter statsmode_results = Counter() # first, put the relevant trees into 
temp file if kwargs.get("outname"): to_open = "tmp-%s.txt" % kwargs["outname"] else: to_open = "tmp.txt" with open(to_open, "w") as fo: for sent in sents: statsmode_results["Sentences"] += 1 sts = sent.parse_string.rstrip() encd = sts.encode("utf-8", errors="ignore") + "\n" fo.write(encd) deps = get_deps(sent, dep_type) numpass = len([x for x in deps.links if x.type.endswith("pass")]) statsmode_results["Passives"] += numpass statsmode_results["Tokens"] += len(sent.tokens) words = [w.word for w in sent.tokens if w.word.isalnum()] statsmode_results["Words"] += len(words) statsmode_results["Characters"] += len("".join(words)) # count moods via trees (/\?/ !< __) from dictionaries.process_types import processes from corpkit.other import as_regex tregex_qs = { "Imperative": r"ROOT < (/(S|SBAR)/ < (VP !< VBD !< VBG !$ NP !$ SBAR < NP !$-- S !$-- VP !$ VP)) !<< (/\?/ !< __) !<<- /-R.B-/ !<<, /(?i)^(-l.b-|hi|hey|hello|oh|wow|thank|thankyou|thanks|welcome)$/", "Open interrogative": r"ROOT < SBARQ <<- (/\?/ !< __)", "Closed interrogative": r"ROOT ( < (SQ < (NP $+ VP)) << (/\?/ !< __) | < (/(S|SBAR)/ < (VP $+ NP)) <<- (/\?/ !< __))", "Unmodalised declarative": r"ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP !< MD)))", "Modalised declarative": r"ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP < MD)))", "Open class words": r"/^(NN|JJ|VB|RB)/ < __", "Closed class words": r"__ !< __ !> /^(NN|JJ|VB|RB)/", "Clauses": r"/^S/ < __", "Interrogative": r"ROOT << (/\?/ !< __)", "Mental processes": r"VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)" % as_regex(processes.mental, boundaries="w"), "Verbal processes": r"VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)" % as_regex(processes.verbal, boundaries="w"), "Relational processes": r"VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)" % as_regex(processes.relational, boundaries="w"), } for name, q in sorted(tregex_qs.items()): res = tregex_engine(query=q, options=["-o", "-C"], corpus=to_open, root=root) statsmode_results[name] += int(res) global numdone numdone += 1 if root: root.update() else: tot_string = str(numdone + 1) + "/" + str(total_files) if kwargs.get("outname"): tot_string = "%s: %s" % (kwargs["outname"], tot_string) animator(p, numdone, tot_string, **par_args) if kwargs.get("note", False): kwargs["note"].progvar.set((numdone * 100.0 / total_files / denom) + startnum) os.remove(to_open) return statsmode_results def make_conc_lines_from_whole_mid(wholes, middle_column_result, speakr=False): if speakr is False: speakr = "" conc_lines = [] # remove duplicates from results unique_wholes = [] unique_middle_column_result = [] duplicates = [] for index, ((f, whole), mid) in enumerate(zip(wholes, middle_column_result)): if "-join-".join([f, whole, mid]) not in duplicates: duplicates.append("-join-".join([f, whole, mid])) unique_wholes.append([f, whole]) unique_middle_column_result.append(mid) # split into start, middle and end, dealing with multiple occurrences for index, ((f, whole), mid) in enumerate(zip(unique_wholes, unique_middle_column_result)): reg = re.compile(r"([^a-zA-Z0-9-]|^)(" + re.escape(mid) + r")([^a-zA-Z0-9-]|$)", re.IGNORECASE | re.UNICODE) offsets = [(m.start(), m.end()) for m in re.finditer(reg, whole)] for offstart, offend in offsets: start, middle, end = whole[0:offstart].strip(), whole[offstart:offend].strip(), whole[offend:].strip() conc_lines.append([os.path.basename(f), speakr, start, middle, end]) return conc_lines def uniquify(conc_lines): from collections import OrderedDict unique_lines = [] checking = [] for index, (f, speakr, start, middle, end) in enumerate(conc_lines): joined = " 
".join([speakr, start, "MIDDLEHERE:", middle, ":MIDDLEHERE", end]) if joined not in checking: unique_lines.append(conc_lines[index]) checking.append(joined) return unique_lines def lemmatiser(list_of_words, tag): """take a list of unicode words and a tag and return a lemmatised list.""" output = [] for word in list_of_words: if translated_option.startswith("u"): if word.lower() in taglemma.keys(): word = taglemma[word.lower()] else: if word == "x": word = "Other" # only use wordnet lemmatiser when appropriate else: if word in wordlist: word = wordlist[word] word = lmtzr.lemmatize(word, tag) output.append(word) return output def gettag(query, lemmatag=False): """ Find tag for WordNet lemmatisation """ import re tagdict = {"N": "n", "A": "a", "V": "v", "A": "r", "None": False, "": False, "Off": False} if lemmatag is False: tag = "n" # same default as wordnet # attempt to find tag from tregex query tagfinder = re.compile(r"^[^A-Za-z]*([A-Za-z]*)") tagchecker = re.compile(r"^[A-Z]{1,4}$") qr = query.replace(r"\w", "").replace(r"\s", "").replace(r"\b", "") treebank_tag = re.findall(tagfinder, qr) if re.match(tagchecker, treebank_tag[0]): tag = tagdict.get(treebank_tag[0], "n") elif lemmatag: tag = lemmatag return tag def format_tregex(results): """format tregex by show list""" if countmode: return results import re done = [] if "l" in show or "pl" in show: lemmata = lemmatiser(results, gettag(search.get("t"), lemmatag)) else: lemmata = [None for i in results] for word, lemma in zip(results, lemmata): bits = [] if exclude and exclude.get("w"): if len(exclude.keys()) == 1 or excludemode == "any": if re.search(exclude.get("w"), word): continue if len(exclude.keys()) == 1 or excludemode == "any": if re.search(exclude.get("l"), lemma): continue if len(exclude.keys()) == 1 or excludemode == "any": if re.search(exclude.get("p"), word): continue if len(exclude.keys()) == 1 or excludemode == "any": if re.search(exclude.get("pl"), lemma): continue if exclude and excludemode == "all": num_to_cause_exclude = len(exclude.keys()) current_num = 0 if exclude.get("w"): if re.search(exclude.get("w"), word): current_num += 1 if exclude.get("l"): if re.search(exclude.get("l"), lemma): current_num += 1 if exclude.get("p"): if re.search(exclude.get("p"), word): current_num += 1 if exclude.get("pl"): if re.search(exclude.get("pl"), lemma): current_num += 1 if current_num == num_to_cause_exclude: continue for i in show: if i == "t": bits.append(word) if i == "l": bits.append(lemma) elif i == "w": bits.append(word) elif i == "p": bits.append(word) elif i == "pl": bits.append(lemma) joined = "/".join(bits) done.append(joined) return done def tok_by_list(pattern, list_of_toks, concordancing=False, **kwargs): """search for regex in plaintext corpora""" import re if type(pattern) == str: pattern = [pattern] if not case_sensitive: pattern = [p.lower() for p in pattern] if not concordancing: if case_sensitive: matches = [m for m in list_of_toks if m in pattern] else: matches = [m for m in list_of_toks if m.lower() in pattern] else: matches = [] for index, token in enumerate(list_of_toks): if token in pattern: match = [" ".join([t for t in unsplitter(list_of_toks[:index])])[-140:]] match.append(token) match.append(" ".join([t for t in unsplitter(list_of_toks[index + 1 :])])[:140]) matches.append(match) if countmode: return len(matches) else: return matches def unsplitter(lst): """unsplit contractions and apostophes from tokenised text""" if split_contractions: return lst unsplit = [] for index, t in enumerate(lst): if index 
== 0 or index == len(lst) - 1: unsplit.append(t) continue if "'" in t and not t.endswith("'"): rejoined = "".join([lst[index - 1], t]) unsplit.append(rejoined) else: if not "'" in lst[index + 1]: unsplit.append(t) return unsplit def tok_ngrams(pattern, list_of_toks, concordancing=False, split_contractions=True): from collections import Counter import re ngrams = Counter() result = [] # if it's not a compiled regex list_of_toks = [x for x in list_of_toks if re.search(regex_nonword_filter, x)] if pattern.lower() == "any": pattern = r".*" if not split_contractions: list_of_toks = unsplitter(list_of_toks) # list_of_toks = [x for x in list_of_toks if "'" not in x] for index, w in enumerate(list_of_toks): try: the_gram = [list_of_toks[index + x] for x in range(gramsize)] if not any(re.search(pattern, x) for x in the_gram): continue ngrams[" ".join(the_gram)] += 1 except IndexError: pass # turn counter into list of results for k, v in ngrams.items(): if v > 1: for i in range(v): result.append(k) if countmode: return len(result) else: return result def compiler(pattern): """compile regex or fail gracefully""" import re try: if case_sensitive: comped = re.compile(pattern) else: comped = re.compile(pattern, re.IGNORECASE) return comped except: import traceback import sys from time import localtime, strftime exc_type, exc_value, exc_traceback = sys.exc_info() lst = traceback.format_exception(exc_type, exc_value, exc_traceback) error_message = lst[-1] thetime = strftime("%H:%M:%S", localtime()) print "%s: Query %s" % (thetime, error_message) if root: return "Bad query" else: raise ValueError("%s: Query %s" % (thetime, error_message)) def tok_by_reg(pattern, list_of_toks, concordancing=False, **kwargs): """search for regex in plaintext corpora""" import re comped = compiler(pattern) if comped == "Bad query": return "Bad query" if not concordancing: matches = [m for m in list_of_toks if re.search(comped, m)] else: matches = [] for index, token in enumerate(list_of_toks): if re.search(comped, token): match = [" ".join([t for t in unsplitter(list_of_toks[:index])])[-140:]] match.append(re.search(comped, token).group(0)) match.append(" ".join([t for t in unsplitter(list_of_toks[index + 1 :])])[:140]) matches.append(match) if countmode: return len(matches) else: return matches def plaintext_regex_search(pattern, plaintext_data, concordancing=False, **kwargs): """search for regex in plaintext corpora it searches over lines, so the user needs to be careful. 
""" import re if concordancing: pattern = r"(.{,140})\b(" + pattern + r")\b(.{,140})" compiled_pattern = compiler(pattern) if compiled_pattern == "Bad query": return "Bad query" matches = re.findall(compiled_pattern, plaintext_data) if concordancing: matches = [list(m) for m in matches] if not concordancing: for index, i in enumerate(matches): if type(i) == tuple: matches[index] = i[0] if countmode: return len(matches) else: return matches def correct_spelling(a_string): if not spelling: return a_string from dictionaries.word_transforms import usa_convert if spelling.lower() == "uk": usa_convert = {v: k for k, v in usa_convert.items()} spell_out = [] bits = a_string.split("/") for index, i in enumerate(bits): converted = usa_convert.get(i.lower(), i) if i.islower() or preserve_case is False: converted = converted.lower() elif i.isupper() and preserve_case: converted = converted.upper() elif i.istitle() and preserve_case: converted = converted.title() bits[index] = converted r = "/".join(bits) return r def plaintext_simple_search(pattern, plaintext_data, concordancing=False, **kwargs): """search for tokens in plaintext corpora""" import re result = [] if type(pattern) == str: pattern = [pattern] for p in pattern: if concordancing: pat = r"(.{0,140})\b(" + re.escape(p) + r")\b(.{0,140})" pat = compiler(pat) if pat == "Bad query": return "Bad query" matches = re.findall(pat, plaintext_data) if concordancing: matches = [list(m) for m in matches] for i in matches: result.append(i) else: for m in range(len(matches)): result.append(p) return result # do multiprocessing if need be im, corpus, search, query, just_speakers = is_multiquery(corpus, search, query, just_speakers) locs["search"] = search locs["query"] = query locs["just_speakers"] = just_speakers locs["corpus"] = corpus locs["multiprocess"] = multiprocess if im: from corpkit.multiprocess import pmultiquery return pmultiquery(**locs) datatype = corpus.datatype singlefile = corpus.singlefile # store all results in here results = {} # check if just counting countmode = "c" in show # where we are at in interrogation current_iter = 0 # multiprocessing progress bar denom = kwargs.get("denominator", 1) startnum = kwargs.get("startnum", 0) ############################################ # Determine the search function to be used # ############################################ # simple tregex is tregex over whole dirs simple_tregex_mode = False statsmode = False if not just_speakers and "t" in search.keys(): simple_tregex_mode = True else: if corpus.datatype == "plaintext": if search.get("n"): raise NotImplementedError("Use a tokenised corpus for n-gramming.") # searcher = plaintext_ngram optiontext = "n-grams via plaintext" if search.get("w"): if kwargs.get("regex", True): searcher = plaintext_regex_search else: searcher = plaintext_simple_search optiontext = "Searching plaintext" elif corpus.datatype == "tokens": if search.get("n"): searcher = tok_ngrams optiontext = "n-grams via tokens" elif search.get("w"): if kwargs.get("regex", True): searcher = tok_by_reg else: searcher = tok_by_list if type(search.get("w")) == list: searcher = tok_by_list optiontext = "Searching tokens" only_parse = ["r", "d", "g", "dl", "gl", "df", "gf", "dp", "gp", "f"] if corpus.datatype != "parse" and any(i in only_parse for i in search.keys()): raise ValueError( 'Need parsed corpus to search with "%s" option(s).' 
% ", ".join([i for i in search.keys() if i in only_parse]) ) elif corpus.datatype == "parse": if search.get("t"): searcher = slow_tregex elif search.get("s"): searcher = get_stats statsmode = True optiontext = "General statistics" global numdone numdone = 0 else: from corpkit.depsearch import dep_searcher searcher = dep_searcher optiontext = "Dependency querying" ############################################ # Set some Tregex-related values # ############################################ if search.get("t"): query = search.get("t") # check the query q = tregex_engine(corpus=False, query=search.get("t"), options=["-t"], check_query=True, root=root) if query is False: if root: return "Bad query" else: return optiontext = "Searching parse trees" if "p" in show or "pl" in show: translated_option = "u" if type(search["t"]) == list: search["t"] = r"__ < (/%s/ !< __)" % as_regex( search["t"], boundaries="line", case_sensitive=case_sensitive ) if search["t"] == "any": search["t"] = r"__ < (/.?[A-Za-z0-9].?/ !< __)" elif "t" in show: translated_option = "o" if type(search["t"]) == list: search["t"] = r"__ < (/%s/ !< __)" % as_regex( search["t"], boundaries="line", case_sensitive=case_sensitive ) if search["t"] == "any": search["t"] = r"__ < (/.?[A-Za-z0-9].?/ !< __)" elif "w" in show: translated_option = "t" if type(search["t"]) == list: search["t"] = r"/%s/ !< __" % as_regex(search["t"], boundaries="line", case_sensitive=case_sensitive) if search["t"] == "any": search["t"] = r"/.?[A-Za-z0-9].?/ !< __" elif "c" in show: count_results = {} only_count = True translated_option = "C" if type(search["t"]) == list: search["t"] = r"/%s/ !< __" % as_regex(search["t"], boundaries="line", case_sensitive=case_sensitive) if search["t"] == "any": search["t"] = r"/.?[A-Za-z0-9].?/ !< __" elif "l" in show: translated_option = "t" if type(search["t"]) == list: search["t"] = r"/%s/ !< __" % as_regex(search["t"], boundaries="line", case_sensitive=case_sensitive) if search["t"] == "any": search["t"] = r"/.?[A-Za-z0-9].?/ !< __" query = search["t"] ############################################ # Make iterable for corpus/subcorpus/file # ############################################ if corpus.singlefile: to_iterate_over = {(corpus.name, corpus.path): [corpus]} elif not corpus.subcorpora: to_iterate_over = {(corpus.name, corpus.path): corpus.files} else: to_iterate_over = {} for k, v in sorted(corpus.structure.items()): to_iterate_over[(k.name, k.path)] = v if files_as_subcorpora: to_iterate_over = {} for f in corpus.files: to_iterate_over[(f.name, f.path)] = [f] ############################################ # Print welcome message # ############################################ if conc: message = "Concordancing" else: message = "Interrogating" if kwargs.get("printstatus", True): thetime = strftime("%H:%M:%S", localtime()) sformat = "\n ".join(["%s: %s" % (k.rjust(3), v) for k, v in search.items()]) if search == {"s": r".*"}: sformat = "features" welcome = "\n%s: %s %s ...\n %s\n Query: %s\n" % ( thetime, message, corpus.name, optiontext, sformat, ) print welcome ############################################ # Make progress bar # ############################################ if simple_tregex_mode: total_files = len(to_iterate_over.keys()) else: if search.get("s"): total_files = sum([len(x) for x in to_iterate_over.values()]) * 12 else: total_files = sum([len(x) for x in to_iterate_over.values()]) par_args = {"printstatus": kwargs.get("printstatus", True), "root": root, "note": note, "length": total_files} term = None if 
kwargs.get("paralleling", None) is not None: from blessings import Terminal term = Terminal() par_args["terminal"] = term par_args["linenum"] = kwargs.get("paralleling") outn = kwargs.get("outname", "") if outn: outn = outn + ": " tstr = "%s%d/%d" % (outn, current_iter, total_files) p = animator(None, None, init=True, tot_string=tstr, **par_args) tstr = "%s%d/%d" % (outn, current_iter + 1, total_files) animator(p, current_iter, tstr, **par_args) ############################################ # Iterate over data, doing interrogations # ############################################ for (subcorpus_name, subcorpus_path), files in sorted(to_iterate_over.items()): if countmode or conc: results[subcorpus_name] = [] else: results[subcorpus_name] = Counter() # tregex over subcorpora, not files if simple_tregex_mode: op = ["-o", "-" + translated_option] result = tregex_engine( query=search["t"], options=op, corpus=subcorpus_path, root=root, preserve_case=preserve_case ) if countmode: results[subcorpus_name].append(result) continue result = Counter(format_tregex(result)) if conc: op.append("-w") whole_result = tregex_engine( query=search["t"], options=op, corpus=subcorpus_path, root=root, preserve_case=preserve_case ) if not only_format_match: whole_result = format_tregex(whole_result) result = make_conc_lines_from_whole_mid(whole_result, result, speakr=False) if spelling: for index, line in enumerate(result): result[index] = [correct_spelling(b) for b in line] results[subcorpus_name] += result current_iter += 1 if kwargs.get("paralleling", None) is not None: tstr = "%s%d/%d" % (outn, current_iter + 2, total_files) else: tstr = "%s%d/%d" % (outn, current_iter + 1, total_files) animator(p, current_iter, tstr, **par_args) # dependencies, plaintext, tokens or slow_tregex else: for f in files: if corpus.datatype == "parse": with open(f.path, "r") as data: data = data.read() from corenlp_xml.document import Document try: corenlp_xml = Document(data) except: print "Could not read file: %s" % f.path continue if just_speakers: sents = [s for s in corenlp_xml.sentences if s.speakername in just_speakers] if not sents: continue else: sents = corenlp_xml.sentences res = searcher( sents, search=search, show=show, dep_type=dep_type, exclude=exclude, excludemode=excludemode, searchmode=searchmode, lemmatise=False, case_sensitive=case_sensitive, concordancing=conc, only_format_match=only_format_match, ) if res == "Bad query": return "Bad query" if searcher == slow_tregex and not countmode: res = format_tregex(res) elif corpus.datatype == "tokens": import pickle with open(f.path, "rb") as fo: data = pickle.load(fo) res = searcher(search.values()[0], data, split_contractions=split_contractions, concordancing=conc) if conc: for index, line in enumerate(res): line.insert(0, "") elif corpus.datatype == "plaintext": with open(f.path, "rb") as data: data = data.read() data = unicode(data, errors="ignore") res = searcher(search.values()[0], data, concordancing=conc) if conc: for index, line in enumerate(res): line.insert(0, "") if countmode: results[subcorpus_name] += res continue # add filename and do lowercasing for conc if conc: for index, line in enumerate(res): line.insert(0, f.name) if not preserve_case: line = [b.lower() for b in line] if spelling: line = [correct_spelling(b) for b in line] results[subcorpus_name] += [line] # do lowercasing and spelling else: if not preserve_case: res = [r.lower() for r in res] if spelling: res = [correct_spelling(r) for r in res] results[subcorpus_name] += Counter(res) if not 
statsmode: current_iter += 1 if kwargs.get("paralleling", None) is not None: tstr = "%s%d/%d" % (outn, current_iter + 2, total_files) else: tstr = "%s%d/%d" % (outn, current_iter + 1, total_files) # delete temp file if there import os if os.path.isfile("tmp.txt"): os.remove("tmp.txt") ############################################ # Get concordances into DataFrame # ############################################ if conc: all_conc_lines = [] for sc_name, resu in sorted(results.items()): if only_unique: unique_results = uniquify(resu) else: unique_results = resu # make into series pindex = "c f s l m r".encode("utf-8").split() for fname, spkr, start, word, end in unique_results: spkr = unicode(spkr, errors="ignore") fname = os.path.basename(fname) # the use of ascii here makes sure the string formats ok, but will also screw over # anyone doing non-english work. so, change to utf-8, then fix errors as they come # in the corpkit-gui "add_conc_lines_to_window" function all_conc_lines.append( Series( [ sc_name.encode("ascii", errors="ignore"), fname.encode("ascii", errors="ignore"), spkr.encode("ascii", errors="ignore"), start.encode("ascii", errors="ignore"), word.encode("ascii", errors="ignore"), end.encode("ascii", errors="ignore"), ], index=pindex, ) ) # randomise results... if random: from random import shuffle shuffle(all_conc_lines) df = pd.concat(all_conc_lines, axis=1).T # not doing anything yet --- this is for multimodal concordancing add_links = False if not add_links: df.columns = ["c", "f", "s", "l", "m", "r"] else: df.columns = ["c", "f", "s", "l", "m", "r", "link"] if all(x == "" for x in list(df["s"].values)): df.drop("s", axis=1, inplace=True) if kwargs.get("note"): kwargs["note"].progvar.set(100) if kwargs.get("printstatus", True): thetime = strftime("%H:%M:%S", localtime()) finalstring = "\n\n%s: Concordancing finished! %d matches.\n" % (thetime, len(df.index)) print finalstring from corpkit.interrogation import Concordance output = Concordance(df) output.query = locs if quicksave: interro.save() return output ############################################ # Get interrogation into DataFrame # ############################################ else: if countmode: df = Series({k: sum(v) for k, v in sorted(results.items())}) tot = df.sum() else: the_big_dict = {} unique_results = set([item for sublist in results.values() for item in sublist]) for word in unique_results: the_big_dict[word] = [subcorp_result[word] for subcorp_result in sorted(results.values())] # turn master dict into dataframe, sorted df = DataFrame(the_big_dict, index=sorted(results.keys())) numentries = len(df.columns) tot = df.sum(axis=1) total_total = df.sum().sum() ############################################ # Format, output as Interrogation object # ############################################ if not countmode: if not corpus.subcorpora or singlefile: if not files_as_subcorpora: if not kwargs.get("df1_always_df"): df = Series(df.ix[0]) df.sort(ascending=False) tot = df.sum() numentries = len(df.index) total_total = tot # sort by total if type(df) == pd.core.frame.DataFrame: if not df.empty: df.ix["Total-tmp"] = df.sum() the_tot = df.ix["Total-tmp"] df = df[the_tot.argsort()[::-1]] df = df.drop("Total-tmp", axis=0) # format final string if kwargs.get("printstatus", True): thetime = strftime("%H:%M:%S", localtime()) finalstring = "\n\n%s: Interrogation finished!" % thetime if countmode: finalstring += " %d matches." % tot else: finalstring += " %d unique results, %d total occurrences." 
% (numentries, total_total) print finalstring interro = Interrogation(results=df, totals=tot, query=locs) if quicksave: interro.save() return interro
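# Hedged sketch (not corpkit code): the "sort by total" step above orders result
# columns by their summed frequency via a temporary "Total-tmp" row. With the
# current pandas API the same ordering can be written without the temporary row;
# the helper name below is illustrative only.
import pandas as pd

def sort_columns_by_total(df: pd.DataFrame) -> pd.DataFrame:
    """Order columns from most to least frequent across all subcorpora."""
    return df[df.sum(axis=0).sort_values(ascending=False).index]

# e.g. pd.DataFrame({'dog': [3, 1], 'cat': [10, 5]}) -> columns ['cat', 'dog']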
def normalize_confusion_matrix(cm: pd.Series) -> pd.Series:
    return cm / cm.sum()
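# Usage sketch for normalize_confusion_matrix: given per-class counts as a
# Series, the result is each class's share of the total (values sum to 1.0).
# The example data below is made up for illustration.
import pandas as pd

counts = pd.Series({'spam': 40, 'ham': 60})
shares = normalize_confusion_matrix(counts)
# shares['spam'] == 0.4, shares['ham'] == 0.6, shares.sum() == 1.0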
def editor(interrogation, operation=None, denominator=False, sort_by=False, keep_stats=False, keep_top=False, just_totals=False, threshold='medium', just_entries=False, skip_entries=False, merge_entries=False, just_subcorpora=False, skip_subcorpora=False, span_subcorpora=False, merge_subcorpora=False, replace_names=False, replace_subcorpus_names=False, projection=False, remove_above_p=False, p=0.05, print_info=False, spelling=False, selfdrop=True, calc_all=True, keyword_measure='ll', **kwargs ): """ See corpkit.interrogation.Interrogation.edit() for docstring """ # grab arguments, in case we get dict input and have to iterate locs = locals() import corpkit import re import collections import pandas as pd import numpy as np from pandas import DataFrame, Series from time import localtime, strftime try: get_ipython().getoutput() except TypeError: have_ipython = True except NameError: have_ipython = False try: from IPython.display import display, clear_output except ImportError: pass # to use if we also need to worry about concordance lines return_conc = False from corpkit.interrogation import Interrodict, Interrogation, Concordance if interrogation.__class__ == Interrodict: locs.pop('interrogation', None) from collections import OrderedDict outdict = OrderedDict() for i, (k, v) in enumerate(interrogation.items()): # only print the first time around if i != 0: locs['print_info'] = False if isinstance(denominator, STRINGTYPE) and denominator.lower() == 'self': denominator = interrogation # if df2 is also a dict, get the relevant entry if isinstance(denominator, (dict, Interrodict)): #if sorted(set([i.lower() for i in list(dataframe1.keys())])) == \ # sorted(set([i.lower() for i in list(denominator.keys())])): # locs['denominator'] = denominator[k] # fix: this repeats itself for every key, when it doesn't need to # denominator_sum: if kwargs.get('denominator_sum'): locs['denominator'] = denominator.collapse(axis='key') if kwargs.get('denominator_totals'): locs['denominator'] = denominator[k].totals else: locs['denominator'] = denominator[k].results outdict[k] = v.results.edit(**locs) if print_info: thetime = strftime("%H:%M:%S", localtime()) print("\n%s: Finished! 
Output is a dictionary with keys:\n\n '%s'\n" % (thetime, "'\n '".join(sorted(outdict.keys())))) return Interrodict(outdict) elif isinstance(interrogation, (DataFrame, Series)): dataframe1 = interrogation elif isinstance(interrogation, Interrogation): #if interrogation.__dict__.get('concordance', None) is not None: # concordances = interrogation.concordance branch = kwargs.pop('branch', 'results') if branch.lower().startswith('r') : dataframe1 = interrogation.results elif branch.lower().startswith('t'): dataframe1 = interrogation.totals elif branch.lower().startswith('c'): dataframe1 = interrogation.concordance return_conc = True else: dataframe1 = interrogation.results elif isinstance(interrogation, Concordance) or \ all(x in list(dataframe1.columns) for x in [ 'l', 'm', 'r']): return_conc = True print('heree') dataframe1 = interrogation # hope for the best else: dataframe1 = interrogation the_time_started = strftime("%Y-%m-%d %H:%M:%S") pd.options.mode.chained_assignment = None try: from process import checkstack except ImportError: from corpkit.process import checkstack if checkstack('pythontex'): print_info=False def combiney(df, df2, operation='%', threshold='medium', prinf=True): """mash df and df2 together in appropriate way""" totals = False # delete under threshold if just_totals: if using_totals: if not single_totals: to_drop = list(df2[df2['Combined total'] < threshold].index) df = df.drop([e for e in to_drop if e in list(df.index)]) if prinf: to_show = [] [to_show.append(w) for w in to_drop[:5]] if len(to_drop) > 10: to_show.append('...') [to_show.append(w) for w in to_drop[-5:]] if len(to_drop) > 0: print('Removing %d entries below threshold:\n %s' % (len(to_drop), '\n '.join(to_show))) if len(to_drop) > 10: print('... and %d more ... \n' % (len(to_drop) - len(to_show) + 1)) else: print('') else: denom = df2 else: denom = list(df2) if single_totals: if operation == '%': totals = df.sum() * 100.0 / float(df.sum().sum()) df = df * 100.0 try: df = df.div(denom, axis=0) except ValueError: thetime = strftime("%H:%M:%S", localtime()) print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime) elif operation == '+': try: df = df.add(denom, axis=0) except ValueError: thetime = strftime("%H:%M:%S", localtime()) print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime) elif operation == '-': try: df = df.sub(denom, axis=0) except ValueError: thetime = strftime("%H:%M:%S", localtime()) print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime) elif operation == '*': totals = df.sum() * float(df.sum().sum()) try: df = df.mul(denom, axis=0) except ValueError: thetime = strftime("%H:%M:%S", localtime()) print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime) elif operation == '/': try: totals = df.sum() / float(df.sum().sum()) df = df.div(denom, axis=0) except ValueError: thetime = strftime("%H:%M:%S", localtime()) print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime) elif operation == 'a': for c in [c for c in list(df.columns) if int(c) > 1]: df[c] = df[c] * (1.0 / int(c)) df = df.sum(axis=1) / df2 elif operation.startswith('c'): import warnings with warnings.catch_warnings(): warnings.simplefilter("ignore") df = pandas.concat([df, df2], axis=1) return df, totals elif not single_totals: if not operation.startswith('a'): # generate totals if operation == '%': totals = df.sum() * 100.0 / float(df2.sum().sum()) if operation == '*': totals = df.sum() * float(df2.sum().sum()) if operation == '/': totals = 
df.sum() / float(df2.sum().sum()) if operation.startswith('c'): # add here the info that merging will not work # with identical colnames import warnings with warnings.catch_warnings(): warnings.simplefilter("ignore") d = pd.concat([df.T, df2.T]) # make index nums d = d.reset_index() # sum and remove duplicates d = d.groupby('index').sum() dx = d.reset_index('index') dx.index = list(dx['index']) df = dx.drop('index', axis=1).T def editf(datum): meth = {'%': datum.div, '*': datum.mul, '/': datum.div, '+': datum.add, '-': datum.sub} if datum.name in list(df2.columns): method = meth[operation] mathed = method(df2[datum.name], fill_value=0.0) if operation == '%': return mathed * 100.0 else: return mathed else: return datum * 0.0 df = df.apply(editf) else: for c in [c for c in list(df.columns) if int(c) > 1]: df[c] = df[c] * (1.0 / int(c)) df = df.sum(axis=1) / df2.T.sum() return df, totals def parse_input(df, the_input): """turn whatever has been passed in into list of words that can be used as pandas indices---maybe a bad way to go about it""" parsed_input = False import re if the_input == 'all': the_input = r'.*' if isinstance(the_input, int): try: the_input = str(the_input) except: pass the_input = [the_input] elif isinstance(the_input, STRINGTYPE): regex = re.compile(the_input) parsed_input = [w for w in list(df) if re.search(regex, w)] return parsed_input from corpkit.dictionaries.process_types import Wordlist if isinstance(the_input, Wordlist) or the_input.__class__ == Wordlist: the_input = list(the_input) if isinstance(the_input, list): if isinstance(the_input[0], int): parsed_input = [word for index, word in enumerate(list(df)) if index in the_input] elif isinstance(the_input[0], STRINGTYPE): try: parsed_input = [word for word in the_input if word in df.columns] except AttributeError: # if series parsed_input = [word for word in the_input if word in df.index] return parsed_input def synonymise(df, pos='n'): """pass a df and a pos and convert df columns to most common synonyms""" from nltk.corpus import wordnet as wn #from dictionaries.taxonomies import taxonomies from collections import Counter fixed = [] for w in list(df.columns): try: syns = [] for syns in wn.synsets(w, pos=pos): for w in syns: synonyms.append(w) top_syn = Counter(syns).most_common(1)[0][0] fixed.append(top_syn) except: fixed.append(w) df.columns = fixed return df def convert_spell(df, convert_to='US', print_info=print_info): """turn dataframes into us/uk spelling""" from dictionaries.word_transforms import usa_convert if print_info: print('Converting spelling ... \n') if convert_to == 'UK': usa_convert = {v: k for k, v in list(usa_convert.items())} fixed = [] for val in list(df.columns): try: fixed.append(usa_convert[val]) except: fixed.append(val) df.columns = fixed return df def merge_duplicates(df, print_info=print_info): if print_info: print('Merging duplicate entries ... 
\n') # now we have to merge all duplicates for dup in df.columns.get_duplicates(): #num_dupes = len(list(df[dup].columns)) temp = df[dup].sum(axis=1) #df = df.drop([dup for d in range(num_dupes)], axis=1) df = df.drop(dup, axis=1) df[dup] = temp return df def name_replacer(df, replace_names, print_info=print_info): """replace entry names and merge""" import re # get input into list of tuples # if it's a string, we want to delete it if isinstance(replace_names, STRINGTYPE): replace_names = [(replace_names, '')] # this is for some malformed list if not isinstance(replace_names, dict): if isinstance(replace_names[0], STRINGTYPE): replace_names = [replace_names] # if dict, make into list of tupes if isinstance(replace_names, dict): replace_names = [(v, k) for k, v in replace_names.items()] for to_find, replacement in replace_names: if print_info: if replacement: print('Replacing "%s" with "%s" ...\n' % (to_find, replacement)) else: print('Deleting "%s" from entry names ...\n' % to_find) to_find = re.compile(to_find) if not replacement: replacement = '' df.columns = [re.sub(to_find, replacement, l) for l in list(df.columns)] df = merge_duplicates(df, print_info=False) return df def just_these_entries(df, parsed_input, prinf=True): entries = [word for word in list(df) if word not in parsed_input] if prinf: print('Keeping %d entries:\n %s' % \ (len(parsed_input), '\n '.join(parsed_input[:10]))) if len(parsed_input) > 10: print('... and %d more ... \n' % (len(parsed_input) - 10)) else: print('') df = df.drop(entries, axis=1) return df def skip_these_entries(df, parsed_input, prinf=True): if prinf: print('Skipping %d entries:\n %s' % \ (len(parsed_input), '\n '.join(parsed_input[:10]))) if len(parsed_input) > 10: print('... and %d more ... \n' % (len(parsed_input) - 10)) else: print('') df = df.drop(parsed_input, axis=1) return df def newname_getter(df, parsed_input, newname='combine', prinf=True, merging_subcorpora=False): """makes appropriate name for merged entries""" if merging_subcorpora: if newname is False: newname = 'combine' if isinstance(newname, int): the_newname = list(df.columns)[newname] elif isinstance(newname, STRINGTYPE): if newname == 'combine': if len(parsed_input) <= 3: the_newname = '/'.join(parsed_input) elif len(parsed_input) > 3: the_newname = '/'.join(parsed_input[:3]) + '...' else: the_newname = newname if not newname: # revise this code import operator sumdict = {} for item in parsed_input: summed = sum(list(df[item])) sumdict[item] = summed the_newname = max(iter(sumdict.items()), key=operator.itemgetter(1))[0] if not isinstance(the_newname, STRINGTYPE): the_newname = str(the_newname, errors='ignore') return the_newname def merge_these_entries(df, parsed_input, the_newname, prinf=True, merging='entries'): # make new entry with sum of parsed input if len(parsed_input) == 0: import warnings warnings.warn('No %s could be automatically merged.\n' % merging) else: if prinf: print('Merging %d %s as "%s":\n %s' % \ (len(parsed_input), merging, the_newname, '\n '.join(parsed_input[:10]))) if len(parsed_input) > 10: print('... and %d more ... 
\n' % (len(parsed_input) - 10)) else: print('') # remove old entries temp = sum([df[i] for i in parsed_input]) if isinstance(df, Series): df = df.drop(parsed_input, errors='ignore') nms = list(df.index) else: df = df.drop(parsed_input, axis=1, errors='ignore') nms = list(df.columns) if the_newname in nms: df[the_newname] = df[the_newname] + temp else: df[the_newname] = temp return df def just_these_subcorpora(df, lst_of_subcorpora, prinf=True): if isinstance(lst_of_subcorpora[0], int): lst_of_subcorpora = [str(l) for l in lst_of_subcorpora] good_years = [subcorpus for subcorpus in list(df.index) if subcorpus in lst_of_subcorpora] if prinf: print('Keeping %d subcorpora:\n %s' % (len(good_years), '\n '.join(good_years[:10]))) if len(good_years) > 10: print('... and %d more ... \n' % (len(good_years) - 10)) else: print('') df = df.drop([subcorpus for subcorpus in list(df.index) if subcorpus not in good_years], axis=0) return df def skip_these_subcorpora(df, lst_of_subcorpora, prinf=True): if isinstance(lst_of_subcorpora, int): lst_of_subcorpora = [lst_of_subcorpora] if isinstance(lst_of_subcorpora[0], int): lst_of_subcorpora = [str(l) for l in lst_of_subcorpora] bad_years = [subcorpus for subcorpus in list(df.index) if subcorpus in lst_of_subcorpora] if len(bad_years) == 0: import warnings warnings.warn('No subcorpora skipped.\n') else: if prinf: print('Skipping %d subcorpora:\n %s' % (len(bad_years), '\n '.join([str(i) for i in bad_years[:10]]))) if len(bad_years) > 10: print('... and %d more ... \n' % (len(bad_years) - 10)) else: print('') df = df.drop([subcorpus for subcorpus in list(df.index) if subcorpus in bad_years], axis=0) return df def span_these_subcorpora(df, lst_of_subcorpora, prinf=True): """select only a span of suborpora (first, last)""" fir, sec = lst_of_subcorpora if len(lst_of_subcorpora) == 0: import warnings warnings.warn('Span not identified.\n') else: if prinf: print('Keeping subcorpora:\n %d--%d\n' % (int(fir), int(sec))) sbs = list(df.index) df = df.ix[sbs.index(fir):sbs.index(sec) + 1] return df def projector(df, list_of_tuples, prinf=True): """project abs values""" if isinstance(list_of_tuples, list): tdict = {} for a, b in list_of_tuples: tdict[a] = b list_of_tuples = tdict for subcorpus, projection_value in list(list_of_tuples.items()): if isinstance(subcorpus, int): subcorpus = str(subcorpus) df.ix[subcorpus] = df.ix[subcorpus] * projection_value if prinf: if isinstance(projection_value, float): print('Projection: %s * %s' % (subcorpus, projection_value)) if isinstance(projection_value, int): print('Projection: %s * %d' % (subcorpus, projection_value)) if prinf: print('') return df def do_stats(df): """do linregress and add to df""" try: from scipy.stats import linregress except ImportError: thetime = strftime("%H:%M:%S", localtime()) print('%s: sort type not available in this verion of corpkit.' 
% thetime) return False indices = list(df.index) first_year = list(df.index)[0] try: x = [int(y) - int(first_year) for y in indices] except ValueError: x = list(range(len(indices))) statfields = ['slope', 'intercept', 'r', 'p', 'stderr'] stats = [] if isinstance(df, Series): y = list(df.values) sl = Series(list(linregress(x, y)), index=statfields) else: for entry in list(df.columns): y = list(df[entry]) stats.append(list(linregress(x, y))) sl = DataFrame(zip(*stats), index=statfields, columns=list(df.columns)) df = df.append(sl) # drop infinites and nans df = df.replace([np.inf, -np.inf], np.nan) df = df.fillna(0.0) return df def resort(df, sort_by = False, keep_stats = False): """sort results, potentially using scipy's linregress""" # translate options and make sure they are parseable stat_field = ['slope', 'intercept', 'r', 'p', 'stderr'] easy_sorts = ['total', 'infreq', 'name', 'most', 'least'] stat_sorts = ['increase', 'decrease', 'static', 'turbulent'] options = stat_field + easy_sorts + stat_sorts sort_by_convert = {'most': 'total', True: 'total', 'least': 'infreq'} sort_by = sort_by_convert.get(sort_by, sort_by) # probably broken :( if just_totals: if sort_by == 'name': return df.sort_index() else: return df.sort_values(by='Combined total', ascending=sort_by != 'total', axis=1) stats_done = False if keep_stats or sort_by in stat_field + stat_sorts: df = do_stats(df) stats_done = True if isinstance(df, bool): if df is False: return False if isinstance(df, Series): if stats_done: stats = df.ix[range(-5, 0)] df = df.drop(list(stats.index)) if sort_by == 'name': df = df.sort_index() else: df = df.sort_values(ascending=sort_by != 'total') if stats_done: df = df.append(stats) return df if sort_by == 'name': # currently case sensitive df = df.reindex_axis(sorted(df.columns), axis=1) elif sort_by in ['total', 'infreq']: if df1_istotals: df = df.T df = df[list(df.sum().sort_values(ascending=sort_by != 'total').index)] # sort by slope etc., or search by subcorpus name if sort_by in stat_field or sort_by not in options: asc = kwargs.get('reverse', False) df = df.T.sort_values(by=sort_by, ascending=asc).T if sort_by in ['increase', 'decrease', 'static', 'turbulent']: slopes = df.ix['slope'] if sort_by == 'increase': df = df[slopes.argsort()[::-1]] elif sort_by == 'decrease': df = df[slopes.argsort()] elif sort_by == 'static': df = df[slopes.abs().argsort()] elif sort_by == 'turbulent': df = df[slopes.abs().argsort()[::-1]] if remove_above_p: df = df.T df = df[df['p'] <= p] df = df.T # remove stats field by default if not keep_stats: df = df.drop(stat_field, axis=0, errors='ignore') return df def set_threshold(big_list, threshold, prinf=True): if isinstance(threshold, STRINGTYPE): if threshold.startswith('l'): denominator = 10000 if threshold.startswith('m'): denominator = 5000 if threshold.startswith('h'): denominator = 2500 if isinstance(big_list, DataFrame): tot = big_list.sum().sum() if isinstance(big_list, Series): tot = big_list.sum() tshld = float(tot) / float(denominator) else: tshld = threshold if prinf: print('Threshold: %d\n' % tshld) return tshld # copy dataframe to be very safe df = dataframe1.copy() # make cols into strings try: df.columns = [str(c) for c in list(df.columns)] except: pass if operation is None: operation = 'None' if isinstance(interrogation, Concordance): return_conc = True # do concordance work if return_conc: if just_entries: if isinstance(just_entries, int): just_entries = [just_entries] if isinstance(just_entries, STRINGTYPE): df = 
df[df['m'].str.contains(just_entries)] if isinstance(just_entries, list): if all(isinstance(e, STRINGTYPE) for e in just_entries): mp = df['m'].map(lambda x: x in just_entries) df = df[mp] else: df = df.ix[just_entries] if skip_entries: if isinstance(skip_entries, int): skip_entries = [skip_entries] if isinstance(skip_entries, STRINGTYPE): df = df[~df['m'].str.contains(skip_entries)] if isinstance(skip_entries, list): if all(isinstance(e, STRINGTYPE) for e in skip_entries): mp = df['m'].map(lambda x: x not in skip_entries) df = df[mp] else: df = df.drop(skip_entries, axis=0) if just_subcorpora: if isinstance(just_subcorpora, int): just_subcorpora = [just_subcorpora] if isinstance(just_subcorpora, STRINGTYPE): df = df[df['c'].str.contains(just_subcorpora)] if isinstance(just_subcorpora, list): if all(isinstance(e, STRINGTYPE) for e in just_subcorpora): mp = df['c'].map(lambda x: x in just_subcorpora) df = df[mp] else: df = df.ix[just_subcorpora] if skip_subcorpora: if isinstance(skip_subcorpora, int): skip_subcorpora = [skip_subcorpora] if isinstance(skip_subcorpora, STRINGTYPE): df = df[~df['c'].str.contains(skip_subcorpora)] if isinstance(skip_subcorpora, list): if all(isinstance(e, STRINGTYPE) for e in skip_subcorpora): mp = df['c'].map(lambda x: x not in skip_subcorpora) df = df[mp] else: df = df.drop(skip_subcorpora, axis=0) return Concordance(df) if print_info: print('\n***Processing results***\n========================\n') df1_istotals = False if isinstance(df, Series): df1_istotals = True df = DataFrame(df) # if just a single result else: df = DataFrame(df) if operation.startswith('k'): if sort_by is False: if not df1_istotals: sort_by = 'turbulent' if df1_istotals: df = df.T # figure out if there's a second list # copy and remove totals if there is single_totals = True using_totals = False outputmode = False if denominator.__class__ == Interrogation: try: denominator = denominator.results except AttributeError: denominator = denominator.totals if denominator is not False and not isinstance(denominator, STRINGTYPE): df2 = denominator.copy() using_totals = True if isinstance(df2, DataFrame): if len(df2.columns) > 1: single_totals = False else: df2 = Series(df2) elif isinstance(df2, Series): single_totals = True #if operation == 'k': #raise ValueError('Keywording requires a DataFrame for denominator. Use "self"?') else: if operation in ['k', 'a', '%', '/', '*', '-', '+']: denominator = 'self' if denominator == 'self': outputmode = True if operation.startswith('a') or operation.startswith('A'): if list(df.columns)[0] != '0' and list(df.columns)[0] != 0: df = df.T if using_totals: if not single_totals: df2 = df2.T if projection: # projection shouldn't do anything when working with '%', remember. 
df = projector(df, projection) if using_totals: df2 = projector(df2, projection) if spelling: df = convert_spell(df, convert_to=spelling) df = merge_duplicates(df, print_info=False) if not single_totals: df2 = convert_spell(df2, convert_to=spelling, print_info=False) df2 = merge_duplicates(df2, print_info=False) if not df1_istotals: sort_by = 'total' if replace_names: df = name_replacer(df, replace_names) df = merge_duplicates(df) if not single_totals: df2 = name_replacer(df2, replace_names, print_info=False) df2 = merge_duplicates(df2, print_info=False) if not sort_by: sort_by = 'total' if replace_subcorpus_names: df = name_replacer(df.T, replace_subcorpus_names) df = merge_duplicates(df).T df = df.sort_index() if not single_totals: if isinstance(df2, DataFrame): df2 = df2.T df2 = name_replacer(df2, replace_subcorpus_names, print_info=False) df2 = merge_duplicates(df2, print_info=False) if isinstance(df2, DataFrame): df2 = df2.T df2 = df2.sort_index() if not sort_by: sort_by = 'total' # remove old stats if they're there: statfields = ['slope', 'intercept', 'r', 'p', 'stderr'] try: df = df.drop(statfields, axis=0) except: pass if using_totals: try: df2 = df2.drop(statfields, axis=0) except: pass # remove totals and tkinter order for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]): if name == 'Total' and df1_istotals: continue try: df = df.drop(name, axis=ax, errors='ignore') except: pass for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]): if name == 'Total' and single_totals: continue try: df2 = df2.drop(name, axis=ax, errors='ignore') except: pass # merging: make dicts if they aren't already, so we can iterate if merge_entries: if not isinstance(merge_entries, list): if isinstance(merge_entries, STRINGTYPE): merge_entries = {'combine': merge_entries} # for newname, criteria for name, the_input in sorted(merge_entries.items()): pin = parse_input(df, the_input) the_newname = newname_getter(df, pin, newname=name, prinf=print_info) df = merge_these_entries(df, pin, the_newname, prinf=print_info) if not single_totals: pin2 = parse_input(df2, the_input) df2 = merge_these_entries(df2, pin2, the_newname, prinf=False) else: for i in merge_entries: pin = parse_input(df, merge_entries) the_newname = newname_getter(df, pin, prinf=print_info) df = merge_these_entries(df, pin, the_newname, prinf=print_info) if not single_totals: pin2 = parse_input(df2, merge_entries) df2 = merge_these_entries(df2, pin2, the_newname, prinf=False) if merge_subcorpora: if not isinstance(merge_subcorpora, dict): if isinstance(merge_subcorpora, list): if isinstance(merge_subcorpora[0], tuple): merge_subcorpora = {x: y for x, y in merge_subcorpora} elif isinstance(merge_subcorpora[0], STRINGTYPE): merge_subcorpora = {'combine': [x for x in merge_subcorpora]} elif isinstance(merge_subcorpora[0], int): merge_subcorpora = {'combine': [str(x) for x in merge_subcorpora]} else: merge_subcorpora = {'combine': merge_subcorpora} for name, the_input in sorted(merge_subcorpora.items()): pin = parse_input(df.T, the_input) the_newname = newname_getter(df.T, pin, newname=name, \ merging_subcorpora=True, prinf=print_info) df = merge_these_entries(df.T, pin, the_newname, merging='subcorpora', prinf=print_info).T if using_totals: pin2 = parse_input(df2.T, the_input) df2 = merge_these_entries(df2.T, pin2, the_newname, merging='subcorpora', prinf=False).T if just_subcorpora: df = just_these_subcorpora(df, just_subcorpora, prinf=print_info) if using_totals: df2 = just_these_subcorpora(df2, 
just_subcorpora, prinf=False) if skip_subcorpora: df = skip_these_subcorpora(df, skip_subcorpora, prinf=print_info) if using_totals: df2 = skip_these_subcorpora(df2, skip_subcorpora, prinf=False) if span_subcorpora: df = span_these_subcorpora(df, span_subcorpora, prinf=print_info) if using_totals: df2 = span_these_subcorpora(df2, span_subcorpora, prinf=False) if just_entries: df = just_these_entries(df, parse_input(df, just_entries), prinf=print_info) if not single_totals: df2 = just_these_entries(df2, parse_input(df2, just_entries), prinf=False) if skip_entries: df = skip_these_entries(df, parse_input(df, skip_entries), prinf=print_info) if not single_totals: df2 = skip_these_entries(df2, parse_input(df2, skip_entries), prinf=False) # drop infinites and nans df = df.replace([np.inf, -np.inf], np.nan) df = df.fillna(0.0) if just_totals: df = DataFrame(df.sum(), columns=['Combined total']) if using_totals: if not single_totals: df2 = DataFrame(df2.sum(), columns=['Combined total']) else: df2 = df2.sum() tots = df.sum(axis=1) if using_totals or outputmode: if not operation.startswith('k'): tshld = 0 # set a threshold if just_totals if outputmode is True: df2 = df.T.sum() if not just_totals: df2.name = 'Total' else: df2.name = 'Combined total' using_totals = True single_totals = True if just_totals: if not single_totals: tshld = set_threshold(df2, threshold, prinf=print_info) df, tots = combiney(df, df2, operation=operation, threshold=tshld, prinf=print_info) # if doing keywording... if operation.startswith('k'): if isinstance(denominator, STRINGTYPE): if denominator == 'self': df2 = df.copy() else: df2 = denominator from corpkit.keys import keywords df = keywords(df, df2, selfdrop=selfdrop, threshold=threshold, print_info=print_info, editing=True, calc_all=calc_all, sort_by=sort_by, measure=keyword_measure, **kwargs) # drop infinites and nans df = df.replace([np.inf, -np.inf], np.nan) df = df.fillna(0.0) # resort data if sort_by or keep_stats: df = resort(df, keep_stats=keep_stats, sort_by=sort_by) if isinstance(df, bool): if df is False: return 'linregress' if keep_top: if not just_totals: df = df[list(df.columns)[:keep_top]] else: df = df.head(keep_top) if just_totals: # turn just_totals into series: df = Series(df['Combined total'], name='Combined total') if df1_istotals: if operation.startswith('k'): try: df = Series(df.ix[dataframe1.name]) df.name = '%s: keyness' % df.name except: df = df.iloc[0, :] df.name = 'keyness' % df.name # generate totals branch if not percentage results: # fix me if df1_istotals or operation.startswith('k'): if not just_totals: try: total = Series(df['Total'], name='Total') except: total = 'none' pass #total = df.copy() else: total = 'none' else: # might be wrong if using division or something... 
try: total = df.T.sum(axis=1) except: total = 'none' if not isinstance(tots, DataFrame) and not isinstance(tots, Series): total = df.sum(axis=1) else: total = tots if isinstance(df, DataFrame): datatype = df.iloc[0].dtype else: datatype = df.dtype locs['datatype'] = datatype # TURN INT COL NAMES INTO STR try: df.results.columns = [str(d) for d in list(df.results.columns)] except: pass def add_tkt_index(df): """add an order for tkintertable if using gui""" if isinstance(df, Series): df = df.T df = df.drop('tkintertable-order', errors='ignore', axis=0) df = df.drop('tkintertable-order', errors='ignore', axis=1) dat = [i for i in range(len(df.index))] df['tkintertable-order'] = Series(dat, index=list(df.index)) df = df.T return df # while tkintertable can't sort rows if checkstack('tkinter'): df = add_tkt_index(df) if kwargs.get('df1_always_df'): if isinstance(df, Series): df = DataFrame(df) # delete non-appearing conc lines if not hasattr(interrogation, 'concordance'): lns = None elif hasattr(interrogation, 'concordance') and interrogation.concordance is None: lns = None else: col_crit = interrogation.concordance['m'].map(lambda x: x in list(df.columns)) ind_crit = interrogation.concordance['c'].map(lambda x: x in list(df.index)) lns = interrogation.concordance[col_crit] lns = lns.loc[ind_crit] lns = Concordance(lns) output = Interrogation(results=df, totals=total, query=locs, concordance=lns) if print_info: print('***Done!***\n========================\n') return output
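# Hedged sketch (illustrative names, not corpkit API): when a single totals
# Series is used as the denominator, the '%' operation applied by combiney()
# above is a row-wise division of results by totals, scaled to percentages.
import pandas as pd

results = pd.DataFrame({'cat': [2, 4], 'dog': [8, 16]}, index=['1990', '2000'])
totals = pd.Series([10, 20], index=['1990', '2000'])
relative = results.mul(100.0).div(totals, axis=0)
# relative.loc['1990', 'cat'] -> 20.0, i.e. 2 of 10 tokens in the 1990 subcorpus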
def _z_test_word_list(word_count_series_one: pd.Series,
                      word_count_series_two: pd.Series) -> pd.Series:
    """Run a z-test on all the words of two input word lists.

    :param word_count_series_one: a pandas Series where:
        - the data is the word counts.
        - the index is the corresponding words.
        - the name depends on what the input is: if a file is given, the name
          is the string "File" plus the actual file name; if a class is given,
          the name is the string "class" plus the actual class name.
    :param word_count_series_two: a pandas Series where:
        - the data is the word counts.
        - the index is the corresponding words.
        - the name depends on what the input is: if a file is given, the name
          is the string "File" plus the actual file name; if a class is given,
          the name is the string "class" plus the actual class name.
    :return: a pandas Series where:
        - the data is the z-scores.
        - the index is the corresponding words.
        - the name is a readable header for the analysis result.
    """
    # Find the sample population of each input data set.
    total_word_count_one = word_count_series_one.sum()
    total_word_count_two = word_count_series_two.sum()

    # Join the two input pandas Series together to avoid making the assumption
    # that they are parallel arrays in later analysis.
    joined_data_frame = word_count_series_one.to_frame().join(
        word_count_series_two.to_frame())

    # Perform the z-test to detect word anomalies.
    # We are using a dict instead of a pandas Series here, because this method
    # requires 'full_word_score_dict' to be sorted by the absolute value
    # of the z-scores (the 'value' of the dictionary).
    # For code clarity we use this as a temporary solution, but in future we
    # can implement the 'sort_by' function for Series in our general
    # functions if we need it for better performance.
    full_word_score_dict = \
        {word: TopwordModel._z_test(p1=count1 / total_word_count_one,
                                    p2=count2 / total_word_count_two,
                                    n1=total_word_count_one,
                                    n2=total_word_count_two)
         for word, [count1, count2] in joined_data_frame.iterrows()}

    # Filter out the insignificant results.
    sig_word_score_dict = \
        {word: z_score for word, z_score in full_word_score_dict.items()
         if abs(z_score) >= 1.96}

    # Sort 'sig_word_score_dict' by the absolute value of the z-scores in
    # descending order.
    sorted_dict = OrderedDict(sorted(sig_word_score_dict.items(),
                                     key=lambda item: abs(item[1]),
                                     reverse=True))

    # Convert the sorted result to a pandas Series.
    result_series = pd.Series(sorted_dict)

    # Set the result Series name.
    result_series.name = f"{word_count_series_one.name} compares to " \
                         f"{word_count_series_two.name}"

    return result_series
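# Hedged sketch: TopwordModel._z_test itself is not shown in this file. The
# 1.96 cut-off above is consistent with a standard two-proportion z-test at the
# 5% (two-tailed) significance level, which would look roughly like the helper
# below; treat it as an illustration, not the project's actual implementation.
import math

def two_proportion_z_test(p1: float, p2: float, n1: int, n2: int) -> float:
    """z-score for the difference between two sample proportions."""
    pooled = (p1 * n1 + p2 * n2) / (n1 + n2)
    std_error = math.sqrt(pooled * (1 - pooled) * (1 / n1 + 1 / n2))
    return 0.0 if std_error == 0 else (p1 - p2) / std_error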
def source_data(self): st_date = self.stTrain # st_date = '2014-10-1' stD = date(int(st_date.split('-')[0]), int(st_date.split('-')[1]), int(st_date.split('-')[2])) if self.view and stD < datetime.datetime.strptime('2015-4-1',"%Y-%m-%d").date(): raise RuntimeError('I know it sucks but we dont have view-count data for anytime before 2015-4-1!') if self.view: db_red = psycopg2.connect(host="***", database="***", port="***", user="******", password="******") db_red.autocommit = True df_red = pd.read_sql('''select date,sum(installs) as install, sum(pageviewcount) as view from appstoredata_itunes_metrics where game='***' and country='%s' group by date;''' % pycountry.countries.get(alpha2=self.target).name, con=db_red) df_red['date'] = pd.to_datetime(df_red['date']) ts_view_target1 = Series(df_red.view.tolist(), index=df_red.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0) ts_install_target1 = Series(df_red.install.tolist(), index=df_red.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0) if len(ts_view_target1) < (self.endP-stD).days : ts_view_target1[pd.to_datetime(st_date)] = 0 ts_view_target1 = ts_view_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0) ts_install_target1[pd.to_datetime(st_date)] = 0 ts_install_target1 = ts_install_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0) ts_view_target = (ts_view_target1)/(ts_view_target1.sum()) ts_install_target = (ts_install_target1)/(ts_install_target1.sum()) else: ts_view_target = [] ts_view_target1 = [] ts_install_target = [] ts_install_target1 = [] db = MySQLdb.connect( host = '***', user = '******', passwd = '***', db = '***', port = '***') df_mysql = pd.read_sql('''select metrics_daily.date as date, dim_country.name as country, sum(metrics_daily.value) as value, dim_channel.channel_type as type from metrics_daily left join dim_channel on dim_channel.id = metrics_daily.channel_id left join dim_country on dim_country.id = metrics_daily.country_id where project_id=195 and metrics_daily.platform_id=2 and metric_id in (5) group by date, type, country;''', con=db) df_mysql['date'] = pd.to_datetime(df_mysql['date']) all_data_target = df_mysql[df_mysql.country==self.target] org_data_target = df_mysql[(df_mysql.type=='ORGANIC') & (df_mysql.country==self.target)] ts_org_target1 = Series(org_data_target.value.tolist(), index=org_data_target.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0) ts_all_target1 = Series(all_data_target.value.tolist(), index=all_data_target.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0) ts_org_target = (ts_org_target1)/(ts_org_target1.sum()) ts_all_target = (ts_all_target1)/(ts_all_target1.sum()) if self.baseorg: org_data_base = df_mysql[(df_mysql.type=='ORGANIC') & (df_mysql.country==self.baseline)] ts_org_base1 = Series(org_data_base.value.tolist(), index=org_data_base.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0) ts_org_base = (ts_org_base1-ts_org_base1.min())/(ts_org_base1.max()-ts_org_base1.min()) else: ts_org_base = [] ts_org_base1 = [] if self.paid: paid_data_target = df_mysql[(df_mysql.type=='PAID') & (df_mysql.country==self.target)] ts_paid_target1 = Series(paid_data_target.value.tolist(), index=paid_data_target.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0) if len(ts_paid_target1) < (self.endP-stD).days : ts_paid_target1[pd.to_datetime(st_date)] = 0 ts_paid_target1 = ts_paid_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0) 
ts_paid_target = (ts_paid_target1)/(ts_paid_target1.sum()) else: ts_paid_target = [] ts_paid_target1 = [] if self.rank: df_rank = pd.read_sql('''select date, max(1/sqrt(rank)) as bestRank from kabam_ranks_data_free where country='%s' and device!='android'and game='***' and category='Overall' group by date;''' % self.target, con=db) df_rank['date'] = pd.to_datetime(df_rank['date']) ts_rank_target1 = Series(df_rank.bestRank.tolist(), index=df_rank.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0) if len(ts_rank_target1) < (self.endP-stD).days : ts_rank_target1[pd.to_datetime(st_date)] = 0 ts_rank_target1 = ts_rank_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0) ts_rank_target = (ts_rank_target1)/(ts_rank_target1.sum()) else: ts_rank_target = [] ts_rank_target1 = [] # endog = ts_org_target1 # endog = ts_install_target endog = ts_all_target1 Tlist = [self.paid, self.baseorg, self.view, self.rank] dff = DataFrame() tList = [ts_paid_target, ts_org_base, ts_view_target, ts_rank_target] tlist = ['paid', 'base', 'view', 'rank'] for i in xrange(0,len(Tlist)): if Tlist[i]: dff[tlist[i]] = tList[i] if dff.empty: raise RuntimeError('Where is your exog variable? Do you need a coffee or something?!') exog = dff return (endog, exog)
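# Note (hedged): the resample('D', how='sum') calls above use an older pandas
# signature that newer releases no longer accept; the modern spelling is
# .resample('D').sum(). A minimal sketch of the resample-then-normalise step
# repeated throughout source_data (names here are illustrative only):
import pandas as pd

def daily_share(values, dates, start, end):
    """Daily sums over [start, end], normalised so the series sums to 1."""
    ts = pd.Series(values, index=pd.to_datetime(dates)).resample('D').sum()
    ts = ts[start:end].fillna(0)
    return ts / ts.sum()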
k_list.append(float(new_list[i-1][1]) / float(new_list[i][0]))
k_avg = k_sum / size
# consu_ser = Series(data=k_list, index=range(len(k_list)))
consu_ser = Series(data=k_list, index=range(len(k_list)))
# print consu_ser.describe()
mean = consu_ser.mean()
std_dev = consu_ser.std()
# keep only the values that fall within one standard deviation of the mean
modified_list = list()
for i in range(len(k_list)):
    if (k_list[i] < mean + std_dev) and (k_list[i] > mean - std_dev):
        modified_list.append(k_list[i])
plt.hist(modified_list)
plt.show()
consu_ser_mod = Series(data=modified_list, index=range(len(modified_list)))
# recompute the average from the filtered values
k_avg = consu_ser_mod.sum() / len(modified_list)
thresh_sum = 0
for i in range(size):
    if i == 0:
        continue
    thresh_sum = thresh_sum + float(new_list[i-1][1]) - float(new_list[i][0]) * k_avg
thresh_avg = thresh_sum / size
print "k_avg: ", k_avg
print "thresh_avg: ", thresh_avg
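# Hedged sketch: the filtering above is a one-standard-deviation outlier cut
# followed by a recomputed mean. The same idea can be expressed directly on a
# pandas Series (note that between() is inclusive at the bounds, whereas the
# loop above uses strict comparisons):
from pandas import Series

def mean_within_one_std(values):
    ser = Series(values)
    mean, std = ser.mean(), ser.std()
    kept = ser[ser.between(mean - std, mean + std)]
    return kept.mean()

# k_avg above corresponds roughly to mean_within_one_std(k_list)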
def interrogator(corpus, search, query = 'any', show = 'w', exclude = False, excludemode = 'any', searchmode = 'all', dep_type = 'collapsed-ccprocessed-dependencies', case_sensitive = False, quicksave = False, just_speakers = False, preserve_case = False, lemmatag = False, files_as_subcorpora = False, only_unique = False, random = False, only_format_match = False, multiprocess = False, spelling = False, regex_nonword_filter = r'[A-Za-z0-9:_]', gramsize = 2, split_contractions = False, do_concordancing = False, maxconc = 9999, **kwargs): """interrogate corpus, corpora, subcorpus and file objects see corpkit.interrogation.interrogate() for docstring""" only_conc = False no_conc = False if do_concordancing is False: no_conc = True if type(do_concordancing) == str and do_concordancing.lower() == 'only': only_conc = True no_conc = False # iteratively count conc lines numconc = 0 # store kwargs locs = locals() if kwargs: for k, v in kwargs.items(): locs[k] = v locs.pop('kwargs', None) import corpkit from interrogation import Interrogation from process import tregex_engine import pandas as pd from pandas import DataFrame, Series from collections import Counter from other import as_regex from process import get_deps from time import localtime, strftime from textprogressbar import TextProgressBar from process import animator from dictionaries.word_transforms import wordlist, taglemma import corenlp_xml import codecs import signal original_sigint = signal.getsignal(signal.SIGINT) if kwargs.get('paralleling', None) is None: original_sigint = signal.getsignal(signal.SIGINT) def signal_handler(signal, frame): """pause on ctrl+c, rather than just stop loop""" import signal import sys from time import localtime, strftime signal.signal(signal.SIGINT, original_sigint) thetime = strftime("%H:%M:%S", localtime()) try: sel = raw_input('\n\n%s: Paused. Press any key to resume, or ctrl+c to quit.\n' % thetime) except NameError: sel = input('\n\n%s: Paused. 
Press any key to resume, or ctrl+c to quit.\n' % thetime) time = strftime("%H:%M:%S", localtime()) print('%s: Interrogation resumed.\n' % time) signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGINT, signal_handler) # find out if using gui root = kwargs.get('root') note = kwargs.get('note') # convert path to corpus object if type(corpus) == str: from corpus import Corpus corpus = Corpus(corpus) # figure out how the user has entered the query and normalise from process import searchfixer search, search_iterable = searchfixer(search, query) # for better printing of query, esp during multiprocess # can remove if multiprocess printing improved if len(list(search.keys())) == 1: query = list(search.values())[0] if 'l' in show and search.get('t'): from nltk.stem.wordnet import WordNetLemmatizer lmtzr=WordNetLemmatizer() if type(show) == str: show = [show] def is_multiquery(corpus, search, query, just_speakers): """determine if multiprocessing is needed do some retyping if need be as well""" im = False from collections import OrderedDict if hasattr(corpus, '__iter__'): im = True # so we can do search = 't', query = ['NP', 'VP']: if type(query) == list: if query != list(search.values())[0] or len(list(search.keys())) > 1: query = {c.title(): c for c in query} if type(query) == dict or type(query) == OrderedDict: im = True if just_speakers: if just_speakers == 'each': im = True just_speakers = ['each'] if just_speakers == ['each']: im = True if type(just_speakers) == str: im = False just_speakers = [just_speakers] if type(just_speakers) == list: if len(just_speakers) > 1: im = True if type(search) == dict: if all(type(i) == dict for i in list(search.values())): im = True return im, corpus, search, query, just_speakers def slow_tregex(sents, **dummy_args): """do the speaker-specific version of tregex queries""" speakr = dummy_args.get('speaker', False) import os from process import tregex_engine # first, put the relevant trees into temp file if kwargs.get('outname'): to_open = 'tmp-%s.txt' % kwargs['outname'] else: to_open = 'tmp.txt' to_write = '\n'.join([sent._parse_string.strip() for sent in sents \ if sent.parse_string is not None]) to_write.encode('utf-8', errors = 'ignore') with open(to_open, "w") as fo: encd = to_write.encode('utf-8', errors = 'ignore') + '\n' fo.write(encd) q = list(search.values())[0] ops = ['-o', '-%s' % translated_option] concs = [] res = tregex_engine(query = q, options = ops, corpus = to_open, root = root, preserve_case = True) if not no_conc: ops += ['-w', '-f'] whole_res = tregex_engine(query = q, options = ops, corpus = to_open, root = root, preserve_case = True) res = format_tregex(res) whole_res = format_tregex(whole_res, whole = True) concs = make_conc_lines_from_whole_mid(whole_res, res, speakr) if root: root.update() try: os.remove(to_open) except OSError: pass if countmode: return(len(res)) else: return res, concs def get_stats(sents, **dummy_args): """get a bunch of frequencies on interpersonal phenomena""" import os import re from collections import Counter statsmode_results = Counter() # first, put the relevant trees into temp file if kwargs.get('outname'): to_open = 'tmp-%s.txt' % kwargs['outname'] else: to_open = 'tmp.txt' with open(to_open, "w") as fo: for sent in sents: statsmode_results['Sentences'] += 1 sts = sent.parse_string.rstrip() encd = sts.encode('utf-8', errors = 'ignore') + '\n' fo.write(encd) deps = get_deps(sent, dep_type) numpass = len([x for x in deps.links if x.type.endswith('pass')]) statsmode_results['Passives'] += 
numpass statsmode_results['Tokens'] += len(sent.tokens) words = [w.word for w in sent.tokens if w.word.isalnum()] statsmode_results['Words'] += len(words) statsmode_results['Characters'] += len(''.join(words)) # count moods via trees (/\?/ !< __) from dictionaries.process_types import processes from other import as_regex tregex_qs = {'Imperative': r'ROOT < (/(S|SBAR)/ < (VP !< VBD !< VBG !$ NP !$ SBAR < NP !$-- S !$-- VP !$ VP)) !<< (/\?/ !< __) !<<- /-R.B-/ !<<, /(?i)^(-l.b-|hi|hey|hello|oh|wow|thank|thankyou|thanks|welcome)$/', 'Open interrogative': r'ROOT < SBARQ <<- (/\?/ !< __)', 'Closed interrogative': r'ROOT ( < (SQ < (NP $+ VP)) << (/\?/ !< __) | < (/(S|SBAR)/ < (VP $+ NP)) <<- (/\?/ !< __))', 'Unmodalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP !< MD)))', 'Modalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP < MD)))', 'Open class words': r'/^(NN|JJ|VB|RB)/ < __', 'Closed class words': r'__ !< __ !> /^(NN|JJ|VB|RB)/', 'Clauses': r'/^S/ < __', 'Interrogative': r'ROOT << (/\?/ !< __)', 'Mental processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.mental, boundaries = 'w'), 'Verbal processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.verbal, boundaries = 'w'), 'Relational processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.relational, boundaries = 'w') } for name, q in sorted(tregex_qs.items()): res = tregex_engine(query = q, options = ['-o', '-C'], corpus = to_open, root = root) statsmode_results[name] += int(res) global numdone numdone += 1 if root: root.update() else: tot_string = str(numdone + 1) + '/' + str(total_files) if kwargs.get('outname'): tot_string = '%s: %s' % (kwargs['outname'], tot_string) animator(p, numdone, tot_string, **par_args) if kwargs.get('note', False): kwargs['note'].progvar.set((numdone * 100.0 / total_files / denom) + startnum) os.remove(to_open) return statsmode_results, [] def make_conc_lines_from_whole_mid(wholes, middle_column_result, speakr = False): import re, os if speakr is False: speakr = '' conc_lines = [] # remove duplicates from results unique_wholes = [] unique_middle_column_result = [] duplicates = [] for index, ((f, whole), mid) in enumerate(zip(wholes, middle_column_result)): if '-join-'.join([f, whole, mid]) not in duplicates: duplicates.append('-join-'.join([f, whole, mid])) unique_wholes.append([f, whole]) unique_middle_column_result.append(mid) # split into start, middle and end, dealing with multiple occurrences for index, ((f, whole), mid) in enumerate(zip(unique_wholes, unique_middle_column_result)): reg = re.compile(r'([^a-zA-Z0-9-]|^)(' + re.escape(mid) + r')([^a-zA-Z0-9-]|$)', re.IGNORECASE | re.UNICODE) offsets = [(m.start(), m.end()) for m in re.finditer(reg,whole)] for offstart, offend in offsets: start, middle, end = whole[0:offstart].strip(), whole[offstart:offend].strip(), whole[offend:].strip() conc_lines.append([os.path.basename(f), speakr, start, middle, end]) return conc_lines def uniquify(conc_lines): from collections import OrderedDict unique_lines = [] checking = [] for index, (f, speakr, start, middle, end) in enumerate(conc_lines): joined = ' '.join([speakr, start, 'MIDDLEHERE:', middle, ':MIDDLEHERE', end]) if joined not in checking: unique_lines.append(conc_lines[index]) checking.append(joined) return unique_lines def lemmatiser(list_of_words, tag): """take a list of unicode words and a tag and return a lemmatised list.""" output = [] for word in list_of_words: if translated_option.startswith('u'): if word.lower() in 
list(taglemma.keys()): word = taglemma[word.lower()] else: if word == 'x': word = 'Other' # only use wordnet lemmatiser when appropriate else: if word in wordlist: word = wordlist[word] word = lmtzr.lemmatize(word, tag) output.append(word) return output def gettag(query, lemmatag = False): """ Find tag for WordNet lemmatisation """ import re tagdict = {'N': 'n', 'A': 'a', 'V': 'v', 'A': 'r', 'None': False, '': False, 'Off': False} if lemmatag is False: tag = 'n' # same default as wordnet # attempt to find tag from tregex query tagfinder = re.compile(r'^[^A-Za-z]*([A-Za-z]*)') tagchecker = re.compile(r'^[A-Z]{1,4}$') qr = query.replace(r'\w', '').replace(r'\s', '').replace(r'\b', '') treebank_tag = re.findall(tagfinder, qr) if re.match(tagchecker, treebank_tag[0]): tag = tagdict.get(treebank_tag[0], 'n') elif lemmatag: tag = lemmatag return tag def format_tregex(results, whole = False): """format tregex by show list""" if countmode: return results import re done = [] if whole: fnames = [x for x, y in results] results = [y for x, y in results] if 'l' in show or 'pl' in show: lemmata = lemmatiser(results, gettag(search.get('t'), lemmatag)) else: lemmata = [None for i in results] for word, lemma in zip(results, lemmata): bits = [] if exclude and exclude.get('w'): if len(list(exclude.keys())) == 1 or excludemode == 'any': if re.search(exclude.get('w'), word): continue if len(list(exclude.keys())) == 1 or excludemode == 'any': if re.search(exclude.get('l'), lemma): continue if len(list(exclude.keys())) == 1 or excludemode == 'any': if re.search(exclude.get('p'), word): continue if len(list(exclude.keys())) == 1 or excludemode == 'any': if re.search(exclude.get('pl'), lemma): continue if exclude and excludemode == 'all': num_to_cause_exclude = len(list(exclude.keys())) current_num = 0 if exclude.get('w'): if re.search(exclude.get('w'), word): current_num += 1 if exclude.get('l'): if re.search(exclude.get('l'), lemma): current_num += 1 if exclude.get('p'): if re.search(exclude.get('p'), word): current_num += 1 if exclude.get('pl'): if re.search(exclude.get('pl'), lemma): current_num += 1 if current_num == num_to_cause_exclude: continue for i in show: if i == 't': bits.append(word) if i == 'l': bits.append(lemma) elif i == 'w': bits.append(word) elif i == 'p': bits.append(word) elif i == 'pl': bits.append(lemma) joined = '/'.join(bits) done.append(joined) if whole: done = zip(fnames, done) return done def tok_by_list(pattern, list_of_toks, concordancing = False, **kwargs): """search for regex in plaintext corpora""" import re if type(pattern) == str: pattern = [pattern] if not case_sensitive: pattern = [p.lower() for p in pattern] if not concordancing: if case_sensitive: matches = [m for m in list_of_toks if m in pattern] else: matches = [m for m in list_of_toks if m.lower() in pattern] else: matches = [] for index, token in enumerate(list_of_toks): if token in pattern: match = [' '.join([t for t in unsplitter(list_of_toks[:index])])[-140:]] match.append(token) match.append(' '.join([t for t in unsplitter(list_of_toks[index + 1:])])[:140]) matches.append(match) if countmode: return(len(matches)) else: return matches def unsplitter(lst): """unsplit contractions and apostophes from tokenised text""" if split_contractions: return lst unsplit = [] for index, t in enumerate(lst): if index == 0 or index == len(lst) - 1: unsplit.append(t) continue if "'" in t and not t.endswith("'"): rejoined = ''.join([lst[index - 1], t]) unsplit.append(rejoined) else: if not "'" in lst[index + 1]: unsplit.append(t) 
return unsplit def tok_ngrams(pattern, list_of_toks, concordancing = False, split_contractions = True): from collections import Counter import re ngrams = Counter() result = [] # if it's not a compiled regex list_of_toks = [x for x in list_of_toks if re.search(regex_nonword_filter, x)] if pattern.lower() == 'any': pattern = r'.*' if not split_contractions: list_of_toks = unsplitter(list_of_toks) #list_of_toks = [x for x in list_of_toks if "'" not in x] for index, w in enumerate(list_of_toks): try: the_gram = [list_of_toks[index+x] for x in range(gramsize)] if not any(re.search(pattern, x) for x in the_gram): continue ngrams[' '.join(the_gram)] += 1 except IndexError: pass # turn counter into list of results for k, v in list(ngrams.items()): if v > 1: for i in range(v): result.append(k) if countmode: return(len(result)) else: return result def compiler(pattern): """compile regex or fail gracefully""" import re try: if case_sensitive: comped = re.compile(pattern) else: comped = re.compile(pattern, re.IGNORECASE) return comped except: import traceback import sys from time import localtime, strftime exc_type, exc_value, exc_traceback = sys.exc_info() lst = traceback.format_exception(exc_type, exc_value, exc_traceback) error_message = lst[-1] thetime = strftime("%H:%M:%S", localtime()) print('%s: Query %s' % (thetime, error_message)) if root: return 'Bad query' else: raise ValueError('%s: Query %s' % (thetime, error_message)) def tok_by_reg(pattern, list_of_toks, concordancing = False, **kwargs): """search for regex in plaintext corpora""" import re comped = compiler(pattern) if comped == 'Bad query': return 'Bad query' if not concordancing: matches = [m for m in list_of_toks if re.search(comped, m)] else: matches = [] for index, token in enumerate(list_of_toks): if re.search(comped, token): match = [' '.join([t for t in unsplitter(list_of_toks[:index])])[-140:]] match.append(re.search(comped, token).group(0)) match.append(' '.join([t for t in unsplitter(list_of_toks[index + 1:])])[:140]) matches.append(match) if countmode: return(len(matches)) else: return matches def plaintext_regex_search(pattern, plaintext_data, concordancing = False, **kwargs): """search for regex in plaintext corpora it searches over lines, so the user needs to be careful. 
""" import re if concordancing: pattern = r'(.{,140})\b(' + pattern + r')\b(.{,140})' compiled_pattern = compiler(pattern) if compiled_pattern == 'Bad query': return 'Bad query' matches = re.findall(compiled_pattern, plaintext_data) if concordancing: matches = [list(m) for m in matches] if not concordancing: for index, i in enumerate(matches): if type(i) == tuple: matches[index] = i[0] if countmode: return(len(matches)) else: return matches def correct_spelling(a_string): if not spelling: return a_string from dictionaries.word_transforms import usa_convert if spelling.lower() == 'uk': usa_convert = {v: k for k, v in list(usa_convert.items())} spell_out = [] bits = a_string.split('/') for index, i in enumerate(bits): converted = usa_convert.get(i.lower(), i) if i.islower() or preserve_case is False: converted = converted.lower() elif i.isupper() and preserve_case: converted = converted.upper() elif i.istitle() and preserve_case: converted = converted.title() bits[index] = converted r = '/'.join(bits) return r def plaintext_simple_search(pattern, plaintext_data, concordancing = False, **kwargs): """search for tokens in plaintext corpora""" import re result = [] if type(pattern) == str: pattern = [pattern] for p in pattern: if concordancing: pat = r'(.{0,140})\b(' + re.escape(p) + r')\b(.{0,140})' pat = compiler(pat) if pat == 'Bad query': return 'Bad query' matches = re.findall(pat, plaintext_data) if concordancing: matches = [list(m) for m in matches] for i in matches: result.append(i) else: for m in range(len(matches)): result.append(p) return result # do multiprocessing if need be im, corpus, search, query, just_speakers = is_multiquery(corpus, search, query, just_speakers) locs['search'] = search locs['query'] = query locs['just_speakers'] = just_speakers locs['corpus'] = corpus locs['multiprocess'] = multiprocess if im: signal.signal(signal.SIGINT, original_sigint) from multiprocess import pmultiquery return pmultiquery(**locs) datatype = corpus.datatype singlefile = corpus.singlefile # store all results in here results = {} count_results = {} conc_results = {} # check if just counting countmode = 'c' in show if countmode: no_conc = True only_conc = False # where we are at in interrogation current_iter = 0 # multiprocessing progress bar denom = kwargs.get('denominator', 1) startnum = kwargs.get('startnum', 0) ############################################ # Determine the search function to be used # ############################################ # simple tregex is tregex over whole dirs simple_tregex_mode = False statsmode = False if not just_speakers and 't' in list(search.keys()): simple_tregex_mode = True else: if corpus.datatype == 'plaintext': if search.get('n'): raise NotImplementedError('Use a tokenised corpus for n-gramming.') #searcher = plaintext_ngram optiontext = 'n-grams via plaintext' if search.get('w'): if kwargs.get('regex', True): searcher = plaintext_regex_search else: searcher = plaintext_simple_search optiontext = 'Searching plaintext' elif corpus.datatype == 'tokens': if search.get('n'): searcher = tok_ngrams optiontext = 'n-grams via tokens' elif search.get('w'): if kwargs.get('regex', True): searcher = tok_by_reg else: searcher = tok_by_list if type(search.get('w')) == list: searcher = tok_by_list optiontext = 'Searching tokens' only_parse = ['r', 'd', 'g', 'dl', 'gl', 'df', 'gf', 'dp', 'gp', 'f', 'd2', 'd2f', 'd2p', 'd2l'] if corpus.datatype != 'parse' and any(i in only_parse for i in list(search.keys())): raise ValueError('Need parsed corpus to search with "%s" 
option(s).' % ', '.join([i for i in list(search.keys()) if i in only_parse])) elif corpus.datatype == 'parse': if search.get('t'): searcher = slow_tregex elif search.get('s'): searcher = get_stats statsmode = True optiontext = 'General statistics' global numdone numdone = 0 no_conc = True only_conc = False do_concordancing = False else: from depsearch import dep_searcher searcher = dep_searcher optiontext = 'Dependency querying' ############################################ # Set some Tregex-related values # ############################################ if search.get('t'): translated_option = 't' query = search.get('t') # check the query q = tregex_engine(corpus = False, query = search.get('t'), options = ['-t'], check_query = True, root = root) if query is False: if root: return 'Bad query' else: return optiontext = 'Searching parse trees' if 'p' in show or 'pl' in show: translated_option = 'u' if type(search['t']) == list: search['t'] = r'__ < (/%s/ !< __)' % as_regex(search['t'], boundaries = 'line', case_sensitive = case_sensitive) if search['t'] == 'any': search['t'] = r'__ < (/.?[A-Za-z0-9].?/ !< __)' elif 't' in show: translated_option = 'o' if type(search['t']) == list: search['t'] = r'__ < (/%s/ !< __)' % as_regex(search['t'], boundaries = 'line', case_sensitive = case_sensitive) if search['t'] == 'any': search['t'] = r'__ < (/.?[A-Za-z0-9].?/ !< __)' elif 'w' in show: translated_option = 't' if type(search['t']) == list: search['t'] = r'/%s/ !< __' % as_regex(search['t'], boundaries = 'line', case_sensitive = case_sensitive) if search['t'] == 'any': search['t'] = r'/.?[A-Za-z0-9].?/ !< __' elif 'c' in show: only_count = True translated_option = 'C' if type(search['t']) == list: search['t'] = r'/%s/ !< __' % as_regex(search['t'], boundaries = 'line', case_sensitive = case_sensitive) if search['t'] == 'any': search['t'] = r'/.?[A-Za-z0-9].?/ !< __' elif 'l' in show: translated_option = 't' if type(search['t']) == list: search['t'] = r'/%s/ !< __' % as_regex(search['t'], boundaries = 'line', case_sensitive = case_sensitive) if search['t'] == 'any': search['t'] = r'/.?[A-Za-z0-9].?/ !< __' query = search['t'] ############################################ # Make iterable for corpus/subcorpus/file # ############################################ if corpus.singlefile: to_iterate_over = {(corpus.name, corpus.path): [corpus]} elif not corpus.subcorpora: to_iterate_over = {(corpus.name, corpus.path): corpus.files} else: to_iterate_over = {} for subcorpus in corpus.subcorpora: to_iterate_over[(subcorpus.name, subcorpus.path)] = subcorpus.files #for k, v in sorted(corpus.structure.items(), key=lambda obj: obj[0].name): # to_iterate_over[(k.name, k.path)] = v if files_as_subcorpora: to_iterate_over = {} for f in corpus.files: to_iterate_over[(f.name, f.path)] = [f] ############################################ # Print welcome message # ############################################ if no_conc: message = 'Interrogating' else: message = 'Interrogating and concordancing' if kwargs.get('printstatus', True): thetime = strftime("%H:%M:%S", localtime()) sformat = '\n '.join(['%s: %s' % (k.rjust(3), v) for k, v in list(search.items())]) if search == {'s': r'.*'}: sformat = 'features' welcome = '\n%s: %s %s ...\n %s\n Query: %s\n %s corpus ... 
\n' % \ (thetime, message, corpus.name, optiontext, sformat, message) print(welcome) ############################################ # Make progress bar # ############################################ if simple_tregex_mode: total_files = len(list(to_iterate_over.keys())) else: if search.get('s'): total_files = sum([len(x) for x in list(to_iterate_over.values())]) * 12 else: total_files = sum([len(x) for x in list(to_iterate_over.values())]) par_args = {'printstatus': kwargs.get('printstatus', True), 'root': root, 'note': note, 'length': total_files, 'startnum': kwargs.get('startnum'), 'denom': kwargs.get('denominator', 1)} term = None if kwargs.get('paralleling', None) is not None: from blessings import Terminal term = Terminal() par_args['terminal'] = term par_args['linenum'] = kwargs.get('paralleling') outn = kwargs.get('outname', '') if outn: outn = outn + ': ' tstr = '%s%d/%d' % (outn, current_iter, total_files) p = animator(None, None, init = True, tot_string = tstr, **par_args) tstr = '%s%d/%d' % (outn, current_iter + 1, total_files) animator(p, current_iter, tstr, **par_args) ############################################ # Iterate over data, doing interrogations # ############################################ for (subcorpus_name, subcorpus_path), files in sorted(to_iterate_over.items()): conc_results[subcorpus_name] = [] count_results[subcorpus_name] = [] results[subcorpus_name] = Counter() # tregex over subcorpora, not files if simple_tregex_mode: op = ['-o', '-' + translated_option] result = tregex_engine(query = search['t'], options = op, corpus = subcorpus_path, root = root, preserve_case = preserve_case) if not countmode: result = format_tregex(result) if not no_conc: op += ['-w', '-f'] whole_result = tregex_engine(query = search['t'], options = op, corpus = subcorpus_path, root = root, preserve_case = preserve_case) if not only_format_match: whole_result = format_tregex(whole_result, whole = True) conc_result = make_conc_lines_from_whole_mid(whole_result, result, speakr = False) if countmode: count_results[subcorpus_name] += [result] else: result = Counter(result) results[subcorpus_name] += result if not no_conc: for lin in conc_result: if numconc < maxconc or not maxconc: conc_results[subcorpus_name].append(lin) numconc += 1 current_iter += 1 if kwargs.get('paralleling', None) is not None: tstr = '%s%d/%d' % (outn, current_iter + 1, total_files) else: tstr = '%s%d/%d' % (outn, current_iter + 1, total_files) animator(p, current_iter, tstr, **par_args) # dependencies, plaintext, tokens or slow_tregex else: for f in files: slow_treg_speaker_guess = kwargs.get('outname', False) if corpus.datatype == 'parse': with open(f.path, 'r') as data: data = data.read() from corenlp_xml.document import Document try: corenlp_xml = Document(data) except: print('Could not read file: %s' % f.path) continue if just_speakers: sents = [s for s in corenlp_xml.sentences if s.speakername in just_speakers] if len(just_speakers) == 1: slow_treg_speaker_guess = just_speakers[0] if not sents: continue else: sents = corenlp_xml.sentences res, conc_res = searcher(sents, search = search, show = show, dep_type = dep_type, exclude = exclude, excludemode = excludemode, searchmode = searchmode, lemmatise = False, case_sensitive = case_sensitive, do_concordancing = do_concordancing, only_format_match = only_format_match, speaker = slow_treg_speaker_guess) if res == 'Bad query': return 'Bad query' elif corpus.datatype == 'tokens': import pickle with codecs.open(f.path, "rb") as fo: data = pickle.load(fo) if not 
only_conc: res = searcher(list(search.values())[0], data, split_contractions = split_contractions, concordancing = False) if not no_conc: conc_res = searcher(list(search.values())[0], data, split_contractions = split_contractions, concordancing = True) if not no_conc: for index, line in enumerate(conc_res): line.insert(0, '') elif corpus.datatype == 'plaintext': with codecs.open(f.path, 'rb', encoding = 'utf-8') as data: data = data.read() if not only_conc: res = searcher(list(search.values())[0], data, concordancing = False) if not no_conc: conc_res = searcher(list(search.values())[0], data, concordancing = True) if not no_conc: for index, line in enumerate(conc_res): line.insert(0, '') if countmode: count_results[subcorpus_name] += [res] else: # add filename and do lowercasing for conc if not no_conc: for index, line in enumerate(conc_res): if searcher != slow_tregex: line.insert(0, f.name) else: line[0] = f.name if not preserve_case: line[3:] = [x.lower() for x in line[3:]] if spelling: line = [correct_spelling(b) for b in line] if numconc < maxconc or not maxconc: conc_results[subcorpus_name].append(line) numconc += 1 # do lowercasing and spelling if not only_conc: if not preserve_case: if not statsmode: res = [i.lower() for i in res] if spelling: if not statsmode: res = [correct_spelling(r) for r in res] #if not statsmode: results[subcorpus_name] += Counter(res) #else: #results[subcorpus_name] += res if not statsmode: current_iter += 1 if kwargs.get('paralleling', None) is not None: tstr = '%s%d/%d' % (outn, current_iter + 1, total_files) else: tstr = '%s%d/%d' % (outn, current_iter + 1, total_files) animator(p, current_iter, tstr, **par_args) # delete temp file if there import os if os.path.isfile('tmp.txt'): os.remove('tmp.txt') ############################################ # Get concordances into DataFrame # ############################################ if not no_conc: all_conc_lines = [] for sc_name, resu in sorted(conc_results.items()): if only_unique: unique_results = uniquify(resu) else: unique_results = resu #make into series pindex = 'c f s l m r'.encode('utf-8').split() for fname, spkr, start, word, end in unique_results: #spkr = str(spkr, errors = 'ignore') fname = os.path.basename(fname) all_conc_lines.append(Series([sc_name, fname, \ spkr, \ start, \ word, \ end], \ index = pindex)) # randomise results... if random: from random import shuffle shuffle(all_conc_lines) conc_df = pd.concat(all_conc_lines, axis = 1).T # not doing anything yet --- this is for multimodal concordancing add_links = False if not add_links: conc_df.columns = ['c', 'f', 's', 'l', 'm', 'r'] else: conc_df.columns = ['c', 'f', 's', 'l', 'm', 'r', 'link'] if all(x == '' for x in list(conc_df['s'].values)): conc_df.drop('s', axis = 1, inplace = True) #if kwargs.get('note'): # kwargs['note'].progvar.set(100) #if kwargs.get('printstatus', True): # thetime = strftime("%H:%M:%S", localtime()) # finalstring = '\n\n%s: Concordancing finished! %d matches.\n' % (thetime, len(conc_df.index)) # print(finalstring) from interrogation import Concordance output = Concordance(conc_df) if only_conc: output.query = locs if quicksave: output.save() if kwargs.get('printstatus', True): thetime = strftime("%H:%M:%S", localtime()) finalstring = '\n\n%s: Concordancing finished! %d results.' 
% (thetime, len(conc_df)) print(finalstring) return output #output.query = locs #return output ############################################ # Get interrogation into DataFrame # ############################################ if not only_conc: if countmode: df = Series({k: sum(v) for k, v in sorted(count_results.items())}) tot = df.sum() else: the_big_dict = {} unique_results = set([item for sublist in list(results.values()) for item in sublist]) for word in unique_results: the_big_dict[word] = [subcorp_result[word] for name, subcorp_result in sorted(results.items(), key=lambda x: x[0])] # turn master dict into dataframe, sorted df = DataFrame(the_big_dict, index = sorted(results.keys())) numentries = len(df.columns) tot = df.sum(axis = 1) total_total = df.sum().sum() ############################################ # Format, output as Interrogation object # ############################################ if not countmode: if not corpus.subcorpora or singlefile: if not files_as_subcorpora: if not kwargs.get('df1_always_df'): df = Series(df.ix[0]) df.sort_values(ascending = False, inplace = True) tot = df.sum() numentries = len(df.index) total_total = tot # sort by total if type(df) == pd.core.frame.DataFrame: if not df.empty: df.ix['Total-tmp'] = df.sum() the_tot = df.ix['Total-tmp'] df = df[the_tot.argsort()[::-1]] df = df.drop('Total-tmp', axis = 0) # format final string if kwargs.get('printstatus', True): thetime = strftime("%H:%M:%S", localtime()) finalstring = '\n\n%s: Interrogation finished!' % thetime if countmode: finalstring += ' %d matches.' % tot else: finalstring += ' %d unique results, %d total occurrences.' % (numentries, total_total) print(finalstring) if not no_conc: interro = Interrogation(results = df, totals = tot, query = locs, concordance = output) else: interro = Interrogation(results = df, totals = tot, query = locs) if quicksave: interro.save() return interro
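# The results-assembly step above pivots one Counter per subcorpus into a
# DataFrame with a column per match and a row per subcorpus. Below is a
# minimal, self-contained sketch of that pivot; the variable names and the
# example counts are illustrative only, not corpkit's actual data.
from collections import Counter

import pandas as pd

# hypothetical per-subcorpus tallies, standing in for the `results` dict above
results = {'1990': Counter({'cat': 4, 'dog': 1}),
           '1995': Counter({'dog': 3})}

# union of every match seen in any subcorpus
unique_results = set(item for counts in results.values() for item in counts)

# one column per match, one row per subcorpus; Counter returns 0 for absent keys
the_big_dict = {word: [counts[word] for _, counts in sorted(results.items())]
                for word in unique_results}
df = pd.DataFrame(the_big_dict, index=sorted(results.keys()))

# order columns by their corpus-wide totals, mirroring the 'Total-tmp' sort above
df = df[df.sum().sort_values(ascending=False).index]
print(df)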
    return True


# Get a list of all the words in Brown corpus.
words = brown.words()

# Get frequency distribution on the given condition.
sent_fd = nltk.FreqDist(
    word.lower() for word in words
    if len(word) == length and check_condition(word, userinput)
)

# Display the top 3 frequent words if applicable.
series = Series(sent_fd)
series.sort_values(ascending=False, inplace=True)
sumValues = series.sum()
top_words = series.keys()
count = len(top_words)
if count > 0:
    i = 0
    while i < count and i < 3:
        # look up by position (rank), not by label, since the index holds words
        print(str(i + 1) + ': ' + top_words[i] + ' ('
              + str(round(100 * series.iloc[i] / sumValues, 1)) + ' %)')
        i += 1
else:
    print("It doesn't seem like there is any word like that.")