Example #1
    def test_sum_overflow(self, use_bottleneck):

        with pd.option_context("use_bottleneck", use_bottleneck):
            # GH#6915
            # overflowing on the smaller int dtypes
            for dtype in ["int32", "int64"]:
                v = np.arange(5000000, dtype=dtype)
                s = Series(v)

                result = s.sum(skipna=False)
                assert int(result) == v.sum(dtype="int64")
                result = s.min(skipna=False)
                assert int(result) == 0
                result = s.max(skipna=False)
                assert int(result) == v[-1]

            for dtype in ["float32", "float64"]:
                v = np.arange(5000000, dtype=dtype)
                s = Series(v)

                result = s.sum(skipna=False)
                assert result == v.sum(dtype=dtype)
                result = s.min(skipna=False)
                assert np.allclose(float(result), 0.0)
                result = s.max(skipna=False)
                assert np.allclose(float(result), v[-1])
Example #2
def combine_subjects_lr(df):

    weights = Series(index=df.columns).fillna(0.0)
    df = df.loc[:, ~df.isnull().all()]
    df = df.fillna(df.mean())

    lr = LinearRegression(fit_intercept=False)
    for col in df.columns:
        use_df = df.drop([col], axis=1)
        _ = lr.fit(use_df, df[[col]])
        corr = Series(lr.predict(use_df).ravel(), index=df.index).corr(df[col])
        corr = 0.0 if isnan(corr) else corr
        coefs = Series(lr.coef_.ravel(),
                       index=use_df.columns).reindex(weights.index).fillna(0.0)
        weights += (corr * coefs)

    weights = weights.where(weights.ge(0.0), 0.0).div(float(df.shape[1] - 1))
    # weights = weights.where(weights.gt(0.1) | weights.eq(0.0), 0.1)
    print('---------------')
    print(df.index.get_level_values('grade').values[0])
    print(weights / weights.sum())
    # other_weights = df.corr().where(df.corr().ne(1.0)).mean()
    # print other_weights / other_weights.sum()
    output = df.reindex(columns=weights.index).multiply(weights).sum(
        axis=1).div(weights.sum())
    return output
Example #3
 def _get_binning_threshold(self, df: DataFrame, y: Series) -> Dict:
     """
     获取分箱阈值
     :param df: 所有变量数据
     :param y: 标签数据
     :return: 变量分箱区间字典
     """
     params = {
         "criterion": self.criterion,
         "max_depth": self.max_depth,
         "min_samples_split": self.min_samples_split,
         "min_samples_leaf": max(int(np.ceil(y.size * self.min_samples_leaf)), 50),
         "max_leaf_nodes": self.max_leaf_nodes,
         "random_state": self.random_state,
     }
     self.B_G_rate = y.sum() / (y.size - y.sum())
     for col in df.columns:
         feat_type = self.features_info.get(col)
         nan_value = self.features_nan_value.get(col)
         bins, flag = self._bin_threshold(df[col],
                                          y,
                                          is_num=feat_type,
                                          nan_value=nan_value,
                                          **params)
         self.features_bins[col] = {'bins': bins, 'flag': flag}
Example #4
def calc_square_model_params(x: pd.Series, y: pd.Series):
    print('[quadratic regression]')
    # calculate the matrix of linear equations
    sx = x.sum()
    sx2 = (x**2).sum()
    sx3 = (x**3).sum()
    sx4 = (x**4).sum()
    sy = y.sum()
    sxy = (x * y).sum()
    sx2y = ((x**2) * y).sum()
    A = np.array([[len(x), sx, sx2], [sx, sx2, sx3], [sx2, sx3, sx4]])
    b = np.array([sy, sxy, sx2y])
    print('solve following linear equations to get w0, w1 and w2...')
    print('\tn\t* w0 + S(x)\t* w1 + S(x^2)\t* w2 = S(y)')
    print('\tS(x)\t* w0 + S(x^2)\t* w1 + S(x^3)\t* w2 = S(xy)')
    print('\tS(x^2)\t* w0 + S(x^3)\t* w1 + S(x^4)\t* w2 = S(x^2*y)')
    print('substitute values...')
    print('\t%f\t* w0 + %f\t* w1 + %f\t* w2 = %f' %
          (A[0][0], A[0][1], A[0][2], b[0]))
    print('\t%f\t* w0 + %f\t* w1 + %f\t* w2 = %f' %
          (A[1][0], A[1][1], A[1][2], b[1]))
    print('\t%f\t* w0 + %f\t* w1 + %f\t* w2 = %f' %
          (A[2][0], A[2][1], A[2][2], b[2]))
    # solve equations
    w = np.linalg.solve(A, b)
    print('solution: w0 = %f, w1 = %f, w2 = %f\n' % (w[0], w[1], w[2]))
    return w[0], w[1], w[2]
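A minimal usage sketch (illustrative data; assumes numpy and pandas are imported as np and pd, as the snippet above already requires):

import numpy as np
import pandas as pd

# Exact quadratic data, so solving the normal equations recovers the coefficients.
x = pd.Series(np.linspace(-5, 5, 50))
y = 2.0 + 0.5 * x + 3.0 * x ** 2
w0, w1, w2 = calc_square_model_params(x, y)
print(round(w0, 3), round(w1, 3), round(w2, 3))  # ~2.0, ~0.5, ~3.0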
Example #5
    def test_sum_overflow(self, use_bottleneck):

        with pd.option_context('use_bottleneck', use_bottleneck):
            # GH#6915
            # overflowing on the smaller int dtypes
            for dtype in ['int32', 'int64']:
                v = np.arange(5000000, dtype=dtype)
                s = Series(v)

                result = s.sum(skipna=False)
                assert int(result) == v.sum(dtype='int64')
                result = s.min(skipna=False)
                assert int(result) == 0
                result = s.max(skipna=False)
                assert int(result) == v[-1]

            for dtype in ['float32', 'float64']:
                v = np.arange(5000000, dtype=dtype)
                s = Series(v)

                result = s.sum(skipna=False)
                assert result == v.sum(dtype=dtype)
                result = s.min(skipna=False)
                assert np.allclose(float(result), 0.0)
                result = s.max(skipna=False)
                assert np.allclose(float(result), v[-1])
Example #6
def _linear_regression_np(x: Series, y: Series) -> dict:
    """Simple Linear Regression in Numpy for two 1d arrays for environments without the sklearn package."""
    result = {"a": npNaN, "b": npNaN, "r": npNaN, "t": npNaN, "line": npNaN}
    x_sum = x.sum()
    y_sum = y.sum()

    if int(x_sum) != 0:
        # 1st row, 2nd col value corr(x, y)
        r = npCorrcoef(x, y)[0, 1]

        m = x.size
        r_mix = m * (x * y).sum() - x_sum * y_sum
        b = r_mix / (m * (x * x).sum() - x_sum * x_sum)
        a = y.mean() - b * x.mean()
        line = a + b * x

        _np_err = seterr()
        seterr(divide="ignore", invalid="ignore")
        result = {
            "a": a,
            "b": b,
            "r": r,
            "t": r / npSqrt((1 - r * r) / (m - 2)),
            "line": line,
        }
        seterr(divide=_np_err["divide"], invalid=_np_err["invalid"])

    return result
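A quick sanity check of the helper (a sketch; the np-prefixed names are assumed to be plain numpy imports, e.g. from numpy import corrcoef as npCorrcoef):

from numpy import nan as npNaN, corrcoef as npCorrcoef, sqrt as npSqrt, seterr
from pandas import Series

x = Series([1.0, 2.0, 3.0, 4.0, 5.0])
y = 2.0 * x + 1.0 + Series([0.05, -0.02, 0.01, -0.04, 0.03])  # small noise
fit = _linear_regression_np(x, y)
print(round(fit["b"], 2), round(fit["a"], 2))  # slope close to 2, intercept close to 1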
Example #7
 def update(self, feature_x: pd.Series, feature_y: pd.Series) -> None:
     """Updates partial cross feature statistics."""
     self.sum_x += feature_x.sum()
     self.sum_y += feature_y.sum()
     self.sum_square_x += (feature_x**2).sum()
     self.sum_square_y += (feature_y**2).sum()
     self.sum_xy += (feature_x * feature_y).sum()
     self.count += len(feature_x)
Example #8
File: mcmc.py Project: numpde/cbb
 def chi(q: pd.Series, p: pd.Series):
     # Eqn (4) in
     # https://www.cse.huji.ac.il/~werman/Papers/ECCV2010.pdf
     p = p / p.sum()
     q = q / q.sum()
     x = np.sqrt(0.5 * (((p - q)**2) / (p + q)).sum())
     assert (0 <= x <= 1)
     return x
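An illustrative call with hypothetical histograms (chi is used here as a plain module-level function, as in mcmc.py):

import numpy as np
import pandas as pd

p = pd.Series([4.0, 1.0, 2.0, 5.0])
q = pd.Series([2.0, 2.0, 1.0, 5.0])
print(chi(q, p))  # a value in [0, 1]; 0 only for identical normalized histograms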
Example #9
def _exit_transaction(
    df: pd.DataFrame,
    trade: pd.Series,
    exit_condition: Callable[[pd.DataFrame, pd.Series, pd.Timestamp], bool],
) -> Transaction:
    for index in df.index:
        if exit_condition(df, trade, index):
            return Transaction(timestamp=index, amount=-trade.sum())
    return Transaction(timestamp=df.index[-1], amount=-trade.sum())
Example #10
def bootstrap_series(ser: pd.Series) -> pd.Series:
    """
    Creates a bootstrapped time series of same length and number of observations as original time series

    :param ser: (pd.Series): time series (not necessarily stationary, but observations
                are assumed to be weakly correlated.)
    """
    resampled = np.random.multinomial(ser.sum(), ser / ser.sum())
    return pd.Series(resampled, index=ser.index)
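An illustrative call (hypothetical daily counts; the resample itself is random):

import numpy as np
import pandas as pd

counts = pd.Series([5, 0, 3, 2], index=pd.date_range("2021-01-01", periods=4))
resampled = bootstrap_series(counts)
print(resampled.sum() == counts.sum())  # True: the total number of observations is preserved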
Example #11
def get_precision_recall(
        data: pd.Series,
        ties: Optional[np.ndarray] = None) -> Tuple[pd.Series, pd.Series]:
    r = np.arange(1, data.shape[0] + 1)
    c = data.cumsum()

    if ties is not None:
        return fix_tied(ties, c / r), fix_tied(ties, c / data.sum())

    return (c / r), (c / data.sum())
Example #12
def get_fi_gain(model_name, reg, X_train):
    if model_name == 'ours':
        fi_gain = Series(reg.compute_feature_importance(method='gain'))
    elif model_name == 'sklearn':
        fi_gain = Series(reg.feature_importances_, index=X_train.columns)
    elif model_name == 'xgboost':
        fi_gain = Series(reg.get_score(importance_type='gain'))
    else:  # model_name == 'catboost'
        fi_gain = Series(reg.feature_importances_, index=reg.feature_names_)
    if fi_gain.sum() != 0:
        fi_gain /= fi_gain.sum()
    return fi_gain
Example #13
def compute_weighted_avg(series):
    """
    method for computing weighted average by duration
    :return: feature value
    """
    if len(series) == 1:
        return series.mean()
    values = series[:-1].values
    weights = Series(map(lambda x: float(x) / 10e8, series.index.values[1:] - series.index.values[:-1]))
    weights = weights.values
    avg = values * weights
    return avg.sum() / weights.sum() if weights.sum() > 0 else np.nan
Example #14
def main(args):
    generators, loads, _, times, _, data = parsedir()
    generators = filter(lambda gen: gen.is_controllable, generators)

    gen_data = data['generators']
    if args['min'] == 0:
        args['min'] = 1.1 * gen_data.pmin.sum()

    if args['max'] == 0:
        args['max'] = 0.99 * gen_data.pmax.sum()

    load_values = np.arange(args['min'], args['max'], args['interval'])
    results = DataFrame(columns=['prices', 'committed', 'last_committed'],
                        index=load_values)

    committed_gen_names = Index([])

    for load_val in load_values:
        print load_val
        loads_times = make_loads_times(Pd=load_val)
        power_system, times = solve_problem(generators,
                                            do_reset_config=False,
                                            **loads_times)
        t = times[0]
        results.ix[load_val, 'prices'] = power_system.buses[0].price(t)
        statuses = Series(
            dict([(gen.name, gen.status(t).value)
                  for gen in power_system.generators()]))

        results.ix[load_val, 'committed'] = statuses.sum()
        results.ix[load_val, 'last_committed'] = \
            statuses[statuses == 1].index.diff(committed_gen_names)
        committed_gen_names = statuses[statuses == 1].index

    if (load_values[-1] == 0.99 * gen_data.pmax.sum()) and \
        (statuses.sum() != len(generators)):
        print('warning: uncommitted generation:')
        print(gen_data.set_index('name').ix[statuses[statuses == 0].index])

    results.to_csv(joindir(user_config.directory, 'ed_sweep.csv'))

    if args['hide_units_committed']:
        ax = results.prices.plot(drawstyle='steps')
    else:
        ax = results[['prices', 'committed']].plot(drawstyle='steps',
                                                   secondary_y=['committed'])
        ax.right_ax.set_ylabel('Units committed')

    ax.set_xlabel('System Load (MW)')
    ax.set_ylabel('Estimated System Price ($/MWh)')

    plt.savefig(joindir(user_config.directory, 'ed_sweep.png'))
Example #15
def main(args):
    generators, loads, _, times, _, data = parsedir()
    generators = [gen for gen in generators if gen.is_controllable]

    gen_data = data["generators"]
    if args["min"] == 0:
        args["min"] = 1.1 * gen_data.pmin.sum()

    if args["max"] == 0:
        args["max"] = 0.99 * gen_data.pmax.sum()

    load_values = np.arange(args["min"], args["max"], args["interval"])
    results = DataFrame(columns=["prices", "committed", "last_committed"],
                        index=load_values)

    committed_gen_names = Index([])

    for load_val in load_values:
        print(load_val)
        loads_times = make_loads_times(Pd=load_val)
        power_system, times = solve_problem(generators,
                                            do_reset_config=False,
                                            **loads_times)
        t = times[0]
        results.loc[load_val, "prices"] = power_system.buses[0].price(t)
        statuses = Series(
            dict([(gen.name, gen.status(t).value)
                  for gen in power_system.generators()]))

        results.loc[load_val, "committed"] = statuses.sum()
        results.loc[load_val, "last_committed"] = statuses[
            statuses == 1].index.difference(committed_gen_names)
        committed_gen_names = statuses[statuses == 1].index

    if (load_values[-1] == 0.99 *
            gen_data.pmax.sum()) and (statuses.sum() != len(generators)):
        print("warning: uncommitted generation:")
        print((gen_data.set_index("name").loc[statuses[statuses == 0].index]))

    results.to_csv(joindir(user_config.directory, "ed_sweep.csv"))

    if args["hide_units_committed"]:
        ax = results.prices.plot(drawstyle="steps")
    else:
        ax = results[["prices", "committed"]].plot(drawstyle="steps",
                                                   secondary_y=["committed"])
        ax.right_ax.set_ylabel("Units committed")

    ax.set_xlabel("System Load (MW)")
    ax.set_ylabel("Estimated System Price ($/MWh)")

    plt.savefig(joindir(user_config.directory, "ed_sweep.png"))
Example #16
def main(args):
    generators, loads, _, times, _, data = parsedir()
    generators = filter(lambda gen: gen.is_controllable, generators)

    gen_data = data['generators']
    if args['min'] == 0:
        args['min'] = 1.1 * gen_data.pmin.sum()

    if args['max'] == 0:
        args['max'] = 0.99 * gen_data.pmax.sum()

    load_values = np.arange(args['min'], args['max'], args['interval'])
    results = DataFrame(columns=['prices', 'committed', 'last_committed'], index=load_values)

    committed_gen_names = Index([])

    for load_val in load_values:
        print load_val
        loads_times = make_loads_times(Pd=load_val)
        power_system, times = solve_problem(generators,
                                            do_reset_config=False, **loads_times)
        t = times[0]
        results.ix[load_val, 'prices'] = power_system.buses[0].price(t)
        statuses = Series(dict([(gen.name, gen.status(t).value)
                                for gen in power_system.generators()]))

        results.ix[load_val, 'committed'] = statuses.sum()
        results.ix[load_val, 'last_committed'] = \
            statuses[statuses == 1].index.difference(committed_gen_names)
        committed_gen_names = statuses[statuses == 1].index

    if (load_values[-1] == 0.99 * gen_data.pmax.sum()) and \
            (statuses.sum() != len(generators)):
        print('warning: uncommitted generation:')
        print(gen_data.set_index('name').ix[statuses[statuses == 0].index])

    results.to_csv(joindir(user_config.directory, 'ed_sweep.csv'))

    if args['hide_units_committed']:
        ax = results.prices.plot(drawstyle='steps')
    else:
        ax = results[['prices', 'committed']].plot(drawstyle='steps', secondary_y=['committed'])
        ax.right_ax.set_ylabel('Units committed')

    ax.set_xlabel('System Load (MW)')
    ax.set_ylabel('Estimated System Price ($/MWh)')

    plt.savefig(joindir(user_config.directory, 'ed_sweep.png'))
Example #17
    def batch_buy(self, codedf: pd.Series, datetime: str, totalamount: float = 1000000, model: enumerate = 'avg_money'):
        """
        批量调仓接口

        codedf: pd.Series

            Series.index -> code
            Series.value -> price


        totalamount: 总买入金额

        model Enum
            'avg_money': 等市值买入
            'avg_amount': 等股数买入(买入总金额==totalamount)
        """
        if model == 'avg_money':
            moneyper = totalamount / len(codedf)
            amount = (moneyper/codedf).apply(lambda x: (int(100/x)*100)
                                             if int(100/x) > 0 else 100)
        elif model == 'avg_amount':
            amountx = int(totalamount/(100*codedf.sum()))
            if amountx == 0:
                return False
            else:
                amount = codedf.apply(lambda x: amountx*100)
        orderres = pd.concat([codedf, amount], axis=1)
        orderres.columns = ['price', 'amount']
        res = orderres.assign(datetime=datetime).apply(lambda x: self.send_order(
            code=x.index, amount=x.amount, price=x.price, towards=1, datetime=x.datetime))
        return res
Example #18
def test_sum_with_level():
    obj = Series([10.0], index=MultiIndex.from_tuples([(2, 3)]))

    with tm.assert_produces_warning(FutureWarning):
        result = obj.sum(level=0)
    expected = Series([10.0], index=[2])
    tm.assert_series_equal(result, expected)
Example #19
def tags_distance(distribution: pd.Series,
                  other: pd.Series,
                  tags: pd.Index,
                  p=1):
    """Compute the Optimal Transport Distance between histograms (see
    https://arxiv.org/pdf/1803.00567.pdf p.30-33)
    """

    if p < 1:
        raise ValueError('p must be greater than or equal to 1')
    if p != 1:
        raise NotImplementedError('Only wasserstein 1 is currently supported')

    # Make the tag distributions have the same support
    distrib = distribution.reindex(index=tags, fill_value=0)
    other = other.reindex(index=tags, fill_value=0)

    # Sort by tag (in the lexicographic order) and normalize the distributions
    # This is important because in the distance we implicitly associate a tag to
    # a point in N.
    distrib = distrib.sort_index()
    distrib = distrib / distrib.sum()

    other = other.sort_index()
    other = other / other.sum()

    # print(distrib, other, sep='\n')

    return wasserstein_1d(distrib.to_numpy(), other.to_numpy())
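An illustrative call (hypothetical tag counts; wasserstein_1d is assumed to be the helper defined in the same module):

import pandas as pd

tags = pd.Index(["cats", "dogs", "fish"])
a = pd.Series({"cats": 3, "dogs": 1})
b = pd.Series({"dogs": 2, "fish": 2})
print(tags_distance(a, b, tags))  # 0.0 only when both normalized histograms coincide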
Example #20
    def chi(self, customattribute):
        """
        Compute the chi-square statistic for the given attribute.
        """
        attributeDict = dict()
        classAttributeDict = dict()
        for piece in self.chunks:
            for (attribute, classAttribute), arrays in piece.groupby([customattribute, self.classAttribute]).studentID.unique().iteritems():
                attributeDict.setdefault((attribute, classAttribute), np.array([]))
                attributeDict[(attribute, classAttribute)] = np.union1d(attributeDict[(attribute, classAttribute)], arrays)

            for classAttribute, arrays in piece.groupby(self.classAttribute).studentID.unique().iteritems():
                classAttributeDict.setdefault(classAttribute, np.array([]))
                classAttributeDict[classAttribute] = np.union1d(classAttributeDict[classAttribute], arrays)

        # Proportion of students in each graduation-destination class.
        classSeries = Series(classAttributeDict).apply(lambda x:len(x))
        classSeries /= classSeries.sum()

        # Observed counts for each attribute value.
        attributeObs = Series(attributeDict).apply(lambda x:len(x)).unstack(fill_value=0)

        attributeExp = DataFrame(index=attributeObs.index, columns=attributeObs.columns)

        # Set the initial (row-total) values.
        for index in attributeExp.index:
            attributeExp.ix[index] = attributeObs.ix[index].sum()
        # Scale by each target class's proportion to obtain the expected counts.
        attributeExp = attributeExp.mul(classSeries).fillna(0)
        # Compute the chi-square statistic from the observed and expected counts and return it with the p-value.
        return chisquare(attributeObs.stack(), attributeExp.stack()), attributeObs
Example #21
def create_report_new(type_cleaned: str, stats: pd.Series,
                      errors: str) -> None:
    """
    Describe what was done in the cleaning process

    The stats series contains the following codes in its index
        0 := the number of null values
        1 := the number of values that could not be parsed
        2 := the number of values that were transformed during cleaning
        3 := the number of values that were already in the correct format
    """
    print(f"{type_cleaned} Cleaning Report:")
    nrows = stats.sum()

    nclnd = stats.loc[2] if 2 in stats.index else 0
    if nclnd > 0:
        pclnd = round(nclnd / nrows * 100, 2)
        print(f"\t{nclnd} values cleaned ({pclnd}%)")

    nunknown = stats.loc[1] if 1 in stats.index else 0
    if nunknown > 0:
        punknown = round(nunknown / nrows * 100, 2)
        expl = "set to NaN" if errors == "coerce" else "left unchanged"
        print(f"\t{nunknown} values unable to be parsed ({punknown}%), {expl}")

    nnull = stats.loc[0] if 0 in stats.index else 0
    if errors == "coerce":
        nnull += stats.loc[1] if 1 in stats.index else 0
    pnull = round(nnull / nrows * 100, 2)

    ncorrect = nclnd + (stats.loc[3] if 3 in stats.index else 0)
    pcorrect = round(ncorrect / nrows * 100, 2)
    print(
        f"Result contains {ncorrect} ({pcorrect}%) values in the correct format "
        f"and {nnull} null values ({pnull}%)")
Example #22
class Player:
    def __init__(self, first_name, last_name, id):
        self.first_name = first_name
        self.last_name = last_name
        self.id = id
        self.hrs = [0, 0, 0, 0, 0, 0]  #One for each month of the game
        self.hr_total = 0
        self.hr_series = Series()
        self.hr_total_series = Series()

    def __str__(self):
        return str.format('{0} : {1}', self.id, self.last_name)

    def __repr__(self):
        return self.__str__()

    def add_hrs(self, count, date):
        self.hr_total += count
        self.hr_total_series[date] = self.hr_series.sum() + count
        if (self.hr_series.last_valid_index() == date):
            self.hr_series[date] = count + self.hr_series[date]
        else:
            self.hr_series[date] = count

    def name(self):
        return self.first_name + " " + self.last_name

    def get_player_hr_dataframe(self):
        return self.hr_series.to_frame(self.name())

    def get_player_hr_total_dataframe(self):
        return self.hr_total_series.to_frame(self.name())
Example #23
    def calculate_pfe(trades: List[Trade], ca: CollateralAgreement):
        ac_bucketing = {}
        addOns = Series()
        for t in trades:
            if t.assetClass in ac_bucketing:
                ac_bucketing[t.assetClass].append(t)
            else:
                ac_bucketing[t.assetClass] = [t]
        for ac, ac_trades in ac_bucketing.items():
            if ac == AssetClass.EQ:
                addOns[ac] = SA_CCR.equity_addOn(ac_trades, ca)
            elif ac == AssetClass.IR:
                addOns[ac] = SA_CCR.interest_rate_addOn(ac_trades, ca)
            elif ac == AssetClass.FX:
                addOns[ac] = SA_CCR.fx_addOn(ac_trades, ca)

        V = ca.get_V()

        C = ca.get_C()
        aggregate_addOn = addOns.sum()
        multiplier_var = SA_CCR.multiplier(V, C, aggregate_addOn)

        PFE = multiplier_var * aggregate_addOn
        return {
            'PFE': PFE,
            'multiplier': multiplier_var,
            'AddOn_agg': aggregate_addOn
        }
Example #24
    def infer_posterior(data: Series,
                        mu_0: float,
                        sigma_0_sq: Optional[float] = None,
                        tau_0: Optional[float] = None) -> Normal:
        """
        Return a new Normal distribution of the posterior most likely to
        generate the given data.

        :param data: Series of float observations.
        :param mu_0: Value for the μ₀ (mean) hyper-parameter of the prior Normal
                     distribution describing the mean.
        :param sigma_0_sq: Value for the σ₀² (variance) hyper-parameter of the
                           prior Normal distribution describing the mean.
        :param tau_0: Value for the τ₀ (precision) hyper-parameter of the
                      prior Normal distribution describing the mean.
        """
        if not one_is_none(sigma_0_sq, tau_0):
            raise ValueError('Give either σ₀² or τ₀')

        n = len(data)
        x_sum = data.sum()
        if sigma_0_sq is None:
            tau = 1 / data.var()
            return NormalNormalConjugate(n=n,
                                         x_sum=x_sum,
                                         mu_0=mu_0,
                                         tau=tau,
                                         tau_0=tau_0).posterior()
        else:
            sigma_sq = data.var()
            return NormalNormalConjugate(n=n,
                                         x_sum=x_sum,
                                         mu_0=mu_0,
                                         sigma_sq=sigma_sq,
                                         sigma_0_sq=sigma_0_sq).posterior()
Example #25
def count_fraction_of_true(series: pd.Series):
    # We are assuming this is called by a Boolean series
    if series.dtype != bool:
        raise ValueError
    num_true = series.sum()
    total = float(series.count())
    return num_true / total if total > 0.0 else 0.0, num_true
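A quick illustrative call:

import pandas as pd

flags = pd.Series([True, False, True, True])
frac, n_true = count_fraction_of_true(flags)
print(frac, n_true)  # 0.75 3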
Example #26
def sinwma(close, length=None, asc=None, offset=None, **kwargs):
    """Indicator: Sine Weighted Moving Average (SINWMA) by Everget of TradingView"""
    # Validate Arguments
    close = verify_series(close)
    length = int(length) if length and length > 0 else 14
    min_periods = (int(kwargs["min_periods"]) if "min_periods" in kwargs
                   and kwargs["min_periods"] is not None else length)
    offset = get_offset(offset)

    # Calculate Result
    sines = Series(
        [sin((i + 1) * pi / (length + 1)) for i in range(0, length)])
    w = sines / sines.sum()

    sinwma = close.rolling(length, min_periods=length).apply(weights(w),
                                                             raw=True)

    # Offset
    if offset != 0:
        sinwma = sinwma.shift(offset)

    # Name & Category
    sinwma.name = f"SINWMA_{length}"
    sinwma.category = "overlap"

    return sinwma
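For intuition, the sine weights on their own form a symmetric bell that sums to 1 (a minimal sketch using only the standard library and pandas):

from math import pi, sin
import pandas as pd

length = 14
sines = pd.Series([sin((i + 1) * pi / (length + 1)) for i in range(length)])
w = sines / sines.sum()
print(round(w.sum(), 6), round(w.max(), 4))  # 1.0 and the largest (mid-window) weight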
Example #27
class Player:
    def __init__(self, first_name, last_name, id):
        self.first_name = first_name
        self.last_name = last_name
        self.id = id
        self.hrs = [0,0,0,0,0,0] #One for each month of the game
        self.hr_total = 0
        self.hr_series = Series()
        self.hr_total_series = Series()

    def __str__(self):
        return str.format('{0} : {1}', self.id, self.last_name)

    def __repr__(self):
        return self.__str__()

    def add_hrs(self, count, date):
        self.hr_total += count
        self.hr_total_series[date] = self.hr_series.sum() + count
        if(self.hr_series.last_valid_index() == date ):
            self.hr_series[date] = count + self.hr_series[date]
        else:
            self.hr_series[date] = count


    def name(self):
        return self.first_name + " " + self.last_name

    def get_player_hr_dataframe(self):
        return self.hr_series.to_frame(self.name())

    def get_player_hr_total_dataframe(self):
        return self.hr_total_series.to_frame(self.name())
Example #28
def single_proportion_test(sample: pd.Series, category: str, p_0: float,
                           alternative: str) -> Dict[str, float]:
    """Performs a single proportion test

    Args:
        sample: Series with the count of two categorical variables. Check the example below for details.
        category: The name of the category we want to use for the test.
        p_0: The proportion of the Null Hypothesis
        alternative: Defines the alternative hypothesis. Possible values: 'less', 'greater', or 'two-sided'.

    Returns:
        Dict with the calculated "z" parameter and the p-value

    Example:
        The following is an example of the format required for the `sample` parameter. The index values (yes, no) are
        the categories, and the values are the count of elements in each category::

            >>> sample
            Out[1]:
            Relapse
            no      4
            yes    20
            Name: Drug, dtype: int64
    """
    n = sample.sum()
    p_hat = sample[category] / n
    _SE = np.sqrt(p_0 * (1 - p_0) / n)
    z = (p_hat - p_0) / _SE
    validate_conditions_for_theoretical_distns(
        inference_type='single-proportion', n=n, p=p_hat)
    return {'z': z, 'p-value': get_p_value(z, alternative=alternative)}
Example #29
def sinwma(close, length=None, offset=None, **kwargs):
    """Indicator: Sine Weighted Moving Average (SINWMA) by Everget of TradingView"""
    # Validate Arguments
    length = int(length) if length and length > 0 else 14
    close = verify_series(close, length)
    offset = get_offset(offset)

    if close is None: return

    # Calculate Result
    sines = Series([npSin((i + 1) * npPi / (length + 1)) for i in range(0, length)])
    w = sines / sines.sum()

    sinwma = close.rolling(length, min_periods=length).apply(weights(w), raw=True)

    # Offset
    if offset != 0:
        sinwma = sinwma.shift(offset)

    # Handle fills
    if "fillna" in kwargs:
        sinwma.fillna(kwargs["fillna"], inplace=True)
    if "fill_method" in kwargs:
        sinwma.fillna(method=kwargs["fill_method"], inplace=True)

    # Name & Category
    sinwma.name = f"SINWMA_{length}"
    sinwma.category = "overlap"

    return sinwma
Example #30
 def _getSeriesScoreMultipliedByCount(self,
                                      targetSeries: pd.Series) -> float:
     totalCount = targetSeries.count()
     trueCount = targetSeries.sum()
     falseCount = totalCount - trueCount
     return totalCount - (trueCount * trueCount +
                          falseCount * falseCount) / totalCount
Example #31
def permutation_feature_importance(model_name,
                                   model,
                                   X,
                                   y,
                                   categorical_features=None):
    results = {}
    mse = compute_mse(model_name,
                      model,
                      X,
                      y,
                      categorical_features=categorical_features)
    for col in X.columns:
        permutated_x = X.copy()
        random_feature_mse = []
        for i in range(N_PERMUTATIONS):
            permutated_x[col] = permutation(permutated_x[col])
            if model_name == 'xgboost':
                temp_x = xgb.DMatrix(permutated_x)
            elif model_name == 'catboost':
                temp_x = Pool(permutated_x, cat_features=categorical_features
                              ) if categorical_features else Pool(permutated_x)
            else:
                temp_x = permutated_x
            random_feature_mse.append(
                compute_mse(model_name,
                            model,
                            temp_x,
                            y,
                            transform_x=False,
                            categorical_features=categorical_features))
        results[col] = mean(array(random_feature_mse)) - mse
    results = Series(results)
    return results / results.sum()
Example #32
def make_equal(series: pd.Series, matched: float) -> pd.Series:
    """
    Equally distrubute a series considering a matched value.
    Lower values than matched value are filtered.
    :param series: A series which has one or more rows.
    :param matched: A positive float number.
    :return: A series that is equally distrubuted.
    """
    check_negative = series.sum() < 0
    if check_negative:
        sorted_ = series.abs().sort_values()
    else:
        sorted_ = series.sort_values()
    per_ = matched / series.size
    for i, v in enumerate(sorted_):
        if not v > per_:
            per_ = (matched - sorted_.iloc[:i + 1].sum()) / (series.size -
                                                             (i + 1))
        else:
            break
    sorted_.iloc[i:] = per_
    if check_negative:
        return series.mul(0).add(sorted_).mul(-1)
    else:
        return series.mul(0).add(sorted_)
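An illustrative call (hypothetical numbers):

import pandas as pd

s = pd.Series([10.0, 40.0, 50.0])
print(make_equal(s, 90.0).tolist())  # [10.0, 40.0, 40.0] -> the matched 90.0 is spread over the larger rows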
Example #33
def plotCosts(series: pd.DataFrame, folder, suffix, xLabel=''):
    f = plt.figure(figsize=(10, 4))
    ax = f.add_subplot(121)
    fig = plt.bar(series.columns, series.iloc[0, :].values, color=colors)
    plt.xticks(rotation=45)
    plt.ylabel('Social Cost')
    plt.title('Base cost contribution')
    ax2 = f.add_subplot(122)
    lines = [
        plt.plot(series.index,
                 series.iloc[:, i] - series.iloc[0, i],
                 color=colors[i],
                 label=series.columns[i]) for i in range(series.shape[1])
    ]
    total = plt.plot(series.index,
                     series.sum(axis=1).values - series.iloc[0, :].sum(),
                     color='k',
                     linewidth=2.0,
                     label="Total")
    plt.legend(list(series.columns) + ['Total'])
    plt.xlabel(xLabel)
    plt.ylabel('Change in social cost')
    plt.title('Variation')
    f.tight_layout()
    plt.savefig(folder + '/costs' + suffix + '.png')
    return f
Example #34
def _trapezium_integration_variable(d_ti: pd.Series) -> Optional[float]:
    """Gapfill version of trap int - will fill out"""
    # Clear no numbers
    d_ti = d_ti.dropna()

    if d_ti.count() == 0:
        return None

    # One entry
    if d_ti.count() == 1:
        return d_ti[0] * 0.5

    # Fall back on average but warn to check data
    if d_ti.count() <= 3:
        d_sum = d_ti.sum()

        if d_sum == 0:
            return 0

        if d_ti.count() == 0:
            return 0

        return 0.5 * d_sum / d_ti.count()

    bucket_middle = d_ti.count() - 2

    bucket_middle_weights = [1] + [2] * bucket_middle + [1]

    weights = d_ti.values * bucket_middle_weights

    weights_sum = weights.sum()

    bucket_energy = 0.5 * weights_sum / ((d_ti.count() - 1) * 2)

    return bucket_energy
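A quick illustrative call (made-up interval values with one gap):

import numpy as np
import pandas as pd

d = pd.Series([0.0, 2.0, 4.0, np.nan, 4.0, 2.0])
print(_trapezium_integration_variable(d))  # 1.375 for these five non-null points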
Example #35
def test_td64_sum_empty(skipna):
    # GH#37151
    ser = Series([], dtype="timedelta64[ns]")

    result = ser.sum(skipna=skipna)
    assert isinstance(result, pd.Timedelta)
    assert result == pd.Timedelta(0)
Example #36
    def test_sum_inf(self):
        s = Series(np.random.randn(10))
        s2 = s.copy()

        s[5:8] = np.inf
        s2[5:8] = np.nan

        assert np.isinf(s.sum())

        arr = np.random.randn(100, 100).astype('f4')
        arr[:, 2] = np.inf

        with pd.option_context("mode.use_inf_as_na", True):
            tm.assert_almost_equal(s.sum(), s2.sum())

        res = nanops.nansum(arr, axis=1)
        assert np.isinf(res).all()
Example #37
def compute_weighted_std(series):
    """
    method for computing weighted standard deviation by duration
    :return: feature value
    """
    if len(series) <= 1:
        return 0.0
    values = series[:-1].values
    weights = Series(map(lambda x: float(x) / 10e8, series.index.values[1:] - series.index.values[:-1]))
    weights = weights.values
    w_avg = values * weights
    w_avg = w_avg.sum() / weights.sum() if weights.sum() > 0 else np.nan

    nonzero_w_num = float(len(weights[weights != 0]))
    std = (weights * (values - w_avg) ** 2).sum() / (((nonzero_w_num - 1) / float(
        nonzero_w_num) if nonzero_w_num > 0 else np.nan) * weights.sum()) if weights.sum() > 0 else np.nan
    return np.sqrt(std)
Example #38
    def Calls(self):
        rows = []
        for name,callTimes in self.times['call'].iteritems():
            s = Series(callTimes)
            func,loc = formatName(name)
            callCount = s.count()
            meanTime = s.mean()
            totalTime = s.sum()
            rows.append((func,loc,callCount,meanTime,totalTime))

        columns = ('FUNCTION', 'SOURCE', 'COUNT', 'MEAN', 'TOTAL')
        return DataFrame.from_records(rows, columns=columns, index=('FUNCTION', 'SOURCE'))
Example #39
    def Phases(self):
        rows = []
        for prefix in ('parse', 'compile', 'run'):
            for name,callTimes in self.times[prefix].iteritems():
                s = Series(callTimes)
                callCount = s.count()
                meanTime = s.mean()
                totalTime = s.sum()
                rows.append(("%s:%s" % (prefix,name),callCount,meanTime,totalTime))

        columns = ('PHASE', 'COUNT', 'MEAN', 'TOTAL')
        return DataFrame.from_records(rows, columns=columns, index=('PHASE'))
Example #40
    def test_pie_series(self):
        # if sum of values is less than 1.0, pie handle them as rate and draw
        # semicircle.
        series = Series(np.random.randint(1, 5),
                        index=['a', 'b', 'c', 'd', 'e'], name='YLABEL')
        ax = _check_plot_works(series.plot.pie)
        self._check_text_labels(ax.texts, series.index)
        assert ax.get_ylabel() == 'YLABEL'

        # without wedge labels
        ax = _check_plot_works(series.plot.pie, labels=None)
        self._check_text_labels(ax.texts, [''] * 5)

        # with less colors than elements
        color_args = ['r', 'g', 'b']
        ax = _check_plot_works(series.plot.pie, colors=color_args)

        color_expected = ['r', 'g', 'b', 'r', 'g']
        self._check_colors(ax.patches, facecolors=color_expected)

        # with labels and colors
        labels = ['A', 'B', 'C', 'D', 'E']
        color_args = ['r', 'g', 'b', 'c', 'm']
        ax = _check_plot_works(series.plot.pie, labels=labels,
                               colors=color_args)
        self._check_text_labels(ax.texts, labels)
        self._check_colors(ax.patches, facecolors=color_args)

        # with autopct and fontsize
        ax = _check_plot_works(series.plot.pie, colors=color_args,
                               autopct='%.2f', fontsize=7)
        pcts = ['{0:.2f}'.format(s * 100)
                for s in series.values / float(series.sum())]
        iters = [iter(series.index), iter(pcts)]
        expected_texts = list(next(it) for it in itertools.cycle(iters))
        self._check_text_labels(ax.texts, expected_texts)
        for t in ax.texts:
            assert t.get_fontsize() == 7

        # includes negative value
        with pytest.raises(ValueError):
            series = Series([1, 2, 0, 4, -1], index=['a', 'b', 'c', 'd', 'e'])
            series.plot.pie()

        # includes nan
        series = Series([1, 2, np.nan, 4], index=['a', 'b', 'c', 'd'],
                        name='YLABEL')
        ax = _check_plot_works(series.plot.pie)
        self._check_text_labels(ax.texts, ['a', 'b', '', 'd'])
Example #41
    def test_pie_series(self):
        # if sum of values is less than 1.0, pie handle them as rate and draw
        # semicircle.
        series = Series(np.random.randint(1, 5), index=["a", "b", "c", "d", "e"], name="YLABEL")
        ax = _check_plot_works(series.plot.pie)
        self._check_text_labels(ax.texts, series.index)
        self.assertEqual(ax.get_ylabel(), "YLABEL")

        # without wedge labels
        ax = _check_plot_works(series.plot.pie, labels=None)
        self._check_text_labels(ax.texts, [""] * 5)

        # with less colors than elements
        color_args = ["r", "g", "b"]
        ax = _check_plot_works(series.plot.pie, colors=color_args)

        color_expected = ["r", "g", "b", "r", "g"]
        self._check_colors(ax.patches, facecolors=color_expected)

        # with labels and colors
        labels = ["A", "B", "C", "D", "E"]
        color_args = ["r", "g", "b", "c", "m"]
        ax = _check_plot_works(series.plot.pie, labels=labels, colors=color_args)
        self._check_text_labels(ax.texts, labels)
        self._check_colors(ax.patches, facecolors=color_args)

        # with autopct and fontsize
        ax = _check_plot_works(series.plot.pie, colors=color_args, autopct="%.2f", fontsize=7)
        pcts = ["{0:.2f}".format(s * 100) for s in series.values / float(series.sum())]
        iters = [iter(series.index), iter(pcts)]
        expected_texts = list(next(it) for it in itertools.cycle(iters))
        self._check_text_labels(ax.texts, expected_texts)
        for t in ax.texts:
            self.assertEqual(t.get_fontsize(), 7)

        # includes negative value
        with tm.assertRaises(ValueError):
            series = Series([1, 2, 0, 4, -1], index=["a", "b", "c", "d", "e"])
            series.plot.pie()

        # includes nan
        series = Series([1, 2, np.nan, 4], index=["a", "b", "c", "d"], name="YLABEL")
        ax = _check_plot_works(series.plot.pie)
        self._check_text_labels(ax.texts, ["a", "b", "", "d"])
Example #42
def interrogator(
    corpus,
    search,
    query="any",
    show="w",
    exclude=False,
    excludemode="any",
    searchmode="all",
    dep_type="collapsed-ccprocessed-dependencies",
    case_sensitive=False,
    quicksave=False,
    just_speakers=False,
    preserve_case=False,
    lemmatag=False,
    files_as_subcorpora=False,
    conc=False,
    only_unique=False,
    random=False,
    only_format_match=False,
    multiprocess=False,
    spelling=False,
    regex_nonword_filter=r"[A-Za-z0-9:_]",
    gramsize=2,
    split_contractions=False,
    **kwargs
):
    """interrogate corpus, corpora, subcorpus and file objects

    see corpkit.interrogation.interrogate() for docstring"""
    # store kwargs
    locs = locals()

    from corpkit.interrogation import Interrogation
    from corpkit.process import tregex_engine
    import pandas as pd
    from pandas import DataFrame, Series
    from collections import Counter
    from corpkit.other import as_regex
    from corpkit.process import get_deps
    from time import localtime, strftime

    thetime = strftime("%H:%M:%S", localtime())
    from corpkit.textprogressbar import TextProgressBar
    from corpkit.process import animator
    from corpkit.dictionaries.word_transforms import wordlist, taglemma

    # find out if using gui
    root = kwargs.get("root")
    note = kwargs.get("note")

    # convert path to corpus object
    if type(corpus) == str:
        from corpkit.corpus import Corpus

        corpus = Corpus(corpus)

    # figure out how the user has entered the query and normalise
    from corpkit.process import searchfixer

    search, search_iterable = searchfixer(search, query)

    # for better printing of query, esp during multiprocess
    # can remove if multiprocess printing improved
    if len(search.keys()) == 1:
        query = search.values()[0]

    if "l" in show and search.get("t"):
        from nltk.stem.wordnet import WordNetLemmatizer

        lmtzr = WordNetLemmatizer()

    if type(show) == str:
        show = [show]

    def is_multiquery(corpus, search, query, just_speakers):
        """determine if multiprocessing is needed
        do some retyping if need be as well"""
        im = False
        from collections import OrderedDict

        if hasattr(corpus, "__iter__"):
            im = True
        # so we can do search = 't', query = ['NP', 'VP']:
        if type(query) == list:
            if query != search.values()[0] or len(search.keys()) > 1:
                query = {c.title(): c for c in query}
        if type(query) == dict or type(query) == OrderedDict:
            im = True
        if just_speakers:
            if just_speakers == "each":
                im = True
                just_speakers = ["each"]
            if just_speakers == ["each"]:
                im = True
            if type(just_speakers) == str:
                im = False
                just_speakers = [just_speakers]
            if type(just_speakers) == list:
                if len(just_speakers) > 1:
                    im = True
        if type(search) == dict:
            if all(type(i) == dict for i in search.values()):
                im = True
        return im, corpus, search, query, just_speakers

    def slow_tregex(sents, **dummy_args):
        """do the speaker-specific version of tregex queries"""
        import os
        from corpkit.process import tregex_engine

        # first, put the relevant trees into temp file
        if kwargs.get("outname"):
            to_open = "tmp-%s.txt" % kwargs["outname"]
        else:
            to_open = "tmp.txt"
        to_write = "\n".join([sent._parse_string.strip() for sent in sents if sent.parse_string is not None])
        to_write.encode("utf-8", errors="ignore")
        with open(to_open, "w") as fo:
            fo.write(to_write)
        q = search.values()[0]
        res = tregex_engine(
            query=q, options=["-o", "-%s" % translated_option], corpus=to_open, root=root, preserve_case=True
        )
        if root:
            root.update()
        os.remove(to_open)
        if countmode:
            return len(res)
        else:
            return res

    def get_stats(sents, **dummy_args):
        """get a bunch of frequencies on interpersonal phenomena"""
        import os
        import re
        from collections import Counter

        statsmode_results = Counter()
        # first, put the relevant trees into temp file
        if kwargs.get("outname"):
            to_open = "tmp-%s.txt" % kwargs["outname"]
        else:
            to_open = "tmp.txt"
        with open(to_open, "w") as fo:
            for sent in sents:
                statsmode_results["Sentences"] += 1
                sts = sent.parse_string.rstrip()
                encd = sts.encode("utf-8", errors="ignore") + "\n"
                fo.write(encd)
                deps = get_deps(sent, dep_type)
                numpass = len([x for x in deps.links if x.type.endswith("pass")])
                statsmode_results["Passives"] += numpass
                statsmode_results["Tokens"] += len(sent.tokens)
                words = [w.word for w in sent.tokens if w.word.isalnum()]
                statsmode_results["Words"] += len(words)
                statsmode_results["Characters"] += len("".join(words))

        # count moods via trees          (/\?/ !< __)
        from dictionaries.process_types import processes
        from corpkit.other import as_regex

        tregex_qs = {
            "Imperative": r"ROOT < (/(S|SBAR)/ < (VP !< VBD !< VBG !$ NP !$ SBAR < NP !$-- S !$-- VP !$ VP)) !<< (/\?/ !< __) !<<- /-R.B-/ !<<, /(?i)^(-l.b-|hi|hey|hello|oh|wow|thank|thankyou|thanks|welcome)$/",
            "Open interrogative": r"ROOT < SBARQ <<- (/\?/ !< __)",
            "Closed interrogative": r"ROOT ( < (SQ < (NP $+ VP)) << (/\?/ !< __) | < (/(S|SBAR)/ < (VP $+ NP)) <<- (/\?/ !< __))",
            "Unmodalised declarative": r"ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP !< MD)))",
            "Modalised declarative": r"ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP < MD)))",
            "Open class words": r"/^(NN|JJ|VB|RB)/ < __",
            "Closed class words": r"__ !< __ !> /^(NN|JJ|VB|RB)/",
            "Clauses": r"/^S/ < __",
            "Interrogative": r"ROOT << (/\?/ !< __)",
            "Mental processes": r"VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)" % as_regex(processes.mental, boundaries="w"),
            "Verbal processes": r"VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)" % as_regex(processes.verbal, boundaries="w"),
            "Relational processes": r"VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)"
            % as_regex(processes.relational, boundaries="w"),
        }

        for name, q in sorted(tregex_qs.items()):
            res = tregex_engine(query=q, options=["-o", "-C"], corpus=to_open, root=root)
            statsmode_results[name] += int(res)
            global numdone
            numdone += 1
            if root:
                root.update()
            else:
                tot_string = str(numdone + 1) + "/" + str(total_files)
                if kwargs.get("outname"):
                    tot_string = "%s: %s" % (kwargs["outname"], tot_string)
                animator(p, numdone, tot_string, **par_args)
            if kwargs.get("note", False):
                kwargs["note"].progvar.set((numdone * 100.0 / total_files / denom) + startnum)
        os.remove(to_open)
        return statsmode_results

    def make_conc_lines_from_whole_mid(wholes, middle_column_result, speakr=False):
        if speakr is False:
            speakr = ""
        conc_lines = []
        # remove duplicates from results
        unique_wholes = []
        unique_middle_column_result = []
        duplicates = []
        for index, ((f, whole), mid) in enumerate(zip(wholes, middle_column_result)):
            if "-join-".join([f, whole, mid]) not in duplicates:
                duplicates.append("-join-".join([f, whole, mid]))
                unique_wholes.append([f, whole])
                unique_middle_column_result.append(mid)

        # split into start, middle and end, dealing with multiple occurrences
        for index, ((f, whole), mid) in enumerate(zip(unique_wholes, unique_middle_column_result)):
            reg = re.compile(r"([^a-zA-Z0-9-]|^)(" + re.escape(mid) + r")([^a-zA-Z0-9-]|$)", re.IGNORECASE | re.UNICODE)
            offsets = [(m.start(), m.end()) for m in re.finditer(reg, whole)]
            for offstart, offend in offsets:
                start, middle, end = whole[0:offstart].strip(), whole[offstart:offend].strip(), whole[offend:].strip()
                conc_lines.append([os.path.basename(f), speakr, start, middle, end])
        return conc_lines

    def uniquify(conc_lines):
        from collections import OrderedDict

        unique_lines = []
        checking = []
        for index, (f, speakr, start, middle, end) in enumerate(conc_lines):
            joined = " ".join([speakr, start, "MIDDLEHERE:", middle, ":MIDDLEHERE", end])
            if joined not in checking:
                unique_lines.append(conc_lines[index])
            checking.append(joined)
        return unique_lines

    def lemmatiser(list_of_words, tag):
        """take a list of unicode words and a tag and return a lemmatised list."""
        output = []
        for word in list_of_words:
            if translated_option.startswith("u"):
                if word.lower() in taglemma.keys():
                    word = taglemma[word.lower()]
                else:
                    if word == "x":
                        word = "Other"
            # only use wordnet lemmatiser when appropriate
            else:
                if word in wordlist:
                    word = wordlist[word]
                word = lmtzr.lemmatize(word, tag)
            output.append(word)
        return output

    def gettag(query, lemmatag=False):
        """
        Find tag for WordNet lemmatisation
        """
        import re

        tagdict = {"N": "n", "A": "a", "V": "v", "A": "r", "None": False, "": False, "Off": False}

        if lemmatag is False:
            tag = "n"  # same default as wordnet
            # attempt to find tag from tregex query
            tagfinder = re.compile(r"^[^A-Za-z]*([A-Za-z]*)")
            tagchecker = re.compile(r"^[A-Z]{1,4}$")
            qr = query.replace(r"\w", "").replace(r"\s", "").replace(r"\b", "")
            treebank_tag = re.findall(tagfinder, qr)
            if re.match(tagchecker, treebank_tag[0]):
                tag = tagdict.get(treebank_tag[0], "n")
        elif lemmatag:
            tag = lemmatag
        return tag

    def format_tregex(results):
        """format tregex by show list"""
        if countmode:
            return results
        import re

        done = []
        if "l" in show or "pl" in show:
            lemmata = lemmatiser(results, gettag(search.get("t"), lemmatag))
        else:
            lemmata = [None for i in results]
        for word, lemma in zip(results, lemmata):
            bits = []
            if exclude and exclude.get("w"):
                if len(exclude.keys()) == 1 or excludemode == "any":
                    if re.search(exclude.get("w"), word):
                        continue
                if len(exclude.keys()) == 1 or excludemode == "any":
                    if re.search(exclude.get("l"), lemma):
                        continue
                if len(exclude.keys()) == 1 or excludemode == "any":
                    if re.search(exclude.get("p"), word):
                        continue
                if len(exclude.keys()) == 1 or excludemode == "any":
                    if re.search(exclude.get("pl"), lemma):
                        continue
            if exclude and excludemode == "all":
                num_to_cause_exclude = len(exclude.keys())
                current_num = 0
                if exclude.get("w"):
                    if re.search(exclude.get("w"), word):
                        current_num += 1
                if exclude.get("l"):
                    if re.search(exclude.get("l"), lemma):
                        current_num += 1
                if exclude.get("p"):
                    if re.search(exclude.get("p"), word):
                        current_num += 1
                if exclude.get("pl"):
                    if re.search(exclude.get("pl"), lemma):
                        current_num += 1
                if current_num == num_to_cause_exclude:
                    continue

            for i in show:
                if i == "t":
                    bits.append(word)
                if i == "l":
                    bits.append(lemma)
                elif i == "w":
                    bits.append(word)
                elif i == "p":
                    bits.append(word)
                elif i == "pl":
                    bits.append(lemma)
            joined = "/".join(bits)
            done.append(joined)
        return done

    def tok_by_list(pattern, list_of_toks, concordancing=False, **kwargs):
        """search for regex in plaintext corpora"""
        import re

        if type(pattern) == str:
            pattern = [pattern]
        if not case_sensitive:
            pattern = [p.lower() for p in pattern]
        if not concordancing:
            if case_sensitive:
                matches = [m for m in list_of_toks if m in pattern]
            else:
                matches = [m for m in list_of_toks if m.lower() in pattern]
        else:
            matches = []
            for index, token in enumerate(list_of_toks):
                if token in pattern:
                    match = [" ".join([t for t in unsplitter(list_of_toks[:index])])[-140:]]
                    match.append(token)
                    match.append(" ".join([t for t in unsplitter(list_of_toks[index + 1 :])])[:140])
                    matches.append(match)
        if countmode:
            return len(matches)
        else:
            return matches

    def unsplitter(lst):
        """unsplit contractions and apostophes from tokenised text"""
        if split_contractions:
            return lst
        unsplit = []
        for index, t in enumerate(lst):
            if index == 0 or index == len(lst) - 1:
                unsplit.append(t)
                continue
            if "'" in t and not t.endswith("'"):
                rejoined = "".join([lst[index - 1], t])
                unsplit.append(rejoined)
            else:
                if not "'" in lst[index + 1]:
                    unsplit.append(t)
        return unsplit

    def tok_ngrams(pattern, list_of_toks, concordancing=False, split_contractions=True):
        from collections import Counter
        import re

        ngrams = Counter()
        result = []
        # if it's not a compiled regex
        list_of_toks = [x for x in list_of_toks if re.search(regex_nonword_filter, x)]
        if pattern.lower() == "any":
            pattern = r".*"

        if not split_contractions:
            list_of_toks = unsplitter(list_of_toks)

            # list_of_toks = [x for x in list_of_toks if "'" not in x]
        for index, w in enumerate(list_of_toks):
            try:
                the_gram = [list_of_toks[index + x] for x in range(gramsize)]
                if not any(re.search(pattern, x) for x in the_gram):
                    continue
                ngrams[" ".join(the_gram)] += 1
            except IndexError:
                pass

        # turn counter into list of results
        for k, v in ngrams.items():
            if v > 1:
                for i in range(v):
                    result.append(k)
        if countmode:
            return len(result)
        else:
            return result

    def compiler(pattern):
        """compile regex or fail gracefully"""
        import re

        try:
            if case_sensitive:
                comped = re.compile(pattern)
            else:
                comped = re.compile(pattern, re.IGNORECASE)
            return comped
        except:
            import traceback
            import sys
            from time import localtime, strftime

            exc_type, exc_value, exc_traceback = sys.exc_info()
            lst = traceback.format_exception(exc_type, exc_value, exc_traceback)
            error_message = lst[-1]
            thetime = strftime("%H:%M:%S", localtime())
            print "%s: Query %s" % (thetime, error_message)
            if root:
                return "Bad query"
            else:
                raise ValueError("%s: Query %s" % (thetime, error_message))

    def tok_by_reg(pattern, list_of_toks, concordancing=False, **kwargs):
        """search for regex in plaintext corpora"""
        import re

        comped = compiler(pattern)
        if comped == "Bad query":
            return "Bad query"
        if not concordancing:
            matches = [m for m in list_of_toks if re.search(comped, m)]
        else:
            matches = []
            for index, token in enumerate(list_of_toks):
                if re.search(comped, token):
                    match = [" ".join([t for t in unsplitter(list_of_toks[:index])])[-140:]]
                    match.append(re.search(comped, token).group(0))
                    match.append(" ".join([t for t in unsplitter(list_of_toks[index + 1 :])])[:140])
                    matches.append(match)
        if countmode:
            return len(matches)
        else:
            return matches

    def plaintext_regex_search(pattern, plaintext_data, concordancing=False, **kwargs):
        """search for regex in plaintext corpora

        it searches over lines, so the user needs to be careful.
        """
        import re

        if concordancing:
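            # wrap the query so up to 140 characters of left and right context
            # are captured in separate groups alongside the match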
            pattern = r"(.{,140})\b(" + pattern + r")\b(.{,140})"
        compiled_pattern = compiler(pattern)
        if compiled_pattern == "Bad query":
            return "Bad query"
        matches = re.findall(compiled_pattern, plaintext_data)
        if concordancing:
            matches = [list(m) for m in matches]
        if not concordancing:
            for index, i in enumerate(matches):
                if type(i) == tuple:
                    matches[index] = i[0]
        if countmode:
            return len(matches)
        else:
            return matches

    def correct_spelling(a_string):
        if not spelling:
            return a_string
        from dictionaries.word_transforms import usa_convert

        if spelling.lower() == "uk":
            usa_convert = {v: k for k, v in usa_convert.items()}
        bits = a_string.split("/")
        for index, i in enumerate(bits):
            converted = usa_convert.get(i.lower(), i)
            if i.islower() or preserve_case is False:
                converted = converted.lower()
            elif i.isupper() and preserve_case:
                converted = converted.upper()
            elif i.istitle() and preserve_case:
                converted = converted.title()
            bits[index] = converted
        r = "/".join(bits)
        return r

    def plaintext_simple_search(pattern, plaintext_data, concordancing=False, **kwargs):
        """search for tokens in plaintext corpora"""
        import re

        result = []
        if type(pattern) == str:
            pattern = [pattern]
        for p in pattern:
            if concordancing:
                pat = r"(.{0,140})\b(" + re.escape(p) + r")\b(.{0,140})"
            else:
                pat = r"\b" + re.escape(p) + r"\b"
            pat = compiler(pat)
            if pat == "Bad query":
                return "Bad query"
            matches = re.findall(pat, plaintext_data)
            if concordancing:
                matches = [list(m) for m in matches]
                for i in matches:
                    result.append(i)
            else:
                for m in range(len(matches)):
                    result.append(p)
        return result

    # do multiprocessing if need be
    im, corpus, search, query, just_speakers = is_multiquery(corpus, search, query, just_speakers)

    locs["search"] = search
    locs["query"] = query
    locs["just_speakers"] = just_speakers
    locs["corpus"] = corpus
    locs["multiprocess"] = multiprocess

    if im:
        from corpkit.multiprocess import pmultiquery

        return pmultiquery(**locs)

    datatype = corpus.datatype
    singlefile = corpus.singlefile

    # store all results in here
    results = {}
    # check if just counting
    countmode = "c" in show
    # where we are at in interrogation
    current_iter = 0

    # multiprocessing progress bar
    denom = kwargs.get("denominator", 1)
    startnum = kwargs.get("startnum", 0)

    ############################################
    # Determine the search function to be used #
    ############################################

    # simple tregex is tregex over whole dirs
    simple_tregex_mode = False
    statsmode = False
    if not just_speakers and "t" in search.keys():
        simple_tregex_mode = True
    else:
        if corpus.datatype == "plaintext":
            if search.get("n"):
                raise NotImplementedError("Use a tokenised corpus for n-gramming.")
                # searcher = plaintext_ngram
                optiontext = "n-grams via plaintext"
            if search.get("w"):
                if kwargs.get("regex", True):
                    searcher = plaintext_regex_search
                else:
                    searcher = plaintext_simple_search
                optiontext = "Searching plaintext"

        elif corpus.datatype == "tokens":
            if search.get("n"):
                searcher = tok_ngrams
                optiontext = "n-grams via tokens"
            elif search.get("w"):
                if kwargs.get("regex", True):
                    searcher = tok_by_reg
                else:
                    searcher = tok_by_list
                if type(search.get("w")) == list:
                    searcher = tok_by_list
                optiontext = "Searching tokens"
        only_parse = ["r", "d", "g", "dl", "gl", "df", "gf", "dp", "gp", "f"]
        if corpus.datatype != "parse" and any(i in only_parse for i in search.keys()):
            raise ValueError(
                'Need parsed corpus to search with "%s" option(s).'
                % ", ".join([i for i in search.keys() if i in only_parse])
            )

        elif corpus.datatype == "parse":
            if search.get("t"):
                searcher = slow_tregex
            elif search.get("s"):
                searcher = get_stats
                statsmode = True
                optiontext = "General statistics"
                global numdone
                numdone = 0
            else:
                from corpkit.depsearch import dep_searcher

                searcher = dep_searcher
                optiontext = "Dependency querying"

    ############################################
    #      Set some Tregex-related values      #
    ############################################

    if search.get("t"):
        query = search.get("t")

        # check the query
        q = tregex_engine(corpus=False, query=search.get("t"), options=["-t"], check_query=True, root=root)
        if q is False:
            if root:
                return "Bad query"
            else:
                return

        optiontext = "Searching parse trees"
        if "p" in show or "pl" in show:
            translated_option = "u"
            if type(search["t"]) == list:
                search["t"] = r"__ < (/%s/ !< __)" % as_regex(
                    search["t"], boundaries="line", case_sensitive=case_sensitive
                )
            if search["t"] == "any":
                search["t"] = r"__ < (/.?[A-Za-z0-9].?/ !< __)"
        elif "t" in show:
            translated_option = "o"
            if type(search["t"]) == list:
                search["t"] = r"__ < (/%s/ !< __)" % as_regex(
                    search["t"], boundaries="line", case_sensitive=case_sensitive
                )
            if search["t"] == "any":
                search["t"] = r"__ < (/.?[A-Za-z0-9].?/ !< __)"
        elif "w" in show:
            translated_option = "t"
            if type(search["t"]) == list:
                search["t"] = r"/%s/ !< __" % as_regex(search["t"], boundaries="line", case_sensitive=case_sensitive)
            if search["t"] == "any":
                search["t"] = r"/.?[A-Za-z0-9].?/ !< __"
        elif "c" in show:
            count_results = {}
            only_count = True
            translated_option = "C"
            if type(search["t"]) == list:
                search["t"] = r"/%s/ !< __" % as_regex(search["t"], boundaries="line", case_sensitive=case_sensitive)
            if search["t"] == "any":
                search["t"] = r"/.?[A-Za-z0-9].?/ !< __"
        elif "l" in show:
            translated_option = "t"
            if type(search["t"]) == list:
                search["t"] = r"/%s/ !< __" % as_regex(search["t"], boundaries="line", case_sensitive=case_sensitive)
            if search["t"] == "any":
                search["t"] = r"/.?[A-Za-z0-9].?/ !< __"

        query = search["t"]

    ############################################
    # Make iterable for corpus/subcorpus/file  #
    ############################################

    if corpus.singlefile:
        to_iterate_over = {(corpus.name, corpus.path): [corpus]}
    elif not corpus.subcorpora:
        to_iterate_over = {(corpus.name, corpus.path): corpus.files}
    else:
        to_iterate_over = {}
        for k, v in sorted(corpus.structure.items()):
            to_iterate_over[(k.name, k.path)] = v
    if files_as_subcorpora:
        to_iterate_over = {}
        for f in corpus.files:
            to_iterate_over[(f.name, f.path)] = [f]

    ############################################
    #           Print welcome message          #
    ############################################

    if conc:
        message = "Concordancing"
    else:
        message = "Interrogating"
    if kwargs.get("printstatus", True):
        thetime = strftime("%H:%M:%S", localtime())

        sformat = "\n                 ".join(["%s: %s" % (k.rjust(3), v) for k, v in search.items()])
        if search == {"s": r".*"}:
            sformat = "features"
        welcome = "\n%s: %s %s ...\n          %s\n          Query: %s\n" % (
            thetime,
            message,
            corpus.name,
            optiontext,
            sformat,
        )
        print welcome

    ############################################
    #           Make progress bar              #
    ############################################

    if simple_tregex_mode:
        total_files = len(to_iterate_over.keys())
    else:
        if search.get("s"):
            total_files = sum([len(x) for x in to_iterate_over.values()]) * 12
        else:
            total_files = sum([len(x) for x in to_iterate_over.values()])

    par_args = {"printstatus": kwargs.get("printstatus", True), "root": root, "note": note, "length": total_files}

    term = None
    if kwargs.get("paralleling", None) is not None:
        from blessings import Terminal

        term = Terminal()
        par_args["terminal"] = term
        par_args["linenum"] = kwargs.get("paralleling")

    outn = kwargs.get("outname", "")
    if outn:
        outn = outn + ": "
    tstr = "%s%d/%d" % (outn, current_iter, total_files)
    p = animator(None, None, init=True, tot_string=tstr, **par_args)
    tstr = "%s%d/%d" % (outn, current_iter + 1, total_files)
    animator(p, current_iter, tstr, **par_args)

    ############################################
    # Iterate over data, doing interrogations  #
    ############################################

    for (subcorpus_name, subcorpus_path), files in sorted(to_iterate_over.items()):

        if countmode or conc:
            results[subcorpus_name] = []
        else:
            results[subcorpus_name] = Counter()

        # tregex over subcorpora, not files
        if simple_tregex_mode:

            op = ["-o", "-" + translated_option]
            result = tregex_engine(
                query=search["t"], options=op, corpus=subcorpus_path, root=root, preserve_case=preserve_case
            )

            if countmode:
                results[subcorpus_name].append(result)
                continue

            result = Counter(format_tregex(result))

            if conc:
                op.append("-w")
                whole_result = tregex_engine(
                    query=search["t"], options=op, corpus=subcorpus_path, root=root, preserve_case=preserve_case
                )

                if not only_format_match:
                    whole_result = format_tregex(whole_result)

                result = make_conc_lines_from_whole_mid(whole_result, result, speakr=False)

                if spelling:
                    for index, line in enumerate(result):
                        result[index] = [correct_spelling(b) for b in line]

            results[subcorpus_name] += result

            current_iter += 1
            if kwargs.get("paralleling", None) is not None:
                tstr = "%s%d/%d" % (outn, current_iter + 2, total_files)
            else:
                tstr = "%s%d/%d" % (outn, current_iter + 1, total_files)
            animator(p, current_iter, tstr, **par_args)

        # dependencies, plaintext, tokens or slow_tregex
        else:
            for f in files:

                if corpus.datatype == "parse":
                    with open(f.path, "r") as data:
                        data = data.read()
                        from corenlp_xml.document import Document

                        try:
                            corenlp_xml = Document(data)
                        except:
                            print "Could not read file: %s" % f.path
                            continue
                        if just_speakers:
                            sents = [s for s in corenlp_xml.sentences if s.speakername in just_speakers]
                            if not sents:
                                continue
                        else:
                            sents = corenlp_xml.sentences

                        res = searcher(
                            sents,
                            search=search,
                            show=show,
                            dep_type=dep_type,
                            exclude=exclude,
                            excludemode=excludemode,
                            searchmode=searchmode,
                            lemmatise=False,
                            case_sensitive=case_sensitive,
                            concordancing=conc,
                            only_format_match=only_format_match,
                        )

                        if res == "Bad query":
                            return "Bad query"

                        if searcher == slow_tregex and not countmode:
                            res = format_tregex(res)

                elif corpus.datatype == "tokens":
                    import pickle

                    with open(f.path, "rb") as fo:
                        data = pickle.load(fo)
                    res = searcher(search.values()[0], data, split_contractions=split_contractions, concordancing=conc)
                    if conc:
                        for index, line in enumerate(res):
                            line.insert(0, "")

                elif corpus.datatype == "plaintext":
                    with open(f.path, "rb") as data:
                        data = data.read()
                        data = unicode(data, errors="ignore")
                        res = searcher(search.values()[0], data, concordancing=conc)
                        if conc:
                            for index, line in enumerate(res):
                                line.insert(0, "")

                if countmode:
                    results[subcorpus_name].append(res)
                    continue

                # add filename and do lowercasing for conc
                if conc:
                    for index, line in enumerate(res):
                        line.insert(0, f.name)
                        if not preserve_case:
                            line = [b.lower() for b in line]
                        if spelling:
                            line = [correct_spelling(b) for b in line]
                        results[subcorpus_name] += [line]

                # do lowercasing and spelling
                else:
                    if not preserve_case:
                        res = [r.lower() for r in res]
                    if spelling:
                        res = [correct_spelling(r) for r in res]
                    results[subcorpus_name] += Counter(res)

                if not statsmode:
                    current_iter += 1
                    if kwargs.get("paralleling", None) is not None:
                        tstr = "%s%d/%d" % (outn, current_iter + 2, total_files)
                    else:
                        tstr = "%s%d/%d" % (outn, current_iter + 1, total_files)
                    animator(p, current_iter, tstr, **par_args)

    # delete temp file if there
    import os

    if os.path.isfile("tmp.txt"):
        os.remove("tmp.txt")

    ############################################
    #     Get concordances into DataFrame      #
    ############################################

    if conc:
        all_conc_lines = []
        for sc_name, resu in sorted(results.items()):

            if only_unique:
                unique_results = uniquify(resu)
            else:
                unique_results = resu
            # make into series
            pindex = "c f s l m r".encode("utf-8").split()
            for fname, spkr, start, word, end in unique_results:
                spkr = unicode(spkr, errors="ignore")
                fname = os.path.basename(fname)

                # the use of ascii here makes sure the string formats ok, but will also screw over
                # anyone doing non-english work. so, change to utf-8, then fix errors as they come
                # in the corpkit-gui "add_conc_lines_to_window" function
                all_conc_lines.append(
                    Series(
                        [
                            sc_name.encode("ascii", errors="ignore"),
                            fname.encode("ascii", errors="ignore"),
                            spkr.encode("ascii", errors="ignore"),
                            start.encode("ascii", errors="ignore"),
                            word.encode("ascii", errors="ignore"),
                            end.encode("ascii", errors="ignore"),
                        ],
                        index=pindex,
                    )
                )

        # randomise results...
        if random:
            from random import shuffle

            shuffle(all_conc_lines)

        df = pd.concat(all_conc_lines, axis=1).T

        # not doing anything yet --- this is for multimodal concordancing
        add_links = False
        if not add_links:
            df.columns = ["c", "f", "s", "l", "m", "r"]
        else:
            df.columns = ["c", "f", "s", "l", "m", "r", "link"]

        if all(x == "" for x in list(df["s"].values)):
            df.drop("s", axis=1, inplace=True)

        if kwargs.get("note"):
            kwargs["note"].progvar.set(100)

        if kwargs.get("printstatus", True):
            thetime = strftime("%H:%M:%S", localtime())
            finalstring = "\n\n%s: Concordancing finished! %d matches.\n" % (thetime, len(df.index))
            print finalstring

        from corpkit.interrogation import Concordance

        output = Concordance(df)
        output.query = locs
        if quicksave:
            output.save()
        return output

    ############################################
    #     Get interrogation into DataFrame     #
    ############################################

    else:
        if countmode:
            df = Series({k: sum(v) for k, v in sorted(results.items())})
            tot = df.sum()
        else:
            the_big_dict = {}
            unique_results = set([item for sublist in results.values() for item in sublist])
            for word in unique_results:
                the_big_dict[word] = [results[subc][word] for subc in sorted(results.keys())]
            # turn master dict into dataframe, sorted
            df = DataFrame(the_big_dict, index=sorted(results.keys()))

            numentries = len(df.columns)
            tot = df.sum(axis=1)
            total_total = df.sum().sum()

        ############################################
        # Format, output as Interrogation object   #
        ############################################

        if not countmode:
            if not corpus.subcorpora or singlefile:
                if not files_as_subcorpora:
                    if not kwargs.get("df1_always_df"):
                        df = Series(df.ix[0])
                        df.sort(ascending=False)
                        tot = df.sum()
                        numentries = len(df.index)
                        total_total = tot

        # sort by total
        if type(df) == pd.core.frame.DataFrame:
            if not df.empty:
                df.ix["Total-tmp"] = df.sum()
                the_tot = df.ix["Total-tmp"]
                df = df[the_tot.argsort()[::-1]]
                df = df.drop("Total-tmp", axis=0)

        # format final string
        if kwargs.get("printstatus", True):
            thetime = strftime("%H:%M:%S", localtime())
            finalstring = "\n\n%s: Interrogation finished!" % thetime
            if countmode:
                finalstring += " %d matches." % tot
            else:
                finalstring += " %d unique results, %d total occurrences." % (numentries, total_total)
            print finalstring

        interro = Interrogation(results=df, totals=tot, query=locs)

        if quicksave:
            interro.save()

        return interro

def normalize_confusion_matrix(cm: pd.Series) -> pd.Series:
    return cm / cm.sum()
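
# A minimal usage sketch for the helper above (the labels are illustrative,
# not from the source): dividing by the sum turns raw counts into proportions.
#
#     import pandas as pd
#     counts = pd.Series({"TP": 50, "FP": 10, "FN": 5, "TN": 35})
#     normalize_confusion_matrix(counts)  # each value becomes count / total; sums to 1.0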
Example #44
0
def editor(interrogation, 
           operation=None,
           denominator=False,
           sort_by=False,
           keep_stats=False,
           keep_top=False,
           just_totals=False,
           threshold='medium',
           just_entries=False,
           skip_entries=False,
           merge_entries=False,
           just_subcorpora=False,
           skip_subcorpora=False,
           span_subcorpora=False,
           merge_subcorpora=False,
           replace_names=False,
           replace_subcorpus_names=False,
           projection=False,
           remove_above_p=False,
           p=0.05, 
           print_info=False,
           spelling=False,
           selfdrop=True,
           calc_all=True,
           keyword_measure='ll',
           **kwargs
          ):
    """
    See corpkit.interrogation.Interrogation.edit() for docstring
    """

    # grab arguments, in case we get dict input and have to iterate
    locs = locals()

    import corpkit

    import re
    import collections
    import pandas as pd
    import numpy as np

    from pandas import DataFrame, Series
    from time import localtime, strftime
    
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        have_ipython = False
    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass

    # to use if we also need to worry about concordance lines
    return_conc = False

    from corpkit.interrogation import Interrodict, Interrogation, Concordance
    if interrogation.__class__ == Interrodict:
        locs.pop('interrogation', None)
        from collections import OrderedDict
        outdict = OrderedDict()
        for i, (k, v) in enumerate(interrogation.items()):
            # only print the first time around
            if i != 0:
                locs['print_info'] = False

            if isinstance(denominator, STRINGTYPE) and denominator.lower() == 'self':
                denominator = interrogation

            # if df2 is also a dict, get the relevant entry

            if isinstance(denominator, (dict, Interrodict)):
                #if sorted(set([i.lower() for i in list(dataframe1.keys())])) == \
                #   sorted(set([i.lower() for i in list(denominator.keys())])):
                #   locs['denominator'] = denominator[k]

                # fix: this repeats itself for every key, when it doesn't need to
                # denominator_sum: 
                if kwargs.get('denominator_sum'):
                    locs['denominator'] = denominator.collapse(axis='key')
                elif kwargs.get('denominator_totals'):
                    locs['denominator'] = denominator[k].totals
                else:
                    locs['denominator'] = denominator[k].results


            outdict[k] = v.results.edit(**locs)
        if print_info:
            
            thetime = strftime("%H:%M:%S", localtime())
            print("\n%s: Finished! Output is a dictionary with keys:\n\n         '%s'\n" % (thetime, "'\n         '".join(sorted(outdict.keys()))))
        return Interrodict(outdict)

    elif isinstance(interrogation, (DataFrame, Series)):
        dataframe1 = interrogation
    elif isinstance(interrogation, Interrogation):
        #if interrogation.__dict__.get('concordance', None) is not None:
        #    concordances = interrogation.concordance
        branch = kwargs.pop('branch', 'results')
        if branch.lower().startswith('r') :
            dataframe1 = interrogation.results
        elif branch.lower().startswith('t'):
            dataframe1 = interrogation.totals
        elif branch.lower().startswith('c'):
            dataframe1 = interrogation.concordance
            return_conc = True
        else:
            dataframe1 = interrogation.results
    
    elif isinstance(interrogation, Concordance) or \
                        all(x in list(getattr(interrogation, 'columns', [])) for x in ['l', 'm', 'r']):
        return_conc = True
        dataframe1 = interrogation
    # hope for the best
    else:
        dataframe1 = interrogation

    the_time_started = strftime("%Y-%m-%d %H:%M:%S")

    pd.options.mode.chained_assignment = None

    try:
        from process import checkstack
    except ImportError:
        from corpkit.process import checkstack
        
    if checkstack('pythontex'):
        print_info=False

    def combiney(df, df2, operation='%', threshold='medium', prinf=True):
        """mash df and df2 together in appropriate way"""
        totals = False
        # delete under threshold
        if just_totals:
            if using_totals:
                if not single_totals:
                    to_drop = list(df2[df2['Combined total'] < threshold].index)
                    df = df.drop([e for e in to_drop if e in list(df.index)])
                    if prinf:
                        to_show = []
                        [to_show.append(w) for w in to_drop[:5]]
                        if len(to_drop) > 10:
                            to_show.append('...')
                            [to_show.append(w) for w in to_drop[-5:]]
                        if len(to_drop) > 0:
                            print('Removing %d entries below threshold:\n    %s' % (len(to_drop), '\n    '.join(to_show)))
                        if len(to_drop) > 10:
                            print('... and %d more ... \n' % (len(to_drop) - len(to_show) + 1))
                        else:
                            print('')
                else:
                    denom = df2
        else:
            denom = list(df2)
        if single_totals:
            if operation == '%':
                totals = df.sum() * 100.0 / float(df.sum().sum())
                df = df * 100.0
                try:
                    df = df.div(denom, axis=0)
                except ValueError:
                    
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime)
            elif operation == '+':
                try:
                    df = df.add(denom, axis=0)
                except ValueError:
                    
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime)
            elif operation == '-':
                try:
                    df = df.sub(denom, axis=0)
                except ValueError:
                    
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime)
            elif operation == '*':
                totals = df.sum() * float(df.sum().sum())
                try:
                    df = df.mul(denom, axis=0)
                except ValueError:
                    
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime)
            elif operation == '/':
                try:
                    totals = df.sum() / float(df.sum().sum())
                    df = df.div(denom, axis=0)
                except ValueError:
                    
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime)

            elif operation == 'a':
                for c in [c for c in list(df.columns) if int(c) > 1]:
                    df[c] = df[c] * (1.0 / int(c))
                df = df.sum(axis=1) / df2
            
            elif operation.startswith('c'):
                import warnings
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    df = pd.concat([df, df2], axis=1)
            return df, totals

        elif not single_totals:
            if not operation.startswith('a'):
                # generate totals
                if operation == '%':
                    totals = df.sum() * 100.0 / float(df2.sum().sum())
                if operation == '*':
                    totals = df.sum() * float(df2.sum().sum())
                if operation == '/':
                    totals = df.sum() / float(df2.sum().sum())
                if operation.startswith('c'):
                    # add here the info that merging will not work 
                    # with identical colnames
                    import warnings
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        d = pd.concat([df.T, df2.T])
                        # make index nums
                        d = d.reset_index()
                        # sum and remove duplicates
                        d = d.groupby('index').sum()
                        dx = d.reset_index('index')
                        dx.index = list(dx['index'])
                        df = dx.drop('index', axis=1).T

                def editf(datum):
                    meth = {'%': datum.div,
                            '*': datum.mul,
                            '/': datum.div,
                            '+': datum.add,
                            '-': datum.sub}

                    if datum.name in list(df2.columns):

                        method = meth[operation]
                        mathed = method(df2[datum.name], fill_value=0.0)
                        if operation == '%':
                            return mathed * 100.0
                        else:
                            return mathed
                    else:
                        return datum * 0.0

                df = df.apply(editf)

            else:
                for c in [c for c in list(df.columns) if int(c) > 1]:
                    df[c] = df[c] * (1.0 / int(c))
                df = df.sum(axis=1) / df2.T.sum()

        return df, totals

    def parse_input(df, the_input):
        """turn whatever has been passed in into list of words that can 
           be used as pandas indices---maybe a bad way to go about it"""
        parsed_input = False
        import re
        if the_input == 'all':
            the_input = r'.*'
        if isinstance(the_input, int):
            try:
                the_input = str(the_input)
            except:
                pass
            the_input = [the_input]
        elif isinstance(the_input, STRINGTYPE):
            regex = re.compile(the_input)
            parsed_input = [w for w in list(df) if re.search(regex, w)]
            return parsed_input
        from corpkit.dictionaries.process_types import Wordlist
        if isinstance(the_input, Wordlist) or the_input.__class__ == Wordlist:
            the_input = list(the_input)
        if isinstance(the_input, list):
            if isinstance(the_input[0], int):
                parsed_input = [word for index, word in enumerate(list(df)) if index in the_input]
            elif isinstance(the_input[0], STRINGTYPE):
                try:
                    parsed_input = [word for word in the_input if word in df.columns]
                except AttributeError: # if series
                    parsed_input = [word for word in the_input if word in df.index]
        return parsed_input

    def synonymise(df, pos='n'):
        """pass a df and a pos and convert df columns to most common synonyms"""
        from nltk.corpus import wordnet as wn
        #from dictionaries.taxonomies import taxonomies
        from collections import Counter
        fixed = []
        for w in list(df.columns):
            try:
                syns = []
                for synset in wn.synsets(w, pos=pos):
                    for lemma in synset.lemma_names():
                        syns.append(lemma)
                top_syn = Counter(syns).most_common(1)[0][0]
                fixed.append(top_syn)
            except:
                fixed.append(w)
        df.columns = fixed
        return df

    def convert_spell(df, convert_to='US', print_info=print_info):
        """turn dataframes into us/uk spelling"""
        from dictionaries.word_transforms import usa_convert
        if print_info:
            print('Converting spelling ... \n')
        if convert_to == 'UK':
            usa_convert = {v: k for k, v in list(usa_convert.items())}
        fixed = []
        for val in list(df.columns):
            try:
                fixed.append(usa_convert[val])
            except:
                fixed.append(val)
        df.columns = fixed
        return df

    def merge_duplicates(df, print_info=print_info):
        if print_info:
            print('Merging duplicate entries ... \n')
        # now we have to merge all duplicates
        for dup in df.columns.get_duplicates():
            #num_dupes = len(list(df[dup].columns))
            temp = df[dup].sum(axis=1)
            #df = df.drop([dup for d in range(num_dupes)], axis=1)
            df = df.drop(dup, axis=1)
            df[dup] = temp
        return df

    def name_replacer(df, replace_names, print_info=print_info):
        """replace entry names and merge"""
        import re
        # get input into list of tuples
        # if it's a string, we want to delete it
        if isinstance(replace_names, STRINGTYPE):
            replace_names = [(replace_names, '')]
        # this is for some malformed list
        if not isinstance(replace_names, dict):
            if isinstance(replace_names[0], STRINGTYPE):
                replace_names = [replace_names]
        # if dict, make into list of tupes
        if isinstance(replace_names, dict):
            replace_names = [(v, k) for k, v in replace_names.items()]
        for to_find, replacement in replace_names:
            if print_info:
                if replacement:
                    print('Replacing "%s" with "%s" ...\n' % (to_find, replacement))
                else:
                    print('Deleting "%s" from entry names ...\n' % to_find)
            to_find = re.compile(to_find)
            if not replacement:
                replacement = ''
            df.columns = [re.sub(to_find, replacement, l) for l in list(df.columns)]
        df = merge_duplicates(df, print_info=False)
        return df

    def just_these_entries(df, parsed_input, prinf=True):
        entries = [word for word in list(df) if word not in parsed_input]
        if prinf:
            print('Keeping %d entries:\n    %s' % \
                (len(parsed_input), '\n    '.join(parsed_input[:10])))
            if len(parsed_input) > 10:
                print('... and %d more ... \n' % (len(parsed_input) - 10))
            else:
                print('')
        df = df.drop(entries, axis=1)
        return df

    def skip_these_entries(df, parsed_input, prinf=True):
        if prinf:     
            print('Skipping %d entries:\n    %s' % \
                (len(parsed_input), '\n    '.join(parsed_input[:10])))
            if len(parsed_input) > 10:
                print('... and %d more ... \n' % (len(parsed_input) - 10))
            else:
                print('')
        df = df.drop(parsed_input, axis=1)
        return df

    def newname_getter(df, parsed_input, newname='combine', prinf=True, merging_subcorpora=False):
        """makes appropriate name for merged entries"""
        if merging_subcorpora:
            if newname is False:
                newname = 'combine'
        if isinstance(newname, int):
            the_newname = list(df.columns)[newname]
        elif isinstance(newname, STRINGTYPE):
            if newname == 'combine':
                if len(parsed_input) <= 3:
                    the_newname = '/'.join(parsed_input)
                elif len(parsed_input) > 3:
                    the_newname = '/'.join(parsed_input[:3]) + '...'
            else:
                the_newname = newname
        if not newname:
            # revise this code
            import operator
            sumdict = {}
            for item in parsed_input:
                summed = sum(list(df[item]))
                sumdict[item] = summed
            the_newname = max(iter(sumdict.items()), key=operator.itemgetter(1))[0]
        if not isinstance(the_newname, STRINGTYPE):
            the_newname = str(the_newname, errors='ignore')
        return the_newname

    def merge_these_entries(df, parsed_input, the_newname, prinf=True, merging='entries'):
        # make new entry with sum of parsed input
        if len(parsed_input) == 0:
            import warnings
            warnings.warn('No %s could be automatically merged.\n' % merging)
        else:
            if prinf:
                print('Merging %d %s as "%s":\n    %s' % \
                    (len(parsed_input), merging, the_newname, '\n    '.join(parsed_input[:10])))
                if len(parsed_input) > 10:
                    print('... and %d more ... \n' % (len(parsed_input) - 10))
                else:
                    print('')
        # remove old entries
        temp = sum([df[i] for i in parsed_input])

        if isinstance(df, Series):
            df = df.drop(parsed_input, errors='ignore')
            nms = list(df.index)
        else:
            df = df.drop(parsed_input, axis=1, errors='ignore')
            nms = list(df.columns)
        if the_newname in nms:
            df[the_newname] = df[the_newname] + temp
        else:
            df[the_newname] = temp
        return df

    def just_these_subcorpora(df, lst_of_subcorpora, prinf=True):        
        if isinstance(lst_of_subcorpora[0], int):
            lst_of_subcorpora = [str(l) for l in lst_of_subcorpora]
        good_years = [subcorpus for subcorpus in list(df.index) if subcorpus in lst_of_subcorpora]
        if prinf:
            print('Keeping %d subcorpora:\n    %s' % (len(good_years), '\n    '.join(good_years[:10])))
            if len(good_years) > 10:
                print('... and %d more ... \n' % (len(good_years) - 10))
            else:
                print('')
        df = df.drop([subcorpus for subcorpus in list(df.index) if subcorpus not in good_years], axis=0)
        return df

    def skip_these_subcorpora(df, lst_of_subcorpora, prinf=True):
        if isinstance(lst_of_subcorpora, int):
            lst_of_subcorpora = [lst_of_subcorpora]
        if isinstance(lst_of_subcorpora[0], int):
            lst_of_subcorpora = [str(l) for l in lst_of_subcorpora]
        bad_years = [subcorpus for subcorpus in list(df.index) if subcorpus in lst_of_subcorpora]
        if len(bad_years) == 0:
            import warnings
            warnings.warn('No subcorpora skipped.\n')
        else:
            if prinf:       
                print('Skipping %d subcorpora:\n    %s' % (len(bad_years), '\n    '.join([str(i) for i in bad_years[:10]])))
                if len(bad_years) > 10:
                    print('... and %d more ... \n' % (len(bad_years) - 10))
                else:
                    print('')
        df = df.drop([subcorpus for subcorpus in list(df.index) if subcorpus in bad_years], axis=0)
        return df

    def span_these_subcorpora(df, lst_of_subcorpora, prinf=True):
        """select only a span of subcorpora (first, last)"""

        if len(lst_of_subcorpora) == 0:
            import warnings
            warnings.warn('Span not identified.\n')
            return df
        fir, sec = lst_of_subcorpora
        if prinf:
            print('Keeping subcorpora:\n    %d--%d\n' % (int(fir), int(sec)))
        sbs = list(df.index)
        df = df.ix[sbs.index(fir):sbs.index(sec) + 1]

        return df

    def projector(df, list_of_tuples, prinf=True):
        """project abs values"""
        if isinstance(list_of_tuples, list):
            tdict = {}
            for a, b in list_of_tuples:
                tdict[a] = b
            list_of_tuples = tdict
        for subcorpus, projection_value in list(list_of_tuples.items()):
            if isinstance(subcorpus, int):
                subcorpus = str(subcorpus)
            df.ix[subcorpus] = df.ix[subcorpus] * projection_value
            if prinf:
                if isinstance(projection_value, float):
                    print('Projection: %s * %s' % (subcorpus, projection_value))
                if isinstance(projection_value, int):
                    print('Projection: %s * %d' % (subcorpus, projection_value))
        if prinf:
            print('')
        return df

    def do_stats(df):
        """do linregress and add to df"""
        try: 
            from scipy.stats import linregress
        except ImportError:
            
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: sort type not available in this version of corpkit.' % thetime)
            return False

        indices = list(df.index)
        first_year = list(df.index)[0]
        try:
            x = [int(y) - int(first_year) for y in indices]
        except ValueError:
            x = list(range(len(indices)))
        
        statfields = ['slope', 'intercept', 'r', 'p', 'stderr']

        stats = []
        if isinstance(df, Series):
            y = list(df.values)
            sl = Series(list(linregress(x, y)), index=statfields)

        else:    
            for entry in list(df.columns):
                y = list(df[entry])
                stats.append(list(linregress(x, y)))
            sl = DataFrame(list(zip(*stats)), index=statfields, columns=list(df.columns))
        df = df.append(sl)
        
        # drop infinites and nans
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.fillna(0.0)
        return df

    def resort(df, sort_by = False, keep_stats = False):
        """sort results, potentially using scipy's linregress"""
        
        # translate options and make sure they are parseable
        stat_field = ['slope', 'intercept', 'r', 'p', 'stderr']
        easy_sorts = ['total', 'infreq', 'name', 'most', 'least']
        stat_sorts = ['increase', 'decrease', 'static', 'turbulent']
        options = stat_field + easy_sorts + stat_sorts
        sort_by_convert = {'most': 'total', True: 'total', 'least': 'infreq'}
        sort_by = sort_by_convert.get(sort_by, sort_by)

        # when just_totals, df has a single 'Combined total' column to sort on
        if just_totals:
            if sort_by == 'name':
                return df.sort_index()
            else:
                return df.sort_values(by='Combined total', ascending=sort_by != 'total')

        stats_done = False
        if keep_stats or sort_by in stat_field + stat_sorts:
            df = do_stats(df)
            stats_done = True
            if isinstance(df, bool):
                if df is False:
                    return False
        
        if isinstance(df, Series):
            if stats_done:
                stats = df.ix[range(-5, 0)]
                df = df.drop(list(stats.index))
            if sort_by == 'name':
                df = df.sort_index()
            else:
                df = df.sort_values(ascending=sort_by != 'total')
            if stats_done:
                df = df.append(stats)
            return df

        if sort_by == 'name':
            # currently case sensitive
            df = df.reindex_axis(sorted(df.columns), axis=1)
        elif sort_by in ['total', 'infreq']:
            if df1_istotals:
                df = df.T
            df = df[list(df.sum().sort_values(ascending=sort_by != 'total').index)]
        
        # sort by slope etc., or search by subcorpus name
        if sort_by in stat_field or sort_by not in options:
            asc = kwargs.get('reverse', False)
            df = df.T.sort_values(by=sort_by, ascending=asc).T
        
        if sort_by in ['increase', 'decrease', 'static', 'turbulent']:
            slopes = df.ix['slope']
            if sort_by == 'increase':
                df = df[slopes.argsort()[::-1]]
            elif sort_by == 'decrease':
                df = df[slopes.argsort()]
            elif sort_by == 'static':
                df = df[slopes.abs().argsort()]
            elif sort_by == 'turbulent':
                df = df[slopes.abs().argsort()[::-1]]
            if remove_above_p:
                df = df.T
                df = df[df['p'] <= p]
                df = df.T

        # remove stats field by default
        if not keep_stats:
            df = df.drop(stat_field, axis=0, errors='ignore')
        return df

    def set_threshold(big_list, threshold, prinf=True):
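        # string thresholds map to a fraction of the combined total:
        # 'low' -> 1/10000, 'medium' -> 1/5000, 'high' -> 1/2500; numbers pass through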
        if isinstance(threshold, STRINGTYPE):
            if threshold.startswith('l'):
                denominator = 10000
            if threshold.startswith('m'):
                denominator = 5000
            if threshold.startswith('h'):
                denominator = 2500
            if isinstance(big_list, DataFrame):
                tot = big_list.sum().sum()

            if isinstance(big_list, Series):
                tot = big_list.sum()
            tshld = float(tot) / float(denominator)
        else:
            tshld = threshold
        if prinf:
            print('Threshold: %d\n' % tshld)
        return tshld

    # copy dataframe to be very safe
    df = dataframe1.copy()
    # make cols into strings
    try:
        df.columns = [str(c) for c in list(df.columns)]
    except:
        pass

    if operation is None:
        operation = 'None'

    if isinstance(interrogation, Concordance):
        return_conc = True
    # do concordance work
    if return_conc:
        if just_entries:
            if isinstance(just_entries, int):
                just_entries = [just_entries]
            if isinstance(just_entries, STRINGTYPE):
                df = df[df['m'].str.contains(just_entries)]
            if isinstance(just_entries, list):
                if all(isinstance(e, STRINGTYPE) for e in just_entries):
                    mp = df['m'].map(lambda x: x in just_entries)
                    df = df[mp]
                else:
                    df = df.ix[just_entries]

        if skip_entries:
            if isinstance(skip_entries, int):
                skip_entries = [skip_entries]
            if isinstance(skip_entries, STRINGTYPE):
                df = df[~df['m'].str.contains(skip_entries)]
            if isinstance(skip_entries, list):
                if all(isinstance(e, STRINGTYPE) for e in skip_entries):
                    mp = df['m'].map(lambda x: x not in skip_entries)
                    df = df[mp]
                else:
                    df = df.drop(skip_entries, axis=0)

        if just_subcorpora:
            if isinstance(just_subcorpora, int):
                just_subcorpora = [just_subcorpora]
            if isinstance(just_subcorpora, STRINGTYPE):
                df = df[df['c'].str.contains(just_subcorpora)]
            if isinstance(just_subcorpora, list):
                if all(isinstance(e, STRINGTYPE) for e in just_subcorpora):
                    mp = df['c'].map(lambda x: x in just_subcorpora)
                    df = df[mp]
                else:
                    df = df.ix[just_subcorpora]

        if skip_subcorpora:
            if isinstance(skip_subcorpora, int):
                skip_subcorpora = [skip_subcorpora]
            if isinstance(skip_subcorpora, STRINGTYPE):
                df = df[~df['c'].str.contains(skip_subcorpora)]
            if isinstance(skip_subcorpora, list):
                if all(isinstance(e, STRINGTYPE) for e in skip_subcorpora):
                    mp = df['c'].map(lambda x: x not in skip_subcorpora)
                    df = df[mp]
                else:
                    df = df.drop(skip_subcorpora, axis=0)

        return Concordance(df)

    if print_info:
        print('\n***Processing results***\n========================\n')

    df1_istotals = False
    if isinstance(df, Series):
        df1_istotals = True
        df = DataFrame(df)
        # if just a single result
    else:
        df = DataFrame(df)
    if operation.startswith('k'):
        if sort_by is False:
            if not df1_istotals:
                sort_by = 'turbulent'
        if df1_istotals:
            df = df.T
    
    # figure out if there's a second list
    # copy and remove totals if there is
    single_totals = True
    using_totals = False
    outputmode = False
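    # single_totals: denominator is a single totals Series; using_totals: any
    # second dataset is in play; outputmode: denominator derived from df itself ('self')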

    if denominator.__class__ == Interrogation:
        try:
            denominator = denominator.results
        except AttributeError:
            denominator = denominator.totals

    if denominator is not False and not isinstance(denominator, STRINGTYPE):
        df2 = denominator.copy()
        using_totals = True
        if isinstance(df2, DataFrame):
            if len(df2.columns) > 1:
                single_totals = False
            else:
                df2 = Series(df2)
        elif isinstance(df2, Series):
            single_totals = True
            #if operation == 'k':
                #raise ValueError('Keywording requires a DataFrame for denominator. Use "self"?')
    else:
        if operation in ['k', 'a', '%', '/', '*', '-', '+']:
            denominator = 'self'         
        if denominator == 'self':
            outputmode = True

    if operation.startswith('a') or operation.startswith('A'):
        if list(df.columns)[0] != '0' and list(df.columns)[0] != 0:
            df = df.T
        if using_totals:
            if not single_totals:
                df2 = df2.T

    if projection:
        # projection shouldn't do anything when working with '%', remember.
        df = projector(df, projection)
        if using_totals:
            df2 = projector(df2, projection)

    if spelling:
        df = convert_spell(df, convert_to=spelling)
        df = merge_duplicates(df, print_info=False)

        if not single_totals:
            df2 = convert_spell(df2, convert_to=spelling, print_info=False)
            df2 = merge_duplicates(df2, print_info=False)
        if not df1_istotals:
            sort_by = 'total'

    if replace_names:
        df = name_replacer(df, replace_names)
        df = merge_duplicates(df)
        if not single_totals:
            df2 = name_replacer(df2, replace_names, print_info=False)
            df2 = merge_duplicates(df2, print_info=False)
        if not sort_by:
            sort_by = 'total'

    if replace_subcorpus_names:
        df = name_replacer(df.T, replace_subcorpus_names)
        df = merge_duplicates(df).T
        df = df.sort_index()
        if not single_totals:
            if isinstance(df2, DataFrame):
                df2 = df2.T
            df2 = name_replacer(df2, replace_subcorpus_names, print_info=False)
            df2 = merge_duplicates(df2, print_info=False)
            if isinstance(df2, DataFrame):
                df2 = df2.T
            df2 = df2.sort_index()
        if not sort_by:
            sort_by = 'total'

    # remove old stats if they're there:
    statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
    try:
        df = df.drop(statfields, axis=0)
    except:
        pass
    if using_totals:
        try:
            df2 = df2.drop(statfields, axis=0)
        except:
            pass

    # remove totals and tkinter order
    for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]):
        if name == 'Total' and df1_istotals:
            continue
        try:
            df = df.drop(name, axis=ax, errors='ignore')
        except:
            pass
    for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]):
        if name == 'Total' and single_totals:
            continue

        try:

            df2 = df2.drop(name, axis=ax, errors='ignore')
        except:
            pass

    # merging: make dicts if they aren't already, so we can iterate
    if merge_entries:
        if not isinstance(merge_entries, list):
            if isinstance(merge_entries, STRINGTYPE):
                merge_entries = {'combine': merge_entries}
            # for newname, criteria    
            for name, the_input in sorted(merge_entries.items()):
                pin = parse_input(df, the_input)
                the_newname = newname_getter(df, pin, newname=name, prinf=print_info)
                df = merge_these_entries(df, pin, the_newname, prinf=print_info)
                if not single_totals:
                    pin2 = parse_input(df2, the_input)
                    df2 = merge_these_entries(df2, pin2, the_newname, prinf=False)
        else:
            for i in merge_entries:
                pin = parse_input(df, merge_entries)
                the_newname = newname_getter(df, pin, prinf=print_info)
                df = merge_these_entries(df, pin, the_newname, prinf=print_info)
                if not single_totals:
                    pin2 = parse_input(df2, merge_entries)
                    df2 = merge_these_entries(df2, pin2, the_newname, prinf=False)
    
    if merge_subcorpora:
        if not isinstance(merge_subcorpora, dict):
            if isinstance(merge_subcorpora, list):
                if isinstance(merge_subcorpora[0], tuple):
                    merge_subcorpora = {x: y for x, y in merge_subcorpora}
                elif isinstance(merge_subcorpora[0], STRINGTYPE):
                    merge_subcorpora = {'combine': [x for x in merge_subcorpora]}
                elif isinstance(merge_subcorpora[0], int):
                    merge_subcorpora = {'combine': [str(x) for x in merge_subcorpora]}
            else:
                merge_subcorpora = {'combine': merge_subcorpora}
        for name, the_input in sorted(merge_subcorpora.items()):
            pin = parse_input(df.T, the_input)
            the_newname = newname_getter(df.T, pin, newname=name, \
                merging_subcorpora=True, prinf=print_info)
            df = merge_these_entries(df.T, pin, the_newname, merging='subcorpora', 
                                     prinf=print_info).T
            if using_totals:
                pin2 = parse_input(df2.T, the_input)
                df2 = merge_these_entries(df2.T, pin2, the_newname, merging='subcorpora', 
                                          prinf=False).T

    if just_subcorpora:
        df = just_these_subcorpora(df, just_subcorpora, prinf=print_info)
        if using_totals:
            df2 = just_these_subcorpora(df2, just_subcorpora, prinf=False)
    
    if skip_subcorpora:
        df = skip_these_subcorpora(df, skip_subcorpora, prinf=print_info)
        if using_totals:
            df2 = skip_these_subcorpora(df2, skip_subcorpora, prinf=False)
    
    if span_subcorpora:
        df = span_these_subcorpora(df, span_subcorpora, prinf=print_info)
        if using_totals:
            df2 = span_these_subcorpora(df2, span_subcorpora, prinf=False)

    if just_entries:
        df = just_these_entries(df, parse_input(df, just_entries), prinf=print_info)
        if not single_totals:
            df2 = just_these_entries(df2, parse_input(df2, just_entries), prinf=False)
    
    if skip_entries:
        df = skip_these_entries(df, parse_input(df, skip_entries), prinf=print_info)
        if not single_totals:
            df2 = skip_these_entries(df2, parse_input(df2, skip_entries), prinf=False)

    # drop infinites and nans
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0.0)

    if just_totals:
        df = DataFrame(df.sum(), columns=['Combined total'])
        if using_totals:
            if not single_totals:
                df2 = DataFrame(df2.sum(), columns=['Combined total'])
            else:
                df2 = df2.sum()

    tots = df.sum(axis=1)

    if using_totals or outputmode:
        if not operation.startswith('k'):
            tshld = 0
            # set a threshold if just_totals
            if outputmode is True:
                df2 = df.T.sum()
                if not just_totals:
                    df2.name = 'Total'
                else:
                    df2.name = 'Combined total'
                using_totals = True
                single_totals = True
            if just_totals:
                if not single_totals:
                    tshld = set_threshold(df2, threshold, prinf=print_info)
            df, tots = combiney(df, df2, operation=operation, threshold=tshld, prinf=print_info)
    
    # if doing keywording...
    if operation.startswith('k'):

        if isinstance(denominator, STRINGTYPE):
            if denominator == 'self':
                df2 = df.copy()
            else:
                df2 = denominator

        from corpkit.keys import keywords
        df = keywords(df, df2, 
                      selfdrop=selfdrop, 
                      threshold=threshold, 
                      print_info=print_info,
                      editing=True,
                      calc_all=calc_all,
                      sort_by=sort_by,
                      measure=keyword_measure,
                      **kwargs)
    
    # drop infinites and nans
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0.0)

    # resort data
    if sort_by or keep_stats:
        df = resort(df, keep_stats=keep_stats, sort_by=sort_by)
        if isinstance(df, bool):
            if df is False:
                return 'linregress'

    if keep_top:
        if not just_totals:
            df = df[list(df.columns)[:keep_top]]
        else:
            df = df.head(keep_top)

    if just_totals:
        # turn just_totals into series:
        df = Series(df['Combined total'], name='Combined total')

    if df1_istotals:
        if operation.startswith('k'):
            try:
                df = Series(df.ix[dataframe1.name])
                df.name = '%s: keyness' % df.name
            except:
                df = df.iloc[0, :]
                df.name = '%s: keyness' % df.name

    # generate totals branch if not percentage results:
    # fix me
    if df1_istotals or operation.startswith('k'):
        if not just_totals:
            try:
                total = Series(df['Total'], name='Total')
            except:
                total = 'none'

            #total = df.copy()
        else:
            total = 'none'
    else:
        # might be wrong if using division or something...
        try:
            total = df.T.sum(axis=1)
        except:
            total = 'none'
    
    if not isinstance(tots, DataFrame) and not isinstance(tots, Series):
        total = df.sum(axis=1)
    else:
        total = tots

    if isinstance(df, DataFrame):
        datatype = df.iloc[0].dtype
    else:
        datatype = df.dtype
    locs['datatype'] = datatype

    # TURN INT COL NAMES INTO STR
    try:
        df.results.columns = [str(d) for d in list(df.results.columns)]
    except:
        pass

    def add_tkt_index(df):
        """add an order for tkintertable if using gui"""
        if isinstance(df, Series):
            df = df.T
            df = df.drop('tkintertable-order', errors='ignore', axis=0)
            df = df.drop('tkintertable-order', errors='ignore', axis=1)
            dat = [i for i in range(len(df.index))]
            df['tkintertable-order'] = Series(dat, index=list(df.index))
            df = df.T
        return df

    # while tkintertable can't sort rows
    if checkstack('tkinter'):
        df = add_tkt_index(df)

    if kwargs.get('df1_always_df'):
        if isinstance(df, Series):
            df = DataFrame(df)

    # delete non-appearing conc lines
    if not hasattr(interrogation, 'concordance'):
        lns = None
    elif hasattr(interrogation, 'concordance') and interrogation.concordance is None:
        lns = None
    else:
        col_crit = interrogation.concordance['m'].map(lambda x: x in list(df.columns))
        ind_crit = interrogation.concordance['c'].map(lambda x: x in list(df.index))
        lns = interrogation.concordance[col_crit]
        lns = lns.loc[ind_crit]
        lns = Concordance(lns)
    
    output = Interrogation(results=df, totals=total, query=locs, concordance=lns)

    if print_info:
        print('***Done!***\n========================\n')

    return output
Example #45
0
    def _z_test_word_list(word_count_series_one: pd.Series,
                          word_count_series_two: pd.Series) -> pd.Series:
        """Run z-test on all the words of two input word lists.

        :param word_count_series_one: a pandas series where:
            - the data is the word counts.
            - the index is the corresponding words.
            - the name depends on what the input is. If a file is given, the
              name will be the string "File" plus the actual file name; if a
              class is given, the name will be the string "class" plus the
              actual class name.
        :param word_count_series_two: a pandas series where:
            - the data is the word counts.
            - the index is the corresponding words.
            - the name depends on what the input is. If a file is given, the
              name will be the string "File" plus the actual file name; if a
              class is given, the name will be the string "class" plus the
              actual class name.
        :return: a pandas series where:
            - the data is the z-scores.
            - the index is the corresponding words.
            - the name is a readable header for the analysis result.
        """
        # Find the sample population of the two input data sets.
        total_word_count_one = word_count_series_one.sum()
        total_word_count_two = word_count_series_two.sum()

        # Join the two input pandas series together to avoid assuming
        # that they are parallel arrays in later analysis.
        joined_data_frame = word_count_series_one.to_frame().join(
            word_count_series_two.to_frame())

        # Perform the z-test to detect word anomalies.
        # We are using dict instead of pandas series here, because this method
        # requires 'full_word_score_dict' to be sorted via the absolute value
        # of the z-scores (the 'value' of the dictionary).
        # For code clarity we use this as a temp solution, but in future we
        # can implement the 'sort_by' function for series in our general
        # functions if we need it for better performance.
        full_word_score_dict = \
            {word: TopwordModel._z_test(p1=count1 / total_word_count_one,
                                        p2=count2 / total_word_count_two,
                                        n1=total_word_count_one,
                                        n2=total_word_count_two)
             for word, [count1, count2] in joined_data_frame.iterrows()}

        # Filter out the insignificant result.
        sig_word_score_dict = \
            {word: z_score for word, z_score in full_word_score_dict.items()
             if abs(z_score) >= 1.96}

        # Sort 'sig_word_score_dict' by absolute value of z-scores in
        # descending order.
        sorted_dict = OrderedDict(sorted(sig_word_score_dict.items(),
                                         key=lambda item: abs(item[1]),
                                         reverse=True))

        # Convert the sorted result to a pandas series.
        result_series = pd.Series(sorted_dict)
        # Set the result series name.
        result_series.name = f"{word_count_series_one.name} compares to " \
                             f"{word_count_series_two.name}"

        return result_series
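    # ------------------------------------------------------------------
    # Illustrative sketch (not part of the original class): the helper
    # TopwordModel._z_test is not shown above, so a standard two-proportion
    # z-test of the form below is assumed.  Here p1/p2 are a word's relative
    # frequencies in each corpus and n1/n2 the total word counts, matching
    # the keyword arguments used in the dictionary comprehension above.
    @staticmethod
    def _z_test_sketch(p1: float, p2: float, n1: int, n2: int) -> float:
        import math
        # pooled proportion under the null hypothesis of equal frequency
        p_hat = (p1 * n1 + p2 * n2) / (n1 + n2)
        if n1 == 0 or n2 == 0 or p_hat in (0.0, 1.0):
            return 0.0
        standard_error = math.sqrt(p_hat * (1 - p_hat) * (1 / n1 + 1 / n2))
        # |z| >= 1.96 is the 5% significance cutoff used for filtering above
        return (p1 - p2) / standard_error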
    def source_data(self):
        
        st_date = self.stTrain
#        st_date = '2014-10-1'
        stD = date(int(st_date.split('-')[0]), int(st_date.split('-')[1]), int(st_date.split('-')[2]))
        if self.view and stD < datetime.datetime.strptime('2015-4-1',"%Y-%m-%d").date():
            raise RuntimeError("I know it sucks but we don't have view-count data for any time before 2015-4-1!")
        if self.view:
            db_red = psycopg2.connect(host="***", database="***", port="***",
                                  user="******", password="******")
            db_red.autocommit = True
            df_red = pd.read_sql('''select date,sum(installs) as install, sum(pageviewcount) as view
                                from appstoredata_itunes_metrics where game='***' 
                                and country='%s' group by date;''' % pycountry.countries.get(alpha2=self.target).name, 
                                con=db_red)  
                            
            df_red['date'] = pd.to_datetime(df_red['date'])
            ts_view_target1 = Series(df_red.view.tolist(), 
                                     index=df_red.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
            ts_install_target1 = Series(df_red.install.tolist(), 
                                        index=df_red.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
            if len(ts_view_target1) < (self.endP-stD).days :
                ts_view_target1[pd.to_datetime(st_date)] = 0
                ts_view_target1 = ts_view_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0)
                ts_install_target1[pd.to_datetime(st_date)] = 0
                ts_install_target1 = ts_install_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0)
            ts_view_target = (ts_view_target1)/(ts_view_target1.sum())
            ts_install_target = (ts_install_target1)/(ts_install_target1.sum())
        else:
            ts_view_target = []
            ts_view_target1 = []
            ts_install_target = []  
            ts_install_target1 = []
        
        db = MySQLdb.connect(
        host = '***', 
        user = '******', 
        passwd = '***', 
        db = '***', 
        port = '***')
        
        df_mysql = pd.read_sql('''select metrics_daily.date as date, dim_country.name as country,
                               sum(metrics_daily.value) as value, dim_channel.channel_type as type
                               from metrics_daily left join dim_channel on dim_channel.id = metrics_daily.channel_id 
                               left join dim_country on dim_country.id = metrics_daily.country_id where project_id=195 
                               and metrics_daily.platform_id=2 and metric_id in (5) group by date, type, country;''', con=db)  
                       
        
        df_mysql['date'] = pd.to_datetime(df_mysql['date'])
        all_data_target = df_mysql[df_mysql.country==self.target]
        org_data_target = df_mysql[(df_mysql.type=='ORGANIC') & (df_mysql.country==self.target)]
        ts_org_target1 = Series(org_data_target.value.tolist(), 
                               index=org_data_target.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
        ts_all_target1 = Series(all_data_target.value.tolist(), 
                                index=all_data_target.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
        ts_org_target = (ts_org_target1)/(ts_org_target1.sum())
        ts_all_target = (ts_all_target1)/(ts_all_target1.sum())
        
        if self.baseorg:
            org_data_base = df_mysql[(df_mysql.type=='ORGANIC') & (df_mysql.country==self.baseline)]
            ts_org_base1 = Series(org_data_base.value.tolist(), 
                                 index=org_data_base.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)   
            ts_org_base = (ts_org_base1-ts_org_base1.min())/(ts_org_base1.max()-ts_org_base1.min())
        else:
            ts_org_base = []
            ts_org_base1 = []
        
        if self.paid:
            paid_data_target = df_mysql[(df_mysql.type=='PAID') & (df_mysql.country==self.target)]
            ts_paid_target1 = Series(paid_data_target.value.tolist(),
                                    index=paid_data_target.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
            if len(ts_paid_target1) < (self.endP-stD).days :
                ts_paid_target1[pd.to_datetime(st_date)] = 0
                ts_paid_target1 = ts_paid_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0)
            ts_paid_target = (ts_paid_target1)/(ts_paid_target1.sum())
        else:
            ts_paid_target = []
            ts_paid_target1 = []
            
        if self.rank:
            df_rank = pd.read_sql('''select date, max(1/sqrt(rank)) as bestRank from kabam_ranks_data_free where 
                                    country='%s' and device!='android'and game='***' 
                                    and category='Overall' group by date;''' % self.target, con=db)  
            
            df_rank['date'] = pd.to_datetime(df_rank['date'])
            ts_rank_target1 = Series(df_rank.bestRank.tolist(), 
                                     index=df_rank.date.tolist()).resample('D', how='sum')[st_date:self.endPred].fillna(0)
            if len(ts_rank_target1) < (self.endP-stD).days :
                ts_rank_target1[pd.to_datetime(st_date)] = 0
                ts_rank_target1 = ts_rank_target1.resample('D', how='sum')[st_date:self.endPred].fillna(0)
            ts_rank_target = (ts_rank_target1)/(ts_rank_target1.sum())
        else:
            ts_rank_target = []
            ts_rank_target1 = []
        
#        endog = ts_org_target1
#        endog = ts_install_target
        endog = ts_all_target1
        
        Tlist = [self.paid, self.baseorg, self.view, self.rank]
        dff = DataFrame()
        tList = [ts_paid_target, ts_org_base, ts_view_target, ts_rank_target]
        tlist = ['paid', 'base', 'view', 'rank']
        for i in xrange(0,len(Tlist)):
            if Tlist[i]:
                dff[tlist[i]] = tList[i]
        if dff.empty:
            raise RuntimeError('Where is your exog variable? Do you need a coffee or something?!')
                
        exog = dff
        
        return (endog, exog)
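    # ------------------------------------------------------------------
    # Illustrative sketch (not part of the original class): the two
    # rescalings used throughout source_data, side by side.  The
    # target-country series are divided by their sum (so they sum to 1.0),
    # while the baseline organic series is min-max scaled into [0, 1].
    @staticmethod
    def _scaling_sketch(ts):
        share_of_total = ts / ts.sum()
        min_max_scaled = (ts - ts.min()) / (ts.max() - ts.min())
        return share_of_total, min_max_scaled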
Example #47
0
	k_list.append(float(new_list[i-1][1])/float(new_list[i][0]))
k_avg = k_sum/size
#consu_ser = Series(data=k_list,index=range(len(k_list)))
consu_ser = Series(data=k_list,index=range(len(k_list)))
#print consu_ser.describe()
mean = consu_ser.mean()
std_dev = consu_ser.std()
modified_list = list()
for i in range(len(k_list)):
	if (k_list[i] < mean + std_dev) and (k_list[i] > mean - std_dev):
		modified_list.append(k_list[i])

plt.hist(modified_list)
plt.show()
consu_ser_mod = Series(data=modified_list,index=range(len(modified_list)))
k_avg = consu_ser_mod.sum()/len(modified_list)
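# An equivalent, vectorised way to get the same trimmed mean with pandas
# (a sketch, not part of the original script): keep only the ratios that lie
# within one standard deviation of the mean, then average them.
trimmed_ser = consu_ser[(consu_ser - mean).abs() < std_dev]
k_avg_check = trimmed_ser.mean()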

thresh_sum = 0
for i in range(size):
	if i == 0:
		continue
	thresh_sum = thresh_sum + float(new_list[i-1][1]) - float(new_list[i][0])*k_avg
thresh_avg = thresh_sum/size

print "k_avg: ", k_avg
print "thresh_avg: ", thresh_avg




Example #48
0
def interrogator(corpus, 
            search, 
            query = 'any', 
            show = 'w',
            exclude = False,
            excludemode = 'any',
            searchmode = 'all',
            dep_type = 'collapsed-ccprocessed-dependencies',
            case_sensitive = False,
            quicksave = False,
            just_speakers = False,
            preserve_case = False,
            lemmatag = False,
            files_as_subcorpora = False,
            only_unique = False,
            random = False,
            only_format_match = False,
            multiprocess = False,
            spelling = False,
            regex_nonword_filter = r'[A-Za-z0-9:_]',
            gramsize = 2,
            split_contractions = False,
            do_concordancing = False,
            maxconc = 9999,
            **kwargs):
    """interrogate corpus, corpora, subcorpus and file objects

    see corpkit.interrogation.interrogate() for docstring"""

    only_conc = False
    no_conc = False
    if do_concordancing is False:
        no_conc = True
    if type(do_concordancing) == str and do_concordancing.lower() == 'only':
        only_conc = True
        no_conc = False

    # iteratively count conc lines
    numconc = 0

    # store kwargs
    locs = locals()
    
    if kwargs:
        for k, v in kwargs.items():
            locs[k] = v
        locs.pop('kwargs', None)

    import corpkit
    from interrogation import Interrogation
    from process import tregex_engine
    import pandas as pd
    from pandas import DataFrame, Series
    from collections import Counter
    from other import as_regex
    from process import get_deps
    from time import localtime, strftime
    from textprogressbar import TextProgressBar
    from process import animator
    from dictionaries.word_transforms import wordlist, taglemma
    import corenlp_xml
    import codecs
    import signal

    original_sigint = signal.getsignal(signal.SIGINT)

    if kwargs.get('paralleling', None) is None:
        original_sigint = signal.getsignal(signal.SIGINT)
        
        def signal_handler(signal, frame):
            """pause on ctrl+c, rather than just stop loop"""   
            import signal
            import sys
            from time import localtime, strftime
            signal.signal(signal.SIGINT, original_sigint)
            thetime = strftime("%H:%M:%S", localtime())
            try:
                sel = raw_input('\n\n%s: Paused. Press any key to resume, or ctrl+c to quit.\n' % thetime)
            except NameError:
                sel = input('\n\n%s: Paused. Press any key to resume, or ctrl+c to quit.\n' % thetime)
            time = strftime("%H:%M:%S", localtime())
            print('%s: Interrogation resumed.\n' % time)
            signal.signal(signal.SIGINT, signal_handler)

        signal.signal(signal.SIGINT, signal_handler)

    # find out if using gui
    root = kwargs.get('root')
    note = kwargs.get('note')

    # convert path to corpus object
    if type(corpus) == str:
        from corpus import Corpus
        corpus = Corpus(corpus)

    # figure out how the user has entered the query and normalise
    from process import searchfixer
    search, search_iterable = searchfixer(search, query)
    
    # for better printing of query, esp during multiprocess
    # can remove if multiprocess printing improved
    if len(list(search.keys())) == 1:
        query = list(search.values())[0]

    if 'l' in show and search.get('t'):
        from nltk.stem.wordnet import WordNetLemmatizer
        lmtzr=WordNetLemmatizer()

    if type(show) == str:
        show = [show]

    def is_multiquery(corpus, search, query, just_speakers):
        """determine if multiprocessing is needed
        do some retyping if need be as well"""
        im = False
        from collections import OrderedDict
        if hasattr(corpus, '__iter__'):
            im = True
        # so we can do search = 't', query = ['NP', 'VP']:
        if type(query) == list:
            if query != list(search.values())[0] or len(list(search.keys())) > 1:
                query = {c.title(): c for c in query}
        if type(query) == dict or type(query) == OrderedDict:
            im = True
        if just_speakers:
            if just_speakers == 'each':
                im = True
                just_speakers = ['each']
            if just_speakers == ['each']:
                im = True
            if type(just_speakers) == str:
                im = False
                just_speakers = [just_speakers]
            if type(just_speakers) == list:
                if len(just_speakers) > 1:
                    im = True
        if type(search) == dict:
            if all(type(i) == dict for i in list(search.values())):
                im = True
        return im, corpus, search, query, just_speakers
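    # Reading the logic above (a sketch, not a call made by the original
    # code): if 'query' arrives here as a list that differs from the single
    # search value, e.g. ['NP', 'VP'], it is rebuilt as a dict
    # ({'Np': 'NP', 'Vp': 'VP'}), 'im' becomes True, and the whole job is
    # handed off to pmultiquery further down.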

    def slow_tregex(sents, **dummy_args):
        """do the speaker-specific version of tregex queries"""
        speakr = dummy_args.get('speaker', False)
        import os
        from process import tregex_engine
        # first, put the relevant trees into temp file
        if kwargs.get('outname'):
            to_open = 'tmp-%s.txt' % kwargs['outname']
        else:
            to_open = 'tmp.txt'
        to_write = '\n'.join([sent._parse_string.strip() for sent in sents \
                              if sent.parse_string is not None])
        with open(to_open, "w") as fo:
            encd = to_write.encode('utf-8', errors = 'ignore') + '\n'
            fo.write(encd)
        q = list(search.values())[0]
        ops = ['-o', '-%s' % translated_option]
        concs = []
        res = tregex_engine(query = q, 
                            options = ops, 
                            corpus = to_open,
                            root = root,
                            preserve_case = True)
        if not no_conc:
            ops += ['-w', '-f']
            whole_res = tregex_engine(query = q, 
                            options = ops, 
                            corpus = to_open,
                            root = root,
                            preserve_case = True) 

            res = format_tregex(res)
            whole_res = format_tregex(whole_res, whole = True)
            concs = make_conc_lines_from_whole_mid(whole_res, res, speakr)

        if root:
            root.update()
        try:
            os.remove(to_open)
        except OSError:
            pass
        if countmode:
            return(len(res))
        else:
            return res, concs

    def get_stats(sents, **dummy_args):
        """get a bunch of frequencies on interpersonal phenomena"""
        import os
        import re
        from collections import Counter
        statsmode_results = Counter()  
        # first, put the relevant trees into temp file
        if kwargs.get('outname'):
            to_open = 'tmp-%s.txt' % kwargs['outname']
        else:
            to_open = 'tmp.txt'
        with open(to_open, "w") as fo:
            for sent in sents:
                statsmode_results['Sentences'] += 1
                sts = sent.parse_string.rstrip()
                encd = sts.encode('utf-8', errors = 'ignore') + '\n'
                fo.write(encd)
                deps = get_deps(sent, dep_type)
                numpass = len([x for x in deps.links if x.type.endswith('pass')])
                statsmode_results['Passives'] += numpass
                statsmode_results['Tokens'] += len(sent.tokens)
                words = [w.word for w in sent.tokens if w.word.isalnum()]
                statsmode_results['Words'] += len(words)
                statsmode_results['Characters'] += len(''.join(words))

        # count moods via trees          (/\?/ !< __)
        from dictionaries.process_types import processes
        from other import as_regex
        tregex_qs = {'Imperative': r'ROOT < (/(S|SBAR)/ < (VP !< VBD !< VBG !$ NP !$ SBAR < NP !$-- S !$-- VP !$ VP)) !<< (/\?/ !< __) !<<- /-R.B-/ !<<, /(?i)^(-l.b-|hi|hey|hello|oh|wow|thank|thankyou|thanks|welcome)$/',
                     'Open interrogative': r'ROOT < SBARQ <<- (/\?/ !< __)', 
                     'Closed interrogative': r'ROOT ( < (SQ < (NP $+ VP)) << (/\?/ !< __) | < (/(S|SBAR)/ < (VP $+ NP)) <<- (/\?/ !< __))',
                     'Unmodalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP !< MD)))',
                     'Modalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP < MD)))',
                     'Open class words': r'/^(NN|JJ|VB|RB)/ < __',
                     'Closed class words': r'__ !< __ !> /^(NN|JJ|VB|RB)/',
                     'Clauses': r'/^S/ < __',
                     'Interrogative': r'ROOT << (/\?/ !< __)',
                     'Mental processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.mental, boundaries = 'w'),
                     'Verbal processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.verbal, boundaries = 'w'),
                     'Relational processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.relational, boundaries = 'w')
                     }

        for name, q in sorted(tregex_qs.items()):
            res = tregex_engine(query = q, 
                  options = ['-o', '-C'], 
                  corpus = to_open,  
                  root = root)
            statsmode_results[name] += int(res)
            global numdone
            numdone += 1
            if root:
                root.update()
            else:
                tot_string = str(numdone + 1) + '/' + str(total_files)
                if kwargs.get('outname'):
                    tot_string = '%s: %s' % (kwargs['outname'], tot_string)
                animator(p, numdone, tot_string, **par_args)
            if kwargs.get('note', False):
                kwargs['note'].progvar.set((numdone * 100.0 / total_files / denom) + startnum)
        os.remove(to_open)
        return statsmode_results, []

    def make_conc_lines_from_whole_mid(wholes, middle_column_result, 
                                       speakr = False):
        import re, os
        if speakr is False:
            speakr = ''
        conc_lines = []
        # remove duplicates from results
        unique_wholes = []
        unique_middle_column_result = []
        duplicates = []
        for index, ((f, whole), mid) in enumerate(zip(wholes, middle_column_result)):
            if '-join-'.join([f, whole, mid]) not in duplicates:
                duplicates.append('-join-'.join([f, whole, mid]))
                unique_wholes.append([f, whole])
                unique_middle_column_result.append(mid)

        # split into start, middle and end, dealing with multiple occurrences
        for index, ((f, whole), mid) in enumerate(zip(unique_wholes, unique_middle_column_result)):
            reg = re.compile(r'([^a-zA-Z0-9-]|^)(' + re.escape(mid) + r')([^a-zA-Z0-9-]|$)', re.IGNORECASE | re.UNICODE)
            offsets = [(m.start(), m.end()) for m in re.finditer(reg,whole)]
            for offstart, offend in offsets:              
                start, middle, end = whole[0:offstart].strip(), whole[offstart:offend].strip(), whole[offend:].strip()
                conc_lines.append([os.path.basename(f), speakr, start, middle, end])
        return conc_lines
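    # For example (a sketch): with wholes=[('sub/f.txt', 'the cat sat on the mat')]
    # and middle_column_result=['cat'], the regex above splits each whole match
    # around the middle hit, yielding [['f.txt', '', 'the', 'cat', 'sat on the mat']].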

    def uniquify(conc_lines):
        from collections import OrderedDict
        unique_lines = []
        checking = []
        for index, (f, speakr, start, middle, end) in enumerate(conc_lines):
            joined = ' '.join([speakr, start, 'MIDDLEHERE:', middle, ':MIDDLEHERE', end])
            if joined not in checking:
                unique_lines.append(conc_lines[index])
            checking.append(joined)
        return unique_lines

    def lemmatiser(list_of_words, tag):
        """take a list of unicode words and a tag and return a lemmatised list."""
        output = []
        for word in list_of_words:
            if translated_option.startswith('u'):
                if word.lower() in list(taglemma.keys()):
                    word = taglemma[word.lower()]
                else:
                    if word == 'x':
                        word = 'Other'
            # only use wordnet lemmatiser when appropriate
            else:
                if word in wordlist:
                    word = wordlist[word]
                word = lmtzr.lemmatize(word, tag)
            output.append(word)
        return output

    def gettag(query, lemmatag = False):
        """
        Find tag for WordNet lemmatisation
        """
        import re

        tagdict = {'N': 'n',
                   'A': 'a',
                   'V': 'v',
                   'R': 'r',
                   'None': False,
                   '': False,
                   'Off': False}

        if lemmatag is False:
            tag = 'n' # same default as wordnet
            # attempt to find tag from tregex query
            tagfinder = re.compile(r'^[^A-Za-z]*([A-Za-z]*)')
            tagchecker = re.compile(r'^[A-Z]{1,4}$')
            qr = query.replace(r'\w', '').replace(r'\s', '').replace(r'\b', '')
            treebank_tag = re.findall(tagfinder, qr)
            if re.match(tagchecker, treebank_tag[0]):
                tag = tagdict.get(treebank_tag[0], 'n')
        elif lemmatag:
            tag = lemmatag
        return tag

    def format_tregex(results, whole = False):
        """format tregex by show list"""
        if countmode:
            return results
        import re
        done = []
        
        if whole:
            fnames = [x for x, y in results]
            results = [y for x, y in results]

        if 'l' in show or 'pl' in show:
            lemmata = lemmatiser(results, gettag(search.get('t'), lemmatag))
        else:
            lemmata = [None for i in results]
        for word, lemma in zip(results, lemmata):
            bits = []
            # in 'any' mode (or with a single exclude key), skip this result
            # as soon as any of the supplied exclude patterns matches
            if exclude and (len(list(exclude.keys())) == 1 or excludemode == 'any'):
                if exclude.get('w') and re.search(exclude.get('w'), word):
                    continue
                if exclude.get('l') and lemma and re.search(exclude.get('l'), lemma):
                    continue
                if exclude.get('p') and re.search(exclude.get('p'), word):
                    continue
                if exclude.get('pl') and lemma and re.search(exclude.get('pl'), lemma):
                    continue
            if exclude and excludemode == 'all':
                num_to_cause_exclude = len(list(exclude.keys()))
                current_num = 0
                if exclude.get('w'):
                    if re.search(exclude.get('w'), word):
                        current_num += 1
                if exclude.get('l'):
                    if re.search(exclude.get('l'), lemma):
                        current_num += 1
                if exclude.get('p'):
                    if re.search(exclude.get('p'), word):
                        current_num += 1
                if exclude.get('pl'):
                    if re.search(exclude.get('pl'), lemma):
                        current_num += 1   
                if current_num == num_to_cause_exclude:
                    continue                 

            for i in show:
                if i == 't':
                    bits.append(word)
                if i == 'l':
                    bits.append(lemma)
                elif i == 'w':
                    bits.append(word)
                elif i == 'p':
                    bits.append(word)
                elif i == 'pl':
                    bits.append(lemma)
            joined = '/'.join(bits)
            done.append(joined)

        if whole:
            done = zip(fnames, done)

        return done

    def tok_by_list(pattern, list_of_toks, concordancing = False, **kwargs):
        """search for regex in plaintext corpora"""
        import re
        if type(pattern) == str:
            pattern = [pattern]
        if not case_sensitive:
            pattern = [p.lower() for p in pattern]
        if not concordancing:
            if case_sensitive:
                matches = [m for m in list_of_toks if m in pattern]
            else:
                matches = [m for m in list_of_toks if m.lower() in pattern]
        else:
            matches = []
            for index, token in enumerate(list_of_toks):
                if token in pattern:
                    match = [' '.join([t for t in unsplitter(list_of_toks[:index])])[-140:]]
                    match.append(token)
                    match.append(' '.join([t for t in unsplitter(list_of_toks[index + 1:])])[:140])
                    matches.append(match)
        if countmode:
            return(len(matches))
        else:
            return matches
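    # For example (a sketch, with countmode off and case_sensitive False):
    # tok_by_list('cat', ['The', 'cat', 'sat']) collects every token whose
    # lowercased form is in the pattern list, returning ['cat'].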

    def unsplitter(lst):
        """unsplit contractions and apostophes from tokenised text"""
        if split_contractions:
            return lst
        unsplit = []
        for index, t in enumerate(lst):
            if index == 0 or index == len(lst) - 1:
                unsplit.append(t)
                continue
            if "'" in t and not t.endswith("'"):
                rejoined = ''.join([lst[index - 1], t])
                unsplit.append(rejoined)
            else:
                if not "'" in lst[index + 1]:
                    unsplit.append(t)
        return unsplit
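    # For example (a sketch, with split_contractions False):
    # unsplitter(['I', 'do', "n't", 'care']) -> ['I', "don't", 'care'], since a
    # token containing an apostrophe is glued back onto its predecessor.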

    def tok_ngrams(pattern, list_of_toks, concordancing = False, split_contractions = True):
        from collections import Counter
        import re
        ngrams = Counter()
        result = []
        # if it's not a compiled regex
        list_of_toks = [x for x in list_of_toks if re.search(regex_nonword_filter, x)]
        if pattern.lower() == 'any':
            pattern = r'.*'

        if not split_contractions:
            list_of_toks = unsplitter(list_of_toks)
            
            #list_of_toks = [x for x in list_of_toks if "'" not in x]
        for index, w in enumerate(list_of_toks):
            try:
                the_gram = [list_of_toks[index+x] for x in range(gramsize)]
                if not any(re.search(pattern, x) for x in the_gram):
                    continue
                ngrams[' '.join(the_gram)] += 1
            except IndexError:
                pass

        # turn counter into list of results
        for k, v in list(ngrams.items()):
            if v > 1:
                for i in range(v):
                    result.append(k)
        if countmode:
            return(len(result))
        else:
            return result
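    # For example (a sketch, with gramsize=2 and countmode off):
    # tok_ngrams('any', ['the', 'cat', 'sat', 'the', 'cat']) keeps only the
    # bigrams that occur more than once, returning ['the cat', 'the cat'].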

    def compiler(pattern):
        """compile regex or fail gracefully"""
        import re
        try:
            if case_sensitive:
                comped = re.compile(pattern)
            else:
                comped = re.compile(pattern, re.IGNORECASE)
            return comped
        except:
            import traceback
            import sys
            from time import localtime, strftime
            exc_type, exc_value, exc_traceback = sys.exc_info()
            lst = traceback.format_exception(exc_type, exc_value,
                          exc_traceback)
            error_message = lst[-1]
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: Query %s' % (thetime, error_message))
            if root:
                return 'Bad query'
            else:
                raise ValueError('%s: Query %s' % (thetime, error_message))

    def tok_by_reg(pattern, list_of_toks, concordancing = False, **kwargs):
        """search for regex in plaintext corpora"""
        import re
        comped = compiler(pattern)
        if comped == 'Bad query':
            return 'Bad query'
        if not concordancing:
            matches = [m for m in list_of_toks if re.search(comped, m)]
        else:
            matches = []
            for index, token in enumerate(list_of_toks):
                if re.search(comped, token):
                    match = [' '.join([t for t in unsplitter(list_of_toks[:index])])[-140:]]
                    match.append(re.search(comped, token).group(0))
                    match.append(' '.join([t for t in unsplitter(list_of_toks[index + 1:])])[:140])
                    matches.append(match)
        if countmode:
            return(len(matches))
        else:
            return matches

    def plaintext_regex_search(pattern, plaintext_data, concordancing = False, **kwargs):
        """search for regex in plaintext corpora

        it searches over lines, so the user needs to be careful.
        """
        import re
        if concordancing:
            pattern = r'(.{,140})\b(' + pattern + r')\b(.{,140})'
        compiled_pattern = compiler(pattern)
        if compiled_pattern == 'Bad query':
            return 'Bad query'
        matches = re.findall(compiled_pattern, plaintext_data)
        if concordancing:
            matches = [list(m) for m in matches]
        if not concordancing:
            for index, i in enumerate(matches):
                if type(i) == tuple:
                    matches[index] = i[0]
        if countmode:
            return(len(matches))
        else:
            return matches

    def correct_spelling(a_string):
        if not spelling:
            return a_string
        from dictionaries.word_transforms import usa_convert
        if spelling.lower() == 'uk':
            usa_convert = {v: k for k, v in list(usa_convert.items())}
        spell_out = []
        bits = a_string.split('/')
        for index, i in enumerate(bits):
            converted = usa_convert.get(i.lower(), i)
            if i.islower() or preserve_case is False:
                converted = converted.lower()
            elif i.isupper() and preserve_case:
                converted = converted.upper()
            elif i.istitle() and preserve_case:
                converted = converted.title()
            bits[index] = converted
        r = '/'.join(bits)
        return r
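    # For example (a sketch, assuming 'colour' is a key in usa_convert and
    # preserve_case is True): with spelling='US',
    # correct_spelling('colour/NN') -> 'color/NN'; with spelling='UK' the
    # mapping is inverted, so 'color/NN' -> 'colour/NN'.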

    def plaintext_simple_search(pattern, plaintext_data, concordancing = False, **kwargs):
        """search for tokens in plaintext corpora"""
        import re
        result = []
        if type(pattern) == str:
            pattern = [pattern]
        for p in pattern:
            if concordancing:
                pat = r'(.{0,140})\b(' + re.escape(p) + r')\b(.{0,140})'
            else:
                pat = r'\b' + re.escape(p) + r'\b'
            pat = compiler(pat)
            if pat == 'Bad query':
                return 'Bad query'
            matches = re.findall(pat, plaintext_data)
            if concordancing:
                matches = [list(m) for m in matches]
                for i in matches:
                    result.append(i)
            else:   
                for m in range(len(matches)):
                    result.append(p)
        return result
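    # For example (a sketch, with concordancing off): searching for 'cat' in
    # 'the cat sat on the cat mat' matches the escaped, word-bounded pattern
    # twice, so the function returns ['cat', 'cat'].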

    # do multiprocessing if need be
    im, corpus, search, query, just_speakers = is_multiquery(corpus, search, query, just_speakers)
    
    locs['search'] = search
    locs['query'] = query
    locs['just_speakers'] = just_speakers
    locs['corpus'] = corpus
    locs['multiprocess'] = multiprocess

    if im:
        signal.signal(signal.SIGINT, original_sigint)
        from multiprocess import pmultiquery
        return pmultiquery(**locs)

    datatype = corpus.datatype
    singlefile = corpus.singlefile

    # store all results in here
    results = {}
    count_results = {}
    conc_results = {}
    # check if just counting
    countmode = 'c' in show
    if countmode:
        no_conc = True
        only_conc = False
    # where we are at in interrogation
    current_iter = 0

    # multiprocessing progress bar
    denom = kwargs.get('denominator', 1)
    startnum = kwargs.get('startnum', 0)

    ############################################
    # Determine the search function to be used #
    ############################################
    
    # simple tregex is tregex over whole dirs
    simple_tregex_mode = False
    statsmode = False
    if not just_speakers and 't' in list(search.keys()):
        simple_tregex_mode = True
    else:
        if corpus.datatype == 'plaintext':
            if search.get('n'):
                raise NotImplementedError('Use a tokenised corpus for n-gramming.')
                #searcher = plaintext_ngram
                optiontext = 'n-grams via plaintext'
            if search.get('w'):
                if kwargs.get('regex', True):
                    searcher = plaintext_regex_search
                else:
                    searcher = plaintext_simple_search
                optiontext = 'Searching plaintext'

        elif corpus.datatype == 'tokens':
            if search.get('n'):
                searcher = tok_ngrams
                optiontext = 'n-grams via tokens'
            elif search.get('w'):
                if kwargs.get('regex', True):
                    searcher = tok_by_reg
                else:
                    searcher = tok_by_list
                if type(search.get('w')) == list:
                    searcher = tok_by_list
                optiontext = 'Searching tokens'
        only_parse = ['r', 'd', 'g', 'dl', 'gl', 'df', 'gf', 'dp', 'gp', 'f', 'd2', 'd2f', 'd2p', 'd2l']
        if corpus.datatype != 'parse' and any(i in only_parse for i in list(search.keys())):
            raise ValueError('Need parsed corpus to search with "%s" option(s).' % ', '.join([i for i in list(search.keys()) if i in only_parse]))

        elif corpus.datatype == 'parse':
            if search.get('t'):
                searcher = slow_tregex
            elif search.get('s'):
                searcher = get_stats
                statsmode = True
                optiontext = 'General statistics'
                global numdone
                numdone = 0
                no_conc = True
                only_conc = False
                do_concordancing = False
            else:
                from depsearch import dep_searcher
                searcher = dep_searcher
                optiontext = 'Dependency querying'

    ############################################
    #      Set some Tregex-related values      #
    ############################################

    if search.get('t'):
        translated_option = 't'
        query = search.get('t')

        # check the query
        q = tregex_engine(corpus = False, query = search.get('t'), 
                          options = ['-t'], check_query = True, root = root)
        if query is False:
            if root:
                return 'Bad query'
            else:
                return

        optiontext = 'Searching parse trees'
        if 'p' in show or 'pl' in show:
            translated_option = 'u'
            if type(search['t']) == list:
                search['t'] = r'__ < (/%s/ !< __)' % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'__ < (/.?[A-Za-z0-9].?/ !< __)'
        elif 't' in show:
            translated_option = 'o'
            if type(search['t']) == list:
                search['t'] = r'__ < (/%s/ !< __)' % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'__ < (/.?[A-Za-z0-9].?/ !< __)'
        elif 'w' in show:
            translated_option = 't'
            if type(search['t']) == list:
                search['t'] = r'/%s/ !< __' % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'/.?[A-Za-z0-9].?/ !< __'
        elif 'c' in show:
            only_count = True
            translated_option = 'C'
            if type(search['t']) == list:
                search['t'] = r'/%s/ !< __'  % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'/.?[A-Za-z0-9].?/ !< __'
        elif 'l' in show:
            translated_option = 't'
            if type(search['t']) == list:
                search['t'] = r'/%s/ !< __' % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'/.?[A-Za-z0-9].?/ !< __'

        query = search['t']

    ############################################
    # Make iterable for corpus/subcorpus/file  #
    ############################################

    if corpus.singlefile:
        to_iterate_over = {(corpus.name, corpus.path): [corpus]}
    elif not corpus.subcorpora:
        to_iterate_over = {(corpus.name, corpus.path): corpus.files}
    else:
        to_iterate_over = {}
        for subcorpus in corpus.subcorpora:
            to_iterate_over[(subcorpus.name, subcorpus.path)] = subcorpus.files
        #for k, v in sorted(corpus.structure.items(), key=lambda obj: obj[0].name):
        #    to_iterate_over[(k.name, k.path)] = v
    if files_as_subcorpora:
        to_iterate_over = {}
        for f in corpus.files:
            to_iterate_over[(f.name, f.path)] = [f]

    ############################################
    #           Print welcome message          #
    ############################################

    if no_conc:
        message = 'Interrogating'
    else:
        message = 'Interrogating and concordancing'
    if kwargs.get('printstatus', True):
        thetime = strftime("%H:%M:%S", localtime())

        sformat = '\n                 '.join(['%s: %s' % (k.rjust(3), v) for k, v in list(search.items())])
        if search == {'s': r'.*'}:
            sformat = 'features'
        welcome = '\n%s: %s %s ...\n          %s\n          Query: %s\n          %s corpus ... \n' % \
                  (thetime, message, corpus.name, optiontext, sformat, message)
        print(welcome)

    ############################################
    #           Make progress bar              #
    ############################################

    if simple_tregex_mode:
        total_files = len(list(to_iterate_over.keys()))
    else:
        if search.get('s'):
            total_files = sum([len(x) for x in list(to_iterate_over.values())]) * 12
        else:
            total_files = sum([len(x) for x in list(to_iterate_over.values())])

    par_args = {'printstatus': kwargs.get('printstatus', True),
                'root': root, 
                'note': note,
                'length': total_files,
                'startnum': kwargs.get('startnum'),
                'denom': kwargs.get('denominator', 1)}

    term = None
    if kwargs.get('paralleling', None) is not None:
        from blessings import Terminal
        term = Terminal()
        par_args['terminal'] = term
        par_args['linenum'] = kwargs.get('paralleling')

    outn = kwargs.get('outname', '')
    if outn:
        outn = outn + ': '
    tstr = '%s%d/%d' % (outn, current_iter, total_files)
    p = animator(None, None, init = True, tot_string = tstr, **par_args)
    tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
    animator(p, current_iter, tstr, **par_args)

    ############################################
    # Iterate over data, doing interrogations  #
    ############################################

    for (subcorpus_name, subcorpus_path), files in sorted(to_iterate_over.items()):

        conc_results[subcorpus_name] = []
        count_results[subcorpus_name] = []
        results[subcorpus_name] = Counter()
        
        # tregex over subcorpora, not files
        if simple_tregex_mode:

            op = ['-o', '-' + translated_option]                
            result = tregex_engine(query = search['t'], options = op, 
                                   corpus = subcorpus_path, root = root, preserve_case = preserve_case)

            if not countmode:
                result = format_tregex(result)

            if not no_conc:
                op += ['-w', '-f']
                whole_result = tregex_engine(query = search['t'], options = op, 
                                   corpus = subcorpus_path, root = root, preserve_case = preserve_case)
                
                if not only_format_match:
                    whole_result = format_tregex(whole_result, whole = True)

                conc_result = make_conc_lines_from_whole_mid(whole_result, result, speakr = False)

            if countmode:
                count_results[subcorpus_name] += [result]            
            else:
                result = Counter(result)
                results[subcorpus_name] += result
                if not no_conc:
                    for lin in conc_result:
                        if numconc < maxconc or not maxconc:
                            conc_results[subcorpus_name].append(lin)
                        numconc += 1

            current_iter += 1
            tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)

            animator(p, current_iter, tstr, **par_args)

        # dependencies, plaintext, tokens or slow_tregex
        else:
            for f in files:
                slow_treg_speaker_guess = kwargs.get('outname', False)
                if corpus.datatype == 'parse':
                    with open(f.path, 'r') as data:
                        data = data.read()
                        from corenlp_xml.document import Document
                        try:
                            corenlp_xml = Document(data)
                        except:
                            print('Could not read file: %s' % f.path)
                            continue
                        if just_speakers:  
                            sents = [s for s in corenlp_xml.sentences if s.speakername in just_speakers]
                            if len(just_speakers) == 1:
                                slow_treg_speaker_guess = just_speakers[0]
                            if not sents:
                                continue
                        else:
                            sents = corenlp_xml.sentences

                        res, conc_res = searcher(sents, search = search, show = show,
                            dep_type = dep_type,
                            exclude = exclude,
                            excludemode = excludemode,
                            searchmode = searchmode,
                            lemmatise = False,
                            case_sensitive = case_sensitive,
                            do_concordancing = do_concordancing,
                            only_format_match = only_format_match,
                            speaker = slow_treg_speaker_guess)
                        
                        if res == 'Bad query':
                            return 'Bad query'

                elif corpus.datatype == 'tokens':
                    import pickle
                    with codecs.open(f.path, "rb") as fo:
                        data = pickle.load(fo)
                    if not only_conc:
                        res = searcher(list(search.values())[0], data, split_contractions = split_contractions, 
                        concordancing = False)
                    if not no_conc:
                        conc_res = searcher(list(search.values())[0], data, split_contractions = split_contractions, 
                        concordancing = True)
                    if not no_conc:
                        for index, line in enumerate(conc_res):
                            line.insert(0, '')

                elif corpus.datatype == 'plaintext':
                    with codecs.open(f.path, 'rb', encoding = 'utf-8') as data:
                        data = data.read()
                        if not only_conc:
                            res = searcher(list(search.values())[0], data, 
                            concordancing = False)
                        if not no_conc:
                            conc_res = searcher(list(search.values())[0], data, 
                            concordancing = True)
                        if not no_conc:
                            for index, line in enumerate(conc_res):
                                line.insert(0, '')

                if countmode:
                    count_results[subcorpus_name] += [res]
                else:
                    # add filename and do lowercasing for conc
                    if not no_conc:
                        for index, line in enumerate(conc_res):
                            if searcher != slow_tregex:
                                line.insert(0, f.name)
                            else:
                                line[0] = f.name
                            if not preserve_case:
                                line[3:] = [x.lower() for x in line[3:]]
                            if spelling:
                                line = [correct_spelling(b) for b in line]
                            if numconc < maxconc or not maxconc:
                                conc_results[subcorpus_name].append(line)
                                numconc += 1

                    # do lowercasing and spelling
                    if not only_conc:
                        if not preserve_case:
                            if not statsmode:
                                res = [i.lower() for i in res]
                        if spelling:
                            if not statsmode:
                                res = [correct_spelling(r) for r in res]
                        #if not statsmode:
                        results[subcorpus_name] += Counter(res)
                        #else:
                        #results[subcorpus_name] += res

                if not statsmode:
                    current_iter += 1
                    tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
                    animator(p, current_iter, tstr, **par_args)

    # delete temp file if there
    import os
    if os.path.isfile('tmp.txt'):
        os.remove('tmp.txt')

    ############################################
    #     Get concordances into DataFrame      #
    ############################################

    if not no_conc:
        all_conc_lines = []
        for sc_name, resu in sorted(conc_results.items()):
            if only_unique:
                unique_results = uniquify(resu)
            else:
                unique_results = resu
            #make into series
            pindex = 'c f s l m r'.encode('utf-8').split()
            for fname, spkr, start, word, end in unique_results:
                #spkr = str(spkr, errors = 'ignore')
                fname = os.path.basename(fname)
                all_conc_lines.append(Series([sc_name,
                                     fname, \
                                     spkr, \
                                     start, \
                                     word, \
                                     end], \
                                     index = pindex))

        # randomise results...
        if random:
            from random import shuffle
            shuffle(all_conc_lines)

        conc_df = pd.concat(all_conc_lines, axis = 1).T

        # not doing anything yet --- this is for multimodal concordancing
        add_links = False
        if not add_links:
            conc_df.columns = ['c', 'f', 's', 'l', 'm', 'r']
        else:
            conc_df.columns = ['c', 'f', 's', 'l', 'm', 'r', 'link']

        if all(x == '' for x in list(conc_df['s'].values)):
            conc_df.drop('s', axis = 1, inplace = True)

        #if kwargs.get('note'):
        #    kwargs['note'].progvar.set(100)

        #if kwargs.get('printstatus', True):
        #    thetime = strftime("%H:%M:%S", localtime())
        #    finalstring = '\n\n%s: Concordancing finished! %d matches.\n' % (thetime, len(conc_df.index))
        #    print(finalstring)

        from interrogation import Concordance
        output = Concordance(conc_df)
        if only_conc:
            output.query = locs
            if quicksave:
                output.save()

            if kwargs.get('printstatus', True):
                thetime = strftime("%H:%M:%S", localtime())
                finalstring = '\n\n%s: Concordancing finished! %d results.' % (thetime, len(conc_df))
                print(finalstring)
            return output

        #output.query = locs

        #return output 

    ############################################
    #     Get interrogation into DataFrame     #
    ############################################

    if not only_conc:
        if countmode:
            df = Series({k: sum(v) for k, v in sorted(count_results.items())})
            tot = df.sum()
        else:
            the_big_dict = {}
            unique_results = set([item for sublist in list(results.values()) for item in sublist])
            for word in unique_results:
                the_big_dict[word] = [subcorp_result[word] for name, subcorp_result in sorted(results.items(), key=lambda x: x[0])]
            # turn master dict into dataframe, sorted
            df = DataFrame(the_big_dict, index = sorted(results.keys()))

            numentries = len(df.columns)
            tot = df.sum(axis = 1)
            total_total = df.sum().sum()

        ############################################
        # Format, output as Interrogation object   #
        ############################################

        if not countmode:
            if not corpus.subcorpora or singlefile:
                if not files_as_subcorpora:
                    if not kwargs.get('df1_always_df'):
                        df = Series(df.ix[0])
                        df.sort_values(ascending = False, inplace = True)
                        tot = df.sum()
                        numentries = len(df.index)
                        total_total = tot

        # sort by total
        if type(df) == pd.core.frame.DataFrame:
            if not df.empty:   
                df.ix['Total-tmp'] = df.sum()
                the_tot = df.ix['Total-tmp']
                df = df[the_tot.argsort()[::-1]]
                df = df.drop('Total-tmp', axis = 0)

        # format final string
        if kwargs.get('printstatus', True):
            thetime = strftime("%H:%M:%S", localtime())
            finalstring = '\n\n%s: Interrogation finished!' % thetime
            if countmode:
                finalstring += ' %d matches.' % tot
            else:
                finalstring += ' %d unique results, %d total occurrences.' % (numentries, total_total)
            print(finalstring)

        if not no_conc:
            interro = Interrogation(results = df, totals = tot, query = locs, concordance = output)
        else:
            interro = Interrogation(results = df, totals = tot, query = locs)

        if quicksave:
            interro.save()
        
        return interro
Example #49
0
    return True

# Get a list of all the words in Brown corpus.
words = brown.words()

# Get frequency distribution on the given condition.
sent_fd = nltk.FreqDist(
            word.lower() for word in words
            if len(word) == length and
               check_condition(word, userinput)
        )               

# Display the top 3 frequent words if applicable.                
series = Series(sent_fd)
series.sort_values(ascending=False, inplace=True)
sumValues = series.sum()
top_words = series.keys()
count = len(top_words)
if count > 0:
    i = 0
    while i < count and i < 3:
        print(str(i + 1) + ': ' + top_words[i] + '  (' +
              str(round(100.0 * series.iloc[i] / sumValues, 1)) + ' %)')
        i += 1
else:
    print("It doesn't seem like there is any word like that.")