Example #1
def _prepare_one_phenotype(C: NDArray[(Any, Any), Float], row: pd.Series,
                           correction: str, includes_intercept: bool) -> pd.Series:
    '''
    Creates the broadcasted information for one (phenotype, offset) pair. The returned series
    contains the information eventually stored in a LogRegState.

    This function accepts and returns a pandas series for integration with Pandas UDFs and
    pd.DataFrame.apply.
    '''
    y = row['values']
    mask = ~np.isnan(y)
    offset = row.get('offset')
    y_pred = _logistic_null_model_predictions(y, C, mask, offset)
    y_res = np.nan_to_num(y - y_pred)
    gamma = y_pred * (1 - y_pred)
    CtGammaC = C.T @ (gamma[:, None] * C)
    inv_CtGammaC = np.linalg.inv(CtGammaC)
    row.label = str(row.label)  # Ensure that the phenotype name is a string
    row.drop(['values', 'offset'], inplace=True, errors='ignore')
    row['y_res'] = np.ravel(y_res)
    row['gamma'] = np.ravel(gamma)
    row['inv_CtGammaC'] = np.ravel(inv_CtGammaC)
    if correction == correction_approx_firth:
        row['firth_offset'] = np.ravel(
            af.perform_null_firth_fit(y, C, mask, offset, includes_intercept))
    return row
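A minimal sketch of how _prepare_one_phenotype might be driven row-wise via pd.DataFrame.apply, as the docstring suggests. The covariate matrix C, the phenotype frame layout, and the 'none' correction value are assumptions, and _logistic_null_model_predictions must be in scope:

import numpy as np
import pandas as pd

n_samples = 100
# covariates: an intercept column plus one standard-normal covariate (illustrative)
C = np.column_stack([np.ones(n_samples), np.random.randn(n_samples)])
pheno_df = pd.DataFrame({
    'label': ['trait_a', 'trait_b'],
    'values': [np.random.binomial(1, 0.3, n_samples).astype(float) for _ in range(2)],
    'offset': [np.zeros(n_samples) for _ in range(2)],
})
# each row becomes the broadcast state for one (phenotype, offset) pair
state_df = pheno_df.apply(
    lambda row: _prepare_one_phenotype(C, row, correction='none', includes_intercept=True),
    axis=1)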
Example #2
def __clean_artifacts(data: pd.Series, threshold=0.2) -> pd.Series:
    """
    Cleans obviously illegal IBI values (artefacts) from a list

    Parameters
    ----------
    data : pd.Series
        the IBI list
    threshold : float, optional
        the maximum relative deviation between subsequent intervals, by default 0.2

    Returns
    -------
    pd.Series
        the cleaned IBIs
    """

    # Artifact detection - Statistical
    # for index in trange(data.shape[0]):
    #    # Remove RR intervals that differ more than 20% from the previous one
    #    if np.abs(data.iloc[index] - data.iloc[index - 1]) > 0.2 * data.iloc[index]:
    #        data.iloc[index] = np.nan

    # efficiency instead of loop ;-)
    diff = data.diff().abs()
    drop_indices = diff > threshold * data
    if drop_indices.any():
        data.drop(data[drop_indices].index, inplace=True)
    drop_indices = (data < 250) | (data > 2000)
    if drop_indices.any():
        data.drop(data[drop_indices].index,
                  inplace=True)  # drop by bpm > 240 or bpm < 30
    data.dropna(inplace=True)  # just to be sure

    return data
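A quick, self-contained check of the cleaning rules above on synthetic IBIs in milliseconds (the values are made up; the function is module-level here, so the double-underscore name is directly callable). The 3000 ms spike trips both the relative-deviation rule and the 250–2000 ms bounds, and the 100 ms artefact is likewise removed:

import pandas as pd

ibi = pd.Series([800.0, 810.0, 3000.0, 805.0, 100.0, 795.0])
cleaned = __clean_artifacts(ibi.copy(), threshold=0.2)
print(cleaned)  # only the plausible beats survive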
Example #4
    def test_plot_accessor_updates_on_inplace(self):
        s = Series([1, 2, 3, 4])
        _, ax = self.plt.subplots()
        ax = s.plot(ax=ax)
        before = ax.xaxis.get_ticklocs()

        s.drop([0, 1], inplace=True)
        _, ax = self.plt.subplots()
        after = ax.xaxis.get_ticklocs()
        tm.assert_numpy_array_equal(before, after)
Example #6
def test_drop_raise_exception_if_labels_not_in_level(msg, labels, level):
    # GH 8594
    mi = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]], names=["a", "b"])
    s = Series([10, 20, 30], index=mi)
    df = DataFrame([10, 20, 30], index=mi)

    with pytest.raises(KeyError, match=msg):
        s.drop(labels, level=level)
    with pytest.raises(KeyError, match=msg):
        df.drop(labels, level=level)
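For context, the test above is driven by a pytest parametrization supplying msg, labels and level. One plausible set of cases follows; these regexes are illustrative assumptions, and the exact messages in the pandas suite may differ:

import pytest

@pytest.mark.parametrize(
    "msg,labels,level",
    [
        (r"labels \[4\] not found in level", [4], "a"),  # 4 only exists in level "b"
        (r"labels \[7\] not found in level", [7], "b"),  # 7 exists in neither level
    ],
)
def test_drop_raise_exception_if_labels_not_in_level(msg, labels, level):
    ...  # body as in the example above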
Example #7
 def extractJSONData(dict1):
     ser1 = Series(dict1['MonitoredVehicleJourney'])
     ser1 = ser1.append(Series({'RecordedAtTime':dict1['RecordedAtTime']}))
     nextStops = ser1['OnwardCalls']['OnwardCall'][0]
     ser1.drop('OnwardCalls', inplace = True)
     ser1 = unnest(ser1)
     nextStops = unnest(Series(nextStops))
     nextStops.index = 'NextStop' + nextStops.index.values
     ser1 = pd.concat((ser1, nextStops))
     df_row = DataFrame(ser1).transpose()
     return(df_row)
Example #8
def get_ica_components(X, contribution=0.85):
    X_ica = FastICA(n_components=len(X.columns)).fit(X)

    L2 = Series(np.sum(X_ica.mixing_**2, axis=0))
    L2.sort_values(ascending=False, inplace=True)

    X_S = DataFrame(X_ica.transform(X))
    X_ica_mixing_ = DataFrame(X_ica.mixing_)

    L2.drop(L2.index[L2.cumsum() / L2.sum() >= contribution][1:], inplace=True)

    return X_S.reindex(columns=L2.index).values, X_ica_mixing_.reindex(
        columns=L2.index).values, X_ica.mean_, len(L2)
Example #9
def test_drop_with_ignore_errors():
    # errors='ignore'
    s = Series(range(3), index=list("abc"))
    result = s.drop("bc", errors="ignore")
    tm.assert_series_equal(result, s)
    result = s.drop(["a", "d"], errors="ignore")
    expected = s.iloc[1:]
    tm.assert_series_equal(result, expected)

    # GH 8522
    s = Series([2, 3], index=[True, False])
    assert s.index.is_object()
    result = s.drop(True)
    expected = Series([3], index=[False])
    tm.assert_series_equal(result, expected)
Example #12
    def _fit_strcols_to_terminal_width(self, strcols: list[list[str]]) -> str:
        from pandas import Series

        lines = self.adj.adjoin(1, *strcols).split("\n")
        max_len = Series(lines).str.len().max()
        # plus truncate dot col
        width, _ = get_terminal_size()
        dif = max_len - width
        # '+ 1' to avoid too wide repr (GH PR #17023)
        adj_dif = dif + 1
        col_lens = Series([Series(ele).apply(len).max() for ele in strcols])
        n_cols = len(col_lens)
        counter = 0
        while adj_dif > 0 and n_cols > 1:
            counter += 1
            mid = round(n_cols / 2)
            mid_ix = col_lens.index[mid]
            col_len = col_lens[mid_ix]
            # adjoin adds one
            adj_dif -= col_len + 1
            col_lens = col_lens.drop(mid_ix)
            n_cols = len(col_lens)

        # subtract index column
        max_cols_fitted = n_cols - self.fmt.index
        # GH-21180. Ensure that we print at least two.
        max_cols_fitted = max(max_cols_fitted, 2)
        self.fmt.max_cols_fitted = max_cols_fitted

        # Call again _truncate to cut frame appropriately
        # and then generate string representation
        self.fmt.truncate()
        strcols = self._get_strcols()
        return self.adj.adjoin(1, *strcols)
Example #13
    def generate_text_features(self, X: Series, feature: str) -> DataFrame:
        X: DataFrame = X.to_frame(name=feature)
        X[feature + '.char_count'] = [self.char_count(value) for value in X[feature]]
        X[feature + '.word_count'] = [self.word_count(value) for value in X[feature]]
        X[feature + '.capital_ratio'] = [self.capital_ratio(value) for value in X[feature]]
        X[feature + '.lower_ratio'] = [self.lower_ratio(value) for value in X[feature]]
        X[feature + '.digit_ratio'] = [self.digit_ratio(value) for value in X[feature]]
        X[feature + '.special_ratio'] = [self.special_ratio(value) for value in X[feature]]

        symbols = [
            '!', '?', '@', '%', '$', '*', '&', '#', '^', '.', ':', ' ', '/',
            ';', '-', '='
        ]
        for symbol in symbols:
            X[feature + '.symbol_count.' + symbol] = [
                self.symbol_in_string_count(value, symbol) for value in X[feature]
            ]
            X[feature + '.symbol_ratio.' + symbol] = (
                X[feature + '.symbol_count.' + symbol] / X[feature + '.char_count'])
            X[feature + '.symbol_ratio.' + symbol].fillna(0, inplace=True)

        X = X.drop(feature, axis=1)

        return X
Example #14
def apply_dcg_to_series(movie_id: int, data: pd.Series):
    # create lookup table (look up index in list given the item)
    lookup = index_reverse_lookup_dict(data.loc[movie_id])
    # remove base movie from results
    data = data.drop(index=movie_id)
    # apply dcg similarity over the data
    return data.apply(dcg_similarity, args=(lookup, ))
Example #15
    def make_line_chart_popup(data_row: pd.Series, title: str) -> folium.Popup:
        '''Create a line chart popup from a temporal Series for departements.
        The Series index has to be in {year}_median, {year}_decile_1, {year}_decile_9, {year+1}_median, {year+1}_decile_1... format.
        This popup can be added to map layers.'''
        # filter index names and build 3 columns from one(series)
        data = {
                'decile_1': data_row.filter(regex=".*decile_1$").values,
                'decile_9': data_row.filter(regex=".*decile_9$").values,
                'median': data_row.filter(like="median").values,
                }
        df_to_display = pd.DataFrame.from_dict(data)
        data_row = data_row.drop("color")

        # create index of the dataframe from the initial data_row Series.index
        df_to_display.index = pd.to_datetime(list(dict.fromkeys([int(annee_c[:4]) for annee_c in data_row.index.tolist()])), format="%Y")

        line_chart = vincent.Line(df_to_display,
                                width=300,
                                height=200)
        line_chart.axis_titles(x='Year', y='price per m2')
        line_chart.legend(title=title)

        popup = folium.Popup()
        folium.Vega(line_chart, width = 400, height=250).add_to(popup)
        return popup
Example #16
def test_drop_unique_and_non_unique_index(data, index, axis, drop_labels,
                                          expected_data, expected_index):

    s = Series(data=data, index=index)
    result = s.drop(drop_labels, axis=axis)
    expected = Series(data=expected_data, index=expected_index)
    tm.assert_series_equal(result, expected)
Example #17
def test_drop_index_ea_dtype(any_numeric_ea_dtype):
    # GH#45860
    df = Series(100, index=Index([1, 2, 2], dtype=any_numeric_ea_dtype))
    idx = Index([df.index[1]])
    result = df.drop(idx)
    expected = Series(100, index=Index([1], dtype=any_numeric_ea_dtype))
    tm.assert_series_equal(result, expected)
Example #18
def create_item_mod(possible_mod_ids: list, item_mod: pd.Series, rare_mods: pd.DataFrame, mod_value: int) -> dict:
    logger.info('Creating item mods dict')
    for mod_id in possible_mod_ids:
        # Comparing mod with rare mods multiple times is expensive
        possible_mods = rare_mods[rare_mods.mod_id == mod_id]
        if len(possible_mods) > 0:
            for idx, row in possible_mods.iterrows():
                min_value = row['min']
                max_value = row['max']
                if min_value < 0:
                    mod_value = -mod_value
                if min_value <= mod_value <= max_value + 1:
                    item_mod = possible_mods.loc[idx, :]
                    break
                else:
                    continue
            if isinstance(item_mod, dict):
                if not item_mod:
                    item_mod = possible_mods.loc[idx, :]
            else:
                if item_mod.empty:
                    item_mod = possible_mods.loc[idx, :]
            item_mod['value'] = mod_value
            item_mod = item_mod.drop(['min', 'max', 'required_level'])
            item_mod = item_mod.to_dict()
            break
    return item_mod
Example #19
def remover_resultado_concursos(possibilidades, resultado_concursos):
	"""
	Remove da lista de possibilidades os resultados já sorteados.
	
	:param possibilidades: Combinações possíveis da Lotofácil
	:param resultado_concursos: Resultado de todos os concursos
	
	return:	A lista de possibilidades sem os resultados já sorteados.
	"""
	from pandas import Series

	elem_ini = 0
	elem_fin = len(possibilidades) - 1
	
	indices = [buscar(
                      possibilidades,
                      elem_ini,
                      elem_fin,
                      valor_busca
	                 ) for valor_busca in resultado_concursos]
	
	s_possibilidades = Series(possibilidades)
	removidos = s_possibilidades.drop(indices)

	lista_possibilidades_atualizada = removidos.values 
	
	return lista_possibilidades_atualizada.tolist()
Example #20
 def test_cat_accessor_updates_on_inplace(self):
     s = Series(list("abc")).astype("category")
     return_value = s.drop(0, inplace=True)
     assert return_value is None
     return_value = s.cat.remove_unused_categories(inplace=True)
     assert return_value is None
     assert len(s.cat.categories) == 2
Example #22
def update_series():
    ser = Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
    print('Created from list + index:\n', ser)
    s = ser.drop('c')  # drop returns a new Series
    print('Result after dropping one label:\n', s)
    s = ser.drop(['a', 'c'])  # drop several labels at once
    print('Result after dropping multiple labels:\n', s)
    ser.pop('d')
    # ser.pop(0)  # positional pop: invalid key
    # ser.pop([0, 1])  # popping multiple labels: invalid key
    print('pop removes in place, modifying the source:\n', ser)
    ser[0] = 1000
    ser['f'] = 2000
    ser2 = Series([100, 200], index=['x', 'y'])
    ser.append(ser2)  # append returns a new Series; ser itself is unchanged
    print('Modified Series:\n', ser)
    print('Series append:\n', ser.append(ser2))
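Note that Series.append, used above, was deprecated in pandas 1.4 and removed in 2.0; the modern equivalent is pd.concat:

import pandas as pd

ser = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
ser2 = pd.Series([100, 200], index=['x', 'y'])
combined = pd.concat([ser, ser2])  # returns a new Series; ser is unchanged
print(combined)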
Example #23
def perf_per_month(df: pd.Series) -> str:
    """Returns the monthly performance of a strategy as JSON

    Arguments:
        df -- Series of NAV values with a datetime index

    Returns:
        JSON string of monthly returns keyed by month-end date
    """
    df = df.to_frame()
    df.index = pd.to_datetime(df.index, format="%Y-%m-%d %H:%M:%S").date
    df['eom'] = df.index + MonthEnd(0)
    df.drop_duplicates('eom', keep='last', inplace=True)
    df = df.loc[df.index == df['eom']]
    df['m_rets'] = df['nav'] / df['nav'].shift(1) - 1
    df.drop(columns=['eom', 'nav'], inplace=True)
    return df.to_json(orient='index')
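A minimal sketch of calling perf_per_month; the NAV numbers are made up, the series must be named 'nav' because the body indexes df['nav'], and MonthEnd is assumed to be imported from pandas.tseries.offsets in the function's module:

import pandas as pd

nav = pd.Series(
    [100.0, 101.5, 102.0, 103.0],
    index=["2021-01-15 00:00:00", "2021-01-31 00:00:00",
           "2021-02-28 00:00:00", "2021-03-31 00:00:00"],
    name="nav",  # required by the function body
)
print(perf_per_month(nav))  # JSON of monthly returns keyed by month-end date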
Example #24
 def __calc_line_parameters(
         self, lines: pd.Series) -> Tuple[np.ndarray, np.uint64]:
     """ Generate general parameters of the given acquisition """
     rel_idx = np.where(
         np.abs(lines.diff().pct_change(periods=1)) > self.change_thresh)[0]
     delta = np.uint64(
         lines.drop(rel_idx).reindex(np.arange(
             len(lines))).interpolate().diff().mean())
     return rel_idx[::2], delta
Example #25
def test_drop_pos_args_deprecation():
    # https://github.com/pandas-dev/pandas/issues/41485
    ser = Series([1, 2, 3])
    msg = (r"In a future version of pandas all arguments of Series\.drop "
           r"except for the argument 'labels' will be keyword-only")
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = ser.drop(1, 0)
    expected = Series([1, 3], index=[0, 2])
    tm.assert_series_equal(result, expected)
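The warning-free spelling keeps everything after labels keyword-only:

result = ser.drop(1, axis=0)  # no FutureWarning: axis passed by keyword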
Example #26
def test_closed_uneven():
    # see gh-21704
    ser = Series(data=np.arange(10), index=pd.date_range("2000", periods=10))

    # uneven
    ser = ser.drop(index=ser.index[[1, 5]])
    result = ser.rolling("3D", closed="left").min()
    expected = Series([np.nan, 0, 0, 2, 3, 4, 6, 6], index=ser.index)
    tm.assert_series_equal(result, expected)
Example #27
def series_test():
    print("==== Build a Series from a list ====")
    data_list = list(i for i in range(1, 10))
    index_list = list("index_%d" % i for i in range(1, 10))
    # Default integer index, starting at 0
    series = Series(data_list)
    print(series)
    series = Series(data_list, index_list)
    print(series)

    print("==== Build a Series from a dict ====")
    data_dict_1 = {"name": "hujiang", "age": 18, "sex": "man", "index_1": "balabala"}
    data_dict_2 = {"name": "hymanHu", "age": 19, "sex": "man", "index_1": "balabala"}
    # dict keys automatically become the index
    series_1 = Series(data_dict_1)
    print(series_1)
    # Where an index_list entry matches a dict key the dict value is used; otherwise NaN
    series_1 = Series(data_dict_1, index_list)
    print(series_1)
    series_2 = Series(data_dict_2, index_list)
    print(series_1 + series_2)  # combining multiple Series

    print("==== Insert ====")
    series["index_10"] = 100  # add one value
    print(series)

    print("==== Delete ====")
    print(series.drop("index_10"))
    print(series.drop(["index_10", "index_9"]))

    print("==== Update ====")
    series["index_9"] = 99
    print(series)

    print("==== Query ====")
    print(series.index)  # the Index object
    print(series.index.values)  # the index labels as an array
    print(series.values)  # the values as an array
    print(series.get("index_9"), series["index_9"])  # look up a value by label
    print(series[["index_8", "index_9"]])  # select multiple labels
    print(series["index_1":"index_3"])  # label-based slicing
    print(series[[1, 3]])  # positional selection
    print(series[0:4])  # positional slicing
    print(np.asarray(series))  # convert the Series to an ndarray
Example #28
 def load_sinks(self):
     """
     Load particle by particle sink flag values.
     """
     sinks = self._sink_value.value
     if self._drop_ids is not None:
         sinks = Series(sinks, index=self._particleIDs.value)
         self['sink_value'] = sinks.drop(self._drop_ids)
     else:
         self['sink_value'] = sinks
Example #29
def turn_row_into_price_action(row: pd.Series, fiat_currency: str) -> typing.Dict:
    price_action = {
        "timestamp": row.timestamp,
        "fiat_currency": fiat_currency,
        "action": {
            "type": "external_price_update",
            "tokens": row.drop("timestamp").to_dict()
        }
    }
    return price_action
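A sketch of feeding turn_row_into_price_action one row of a price table; the column names and values here are assumptions:

import pandas as pd

row = pd.Series({"timestamp": 1700000000, "ETH": 2000.5, "BTC": 35000.0})
action = turn_row_into_price_action(row, fiat_currency="USD")
# action["action"]["tokens"] == {"ETH": 2000.5, "BTC": 35000.0}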
Example #30
 def load_PIDs(self):
     """
     Load Particle ID numbers
     """
     particleIDs = self._particleIDs.value
     if self._drop_ids is not None:
         particleIDs = Series(particleIDs, index=self._particleIDs.value)
         self['particleIDs'] = particleIDs.drop(self._drop_ids)
     else:
         self['particleIDs'] = particleIDs
Example #31
def _loocv_loess(x: pd.Series,
                 y: pd.Series,
                 interpolator: Callable,
                 frac: Optional[float] = None) -> tuple:
    """
    Helper function for batch_correction. Computes loess correction with LOOCV.

    Parameters
    ----------
    x: pd.Series
    y: pd.Series
    frac: float, optional
        fraction of sample to use in LOESS correction. If None, determines the
        best value using LOOCV.
    interpolator = callable
        interpolator function used to predict new values.
    Returns
    -------
    corrected: pd.Series
        LOESS corrected data
    """
    if frac is None:
        # valid frac values, from 4/N to 1/N, where N is the number of corrector
        # samples.
        frac_list = [k / x.size for k in range(4, x.size + 1)]
        rms = np.inf  # initial value for root mean square error
        best_frac = 1
        for frac in frac_list:
            curr_rms = 0
            for loocv_index in x.index[1:-1]:
                y_temp = y.drop(loocv_index)
                x_temp = x.drop(loocv_index)
                y_loess = lowess(y_temp,
                                 x_temp,
                                 return_sorted=False,
                                 frac=frac)
                interp = interpolator(x_temp, y_loess)
                curr_rms += (y[loocv_index] - interp(x[loocv_index]))**2
            if rms > curr_rms:
                best_frac = frac
                rms = curr_rms
        frac = best_frac
    return lowess(y, x, return_sorted=False, frac=frac)
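A sketch of invoking _loocv_loess with SciPy's interp1d as the interpolator; the synthetic x/y series are assumptions, and lowess is expected to be statsmodels' smoother, imported in the function's module:

import numpy as np
import pandas as pd
from scipy.interpolate import interp1d

grid = np.linspace(0.0, 10.0, 20)
x = pd.Series(grid)
y = pd.Series(np.sin(grid) + np.random.normal(scale=0.1, size=grid.size))
corrected = _loocv_loess(x, y, interpolator=interp1d)  # frac picked by LOOCV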
Example #34
 def predict(self, x: Series):
     condition = x[self.sub_attribute]
     if condition in self.sub_trees.keys():
         if isinstance(self.sub_trees[condition], str):
             return self.sub_trees[condition]
         else:
             return self.sub_trees[condition].predict(
                 x.drop(self.sub_attribute))
     else:
         return self.data[self.label].value_counts().keys()[0]
Example #35
    def test_cat_accessor_updates_on_inplace(self):
        ser = Series(list("abc")).astype("category")
        return_value = ser.drop(0, inplace=True)
        assert return_value is None

        with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
            return_value = ser.cat.remove_unused_categories(inplace=True)

        assert return_value is None
        assert len(ser.cat.categories) == 2
Example #36
 def load_adiabatic_index(self):
     """
     Load particle adiabatic index.
     """
     gamma = self._Adiabatic_index.value
     if self._drop_ids is not None:
         gamma = Series(gamma, index=self._particleIDs.value)
         self['adiabatic_index'] = gamma.drop(self._drop_ids)
     else:
         self['adiabatic_index'] = gamma
Example #38
 def load_internal_energy(self, unit=None):
     """
     Load internal particle energies per unit mass in cgs units 
     (default set in units class)
     unit: unit conversion from code units
     """
     if unit:
         self.units.set_energy(unit)
     energy = self._internal_energy.value * self.units.energy_conv
     if self._drop_ids is not None:
         energy = Series(energy, index=self._particleIDs.value)
         self['internal_energy'] = energy.drop(self._drop_ids)
     else:
         self['internal_energy'] = energy
Example #39
 def test_drop_and_dropna_caching(self):
     # test that the cacher updates
     original = Series([1, 2, np.nan], name='A')
     expected = Series([1, 2], dtype=original.dtype, name='A')
     df = pd.DataFrame({'A': original.values.copy()})
     df2 = df.copy()
     df['A'].dropna()
     assert_series_equal(df['A'], original)
     df['A'].dropna(inplace=True)
     assert_series_equal(df['A'], expected)
     df2['A'].drop([1])
     assert_series_equal(df2['A'], original)
     df2['A'].drop([1], inplace=True)
     assert_series_equal(df2['A'], original.drop([1]))
Example #40
 def load_masses(self, unit=None):
     """
     Load Particle Masses in units of M_sun (default set in units class)
     unit: unit conversion from code units
     """
     if unit:
         self.units.set_mass(unit)
     masses = self._masses.value * self.units.mass_conv
     if self.units.remove_h:
         h = self._header.HubbleParam
         masses /= h
     if self._drop_ids is not None:
         masses = Series(masses, index=self._particleIDs.value)
         self['masses'] = masses.drop(self._drop_ids)
     else:
         self['masses'] = masses
Example #41
 def load_density(self, unit=None):
     """
     Load Particle Densities in cgs units (default set in units class)
     unit: unit conversion from code units
     """
     if unit:
         self.units.set_density(unit)
     density = self._density.value * self.units.density_conv
     if self.units.remove_h:
         h = self._header.HubbleParam
         density *=  h**2
     if self.units.coordinate_system == 'physical':
         ainv = self._header.Redshift + 1 # 1/(scale factor)
         density *= ainv**3
     if self._drop_ids is not None:
         density = Series(density, index=self._particleIDs.value)
         self['density'] = density.drop(self._drop_ids)
     else:
         self['density'] = density
Example #42
 def load_smoothing_length(self, unit=None):
     """
     Load Particle Smoothing Lengths in units of kpc.
     (default set in units class)
     unit: unit conversion from code units
     """
     if unit:
         self.units._set_smoothing_length(unit)
     hsml = self._smoothing_length.value * self.units.length_conv
     if self.units.remove_h:
         h = self._header.HubbleParam
         hsml /= h
     if self.units.coordinate_system == 'physical':
         a = self._header.ScaleFactor
         hsml *= a
     if self._drop_ids is not None:
         hsml = Series(hsml, index=self._particleIDs.value)
         self['smoothing_length'] = hsml.drop(self._drop_ids)
     else:
         self['smoothing_length'] = hsml
Example #43
def _esd(x, max_outlier, alpha, direction):
    """
    The ESD test using median and MAD in the calculation of the test statistic.
    """
    x = Series(x)
    n = len(x)
    outlier_index = []
    for i in range(1, max_outlier + 1):
        median = x.median()
        mad = np.median([abs(value - median) for value in x]) * _MAD_CONSTANT
        if mad == 0:
            break
        if direction == 'both':
            ares = x.map(lambda value: abs(value - median) / mad)
        elif direction == 'pos':
            ares = x.map(lambda value: (value - median) / mad)
        elif direction == 'neg':
            ares = x.map(lambda value: (median - value) / mad)
        r_idx = ares.idxmax()
        r = ares[r_idx]
        if direction == 'both':
            p = 1.0 - alpha / (2 * (n - i + 1))
        else:
            p = 1.0 - alpha / (n - i + 1)
        crit = t.ppf(p, n-i-1)
        lam = (n-i)*crit / np.sqrt((n-i-1+crit**2) * (n-i+1))
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug("%s/%s outlier. median=%s, mad=%s, r_idx=%s, r=%s, crit=%s, lam=%s" %
                         (i, max_outlier, median, mad, r_idx, r, crit, lam))
        if r > lam:
            outlier_index.append(r_idx)
            x = x.drop(r_idx)
        else:
            # The r keeps decreasing while lam keeps increasing. Therefore, when r is less than lam for the first time,
            # we can stop.
            break
    return outlier_index
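A minimal run of the median/MAD ESD test on data with one planted outlier; _MAD_CONSTANT (commonly 1.4826) and the module logger are assumed to exist alongside the function, as are the Series and scipy.stats.t imports it relies on:

import numpy as np

rng = np.random.default_rng(0)
x = np.concatenate([rng.normal(0.0, 1.0, 99), [15.0]])
print(_esd(x, max_outlier=5, alpha=0.05, direction='both'))  # should flag index 99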
Example #44
print x.ix[['a', 'b', 'd', 'c'], states]
'''
   A  B  C  D
a  0  1  2  0
b  3  4  5  0
d  6  7  8  0
c  3  4  5  0
'''

print 'Series: dropping rows by index label'
x = Series(numpy.arange(4), index=['a', 'b', 'c', 'd'])
print x.drop('c')
'''
a    0
b    1
d    3
dtype: int32
'''
print x.drop(['a', 'b'])  # fancy (list-based) deletion
'''
c    2
d    3
dtype: int32
'''

Example #45
obj3=Series(['blue','purple','yellow'],index=[0,2,4])

obj3.reindex(range(6),method='ffill')

#*********************************************************
frame=DataFrame(np.arange(9).reshape((3,3)),index=['a','c','d'],columns=['Ohio','Texas','California'])

frame2=frame.reindex(['a','b','c','d'])



# drop: remove entries by index label

obj=Series(np.arange(5),index=['a','b','c','d','e'])

new_obj=obj.drop('c')

data=DataFrame(np.arange(16).reshape((4,4)),index=['Ohio','Colorado','Utah','New York'],columns=['one','two','three','four'])

# drop a column

data.drop('three',axis=1)

# use ix to address an element directly by row and column
data.ix['Colorado',['one','four']]


#***************************
# Adding two Series aligns on index: matching labels are summed, and labels
# present in only one Series come out as NaN.
# Note: list('abcd') is equivalent to ['a', 'b', 'c', 'd']

Example #46


###################################################################
# Dropping entries from an axis
###################################################################

'''
Dropping one or more entries from an axis is easy if you have an index array or list
without those entries. As that can require a bit of munging and set logic, the drop
method will return a new object with the indicated value or values deleted from an axis:
'''
obj = Series(np.arange(5.), index = ['a', 'b', 'c', 'd', 'e'])
new_obj = obj.drop('c')
print(new_obj)
'''
a    0
b    1
d    3
e    4
dtype: float64
'''
print(obj.drop(['d', 'c']))
'''
a    0
b    1
e    4
dtype: float64
'''
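drop works the same way on a DataFrame, where axis chooses between rows and columns; this frame is illustrative:

data = DataFrame(np.arange(16).reshape((4, 4)),
                 index=['Ohio', 'Colorado', 'Utah', 'New York'],
                 columns=['one', 'two', 'three', 'four'])
print(data.drop(['Colorado', 'Ohio']))  # drop rows by label
print(data.drop('two', axis=1))         # drop a column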
Example #47
def main():
    # reindex
    obj = Series(range(4), index="a b c d".split(" ")[::-1])
    print obj

    obj2 = obj.reindex("a b c d e".split(" "))
    print obj2

    # Change NaN
    print obj.reindex("a b c d e".split(" "), fill_value=0)
    colors = ["blue", "purple", "yellow"]
    index = [0, 2, 4]
    obj3 = Series(colors, index=index)
    print obj3.reindex(range(6))
    print obj3.reindex(range(6), method="ffill")  # not found forward fill
    print obj3.reindex(range(6), method="backfill")  # bfill

    # DataFrame
    states = ["Ohio", "Texas", "California"]
    frame = DataFrame(np.arange(9).reshape((3, 3)), index="a b c".split(" "), columns=["Ohio", "Texas", "California"])
    print frame
    frame2 = frame.reindex("a b c d".split(" "))
    print frame2
    states[0] = "Utah"
    states[1], states[0] = states[:2]
    print frame.reindex(columns=states)
    # fill
    print frame.reindex("a b c d".split(" "), method="ffill", columns=states)
    print frame.ix["a b c d".split(" ")]
    print frame.ix["a b c d".split(" "), states]

    # Delete column
    print "", ""
    obj = Series(range(5), index="a b c d e".split(" "))
    new_obj = obj.drop("c")
    print new_obj
    print obj

    # Index reference
    print "", ""
    obj = Series(np.arange(4.0), index="a b c d".split(" "))
    print obj["b"]
    print obj[1]  # same
    print obj[2:4]
    print obj[["b", "a", "c"]]
    print obj[[1, 3]]
    print obj[obj < 2]
    # Slice with label
    print obj["b":"c"]  # include 'c'
    obj["b":"c"] = 5
    print obj

    data = DataFrame(
        np.arange(16).reshape((4, 4)),
        index=["Ohio", "Colorado", "Utah", "New York"],
        columns=["one", "two", "three", "four"],
    )
    print data
    # column
    print data["two"]
    print data[["three", "one"]]
    # row
    print data[:2]
    print data[data["three"] > 5]
    # all values
    print data < 5
    data[data < 5] = 0
    print data
    # row and column
    print data.ix[["Colorado"], ["two", "three"]]
    print data.ix[["Colorado", "Utah"], [3, 0, 1]]
    # row
    print data.ix[2]
    # label row and column, return column
    print data.ix[:"Utah", "two"]
    # xs
    # row
    print data.xs("Utah")
    print data.xs("Utah", axis=0)
    # rows
    print data.xs("two", axis=1)
    # icol/irow i is index
    print data.icol(1)
    print data.irow(1)

    # Union
    print "", ""
    s1 = Series([7.3, -2.5, 3.4, 1.5], index=["a", "c", "d", "e"])
    s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=["a", "c", "e", "f", "g"])
    print s1
    print s2
    # index is union, but d, f, g are NaN
    print s1 + s2
    df1 = DataFrame(np.arange(9.0).reshape((3, 3)), columns=list("bcd"), index=["Ohio", "Texas", "Colorado"])
    df2 = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    print df1
    print df2
    print df1 + df2

    # arithmetic method
    print "", ""
    df1 = DataFrame(np.arange(12.0).reshape((3, 4)), columns=list("abcd"))
    df2 = DataFrame(np.arange(20.0).reshape((4, 5)), columns=list("abcde"))
    print df1
    print df2
    print df1.add(df2, fill_value=0)
    # reindex has fill_value argument
    # other arithmetic method are sub/div/mul(ti)

    # Calculation in a DataFrame and Series
    print "", ""
    # subtract from each row. broadcat
    arr = np.arange(12.0).reshape((3, 4))
    print arr
    print arr[0]
    print arr - arr[0]
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    series = frame.ix[0]
    print frame
    print series
    print frame - series

    series2 = Series(range(3), index=list("bef"))
    print frame + series2

    series3 = frame["d"]
    series4 = frame.ix[0]
    print frame
    print series3
    print series4
    print frame.sub(series3, axis=0)
    print frame.sub(series4, axis=1)

    # apply function and mapping
    print "", ""
    frame = DataFrame(np.arange(12.0).reshape((4, 3)), columns=list("bde"), index=["Utah", "Ohio", "Texas", "Oregon"])
    print frame
    f = lambda x: x.max() - x.min()
    print frame.apply(f)
    print frame.apply(f, axis=1)

    f = lambda x: Series([x.min(), x.max()], index=["min", "max"])
    print frame.apply(f)

    format = lambda x: "{0:.2f}".format(x)
    print frame.applymap(format)  # frame
    print frame["e"].map(format)  # series

    # sort and rank
    print "", ""
    obj = Series(range(4), index=list("dabc"))
    print obj
    print obj.sort_index()

    frame = DataFrame(np.arange(8).reshape((2, 4)), index=["three", "one"], columns=list("dabc"))
    print frame
    print frame.sort_index()
    print frame.sort_index(axis=1)
    print frame.sort_index(axis=1, ascending=False)

    # Sorting series
    print "", ""
    obj = Series([4, 7, -3, 2])
    print obj.order()
    obj = Series([4, np.nan, 7, np.nan, -3, 2])
    print obj.order()
    print obj.order(ascending=False)

    # order by multi columns
    print "", ""
    frame = DataFrame({"b": [4, 7, -3, 2], "a": [0, 1, 0, 1]})
    print frame.sort_index(by=["a", "b"])

    # rank
    print "", ""
    obj = Series([7, -5, 7, 4, 2, 0, 4])
    print obj.rank()  # method is average
    print obj.rank(method="first")  # No Duplicates
    print obj.rank(ascending=False, method="min")
    print obj.rank(ascending=False, method="max")
    f1 = DataFrame(obj, columns=["data"])
    f2 = DataFrame(obj.rank(), columns=["rank"])
    # merge by each index
    print pd.merge(f1, f2, left_index=True, right_index=True)

    # Index of the axis with duplicate values
    print "", ""
    obj = Series(range(5), index=list("aaabc"))
    print obj
    print obj.index.is_unique
    print obj["a"]
    print obj["c"]

    df = DataFrame(np.arange(12.0).reshape((4, 3)), index=list("aabb"), columns=list("ccd"))
    print df
    print df.ix["b"]
    print df["c"]
Example #48
#Assign indexes
mySeries=Series([4,5,2],
	index=['Apples','Oranges','Grapes'])
mySeries

#Data filtering 
mySeries['Oranges']
'Apples' in mySeries 
mySeries

#Add a value
mySeries['Pears']=6
mySeries

#Delete a value 
mySeries.drop(['Oranges'])

#Create series from a dictionary
myDict={'USA':75,'Canada':20}
dictSeries=Series(myDict)
dictSeries

#Creating a data frame from a dictionary
empDict={'id':[1,2,3,4],'name':['Mark','Ian','Sam','Rich'],'isManager':[False,True,False,True]}

## Data Structure : Data Frame from a dictionary
empDf=DataFrame(empDict)

#Access rows and columns (dot access works because 'name' is a column)
empDf.name
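One caveat worth noting for the deletion step above: Series.drop returns a new Series, so the drop does not stick unless the result is rebound:

mySeries = mySeries.drop(['Oranges'])
mySeries  # 'Oranges' is now gone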
Example #49
def test_drop():
    # unique
    s = Series([1, 2], index=['one', 'two'])
    expected = Series([1], index=['one'])
    result = s.drop(['two'])
    assert_series_equal(result, expected)
    result = s.drop('two', axis='rows')
    assert_series_equal(result, expected)

    # non-unique
    # GH 5248
    s = Series([1, 1, 2], index=['one', 'two', 'one'])
    expected = Series([1, 2], index=['one', 'one'])
    result = s.drop(['two'], axis=0)
    assert_series_equal(result, expected)
    result = s.drop('two')
    assert_series_equal(result, expected)

    expected = Series([1], index=['two'])
    result = s.drop(['one'])
    assert_series_equal(result, expected)
    result = s.drop('one')
    assert_series_equal(result, expected)

    # single string/tuple-like
    s = Series(range(3), index=list('abc'))
    pytest.raises(KeyError, s.drop, 'bc')
    pytest.raises(KeyError, s.drop, ('a',))

    # errors='ignore'
    s = Series(range(3), index=list('abc'))
    result = s.drop('bc', errors='ignore')
    assert_series_equal(result, s)
    result = s.drop(['a', 'd'], errors='ignore')
    expected = s.iloc[1:]
    assert_series_equal(result, expected)

    # bad axis
    pytest.raises(ValueError, s.drop, 'one', axis='columns')

    # GH 8522
    s = Series([2, 3], index=[True, False])
    assert s.index.is_object()
    result = s.drop(True)
    expected = Series([3], index=[False])
    assert_series_equal(result, expected)

    # GH 16877
    s = Series([2, 3], index=[0, 1])
    with tm.assert_raises_regex(KeyError, 'not contained in axis'):
        s.drop([False, True])
Example #50
def editor(interrogation, 
           operation=None,
           denominator=False,
           sort_by=False,
           keep_stats=False,
           keep_top=False,
           just_totals=False,
           threshold='medium',
           just_entries=False,
           skip_entries=False,
           merge_entries=False,
           just_subcorpora=False,
           skip_subcorpora=False,
           span_subcorpora=False,
           merge_subcorpora=False,
           replace_names=False,
           replace_subcorpus_names=False,
           projection=False,
           remove_above_p=False,
           p=0.05, 
           print_info=False,
           spelling=False,
           selfdrop=True,
           calc_all=True,
           keyword_measure='ll',
           **kwargs
          ):
    """
    See corpkit.interrogation.Interrogation.edit() for docstring
    """

    # grab arguments, in case we get dict input and have to iterate
    locs = locals()

    import corpkit

    import re
    import collections
    import pandas as pd
    import numpy as np

    from pandas import DataFrame, Series
    from time import localtime, strftime
    
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        have_ipython = False
    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass

    # to use if we also need to worry about concordance lines
    return_conc = False

    from corpkit.interrogation import Interrodict, Interrogation, Concordance
    if interrogation.__class__ == Interrodict:
        locs.pop('interrogation', None)
        from collections import OrderedDict
        outdict = OrderedDict()
        for i, (k, v) in enumerate(interrogation.items()):
            # only print the first time around
            if i != 0:
                locs['print_info'] = False

            if isinstance(denominator, STRINGTYPE) and denominator.lower() == 'self':
                denominator = interrogation

            # if df2 is also a dict, get the relevant entry

            if isinstance(denominator, (dict, Interrodict)):
                #if sorted(set([i.lower() for i in list(dataframe1.keys())])) == \
                #   sorted(set([i.lower() for i in list(denominator.keys())])):
                #   locs['denominator'] = denominator[k]

                # fix: this repeats itself for every key, when it doesn't need to
                # denominator_sum: 
                if kwargs.get('denominator_sum'):
                    locs['denominator'] = denominator.collapse(axis='key')

                if kwargs.get('denominator_totals'):
                    locs['denominator'] = denominator[k].totals
                else:
                    locs['denominator'] = denominator[k].results


            outdict[k] = v.results.edit(**locs)
        if print_info:
            
            thetime = strftime("%H:%M:%S", localtime())
            print("\n%s: Finished! Output is a dictionary with keys:\n\n         '%s'\n" % (thetime, "'\n         '".join(sorted(outdict.keys()))))
        return Interrodict(outdict)

    elif isinstance(interrogation, (DataFrame, Series)):
        dataframe1 = interrogation
    elif isinstance(interrogation, Interrogation):
        #if interrogation.__dict__.get('concordance', None) is not None:
        #    concordances = interrogation.concordance
        branch = kwargs.pop('branch', 'results')
        if branch.lower().startswith('r') :
            dataframe1 = interrogation.results
        elif branch.lower().startswith('t'):
            dataframe1 = interrogation.totals
        elif branch.lower().startswith('c'):
            dataframe1 = interrogation.concordance
            return_conc = True
        else:
            dataframe1 = interrogation.results
    
    elif isinstance(interrogation, Concordance) or \
                        all(x in list(getattr(interrogation, 'columns', [])) for x in ['l', 'm', 'r']):
        return_conc = True
        dataframe1 = interrogation
    # hope for the best
    else:
        dataframe1 = interrogation

    the_time_started = strftime("%Y-%m-%d %H:%M:%S")

    pd.options.mode.chained_assignment = None

    try:
        from process import checkstack
    except ImportError:
        from corpkit.process import checkstack
        
    if checkstack('pythontex'):
        print_info=False

    def combiney(df, df2, operation='%', threshold='medium', prinf=True):
        """mash df and df2 together in appropriate way"""
        totals = False
        # delete under threshold
        if just_totals:
            if using_totals:
                if not single_totals:
                    to_drop = list(df2[df2['Combined total'] < threshold].index)
                    df = df.drop([e for e in to_drop if e in list(df.index)])
                    if prinf:
                        to_show = list(to_drop[:5])
                        if len(to_drop) > 10:
                            to_show.append('...')
                            to_show.extend(to_drop[-5:])
                        if len(to_drop) > 0:
                            print('Removing %d entries below threshold:\n    %s' % (len(to_drop), '\n    '.join(to_show)))
                        if len(to_drop) > 10:
                            print('... and %d more ... \n' % (len(to_drop) - len(to_show) + 1))
                        else:
                            print('')
                else:
                    denom = df2
        else:
            denom = list(df2)
        if single_totals:
            if operation == '%':
                totals = df.sum() * 100.0 / float(df.sum().sum())
                df = df * 100.0
                try:
                    df = df.div(denom, axis=0)
                except ValueError:
                    
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime)
            elif operation == '+':
                try:
                    df = df.add(denom, axis=0)
                except ValueError:
                    
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime)
            elif operation == '-':
                try:
                    df = df.sub(denom, axis=0)
                except ValueError:
                    
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime)
            elif operation == '*':
                totals = df.sum() * float(df.sum().sum())
                try:
                    df = df.mul(denom, axis=0)
                except ValueError:
                    
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime)
            elif operation == '/':
                try:
                    totals = df.sum() / float(df.sum().sum())
                    df = df.div(denom, axis=0)
                except ValueError:
                    
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime)

            elif operation == 'a':
                for c in [c for c in list(df.columns) if int(c) > 1]:
                    df[c] = df[c] * (1.0 / int(c))
                df = df.sum(axis=1) / df2
            
            elif operation.startswith('c'):
                import warnings
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    df = pd.concat([df, df2], axis=1)
            return df, totals

        elif not single_totals:
            if not operation.startswith('a'):
                # generate totals
                if operation == '%':
                    totals = df.sum() * 100.0 / float(df2.sum().sum())
                if operation == '*':
                    totals = df.sum() * float(df2.sum().sum())
                if operation == '/':
                    totals = df.sum() / float(df2.sum().sum())
                if operation.startswith('c'):
                    # add here the info that merging will not work 
                    # with identical colnames
                    import warnings
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        d = pd.concat([df.T, df2.T])
                        # make index nums
                        d = d.reset_index()
                        # sum and remove duplicates
                        d = d.groupby('index').sum()
                        dx = d.reset_index('index')
                        dx.index = list(dx['index'])
                        df = dx.drop('index', axis=1).T

                def editf(datum):
                    meth = {'%': datum.div,
                            '*': datum.mul,
                            '/': datum.div,
                            '+': datum.add,
                            '-': datum.sub}

                    if datum.name in list(df2.columns):

                        method = meth[operation]
                        mathed = method(df2[datum.name], fill_value=0.0)
                        if operation == '%':
                            return mathed * 100.0
                        else:
                            return mathed
                    else:
                        return datum * 0.0

                df = df.apply(editf)

            else:
                for c in [c for c in list(df.columns) if int(c) > 1]:
                    df[c] = df[c] * (1.0 / int(c))
                df = df.sum(axis=1) / df2.T.sum()

        return df, totals

    def parse_input(df, the_input):
        """turn whatever has been passed in into list of words that can 
           be used as pandas indices---maybe a bad way to go about it"""
        parsed_input = False
        import re
        if the_input == 'all':
            the_input = r'.*'
        if isinstance(the_input, int):
            try:
                the_input = str(the_input)
            except:
                pass
            the_input = [the_input]
        elif isinstance(the_input, STRINGTYPE):
            regex = re.compile(the_input)
            parsed_input = [w for w in list(df) if re.search(regex, w)]
            return parsed_input
        from corpkit.dictionaries.process_types import Wordlist
        if isinstance(the_input, Wordlist) or the_input.__class__ == Wordlist:
            the_input = list(the_input)
        if isinstance(the_input, list):
            if isinstance(the_input[0], int):
                parsed_input = [word for index, word in enumerate(list(df)) if index in the_input]
            elif isinstance(the_input[0], STRINGTYPE):
                try:
                    parsed_input = [word for word in the_input if word in df.columns]
                except AttributeError: # if series
                    parsed_input = [word for word in the_input if word in df.index]
        return parsed_input

    def synonymise(df, pos='n'):
        """pass a df and a pos and convert df columns to most common synonyms"""
        from nltk.corpus import wordnet as wn
        #from dictionaries.taxonomies import taxonomies
        from collections import Counter
        fixed = []
        for w in list(df.columns):
            try:
                synonyms = []
                for syn in wn.synsets(w, pos=pos):
                    for lemma in syn.lemma_names():
                        synonyms.append(lemma)
                top_syn = Counter(synonyms).most_common(1)[0][0]
                fixed.append(top_syn)
            except:
                fixed.append(w)
        df.columns = fixed
        return df

    def convert_spell(df, convert_to='US', print_info=print_info):
        """turn dataframes into us/uk spelling"""
        from dictionaries.word_transforms import usa_convert
        if print_info:
            print('Converting spelling ... \n')
        if convert_to == 'UK':
            usa_convert = {v: k for k, v in list(usa_convert.items())}
        fixed = []
        for val in list(df.columns):
            try:
                fixed.append(usa_convert[val])
            except:
                fixed.append(val)
        df.columns = fixed
        return df

    def merge_duplicates(df, print_info=print_info):
        if print_info:
            print('Merging duplicate entries ... \n')
        # now we have to merge all duplicates
        for dup in df.columns.get_duplicates():
            #num_dupes = len(list(df[dup].columns))
            temp = df[dup].sum(axis=1)
            #df = df.drop([dup for d in range(num_dupes)], axis=1)
            df = df.drop(dup, axis=1)
            df[dup] = temp
        return df

    def name_replacer(df, replace_names, print_info=print_info):
        """replace entry names and merge"""
        import re
        # get input into list of tuples
        # if it's a string, we want to delete it
        if isinstance(replace_names, STRINGTYPE):
            replace_names = [(replace_names, '')]
        # this is for some malformed list
        if not isinstance(replace_names, dict):
            if isinstance(replace_names[0], STRINGTYPE):
                replace_names = [replace_names]
        # if dict, make into list of tupes
        if isinstance(replace_names, dict):
            replace_names = [(v, k) for k, v in replace_names.items()]
        for to_find, replacement in replace_names:
            if print_info:
                if replacement:
                    print('Replacing "%s" with "%s" ...\n' % (to_find, replacement))
                else:
                    print('Deleting "%s" from entry names ...\n' % to_find)
            to_find = re.compile(to_find)
            if not replacement:
                replacement = ''
            df.columns = [re.sub(to_find, replacement, l) for l in list(df.columns)]
        df = merge_duplicates(df, print_info=False)
        return df

    def just_these_entries(df, parsed_input, prinf=True):
        entries = [word for word in list(df) if word not in parsed_input]
        if prinf:
            print('Keeping %d entries:\n    %s' % \
                (len(parsed_input), '\n    '.join(parsed_input[:10])))
            if len(parsed_input) > 10:
                print('... and %d more ... \n' % (len(parsed_input) - 10))
            else:
                print('')
        df = df.drop(entries, axis=1)
        return df

    def skip_these_entries(df, parsed_input, prinf=True):
        if prinf:     
            print('Skipping %d entries:\n    %s' % \
                (len(parsed_input), '\n    '.join(parsed_input[:10])))
            if len(parsed_input) > 10:
                print('... and %d more ... \n' % (len(parsed_input) - 10))
            else:
                print('')
        df = df.drop(parsed_input, axis=1)
        return df

    def newname_getter(df, parsed_input, newname='combine', prinf=True, merging_subcorpora=False):
        """makes appropriate name for merged entries"""
        if merging_subcorpora:
            if newname is False:
                newname = 'combine'
        if isinstance(newname, int):
            the_newname = list(df.columns)[newname]
        elif isinstance(newname, STRINGTYPE):
            if newname == 'combine':
                if len(parsed_input) <= 3:
                    the_newname = '/'.join(parsed_input)
                elif len(parsed_input) > 3:
                    the_newname = '/'.join(parsed_input[:3]) + '...'
            else:
                the_newname = newname
        if not newname:
            # revise this code
            import operator
            sumdict = {}
            for item in parsed_input:
                summed = sum(list(df[item]))
                sumdict[item] = summed
            the_newname = max(iter(sumdict.items()), key=operator.itemgetter(1))[0]
        if not isinstance(the_newname, STRINGTYPE):
            the_newname = str(the_newname, errors='ignore')
        return the_newname

    def merge_these_entries(df, parsed_input, the_newname, prinf=True, merging='entries'):
        # make new entry with sum of parsed input
        if len(parsed_input) == 0:
            import warnings
            warnings.warn('No %s could be automatically merged.\n' % merging)
        else:
            if prinf:
                print('Merging %d %s as "%s":\n    %s' % \
                    (len(parsed_input), merging, the_newname, '\n    '.join(parsed_input[:10])))
                if len(parsed_input) > 10:
                    print('... and %d more ... \n' % (len(parsed_input) - 10))
                else:
                    print('')
        # remove old entries
        temp = sum([df[i] for i in parsed_input])

        if isinstance(df, Series):
            df = df.drop(parsed_input, errors='ignore')
            nms = list(df.index)
        else:
            df = df.drop(parsed_input, axis=1, errors='ignore')
            nms = list(df.columns)
        if the_newname in nms:
            df[the_newname] = df[the_newname] + temp
        else:
            df[the_newname] = temp
        return df

    def just_these_subcorpora(df, lst_of_subcorpora, prinf=True):        
        if isinstance(lst_of_subcorpora[0], int):
            lst_of_subcorpora = [str(l) for l in lst_of_subcorpora]
        good_years = [subcorpus for subcorpus in list(df.index) if subcorpus in lst_of_subcorpora]
        if prinf:
            print('Keeping %d subcorpora:\n    %s' % (len(good_years), '\n    '.join(good_years[:10])))
            if len(good_years) > 10:
                print('... and %d more ... \n' % (len(good_years) - 10))
            else:
                print('')
        df = df.drop([subcorpus for subcorpus in list(df.index) if subcorpus not in good_years], axis=0)
        return df

    def skip_these_subcorpora(df, lst_of_subcorpora, prinf=True):
        if isinstance(lst_of_subcorpora, int):
            lst_of_subcorpora = [lst_of_subcorpora]
        if isinstance(lst_of_subcorpora[0], int):
            lst_of_subcorpora = [str(l) for l in lst_of_subcorpora]
        bad_years = [subcorpus for subcorpus in list(df.index) if subcorpus in lst_of_subcorpora]
        if len(bad_years) == 0:
            import warnings
            warnings.warn('No subcorpora skipped.\n')
        else:
            if prinf:       
                print('Skipping %d subcorpora:\n    %s' % (len(bad_years), '\n    '.join([str(i) for i in bad_years[:10]])))
                if len(bad_years) > 10:
                    print('... and %d more ... \n' % (len(bad_years) - 10))
                else:
                    print('')
        df = df.drop([subcorpus for subcorpus in list(df.index) if subcorpus in bad_years], axis=0)
        return df

    def span_these_subcorpora(df, lst_of_subcorpora, prinf=True):
        """select only a span of suborpora (first, last)"""

        fir, sec = lst_of_subcorpora
        if len(lst_of_subcorpora) == 0:
            import warnings
            warnings.warn('Span not identified.\n')
        else:        
            if prinf:        
                print('Keeping subcorpora:\n    %d--%d\n' % (int(fir), int(sec)))
        sbs = list(df.index)
        df = df.ix[sbs.index(fir):sbs.index(sec) + 1]

        return df

    def projector(df, list_of_tuples, prinf=True):
        """project abs values"""
        if isinstance(list_of_tuples, list):
            tdict = {}
            for a, b in list_of_tuples:
                tdict[a] = b
            list_of_tuples = tdict
        for subcorpus, projection_value in list(list_of_tuples.items()):
            if isinstance(subcorpus, int):
                subcorpus = str(subcorpus)
            df.loc[subcorpus] = df.loc[subcorpus] * projection_value
            if prinf:
                if isinstance(projection_value, float):
                    print('Projection: %s * %s' % (subcorpus, projection_value))
                elif isinstance(projection_value, int):
                    print('Projection: %s * %d' % (subcorpus, projection_value))
        if prinf:
            print('')
        return df

    def do_stats(df):
        """do linregress and add to df"""
        try: 
            from scipy.stats import linregress
        except ImportError:
            from time import localtime, strftime
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: sort type not available in this version of corpkit.' % thetime)
            return False

        indices = list(df.index)
        first_year = list(df.index)[0]
        try:
            x = [int(y) - int(first_year) for y in indices]
        except ValueError:
            x = list(range(len(indices)))
        
        statfields = ['slope', 'intercept', 'r', 'p', 'stderr']

        stats = []
        if isinstance(df, Series):
            y = list(df.values)
            sl = Series(list(linregress(x, y)), index=statfields)

        else:    
            for entry in list(df.columns):
                y = list(df[entry])
                stats.append(list(linregress(x, y)))
            sl = DataFrame(list(zip(*stats)), index=statfields, columns=list(df.columns))
        df = df.append(sl)
        
        # drop infinites and nans
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.fillna(0.0)
        return df
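    # shape of the result (illustrative; the values depend on the data):
    # do_stats appends five rows ('slope', 'intercept', 'r', 'p', 'stderr'),
    # each from a linear regression of a column against subcorpus position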

    def resort(df, sort_by = False, keep_stats = False):
        """sort results, potentially using scipy's linregress"""
        
        # translate options and make sure they are parseable
        stat_field = ['slope', 'intercept', 'r', 'p', 'stderr']
        easy_sorts = ['total', 'infreq', 'name', 'most', 'least']
        stat_sorts = ['increase', 'decrease', 'static', 'turbulent']
        options = stat_field + easy_sorts + stat_sorts
        sort_by_convert = {'most': 'total', True: 'total', 'least': 'infreq'}
        sort_by = sort_by_convert.get(sort_by, sort_by)

        # probably broken :(
        if just_totals:
            if sort_by == 'name':
                return df.sort_index()
            else:
                return df.sort_values(by='Combined total', ascending=sort_by != 'total')

        stats_done = False
        if keep_stats or sort_by in stat_field + stat_sorts:
            df = do_stats(df)
            stats_done = True
            if isinstance(df, bool):
                if df is False:
                    return False
        
        if isinstance(df, Series):
            if stats_done:
                stats = df.iloc[-5:]
                df = df.drop(list(stats.index))
            if sort_by == 'name':
                df = df.sort_index()
            else:
                df = df.sort_values(ascending=sort_by != 'total')
            if stats_done:
                df = df.append(stats)
            return df

        if sort_by == 'name':
            # currently case sensitive
            df = df.reindex(sorted(df.columns), axis=1)
        elif sort_by in ['total', 'infreq']:
            if df1_istotals:
                df = df.T
            df = df[list(df.sum().sort_values(ascending=sort_by != 'total').index)]
        
        # sort by slope etc., or search by subcorpus name
        if sort_by in stat_field or sort_by not in options:
            asc = kwargs.get('reverse', False)
            df = df.T.sort_values(by=sort_by, ascending=asc).T
        
        if sort_by in ['increase', 'decrease', 'static', 'turbulent']:
            slopes = df.loc['slope']
            if sort_by == 'increase':
                df = df.iloc[:, slopes.argsort()[::-1]]
            elif sort_by == 'decrease':
                df = df.iloc[:, slopes.argsort()]
            elif sort_by == 'static':
                df = df.iloc[:, slopes.abs().argsort()]
            elif sort_by == 'turbulent':
                df = df.iloc[:, slopes.abs().argsort()[::-1]]
            if remove_above_p:
                df = df.T
                df = df[df['p'] <= p]
                df = df.T

        # remove stats field by default
        if not keep_stats:
            df = df.drop(stat_field, axis=0, errors='ignore')
        return df

    def set_threshold(big_list, threshold, prinf=True):
        if isinstance(threshold, STRINGTYPE):
            # 'low', 'medium' or 'high'; unrecognised strings fall back to 'low'
            denominator = 10000
            if threshold.startswith('m'):
                denominator = 5000
            elif threshold.startswith('h'):
                denominator = 2500
            if isinstance(big_list, DataFrame):
                tot = big_list.sum().sum()

            if isinstance(big_list, Series):
                tot = big_list.sum()
            tshld = float(tot) / float(denominator)
        else:
            tshld = threshold
        if prinf:
            print('Threshold: %d\n' % tshld)
        return tshld
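    # a hedged example (numbers assumed, not from the source): for a table
    # whose cells sum to 10000,
    #   set_threshold(df, 'high')   -> 10000 / 2500 = 4.0
    #   set_threshold(df, 'medium') -> 10000 / 5000 = 2.0
    # a numeric threshold is passed through unchanged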

    # copy dataframe to be very safe
    df = dataframe1.copy()
    # make cols into strings
    try:
        df.columns = [str(c) for c in list(df.columns)]
    except:
        pass

    if operation is None:
        operation = 'None'

    if isinstance(interrogation, Concordance):
        return_conc = True
    # do concordance work
    if return_conc:
        if just_entries:
            if isinstance(just_entries, int):
                just_entries = [just_entries]
            if isinstance(just_entries, STRINGTYPE):
                df = df[df['m'].str.contains(just_entries)]
            if isinstance(just_entries, list):
                if all(isinstance(e, STRINGTYPE) for e in just_entries):
                    mp = df['m'].map(lambda x: x in just_entries)
                    df = df[mp]
                else:
                    df = df.iloc[just_entries]

        if skip_entries:
            if isinstance(skip_entries, int):
                skip_entries = [skip_entries]
            if isinstance(skip_entries, STRINGTYPE):
                df = df[~df['m'].str.contains(skip_entries)]
            if isinstance(skip_entries, list):
                if all(isinstance(e, STRINGTYPE) for e in skip_entries):
                    mp = df['m'].map(lambda x: x not in skip_entries)
                    df = df[mp]
                else:
                    df = df.drop(skip_entries, axis=0)

        if just_subcorpora:
            if isinstance(just_subcorpora, int):
                just_subcorpora = [just_subcorpora]
            if isinstance(just_subcorpora, STRINGTYPE):
                df = df[df['c'].str.contains(just_subcorpora)]
            if isinstance(just_subcorpora, list):
                if all(isinstance(e, STRINGTYPE) for e in just_subcorpora):
                    mp = df['c'].map(lambda x: x in just_subcorpora)
                    df = df[mp]
                else:
                    df = df.iloc[just_subcorpora]

        if skip_subcorpora:
            if isinstance(skip_subcorpora, int):
                skip_subcorpora = [skip_subcorpora]
            if isinstance(skip_subcorpora, STRINGTYPE):
                df = df[~df['c'].str.contains(skip_subcorpora)]
            if isinstance(skip_subcorpora, list):
                if all(isinstance(e, STRINGTYPE) for e in skip_subcorpora):
                    mp = df['c'].map(lambda x: x not in skip_subcorpora)
                    df = df[mp]
                else:
                    df = df.drop(skip_subcorpora, axis=0)

        return Concordance(df)

    if print_info:
        print('\n***Processing results***\n========================\n')

    df1_istotals = False
    if isinstance(df, Series):
        df1_istotals = True
        df = DataFrame(df)
        # if just a single result
    else:
        df = DataFrame(df)
    if operation.startswith('k'):
        if sort_by is False:
            if not df1_istotals:
                sort_by = 'turbulent'
        if df1_istotals:
            df = df.T
    
    # figure out if there's a second list
    # copy and remove totals if there is
    single_totals = True
    using_totals = False
    outputmode = False

    if denominator.__class__ == Interrogation:
        try:
            denominator = denominator.results
        except AttributeError:
            denominator = denominator.totals

    if denominator is not False and not isinstance(denominator, STRINGTYPE):
        df2 = denominator.copy()
        using_totals = True
        if isinstance(df2, DataFrame):
            if len(df2.columns) > 1:
                single_totals = False
            else:
                df2 = df2.iloc[:, 0]
        elif isinstance(df2, Series):
            single_totals = True
            #if operation == 'k':
                #raise ValueError('Keywording requires a DataFrame for denominator. Use "self"?')
    else:
        if operation in ['k', 'a', '%', '/', '*', '-', '+']:
            denominator = 'self'         
        if denominator == 'self':
            outputmode = True

    if operation.startswith('a') or operation.startswith('A'):
        if list(df.columns)[0] != '0' and list(df.columns)[0] != 0:
            df = df.T
        if using_totals:
            if not single_totals:
                df2 = df2.T

    if projection:
        # projection shouldn't do anything when working with '%', remember.
        df = projector(df, projection)
        if using_totals:
            df2 = projector(df2, projection)

    if spelling:
        df = convert_spell(df, convert_to=spelling)
        df = merge_duplicates(df, print_info=False)

        if not single_totals:
            df2 = convert_spell(df2, convert_to=spelling, print_info=False)
            df2 = merge_duplicates(df2, print_info=False)
        if not df1_istotals:
            sort_by = 'total'

    if replace_names:
        df = name_replacer(df, replace_names)
        df = merge_duplicates(df)
        if not single_totals:
            df2 = name_replacer(df2, replace_names, print_info=False)
            df2 = merge_duplicates(df2, print_info=False)
        if not sort_by:
            sort_by = 'total'

    if replace_subcorpus_names:
        df = name_replacer(df.T, replace_subcorpus_names)
        df = merge_duplicates(df).T
        df = df.sort_index()
        if not single_totals:
            if isinstance(df2, DataFrame):
                df2 = df2.T
            df2 = name_replacer(df2, replace_subcorpus_names, print_info=False)
            df2 = merge_duplicates(df2, print_info=False)
            if isinstance(df2, DataFrame):
                df2 = df2.T
            df2 = df2.sort_index()
        if not sort_by:
            sort_by = 'total'

    # remove old stats if they're there:
    statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
    try:
        df = df.drop(statfields, axis=0)
    except:
        pass
    if using_totals:
        try:
            df2 = df2.drop(statfields, axis=0)
        except:
            pass

    # remove totals and tkinter order
    for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]):
        if name == 'Total' and df1_istotals:
            continue
        try:
            df = df.drop(name, axis=ax, errors='ignore')
        except:
            pass
    for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]):
        if name == 'Total' and single_totals:
            continue

        try:
            df2 = df2.drop(name, axis=ax, errors='ignore')
        except:
            pass

    # merging: make dicts if they aren't already, so we can iterate
    if merge_entries:
        if not isinstance(merge_entries, list):
            if isinstance(merge_entries, STRINGTYPE):
                merge_entries = {'combine': merge_entries}
            # for newname, criteria    
            for name, the_input in sorted(merge_entries.items()):
                pin = parse_input(df, the_input)
                the_newname = newname_getter(df, pin, newname=name, prinf=print_info)
                df = merge_these_entries(df, pin, the_newname, prinf=print_info)
                if not single_totals:
                    pin2 = parse_input(df2, the_input)
                    df2 = merge_these_entries(df2, pin2, the_newname, prinf=False)
        else:
            pin = parse_input(df, merge_entries)
            the_newname = newname_getter(df, pin, prinf=print_info)
            df = merge_these_entries(df, pin, the_newname, prinf=print_info)
            if not single_totals:
                pin2 = parse_input(df2, merge_entries)
                df2 = merge_these_entries(df2, pin2, the_newname, prinf=False)
    
    if merge_subcorpora:
        if not isinstance(merge_subcorpora, dict):
            if isinstance(merge_subcorpora, list):
                if isinstance(merge_subcorpora[0], tuple):
                    merge_subcorpora = {x: y for x, y in merge_subcorpora}
                elif isinstance(merge_subcorpora[0], STRINGTYPE):
                    merge_subcorpora = {'combine': [x for x in merge_subcorpora]}
                elif isinstance(merge_subcorpora[0], int):
                    merge_subcorpora = {'combine': [str(x) for x in merge_subcorpora]}
            else:
                merge_subcorpora = {'combine': merge_subcorpora}
        for name, the_input in sorted(merge_subcorpora.items()):
            pin = parse_input(df.T, the_input)
            the_newname = newname_getter(df.T, pin, newname=name, \
                merging_subcorpora=True, prinf=print_info)
            df = merge_these_entries(df.T, pin, the_newname, merging='subcorpora', 
                                     prinf=print_info).T
            if using_totals:
                pin2 = parse_input(df2.T, the_input)
                df2 = merge_these_entries(df2.T, pin2, the_newname, merging='subcorpora', 
                                          prinf=False).T

    if just_subcorpora:
        df = just_these_subcorpora(df, just_subcorpora, prinf=print_info)
        if using_totals:
            df2 = just_these_subcorpora(df2, just_subcorpora, prinf=False)
    
    if skip_subcorpora:
        df = skip_these_subcorpora(df, skip_subcorpora, prinf=print_info)
        if using_totals:
            df2 = skip_these_subcorpora(df2, skip_subcorpora, prinf=False)
    
    if span_subcorpora:
        df = span_these_subcorpora(df, span_subcorpora, prinf=print_info)
        if using_totals:
            df2 = span_these_subcorpora(df2, span_subcorpora, prinf=False)

    if just_entries:
        df = just_these_entries(df, parse_input(df, just_entries), prinf=print_info)
        if not single_totals:
            df2 = just_these_entries(df2, parse_input(df2, just_entries), prinf=False)
    
    if skip_entries:
        df = skip_these_entries(df, parse_input(df, skip_entries), prinf=print_info)
        if not single_totals:
            df2 = skip_these_entries(df2, parse_input(df2, skip_entries), prinf=False)

    # drop infinites and nans
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0.0)

    if just_totals:
        df = DataFrame(df.sum(), columns=['Combined total'])
        if using_totals:
            if not single_totals:
                df2 = DataFrame(df2.sum(), columns=['Combined total'])
            else:
                df2 = df2.sum()

    tots = df.sum(axis=1)

    if using_totals or outputmode:
        if not operation.startswith('k'):
            tshld = 0
            # set a threshold if just_totals
            if outputmode is True:
                df2 = df.T.sum()
                if not just_totals:
                    df2.name = 'Total'
                else:
                    df2.name = 'Combined total'
                using_totals = True
                single_totals = True
            if just_totals:
                if not single_totals:
                    tshld = set_threshold(df2, threshold, prinf=print_info)
            df, tots = combiney(df, df2, operation=operation, threshold=tshld, prinf=print_info)
    
    # if doing keywording...
    if operation.startswith('k'):

        if isinstance(denominator, STRINGTYPE):
            if denominator == 'self':
                df2 = df.copy()
            else:
                df2 = denominator

        from corpkit.keys import keywords
        df = keywords(df, df2, 
                      selfdrop=selfdrop, 
                      threshold=threshold, 
                      print_info=print_info,
                      editing=True,
                      calc_all=calc_all,
                      sort_by=sort_by,
                      measure=keyword_measure,
                      **kwargs)
    
    # drop infinites and nans
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0.0)

    # resort data
    if sort_by or keep_stats:
        df = resort(df, keep_stats=keep_stats, sort_by=sort_by)
        if isinstance(df, bool):
            if df is False:
                return 'linregress'

    if keep_top:
        if not just_totals:
            df = df[list(df.columns)[:keep_top]]
        else:
            df = df.head(keep_top)

    if just_totals:
        # turn just_totals into series:
        df = Series(df['Combined total'], name='Combined total')

    if df1_istotals:
        if operation.startswith('k'):
            try:
                df = Series(df.loc[dataframe1.name])
                df.name = '%s: keyness' % df.name
            except:
                df = df.iloc[0, :]
                df.name = '%s: keyness' % df.name

    # generate totals branch if not percentage results:
    # fix me
    if df1_istotals or operation.startswith('k'):
        if not just_totals:
            try:
                total = Series(df['Total'], name='Total')
            except:
                total = 'none'

            #total = df.copy()
        else:
            total = 'none'
    else:
        # might be wrong if using division or something...
        try:
            total = df.T.sum(axis=1)
        except:
            total = 'none'
    
    if not isinstance(tots, DataFrame) and not isinstance(tots, Series):
        total = df.sum(axis=1)
    else:
        total = tots

    if isinstance(df, DataFrame):
        datatype = df.iloc[0].dtype
    else:
        datatype = df.dtype
    locs['datatype'] = datatype

    # TURN INT COL NAMES INTO STR
    try:
        df.columns = [str(d) for d in list(df.columns)]
    except:
        pass

    def add_tkt_index(df):
        """add an order for tkintertable if using gui"""
        if isinstance(df, DataFrame):
            df = df.T
            df = df.drop('tkintertable-order', errors='ignore', axis=0)
            df = df.drop('tkintertable-order', errors='ignore', axis=1)
            dat = [i for i in range(len(df.index))]
            df['tkintertable-order'] = Series(dat, index=list(df.index))
            df = df.T
        return df

    # while tkintertable can't sort rows
    if checkstack('tkinter'):
        df = add_tkt_index(df)

    if kwargs.get('df1_always_df'):
        if isinstance(df, Series):
            df = DataFrame(df)

    # delete non-appearing conc lines
    if not hasattr(interrogation, 'concordance'):
        lns = None
    elif hasattr(interrogation, 'concordance') and interrogation.concordance is None:
        lns = None
    else:
        col_crit = interrogation.concordance['m'].map(lambda x: x in list(df.columns))
        ind_crit = interrogation.concordance['c'].map(lambda x: x in list(df.index))
        lns = interrogation.concordance[col_crit]
        lns = lns.loc[ind_crit]
        lns = Concordance(lns)
    
    output = Interrogation(results=df, totals=total, query=locs, concordance=lns)

    if print_info:
        print('***Done!***\n========================\n')

    return output
summ = s1.describe()
summ["mean"]
########################################################

#### unique and nunique
## unique() returns the unique elements of a Series.
## nunique() returns the number of unique values in a Series.
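## a minimal sketch of both methods (the Series below is assumed for
## illustration, not part of the original examples):
s0 = Series([1.0, 1.0, 2.0, 3.0], index=["a", "b", "c", "d"])
s0.unique()     # array([1., 2., 3.])
s0.nunique()    # 3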

#### drop and dropna
## drop(labels) drops elements with the selected labels from a Series.
s1 = Series(arange(1.0,6),index=["a","a","b","c","d"])
s1
s1.drop("a")
################

## dropna() is similar to drop() except that it only drops null values – NaN or similar.
s1 = Series(arange(1.0,4.0),index=["a","b","c"])
s2 = Series(arange(1.0,4.0),index=["c","d","e"])
s3 = s1 + s2
s3
s3.dropna()

##############################################################################

#### fillna
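## a minimal sketch of fillna, reusing the s3 Series from the dropna example
## above (the fill value -1.0 is chosen only for illustration):
s3.fillna(-1.0)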
Example #52
0
def interrogator(corpus, 
            search, 
            query = 'any', 
            show = 'w',
            exclude = False,
            excludemode = 'any',
            searchmode = 'all',
            dep_type = 'collapsed-ccprocessed-dependencies',
            case_sensitive = False,
            quicksave = False,
            just_speakers = False,
            preserve_case = False,
            lemmatag = False,
            files_as_subcorpora = False,
            only_unique = False,
            random = False,
            only_format_match = False,
            multiprocess = False,
            spelling = False,
            regex_nonword_filter = r'[A-Za-z0-9:_]',
            gramsize = 2,
            split_contractions = False,
            do_concordancing = False,
            maxconc = 9999,
            **kwargs):
    """interrogate corpus, corpora, subcorpus and file objects

    see corpkit.interrogation.interrogate() for docstring"""

    only_conc = False
    no_conc = False
    if do_concordancing is False:
        no_conc = True
    if type(do_concordancing) == str and do_concordancing.lower() == 'only':
        only_conc = True
        no_conc = False

    # iteratively count conc lines
    numconc = 0

    # store kwargs
    locs = locals()
    
    if kwargs:
        for k, v in kwargs.items():
            locs[k] = v
        locs.pop('kwargs', None)

    import corpkit
    from interrogation import Interrogation
    from process import tregex_engine
    import pandas as pd
    from pandas import DataFrame, Series
    from collections import Counter
    from other import as_regex
    from process import get_deps
    from time import localtime, strftime
    from textprogressbar import TextProgressBar
    from process import animator
    from dictionaries.word_transforms import wordlist, taglemma
    import corenlp_xml
    import codecs
    import signal

    original_sigint = signal.getsignal(signal.SIGINT)

    if kwargs.get('paralleling', None) is None:
        original_sigint = signal.getsignal(signal.SIGINT)
        
        def signal_handler(signal, frame):
            """pause on ctrl+c, rather than just stop loop"""   
            import signal
            import sys
            from time import localtime, strftime
            signal.signal(signal.SIGINT, original_sigint)
            thetime = strftime("%H:%M:%S", localtime())
            try:
                sel = raw_input('\n\n%s: Paused. Press any key to resume, or ctrl+c to quit.\n' % thetime)
            except NameError:
                sel = input('\n\n%s: Paused. Press any key to resume, or ctrl+c to quit.\n' % thetime)
            time = strftime("%H:%M:%S", localtime())
            print('%s: Interrogation resumed.\n' % time)
            signal.signal(signal.SIGINT, signal_handler)

        signal.signal(signal.SIGINT, signal_handler)

    # find out if using gui
    root = kwargs.get('root')
    note = kwargs.get('note')

    # convert path to corpus object
    if type(corpus) == str:
        from corpus import Corpus
        corpus = Corpus(corpus)

    # figure out how the user has entered the query and normalise
    from process import searchfixer
    search, search_iterable = searchfixer(search, query)
    
    # for better printing of query, esp during multiprocess
    # can remove if multiprocess printing improved
    if len(list(search.keys())) == 1:
        query = list(search.values())[0]

    if 'l' in show and search.get('t'):
        from nltk.stem.wordnet import WordNetLemmatizer
        lmtzr=WordNetLemmatizer()

    if type(show) == str:
        show = [show]

    def is_multiquery(corpus, search, query, just_speakers):
        """determine if multiprocessing is needed
        do some retyping if need be as well"""
        im = False
        from collections import OrderedDict
        if hasattr(corpus, '__iter__'):
            im = True
        # so we can do search = 't', query = ['NP', 'VP']:
        if type(query) == list:
            if query != list(search.values())[0] or len(list(search.keys())) > 1:
                query = {c.title(): c for c in query}
        if type(query) == dict or type(query) == OrderedDict:
            im = True
        if just_speakers:
            if just_speakers == 'each':
                im = True
                just_speakers = ['each']
            if just_speakers == ['each']:
                im = True
            if type(just_speakers) == str:
                im = False
                just_speakers = [just_speakers]
            if type(just_speakers) == list:
                if len(just_speakers) > 1:
                    im = True
        if type(search) == dict:
            if all(type(i) == dict for i in list(search.values())):
                im = True
        return im, corpus, search, query, just_speakers

    def slow_tregex(sents, **dummy_args):
        """do the speaker-specific version of tregex queries"""
        speakr = dummy_args.get('speaker', False)
        import os
        from process import tregex_engine
        # first, put the relevant trees into temp file
        if kwargs.get('outname'):
            to_open = 'tmp-%s.txt' % kwargs['outname']
        else:
            to_open = 'tmp.txt'
        to_write = '\n'.join([sent.parse_string.strip() for sent in sents \
                              if sent.parse_string is not None])
        with codecs.open(to_open, "w", encoding='utf-8') as fo:
            fo.write(to_write + '\n')
        q = list(search.values())[0]
        ops = ['-o', '-%s' % translated_option]
        concs = []
        res = tregex_engine(query = q, 
                            options = ops, 
                            corpus = to_open,
                            root = root,
                            preserve_case = True)
        if not no_conc:
            ops += ['-w', '-f']
            whole_res = tregex_engine(query = q, 
                            options = ops, 
                            corpus = to_open,
                            root = root,
                            preserve_case = True) 

            res = format_tregex(res)
            whole_res = format_tregex(whole_res, whole = True)
            concs = make_conc_lines_from_whole_mid(whole_res, res, speakr)

        if root:
            root.update()
        try:
            os.remove(to_open)
        except OSError:
            pass
        if countmode:
            return(len(res))
        else:
            return res, concs

    def get_stats(sents, **dummy_args):
        """get a bunch of frequencies on interpersonal phenomena"""
        import os
        import re
        from collections import Counter
        statsmode_results = Counter()  
        # first, put the relevant trees into temp file
        if kwargs.get('outname'):
            to_open = 'tmp-%s.txt' % kwargs['outname']
        else:
            to_open = 'tmp.txt'
        with codecs.open(to_open, "w", encoding='utf-8') as fo:
            for sent in sents:
                statsmode_results['Sentences'] += 1
                sts = sent.parse_string.rstrip()
                fo.write(sts + '\n')
                deps = get_deps(sent, dep_type)
                numpass = len([x for x in deps.links if x.type.endswith('pass')])
                statsmode_results['Passives'] += numpass
                statsmode_results['Tokens'] += len(sent.tokens)
                words = [w.word for w in sent.tokens if w.word.isalnum()]
                statsmode_results['Words'] += len(words)
                statsmode_results['Characters'] += len(''.join(words))

        # count moods via trees          (/\?/ !< __)
        from dictionaries.process_types import processes
        from other import as_regex
        tregex_qs = {'Imperative': r'ROOT < (/(S|SBAR)/ < (VP !< VBD !< VBG !$ NP !$ SBAR < NP !$-- S !$-- VP !$ VP)) !<< (/\?/ !< __) !<<- /-R.B-/ !<<, /(?i)^(-l.b-|hi|hey|hello|oh|wow|thank|thankyou|thanks|welcome)$/',
                     'Open interrogative': r'ROOT < SBARQ <<- (/\?/ !< __)', 
                     'Closed interrogative': r'ROOT ( < (SQ < (NP $+ VP)) << (/\?/ !< __) | < (/(S|SBAR)/ < (VP $+ NP)) <<- (/\?/ !< __))',
                     'Unmodalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP !< MD)))',
                     'Modalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP < MD)))',
                     'Open class words': r'/^(NN|JJ|VB|RB)/ < __',
                     'Closed class words': r'__ !< __ !> /^(NN|JJ|VB|RB)/',
                     'Clauses': r'/^S/ < __',
                     'Interrogative': r'ROOT << (/\?/ !< __)',
                     'Mental processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.mental, boundaries = 'w'),
                     'Verbal processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.verbal, boundaries = 'w'),
                     'Relational processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.relational, boundaries = 'w')
                     }

        for name, q in sorted(tregex_qs.items()):
            res = tregex_engine(query = q, 
                  options = ['-o', '-C'], 
                  corpus = to_open,  
                  root = root)
            statsmode_results[name] += int(res)
            global numdone
            numdone += 1
            if root:
                root.update()
            else:
                tot_string = str(numdone + 1) + '/' + str(total_files)
                if kwargs.get('outname'):
                    tot_string = '%s: %s' % (kwargs['outname'], tot_string)
                animator(p, numdone, tot_string, **par_args)
            if kwargs.get('note', False):
                kwargs['note'].progvar.set((numdone * 100.0 / total_files / denom) + startnum)
        os.remove(to_open)
        return statsmode_results, []

    def make_conc_lines_from_whole_mid(wholes, middle_column_result, 
                                       speakr = False):
        import re, os
        if speakr is False:
            speakr = ''
        conc_lines = []
        # remove duplicates from results
        unique_wholes = []
        unique_middle_column_result = []
        duplicates = []
        for index, ((f, whole), mid) in enumerate(zip(wholes, middle_column_result)):
            if '-join-'.join([f, whole, mid]) not in duplicates:
                duplicates.append('-join-'.join([f, whole, mid]))
                unique_wholes.append([f, whole])
                unique_middle_column_result.append(mid)

        # split into start, middle and end, dealing with multiple occurrences
        for index, ((f, whole), mid) in enumerate(zip(unique_wholes, unique_middle_column_result)):
            reg = re.compile(r'([^a-zA-Z0-9-]|^)(' + re.escape(mid) + r')([^a-zA-Z0-9-]|$)', re.IGNORECASE | re.UNICODE)
            offsets = [(m.start(), m.end()) for m in re.finditer(reg,whole)]
            for offstart, offend in offsets:              
                start, middle, end = whole[0:offstart].strip(), whole[offstart:offend].strip(), whole[offend:].strip()
                conc_lines.append([os.path.basename(f), speakr, start, middle, end])
        return conc_lines
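    # each concordance line produced above has the shape (illustrative note,
    # not from the source): [filename, speaker, left context, match, right context]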

    def uniquify(conc_lines):
        from collections import OrderedDict
        unique_lines = []
        checking = []
        for index, (f, speakr, start, middle, end) in enumerate(conc_lines):
            joined = ' '.join([speakr, start, 'MIDDLEHERE:', middle, ':MIDDLEHERE', end])
            if joined not in checking:
                unique_lines.append(conc_lines[index])
            checking.append(joined)
        return unique_lines

    def lemmatiser(list_of_words, tag):
        """take a list of unicode words and a tag and return a lemmatised list."""
        output = []
        for word in list_of_words:
            if translated_option.startswith('u'):
                if word.lower() in list(taglemma.keys()):
                    word = taglemma[word.lower()]
                else:
                    if word == 'x':
                        word = 'Other'
            # only use wordnet lemmatiser when appropriate
            else:
                if word in wordlist:
                    word = wordlist[word]
                word = lmtzr.lemmatize(word, tag)
            output.append(word)
        return output
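    # a hedged example, assuming the default (non-'u') path, a word not in
    # wordlist, and NLTK's WordNetLemmatizer (input invented):
    # lemmatiser(['running'], 'v') -> ['run']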

    def gettag(query, lemmatag = False):
        """
        Find tag for WordNet lemmatisation
        """
        import re

        tagdict = {'N': 'n',
                   'A': 'a',
                   'V': 'v',
                   'R': 'r',
                   'None': False,
                   '': False,
                   'Off': False}

        if lemmatag is False:
            tag = 'n' # same default as wordnet
            # attempt to find tag from tregex query
            tagfinder = re.compile(r'^[^A-Za-z]*([A-Za-z]*)')
            tagchecker = re.compile(r'^[A-Z]{1,4}$')
            qr = query.replace(r'\w', '').replace(r'\s', '').replace(r'\b', '')
            treebank_tag = re.findall(tagfinder, qr)
            if treebank_tag and re.match(tagchecker, treebank_tag[0]):
                tag = tagdict.get(treebank_tag[0], 'n')
        elif lemmatag:
            tag = lemmatag
        return tag
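    # a rough illustration (queries assumed, not from the source):
    # gettag(r'/V.*/ < __') -> 'v', because the first tag-like match is 'V';
    # anything not in tagdict falls back to the WordNet default 'n'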

    def format_tregex(results, whole = False):
        """format tregex by show list"""
        if countmode:
            return results
        import re
        done = []
        
        if whole:
            fnames = [x for x, y in results]
            results = [y for x, y in results]

        if 'l' in show or 'pl' in show:
            lemmata = lemmatiser(results, gettag(search.get('t'), lemmatag))
        else:
            lemmata = [None for i in results]
        for word, lemma in zip(results, lemmata):
            bits = []
            if exclude and (len(list(exclude.keys())) == 1 or excludemode == 'any'):
                if exclude.get('w') and re.search(exclude.get('w'), word):
                    continue
                if exclude.get('l') and lemma and re.search(exclude.get('l'), lemma):
                    continue
                if exclude.get('p') and re.search(exclude.get('p'), word):
                    continue
                if exclude.get('pl') and lemma and re.search(exclude.get('pl'), lemma):
                    continue
            if exclude and excludemode == 'all':
                num_to_cause_exclude = len(list(exclude.keys()))
                current_num = 0
                if exclude.get('w'):
                    if re.search(exclude.get('w'), word):
                        current_num += 1
                if exclude.get('l'):
                    if re.search(exclude.get('l'), lemma):
                        current_num += 1
                if exclude.get('p'):
                    if re.search(exclude.get('p'), word):
                        current_num += 1
                if exclude.get('pl'):
                    if re.search(exclude.get('pl'), lemma):
                        current_num += 1   
                if current_num == num_to_cause_exclude:
                    continue                 

            for i in show:
                if i == 't':
                    bits.append(word)
                if i == 'l':
                    bits.append(lemma)
                elif i == 'w':
                    bits.append(word)
                elif i == 'p':
                    bits.append(word)
                elif i == 'pl':
                    bits.append(lemma)
            joined = '/'.join(bits)
            done.append(joined)

        if whole:
            done = zip(fnames, done)

        return done

    def tok_by_list(pattern, list_of_toks, concordancing = False, **kwargs):
        """search for regex in plaintext corpora"""
        import re
        if type(pattern) == str:
            pattern = [pattern]
        if not case_sensitive:
            pattern = [p.lower() for p in pattern]
        if not concordancing:
            if case_sensitive:
                matches = [m for m in list_of_toks if m in pattern]
            else:
                matches = [m for m in list_of_toks if m.lower() in pattern]
        else:
            matches = []
            for index, token in enumerate(list_of_toks):
                if token in pattern:
                    match = [' '.join([t for t in unsplitter(list_of_toks[:index])])[-140:]]
                    match.append(token)
                    match.append(' '.join([t for t in unsplitter(list_of_toks[index + 1:])])[:140])
                    matches.append(match)
        if countmode:
            return(len(matches))
        else:
            return matches

    def unsplitter(lst):
        """unsplit contractions and apostophes from tokenised text"""
        if split_contractions:
            return lst
        unsplit = []
        for index, t in enumerate(lst):
            if index == 0 or index == len(lst) - 1:
                unsplit.append(t)
                continue
            if "'" in t and not t.endswith("'"):
                rejoined = ''.join([lst[index - 1], t])
                unsplit.append(rejoined)
            else:
                if not "'" in lst[index + 1]:
                    unsplit.append(t)
        return unsplit
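    # a hedged sketch of the behaviour (input assumed, not from the source):
    # unsplitter(['I', 'do', "n't", 'know', '.']) -> ['I', "don't", 'know', '.']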

    def tok_ngrams(pattern, list_of_toks, concordancing = False, split_contractions = True):
        from collections import Counter
        import re
        ngrams = Counter()
        result = []
        # keep only word-like tokens
        list_of_toks = [x for x in list_of_toks if re.search(regex_nonword_filter, x)]
        # if the pattern is a plain string, 'any' matches everything
        if type(pattern) == str and pattern.lower() == 'any':
            pattern = r'.*'

        if not split_contractions:
            list_of_toks = unsplitter(list_of_toks)
            
            #list_of_toks = [x for x in list_of_toks if "'" not in x]
        for index, w in enumerate(list_of_toks):
            try:
                the_gram = [list_of_toks[index+x] for x in range(gramsize)]
                if not any(re.search(pattern, x) for x in the_gram):
                    continue
                ngrams[' '.join(the_gram)] += 1
            except IndexError:
                pass

        # turn counter into list of results
        for k, v in list(ngrams.items()):
            if v > 1:
                for i in range(v):
                    result.append(k)
        if countmode:
            return(len(result))
        else:
            return result
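    # a hedged example, assuming gramsize=2 and countmode=False (input invented):
    # tok_ngrams('any', ['a', 'b', 'a', 'b']) -> ['a b', 'a b']
    # ('a b' occurs twice, so it is kept; 'b a' occurs once and is dropped)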

    def compiler(pattern):
        """compile regex or fail gracefully"""
        import re
        try:
            if case_sensitive:
                comped = re.compile(pattern)
            else:
                comped = re.compile(pattern, re.IGNORECASE)
            return comped
        except:
            import traceback
            import sys
            from time import localtime, strftime
            exc_type, exc_value, exc_traceback = sys.exc_info()
            lst = traceback.format_exception(exc_type, exc_value,
                          exc_traceback)
            error_message = lst[-1]
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: Query %s' % (thetime, error_message))
            if root:
                return 'Bad query'
            else:
                raise ValueError('%s: Query %s' % (thetime, error_message))

    def tok_by_reg(pattern, list_of_toks, concordancing = False, **kwargs):
        """search for regex in plaintext corpora"""
        import re
        comped = compiler(pattern)
        if comped == 'Bad query':
            return 'Bad query'
        if not concordancing:
            matches = [m for m in list_of_toks if re.search(comped, m)]
        else:
            matches = []
            for index, token in enumerate(list_of_toks):
                if re.search(comped, token):
                    match = [' '.join([t for t in unsplitter(list_of_toks[:index])])[-140:]]
                    match.append(re.search(comped, token).group(0))
                    match.append(' '.join([t for t in unsplitter(list_of_toks[index + 1:])])[:140])
                    matches.append(match)
        if countmode:
            return(len(matches))
        else:
            return matches

    def plaintext_regex_search(pattern, plaintext_data, concordancing = False, **kwargs):
        """search for regex in plaintext corpora

        the pattern is matched within lines ('.' does not cross a line break),
        so the user needs to be careful with multiline patterns.
        """
        import re
        if concordancing:
            pattern = r'(.{,140})\b(' + pattern + r')\b(.{,140})'
        compiled_pattern = compiler(pattern)
        if compiled_pattern == 'Bad query':
            return 'Bad query'
        matches = re.findall(compiled_pattern, plaintext_data)
        if concordancing:
            matches = [list(m) for m in matches]
        if not concordancing:
            for index, i in enumerate(matches):
                if type(i) == tuple:
                    matches[index] = i[0]
        if countmode:
            return(len(matches))
        else:
            return matches

    def correct_spelling(a_string):
        if not spelling:
            return a_string
        from dictionaries.word_transforms import usa_convert
        if spelling.lower() == 'uk':
            usa_convert = {v: k for k, v in list(usa_convert.items())}
        spell_out = []
        bits = a_string.split('/')
        for index, i in enumerate(bits):
            converted = usa_convert.get(i.lower(), i)
            if i.islower() or preserve_case is False:
                converted = converted.lower()
            elif i.isupper() and preserve_case:
                converted = converted.upper()
            elif i.istitle() and preserve_case:
                converted = converted.title()
            bits[index] = converted
        r = '/'.join(bits)
        return r
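    # a hedged example, assuming spelling='UK', preserve_case=True, and that
    # usa_convert maps 'colour' -> 'color' (mapping contents assumed):
    # correct_spelling('color/NN') -> 'colour/NN'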

    def plaintext_simple_search(pattern, plaintext_data, concordancing = False, **kwargs):
        """search for tokens in plaintext corpora"""
        import re
        result = []
        if type(pattern) == str:
            pattern = [pattern]
        for p in pattern:
            if concordancing:
                pat = r'(.{0,140})\b(' + re.escape(p) + r')\b(.{0,140})'
            else:
                pat = r'\b' + re.escape(p) + r'\b'
            pat = compiler(pat)
            if pat == 'Bad query':
                return 'Bad query'
            matches = re.findall(pat, plaintext_data)
            if concordancing:
                matches = [list(m) for m in matches]
                for i in matches:
                    result.append(i)
            else:   
                for m in range(len(matches)):
                    result.append(p)
        return result

    # do multiprocessing if need be
    im, corpus, search, query, just_speakers = is_multiquery(corpus, search, query, just_speakers)
    
    locs['search'] = search
    locs['query'] = query
    locs['just_speakers'] = just_speakers
    locs['corpus'] = corpus
    locs['multiprocess'] = multiprocess

    if im:
        signal.signal(signal.SIGINT, original_sigint)
        from multiprocess import pmultiquery
        return pmultiquery(**locs)

    datatype = corpus.datatype
    singlefile = corpus.singlefile

    # store all results in here
    results = {}
    count_results = {}
    conc_results = {}
    # check if just counting
    countmode = 'c' in show
    if countmode:
        no_conc = True
        only_conc = False
    # where we are at in interrogation
    current_iter = 0

    # multiprocessing progress bar
    denom = kwargs.get('denominator', 1)
    startnum = kwargs.get('startnum', 0)

    ############################################
    # Determine the search function to be used #
    ############################################
    
    # simple tregex is tregex over whole dirs
    simple_tregex_mode = False
    statsmode = False
    if not just_speakers and 't' in list(search.keys()):
        simple_tregex_mode = True
    else:
        if corpus.datatype == 'plaintext':
            if search.get('n'):
                raise NotImplementedError('Use a tokenised corpus for n-gramming.')
                #searcher = plaintext_ngram
                #optiontext = 'n-grams via plaintext'
            if search.get('w'):
                if kwargs.get('regex', True):
                    searcher = plaintext_regex_search
                else:
                    searcher = plaintext_simple_search
                optiontext = 'Searching plaintext'

        elif corpus.datatype == 'tokens':
            if search.get('n'):
                searcher = tok_ngrams
                optiontext = 'n-grams via tokens'
            elif search.get('w'):
                if kwargs.get('regex', True):
                    searcher = tok_by_reg
                else:
                    searcher = tok_by_list
                if type(search.get('w')) == list:
                    searcher = tok_by_list
                optiontext = 'Searching tokens'
        only_parse = ['r', 'd', 'g', 'dl', 'gl', 'df', 'gf', 'dp', 'gp', 'f', 'd2', 'd2f', 'd2p', 'd2l']
        if corpus.datatype != 'parse' and any(i in only_parse for i in list(search.keys())):
            raise ValueError('Need parsed corpus to search with "%s" option(s).' % ', '.join([i for i in list(search.keys()) if i in only_parse]))

        elif corpus.datatype == 'parse':
            if search.get('t'):
                searcher = slow_tregex
            elif search.get('s'):
                searcher = get_stats
                statsmode = True
                optiontext = 'General statistics'
                global numdone
                numdone = 0
                no_conc = True
                only_conc = False
                do_concordancing = False
            else:
                from depsearch import dep_searcher
                searcher = dep_searcher
                optiontext = 'Dependency querying'

    ############################################
    #      Set some Tregex-related values      #
    ############################################

    if search.get('t'):
        translated_option = 't'
        query = search.get('t')

        # check the query
        q = tregex_engine(corpus = False, query = search.get('t'), 
                          options = ['-t'], check_query = True, root = root)
        if q is False:
            if root:
                return 'Bad query'
            else:
                return

        optiontext = 'Searching parse trees'
        if 'p' in show or 'pl' in show:
            translated_option = 'u'
            if type(search['t']) == list:
                search['t'] = r'__ < (/%s/ !< __)' % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'__ < (/.?[A-Za-z0-9].?/ !< __)'
        elif 't' in show:
            translated_option = 'o'
            if type(search['t']) == list:
                search['t'] = r'__ < (/%s/ !< __)' % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'__ < (/.?[A-Za-z0-9].?/ !< __)'
        elif 'w' in show:
            translated_option = 't'
            if type(search['t']) == list:
                search['t'] = r'/%s/ !< __' % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'/.?[A-Za-z0-9].?/ !< __'
        elif 'c' in show:
            only_count = True
            translated_option = 'C'
            if type(search['t']) == list:
                search['t'] = r'/%s/ !< __'  % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'/.?[A-Za-z0-9].?/ !< __'
        elif 'l' in show:
            translated_option = 't'
            if type(search['t']) == list:
                search['t'] = r'/%s/ !< __' % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'/.?[A-Za-z0-9].?/ !< __'

        query = search['t']

    ############################################
    # Make iterable for corpus/subcorpus/file  #
    ############################################

    if corpus.singlefile:
        to_iterate_over = {(corpus.name, corpus.path): [corpus]}
    elif not corpus.subcorpora:
        to_iterate_over = {(corpus.name, corpus.path): corpus.files}
    else:
        to_iterate_over = {}
        for subcorpus in corpus.subcorpora:
            to_iterate_over[(subcorpus.name, subcorpus.path)] = subcorpus.files
        #for k, v in sorted(corpus.structure.items(), key=lambda obj: obj[0].name):
        #    to_iterate_over[(k.name, k.path)] = v
    if files_as_subcorpora:
        to_iterate_over = {}
        for f in corpus.files:
            to_iterate_over[(f.name, f.path)] = [f]

    ############################################
    #           Print welcome message          #
    ############################################

    if no_conc:
        message = 'Interrogating'
    else:
        message = 'Interrogating and concordancing'
    if kwargs.get('printstatus', True):
        thetime = strftime("%H:%M:%S", localtime())

        sformat = '\n                 '.join(['%s: %s' % (k.rjust(3), v) for k, v in list(search.items())])
        if search == {'s': r'.*'}:
            sformat = 'features'
        welcome = '\n%s: %s %s ...\n          %s\n          Query: %s\n          %s corpus ... \n' % \
                  (thetime, message, corpus.name, optiontext, sformat, message)
        print(welcome)

    ############################################
    #           Make progress bar              #
    ############################################

    if simple_tregex_mode:
        total_files = len(list(to_iterate_over.keys()))
    else:
        if search.get('s'):
            total_files = sum([len(x) for x in list(to_iterate_over.values())]) * 12
        else:
            total_files = sum([len(x) for x in list(to_iterate_over.values())])

    par_args = {'printstatus': kwargs.get('printstatus', True),
                'root': root, 
                'note': note,
                'length': total_files,
                'startnum': kwargs.get('startnum'),
                'denom': kwargs.get('denominator', 1)}

    term = None
    if kwargs.get('paralleling', None) is not None:
        from blessings import Terminal
        term = Terminal()
        par_args['terminal'] = term
        par_args['linenum'] = kwargs.get('paralleling')

    outn = kwargs.get('outname', '')
    if outn:
        outn = outn + ': '
    tstr = '%s%d/%d' % (outn, current_iter, total_files)
    p = animator(None, None, init = True, tot_string = tstr, **par_args)
    tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
    animator(p, current_iter, tstr, **par_args)

    ############################################
    # Iterate over data, doing interrogations  #
    ############################################

    for (subcorpus_name, subcorpus_path), files in sorted(to_iterate_over.items()):

        conc_results[subcorpus_name] = []
        count_results[subcorpus_name] = []
        results[subcorpus_name] = Counter()
        
        # tregex over subcorpora, not files
        if simple_tregex_mode:

            op = ['-o', '-' + translated_option]                
            result = tregex_engine(query = search['t'], options = op, 
                                   corpus = subcorpus_path, root = root, preserve_case = preserve_case)

            if not countmode:
                result = format_tregex(result)

            if not no_conc:
                op += ['-w', '-f']
                whole_result = tregex_engine(query = search['t'], options = op, 
                                   corpus = subcorpus_path, root = root, preserve_case = preserve_case)
                
                if not only_format_match:
                    whole_result = format_tregex(whole_result, whole = True)

                conc_result = make_conc_lines_from_whole_mid(whole_result, result, speakr = False)

            if countmode:
                count_results[subcorpus_name] += [result]            
            else:
                result = Counter(result)
                results[subcorpus_name] += result
                if not no_conc:
                    for lin in conc_result:
                        if numconc < maxconc or not maxconc:
                            conc_results[subcorpus_name].append(lin)
                        numconc += 1

            current_iter += 1
            tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)

            animator(p, current_iter, tstr, **par_args)

        # dependencies, plaintext, tokens or slow_tregex
        else:
            for f in files:
                slow_treg_speaker_guess = kwargs.get('outname', False)
                if corpus.datatype == 'parse':
                    with open(f.path, 'r') as data:
                        data = data.read()
                        from corenlp_xml.document import Document
                        try:
                            corenlp_xml = Document(data)
                        except:
                            print('Could not read file: %s' % f.path)
                            continue
                        if just_speakers:  
                            sents = [s for s in corenlp_xml.sentences if s.speakername in just_speakers]
                            if len(just_speakers) == 1:
                                slow_treg_speaker_guess = just_speakers[0]
                            if not sents:
                                continue
                        else:
                            sents = corenlp_xml.sentences

                        res, conc_res = searcher(sents, search = search, show = show,
                            dep_type = dep_type,
                            exclude = exclude,
                            excludemode = excludemode,
                            searchmode = searchmode,
                            lemmatise = False,
                            case_sensitive = case_sensitive,
                            do_concordancing = do_concordancing,
                            only_format_match = only_format_match,
                            speaker = slow_treg_speaker_guess)
                        
                        if res == 'Bad query':
                            return 'Bad query'

                elif corpus.datatype == 'tokens':
                    import pickle
                    with open(f.path, 'rb') as fo:
                        data = pickle.load(fo)
                    if not only_conc:
                        res = searcher(list(search.values())[0], data,
                                       split_contractions = split_contractions,
                                       concordancing = False)
                    if not no_conc:
                        conc_res = searcher(list(search.values())[0], data,
                                            split_contractions = split_contractions,
                                            concordancing = True)
                        # empty first field as a placeholder for the speaker
                        for index, line in enumerate(conc_res):
                            line.insert(0, '')

                elif corpus.datatype == 'plaintext':
                    with codecs.open(f.path, 'r', encoding = 'utf-8') as fo:
                        data = fo.read()
                        if not only_conc:
                            res = searcher(list(search.values())[0], data,
                                           concordancing = False)
                        if not no_conc:
                            conc_res = searcher(list(search.values())[0], data,
                                                concordancing = True)
                            # empty first field as a placeholder for the speaker
                            for index, line in enumerate(conc_res):
                                line.insert(0, '')

                if countmode:
                    count_results[subcorpus_name] += [res]
                else:
                    # add filename and do lowercasing for conc
                    if not no_conc:
                        for index, line in enumerate(conc_res):
                            if searcher != slow_tregex:
                                line.insert(0, f.name)
                            else:
                                line[0] = f.name
                            if not preserve_case:
                                line[3:] = [x.lower() for x in line[3:]]
                            if spelling:
                                line = [correct_spelling(b) for b in line]
                            if numconc < maxconc or not maxconc:
                                conc_results[subcorpus_name].append(line)
                                numconc += 1

                    # do lowercasing and spelling
                    if not only_conc:
                        if not preserve_case:
                            if not statsmode:
                                res = [i.lower() for i in res]
                        if spelling:
                            if not statsmode:
                                res = [correct_spelling(r) for r in res]
                        results[subcorpus_name] += Counter(res)

                if not statsmode:
                    current_iter += 1
                    # the label is the same whether or not we are paralleling
                    tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
                    animator(p, current_iter, tstr, **par_args)

    # delete temp file if there
    import os
    if os.path.isfile('tmp.txt'):
        os.remove('tmp.txt')

    ############################################
    #     Get concordances into DataFrame      #
    ############################################

    if not no_conc:
        all_conc_lines = []
        for sc_name, resu in sorted(conc_results.items()):
            if only_unique:
                unique_results = uniquify(resu)
            else:
                unique_results = resu
            #make into series
            # column labels: corpus, file, speaker, left, middle, right
            pindex = 'c f s l m r'.split()
            for fname, spkr, start, word, end in unique_results:
                fname = os.path.basename(fname)
                all_conc_lines.append(Series([sc_name,
                                              fname,
                                              spkr,
                                              start,
                                              word,
                                              end],
                                              index = pindex))

        # randomise results...
        if random:
            from random import shuffle
            shuffle(all_conc_lines)

        conc_df = pd.concat(all_conc_lines, axis = 1).T

        # not doing anything yet --- this is for multimodal concordancing
        add_links = False
        if not add_links:
            conc_df.columns = ['c', 'f', 's', 'l', 'm', 'r']
        else:
            conc_df.columns = ['c', 'f', 's', 'l', 'm', 'r', 'link']

        # drop the speaker column if it is entirely empty
        if (conc_df['s'] == '').all():
            conc_df.drop('s', axis = 1, inplace = True)


        from interrogation import Concordance
        output = Concordance(conc_df)
        if only_conc:
            output.query = locs
            if quicksave:
                output.save()

            if kwargs.get('printstatus', True):
                thetime = strftime("%H:%M:%S", localtime())
                finalstring = '\n\n%s: Concordancing finished! %d results.' % (thetime, len(conc_df))
                print(finalstring)
            return output

    ############################################
    #     Get interrogation into DataFrame     #
    ############################################

    if not only_conc:
        if countmode:
            df = Series({k: sum(v) for k, v in sorted(count_results.items())})
            tot = df.sum()
        else:
            the_big_dict = {}
            unique_results = set([item for sublist in list(results.values()) for item in sublist])
            for word in unique_results:
                the_big_dict[word] = [subcorp_result[word] for name, subcorp_result in sorted(results.items(), key=lambda x: x[0])]
            # turn master dict into dataframe, sorted
            df = DataFrame(the_big_dict, index = sorted(results.keys()))

            numentries = len(df.columns)
            tot = df.sum(axis = 1)
            total_total = df.sum().sum()

        ############################################
        # Format, output as Interrogation object   #
        ############################################

        if not countmode:
            if not corpus.subcorpora or singlefile:
                if not files_as_subcorpora:
                    if not kwargs.get('df1_always_df'):
                        df = Series(df.iloc[0])  # .ix is gone; take the single row positionally
                        df.sort_values(ascending = False, inplace = True)
                        tot = df.sum()
                        numentries = len(df.index)
                        total_total = tot

        # sort by total
        if isinstance(df, DataFrame):
            if not df.empty:
                # order columns by their totals, descending
                df = df[df.sum().sort_values(ascending = False).index]

        # format final string
        if kwargs.get('printstatus', True):
            thetime = strftime("%H:%M:%S", localtime())
            finalstring = '\n\n%s: Interrogation finished!' % thetime
            if countmode:
                finalstring += ' %d matches.' % tot
            else:
                finalstring += ' %d unique results, %d total occurrences.' % (numentries, total_total)
            print(finalstring)

        if not no_conc:
            interro = Interrogation(results = df, totals = tot, query = locs, concordance = output)
        else:
            interro = Interrogation(results = df, totals = tot, query = locs)

        if quicksave:
            interro.save()
        
        return interro
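
# The Counter-to-DataFrame step above is worth isolating. A minimal sketch
# under assumed inputs (the names here are hypothetical, not part of the
# function above): each subcorpus maps to a Counter of hits, the columns are
# the union of all keys, and columns are then ordered by total, descending.
from collections import Counter
import pandas as pd

results = {'sub_a': Counter({'cat': 3, 'dog': 1}),
           'sub_b': Counter({'dog': 4})}
unique_results = set(k for c in results.values() for k in c)
the_big_dict = {w: [c[w] for _, c in sorted(results.items())] for w in unique_results}
df = pd.DataFrame(the_big_dict, index=sorted(results))
df = df[df.sum().sort_values(ascending=False).index]
print(df)   # 'dog' (5 total) is ordered before 'cat' (3 total)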
Example #53
0
import numpy as np
from pandas import Series, DataFrame

# obj is assumed from earlier in the source (not shown here), e.g.:
obj = Series([4.5, 7.2, -5.3, 3.6], index=['d','b','a','c'])
obj2 = obj.reindex(['a','b','c','d','e'])
obj2
obj.reindex(['a','b','c','d','e'], fill_value=0)
obj3 = Series(['blue','purple','yellow'], index=[0,2,4])
obj3.reindex(range(6), method='ffill')
frame = DataFrame(np.arange(9).reshape((3,3)), index=['a','c','d'], columns=['Ohio','Texas','California'])
frame2 = frame.reindex(['a','b','c','d'])
states = ['Texas','Utah','California']
frame.reindex(columns=states)
frame2 = frame.reindex(index=['a','b','c','d'], columns=states, method='ffill')
frame.reindex(index=['a','b','c','d'], columns=states)  # modern spelling of frame.ix[['a','b','c','d'], states]
obj = Series(np.arange(5.),index=['a','b','c','d','e'])
new_obj = obj.drop('c')
new_obj
obj.drop(['d','c'])
data = DataFrame(np.arange(16).reshape((4,4)),index=['Ohio','Colorado','Utah','New York'],columns=['one','two','three','four'])
data.drop(['Colorado','Ohio'])
data.drop('two',axis=1)
data.drop(['two','four'],axis=1)

obj = Series(np.arange(4.),index=['a','b','c','d'])
obj['b']
obj.iloc[1]     # positional access; plain integer keys on a labelled Series are deprecated
obj
obj[2:4]
obj[['b','a','d']]
obj.iloc[[1,3]]
obj[obj < 2]
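
# drop returns a new object and leaves the original alone unless inplace=True;
# a quick check, continuing with the obj defined just above:
dropped = obj.drop('a')
'a' in dropped.index   # False
'a' in obj.index       # True: obj is untouched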
Example #54
0
# -*- coding: utf-8 -*-
import numpy as np
from pandas import Series, DataFrame

# frame3 is assumed from earlier in the source: a DataFrame with
# 'Nevada' and 'Ohio' columns, e.g.:
frame3 = DataFrame({'Nevada': {2001: 2.4, 2002: 2.9},
                    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}})

print('#========== membership tests ===========#')
print('Ohio' in frame3.columns)

print('#------ reindex -----#')
obj = Series([4.5, 7.2, -5.3, 3.6], index=['a','d','b','c'])
print(obj)
obj2 = obj.reindex(['a','b','c','d','e','f'], fill_value=0)
print(obj2)

print('#------- dropping entries from an axis ------------#')
obj = Series(np.arange(5.), index=['a','b','c','d','e'])
print(obj)
new_obj = obj.drop('a')
print(new_obj)

data = DataFrame(np.arange(16.).reshape((4,4)),
                 index=['Ohio','Colorado','Utha','NewYork'],
                 columns=['one','two','three','four'])

print(data)
print(data.drop('Ohio'))         # drop a row
print(data.drop('one', axis=1))  # drop a column; axis=0 is rows, axis=1 is columns

print('#-------- indexing, selection and filtering ---------#')
obj = Series(np.arange(4.), index=['a','b','c','d'])
print(obj)
Example #55
0
import numpy as np
from pandas import Series, DataFrame

# obj is assumed from earlier in the source, e.g. a small labelled Series:
obj = Series([4.5, 7.2, -5.3, 3.6], index=['d','b','a','c'])
obj2 = obj.reindex(['a','b','c','e'])
# an index value with no existing entry becomes a missing value by default
# supply a fill value instead
obj2 = obj.reindex(['a','b','c','d','e'], fill_value=0)
# forward fill: propagate the previous value into each gap
obj3 = Series(['blue','purple','yellow'], index=[0,2,4])
obj3.reindex(range(6), method='ffill')
# reindex works on the rows by default, but columns can be reindexed too
frame = DataFrame(np.arange(9).reshape((3,3)), index=['a','c','d'],
	columns=['Ohio','Texas','California'])
states = ['Texas','Utah','California']
frame2 = frame.reindex(columns=states)

## dropping entries from an axis
obj = Series(np.arange(5.), index=['a','b','c','d','e'])
obj.drop('c')
obj.drop(['d','c'])
# dropping columns
data = DataFrame(np.arange(16).reshape(4,4),
	index = ['Ohio','Colorado','Utah','New York'],
	columns = ['one','two','three','four'])
data.drop(['one','four'], axis=1)

## indexing, selection and filtering (.ix is long gone; use .loc/.iloc)
data.loc['Colorado', ['two','three']]
data.loc['Colorado', data.columns[[3,0,1]]]
data.loc['Colorado']
data['two']
data.two

## arithmetic and data alignment
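
# the source snippet cuts off at the heading above; a minimal sketch of what
# it announces: arithmetic aligns on labels, and labels present on only one
# side come out as NaN
s1 = Series([1., 2., 3.], index=['a','b','c'])
s2 = Series([10., 20., 30.], index=['b','c','d'])
s1 + s2   # 'a' and 'd' become NaN; 'b' and 'c' are added label-by-label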
Example #56
0
from numpy import arange, array
from pandas import Series, DataFrame

s1 = Series([1.0,2,3])
s1.values
s1.index
s1.index = ['cat','dog','elephant']
s1.index

s1 = Series(arange(10.0,20.0))
s1.describe()
summ = s1.describe()
summ['mean']

s1 = Series(arange(1.0,6),index=['a','a','b','c','d'])
s1
s1.drop('a')

s1 = Series(arange(1.0,4.0),index=['a','b','c'])
s2 = Series(arange(1.0,4.0),index=['c','d','e'])
s3 = s1 + s2
s3
s3.dropna()

s1 = Series(arange(1.0,4.0),index=['a','b','c'])
s2 = Series(arange(1.0,4.0),index=['c','d','e'])
s3 = s1 + s2
s3.fillna(-1.0)
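
# an alternative to patching NaN afterwards: the arithmetic methods accept a
# fill_value that stands in for the missing side before adding
s1.add(s2, fill_value=0.0)   # 'a', 'd' and 'e' keep their one-sided values instead of NaN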

df = DataFrame(array([[1,2],[3,4]]))
df
Example #57
0
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# data is assumed from earlier in the source; any 200-element array will do
data = np.random.randn(200)

# build an index
index = pd.Index(np.arange(0, 200))
s = Series(data, index=index)
# print(s)

# reindex
s2 = s.reindex(np.arange(100, 300))
# print(s2)

# choose how newly introduced labels are filled
s3 = s.reindex(np.arange(180, 220), method='ffill')
# print(s3)

# drop the given index entries
s4 = s.drop(np.arange(0, 20))
# print(s4)

dic = {
    'Nevada': {2001: 2.4, 2002: 4.2, 2003: 1.2},
    'Ohio': {2000: 1.1, 2002: 4.4, 2001: 5.2}
}
f = DataFrame(dic)
# print(f)
# column names
cols = f.columns
# row names
index = f.index
# print(cols)
# print(index)
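
# a note on the nested-dict constructor above: the outer keys become the
# columns, the inner keys are unioned into the row index, and any missing
# (row, column) pair becomes NaN
print(f.columns)          # Index(['Nevada', 'Ohio'], dtype='object')
print(f['Nevada'][2000])  # nan: 'Nevada' has no entry for 2000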
Example #58
0
def test_cat_accessor_updates_on_inplace(self):
    s = Series(list('abc')).astype('category')
    s.drop(0, inplace=True)
    s.cat.remove_unused_categories(inplace=True)
    assert len(s.cat.categories) == 2
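
# the inplace= form above comes from an older pandas test suite; the cat
# accessor deprecated inplace=True around pandas 1.2 and removed it in 2.0,
# so the modern equivalent reassigns instead:
from pandas import Series
s = Series(list('abc')).astype('category')
s = s.drop(0)
s = s.cat.remove_unused_categories()
assert len(s.cat.categories) == 2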
Example #59
0
# Lecture 18: Drop Entry

import numpy as np
from pandas import Series,DataFrame
import pandas as pd

ser1 = Series(np.arange(3),index=['a','b','c'])
ser1

ser1.drop('b')

# data frame
dframe1 = DataFrame(np.arange(9).reshape(3,3),
                    index = ['SF','LA','NY'],
                    columns =['pop','size','year'])
dframe1

# drop a row
dframe1.drop('LA')

# drop a column
dframe1.drop('year', axis = 1) # axis = 0 is rows (default)
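
# equivalent spellings with the newer keyword arguments (pandas >= 0.21),
# which avoid remembering axis numbers:
dframe1.drop(index='LA')       # drop a row by label
dframe1.drop(columns='year')   # drop a column by label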

Example #60
0
# encoding: UTF-8
import numpy as np
from pandas import Series
r=Series([5,10,20,25,30,25,21,23,45,62])
s=[30,40,50,60]
h=r.drop([len(r)-1])
print(h)
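
# note: with the default RangeIndex, len(r)-1 is both the last label and the
# last position; two label-agnostic ways to drop the final element:
h = r.drop(r.index[-1])
h = r.iloc[:-1]
print(h)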
