def preprocess(pd):
    """Clean a Series of raw text into lemmatized, stemmed strings.

    Despite its name, ``pd`` is the pandas Series of strings to process,
    not the pandas module.

    Steps, in order: lower-case; tokenize with ``w_tokenizer`` and lemmatize
    every token with ``lemmatizer``; drop tokens contained in
    ``removing_words``; stem the survivors with ``stemmer``; re-join the
    tokens with single spaces; finally replace each of the characters
    ``$ < > ? @ ` ' "`` with a space.

    ``lemmatizer``, ``w_tokenizer``, ``removing_words`` and ``stemmer`` are
    not defined here and must be provided by the surrounding module.

    Returns:
        The Series of cleaned strings.
    """
    pd = pd.str.lower()
    pd = pd.apply(
        lambda x: [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(x)])
    pd = pd.apply(lambda x: [item for item in x if item not in removing_words])
    pd = pd.apply(lambda x: [stemmer.stem(y) for y in x])
    pd = pd.str.join(' ')
    # The pattern is a regex character class, so regex=True must be explicit:
    # since pandas 2.0 ``str.replace`` treats the pattern as a literal string
    # by default, which would leave every symbol in place.
    pd = pd.str.replace('[{}]'.format('$<>?@`\'"'), ' ', regex=True)
    return pd
Ejemplo n.º 2
0
def preprocess_lite(pd):
    """Lightly normalize a Series of raw text into lists of lemmas.

    Despite its name, ``pd`` is the pandas Series of strings to process,
    not the pandas module.

    Steps, in order: lower-case; replace ASCII punctuation, newlines and tabs
    with spaces; tokenize with ``w_tokenizer``; pass each token list through
    ``convert_numbers``; re-join with spaces; tokenize again and lemmatize
    with ``lemmatizer``; drop tokens of length 1 or less.

    ``w_tokenizer``, ``convert_numbers`` and ``lemmatizer`` are not defined
    here and must be provided by the surrounding module.

    Returns:
        A Series whose elements are lists of token strings (the tokens are
        not re-joined at the end).
    """
    pd = pd.str.lower()
    # The pattern is a regex character class, so regex=True must be explicit:
    # since pandas 2.0 ``str.replace`` treats the pattern as a literal string
    # by default, which would leave all punctuation in place.
    pd = pd.str.replace(
        '[{}]'.format('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\n\t'), ' ',
        regex=True)
    pd = pd.apply(lambda x: [w for w in w_tokenizer.tokenize(x)])
    pd = pd.apply(lambda x: convert_numbers(x))
    pd = pd.str.join(' ')

    # Second tokenization pass: convert_numbers may have changed the tokens,
    # so the text is re-split before lemmatizing.
    pd = pd.apply(
        lambda x: [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(x)])
    pd = pd.apply(lambda x: [item for item in x if len(item) > 1])
    return pd
Ejemplo n.º 3
0
    def callLambda(self, item, reload=False):
        """Evaluate the row-wise lambda for ``item`` on every per-year frame.

        ``item`` is expected to look like ``"<prefix>@<field>"``; the part
        after the first ``@`` selects the entries in ``prereqMaster`` and
        ``lambdaMaster`` (both defined outside this method). Items without
        an ``@`` are ignored.

        Args:
            item: Name of the field to compute.
            reload: When true, recompute even if ``item`` is already a
                column of ``self.mainPanda``.
        """
        # Membership, not identity: the previous ``not item is <array>`` test
        # compared a name with the whole column array, was therefore always
        # true, and made ``reload`` meaningless.
        if (item not in self.mainPanda.columns) or reload:

            if "@" in item:

                parts = item.split("@")
                field = parts[1]

                # Make sure everything this field depends on is computed first.
                for prq in prereqMaster[field]:
                    self.callitem(prq)

                for an in self.anhos:
                    frame = self.dictpandas[an]
                    # Apply to the frame itself; the pandas module has no
                    # top-level ``apply`` function.
                    # NOTE(review): the result replaces the stored frame, as
                    # before - confirm the lambdas return complete rows.
                    self.dictpandas[an] = frame.apply(
                        lambdaMaster[field], axis=1)
Ejemplo n.º 4
0
def preprocess(pd):
    """Reduce a Series of raw text to its lemmatized nouns and adjectives.

    Despite its name, ``pd`` is the pandas Series of strings to process,
    not the pandas module.

    Steps, in order: lower-case; replace every non-letter with a space;
    tokenize with ``w_tokenizer`` and re-join; tokenize again and lemmatize
    with ``lemmatizer`` (default part of speech, then as verbs); drop tokens
    in ``stop_words``; drop tokens of length 3 or less; keep only tokens that
    ``nltk.pos_tag`` labels as an adjective or noun tag; join with spaces.

    ``w_tokenizer``, ``lemmatizer``, ``stop_words`` and ``nltk`` are not
    defined here and must be provided by the surrounding module.

    Returns:
        The Series of cleaned strings.
    """
    pd = pd.str.lower()
    # The pattern is a regex, so regex=True must be explicit: since pandas
    # 2.0 ``str.replace`` treats the pattern as a literal string by default,
    # which would leave digits and punctuation in place.
    pd = pd.str.replace('[^a-zA-Z]', ' ', regex=True)
    pd = pd.apply(lambda x: [w for w in w_tokenizer.tokenize(x)])
    pd = pd.str.join(' ')

    pd = pd.apply(
        lambda x: [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(x)])
    pd = pd.apply(lambda x: [lemmatizer.lemmatize(w, 'v') for w in x])
    pd = pd.apply(lambda x: [item for item in x if item not in stop_words])
    pd = pd.apply(lambda x: [item for item in x if len(item) > 3])
    # pos_tag yields (token, tag) pairs; keep the token when the tag is one
    # of the listed adjective / noun tags.
    pd = pd.apply(lambda x: [
        i[0] for i in nltk.pos_tag(x)
        if i[1] in ['JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNP', 'NNPS']
    ])
    pd = pd.apply(lambda x: " ".join(x))
    return pd
def calcular_proporcion_facturacion_inferior_25(pd):
    """Attach a per-'grupo' ratio of the 'menor_percentil25' flag to the frame.

    Despite its name, ``pd`` is the DataFrame to process, not the pandas
    module. It must contain the columns 'percentil25' and 'grupo'.

    The function adds a 'menor_percentil25' column to ``pd`` IN PLACE (the
    caller's frame is modified), aggregates that column per 'grupo' as
    ``sum / (count - 1)`` with NaN replaced by 0, and inner-merges the
    aggregate back onto ``pd`` on 'grupo'.

    ``is_menor_25porciento`` is not defined here and must be provided by the
    surrounding module.

    Returns:
        The merged DataFrame.
    """

    # NOTE(review): both arguments are x['percentil25']; the first one was
    # presumably meant to be a different column - confirm against
    # is_menor_25porciento's signature.
    pd['menor_percentil25'] = pd.apply(
        lambda x: is_menor_25porciento(x['percentil25'], x['percentil25']),
        axis=1)

    # agg with a list of functions yields two-level columns:
    # ('menor_percentil25', 'sum') and ('menor_percentil25', 'count').
    porcentaje_menor_percentil25 = pd[['grupo', 'menor_percentil25']] \
        .groupby('grupo').agg(['sum', 'count']).reset_index()

    # NOTE(review): the denominator is count - 1, so a group with a single
    # row divides by zero (NaN for 0/0, inf otherwise) - confirm intended.
    porcentaje_menor_percentil25['porcentaje_fact_menor_percentil_25'] = \
        (porcentaje_menor_percentil25['menor_percentil25']['sum'] / (porcentaje_menor_percentil25['menor_percentil25']['count'] - 1))

    # Only NaN is replaced here; infinite ratios are left as they are.
    porcentaje_menor_percentil25['porcentaje_fact_menor_percentil_25'] = \
        porcentaje_menor_percentil25['porcentaje_fact_menor_percentil_25'].fillna(0)

    porcentaje_menor_percentil25 = porcentaje_menor_percentil25[[
        'grupo', 'porcentaje_fact_menor_percentil_25'
    ]]

    # NOTE(review): the right-hand frame still has two-level columns while
    # ``pd`` has one level; verify this merge on the pandas version in use.
    pd_variable_fact_inferior_25 = pd.\
        merge(porcentaje_menor_percentil25, how='inner', on='grupo')

    return pd_variable_fact_inferior_25
Ejemplo n.º 6
0
def quantize(x, step):
    """Return the signed quantization level of ``x`` for a given step size.

    The magnitude ``|x| / step`` is rounded to the nearest integer with
    halves rounded up (away from zero once the sign is restored). The result
    is the level index, not the level multiplied back by ``step``.

    Args:
        x: Scalar, NumPy array or pandas Series/DataFrame of numbers.
        step: Width of one quantization level.

    Returns:
        ``sign(x) * floor(|x| / step + 0.5)``, with the same shape as ``x``.
    """
    # NumPy ufuncs are applied directly: pandas has no module-level ``apply``
    # and the previous body read an undefined name ``value`` instead of ``x``.
    magnitude = np.floor(np.abs(x) / step + 0.5)
    return np.sign(x) * magnitude
Ejemplo n.º 7
0
        df.dtypes


dates = pd.date_range("20161201", periods=7)
# The frame gets its own name: rebinding ``pd`` here would shadow the pandas
# module alias and break every later ``pd.`` call in the file.
df = pd.DataFrame([
    {
        "a": "1",
        "b": "2",
        "c": "3",
    },
    {
        "a": "2",
        "b": "5",
        "c": "11",
    },
    {
        "a": "3",
        "b": "2",
        "c": "30",
    },
    {
        "a": "4",
        "b": "5",
        "c": "3",
    },
])

ss = {"1": "11111", "2": "2222", "3": "3333", "4": "4444"}
# Columns hold numeric strings, so convert to int before multiplying row-wise.
df["d"] = df.apply(lambda x: int(x["a"]) * int(x["b"]), axis=1)

# Function-call form of print works on both Python 2 and Python 3.
print(df.columns)
Ejemplo n.º 8
0
def pd_to_int(h2o, pd):
    """Pair ``h2o`` with a 0/1 version of ``pd``.

    ``pd`` is a pandas object exposing ``apply`` (not the pandas module);
    every element is mapped to 1 when truthy and 0 otherwise. ``h2o`` is
    passed through untouched.

    Returns:
        A ``(h2o, converted)`` tuple.
    """
    def _as_flag(value):
        if value:
            return 1
        return 0

    flags = pd.apply(_as_flag)
    return h2o, flags
Ejemplo n.º 9
0
def funtionICP(X, Y, ExpInd, alpha=0.1, mode="asymptotic", intercept=False):
    """Estimate coefficients, confidence bounds and p-values across environments.

    NOTE(review): this reads like an unfinished line-by-line port of an R
    routine (R idioms such as ``table(ExpInd)``, ``paste(...)`` and
    ``class(retobj) <-`` survive in strings and comments). Several calls
    below do not match the Python/NumPy/pandas APIs and are flagged inline;
    the function cannot run to completion as written.

    Args:
        X: Predictor data; must be a ``np.ndarray`` or ``pd.DataFrame`` with
            more rows than columns.
        Y: Response; must be a ``np.ndarray``.
        ExpInd: Either a vector with one environment label per observation,
            or a list of per-environment observation indices.
        alpha: Level used when computing ``addvar`` via ``norm.ppf``.
        mode: Accepted but never read in the body.
        intercept: When true, a column of ones is prepended to ``X``.

    Returns:
        ``retobj``, intended to bundle betahat, maximinCoefficients, ConfInt,
        pvalues, colnames and alpha.

    Raises:
        ValueError: If ``X`` or ``Y`` has the wrong type, or ``X`` does not
            have more rows than columns.
        Exception: If ``ExpInd`` fails the length / range / count checks.
    """
    # NOTE(review): ``list`` has no ``isnumeric`` method and ``np.asmatrix``
    # takes no ``ncol`` argument, so a list input fails on this branch.
    if isinstance(X, list) and X.isnumeric():
        X = np.asmatrix(X, ncol=1)
    if not isinstance(X, np.ndarray) and not isinstance(X, pd.DataFrame):
        raise ValueError("'X' must be a matrix or data frame")
    if not isinstance(Y, np.ndarray):
        raise ValueError("'Y' must be a vector")
    if X.shape[0] <= X.shape[1]:
        raise ValueError(
            "hiddenICP not suitable for high-dimensional data (at the moment) \n -- need row > column but have nrow(X)= {} and ncol(X)={}"
            .format(X.shape[0], X.shape[1]))
    if not isinstance(ExpInd, list):  # If ExpInd is not a list
        if len(ExpInd) != len(Y):
            raise Exception(
                "if `ExpInd' is a vector, it needs to have the same length as `Y'"
            )
        uni = np.unique(ExpInd)
        if len(uni) == 1:
            # NOTE(review): ``uni`` has exactly one element here, so
            # ``uni[1]`` raises IndexError before the message is built.
            raise Exception(
                "There is just one environment ('ExpInd'= {} for all observations) and the method needs at least two distinct environments sep = "
                .format(uni[1]))
        # NOTE(review): if ``Counter`` is collections.Counter, ``min`` over it
        # compares the labels (keys), not the per-label counts - confirm.
        if min(Counter(ExpInd)) <= 2:
            print("\nOut put of 'table(ExpInd)':\n ")
            print(Counter(ExpInd))
            raise Exception(
                "one environment has just one or two observations (as supllied by 'ExpInd'); there need to be at least 3 (and ideally dozens) of observations in each environment; the out put of 'table(ExpInd)' is given below to show the number of observations in each unique environment as supplied by 'ExpInd'"
            )
        K = len(uni)
        ExpIndNEW = list()
        for uc in range(0, K):
            # NOTE(review): index assignment on an empty list raises
            # IndexError; ``np.where`` with one argument also returns a tuple,
            # on which ``setattr`` cannot set new attributes.
            ExpIndNEW[uc] = np.where(ExpInd == uni[uc])
            setattr(ExpIndNEW[uc], "value", uni[uc])
        ExpInd = ExpIndNEW  # Now ExpInd is a list
        del ExpIndNEW
    else:  #if ExpInd is a list
        if min(ExpInd) < 1:
            raise Exception(
                "if `ExpInd' is a list with indicies of observations, \n minimal entry has to be at least 1 but is {}"
                .format(min(ExpInd)))
        if max(ExpInd) > len(Y):
            raise Exception(
                "if `ExpInd' is a list with indicies of observations, \n maximal entry has to be at most equal \n to the length {} of the observations but is {}"
                .format(len(Y), max(ExpInd)))
    X = pd.DataFrame(X)
    '''if len(ucol = set(X.shape[1])) < min(3, X.shape[1]) :
       colnames(X) = paste("Variable",1:X.shape[1],sep="_")'''
    colX = X.columns
    if intercept:
        # After this line X is a plain ndarray with a leading column of ones.
        X = np.column_stack((np.repeat(1, X.shape[0]), X))
    K = len(ExpInd)
    p = X.shape[1]
    n = X.shape[0]

    kc = 1
    # With two environments only one pass of the loop below is made.
    if K > 2:
        KC = K
    else:
        KC = 1

    # NOTE(review): ``np.zeros`` receives its shape both positionally and by
    # keyword here, which raises TypeError.
    ConfInt = np.zeros(0, shape=[2, p])
    pvalues = np.repeat(1, p)
    for kc in range(0, KC):
        ins = ExpInd[kc]
        # NOTE(review): this indexes the 2-tuple ``(1, n)``; presumably meant
        # "all observations except ``ins``" - confirm. The ``X[ins:]`` /
        # ``X[out:]`` slices below likewise look like row selections.
        out = (1, n)[-ins]
        DS = (np.transpose(X[ins:]).dot(X[ins:])) / len(ins) - (np.transpose(
            X[out:]).dot(X[out:])) / len(out)
        Drho = (np.transpose(X[ins:]).dot(Y[ins])) / len(ins) - (np.transpose(
            X[out:]).dot(Y[out])) / len(out)
        # NOTE(review): DSI is computed exactly like betahat below
        # (solve(DS, Drho)); it was presumably meant to be the inverse of DS.
        DSI = np.linalg.solve(DS, Drho)

        betahat = pd.to_numeric(np.linalg.solve(DS, Drho))
        # NOTE(review): the loop starts at kc == 0, so the first pass takes
        # the ``else`` branch and reads ``betahatall`` before it is assigned.
        if kc == 1:
            betahatall = betahat
        else:
            betahatall = betahatall + betahat
        Zin = np.zeros(shape=[len(ins), p])
        Zout = np.zeros(shape=[len(out), p])
        for i in range(0, len(ins)):
            tmp = DSI * X[ins[i], ]
            Zin[i, ] = pd.to_numeric(-tmp * sum(tmp * Drho) + Y[ins[i]] * tmp)

        for i in range(0, len(out)):
            tmp = DSI * X[out[i], ]
            Zout[i, ] = pd.to_numeric(-tmp * sum(tmp * Drho) + Y[out[i]] * tmp)

        # NOTE(review): if ``math`` is the stdlib module, ``math.sqrt`` only
        # accepts scalars, while ``np.diag(...)`` yields an array here.
        sigmavec = math.sqrt(
            np.diag((np.cov(Zin) / len(ins) + np.cov(Zout) / len(out))))

        # NOTE(review): builtin ``min``/``max`` are used on array-valued
        # arguments here and below; element-wise minimum/maximum was
        # presumably intended. ``norm`` is presumably scipy.stats.norm, whose
        # ``cdf`` has no ``df`` parameter - confirm.
        pvalues = min(
            pvalues, 2 * K *
            (1 -
             norm.cdf(abs(betahat) / max(pow(10, -10), sigmavec), df=n - 1)))

        addvar = norm.ppf(max(0.5, 1 - alpha / (2 * K))) * sigmavec
        maximineffectsN = np.sign(betahat) * max(0, abs(betahat) - addvar)
        # NOTE(review): ``ConfInt`` has two rows, so ``ConfInt[1:]`` is the
        # second row and ``ConfInt[2:]`` is empty.
        ConfInt[1:] = max(ConfInt[1:], betahat - addvar, True)
        ConfInt[2:] = min(ConfInt[2:], betahat + addvar, True)
        if kc == 1:
            maximineffects = maximineffectsN
        else:
            # NOTE(review): ``range(1, p + 1)`` skips index 0 and reaches
            # index ``p``, one past the last valid position of a length-p array.
            for varc in range(1, p + 1):
                if abs(maximineffectsN[varc]) > abs(maximineffects[varc]):
                    maximineffects[varc] = maximineffectsN[varc]

    betahat = betahatall / KC
    maximinCoefficients = maximineffects
    if intercept:
        # NOTE(review): in Python ``[-1]`` selects the LAST element; dropping
        # the intercept entry (the first column added above) was presumably
        # intended - confirm.
        betahat = betahat[-1]
        maximinCoefficients = maximinCoefficients[-1]
        ConfInt = ConfInt[:-1]
        pvalues = pvalues[-1]

    # NOTE(review): the pandas module exposes no top-level ``apply``; if
    # ``pd`` is pandas this line raises AttributeError.
    ConfInt = pd.apply(ConfInt, 2, result_type='sort', decreasing=True)
    # NOTE(review): ``list()`` accepts no keyword arguments (TypeError); a
    # dict was presumably intended.
    retobj = list(betahat=betahat,
                  maximinCoefficients=maximinCoefficients,
                  ConfInt=ConfInt,
                  pvalues=pvalues,
                  colnames=colX,
                  alpha=alpha)
    #class(retobj) <- "hiddenInvariantCausalPrediction"
    return retobj
Ejemplo n.º 10
0
    labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls)
  File "pandas/_libs/hashtable_class_helper.pxi", line 1367, in pandas._libs.hashtable.PyObjectHashTable.get_labels
TypeError: unhashable type: 'list'
>>> pd.get_dummies(Y[1])
   She  by  seashells  seashore.  sells  the
0    1   0          0          0      0    0
1    0   0          0          0      1    0
2    0   0          1          0      0    0
3    0   1          0          0      0    0
4    0   0          0          0      0    1
5    0   0          0          1      0    0
>>> Y[1]
['She', 'sells', 'seashells', 'by', 'the', 'seashore.']
>>> pd.get_dummies
<function get_dummies at 0x10dfe1ea0>
>>> pd.apply(get_dummies(Y))
Traceback (most recent call last):
  File "<pyshell#19>", line 1, in <module>
    pd.apply(get_dummies(Y))
AttributeError: module 'pandas' has no attribute 'apply'
>>> Y.apply(get_dummies)
Traceback (most recent call last):
  File "<pyshell#20>", line 1, in <module>
    Y.apply(get_dummies)
NameError: name 'get_dummies' is not defined
>>> Y.apply(pd.get_dummies)
0       I  by  seashells  seashore.  sell  the
0  1...
1       She  by  seashells  seashore.  sells  the
0...
dtype: object
Ejemplo n.º 11
0
def quantize(x, step):
    """Return the signed quantization level of ``x`` for a given step size.

    The magnitude ``|x| / step`` is rounded to the nearest integer with
    halves rounded up (away from zero once the sign is restored). The result
    is the level index, not the level multiplied back by ``step``.

    Args:
        x: Scalar, NumPy array or pandas Series/DataFrame of numbers.
        step: Width of one quantization level.

    Returns:
        ``sign(x) * floor(|x| / step + 0.5)``, with the same shape as ``x``.
    """
    # NumPy ufuncs are applied directly: pandas has no module-level ``apply``
    # and the previous body read an undefined name ``value`` instead of ``x``.
    magnitude = np.floor(np.abs(x) / step + 0.5)
    return np.sign(x) * magnitude
Ejemplo n.º 12
0
def pd_to_int(h2o, pd):
    """Return ``h2o`` together with ``pd`` mapped to integer flags.

    ``pd`` is a pandas object exposing ``apply`` (not the pandas module).
    Falsy elements become 0 and every other element becomes 1; ``h2o`` is
    returned as given.

    Returns:
        A two-item tuple ``(h2o, flags)``.
    """
    as_int = pd.apply(lambda entry: 0 if not entry else 1)
    pair = (h2o, as_int)
    return pair