Ejemplo n.º 1
0
def main():
    """Combine the selected numeric columns with one arithmetic operation.

    Reads globals: ``columns`` (list of column names) and ``oper`` (one of
    'add'/'sub'/'mul'/'div').  Seeds a result column named after the
    operation with the first column, then folds the remaining columns in
    with the chosen operator.  Returns ``None`` for an unknown operation.

    BUG FIX: the original nested the fold loop inside the seed loop and
    shadowed ``col``; it only worked because the outer loop iterated over
    ``columns[0:1]`` (a single element).  The loops are now sequential.
    """
    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')

    if columns:
        if oper not in ('add', 'sub', 'mul', 'div'):
            return None

        # Seed the result column with the first selected column.
        df[oper] = df[columns[0]]

        # Fold the remaining columns into the result.
        for name in columns[1:]:
            if oper == 'add':
                df['add'] += df[name]
            elif oper == 'sub':
                df['sub'] -= df[name]
            elif oper == 'mul':
                df['mul'] *= df[name]
            else:  # 'div'
                df['div'] /= df[name]

    return _helper.publish(df)
Ejemplo n.º 2
0
def main():
    """Yeo-Johnson power-transform each requested column.

    For every name in the global ``col`` list that exists in the frame,
    casts it to int64, fits a PowerTransformer, and appends the
    transformed values as ``<name>_yeojohn``.  Missing names are skipped.

    BUG FIX: the original's else-branch was ``Pass`` (capital P), which
    raised NameError whenever a requested column was absent.
    """
    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')

    for c in col:
        if c not in df:
            continue  # silently skip missing columns (was `Pass` -> NameError)
        df[c] = df[c].astype('int64')
        features = df[[c]]
        pt = PowerTransformer(method='yeo-johnson', standardize=True)
        # Fit the transformer, then transform the single-column frame.
        transformed = pt.fit(features).transform(features)
        # Append the transformed values under a derived column name.
        df_xt = pd.DataFrame(data=transformed, columns=[c + '_yeojohn'])
        df = df.join(df_xt)

    return _helper.publish(df)
Ejemplo n.º 3
0
def main():
    """Two-sample hypothesis test between groups of ``col_2``.

    Splits ``col_2`` into two samples keyed on the two most frequent
    values of ``col_1`` and runs the test selected by the global
    ``t_test`` (Student t, Kolmogorov-Smirnov, or Mood's median test).
    Stores the p-value as a 30-decimal string in a new column.
    """
    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')

    # The two samples are identical for every test: hoist them out of
    # the branches (the original duplicated these lines three times).
    top_two = df[col_1].value_counts().index
    a = df.loc[df[col_1] == top_two[0], col_2].to_numpy()
    b = df.loc[df[col_1] == top_two[1], col_2].to_numpy()

    if t_test == 'student t_test':
        # Null hypothesis: both groups share the same expected value.
        t_statistic, p_value = ttest_ind(a, b)
    elif t_test == 'Kolmogrov-Smirnov':
        # Two-sample Kolmogorov-Smirnov test.
        z_statistic, p_value = ks_2samp(a, b)
    else:
        # Two-sample Mood's median test.
        stat, p_value, m, tb = median_test(a, b)

    p_value = "{:.30f}".format(p_value)
    df[col_1 + '_' + col_2 + "_p_value"] = p_value

    return _helper.publish(df)
Ejemplo n.º 4
0
def main():
    """Render a histogram of ``col1`` vs ``col2`` colored by ``color_col``,
    with a marginal box plot and full-row hover data."""
    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')
    figure = px.histogram(
        df,
        x=col1,
        y=col2,
        color=color_col,
        marginal="box",
        hover_data=df.columns,
    )
    _helper.chart(figure)
Ejemplo n.º 5
0
def main():
    """Clean the text column ``col`` and add a VADER compound score.

    Strips HTML tags and selected punctuation, keeps only alphabetic
    words lower-cased, rewrites the column with the cleaned text, then
    appends the VADER 'compound' sentiment for each row.  Returns
    ``None`` when ``col`` is not in the frame.
    """
    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')

    # Clean HTML tags.
    def cleanhtml(sent):
        # BUG FIX: the original pattern r'<*?>' quantified the literal
        # '<' and never matched tag bodies; r'<.*?>' removes whole tags
        # non-greedily.
        return re.sub(r'<.*?>', r' ', str(sent))

    # Clean punctuation.
    def cleanpunc(word):
        cleanr = re.sub(r'[?|!|\'|"|#]', r' ', str(word))
        cleaned = re.sub(r'[)|(|\|/]', r' ', str(cleanr))
        return cleaned

    # Enter the column which has string data type.
    if col not in df.columns:
        return None

    lst_of_sent = []
    for sent in df[col].values:
        filtered_sent = []
        sent = cleanhtml(sent)
        for word in sent.split():
            for clean_words in cleanpunc(word).split():
                if clean_words.isalpha():
                    filtered_sent.append(clean_words.lower())
        lst_of_sent.append(filtered_sent)

    final_str = [' '.join(lst) for lst in lst_of_sent]
    df[col] = final_str

    # Extract sentiment from the cleaned text.
    analyzer = SentimentIntensityAnalyzer()
    compounds = []
    for sentence in final_str:
        vs = analyzer.polarity_scores(sentence)
        for key, item in vs.items():
            if key == 'compound':
                compounds.append(item)
            elif key in ('neg', 'pos', 'neu'):
                pass
            else:
                # Unexpected score key: abort, matching original behavior.
                return None

    df['compound'] = compounds
    return _helper.publish(df)
Ejemplo n.º 6
0
def main():
    """Add column 'exp' = column_1**m - column_2**n.

    Returns ``None`` when either input column is missing.
    """
    data = _helper.data()
    # BUG FIX: the original tested `df.empty` but the frame is bound to
    # `data`, raising NameError on every run.
    if data.empty:
        raise ValueError('Data Loading failed !')

    if column_1 in data.columns and column_2 in data.columns:
        data['exp'] = pow(data[column_1], m) - pow(data[column_2], n)
    else:
        return None

    return _helper.publish(data)
Ejemplo n.º 7
0
def main():
    """Add a base-10 logarithm column for ``column``.

    Returns ``None`` when the column is missing.
    """
    data = _helper.data()
    # BUG FIX: the original tested `df.empty` but the frame is bound to
    # `data`, raising NameError on every run.
    if data.empty:
        raise ValueError('Data Loading failed !')

    if column in data:
        data['logarithm_base10'] = np.log10(data[column])
    else:
        return None

    return _helper.publish(data)
Ejemplo n.º 8
0
def main():
    """Append an n-th power column for every name in ``columns``.

    Returns ``None`` as soon as a requested column is missing.
    """
    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')
    for name in columns:
        if name not in df.columns:
            return None
        df[name + "_n_power"] = df[name] ** n
    return _helper.publish(df)
Ejemplo n.º 9
0
def main():
    """Append a square-root column for every name in ``col``.

    Returns ``None`` as soon as a requested column is missing.
    """
    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')
    for name in col:
        if name not in df:
            return None
        df[name + 'sqrt_col'] = sqrt(df[name])
    return _helper.publish(df)
Ejemplo n.º 10
0
def main():
    """Append a rank column for every name in ``col``.

    Returns ``None`` as soon as a requested column is missing.
    """
    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')
    for name in col:
        if name not in df:
            return None
        df[name + 'rank_col'] = df[name].rank()
    return _helper.publish(df)
Ejemplo n.º 11
0
def main():
    """Box-Cox transform column ``col`` into a new 'box_cox' column.

    Returns ``None`` when the column is missing.
    """
    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')
    if col not in df:
        return None
    values = df[col].values
    # stats.boxcox returns (transformed, lambda); keep the data only.
    result = stats.boxcox(values)
    df['box_cox'] = result[0]
    return _helper.publish(df)
Ejemplo n.º 12
0
def main():
    """Publish the TF-IDF matrix of text column ``column`` as a new frame.

    Returns ``None`` when the column is missing; note the published frame
    is the dense TF-IDF matrix, not the input frame.
    """
    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')
    vectorizer = TfidfVectorizer()
    if column not in df:
        return None
    matrix = vectorizer.fit_transform(df[column])
    df_tfidf = pd.DataFrame(matrix.toarray())
    return _helper.publish(df_tfidf)
Ejemplo n.º 13
0
def main():
    """Append a Gauss-rank-scaled copy of every column named in ``col``.

    Returns ``None`` as soon as a requested column is missing.
    """
    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')
    for name in col:
        if name not in df:
            return None
        scaled = GaussRankScaler().fit_transform(df[[name]])
        frame = pd.DataFrame(scaled, columns=[name + '_gauss_rank'])
        df = df.join(frame)
    return _helper.publish(df)
def main():
    """One-sample hypothesis test on column ``col``.

    Runs the test selected by the global ``t_test`` (one-sample Student t
    against mean 0, Wilcoxon signed-rank, or Shapiro-Wilk normality) and
    stores the formatted p-value in a new ``<col>_p_value`` column.
    """
    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')
    if t_test == 'student t_test':
        # Null hypothesis: expected value is 0.
        t_statistic, p_value = ttest_1samp(df[col], 0)
    elif t_test == 'sign_test':
        # One-sample Wilcoxon signed-rank test.
        z_statistic, p_value = wilcoxon(df[col])
    else:
        # One-sample Shapiro-Wilk normality test.
        stat, p_value = shapiro(df[col])
    # BUG FIX: the original formatted the p-value only inside the Shapiro
    # branch, so the published column's type depended on the test chosen;
    # format uniformly (the sibling two-sample block formats all branches).
    p_value = "{:.5f}".format(p_value)
    df[col + "_p_value"] = p_value
    return _helper.publish(df)
Ejemplo n.º 15
0
def main():
    """Count positive and/or negative numeric values per row.

    The global ``var`` selects 'positive', 'negative', or 'both'; any
    other value returns ``None``.
    """
    data = _helper.data()
    # BUG FIX: the original tested `df.empty` but the frame is bound to
    # `data`, raising NameError on every run.
    if data.empty:
        raise ValueError('Data Loading failed !')

    if var == 'positive':
        data['positive_count'] = data.select_dtypes(
            include='number').ge(1).sum(axis=1)

    elif var == 'negative':
        data['negative_count'] = data.select_dtypes(
            include='number').lt(0).sum(axis=1)

    elif var == 'both':
        data['positive_count'] = data.select_dtypes(
            include='number').ge(1).sum(axis=1)
        # BUG FIX: the original wrote 'positive_count' twice here,
        # clobbering the positive count with the negative one.
        data['negative_count'] = data.select_dtypes(
            include='number').lt(0).sum(axis=1)

    else:
        return None

    return _helper.publish(data)
Ejemplo n.º 16
0
def main():
    """Add a windowed aggregate column for each name in ``columns``.

    Globals read: ``method_type`` ('rolling' windows, anything else
    means expanding), ``win_size_type`` ('window_by_row' or
    'window_by_time'), ``operation`` (sum/mean/median/count/quantile,
    anything else means variance), ``win_size`` and ``percentile``.

    The original copy-pasted the six-way operation dispatch four times;
    it is factored into one nested helper.
    """
    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')

    def _apply(window, name):
        # Write the selected aggregate of `window` into a suffixed column.
        if operation == "sum":
            df[name + "_sum"] = window.sum()
        elif operation == "mean":
            df[name + "_mean"] = window.mean()
        elif operation == "median":
            df[name + "_median"] = window.median()
        elif operation == "count":
            df[name + "_count"] = window.count()
        elif operation == "quantile":
            df[name + "_quantile"] = window.quantile(percentile)
        else:
            df[name + "_variance"] = window.var()

    for name in columns:
        if method_type == 'rolling':
            if win_size_type == 'window_by_row':
                _apply(df[name].rolling(win_size), name)

            elif win_size_type == 'window_by_time':
                if not isinstance(df.index, pandas.DatetimeIndex):
                    # No datetime index: synthesize random dates and index
                    # by them so a time-based window becomes possible.
                    min_year = 1900
                    max_year = datetime.now().year

                    start = datetime(min_year, 1, 1, 0, 0, 0)
                    years = max_year - min_year + 1
                    end = start + timedelta(days=365 * years)

                    # BUG FIX: the original assigned the whole column once
                    # per row inside a loop, leaving every row with the
                    # same (last) date; draw one random date per row.
                    df["date"] = [
                        start + (end - start) * random.random()
                        for _ in range(len(df))
                    ]
                    df = df.set_index(df['date'])

                _apply(df[name].rolling(win_size), name)

            else:
                pass
        else:
            _apply(df[name].expanding(win_size), name)

    return _helper.publish(df)