def main():
    """Fold the configured columns together with one arithmetic operation.

    Reads the frame from ``_helper.data()``; the global ``oper`` selects
    'add' | 'sub' | 'mul' | 'div'. The result column (named after ``oper``)
    is seeded from the first entry of ``columns`` and the remaining columns
    are folded into it. Returns ``None`` for an unknown ``oper``; otherwise
    publishes the frame.

    Raises:
        ValueError: when the loaded frame is empty.
    """
    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')
    # Single dispatch table replaces the two duplicated if/elif chains of
    # the original (one for seeding, one for folding).
    ops = {
        'add': lambda acc, s: acc + s,
        'sub': lambda acc, s: acc - s,
        'mul': lambda acc, s: acc * s,
        'div': lambda acc, s: acc / s,
    }
    if oper not in ops:
        return None
    if not columns:
        # Original's slice loops simply did nothing for an empty list.
        return _helper.publish(df)
    df[oper] = df[columns[0]]
    for name in columns[1:]:
        df[oper] = ops[oper](df[oper], df[name])
    return _helper.publish(df)
def main():
    """Apply a Yeo-Johnson power transform to each configured column.

    For every name in the global ``col`` list present in the frame, casts
    the column to int64, fits a standardizing Yeo-Johnson
    ``PowerTransformer`` on it, and joins the transformed values back as a
    new ``<name>_yeojohn`` column. Names absent from the frame are skipped.

    Raises:
        ValueError: when the loaded frame is empty.
    """
    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')
    for c in col:
        if c not in df:
            # BUGFIX: the original wrote a capitalized `Pass` here, which
            # raised NameError for any column missing from the frame.
            continue
        df[c] = df[c].astype('int64')
        features = df[[c]]
        pt = PowerTransformer(method='yeo-johnson', standardize=True)
        # Fit and transform in one step; the original's separate fit result
        # was immediately overwritten by the transform result anyway.
        transformed = pt.fit_transform(features)
        df_xt = pd.DataFrame(data=transformed, columns=[c + '_yeojohn'])
        df = df.join(df_xt)
    return _helper.publish(df)
def main():
    """Attach a two-sample test p-value comparing ``col_2`` across groups.

    Splits ``col_2`` into two samples keyed by the two most frequent values
    of ``col_1``, runs the test selected by the global ``t_test`` (Student
    t-test, two-sample Kolmogorov-Smirnov, or Mood's median test as the
    default), and stores the p-value formatted to 30 decimals in a new
    ``<col_1>_<col_2>_p_value`` column.

    Raises:
        ValueError: when the loaded frame is empty.
    """
    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')
    # The two samples were rebuilt identically in every branch of the
    # original — compute them once up front.
    top_two = df[col_1].value_counts().index
    a = df.loc[df[col_1] == top_two[0], col_2].to_numpy()
    b = df.loc[df[col_1] == top_two[1], col_2].to_numpy()
    if t_test == 'student t_test':
        # null hypothesis: the two group means are equal
        _, p_value = ttest_ind(a, b)
    elif t_test == 'Kolmogrov-Smirnov':
        # two-sample Kolmogorov-Smirnov test
        _, p_value = ks_2samp(a, b)
    else:
        # two-sample Mood's median test (statistic, p, grand median, table)
        _, p_value, _, _ = median_test(a, b)
    p_value = "{:.30f}".format(p_value)
    df[col_1 + '_' + col_2 + "_p_value"] = p_value
    return _helper.publish(df)
def main():
    """Render a histogram of ``col1`` against ``col2``, colored by
    ``color_col``, with a marginal box plot; every column of the frame is
    exposed in the hover tooltip.

    Raises:
        ValueError: when the loaded frame is empty.
    """
    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')
    fig = px.histogram(
        df,
        x=col1,
        y=col2,
        color=color_col,
        marginal="box",
        hover_data=df.columns,
    )
    _helper.chart(fig)
def main():
    """Clean the text in the global ``col`` column and add a VADER
    'compound' sentiment score per row.

    Each cell is stripped of HTML tags and selected punctuation, tokenized,
    reduced to lower-cased alphabetic tokens, and written back to ``col``.
    The cleaned sentences are then scored with
    ``SentimentIntensityAnalyzer``; the compound score lands in a new
    'compound' column. Returns ``None`` if ``col`` is missing or an
    unexpected polarity key appears.

    Raises:
        ValueError: when the loaded frame is empty.
    """
    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')

    def cleanhtml(sent):
        # BUGFIX: the original pattern r'<*?>' only matched runs of '<'
        # directly followed by '>', so tag names/attributes were left in the
        # text; r'<.*?>' removes whole tags non-greedily.
        return re.sub(r'<.*?>', r' ', str(sent))

    def cleanpunc(word):
        # Character classes kept byte-identical to the original (the '|'
        # inside a class is a literal, so '|' is stripped too).
        cleanr = re.sub(r'[?|!|\'|"|#]', r' ', str(word))
        return re.sub(r'[)|(|\|/]', r' ', str(cleanr))

    if col not in df.columns:
        return None
    lst_of_sent = []
    for sent in df[col].values:
        filtered_sent = []
        sent = cleanhtml(sent)
        for word in sent.split():
            for clean_word in cleanpunc(word).split():
                if clean_word.isalpha():
                    filtered_sent.append(clean_word.lower())
        lst_of_sent.append(filtered_sent)
    final_str = [' '.join(tokens) for tokens in lst_of_sent]
    df[col] = final_str

    # Extract the compound sentiment for each cleaned sentence.
    analyzer = SentimentIntensityAnalyzer()
    compounds = []
    for sentence in final_str:
        vs = analyzer.polarity_scores(sentence)
        for key, item in vs.items():
            if key == 'compound':
                compounds.append(item)
            elif key in ('neg', 'pos', 'neu'):
                pass
            else:
                return None
    df['compound'] = compounds
    return _helper.publish(df)
def main():
    """Publish the frame with a new 'exp' column equal to
    ``column_1**m - column_2**n``; returns ``None`` when either configured
    column is missing.

    Raises:
        ValueError: when the loaded frame is empty.
    """
    data = _helper.data()
    # BUGFIX: the original tested ``df.empty`` but the frame is bound to
    # ``data`` — that raised NameError on every run.
    if data.empty:
        raise ValueError('Data Loading failed !')
    if column_1 in data.columns and column_2 in data.columns:
        data['exp'] = pow(data[column_1], m) - pow(data[column_2], n)
    else:
        return None
    return _helper.publish(data)
def main():
    """Add a base-10 logarithm ('logarithm_base10') of the configured
    ``column`` and publish; returns ``None`` when the column is missing.

    Raises:
        ValueError: when the loaded frame is empty.
    """
    data = _helper.data()
    # BUGFIX: the original checked ``df.empty`` while the frame is bound to
    # ``data``, raising NameError unconditionally.
    if data.empty:
        raise ValueError('Data Loading failed !')
    if column not in data:
        return None
    data['logarithm_base10'] = np.log10(data[column])
    return _helper.publish(data)
def main():
    """Raise every configured column to the global power ``n``.

    Adds a ``<name>_n_power`` column per entry in ``columns``; returns
    ``None`` as soon as any configured column is absent from the frame.

    Raises:
        ValueError: when the loaded frame is empty.
    """
    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')
    for name in columns:
        if name not in df.columns:
            return None
        df[name + "_n_power"] = df[name] ** n
    return _helper.publish(df)
def main():
    """Append an element-wise square-root column for each entry in ``col``.

    New columns keep the original naming ``<name>sqrt_col``. Returns
    ``None`` when any configured column is missing from the frame.

    Raises:
        ValueError: when the loaded frame is empty.
    """
    import numpy as np  # local import: this block previously relied on a bare `sqrt`

    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')
    for c in col:
        if c not in df:
            return None
        # np.sqrt is element-wise over a Series; the original unqualified
        # ``sqrt`` would raise TypeError if it resolved to math.sqrt.
        df[c + 'sqrt_col'] = np.sqrt(df[c])
    return _helper.publish(df)
def main():
    """Add a rank column (``<name>rank_col``) for every configured column;
    return ``None`` if any of them is missing from the frame.

    Raises:
        ValueError: when the loaded frame is empty.
    """
    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')
    for name in col:
        if name not in df:
            return None
        df[name + 'rank_col'] = df[name].rank()
    return _helper.publish(df)
def main():
    """Box-Cox transform the configured column into a new 'box_cox' column.

    ``stats.boxcox`` returns (transformed values, fitted lambda); only the
    values are stored. Returns ``None`` when the column is absent.

    Raises:
        ValueError: when the loaded frame is empty.
    """
    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')
    if col not in df:
        return None
    values = df[col].values
    transformed, _lmbda = stats.boxcox(values)
    df['box_cox'] = transformed
    return _helper.publish(df)
def main():
    """Publish a dense TF-IDF feature matrix built from the configured text
    column. Note the published frame is the TF-IDF matrix only, not the
    original data. Returns ``None`` when the column is missing.

    Raises:
        ValueError: when the loaded frame is empty.
    """
    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')
    vectorizer = TfidfVectorizer()
    if column not in df:
        return None
    matrix = vectorizer.fit_transform(df[column])
    df_tfidf = pd.DataFrame(matrix.toarray())
    return _helper.publish(df_tfidf)
def main():
    """Gauss-rank transform each configured column, joining the result back
    as ``<name>_gauss_rank``; returns ``None`` on any missing column.

    Raises:
        ValueError: when the loaded frame is empty.
    """
    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')
    for name in col:
        if name not in df:
            return None
        scaler = GaussRankScaler()
        transformed = scaler.fit_transform(df[[name]])
        frame = pd.DataFrame(transformed, columns=[name + '_gauss_rank'])
        df = df.join(frame)
    return _helper.publish(df)
def main():
    """Run a one-sample test on the configured column and store its p-value.

    The global ``t_test`` selects Student's one-sample t-test (against mean
    0), the one-sample Wilcoxon signed-rank test, or (default) the
    Shapiro-Wilk normality test; the p-value is written, formatted to five
    decimals, into a new ``<col>_p_value`` column.

    Raises:
        ValueError: when the loaded frame is empty.
    """
    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')
    sample = df[col]
    if t_test == 'student t_test':
        # null hypothesis: expected value equals 0
        _, p_value = ttest_1samp(sample, 0)
    elif t_test == 'sign_test':
        # one-sample Wilcoxon signed-rank test
        _, p_value = wilcoxon(sample)
    else:
        # one-sample Shapiro-Wilk normality test
        _, p_value = shapiro(sample)
    df[col + "_p_value"] = "{:.5f}".format(p_value)
    return _helper.publish(df)
def main():
    """Count positive and/or negative numeric values per row.

    The global ``var`` selects 'positive' (values >= 1), 'negative'
    (values < 0), or 'both'; any other value returns ``None``. Counts land
    in 'positive_count' / 'negative_count' columns.

    Raises:
        ValueError: when the loaded frame is empty.
    """
    data = _helper.data()
    # BUGFIX: the original tested ``df.empty`` but the frame is bound to
    # ``data`` — that raised NameError on every run.
    if data.empty:
        raise ValueError('Data Loading failed !')
    numeric = data.select_dtypes(include='number')
    if var == 'positive':
        data['positive_count'] = numeric.ge(1).sum(axis=1)
    elif var == 'negative':
        data['negative_count'] = numeric.lt(0).sum(axis=1)
    elif var == 'both':
        data['positive_count'] = numeric.ge(1).sum(axis=1)
        # BUGFIX: the original wrote the negative count back into
        # 'positive_count', clobbering the positive tally.
        data['negative_count'] = numeric.lt(0).sum(axis=1)
    else:
        return None
    return _helper.publish(data)
def main():
    """Add a windowed aggregate column for every entry in ``columns``.

    Globals drive the behavior: ``method_type`` chooses rolling vs
    expanding windows, ``win_size_type`` chooses row-count vs time-based
    rolling windows, ``operation`` picks the aggregate (sum / mean / median
    / count / quantile, anything else meaning variance), ``win_size`` sizes
    the window and ``percentile`` feeds the quantile. A frame without a
    DatetimeIndex gets a synthetic random 'date' index before time-based
    windowing. An unknown ``win_size_type`` leaves the column untouched.

    Raises:
        ValueError: when the loaded frame is empty.
    """
    df = _helper.data()
    if df.empty:
        raise ValueError('Data Loading failed !')
    for col in columns:
        if method_type == 'rolling':
            if win_size_type == 'window_by_row':
                _apply_window_op(df, col, df[col].rolling(win_size))
            elif win_size_type == 'window_by_time':
                if not isinstance(df.index,
                                  pandas.core.indexes.datetimes.DatetimeIndex):
                    df = _with_random_date_index(df)
                _apply_window_op(df, col, df[col].rolling(win_size))
            else:
                pass  # unknown window-size type: no-op, as in the original
        else:
            _apply_window_op(df, col, df[col].expanding(win_size))
    return _helper.publish(df)


def _apply_window_op(df, col, window):
    """Write the aggregate selected by the global ``operation`` into
    ``df['<col>_<operation>']``; any unrecognized operation falls back to
    variance. (Replaces four identical dispatch chains in the original.)"""
    if operation == "sum":
        df[col + "_sum"] = window.sum()
    elif operation == "mean":
        df[col + "_mean"] = window.mean()
    elif operation == "median":
        df[col + "_median"] = window.median()
    elif operation == "count":
        df[col + "_count"] = window.count()
    elif operation == "quantile":
        df[col + "_quantile"] = window.quantile(percentile)
    else:
        df[col + "_variance"] = window.var()


def _with_random_date_index(df):
    """Give ``df`` a synthetic 'date' column of random timestamps between
    1900 and now, and re-index the frame by it (sorted)."""
    min_year = 1900
    max_year = datetime.now().year
    start = datetime(min_year, 1, 1, 0, 0, 0)
    years = max_year - min_year + 1
    end = start + timedelta(days=365 * years)
    # BUGFIX: the original loop overwrote the whole 'date' column with the
    # same value on each iteration (loop index unused), leaving a constant
    # column; draw one random timestamp per row instead.
    df["date"] = [start + (end - start) * random.random()
                  for _ in range(len(df))]
    df = df.set_index(df['date'])
    # Time-offset rolling windows require a monotonic index; sorting makes
    # the synthetic index usable for them.
    return df.sort_index()