Esempio n. 1
0
 def disp_df(self, *args):
     """Re-render ``self.df`` inside the output widget, clearing old content first.

     Extra positional arguments (e.g. widget-event payloads) are accepted
     and ignored so this can be wired directly as a widget callback.
     """
     self.out.clear_output()
     with self.out:
         try:
             show(self.df)
         # Best-effort render: swallow display errors, but use Exception
         # (not a bare except:) so KeyboardInterrupt/SystemExit still propagate.
         except Exception:
             pass
# In[3]:


import seaborn as sns
# Heatmap of matrix M on a log scale; log1p compresses the dynamic range so
# small entries stay visible next to large ones.  (np and M come from earlier cells.)
sns.heatmap(np.log1p(M))


# Here we'll also present the dataframe containing the remapped numberings (in the index column) to reference with the above figure.  Remember, each row (and column) corresponds to a specific label in the parcellation.  Due to the renumbering we performed, the index column of the below table indicates which label is associated with which row of the above matrix figure.

# In[4]:


import itables
# Move the remapped label numbers out of the index into a regular column so
# they are visible in the rendered table.  (remappingFrame comes from an earlier cell.)
resetTable=remappingFrame.reset_index()
itables.show(resetTable,paging=True)


# In order to make the information in the matrix a bit more digestible, we can look at the information contained in each row/column as a bar graph.  Below we'll do this in an interactive fashion.  Be warned:  some rows have only a few connections and are fairly straightforward to view in this way, while others may have a large number of connections and may result in a particularly large bar plot.

# In[5]:



dropDownList=list(zip(currentParcellationEntries['LabelName:'].to_list(), currentParcellationEntries['#No.'].to_list()))

def plotCurrentLabelCounts(currLabel):
    import seaborn as sns
    from itertools import compress
    import matplotlib.pyplot as plt
    #convert the input FS Label number to the renumbered index
Esempio n. 3
0
from itables.javascript import load_datatables
import itables.options as opts
import pandas as pd
from tensorflow.python.lib.io import file_io

# Forcefully load required JavaScript and CSS for datatables.
load_datatables()

# Remove maxBytes limit to prevent issues where the entire table cannot be
# rendered due to the size of the data.
opts.maxBytes = 0

files = file_io.get_matching_files(source)

# Column headers supplied by the pipeline; falsy means "take headers from the
# first row of each CSV".  Looked up once instead of on every branch.
headers = variables.get("headers", False)

# When explicit headers are given, read the files headerless and apply the
# supplied names after concatenation; otherwise let pandas use each file's
# first row as the header.
if headers:
    dfs = [pd.read_csv(f, header=None) for f in files]
else:
    dfs = [pd.read_csv(f) for f in files]

# Display the combined DataFrame as output.
df = pd.concat(dfs)
if headers:
    df.columns = headers
show(df)
Esempio n. 4
0
def test_show_test_series(series_name, series):
    """Smoke test: rendering the fixture series with show() must not raise."""
    show(series)
Esempio n. 5
0
def test_show_test_dfs(df_name, df):
    """Smoke test: rendering the fixture DataFrame with show() must not raise."""
    show(df)
Esempio n. 6
0
def test_get_indicators():
    """Indicator table has exactly 500 rows and a non-empty column set, and renders."""
    indicators = get_indicators()
    row_count = len(indicators.index)
    col_count = len(indicators.columns)
    assert row_count == 500
    assert col_count
    show(indicators)
Esempio n. 7
0
def test_get_population():
    """Population series has >30 entries, a maximum above 7 billion, and renders."""
    population = get_population()
    assert len(population) > 30
    assert population.max() > 7e9
    show(population)
Esempio n. 8
0
def test_get_countries():
    """Country table has more than 100 rows and more than 5 columns, and renders."""
    countries = get_countries()
    # The two size checks are independent; order swapped relative to columns-first style.
    assert len(countries.index) > 100
    assert len(countries.columns) > 5
    show(countries)
Esempio n. 9
0
def test_no_warning_when_eval_functions_is_true(df, coloredColumnDefs):
    """With eval_functions=True, show() must complete without emitting any warning."""
    # Escalate every warning to an error so any warning fails this test.
    warnings.simplefilter("error")
    show(df, columnDefs=coloredColumnDefs, eval_functions=True)
Esempio n. 10
0
def test_warning_when_eval_functions_is_missing(df, coloredColumnDefs):
    """Omitting eval_functions must raise the "starts with 'function'" UserWarning."""
    expected = "starts with 'function'"
    with pytest.warns(UserWarning, match=expected):
        show(df, columnDefs=coloredColumnDefs)
# Cast repayment dates to datetime.
# NOTE(review): bare "datetime64" is deprecated in recent pandas; the explicit
# form is astype("datetime64[ns]") -- confirm the target pandas version.
data["repay_date"] = data["repay_date"].astype("datetime64")
# data["register_time"]=data["register_time"].astype("datetime64")
# data["register_time_hour"]=data["register_time"].hour

# data["os_new"]=data["os"].map(lambda x:x[0:10])

# data["app_type_num"] = data.apply(lambda x : sum([1 if x[i] > 0 else 0 for i in var_list]) , axis=1)
# data["device_manufacturer_new"]=data["device_manufacturer"]
# data["device_manufacturer_new"][data["device_manufacturer_new"].isin(device_manufacturer_hebing)]='others'

# Report the dataset shape (the red ANSI-colored label reads "sample size").
print("\033[1;31m样本量\n \033[0m", data.shape)
print(data.columns.values)
# print(type(data))
# print(data.dtypes)
# print(data.describe())
# Render summary statistics as an interactive table.
show(data.describe())

# ,dtype={"order_id":"str","account_id":"str"}

# In[6]:

# data loading

# print(data.children_number.dtypes)

# print(data.info())
# show(data.head())


def sample_tag(x):
    if x > 0:
def data_explore(df,
                 df_name,
                 col_width="200px",
                 na_threshold=0.7,
                 is_show_all=0):
    """
    Quick exploratory data analysis (EDA) of a DataFrame.

    df -- the DataFrame to explore
    df_name -- name for the DataFrame; used to build the file name of the
        exported missing-rate Excel table
    col_width -- display width of the rendered table columns, default "200px"
    na_threshold -- missing-rate threshold, default 0.7: variables whose
        missing rate exceeds this are dropped and excluded from the
        distribution (quantile) summaries
    is_show_all -- which variables to list in the missing-rate table:
        0 shows only variables that have missing values, -1 shows all
    """
    # data loading (the actual CSV read below is commented out; df is passed in)
    # replace_name -- table-name prefix to strip from column names, since Hue
    # exports prepend the table name to every column

    #     data=pd.read_csv(df_name)

    # strip the table name out of the column names
    #     col_name=pd.DataFrame(pd.Series(data.columns.values),columns=["var_name"])
    #     excel_name=df_name.replace('.csv','')+'.xlsx'
    #     col_name.to_excel(excel_name)
    #     col_name=col_name["var_name"].map(lambda x:x.replace(replace_name,''))
    #     data.columns=list(col_name)

    # Work on a copy so the caller's DataFrame is never mutated.
    data = df.copy()
    # (red ANSI label: "data volume and feature dtypes")
    print("\033[1;31m数据量级和特征数据类型\n \033[0m")
    print(data.info())
    #     print(data.columns)

    # (red ANSI label: "preview of a few rows")
    print("\033[1;31m少量数据查看\n \033[0m")
    # NOTE(review): "high" is not a DataTables columnDefs option; "height" was
    # probably intended -- confirm against the itables/DataTables docs.
    show(data.head(),
         columnDefs=[{
             "width": col_width,
             "high": "80px",
             "targets": "_all"
         }])

    # counts of categorical vs. continuous variables
    character_feature, count = character_VarFindFunc(data)
    print("\033[1;31m离散型变量个数\n \033[0m", count)
    print("\033[1;31m连续型变量个数\n \033[0m", len(data.columns) - count)

    # 1.2.1 missing-value statistics
    feature_na, na_feature_num = findNaFunc(data, is_show_all=is_show_all)

    # feature_na=feature_na.rename(columns={"queshi_num":"缺失值数量","na_rate":"缺失率"})
    print("\033[1;31m有缺失值的变量个数:\n \033[0m", na_feature_num)
    print("\033[1;31m各变量缺失率展示\n \033[0m")
    # Sort by missing count, descending, so the worst columns are listed first.
    feature_na_show = feature_na.sort_values(by='queshi_num',
                                             axis=0,
                                             ascending=False)
    feature_na_show = feature_na_show.reset_index(drop=False)

    # export per-column missing rates to an Excel file for offline review
    excel_name = df_name + '.xlsx'
    feature_na_show.to_excel(excel_name)

    show(feature_na_show,
         columnDefs=[{
             "width": col_width,
             "high": "80px",
             "targets": "_all"
         }])

    # only features with missing rate below the threshold get distribution views
    delete_feature = feature_na[
        feature_na["na_rate"] >
        na_threshold].var_name  # raise na_threshold if you prefer to keep high-missing features
    data_delete = data.drop(delete_feature, axis=1)
    na_count = len(delete_feature)

    print("\033[1;31m去掉缺失率大于阈值的变量\n \033[0m", na_count)
    print("\033[1;31m去掉缺失率较高连续型变量分布\n \033[0m")
    show(data_delete.describe(),
         columnDefs=[{
             "width": col_width,
             "high": "80px",
             "targets": "_all"
         }])
    print("\033[1;31m去掉缺失率较高离散型变量分布\n \033[0m")
    # NOTE(review): describe(include=['object']) raises ValueError if no
    # object-dtype columns remain after the drop -- confirm inputs always
    # contain at least one categorical column.
    show(data_delete.describe(include=['object']),
         columnDefs=[{
             "width": col_width,
             "high": "80px",
             "targets": "_all"
         }])