Ejemplo n.º 1
0
def biao1(df):
    """Count positive cases per HPV type and export the table to Excel.

    Every value of the '结果' column is split on ',' (the pieces are also
    stored on *df* as a new '结果2' column).  The pieces of all rows whose
    result is not '阴性' are counted per label, two percentage columns are
    derived, the counts are left-joined with the lookup sheet read from
    ``or_path('MAP')`` and the result is written to
    ``or_path('HPV各型的阳性例数')``.

    Args:
        df: DataFrame with a string column '结果'.  It is modified in place
            (column '结果2' is added).

    Returns:
        None.
    """
    df['结果2'] = df['结果'].apply(lambda x: x.split(','))

    # Walk the column values directly so the function does not depend on
    # *df* carrying a 0..n-1 index, and extend one list instead of building
    # a new list per row.
    labels = []
    for result, parts in zip(df['结果'], df['结果2']):
        if result != '阴性':
            # Strip padding so that ' HPV16+' and 'HPV16+' are counted as
            # the same type; the former bare ``replace(' ')`` call had no
            # replacement value and never removed any whitespace.
            labels.extend(part.strip() for part in parts)
    # A trailing comma yields an empty piece, which is not an HPV type.
    labels = [label for label in labels if label]

    # Naming the column up front keeps this working when there are no
    # positive rows at all (an empty frame has no column 0 to group by).
    s1 = pd.DataFrame({'HPV类型': labels})
    s1 = s1.groupby('HPV类型').size().reset_index(name='阳性例数')

    # Share of each type among all positive labels, in percent.
    s1['构成比'] = (s1['阳性例数'] / s1['阳性例数'].sum() * 100).round(2)
    # Positive labels of each type relative to the number of input rows.
    s1['阳性率'] = (s1['阳性例数'] / df.shape[0] * 100).round(2)

    df2 = pd.read_excel(or_path('MAP'))

    report = pd.merge(s1, df2, on='HPV类型', how='left')

    # Types missing from the lookup sheet default to '高危型'.
    report['类型'] = report['类型'].fillna('高危型')

    report.to_excel(or_path('HPV各型的阳性例数'))
Ejemplo n.º 2
0
def biao2(total=714):
    """Summarise the module-level ``df`` per age band and export it.

    Adds an age-band column '年龄2' to the global ``df``, counts rows per
    band overall and for four subsets (TCT in L/H/A, non-null 'HPV阳性',
    '感染层' equal to '单一感染' / '多重感染'), appends a '汇总' totals
    row, renders every cell as ``count(percent%)`` and writes the table to
    ``or_path('各个年龄阶段的总数以及总计')``; the table is also printed.

    Args:
        total: Denominator for the percentage shown in the '个数' column.
            Defaults to 714, the value that used to be hard-coded.

    Returns:
        None.
    """
    def age(x):
        # Ages above 90 (and NaN) fall through and return None, so those
        # rows are left out of the grouped counts.
        if x <= 10:
            return '0-10'
        elif x <= 30:
            return '10-30'
        elif x <= 50:
            return '30-50'
        elif x <= 70:
            return '50-70'
        elif x <= 90:
            return '70-90'

    df['年龄2'] = df['年龄'].apply(age)

    s1 = df.groupby('年龄2').size().to_frame('个数')

    # One (column name, row filter) pair per subset that is counted.
    subsets = [
        ('TCT阳性', df['TCT'].isin(['L', 'H', 'A'])),
        ('HPV阳性', df['HPV阳性'].notnull()),
        ('单一感染', df['感染层'] == '单一感染'),
        ('多重感染', df['感染层'] == '多重感染'),
    ]
    for col, mask in subsets:
        s1[col] = df[mask].groupby('年龄2').size()

    # Age bands without any row in a subset come back as NaN.
    s1.fillna(0, inplace=True)

    s1.loc['汇总'] = s1.sum()

    def with_share(col, denominator):
        """Return column *col* rendered as 'count(share%)' strings."""
        counts = s1[col].apply(lambda v: str(int(v)))
        shares = (s1[col] / denominator).apply(
            lambda r: '%.2f%%' % (r * 100))
        return counts + '(' + shares + ')'

    # The subset columns must be rendered first: they divide by the still
    # numeric '个数' column.
    for col, _ in subsets:
        s1[col] = with_share(col, s1['个数'])

    s1['个数'] = with_share('个数', total)

    s1.reset_index(inplace=True)
    s1.rename(columns={'年龄2': '年龄(岁)'}, inplace=True)

    s1.to_excel(or_path('各个年龄阶段的总数以及总计'), index=False)
    print(s1)
Ejemplo n.º 3
0
print(itemsets)
print(rules)

# NOTE(review): this exit() ends the script here, so none of the code below
# is ever executed -- confirm whether it is a debugging leftover.
exit()

#
# # # Example from the official site
# transactions = [['eggs', 'bacon', 'soup'],
#                 ['eggs', 'bacon', 'apple'],
#                 ['soup', 'bacon', 'banana']]
#
# # print(type(transactions))
# # exit()
# itemsets, rules = apriori(transactions, min_support=0.5, min_confidence=1)
# print(itemsets)
# print(rules)
# exit()

# Raw string: '\w' and '\张' are not valid escape sequences in a normal
# string literal (SyntaxWarning on recent Python); the value is unchanged.
df = pd.read_excel(or_path(r'\wx\张艺谋合作影视明星 - 副本'))

# Split 'name' on ' / ' and drop the first piece of each row.
df['name2'] = df['name'].apply(lambda x: x.split(' / ')[1:])

# One list of names per row: the transactions handed to apriori.
data = df['name2'].tolist()
# print(type(data))
# print(data)
# exit()

itemsets, rules = apriori(data, min_support=0.5, min_confidence=1)
print(itemsets)
print(rules)
Ejemplo n.º 4
0
today = datetime.date.today()
# Whole days elapsed since 2018-11-14; read from the Timedelta directly
# instead of parsing its string representation.
day_cut = (pd.to_datetime(today) - pd.to_datetime('2018/11/14')).days

# Fetch one frame per day, oldest first, and concatenate once at the end
# (DataFrame.append copied the whole frame on every iteration and no longer
# exists in pandas 2.x).
frames = []
for x in range(day_cut, 0, -1):
    day = today - datetime.timedelta(days=x)
    df = date(lc_url(day))
    df['time'] = day
    frames.append(df)
    print('{}号抓取完毕!'.format(day))

df_all = pd.concat(frames) if frames else pd.DataFrame()

# reset_index() moves the per-day indexes into an 'index' column, which the
# column selection below then discards.
df_all.reset_index(inplace=True)
df_all = df_all[['time', 'counts', 'day1', 'day3', 'day7', 'day14', 'day30']]

# 测试专用
# df_all.to_excel(or_path('TTT'))
# df_all = pd.read_excel(or_path('TTT'))


def more(form, col):
    """Rewrite ``form[col]`` in place as 'count(percent%)' strings.

    The percentage is the column value divided by the 'counts' column of
    the same row, formatted without decimals.
    """
    def as_count(value):
        return str(int(value))

    def as_percent(ratio):
        return '%.0f%%' % (ratio * 100)

    count_text = form[col].apply(as_count)
    percent_text = (form[col] / form['counts']).apply(as_percent)
    form[col] = count_text + '(' + percent_text + ')'


# Every column after the first two is rewritten as 'count(percent%)'
# before the table is exported.
retention_cols = list(df_all.columns[2:])
for col in retention_cols:
    more(df_all, col)

df_all.to_excel(or_path('趣头条每次用户留存'))
Ejemplo n.º 5
0
    predict_y = gridsearch.predict(test_x)
    print("#准确率: %0.4lf" % accuracy_score(test_y, predict_y))
    response['predict_y'] = predict_y
    return response


from build.Func import or_path

# Collect the output of every classifier side by side, one column each.
df_all = pd.DataFrame()
for model, model_name, model_param_grid in zip(classifiers, classifier_names,
                                               classifier_param_grid):
    # print(model_name, '\n', '-' * 50)
    # print(model_param_grid, '\n', '-' * 50)
    # print(model)

    # Pipeline: standardise the features, then apply the classifier
    pipeline = Pipeline([('scaler', StandardScaler()), (model_name, model)])

    print('\n{}模型输出结果:'.format(model_name))

    # Parameter tuning
    result = GridSearchCV_work(pipeline, model_param_grid, score='accuracy')

    # NOTE(review): assumes GridSearchCV_work returns a single-column
    # DataFrame -- confirm against its definition.
    result.columns = [model_name]

    # Append this model's column to the right of the combined table
    df_all = pd.concat([df_all, result], axis=1)

    print('-' * 50)

df_all.to_excel(or_path('各算法预测结果'))
# -*- coding: utf-8 -*-
# author:Super.Shen

import pandas as pd
from build.Func import or_path, gb

pd.set_option('expand_frame_repr', False)
pd.set_option('display.max_rows', 1000)
import warnings
from Func import append_excel
warnings.filterwarnings('ignore')

df = pd.read_excel(or_path('晶\\1107'))

zhuce = pd.read_excel(or_path('晶\\注册'))

# Reduce the registration sheet to a lookup of player ids flagged 'new'.
zhuce['flag'] = 'new'
zhuce = zhuce[['用户ID', 'flag']].rename(columns={'用户ID': 'player_id'})
# A player id listed twice in the lookup would duplicate that player's rows
# in the left merge below and inflate the counts.
zhuce = zhuce.drop_duplicates()

df = pd.merge(left=df, right=zhuce, on='player_id', how='left')

# Rows without a registration match are existing players.  Assign the
# result back: an in-place fillna on the selected column is chained
# assignment and does not update df under pandas copy-on-write.
df['flag'] = df['flag'].fillna('old')

df = df[['player_id', 'flag']]

print(df.groupby(['flag']).size())

# exit()

# print(df.groupby('flag').size())
Ejemplo n.º 7
0
def _reshape_sheet(df, name, pingtai):
    """Return *df* reduced to the report columns used for sheet *name*."""
    if name == '宝石明细':
        # NOTE(review): the original code had a second, unreachable branch
        # for this sheet that deleted the '总和' column; it never ran, so
        # only the row drop is kept.  Confirm which behaviour is wanted.
        return df.drop([2, 3])

    if name == '新注册其次占比':
        return df[['日期', '新用户量', '次日再消费用户量']]

    if name == '金币产出':
        df['其他'] = df['领取邮件'] + df['系统赠送']
        exchange_col = '兑换红宝石' if pingtai == 'qi' else '兑换红包券'
        return df[['时间', '用户充值', exchange_col, '兑换鱼雷', '其他']]

    if name == '宝石分类':
        try:
            df['其他'] = df['充值礼包'] + df['分享抽奖'] + df['成就任务'] + df['新手礼包']
            df['其他2'] = df['幸运抽奖'] + df['欢乐夺宝'] + df['购买物品']
            return df[['时间', '游戏产出', '其他', '玩家兑换红包', '玩家兑换话费', '玩家兑换金币', '其他2']]
        except KeyError:
            # Workbooks missing some of the columns above fall back to a
            # platform-specific column set.
            df['其他2'] = df['幸运抽奖'] + df['欢乐夺宝'] + df['购买物品']
            if pingtai == 'qi':
                df['其他'] = df['充值礼包'] + df['成就任务'] + df['新手礼包']
                return df[['时间', '游戏产出', '其他', '玩家兑换红包', '玩家兑换话费', '玩家兑换金币', '其他2']]
            df['其他'] = df['充值礼包'] + df['分享抽奖'] + df['新手礼包']
            return df[['时间', '游戏产出', '其他', '玩家兑换红包', '玩家兑换金币', '其他2']]

    if name == '税收' and pingtai == 'qi':
        df['鱼雷场'] = df['鱼雷初级场'] + df['鱼雷中级场'] + df['鱼雷高级场']
        return df[['日期', '红包场', '鱼雷场', '猜猜乐']]

    return df


def old_app(pingtai='qi'):
    """Merge the per-file report sheets of one platform into one workbook.

    Every workbook found under the platform's data directory is read once
    per sheet name; each sheet is reduced by ``_reshape_sheet``, the pieces
    are stacked, sorted by their first column, de-duplicated (last
    occurrence wins) and written to the weekly report workbook.

    Args:
        pingtai: 'qi' selects the 奇奇乐 directory and report name; any
            other value selects 浪仔.

    Returns:
        None.  As a side effect the working directory is changed to the
        data directory.
    """
    # Raw strings: '\A', '\D' etc. are invalid escape sequences in normal
    # literals; the resulting paths are unchanged.
    if pingtai == 'qi':
        file_path = r'C:\Users\Administrator\Desktop\图表数据\奇奇乐'
        report_path = or_path('奇奇乐周报-报表')
    else:
        file_path = r'C:\Users\Administrator\Desktop\图表数据\浪仔'
        report_path = or_path('浪仔周报-报表')
    list1 = ['宝石分类', '宝石明细', '税收']

    os.chdir(file_path)

    # All sheets:
    # list1 = ['渠道', '充值支付类型占比', '新注册其次占比', '金币产出', '金币消耗', '金币系统赠送', '宝石分类', '宝石明细', '奖品发放', '税收', '我要赚钱', '回收比']

    # The context manager saves and closes the workbook (ExcelWriter.save()
    # no longer exists in pandas 2.x).
    with pd.ExcelWriter(report_path) as writer:
        for name in list1:
            frames = []
            for root, _dirs, files in os.walk(file_path):
                for file in files:
                    # Join with the walked directory so workbooks inside
                    # sub-directories resolve too.  The former index= and
                    # encoding= arguments are not read_excel parameters.
                    df = pd.read_excel(os.path.join(root, file), sheet_name=name)
                    frames.append(_reshape_sheet(df, name, pingtai))

            df_all = pd.concat(frames) if frames else pd.DataFrame()
            if len(df_all.columns):
                df_all.sort_values(df_all.columns[0], inplace=True)
            df_all.drop_duplicates(keep='last', inplace=True)
            df_all.to_excel(writer, sheet_name=name, index=False)
Ejemplo n.º 8
0
                   u'最大每5秒回蓝', u'初始每5秒回蓝', u'最大攻速', u'攻击范围']

# Work on an independent copy: assigning columns on a slice of df triggers
# SettingWithCopy warnings and is ambiguous about whether df changes.
data = df[features_remain].copy()
# '最大攻速' holds percentage strings; convert them to fractions.
data[u'最大攻速'] = data[u'最大攻速'].apply(lambda x: float(x.strip('%')) / 100)
# Encode the attack range as 1 ('远程') / 0 ('近战').
data[u'攻击范围'] = data[u'攻击范围'].map({'远程': 1, '近战': 0})



# Z-score normalisation so every feature has mean 0 and variance 1
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
data = ss.fit_transform(data)

# Build the GMM clustering model
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=30, covariance_type='full')
gmm.fit(data)

# Assign every row to a component
prediction = gmm.predict(data)
# print(prediction)
# exit()

# Write the grouping result to an Excel file

df.insert(0, '分组', prediction)
df.sort_values('分组', ascending=True, inplace=True)
df.to_excel(or_path('英雄分类结果'), index=False)

print(df.groupby('分组').apply(lambda x: list(x['英雄'])))
Ejemplo n.º 9
0
# Build the Gaussian mixture model with `no` components
gmm = GaussianMixture(n_components=no, covariance_type='full')
gmm.fit(train_x)

# Predict the component of every training row
predict_x = gmm.predict(train_x)

# Put the predictions in front of the original data as a new column.
# NOTE(review): concat(axis=1) aligns on index, so this assumes `data` has
# a 0..n-1 index -- confirm.
result = pd.concat((pd.DataFrame(predict_x), data), axis=1)
result.rename(columns={0: u'EM聚类'}, inplace=True)

# -------------------------------------------------------------------

# k-Means algorithm
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=no)
kmeans.fit(train_x)
predict_y = kmeans.predict(train_x)

# Merge the k-Means labels in front of the previous result
result = pd.concat((pd.DataFrame(predict_y), result), axis=1)
result.rename(columns={0: u'K-Means聚类'}, inplace=True)

# Inspect the result
print(result.head())
print('-' * 50)

# Export to Excel at the path built by or_path
result.to_excel(or_path('聚类结果'))
Ejemplo n.º 10
0
# -*- coding: utf-8 -*-
# author:Super

import pandas as pd
from build.Func import or_path
import numpy as np

pd.set_option('expand_frame_repr', False)
pd.set_option('display.max_rows', 1000)

df = pd.read_excel(or_path('data'))

# Treat both the explicit negative marker and missing cells as "no finding".
df.replace('阴性(-)', '', inplace=True)
# np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0.
df.replace(np.nan, '', inplace=True)

# Every column after the first one is a test column.  For each row, collect
# the first five characters of the names of the non-empty columns; rows
# without any become '阴性'.  Building one list and creating the frame once
# avoids growing `result` cell by cell.
test_columns = list(df.columns[1:])
labels = []
for values in df[test_columns].itertuples(index=False, name=None):
    positives = [
        name[:5].strip() + '+'
        for name, value in zip(test_columns, values)
        if value != ''
    ]
    labels.append(", ".join(positives) if positives else '阴性')

result = pd.DataFrame({'结果': labels})
print(result.groupby('结果').size())

result.to_excel(or_path('ttt'))
Ejemplo n.º 11
0
# Store the predictions in df
df['price_KNN'] = pred_y

# Use the SVM regression model
from sklearn import svm

model = svm.SVR()
model.fit(train_x, train_y)
pred_y = model.predict(test_x)

# Mean squared error of the SVM predictions on the test targets
mse = mean_squared_error(test_y, pred_y)
print("SVM 均方误差 = ", round(mse, 2))

# Store the predictions in df
df['price_SVM'] = pred_y

# Export the data to Excel at the path built by or_path
df.to_excel(or_path('各模型回归预测'))

# Plot each of the last four columns against the 'price' column.
df = pd.read_excel(or_path('各模型回归预测'))
print(df.head())

for col in df.columns[-4:]:
    fig = plt.figure(figsize=(13, 7))
    df['price'].plot(color='black')
    df[col].plot(color='lime', linestyle='-.')
    plt.legend(loc='upper right')
    # Raw string: '\A' and '\D' are invalid escape sequences in a normal
    # literal; the resulting path is unchanged.
    plt.savefig(r'C:\Users\Administrator\Desktop\{}'.format(col))
    # Release the figure once saved so figures do not pile up in memory.
    plt.close(fig)
def init():
    """Scrape result pages 1-9 of the job search and save them to Excel.

    For every page, 30 list items are read: company, job title and salary
    through PyQuery selectors, the remaining fields through XPath text
    nodes.  The pages are stacked and written to
    ``or_path('boss直聘数据分析岗位')``.

    Uses the module-level ``list1`` (proxy addresses), ``list2`` (user
    agents) and ``url_b`` (prefix for the detail links).

    Raises:
        IndexError: if any XPath field other than '人数' is missing for an
            item (only '人数' is tolerated as absent).
    """
    pages = []
    for count in range(1, 10):
        url = 'https://www.zhipin.com/c101210100/?query=%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B8%88&page={}'.format(
            count)

        # Index of the list container used in the XPath expressions below:
        # 3 on the first page, 2 on all later pages.
        div = 3 if count == 1 else 2

        ip = random.choice(list1)
        ua = random.choice(list2)
        # NOTE(review): only an 'http' proxy is configured while the URL is
        # https, so requests will not route this call through the proxy --
        # confirm whether that is intended.
        psoxy = {'http': 'http://' + ip}
        headers = {
            'accept': "application/json, text/javascript, */*; q=0.01",
            'accept-encoding': "gzip, deflate, br",
            'accept-language': "zh-CN,zh;q=0.9,en;q=0.8",
            'content-type': "application/x-www-form-urlencoded; charset=UTF-8",
            'cookie': "JSESSIONID="
            "; __c=1530137184; sid=sem_pz_bdpc_dasou_title; __g=sem_pz_bdpc_dasou_title; __l=r=https%3A%2F%2Fwww.zhipin.com%2Fgongsi%2F5189f3fadb73e42f1HN40t8~.html&l=%2Fwww.zhipin.com%2Fgongsir%2F5189f3fadb73e42f1HN40t8~.html%3Fka%3Dcompany-jobs&g=%2Fwww.zhipin.com%2F%3Fsid%3Dsem_pz_bdpc_dasou_title; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1531150234,1531231870,1531573701,1531741316; lastCity=101010100; toUrl=https%3A%2F%2Fwww.zhipin.com%2Fjob_detail%2F%3Fquery%3Dpython%26scity%3D101010100; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1531743361; __a=26651524.1530136298.1530136298.1530137184.286.2.285.199",
            'origin': "https://www.zhipin.com",
            'referer':
            "https://www.zhipin.com/job_detail/?query=python&scity=101010100",
            'user-agent': ua
        }

        html = requests.get(url, headers=headers, proxies=psoxy).text

        tree = etree.HTML(html)

        res = PyQuery(html)

        df = pd.DataFrame()
        for i in range(30):
            # Shared prefixes of the selectors / XPaths for list item i + 1.
            item_css = 'li:nth-child({}) > div > '.format(i + 1)
            item_xpath = '//*[@id="main"]/div/div[{}]/ul/li[{}]/div/'.format(
                div, i + 1)

            df.loc[i, '公司'] = res(
                item_css + 'div.info-company > div > h3 > a').text()
            df.loc[i, '职位'] = res(
                item_css + 'div.info-primary > h3 > a > div.job-title').text()
            df.loc[i, '薪水'] = res(
                item_css + 'div.info-primary > h3 > a > span').text()

            df.loc[i, '地区'] = tree.xpath(item_xpath + 'div[1]/p/text()[1]')[0]
            df.loc[i, '经验要求'] = tree.xpath(item_xpath + 'div[1]/p/text()[2]')[0]
            df.loc[i, '学历要求'] = tree.xpath(item_xpath + 'div[1]/p/text()[3]')[0]
            df.loc[i, '行业'] = tree.xpath(item_xpath + 'div[2]/div/p/text()[1]')[0]
            df.loc[i, '融资'] = tree.xpath(item_xpath + 'div[2]/div/p/text()[2]')[0]
            try:
                df.loc[i, '人数'] = tree.xpath(item_xpath + 'div[2]/div/p/text()[3]')[0]
            except IndexError:
                # The third text node is absent for some companies.
                print('\n{} - 该公司有数据缺失!\n'.format(df.loc[i, '公司']))

            df.loc[i, 'url'] = url_b + res(
                item_css + 'div.info-primary > h3 > a').attr.href

        # Pause between page requests.
        time.sleep(5)

        pages.append(df)
        print('第{}页抓取完毕!……'.format(count))

    # Concatenate once instead of DataFrame.append per page (removed in
    # pandas 2.x).
    df_all = pd.concat(pages, ignore_index=True)
    df_all.to_excel(or_path('boss直聘数据分析岗位'))


import os

# Raw string: '\A' and '\D' are invalid escape sequences in a normal
# literal; the resulting path is unchanged.
file = r'C:\Users\Administrator\Desktop\boss直聘数据分析岗位.xlsx'
# Only scrape when the workbook has not been produced yet.
if os.path.exists(file):
    print('\n{} - 已存在!\n'.format('岗位数据已存在'))
else:
    init()

# Load the scraped data
df = pd.read_excel(or_path('boss直聘数据分析岗位'))

# The same posting can appear on several pages; keep one row per url.
df.drop_duplicates('url', inplace=True)


def content():
    df2 = pd.DataFrame()
    for i in range(df.shape[0]):

        try:
            ip = random.choice(list1)
            print(ip)
            ua = random.choice(list2)
            psoxy = {'http': 'http://' + ip}
            headers = {
                'accept': "application/json, text/javascript, */*; q=0.01",
Ejemplo n.º 14
0
# -*- coding: utf-8 -*-
# author:Super

import pandas as pd
import numpy as np

pd.set_option('expand_frame_repr', False)
pd.set_option('display.max_rows', 1000)
from build.Func import or_path, hash_data

df = pd.read_excel(or_path('HPV07原始数据'))


def biao1(df):
    df['结果2'] = df['结果'].apply(lambda x: x.split(','))

    list1 = []
    for x in range(df.shape[0]):
        if df.loc[x, '结果'] != '阴性':
            list1 = list1 + df.loc[x, '结果2']

    s1 = pd.DataFrame(list1)
    s1.replace(' ', inplace=True)

    s1 = pd.DataFrame(s1.groupby(0).size())
    s1.columns = ['阳性例数']
    s1.reset_index(inplace=True)

    s1.columns = ['HPV类型', '阳性例数']

    s1['构成比'] = round(s1['阳性例数'] / s1['阳性例数'].sum() * 100, 2)
Ejemplo n.º 15
0
def run3():
    """Build the three coin tables from the '奇奇乐' workbook.

    The first three columns of the workbook hold the production records
    ('时间', '原因', '数值') and the last three the consumption records
    ('时间2', '原因2', '数值2').  The function normalises the irregular
    '数值' cells, builds the system-gift pivot (df3), the consumption
    pivot (df2) and the production pivot (df), and writes the latter to
    ``or_path('奇奇乐-红宝石兑换')``.

    Uses the module-level ``bef_yesterday`` as the lower date bound for df3.

    Returns:
        Nothing at present: the function terminates the process through
        ``exit()`` right after the export (see the note there).
    """
    df = pd.read_excel(or_path('奇奇乐'))
    # Raw string: '\A', '\D' and '\m' are invalid escape sequences in a
    # normal literal; the resulting path is unchanged.
    df_map = pd.read_excel(r'C:\Users\Administrator\Desktop\map.xlsx')

    # Fix the irregular rows: a '数值' cell mentioning 鱼雷 or 红宝石 decides
    # the reason of that row (鱼雷 wins when both appear).  Masks replace
    # the former row loop, which required a 0..n-1 index.
    amount_text = [str(value) for value in df['数值']]
    has_torpedo = pd.Series(['鱼雷' in text for text in amount_text],
                            index=df.index, dtype=bool)
    has_ruby = pd.Series(['红宝石' in text for text in amount_text],
                         index=df.index, dtype=bool)
    df.loc[has_torpedo, '原因'] = '玩家兑换鱼雷'
    df.loc[has_ruby & ~has_torpedo, '原因'] = '玩家兑换红宝石'

    def change_col(x):
        # Marked cells carry the number in front of the first '('.
        if '鱼雷' in x or '红宝石' in x:
            return int(x.split('(')[0])
        return int(x)

    df['数值'] = [change_col(text) for text in amount_text]

    # ----------------- system-gift coins by reason -----------------

    # dropna() returns a new frame, so nothing is modified on a slice of df.
    df3 = df[df.columns[:3]].dropna(axis=0, how='any')
    df3 = pd.pivot_table(df3, values='数值', index='时间', columns='原因')
    df3 = df3[['每日登录抽奖', 'VIP奖励', '新手礼包', '成就任务', '分享抽奖']]
    df3.reset_index(inplace=True)
    df3 = df3[df3['时间'] >= pd.to_datetime('{}'.format(bef_yesterday))].copy()

    # Keep only the date part of the timestamp.
    df3['时间'] = df3['时间'].apply(lambda x: str(x)[:10])

    # ----------------- coin consumption summary -----------------

    # Coin consumption pivot
    df2 = df[df.columns[-3:]].dropna(axis=0, how='any')
    df2 = pd.pivot_table(df2, values='数值2', index='时间2', columns='原因2')
    del df2['单局结算']
    df2.reset_index(inplace=True)
    df2['时间2'] = df2['时间2'].apply(lambda x: str(x)[:10])

    # ----------------- coin production summary -----------------
    # Coin production pivot
    df = df[df.columns[:3]]
    df_map = df_map[['原因', 'jinbi']].dropna()

    # Attach the 'jinbi' category of every reason from the mapping sheet
    df = pd.merge(left=df, right=df_map, on='原因', how='left')

    df = df.groupby(['时间', 'jinbi'])['数值'].sum()

    df = pd.DataFrame(df)
    df.reset_index(inplace=True)

    df = pd.pivot_table(df, values='数值', index='时间', columns='jinbi')

    df.reset_index(inplace=True)

    df = df[['时间', '用户充值', '系统赠送', '兑换红宝石', '兑换鱼雷', '领取邮件']].copy()
    df['时间'] = df['时间'].apply(lambda x: str(x)[:10])

    df.to_excel(or_path('奇奇乐-红宝石兑换'))
    # print(df)
    # NOTE(review): this exit() terminates the process, so the print and
    # the return below are never reached -- looks like a debugging
    # leftover; confirm before removing it.
    exit()

    print('\n第三个表运行完毕……')

    return df, df2, df3
Ejemplo n.º 16
0
# Map the raw values to categories with the external helpers zl and sx.
df['游戏种类'] = df['游戏种类'].apply(zl)
df['变动属性'] = df['变动属性'].apply(sx)


# One single-row table per user; they are concatenated once after the loop
# (DataFrame.append copied the frame on every iteration and no longer
# exists in pandas 2.x).
rows = []
for x, y in df.groupby('用户ID'):

    y = pd.pivot_table(y, values='差值', index='游戏种类', columns='变动属性')
    y.fillna(0, inplace=True)

    # Weighted sum of the three attributes.
    # NOTE(review): the weights 2000 and 10000 look like conversion rates
    # into coins -- confirm.
    y['求和'] = y['红包券'] * 2000 + y['金币'] + y['鱼雷'] * 10000
    y['ID'] = x
    y.reset_index(inplace=True)
    # Turn the game types into columns: one row for this user.
    y = pd.pivot_table(y, values='求和', index='ID', columns='游戏种类')

    rows.append(y)

out = pd.concat(rows) if rows else pd.DataFrame()

del out['大厅']
out.fillna(0, inplace=True)
out['求和'] = out.apply(lambda x: x.sum(), axis=1)
out2 = out / 20000


out.to_excel(or_path('四个用户变动详情'))
out2.to_excel(or_path('四个用户变动详情2'))


# -*- coding: utf-8 -*-
# author:Super.Shen

import pandas as pd

pd.set_option('expand_frame_repr', False)
pd.set_option('display.max_rows', 1000)
import warnings

warnings.filterwarnings('ignore')

from build.database import url11, date, url77
from build.Func import or_path, gb

# # # Export the data: run both queries and save each result to Excel
date(url77).to_excel(or_path('注册充值用户2'))
date(url11).to_excel(or_path('变动日志2'))
# exit()

# # Load the new paying users and aggregate them with gb
# NOTE(review): gb presumably groups by '用户id' and sums '充值金额' -- confirm
df_reg = pd.read_excel(or_path('注册充值用户2'))
df_reg = gb(df_reg, '用户id', '充值金额')
df_reg.rename(columns={'用户id': '用户ID'}, inplace=True)

# Load the change log

df = pd.read_excel(or_path('变动日志2'))

# Analysis: parse the change time of every row into a timestamp
df['变动时间'] = df['变动时间'].apply(lambda x: pd.to_datetime(x))
Ejemplo n.º 18
0
#     print('{}号抓取完毕!'.format(day))
#
# df_all.

# # 筛选出属于CPA的变动日志
# df = pd.read_hdf('C:\\Users\Administrator\Desktop\\bdrz.h5', key='data')
# df_reg = pd.read_excel(or_path('新闻资讯注册'))
# df_reg['flag'] = 'new'
# df_reg = df_reg[['用户ID', 'flag']]
# df = pd.merge(left=df, right=df_reg, on='用户ID', how='left')
# df = df[df['flag'].notnull()]
# df.to_hdf('C:\\Users\Administrator\Desktop\\test.h5', key='data')
# print(df.shape[0])
# exit()

# Recharge records, aggregated per player by the external helper gb.
df_pay = pd.read_excel(or_path('新闻资讯充值'))
df_pay = gb(df_pay, 'player_id', 'amount')

# Raw string: '\A' and '\D' are invalid escape sequences in a normal
# literal; the resulting path is unchanged.
df = pd.read_hdf(r'C:\Users\Administrator\Desktop\test.h5', key='data')

# Number of log rows per (game type, user); the count lands in column 0.
df = pd.DataFrame(df.groupby(['游戏种类', '用户ID']).size())

df.reset_index(inplace=True)

# One row per user, one column per game type.
df2 = pd.pivot_table(df, values=0, index='用户ID', columns='游戏种类')

df2 = df2[['大厅', '红包场', '鱼雷场', '水果狂欢', '鱼乐场']]
df2.reset_index(inplace=True)
df2.rename(columns={'用户ID': 'player_id'}, inplace=True)

# Attach the recharge amounts; users without recharge keep NaN.
df2 = pd.merge(left=df2, right=df_pay, on='player_id', how='left')
# -*- coding: utf-8 -*-
# author:Super.Shen

import pandas as pd

pd.set_option('expand_frame_repr', False)
pd.set_option('display.max_rows', 1000)
import warnings

warnings.filterwarnings('ignore')

from build.database import url1, date, url7
from build.Func import or_path, gb

# # Export the data: run both queries and save each result to Excel
date(url7).to_excel(or_path('注册充值用户'))
date(url1).to_excel(or_path('变动日志'))

# # Load the new paying users and aggregate them with gb
# NOTE(review): gb presumably groups by '用户id' and sums '充值金额' -- confirm
df_reg = pd.read_excel(or_path('注册充值用户'))
df_reg = gb(df_reg, '用户id', '充值金额')
df_reg.rename(columns={'用户id': '用户ID'}, inplace=True)

# Load the change log

df = pd.read_excel(or_path('变动日志'))

# Analysis: parse the change time of every row into a timestamp
df['变动时间'] = df['变动时间'].apply(lambda x: pd.to_datetime(x))

# Order the log chronologically
df.sort_values('变动时间', inplace=True)
# Convert the categorical features to numeric codes
from sklearn import preprocessing
dvec = preprocessing.LabelEncoder()
# The single encoder is re-fitted for every column, so after the loop it
# only holds the mapping of the last column.
for col in features[1:]:
    train_features[col] = dvec.fit_transform(train_features[col])

# Scale every feature into the [0, 1] range
min_max_scaler = preprocessing.MinMaxScaler()
train_x = min_max_scaler.fit_transform(train_features)

print(train_features.head())
print('-'*50)

# k-Means algorithm
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=10)
kmeans.fit(train_x)
predict_y = kmeans.predict(train_x)

# Merge the cluster labels in front of the original data.
# NOTE(review): concat(axis=1) aligns on index, so this assumes `data` has
# a 0..n-1 index -- confirm.
result = pd.concat((pd.DataFrame(predict_y), data), axis=1)
result.rename(columns={0: u'聚类'}, inplace=True)

result.to_excel(or_path('聚类结果'))
print(result.head())
print('-'*50)

# Output the clustering: per cluster, the list of 'company-position' strings
df = pd.DataFrame(result.groupby('聚类').apply(lambda x: list(x['公司'] + '-' + x['职位'])))
df.to_excel(or_path('职位分类'))
print(df)