Ejemplo n.º 1
0
def read_data():
    read_rows = int(sys.argv[1])
    data_pre_name = 'disk_sample_smart_log_'
    df = read_data_csv(data_pre_name + '201707' + '_sample_pn_v2', read_rows)
    print '201707', df.shape
    for day in range(201708, 201713) + range(201801, 201808):
        df_temp = read_data_csv(data_pre_name + str(day) + '_sample_pn_v2',
                                read_rows)
        df = pd.concat([df, df_temp])
        print day, df.shape
    df_test = read_data_csv('disk_sample_smart_log_test_b', -1)
    return df, df_test
Ejemplo n.º 2
0
def take_sample():
    read_rows = int(sys.argv[1])
    data_pre_name = 'disk_sample_smart_log_'
    df_label = pd.read_csv(
        data_path + 'disk_sample_fault_tag.csv',
        names=['manufacturer', 'model', 'serial_number', 'fault_time', 'tag'])
    df_label = df_label.groupby(
        ['manufacturer', 'model',
         'serial_number'])['fault_time'].min().to_frame().reset_index()
    print df_label.head(3)
    for col in ['manufacturer', 'model', 'serial_number']:
        df_label[col] = df_label[col].astype(str)
    for day in range(201707, 201713) + range(201801, 201808):
        # if day <= 201806: continue
        df = read_data_csv(data_pre_name + str(day), read_rows)
        print 'raw : ', len(df)
        cnt = df.shape
        for col in ['manufacturer', 'model', 'serial_number']:
            df[col] = df[col].astype(str)
        df = df.merge(df_label,
                      on=['manufacturer', 'model', 'serial_number'],
                      how='left')
        df_positive = df[~df['fault_time'].isna()]
        df = df[df['fault_time'].isna()]
        df = df.sample(n=len(df_positive) * 10, random_state=2020)
        df = pd.concat([df_positive, df])
        df.to_csv(data_path + data_pre_name + str(day) + '_sample_pn_v3.csv',
                  index=False)
        print 'sample : ', day, cnt, len(df_positive), len(df)
        del df, df_positive
        gc.collect()
Ejemplo n.º 3
0
def take_sample():
    read_rows = int(sys.argv[1])
    data_pre_name = 'disk_sample_smart_log_'
    df_label = pd.read_csv(
        data_path + 'disk_sample_fault_tag.csv',
        names=['manufacturer', 'model', 'serial_number', 'fault_time', 'tag'])
    df_label = df_label.drop_duplicates()
    for col in ['manufacturer', 'model', 'serial_number']:
        df_label[col] = df_label[col].astype(str)
    print df_label.head(3)
    for day in range(201707, 201713) + range(201801, 201808):
        if day < 201807: continue
        df = read_data_csv(data_pre_name + str(day), read_rows)
        cnt = df.shape
        for col in ['manufacturer', 'model', 'serial_number']:
            df[col] = df[col].astype(str)
        df = df.merge(df_label,
                      on=['manufacturer', 'model', 'serial_number'],
                      how='left')
        print 'debug1'
        df_positive = df[~df['tag'].isna()]
        print 'debug2', len(df_positive)
        df = df.sample(frac=0.1, random_state=2020)
        df_negative = df[df['tag'].isna()]
        print 'debug3'
        df_negative = df_negative.sample(n=len(df_positive) * 5,
                                         random_state=2020)
        df = pd.concat([df_positive, df_negative])
        df.to_csv(data_path + data_pre_name + str(day) + '_sample_pn_copy.csv',
                  index=False)
        print day, cnt, len(df_positive), len(df)
    return df
Ejemplo n.º 4
0
def write(pred, time_name):
    print 'write', '-' * 100
    print 'pred_mean = ', np.mean(pred)
    print 'pred_max = ', np.max(pred)
    print 'pred_min = ', np.min(pred)
    print 'pred_median = ', np.median(pred)
    print 'pred_st = ', np.std(pred)
    print 'one = ', np.sum(pred == 1)
    print 'zero = ', np.sum(pred == 0)
    df_sub = read_data_csv(r'\sample_submission')
    df_sub['isFraud'] = list(pred)
    df_sub.to_csv(data_path + '\sub_ratio_' + time_name + '.csv', index=False)
Ejemplo n.º 5
0
def get_weight_label_data(read_rows=None):
    """Load the weighted train/test frames and split out weight and label.

    Args:
        read_rows: Row-count argument forwarded to ``read_data_csv``.  When
            omitted (``None``) it is read from ``int(sys.argv[1])`` at call
            time.  Resolving it lazily keeps the module importable when no
            command-line argument is present; a default written in the
            signature would be evaluated once, at definition time.

    Returns:
        A ``(df_train, df_test, weight, label)`` tuple, where ``weight`` and
        ``label`` are the ``'weight'`` and ``'label'`` columns of
        ``df_train``.
    """
    if read_rows is None:
        read_rows = int(sys.argv[1])
    df_train = read_data_csv('df_train_get_weight', read_rows)
    df_test = read_data_csv('df_test_get_weight', read_rows)
    weight = df_train['weight']
    label = df_train['label']
    return df_train, df_test, weight, label
Ejemplo n.º 6
0
def BUILD96_output(preds):
    """Write ``preds`` as the ``isFraud`` column of a submission CSV.

    The sample-submission frame is loaded through ``read_data_csv`` with the
    row-count argument taken from ``sys.argv[1]``, and the result is saved
    to ``<data_path>sub_xgb_96_transactionID.csv`` without the index.

    Args:
        preds: Predictions to store in the ``isFraud`` column.
    """
    sample_submission = read_data_csv(r'\sample_submission', int(sys.argv[1]))
    # Bracket assignment always targets a column; attribute assignment would
    # only set a plain Python attribute if 'isFraud' were missing.
    sample_submission['isFraud'] = preds
    sample_submission.to_csv(data_path + 'sub_xgb_96_transactionID.csv',
                             index=False)