def generate_fold(data, NUM_FOLDS=3, TEST_SAMPLE_SIZE=50):
    all_folds = []
    for fold in range(NUM_FOLDS):
        class_folds = {"train": [], "test": []}
        for class_id, group in data.groupby("Class_ID"):
            num_samples = group.shape[0]
            test_mask = np.zeros(num_samples, dtype=bool)
            if TEST_SAMPLE_SIZE * NUM_FOLDS > num_samples:
                # Not enough samples for disjoint folds: wrap around the group.
                start = fold * TEST_SAMPLE_SIZE
                end = start + TEST_SAMPLE_SIZE
                ix = [i % num_samples for i in range(start, end)]
            else:
                # Enough samples: take a contiguous, disjoint slice per fold.
                class_fold_size = num_samples // NUM_FOLDS
                start = fold * class_fold_size
                end = start + class_fold_size
                ix = range(start, end)
            test_mask[ix] = True
            try:
                class_folds["test"].append(group[test_mask].sample(
                    n=TEST_SAMPLE_SIZE, random_state=0))
            except ValueError:
                # Fewer than TEST_SAMPLE_SIZE rows ended up in the test mask.
                logging.warning('fold error for class %s', class_id)
            class_folds["train"].append(group[~test_mask])
        class_folds["test"] = pd.concat(class_folds["test"])
        class_folds["train"] = pd.concat(class_folds["train"])
        all_folds.append(class_folds)
    return all_folds
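# A minimal usage sketch for generate_fold (an assumption, not part of the
# original code): the input is assumed to be a DataFrame with a Class_ID
# column, and 'samples.csv' is a hypothetical file name.
import logging
import numpy as np
import pandas as pd

data = pd.read_csv('samples.csv')  # must contain a Class_ID column
folds = generate_fold(data, NUM_FOLDS=3, TEST_SAMPLE_SIZE=50)
for k, fold in enumerate(folds):
    print(k, len(fold['train']), len(fold['test']))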
def get_sequences(data):
    # Build one chronologically ordered movie-id sequence per user.
    grouped = data.groupby('userId')
    res = []
    users = []
    for user_id, group in grouped:
        seq = group.sort_values('timestamp')['movieId'].values
        res.append(list(seq.astype(str)))
        users.append(user_id)
    return np.array(users), res
def calc_mean_ndcg_wrmf(model, data, k):
    # Mean NDCG@k over users, ranking each user's rated movies with WRMF.
    grouped = data.groupby('userId')
    res = []
    for user_id, group in grouped:
        ranked = WRMFEmbedded.rank_items(model, user_id, group['movieId'].values)
        # Assumes rank_items returns scores in the same order as the movieIds
        # passed in (compare the explicit re-sort in calc_mean_ndcg_als below).
        y_pred = np.array([x[1] for x in ranked])
        y_true = group['rating'].values
        ndcg = calc_ndcg(y_true, y_pred, k)
        res.append(ndcg)
    return np.mean(res)
def calc_mean_ndcg_als(model, csr_train, data, k):
    # Mean NDCG@k over users for an implicit-ALS model.
    grouped = data.groupby('userId')
    res = []
    for user_id, group in grouped:
        ranked = model.rank_items(user_id, csr_train, group['movieId'].values)
        # Sort scores by movieId so they line up with the movieId-sorted ratings.
        ranked.sort(key=lambda x: x[0])
        y_pred = np.array([x[1] for x in ranked])
        y_true = group.sort_values('movieId')['rating'].values
        ndcg = calc_ndcg(y_true, y_pred, k)
        res.append(ndcg)
    return np.mean(res)
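# calc_ndcg is called above but not defined in this section. A minimal sketch
# of a compatible implementation (an assumption about its signature: true
# relevances, predicted scores, cutoff k):
import numpy as np

def calc_ndcg(y_true, y_pred, k):
    # Rank items by predicted score, apply a logarithmic discount to the true
    # relevances, and normalise by the ideal (perfectly ordered) DCG.
    order = np.argsort(y_pred)[::-1][:k]
    gains = y_true[order]
    dcg = np.sum(gains / np.log2(np.arange(2, len(gains) + 2)))
    ideal = np.sort(y_true)[::-1][:k]
    idcg = np.sum(ideal / np.log2(np.arange(2, len(ideal) + 2)))
    return dcg / idcg if idcg > 0 else 0.0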
def transform_data_to_file_folder_structure(path_to_csv, path_to_data_dir):
    data = pd.read_csv(path_to_csv)
    data.Date = pd.to_datetime(data.Date)
    data['day'] = (data.Date - pd.Timestamp(year=2017, month=1, day=1)).dt.days
    data = data.groupby(['DummyUserId', 'day']).agg('sum').reset_index()

    # 60/20/20 user-level split into train/validate/test.
    user_ids = data.DummyUserId.unique()
    size_data = len(user_ids)
    np.random.seed(11)
    train = np.random.choice(user_ids,
                             size=int(np.floor(0.6 * size_data)),
                             replace=False)
    user_ids = user_ids[~np.in1d(user_ids, train)]
    validate = np.random.choice(user_ids,
                                size=int(np.floor(0.2 * size_data)),
                                replace=False)
    user_ids = user_ids[~np.in1d(user_ids, validate)]
    test = np.random.choice(user_ids,
                            size=int(np.floor(0.2 * size_data)),
                            replace=False)

    badge_achievements = {}
    for dset in [train, validate, test]:
        for user, trajectory in data[data.DummyUserId.isin(dset)].groupby(
                'DummyUserId'):
            trajectory = trajectory.sort_values('day')
            badge = {}
            for b in BADGES:
                idxs = np.where(trajectory[b] == 1)[0]
                if len(idxs) > 0:
                    badge[b] = [int(i) for i in idxs]
            # Cast to plain int so the dict is JSON-serialisable.
            badge_achievements[int(user)] = badge
            # One tensor of per-day action counts per user.
            action_trajectory = torch.tensor(trajectory[ACTIONS].values,
                                             dtype=torch.long)
            torch.save(action_trajectory,
                       '{}/user_{}.pt'.format(path_to_data_dir, user))

    with open('{}/badge_achievements.json'.format(path_to_data_dir), 'w') as f:
        json.dump(badge_achievements, f)
    with open('{}/data_indexes.json'.format(path_to_data_dir), 'w') as f:
        obj = {
            'train': [int(u) for u in train],
            'test': [int(u) for u in test],
            'validate': [int(u) for u in validate],
        }
        json.dump(obj, f)
def split_data(data, test_part=0.2, min_test=10):
    # Per-user temporal split: each user's latest interactions go to test.
    grouped = data.groupby('userId')
    train = []
    test = []
    for name, group in grouped:
        entries = group.sort_values('timestamp')
        test_cnt = max(min_test, int(len(entries) * test_part))
        train.append(entries[:-test_cnt])
        test.append(entries[-test_cnt:])
    data_train = pd.concat(train)
    data_test = pd.concat(test)
    return data_train, data_test
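# A minimal usage sketch for split_data (assumption: a MovieLens-style ratings
# file with userId, movieId, rating and timestamp columns; the file name is
# hypothetical).
import pandas as pd

ratings = pd.read_csv('ratings.csv')
train_df, test_df = split_data(ratings, test_part=0.2, min_test=10)
print(len(train_df), len(test_df))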
def __init__(self, data, samples_in_a_row, shuffle=False):
    super().__init__(data)

    data = data[['sirna']].copy()
    data.index = np.arange(len(data))
    buckets = [bucket.index.values for _, bucket in data.groupby('sirna')]

    max_len = min(len(bucket) for bucket in buckets)
    print('max_len: {}, {}'.format(
        max_len, max_len // samples_in_a_row * samples_in_a_row))
    max_len = max_len // samples_in_a_row * samples_in_a_row

    self.max_len = max_len
    self.buckets = buckets
    self.samples_in_a_row = samples_in_a_row
    self.shuffle = shuffle
def artists_pca(self, data):
    # Keep only the first listed artist per track, then average features per artist.
    data['artists'] = data['artists'].apply(lambda x: get_first_artist(x))
    data['artists'] = data['artists'].astype('category')
    grouped_by_artist = data.groupby('artists').mean()

    # Project the per-artist feature means onto the top principal components.
    pca = PCA(n_components=self.pca_components)
    pca_components = pd.DataFrame(
        pca.fit_transform(grouped_by_artist),
        columns=['PCA%i' % i for i in range(self.pca_components)],
        index=grouped_by_artist.index)

    merged_data = pd.merge(data, pca_components, left_on='artists',
                           right_index=True, how='inner')
    return merged_data.drop('artists', axis=1)
def __init__(self, data, batch_size, shuffle=False, drop_last=False):
    super().__init__(data)

    # Sort samples by size so each batch groups items of similar length.
    data = pd.DataFrame({
        "i": range(len(data)),
        "size": data,
    }).sort_values("size")

    batches = [
        group["i"]
        for _, group in data.groupby(np.arange(len(data)) // batch_size)
    ]
    batches = [b for b in batches if len(b) > 0]
    if drop_last:
        batches = [b for b in batches if len(b) == batch_size]

    self.batches = batches
    self.shuffle = shuffle
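# A sketch (not the author's code) of the __iter__/__len__ pair that a
# length-bucketing batch sampler like the constructor above typically needs;
# the class name is hypothetical and the constructor is simplified.
import random
from torch.utils.data import Sampler

class SizeBucketedBatchSampler(Sampler):
    def __init__(self, batches, shuffle=False):
        self.batches = batches
        self.shuffle = shuffle

    def __iter__(self):
        batches = list(self.batches)
        if self.shuffle:
            random.shuffle(batches)  # shuffle batch order, keep size grouping
        for batch in batches:
            yield list(batch)        # indices into the underlying dataset

    def __len__(self):
        return len(self.batches)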
def __init__(self, path, csv_file, train, transform=None):
    data = pd.read_csv(os.path.join(path, csv_file))
    image_path = '/home/we/zsw/train/'
    data['img_file'] = image_path + data['_id'].astype(str) + '_' \
        + data['cum_sum'].astype(str) + '.jpg'

    gb = data.groupby('level1')
    keys = gb.groups.keys()

    self.transform = transform if transform is not None else []
    self.num_file = gb.size().max()  # largest number of images in any subclass
    self.train = train
    self.img_file = []
    self.level1 = []
    self.subcategory = []
    self.sublength = []
    for item in keys:
        group = gb.get_group(item)
        self.img_file.append(list(group['img_file']))
        self.level1.append(item)
        self.subcategory.append(list(group['level1_sub_class']))
        self.sublength.append(len(group))
def gen_batch_data(data):
    cols = list(data.columns)
    del_cols_index = [cols.index(col) for col in ['sid', 'pid', 'click_mode']]
    # Keep every column except sid, pid and click_mode as features.
    sel_cols_index = [i for i in range(len(cols)) if i not in del_cols_index]

    grouped = data.groupby('sid')
    batch_feas_list = []
    batch_click_mode_list = []
    for i, (group_id, group) in tqdm(enumerate(grouped)):
        grouped_values = group.values
        batch_click_mode = grouped_values[:, del_cols_index[-1]][0]
        batch_feas = torch.tensor(grouped_values[:, sel_cols_index]).type(
            torch.FloatTensor)
        batch_feas_list.append(batch_feas)
        batch_click_mode_list.append(batch_click_mode)
    batch_click_mode_list = torch.tensor(batch_click_mode_list).type(
        torch.LongTensor)
    # check why size does not add up to 10000???
    print('list_len:', len(batch_click_mode_list))
    return batch_feas_list, batch_click_mode_list
def main(dataset_path, workers):
    transform = T.Compose([
        ApplyTo(
            ['image'],
            T.Compose([
                SplitInSites(),
                T.Lambda(
                    lambda xs: torch.stack([ToTensor()(x) for x in xs], 0)),
            ])),
        Extract(['image']),
    ])

    train_data = pd.read_csv(os.path.join(dataset_path, 'train.csv'))
    train_data['root'] = os.path.join(dataset_path, 'train')
    test_data = pd.read_csv(os.path.join(dataset_path, 'test.csv'))
    test_data['root'] = os.path.join(dataset_path, 'test')
    data = pd.concat([train_data, test_data])

    # Compute per-channel mean/std for each (experiment, plate) group.
    stats = {}
    for (exp, plate), group in tqdm(data.groupby(['experiment', 'plate'])):
        dataset = TestDataset(group, transform=transform)
        data_loader = torch.utils.data.DataLoader(dataset,
                                                  batch_size=32,
                                                  num_workers=workers)
        with torch.no_grad():
            images = [images for images, in data_loader]
            images = torch.cat(images, 0)
            mean = images.mean((0, 1, 3, 4))
            std = images.std((0, 1, 3, 4))
            stats[(exp, plate)] = mean, std
            del images, mean, std
            gc.collect()
    torch.save(stats, 'plate_stats.pth')
# Label-encode every object-typed column except the UID key.
for col in data.columns:
    if (data[col].dtype == 'object') and (col != 'UID'):
        data = encode_count(data, col)

train = data.drop(['merchant', 'UID'], axis=1).fillna(-1)
label = data['merchant'].values

# Pre-trained merchant embedding matrix, indexed by encoded merchant id.
if os.path.exists('./feature/merchant_np.npy'):
    merchant_weight = np.load('./feature/merchant_np.npy')

# Aggregate merchant per UID with max_list and merge onto y and sub.
for item in ['merchant']:
    result = data.groupby(['UID'])[item].apply(max_list).reset_index().rename(
        columns={item: 'arr_%s' % item}).fillna(0)
    y = y.merge(result[['UID', 'arr_%s' % item]], on=['UID'],
                how='left').fillna(0)
    sub = sub.merge(result[['UID', 'arr_%s' % item]], on=['UID'],
                    how='left').fillna(0)

# Note: astype(int) is required before indexing into merchant_weight.
for dat in [y, sub]:
    dat['new_merchant'] = dat['arr_merchant'].astype(int).apply(
        lambda x: merchant_weight[x])

# Expand the 100-dimensional merchant embedding into separate columns.
for i in range(100):
    y['new_merchant_%d' % i] = y['new_merchant'].apply(lambda x: x[i])
    sub['new_merchant_%d' % i] = sub['new_merchant'].apply(lambda x: x[i])
def process_data(base_path):
    import pandas as pd

    # Split sizes below: 4000 train / 1000 validation / 1000 test users.
    # Convert raw events to per-week action counts and drop the badge
    # outcome variables from the action features.
    print("Processing raw data")
    output_fname = os.path.join(base_path, 'so_data.pkl')
    labels = ['train', 'valid', 'test']
    input_fname = os.path.join(csv_path, 'so_badges.csv')

    data = pd.read_csv(input_fname)
    data.Date = pd.to_datetime(data.Date)
    data['week'] = (data.Date - pd.Timestamp(year=2017, month=1, day=1)).dt.days
    data = data.groupby(['DummyUserId', 'week']).agg('sum').reset_index()

    # Keep users who earned the Electorate badge far enough from both ends of
    # the observation window to extract a 90-step context around it.
    badge_ixs = data[data.Electorate > 0]
    max_week = data.week.max()
    badge_ixs = badge_ixs[badge_ixs.week > 45]
    badge_ixs = badge_ixs[badge_ixs.week < max_week - 46]
    badge_ixs = badge_ixs.DummyUserId
    print(len(badge_ixs.unique()))

    indexes = badge_ixs.unique()
    train = np.random.choice(indexes, size=4000, replace=False)
    indexes = indexes[~np.in1d(indexes, train)]
    validate = np.random.choice(indexes, size=1000, replace=False)
    indexes = indexes[~np.in1d(indexes, validate)]
    test = np.random.choice(indexes, size=1000, replace=False)

    processed_dataset = {}
    for s, dset in enumerate([train, validate, test]):
        split = labels[s]
        processed_dataset[split] = {}
        sub_data = data[data.DummyUserId.isin(dset)]
        n_seqs = len(dset)
        processed_dataset[split]['sequence_lengths'] = torch.zeros(
            n_seqs, dtype=torch.long)
        processed_dataset[split]['sequences'] = []
        processed_dataset[split]['outcomes'] = []
        idx = 0
        for u_id, seqs in sub_data.groupby('DummyUserId'):
            seqs = seqs.sort_values('week')
            out = {}
            for b in BADGES:
                idxs = np.where(seqs[b] == 1)[0]
                if len(idxs) > 0:
                    out[b] = torch.tensor(idxs, dtype=torch.long)

            # Centre a `days`-step window on the step where Electorate was earned.
            civic_duty = int(out['Electorate'][0])
            days = 90
            action_vec = seqs[ACTIONS].values[civic_duty - days // 2:
                                              civic_duty + days // 2, :]
            out['Electorate'] = torch.tensor([days // 2], dtype=torch.long)
            processed_dataset[split]['sequence_lengths'][idx] = days
            processed_sequence = torch.tensor(action_vec, dtype=torch.long)
            processed_dataset[split]['sequences'].append(processed_sequence)
            processed_dataset[split]['outcomes'].append(out)
            idx += 1

    with open(output_fname, 'wb') as f:
        pickle.dump(processed_dataset, f, pickle.HIGHEST_PROTOCOL)
    print("dumped processed data to %s" % output_fname)
def get_scaleing(data, scaler, scaler_range, train):
    sensors = [
        's1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9', 's10', 's11',
        's12', 's13', 's14', 's15', 's16', 's17', 's18', 's19', 's20', 's21'
    ]
    operating_condition = ["oc_0", "oc_1", "oc_2", "oc_3", "oc_4", "oc_5"]
    global scalingparams

    if "operating_condition" in data.columns:
        # Scale each operating-condition cluster with its own statistics.
        print("\n\t Scaling by Clusters")
        groupby_oc = data.groupby("operating_condition", sort=False)
        scaled_sensors = []
        if train:
            # Fit: compute and store per-cluster scaling parameters.
            scalingparams = {}
            scalingparams['scaler'] = scaler
            scalingparams['scaler_range'] = scaler_range
            for operating_condition, data in groupby_oc:
                if scalingparams['scaler'] == 'mm':
                    min_ = np.min(data[sensors])
                    max_ = np.max(data[sensors])
                    scaled_data = (((data[sensors] - min_) / (max_ - min_)) *
                                   (scalingparams['scaler_range'][1] -
                                    scalingparams['scaler_range'][0])
                                   ) + scalingparams['scaler_range'][0]
                    scalingparams['min_oc' + str(operating_condition)] = min_
                    scalingparams['max_oc' + str(operating_condition)] = max_
                elif scalingparams['scaler'] == 'ss':
                    mean_ = np.mean(data[sensors])
                    std_ = np.std(data[sensors])
                    scaled_data = (data[sensors] - mean_) / std_
                    scalingparams['mean_oc' + str(operating_condition)] = mean_
                    scalingparams['std_oc' + str(operating_condition)] = std_
                scaled_sensors.append(scaled_data)
        else:
            # Transform: reuse the parameters stored during training.
            for operating_condition, data in groupby_oc:
                if scalingparams['scaler'] == 'mm':
                    min_ = scalingparams['min_oc' + str(operating_condition)]
                    max_ = scalingparams['max_oc' + str(operating_condition)]
                    scalingparams['scaler_range'] = scaler_range
                    scaled_data = (((data[sensors] - min_) / (max_ - min_)) *
                                   (scalingparams['scaler_range'][1] -
                                    scalingparams['scaler_range'][0])
                                   ) + scalingparams['scaler_range'][0]
                elif scalingparams['scaler'] == 'ss':
                    mean_ = scalingparams['mean_oc' + str(operating_condition)]
                    std_ = scalingparams['std_oc' + str(operating_condition)]
                    scaled_data = (data[sensors] - mean_) / std_
                scaled_sensors.append(scaled_data)
        scaled_df = pd.concat(scaled_sensors, sort=False)
        scaled_df = scaled_df.sort_index(axis=0, ascending=True)
    else:
        # Scale all rows with a single set of statistics.
        print("\n\t Scaling Without Clusters")
        if train:
            scalingparams = {}
            scalingparams['scaler'] = scaler
            scalingparams['scaler_range'] = scaler_range
            if scalingparams['scaler'] == 'mm':
                min_ = np.min(data[sensors])
                max_ = np.max(data[sensors])
                scaled_data = (((data[sensors] - min_) / (max_ - min_)) *
                               (scalingparams['scaler_range'][1] -
                                scalingparams['scaler_range'][0])
                               ) + scalingparams['scaler_range'][0]
                scalingparams['min_'] = min_
                scalingparams['max_'] = max_
            elif scalingparams['scaler'] == 'ss':
                mean_ = np.mean(data[sensors])
                std_ = np.std(data[sensors])
                scaled_data = (data[sensors] - mean_) / std_
                scalingparams['mean_'] = mean_
                scalingparams['std_'] = std_
        else:
            if scalingparams['scaler'] == 'mm':
                min_ = scalingparams['min_']
                max_ = scalingparams['max_']
                scalingparams['scaler_range'] = scaler_range
                scaled_data = (((data[sensors] - min_) / (max_ - min_)) *
                               (scalingparams['scaler_range'][1] -
                                scalingparams['scaler_range'][0])
                               ) + scalingparams['scaler_range'][0]
            elif scalingparams['scaler'] == 'ss':
                mean_ = scalingparams['mean_']
                std_ = scalingparams['std_']
                scaled_data = (data[sensors] - mean_) / std_
        scaled_df = scaled_data

    # Drop sensor columns that became NaN (e.g. constant sensors under min-max).
    scaled_df = scaled_df.dropna(axis=1)
    cols_wo_na = scaled_df.columns
    print("\n\t scaled_df after dropNA {} \n column names {}".format(
        scaled_df.shape, cols_wo_na))
    return scaled_df, cols_wo_na
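# A minimal usage sketch for get_scaleing (assumptions: CMAPSS-style turbofan
# DataFrames with sensor columns s1..s21 and, optionally, an
# operating_condition column; the file names are hypothetical).
import pandas as pd

train_df = pd.read_csv('train_FD002.csv')
test_df = pd.read_csv('test_FD002.csv')

# Fit the scaling parameters on the training split, then reuse the stored
# global scalingparams to transform the test split.
train_scaled, cols = get_scaleing(train_df, scaler='mm',
                                  scaler_range=(0, 1), train=True)
test_scaled, _ = get_scaleing(test_df, scaler='mm',
                              scaler_range=(0, 1), train=False)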