""" Test different data operations. """ import dev.util as util from preprocess.DataIO import load_sequences_csv, load_seq_features_csv, load_event_features_csv from preprocess.DataOperation import stitching, superposing, aggregating, data_info # test sequence loading functions # load event sequences domain_names = {'seq_id': 'id', 'time': 'time', 'event': 'event'} database = load_sequences_csv('{}/{}/Linkedin.csv'.format( util.POPPY_PATH, util.DATA_DIR), domain_names=domain_names) data_info(database) # load sequences' features domain_dict = {'time': 'numerical', 'option1': 'categorical'} database = load_seq_features_csv('{}/{}/Linkedin.csv'.format( util.POPPY_PATH, util.DATA_DIR), seq_domain='id', domain_dict=domain_dict, database=database) data_info(database) # load event types' features database = load_event_features_csv('{}/{}/Linkedin.csv'.format( util.POPPY_PATH, util.DATA_DIR), event_domain='event', domain_dict=domain_dict, database=database)
if __name__ == '__main__':
    # hyper-parameters
    memory_size = 400
    batch_size = 128
    seed = 2
    use_cuda = True

    # CUDA is only used when it was requested above and is actually available.
    use_cuda = use_cuda and torch.cuda.is_available()

    # Seed torch's random number generator, and the CUDA one when it is used.
    torch.manual_seed(seed)
    if use_cuda:
        torch.cuda.manual_seed(seed)
        # Extra keyword arguments forwarded to the two training loaders below.
        kwargs = {'num_workers': 1, 'pin_memory': True}
    else:
        kwargs = {}

    # load event sequences from csv file
    domain_names = {'seq_id': 'seq_id', 'time': 'time', 'event': 'event'}
    data_path = '{}/{}/IPTV_DATA.csv'.format(util.POPPY_PATH, util.DATA_DIR)
    # NOTE(review): upperlimit presumably caps how much of the file is read --
    # confirm against load_sequences_csv.
    database = load_sequences_csv(
        data_path,
        domain_names=domain_names,
        upperlimit=50000,
    )
    data_info(database)

    # sample batches from database
    thinning_samples = ThinningSampler(database=database, length=memory_size)
    trainloader_thinning = DataLoader(
        thinning_samples,
        batch_size=batch_size,
        shuffle=True,
        **kwargs
    )

    interval_samples = EventSampler(database=database, memorysize=memory_size)
    trainloader_interval = DataLoader(
        interval_samples,
        batch_size=batch_size,
        shuffle=True,
        **kwargs
    )

    # The validation loader is built with DataLoader's default arguments.
    validationloader = DataLoader(FullData(database=database))