def __get_id_dict(type, number=0, test_flag: bool = False, label_flag: bool = False, all=False):
    """Build the train/validation(/test) id split and, optionally, its labels.

    Args:
        type: Name of the split table to read: 'basic_model', 'cv_4_fold'
            or 'cv_5_fold'. Ignored when ``all`` is truthy.
        number: Key/index of the (train, validation) pair inside the
            selected module-level table.
        test_flag: When true, add a 'test' entry holding the ids
            ``train_num .. train_num + test_num - 1``.
        label_flag: When true, also return the labels of every split.
        all: When truthy, use every id in ``range(train_num)`` for training
            and leave the validation split empty.

    Returns:
        ``id_dict`` when ``label_flag`` is false, otherwise the tuple
        ``(id_dict, label_dict)`` where ``label_dict`` maps each split name
        to a NumPy array of ``Feature.load('is_trade').data`` values.

    Raises:
        NotImplementedError: If ``all`` is falsy and ``type`` is not one of
            the supported split names.
    """
    if all:
        id_pair = (list(range(0, train_num)), [])
    elif type == 'basic_model':
        id_pair = basic_model_id[number]
    elif type == 'cv_4_fold':
        id_pair = cv_4_fold[number]
    elif type == 'cv_5_fold':
        id_pair = cv_5_fold[number]
    else:
        raise NotImplementedError('type can not be {0}'.format(type))

    id_dict = {
        'train': id_pair[0],
        'validation': id_pair[1]
    }
    if test_flag:
        id_dict['test'] = list(range(train_num, train_num + test_num))
    print_log('finished creating id_dict --- train:{0}, validation:{1}'.format(
        len(id_dict['train']), len(id_dict['validation'])))

    if not label_flag:
        return id_dict

    # The label feature is the same for every split, so load it once
    # instead of once per split.
    label_feature = Feature.load('is_trade')
    label_dict = {
        split: np.asarray([label_feature.data[sample_id] for sample_id in id_list])
        for split, id_list in id_dict.items()
    }
    print_log('finished creating label_dict')
    return id_dict, label_dict
def manipulate(self, input, threshold):
    """Encode categorical values as integer ids, folding rare ones together.

    Every value is counted across all lines. Values seen fewer than
    ``threshold`` times, and the literal string '-1', share id 0
    ('__other__'); every other value gets the next free id in first-seen
    order. The resulting vocabulary is stored on ``self.cat2id`` /
    ``self.id2cat`` and its size on ``self.length``.

    Args:
        input: Iterable of lines, each an iterable of hashable values.
            It is traversed twice, so it must be re-iterable (a list, not
            a one-shot generator).
        threshold: Minimum occurrence count for a value to keep its own id.

    Returns:
        ``(feature, info)`` where ``feature`` is a list of id lists shaped
        like ``input`` and ``info`` holds 'category_num', 'biggest_size',
        'threshold' and 'tail_size'.
    """
    # First pass: occurrence count of every value.
    count_info = {}
    for line in input:
        for x in line:
            count_info[x] = count_info.get(x, 0) + 1

    # Id 0 is reserved for the shared "other" bucket.
    cat2id = {'__other__': 0}
    id2cat = {0: '__other__'}
    other_num = 0
    for key, val in count_info.items():
        if val < threshold or key == '-1':
            other_num += val
        else:
            cat2id[key] = len(cat2id)
            id2cat[cat2id[key]] = key

    # Second pass: translate values, sending unknown ones to the bucket.
    other_id = cat2id['__other__']
    feature = [[cat2id.get(x, other_id) for x in line] for line in input]

    info = {
        'category_num': len(cat2id),
        # default=0 keeps input without any values from raising ValueError.
        'biggest_size': max(count_info.values(), default=0),
        'threshold': threshold,
        'tail_size': other_num
    }
    self.length = len(cat2id)
    self.cat2id = cat2id
    self.id2cat = id2cat
    print_log('manipulation of {0}(type: {1}) finished'.format(self.name, self.type))
    return feature, info
def manipulate(self, input):
    """Convert a numeric column and summarise it, treating -1 as missing.

    Args:
        input: The raw column. A ``list`` is converted element-wise with
            ``int`` into a NumPy array; any other object is used as-is.

    Returns:
        ``(feature, info)`` where ``feature`` is the converted column and
        ``info`` holds 'max', 'min', 'mean', 'var' and 'missing_rate'.
        'mean' ignores the -1 sentinel and is NaN when no value remains;
        'max', 'min' and 'var' are computed over every value, sentinel
        included.
    """
    feature = np.asarray([int(x) for x in input]) if isinstance(input, list) else input

    # Work on an array view so the statistics are vectorised.
    values = np.asarray(feature)
    missing = values == -1
    valid = values[~missing]
    # A column made only of sentinels has no mean; report NaN instead of
    # failing with a division by zero.
    mean = valid.sum() / valid.size if valid.size else float('nan')

    info = {
        'max': np.max(values),
        'min': np.min(values),
        'mean': mean,
        'var': np.var(values),
        'missing_rate': 1. * np.sum(missing) / len(input)
    }
    print_log('manipulation of {0}(type: {1}) finished'.format(self.name, self.type))
    return feature, info
def __pretreatment():
    """Split the raw train/test text files into one pickle per column.

    Creates the 'pretreatment' and 'feature' directories under
    ``default_data_path`` when missing, reads both raw files, orders the
    training rows by the integer in column 16, appends the test rows, and
    dumps every column to 'pretreatment/raw-<column>.pkl'.
    """
    # Output directories.
    for folder in ('pretreatment', 'feature'):
        target = os.path.join(default_data_path, folder)
        if not os.path.exists(target):
            os.mkdir(target)

    # Column names come from the header row of the training file.
    header_path = os.path.join(default_data_path, 'raw data/round1_train.txt')
    with open(header_path) as f:
        category_list = f.readline().split()

    # Parse every data row of both files, keeping the sort key next to it.
    rows = {}
    for split in ['train', 'test']:
        parsed = []
        raw_path = os.path.join(
            default_data_path, 'raw data/round1_{0}.txt'.format(split))
        with open(raw_path) as f:
            next(f, None)  # the first line is the header
            for line in f:
                fields = line.split()
                if split == 'test':
                    # Test rows get an extra -1 appended as their last field.
                    fields.append(-1)
                parsed.append((int(fields[16]), fields))
        rows[split] = parsed

    # Only the training rows are reordered (stable sort on column 16).
    rows['train'].sort(key=lambda item: item[0])

    # Transpose rows into columns, train first and test after it.
    columns = {name: [] for name in category_list}
    for split in ['train', 'test']:
        for _, fields in rows[split]:
            for name, value in zip(category_list, fields):
                columns[name].append(value)

    # One pickle per column.
    for name, values in columns.items():
        out_path = os.path.join(
            default_data_path, 'pretreatment/raw-{0}.pkl'.format(name))
        joblib.dump(values, out_path)
        print_log('pretreatment of {0} finished!'.format(name))
def __get_id_dict(type, number=0, test_flag: bool = False, label_flag: bool = False, all=False,
                  slice_num=5):
    """Build a slice-based train/validation(/test) id split and its labels.

    The module-level ``true`` and ``false`` id collections are each cut
    into ``slice_num`` consecutive chunks; chunk ``i`` of both is merged
    and shuffled with a fixed seed to form slice ``i``. The folds are:

    * ``0 .. slice_num - 2``: validate on slice ``i``, train on the other
      slices except the last one;
    * ``slice_num - 1``: validate on the last slice, train on the rest;
    * ``slice_num``: train on every slice, no validation.

    Args:
        type: Only 'basic_model' is supported. Ignored when ``all`` is
            truthy.
        number: Fold key as described above.
        test_flag: When true, add a 'test' entry holding the ids
            ``train_num .. train_num + test_num - 1``.
        label_flag: When true, also return the labels of every split.
        all: When truthy, train on ``range(train_num)`` with an empty
            validation split.
        slice_num: Number of slices to cut the ids into.

    Returns:
        ``id_dict`` when ``label_flag`` is false, otherwise the tuple
        ``(id_dict, label_dict)`` where ``label_dict`` maps each split name
        to a NumPy array of ``Feature.load('is_trade').data`` values.

    Raises:
        NotImplementedError: If ``all`` is falsy and ``type`` is not
            'basic_model'.
    """
    # NOTE(review): the module-level `true`/`false` are rebound to arrays
    # here, exactly as before; other code may rely on that side effect.
    global true, false
    true_day_length = int(np.ceil(len(true) / slice_num))
    false_day_length = int(np.ceil(len(false) / slice_num))
    true = np.array(true)
    false = np.array(false)

    day = {}
    for i in range(slice_num):
        # Slicing clamps at the end of the array, so no explicit bound is needed.
        chunk = np.concatenate([
            true[i * true_day_length:(i + 1) * true_day_length],
            false[i * false_day_length:(i + 1) * false_day_length],
        ])
        # Reseeding before every shuffle keeps each slice's order reproducible.
        random.seed(10)
        random.shuffle(chunk)
        day[i] = chunk.tolist()

    basic_model_id = {}
    for i in range(slice_num - 1):
        train_ids = []
        for j in range(slice_num - 1):
            if j != i:
                train_ids.extend(day[j])
        basic_model_id[i] = (train_ids, day[i])

    except_last_train = []
    for i in range(slice_num - 1):
        except_last_train.extend(day[i])
    all_train = except_last_train + day[slice_num - 1]
    basic_model_id[slice_num - 1] = (except_last_train, day[slice_num - 1])
    basic_model_id[slice_num] = (all_train, [])

    if all:
        id_pair = (list(range(0, train_num)), [])
    elif type == 'basic_model':
        id_pair = basic_model_id[number]
    else:
        raise NotImplementedError('type can not be {0}'.format(type))

    id_dict = {'train': id_pair[0], 'validation': id_pair[1]}
    if test_flag:
        id_dict['test'] = list(range(train_num, train_num + test_num))
    print_log('finished creating id_dict --- train:{0}, validation:{1}'.format(
        len(id_dict['train']), len(id_dict['validation'])))

    if not label_flag:
        return id_dict

    # The label feature is the same for every split, so load it once
    # instead of once per split.
    label_feature = Feature.load('is_trade')
    label_dict = {
        split: np.asarray([label_feature.data[sample_id] for sample_id in id_list])
        for split, id_list in id_dict.items()
    }
    print_log('finished creating label_dict')
    return id_dict, label_dict