Example #1
0
def __get_id_dict(type, number=0, test_flag: bool = False,
                  label_flag: bool = False, all=False):
    """Build the train/validation(/test) id lists and, optionally, their labels.

    Args:
        type: Split scheme to use: 'basic_model', 'cv_4_fold' or 'cv_5_fold'.
            Ignored when ``all`` selects the full training range.
        number: Key into the chosen scheme's table of
            ``(train_ids, validation_ids)`` pairs.
        test_flag: When true, add a 'test' entry holding the ids
            ``train_num`` up to ``train_num + test_num - 1``.
        label_flag: When true, also return the labels of every split.
        all: When equal to ``True``, train on ids ``0 .. train_num - 1`` and
            leave the validation split empty.

    Returns:
        ``id_dict`` (split name -> list of ids) when ``label_flag`` is false,
        otherwise the tuple ``(id_dict, label_dict)`` where ``label_dict`` maps
        each split name to a numpy array of 'is_trade' values.

    Raises:
        NotImplementedError: If ``all`` is not set and ``type`` is unknown.
    """
    # The explicit `== True` comparison is kept on purpose so that exactly the
    # same argument values as before select the "use everything" branch.
    if all == True:
        id_pair = (list(range(0, train_num)), [])
    elif type == 'basic_model':
        id_pair = basic_model_id[number]
    elif type == 'cv_4_fold':
        id_pair = cv_4_fold[number]
    elif type == 'cv_5_fold':
        id_pair = cv_5_fold[number]
    else:
        raise NotImplementedError('type can not be {0}'.format(type))

    id_dict = {
        'train': id_pair[0],
        'validation': id_pair[1]
    }
    if test_flag:
        id_dict['test'] = list(range(train_num, train_num + test_num))

    print_log('finished creating id_dict --- train:{0}, validation:{1}'.format(
        len(id_dict['train']), len(id_dict['validation'])))
    if not label_flag:
        return id_dict

    # Labels are needed. The label feature does not depend on the split, so it
    # is loaded once instead of once per split.
    feature = Feature.load('is_trade')
    label_dict = {
        split: np.asarray([feature.data[sample_id] for sample_id in id_list])
        for split, id_list in id_dict.items()
    }
    print_log('finished creating label_dict')
    return id_dict, label_dict
Example #2
0
    def manipulate(self, input, threshold):
        """Encode rows of categorical values as integer ids.

        Every distinct value seen at least ``threshold`` times gets its own id
        (assigned in first-seen order, starting at 1). Rarer values and the
        value '-1' are folded into the shared '__other__' bucket with id 0.

        Args:
            input: Iterable of rows, each row an iterable of hashable values.
            threshold: Minimum number of occurrences a value needs to keep its
                own id.

        Returns:
            A tuple ``(feature, info)``: ``feature`` is a list of rows of ids,
            ``info`` is a dict with the keys 'category_num', 'biggest_size',
            'threshold' and 'tail_size'.

        Side effects:
            Sets ``self.length``, ``self.cat2id`` and ``self.id2cat``.
        """
        from collections import Counter

        # The rows are walked twice (count, then encode). A one-shot iterator
        # would be exhausted by the first pass and silently encode to an empty
        # feature list, so materialise it first; containers are left as-is.
        rows = list(input) if iter(input) is input else input

        count_info = Counter(x for line in rows for x in line)

        cat2id = {'__other__': 0}
        id2cat = {0: '__other__'}
        other_num = 0
        for key, val in count_info.items():
            if val < threshold or key == '-1':
                # Too rare, or '-1' (treated like the rare values): counted in
                # the tail and mapped to '__other__' by the lookup below.
                other_num += val
            else:
                cat2id[key] = len(cat2id)
                id2cat[cat2id[key]] = key

        other_id = cat2id['__other__']
        feature = [[cat2id.get(x, other_id) for x in line] for line in rows]

        info = {
            'category_num': len(cat2id),
            # default=0 keeps an empty input from raising ValueError in max().
            'biggest_size': max(count_info.values(), default=0),
            'threshold': threshold,
            'tail_size': other_num
        }
        self.length = len(cat2id)
        self.cat2id = cat2id
        self.id2cat = id2cat
        print_log('manipulation of {0}(type: {1}) finished'.format(self.name, self.type))
        return feature, info
Example #3
0
 def manipulate(self, input):
     """Convert a numeric column to an array and summarise it.

     A list input is converted element-wise with ``int`` into a numpy array;
     any other input is passed through unchanged. The value -1 is treated as
     the "missing" marker: it is excluded from 'mean' and counted by
     'missing_rate', but still takes part in 'max', 'min' and 'var'.

     Args:
         input: List of int-convertible values, or an already numeric
             array-like (assumed one-dimensional).

     Returns:
         A tuple ``(feature, info)`` where ``info`` holds 'max', 'min',
         'mean', 'var' and 'missing_rate'. 'mean' is NaN when every value is
         missing.
     """
     feature = np.asarray([int(x) for x in input]) if isinstance(input, list) else input

     # Array view used for the masked statistics; `feature` itself is returned
     # untouched so non-list inputs keep their original type.
     values = np.asarray(feature)
     missing = values == -1
     valid = values[~missing]

     # With no valid value there is nothing to average; report NaN instead of
     # failing with a division by zero.
     if valid.size:
         mean = 1.0 * valid.sum() / valid.size
     else:
         mean = float('nan')

     info = {
         'max': np.max(feature),
         'min': np.min(feature),
         'mean': mean,
         'var': np.var(feature),
         'missing_rate': 1. * np.sum(missing) / len(input)
     }
     print_log('manipulation of {0}(type: {1}) finished'.format(self.name, self.type))
     return feature, info
Example #4
0
def __pretreatment():
    """Split the raw train/test tables into one pickled column per category.

    Reads ``raw data/round1_train.txt`` and ``raw data/round1_test.txt`` under
    ``default_data_path`` (whitespace-separated, first line is the header),
    orders the train rows by the integer in column 16, appends the test rows
    after them and dumps each column to ``pretreatment/raw-<name>.pkl``.

    Column names are taken from the train file's header. Every test row gets
    an integer -1 appended before merging -- presumably a placeholder for a
    column the test file lacks; confirm against the raw data.
    """
    # Create the output directories. exist_ok avoids the check-then-create
    # race of a separate os.path.exists() test.
    for sub_dir in ('pretreatment', 'feature'):
        os.makedirs(os.path.join(default_data_path, sub_dir), exist_ok=True)

    # Column names come from the header line of the train file.
    with open(os.path.join(default_data_path,
                           'raw data/round1_train.txt')) as f:
        category_list = f.readline().split()

    # Read both splits, keeping each row together with its sort key.
    data_dict = {}
    for split in ['train', 'test']:
        rows = []
        with open(
                os.path.join(default_data_path,
                             'raw data/round1_{0}.txt'.format(split))) as f:
            next(f, None)  # skip the header line
            for line in f:
                data_list = line.split()
                if split == 'test':
                    data_list.append(-1)
                rows.append((int(data_list[16]), data_list))
        data_dict[split] = rows

    # Only the train rows are reordered; list.sort is stable, so rows with
    # equal keys keep their file order.
    data_dict['train'].sort(key=lambda row: row[0])

    # Merge train then test, column by column.
    columns = {category: [] for category in category_list}
    for split in ['train', 'test']:
        for _, data_list in data_dict[split]:
            for category, data in zip(category_list, data_list):
                columns[category].append(data)

    # Write one pickle per column.
    for key, val in columns.items():
        joblib.dump(
            val,
            os.path.join(default_data_path,
                         'pretreatment/raw-{0}.pkl'.format(key)))
        print_log('pretreatment of {0} finished!'.format(key))
Example #5
0
def __get_id_dict(type,
                  number=0,
                  test_flag: bool = False,
                  label_flag: bool = False,
                  all=False,
                  slice_num=5):
    """Build the train/validation(/test) id lists and, optionally, their labels.

    The module-level sequences ``true`` and ``false`` are each cut into
    ``slice_num`` consecutive chunks; chunk ``i`` of both is concatenated and
    shuffled with a fixed seed to form slice ``i``. The 'basic_model' folds
    are built from these slices:

    * ``number`` in ``0 .. slice_num - 2``: validate on slice ``number``,
      train on the other slices except the last one.
    * ``number == slice_num - 1``: validate on the last slice, train on all
      the others.
    * ``number == slice_num``: train on every slice, no validation.

    Args:
        type: Split scheme; only 'basic_model' is supported. Ignored when
            ``all`` selects the full training range.
        number: Fold key as described above.
        test_flag: When true, add a 'test' entry holding the ids
            ``train_num`` up to ``train_num + test_num - 1``.
        label_flag: When true, also return the labels of every split.
        all: When equal to ``True``, train on ids ``0 .. train_num - 1`` and
            leave the validation split empty.
        slice_num: Number of slices to cut ``true`` and ``false`` into.

    Returns:
        ``id_dict`` (split name -> list of ids) when ``label_flag`` is false,
        otherwise the tuple ``(id_dict, label_dict)`` where ``label_dict`` maps
        each split name to a numpy array of 'is_trade' values.

    Raises:
        NotImplementedError: If ``all`` is not set and ``type`` is unknown.

    Side effects:
        Rebinds the module-level ``true`` and ``false`` to numpy arrays and
        reseeds the global ``random`` generator with 10.
    """
    global true, false
    true_day_length = int(1. * np.ceil(len(true) / slice_num))
    false_day_length = int(1. * np.ceil(len(false) / slice_num))

    # The globals are rebound as arrays, as before, in case other code relies
    # on that side effect.
    true = np.array(true)
    false = np.array(false)

    day = {}
    for i in range(slice_num):
        # Slicing already clamps to the array length, so no explicit min().
        day[i] = np.concatenate([
            true[i * true_day_length:(i + 1) * true_day_length],
            false[i * false_day_length:(i + 1) * false_day_length],
        ])
        # Reseeding before every shuffle makes each slice's permutation
        # reproducible and independent of the other slices.
        random.seed(10)
        random.shuffle(day[i])
        day[i] = day[i].tolist()

    basic_model_id = {}
    for i in range(slice_num - 1):
        train_ids = []
        for j in range(slice_num - 1):
            if j != i:
                # extend() avoids copying the accumulated list on every step.
                train_ids.extend(day[j])
        basic_model_id[i] = (train_ids, day[i])

    except_last_train = []
    for i in range(slice_num - 1):
        except_last_train.extend(day[i])
    all_train = except_last_train + day[slice_num - 1]
    basic_model_id[slice_num - 1] = (except_last_train, day[slice_num - 1])
    basic_model_id[slice_num] = (all_train, [])

    # The explicit `== True` comparison is kept on purpose so that exactly the
    # same argument values as before select the "use everything" branch.
    if all == True:
        id_pair = (list(range(0, train_num)), [])
    elif type == 'basic_model':
        id_pair = basic_model_id[number]
    else:
        raise NotImplementedError('type can not be {0}'.format(type))
    id_dict = {'train': id_pair[0], 'validation': id_pair[1]}
    if test_flag:
        id_dict['test'] = list(range(train_num, train_num + test_num))

    print_log('finished creating id_dict --- train:{0}, validation:{1}'.format(
        len(id_dict['train']), len(id_dict['validation'])))
    if not label_flag:
        return id_dict

    # Labels are needed. The label feature does not depend on the split, so it
    # is loaded once instead of once per split.
    feature = Feature.load('is_trade')
    label_dict = {
        split: np.asarray([feature.data[sample_id] for sample_id in id_list])
        for split, id_list in id_dict.items()
    }
    print_log('finished creating label_dict')
    return id_dict, label_dict