Ejemplo n.º 1
0
    def data_split(self):
        """Load the driver image listing and hand it to ``DataSplit.split``.

        Reads ``drivers_img_nop081_list.csv`` found under ``self.root_path``
        and keeps the resulting frame plus its ``img``, ``classname`` and
        ``subject`` columns on the instance. ``self.choices`` is then set to
        a fixed list of ids and passed on as ``choice_ids``; according to the
        original note these are the driver ids selected for the validation
        set. What ``DataSplit.split`` does with the paths and directories is
        defined elsewhere and is not visible from this method.

        Returns:
            The instance itself, so the call can be chained.
        """
        root = self.root_path
        listing_csv = root + "drivers_img_nop081_list.csv"

        listing = pd.read_csv(listing_csv)
        self.drivers_pd = listing
        self.imgs_pd = listing["img"]
        self.class_pd = listing["classname"]
        self.subject_pd = listing["subject"]

        # Ids forwarded as ``choice_ids`` (validation drivers per the
        # original docstring).
        self.choices = ["p035", "p047"]

        split_kwargs = dict(
            choice_ids=self.choices,
            train_pd_path=listing_csv,
            train_aug_pd_path=root + "drivers_img_aug_list.csv",
            train_dir=self.train_dir,
            val_dir=self.val_dir,
            test_dir=self.test_dir,
            origin_test_dir=root + "imgs/test",
            saved_weights_dir=self.saved_weights,
        )
        DataSplit.split(**split_kwargs)
        return self
            user_item_map[words[0]][words[1]] += 1
    count_map = {}
    count_user_item_map = {}
    for user in user_item_map:
        for item, cnt in user_item_map[user].items():
            count_map.setdefault(cnt, 0)
            count_map[cnt] += 1
            count_user_item_map.setdefault(cnt, {})
            count_user_item_map[cnt].setdefault(user, set())
            count_user_item_map[cnt][user].add(item)
    # for cnt in count_map:
    #     print 'the number of behavior which occurs %d is %d' %(cnt, count_map[cnt])
    return count_map, count_user_item_map

if __name__ == '__main__':
    # Script entry point: for every behavior type 1..4, write one line per
    # occurrence count to behavior_occur_count.csv, followed by the summed
    # `bingos` for that behavior type.
    # NOTE(review): `xrange` below exists only in Python 2.

    # Lookup structures built by DataSplit helpers defined elsewhere.
    # presumably an item-id -> position mapping and the user/item pairs of
    # the test data for behavior type 4 -- TODO confirm against DataSplit.
    id_position_map = DataSplit.itemIDMap('../File/tianchi_mobile_recommend_train_item.csv')
    test_user_item_map, test_user_item_map_size = DataSplit.userOperate('../File/user_test_data.csv', 4, id_position_map)
    # NOTE(review): this handle is not closed anywhere in the visible lines;
    # confirm it is closed later or switch to a `with` block.
    fp_behavior_occur_count = open('../File/AnalysisResult/behavior_occur_count.csv', 'w')
    # NOTE(review): `total1` is only referenced by the commented-out line
    # further down, so it is effectively unused here.
    total1 = 0
    for i in xrange(1, 5):
        # Section header for behavior type `i`.
        fp_behavior_occur_count.write('behavior type is %d: \n' %i)
        # `count_map` is indexed by occurrence count; `count_user_item_map`
        # is indexed the same way and its entries are passed to computeRatio1.
        count_map, count_user_item_map = userBuyAppetite('../File/user_train_data.csv', i, id_position_map)
        # Running sum of `bingos` over all occurrence counts for this type.
        total = 0
        for cnt in count_map:
            # print count_map[cnt], len(count_user_item_map[cnt])
            bingos, test_set_num = DataSplit.computeRatio1(count_user_item_map[cnt], test_user_item_map)
            # Writes `bingos` as a percentage of count_map[cnt] and of
            # test_set_num.
            # NOTE(review): raises ZeroDivisionError if test_set_num (or
            # count_map[cnt]) is 0 -- confirm computeRatio1 never returns 0.
            fp_behavior_occur_count.write('behavior occurs = %d, in train data(%d,%d,%.2lf), in test data(%d,%d,%.2lf)\n' \
                                          %(cnt, bingos, count_map[cnt], 100.0 * bingos / count_map[cnt], bingos, test_set_num, 100.0 * bingos / test_set_num))
            total += bingos
            # total1 += len(count_user_item_map[cnt])
        # Total `bingos` for this behavior type.
        fp_behavior_occur_count.write('%d\n' %total)
    count_map = {}
    count_user_item_map = {}
    for user in user_item_map:
        for item, cnt in user_item_map[user].items():
            count_map.setdefault(cnt, 0)
            count_map[cnt] += 1
            count_user_item_map.setdefault(cnt, {})
            count_user_item_map[cnt].setdefault(user, set())
            count_user_item_map[cnt][user].add(item)
    # for cnt in count_map:
    #     print 'the number of behavior which occurs %d is %d' %(cnt, count_map[cnt])
    return count_map, count_user_item_map


if __name__ == '__main__':
    id_position_map = DataSplit.itemIDMap(
        '../File/tianchi_mobile_recommend_train_item.csv')
    test_user_item_map, test_user_item_map_size = DataSplit.userOperate(
        '../File/user_test_data.csv', 4, id_position_map)
    fp_behavior_occur_count = open(
        '../File/AnalysisResult/behavior_occur_count.csv', 'w')
    total1 = 0
    for i in xrange(1, 5):
        fp_behavior_occur_count.write('behavior type is %d: \n' % i)
        count_map, count_user_item_map = userBuyAppetite(
            '../File/user_train_data.csv', i, id_position_map)
        total = 0
        for cnt in count_map:
            # print count_map[cnt], len(count_user_item_map[cnt])
            bingos, test_set_num = DataSplit.computeRatio1(
                count_user_item_map[cnt], test_user_item_map)
            fp_behavior_occur_count.write('behavior occurs = %d, in train data(%d,%d,%.2lf), in test data(%d,%d,%.2lf)\n' \
Ejemplo n.º 4
0
        time = words[5]
        year1, month1, day1, hour1 = timeSparse(time)
        date1 = datetime.datetime(year1, month1, day1)
        interval = int((date - date1).days)
        # if file_point_map.has_key(interval) == False:
        #     output_file = '%02d.csv' %interval
        #     fp_output = open(output_file, 'w')
        #     file_point_map[interval] = fp_output
        file_point_map[interval].write('%s' %line)

if __name__ == '__main__':
    #分割文件为31个文件,每天一个文件
<<<<<<< HEAD
    # dataSplitByDay('../File/tianchi_mobile_recommend_train_user.csv')
    # dataSplitByDay('../File/complete_addDistance_train_user.csv')
    id_position_map = DataSplit.itemIDMap('../File/tianchi_mobile_recommend_train_item.csv')
=======
    dataSplitByDay('../File/tianchi_mobile_recommend_train_user.csv')
    # dataSplitByDay('../File/complete_addDistance_train_user.csv')
    # id_position_map = DataSplit.itemIDMap('../File/tianchi_mobile_recommend_train_item.csv')
>>>>>>> 972f41ad2af1fc238e65d04b58a152301666f6bf
    # id_position_map = DataSplit.itemIDMap('../File/train_user_complete_by_userInfo.csv')
    """
    ""分析每天的user-item行为
    """
    import os
    total_user_item_behavior_list = [[0 for i in xrange(0, 8)] for i in xrange(0, 5)]
    rootDir = '../File/EverydayData/'
    for cnt in xrange(0, 1):
        print 'process %02d' %cnt
        tag = 0