def main():
    """Train one model per course (course1..course8) and write stacked predictions.

    For each course, the exam-score training frame and the submission test
    frame are aligned to a shared, deterministically sorted column set
    (missing columns zero-filled) before fitting.  Per-course predictions
    are concatenated in course order and written to
    submission_s1_sample_stack.csv.
    """
    results = []  # fixes the 'reuslt' typo of the original
    for i in ['1', '2', '3', '4', '5', '6', '7', '8']:
        # Training set: features plus the 'score' target column.
        train_X = get_exam_score('exam_score', 'course' + i)
        train_y = train_X['score']
        del train_X['score']
        # Test set for the same course.
        test_X = get_submission_s1('submission_s1', 'course' + i)
        # Align both frames on the sorted union of their columns so the
        # model sees an identical feature layout for train and test.
        columns = sorted(set(train_X.columns) | set(test_X.columns))
        test_X = test_X.reindex(columns=columns).fillna(0)
        train_X = train_X.reindex(columns=columns).fillna(0)
        # Collect this course's predictions (order matters: must match the
        # row order of the submission file).
        results.extend(get_model(train_X, train_y, test_X).tolist())
    submit = load_data().get_test_s1('submission_s1', 'pd')
    submit['pred'] = results
    submit.to_csv(load_data().get_project_path() + '/data/test_s1/submission_s1_sample_stack.csv',
                  index=False, encoding='utf-8')
def __init__(self, args, path, test=False, slide=False):
    """Build the newspaper dataset: load the requested split, cache sizes and
    flags, and select the character-encoding resources and the per-sample
    processing function.

    :param args: parsed arguments (uses char_len and character_encoder)
    :param path: dict of resource paths (ja_chars, font, newspaper, char2embedding)
    :param test: load the "test" split instead of "train"
    :param slide: select the sliding (test_process) pipeline
    """
    # Parent constructor builds the font rendering machinery
    # (font_img_dict, resize_font_img, char_to_font_img).
    super().__init__(path["ja_chars"], path["font"])

    split = "test" if test else "train"
    data = load_data(path["newspaper"][split])
    self.data = data["data"]
    self.num_class = data["num_class"]

    self.char_len = args.char_len
    self.test = test
    self.slide = slide
    self.character_encoder = args.character_encoder

    if self.character_encoder == "CAE":
        # Pre-trained character-autoencoder embeddings.
        self.char2embedding = load_data(path["char2embedding"])
    else:
        # Ensure a blank glyph exists for padding/space characters.
        self.font_img_dict[" "] = self.resize_font_img(self.char_to_font_img(" "))

    # NOTE(review): slide=True selects test_process — confirm this pairing
    # is intentional (the names suggest slide/test could be swapped).
    self.process = self.test_process if self.slide else self.train_process
def get_submission_s1(filename, course_id: str, tag='pd', save=True):
    """Build (and cache) the per-course test frame from submission_s1.csv.

    :param filename: base name of the submission file, e.g. 'submission_s1'
    :param course_id: a single course id, e.g. 'course1'
    :param tag: loader format tag passed to load_data (default 'pd')
    :param save: cache the resulting frame as an .h5 file when True
    :return: feature DataFrame restricted to course_id, with a zeroed 'pred' column
    """
    # Cached .h5 location for this (file, course) pair.
    save_path = load_data().get_project_path() + '/data/cache/%s_%s.h5' % (filename, course_id)
    if os.path.exists(save_path):
        submission_s1 = reduce_mem_usage(pd.read_hdf(path_or_buf=save_path, mode='r', key=course_id))
    else:
        submission_s1 = load_data().get_test_s1(filename, tag)
        # Student / course lookup tables.
        student = get_student('student')
        course = get_course('course')
        # Merge gender and course_class information.
        submission_s1 = pd.merge(submission_s1, student, how='left', on='student_id')
        submission_s1 = pd.merge(submission_s1, course, how='left', on='course')
        # Keep only the requested course.
        submission_s1 = submission_s1[submission_s1['course'] == course_id]
        # Merge section/category/complexity knowledge features.
        submission_s1 = merge_all_knowledge(submission_s1, course_id + '_exams')
        # Map exam_id to its ordinal position in the course's exam list.
        # Build the mapping ONCE instead of once per row inside the lambda
        # (the original rebuilt the dict for every element of the column).
        course_exams = get_course_exams(course_id + '_exams')
        exam_index = {exam: idx for idx, exam in enumerate(course_exams['exam_id'])}
        submission_s1['exam_id'] = submission_s1['exam_id'].map(lambda x: exam_index[x])
        submission_s1['pred'] = 0
        # Drop constant columns.  .ix was removed in pandas 1.0 — use
        # label-based .loc with a positional .iloc[0] reference row.
        submission_s1 = submission_s1.loc[:, (submission_s1 != submission_s1.iloc[0]).any()]
        if save is True:
            submission_s1.to_hdf(path_or_buf=save_path, key=course_id)
    return submission_s1
def get_course_exams(filename, tag='pd'):
    """Load one of course1_exams.csv .. course8_exams.csv unmodified.

    :param filename: base name, e.g. 'course1_exams'
    :param tag: loader format tag (default 'pd')
    :return: the raw exams DataFrame
    """
    return load_data().get_train_s1(filename, tag)
def get_student(filename, tag='pd'):
    """Load student.csv unmodified.

    :param filename: base name, 'student'
    :param tag: loader format tag (default 'pd')
    :return: the raw student DataFrame
    """
    return load_data().get_train_s1(filename, tag)
def get_course(filename, tag='pd'):
    """Load course.csv and label-encode its course_class column.

    :param filename: base name, 'course'
    :param tag: loader format tag (default 'pd')
    :return: the course DataFrame with course_class label-encoded
    """
    courses = load_data().get_train_s1(filename, tag)
    return label_encoding(courses, columns=[u'course_class'])
def __init__(self, chars_path, font_name):
    """Load the character set and pre-render one resized glyph image per char.

    :param chars_path: path to the serialized character list
    :param font_name: truetype font file used for rendering
    """
    self.chars = load_data(chars_path)
    self.font_name = font_name
    self.font_size = 64
    # Render at 90% of the nominal cell size so glyphs fit with a margin.
    point_size = int(self.font_size * 0.9)
    self.font = ImageFont.truetype(font=font_name, size=point_size, encoding="utf-8")
    # Pre-render every known character once so later lookups are O(1).
    glyphs = {}
    for char in self.chars:
        glyphs[char] = self.resize_font_img(self.char_to_font_img(char))
    self.font_img_dict = glyphs
def get_all_knowledge(filename, tag='pd'):
    """Load all_knowledge.csv unmodified.

    An earlier (v1.0) variant stripped the prefixes from the 'section' and
    'category' columns, kept here for reference:
        for feature in ['section', 'category']:
            df[feature] = [x.split(':')[-1] for x in df[feature]]

    :param filename: base name, 'all_knowledge'
    :param tag: loader format tag (default 'pd')
    :return: the raw knowledge DataFrame
    """
    return load_data().get_train_s1(filename, tag)
# reuslt.extend(predictions.tolist()) # # submit = load_data().get_test_s1('submission_s1', 'pd') # submit['pred'] = reuslt # submit.to_csv(load_data().get_project_path() + '/data/test_s1/submission_s1_sample_mean_median.csv', index=None, # encoding='utf-8') ########################################################mean-median-xgb################################################################# # data1 = load_data().get_test_s1('submission_s1_sample_xgb', 'pd') # data2 = load_data().get_test_s1('submission_s1_sample_baseline', 'pd') # # data1['pred'] = (data1['pred'] * 0.5 + data2['pred'] * 0.5) # data1.to_csv(load_data().get_project_path() + '/data/test_s1/submission_s1_sample_baseline_xgb.csv', index=None, # encoding='utf-8') # # print(time.clock() - start) ########################################################stack-and-baseline################################################################# data1 = load_data().get_test_s1('submission_s1_sample_xgb', 'pd') data2 = load_data().get_test_s1('submission_s1_sample_baseline_1', 'pd') data1['pred'] = (data1['pred'] + 32) * 0.3 + data2['pred'] * 0.7 # data1['pred'] = [round(i) for i in list(data1['pred'])] data1.to_csv(load_data().get_project_path() + '/data/test_s1/submission_s1_sample_stack_baseline_1_xgb.csv', index=None, encoding='utf-8') print(time.clock() - start)
def get_exam_score(filename, course_id: str, tag='pd', save=True):
    """Build (and cache) the per-course training frame from exam_score.csv.

    :param filename: base name, 'exam_score'
    :param course_id: a single course id, e.g. 'course1'
    :param tag: loader format tag passed to load_data (default 'pd')
    :param save: cache the resulting frame as an .h5 file when True
    :return: feature DataFrame restricted to course_id; 'score' holds
        log1p-transformed scores (invert predictions with expm1)
    """
    # Cached .h5 location for this (file, course) pair.
    save_path = load_data().get_project_path() + '/data/cache/%s_%s.h5' % (filename, course_id)
    if os.path.exists(save_path):
        exam_score = reduce_mem_usage(pd.read_hdf(path_or_buf=save_path, mode='r', key=course_id))
    else:
        exam_score = load_data().get_train_s1(filename, tag)
        # Student / course lookup tables.
        student = get_student('student')
        course = get_course('course')
        # Merge gender and course_class information.
        exam_score = pd.merge(exam_score, student, how='left', on='student_id')
        exam_score = pd.merge(exam_score, course, how='left', on='course')
        # Keep only the requested course.
        exam_score = exam_score[exam_score['course'] == course_id]
        # Merge section/category/complexity knowledge features.
        exam_score = merge_all_knowledge(exam_score, course_id + '_exams')
        # Map exam_id to its ordinal position in the course's exam list.
        # Build the mapping ONCE instead of once per row inside the lambda
        # (the original rebuilt the dict for every element of the column).
        course_exams = get_course_exams(course_id + '_exams')
        exam_index = {exam: idx for idx, exam in enumerate(course_exams['exam_id'])}
        exam_score['exam_id'] = exam_score['exam_id'].map(lambda x: exam_index[x])
        # Per-student mean scores, used to fill zero (missing) scores.
        mean_value = get_mean_value(exam_score)
        # Collect the per-student groups and concatenate ONCE at the end:
        # the original pd.concat inside the loop was O(n^2) in the number
        # of students.  groupby iteration order (sorted by student_id) and
        # the final reset_index are preserved, so row order is unchanged.
        parts = []
        for student_id, group in exam_score.groupby('student_id'):
            group['score'].replace(0, mean_value[student_id], inplace=True)
            parts.append(group)
        exam_score = pd.concat(parts, axis=0).reset_index(drop=True)
        # log1p(score) smooths the target towards a Gaussian; downstream
        # predictions must be mapped back with expm1.
        exam_score["score"] = np.log1p(exam_score["score"])
        # Drop constant columns.  .ix was removed in pandas 1.0 — use
        # label-based .loc with a positional .iloc[0] reference row.
        exam_score = exam_score.loc[:, (exam_score != exam_score.iloc[0]).any()]
        if save is True:
            exam_score.to_hdf(path_or_buf=save_path, key=course_id)
    return exam_score
state['{}_acc'.format(mode)].append(correct / len(data_loader.dataset)) if __name__ == '__main__': # get arguments args = parse() ndegree = args.pdegree # set seeds torch.manual_seed(args.seed) if use_cuda: torch.cuda.manual_seed_all(args.seed) # load dataset train_loader, valid_loader, test_loader = \ load_data(args.batch_size, args.test_batch_size, args.dataset) # load model model = load_model(args.model, args.dataset) if use_cuda: model.cuda() # count total number of parameters model_parameters = filter(lambda p: p.requires_grad, model.parameters()) params = sum([np.prod(p.size()) for p in model_parameters]) print('\nTotal number of parameters: {}\n'.format(params)) print_model_setting(args) # set optimizer if args.optimizer == 'Adam':