def doit(): """ 将students表中的中文字符串改为数字,方便后续处理 @params @return """ from background_program.z_Tools.my_database import MyDataBase db = MyDataBase("软件学院") executer = db.getExcuter() sql = "" with open('data_clean.sql', 'r', encoding='UTF-8') as f: for line in f.readlines(): print(sql) sql = sql + line executer.execute(sql) db.close()
def clean_useless_data(): from background_program.z_Tools.my_database import MyDataBase # try: db = MyDataBase("软件学院") executer = db.getExcuter() sql = "SELECT *\ FROM\ students\ WHERE\ score IS NOT NULL\ AND canteen_times IS NOT NULL" executer.execute(sql) results = [list(i) for i in executer.fetchall()] for i in range(len(results)): for j in range(len(results[i])): if results[i][j] is None: results[i][j] = 0 sql = "insert into students_float values{0}".format(tuple(results[i])) executer.execute(sql) db.close()
class DataCarer(): def __init__(self, label_name, school_year, usage="regression"): ''' @params label_name:xxx...... @return ''' self.usages = ["regression", "classification"] self.usage = usage if usage not in self.usages: print('用法错误:%s' % usage) elif usage == "regression": self.table_name = "students_float" elif usage == "classification": self.table_name = "students_int" self.label_name = label_name self.school_year = school_year self.labellist = [] def create_train_dataSet(self): ''' 获取训练数据 @params @return numpy.mat X_train:特征,numpy.mat Y_train:标签 list labellist:特征项名称的集合 ''' """确定label是哪一列,并将其作为待预测对象""" self.open_database("软件学院") self.executer.execute("DESCRIBE {0}".format(self.table_name)) columnName = self.executer.fetchall() index = -1 for i in range(2, len(columnName)): if str(columnName[i][0]) == self.label_name: index = i else: self.labellist.append(columnName[i][0]) if index == -1: print('异常:未发现' + self.label_name) """先获得数据库表中的全部学生的数据""" self.executer.execute("select * from {0}".format(self.table_name)) dataSet = [] for i in self.executer.fetchall(): student = Student(student_num=i[0], features=list(i[2:index]) + list(i[index + 1:]), label=i[index]) dataSet.append(student.getAll()) dataSet = np.array(dataSet) self.close_database() dataSet = mat(dataSet) X_train, Y_train = mat(dataSet[:, :-1]), mat(dataSet[:, -1]) # self.pre_process(X_train) return X_train, Y_train def create_validate_dataSet(self): ''' 获取测试数据 @params @return list[Student] students:学生列表,numpy.mat X_test:特征 ''' """确定label是哪一列,并将其作为待预测对象""" self.open_database("软件学院") self.executer.execute("DESCRIBE {0}".format(self.table_name)) columnName = self.executer.fetchall() index = -1 for i in range(len(columnName)): if str(columnName[i][0]) == self.label_name: index = i break if index == -1: print('异常:未发现' + self.label_name) """获得所有学生的数据""" sql = "select * from {0} where right(student_num,4) in('{1}','2016')" self.executer.execute(sql.format(self.table_name, self.school_year)) students, X_test = [], [] for i in self.executer.fetchall(): student = Student(student_num=i[0], features=list(i[2:index]) + list(i[index + 1:]), label=i[index]) X_test.append(student.features) students.append(student) A = np.array(X_test) X_test = mat(X_test) self.close_database() self.pre_process(X_test) return students, X_test def pre_process(self, X): ''' Created on 2017年12月20日 对数据进行初级的预处理 @author: Jack @params @return ''' from background_program.b_Sample_processing.PreProcessing.MyImputer import MyImputer MyImputer().transformer.fit_transform(X) def get_feature_range(self, feature_name, label_name, label_min, label_max): self.open_database("软件学院") sql = 'select min({0}),max({0}) from {1} where {2} >= {3} and {2} < {4}'.\ format(feature_name, 'students_float', label_name, label_min, label_max) try: self.executer.execute(sql) result = self.executer.fetchone() feature_min, feature_max = result[0], result[1] self.close_database return feature_min, feature_max except: return 0, 0 def open_database(self, database_name): self.db = MyDataBase(database_name) self.executer = self.db.getExcuter() def close_database(self): self.executer.close() self.db.close()
class FeatureCalculater: ''' 特征计算器的父类 ''' def __init__(self, database="软件学院", feature_name=None): from background_program.z_Tools.my_database import MyDataBase self.db = MyDataBase(database) self.executer = self.db.getExcuter() self.feature_name = feature_name self.this_year = 2017 # 一个同学一个学年作为一个记录 def get_school_year(self, student_num): in_year = int(student_num[3:7]) school_year = [str(i) for i in range(in_year, self.this_year + 1)] return school_year @my_exception_handler def add_student(self, student_num): ''' 添加学生 @params string student_num:学生学号 @retrun ''' sql = "insert into students(student_num) values('{0}')".format( student_num) self.executer.execute(sql) @my_exception_handler def calculate(self): ''' 获取学号 所有子类都要实现这个函数 特征值的计算 @params @retrun ''' sql = "select distinct student_num,student_name,grade,student_type from subsidy" self.executer.execute(sql) result = self.executer.fetchall() student_types = ['普通高校本科学生', '硕士研究生', '交流生', '普通进修生', '硕士专业学位研究生'] student_type = 0 for re in result: count = int(re[2]) for s_i in range(len(student_types)): if re[3] == student_types[s_i]: student_type = s_i break if student_type == 0: while count <= int(re[2]) + 8 and count <= 2017: sql = "insert into students(student_num,student_name,student_grade,student_type) values(%s,%s,%s,%s)" self.executer.execute(sql, (str(re[0]) + str(count), re[1], str(re[2]), student_type)) count = count + 1 else: while count <= int(re[2]) + 8 and count <= 2017: sql = "insert into students(student_num,student_name,student_grade,student_type) values(%s,%s,%s,%s)" self.executer.execute(sql, (str(re[0]) + str(count), re[1], str(re[2]), student_type)) count = count + 1 @my_exception_handler def cluster(self, feature_min=None, feature_max=None, clusters=1): from sklearn.cluster import KMeans import numpy ''' 所有子类都要实现这个函数 对特征值进行聚类来归一化处理 @params @retrun ''' # 获得学生的数据 sql = 'SELECT {0} FROM students_float WHERE {1} is not null and {2} <> 0'.format( self.feature_name, self.feature_name, self.feature_name) count = self.executer.execute(sql) result = self.executer.fetchall() # count是行数 dataSet2 = list(result) dataSet = numpy.array([i[0] for i in result]).reshape(count, 1) # 聚类 kmeans = KMeans(n_clusters=clusters, random_state=0).fit(dataSet) center = kmeans.cluster_centers_ # 求出聚类中心 center_x = ['%0.4f' % center[i][0] for i in range(len(center))] labels = kmeans.labels_ # 标注每个点的聚类结果 # 写入student_rank表 types, maxx, minn = [[] for i in range(0, clusters)], [], [] dataSet_quene = list(set(dataSet2)) print(len(dataSet_quene)) for i in range(len(labels)): types[labels[i]].append(dataSet[i][0]) if len(dataSet_quene) > 0: for j in range(len(dataSet_quene)): if dataSet[i][0] == dataSet_quene[j]: del dataSet_quene[j] print(len(dataSet_quene)) sql = 'select student_num from students_float where {0}={1}'.format( self.feature_name, str((dataSet[i][0]))) self.executer.execute(sql) student_nums = self.executer.fetchall() for student_num in student_nums: student_num = student_num[0] sql = "update students_int set {0} = {1} where student_num = {2}".format( self.feature_name, str(labels[i] + 1), student_num) n_update = self.executer.execute(sql) if n_update == 0: try: sql = 'insert into students_int(student_num) values({0})'.format( student_num) self.executer.execute(sql) except: pass sql = "update students_int set {0} = {1} where student_num = {2}".format( self.feature_name, str(labels[i] + 1), student_num) self.executer.execute(sql) break # xxxxxx if feature_max is None and feature_min is None: for i in range(0, clusters): maxx.append(max(types[i])) minn.append(min(types[i])) maxx, minn, cent = sorted(maxx), sorted(minn), sorted( [float(i) for i in center_x]) for i in range(len(maxx) - 1): if maxx[i] < minn[i + 1]: temp = (float(maxx[i]) + float(minn[i + 1])) / 2.0 minn[i + 1] = (float(maxx[i]) + float(minn[i + 1])) / 2.0 maxx[i] = temp sql = 'SELECT max({0}) FROM students_float'.format( self.feature_name) self.executer.execute(sql) max_num = int(self.executer.fetchone()[0]) maxx[len(maxx) - 1] = max_num else: maxx, minn, cent = feature_max, feature_min, (feature_max + feature_min) / 2.0 # 将聚类范围保存 with open(r"Cluster_Feature", "a", encoding='utf8') as f: f.write(str(self.feature_name) + '\n') f.write( str(0) + ':' + str(0) + ' ' + str(0) + ' ' + str(minn[0]) + '\n') # 手动加入第一区间 for i in range(len(cent)): f.write( str(i + 1) + ':' + str(cent[i]) + ' ' + str(minn[i]) + ' ' + str(maxx[i]) + '\n') def tearDown(self): ''' teardown @params @retrun ''' self.db.close()