def calculateMIC(dataFileArray, data_mark=None, neadNorm=False):
    """Print the features whose MIC against the score is above the mean MIC.

    For every file in ``dataFileArray`` the records are loaded, the score of
    each record is looked up (keyed by the record's first field) in the map
    returned by ``get_final_score_map()``, and each feature column
    (columns 1..N of the header) is scored against that score list with a
    ``MINE`` estimator. Results from all files are collected in one mapping
    keyed by header name, so a header that appears in several files keeps
    the value from the last file processed.

    :param dataFileArray: iterable of data file names, each joined onto
        ``data_mark`` to build the path that is loaded.
    :param data_mark: directory component of the path; ``DATA_MARK`` is used
        when this is ``None``.
    :param neadNorm: when truthy, both the score list and every feature
        column go through ``normizeDataSet`` before scoring.
    :return: ``None``; matching ``header value`` pairs are printed in
        descending order of MIC.
    """
    mic_by_header = {}

    for file_name in dataFileArray:
        # The default is resolved on first use, exactly where the path is built.
        if data_mark is None:
            data_mark = DATA_MARK
        records, headerArray = load_data_from_file(
            os.path.join(data_mark, file_name)
        )

        score_lookup = get_final_score_map()
        scores = [score_lookup[record[0]] for record in records]
        if neadNorm:
            scores = normizeDataSet(scores)

        # One estimator per file; it is re-fed for every feature column.
        estimator = MINE()
        # Column 0 holds the key used for the score lookup, so features start at 1.
        for column in range(1, len(headerArray)):
            feature_values = getOneColumn(records, column)
            if neadNorm:
                feature_values = normizeDataSet(feature_values)
            estimator.compute_score(feature_values, scores)
            mic_by_header[headerArray[column]] = estimator.mic()

    # Report only the features that score above the average of all collected values.
    threshold = np.mean(list(mic_by_header.values()))
    ranked = sorted(
        mic_by_header.items(), key=lambda pair: pair[1], reverse=True
    )
    for header, value in ranked:
        if value > threshold:
            print(header, value)
def drawDataWithScorePic(dataFileName, needNorminize=False, out_mark=None):
    """Save one scatter plot per feature, plotting the feature against the score.

    The file ``DATA_MARK/dataFileName`` is loaded, the score of each record
    is looked up (keyed by the record's first field) in the map returned by
    ``get_final_score_map(None)``, and for every feature column
    (columns 1..N of the header) a scatter plot with the score on the x axis
    and the feature on the y axis is written under
    ``OUT_ROOT_PATH/DATA_MARK/scatterWithScore/``.

    :param dataFileName: name of the data file inside ``DATA_MARK``.
    :param needNorminize: when truthy, the scores and every feature column
        go through ``normizeDataSet`` and ``-nominized`` is appended to the
        plot title.
    :param out_mark: optional suffix appended (after a ``-``) to the saved
        file name only; the title drawn on the figure is not affected.
    :return: ``None``; the figures are saved as a side effect.
    """
    records, headerArray = load_data_from_file(
        os.path.join(DATA_MARK, dataFileName)
    )

    score_lookup = get_final_score_map(None)
    scores = [score_lookup[record[0]] for record in records]
    if needNorminize:
        scores = normizeDataSet(scores)

    # Walk over every feature; column 0 holds the key used for the score lookup.
    for column in range(1, len(headerArray)):
        feature_values = getOneColumn(records, column)
        # NOTE: a disabled experiment used to clamp the "avgRemoveErrorTime"
        # feature to a maximum of 300 at this point before plotting.
        if needNorminize:
            feature_values = normizeDataSet(dataSetA=feature_values)

        feature_name = headerArray[column]
        plot.scatter(scores, feature_values, s=2)

        title = feature_name + "-score"
        if needNorminize:
            title += "-nominized"
        plot.title(title)
        plot.xlabel("score")
        plot.ylabel(feature_name)

        out_dir = OUT_ROOT_PATH + "/" + DATA_MARK + "/scatterWithScore/"
        checkThenMkdirs(out_dir)

        # The figure title doubles as the file name, optionally tagged with out_mark.
        file_stem = title if out_mark is None else title + "-" + out_mark
        plot.savefig(out_dir + file_stem)
        # Clear the figure so the next feature starts from an empty canvas.
        plot.clf()
def calculateSpearman(dataFileName, data_mark=None, neadNorm=False):
    """Print the ``spearmanr`` result of every feature column against the score.

    The file ``data_mark/dataFileName`` is loaded, the score of each record
    is looked up (keyed by the record's first field) in the map returned by
    ``get_final_score_map()``, and each feature column (columns 1..N of the
    header) is passed to ``spearmanr`` together with the score list.

    :param dataFileName: name of the data file inside ``data_mark``.
    :param data_mark: directory component of the path; ``DATA_MARK`` is used
        when this is ``None``.
    :param neadNorm: when truthy, both the score list and every feature
        column go through ``normizeDataSet`` first.
    :return: ``None``; one line per feature is printed with the header name
        followed by the two values returned by ``spearmanr``.
    """
    if data_mark is None:
        data_mark = DATA_MARK
    records, headerArray = load_data_from_file(
        os.path.join(data_mark, dataFileName)
    )

    score_lookup = get_final_score_map()
    scores = [score_lookup[record[0]] for record in records]
    if neadNorm:
        scores = normizeDataSet(scores)

    # Column 0 holds the key used for the score lookup, so features start at 1.
    for column in range(1, len(headerArray)):
        feature_values = getOneColumn(records, column)
        if neadNorm:
            feature_values = normizeDataSet(feature_values)
        # NOTE(review): assuming this is scipy.stats.spearmanr, the first
        # value is the correlation coefficient and the second the p-value.
        correlation, p_value = spearmanr(feature_values, scores)
        print(headerArray[column], correlation, p_value)