import os
import matplotlib.pyplot as plot

# Project helpers (load_data_from_file, getOneColumn, getMaxAndMin, checkThenMkdirs,
# get_final_score_map, normizeDataSet) and constants (DATA_ROOT_PATH, OUT_ROOT_PATH,
# DATA_MARK, _BOX_COUNT, NULL_OCCUPY) are defined elsewhere in this project.


def concatFeature(file_name_array, feature_name_array, mark):
    """Merge the requested feature columns from several data files into one
    CSV-style file, keyed by student uid."""
    data_map = {}
    feature_type_map = {}
    for file_name in file_name_array:
        target_source = os.path.join(mark, file_name)
        student_data, headerarray, headerTypeArray = load_data_from_file(
            target_source, needType=True)
        for index, featureName in enumerate(headerarray):
            if featureName in feature_name_array:
                feature_type_map[featureName] = headerTypeArray[index]
                for student_record in student_data:
                    uid = int(student_record[0])
                    if uid not in data_map:
                        data_map[uid] = {}
                    data_map[uid][featureName] = student_record[index]
    # sorted() returns a new list; the original discarded it, leaving the
    # records unsorted. Keep the sorted result instead.
    records = sorted(data_map.items())
    parentPath = DATA_ROOT_PATH + "/" + mark + "/"
    checkThenMkdirs(parentPath)
    output_filePath = parentPath + "concatfeature"
    with open(output_filePath, "w") as output_file:
        # header row: uId plus the feature names
        output_file.write("uId")
        for featureName in feature_name_array:
            output_file.write(",")
            output_file.write(featureName)
        output_file.write("\n")
        # type row: the uid column is a String, then each feature's declared type
        output_file.write("String")
        for featureName in feature_name_array:
            output_file.write(",")
            output_file.write(feature_type_map[featureName])
        output_file.write("\n")
        # data rows: only students that have every requested feature
        for uid, valueMap in records:
            if len(valueMap) == len(feature_name_array):
                output_file.write(str(uid))
                for featureName in feature_name_array:
                    output_file.write(",")
                    output_file.write(str(valueMap[featureName]))
                output_file.write("\n")
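
# Usage sketch (the file and feature names below are illustrative assumptions,
# not names confirmed by this module):
#
#   concatFeature(["codeStats", "editStats"],
#                 ["codeLineCount", "avgEditGap"],
#                 "2017Fall")
#
# would read <mark>/codeStats and <mark>/editStats, pick out the two named
# columns, and write DATA_ROOT_PATH/2017Fall/concatfeature with a header row,
# a type row, and one row per uid that has both features.
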
def drawDataDistribution(dataFileName, data_mark=None):
    """Draw an equal-width histogram (bar chart) for every feature column and
    save one figure per feature under OUT_ROOT_PATH/<data_mark>/distribution/."""
    if data_mark is None:
        data_mark = DATA_MARK
    _fileName = os.path.join(data_mark, dataFileName)
    student_data, headerArray = load_data_from_file(_fileName)
    featureCount = len(headerArray) - 1
    for columnIndex in range(1, featureCount + 1):
        data = getOneColumn(student_data, columnIndex)
        # renamed from max/min so the builtins are not shadowed
        _max, _min = getMaxAndMin(data)
        boxWidth = (_max - _min) / _BOX_COUNT
        x_tags = []
        rightBorders = []
        _left = _right = _min
        for _index in range(0, _BOX_COUNT):
            _left = _right
            _right += boxWidth
            rightBorders.append(_right)
            x_tags.append("[%.2f,%.2f)" % (_left, _right))
        x_counts = [0] * _BOX_COUNT
        for _value in data:
            for _index, _border in enumerate(rightBorders):
                if _value <= _border:
                    x_counts[_index] += 1
                    break
        # put any value that fell outside the borders into the last bin
        unTagCount = len(data)
        for _value in x_counts:
            unTagCount -= _value
        x_counts[_BOX_COUNT - 1] += unTagCount
        xIndex = range(_BOX_COUNT)
        plot.bar(xIndex, x_counts)
        plot.xticks(xIndex, x_tags, rotation=10, fontsize=8)
        # label each bar with its count
        for _a, _b in zip(xIndex, x_counts):
            plot.text(_a, _b + 0.05, str(_b), ha='center', va='bottom')
        title = headerArray[columnIndex]
        plot.title(title)
        parentPath = OUT_ROOT_PATH + "/" + data_mark + "/distribution/"
        checkThenMkdirs(parentPath)
        plot.savefig(parentPath + title)
        plot.clf()
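
# A minimal, self-contained sketch of the equal-width binning used above.
# The helper name and the direct-index formula are my own; the original walks
# the right borders linearly and then sweeps the leftovers into the last bin:
def _equal_width_counts(values, box_count):
    """Count values into box_count equal-width bins; overflow joins the last bin."""
    lo, hi = min(values), max(values)
    width = (hi - lo) / box_count
    counts = [0] * box_count
    for v in values:
        idx = int((v - lo) / width) if width > 0 else 0
        counts[min(idx, box_count - 1)] += 1  # clamp v == hi (and rounding spill)
    return counts
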
def drawDataWithScorePic(dataFileName, needNorminize=False, out_mark=None):
    """Scatter each feature against the final score and save one figure per
    feature under OUT_ROOT_PATH/<DATA_MARK>/scatterWithScore/.

    :param needNorminize: min-max normalize both the scores and the feature values
    :param out_mark: optional suffix appended to the saved file name
    """
    _fileName = os.path.join(DATA_MARK, dataFileName)
    student_data, headerArray = load_data_from_file(_fileName)
    _score_map = get_final_score_map(None)
    _score_array = []
    for _student_record in student_data:
        _score_array.append(_score_map[_student_record[0]])
    if needNorminize:
        _score_array = normizeDataSet(_score_array)
    # iterate over every feature column
    for columnIndex in range(1, len(headerArray)):
        data = getOneColumn(student_data, columnIndex)
        # (disabled) clamp avgRemoveErrorTime outliers at 300:
        # if headerArray[columnIndex] == "avgRemoveErrorTime":
        #     for index in range(len(data)):
        #         if data[index] > 300:
        #             data[index] = 300
        if needNorminize:
            data = normizeDataSet(dataSetA=data)
        plot.scatter(_score_array, data, s=2)
        title = headerArray[columnIndex] + "-score"
        if needNorminize:
            title += "-nominized"
        plot.title(title)
        plot.xlabel("score")
        plot.ylabel(headerArray[columnIndex])
        parentPath = OUT_ROOT_PATH + "/" + DATA_MARK + "/scatterWithScore/"
        checkThenMkdirs(parentPath)
        if out_mark is not None:
            title += "-" + out_mark
        plot.savefig(parentPath + title)
        plot.clf()
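
# normizeDataSet is defined elsewhere in the project; a plausible min-max
# sketch of what it is assumed to do (my assumption, not the confirmed
# implementation):
def _min_max_normalize(values):
    """Scale values linearly into [0, 1]; constant input maps to all zeros."""
    lo, hi = min(values), max(values)
    span = hi - lo
    return [0.0 if span == 0 else (v - lo) / span for v in values]
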
def concatAllFeature(file_name_array, mark):
    """Like concatFeature, but merges every non-ignored column found in the
    input files, filling missing values with NULL_OCCUPY."""
    data_map = {}
    feature_name_array = []
    feature_type_map = {}
    ignore_feature_array = [
        "finalTestScore", "buildCount", "useDebug", "longDeleteCount",
        "hasBuildError", "debugCount", "pasteCount", "totalLength"
    ]
    feature_count = 0
    for file_name in file_name_array:
        target_source = os.path.join(mark, file_name)
        student_data, headerarray, headerTypeArray = load_data_from_file(
            target_source, needType=True)
        # collect every non-ignored column name from this file's header
        for _header_index in range(1, len(headerarray)):
            if headerarray[_header_index] not in ignore_feature_array:
                feature_name_array.append(headerarray[_header_index])
        for index, featureName in enumerate(headerarray):
            if featureName in feature_name_array:
                feature_count += 1
                feature_type_map[featureName] = headerTypeArray[index]
                for student_record in student_data:
                    uid = int(student_record[0])
                    if uid not in data_map:
                        # pre-fill a new student with placeholders for every
                        # known feature so later columns line up
                        data_map[uid] = {}
                        for _ocuppy_featureName in feature_name_array:
                            data_map[uid][_ocuppy_featureName] = NULL_OCCUPY
                    data_map[uid][featureName] = student_record[index]
                # students absent from this file get a placeholder for this feature
                for uid in data_map:
                    if len(data_map[uid]) < feature_count:
                        data_map[uid][featureName] = NULL_OCCUPY
    # keep the sorted records (the original called sorted() and discarded the result)
    records = sorted(data_map.items())
    parentPath = DATA_ROOT_PATH + "/" + mark + "/"
    checkThenMkdirs(parentPath)
    output_filePath = parentPath + "concatfeature"
    with open(output_filePath, "w") as output_file:
        output_file.write("uId")
        for featureName in feature_name_array:
            output_file.write(",")
            output_file.write(featureName)
        output_file.write("\n")
        output_file.write("String")
        for featureName in feature_name_array:
            output_file.write(",")
            output_file.write(feature_type_map[featureName])
        output_file.write("\n")
        for uid, valueMap in records:
            if len(valueMap) == len(feature_name_array):
                output_file.write(str(uid))
                for featureName in feature_name_array:
                    output_file.write(",")
                    output_file.write(str(valueMap[featureName]))
                output_file.write("\n")
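
# Usage sketch (file names are illustrative assumptions): merging every
# non-ignored column from two per-term dumps into one table:
#
#   concatAllFeature(["codeStats", "editStats"], "2017Fall")
#
# Unlike concatFeature, callers do not list the feature names; any student
# missing a column ends up with NULL_OCCUPY in that cell.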