Example #1
0
def concatFeature(file_name_array, feature_name_array, mark):
    """Merge selected feature columns from several data files into one
    "concatfeature" CSV-style file keyed by student uid.

    Output layout: a name header row, a type row, then one row per
    student that has a value for every requested feature.

    :param file_name_array: data file names to read (resolved under ``mark``)
    :param feature_name_array: names of the feature columns to keep
    :param mark: data-set folder name; used both to locate the input
        files and to build the output directory under DATA_ROOT_PATH
    """
    # uid -> {featureName: value}
    data_map = {}
    # featureName -> declared type string taken from the file's type row
    feature_type_map = {}

    for file_name in file_name_array:
        target_source = os.path.join(mark, file_name)
        student_data, header_array, header_type_array = load_data_from_file(
            target_source, needType=True)

        for index, feature_name in enumerate(header_array):
            if feature_name not in feature_name_array:
                continue
            feature_type_map[feature_name] = header_type_array[index]
            for student_record in student_data:
                # first column of every record is the student uid
                uid = int(student_record[0])
                data_map.setdefault(uid, {})[feature_name] = student_record[index]

    # BUG FIX: the original called sorted(records) and discarded the
    # result, so output rows came out in arbitrary dict order; sort the
    # (uid, valueMap) pairs by uid as intended.
    records = sorted(data_map.items())

    parentPath = DATA_ROOT_PATH + "/" + mark + "/"
    checkThenMkdirs(parentPath)
    output_filePath = parentPath + "concatfeature"

    # "with" guarantees the handle is closed even if a write fails
    with open(output_filePath, "w") as output_file:
        # header row: uId,<feature names...>
        output_file.write("uId")
        for feature_name in feature_name_array:
            output_file.write("," + feature_name)
        output_file.write("\n")

        # type row: String,<feature types...>
        output_file.write("String")
        for feature_name in feature_name_array:
            output_file.write("," + feature_type_map[feature_name])
        output_file.write("\n")

        # data rows: only students that have every requested feature
        for uid, value_map in records:
            if len(value_map) == len(feature_name_array):
                output_file.write(str(uid))
                for feature_name in feature_name_array:
                    output_file.write("," + str(value_map[feature_name]))
                output_file.write("\n")
Example #2
0
def drawDataDistribution(dataFileName, data_mark=None):
    """Draw a bar-chart histogram of each feature column's value
    distribution and save one image per feature.

    :param dataFileName: name of the data file to load
    :param data_mark: data-set folder; defaults to DATA_MARK
    """
    if data_mark is None:
        data_mark = DATA_MARK
    _fileName = os.path.join(data_mark, dataFileName)
    student_data, headerArray = load_data_from_file(_fileName)

    # column 0 is the uid, so feature columns start at index 1
    featureCount = len(headerArray) - 1
    for columnIndex in range(1, featureCount + 1):
        data = getOneColumn(student_data, columnIndex)
        # IDIOM FIX: the original bound these to `max` and `min`,
        # shadowing the builtins for the rest of the loop body.
        _max, _min = getMaxAndMin(data)
        boxWidth = (_max - _min) / _BOX_COUNT

        # build _BOX_COUNT equal-width half-open bins [_left, _right)
        x_tags = []
        rightBorders = []
        _left = _right = _min
        for _ in range(_BOX_COUNT):
            _left = _right
            _right += boxWidth
            rightBorders.append(_right)
            x_tags.append("[%.2f,%.2f)" % (_left, _right))

        # count values into the first bin whose right border covers them
        x_counts = [0] * _BOX_COUNT
        for _value in data:
            for _index, _border in enumerate(rightBorders):
                if _value <= _border:
                    x_counts[_index] += 1
                    break

        # fold values missed by every bin (float rounding can push the
        # maximum past the last border) into the last bin
        unTagCount = len(data) - sum(x_counts)
        x_counts[_BOX_COUNT - 1] += unTagCount

        xIndex = range(_BOX_COUNT)
        plot.bar(xIndex, x_counts)
        plot.xticks(xIndex, x_tags, rotation=10, fontsize=8)
        # annotate each bar with its count, just above the bar top
        for _a, _b in zip(xIndex, x_counts):
            plot.text(_a, _b + 0.05, str(_b), ha='center', va='bottom')

        title = headerArray[columnIndex]
        plot.title(title)
        parentPath = OUT_ROOT_PATH + "/" + data_mark + "/distribution/"
        checkThenMkdirs(parentPath)
        plot.savefig(parentPath + title)
        plot.clf()
Example #3
0
def drawDataWithScorePic(dataFileName, needNorminize=False, out_mark=None):
    """Draw a scatter plot of each feature against the final score and
    save one image per feature.

    :param dataFileName: name of the data file to load (under DATA_MARK)
    :param needNorminize: if True, normalize both the score array and
        each feature column before plotting.  (The original docstring
        documented this under the misspelled name "needNormize".)
    :param out_mark: optional suffix appended to each output file name
    """
    _fileName = os.path.join(DATA_MARK, dataFileName)
    student_data, headerArray = load_data_from_file(_fileName)

    _score_map = get_final_score_map(None)

    # final score for each student, aligned with student_data rows
    # (record[0] is the student uid)
    _score_array = [_score_map[_record[0]] for _record in student_data]
    if needNorminize:
        _score_array = normizeDataSet(_score_array)

    # iterate over every feature column (column 0 is the uid)
    for columnIndex in range(1, len(headerArray)):
        data = getOneColumn(student_data, columnIndex)
        if needNorminize:
            data = normizeDataSet(dataSetA=data)

        plot.scatter(_score_array, data, s=2)
        title = headerArray[columnIndex] + "-score"
        if needNorminize:
            title += "-nominized"
        plot.title(title)
        plot.xlabel("score")
        plot.ylabel(headerArray[columnIndex])

        parentPath = OUT_ROOT_PATH + "/" + DATA_MARK + "/scatterWithScore/"
        checkThenMkdirs(parentPath)
        if out_mark is not None:
            title += "-" + out_mark
        plot.savefig(parentPath + title)
        plot.clf()
Exemple #4
0
def concatAllFeature(file_name_array, mark):
    """Merge every feature column (except a fixed ignore list) from
    several data files into one "concatfeature" file keyed by student uid.

    Students that are missing a feature value get the NULL_OCCUPY
    placeholder so the placeholder-fill bookkeeping stays consistent.

    :param file_name_array: data file names to read (resolved under ``mark``)
    :param mark: data-set folder name; used both to locate the input
        files and to build the output directory under DATA_ROOT_PATH
    """
    data_map = {}            # uid -> {featureName: value}
    feature_name_array = []  # kept feature names, in file/column order
    feature_type_map = {}    # featureName -> declared type string
    ignore_feature_array = [
        "finalTestScore", "buildCount", "useDebug", "longDeleteCount",
        "hasBuildError", "debugCount", "pasteCount", "totalLength"
    ]
    feature_count = 0
    for file_name in file_name_array:
        target_source = os.path.join(mark, file_name)
        student_data, header_array, header_type_array = load_data_from_file(
            target_source, needType=True)

        # collect every non-ignored feature name (column 0 is the uid)
        for _header_index in range(1, len(header_array)):
            if header_array[_header_index] not in ignore_feature_array:
                feature_name_array.append(header_array[_header_index])

        for index, feature_name in enumerate(header_array):
            if feature_name not in feature_name_array:
                continue
            feature_count += 1
            feature_type_map[feature_name] = header_type_array[index]
            for student_record in student_data:
                uid = int(student_record[0])
                if uid not in data_map:
                    # pre-fill a new student with placeholders so
                    # every known feature has an entry
                    data_map[uid] = {}
                    for _occupy_feature_name in feature_name_array:
                        data_map[uid][_occupy_feature_name] = NULL_OCCUPY
                data_map[uid][feature_name] = student_record[index]

            # students absent from this file still get a placeholder
            # for the feature just processed
            for uid in data_map:
                if len(data_map[uid]) < feature_count:
                    data_map[uid][feature_name] = NULL_OCCUPY

    # BUG FIX: the original called sorted(records) and discarded the
    # result, so output rows came out in arbitrary dict order; sort the
    # (uid, valueMap) pairs by uid as intended.
    records = sorted(data_map.items())

    parentPath = DATA_ROOT_PATH + "/" + mark + "/"
    checkThenMkdirs(parentPath)
    output_filePath = parentPath + "concatfeature"

    # "with" guarantees the handle is closed even if a write fails
    with open(output_filePath, "w") as output_file:
        # header row: uId,<feature names...>
        output_file.write("uId")
        for feature_name in feature_name_array:
            output_file.write("," + feature_name)
        output_file.write("\n")

        # type row: String,<feature types...>
        output_file.write("String")
        for feature_name in feature_name_array:
            output_file.write("," + feature_type_map[feature_name])
        output_file.write("\n")

        # data rows: only students that have every collected feature
        for uid, value_map in records:
            if len(value_map) == len(feature_name_array):
                output_file.write(str(uid))
                for feature_name in feature_name_array:
                    output_file.write("," + str(value_map[feature_name]))
                output_file.write("\n")