Esempio n. 1
0
def compute(compared_file, input_file):
    save_result = 'output//Result_' + compared_file.split('\\')[-1].split(
        '.')[0] + '_' + input_file.split('\\')[-1].split('.')[0] + '.xlsx'
    try:
        file = xlwt.Workbook()
        sheet1 = file.add_sheet('similarity', cell_overwrite_ok=True)
    except IOError:
        return "Error: Failed to create file"

    specialAlleleList = ['DYS385', 'DYF387S1']
    sampleName_input, _, alleleList_input, data_input = readData.readExcelData(
        input_file, specialAlleleList, SHUFFLE=False)
    sampleName_compared, _, alleleList_compared, data_compared = readData.readExcelData(
        compared_file, specialAlleleList, SHUFFLE=False)

    # sheet1.write(0, 0, "")
    for i in range(len(data_compared)):
        sheet1.write(i + 1, 0, sampleName_compared[i])
        for j in range(len(data_input)):
            sheet1.write(0, j + 1, sampleName_input[j])
            sheet1.write(
                i + 1, j + 1,
                str(round(cos_sim(data_compared[i], data_input[j]), 5)))
    try:
        file.save(save_result)
    except IOError:
        return "Error: File not found or failed to read "
    else:
        print(save_result + " has been written.")
        return save_result + " has been written."
Esempio n. 2
0
    def sl_getDataFile(self):
        text = self.ui.te_specialAllele.toPlainText()

        global alleleList, file, exampleList
        alleleList = []
        file = []
        exampleList = []
        file, _ = QFileDialog.getOpenFileName(
            self, "open example file", "",
            "Excel File (*.xlsx);;Excel File (*.xls);;Text File(*.txt)")
        if len(file) == 0:
            return
        self.ui.le_data.setText(file)
        #check specialAllele isempty
        if len(text) > 0:
            specialAlleleList = re.split('[ ,.\n\t]', text)
        else:
            specialAlleleList = []
        # specialAlleleList = []

        #check type of file
        typeFile = os.path.splitext(file)[-1]
        if typeFile in ['.xlsx', '.xls']:
            exampleList, alleleList, alleleData = readData.readExcelData(
                file, specialAlleleList)

        else:
            exampleList, alleleList, alleleData = readData.readTxtData(
                file, specialAlleleList)

        self.grouping(exampleList, alleleData)
def compute(train_path, compared_file, input_file):
    ml_method = [
        'knn', 'naiveBayes', 'logisticRegression', 'svm', 'decesionTree',
        'randomForest'
    ]
    save_result = 'output//SimilarityResult_DATABASE_' + compared_file.split(
        '\\')[-1].split('.')[0] + '_' + input_file.split('\\')[-1].split(
            '.')[0] + '.xlsx'
    try:
        file = openpyxl.Workbook()
        sheet1 = file.create_sheet('similarity')
    except IOError:
        return "Error: Failed to create file"

    sampleName_input, _, alleleList_input, data_input = readData.readExcelData(
        input_file, SHUFFLE=False)
    sampleName_compared, _, alleleList_compared, data_compared = readData.readExcelData(
        compared_file, SHUFFLE=False)

    proba_compared = list()
    proba_input = list()

    for k in range(6):
        model = os.path.join(train_path, ml_method[k] + '.model')
        proba_input.append(load_compute(model, data_input))
        proba_compared.append(load_compute(model, data_compared))

    line_num = 1
    sheet1.cell(line_num, 1).value = 'method'
    for k in range(6):
        sheet1.cell(line_num + 1, 1).value = ml_method[k]
        for i in range(len(data_compared)):
            sheet1.cell(line_num + 1, 2).value = sampleName_compared[i]
            line_num += 1
            for j in range(len(data_input)):
                sheet1.cell(1, j + 3).value = sampleName_input[j]
                sheet1.cell(line_num, j + 3).value = str(
                    round(cos_sim(proba_compared[k][i], proba_input[k][j]), 5))
    try:
        file.save(save_result)
    except IOError:
        return "Error: File not found or failed to read "
    else:
        print(save_result + " has been written.")
        return save_result + " has been written."
Esempio n. 4
0
def train(train_path):
    len_train_file = len(os.listdir(train_path))
    dirs = os.listdir(train_path)
    specialAlleleList = ['DYS385', 'DYF387S1']

    for file_num in range(len_train_file):
        train_file = os.path.join(train_path, dirs[file_num])
        save_model_path = 'output/train_model/' + train_path.split(
            '\\')[-1] + '/'
        if not os.path.exists(save_model_path):
            os.makedirs(save_model_path)
        alleleGroupName, alleleList, alleleData, data, groupName, shuffleIndex = readData.readExcelData(
            train_file, specialAlleleList, SHUFFLE=True)

        x_train = alleleData
        y_train = alleleGroupName

        #knn
        knn = KNeighborsClassifier(1).fit(x_train, y_train)
        joblib.dump(
            knn, save_model_path + dirs[file_num].split('.')[0] + '_knn.model')

        #naiveBayes
        gnb = GaussianNB().fit(x_train, y_train)
        joblib.dump(
            gnb, save_model_path + dirs[file_num].split('.')[0] +
            '_naiveBayes.model')

        #logisticRegression
        lr = LogisticRegression(penalty="l2",
                                C=1,
                                multi_class="ovr",
                                solver="newton-cg",
                                max_iter=1000).fit(x_train, y_train)
        joblib.dump(
            lr, save_model_path + dirs[file_num].split('.')[0] +
            '_logisticRegression.model')

        #SVM
        svm_svm = svm.SVC(probability=True).fit(x_train, y_train)
        joblib.dump(
            svm_svm,
            save_model_path + dirs[file_num].split('.')[0] + '_svm.model')

        #decesionTree
        dt = DecisionTreeClassifier(criterion='entropy').fit(x_train, y_train)
        joblib.dump(
            dt, save_model_path + dirs[file_num].split('.')[0] +
            '_decesionTree.model')

        #randomForest
        rf = RandomForestClassifier(n_estimators=170).fit(x_train, y_train)
        joblib.dump(
            rf, save_model_path + dirs[file_num].split('.')[0] +
            '_randomForest.model')
def test(mode, train_path, test_file):
    ml_method = [
        'knn', 'naiveBayes', 'logisticRegression', 'svm', 'decesionTree',
        'randomForest'
    ]
    len_train_file = len(os.listdir(train_path)) if mode == "others" else 1
    dirs = os.listdir(train_path)
    if (mode == "DATABASE"):
        save_model_path = train_path + '/'
        save_result = 'output/PredictResult_DATABASE_' + test_file.split(
            '\\')[-1].split('.')[0] + '.xlsx'

    else:
        save_model_path = 'output/train_model/' + train_path.split(
            '\\')[-1] + '/'
        if not os.path.exists(save_model_path):
            os.makedirs(save_model_path)
        save_result = 'output/PredictResult_' + train_path.split(
            '\\')[-1] + '_' + test_file.split('\\')[-1].split('.')[0] + '.xlsx'
    try:
        file = openpyxl.Workbook()
        sheet1 = file.create_sheet('predict')
    except IOError:
        return "Error: Failed to create file"

    sampleName_test, alleleGroupName_test, alleleList_test, data_test = readData.readExcelData(
        test_file, SHUFFLE=False)

    line_num = 0
    for file_num in range(len_train_file):
        sheet1.cell(
            line_num + 1,
            0 + 1).value = dirs[file_num] if mode == "others" else "DATABASE"
        line_num += 1
        for data_num in range(len(data_test)):
            sheet1.cell(line_num + 1, data_num + 1 + 1).value = str(
                sampleName_test[data_num])
        line_num += 1
        x_test = data_test

        for k in range(6):
            if (mode == "others"):
                model = os.path.join(
                    save_model_path, dirs[file_num].split('.')[0] + '_' +
                    ml_method[k] + '.model')
            else:
                model = os.path.join(save_model_path, ml_method[k] + '.model')

            try:
                fout = open(model)
            except:
                print(model + ' do not exist. Please train first.')
                return
            sheet1, line_num = load_pred(model, x_test, len(data_test), sheet1,
                                         line_num, ml_method[k])

        # #knn
        # model = os.path.join(save_model_path,dirs[file_num].split('.')[0]+'_knn.model')
        #
        # #naiveBayes
        # model = os.path.join(save_model_path,dirs[file_num].split('.')[0]+'_naiveBayes.model')
        # sheet1, line_num=load_pred(mode,model,train_path,x_test,len(data_test),sheet1,line_num,"naiveBayes")
        #
        # #logisticRegression
        # model = os.path.join(save_model_path, dirs[file_num].split('.')[0] + '_logisticRegression.model')
        # sheet1, line_num=load_pred(mode,model,train_path,x_test,len(data_test),sheet1,line_num,"logisticRegression")
        #
        # #SVM
        # model = os.path.join(save_model_path, dirs[file_num].split('.')[0] + '_svm.model')
        # sheet1, line_num=load_pred(mode,model,train_path,x_test,len(data_test),sheet1,line_num,"svm")
        #
        # #decesionTree
        # model = os.path.join(save_model_path, dirs[file_num].split('.')[0] + '_decesionTree.model')
        # sheet1, line_num=load_pred(mode,model,train_path,x_test,len(data_test),sheet1,line_num,"decesionTree")
        #
        # #randomForest
        # model = os.path.join(save_model_path, dirs[file_num].split('.')[0] + '_randomForest.model')
        # sheet1, line_num=load_pred(mode,model,train_path,x_test,len(data_test),sheet1,line_num,"randomForest")
    try:
        file.save(save_result)
    except IOError:
        return "Error: File not found or failed to read "
    else:
        print(save_result + " has been written.")
        return save_result + " has been written."
Esempio n. 6
0
def func(train_path, test_file):
    #readdata
    len_train_file = len(os.listdir(train_path))
    dirs = os.listdir(train_path)

    # write
    try:
        file = xlwt.Workbook()
        sheet1 = file.add_sheet('predict', cell_overwrite_ok=True)
        sheet2 = file.add_sheet('similarity', cell_overwrite_ok=True)
    except IOError:
        return "Error: Failed to create file"

    score_result = [[0 for i in range(6)] for j in range(len_train_file)]
    specialAlleleList = ['DYS385', 'DYF387S1']

    alleleGroupName_test, alleleList_test, data_test = readData.readExcelData(
        test_file, specialAlleleList, SHUFFLE=False)

    result = [[] for _ in range(len_train_file)]
    line_num = 0
    for file_num in range(1):
        sheet1.write(line_num, 0, dirs[file_num])
        sheet2.write(line_num, 0, dirs[file_num])
        line_num += 1

        train_file = os.path.join(train_path, dirs[file_num])
        print(train_file)
        alleleGroupName, alleleList, alleleData, data, groupName, shuffleIndex = readData.readExcelData(
            train_file, specialAlleleList, SHUFFLE=True)
        compare_result = [[] for i in range(6)]
        original_compare_result = [[] for i in range(len(data))]

        loop_num = 1
        x_train = alleleData
        x_test = data_test
        y_train = alleleGroupName
        unique_group = np.unique(y_train)

        #knn
        knn = KNeighborsClassifier(1).fit(x_train, y_train)
        knn_pred = knn.predict(x_test)
        knn_proba = knn.predict_proba(x_test)
        result[file_num].append(knn_pred)
        for jj in range(len(data_test)):
            # sheet1.write(line_num, jj, knn_pred[jj])
            sheet1.write(
                line_num, jj,
                str(knn_pred[jj]) + '(' + str(round(max(knn_proba[jj]), 3)) +
                ')')
        for jj in range(1, len(data_test)):
            sheet2.write(line_num, jj - 1, cos_sim(knn_proba[jj],
                                                   knn_proba[0]))
        line_num += 1

        #naiveBayes
        gnb = GaussianNB().fit(x_train, y_train)
        # predict_score = gnb.predict_proba(x_test)
        gbn_pred = gnb.predict(x_test)
        gbn_proba = gnb.predict_proba(x_test)
        result[file_num].append(gbn_pred)
        for jj in range(len(data_test)):
            sheet1.write(
                line_num, jj,
                str(gbn_pred[jj]) + '(' + str(round(max(gbn_proba[jj]), 3)) +
                ')')
        for jj in range(1, len(data_test)):
            sheet2.write(line_num, jj - 1, cos_sim(gbn_proba[jj],
                                                   gbn_proba[0]))
        line_num += 1
        #logisticRegression
        lr = LogisticRegression(penalty="l2",
                                C=1,
                                multi_class="ovr",
                                solver="newton-cg",
                                max_iter=1000).fit(x_train, y_train)
        # print(lr.predict_proba(x_test))
        lr_pred = lr.predict(x_test)
        lr_proba = lr.predict_proba(x_test)
        result[file_num].append(lr_pred)
        for jj in range(len(data_test)):
            sheet1.write(
                line_num, jj,
                str(lr_pred[jj]) + '(' + str(round(max(lr_proba[jj]), 3)) +
                ')')
        for jj in range(1, len(data_test)):
            sheet2.write(line_num, jj - 1, cos_sim(lr_proba[jj], lr_proba[0]))
        line_num += 1

        #SVM
        svm_svm = svm.SVC(probability=True)
        svm_svm.fit(x_train, y_train)
        svm_pred = svm_svm.predict(x_test)
        svm_proba = svm_svm.predict_proba(x_test)
        result[file_num].append(svm_pred)
        for jj in range(len(data_test)):
            sheet1.write(
                line_num, jj,
                str(svm_pred[jj]) + '(' + str(round(max(svm_proba[jj]), 3)) +
                ')')
        for jj in range(1, len(data_test)):
            sheet2.write(line_num, jj - 1, cos_sim(svm_proba[jj],
                                                   svm_proba[0]))
        line_num += 1
        #decesionTree
        dt = DecisionTreeClassifier(criterion='entropy')
        dt.fit(x_train, y_train)
        dt_pred = dt.predict(x_test)
        dt_proba = dt.predict_proba(x_test)
        result[file_num].append(dt_pred)
        for jj in range(len(data_test)):
            sheet1.write(
                line_num, jj,
                str(svm_pred[jj]) + '(' + str(round(max(dt_proba[jj]), 3)) +
                ')')
            # sheet1.write(line_num, jj, svm_pred[jj])
        for jj in range(1, len(data_test)):
            sheet2.write(line_num, jj - 1, cos_sim(dt_proba[jj], dt_proba[0]))
        line_num += 1

        #randomForest
        rf = RandomForestClassifier(n_estimators=170)
        rf.fit(x_train, y_train)
        rf_pred = rf.predict(x_test)
        rf_proba = rf.predict_proba(x_test)
        result[file_num].append(rf_pred)
        for jj in range(len(data_test)):
            sheet1.write(
                line_num, jj,
                str(rf_pred[jj]) + '(' + str(round(max(rf_proba[jj]), 3)) +
                ')')
        for jj in range(1, len(data_test)):
            sheet2.write(line_num, jj - 1, cos_sim(rf_proba[jj], rf_proba[0]))
        line_num += 1
        loop_num += 1

    try:
        file.save('output//Result_' + train_path.split('\\')[-1] + '_' +
                  test_file.split('\\')[-1].split('.')[0] + '.xlsx')
    except IOError:
        return "Error: File not found or failed to read "
    else:
        print("'result.xlsx' has been written.")
        return "'result.xlsx' has been written."
Esempio n. 7
0
def test(train_path, test_file):
    len_train_file = len(os.listdir(train_path))
    dirs = os.listdir(train_path)
    save_model_path = 'output/train_model/' + train_path.split('\\')[-1] + '/'
    save_result = 'output//Result_' + train_path.split(
        '\\')[-1] + '_' + test_file.split('\\')[-1].split('.')[0] + '.xlsx'
    try:
        file = xlwt.Workbook()
        sheet1 = file.add_sheet('predict', cell_overwrite_ok=True)
        # sheet2 = file.add_sheet('similarity', cell_overwrite_ok=True)
    except IOError:
        return "Error: Failed to create file"

    specialAlleleList = ['DYS385', 'DYF387S1']
    sampleName_test, alleleGroupName_test, alleleList_test, data_test = readData.readExcelData(
        test_file, specialAlleleList, SHUFFLE=False)

    line_num = 0
    for file_num in range(len_train_file):
        sheet1.write(line_num, 0, dirs[file_num])
        line_num += 1
        # sheet2.write(line_num, 0, dirs[file_num])
        for data_num in range(len(data_test)):
            sheet1.write(line_num, data_num + 1,
                         str(sampleName_test[data_num]))
            # sheet2.write(line_num, data_num+1, str(sampleName_test[data_num]))
        line_num += 1
        x_test = data_test

        #knn
        model = os.path.join(save_model_path,
                             dirs[file_num].split('.')[0] + '_knn.model')
        sheet1, line_num = load_pred(model, train_path, x_test, len(data_test),
                                     sheet1, line_num, "knn")

        #naiveBayes
        model = os.path.join(
            save_model_path,
            dirs[file_num].split('.')[0] + '_naiveBayes.model')
        sheet1, line_num = load_pred(model, train_path, x_test, len(data_test),
                                     sheet1, line_num, "naiveBayes")

        #logisticRegression
        model = os.path.join(
            save_model_path,
            dirs[file_num].split('.')[0] + '_logisticRegression.model')
        sheet1, line_num = load_pred(model, train_path, x_test, len(data_test),
                                     sheet1, line_num, "logisticRegression")

        #SVM
        model = os.path.join(save_model_path,
                             dirs[file_num].split('.')[0] + '_svm.model')
        sheet1, line_num = load_pred(model, train_path, x_test, len(data_test),
                                     sheet1, line_num, "svm")

        #decesionTree
        model = os.path.join(
            save_model_path,
            dirs[file_num].split('.')[0] + '_decesionTree.model')
        sheet1, line_num = load_pred(model, train_path, x_test, len(data_test),
                                     sheet1, line_num, "decesionTree")

        #randomForest
        model = os.path.join(
            save_model_path,
            dirs[file_num].split('.')[0] + '_randomForest.model')
        sheet1, line_num = load_pred(model, train_path, x_test, len(data_test),
                                     sheet1, line_num, "randomForest")
    try:
        file.save(save_result)
    except IOError:
        return "Error: File not found or failed to read "
    else:
        print(save_result + " has been written.")
        return save_result + " has been written."