Example #1
import utilities
import preprocessing
from sklearn import svm
from sklearn.model_selection import StratifiedKFold, cross_validate


def compare_penalties(fileData):
    params = utilities.Parameters(lowerCaseFlag=False,
                                  removeStopWordsFlag=False,
                                  stemFlag=False,
                                  maxFeatures=7363,
                                  ngramRange=(1, 1),
                                  tfidfFlags=(False, False))

    # LinearSVC supports the L1 penalty only with the primal formulation,
    # hence dual=False for the L1 variant.
    svmL1 = svm.LinearSVC(penalty='l1', dual=False)
    svmL2 = svm.LinearSVC(penalty='l2')

    Corpus, pipelineL1 = utilities.getInfoFromParameters(
        fileData, params, svmL1)
    Corpus, pipelineL2 = utilities.getInfoFromParameters(
        fileData, params, svmL2)

    splits = 10
    outer_cv = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)

    # Cross-validate the L1 and the L2 estimator, collecting accuracy and macro F1 for every fold.
    scoring = ['accuracy', 'f1_macro']
    scoresL2 = cross_validate(pipelineL2,
                              X=Corpus[preprocessing.COMMENT],
                              y=Corpus[preprocessing.CLASS],
                              scoring=scoring,
                              cv=outer_cv)
    scoresL1 = cross_validate(pipelineL1,
                              X=Corpus[preprocessing.COMMENT],
                              y=Corpus[preprocessing.CLASS],
                              scoring=scoring,
                              cv=outer_cv)

    # Accumulate the per-fold scores into index 0; the divisions below turn
    # index 0 into the mean score over all folds.
    for i in range(1, splits):
        scoresL1['test_accuracy'][0] += scoresL1['test_accuracy'][i]
        scoresL1['test_f1_macro'][0] += scoresL1['test_f1_macro'][i]
        scoresL2['test_accuracy'][0] += scoresL2['test_accuracy'][i]
        scoresL2['test_f1_macro'][0] += scoresL2['test_f1_macro'][i]

    scoresL1['test_accuracy'][0] /= splits
    scoresL1['test_f1_macro'][0] /= splits
    scoresL2['test_accuracy'][0] /= splits
    scoresL2['test_f1_macro'][0] /= splits

    print("L1 accuracy: ", scoresL1['test_accuracy'][0], " - L2 accuracy: ",
          scoresL2['test_accuracy'][0])
    print("L1 F1: ", scoresL1['test_f1_macro'][0], " - L2 F1: ",
          scoresL2['test_f1_macro'][0])
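
The fold-by-fold accumulation above mutates the score arrays in place. Since cross_validate returns each metric as a NumPy array of per-fold scores, the same averages can be read off directly; a minimal sketch using the scoresL1/scoresL2 dictionaries from this example:

    print("L1 accuracy:", scoresL1['test_accuracy'].mean(),
          "- L2 accuracy:", scoresL2['test_accuracy'].mean())
    print("L1 F1:", scoresL1['test_f1_macro'].mean(),
          "- L2 F1:", scoresL2['test_f1_macro'].mean())
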
Example #2
import utilities
import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate


def compare_penalties(fileData):
    params = utilities.Parameters(lowerCaseFlag=False,
                                  removeStopWordsFlag=False,
                                  stemFlag=False,
                                  maxFeatures=7363,
                                  ngramRange=(1, 1),
                                  tfidfFlags=(False, False))

    # LogisticRegression supports the L1 penalty only with the 'liblinear'
    # and 'saga' solvers; the L2 variant keeps the default solver.
    lrL1 = LogisticRegression(penalty='l1', solver='saga', tol=0.01)
    lrL2 = LogisticRegression(penalty='l2')

    Corpus, pipelineL1 = utilities.getInfoFromParameters(
        fileData, params, lrL1)
    Corpus, pipelineL2 = utilities.getInfoFromParameters(
        fileData, params, lrL2)

    splits = 2
    outer_cv = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)

    # Cross-validate the L1 and the L2 estimator, collecting accuracy and macro F1 for every fold so the two penalties can be compared.
    scoring = ['accuracy', 'f1_macro']
    scoresL1 = cross_validate(pipelineL1,
                              X=Corpus[preprocessing.COMMENT],
                              y=Corpus[preprocessing.CLASS],
                              scoring=scoring,
                              cv=outer_cv)
    scoresL2 = cross_validate(pipelineL2,
                              X=Corpus[preprocessing.COMMENT],
                              y=Corpus[preprocessing.CLASS],
                              scoring=scoring,
                              cv=outer_cv)

    # Accumulate the per-fold scores into index 0; the divisions below turn
    # index 0 into the mean score over all folds.
    for i in range(1, splits):
        scoresL1['test_accuracy'][0] += scoresL1['test_accuracy'][i]
        scoresL1['test_f1_macro'][0] += scoresL1['test_f1_macro'][i]
        scoresL2['test_accuracy'][0] += scoresL2['test_accuracy'][i]
        scoresL2['test_f1_macro'][0] += scoresL2['test_f1_macro'][i]

    scoresL1['test_accuracy'][0] /= splits
    scoresL1['test_f1_macro'][0] /= splits
    scoresL2['test_accuracy'][0] /= splits
    scoresL2['test_f1_macro'][0] /= splits

    print("L1 accuracy: ", scoresL1['test_accuracy'][0], " - L2 accuracy: ",
          scoresL2['test_accuracy'][0])
    print("L1 F1: ", scoresL1['test_f1_macro'][0], " - L2 F1: ",
          scoresL2['test_f1_macro'][0])
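
Both variants of compare_penalties take the raw corpus data loaded by preprocessing.read_file, exactly as the driver blocks below do; a minimal usage sketch (the input path mirrors the one used there):

import preprocessing

fileData = preprocessing.read_file("../input.txt")
compare_penalties(fileData)
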
Example #3
if __name__ == "__main__":

    # Construct parameters.
    parametersList = list()

    for lowerCaseFlag in [False, True]:
        for removeStopWordsFlag in [False, True]:
            for stemFlag in [False, True]:
                for maxFeatures in [1000, 5000, 7363]:
                    for ngramRange in [(1, 1), (1, 2), (1, 3)]:
                        for tfidfFlags in [(False, False), (True, False),
                                           (False, True)]:
                            for alpha_value in [1, 0.001, 0.00001]:
                                parametersList.append(
                                    utilities.Parameters(
                                        lowerCaseFlag, removeStopWordsFlag,
                                        stemFlag, maxFeatures, ngramRange,
                                        tfidfFlags, alpha_value))

    cnt = 0

    # Go through all of the input files and configurations and export the results to a .csv file.
    for input_file, output_file_path, singleFunctionalClass in [
        ("../input.txt", "output/outputNBdirectAlphaAll.csv", False),
        ("../input-functional.txt",
         "output/outputNBdirectAlphaFunctionalAll.csv", True)
    ]:
        with open(output_file_path, 'w') as output:
            print(utilities.getHeader(singleFunctionalClass), file=output)
            output.flush()

            fileData = preprocessing.read_file(input_file)
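
The nested loops that build parametersList in these driver blocks can be written more compactly with itertools.product; a minimal sketch, assuming the same utilities.Parameters constructor used above:

from itertools import product

parametersList = [
    utilities.Parameters(lowerCaseFlag, removeStopWordsFlag, stemFlag,
                         maxFeatures, ngramRange, tfidfFlags, alpha_value)
    for (lowerCaseFlag, removeStopWordsFlag, stemFlag, maxFeatures,
         ngramRange, tfidfFlags, alpha_value) in product(
             [False, True], [False, True], [False, True],
             [1000, 5000, 7363],
             [(1, 1), (1, 2), (1, 3)],
             [(False, False), (True, False), (False, True)],
             [1, 0.001, 0.00001])
]
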
Example #4
if __name__ == "__main__":

    # Construct parameters.
    parametersList = list()

    for lowerCaseFlag in [False, True]:
        for removeStopWordsFlag in [False, True]:
            for stemFlag in [False, True]:
                for maxFeatures in [1000, 5000, 7363]:
                    for ngramRange in [(1, 1), (1, 2), (1, 3)]:
                        for alpha in [0.00001, 0.001, 1]:
                            for binarize in [0.0]:
                                for tfidfFlags in [(False, False)]:
                                    parametersList.append(
                                        utilities.Parameters(
                                            lowerCaseFlag,
                                            removeStopWordsFlag,
                                            stemFlag,
                                            maxFeatures,
                                            ngramRange,
                                            tfidfFlags,
                                            alpha,
                                            binarize))
    print("ParamsList created.\n")

    count_file = 0
    for input_file, output_file, is_functional in [
        ("../input-functional.txt", "output-functional.csv", True),
        ("../input.txt", "output.csv", False)
    ]:
        with open(output_file, 'w') as output_file_print_target:
            print("Using ", input_file, ", stats will be in ", output_file)
            fileData = preprocessing.read_file(input_file)

            # Print header in output file.
            header = utilities.getHeader(is_functional)