def compare_penalties(fileData):
    """Compare L1- vs L2-penalized LinearSVC on the given corpus.

    Builds two identical preprocessing pipelines (differing only in the SVM
    penalty), evaluates each with stratified 10-fold cross-validation, and
    prints the mean accuracy and macro-F1 for both penalties.

    Args:
        fileData: raw input data as consumed by
            utilities.getInfoFromParameters (project-local; exact shape not
            visible here — presumably the parsed input file).
    """
    params = utilities.Parameters(
        lowerCaseFlag=False, removeStopWordsFlag=False, stemFlag=False,
        maxFeatures=7363, ngramRange=(1, 1), tfidfFlags=(False, False))
    # l1 penalty requires dual=False in LinearSVC.
    svmL1 = svm.LinearSVC(penalty='l1', dual=False)
    svmL2 = svm.LinearSVC(penalty='l2')
    # Both calls return the same Corpus; only the pipeline differs.
    Corpus, pipelineL1 = utilities.getInfoFromParameters(
        fileData, params, svmL1)
    Corpus, pipelineL2 = utilities.getInfoFromParameters(
        fileData, params, svmL2)
    splits = 10
    outer_cv = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
    # CV for L1 estimator and L2 estimator which returns the F1-score and Accuracy.
    scoring = ['accuracy', 'f1_macro']
    scoresL2 = cross_validate(pipelineL2, X=Corpus[preprocessing.COMMENT],
                              y=Corpus[preprocessing.CLASS], scoring=scoring,
                              cv=outer_cv)
    scoresL1 = cross_validate(pipelineL1, X=Corpus[preprocessing.COMMENT],
                              y=Corpus[preprocessing.CLASS], scoring=scoring,
                              cv=outer_cv)
    # cross_validate returns numpy arrays (one score per fold); average with
    # .mean() instead of the previous manual accumulation loop, which also
    # destructively overwrote the fold-0 entries of the result arrays.
    print("L1 accuracy: ", scoresL1['test_accuracy'].mean(),
          " - L2 accuracy: ", scoresL2['test_accuracy'].mean())
    print("L1 F1: ", scoresL1['test_f1_macro'].mean(),
          " - L2 F1: ", scoresL2['test_f1_macro'].mean())
def compare_penalties(fileData):
    """Compare L1- vs L2-penalized LogisticRegression on the given corpus.

    NOTE(review): this redefines compare_penalties from earlier in the file;
    whichever definition appears later shadows the other — confirm which one
    is intended to be callable.

    Builds two identical preprocessing pipelines (differing only in the
    regression penalty), evaluates each with stratified 2-fold
    cross-validation, and prints the mean accuracy and macro-F1 for both.

    Args:
        fileData: raw input data as consumed by
            utilities.getInfoFromParameters (project-local; exact shape not
            visible here).
    """
    params = utilities.Parameters(
        lowerCaseFlag=False, removeStopWordsFlag=False, stemFlag=False,
        maxFeatures=7363, ngramRange=(1, 1), tfidfFlags=(False, False))
    # l1 penalty needs a solver that supports it (saga); loose tol speeds it up.
    lrL1 = LogisticRegression(penalty='l1', solver='saga', tol=0.01)
    lrL2 = LogisticRegression(penalty='l2')
    # Both calls return the same Corpus; only the pipeline differs.
    Corpus, pipelineL1 = utilities.getInfoFromParameters(
        fileData, params, lrL1)
    Corpus, pipelineL2 = utilities.getInfoFromParameters(
        fileData, params, lrL2)
    splits = 2
    outer_cv = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
    # CV for L1 estimator and L2 estimator which returns f1 and accuracy and we will compare it
    scoring = ['accuracy', 'f1_macro']
    scoresL1 = cross_validate(pipelineL1, X=Corpus[preprocessing.COMMENT],
                              y=Corpus[preprocessing.CLASS], scoring=scoring,
                              cv=outer_cv)
    scoresL2 = cross_validate(pipelineL2, X=Corpus[preprocessing.COMMENT],
                              y=Corpus[preprocessing.CLASS], scoring=scoring,
                              cv=outer_cv)
    # cross_validate returns numpy arrays (one score per fold); average with
    # .mean() instead of the previous manual accumulation loop, which also
    # destructively overwrote the fold-0 entries of the result arrays.
    print("L1 accuracy: ", scoresL1['test_accuracy'].mean(),
          " - L2 accuracy: ", scoresL2['test_accuracy'].mean())
    print("L1 F1: ", scoresL1['test_f1_macro'].mean(),
          " - L2 F1: ", scoresL2['test_f1_macro'].mean())
if __name__ == "__main__":
    # Construct parameters: full cartesian grid over preprocessing flags,
    # feature counts, n-gram ranges, tf-idf flags and alpha values
    # (3 * 2^3 * 3 * 3 * 3 = 648 Parameters configurations).
    parametersList = list()
    for lowerCaseFlag in [False, True]:
        for removeStopWordsFlag in [False, True]:
            for stemFlag in [False, True]:
                for maxFeatures in [1000, 5000, 7363]:
                    for ngramRange in [(1, 1), (1, 2), (1, 3)]:
                        for tfidfFlags in [(False, False), (True, False), (False, True)]:
                            # alpha_value presumably the Naive Bayes smoothing
                            # parameter (file names mention "NB") — TODO confirm.
                            for alpha_value in [1, 0.001, 0.00001]:
                                parametersList.append(
                                    utilities.Parameters(
                                        lowerCaseFlag, removeStopWordsFlag,
                                        stemFlag, maxFeatures, ngramRange,
                                        tfidfFlags, alpha_value))
    cnt = 0
    # Go through all of the input files and configurations and export the results to a .csv file.
    # Two runs: the full multi-class input and the functional-only variant.
    for input_file, output_file_path, singleFunctionalClass in [
        ("../input.txt", "output/outputNBdirectAlphaAll.csv", False),
        ("../input-functional.txt", "output/outputNBdirectAlphaFunctionalAll.csv", True)
    ]:
        with open(output_file_path, 'w') as output:
            # Write the CSV header line first, then read the raw input data.
            print(utilities.getHeader(singleFunctionalClass), file=output)
            output.flush()
            # NOTE(review): the rest of this run loop (iterating
            # parametersList over fileData) is not visible in this chunk.
            fileData = preprocessing.read_file(input_file)
# Construct parameters. parametersList = list() for lowerCaseFlag in [False, True]: for removeStopWordsFlag in [False, True]: for stemFlag in [False, True]: for maxFeatures in [1000, 5000, 7363]: for ngramRange in [(1, 1), (1, 2), (1, 3)]: for alpha in [0.00001, 0.001, 1]: for binarize in [0.0]: for tfidfFlags in [(False, False)]: parametersList.append(utilities.Parameters( lowerCaseFlag, removeStopWordsFlag, stemFlag, maxFeatures, ngramRange, tfidfFlags, alpha, binarize) ) print("ParamsList created.\n") count_file = 0 for input_file, output_file, is_functional in [("../input-functional.txt", "output-functional.csv", True), ("../input.txt", "output.csv", False)]: with open(output_file, 'w') as output_file_print_target: print("Using ",input_file, ", stats will be in ", output_file) fileData = preprocessing.read_file(input_file) # Print header in output file. header = utilities.getHeader(is_functional)