def test_provide_train_and_test_idf_dtms_real_data():
    """
    This method should help you to find a suitable min_df value to choose
    which features (=words) are elminated from the vectorizer
    """

    num_instances = 100
    classifier = DecisionTreeClassifier()

    dataset_document_name = ds.DEFAULT_DEVOLOPMENT_DATASET_DOCUMENT_NAME
    dataset_name_train = ds.DEFAULT_TRAININGSET_NAME
    dataset_name_test = ds.DEFAULT_TESTSET_NAME

    for document_fields in dcdp.DEFAULT_ALL_DOCUMENT_FIELDS:

        used_fields = dcdp.retrieveValueForUsedFields(document_fields)
        document_train = dcdp.getDatasetContentDocumentFromDatabase(
            dataset_document_name, dataset_name_train, used_fields)
        document_test = dcdp.getDatasetContentDocumentFromDatabase(
            dataset_document_name, dataset_name_test, used_fields)

        train_content = document_train[dcdp.DSCD_FIELD_CONTENT]
        test_content = document_test[dcdp.DSCD_FIELD_CONTENT]

        vectorizer_params = provide_vectorizer_params_for_classifier(
            classifier, used_fields)

        idf_dtm_train, idf_dtm_test = provide_train_and_test_idf_dtms(
            vectorizer_params, train_content, test_content)

        print(idf_dtm_train.shape)
        print(idf_dtm_test.shape)
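
# The docstring above mentions choosing min_df. A minimal, self-contained
# sketch of that exploration with plain scikit-learn; the toy corpus is an
# assumption standing in for the real train_content. The vocabulary shrinks
# as min_df grows, because terms occurring in fewer than min_df documents
# are dropped.
from sklearn.feature_extraction.text import TfidfVectorizer

toy_corpus = ["how to parse xml in python",
              "python xml parsing question",
              "sort a list in python",
              "how to sort a list in java"]

for min_df in [1, 2, 3]:
    toy_vectorizer = TfidfVectorizer(min_df=min_df)
    toy_dtm = toy_vectorizer.fit_transform(toy_corpus)
    print(min_df, toy_dtm.shape)  # number of columns = surviving vocabulary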
DEFAULT_MAX_DF_KEY = 'max_df'
DEFAULT_MIN_DF_KEY = 'min_df'
DEFAULT_NORM_KEY = 'norm'

BEST_VECTORIZER_PARAMS_NB = {
    pp.STACKEXCHANGE_TITLE_COLUMN: {
        DEFAULT_NORM_KEY: 'l2',
        DEFAULT_MAX_DF_KEY: 0.1,
        DEFAULT_MIN_DF_KEY: 0.001
    },
    pp.STACKEXCHANGE_BODY_COLUMN: {
        DEFAULT_NORM_KEY: 'l2',
        DEFAULT_MAX_DF_KEY: 0.2,
        DEFAULT_MIN_DF_KEY: 2
    },
    dcdp.retrieveValueForUsedFields([
        pp.STACKEXCHANGE_TITLE_COLUMN, pp.STACKEXCHANGE_BODY_COLUMN
    ]): {
        DEFAULT_NORM_KEY: 'l2',
        DEFAULT_MAX_DF_KEY: 0.2,
        DEFAULT_MIN_DF_KEY: 2
    }
}
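
# Assuming the keys above ('norm', 'max_df', 'min_df') map one-to-one onto
# scikit-learn's TfidfVectorizer keyword arguments (all three are valid
# TfidfVectorizer parameters), a per-field parameter set can be applied by
# simple unpacking; a sketch, not necessarily the project's actual wiring:
from sklearn.feature_extraction.text import TfidfVectorizer

title_params = BEST_VECTORIZER_PARAMS_NB[pp.STACKEXCHANGE_TITLE_COLUMN]
title_vectorizer = TfidfVectorizer(**title_params)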

BEST_VECTORIZER_PARAMS_DT = {
    pp.STACKEXCHANGE_TITLE_COLUMN: {
        DEFAULT_NORM_KEY: 'l1',
        DEFAULT_MAX_DF_KEY: 0.1,
        DEFAULT_MIN_DF_KEY: 0.001,
        'max_features': 500
    },
    pp.STACKEXCHANGE_BODY_COLUMN: {
def perform_gridsearch_for_given_fieldslist(document_fields_list,
                                            number_instances,
                                            svm=False):
    """
    Runs a grid search over classifier parameter ranges for every field
    combination in document_fields_list.

    :param document_fields_list: list of document-field combinations to use
    :param number_instances: number of training instances to use (0 = use all)
    :param svm: whether to also run the (slow) SVM grid search
    """

    dataset_document_names = [ds.DEFAULT_DEVOLOPMENT_DATASET_DOCUMENT_NAME]
    dataset_name = ds.DEFAULT_TRAININGSET_NAME

    for dataset_document_name in dataset_document_names:

        print("Used dataset document used: " + str(dataset_document_name))
        print("Used dataset: " + str(dataset_name))
        print(
            "===============================================================")

        for document_fields in document_fields_list:

            used_fields = dcdp.retrieveValueForUsedFields(document_fields)

            train_doc = dcdp.getDatasetContentDocumentFromDatabase(
                dataset_document_name, dataset_name, used_fields)

            train_data = train_doc[dcdp.DSCD_FIELD_CONTENT]
            train_targets = dcdp.buildTargetsFromDatasetContentDocument(
                train_doc)

            # if the number of instances is not set, take the whole set
            if number_instances == 0:
                number_instances = len(train_data)

            print("Train for num_instances:" + str(number_instances))

            print("Used_field(s):" + str(used_fields))
            print(
                "-----------------------------------------------------------")

            if svm:
                # svm
                print("SVM")
                print("-----")
                classifier = OneVsRestClassifier(
                    SVC(decision_function_shape='ovr'))

                classifier_parameters = {
                    "clf__estimator__C": [1.0, 3.0],
                    "clf__estimator__kernel":
                    ["poly", "rbf", "linear", "sigmoid"],
                    "clf__estimator__gamma": [0.3, 0.01],
                    "clf__estimator__tol": [1e-3, 1e-6],
                    "clf__estimator__random_state": [42],
                    "clf__estimator__max_iter": [1000, 5000]
                }

                perform_gridsearch_for_classifier(classifier,
                                                  classifier_parameters,
                                                  train_data, train_targets,
                                                  number_instances)
                print()

            # naive bayes
            print("Naive Bayes")
            print("-----------")
            estimator = MultinomialNB()
            classifier = OneVsRestClassifier(estimator)

            classifier_parameters = {
                'clf__estimator__alpha': (1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7)
            }

            perform_gridsearch_for_classifier(classifier,
                                              classifier_parameters,
                                              train_data, train_targets,
                                              number_instances)
            print()

            # decision tree
            print("Decision tree")
            print("--------------")
            classifier = DecisionTreeClassifier()

            classifier_parameters = {
                "clf__criterion": ["gini", "entropy"],
                "clf__splitter": ["best", "random"],
                "clf__min_samples_split": [1, 2, 5],
                "clf__max_features": ["auto", "sqrt", "log2"],
                "clf__random_state": [1, 42]
            }

            perform_gridsearch_for_classifier(classifier,
                                              classifier_parameters,
                                              train_data, train_targets,
                                              number_instances)
            print()
            print(
                "***********************************************************")
warnings.filterwarnings("ignore")

"""
This class should help you to find the best parameters for your classifiers.
Within the constants #BEST_PARAMS_SVM, #BEST_PARAMS_DECISIONTREE and 
#BEST_VECTORIZER_PARAMS_CLASSIFIERS there are already the best params for the original
used Stackexchange dataset.
To test different parameters simply start #perform_gridsearch_for_given_fieldslist
and adapt param ranges.
"""

#TODO: adapt after grid search
BEST_PARAMS_NB = {
    pp.STACKEXCHANGE_TITLE_COLUMN: {'alpha': 0.001},
    pp.STACKEXCHANGE_BODY_COLUMN: {'alpha': 0.01},
    dcdp.retrieveValueForUsedFields([
        pp.STACKEXCHANGE_TITLE_COLUMN, pp.STACKEXCHANGE_BODY_COLUMN
    ]): {'alpha': 0.01}
}
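
# provide_best_params_for_classifier (used further below) presumably just
# dispatches on the classifier type and indexes these dicts by used_fields.
# A hypothetical sketch; the real lookup lives in classifier_param_selection
# and may differ:
def provide_best_params_for_classifier_sketch(classifier, used_fields):
    if isinstance(classifier, DecisionTreeClassifier):
        return BEST_PARAMS_DECISIONTREE[used_fields]
    if isinstance(getattr(classifier, 'estimator', None), MultinomialNB):
        return BEST_PARAMS_NB[used_fields]
    return BEST_PARAMS_SVM[used_fields]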

#best params for decision tree
BEST_PARAMS_DECISIONTREE = {
    pp.STACKEXCHANGE_TITLE_COLUMN: {
        'random_state': 42,
        'splitter': 'best',
        'criterion': 'gini',
        'min_samples_split': 1,
        'max_features': 'auto'
    },
    pp.STACKEXCHANGE_BODY_COLUMN: {
        'random_state': 42,
        'splitter': 'random',
        'criterion': 'gini',
        'min_samples_split': 1,
        'max_features': 'auto'
    },
    dcdp.retrieveValueForUsedFields(
from data_representation import dtm_provider
from data_representation import dataset_content_document_provider as dcdp
import data_representation.dataset_spliter as ds

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier()
document_fields = dcdp.DEFAULT_ALL_DOCUMENT_FIELDS[2]
used_fields = dcdp.retrieveValueForUsedFields(document_fields)
params = dtm_provider.provide_vectorizer_params_for_classifier(classifier,
                                                               used_fields)

print(params)
def perform_classifiers_for_given_fieldslist(document_fields_list,
                                             baseline=True,
                                             use_tree=False):
    """
    #TODO: update docu
    Performs selected classifiers for all fields of stackexchange documents.
    
    :param baseline - whether or not to perform classifier MultinomialNB (if not
    at least SVM is performed if tree is not used)
    :param use_tree - whether or not to perform classifier DecissionTree
    :param use_cv - whether or not to also perform classifier with 
    cross validation
    """

    #dataset documents we want to use
    dataset_document_names = [ds.DEFAULT_DATASET_DOCUMENT_NAME]
    #training part of dataset
    dataset_name_train = ds.DEFAULT_TRAININGSET_NAME
    #testing part of dataset
    dataset_name_test = ds.DEFAULT_TESTSET_NAME

    #perform classification given datasets
    for dataset_document_name in dataset_document_names:

        print("Dataset document used: " + str(dataset_document_name))
        print(
            "---------------------------------------------------------------")

            #perform classification for the given combinations of fields
        for document_fields in document_fields_list:

            #used to retrieve correct document and fields
            used_fields = dcdp.retrieveValueForUsedFields(document_fields)

            print("Used fields: " + str(used_fields))

            #get the dtm for train
            document_train = dcdp.getDatasetContentDocumentFromDatabase(
                dataset_document_name, dataset_name_train, used_fields)
            #get the dtm for test
            document_test = dcdp.getDatasetContentDocumentFromDatabase(
                dataset_document_name, dataset_name_test, used_fields)
            #baseline classifier?
            if baseline:

                estimator = MultinomialNB()
                classifier = OneVsRestClassifier(estimator)
                params = classifier_param_selection.\
                    provide_best_params_for_classifier(classifier, used_fields)
                estimator = MultinomialNB(**params)
                classifier = OneVsRestClassifier(estimator)

            else:  # if not the baseline classifier, use the tree or the SVM

                if use_tree:
                    classifier = tree.DecisionTreeClassifier()
                    params = classifier_param_selection.\
                        provide_best_params_for_classifier(
                            classifier, used_fields)

                    classifier = tree.DecisionTreeClassifier(**params)

                else:
                    estimator = SVC()
                    classifier = OneVsRestClassifier(estimator)
                    params = classifier_param_selection.\
                        provide_best_params_for_classifier(
                            classifier, used_fields)
                    estimator = SVC(decision_function_shape='ovr', **params)
                    classifier = OneVsRestClassifier(estimator)

            perform_classifier(classifier, used_fields, document_train,
                               document_test)

            print()
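
# A usage sketch (assuming dcdp.DEFAULT_ALL_DOCUMENT_FIELDS holds the field
# combinations to evaluate, as in the snippet above): run the MultinomialNB
# baseline first, then the tuned decision tree, then the tuned SVM.
perform_classifiers_for_given_fieldslist(dcdp.DEFAULT_ALL_DOCUMENT_FIELDS)
perform_classifiers_for_given_fieldslist(dcdp.DEFAULT_ALL_DOCUMENT_FIELDS,
                                         baseline=False, use_tree=True)
perform_classifiers_for_given_fieldslist(dcdp.DEFAULT_ALL_DOCUMENT_FIELDS,
                                         baseline=False, use_tree=False)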