Example no. 1
0
def naive_bayse_cross(train_x, train_y, validation, test, test_data):
    """Cross-validate a Naive Bayes text-classification pipeline.

    Pipeline: bigram CountVectorizer -> RandomUnderSampler ->
    MultinomialNB(alpha=0.01), scored with 10-fold cross-validation.
    When ``validation`` is truthy, prints score statistics, a classification
    report and a confusion matrix built from 5-fold out-of-fold predictions;
    when ``test`` is truthy, delegates to ``naive_bayes`` on ``test_data``.
    """
    print("training data...")

    pipeline = make_pipeline(
        CountVectorizer(ngram_range=(1, 2)),
        RandomUnderSampler(),
        MultinomialNB(alpha=0.01),
    )

    cv_scores = cross_val_score(pipeline, train_x, train_y, cv=10)

    print("Model is fitted!")
    if validation:
        print("scores: ", cv_scores)
        print("std of score: ", np.std(cv_scores))
        print("Accuracy: %0.2f (+/- %0.2f)" %
              (cv_scores.mean(), cv_scores.std() * 2))
        # Out-of-fold predictions so the report reflects unseen-data behavior.
        predictions = cross_val_predict(pipeline, train_x, train_y, cv=5)

        # Evaluation: classification report, then confusion matrix (printed
        # and plotted via the module-level plot_conf helper).
        print("classification reports:",
              classification_report(train_y, predictions))
        matrix = confusion_matrix(train_y, predictions)
        print(matrix)
        plot_conf(matrix)
    if test:
        naive_bayes(test_data)
Example no. 2
0
def test_custom_review(count_vec, train_vec, y_train_data):
    """Read one review from stdin, clean and vectorize it, then print the
    Naive Bayes sentiment verdict (prediction 1 = positive, else negative).
    """
    print('\nTest a custom review message')
    print('Enter review to be analysed: ', end=" ")

    raw_reviews = [input()]
    review_frame = pd.DataFrame(data={"id": 1, "review": raw_reviews})

    print("Cleaning the test review")
    cleaned = [clean_review(text) for text in review_frame.review]

    print("Vectorizing the test review")
    review_vec = count_vec.transform(cleaned)

    print("Predicting")
    prediction = naive_bayes(train_vec, review_vec, y_train_data)
    if prediction == 1:
        print("The review is predicted positive")
    else:
        print("The review is predicted negative")
Example no. 3
0
    def __init__(self, config):
        """Wrapper for a model which targets either the Purpose or Field
        category set.

        Args:
            config: dict with keys 'target_set' ('purpose' or 'field'),
                'model' (classifier name), 'model_config' (kwargs forwarded
                to ``set_params``), and optionally 'labels' (a non-default
                label set for the LabelTransformer).

        Raises:
            ValueError: if config['target_set'] is neither 'purpose' nor
                'field'.
            SystemExit: if config['model'] names an unknown classifier.
        """
        self.target_set = config['target_set']
        self.model_config = config['model_config']
        self.model_name = config['model']
        self.model = None

        # Fail fast: validate target_set BEFORE it is used to build the
        # label transformer (the original only checked it afterwards).
        if self.target_set != 'purpose' and self.target_set != 'field':
            raise ValueError('Unknown target_set configuration value: %s \n' %
                             config['target_set'])

        # Use a caller-supplied label set when present, else the defaults.
        if 'labels' in config:
            self.labelset = LabelTransformer(self.target_set, config['labels'])
        else:
            self.labelset = LabelTransformer(
                self.target_set,
                LabelTransformer.default_labels(self.target_set))

        # Map the configured model name onto a concrete estimator factory.
        factories = {
            'LogisticRegression': LogisticRegression,
            'DecisionTree': DecisionTreeClassifier,
            'SVC': SVC,
            # NOTE(review): lowercase factory, unlike the sklearn classes —
            # confirm naive_bayes() actually returns an estimator here.
            'Naivebayes': naive_bayes,
            'RandomForest': RandomForestClassifier,
        }
        if self.model_name not in factories:
            # Bug fix: the original interpolated self.model (still None here)
            # into the message instead of the offending model name.
            sys.exit('Invalid config, model given is unknown: %s' %
                     self.model_name)
        self.model = factories[self.model_name]()

        self.model.set_params(**self.model_config)
Example no. 4
0
def train_model(classifier, train_path, test_path, type_classification, train=True, validation=True, test=True,
                cross_validation=False):
    """Load data, then train/evaluate/test the requested classifier.

    Args:
        classifier: "NB" (Naive Bayes), "SVM" or "LR" (logistic regression).
        train_path: path of the training set.
        test_path: path of the test set.
        type_classification: "T" (titles only), "TB" (whole document) or
            "TBW" (weighted title and body).
        train: run the training step (split mode only).
        validation: run the evaluation step.
        test: run the test step.
        cross_validation: when True, use cross-validation instead of a
            train/validate split ("NB" and "SVM" only).

    Raises:
        ValueError: on an unknown type_classification (the original printed
            "wrong argument" and then crashed with a NameError on train_x).
    """
    # collect train data
    print("reading train set...")
    if type_classification == "T":
        # read titles and their label
        train_x, train_y = collect_titles(train_path)
    elif type_classification == "TB":
        # read whole document
        train_x, train_y = collect_documents(train_path)
    elif type_classification == "TBW":
        # weighted title and body
        train_x, train_y = collect_weighted_doc(train_path)
    else:
        print("wrong argument")
        raise ValueError("unknown type_classification: %r" %
                         type_classification)

    print("loading test data...")
    test_data, reference = collect_test_documents(test_path)

    # using cross validation
    if cross_validation:
        if classifier == "NB":
            naive_bayse_cross(train_x, train_y, validation, test, test_data)
        if classifier == "SVM":
            SVM_train_cross(train_x, train_y, validation, test, test_data)
        return

    # split data
    print("spliting the train set...")
    train_data, validate_data, train_target, validate_target = train_test_split(train_x, train_y, test_size=0.4,
                                                                                random_state=0)

    # One (train_fn, evaluate_fn, test_fn) triple per classifier code —
    # replaces three copy-pasted if-blocks from the original.
    pipelines = {
        "NB": (naive_bayes_train, naive_bayes_evaluate, naive_bayes),
        "SVM": (svm_train, svm_evaluate, svm),
        "LR": (train_logistic_regression, validate_logistic_regression,
               logistic_regression),
    }
    if classifier in pipelines:
        train_fn, evaluate_fn, test_fn = pipelines[classifier]
        # train data set
        if train:
            print("training data...")
            train_fn(train_data, train_target)
        # validate validation set
        if validation:
            print("evaluating data...")
            evaluate_fn(validate_data, validate_target)
        # test data
        if test:
            print("testing data...")
            test_fn(test_data, reference)
            # raw string: \R and \P are invalid escape sequences in a normal
            # literal; the printed bytes are identical to the original's.
            print(r"results are written in: \Results\Prediction.xlsx")


def random_forrest():
    """Train Random Forest models on count vectors and on word-level TF-IDF
    vectors, printing the accuracy of each.

    Relies on module-level names: train_model, ensemble, train_y,
    xtrain_count, xvalid_count, xtrain_tfidf, xvalid_tfidf.
    """
    # RF on Count Vectors
    accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count,
                           train_y, xvalid_count)
    # Bug fix: the originals were Python-2 print statements — syntax errors
    # in this otherwise Python-3 file.
    print("RF, Count Vectors: ", accuracy)

    # RF on Word Level TF IDF Vectors
    accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf,
                           train_y, xvalid_tfidf)
    print("RF, WordLevel TF-IDF: ", accuracy)


# Menu loop: dispatch to the chosen classifier until the user enters "q".
# (`choice` and `getChoice` are defined earlier in the file.)
while choice != "q":
    handlers = {
        "1": naive_bayes,
        "2": linear_classifier,
        "3": svm,
        "4": random_forrest,
    }
    handler = handlers.get(choice)
    if handler is not None:
        handler()
    else:
        print("Invalid choice, please choose again")
        print("\n")

    choice = getChoice()
                            delimiter="\t",
                            quoting=0)

    y_train_data = train_data.sentiment

    #Vectorization - TFIDF
    print("Using TFIDF ")
    train_vect, test_vec, count_vec = tfidf_vectorizer(train_list, test_list,
                                                       train_data, test_data)

    #Dimensionality Reduction
    train_vec, test_vec = dimensionality_reduction(train_vect, test_vec,
                                                   y_train_data)

    #Prediction
    pred_naive_bayes = naive_bayes(train_vec, test_vec, y_train_data)
    pred_random_forest = random_forest(train_vec, test_vec, y_train_data)
    pred_linear_svc = linear_svc(train_vec, test_vec, y_train_data)
    pred_logistic = logistic_regression(train_vec, test_vec, y_train_data)

    #Writing output of classifier with highest accuracy(Linear SVC)to csv
    output = pd.DataFrame(
        data={
            "id": test_data.id,
            "review": test_data.review,
            "sentiment": pred_linear_svc
        })
    output.to_csv("tfidf_svc.csv", index=False)

    print("Using pre-trained word2vec model")
    train_list = []