Ejemplo n.º 1
0
def my_method_classifier(stop_words, train_data, test_data,
                         dynamic_datasets_path):
    """Cross-validate and fit the "MyMethod" linear SVM, then export predictions.

    Feature extraction runs in this order: bag-of-words counts -> TF-IDF ->
    LSA (``TruncatedSVD`` with 100 components) -> L2 normalisation. Every
    transformer is fitted on the training text only and then applied to the
    test text. A linear SVM is cross-validated on the training features,
    fitted on the complete training set and used to predict ``test_data``;
    the predictions are passed to ``write_predictions_to_csv``.

    Args:
        stop_words: Words to drop during tokenisation ('english', an iterable
            of words, or None). Forwarded to ``CountVectorizer(stop_words=...)``.
        train_data: Column-indexable frame (presumably a pandas DataFrame);
            the ``'Content'`` and ``'Category'`` columns are read here.
        test_data: Frame with a ``'Content'`` column whose rows are predicted.
        dynamic_datasets_path: Passed through unchanged to
            ``write_predictions_to_csv``.

    Returns:
        The value returned by
        ``crossValidation.get_scores_from_cross_validation``.
    """
    print('Running myMethodClassifier...\n')

    # Presumably concatenates the title onto the content once so that title
    # words gain weight -- the helper is defined outside this block.
    train_data, test_data = appendTitleToContentXtimes.append_title_to_content_x_times(
        train_data, test_data, 1)

    print('Running successional-version of myMethodClassifier...')

    start_time_successional = time.time()

    # CountVectorizer documents stop_words as 'english', a list or None, so
    # any other collection (set, frozenset, ...) is converted to a list.
    if stop_words is not None and not isinstance(stop_words, str):
        stop_words = list(stop_words)

    # Bag-of-words counts. The stop words must go to `stop_words=`; passing
    # them as `input=` (as before) silently disabled stop-word removal.
    count_vectorizer = CountVectorizer(stop_words=stop_words)
    train_vectors = count_vectorizer.fit_transform(train_data['Content'])
    test_vectors = count_vectorizer.transform(test_data['Content'])
    print("VectorTrain shape::", train_vectors.shape)
    print("VectorTest shape::", test_vectors.shape)

    # TF-IDF re-weighting of the raw counts.
    tfidf = TfidfTransformer()
    train_vectors = tfidf.fit_transform(train_vectors)
    test_vectors = tfidf.transform(test_vectors)

    # LSA: project onto 100 latent components.
    lsa = TruncatedSVD(n_components=100)
    train_vectors = lsa.fit_transform(train_vectors)
    test_vectors = lsa.transform(test_vectors)

    print("VectorTrain shape after LSA::", train_vectors.shape)
    print("VectorTest shape after LSA::", test_vectors.shape)

    # Scale every sample to unit L2 norm.
    norm = Normalizer(norm="l2", copy=True)
    train_vectors = norm.fit_transform(train_vectors)
    test_vectors = norm.transform(test_vectors)

    clf = svm.SVC(kernel='linear', C=1.0)

    print('Running crossValidation on MyMethod...')
    scores = crossValidation.get_scores_from_cross_validation(
        clf, train_vectors, train_data['Category'])

    # Fit on the full training set before predicting the unseen test rows.
    clf.fit(train_vectors, train_data['Category'])
    y_pred = clf.predict(test_vectors)

    writePredictionsToCsv.write_predictions_to_csv(y_pred, test_data,
                                                   dynamic_datasets_path)

    print("Elapsed time of successional-run: ",
          time.time() - start_time_successional)

    print('MyMethodClassifier finished!\n')
    return scores
def svm_classifier(stop_words, train_data, test_data, use_pipeline):
    """Evaluate a linear SVM on a 70/30 split of ``train_data``.

    Features are bag-of-words counts reduced with LSA (``TruncatedSVD``, 100
    components). Two modes are available:

    * ``use_pipeline`` truthy: the steps are chained in an sklearn
      ``Pipeline``; train and hold-out accuracies are printed.
    * otherwise: the steps run one after another and the classifier is
      scored with ``crossValidation.get_scores_from_cross_validation`` on
      the 70% training part.

    Args:
        stop_words: Words to drop during tokenisation ('english', an iterable
            of words, or None). Forwarded to ``CountVectorizer(stop_words=...)``.
        train_data: Column-indexable frame (presumably a pandas DataFrame)
            with ``'Title'``, ``'Content'`` and ``'Category'`` columns.
        test_data: Not used by this function; kept for a uniform signature.
        use_pipeline: Selects the pipeline mode when truthy.

    Returns:
        The cross-validation scores in successional mode; an empty list in
        pipeline mode (nothing is cross-validated there).
    """
    print('Running svmClassifier...\n')

    headers = ['RowNum', 'Id', 'Title', 'Content', 'Category']

    # Hold out 30% of the labelled data; features are Title + Content and
    # the target is Category.
    train_x, test_x, train_y, test_y = train_test_split(
        train_data[headers[2:4]],
        train_data[headers[-1]],
        train_size=0.7,
        test_size=0.3)

    # Train and Test dataset size details
    print("Train_x Shape :: ", train_x.shape)
    print("Train_y Shape :: ", train_y.shape)
    print("Test_x Shape :: ", test_x.shape)
    print("Test_y Shape :: ", test_y.shape)
    print("Train_x colums ::", train_x.columns)

    # Presumably concatenates the title onto the content once -- the helper
    # is defined outside this block.
    train_x, test_x = appendTitleToContentXtimes.append_title_to_content_x_times(
        train_x, test_x, 1)

    # CountVectorizer documents stop_words as 'english', a list or None, so
    # any other collection (set, frozenset, ...) is converted to a list.
    if stop_words is not None and not isinstance(stop_words, str):
        stop_words = list(stop_words)

    # Stays empty in pipeline mode.
    scores = []

    if use_pipeline:
        print('\nRunning pipeline-version of svmClassifier...')

        start_time_pipeline = time.time()

        # The stop words must go to `stop_words=`; passing them as `input=`
        # (as before) silently disabled stop-word removal.
        pipeline = Pipeline([
            ('vect', CountVectorizer(stop_words=stop_words)),
            ('lsa', TruncatedSVD(n_components=100)),
            ('clf', svm.SVC(kernel='linear', C=1.0)),
        ])

        pipeline.fit(train_x['Content'], train_y)
        predicted_train = pipeline.predict(train_x['Content'])
        # Now evaluate all steps on the held-out part.
        predicted_test = pipeline.predict(test_x['Content'])

        print("Train Accuracy :: ", accuracy_score(train_y, predicted_train))
        print("Test Accuracy  :: ", accuracy_score(test_y, predicted_test))

        print("Elapsed time of pipeline: ", time.time() - start_time_pipeline)

    else:
        print('\nRunning successional-version of svmClassifier...')

        start_time_successional = time.time()

        # Bag-of-words counts, fitted on the training part only.
        count_vectorizer = CountVectorizer(stop_words=stop_words)
        train_vectors = count_vectorizer.fit_transform(train_x['Content'])
        test_vectors = count_vectorizer.transform(test_x['Content'])
        print("VectorTrain shape::", train_vectors.shape)
        print("VectorTest shape::", test_vectors.shape)

        # LSA: project onto 100 latent components.
        lsa = TruncatedSVD(n_components=100)
        train_vectors = lsa.fit_transform(train_vectors)
        test_vectors = lsa.transform(test_vectors)

        print("VectorTrain shape after LSA::", train_vectors.shape)
        print("VectorTest shape after LSA::", test_vectors.shape)

        clf = svm.SVC(kernel='linear', C=1.0)

        print('Running crossValidation on SVM...')
        scores = crossValidation.get_scores_from_cross_validation(
            clf, train_vectors, train_y)

        print("Elapsed time of successional-run: ",
              time.time() - start_time_successional)

    print('svmClassifier finished!\n')
    return scores
Ejemplo n.º 3
0
def knn_classifier(
        stop_words, train_data,
        test_data):  # It's uncertain if we will implement pipeline for knn..
    """Spot-check the hand-written k-NN classifier on a 70/30 split.

    Features are bag-of-words counts reduced with LSA (``TruncatedSVD``, 100
    components). Up to the first 100 held-out rows are classified with
    ``getNeighbors`` / ``getResponse`` (k = 10) and the number of correct
    predictions is printed together with the elapsed time.

    Args:
        stop_words: Words to drop during tokenisation ('english', an iterable
            of words, or None). Forwarded to ``CountVectorizer(stop_words=...)``.
        train_data: Column-indexable frame (presumably a pandas DataFrame)
            with ``'Title'``, ``'Content'`` and ``'Category'`` columns.
        test_data: Not used by this function; kept for a uniform signature.

    Returns:
        None. Results are only printed.
    """
    print('Running knnClassifier...\n')

    headers = ['RowNum', 'Id', 'Title', 'Content', 'Category']
    print(headers[2:4])  # DEBUG!

    # Hold out 30% of the labelled data. Note that, unlike the sibling
    # classifiers, the feature frame also carries the Category column here;
    # only 'Content' is vectorised below.
    train_x, test_x, train_y, test_y = train_test_split(
        train_data[headers[2:5]],
        train_data[headers[-1]],
        train_size=0.7,
        test_size=0.3)

    # Train and Test dataset size details
    print("Train_x Shape :: ", train_x.shape)
    print("Train_y Shape :: ", train_y.shape)
    print("Test_x Shape :: ", test_x.shape)
    print("Test_y Shape :: ", test_y.shape)
    print("Train_x colums ::", train_x.columns)

    # Presumably concatenates the title onto the content once -- the helper
    # is defined outside this block.
    train_x, test_x = appendTitleToContentXtimes.append_title_to_content_x_times(
        train_x, test_x, 1)

    start_time_successional = time.time()

    # CountVectorizer documents stop_words as 'english', a list or None, so
    # any other collection (set, frozenset, ...) is converted to a list.
    if stop_words is not None and not isinstance(stop_words, str):
        stop_words = list(stop_words)

    # Bag-of-words counts. The stop words must go to `stop_words=`; passing
    # them as `input=` (as before) silently disabled stop-word removal.
    count_vectorizer = CountVectorizer(stop_words=stop_words)
    train_vectors = count_vectorizer.fit_transform(train_x['Content'])
    test_vectors = count_vectorizer.transform(test_x['Content'])
    print("VectorTrain shape::", train_vectors.shape)
    print("VectorTest shape::", test_vectors.shape)

    # LSA: project onto 100 latent components.
    lsa = TruncatedSVD(n_components=100)
    train_vectors = lsa.fit_transform(train_vectors)
    test_vectors = lsa.transform(test_vectors)

    print("VectorTrain shape after LSA::", train_vectors.shape)
    print("VectorTest shape after LSA::", test_vectors.shape)

    k_value = 10
    # Never index past the end of the hold-out set on small datasets.
    n_samples = min(100, test_vectors.shape[0])
    correct = 0

    for i in range(n_samples):
        # NOTE(review): getNeighbors receives the full, unshuffled train_data
        # while train_vectors follows the shuffled train_x order -- confirm
        # the helper looks labels up consistently (train_y may be intended).
        neighbors = getNeighbors(train_vectors, test_vectors[i], k_value,
                                 train_data)
        result = getResponse(neighbors)
        # Row i of test_vectors was built from row i of test_x, so its true
        # label is test_y.iloc[i]. The split is shuffled, which makes a fixed
        # offset into train_data['Category'] point at unrelated rows.
        if result == test_y.iloc[i]:
            correct += 1

    print("Got right", correct, "out of", n_samples)

    print("Elapsed time of successional-run: ",
          time.time() - start_time_successional)
Ejemplo n.º 4
0
def nb_classifier(stop_words, train_data, test_data, use_pipeline):
    """Evaluate a multinomial Naive Bayes model on a 70/30 split.

    Features are plain bag-of-words counts; no LSA is applied because it
    produces negative values, which ``MultinomialNB`` cannot handle. Two
    modes are available:

    * ``use_pipeline`` truthy: vectoriser and classifier are chained in an
      sklearn ``Pipeline``; train and hold-out accuracies are printed.
    * otherwise: the steps run one after another and the classifier is
      scored with ``crossValidation.get_scores_from_cross_validation`` on
      the 70% training part.

    Args:
        stop_words: Words to drop during tokenisation ('english', an iterable
            of words, or None). Forwarded to ``CountVectorizer(stop_words=...)``.
        train_data: Column-indexable frame (presumably a pandas DataFrame)
            with ``'Title'``, ``'Content'`` and ``'Category'`` columns.
        test_data: Not used by this function; kept for a uniform signature.
        use_pipeline: Selects the pipeline mode when truthy.

    Returns:
        The cross-validation scores in successional mode; an empty list in
        pipeline mode (nothing is cross-validated there).
    """
    print('Running nbClassifier...\n')

    headers = ['RowNum', 'Id', 'Title', 'Content', 'Category']

    # Hold out 30% of the labelled data; features are Title + Content and
    # the target is Category.
    train_x, test_x, train_y, test_y = train_test_split(
        train_data[headers[2:4]],
        train_data[headers[-1]],
        train_size=0.7,
        test_size=0.3)

    # Train and Test dataset size details
    print("Train_x Shape :: ", train_x.shape)
    print("Train_y Shape :: ", train_y.shape)
    print("Test_x Shape :: ", test_x.shape)
    print("Test_y Shape :: ", test_y.shape)
    print("Train_x colums ::", train_x.columns)

    # Presumably concatenates the title onto the content once -- the helper
    # is defined outside this block.
    train_x, test_x = appendTitleToContentXtimes.append_title_to_content_x_times(
        train_x, test_x, 1)

    # CountVectorizer documents stop_words as 'english', a list or None, so
    # any other collection (set, frozenset, ...) is converted to a list.
    if stop_words is not None and not isinstance(stop_words, str):
        stop_words = list(stop_words)

    # Stays empty in pipeline mode.
    scores = []

    if use_pipeline:
        print('\nRunning pipeline-version of nbClassifier...')

        start_time_pipeline = time.time()

        # The stop words are passed by keyword: given positionally they bind
        # to the vectoriser's first parameter (`input`) or are rejected
        # outright where the parameters are keyword-only.
        pipeline = Pipeline([
            ('vect', CountVectorizer(stop_words=stop_words)),
            ('clf', MultinomialNB(alpha=0.01, class_prior=None,
                                  fit_prior=True)),
        ])

        pipeline = pipeline.fit(train_x['Content'], train_y)
        predicted_train = pipeline.predict(train_x['Content'])
        # Now evaluate all steps on the held-out part.
        predicted_test = pipeline.predict(test_x['Content'])
        print("Train Accuracy :: ", accuracy_score(train_y, predicted_train))
        print("Test Accuracy  :: ", accuracy_score(test_y, predicted_test))

        print("Elapsed time of pipeline: ", time.time() - start_time_pipeline)

    else:
        print('\nRunning successional-version of nbClassifier...')

        start_time_successional = time.time()

        # Bag-of-words counts, fitted on the training part only.
        count_vectorizer = CountVectorizer(stop_words=stop_words)
        train_vectors = count_vectorizer.fit_transform(train_x['Content'])
        test_vectors = count_vectorizer.transform(test_x['Content'])
        print("VectorTrain shape::", train_vectors.shape)
        print("VectorTest shape::", test_vectors.shape)

        # Here we don't use LSA, as it has some issues (negative numbers).

        clf = MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

        print('Running crossValidation on NaiveBayes...')
        scores = crossValidation.get_scores_from_cross_validation(
            clf, train_vectors, train_y)

        print("Elapsed time of successional-run: ",
              time.time() - start_time_successional)

    print('nbClassifier finished!\n')
    return scores
Ejemplo n.º 5
0
def rf_classifier_with_graph(stop_words, train_data, test_data, use_pipeline):
    """Evaluate a random forest and plot its score against LSA components.

    ``train_data`` is split 70/30. Two modes are available:

    * ``use_pipeline`` truthy: counts -> TF-IDF -> LSA (100 components) ->
      L2 normalisation -> ``RandomForestClassifier`` are chained in an
      sklearn ``Pipeline``; train and hold-out accuracies are printed. No
      graph is produced because no component sweep is run.
    * otherwise: for ``n_components`` in 1, 11, ..., 91 the count vectors
      are reduced with ``TruncatedSVD`` and a fresh forest is scored with
      ``crossValidation.get_scores_from_cross_validation``. The first
      element of each score result is plotted against ``n_components``
      (scatter plus quadratic interpolation), saved as a PNG and shown.

    Args:
        stop_words: Words to drop during tokenisation ('english', an iterable
            of words, or None). Forwarded to ``CountVectorizer(stop_words=...)``.
        train_data: Column-indexable frame (presumably a pandas DataFrame)
            with ``'Title'``, ``'Content'`` and ``'Category'`` columns.
        test_data: Not used by this function; kept for a uniform signature.
        use_pipeline: Selects the pipeline mode when truthy.

    Returns:
        The scores of the last sweep iteration in successional mode; an
        empty list in pipeline mode.
    """
    print('Running rfClassifier...\n')

    headers = ['RowNum', 'Id', 'Title', 'Content', 'Category']

    # Hold out 30% of the labelled data; features are Title + Content and
    # the target is Category.
    train_x, test_x, train_y, test_y = train_test_split(
        train_data[headers[2:4]],
        train_data[headers[-1]],
        train_size=0.7,
        test_size=0.3)

    # Train and Test dataset size details
    print("Train_x Shape :: ", train_x.shape)
    print("Train_y Shape :: ", train_y.shape)
    print("Test_x Shape :: ", test_x.shape)
    print("Test_y Shape :: ", test_y.shape)
    print("Train_x colums ::", train_x.columns)

    # Presumably concatenates the title onto the content once -- the helper
    # is defined outside this block.
    train_x, test_x = appendTitleToContentXtimes.append_title_to_content_x_times(
        train_x, test_x, 1)

    # CountVectorizer documents stop_words as 'english', a list or None, so
    # any other collection (set, frozenset, ...) is converted to a list.
    if stop_words is not None and not isinstance(stop_words, str):
        stop_words = list(stop_words)

    scores = []
    # Parallel lists filled by the component sweep; both stay empty in
    # pipeline mode.
    components = []
    accuracies = []

    if use_pipeline:
        print('\nRunning pipeline-version of rfClassifier...')

        start_time_pipeline = time.time()

        # The stop words must go to `stop_words=`; passing them as `input=`
        # (as before) silently disabled stop-word removal.
        pipeline = Pipeline([
            ('vect', CountVectorizer(stop_words=stop_words)),
            ('tfidf', TfidfTransformer()),
            ('lsa', TruncatedSVD(n_components=100)),
            ('norm', Normalizer(norm="l2", copy=True)),
            ('clf', RandomForestClassifier(n_estimators=100)),
        ])

        pipeline.fit(train_x['Content'], train_y)
        predicted_train = pipeline.predict(train_x['Content'])
        # Now evaluate all steps on the held-out part.
        predicted_test = pipeline.predict(test_x['Content'])

        print("Train Accuracy :: ", accuracy_score(train_y, predicted_train))
        print("Test Accuracy  :: ", accuracy_score(test_y, predicted_test))

        print("Elapsed time of pipeline: ", time.time() - start_time_pipeline)

    else:
        print('\nRunning successional-version of rfClassifier...')

        # The count vectors do not depend on n_components, so they are
        # computed once instead of once per sweep iteration.
        count_vectorizer = CountVectorizer(stop_words=stop_words)
        train_counts = count_vectorizer.fit_transform(train_x['Content'])
        test_counts = count_vectorizer.transform(test_x['Content'])
        print("VectorTrain shape::", train_counts.shape)
        print("VectorTest shape::", test_counts.shape)

        for n_components in range(1, 101, 10):
            start_time_successional = time.time()

            # LSA with the current number of latent components.
            lsa = TruncatedSVD(n_components=n_components)
            train_vectors = lsa.fit_transform(train_counts)
            test_vectors = lsa.transform(test_counts)

            print(n_components, "- VectorTrain shape after LSA::",
                  train_vectors.shape)
            print(n_components, "- VectorTest shape after LSA::",
                  test_vectors.shape)

            clf = RandomForestClassifier(n_estimators=100)

            print('Running crossValidation on RandomForest...')
            scores = crossValidation.get_scores_from_cross_validation(
                clf, train_vectors, train_y)
            components.append(n_components)
            # The first entry of the score result is what gets plotted.
            accuracies.append(scores[0])

            print("Elapsed time of successional-run: ",
                  time.time() - start_time_successional)

    # Only plot when the sweep produced data; min()/max() on an empty array
    # raise ValueError, which used to crash pipeline mode.
    if components:
        xs = np.array(components)
        ys = np.array(accuracies)

        # Dense grid for a smooth quadratic interpolation through the points.
        xs_dense = np.linspace(xs.min(), xs.max(), 500)
        ys_smooth = interp1d(xs, ys, kind='quadratic')(xs_dense)

        plt.xlabel('n_components')
        plt.ylabel('Accuracy')
        plt.title('RandomForestClassifier Accuracy Graph')
        plt.plot(xs_dense, ys_smooth, color='r')
        plt.scatter(xs, ys)
        # NOTE(review): dynamic_datasets_path is not a parameter of this
        # function; it has to exist as a module-level name -- confirm.
        plt.savefig(
            os.path.join(dynamic_datasets_path, 'Resources', 'images',
                         'RandomForestClassifier Accuracy Graph.png'))
        plt.show()

    print('rfClassifier finished!\n')
    return scores