Example #1
def classify_bkp(classifier_type,
                 X_train,
                 Y_train=None,
                 X_test=None,
                 Y_test=None,
                 keywords=None,
                 class_map=None,
                 is_X_text=True):
    # Stripped-down backup path: binarize the labels and run the SVC classifier
    # directly on the given features; classifier_type, keywords and is_X_text
    # are ignored here.
    _, Y_train = utility.binarize_data(Y_train, class_mapping=class_map)
    _, Y_test = utility.binarize_data(Y_test, class_mapping=class_map)
    svc_class = supervised.SupervisedClassifier.SvcClassifier()
    Y_pred, train_acc, test_acc = svc_class.classify(X_train, Y_train, X_test,
                                                     Y_test)
    return Y_pred, train_acc, test_acc
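Several of these examples unpack utility.binarize_data as a (binarizer, labels) pair or index into its result with [1]. As a point of reference, here is a minimal sketch of that assumed contract built on scikit-learn's LabelBinarizer; binarize_data_sketch is a hypothetical stand-in, and the project's actual helper may handle class_mapping differently.

from sklearn.preprocessing import LabelBinarizer

def binarize_data_sketch(data, class_mapping=None):
    # hypothetical stand-in for utility.binarize_data:
    # returns (fitted_binarizer, binarized_labels)
    binarizer = LabelBinarizer()
    if class_mapping is not None:
        # class_mapping is assumed to fix the label set / order
        binarizer.fit(list(class_mapping))
    else:
        binarizer.fit(data)
    return binarizer, binarizer.transform(data)

# usage mirroring the examples:
# _, Y_binary = binarize_data_sketch(['pos', 'neg', 'pos'])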
Example #2
def main():

    fs = FeatureSelection()

    data = datasets.get_news_data('keyword_data',
                                  'annotator_data_dump_with_text')

    X_data, Y_data = data['text'].values, data['category'].values

    tf_idf_vectorizer = utility.get_tf_idf_vectorizer(X_data)

    X_data_tf = tf_idf_vectorizer.transform(X_data)

    Y_binary = utility.binarize_data(data=Y_data)

    res = fs.ChiSquare.chiSquare(X_data_tf, Y_binary, 50)

    # print the first 20 entries of the chi-square selection result
    for i in range(20):
        print('%%%%%%%%%%%%%%%%%%%%%%%%%%%')
        print(res[i])
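For context, fs.ChiSquare.chiSquare above appears to rank TF-IDF features against the labels by the chi-square statistic. A self-contained sketch of that idea using scikit-learn's SelectKBest follows; the documents, labels and k value are illustrative, and the project wrapper's return format is not assumed here.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

docs = ['markets rally on strong earnings',
        'the team wins the final',
        'stocks slip sharply']
labels = [0, 1, 0]                      # toy binary categories

vectorizer = TfidfVectorizer().fit(docs)
X_tf = vectorizer.transform(docs)       # non-negative values, as chi2 requires

selector = SelectKBest(chi2, k=2).fit(X_tf, labels)
top_terms = [vectorizer.get_feature_names_out()[i]
             for i in selector.get_support(indices=True)]
print(top_terms)                        # the terms most associated with the labels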
Example #3
def validation_model():
    ## Classifier Name
    classifier_name = 'svc'
    keyword_classifier_name = 'svc'
    train_data_src = 'tweets_and_news'
    num_splits = 5

    ## News Data
    news_data = datasets.get_news_data(
        folder_name='keyword_data', file_name='annotator_data_dump_with_text')

    ## Tweet Data
    tweet_data = datasets.get_tweet_data(file_type='txt',
                                         file_name='tweet_truth.txt')

    if train_data_src == 'news':
        data = news_data
        field_names = ['text', 'category']

    elif train_data_src == 'tweets':
        data = tweet_data
        field_names = ['tweet_cmplt', 'class_annotated']

    elif train_data_src == 'tweets_and_news':
        data = tweet_data
        data_extra = news_data
        field_names = ['tweet_cmplt', 'class_annotated']
        field_names_extra = ['text', 'category']

    kf = KFold(n_splits=num_splits)
    kf.get_n_splits(data)
    train_acc = []
    test_acc = []

    pos_f_measure = []
    both_f_measure = []
    pos_acc_list = []
    both_acc_list = []

    for train_index, test_index in kf.split(data):
        X_train = data[field_names[0]].iloc[train_index]
        Y_train = data[field_names[1]].iloc[train_index]
        X_test = data[field_names[0]].iloc[test_index]
        Y_test = data[field_names[1]].iloc[test_index]

        if train_data_src == 'tweets_and_news':
            X_extra = data_extra[field_names_extra[0]]
            Y_extra = data_extra[field_names_extra[1]]

            # concatenate the news corpus onto the tweet training fold
            X_train = pd.concat([X_train, X_extra])
            Y_train = pd.concat([Y_train, Y_extra])

        if classifier_name in ['svc', 'lr', 'ada_boost']:
            Y_predicted, curr_train_acc, curr_test_acc = classifier.classify(
                classifier_name, X_train, Y_train, X_test, Y_test)
            train_acc.append(curr_train_acc)
            test_acc.append(curr_test_acc)

        elif classifier_name == 'cosineSim':

            keywords = keyword_generator.keyword_driver(
                keyword_classifier_name, X_train, Y_train, num_of_keywords=50)
            Y_predicted_pos, Y_predicted_both = classifier.classify(
                classifier_name, X_test, keywords=keywords)

            Y_test_list = []
            Y_pred_both_list = []
            Y_pred_pos_list = []

            for i in Y_test.keys():
                Y_test_list.append(Y_test.loc[i])
                Y_pred_pos_list.append(Y_predicted_pos[i])
                Y_pred_both_list.append(Y_predicted_both[i])

            Y_test_binary = utility.binarize_data(Y_test_list)
            Y_pred_pos_binary = utility.binarize_data(Y_pred_pos_list)
            Y_pred_both_binary = utility.binarize_data(Y_pred_both_list)

            both_acc_list.append(
                ca.calculate_accuracy(Y_predicted_both, Y_test))
            both_f_measure.append(
                cf.calculate_f_measure(Y_test_binary[1],
                                       Y_pred_both_binary[1]))
            pos_acc_list.append(ca.calculate_accuracy(Y_predicted_pos, Y_test))
            pos_f_measure.append(
                cf.calculate_f_measure(Y_test_binary[1], Y_pred_pos_binary[1]))

    if classifier_name == 'svc':
        print('SVC train Acc : ', mean(train_acc))
        print('SVC test Acc : ', mean(test_acc))

    elif classifier_name == 'cosineSim':

        print('cosineSim POS Acc : ', mean(pos_acc_list))
        print('cosineSim BOTH Acc : ', mean(both_acc_list))
        print('cosineSim POS F : ', mean(pos_f_measure))
        print('cosineSim BOTH F : ', mean(both_f_measure))
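The fold handling above splits only the tweet data with KFold and then appends the full news corpus to each training fold, never to the test fold. A self-contained sketch of that pattern, with illustrative frames and column names standing in for the project's datasets:

import pandas as pd
from sklearn.model_selection import KFold

# toy stand-ins for the tweet and news data
tweets = pd.DataFrame({'tweet_cmplt': ['t1', 't2', 't3', 't4'],
                       'class_annotated': ['pos', 'neg', 'pos', 'neg']})
news = pd.DataFrame({'text': ['n1', 'n2'],
                     'category': ['pos', 'neg']})

kf = KFold(n_splits=2)
for train_index, test_index in kf.split(tweets):
    # the test fold comes from the tweets only
    X_test = tweets['tweet_cmplt'].iloc[test_index]
    Y_test = tweets['class_annotated'].iloc[test_index]
    # the training fold is the tweet fold plus the whole news corpus
    X_train = pd.concat([tweets['tweet_cmplt'].iloc[train_index], news['text']])
    Y_train = pd.concat([tweets['class_annotated'].iloc[train_index], news['category']])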
Example #4
def classify(classifier_type,
             X_train,
             Y_train=None,
             X_test=None,
             Y_test=None,
             keywords=None,
             class_map=None,
             is_X_text=True):

    if (classifier_type in ['svc', 'lr', 'ada_boost']):

        if Y_train is None:

            raise ValueError(
                classifier_type,
                ' is a Supervised Algorithm, pass training labels ...')

        elif X_test is None and Y_test is None:

            train_data = zip(X_train, Y_train)

            train_data, test_data = sklearn.model_selection.train_test_split(
                pd.DataFrame.from_records(train_data))

            X_train, Y_train = train_data[0], train_data[1]

            X_test, Y_test = test_data[0], test_data[1]

            print(
                'Since no TEST Data provided, splitting given data into train and test'
            )

        X_train = utility.get_str_from_list(X_train)

        X_test = utility.get_str_from_list(X_test)

        # binarize the training labels, using the supplied class mapping if given
        if class_map is not None:
            fitted_binarizer, Y_train_binary = utility.binarize_data(
                Y_train, class_mapping=class_map)
        else:
            fitted_binarizer, Y_train_binary = utility.binarize_data(Y_train)

        if Y_test is not None:

            f, Y_test_binary = utility.binarize_data(Y_test,
                                                     class_mapping=class_map)

        if is_X_text:
            tf_idf_vectorizer = utility.get_tf_idf_vectorizer(X_train)

            X_train_tf_idf = tf_idf_vectorizer.transform(X_train)

            X_test_tf_idf = tf_idf_vectorizer.transform(X_test)
        else:
            X_train_tf_idf = X_train
            X_test_tf_idf = X_test

        if classifier_type == 'svc':

            svc_class = supervised.SupervisedClassifier.SvcClassifier()

            if Y_test is not None:

                Y_pred, train_acc, test_acc = svc_class.classify(
                    X_train_tf_idf, Y_train_binary, X_test_tf_idf,
                    Y_test_binary)

                return Y_pred, train_acc, test_acc

            else:

                Y_pred, train_acc = svc_class.classify(X_train_tf_idf,
                                                       Y_train_binary,
                                                       X_test_tf_idf)

                return Y_pred, train_acc

        elif classifier_type == 'lr':

            lr_class = supervised.SupervisedClassifier.LogisticRClassifier()

            if Y_test is not None:
                Y_pred, train_acc, test_acc = lr_class.classify(
                    X_train_tf_idf, Y_train_binary, X_test_tf_idf,
                    Y_test_binary)
                return Y_pred, train_acc, test_acc

            else:
                Y_pred, train_acc = lr_class.classify(X_train_tf_idf,
                                                      Y_train_binary,
                                                      X_test_tf_idf)
                return Y_pred, train_acc

        elif classifier_type == 'ada_boost':

            ada_boost_class = supervised.SupervisedClassifier.AdaBoostClassifier()

            if Y_test is not None:
                Y_pred, train_acc, test_acc = ada_boost_class.classify(
                    X_train_tf_idf, Y_train_binary, X_test_tf_idf,
                    Y_test_binary)

                return Y_pred, train_acc, test_acc

            else:
                Y_pred, train_acc = ada_boost_class.classify(
                    X_train_tf_idf, Y_train_binary, X_test_tf_idf)

                return Y_pred, train_acc

    elif classifier_type == 'cosineSim':

        cosine_sim_class = unsupervised.UnsupervisedClassifiers.CosineSimilarity()

        Y_pred_pos, Y_pred_both = cosine_sim_class.classify(
            X_train, keywords, vector_type='word_embeddings')

        return Y_pred_pos, Y_pred_both
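For the supervised branches, the dispatcher above boils down to: TF-IDF vectorize the training text, binarize the labels, fit the chosen classifier, and report train/test accuracy. A self-contained sketch of that pipeline with scikit-learn, using illustrative data and LinearSVC as a stand-in for the project's SvcClassifier wrapper:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.svm import LinearSVC

X_train = ['markets rally on earnings', 'the team wins the final', 'stocks slip again']
Y_train = ['business', 'sports', 'business']
X_test = ['underdogs win the cup']
Y_test = ['sports']

vectorizer = TfidfVectorizer().fit(X_train)       # fit on training text only
X_train_tf = vectorizer.transform(X_train)
X_test_tf = vectorizer.transform(X_test)

binarizer = LabelBinarizer().fit(Y_train)         # analogous to utility.binarize_data
y_train_bin = binarizer.transform(Y_train).ravel()
y_test_bin = binarizer.transform(Y_test).ravel()

svc = LinearSVC().fit(X_train_tf, y_train_bin)
train_acc = svc.score(X_train_tf, y_train_bin)
test_acc = svc.score(X_test_tf, y_test_bin)
Y_pred = binarizer.inverse_transform(svc.predict(X_test_tf))
print(Y_pred, train_acc, test_acc)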
Example #5
def main():
    news_dict = datasets.get_news_data(
        folder_name='keyword_data', file_name='annotator_data_dump_with_text')

    category_names = ['tweet_cmplt', 'class_annotated']
    category_names_news = ['text', 'category']

    twitter_dict = datasets.get_tweet_data('txt', 'tweet_truth.txt')

    kf = KFold(n_splits=5)
    kf.get_n_splits(twitter_dict)

    some_dict = {}
    train_acc = []
    test_acc = []

    acc_both = []
    f_both = []
    acc_pos = []
    f_pos = []

    ada_test_list = []
    ada_train_list = []

    news_train = news_dict['text']
    news_class = news_dict['category']

    for train_index, test_index in kf.split(twitter_dict):
        print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')

        X_train = twitter_dict['tweet_cmplt'].iloc[train_index]
        Y_train = twitter_dict['class_annotated'].iloc[train_index]
        X_test = twitter_dict['tweet_cmplt'].iloc[test_index]
        Y_test = twitter_dict['class_annotated'].iloc[test_index]

        # augment the tweet training fold with the full news corpus
        some_dict['tweet_cmplt'] = pd.concat([X_train, news_train])
        some_dict['class_annotated'] = pd.concat([Y_train, news_class])

        ada_predicted, ada_train_acc, ada_test_acc = classify(
            'ada_boost', some_dict['tweet_cmplt'],
            some_dict['class_annotated'], X_test, Y_test)

        ada_train_list.append(ada_train_acc)
        ada_test_list.append(ada_test_acc)

        print('ada_train_list : ', ada_train_list)
        print('ada_test_list : ', ada_test_list)

        keywords = keyword_generator.keyword_driver(
            'svc',
            some_dict['tweet_cmplt'],
            some_dict['class_annotated'],
            num_of_keywords=50)

        for item in keywords:
            print(item, ' : ', keywords[item])

        predicted, curr_train_acc, curr_test_acc = classify(
            'svc', some_dict['tweet_cmplt'], some_dict['class_annotated'],
            X_test, Y_test)

        train_acc.append(curr_train_acc)
        test_acc.append(curr_test_acc)

        print('train_acc SVC: ', train_acc)
        print('test_acc SVC: ', test_acc)

        # keyword-based cosine-similarity predictions over the test fold
        Y_pred_pos, Y_pred_both = classify('cosineSim', X_test, keywords=keywords)

        Y_test_list = []
        Y_pred_both_list = []
        Y_pred_pos_list = []

        for i in Y_test.keys():
            Y_test_list.append(Y_test.loc[i])
            Y_pred_pos_list.append(Y_pred_pos[i])
            Y_pred_both_list.append(Y_pred_both[i])

        Y_test_binary = utility.binarize_data(Y_test_list)
        Y_pred_pos_binary = utility.binarize_data(Y_pred_pos_list)
        Y_pred_both_binary = utility.binarize_data(Y_pred_both_list)

        acc_both.append(ca.calculate_accuracy(Y_pred_both, Y_test))
        f_both.append(
            cf.calculate_f_measure(Y_test_binary[1], Y_pred_both_binary[1]))
        acc_pos.append(ca.calculate_accuracy(Y_pred_pos, Y_test))
        f_pos.append(
            cf.calculate_f_measure(Y_test_binary[1], Y_pred_pos_binary[1]))

    print('################################ BOTH')
    print('acc_both : ', mean(acc_both))
    print('f_both : ', mean(f_both))
    print('################################ POS')
    print('acc_pos : ', mean(acc_pos))
    print('f_pos : ', mean(f_pos))
    print('############################### SVC')
    print('Train_Accuracy : ', mean(train_acc))
    print('Test_Accuracy : ', mean(test_acc))
    print('############################### ADA_Boost')
    print('Train_Accuracy : ', mean(ada_train_list))
    print('Test_Accuracy : ', mean(ada_test_list))

    # TWEET DATA
    twitter_dict = datasets.get_tweet_data('txt', 'tweet_truth.txt')
    train_data, test_data = utility.split_data(twitter_dict)
    category_names = ['tweet_cmplt', 'class_annotated']
    #category_names_tweet = ['tweet_word_list', 'class_annotated']

    predicted_data, train_acc, test_acc = classify(
        'lr', train_data[category_names[0]], train_data[category_names[1]],
        test_data[category_names[0]], test_data[category_names[1]])
    #predicted_data, train_acc, test_acc = classify('svc', news_dict[category_names_news[0]], news_dict[category_names_news[1]],
    #                                                   twitter_dict[category_names_tweet[0]], twitter_dict[category_names_tweet[1]])
    # print(predicted_data)

    print('train_acc : ', train_acc)

    print('test_acc : ', test_acc)
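The accuracy and F-measure bookkeeping in the loops above (ca.calculate_accuracy, cf.calculate_f_measure) is assumed to reduce to standard accuracy and F1 over the binarized labels; a minimal sketch with scikit-learn's metrics, on illustrative data:

from sklearn.metrics import accuracy_score, f1_score

# illustrative binarized labels, e.g. Y_test_binary[1] and Y_pred_both_binary[1]
y_true_bin = [1, 0, 1, 1, 0]
y_pred_bin = [1, 0, 0, 1, 0]

acc = accuracy_score(y_true_bin, y_pred_bin)
f = f1_score(y_true_bin, y_pred_bin)
print('accuracy :', acc, ' f-measure :', f)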