Example #1
0
def main():
    """Evaluate the "nn" classifier on the review dataset.

    Loads the training and test reviews, builds four feature extractors
    (tf-idf, two word-embedding variants and a Senna-format SSWE extractor,
    all with size/dimension 500) and hands them to an ``Evaluator`` for:

    1. 10-fold cross validation on the training set,
    2. ``create_evaluation_result`` with the same folds,
    3. evaluation against the held-out test set.

    NOTE(review): ``train_set``, ``test_set``, ``w2v_vec_path``, ``sswe_w2v``,
    ``sswe_senna_vectors`` and ``sswe_senna_vocabs`` are not defined in this
    function; presumably they are module-level settings -- confirm.
    """
    num_fold = 10

    # LOAD TRAIN SET
    dataset_train = Dataset.DatasetReview()
    dataset_train.load_review_from_csv(train_set)

    # LOAD TEST SET
    dataset_test = Dataset.DatasetReview()
    dataset_test.load_review_from_csv(test_set)

    fe_tfidf = TfidfFeatureExtractor(size=500)
    fe_w2v = WordEmbeddingFeatureExtractor(infile=w2v_vec_path, binary=False, dimen=500)
    fe_sswe_w2v = WordEmbeddingFeatureExtractor(infile=sswe_w2v, binary=False, dimen=500, sswe=1)
    fe_sswe = SennaFeatureExtractor(infile=sswe_senna_vectors, vocabfile=sswe_senna_vocabs, dimen=500)

    feature_extractors = [fe_tfidf, fe_w2v, fe_sswe_w2v, fe_sswe]

    ev = Evaluator()

    # print(...) with a single string argument behaves the same on Python 2
    # and Python 3, unlike the bare print statement.
    print("\n**** CROSS VALIDATION EVALUATION (CORPUS: DATASET) ****\n")
    model = Classifier(models="nn")

    kfold = KFold(n_splits=num_fold)
    ev.eval_with_cross_validation(model, feature_extractors=feature_extractors,
                                  training_set=dataset_train, num_fold=num_fold, cv=kfold)

    # A fresh classifier instance is created for every evaluation step.
    model = Classifier(models="nn")
    ev.create_evaluation_result(model, feature_extractors=feature_extractors,
                                training_set=dataset_train, num_fold=num_fold, cv=kfold)

    print("\n**** TEST SET EVALUATION (CORPUS: DATASET) ****\n")
    model = Classifier(models="nn")
    ev.eval_with_test_set(model, feature_extractors=feature_extractors,
                          training_set=dataset_train,
                          test_set=dataset_test)
Example #2
0
def main(train_set, output_path):
    """Build a 300-dimension word-embedding extractor and save its model.

    The reviews in ``train_set`` are loaded, their contents are passed to
    ``WordEmbeddingFeatureExtractor`` and the resulting model is written as
    text (``binary=False``) to ``vectors_full_300.txt`` with its vocabulary
    in ``vocab_full_300.txt``.

    :param train_set: path passed to ``DatasetReview.load_review_from_csv``.
    :param output_path: directory that receives both output files; it is
        created when missing. An empty string writes to the current
        working directory.
    """
    # LOAD TRAIN SET
    dataset_train = Dataset.DatasetReview()
    dataset_train.load_review_from_csv(train_set)

    # os.makedirs("") raises, so only create a directory when one was named.
    if output_path and not os.path.exists(output_path):
        os.makedirs(output_path)

    # os.path.join keeps the files inside output_path whether or not the
    # caller supplied a trailing separator; plain concatenation did not.
    vectors_file = os.path.join(output_path, "vectors_full_300.txt")
    vocab_file = os.path.join(output_path, "vocab_full_300.txt")

    fe = WordEmbeddingFeatureExtractor(dataset_train.get_contents(), dimen=300)
    fe.save_model_to_file(vectors_file,
                          vocabfile=vocab_file,
                          binary=False)
Example #3
0
def main(infile):
    """Load reviews from ``infile`` and export them twice.

    The dataset is first exported through ``export_only_contents`` to
    ``sentences_new.txt`` and then through ``export_formatted_dataset`` to
    ``formatted_dataset_wow.tsv``; both paths are relative to the current
    working directory.

    :param infile: path passed to ``DatasetReview.load_review_from_csv``.
    """
    reviews = Dataset.DatasetReview()
    reviews.load_review_from_csv(infile)

    # (exporter method, destination) pairs, executed in this order.
    exports = (
        ("export_only_contents", "sentences_new.txt"),
        ("export_formatted_dataset", "formatted_dataset_wow.tsv"),
    )
    for method_name, destination in exports:
        getattr(reviews, method_name)(destination)
Example #4
0
def main(infile):
    """Evaluate an SVM with five feature extractors in three evaluation modes.

    For each mode -- cross validation, training-set evaluation and test-set
    evaluation -- the same sequence of feature extractors is rebuilt from the
    training contents and each one is evaluated with a fresh
    ``Classifier(models="svm")`` and a fresh ``Evaluator``.

    :param infile: NOTE(review): not read by this function; the datasets are
        loaded from ``train_set`` and ``test_set``, which are not defined
        here and are presumably module-level settings -- confirm.
    """
    # LOAD TRAIN SET
    dataset_train = Dataset.DatasetReview()
    dataset_train.load_review_from_csv(train_set)

    # LOAD TEST SET
    dataset_test = Dataset.DatasetReview()
    dataset_test.load_review_from_csv(test_set)

    # print(...) with a single string argument behaves the same on Python 2
    # and Python 3, unlike the bare print statement.
    print("\n**** CROSS VALIDATION EVALUATION (CORPUS: WIKIPEDIA) ****\n")
    for fe in _wikipedia_feature_extractors(dataset_train):
        classifier = Classifier(models="svm")
        ev = Evaluator()
        ev.eval_with_cross_validation(classifier, fe, dataset_train)

    print("\n**** TRAINING SET EVALUATION (CORPUS: WIKIPEDIA) ****\n")
    for fe in _wikipedia_feature_extractors(dataset_train):
        classifier = Classifier(models="svm")
        ev = Evaluator()
        ev.eval_with_training_set(classifier, fe, dataset_train)

    print("TEST SET EVALUATION (CORPUS: WIKIPEDIA)")
    # The last extractor of this phase used to read the unbound name
    # ``dataset``; the shared generator always reads the training set.
    for fe in _wikipedia_feature_extractors(dataset_train):
        classifier = Classifier(models="svm")
        ev = Evaluator()
        ev.eval_with_test_set(classifier, fe, dataset_train, dataset_test)


def _wikipedia_feature_extractors(dataset_train):
    """Yield the five feature extractors in evaluation order.

    This is a generator, so each extractor is only constructed when the
    caller asks for it, i.e. immediately before it is evaluated.
    ``get_contents()`` is called again for every extractor.
    """
    yield BagFeatureExtractor(dataset_train.get_contents())
    yield TfidfFeatureExtractor(dataset_train.get_contents())
    yield WordEmbeddingFeatureExtractor(dataset_train.get_contents(),
                                        infile=wiki_w2v_model,
                                        binary=False,
                                        dimen=200)
    yield WordEmbeddingFeatureExtractor(dataset_train.get_contents(),
                                        infile="vectors_full_wow.txt",
                                        binary=False,
                                        dimen=200)
    yield SennaFeatureExtractor(dataset_train.get_contents(),
                                infile="../senna_vectors.txt",
                                vocabfile="../senna_vocab.txt")
def main():
    """Sentiment Specific Embedding for twitter classification.

    Runs three stages in order; each one sits behind a literal ``if True:``
    switch that can be edited by hand to skip it:

    1. clean the raw tweet dataset and train the SSWE model,
    2. visualise the resulting embeddings,
    3. load the preprocessed SemEval train/test sets, train ``NeuralNets``
       on vectors produced by ``buildWordVector`` and then evaluate the
       "svm", "multinomial", "rfc" and "nn" classifiers with 10-fold cross
       validation and against the test set.
    """
    embeddings_size = 50  # Embedding size for SSWE model
    vocab_file = "Embedding/features/semeval_vocabs_200.txt"  # path to the vocabulary file
    vector_file = "Embedding/features/semeval_vectors_200.txt"  # path to the vector file
    stopwordsfile = "preprocess/stopwords.txt"

    # ---- Sentiment-Specific Word Embedding (SSWE) ----
    if True:
        # Load dataset
        data_train = 'dataset/training1600000.csv'  # training data set file path
        pre_data_train = 'dataset/preprocessed_dataset1600000.csv'  # file to save dataset after cleaning

        if True:
            print("\n **** Dataset cleaning ****")
            tweets_prepocess(data_train, pre_data_train, stopwordsfile)

        if True:
            print("\n **** SSWE model Trainig ****")
            train_model = None  # path to an already trained model file, if one exists
            save_model = "Embedding/models/SSWE_model_1600000_200"  # path to the file where model will be saved
            sswe = create_sswe_model(pre_data_train, vocab_file, vector_file,
                                     train_model, save_model, embeddings_size)
            sswe_trainer(sswe)

    # ---- Embedding visualisation and similarity computing ----
    if True:
        visualiser = Visualiser(
            sizeOfEmbedding=embeddings_size,
            VocabsFname=vocab_file,
            VectorsFname=vector_file,
            WVFilename="Visualisation/data/w2vformat.txt",
            visualizerHTMLfilename="Visualisation/data/embedding.html")
        visualiser.visualize()

    # ---- Twitter sentiment classification ----
    if True:
        # Data pre-processing. The cleaning calls themselves are disabled, so
        # the two preprocessed CSV files must already exist on disk.
        print("\n **** Training data cleaning ****")
        pre_processing_train = "dataset/preprocessed_semeval_traindataset.csv"
        # tweets_prepocess(train_set, pre_processing_train, stopwordsfile)

        print("\n **** Test data cleaning ****")
        pre_processing_test = "dataset/preprocessed_semeval_testdataset.csv"
        # tweets_prepocess(test_set, pre_processing_test, stopwordsfile)

        # LOAD TRAIN SET
        dataset_train = Dataset.DatasetReview()
        dataset_train.load_review_from_csv(pre_processing_train)

        # LOAD TEST SET
        dataset_test = Dataset.DatasetReview()
        dataset_test.load_review_from_csv(pre_processing_test)

        # ------------------------ Neural Nets classifier ------------------------

        # Extract Features
        tweet2v = get_sswe_features(vocab_file, vector_file)

        # Extract samples and labels. The test split must come from the test
        # set; it was previously taken from the training set, so the network
        # was scored on the data it had been trained on.
        x_train, y_train = split_data(dataset_train)
        x_test, y_test = split_data(dataset_test)

        # The tf-idf model is built from the training samples only and is
        # reused for the test samples.
        tfidf = build_tfidf(x_train)

        train_vecs_sswe = _scaled_tweet_vectors(x_train, embeddings_size, tweet2v, tfidf)
        test_vecs_sswe = _scaled_tweet_vectors(x_test, embeddings_size, tweet2v, tfidf)

        # neural network model
        neuralnets = NeuralNets(input_size=embeddings_size,
                                x_train=train_vecs_sswe,
                                y_train=y_train,
                                epochs=450,
                                batch_size=32,
                                x_test=test_vecs_sswe,
                                y_test=y_test)
        neuralnets.train_neural_nets()

        # ------------------- Classical classifiers with sklearn -------------------
        print("\n**** CROSS VALIDATION EVALUATION (CORPUS: SemEval) ****\n")

        fe_sswe = SennaFeatureExtractor(infile=vector_file,
                                        vocabfile=vocab_file,
                                        dimen=embeddings_size)
        feature_extractors = [fe_sswe]
        ev = Evaluator()

        # SVM, naive Bayes, random forest and MLP, in that order. The same
        # model instance is used for all three evaluations of a classifier.
        for model_name in ("svm", "multinomial", "rfc", "nn"):
            print("\n**** CROSS VALIDATION EVALUATION (CORPUS: SemEval) ****\n")
            model = Classifier(models=model_name)
            kfold = KFold(n_splits=10)
            ev.eval_with_cross_validation(model,
                                          feature_extractors=feature_extractors,
                                          training_set=dataset_train,
                                          num_fold=10,
                                          cv=kfold)
            ev.create_evaluation_result(model,
                                        feature_extractors=feature_extractors,
                                        training_set=dataset_train,
                                        num_fold=10,
                                        cv=kfold)

            # The naive Bayes section used to label this "CORPUS: DATASET";
            # the corpus evaluated here is SemEval for every classifier.
            print("\n**** TEST SET EVALUATION (CORPUS: SemEval) ****\n")
            ev.eval_with_test_set(model,
                                  feature_extractors=feature_extractors,
                                  training_set=dataset_train,
                                  test_set=dataset_test)


def _scaled_tweet_vectors(tweets, embeddings_size, tweet2v, tfidf):
    """Build one vector per tweet with ``buildWordVector`` and scale the result.

    Each tweet is split on whitespace, the per-tweet arrays are concatenated
    with ``np.concatenate`` and the combined array is passed through
    ``scale``. ``tqdm`` only reports progress over ``tweets``.
    """
    vectors = np.concatenate([
        buildWordVector(tweet.split(), embeddings_size, tweet2v, tfidf)
        for tweet in tqdm(tweets)
    ])
    return scale(vectors)