    def run_nlp(self, language):
        # Make sure the server is running properly (as explained in https://github.com/nltk/nltk/wiki/Stanford-CoreNLP-API-in-NLTK);
        # this might require root privileges.
        # English: java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -preload tokenize,ssplit,pos,lemma,ner,parse,depparse,sentiment -status_port 9000 -port 9000 -timeout 15000
        # The German implementation cannot do sentiment analysis, so its predictions do not bear any relevance; keeping the code like this just makes it easier to add some sentiment analysis of the parsed German text in the future.
        # If the service times out, increasing the timeout helps. This usually happens when a sentence is too long to be handled within the given period.
        self.__check_language(language)
        util.time_log("starting NLP...")
        annotator_dict = {"annotators": "sentiment"}
        classifier = CoreNLPParser("http://localhost:9000")

        ret_list = []

        for k_iter in range(0, self.k):
            prediction = []
            for review in self.test_data_text(language, k_iter):
                response_dict = classifier.api_call(review,
                                                    properties=annotator_dict,
                                                    timeout=500)
                count = 0
                sentiment = 0.0
                for sentence in response_dict["sentences"]:
                    count += 1
                    sentiment += float(sentence["sentimentValue"])

                avg_sentiment = sentiment / count
                # a lot better results with >=2
                prediction.append(1 if avg_sentiment >= 2 else 0)
            ret_list.append(prediction)
        return ret_list
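    # For reference, a minimal standalone sketch (not part of this class) of guarding a single
    # CoreNLP call against timeouts, assuming the server from the comment above is listening on
    # localhost:9000; the retry value of 2000 ms is just an illustrative choice:
    #
    #     from nltk.parse.corenlp import CoreNLPParser
    #
    #     parser = CoreNLPParser("http://localhost:9000")
    #     try:
    #         result = parser.api_call("The movie was surprisingly good.",
    #                                  properties={"annotators": "sentiment"},
    #                                  timeout=500)
    #     except Exception:
    #         # very long sentences can exceed the timeout; retrying with a larger value
    #         # (and/or raising the server-side -timeout) usually helps
    #         result = parser.api_call("The movie was surprisingly good.",
    #                                  properties={"annotators": "sentiment"},
    #                                  timeout=2000)
    #
    #     # each sentence carries a "sentimentValue" on a 0 (very negative) to 4 (very positive) scale
    #     scores = [float(s["sentimentValue"]) for s in result["sentences"]]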
    def __access_data_reviews(self, language):
        if self.current_language_type_accessed is None or self.current_language_type_accessed != language:
            util.time_log("vectorizing for " + language + " ...")
            self.current_language_type_accessed = language
            learn_test_tuple = self.file_reader.create_training_test_reviews(
                self.languages, language)
            self.k = len(learn_test_tuple)
            self.current_language_type_data = ([
                learn_test_tuple[i][0] for i in range(0, self.k)
            ], [learn_test_tuple[i][1] for i in range(0, self.k)])
            self.__create_bow_matrix(language)
            # self.__vectorize_reviews_with_bow(language)
            util.time_log("done vectorizing...")
    def run_textblob(self, language):
        self.__check_language(language)
        util.time_log("starting textblob...")
        ret_list = []
        for k_iter in range(0, self.k):
            if self.languages[language] == "english":
                ret_list.append([
                    1 if TextBlob_EN(w).polarity > 0 else 0
                    for w in self.test_data_text(language, k_iter)
                ])
            else:
                ret_list.append([
                    1 if TextBlob_DE(w).polarity > 0 else 0
                    for w in self.test_data_text(language, k_iter)
                ])
        return ret_list
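    # The TextBlob_EN / TextBlob_DE names used above are presumably aliases along these lines
    # (textblob-de exposes the same .polarity interface for German text); a minimal sketch:
    #
    #     from textblob import TextBlob as TextBlob_EN
    #     from textblob_de import TextBlobDE as TextBlob_DE
    #
    #     print(TextBlob_EN("The battery life is excellent.").polarity)  # > 0 -> positive
    #     print(TextBlob_DE("Der Akku ist eine Katastrophe.").polarity)  # <= 0 -> negative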
    def run_polyglot(self, language):
        self.__check_language(language)
        util.time_log("starting polyglot...")
        lang_code = "en" if self.languages[language] == "english" else "de"

        ret_list = []

        for k_iter in range(0, self.k):
            if self.languages[language] == "english":
                ret_list.append([
                    1 if Text(x, lang_code).polarity > 0 else 0
                    for x in self.test_data_text(language, k_iter)
                ])
            else:
                ret_list.append([
                    1 if Text(x, lang_code).polarity >= 0 else 0
                    for x in self.test_data_text(language, k_iter)
                ])
        return ret_list
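    # A minimal polyglot sketch, assuming the sentiment models have been fetched beforehand
    # (e.g. `polyglot download sentiment2.en sentiment2.de`); Text.polarity averages the
    # per-word polarities, and the method above thresholds it at 0:
    #
    #     from polyglot.text import Text
    #
    #     doc = Text("This product is great and works perfectly.", hint_language_code="en")
    #     label = 1 if doc.polarity > 0 else 0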
    def run_vader(self, language):
        self.__check_language(language)
        util.time_log("starting VADER...")

        ret_list = []

        for k_iter in range(0, self.k):
            if self.languages[language] == "english":
                classifier = Vader_EN()
                ret_list.append([
                    1 if classifier.polarity_scores(w)["compound"] >= 0 else 0
                    for w in self.test_data_text(language, k_iter)
                ])
            else:
                classifier = Vader_DE()
                ret_list.append([
                    1 if classifier.polarity_scores(w)["compound"] >= 0 else 0
                    for w in self.test_data_text(language, k_iter)
                ])
        return ret_list
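    # Vader_EN / Vader_DE are presumably aliases for the standard English VADER analyzer and a
    # German port with the same polarity_scores() interface; a minimal sketch for the English case:
    #
    #     from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as Vader_EN
    #
    #     scores = Vader_EN().polarity_scores("The camera is great, but the battery is awful.")
    #     # "compound" is a normalized score in [-1, 1]; the method above maps >= 0 to positive
    #     label = 1 if scores["compound"] >= 0 else 0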
    def run_naive_bayes(self, language):
        self.__check_language(language)
        util.time_log("starting nb...")
        ret_list = []
        self.load_data_reviews(language)
        for k_iter in range(0, self.k):
            util.time_log("learning...")
            classifier = NaiveBayesClassifier.train(
                self.training_data_text_vectorized_nb(language, k_iter))
            util.time_log("classifying")
            ret_list.append([
                classifier.classify(x)
                for x in self.test_data_text_vectorized_nb(language, k_iter)
            ])
        return ret_list
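    # NLTK's NaiveBayesClassifier expects (feature_dict, label) pairs for train() and a bare
    # feature dict for classify(); the *_vectorized_nb helpers above presumably produce that
    # shape. A tiny self-contained sketch with made-up data:
    #
    #     from nltk.classify import NaiveBayesClassifier
    #
    #     train = [({"great": True, "love": True}, 1),
    #              ({"terrible": True, "broken": True}, 0)]
    #     clf = NaiveBayesClassifier.train(train)
    #     print(clf.classify({"great": True}))  # -> 1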
    def run_svm(self, language):
        self.__check_language(language)
        util.time_log("starting svm...")
        ret_list = []
        self.load_data_reviews(language)
        for k_iter in range(0, self.k):
            classifier = svm.SVC(kernel="linear")
            util.time_log("learning...")
            vectorized = self.training_data_text_vectorized_bow(
                language, k_iter)
            classifier.fit(vectorized,
                           self.training_data_rating(language, k_iter))
            util.time_log("classifying")
            ret_list.append(
                classifier.predict(
                    self.test_data_text_vectorized_bow(language, k_iter)))
            # print(language + "," + str(k_iter) + ": " + str(self.bow_size(language, k_iter)))
        return ret_list
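    # Sketch of the presumed bag-of-words pipeline behind the *_vectorized_bow helpers: fit a
    # CountVectorizer on the training fold, reuse it on the test fold, then train a linear SVC
    # (illustrative data, not the project's helpers):
    #
    #     from sklearn.feature_extraction.text import CountVectorizer
    #     from sklearn import svm
    #
    #     vectorizer = CountVectorizer()
    #     X_train = vectorizer.fit_transform(["great product, works well", "broke after two days"])
    #     clf = svm.SVC(kernel="linear").fit(X_train, [1, 0])
    #     print(clf.predict(vectorizer.transform(["works really well"])))  # -> [1]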
Example #8
from statistics import median

import util
from randomizeFromFiles import K_fold_data_picker_mixed, K_fold_style_data_picker_specific_tests_on_base, \
    K_fold_parallel_file_reader
from standard_run import RunData


def handle_f1_prints(f1_list):
    accuracy_list = [x.accuracy for x in f1_list]
    print("accuracy-average: " + str(sum(accuracy_list) / len(f1_list)))
    print("accuracy-median: " + str(median(accuracy_list)))
    print("accuracy-min: " + str(min(accuracy_list)))
    print("accuracy-max: " + str(max(accuracy_list)))


if __name__ == '__main__':
    util.time_log("starting...")

    filenames_originally_english = [
        "../reviews/originally_english/1_star_reviews_orig_english.txt",
        "../reviews/originally_english/2_star_reviews_orig_english.txt",
        "../reviews/originally_english/4_star_reviews_orig_english.txt",
        "../reviews/originally_english/5_star_reviews_orig_english.txt"
    ]
    filenames_english_uncorrected = [
        "../reviews/original/1-star_translated_mapped.txt",
        "../reviews/original/2-star_translated_mapped.txt",
        "../reviews/original/4-star_translated_mapped.txt",
        "../reviews/original/5-star_translated_mapped.txt"
    ]
    """
    2k, 4k and 6k as the relevant base learning data. In this case from the originally English set of data.