def __init__(self):
    """
    Builds the manager/helper collaborators this object delegates to and
    fixes the tuple of years the data set covers.
    :return: void
    """
    self.__db_manager = DBManager()            # database access layer
    self.__helper = GeneralHelpers()           # misc utility functions
    self.__plot_manager = PlotManager()        # plotting / visualisation
    self.__import_manager = ImportManager()    # tweet import pipeline
    self.__feature_manager = FeatureManager()  # feature extraction
    # Years iterated by the experiment/plot methods.
    self.years = ("2012", "2013", "2014", "2015")
def __init__(self): """ Constructor method :param file_path_to_import: String a txt file path containing tweet ids :return: ImportManager instance """ self.__db_manager = DBManager() self.__helper = GeneralHelpers() self.__preprocess_manager = PreprocessManager() self.__tweets_classes_dictionary = {} # magic numbers self.__components_in_a_line = 2 self.__max_num_of_tweets_at_once = 100
def __init__(self):
    """
    Initializes the preprocessor: loads the persisted root-finding and
    spelling-suggestion caches and records the dictionaries directory.
    """
    Preprocessor.__init__(self)
    self.__helper = GeneralHelpers()
    # Persistent caches so repeated words skip the (slow) Zemberek calls.
    self.__root_cache = self.__helper.load_roots_cache()
    self.__suggestion_cache = self.__helper.load_suggestion_cache()
    self.__dictionaries_directory = PROJECT_ROOT_DIRECTORY + DICTIONARIES_DIR_NAME
class PreprocessManager(Preprocessor):
    """
    Text preprocessing front-end: character removal, misspelling
    correction and root (stem) finding, with persistent caches so that
    repeated words do not hit the Zemberek backend again.
    """

    def __init__(self):
        Preprocessor.__init__(self)
        self.__helper = GeneralHelpers()
        # Persistent caches: word -> root and word -> suggestion.
        self.__root_cache = self.__helper.load_roots_cache()
        self.__suggestion_cache = self.__helper.load_suggestion_cache()
        self.__dictionaries_directory = PROJECT_ROOT_DIRECTORY + DICTIONARIES_DIR_NAME

    def remove_characters_in_string(self, text, characters=()):
        """
        Removes every occurrence of the given characters from a string.
        :param text: String
        :param characters: iterable of characters to remove (default: none).
            The previous mutable default `[]` is replaced by an immutable
            tuple to avoid the shared-mutable-default pitfall; all existing
            call sites behave identically.
        :return: new string with the characters stripped
        """
        for char in characters:
            text = text.replace(char, "")
        return text

    def correct_misspelling(self, word):
        """
        Suggests the corrected spelling for the given word.
        Special keywords (e.g. model names) bypass correction entirely.
        :param word: string, word to correct
        :return: string, suggestion (cached after the first lookup)
        """
        has_special_keyword, special_keyword = self._has_special_keyword(word)
        if has_special_keyword:
            self.__suggestion_cache[word] = special_keyword
            return special_keyword
        if word in self.__suggestion_cache:
            return self.__suggestion_cache[word]
        corrected_word = self.__helper.correct_misspelling_from_zemberek(word)
        self.__suggestion_cache[word] = corrected_word
        return corrected_word

    def find_root_of_word(self, word):
        """
        Returns the root (stem) of a word.
        Special keywords bypass root finding entirely.
        :param word: string, word
        :return: string, root of word (cached after the first lookup)
        """
        has_special_keyword, special_keyword = self._has_special_keyword(word)
        if has_special_keyword:
            self.__root_cache[word] = special_keyword
            return special_keyword
        if word in self.__root_cache:
            return self.__root_cache[word]
        root_of_word = self.__helper.find_root_from_zemberek(word)
        self.__root_cache[word] = root_of_word
        return root_of_word

    def save_caches(self):
        """
        Persists any changes made to the suggestion and root caches.
        :return: void
        """
        self.__helper.save_changes_in_suggestion_cache(self.__suggestion_cache)
        self.__helper.save_changes_in_root_cache(self.__root_cache)

    def _has_special_keyword(self, word):
        """
        Checks whether any special keyword (e.g. the model name) occurs in
        the word, case-insensitively.
        :param word: string, word
        :return: (bool, string) — found flag and the matching keyword; when
            several keywords match, the last one in SPECIAL_KEYWORDS wins
            (unchanged behavior). Empty string when none match.
        """
        has_special_keyword = False
        special_keyword = ""
        lowered_word = word.lower()  # hoisted out of the loop
        for keyword in SPECIAL_KEYWORDS:
            if keyword.lower() in lowered_word:
                has_special_keyword = True
                special_keyword = keyword
        return has_special_keyword, special_keyword
class Main:
    """ Main class, makes necessary function calls to necessary classes """

    def __init__(self):
        # Collaborators this facade delegates to.
        self.__db_manager = DBManager()
        self.__helper = GeneralHelpers()
        self.__plot_manager = PlotManager()
        self.__import_manager = ImportManager()
        self.__feature_manager = FeatureManager()
        # Years covered by the data set.
        self.years = ("2012", "2013", "2014", "2015")

    def retrieve_tweets(self, file_path_of_ids):
        """
        Runs Import Manager to retrieve and import tweets
        :param file_path_of_ids: String, file path of tweets to import
        :return: void
        """
        self.__import_manager.run(file_path_of_ids)

    def extract_features_and_generate_arff(self, n=3, analyzer='char', year='2012'):
        """
        Makes necessary function calls to extract features for given year and to generate arff file
        :param n: int, ngram count
        :param analyzer: string, word or char
        :param year: string, 2012, 2013, 2014, 2015 or ALL
        :return: string, path of generated arff file
        """
        # Getting tweets with year
        print("Getting tweets for year "+ year)
        tweets_for_given_year = self.__db_manager.get_tweets_for_year(year)

        print("Generating document and classes of tweets.")
        document, classes = self.__feature_manager.create_document_and_classes_for_tweets(tweets_for_given_year, True)

        print("Fitting the data, finding ngrams and frequencies.")
        ngrams, arff_data, vectorizer, X = self.__feature_manager.fit_data(document, classes, n, analyzer)

        print("Formatting the data for arff lib format.")
        formatted_arff_data = self.__feature_manager.format_data_for_arff(ngrams, arff_data)

        print("Generating file.")
        # Experiment name, 1grams, 2grams, 3grams.. or words
        experiment_name = str(n)+'Gram' if analyzer == 'char' else 'Word'

        # File name, TTNet_3grams_2012
        file_name = MODEL_NAME + '_' + experiment_name + '_' + year

        # File name randomized TTNet_3grams_2012_asfas12.arff
        file_name = self.__helper.generate_random_file_name(file_name, ARFF_FILE_EXTENSION)

        # Arff file path ...../DataSet-ARFF/3Gram/TTNet/TTNet_3grams_2012_asfas12.arff
        arff_file_path = PROJECT_ROOT_DIRECTORY + DATASET_ARFF_DIR_NAME + experiment_name + '/' + MODEL_NAME + '/'

        # Generating the file with data
        self.__helper.generate_arff_file(arff_file_path, file_name, formatted_arff_data)
        print("Arff file generated at path:"+arff_file_path+file_name)

    def run_experiment_with_scikit_learn(self, n=1, analyzer='word'):
        """
        Makes necessary method calls to run the experiment on scikit learn.
        :param n: int, count n in n-gram
        :param analyzer: string, either 'word' or 'char'
        :return: void
        """
        # Retrieving all tweets from database
        print("Retrieving all tweets from database.")
        tweets_for_all_years = {}

        # Iterating over all years
        for year in self.years:
            # Retrieving tweets for the year
            tweets_for_year = self.__db_manager.get_tweets_for_year(year)
            tweets_for_all_years[year] = tweets_for_year

        # Creating a big list of tweets
        print("Creating a big list of tweets.")
        all_tweets = []

        # Appending all tweets together
        # NOTE: dict.iteritems() is Python 2 only — consistent with the
        # copy_reg / im_func usage below; this method requires Python 2.
        for year, tweets in tweets_for_all_years.iteritems():
            all_tweets += tweets

        # Generating document
        print("Generating document and classes by preprocessing")

        # Preprocessing and generation of document
        document, classes = self.__feature_manager.create_document_and_classes_for_tweets(all_tweets, True)

        # Getting years' tweets counts
        print("Getting years' tweets counts.")
        years_tweets_counts = {}
        for year in self.years:
            years_tweets_counts[year] = len(tweets_for_all_years[year])

        all_processes = []
        self.all_experiments_results = []
        # Leave one CPU free; the `or 1` keeps at least one worker on
        # single-core machines (cpu_count()-1 == 0 is falsy).
        pool = Pool(cpu_count()-1 or 1)
        # Register a reducer so bound methods (the callback target) can be
        # pickled across to worker processes under Python 2.
        copy_reg.pickle(types.MethodType, self._reduce_method)

        print("Running experiments.")
        t0 = time.time()
        for i in range(0, N_EXPERIMENTS):
            print("Experiment:"+str(i))
            experiment_manager = ExperimentManager(i, years_tweets_counts, n, analyzer)
            # Run each experiment asynchronously; results are folded in via
            # the _accumulate_experiments_scores callback.
            r = pool.apply_async(experiment_manager.run_experiment, args=(document, classes,), callback=self._accumulate_experiments_scores)
            all_processes.append(r)

        # Block until every experiment has finished.
        for a_process in all_processes:
            a_process.wait()
        t1 = time.time()

        print("Elapsed time:", t1- t0, " seconds")
        pool.close()
        pool.join()

        print("Cumulating all the experiments' scores.")
        final_results_from_all_experiments = self.__helper.cumulate_years_scores(self.all_experiments_results)
        return final_results_from_all_experiments

    def _reduce_method(self, m):
        """
        Pickle reducer for Python 2 method objects, registered with copy_reg
        so multiprocessing can ship method references to worker processes.
        :param m: method object
        :return: (getattr, args) pair that re-creates the method on unpickling
        """
        if m.im_self is None:
            # Unbound method: re-fetch it from the class.
            return getattr, (m.im_class, m.im_func.func_name)
        else:
            # Bound method: re-fetch it from the instance.
            return getattr, (m.im_self, m.im_func.func_name)

    def _accumulate_experiments_scores(self, an_experiments_result):
        """
        Accumulates experiments' scores (apply_async callback; runs in the
        parent process).
        :return: void
        """
        an_experiments_result = self.__helper.calculate_relative_scores(an_experiments_result)
        self.all_experiments_results.append(an_experiments_result)

    def plot_experiment_results(self, root_dir):
        """
        Plots experiment's results from log files
        :param root_dir: string
        :return: void
        """
        lines_scores = self.__helper.get_accuracy_scores_for_experiment_years_from_root_dir(root_dir)
        self.__plot_manager.plot_experiments_results(lines_scores)

    def plot_all_experiment_results_with_scikit_learn(self, all_line_scores_of_all_experiments):
        """
        Plots all line scores of all experiments
        :param all_line_scores_of_all_experiments: dict
        :return: void
        """
        self.__plot_manager.plot_experiments_results_with_scikit_learn(all_line_scores_of_all_experiments)

    def plot_years_scores(self, root_dir):
        """
        Makes necessary function calls to plot years scores
        :param root_dir: string
        :return: void
        """
        self.__plot_manager.plot_years_scores_from_root_directory(root_dir)

    def plot_2012_vs_rest(self, root_dir):
        """
        Makes necessary function calls to plot 2012 vs REST scores
        :param root_dir: string
        :return: void
        """
        self.__plot_manager.plot_2012_vs_rest(root_dir)

    def plot_top_feature_frequencies_in_years(self):
        """
        Makes necessary function calls to plot top features' frequencies in years
        :return: void
        """
        years_features_counts = {}
        for year in self.years:
            years_features_counts[year] = self.find_frequency_dictionary_for_year(year)
        self.__plot_manager.plot_top_feature_frequencies_in_years(years_features_counts)

    def find_frequency_dictionary_for_year(self, year):
        """
        Finds frequencies of each feature for given year
        :param year: string
        :return: dict, term -> frequency for the given year
        """
        # For this particular method, find_roots=True, n=1, analyzer=word because we're working with top info gain words
        tweets_for_the_year = self.__db_manager.get_tweets_for_year(year)
        document, classes = self.__feature_manager.create_document_and_classes_for_tweets(tweets_for_the_year, find_roots=True)
        ngrams, arff_data, vectorizer, X = self.__feature_manager.fit_data(document, classes, n=1, analyzer='word')
        terms = vectorizer.get_feature_names()
        # Column sums of the term-document matrix give per-term frequencies
        # (.A1 flattens the matrix row to a 1-D array).
        freqs = X.sum(axis=0).A1
        # Sort (frequency, term) pairs descending by frequency.
        result = sorted(zip(freqs, terms), reverse=True)
        freqs = [elm[0] for elm in result]
        terms = [elm[1] for elm in result]
        final_result = dict(zip(terms, freqs))
        return final_result

    def plot_years_intersection_scores(self):
        """
        Makes necessary function calls to plot a matrix which shows years'
        vocabularies similarities
        :return: void
        """
        years_features_counts = {}
        for year in self.years:
            years_features_counts[year] = self.find_frequency_dictionary_for_year(year)
        self.__plot_manager.plot_years_intersection_scores(years_features_counts)

    def import_new_tweets_from_csv(self, root_path):
        """
        Imports tweets from CSV files found under the given directory.
        :param root_path: string, directory containing .csv files
        :return: void
        """
        self.__import_manager.import_new_tweets_from_csv(root_path)
class PreprocessManager(Preprocessor):
    """
    Preprocessing helper built on top of Preprocessor: strips characters,
    corrects misspellings and finds word roots, caching Zemberek results
    so repeated words are answered locally.
    """

    def __init__(self):
        Preprocessor.__init__(self)
        self.__helper = GeneralHelpers()
        self.__root_cache = self.__helper.load_roots_cache()
        self.__suggestion_cache = self.__helper.load_suggestion_cache()
        self.__dictionaries_directory = PROJECT_ROOT_DIRECTORY + DICTIONARIES_DIR_NAME

    def remove_characters_in_string(self, text, characters=[]):
        """
        Removes specified characters in a string
        :param text: String
        :param characters: list, list of characters to remove
        :return: new string
        """
        cleaned = text
        if len(characters):
            for unwanted in characters:
                cleaned = cleaned.replace(unwanted, "")
        return cleaned

    def correct_misspelling(self, word):
        """
        Suggest correct words for given word
        :param word: string, word
        :return: string, suggestion
        """
        found, keyword = self._has_special_keyword(word)
        if found:
            # Special keywords are returned verbatim and remembered.
            self.__suggestion_cache[word] = keyword
            return keyword
        if word not in self.__suggestion_cache:
            # Cache miss: ask Zemberek once, then remember the answer.
            self.__suggestion_cache[word] = self.__helper.correct_misspelling_from_zemberek(word)
        return self.__suggestion_cache[word]

    def find_root_of_word(self, word):
        """
        Returns root of word
        :param word: string, word
        :return: string, root of word
        """
        found, keyword = self._has_special_keyword(word)
        if found:
            # Special keywords are returned verbatim and remembered.
            self.__root_cache[word] = keyword
            return keyword
        if word not in self.__root_cache:
            # Cache miss: ask Zemberek once, then remember the answer.
            self.__root_cache[word] = self.__helper.find_root_from_zemberek(word)
        return self.__root_cache[word]

    def save_caches(self):
        """
        Saves suggestion and root finding caches' changes
        :return: void
        """
        self.__helper.save_changes_in_suggestion_cache(self.__suggestion_cache)
        self.__helper.save_changes_in_root_cache(self.__root_cache)

    def _has_special_keyword(self, word):
        """
        If a special keyword (e.g. the model name) is present in the word,
        returns it; when several match, the last one in SPECIAL_KEYWORDS wins.
        :param word: string, word
        :return: bool, string
        """
        found = False
        matched = ""
        lowered = word.lower()
        for candidate in SPECIAL_KEYWORDS:
            if candidate.lower() in lowered:
                found = True
                matched = candidate
        return found, matched
class ImportManager:
    """
    Handles importing tweets into the database from various sources such
    as txt id files and CSV exports.
    """

    # Class-level defaults; both are (re)assigned per run/instance.
    __file_path = None
    __components_in_a_line = None

    def __init__(self):
        """
        Constructor method.
        Sets up the DB/helper/preprocess collaborators, the tweet-id ->
        sentiment-class lookup and the batching constants.
        :return: ImportManager instance
        """
        self.__db_manager = DBManager()
        self.__helper = GeneralHelpers()
        self.__preprocess_manager = PreprocessManager()
        self.__tweets_classes_dictionary = {}  # tweet_id -> sentiment class

        # magic numbers
        self.__components_in_a_line = 2         # fields per txt line: id, class
        self.__max_num_of_tweets_at_once = 100  # Twitter lookup batch limit

    def run(self, file_path_to_import):
        """
        Runs all necessary methods to import tweets for a year.
        :param file_path_to_import: String, txt file with "id, class" lines
        :return: void
        """
        self.__file_path = file_path_to_import

        # getting tweets with their classes
        tweets_with_classes = self._parse_tweets_from_file()
        self.__tweets_with_classes = tweets_with_classes

        # finding duplicates
        unique_tweets, duplicate_tweets = self._find_duplicates(tweets_with_classes)
        print("Found " + str(len(duplicate_tweets)) + " duplicate tweets.")
        self.__helper.pretty_print_list(duplicate_tweets, "Duplicate tweets:")
        print("Continuing with unique ones.")

        # getting tweet ids from [tweet_id, class]
        unique_tweets_ids = self._get_tweets_ids(unique_tweets)

        # retrieving tweets from Twitter
        all_tweet_information = self._retrieve_tweets_from_twitter(unique_tweets_ids)

        # some tweets may not be found on Twitter
        not_found_tweets_on_twitter = self._find_not_found_tweets_on_twitter(all_tweet_information)

        # creating db model objects
        all_tweet_objects = self._create_tweet_objects(all_tweet_information)

        # insert to database
        success_count, not_imported_tweets = self.__db_manager.insert_tweet_objects(all_tweet_objects)

        # Sanity report: unique == not_found + not_inserted + inserted
        print("\n")
        print('-' * 10)
        print('Total Math:')
        print('Unique tweets:' + str(len(unique_tweets)))
        print('Tweets not found:' + str(len(not_found_tweets_on_twitter)))
        print('Tweets not inserted:' + str(len(not_imported_tweets)))
        print('Tweets OK:' + str(success_count))
        print(str(len(unique_tweets)) + "==" + str(len(not_found_tweets_on_twitter) + len(not_imported_tweets) + success_count))

    def _parse_tweets_from_file(self):
        """
        Parses tweet ids and classes from the txt file at self.__file_path.
        Also (re)fills self.__tweets_classes_dictionary (id -> class).
        :return: list, holds [[124214124, positive], ...]
        """
        characters_to_remove = ["'", '"', '\n', ' ']

        with open(self.__file_path, 'r') as tweets_ids_file:
            tweets_with_classes = []
            # Bug fix: this previously assigned the *public* attribute
            # `self.tweets_classes_dictionary` (a typo), so the private
            # mapping used everywhere else was never cleared between runs.
            self.__tweets_classes_dictionary = {}

            # Iterating over lines in txt file
            for line in tweets_ids_file:
                line_components = line.split(",")

                # if there are two components in a line. E.g. "121412412412", "positive"
                if self.__components_in_a_line == len(line_components):
                    # removing unnecessary characters from each component
                    for index, component in enumerate(line_components):
                        line_components[index] = self.__preprocess_manager.remove_characters_in_string(component, characters_to_remove)
                    tweets_with_classes.append(line_components)
                    self.__tweets_classes_dictionary.update({line_components[0]: line_components[1]})
            return tweets_with_classes

    def _find_duplicates(self, tweets_with_classes):
        """
        Splits the input into first-seen unique tweets and duplicate ids.
        :param tweets_with_classes: List of [tweet_id, class] pairs
        :return: (unique tweets, duplicate tweet ids)
        """
        unique_tweets = []
        seen_tweets_ids = set()  # set gives O(1) membership vs. list scan
        duplicate_tweet_ids = []

        for tweet_block in tweets_with_classes:
            # First element is the tweet id
            tweet_id = tweet_block[0]

            if tweet_id not in seen_tweets_ids:
                seen_tweets_ids.add(tweet_id)
                unique_tweets.append(tweet_block)
            else:
                duplicate_tweet_ids.append(tweet_id)
        return unique_tweets, duplicate_tweet_ids

    def _retrieve_tweets_from_twitter(self, tweet_ids):
        """
        Retrieves tweet information from Twitter in batches.
        :param tweet_ids: List of tweet id strings
        :return: list of tweet response objects
        """
        tweets_results = []
        twitter_manager = TwitterManager()
        # Twitter's lookup endpoint accepts a limited number of ids per call.
        chunks_of_tweets_ids = self.__helper.get_chunks_of_list(tweet_ids, self.__max_num_of_tweets_at_once)

        for chunk in chunks_of_tweets_ids:
            print("Searching for " + str(len(chunk)) + " tweets.")
            result = twitter_manager.lookup(chunk)
            print("Found " + str(len(result)) + " tweets.")
            tweets_results += result
        return tweets_results

    def _get_tweets_ids(self, tweets_with_classes):
        """
        Extracts tweet ids from tweets with classes.
        :param tweets_with_classes: List of [tweet_id, class] pairs
        :return: list of extracted ids
        """
        return [pair[0] for pair in tweets_with_classes]

    def _create_tweet_objects(self, all_tweets):
        """
        Creates db model objects from Twitter response objects.
        :param all_tweets: List of tweet responses
        :return: List of tweet model objects
        """
        all_tweet_objects = []
        for tweet in all_tweets:
            tweet_object = self.__db_manager.get_new_model_instance()
            tweet_object.id = tweet.id_str
            tweet_object.created_at = tweet.created_at
            tweet_object.lang = tweet.lang
            tweet_object.source = tweet.source
            tweet_object.user_id = tweet.user.id_str
            # Strip emojis/smileys before storing; stored as utf-8 bytes.
            tweet_object.text = self.__preprocess_manager.clean_emojis_and_smileys(tweet.text).encode('utf-8')
            tweet_object.tweet_class = self._get_sentiment_class_of_tweet(tweet.id_str)
            all_tweet_objects.append(tweet_object)
        return all_tweet_objects

    def _get_sentiment_class_of_tweet(self, tweet_id):
        """
        Returns the sentiment class for the given tweet id.
        :param tweet_id: string
        :return: tweet class
        :raises KeyError: if the id was not parsed from the input file
        """
        return self.__tweets_classes_dictionary[tweet_id]

    def _find_not_found_tweets_on_twitter(self, twitter_response):
        """
        Finds ids that were requested but missing from the Twitter response.
        :param twitter_response: list, Twitter response objects
        :return: list, ids of not-found tweets (set difference; order not guaranteed)
        """
        tweets_ids = self._get_tweets_ids(self.__tweets_with_classes)
        response_ids = [a_tweet_response.id_str for a_tweet_response in twitter_response]
        return list(set(tweets_ids) - set(response_ids))

    def import_new_tweets_from_csv(self, root_path):
        """
        Imports tweets from every .csv file directly under root_path.
        :param root_path: string, directory path (concatenated with the file
            name, so it must end with a path separator)
        :return: void
        """
        tweet_objects = []
        for file in os.listdir(root_path):
            if file.endswith('.csv'):
                with open(root_path + file, 'r') as file_handle:
                    reader = csv.reader(file_handle, delimiter=';')
                    next(reader, None)  # skip the headers
                    for row in reader:
                        a_tweet_obj = self._create_tweet_object_from_line(row, file)
                        tweet_objects.append(a_tweet_obj)
        success_count, not_imported_tweets = self.__db_manager.insert_tweet_objects(tweet_objects)
        print(success_count)
        print(not_imported_tweets)

    def _create_tweet_object_from_line(self, components, file_name):
        """
        Builds a tweet model object from one CSV row.
        :param components: list, CSV row; layout observed here:
            [id, ?, date, text, sentiment code e/h/n]
        :param file_name: string, source file name (the 2-digit year is
            derived from it)
        :return: tweet model object (or None if the year guard rejects the row)
        """
        MAP_DICT = {'e': 'positive', 'h': 'negative', 'n': 'neutral'}

        id_component = components[0]  # unused, kept to document the row layout
        date_component = components[2]
        text_component = components[3]
        sentiment_component = MAP_DICT[components[4]]

        # NOTE(review): year_abv is a substring of file_name by construction,
        # so this guard can never trigger; kept for behavioral parity.
        year_abv = file_name.split('.')[0][2:]
        if year_abv not in file_name:
            return

        # Dates like "12-Jan" lack a year; append the file's 2-digit year.
        date_len = len(date_component.split('-'))
        if date_len == 2:
            date_component = date_component + '-' + year_abv

        format_str = '%d-%b-%y'
        datetime_of_tweet = datetime.strptime(date_component, format_str)

        tweet_object = self.__db_manager.get_new_model_instance()
        # CSV rows carry no usable tweet id; generate a random one.
        tweet_object.id = self.__helper.generate_random_string(10)
        tweet_object.text = text_component
        tweet_object.created_at = datetime_of_tweet
        tweet_object.tweet_class = sentiment_component
        return tweet_object
class ImportManager:
    """ This class handles importing tweets to the database from various sources such as text files """

    # Class-level defaults; both are (re)assigned per run/instance.
    __file_path = None
    __components_in_a_line = None

    def __init__(self):
        """
        Constructor method.
        Sets up the DB/helper/preprocess collaborators, the tweet-id ->
        sentiment-class lookup and the batching constants.
        :return: ImportManager instance
        """
        self.__db_manager = DBManager()
        self.__helper = GeneralHelpers()
        self.__preprocess_manager = PreprocessManager()
        self.__tweets_classes_dictionary = {}  # tweet_id -> sentiment class

        # magic numbers
        self.__components_in_a_line = 2         # fields per txt line: id, class
        self.__max_num_of_tweets_at_once = 100  # Twitter lookup batch limit

    def run(self, file_path_to_import):
        """
        Runs all necessary methods to import tweets for a year
        :param file_path_to_import: String, txt file with "id, class" lines
        :return: void
        """
        self.__file_path = file_path_to_import

        # getting tweets with their classes
        tweets_with_classes = self._parse_tweets_from_file()
        self.__tweets_with_classes = tweets_with_classes

        # finding duplicates
        unique_tweets, duplicate_tweets = self._find_duplicates(tweets_with_classes)
        print("Found "+str(len(duplicate_tweets))+" duplicate tweets.")
        self.__helper.pretty_print_list(duplicate_tweets, "Duplicate tweets:")
        print("Continuing with unique ones.")

        # getting tweet ids from [tweet_id, class]
        unique_tweets_ids = self._get_tweets_ids(unique_tweets)

        # retrieving tweets from Twitter
        all_tweet_information = self._retrieve_tweets_from_twitter(unique_tweets_ids)

        # some tweets may not be found on Twitter
        not_found_tweets_on_twitter = self._find_not_found_tweets_on_twitter(all_tweet_information)

        # creating db model objects
        all_tweet_objects = self._create_tweet_objects(all_tweet_information)

        # insert to database
        success_count, not_imported_tweets = self.__db_manager.insert_tweet_objects(all_tweet_objects)

        # Sanity report: unique == not_found + not_inserted + inserted
        print("\n")
        print('-'*10)
        print('Total Math:')
        print('Unique tweets:'+str(len(unique_tweets)))
        print('Tweets not found:'+str(len(not_found_tweets_on_twitter)))
        print('Tweets not inserted:'+str(len(not_imported_tweets)))
        print('Tweets OK:'+str(success_count))
        print(str(len(unique_tweets))+"=="+str(len(not_found_tweets_on_twitter)+len(not_imported_tweets)+success_count))

    def _parse_tweets_from_file(self):
        """
        Parses tweet ids and classes from txt file.
        Also fills self.__tweets_classes_dictionary (id -> class).
        :return: list, holds [[124214124, positive],...]
        """
        characters_to_remove = ["'", '"', '\n', ' ']

        with open(self.__file_path, 'r') as tweets_ids_file:
            tweets_with_classes = []
            # NOTE(review): this assigns the *public* attribute while the
            # rest of the class uses the private (name-mangled)
            # __tweets_classes_dictionary — looks like a typo; the private
            # dict is therefore never cleared between runs. Confirm intent.
            self.tweets_classes_dictionary = {}

            # Iterating over lines in txt file
            for line in tweets_ids_file:
                line_components = line.split(",")

                # if there are two components in a line. E.g. "121412412412", "positive"
                if self.__components_in_a_line == len(line_components):
                    # iterating over components
                    for index, component in enumerate(line_components):
                        # removing unnecessary characters
                        line_components[index] = self.__preprocess_manager.remove_characters_in_string(component, characters_to_remove)
                    tweets_with_classes.append(line_components)
                    self.__tweets_classes_dictionary.update({line_components[0]:line_components[1]})
            return tweets_with_classes

    def _find_duplicates(self, tweets_with_classes):
        """
        Finds duplicate tweets
        :param tweets_with_classes: List a list of tweet ids and their sentiment classes.
        :return: unique tweets, duplicate tweets (ids only)
        """
        unique_tweets = []
        seen_tweets_ids = []
        duplicate_tweet_ids = []

        # Iterating over tweets with their classes. E.g [[214124124124, positive], [124124124124, negative]...]
        for tweet_block in tweets_with_classes:
            # First element is the tweet id
            tweet_id = tweet_block[0]

            # If it isn't seen before
            if not tweet_id in seen_tweets_ids:
                seen_tweets_ids.append(tweet_id)
                unique_tweets.append(tweet_block)
            else:
                duplicate_tweet_ids.append(tweet_id)
        return unique_tweets, duplicate_tweet_ids

    def _retrieve_tweets_from_twitter(self, tweet_ids):
        """
        Retrieves tweet information from Twitter, batched because the
        lookup endpoint accepts a limited number of ids per call.
        :param tweet_ids: List, tweet id strings
        :return: list of tweet response objects
        """
        tweets_results = []
        twitter_manager = TwitterManager()
        chunks_of_tweets_ids = self.__helper.get_chunks_of_list(tweet_ids, self.__max_num_of_tweets_at_once)

        for chunk in chunks_of_tweets_ids:
            print("Searching for "+str(len(chunk))+" tweets.")
            result = twitter_manager.lookup(chunk)
            print("Found "+str(len(result))+" tweets.")
            tweets_results+=result
        return tweets_results

    def _get_tweets_ids(self, tweets_with_classes):
        """
        Extracts tweet ids from tweets with classes
        :param tweets_with_classes: List, tweet_with classes
        :return: extracted ids
        """
        ids = [x[0] for x in tweets_with_classes]
        return ids

    def _create_tweet_objects(self, all_tweets):
        """
        Creates a list which holds db model objects.
        :param all_tweets: List, tweet response objects
        :return: List, tweet objects
        """
        all_tweet_objects = []
        for tweet in all_tweets:
            tweet_object = self.__db_manager.get_new_model_instance()
            tweet_object.id = tweet.id_str
            tweet_object.created_at = tweet.created_at
            tweet_object.lang = tweet.lang
            tweet_object.source = tweet.source
            tweet_object.user_id = tweet.user.id_str
            # Strip emojis/smileys before storing; stored as utf-8 bytes.
            tweet_object.text = self.__preprocess_manager.clean_emojis_and_smileys(tweet.text).encode('utf-8')
            tweet_object.tweet_class = self._get_sentiment_class_of_tweet(tweet.id_str)
            all_tweet_objects.append(tweet_object)
        return all_tweet_objects

    def _get_sentiment_class_of_tweet(self, tweet_id):
        """
        Returns a sentiment class for given tweet id
        :param tweet_id: string
        :return: tweet class
        """
        return self.__tweets_classes_dictionary[tweet_id]

    def _find_not_found_tweets_on_twitter(self, twitter_response):
        """
        Finds not found tweets on Twitter API (requested ids minus returned
        ids; set difference, so order is not guaranteed).
        :param twitter_response: list, Twitter response
        :return: list, not found tweets
        """
        tweets_ids = self._get_tweets_ids(self.__tweets_with_classes)
        response_ids = [a_tweet_response.id_str for a_tweet_response in twitter_response]
        return list(set(tweets_ids) - set(response_ids))

    def import_new_tweets_from_csv(self, root_path):
        """
        Imports tweets from every .csv file directly under root_path.
        :param root_path: string, directory path (concatenated with the file
            name, so it must end with a path separator)
        :return: void
        """
        tweet_objects = []
        for file in os.listdir(root_path):
            if file.endswith('.csv'):
                with open(root_path+file, 'r') as file_handle:
                    reader = csv.reader(file_handle, delimiter=';')
                    next(reader, None)  # skip the headers
                    for row in reader:
                        a_tweet_obj = self._create_tweet_object_from_line(row, file)
                        tweet_objects.append(a_tweet_obj)
        success_count, not_imported_tweets = self.__db_manager.insert_tweet_objects(tweet_objects)
        print(success_count)
        print(not_imported_tweets)

    def _create_tweet_object_from_line(self, components, file_name):
        """
        Builds a tweet model object from one CSV row.
        :param components: list, CSV row; layout observed here:
            [id, ?, date, text, sentiment code e/h/n]
        :param file_name: string, source file name (2-digit year derived from it)
        :return: tweet model object, or None if the year guard rejects the row
        """
        MAP_DICT = { 'e': 'positive', 'h': 'negative', 'n': 'neutral' }

        id_component = components[0]  # unused; documents the row layout
        date_component = components[2]
        text_component = components[3]
        sentiment_component = MAP_DICT[components[4]]

        # NOTE(review): year_abv is a substring of file_name by construction,
        # so this guard looks like it can never trigger — confirm intent.
        year_abv = file_name.split('.')[0][2:]
        if year_abv not in file_name:
            return

        # Dates like "12-Jan" lack a year; append the file's 2-digit year.
        date_len = len(date_component.split('-'))
        if date_len == 2:
            date_component = date_component + '-' + year_abv

        format_str = '%d-%b-%y'
        datetime_of_tweet = datetime.strptime(date_component, format_str)

        tweet_object = self.__db_manager.get_new_model_instance()
        # CSV rows carry no usable tweet id; a random one is generated.
        tweet_object.id = self.__helper.generate_random_string(10)
        tweet_object.text = text_component
        tweet_object.created_at = datetime_of_tweet
        tweet_object.tweet_class = sentiment_component
        return tweet_object
def __init__(self):
    """
    Sets up plotting state: helper collaborator, the color cycle used for
    bar series, the years covered by the data set and the regex used to
    pick prediction lines out of log files.
    """
    self.__first_year = 2012
    self.__helper = GeneralHelpers()
    self.__colors = ['r', 'b', 'y', 'm', 'g', 'c', 'k']
    self.__years = ('2012', '2013', '2014', '2015')
    # Fix: raw string literal — the pattern previously relied on '\d'/'\s'
    # being left alone as invalid escapes (a DeprecationWarning in modern
    # Python). The byte value of the string is unchanged.
    self.__regexp_for_predict_lines = r"\d{1,}\s{1,}\d{1}:\w{1,8}.{1,}"
class PlotManager:
    """
    This class does the necessary works for visualizing data: bar charts of
    yearly classifier scores, 2012-vs-rest line plots, feature-frequency bars,
    a year-vocabulary similarity matrix, and scikit-learn experiment curves.

    NOTE(review): the code relies on Python 2-only idioms (dict.iteritems(),
    indexing the list returned by dict.keys()/values(), str.decode on byte
    strings). It will not run unmodified on Python 3 -- confirm the target
    interpreter before porting.
    """

    def __init__(self):
        # Base year: maps a year string to a 0-based matrix/list index.
        self.__first_year = 2012
        self.__helper = GeneralHelpers()
        # One matplotlib single-letter color code per plotted series.
        self.__colors = ['r', 'b', 'y', 'm', 'g', 'c', 'k']
        self.__years = ('2012', '2013', '2014', '2015')
        # NOTE(review): not a raw string; \d/\s/\w are invalid escapes on
        # newer Pythons -- consider r"...".
        self.__regexp_for_predict_lines = "\d{1,}\s{1,}\d{1}:\w{1,8}.{1,}"

    def plot_years_scores_from_root_directory(self, root_dir):
        """
        Plots years' scores for given classifiers and mean of them
        :param root_dir: string, root directory to scan
        :return: void
        """
        bar_width = 0.10

        # Getting scores from helper
        years_classifier_scores_list = []
        years_classifier_scores_dict = self.__helper.get_accuracy_scores_for_years_from_root_dir(
            root_dir)

        # Making them lists (one row of classifier scores per year)
        for year, classifiers_scores in years_classifier_scores_dict.iteritems():
            years_classifier_scores_list.append(classifiers_scores.values())

        years_classifier_scores_list = np.array(years_classifier_scores_list)
        classifier_names = years_classifier_scores_dict['2012'].keys()
        indexes = np.arange(len(years_classifier_scores_dict.keys()))  # [0,1,2,3] +

        # Iterating over J48 for 2012, 2013, 2014, 2015, MEAN for 2012, 2013,
        # 2014, 2015 an so on.. (the transpose gives one column per classifier)
        for iteration_number, (color_name, classifier_name, classifier_scores) in enumerate(
                zip(self.__colors, classifier_names, years_classifier_scores_list.T)):
            # Shift each classifier's bar group sideways so groups don't overlap.
            bar_offset = indexes + (iteration_number * bar_width)
            plt.bar(bar_offset, classifier_scores, bar_width, color=color_name, label=classifier_name)

        plt.xlabel('Years')
        plt.ylabel('Scores %')
        plt.title('Scores by year and classifier(' + MODEL_NAME + ', CV=4)')
        plt.xticks(indexes + bar_width, self.__years)
        plt.legend(loc=4)
        plt.show()

    def plot_2012_vs_rest(self, root_dir):
        """
        Plots results of classifications of using 2012 as train set,
        2013, 2014, 2015 as test set.
        :param root_dir: string
        :return: void
        """
        all_accuracy_scores = self.__helper.get_log_files_stats(root_dir)
        """
        Example accuracy scores:
        {
            'SMO':{
                2013: [62.79, 66.67, 50.0, 70.45, 57.14, 60.0, 64.29, 66.67, 62.79, 73.17, 57.14, 66.67],
                2014: [65.45, 58.97, 54.35, 72.09, 47.62, 66.67, 71.43, 66.67, 57.78, 64.44, 71.43, 59.26],
                2015: [62.79, 57.14, 63.16, 62.5, 59.26, 67.27, 61.76, 66.67, 68.63]
            },
            'IB1': {
                2013: [37.21, 40.48, 38.1, 43.18, 45.24, 47.5, 45.24, 35.71, 32.56, 24.39, 28.57, 51.28],
                2014: [38.18, 43.59, 41.3, 39.53, 47.62, 50.0, 33.33, 44.44, 26.67, 24.44, 40.48, 37.04],
                2015: [37.21, 41.07, 43.86, 39.29, 51.85, 30.91, 39.71, 26.32, 45.1]
            }
            ...
            ...
            ...
        }
        """
        # Two views of the same data: per-month curves and per-year means.
        self._plot_2012_vs_rest_monthly(all_accuracy_scores)
        self._plot_2012_vs_rest_yearly(all_accuracy_scores)

    def plot_top_feature_frequencies_in_years(self, years_features_counts):
        """
        Plots top features' frequencies in years
        :param years_features_counts: dict, per-year {feature: count} maps
        :return: void
        """
        plot_feature_counts = {}
        bar_width = 0.20

        for feature_name in INFO_GAIN_ATTRIBUTES:
            if not feature_name in plot_feature_counts:
                plot_feature_counts[feature_name] = []
            # Feature names are byte strings; the per-year maps are keyed by
            # their unicode form -- presumably; verify against the producer.
            f_key = feature_name.decode('utf-8')
            # Backfill zero counts so every year has an entry for this feature.
            for year in self.__years:
                if not f_key in years_features_counts[year]:
                    years_features_counts[year][f_key] = 0
            plot_feature_counts[feature_name] = [
                years_features_counts["2012"][f_key],
                years_features_counts["2013"][f_key],
                years_features_counts["2014"][f_key],
                years_features_counts["2015"][f_key]
            ]
        print(plot_feature_counts)

        indexes = np.arange(len(plot_feature_counts.keys()))
        # One bar group per feature; within a group, one bar per year.
        for first_iteration_number, (feature_name, feature_counts) in enumerate(
                plot_feature_counts.iteritems()):
            for second_iteration_number, (color, feature_count) in enumerate(
                    zip(self.__colors, feature_counts)):
                x_coord = first_iteration_number + (second_iteration_number * bar_width)
                plt.bar(x_coord, feature_count, bar_width, color=color)

        xticks = [key.decode('utf-8') for key in plot_feature_counts.keys()]
        plt.xlabel('Features')
        # NOTE(review): "__years" in this label looks like a bad find/replace
        # artifact (probably meant "years") -- verify.
        plt.ylabel('Frequencies in __years')
        plt.title('InfoGain features by year and features(' + MODEL_NAME + ')')
        plt.xticks(indexes + bar_width * 2, xticks)

        # Manual legend: one color patch per year.
        handles = []
        for idx, (year, color) in enumerate(zip(self.__years, self.__colors)):
            patch = Patch(color=color, label=year)
            handles.append(patch)
        plt.legend(loc=1, handles=handles)
        plt.show()

    def plot_years_intersection_scores(self, years_features_counts):
        """
        Plots a matrix which shows years' vocabularies similarities
        :param years_features_counts: dict
        :return: void
        """
        years_intersection_scores = np.zeros((len(self.__years), len(self.__years)))
        feature_frequencies = years_features_counts

        for first_iteration_number, (x_year, x_years_features) in enumerate(
                feature_frequencies.iteritems()):
            features_of_x = x_years_features.keys()
            total_count = np.sum(x_years_features.values())
            for second_iteration_number, (y_year, y_years_features) in enumerate(
                    feature_frequencies.iteritems()):
                if x_year == y_year:
                    # Diagonal left at 0 rather than 1 -- intentional? verify.
                    pass
                else:
                    features_of_y = y_years_features.keys()
                    intersect = list(set(features_of_x) & set(features_of_y))
                    # Sum the *other* year's counts over the shared vocabulary.
                    intersect_count = 0
                    for intersect_item in intersect:
                        intersect_count = intersect_count + y_years_features[intersect_item]
                    ratio = float(intersect_count) / total_count
                    i_index = int(x_year) - self.__first_year  # 0
                    j_index = int(y_year) - self.__first_year  # 1
                    years_intersection_scores[i_index][j_index] = ratio

        all_scores_df = pd.DataFrame(years_intersection_scores, self.__years, self.__years)
        # NOTE(review): "__years" here also looks like a find/replace artifact.
        print(MODEL_NAME + '\'s __years\' vocabulary similarities:')
        print(all_scores_df)

    def plot_experiments_results_with_scikit_learn(self, lines_scores):
        """
        Plots experiments' results from scikit learn
        :param lines_scores: dict, per-line score series; "line2" carries
            (min, mid, max) triples, "line3" holds several sub-experiments
        :return: void
        """
        test_years = ['13', '14', '15']
        markers = ['o', 'D', 'h', '*', '+']
        plot_types = ['-', '--', '-.', ':', ',']
        legend_line_names = {
            'line1': 'LINE1',
            'line2': 'LINE2',
            'line3L0': 'LINE3-MultinomialNB DB',
            'line3L1': 'LINE3-kMEANS CLUSTERING',
            'line3L2': 'LINE3-kMEANS CLUSTERING(probabilities)',
            'line3L3': 'LINE3-MultinomialNB DB Iterative Approach',
            'line4': 'LINE4'
        }
        # -(2012-500)/(YEAR-300)
        # -(2012-500)+(YEAR-R50)/(YEAR-300)
        # -(2012-500)+(YEAR-L50)/(YEAR-300)
        # -(2012-500)+(YEAR-L50)/(YEAR-300)
        # -(2012-500)+(YEAR-200)/(YEAR-300)
        fig, ax = plt.subplots(figsize=(20, 9))
        ax.set_autoscale_on(False)
        ax.set_xlim([12.5, 15.5])
        # Running y-range over all plotted series, used for the final ylim.
        all_of_min = 100
        all_of_max = 0
        handles = []
        color_index = 0

        for first_iteration_number, (line_name, line_points) in enumerate(
                lines_scores.iteritems()):
            line_max, line_min = 0, 100
            if line_name == "line2":
                # line2: rows of (min, mid, max); draw the mid curve plus
                # hand-made error bars (two caps and a vertical connector).
                line_points_array = np.array(line_points.values())
                ys = line_points_array[:, 1]
                mins = line_points_array[:, 0]
                maxs = line_points_array[:, 2]
                line_max, line_min = np.max(maxs), np.min(mins)
                for sub_iteration_number, (a_min, a_max) in enumerate(zip(mins, maxs)):
                    ax.plot((int(test_years[sub_iteration_number]) - 0.05,
                             int(test_years[sub_iteration_number]) + 0.05),
                            (a_min, a_min), 'k-')
                    ax.plot((int(test_years[sub_iteration_number]) - 0.05,
                             int(test_years[sub_iteration_number]) + 0.05),
                            (a_max, a_max), 'k-')
                    ax.plot((int(test_years[sub_iteration_number]),
                             int(test_years[sub_iteration_number])),
                            (a_min, a_max), 'k-')
                ax.plot(test_years, ys, self.__colors[color_index],
                        marker=markers[first_iteration_number],
                        linestyle=plot_types[first_iteration_number],
                        linewidth=3.0)
                patch = Patch(color=self.__colors[color_index],
                              label=legend_line_names[line_name])
                color_index += 1
                handles.append(patch)
            elif line_name == "line3":
                # line3: one curve per ALE sub-experiment key prefix.
                for sub_iteration_number, (ale_experiment_key) in enumerate(ALE_LINE3_KEYS):
                    proper_dict_values = [
                        line_points[dict_key] for dict_key in line_points.keys()
                        if dict_key.startswith(ale_experiment_key)
                    ]
                    ys = proper_dict_values
                    line_max, line_min = np.max(ys), np.min(ys)
                    ax.plot(test_years, ys, self.__colors[color_index],
                            marker=markers[first_iteration_number],
                            linestyle=plot_types[first_iteration_number],
                            linewidth=3.0)
                    patch = Patch(color=self.__colors[color_index],
                                  label=legend_line_names[line_name + ale_experiment_key])
                    handles.append(patch)
                    color_index += 1
            else:
                # Default: one simple curve from the dict's values.
                ys = line_points.values()
                line_max, line_min = np.max(ys), np.min(ys)
                ax.plot(test_years, ys, self.__colors[color_index],
                        marker=markers[first_iteration_number],
                        linestyle=plot_types[first_iteration_number],
                        linewidth=3.0)
                patch = Patch(color=self.__colors[color_index],
                              label=legend_line_names[line_name])
                handles.append(patch)
                color_index += 1
            all_of_min = min(line_min, all_of_min)
            all_of_max = max(line_max, all_of_max)

        # Pad the observed range slightly for the y axis.
        ymin = all_of_min - 0.01
        ymax = all_of_max + 0.01
        plt.legend(handles=handles)
        ax.set_ylim([ymin, ymax])
        plt.yticks(np.arange(ymin, ymax, 0.01))
        ax.set_xticklabels(["", "13", "", "14", "", "15"])
        plt.xlabel('Years')
        plt.ylabel('Scores %')
        plt.title('Scores by year with changing training sets. Classifier=SVM Feature=Word.')
        plt.tight_layout()
        plt.grid()
        plt.show()

    def _plot_2012_vs_rest_monthly(self, all_accuracy_scores):
        """
        Plots 2012 vs REST graphic in monthly basis.
        :param all_accuracy_scores: dict
        :return: void
        """
        date_ranges = pd.date_range(start='1/1/2013', periods=33, freq='M')
        date_ranges = np.array([date_obj.strftime('%b-%y') for date_obj in date_ranges])
        xs = date_ranges
        # One figure per classifier; all years' monthly scores concatenated.
        # NOTE(review): plt.show() is never called here -- figures are
        # presumably displayed by a later call; verify.
        for iteration_number, classifier_scores in enumerate(all_accuracy_scores.values()):
            ys = []
            fig = plt.figure(iteration_number)
            for year, year_scores in classifier_scores.iteritems():
                ys += year_scores
            xs = np.arange(1, 34, 1)  # overrides the date labels built above
            plt.xlabel("Months")
            plt.ylabel("Scores%")
            plt.title(all_accuracy_scores.keys()[iteration_number])
            plt.plot(xs, ys)

    def _plot_2012_vs_rest_yearly(self, all_accuracy_scores):
        """
        Plots 2012 vs REST graphic in yearly basis.
        :param all_accuracy_scores: dict
        :return: void
        """
        date_ranges = pd.date_range(start='1/1/2013', periods=3, freq='365D')
        date_ranges = np.array([date_obj.strftime('%y') for date_obj in date_ranges])
        xs = date_ranges
        yearly_scores = {}  # NOTE(review): never used -- dead variable.
        fig, ax = plt.subplots()
        names_of_classifiers = all_accuracy_scores.keys()
        # One mean-per-year line per classifier, all on the same axes.
        for iteration_number, classifier_scores in enumerate(all_accuracy_scores.values()):
            ys = []
            for year, year_scores in classifier_scores.iteritems():
                ys.append(np.mean(year_scores))
            # Labels/title are (redundantly) reset on every iteration.
            plt.xlabel('Years')
            plt.ylabel('Scores %')
            plt.title('Scores by year and classifier(' + MODEL_NAME + ', train=2012, test=2013, 2014, 2015)')
            ax.set_xticklabels(xs)
            plt.xticks(rotation=90)
            ax.plot(xs, ys, self.__colors[iteration_number],
                    label=names_of_classifiers[iteration_number])
        plt.legend()
        plt.show()
class PlotManager:
    """
    This class does the necessary works for visualizing data: bar charts of
    yearly classifier scores, 2012-vs-rest line plots, feature-frequency bars,
    a year-vocabulary similarity matrix, and scikit-learn experiment curves.

    NOTE(review): this is the second, token-identical definition of
    PlotManager in this file; at import time it shadows the earlier one.
    Consider deleting one copy.

    NOTE(review): the code relies on Python 2-only idioms (dict.iteritems(),
    indexing the list returned by dict.keys()/values(), str.decode on byte
    strings). It will not run unmodified on Python 3 -- confirm the target
    interpreter before porting.
    """

    def __init__(self):
        # Base year: maps a year string to a 0-based matrix/list index.
        self.__first_year = 2012
        self.__helper = GeneralHelpers()
        # One matplotlib single-letter color code per plotted series.
        self.__colors = ['r', 'b', 'y', 'm', 'g', 'c', 'k']
        self.__years = ('2012', '2013', '2014', '2015')
        # NOTE(review): not a raw string; \d/\s/\w are invalid escapes on
        # newer Pythons -- consider r"...".
        self.__regexp_for_predict_lines = "\d{1,}\s{1,}\d{1}:\w{1,8}.{1,}"

    def plot_years_scores_from_root_directory(self, root_dir):
        """
        Plots years' scores for given classifiers and mean of them
        :param root_dir: string, root directory to scan
        :return: void
        """
        bar_width = 0.10

        # Getting scores from helper
        years_classifier_scores_list = []
        years_classifier_scores_dict = self.__helper.get_accuracy_scores_for_years_from_root_dir(root_dir)

        # Making them lists (one row of classifier scores per year)
        for year, classifiers_scores in years_classifier_scores_dict.iteritems():
            years_classifier_scores_list.append(classifiers_scores.values())

        years_classifier_scores_list = np.array(years_classifier_scores_list)
        classifier_names = years_classifier_scores_dict['2012'].keys()
        indexes = np.arange(len(years_classifier_scores_dict.keys()))  # [0,1,2,3] +

        # Iterating over J48 for 2012, 2013, 2014, 2015, MEAN for 2012, 2013,
        # 2014, 2015 an so on.. (the transpose gives one column per classifier)
        for iteration_number, (color_name, classifier_name, classifier_scores) in enumerate(
                zip(self.__colors, classifier_names, years_classifier_scores_list.T)):
            # Shift each classifier's bar group sideways so groups don't overlap.
            bar_offset = indexes + (iteration_number * bar_width)
            plt.bar(bar_offset, classifier_scores, bar_width, color=color_name, label=classifier_name)

        plt.xlabel('Years')
        plt.ylabel('Scores %')
        plt.title('Scores by year and classifier(' + MODEL_NAME + ', CV=4)')
        plt.xticks(indexes + bar_width, self.__years)
        plt.legend(loc=4)
        plt.show()

    def plot_2012_vs_rest(self, root_dir):
        """
        Plots results of classifications of using 2012 as train set,
        2013, 2014, 2015 as test set.
        :param root_dir: string
        :return: void
        """
        all_accuracy_scores = self.__helper.get_log_files_stats(root_dir)
        """
        Example accuracy scores:
        {
            'SMO':{
                2013: [62.79, 66.67, 50.0, 70.45, 57.14, 60.0, 64.29, 66.67, 62.79, 73.17, 57.14, 66.67],
                2014: [65.45, 58.97, 54.35, 72.09, 47.62, 66.67, 71.43, 66.67, 57.78, 64.44, 71.43, 59.26],
                2015: [62.79, 57.14, 63.16, 62.5, 59.26, 67.27, 61.76, 66.67, 68.63]
            },
            'IB1': {
                2013: [37.21, 40.48, 38.1, 43.18, 45.24, 47.5, 45.24, 35.71, 32.56, 24.39, 28.57, 51.28],
                2014: [38.18, 43.59, 41.3, 39.53, 47.62, 50.0, 33.33, 44.44, 26.67, 24.44, 40.48, 37.04],
                2015: [37.21, 41.07, 43.86, 39.29, 51.85, 30.91, 39.71, 26.32, 45.1]
            }
            ...
            ...
            ...
        }
        """
        # Two views of the same data: per-month curves and per-year means.
        self._plot_2012_vs_rest_monthly(all_accuracy_scores)
        self._plot_2012_vs_rest_yearly(all_accuracy_scores)

    def plot_top_feature_frequencies_in_years(self, years_features_counts):
        """
        Plots top features' frequencies in years
        :param years_features_counts: dict, per-year {feature: count} maps
        :return: void
        """
        plot_feature_counts = {}
        bar_width = 0.20

        for feature_name in INFO_GAIN_ATTRIBUTES:
            if not feature_name in plot_feature_counts:
                plot_feature_counts[feature_name] = []
            # Feature names are byte strings; the per-year maps are keyed by
            # their unicode form -- presumably; verify against the producer.
            f_key = feature_name.decode('utf-8')
            # Backfill zero counts so every year has an entry for this feature.
            for year in self.__years:
                if not f_key in years_features_counts[year]:
                    years_features_counts[year][f_key] = 0
            plot_feature_counts[feature_name] = [years_features_counts["2012"][f_key],
                                                 years_features_counts["2013"][f_key],
                                                 years_features_counts["2014"][f_key],
                                                 years_features_counts["2015"][f_key]
                                                 ]
        print(plot_feature_counts)

        indexes = np.arange(len(plot_feature_counts.keys()))
        # One bar group per feature; within a group, one bar per year.
        for first_iteration_number, (feature_name, feature_counts) in enumerate(plot_feature_counts.iteritems()):
            for second_iteration_number, (color, feature_count) in enumerate(zip(self.__colors, feature_counts)):
                x_coord = first_iteration_number + (second_iteration_number*bar_width)
                plt.bar(x_coord, feature_count, bar_width, color=color)

        xticks = [key.decode('utf-8') for key in plot_feature_counts.keys()]
        plt.xlabel('Features')
        # NOTE(review): "__years" in this label looks like a bad find/replace
        # artifact (probably meant "years") -- verify.
        plt.ylabel('Frequencies in __years')
        plt.title('InfoGain features by year and features(' + MODEL_NAME + ')')
        plt.xticks(indexes + bar_width*2, xticks)

        # Manual legend: one color patch per year.
        handles = []
        for idx, (year, color) in enumerate(zip(self.__years, self.__colors)):
            patch = Patch(color=color, label=year)
            handles.append(patch)
        plt.legend(loc=1, handles=handles)
        plt.show()

    def plot_years_intersection_scores(self, years_features_counts):
        """
        Plots a matrix which shows years' vocabularies similarities
        :param years_features_counts: dict
        :return: void
        """
        years_intersection_scores = np.zeros((len(self.__years), len(self.__years)))
        feature_frequencies = years_features_counts

        for first_iteration_number, (x_year, x_years_features) in enumerate(feature_frequencies.iteritems()):
            features_of_x = x_years_features.keys()
            total_count = np.sum(x_years_features.values())
            for second_iteration_number, (y_year, y_years_features) in enumerate(feature_frequencies.iteritems()):
                if x_year == y_year:
                    # Diagonal left at 0 rather than 1 -- intentional? verify.
                    pass
                else:
                    features_of_y = y_years_features.keys()
                    intersect = list(set(features_of_x) & set(features_of_y))
                    # Sum the *other* year's counts over the shared vocabulary.
                    intersect_count = 0
                    for intersect_item in intersect:
                        intersect_count = intersect_count + y_years_features[intersect_item]
                    ratio = float(intersect_count)/total_count
                    i_index = int(x_year) - self.__first_year  # 0
                    j_index = int(y_year) - self.__first_year  # 1
                    years_intersection_scores[i_index][j_index] = ratio

        all_scores_df = pd.DataFrame(years_intersection_scores, self.__years, self.__years)
        # NOTE(review): "__years" here also looks like a find/replace artifact.
        print(MODEL_NAME + '\'s __years\' vocabulary similarities:')
        print(all_scores_df)

    def plot_experiments_results_with_scikit_learn(self, lines_scores):
        """
        Plots experiments' results from scikit learn
        :param lines_scores: dict, per-line score series; "line2" carries
            (min, mid, max) triples, "line3" holds several sub-experiments
        :return: void
        """
        test_years = ['13', '14', '15']
        markers = ['o', 'D', 'h', '*', '+']
        plot_types = ['-', '--', '-.', ':', ',']
        legend_line_names = {
            'line1': 'LINE1',
            'line2': 'LINE2',
            'line3L0': 'LINE3-MultinomialNB DB',
            'line3L1': 'LINE3-kMEANS CLUSTERING',
            'line3L2': 'LINE3-kMEANS CLUSTERING(probabilities)',
            'line3L3': 'LINE3-MultinomialNB DB Iterative Approach',
            'line4': 'LINE4'
        }
        # -(2012-500)/(YEAR-300)
        # -(2012-500)+(YEAR-R50)/(YEAR-300)
        # -(2012-500)+(YEAR-L50)/(YEAR-300)
        # -(2012-500)+(YEAR-L50)/(YEAR-300)
        # -(2012-500)+(YEAR-200)/(YEAR-300)
        fig, ax = plt.subplots(figsize=(20, 9))
        ax.set_autoscale_on(False)
        ax.set_xlim([12.5, 15.5])
        # Running y-range over all plotted series, used for the final ylim.
        all_of_min = 100
        all_of_max = 0
        handles = []
        color_index = 0

        for first_iteration_number, (line_name, line_points) in enumerate(lines_scores.iteritems()):
            line_max, line_min = 0, 100
            if line_name == "line2":
                # line2: rows of (min, mid, max); draw the mid curve plus
                # hand-made error bars (two caps and a vertical connector).
                line_points_array = np.array(line_points.values())
                ys = line_points_array[:, 1]
                mins = line_points_array[:, 0]
                maxs = line_points_array[:, 2]
                line_max, line_min = np.max(maxs), np.min(mins)
                for sub_iteration_number, (a_min, a_max) in enumerate(zip(mins, maxs)):
                    ax.plot((int(test_years[sub_iteration_number])-0.05, int(test_years[sub_iteration_number])+0.05), (a_min, a_min), 'k-')
                    ax.plot((int(test_years[sub_iteration_number])-0.05, int(test_years[sub_iteration_number])+0.05), (a_max, a_max), 'k-')
                    ax.plot((int(test_years[sub_iteration_number]), int(test_years[sub_iteration_number])), (a_min, a_max), 'k-')
                ax.plot(test_years, ys, self.__colors[color_index], marker=markers[first_iteration_number], linestyle=plot_types[first_iteration_number], linewidth=3.0)
                patch = Patch(color=self.__colors[color_index], label=legend_line_names[line_name])
                color_index += 1
                handles.append(patch)
            elif line_name == "line3":
                # line3: one curve per ALE sub-experiment key prefix.
                for sub_iteration_number, (ale_experiment_key) in enumerate(ALE_LINE3_KEYS):
                    proper_dict_values = [line_points[dict_key] for dict_key in line_points.keys() if dict_key.startswith(ale_experiment_key)]
                    ys = proper_dict_values
                    line_max, line_min = np.max(ys), np.min(ys)
                    ax.plot(test_years, ys, self.__colors[color_index], marker=markers[first_iteration_number], linestyle=plot_types[first_iteration_number], linewidth=3.0)
                    patch = Patch(color=self.__colors[color_index], label=legend_line_names[line_name+ale_experiment_key])
                    handles.append(patch)
                    color_index += 1
            else:
                # Default: one simple curve from the dict's values.
                ys = line_points.values()
                line_max, line_min = np.max(ys), np.min(ys)
                ax.plot(test_years, ys, self.__colors[color_index], marker=markers[first_iteration_number], linestyle=plot_types[first_iteration_number], linewidth=3.0)
                patch = Patch(color=self.__colors[color_index], label=legend_line_names[line_name])
                handles.append(patch)
                color_index += 1
            all_of_min = min(line_min, all_of_min)
            all_of_max = max(line_max, all_of_max)

        # Pad the observed range slightly for the y axis.
        ymin = all_of_min-0.01
        ymax = all_of_max+0.01
        plt.legend(handles=handles)
        ax.set_ylim([ymin, ymax])
        plt.yticks(np.arange(ymin, ymax, 0.01))
        ax.set_xticklabels(["", "13", "", "14", "", "15"])
        plt.xlabel('Years')
        plt.ylabel('Scores %')
        plt.title('Scores by year with changing training sets. Classifier=SVM Feature=Word.')
        plt.tight_layout()
        plt.grid()
        plt.show()

    def _plot_2012_vs_rest_monthly(self, all_accuracy_scores):
        """
        Plots 2012 vs REST graphic in monthly basis.
        :param all_accuracy_scores: dict
        :return: void
        """
        date_ranges = pd.date_range(start='1/1/2013', periods=33, freq='M')
        date_ranges = np.array([date_obj.strftime('%b-%y') for date_obj in date_ranges])
        xs = date_ranges
        # One figure per classifier; all years' monthly scores concatenated.
        # NOTE(review): plt.show() is never called here -- figures are
        # presumably displayed by a later call; verify.
        for iteration_number, classifier_scores in enumerate(all_accuracy_scores.values()):
            ys = []
            fig = plt.figure(iteration_number)
            for year, year_scores in classifier_scores.iteritems():
                ys += year_scores
            xs = np.arange(1, 34, 1)  # overrides the date labels built above
            plt.xlabel("Months")
            plt.ylabel("Scores%")
            plt.title(all_accuracy_scores.keys()[iteration_number])
            plt.plot(xs, ys)

    def _plot_2012_vs_rest_yearly(self, all_accuracy_scores):
        """
        Plots 2012 vs REST graphic in yearly basis.
        :param all_accuracy_scores: dict
        :return: void
        """
        date_ranges = pd.date_range(start='1/1/2013', periods=3, freq='365D')
        date_ranges = np.array([date_obj.strftime('%y') for date_obj in date_ranges])
        xs = date_ranges
        yearly_scores = {}  # NOTE(review): never used -- dead variable.
        fig, ax = plt.subplots()
        names_of_classifiers = all_accuracy_scores.keys()
        # One mean-per-year line per classifier, all on the same axes.
        for iteration_number, classifier_scores in enumerate(all_accuracy_scores.values()):
            ys = []
            for year, year_scores in classifier_scores.iteritems():
                ys.append(np.mean(year_scores))
            # Labels/title are (redundantly) reset on every iteration.
            plt.xlabel('Years')
            plt.ylabel('Scores %')
            plt.title('Scores by year and classifier(' + MODEL_NAME + ', train=2012, test=2013, 2014, 2015)')
            ax.set_xticklabels(xs)
            plt.xticks(rotation=90)
            ax.plot(xs, ys, self.__colors[iteration_number], label=names_of_classifiers[iteration_number])
        plt.legend()
        plt.show()