def __init__(self):
    """
    Builds the manager/helper collaborators this object delegates to and
    fixes the tuple of years the data set covers.
    :return: void
    """
    self.__db_manager = DBManager()            # database access layer
    self.__helper = GeneralHelpers()           # misc utility functions
    self.__plot_manager = PlotManager()        # plotting / visualisation
    self.__import_manager = ImportManager()    # tweet import pipeline
    self.__feature_manager = FeatureManager()  # feature extraction
    # Years iterated by the experiment/plot methods.
    self.years = ("2012", "2013", "2014", "2015")
def __init__(self): """ Constructor method :param file_path_to_import: String a txt file path containing tweet ids :return: ImportManager instance """ self.__db_manager = DBManager() self.__helper = GeneralHelpers() self.__preprocess_manager = PreprocessManager() self.__tweets_classes_dictionary = {} # magic numbers self.__components_in_a_line = 2 self.__max_num_of_tweets_at_once = 100
def __init__(self):
    """
    Initializes the preprocessor: loads the persisted root-finding and
    spelling-suggestion caches and records the dictionaries directory.
    """
    Preprocessor.__init__(self)
    self.__helper = GeneralHelpers()
    # Persistent caches so repeated words skip the (slow) Zemberek calls.
    self.__root_cache = self.__helper.load_roots_cache()
    self.__suggestion_cache = self.__helper.load_suggestion_cache()
    self.__dictionaries_directory = PROJECT_ROOT_DIRECTORY + DICTIONARIES_DIR_NAME
class PreprocessManager(Preprocessor):
    """
    Text preprocessing front-end: character removal, misspelling
    correction and root (stem) finding, with persistent caches so that
    repeated words do not hit the Zemberek backend again.
    """

    def __init__(self):
        Preprocessor.__init__(self)
        self.__helper = GeneralHelpers()
        # Persistent caches: word -> root and word -> suggestion.
        self.__root_cache = self.__helper.load_roots_cache()
        self.__suggestion_cache = self.__helper.load_suggestion_cache()
        self.__dictionaries_directory = PROJECT_ROOT_DIRECTORY + DICTIONARIES_DIR_NAME

    def remove_characters_in_string(self, text, characters=()):
        """
        Removes every occurrence of the given characters from a string.
        :param text: String
        :param characters: iterable of characters to remove (default: none).
            The previous mutable default `[]` is replaced by an immutable
            tuple to avoid the shared-mutable-default pitfall; all existing
            call sites behave identically.
        :return: new string with the characters stripped
        """
        for char in characters:
            text = text.replace(char, "")
        return text

    def correct_misspelling(self, word):
        """
        Suggests the corrected spelling for the given word.
        Special keywords (e.g. model names) bypass correction entirely.
        :param word: string, word to correct
        :return: string, suggestion (cached after the first lookup)
        """
        has_special_keyword, special_keyword = self._has_special_keyword(word)
        if has_special_keyword:
            self.__suggestion_cache[word] = special_keyword
            return special_keyword
        if word in self.__suggestion_cache:
            return self.__suggestion_cache[word]
        corrected_word = self.__helper.correct_misspelling_from_zemberek(word)
        self.__suggestion_cache[word] = corrected_word
        return corrected_word

    def find_root_of_word(self, word):
        """
        Returns the root (stem) of a word.
        Special keywords bypass root finding entirely.
        :param word: string, word
        :return: string, root of word (cached after the first lookup)
        """
        has_special_keyword, special_keyword = self._has_special_keyword(word)
        if has_special_keyword:
            self.__root_cache[word] = special_keyword
            return special_keyword
        if word in self.__root_cache:
            return self.__root_cache[word]
        root_of_word = self.__helper.find_root_from_zemberek(word)
        self.__root_cache[word] = root_of_word
        return root_of_word

    def save_caches(self):
        """
        Persists any changes made to the suggestion and root caches.
        :return: void
        """
        self.__helper.save_changes_in_suggestion_cache(self.__suggestion_cache)
        self.__helper.save_changes_in_root_cache(self.__root_cache)

    def _has_special_keyword(self, word):
        """
        Checks whether any special keyword (e.g. the model name) occurs in
        the word, case-insensitively.
        :param word: string, word
        :return: (bool, string) — found flag and the matching keyword; when
            several keywords match, the last one in SPECIAL_KEYWORDS wins
            (unchanged behavior). Empty string when none match.
        """
        has_special_keyword = False
        special_keyword = ""
        lowered_word = word.lower()  # hoisted out of the loop
        for keyword in SPECIAL_KEYWORDS:
            if keyword.lower() in lowered_word:
                has_special_keyword = True
                special_keyword = keyword
        return has_special_keyword, special_keyword
class Main:
    """ Main class, makes necessary function calls to necessary classes """

    def __init__(self):
        # Collaborators this facade delegates to.
        self.__db_manager = DBManager()
        self.__helper = GeneralHelpers()
        self.__plot_manager = PlotManager()
        self.__import_manager = ImportManager()
        self.__feature_manager = FeatureManager()
        # Years covered by the data set.
        self.years = ("2012", "2013", "2014", "2015")

    def retrieve_tweets(self, file_path_of_ids):
        """
        Runs Import Manager to retrieve and import tweets
        :param file_path_of_ids: String, file path of tweets to import
        :return: void
        """
        self.__import_manager.run(file_path_of_ids)

    def extract_features_and_generate_arff(self, n=3, analyzer='char', year='2012'):
        """
        Makes necessary function calls to extract features for given year and to generate arff file
        :param n: int, ngram count
        :param analyzer: string, word or char
        :param year: string, 2012, 2013, 2014, 2015 or ALL
        :return: string, path of generated arff file
        """
        # Getting tweets with year
        print("Getting tweets for year "+ year)
        tweets_for_given_year = self.__db_manager.get_tweets_for_year(year)

        print("Generating document and classes of tweets.")
        document, classes = self.__feature_manager.create_document_and_classes_for_tweets(tweets_for_given_year, True)

        print("Fitting the data, finding ngrams and frequencies.")
        ngrams, arff_data, vectorizer, X = self.__feature_manager.fit_data(document, classes, n, analyzer)

        print("Formatting the data for arff lib format.")
        formatted_arff_data = self.__feature_manager.format_data_for_arff(ngrams, arff_data)

        print("Generating file.")
        # Experiment name, 1grams, 2grams, 3grams.. or words
        experiment_name = str(n)+'Gram' if analyzer == 'char' else 'Word'

        # File name, TTNet_3grams_2012
        file_name = MODEL_NAME + '_' + experiment_name + '_' + year

        # File name randomized TTNet_3grams_2012_asfas12.arff
        file_name = self.__helper.generate_random_file_name(file_name, ARFF_FILE_EXTENSION)

        # Arff file path ...../DataSet-ARFF/3Gram/TTNet/TTNet_3grams_2012_asfas12.arff
        arff_file_path = PROJECT_ROOT_DIRECTORY + DATASET_ARFF_DIR_NAME + experiment_name + '/' + MODEL_NAME + '/'

        # Generating the file with data
        self.__helper.generate_arff_file(arff_file_path, file_name, formatted_arff_data)
        print("Arff file generated at path:"+arff_file_path+file_name)

    def run_experiment_with_scikit_learn(self, n=1, analyzer='word'):
        """
        Makes necessary method calls to run the experiment on scikit learn.
        :param n: int, count n in n-gram
        :param analyzer: string, either 'word' or 'char'
        :return: void
        """
        # Retrieving all tweets from database
        print("Retrieving all tweets from database.")
        tweets_for_all_years = {}

        # Iterating over all years
        for year in self.years:
            # Retrieving tweets for the year
            tweets_for_year = self.__db_manager.get_tweets_for_year(year)
            tweets_for_all_years[year] = tweets_for_year

        # Creating a big list of tweets
        print("Creating a big list of tweets.")
        all_tweets = []

        # Appending all tweets together
        # NOTE: dict.iteritems() is Python 2 only — consistent with the
        # copy_reg / im_func usage below; this method requires Python 2.
        for year, tweets in tweets_for_all_years.iteritems():
            all_tweets += tweets

        # Generating document
        print("Generating document and classes by preprocessing")

        # Preprocessing and generation of document
        document, classes = self.__feature_manager.create_document_and_classes_for_tweets(all_tweets, True)

        # Getting years' tweets counts
        print("Getting years' tweets counts.")
        years_tweets_counts = {}
        for year in self.years:
            years_tweets_counts[year] = len(tweets_for_all_years[year])

        all_processes = []
        self.all_experiments_results = []
        # Leave one CPU free; the `or 1` keeps at least one worker on
        # single-core machines (cpu_count()-1 == 0 is falsy).
        pool = Pool(cpu_count()-1 or 1)
        # Register a reducer so bound methods (the callback target) can be
        # pickled across to worker processes under Python 2.
        copy_reg.pickle(types.MethodType, self._reduce_method)

        print("Running experiments.")
        t0 = time.time()
        for i in range(0, N_EXPERIMENTS):
            print("Experiment:"+str(i))
            experiment_manager = ExperimentManager(i, years_tweets_counts, n, analyzer)
            # Run each experiment asynchronously; results are folded in via
            # the _accumulate_experiments_scores callback.
            r = pool.apply_async(experiment_manager.run_experiment, args=(document, classes,), callback=self._accumulate_experiments_scores)
            all_processes.append(r)

        # Block until every experiment has finished.
        for a_process in all_processes:
            a_process.wait()
        t1 = time.time()

        print("Elapsed time:", t1- t0, " seconds")
        pool.close()
        pool.join()

        print("Cumulating all the experiments' scores.")
        final_results_from_all_experiments = self.__helper.cumulate_years_scores(self.all_experiments_results)
        return final_results_from_all_experiments

    def _reduce_method(self, m):
        """
        Pickle reducer for Python 2 method objects, registered with copy_reg
        so multiprocessing can ship method references to worker processes.
        :param m: method object
        :return: (getattr, args) pair that re-creates the method on unpickling
        """
        if m.im_self is None:
            # Unbound method: re-fetch it from the class.
            return getattr, (m.im_class, m.im_func.func_name)
        else:
            # Bound method: re-fetch it from the instance.
            return getattr, (m.im_self, m.im_func.func_name)

    def _accumulate_experiments_scores(self, an_experiments_result):
        """
        Accumulates experiments' scores (apply_async callback; runs in the
        parent process).
        :return: void
        """
        an_experiments_result = self.__helper.calculate_relative_scores(an_experiments_result)
        self.all_experiments_results.append(an_experiments_result)

    def plot_experiment_results(self, root_dir):
        """
        Plots experiment's results from log files
        :param root_dir: string
        :return: void
        """
        lines_scores = self.__helper.get_accuracy_scores_for_experiment_years_from_root_dir(root_dir)
        self.__plot_manager.plot_experiments_results(lines_scores)

    def plot_all_experiment_results_with_scikit_learn(self, all_line_scores_of_all_experiments):
        """
        Plots all line scores of all experiments
        :param all_line_scores_of_all_experiments: dict
        :return: void
        """
        self.__plot_manager.plot_experiments_results_with_scikit_learn(all_line_scores_of_all_experiments)

    def plot_years_scores(self, root_dir):
        """
        Makes necessary function calls to plot years scores
        :param root_dir: string
        :return: void
        """
        self.__plot_manager.plot_years_scores_from_root_directory(root_dir)

    def plot_2012_vs_rest(self, root_dir):
        """
        Makes necessary function calls to plot 2012 vs REST scores
        :param root_dir: string
        :return: void
        """
        self.__plot_manager.plot_2012_vs_rest(root_dir)

    def plot_top_feature_frequencies_in_years(self):
        """
        Makes necessary function calls to plot top features' frequencies in years
        :return: void
        """
        years_features_counts = {}
        for year in self.years:
            years_features_counts[year] = self.find_frequency_dictionary_for_year(year)
        self.__plot_manager.plot_top_feature_frequencies_in_years(years_features_counts)

    def find_frequency_dictionary_for_year(self, year):
        """
        Finds frequencies of each feature for given year
        :param year: string
        :return: dict, term -> frequency for the given year
        """
        # For this particular method, find_roots=True, n=1, analyzer=word because we're working with top info gain words
        tweets_for_the_year = self.__db_manager.get_tweets_for_year(year)
        document, classes = self.__feature_manager.create_document_and_classes_for_tweets(tweets_for_the_year, find_roots=True)
        ngrams, arff_data, vectorizer, X = self.__feature_manager.fit_data(document, classes, n=1, analyzer='word')
        terms = vectorizer.get_feature_names()
        # Column sums of the term-document matrix give per-term frequencies
        # (.A1 flattens the matrix row to a 1-D array).
        freqs = X.sum(axis=0).A1
        # Sort (frequency, term) pairs descending by frequency.
        result = sorted(zip(freqs, terms), reverse=True)
        freqs = [elm[0] for elm in result]
        terms = [elm[1] for elm in result]
        final_result = dict(zip(terms, freqs))
        return final_result

    def plot_years_intersection_scores(self):
        """
        Makes necessary function calls to plot a matrix which shows years'
        vocabularies similarities
        :return: void
        """
        years_features_counts = {}
        for year in self.years:
            years_features_counts[year] = self.find_frequency_dictionary_for_year(year)
        self.__plot_manager.plot_years_intersection_scores(years_features_counts)

    def import_new_tweets_from_csv(self, root_path):
        """
        Imports tweets from CSV files found under the given directory.
        :param root_path: string, directory containing .csv files
        :return: void
        """
        self.__import_manager.import_new_tweets_from_csv(root_path)
class PreprocessManager(Preprocessor):
    """
    Preprocessing helper built on top of Preprocessor: strips characters,
    corrects misspellings and finds word roots, caching Zemberek results
    so repeated words are answered locally.
    """

    def __init__(self):
        Preprocessor.__init__(self)
        self.__helper = GeneralHelpers()
        self.__root_cache = self.__helper.load_roots_cache()
        self.__suggestion_cache = self.__helper.load_suggestion_cache()
        self.__dictionaries_directory = PROJECT_ROOT_DIRECTORY + DICTIONARIES_DIR_NAME

    def remove_characters_in_string(self, text, characters=[]):
        """
        Removes specified characters in a string
        :param text: String
        :param characters: list, list of characters to remove
        :return: new string
        """
        cleaned = text
        if len(characters):
            for unwanted in characters:
                cleaned = cleaned.replace(unwanted, "")
        return cleaned

    def correct_misspelling(self, word):
        """
        Suggest correct words for given word
        :param word: string, word
        :return: string, suggestion
        """
        found, keyword = self._has_special_keyword(word)
        if found:
            # Special keywords are returned verbatim and remembered.
            self.__suggestion_cache[word] = keyword
            return keyword
        if word not in self.__suggestion_cache:
            # Cache miss: ask Zemberek once, then remember the answer.
            self.__suggestion_cache[word] = self.__helper.correct_misspelling_from_zemberek(word)
        return self.__suggestion_cache[word]

    def find_root_of_word(self, word):
        """
        Returns root of word
        :param word: string, word
        :return: string, root of word
        """
        found, keyword = self._has_special_keyword(word)
        if found:
            # Special keywords are returned verbatim and remembered.
            self.__root_cache[word] = keyword
            return keyword
        if word not in self.__root_cache:
            # Cache miss: ask Zemberek once, then remember the answer.
            self.__root_cache[word] = self.__helper.find_root_from_zemberek(word)
        return self.__root_cache[word]

    def save_caches(self):
        """
        Saves suggestion and root finding caches' changes
        :return: void
        """
        self.__helper.save_changes_in_suggestion_cache(self.__suggestion_cache)
        self.__helper.save_changes_in_root_cache(self.__root_cache)

    def _has_special_keyword(self, word):
        """
        If a special keyword (e.g. the model name) is present in the word,
        returns it; when several match, the last one in SPECIAL_KEYWORDS wins.
        :param word: string, word
        :return: bool, string
        """
        found = False
        matched = ""
        lowered = word.lower()
        for candidate in SPECIAL_KEYWORDS:
            if candidate.lower() in lowered:
                found = True
                matched = candidate
        return found, matched
class ImportManager:
    """
    Handles importing tweets into the database from various sources such
    as txt id files and CSV exports.
    """

    # Class-level defaults; both are (re)assigned per run/instance.
    __file_path = None
    __components_in_a_line = None

    def __init__(self):
        """
        Constructor method.
        Sets up the DB/helper/preprocess collaborators, the tweet-id ->
        sentiment-class lookup and the batching constants.
        :return: ImportManager instance
        """
        self.__db_manager = DBManager()
        self.__helper = GeneralHelpers()
        self.__preprocess_manager = PreprocessManager()
        self.__tweets_classes_dictionary = {}  # tweet_id -> sentiment class

        # magic numbers
        self.__components_in_a_line = 2         # fields per txt line: id, class
        self.__max_num_of_tweets_at_once = 100  # Twitter lookup batch limit

    def run(self, file_path_to_import):
        """
        Runs all necessary methods to import tweets for a year.
        :param file_path_to_import: String, txt file with "id, class" lines
        :return: void
        """
        self.__file_path = file_path_to_import

        # getting tweets with their classes
        tweets_with_classes = self._parse_tweets_from_file()
        self.__tweets_with_classes = tweets_with_classes

        # finding duplicates
        unique_tweets, duplicate_tweets = self._find_duplicates(tweets_with_classes)
        print("Found " + str(len(duplicate_tweets)) + " duplicate tweets.")
        self.__helper.pretty_print_list(duplicate_tweets, "Duplicate tweets:")
        print("Continuing with unique ones.")

        # getting tweet ids from [tweet_id, class]
        unique_tweets_ids = self._get_tweets_ids(unique_tweets)

        # retrieving tweets from Twitter
        all_tweet_information = self._retrieve_tweets_from_twitter(unique_tweets_ids)

        # some tweets may not be found on Twitter
        not_found_tweets_on_twitter = self._find_not_found_tweets_on_twitter(all_tweet_information)

        # creating db model objects
        all_tweet_objects = self._create_tweet_objects(all_tweet_information)

        # insert to database
        success_count, not_imported_tweets = self.__db_manager.insert_tweet_objects(all_tweet_objects)

        # Sanity report: unique == not_found + not_inserted + inserted
        print("\n")
        print('-' * 10)
        print('Total Math:')
        print('Unique tweets:' + str(len(unique_tweets)))
        print('Tweets not found:' + str(len(not_found_tweets_on_twitter)))
        print('Tweets not inserted:' + str(len(not_imported_tweets)))
        print('Tweets OK:' + str(success_count))
        print(str(len(unique_tweets)) + "==" + str(len(not_found_tweets_on_twitter) + len(not_imported_tweets) + success_count))

    def _parse_tweets_from_file(self):
        """
        Parses tweet ids and classes from the txt file at self.__file_path.
        Also (re)fills self.__tweets_classes_dictionary (id -> class).
        :return: list, holds [[124214124, positive], ...]
        """
        characters_to_remove = ["'", '"', '\n', ' ']

        with open(self.__file_path, 'r') as tweets_ids_file:
            tweets_with_classes = []
            # Bug fix: this previously assigned the *public* attribute
            # `self.tweets_classes_dictionary` (a typo), so the private
            # mapping used everywhere else was never cleared between runs.
            self.__tweets_classes_dictionary = {}

            # Iterating over lines in txt file
            for line in tweets_ids_file:
                line_components = line.split(",")

                # if there are two components in a line. E.g. "121412412412", "positive"
                if self.__components_in_a_line == len(line_components):
                    # removing unnecessary characters from each component
                    for index, component in enumerate(line_components):
                        line_components[index] = self.__preprocess_manager.remove_characters_in_string(component, characters_to_remove)
                    tweets_with_classes.append(line_components)
                    self.__tweets_classes_dictionary.update({line_components[0]: line_components[1]})
            return tweets_with_classes

    def _find_duplicates(self, tweets_with_classes):
        """
        Splits the input into first-seen unique tweets and duplicate ids.
        :param tweets_with_classes: List of [tweet_id, class] pairs
        :return: (unique tweets, duplicate tweet ids)
        """
        unique_tweets = []
        seen_tweets_ids = set()  # set gives O(1) membership vs. list scan
        duplicate_tweet_ids = []

        for tweet_block in tweets_with_classes:
            # First element is the tweet id
            tweet_id = tweet_block[0]

            if tweet_id not in seen_tweets_ids:
                seen_tweets_ids.add(tweet_id)
                unique_tweets.append(tweet_block)
            else:
                duplicate_tweet_ids.append(tweet_id)
        return unique_tweets, duplicate_tweet_ids

    def _retrieve_tweets_from_twitter(self, tweet_ids):
        """
        Retrieves tweet information from Twitter in batches.
        :param tweet_ids: List of tweet id strings
        :return: list of tweet response objects
        """
        tweets_results = []
        twitter_manager = TwitterManager()
        # Twitter's lookup endpoint accepts a limited number of ids per call.
        chunks_of_tweets_ids = self.__helper.get_chunks_of_list(tweet_ids, self.__max_num_of_tweets_at_once)

        for chunk in chunks_of_tweets_ids:
            print("Searching for " + str(len(chunk)) + " tweets.")
            result = twitter_manager.lookup(chunk)
            print("Found " + str(len(result)) + " tweets.")
            tweets_results += result
        return tweets_results

    def _get_tweets_ids(self, tweets_with_classes):
        """
        Extracts tweet ids from tweets with classes.
        :param tweets_with_classes: List of [tweet_id, class] pairs
        :return: list of extracted ids
        """
        return [pair[0] for pair in tweets_with_classes]

    def _create_tweet_objects(self, all_tweets):
        """
        Creates db model objects from Twitter response objects.
        :param all_tweets: List of tweet responses
        :return: List of tweet model objects
        """
        all_tweet_objects = []
        for tweet in all_tweets:
            tweet_object = self.__db_manager.get_new_model_instance()
            tweet_object.id = tweet.id_str
            tweet_object.created_at = tweet.created_at
            tweet_object.lang = tweet.lang
            tweet_object.source = tweet.source
            tweet_object.user_id = tweet.user.id_str
            # Strip emojis/smileys before storing; stored as utf-8 bytes.
            tweet_object.text = self.__preprocess_manager.clean_emojis_and_smileys(tweet.text).encode('utf-8')
            tweet_object.tweet_class = self._get_sentiment_class_of_tweet(tweet.id_str)
            all_tweet_objects.append(tweet_object)
        return all_tweet_objects

    def _get_sentiment_class_of_tweet(self, tweet_id):
        """
        Returns the sentiment class for the given tweet id.
        :param tweet_id: string
        :return: tweet class
        :raises KeyError: if the id was not parsed from the input file
        """
        return self.__tweets_classes_dictionary[tweet_id]

    def _find_not_found_tweets_on_twitter(self, twitter_response):
        """
        Finds ids that were requested but missing from the Twitter response.
        :param twitter_response: list, Twitter response objects
        :return: list, ids of not-found tweets (set difference; order not guaranteed)
        """
        tweets_ids = self._get_tweets_ids(self.__tweets_with_classes)
        response_ids = [a_tweet_response.id_str for a_tweet_response in twitter_response]
        return list(set(tweets_ids) - set(response_ids))

    def import_new_tweets_from_csv(self, root_path):
        """
        Imports tweets from every .csv file directly under root_path.
        :param root_path: string, directory path (concatenated with the file
            name, so it must end with a path separator)
        :return: void
        """
        tweet_objects = []
        for file in os.listdir(root_path):
            if file.endswith('.csv'):
                with open(root_path + file, 'r') as file_handle:
                    reader = csv.reader(file_handle, delimiter=';')
                    next(reader, None)  # skip the headers
                    for row in reader:
                        a_tweet_obj = self._create_tweet_object_from_line(row, file)
                        tweet_objects.append(a_tweet_obj)
        success_count, not_imported_tweets = self.__db_manager.insert_tweet_objects(tweet_objects)
        print(success_count)
        print(not_imported_tweets)

    def _create_tweet_object_from_line(self, components, file_name):
        """
        Builds a tweet model object from one CSV row.
        :param components: list, CSV row; layout observed here:
            [id, ?, date, text, sentiment code e/h/n]
        :param file_name: string, source file name (the 2-digit year is
            derived from it)
        :return: tweet model object (or None if the year guard rejects the row)
        """
        MAP_DICT = {'e': 'positive', 'h': 'negative', 'n': 'neutral'}

        id_component = components[0]  # unused, kept to document the row layout
        date_component = components[2]
        text_component = components[3]
        sentiment_component = MAP_DICT[components[4]]

        # NOTE(review): year_abv is a substring of file_name by construction,
        # so this guard can never trigger; kept for behavioral parity.
        year_abv = file_name.split('.')[0][2:]
        if year_abv not in file_name:
            return

        # Dates like "12-Jan" lack a year; append the file's 2-digit year.
        date_len = len(date_component.split('-'))
        if date_len == 2:
            date_component = date_component + '-' + year_abv

        format_str = '%d-%b-%y'
        datetime_of_tweet = datetime.strptime(date_component, format_str)

        tweet_object = self.__db_manager.get_new_model_instance()
        # CSV rows carry no usable tweet id; generate a random one.
        tweet_object.id = self.__helper.generate_random_string(10)
        tweet_object.text = text_component
        tweet_object.created_at = datetime_of_tweet
        tweet_object.tweet_class = sentiment_component
        return tweet_object
class ImportManager:
    """ This class handles importing tweets to the database from various sources such as text files """

    # Class-level defaults; both are (re)assigned per run/instance.
    __file_path = None
    __components_in_a_line = None

    def __init__(self):
        """
        Constructor method.
        Sets up the DB/helper/preprocess collaborators, the tweet-id ->
        sentiment-class lookup and the batching constants.
        :return: ImportManager instance
        """
        self.__db_manager = DBManager()
        self.__helper = GeneralHelpers()
        self.__preprocess_manager = PreprocessManager()
        self.__tweets_classes_dictionary = {}  # tweet_id -> sentiment class

        # magic numbers
        self.__components_in_a_line = 2         # fields per txt line: id, class
        self.__max_num_of_tweets_at_once = 100  # Twitter lookup batch limit

    def run(self, file_path_to_import):
        """
        Runs all necessary methods to import tweets for a year
        :param file_path_to_import: String, txt file with "id, class" lines
        :return: void
        """
        self.__file_path = file_path_to_import

        # getting tweets with their classes
        tweets_with_classes = self._parse_tweets_from_file()
        self.__tweets_with_classes = tweets_with_classes

        # finding duplicates
        unique_tweets, duplicate_tweets = self._find_duplicates(tweets_with_classes)
        print("Found "+str(len(duplicate_tweets))+" duplicate tweets.")
        self.__helper.pretty_print_list(duplicate_tweets, "Duplicate tweets:")
        print("Continuing with unique ones.")

        # getting tweet ids from [tweet_id, class]
        unique_tweets_ids = self._get_tweets_ids(unique_tweets)

        # retrieving tweets from Twitter
        all_tweet_information = self._retrieve_tweets_from_twitter(unique_tweets_ids)

        # some tweets may not be found on Twitter
        not_found_tweets_on_twitter = self._find_not_found_tweets_on_twitter(all_tweet_information)

        # creating db model objects
        all_tweet_objects = self._create_tweet_objects(all_tweet_information)

        # insert to database
        success_count, not_imported_tweets = self.__db_manager.insert_tweet_objects(all_tweet_objects)

        # Sanity report: unique == not_found + not_inserted + inserted
        print("\n")
        print('-'*10)
        print('Total Math:')
        print('Unique tweets:'+str(len(unique_tweets)))
        print('Tweets not found:'+str(len(not_found_tweets_on_twitter)))
        print('Tweets not inserted:'+str(len(not_imported_tweets)))
        print('Tweets OK:'+str(success_count))
        print(str(len(unique_tweets))+"=="+str(len(not_found_tweets_on_twitter)+len(not_imported_tweets)+success_count))

    def _parse_tweets_from_file(self):
        """
        Parses tweet ids and classes from txt file.
        Also fills self.__tweets_classes_dictionary (id -> class).
        :return: list, holds [[124214124, positive],...]
        """
        characters_to_remove = ["'", '"', '\n', ' ']

        with open(self.__file_path, 'r') as tweets_ids_file:
            tweets_with_classes = []
            # NOTE(review): this assigns the *public* attribute while the
            # rest of the class uses the private (name-mangled)
            # __tweets_classes_dictionary — looks like a typo; the private
            # dict is therefore never cleared between runs. Confirm intent.
            self.tweets_classes_dictionary = {}

            # Iterating over lines in txt file
            for line in tweets_ids_file:
                line_components = line.split(",")

                # if there are two components in a line. E.g. "121412412412", "positive"
                if self.__components_in_a_line == len(line_components):
                    # iterating over components
                    for index, component in enumerate(line_components):
                        # removing unnecessary characters
                        line_components[index] = self.__preprocess_manager.remove_characters_in_string(component, characters_to_remove)
                    tweets_with_classes.append(line_components)
                    self.__tweets_classes_dictionary.update({line_components[0]:line_components[1]})
            return tweets_with_classes

    def _find_duplicates(self, tweets_with_classes):
        """
        Finds duplicate tweets
        :param tweets_with_classes: List a list of tweet ids and their sentiment classes.
        :return: unique tweets, duplicate tweets (ids only)
        """
        unique_tweets = []
        seen_tweets_ids = []
        duplicate_tweet_ids = []

        # Iterating over tweets with their classes. E.g [[214124124124, positive], [124124124124, negative]...]
        for tweet_block in tweets_with_classes:
            # First element is the tweet id
            tweet_id = tweet_block[0]

            # If it isn't seen before
            if not tweet_id in seen_tweets_ids:
                seen_tweets_ids.append(tweet_id)
                unique_tweets.append(tweet_block)
            else:
                duplicate_tweet_ids.append(tweet_id)
        return unique_tweets, duplicate_tweet_ids

    def _retrieve_tweets_from_twitter(self, tweet_ids):
        """
        Retrieves tweet information from Twitter, batched because the
        lookup endpoint accepts a limited number of ids per call.
        :param tweet_ids: List, tweet id strings
        :return: list of tweet response objects
        """
        tweets_results = []
        twitter_manager = TwitterManager()
        chunks_of_tweets_ids = self.__helper.get_chunks_of_list(tweet_ids, self.__max_num_of_tweets_at_once)

        for chunk in chunks_of_tweets_ids:
            print("Searching for "+str(len(chunk))+" tweets.")
            result = twitter_manager.lookup(chunk)
            print("Found "+str(len(result))+" tweets.")
            tweets_results+=result
        return tweets_results

    def _get_tweets_ids(self, tweets_with_classes):
        """
        Extracts tweet ids from tweets with classes
        :param tweets_with_classes: List, tweet_with classes
        :return: extracted ids
        """
        ids = [x[0] for x in tweets_with_classes]
        return ids

    def _create_tweet_objects(self, all_tweets):
        """
        Creates a list which holds db model objects.
        :param all_tweets: List, tweet response objects
        :return: List, tweet objects
        """
        all_tweet_objects = []
        for tweet in all_tweets:
            tweet_object = self.__db_manager.get_new_model_instance()
            tweet_object.id = tweet.id_str
            tweet_object.created_at = tweet.created_at
            tweet_object.lang = tweet.lang
            tweet_object.source = tweet.source
            tweet_object.user_id = tweet.user.id_str
            # Strip emojis/smileys before storing; stored as utf-8 bytes.
            tweet_object.text = self.__preprocess_manager.clean_emojis_and_smileys(tweet.text).encode('utf-8')
            tweet_object.tweet_class = self._get_sentiment_class_of_tweet(tweet.id_str)
            all_tweet_objects.append(tweet_object)
        return all_tweet_objects

    def _get_sentiment_class_of_tweet(self, tweet_id):
        """
        Returns a sentiment class for given tweet id
        :param tweet_id: string
        :return: tweet class
        """
        return self.__tweets_classes_dictionary[tweet_id]

    def _find_not_found_tweets_on_twitter(self, twitter_response):
        """
        Finds not found tweets on Twitter API (requested ids minus returned
        ids; set difference, so order is not guaranteed).
        :param twitter_response: list, Twitter response
        :return: list, not found tweets
        """
        tweets_ids = self._get_tweets_ids(self.__tweets_with_classes)
        response_ids = [a_tweet_response.id_str for a_tweet_response in twitter_response]
        return list(set(tweets_ids) - set(response_ids))

    def import_new_tweets_from_csv(self, root_path):
        """
        Imports tweets from every .csv file directly under root_path.
        :param root_path: string, directory path (concatenated with the file
            name, so it must end with a path separator)
        :return: void
        """
        tweet_objects = []
        for file in os.listdir(root_path):
            if file.endswith('.csv'):
                with open(root_path+file, 'r') as file_handle:
                    reader = csv.reader(file_handle, delimiter=';')
                    next(reader, None)  # skip the headers
                    for row in reader:
                        a_tweet_obj = self._create_tweet_object_from_line(row, file)
                        tweet_objects.append(a_tweet_obj)
        success_count, not_imported_tweets = self.__db_manager.insert_tweet_objects(tweet_objects)
        print(success_count)
        print(not_imported_tweets)

    def _create_tweet_object_from_line(self, components, file_name):
        """
        Builds a tweet model object from one CSV row.
        :param components: list, CSV row; layout observed here:
            [id, ?, date, text, sentiment code e/h/n]
        :param file_name: string, source file name (2-digit year derived from it)
        :return: tweet model object, or None if the year guard rejects the row
        """
        MAP_DICT = { 'e': 'positive', 'h': 'negative', 'n': 'neutral' }

        id_component = components[0]  # unused; documents the row layout
        date_component = components[2]
        text_component = components[3]
        sentiment_component = MAP_DICT[components[4]]

        # NOTE(review): year_abv is a substring of file_name by construction,
        # so this guard looks like it can never trigger — confirm intent.
        year_abv = file_name.split('.')[0][2:]
        if year_abv not in file_name:
            return

        # Dates like "12-Jan" lack a year; append the file's 2-digit year.
        date_len = len(date_component.split('-'))
        if date_len == 2:
            date_component = date_component + '-' + year_abv

        format_str = '%d-%b-%y'
        datetime_of_tweet = datetime.strptime(date_component, format_str)

        tweet_object = self.__db_manager.get_new_model_instance()
        # CSV rows carry no usable tweet id; a random one is generated.
        tweet_object.id = self.__helper.generate_random_string(10)
        tweet_object.text = text_component
        tweet_object.created_at = datetime_of_tweet
        tweet_object.tweet_class = sentiment_component
        return tweet_object
def __init__(self):
    """
    Sets up plotting state: helper collaborator, the color cycle used for
    bar series, the years covered by the data set and the regex used to
    pick prediction lines out of log files.
    """
    self.__first_year = 2012
    self.__helper = GeneralHelpers()
    self.__colors = ['r', 'b', 'y', 'm', 'g', 'c', 'k']
    self.__years = ('2012', '2013', '2014', '2015')
    # Fix: raw string literal — the pattern previously relied on '\d'/'\s'
    # being left alone as invalid escapes (a DeprecationWarning in modern
    # Python). The byte value of the string is unchanged.
    self.__regexp_for_predict_lines = r"\d{1,}\s{1,}\d{1}:\w{1,8}.{1,}"
class PlotManager:
    """
    This class does the necessary works for visualizing data: bar charts of
    yearly classifier scores, 2012-vs-rest line plots, feature-frequency bars,
    a year-vocabulary similarity matrix, and scikit-learn experiment curves.

    NOTE(review): the code relies on Python 2-only idioms (dict.iteritems(),
    indexing the list returned by dict.keys()/values(), str.decode on byte
    strings). It will not run unmodified on Python 3 -- confirm the target
    interpreter before porting.
    """

    def __init__(self):
        # Base year: maps a year string to a 0-based matrix/list index.
        self.__first_year = 2012
        self.__helper = GeneralHelpers()
        # One matplotlib single-letter color code per plotted series.
        self.__colors = ['r', 'b', 'y', 'm', 'g', 'c', 'k']
        self.__years = ('2012', '2013', '2014', '2015')
        # NOTE(review): not a raw string; \d/\s/\w are invalid escapes on
        # newer Pythons -- consider r"...".
        self.__regexp_for_predict_lines = "\d{1,}\s{1,}\d{1}:\w{1,8}.{1,}"

    def plot_years_scores_from_root_directory(self, root_dir):
        """
        Plots years' scores for given classifiers and mean of them
        :param root_dir: string, root directory to scan
        :return: void
        """
        bar_width = 0.10

        # Getting scores from helper
        years_classifier_scores_list = []
        years_classifier_scores_dict = self.__helper.get_accuracy_scores_for_years_from_root_dir(
            root_dir)

        # Making them lists (one row of classifier scores per year)
        for year, classifiers_scores in years_classifier_scores_dict.iteritems():
            years_classifier_scores_list.append(classifiers_scores.values())

        years_classifier_scores_list = np.array(years_classifier_scores_list)
        classifier_names = years_classifier_scores_dict['2012'].keys()
        indexes = np.arange(len(years_classifier_scores_dict.keys()))  # [0,1,2,3] +

        # Iterating over J48 for 2012, 2013, 2014, 2015, MEAN for 2012, 2013,
        # 2014, 2015 an so on.. (the transpose gives one column per classifier)
        for iteration_number, (color_name, classifier_name, classifier_scores) in enumerate(
                zip(self.__colors, classifier_names, years_classifier_scores_list.T)):
            # Shift each classifier's bar group sideways so groups don't overlap.
            bar_offset = indexes + (iteration_number * bar_width)
            plt.bar(bar_offset, classifier_scores, bar_width, color=color_name, label=classifier_name)

        plt.xlabel('Years')
        plt.ylabel('Scores %')
        plt.title('Scores by year and classifier(' + MODEL_NAME + ', CV=4)')
        plt.xticks(indexes + bar_width, self.__years)
        plt.legend(loc=4)
        plt.show()

    def plot_2012_vs_rest(self, root_dir):
        """
        Plots results of classifications of using 2012 as train set,
        2013, 2014, 2015 as test set.
        :param root_dir: string
        :return: void
        """
        all_accuracy_scores = self.__helper.get_log_files_stats(root_dir)
        """
        Example accuracy scores:
        {
            'SMO':{
                2013: [62.79, 66.67, 50.0, 70.45, 57.14, 60.0, 64.29, 66.67, 62.79, 73.17, 57.14, 66.67],
                2014: [65.45, 58.97, 54.35, 72.09, 47.62, 66.67, 71.43, 66.67, 57.78, 64.44, 71.43, 59.26],
                2015: [62.79, 57.14, 63.16, 62.5, 59.26, 67.27, 61.76, 66.67, 68.63]
            },
            'IB1': {
                2013: [37.21, 40.48, 38.1, 43.18, 45.24, 47.5, 45.24, 35.71, 32.56, 24.39, 28.57, 51.28],
                2014: [38.18, 43.59, 41.3, 39.53, 47.62, 50.0, 33.33, 44.44, 26.67, 24.44, 40.48, 37.04],
                2015: [37.21, 41.07, 43.86, 39.29, 51.85, 30.91, 39.71, 26.32, 45.1]
            }
            ...
            ...
            ...
        }
        """
        # Two views of the same data: per-month curves and per-year means.
        self._plot_2012_vs_rest_monthly(all_accuracy_scores)
        self._plot_2012_vs_rest_yearly(all_accuracy_scores)

    def plot_top_feature_frequencies_in_years(self, years_features_counts):
        """
        Plots top features' frequencies in years
        :param years_features_counts: dict, per-year {feature: count} maps
        :return: void
        """
        plot_feature_counts = {}
        bar_width = 0.20

        for feature_name in INFO_GAIN_ATTRIBUTES:
            if not feature_name in plot_feature_counts:
                plot_feature_counts[feature_name] = []
            # Feature names are byte strings; the per-year maps are keyed by
            # their unicode form -- presumably; verify against the producer.
            f_key = feature_name.decode('utf-8')
            # Backfill zero counts so every year has an entry for this feature.
            for year in self.__years:
                if not f_key in years_features_counts[year]:
                    years_features_counts[year][f_key] = 0
            plot_feature_counts[feature_name] = [
                years_features_counts["2012"][f_key],
                years_features_counts["2013"][f_key],
                years_features_counts["2014"][f_key],
                years_features_counts["2015"][f_key]
            ]
        print(plot_feature_counts)

        indexes = np.arange(len(plot_feature_counts.keys()))
        # One bar group per feature; within a group, one bar per year.
        for first_iteration_number, (feature_name, feature_counts) in enumerate(
                plot_feature_counts.iteritems()):
            for second_iteration_number, (color, feature_count) in enumerate(
                    zip(self.__colors, feature_counts)):
                x_coord = first_iteration_number + (second_iteration_number * bar_width)
                plt.bar(x_coord, feature_count, bar_width, color=color)

        xticks = [key.decode('utf-8') for key in plot_feature_counts.keys()]
        plt.xlabel('Features')
        # NOTE(review): "__years" in this label looks like a bad find/replace
        # artifact (probably meant "years") -- verify.
        plt.ylabel('Frequencies in __years')
        plt.title('InfoGain features by year and features(' + MODEL_NAME + ')')
        plt.xticks(indexes + bar_width * 2, xticks)

        # Manual legend: one color patch per year.
        handles = []
        for idx, (year, color) in enumerate(zip(self.__years, self.__colors)):
            patch = Patch(color=color, label=year)
            handles.append(patch)
        plt.legend(loc=1, handles=handles)
        plt.show()

    def plot_years_intersection_scores(self, years_features_counts):
        """
        Plots a matrix which shows years' vocabularies similarities
        :param years_features_counts: dict
        :return: void
        """
        years_intersection_scores = np.zeros((len(self.__years), len(self.__years)))
        feature_frequencies = years_features_counts

        for first_iteration_number, (x_year, x_years_features) in enumerate(
                feature_frequencies.iteritems()):
            features_of_x = x_years_features.keys()
            total_count = np.sum(x_years_features.values())
            for second_iteration_number, (y_year, y_years_features) in enumerate(
                    feature_frequencies.iteritems()):
                if x_year == y_year:
                    # Diagonal left at 0 rather than 1 -- intentional? verify.
                    pass
                else:
                    features_of_y = y_years_features.keys()
                    intersect = list(set(features_of_x) & set(features_of_y))
                    # Sum the *other* year's counts over the shared vocabulary.
                    intersect_count = 0
                    for intersect_item in intersect:
                        intersect_count = intersect_count + y_years_features[intersect_item]
                    ratio = float(intersect_count) / total_count
                    i_index = int(x_year) - self.__first_year  # 0
                    j_index = int(y_year) - self.__first_year  # 1
                    years_intersection_scores[i_index][j_index] = ratio

        all_scores_df = pd.DataFrame(years_intersection_scores, self.__years, self.__years)
        # NOTE(review): "__years" here also looks like a find/replace artifact.
        print(MODEL_NAME + '\'s __years\' vocabulary similarities:')
        print(all_scores_df)

    def plot_experiments_results_with_scikit_learn(self, lines_scores):
        """
        Plots experiments' results from scikit learn
        :param lines_scores: dict, per-line score series; "line2" carries
            (min, mid, max) triples, "line3" holds several sub-experiments
        :return: void
        """
        test_years = ['13', '14', '15']
        markers = ['o', 'D', 'h', '*', '+']
        plot_types = ['-', '--', '-.', ':', ',']
        legend_line_names = {
            'line1': 'LINE1',
            'line2': 'LINE2',
            'line3L0': 'LINE3-MultinomialNB DB',
            'line3L1': 'LINE3-kMEANS CLUSTERING',
            'line3L2': 'LINE3-kMEANS CLUSTERING(probabilities)',
            'line3L3': 'LINE3-MultinomialNB DB Iterative Approach',
            'line4': 'LINE4'
        }
        # -(2012-500)/(YEAR-300)
        # -(2012-500)+(YEAR-R50)/(YEAR-300)
        # -(2012-500)+(YEAR-L50)/(YEAR-300)
        # -(2012-500)+(YEAR-L50)/(YEAR-300)
        # -(2012-500)+(YEAR-200)/(YEAR-300)
        fig, ax = plt.subplots(figsize=(20, 9))
        ax.set_autoscale_on(False)
        ax.set_xlim([12.5, 15.5])
        # Running y-range over all plotted series, used for the final ylim.
        all_of_min = 100
        all_of_max = 0
        handles = []
        color_index = 0

        for first_iteration_number, (line_name, line_points) in enumerate(
                lines_scores.iteritems()):
            line_max, line_min = 0, 100
            if line_name == "line2":
                # line2: rows of (min, mid, max); draw the mid curve plus
                # hand-made error bars (two caps and a vertical connector).
                line_points_array = np.array(line_points.values())
                ys = line_points_array[:, 1]
                mins = line_points_array[:, 0]
                maxs = line_points_array[:, 2]
                line_max, line_min = np.max(maxs), np.min(mins)
                for sub_iteration_number, (a_min, a_max) in enumerate(zip(mins, maxs)):
                    ax.plot((int(test_years[sub_iteration_number]) - 0.05,
                             int(test_years[sub_iteration_number]) + 0.05),
                            (a_min, a_min), 'k-')
                    ax.plot((int(test_years[sub_iteration_number]) - 0.05,
                             int(test_years[sub_iteration_number]) + 0.05),
                            (a_max, a_max), 'k-')
                    ax.plot((int(test_years[sub_iteration_number]),
                             int(test_years[sub_iteration_number])),
                            (a_min, a_max), 'k-')
                ax.plot(test_years, ys, self.__colors[color_index],
                        marker=markers[first_iteration_number],
                        linestyle=plot_types[first_iteration_number],
                        linewidth=3.0)
                patch = Patch(color=self.__colors[color_index],
                              label=legend_line_names[line_name])
                color_index += 1
                handles.append(patch)
            elif line_name == "line3":
                # line3: one curve per ALE sub-experiment key prefix.
                for sub_iteration_number, (ale_experiment_key) in enumerate(ALE_LINE3_KEYS):
                    proper_dict_values = [
                        line_points[dict_key] for dict_key in line_points.keys()
                        if dict_key.startswith(ale_experiment_key)
                    ]
                    ys = proper_dict_values
                    line_max, line_min = np.max(ys), np.min(ys)
                    ax.plot(test_years, ys, self.__colors[color_index],
                            marker=markers[first_iteration_number],
                            linestyle=plot_types[first_iteration_number],
                            linewidth=3.0)
                    patch = Patch(color=self.__colors[color_index],
                                  label=legend_line_names[line_name + ale_experiment_key])
                    handles.append(patch)
                    color_index += 1
            else:
                # Default: one simple curve from the dict's values.
                ys = line_points.values()
                line_max, line_min = np.max(ys), np.min(ys)
                ax.plot(test_years, ys, self.__colors[color_index],
                        marker=markers[first_iteration_number],
                        linestyle=plot_types[first_iteration_number],
                        linewidth=3.0)
                patch = Patch(color=self.__colors[color_index],
                              label=legend_line_names[line_name])
                handles.append(patch)
                color_index += 1
            all_of_min = min(line_min, all_of_min)
            all_of_max = max(line_max, all_of_max)

        # Pad the observed range slightly for the y axis.
        ymin = all_of_min - 0.01
        ymax = all_of_max + 0.01
        plt.legend(handles=handles)
        ax.set_ylim([ymin, ymax])
        plt.yticks(np.arange(ymin, ymax, 0.01))
        ax.set_xticklabels(["", "13", "", "14", "", "15"])
        plt.xlabel('Years')
        plt.ylabel('Scores %')
        plt.title('Scores by year with changing training sets. Classifier=SVM Feature=Word.')
        plt.tight_layout()
        plt.grid()
        plt.show()

    def _plot_2012_vs_rest_monthly(self, all_accuracy_scores):
        """
        Plots 2012 vs REST graphic in monthly basis.
        :param all_accuracy_scores: dict
        :return: void
        """
        date_ranges = pd.date_range(start='1/1/2013', periods=33, freq='M')
        date_ranges = np.array([date_obj.strftime('%b-%y') for date_obj in date_ranges])
        xs = date_ranges
        # One figure per classifier; all years' monthly scores concatenated.
        # NOTE(review): plt.show() is never called here -- figures are
        # presumably displayed by a later call; verify.
        for iteration_number, classifier_scores in enumerate(all_accuracy_scores.values()):
            ys = []
            fig = plt.figure(iteration_number)
            for year, year_scores in classifier_scores.iteritems():
                ys += year_scores
            xs = np.arange(1, 34, 1)  # overrides the date labels built above
            plt.xlabel("Months")
            plt.ylabel("Scores%")
            plt.title(all_accuracy_scores.keys()[iteration_number])
            plt.plot(xs, ys)

    def _plot_2012_vs_rest_yearly(self, all_accuracy_scores):
        """
        Plots 2012 vs REST graphic in yearly basis.
        :param all_accuracy_scores: dict
        :return: void
        """
        date_ranges = pd.date_range(start='1/1/2013', periods=3, freq='365D')
        date_ranges = np.array([date_obj.strftime('%y') for date_obj in date_ranges])
        xs = date_ranges
        yearly_scores = {}  # NOTE(review): never used -- dead variable.
        fig, ax = plt.subplots()
        names_of_classifiers = all_accuracy_scores.keys()
        # One mean-per-year line per classifier, all on the same axes.
        for iteration_number, classifier_scores in enumerate(all_accuracy_scores.values()):
            ys = []
            for year, year_scores in classifier_scores.iteritems():
                ys.append(np.mean(year_scores))
            # Labels/title are (redundantly) reset on every iteration.
            plt.xlabel('Years')
            plt.ylabel('Scores %')
            plt.title('Scores by year and classifier(' + MODEL_NAME + ', train=2012, test=2013, 2014, 2015)')
            ax.set_xticklabels(xs)
            plt.xticks(rotation=90)
            ax.plot(xs, ys, self.__colors[iteration_number],
                    label=names_of_classifiers[iteration_number])
        plt.legend()
        plt.show()
class PlotManager:
    """
    This class does the necessary works for visualizing data: bar charts of
    yearly classifier scores, 2012-vs-rest line plots, feature-frequency bars,
    a year-vocabulary similarity matrix, and scikit-learn experiment curves.

    NOTE(review): this is the second, token-identical definition of
    PlotManager in this file; at import time it shadows the earlier one.
    Consider deleting one copy.

    NOTE(review): the code relies on Python 2-only idioms (dict.iteritems(),
    indexing the list returned by dict.keys()/values(), str.decode on byte
    strings). It will not run unmodified on Python 3 -- confirm the target
    interpreter before porting.
    """

    def __init__(self):
        # Base year: maps a year string to a 0-based matrix/list index.
        self.__first_year = 2012
        self.__helper = GeneralHelpers()
        # One matplotlib single-letter color code per plotted series.
        self.__colors = ['r', 'b', 'y', 'm', 'g', 'c', 'k']
        self.__years = ('2012', '2013', '2014', '2015')
        # NOTE(review): not a raw string; \d/\s/\w are invalid escapes on
        # newer Pythons -- consider r"...".
        self.__regexp_for_predict_lines = "\d{1,}\s{1,}\d{1}:\w{1,8}.{1,}"

    def plot_years_scores_from_root_directory(self, root_dir):
        """
        Plots years' scores for given classifiers and mean of them
        :param root_dir: string, root directory to scan
        :return: void
        """
        bar_width = 0.10

        # Getting scores from helper
        years_classifier_scores_list = []
        years_classifier_scores_dict = self.__helper.get_accuracy_scores_for_years_from_root_dir(root_dir)

        # Making them lists (one row of classifier scores per year)
        for year, classifiers_scores in years_classifier_scores_dict.iteritems():
            years_classifier_scores_list.append(classifiers_scores.values())

        years_classifier_scores_list = np.array(years_classifier_scores_list)
        classifier_names = years_classifier_scores_dict['2012'].keys()
        indexes = np.arange(len(years_classifier_scores_dict.keys()))  # [0,1,2,3] +

        # Iterating over J48 for 2012, 2013, 2014, 2015, MEAN for 2012, 2013,
        # 2014, 2015 an so on.. (the transpose gives one column per classifier)
        for iteration_number, (color_name, classifier_name, classifier_scores) in enumerate(
                zip(self.__colors, classifier_names, years_classifier_scores_list.T)):
            # Shift each classifier's bar group sideways so groups don't overlap.
            bar_offset = indexes + (iteration_number * bar_width)
            plt.bar(bar_offset, classifier_scores, bar_width, color=color_name, label=classifier_name)

        plt.xlabel('Years')
        plt.ylabel('Scores %')
        plt.title('Scores by year and classifier(' + MODEL_NAME + ', CV=4)')
        plt.xticks(indexes + bar_width, self.__years)
        plt.legend(loc=4)
        plt.show()

    def plot_2012_vs_rest(self, root_dir):
        """
        Plots results of classifications of using 2012 as train set,
        2013, 2014, 2015 as test set.
        :param root_dir: string
        :return: void
        """
        all_accuracy_scores = self.__helper.get_log_files_stats(root_dir)
        """
        Example accuracy scores:
        {
            'SMO':{
                2013: [62.79, 66.67, 50.0, 70.45, 57.14, 60.0, 64.29, 66.67, 62.79, 73.17, 57.14, 66.67],
                2014: [65.45, 58.97, 54.35, 72.09, 47.62, 66.67, 71.43, 66.67, 57.78, 64.44, 71.43, 59.26],
                2015: [62.79, 57.14, 63.16, 62.5, 59.26, 67.27, 61.76, 66.67, 68.63]
            },
            'IB1': {
                2013: [37.21, 40.48, 38.1, 43.18, 45.24, 47.5, 45.24, 35.71, 32.56, 24.39, 28.57, 51.28],
                2014: [38.18, 43.59, 41.3, 39.53, 47.62, 50.0, 33.33, 44.44, 26.67, 24.44, 40.48, 37.04],
                2015: [37.21, 41.07, 43.86, 39.29, 51.85, 30.91, 39.71, 26.32, 45.1]
            }
            ...
            ...
            ...
        }
        """
        # Two views of the same data: per-month curves and per-year means.
        self._plot_2012_vs_rest_monthly(all_accuracy_scores)
        self._plot_2012_vs_rest_yearly(all_accuracy_scores)

    def plot_top_feature_frequencies_in_years(self, years_features_counts):
        """
        Plots top features' frequencies in years
        :param years_features_counts: dict, per-year {feature: count} maps
        :return: void
        """
        plot_feature_counts = {}
        bar_width = 0.20

        for feature_name in INFO_GAIN_ATTRIBUTES:
            if not feature_name in plot_feature_counts:
                plot_feature_counts[feature_name] = []
            # Feature names are byte strings; the per-year maps are keyed by
            # their unicode form -- presumably; verify against the producer.
            f_key = feature_name.decode('utf-8')
            # Backfill zero counts so every year has an entry for this feature.
            for year in self.__years:
                if not f_key in years_features_counts[year]:
                    years_features_counts[year][f_key] = 0
            plot_feature_counts[feature_name] = [years_features_counts["2012"][f_key],
                                                 years_features_counts["2013"][f_key],
                                                 years_features_counts["2014"][f_key],
                                                 years_features_counts["2015"][f_key]
                                                 ]
        print(plot_feature_counts)

        indexes = np.arange(len(plot_feature_counts.keys()))
        # One bar group per feature; within a group, one bar per year.
        for first_iteration_number, (feature_name, feature_counts) in enumerate(plot_feature_counts.iteritems()):
            for second_iteration_number, (color, feature_count) in enumerate(zip(self.__colors, feature_counts)):
                x_coord = first_iteration_number + (second_iteration_number*bar_width)
                plt.bar(x_coord, feature_count, bar_width, color=color)

        xticks = [key.decode('utf-8') for key in plot_feature_counts.keys()]
        plt.xlabel('Features')
        # NOTE(review): "__years" in this label looks like a bad find/replace
        # artifact (probably meant "years") -- verify.
        plt.ylabel('Frequencies in __years')
        plt.title('InfoGain features by year and features(' + MODEL_NAME + ')')
        plt.xticks(indexes + bar_width*2, xticks)

        # Manual legend: one color patch per year.
        handles = []
        for idx, (year, color) in enumerate(zip(self.__years, self.__colors)):
            patch = Patch(color=color, label=year)
            handles.append(patch)
        plt.legend(loc=1, handles=handles)
        plt.show()

    def plot_years_intersection_scores(self, years_features_counts):
        """
        Plots a matrix which shows years' vocabularies similarities
        :param years_features_counts: dict
        :return: void
        """
        years_intersection_scores = np.zeros((len(self.__years), len(self.__years)))
        feature_frequencies = years_features_counts

        for first_iteration_number, (x_year, x_years_features) in enumerate(feature_frequencies.iteritems()):
            features_of_x = x_years_features.keys()
            total_count = np.sum(x_years_features.values())
            for second_iteration_number, (y_year, y_years_features) in enumerate(feature_frequencies.iteritems()):
                if x_year == y_year:
                    # Diagonal left at 0 rather than 1 -- intentional? verify.
                    pass
                else:
                    features_of_y = y_years_features.keys()
                    intersect = list(set(features_of_x) & set(features_of_y))
                    # Sum the *other* year's counts over the shared vocabulary.
                    intersect_count = 0
                    for intersect_item in intersect:
                        intersect_count = intersect_count + y_years_features[intersect_item]
                    ratio = float(intersect_count)/total_count
                    i_index = int(x_year) - self.__first_year  # 0
                    j_index = int(y_year) - self.__first_year  # 1
                    years_intersection_scores[i_index][j_index] = ratio

        all_scores_df = pd.DataFrame(years_intersection_scores, self.__years, self.__years)
        # NOTE(review): "__years" here also looks like a find/replace artifact.
        print(MODEL_NAME + '\'s __years\' vocabulary similarities:')
        print(all_scores_df)

    def plot_experiments_results_with_scikit_learn(self, lines_scores):
        """
        Plots experiments' results from scikit learn
        :param lines_scores: dict, per-line score series; "line2" carries
            (min, mid, max) triples, "line3" holds several sub-experiments
        :return: void
        """
        test_years = ['13', '14', '15']
        markers = ['o', 'D', 'h', '*', '+']
        plot_types = ['-', '--', '-.', ':', ',']
        legend_line_names = {
            'line1': 'LINE1',
            'line2': 'LINE2',
            'line3L0': 'LINE3-MultinomialNB DB',
            'line3L1': 'LINE3-kMEANS CLUSTERING',
            'line3L2': 'LINE3-kMEANS CLUSTERING(probabilities)',
            'line3L3': 'LINE3-MultinomialNB DB Iterative Approach',
            'line4': 'LINE4'
        }
        # -(2012-500)/(YEAR-300)
        # -(2012-500)+(YEAR-R50)/(YEAR-300)
        # -(2012-500)+(YEAR-L50)/(YEAR-300)
        # -(2012-500)+(YEAR-L50)/(YEAR-300)
        # -(2012-500)+(YEAR-200)/(YEAR-300)
        fig, ax = plt.subplots(figsize=(20, 9))
        ax.set_autoscale_on(False)
        ax.set_xlim([12.5, 15.5])
        # Running y-range over all plotted series, used for the final ylim.
        all_of_min = 100
        all_of_max = 0
        handles = []
        color_index = 0

        for first_iteration_number, (line_name, line_points) in enumerate(lines_scores.iteritems()):
            line_max, line_min = 0, 100
            if line_name == "line2":
                # line2: rows of (min, mid, max); draw the mid curve plus
                # hand-made error bars (two caps and a vertical connector).
                line_points_array = np.array(line_points.values())
                ys = line_points_array[:, 1]
                mins = line_points_array[:, 0]
                maxs = line_points_array[:, 2]
                line_max, line_min = np.max(maxs), np.min(mins)
                for sub_iteration_number, (a_min, a_max) in enumerate(zip(mins, maxs)):
                    ax.plot((int(test_years[sub_iteration_number])-0.05, int(test_years[sub_iteration_number])+0.05), (a_min, a_min), 'k-')
                    ax.plot((int(test_years[sub_iteration_number])-0.05, int(test_years[sub_iteration_number])+0.05), (a_max, a_max), 'k-')
                    ax.plot((int(test_years[sub_iteration_number]), int(test_years[sub_iteration_number])), (a_min, a_max), 'k-')
                ax.plot(test_years, ys, self.__colors[color_index], marker=markers[first_iteration_number], linestyle=plot_types[first_iteration_number], linewidth=3.0)
                patch = Patch(color=self.__colors[color_index], label=legend_line_names[line_name])
                color_index += 1
                handles.append(patch)
            elif line_name == "line3":
                # line3: one curve per ALE sub-experiment key prefix.
                for sub_iteration_number, (ale_experiment_key) in enumerate(ALE_LINE3_KEYS):
                    proper_dict_values = [line_points[dict_key] for dict_key in line_points.keys() if dict_key.startswith(ale_experiment_key)]
                    ys = proper_dict_values
                    line_max, line_min = np.max(ys), np.min(ys)
                    ax.plot(test_years, ys, self.__colors[color_index], marker=markers[first_iteration_number], linestyle=plot_types[first_iteration_number], linewidth=3.0)
                    patch = Patch(color=self.__colors[color_index], label=legend_line_names[line_name+ale_experiment_key])
                    handles.append(patch)
                    color_index += 1
            else:
                # Default: one simple curve from the dict's values.
                ys = line_points.values()
                line_max, line_min = np.max(ys), np.min(ys)
                ax.plot(test_years, ys, self.__colors[color_index], marker=markers[first_iteration_number], linestyle=plot_types[first_iteration_number], linewidth=3.0)
                patch = Patch(color=self.__colors[color_index], label=legend_line_names[line_name])
                handles.append(patch)
                color_index += 1
            all_of_min = min(line_min, all_of_min)
            all_of_max = max(line_max, all_of_max)

        # Pad the observed range slightly for the y axis.
        ymin = all_of_min-0.01
        ymax = all_of_max+0.01
        plt.legend(handles=handles)
        ax.set_ylim([ymin, ymax])
        plt.yticks(np.arange(ymin, ymax, 0.01))
        ax.set_xticklabels(["", "13", "", "14", "", "15"])
        plt.xlabel('Years')
        plt.ylabel('Scores %')
        plt.title('Scores by year with changing training sets. Classifier=SVM Feature=Word.')
        plt.tight_layout()
        plt.grid()
        plt.show()

    def _plot_2012_vs_rest_monthly(self, all_accuracy_scores):
        """
        Plots 2012 vs REST graphic in monthly basis.
        :param all_accuracy_scores: dict
        :return: void
        """
        date_ranges = pd.date_range(start='1/1/2013', periods=33, freq='M')
        date_ranges = np.array([date_obj.strftime('%b-%y') for date_obj in date_ranges])
        xs = date_ranges
        # One figure per classifier; all years' monthly scores concatenated.
        # NOTE(review): plt.show() is never called here -- figures are
        # presumably displayed by a later call; verify.
        for iteration_number, classifier_scores in enumerate(all_accuracy_scores.values()):
            ys = []
            fig = plt.figure(iteration_number)
            for year, year_scores in classifier_scores.iteritems():
                ys += year_scores
            xs = np.arange(1, 34, 1)  # overrides the date labels built above
            plt.xlabel("Months")
            plt.ylabel("Scores%")
            plt.title(all_accuracy_scores.keys()[iteration_number])
            plt.plot(xs, ys)

    def _plot_2012_vs_rest_yearly(self, all_accuracy_scores):
        """
        Plots 2012 vs REST graphic in yearly basis.
        :param all_accuracy_scores: dict
        :return: void
        """
        date_ranges = pd.date_range(start='1/1/2013', periods=3, freq='365D')
        date_ranges = np.array([date_obj.strftime('%y') for date_obj in date_ranges])
        xs = date_ranges
        yearly_scores = {}  # NOTE(review): never used -- dead variable.
        fig, ax = plt.subplots()
        names_of_classifiers = all_accuracy_scores.keys()
        # One mean-per-year line per classifier, all on the same axes.
        for iteration_number, classifier_scores in enumerate(all_accuracy_scores.values()):
            ys = []
            for year, year_scores in classifier_scores.iteritems():
                ys.append(np.mean(year_scores))
            # Labels/title are (redundantly) reset on every iteration.
            plt.xlabel('Years')
            plt.ylabel('Scores %')
            plt.title('Scores by year and classifier(' + MODEL_NAME + ', train=2012, test=2013, 2014, 2015)')
            ax.set_xticklabels(xs)
            plt.xticks(rotation=90)
            ax.plot(xs, ys, self.__colors[iteration_number], label=names_of_classifiers[iteration_number])
        plt.legend()
        plt.show()