def process_bio(self, input_bio):
    # Extract the unique words from the bio and normalize them to plain strings
    bio_data = PreprocessManager.get_unique_words(input_bio)
    bio_data = [str(word) for word in bio_data]

    # Match interests
    [interest_p, interest_q] = self.get_best_match(bio_data, self.interest_dict, self.interests_tolerance)

    # Match technologies
    [tech_p, tech_q] = self.get_best_match(bio_data, self.tech_dict, self.tech_tolerance)

    # Match languages
    [languages_p, languages_q] = self.get_best_match(bio_data, self.lagunages_dict, self.languages_tolerance)

    # Match positions
    [positions_p, positions_q] = self.get_best_match(bio_data, self.position_dict, self.position_tolernce)

    # Match student statuses
    [status_p, status_q] = self.get_best_match(bio_data, self.student_status_dict, self.student_status_tolerance)

    return interest_q, tech_q, languages_q, positions_q, status_q
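# Hedged sketch (not part of the original file): get_best_match is called above but never defined in
# this snippet. The stand-in below only illustrates one plausible shape for it, fuzzily matching the
# bio words against one category dictionary with fuzzywuzzy and returning the best [phrase, score]
# pair above a tolerance, mirroring the [p, q] pairs unpacked above. The name get_best_match_sketch
# and its exact return convention are assumptions.
from fuzzywuzzy import process as fuzzy_process


def get_best_match_sketch(words, category_dict, tolerance):
    """Hypothetical stand-in for self.get_best_match(); not the original implementation."""
    best_phrase, best_score = None, 0
    for word in words:
        # extractOne returns the closest entry and its similarity score, or None below the cutoff
        match = fuzzy_process.extractOne(word, category_dict, score_cutoff=tolerance)
        if match is not None and match[1] > best_score:
            best_phrase, best_score = match[0], match[1]
    return [best_phrase, best_score]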
def process_one_log(self, input_log, repo_info_topics):
    input_log = PreprocessManager.remove_non_ascii(input_log)
    # TODO: Do we need repo info?
    # repo_info_topics = PreprocessManager.remove_non_ascii(repo_info_topics)

    # Find the length of the log in tokens.
    # TODO: Scores that depend on the length are biased unless they are normalized; check that.
    length = len(PreprocessManager.get_raw_tokenized_text(input_log))

    # Find structural integrity: grammar-check the log, auto-correct it, and measure
    # how close the original text is to the corrected text.
    self.grammar_tool.enable_spellchecking()
    problematic_matches = self.grammar_tool.check(input_log)
    corrected_text = gc.correct(input_log, problematic_matches)
    degree_of_match = fuzz.ratio(input_log, corrected_text)
    structural_integrity_score = degree_of_match * (length - len(problematic_matches))

    # Check whether the topic is relevant.
    # This is still in a testing phase; it is not clear whether it improves the final results,
    # and it might be useless at times.
    sframe_data_for_topics = gl.SArray([PreprocessManager.get_word_counts(input_log)])
    # Seed associations for topic 0. TODO: make these proper associations.
    associations = gl.SFrame({'word': ['fix', 'issue', 'implement', 'modify', 'changed', 'bug', 'error'],
                              'topic': [0, 0, 0, 0, 0, 0, 0]})
    topic_model = gl.topic_model.create(sframe_data_for_topics, associations=associations)
    # TODO: Add the match with the description here. Is that useful? Maybe future work?
    # pred = topic_model.predict(sframe_data_for_topics, output_type='probability')
    topics = topic_model.get_topics()

    # The final score is the sum of all topic-0 scores, since topic 0 was used in the associations;
    # it measures how relevant the text is as a commit message.
    topic_relevance_score = 0
    for i in xrange(0, len(topics)):
        curr = topics[i]
        topic_id = curr['topic']
        score_val = curr['score']
        if topic_id == 0:
            topic_relevance_score += score_val
    topic_relevance_score *= 100
    # print topics, topic_relevance_score

    # Check how much positivity the log expresses.
    log_dict = dict()
    log_dict['text'] = input_log
    positivity = self.senti_checker.predict_row(log_dict)
    positivity_score = 100 * positivity
    # print positivity_score

    # Spelling goodness: count misspelled words.
    self.spell_master.set_text(input_log)
    error_words = list()
    for err in self.spell_master:
        error_words.append(err.word)
    spelling_integrity_score = length - len(error_words)

    # Return all the scores.
    return length, structural_integrity_score, topic_relevance_score, positivity_score, spelling_integrity_score
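# Hedged setup sketch (not part of the original file): process_one_log relies on helper objects that
# are created elsewhere in the project. The constructor below only illustrates one plausible wiring
# for them, assuming the grammar_check module (imported as gc above) and pyenchant; the class name
# LogProcessorSetup, the locale strings, and the injected sentiment model are all assumptions.
import grammar_check as gc
import enchant.checker


class LogProcessorSetup(object):
    def __init__(self, senti_checker):
        # LanguageTool-style checker used above via check() and enable_spellchecking()
        self.grammar_tool = gc.LanguageTool('en-US')
        # pyenchant checker: set_text() then iterate to get misspelled words (err.word)
        self.spell_master = enchant.checker.SpellChecker('en_US')
        # pre-trained sentiment model exposing predict_row(); passed in because its origin is not shown here
        self.senti_checker = senti_checker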
import os
import csv
from datetime import datetime

# DBManager, GeneralHelpers, PreprocessManager and TwitterManager come from the project's own
# modules; their import paths are not shown in this file.


class ImportManager:
    """
    This class handles importing tweets to the database from various sources such as text files
    """

    __file_path = None
    __components_in_a_line = None

    def __init__(self):
        """
        Constructor method
        :return: ImportManager instance
        """
        self.__db_manager = DBManager()
        self.__helper = GeneralHelpers()
        self.__preprocess_manager = PreprocessManager()
        self.__tweets_classes_dictionary = {}

        # magic numbers
        self.__components_in_a_line = 2
        self.__max_num_of_tweets_at_once = 100

    def run(self, file_path_to_import):
        """
        Runs all necessary methods to import tweets for a year
        :param file_path_to_import: String, a txt file path containing tweet ids
        :return: void
        """
        self.__file_path = file_path_to_import

        # getting tweets with their classes
        tweets_with_classes = self._parse_tweets_from_file()
        self.__tweets_with_classes = tweets_with_classes

        # finding duplicates
        unique_tweets, duplicate_tweets = self._find_duplicates(tweets_with_classes)
        print("Found " + str(len(duplicate_tweets)) + " duplicate tweets.")
        self.__helper.pretty_print_list(duplicate_tweets, "Duplicate tweets:")
        print("Continuing with unique ones.")

        # getting tweet ids from [tweet_id, class]
        unique_tweets_ids = self._get_tweets_ids(unique_tweets)

        # retrieving tweets from Twitter
        all_tweet_information = self._retrieve_tweets_from_twitter(unique_tweets_ids)

        # some tweets may not be found on Twitter
        not_found_tweets_on_twitter = self._find_not_found_tweets_on_twitter(all_tweet_information)

        # creating db model objects
        all_tweet_objects = self._create_tweet_objects(all_tweet_information)

        # insert to database
        success_count, not_imported_tweets = self.__db_manager.insert_tweet_objects(all_tweet_objects)

        print("\n")
        print('-' * 10)
        print('Total Math:')
        print('Unique tweets:' + str(len(unique_tweets)))
        print('Tweets not found:' + str(len(not_found_tweets_on_twitter)))
        print('Tweets not inserted:' + str(len(not_imported_tweets)))
        print('Tweets OK:' + str(success_count))
        print(str(len(unique_tweets)) + "==" +
              str(len(not_found_tweets_on_twitter) + len(not_imported_tweets) + success_count))

    def _parse_tweets_from_file(self):
        """
        Parses tweet ids and classes from txt file
        :return: list, holds [[124214124, positive],...]
        """
        characters_to_remove = ["'", '"', '\n', ' ']

        with open(self.__file_path, 'r') as tweets_ids_file:
            tweets_with_classes = []
            self.__tweets_classes_dictionary = {}

            # Iterating over lines in txt file
            for line in tweets_ids_file:
                line_components = line.split(",")

                # if there are two components in a line. E.g. "121412412412", "positive"
                if self.__components_in_a_line == len(line_components):
                    # iterating over components
                    for index, component in enumerate(line_components):
                        # removing unnecessary characters
                        line_components[index] = self.__preprocess_manager.remove_characters_in_string(
                            component, characters_to_remove)

                    tweets_with_classes.append(line_components)
                    self.__tweets_classes_dictionary.update({line_components[0]: line_components[1]})

        return tweets_with_classes

    def _find_duplicates(self, tweets_with_classes):
        """
        Finds duplicate tweets
        :param tweets_with_classes: List, a list of tweet ids and their sentiment classes.
        :return: unique tweets, duplicate tweets
        """
        unique_tweets = []
        seen_tweets_ids = []
        duplicate_tweet_ids = []

        # Iterating over tweets with their classes. E.g. [[214124124124, positive], [124124124124, negative], ...]
        for tweet_block in tweets_with_classes:
            # First element is the tweet id
            tweet_id = tweet_block[0]

            # If it isn't seen before
            if tweet_id not in seen_tweets_ids:
                seen_tweets_ids.append(tweet_id)
                unique_tweets.append(tweet_block)
            else:
                duplicate_tweet_ids.append(tweet_id)

        return unique_tweets, duplicate_tweet_ids

    def _retrieve_tweets_from_twitter(self, tweet_ids):
        """
        Retrieves tweet information from Twitter
        :param tweet_ids: List, tweet ids to look up
        :return: List, tweet information returned by Twitter
        """
        tweets_results = []
        twitter_manager = TwitterManager()
        chunks_of_tweets_ids = self.__helper.get_chunks_of_list(tweet_ids, self.__max_num_of_tweets_at_once)

        for chunk in chunks_of_tweets_ids:
            print("Searching for " + str(len(chunk)) + " tweets.")
            result = twitter_manager.lookup(chunk)
            print("Found " + str(len(result)) + " tweets.")
            tweets_results += result

        return tweets_results

    def _get_tweets_ids(self, tweets_with_classes):
        """
        Extracts tweet ids from tweets with classes
        :param tweets_with_classes: List, tweets with classes
        :return: List, extracted ids
        """
        ids = [x[0] for x in tweets_with_classes]
        return ids

    def _create_tweet_objects(self, all_tweets):
        """
        Creates a list which holds db model objects.
        :param all_tweets: List, tweets retrieved from Twitter
        :return: List, tweet objects
        """
        all_tweet_objects = []

        for tweet in all_tweets:
            tweet_object = self.__db_manager.get_new_model_instance()

            tweet_object.id = tweet.id_str
            tweet_object.created_at = tweet.created_at
            tweet_object.lang = tweet.lang
            tweet_object.source = tweet.source
            tweet_object.user_id = tweet.user.id_str
            tweet_object.text = self.__preprocess_manager.clean_emojis_and_smileys(tweet.text).encode('utf-8')
            tweet_object.tweet_class = self._get_sentiment_class_of_tweet(tweet.id_str)

            all_tweet_objects.append(tweet_object)

        return all_tweet_objects

    def _get_sentiment_class_of_tweet(self, tweet_id):
        """
        Returns a sentiment class for given tweet id
        :param tweet_id: String, tweet id
        :return: String, tweet class
        """
        return self.__tweets_classes_dictionary[tweet_id]

    def _find_not_found_tweets_on_twitter(self, twitter_response):
        """
        Finds tweets that were not found on the Twitter API
        :param twitter_response: List, Twitter response
        :return: List, ids of tweets that were not found
        """
        tweets_ids = self._get_tweets_ids(self.__tweets_with_classes)
        response_ids = [a_tweet_response.id_str for a_tweet_response in twitter_response]
        return list(set(tweets_ids) - set(response_ids))

    def import_new_tweets_from_csv(self, root_path):
        """
        Imports tweets from all csv files under the given directory
        :param root_path: String, directory path containing csv files
        :return: void
        """
        tweet_objects = []

        for file in os.listdir(root_path):
            if file.endswith('.csv'):
                with open(root_path + file, 'r') as file_handle:
                    reader = csv.reader(file_handle, delimiter=';')
                    next(reader, None)  # skip the headers
                    for row in reader:
                        a_tweet_obj = self._create_tweet_object_from_line(row, file)
                        tweet_objects.append(a_tweet_obj)

        success_count, not_imported_tweets = self.__db_manager.insert_tweet_objects(tweet_objects)
        print(success_count)
        print(not_imported_tweets)

    def _create_tweet_object_from_line(self, components, file_name):
        """
        Creates a tweet db model object from one csv row
        :param components: List, columns of one csv row
        :param file_name: String, name of the csv file the row came from
        :return: tweet object
        """
        MAP_DICT = {'e': 'positive', 'h': 'negative', 'n': 'neutral'}

        id_component = components[0]  # id from the csv; a random id is generated below instead
        date_component = components[2]
        text_component = components[3]
        sentiment_component = MAP_DICT[components[4]]

        year_abv = file_name.split('.')[0][2:]
        if year_abv not in file_name:
            return

        # if the date has no year part, append the year abbreviation taken from the file name
        date_len = len(date_component.split('-'))
        if date_len == 2:
            date_component = date_component + '-' + year_abv

        format_str = '%d-%b-%y'
        datetime_of_tweet = datetime.strptime(date_component, format_str)

        tweet_object = self.__db_manager.get_new_model_instance()
        tweet_object.id = self.__helper.generate_random_string(10)
        tweet_object.text = text_component
        tweet_object.created_at = datetime_of_tweet
        tweet_object.tweet_class = sentiment_component

        return tweet_object
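# Minimal usage sketch (not part of the original module): the file and directory paths below are
# placeholders, and the surrounding project is assumed to provide the database and Twitter
# credentials that DBManager and TwitterManager need.
if __name__ == '__main__':
    import_manager = ImportManager()

    # Import tweet ids and sentiment classes from a txt file, resolving them through the Twitter API.
    import_manager.run('data/tweet_ids_2015.txt')

    # Or import already-collected tweets from csv files under a directory. Note the trailing slash:
    # import_new_tweets_from_csv builds paths with 'root_path + file'.
    import_manager.import_new_tweets_from_csv('data/csv/')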