# Esempio n. 1 (Example no. 1)
# 0
    def __init__(self):
        """
        Constructor method.

        Sets up the database/helper/preprocessing managers and the mapping
        from tweet id to sentiment class used during an import.

        :return: ImportManager instance
        """

        self.__db_manager = DBManager()  # database access layer
        self.__helper = GeneralHelpers()  # misc utility helpers
        self.__preprocess_manager = PreprocessManager()  # text cleaning utilities
        self.__tweets_classes_dictionary = {}  # tweet_id -> sentiment class

        # magic numbers
        self.__components_in_a_line = 2  # expected fields per input line: tweet id, class
        self.__max_num_of_tweets_at_once = 100  # Twitter lookup API batch limit
    def process_bio(self, input_bio):
        """
        Match a user bio against the interest, technology, language,
        position and student-status dictionaries.

        :param input_bio: String, the raw bio text
        :return: tuple of the five best-match results, one per category
        """
        # Unique words of the bio, stringified for matching.
        bio_words = [str(token) for token in PreprocessManager.get_unique_words(input_bio)]

        # (dictionary, tolerance) pairs, matched in a fixed order.
        # NB: 'lagunages_dict' and 'position_tolernce' mirror the (misspelled)
        # attribute names defined elsewhere in the class — do not "fix" here.
        categories = (
            (self.interest_dict, self.interests_tolerance),
            (self.tech_dict, self.tech_tolerance),
            (self.lagunages_dict, self.languages_tolerance),
            (self.position_dict, self.position_tolernce),
            (self.student_status_dict, self.student_status_tolerance),
        )

        # get_best_match returns a [p, q] pair; only the second element is kept.
        results = [self.get_best_match(bio_words, lookup, tolerance)[1]
                   for lookup, tolerance in categories]

        return tuple(results)
    def __init__(self):
        """
        Constructor method.

        Sets up the database/helper/preprocessing managers and the mapping
        from tweet id to sentiment class used during an import.

        :return: ImportManager instance
        """

        self.__db_manager = DBManager()  # database access layer
        self.__helper = GeneralHelpers()  # misc utility helpers
        self.__preprocess_manager = PreprocessManager()  # text cleaning utilities
        self.__tweets_classes_dictionary = {}  # tweet_id -> sentiment class

        # magic numbers
        self.__components_in_a_line = 2  # expected fields per input line: tweet id, class
        self.__max_num_of_tweets_at_once = 100  # Twitter lookup API batch limit
# Esempio n. 4 (Example no. 4)
# 0
    def process_one_log(self, input_log, repo_info_topics):
        """
        Score one commit-log message along several quality dimensions.

        Python 2 code (uses ``xrange``). Depends on instance attributes
        (grammar_tool, senti_checker, spell_master) and module-level names
        (PreprocessManager, gc, fuzz, gl) defined elsewhere; their exact
        semantics are assumed, not visible here.

        :param input_log: String, the raw commit message to score
        :param repo_info_topics: unused for now (see TODO below)
        :return: tuple (length, structural_integrity_score,
                 topic_relevance_score, positivity_score,
                 spelling_integrity_score)
        """
        input_log = PreprocessManager.remove_non_ascii(input_log)
        # TODO : Do we need repo info?
        #repo_info_topics = PreprocessManager.remove_non_ascii(repo_info_topics)
        # Find the length (token count of the cleaned message)
        # TODO : All the scores which are dependent on the length are not unbiased if not normalized! Check that
        length = len(PreprocessManager.get_raw_tokenized_text(input_log))

        # Find structural integrity: grammar-check the text, auto-correct it,
        # then reward texts that needed few corrections and are long.
        self.grammar_tool.enable_spellchecking()
        problematic_matches = self.grammar_tool.check(input_log)
        corrected_text = gc.correct(input_log, problematic_matches)
        # fuzz.ratio: 0-100 similarity between original and corrected text
        degree_of_match = fuzz.ratio(input_log, corrected_text)
        structural_integrity_score = degree_of_match * (length - len(problematic_matches))

        # Check if topic is relevant
        # This is still in testing phase and not sure if it has a good impact on the final results.
        # Might be totally useless at times.
        sframe_data_for_topics = gl.SArray([PreprocessManager.get_word_counts(input_log)])
        # Add Associations here TODO: Make it proper
        # Seed words anchored to topic 0, which is treated below as the
        # "commit message" topic.
        associations = gl.SFrame({'word': ['fix', 'issue', 'implement', 'modify', 'changed', 'bug', 'error'],
                               'topic': [0, 0, 0, 0, 0, 0, 0]})

        topic_model = gl.topic_model.create(sframe_data_for_topics, associations=associations)

        # TODO : Add here the match with the description. Is that useful? Maybe Future work?

        #pred = topic_model.predict(sframe_data_for_topics, output_type='probability')
        topics = topic_model.get_topics()
        # The final score is the sum of all the topic 0 scores! As they were used in associations. Gives us relevance of being a commit message!
        topic_relevance_score = 0
        for i in xrange(0, len(topics)):
            curr = topics[i]
            topic_id = curr['topic']
            score_val = curr['score']
            if topic_id == 0:
                topic_relevance_score += score_val

        # scale the summed topic-0 probabilities to a 0-100-style score
        topic_relevance_score *= 100

        #print topics, topic_relevance_score



        # Check how much positivity
        # NOTE(review): senti_checker presumably returns a probability in
        # [0, 1], making positivity_score a 0-100 value -- confirm.
        log_dict = dict()
        log_dict['text'] = input_log
        positivity = self.senti_checker.predict_row(log_dict)
        positivity_score = 100 * positivity

        #print positivity_score




        # Spelling Goodness: spell_master iterates over the spelling errors
        # found in the text; score is one point per correctly spelled token.
        self.spell_master.set_text(input_log)
        error_words = list()
        for err in self.spell_master:
            error_words.append(err.word)
        spelling_integrity_score = length - len(error_words)


        #return all
        return length, structural_integrity_score, topic_relevance_score, positivity_score, spelling_integrity_score
# Esempio n. 5 (Example no. 5)
# 0
class ImportManager:
    """
    Imports tweets into the database from various sources (txt id lists and
    csv exports), looking up tweet details on Twitter where needed.
    """

    # set by run(); path of the txt file currently being imported
    __file_path = None
    # expected number of comma-separated fields per input line (id, class)
    __components_in_a_line = None

    def __init__(self):
        """
        Constructor method.

        Sets up the database/helper/preprocessing managers and the mapping
        from tweet id to sentiment class used during an import.

        :return: ImportManager instance
        """
        self.__db_manager = DBManager()  # database access layer
        self.__helper = GeneralHelpers()  # misc utility helpers
        self.__preprocess_manager = PreprocessManager()  # text cleaning utilities
        self.__tweets_classes_dictionary = {}  # tweet_id -> sentiment class

        # magic numbers
        self.__components_in_a_line = 2
        self.__max_num_of_tweets_at_once = 100  # Twitter lookup API batch limit

    def run(self, file_path_to_import):
        """
        Runs all necessary methods to import tweets for a year.

        :param file_path_to_import: String, txt file path containing tweet ids
        :return: void
        """
        self.__file_path = file_path_to_import

        # getting tweets with their classes
        tweets_with_classes = self._parse_tweets_from_file()
        self.__tweets_with_classes = tweets_with_classes

        # finding duplicates
        unique_tweets, duplicate_tweets = self._find_duplicates(tweets_with_classes)

        print("Found " + str(len(duplicate_tweets)) + " duplicate tweets.")
        self.__helper.pretty_print_list(duplicate_tweets, "Duplicate tweets:")
        print("Continuing with unique ones.")

        # getting tweet ids from [tweet_id, class]
        unique_tweets_ids = self._get_tweets_ids(unique_tweets)

        # retrieving tweets from Twitter
        all_tweet_information = self._retrieve_tweets_from_twitter(unique_tweets_ids)

        # some tweets may not be found on Twitter
        not_found_tweets_on_twitter = self._find_not_found_tweets_on_twitter(all_tweet_information)

        # creating db model objects
        all_tweet_objects = self._create_tweet_objects(all_tweet_information)

        # insert to database
        success_count, not_imported_tweets = self.__db_manager.insert_tweet_objects(all_tweet_objects)

        # sanity report: unique == not_found + not_inserted + ok
        print("\n")
        print('-' * 10)
        print('Total Math:')
        print('Unique tweets:' + str(len(unique_tweets)))
        print('Tweets not found:' + str(len(not_found_tweets_on_twitter)))
        print('Tweets not inserted:' + str(len(not_imported_tweets)))
        print('Tweets OK:' + str(success_count))
        print(str(len(unique_tweets)) + "==" + str(len(not_found_tweets_on_twitter) + len(not_imported_tweets) + success_count))

    def _parse_tweets_from_file(self):
        """
        Parses tweet ids and classes from the txt file set by run().

        :return: list, holds [[124214124, positive], ...]
        """
        characters_to_remove = ["'", '"', '\n', ' ']

        with open(self.__file_path, 'r') as tweets_ids_file:
            tweets_with_classes = []
            # BUG FIX: the original reset ``self.tweets_classes_dictionary``
            # (no name mangling), a different attribute from the private
            # ``__tweets_classes_dictionary`` read everywhere else, so stale
            # entries survived across repeated run() calls. Reset the private
            # attribute instead.
            self.__tweets_classes_dictionary = {}

            # Iterating over lines in txt file
            for line in tweets_ids_file:
                line_components = line.split(",")

                # only accept well-formed lines, e.g. "121412412412", "positive"
                if len(line_components) != self.__components_in_a_line:
                    continue

                # strip quoting/whitespace characters from every component
                line_components = [
                    self.__preprocess_manager.remove_characters_in_string(component, characters_to_remove)
                    for component in line_components
                ]

                tweets_with_classes.append(line_components)
                self.__tweets_classes_dictionary[line_components[0]] = line_components[1]

            return tweets_with_classes

    def _find_duplicates(self, tweets_with_classes):
        """
        Finds duplicate tweets.

        :param tweets_with_classes: List of [tweet_id, sentiment_class] pairs.
        :return: (unique tweets, duplicate tweet ids)
        """
        unique_tweets = []
        # a set gives O(1) membership tests vs. the original list's O(n)
        seen_tweets_ids = set()
        duplicate_tweet_ids = []

        # Iterating over tweets with their classes. E.g [[214124124124, positive], ...]
        for tweet_block in tweets_with_classes:
            # First element is the tweet id
            tweet_id = tweet_block[0]

            if tweet_id not in seen_tweets_ids:
                seen_tweets_ids.add(tweet_id)
                unique_tweets.append(tweet_block)
            else:
                duplicate_tweet_ids.append(tweet_id)

        return unique_tweets, duplicate_tweet_ids

    def _retrieve_tweets_from_twitter(self, tweet_ids):
        """
        Retrieves tweet information from Twitter in batches.

        :param tweet_ids: List, tweet ids to look up
        :return: List, tweet objects returned by the Twitter API
        """
        tweets_results = []
        twitter_manager = TwitterManager()

        # the lookup endpoint accepts a limited number of ids per call
        chunks_of_tweets_ids = self.__helper.get_chunks_of_list(tweet_ids, self.__max_num_of_tweets_at_once)

        for chunk in chunks_of_tweets_ids:
            print("Searching for " + str(len(chunk)) + " tweets.")
            result = twitter_manager.lookup(chunk)
            print("Found " + str(len(result)) + " tweets.")
            tweets_results += result

        return tweets_results

    def _get_tweets_ids(self, tweets_with_classes):
        """
        Extracts tweet ids from tweets with classes.

        :param tweets_with_classes: List of [tweet_id, sentiment_class] pairs
        :return: List, extracted ids
        """
        return [tweet_block[0] for tweet_block in tweets_with_classes]

    def _create_tweet_objects(self, all_tweets):
        """
        Creates db model objects from Twitter API tweet objects.

        :param all_tweets: List, tweets as returned by the Twitter API
        :return: List, tweet db model objects
        """
        all_tweet_objects = []

        for tweet in all_tweets:
            tweet_object = self.__db_manager.get_new_model_instance()
            tweet_object.id = tweet.id_str
            tweet_object.created_at = tweet.created_at
            tweet_object.lang = tweet.lang
            tweet_object.source = tweet.source
            tweet_object.user_id = tweet.user.id_str

            # NOTE(review): text is stored as utf-8 bytes -- presumably what
            # the db layer expects; confirm against the model definition.
            tweet_object.text = self.__preprocess_manager.clean_emojis_and_smileys(tweet.text).encode('utf-8')
            tweet_object.tweet_class = self._get_sentiment_class_of_tweet(tweet.id_str)

            all_tweet_objects.append(tweet_object)

        return all_tweet_objects

    def _get_sentiment_class_of_tweet(self, tweet_id):
        """
        Returns the sentiment class for a given tweet id.

        :param tweet_id: String, tweet id
        :return: String, tweet sentiment class
        :raises KeyError: if the id was not parsed from the input file
        """
        return self.__tweets_classes_dictionary[tweet_id]

    def _find_not_found_tweets_on_twitter(self, twitter_response):
        """
        Finds ids of tweets the Twitter API did not return.

        :param twitter_response: List, Twitter API response objects
        :return: List, ids present in the input file but absent from the response
        """
        tweets_ids = self._get_tweets_ids(self.__tweets_with_classes)
        response_ids = [a_tweet_response.id_str for a_tweet_response in twitter_response]
        return list(set(tweets_ids) - set(response_ids))

    def import_new_tweets_from_csv(self, root_path):
        """
        Imports tweets from ';'-delimited csv files found directly under root_path.

        :param root_path: String, directory containing the csv exports
        :return: void
        """
        tweet_objects = []

        for file_name in os.listdir(root_path):
            if not file_name.endswith('.csv'):
                continue
            # os.path.join works whether or not root_path ends with a separator
            # (the original ``root_path + file`` required a trailing one)
            with open(os.path.join(root_path, file_name), 'r') as file_handle:
                reader = csv.reader(file_handle, delimiter=';')
                next(reader, None)  # skip the headers
                for row in reader:
                    a_tweet_obj = self._create_tweet_object_from_line(row, file_name)
                    # BUG FIX: _create_tweet_object_from_line may return None
                    # (year/file-name mismatch); the original appended those
                    # Nones and handed them to the database layer.
                    if a_tweet_obj is not None:
                        tweet_objects.append(a_tweet_obj)

        success_count, not_imported_tweets = self.__db_manager.insert_tweet_objects(tweet_objects)
        print(success_count)
        print(not_imported_tweets)

    def _create_tweet_object_from_line(self, components, file_name):
        """
        Builds a db model tweet object from one csv row.

        :param components: List, csv fields; indices 2/3/4 are date, text and
                           a one-letter sentiment code (e/h/n)
        :param file_name: String, csv file name; characters 2+ of the stem are
                          used as the two-digit year
        :return: tweet object, or None when the derived year is not in file_name
        :raises KeyError: if the sentiment code is not e/h/n
        """
        MAP_DICT = {'e': 'positive', 'h': 'negative', 'n': 'neutral'}

        date_component = components[2]
        text_component = components[3]
        sentiment_component = MAP_DICT[components[4]]
        year_abv = file_name.split('.')[0][2:]

        # NOTE(review): year_abv is carved out of file_name itself, so this
        # guard can never trigger; kept for behavioural parity -- confirm intent.
        if year_abv not in file_name:
            return

        # dates like '12-Mar' lack a year; append the one from the file name
        if len(date_component.split('-')) == 2:
            date_component = date_component + '-' + year_abv

        datetime_of_tweet = datetime.strptime(date_component, '%d-%b-%y')

        tweet_object = self.__db_manager.get_new_model_instance()
        # csv rows carry no usable tweet id, so a random one is generated
        tweet_object.id = self.__helper.generate_random_string(10)
        tweet_object.text = text_component
        tweet_object.created_at = datetime_of_tweet
        tweet_object.tweet_class = sentiment_component

        return tweet_object
class ImportManager:
    """
    Imports tweets into the database from various sources (txt id lists and
    csv exports), looking up tweet details on Twitter where needed.
    """

    # set by run(); path of the txt file currently being imported
    __file_path = None
    # expected number of comma-separated fields per input line (id, class)
    __components_in_a_line = None

    def __init__(self):
        """
        Constructor method.

        Sets up the database/helper/preprocessing managers and the mapping
        from tweet id to sentiment class used during an import.

        :return: ImportManager instance
        """
        self.__db_manager = DBManager()  # database access layer
        self.__helper = GeneralHelpers()  # misc utility helpers
        self.__preprocess_manager = PreprocessManager()  # text cleaning utilities
        self.__tweets_classes_dictionary = {}  # tweet_id -> sentiment class

        # magic numbers
        self.__components_in_a_line = 2
        self.__max_num_of_tweets_at_once = 100  # Twitter lookup API batch limit

    def run(self, file_path_to_import):
        """
        Runs all necessary methods to import tweets for a year.

        :param file_path_to_import: String, txt file path containing tweet ids
        :return: void
        """
        self.__file_path = file_path_to_import

        # getting tweets with their classes
        tweets_with_classes = self._parse_tweets_from_file()
        self.__tweets_with_classes = tweets_with_classes

        # finding duplicates
        unique_tweets, duplicate_tweets = self._find_duplicates(tweets_with_classes)

        print("Found " + str(len(duplicate_tweets)) + " duplicate tweets.")
        self.__helper.pretty_print_list(duplicate_tweets, "Duplicate tweets:")
        print("Continuing with unique ones.")

        # getting tweet ids from [tweet_id, class]
        unique_tweets_ids = self._get_tweets_ids(unique_tweets)

        # retrieving tweets from Twitter
        all_tweet_information = self._retrieve_tweets_from_twitter(unique_tweets_ids)

        # some tweets may not be found on Twitter
        not_found_tweets_on_twitter = self._find_not_found_tweets_on_twitter(all_tweet_information)

        # creating db model objects
        all_tweet_objects = self._create_tweet_objects(all_tweet_information)

        # insert to database
        success_count, not_imported_tweets = self.__db_manager.insert_tweet_objects(all_tweet_objects)

        # sanity report: unique == not_found + not_inserted + ok
        print("\n")
        print('-' * 10)
        print('Total Math:')
        print('Unique tweets:' + str(len(unique_tweets)))
        print('Tweets not found:' + str(len(not_found_tweets_on_twitter)))
        print('Tweets not inserted:' + str(len(not_imported_tweets)))
        print('Tweets OK:' + str(success_count))
        print(str(len(unique_tweets)) + "==" + str(len(not_found_tweets_on_twitter) + len(not_imported_tweets) + success_count))

    def _parse_tweets_from_file(self):
        """
        Parses tweet ids and classes from the txt file set by run().

        :return: list, holds [[124214124, positive], ...]
        """
        characters_to_remove = ["'", '"', '\n', ' ']

        with open(self.__file_path, 'r') as tweets_ids_file:
            tweets_with_classes = []
            # BUG FIX: the original reset ``self.tweets_classes_dictionary``
            # (no name mangling), a different attribute from the private
            # ``__tweets_classes_dictionary`` read everywhere else, so stale
            # entries survived across repeated run() calls. Reset the private
            # attribute instead.
            self.__tweets_classes_dictionary = {}

            # Iterating over lines in txt file
            for line in tweets_ids_file:
                line_components = line.split(",")

                # only accept well-formed lines, e.g. "121412412412", "positive"
                if len(line_components) != self.__components_in_a_line:
                    continue

                # strip quoting/whitespace characters from every component
                line_components = [
                    self.__preprocess_manager.remove_characters_in_string(component, characters_to_remove)
                    for component in line_components
                ]

                tweets_with_classes.append(line_components)
                self.__tweets_classes_dictionary[line_components[0]] = line_components[1]

            return tweets_with_classes

    def _find_duplicates(self, tweets_with_classes):
        """
        Finds duplicate tweets.

        :param tweets_with_classes: List of [tweet_id, sentiment_class] pairs.
        :return: (unique tweets, duplicate tweet ids)
        """
        unique_tweets = []
        # a set gives O(1) membership tests vs. the original list's O(n)
        seen_tweets_ids = set()
        duplicate_tweet_ids = []

        # Iterating over tweets with their classes. E.g [[214124124124, positive], ...]
        for tweet_block in tweets_with_classes:
            # First element is the tweet id
            tweet_id = tweet_block[0]

            if tweet_id not in seen_tweets_ids:
                seen_tweets_ids.add(tweet_id)
                unique_tweets.append(tweet_block)
            else:
                duplicate_tweet_ids.append(tweet_id)

        return unique_tweets, duplicate_tweet_ids

    def _retrieve_tweets_from_twitter(self, tweet_ids):
        """
        Retrieves tweet information from Twitter in batches.

        :param tweet_ids: List, tweet ids to look up
        :return: List, tweet objects returned by the Twitter API
        """
        tweets_results = []
        twitter_manager = TwitterManager()

        # the lookup endpoint accepts a limited number of ids per call
        chunks_of_tweets_ids = self.__helper.get_chunks_of_list(tweet_ids, self.__max_num_of_tweets_at_once)

        for chunk in chunks_of_tweets_ids:
            print("Searching for " + str(len(chunk)) + " tweets.")
            result = twitter_manager.lookup(chunk)
            print("Found " + str(len(result)) + " tweets.")
            tweets_results += result

        return tweets_results

    def _get_tweets_ids(self, tweets_with_classes):
        """
        Extracts tweet ids from tweets with classes.

        :param tweets_with_classes: List of [tweet_id, sentiment_class] pairs
        :return: List, extracted ids
        """
        return [tweet_block[0] for tweet_block in tweets_with_classes]

    def _create_tweet_objects(self, all_tweets):
        """
        Creates db model objects from Twitter API tweet objects.

        :param all_tweets: List, tweets as returned by the Twitter API
        :return: List, tweet db model objects
        """
        all_tweet_objects = []

        for tweet in all_tweets:
            tweet_object = self.__db_manager.get_new_model_instance()
            tweet_object.id = tweet.id_str
            tweet_object.created_at = tweet.created_at
            tweet_object.lang = tweet.lang
            tweet_object.source = tweet.source
            tweet_object.user_id = tweet.user.id_str

            # NOTE(review): text is stored as utf-8 bytes -- presumably what
            # the db layer expects; confirm against the model definition.
            tweet_object.text = self.__preprocess_manager.clean_emojis_and_smileys(tweet.text).encode('utf-8')
            tweet_object.tweet_class = self._get_sentiment_class_of_tweet(tweet.id_str)

            all_tweet_objects.append(tweet_object)

        return all_tweet_objects

    def _get_sentiment_class_of_tweet(self, tweet_id):
        """
        Returns the sentiment class for a given tweet id.

        :param tweet_id: String, tweet id
        :return: String, tweet sentiment class
        :raises KeyError: if the id was not parsed from the input file
        """
        return self.__tweets_classes_dictionary[tweet_id]

    def _find_not_found_tweets_on_twitter(self, twitter_response):
        """
        Finds ids of tweets the Twitter API did not return.

        :param twitter_response: List, Twitter API response objects
        :return: List, ids present in the input file but absent from the response
        """
        tweets_ids = self._get_tweets_ids(self.__tweets_with_classes)
        response_ids = [a_tweet_response.id_str for a_tweet_response in twitter_response]
        return list(set(tweets_ids) - set(response_ids))

    def import_new_tweets_from_csv(self, root_path):
        """
        Imports tweets from ';'-delimited csv files found directly under root_path.

        :param root_path: String, directory containing the csv exports
        :return: void
        """
        tweet_objects = []

        for file_name in os.listdir(root_path):
            if not file_name.endswith('.csv'):
                continue
            # os.path.join works whether or not root_path ends with a separator
            # (the original ``root_path + file`` required a trailing one)
            with open(os.path.join(root_path, file_name), 'r') as file_handle:
                reader = csv.reader(file_handle, delimiter=';')
                next(reader, None)  # skip the headers
                for row in reader:
                    a_tweet_obj = self._create_tweet_object_from_line(row, file_name)
                    # BUG FIX: _create_tweet_object_from_line may return None
                    # (year/file-name mismatch); the original appended those
                    # Nones and handed them to the database layer.
                    if a_tweet_obj is not None:
                        tweet_objects.append(a_tweet_obj)

        success_count, not_imported_tweets = self.__db_manager.insert_tweet_objects(tweet_objects)
        print(success_count)
        print(not_imported_tweets)

    def _create_tweet_object_from_line(self, components, file_name):
        """
        Builds a db model tweet object from one csv row.

        :param components: List, csv fields; indices 2/3/4 are date, text and
                           a one-letter sentiment code (e/h/n)
        :param file_name: String, csv file name; characters 2+ of the stem are
                          used as the two-digit year
        :return: tweet object, or None when the derived year is not in file_name
        :raises KeyError: if the sentiment code is not e/h/n
        """
        MAP_DICT = {'e': 'positive', 'h': 'negative', 'n': 'neutral'}

        date_component = components[2]
        text_component = components[3]
        sentiment_component = MAP_DICT[components[4]]
        year_abv = file_name.split('.')[0][2:]

        # NOTE(review): year_abv is carved out of file_name itself, so this
        # guard can never trigger; kept for behavioural parity -- confirm intent.
        if year_abv not in file_name:
            return

        # dates like '12-Mar' lack a year; append the one from the file name
        if len(date_component.split('-')) == 2:
            date_component = date_component + '-' + year_abv

        datetime_of_tweet = datetime.strptime(date_component, '%d-%b-%y')

        tweet_object = self.__db_manager.get_new_model_instance()
        # csv rows carry no usable tweet id, so a random one is generated
        tweet_object.id = self.__helper.generate_random_string(10)
        tweet_object.text = text_component
        tweet_object.created_at = datetime_of_tweet
        tweet_object.tweet_class = sentiment_component

        return tweet_object