Example no. 1
0
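    # Assumed imports for this snippet (not shown in the original; the library
    # bindings below are informed guesses based on the calls used):
    #   import graphlab as gl            # GraphLab Create: SArray, SFrame, topic_model
    #   import language_check as gc      # LanguageTool wrapper; gc.correct(text, matches)
    #   from fuzzywuzzy import fuzz      # fuzz.ratio string similarity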
    def process_one_log(self, input_log, repo_info_topics):
        input_log = PreprocessManager.remove_non_ascii(input_log)
        # TODO: Do we need repo info?
        #repo_info_topics = PreprocessManager.remove_non_ascii(repo_info_topics)
        # Find the length
        # TODO: Scores that depend on length are biased unless normalized; verify this.
        length = len(PreprocessManager.get_raw_tokenized_text(input_log))
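        # Illustration (assuming get_raw_tokenized_text splits on word boundaries):
        # "Fixed the parser bug" -> 4 tokens, so length = 4.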

        # Compute a structural-integrity score via grammar checking.
        self.grammar_tool.enable_spellchecking()
        problematic_matches = self.grammar_tool.check(input_log)
        corrected_text = gc.correct(input_log, problematic_matches)
        degree_of_match = fuzz.ratio(input_log, corrected_text)
        structural_integrity_score = degree_of_match * (length - len(problematic_matches))
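        # Worked example: with fuzz.ratio == 95 (near-identical correction),
        # length == 10 and 2 grammar matches, the score is 95 * (10 - 2) = 760.
        # Note the length dependence flagged in the TODO above.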

        # Check topic relevance.
        # Still experimental: its effect on the final results is unverified,
        # and it may add little value in some cases.
        sframe_data_for_topics = gl.SArray([PreprocessManager.get_word_counts(input_log)])
        # Seed word-topic associations. TODO: make this configurable instead of hard-coded.
        associations = gl.SFrame({'word': ['fix', 'issue', 'implement', 'modify', 'changed', 'bug', 'error'],
                                  'topic': [0, 0, 0, 0, 0, 0, 0]})
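        # Note: in GraphLab Create, associations fix the listed words to the given
        # topic, so topic 0 is anchored to commit-style vocabulary during fitting.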

        topic_model = gl.topic_model.create(sframe_data_for_topics, associations=associations)

        # TODO: Also match against the repo description. Is that useful? Possibly future work.

        #pred = topic_model.predict(sframe_data_for_topics, output_type='probability')
        topics = topic_model.get_topics()
        # The final score sums all topic-0 word scores; since topic 0 was seeded
        # via the associations, this estimates how commit-message-like the text is.
        topic_relevance_score = 0
        for row in topics:
            if row['topic'] == 0:
                topic_relevance_score += row['score']

        topic_relevance_score *= 100
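        # Example: if the topic-0 word scores sum to 0.42, the score becomes 42.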

        # print topics, topic_relevance_score

        # Measure how positive the log message sounds.
        positivity = self.senti_checker.predict_row({'text': input_log})
        positivity_score = 100 * positivity
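        # Assumes predict_row returns a probability in [0, 1]: a prediction of
        # 0.8 yields positivity_score = 80.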

        # print positivity_score

        # Spelling correctness.
        self.spell_master.set_text(input_log)
        error_words = [err.word for err in self.spell_master]
        spelling_integrity_score = length - len(error_words)
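        # Example: 10 tokens with 3 misspelled words gives 10 - 3 = 7. Like the
        # structural score, this is length-dependent (see the TODO near the top).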


        # Return all five scores.
        return length, structural_integrity_score, topic_relevance_score, positivity_score, spelling_integrity_score
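
# Minimal usage sketch (hypothetical; assumes an enclosing analyzer class whose
# __init__ sets up grammar_tool, senti_checker and spell_master):
#   analyzer = LogAnalyzer()
#   scores = analyzer.process_one_log("Fixed null pointer bug in parser", None)
#   length, structure, topic, positivity, spelling = scores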