Ejemplo n.º 1
0
    def __init__(self, file_path):
        '''
        Reset the module-level working state and build the comment filter.

        :param file_path: path of the file handed to CommentFilter; it is also
                          stored in the module-level current_file_path
        '''
        global plain_text, no_of_comments, fs
        global delete_list, current_file_path, punctuation_list

        # Every instance starts from empty text and a zero comment count.
        plain_text = ""
        no_of_comments = 0

        # A fresh summarizer is created per instance and published at module level.
        fs = FrequencySummarizer()

        # Initial word list and punctuation list; both live at module level.
        delete_list = ["hello", "thanks", "thank you", ",", 'regards']
        current_file_path = file_path
        punctuation_list = ["<", ">", " /", "/ ", "(", ")", "[", "]", "{", "}", "?"]

        # Process the file and separate it into authors, questions, answers and ignores.
        self.comment_filter = CommentFilter(file_path)
Ejemplo n.º 2
0
    if not os.path.exists(new_path):
        os.mkdir(new_path)


    for fileName in os.listdir(path_raw+ '/' + path):
        print fileName
        filePath = path_raw + '/' + path + '/' + fileName

        fileName = fileName.replace('json', 'txt')
        print fileName

        output_file_path = path_summary + '/' + path + '/' + fileName
        if os.path.exists(output_file_path):
            os.remove(output_file_path)
        with open(output_file_path, 'a') as output:
            issueProcessor = CommentFilter(filePath)
            summarizer = CommentProcessor(filePath)
            print "Description:------------------\n" + issueProcessor._get_description()
            output.write("==============================Description==============================\n")
            output.write(smart_str(issueProcessor._get_description()) + '\n\n')

            summarized, comments = summarizer._summarize_answers()

            print len(summarized)

            output.write("============================Summary Answer============================\n\n")
            for summarizedsent in summarized:

                print "* " + smart_str(summarizedsent)
                output.write("* " + smart_str(summarizedsent) + '\n\n')
                # output.write('------------------------------------------------------------------\n\n')
Ejemplo n.º 3
0
class CommentProcessor:
    '''
    Summarizes the comments of a single issue file.

    The comments are obtained from a CommentFilter built for the file; the
    filter is queried with the codes 'a' (answers), 'q' (questions) and
    'i' (ignores). Summaries are produced by a FrequencySummarizer.

    NOTE(review): the working state (fs, delete_list, punctuation_list, ...)
    is stored in module-level globals, so every instance shares it and
    creating a new instance resets it. It is kept that way here because code
    outside this class may read those globals - confirm before moving the
    state onto the instance.
    '''

    def __init__(self, file_path):
        '''
        Reset the module-level working state and build the comment filter.

        :param file_path: path of the file handed to CommentFilter
        '''
        global plain_text
        plain_text = ""
        global no_of_comments
        no_of_comments = 0

        global fs
        fs = FrequencySummarizer()

        # words removed from every text before summarizing; author names are
        # appended later by _register_author_names
        global delete_list
        delete_list = ["hello", "thanks", "thank you", ",", 'regards']

        global current_file_path
        current_file_path = file_path

        # marks replaced by ". " in _get_stripped_description
        global punctuation_list
        punctuation_list = ["<", ">", " /", "/ ", "(", ")", "[", "]", "{", "}", "?"]
        # process file and separate into authors, questions, answers and ignores
        self.comment_filter = CommentFilter(current_file_path)

    def _register_author_names(self, authors):
        '''
        Add the lower-cased name parts of every author to the shared delete_list.

        Parts of one or two characters are ignored. A part already present in
        the list is not added again, so repeated calls do not grow the list.

        :param authors: iterable of authors; each one is converted with str()
        '''
        for author in authors:
            for author_name in str(author).split():
                lowered = author_name.lower()
                if len(author_name) > 2 and lowered not in delete_list:
                    delete_list.append(lowered)

    def _strip_delete_words(self, text):
        '''
        :param text: the text to clean
        :return: the lower-cased text with every entry of delete_list removed
        '''
        text = text.lower()
        for word in delete_list:
            text = text.replace(word, "")
        return text

    def _summarize(self, authors, comments):
        '''
        :param authors: the list of authors of comments and the issue, used to remove their names from the text
        :param comments: the list of comments that would be summarized; it is not modified
        :return: two lists: the summary sentences (stripped of the delete_list words) and
                 the original comments in which those sentences were found
        '''
        global sents

        # identifying authors to remove names from comments
        self._register_author_names(authors)

        plain_text, no_of_comments = self._load_plain_text(comments)
        print("++++++++++++++++++ ::: %s" % no_of_comments)

        # removing unnecessary words such as greetings and author names
        plain_text = self._strip_delete_words(plain_text)

        # specifying the number of sentences in the summary depending on the
        # total number of comments; floor division keeps the count an integer
        if no_of_comments > 15:
            sentence_count = no_of_comments // 2
        elif no_of_comments > 4:
            sentence_count = (no_of_comments // 4) * 3
        else:
            sentence_count = no_of_comments
        sents = fs.summarize(plain_text, sentence_count)

        # Strip every comment once, and keep the pairs in a private list so the
        # caller's list is left untouched when a matched comment is removed.
        remaining = [(comment, self._strip_delete_words(smart_str(comment)))
                     for comment in comments]

        sentenceList = []
        commentList = []

        # Identifying the comments containing the sentences picked by the summarizer as most important
        for s in sents:
            stripped_s = self._strip_delete_words(smart_str(s))
            # the last character of the sentence is left out of the comparison
            needle = stripped_s[:-1]
            # an empty needle would be "contained" in every comment, so it is not matched
            if needle:
                for index, (comment, stripped_comment) in enumerate(remaining):
                    if needle in stripped_comment:
                        commentList.append(comment)
                        # each comment can be claimed by one sentence only
                        del remaining[index]
                        break
            sentenceList.append(stripped_s)

        return sentenceList, commentList

    def _load_plain_text(self, comments):
        '''
        Join the comments into one text, one comment per paragraph.

        A "." is appended to a comment that does not already end with ".",
        "?" or "!".

        :param comments: iterable of comment strings
        :return: Tuple containing the plain text and no of comments
        '''
        terminators = (".", "?", "!")
        paragraphs = []
        for comment in comments:
            if comment.endswith(terminators):
                paragraphs.append(comment + '\n\n')
            else:
                paragraphs.append(comment + '.\n\n')
        return "".join(paragraphs), len(paragraphs)

    def _summarize_answers(self):
        '''Summarizes the list of comments identified as answers by the classifier'''
        return self._summarize(self.comment_filter._get_author_list(), self.comment_filter._get_comment_list('a'))

    def _summarize_questions(self):
        '''Summarizes the list of comments identified as questions by the classifier'''
        return self._summarize(self.comment_filter._get_author_list(), self.comment_filter._get_comment_list('q'))

    def _get_description(self):
        ''':return the description of the issue as provided by the comment filter'''
        return self.comment_filter._get_description()

    def _get_stripped_description(self):
        ''':return the description stripped of greetings and names of comment authors' and issue reporter's names'''
        desc_text = self._get_description()
        self._register_author_names(self.comment_filter._get_author_list())

        print(delete_list)

        desc_text = self._strip_delete_words(desc_text)

        # every mark in punctuation_list is turned into ". "
        for mark in punctuation_list:
            desc_text = desc_text.replace(mark, ". ")

        return desc_text

    def _get_all_answers(self):
        ''':return the list of comments identified by the classifier as answers'''
        return self.comment_filter._get_comment_list('a')

    def _get_all_questions(self):
        ''':return the list of comments identified by the classifier as questions'''
        return self.comment_filter._get_comment_list('q')

    def _get_all_ignores(self):
        ''':return the list of comments identified by the classifier as ignores'''
        return self.comment_filter._get_comment_list('i')

    def _get_issue_summary(self):
        ''':return the issue summary as provided by the comment filter'''
        return self.comment_filter._get_issue_summary()