def __init__(self, file_path):
    """Reset the module-level working state and load the given issue file.

    :param file_path: path of the issue file handed to ``CommentFilter``
    """
    # All working state lives in module-level globals, declared up front.
    global plain_text, no_of_comments
    global fs, delete_list
    global current_file_path, punctuation_list

    # Text accumulator and comment counter start out empty.
    plain_text, no_of_comments = "", 0

    fs = FrequencySummarizer()

    # Greetings and filler removed from the text before summarizing.
    delete_list = [
        "hello",
        "thanks",
        "thank you",
        ",",
        'regards',
    ]

    current_file_path = file_path

    # Marks that get replaced by a sentence break in the description.
    punctuation_list = [
        "<", ">",
        " /", "/ ",
        "(", ")",
        "[", "]",
        "{", "}",
        "?",
    ]

    # Process the file and separate it into authors, questions, answers
    # and ignores.
    self.comment_filter = CommentFilter(current_file_path)
# Create the output directory on first use.
if not os.path.exists(new_path):
    os.mkdir(new_path)

# Write one summary text file per raw issue file found in the input directory.
for fileName in os.listdir(path_raw + '/' + path):
    print(fileName)
    filePath = path_raw + '/' + path + '/' + fileName

    # Swap only the extension: str.replace('json', 'txt') would also rewrite
    # any "json" that happens to occur inside the base name.
    base_name, extension = os.path.splitext(fileName)
    if extension == '.json':
        fileName = base_name + '.txt'
    print(fileName)
    output_file_path = path_summary + '/' + path + '/' + fileName

    # Mode 'w' truncates an existing file, which is what the former
    # remove-then-append sequence achieved in two steps.
    with open(output_file_path, 'w') as output:
        # CommentProcessor builds its own CommentFilter for the file, so the
        # description is read through it instead of parsing the file twice.
        summarizer = CommentProcessor(filePath)
        description = summarizer._get_description()

        print("Description:------------------\n" + description)
        output.write("==============================Description==============================\n")
        output.write(smart_str(description) + '\n\n')

        # `comments` (the comments containing the chosen sentences) is not
        # written out; only the summary sentences are.
        summarized, comments = summarizer._summarize_answers()
        print(len(summarized))
        output.write("============================Summary Answer============================\n\n")
        for summarizedsent in summarized:
            bullet = "* " + smart_str(summarizedsent)
            print(bullet)
            output.write(bullet + '\n\n')
class CommentProcessor:
    """Summarize the classified comments of a single issue file.

    The file is handed to ``CommentFilter``; this class strips greetings and
    author names from the text and asks the module-level
    ``FrequencySummarizer`` (``fs``) for the most important sentences.

    NOTE: the working state (``fs``, ``delete_list``, ``punctuation_list``,
    ``sents``, ...) is kept in module-level globals that ``__init__`` resets,
    so constructing a second processor resets the state seen by the first.
    """

    # Characters that already terminate a sentence; no '.' is appended after them.
    _SENTENCE_ENDINGS = (".", "?", "!")

    def __init__(self, file_path):
        """Reset the module-level working state and load ``file_path``.

        :param file_path: path of the issue file handed to ``CommentFilter``
        """
        global plain_text
        plain_text = ""
        global no_of_comments
        no_of_comments = 0
        global fs
        fs = FrequencySummarizer()
        global delete_list
        delete_list = ["hello", "thanks", "thank you", ",", 'regards']
        global current_file_path
        current_file_path = file_path
        global punctuation_list
        punctuation_list = ["<", ">", " /", "/ ", "(", ")", "[", "]", "{", "}", "?"]

        # process file and separate into authors, questions, answers and ignores
        self.comment_filter = CommentFilter(current_file_path)

    def _register_author_names(self, authors):
        """Add the parts of each author name to the global ``delete_list``.

        Name parts of one or two characters are skipped, and a part that is
        already listed is not appended again, so repeated calls do not grow
        the list.

        :param authors: iterable of author names (converted with ``str``)
        """
        for author in authors:
            for author_name in str(author).split():
                lowered = author_name.lower()
                if len(author_name) > 2 and lowered not in delete_list:
                    delete_list.append(lowered)

    def _strip_deleted_words(self, text):
        """Return ``text`` lower-cased with every ``delete_list`` entry removed.

        Entries are removed as plain substrings, in list order.
        """
        for word in delete_list:
            text = text.lower().replace(word, "")
        return text

    def _summarize(self, authors, comments):
        """Summarize ``comments`` and find the comments the summary came from.

        :param authors: the list of authors of the comments and the issue;
            their names are removed from the text
        :param comments: the list of comments to summarize (left unmodified)
        :return: two lists: the summary sentences (stripped of deleted words)
            and the original comments containing those sentences
        """
        # Kept module-level because the summarizer output has always been
        # published as the global ``sents``.
        global sents

        # identifying authors to remove names from comments
        self._register_author_names(authors)

        text, comment_count = self._load_plain_text(comments)
        print("++++++++++++++++++ ::: %s" % comment_count)

        # removing unnecessary words such as greetings and author names
        text = self._strip_deleted_words(text)

        # The summary length depends on the total number of comments: half of
        # them above 15, roughly three quarters above 4, all of them otherwise.
        # Floor division keeps the count an integer on every Python version.
        if comment_count > 15:
            sentence_count = comment_count // 2
        elif comment_count > 4:
            sentence_count = (comment_count // 4) * 3
        else:
            sentence_count = comment_count
        sents = fs.summarize(text, sentence_count)

        # Normalize every comment once, on a private list, so the caller's
        # ``comments`` is not modified while matched comments are consumed.
        candidates = [
            (comment, self._strip_deleted_words(smart_str(comment)))
            for comment in comments
        ]

        sentence_list = []
        comment_list = []
        # Identifying the comments containing the sentences picked by the
        # summarizer as most important.
        for sentence in sents:
            stripped = self._strip_deleted_words(smart_str(sentence))
            # The last character is dropped before matching (presumably the
            # terminator appended by _load_plain_text).
            fragment = stripped[:-1]
            # An empty fragment would be "contained" in every comment, so it
            # is not matched at all.
            if fragment:
                for index, (comment, normalized) in enumerate(candidates):
                    if fragment in normalized:
                        comment_list.append(comment)
                        # A comment is reported at most once.
                        del candidates[index]
                        break
            sentence_list.append(stripped)
        return sentence_list, comment_list

    def _load_plain_text(self, comments):
        """Join the comments into one text block.

        Each comment is followed by a blank line; a '.' is appended first when
        the comment does not already end with '.', '?' or '!'.

        :param comments: iterable of comment strings
        :return: tuple containing the plain text and the number of comments
        """
        terminated = [
            comment if comment.endswith(self._SENTENCE_ENDINGS) else comment + '.'
            for comment in comments
        ]
        text = "".join(comment + '\n\n' for comment in terminated)
        return text, len(terminated)

    def _summarize_answers(self):
        """Summarize the comments identified as answers by the classifier."""
        return self._summarize(self.comment_filter._get_author_list(),
                               self.comment_filter._get_comment_list('a'))

    def _summarize_questions(self):
        """Summarize the comments identified as questions by the classifier."""
        return self._summarize(self.comment_filter._get_author_list(),
                               self.comment_filter._get_comment_list('q'))

    def _get_description(self):
        """Return the description of the issue as given by the comment filter."""
        return self.comment_filter._get_description()

    def _get_stripped_description(self):
        """Return the description without greetings and author names.

        The text is lower-cased, every ``delete_list`` entry (including the
        names of the comment authors and the issue reporter) is removed, and
        each mark in ``punctuation_list`` is replaced by ". ".
        """
        desc_text = self._get_description()
        self._register_author_names(self.comment_filter._get_author_list())
        print(delete_list)
        desc_text = self._strip_deleted_words(desc_text)
        for mark in punctuation_list:
            desc_text = desc_text.replace(mark, ". ")
        return desc_text

    def _get_all_answers(self):
        """Return the list of comments identified by the classifier as answers."""
        return self.comment_filter._get_comment_list('a')

    def _get_all_questions(self):
        """Return the list of comments identified by the classifier as questions."""
        return self.comment_filter._get_comment_list('q')

    def _get_all_ignores(self):
        """Return the list of comments identified by the classifier as ignores."""
        return self.comment_filter._get_comment_list('i')

    def _get_issue_summary(self):
        """Return the issue summary as given by the comment filter."""
        return self.comment_filter._get_issue_summary()