def main():
    filename = './origin_data/bugreports.xml'
    path = './bug_reports'
    bugslist = utils.read_xml(filename)
    label = utils.read_label('./origin_data/goldset.txt')
    samples, ids = utils.get_content(bugslist)
    num_word_list, numword = utils.count_word(samples)
    utils.savefile(samples)
    results = textrank.bugsum(path, numword, num_word_list)
    # Materialize the comprehension; a bare generator would just print its repr.
    print([len(i) for i in results])
    pred = eval.index2pred(results, ids)
    y = eval.label2y(label, ids)
    mean_acc, mean_pr, mean_re, mean_f1 = eval.evaluate(y, pred)
    print('mean_acc, mean_pr, mean_re, mean_f1', mean_acc, mean_pr, mean_re, mean_f1)
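# A minimal sketch of the utils.count_word helper the pipeline above assumes
# (hypothetical; the real implementation may differ). Given one list of
# sentences per bug report, it returns per-sentence word counts for each
# report plus the grand total, matching the (num_word_list, numword) unpacking.
def count_word(samples):
    num_word_list = [[len(sentence.split()) for sentence in report]
                     for report in samples]
    numword = sum(sum(counts) for counts in num_word_list)
    return num_word_list, numword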
def on_modified(self, event):
    modified_content = file_reader(event.src_path)
    count = count_word(modified_content, KEYWORD_TO_COUNT)
    log_writer = Logger()
    log_writer.write_logs(
        f'In file: {event.src_path} Keyword count of {KEYWORD_TO_COUNT} is {count}'
    )
    print('%s File modified CDS count: %s' % (event.src_path, count))
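# Hedged usage sketch: the on_modified hook above only fires once its handler
# is scheduled on a watchdog Observer. KeywordHandler is a hypothetical name
# for the handler class the method belongs to; the original module may wire
# this up differently.
import time
from watchdog.observers import Observer

def watch(path='.'):
    observer = Observer()
    observer.schedule(KeywordHandler(), path, recursive=False)
    observer.start()
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()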
def clean_data():
    start_time = time.time()
    processer = dataProcessor.DataProcessor()
    data = pd.read_csv(data_path, header=0)
    if mode.lower() == 'clean':
        # Per-tweet surface statistics, one column per feature.
        data['word_count'] = utils.count_word(data.text)
        data['count_number'] = utils.count_numbers(data.text)
        data['emojies'] = utils.view_emojie(data.text)
        data['emoticons'] = utils.view_emoticon(data.text)
        data['len_tweet'] = utils.len_tweet(data.text)
        data['avg_words_len'] = utils.avg_word_len(data.text)
        data['count_stopwords'] = utils.count_stopwords(data.text)
        data['count_tagging'] = utils.count_tagging(data.text)
        data['flagged'] = utils.repeated_char(data.text)
        data.to_csv(filename + '.csv', index=False)
        tf = utils.term_freq(data.text)
        tf.to_csv('term_frequency.csv', index=False)
    data_pro, _ = processer.proccess_data(data.text, handle_emojies=handle_emojies)
    data_pro = pd.DataFrame(data_pro, columns=['text'])
    # DataFrame.append would add rows, not a column; attach the labels instead.
    data_pro['label'] = data['label']
    data_pro.to_csv('cleaned.csv', index=False)
    elapsed_time = time.time() - start_time
    print(f'Finished in {elapsed_time}')
    return None
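# Hedged sketch of two of the per-tweet feature helpers used above
# (hypothetical; the real utils functions may differ). Each maps a pandas
# Series of texts to a Series of per-row values, which is what the
# data['...'] = utils.xxx(data.text) assignments require.
def count_word(texts):
    # Whitespace-separated token count per tweet.
    return texts.str.split().str.len()

def avg_word_len(texts):
    # Mean token length per tweet; 0.0 for an empty tweet.
    return texts.apply(
        lambda t: sum(len(w) for w in t.split()) / len(t.split()) if t.split() else 0.0)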
def text_summary(self):
    """Composes a textual summary of the event."""
    msg = "The build event was started on %s." % self.text_timestamp()
    msg += ' It'
    pkgCount = len(self.list_package_files())
    changesName = self.file_path('changes.html')
    commitCount = 0
    if os.path.exists(changesName):
        # file() is Python 2 only; open() works in both 2 and 3.
        commitCount = utils.count_word('<li>', open(changesName).read())
    if commitCount:
        moreThan = ''
        if commitCount == 100:
            moreThan = 'more than '
        msg += " contains %s%i commits and" % (moreThan, commitCount)
    msg += " produced %i installable binary package%s." % \
        (pkgCount, 's' if (pkgCount != 1) else '')
    return msg
def html_description(self, encoded=True):
    """Composes an HTML build report."""
    name = self.name
    buildDir = self.buildDir
    oses = self.oses
    msg = '<p>' + self.text_summary() + '</p>'

    # What do we have here?
    files = self.list_package_files()

    # Print out the matrix.
    msg += '<p><table cellspacing="4" border="0">'
    msg += '<tr style="text-align:left;"><th>OS<th>Binary<th>Logs<th>Issues</tr>'
    for osName, osExt, osIdent in oses:
        isFirst = True
        # Find the binaries for this OS.
        binaries = []
        for f in files:
            if self.os_from_filename(f)[2] == osIdent:
                binaries.append(f)
        if not binaries:
            # Nothing available for this OS.
            msg += '<tr><td>' + osName + '<td>n/a'
            # Do we have a log?
            logName = log_filename(self.packages[0], osIdent)
            if os.path.exists(self.file_path(logName)):
                msg += self.html_table_log_issues(logName)
            msg += '</tr>'
            continue
        # List all the binaries. One row per binary.
        for binary in self.sort_by_package(binaries):
            msg += '<tr><td>'
            if isFirst:
                msg += osName
                isFirst = False
            msg += '<td>'
            msg += '<a href="%s">%s</a>' % (self.download_uri(binary), binary)
            # Status of the log.
            logName = self.compressed_log_filename(binary)
            if not os.path.exists(self.file_path(logName)):
                msg += '</tr>'
                continue
            # Link to the compressed log.
            msg += self.html_table_log_issues(logName)
            msg += '</tr>'
    msg += '</table></p>'

    # Changes. (open() replaces the Python 2-only file() builtin.)
    chgFn = self.file_path('changes.html')
    if os.path.exists(chgFn):
        if utils.count_word('<li>', open(chgFn).read()):
            msg += '<h2>Commits</h2>' + open(chgFn, 'rt').read()

    # Enclose it in a CDATA block if needed.
    if encoded:
        return '<![CDATA[' + msg + ']]>'
    return msg
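# Hedged sketch of the two-argument utils.count_word used by the build-report
# methods above (assumption: it counts non-overlapping occurrences of a
# substring, e.g. '<li>' entries in changes.html).
def count_word(word, text):
    return text.count(word)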
# Smoke-test the connection before iterating.
test = collection.find().next()

articles_wordcount = {}
titles_wordcount = {}
i = 0
for article in collection.find({'type': 'article',
                                'creationDate': {'$gte': datetime(2019, 1, 1)}}):
    i += 1
    print(i)
    wordcount = 0
    title_wordcount = 0
    for chapter in article['title'].values():
        title_wordcount += count_word(chapter)
    for chapter in article['chapters']:
        if chapter['type'] == 'paragraph':
            for t in chapter['text'].values():
                wordcount += count_word(t)
    articles_wordcount[article['id']] = wordcount
    # The title counts were computed but never stored; keep them per article too.
    titles_wordcount[article['id']] = title_wordcount

with open("Articles_wordcount.json", 'w') as f:
    json.dump(articles_wordcount, f)

with open("title_wordcount.json", 'w') as f:
    # This previously re-dumped articles_wordcount; write the title counts.
    json.dump(titles_wordcount, f)
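# Hedged sketch of the single-argument count_word assumed by the script above
# (hypothetical): a plain whitespace-token count for one title or paragraph
# string, e.g. count_word('hello world') -> 2.
def count_word(text):
    return len(text.split())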