def main():
    """Score upper-bound (UB1/UB2) summaries for every topic in the data set.

    Iterates over the topic files under ``processed/downloads/<data_set>``,
    derives the target summary length from the first reference summary when
    ``--summary_size`` is not given, and logs ROUGE scores per algorithm.
    """
    args = get_args()
    # ROUGE 1.5.5 release shipped alongside this repository (same layout the
    # sibling baseline scripts use).
    rouge_dir = os.path.join(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
        'rouge/RELEASE-1.5.5/')
    data_path = os.path.join(args.iobasedir, 'processed/downloads',
                             args.data_set)
    log_path = os.path.join(args.iobasedir, 'logs')
    log_file = os.path.join(args.iobasedir, 'logs', 'UB.log')
    mkdirp(log_path)
    set_logger(log_file)

    for filename in os.listdir(data_path):
        data_file = os.path.join(data_path, filename)
        topic = filename[:-5]  # strip the 5-char file extension from the name
        docs, refs = load_data(data_file)
        if not refs:
            continue
        # Default the summary budget to the word count of the first reference.
        if not args.summary_size:
            summary_size = len(' '.join(refs[0]).split(' '))
        else:
            summary_size = int(args.summary_size)

        logger.info('Topic ID: %s ', topic)
        logger.info('###')
        logger.info('Summmary_len: %d', summary_size)

        # BUG FIX: the original passed undefined names `language` and `rouge`
        # to get_summary_scores (NameError). Use the CLI language option and a
        # per-topic Rouge instance, mirroring the sibling baseline scripts.
        rouge = Rouge(rouge_dir)
        algos = ['UB1', 'UB2']
        for algo in algos:
            get_summary_scores(algo, docs, refs, summary_size,
                               args.language, rouge)
        rouge._cleanup()
        logger.info('###')
def main():
    """Run baseline and upper-bound summarizers on a review-summarization CSV.

    Loads ``test0.csv`` from the dataset split, treats each sentence of a
    review as a single-sentence document, and logs per-algorithm ROUGE scores
    while tracking the best-scoring summary across algorithms.
    """
    args = get_args()
    rouge_dir = os.path.join(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
        'rouge/RELEASE-1.5.5/')
    data_path = os.path.join(args.iobasedir, 'processed/', args.dataset,
                             args.domain, args.split)
    log_path = os.path.join(args.iobasedir, 'logs')
    log_file = os.path.join(
        args.iobasedir, 'logs', 'baselines_rsumm_%s_%s_%s_%s.log' %
        (args.dataset, args.domain, args.split, str(args.summary_size)))
    mkdirp(log_path)
    set_logger(log_file)

    data_file = os.path.join(data_path, 'test0.csv')
    df = pd.read_csv(data_file, sep=",", quotechar='"', engine='python',
                     header=None, skiprows=1,
                     names=["user_id", "product_id", "rating", "review",
                            "nouns", "summary", 'time'])

    for index, row in df.iterrows():
        # Robustness: pandas yields float NaN for empty cells, and calling
        # .strip() on NaN raises AttributeError — skip incomplete rows.
        if not isinstance(row['review'], str) or \
                not isinstance(row['summary'], str):
            continue
        topic = row['user_id'] + '_' + row['product_id']
        docs = [[sent] for sent in sent_tokenize(row['review'].strip())]
        refs = [sent_tokenize(row['summary'].strip())]
        # BUG FIX: `refs` is always a one-element list, so the original
        # `if not refs` could never trigger; also skip summaries that
        # tokenize to nothing.
        if not refs or not refs[0]:
            continue
        # Default the summary budget to the reference's word count.
        if not args.summary_size:
            summary_size = len(" ".join(refs[0]).split(' '))
        else:
            summary_size = int(args.summary_size)

        logger.info('Topic ID: %s', topic)
        logger.info('###')
        logger.info('Summmary_len: %d', summary_size)

        # Fresh Rouge workspace per topic; cleaned up after scoring.
        rouge = Rouge(rouge_dir)
        algos = [
            'Luhn', 'LexRank', 'TextRank', 'LSA', 'KL', "ICSI", 'UB1', 'UB2'
        ]
        best_summary = []
        best_score = 0.0
        for algo in algos:
            best_summary, best_score = get_summary_scores(
                algo, docs, refs, summary_size, args.language, rouge,
                best_summary, best_score)
        rouge._cleanup()
        logger.info('###')
def main():
    """Run baseline and upper-bound summarizers on every topic file.

    Iterates over the files under the configured data path, derives the
    target summary length from the first reference summary when
    ``--summary_size`` is not given, and logs per-algorithm ROUGE scores.
    """
    args = get_args()
    rouge_dir = os.path.join(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
        'rouge/RELEASE-1.5.5/')
    data_path = os.path.join(args.iobasedir, args.data_setpath)
    log_path = os.path.join(args.iobasedir, 'logs')
    log_file = os.path.join(
        args.iobasedir, 'logs',
        'baselines_%s_%s.log' % (args.data_set, args.summary_size))
    mkdirp(log_path)
    set_logger(log_file)

    for filename in os.listdir(data_path):
        data_file = os.path.join(data_path, filename)
        topic = filename[:-5]  # strip the 5-char file extension from the name
        try:
            docs, refs = load_data(data_file)
        except Exception:
            # BUG FIX: the original bare `except: pass` fell through with
            # `docs`/`refs` undefined (NameError on the first topic) or stale
            # from the previous topic (silent mis-scoring). Log and skip the
            # unreadable file instead.
            logger.exception('Failed to load %s; skipping topic %s',
                             data_file, topic)
            continue
        if not refs:
            continue
        # Default the summary budget to the reference's word count.
        if not args.summary_size:
            summary_size = len(" ".join(refs[0]).split(' '))
        else:
            summary_size = int(args.summary_size)

        logger.info('Topic ID: %s', topic)
        logger.info('###')
        logger.info('Summmary_len: %d', summary_size)

        # Fresh Rouge workspace per topic; cleaned up after scoring.
        rouge = Rouge(rouge_dir)
        algos = ['UB1', 'UB2', 'ICSI', 'Luhn', 'LexRank', 'LSA', 'KL']
        for algo in algos:
            get_summary_scores(algo, docs, refs, summary_size,
                               args.language, rouge)
        rouge._cleanup()
        logger.info('###')
def main():
    """Parse CLI options and dispatch to the requested corpus-building mode.

    Modes: ``fetch_urls`` harvests article URLs into the raw data directory,
    ``download`` fetches articles for a corpus, and ``archive_urls`` resolves
    URLs through the archive for the processed data directory.
    """
    arg_parser = argparse.ArgumentParser(
        description='Generate the Summarization Corpus')
    arg_parser.add_argument('--corpus', choices=['bbc', 'guardian'],
                            required=True)
    arg_parser.add_argument('--data_type', choices=['raw', 'processed'])
    arg_parser.add_argument('--mode',
                            choices=['fetch_urls', 'download', 'archive_urls'],
                            required=True)
    arg_parser.add_argument('--request_parallelism', type=int, default=1)
    opts = arg_parser.parse_args()

    if opts.mode == 'fetch_urls':
        # URL harvesting always targets the raw data directory.
        raw_path = path.join(base_dir, 'data/%s/' % ('raw'))
        FetchMode(raw_path, opts.corpus)
    elif opts.mode == 'download':
        data_path = path.join(base_dir, 'data/%s/' % (opts.data_type))
        download_path = path.join(data_path, 'downloads/%s' % (opts.corpus))
        print("Download Path:", download_path)
        # Create the per-corpus download directory on first use.
        if not os.path.isdir(download_path):
            mkdirp(download_path)
        DownloadMode(data_path, opts.corpus)
    elif opts.mode == 'archive_urls':
        UrlMode(path.join(base_dir, 'data/processed/'), opts.corpus,
                opts.request_parallelism)