def main(unused_argv):
    """Fit a cross-validated logistic regression on the loaded dataset,
    report train/test accuracy and classification reports, and pickle the
    fitted model with dill under model_dir.
    """
    print('Running statistics on %s' % exp_name)
    # app.run passes unparsed argv through; anything beyond the program
    # name means a flag was mistyped.
    if len(unused_argv) != 1:
        raise Exception("Problem with flags: %s" % unused_argv)
    start_time = time.time()
    np.random.seed(random_seed)

    train_x, train_y, val_x, val_y, test_x, test_y = load_data()

    classifier = LogisticRegressionCV()
    classifier.fit(train_x, train_y)
    print(classifier.score(train_x, train_y))
    print(classifier.score(test_x, test_y))

    predictions_train = classifier.predict(train_x)
    predictions_test = classifier.predict(test_x)
    print('Training eval')
    print(metrics.classification_report(train_y, predictions_train))
    print('Testing eval')
    print('-----------------------------------------------')
    print(metrics.classification_report(test_y, predictions_test))

    # Persist the fitted model (dill handles objects plain pickle may not).
    with open(os.path.join(model_dir, dataset + '.pkl'), 'wb') as f:
        dill.dump(classifier, f)
    util.print_execution_time(start_time)
def main(unused_argv):
    """Run the single/pair feature pipeline over one dataset split, evaluate
    every example with load_and_evaluate_example, then score the decoded
    summaries with ROUGE and log the results.
    """
    print('Running statistics on %s' % exp_name)
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)
    start_time = time.time()
    np.random.seed(random_seed)
    source_dir = os.path.join(data_dir, dataset_articles)
    source_files = sorted(glob.glob(source_dir + '/' + dataset_split + '*'))

    # Probe the feature extractors on a dummy two-sentence article to learn
    # the single/pair feature-vector lengths up front.
    ex_sents = ['single .', 'sentence .']
    article_text = ' '.join(ex_sents)
    sent_term_matrix = util.get_doc_substituted_tfidf_matrix(
        tfidf_vectorizer, ex_sents, article_text)
    if singles_and_pairs == 'pairs':
        single_feat_len = 0
    else:
        single_feat_len = len(
            get_single_sent_features(0, sent_term_matrix,
                                     [['single', '.'], ['sentence', '.']],
                                     [0, 0]))
    if singles_and_pairs == 'singles':
        pair_feat_len = 0
    else:
        pair_feat_len = len(
            get_pair_sent_features([0, 1], sent_term_matrix,
                                   [['single', '.'], ['sentence', '.']],
                                   [0, 0]))

    # BUG FIX: the original condition was `'cnn' or 'newsroom' in
    # dataset_articles`, which parses as `('cnn') or (...)` and is always
    # truthy, so `total` was unconditionally len(source_files) * 1000.
    # Test each substring explicitly (matching the sibling mains that use
    # `'cnn' in in_dataset or ...`).
    if 'cnn' in dataset_articles or 'newsroom' in dataset_articles:
        total = len(source_files) * 1000  # chunked files hold ~1000 examples
    else:
        total = len(source_files)

    example_generator = data.example_generator(
        source_dir + '/' + dataset_split + '*',
        True, False, should_check_valid=False)
    ex_gen = example_generator_extended(example_generator, total,
                                        single_feat_len, pair_feat_len)
    print('Creating list')
    ex_list = [ex for ex in ex_gen]
    print('Converting...')
    # Workers evaluate each example as a side effect; the list() forces the
    # lazy futures.map to completion.
    list(futures.map(load_and_evaluate_example, ex_list))

    print('Evaluating ROUGE...')
    results_dict = rouge_eval_references.rouge_eval(ref_dir, dec_dir)
    rouge_eval_references.rouge_log(results_dict, my_log_dir)
    util.print_execution_time(start_time)
def main(unused_argv):
    """Evaluate BERT-ranked sentence selections: score every example, save
    the selections, compute sentence-selection F1 and ROUGE, and report how
    often singletons vs pairs were chosen.
    """
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)
    print('Running statistics on %s' % exp_name)
    start_time = time.time()
    np.random.seed(random_seed)

    source_dir = os.path.join(data_dir, dataset_articles)
    source_files = sorted(glob.glob(source_dir + '/' + dataset_split + '*'))
    total = len(source_files) * 1000
    example_generator = data.example_generator(
        source_dir + '/' + dataset_split + '*',
        True, False, should_check_valid=False)

    # Read output of BERT and put into a dictionary with:
    #   key = (article idx, source indices) -- the indices tuple has length
    #         1 or 2 depending on singleton vs pair
    #   value = score
    qid_ssi_to_importances = rank_source_sents(temp_in_path, temp_out_path)
    ex_gen = example_generator_extended(example_generator, total,
                                        qid_ssi_to_importances, None,
                                        FLAGS.singles_and_pairs)
    print('Creating list')
    ex_list = list(ex_gen)

    # Main function to get results on all test examples.
    ssi_list = [evaluate_example(ex) for ex in ex_list]

    # Save the selections, then reload them (round-trip leaves a pickle in
    # the log dir for later inspection).
    ssi_path = os.path.join(my_log_dir, 'ssi.pkl')
    with open(ssi_path, 'wb') as f:
        pickle.dump(ssi_list, f)
    with open(ssi_path, 'rb') as f:
        ssi_list = pickle.load(f)

    print('Evaluating BERT model F1 score...')
    suffix = util.all_sent_selection_eval(ssi_list)
    print('Evaluating ROUGE...')
    results_dict = rouge_functions.rouge_eval(ref_dir, dec_dir, l_param=l_param)
    rouge_functions.rouge_log(results_dict, my_log_dir, suffix=suffix)

    # Count how often the kept selections were singletons vs pairs.
    ssis_restricted = [triple[1][:triple[2]] for triple in ssi_list]
    ssi_lens = [
        len(indices)
        for indices in util.flatten_list_of_lists(ssis_restricted)
    ]
    num_singles = ssi_lens.count(1)
    num_pairs = ssi_lens.count(2)
    print('Percent singles/pairs: %.2f %.2f' %
          (num_singles * 100. / len(ssi_lens),
           num_pairs * 100. / len(ssi_lens)))
    util.print_execution_time(start_time)
def main(unused_argv):
    """Convert articles into LambdaMART feature files: one file per example
    written by the workers, then concatenated into a single '<out_path>.txt'
    per dataset split.

    Flags consumed: singles_and_pairs, lr, dataset_name, dataset_split, pca,
    num_instances.
    """
    print('Running statistics on %s' % exp_name)
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)
    # Choose input/output dataset names from the singleton/pair mode; the
    # logistic-regression variant gets its own output name.
    if FLAGS.singles_and_pairs == 'both':
        in_dataset = FLAGS.dataset_name
        out_dataset = FLAGS.dataset_name + '_both'
    else:
        in_dataset = FLAGS.dataset_name + '_singles'
        out_dataset = FLAGS.dataset_name + '_singles'
    if FLAGS.lr:
        out_dataset = FLAGS.dataset_name + '_lr'
    start_time = time.time()
    np.random.seed(random_seed)
    source_dir = os.path.join(data_dir, in_dataset)
    # Probe the feature extractors on a dummy two-sentence article to learn
    # the single/pair feature-vector lengths up front.
    ex_sents = ['single .', 'sentence .']
    article_text = ' '.join(ex_sents)
    sent_term_matrix = util.get_doc_substituted_tfidf_matrix(
        tfidf_vectorizer, ex_sents, article_text, pca)
    if FLAGS.singles_and_pairs == 'pairs':
        single_feat_len = 0
    else:
        single_feat_len = len(
            get_single_sent_features(0, sent_term_matrix,
                                     [['single', '.'], ['sentence', '.']],
                                     [0, 0], 0))
    if FLAGS.singles_and_pairs == 'singles':
        pair_feat_len = 0
    else:
        pair_feat_len = len(
            get_pair_sent_features([0, 1], sent_term_matrix,
                                   [['single', '.'], ['sentence', '.']],
                                   [0, 0], [0, 0]))
    util.print_vars(single_feat_len, pair_feat_len)
    util.create_dirs(temp_dir)
    if FLAGS.dataset_split == 'all':
        dataset_splits = ['test', 'val', 'train']
    elif FLAGS.dataset_split == 'train_val':
        dataset_splits = ['val', 'train']
    else:
        dataset_splits = [FLAGS.dataset_split]
    for split in dataset_splits:
        source_files = sorted(glob.glob(source_dir + '/' + split + '*'))
        out_path = os.path.join(out_dir, out_dataset, split)
        if FLAGS.pca:
            out_path += '_pca'
        util.create_dirs(os.path.join(out_path))
        # Chunked datasets (cnn/newsroom/xsum) hold ~1000 examples per file;
        # other datasets are one example per file.
        total = len(source_files) * 1000 if (
            'cnn' in in_dataset or 'newsroom' in in_dataset
            or 'xsum' in in_dataset) else len(source_files)
        example_generator = data.example_generator(
            source_dir + '/' + split + '*',
            True, False, should_check_valid=False)
        ex_gen = example_generator_extended(example_generator, total,
                                            single_feat_len, pair_feat_len,
                                            FLAGS.singles_and_pairs, out_path)
        print('Creating list')
        ex_list = [ex for ex in ex_gen]
        if FLAGS.num_instances != -1:  # -1 means "use every instance"
            ex_list = ex_list[:FLAGS.num_instances]
        print('Converting...')
        if FLAGS.lr:
            # Logistic-regression path: gather dense feature vectors and
            # relevance labels, stack labels as a final column.
            all_instances = list(
                futures.map(convert_article_to_lambdamart_features, ex_list))
            all_instances = util.flatten_list_of_lists(all_instances)
            x = [inst.features for inst in all_instances]
            x = np.array(x)
            y = [inst.relevance for inst in all_instances]
            y = np.expand_dims(np.array(y), 1)
            x_y = np.concatenate((x, y), 1)
            # NOTE(review): 'writer' is not bound yet at this point on the
            # first split iteration (it is only assigned further down this
            # loop body), so this np.save raises NameError on first use and
            # otherwise targets the previous split's closed file. Confirm
            # the intended destination of this .npy dump.
            np.save(writer, x_y)
        else:
            # Workers write per-example feature files under out_path as a
            # side effect; list() just forces the lazy futures.map.
            list(futures.map(convert_article_to_lambdamart_features, ex_list))
        # Concatenate the per-example files into one .txt for this split.
        final_out_path = out_path + '.txt'
        file_names = sorted(glob.glob(os.path.join(out_path, '*')))
        # NOTE(review): opened in binary mode but f.read() below returns str
        # here — this raises TypeError on Python 3; confirm the target
        # interpreter version.
        writer = open(final_out_path, 'wb')
        for file_name in tqdm(file_names):
            with open(file_name) as f:
                text = f.read()
            writer.write(text)
        writer.close()
    util.print_execution_time(start_time)
def main(unused_argv):
    """Two-mode LambdaMART driver.

    mode == 'write_to_file': convert every example to LambdaMART instance
    strings (written per-example by workers) and concatenate them into
    temp_in_path for external LambdaMART scoring.
    mode == 'generate_summaries': read LambdaMART's scores, evaluate every
    example, then compute sentence-selection F1 and ROUGE.
    """
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)
    print('Running statistics on %s' % exp_name)
    start_time = time.time()
    np.random.seed(random_seed)
    source_dir = os.path.join(data_dir, dataset_articles)
    source_files = sorted(glob.glob(source_dir + '/' + dataset_split + '*'))

    # Probe the feature extractors on a dummy two-sentence article to learn
    # the single/pair feature-vector lengths up front.
    ex_sents = ['single .', 'sentence .']
    article_text = ' '.join(ex_sents)
    sent_term_matrix = util.get_doc_substituted_tfidf_matrix(
        tfidf_vectorizer, ex_sents, article_text, pca)
    if FLAGS.singles_and_pairs == 'pairs':
        single_feat_len = 0
    else:
        single_feat_len = len(
            get_single_sent_features(0, sent_term_matrix,
                                     [['single', '.'], ['sentence', '.']],
                                     [0, 0], 0))
    if FLAGS.singles_and_pairs == 'singles':
        pair_feat_len = 0
    else:
        pair_feat_len = len(
            get_pair_sent_features([0, 1], sent_term_matrix,
                                   [['single', '.'], ['sentence', '.']],
                                   [0, 0], [0, 0]))

    # BUG FIX: was `'cnn' or 'newsroom' in dataset_articles`, which is
    # always truthy, so `total` was unconditionally multiplied by 1000.
    # Matches the correct form used by the other mains in this file.
    if 'cnn' in dataset_articles or 'newsroom' in dataset_articles:
        total = len(source_files) * 1000  # chunked files hold ~1000 examples
    else:
        total = len(source_files)

    example_generator = data.example_generator(
        source_dir + '/' + dataset_split + '*',
        True, False, should_check_valid=False)

    if FLAGS.mode == 'write_to_file':
        ex_gen = example_generator_extended(example_generator, total,
                                            single_feat_len, pair_feat_len,
                                            FLAGS.singles_and_pairs)
        print('Creating list')
        ex_list = [ex for ex in ex_gen]
        print('Converting...')
        # Workers write one instance file per example into temp_in_dir.
        list(futures.map(write_to_lambdamart_examples_to_file, ex_list))

        # Concatenate the per-example instance files into temp_in_path.
        file_names = sorted(glob.glob(os.path.join(temp_in_dir, '*')))
        instances_str = ''
        for file_name in tqdm(file_names):
            with open(file_name) as f:
                instances_str += f.read()
        # NOTE(review): str written to a binary-mode file — this raises
        # TypeError on Python 3; confirm the target interpreter version.
        with open(temp_in_path, 'wb') as f:
            f.write(instances_str)
        # RUN LAMBDAMART SCORING COMMAND HERE

    if FLAGS.mode == 'generate_summaries':
        qid_ssi_to_importances = rank_source_sents(temp_in_path, temp_out_path)
        ex_gen = example_generator_extended(example_generator, total,
                                            qid_ssi_to_importances,
                                            pair_feat_len,
                                            FLAGS.singles_and_pairs)
        print('Creating list')
        ex_list = [ex for ex in ex_gen]
        ssi_list = list(futures.map(evaluate_example, ex_list))

        # BUG FIX: pickle files were opened in text mode ('w' / default 'r');
        # pickle requires binary mode, as the sibling mains already do
        # ('wb' / 'rb').
        with open(os.path.join(my_log_dir, 'ssi.pkl'), 'wb') as f:
            pickle.dump(ssi_list, f)
        with open(os.path.join(my_log_dir, 'ssi.pkl'), 'rb') as f:
            ssi_list = pickle.load(f)
        print('Evaluating Lambdamart model F1 score...')
        suffix = util.all_sent_selection_eval(ssi_list)
        print('Evaluating ROUGE...')
        results_dict = rouge_functions.rouge_eval(ref_dir, dec_dir,
                                                  l_param=l_param)
        rouge_functions.rouge_log(results_dict, my_log_dir, suffix=suffix)
    util.print_execution_time(start_time)
def main(unused_argv):
    """Evaluate BERT selections with per-token scores: rank source sentences,
    attach token scores/mappings, score every example, then compute
    sentence-selection F1, ROUGE, and singleton/pair percentages.
    """
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)
    print('Running statistics on %s' % exp_name)
    start_time = time.time()
    np.random.seed(random_seed)

    source_dir = os.path.join(data_dir, dataset_articles)
    source_files = sorted(glob.glob(source_dir + '/' + dataset_split + '*'))
    ex_sents = ['single .', 'sentence .']
    article_text = ' '.join(ex_sents)
    total = len(source_files) * 1000
    example_generator = data.example_generator(
        source_dir + '/' + dataset_split + '*',
        True, False, should_check_valid=False)

    # BERT outputs: per-(article, source-indices) importances plus token
    # scores and their token-to-word mappings.
    qid_ssi_to_importances = rank_source_sents(temp_in_path, temp_out_path)
    qid_ssi_to_token_scores_and_mappings = get_token_scores_for_ssi(
        temp_in_path, file_path_seq, file_path_mappings)
    ex_gen = example_generator_extended(
        example_generator, total, qid_ssi_to_importances,
        qid_ssi_to_token_scores_and_mappings)
    print('Creating list')
    ex_list = list(ex_gen)
    ssi_list = list(futures.map(evaluate_example, ex_list))

    # Save the selections, then reload them (leaves an inspectable pickle
    # in the log dir).
    ssi_path = os.path.join(my_log_dir, 'ssi.pkl')
    with open(ssi_path, 'wb') as f:
        pickle.dump(ssi_list, f)
    with open(ssi_path, 'rb') as f:
        ssi_list = pickle.load(f)

    print('Evaluating BERT model F1 score...')
    suffix = util.all_sent_selection_eval(ssi_list)
    print('Evaluating ROUGE...')
    results_dict = rouge_functions.rouge_eval(ref_dir, dec_dir, l_param=l_param)
    rouge_functions.rouge_log(results_dict, my_log_dir, suffix=suffix)

    # Count how often the kept selections were singletons vs pairs.
    ssis_restricted = [triple[1][:triple[2]] for triple in ssi_list]
    ssi_lens = [
        len(indices)
        for indices in util.flatten_list_of_lists(ssis_restricted)
    ]
    num_singles = ssi_lens.count(1)
    num_pairs = ssi_lens.count(2)
    print('Percent singles/pairs: %.2f %.2f' %
          (num_singles * 100. / len(ssi_lens),
           num_pairs * 100. / len(ssi_lens)))
    util.print_execution_time(start_time)
prec *= 100 rec *= 100 f1 *= 100 prefix = '%.2f\t%.2f\t%.2f\t' print('Lambdamart P/R/F: ') print(prefix) # for ex in tqdm(ex_list, total=total): # load_and_evaluate_example(ex) print('Evaluating ROUGE...') results_dict = rouge_eval_references.rouge_eval(ref_dir, dec_dir) # print("Results_dict: ", results_dict) rouge_eval_references.rouge_log(results_dict, my_log_dir, prefix=prefix) util.print_execution_time(start_time) if __name__ == '__main__': app.run(main)
def main(unused_argv):
    """Fit a TF-IDF vectorizer over one or more datasets' articles and save it.

    Variants: --pca also fits and saves a TruncatedSVD (LSA) model on the
    TF-IDF matrix; --pg_mmr uses a stemming vectorizer and saves with dill
    instead of pickle.
    """
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)
    start_time = time.time()
    np.random.seed(random_seed)
    util.create_dirs(os.path.join(out_dir, FLAGS.input_dataset))
    if FLAGS.input_dataset == 'all':
        datasets = [
            'duc_2003', 'duc_2004', 'tac_2008', 'tac_2010', 'tac_2011',
            'cnn_dm', 'xsum'
        ]
    else:
        datasets = [FLAGS.input_dataset]
    if dataset_split == 'all':
        dataset_splits = ['train', 'val', 'test']
    else:
        dataset_splits = [dataset_split]

    # Collect every article's text across all requested datasets/splits.
    all_articles = []
    for in_dataset in datasets:
        source_dir = os.path.join(data_dir, in_dataset)
        for split in dataset_splits:
            source_files = sorted(glob.glob(source_dir + '/' + split + '*'))
            if len(source_files) == 0:
                continue
            # BUG FIX: was `'cnn' or 'newsroom' in in_dataset`, which is
            # always truthy, so `total` was unconditionally multiplied by
            # 1000. Test each substring explicitly (matching the sibling
            # mains in this file).
            if 'cnn' in in_dataset or 'newsroom' in in_dataset:
                total = len(source_files) * 1000  # chunked files hold ~1000 examples
            else:
                total = len(source_files)
            example_generator = data.example_generator(
                source_dir + '/' + split + '*',
                True, False, should_check_valid=False)
            ex_gen = example_generator_extended(example_generator, total)
            print('Creating list')
            ex_list = [ex for ex in ex_gen]
            print('Converting...')
            articles = list(futures.map(save_as_txt_file, ex_list))
            all_articles.extend(articles)

    vec = TfidfVectorizer(input='content',
                          ngram_range=(1, 1),
                          min_df=min_df,
                          max_df=0.5,
                          decode_error='ignore',
                          preprocessor=my_preprocessor,
                          tokenizer=my_tokenizer)
    if FLAGS.pca:
        X = vec.fit_transform(all_articles)  # keep the matrix for the SVD fit below
        suffix = '_pca'
    elif FLAGS.pg_mmr:
        # pg_mmr uses a stemming vectorizer instead of the default one.
        stemmer = PorterStemmer()

        class StemmedTfidfVectorizer(TfidfVectorizer):
            def build_analyzer(self):
                # NOTE(review): super(TfidfVectorizer, self) skips
                # TfidfVectorizer in the MRO; conventionally this would be
                # super(StemmedTfidfVectorizer, self) — confirm intent.
                analyzer = super(TfidfVectorizer, self).build_analyzer()
                return lambda doc: (stemmer.stem(w) for w in analyzer(doc))

        vec = StemmedTfidfVectorizer(analyzer='word',
                                     stop_words='english',
                                     ngram_range=(1, 3),
                                     max_df=0.7)
        vec.fit_transform(all_articles)
    else:
        vec.fit_transform(all_articles)
        suffix = ''
    print('Vocabulary size', len(list(vec.vocabulary_.keys())))

    # Persist the vectorizer: dill for the pg_mmr variant (lambdas inside
    # the stemming analyzer), plain pickle otherwise.
    if FLAGS.pg_mmr:
        util.create_dirs(os.path.join(log_dir, 'tfidf_vectorizer'))
        with open(
                os.path.join(log_dir, 'tfidf_vectorizer',
                             FLAGS.input_dataset + '.dill'), 'wb') as f:
            dill.dump(vec, f)
    else:
        with open(
                os.path.join(
                    out_dir, FLAGS.input_dataset + '_tfidf_vec_' +
                    str(min_df) + suffix + '.pkl'), 'wb') as f:
            pickle.dump(vec, f)

    if FLAGS.pca:
        print('Fitting LSA model...')
        from sklearn.decomposition import TruncatedSVD
        svd = TruncatedSVD(n_components=100)
        svd.fit(X)
        with open(
                os.path.join(out_dir, FLAGS.input_dataset + '_pca' + '.pkl'),
                'wb') as f:
            pickle.dump(svd, f)
    util.print_execution_time(start_time)
def main(unused_argv):
    """Two-stage coreference pipeline driver.

    mode == 'prepare': dump each article as a raw-text file for CoreNLP and
    write a file list for CoreNLP to consume.
    mode == 'create': read CoreNLP's processed JSON, attach cleaned coref
    chains to each TF example, and re-shard into .bin files of 1000 examples.
    """
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)
    start_time = time.time()
    np.random.seed(random_seed)
    if FLAGS.dataset_name == 'all':
        datasets = dataset_names
    else:
        datasets = [FLAGS.dataset_name]
    for dataset in datasets:
        coref_dir = os.path.join(FLAGS.coref_root, dataset)
        # Directory layout: raw articles for CoreNLP, file lists fed to
        # CoreNLP, and the final coref-augmented dataset.
        to_coref_dir = os.path.join(coref_dir, 'to_coref')
        corenlp_lists_dir = os.path.join(coref_dir, 'corenlp_lists')
        data_coref_dir = os.path.join(FLAGS.data_root, 'with_coref', dataset)
        util.create_dirs(to_coref_dir)
        util.create_dirs(corenlp_lists_dir)
        util.create_dirs(data_coref_dir)
        source_dir = os.path.join(FLAGS.data_root, dataset)
        if FLAGS.dataset_split == 'all':
            dataset_splits = ['test', 'val', 'train']
        else:
            dataset_splits = [FLAGS.dataset_split]
        for dataset_split in dataset_splits:
            source_files = sorted(
                glob.glob(source_dir + '/' + dataset_split + '*'))
            # Chunked datasets (cnn/newsroom/xsum) hold ~1000 examples per
            # file; other datasets are one example per file.
            total = len(source_files) * 1000 if (
                'cnn' in dataset or 'newsroom' in dataset
                or 'xsum' in dataset) else len(source_files)
            example_generator = data.example_generator(
                source_dir + '/' + dataset_split + '*',
                True, False, should_check_valid=False)
            if FLAGS.mode == 'prepare':
                corenlp_list = []
                out_idx = 0
                for example_idx, example in enumerate(
                        tqdm(example_generator, total=total)):
                    raw_article_sents, article, abstract, doc_indices = util.unpack_tf_example(
                        example, names_to_types)
                    if raw_article_sents is None:
                        continue
                    raw_article = ' '.join(raw_article_sents)
                    file_name = os.path.join(
                        to_coref_dir, '%s_%06d.bin' % (dataset_split, out_idx))
                    # NOTE(review): str written to a binary-mode file — this
                    # raises TypeError on Python 3; confirm the target
                    # interpreter version.
                    with open(file_name, 'wb') as f:
                        f.write(raw_article)
                    corenlp_list.append(file_name)
                    # NOTE(review): the full file list is rewritten on every
                    # iteration; this looks like it belongs after the loop —
                    # confirm before moving it.
                    with open(
                            os.path.join(corenlp_lists_dir,
                                         'all_' + dataset_split + '.txt'),
                            'wb') as f:
                        f.write('\n'.join(corenlp_list))
                    out_idx += 1
            elif FLAGS.mode == 'create':
                process_coref_dir = os.path.join(coref_dir, 'processed')
                out_idx = 0
                out_file_name = os.path.join(
                    data_coref_dir, dataset_split +
                    '_{:05d}.bin'.format(out_idx // 1000))
                writer = open(os.path.join(out_file_name), 'wb')
                coref_files = sorted(
                    glob.glob(
                        os.path.join(process_coref_dir, dataset_split + '*')))
                # Map base file name (without the .json suffix) -> full path.
                coref_dict = {}
                for c in coref_files:
                    coref_dict[c.split('/')[-1].split('.json')[0]] = c
                print(len(coref_files), len(source_files))
                for example_idx, example in enumerate(
                        tqdm(example_generator, total=total)):
                    raw_article_sents, article, abstract, doc_indices = util.unpack_tf_example(
                        example, names_to_types)
                    if raw_article_sents is None:
                        continue
                    raw_article_sents = [
                        sent for sent in raw_article_sents
                        if sent.strip() != ''
                    ]
                    # Roll over to a new output shard every 1000 examples.
                    if out_idx % 1000 == 0 and out_idx != 0:
                        writer.close()
                        out_file_name = os.path.join(
                            data_coref_dir, dataset_split +
                            '_{:05d}.bin'.format(out_idx // 1000))
                        writer = open(os.path.join(out_file_name), 'wb')
                    file_name = '%s_%06d.bin' % (dataset_split, out_idx)
                    if file_name in coref_dict:
                        file_path = coref_dict[file_name]
                        corefs = get_corefs(file_path)
                        fixed_corefs = fix_trailing_apostrophe_s(corefs)
                        corefs_relevant_info = remove_irrelevant(fixed_corefs)
                        corefs_json = json.dumps(corefs_relevant_info)
                    else:
                        # No CoreNLP output for this example: store an empty
                        # coref list.
                        corefs_json = json.dumps([])
                    example.features.feature['corefs'].bytes_list.value.extend(
                        [corefs_json])
                    # NOTE(review): 'corefs' is unbound here whenever the
                    # else branch above was taken (NameError on the first
                    # such example), and the freshly built 'tf_example' is
                    # never used — write_tf_example receives the original
                    # 'example'. Confirm intended behavior before changing.
                    tf_example = convert_data.make_example(
                        article, abstract, doc_indices, raw_article_sents,
                        corefs)
                    convert_data.write_tf_example(example, writer)
                    out_idx += 1
                writer.close()
    util.print_execution_time(start_time)