def read_scores(all=False, remove_null=False, data_dir=RAW_DATA_DIR):
    """Read the score records from ``lqual.jsonl`` or ``lqual_all.jsonl``.

    Args:
        all: When truthy, read ``lqual_all.jsonl``; otherwise ``lqual.jsonl``.
        remove_null: When truthy, drop records whose ``output['overall']``
            is ``None``.
        data_dir: Directory that contains the score file.

    Returns:
        A ``(scores, ids)`` tuple. ``scores`` is a list of ``OrderedDict``
        entries with the keys ``id``, ``ref``, ``sys_name``, ``sys_summ`` and
        ``scores``. ``ids`` is a list of the distinct ids of the kept
        entries; it is built from a set, so its order is not meaningful.

    Raises:
        ValueError: If the score file does not exist.
    """
    suffix = '_all' if all else ''
    file_path = os.path.join(data_dir, 'lqual{}.jsonl'.format(suffix))
    if not os.path.isfile(file_path):
        raise ValueError("The file {} does not exist!".format(file_path))

    kept_entries = []
    kept_ids = set()
    with jsonl.open(file_path) as reader:
        for record in reader:
            contents = record['input']['contents']
            entry = OrderedDict([
                ('id', contents['id']),
                ('ref', contents['reference']),
                ('sys_name', contents['system']),
                ('sys_summ', contents['text']),
            ])

            # The raw responses are stripped from the output before it is
            # stored as the entry's scores.
            output = record['output']
            del output['_responses']
            entry['scores'] = output

            # Short-circuit keeps 'overall' from being looked up unless
            # null filtering was requested.
            keep = not remove_null or entry['scores']['overall'] is not None
            if keep:
                kept_entries.append(entry)
                kept_ids.add(entry['id'])

    return kept_entries, list(kept_ids)
def read_article_refs(ids=None, as_dict=False, data_dir=RAW_DATA_DIR):
    """Pair every reference with the text of its article.

    References are loaded with ``read_references`` and each one gains an
    ``article`` key when ``articles.jsonl`` holds a record with the same id.

    Args:
        ids: Optional container of ids. When given, only references whose id
            is ``in ids`` are returned.
        as_dict: When truthy, return a dict keyed by id instead of a list.
        data_dir: Directory that holds both the reference file and
            ``articles.jsonl``.

    Returns:
        If ``as_dict`` is falsy, the list of reference entries (each with
        ``id``, ``ref`` and, when a matching article was found, ``article``).
        Otherwise a dict mapping each id to
        ``{'article': ..., 'ref': ...}``.

    Raises:
        ValueError: If ``articles.jsonl`` (or the reference file read by
            ``read_references``) does not exist.
        KeyError: With ``as_dict`` truthy, if a selected reference never
            received an article.
    """
    # Forward data_dir so references and articles are read from the same
    # directory instead of silently falling back to the default one.
    refs, _ = read_references(data_dir=data_dir)

    article_path = os.path.join(data_dir, 'articles.jsonl')
    if not os.path.isfile(article_path):
        raise ValueError("The file {} does not exist!".format(article_path))

    # One id -> entry index replaces a linear membership test plus a linear
    # index search per article record. read_references skips repeated ids,
    # so each id maps to exactly one entry.
    refs_by_id = {ref['id']: ref for ref in refs}

    with jsonl.open(article_path) as file:
        for sample in file:
            ref = refs_by_id.get(sample['id'])
            if ref is None:
                # Article without a reference: nothing to attach it to.
                continue
            ref['article'] = sample['text']

    if ids is not None:
        refs = [r for r in refs if r['id'] in ids]

    if as_dict:
        return {
            r['id']: {'article': r['article'], 'ref': r['ref']}
            for r in refs
        }
    return refs
def read_samples(data_dir=PROCESSED_DATA_DIR):
    """Read the whole of ``samples.jsonl.gz`` from *data_dir*.

    Args:
        data_dir: Directory that contains ``samples.jsonl.gz``.

    Returns:
        Whatever the reader's ``read()`` returns for the gzip-compressed
        file (presumably the list of parsed records; confirm against the
        ``jsonl`` module in use).

    Raises:
        ValueError: If the samples file does not exist.
    """
    gz_path = os.path.join(data_dir, 'samples.jsonl.gz')

    if not os.path.isfile(gz_path):
        message = "The file {} does not exist!".format(gz_path)
        raise ValueError(message)

    with jsonl.open(gz_path, gzip=True) as reader:
        return reader.read()
def read_articles(data_dir=RAW_DATA_DIR):
    """Read all article records from ``articles.jsonl``.

    Args:
        data_dir: Directory that contains ``articles.jsonl``.

    Returns:
        A ``(articles, article_id_list)`` tuple. ``articles`` holds one
        ``OrderedDict`` with the keys ``id`` and ``article`` per record in
        the file, repeated ids included. ``article_id_list`` holds each
        distinct id once, in order of first appearance.

    Raises:
        ValueError: If ``articles.jsonl`` does not exist.
    """
    article_path = os.path.join(data_dir, 'articles.jsonl')
    if not os.path.isfile(article_path):
        raise ValueError("The file {} does not exist!".format(article_path))

    articles = []
    article_id_list = []
    # Companion set gives constant-time duplicate checks; the list alone
    # would be rescanned for every record. Assumes ids are hashable.
    seen_ids = set()

    with jsonl.open(article_path) as article_file:
        for sample in article_file:
            entry = OrderedDict()
            entry['id'] = sample['id']
            entry['article'] = sample['text']
            articles.append(entry)

            if entry['id'] not in seen_ids:
                seen_ids.add(entry['id'])
                article_id_list.append(entry['id'])

    return articles, article_id_list
def read_processed_scores(all=False, remove_null=False,
                          data_dir=PROCESSED_DATA_DIR):
    """Combine the raw score entries with the metrics of the processed file.

    Raw entries come from ``read_scores(all, remove_null)``; that call uses
    its own default directory, so ``data_dir`` only selects where the
    processed ``lqual.jsonl`` / ``lqual_all.jsonl`` file is read from.
    Entries are matched on the key ``"<id>-<system>"``.

    Args:
        all: When truthy, use the ``lqual_all.jsonl`` files; otherwise
            ``lqual.jsonl``.
        remove_null: Forwarded to ``read_scores``.
        data_dir: Directory that contains the processed score file.

    Returns:
        A ``(entries, id_list)`` tuple. ``entries`` lists the raw score
        entries that gained a ``metrics`` key, in the order they were first
        seen in the raw data. ``id_list`` lists their distinct ids; it is
        built from a set, so its order is not meaningful.

    Raises:
        ValueError: If the processed score file does not exist.
    """
    suffix = '_all' if all else ''
    file_path = os.path.join(data_dir, 'lqual{}.jsonl'.format(suffix))
    if not os.path.isfile(file_path):
        raise ValueError("The file {} does not exist!".format(file_path))

    raw_entries, _ = read_scores(all, remove_null)

    # The first raw entry seen for an (id, system) pair wins.
    by_key = {}
    for raw in raw_entries:
        by_key.setdefault("{}-{}".format(raw['id'], raw['sys_name']), raw)

    with jsonl.open(file_path) as reader:
        for record in reader:
            lookup = "{}-{}".format(record['id'], record['system'])
            if lookup not in by_key:
                continue
            overall = record['prompts']['overall']
            # 'gold' and 'human' are dropped; what remains is stored as the
            # entry's metrics.
            del overall['gold']
            del overall['human']
            by_key[lookup]['metrics'] = overall

    # Entries with no processed counterpart never received metrics and are
    # left out of the result.
    merged = [item for item in by_key.values() if 'metrics' in item]
    distinct_ids = list({item['id'] for item in merged})
    return merged, distinct_ids
def read_references(data_dir=RAW_DATA_DIR):
    """Read one reference per distinct id from ``lqual_all.jsonl``.

    Only the first record seen for each id is kept; later records with the
    same id are skipped.

    Args:
        data_dir: Directory that contains ``lqual_all.jsonl``.

    Returns:
        A ``(refs, id_list)`` tuple. ``refs`` is a list of ``OrderedDict``
        entries with the keys ``id`` and ``ref``. ``id_list`` holds the same
        ids in the same order.

    Raises:
        ValueError: If ``lqual_all.jsonl`` does not exist.
    """
    ref_path = os.path.join(data_dir, 'lqual_all.jsonl')
    if not os.path.isfile(ref_path):
        raise ValueError("The file {} does not exist!".format(ref_path))

    refs = []
    id_list = []
    # Companion set gives constant-time duplicate checks; the list alone
    # would be rescanned for every record. Assumes ids are hashable.
    seen_ids = set()

    with jsonl.open(ref_path) as file:
        for sample in file:
            sample_id = sample['id']
            if sample_id in seen_ids:
                continue
            seen_ids.add(sample_id)

            entry = OrderedDict()
            entry['id'] = sample_id
            entry['ref'] = sample['reference']
            refs.append(entry)
            id_list.append(sample_id)

    return refs, id_list