def read_scores(all=False, remove_null=False, data_dir=RAW_DATA_DIR):
    """Load the annotation records stored in an ``lqual`` JSONL file.

    Args:
        all: When truthy, read ``lqual_all.jsonl``; otherwise read
            ``lqual.jsonl``.
        remove_null: When truthy, skip records whose ``overall`` score is
            ``None``.
        data_dir: Directory that holds the JSONL file.

    Returns:
        A ``(scores, ids)`` tuple. ``scores`` is a list of ``OrderedDict``
        entries with the keys ``id``, ``ref``, ``sys_name``, ``sys_summ`` and
        ``scores`` (the record's ``output`` mapping without ``_responses``).
        ``ids`` holds the distinct ids of the kept entries, in no particular
        order.

    Raises:
        ValueError: If the JSONL file does not exist.
    """
    suffix = '_all' if all else ''
    file_path = os.path.join(data_dir, 'lqual{}.jsonl'.format(suffix))

    if not os.path.isfile(file_path):
        raise ValueError("The file {} does not exist!".format(file_path))

    kept_entries = []
    kept_ids = set()

    with jsonl.open(file_path) as records:
        for record in records:
            contents = record['input']['contents']
            entry = OrderedDict([
                ('id', contents['id']),
                ('ref', contents['reference']),
                ('sys_name', contents['system']),
                ('sys_summ', contents['text']),
            ])

            # Strip the ``_responses`` field; everything else in ``output``
            # is kept as the entry's scores.
            output = record['output']
            del output['_responses']
            entry['scores'] = output

            if remove_null and output['overall'] is None:
                continue

            kept_entries.append(entry)
            kept_ids.add(entry['id'])

    return kept_entries, list(kept_ids)
def read_article_refs(ids=None, as_dict=False, data_dir=RAW_DATA_DIR):
    """Attach the source article text to every reference summary.

    References are loaded with ``read_references`` and each one whose id
    appears in ``articles.jsonl`` gets an ``article`` key holding that
    sample's ``text``. If an id occurs several times in the article file,
    the last occurrence wins.

    Args:
        ids: Optional container of ids. When given, only references whose id
            is ``in ids`` are returned.
        as_dict: When truthy, return a dict keyed by id instead of a list.
        data_dir: Directory that holds ``articles.jsonl`` and the reference
            file read by ``read_references``.

    Returns:
        If ``as_dict`` is falsy, the list of reference entries (each with
        ``id``, ``ref`` and, when a matching article exists, ``article``).
        Otherwise a dict mapping each id to
        ``{'article': ..., 'ref': ...}``.

    Raises:
        ValueError: If ``articles.jsonl`` (or the reference file) does not
            exist.
        KeyError: If ``as_dict`` is truthy and a selected reference has no
            matching article.
    """
    # Honour ``data_dir`` for the references too, so both files are read
    # from the same directory instead of silently mixing in the default one.
    refs, _ = read_references(data_dir=data_dir)

    article_path = os.path.join(data_dir, 'articles.jsonl')

    if not os.path.isfile(article_path):
        raise ValueError("The file {} does not exist!".format(article_path))

    # ``read_references`` yields one entry per id, so a dict gives a
    # constant-time lookup in place of a list scan per article.
    refs_by_id = {ref['id']: ref for ref in refs}

    with jsonl.open(article_path) as file:
        for sample in file:
            ref = refs_by_id.get(sample['id'])

            # Articles without a reference are ignored.
            if ref is not None:
                ref['article'] = sample['text']

    if ids is not None:
        refs = [r for r in refs if r['id'] in ids]

    if as_dict:
        return {
            r['id']: {
                'article': r['article'],
                'ref': r['ref']
            }
            for r in refs
        }

    return refs
def read_samples(data_dir=PROCESSED_DATA_DIR):
    """Read the whole gzipped ``samples.jsonl.gz`` file.

    Args:
        data_dir: Directory that holds ``samples.jsonl.gz``.

    Returns:
        The result of calling ``read()`` on the opened ``jsonl`` file.

    Raises:
        ValueError: If ``samples.jsonl.gz`` does not exist in ``data_dir``.
    """
    path = os.path.join(data_dir, 'samples.jsonl.gz')

    if os.path.isfile(path):
        with jsonl.open(path, gzip=True) as handle:
            contents = handle.read()

        return contents

    raise ValueError(
        "The file {} does not exist!".format(path))
def read_articles(data_dir=RAW_DATA_DIR):
    """Load every article from ``articles.jsonl``.

    Args:
        data_dir: Directory that holds ``articles.jsonl``.

    Returns:
        An ``(articles, article_id_list)`` tuple. ``articles`` contains one
        ``OrderedDict`` with the keys ``id`` and ``article`` per sample in
        the file (duplicates included). ``article_id_list`` contains each
        distinct id once, in order of first appearance.

    Raises:
        ValueError: If ``articles.jsonl`` does not exist.
    """
    article_path = os.path.join(data_dir, 'articles.jsonl')

    if not os.path.isfile(article_path):
        raise ValueError("The file {} does not exist!".format(article_path))

    articles = []
    article_id_list = []
    # Companion set so the de-duplication check is constant-time instead of
    # a scan of ``article_id_list`` for every sample.
    seen_ids = set()

    with jsonl.open(article_path) as article_file:
        for sample in article_file:
            entry = OrderedDict()
            entry['id'] = sample['id']
            entry['article'] = sample['text']
            articles.append(entry)

            if entry['id'] not in seen_ids:
                seen_ids.add(entry['id'])
                article_id_list.append(entry['id'])

    return articles, article_id_list
def read_processed_scores(all=False,
                          remove_null=False,
                          data_dir=PROCESSED_DATA_DIR):
    """Merge the entries from ``read_scores`` with processed metrics.

    Entries returned by ``read_scores`` (called with its default data
    directory) are keyed by ``"<id>-<sys_name>"``; the first entry per key
    is kept. Every sample in the processed ``lqual`` file with a matching
    ``"<id>-<system>"`` key contributes its ``prompts['overall']`` mapping,
    minus the ``gold`` and ``human`` fields, as the entry's ``metrics``.
    Entries that never receive metrics are dropped.

    Args:
        all: When truthy, use ``lqual_all.jsonl``; otherwise ``lqual.jsonl``.
            Also forwarded to ``read_scores``.
        remove_null: Forwarded to ``read_scores``.
        data_dir: Directory that holds the processed JSONL file.

    Returns:
        A ``(entries, ids)`` tuple: the merged entries and the distinct ids
        among them (in no particular order).

    Raises:
        ValueError: If the processed JSONL file does not exist.
    """
    processed_path = os.path.join(
        data_dir, 'lqual{}.jsonl'.format('_all' if all else ''))

    if not os.path.isfile(processed_path):
        raise ValueError("The file {} does not exist!".format(processed_path))

    raw_entries, _ = read_scores(all, remove_null)

    # First entry for each (id, system) pair wins.
    entry_by_key = {}
    for raw_entry in raw_entries:
        entry_by_key.setdefault(
            "{}-{}".format(raw_entry['id'], raw_entry['sys_name']), raw_entry)

    with jsonl.open(processed_path) as processed_file:
        for record in processed_file:
            target = entry_by_key.get(
                "{}-{}".format(record['id'], record['system']))

            if target is None:
                continue

            overall = record['prompts']['overall']
            del overall['gold']
            del overall['human']

            target['metrics'] = overall

    # Keep only the entries that were matched by a processed record.
    merged = [entry for entry in entry_by_key.values() if 'metrics' in entry]
    id_list = list({entry['id'] for entry in merged})

    return merged, id_list
def read_references(data_dir=RAW_DATA_DIR):
    """Load one reference summary per id from ``lqual_all.jsonl``.

    Args:
        data_dir: Directory that holds ``lqual_all.jsonl``.

    Returns:
        A ``(refs, id_list)`` tuple. ``refs`` is a list of ``OrderedDict``
        entries with the keys ``id`` and ``ref``, one per distinct id (the
        first sample seen for an id is used). ``id_list`` holds the same ids
        in the same order.

    Raises:
        ValueError: If ``lqual_all.jsonl`` does not exist.
    """
    ref_path = os.path.join(data_dir, 'lqual_all.jsonl')

    if not os.path.isfile(ref_path):
        raise ValueError("The file {} does not exist!".format(ref_path))

    refs = []
    id_list = []
    # Companion set so the duplicate check is constant-time instead of a
    # scan of ``id_list`` for every sample.
    seen_ids = set()

    with jsonl.open(ref_path) as file:
        for sample in file:
            if sample['id'] in seen_ids:
                continue

            entry = OrderedDict()
            entry['id'] = sample['id']
            entry['ref'] = sample['reference']

            refs.append(entry)
            id_list.append(entry['id'])
            seen_ids.add(entry['id'])

    return refs, id_list