    def generate_samples(query_id, pred_sent_ids):
        all_sent_ids = []
        all_relevances = []

        for true_evidence_set in evidences[query_id]:
            sent_ids = [evidence for evidence in pred_sent_ids]
            relevances = [
                int(evidence in true_evidence_set)
                for evidence in pred_sent_ids
            ]

            # randomly insert relevant evidences if query is not NEI and not all true evidences are in sent_ids
            if len(true_evidence_set) != 0 and len(true_evidence_set) != sum(
                    relevances):
                for evidence in true_evidence_set:
                    # stop inserting if all evidences are relevant
                    if sum(relevances) == len(relevances):
                        break
                    if evidence not in sent_ids:
                        doc_id, _ = split_sentence_id(evidence)
                        docs[doc_id] = 'N/A'  # placeholder

                        overwrite_index = random.choice([
                            i for i in range(len(relevances))
                            if relevances[i] == 0
                        ])
                        sent_ids[overwrite_index] = evidence
                        relevances[overwrite_index] = 1

            all_sent_ids.append(sent_ids)
            all_relevances.append(relevances)

        return all_sent_ids, all_relevances

    def negative_sample(query_id, pred_sent_ids):
        neg_sent_ids = random.sample(pred_sent_ids,
                                     random.randint(1, args.max_evidences))

        for sent_id in neg_sent_ids:
            doc_id, _ = split_sentence_id(sent_id)
            docs[doc_id] = 'N/A'  # placeholder

        return [neg_sent_ids]
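These snippets lean on a pair of sentence-ID helpers that are not shown. A minimal sketch follows, assuming IDs take the form <doc_id>_<sentence_number>; the exact separator used by the original helpers is an assumption, but split_sentence_id must return the sentence number as an int because it is later used to index into extract_sentences(...).

def make_sentence_id(doc_id, sent_num):
    # Assumed format: join the wiki page id and the sentence index with an underscore.
    return f'{doc_id}_{sent_num}'


def split_sentence_id(sent_id):
    # Assumed inverse: split on the last underscore, since FEVER page ids
    # (e.g. 'Barack_Obama') can themselves contain underscores.
    doc_id, sent_num = sent_id.rsplit('_', 1)
    return doc_id, int(sent_num)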
Example #3
    def aggregate(query_id, scores, sent_ids):
        pred = {}

        best = np.argmax(scores[0])

        pred['id'] = query_id
        if best == 0:
            pred['predicted_label'] = 'REFUTES'
        elif best == 1:
            pred['predicted_label'] = 'NOT ENOUGH INFO'
        else:  # best == 2
            pred['predicted_label'] = 'SUPPORTS'
        pred['predicted_evidence'] = [list(split_sentence_id(sent)) for sent in sent_ids]

        return best, pred
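To illustrate how aggregate maps a three-way score vector to a verdict (the argmax branches above correspond to REFUTES, NOT ENOUGH INFO, SUPPORTS in that order), here is a small hypothetical call; the shape of scores, the query id, and the sentence ids are made up for the example.

import numpy as np

scores = np.array([[0.1, 0.2, 0.7]])  # hypothetical [REFUTES, NEI, SUPPORTS] scores
sent_ids = ['Barack_Obama_0', 'Barack_Obama_3']  # hypothetical sentence ids

best, pred = aggregate(137334, scores, sent_ids)
# best == 2, pred['predicted_label'] == 'SUPPORTS'
# pred['predicted_evidence'] == [['Barack_Obama', 0], ['Barack_Obama', 3]]
# (evidence split follows the split_sentence_id sketch above)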
Example #4
    def generate_samples(query_id, pred_sent_ids):
        curr_pred_evidences = []

        # include all ground truth relevant evidences as positive samples
        for sent_id in evidences[query_id]:
            curr_pred_evidences.append(sent_id)

        # sample negative evidences from pred_sent_ids
        neg_pred_sent_ids = [
            pred for pred in pred_sent_ids if pred not in evidences[query_id]
        ]
        neg_sent_ids = random.sample(
            neg_pred_sent_ids,
            min(len(evidences[query_id]), len(neg_pred_sent_ids)))
        for sent_id in neg_sent_ids:
            doc_id, _ = split_sentence_id(sent_id)
            docs[doc_id] = 'N/A'  # placeholder
            curr_pred_evidences.append(sent_id)

        return curr_pred_evidences
Example #5
def convert_run(args):
    queries = {}
    labels = {}
    evidences = {}
    docs = {}

    num_truncated = 0

    # read in dataset file and save queries (and labels, if present) to dicts
    with open(args.dataset_file, 'r', encoding='utf-8') as f:
        print('Reading FEVER dataset file...')
        for line in f:
            line_json = json.loads(line.strip())
            query_id = line_json['id']

            query = line_json['claim']
            queries[query_id] = query

            if args.has_labels:
                label = line_json['label']
                if label == 'SUPPORTS':
                    labels[query_id] = 'true'
                elif label == 'REFUTES':
                    labels[query_id] = 'false'
                else:  # label == 'NOT ENOUGH INFO'
                    labels[query_id] = 'weak'

    def generate_samples(query_id, pred_sent_ids):
        evidence_sets = []
        if args.format == 'concat':
            evidence_sets = [[sent_id for sent_id in pred_sent_ids]]
        elif args.format == 'agg':
            evidence_sets = [[sent_id] for sent_id in pred_sent_ids]
        else:  # args.format == 'seq':
            curr_preds = []
            for sent_id in pred_sent_ids:
                curr_preds.append(sent_id)
                evidence_sets.append([pred for pred in curr_preds])

        return evidence_sets

    # read in run file and keep the top-ranked predictions for each query
    with open(args.run_file, 'r', encoding='utf-8') as f:
        print('Reading run file...')
        curr_query = None
        pred_sent_ids = []
        for line in f:
            query_id, sent_id, rank = line.strip().split('\t')
            query_id = int(query_id)

            # if we reach a new query in the run file, generate samples for previous query
            if query_id != curr_query:
                if curr_query is not None:
                    evidences[curr_query] = generate_samples(
                        curr_query, pred_sent_ids)
                curr_query = query_id
                pred_sent_ids.clear()

            if int(rank) <= args.max_evidences:
                doc_id, _ = split_sentence_id(sent_id)
                docs[doc_id] = 'N/A'  # placeholder
                pred_sent_ids.append(sent_id)

        # handle the final query
        evidences[curr_query] = generate_samples(curr_query, pred_sent_ids)

    # read through all wiki dump files and save doc text for involved docs
    print('Reading wiki pages...')
    for file in os.listdir(args.collection_folder):
        with open(os.path.join(args.collection_folder, file),
                  'r',
                  encoding='utf-8') as f:
            for line in f:
                line_json = json.loads(line.strip())
                if line_json['id'] in docs:
                    docs[line_json['id']] = line_json['lines']

    # write query-doc text pairs to files
    with open(args.output_id_file, 'w', encoding='utf-8') as f_id, \
            open(args.output_text_file, 'w', encoding='utf-8') as f_text:
        print('Writing query-doc pairs to files...')
        for query_id, query_text in queries.items():
            if args.has_labels:
                label = labels[query_id]

            for evidence_ids in evidences[query_id]:
                evidence_texts = []
                for evidence in evidence_ids:
                    # get specific sentence from within doc_text
                    doc_id, sent_num = split_sentence_id(evidence)
                    entity = doc_id.replace(
                        '_', ' ')  # prepend entity name to document text
                    doc_text = docs[doc_id]
                    sent_text, _ = extract_sentences(doc_text)[sent_num]
                    evidence_texts.append(
                        f'{normalize_text(entity)} . {normalize_text(sent_text)}'
                    )

                # format evidence ids and texts for output
                evidence_ids_str = ' '.join(evidence_ids)
                prefixed_evidence_texts = []
                for i, evidence_text in enumerate(evidence_texts):
                    if args.format == 'agg':
                        prefixed_evidence_texts.append(
                            f'premise: {evidence_text}')
                    else:
                        truncated_text, num_truncated = truncate(
                            query_text, evidence_text, args.max_evidences,
                            args.max_seq_len, num_truncated)
                        prefixed_evidence_texts.append(
                            f'sentence{i + 1}: {truncated_text}')
                evidence_texts_str = ' '.join(prefixed_evidence_texts)

                if args.has_labels:
                    f_id.write(f'{query_id}\t{evidence_ids_str}\t{label}\n')
                else:
                    f_id.write(f'{query_id}\t{evidence_ids_str}\n')
                f_text.write(
                    f'hypothesis: {query_text} {evidence_texts_str}\n')

    print(f'Number of sentences truncated: {num_truncated}')
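Example #5 also calls truncate and normalize_text, which are not shown. The sketches below match the call signatures above but are only assumptions about their behavior: truncate is approximated with a whitespace-token budget (the original likely counts subword tokens), and normalize_text only undoes the escape tokens found in the FEVER wiki dump.

def truncate(query_text, evidence_text, max_evidences, max_seq_len,
             num_truncated):
    # Assumption: give each evidence sentence an equal share of whatever token
    # budget the claim leaves over, counting whitespace tokens.
    budget = max(1, (max_seq_len - len(query_text.split())) // max_evidences)
    tokens = evidence_text.split()
    if len(tokens) > budget:
        tokens = tokens[:budget]
        num_truncated += 1
    return ' '.join(tokens), num_truncated


def normalize_text(text):
    # Assumption: map the PTB-style escape tokens used in the FEVER wiki dump
    # back to their surface forms and collapse runs of whitespace.
    replacements = {
        '-LRB-': '(', '-RRB-': ')',
        '-LSB-': '[', '-RSB-': ']',
        '-LCB-': '{', '-RCB-': '}',
        '-COLON-': ':',
    }
    for token, char in replacements.items():
        text = text.replace(token, char)
    return ' '.join(text.split())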
Example #6
def generate_data(args):
    queries = {}
    evidences = {}
    pred_evidences = {}
    docs = {}

    # read in dataset file and save queries and evidences to dicts
    with open(args.dataset_file, 'r', encoding='utf-8') as f:
        print('Reading FEVER dataset file...')
        for line in f:
            line_json = json.loads(line.strip())

            query_id = line_json['id']

            query = line_json['claim']
            queries[query_id] = query

            # only save evidences for non-test sets and non-NEI queries
            deduped_evidence_set = set()
            if line_json['label'] != 'NOT ENOUGH INFO':
                for annotator in line_json['evidence']:
                    for evidence in annotator:
                        evidence[2] = ftfy.fix_text(evidence[2])
                        docs[evidence[2]] = 'N/A'  # placeholder
                        deduped_evidence_set.add(
                            make_sentence_id(evidence[2], evidence[3]))
            evidences[query_id] = deduped_evidence_set

    def generate_samples(query_id, pred_sent_ids):
        curr_pred_evidences = []

        # include all ground truth relevant evidences as positive samples
        for sent_id in evidences[query_id]:
            curr_pred_evidences.append(sent_id)

        # sample negative evidences from pred_sent_ids
        neg_pred_sent_ids = [
            pred for pred in pred_sent_ids if pred not in evidences[query_id]
        ]
        neg_sent_ids = random.sample(
            neg_pred_sent_ids,
            min(len(evidences[query_id]), len(neg_pred_sent_ids)))
        for sent_id in neg_sent_ids:
            doc_id, _ = split_sentence_id(sent_id)
            docs[doc_id] = 'N/A'  # placeholder
            curr_pred_evidences.append(sent_id)

        return curr_pred_evidences

    # read in run file and negative-sample from its ranking predictions
    with open(args.run_file, 'r', encoding='utf-8') as f:
        print('Reading run file...')
        curr_query = None
        pred_sent_ids = []
        for line in f:
            query_id, sent_id, rank = line.strip().split('\t')
            query_id = int(query_id)

            # if we reach a new query in the run file, perform sampling for the previous query
            if query_id != curr_query:
                if curr_query is not None:
                    pred_evidences[curr_query] = generate_samples(
                        curr_query, pred_sent_ids)
                curr_query = query_id
                pred_sent_ids.clear()

            if args.min_rank <= int(rank) <= args.max_rank:
                pred_sent_ids.append(sent_id)

        # perform sampling for the final query
        pred_evidences[curr_query] = generate_samples(curr_query,
                                                      pred_sent_ids)

    # read through all wiki dump files and save doc text for involved docs
    print('Reading wiki pages...')
    for file in os.listdir(args.collection_folder):
        with open(os.path.join(args.collection_folder, file),
                  'r',
                  encoding='utf-8') as f:
            for line in f:
                line_json = json.loads(line.strip())
                if line_json['id'] in docs:
                    docs[line_json['id']] = line_json['lines']

    # write query-doc text pairs to files
    with open(args.output_id_file, 'w', encoding='utf-8') as f_id, \
            open(args.output_text_file, 'w', encoding='utf-8') as f_text:
        print('Writing query-doc pairs to files...')
        for query_id, sent_ids in pred_evidences.items():
            query_text = queries[query_id]

            for rank, sent_id in enumerate(sent_ids):
                relevance = 'true' if sent_id in evidences[
                    query_id] else 'false'
                # get specific sentence from within doc_text
                doc_id, sent_num = split_sentence_id(sent_id)
                entity = doc_id.replace(
                    '_', ' ')  # prepend entity name to document text
                doc_text = docs[doc_id]
                sent_text, _ = extract_sentences(doc_text)[sent_num]

                f_id.write(f'{query_id}\t{sent_id}\t{rank + 1}\n')
                f_text.write(
                    f'Query: {query_text} Document: {entity} . {normalize_text(sent_text)} Relevant:\t{relevance}\n'
                )
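The generate_data above indexes the result of extract_sentences by sentence number. A plausible sketch is shown below, assuming the FEVER wiki 'lines' field holds one sentence per newline with tab-separated fields (sentence index, sentence text, then any linked page titles); the exact field layout is an assumption inferred from the sent_text, _ = extract_sentences(doc_text)[sent_num] calls.

def extract_sentences(doc_text):
    # Assumed parser for the FEVER 'lines' field. Keeping every line preserves
    # positional alignment, so the list can be indexed by sentence number as
    # the callers above do.
    sentences = []
    for line in doc_text.split('\n'):
        fields = line.split('\t')
        text = fields[1] if len(fields) > 1 else ''
        links = fields[2:]
        sentences.append((text, links))
    return sentences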
def convert_run(args):
    queries = {}
    evidences = {}
    pred_evidences = {}
    docs = {}

    # read in dataset file and save queries and evidences to dicts
    with open(args.dataset_file, 'r', encoding='utf-8') as f:
        print('Reading FEVER dataset file...')
        for line in f:
            line_json = json.loads(line.strip())

            query_id = line_json['id']

            query = line_json['claim']
            queries[query_id] = query

            # only save evidences for non-test sets and non-NEI queries
            deduped_evidence_set = set()
            if args.has_labels and line_json['label'] != 'NOT ENOUGH INFO':
                for annotator in line_json['evidence']:
                    for evidence in annotator:
                        evidence[2] = ftfy.fix_text(evidence[2])
                        docs[evidence[2]] = 'N/A'  # placeholder
                        deduped_evidence_set.add(
                            make_sentence_id(evidence[2], evidence[3]))
            evidences[query_id] = deduped_evidence_set

    # read in run file and save rankings to dict
    with open(args.run_file, 'r', encoding='utf-8') as f:
        print('Reading run file...')
        for line in f:
            query_id, sent_id, rank = line.strip().split('\t')
            query_id = int(query_id)
            doc_id, _ = split_sentence_id(sent_id)
            docs[doc_id] = 'N/A'  # placeholder
            if query_id not in pred_evidences:
                pred_evidences[query_id] = []
            if args.k is None or int(rank) <= args.k:
                pred_evidences[query_id].append(sent_id)

    # read through all wiki dump files and save doc text for involved docs
    print('Reading wiki pages...')
    for file in os.listdir(args.collection_folder):
        with open(os.path.join(args.collection_folder, file),
                  'r',
                  encoding='utf-8') as f:
            for line in f:
                line_json = json.loads(line.strip())
                if line_json['id'] in docs:
                    docs[line_json['id']] = line_json['lines']

    # write query-doc pairs to files
    with open(args.output_id_file, 'w', encoding='utf-8') as f_id, \
            open(args.output_text_file, 'w', encoding='utf-8') as f_text:
        print('Writing query-doc pairs to files...')
        for query_id, sent_ids in pred_evidences.items():
            query_text = queries[query_id]
            if args.type == 'mono':
                if args.ner:
                    ner_entities = extract_entities(query_text)

                for rank, sent_id in enumerate(sent_ids):
                    if args.has_labels:
                        relevance = 'true' if sent_id in evidences[
                            query_id] else 'false'

                    # get specific sentence from within doc_text
                    doc_id, sent_num = split_sentence_id(sent_id)
                    entity = doc_id.replace(
                        '_', ' ')  # prepend entity name to document text
                    doc_text = docs[doc_id]
                    sent_text, _ = extract_sentences(doc_text)[sent_num]

                    # write query-doc pair ids and texts
                    if args.has_labels:
                        f_id.write(
                            f'{query_id}\t{sent_id}\t{rank + 1}\t{relevance}\n'
                        )
                    else:
                        f_id.write(f'{query_id}\t{sent_id}\t{rank + 1}\n')
                    if args.ner:
                        numbered_entities = [
                            f'Entity{i + 1}: {entity}'
                            for i, entity in enumerate(ner_entities)
                        ]
                        entities_str = ' '.join(numbered_entities)
                        f_text.write(
                            f'Query: {query_text} Document: {entity} . {normalize_text(sent_text)} {entities_str} Relevant:\n'
                        )
                    else:
                        f_text.write(
                            f'Query: {query_text} Document: {entity} . {normalize_text(sent_text)} Relevant:\n'
                        )
            else:  # args.type == 'duo'
                ranked_sent_ids = [(sent_id, i)
                                   for i, sent_id in enumerate(sent_ids)]
                for (sent_id_1, rank_1), (sent_id_2,
                                          rank_2) in itertools.permutations(
                                              ranked_sent_ids, 2):
                    if args.has_labels:
                        relevance = 'true' if sent_id_1 in evidences[
                            query_id] else 'false'

                    # get specific sentence from within doc_text
                    doc_id_1, sent_1_num = split_sentence_id(sent_id_1)
                    entity_1 = doc_id_1.replace(
                        '_', ' ')  # prepend entity name to document text
                    doc_text_1 = docs[doc_id_1]
                    sent_1_text, _ = extract_sentences(doc_text_1)[sent_1_num]

                    doc_id_2, sent_2_num = split_sentence_id(sent_id_2)
                    entity_2 = doc_id_2.replace(
                        '_', ' ')  # prepend entity name to document text
                    doc_text_2 = docs[doc_id_2]
                    sent_2_text, _ = extract_sentences(doc_text_2)[sent_2_num]

                    # write query-doc pair ids and texts
                    if args.has_labels:
                        f_id.write(
                            f'{query_id}\t{sent_id_1}\t{rank_1 + 1}\t{sent_id_2}\t{rank_2 + 1}\t{relevance}\n'
                        )
                    else:
                        f_id.write(
                            f'{query_id}\t{sent_id_1}\t{rank_1 + 1}\t{sent_id_2}\t{rank_2 + 1}\n'
                        )
                    f_text.write(
                        f'Query: {query_text} Document1: {entity_1} . {normalize_text(sent_1_text)} Document2: {entity_2} . {normalize_text(sent_2_text)} Relevant:\n'
                    )
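When args.ner is enabled, convert_run above appends numbered entities extracted from the claim. The extract_entities helper is not shown; a hedged sketch using spaCy is below, purely as an assumption about how it could be implemented.

import spacy

_nlp = spacy.load('en_core_web_sm')  # assumption: any English spaCy pipeline with NER


def extract_entities(text):
    # Return the surface forms of the named entities found in the claim.
    return [ent.text for ent in _nlp(text).ents]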
def generate_data(args):
    queries = {}
    labels = {}
    evidences = {}
    docs = {}

    num_truncated = 0

    # read in dataset file and save queries and evidences to dicts
    with open(args.dataset_file, 'r', encoding='utf-8') as f:
        print('Reading FEVER dataset file...')
        for line in f:
            line_json = json.loads(line.strip())
            query_id = line_json['id']

            query = line_json['claim']
            queries[query_id] = query

            label = line_json['label']
            if label == 'SUPPORTS':
                labels[query_id] = 'true'
            elif label == 'REFUTES':
                labels[query_id] = 'false'
            else:  # label == 'NOT ENOUGH INFO'
                labels[query_id] = 'weak'

            annotators = []
            if label != 'NOT ENOUGH INFO':  # no evidence set for NEI queries, will sample from run files later
                for annotator in line_json['evidence']:
                    evidence_set = []
                    for evidence in annotator:
                        evidence[2] = ftfy.fix_text(evidence[2])
                        docs[evidence[2]] = 'N/A'  # placeholder
                        evidence_set.append(
                            make_sentence_id(evidence[2], evidence[3]))
                    annotators.append(evidence_set)
            evidences[query_id] = annotators

    # randomly sample a negative evidence set from pred_sent_ids (used for queries with no gold evidence)
    def negative_sample(query_id, pred_sent_ids):
        neg_sent_ids = random.sample(pred_sent_ids,
                                     random.randint(1, args.max_evidences))

        for sent_id in neg_sent_ids:
            doc_id, _ = split_sentence_id(sent_id)
            docs[doc_id] = 'N/A'  # placeholder

        return [neg_sent_ids]

    # read in run file and negative-sample from its rankings for queries without gold evidence
    with open(args.run_file, 'r', encoding='utf-8') as f:
        print('Reading run file...')
        curr_query = None
        pred_sent_ids = []
        for line in f:
            query_id, sent_id, rank = line.strip().split('\t')
            query_id = int(query_id)

            # if we reach a new query in the run file, perform sampling for previous query if needed
            if query_id != curr_query:
                if curr_query is not None and len(evidences[curr_query]) == 0:
                    evidences[curr_query] = negative_sample(
                        curr_query, pred_sent_ids)
                curr_query = query_id
                pred_sent_ids.clear()

            if args.min_rank <= int(rank) <= args.max_rank:
                pred_sent_ids.append(sent_id)

        # handle the final query
        if len(evidences[curr_query]) == 0:
            evidences[curr_query] = negative_sample(curr_query, pred_sent_ids)

    # read through all wiki dump files and save doc text for involved docs
    print('Reading wiki pages...')
    for file in os.listdir(args.collection_folder):
        with open(os.path.join(args.collection_folder, file),
                  'r',
                  encoding='utf-8') as f:
            for line in f:
                line_json = json.loads(line.strip())
                if line_json['id'] in docs:
                    docs[line_json['id']] = line_json['lines']

    # write query-doc text pairs to files
    with open(args.output_id_file, 'w', encoding='utf-8') as f_id, \
            open(args.output_text_file, 'w', encoding='utf-8') as f_text:
        print('Writing query-doc pairs to files...')
        for query_id, query_text in queries.items():
            label = labels[query_id]

            for evidence_ids in evidences[query_id]:
                evidence_texts = []
                for evidence in evidence_ids:
                    # get specific sentence from within doc_text
                    doc_id, sent_num = split_sentence_id(evidence)
                    entity = doc_id.replace(
                        '_', ' ')  # prepend entity name to document text
                    doc_text = docs[doc_id]
                    sent_text, _ = extract_sentences(doc_text)[sent_num]
                    evidence_texts.append(
                        f'{normalize_text(entity)} . {normalize_text(sent_text)}'
                    )

                if args.format == 'concat':
                    evidence_ids_str = ' '.join(evidence_ids)
                    prefixed_evidence_texts = []
                    for i, evidence_text in enumerate(evidence_texts):
                        truncated_text, num_truncated = truncate(
                            query_text, evidence_text, args.max_evidences,
                            args.max_seq_len, num_truncated)
                        prefixed_evidence_texts.append(
                            f'sentence{i + 1}: {truncated_text}')
                    evidence_texts_str = ' '.join(prefixed_evidence_texts)

                    f_id.write(f'{query_id}\t{evidence_ids_str}\n')
                    f_text.write(
                        f'hypothesis: {query_text} {evidence_texts_str}\t{label}\n'
                    )
                else:  # args.format == 'agg'
                    for evidence_id, evidence_text in zip(
                            evidence_ids, evidence_texts):
                        f_id.write(f'{query_id}\t{evidence_id}\n')
                        f_text.write(
                            f'hypothesis: {query_text} premise: {evidence_text}\t{label}\n'
                        )

    print(f'Number of sentences truncated: {num_truncated}')
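For context, a sketch of how the args namespace consumed by the generate_data above could be built; the flag names simply mirror the attributes the function reads, and the defaults are illustrative rather than the original script's.

import argparse


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset_file', required=True)
    parser.add_argument('--run_file', required=True)
    parser.add_argument('--collection_folder', required=True)
    parser.add_argument('--output_id_file', required=True)
    parser.add_argument('--output_text_file', required=True)
    parser.add_argument('--format', choices=['concat', 'agg'], default='concat')
    parser.add_argument('--min_rank', type=int, default=1)  # illustrative default
    parser.add_argument('--max_rank', type=int, default=5)  # illustrative default
    parser.add_argument('--max_evidences', type=int, default=5)  # illustrative default
    parser.add_argument('--max_seq_len', type=int, default=512)  # illustrative default
    return parser.parse_args()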
def generate_data(args):
    queries = {}
    labels = {}
    evidences = {}
    evidence_relevances = {}
    docs = {}

    num_truncated = 0

    # read in dataset file and save queries and evidences to dicts
    with open(args.dataset_file, 'r', encoding='utf-8') as f:
        print('Reading FEVER dataset file...')
        for line in f:
            line_json = json.loads(line.strip())
            query_id = line_json['id']

            query = line_json['claim']
            queries[query_id] = query

            label = line_json['label']
            if label == 'SUPPORTS':
                labels[query_id] = 'true'
            elif label == 'REFUTES':
                labels[query_id] = 'false'
            else:  # label == 'NOT ENOUGH INFO'
                labels[query_id] = 'weak'

            annotators = []
            if label != 'NOT ENOUGH INFO':  # no evidence set for NEI queries, will sample from run files later
                for annotator in line_json['evidence']:
                    evidence_set = []
                    for evidence in annotator:
                        evidence[2] = ftfy.fix_text(evidence[2])
                        evidence_set.append(
                            make_sentence_id(evidence[2], evidence[3]))
                    annotators.append(evidence_set)
            else:
                annotators.append([])
            evidences[query_id] = annotators

    # for each evidence set, check if all gold evidences are in pred_sent_ids and randomly insert if not present
    def generate_samples(query_id, pred_sent_ids):
        all_sent_ids = []
        all_relevances = []

        for true_evidence_set in evidences[query_id]:
            sent_ids = [evidence for evidence in pred_sent_ids]
            relevances = [
                int(evidence in true_evidence_set)
                for evidence in pred_sent_ids
            ]

            # randomly insert relevant evidences if query is not NEI and not all true evidences are in sent_ids
            if len(true_evidence_set) != 0 and len(true_evidence_set) != sum(
                    relevances):
                for evidence in true_evidence_set:
                    # stop inserting if all evidences are relevant
                    if sum(relevances) == len(relevances):
                        break
                    if evidence not in sent_ids:
                        doc_id, _ = split_sentence_id(evidence)
                        docs[doc_id] = 'N/A'  # placeholder

                        overwrite_index = random.choice([
                            i for i in range(len(relevances))
                            if relevances[i] == 0
                        ])
                        sent_ids[overwrite_index] = evidence
                        relevances[overwrite_index] = 1

            all_sent_ids.append(sent_ids)
            all_relevances.append(relevances)

        return all_sent_ids, all_relevances

    # read in run file and build samples from its ranking predictions for each query
    with open(args.run_file, 'r', encoding='utf-8') as f:
        print('Reading run file...')
        curr_query = None
        pred_sent_ids = []
        for line in f:
            query_id, sent_id, rank = line.strip().split('\t')
            query_id = int(query_id)

            # if we reach a new query in the run file, perform sampling for previous query if needed
            if query_id != curr_query:
                if curr_query is not None:
                    all_sent_ids, all_relevances = generate_samples(
                        curr_query, pred_sent_ids)
                    evidences[curr_query] = all_sent_ids
                    evidence_relevances[curr_query] = all_relevances
                curr_query = query_id
                pred_sent_ids.clear()

            if int(rank) <= args.max_evidences:
                doc_id, _ = split_sentence_id(sent_id)
                docs[doc_id] = 'N/A'  # placeholder
                pred_sent_ids.append(sent_id)

        # handle the final query
        all_sent_ids, all_relevances = generate_samples(
            curr_query, pred_sent_ids)
        evidences[curr_query] = all_sent_ids
        evidence_relevances[curr_query] = all_relevances

    # read through all wiki dump files and save doc text for involved docs
    print('Reading wiki pages...')
    for file in os.listdir(args.collection_folder):
        with open(os.path.join(args.collection_folder, file),
                  'r',
                  encoding='utf-8') as f:
            for line in f:
                line_json = json.loads(line.strip())
                if line_json['id'] in docs:
                    docs[line_json['id']] = line_json['lines']

    # write query-doc text pairs to files
    with open(args.output_id_file, 'w', encoding='utf-8') as f_id, \
            open(args.output_text_file, 'w', encoding='utf-8') as f_text:
        print('Writing query-doc pairs to files...')
        for query_id, query_text in queries.items():
            label = labels[query_id]

            for evidence_ids, relevances in zip(evidences[query_id],
                                                evidence_relevances[query_id]):
                evidence_texts = []
                for evidence in evidence_ids:
                    # get specific sentence from within doc_text
                    doc_id, sent_num = split_sentence_id(evidence)
                    entity = doc_id.replace(
                        '_', ' ')  # prepend entity name to document text
                    doc_text = docs[doc_id]
                    sent_text, _ = extract_sentences(doc_text)[sent_num]
                    evidence_texts.append(
                        f'{normalize_text(entity)} . {normalize_text(sent_text)}'
                    )

                # format evidence ids, relevances, and texts for output
                evidence_ids_str = ' '.join(evidence_ids)
                relevances_str = ','.join(
                    [str(relevance) for relevance in relevances])
                prefixed_evidence_texts = []
                for i, evidence_text in enumerate(evidence_texts):
                    truncated_text, num_truncated = truncate(
                        query_text, evidence_text, args.max_evidences,
                        args.max_seq_len, num_truncated)
                    prefixed_evidence_texts.append(
                        f'sentence{i + 1}: {truncated_text}')
                evidence_texts_str = ' '.join(prefixed_evidence_texts)

                f_id.write(
                    f'{query_id}\t{evidence_ids_str}\t{relevances_str}\n')
                f_text.write(
                    f'hypothesis: {query_text} {evidence_texts_str}\t{label}\n'
                )

    print(f'Number of sentences truncated: {num_truncated}')
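The two output files of this last variant are written line-aligned, so a downstream script can consume them in parallel. A hypothetical reader, with made-up file names, to make the format concrete:

with open('fever_ids.tsv', encoding='utf-8') as f_id, \
        open('fever_text.tsv', encoding='utf-8') as f_text:  # hypothetical paths
    for id_line, text_line in zip(f_id, f_text):
        query_id, sent_ids_str, relevances_str = id_line.rstrip('\n').split('\t')
        input_text, label = text_line.rstrip('\n').rsplit('\t', 1)
        sent_ids = sent_ids_str.split()  # space-joined sentence ids
        relevances = [int(r) for r in relevances_str.split(',')]  # 0/1 flags
        # label is 'true', 'false', or 'weak'; input_text starts with 'hypothesis:'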