def dup_remove(doc, question):
    """
    Record paragraph lengths for a document and scan it for duplicates.

    This variant only scans: it fills ``doc['paragraphs_length']`` with the
    length of every entry of ``doc["segmented_paragraphs"]`` and tracks which
    paragraphs repeat an earlier one (same string once the segments are
    joined), but it never deletes anything from ``doc``.

    NOTE(review): a later definition of ``dup_remove`` in this file performs
    the actual removal and shadows this one - confirm whether this variant
    is still needed.

    Args:
        doc: a doc in the sample; must provide "segmented_paragraphs" and
            may provide 'most_related_para'.
        question: forwarded to find_best_question_match when doc has no
            'most_related_para' entry.
    Returns:
        bool: always False.
    Raises:
        None
    """
    if 'most_related_para' in doc:
        best_idx = doc['most_related_para']
    else:
        best_idx = find_best_question_match(doc, question)

    first_seen = {}      # joined paragraph text -> index of first occurrence
    duplicate_ids = []   # indices of paragraphs repeating an earlier one
    lengths = []
    doc['paragraphs_length'] = lengths

    for idx, tokens in enumerate(doc["segmented_paragraphs"]):
        lengths.append(len(tokens))
        text = ''.join(tokens)
        if text not in first_seen:
            first_seen[text] = idx
            continue
        duplicate_ids.append(idx)
        if idx == best_idx:
            # The tracked paragraph is a repeat: follow it back to the
            # first occurrence of the same text.
            best_idx = first_seen[text]

    return False
def dup_remove(doc, question):
    """
    For each document, remove the duplicated paragraphs (in place).

    Two paragraphs are duplicates when their segments join to the same
    string; the first occurrence is kept and later ones are dropped.

    Args:
        doc: a doc in the sample. Reads "segmented_paragraphs",
            "segmented_paragraphs_scores" and, when present,
            'most_related_para'. Always rewrites 'paragraphs_length'.
            When duplicates are found, the three per-paragraph lists are
            filtered in place, 'paragraphs' is rebuilt from the remaining
            segments and an existing 'most_related_para' is re-indexed.
        question: forwarded to find_best_question_match when doc has no
            'most_related_para' entry.
    Returns:
        bool: True if at least one duplicate was removed, False otherwise.
    Raises:
        KeyError: if doc is a dict lacking "segmented_paragraphs" or
            "segmented_paragraphs_scores".
    """
    if 'most_related_para' in doc:
        para_id = doc['most_related_para']
    else:
        # NOTE(review): the helper's result is never written back to doc;
        # the call is kept in case it has side effects - confirm intent.
        para_id = find_best_question_match(doc, question)

    first_seen = {}  # joined paragraph text -> index of first occurrence
    del_ids = []     # indices of paragraphs repeating an earlier one
    doc['paragraphs_length'] = []
    # zip() keeps the original pairing: the scan stops at the shorter list.
    paired = zip(doc["segmented_paragraphs"],
                 doc["segmented_paragraphs_scores"])
    for p_idx, (segmented_paragraph, _) in enumerate(paired):
        doc['paragraphs_length'].append(len(segmented_paragraph))
        paragraph = ''.join(segmented_paragraph)
        if paragraph in first_seen:
            del_ids.append(p_idx)
            if p_idx == para_id:
                # The most related paragraph is itself a duplicate: point
                # at the surviving first occurrence instead.
                para_id = first_seen[paragraph]
            continue
        first_seen[paragraph] = p_idx

    if not del_ids:
        return False

    # Compute the re-indexed id before mutating anything, and only when it
    # will actually be stored, so an unusable helper result cannot raise.
    has_related = 'most_related_para' in doc
    if has_related:
        shift = sum(1 for p_idx in del_ids if p_idx < para_id)
        new_para_id = para_id - shift

    # Single filtering pass per list; slice assignment keeps the same list
    # objects so outside references still see the update.
    dropped = set(del_ids)
    for key in ("segmented_paragraphs", "segmented_paragraphs_scores",
                'paragraphs_length'):
        doc[key][:] = [item for idx, item in enumerate(doc[key])
                       if idx not in dropped]

    if has_related:
        doc['most_related_para'] = new_para_id
    doc['paragraphs'] = [''.join(segmented_para)
                         for segmented_para in doc["segmented_paragraphs"]]
    return True