def summarize_score(info: Dict,
                    prediction_file_path: str,
                    f_handler: FormatHandler,
                    combine_score: Callable,
                    score_type) -> Dict[Tuple[str, str], float]:
    key_logit = "logits"
    data: List[Dict] = join_prediction_with_info(prediction_file_path, info, ["data_id", key_logit])

    def logit_to_score_softmax(logit):
        return scipy.special.softmax(logit)[1]

    def get_score(entry):
        if score_type == "softmax":
            return logit_to_score_softmax(entry[key_logit])
        elif score_type == "raw":
            return entry[key_logit][0]
        elif score_type == "scalar":
            return entry[key_logit]
        elif score_type == "tuple":
            return entry[key_logit][1]
        else:
            assert False

    grouped: Dict[Tuple[str, str], List[Dict]] = group_by(data, f_handler.get_pair_id)
    tprint("Group size:", len(grouped))
    out_d = {}
    for pair_id, items in grouped.items():
        scores = lmap(get_score, items)
        final_score = combine_score(scores)
        out_d[pair_id] = final_score

    num_items_per_group = average(lmap(len, grouped.values()))
    tprint("Num items per group :", num_items_per_group)
    return out_d
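# A minimal illustration of the "softmax" score type above: for 2-class logits,
# scipy.special.softmax(logit)[1] is the probability of the positive class.
# Self-contained check with hypothetical logits:
import scipy.special

logit = [0.3, 2.1]  # hypothetical [non-relevant, relevant] logits
score = scipy.special.softmax(logit)[1]
print(round(float(score), 4))  # ~0.8581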
def main():
    tprint("loading counter dict")
    counter_dict: Dict[str, Counter] = load_counter_dict()

    def get_doc_lm(doc_id) -> Counter:
        counter = counter_dict[doc_id]
        n_tf = sum(counter.values())
        out_counter = Counter()
        for word, cnt in counter.items():
            out_counter[word] = cnt / n_tf
        return out_counter

    qrel = load_robust_qrel()

    def get_pos_docs(query_id):
        if query_id not in qrel:
            return
        judgement = qrel[query_id]
        for doc_id, score in judgement.items():
            if score:
                yield doc_id

    tprint("build query lm dict")
    query_lm_dict = {}
    queries = list(qrel.keys())
    for query_id in queries:
        pos_docs_ids: Iterable[str] = get_pos_docs(query_id)
        pos_doc_lms: List[Counter] = lmap(get_doc_lm, pos_docs_ids)
        query_lm: Counter = average_counters(pos_doc_lms)
        query_lm_dict[query_id] = query_lm
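# average_counters is assumed to average term probabilities across the positive
# documents' language models; a minimal sketch of that assumed behavior:
from collections import Counter
from typing import List

def average_counters_sketch(counters: List[Counter]) -> Counter:
    out = Counter()
    for c in counters:
        for term, p in c.items():
            out[term] += p / len(counters)
    return out

lm_a = Counter({"jet": 0.5, "engine": 0.5})
lm_b = Counter({"jet": 1.0})
print(average_counters_sketch([lm_a, lm_b]))  # Counter({'jet': 0.75, 'engine': 0.25})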
def generate_selected_training_data_ablation_only_pos(info, key, max_seq_length, save_dir, score_dir):
    data_id_manager = DataIDManager(0, 1000000)
    out_path = os.path.join(save_dir, str(key))
    pred_path = os.path.join(score_dir, str(key))
    tprint("data gen")
    itr = enum_best_segments(pred_path, info)
    insts = []
    for selected_entry in itr:
        selected = decompress_seg_ids_entry(selected_entry)
        assert len(selected['input_ids']) == len(selected['seg_ids'])
        selected['input_ids'] = pad0(selected['input_ids'], max_seq_length)
        selected['seg_ids'] = pad0(selected['seg_ids'], max_seq_length)
        # data_id = data_id_manager.assign(selected_segment.to_info_d())
        data_id = 0
        ci = InstAsInputIds(
            selected['input_ids'],
            selected['seg_ids'],
            selected['label'],
            data_id)
        insts.append(ci)

    def encode_fn(inst: InstAsInputIds) -> collections.OrderedDict:
        return encode_inst_as_input_ids(max_seq_length, inst)

    tprint("writing")
    write_records_w_encode_fn(out_path, encode_fn, insts, len(insts))
    save_info(save_dir, data_id_manager, str(key) + ".info")
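# pad0 is assumed to right-pad a sequence with zeros up to max_seq_length,
# which keeps input_ids and seg_ids aligned as the assert requires. A sketch:
from typing import List

def pad0_sketch(seq: List[int], target_len: int) -> List[int]:
    return seq + [0] * (target_len - len(seq))

print(pad0_sketch([101, 2023, 102], 8))  # [101, 2023, 102, 0, 0, 0, 0, 0]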
def do_predict(
        bert_hp,
        train_config,
        data,
        lms_config,
        modeling_option,
        init_fn,
):
    num_gpu = train_config.num_gpu
    train_batches, dev_batches = data
    lms_model = LMSModel(modeling_option, bert_hp, lms_config, num_gpu)
    sess = init_session()
    sess.run(tf.global_variables_initializer())
    init_fn(sess)
    step_size = 100
    for i in range(100):
        st = i * step_size
        ed = st + step_size
        # make sure the explain train_op does not increase the global step
        tprint(st, ed)
        output_d = predict_fn(sess,
                              train_batches[st:ed],
                              lms_model.logits,
                              lms_model.loss_tensor,
                              lms_model.ex_score_tensor,
                              lms_model.per_layer_logit_tensor,
                              lms_model.batch2feed_dict)
        save_path = at_output_dir("lms_scores", str(i))
        save_to_pickle(output_d, save_path)
def summarize_score_wo_merge(info: Dict,
                             prediction_file_path: str,
                             f_handler: FormatHandler,
                             score_type) -> Dict[Tuple[str, str], float]:
    key_logit = "logits"
    data: List[Dict] = join_prediction_with_info(prediction_file_path, info, ["data_id", key_logit])

    def logit_to_score_softmax(logit):
        return scipy.special.softmax(logit)[1]

    def get_score(entry):
        if score_type == "softmax":
            return logit_to_score_softmax(entry[key_logit])
        elif score_type == "raw":
            return entry[key_logit][0]
        elif score_type == "scalar":
            return entry[key_logit]
        elif score_type == "tuple":
            return entry[key_logit][1]
        else:
            assert False

    grouped: Dict[Tuple[str, str], List[Dict]] = group_by(data, f_handler.get_pair_id)
    tprint("Group size:", len(grouped))
    out_d = {}
    for pair_id, items in grouped.items():
        query_id, doc_id = pair_id
        scores = lmap(get_score, items)
        for idx, score in enumerate(scores):
            new_doc_id = "{}_{}".format(doc_id, idx)
            out_d[(query_id, new_doc_id)] = score
    return out_d
def qk_candidate_gen(q_res_path: str, doc_score_path, split, config) -> List[Tuple[QCKQuery, List[KDP]]]:
    queries: List[QCKQuery] = get_qck_queries(split)
    num_jobs = d_n_claims_per_split2[split]
    score_d = load_doc_scores(doc_score_path, num_jobs)
    tprint("loading ranked list")
    ranked_list: Dict[str, List[SimpleRankedListEntry]] = load_galago_ranked_list(q_res_path)
    query_ids = list(ranked_list.keys())
    query_ids.sort()
    print("num queries", len(query_ids))
    q_id_to_job_id = {q_id: job_id for job_id, q_id in enumerate(query_ids)}
    print("Pre loading docs")
    top_n = config['top_n']
    out_qk: List[Tuple[QCKQuery, List[KnowledgeDocumentPart]]] = []
    all_doc_parts = 0
    ticker = TimeEstimator(len(queries))
    for q in queries:
        job_id: int = q_id_to_job_id[q.query_id]
        entries: List = score_d[job_id]
        entries.sort(key=get_second, reverse=True)
        doc_ids = left(entries)
        doc_ids = doc_ids[:top_n]
        preload_man.preload(TokenizedCluewebDoc, doc_ids)
        docs = iterate_docs(doc_ids)
        doc_part_list: List[KDP] = iterate_document_parts(docs, config['window_size'], config['step_size'], 20)
        all_doc_parts += len(doc_part_list)
        out_qk.append((q, doc_part_list))
        ticker.tick()
    return out_qk
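# The top-n selection above relies on two helpers: get_second is assumed to
# pick the score of a (doc_id, score) pair and left the first elements.
# A self-contained sketch of that selection step:
entries = [("D1", 0.2), ("D2", 0.9), ("D3", 0.5)]
entries.sort(key=lambda pair: pair[1], reverse=True)
doc_ids = [doc_id for doc_id, _ in entries][:2]
print(doc_ids)  # ['D2', 'D3']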
def generate_instances(self, job_id, data_id_man):
    q_id = self.job_id_to_q_id[job_id]
    query_text = self.query_d[int(q_id)]
    query_tokens = self.tokenizer.tokenize(query_text)
    ranked_list = self.ranked_list[q_id][:1000]
    doc_ids = list([e.doc_id for e in ranked_list])
    tprint("Loading documents start")
    docs_d: Dict[str, List[List[str]]] = load_multiple(BertTokenizedCluewebDoc, doc_ids, True)
    tprint("Loading documents done")
    avail_seq_length = self.max_seq_length - len(query_tokens) - 3
    label_dummy = 0
    not_found = 0
    for doc_id in doc_ids:
        try:
            doc: List[List[str]] = docs_d[doc_id]
            passages: Iterable[List[str]] = enum_passages(doc, avail_seq_length)
            for passage_idx, p in enumerate(passages):
                if passage_idx > 9:
                    break
                data_id = data_id_man.assign({
                    'query_id': q_id,
                    'doc_id': doc_id,
                    'passage_idx': passage_idx
                })
                yield Instance(query_tokens, p, label_dummy, data_id)
        except KeyError:
            not_found += 1
    print("{} of {} docs not found".format(not_found, len(doc_ids)))
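# enum_passages is assumed to flatten a sentence-tokenized document and cut it
# into consecutive passages of at most avail_seq_length tokens. A sketch of
# that assumed behavior:
from typing import Iterator, List

def enum_passages_sketch(doc: List[List[str]], window: int) -> Iterator[List[str]]:
    tokens = [t for sent in doc for t in sent]
    for st in range(0, len(tokens), window):
        yield tokens[st:st + window]

doc = [["a", "b", "c"], ["d", "e"]]
print(list(enum_passages_sketch(doc, 2)))  # [['a', 'b'], ['c', 'd'], ['e']]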
def main():
    args = parse_arg()
    param = json.load(open(args.config_path, "r"))
    # prepare_model_and_data now also returns the embedding matrix, which the
    # Histogram callback needs (it was previously an undefined name here)
    model, embedding_matrix, train_processed, valid_processed = prepare_model_and_data(param)
    hist_callback = mz.data_generator.callbacks.Histogram(embedding_matrix, bin_size=30, hist_mode='LCH')
    tprint("defining generator")
    train_generator = mz.DataGenerator(train_processed,
                                       batch_size=param['batch_size'],
                                       shuffle=True,
                                       callbacks=[hist_callback])
    valid_x, valid_y = valid_processed.unpack()
    evaluate = mz.callbacks.EvaluateAllMetrics(model, x=valid_x, y=valid_y, batch_size=len(valid_x))
    early_stop = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=1, verbose=1, mode='min')
    tprint("fitting")
    callbacks = [evaluate]
    if param['early_stop']:
        callbacks.append(early_stop)
    history = model.fit_generator(train_generator,
                                  epochs=100,
                                  callbacks=callbacks,
                                  workers=5,
                                  use_multiprocessing=False)
def main():
    info_path = sys.argv[1]
    # TODO remove
    # sh_format_path = sys.argv[2]
    # model_name = sys.argv[3]
    # step = int(sys.argv[4])
    # model_sub_path = "{}/model.ckpt-{}".format(model_name, step)
    # TODO remove end
    run_info = json.load(open(info_path, "r"))
    job_info_list = run_info['job_info_list']
    job_group_name = run_info['job_group_name']
    save_dir = run_info['save_dir']
    data_st = run_info['data_st']
    data_ed = run_info['data_ed']
    sh_format_path = run_info['sh_format_path']
    model_sub_path = run_info['model_sub_path']
    if 'rerun_jobs' in run_info and run_info['rerun_jobs']:
        new_job_info_list = rerun_jobs(sh_format_path, model_sub_path, save_dir, job_group_name, job_info_list)
        job_info_list = new_job_info_list
    print("len(job_info_list)", len(job_info_list))
    tprint("Waiting for files")
    wait_files(job_info_list)
    tprint("Make ranked list")
    make_ranked_list_from_multiple_files(job_group_name, save_dir, data_st, data_ed)
def label_predict(hparam, data, model_path) -> List[np.array]:
    tprint("building model")
    voca_size = 30522
    task = transformer_logit(hparam, 2, voca_size, False)
    enc_payload: List[Tuple[np.array, np.array, np.array]] = data
    sout = tf.nn.softmax(task.logits, axis=-1)
    sess = init_session()
    sess.run(tf.global_variables_initializer())
    tprint("loading model")
    load_model(sess, model_path)

    def forward_run(inputs):
        batches = get_batches_ex(inputs, hparam.batch_size, 3)
        logit_list = []
        ticker = TimeEstimator(len(batches))
        for batch in batches:
            x0, x1, x2 = batch
            soft_out, = sess.run([sout],
                                 feed_dict={
                                     task.x_list[0]: x0,
                                     task.x_list[1]: x1,
                                     task.x_list[2]: x2,
                                 })
            logit_list.append(soft_out)
            ticker.tick()
        return np.concatenate(logit_list)

    logits = forward_run(enc_payload)
    return logits
def summarize(self):
    topic = data_generator.argmining.ukp_header.all_topics[0]
    data_loader = ukp.DataLoader(topic)
    stopwords = load_stopwords()

    def tokenize(x):
        return tokenizer.tokenize(x, stopwords)

    def sent_score(token_sent, bow_score):
        score = 0
        factor = 1
        for t in token_sent:
            score += bow_score[t] * factor
            factor *= 0.5
        return score

    def is_argument(entry):
        return entry['annotation'] == "Argument_for" or entry['annotation'] == "Argument_against"

    for topic in data_generator.argmining.ukp_header.all_topics:
        entries = data_loader.all_data[topic]
        raw_sents = list([e['sentence'] for e in entries if e['set'] == 'train'])
        token_sents = list(map(tokenize, raw_sents))
        tprint("Running TextRank")
        text_rank = TextRank(token_sents)
        tr_score = Counter(text_rank.run(flatten(token_sents)))
        tprint("claim_gen.generate")
        raw_sents.sort(key=lambda x: sent_score(tokenize(x), tr_score), reverse=True)
        for i in range(10):
            print(raw_sents[i])
def check_wait_tasks(active_proc_list):
    num_tasks = active_proc_list.update_alive()
    tprint("Number of active tasks :", num_tasks)
    while num_tasks > max_task:
        tprint("Waiting for tasks to be done")
        time.sleep(60)
        num_tasks = active_proc_list.update_alive()
def execute(job_id):
    out = open(get_log_path(job_id), "w")
    p = psutil.Popen(["/bin/bash", get_sh_path_for_job_id(job_id)],
                     stdout=out,
                     stderr=out,
                     preexec_fn=preexec_function)
    tprint("Executed job {}. pid={}".format(job_id, p.pid))
    return p
def main():
    data_name = sys.argv[1]
    tprint("Loading idf scores")
    get_idf = load_idf_fn_for(data_name)
    problems: List[QueryDoc] = load_as_tokens(data_name)
    save_name = "{}_idf.txt".format(data_name)
    save_path = os.path.join(get_genex_run_save_dir(), save_name)
    scores_list: Iterable[Counter] = get_idf_keyword_score(problems, get_idf)
    save_score_to_file(scores_list, save_path)
def load_info_from_compressed(pickle_path):
    tprint("loading info pickle")
    output_d = {}
    data = load_pickle_from(pickle_path)
    tprint("decompressing...")
    for data_id, value_d in data.items():
        new_entry = decompress_seg_ids_entry(value_d)
        output_d[data_id] = new_entry
    return output_d
def get_score_d(pred_file_path: str, info: Dict, f_handler: FormatHandler,
                combine_strategy: str, score_type: str):
    tprint("Reading from :", pred_file_path)
    DOC_SEG_COMBINE = 0
    DOC_PART_SEG_COMBINE = 1
    NO_COMBINE = 2
    if combine_strategy == "top_k":
        print("using top k")
        combine_score = top_k_average
        combine_type = DOC_SEG_COMBINE
    elif combine_strategy == "avg":
        print("using avg")
        combine_score = average
        combine_type = DOC_SEG_COMBINE
    elif combine_strategy == "max":
        print("using max")
        combine_type = DOC_SEG_COMBINE
        combine_score = max
    elif combine_strategy == "non_tail_max":
        combine_type = DOC_SEG_COMBINE
        combine_score = non_tail_max
    elif combine_strategy == "first4_max":
        combine_type = DOC_SEG_COMBINE
        combine_score = first4_max
    elif combine_strategy == "first":
        combine_type = DOC_SEG_COMBINE
        combine_score = select_first
    elif combine_strategy == "avg_then_doc_max":
        print("using avg then doc max")
        combine_type = DOC_PART_SEG_COMBINE
        combine_score = average
    elif combine_strategy == "max_then_doc_max":
        print("using max then doc max")
        combine_type = DOC_PART_SEG_COMBINE
        combine_score = max
    elif combine_strategy == "no_merge":
        combine_type = NO_COMBINE
        combine_score = None
    else:
        assert False

    if combine_type == DOC_SEG_COMBINE:
        score_d = summarize_score(info, pred_file_path, f_handler, combine_score, score_type)
    elif combine_type == DOC_PART_SEG_COMBINE:
        score_d = summarize_score(info, pred_file_path, f_handler, combine_score, score_type)
        score_d = get_max_score_from_doc_parts(score_d)
    elif combine_type == NO_COMBINE:
        score_d = summarize_score_wo_merge(info, pred_file_path, f_handler, score_type)
    else:
        assert False
    return score_d
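# The combine helpers above are not shown in this file. Plausible sketches,
# assuming top_k_average means "mean of the k highest scores" and non_tail_max
# means "max over all but the last segment" (both readings inferred from the
# names, not confirmed by the source):
from statistics import mean
from typing import List

def top_k_average_sketch(scores: List[float], k: int = 3) -> float:
    return mean(sorted(scores, reverse=True)[:k])

def non_tail_max_sketch(scores: List[float]) -> float:
    return max(scores[:-1]) if len(scores) > 1 else scores[0]

print(top_k_average_sketch([0.1, 0.9, 0.4, 0.8]))  # mean of 0.9, 0.8, 0.4 ~ 0.7
print(non_tail_max_sketch([0.1, 0.9, 0.4]))        # 0.9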
def main():
    print("Process started")
    for split in splits:
        tprint("Loading pickles")
        job_name = "argu_debate_qck_datagen_{}".format(split)
        qk_candidate: List[QKUnit] = load_qk(split)
        candidate_dict, correct_d = load_from_pickle(job_name + "_base_resource")
        tprint("Starting job")
        start_job(job_name, split, candidate_dict, correct_d, qk_candidate)
def get_lm_for_claim(all_ranked_list, cid, unigrams):
    ranked_list = all_ranked_list.get(str(cid))
    doc_ids = [t[0] for t in ranked_list]
    tprint("Loading documents")
    preload_docs(doc_ids)
    preload_tf(doc_ids)
    docs = lmap(load_and_format_doc, doc_ids)
    tprint("building clm document")
    lm_classifier = build_lm(docs, unigrams)
    return lm_classifier
def baseline_predict(hparam, nli_setting, data, method_name, model_path) -> List[np.array]:
    tprint("building model")
    voca_size = 30522
    task = transformer_logit(hparam, 2, voca_size, False)
    enc_payload: List[Tuple[np.array, np.array, np.array]] = data
    sout = tf.nn.softmax(task.logits, axis=-1)
    sess = init_session()
    sess.run(tf.global_variables_initializer())
    tprint("loading model")
    load_model(sess, model_path)

    def forward_run(inputs):
        batches = get_batches_ex(inputs, hparam.batch_size, 3)
        logit_list = []
        ticker = TimeEstimator(len(batches))
        for batch in batches:
            x0, x1, x2 = batch
            soft_out, = sess.run([sout],
                                 feed_dict={
                                     task.x_list[0]: x0,
                                     task.x_list[1]: x1,
                                     task.x_list[2]: x2,
                                 })
            logit_list.append(soft_out)
            ticker.tick()
        return np.concatenate(logit_list)

    # train_batches, dev_batches = self.load_nli_data(data_loader)
    def idf_explain(enc_payload, explain_tag, forward_run):
        train_batches, dev_batches = get_nli_data(hparam, nli_setting)
        idf_scorer = IdfScorer(train_batches)
        return idf_scorer.explain(enc_payload, explain_tag, forward_run)

    todo_list = [
        ('deletion_seq', explain_by_seq_deletion),
        ('replace_token', explain_by_replace),
        ('term_deletion', explain_by_term_deletion),
        ('term_replace', explain_by_term_replace),
        ('random', explain_by_random),
        ('idf', idf_explain),
        ('deletion', explain_by_deletion),
        ('LIME', explain_by_lime),
    ]
    method_dict = dict(todo_list)
    method = method_dict[method_name]
    explain_tag = "mismatch"
    explains: List[np.array] = method(enc_payload, explain_tag, forward_run)
    # pred_list = predict_translate(explains, data_loader, enc_payload, plain_payload)
    return explains
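# get_batches_ex is assumed to split a list of 3-field examples into batches of
# column-stacked arrays, matching the "x0, x1, x2 = batch" unpacking above.
# A self-contained sketch of that assumed behavior:
import numpy as np

def get_batches_sketch(inputs, batch_size):
    batches = []
    for st in range(0, len(inputs), batch_size):
        chunk = inputs[st:st + batch_size]
        # one array per field: input_ids, input_mask, segment_ids
        batches.append([np.stack(col) for col in zip(*chunk)])
    return batches

data = [(np.zeros(4), np.ones(4), np.zeros(4)) for _ in range(5)]
print(len(get_batches_sketch(data, 2)))  # 3 batches of sizes 2, 2, 1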
def run_and_save():
    texts: List[str] = list(enum_f5_data())
    train_x, train_y, dev_x, dev_y = get_aawd_binary_train_dev()
    tprint("training...")
    svm = SVMWrap(train_x, train_y)
    tprint("predicting...")
    scores = svm.predict(texts)
    output: List[Tuple[str, float]] = list(zip(texts, scores))
    save_to_pickle(output, "f5_svm_aawd_prediction")
def run(args):
    tprint("msmarco run")
    hp = Hyperparam()
    nli_setting = ExTrainConfig()

    def worker_factory(out_dir):
        worker = PredictWorker(args.input_dir, out_dir)
        worker.load_model(hp, nli_setting, args.model_path, "co")
        return worker

    runner = JobRunner(args.save_dir, 696, "pc_tfrecord_ex", worker_factory)
    runner.auto_runner()
def main():
    train_x, train_y, dev_x, dev_y = get_argu_pointwise_data()
    tprint("training and testing")
    use_char_ngram = False
    print("Use char ngram", use_char_ngram)
    pred_svm_ngram = svm.train_svm_and_test(
        svm.NGramFeature(use_char_ngram, 4), train_x, train_y, dev_x)
    # pred_svm_ngram = list([random.randint(0, 1) for _ in dev_y])
    acc = accuracy(pred_svm_ngram, dev_y)
    ap = get_ap(dev_y, pred_svm_ngram)
    print("acc:", acc)
    print("ap:", ap)
def load_robust_meta(docs_dir, only_one_seg=False):
    # renamed from `collections` to avoid shadowing the stdlib module
    meta_d = dict()
    for (dirpath, dirnames, filenames) in os.walk(docs_dir):
        for name in filenames:
            filepath = os.path.join(dirpath, name)
            tprint(filepath)
            d = load_trec_meta(filepath)
            print(len(d))
            meta_d.update(d)
            if only_one_seg:
                break
    return meta_d
def generate_selected_training_data_for_many_runs(
        target_data_idx, info_dir, max_seq_length,
        score_and_save_dir: List,
        generate_selected_training_data_fn):
    interval_start_list = left(robust_query_intervals)
    key = interval_start_list[target_data_idx]
    info_path = os.path.join(info_dir, str(key))
    tprint("loading info: " + info_path)
    info = load_pickle_from(info_path)
    for score_dir, save_dir in score_and_save_dir:
        exist_or_mkdir(save_dir)
        tprint(save_dir)
        generate_selected_training_data_fn(info, key, max_seq_length, save_dir, score_dir)
def demo_score(info, max_seq_length):
    tprint("data gen")
    df_d = load_from_pickle("subword_df_robust_train")
    df_d = collections.Counter(df_d)
    tokenizer = get_tokenizer()
    sep_id = tokenizer.convert_tokens_to_ids(["[SEP]"])[0]
    sbc = SubwordConvertor()
    collection_size = 1139368311 + 10
    avdl = 446

    def get_score(input_ids):
        sep_idx1 = input_ids.index(sep_id)
        sep_idx2 = input_ids.index(sep_id, sep_idx1 + 1)
        query = input_ids[1:sep_idx1]
        doc_content = input_ids[sep_idx1 + 1:sep_idx2]
        q_terms: List[Tuple[int]] = list(sbc.get_word_as_subtoken_tuple(query))
        d_terms: List[Tuple[int]] = list(sbc.get_word_as_subtoken_tuple(doc_content))
        tf = collections.Counter()
        for d_term in d_terms:
            if d_term in q_terms:
                tf[d_term] += 1
        score = 0
        for q_term in q_terms:
            f = tf[q_term]
            df = df_d[q_term]
            N = collection_size
            dl = len(d_terms)
            score += BM25_2(f, df, N, dl, avdl)

        def to_str(input_ids):
            return " ".join(tokenizer.convert_ids_to_tokens(input_ids))

        print("query", to_str(query))
        print("doc", to_str(doc_content))
        print('score:', score)

    insts = collections.defaultdict(list)
    grouped = group_by(info.values(), lambda e: (e['query_id'], e['doc_id']))
    for qid, doc_id in grouped:
        key = qid, doc_id
        sub_entries = grouped[key]
        for e in sub_entries:
            get_score(e['input_ids'])

    def encode_fn(inst: InstAsInputIds) -> collections.OrderedDict:
        return encode_inst_as_input_ids(max_seq_length, inst)
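# BM25_2 is not shown above; for reference, a textbook BM25 term weight with
# the usual k1/b defaults (the repo's variant may differ in smoothing details):
import math

def bm25_term_sketch(f, df, N, dl, avdl, k1=1.2, b=0.75):
    idf = math.log((N - df + 0.5) / (df + 0.5))
    tf_part = f * (k1 + 1) / (f + k1 * (1 - b + b * dl / avdl))
    return idf * tf_part

# one occurrence of a term seen in 1000 of ~1.14B docs, in an average-length doc
print(round(bm25_term_sketch(f=1, df=1000, N=1139368321, dl=446, avdl=446), 2))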
def work(self, job_id):
    qids = self.query_group[job_id]
    data_bin = 100000
    data_id_st = job_id * data_bin
    data_id_ed = data_id_st + data_bin
    data_id_manager = DataIDManager(data_id_st, data_id_ed)
    tprint("generating instances")
    insts = self.generator.generate(data_id_manager, qids)
    # tprint("{} instances".format(len(insts)))
    out_path = os.path.join(self.out_dir, str(job_id))
    self.generator.write(insts, out_path)
    info_path = os.path.join(self.info_dir, "{}.info".format(job_id))
    with open(info_path, "w") as f:
        json.dump(data_id_manager.id_to_info, f)
def generate_selected_training_data_loop(split_no, score_dir, info_dir, max_seq_length, save_dir,
                                         generate_selected_training_data_fn):
    train_items, held_out = get_robust_splits(split_no)
    print(train_items)
    exist_or_mkdir(save_dir)
    for key in train_items:
        info_path = os.path.join(info_dir, str(key))
        # info = load_combine_info_jsons(info_path, False, False)
        tprint("loading info: " + info_path)
        info = load_pickle_from(info_path)
        # info = load_info_from_compressed(info_path)
        generate_selected_training_data_fn(info, key, max_seq_length, save_dir, score_dir)
def prepare_model_and_data(param):
    tprint("Loading data")
    preprocessor, train_processed, valid_processed = drmm_processed()
    print(train_processed)
    tprint("Defining task")
    classification_task = mz.tasks.classification.Classification()
    classification_task.metrics = ['accuracy']
    output_dim = 300
    tprint('output_dim : {}'.format(output_dim))
    # Initialize the model, fine-tune the hyper-parameters.
    tprint("building model")
    # model = mz.models.KNRM()
    # model = KNRMEx()
    # model = AvgEmbedding()
    # model.params.update(preprocessor.context)
    # model.params['task'] = classification_task
    # model.params['embedding_output_dim'] = output_dim
    # model.params['embedding_trainable'] = False
    # model.params['kernel_num'] = 11
    # model.params['sigma'] = 0.1
    # model.params['exact_sigma'] = 0.001
    glove_embedding = mz.datasets.embeddings.load_glove_embedding(dimension=output_dim)
    model = get_drmm_model(preprocessor, classification_task, output_dim)
    for key, v in param.items():
        if key in model.params:
            model.params[key] = v
    model.guess_and_fill_missing_params(verbose=1)
    step_per_epoch = 423 * 128
    num_max_steps = 100 * step_per_epoch
    if 'lr_decay' in param and param['lr_decay']:
        lr = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=param['lr'],
            decay_steps=num_max_steps / 20,
            decay_rate=0.9)
    else:
        lr = param['lr']
    model.params['optimizer'] = tf.keras.optimizers.Adam(learning_rate=lr)
    model.build()
    model.compile()
    tprint("processing embedding")
    term_index = preprocessor.context['vocab_unit'].state['term_index']
    embedding_matrix = prepare_embedding(output_dim, term_index)
    model.load_embedding_matrix(embedding_matrix)
    # also return the embedding matrix so callers (e.g., the Histogram callback
    # in main) can use it
    return model, embedding_matrix, train_processed, valid_processed
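# For reference, tf.keras ExponentialDecay (staircase=False) computes
#   lr(step) = initial_lr * decay_rate ** (step / decay_steps)
# so with decay_rate=0.9 the learning rate shrinks ~10% every decay_steps
# steps. A quick check with a hypothetical initial lr of 1e-3:
initial_lr = 1e-3
decay_steps = (100 * 423 * 128) / 20
for step in [0, decay_steps, 2 * decay_steps]:
    print(step, initial_lr * 0.9 ** (step / decay_steps))  # 1e-3, 9e-4, 8.1e-4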
def collect_doc_per_query(split, target_qid):
    ms_reader = MSMarcoDataReader(split)

    def pop(query_id, cur_doc_ids: Set):
        num_candidate_doc = len(cur_doc_ids)
        cur_doc_ids.update(ms_reader.qrel[query_id])
        todo = []
        for doc_id in cur_doc_ids:
            offset = ms_reader.doc_offset[doc_id]
            todo.append((doc_id, offset))
        todo.sort(key=get_second)
        num_all_docs = len(cur_doc_ids)
        print("{} docs".format(num_all_docs))
        exist_or_mkdir(per_query_root)
        save_path = get_per_query_doc_path(query_id)
        out_f = open(save_path, "w")
        for doc_id, offset in todo:
            content: str = ms_reader.get_content(doc_id)
            out_f.write(content + "\n")
        out_f.close()

    ###
    total_line = 36701116
    skip = True
    with open_top100(split) as top100f:
        last_topic_id = None
        cur_doc_ids = set()
        for line_no, line in enumerate(top100f):
            if skip:
                if not line.startswith(target_qid):
                    continue
                else:
                    tprint("skip done")
                    remain_lines = total_line - line_no
                    ticker = TimeEstimator(remain_lines, "reading", 1000)
                    skip = False
            [topic_id, _, doc_id, rank, _, _] = line.split()
            if last_topic_id is None:
                last_topic_id = topic_id
            elif last_topic_id != topic_id:
                pop(last_topic_id, cur_doc_ids)
                break
                # unreachable once the target topic is popped
                last_topic_id = topic_id
                cur_doc_ids = set()
            ticker.tick()
            cur_doc_ids.add(doc_id)
        pop(last_topic_id, cur_doc_ids)
def load_scores(info_file_path, prediction_file_path):
    input_type = "qc"
    f_handler = get_format_handler(input_type)
    tprint("Loading json info")
    info: Dict = load_combine_info_jsons(info_file_path, f_handler.get_mapping(), f_handler.drop_kdp())
    key_logit = "logits"
    tprint("Reading predictions...")
    data: List[Dict] = join_prediction_with_info(prediction_file_path, info, ["data_id", key_logit])
    grouped: Dict[Tuple[str, str], List[Dict]] = group_by(data, f_handler.get_pair_id)
    print("number of groups:", len(grouped))
    return grouped
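# group_by is assumed to bucket items by a key function, as used throughout
# these functions; a minimal sketch of that behavior:
from collections import defaultdict
from typing import Callable, Dict, Iterable, List, TypeVar

T = TypeVar("T")
K = TypeVar("K")

def group_by_sketch(items: Iterable[T], key_fn: Callable[[T], K]) -> Dict[K, List[T]]:
    grouped = defaultdict(list)
    for item in items:
        grouped[key_fn(item)].append(item)
    return dict(grouped)

rows = [{"qid": "Q1", "s": 0.2}, {"qid": "Q1", "s": 0.7}, {"qid": "Q2", "s": 0.5}]
print(group_by_sketch(rows, lambda r: r["qid"]))  # {'Q1': [two rows], 'Q2': [one row]}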