def save_gold_conll_files(documents, mentions, clusters, dir_path, doc_name):
    """Write the gold clustering to a CoNLL file, excluding singleton clusters.

    Args:
        documents: documents collection, passed through to ``write_output_file``.
        mentions: iterable of mention dicts; each must provide the keys
            ``'doc_id'`` and ``'tokens_ids'`` (token indices of the mention span).
        clusters: mapping of cluster id -> collection of mention references.
        dir_path: directory in which the CoNLL file is written.
        doc_name: base name for the output file.
    """
    # Clusters containing a single mention are dropped from the output.
    multi_mention_clusters = {}
    for cluster_id, members in clusters.items():
        if len(members) > 1:
            multi_mention_clusters[cluster_id] = members

    # Span boundaries: each mention spans from its smallest to its largest
    # token index.
    doc_ids, starts, ends = [], [], []
    for mention in mentions:
        doc_ids.append(mention['doc_id'])
        token_ids = mention['tokens_ids']
        starts.append(min(token_ids))
        ends.append(max(token_ids))

    write_output_file(documents, multi_mention_clusters, doc_ids, starts, ends,
                      dir_path, doc_name)
# NOTE(review): this chunk arrived flattened onto a single physical line; it
# has been re-broken into statements here, but the first nine statements read
# like the body of a per-topic loop whose `for` header lies outside this
# chunk — restore the enclosing loop (and its indentation) before running.
# Per-topic clustering: fit agglomerative clustering on the topic's pairwise
# distances, then offset the predicted labels by the running maximum so that
# cluster ids remain unique across topics.
predicted = clustering.fit(pairwise_distances)
predicted_clusters = predicted.labels_ + max_cluster_id
max_cluster_id = max(predicted_clusters) + 1
# Accumulate span metadata for every selected span in this topic.
# NOTE(review): only this indexer calls .cpu() — presumably doc_id is a
# host-side (e.g. numpy) array while the others are tensors; verify.
doc_ids.extend(doc_id[span_indices.cpu()])
sentence_ids.extend(sentence_id[span_indices].tolist())
starts.extend(start[span_indices].tolist())
ends.extend(end[span_indices].tolist())
all_topic_predicted_clusters.extend(predicted_clusters)
torch.cuda.empty_cache()  # release cached GPU memory between topics

# Group span indices by their predicted cluster id.
all_clusters = {}
for i, cluster_id in enumerate(all_topic_predicted_clusters):
    if cluster_id not in all_clusters:
        all_clusters[cluster_id] = []
    all_clusters[cluster_id].append(i)

# With predicted (non-gold) mentions: remove mentions nested inside another
# mention, then discard singleton clusters.
# NOTE(review): the flattening makes it ambiguous whether the singleton filter
# below belongs inside this `if` — placement reconstructed; confirm against
# the original file.
if not config['use_gold_mentions']:
    all_clusters, doc_ids, starts, ends = remove_nested_mentions(all_clusters, doc_ids, starts, ends)
    all_clusters = {cluster_id:mentions for cluster_id, mentions in all_clusters.items() if len(mentions) > 1}

# Write the final corpus-level / topic-level CoNLL file, named after the run
# configuration.
print('Saving conll file...')
doc_name = '{}_{}_{}_{}_model_{}'.format( config['split'], config['mention_type'], config['linkage_type'], config['threshold'], config['model_num'])
write_output_file(data.documents, all_clusters, doc_ids, starts, ends, config['save_path'], doc_name, topic_level=config.topic_level, corpus_level=not config.topic_level)
# NOTE(review): this chunk arrived flattened onto a single physical line;
# indentation was reconstructed from statement order — confirm against the
# original file. It also references enclosing-scope state (`clustering`,
# `max_ids`, `clusters`, `threshold`, `subdir_path`, `data`, ...), so it is a
# fragment of a larger function.
# First pass: run each agglomerative-clustering configuration (one per
# threshold) on this topic's pairwise distances, offsetting the labels by the
# per-configuration running maximum so cluster ids stay unique across topics.
for i, agglomerative in enumerate(clustering):
    predicted = agglomerative.fit(pairwise_distances)
    # Shift labels so they do not collide with clusters from earlier topics.
    predicted_clusters = predicted.labels_ + max_ids[i]
    max_ids[i] = max(predicted_clusters) + 1
    clusters[i].extend(predicted_clusters)

topic_level = False  # emit one corpus-level file rather than per-topic files
# Second pass: for every threshold, group span ids by cluster id and write a
# CoNLL file named after the split / mention type / linkage / threshold combo.
for i, predicted in enumerate(clusters):
    logger.info('Saving cluster for threshold {}'.format(threshold[i]))
    all_clusters = collections.defaultdict(list)
    for span_id, cluster_id in enumerate(predicted):
        all_clusters[cluster_id].append(span_id)
    print('Saving conll file...')
    doc_name = '{}_{}_{}_{}'.format(config['split'], config['mention_type'], config['linkage_type'], threshold[i])
    save_path = subdir_path
    write_output_file(data.documents, all_clusters, doc_ids, starts, ends, save_path, doc_name, topic_level=topic_level, corpus_level=not topic_level)