Ejemplo n.º 1
0
def save_gold_conll_files(documents, mentions, clusters, dir_path, doc_name):
    """Write gold mention clusters to a CoNLL file, skipping singleton clusters.

    Each mention dict is expected to carry a 'doc_id' and a 'tokens_ids'
    sequence; the span boundaries written out are the smallest and largest
    token ids of each mention.
    """
    # Keep only clusters containing at least two mentions.
    multi_mention_clusters = {}
    for cluster_id, cluster_mentions in clusters.items():
        if len(cluster_mentions) > 1:
            multi_mention_clusters[cluster_id] = cluster_mentions

    # Flatten per-mention metadata into parallel lists.
    doc_ids, starts, ends = [], [], []
    for mention in mentions:
        doc_ids.append(mention['doc_id'])
        starts.append(min(mention['tokens_ids']))
        ends.append(max(mention['tokens_ids']))

    write_output_file(documents, multi_mention_clusters, doc_ids, starts, ends,
                      dir_path, doc_name)
Ejemplo n.º 2
0
        # Cluster this topic's spans on their pairwise distance matrix, then
        # shift the topic-local labels by the running maximum so cluster ids
        # stay unique across topics.
        predicted = clustering.fit(pairwise_distances)
        predicted_clusters = predicted.labels_ + max_cluster_id
        max_cluster_id = max(predicted_clusters) + 1

        # Accumulate per-span metadata for this topic into the global lists.
        # NOTE(review): doc_id is indexed with CPU indices while the others are
        # indexed with span_indices directly — presumably doc_id is a numpy
        # array and the rest are torch tensors; confirm upstream types.
        doc_ids.extend(doc_id[span_indices.cpu()])
        sentence_ids.extend(sentence_id[span_indices].tolist())
        starts.extend(start[span_indices].tolist())
        ends.extend(end[span_indices].tolist())
        all_topic_predicted_clusters.extend(predicted_clusters)
        # Free cached GPU memory between topics.
        torch.cuda.empty_cache()


    # Invert the label list: map each cluster id to the global indices of the
    # spans assigned to it.
    all_clusters = {}
    for i, cluster_id in enumerate(all_topic_predicted_clusters):
        if cluster_id not in all_clusters:
            all_clusters[cluster_id] = []
        all_clusters[cluster_id].append(i)


    # With predicted (non-gold) mentions, nested/overlapping spans may have
    # been extracted; drop the nested ones before scoring.
    if not config['use_gold_mentions']:
        all_clusters, doc_ids, starts, ends = remove_nested_mentions(all_clusters, doc_ids, starts, ends)

    # Discard singleton clusters before writing CoNLL output.
    all_clusters = {cluster_id:mentions for cluster_id, mentions in all_clusters.items() if len(mentions) > 1}

    print('Saving conll file...')
    doc_name = '{}_{}_{}_{}_model_{}'.format(
        config['split'], config['mention_type'], config['linkage_type'], config['threshold'], config['model_num'])

    # NOTE(review): config is read both mapping-style (config['split']) and
    # attribute-style (config.topic_level) — presumably a dict-like config
    # object supporting both; verify.
    write_output_file(data.documents, all_clusters, doc_ids, starts, ends, config['save_path'], doc_name,
                      topic_level=config.topic_level, corpus_level=not config.topic_level)
Ejemplo n.º 3
0
            # Fit one clusterer per threshold on the same distance matrix;
            # offset each clusterer's labels by its own running maximum so
            # cluster ids stay unique across topics for that threshold.
            for i, agglomerative in enumerate(clustering):
                predicted = agglomerative.fit(pairwise_distances)
                predicted_clusters = predicted.labels_ + max_ids[i]
                max_ids[i] = max(predicted_clusters) + 1
                clusters[i].extend(predicted_clusters)

        # Output files are written corpus-level (one shared cluster-id space),
        # not per topic.
        topic_level = False

        # Write one CoNLL file per threshold setting.
        for i, predicted in enumerate(clusters):
            logger.info('Saving cluster for threshold {}'.format(threshold[i]))
            # Invert the label list: cluster id -> list of span ids.
            all_clusters = collections.defaultdict(list)
            for span_id, cluster_id in enumerate(predicted):
                all_clusters[cluster_id].append(span_id)

            print('Saving conll file...')
            doc_name = '{}_{}_{}_{}'.format(config['split'],
                                            config['mention_type'],
                                            config['linkage_type'],
                                            threshold[i])
            save_path = subdir_path
            write_output_file(data.documents,
                              all_clusters,
                              doc_ids,
                              starts,
                              ends,
                              save_path,
                              doc_name,
                              topic_level=topic_level,
                              corpus_level=not topic_level)