Example #1
def create_dataset(filename):

    dataset = convert_lib.Dataset(PRECO)

    lines = get_lines_from_file(filename)
    for line in tqdm.tqdm(lines):
        orig_document = json.loads(line)
        new_document = convert_lib.Document(
            convert_lib.make_doc_id(PRECO, orig_document["id"]),
            DUMMY_DOC_PART)

        new_sentences, sentence_index_map, sentence_offsets = condense_sentences(
            orig_document["sentences"])

        new_document.sentences = new_sentences
        new_document.speakers = make_empty_speakers(new_document.sentences)
        new_document.clusters = []
        for cluster in orig_document["mention_clusters"]:
            new_cluster = []
            for sentence, begin, end in cluster:
                modified_sentence = sentence_index_map[sentence]
                new_cluster.append([
                    sentence_offsets[modified_sentence] + begin,
                    # PreCo span ends are exclusive; convert to inclusive.
                    sentence_offsets[modified_sentence] + end - 1
                ])
            new_document.clusters.append(new_cluster)
        dataset.documents.append(new_document)

    return dataset
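
The PreCo converter above leans on helpers that are not shown here (get_lines_from_file, condense_sentences, make_empty_speakers). A minimal sketch of condense_sentences, inferred purely from the call site, assuming its job is to drop empty sentences while tracking an original-to-condensed index map and cumulative token offsets:

def condense_sentences(orig_sentences):
    """Sketch (assumed behavior): drop empty sentences and remember where
    everything went.

    Returns (new_sentences, sentence_index_map, sentence_offsets), where
    sentence_index_map maps an original sentence index to its condensed
    index and sentence_offsets[i] is the document-level token offset at
    which condensed sentence i starts.
    """
    new_sentences = []
    sentence_index_map = {}
    sentence_offsets = []
    token_count = 0
    for orig_idx, sentence in enumerate(orig_sentences):
        if not any(token.strip() for token in sentence):
            continue  # skip empty / whitespace-only sentences
        sentence_index_map[orig_idx] = len(new_sentences)
        sentence_offsets.append(token_count)
        new_sentences.append(sentence)
        token_count += len(sentence)
    return new_sentences, sentence_index_map, sentence_offsets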
Example #2
def create_dataset(filename):

    dataset_name = convert_lib.DatasetName.wikicoref
    dataset = convert_lib.Dataset(dataset_name)

    sentence_offset = 0

    curr_doc = None
    curr_sent = []
    curr_sent_orig_coref_labels = []
    all_spans = collections.defaultdict(list)

    print(filename)

    for line in get_lines_from_file(filename):

        if line.startswith("#end") or line.startswith("null"):
            continue
        elif line.startswith("#begin"):
            if curr_doc is not None:
                curr_doc.clusters = list(all_spans.values())
                all_spans = collections.defaultdict(list)
                dataset.documents.append(curr_doc)

            print(line.split()[2:])
            curr_doc_id = convert_lib.make_doc_id(dataset_name,
                                                  "-".join(line.split()[2:]))
            curr_doc = convert_lib.Document(curr_doc_id, DUMMY_DOC_PART)
            sentence_offset = 0
        else:
            fields = line.split()
            if not fields:
                if curr_sent:
                    add_sentence(curr_doc, curr_sent)
                    coref_spans = conll_lib.coref_to_spans(
                        curr_sent_orig_coref_labels, sentence_offset)
                    all_spans = ldd_append(all_spans, coref_spans)
                    sentence_offset += len(curr_sent)
                    curr_sent = []
                    curr_sent_orig_coref_labels = []
            else:
                word = fields[3]
                coref_label = fields[4]
                curr_sent_orig_coref_labels.append(coref_label)
                curr_sent.append((word, convert_lib.NO_SPEAKER))

    if curr_doc is not None:
        curr_doc.clusters = list(all_spans.values())
        dataset.documents.append(curr_doc)

    return dataset
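
ldd_append is another helper that is not defined in this snippet. Judging from how it is called, it merges one {cluster_id: [spans]} mapping into the running defaultdict(list) accumulator; a sketch under that assumption:

def ldd_append(accumulator, new_spans):
    """Sketch: extend a defaultdict(list) accumulator ("ldd") with the
    spans from new_spans, keyed by cluster id, and return it."""
    for cluster_id, spans in new_spans.items():
        accumulator[cluster_id].extend(spans)
    return accumulator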
Example #3
def main():
  data_home = sys.argv[1]

  for dataset in convert_lib.DatasetName.ALL:
    if dataset == 'conll':
      print("Skipping conll")
      continue
    #for subset in convert_lib.DatasetSplit.ALL:
    for subset in ["test"]:
      input_file = os.path.join(data_home, "original", dataset,
                                subset + ".miniconll")
      print(dataset, subset)
      listified_dataset = conll_lib.listify_conll_dataset(input_file)
      new_dataset = convert_lib.Dataset(dataset + "_" + subset)
      for document in tqdm.tqdm(listified_dataset):
        converted_document = convert(document, dataset)
        new_dataset.documents[convert_lib.ProcessingStage.TOKENIZED].append(
            converted_document)

      output_file = os.path.join(data_home, "processed", dataset,
                                subset + ".jsonl")
      new_dataset.dump_to_jsonl(output_file)
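
main expects <data_home>/original/<dataset>/<subset>.miniconll on disk and writes <data_home>/processed/<dataset>/<subset>.jsonl. Dataset.dump_to_jsonl is not shown; a plausible sketch, assuming documents is keyed by processing stage as in main above and that each document can serialize itself (the to_dict name is an assumption):

import json
import os

def dump_to_jsonl(self, output_file):
    """Sketch of Dataset.dump_to_jsonl: one JSON object per document."""
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, "w") as f:
        for documents in self.documents.values():
            for document in documents:
                f.write(json.dumps(document.to_dict()) + "\n")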
Example #4
def create_dataset(filename):
    dataset = convert_lib.Dataset(GAP)
    with open(filename, 'r') as tsvfile:
        for row in csv.DictReader(tsvfile, delimiter='\t'):
            text = clean_up_text(row)

            curr_document = convert_lib.Document(
                convert_lib.make_doc_id(GAP, row["ID"]), DUMMY_DOC_PART)
            (pronoun_indices, a_indices, b_indices) = char_to_tok_idx(
                text,
                ((row["Pronoun"], row["Pronoun-offset"]),
                 (row["A"], row["A-offset"]), (row["B"], row["B-offset"])))
            true_cluster = [pronoun_indices]
            other_cluster = []
            if row["A-coref"] == "TRUE":
                true_cluster.append(a_indices)
            else:
                other_cluster.append(a_indices)
            if row["B-coref"] == "TRUE":
                true_cluster.append(b_indices)
            else:
                other_cluster.append(b_indices)
            curr_document.sentences = [
                word_tokenize(sent) for sent in sent_tokenize(text)
            ]

            curr_document.speakers = convert_lib.make_empty_speakers(
                curr_document.sentences)
            curr_document.clusters = [true_cluster, other_cluster]
            dataset.documents.append(curr_document)

    return dataset
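
clean_up_text and char_to_tok_idx are not included. The latter has to map GAP's character offsets onto token indices; a rough sketch using plain whitespace tokenization (the real helper presumably mirrors the NLTK tokenization used for the sentences above, so treat the details as assumptions):

def char_to_tok_idx(text, mention_specs):
    """Sketch: map (mention_text, char_offset) pairs to inclusive
    [start_token, end_token] spans over a whitespace tokenization."""
    token_starts = []
    cursor = 0
    for token in text.split():
        start = text.index(token, cursor)
        token_starts.append(start)
        cursor = start + len(token)

    spans = []
    for mention_text, char_offset in mention_specs:
        begin = int(char_offset)
        end = begin + len(mention_text)
        start_tok = max(i for i, s in enumerate(token_starts) if s <= begin)
        end_tok = max(i for i, s in enumerate(token_starts) if s < end)
        spans.append([start_tok, end_tok])
    return tuple(spans)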
Example #5
def create_dataset(filename, field_map):

    dataset = convert_lib.Dataset(CONLL12)

    sentence_offset = 0

    curr_doc = None
    curr_doc_id = None
    curr_sent = collections.defaultdict(list)
    doc_spans = collections.defaultdict(list)

    for line in get_lines_from_file(filename):

        if line.startswith("#begin"):
            assert curr_doc is None
            curr_doc_id = line.split()[2][1:-2].replace("/", "-")
            part = str(int(line.split()[-1]))
            curr_doc = convert_lib.Document(curr_doc_id, part)
            sentence_offset = 0

        elif line.startswith("#end"):
            curr_doc.clusters = list(doc_spans.values())
            dataset.documents.append(curr_doc)
            doc_spans = collections.defaultdict(list)
            curr_doc = None

        elif not line.strip():
            if curr_sent:
                doc_spans, sentence_offset = add_sentence(
                    curr_doc, curr_sent, doc_spans, sentence_offset)
                curr_sent = collections.defaultdict(list)

        else:
            fields = line.replace("/.", ".").split()
            for field_name, field_index in field_map.items():
                curr_sent[field_name].append(fields[field_index])

    return dataset
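
field_map tells this converter which CoNLL column to read for each field that add_sentence consumes. The exact keys depend on add_sentence, so the mapping below is only an illustration based on the standard CoNLL-2012 column order (doc id, part, token index, token, POS, parse, ..., speaker, ..., coref in the last column):

# Illustrative key names; adjust to whatever add_sentence expects.
CONLL12_FIELD_MAP = {
    "token": 3,
    "pos": 4,
    "parse": 5,
    "speaker": 9,
    "coref": -1,
}

dataset = create_dataset("path/to/train.v4_gold_conll", CONLL12_FIELD_MAP)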
Example #6
def create_dataset(filename):

    mentions_map = collections.defaultdict(list)

    dataset = convert_lib.Dataset(CONLL12)

    curr_doc = None
    curr_doc_name = None
    curr_sent_orig_labels = []
    all_spans = collections.defaultdict(list)
    sentence_idx = 0

    for line in get_lines_from_file(filename):

        if line.startswith("#"):
            continue
        if not line.strip():
            # add sentence
            if curr_sent_orig_labels:
                (parts, tokens, pos, parse, speakers, ner,
                 coref) = zip(*curr_sent_orig_labels)
                coref_spans = conll_lib.coref_to_spans(coref, 0)
                parse_spans = conll_lib.parse_to_spans(parse)

                for entity, cluster in coref_spans.items():
                    for start, inclusive_end in cluster:
                        assert len(set(parts)) == 1
                        assert len(set(speakers)) == 1
                        end = inclusive_end + 1

                        parse_label = parse_spans.get(
                            (start, inclusive_end),
                            "~" + "".join(parse[start:end]))
                        mention_obj = Mention(curr_doc_id, parts[0], entity,
                                              sentence_idx, start, parse_label,
                                              tokens[start:end],
                                              pos[start:end], ner[start:end],
                                              speakers[0])

                        mentions_map[mention_obj.mention_id].append(
                            mention_obj)

                sentence_idx += 1

            curr_sent_orig_labels = []
        else:
            fields = line.split()
            # check for new doc
            doc_name, part = fields[:2]
            doc_name = doc_name.replace("/", "-")
            if not doc_name == curr_doc_name:
                if curr_doc is not None:
                    curr_doc.clusters = list(all_spans.values())
                    all_spans = collections.defaultdict(list)
                    dataset.documents.append(curr_doc)
                curr_doc_name = doc_name
                curr_doc_id = convert_lib.make_doc_id(CONLL12, doc_name)
                curr_doc = convert_lib.Document(curr_doc_id, part)
                sentence_idx = 0

            (doc, part, token_idx, token, pos, parse, _, _, _, speaker,
             ner) = fields[:11]
            coref = fields[-1]

            curr_sent_orig_labels.append(
                (part, token, pos, parse, speaker, ner, coref))

    # Flush the final document, mirroring the other converters.
    if curr_doc is not None:
        curr_doc.clusters = list(all_spans.values())
        dataset.documents.append(curr_doc)

    for mentions in mentions_map.values():
        for mention in mentions:
            print(str(mention))

    return dataset
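
conll_lib.coref_to_spans (used in Examples #2 and #6) is also external to this snippet. A sketch of the usual parse of the CoNLL coref column, assuming it returns {entity_id: [(start, inclusive_end), ...]} with token indices shifted by offset:

import collections

def coref_to_spans(coref_labels, offset):
    """Sketch: turn a coref column such as ["(3", "-", "3)", "(5)"] into
    {entity: [(start, inclusive_end), ...]}, offsetting token indices."""
    spans = collections.defaultdict(list)
    open_starts = collections.defaultdict(list)  # entity -> stack of starts
    for idx, label in enumerate(coref_labels):
        if label == "-":
            continue
        for part in label.split("|"):
            entity = int(part.strip("()"))
            if part.startswith("("):
                open_starts[entity].append(idx)
            if part.endswith(")"):
                start = open_starts[entity].pop()
                spans[entity].append((start + offset, idx + offset))
    return spans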