def create_dataset(filename):
  """Converts a PreCo jsonlines file into a convert_lib.Dataset."""
  dataset = convert_lib.Dataset(PRECO)

  lines = get_lines_from_file(filename)

  for line in tqdm.tqdm(lines):
    orig_document = json.loads(line)
    new_document = convert_lib.Document(
        convert_lib.make_doc_id(PRECO, orig_document["id"]), DUMMY_DOC_PART)

    # Condense the original sentences and record, for each condensed
    # sentence, its token offset into the document.
    new_sentences, sentence_index_map, sentence_offsets = condense_sentences(
        orig_document["sentences"])

    new_document.sentences = new_sentences
    new_document.speakers = make_empty_speakers(new_document.sentences)
    new_document.clusters = []

    for cluster in orig_document["mention_clusters"]:
      new_cluster = []
      for sentence, begin, end in cluster:
        modified_sentence = sentence_index_map[sentence]
        # Convert sentence-local [begin, end) spans into document-level
        # inclusive token indices.
        new_cluster.append([
            sentence_offsets[modified_sentence] + begin,
            sentence_offsets[modified_sentence] + end - 1])
      new_document.clusters.append(new_cluster)

    dataset.documents.append(new_document)

  return dataset
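# Usage sketch for the PreCo converter above. The helper name and paths are
# hypothetical; dump_to_jsonl is assumed to exist on convert_lib.Dataset only
# because main() further below calls it.
def convert_preco_split(data_home, split="train"):
  preco_dataset = create_dataset(
      os.path.join(data_home, "original", "preco", split + ".jsonl"))
  preco_dataset.dump_to_jsonl(
      os.path.join(data_home, "processed", "preco", split + ".jsonl"))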
def create_dataset(filename):
  """Converts a WikiCoref CoNLL-style file into a convert_lib.Dataset."""
  dataset_name = convert_lib.DatasetName.wikicoref
  dataset = convert_lib.Dataset(dataset_name)

  sentence_offset = 0
  curr_doc = None
  curr_sent = []
  curr_sent_orig_coref_labels = []
  all_spans = collections.defaultdict(list)

  for line in get_lines_from_file(filename):
    if line.startswith("#end") or line.startswith("null"):
      continue
    elif line.startswith("#begin"):
      # Flush the previous document before starting a new one.
      if curr_doc is not None:
        curr_doc.clusters = list(all_spans.values())
        all_spans = collections.defaultdict(list)
        dataset.documents.append(curr_doc)
      curr_doc_id = convert_lib.make_doc_id(
          dataset_name, "-".join(line.split()[2:]))
      curr_doc = convert_lib.Document(curr_doc_id, DUMMY_DOC_PART)
      sentence_offset = 0
    else:
      fields = line.split()
      if not fields:
        # Blank line: the current sentence is complete.
        if curr_sent:
          add_sentence(curr_doc, curr_sent)
          coref_spans = conll_lib.coref_to_spans(
              curr_sent_orig_coref_labels, sentence_offset)
          all_spans = ldd_append(all_spans, coref_spans)
          sentence_offset += len(curr_sent)
          curr_sent = []
          curr_sent_orig_coref_labels = []
      else:
        word = fields[3]
        coref_label = fields[4]
        curr_sent_orig_coref_labels.append(coref_label)
        curr_sent.append((word, convert_lib.NO_SPEAKER))

  # Flush the final document.
  curr_doc.clusters = list(all_spans.values())
  dataset.documents.append(curr_doc)

  return dataset
def main():
  data_home = sys.argv[1]
  for dataset in convert_lib.DatasetName.ALL:
    if dataset == 'conll':
      print("Skipping conll")
      continue
    #for subset in convert_lib.DatasetSplit.ALL:
    for subset in ["test"]:
      input_file = os.path.join(
          data_home, "original", dataset, subset + ".miniconll")
      print(dataset, subset)
      listified_dataset = conll_lib.listify_conll_dataset(input_file)
      new_dataset = convert_lib.Dataset(dataset + "_" + subset)
      for document in tqdm.tqdm(listified_dataset):
        converted_document = convert(document, dataset)
        new_dataset.documents[
            convert_lib.ProcessingStage.TOKENIZED].append(converted_document)
      output_file = os.path.join(
          data_home, "processed", dataset, subset + ".jsonl")
      new_dataset.dump_to_jsonl(output_file)
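# Entry-point sketch for main() above. The script name is hypothetical; main()
# only assumes that sys.argv[1] points at a directory containing
# "original/<dataset>/" inputs and "processed/<dataset>/" outputs:
#
#   python convert.py /path/to/data_home
if __name__ == "__main__":
  main()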
def create_dataset(filename):
  """Converts a GAP tsv file into a convert_lib.Dataset."""
  dataset = convert_lib.Dataset(GAP)

  with open(filename, 'r') as tsvfile:
    for row in csv.DictReader(tsvfile, delimiter='\t'):
      text = clean_up_text(row)
      curr_document = convert_lib.Document(
          convert_lib.make_doc_id(GAP, row["ID"]), DUMMY_DOC_PART)

      # Map the character offsets of the pronoun and the two candidate
      # antecedents to token indices.
      (pronoun_indices, a_indices, b_indices) = char_to_tok_idx(
          text, ((row["Pronoun"], row["Pronoun-offset"]),
                 (row["A"], row["A-offset"]),
                 (row["B"], row["B-offset"])))

      # The pronoun's cluster contains whichever candidates are labeled
      # coreferent; the remaining candidates form a second cluster.
      true_cluster = [pronoun_indices]
      other_cluster = []
      if row["A-coref"] == "TRUE":
        true_cluster.append(a_indices)
      else:
        other_cluster.append(a_indices)
      if row["B-coref"] == "TRUE":
        true_cluster.append(b_indices)
      else:
        other_cluster.append(b_indices)

      curr_document.sentences = [
          word_tokenize(sent) for sent in sent_tokenize(text)]
      curr_document.speakers = convert_lib.make_empty_speakers(
          curr_document.sentences)
      curr_document.clusters = [true_cluster, other_cluster]
      dataset.documents.append(curr_document)

  return dataset
def create_dataset(filename, field_map):
  """Converts a CoNLL-2012 file into a convert_lib.Dataset.

  field_map maps each field name to its column index in the CoNLL line.
  """
  dataset = convert_lib.Dataset(CONLL12)

  sentence_offset = 0
  curr_doc = None
  curr_doc_id = None
  curr_sent = collections.defaultdict(list)
  doc_spans = collections.defaultdict(list)

  for line in get_lines_from_file(filename):
    if line.startswith("#begin"):
      assert curr_doc is None
      # e.g. "#begin document (bn/abc/00/abc_0010); part 000"
      curr_doc_id = line.split()[2][1:-2].replace("/", "-")
      part = str(int(line.split()[-1]))
      curr_doc = convert_lib.Document(curr_doc_id, part)
      sentence_offset = 0
    elif line.startswith("#end"):
      curr_doc.clusters = list(doc_spans.values())
      dataset.documents.append(curr_doc)
      doc_spans = collections.defaultdict(list)
      curr_doc = None
    elif not line.strip():
      # Blank line: the current sentence is complete.
      if curr_sent:
        doc_spans, sentence_offset = add_sentence(
            curr_doc, curr_sent, doc_spans, sentence_offset)
        curr_sent = collections.defaultdict(list)
    else:
      # Un-escape "/." tokens, then pick out the configured columns.
      fields = line.replace("/.", ".").split()
      for field_name, field_index in field_map.items():
        curr_sent[field_name].append(fields[field_index])

  return dataset
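# Hypothetical field_map for the converter above: a mapping from a field name
# to the column index that holds it on each CoNLL line. The indices below
# follow the column layout used by the mention extractor further below (token
# in column 3, speaker in column 9, coreference chain in the last column);
# the key names that add_sentence actually expects are not shown here, so
# these keys are illustrative only.
EXAMPLE_FIELD_MAP = {
    "token": 3,
    "speaker": 9,
    "coref": -1,
}
# e.g. dataset = create_dataset(conll_file, EXAMPLE_FIELD_MAP)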
def create_dataset(filename):
  """Collects CoNLL-2012 mentions and prints them to stdout."""
  mentions_map = collections.defaultdict(list)
  dataset = convert_lib.Dataset(CONLL12)

  curr_doc = None
  curr_doc_name = None
  curr_sent_orig_labels = []
  all_spans = collections.defaultdict(list)
  sentence_idx = 0

  for line in get_lines_from_file(filename):
    if line.startswith("#"):
      continue
    if not line.strip():
      # Blank line: emit a Mention for each coreference span in the sentence
      # just completed.
      if curr_sent_orig_labels:
        (parts, tokens, pos, parse,
         speakers, ner, coref) = zip(*curr_sent_orig_labels)
        coref_spans = conll_lib.coref_to_spans(coref, 0)
        parse_spans = conll_lib.parse_to_spans(parse)
        for entity, cluster in coref_spans.items():
          for start, inclusive_end in cluster:
            assert len(set(parts)) == 1
            assert len(set(speakers)) == 1
            end = inclusive_end + 1
            # Fall back to a concatenation of the per-token parse bits when
            # the span is not a labeled constituent.
            parse_label = parse_spans.get(
                (start, inclusive_end), "~" + "".join(parse[start:end]))
            mention_obj = Mention(
                curr_doc_id, parts[0], entity, sentence_idx, start,
                parse_label, tokens[start:end], pos[start:end],
                ner[start:end], speakers[0])
            mentions_map[mention_obj.mention_id].append(mention_obj)
        sentence_idx += 1
      curr_sent_orig_labels = []
    else:
      fields = line.split()
      # A new document starts whenever the document name changes.
      doc_name, part = fields[:2]
      doc_name = doc_name.replace("/", "-")
      if doc_name != curr_doc_name:
        if curr_doc is not None:
          curr_doc.clusters = list(all_spans.values())
          all_spans = collections.defaultdict(list)
          dataset.documents.append(curr_doc)
        curr_doc_name = doc_name
        curr_doc_id = convert_lib.make_doc_id(CONLL12, doc_name)
        curr_doc = convert_lib.Document(curr_doc_id, part)
        sentence_idx = 0
      (doc, part, token_idx, token, pos,
       parse, _, _, _, speaker, ner) = fields[:11]
      coref = fields[-1]
      curr_sent_orig_labels.append(
          (part, token, pos, parse, speaker, ner, coref))

  for mention_id, mentions in mentions_map.items():
    for mention in mentions:
      print(str(mention))