def handle_line(line, document_state, ner_type):
    """Process one CoNLL line, mutating document_state in place.

    Accumulates tokens, speakers, coreference clusters, and spans of the
    requested NER type (``document_state.people``).

    Returns the finalized document dict on a "#end document" line,
    otherwise None.

    Fixes vs. previous version: removed the dead ``t = regex.sub('', person)``
    assignment (result never used) and the unused ``doc_key`` local.
    """
    begin_document_match = re.match(conll.BEGIN_DOCUMENT_REGEX, line)
    if begin_document_match:
        # "#begin document ..." header: start a fresh document.
        document_state.assert_empty()
        document_state.doc_key = conll.get_doc_key(
            begin_document_match.group(1), begin_document_match.group(2))
        return None
    elif line.startswith("#end document"):
        document_state.assert_finalizable()
        return document_state.finalize()
    else:
        row = line.split()
        if len(row) == 0:
            # Blank line: sentence boundary — flush per-sentence buffers.
            document_state.sentences.append(tuple(document_state.text))
            del document_state.text[:]
            document_state.speakers.append(tuple(document_state.text_speakers))
            del document_state.text_speakers[:]
            return None
        assert len(row) >= 12
        word = normalize_word(row[3])
        coref = row[-1]
        speaker = row[9]
        person = row[10]  # NER column, e.g. "(PERSON)" / "(PERSON*" / "*)"
        # Document-level index of this token (tokens already flushed plus
        # tokens in the current sentence buffer).
        word_index = len(document_state.text) + sum(
            len(s) for s in document_state.sentences)
        document_state.text.append(word)
        document_state.text_speakers.append(speaker)
        if coref != "-":
            for segment in coref.split("|"):
                if segment[0] == "(":
                    if segment[-1] == ")":
                        # Single-token mention, e.g. "(7)".
                        cluster_id = int(segment[1:-1])
                        document_state.clusters[cluster_id].append(
                            (word_index, word_index))
                    else:
                        # Mention opens here, e.g. "(7".
                        cluster_id = int(segment[1:])
                        document_state.stacks[cluster_id].append(word_index)
                else:
                    # Mention closes here, e.g. "7)".
                    cluster_id = int(segment[:-1])
                    start = document_state.stacks[cluster_id].pop()
                    document_state.clusters[cluster_id].append(
                        (start, word_index))
        if ner_type in person:
            if ')' in person:
                # Single-token entity of the requested type.
                document_state.people.append((word_index, word_index))
            else:
                # Entity opens here; remember the start index.
                document_state.person_start = word_index
        elif ')' in person and document_state.person_start is not None:
            # Entity closes here; pair with the remembered start.
            document_state.people.append(
                (document_state.person_start, word_index))
            document_state.person_start = None
        return None
def handle_line(line, document_state, language, labels, stats):
    """Process one CoNLL line, mutating document_state and stats in place.

    Returns the finalized document dict when the line is "#end document",
    otherwise None.  ``labels`` is accepted for interface parity with
    sibling variants but is not used in this version.
    """
    begin_document_match = re.match(conll.BEGIN_DOCUMENT_REGEX, line)
    if begin_document_match:
        # "#begin document ..." header: start a fresh document.
        document_state.assert_empty()
        document_state.doc_key = conll.get_doc_key(
            begin_document_match.group(1), begin_document_match.group(2))
        return None
    elif line.startswith("#end document"):
        document_state.assert_finalizable()
        finalized_state = document_state.finalize()
        # Corpus-level bookkeeping over the finished document.
        stats["num_clusters"] += len(finalized_state["clusters"])
        stats["num_mentions"] += sum(
            len(c) for c in finalized_state["clusters"])
        return finalized_state
    else:
        row = line.split()
        if len(row) == 0:
            # Blank line: sentence boundary — update per-language stats and
            # flush the per-sentence token/speaker buffers.
            stats["max_sent_len_{}".format(language)] = max(
                len(document_state.text),
                stats["max_sent_len_{}".format(language)])
            stats["num_sents_{}".format(language)] += 1
            document_state.sentences.append(tuple(document_state.text))
            del document_state.text[:]
            document_state.speakers.append(tuple(document_state.text_speakers))
            del document_state.text_speakers[:]
            return None
        assert len(row) >= 12
        doc_key = conll.get_doc_key(row[0], row[1])  # NOTE(review): unused here
        word = normalize_word(row[3], language)
        parse = row[5]  # parse-bit column; unused in this variant
        speaker = row[9]
        ner = row[10]  # NER column; unused in this variant
        coref = row[-1]
        # Document-level index of this token (flushed sentences plus the
        # current sentence buffer).
        word_index = len(document_state.text) + sum(
            len(s) for s in document_state.sentences)
        document_state.text.append(word)
        document_state.text_speakers.append(speaker)
        if coref != "-":
            # Coref column, e.g. "(7|12)": "(" opens a mention, ")" closes one.
            for segment in coref.split("|"):
                if segment[0] == "(":
                    if segment[-1] == ")":
                        # Single-token mention, e.g. "(7)".
                        cluster_id = int(segment[1:-1])
                        document_state.clusters[cluster_id].append(
                            (word_index, word_index))
                    else:
                        # Mention opens here, e.g. "(7".
                        cluster_id = int(segment[1:])
                        document_state.coref_stacks[cluster_id].append(
                            word_index)
                else:
                    # Mention closes here; pair with the remembered start.
                    cluster_id = int(segment[:-1])
                    start = document_state.coref_stacks[cluster_id].pop()
                    document_state.clusters[cluster_id].append(
                        (start, word_index))
        return None
def minimize_partition(name, language, extension, labels, stats, tokenizer,
                       seg_len, input_dir, output_dir):
    """Split one CoNLL partition into documents and write them as jsonlines.

    Reads ``{input_dir}/{name}.{language}.{extension}``, groups lines by
    "#begin document" / "#end document" markers, converts each kept document
    via ``get_document`` and writes one JSON object per line to
    ``{output_dir}/{name}.{language}.{seg_len}.jsonlines``.

    ``labels`` and ``stats`` are accepted for interface parity but unused
    here.  Fix: stream the input file instead of ``readlines()``, which
    materialized the whole file in memory for no benefit.
    """
    input_path = "{}/{}.{}.{}".format(input_dir, name, language, extension)
    output_path = "{}/{}.{}.{}.jsonlines".format(
        output_dir, name, language, seg_len)
    count = 0
    print("Minimizing {}".format(input_path))
    documents = []  # list of (doc_key, lines)
    with open(input_path, "r") as input_file:
        for line in input_file:  # stream: avoids readlines()' full copy
            begin_document_match = re.match(conll.BEGIN_DOCUMENT_REGEX, line)
            if begin_document_match:
                doc_key = conll.get_doc_key(begin_document_match.group(1),
                                            begin_document_match.group(2))
                documents.append((doc_key, []))
            elif line.startswith("#end document"):
                continue
            else:
                documents[-1][1].append(line)
    with open(output_path, "w") as output_file:
        for document_lines in documents:
            if skip(document_lines[0]):
                # Filtered out by doc_key (e.g. excluded genres).
                continue
            document = get_document(document_lines, tokenizer, language,
                                    seg_len)
            output_file.write(json.dumps(document))
            output_file.write("\n")
            count += 1
    print("Wrote {} documents to {}".format(count, output_path))
def minimize_partition(partition, extension, args, tokenizer):
    """Convert one CoNLL partition into a jsonlines file of documents.

    Reads ``{input_dir}/{partition}.{language}.{extension}``, splits it on
    document markers, then writes each non-skipped document as one JSON
    object per line to the output directory.
    """
    input_path = os.path.join(args.input_dir,
                              f'{partition}.{args.language}.{extension}')
    output_path = os.path.join(
        args.output_dir,
        f'{partition}.{args.language}.{args.seg_len}.jsonlines')
    logger.info(f'Minimizing {input_path}...')

    # Pass 1: gather (doc_key, lines) pairs for every document in the file.
    documents = []
    with open(input_path, 'r') as input_file:
        for raw_line in input_file.readlines():
            header_match = re.match(conll.BEGIN_DOCUMENT_REGEX, raw_line)
            if header_match:
                key = conll.get_doc_key(header_match.group(1),
                                        header_match.group(2))
                documents.append((key, []))
            elif raw_line.startswith('#end document'):
                continue
            else:
                documents[-1][1].append(raw_line)

    # Pass 2: segment each kept document and emit one JSON object per line.
    doc_count = 0
    with open(output_path, 'w') as output_file:
        for doc_key, doc_lines in documents:
            if skip_doc(doc_key):
                continue
            document = get_document(doc_key, doc_lines, args.language,
                                    args.seg_len, tokenizer)
            output_file.write(json.dumps(document))
            output_file.write('\n')
            doc_count += 1
    logger.info(f'Processed {doc_count} documents to {output_path}')
def handle_line(line, document_state):
    """Process a single CoNLL line, mutating document_state.

    Returns the finalized document on a "#end document" line, else None.
    """
    header_match = re.match(conll.BEGIN_DOCUMENT_REGEX, line)
    if header_match:
        document_state.assert_empty()
        document_state.doc_key = conll.get_doc_key(header_match.group(1),
                                                   header_match.group(2))
        return None
    if line.startswith("#end document"):
        document_state.assert_finalizable()
        return document_state.finalize()

    row = line.split()
    if not row:
        # Sentence boundary: flush the token and speaker buffers.
        document_state.sentences.append(tuple(document_state.text))
        del document_state.text[:]
        document_state.speakers.append(tuple(document_state.text_speakers))
        del document_state.text_speakers[:]
        return None

    assert len(row) >= 12
    word = normalize_word(row[3])
    coref = row[-1]
    doc_key = conll.get_doc_key(row[0], row[1])
    speaker = row[9]
    # Document-level token index: flushed sentences plus current buffer.
    word_index = len(document_state.text) + sum(
        len(sentence) for sentence in document_state.sentences)
    document_state.text.append(word)
    document_state.text_speakers.append(speaker)
    if coref == "-":
        return None
    for segment in coref.split("|"):
        opens = segment.startswith("(")
        closes = segment.endswith(")")
        if opens and closes:
            # Single-token mention, e.g. "(7)".
            cluster_id = int(segment[1:-1])
            document_state.clusters[cluster_id].append(
                (word_index, word_index))
        elif opens:
            # Mention opens here; remember where it started.
            cluster_id = int(segment[1:])
            document_state.stacks[cluster_id].append(word_index)
        else:
            # Mention closes here; pair with the stacked start.
            cluster_id = int(segment[:-1])
            start = document_state.stacks[cluster_id].pop()
            document_state.clusters[cluster_id].append((start, word_index))
    return None
def read_conll_file(conll_file_path: str) -> List[Tuple]:
    """Read a CoNLL file into a list of (doc_key, stripped_lines) pairs.

    Document boundaries are detected via the "#begin document" header
    (matched by ``conll.BEGIN_DOCUMENT_REGEX``) and "#end document" lines;
    end markers are discarded.
    """
    documents = []
    with open(conll_file_path) as conll_file:
        for raw_line in conll_file:
            header_match = re.match(conll.BEGIN_DOCUMENT_REGEX, raw_line)
            if header_match:
                key = conll.get_doc_key(header_match.group(1),
                                        header_match.group(2))
                documents.append((key, []))
            elif raw_line.startswith("#end document"):
                continue
            else:
                documents[-1][1].append(raw_line.strip())
    return documents
def handle_line(line, document_state):
    """Process one CoNLL line, accumulating tokens, POS tags, BIO NER tags,
    speakers, and coreference clusters on document_state.

    Returns the finalized document on a "#end document" line, else None.

    Fix: the NER "(" branch had two identical sub-branches (both produced
    ``"B-" + ner_tag[1:-1]``); they are merged, with the stack push kept
    only for multi-token entities.  Dead commented-out prints removed.
    """
    begin_document_match = re.match(conll.BEGIN_DOCUMENT_REGEX, line)
    if begin_document_match:
        document_state.assert_empty()
        document_state.doc_key = conll.get_doc_key(
            begin_document_match.group(1), begin_document_match.group(2))
        return None
    elif line.startswith("#end document"):
        document_state.assert_finalizable()
        return document_state.finalize()
    else:
        row = line.split()
        if len(row) == 0:
            # Sentence boundary: flush all per-sentence buffers.
            document_state.sentences.append(tuple(document_state.text))
            document_state.pos_tags.append(tuple(document_state.text_pos_tags))
            del document_state.text_pos_tags[:]
            document_state.ner_tags.append(tuple(document_state.text_ner_tags))
            del document_state.text_ner_tags[:]
            del document_state.text[:]
            document_state.speakers.append(tuple(document_state.text_speakers))
            del document_state.text_speakers[:]
            return None
        assert len(row) >= 12
        word = normalize_word(row[3])
        coref = row[-1]
        doc_key = conll.get_doc_key(row[0], row[1])  # NOTE(review): unused; kept for parity with sibling variants
        speaker = row[9]

        pos_tag = row[4]
        document_state.text_pos_tags.append(pos_tag)

        # Convert the bracketed CoNLL NER column into a BIO tag.
        ner_tag = row[10]
        if ner_tag[0] == "(":
            # Entity begins at this token, e.g. "(PERSON)" or "(PERSON*".
            ner = "B-" + ner_tag[1:-1]
            if ner_tag[-1] != ")":
                # Multi-token entity: remember its label until it closes.
                document_state.ner_stack.append(ner_tag[1:-1])
        elif len(document_state.ner_stack) > 0:
            # Inside an open entity: continue it; pop when it closes ("*)").
            ner = "I-" + document_state.ner_stack[-1]
            if ner_tag[-1] == ")":
                document_state.ner_stack.pop()
        else:
            ner = "O"
        document_state.text_ner_tags.append(ner)

        # Document-level token index: flushed sentences plus current buffer.
        word_index = len(document_state.text) + sum(
            len(s) for s in document_state.sentences)
        document_state.text.append(word)
        document_state.text_speakers.append(speaker)
        if coref == "-":
            return None
        for segment in coref.split("|"):
            if segment[0] == "(":
                if segment[-1] == ")":
                    # Single-token mention, e.g. "(7)".
                    cluster_id = int(segment[1:-1])
                    document_state.clusters[cluster_id].append(
                        (word_index, word_index))
                else:
                    # Mention opens here.
                    cluster_id = int(segment[1:])
                    document_state.stacks[cluster_id].append(word_index)
            else:
                # Mention closes here; pair with the stacked start.
                cluster_id = int(segment[:-1])
                start = document_state.stacks[cluster_id].pop()
                document_state.clusters[cluster_id].append(
                    (start, word_index))
        return None
def handle_line(line, document_state, language, labels, stats):
    """Process one CoNLL line for the video-grounded coref variant.

    In addition to tokens/speakers/clusters, accumulates per-token start and
    end times, video feature file names, a coarse gender signal, and a
    first-person-pronoun flag.  Returns the finalized document dict on
    "#end document", otherwise None.
    """
    begin_document_match = re.match(conll.BEGIN_DOCUMENT_REGEX, line)
    if begin_document_match:
        # "#begin document ..." header: start a fresh document.
        document_state.assert_empty()
        document_state.doc_key = conll.get_doc_key(
            begin_document_match.group(1), begin_document_match.group(2))
        return None
    elif line.startswith("#end document"):
        document_state.assert_finalizable()
        finalized_state = document_state.finalize()
        # Corpus-level bookkeeping and NER label collection.
        stats["num_clusters"] += len(finalized_state["clusters"])
        stats["num_mentions"] += sum(
            len(c) for c in finalized_state["clusters"])
        #labels["{}_const_labels".format(language)].update(l for _, _, l in finalized_state["constituents"])
        labels["ner"].update(l for _, _, l in finalized_state["ner"])
        return finalized_state
    else:
        row = line.split()
        if len(row) == 0:
            # Blank line: sentence boundary — update stats, then flush every
            # per-sentence buffer into its per-document list.
            stats["max_sent_len_{}".format(language)] = max(
                len(document_state.text),
                stats["max_sent_len_{}".format(language)])
            stats["num_sents_{}".format(language)] += 1
            document_state.sentences.append(tuple(document_state.text))
            del document_state.text[:]
            document_state.speakers.append(tuple(document_state.text_speakers))
            del document_state.text_speakers[:]
            document_state.start_times.append(
                tuple(document_state.start_times_item))
            document_state.end_times.append(
                tuple(document_state.end_times_item))
            document_state.video_npy_files.append(
                tuple(document_state.video_npy_files_item))
            document_state.genders.append(tuple(document_state.text_genders))
            document_state.fpronouns.append(
                tuple(document_state.text_fpronouns))
            del document_state.start_times_item[:]
            del document_state.end_times_item[:]
            del document_state.video_npy_files_item[:]
            del document_state.text_genders[:]
            del document_state.text_fpronouns[:]
            return None
        assert len(row) >= 12
        doc_key = conll.get_doc_key(row[0], row[1])  # NOTE(review): unused here
        word = normalize_word(row[3], language)
        parse = row[5]  # parse-bit column; unused (constituent handling disabled)
        speaker = row[9]
        ner = row[10]
        # Time columns use the sentinel 'NOTIME' -> -1.
        st_time = -1 if (row[-4] == 'NOTIME') else int(row[-4])
        en_time = -1 if (row[-3] == 'NOTIME') else int(row[-3])
        video_npy_file = row[-2]
        coref = row[-1]
        # Document-level token index: flushed sentences plus current buffer.
        word_index = len(document_state.text) + sum(
            len(s) for s in document_state.sentences)
        document_state.text.append(word)
        document_state.text_speakers.append(speaker)
        document_state.start_times_item.append(st_time)
        document_state.end_times_item.append(en_time)
        document_state.video_npy_files_item.append(video_npy_file)
        # Coarse lexical gender signal: 1 = masculine, -1 = feminine, 0 = n/a.
        if (word.lower() in ['he', 'him', 'his', 'himself', 'boy', 'man']):
            gender = 1
        elif (word.lower() in ['she', 'her', 'hers', 'herself', 'girl',
                               'woman', 'lady']):
            gender = -1
        else:
            gender = 0
        document_state.text_genders.append(gender)
        # 1 iff the token is a first-person singular pronoun.
        firstpronoun = 1 if (word.lower() in ['i', 'my', 'me', 'mine',
                                              'myself']) else 0
        document_state.text_fpronouns.append(firstpronoun)
        #handle_bit(word_index, parse, document_state.const_stack, document_state.constituents)
        # Collect NER spans from the bracketed NER column.
        handle_bit(word_index, ner, document_state.ner_stack,
                   document_state.ner)
        if coref != "-":
            # Coref column: "(" opens a mention, ")" closes one.
            for segment in coref.split("|"):
                if segment[0] == "(":
                    if segment[-1] == ")":
                        # Single-token mention, e.g. "(7)".
                        cluster_id = int(segment[1:-1])
                        document_state.clusters[cluster_id].append(
                            (word_index, word_index))
                    else:
                        # Mention opens here.
                        cluster_id = int(segment[1:])
                        document_state.coref_stacks[cluster_id].append(
                            word_index)
                else:
                    # Mention closes here; pair with the remembered start.
                    cluster_id = int(segment[:-1])
                    start = document_state.coref_stacks[cluster_id].pop()
                    document_state.clusters[cluster_id].append(
                        (start, word_index))
        return None
def handle_line(line, document_state, language, labels, stats):
    """Process one CoNLL line for the variant that also tracks grounded
    entities ("<id" / "id>" brackets) and numeric NER spans ("[id" / "id]").

    Returns the finalized document dict on "#end document", otherwise None.
    """
    begin_document_match = re.match(conll.BEGIN_DOCUMENT_REGEX, line)
    if begin_document_match:
        # "#begin document ..." header: start a fresh document.
        document_state.assert_empty()
        document_state.doc_key = conll.get_doc_key(
            begin_document_match.group(1), begin_document_match.group(2))
        print(document_state.doc_key)
        return None
    elif line.startswith("#end document"):
        #document_state.assert_finalizable()
        finalized_state = document_state.finalize()
        # Corpus-level bookkeeping and constituent label collection.
        stats["num_clusters"] += len(finalized_state["clusters"])
        stats["num_mentions"] += sum(
            len(c) for c in finalized_state["clusters"])
        labels["{}_const_labels".format(language)].update(
            l for _, _, l in finalized_state["constituents"])
        #labels["ner"].update(l for _, _, l in finalized_state["ner"])
        return finalized_state
    else:
        row = line.split()
        if len(row) == 0:
            # Blank line: sentence boundary — update stats and flush buffers.
            stats["max_sent_len_{}".format(language)] = max(
                len(document_state.text),
                stats["max_sent_len_{}".format(language)])
            stats["num_sents_{}".format(language)] += 1
            document_state.sentences.append(tuple(document_state.text))
            del document_state.text[:]
            document_state.speakers.append(tuple(document_state.text_speakers))
            del document_state.text_speakers[:]
            return None
        assert len(row) >= 12
        doc_key = conll.get_doc_key(row[0], row[1])  # NOTE(review): unused here
        word = normalize_word(row[3], language)
        #POS = row[4]
        #head_POS = row[7]
        parse = row[5]  # parse-bit column; unused (constituent handling disabled)
        speaker = row[9]  # NOTE(review): unused in this variant
        ner = row[10]
        # Time columns use the sentinel 'NOTIME' -> -1.
        st_time = -1 if (row[-4] == 'NOTIME') else int(row[-4])
        en_time = -1 if (row[-3] == 'NOTIME') else int(row[-3])
        video_npy_file = row[-2]
        coref = row[-1]
        entity = row[-5]
        # Document-level token index: flushed sentences plus current buffer.
        word_index = len(document_state.text) + sum(
            len(s) for s in document_state.sentences)
        document_state.text.append(word)
        document_state.text_speakers.append(speaker)
        #document_state.POS.append(pos)
        #document_state.head_POS.append(head_POS)
        # Record a (start, end, npy) segment only when the time span changes,
        # i.e. consecutive tokens sharing a span are collapsed to one entry.
        if (len(document_state.start_times) == 0 or
                (not (document_state.start_times[-1] == st_time and
                      document_state.end_times[-1] == en_time))):
            document_state.start_times.append(st_time)
            document_state.end_times.append(en_time)
            document_state.video_npy_files.append(video_npy_file)
        #print(word_index, parse)
        #handle_bit(word_index, parse, document_state.const_stack, document_state.constituents)
        #handle_bit(word_index, ner, document_state.ner_stack, document_state.ner)
        #coref_number = 0
        #entity_number = 0
        if coref != "-":
            # Coref column: "(" opens a mention, ")" closes one.
            for segment in coref.split("|"):
                if segment[0] == "(":
                    if segment[-1] == ")":
                        # Single-token mention, e.g. "(7)".
                        cluster_id = int(segment[1:-1])
                        document_state.clusters[cluster_id].append(
                            (word_index, word_index))
                        #coref_number += 1
                    else:
                        # Mention opens here.
                        cluster_id = int(segment[1:])
                        document_state.coref_stacks[cluster_id].append(
                            word_index)
                else:
                    # Mention closes here; pair with the remembered start.
                    cluster_id = int(segment[:-1])
                    #print(segment,cluster_id)
                    start = document_state.coref_stacks[cluster_id].pop()
                    #coref_number += 1
                    document_state.clusters[cluster_id].append(
                        (start, word_index))
        if entity != "-":
            # Entity column uses angle brackets: "<id...id>" spans.
            for segment in entity.split("|"):
                if segment[0] == "<":
                    if segment[-1] == ">":
                        # Single-token entity, e.g. "<3>".
                        entity_id = int(segment[1:-1])
                        document_state.entities.append(
                            (word_index, word_index, entity_id))
                        #entity_number += 1
                    else:
                        # Entity opens here.
                        entity_id = int(segment[1:])
                        document_state.entity_stacks[entity_id].append(
                            word_index)
                else:
                    # Entity closes here.
                    entity_id = int(segment[:-1])
                    #print(segment,entity_id)
                    start = document_state.entity_stacks[entity_id].pop()
                    #entity_number += 1
                    document_state.entities.append(
                        (start, word_index, entity_id))
        if ner != "*":
            # NER column uses square brackets with numeric ids: "[id...id]".
            for segment in ner.split("|"):
                if segment[0] == "[":
                    if segment[-1] == "]":
                        # Single-token NER span, e.g. "[5]".
                        ner_id = int(segment[1:-1])
                        document_state.ners.append(
                            (word_index, word_index, ner_id))
                    else:
                        # NER span opens here.
                        ner_id = int(segment[1:])
                        document_state.ner_stacks[ner_id].append(word_index)
                else:
                    # NER span closes here.
                    ner_id = int(segment[:-1])
                    #print(segment, ner_id)
                    start = document_state.ner_stacks[ner_id].pop()
                    document_state.ners.append((start, word_index, ner_id))
        return None