def load_file(self, filepath: tuple):
    phrase_tree_path = get_resource(filepath[0])
    dep_tree_path = get_resource(filepath[1])
    pf = TimingFileIterator(phrase_tree_path)
    message_prefix = f'Loading {os.path.basename(phrase_tree_path)} and {os.path.basename(dep_tree_path)}'
    for i, (dep_sent, phrase_sent) in enumerate(zip(read_tsv_as_sents(dep_tree_path), pf)):
        # The file contains escaped slashes ('\/'); unescape them.
        phrase_sent = phrase_sent.replace('\\/', '/')
        token = [x[1] for x in dep_sent]
        pos = [x[3] for x in dep_sent]
        head = [int(x[6]) for x in dep_sent]
        rel = [x[7] for x in dep_sent]
        phrase_tree = load_trees_from_str(phrase_sent, [head], [rel], [token])
        assert len(phrase_tree) == 1, f'{phrase_tree_path} must have one tree per line.'
        phrase_tree = phrase_tree[0]
        # Yield one sample per sentence: CoNLL-style fields plus the constituency tree
        # and its HPSG conversion.
        yield {
            'FORM': token,
            'CPOS': pos,
            'HEAD': head,
            'DEPREL': rel,
            'tree': phrase_tree,
            'hpsg': phrase_tree.convert()
        }
        pf.log(f'{message_prefix} {i + 1} samples [blink][yellow]...[/yellow][/blink]')
    pf.erase()
def load_file(self, filepath: str):
    filepath = get_resource(filepath)
    if os.path.isfile(filepath):
        files = [filepath]
    else:
        assert os.path.isdir(filepath), f'{filepath} has to be a directory of CoNLL 2012'
        files = sorted(glob.glob(f'{filepath}/**/*gold_conll', recursive=True))
    timer = CountdownTimer(len(files))
    for fid, f in enumerate(files):
        timer.log('files loading[blink][yellow]...[/yellow][/blink]')
        # Column layout of *gold_conll files:
        # 0:DOCUMENT 1:PART 2:INDEX 3:WORD 4:POS 5:PARSE 6:LEMMA 7:FRAME 8:SENSE 9:SPEAKER 10:NE 11-N:ARGS N:COREF
        for sent in read_tsv_as_sents(f, ignore_prefix='#'):
            sense = [cell[7] for cell in sent]
            props = [cell[11:-1] for cell in sent]
            # Transpose so each element holds the argument-label column of one predicate.
            props = list(zip(*props))
            prd_bio_labels = [self._make_bio_labels(prop) for prop in props]
            prd_bio_labels = [self._remove_B_V(x) for x in prd_bio_labels]
            prd_indices = [i for i, x in enumerate(sense) if x != '-']
            token = [x[3] for x in sent]
            srl = [None for x in token]
            for idx, labels in zip(prd_indices, prd_bio_labels):
                srl[idx] = labels
            # Tokens that are not predicates get an all-'O' label sequence.
            srl = [x if x else ['O'] * len(token) for x in srl]
            yield {'token': token, 'srl': srl}
def bmes_to_flat(inpath, outpath):
    # Convert a BMES-tagged TSV corpus into flat text with one 'word/tag' pair per chunk.
    with open(outpath, 'w', encoding='utf-8') as out:
        for sent in read_tsv_as_sents(inpath):
            chunks = get_entities([cells[1] for cells in sent])
            chars = [cells[0] for cells in sent]
            words = []
            for tag, start, end in chunks:
                word = ''.join(chars[start:end])
                words.append(word)
            out.write(' '.join(f'{word}/{tag}' for word, (tag, _, _) in zip(words, chunks)))
            out.write('\n')
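# A minimal usage sketch (the BMES-tagged TSV path below is hypothetical):
# >>> bmes_to_flat('data/cws/train.tsv', 'data/cws/train.txt')
# Each output line then holds space-separated 'word/tag' tokens for one sentence.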
def ctb_pos_to_text_format(path, delimiter='_'):
    """
    Convert a CTB POS tagging corpus from TSV format to text format, where each word is
    followed by its POS tag.

    Args:
        path: File to be converted.
        delimiter: Delimiter between word and tag.
    """
    path = get_resource(path)
    name, ext = os.path.splitext(path)
    with open(f'{name}.txt', 'w', encoding='utf-8') as out:
        for sent in read_tsv_as_sents(path):
            out.write(' '.join([delimiter.join(x) for x in sent]))
            out.write('\n')
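# A minimal usage sketch (hypothetical path; the output is written next to the input
# with a .txt extension):
# >>> ctb_pos_to_text_format('data/ctb/dev.tsv')
# A sentence with rows ('我', 'PN') and ('爱', 'VV') becomes the line '我_PN 爱_VV'.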
def make_con_txt(conll_file, output_file):
    # Rebuild bracketed constituency trees from the PARSE column of a CoNLL-2012-style
    # file and write one tree per line.
    with open(output_file, 'w') as out:
        for sent in read_tsv_as_sents(conll_file):
            tree = []
            pos_per_sent = []
            for cell in sent:
                if cell[0] == '#begin' or cell[0] == '#end':
                    continue
                if len(cell) < 8:
                    # Malformed row; print it for inspection.
                    print(cell)
                filename, sentence_id, token_id, word, POS, parse, framefile, roleset, *_ = cell
                # Splice the terminal '(POS word)' into the partial parse at the '*' placeholder.
                parse = parse.replace('*', f'({POS} {word})')
                tree.append(parse)
                pos_per_sent.append(POS)
            bracketed = ' '.join(tree)
            out.write(bracketed)
            out.write('\n')
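# A minimal usage sketch (hypothetical paths):
# >>> make_con_txt('train.english.v4_gold_conll', 'train.english.txt')
# The output file then contains one bracketed constituency tree per sentence.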
def convert_conll03_to_json(file_path):
    dataset = []
    num_docs = [0]

    def new_doc():
        doc_key = num_docs[0]
        num_docs[0] += 1
        return {
            'doc_key': doc_key,
            'sentences': [],
            'ner': [],
        }

    doc = new_doc()
    offset = 0
    for cells in read_tsv_as_sents(file_path):
        # A -DOCSTART- marker begins a new document; flush the previous one if it has content.
        if cells[0][0] == '-DOCSTART-' and doc['ner']:
            dataset.append(doc)
            doc = new_doc()
            offset = 0
        sentence = [x[0] for x in cells]
        ner = [x[-1] for x in cells]
        ner = hanlp.utils.span_util.iobes_tags_to_spans(ner)
        # Shift span boundaries from sentence-level to document-level token offsets.
        adjusted_ner = []
        for label, (span_start, span_end) in ner:
            adjusted_ner.append([span_start + offset, span_end + offset, label])
        doc['sentences'].append(sentence)
        doc['ner'].append(adjusted_ner)
        offset += len(sentence)
    if doc['ner']:
        dataset.append(doc)
    # Write JSON lines (one document per line) next to the input file.
    output_path = os.path.splitext(file_path)[0] + '.json'
    with open(output_path, 'w') as out:
        for each in dataset:
            json.dump(each, out)
            out.write('\n')
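# A minimal usage sketch (hypothetical path; assumes the last column holds IOBES NER tags):
# >>> convert_conll03_to_json('eng.train.tsv')
# This writes 'eng.train.json', with one document per line containing 'doc_key',
# 'sentences' and document-level 'ner' spans.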