def format_doc(file_id, raw_paras, ptb_text, dep_text, ner_text):
    """Assemble the document dict for one annotated file.

    The PTB, dependency and (optional) NER annotation texts are split into
    per-sentence chunks by ``read_ptb.split``, ``read_conll.split`` and
    ``read_ner.split`` and then grouped into paragraphs via ``format_para``.

    Args:
        file_id: Value stored under the ``'id'`` key of the result.
        raw_paras: Iterable of paragraphs, each a sequence of raw sentence
            strings, or ``None`` to place every sentence in one paragraph
            that has no raw text.
        ptb_text: Bracketed-parse text for the whole file.
        dep_text: Dependency text for the whole file.
        ner_text: NER text for the whole file, or ``None``; when ``None`` each
            sentence gets ``None`` as its NER annotation.

    Returns:
        ``{'id': file_id, 'paragraphs': [...]}``, or ``None`` when the PTB and
        dependency texts split into a different number of sentences.
    """
    ptb_sents = read_ptb.split(ptb_text)
    dep_sents = read_conll.split(dep_text)
    if len(ptb_sents) != len(dep_sents):
        return None

    ner_sents = (read_ner.split(ner_text) if ner_text is not None
                 else [None] * len(ptb_sents))

    if raw_paras is None:
        whole_doc = format_para(None, ptb_sents, dep_sents, ner_sents)
        return {'id': file_id, 'paragraphs': [whole_doc]}

    paragraphs = []
    start = 0
    for raw_sents in raw_paras:
        # The annotation sentences are consumed in step with the raw ones:
        # a paragraph of k raw sentences takes the next k annotated sentences.
        end = start + len(raw_sents)
        raw_text = ' '.join(raw_sents).replace('<SEP>', '')
        para = format_para(raw_text,
                           ptb_sents[start:end],
                           dep_sents[start:end],
                           ner_sents[start:end])
        # Paragraphs that end up with no sentences are left out.
        if para['sentences']:
            paragraphs.append(para)
        start = end
    return {'id': file_id, 'paragraphs': paragraphs}
def format_doc(file_id, raw_paras, ptb_text, dep_text, ner_text):
    """Combine the per-file annotation texts into a single document dict.

    Args:
        file_id: Identifier stored as ``doc['id']``.
        raw_paras: Paragraphs as sequences of raw sentence strings, or ``None``
            if no raw paragraph segmentation is available.
        ptb_text: Text handed to ``read_ptb.split``.
        dep_text: Text handed to ``read_conll.split``.
        ner_text: Text handed to ``read_ner.split``; may be ``None``, in which
            case every sentence is paired with a ``None`` NER entry.

    Returns:
        A dict with keys ``'id'`` and ``'paragraphs'``. ``None`` is returned
        if the PTB and dependency sentence counts disagree.
    """
    ptb_sents = read_ptb.split(ptb_text)
    dep_sents = read_conll.split(dep_text)
    if len(ptb_sents) != len(dep_sents):
        return None

    if ner_text is None:
        ner_sents = [None] * len(ptb_sents)
    else:
        ner_sents = read_ner.split(ner_text)

    doc = {'id': file_id}
    if raw_paras is None:
        # Without raw paragraphs the whole file becomes one paragraph.
        doc['paragraphs'] = [
            format_para(None, ptb_sents, dep_sents, ner_sents)
        ]
        return doc

    doc['paragraphs'] = []
    offset = 0
    for raw_sents in raw_paras:
        n_sents = len(raw_sents)
        # One window selects this paragraph's share of every annotation layer.
        window = slice(offset, offset + n_sents)
        para = format_para(
            ' '.join(raw_sents).replace('<SEP>', ''),
            ptb_sents[window],
            dep_sents[window],
            ner_sents[window])
        if para['sentences']:
            doc['paragraphs'].append(para)
        offset += n_sents
    return doc
def read_ptb_sec(ptb_sec_dir):
    """Read every parse file in a treebank section directory.

    Entries whose path ends in ``'parse'`` or ``'mrg'`` are read; everything
    else in the directory is ignored. Each parse found by ``read_ptb.split``
    is turned into a space-joined string of its words after they are passed
    through ``_reform_ptb_word``.

    Args:
        ptb_sec_dir: Directory to scan (string or ``Path``).

    Returns:
        A list with one entry per matching file, in sorted path order. Each
        entry is a list of ``(filename, sentence_string)`` tuples, where
        ``filename`` is the file's name up to its first ``'.'``.
    """
    ptb_sec_dir = Path(ptb_sec_dir)
    files = []
    # iterdir() yields entries in arbitrary, filesystem-dependent order;
    # sorting makes the result reproducible across machines and runs.
    for loc in sorted(ptb_sec_dir.iterdir()):
        if not str(loc).endswith(('parse', 'mrg')):
            continue
        filename = loc.name.split('.')[0]
        with loc.open() as file_:
            text = file_.read()
        sents = []
        for parse_str in read_ptb.split(text):
            # Only the words are needed; the second element is discarded.
            words, _ = read_ptb.parse(parse_str, strip_bad_periods=True)
            string = ' '.join(_reform_ptb_word(word) for word in words)
            sents.append((filename, string))
        files.append(sents)
    return files
def do_web(src_dir, onto_dir, out_dir):
    """Report which annotated web documents have a matching source file.

    ``map.txt`` in ``onto_dir`` is read as whitespace-separated
    ``annotation_name source_name`` pairs; lines that do not have exactly two
    fields are ignored. For every annotation name starting with ``'eng'`` the
    function looks for ``<onto_dir>/<annotation_name>.parse`` and
    ``<src_dir>/<source_name>.sgm``. If both exist they are loaded and
    ``'Found'`` is printed, otherwise ``'Miss'`` is printed.

    Args:
        src_dir: Directory holding the ``.sgm`` source files.
        onto_dir: Directory holding ``map.txt`` and the ``.parse`` files.
        out_dir: Accepted but not used by this function.

    Returns:
        None.
    """
    # All files are opened in ``with`` blocks so the handles are closed
    # promptly instead of being left to the garbage collector.
    with open(path.join(onto_dir, 'map.txt')) as map_file:
        split_lines = (line.split() for line in map_file)
        mapping = dict(fields for fields in split_lines if len(fields) == 2)

    for annot_fn, src_fn in mapping.items():
        if not annot_fn.startswith('eng'):
            continue

        ptb_loc = path.join(onto_dir, annot_fn + '.parse')
        src_loc = path.join(src_dir, src_fn + '.sgm')

        if not (path.exists(ptb_loc) and path.exists(src_loc)):
            print('Miss')
            continue

        with open(src_loc) as src_file:
            src_text = src_file.read()
        # NOTE(review): src_doc and ptb_doc are built but never used or
        # written to out_dir; the calls are kept so their behaviour (including
        # any errors they raise) is unchanged.
        src_doc = sgml_extract(src_text)
        with open(ptb_loc) as ptb_file:
            ptb_text = ptb_file.read()
        ptb_doc = [read_ptb.parse(parse_str, strip_bad_periods=True)[0]
                   for parse_str in read_ptb.split(ptb_text)]
        print('Found')
def do_web(src_dir, onto_dir, out_dir):
    """Check each mapped web annotation against its source document.

    Reads ``<onto_dir>/map.txt``, keeping only lines that split into exactly
    two whitespace-separated fields (annotation name, source name). For each
    annotation name beginning with ``'eng'``, prints ``'Found'`` when both
    ``<onto_dir>/<annotation>.parse`` and ``<src_dir>/<source>.sgm`` exist
    (after loading them), and ``'Miss'`` otherwise.

    Args:
        src_dir: Directory containing ``.sgm`` files.
        onto_dir: Directory containing ``map.txt`` and ``.parse`` files.
        out_dir: Currently unused.

    Returns:
        None.
    """
    mapping = {}
    # Context managers guarantee every file handle is closed, which the
    # bare ``open(...).read()`` calls did not.
    with open(path.join(onto_dir, 'map.txt')) as map_file:
        for line in map_file:
            fields = line.split()
            if len(fields) == 2:
                mapping[fields[0]] = fields[1]

    for annot_fn, src_fn in mapping.items():
        if not annot_fn.startswith('eng'):
            continue

        ptb_loc = path.join(onto_dir, annot_fn + '.parse')
        src_loc = path.join(src_dir, src_fn + '.sgm')

        if path.exists(ptb_loc) and path.exists(src_loc):
            with open(src_loc) as src_file:
                src_doc = sgml_extract(src_file.read())
            with open(ptb_loc) as ptb_file:
                parse_strs = read_ptb.split(ptb_file.read())
            # NOTE(review): the extracted documents are not used further or
            # written to out_dir; only the Found/Miss report is produced.
            ptb_doc = [
                read_ptb.parse(parse_str, strip_bad_periods=True)[0]
                for parse_str in parse_strs
            ]
            print('Found')
        else:
            print('Miss')