def parse_odin_inst(string, corpus=None, idnum=None):
    """
    Parse ODIN-style text and build an IGT instance from it.

    The text must contain a header of the form
    ``doc_id=<id> <start> <stop> <tag types>`` terminated by a newline,
    followed by any number of ``line=<n> tag=<tag>:<text>`` entries.

    :param string: The ODIN-style text for a single instance.
    :param corpus: Optional corpus; when ``idnum`` is None and the corpus is
        truthy, its ``askIgtId()`` supplies the instance id.
    :param idnum: Optional number passed to ``gen_item_id('i', idnum)`` to
        build the instance id; takes precedence over ``corpus``.
    :return: The new ``Igt`` holding one raw tier with an item per matched
        line, after ``basic_processing`` has been applied to it.
    :raises AttributeError: If no ``doc_id=`` header is found in ``string``
        (the failed match is ``None``).
    """
    # Start by looking for the doc_id, and the line range.
    # Raw strings keep the regex escapes (\S, \s) from being read as
    # (invalid) string escapes, which newer Pythons warn about.
    doc_re = re.search(r'doc_id=(\S+)\s([0-9]+)\s([0-9]+)\s(.*)\n', string)
    docid, lnstart, lnstop, tagtypes = doc_re.groups()

    # Pick the instance id: explicit number first, then the corpus, and
    # finally one derived from the size of a fresh, empty corpus.
    if idnum is not None:
        igt_id = gen_item_id('i', idnum)
    elif corpus:
        igt_id = corpus.askIgtId()
    else:
        igt_id = 'i{}'.format(len(XigtCorpus()))

    inst = Igt(id=igt_id,
               attributes={'doc-id': docid,
                           'line-range': '%s %s' % (lnstart, lnstop),
                           'tag-types': tagtypes})

    # Now, find all the lines
    lines = re.findall(r'line=([0-9]+)\stag=(\S+):(.*)\n?', string)

    # --- 3) Create a raw tier.
    rt = Tier(id=RAW_ID, type=ODIN_TIER_TYPE,
              attributes={STATE_ATTRIBUTE: RAW_STATE}, igt=inst)

    # Items are appended one at a time so that each ask_item_id(rt) call
    # sees the tier as populated so far.
    for lineno, linetag, linetxt in lines:
        item = Item(id=ask_item_id(rt), text=linetxt,
                    attributes={'tag': linetag, 'line': lineno}, tier=rt)
        rt.append(item)

    inst.append(rt)
    basic_processing(inst)

    return inst
def xc_load(path, mode=FULL, do_basic_processing=False):
    """
    Load a Xigt corpus from the XML file at ``path``.

    :param path: Path of the file to read; it is opened as UTF-8 text.
    :param mode: Loading mode handed through to ``xigtxml.load``.
    :param do_basic_processing: When true, ``basic_processing`` is run on
        every instance of the loaded corpus before it is returned.
    :return: The corpus object produced by ``xigtxml.load``.
    """
    handle = open(path, 'r', encoding='utf-8')
    keep_open = False
    try:
        xc = xigtxml.load(handle, mode=mode)
        if do_basic_processing:
            for inst in xc:
                basic_processing(inst)
        # NOTE(review): modes other than FULL presumably read instances
        # lazily from the handle, so it is left open for them -- confirm
        # against xigtxml.load. In FULL mode the corpus is assumed to be
        # completely read by this point.
        keep_open = mode != FULL
        return xc
    finally:
        # Release the file on any failure, and on success in FULL mode,
        # instead of leaving it for the garbage collector.
        if not keep_open:
            handle.close()