Beispiel #1
0
def parse_odin_inst(string, corpus = None, idnum=None):
    """
    Parse ODIN-style text and build an IGT instance from it.

    The text must contain a header of the form
    ``doc_id=<id> <start> <stop> <tag types>`` terminated by a newline,
    and may contain any number of ``line=<n> tag=<tag>:<text>`` entries.
    Each such entry becomes one item of a raw tier on the new instance.

    :param string: The ODIN-style text to parse.
    :param corpus: Optional corpus; when given (and ``idnum`` is None) the
                   instance id is requested via ``corpus.askIgtId()``.
    :param idnum: Optional number; when not None the instance id is built
                  with ``gen_item_id('i', idnum)`` and ``corpus`` is not
                  consulted.
    :return: The new instance, after ``basic_processing`` has been run on it.
    :raises AttributeError: if no ``doc_id=...`` header is found in
                            ``string`` (the header search returns None).
    """

    # Start by looking for the doc_id, and the line range. The patterns are
    # raw strings so that \S and \s reach the regex engine untouched rather
    # than being treated as (invalid) string escapes.
    doc_re = re.search(r'doc_id=(\S+)\s([0-9]+)\s([0-9]+)\s(.*)\n', string)
    docid, lnstart, lnstop, tagtypes = doc_re.groups()

    # Choose the instance id: an explicit number wins, then the supplied
    # corpus, and finally a fresh corpus whose length seeds the id.
    if idnum is not None:
        inst_id = gen_item_id('i', idnum)
    elif corpus:
        inst_id = corpus.askIgtId()
    else:
        corpus = XigtCorpus()
        inst_id = 'i{}'.format(len(corpus))

    inst = Igt(id = inst_id, attributes={'doc-id':docid,
                                         'line-range':'%s %s' % (lnstart, lnstop),
                                         'tag-types':tagtypes})

    # Now, find all the lines: (line number, tag, text) triples.
    lines = re.findall(r'line=([0-9]+)\stag=(\S+):(.*)\n?', string)

    # Create the raw tier that holds one item per line found above.
    rt = Tier(id = RAW_ID, type=ODIN_TIER_TYPE, attributes={STATE_ATTRIBUTE:RAW_STATE}, igt=inst)

    for lineno, linetag, linetxt in lines:
        item = Item(id = ask_item_id(rt), text=linetxt, attributes={'tag':linetag, 'line':lineno}, tier=rt)
        rt.append(item)

    inst.append(rt)
    basic_processing(inst)

    return inst
Beispiel #2
0
def xc_load(path, mode=FULL, do_basic_processing=False):
    """
    Load a corpus from the XML file at ``path``.

    :param path: Path of the file to read; it is opened as UTF-8 text.
    :param mode: Loading mode passed straight through to ``xigtxml.load``.
    :param do_basic_processing: When true, ``basic_processing`` is applied
                                to every instance of the loaded corpus.
    :return: The corpus object returned by ``xigtxml.load``.
    """
    f = open(path, 'r', encoding='utf-8')
    try:
        xc = xigtxml.load(f, mode=mode)
        if do_basic_processing:
            for inst in xc:
                basic_processing(inst)
    except BaseException:
        # Nothing is handed back to the caller on failure, so the handle
        # would otherwise be leaked; release it before propagating.
        f.close()
        raise
    # NOTE(review): the handle is intentionally left open on success, as in
    # the original. Presumably some modes let the corpus keep reading from it
    # after load() returns -- confirm, and close here when that is not so.
    return xc