Exemple #1
0
def make_igt_raw_tier(block, options):
    """Build the raw ('r') ODIN tier from the lines of a block.

    Each entry of ``block['lines']`` is unpacked as an
    ``(attributes, text)`` pair and becomes one Item whose id is ``r``
    followed by its 1-based position. A block without a ``'lines'`` key
    produces a tier with no items. ``options`` is accepted but not used.

    Returns the new Tier (id ``'r'``, type ``'odin'``, state ``'raw'``).
    """
    raw_items = []
    for position, (attrs, text) in enumerate(block.get('lines', []), 1):
        raw_items.append(
            Item(id='r{}'.format(position), attributes=attrs, text=text)
        )
    return Tier(id='r', type='odin', attributes={'state': 'raw'},
                items=raw_items)
Exemple #2
0
def make_igt_raw_tier(block, options):
    """Build the raw ('r') ODIN tier from the line records of a block.

    Each entry of ``block['lines']`` is a mapping. Its ``'content'`` value
    (the empty string when the key is absent) is passed through
    ``replace_invalid_xml_chars`` together with
    ``options['replacement_char']`` and becomes the item text; every other
    key of the record is kept as an item attribute. Items are numbered
    ``r1``, ``r2``, ... in input order.

    Returns the new Tier (id ``'r'``, type ``'odin'``, state ``'raw'``).
    """
    items = []
    for index, linedata in enumerate(block.get('lines', []), 1):
        text = replace_invalid_xml_chars(linedata.get('content', ''),
                                         options['replacement_char'])
        attrs = linedata.copy()
        # A record without 'content' is already tolerated when the text is
        # read above, so removing the key must not raise KeyError either.
        attrs.pop('content', None)
        items.append(
            Item(id='r{}'.format(index), attributes=attrs, text=text)
        )
    return Tier(id='r', type='odin', attributes={'state': 'raw'}, items=items)
Exemple #3
0
def make_phrase_tier(tier_id, aln_tokens):
    """Create a 'phrases' tier that holds a single item.

    The item's text is every token found in the second element of each
    entry of ``aln_tokens``, joined with single spaces; its id is
    ``tier_id`` followed by ``1``.
    """
    phrase_id = '{}1'.format(tier_id)
    words = []
    for aligned in aln_tokens:
        # Only the second element of each entry contributes tokens.
        words.extend(aligned[1])
    phrase = Item(id=phrase_id, text=' '.join(words))
    return Tier(id=tier_id, type='phrases', items=[phrase])
Exemple #4
0
def add_normalized_tier(igt, options):
    """Append a normalized ('n') ODIN tier to ``igt``.

    The source is the tier ``igt.get`` returns for id ``'c'``, with the
    ``'r'`` tier supplied as the default. The new tier's items come from
    ``normalize_items`` applied to the source tier's items, and it is
    aligned to the source tier's id. ``options`` is accepted but not used.
    """
    # The 'r' tier is looked up unconditionally: it is the get() default.
    raw_tier = igt['r']
    source_tier = igt.get('c', default=raw_tier)
    normalized = normalize_items(source_tier.items)
    new_tier = Tier(id='n', type='odin', alignment=source_tier.id,
                    attributes={'state': 'normalized'}, items=normalized)
    igt.append(new_tier)
Exemple #5
0
def add_cleaned_tier(igt, options):
    """Append a cleaned ('c') ODIN tier to ``igt``.

    The new tier's items are produced by ``clean_items`` from the items of
    the ``'r'`` tier, and the tier is aligned to that tier's id.
    ``options`` is accepted but not used.
    """
    source_tier = igt['r']
    cleaned = clean_items(source_tier.items)
    new_tier = Tier(id='c', type='odin', alignment=source_tier.id,
                    attributes={'state': 'cleaned'}, items=cleaned)
    igt.append(new_tier)
Exemple #6
0
def default_decode_tier(elem):
    """Decode a ``tier`` element into a Tier, then clear the element.

    ``elem.tag`` is split by ``_qname_split`` into a namespace and a local
    tag name, and the local name is asserted to be ``'tier'``. The tier's
    id and type are read from the element's ``id`` and ``type``
    attributes; the remaining attributes come from ``get_attributes``.
    ``metadata`` and ``item`` children are decoded with ``decode_metadata``
    and ``decode_item`` respectively.

    Returns the decoded Tier. ``elem`` is cleared before returning.
    """
    namespace, local_name = _qname_split(elem.tag)
    assert local_name == 'tier'

    tier_id = elem.get('id')
    tier_type = elem.get('type')
    extra_attrs = get_attributes(elem, ignore=('id', 'type'))
    metadata = [decode_metadata(child) for child in elem.findall('metadata')]
    items = [decode_item(child) for child in elem.findall('item')]
    nsmap = elem.attrib.nsmap

    decoded = Tier(id=tier_id, type=tier_type, attributes=extra_attrs,
                   metadata=metadata, items=items,
                   namespace=namespace, nsmap=nsmap)
    # Everything needed has been copied out of the element by this point.
    elem.clear()
    return decoded
Exemple #7
0
def add_normalized_tier(igt, base_tier):
    """Append a normalized ODIN tier derived from ``base_tier`` to ``igt``.

    The tier id is the first of ``'n'``, ``'on'``, ``'normalized'``,
    ``'odin-normalized'`` for which ``igt.get`` returns None. When none of
    them is free, a warning is logged and ``igt`` is left unchanged.
    Otherwise the items come from ``normalize_items(base_tier, <id>)`` and
    the new tier is aligned to ``base_tier.id``.
    """
    candidates = ('n', 'on', 'normalized', 'odin-normalized')
    # First candidate id not already used in this IGT, or None.
    norm_id = next(
        (cand for cand in candidates if igt.get(cand) is None),
        None,
    )
    if norm_id is None:
        logging.warning('No preset ID for normalized tier was available '
                        'for IGT with id: {}'.format(str(igt.id)))
        return

    normalized = normalize_items(base_tier, norm_id)
    new_tier = Tier(id=norm_id, type='odin', alignment=base_tier.id,
                    attributes={'state': 'normalized'}, items=normalized)
    igt.append(new_tier)
Exemple #8
0
def add_cleaned_tier(igt, raw_tier):
    """Append a cleaned ODIN tier derived from ``raw_tier`` to ``igt``.

    The tier id is the first of ``'c'``, ``'oc'``, ``'cleaned'``,
    ``'odin-cleaned'`` for which ``igt.get`` returns None. When none of
    them is free, a warning is logged and ``igt`` is left unchanged.
    Otherwise the items come from ``clean_items(raw_tier, <id>)`` and the
    new tier is aligned to ``raw_tier.id``.
    """
    candidates = ('c', 'oc', 'cleaned', 'odin-cleaned')
    # First candidate id not already used in this IGT, or None.
    clean_id = next(
        (cand for cand in candidates if igt.get(cand) is None),
        None,
    )
    if clean_id is None:
        logging.warning(
            'No preset ID for cleaned tier was available for IGT with id: {}'
            .format(str(igt.id)))
        return

    cleaned = clean_items(raw_tier, clean_id)
    new_tier = Tier(id=clean_id, type='odin', alignment=raw_tier.id,
                    attributes={'state': 'cleaned'}, items=cleaned)
    igt.append(new_tier)
Exemple #9
0
def make_tier(tier_type, tier_id, aligned_tokens, algn_tier):
    """Build a tier whose items are the source tokens of ``aligned_tokens``.

    ``aligned_tokens`` is a sequence of ``(target_token, source_tokens)``
    pairs. Every source token becomes one Item whose id is ``tier_id``
    followed by a running number starting at 1.

    * If ``aligned_tokens == [(None, None)]`` the tier has no items.
    * If ``algn_tier`` is given, the tier gets an ``alignment`` attribute
      set to ``algn_tier.id``; pairs are matched positionally with
      ``algn_tier.items`` and each new item carries an ``alignment``
      attribute with the id of the matched item.
    * Otherwise the items are created without alignment attributes.

    Returns the new Tier.
    """
    attrs = OrderedDict()
    items = []
    is_placeholder = aligned_tokens == [(None, None)]

    if not is_placeholder and algn_tier is not None:
        attrs['alignment'] = algn_tier.id
        # NOTE(review): if zip_longest is itertools.zip_longest, a length
        # mismatch pads with None and fails in the lines below -- confirm
        # that this is the intended way to reject mismatched input.
        for target_item, pair in zip_longest(algn_tier.items, aligned_tokens):
            target_token, source_tokens = pair
            assert target_token == target_item.text  # FIXME is this necessary?
            for token in source_tokens:
                # Ids are 1-based, so the next number is len(items) + 1.
                items.append(
                    Item(id='{}{}'.format(tier_id, len(items) + 1),
                         text=token,
                         attributes={'alignment': target_item.id}))
    elif not is_placeholder:
        for _, source_tokens in aligned_tokens:
            for token in source_tokens:
                items.append(
                    Item(id='{}{}'.format(tier_id, len(items) + 1),
                         text=token))

    return Tier(id=tier_id, type=tier_type, items=items, attributes=attrs)
Exemple #10
0
def make_tier(tier_type, tier_id, refattr, aln_tokens, algn_tier):
    """Build a tier whose items are the source tokens of ``aln_tokens``.

    ``aln_tokens`` is a sequence of ``(target_token, source_tokens)``
    pairs. Every source token becomes one Item whose id is ``tier_id``
    followed by a running number starting at 1.

    * If ``aln_tokens == [(None, None)]`` the tier has no items.
    * If both ``refattr`` and ``algn_tier`` are given, the tier gets the
      attribute ``refattr`` set to ``algn_tier.id``; pairs are matched
      positionally with ``algn_tier.items`` and each new item carries the
      attribute ``refattr`` with the id of the matched item.
    * Otherwise the items are created without reference attributes.

    Returns the new Tier.
    """
    attrs = OrderedDict()
    items = []
    is_placeholder = aln_tokens == [(None, None)]
    has_reference = refattr is not None and algn_tier is not None

    if not is_placeholder and has_reference:
        attrs[refattr] = algn_tier.id
        # NOTE(review): if zip_longest is itertools.zip_longest, a length
        # mismatch pads with None and fails in the lines below -- confirm
        # that this is the intended way to reject mismatched input.
        for target_item, pair in zip_longest(algn_tier.items, aln_tokens):
            _, source_tokens = pair
            for token in source_tokens:
                # Ids are 1-based, so the next number is len(items) + 1.
                items.append(
                    Item(id='{}{}'.format(tier_id, len(items) + 1),
                         text=token,
                         attributes={refattr: target_item.id})
                )
    elif not is_placeholder:
        for _, source_tokens in aln_tokens:
            for token in source_tokens:
                items.append(
                    Item(id='{}{}'.format(tier_id, len(items) + 1),
                         text=token))

    return Tier(id=tier_id, type=tier_type, items=items, attributes=attrs)