Example #1
0
def data_to_discourse(data, lexicon=None):
    """Build a Discourse from parsed annotation data.

    Creates (or reuses) a Word in *lexicon* for every labelled segment on
    each word level of *data*, and appends one WordToken per occurrence to
    the returned Discourse.

    Parameters
    ----------
    data : annotation container (project type)
        Must expose ``mapping()``, ``word_levels``, ``name``, ``wav_path``
        and item access by annotation key — interface assumed from usage.
    lexicon : Corpus-like, optional
        Lexicon receiving word types; defaults to the discourse's own
        lexicon.

    Returns
    -------
    Discourse
        The populated discourse object.
    """
    attribute_mapping = data.mapping()
    d = Discourse(name=data.name, wav_path=data.wav_path)
    ind = 0  # fallback running index used when no time alignment is present
    if lexicon is None:
        lexicon = d.lexicon

    # Register token-level attributes on the discourse and type-level
    # attributes on the lexicon.
    for key, attribute in attribute_mapping.items():
        annotation = data[key]

        if annotation.token and attribute not in d.attributes:
            d.add_attribute(attribute, initialize_defaults=True)

        # Fix: membership was previously tested against d.lexicon.attributes
        # even when a different ``lexicon`` was passed in; test the same
        # object that receives the attribute.
        if not annotation.token and attribute not in lexicon.attributes:
            lexicon.add_attribute(attribute, initialize_defaults=True)

    for level in data.word_levels:
        for s in data[level]:  # enumerate index was unused
            word_kwargs = {"spelling": (attribute_mapping[level], s.label)}
            word_token_kwargs = {}
            if s.token is not None:
                for token_key, token_value in s.token.items():
                    att = attribute_mapping[token_key]
                    word_token_kwargs[att.name] = (att, token_value)
            if s.additional is not None:
                for add_key, add_value in s.additional.items():
                    att = attribute_mapping[add_key]
                    # Token-level extras go on the token, type-level on the word.
                    if data[add_key].token:
                        word_token_kwargs[att.name] = (att, add_value)
                    else:
                        word_kwargs[att.name] = (att, add_value)
            for j, r in enumerate(s.references):
                if r in data and len(data[r]) > 0:
                    seq = data[r][s.begins[j] : s.ends[j]]
                    att = attribute_mapping[r]
                    if data[r].token:
                        word_token_kwargs[att.name] = (att, seq)
                        # Prefer real time alignment from the referenced
                        # sequence when it carries begin/end times.
                        if len(seq) > 0 and seq[0].begin is not None:
                            word_token_kwargs["begin"] = seq[0].begin
                            word_token_kwargs["end"] = seq[-1].end
                    else:
                        word_kwargs[att.name] = (att, seq)

            word = lexicon.get_or_create_word(**word_kwargs)
            word_token_kwargs["word"] = word
            if "begin" not in word_token_kwargs:
                # No alignment found: fall back to the running word index.
                word_token_kwargs["begin"] = ind
                word_token_kwargs["end"] = ind + 1
            wordtoken = WordToken(**word_token_kwargs)
            word.frequency += 1
            word.wordtokens.append(wordtoken)
            d.add_word(wordtoken)
            ind += 1
    return d
Example #2
0
def data_to_discourse(data, lexicon = None):
    """Convert parsed annotation *data* into a populated Discourse.

    For every labelled segment on each word level, get-or-create a Word in
    *lexicon* and append one WordToken to the discourse.

    Parameters
    ----------
    data : annotation container (project type)
        Must expose ``mapping()``, ``word_levels``, ``name``, ``wav_path``
        and item access by annotation key — interface assumed from usage.
    lexicon : Corpus-like, optional
        Lexicon receiving word types; defaults to the discourse's own
        lexicon.

    Returns
    -------
    Discourse
        The populated discourse object.
    """
    attribute_mapping = data.mapping()
    d = Discourse(name = data.name, wav_path = data.wav_path)
    ind = 0  # fallback running index used when no time alignment is present
    if lexicon is None:
        lexicon = d.lexicon

    # Token-level attributes belong to the discourse, type-level ones to
    # the lexicon.
    for k, v in attribute_mapping.items():
        a = data[k]

        if a.token and v not in d.attributes:
            d.add_attribute(v, initialize_defaults = True)

        # Fix: the test previously used d.lexicon.attributes even when a
        # different ``lexicon`` was passed in; test the object we add to.
        if not a.token and v not in lexicon.attributes:
            lexicon.add_attribute(v, initialize_defaults = True)

    for level in data.word_levels:
        for s in data[level]:  # enumerate index was unused
            word_kwargs = {'spelling':(attribute_mapping[level], s.label)}
            word_token_kwargs = {}
            if s.token is not None:
                for token_key, token_value in s.token.items():
                    att = attribute_mapping[token_key]
                    word_token_kwargs[att.name] = (att, token_value)
            if s.additional is not None:
                for add_key, add_value in s.additional.items():
                    att = attribute_mapping[add_key]
                    # Token-level extras go on the token, type-level on the word.
                    if data[add_key].token:
                        word_token_kwargs[att.name] = (att, add_value)
                    else:
                        word_kwargs[att.name] = (att, add_value)
            for j, r in enumerate(s.references):
                if r in data and len(data[r]) > 0:
                    seq = data[r][s.begins[j]:s.ends[j]]
                    att = attribute_mapping[r]
                    if data[r].token:
                        word_token_kwargs[att.name] = (att, seq)
                        # Prefer real time alignment from the referenced
                        # sequence when it carries begin/end times.
                        if len(seq) > 0 and seq[0].begin is not None:
                            word_token_kwargs['begin'] = seq[0].begin
                            word_token_kwargs['end'] = seq[-1].end
                    else:
                        word_kwargs[att.name] = (att, seq)

            word = lexicon.get_or_create_word(**word_kwargs)
            word_token_kwargs['word'] = word
            if 'begin' not in word_token_kwargs:
                # No alignment found: fall back to the running word index.
                word_token_kwargs['begin'] = ind
                word_token_kwargs['end'] = ind + 1
            wordtoken = WordToken(**word_token_kwargs)
            word.frequency += 1
            word.wordtokens.append(wordtoken)
            d.add_word(wordtoken)
            ind += 1
    return d
Example #3
0
def data_to_discourse(data, lexicon = None, call_back=None, stop_check=None):
    """Convert parsed annotation *data* into a populated Discourse.

    For every non-empty labelled segment on each word level, get-or-create a
    Word in *lexicon* and append one WordToken to the discourse.

    Parameters
    ----------
    data : annotation container (project type)
        Must expose ``mapping()``, ``word_levels``, ``name``, ``wav_path``
        and item access by annotation key — interface assumed from usage.
    lexicon : Corpus-like, optional
        Lexicon receiving word types; defaults to the discourse's own
        lexicon (despite the attribute name, a Corpus object).
    call_back : callable, optional
        Progress callback; receives a status string, then a running count
        per word level.
    stop_check : callable, optional
        When it returns truthy, processing aborts and None is returned.

    Returns
    -------
    Discourse or None
        The populated discourse, or None if *stop_check* aborted the run.
    """
    attribute_mapping = data.mapping()

    # Locate the default spelling and transcription attributes, if present.
    spelling_name, transcription_name = None, None
    for name, value in attribute_mapping.items():
        if value.att_type == 'spelling' and value.is_default:
            spelling_name = name
        elif value.att_type == 'tier' and value.is_default:
            transcription_name = name

    # NOTE(review): the kwargs dict is passed positionally; assumed the
    # Discourse constructor accepts a single dict — confirm against the class.
    dkwargs = {'spelling_name': spelling_name, 'transcription_name': transcription_name,
               'name':data.name, 'wav_path':data.wav_path}
    d = Discourse(dkwargs)
    ind = 0  # fallback running index used when no time alignment is present
    if lexicon is None:
        lexicon = d.lexicon #despite the name, this is a Corpus object

    # Token-level attributes belong to the discourse, type-level ones to
    # the lexicon.
    for v in attribute_mapping.values():  # mapping keys were unused here
        a = data[v.name]

        if a.token and v not in d.attributes:
            d.add_attribute(v, initialize_defaults = True)

        # Fix: the test previously used d.lexicon.attributes even when a
        # different ``lexicon`` was passed in; test the object we add to.
        if not a.token and v not in lexicon.attributes:
            lexicon.add_attribute(v, initialize_defaults = True)

    if call_back is not None:
        call_back('Processing data...')
        cur = 0

    for level in data.word_levels:
        #word_levels is a list of spelling tiers, usually of length 1
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            call_back(cur)
        for s in data[level]:  # enumerate index was unused
            if not s.label:
                continue  # skip unlabelled segments
            word_kwargs = {level:(attribute_mapping[level], s.label)}
            word_token_kwargs = {}
            if s.token:
                for token_key, token_value in s.token.items():
                    att = attribute_mapping[token_key]
                    word_token_kwargs[att.name] = (att, token_value)
            if s.additional is not None:
                for add_key, add_value in s.additional.items():
                    att = attribute_mapping[add_key]
                    # Token-level extras go on the token, type-level on the word.
                    if data[add_key].token:
                        word_token_kwargs[att.name] = (att, add_value)
                    else:
                        word_kwargs[att.name] = (att, add_value)
            for j, r in enumerate(s.references):
                if r in data and len(data[r]) > 0:
                    seq = data[r][s.begins[j]:s.ends[j]]
                    att = attribute_mapping[r]
                    if data[r].token:
                        word_token_kwargs[att.name] = (att, seq)
                        # Prefer real time alignment from the referenced
                        # sequence when it carries begin/end times.
                        if len(seq) > 0 and seq[0].begin is not None:
                            word_token_kwargs['begin'] = seq[0].begin
                            word_token_kwargs['end'] = seq[-1].end
                    else:
                        word_kwargs[att.name] = (att, seq)

            word = lexicon.get_or_create_word(**word_kwargs)
            word_token_kwargs['word'] = word
            if 'begin' not in word_token_kwargs:
                # No alignment found: fall back to the running word index.
                word_token_kwargs['begin'] = ind
                word_token_kwargs['end'] = ind + 1
            wordtoken = WordToken(**word_token_kwargs)
            word.frequency += 1
            word.wordtokens.append(wordtoken)
            d.add_word(wordtoken)
            ind += 1
    return d
def data_to_discourse(data, lexicon = None, call_back=None, stop_check=None):
    """Convert parsed annotation *data* into a populated Discourse.

    For every non-empty labelled segment on each word level, get-or-create a
    Word in *lexicon* and append one WordToken to the discourse.

    Parameters
    ----------
    data : annotation container (project type)
        Must expose ``mapping()``, ``word_levels``, ``name``, ``wav_path``
        and item access by annotation key — interface assumed from usage.
    lexicon : Corpus-like, optional
        Lexicon receiving word types; defaults to the discourse's own
        lexicon (despite the attribute name, a Corpus object).
    call_back : callable, optional
        Progress callback; receives a status string, then a running count
        per word level.
    stop_check : callable, optional
        When it returns truthy, processing aborts and None is returned.

    Returns
    -------
    Discourse or None
        The populated discourse, or None if *stop_check* aborted the run.
    """
    attribute_mapping = data.mapping()

    # Locate the default spelling and transcription attributes, if present.
    spelling_name, transcription_name = None, None
    for name, value in attribute_mapping.items():
        if value.att_type == 'spelling' and value.is_default:
            spelling_name = name
        elif value.att_type == 'tier' and value.is_default:
            transcription_name = name

    # NOTE(review): the kwargs dict is passed positionally; assumed the
    # Discourse constructor accepts a single dict — confirm against the class.
    dkwargs = {'spelling_name': spelling_name, 'transcription_name': transcription_name,
               'name':data.name, 'wav_path':data.wav_path}
    d = Discourse(dkwargs)
    ind = 0  # fallback running index used when no time alignment is present
    if lexicon is None:
        lexicon = d.lexicon #despite the name, this is a Corpus object

    # Token-level attributes belong to the discourse, type-level ones to
    # the lexicon.
    for v in attribute_mapping.values():  # mapping keys were unused here
        a = data[v.name]

        if a.token and v not in d.attributes:
            d.add_attribute(v, initialize_defaults = True)

        # Fix: the test previously used d.lexicon.attributes even when a
        # different ``lexicon`` was passed in; test the object we add to.
        if not a.token and v not in lexicon.attributes:
            lexicon.add_attribute(v, initialize_defaults = True)

    if call_back is not None:
        call_back('Processing data...')
        cur = 0

    for level in data.word_levels:
        #word_levels is a list of spelling tiers, usually of length 1
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            call_back(cur)
        for s in data[level]:  # enumerate index was unused
            if not s.label:
                continue  # skip unlabelled segments
            word_kwargs = {level:(attribute_mapping[level], s.label)}
            word_token_kwargs = {}
            if s.token:
                for token_key, token_value in s.token.items():
                    att = attribute_mapping[token_key]
                    word_token_kwargs[att.name] = (att, token_value)
            if s.additional is not None:
                for add_key, add_value in s.additional.items():
                    att = attribute_mapping[add_key]
                    # Token-level extras go on the token, type-level on the word.
                    if data[add_key].token:
                        word_token_kwargs[att.name] = (att, add_value)
                    else:
                        word_kwargs[att.name] = (att, add_value)
            for j, r in enumerate(s.references):
                if r in data and len(data[r]) > 0:
                    seq = data[r][s.begins[j]:s.ends[j]]
                    att = attribute_mapping[r]
                    if data[r].token:
                        word_token_kwargs[att.name] = (att, seq)
                        # Prefer real time alignment from the referenced
                        # sequence when it carries begin/end times.
                        if len(seq) > 0 and seq[0].begin is not None:
                            word_token_kwargs['begin'] = seq[0].begin
                            word_token_kwargs['end'] = seq[-1].end
                    else:
                        word_kwargs[att.name] = (att, seq)

            word = lexicon.get_or_create_word(**word_kwargs)
            word_token_kwargs['word'] = word
            if 'begin' not in word_token_kwargs:
                # No alignment found: fall back to the running word index.
                word_token_kwargs['begin'] = ind
                word_token_kwargs['end'] = ind + 1
            wordtoken = WordToken(**word_token_kwargs)
            word.frequency += 1
            word.wordtokens.append(wordtoken)
            d.add_word(wordtoken)
            ind += 1
    return d