def test_init():
    """Smoke test: a Discourse accepts WordTokens built around Word objects."""
    specs = [
        (0, 1, 'a', ['a', 'b']),
        (1, 2, 'c', ['c', 'a', 'b']),
        (2, 3, 'a', ['a', 'b']),
        (3, 4, 'd', ['a', 'd']),
    ]
    discourse = Discourse()
    for begin, end, spelling, transcription in specs:
        wordtype = Word(spelling=spelling, transcription=transcription)
        discourse.add_word(WordToken(begin=begin, end=end, word=wordtype))
def data_to_discourse(data, lexicon=None):
    """Build a Discourse (and populate a lexicon) from a parsed data object.

    Parameters
    ----------
    data : DiscourseData-like
        Parsed corpus data exposing ``mapping()``, ``word_levels``,
        ``name`` and ``wav_path``.  # assumes the PCT DiscourseData API — TODO confirm
    lexicon : Corpus, optional
        Corpus to receive word types; defaults to the new Discourse's own lexicon.

    Returns
    -------
    Discourse
        Discourse populated with one WordToken per word-level annotation.
    """
    attribute_mapping = data.mapping()
    d = Discourse(name=data.name, wav_path=data.wav_path)
    ind = 0  # fallback begin/end counter when no timing information exists
    if lexicon is None:
        lexicon = d.lexicon
    for k, v in attribute_mapping.items():
        a = data[k]
        if a.token and v not in d.attributes:
            d.add_attribute(v, initialize_defaults=True)
        # BUG FIX: previously tested membership on d.lexicon.attributes while
        # adding to the (possibly caller-supplied) `lexicon`, so an external
        # lexicon could receive duplicate attributes; test the object we add to.
        if not a.token and v not in lexicon.attributes:
            lexicon.add_attribute(v, initialize_defaults=True)
    for level in data.word_levels:
        for i, s in enumerate(data[level]):
            word_kwargs = {'spelling': (attribute_mapping[level], s.label)}
            word_token_kwargs = {}
            if s.token is not None:
                for token_key, token_value in s.token.items():
                    att = attribute_mapping[token_key]
                    word_token_kwargs[att.name] = (att, token_value)
            if s.additional is not None:
                for add_key, add_value in s.additional.items():
                    att = attribute_mapping[add_key]
                    # token-level properties go on the WordToken, type-level on the Word
                    if data[add_key].token:
                        word_token_kwargs[att.name] = (att, add_value)
                    else:
                        word_kwargs[att.name] = (att, add_value)
            for j, r in enumerate(s.references):
                if r in data and len(data[r]) > 0:
                    seq = data[r][s.begins[j]:s.ends[j]]
                    att = attribute_mapping[r]
                    if data[r].token:
                        word_token_kwargs[att.name] = (att, seq)
                        # use real timing from the referenced segments when available
                        if len(seq) > 0 and seq[0].begin is not None:
                            word_token_kwargs['begin'] = seq[0].begin
                            word_token_kwargs['end'] = seq[-1].end
                    else:
                        word_kwargs[att.name] = (att, seq)
            word = lexicon.get_or_create_word(**word_kwargs)
            word_token_kwargs['word'] = word
            if 'begin' not in word_token_kwargs:
                # no timing info: fall back to the running token index
                word_token_kwargs['begin'] = ind
                word_token_kwargs['end'] = ind + 1
            wordtoken = WordToken(**word_token_kwargs)
            word.frequency += 1
            word.wordtokens.append(wordtoken)
            d.add_word(wordtoken)
            ind += 1
    return d
def test_init():
    """Word-type frequencies must aggregate across tokens after corpus load."""
    token_specs = [
        dict(begin=0, end=1, word={'spelling': 'a', 'transcription': ['a', 'b']},
             following_token_time=1),
        dict(begin=1, end=2, word={'spelling': 'c', 'transcription': ['c', 'a', 'b']},
             previous_token_time=0, following_token_time=2),
        dict(begin=2, end=3, word={'spelling': 'a', 'transcription': ['a', 'b']},
             previous_token_time=1, following_token_time=3),
        dict(begin=3, end=4, word={'spelling': 'd', 'transcription': ['a', 'd']},
             previous_token_time=2),
    ]
    discourse = Discourse()
    for spec in token_specs:
        wordtype = discourse.lexicon.get_or_create_word(**spec.pop('word'))
        wordtype.frequency += 1
        discourse.add_word(WordToken(word=wordtype, **spec))
    corpus = SpontaneousSpeechCorpus('', '')
    corpus.add_discourse(discourse)
    loaded = corpus.discourses['']
    assert loaded[0].wordtype.frequency == 2
    assert loaded[1].wordtype.frequency == 1
def test_init():
    """Smoke test: WordTokens constructed from explicit Word instances load cleanly."""
    entries = (
        ('a', ['a', 'b']),
        ('c', ['c', 'a', 'b']),
        ('a', ['a', 'b']),
        ('d', ['a', 'd']),
    )
    d = Discourse()
    for start, (spelling, transcription) in enumerate(entries):
        token = WordToken(begin=start,
                          end=start + 1,
                          word=Word(spelling=spelling, transcription=transcription))
        d.add_word(token)
def test_init():
    """Token counts should roll up into word-type frequency after corpus insertion."""
    raw_tokens = [
        {'begin': 0, 'end': 1,
         'word': {'spelling': 'a', 'transcription': ['a', 'b']},
         'following_token_time': 1},
        {'begin': 1, 'end': 2,
         'word': {'spelling': 'c', 'transcription': ['c', 'a', 'b']},
         'previous_token_time': 0, 'following_token_time': 2},
        {'begin': 2, 'end': 3,
         'word': {'spelling': 'a', 'transcription': ['a', 'b']},
         'previous_token_time': 1, 'following_token_time': 3},
        {'begin': 3, 'end': 4,
         'word': {'spelling': 'd', 'transcription': ['a', 'd']},
         'previous_token_time': 2},
    ]
    discourse = Discourse()
    for kwargs in raw_tokens:
        # replace the raw dict with a real Word before building the token
        wordtype = discourse.lexicon.get_or_create_word(**kwargs['word'])
        wordtype.frequency += 1
        kwargs['word'] = wordtype
        discourse.add_word(WordToken(**kwargs))
    corpus = SpontaneousSpeechCorpus('', '')
    corpus.add_discourse(discourse)
    reloaded = corpus.discourses['']
    assert reloaded[0].wordtype.frequency == 2
    assert reloaded[1].wordtype.frequency == 1
def data_to_discourse(data, lexicon = None):
    """Construct a Discourse from parsed data, registering word types in a lexicon.

    Parameters
    ----------
    data : DiscourseData-like
        Parsed corpus data exposing ``mapping()``, ``word_levels``,
        ``name`` and ``wav_path``.  # assumes the PCT DiscourseData API — TODO confirm
    lexicon : Corpus, optional
        Destination for word types; defaults to the new Discourse's lexicon.

    Returns
    -------
    Discourse
        Discourse containing one WordToken per word-level annotation.
    """
    attribute_mapping = data.mapping()
    d = Discourse(name = data.name, wav_path = data.wav_path)
    ind = 0  # running index used as fake begin/end when no timing is present
    if lexicon is None:
        lexicon = d.lexicon
    for k, v in attribute_mapping.items():
        a = data[k]
        if a.token and v not in d.attributes:
            d.add_attribute(v, initialize_defaults = True)
        # BUG FIX: membership was checked on d.lexicon.attributes while the
        # attribute was added to `lexicon` (possibly caller-supplied), allowing
        # duplicate attributes on an external lexicon; check the same object.
        if not a.token and v not in lexicon.attributes:
            lexicon.add_attribute(v, initialize_defaults = True)
    for level in data.word_levels:
        for i, s in enumerate(data[level]):
            word_kwargs = {'spelling': (attribute_mapping[level], s.label)}
            word_token_kwargs = {}
            if s.token is not None:
                for token_key, token_value in s.token.items():
                    att = attribute_mapping[token_key]
                    word_token_kwargs[att.name] = (att, token_value)
            if s.additional is not None:
                for add_key, add_value in s.additional.items():
                    att = attribute_mapping[add_key]
                    # token-level values go on the WordToken, type-level on the Word
                    if data[add_key].token:
                        word_token_kwargs[att.name] = (att, add_value)
                    else:
                        word_kwargs[att.name] = (att, add_value)
            for j, r in enumerate(s.references):
                if r in data and len(data[r]) > 0:
                    seq = data[r][s.begins[j]:s.ends[j]]
                    att = attribute_mapping[r]
                    if data[r].token:
                        word_token_kwargs[att.name] = (att, seq)
                        # adopt real timing from the referenced segment span
                        if len(seq) > 0 and seq[0].begin is not None:
                            word_token_kwargs['begin'] = seq[0].begin
                            word_token_kwargs['end'] = seq[-1].end
                    else:
                        word_kwargs[att.name] = (att, seq)
            word = lexicon.get_or_create_word(**word_kwargs)
            word_token_kwargs['word'] = word
            if 'begin' not in word_token_kwargs:
                word_token_kwargs['begin'] = ind
                word_token_kwargs['end'] = ind + 1
            wordtoken = WordToken(**word_token_kwargs)
            word.frequency += 1
            word.wordtokens.append(wordtoken)
            d.add_word(wordtoken)
            ind += 1
    return d
def data_to_discourse(data, lexicon = None, call_back=None, stop_check=None):
    """Construct a Discourse from parsed data, with optional progress reporting.

    Parameters
    ----------
    data : DiscourseData-like
        Parsed corpus data exposing ``mapping()``, ``word_levels``,
        ``name`` and ``wav_path``.  # assumes the PCT DiscourseData API — TODO confirm
    lexicon : Corpus, optional
        Destination for word types; defaults to the new Discourse's lexicon.
    call_back : callable or None
        Optional progress callback; called with a message then step counts.
    stop_check : callable or None
        Optional early-termination check; when it returns truthy, this
        function returns None part-way through.

    Returns
    -------
    Discourse or None
        Populated Discourse, or None if stop_check triggered.
    """
    attribute_mapping = data.mapping()
    spelling_name, transcription_name = None, None
    for name, value in attribute_mapping.items():
        if value.att_type == 'spelling' and value.is_default:
            spelling_name = name
        elif value.att_type == 'tier' and value.is_default:
            transcription_name = name
    dkwargs = {'spelling_name': spelling_name,
               'transcription_name': transcription_name,
               'name': data.name,
               'wav_path': data.wav_path}
    d = Discourse(dkwargs)
    ind = 0  # running index used as fake begin/end when no timing is present
    if lexicon is None:
        lexicon = d.lexicon  # despite the name, this is a Corpus object
    for k, v in attribute_mapping.items():
        a = data[v.name]
        if a.token and v not in d.attributes:
            d.add_attribute(v, initialize_defaults = True)
        # BUG FIX: membership was checked on d.lexicon.attributes while the
        # attribute was added to `lexicon` (possibly caller-supplied), allowing
        # duplicate attributes on an external lexicon; check the same object.
        if not a.token and v not in lexicon.attributes:
            lexicon.add_attribute(v, initialize_defaults = True)
    cur = 0
    if call_back is not None:
        call_back('Processing data...')
    for level in data.word_levels:
        # word_levels is a list of spelling tiers, usually of length 1
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            call_back(cur)
        for i, s in enumerate(data[level]):
            if not s.label:
                continue
            word_kwargs = {level: (attribute_mapping[level], s.label)}
            word_token_kwargs = {}
            if s.token:
                for token_key, token_value in s.token.items():
                    att = attribute_mapping[token_key]
                    word_token_kwargs[att.name] = (att, token_value)
            if s.additional is not None:
                for add_key, add_value in s.additional.items():
                    att = attribute_mapping[add_key]
                    # token-level values go on the WordToken, type-level on the Word
                    if data[add_key].token:
                        word_token_kwargs[att.name] = (att, add_value)
                    else:
                        word_kwargs[att.name] = (att, add_value)
            for j, r in enumerate(s.references):
                if r in data and len(data[r]) > 0:
                    seq = data[r][s.begins[j]:s.ends[j]]
                    att = attribute_mapping[r]
                    if data[r].token:
                        word_token_kwargs[att.name] = (att, seq)
                        # adopt real timing from the referenced segment span
                        if len(seq) > 0 and seq[0].begin is not None:
                            word_token_kwargs['begin'] = seq[0].begin
                            word_token_kwargs['end'] = seq[-1].end
                    else:
                        word_kwargs[att.name] = (att, seq)
            word = lexicon.get_or_create_word(**word_kwargs)
            word_token_kwargs['word'] = word
            if 'begin' not in word_token_kwargs:
                word_token_kwargs['begin'] = ind
                word_token_kwargs['end'] = ind + 1
            wordtoken = WordToken(**word_token_kwargs)
            word.frequency += 1
            word.wordtokens.append(wordtoken)
            d.add_word(wordtoken)
            ind += 1
    return d
def data_to_discourse2(corpus_name=None, wav_path=None, annotation_types=None, support_corpus=None, ignore_case=False, call_back=None, stop_check=None):
    """Assemble a Discourse from AnnotationType tiers.

    Groups each AnnotationType's items into (value, begin, end) triples,
    optionally fills in transcriptions from a support corpus, then builds
    one Word/WordToken pair per aligned annotation index.

    Parameters
    ----------
    corpus_name : str, optional
        Name for the resulting Discourse.
    wav_path : str, optional
        Path to an associated sound file.
    annotation_types : list of AnnotationType, optional
        Tiers to process; each ``_list`` must be all Annotations (spellings)
        or all BaseAnnotations (transcription segments).
    support_corpus : Corpus, optional
        Corpus used to look up transcriptions for spellings.
    ignore_case : bool
        Passed through to support_corpus lookups.
    call_back : callable or None
        Optional progress callback.
    stop_check : callable or None
        Optional early-termination check; returns None when triggered.

    Returns
    -------
    Discourse or None
        Populated Discourse, or None if stop_check triggered.
    """
    curr_word = list()  # accumulates BaseAnnotations until a segment with an end time closes the word
    annotations = {at: list() for at in annotation_types}
    spelling_name, transcription_name = None, None
    if call_back is not None:
        call_back('Processing data...')
        cur = 0
    for at in annotation_types:
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            call_back(cur)
        if all(isinstance(item, Annotation) for item in at._list):
            # it's a list of spellings, take each one and add it to the overall annotations list
            for item in at._list:
                if item.label:
                    annotations[at].append((item.label, None, None))
        elif all(type(item) == BaseAnnotation for item in at._list):
            # it's a list of transcriptions, with each segment as a BaseAnnotation
            # NOTE(review): `begin` is only set when a segment carries a begin time;
            # if the first word's segments all have begin None this raises NameError — confirm upstream guarantees
            for item in at._list:
                if item.begin is not None:
                    begin = item.begin
                if item.end is None:
                    curr_word.append(item)
                elif item.end is not None:
                    # a segment with an end time closes the current word
                    end = item.end
                    curr_word.append(item)
                    curr_word = Transcription(curr_word)
                    annotations[at].append((curr_word, begin, end))
                    curr_word = list()
        else:
            print(at._list)
            raise TypeError("AnnotationType._list cannot contain a mix of Annotations and BaseAnnotations")
    if support_corpus is not None:
        # look up (or fall back to letter-by-letter) transcriptions for each spelling
        spellings = [value for key, value in annotations.items() if key.name == 'Orthography (default)'][0]
        # NOTE(review): this matches name 'Transcription' while the kwargs loop below
        # matches 'Transcription (default)' — verify both names occur as intended
        transcriptions = [key for key in annotations if key.name == 'Transcription'][0]
        for index, info in enumerate(spellings):
            spelling = info[0]  # info[1] is the start time, info[2] is the end time (or else None)
            try:
                transcription = support_corpus.find(spelling, ignore_case=ignore_case).transcription
            except KeyError:
                try:
                    # retry with punctuation stripped before giving up
                    no_punctuation = ''.join([x for x in spelling if not x in string.punctuation])
                    transcription = support_corpus.find(no_punctuation, ignore_case=ignore_case).transcription
                except KeyError:
                    # last resort: treat each character of the spelling as a segment
                    transcription = Transcription([symbol for symbol in spelling])
            annotations[transcriptions].append((transcription, index, index + 1))
    discourse_kwargs = {'name': corpus_name, 'wav_path': wav_path, 'other_attributes': list()}
    for at in annotation_types:
        if at.name == 'Orthography (default)':
            discourse_kwargs['spelling_name'] = at.attribute
        elif at.name == 'Transcription (default)':
            discourse_kwargs['transcription_name'] = at.attribute
        elif at.name == 'Other (character)' or at.attribute.att_type in ('tier', 'spelling'):
            discourse_kwargs['other_attributes'].append(at.attribute)
    # fall back to default Attribute objects when no default tiers were supplied
    if 'spelling_name' not in discourse_kwargs:
        discourse_kwargs['spelling_name'] = Attribute('Spelling', 'spelling', 'Spelling')
    if 'transcription_name' not in discourse_kwargs:
        discourse_kwargs['transcription_name'] = Attribute('Transcription', 'tier', 'Transcription')
    if stop_check is not None and stop_check():
        return
    if call_back is not None:
        cur += 1
        call_back(cur)
    discourse = Discourse(discourse_kwargs)
    if not 'Frequency' in [a.name for a in discourse.lexicon.attributes]:
        # running text will not have a frequency attribute supplied by the user
        # textgrids are also unlikely to have this attribute
        discourse.lexicon.add_attribute(Attribute('frequency', 'numeric', 'Frequency'))
        add_frequency = True
    else:
        add_frequency = False
    ind = 0  # running index used as fake begin/end when no timing is present
    limit = max([len(list(v)) for v in annotations.values()])
    for n in range(limit):
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            call_back(cur)
        word_kwargs = dict()
        for at in annotations:
            if at.token or at.ignored:
                continue
            else:
                try:
                    word_kwargs[at.attribute.name] = (at.attribute, annotations[at][n][0])
                    # annotations[at][n] should be a tuple of (curr_word, begin, end) or (item_label, None, None)
                except IndexError:
                    word_kwargs[at.attribute.name] = (at.attribute, None)
        word = Word(**word_kwargs)
        try:
            # reuse an existing word type; only count frequency if we added the attribute
            word = discourse.lexicon.find(word.spelling)
            if add_frequency:
                word.frequency += 1
        except KeyError:
            discourse.lexicon.add_word(word)
        word_token_kwargs = dict()
        word_token_kwargs['word'] = word
        begin, end = None, None
        for at in annotations:
            if at.ignored:
                continue
            try:
                word_token_kwargs[at.attribute.name] = (at.attribute, annotations[at][n][0])
                # annotations[at][n] should be a tuple of (curr_word, begin, end) or (item_label, None, None)
            except IndexError:
                word_token_kwargs[at.attribute.name] = (at.attribute, None)
            if at.attribute.att_type == 'tier':
                if at.attribute.is_default:
                    begin = annotations[at][n][1]
                    end = annotations[at][n][2]
                    word_token_kwargs['begin'] = begin if begin is not None else ind
                    word_token_kwargs['end'] = end if end is not None else ind + 1
                if at.token:
                    # token tiers also store the raw transcription under a private key
                    word_token_kwargs['_transcription'] = (at.attribute, annotations[at][n][0])
                    word_token_kwargs['begin'] = begin if begin is not None else ind
                    word_token_kwargs['end'] = end if end is not None else ind + 1
        word_token = WordToken(**word_token_kwargs)
        discourse.add_word(word_token)
        if any(a.token for a in annotations):
            word.wordtokens.append(word_token)
        ind += 1
    return discourse
def load_discourse_multiple_files(corpus_name, word_path, phone_path, dialect, annotation_types=None, lexicon=None, feature_system_path=None, stop_check=None, call_back=None):
    """
    Load a discourse from a text file containing interlinear glosses

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    word_path : str
        Full path to words text file
    phone_path : str
        Full path to phones text file
    dialect : str
        Currently, only 'buckeye'
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse the glosses.
        Auto-generated based on dialect.
    lexicon : Corpus, optional
        Corpus to store Discourse word information
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the text file
    """
    # NOTE(review): corpus_name, phone_path, lexicon, stop_check and call_back are
    # accepted but never used in this body — confirm whether they are reserved for
    # a future implementation or dead parameters.
    name = os.path.splitext(os.path.split(word_path)[1])[0]
    discourse_kwargs = {'name': name, 'wav_path': find_wav_path(word_path), 'other_attributes': list()}
    for at in annotation_types:
        if at.name == 'Orthography (default)':
            discourse_kwargs['spelling_name'] = at.attribute  #.output_name
        elif at.name == 'Transcription (default)':
            discourse_kwargs['transcription_name'] = at.attribute  #.output_name
        elif at.name == 'Other (character)' or at.attribute.att_type in ('tier', 'spelling'):
            discourse_kwargs['other_attributes'].append(at.attribute)
    discourse = Discourse(discourse_kwargs)
    words = read_words(word_path, dialect)
    ind = 0  # running index used as fake begin/end when timing is missing
    for w in words:
        word_kwargs = {at.output_name: (at.attribute, w[at.output_name]) for at in annotation_types}
        word = Word(**word_kwargs)
        word_token_kwargs = dict()
        for at in annotation_types:
            if at.ignored:
                continue
            word_token_kwargs[at.output_name] = (at.attribute, w[at.output_name])
            word_token_kwargs['word'] = word  # reassigned each iteration; harmless but redundant
            if at.attribute.att_type == 'tier':
                if at.attribute.is_default:
                    begin = w['begin']
                    end = w['end']
                    word_token_kwargs['begin'] = begin if begin is not None else ind
                    word_token_kwargs['end'] = end if end is not None else ind + 1
                if at.token:
                    # token tiers also store the raw transcription under a private key;
                    # NOTE(review): relies on `begin`/`end` set by a default tier earlier
                    # in this loop (or a previous word) — confirm a default tier always exists
                    word_token_kwargs['_transcription'] = (at.attribute, w['transcription'])
        word_token = WordToken(**word_token_kwargs)
        word.wordtokens.append(word_token)
        discourse.lexicon.add_word(word)
        discourse.add_word(word_token)
        ind += 1
    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(feature_matrix)
        # upgrade older pickled specifiers to the current format
        discourse.lexicon.specifier = modernize.modernize_specifier(discourse.lexicon.specifier)
    return discourse
def load_discourse_multiple_files(corpus_name, word_path, phone_path, dialect, annotation_types = None, lexicon = None, feature_system_path = None, stop_check = None, call_back = None):
    """
    Load a discourse from a text file containing interlinear glosses

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    word_path : str
        Full path to words text file
    phone_path : str
        Full path to phones text file
    dialect : str
        Currently, only 'buckeye'
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse the glosses.
        Auto-generated based on dialect.
    lexicon : Corpus, optional
        Corpus to store Discourse word information
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the text file
    """
    base_name = os.path.splitext(os.path.basename(word_path))[0]
    discourse_kwargs = {'name': base_name,
                        'wav_path': find_wav_path(word_path),
                        'other_attributes': list()}
    for ann_type in annotation_types:
        if ann_type.name == 'Orthography (default)':
            discourse_kwargs['spelling_name'] = ann_type.attribute
        elif ann_type.name == 'Transcription (default)':
            discourse_kwargs['transcription_name'] = ann_type.attribute
        elif ann_type.name == 'Other (character)' or ann_type.attribute.att_type in ('tier', 'spelling'):
            discourse_kwargs['other_attributes'].append(ann_type.attribute)
    discourse = Discourse(discourse_kwargs)
    for token_index, row in enumerate(read_words(word_path, dialect)):
        word = Word(**{ann_type.output_name: (ann_type.attribute, row[ann_type.output_name])
                       for ann_type in annotation_types})
        token_kwargs = dict()
        for ann_type in annotation_types:
            if ann_type.ignored:
                continue
            token_kwargs[ann_type.output_name] = (ann_type.attribute, row[ann_type.output_name])
            token_kwargs['word'] = word
            if ann_type.attribute.att_type != 'tier':
                continue
            if ann_type.attribute.is_default:
                start, stop = row['begin'], row['end']
                token_kwargs['begin'] = start if start is not None else token_index
                token_kwargs['end'] = stop if stop is not None else token_index + 1
            if ann_type.token:
                token_kwargs['_transcription'] = (ann_type.attribute, row['transcription'])
        token = WordToken(**token_kwargs)
        word.wordtokens.append(token)
        discourse.lexicon.add_word(word)
        discourse.add_word(token)
    if feature_system_path is not None:
        discourse.lexicon.set_feature_matrix(load_binary(feature_system_path))
        discourse.lexicon.specifier = modernize.modernize_specifier(discourse.lexicon.specifier)
    return discourse