コード例 #1
0
def load_directory_ilg(corpus_name, path, annotation_types,
                        feature_system_path = None,
                        stop_check = None, call_back = None):
    """
    Loads a directory of interlinear gloss text files

    Parameters
    ----------
    corpus_name : str
        Name of corpus
    path : str
        Path to directory of text files
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the glosses.
        Can be generated through ``inspect_discourse_ilg``.
    feature_system_path : str, optional
        File path of FeatureMatrix binary to specify segments
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    SpontaneousSpeechCorpus
        Corpus containing Discourses corresponding to the text files
    """
    if call_back is not None:
        call_back('Finding  files...')
        call_back(0, 0)
    # Collect every .txt file (case-insensitive) anywhere under ``path``.
    file_tuples = []
    for root, subdirs, files in os.walk(path):
        for filename in files:
            if not filename.lower().endswith('.txt'):
                continue
            file_tuples.append((root, filename))
    if call_back is not None:
        call_back('Parsing files...')
        call_back(0,len(file_tuples))
    corpus = SpontaneousSpeechCorpus(corpus_name, path)
    for i, t in enumerate(file_tuples):
        # Allow graceful early termination between files.
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            call_back('Parsing file {} of {}...'.format(i+1,len(file_tuples)))
            call_back(i)
        root, filename = t
        # Discourse name is the file name without its extension.
        name = os.path.splitext(filename)[0]
        d = load_discourse_ilg(name, os.path.join(root,filename),
                                    annotation_types, corpus.lexicon,
                                    None,
                                    stop_check, call_back)
        corpus.add_discourse(d)

    # Apply the feature system once at the end, after all lexicon entries exist.
    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        corpus.lexicon.set_feature_matrix(feature_matrix)
        # Upgrade older serialized specifiers to the current format.
        corpus.lexicon.specifier = modernize.modernize_specifier(corpus.lexicon.specifier)
    return corpus
コード例 #2
0
def load_directory_ilg(corpus_name, path, annotation_types,
                        feature_system_path = None,
                        stop_check = None, call_back = None):
    """
    Loads a directory of interlinear gloss text files

    Parameters
    ----------
    corpus_name : str
        Name of corpus
    path : str
        Path to directory of text files
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the glosses.
        Can be generated through ``inspect_discourse_ilg``.
    feature_system_path : str, optional
        File path of FeatureMatrix binary to specify segments
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    SpontaneousSpeechCorpus
        Corpus containing Discourses corresponding to the text files
    """
    if call_back is not None:
        call_back('Finding  files...')
        call_back(0, 0)
    # Gather every .txt file (case-insensitive) in the directory tree.
    file_tuples = []
    for root, subdirs, files in os.walk(path):
        for filename in files:
            if not filename.lower().endswith('.txt'):
                continue
            file_tuples.append((root, filename))
    if call_back is not None:
        call_back('Parsing files...')
        call_back(0,len(file_tuples))
    corpus = SpontaneousSpeechCorpus(corpus_name, path)
    for i, t in enumerate(file_tuples):
        # Check for cancellation between files.
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            call_back('Parsing file {} of {}...'.format(i+1,len(file_tuples)))
            call_back(i)
        root, filename = t
        # The discourse is named after the file (extension stripped).
        name = os.path.splitext(filename)[0]
        d = load_discourse_ilg(name, os.path.join(root,filename),
                                    annotation_types, corpus.lexicon,
                                    None,
                                    stop_check, call_back)
        corpus.add_discourse(d)

    # Attach the feature system last so it covers the full lexicon.
    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        corpus.lexicon.set_feature_matrix(feature_matrix)
    return corpus
コード例 #3
0
def load_directory_spelling(corpus_name, path, annotation_types = None,
                            support_corpus_path = None, ignore_case = False,
                            stop_check = None, call_back = None):
    """
    Loads a directory of orthographic texts

    Parameters
    ----------
    corpus_name : str
        Name of corpus
    path : str
        Path to directory of text files
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    support_corpus_path : str, optional
        File path of corpus binary to load transcriptions from
    ignore_case : bool, optional
        Specifies whether lookups in the support corpus should ignore case
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    SpontaneousSpeechCorpus
        Corpus containing Discourses corresponding to the text files
    """
    if call_back is not None:
        call_back('Finding  files...')
        call_back(0, 0)
    # Collect every .txt file (case-insensitive) under ``path``.
    file_tuples = []
    for root, subdirs, files in os.walk(path):
        for filename in files:
            if not filename.lower().endswith('.txt'):
                continue
            file_tuples.append((root, filename))

    if call_back is not None:
        call_back('Parsing files...')
        call_back(0,len(file_tuples))
    corpus = SpontaneousSpeechCorpus(corpus_name, path)
    for i, t in enumerate(file_tuples):
        # Allow cancellation between files.
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            call_back('Parsing file {} of {}...'.format(i+1, len(file_tuples)))
            call_back(i)
        root, filename = t
        # Discourse name is the file name minus its extension.
        name = os.path.splitext(filename)[0]
        d = load_discourse_spelling(name, os.path.join(root,filename),
                                    annotation_types, corpus.lexicon,
                                    support_corpus_path, ignore_case,
                                    stop_check, call_back)
        corpus.add_discourse(d)
    return corpus
コード例 #4
0
def test_init():
    """Word-type frequencies survive adding a discourse to a corpus."""
    # (begin, end, spelling, transcription, previous_time, following_time);
    # None means the token has no neighbor on that side.
    token_specs = [
        (0, 1, 'a', ['a', 'b'], None, 1),
        (1, 2, 'c', ['c', 'a', 'b'], 0, 2),
        (2, 3, 'a', ['a', 'b'], 1, 3),
        (3, 4, 'd', ['a', 'd'], 2, None),
    ]
    discourse = Discourse()
    for begin, end, spelling, transcription, prev_time, next_time in token_specs:
        word = discourse.lexicon.get_or_create_word(
            spelling=spelling, transcription=transcription)
        word.frequency += 1
        kwargs = {'begin': begin, 'end': end, 'word': word}
        if prev_time is not None:
            kwargs['previous_token_time'] = prev_time
        if next_time is not None:
            kwargs['following_token_time'] = next_time
        discourse.add_word(WordToken(**kwargs))

    corpus = SpontaneousSpeechCorpus('', '')
    corpus.add_discourse(discourse)

    stored = corpus.discourses['']

    # 'a' occurs twice, 'c' once.
    assert stored[0].wordtype.frequency == 2
    assert stored[1].wordtype.frequency == 1
コード例 #5
0
def test_init():
    """Check that word frequencies carry over into a SpontaneousSpeechCorpus."""
    specs = [
        dict(begin=0, end=1, word=dict(spelling='a', transcription=['a', 'b']),
             following_token_time=1),
        dict(begin=1, end=2, word=dict(spelling='c', transcription=['c', 'a', 'b']),
             previous_token_time=0, following_token_time=2),
        dict(begin=2, end=3, word=dict(spelling='a', transcription=['a', 'b']),
             previous_token_time=1, following_token_time=3),
        dict(begin=3, end=4, word=dict(spelling='d', transcription=['a', 'd']),
             previous_token_time=2),
    ]
    discourse = Discourse()
    for spec in specs:
        # Replace the raw word spec with the lexicon's Word object.
        word = discourse.lexicon.get_or_create_word(**spec.pop('word'))
        word.frequency += 1
        discourse.add_word(WordToken(word=word, **spec))

    corpus = SpontaneousSpeechCorpus('', '')
    corpus.add_discourse(discourse)

    loaded = corpus.discourses['']

    # 'a' appears twice in the discourse; 'c' appears once.
    assert loaded[0].wordtype.frequency == 2
    assert loaded[1].wordtype.frequency == 1
コード例 #6
0
def load_directory_multiple_files(corpus_name,
                                  path,
                                  dialect,
                                  annotation_types=None,
                                  feature_system_path=None,
                                  stop_check=None,
                                  call_back=None):
    """
    Loads a directory of corpus standard files (separated into words files
    and phones files)

    Parameters
    ----------
    corpus_name : str
        Name of corpus
    path : str
        Path to directory of text files
    dialect : str
        Currently only 'buckeye'
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse the glosses.
        Auto-generated based on dialect.
    feature_system_path : str, optional
        File path of FeatureMatrix binary to specify segments
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    SpontaneousSpeechCorpus
        Corpus containing Discourses corresponding to the text files
    """
    if call_back is not None:
        call_back('Finding  files...')
        call_back(0, 0)
    file_tuples = []

    # Collect all word files (.words for Buckeye, .wrd for TIMIT-style).
    for root, subdirs, files in os.walk(path):
        for filename in files:
            if stop_check is not None and stop_check():
                return
            if not (filename.lower().endswith('.words')
                    or filename.lower().endswith('.wrd')):
                continue
            file_tuples.append((root, filename))
    if call_back is not None:
        call_back('Parsing files...')
        call_back(0, len(file_tuples))
    corpus = SpontaneousSpeechCorpus(corpus_name, path)
    for i, t in enumerate(file_tuples):
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            call_back('Parsing file {} of {}...'.format(
                i + 1, len(file_tuples)))
            call_back(i)
        root, filename = t
        name, ext = os.path.splitext(filename)
        # Pair each word file with its phone file: .words -> .phones,
        # otherwise (.wrd) -> .phn.  Without the else branch, phone_ext
        # would be unbound (or stale) for .wrd files.
        if ext == '.words':
            phone_ext = '.phones'
        else:
            phone_ext = '.phn'
        word_path = os.path.join(root, filename)
        phone_path = os.path.splitext(word_path)[0] + phone_ext
        try:
            d = load_discourse_multiple_files(name, word_path, phone_path,
                                              dialect, annotation_types,
                                              corpus.lexicon,
                                              feature_system_path, stop_check,
                                              None)
            corpus.add_discourse(d)
        except ValueError:
            # Best-effort loading: skip malformed participants but keep going.
            print('Error importing for participant ' + name)

    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        corpus.lexicon.set_feature_matrix(feature_matrix)
        # Upgrade older serialized specifiers to the current format.
        corpus.lexicon.specifier = modernize.modernize_specifier(
            corpus.lexicon.specifier)

    return corpus
コード例 #7
0
def load_directory_spelling(corpus_name,
                            path,
                            annotation_types=None,
                            support_corpus_path=None,
                            ignore_case=False,
                            stop_check=None,
                            call_back=None):
    """
    Loads a directory of orthographic texts

    Parameters
    ----------
    corpus_name : str
        Name of corpus
    path : str
        Path to directory of text files
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    support_corpus_path : str, optional
        File path of corpus binary to load transcriptions from
    ignore_case : bool, optional
        Specifies whether lookups in the support corpus should ignore case
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    SpontaneousSpeechCorpus
        Corpus containing Discourses corresponding to the text files
    """
    if call_back is not None:
        call_back('Finding  files...')
        call_back(0, 0)
    # Collect every .txt file (case-insensitive) under ``path``.
    file_tuples = []
    for root, subdirs, files in os.walk(path):
        for filename in files:
            if not filename.lower().endswith('.txt'):
                continue
            file_tuples.append((root, filename))

    if call_back is not None:
        call_back('Parsing files...')
        call_back(0, len(file_tuples))
    corpus = SpontaneousSpeechCorpus(corpus_name, path)
    for i, t in enumerate(file_tuples):
        # Allow graceful cancellation between files.
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            call_back('Parsing file {} of {}...'.format(
                i + 1, len(file_tuples)))
            call_back(i)
        root, filename = t
        # Discourse name is the file name without its extension.
        name = os.path.splitext(filename)[0]
        d = load_discourse_spelling(name, os.path.join(root, filename),
                                    annotation_types, corpus.lexicon,
                                    support_corpus_path, ignore_case,
                                    stop_check, call_back)
        corpus.add_discourse(d)
    return corpus
コード例 #8
0
def load_directory_multiple_files(corpus_name, path, dialect,
                                    annotation_types = None,
                                    feature_system_path = None,
                                    stop_check = None, call_back = None):
    """
    Loads a directory of corpus standard files (separated into words files
    and phones files)

    Parameters
    ----------
    corpus_name : str
        Name of corpus
    path : str
        Path to directory of text files
    dialect : str
        One of 'buckeye' or 'timit'
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse the glosses.
        Auto-generated based on dialect.
    feature_system_path : str, optional
        File path of FeatureMatrix binary to specify segments
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    SpontaneousSpeechCorpus
        Corpus containing Discourses corresponding to the text files
    """
    if call_back is not None:
        call_back('Finding  files...')
        call_back(0, 0)
    # Collect all word files (.words for Buckeye, .wrd for TIMIT).
    file_tuples = []
    for root, subdirs, files in os.walk(path):
        for filename in files:
            if stop_check is not None and stop_check():
                return
            if not (filename.lower().endswith('.words') or filename.lower().endswith('.wrd')):
                continue
            file_tuples.append((root, filename))
    if call_back is not None:
        call_back('Parsing files...')
        call_back(0,len(file_tuples))
    corpus = SpontaneousSpeechCorpus(corpus_name, path)
    for i, t in enumerate(file_tuples):
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            call_back('Parsing file {} of {}...'.format(i+1, len(file_tuples)))
            call_back(i)
        root, filename = t
        name,ext = os.path.splitext(filename)
        # Pair each word file with its phone file by dialect convention:
        # .words -> .phones (Buckeye), anything else (.wrd) -> .phn (TIMIT).
        if ext == '.words':
            phone_ext = '.phones'
        else:
            phone_ext = '.phn'
        word_path = os.path.join(root,filename)
        phone_path = os.path.splitext(word_path)[0] + phone_ext
        d = load_discourse_multiple_files(name, word_path, phone_path,
                                            dialect, annotation_types,
                                            corpus.lexicon, None,
                                            stop_check, None)
        corpus.add_discourse(d)

    # Apply the feature system once at the end, after the lexicon is complete.
    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        corpus.lexicon.set_feature_matrix(feature_matrix)
    return corpus