def load_directory_ilg(corpus_name, path, annotation_types, feature_system_path=None,
                       stop_check=None, call_back=None):
    """
    Loads a directory of interlinear gloss text files

    Parameters
    ----------
    corpus_name : str
        Name of corpus
    path : str
        Path to directory of text files
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the glosses.
        Can be generated through ``inspect_discourse_ilg``.
    feature_system_path : str, optional
        File path of FeatureMatrix binary to specify segments
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    SpontaneousSpeechCorpus
        Corpus containing Discourses corresponding to the text files
    """
    if call_back is not None:
        call_back('Finding files...')
        call_back(0, 0)
    file_tuples = []
    for root, subdirs, files in os.walk(path):
        for filename in files:
            # Only .txt files are treated as interlinear gloss files
            if not filename.lower().endswith('.txt'):
                continue
            file_tuples.append((root, filename))
    if call_back is not None:
        call_back('Parsing files...')
        call_back(0, len(file_tuples))
    corpus = SpontaneousSpeechCorpus(corpus_name, path)
    for i, t in enumerate(file_tuples):
        # Returns None (not the partial corpus) when the user cancels
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            call_back('Parsing file {} of {}...'.format(i + 1, len(file_tuples)))
            call_back(i)
        root, filename = t
        name = os.path.splitext(filename)[0]
        # feature_system_path is deliberately not forwarded here; the feature
        # matrix is attached once to the whole corpus below
        d = load_discourse_ilg(name, os.path.join(root, filename),
                               annotation_types, corpus.lexicon,
                               None, stop_check, call_back)
        corpus.add_discourse(d)
    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        corpus.lexicon.set_feature_matrix(feature_matrix)
        corpus.lexicon.specifier = modernize.modernize_specifier(corpus.lexicon.specifier)
    return corpus
def load_discourse_textgrid(corpus_name, path, annotation_types,
                            feature_system_path=None, stop_check=None, call_back=None):
    """
    Load a single discourse from a Praat TextGrid file.

    NOTE(review): a second ``load_discourse_textgrid`` with an extra
    ``support_corpus_path`` parameter appears later in this file and shadows
    this definition — confirm which one is intended to survive.

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to TextGrid file
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the TextGrids.
        Can be generated through ``inspect_discourse_textgrid``.
    feature_system_path : str, optional
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the TextGrid file
    """
    # textgrid_to_data mutates annotation_types as a side effect; the
    # returned data object is not consumed directly below.
    data = textgrid_to_data(corpus_name, path, annotation_types,
                            call_back=call_back, stop_check=stop_check)
    wav_path = find_wav_path(path)
    discourse = data_to_discourse2(corpus_name, wav_path, annotation_types,
                                   stop_check=stop_check, call_back=call_back)
    if discourse is None:
        # Loading was cancelled part-way through
        return
    if feature_system_path is not None:
        specifier = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(specifier)
        discourse.lexicon.specifier = modernize.modernize_specifier(
            discourse.lexicon.specifier)
    return discourse
def load_discourse_transcription(corpus_name, path, annotation_types=None, lexicon=None,
                                 feature_system_path=None, stop_check=None, call_back=None):
    """
    Load a discourse from a text file containing running transcribed text.

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    lexicon : Corpus, optional
        Corpus to store Discourse word information
    feature_system_path : str, optional
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the text file

    Raises
    ------
    PCTOSError
        If ``feature_system_path`` is given but does not exist on disk.
    """
    # Fail fast on a bad feature-system path before doing any parsing work
    if feature_system_path is not None and not os.path.exists(feature_system_path):
        raise PCTOSError(
            "The feature path specified ({}) does not exist".format(feature_system_path))
    data = transcription_text_to_data(corpus_name, path, annotation_types,
                                      stop_check=stop_check, call_back=call_back)
    discourse = data_to_discourse2(corpus_name=corpus_name,
                                   wav_path=data.wav_path,
                                   annotation_types=annotation_types,
                                   stop_check=stop_check,
                                   call_back=call_back)
    if feature_system_path is not None:
        matrix = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(matrix)
        discourse.lexicon.specifier = modernize.modernize_specifier(
            discourse.lexicon.specifier)
    return discourse
def load_discourse_textgrid(corpus_name, path, annotation_types, feature_system_path=None,
                            support_corpus_path=None, stop_check=None, call_back=None):
    """
    Load a discourse from a TextGrid file, optionally using a support corpus
    for word lookup.

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to TextGrid file
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the TextGrids.
        Can be generated through ``inspect_discourse_textgrid``.
    feature_system_path : str, optional
        Full path to pickled FeatureMatrix to use with the Corpus
    support_corpus_path : str or Corpus, optional
        Either an already-loaded Corpus (as passed by
        ``load_directory_textgrid`` to avoid reloading per file) or a path to
        a pickled corpus to load.
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the TextGrid file
    """
    # Side effect: textgrid_to_data modifies annotation_types in place.
    data = textgrid_to_data(corpus_name, path, annotation_types,
                            call_back=call_back, stop_check=stop_check)
    wav_path = find_wav_path(path)
    support = None
    if support_corpus_path is not None:
        if isinstance(support_corpus_path, Corpus):
            # Pre-loaded corpus handed in by load_directory_textgrid; using it
            # directly avoids unpickling the corpus once per file.
            support = support_corpus_path
        else:
            # A string path to a pickled corpus
            support = load_binary(support_corpus_path)
    discourse = data_to_discourse2(corpus_name, wav_path,
                                   annotation_types=annotation_types,
                                   support_corpus=support,
                                   stop_check=stop_check,
                                   call_back=call_back)
    if feature_system_path is not None:
        specifier = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(specifier)
        discourse.lexicon.specifier = modernize.modernize_specifier(
            discourse.lexicon.specifier)
    return discourse
def load_discourse_ilg(corpus_name, path, annotation_types, lexicon=None,
                       feature_system_path=None, stop_check=None, call_back=None):
    """
    Load a discourse from a text file containing interlinear glosses.

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the glosses.
        Can be generated through ``inspect_discourse_ilg``.
    lexicon : Corpus, optional
        Corpus to store Discourse word information
    feature_system_path : str, optional
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the text file
    """
    # Presumably ilg_to_data mutates annotation_types (as textgrid_to_data is
    # documented to elsewhere in this file); its return value is not used
    # below — TODO confirm.
    data = ilg_to_data(corpus_name, path, annotation_types, stop_check, call_back)
    discourse = data_to_discourse2(corpus_name=corpus_name,
                                   annotation_types=annotation_types,
                                   stop_check=stop_check,
                                   call_back=call_back)
    if discourse is None:
        # Cancelled via stop_check
        return
    if feature_system_path is not None:
        specifier = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(specifier)
        discourse.lexicon.specifier = modernize.modernize_specifier(
            discourse.lexicon.specifier)
    return discourse
def load_discourse_ilg(corpus_name, path, annotation_types, lexicon=None,
                       feature_system_path=None, stop_check=None, call_back=None):
    """
    Build a Discourse from an interlinear-gloss text file.

    NOTE(review): this duplicates an identical ``load_discourse_ilg`` defined
    earlier in the file; the later definition wins at import time.

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the glosses.
        Can be generated through ``inspect_discourse_ilg``.
    lexicon : Corpus, optional
        Corpus to store Discourse word information
    feature_system_path : str, optional
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the text file
    """
    data = ilg_to_data(corpus_name, path, annotation_types, stop_check, call_back)
    discourse = data_to_discourse2(corpus_name=corpus_name,
                                   annotation_types=annotation_types,
                                   stop_check=stop_check, call_back=call_back)
    if discourse is None:
        return
    if feature_system_path is None:
        return discourse
    feature_matrix = load_binary(feature_system_path)
    discourse.lexicon.set_feature_matrix(feature_matrix)
    discourse.lexicon.specifier = modernize.modernize_specifier(
        discourse.lexicon.specifier)
    return discourse
def load_directory_ilg(corpus_name, path, annotation_types, feature_system_path=None,
                       stop_check=None, call_back=None):
    """
    Loads a directory of interlinear gloss text files

    NOTE(review): duplicates an earlier ``load_directory_ilg`` in this file;
    this later definition is the one in effect.

    Parameters
    ----------
    corpus_name : str
        Name of corpus
    path : str
        Path to directory of text files
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the glosses.
        Can be generated through ``inspect_discourse_ilg``.
    feature_system_path : str, optional
        File path of FeatureMatrix binary to specify segments
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    SpontaneousSpeechCorpus
        Corpus containing Discourses corresponding to the text files
    """
    if call_back is not None:
        call_back('Finding files...')
        call_back(0, 0)
    file_tuples = []
    for root, subdirs, files in os.walk(path):
        for filename in files:
            if not filename.lower().endswith('.txt'):
                continue
            file_tuples.append((root, filename))
    if call_back is not None:
        call_back('Parsing files...')
        call_back(0, len(file_tuples))
    corpus = SpontaneousSpeechCorpus(corpus_name, path)
    for i, t in enumerate(file_tuples):
        # Cancellation returns None rather than a partial corpus
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            call_back('Parsing file {} of {}...'.format(i + 1, len(file_tuples)))
            call_back(i)
        root, filename = t
        name = os.path.splitext(filename)[0]
        # Feature system is attached once to the finished corpus below, so it
        # is not passed through to the per-file loader
        d = load_discourse_ilg(name, os.path.join(root, filename),
                               annotation_types, corpus.lexicon,
                               None, stop_check, call_back)
        corpus.add_discourse(d)
    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        corpus.lexicon.set_feature_matrix(feature_matrix)
        corpus.lexicon.specifier = modernize.modernize_specifier(
            corpus.lexicon.specifier)
    return corpus
def load_discourse_textgrid(corpus_name, path, annotation_types,
                            feature_system_path=None, support_corpus_path=None,
                            stop_check=None, call_back=None):
    """
    Load a discourse from a TextGrid file

    NOTE(review): duplicates an earlier ``load_discourse_textgrid`` in this
    file; being defined later, this variant (with ``support_corpus_path``) is
    the one in effect at import time.

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to TextGrid file
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the TextGrids.
        Can be generated through ``inspect_discourse_textgrid``.
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    support_corpus_path : str or Corpus, optional
        Either an already-loaded Corpus or a path to a pickled corpus used to
        support word lookup.
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the TextGrid file
    """
    data = textgrid_to_data(corpus_name, path, annotation_types,
                            call_back=call_back, stop_check=stop_check)
    #textgrid_to_data has side-effects that change annotation_types
    wav_path = find_wav_path(path)
    if support_corpus_path is not None:
        if isinstance(support_corpus_path, Corpus):
            #the corpus is 'preloaded' if this function is called by load_directory_textgrid
            #otherwise the corpus has to be loaded once per file in a directory, which could be slow
            support = support_corpus_path
        else:
            #otherwise, it's a string representing a path to the corpus
            support = load_binary(support_corpus_path)
    else:
        support = None
    discourse = data_to_discourse2(corpus_name, wav_path,
                                   annotation_types=annotation_types,
                                   support_corpus=support,
                                   stop_check=stop_check, call_back=call_back)
    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(feature_matrix)
        discourse.lexicon.specifier = modernize.modernize_specifier(
            discourse.lexicon.specifier)
    return discourse
def check_feature_coverage_csv(corpus_name, path, delimiter, annotation_types=None,
                               feature_system_path=None, stop_check=None, call_back=None):
    """
    Collect segments in a column-delimited corpus file that are missing from
    the given feature system.

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus (unused in the check itself)
    path : str
        Full path to the column-delimited text file
    delimiter : str
        Character used to split lines into columns
    annotation_types : list of AnnotationType, optional
        How to parse each column; inspected from the file when None
    feature_system_path : str, optional
        Path to a pickled FeatureMatrix binary
    stop_check : callable, optional
        Unused here; kept for signature parity with the other loaders
    call_back : callable, optional
        Unused here; kept for signature parity with the other loaders

    Returns
    -------
    set
        Transcription segments found in the file that have no entry in the
        feature system; empty when no feature system is available.

    Raises
    ------
    DelimiterError
        If splitting the header row on ``delimiter`` yields a single column.
    """
    # Bug fix: previously feature_matrix was only assigned inside this guard,
    # so a missing/absent feature system caused a NameError further down.
    if feature_system_path is None or not os.path.exists(feature_system_path):
        return set()
    feature_matrix = load_binary(feature_system_path)
    feature_matrix = modernize.modernize_specifier(feature_matrix)
    if annotation_types is None:
        annotation_types, delimiter = inspect_csv(path, coldelim=delimiter)
    for a in annotation_types:
        a.reset()
    missing = set()
    with open(path, encoding='utf-8-sig') as f:
        headers = f.readline()
        headers = headers.split(delimiter)
        if len(headers) == 1:
            # Bug fix: the original message contained '\n\Check', leaving a
            # stray backslash before 'Check' in the rendered error.
            raise DelimiterError('Could not parse the corpus.\n'
                                 'Check that the column delimiter you typed in matches '
                                 'the one used in the file.')
        headers = annotation_types
        for line in f.readlines():
            line = line.strip()
            if not line:
                continue
            for k, v in zip(headers, line.split(delimiter)):
                v = v.strip()
                if k.attribute.att_type != 'tier':
                    continue
                ignored = k.ignored_characters
                if ignored is not None:
                    v = ''.join(x for x in v if x not in ignored)
                sd = k.syllable_delimiter
                syllables = v.split(sd) if sd is not None else [v]
                td = k.trans_delimiter
                # Stress and tone marks are suprasegmental and are stripped
                # before coverage is checked
                supra_spec = set(k.stress_specification.keys()) | set(k.tone_specification.keys())
                for syllable in syllables:
                    syllable = ''.join(x for x in syllable if x not in supra_spec)
                    if td is None:
                        if k.digraph_pattern is not None:
                            segs = k.digraph_pattern.findall(syllable)
                        else:
                            segs = [x for x in syllable]
                    else:
                        segs = syllable.split(td)
                    for seg in segs:
                        if seg == '':
                            continue
                        if seg not in feature_matrix.segments:
                            missing.add(seg)
    # Return the result instead of the previous leftover debug print
    return missing
def load_corpus_csv(corpus_name, path, delimiter, annotation_types=None,
                    feature_system_path=None, stop_check=None, call_back=None):
    """
    Load a corpus from a column-delimited text file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    delimiter : str
        Character to use for spliting lines into columns
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    Corpus
        Corpus object generated from the text file

    Raises
    ------
    DelimiterError
        If the header row does not split into multiple columns with
        ``delimiter``.
    """
    check_feature_coverage_csv(corpus_name, path, delimiter, annotation_types,
                               feature_system_path, stop_check, call_back)
    corpus = Corpus(corpus_name)
    # Bug fix: feature_matrix is passed to parse_transcription below; without
    # this default it was an unbound name whenever no feature system was given.
    feature_matrix = None
    if feature_system_path is not None and os.path.exists(feature_system_path):
        feature_matrix = load_binary(feature_system_path)
        feature_matrix = modernize.modernize_specifier(feature_matrix)
        corpus.set_feature_matrix(feature_matrix)
    if annotation_types is None:
        annotation_types, delimiter = inspect_csv(path, coldelim=delimiter)
    for a in annotation_types:
        a.reset()
    if call_back is not None:
        call_back('Loading...')
        call_back(0, 0)
    cur = 0
    with open(path, encoding='utf-8-sig') as f:
        headers = f.readline()
        headers = headers.split(delimiter)
        if len(headers) == 1:
            # Bug fix: message previously contained the broken escape
            # '\n\Check', which rendered a literal backslash before 'Check'.
            raise DelimiterError('Could not parse the corpus.\n'
                                 'Check that the column delimiter you typed in matches '
                                 'the one used in the file.')
        headers = annotation_types
        for a in headers:
            corpus.add_attribute(a.attribute)
        # NOTE(review): starting at True makes the delimiter check at the end
        # unreachable; upstream intent was probably to start at False — TODO
        # confirm before changing behavior.
        trans_check = True
        for line in f.readlines():
            if stop_check is not None and stop_check():
                return
            if call_back is not None:
                cur += 1
                call_back(cur)
            line = line.strip()
            if not line:
                # blank or just a newline
                continue
            d = {}
            for k, v in zip(headers, line.split(delimiter)):
                v = v.strip()
                if k.attribute.att_type == 'tier':
                    # trans is a list of BaseAnnotation
                    trans = parse_transcription(v, k, feature_matrix=feature_matrix,
                                                corpus=corpus)
                    if not trans_check and len(trans) > 1:
                        trans_check = True
                    d[k.attribute.name] = (k.attribute, trans)
                else:
                    d[k.attribute.name] = (k.attribute, v)
            word = Word(**d)
            if word.transcription:
                # transcriptions can have phonetic symbol delimiters
                if not word.spelling:
                    word.spelling = ''.join(map(str, word.transcription))
            corpus.add_word(word, allow_duplicates=True)
    if corpus.specifier is not None:
        corpus.inventory.update_features(corpus.specifier)
    if corpus.has_transcription and any(len(word.transcription) > 1 for word in corpus):
        if not trans_check:
            raise DelimiterError('Could not parse transcriptions with that delimiter. '
                                 '\nCheck that the transcription delimiter you typed '
                                 'in matches the one used in the file.')
    if stop_check is not None and stop_check():
        return
    return corpus
def load_discourse_multiple_files(corpus_name, word_path, phone_path, dialect,
                                  annotation_types=None, lexicon=None,
                                  feature_system_path=None, stop_check=None,
                                  call_back=None):
    """
    Load a discourse from a words file plus a phones file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    word_path : str
        Full path to words text file
    phone_path : str
        Full path to phones text file (NOTE(review): not referenced in this
        body — presumably consumed inside ``read_words``; TODO confirm)
    dialect : str
        Currently, only 'buckeye'
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse the glosses.
        Auto-generated based on dialect.
    lexicon : Corpus, optional
        Corpus to store Discourse word information
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the text file
    """
    # Discourse name comes from the words file's base name
    name = os.path.splitext(os.path.split(word_path)[1])[0]
    discourse_kwargs = {'name': name,
                        'wav_path': find_wav_path(word_path),
                        'other_attributes': list()}
    # Route each annotation type into the Discourse constructor's slots:
    # default orthography/transcription get dedicated keys, everything else
    # is collected under 'other_attributes'
    for at in annotation_types:
        if at.name == 'Orthography (default)':
            discourse_kwargs['spelling_name'] = at.attribute  #.output_name
        elif at.name == 'Transcription (default)':
            discourse_kwargs['transcription_name'] = at.attribute  #.output_name
        elif at.name == 'Other (character)' or at.attribute.att_type in ('tier', 'spelling'):
            discourse_kwargs['other_attributes'].append(at.attribute)
    discourse = Discourse(discourse_kwargs)
    words = read_words(word_path, dialect)
    # ind is a running token index used as a fallback begin/end time
    ind = 0
    for w in words:
        word_kwargs = {at.output_name: (at.attribute, w[at.output_name])
                       for at in annotation_types}
        word = Word(**word_kwargs)
        word_token_kwargs = dict()
        for at in annotation_types:
            if at.ignored:
                continue
            word_token_kwargs[at.output_name] = (at.attribute, w[at.output_name])
            word_token_kwargs['word'] = word
            if at.attribute.att_type == 'tier':
                if at.attribute.is_default:
                    # Fall back to the token index when the file has no times
                    begin = w['begin']
                    end = w['end']
                    word_token_kwargs['begin'] = begin if begin is not None else ind
                    word_token_kwargs['end'] = end if end is not None else ind + 1
                if at.token:
                    word_token_kwargs['_transcription'] = (at.attribute, w['transcription'])
        word_token = WordToken(**word_token_kwargs)
        word.wordtokens.append(word_token)
        # The type goes into the lexicon, the token into the discourse
        discourse.lexicon.add_word(word)
        discourse.add_word(word_token)
        ind += 1
    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(feature_matrix)
        discourse.lexicon.specifier = modernize.modernize_specifier(
            discourse.lexicon.specifier)
    return discourse
def load_directory_multiple_files(corpus_name, path, dialect, annotation_types=None,
                                  feature_system_path=None, stop_check=None,
                                  call_back=None):
    """
    Loads a directory of corpus standard files (separated into words files
    and phones files)

    Parameters
    ----------
    corpus_name : str
        Name of corpus
    path : str
        Path to directory of text files
    dialect : str
        Currently only 'buckeye'
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse the glosses.
        Auto-generated based on dialect.
    feature_system_path : str, optional
        File path of FeatureMatrix binary to specify segments
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    SpontaneousSpeechCorpus
        Corpus containing Discourses corresponding to the text files
    """
    if call_back is not None:
        call_back('Finding files...')
        call_back(0, 0)
    file_tuples = []
    for root, subdirs, files in os.walk(path):
        for filename in files:
            if stop_check is not None and stop_check():
                return
            if not (filename.lower().endswith('.words')
                    or filename.lower().endswith('.wrd')):
                continue
            file_tuples.append((root, filename))
    if call_back is not None:
        call_back('Parsing files...')
        call_back(0, len(file_tuples))
    corpus = SpontaneousSpeechCorpus(corpus_name, path)
    for i, t in enumerate(file_tuples):
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            call_back('Parsing file {} of {}...'.format(i + 1, len(file_tuples)))
            call_back(i)
        root, filename = t
        name, ext = os.path.splitext(filename)
        if ext == '.words':
            phone_ext = '.phones'
        # NOTE(review): '.wrd' files are collected above but no phone
        # extension is set for them, so phone_ext may be unbound (or stale
        # from a previous iteration) here — TODO confirm intended handling.
        word_path = os.path.join(root, filename)
        phone_path = os.path.splitext(word_path)[0] + phone_ext
        try:
            d = load_discourse_multiple_files(name, word_path, phone_path,
                                              dialect, annotation_types,
                                              corpus.lexicon,
                                              feature_system_path,
                                              stop_check, None)
            corpus.add_discourse(d)
        except ValueError:
            # Best-effort: a malformed participant file is reported and skipped
            print('Error importing for participant ' + name)
    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        corpus.lexicon.set_feature_matrix(feature_matrix)
        corpus.lexicon.specifier = modernize.modernize_specifier(
            corpus.lexicon.specifier)
    return corpus
def load_discourse_multiple_files(corpus_name, word_path, phone_path, dialect,
                                  annotation_types=None, lexicon=None,
                                  feature_system_path=None, stop_check=None,
                                  call_back=None):
    """
    Load a discourse from paired words/phones text files

    NOTE(review): duplicates an identical function defined earlier in this
    file; this later definition is the one in effect.

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    word_path : str
        Full path to words text file
    phone_path : str
        Full path to phones text file (not referenced in this body —
        presumably used inside ``read_words``; TODO confirm)
    dialect : str
        Currently, only 'buckeye'
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse the glosses.
        Auto-generated based on dialect.
    lexicon : Corpus, optional
        Corpus to store Discourse word information
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the text file
    """
    # Discourse is named after the words file
    name = os.path.splitext(os.path.split(word_path)[1])[0]
    discourse_kwargs = {'name': name, 'wav_path': find_wav_path(word_path),
                        'other_attributes': list()}
    # Map annotation types onto the Discourse constructor's expected keys
    for at in annotation_types:
        if at.name == 'Orthography (default)':
            discourse_kwargs['spelling_name'] = at.attribute#.output_name
        elif at.name == 'Transcription (default)':
            discourse_kwargs['transcription_name'] = at.attribute#.output_name
        elif at.name == 'Other (character)' or at.attribute.att_type in ('tier', 'spelling'):
            discourse_kwargs['other_attributes'].append(at.attribute)
    discourse = Discourse(discourse_kwargs)
    words = read_words(word_path, dialect)
    # Running token index; doubles as fallback begin/end when times are absent
    ind = 0
    for w in words:
        word_kwargs = {at.output_name: (at.attribute, w[at.output_name])
                       for at in annotation_types}
        word = Word(**word_kwargs)
        word_token_kwargs = dict()
        for at in annotation_types:
            if at.ignored:
                continue
            word_token_kwargs[at.output_name] = (at.attribute, w[at.output_name])
            word_token_kwargs['word'] = word
            if at.attribute.att_type == 'tier':
                if at.attribute.is_default:
                    begin = w['begin']
                    end = w['end']
                    word_token_kwargs['begin'] = begin if begin is not None else ind
                    word_token_kwargs['end'] = end if end is not None else ind + 1
                if at.token:
                    word_token_kwargs['_transcription'] = (at.attribute, w['transcription'])
        word_token = WordToken(**word_token_kwargs)
        word.wordtokens.append(word_token)
        # Word type into the lexicon, token into the discourse timeline
        discourse.lexicon.add_word(word)
        discourse.add_word(word_token)
        ind += 1
    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(feature_matrix)
        discourse.lexicon.specifier = modernize.modernize_specifier(
            discourse.lexicon.specifier)
    return discourse
def load_directory_multiple_files(corpus_name, path, dialect, annotation_types=None,
                                  feature_system_path=None, stop_check=None,
                                  call_back=None):
    """
    Loads a directory of corpus standard files (separated into words files
    and phones files)

    NOTE(review): duplicates an identical function defined earlier in this
    file; this later definition is the one in effect.

    Parameters
    ----------
    corpus_name : str
        Name of corpus
    path : str
        Path to directory of text files
    dialect : str
        Currently only 'buckeye'
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse the glosses.
        Auto-generated based on dialect.
    feature_system_path : str, optional
        File path of FeatureMatrix binary to specify segments
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    SpontaneousSpeechCorpus
        Corpus containing Discourses corresponding to the text files
    """
    if call_back is not None:
        call_back('Finding files...')
        call_back(0, 0)
    file_tuples = []
    for root, subdirs, files in os.walk(path):
        for filename in files:
            if stop_check is not None and stop_check():
                return
            if not (filename.lower().endswith('.words')
                    or filename.lower().endswith('.wrd')):
                continue
            file_tuples.append((root, filename))
    if call_back is not None:
        call_back('Parsing files...')
        call_back(0, len(file_tuples))
    corpus = SpontaneousSpeechCorpus(corpus_name, path)
    for i, t in enumerate(file_tuples):
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            call_back('Parsing file {} of {}...'.format(i + 1, len(file_tuples)))
            call_back(i)
        root, filename = t
        name, ext = os.path.splitext(filename)
        if ext == '.words':
            phone_ext = '.phones'
        # NOTE(review): '.wrd' files pass the filter above but never assign
        # phone_ext, leaving it unbound or stale here — TODO confirm intended
        # handling for non-Buckeye extensions.
        word_path = os.path.join(root, filename)
        phone_path = os.path.splitext(word_path)[0] + phone_ext
        try:
            d = load_discourse_multiple_files(name, word_path, phone_path,
                                              dialect, annotation_types,
                                              corpus.lexicon,
                                              feature_system_path,
                                              stop_check, None)
            corpus.add_discourse(d)
        except ValueError:
            # Best-effort: report and skip files that fail to parse
            print('Error importing for participant ' + name)
    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        corpus.lexicon.set_feature_matrix(feature_matrix)
        corpus.lexicon.specifier = modernize.modernize_specifier(
            corpus.lexicon.specifier)
    return corpus