def load_directory_ilg(corpus_name, path, annotation_types,
                       feature_system_path=None,
                       stop_check=None, call_back=None):
    """
    Loads a directory of interlinear gloss text files

    Parameters
    ----------
    corpus_name : str
        Name of corpus
    path : str
        Path to directory of text files
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the glosses.
        Can be generated through ``inspect_discourse_ilg``.
    feature_system_path : str, optional
        File path of FeatureMatrix binary to specify segments
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the loading

    Returns
    -------
    SpontaneousSpeechCorpus
        Corpus containing Discourses corresponding to the text files
    """
    if call_back is not None:
        call_back('Finding files...')
        call_back(0, 0)
    file_tuples = []
    for root, subdirs, files in os.walk(path):
        for filename in files:
            if not filename.lower().endswith('.txt'):
                continue
            file_tuples.append((root, filename))
    if call_back is not None:
        call_back('Parsing files...')
        call_back(0, len(file_tuples))
    corpus = SpontaneousSpeechCorpus(corpus_name, path)
    for i, t in enumerate(file_tuples):
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            call_back('Parsing file {} of {}...'.format(i + 1, len(file_tuples)))
            call_back(i)
        root, filename = t
        name = os.path.splitext(filename)[0]
        d = load_discourse_ilg(name, os.path.join(root, filename),
                               annotation_types, corpus.lexicon,
                               None, stop_check, call_back)
        corpus.add_discourse(d)
    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        corpus.lexicon.set_feature_matrix(feature_matrix)
        corpus.lexicon.specifier = modernize.modernize_specifier(corpus.lexicon.specifier)
    return corpus
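
# Usage sketch (not part of the module): a minimal, hypothetical call on a
# directory of .txt interlinear gloss files. It assumes that
# inspect_discourse_ilg (mentioned in the docstring above) is available in
# this namespace to build the annotation types; the paths and corpus name
# are placeholders.
#
#     annotation_types = inspect_discourse_ilg('/path/to/ilg/sample.txt')
#     corpus = load_directory_ilg('ilg_demo', '/path/to/ilg', annotation_types)
#     print(sorted(corpus.discourses.keys()))
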
def load_directory_spelling(corpus_name, path, annotation_types=None,
                            support_corpus_path=None, ignore_case=False,
                            stop_check=None, call_back=None):
    """
    Loads a directory of orthographic texts

    Parameters
    ----------
    corpus_name : str
        Name of corpus
    path : str
        Path to directory of text files
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    support_corpus_path : str, optional
        File path of corpus binary to load transcriptions from
    ignore_case : bool, optional
        Specifies whether lookups in the support corpus should ignore case
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    SpontaneousSpeechCorpus
        Corpus containing Discourses corresponding to the text files
    """
    if call_back is not None:
        call_back('Finding files...')
        call_back(0, 0)
    file_tuples = []
    for root, subdirs, files in os.walk(path):
        for filename in files:
            if not filename.lower().endswith('.txt'):
                continue
            file_tuples.append((root, filename))
    if call_back is not None:
        call_back('Parsing files...')
        call_back(0, len(file_tuples))
    corpus = SpontaneousSpeechCorpus(corpus_name, path)
    for i, t in enumerate(file_tuples):
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            call_back('Parsing file {} of {}...'.format(i + 1, len(file_tuples)))
            call_back(i)
        root, filename = t
        name = os.path.splitext(filename)[0]
        d = load_discourse_spelling(name, os.path.join(root, filename),
                                    annotation_types, corpus.lexicon,
                                    support_corpus_path, ignore_case,
                                    stop_check, call_back)
        corpus.add_discourse(d)
    return corpus
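
# Usage sketch (hypothetical paths): load plain orthographic .txt files,
# optionally pulling transcriptions from a previously saved support corpus
# binary with case-insensitive lookups.
#
#     corpus = load_directory_spelling('spelling_demo', '/path/to/texts',
#                                      support_corpus_path='/path/to/support.corpus',
#                                      ignore_case=True)
#     print(len(corpus.discourses))
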
def test_init():
    # Four tokens spanning times 0-4. The spelling 'a' occurs twice (tokens
    # at positions 0 and 2), so its word type should accumulate a frequency
    # of 2; 'c' and 'd' each occur once.
    word_tokens = [{'begin': 0, 'end': 1,
                    'word': {'spelling': 'a', 'transcription': ['a', 'b']},
                    'following_token_time': 1},
                   {'begin': 1, 'end': 2,
                    'word': {'spelling': 'c', 'transcription': ['c', 'a', 'b']},
                    'previous_token_time': 0, 'following_token_time': 2},
                   {'begin': 2, 'end': 3,
                    'word': {'spelling': 'a', 'transcription': ['a', 'b']},
                    'previous_token_time': 1, 'following_token_time': 3},
                   {'begin': 3, 'end': 4,
                    'word': {'spelling': 'd', 'transcription': ['a', 'd']},
                    'previous_token_time': 2}]
    d = Discourse()
    for wt in word_tokens:
        w = d.lexicon.get_or_create_word(**wt['word'])
        w.frequency += 1
        wt['word'] = w
        d.add_word(WordToken(**wt))
    corpus = SpontaneousSpeechCorpus('', '')
    corpus.add_discourse(d)
    d = corpus.discourses['']
    assert d[0].wordtype.frequency == 2
    assert d[1].wordtype.frequency == 1
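
# To run just this test in isolation (assuming pytest is the project's test
# runner):
#
#     pytest -k test_init
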
def load_directory_multiple_files(corpus_name, path, dialect,
                                  annotation_types=None,
                                  feature_system_path=None,
                                  stop_check=None, call_back=None):
    """
    Loads a directory of corpus standard files (separated into words files
    and phones files)

    Parameters
    ----------
    corpus_name : str
        Name of corpus
    path : str
        Path to directory of text files
    dialect : str
        One of 'buckeye' or 'timit'
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse the glosses.
        Auto-generated based on dialect.
    feature_system_path : str, optional
        File path of FeatureMatrix binary to specify segments
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the loading

    Returns
    -------
    SpontaneousSpeechCorpus
        Corpus containing Discourses corresponding to the text files
    """
    if call_back is not None:
        call_back('Finding files...')
        call_back(0, 0)
    file_tuples = []
    for root, subdirs, files in os.walk(path):
        for filename in files:
            if stop_check is not None and stop_check():
                return
            if not (filename.lower().endswith('.words')
                    or filename.lower().endswith('.wrd')):
                continue
            file_tuples.append((root, filename))
    if call_back is not None:
        call_back('Parsing files...')
        call_back(0, len(file_tuples))
    corpus = SpontaneousSpeechCorpus(corpus_name, path)
    for i, t in enumerate(file_tuples):
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            call_back('Parsing file {} of {}...'.format(i + 1, len(file_tuples)))
            call_back(i)
        root, filename = t
        name, ext = os.path.splitext(filename)
        # Buckeye pairs .words with .phones; TIMIT pairs .wrd with .phn
        if ext == '.words':
            phone_ext = '.phones'
        else:
            phone_ext = '.phn'
        word_path = os.path.join(root, filename)
        phone_path = os.path.splitext(word_path)[0] + phone_ext
        try:
            d = load_discourse_multiple_files(name, word_path, phone_path,
                                              dialect, annotation_types,
                                              corpus.lexicon,
                                              feature_system_path,
                                              stop_check, None)
            corpus.add_discourse(d)
        except ValueError:
            print('Error importing for participant ' + name)
    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        corpus.lexicon.set_feature_matrix(feature_matrix)
        corpus.lexicon.specifier = modernize.modernize_specifier(corpus.lexicon.specifier)
    return corpus
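
# Usage sketch (hypothetical paths): load a Buckeye-style directory where
# each .words file sits next to a matching .phones file (or .wrd next to
# .phn for TIMIT), optionally attaching a saved feature system binary.
#
#     corpus = load_directory_multiple_files('buckeye_demo',
#                                            '/path/to/buckeye', 'buckeye',
#                                            feature_system_path='/path/to/features.feature')
#     print(sorted(corpus.discourses.keys()))
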