def load_discourse_textgrid(corpus_name, path, annotation_types,
                            feature_system_path=None, support_corpus_path=None,
                            stop_check=None, call_back=None):
    """
    Load a discourse from a TextGrid file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to TextGrid file
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the TextGrids.
        Can be generated through ``inspect_discourse_textgrid``.
    feature_system_path : str, optional
        Full path to pickled FeatureMatrix to use with the Corpus
    support_corpus_path : str or Corpus, optional
        Pickled Corpus (or path to one) used to look up word information
        while parsing
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the TextGrid file
    """
    data = textgrid_to_data(corpus_name, path, annotation_types,
                            call_back=call_back, stop_check=stop_check)
    # textgrid_to_data has side effects that change annotation_types
    wav_path = find_wav_path(path)

    if support_corpus_path is not None:
        if isinstance(support_corpus_path, Corpus):
            # the corpus is 'preloaded' if this function is called by load_directory_textgrid;
            # otherwise the corpus has to be loaded once per file in a directory,
            # which could be slow
            support = support_corpus_path
        else:
            # otherwise, it's a string representing a path to the corpus
            support = load_binary(support_corpus_path)
    else:
        support = None

    discourse = data_to_discourse2(corpus_name, wav_path,
                                   annotation_types=annotation_types,
                                   support_corpus=support,
                                   stop_check=stop_check, call_back=call_back)
    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(feature_matrix)
        discourse.lexicon.specifier = modernize.modernize_specifier(discourse.lexicon.specifier)
    return discourse

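# Hedged usage sketch (not from the source): the TextGrid and support-corpus paths
# below are illustrative placeholders, and ``inspect_discourse_textgrid`` (referenced
# in the docstring above) is assumed to return the AnnotationType list this loader expects.
def _example_load_textgrid_discourse():
    tg_path = '/data/speaker1.TextGrid'  # hypothetical file
    annotation_types = inspect_discourse_textgrid(tg_path)
    return load_discourse_textgrid('speaker1', tg_path, annotation_types,
                                   support_corpus_path='/data/support.corpus')
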
def load_directory_ilg(corpus_name, path, annotation_types,
                       feature_system_path=None,
                       stop_check=None, call_back=None):
    """
    Loads a directory of interlinear gloss text files

    Parameters
    ----------
    corpus_name : str
        Name of corpus
    path : str
        Path to directory of text files
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the glosses.
        Can be generated through ``inspect_discourse_ilg``.
    feature_system_path : str, optional
        File path of FeatureMatrix binary to specify segments
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    SpontaneousSpeechCorpus
        Corpus containing Discourses corresponding to the text files
    """
    if call_back is not None:
        call_back('Finding files...')
        call_back(0, 0)
    file_tuples = []
    for root, subdirs, files in os.walk(path):
        for filename in files:
            if not filename.lower().endswith('.txt'):
                continue
            file_tuples.append((root, filename))
    if call_back is not None:
        call_back('Parsing files...')
        call_back(0, len(file_tuples))
    cur = 0
    corpus = SpontaneousSpeechCorpus(corpus_name, path)
    for i, t in enumerate(file_tuples):
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            call_back('Parsing file {} of {}...'.format(i + 1, len(file_tuples)))
            call_back(i)
        root, filename = t
        name = os.path.splitext(filename)[0]
        d = load_discourse_ilg(name, os.path.join(root, filename),
                               annotation_types, corpus.lexicon,
                               None, stop_check, call_back)
        corpus.add_discourse(d)
    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        corpus.lexicon.set_feature_matrix(feature_matrix)
        corpus.lexicon.specifier = modernize.modernize_specifier(corpus.lexicon.specifier)
    return corpus

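# Hedged usage sketch (illustrative paths, not from the source): the AnnotationTypes
# are assumed to come from ``inspect_discourse_ilg`` run on one representative file
# in the directory, as the docstring above suggests.
def _example_load_ilg_directory():
    ilg_dir = '/data/ilg_texts'  # hypothetical directory of .txt gloss files
    annotation_types = inspect_discourse_ilg(os.path.join(ilg_dir, 'story1.txt'))
    return load_directory_ilg('ilg_corpus', ilg_dir, annotation_types)
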
def test_save(export_test_dir, unspecified_test_corpus):
    save_path = os.path.join(export_test_dir, 'testsave.corpus')
    save_binary(unspecified_test_corpus, save_path)
    c = load_binary(save_path)
    assert unspecified_test_corpus == c

def load_directory_ilg(corpus_name, path, annotation_types,
                       feature_system_path=None,
                       stop_check=None, call_back=None):
    """
    Loads a directory of interlinear gloss text files

    Parameters
    ----------
    corpus_name : str
        Name of corpus
    path : str
        Path to directory of text files
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the glosses.
        Can be generated through ``inspect_discourse_ilg``.
    feature_system_path : str, optional
        File path of FeatureMatrix binary to specify segments
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    SpontaneousSpeechCorpus
        Corpus containing Discourses corresponding to the text files
    """
    if call_back is not None:
        call_back('Finding files...')
        call_back(0, 0)
    file_tuples = []
    for root, subdirs, files in os.walk(path):
        for filename in files:
            if not filename.lower().endswith('.txt'):
                continue
            file_tuples.append((root, filename))
    if call_back is not None:
        call_back('Parsing files...')
        call_back(0, len(file_tuples))
    cur = 0
    corpus = SpontaneousSpeechCorpus(corpus_name, path)
    for i, t in enumerate(file_tuples):
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            call_back('Parsing file {} of {}...'.format(i + 1, len(file_tuples)))
            call_back(i)
        root, filename = t
        name = os.path.splitext(filename)[0]
        d = load_discourse_ilg(name, os.path.join(root, filename),
                               annotation_types, corpus.lexicon,
                               None, stop_check, call_back)
        corpus.add_discourse(d)
    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        corpus.lexicon.set_feature_matrix(feature_matrix)
    return corpus

def load_discourse_textgrid(corpus_name, path, annotation_types,
                            feature_system_path=None,
                            stop_check=None, call_back=None):
    """
    Load a discourse from a TextGrid file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to TextGrid file
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the TextGrids.
        Can be generated through ``inspect_discourse_textgrid``.
    feature_system_path : str, optional
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the TextGrid file
    """
    data = textgrid_to_data(corpus_name, path, annotation_types,
                            call_back=call_back, stop_check=stop_check)
    # textgrid_to_data has side effects that change annotation_types
    wav_path = find_wav_path(path)
    discourse = data_to_discourse2(corpus_name, wav_path, annotation_types,
                                   stop_check=stop_check, call_back=call_back)
    # discourse = data_to_discourse(data, lexicon, call_back=call_back, stop_check=stop_check)
    # discourse is a Discourse object, see corpus/classes/spontaneous.py
    if discourse is None:
        return
    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(feature_matrix)
        discourse.lexicon.specifier = modernize.modernize_specifier(discourse.lexicon.specifier)
    return discourse

def load_discourse_ilg(corpus_name, path, annotation_types, lexicon=None,
                       feature_system_path=None,
                       stop_check=None, call_back=None):
    """
    Load a discourse from a text file containing interlinear glosses

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the glosses.
        Can be generated through ``inspect_discourse_ilg``.
    lexicon : Corpus, optional
        Corpus to store Discourse word information
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the text file
    """
    data = ilg_to_data(corpus_name, path, annotation_types, stop_check, call_back)
    # discourse = data_to_discourse(data, lexicon, call_back=call_back, stop_check=stop_check)
    discourse = data_to_discourse2(corpus_name=corpus_name,
                                   annotation_types=annotation_types,
                                   stop_check=stop_check, call_back=call_back)
    if discourse is None:
        return
    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(feature_matrix)
        discourse.lexicon.specifier = modernize.modernize_specifier(discourse.lexicon.specifier)
    return discourse

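# Hedged usage sketch (illustrative path): loads a single interlinear gloss file as a
# Discourse, with AnnotationTypes assumed to come from ``inspect_discourse_ilg``.
def _example_load_ilg_discourse():
    ilg_path = '/data/story1.txt'  # hypothetical file
    annotation_types = inspect_discourse_ilg(ilg_path)
    return load_discourse_ilg('story1', ilg_path, annotation_types)
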
def load_discourse_multiple_files(corpus_name, word_path, phone_path, dialect,
                                  annotation_types=None, lexicon=None,
                                  feature_system_path=None,
                                  stop_check=None, call_back=None):
    """
    Load a discourse from a paired words file and phones file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    word_path : str
        Full path to words text file
    phone_path : str
        Full path to phones text file
    dialect : str
        One of 'buckeye' or 'timit'
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse the files.
        Auto-generated based on dialect.
    lexicon : Corpus, optional
        Corpus to store Discourse word information
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the text files
    """
    data = multiple_files_to_data(word_path, phone_path, dialect, annotation_types,
                                  call_back, stop_check)
    data.name = corpus_name
    data.wav_path = find_wav_path(word_path)
    discourse = data_to_discourse(data, lexicon)
    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(feature_matrix)
    return discourse

def load_discourse_textgrid(corpus_name, path, annotation_types, lexicon=None,
                            feature_system_path=None,
                            stop_check=None, call_back=None):
    """
    Load a discourse from a TextGrid file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to TextGrid file
    annotation_types : list of AnnotationType
        List of AnnotationType specifying how to parse the TextGrids.
        Can be generated through ``inspect_discourse_textgrid``.
    lexicon : Corpus, optional
        Corpus to store Discourse word information
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the TextGrid file
    """
    data = textgrid_to_data(path, annotation_types, call_back, stop_check)
    data.name = corpus_name
    data.wav_path = find_wav_path(path)
    discourse = data_to_discourse(data, lexicon)
    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(feature_matrix)
    return discourse

def load_discourse_multiple_files(corpus_name, word_path, phone_path, dialect,
                                  annotation_types=None, lexicon=None,
                                  feature_system_path=None,
                                  stop_check=None, call_back=None):
    """
    Load a discourse from a paired words file and phones file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    word_path : str
        Full path to words text file
    phone_path : str
        Full path to phones text file
    dialect : str
        Currently, only 'buckeye'
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse the files.
        Auto-generated based on dialect.
    lexicon : Corpus, optional
        Corpus to store Discourse word information
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the text files
    """
    name = os.path.splitext(os.path.split(word_path)[1])[0]
    discourse_kwargs = {'name': name,
                        'wav_path': find_wav_path(word_path),
                        'other_attributes': list()}
    for at in annotation_types:
        if at.name == 'Orthography (default)':
            discourse_kwargs['spelling_name'] = at.attribute  # .output_name
        elif at.name == 'Transcription (default)':
            discourse_kwargs['transcription_name'] = at.attribute  # .output_name
        elif at.name == 'Other (character)' or at.attribute.att_type in ('tier', 'spelling'):
            discourse_kwargs['other_attributes'].append(at.attribute)
    discourse = Discourse(discourse_kwargs)

    words = read_words(word_path, dialect)
    ind = 0
    for w in words:
        word_kwargs = {at.output_name: (at.attribute, w[at.output_name])
                       for at in annotation_types}
        word = Word(**word_kwargs)
        word_token_kwargs = dict()
        for at in annotation_types:
            if at.ignored:
                continue
            word_token_kwargs[at.output_name] = (at.attribute, w[at.output_name])
            word_token_kwargs['word'] = word
            if at.attribute.att_type == 'tier':
                if at.attribute.is_default:
                    begin = w['begin']
                    end = w['end']
                    word_token_kwargs['begin'] = begin if begin is not None else ind
                    word_token_kwargs['end'] = end if end is not None else ind + 1
                if at.token:
                    word_token_kwargs['_transcription'] = (at.attribute, w['transcription'])
        word_token = WordToken(**word_token_kwargs)
        word.wordtokens.append(word_token)
        discourse.lexicon.add_word(word)
        discourse.add_word(word_token)
        ind += 1

    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(feature_matrix)
        discourse.lexicon.specifier = modernize.modernize_specifier(discourse.lexicon.specifier)
    return discourse

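# Hedged usage sketch (illustrative Buckeye-style paths): ``inspect_discourse_multiple_files``
# is assumed to exist alongside this loader and to build AnnotationTypes for the dialect;
# if your version does not provide it, supply the AnnotationTypes built by the GUI parser instead.
def _example_load_buckeye_discourse():
    word_path = '/data/buckeye/s0101a.words'    # hypothetical words file
    phone_path = '/data/buckeye/s0101a.phones'  # hypothetical phones file
    annotation_types = inspect_discourse_multiple_files(word_path, 'buckeye')
    return load_discourse_multiple_files('s0101a', word_path, phone_path,
                                         'buckeye', annotation_types)
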
def load_directory_multiple_files(corpus_name, path, dialect,
                                  annotation_types=None, feature_system_path=None,
                                  stop_check=None, call_back=None):
    """
    Loads a directory of corpus standard files (separated into words files
    and phones files)

    Parameters
    ----------
    corpus_name : str
        Name of corpus
    path : str
        Path to directory of text files
    dialect : str
        One of 'buckeye' or 'timit'
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse the glosses.
        Auto-generated based on dialect.
    feature_system_path : str, optional
        File path of FeatureMatrix binary to specify segments
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    SpontaneousSpeechCorpus
        Corpus containing Discourses corresponding to the text files
    """
    if call_back is not None:
        call_back('Finding files...')
        call_back(0, 0)
    file_tuples = []
    for root, subdirs, files in os.walk(path):
        for filename in files:
            if stop_check is not None and stop_check():
                return
            if not (filename.lower().endswith('.words') or filename.lower().endswith('.wrd')):
                continue
            file_tuples.append((root, filename))
    if call_back is not None:
        call_back('Parsing files...')
        call_back(0, len(file_tuples))
    cur = 0
    corpus = SpontaneousSpeechCorpus(corpus_name, path)
    for i, t in enumerate(file_tuples):
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            call_back('Parsing file {} of {}...'.format(i + 1, len(file_tuples)))
            call_back(i)
        root, filename = t
        name, ext = os.path.splitext(filename)
        if ext == '.words':
            phone_ext = '.phones'
        else:
            phone_ext = '.phn'
        word_path = os.path.join(root, filename)
        phone_path = os.path.splitext(word_path)[0] + phone_ext
        d = load_discourse_multiple_files(name, word_path, phone_path, dialect,
                                          annotation_types, corpus.lexicon,
                                          None, stop_check, None)
        corpus.add_discourse(d)
    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        corpus.lexicon.set_feature_matrix(feature_matrix)
    return corpus

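# Hedged usage sketch (illustrative directory): walks a directory of paired
# .words/.phones (or .wrd/.phn) files and returns a SpontaneousSpeechCorpus,
# letting annotation types be auto-generated from the dialect.
def _example_load_buckeye_directory():
    return load_directory_multiple_files('buckeye_s01', '/data/buckeye/s01', 'buckeye')
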
def check_feature_coverage_csv(corpus_name, path, delimiter, annotation_types=None,
                               feature_system_path=None, stop_check=None, call_back=None):
    """
    Check that every segment in a column-delimited corpus file is covered by
    the feature system, printing any segments that are missing.
    """
    if feature_system_path is not None and os.path.exists(feature_system_path):
        feature_matrix = load_binary(feature_system_path)
        feature_matrix = modernize.modernize_specifier(feature_matrix)

    if annotation_types is None:
        annotation_types, delimiter = inspect_csv(path, coldelim=delimiter)
    for a in annotation_types:
        a.reset()

    missing = set()
    with open(path, encoding='utf-8-sig') as f:
        headers = f.readline()
        headers = headers.split(delimiter)
        if len(headers) == 1:
            e = DelimiterError(('Could not parse the corpus.\n'
                                'Check that the column delimiter you typed in matches '
                                'the one used in the file.'))
            raise e
        headers = annotation_types

        for line in f.readlines():
            line = line.strip()
            if not line:
                continue
            for k, v in zip(headers, line.split(delimiter)):
                v = v.strip()
                if k.attribute.att_type == 'tier':
                    ignored = k.ignored_characters
                    if ignored is not None:
                        v = ''.join(x for x in v if x not in ignored)
                    sd = k.syllable_delimiter
                    if sd is not None:
                        syllables = v.split(sd)
                    else:
                        syllables = [v]
                    td = k.trans_delimiter
                    stress_spec = set(k.stress_specification.keys())
                    tone_spec = set(k.tone_specification.keys())
                    supra_spec = stress_spec.union(tone_spec)
                    for syllable in syllables:
                        syllable = ''.join(x for x in syllable if x not in supra_spec)
                        if td is None:
                            if k.digraph_pattern is not None:
                                string = k.digraph_pattern.findall(syllable)
                            else:
                                string = [x for x in syllable]
                        else:
                            string = syllable.split(td)
                        for seg in string:
                            if seg == '':
                                continue
                            if seg not in feature_matrix.segments:
                                missing.add(seg)
    print('In csv.py', missing)

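# Hedged usage sketch (illustrative paths): runs the coverage check against a pickled
# FeatureMatrix; any segments absent from the feature system are printed by the function.
def _example_check_feature_coverage():
    check_feature_coverage_csv('example', '/data/example.txt', ',',
                               feature_system_path='/data/example.feature')
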
def load_corpus_csv(corpus_name, path, delimiter, annotation_types=None,
                    feature_system_path=None, stop_check=None, call_back=None):
    """
    Load a corpus from a column-delimited text file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    delimiter : str
        Character to use for splitting lines into columns
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    Corpus
        Corpus object generated from the text file
    """
    check_feature_coverage_csv(corpus_name, path, delimiter, annotation_types,
                               feature_system_path, stop_check, call_back)

    corpus = Corpus(corpus_name)
    if feature_system_path is not None and os.path.exists(feature_system_path):
        feature_matrix = load_binary(feature_system_path)
        feature_matrix = modernize.modernize_specifier(feature_matrix)
        corpus.set_feature_matrix(feature_matrix)

    if annotation_types is None:
        annotation_types, delimiter = inspect_csv(path, coldelim=delimiter)
    for a in annotation_types:
        a.reset()

    if call_back is not None:
        call_back('Loading...')
        call_back(0, 0)
    cur = 0

    with open(path, encoding='utf-8-sig') as f:
        headers = f.readline()
        headers = headers.split(delimiter)
        if len(headers) == 1:
            e = DelimiterError(('Could not parse the corpus.\n'
                                'Check that the column delimiter you typed in matches '
                                'the one used in the file.'))
            raise e
        headers = annotation_types
        for a in headers:
            corpus.add_attribute(a.attribute)

        trans_check = True
        for line in f.readlines():
            if stop_check is not None and stop_check():
                return
            if call_back is not None:
                cur += 1
                call_back(cur)
            line = line.strip()
            if not line:  # blank or just a newline
                continue
            d = {}
            for k, v in zip(headers, line.split(delimiter)):
                v = v.strip()
                if k.attribute.att_type == 'tier':
                    trans = parse_transcription(v, k, feature_matrix=feature_matrix, corpus=corpus)
                    # trans is a list of BaseAnnotation
                    if not trans_check and len(trans) > 1:
                        trans_check = True
                    d[k.attribute.name] = (k.attribute, trans)
                else:
                    d[k.attribute.name] = (k.attribute, v)
            word = Word(**d)
            if word.transcription:
                # transcriptions can have phonetic symbol delimiters
                if not word.spelling:
                    word.spelling = ''.join(map(str, word.transcription))
            corpus.add_word(word, allow_duplicates=True)

    if corpus.specifier is not None:
        corpus.inventory.update_features(corpus.specifier)
    if corpus.has_transcription and any(len(word.transcription) > 1 for word in corpus):
        if not trans_check:
            e = DelimiterError(('Could not parse transcriptions with that delimiter. '
                                '\nCheck that the transcription delimiter you typed '
                                'in matches the one used in the file.'))
            raise e
    if stop_check is not None and stop_check():
        return
    return corpus

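# Hedged usage sketch (illustrative paths and delimiter): when no AnnotationTypes are
# passed, column types are inferred via ``inspect_csv`` inside the loader.
def _example_load_csv_corpus():
    return load_corpus_csv('example', '/data/example.txt', ',',
                           feature_system_path='/data/example.feature')
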
def load_corpus_csv(corpus_name, path, delimiter, annotation_types=None,
                    feature_system_path=None, stop_check=None, call_back=None):
    """
    Load a corpus from a column-delimited text file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    delimiter : str
        Character to use for splitting lines into columns
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    Corpus
        Corpus object generated from the text file
    """
    #begin = time.time()
    corpus = Corpus(corpus_name)
    if feature_system_path is not None and os.path.exists(feature_system_path):
        feature_matrix = load_binary(feature_system_path)
        corpus.set_feature_matrix(feature_matrix)

    if annotation_types is None:
        annotation_types, _ = inspect_csv(path, coldelim=delimiter)
    else:
        for a in annotation_types:
            if a.attribute.name == 'transcription' and a.attribute.att_type != 'tier':
                raise CorpusIntegrityError(('The column \'{}\' is currently '
                                            'not being parsed as transcriptions '
                                            'despite its name. Please ensure correct '
                                            'parsing for this column by changing its '
                                            '\'Annotation type\' in the parsing '
                                            'preview to the right.').format(a.name))
    for a in annotation_types:
        a.reset()

    with open(path, encoding='utf-8') as f:
        headers = f.readline()
        headers = headers.split(delimiter)
        if len(headers) == 1:
            e = DelimiterError(('Could not parse the corpus.\n'
                                'Check that the delimiter you typed in matches '
                                'the one used in the file.'))
            raise e
        headers = annotation_types
        for a in headers:
            corpus.add_attribute(a.attribute)

        trans_check = False
        for line in f.readlines():
            line = line.strip()
            if not line:  # blank or just a newline
                continue
            d = {}
            for k, v in zip(headers, line.split(delimiter)):
                v = v.strip()
                if k.attribute.att_type == 'tier':
                    trans = parse_transcription(v, k)
                    if not trans_check and len(trans) > 1:
                        trans_check = True
                    d[k.attribute.name] = (k.attribute, trans)
                else:
                    d[k.attribute.name] = (k.attribute, v)
            word = Word(**d)
            if word.transcription:
                # transcriptions can have phonetic symbol delimiters, which is a period
                if not word.spelling:
                    word.spelling = ''.join(map(str, word.transcription))
            corpus.add_word(word)

    if corpus.has_transcription and not trans_check:
        e = DelimiterError(('Could not parse transcriptions with that delimiter. '
                            '\nCheck that the transcription delimiter you typed '
                            'in matches the one used in the file.'))
        raise e

    transcription_errors = corpus.check_coverage()
    return corpus

def load_corpus_csv(corpus_name, path, delimiter, trans_delimiter,
                    annotation_types=None, feature_system_path=None,
                    stop_check=None, call_back=None):
    """
    Load a corpus from a column-delimited text file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    delimiter : str
        Character to use for splitting lines into columns
    trans_delimiter : str
        Character to use for splitting transcriptions into segments
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    Corpus
        Corpus object generated from the text file
    """
    #begin = time.time()
    corpus = Corpus(corpus_name)
    if feature_system_path is not None and os.path.exists(feature_system_path):
        feature_matrix = load_binary(feature_system_path)
        corpus.set_feature_matrix(feature_matrix)

    if annotation_types is None:
        annotation_types, best_delimiter = inspect_csv(path, coldelim=delimiter,
                                                       transdelim=trans_delimiter)
    else:
        # fall back to the caller-supplied column delimiter; without this,
        # best_delimiter would be undefined below when annotation_types are given
        best_delimiter = delimiter
        for a in annotation_types:
            if a.attribute.name == 'transcription' and a.attribute.att_type != 'tier':
                raise CorpusIntegrityError(('The column \'{}\' is currently '
                                            'not being parsed as transcriptions '
                                            'despite its name. Please ensure correct '
                                            'parsing for this column by changing its '
                                            '\'Annotation type\' in the parsing '
                                            'preview to the right.').format(a.name))
    for a in annotation_types:
        a.reset()

    with open(path, encoding='utf-8') as f:
        headers = f.readline()
        headers = headers.split(best_delimiter)
        if len(headers) == 1:
            e = DelimiterError(('Could not parse the corpus.\n'
                                'Check that the delimiter you typed in matches '
                                'the one used in the file.'))
            raise e
        headers = annotation_types
        for a in headers:
            corpus.add_attribute(a.attribute)

        trans_check = False
        for line in f.readlines():
            line = line.strip()
            if not line:  # blank or just a newline
                continue
            d = {}
            for k, v in zip(headers, line.split(best_delimiter)):
                v = v.strip()
                if k.attribute.att_type == 'tier':
                    trans = parse_transcription(v, k)
                    if not trans_check and len(trans) > 1:
                        trans_check = True
                    d[k.attribute.name] = (k.attribute, trans)
                else:
                    d[k.attribute.name] = (k.attribute, v)
            word = Word(**d)
            if word.transcription:
                # transcriptions can have phonetic symbol delimiters, which is a period
                if not word.spelling:
                    word.spelling = ''.join(map(str, word.transcription))
            corpus.add_word(word)

    if corpus.has_transcription and not trans_check:
        e = DelimiterError(('Could not parse transcriptions with that delimiter. '
                            '\nCheck that the transcription delimiter you typed '
                            'in matches the one used in the file.'))
        raise e

    transcription_errors = corpus.check_coverage()
    return corpus

def load_directory_multiple_files(corpus_name, path, dialect,
                                  annotation_types=None, feature_system_path=None,
                                  stop_check=None, call_back=None):
    """
    Loads a directory of corpus standard files (separated into words files
    and phones files)

    Parameters
    ----------
    corpus_name : str
        Name of corpus
    path : str
        Path to directory of text files
    dialect : str
        Currently only 'buckeye'
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse the glosses.
        Auto-generated based on dialect.
    feature_system_path : str, optional
        File path of FeatureMatrix binary to specify segments
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    SpontaneousSpeechCorpus
        Corpus containing Discourses corresponding to the text files
    """
    if call_back is not None:
        call_back('Finding files...')
        call_back(0, 0)
    file_tuples = []
    for root, subdirs, files in os.walk(path):
        for filename in files:
            if stop_check is not None and stop_check():
                return
            if not (filename.lower().endswith('.words') or filename.lower().endswith('.wrd')):
                continue
            file_tuples.append((root, filename))
    if call_back is not None:
        call_back('Parsing files...')
        call_back(0, len(file_tuples))
    cur = 0
    corpus = SpontaneousSpeechCorpus(corpus_name, path)
    for i, t in enumerate(file_tuples):
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            call_back('Parsing file {} of {}...'.format(i + 1, len(file_tuples)))
            call_back(i)
        root, filename = t
        name, ext = os.path.splitext(filename)
        if ext == '.words':
            phone_ext = '.phones'
        else:
            # .wrd files are paired with .phn files; without this branch
            # phone_ext would be undefined for them
            phone_ext = '.phn'
        word_path = os.path.join(root, filename)
        phone_path = os.path.splitext(word_path)[0] + phone_ext
        try:
            d = load_discourse_multiple_files(name, word_path, phone_path, dialect,
                                              annotation_types, corpus.lexicon,
                                              feature_system_path, stop_check, None)
            corpus.add_discourse(d)
        except ValueError:
            print('Error importing for participant ' + name)
    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        corpus.lexicon.set_feature_matrix(feature_matrix)
        corpus.lexicon.specifier = modernize.modernize_specifier(corpus.lexicon.specifier)
    return corpus

def main():

    #### Parse command-line arguments
    parser = argparse.ArgumentParser(
        description='Phonological CorpusTools: neighborhood density CL interface')
    parser.add_argument('corpus_file_name', help='Name of corpus file')
    parser.add_argument('query',
                        help='Word to query, or name of file including a list of words')
    parser.add_argument('-c', '--context_type', type=str, default='Canonical',
                        help="How to deal with variable pronunciations. Options are "
                             "'Canonical', 'MostFrequent', 'SeparatedTokens', or "
                             "'Weighted'. See documentation for details.")
    parser.add_argument('-a', '--algorithm', default='edit_distance',
                        help='The algorithm used to determine distance')
    parser.add_argument('-d', '--max_distance', type=int, default=1,
                        help='Maximum edit distance from the queried word to consider '
                             'a word a neighbor.')
    parser.add_argument('-s', '--sequence_type', default='transcription',
                        help='The name of the tier on which to calculate distance')
    parser.add_argument('-w', '--count_what', default='type',
                        help="If 'type', count neighbors in terms of their type frequency. "
                             "If 'token', count neighbors in terms of their token frequency.")
    parser.add_argument('-e', '--trans_delimiter', default='',
                        help="If not empty string, splits the query by this str to make "
                             "a transcription/spelling list for the query's Word object.")
    parser.add_argument('-m', '--find_mutation_minpairs', action='store_true',
                        help='This flag causes the script not to calculate neighborhood '
                             'density, but rather to find minimal pairs---see documentation.')
    parser.add_argument('-q', '--force_quadratic_algorithm', action='store_true',
                        help='This flag prevents PCT from using the more efficient '
                             'linear-time algorithm for edit distance of 1 neighborhoods.')
    parser.add_argument('-o', '--outfile', help='Name of output file')

    args = parser.parse_args()

    ####
    try:
        home = os.path.expanduser('~')
        corpus = load_binary(os.path.join(home, 'Documents', 'PCT', 'CorpusTools',
                                          'CORPUS', args.corpus_file_name))
    except FileNotFoundError:
        corpus = load_binary(args.corpus_file_name)

    if args.context_type == 'Canonical':
        corpus = CanonicalVariantContext(corpus, args.sequence_type,
                                         type_or_token=args.count_what)
    elif args.context_type == 'MostFrequent':
        corpus = MostFrequentVariantContext(corpus, args.sequence_type,
                                            type_or_token=args.count_what)
    elif args.context_type == 'SeparatedTokens':
        corpus = SeparatedTokensVariantContext(corpus, args.sequence_type,
                                               type_or_token=args.count_what)
    elif args.context_type == 'Weighted':
        corpus = WeightedVariantContext(corpus, args.sequence_type,
                                        type_or_token=args.count_what)

    if args.find_mutation_minpairs:
        query = ensure_query_is_word(args.query, corpus, args.sequence_type,
                                     args.trans_delimiter)
        matches = find_mutation_minpairs(corpus, query)
        for match in matches[1]:
            print(match)
        print('Total number of matches: {}'.format(str(matches[0])))
    else:
        try:  # read query as a file name
            with open(args.query) as queryfile:
                queries = [line[0] for line in csv.reader(queryfile, delimiter='\t')
                           if len(line) > 0]
            queries = [ensure_query_is_word(q, corpus, args.sequence_type,
                                            args.trans_delimiter) for q in queries]
            results = [neighborhood_density(corpus, q,
                                            algorithm=args.algorithm,
                                            max_distance=args.max_distance,
                                            force_quadratic=args.force_quadratic_algorithm)
                       for q in queries]
            if args.outfile:
                with open(args.outfile, 'w') as outfile:
                    for q, r in zip(queries, results):
                        outfile.write('{}\t{}'.format(q, str(r[0]))
                                      + ''.join(['\t{}'.format(str(n)) for n in r[1]])
                                      + '\n')
            else:
                raise Exception('In order to use a file of queries as input, you must '
                                'provide an output file name using the option -o.')
        except FileNotFoundError:  # read query as a single word
            query = ensure_query_is_word(args.query, corpus, args.sequence_type,
                                         args.trans_delimiter)
            result = neighborhood_density(corpus, query,
                                          algorithm=args.algorithm,
                                          max_distance=args.max_distance,
                                          force_quadratic=args.force_quadratic_algorithm)
            if args.outfile:
                with open(args.outfile, 'w') as outfile:
                    outfile.write('{}\t{}'.format(query, str(result[0]))
                                  + ''.join(['\t{}'.format(str(n)) for n in result[1]]))
            else:
                print('No output file name provided.')
                print('The neighborhood density of the given form is {}. For a list of '
                      'neighbors, please provide an output file name.'.format(str(result[0])))

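# Hedged example invocations (script and file names are illustrative assumptions; per the
# code above, the corpus is first looked up under ~/Documents/PCT/CorpusTools/CORPUS and
# otherwise treated as a direct path):
#   python pct_neighdens.py example.corpus cat -d 1 -o cat_neighbors.txt
#   python pct_neighdens.py example.corpus wordlist.txt -o results.txt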