def ensure_query_is_word(query, corpus, sequence_type, tier_type,
                         trans_delimiter='.', file_type=None):
    if isinstance(query, Word):
        query_word = query
    else:
        if tier_type.att_type == 'spelling':
            if file_type == sequence_type:
                query_word = Word(**{sequence_type: list(query)})
            else:
                query_word = query.replace(trans_delimiter, '')
                query_word = Word(**{sequence_type: list(query_word)})
        elif tier_type.att_type == 'tier':
            if file_type == sequence_type:
                new_query = parse(query, trans_delimiter)
                query_word = Word(**{sequence_type: new_query})
            else:
                try:
                    query_word = corpus.corpus.find(query)
                except KeyError:
                    new_query = parse(query, trans_delimiter)
                    query_word = Word(**{sequence_type: list(new_query)})
    return query_word
def test_init():
    word_type_only = {'begin': 0, 'end': 1,
                      'word': Word(**{'spelling': 'a',
                                      'transcription': ['a', 'b']})}
    word_type_and = {'begin': 0, 'end': 1,
                     'spelling': 'a2', 'transcription': ['a', 'b2'],
                     'word': Word(**{'spelling': 'a',
                                     'transcription': ['a', 'b']})}

    wt = WordToken(**word_type_only)
    assert wt.spelling == 'a'
    assert str(wt.transcription) == 'a.b'

    wt = WordToken(**word_type_and)
    assert wt.spelling == 'a2'
    assert str(wt.transcription) == 'a.b2'
def ensure_query_is_word(query, corpus, sequence_type, trans_delimiter):
    if isinstance(query, Word):
        query_word = query
    else:
        try:
            query_word = corpus.corpus.find(query)
        except KeyError:
            if trans_delimiter == '':
                query_word = Word(**{sequence_type: list(query)})
            else:
                query_word = Word(**{sequence_type: query.split(trans_delimiter)})
    return query_word
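# --- Hedged usage sketch (added for illustration; not part of the original source) ---
# Both ensure_query_is_word variants above normalize a query into a Word: a Word
# instance passes through unchanged, while a plain string is looked up in the
# corpus or, failing that, split on the transcription delimiter. The call below
# uses the second variant's signature; the import path is an assumption.
#
#     from corpustools.corpus.classes import Word
#
#     w = Word(spelling='ta', transcription=['t', 'ɑ'])
#     assert ensure_query_is_word(w, None, 'transcription', '.') is w
#
#     # A string query such as 't.ɑ' would instead be split on '.' into
#     # ['t', 'ɑ'] when the corpus lookup raises KeyError.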
def test_basic(self):
    corpus = Corpus('test')
    for w in self.corpus_basic_info:
        corpus.add_word(Word(**w))
    fm = FeatureMatrix('test', self.feature_basic_info)
    corpus.set_feature_matrix(fm)
def test_homographs(self):
    # Test is currently skipped via the early return below.
    return
    corpus = Corpus('test')
    for w in self.homograph_info:
        corpus.add_word(Word(**w))
    # Error: should find() return an iterable of homographs?
    self.assertEqual([x.spelling for x in corpus.find('a')], ['a', 'a'])
def test_basic(self):
    corpus = Corpus('test')
    for w in self.basic_info:
        self.assertRaises(KeyError, corpus.find, w['spelling'], True)
        corpus.add_word(Word(**w))
        self.assertEqual(corpus[w['spelling']], Word(**w))
        self.assertEqual(corpus.find(w['spelling']), Word(**w))
        self.assertTrue(w['spelling'] in corpus)
    self.assertEqual(corpus.inventory._data,
                     {'#': Segment('#'),
                      'a': Segment('a'),
                      'b': Segment('b'),
                      'c': Segment('c'),
                      'd': Segment('d')})
def test_init():
    word_tokens = [{'begin': 0, 'end': 1,
                    'word': Word(**{'spelling': 'a', 'transcription': ['a', 'b']})},
                   {'begin': 1, 'end': 2,
                    'word': Word(**{'spelling': 'c', 'transcription': ['c', 'a', 'b']})},
                   {'begin': 2, 'end': 3,
                    'word': Word(**{'spelling': 'a', 'transcription': ['a', 'b']})},
                   {'begin': 3, 'end': 4,
                    'word': Word(**{'spelling': 'd', 'transcription': ['a', 'd']})}]
    d = Discourse()
    for wt in word_tokens:
        d.add_word(WordToken(**wt))
def test_spelling_only_word(self):
    t = Word(**self.spelling_only)
    self.assertEqual(t.transcription, None)
    self.assertEqual(t.frequency, float(self.spelling_only['frequency']))
    self.assertEqual(t.spelling, self.spelling_only['spelling'])
    self.assertRaises(AttributeError, getattr, t, 'tier1')
def test_basic_corpus_mutation_minpairs(specified_test_corpus):
    calls = [({'query': Word(**{'transcription': ['s', 'ɑ', 't', 'ɑ']})}, 2)]
    with CanonicalVariantContext(specified_test_corpus,
                                 'transcription', 'type') as c:
        for kwargs, v in calls:
            result = find_mutation_minpairs(c, **kwargs)
            assert result[0] == v
            assert sorted(result[1]) == sorted(['n.ɑ.t.ɑ', 'm.ɑ.t.ɑ'])
def test_coverage(self):
    corpus = Corpus('test')
    for w in self.corpus_basic_info:
        corpus.add_word(Word(**w))
    fm = FeatureMatrix('test', self.feature_no_d_info)
    corpus.set_feature_matrix(fm)
    self.assertEqual(corpus.check_coverage(), ['d'])
def test_no_freq_word(self):
    t = Word(**self.no_freq)
    self.assertEqual(str(t.transcription), '.'.join(self.no_freq['transcription']))
    self.assertEqual(t.frequency, 0)
    self.assertEqual(t.spelling, self.no_freq['spelling'])
    self.assertRaises(AttributeError, getattr, t, 'tier1')
def test_trans_only_word(self):
    t = Word(**self.trans_only)
    self.assertEqual(str(t.transcription), '.'.join(self.trans_only['transcription']))
    self.assertEqual(t.frequency, float(self.trans_only['frequency']))
    self.assertEqual(t.spelling, 'abcd')
    self.assertRaises(AttributeError, getattr, t, 'tier1')
def test_basic_word(self):
    t = Word(**self.basic)
    self.assertEqual(str(t.transcription), '.'.join(self.basic['transcription']))
    self.assertEqual(t.frequency, float(self.basic['frequency']))
    self.assertEqual(t.spelling, self.basic['spelling'])
    self.assertRaises(AttributeError, getattr, t, 'tier1')
def test_feats_to_segs(self):
    corpus = Corpus('test')
    for w in self.corpus_basic_info:
        corpus.add_word(Word(**w))
    fm = FeatureMatrix('test', self.feature_basic_info)
    corpus.set_feature_matrix(fm)
    self.assertEqual(sorted(corpus.features_to_segments(['+feature1'])),
                     sorted(['a', 'b']))
def test_tiered_word(self):
    t = Word(**self.tiered)
    self.assertEqual(str(t.transcription), '.'.join(self.tiered['transcription']))
    self.assertEqual(t.frequency, float(self.tiered['frequency']))
    self.assertEqual(t.spelling, self.tiered['spelling'])
    self.assertEqual(t.tier1, self.tiered['tier1'])
    self.assertEqual(t.tier2, self.tiered['tier2'])
def test_extra_word(self):
    t = Word(**self.extra)
    self.assertEqual(str(t.transcription), '.'.join(self.extra['transcription']))
    self.assertEqual(t.spelling, self.extra['spelling'])
    self.assertEqual(t.frequency, float(self.extra['frequency']))
    self.assertEqual(t.num_sylls, float(self.extra['num_sylls']))
    self.assertEqual(t.some_other_label, self.extra['some_other_label'])
def load_words_neighden(path):
    output = list()
    with open(path, 'r') as f:
        for line in f:
            fields = [x for x in line.strip().split(None) if x != '']
            if len(fields) > 1:
                # Second column is a '.'-delimited transcription.
                fields[1] = fields[1].split('.')
                fields = Word(spelling=fields[0], transcription=fields[1])
            elif len(fields) == 1:
                # Spelling-only line.
                fields = fields[0]
            else:
                continue
            output.append(fields)
    return output
def setUp(self):
    self.corpus_info = [
        {'spelling': 'a', 'transcription': ['a', 'b'], 'frequency': 32.0},
        {'spelling': 'b', 'transcription': ['a', 'b'], 'frequency': 32.0},
        {'spelling': 'c', 'transcription': ['c', 'a', 'b'], 'frequency': 32.0},
        {'spelling': 'd', 'transcription': ['a', 'd'], 'frequency': 32.0},
    ]
    self.feature_info = [
        {'symbol': 'a', 'feature1': '+', 'feature2': '+'},
        {'symbol': 'b', 'feature1': '+', 'feature2': '-'},
        {'symbol': 'c', 'feature1': '-', 'feature2': '+'},
        {'symbol': 'd', 'feature1': '-', 'feature2': '-'},
    ]
    self.corpus = Corpus('test')
    for w in self.corpus_info:
        self.corpus.add_word(Word(**w))
    fm = FeatureMatrix('test', self.feature_info)
    self.corpus.set_feature_matrix(fm)
    self.corpus.inventory.update_features(self.corpus.specifier)
def test_add_tier(self):
    corpus = Corpus('test')
    for w in self.corpus_basic_info:
        corpus.add_word(Word(**w))
    fm = FeatureMatrix('test', self.feature_basic_info)
    corpus.set_feature_matrix(fm)
    corpus.add_tier('t', '+feature1')
    self.assertEqual(corpus['d'].t, [corpus['d'].transcription[0]])
    corpus.remove_attribute('t')
    self.assertRaises(AttributeError, getattr, corpus['d'], 't')
def accept(self):
    kwargs = {}
    for a in self.corpus.attributes:
        if a.att_type == 'tier':
            text = self.edits['transcription'].text()
            if text == 'Empty':
                text = ''
            kwargs[a.name] = [x for x in text.split('.') if x != '']
            #if not kwargs[a.name]:
            #    reply = QMessageBox.critical(self,
            #            "Missing information", "Words must have a Transcription.".format(str(a)))
            #    return
            for i in list(dict.fromkeys(kwargs[a.name])):
                if i not in self.inventory.segs:
                    reply = QMessageBox.critical(self,
                            'Invalid information',
                            'The transcription can only contain symbols '
                            'from the corpus\' inventory.')
                    return
        elif a.att_type == 'spelling':
            kwargs[a.name] = self.edits['spelling'].text()
            if kwargs[a.name] == '':  # and a.name == 'spelling':
                kwargs[a.name] = None
            if a.name != 'Spelling':
                kwargs['_spelling_name'] = a.name
            #if not kwargs[a.name] and a.name == 'spelling':
            #    reply = QMessageBox.critical(self,
            #            "Missing information", "Words must have a spelling.".format(str(a)))
            #    return
        elif a.att_type == 'numeric' and (hasattr(a, 'is_freq') or a.display_name == 'Frequency'):
            try:
                kwargs[a.name] = float(self.edits['frequency'].text())
            except ValueError:
                reply = QMessageBox.critical(self,
                        "Invalid information",
                        "The column '{}' must be a number.".format(str(a)))
                return
            if a.name != 'Frequency':
                kwargs['_freq_name'] = a.name
        elif a.att_type == 'factor':
            kwargs[a.name] = self.edits[a.name].text()
    self.word = Word(**kwargs)
    QDialog.accept(self)
def test_corpus_model(qtbot, specified_test_corpus, settings):
    model = CorpusModel(specified_test_corpus, settings)
    assert model.headerData(0, Qt.Horizontal, Qt.DisplayRole) == 'Spelling'
    assert model.headerData(1, Qt.Horizontal, Qt.DisplayRole) == 'Transcription'
    assert model.headerData(2, Qt.Horizontal, Qt.DisplayRole) == 'Frequency'

    a = Attribute('test', 'spelling', 'Test2')
    model.addColumn(a)
    assert model.headerData(3, Qt.Horizontal, Qt.DisplayRole) == 'Test2'
    model.removeAttributes(['Test2'])
    assert len(model.columns) == 3

    a = Attribute('test', 'factor', 'Test')
    model.addAbstractTier(a, {'C': ['t', 'm']})
    assert model.wordObject(0).test == 'CC'
    model.removeAttributes(['Test'])

    a = Attribute('test', 'numeric', 'Test')
    model.addCountColumn(a, 'transcription', ['t', 'm'])
    assert model.wordObject(0).test == 2
    model.removeAttributes(['Test'])

    a = Attribute('test', 'tier', 'Test')
    model.addTier(a, ['t', 'm'])
    assert model.wordObject(0).test == ['t', 'm']
    model.removeAttributes(['Test'])

    w = model.wordObject(0)
    assert w.spelling == 'atema'

    w = Word(spelling='atema', transcription=[])
    model.replaceWord(0, w)
    w = model.wordObject(0)
    assert w.spelling == 'atema' and w.transcription == []

    model.hideNonLexical(True)
    w = model.wordObject(0)
    assert w.spelling != 'atema'

    model.hideNonLexical(False)
    w = model.wordObject(0)
    assert w.spelling == 'atema'
def accept(self):
    kwargs = {}
    for a in self.corpus.attributes:
        if a.att_type == 'tier':
            text = self.edits[a.name].text()
            if text == 'Empty':
                text = ''
            kwargs[a.name] = [x for x in text.split('.') if x != '']
            #if not kwargs[a.name]:
            #    reply = QMessageBox.critical(self,
            #            "Missing information", "Words must have a Transcription.".format(str(a)))
            #    return
            for i in kwargs[a.name]:
                if i not in self.corpus.inventory:
                    reply = QMessageBox.critical(self,
                            "Invalid information",
                            "The column '{}' must contain only symbols in "
                            "the corpus' inventory.".format(str(a)))
                    return
        elif a.att_type == 'spelling':
            kwargs[a.name] = self.edits[a.name].text()
            if kwargs[a.name] == '' and a.name == 'spelling':
                kwargs[a.name] = None
            #if not kwargs[a.name] and a.name == 'spelling':
            #    reply = QMessageBox.critical(self,
            #            "Missing information", "Words must have a spelling.".format(str(a)))
            #    return
        elif a.att_type == 'numeric':
            try:
                kwargs[a.name] = float(self.edits[a.name].text())
            except ValueError:
                reply = QMessageBox.critical(self,
                        "Invalid information",
                        "The column '{}' must be a number.".format(str(a)))
                return
        elif a.att_type == 'factor':
            kwargs[a.name] = self.edits[a.name].text()
    self.word = Word(**kwargs)
    QDialog.accept(self)
def unspecified_test_corpus():
    # Segments: ɑ, i, u, e, o, ʃ, t, m, n, s (10 segments)
    corpus_data = [
        {'spelling': 'atema', 'transcription': ['ɑ', 't', 'e', 'm', 'ɑ'], 'frequency': 11.0},
        {'spelling': 'enuta', 'transcription': ['e', 'n', 'u', 't', 'ɑ'], 'frequency': 11.0},
        {'spelling': 'mashomisi', 'transcription': ['m', 'ɑ', 'ʃ', 'o', 'm', 'i', 's', 'i'], 'frequency': 5.0},
        {'spelling': 'mata', 'transcription': ['m', 'ɑ', 't', 'ɑ'], 'frequency': 2.0},
        {'spelling': 'nata', 'transcription': ['n', 'ɑ', 't', 'ɑ'], 'frequency': 2.0},
        {'spelling': 'sasi', 'transcription': ['s', 'ɑ', 's', 'i'], 'frequency': 139.0},
        {'spelling': 'shashi', 'transcription': ['ʃ', 'ɑ', 'ʃ', 'i'], 'frequency': 43.0},
        {'spelling': 'shisata', 'transcription': ['ʃ', 'i', 's', 'ɑ', 't', 'ɑ'], 'frequency': 3.0},
        {'spelling': 'shushoma', 'transcription': ['ʃ', 'u', 'ʃ', 'o', 'm', 'ɑ'], 'frequency': 126.0},
        {'spelling': 'ta', 'transcription': ['t', 'ɑ'], 'frequency': 67.0},
        {'spelling': 'tatomi', 'transcription': ['t', 'ɑ', 't', 'o', 'm', 'i'], 'frequency': 7.0},
        {'spelling': 'tishenishu', 'transcription': ['t', 'i', 'ʃ', 'e', 'n', 'i', 'ʃ', 'u'], 'frequency': 96.0},
        {'spelling': 'toni', 'transcription': ['t', 'o', 'n', 'i'], 'frequency': 33.0},
        {'spelling': 'tusa', 'transcription': ['t', 'u', 's', 'ɑ'], 'frequency': 32.0},
        {'spelling': 'ʃi', 'transcription': ['ʃ', 'i'], 'frequency': 2.0},
    ]
    corpus = Corpus('test')
    for w in corpus_data:
        corpus.add_word(Word(**w))
    return corpus
def load_corpus_csv(corpus_name, path, delimiter,
                    annotation_types=None,
                    feature_system_path=None,
                    stop_check=None, call_back=None):
    """
    Load a corpus from a column-delimited text file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    delimiter : str
        Character to use for splitting lines into columns
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    Corpus
        Corpus object generated from the text file
    """
    #begin = time.time()
    corpus = Corpus(corpus_name)
    if feature_system_path is not None and os.path.exists(feature_system_path):
        feature_matrix = load_binary(feature_system_path)
        corpus.set_feature_matrix(feature_matrix)
    if annotation_types is None:
        annotation_types, _ = inspect_csv(path, coldelim=delimiter)
    else:
        for a in annotation_types:
            if a.attribute.name == 'transcription' and a.attribute.att_type != 'tier':
                raise(CorpusIntegrityError(('The column \'{}\' is currently '
                                            'not being parsed as transcriptions '
                                            'despite its name. Please ensure correct '
                                            'parsing for this column by changing its '
                                            '\'Annotation type\' in the parsing '
                                            'preview to the right.').format(a.name)))
    for a in annotation_types:
        a.reset()
    with open(path, encoding='utf-8') as f:
        headers = f.readline()
        headers = headers.split(delimiter)
        if len(headers) == 1:
            e = DelimiterError(('Could not parse the corpus.\n'
                                'Check that the delimiter you typed in matches '
                                'the one used in the file.'))
            raise(e)
        headers = annotation_types
        for a in headers:
            corpus.add_attribute(a.attribute)
        trans_check = False
        for line in f.readlines():
            line = line.strip()
            if not line:  # blank or just a newline
                continue
            d = {}
            for k, v in zip(headers, line.split(delimiter)):
                v = v.strip()
                if k.attribute.att_type == 'tier':
                    trans = parse_transcription(v, k)
                    if not trans_check and len(trans) > 1:
                        trans_check = True
                    d[k.attribute.name] = (k.attribute, trans)
                else:
                    d[k.attribute.name] = (k.attribute, v)
            word = Word(**d)
            if word.transcription:
                # transcriptions can have phonetic symbol delimiters, which is a period
                if not word.spelling:
                    word.spelling = ''.join(map(str, word.transcription))
            corpus.add_word(word)
    if corpus.has_transcription and not trans_check:
        e = DelimiterError(('Could not parse transcriptions with that delimiter.\n'
                            'Check that the transcription delimiter you typed '
                            'in matches the one used in the file.'))
        raise(e)
    transcription_errors = corpus.check_coverage()
    return corpus
def load_discourse_multiple_files(corpus_name, word_path, phone_path,
                                  dialect, annotation_types=None,
                                  lexicon=None, feature_system_path=None,
                                  stop_check=None, call_back=None):
    """
    Load a discourse from a text file containing interlinear glosses

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    word_path : str
        Full path to words text file
    phone_path : str
        Full path to phones text file
    dialect : str
        Currently, only 'buckeye'
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse the glosses.
        Auto-generated based on dialect.
    lexicon : Corpus, optional
        Corpus to store Discourse word information
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable or None
        Optional function to check whether to gracefully terminate early
    call_back : callable or None
        Optional function to supply progress information during the loading

    Returns
    -------
    Discourse
        Discourse object generated from the text file
    """
    name = os.path.splitext(os.path.split(word_path)[1])[0]
    discourse_kwargs = {'name': name,
                        'wav_path': find_wav_path(word_path),
                        'other_attributes': list()}
    for at in annotation_types:
        if at.name == 'Orthography (default)':
            discourse_kwargs['spelling_name'] = at.attribute  #.output_name
        elif at.name == 'Transcription (default)':
            discourse_kwargs['transcription_name'] = at.attribute  #.output_name
        elif at.name == 'Other (character)' or at.attribute.att_type in ('tier', 'spelling'):
            discourse_kwargs['other_attributes'].append(at.attribute)
    discourse = Discourse(discourse_kwargs)
    words = read_words(word_path, dialect)
    ind = 0
    for w in words:
        word_kwargs = {at.output_name: (at.attribute, w[at.output_name])
                       for at in annotation_types}
        word = Word(**word_kwargs)
        word_token_kwargs = dict()
        for at in annotation_types:
            if at.ignored:
                continue
            word_token_kwargs[at.output_name] = (at.attribute, w[at.output_name])
            word_token_kwargs['word'] = word
            if at.attribute.att_type == 'tier':
                if at.attribute.is_default:
                    begin = w['begin']
                    end = w['end']
                    word_token_kwargs['begin'] = begin if begin is not None else ind
                    word_token_kwargs['end'] = end if end is not None else ind + 1
                if at.token:
                    word_token_kwargs['_transcription'] = (at.attribute, w['transcription'])
        word_token = WordToken(**word_token_kwargs)
        word.wordtokens.append(word_token)
        discourse.lexicon.add_word(word)
        discourse.add_word(word_token)
        ind += 1
    if feature_system_path is not None:
        feature_matrix = load_binary(feature_system_path)
        discourse.lexicon.set_feature_matrix(feature_matrix)
        discourse.lexicon.specifier = modernize.modernize_specifier(discourse.lexicon.specifier)
    return discourse
def test_word_init(self):
    t = Word(**self.basic)
    self.assertEqual(t.spelling, self.basic['spelling'])
    self.assertEqual(t.frequency, float(self.basic['frequency']))
    self.assertRaises(AttributeError, getattr, t, 'tier1')
def data_to_discourse2(corpus_name=None, wav_path=None, annotation_types=None,
                       support_corpus=None, ignore_case=False,
                       call_back=None, stop_check=None):
    curr_word = list()
    annotations = {at: list() for at in annotation_types}
    spelling_name, transcription_name = None, None
    if call_back is not None:
        call_back('Processing data...')
        cur = 0
    for at in annotation_types:
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            call_back(cur)
        if all(isinstance(item, Annotation) for item in at._list):
            # it's a list of spellings, take each one and add it to the overall annotations list
            for item in at._list:
                if item.label:
                    annotations[at].append((item.label, None, None))
        elif all(type(item) == BaseAnnotation for item in at._list):
            # it's a list of transcriptions, with each segment as a BaseAnnotation
            for item in at._list:
                if item.begin is not None:
                    begin = item.begin
                if item.end is None:
                    curr_word.append(item)
                elif item.end is not None:
                    end = item.end
                    curr_word.append(item)
                    curr_word = Transcription(curr_word)
                    annotations[at].append((curr_word, begin, end))
                    curr_word = list()
        else:
            print(at._list)
            raise TypeError("AnnotationType._list cannot contain a mix of Annotations and BaseAnnotations")

    if support_corpus is not None:
        spellings = [value for key, value in annotations.items() if key.name == 'Orthography (default)'][0]
        transcriptions = [key for key in annotations if key.name == 'Transcription'][0]
        for index, info in enumerate(spellings):
            spelling = info[0]  # info[1] is the start time, info[2] is the end time (or else None)
            try:
                transcription = support_corpus.find(spelling, ignore_case=ignore_case).transcription
            except KeyError:
                try:
                    no_punctuation = ''.join([x for x in spelling if not x in string.punctuation])
                    transcription = support_corpus.find(no_punctuation, ignore_case=ignore_case).transcription
                except KeyError:
                    transcription = Transcription([symbol for symbol in spelling])
            annotations[transcriptions].append((transcription, index, index + 1))

    discourse_kwargs = {'name': corpus_name, 'wav_path': wav_path, 'other_attributes': list()}
    for at in annotation_types:
        if at.name == 'Orthography (default)':
            discourse_kwargs['spelling_name'] = at.attribute
        elif at.name == 'Transcription (default)':
            discourse_kwargs['transcription_name'] = at.attribute
        elif at.name == 'Other (character)' or at.attribute.att_type in ('tier', 'spelling'):
            discourse_kwargs['other_attributes'].append(at.attribute)
    if 'spelling_name' not in discourse_kwargs:
        discourse_kwargs['spelling_name'] = Attribute('Spelling', 'spelling', 'Spelling')
    if 'transcription_name' not in discourse_kwargs:
        discourse_kwargs['transcription_name'] = Attribute('Transcription', 'tier', 'Transcription')
    if stop_check is not None and stop_check():
        return
    if call_back is not None:
        cur += 1
        call_back(cur)
    discourse = Discourse(discourse_kwargs)

    if not 'Frequency' in [a.name for a in discourse.lexicon.attributes]:
        # running text will not have a frequency attribute supplied by the user
        # textgrids are also unlikely to have this attribute
        discourse.lexicon.add_attribute(Attribute('frequency', 'numeric', 'Frequency'))
        add_frequency = True
    else:
        add_frequency = False

    ind = 0
    limit = max([len(list(v)) for v in annotations.values()])
    for n in range(limit):
        if stop_check is not None and stop_check():
            return
        if call_back is not None:
            cur += 1
            call_back(cur)
        word_kwargs = dict()
        for at in annotations:
            if at.token or at.ignored:
                continue
            else:
                try:
                    # annotations[at][n] should be a tuple of (curr_word, begin, end) or (item_label, None, None)
                    word_kwargs[at.attribute.name] = (at.attribute, annotations[at][n][0])
                except IndexError:
                    word_kwargs[at.attribute.name] = (at.attribute, None)
        word = Word(**word_kwargs)
        try:
            word = discourse.lexicon.find(word.spelling)
            if add_frequency:
                word.frequency += 1
        except KeyError:
            discourse.lexicon.add_word(word)
        word_token_kwargs = dict()
        word_token_kwargs['word'] = word
        begin, end = None, None
        for at in annotations:
            if at.ignored:
                continue
            try:
                # annotations[at][n] should be a tuple of (curr_word, begin, end) or (item_label, None, None)
                word_token_kwargs[at.attribute.name] = (at.attribute, annotations[at][n][0])
            except IndexError:
                word_token_kwargs[at.attribute.name] = (at.attribute, None)
            #word_token_kwargs[at.output_name] = (at.attribute, annotations[at][n][0])
            if at.attribute.att_type == 'tier':
                if at.attribute.is_default:
                    begin = annotations[at][n][1]
                    end = annotations[at][n][2]
                    word_token_kwargs['begin'] = begin if begin is not None else ind
                    word_token_kwargs['end'] = end if end is not None else ind + 1
                if at.token:
                    word_token_kwargs['_transcription'] = (at.attribute, annotations[at][n][0])
                    word_token_kwargs['begin'] = begin if begin is not None else ind
                    word_token_kwargs['end'] = end if end is not None else ind + 1
        word_token = WordToken(**word_token_kwargs)
        discourse.add_word(word_token)
        if any(a.token for a in annotations):
            word.wordtokens.append(word_token)
        ind += 1
    return discourse
def load_corpus_csv(corpus_name, path, delimiter, trans_delimiter,
                    annotation_types=None,
                    feature_system_path=None,
                    stop_check=None, call_back=None):
    """
    Load a corpus from a column-delimited text file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    delimiter : str
        Character to use for splitting lines into columns
    trans_delimiter : str
        Character to use for splitting transcriptions into segments
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    Corpus
        Corpus object generated from the text file
    """
    #begin = time.time()
    corpus = Corpus(corpus_name)
    if feature_system_path is not None and os.path.exists(feature_system_path):
        feature_matrix = load_binary(feature_system_path)
        corpus.set_feature_matrix(feature_matrix)
    if annotation_types is None:
        annotation_types, best_delimiter = inspect_csv(path, coldelim=delimiter,
                                                       transdelim=trans_delimiter)
    else:
        for a in annotation_types:
            if a.attribute.name == 'transcription' and a.attribute.att_type != 'tier':
                raise(CorpusIntegrityError(('The column \'{}\' is currently '
                                            'not being parsed as transcriptions '
                                            'despite its name. Please ensure correct '
                                            'parsing for this column by changing its '
                                            '\'Annotation type\' in the parsing '
                                            'preview to the right.').format(a.name)))
    for a in annotation_types:
        a.reset()
    with open(path, encoding='utf-8') as f:
        headers = f.readline()
        headers = headers.split(best_delimiter)
        if len(headers) == 1:
            e = DelimiterError(('Could not parse the corpus.\n'
                                'Check that the delimiter you typed in matches '
                                'the one used in the file.'))
            raise(e)
        headers = annotation_types
        for a in headers:
            corpus.add_attribute(a.attribute)
        trans_check = False
        for line in f.readlines():
            line = line.strip()
            if not line:  # blank or just a newline
                continue
            d = {}
            for k, v in zip(headers, line.split(best_delimiter)):
                v = v.strip()
                if k.attribute.att_type == 'tier':
                    trans = parse_transcription(v, k)
                    if not trans_check and len(trans) > 1:
                        trans_check = True
                    d[k.attribute.name] = (k.attribute, trans)
                else:
                    d[k.attribute.name] = (k.attribute, v)
            word = Word(**d)
            if word.transcription:
                # transcriptions can have phonetic symbol delimiters, which is a period
                if not word.spelling:
                    word.spelling = ''.join(map(str, word.transcription))
            corpus.add_word(word)
    if corpus.has_transcription and not trans_check:
        e = DelimiterError(('Could not parse transcriptions with that delimiter.\n'
                            'Check that the transcription delimiter you typed '
                            'in matches the one used in the file.'))
        raise(e)
    transcription_errors = corpus.check_coverage()
    return corpus
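# --- Hedged input sketch (added for illustration; header and values are invented) ---
# load_corpus_csv expects a header row followed by one word per line. With
# delimiter=',' and trans_delimiter='.', a file might look like:
#
#     spelling,transcription,frequency
#     ta,t.ɑ,67
#     mata,m.ɑ.t.ɑ,2
#
# Rows lacking a spelling are given one by joining their transcription symbols,
# and coverage against the feature matrix is checked after loading.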