def test_basic(self):
    # Smoke test: a corpus built from the basic fixtures accepts a
    # feature matrix without raising.
    corpus = Corpus('test')
    for word_info in self.corpus_basic_info:
        corpus.add_word(Word(**word_info))
    matrix = FeatureMatrix('test', self.feature_basic_info)
    corpus.set_feature_matrix(matrix)
def test_homographs(self):
    """Homograph lookup: find() should yield every word spelled 'a'.

    Disabled pending a decision on the Corpus.find API -- should find()
    return an iterable of homographs instead of a single Word?
    """
    # Fixed: a bare early `return` made this test silently report success;
    # skipTest() makes the disabled state visible in the test runner.
    self.skipTest('Corpus.find does not yet return an iterable of homographs')
    corpus = Corpus('test')
    for w in self.homograph_info:
        corpus.add_word(Word(**w))
    self.assertEqual([x.spelling for x in corpus.find('a')], ['a', 'a'])
def test_homographs(self):
    # NOTE(review): disabled by the early return below -- the open question
    # is whether find() should return an iterable of homographs.
    return
    corpus = Corpus('test')
    for word_info in self.homograph_info:
        corpus.add_word(Word(**word_info))
    spellings = [entry.spelling for entry in corpus.find('a')]
    self.assertEqual(spellings, ['a', 'a'])
def test_basic(self):
    # Every word must be absent before insertion and retrievable (by
    # indexing and by find) afterwards; the inventory accumulates all
    # segments seen, plus the word boundary '#'.
    corpus = Corpus('test')
    for info in self.basic_info:
        spelling = info['spelling']
        # Lookup before the word exists must raise KeyError.
        self.assertRaises(KeyError, corpus.find, spelling, True)
        corpus.add_word(Word(**info))
        self.assertEqual(corpus[spelling], Word(**info))
        self.assertEqual(corpus.find(spelling), Word(**info))
        self.assertTrue(spelling in corpus)
    expected = {symbol: Segment(symbol) for symbol in '#abcd'}
    self.assertEqual(corpus.inventory._data, expected)
def setUp(self):
    # Four-word toy corpus over the segments a-d, all equally frequent.
    self.corpus_info = [
        {'spelling': 'a', 'transcription': ['a', 'b'], 'frequency': 32.0},
        {'spelling': 'b', 'transcription': ['a', 'b'], 'frequency': 32.0},
        {'spelling': 'c', 'transcription': ['c', 'a', 'b'], 'frequency': 32.0},
        {'spelling': 'd', 'transcription': ['a', 'd'], 'frequency': 32.0},
    ]
    # Two binary features are enough to distinguish all four segments.
    self.feature_info = [
        {'symbol': 'a', 'feature1': '+', 'feature2': '+'},
        {'symbol': 'b', 'feature1': '+', 'feature2': '-'},
        {'symbol': 'c', 'feature1': '-', 'feature2': '+'},
        {'symbol': 'd', 'feature1': '-', 'feature2': '-'},
    ]
    self.corpus = Corpus('test')
    for word_info in self.corpus_info:
        self.corpus.add_word(Word(**word_info))
    matrix = FeatureMatrix('test', self.feature_info)
    self.corpus.set_feature_matrix(matrix)
    self.corpus.inventory.update_features(self.corpus.specifier)
def test_add_tier(self):
    # Adding a tier projects each word's transcription onto the segments
    # matching the feature spec; removing the attribute removes the tier.
    corpus = Corpus('test')
    for word_info in self.corpus_basic_info:
        corpus.add_word(Word(**word_info))
    corpus.set_feature_matrix(FeatureMatrix('test', self.feature_basic_info))
    corpus.add_tier('t', '+feature1')
    # Only the first segment of 'd' carries +feature1.
    self.assertEqual(corpus['d'].t, [corpus['d'].transcription[0]])
    corpus.remove_attribute('t')
    self.assertRaises(AttributeError, getattr, corpus['d'], 't')
class EnvironmentFilterTest(unittest.TestCase):
    """Tests for EnvironmentFilter construction and environment membership."""

    def setUp(self):
        # Toy corpus of four words over the segments a-d.
        self.corpus_info = [
            {'spelling': 'a', 'transcription': ['a', 'b'], 'frequency': 32.0},
            {'spelling': 'b', 'transcription': ['a', 'b'], 'frequency': 32.0},
            {'spelling': 'c', 'transcription': ['c', 'a', 'b'], 'frequency': 32.0},
            {'spelling': 'd', 'transcription': ['a', 'd'], 'frequency': 32.0},
        ]
        # Binary feature system distinguishing the four segments.
        self.feature_info = [
            {'symbol': 'a', 'feature1': '+', 'feature2': '+'},
            {'symbol': 'b', 'feature1': '+', 'feature2': '-'},
            {'symbol': 'c', 'feature1': '-', 'feature2': '+'},
            {'symbol': 'd', 'feature1': '-', 'feature2': '-'},
        ]
        self.corpus = Corpus('test')
        for word_info in self.corpus_info:
            self.corpus.add_word(Word(**word_info))
        matrix = FeatureMatrix('test', self.feature_info)
        self.corpus.set_feature_matrix(matrix)
        self.corpus.inventory.update_features(self.corpus.specifier)

    def test_init(self):
        # Only the left-hand side is constrained; rhs stays None.
        segments = self.corpus.features_to_segments('+feature1')
        env_filter = EnvironmentFilter(['a'], lhs=[segments])
        self.assertEqual(sorted(env_filter.lhs[0]), sorted(['a', 'b']))
        self.assertEqual(env_filter.rhs, None)

        # Only the right-hand side is constrained; lhs stays None.
        segments = self.corpus.features_to_segments('-feature1')
        env_filter = EnvironmentFilter('a', rhs=[segments])
        self.assertEqual(sorted(env_filter.rhs[0]), sorted(['c', 'd']))
        self.assertEqual(env_filter.lhs, None)

        # A conjunction of feature values narrows the segment set.
        segments = self.corpus.features_to_segments('-feature1,-feature2')
        env_filter = EnvironmentFilter('a', rhs=[segments])
        self.assertEqual(sorted(env_filter.rhs[0]), sorted(['d']))

    def test_contains(self):
        segments = self.corpus.features_to_segments('+feature1')
        env_filter = EnvironmentFilter('a', lhs=[segments])
        env1 = Environment('a', None, lhs=['a'], rhs=['b'])
        env2 = Environment('a', None, lhs=['c'], rhs=['#'])
        env3 = Environment('a', None, lhs=['a'], rhs=['c'])
        self.assertTrue(env1 in env_filter)
        self.assertFalse(env2 in env_filter)

        # With both sides constrained, env3 now fails on its right context.
        segments = self.corpus.features_to_segments('+feature1')
        env_filter = EnvironmentFilter('a', rhs=[segments], lhs=[segments])
        self.assertTrue(env1 in env_filter)
        self.assertFalse(env2 in env_filter)
        self.assertFalse(env3 in env_filter)
def test_basic(self):
    # Words are absent before insertion, retrievable afterwards, and the
    # segment inventory grows to cover '#' plus every segment seen.
    corpus = Corpus('test')
    for entry in self.basic_info:
        # find() with the error flag must raise before the word is added.
        self.assertRaises(KeyError, corpus.find, entry['spelling'], True)
        corpus.add_word(Word(**entry))
        expected_word = Word(**entry)
        self.assertEqual(corpus[entry['spelling']], expected_word)
        self.assertEqual(corpus.find(entry['spelling']), expected_word)
        self.assertTrue(entry['spelling'] in corpus)
    self.assertEqual(
        corpus.inventory._data,
        {sym: Segment(sym) for sym in ('#', 'a', 'b', 'c', 'd')})
def test_add_tier(self):
    # A tier keeps only the segments matching its feature spec, and
    # removing the attribute makes the tier inaccessible again.
    corpus = Corpus('test')
    for entry in self.corpus_basic_info:
        corpus.add_word(Word(**entry))
    matrix = FeatureMatrix('test', self.feature_basic_info)
    corpus.set_feature_matrix(matrix)
    corpus.add_tier('t', '+feature1')
    word_d = corpus['d']
    self.assertEqual(word_d.t, [word_d.transcription[0]])
    corpus.remove_attribute('t')
    self.assertRaises(AttributeError, getattr, corpus['d'], 't')
def unspecified_test_corpus():
    # Build the shared 15-word test corpus; no feature system is attached.
    corpus_data = [
        {'spelling': 'atema', 'transcription': ['ɑ', 't', 'e', 'm', 'ɑ'], 'frequency': 11.0},
        {'spelling': 'enuta', 'transcription': ['e', 'n', 'u', 't', 'ɑ'], 'frequency': 11.0},
        {'spelling': 'mashomisi', 'transcription': ['m', 'ɑ', 'ʃ', 'o', 'm', 'i', 's', 'i'], 'frequency': 5.0},
        {'spelling': 'mata', 'transcription': ['m', 'ɑ', 't', 'ɑ'], 'frequency': 2.0},
        {'spelling': 'nata', 'transcription': ['n', 'ɑ', 't', 'ɑ'], 'frequency': 2.0},
        {'spelling': 'sasi', 'transcription': ['s', 'ɑ', 's', 'i'], 'frequency': 139.0},
        {'spelling': 'shashi', 'transcription': ['ʃ', 'ɑ', 'ʃ', 'i'], 'frequency': 43.0},
        {'spelling': 'shisata', 'transcription': ['ʃ', 'i', 's', 'ɑ', 't', 'ɑ'], 'frequency': 3.0},
        {'spelling': 'shushoma', 'transcription': ['ʃ', 'u', 'ʃ', 'o', 'm', 'ɑ'], 'frequency': 126.0},
        {'spelling': 'ta', 'transcription': ['t', 'ɑ'], 'frequency': 67.0},
        {'spelling': 'tatomi', 'transcription': ['t', 'ɑ', 't', 'o', 'm', 'i'], 'frequency': 7.0},
        {'spelling': 'tishenishu', 'transcription': ['t', 'i', 'ʃ', 'e', 'n', 'i', 'ʃ', 'u'], 'frequency': 96.0},
        {'spelling': 'toni', 'transcription': ['t', 'o', 'n', 'i'], 'frequency': 33.0},
        {'spelling': 'tusa', 'transcription': ['t', 'u', 's', 'ɑ'], 'frequency': 32.0},
        {'spelling': 'ʃi', 'transcription': ['ʃ', 'i'], 'frequency': 2.0},
    ]
    corpus = Corpus('test')
    for word_info in corpus_data:
        corpus.add_word(Word(**word_info))
    return corpus
def test_coverage(self):
    # A feature matrix that omits 'd' must be flagged by check_coverage.
    corpus = Corpus('test')
    for word_info in self.corpus_basic_info:
        corpus.add_word(Word(**word_info))
    incomplete_matrix = FeatureMatrix('test', self.feature_no_d_info)
    corpus.set_feature_matrix(incomplete_matrix)
    self.assertEqual(corpus.check_coverage(), ['d'])
def unspecified_test_corpus():
    # Segments: ɑ, i, u, e, o, ʃ, t, m, n, s (10 segments)
    # Returns the shared 15-word fixture corpus with no feature system.
    corpus_data = [
        {'spelling': 'atema', 'transcription': ['ɑ', 't', 'e', 'm', 'ɑ'], 'frequency': 11.0},
        {'spelling': 'enuta', 'transcription': ['e', 'n', 'u', 't', 'ɑ'], 'frequency': 11.0},
        {'spelling': 'mashomisi', 'transcription': ['m', 'ɑ', 'ʃ', 'o', 'm', 'i', 's', 'i'], 'frequency': 5.0},
        {'spelling': 'mata', 'transcription': ['m', 'ɑ', 't', 'ɑ'], 'frequency': 2.0},
        {'spelling': 'nata', 'transcription': ['n', 'ɑ', 't', 'ɑ'], 'frequency': 2.0},
        {'spelling': 'sasi', 'transcription': ['s', 'ɑ', 's', 'i'], 'frequency': 139.0},
        {'spelling': 'shashi', 'transcription': ['ʃ', 'ɑ', 'ʃ', 'i'], 'frequency': 43.0},
        {'spelling': 'shisata', 'transcription': ['ʃ', 'i', 's', 'ɑ', 't', 'ɑ'], 'frequency': 3.0},
        {'spelling': 'shushoma', 'transcription': ['ʃ', 'u', 'ʃ', 'o', 'm', 'ɑ'], 'frequency': 126.0},
        {'spelling': 'ta', 'transcription': ['t', 'ɑ'], 'frequency': 67.0},
        {'spelling': 'tatomi', 'transcription': ['t', 'ɑ', 't', 'o', 'm', 'i'], 'frequency': 7.0},
        {'spelling': 'tishenishu', 'transcription': ['t', 'i', 'ʃ', 'e', 'n', 'i', 'ʃ', 'u'], 'frequency': 96.0},
        {'spelling': 'toni', 'transcription': ['t', 'o', 'n', 'i'], 'frequency': 33.0},
        {'spelling': 'tusa', 'transcription': ['t', 'u', 's', 'ɑ'], 'frequency': 32.0},
        {'spelling': 'ʃi', 'transcription': ['ʃ', 'i'], 'frequency': 2.0},
    ]
    result = Corpus('test')
    for entry in corpus_data:
        result.add_word(Word(**entry))
    return result
def test_feats_to_segs(self):
    # '+feature1' should pick out exactly the segments a and b.
    corpus = Corpus('test')
    for word_info in self.corpus_basic_info:
        corpus.add_word(Word(**word_info))
    corpus.set_feature_matrix(FeatureMatrix('test', self.feature_basic_info))
    matched = corpus.features_to_segments(['+feature1'])
    self.assertEqual(sorted(matched), sorted(['a', 'b']))
def test_basic(self):
    # Minimal construction check: words load and the feature matrix
    # attaches without error.
    corpus = Corpus('test')
    for entry in self.corpus_basic_info:
        corpus.add_word(Word(**entry))
    corpus.set_feature_matrix(FeatureMatrix('test', self.feature_basic_info))
def test_feats_to_segs(self):
    # Translating the feature spec '+feature1' back to segments should
    # yield a and b (order-insensitive).
    corpus = Corpus('test')
    for entry in self.corpus_basic_info:
        corpus.add_word(Word(**entry))
    matrix = FeatureMatrix('test', self.feature_basic_info)
    corpus.set_feature_matrix(matrix)
    self.assertEqual(
        sorted(corpus.features_to_segments(['+feature1'])),
        sorted(['a', 'b']))
def test_coverage(self):
    # check_coverage must report segments missing from the feature matrix;
    # here the matrix lacks an entry for 'd'.
    corpus = Corpus('test')
    for entry in self.corpus_basic_info:
        corpus.add_word(Word(**entry))
    matrix = FeatureMatrix('test', self.feature_no_d_info)
    corpus.set_feature_matrix(matrix)
    missing = corpus.check_coverage()
    self.assertEqual(missing, ['d'])
def setUp(self):
    # Shared fixture: four words over segments a-d with a two-feature
    # binary specification. (Inventory features are NOT updated here.)
    self.corpus_info = [
        {'spelling': 'a', 'transcription': ['a', 'b'], 'frequency': 32.0},
        {'spelling': 'b', 'transcription': ['a', 'b'], 'frequency': 32.0},
        {'spelling': 'c', 'transcription': ['c', 'a', 'b'], 'frequency': 32.0},
        {'spelling': 'd', 'transcription': ['a', 'd'], 'frequency': 32.0},
    ]
    self.feature_info = [
        {'symbol': 'a', 'feature1': '+', 'feature2': '+'},
        {'symbol': 'b', 'feature1': '+', 'feature2': '-'},
        {'symbol': 'c', 'feature1': '-', 'feature2': '+'},
        {'symbol': 'd', 'feature1': '-', 'feature2': '-'},
    ]
    self.corpus = Corpus('test')
    for entry in self.corpus_info:
        self.corpus.add_word(Word(**entry))
    self.corpus.set_feature_matrix(FeatureMatrix('test', self.feature_info))
class EnvironmentFilterTest(unittest.TestCase):
    """Tests for EnvironmentFilter construction and membership checks."""

    def setUp(self):
        # Four-word toy corpus over the segments a-d.
        self.corpus_info = [
            {'spelling': 'a', 'transcription': ['a', 'b'], 'frequency': 32.0},
            {'spelling': 'b', 'transcription': ['a', 'b'], 'frequency': 32.0},
            {'spelling': 'c', 'transcription': ['c', 'a', 'b'], 'frequency': 32.0},
            {'spelling': 'd', 'transcription': ['a', 'd'], 'frequency': 32.0},
        ]
        # Two binary features distinguish all four segments.
        self.feature_info = [
            {'symbol': 'a', 'feature1': '+', 'feature2': '+'},
            {'symbol': 'b', 'feature1': '+', 'feature2': '-'},
            {'symbol': 'c', 'feature1': '-', 'feature2': '+'},
            {'symbol': 'd', 'feature1': '-', 'feature2': '-'},
        ]
        self.corpus = Corpus('test')
        for w in self.corpus_info:
            self.corpus.add_word(Word(**w))
        fm = FeatureMatrix('test', self.feature_info)
        self.corpus.set_feature_matrix(fm)
        self.corpus.inventory.update_features(self.corpus.specifier)

    def test_init(self):
        # lhs constrained only; rhs stays None.
        segs = self.corpus.features_to_segments('+feature1')
        envfilt = EnvironmentFilter(['a'], lhs=[segs])
        self.assertEqual(sorted(envfilt.lhs[0]), sorted(['a', 'b']))
        self.assertEqual(envfilt.rhs, None)
        # rhs constrained only; lhs stays None.
        segs = self.corpus.features_to_segments('-feature1')
        envfilt = EnvironmentFilter('a', rhs=[segs])
        self.assertEqual(sorted(envfilt.rhs[0]), sorted(['c', 'd']))
        self.assertEqual(envfilt.lhs, None)
        # Conjoined feature values narrow the segment set to just 'd'.
        segs = self.corpus.features_to_segments('-feature1,-feature2')
        envfilt = EnvironmentFilter('a', rhs=[segs])
        self.assertEqual(sorted(envfilt.rhs[0]), sorted(['d']))

    def test_contains(self):
        # Filter constrained on the left side only.
        segs = self.corpus.features_to_segments('+feature1')
        envfilt = EnvironmentFilter('a', lhs=[segs])
        env1 = Environment('a', None, lhs=['a'], rhs=['b'])
        env2 = Environment('a', None, lhs=['c'], rhs=['#'])
        env3 = Environment('a', None, lhs=['a'], rhs=['c'])
        self.assertTrue(env1 in envfilt)
        self.assertFalse(env2 in envfilt)
        # Constraining both sides additionally rejects env3 (right context).
        segs = self.corpus.features_to_segments('+feature1')
        envfilt = EnvironmentFilter('a', rhs=[segs], lhs=[segs])
        self.assertTrue(env1 in envfilt)
        self.assertFalse(env2 in envfilt)
        self.assertFalse(env3 in envfilt)
def load_corpus_csv(corpus_name, path, delimiter,
                    annotation_types = None,
                    feature_system_path = None,
                    stop_check = None, call_back = None):
    """
    Load a corpus from a column-delimited text file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    delimiter : str
        Character to use for spliting lines into columns
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    Corpus
        Corpus object generated from the text file

    Raises
    ------
    CorpusIntegrityError
        If a column named 'transcription' is not parsed as a tier
    DelimiterError
        If the column or transcription delimiter does not match the file
    """
    corpus = Corpus(corpus_name)
    if feature_system_path is not None and os.path.exists(feature_system_path):
        feature_matrix = load_binary(feature_system_path)
        corpus.set_feature_matrix(feature_matrix)

    if annotation_types is None:
        annotation_types, _ = inspect_csv(path, coldelim = delimiter)
    else:
        for a in annotation_types:
            if a.attribute.name == 'transcription' and a.attribute.att_type != 'tier':
                raise CorpusIntegrityError(('The column \'{}\' is currently '
                                            'not being parsed as transcriptions '
                                            'despite its name. Please ensure correct '
                                            'parsing for this column by changing its '
                                            '\'Annotation type\' in the parsing '
                                            'preview to the right.').format(a.name))
    for a in annotation_types:
        a.reset()

    with open(path, encoding='utf-8') as f:
        headers = f.readline()
        headers = headers.split(delimiter)
        if len(headers) == 1:
            # A single column almost always means the wrong delimiter.
            # Fixed: the message previously contained the invalid escape
            # sequence '\C' instead of a blank line ('\n\n').
            raise DelimiterError(('Could not parse the corpus.\n\nCheck '
                                  'that the delimiter you typed in matches '
                                  'the one used in the file.'))
        # The header text itself is discarded on purpose: the annotation
        # types (inspected or caller-supplied) define the columns.
        headers = annotation_types
        for a in headers:
            corpus.add_attribute(a.attribute)
        trans_check = False
        # Stream the file line by line instead of materializing readlines().
        for line in f:
            line = line.strip()
            if not line:  # blank or just a newline
                continue
            d = {}
            for k, v in zip(headers, line.split(delimiter)):
                v = v.strip()
                if k.attribute.att_type == 'tier':
                    trans = parse_transcription(v, k)
                    # A multi-segment parse proves the transcription
                    # delimiter actually splits something.
                    if not trans_check and len(trans) > 1:
                        trans_check = True
                    d[k.attribute.name] = (k.attribute, trans)
                else:
                    d[k.attribute.name] = (k.attribute, v)
            word = Word(**d)
            if word.transcription:
                # transcriptions can have phonetic symbol delimiters which is a period
                if not word.spelling:
                    word.spelling = ''.join(map(str, word.transcription))
            corpus.add_word(word)
    if corpus.has_transcription and not trans_check:
        raise DelimiterError(('Could not parse transcriptions with that delimiter. '
                              '\n\nCheck that the transcription delimiter you typed '
                              'in matches the one used in the file.'))
    # NOTE(review): coverage errors are computed but not reported anywhere;
    # consider surfacing them to the caller.
    corpus.check_coverage()
    return corpus
def load_corpus_csv(corpus_name, path, delimiter, trans_delimiter,
                    annotation_types = None,
                    feature_system_path = None,
                    stop_check = None, call_back = None):
    """
    Load a corpus from a column-delimited text file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    delimiter : str
        Character to use for spliting lines into columns
    trans_delimiter : str
        Character to use for spliting transcriptions into segments
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    Corpus
        Corpus object generated from the text file

    Raises
    ------
    CorpusIntegrityError
        If a column named 'transcription' is not parsed as a tier
    DelimiterError
        If the column or transcription delimiter does not match the file
    """
    corpus = Corpus(corpus_name)
    if feature_system_path is not None and os.path.exists(feature_system_path):
        feature_matrix = load_binary(feature_system_path)
        corpus.set_feature_matrix(feature_matrix)

    if annotation_types is None:
        annotation_types, best_delimiter = inspect_csv(path, coldelim = delimiter,
                                                       transdelim = trans_delimiter)
    else:
        # Bug fix: best_delimiter was unbound on this branch, so supplying
        # annotation_types raised NameError at headers.split() below.
        best_delimiter = delimiter
        for a in annotation_types:
            if a.attribute.name == 'transcription' and a.attribute.att_type != 'tier':
                raise CorpusIntegrityError(('The column \'{}\' is currently '
                                            'not being parsed as transcriptions '
                                            'despite its name. Please ensure correct '
                                            'parsing for this column by changing its '
                                            '\'Annotation type\' in the parsing '
                                            'preview to the right.').format(a.name))
    for a in annotation_types:
        a.reset()

    with open(path, encoding='utf-8') as f:
        headers = f.readline()
        headers = headers.split(best_delimiter)
        if len(headers) == 1:
            # A single column almost always means the wrong delimiter.
            # Fixed: the message previously contained the invalid escape
            # sequence '\C' instead of a blank line ('\n\n').
            raise DelimiterError(('Could not parse the corpus.\n\nCheck '
                                  'that the delimiter you typed in matches '
                                  'the one used in the file.'))
        # The header text itself is discarded on purpose: the annotation
        # types (inspected or caller-supplied) define the columns.
        headers = annotation_types
        for a in headers:
            corpus.add_attribute(a.attribute)
        trans_check = False
        # Stream the file line by line instead of materializing readlines().
        for line in f:
            line = line.strip()
            if not line:  # blank or just a newline
                continue
            d = {}
            for k, v in zip(headers, line.split(best_delimiter)):
                v = v.strip()
                if k.attribute.att_type == 'tier':
                    trans = parse_transcription(v, k)
                    # A multi-segment parse proves the transcription
                    # delimiter actually splits something.
                    if not trans_check and len(trans) > 1:
                        trans_check = True
                    d[k.attribute.name] = (k.attribute, trans)
                else:
                    d[k.attribute.name] = (k.attribute, v)
            word = Word(**d)
            if word.transcription:
                # transcriptions can have phonetic symbol delimiters which is a period
                if not word.spelling:
                    word.spelling = ''.join(map(str, word.transcription))
            corpus.add_word(word)
    if corpus.has_transcription and not trans_check:
        raise DelimiterError(('Could not parse transcriptions with that delimiter. '
                              '\n\nCheck that the transcription delimiter you typed '
                              'in matches the one used in the file.'))
    # NOTE(review): coverage errors are computed but not reported anywhere;
    # consider surfacing them to the caller.
    corpus.check_coverage()
    return corpus