Example #1
    def test_basic(self):
        corpus = Corpus('test')
        for w in self.corpus_basic_info:
            corpus.add_word(Word(**w))

        fm = FeatureMatrix('test', self.feature_basic_info)

        corpus.set_feature_matrix(fm)
Example #2
    def test_basic(self):
        corpus = Corpus('test')
        for w in self.corpus_basic_info:
            corpus.add_word(Word(**w))

        fm = FeatureMatrix('test', self.feature_basic_info)

        corpus.set_feature_matrix(fm)
Example #3
    def test_feats_to_segs(self):
        corpus = Corpus('test')
        for w in self.corpus_basic_info:
            corpus.add_word(Word(**w))

        fm = FeatureMatrix('test', self.feature_basic_info)

        corpus.set_feature_matrix(fm)

        self.assertEqual(sorted(corpus.features_to_segments(['+feature1'])),
                         sorted(['a', 'b']))
Example #4
    def test_coverage(self):
        corpus = Corpus('test')
        for w in self.corpus_basic_info:
            corpus.add_word(Word(**w))

        fm = FeatureMatrix('test', self.feature_no_d_info)

        corpus.set_feature_matrix(fm)

        self.assertEqual(corpus.check_coverage(), ['d'])
Example #5
    def test_coverage(self):
        corpus = Corpus('test')
        for w in self.corpus_basic_info:
            corpus.add_word(Word(**w))

        fm = FeatureMatrix('test', self.feature_no_d_info)

        corpus.set_feature_matrix(fm)

        self.assertEqual(corpus.check_coverage(), ['d'])
Example #6
    def test_feats_to_segs(self):
        corpus = Corpus('test')
        for w in self.corpus_basic_info:
            corpus.add_word(Word(**w))

        fm = FeatureMatrix('test', self.feature_basic_info)

        corpus.set_feature_matrix(fm)

        self.assertEqual(sorted(corpus.features_to_segments(['+feature1'])),
                         sorted(['a', 'b']))
Example #7
class EnvironmentFilterTest(unittest.TestCase):
    def setUp(self):
        self.corpus_info = [{'spelling': 'a', 'transcription': ['a', 'b'], 'frequency': 32.0},
                            {'spelling': 'b', 'transcription': ['a', 'b'], 'frequency': 32.0},
                            {'spelling': 'c', 'transcription': ['c', 'a', 'b'], 'frequency': 32.0},
                            {'spelling': 'd', 'transcription': ['a', 'd'], 'frequency': 32.0}]

        self.feature_info = [{'symbol': 'a', 'feature1': '+', 'feature2': '+'},
                             {'symbol': 'b', 'feature1': '+', 'feature2': '-'},
                             {'symbol': 'c', 'feature1': '-', 'feature2': '+'},
                             {'symbol': 'd', 'feature1': '-', 'feature2': '-'}]

        self.corpus = Corpus('test')
        for w in self.corpus_info:
            self.corpus.add_word(Word(**w))

        fm = FeatureMatrix('test', self.feature_info)

        self.corpus.set_feature_matrix(fm)
        self.corpus.inventory.update_features(self.corpus.specifier)

    def test_init(self):
        segs = self.corpus.features_to_segments('+feature1')
        envfilt = EnvironmentFilter(['a'], lhs=[segs])
        self.assertEqual(sorted(envfilt.lhs[0]), sorted(['a', 'b']))
        self.assertEqual(envfilt.rhs, None)

        segs = self.corpus.features_to_segments('-feature1')
        envfilt = EnvironmentFilter('a', rhs=[segs])
        self.assertEqual(sorted(envfilt.rhs[0]), sorted(['c', 'd']))
        self.assertEqual(envfilt.lhs, None)

        segs = self.corpus.features_to_segments('-feature1,-feature2')
        envfilt = EnvironmentFilter('a', rhs=[segs])
        self.assertEqual(sorted(envfilt.rhs[0]), sorted(['d']))

    def test_contains(self):
        segs = self.corpus.features_to_segments('+feature1')
        envfilt = EnvironmentFilter('a', lhs=[segs])
        env1 = Environment('a', None, lhs=['a'], rhs=['b'])
        env2 = Environment('a', None, lhs=['c'], rhs=['#'])
        env3 = Environment('a', None, lhs=['a'], rhs=['c'])

        self.assertTrue(env1 in envfilt)
        self.assertFalse(env2 in envfilt)

        segs = self.corpus.features_to_segments('+feature1')
        envfilt = EnvironmentFilter('a', rhs=[segs], lhs=[segs])
        self.assertTrue(env1 in envfilt)
        self.assertFalse(env2 in envfilt)
        self.assertFalse(env3 in envfilt)
Example #8
    def test_add_tier(self):
        corpus = Corpus('test')
        for w in self.corpus_basic_info:
            corpus.add_word(Word(**w))

        fm = FeatureMatrix('test', self.feature_basic_info)

        corpus.set_feature_matrix(fm)

        corpus.add_tier('t', '+feature1')
        self.assertEqual(corpus['d'].t, [corpus['d'].transcription[0]])

        corpus.remove_attribute('t')

        self.assertRaises(AttributeError, getattr, corpus['d'], 't')
Example #9
    def test_add_tier(self):
        corpus = Corpus('test')
        for w in self.corpus_basic_info:
            corpus.add_word(Word(**w))

        fm = FeatureMatrix('test', self.feature_basic_info)

        corpus.set_feature_matrix(fm)

        corpus.add_tier('t', '+feature1')
        self.assertEqual(corpus['d'].t, [corpus['d'].transcription[0]])

        corpus.remove_attribute('t')

        self.assertRaises(AttributeError, getattr, corpus['d'], 't')
Example #10
def load_corpus_csv(corpus_name, path, delimiter,
                    annotation_types = None,
                    feature_system_path = None,
                    stop_check = None, call_back = None):
    """
    Load a corpus from a column-delimited text file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    delimiter : str
        Character to use for splitting lines into columns
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    feature_system_path : str, optional
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    Corpus
        Corpus object generated from the text file

    """
    #begin = time.time()
    corpus = Corpus(corpus_name)
    if feature_system_path is not None and os.path.exists(feature_system_path):
        feature_matrix = load_binary(feature_system_path)
        corpus.set_feature_matrix(feature_matrix)

    if annotation_types is None:
        annotation_types, _ = inspect_csv(path, coldelim = delimiter)
    else:
        for a in annotation_types:
            if a.attribute.name == 'transcription' and a.attribute.att_type != 'tier':
                raise(CorpusIntegrityError(('The column \'{}\' is currently '
                                            'not being parsed as transcriptions '
                                            'despite its name.  Please ensure correct '
                                            'parsing for this column by changing its '
                                            '\'Annotation type\' in the parsing '
                                            'preview to the right.').format(a.name)))
    for a in annotation_types:
        a.reset()

    with open(path, encoding='utf-8') as f:
        headers = f.readline()
        headers = headers.split(delimiter)
        if len(headers) == 1:
            e = DelimiterError(('Could not parse the corpus.\nCheck '
                                'that the delimiter you typed in matches '
                                'the one used in the file.'))
            raise(e)
        headers = annotation_types
        for a in headers:
            corpus.add_attribute(a.attribute)
        trans_check = False

        for line in f.readlines():
            line = line.strip()
            if not line: #blank or just a newline
                continue
            d = {}
            for k, v in zip(headers, line.split(delimiter)):
                v = v.strip()
                if k.attribute.att_type == 'tier':
                    trans = parse_transcription(v, k)
                    if not trans_check and len(trans) > 1:
                        trans_check = True
                    d[k.attribute.name] = (k.attribute, trans)
                else:
                    d[k.attribute.name] = (k.attribute, v)
            word = Word(**d)
            if word.transcription:
                # transcriptions can use a phonetic symbol delimiter (a period),
                # so build any missing spelling from the segment symbols
                if not word.spelling:
                    word.spelling = ''.join(map(str, word.transcription))

            corpus.add_word(word)
    if corpus.has_transcription and not trans_check:
        e = DelimiterError(('Could not parse transcriptions with that delimiter. '
                            '\nCheck that the transcription delimiter you typed '
                            'in matches the one used in the file.'))
        raise(e)

    transcription_errors = corpus.check_coverage()
    return corpus
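A rough usage sketch of the loader above; the corpus name, file path, and comma delimiter are illustrative assumptions rather than values from the source:

# Hypothetical call with placeholder paths: load a comma-delimited corpus file,
# optionally attaching a previously pickled feature system.
corpus = load_corpus_csv('example_corpus', '/path/to/example_corpus.txt', ',',
                         feature_system_path='/path/to/features.feature')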
Example #11
class EnvironmentFilterTest(unittest.TestCase):
    def setUp(self):
        self.corpus_info = [
            {
                'spelling': 'a',
                'transcription': ['a', 'b'],
                'frequency': 32.0
            },
            {
                'spelling': 'b',
                'transcription': ['a', 'b'],
                'frequency': 32.0
            },
            {
                'spelling': 'c',
                'transcription': ['c', 'a', 'b'],
                'frequency': 32.0
            },
            {
                'spelling': 'd',
                'transcription': ['a', 'd'],
                'frequency': 32.0
            },
        ]

        self.feature_info = [{
            'symbol': 'a',
            'feature1': '+',
            'feature2': '+'
        }, {
            'symbol': 'b',
            'feature1': '+',
            'feature2': '-'
        }, {
            'symbol': 'c',
            'feature1': '-',
            'feature2': '+'
        }, {
            'symbol': 'd',
            'feature1': '-',
            'feature2': '-'
        }]

        self.corpus = Corpus('test')
        for w in self.corpus_info:
            self.corpus.add_word(Word(**w))

        fm = FeatureMatrix('test', self.feature_info)

        self.corpus.set_feature_matrix(fm)
        self.corpus.inventory.update_features(self.corpus.specifier)

    def test_init(self):
        segs = self.corpus.features_to_segments('+feature1')
        envfilt = EnvironmentFilter(['a'], lhs=[segs])
        self.assertEqual(sorted(envfilt.lhs[0]), sorted(['a', 'b']))
        self.assertEqual(envfilt.rhs, None)

        segs = self.corpus.features_to_segments('-feature1')
        envfilt = EnvironmentFilter('a', rhs=[segs])
        self.assertEqual(sorted(envfilt.rhs[0]), sorted(['c', 'd']))
        self.assertEqual(envfilt.lhs, None)

        segs = self.corpus.features_to_segments('-feature1,-feature2')
        envfilt = EnvironmentFilter('a', rhs=[segs])
        self.assertEqual(sorted(envfilt.rhs[0]), sorted(['d']))

    def test_contains(self):
        segs = self.corpus.features_to_segments('+feature1')
        envfilt = EnvironmentFilter('a', lhs=[segs])
        env1 = Environment('a', None, lhs=['a'], rhs=['b'])
        env2 = Environment('a', None, lhs=['c'], rhs=['#'])
        env3 = Environment('a', None, lhs=['a'], rhs=['c'])

        self.assertTrue(env1 in envfilt)
        self.assertFalse(env2 in envfilt)

        segs = self.corpus.features_to_segments('+feature1')
        envfilt = EnvironmentFilter('a', rhs=[segs], lhs=[segs])
        self.assertTrue(env1 in envfilt)
        self.assertFalse(env2 in envfilt)
        self.assertFalse(env3 in envfilt)
Example #12
def load_corpus_csv(corpus_name, path, delimiter,
                    trans_delimiter,
                    annotation_types = None,
                    feature_system_path = None,
                    stop_check = None, call_back = None):
    """
    Load a corpus from a column-delimited text file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    delimiter : str
        Character to use for splitting lines into columns
    trans_delimiter : str
        Character to use for splitting transcriptions into segments
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    feature_system_path : str, optional
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    Corpus
        Corpus object generated from the text file

    """
    #begin = time.time()
    corpus = Corpus(corpus_name)
    if feature_system_path is not None and os.path.exists(feature_system_path):
        feature_matrix = load_binary(feature_system_path)
        corpus.set_feature_matrix(feature_matrix)

    if annotation_types is None:
        annotation_types, best_delimiter = inspect_csv(path, coldelim = delimiter, transdelim=trans_delimiter)
    else:
        # annotation types were supplied, so fall back to the given column delimiter
        best_delimiter = delimiter
        for a in annotation_types:
            if a.attribute.name == 'transcription' and a.attribute.att_type != 'tier':
                raise(CorpusIntegrityError(('The column \'{}\' is currently '
                                            'not being parsed as transcriptions '
                                            'despite its name.  Please ensure correct '
                                            'parsing for this column by changing its '
                                            '\'Annotation type\' in the parsing '
                                            'preview to the right.').format(a.name)))
    for a in annotation_types:
        a.reset()

    with open(path, encoding='utf-8') as f:
        headers = f.readline()
        headers = headers.split(best_delimiter)
        if len(headers) == 1:
            e = DelimiterError(('Could not parse the corpus.\nCheck '
                                'that the delimiter you typed in matches '
                                'the one used in the file.'))
            raise(e)
        headers = annotation_types
        for a in headers:
            corpus.add_attribute(a.attribute)
        trans_check = False

        for line in f.readlines():
            line = line.strip()
            if not line: #blank or just a newline
                continue
            d = {}
            for k, v in zip(headers, line.split(best_delimiter)):
                v = v.strip()
                if k.attribute.att_type == 'tier':
                    trans = parse_transcription(v, k)
                    if not trans_check and len(trans) > 1:
                        trans_check = True
                    d[k.attribute.name] = (k.attribute, trans)
                else:
                    d[k.attribute.name] = (k.attribute, v)
            word = Word(**d)
            if word.transcription:
                # transcriptions can use a phonetic symbol delimiter (a period),
                # so build any missing spelling from the segment symbols
                if not word.spelling:
                    word.spelling = ''.join(map(str, word.transcription))

            corpus.add_word(word)
    if corpus.has_transcription and not trans_check:
        e = DelimiterError(('Could not parse transcriptions with that delimiter. '
                            '\nCheck that the transcription delimiter you typed '
                            'in matches the one used in the file.'))
        raise(e)

    transcription_errors = corpus.check_coverage()
    return corpus
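A similar sketch for this variant, which additionally takes a transcription delimiter; the paths and delimiter characters are illustrative assumptions:

# Hypothetical call with placeholder paths: columns are tab-separated and
# transcription segments are separated by periods.
corpus = load_corpus_csv('example_corpus', '/path/to/example_corpus.txt', '\t', '.')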