def test_basic(self):
    # Smoke test: a corpus built from the basic fixtures accepts a
    # feature matrix without raising.
    corpus = Corpus('test')
    for word_info in self.corpus_basic_info:
        corpus.add_word(Word(**word_info))
    matrix = FeatureMatrix('test', self.feature_basic_info)
    corpus.set_feature_matrix(matrix)
def test_homographs(self):
    """Homograph lookup: find() should yield every word spelled 'a'.

    Disabled pending a decision on the Corpus.find API -- should find()
    return an iterable of homographs instead of a single Word?
    """
    # Fixed: a bare early `return` made this test silently report success;
    # skipTest() makes the disabled state visible in the test runner.
    self.skipTest('Corpus.find does not yet return an iterable of homographs')
    corpus = Corpus('test')
    for w in self.homograph_info:
        corpus.add_word(Word(**w))
    self.assertEqual([x.spelling for x in corpus.find('a')], ['a', 'a'])
def test_homographs(self):
    # NOTE(review): disabled by the early return below -- the open question
    # is whether find() should return an iterable of homographs.
    return
    corpus = Corpus('test')
    for word_info in self.homograph_info:
        corpus.add_word(Word(**word_info))
    spellings = [entry.spelling for entry in corpus.find('a')]
    self.assertEqual(spellings, ['a', 'a'])
def test_basic(self):
    # Every word must be absent before insertion and retrievable (by
    # indexing and by find) afterwards; the inventory accumulates all
    # segments seen, plus the word boundary '#'.
    corpus = Corpus('test')
    for info in self.basic_info:
        spelling = info['spelling']
        # Lookup before the word exists must raise KeyError.
        self.assertRaises(KeyError, corpus.find, spelling, True)
        corpus.add_word(Word(**info))
        self.assertEqual(corpus[spelling], Word(**info))
        self.assertEqual(corpus.find(spelling), Word(**info))
        self.assertTrue(spelling in corpus)
    expected = {symbol: Segment(symbol) for symbol in '#abcd'}
    self.assertEqual(corpus.inventory._data, expected)
def setUp(self):
    # Four-word toy corpus over the segments a-d, all equally frequent.
    self.corpus_info = [
        {'spelling': 'a', 'transcription': ['a', 'b'], 'frequency': 32.0},
        {'spelling': 'b', 'transcription': ['a', 'b'], 'frequency': 32.0},
        {'spelling': 'c', 'transcription': ['c', 'a', 'b'], 'frequency': 32.0},
        {'spelling': 'd', 'transcription': ['a', 'd'], 'frequency': 32.0},
    ]
    # Two binary features are enough to distinguish all four segments.
    self.feature_info = [
        {'symbol': 'a', 'feature1': '+', 'feature2': '+'},
        {'symbol': 'b', 'feature1': '+', 'feature2': '-'},
        {'symbol': 'c', 'feature1': '-', 'feature2': '+'},
        {'symbol': 'd', 'feature1': '-', 'feature2': '-'},
    ]
    self.corpus = Corpus('test')
    for word_info in self.corpus_info:
        self.corpus.add_word(Word(**word_info))
    matrix = FeatureMatrix('test', self.feature_info)
    self.corpus.set_feature_matrix(matrix)
    self.corpus.inventory.update_features(self.corpus.specifier)
def test_add_tier(self):
    # Adding a tier projects each word's transcription onto the segments
    # matching the feature spec; removing the attribute removes the tier.
    corpus = Corpus('test')
    for word_info in self.corpus_basic_info:
        corpus.add_word(Word(**word_info))
    corpus.set_feature_matrix(FeatureMatrix('test', self.feature_basic_info))
    corpus.add_tier('t', '+feature1')
    # Only the first segment of 'd' carries +feature1.
    self.assertEqual(corpus['d'].t, [corpus['d'].transcription[0]])
    corpus.remove_attribute('t')
    self.assertRaises(AttributeError, getattr, corpus['d'], 't')
class EnvironmentFilterTest(unittest.TestCase):
    """Tests for EnvironmentFilter construction and environment membership."""

    def setUp(self):
        # Toy corpus of four words over the segments a-d.
        self.corpus_info = [
            {'spelling': 'a', 'transcription': ['a', 'b'], 'frequency': 32.0},
            {'spelling': 'b', 'transcription': ['a', 'b'], 'frequency': 32.0},
            {'spelling': 'c', 'transcription': ['c', 'a', 'b'], 'frequency': 32.0},
            {'spelling': 'd', 'transcription': ['a', 'd'], 'frequency': 32.0},
        ]
        # Binary feature system distinguishing the four segments.
        self.feature_info = [
            {'symbol': 'a', 'feature1': '+', 'feature2': '+'},
            {'symbol': 'b', 'feature1': '+', 'feature2': '-'},
            {'symbol': 'c', 'feature1': '-', 'feature2': '+'},
            {'symbol': 'd', 'feature1': '-', 'feature2': '-'},
        ]
        self.corpus = Corpus('test')
        for word_info in self.corpus_info:
            self.corpus.add_word(Word(**word_info))
        matrix = FeatureMatrix('test', self.feature_info)
        self.corpus.set_feature_matrix(matrix)
        self.corpus.inventory.update_features(self.corpus.specifier)

    def test_init(self):
        # Only the left-hand side is constrained; rhs stays None.
        segments = self.corpus.features_to_segments('+feature1')
        env_filter = EnvironmentFilter(['a'], lhs=[segments])
        self.assertEqual(sorted(env_filter.lhs[0]), sorted(['a', 'b']))
        self.assertEqual(env_filter.rhs, None)

        # Only the right-hand side is constrained; lhs stays None.
        segments = self.corpus.features_to_segments('-feature1')
        env_filter = EnvironmentFilter('a', rhs=[segments])
        self.assertEqual(sorted(env_filter.rhs[0]), sorted(['c', 'd']))
        self.assertEqual(env_filter.lhs, None)

        # A conjunction of feature values narrows the segment set.
        segments = self.corpus.features_to_segments('-feature1,-feature2')
        env_filter = EnvironmentFilter('a', rhs=[segments])
        self.assertEqual(sorted(env_filter.rhs[0]), sorted(['d']))

    def test_contains(self):
        segments = self.corpus.features_to_segments('+feature1')
        env_filter = EnvironmentFilter('a', lhs=[segments])
        env1 = Environment('a', None, lhs=['a'], rhs=['b'])
        env2 = Environment('a', None, lhs=['c'], rhs=['#'])
        env3 = Environment('a', None, lhs=['a'], rhs=['c'])
        self.assertTrue(env1 in env_filter)
        self.assertFalse(env2 in env_filter)

        # With both sides constrained, env3 now fails on its right context.
        segments = self.corpus.features_to_segments('+feature1')
        env_filter = EnvironmentFilter('a', rhs=[segments], lhs=[segments])
        self.assertTrue(env1 in env_filter)
        self.assertFalse(env2 in env_filter)
        self.assertFalse(env3 in env_filter)
def test_basic(self):
    # Words are absent before insertion, retrievable afterwards, and the
    # segment inventory grows to cover '#' plus every segment seen.
    corpus = Corpus('test')
    for entry in self.basic_info:
        # find() with the error flag must raise before the word is added.
        self.assertRaises(KeyError, corpus.find, entry['spelling'], True)
        corpus.add_word(Word(**entry))
        expected_word = Word(**entry)
        self.assertEqual(corpus[entry['spelling']], expected_word)
        self.assertEqual(corpus.find(entry['spelling']), expected_word)
        self.assertTrue(entry['spelling'] in corpus)
    self.assertEqual(
        corpus.inventory._data,
        {sym: Segment(sym) for sym in ('#', 'a', 'b', 'c', 'd')})
def test_add_tier(self):
    # A tier keeps only the segments matching its feature spec, and
    # removing the attribute makes the tier inaccessible again.
    corpus = Corpus('test')
    for entry in self.corpus_basic_info:
        corpus.add_word(Word(**entry))
    matrix = FeatureMatrix('test', self.feature_basic_info)
    corpus.set_feature_matrix(matrix)
    corpus.add_tier('t', '+feature1')
    word_d = corpus['d']
    self.assertEqual(word_d.t, [word_d.transcription[0]])
    corpus.remove_attribute('t')
    self.assertRaises(AttributeError, getattr, corpus['d'], 't')
def unspecified_test_corpus():
    # Build the shared 15-word test corpus; no feature system is attached.
    corpus_data = [
        {'spelling': 'atema', 'transcription': ['ɑ', 't', 'e', 'm', 'ɑ'], 'frequency': 11.0},
        {'spelling': 'enuta', 'transcription': ['e', 'n', 'u', 't', 'ɑ'], 'frequency': 11.0},
        {'spelling': 'mashomisi', 'transcription': ['m', 'ɑ', 'ʃ', 'o', 'm', 'i', 's', 'i'], 'frequency': 5.0},
        {'spelling': 'mata', 'transcription': ['m', 'ɑ', 't', 'ɑ'], 'frequency': 2.0},
        {'spelling': 'nata', 'transcription': ['n', 'ɑ', 't', 'ɑ'], 'frequency': 2.0},
        {'spelling': 'sasi', 'transcription': ['s', 'ɑ', 's', 'i'], 'frequency': 139.0},
        {'spelling': 'shashi', 'transcription': ['ʃ', 'ɑ', 'ʃ', 'i'], 'frequency': 43.0},
        {'spelling': 'shisata', 'transcription': ['ʃ', 'i', 's', 'ɑ', 't', 'ɑ'], 'frequency': 3.0},
        {'spelling': 'shushoma', 'transcription': ['ʃ', 'u', 'ʃ', 'o', 'm', 'ɑ'], 'frequency': 126.0},
        {'spelling': 'ta', 'transcription': ['t', 'ɑ'], 'frequency': 67.0},
        {'spelling': 'tatomi', 'transcription': ['t', 'ɑ', 't', 'o', 'm', 'i'], 'frequency': 7.0},
        {'spelling': 'tishenishu', 'transcription': ['t', 'i', 'ʃ', 'e', 'n', 'i', 'ʃ', 'u'], 'frequency': 96.0},
        {'spelling': 'toni', 'transcription': ['t', 'o', 'n', 'i'], 'frequency': 33.0},
        {'spelling': 'tusa', 'transcription': ['t', 'u', 's', 'ɑ'], 'frequency': 32.0},
        {'spelling': 'ʃi', 'transcription': ['ʃ', 'i'], 'frequency': 2.0},
    ]
    corpus = Corpus('test')
    for word_info in corpus_data:
        corpus.add_word(Word(**word_info))
    return corpus
def test_coverage(self):
    # A feature matrix that omits 'd' must be flagged by check_coverage.
    corpus = Corpus('test')
    for word_info in self.corpus_basic_info:
        corpus.add_word(Word(**word_info))
    incomplete_matrix = FeatureMatrix('test', self.feature_no_d_info)
    corpus.set_feature_matrix(incomplete_matrix)
    self.assertEqual(corpus.check_coverage(), ['d'])
def unspecified_test_corpus():
    # Segments: ɑ, i, u, e, o, ʃ, t, m, n, s (10 segments)
    # Returns the shared 15-word fixture corpus with no feature system.
    corpus_data = [
        {'spelling': 'atema', 'transcription': ['ɑ', 't', 'e', 'm', 'ɑ'], 'frequency': 11.0},
        {'spelling': 'enuta', 'transcription': ['e', 'n', 'u', 't', 'ɑ'], 'frequency': 11.0},
        {'spelling': 'mashomisi', 'transcription': ['m', 'ɑ', 'ʃ', 'o', 'm', 'i', 's', 'i'], 'frequency': 5.0},
        {'spelling': 'mata', 'transcription': ['m', 'ɑ', 't', 'ɑ'], 'frequency': 2.0},
        {'spelling': 'nata', 'transcription': ['n', 'ɑ', 't', 'ɑ'], 'frequency': 2.0},
        {'spelling': 'sasi', 'transcription': ['s', 'ɑ', 's', 'i'], 'frequency': 139.0},
        {'spelling': 'shashi', 'transcription': ['ʃ', 'ɑ', 'ʃ', 'i'], 'frequency': 43.0},
        {'spelling': 'shisata', 'transcription': ['ʃ', 'i', 's', 'ɑ', 't', 'ɑ'], 'frequency': 3.0},
        {'spelling': 'shushoma', 'transcription': ['ʃ', 'u', 'ʃ', 'o', 'm', 'ɑ'], 'frequency': 126.0},
        {'spelling': 'ta', 'transcription': ['t', 'ɑ'], 'frequency': 67.0},
        {'spelling': 'tatomi', 'transcription': ['t', 'ɑ', 't', 'o', 'm', 'i'], 'frequency': 7.0},
        {'spelling': 'tishenishu', 'transcription': ['t', 'i', 'ʃ', 'e', 'n', 'i', 'ʃ', 'u'], 'frequency': 96.0},
        {'spelling': 'toni', 'transcription': ['t', 'o', 'n', 'i'], 'frequency': 33.0},
        {'spelling': 'tusa', 'transcription': ['t', 'u', 's', 'ɑ'], 'frequency': 32.0},
        {'spelling': 'ʃi', 'transcription': ['ʃ', 'i'], 'frequency': 2.0},
    ]
    result = Corpus('test')
    for entry in corpus_data:
        result.add_word(Word(**entry))
    return result
def test_feats_to_segs(self):
    # '+feature1' should pick out exactly the segments a and b.
    corpus = Corpus('test')
    for word_info in self.corpus_basic_info:
        corpus.add_word(Word(**word_info))
    corpus.set_feature_matrix(FeatureMatrix('test', self.feature_basic_info))
    matched = corpus.features_to_segments(['+feature1'])
    self.assertEqual(sorted(matched), sorted(['a', 'b']))
def test_basic(self):
    # Minimal construction check: words load and the feature matrix
    # attaches without error.
    corpus = Corpus('test')
    for entry in self.corpus_basic_info:
        corpus.add_word(Word(**entry))
    corpus.set_feature_matrix(FeatureMatrix('test', self.feature_basic_info))
def test_feats_to_segs(self):
    # Translating the feature spec '+feature1' back to segments should
    # yield a and b (order-insensitive).
    corpus = Corpus('test')
    for entry in self.corpus_basic_info:
        corpus.add_word(Word(**entry))
    matrix = FeatureMatrix('test', self.feature_basic_info)
    corpus.set_feature_matrix(matrix)
    self.assertEqual(
        sorted(corpus.features_to_segments(['+feature1'])),
        sorted(['a', 'b']))
def test_coverage(self):
    # check_coverage must report segments missing from the feature matrix;
    # here the matrix lacks an entry for 'd'.
    corpus = Corpus('test')
    for entry in self.corpus_basic_info:
        corpus.add_word(Word(**entry))
    matrix = FeatureMatrix('test', self.feature_no_d_info)
    corpus.set_feature_matrix(matrix)
    missing = corpus.check_coverage()
    self.assertEqual(missing, ['d'])
def setUp(self):
    # Shared fixture: four words over segments a-d with a two-feature
    # binary specification. (Inventory features are NOT updated here.)
    self.corpus_info = [
        {'spelling': 'a', 'transcription': ['a', 'b'], 'frequency': 32.0},
        {'spelling': 'b', 'transcription': ['a', 'b'], 'frequency': 32.0},
        {'spelling': 'c', 'transcription': ['c', 'a', 'b'], 'frequency': 32.0},
        {'spelling': 'd', 'transcription': ['a', 'd'], 'frequency': 32.0},
    ]
    self.feature_info = [
        {'symbol': 'a', 'feature1': '+', 'feature2': '+'},
        {'symbol': 'b', 'feature1': '+', 'feature2': '-'},
        {'symbol': 'c', 'feature1': '-', 'feature2': '+'},
        {'symbol': 'd', 'feature1': '-', 'feature2': '-'},
    ]
    self.corpus = Corpus('test')
    for entry in self.corpus_info:
        self.corpus.add_word(Word(**entry))
    self.corpus.set_feature_matrix(FeatureMatrix('test', self.feature_info))
class EnvironmentFilterTest(unittest.TestCase):
    """Tests for EnvironmentFilter construction and membership checks."""

    def setUp(self):
        # Four-word toy corpus over the segments a-d.
        self.corpus_info = [
            {'spelling': 'a', 'transcription': ['a', 'b'], 'frequency': 32.0},
            {'spelling': 'b', 'transcription': ['a', 'b'], 'frequency': 32.0},
            {'spelling': 'c', 'transcription': ['c', 'a', 'b'], 'frequency': 32.0},
            {'spelling': 'd', 'transcription': ['a', 'd'], 'frequency': 32.0},
        ]
        # Two binary features distinguish all four segments.
        self.feature_info = [
            {'symbol': 'a', 'feature1': '+', 'feature2': '+'},
            {'symbol': 'b', 'feature1': '+', 'feature2': '-'},
            {'symbol': 'c', 'feature1': '-', 'feature2': '+'},
            {'symbol': 'd', 'feature1': '-', 'feature2': '-'},
        ]
        self.corpus = Corpus('test')
        for w in self.corpus_info:
            self.corpus.add_word(Word(**w))
        fm = FeatureMatrix('test', self.feature_info)
        self.corpus.set_feature_matrix(fm)
        self.corpus.inventory.update_features(self.corpus.specifier)

    def test_init(self):
        # lhs constrained only; rhs stays None.
        segs = self.corpus.features_to_segments('+feature1')
        envfilt = EnvironmentFilter(['a'], lhs=[segs])
        self.assertEqual(sorted(envfilt.lhs[0]), sorted(['a', 'b']))
        self.assertEqual(envfilt.rhs, None)
        # rhs constrained only; lhs stays None.
        segs = self.corpus.features_to_segments('-feature1')
        envfilt = EnvironmentFilter('a', rhs=[segs])
        self.assertEqual(sorted(envfilt.rhs[0]), sorted(['c', 'd']))
        self.assertEqual(envfilt.lhs, None)
        # Conjoined feature values narrow the segment set to just 'd'.
        segs = self.corpus.features_to_segments('-feature1,-feature2')
        envfilt = EnvironmentFilter('a', rhs=[segs])
        self.assertEqual(sorted(envfilt.rhs[0]), sorted(['d']))

    def test_contains(self):
        # Filter constrained on the left side only.
        segs = self.corpus.features_to_segments('+feature1')
        envfilt = EnvironmentFilter('a', lhs=[segs])
        env1 = Environment('a', None, lhs=['a'], rhs=['b'])
        env2 = Environment('a', None, lhs=['c'], rhs=['#'])
        env3 = Environment('a', None, lhs=['a'], rhs=['c'])
        self.assertTrue(env1 in envfilt)
        self.assertFalse(env2 in envfilt)
        # Constraining both sides additionally rejects env3 (right context).
        segs = self.corpus.features_to_segments('+feature1')
        envfilt = EnvironmentFilter('a', rhs=[segs], lhs=[segs])
        self.assertTrue(env1 in envfilt)
        self.assertFalse(env2 in envfilt)
        self.assertFalse(env3 in envfilt)
def load_corpus_csv(corpus_name, path, delimiter,
                    annotation_types = None,
                    feature_system_path = None,
                    stop_check = None, call_back = None):
    """
    Load a corpus from a column-delimited text file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    delimiter : str
        Character to use for spliting lines into columns
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    Corpus
        Corpus object generated from the text file

    Raises
    ------
    CorpusIntegrityError
        If a column named 'transcription' is not parsed as a tier
    DelimiterError
        If the column or transcription delimiter does not match the file
    """
    corpus = Corpus(corpus_name)
    if feature_system_path is not None and os.path.exists(feature_system_path):
        feature_matrix = load_binary(feature_system_path)
        corpus.set_feature_matrix(feature_matrix)

    if annotation_types is None:
        annotation_types, _ = inspect_csv(path, coldelim = delimiter)
    else:
        for a in annotation_types:
            if a.attribute.name == 'transcription' and a.attribute.att_type != 'tier':
                raise CorpusIntegrityError(('The column \'{}\' is currently '
                                            'not being parsed as transcriptions '
                                            'despite its name. Please ensure correct '
                                            'parsing for this column by changing its '
                                            '\'Annotation type\' in the parsing '
                                            'preview to the right.').format(a.name))
    for a in annotation_types:
        a.reset()

    with open(path, encoding='utf-8') as f:
        headers = f.readline()
        headers = headers.split(delimiter)
        if len(headers) == 1:
            # A single column almost always means the wrong delimiter.
            # Fixed: the message previously contained the invalid escape
            # sequence '\C' instead of a blank line ('\n\n').
            raise DelimiterError(('Could not parse the corpus.\n\nCheck '
                                  'that the delimiter you typed in matches '
                                  'the one used in the file.'))
        # The header text itself is discarded on purpose: the annotation
        # types (inspected or caller-supplied) define the columns.
        headers = annotation_types
        for a in headers:
            corpus.add_attribute(a.attribute)
        trans_check = False
        # Stream the file line by line instead of materializing readlines().
        for line in f:
            line = line.strip()
            if not line:  # blank or just a newline
                continue
            d = {}
            for k, v in zip(headers, line.split(delimiter)):
                v = v.strip()
                if k.attribute.att_type == 'tier':
                    trans = parse_transcription(v, k)
                    # A multi-segment parse proves the transcription
                    # delimiter actually splits something.
                    if not trans_check and len(trans) > 1:
                        trans_check = True
                    d[k.attribute.name] = (k.attribute, trans)
                else:
                    d[k.attribute.name] = (k.attribute, v)
            word = Word(**d)
            if word.transcription:
                # transcriptions can have phonetic symbol delimiters which is a period
                if not word.spelling:
                    word.spelling = ''.join(map(str, word.transcription))
            corpus.add_word(word)
    if corpus.has_transcription and not trans_check:
        raise DelimiterError(('Could not parse transcriptions with that delimiter. '
                              '\n\nCheck that the transcription delimiter you typed '
                              'in matches the one used in the file.'))
    # NOTE(review): coverage errors are computed but not reported anywhere;
    # consider surfacing them to the caller.
    corpus.check_coverage()
    return corpus
def load_corpus_csv(corpus_name, path, delimiter, trans_delimiter,
                    annotation_types = None,
                    feature_system_path = None,
                    stop_check = None, call_back = None):
    """
    Load a corpus from a column-delimited text file

    Parameters
    ----------
    corpus_name : str
        Informative identifier to refer to corpus
    path : str
        Full path to text file
    delimiter : str
        Character to use for spliting lines into columns
    trans_delimiter : str
        Character to use for spliting transcriptions into segments
    annotation_types : list of AnnotationType, optional
        List of AnnotationType specifying how to parse text files
    feature_system_path : str
        Full path to pickled FeatureMatrix to use with the Corpus
    stop_check : callable, optional
        Optional function to check whether to gracefully terminate early
    call_back : callable, optional
        Optional function to supply progress information during the function

    Returns
    -------
    Corpus
        Corpus object generated from the text file

    Raises
    ------
    CorpusIntegrityError
        If a column named 'transcription' is not parsed as a tier
    DelimiterError
        If the column or transcription delimiter does not match the file
    """
    corpus = Corpus(corpus_name)
    if feature_system_path is not None and os.path.exists(feature_system_path):
        feature_matrix = load_binary(feature_system_path)
        corpus.set_feature_matrix(feature_matrix)

    if annotation_types is None:
        annotation_types, best_delimiter = inspect_csv(path, coldelim = delimiter,
                                                       transdelim = trans_delimiter)
    else:
        # Bug fix: best_delimiter was unbound on this branch, so supplying
        # annotation_types raised NameError at headers.split() below.
        best_delimiter = delimiter
        for a in annotation_types:
            if a.attribute.name == 'transcription' and a.attribute.att_type != 'tier':
                raise CorpusIntegrityError(('The column \'{}\' is currently '
                                            'not being parsed as transcriptions '
                                            'despite its name. Please ensure correct '
                                            'parsing for this column by changing its '
                                            '\'Annotation type\' in the parsing '
                                            'preview to the right.').format(a.name))
    for a in annotation_types:
        a.reset()

    with open(path, encoding='utf-8') as f:
        headers = f.readline()
        headers = headers.split(best_delimiter)
        if len(headers) == 1:
            # A single column almost always means the wrong delimiter.
            # Fixed: the message previously contained the invalid escape
            # sequence '\C' instead of a blank line ('\n\n').
            raise DelimiterError(('Could not parse the corpus.\n\nCheck '
                                  'that the delimiter you typed in matches '
                                  'the one used in the file.'))
        # The header text itself is discarded on purpose: the annotation
        # types (inspected or caller-supplied) define the columns.
        headers = annotation_types
        for a in headers:
            corpus.add_attribute(a.attribute)
        trans_check = False
        # Stream the file line by line instead of materializing readlines().
        for line in f:
            line = line.strip()
            if not line:  # blank or just a newline
                continue
            d = {}
            for k, v in zip(headers, line.split(best_delimiter)):
                v = v.strip()
                if k.attribute.att_type == 'tier':
                    trans = parse_transcription(v, k)
                    # A multi-segment parse proves the transcription
                    # delimiter actually splits something.
                    if not trans_check and len(trans) > 1:
                        trans_check = True
                    d[k.attribute.name] = (k.attribute, trans)
                else:
                    d[k.attribute.name] = (k.attribute, v)
            word = Word(**d)
            if word.transcription:
                # transcriptions can have phonetic symbol delimiters which is a period
                if not word.spelling:
                    word.spelling = ''.join(map(str, word.transcription))
            corpus.add_word(word)
    if corpus.has_transcription and not trans_check:
        raise DelimiterError(('Could not parse transcriptions with that delimiter. '
                              '\n\nCheck that the transcription delimiter you typed '
                              'in matches the one used in the file.'))
    # NOTE(review): coverage errors are computed but not reported anywhere;
    # consider surfacing them to the caller.
    corpus.check_coverage()
    return corpus