Beispiel #1
0
def test_two_format_parsing(mfa_test_dir, graph_db):
    """Load the ``valid`` and ``invalid`` MFA fixture directories.

    The ``valid`` sub-directory of ``mfa_test_dir`` must load without
    raising; loading the ``invalid`` sub-directory must raise
    ``ParseError``.
    """
    # Files under "valid" are expected to import cleanly.
    valid_dir = os.path.join(mfa_test_dir, "valid")
    with CorpusContext('mfa_valid', **graph_db) as corpus:
        corpus.reset()
        valid_parser = inspect_mfa(valid_dir)
        corpus.load(valid_parser, valid_dir)

    # Files under "invalid" must be rejected at load time; inspection
    # itself stays outside the ``raises`` block, so it must not fail.
    invalid_dir = os.path.join(mfa_test_dir, "invalid")
    with CorpusContext('mfa_invalid', **graph_db) as corpus:
        corpus.reset()
        invalid_parser = inspect_mfa(invalid_dir)
        with pytest.raises(ParseError):
            corpus.load(invalid_parser, invalid_dir)
Beispiel #2
0
def test_mismatch_parser(timit_test_dir, graph_db):
    """Loading the TIMIT fixtures with an MFA parser must raise ParseError."""
    with CorpusContext('test_mismatch', **graph_db) as corpus:
        corpus.reset()
        # The parser is built by the MFA inspector but pointed at TIMIT data.
        mfa_parser = inspect_mfa(timit_test_dir)
        with pytest.raises(ParseError):
            corpus.load(mfa_parser, timit_test_dir)
Beispiel #3
0
    def run_query(self):
        """Reset the corpus named in ``self.kwargs`` and import its directory.

        Reads ``name``, ``directory``, ``call_back`` and ``stop_check`` from
        ``self.kwargs``. The parser is chosen from the corpus name
        ('buckeye' / 'timit') or, otherwise, from the guessed TextGrid
        format of the directory.

        Returns the value returned by ``CorpusContext.load``.
        """
        time.sleep(0.1)
        name = self.kwargs['name']
        directory = self.kwargs['directory']
        should_reset = True
        config = CorpusConfig(name, graph_host='localhost', graph_port=7474)
        with CorpusContext(config) as corpus:
            # Pick the inspector first, then call it once below.
            if name == 'buckeye':
                inspect = inspect_buckeye
            elif name == 'timit':
                inspect = inspect_timit
            else:
                textgrid_form = guess_textgrid_format(directory)
                if textgrid_form == 'labbcat':
                    inspect = inspect_labbcat
                elif textgrid_form == 'mfa':
                    inspect = inspect_mfa
                elif textgrid_form == 'fave':
                    inspect = inspect_fave
                else:
                    # Unrecognised format: use the generic TextGrid inspector.
                    inspect = inspect_textgrid
            parser = inspect(directory)

            on_progress = self.kwargs['call_back']
            parser.call_back = on_progress
            should_stop = self.kwargs['stop_check']
            parser.stop_check = should_stop
            parser.call_back('Resetting corpus...')
            if should_reset:
                corpus.reset(call_back=on_progress, stop_check=should_stop)
            could_not_parse = corpus.load(parser, directory)
        return could_not_parse
Beispiel #4
0
def loading(config, corpus_dir, textgrid_format):
    """Import ``corpus_dir`` into the corpus described by ``config``.

    Does nothing (beyond printing a message) when the corpus already
    exists. Exits the process with status 1 when ``corpus_dir`` is not an
    existing path. Any format name that is not recognised falls back to
    the MFA inspector.
    """
    with CorpusContext(config) as corpus:
        already_loaded = corpus.exists()
    if already_loaded:
        print('Corpus already loaded, skipping import.')
        return
    if not os.path.exists(corpus_dir):
        print('The path {} does not exist.'.format(corpus_dir))
        sys.exit(1)

    # Remaining exact-match format names -> name of the pgio inspector.
    other_inspectors = {
        "ilg": "inspect_ilg",
        "labbcat": "inspect_labbcat",
        "partitur": "inspect_partitur",
        "timit": "inspect_timit",
    }
    with CorpusContext(config) as corpus:
        print('loading')

        # NOTE(review): "csv" is routed to the Buckeye inspector, as in the
        # original code - confirm this is intended.
        if textgrid_format in ("buckeye", "csv"):
            inspector_name = "inspect_buckeye"
        elif textgrid_format.lower() == "fave":
            # "fave" is the only format matched case-insensitively.
            inspector_name = "inspect_fave"
        else:
            inspector_name = other_inspectors.get(textgrid_format,
                                                  "inspect_mfa")
        # Resolve by name so only the selected inspector is looked up.
        parser = getattr(pgio, inspector_name)(corpus_dir)
        corpus.load(parser, corpus_dir)
Beispiel #5
0
def loading(config, corpus_dir, textgrid_format):
    """Import ``corpus_dir`` into the corpus for ``config`` and time the load.

    Returns early (after printing a message) when the corpus already
    exists, and exits with status 1 when ``corpus_dir`` does not exist.
    The wall-clock duration of ``load`` is printed and passed to
    ``save_performance_benchmark`` under the label 'import'.
    """
    with CorpusContext(config) as corpus:
        already_loaded = corpus.exists()
    if already_loaded:
        print('Corpus already loaded, skipping import.')
        return
    if not os.path.exists(corpus_dir):
        print('The path {} does not exist.'.format(corpus_dir))
        sys.exit(1)

    # Remaining exact-match format names -> name of the pgio inspector.
    other_inspectors = {
        "ilg": "inspect_ilg",
        "labbcat": "inspect_labbcat",
        "partitur": "inspect_partitur",
        "timit": "inspect_timit",
    }
    with CorpusContext(config) as corpus:
        print('loading')

        # NOTE(review): "csv" is routed to the Buckeye inspector, as in the
        # original code - confirm this is intended.
        if textgrid_format in ("buckeye", "csv"):
            inspector_name = "inspect_buckeye"
        elif textgrid_format.lower() == "fave":
            # "fave" is the only format matched case-insensitively.
            inspector_name = "inspect_fave"
        else:
            # Anything unrecognised is treated as MFA output.
            inspector_name = other_inspectors.get(textgrid_format,
                                                  "inspect_mfa")
        parser = getattr(pgio, inspector_name)(corpus_dir)
        parser.call_back = call_back

        started = time.time()
        corpus.load(parser, corpus_dir)
        finished = time.time()
        time_taken = finished - started
        print('Loading took: {}'.format(time_taken))
    save_performance_benchmark(config, 'import', time_taken)
    def run_query(self):
        """Reset the corpus named in ``self.kwargs`` and import its directory.

        Reads ``name``, ``directory``, ``call_back`` and ``stop_check`` from
        ``self.kwargs``. The parser is chosen from the corpus name
        ('buckeye' / 'timit' / 'partitur') or, otherwise, from the guessed
        TextGrid format. ``actionCompleted`` is emitted with
        'importing corpus' once the load has finished.

        Returns the value returned by ``CorpusContext.load``.
        """
        time.sleep(0.1)
        name = self.kwargs['name']
        directory = self.kwargs['directory']
        should_reset = True
        config = CorpusConfig(name, graph_host='localhost', graph_port=7474)
        with CorpusContext(config) as corpus:
            # Pick the inspector first, then call it once below.
            if name == 'buckeye':
                inspect = inspect_buckeye
            elif name == 'timit':
                inspect = inspect_timit
            elif name == 'partitur':
                inspect = inspect_partitur
            else:
                textgrid_form = guess_textgrid_format(directory)
                if textgrid_form == 'labbcat':
                    inspect = inspect_labbcat
                elif textgrid_form == 'mfa':
                    inspect = inspect_mfa
                elif textgrid_form == 'fave':
                    inspect = inspect_fave
                else:
                    # Unrecognised format: use the generic TextGrid inspector.
                    inspect = inspect_textgrid
            parser = inspect(directory)

            on_progress = self.kwargs['call_back']
            parser.call_back = on_progress
            should_stop = self.kwargs['stop_check']
            parser.stop_check = should_stop
            parser.call_back('Resetting corpus...')
            if should_reset:
                corpus.reset(call_back=on_progress, stop_check=should_stop)
            could_not_parse = corpus.load(parser, directory)
            self.actionCompleted.emit('importing corpus')
        return could_not_parse
Beispiel #7
0
def test_mismatch_parser(timit_test_dir, graph_db):
    """An MFA parser pointed at the TIMIT fixtures must fail with ParseError."""
    with CorpusContext('test_mismatch', **graph_db) as corpus:
        corpus.reset()
        wrong_parser = inspect_mfa(timit_test_dir)
        # Only the load itself is expected to raise, not the inspection.
        with pytest.raises(ParseError):
            corpus.load(wrong_parser, timit_test_dir)
Beispiel #8
0
def test_load_mfa(mfa_test_dir, graph_db):
    """Load the MFA fixtures and check words, utterances and the census."""
    with CorpusContext('test_mfa', **graph_db) as corpus:
        corpus.reset()
        mfa_parser = inspect_mfa(mfa_test_dir)
        corpus.load(mfa_parser, mfa_test_dir)

        # Exactly one token of 'JURASSIC' spoken by speaker 'mfa'.
        jurassic = corpus.query_graph(corpus.word).filter(
            corpus.word.label == 'JURASSIC')
        jurassic = jurassic.filter(corpus.word.speaker.name == 'mfa')
        jurassic = jurassic.order_by(corpus.word.begin)
        jurassic = jurassic.columns(corpus.word.label)
        jurassic_rows = jurassic.all()
        assert len(jurassic_rows) == 1

        corpus.encode_pauses('<SIL>')
        corpus.encode_utterances(min_pause_length=0)

        # Once utterances are encoded, 'PLANET' is followed by 'JURASSIC'.
        planet = corpus.query_graph(corpus.word).filter(
            corpus.word.label == 'PLANET')
        planet = planet.filter(corpus.word.speaker.name == 'mfa')
        planet = planet.order_by(corpus.word.begin)
        planet = planet.columns(
            corpus.word.label,
            corpus.word.following.label.column_name('following'))
        planet_rows = planet.all()
        assert len(planet_rows) == 1
        assert planet_rows[0]['following'] == 'JURASSIC'

        # The census entry for speaker 'mfa' lists the single discourse.
        speaker = corpus.census['mfa']
        assert len(speaker.discourses) == 1
        discourse_names = [x.discourse.name for x in speaker.discourses]
        assert discourse_names == ['mfa_test']
Beispiel #9
0
def test_load_mfa(mfa_test_dir, graph_db):
    """Import the MFA test corpus and verify word queries and the census."""
    with CorpusContext('test_mfa', **graph_db) as corpus:
        corpus.reset()
        corpus_parser = inspect_mfa(mfa_test_dir)
        corpus.load(corpus_parser, mfa_test_dir)

        # Speaker 'mfa' says 'JURASSIC' exactly once.
        word_query = corpus.query_graph(corpus.word)
        word_query = word_query.filter(corpus.word.label == 'JURASSIC')
        word_query = word_query.filter(corpus.word.speaker.name == 'mfa')
        word_query = word_query.order_by(corpus.word.begin)
        word_query = word_query.columns(corpus.word.label)
        matches = word_query.all()
        assert len(matches) == 1

        corpus.encode_pauses('<SIL>')
        corpus.encode_utterances(min_pause_length=0)

        # The word after 'PLANET' must be 'JURASSIC'.
        word_query = corpus.query_graph(corpus.word)
        word_query = word_query.filter(corpus.word.label == 'PLANET')
        word_query = word_query.filter(corpus.word.speaker.name == 'mfa')
        word_query = word_query.order_by(corpus.word.begin)
        following_label = corpus.word.following.label.column_name('following')
        word_query = word_query.columns(corpus.word.label, following_label)
        matches = word_query.all()
        assert len(matches) == 1
        assert matches[0]['following'] == 'JURASSIC'

        # Speaker 'mfa' belongs to exactly one discourse, 'mfa_test'.
        mfa_speaker = corpus.census['mfa']
        assert len(mfa_speaker.discourses) == 1
        names = [x.discourse.name for x in mfa_speaker.discourses]
        assert names == ['mfa_test']
Beispiel #10
0
def stressed_config(graph_db, textgrid_test_dir):
    """Return a config for the 'stressed' corpus after (re)loading it.

    The corpus is reset and then loaded from ``stressed_corpus.TextGrid``
    in ``textgrid_test_dir`` using the MFA inspector.
    """
    corpus_config = CorpusConfig('stressed', **graph_db)
    textgrid_path = os.path.join(textgrid_test_dir,
                                 'stressed_corpus.TextGrid')

    with CorpusContext(corpus_config) as corpus:
        corpus.reset()
        mfa_parser = inspect_mfa(textgrid_path)
        corpus.load(mfa_parser, textgrid_path)
    return corpus_config
def stressed_config(graph_db, textgrid_test_dir):
    """Build the 'stressed' corpus from its TextGrid and return its config."""
    cfg = CorpusConfig('stressed', **graph_db)

    # Single TextGrid file that makes up the whole corpus.
    source_file = os.path.join(textgrid_test_dir, 'stressed_corpus.TextGrid')
    with CorpusContext(cfg) as ctx:
        ctx.reset()
        file_parser = inspect_mfa(source_file)
        ctx.load(file_parser, source_file)
    return cfg
Beispiel #12
0
def loading(config, corpus_dir, textgrid_format):
    """Load the corpus, unless a database for it already exists.

    ``textgrid_format`` is matched case-insensitively against the known
    format names and their one-letter aliases; anything unrecognised is
    imported with the MFA inspector. The duration of the import is printed
    and recorded with ``save_performance_benchmark`` under 'import'.

    Exits the process with status 1 when ``corpus_dir`` does not exist.
    """
    ## first check if a database for the corpus
    ## has already been created
    with CorpusContext(config) as c:
        exists = c.exists()
    if exists:
        print('Corpus already loaded, skipping import.')
        return
    if not os.path.exists(corpus_dir):
        print('The path {} does not exist.'.format(corpus_dir))
        sys.exit(1)

    ## Upper-cased format name / alias -> name of the pgio inspector.
    ## Every key is upper case because the requested format is upper-cased
    ## before the lookup. (The old FAVE test compared a lower-cased value
    ## with upper-case names, so FAVE corpora were silently imported with
    ## the MFA inspector.)
    inspector_names = {
        "BUCKEYE": "inspect_buckeye",
        "B": "inspect_buckeye",
        ## NOTE(review): CSV is routed to the Buckeye inspector, as in the
        ## original code - confirm this is intended.
        "CSV": "inspect_buckeye",
        "FAVE": "inspect_fave",
        "F": "inspect_fave",
        "ILG": "inspect_ilg",
        "LABBCAT": "inspect_labbcat",
        "L": "inspect_labbcat",
        "PARTITUR": "inspect_partitur",
        "P": "inspect_partitur",
        "MAUS": "inspect_maus",
        "W": "inspect_maus",
        "TIMIT": "inspect_timit",
        "T": "inspect_timit",
    }

    ## if there is no database file,
    ## begin with importing the corpus
    textgrid_format = textgrid_format.upper()
    with CorpusContext(config) as c:
        print('loading')

        ## Use the appropriate importer based on the format of the corpus;
        ## resolve it by name so only the selected inspector is looked up.
        inspector_name = inspector_names.get(textgrid_format, "inspect_mfa")
        parser = getattr(pgio, inspector_name)(corpus_dir)
        parser.call_back = call_back
        beg = time.time()
        c.load(parser, corpus_dir)
        end = time.time()
        time_taken = end - beg
        print('Loading took: {}'.format(time_taken))
    save_performance_benchmark(config, 'import', time_taken)
Beispiel #13
0
def import_corpus_run_query(data, path):
    """Reset and import corpus ``data`` from ``path``, returning timings.

    Returns ``[elapsed, average, stdev]``: the wall-clock seconds spent
    between the reset and the end of the load, followed by the mean and
    sample standard deviation of the module-level ``times`` sequence.
    """
    with CorpusContext(data, **graph_db) as corpus:
        corpus.reset()
        started = time.time()
        # The benchmark corpus name selects the inspector; MFA is the default.
        if data == 'buckeyebenchmark':
            inspect = inspect_buckeye
        elif data == 'timitbenchmark':
            inspect = inspect_timit
        else:
            inspect = inspect_mfa
        parser = inspect(path)
        parser.call_back = call_back
        corpus.load(parser, path)
        finished = time.time()
        # NOTE(review): ``times`` is not defined in this function; presumably
        # it is filled in elsewhere during the load - verify.
        average = sum(times) / len(times)
        spread = statistics.stdev(times)
        return [finished - started, average, spread]
def import_corpus_run_query(data, path):
    """Import benchmark corpus ``data`` from ``path`` and report timings.

    The result is a three-item list: total seconds from just after the
    reset until the load finished, the mean of the module-level ``times``
    values, and their sample standard deviation.
    """
    with CorpusContext(data, **graph_db) as ctx:
        ctx.reset()
        t_start = time.time()
        if data == 'buckeyebenchmark':
            make_parser = inspect_buckeye
        elif data == 'timitbenchmark':
            make_parser = inspect_timit
        else:
            # Any other corpus name is imported as MFA output.
            make_parser = inspect_mfa
        corpus_parser = make_parser(path)
        corpus_parser.call_back = call_back
        ctx.load(corpus_parser, path)
        t_end = time.time()
        # NOTE(review): ``times`` comes from module scope; how it is
        # populated is not visible here - confirm.
        mean_time = sum(times) / len(times)
        deviation = statistics.stdev(times)
        total = t_end - t_start
        return [total, mean_time, deviation]
Beispiel #15
0
def overlapped_config(graph_db, textgrid_test_dir, acoustic_syllabics):
    """Build the 'overlapped' corpus, enrich it and return its config.

    The corpus is loaded from the ``overlapped_speech`` directory with the
    MFA inspector, then pauses, utterances, syllabic segments and syllables
    are encoded. The returned config has both ``pitch_algorithm`` and
    ``formant_source`` set to 'acousticsim'.
    """
    corpus_config = CorpusConfig('overlapped', **graph_db)
    speech_dir = os.path.join(textgrid_test_dir, 'overlapped_speech')

    with CorpusContext(corpus_config) as corpus:
        corpus.reset()
        mfa_parser = inspect_mfa(speech_dir)
        corpus.load(mfa_parser, speech_dir)

        # Enrichment steps, run in this order.
        corpus.encode_pauses(['sil'])
        corpus.encode_utterances(min_pause_length=0)
        corpus.encode_syllabic_segments(acoustic_syllabics)
        corpus.encode_syllables()

    # Assigned left to right: pitch_algorithm first, then formant_source.
    corpus_config.pitch_algorithm = corpus_config.formant_source = 'acousticsim'
    return corpus_config
Beispiel #16
0
def test_load_mfa(mfa_test_dir, graph_db):
    """Load ``mfa_test.TextGrid`` and check words, utterances and speakers.

    Verifies that words get a 'transcription' type property, that the
    expected word tokens are found for speaker 'mfa', and that the speaker
    is linked to exactly the 'mfa_test' discourse.
    """
    with CorpusContext('test_mfa', **graph_db) as c:
        c.reset()
        test_file_path = os.path.join(mfa_test_dir, "mfa_test.TextGrid")
        parser = inspect_mfa(test_file_path)
        c.load(parser, test_file_path)
        assert c.hierarchy.has_type_property('word', 'transcription')

        # Exactly one token of 'JURASSIC' spoken by speaker 'mfa'. The old
        # debug prints, which re-ran this query after every step, are gone.
        q = c.query_graph(c.word).filter(c.word.label == 'JURASSIC')
        q = q.filter(c.word.speaker.name == 'mfa')
        q = q.order_by(c.word.begin)
        q = q.columns(c.word.label)
        results = q.all()
        assert len(results) == 1

        c.encode_pauses('<SIL>')

        c.encode_utterances(min_pause_length=0)

        # Once utterances are encoded, 'PLANET' is followed by 'JURASSIC'.
        q = c.query_graph(c.word).filter(c.word.label == 'PLANET')
        q = q.filter(c.word.speaker.name == 'mfa')
        q = q.order_by(c.word.begin)
        q = q.columns(c.word.label,
                      c.word.following.label.column_name('following'))
        results = q.all()
        assert len(results) == 1
        assert results[0]['following'] == 'JURASSIC'

        # Speaker 'mfa' belongs to exactly one discourse, 'mfa_test'.
        q = c.query_speakers().filter(c.speaker.name == 'mfa')
        q = q.columns(c.speaker.discourses.name.column_name('discourses'))

        s = q.get()

        assert len(s['discourses']) == 1
        assert s['discourses'] == ['mfa_test']
Beispiel #17
0
def test_load_discourse(graph_db, mfa_test_dir, textgrid_test_dir):
    """Load two discourses into one corpus and check both can be queried.

    One discourse comes from ``acoustic_corpus.TextGrid`` (generic TextGrid
    inspector), the other from ``mfa_test.TextGrid`` (MFA inspector). After
    syllable encoding, the test checks the annotation graph, the lexicon,
    the discourse/speaker metadata and the stored sound file path.
    """
    mfa_path = os.path.join(mfa_test_dir, "mfa_test.TextGrid")
    acoustic_path = os.path.join(textgrid_test_dir, 'acoustic_corpus.TextGrid')
    mfa_parser = inspect_mfa(mfa_path)
    textgrid_parser = inspect_textgrid(acoustic_path)

    with CorpusContext('load_remove_test', **graph_db) as corpus:
        corpus.reset()
        corpus.load_discourse(textgrid_parser, acoustic_path)
        corpus.load_discourse(mfa_parser, mfa_path)

        # Syllabic labels appear in both upper and lower case.
        syllabics = ['ER', 'AE', 'IH', 'EH', 'ae', 'ih', 'er', 'eh']
        corpus.encode_syllabic_segments(syllabics)
        corpus.encode_syllables()

        # Annotation-level queries.
        assert corpus.query_graph(corpus.word).filter(
            corpus.word.label == 'JURASSIC').count() > 0
        assert corpus.query_graph(corpus.phone).filter(
            corpus.phone.label == 'AE').count() > 0
        assert corpus.query_lexicon(corpus.syllable).filter(
            corpus.syllable.label == 'JH.ER').count() > 0

        # Lexicon-level queries.
        assert corpus.query_lexicon(corpus.lexicon_word).filter(
            corpus.lexicon_word.label == 'JURASSIC').count() > 0
        assert corpus.query_lexicon(corpus.lexicon_phone).filter(
            corpus.lexicon_phone.label == 'AE').count() > 0
        assert corpus.query_lexicon(corpus.lexicon_phone).filter(
            corpus.lexicon_phone.label == 'ae').count() > 0
        assert corpus.query_lexicon(corpus.lexicon_syllable).filter(
            corpus.lexicon_syllable.label == 'JH.ER').count() > 0

        # Discourse and speaker metadata from the MFA file.
        assert corpus.query_discourses().filter(
            corpus.discourse.name == 'mfa_test').count() > 0
        assert corpus.query_speakers().filter(
            corpus.speaker.name == 'mfa').count() > 0

        # The sound file recorded for the acoustic discourse must exist.
        sound_info = corpus.discourse_sound_file('acoustic_corpus')
        assert os.path.exists(sound_info['consonant_file_path'])
from polyglotdb.utils import get_corpora_list

# Connection keyword arguments, unpacked into CorpusContext(...) below.
# NOTE(review): user and password look like redacted placeholders -
# supply real credentials before running.
graph_db = dict(
    graph_host='localhost',
    graph_port=7474,
    graph_user='******',
    graph_password='******',
)


def call_back(*args):
    """Print the string arguments joined by single spaces.

    Non-string arguments are ignored; nothing is printed when no string
    argument is given.
    """
    messages = tuple(item for item in args if isinstance(item, str))
    if not messages:
        return
    print(' '.join(messages))


if __name__ == '__main__':
    # Inspection and loading both read from this one corpus directory.
    corpus_dir = '/Users/mlml/Documents/transfer/Hillenbrand/textgrid-wav'

    with CorpusContext("Hillenbrand", **graph_db) as c:
        print("Loading...")
        c.reset()
        parser = pgio.inspect_mfa(corpus_dir)
        # Route the parser's progress messages through call_back.
        parser.call_back = call_back
        c.load(parser, corpus_dir)
Beispiel #19
0
from polyglotdb import CorpusContext
import polyglotdb.io as pgio

# Change this path to wherever you put the pg_tutorial directory after
# downloading and unzipping it from the tutorial site.
corpus_root = '/mnt/e/Data/pg_tutorial'

parser = pgio.inspect_mfa(corpus_root)

# For verbose output during corpus import, send progress messages to print.
parser.call_back = print

with CorpusContext('pg_tutorial') as c:
    c.load(parser, corpus_root)


# Simple queries

## This is the "Testing some simple queries" part of the tutorial:
## list the speakers and discourses, then every phone label in the lexicon.
with CorpusContext('pg_tutorial') as c:
    print('Speakers:', c.speakers)
    print('Discourses:', c.discourses)

    q = c.query_lexicon(c.lexicon_phone)
    q = q.order_by(c.lexicon_phone.label)
    q = q.columns(c.lexicon_phone.label.column_name('phone'))
    results = q.all()
    print(results)

from polyglotdb.query.base.func import Count, Average
import logging

import polyglotdb.io as pgio

from polyglotdb import CorpusContext
from polyglotdb.config import CorpusConfig
from polyglotdb.io.parsers import FilenameSpeakerParser
from polyglotdb.io.enrichment import enrich_speakers_from_csv, enrich_lexicon_from_csv

from polyglotdb.utils import get_corpora_list

# Connection keyword arguments, unpacked into CorpusContext(...) below.
# NOTE(review): user and password look like redacted placeholders -
# supply real credentials before running.
graph_db = {
    'graph_host': 'localhost',
    'graph_port': 7474,
    'graph_user': '******',
    'graph_password': '******',
}

def call_back(*args):
    """Print every string argument on one line, separated by spaces.

    Arguments that are not strings are skipped. When no string remains,
    nothing is printed.
    """
    text_parts = list(filter(lambda value: isinstance(value, str), args))
    if len(text_parts) > 0:
        print(' '.join(text_parts))

if __name__ == '__main__':
    # Directory holding the Hillenbrand TextGrid/wav files; used both to
    # inspect the corpus format and to load it.
    corpus_dir = '/Users/mlml/Documents/transfer/Hillenbrand/textgrid-wav'

    with CorpusContext("Hillenbrand", **graph_db) as c:
        print("Loading...")
        c.reset()
        parser = pgio.inspect_mfa(corpus_dir)
        # Progress messages from the parser are printed via call_back.
        parser.call_back = call_back
        c.load(parser, corpus_dir)