def run_query(self):
    """Reset a corpus and load a directory into it, as described by ``self.kwargs``.

    Reads ``name``, ``directory``, ``call_back`` and ``stop_check`` from
    ``self.kwargs``.  The inspector is picked from the corpus name
    ('buckeye', 'timit', 'partitur'); any other name falls back to the
    format reported by ``guess_textgrid_format`` for the directory.

    Emits ``actionCompleted`` with 'importing corpus' once loading is done
    and returns the value produced by the context's ``load`` call
    (presumably the files that could not be parsed -- confirm against
    ``CorpusContext.load``).
    """
    time.sleep(0.1)
    options = self.kwargs
    corpus_name = options['name']
    source_dir = options['directory']
    # The reset step is currently always performed.
    should_reset = True
    corpus_config = CorpusConfig(corpus_name, graph_host='localhost', graph_port=7474)
    with CorpusContext(corpus_config) as context:
        # Choose the inspector first, then call it once with the directory.
        if corpus_name == 'buckeye':
            inspector = inspect_buckeye
        elif corpus_name == 'timit':
            inspector = inspect_timit
        elif corpus_name == 'partitur':
            inspector = inspect_partitur
        else:
            textgrid_kind = guess_textgrid_format(source_dir)
            if textgrid_kind == 'labbcat':
                inspector = inspect_labbcat
            elif textgrid_kind == 'mfa':
                inspector = inspect_mfa
            elif textgrid_kind == 'fave':
                inspector = inspect_fave
            else:
                inspector = inspect_textgrid
        parser = inspector(source_dir)

        progress_hook = options['call_back']
        parser.call_back = progress_hook
        stop_hook = options['stop_check']
        parser.stop_check = stop_hook
        # The status message goes through the parser's own callback attribute.
        parser.call_back('Resetting corpus...')
        if should_reset:
            context.reset(call_back=progress_hook, stop_check=stop_hook)
        could_not_parse = context.load(parser, source_dir)
        self.actionCompleted.emit('importing corpus')
    return could_not_parse
Beispiel #2
0
    def run_query(self):
        """Reset a corpus and load a directory into it, as described by ``self.kwargs``.

        Reads ``name``, ``directory``, ``call_back`` and ``stop_check`` from
        ``self.kwargs``.  Corpora named 'buckeye' or 'timit' use their
        dedicated inspectors; any other name falls back to the format
        reported by ``guess_textgrid_format`` for the directory.

        Returns the value produced by the context's ``load`` call
        (presumably the files that could not be parsed -- confirm against
        ``CorpusContext.load``).
        """
        time.sleep(0.1)
        options = self.kwargs
        corpus_name = options['name']
        source_dir = options['directory']
        # The reset step is currently always performed.
        should_reset = True
        corpus_config = CorpusConfig(corpus_name,
                                     graph_host='localhost',
                                     graph_port=7474)
        with CorpusContext(corpus_config) as context:
            # Choose the inspector first, then call it once with the directory.
            if corpus_name == 'buckeye':
                inspector = inspect_buckeye
            elif corpus_name == 'timit':
                inspector = inspect_timit
            else:
                textgrid_kind = guess_textgrid_format(source_dir)
                if textgrid_kind == 'labbcat':
                    inspector = inspect_labbcat
                elif textgrid_kind == 'mfa':
                    inspector = inspect_mfa
                elif textgrid_kind == 'fave':
                    inspector = inspect_fave
                else:
                    inspector = inspect_textgrid
            parser = inspector(source_dir)

            progress_hook = options['call_back']
            parser.call_back = progress_hook
            stop_hook = options['stop_check']
            parser.stop_check = stop_hook
            # The status message goes through the parser's own callback attribute.
            parser.call_back('Resetting corpus...')
            if should_reset:
                context.reset(call_back=progress_hook, stop_check=stop_hook)
            could_not_parse = context.load(parser, source_dir)
        return could_not_parse
Beispiel #3
0
def loading(config, corpus_dir, textgrid_format):
    """Load ``corpus_dir`` into the corpus described by ``config``.

    Nothing is imported when the corpus context reports that it already
    exists.  When ``corpus_dir`` is missing on disk a message is printed and
    the process exits with status 1.

    ``textgrid_format`` selects the ``pgio`` inspector; unrecognised values
    fall back to ``pgio.inspect_mfa``.  Only the 'fave' comparison is
    case-insensitive.
    """
    with CorpusContext(config) as context:
        already_loaded = context.exists()
    if already_loaded:
        print('Corpus already loaded, skipping import.')
        return
    if not os.path.exists(corpus_dir):
        print('The path {} does not exist.'.format(corpus_dir))
        sys.exit(1)

    with CorpusContext(config) as context:
        print('loading')

        # Pick the inspector, then invoke it once below.
        if textgrid_format in ("buckeye", "csv"):
            # NOTE(review): "csv" shares the Buckeye inspector -- confirm this is intended.
            inspector = pgio.inspect_buckeye
        elif textgrid_format.lower() == "fave":
            inspector = pgio.inspect_fave
        elif textgrid_format == "ilg":
            inspector = pgio.inspect_ilg
        elif textgrid_format == "labbcat":
            inspector = pgio.inspect_labbcat
        elif textgrid_format == "partitur":
            inspector = pgio.inspect_partitur
        elif textgrid_format == "timit":
            inspector = pgio.inspect_timit
        else:
            inspector = pgio.inspect_mfa
        parser = inspector(corpus_dir)
        context.load(parser, corpus_dir)
Beispiel #4
0
def loading(config, corpus_dir, textgrid_format):
    """Load ``corpus_dir`` into the corpus described by ``config`` and time it.

    Nothing is imported (and no benchmark is saved) when the corpus context
    reports that it already exists.  When ``corpus_dir`` is missing on disk
    a message is printed and the process exits with status 1.

    ``textgrid_format`` selects the ``pgio`` inspector; unrecognised values
    fall back to ``pgio.inspect_mfa``.  Only the 'fave' comparison is
    case-insensitive.  The parser reports progress through the module-level
    ``call_back`` and the elapsed load time is passed to
    ``save_performance_benchmark`` under the label 'import'.
    """
    with CorpusContext(config) as context:
        already_loaded = context.exists()
    if already_loaded:
        print('Corpus already loaded, skipping import.')
        return
    if not os.path.exists(corpus_dir):
        print('The path {} does not exist.'.format(corpus_dir))
        sys.exit(1)

    with CorpusContext(config) as context:
        print('loading')

        # Pick the inspector, then invoke it once below.
        if textgrid_format in ("buckeye", "csv"):
            # NOTE(review): "csv" shares the Buckeye inspector -- confirm this is intended.
            inspector = pgio.inspect_buckeye
        elif textgrid_format.lower() == "fave":
            inspector = pgio.inspect_fave
        elif textgrid_format == "ilg":
            inspector = pgio.inspect_ilg
        elif textgrid_format == "labbcat":
            inspector = pgio.inspect_labbcat
        elif textgrid_format == "partitur":
            inspector = pgio.inspect_partitur
        elif textgrid_format == "timit":
            inspector = pgio.inspect_timit
        else:
            inspector = pgio.inspect_mfa
        parser = inspector(corpus_dir)
        parser.call_back = call_back

        # Only the load call itself is timed.
        started = time.time()
        context.load(parser, corpus_dir)
        time_taken = time.time() - started
        print('Loading took: {}'.format(time_taken))
    save_performance_benchmark(config, 'import', time_taken)
Beispiel #5
0
def fave_corpus_config(graph_db, fave_test_dir):
    """Return a config for 'fave_test_corpus' after resetting and loading it.

    ``graph_db`` is expanded into keyword arguments for ``CorpusConfig``;
    ``fave_test_dir`` is inspected with ``inspect_fave`` and loaded.
    """
    corpus_config = CorpusConfig('fave_test_corpus', **graph_db)
    with CorpusContext(corpus_config) as corpus:
        # Start from a clean corpus before loading the test data.
        corpus.reset()
        fave_parser = inspect_fave(fave_test_dir)
        corpus.load(fave_parser, fave_test_dir)
    return corpus_config
Beispiel #6
0
def fave_corpus_config(graph_db, fave_test_dir):
    """Build the 'fave_test_corpus' config and populate that corpus.

    The corpus is reset, then ``fave_test_dir`` is loaded with the parser
    returned by ``inspect_fave``.  The config object is returned so callers
    can open their own context on it.
    """
    cfg = CorpusConfig('fave_test_corpus', **graph_db)
    with CorpusContext(cfg) as ctx:
        ctx.reset()
        # Inspect first, then load using the resulting parser.
        loaded_parser = inspect_fave(fave_test_dir)
        ctx.load(loaded_parser, fave_test_dir)
    return cfg
Beispiel #7
0
def test_load_fave(fave_test_dir, graph_db):
    """Load the FAVE test directory and check word queries and census entries."""
    with CorpusContext('test_fave', **graph_db) as corpus:
        corpus.reset()
        fave_parser = inspect_fave(fave_test_dir)
        corpus.load(fave_parser, fave_test_dir)

        # 'JURASSIC' is expected once for Gary Salvi ...
        query = (corpus.query_graph(corpus.word)
                 .filter(corpus.word.label == 'JURASSIC')
                 .filter(corpus.word.speaker.name == 'Gary Salvi')
                 .order_by(corpus.word.begin)
                 .columns(corpus.word.label))
        print(query.cypher())
        rows = query.all()
        assert len(rows) == 1

        # ... and never for the Interviewer.
        query = (corpus.query_graph(corpus.word)
                 .filter(corpus.word.label == 'JURASSIC')
                 .filter(corpus.word.speaker.name == 'Interviewer')
                 .order_by(corpus.word.begin)
                 .columns(corpus.word.label))
        print(query.cypher())
        rows = query.all()
        assert len(rows) == 0

        # Pause and utterance encoding run before the following-word queries.
        corpus.encode_pauses('<SIL>')
        corpus.encode_utterances(min_pause_length=0)

        query = (corpus.query_graph(corpus.word)
                 .filter(corpus.word.label == 'PLANET')
                 .filter(corpus.word.speaker.name == 'Gary Salvi')
                 .order_by(corpus.word.begin)
                 .columns(corpus.word.label,
                          corpus.word.following.label.column_name('following')))
        print(query.cypher())
        rows = query.all()
        assert len(rows) == 1
        assert rows[0]['following'] == 'JURASSIC'

        # No speaker filter here: 'MURDER' is counted across all speakers.
        query = (corpus.query_graph(corpus.word)
                 .filter(corpus.word.label == 'MURDER')
                 .order_by(corpus.word.begin)
                 .columns(corpus.word.label,
                          corpus.word.following.label.column_name('following')))
        print(query.cypher())
        rows = query.all()
        assert len(rows) == 2
        assert rows[0]['following'] == 'KNOW'

        interviewer = corpus.census['Interviewer']
        assert len(interviewer.discourses) == 2
        interviewer_discourses = sorted(
            entry.discourse.name for entry in interviewer.discourses)
        assert interviewer_discourses == ['fave_test', 'fave_test2']

        gary = corpus.census['Gary Salvi']
        assert len(gary.discourses) == 1
        gary_discourses = [entry.discourse.name for entry in gary.discourses]
        assert gary_discourses == ['fave_test']
Beispiel #8
0
def test_load_fave(fave_test_dir, graph_db):
    """Check word queries and speaker census data after loading the FAVE test directory."""
    with CorpusContext('test_fave', **graph_db) as ctx:
        ctx.reset()
        loaded_parser = inspect_fave(fave_test_dir)
        ctx.load(loaded_parser, fave_test_dir)

        # One 'JURASSIC' token is expected for Gary Salvi.
        jurassic_gary = ctx.query_graph(ctx.word).filter(
            ctx.word.label == 'JURASSIC'
        ).filter(
            ctx.word.speaker.name == 'Gary Salvi'
        ).order_by(ctx.word.begin).columns(ctx.word.label)
        print(jurassic_gary.cypher())
        found = jurassic_gary.all()
        assert len(found) == 1

        # The Interviewer is expected to have no 'JURASSIC' token.
        jurassic_interviewer = ctx.query_graph(ctx.word).filter(
            ctx.word.label == 'JURASSIC'
        ).filter(
            ctx.word.speaker.name == 'Interviewer'
        ).order_by(ctx.word.begin).columns(ctx.word.label)
        print(jurassic_interviewer.cypher())
        found = jurassic_interviewer.all()
        assert len(found) == 0

        # Encoding steps precede the queries that read the following word.
        ctx.encode_pauses('<SIL>')
        ctx.encode_utterances(min_pause_length=0)

        planet_gary = ctx.query_graph(ctx.word).filter(
            ctx.word.label == 'PLANET'
        ).filter(
            ctx.word.speaker.name == 'Gary Salvi'
        ).order_by(ctx.word.begin).columns(
            ctx.word.label,
            ctx.word.following.label.column_name('following'),
        )
        print(planet_gary.cypher())
        found = planet_gary.all()
        assert len(found) == 1
        assert found[0]['following'] == 'JURASSIC'

        # 'MURDER' is queried without restricting the speaker.
        murder_any = ctx.query_graph(ctx.word).filter(
            ctx.word.label == 'MURDER'
        ).order_by(ctx.word.begin).columns(
            ctx.word.label,
            ctx.word.following.label.column_name('following'),
        )
        print(murder_any.cypher())
        found = murder_any.all()
        assert len(found) == 2
        assert found[0]['following'] == 'KNOW'

        interviewer = ctx.census['Interviewer']
        assert len(interviewer.discourses) == 2
        names = sorted(item.discourse.name for item in interviewer.discourses)
        assert names == ['fave_test', 'fave_test2']

        salvi = ctx.census['Gary Salvi']
        assert len(salvi.discourses) == 1
        names = [item.discourse.name for item in salvi.discourses]
        assert names == ['fave_test']
Beispiel #9
0
def loading(config):
    """Reset the corpus and import it with the FAVE inspector, printing the load time.

    This is the initial import and only needs to run once; any previously
    loaded data is reset first.  The directory comes from the module-level
    ``corpus_dir`` and progress is reported through the module-level
    ``call_back``.
    """
    with CorpusContext(config) as context:
        context.reset()
        print('reset')
        fave_parser = pgio.inspect_fave(corpus_dir)
        fave_parser.call_back = call_back
        # Only the load call itself is timed.
        started = time.time()
        context.load(fave_parser, corpus_dir)
        finished = time.time()
        print('Loading took: {}'.format(finished - started))
Beispiel #10
0
def loading(config, corpus_dir, textgrid_format):
    """Load the corpus in ``corpus_dir`` unless it has already been imported.

    Parameters
    ----------
    config
        Corpus configuration handed to ``CorpusContext``.
    corpus_dir
        Directory containing the corpus files.  When it does not exist a
        message is printed and the process exits with status 1.
    textgrid_format
        Case-insensitive format name or one-letter abbreviation:
        BUCKEYE/B, CSV, FAVE/F, ILG, LABBCAT/L, PARTITUR/P, MAUS/W, TIMIT/T.
        Anything else falls back to the MFA inspector.

    Returns ``None``.  When the corpus already exists nothing is imported
    and no benchmark is recorded; otherwise the elapsed load time is passed
    to ``save_performance_benchmark`` under the label 'import'.  Progress is
    reported through the module-level ``call_back``.
    """

    ## first check if a database for the corpus
    ## has already been created
    with CorpusContext(config) as c:
        exists = c.exists()
    if exists:
        print('Corpus already loaded, skipping import.')
        return
    if not os.path.exists(corpus_dir):
        print('The path {} does not exist.'.format(corpus_dir))
        sys.exit(1)

    ## if there is no database file,
    ## begin with importing the corpus
    ## (normalise once: every comparison below is against upper-case tokens)
    textgrid_format = textgrid_format.upper()
    with CorpusContext(config) as c:
        print('loading')

        ## Use the appropriate importer based
        ## on the format of the corpus
        if textgrid_format in ("BUCKEYE", "B"):
            inspector = pgio.inspect_buckeye
        elif textgrid_format == "CSV":
            # NOTE(review): CSV is routed to the Buckeye inspector, unchanged
            # from before -- confirm this is intended.
            inspector = pgio.inspect_buckeye
        elif textgrid_format in ("FAVE", "F"):
            # Compare the upper-cased value directly: lower-casing it here
            # could never match these tokens, which sent FAVE input to MFA.
            inspector = pgio.inspect_fave
        elif textgrid_format == "ILG":
            inspector = pgio.inspect_ilg
        elif textgrid_format in ("LABBCAT", "L"):
            inspector = pgio.inspect_labbcat
        elif textgrid_format in ("PARTITUR", "P"):
            inspector = pgio.inspect_partitur
        elif textgrid_format in ("MAUS", "W"):
            inspector = pgio.inspect_maus
        elif textgrid_format in ("TIMIT", "T"):
            inspector = pgio.inspect_timit
        else:
            inspector = pgio.inspect_mfa
        parser = inspector(corpus_dir)
        parser.call_back = call_back

        # Only the load call itself is timed.
        beg = time.time()
        c.load(parser, corpus_dir)
        end = time.time()
        time_taken = end - beg
        print('Loading took: {}'.format(time_taken))
    save_performance_benchmark(config, 'import', time_taken)
Beispiel #11
0
def test_load_fave_stereo(fave_test_dir, graph_db):
    """Each stereo speaker should have one discourse, on its own channel."""
    expected_channels = (('Speaker 1', 0), ('Speaker 2', 1))

    with CorpusContext('test_stereo', **graph_db) as corpus:
        corpus.reset()
        fave_parser = inspect_fave(fave_test_dir)
        corpus.load(fave_parser, fave_test_dir)

        # Speakers are checked in the listed order: 'Speaker 1' then 'Speaker 2'.
        for speaker_name, channel in expected_channels:
            speaker = corpus.census[speaker_name]

            assert len(speaker.discourses) == 1
            channels = [entry.channel for entry in speaker.discourses]
            assert channels == [channel]
Beispiel #12
0
def test_load_fave_stereo(fave_test_dir, graph_db):
    """Verify the census channel assignment for the two stereo speakers."""
    with CorpusContext('test_stereo', **graph_db) as ctx:
        ctx.reset()
        loaded_parser = inspect_fave(fave_test_dir)
        ctx.load(loaded_parser, fave_test_dir)

        # 'Speaker 1' is expected on channel 0 and 'Speaker 2' on channel 1.
        for expected_channel, speaker_name in enumerate(['Speaker 1', 'Speaker 2']):
            census_entry = ctx.census[speaker_name]

            assert len(census_entry.discourses) == 1
            seen_channels = [d.channel for d in census_entry.discourses]
            assert seen_channels == [expected_channel]
Beispiel #13
0
def test_load_fave_stereo(fave_test_dir, graph_db):
    """Each stereo speaker should report exactly one channel via a speaker query."""
    expected_channels = (('Speaker 1', 0), ('Speaker 2', 1))

    with CorpusContext('test_stereo', **graph_db) as corpus:
        corpus.reset()
        fave_parser = inspect_fave(fave_test_dir)
        corpus.load(fave_parser, fave_test_dir)

        # Speakers are queried in the listed order: 'Speaker 1' then 'Speaker 2'.
        for speaker_name, channel in expected_channels:
            query = (corpus.query_speakers()
                     .filter(corpus.speaker.name == speaker_name)
                     .columns(
                         corpus.speaker.discourses.name.column_name('discourses'),
                         corpus.speaker.discourses.channel.column_name('channels')))

            record = query.get()

            assert len(record['channels']) == 1
            assert record['channels'] == [channel]
Beispiel #14
0
def test_load_fave(fave_test_dir, graph_db):
    """Load the FAVE test directory and check hierarchy, word and speaker queries."""
    with CorpusContext('test_fave', **graph_db) as corpus:
        corpus.reset()
        fave_parser = inspect_fave(fave_test_dir)
        corpus.load(fave_parser, fave_test_dir)
        assert corpus.hierarchy.has_type_property('word', 'transcription')

        # 'JURASSIC' is expected once for Gary Salvi ...
        query = (corpus.query_graph(corpus.word)
                 .filter(corpus.word.label == 'JURASSIC')
                 .filter(corpus.word.speaker.name == 'Gary Salvi')
                 .order_by(corpus.word.begin)
                 .columns(corpus.word.label))
        print(query.cypher())
        rows = query.all()
        assert len(rows) == 1

        # ... and never for the Interviewer.
        query = (corpus.query_graph(corpus.word)
                 .filter(corpus.word.label == 'JURASSIC')
                 .filter(corpus.word.speaker.name == 'Interviewer')
                 .order_by(corpus.word.begin)
                 .columns(corpus.word.label))
        print(query.cypher())
        rows = query.all()
        assert len(rows) == 0

        # Pause and utterance encoding run before the following-word queries.
        corpus.encode_pauses('<SIL>')
        corpus.encode_utterances(min_pause_length=0)

        query = (corpus.query_graph(corpus.word)
                 .filter(corpus.word.label == 'PLANET')
                 .filter(corpus.word.speaker.name == 'Gary Salvi')
                 .order_by(corpus.word.begin)
                 .columns(corpus.word.label,
                          corpus.word.following.label.column_name('following')))
        print(query.cypher())
        rows = query.all()
        assert len(rows) == 1
        assert rows[0]['following'] == 'JURASSIC'

        # No speaker filter here: 'MURDER' is counted across all speakers.
        query = (corpus.query_graph(corpus.word)
                 .filter(corpus.word.label == 'MURDER')
                 .order_by(corpus.word.begin)
                 .columns(corpus.word.label,
                          corpus.word.following.label.column_name('following')))
        print(query.cypher())
        rows = query.all()
        assert len(rows) == 2
        assert rows[0]['following'] == 'KNOW'

        # Speaker-level checks go through query_speakers.
        speaker_query = (corpus.query_speakers()
                         .filter(corpus.speaker.name == 'Interviewer')
                         .columns(corpus.speaker.discourses.name.column_name('discourses')))
        interviewer = speaker_query.get()
        assert len(interviewer['discourses']) == 2
        assert sorted(interviewer['discourses']) == ['fave_test', 'fave_test2']

        speaker_query = (corpus.query_speakers()
                         .filter(corpus.speaker.name == 'Gary Salvi')
                         .columns(corpus.speaker.discourses.name.column_name('discourses')))
        gary = speaker_query.get()
        assert len(gary['discourses']) == 1
        assert gary['discourses'] == ['fave_test']
from polyglotdb import CorpusContext
from polyglotdb.config import CorpusConfig
import polyglotdb.io as pgio
import sys
import os

# Keyword arguments expanded into CorpusConfig below.
graph_db = dict(host='localhost', port=7474)

# Directory that is inspected and loaded when this file runs as a script.
path_to_switchboard = os.path.join(
    "/Volumes",
    "data",
    "corpora",
    "Switchboard_for_MFA",
)

if __name__ == '__main__':
    config = CorpusConfig("switchboard", **graph_db)
    print("loading corpus...")
    with CorpusContext(config) as g:
        # Reset first, then load the directory with the FAVE inspector.
        g.reset()
        parser = pgio.inspect_fave(path_to_switchboard)
        g.load(parser, path_to_switchboard)

        # Sanity check: at least one word labelled "think" must be present.
        results = g.query_graph(g.word).filter(g.word.label == "think").all()
        assert len(results) > 0

        # Sanity check: at least one phone labelled "ow" must be present.
        results_phone = g.query_graph(g.phone).filter(g.phone.label == "ow").all()
        assert len(results_phone) > 0
from polyglotdb import CorpusContext
from polyglotdb.config import CorpusConfig 
import polyglotdb.io as pgio
import sys
import os


# Keyword arguments expanded into CorpusConfig below.
graph_db = dict(host='localhost', port=7474)

# Directory that is inspected and loaded when this file runs as a script.
path_to_SB = os.path.join(
    "/Volumes",
    "data",
    "corpora",
    "SantaBarbara_aligned",
    "Part2_aligned",
)

if __name__ == '__main__':
    config = CorpusConfig("santabarbara_part2", **graph_db)
    print("loading corpus...")
    with CorpusContext(config) as g:
        # Reset first, then load the directory with the FAVE inspector.
        g.reset()
        parser = pgio.inspect_fave(path_to_SB)
        g.load(parser, path_to_SB)

        # Sanity check: at least one word labelled "think" must be present.
        results = g.query_graph(g.word).filter(g.word.label == "think").all()
        assert len(results) > 0

        # Sanity check: at least one phone labelled "ow" must be present.
        results_phone = g.query_graph(g.phone).filter(g.phone.label == "ow").all()
        assert len(results_phone) > 0