def run_query(self):
    """Import the corpus named in ``self.kwargs`` from its directory.

    Chooses a parser from the corpus name ('buckeye', 'timit',
    'partitur'), otherwise guesses the TextGrid flavour of the
    directory.  Wires the caller-supplied progress callbacks into the
    parser, resets the corpus, loads it, emits ``actionCompleted`` and
    returns whatever ``CorpusContext.load`` reports as unparseable.
    """
    time.sleep(0.1)
    name = self.kwargs['name']
    directory = self.kwargs['directory']
    call_back = self.kwargs['call_back']
    stop_check = self.kwargs['stop_check']
    reset = True
    config = CorpusConfig(name, graph_host='localhost', graph_port=7474)
    with CorpusContext(config) as c:
        named_parsers = {'buckeye': inspect_buckeye,
                         'timit': inspect_timit,
                         'partitur': inspect_partitur}
        if name in named_parsers:
            parser = named_parsers[name](directory)
        else:
            form = guess_textgrid_format(directory)
            form_parsers = {'labbcat': inspect_labbcat,
                            'mfa': inspect_mfa,
                            'fave': inspect_fave}
            parser = form_parsers.get(form, inspect_textgrid)(directory)
        parser.call_back = call_back
        parser.stop_check = stop_check
        parser.call_back('Resetting corpus...')
        if reset:
            c.reset(call_back=call_back, stop_check=stop_check)
        could_not_parse = c.load(parser, directory)
        self.actionCompleted.emit('importing corpus')
        return could_not_parse
def run_query(self):
    """Import the corpus named in ``self.kwargs`` from its directory.

    Uses the dedicated Buckeye/TIMIT parsers when the corpus name says
    so, otherwise guesses the TextGrid flavour of the directory.
    Resets the corpus first, then loads it and returns whatever
    ``CorpusContext.load`` reports as unparseable.
    """
    time.sleep(0.1)
    name = self.kwargs['name']
    directory = self.kwargs['directory']
    call_back = self.kwargs['call_back']
    stop_check = self.kwargs['stop_check']
    reset = True
    config = CorpusConfig(name, graph_host='localhost', graph_port=7474)

    def pick_parser():
        # Corpus-name specific parsers first, then fall back to
        # sniffing the TextGrid format of the directory.
        if name == 'buckeye':
            return inspect_buckeye(directory)
        if name == 'timit':
            return inspect_timit(directory)
        form = guess_textgrid_format(directory)
        by_form = {'labbcat': inspect_labbcat,
                   'mfa': inspect_mfa,
                   'fave': inspect_fave}
        return by_form.get(form, inspect_textgrid)(directory)

    with CorpusContext(config) as c:
        parser = pick_parser()
        parser.call_back = call_back
        parser.stop_check = stop_check
        parser.call_back('Resetting corpus...')
        if reset:
            c.reset(call_back=call_back, stop_check=stop_check)
        could_not_parse = c.load(parser, directory)
        return could_not_parse
def loading(config, corpus_dir, textgrid_format):
    """Import the corpus at ``corpus_dir`` into the database in ``config``.

    Skips the import when the corpus already exists; exits the process
    when the corpus directory is missing.  The inspector is chosen from
    ``textgrid_format`` (MFA is the default).
    """
    with CorpusContext(config) as c:
        already_loaded = c.exists()
    if already_loaded:
        print('Corpus already loaded, skipping import.')
        return
    if not os.path.exists(corpus_dir):
        print('The path {} does not exist.'.format(corpus_dir))
        sys.exit(1)
    with CorpusContext(config) as c:
        print('loading')
        # Only "fave" is matched case-insensitively; every other format
        # name must match exactly.
        if textgrid_format.lower() == "fave":
            inspect = pgio.inspect_fave
        else:
            # NOTE(review): "csv" maps to the Buckeye inspector, same as
            # "buckeye" -- looks like a copy-paste placeholder; confirm.
            inspect = {"buckeye": pgio.inspect_buckeye,
                       "csv": pgio.inspect_buckeye,
                       "ilg": pgio.inspect_ilg,
                       "labbcat": pgio.inspect_labbcat,
                       "partitur": pgio.inspect_partitur,
                       "timit": pgio.inspect_timit,
                       }.get(textgrid_format, pgio.inspect_mfa)
        parser = inspect(corpus_dir)
        c.load(parser, corpus_dir)
def loading(config, corpus_dir, textgrid_format):
    """Import the corpus at ``corpus_dir`` and benchmark the import time.

    Skips the import when the corpus already exists; exits the process
    when the corpus directory is missing.  The inspector is chosen from
    ``textgrid_format`` (MFA is the default) and the elapsed time is
    recorded via ``save_performance_benchmark``.
    """
    with CorpusContext(config) as c:
        already_loaded = c.exists()
    if already_loaded:
        print('Corpus already loaded, skipping import.')
        return
    if not os.path.exists(corpus_dir):
        print('The path {} does not exist.'.format(corpus_dir))
        sys.exit(1)
    with CorpusContext(config) as c:
        print('loading')
        # Only "fave" is matched case-insensitively; every other format
        # name must match exactly.
        if textgrid_format.lower() == "fave":
            inspect = pgio.inspect_fave
        else:
            # NOTE(review): "csv" maps to the Buckeye inspector, same as
            # "buckeye" -- looks like a copy-paste placeholder; confirm.
            inspect = {"buckeye": pgio.inspect_buckeye,
                       "csv": pgio.inspect_buckeye,
                       "ilg": pgio.inspect_ilg,
                       "labbcat": pgio.inspect_labbcat,
                       "partitur": pgio.inspect_partitur,
                       "timit": pgio.inspect_timit,
                       }.get(textgrid_format, pgio.inspect_mfa)
        parser = inspect(corpus_dir)
        parser.call_back = call_back
        beg = time.time()
        c.load(parser, corpus_dir)
        end = time.time()
        time_taken = end - beg
        print('Loading took: {}'.format(time_taken))
        save_performance_benchmark(config, 'import', time_taken)
def fave_corpus_config(graph_db, fave_test_dir):
    """Build (or rebuild) the 'fave_test_corpus' corpus from the FAVE test
    directory and return its :class:`CorpusConfig`."""
    corpus_config = CorpusConfig('fave_test_corpus', **graph_db)
    with CorpusContext(corpus_config) as context:
        context.reset()
        context.load(inspect_fave(fave_test_dir), fave_test_dir)
    return corpus_config
def test_load_fave(fave_test_dir, graph_db):
    """Load the FAVE test corpus and check per-speaker word queries,
    utterance encoding and the speaker census."""

    def run(query):
        # Print the generated Cypher (useful when the test fails),
        # then execute the query.
        print(query.cypher())
        return query.all()

    with CorpusContext('test_fave', **graph_db) as c:
        c.reset()
        c.load(inspect_fave(fave_test_dir), fave_test_dir)

        q = (c.query_graph(c.word)
             .filter(c.word.label == 'JURASSIC')
             .filter(c.word.speaker.name == 'Gary Salvi')
             .order_by(c.word.begin)
             .columns(c.word.label))
        assert len(run(q)) == 1

        q = (c.query_graph(c.word)
             .filter(c.word.label == 'JURASSIC')
             .filter(c.word.speaker.name == 'Interviewer')
             .order_by(c.word.begin)
             .columns(c.word.label))
        assert len(run(q)) == 0

        c.encode_pauses('<SIL>')
        c.encode_utterances(min_pause_length=0)

        q = (c.query_graph(c.word)
             .filter(c.word.label == 'PLANET')
             .filter(c.word.speaker.name == 'Gary Salvi')
             .order_by(c.word.begin)
             .columns(c.word.label,
                      c.word.following.label.column_name('following')))
        rows = run(q)
        assert len(rows) == 1
        assert rows[0]['following'] == 'JURASSIC'

        q = (c.query_graph(c.word)
             .filter(c.word.label == 'MURDER')
             .order_by(c.word.begin)
             .columns(c.word.label,
                      c.word.following.label.column_name('following')))
        rows = run(q)
        assert len(rows) == 2
        assert rows[0]['following'] == 'KNOW'

        interviewer = c.census['Interviewer']
        assert len(interviewer.discourses) == 2
        assert (sorted(x.discourse.name for x in interviewer.discourses)
                == ['fave_test', 'fave_test2'])

        salvi = c.census['Gary Salvi']
        assert len(salvi.discourses) == 1
        assert [x.discourse.name for x in salvi.discourses] == ['fave_test']
def test_load_fave(fave_test_dir, graph_db):
    """Load the FAVE test corpus and check per-speaker word queries,
    utterance encoding and the speaker census."""

    def lookup(ctx, label, speaker=None, with_following=False):
        # Build the usual word query for *label*, optionally restricted
        # to one speaker and optionally including the following word.
        query = ctx.query_graph(ctx.word).filter(ctx.word.label == label)
        if speaker is not None:
            query = query.filter(ctx.word.speaker.name == speaker)
        query = query.order_by(ctx.word.begin)
        if with_following:
            query = query.columns(
                ctx.word.label,
                ctx.word.following.label.column_name('following'))
        else:
            query = query.columns(ctx.word.label)
        print(query.cypher())
        return query.all()

    with CorpusContext('test_fave', **graph_db) as c:
        c.reset()
        c.load(inspect_fave(fave_test_dir), fave_test_dir)

        assert len(lookup(c, 'JURASSIC', speaker='Gary Salvi')) == 1
        assert len(lookup(c, 'JURASSIC', speaker='Interviewer')) == 0

        c.encode_pauses('<SIL>')
        c.encode_utterances(min_pause_length=0)

        rows = lookup(c, 'PLANET', speaker='Gary Salvi', with_following=True)
        assert len(rows) == 1
        assert rows[0]['following'] == 'JURASSIC'

        rows = lookup(c, 'MURDER', with_following=True)
        assert len(rows) == 2
        assert rows[0]['following'] == 'KNOW'

        interviewer = c.census['Interviewer']
        assert len(interviewer.discourses) == 2
        assert (sorted(x.discourse.name for x in interviewer.discourses)
                == ['fave_test', 'fave_test2'])

        s = c.census['Gary Salvi']
        assert len(s.discourses) == 1
        assert [x.discourse.name for x in s.discourses] == ['fave_test']
def loading(config):
    """Initial one-time import of the corpus into PolyglotDB.

    Resets the corpus if it was loaded previously.  Relies on the
    module-level ``corpus_dir`` path and ``call_back`` progress hook.
    """
    with CorpusContext(config) as c:
        c.reset()
        print('reset')
        parser = pgio.inspect_fave(corpus_dir)
        parser.call_back = call_back
        started = time.time()
        c.load(parser, corpus_dir)
        elapsed = time.time() - started
        print('Loading took: {}'.format(elapsed))
def loading(config, corpus_dir, textgrid_format):
    """Load the corpus at ``corpus_dir`` into the database in ``config``.

    Skips the import when the corpus already exists; exits the process
    when the corpus directory is missing.  ``textgrid_format`` selects
    the importer (long name or one-letter abbreviation, matched
    case-insensitively via upper-casing; MFA is the default) and the
    elapsed import time is recorded with ``save_performance_benchmark``.
    """
    ## first check if a database for the corpus
    ## has already been created
    with CorpusContext(config) as c:
        exists = c.exists()
    if exists:
        print('Corpus already loaded, skipping import.')
        return
    if not os.path.exists(corpus_dir):
        print('The path {} does not exist.'.format(corpus_dir))
        sys.exit(1)
    ## if there is no database file,
    ## begin with importing the corpus
    textgrid_format = textgrid_format.upper()
    with CorpusContext(config) as c:
        print('loading')
        ## Use the appropriate importer based
        ## on the format of the corpus
        if textgrid_format in ["BUCKEYE", "B"]:
            parser = pgio.inspect_buckeye(corpus_dir)
        elif textgrid_format == "CSV":
            # NOTE(review): CSV reuses the Buckeye inspector -- looks
            # like a copy-paste placeholder; confirm.
            parser = pgio.inspect_buckeye(corpus_dir)
        elif textgrid_format in ["FAVE", "F"]:
            # BUGFIX: was `textgrid_format.lower() in ["FAVE", "F"]`,
            # which could never match after the .upper() above, so FAVE
            # corpora silently fell through to the MFA parser.
            parser = pgio.inspect_fave(corpus_dir)
        elif textgrid_format == "ILG":
            parser = pgio.inspect_ilg(corpus_dir)
        elif textgrid_format in ["LABBCAT", "L"]:
            parser = pgio.inspect_labbcat(corpus_dir)
        elif textgrid_format in ["P", "PARTITUR"]:
            parser = pgio.inspect_partitur(corpus_dir)
        elif textgrid_format in ["MAUS", "W"]:
            # (A second, unreachable MAUS branch matching ["W", "maus"]
            # was removed: "W" is caught here and "maus" can never
            # survive the .upper() call.)
            parser = pgio.inspect_maus(corpus_dir)
        elif textgrid_format in ["TIMIT", "T"]:
            parser = pgio.inspect_timit(corpus_dir)
        else:
            parser = pgio.inspect_mfa(corpus_dir)
        parser.call_back = call_back
        beg = time.time()
        c.load(parser, corpus_dir)
        end = time.time()
        time_taken = end - beg
        print('Loading took: {}'.format(time_taken))
        save_performance_benchmark(config, 'import', time_taken)
def test_load_fave_stereo(fave_test_dir, graph_db):
    """Stereo FAVE import: each speaker belongs to one discourse on the
    expected audio channel (census API)."""
    with CorpusContext('test_stereo', **graph_db) as c:
        c.reset()
        c.load(inspect_fave(fave_test_dir), fave_test_dir)
        for speaker_name, channel in (('Speaker 1', 0), ('Speaker 2', 1)):
            speaker = c.census[speaker_name]
            assert len(speaker.discourses) == 1
            assert [x.channel for x in speaker.discourses] == [channel]
def test_load_fave_stereo(fave_test_dir, graph_db):
    """Stereo FAVE import: each speaker belongs to one discourse on the
    expected audio channel (census API)."""
    with CorpusContext('test_stereo', **graph_db) as c:
        c.reset()
        c.load(inspect_fave(fave_test_dir), fave_test_dir)

        def channels(name):
            # Each speaker is expected to appear in exactly one discourse.
            speaker = c.census[name]
            assert len(speaker.discourses) == 1
            return [d.channel for d in speaker.discourses]

        assert channels('Speaker 1') == [0]
        assert channels('Speaker 2') == [1]
def test_load_fave_stereo(fave_test_dir, graph_db):
    """Stereo FAVE import: each speaker belongs to one discourse on the
    expected audio channel (speaker-query API)."""
    with CorpusContext('test_stereo', **graph_db) as c:
        c.reset()
        c.load(inspect_fave(fave_test_dir), fave_test_dir)
        for speaker_name, expected in (('Speaker 1', [0]), ('Speaker 2', [1])):
            query = (c.query_speakers()
                     .filter(c.speaker.name == speaker_name)
                     .columns(c.speaker.discourses.name.column_name('discourses'),
                              c.speaker.discourses.channel.column_name('channels')))
            row = query.get()
            assert len(row['channels']) == 1
            assert row['channels'] == expected
def test_load_fave(fave_test_dir, graph_db):
    """Load the FAVE test corpus and check the hierarchy, per-speaker
    word queries, utterance encoding and speaker/discourse links."""

    def fetch(query):
        # Print the generated Cypher (useful when the test fails),
        # then execute the query.
        print(query.cypher())
        return query.all()

    def speaker_discourses(ctx, name):
        # Discourse names a speaker appears in, via the speaker query API.
        query = ctx.query_speakers().filter(ctx.speaker.name == name)
        query = query.columns(
            ctx.speaker.discourses.name.column_name('discourses'))
        return query.get()['discourses']

    with CorpusContext('test_fave', **graph_db) as c:
        c.reset()
        c.load(inspect_fave(fave_test_dir), fave_test_dir)
        assert c.hierarchy.has_type_property('word', 'transcription')

        q = (c.query_graph(c.word)
             .filter(c.word.label == 'JURASSIC')
             .filter(c.word.speaker.name == 'Gary Salvi')
             .order_by(c.word.begin)
             .columns(c.word.label))
        assert len(fetch(q)) == 1

        q = (c.query_graph(c.word)
             .filter(c.word.label == 'JURASSIC')
             .filter(c.word.speaker.name == 'Interviewer')
             .order_by(c.word.begin)
             .columns(c.word.label))
        assert len(fetch(q)) == 0

        c.encode_pauses('<SIL>')
        c.encode_utterances(min_pause_length=0)

        q = (c.query_graph(c.word)
             .filter(c.word.label == 'PLANET')
             .filter(c.word.speaker.name == 'Gary Salvi')
             .order_by(c.word.begin)
             .columns(c.word.label,
                      c.word.following.label.column_name('following')))
        rows = fetch(q)
        assert len(rows) == 1
        assert rows[0]['following'] == 'JURASSIC'

        q = (c.query_graph(c.word)
             .filter(c.word.label == 'MURDER')
             .order_by(c.word.begin)
             .columns(c.word.label,
                      c.word.following.label.column_name('following')))
        rows = fetch(q)
        assert len(rows) == 2
        assert rows[0]['following'] == 'KNOW'

        interviewer = speaker_discourses(c, 'Interviewer')
        assert len(interviewer) == 2
        assert sorted(interviewer) == ['fave_test', 'fave_test2']

        salvi = speaker_discourses(c, 'Gary Salvi')
        assert len(salvi) == 1
        assert salvi == ['fave_test']
from polyglotdb import CorpusContext
from polyglotdb.config import CorpusConfig
import polyglotdb.io as pgio
import sys
import os

# Connection settings for the local Neo4j graph database.
graph_db = {'host': 'localhost', 'port': 7474}
path_to_switchboard = os.path.join("/Volumes", "data", "corpora",
                                   "Switchboard_for_MFA")

if __name__ == '__main__':
    config = CorpusConfig("switchboard", **graph_db)
    print("loading corpus...")
    with CorpusContext(config) as g:
        g.reset()
        g.load(pgio.inspect_fave(path_to_switchboard), path_to_switchboard)
        # Sanity checks: the imported corpus should contain at least one
        # token of the word "think" and of the phone "ow".
        word_results = g.query_graph(g.word).filter(
            g.word.label == "think").all()
        assert len(word_results) > 0
        phone_results = g.query_graph(g.phone).filter(
            g.phone.label == "ow").all()
        assert len(phone_results) > 0
from polyglotdb import CorpusContext
from polyglotdb.config import CorpusConfig
import polyglotdb.io as pgio
import sys
import os

# Connection settings for the local Neo4j graph database.
graph_db = {'host': 'localhost', 'port': 7474}
path_to_SB = os.path.join("/Volumes", "data", "corpora",
                          "SantaBarbara_aligned", "Part2_aligned")

if __name__ == '__main__':
    config = CorpusConfig("santabarbara_part2", **graph_db)
    print("loading corpus...")
    with CorpusContext(config) as g:
        g.reset()
        g.load(pgio.inspect_fave(path_to_SB), path_to_SB)
        # Sanity checks: the imported corpus should contain at least one
        # token of the word "think" and of the phone "ow".
        word_results = g.query_graph(g.word).filter(
            g.word.label == "think").all()
        assert len(word_results) > 0
        phone_results = g.query_graph(g.phone).filter(
            g.phone.label == "ow").all()
        assert len(phone_results) > 0