def loading(config, corpus_dir, textgrid_format):
    """Import a corpus into PolyglotDB, picking a parser from its format name.

    Parameters
    ----------
    config : CorpusConfig
        Connection configuration for the corpus database.
    corpus_dir : str
        Path to the corpus files on disk.
    textgrid_format : str
        Corpus/annotation format name (e.g. "buckeye", "timit"); matching is
        case-insensitive, and anything unrecognized falls back to the MFA
        parser.

    Skips the import if the corpus already exists; exits the process if
    ``corpus_dir`` does not exist. Prints and benchmarks the load time.
    """
    with CorpusContext(config) as c:
        exists = c.exists()
        if exists:
            print('Corpus already loaded, skipping import.')
            return
    if not os.path.exists(corpus_dir):
        print('The path {} does not exist.'.format(corpus_dir))
        sys.exit(1)
    with CorpusContext(config) as c:
        print('loading')
        # FIX: normalize case once so every branch is case-insensitive.
        # Previously only the "fave" comparison used .lower(), so inputs like
        # "TIMIT" or "Buckeye" silently fell through to the MFA parser.
        fmt = textgrid_format.lower()
        if fmt == "buckeye":
            parser = pgio.inspect_buckeye(corpus_dir)
        elif fmt == "csv":
            # NOTE(review): "csv" mapping to the Buckeye inspector looks like a
            # copy-paste slip (pgio.inspect_csv would be expected) -- confirm
            # before changing, so behavior is preserved here.
            parser = pgio.inspect_buckeye(corpus_dir)
        elif fmt == "fave":
            parser = pgio.inspect_fave(corpus_dir)
        elif fmt == "ilg":
            parser = pgio.inspect_ilg(corpus_dir)
        elif fmt == "labbcat":
            parser = pgio.inspect_labbcat(corpus_dir)
        elif fmt == "partitur":
            parser = pgio.inspect_partitur(corpus_dir)
        elif fmt == "timit":
            parser = pgio.inspect_timit(corpus_dir)
        else:
            parser = pgio.inspect_mfa(corpus_dir)
        parser.call_back = call_back
        beg = time.time()
        c.load(parser, corpus_dir)
        end = time.time()
        time_taken = end - beg
        print('Loading took: {}'.format(time_taken))
        save_performance_benchmark(config, 'import', time_taken)
def run_query(self):
    """Import the corpus named in ``self.kwargs`` into PolyglotDB.

    Always resets any existing corpus data first, then loads the directory
    with a parser chosen from the corpus name (falling back to sniffing the
    TextGrid flavour on disk). Returns whatever ``CorpusContext.load``
    reports as files it could not parse.
    """
    time.sleep(0.1)
    name = self.kwargs['name']
    directory = self.kwargs['directory']
    reset = True
    config = CorpusConfig(name, graph_host='localhost', graph_port=7474)
    with CorpusContext(config) as c:
        # Known corpora get their dedicated parser; anything else is
        # dispatched on the detected TextGrid format.
        if name == 'buckeye':
            parser = inspect_buckeye(directory)
        elif name == 'timit':
            parser = inspect_timit(directory)
        else:
            form = guess_textgrid_format(directory)
            inspectors = {
                'labbcat': inspect_labbcat,
                'mfa': inspect_mfa,
                'fave': inspect_fave,
            }
            parser = inspectors.get(form, inspect_textgrid)(directory)
        parser.call_back = self.kwargs['call_back']
        parser.stop_check = self.kwargs['stop_check']
        parser.call_back('Resetting corpus...')
        if reset:
            c.reset(call_back=self.kwargs['call_back'],
                    stop_check=self.kwargs['stop_check'])
        could_not_parse = c.load(parser, directory)
    return could_not_parse
def run_query(self):
    """Import the corpus named in ``self.kwargs`` into PolyglotDB.

    Always resets existing corpus data first, then loads the directory with
    a parser picked from the corpus name (or sniffed from the on-disk
    TextGrid format). Emits ``actionCompleted('importing corpus')`` when
    done and returns the files ``CorpusContext.load`` could not parse.
    """
    time.sleep(0.1)
    name = self.kwargs['name']
    directory = self.kwargs['directory']
    reset = True
    config = CorpusConfig(name, graph_host='localhost', graph_port=7474)
    with CorpusContext(config) as c:
        # Dedicated parsers for known corpora; otherwise dispatch on the
        # detected TextGrid flavour.
        if name == 'buckeye':
            parser = inspect_buckeye(directory)
        elif name == 'timit':
            parser = inspect_timit(directory)
        elif name == 'partitur':
            parser = inspect_partitur(directory)
        else:
            form = guess_textgrid_format(directory)
            inspectors = {
                'labbcat': inspect_labbcat,
                'mfa': inspect_mfa,
                'fave': inspect_fave,
            }
            parser = inspectors.get(form, inspect_textgrid)(directory)
        parser.call_back = self.kwargs['call_back']
        parser.stop_check = self.kwargs['stop_check']
        parser.call_back('Resetting corpus...')
        if reset:
            c.reset(call_back=self.kwargs['call_back'],
                    stop_check=self.kwargs['stop_check'])
        could_not_parse = c.load(parser, directory)
    self.actionCompleted.emit('importing corpus')
    return could_not_parse
def loading(config, corpus_dir, textgrid_format):
    """Import a corpus into PolyglotDB, picking a parser from its format name.

    Parameters
    ----------
    config : CorpusConfig
        Connection configuration for the corpus database.
    corpus_dir : str
        Path to the corpus files on disk.
    textgrid_format : str
        Corpus/annotation format name; matching is case-insensitive, and
        anything unrecognized falls back to the MFA parser.

    Skips the import if the corpus already exists; exits the process if
    ``corpus_dir`` does not exist.
    """
    with CorpusContext(config) as c:
        exists = c.exists()
        if exists:
            print('Corpus already loaded, skipping import.')
            return
    if not os.path.exists(corpus_dir):
        print('The path {} does not exist.'.format(corpus_dir))
        sys.exit(1)
    with CorpusContext(config) as c:
        print('loading')
        # FIX: normalize case once so every branch is case-insensitive.
        # Previously only the "fave" comparison used .lower(), so inputs like
        # "TIMIT" or "Buckeye" silently fell through to the MFA parser.
        fmt = textgrid_format.lower()
        if fmt == "buckeye":
            parser = pgio.inspect_buckeye(corpus_dir)
        elif fmt == "csv":
            # NOTE(review): "csv" mapping to the Buckeye inspector looks like a
            # copy-paste slip (pgio.inspect_csv would be expected) -- confirm
            # before changing, so behavior is preserved here.
            parser = pgio.inspect_buckeye(corpus_dir)
        elif fmt == "fave":
            parser = pgio.inspect_fave(corpus_dir)
        elif fmt == "ilg":
            parser = pgio.inspect_ilg(corpus_dir)
        elif fmt == "labbcat":
            parser = pgio.inspect_labbcat(corpus_dir)
        elif fmt == "partitur":
            parser = pgio.inspect_partitur(corpus_dir)
        elif fmt == "timit":
            parser = pgio.inspect_timit(corpus_dir)
        else:
            parser = pgio.inspect_mfa(corpus_dir)
        c.load(parser, corpus_dir)
def loading(config, path_to_timit):
    """One-time initial import of the TIMIT corpus into PGDB.

    Resets the corpus first, so any previously loaded data is wiped and the
    corpus is re-imported from ``path_to_timit``. Prints the elapsed time.
    """
    with CorpusContext(config) as c:
        c.reset()
        print('reset')
        parser = pgio.inspect_timit(path_to_timit)
        parser.call_back = call_back
        start = time.time()
        c.load(parser, path_to_timit)
        print('Loading took: {}'.format(time.time() - start))
def test_load_directory_timit(graph_db, timit_test_dir):
    """Fresh directory import of the TIMIT test corpus; check phone data."""
    parser = inspect_timit(timit_test_dir)
    with CorpusContext('directory_timit', **graph_db) as c:
        c.reset()
        c.load(parser, timit_test_dir)
        # The test corpus contains exactly two 'dcl' phones.
        dcl_query = c.query_graph(c.phone).filter(c.phone.label == 'dcl')
        assert dcl_query.count() == 2
        rows = dcl_query.columns(
            c.phone.speaker.name.column_name('speaker')).all()
        # Every matched phone belongs to the 'timit' speaker.
        assert all(row['speaker'] == 'timit' for row in rows)
def test_load_directory_timit(graph_db, timit_test_dir):
    """Fresh directory import of TIMIT; check surface transcription data."""
    parser = inspect_timit(timit_test_dir)
    with CorpusContext('directory_timit', **graph_db) as c:
        c.reset()
        c.load(parser, timit_test_dir)
        # The test corpus contains exactly two 'dcl' surface segments.
        dcl_query = c.query_graph(c.surface_transcription).filter(
            c.surface_transcription.label == 'dcl')
        assert dcl_query.count() == 2
        rows = dcl_query.columns(
            c.surface_transcription.speaker.name.column_name('speaker')).all()
        # Every matched segment belongs to the 'timit' speaker.
        assert all(row.speaker == 'timit' for row in rows)
def test_load_discourse_timit(graph_db, timit_test_dir):
    """Import a single TIMIT word file as one discourse; check phone data."""
    word_path = os.path.join(timit_test_dir, 'test.WRD')
    with CorpusContext('discourse_timit', **graph_db) as c:
        c.reset()
        parser = inspect_timit(word_path)
        c.load(parser, word_path)
        # The test discourse contains exactly two 'dcl' phones.
        dcl_query = c.query_graph(c.phone).filter(c.phone.label == 'dcl')
        assert dcl_query.count() == 2
        rows = dcl_query.columns(
            c.phone.speaker.name.column_name('speaker')).all()
        # Every matched phone belongs to the 'timit' speaker.
        assert all(row['speaker'] == 'timit' for row in rows)
def test_load_discourse_timit(graph_db, timit_test_dir):
    """Import one TIMIT word file as a discourse; check surface segments."""
    word_path = os.path.join(timit_test_dir, 'test.WRD')
    with CorpusContext('discourse_timit', **graph_db) as c:
        c.reset()
        parser = inspect_timit(word_path)
        c.load(parser, word_path)
        # The test discourse contains exactly two 'dcl' surface segments.
        dcl_query = c.query_graph(c.surface_transcription).filter(
            c.surface_transcription.label == 'dcl')
        assert dcl_query.count() == 2
        rows = dcl_query.columns(
            c.surface_transcription.speaker.name.column_name('speaker')).all()
        # Every matched segment belongs to the 'timit' speaker.
        assert all(row.speaker == 'timit' for row in rows)
def loading(config, corpus_dir, textgrid_format):
    """Load the corpus.

    Parameters
    ----------
    config : CorpusConfig
        Connection configuration for the corpus database.
    corpus_dir : str
        Path to the corpus files on disk.
    textgrid_format : str
        Format name or single-letter abbreviation (case-insensitive via
        upper-casing); anything unrecognized falls back to the MFA parser.
    """
    # First check whether a database for this corpus was already created.
    with CorpusContext(config) as c:
        exists = c.exists()
        if exists:
            print('Corpus already loaded, skipping import.')
            return
    if not os.path.exists(corpus_dir):
        print('The path {} does not exist.'.format(corpus_dir))
        sys.exit(1)
    # No database yet: import the corpus.
    textgrid_format = textgrid_format.upper()
    with CorpusContext(config) as c:
        print('loading')
        # Pick the importer matching the corpus format.
        if textgrid_format in ["BUCKEYE", "B"]:
            parser = pgio.inspect_buckeye(corpus_dir)
        elif textgrid_format == "CSV":
            # NOTE(review): "CSV" mapping to the Buckeye inspector looks like a
            # copy-paste slip (pgio.inspect_csv would be expected) -- confirm
            # before changing, so behavior is preserved here.
            parser = pgio.inspect_buckeye(corpus_dir)
        elif textgrid_format in ["FAVE", "F"]:
            # BUG FIX: was `textgrid_format.lower() in ["FAVE", "F"]`, which
            # could never match (lowercase string vs. uppercase list), so FAVE
            # corpora silently fell through to the MFA parser.
            parser = pgio.inspect_fave(corpus_dir)
        elif textgrid_format == "ILG":
            parser = pgio.inspect_ilg(corpus_dir)
        elif textgrid_format in ["LABBCAT", "L"]:
            parser = pgio.inspect_labbcat(corpus_dir)
        elif textgrid_format in ["P", "PARTITUR"]:
            parser = pgio.inspect_partitur(corpus_dir)
        elif textgrid_format in ["MAUS", "W"]:
            parser = pgio.inspect_maus(corpus_dir)
        elif textgrid_format in ["TIMIT", "T"]:
            parser = pgio.inspect_timit(corpus_dir)
        else:
            # (A duplicate dead branch `in ["W", "maus"]` was removed: "W" is
            # already handled above and "maus" cannot survive .upper().)
            parser = pgio.inspect_mfa(corpus_dir)
        parser.call_back = call_back
        beg = time.time()
        c.load(parser, corpus_dir)
        end = time.time()
        time_taken = end - beg
        print('Loading took: {}'.format(time_taken))
        save_performance_benchmark(config, 'import', time_taken)
def import_corpus_run_query(data, path):
    """Time a fresh import of corpus ``data`` from ``path``.

    Returns ``[elapsed import seconds, mean of the ``times`` list, its
    sample standard deviation]``.

    NOTE(review): ``times`` is not defined in this function -- presumably a
    module-level list populated elsewhere; confirm against the full file.
    """
    with CorpusContext(data, **graph_db) as c:
        c.reset()
        start = time.time()
        # Benchmark corpora get their dedicated parser; default to MFA.
        if data == 'buckeyebenchmark':
            parser = inspect_buckeye(path)
        elif data == 'timitbenchmark':
            parser = inspect_timit(path)
        else:
            parser = inspect_mfa(path)
        parser.call_back = call_back
        c.load(parser, path)
        elapsed = time.time() - start
    avgtime = sum(times) / len(times)
    sd = statistics.stdev(times)
    return [elapsed, avgtime, sd]
def import_corpus_run_query(data, path):
    """Time a fresh import of corpus ``data`` from ``path``.

    Returns a three-element list: the elapsed import time in seconds, the
    mean of the ``times`` list, and its sample standard deviation.

    NOTE(review): ``times`` is not defined in this function -- presumably a
    module-level list populated elsewhere; confirm against the full file.
    """
    parsers = {
        'buckeyebenchmark': inspect_buckeye,
        'timitbenchmark': inspect_timit,
    }
    with CorpusContext(data, **graph_db) as c:
        c.reset()
        start = time.time()
        parser = parsers.get(data, inspect_mfa)(path)
        parser.call_back = call_back
        c.load(parser, path)
        elapsed = time.time() - start
    avgtime = sum(times) / len(times)
    sd = statistics.stdev(times)
    return [elapsed, avgtime, sd]
# FIX: this script referenced `pgio` and `CorpusContext` without importing
# them, which raises NameError at runtime; both imports were added below.
# (If the missing imports live in an unseen part of the file, the additions
# below are harmlessly redundant.)
import polyglotdb.io as pgio
from polyglotdb import CorpusContext
from polyglotdb.config import CorpusConfig
from polyglotdb.io.parsers import FilenameSpeakerParser
from polyglotdb.io.enrichment import enrich_speakers_from_csv, enrich_lexicon_from_csv
from polyglotdb.utils import get_corpora_list

# Neo4j connection settings for the graph database.
graph_db = ({
    'graph_host': 'localhost',
    'graph_port': 7474,
    'graph_user': '******',
    'graph_password': '******'
})


def call_back(*args):
    """Progress callback: print any string arguments, space-joined."""
    args = [x for x in args if isinstance(x, str)]
    if args:
        print(' '.join(args))


if __name__ == '__main__':
    with CorpusContext("VTRSubset", **graph_db) as c:
        print("Loading...")
        c.reset()
        parser = pgio.inspect_timit(
            '/Volumes/data/datasets/sct_benchmarks/VTRFormants')
        #parser = pgio.inspect_timit('/Users/mlml/Documents/transfer/VTRSubset')
        parser.call_back = call_back
        c.load(parser, '/Volumes/data/datasets/sct_benchmarks/VTRFormants')
        #c.load(parser, '/Users/mlml/Documents/transfer/VTRSubset')
import os
import time
import logging

import polyglotdb.io as pgio
from polyglotdb import CorpusContext
from polyglotdb.config import CorpusConfig
from polyglotdb.io.parsers import FilenameSpeakerParser
from polyglotdb.io.enrichment import enrich_speakers_from_csv, enrich_lexicon_from_csv
from polyglotdb.utils import get_corpora_list

# Neo4j connection settings for the graph database.
graph_db = {
    'graph_host': 'localhost',
    'graph_port': 7474,
    'graph_user': '******',
    'graph_password': '******',
}


def call_back(*args):
    """Progress callback: print any string arguments as one joined line."""
    messages = [a for a in args if isinstance(a, str)]
    if messages:
        print(' '.join(messages))


if __name__ == '__main__':
    corpus_path = '/Volumes/data/datasets/sct_benchmarks/VTRFormants'
    with CorpusContext("VTRSubset", **graph_db) as c:
        print("Loading...")
        c.reset()
        parser = pgio.inspect_timit(corpus_path)
        #parser = pgio.inspect_timit('/Users/mlml/Documents/transfer/VTRSubset')
        parser.call_back = call_back
        c.load(parser, corpus_path)
        #c.load(parser, '/Users/mlml/Documents/transfer/VTRSubset')
# NOTE(review): `sys`, `time`, and `base` are referenced but not defined in
# this chunk -- presumably imported/set earlier in the file; confirm.
sys.path.insert(0, base)

import polyglotdb.io as pgio

from speechtools.corpus import CorpusContext

path_to_timit = r'D:\Data\TIMIT_fixed'

# Neo4j connection settings for the graph database.
graph_db = {
    'host': 'localhost',
    'port': 7474,
    'user': '******',
    'password': '******',
}


def call_back(*args):
    """Progress callback: print any string arguments as one joined line."""
    messages = [a for a in args if isinstance(a, str)]
    if messages:
        print(' '.join(messages))


parser = pgio.inspect_timit(path_to_timit)
parser.call_back = call_back

with CorpusContext('timit', **graph_db) as c:
    c.reset()
    start = time.time()
    c.load(parser, path_to_timit)
    print('Time taken: {}'.format(time.time() - start))
# FIX: this script referenced `pgio` without importing it, which raises
# NameError at runtime; the import was added below. (If the missing import
# lives in an unseen part of the file, the addition is harmlessly redundant.)
import polyglotdb.io as pgio
from polyglotdb import CorpusContext
from polyglotdb.config import CorpusConfig
from polyglotdb.io.parsers import FilenameSpeakerParser
from polyglotdb.io.enrichment import enrich_speakers_from_csv, enrich_lexicon_from_csv
from polyglotdb.utils import get_corpora_list

# Neo4j connection settings for the graph database.
graph_db = ({
    'graph_host': 'localhost',
    'graph_port': 7474,
    'graph_user': '******',
    'graph_password': '******'
})


def call_back(*args):
    """Progress callback: print any string arguments, space-joined."""
    args = [x for x in args if isinstance(x, str)]
    if args:
        print(' '.join(args))


if __name__ == '__main__':
    with CorpusContext("VTRSubset", **graph_db) as c:
        print("Loading...")
        c.reset()
        #parser = pgio.inspect_timit('/Volumes/data/datasets/sct_benchmarks/VTRFormants')
        parser = pgio.inspect_timit('/Users/mlml/Documents/transfer/VTRSubset')
        parser.call_back = call_back
        #c.load(parser, '/Volumes/data/datasets/sct_benchmarks/VTRFormants')
        c.load(parser, '/Users/mlml/Documents/transfer/VTRSubset')
import os
import time
import logging

import polyglotdb.io as pgio
from polyglotdb import CorpusContext
from polyglotdb.config import CorpusConfig
from polyglotdb.io.parsers import FilenameSpeakerParser
from polyglotdb.io.enrichment import enrich_speakers_from_csv, enrich_lexicon_from_csv
from polyglotdb.utils import get_corpora_list

# Neo4j connection settings for the graph database.
graph_db = {
    'graph_host': 'localhost',
    'graph_port': 7474,
    'graph_user': '******',
    'graph_password': '******',
}


def call_back(*args):
    """Progress callback: print any string arguments as one joined line."""
    messages = [a for a in args if isinstance(a, str)]
    if messages:
        print(' '.join(messages))


if __name__ == '__main__':
    corpus_path = '/Users/mlml/Documents/transfer/VTRSubset'
    with CorpusContext("VTRSubset", **graph_db) as c:
        print("Loading...")
        c.reset()
        #parser = pgio.inspect_timit('/Volumes/data/datasets/sct_benchmarks/VTRFormants')
        parser = pgio.inspect_timit(corpus_path)
        parser.call_back = call_back
        #c.load(parser, '/Volumes/data/datasets/sct_benchmarks/VTRFormants')
        c.load(parser, corpus_path)
import sys
import os
import time

# Make the repository root importable when running this script directly.
base = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
sys.path.insert(0, base)

import polyglotdb.io as pgio

from polyglotdb import CorpusContext

path_to_timit = r'D:\Data\TIMIT_fixed'

# Neo4j connection settings for the graph database.
graph_db = {
    'host': 'localhost',
    'port': 7474,
    'user': '******',
    'password': '******',
}


def call_back(*args):
    """Progress callback: print any string arguments as one joined line."""
    messages = [a for a in args if isinstance(a, str)]
    if messages:
        print(' '.join(messages))


parser = pgio.inspect_timit(path_to_timit)
parser.call_back = call_back

with CorpusContext('timit', **graph_db) as c:
    c.reset()
    start = time.time()
    c.load(parser, path_to_timit)
    print('Time taken: {}'.format(time.time() - start))