def test_load_config(self):
    cfg = misc.load_config('.speechrc')
    host = cfg.get('tts', 'host')
    self.assertEqual(host, 'local')
def test_tts_mary(self):
    config = misc.load_config('.speechrc')
    tts = TTS(config.get('tts', 'host'), int(config.get('tts', 'port')))

    # test mary
    tts.engine = 'mary'
    for l, voice, word, ph in MARY_TESTS:
        tts.locale = l
        tts.voice = voice

        mary_ph = tts.gen_ipa(word)
        self.assertEqual(mary_ph, ph)

        wav = tts.synthesize(word)
        logging.debug('wav len: %d bytes.' % len(wav))
        self.assertGreater(len(wav), 100)

        wav = tts.synthesize(ph, mode='ipa')
        logging.debug('wav len: %d bytes.' % len(wav))
        self.assertGreater(len(wav), 100)
def test_tts_espeak(self):
    config = misc.load_config('.speechrc')
    tts = TTS(config.get('tts', 'host'), int(config.get('tts', 'port')))

    tts.engine = 'espeak'
    first = True
    for v, word, ph in ESPEAK_TESTS:
        tts.locale = v
        tts.voice = v

        espeak_ph = tts.gen_ipa(word)
        self.assertEqual(espeak_ph, ph)

        wav = tts.synthesize(word)
        logging.debug('wav len: %d bytes.' % len(wav))
        self.assertGreater(len(wav), 100)

        wav = tts.synthesize(ph, mode='ipa')
        logging.debug('wav len: %d bytes.' % len(wav))
        self.assertGreater(len(wav), 100)

        if first:
            tts.say(word)
            first = False
def setUp(self):
    config = misc.load_config('.airc')

    #
    # db, store
    #

    db_url = config.get('db', 'url')
    # db_url = 'sqlite:///tmp/foo.db'

    self.sas = SPARQLAlchemyStore(db_url, 'unittests', echo=True)
    self.context = u'http://example.com'

    #
    # import triples to test on
    #

    self.sas.clear_all_graphs()

    samplefn = 'tests/dt.n3'
    with codecs.open(samplefn, 'r', 'utf8') as samplef:
        data = samplef.read()
        self.sas.parse(data=data, context=self.context, format='n3')
def setUp(self):
    config = misc.load_config('.airc')

    #
    # logic DB
    #

    self.db = LogicDB(model.url)

    #
    # knowledge base
    #

    self.kb = AIKB(UNITTEST_MODULE)

    for prefix in COMMON_PREFIXES:
        self.kb.register_prefix(prefix, COMMON_PREFIXES[prefix])

    self.kb.clear_all_graphs()

    self.kb.parse_file(UNITTEST_CONTEXT, 'n3', 'tests/chancellors.n3')
    self.kb.parse_file(UNITTEST_CONTEXT, 'n3', 'tests/wev.n3')

    #
    # aiprolog environment setup
    #

    self.prolog_rt = AIPrologRuntime(self.db, self.kb)
    self.parser = AIPrologParser()

    self.prolog_rt.set_trace(True)

    self.db.clear_module(UNITTEST_MODULE)
def __init__(self):

    #
    # prepare our lightweight sparql wrapper
    #

    self.query_prefixes = ''.join(
        map(lambda k: "PREFIX %s: <%s>\n" % (k, COMMON_PREFIXES[k]),
            COMMON_PREFIXES))

    #
    # set up graph store
    #

    config = misc.load_config('.nlprc')

    # self.graph = rdflib.ConjunctiveGraph('Sleepycat')
    # self.graph.open(RDF_LIB_STORE_PATH, create = True)

    # SQLAlchemy
    url = config.get('db', 'url')
    self.uri = rdflib.Literal(url)
    rdflib_sqlalchemy2.registerplugins()
    store = rdflib.plugin.get("SQLAlchemy2", rdflib.store.Store)(identifier=self.ident)
    self.graph = rdflib.ConjunctiveGraph(store, identifier=self.ident)
    self.graph.open(self.uri, create=True)
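# Usage sketch (illustration only, not from the original source): once the
# graph store is open, the prefix header prepared above can be prepended to a
# SPARQL query and run through rdflib. `nlpkb = KB()` is a hypothetical
# instantiation of the surrounding class; the rdfs:label pattern is an
# arbitrary example.

nlpkb = KB()

query = nlpkb.query_prefixes + """
SELECT ?s ?label
WHERE { ?s rdfs:label ?label }
LIMIT 10
"""

for row in nlpkb.graph.query(query):
    print(row)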
def __init__(self):
    cmdln.Cmdln.__init__(self)

    self.config = misc.load_config('.airc')

    toplevel = self.config.get('semantics', 'toplevel')
    xsb_root = self.config.get('semantics', 'xsb_root')
    db_url = self.config.get('db', 'url')

    self.kernal = AIKernal(db_url, xsb_root, toplevel)
def __init__(self, load_all_modules=False):

    self.config = misc.load_config('.airc')

    #
    # database
    #

    Session = sessionmaker(bind=model.engine)
    self.session = Session()

    #
    # TensorFlow (deferred, as tf can take quite a bit of time to set up)
    #

    self.tf_session = None
    self.nlp_model = None

    #
    # module management, setup
    #

    self.modules = {}
    self.initialized_modules = set()
    s = self.config.get('semantics', 'modules')
    self.all_modules = list(map(lambda s: s.strip(), s.split(',')))
    sys.path.append('modules')

    #
    # AIProlog parser, runtime
    #

    db_url = self.config.get('db', 'url')
    self.db = LogicDB(db_url)
    self.aip_parser = AIPrologParser(self)
    self.rt = AIPrologRuntime(self.db)
    self.dummyloc = SourceLocation('<rt>')

    #
    # alignment / word2vec (on-demand model loading)
    #

    self.w2v_model = None
    self.w2v_lang = None
    self.w2v_all_utterances = []

    #
    # load modules, if requested
    #

    if load_all_modules:
        for mn2 in self.all_modules:
            self.load_module(mn2)
            self.init_module(mn2)
def __init__(self):
    cmdln.Cmdln.__init__(self)

    self.config = misc.load_config('.airc')

    all_modules = list(
        map(lambda m: m.strip(),
            self.config.get('semantics', 'modules').split(',')))
    db_url = self.config.get('db', 'url')
    db = LogicDB(db_url)
    self.kernal = AIKernal(db=db, all_modules=all_modules)
def __init__(self):

    self.config = misc.load_config('.nlprc')

    #
    # database
    #

    Session = sessionmaker(bind=model.engine)
    self.session = Session()

    #
    # logic DB
    #

    self.db = LogicDB(self.session)

    #
    # knowledge base
    #

    self.kb = HALKB()

    #
    # TensorFlow (deferred, as tf can take quite a bit of time to set up)
    #

    self.tf_session = None
    self.nlp_model = None

    #
    # module management, setup
    #

    self.modules = {}
    s = self.config.get('semantics', 'modules')
    # materialize as a list so it can be iterated more than once
    self.all_modules = list(map(lambda s: s.strip(), s.split(',')))

    for mn2 in self.all_modules:
        self.load_module(mn2)

    #
    # prolog environment setup
    #

    self.prolog_engine = PrologAIEngine(self.db)
    self.parser = PrologParser()
def main(model_name, dictionary, language_model, sequitur_model=None,
         debug=0, verbose=False, prompt_words=False, *audio_corpora):

    misc.init_app('speech_kaldi_export')

    if verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    language_model_dir = LANGUAGE_MODELS_DIR.resolve() / language_model
    exit_if_language_model_dir_doesnt_exist(language_model_dir)

    config = misc.load_config('.speechrc')

    work_dir = ASR_MODELS_DIR / 'kaldi' / model_name
    kaldi_root = config.get("speech", "kaldi_root")

    data_dir = work_dir / "data"
    mfcc_dir = work_dir / "mfcc"

    wav16_dir = config.get("speech", "wav16")

    create_basic_work_dir_structure(
        str(data_dir), wav16_dir, str(mfcc_dir), str(work_dir),
        str(language_model_dir), kaldi_root)

    if sequitur_model:
        sequitur_model_path = str(SEQUITUR_MODEL_DIR / sequitur_model)
    else:
        sequitur_model_path = None

    generate_speech_and_text_corpora(data_dir, wav16_dir, debug,
                                     sequitur_model_path, dictionary,
                                     audio_corpora, prompt_words)

    copy_scripts_and_config_files(work_dir, kaldi_root)
def test_tts_pico(self):
    config = misc.load_config('.speechrc')
    tts = TTS(config.get('tts', 'host'), int(config.get('tts', 'port')))

    tts.engine = 'pico'
    for v, word in PICO_TESTS:
        tts.locale = v
        tts.voice = v

        wav = tts.synthesize(word)
        logging.debug('wav len: %d bytes.' % len(wav))
        self.assertGreater(len(wav), 100)

        tts.say(word)
def setUp(self):
    config = misc.load_config('.airc')

    #
    # logic DB
    #

    self.db = LogicDB(model.url)

    #
    # aiprolog environment setup
    #

    self.prolog_rt = AIPrologRuntime(self.db)
    self.parser = AIPrologParser(self.db)
    self.prolog_rt.set_trace(True)

    self.db.clear_module(UNITTEST_MODULE)
def setUp(self):
    config = misc.load_config('.airc')

    #
    # db, store
    #

    db_url = config.get('db', 'url')
    # db_url = 'sqlite:///tmp/foo.db'

    self.sas = SPARQLAlchemyStore(db_url, 'unittests', echo=True,
                                  prefixes=COMMON_PREFIXES,
                                  aliases=RESOURCE_ALIASES)
    self.context = u'http://example.com'
    self.sas.clear_all_graphs()

    #
    # LDF Mirror
    #

    self.ldfmirror = LDFMirror(self.sas, ENDPOINTS)
def main(verbose=False, *speech_corpora):
    """Scan directory for audio files and convert them to wav files

    For each speech corpus `speech_corpus`

    1. the resulting wav files are written to the directory
       `.speechrc.wav16`/<speech_corpus>/

    2. the transcripts in data/src/speech/<speech_corpus>/transcripts_*.csv
       are updated.
    """
    misc.init_app('speech_audio_scan')

    config = misc.load_config('.speechrc')

    speech_corpora_dir = Path(config.get("speech", "speech_corpora"))
    wav16 = Path(config.get("speech", "wav16"))

    if len(speech_corpora) < 1:
        logging.error("At least one speech corpus must be provided.")
        sys.exit(1)

    if verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    exit_if_corpus_is_missing(speech_corpora_dir, speech_corpora)

    for speech_corpus in speech_corpora:
        transcripts = Transcripts(corpus_name=speech_corpus, create_db=True)

        out_wav16_subdir = wav16 / speech_corpus
        out_wav16_subdir.mkdir(parents=True, exist_ok=True)

        in_root_corpus_dir = speech_corpora_dir / speech_corpus

        scan_audiodir(str(in_root_corpus_dir), transcripts,
                      str(out_wav16_subdir))

        transcripts.save()
        print(speech_corpus, "new transcripts saved.")
        print()
def __init__(self, kbname='kb'):

    #
    # prepare our lightweight sparql wrapper
    #

    self.query_prefixes = ''

    #
    # set up graph store
    #

    config = misc.load_config('.airc')

    # DB, SPARQLAlchemyStore
    db_url = config.get('db', 'url')
    self.sas = SPARQLAlchemyStore(db_url, kbname, echo=False)

    self.endpoints = {}  # host name -> LDF endpoint
def main(corpus, verbose=False):
    """Generate training sentences for language models

    Let text_corpus be the argument given on the command line. Then the
    corpus text_corpus is tokenized and each sentence is written on a
    separate line into `data/dst/text-corpora/<text_corpus>.txt`. All
    punctuation marks are stripped.
    """
    init_app('speech_sentences')

    if verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    config = load_config('.speechrc')

    TEXT_CORPORA_DIR.mkdir(parents=True, exist_ok=True)
    out_file = TEXT_CORPORA_DIR / (corpus + ".txt")

    with codecs.open(str(out_file), "w", "utf-8") as outf:
        # I haven't figured out how to refactor the processing algorithms of
        # the parole corpus to implement a generator.
        if corpus == "parole_de":
            corpus_path = config.get("speech", corpus)
            proc_parole_de(corpus_path, load_punkt_tokenizer, outf)
        elif corpus in TEXT_CORPORA:
            corpus_path = config.get("speech", corpus)
            for sentence in TEXT_CORPORA[corpus](corpus_path):
                outf.write(sentence + "\n")
        elif corpus in SPEECH_CORPORA:
            for sentence in SPEECH_CORPORA[corpus]():
                outf.write(sentence + "\n")
        else:
            raise Exception("This shouldn't happen.")

    logging.info('%s written.' % out_file)
def main(verbose=False, debug_sgm_limit=0):
    """Train the Punkt tokenizer on the German Parole corpus"""
    init_app('speech_sentences')

    if verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    config = load_config('.speechrc')

    parole_path = config.get("speech", "parole_de")

    logging.info("training punkt...")

    punkt_trainer = nltk.tokenize.punkt.PunktTrainer()
    train_punkt_wrapper = parole.TrainPunktWrapper(punkt_trainer)

    parole.parole_crawl(parole_path, train_punkt_wrapper.train_punkt,
                        debug_sgm_limit)

    logging.info("finalizing punkt training...")
    punkt_trainer.finalize_training(verbose=True)
    logging.info("punkt training done. %d text segments." %
                 train_punkt_wrapper.punkt_count)

    params = punkt_trainer.get_params()
    # print "Params: %s" % repr(params)

    parole.PUNKT_PICKLEFN.parent.mkdir(parents=True, exist_ok=True)
    tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer(params)
    with open(str(parole.PUNKT_PICKLEFN), mode='wb') as f:
        pickle.dump(tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)

    logging.info('%s written.' % parole.PUNKT_PICKLEFN)
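# Usage sketch (illustration only, not part of the original source): the
# tokenizer pickled above can be loaded back and applied to raw text. The
# sample sentence is made up for demonstration.

import pickle

with open(str(parole.PUNKT_PICKLEFN), mode='rb') as f:
    tokenizer = pickle.load(f)

for sentence in tokenizer.tokenize(u'Das ist ein Satz. Hier kommt noch einer.'):
    print(sentence)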
    finally:
        msg_cond.release()

#
# init
#

misc.init_app(PROC_TITLE)

#
# config, cmdline
#

config = misc.load_config('.airc', defaults=DEFAULTS)

broker_host = config.get('mqtt', 'broker_host')
broker_port = config.getint('mqtt', 'broker_port')
broker_user = config.get('mqtt', 'broker_user')
broker_pw = config.get('mqtt', 'broker_pw')

ai_model = config.get('server', 'model')
lang = config.get('server', 'lang')
vf_login = config.get('server', 'vf_login')
rec_dir = config.get('server', 'rec_dir')

kaldi_model_dir = config.get('server', 'kaldi_model_dir')
kaldi_model = config.get('server', 'kaldi_model')
kaldi_acoustic_scale = config.getfloat('server', 'kaldi_acoustic_scale')
kaldi_beam = config.getfloat('server', 'kaldi_beam')
kaldi_frame_subsampling_factor = config.getint('server',
                                               'kaldi_frame_subsampling_factor')
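# Usage sketch (illustration only): the broker settings read above are
# typically fed into an MQTT client. paho-mqtt is shown here as an
# assumption; the original client setup is not part of this excerpt.

import paho.mqtt.client as mqtt

client = mqtt.Client()
client.username_pw_set(broker_user, broker_pw)
client.connect(broker_host, broker_port, keepalive=60)
client.loop_start()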
from nltools.misc import compress_ws, load_config, init_app
from nltools.tokenizer import tokenize

SENTENCEFN = 'data/dst/speech/en/sentences.txt'
SENTENCES_STATS = 1000

DEBUG_LIMIT = 0
# DEBUG_LIMIT = 1000

#
# init
#

init_app('speech_sentences')

config = load_config('.speechrc')

europarl = config.get("speech", "europarl_en")
movie_dialogs = config.get("speech", "cornell_movie_dialogs")
web_questions = config.get("speech", "web_questions")
yahoo_answers = config.get("speech", "yahoo_answers")

#
# commandline parsing
#

parser = OptionParser("usage: %prog [options]")

parser.add_option("-v", "--verbose", action="store_true", dest="verbose",
                  help="enable verbose logging")
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

import sys

from sqlalchemy import create_engine
from sqlalchemy import Column, Integer, String, Text, Unicode, UnicodeText, \
                       Enum, DateTime, ForeignKey, Index
from sqlalchemy.orm import relationship
from sqlalchemy.ext.declarative import declarative_base

from nltools import misc

config = misc.load_config('.airc')

# db_server = config.get("semantics", "dbserver")
# db_name   = config.get("semantics", "dbname")
# db_user   = config.get("semantics", "dbuser")
# db_pass   = config.get("semantics", "dbpass")
#
# # We connect with the help of the PostgreSQL URL
# # postgresql://federer:grandestslam@localhost:5432/tennis
# url = 'postgresql://{}:{}@{}:{}/{}'
# url = url.format(db_user, db_pass, db_server, 5432, db_name)

url = config.get("db", "url")

# engine = create_engine(url, echo=True)
engine = create_engine(url)
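# Usage sketch (illustration only): with the `engine` above, declarative
# models can be defined and a session opened. `Base` and the sample `Note`
# model are hypothetical; the project's real models are defined elsewhere
# in this module.

from sqlalchemy.orm import sessionmaker

Base = declarative_base()

class Note(Base):
    __tablename__ = 'notes'

    id   = Column(Integer, primary_key=True)
    text = Column(UnicodeText)

Base.metadata.create_all(engine)  # create tables if they do not exist yet

Session = sessionmaker(bind=engine)
session = Session()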
def fetch_weather_forecast(kernal):

    config = misc.load_config('.airc')
    api_key = config.get("weather", "api_key")

    logging.debug('fetch_weather_forecast cron job, api key: %s' % api_key)

    sl = SourceLocation(fn='__internet__', col=0, line=0)

    #
    # resolve city ids, timezones
    #

    locations = {}

    # owmCityId(wdeLosAngeles, 5368361).
    solutions = kernal.rt.search_predicate('owmCityId', ['_1', '_2'])

    for s in solutions:
        location = s['_1'].name
        city_id = int(s['_2'].f)

        # aiTimezone(wdeNewYorkCity, "America/New_York").
        solutions2 = kernal.rt.search_predicate('aiTimezone', [location, '_1'])
        if len(solutions2) < 1:
            continue
        timezone = solutions2[0]['_1'].s

        solutions2 = kernal.rt.search_predicate('rdfsLabel', [location, 'en', '_1'])
        if len(solutions2) < 1:
            continue
        label = solutions2[0]['_1'].s

        # wdpdCoordinateLocation(wdeBerlin, "Point(13.383333333 52.516666666)").
        solutions2 = kernal.rt.search_predicate('wdpdCoordinateLocation', [location, '_1'])
        if len(solutions2) < 1:
            continue
        m = coord_matcher.match(solutions2[0]['_1'].s)
        if not m:
            continue
        geo_lat = float(m.group(2))
        geo_long = float(m.group(1))

        if not location in locations:
            locations[location] = {}
        locations[location]['city_id'] = city_id
        locations[location]['timezone'] = timezone
        locations[location]['label'] = label
        locations[location]['long'] = geo_long
        locations[location]['lat'] = geo_lat

    def mangle_label(label):
        return ''.join(map(lambda c: c if c.isalnum() else '', label))

    #
    # generate triples of weather and astronomical data
    #

    env = {}

    for location in locations:

        city_id = locations[location]['city_id']
        timezone = locations[location]['timezone']
        loc_label = mangle_label(locations[location]['label'])
        geo_lat = locations[location]['lat']
        geo_long = locations[location]['long']

        tz = pytz.timezone(timezone)
        ref_dt = datetime.now(tz).replace(hour=0, minute=0, second=0,
                                          microsecond=0)

        logging.debug("%s %s" % (location, ref_dt))

        #
        # sunrise / sunset
        #

        l = astral.Location()
        l.name = 'name'
        l.region = 'region'
        l.latitude = geo_lat
        l.longitude = geo_long
        l.timezone = timezone
        l.elevation = 0

        for day_offset in range(7):

            cur_date = (ref_dt + timedelta(days=day_offset)).date()
            sun = l.sun(date=cur_date, local=True)

            sun_const = u'aiUnlabeledSun%s%s' % (loc_label,
                                                 cur_date.strftime('%Y%m%d'))

            env = do_retract(env, build_predicate('aiLocation', [sun_const, '_']))
            env = do_retract(env, build_predicate('aiDate', [sun_const, '_']))
            env = do_retract(env, build_predicate('aiDawn', [sun_const, '_']))
            env = do_retract(env, build_predicate('aiSunrise', [sun_const, '_']))
            env = do_retract(env, build_predicate('aiNoon', [sun_const, '_']))
            env = do_retract(env, build_predicate('aiSunset', [sun_const, '_']))
            env = do_retract(env, build_predicate('aiDusk', [sun_const, '_']))

            env = do_assertz(env, Clause(location=sl, head=build_predicate(
                'aiLocation', [sun_const, location])))
            env = do_assertz(env, Clause(location=sl, head=build_predicate(
                'aiDate', [sun_const, StringLiteral(cur_date.isoformat())])))
            env = do_assertz(env, Clause(location=sl, head=build_predicate(
                'aiDawn', [sun_const, StringLiteral(sun['dawn'].isoformat())])))
            env = do_assertz(env, Clause(location=sl, head=build_predicate(
                'aiSunrise', [sun_const, StringLiteral(sun['sunrise'].isoformat())])))
            env = do_assertz(env, Clause(location=sl, head=build_predicate(
                'aiNoon', [sun_const, StringLiteral(sun['noon'].isoformat())])))
            env = do_assertz(env, Clause(location=sl, head=build_predicate(
                'aiSunset', [sun_const, StringLiteral(sun['sunset'].isoformat())])))
            env = do_assertz(env, Clause(location=sl, head=build_predicate(
                'aiDusk', [sun_const, StringLiteral(sun['dusk'].isoformat())])))

            logging.debug("%s %s %s -> %s" % (sun_const, cur_date,
                                              sun['sunrise'], sun['sunset']))

        #
        # fetch json forecast data from OpenWeatherMap
        #

        url = 'http://api.openweathermap.org/data/2.5/forecast?id=%s&APPID=%s' % (
            city_id, api_key)

        data = json.load(urllib2.urlopen(url))

        if not 'list' in data:
            logging.error('failed to fetch weather data for %s, got: %s' %
                          (location, repr(data)))
            continue

        # print repr(data['list'])

        for fc in data['list']:

            dt_to = datetime.strptime(fc['dt_txt'], '%Y-%m-%d %H:%M:%S')
            dt_to = dt_to.replace(tzinfo=pytz.utc)
            dt_from = dt_to - timedelta(hours=3)

            temp_min = fc['main']['temp_min'] - KELVIN
            temp_max = fc['main']['temp_max'] - KELVIN
            code = fc['weather'][0]['id']
            precipitation = float(fc['rain']['3h']) \
                if 'rain' in fc and '3h' in fc['rain'] else 0.0
            icon = fc['weather'][0]['icon']
            description = fc['weather'][0]['description']
            clouds = float(fc['clouds']['all'])

            fc_const = 'aiUnlabeledFc%s%s' % (loc_label,
                                              dt_from.strftime('%Y%m%d%H%M%S'))

            logging.debug("%s on %s-%s city_id=%s" % (fc_const, dt_from,
                                                      dt_to, city_id))

            # aiDescription(aiUnlabeledFcFreudental20161205180000, "clear sky").
            # aiDtEnd(aiUnlabeledFcFreudental20161205180000, "2016-12-05T21:00:00+00:00").
            # aiTempMin(aiUnlabeledFcFreudental20161205180000, -6.666).
            # aiIcon(aiUnlabeledFcFreudental20161205180000, "01n").
            # aiLocation(aiUnlabeledFcFreudental20161205180000, wdeFreudental).
            # aiDtStart(aiUnlabeledFcFreudental20161205180000, "2016-12-05T18:00:00+00:00").
            # aiClouds(aiUnlabeledFcFreudental20161205180000, 0.0).
            # aiPrecipitation(aiUnlabeledFcFreudental20161205180000, 0.0).
            # aiTempMax(aiUnlabeledFcFreudental20161205180000, -6.45).

            env = do_retract(env, build_predicate('aiDescription', [fc_const, '_']))
            env = do_retract(env, build_predicate('aiDtEnd', [fc_const, '_']))
            env = do_retract(env, build_predicate('aiTempMin', [fc_const, '_']))
            env = do_retract(env, build_predicate('aiIcon', [fc_const, '_']))
            env = do_retract(env, build_predicate('aiLocation', [fc_const, '_']))
            env = do_retract(env, build_predicate('aiDtStart', [fc_const, '_']))
            env = do_retract(env, build_predicate('aiClouds', [fc_const, '_']))
            env = do_retract(env, build_predicate('aiPrecipitation', [fc_const, '_']))
            env = do_retract(env, build_predicate('aiTempMax', [fc_const, '_']))

            env = do_assertz(env, Clause(location=sl, head=build_predicate(
                'aiLocation', [fc_const, location])))
            env = do_assertz(env, Clause(location=sl, head=build_predicate(
                'aiTempMin', [fc_const, temp_min])))
            env = do_assertz(env, Clause(location=sl, head=build_predicate(
                'aiTempMax', [fc_const, temp_max])))
            env = do_assertz(env, Clause(location=sl, head=build_predicate(
                'aiPrecipitation', [fc_const, precipitation])))
            env = do_assertz(env, Clause(location=sl, head=build_predicate(
                'aiClouds', [fc_const, clouds])))
            env = do_assertz(env, Clause(location=sl, head=build_predicate(
                'aiIcon', [fc_const, StringLiteral(icon)])))
            env = do_assertz(env, Clause(location=sl, head=build_predicate(
                'aiDescription', [fc_const, StringLiteral(description)])))
            env = do_assertz(env, Clause(location=sl, head=build_predicate(
                'aiDtStart', [fc_const, StringLiteral(dt_from.isoformat())])))
            env = do_assertz(env, Clause(location=sl, head=build_predicate(
                'aiDtEnd', [fc_const, StringLiteral(dt_to.isoformat())])))

    kernal.rt.apply_overlay(WEATHER_DATA_MODULE, env)
def main(verbose=False):
    """Convert gspv2 corpus to the VoxForge corpus format

    The variable `speech_arc` in ~/.speechrc must point to a folder gspv2
    which is used as the source containing the original gspv2 corpus, i.e.
    containing the subfolders dev, test, and train.

    The variable `speech_corpora` in ~/.speechrc must point to a folder
    where the resulting corpus should be written. The script will create a
    subfolder gspv2 here for the resulting voxforge-formatted data.
    """
    misc.init_app('speech_audio_scan')

    config = misc.load_config('.speechrc')

    if verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    speech_arc_dir = Path(config.get("speech", "speech_arc"))
    speech_corpora_dir = Path(config.get("speech", "speech_corpora"))

    src_root_dir = speech_arc_dir / "gspv2"
    dst_root_dir = speech_corpora_dir / "gspv2"

    exit_if_dst_root_dir_exists(dst_root_dir)

    speakers = set()
    speaker_gender = {}

    for folder in ['train', 'test', 'dev']:

        destdir = dst_root_dir
        src_dir = src_root_dir / folder

        num_files = len([f for f in src_dir.glob("*.xml")])
        cnt_files = 0

        for xml_path in src_dir.glob("*.xml"):

            f = str(xml_path)
            cnt_files += 1
            fbase = f[0:len(f) - 4]

            with codecs.open(f, 'r', 'utf-8') as xmlfile:
                # remove urls
                text = xmlfile.read()
                soup = BeautifulSoup(text)

                sentence = (soup.recording.sentence.string).strip()
                cleaned_sentence = (soup.recording.cleaned_sentence.string).strip()
                sentence_id = int((soup.recording.sentence_id.string).strip())
                speaker_id = (soup.recording.speaker_id.string).strip()
                gender = (soup.recording.gender.string).strip()

                name = 'gsp%s' % speaker_id.replace('-', '')
                speakerdir = destdir / (name + "-1")

                if not speaker_id in speakers:
                    speakers.add(speaker_id)
                    speaker_gender[name] = 'm' if gender == 'male' else 'f'
                    (speakerdir / "wav").mkdir(parents=True, exist_ok=True)
                    (speakerdir / "etc").mkdir(parents=True, exist_ok=True)

                for mic in ['Yamaha', 'Kinect-Beam', 'Kinect-RAW', 'Realtek',
                            'Samson']:

                    srcaudiofn = src_dir / ('%s_%s.wav' % (fbase, mic))

                    if not srcaudiofn.is_file():
                        continue

                    audiofn = Path('%s-%s' % (fbase, mic)).name
                    dstaudiofn = speakerdir / "wav" / (audiofn + ".wav")

                    logging.info('%5d/%5d %s %s %s' %
                                 (cnt_files, num_files, name, audiofn,
                                  str(srcaudiofn)))

                    prompts_orig = speakerdir / "etc" / "prompts-original"
                    with open(str(prompts_orig), 'a') as promptsf:
                        promptsf.write((u'%s %s\n' %
                                        (audiofn, cleaned_sentence)).encode('utf8'))

                    copy_file(str(srcaudiofn), str(dstaudiofn))
from optparse import OptionParser
from StringIO import StringIO

from nltools import misc
from zamiaprolog.logicdb import LogicDB
from zamiaprolog.parser import PrologParser, PrologError
from zamiaprolog.runtime import PrologRuntime, PrologRuntimeError

# logging.basicConfig(level=logging.DEBUG)
logging.basicConfig(level=logging.INFO)

#
# init
#

misc.init_app('prolog_shell')

config = misc.load_config('.nlprc')

#
# readline, history
#

histfile = os.path.join(os.path.expanduser("~"), ".hal_prolog_history")
try:
    readline.read_history_file(histfile)
    # default history len is -1 (infinite), which may grow unruly
    readline.set_history_length(1000)
except IOError:
    pass
atexit.register(readline.write_history_file, histfile)

#
def get_api_key():
    config = misc.load_config('.airc')
    return config.get('weather', 'api_key')
PROC_TITLE = 'abook-analyze'

MODELDIR = '../data/models/kaldi-chain-generic-%s-latest' % LANG
MODEL = 'tdnn_sp'

#
# init terminal
#

misc.init_app(PROC_TITLE)

#
# config
#

config = misc.load_config('.speechrc')

#
# command line
#

parser = OptionParser("usage: %prog [options] directory")

parser.add_option("-v", "--verbose", action="store_true", dest="verbose",
                  help="enable debug output")

(options, args) = parser.parse_args()
def main(language_model, debug=0, verbose=False, *text_corpus):
    """Train n-gram language model on tokenized text corpora

    The resulting language model will be written to the directory
    data/dst/lm/<language_model>/. The search path for the tokenized text
    corpora is data/dst/text-corpora.

    Example:

        ./speech_build_lm.py my-language-model parole_de europarl_de

    A language model will be trained on the text corpora found in
    data/dst/text-corpora/parole_de.txt and
    data/dst/text-corpora/europarl_de.txt. The resulting language model will
    be written to the directory data/dst/lm/my-language-model/.
    """
    init_app('speech_build_lm')

    if len(text_corpus) < 1:
        logging.error("Argument text_corpus missing, at least one is required.")
        sys.exit(1)

    if verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    config = load_config('.speechrc')

    srilm_root = config.get("speech", "srilm_root")
    ngram_path = Path('%s/bin/i686-m64/ngram' % srilm_root)
    ngram_count_path = Path('%s/bin/i686-m64/ngram-count' % srilm_root)

    if not ngram_path.exists():
        logging.error("Could not find required executable %s" % ngram_path)
        sys.exit(1)
    if not ngram_count_path.exists():
        logging.error("Could not find required executable %s" % ngram_count_path)
        sys.exit(1)

    outdir = LANGUAGE_MODELS_DIR / language_model
    outdir.mkdir(parents=True, exist_ok=True)

    train_fn = outdir / "train_all.txt"

    num_sentences = 0
    with codecs.open(str(train_fn), 'w', 'utf8') as dstf:
        for text_corpus_name in text_corpus:
            src = TEXT_CORPORA_DIR / (text_corpus_name + ".txt")
            logging.info('reading from sources %s' % src)
            with codecs.open(str(src), 'r', 'utf8') as srcf:
                while True:
                    line = srcf.readline()
                    if not line:
                        break
                    dstf.write(line)
                    num_sentences += 1
                    if num_sentences % SENTENCES_STATS == 0:
                        logging.info('%8d sentences.' % num_sentences)
                    if debug > 0 and num_sentences >= debug:
                        logging.warning('stopping because sentence debug '
                                        'limit is reached.')
                        break

    logging.info('done. %s written, %d sentences.' % (train_fn, num_sentences))

    lm_fn = outdir / 'lm_full.arpa'
    train_ngram_model(ngram_count_path, train_fn, lm_fn)

    lm_pruned_fn = outdir / 'lm.arpa'
    prune_ngram_model(ngram_path, lm_fn, lm_pruned_fn)
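# Sketch (assumption, not the original helpers): train_ngram_model and
# prune_ngram_model presumably shell out to the SRILM binaries located
# above. A minimal version might look like this; the -order and -prune
# values are illustrative defaults, not taken from the original source.

import subprocess

def train_ngram_model(ngram_count_path, train_fn, lm_fn, order=3):
    # build a Kneser-Ney smoothed n-gram model with SRILM's ngram-count
    subprocess.check_call([str(ngram_count_path),
                           '-order', str(order),
                           '-text', str(train_fn),
                           '-lm', str(lm_fn),
                           '-kndiscount', '-interpolate'])

def prune_ngram_model(ngram_path, lm_fn, lm_pruned_fn, prune=1e-9):
    # entropy-prune the full model into a smaller one with SRILM's ngram
    subprocess.check_call([str(ngram_path),
                           '-lm', str(lm_fn),
                           '-prune', str(prune),
                           '-write-lm', str(lm_pruned_fn)])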