def main(config_path='io_args.yml'):
    try:
        conf = load(config_path)
        do_export(conf)
    except Exception as ex:
        sys.stderr.write(repr(ex))
        return 1
def main(config_path='io_args.yml'):
    try:
        conf = load(config_path)
        write_unique_file_pair(conf)
    except Exception as ex:
        sys.stderr.write(repr(ex))
        return 1
def main(config_path='io_args.yml'):
    try:
        conf = load(config_path)
        write_parallel_index_files(conf)
    except Exception as ex:
        sys.stderr.write(repr(ex))
        return 1
def main(config_path='io_args.yml'):
    try:
        conf = load(config_path)
        vocab_path_base = os.path.join(conf['vocabulary_directory'],
                                       conf['vocabulary_name'])
        for path in (conf['src_train'], conf['trg_train']):
            write_vocabulary(path, vocab_path_base)
    except Exception as ex:
        sys.stderr.write(repr(ex))
        return 1
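# The entry points above all read their settings through a shared load()
# helper that is not shown in this excerpt. A minimal sketch, assuming it
# simply parses the YAML file into a plain dict (the function name and the
# io_args.yml filename come from the code above; everything else here is an
# assumption, not the project's actual implementation):
import yaml


def load(path):
    # Parse a YAML configuration file into a dict of settings.
    with open(path, encoding='utf-8') as f:
        return yaml.safe_load(f)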
def main(io_argument_path='io_args.yml'):
    args = load(io_argument_path)
    # read the serialized model architecture and render it to a PNG
    with open(args['model_name'] + '.json', encoding='utf-8') as json_file:
        model_json = json_file.read()
    model = model_from_json(model_json)
    plot_model(model,
               to_file='{}.png'.format(args['model_name']),
               show_shapes=True,
               rankdir='BT')
def load(session, epochNumber=0, config=None):
    if config is None:
        config = cfg.load(session)
    # find results file for epoch number
    epochDir = get_epoch_dir(config, epochNumber)
    # epoch = get_epochs(config)[epochNumber]
    # epochDir = '/'.join((config.get('session','outputprefix'), epoch))
    h5files = glob.glob(epochDir + '/*.h5')
    if len(h5files) != 1:
        utils.error('More than one .h5 file found in output '
                    'directory: %s' % str(h5files))
    return Session(h5files[0],
                   config.getint('audio', 'samprate'),
                   cache_dir=config.get('filesystem', 'tmp', '/tmp'))
from yadfs.client.client import Client
import sys
import cfg

if __name__ == '__main__':
    cfg_path = sys.argv[1]
    opts = cfg.load(cfg_path)
    cl = Client(opts['ns_addr'])
    cl.create_file("/home/osboxes/yamr/cfg.py", "/test")
    data = cl.get_chunk("/test/cfg.py_0")
    cl.download_to("/test/cfg.py", "/home/osboxes/yamr/cfg_text.py")
    print(data)
    cl.save("hohohoho", "/test/my_file")
    cl.download_to("/test/my_file", "/home/osboxes/yamr/hoho.txt")
def train(properties_path='props.yml', io_argument_path='io_args.yml'):
    props = load(properties_path)
    args = load(io_argument_path)
    # Read vocabularies
    src_f_name = args['src_train']
    trg_f_name = args['trg_train']
    vs = data_dense.read_vocabularies(args['model_name'] + "-vocab.pickle",
                                      src_f_name, trg_f_name, False,
                                      props['ngram_length'])
    vs.trainable = False
    # Inputs: list of one Input per N-gram size
    src_inp = Input(shape=(props['max_sent_length'], ),
                    name="source_ngrams_{N}".format(
                        N=props['ngram_length'][0]),
                    dtype="int32")
    trg_inp = Input(shape=(props['max_sent_length'], ),
                    name="target_ngrams_{N}".format(
                        N=props['ngram_length'][0]),
                    dtype="int32")
    # Embeddings: list of one Embedding per input
    src_emb = Embedding(len(vs.source_ngrams[props['ngram_length'][0]]),
                        props['feature_count'],
                        input_length=props['max_sent_length'],
                        name="source_embedding_{N}".format(
                            N=props['ngram_length'][0]))(src_inp)
    trg_emb = Embedding(len(vs.target_ngrams[props['ngram_length'][0]]),
                        props['feature_count'],
                        input_length=props['max_sent_length'],
                        name="target_embedding_{N}".format(
                            N=props['ngram_length'][0]))(trg_inp)
    # Conv
    src_conv_out = Conv1D(props['feature_count'], (5, ),
                          padding='same',
                          activation='relu')(src_emb)
    trg_conv_out = Conv1D(props['feature_count'], (5, ),
                          padding='same',
                          activation='relu')(trg_emb)
    src_maxpool_out = MaxPooling1D(
        pool_size=props['max_sent_length'])(src_conv_out)
    trg_maxpool_out = MaxPooling1D(
        pool_size=props['max_sent_length'])(trg_conv_out)
    src_flat_out = Flatten()(src_maxpool_out)
    trg_flat_out = Flatten()(trg_maxpool_out)
    # yet one dense
    src_dense_out = Dense(props['gru_width'],
                          name="source_dense")(src_flat_out)
    trg_dense_out = Dense(props['gru_width'],
                          name="target_dense")(trg_flat_out)
    # ...and cosine between the source and target side
    merged_out = dot([src_dense_out, trg_dense_out], axes=1, normalize=True)
    # classification
    s_out = Dense(1, activation='sigmoid',
                  name='classification_layer')(merged_out)
    model = Model(inputs=[src_inp, trg_inp], outputs=s_out)
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    print(model.summary())
    train_inf_iter = data_dense.InfiniteDataIterator(src_f_name, trg_f_name)
    train_batch_iter = data_dense.fill_batch(props['minibatch_size'],
                                             props['max_sent_length'], vs,
                                             train_inf_iter,
                                             props['ngram_length'])
    # dev iter
    dev_inf_iter = data_dense.InfiniteDataIterator(args['src_devel'],
                                                   args['trg_devel'])
    dev_batch_iter = data_dense.fill_batch(props['minibatch_size'],
                                           props['max_sent_length'], vs,
                                           dev_inf_iter,
                                           props['ngram_length'])
    # save model json
    model_json = model.to_json()
    with open('{}.json'.format(args['model_name']), "w") as json_file:
        json_file.write(model_json)
    # callback to save weights after each epoch
    save_cb = ModelCheckpoint(filepath=args['model_name'] + '.{epoch:02d}.h5',
                              monitor='val_loss',
                              verbose=1,
                              save_best_only=False,
                              mode='auto')
    early_stop = EarlyStopping(monitor='val_loss', patience=5)
    csv_logger = CSVLogger('./logs/train.log', append=True, separator=';')
    # tensor_board = TensorBoard(log_dir='./graph', write_graph=True, write_images=True)
    # callbacks = [save_cb, early_stop, csv_logger, tensor_board]
    callbacks = [
        save_cb, early_stop, csv_logger,
        TrainValTensorBoard(write_graph=False)
    ]
    # steps per epoch or validation steps equal samples in train or devel
    # dataset / batch size, e.g. 2700 / 200 = 14
    steps_per_epoch = ceil(
        len(train_inf_iter.data) / props['minibatch_size']) * 10
    val_steps = ceil(len(dev_inf_iter.data) / props['minibatch_size']) * 10
    model.fit_generator(train_batch_iter,
                        steps_per_epoch,
                        props['epochs'],
                        callbacks=callbacks,
                        validation_data=dev_batch_iter,
                        validation_steps=val_steps)
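# For reference, the configuration keys that train() reads. The key names are
# taken from the code above; the values below are hypothetical placeholders,
# not the project's actual settings.
example_props = {
    'ngram_length': [4],    # sequence of N-gram sizes; only the first is used above
    'max_sent_length': 200,
    'feature_count': 150,   # embedding size and number of Conv1D filters
    'gru_width': 150,       # width of the source/target dense projections
    'minibatch_size': 200,
    'epochs': 10,
}
example_io_args = {
    'model_name': 'keras_model',
    'src_train': 'train.src',
    'trg_train': 'train.trg',
    'src_devel': 'devel.src',
    'trg_devel': 'devel.trg',
}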
def main(properties_path='props.yml', io_argument_path='io_args.yml'):
    args = load(io_argument_path)
    props = load(properties_path)
    lang_pair = '{}-{}'.format(args['source_language'],
                               args['target_language'])
    aligned_filename_suffix = '.{}.aligned'.format(lang_pair)
    lang_pair_work_directory = os.path.join(args['work_directory'], lang_pair)
    dictionary_path_base = os.path.join(args['dictionary_directory'],
                                        args['dictionary_name'])
    vocabulary_path_base = os.path.join(args['vocabulary_directory'],
                                        args['vocabulary_name'])
    src_lang_vocabulary_path = '{}.{}'.format(vocabulary_path_base,
                                              args['source_language'])
    trg_lang_vocabulary_path = '{}.{}'.format(vocabulary_path_base,
                                              args['target_language'])
    src2trg_dictionary = build_dictionary(dictionary_path_base + ".f2e",
                                          src_lang_vocabulary_path)
    trg2src_dictionary = build_dictionary(dictionary_path_base + ".e2f",
                                          trg_lang_vocabulary_path)
    word2idx_src = \
        {word.strip().lower(): i
         for i, word in enumerate(open(src_lang_vocabulary_path,
                                       encoding="utf-8"))}
    word2idx_trg = \
        {word.strip().lower(): i
         for i, word in enumerate(open(trg_lang_vocabulary_path,
                                       encoding="utf-8"))}
    src2trg_matrix = build_translation_matrix(src2trg_dictionary,
                                              word2idx_src,
                                              word2idx_trg).tocsr()
    trg2src_matrix = build_translation_matrix(trg2src_dictionary,
                                              word2idx_trg,
                                              word2idx_src).tocsr()
    if not os.path.exists(lang_pair_work_directory):
        os.makedirs(lang_pair_work_directory)
    for entry in os.scandir(os.path.join(args['source_data_directory'],
                                         'snt')):
        if not (entry.is_file() and entry.name.endswith('_{}.snt'.format(
                args['source_language']))):
            continue
        pair_title = entry.name.rsplit('.', 1)[0].rsplit('_', 1)[0]
        trg_lang_filepath = os.path.join(
            os.path.dirname(entry.path),
            '{}_{}.snt'.format(pair_title, args['target_language']))
        if not os.path.exists(trg_lang_filepath):
            continue
        with open(os.path.join(lang_pair_work_directory,
                               pair_title + aligned_filename_suffix),
                  'w', encoding='utf-8', newline='\n') as out_combined, \
             open(os.path.join(lang_pair_work_directory,
                               '{}.{}.keras'.format(pair_title, lang_pair)),
                  'w', encoding='utf-8', newline='\n') as out_keras, \
             open(os.path.join(lang_pair_work_directory,
                               '{}.{}.baseline'.format(pair_title, lang_pair)),
                  'w', encoding='utf-8', newline='\n') as out_baseline:
            src_sentences, src_vectors = load_data(entry.path,
                                                   props['feature_count'],
                                                   props['gru_width'])
            trg_sentences, trg_vectors = load_data(trg_lang_filepath,
                                                   props['feature_count'],
                                                   props['gru_width'])
            src_sparse = build_sparse_sklearn(src_sentences, word2idx_src)
            src_normalizer = np.array(
                [len(set(s.split())) for s in src_sentences],
                dtype=np.float32)
            sparse_dot_out = np.zeros(
                (len(src_sentences), len(trg_sentences)), dtype=np.float32)
            sparse_dot_out2 = np.zeros(
                (len(src_sentences), len(trg_sentences)), dtype=np.float32)
            trg_sparse = build_sparse_sklearn(trg_sentences, word2idx_trg)
            trg_normalizer = np.array(
                [len(set(s.split())) for s in trg_sentences],
                dtype=np.float32)
            trg_normalizer = trg_normalizer.reshape(
                (1, len(trg_normalizer)))[:, len(trg_sentences) - 1]
            trg_translated_sparse = (trg_sparse * trg2src_matrix).tocsc()
            trg_translated_sparse.data = np.ones(
                len(trg_translated_sparse.data), dtype=np.float32)
            trg_sparse = trg_sparse.tocsc()
            sim_matrix = np.dot(src_vectors[:len(src_sentences), :],
                                trg_vectors[:len(trg_sentences), :].T)
            csr_csc_dot_f(0, len(src_sentences), src_sparse,
                          trg_translated_sparse, sparse_dot_out)
            np.divide(sparse_dot_out,
                      src_normalizer.reshape(
                          (len(src_normalizer), 1))[:len(src_sentences), :],
                      sparse_dot_out)  # normalize
            tmp = src_sparse[:len(src_sentences), :] * src2trg_matrix
            tmp.data = np.ones(len(tmp.data), dtype=np.float32)  # force to binary
            csr_csc_dot_f(0, tmp.shape[0], tmp, trg_sparse, sparse_dot_out2)
            np.divide(sparse_dot_out2, trg_normalizer, sparse_dot_out2)  # normalize
            # sum sparse_dot_out and sparse_dot_out2, write results to sparse_dot_out
            np.add(sparse_dot_out, sparse_dot_out2, sparse_dot_out)
            # sum all three, write results to sparse_dot_out2
            np.add(sim_matrix, sparse_dot_out, sparse_dot_out2)
            # now sim_matrix has dense similarities, sparse_dot_out has baseline
            # similarities, and sparse_dot_out2 has combined similarities
            argmaxs_keras = np.argmax(sim_matrix, axis=1)
            argmaxs_baseline = np.argmax(sparse_dot_out, axis=1)
            argmaxs_combined = np.argmax(sparse_dot_out2, axis=1)
            # Print results
            for j in range(argmaxs_keras.shape[0]):
                # all three should have the same shape
                # keras
                print(sim_matrix[j, argmaxs_keras[j]],
                      src_sentences[j],
                      trg_sentences[argmaxs_keras[j]],
                      sep="\t", file=out_keras, flush=True)
                # baseline
                print(sparse_dot_out[j, argmaxs_baseline[j]] / 2.0,
                      src_sentences[j],
                      trg_sentences[argmaxs_baseline[j]],
                      sep="\t", file=out_baseline, flush=True)
                # combined
                print(sparse_dot_out2[j, argmaxs_combined[j]] / 3.0,
                      src_sentences[j],
                      trg_sentences[argmaxs_combined[j]],
                      sep="\t", file=out_combined, flush=True)
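# csr_csc_dot_f is not defined in this excerpt (it is presumably a compiled
# helper). Judging only from the shapes at its two call sites above, it fills
# rows [row_start, row_end) of a dense output with the product of a CSR matrix
# and the transpose of a CSC matrix. A rough pure-SciPy sketch of that
# assumption, not the actual implementation:
def csr_csc_dot_f(row_start, row_end, a_csr, b_csc, out):
    # out[row_start:row_end] <- a_csr[row_start:row_end] @ b_csc.T, densified
    block = a_csr[row_start:row_end] @ b_csc.T
    out[row_start:row_end, :] = block.toarray()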
from socket import *
from uuid import UUID
from mqtt import MQTT
from decrypt import decrypt
import cfg

cfg = cfg.load()
mqtt = MQTT()

# listen for broadcast datagrams from the sensors
s = socket(AF_INET, SOCK_DGRAM)
s.setsockopt(SOL_SOCKET, SO_BROADCAST, 1)
s.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1)
s.bind(('', 17117))

while True:
    data = s.recv(1024)
    data = decrypt(data)
    # if no valid data was received
    if data is None:
        continue
    # print(data.hex(' '))
    # check first two bytes of mac, they must be 0x17 and 0x00
    mac = data[0:6]
    if mac[0] != 0x17 or mac[1] != 0x00:
        continue
    # battery state (percent)
    if data[10] > 0:
        mqtt.publish('sensor/0x%s/battery/state' % (mac.hex()), data[10])
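# The MQTT helper imported above is not part of this excerpt. A minimal
# sketch of what it might wrap, assuming paho-mqtt underneath; the broker
# host/port values are hypothetical placeholders, only the
# publish(topic, payload) call shape comes from the script above.
import paho.mqtt.client as paho


class MQTT:
    def __init__(self, host='localhost', port=1883):
        self.client = paho.Client()
        self.client.connect(host, port)
        self.client.loop_start()  # handle network traffic in the background

    def publish(self, topic, payload):
        self.client.publish(topic, payload)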
def get_n_epochs(session):
    config = cfg.load(session)
    return len(get_epochs(config))
def vectorize(properties_path='props.yml', io_argument_path='io_args.yml'):
    # read and create files
    props = load(properties_path)
    args = load(io_argument_path)
    output_filename_extension = '.npy'
    print('Vectorizing all sentences', file=sys.stderr)
    # read vocabularies
    vs = data_dense.read_vocabularies(
        '{}-vocab.pickle'.format(args['model_name']), "xxx", "xxx", False,
        props['ngram_length'])
    vs.trainable = False
    # load model
    trained_model = load_model('{}.{}'.format(args['model_name'],
                                              args['epoch_number']))
    output_size = trained_model.get_layer('source_dense').output_shape[1]
    max_sent_len = trained_model.get_layer(
        'source_ngrams_{n}'.format(
            n=props['ngram_length'][0])).output_shape[1]
    print(output_size, max_sent_len, file=sys.stderr)
    # build matrices
    for entry in os.scandir(args['preprocessed_source_data_directory']):
        if not (entry.is_file() and entry.name.endswith('_{}.snt'.format(
                args['source_language']))):
            continue
        src_in_path = entry.path
        trg_in_path = entry.path.rsplit('_', 1)[0] + '_{}.snt'.format(
            args['target_language'])
        # trg_in_path = '{}.{}'.format(os.path.splitext(entry.path)[0], args['target_language'])
        if not os.path.exists(trg_in_path):
            continue
        src_out_path = entry.path + output_filename_extension
        trg_out_path = trg_in_path + output_filename_extension
        max_sent_count = max(get_sentence_count(src_in_path),
                             get_sentence_count(trg_in_path))
        with open(src_in_path, encoding='utf-8') as src_inp, \
             open(trg_in_path, encoding='utf-8') as trg_inp, \
             open(src_out_path, 'wb') as src_outp, \
             open(trg_out_path, 'wb') as trg_outp:
            # get vectors
            counter = 0
            for i, (mx, targets, src_data, trg_data) in enumerate(
                    fill_batch(max_sent_count, max_sent_len, vs,
                               iter_wrapper(src_inp, trg_inp),
                               props['ngram_length'])):
                # shape is (max_sent_count, props['gru_width'])
                src, trg = trained_model.predict(mx)
                # loop over items in batch
                for j, (src_v, trg_v) in enumerate(zip(src, trg)):
                    if j >= len(src_data):  # empty padding of the batch
                        break
                    write_vector_to_file(src_outp, normalize_v(src_v),
                                         src_data[j])
                    write_vector_to_file(trg_outp, normalize_v(trg_v),
                                         trg_data[j])
                    counter += 1
                    if counter > 0 and counter % 100 == 0:
                        print('{}: vectorized {} sentence pairs'.format(
                            os.path.splitext(entry.name)[0], counter),
                              end='\r', file=sys.stderr, flush=True)
            print('{}: vectorized {} sentence pairs'.format(
                os.path.splitext(entry.name)[0], counter),
                  file=sys.stderr, flush=True)
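# normalize_v is not shown in this excerpt. Given that the alignment step
# later scores sentence pairs with plain dot products ("cosine between the
# source and target side"), it is presumably simple L2 normalization; a
# sketch of that assumption:
import numpy as np


def normalize_v(v):
    # scale the vector to unit length so dot products act as cosine similarity
    norm = np.linalg.norm(v)
    return v / norm if norm > 0 else v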