Example #1
def main(config_path='io_args.yml'):
    try:
        conf = load(config_path)
        do_export(conf)
    except Exception as ex:
        sys.stderr.write(repr(ex))
        return 1
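
Example #1 (and the variants in the next few examples) returns 1 on failure and falls through to None on success, which suggests the result is meant to be handed to sys.exit. A typical entry-point wrapper, sketched here as an assumption rather than taken from the project, would be:

import sys

if __name__ == '__main__':
    # Exit with code 1 on failure; a None return (success) maps to exit code 0.
    sys.exit(main())
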
Example #2
def main(config_path='io_args.yml'):
    try:
        conf = load(config_path)
        write_unique_file_pair(conf)
    except Exception as ex:
        sys.stderr.write(repr(ex))
        return 1
Example #3
def main(config_path='io_args.yml'):
    try:
        conf = load(config_path)
        write_parallel_index_files(conf)
    except Exception as ex:
        sys.stderr.write(repr(ex))
        return 1
Example #4
def main(config_path='io_args.yml'):
    try:
        conf = load(config_path)
        vocab_path_base = os.path.join(conf['vocabulary_directory'], conf['vocabulary_name'])

        for path in (conf['src_train'], conf['trg_train']):
            write_vocabulary(path, vocab_path_base)
    except Exception as ex:
        sys.stderr.write(repr(ex))
        return 1
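
The entry points above, and several of the examples below, pass a .yml path such as 'io_args.yml' or 'props.yml' to a shared load helper and treat the result as a dict. The helper itself is not shown on this page; a minimal sketch of what it presumably does, assuming the configs are read with PyYAML, is:

import yaml  # assumption: the project parses its .yml configs with PyYAML


def load(path):
    # Read a YAML config file and return its contents as a dict.
    # Hypothetical sketch; the project's real loader may differ.
    with open(path, encoding='utf-8') as f:
        return yaml.safe_load(f)
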
Example #5
def main(io_argument_path='io_args.yml'):
    args = load(io_argument_path)

    with open(args['model_name'] + '.json', encoding='utf-8') as json_file:
        model_json = json_file.read()

    model = model_from_json(model_json)
    plot_model(model,
               to_file='{}.png'.format(args['model_name']),
               show_shapes=True,
               rankdir='BT')
Example #6
def load(session, epochNumber=0, config=None):
    if config is None:
        config = cfg.load(session)

    # find results file for epoch number
    epochDir = get_epoch_dir(config, epochNumber)
    # epoch = get_epochs(config)[epochNumber]
    # epochDir = '/'.join((config.get('session','outputprefix'), epoch))
    h5files = glob.glob(epochDir + '/*.h5')
    if len(h5files) != 1:
        utils.error('Expected exactly one .h5 file in output '
                    'directory, found: %s' % str(h5files))
    return Session(h5files[0],
                   config.getint('audio', 'samprate'),
                   cache_dir=config.get('filesystem', 'tmp', '/tmp'))
Example #7
from yadfs.client.client import Client
import sys
import cfg

if __name__ == '__main__':
    cfg_path = sys.argv[1]
    opts = cfg.load(cfg_path)
    cl = Client(opts['ns_addr'])
    cl.create_file("/home/osboxes/yamr/cfg.py", "/test")
    data = cl.get_chunk("/test/cfg.py_0")

    cl.download_to("/test/cfg.py", "/home/osboxes/yamr/cfg_text.py")
    print(data)

    cl.save("hohohoho", "/test/my_file")
    cl.download_to("/test/my_file", "/home/osboxes/yamr/hoho.txt")
Example #8
def train(properties_path='props.yml', io_argument_path='io_args.yml'):
    props = load(properties_path)
    args = load(io_argument_path)

    # Read vocabularies
    src_f_name = args['src_train']
    trg_f_name = args['trg_train']
    vs = data_dense.read_vocabularies(args['model_name'] + "-vocab.pickle",
                                      src_f_name, trg_f_name, False,
                                      props['ngram_length'])
    vs.trainable = False

    # Inputs: list of one Input per N-gram size
    src_inp = Input(
        shape=(props['max_sent_length'], ),
        name="source_ngrams_{N}".format(N=props['ngram_length'][0]),
        dtype="int32")

    trg_inp = Input(
        shape=(props['max_sent_length'], ),
        name="target_ngrams_{N}".format(N=props['ngram_length'][0]),
        dtype="int32")

    # Embeddings: list of one Embedding per input
    src_emb = Embedding(len(vs.source_ngrams[props['ngram_length'][0]]),
                        props['feature_count'],
                        input_length=props['max_sent_length'],
                        name="source_embedding_{N}".format(
                            N=props['ngram_length'][0]))(src_inp)

    trg_emb = Embedding(len(vs.target_ngrams[props['ngram_length'][0]]),
                        props['feature_count'],
                        input_length=props['max_sent_length'],
                        name="target_embedding_{N}".format(
                            N=props['ngram_length'][0]))(trg_inp)

    # Conv
    src_conv_out = Conv1D(props['feature_count'], (5, ),
                          padding='same',
                          activation='relu')(src_emb)
    trg_conv_out = Conv1D(props['feature_count'], (5, ),
                          padding='same',
                          activation='relu')(trg_emb)

    src_maxpool_out = MaxPooling1D(
        pool_size=props['max_sent_length'])(src_conv_out)
    trg_maxpool_out = MaxPooling1D(
        pool_size=props['max_sent_length'])(trg_conv_out)

    src_flat_out = Flatten()(src_maxpool_out)
    trg_flat_out = Flatten()(trg_maxpool_out)

    # one more dense layer on each side
    src_dense_out = Dense(props['gru_width'],
                          name="source_dense")(src_flat_out)
    trg_dense_out = Dense(props['gru_width'],
                          name="target_dense")(trg_flat_out)

    # ...and cosine between the source and target side
    merged_out = dot([src_dense_out, trg_dense_out], axes=1, normalize=True)

    # classification
    s_out = Dense(1, activation='sigmoid',
                  name='classification_layer')(merged_out)

    model = Model(inputs=[src_inp, trg_inp], outputs=s_out)

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.summary()

    train_inf_iter = data_dense.InfiniteDataIterator(src_f_name, trg_f_name)
    train_batch_iter = data_dense.fill_batch(props['minibatch_size'],
                                             props['max_sent_length'], vs,
                                             train_inf_iter,
                                             props['ngram_length'])

    # dev iter
    dev_inf_iter = data_dense.InfiniteDataIterator(args['src_devel'],
                                                   args['trg_devel'])
    dev_batch_iter = data_dense.fill_batch(props['minibatch_size'],
                                           props['max_sent_length'], vs,
                                           dev_inf_iter, props['ngram_length'])

    # save model json
    model_json = model.to_json()

    with open('{}.json'.format(args['model_name']), "w") as json_file:
        json_file.write(model_json)

    # callback to save weights after each epoch
    save_cb = ModelCheckpoint(filepath=args['model_name'] + '.{epoch:02d}.h5',
                              monitor='val_loss',
                              verbose=1,
                              save_best_only=False,
                              mode='auto')

    early_stop = EarlyStopping(monitor='val_loss', patience=5)

    csv_logger = CSVLogger('./logs/train.log', append=True, separator=';')

    # tensor_board = TensorBoard(log_dir='./graph', write_graph=True, write_images=True)
    # callbacks = [save_cb, early_stop, csv_logger, tensor_board]

    callbacks = [
        save_cb, early_stop, csv_logger,
        TrainValTensorBoard(write_graph=False)
    ]

    # steps per epoch / validation steps = ceil(samples in train or devel dataset / batch size),
    # e.g. ceil(2700 / 200) = 14; multiplied by 10 here
    steps_per_epoch = ceil(
        len(train_inf_iter.data) / props['minibatch_size']) * 10
    val_steps = ceil(len(dev_inf_iter.data) / props['minibatch_size']) * 10

    model.fit_generator(train_batch_iter,
                        steps_per_epoch,
                        props['epochs'],
                        callbacks=callbacks,
                        validation_data=dev_batch_iter,
                        validation_steps=val_steps)
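
For orientation, Example #8 reads the following keys from its two config files. The key names are taken from the code above, but every value here is a placeholder, not a setting from the project; the dicts show what the two load calls are expected to return:

# Placeholder values; only the key names come from the code in Example #8.
props = {
    'ngram_length': [4],       # a list; the code indexes props['ngram_length'][0]
    'max_sent_length': 200,
    'feature_count': 256,
    'gru_width': 512,
    'minibatch_size': 200,
    'epochs': 10,
}
args = {
    'model_name': 'bitext_model',
    'src_train': 'train.src',
    'trg_train': 'train.trg',
    'src_devel': 'devel.src',
    'trg_devel': 'devel.trg',
}
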
Example #9
def main(properties_path='props.yml', io_argument_path='io_args.yml'):
    args = load(io_argument_path)
    props = load(properties_path)

    lang_pair = '{}-{}'.format(args['source_language'],
                               args['target_language'])
    aligned_filename_suffix = '.{}.aligned'.format(lang_pair)
    lang_pair_work_directory = os.path.join(args['work_directory'], lang_pair)

    dictionary_path_base = os.path.join(args['dictionary_directory'],
                                        args['dictionary_name'])
    vocabulary_path_base = os.path.join(args['vocabulary_directory'],
                                        args['vocabulary_name'])

    src_lang_vocabulary_path = '{}.{}'.format(vocabulary_path_base,
                                              args['source_language'])
    trg_lang_vocabulary_path = '{}.{}'.format(vocabulary_path_base,
                                              args['target_language'])

    src2trg_dictionary = build_dictionary(dictionary_path_base + ".f2e",
                                          src_lang_vocabulary_path)
    trg2src_dictionary = build_dictionary(dictionary_path_base + ".e2f",
                                          trg_lang_vocabulary_path)

    with open(src_lang_vocabulary_path, encoding="utf-8") as f:
        word2idx_src = {word.strip().lower(): i for i, word in enumerate(f)}
    with open(trg_lang_vocabulary_path, encoding="utf-8") as f:
        word2idx_trg = {word.strip().lower(): i for i, word in enumerate(f)}

    src2trg_matrix = build_translation_matrix(src2trg_dictionary, word2idx_src,
                                              word2idx_trg).tocsr()
    trg2src_matrix = build_translation_matrix(trg2src_dictionary, word2idx_trg,
                                              word2idx_src).tocsr()

    if not os.path.exists(lang_pair_work_directory):
        os.makedirs(lang_pair_work_directory)

    for entry in os.scandir(os.path.join(args['source_data_directory'],
                                         'snt')):
        if not (entry.is_file() and entry.name.endswith('_{}.snt'.format(
                args['source_language']))):
            continue

        pair_title = entry.name.rsplit('.', 1)[0].rsplit('_', 1)[0]
        trg_lang_filepath = os.path.join(
            os.path.dirname(entry.path),
            '{}_{}.snt'.format(pair_title, args['target_language']))

        if not os.path.exists(trg_lang_filepath):
            continue

        with open(os.path.join(lang_pair_work_directory, pair_title + aligned_filename_suffix),
                  'w', encoding='utf-8', newline='\n') as out_combined, \
                open(os.path.join(lang_pair_work_directory, '{}.{}.keras'.format(pair_title, lang_pair)),
                     'w', encoding='utf-8', newline='\n') as out_keras, \
                open(os.path.join(lang_pair_work_directory, '{}.{}.baseline'.format(pair_title, lang_pair)),
                     'w', encoding='utf-8', newline='\n') as out_baseline:
            src_sentences, src_vectors = load_data(entry.path,
                                                   props['feature_count'],
                                                   props['gru_width'])
            trg_sentences, trg_vectors = load_data(trg_lang_filepath,
                                                   props['feature_count'],
                                                   props['gru_width'])

            src_sparse = build_sparse_sklearn(src_sentences, word2idx_src)
            src_normalizer = np.array(
                [len(set(s.split())) for s in src_sentences], dtype=np.float32)

            sparse_dot_out = np.zeros((len(src_sentences), len(trg_sentences)),
                                      dtype=np.float32)
            sparse_dot_out2 = np.zeros(
                (len(src_sentences), len(trg_sentences)), dtype=np.float32)

            trg_sparse = build_sparse_sklearn(trg_sentences, word2idx_trg)
            trg_normalizer = np.array(
                [len(set(s.split())) for s in trg_sentences], dtype=np.float32)
            trg_normalizer = trg_normalizer.reshape(
                (1, len(trg_normalizer)))[:, len(trg_sentences) - 1]

            trg_translated_sparse = (trg_sparse * trg2src_matrix).tocsc()
            trg_translated_sparse.data = np.ones(
                len(trg_translated_sparse.data), dtype=np.float32)

            trg_sparse = trg_sparse.tocsc()

            sim_matrix = np.dot(src_vectors[:len(src_sentences), :],
                                trg_vectors[:len(trg_sentences), :].T)

            csr_csc_dot_f(0, len(src_sentences), src_sparse,
                          trg_translated_sparse, sparse_dot_out)

            np.divide(sparse_dot_out,
                      src_normalizer.reshape(
                          (len(src_normalizer), 1))[:len(src_sentences), :],
                      sparse_dot_out)  # normalize

            tmp = src_sparse[:len(src_sentences), :] * src2trg_matrix
            tmp.data = np.ones(len(tmp.data),
                               dtype=np.float32)  # force to binary

            csr_csc_dot_f(0, tmp.shape[0], tmp, trg_sparse, sparse_dot_out2)

            np.divide(sparse_dot_out2, trg_normalizer,
                      sparse_dot_out2)  # normalize

            # sum sparse_dot_out and sparse_dot_out2, write results to sparse_dot_out
            np.add(sparse_dot_out, sparse_dot_out2, sparse_dot_out)
            # sum all three, write results to sparse_dot_out2
            np.add(sim_matrix, sparse_dot_out, sparse_dot_out2)

            # now sim_matrix has dense similarities, sparse_dot_out has baseline similarities,
            #   and sparse_dot_out2 has combined similarities

            argmaxs_keras = np.argmax(sim_matrix, axis=1)
            argmaxs_baseline = np.argmax(sparse_dot_out, axis=1)
            argmaxs_combined = np.argmax(sparse_dot_out2, axis=1)

            # Print results
            # all three argmax arrays have the same length
            for j in range(argmaxs_keras.shape[0]):
                # keras
                print(sim_matrix[j, argmaxs_keras[j]],
                      src_sentences[j],
                      trg_sentences[argmaxs_keras[j]],
                      sep="\t",
                      file=out_keras,
                      flush=True)
                # baseline
                print(sparse_dot_out[j, argmaxs_baseline[j]] / 2.0,
                      src_sentences[j],
                      trg_sentences[argmaxs_baseline[j]],
                      sep="\t",
                      file=out_baseline,
                      flush=True)
                # combined
                print(sparse_dot_out2[j, argmaxs_combined[j]] / 3.0,
                      src_sentences[j],
                      trg_sentences[argmaxs_combined[j]],
                      sep="\t",
                      file=out_combined,
                      flush=True)
Example #10
from socket import *
from uuid import UUID
from mqtt import MQTT
from decrypt import decrypt
import cfg

cfg = cfg.load()
mqtt = MQTT()

s = socket(AF_INET, SOCK_DGRAM)
s.setsockopt(SOL_SOCKET, SO_BROADCAST, 1)
s.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1)
s.bind(('', 17117))

while True:
    data = s.recv(1024)
    data = decrypt(data)
    # if no valid data was received
    if data is None:
        continue

    # print(data.hex(' '))

    # check first two bytes of mac, they must be 0x17 and 0x00
    mac = data[0:6]
    if mac[0] != 0x17 or mac[1] != 0x00:
        continue

    # battery state (percent)
    if data[10] > 0:
        mqtt.publish('sensor/0x%s/battery/state' % (mac.hex()), data[10])
Example #11
def get_n_epochs(session):
    config = cfg.load(session)
    return len(get_epochs(config))
Example #12
def vectorize(properties_path='props.yml', io_argument_path='io_args.yml'):
    # read and create files
    props = load(properties_path)
    args = load(io_argument_path)

    output_filename_extension = '.npy'

    print('Vectorizing all sentences', file=sys.stderr)

    # read vocabularies
    vs = data_dense.read_vocabularies(
        '{}-vocab.pickle'.format(args['model_name']), "xxx", "xxx", False,
        props['ngram_length'])
    vs.trainable = False

    # load model
    trained_model = load_model('{}.{}'.format(args['model_name'],
                                              args['epoch_number']))
    output_size = trained_model.get_layer('source_dense').output_shape[1]
    max_sent_len = trained_model.get_layer(
        'source_ngrams_{n}'.format(n=props['ngram_length'][0])).output_shape[1]
    print(output_size, max_sent_len, file=sys.stderr)

    # build matrices
    for entry in os.scandir(args['preprocessed_source_data_directory']):
        if not (entry.is_file() and entry.name.endswith('_{}.snt'.format(
                args['source_language']))):
            continue

        src_in_path = entry.path
        trg_in_path = entry.path.rsplit('_', 1)[0] + '_{}.snt'.format(
            args['target_language'])
        # trg_in_path = '{}.{}'.format(os.path.splitext(entry.path)[0], args['target_language'])

        if not os.path.exists(trg_in_path):
            continue

        src_out_path = entry.path + output_filename_extension
        trg_out_path = trg_in_path + output_filename_extension

        max_sent_count = max(get_sentence_count(src_in_path),
                             get_sentence_count(trg_in_path))

        with open(src_in_path, encoding='utf-8') as src_inp, \
                open(trg_in_path, encoding='utf-8') as trg_inp, \
                open(src_out_path, 'wb') as src_outp, \
                open(trg_out_path, 'wb') as trg_outp:
            # get vectors
            counter = 0

            for i, (mx, targets, src_data, trg_data) in enumerate(
                    fill_batch(max_sent_count, max_sent_len, vs,
                               iter_wrapper(src_inp, trg_inp),
                               props['ngram_length'])):
                src, trg = trained_model.predict(
                    mx)  # shape is (max_sent_count, props['gru_width'])
                # loop over items in batch
                for j, (src_v, trg_v) in enumerate(zip(src, trg)):
                    if j >= len(src_data):  # empty padding of the batch
                        break

                    write_vector_to_file(src_outp, normalize_v(src_v),
                                         src_data[j])
                    write_vector_to_file(trg_outp, normalize_v(trg_v),
                                         trg_data[j])

                    counter += 1

                    if counter > 0 and counter % 100 == 0:
                        print('{}: vectorized {} sentence pairs'.format(
                            os.path.splitext(entry.name)[0], counter),
                              end='\r',
                              file=sys.stderr,
                              flush=True)

            print('{}: vectorized {} sentence pairs'.format(
                os.path.splitext(entry.name)[0], counter),
                  file=sys.stderr,
                  flush=True)