Example #1
def main(model_name, embedding_path):
    num_targets = 20

    training_data = get_training_data()
    vocabulary = Vocabulary()
    texts = [sample.text for sample in training_data]
    text_token_generator = WordTokenizer(texts=texts)
    tokens = list(text_token_generator.get_tokens())

    max_doc_length = text_token_generator.get_max_length()

    vocabulary.fit(tokens=tokens)
    max_num_tokens = len(vocabulary)
    token_index_map = vocabulary.dictionary

    embedding_index = get_embedding_index(embedding_path)
    embedding_dimensions = len(list(embedding_index.values())[0])
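    # The embedding dimensionality is inferred from the first pretrained vector;
    # the matrix built below presumably maps each vocabulary index to its
    # pretrained embedding (rows for tokens missing from the index are assumed to stay zero).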

    embedding_matrix = get_embedding_matrix(
        embedding_dimensions=embedding_dimensions,
        embedding_index=embedding_index,
        token_index_mapping=token_index_map)

    model = get_model(max_document_length=max_doc_length,
                      max_num_tokens=max_num_tokens,
                      embedding_weights=embedding_matrix,
                      embedding_dims=embedding_dimensions,
                      num_targets=num_targets)
    model.compile(optimizer=Adam(), loss="binary_crossentropy")
    plot_model(model, to_file=model_name + '.png')
Example #2
def main(training_data_file_path, model_name, model_file_path):
    MAX_DOCUMENT_LENGTH = 10
    training_data = get_training_data(training_data_file_path)
    training_data = preprocess_text(training_data)
    sequences = get_sequences(training_data)
    print(len(sequences), sequences[0])

    tokens = sorted(list(set(training_data)))
    vocabulary = Vocabulary()
    vocabulary.fit(tokens)

    transformer = CharTextTransformer(vocabulary)
    encoded_sequences = list(map(transformer.transform, sequences))
    encoded_sequences = np.array(encoded_sequences)
    X, y = encoded_sequences[:, :-1], encoded_sequences[:, -1:]
    print(X.shape, y.shape)
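    # One-hot encode every input timestep and the single next-character target
    # against the fitted character vocabulary.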
    sequences = [to_categorical(x, num_classes=len(vocabulary)) for x in X]
    X = np.array(sequences)
    y = to_categorical(y, num_classes=len(vocabulary))

    experiment = CharLevelLanguageModelExperiment()
    model = experiment.get_model(max_document_length=MAX_DOCUMENT_LENGTH, vocabulary_size=len(vocabulary))
    model.compile(optimizer=Adam(), loss="categorical_crossentropy",
                  metrics=["accuracy"])
    plot_model(model, show_shapes=True, to_file=model_name + '.png')

    callbacks = [
        GradientDebugger(),
        TensorBoard(log_dir='/tmp/char_level_language_model'),
        ReduceLROnPlateau(factor=0.1, verbose=1),

    ]
    model.fit(x=X, y=y, epochs=100, verbose=1, callbacks=callbacks, shuffle=True)
    predicted_text = predict_text(model, transformer=transformer, max_sequence_length=10, num_chars=20, seed_text="sing a so")
    print(predicted_text)
Example #3
def get_vocabulary_tokenizer(samples):
    texts = [sample.text for sample in samples]
    vocabulary = Vocabulary()
    tokenizer = WordTokenizer(texts=texts, tokenizer=TweetTokenizer())
    tokenized_samples = [tokenizer.tokenize(sample.text) for sample in samples]
    vocabulary.fit((token for tokens in tokenized_samples for token in tokens))
    print(tokenized_samples[0:1])
    return vocabulary, tokenizer
Example #4
def main(model_name, model_file_path, embedding_path, training_data_path, batch_size=32, epochs=20):
    samples = get_training_data(training_data_path)
    labels = [sample.label for sample in samples]
    num_targets = len(NAME_INDEX_CLAS_MAP)

    texts = [sample.text for sample in samples]
    vocabulary = Vocabulary()
    tokenizer = WordTokenizer(texts=texts)

    tokenized_samples = [tokenizer.tokenize(sample.text) for sample in samples]
    vocabulary.fit((token for tokens in tokenized_samples for token in tokens))
    print(tokenized_samples[0:1])

    token_index_map = vocabulary.dictionary

    num_tokens = len(vocabulary)
    max_document_length = tokenizer.get_max_length(tokenized_samples)
    transformer = TextTransformer(class_map=NAME_INDEX_CLAS_MAP, max_sequence_length=max_document_length,
                                  vocabulary=vocabulary, tokenizer=tokenizer)

    sample_batch_provider = SampleBatchProvider(batch_size=len(samples), num_labels=num_targets,
                                                max_document_length=max_document_length, max_token_length=num_tokens)

    X = [transformer.transform(sample.text) for sample in samples]
    X, y = sample_batch_provider.get_batch(X, labels)

    embedding_index = get_embedding_index(embedding_path)
    embedding_dimensions = len(list(embedding_index.values())[0])

    embedding_matrix = get_embedding_matrix(
        embedding_dimensions=embedding_dimensions,
        embedding_index=embedding_index,
        token_index_mapping=token_index_map
    )

    model = get_model(max_document_length=max_document_length, max_num_tokens=num_tokens,
                      embedding_weights=embedding_matrix, embedding_dims=embedding_dimensions, num_targets=num_targets)
    model.compile(optimizer=Adam(), loss="binary_crossentropy", metrics=["accuracy"])
    plot_model(model, to_file=model_name + '.png')

    callbacks = [
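        # Checkpoint only the best validation accuracy, stop after 5 epochs without
        # improvement, and cut the learning rate by 10x when validation loss plateaus.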
        GradientDebugger(),
        TensorBoard(log_dir='/tmp/reuters_embedding'),
        ModelCheckpoint(model_file_path, monitor='val_acc', verbose=1, save_best_only=True, mode='max'),
        EarlyStopping(monitor='val_acc', patience=5, mode='max'),
        ReduceLROnPlateau(factor=0.1, verbose=1),

    ]
    model.fit(x=X, y=y, batch_size=None, epochs=epochs, verbose=1, callbacks=callbacks, validation_split=0.2,
              shuffle=True, steps_per_epoch=100, validation_steps=100)
Example #5
def main():
    MAX_SEQUENCE_LENGTH = 1014  # from paper
    NUM_CLASSES = 46  # 46 reuters topics
    input_vocabulary_size = 1000

    (x_train, y_train), (x_test, y_test) = reuters.load_data(path="reuters.npz",
                                                             num_words=None,
                                                             skip_top=0,
                                                             maxlen=None,
                                                             test_split=0.2,
                                                             seed=113,
                                                             start_char=1,
                                                             oov_char=2,
                                                             index_from=3)
    model = get_model(
        filter_sizes=(256,),
        kernel_sizes=(7,),
        pool_sizes=(3,),
        hidden_sizes=(1056,),
        input_vocabulary_size=input_vocabulary_size,
        num_classes=NUM_CLASSES,
        max_sequence_length=MAX_SEQUENCE_LENGTH,
        metrics=['accuracy']
    )
    model.summary()
    class_map = get_class_map(np.concatenate([y_train, y_test]))
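    # get_class_map presumably assigns a contiguous index to each of the 46 topic
    # labels, computed over train and test together so no class is missed.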
    transformer = Transformer(max_sequence_length=MAX_SEQUENCE_LENGTH, vocabulary_size=input_vocabulary_size,
                              class_map=class_map)
    training_generator = DataGenerator(X=x_train, y=y_train, transformer=transformer)
    X, y = training_generator.get_training_data()
    model.fit(x=X, y=y, epochs=1, verbose=1, callbacks=[TensorBoard(log_dir='/tmp/deep_conv')])
Example #6
def main(audio_dir, labels_file_path):
    data_generator = DataGenerator(audio_dir, labels_file_path)
    texts = [datum['y'] for datum in data_generator.generate()]
    token_generator = CharacterTokenizer(texts)
    tokens = token_generator.get_tokens()
    vocabulary = Vocabulary()
    vocabulary.fit(tokens)
    transformer = Transformer(max_frequency=129,
                              max_time=400,
                              output_vocabulary=vocabulary)
    num_training_datums = 10

    X = np.zeros(shape=(num_training_datums, transformer.max_time,
                        transformer.max_frequency))
    y = np.zeros(shape=(num_training_datums, transformer.max_time))
    labels = []
    input_lengths = []
    label_lengths = []

    for i, datum in enumerate(
            list(data_generator.generate())[0:num_training_datums]):
        xi = transformer.transform_x(datum['x'])
        yi = transformer.transform_y(datum['y'])
        input_lengths.append(xi.shape[0])
        label_lengths.append(yi.shape[0])
        labels.append(yi)
        X[i, :transformer.max_time, :transformer.max_frequency] = \
            xi[:transformer.max_time, :transformer.max_frequency]
        #y[i, :transformer.max_time, :len(vocabulary)] = yi[:transformer.max_time, :len(vocabulary)]
        #y[i, :transformer.max_time] = yi[:transformer.max_time]

    model = get_model(max_audio_length=transformer.max_time,
                      max_frequency=transformer.max_frequency,
                      output_vocab_size=len(vocabulary))
    model.summary()
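    # Keras CTC convention: the labels and the input/label lengths are fed as model
    # inputs, and the 'ctc' output is trained against a dummy all-zeros target
    # because the CTC loss is computed inside the model itself.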
    inputs = {
        'input': X,
        'the_labels': np.array(labels),
        'input_length': np.array(input_lengths),
        'label_length': np.array(label_lengths),
    }

    outputs = {'ctc': np.zeros(shape=[num_training_datums])}

    model.fit(x=inputs, y=outputs)
Example #7
def main(training_data_path, path_prefix, split=0.9):
    document = load_text(training_data_path)
    pairs = get_pairs(document)
    shuffle(pairs)
    # pairs = pairs[:100]
    print("before cleaning", pairs[0:10])
    pairs = preprocess_pairs(pairs)
    print("after cleaning", pairs[0:10])
    pairs = np.array(pairs)
    max_source_length = get_max_length(pairs[:, 0])
    max_target_length = get_max_length(pairs[:, 1])

    source_pairs = [pair[0] for pair in pairs]
    target_pairs = [pair[1] for pair in pairs]
    source_tokens = get_tokens(source_pairs)
    target_tokens = get_tokens(target_pairs)
    source_vocabulary = Vocabulary()
    source_vocabulary.fit(source_tokens)
    target_vocabulary = Vocabulary()
    target_vocabulary.fit(target_tokens)
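    # Persist both vocabularies so the same token-to-index mappings can be reloaded
    # later for training and inference.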

    joblib.dump(source_vocabulary,
                os.path.join(path_prefix, "source_vocabulary.pkl"))
    joblib.dump(target_vocabulary,
                os.path.join(path_prefix, "target_vocabulary.pkl"))

    split = int(len(pairs) * split)
    training_pairs = pairs[:split]
    validation_pairs = pairs[split:]

    training_data_sink = JSONFileSink(
        os.path.join(path_prefix, "training.json"))
    validation_data_sink = JSONFileSink(
        os.path.join(path_prefix, "validation.json"))

    for pair in training_pairs:
        training_data_sink.receive(pair)

    for pair in validation_pairs:
        validation_data_sink.receive(pair)

    print("max source length ", max_source_length)
    print("max target length ", max_target_length)
def predict_signal(model, transformer, text):
    tokenizer = TweetTokenizer()
    text = preprocess_text(text)
    tokens = tokenizer.tokenize(text)

    token_chunks = divide_chunks(tokens, 2)
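    # divide_chunks presumably yields consecutive two-token windows; each window is
    # joined back into a short source text and decoded independently below.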
    for chunks in token_chunks:
        text = " ".join(chunks)
        xi = transformer.transform_xi(text)
        X = np.array([xi])
        prediction = model.predict(X, verbose=0)[0]
        integers = [np.argmax(vector) for vector in prediction]
        target = []

        for ix in integers:
            word = transformer.target_vocabulary.decode([ix])[0]
            if word is None or ix == 0:
                break

            target.append(word)

        out = " ".join(target)
        print("source text %s predicted %s" % (text, out))
Example #9
def main(model_name, model_file_path, num_epochs=100, batch_size=64, num_samples=1000000):
    data_path = '/Users/jnewman/Projects/learning/ai_blog/ml/data/translation/spa-eng/spa.txt'
    experiment = NeuralTranslationCharacterExperiment()
    lines = load_data(data_path)
    source_texts, target_texts, source_characters, target_characters = prepare_data(lines, num_samples)

    source_characters = sorted(list(source_characters))
    target_characters = sorted(list(target_characters))

    source_vocabulary = Vocabulary()
    source_vocabulary.fit(tokens=source_characters)
    target_vocabulary = Vocabulary()
    target_vocabulary.fit(tokens=target_characters)

    source_vocabulary_size = len(source_vocabulary)
    target_vocabulary_size = len(target_vocabulary)
    max_source_seq_length = max([len(txt) for txt in source_texts])
    max_target_seq_length = max([len(txt) for txt in target_texts])

    print('Number of samples:', len(source_texts))
    print('Number of unique input tokens:', source_vocabulary_size)
    print('Number of unique output tokens:', target_vocabulary_size)
    print('Max sequence length for inputs:', max_source_seq_length)
    print('Max sequence length for outputs:', max_target_seq_length)
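    # Allocate one-hot tensors for the encoder input, decoder input and decoder
    # target; the target is the decoder input shifted left by one timestep.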

    encoder_input_data = np.zeros(
        (len(source_texts), max_source_seq_length, source_vocabulary_size),
        dtype='float32')
    decoder_input_data = np.zeros(
        (len(source_texts), max_target_seq_length, target_vocabulary_size),
        dtype='float32')
    decoder_target_data = np.zeros(
        (len(source_texts), max_target_seq_length, target_vocabulary_size),
        dtype='float32')

    for i, (source_text, target_text) in enumerate(zip(source_texts, target_texts)):
        for t, char in enumerate(source_text):
            source_vocab_ix = source_vocabulary.dictionary.get(char)
            encoder_input_data[i, t, source_vocab_ix] = 1.
        for t, char in enumerate(target_text):
            # decoder_target_data is ahead of decoder_input_data by one timestep
            target_vocab_ix = target_vocabulary.dictionary.get(char)
            decoder_input_data[i, t, target_vocab_ix] = 1.
            if t > 0:
                # decoder_target_data will be ahead by one timestep
                # and will not include the start character.
                decoder_target_data[i, t - 1, target_vocab_ix] = 1.

    models = experiment.get_lstm_model(source_vocabulary_length=source_vocabulary_size,
                                       target_vocabulary_length=target_vocabulary_size)
    training_model = models["training_model"]
    decoder_model = models["decoder"]
    encoder_model = models["encoder"]
    # Run training
    training_model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

    plot_model(training_model, to_file='char_translator.png', show_shapes=True)

    callbacks = [
        GradientDebugger(),
        TensorBoard(log_dir='/tmp/char_translator'),
        ReduceLROnPlateau(factor=0.01, verbose=1),
        ModelCheckpoint(model_name + '.h5', monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    ]

    training_model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
                       batch_size=batch_size,
                       epochs=num_epochs,
                       callbacks=callbacks,
                       validation_split=0.2,
                       shuffle=True)
    # Save model
    training_model.save('s2s.h5')

    for seq_index in range(100):
        # Take one sequence (part of the training set)
        # for trying out decoding.
        input_seq = encoder_input_data[seq_index: seq_index + 1]
        decoded_sentence = decode_sequence(
            input_seq=input_seq,
            decoder_model=decoder_model,
            max_decoder_seq_length=max_target_seq_length,
            target_vocabulary=target_vocabulary,
            num_decoder_tokens=target_vocabulary_size,
            encoder_model=encoder_model
        )
        print('-')
        print('Input sentence:', source_texts[seq_index])
        print('Decoded sentence:', decoded_sentence)
Example #10
    args = parser.parse_args()

    uid = uuid4().hex
    os.makedirs(uid)
    print('training run: {}'.format(uid))

    samples = load_samples(args.samples_path)
    samples = samples[0:10]
    train_samples, val_samples = train_val_split(samples)
    train_provider = TripletProvider(train_samples, shuffle=True)
    val_provider = TripletProvider(val_samples, shuffle=True)

    tokenizer = TweetTokenizer()
    tokenized_samples = [tokenizer.tokenize(sample.text) for sample in train_samples]

    vocabulary = Vocabulary()
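    # The nested generator flattens tokenized samples down to individual characters,
    # so the vocabulary is fitted at the character level.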
    vocabulary.fit((c for tokens in tokenized_samples for token in tokens for c in token))
    vocab_path = os.path.join(uid, 'vocab_{}.pkl'.format(uid))
    joblib.dump(vocabulary, vocab_path)

    transformer = HierarchicalTripletTransformer(vocabulary)

    max_document_length, max_token_length = get_max_length(tokenized_samples)

    train_generator = TripletBatchGenerator(train_provider, transformer, max_document_length, max_token_length,
                                            len(vocabulary), args.batch_size)

    val_generator = TripletBatchGenerator(val_provider, transformer, max_document_length, max_token_length,
                                          len(vocabulary), args.batch_size)

    encoder = get_model(