Example #1
def encode_labels(data, labenc=None, max_len=50, pad=True):
    # Encode each sequence of string labels as one-hot vectors, falling back
    # to the module-level labelencoder when no encoder is passed in.
    if labenc is None:
        labenc = labelencoder
    if labenc is None:
        print("Error: labelencoder must be trained before it can be used!")
        return None
    data2 = []

    num_labels = len(labenc.classes_)
    zero_vec = data_util.zero_vec(num_labels)
    if debug: print("data: ", str(len(data)))
    for item in data:
        #print "item len: " + str(len(item))
        new_item = []
        if len(item) > 0:
            item2 = labenc.transform(item)
            for lab in item2:
                # Build a one-hot vector for this label index
                onehot = [0] * num_labels
                onehot[lab] = 1
                new_item.append(onehot)
        # Pad vectors
        if pad:
            if len(new_item) > max_len:
                new_item = new_item[0:max_len]
            while len(new_item) < max_len:
                new_item.append(zero_vec)
        data2.append(new_item)
    return data2
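
A minimal usage sketch for encode_labels, run inside this module (so data_util and debug are defined); the LabelEncoder and tag set here are illustrative:

from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
enc.fit(['B', 'I', 'O'])
encoded = encode_labels([['B', 'I', 'O'], ['O']], labenc=enc, max_len=4)
# encoded[0] holds four one-hot vectors of length 3; the last is zero padding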
Example #2
def decode_sequence(encoder_model, decoder_model, input_seq, output_seq_len, output_dim, vec_labels=False):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq, batch_size=1)

    # Generate an empty target sequence of length 1.
    target_seq = numpy.zeros((1, 1, int(output_dim)))
    # Populate the first character of target sequence with the start character.
    zero_lab = data_util.zero_vec(output_dim)
    if vec_labels:
        target_seq[0, 0] = zero_lab
    else:
        zero_lab = encode_labels([['O']])[0][0]
        index = zero_lab.index(1)
        target_seq[0, 0, index] = 1

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = []
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        # Sample the token distribution from the last (only) timestep
        token = output_tokens[0, -1]
        encoded_label = numpy.zeros((output_dim,), dtype=int).tolist()
        if vec_labels:
            # Labels are dense vectors: keep the raw predicted vector
            decoded_sentence.append(token.tolist())
        else:
            # Convert the softmax output to one-hot, then back to a string label
            ind = numpy.argmax(token)
            encoded_label[ind] = 1
            sampled_lab = decode_labels([encoded_label])[0]
            decoded_sentence.append(sampled_lab)

        # Exit condition: stop once the maximum output length is reached.
        if len(decoded_sentence) > output_seq_len:
            stop_condition = True

        # Update the target sequence (of length 1) with the predicted token.
        target_seq = numpy.zeros((1, 1, output_dim))
        target_seq[0, 0, :] = token

        # Update states
        states_value = [h, c]

    return decoded_sentence
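
A hedged sketch of calling decode_sequence with the inference models returned by train_seq2seq in Example #4; the shapes and names (x, out_dim, the 50 and 200 dimensions) are illustrative:

x = numpy.zeros((1, 50, 200))    # one padded input sequence (batch, seq_len, input_dim)
labels = decode_sequence(encoder_model, decoder_model, x,
                         output_seq_len=50, output_dim=out_dim)
# labels is a list of string labels (or raw vectors when vec_labels=True)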
Example #3
def get_feats(seqs, pad=True, train=False):
    # Map each (word, label) sequence to word vectors and collect the label set.
    print("get_feats")
    vec_model, dim = word2vec.load(vecfile)
    zero_vec = data_util.zero_vec(dim)
    feats = []
    labels = []
    global label_set
    label_set = set([])
    for s in seqs:
        s_feats = []
        s_labels = []
        for pair in s:
            word = pair[0]
            vector = word2vec.get(word, vec_model)
            s_feats.append(vector)
            s_labels.append(pair[1])
            label_set.add(pair[1])
        feats.append(s_feats)
        labels.append(s_labels)
    if train:
        num_labels = len(list(label_set))
        if debug: print("num original labels:", str(num_labels))
        label_list = list(label_set)
        print('label_list:', str(label_list))
        label_list.append('IE') # Add the IE label in case it wasn't in training data
        print('label_list:', str(label_list))
        create_labelencoder(label_list, num_labels)
        global max_seq_len
        #max_seq_len = max([len(txt) for txt in feats])
    print("max_seq_len: ", str(max_seq_len))

    # Pad sequences, splitting any sequence longer than max_seq_len into chunks

    if pad:
        padded_feats = []
        padded_labels = []
        for feat in feats:
            #print "seq len: " + str(len(feat))
            while len(feat) > max_seq_len:
                feat_part = feat[0:max_seq_len]
                padded_feats.append(pad_feat(feat_part, max_seq_len, zero_vec))
                feat = feat[max_seq_len:]
            new_feat = pad_feat(feat, max_seq_len, zero_vec)
            padded_feats.append(new_feat)
        for labs in labels:
            while len(labs) > max_seq_len:
                labs_part = labs[0:max_seq_len]
                padded_labels.append(pad_feat(labs_part, max_seq_len, 'O'))
                labs = labs[max_seq_len:]
                #print("labs:", str(labs))
            padded_labels.append(pad_feat(labs, max_seq_len, 'O'))
        feats = padded_feats
        labels = padded_labels

    # Encode labels
    print("labels[0]: ", str(labels[0]))
    encoded_labels = encode_labels(labels, max_len=max_seq_len, pad=pad)
    print("encoded_labels[0]: ", str(encoded_labels[0]))
    print("feats: ", str(len(feats)), " labels: ", str(len(encoded_labels)))
    return feats, encoded_labels
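
For orientation, a hedged sketch of the expected input, inferred from the loop body (each sequence is a list of (word, label) pairs); the module-level vecfile and max_seq_len are assumed to be set:

seqs = [[('fever', 'B'), ('and', 'O'), ('vomiting', 'B')]]   # hypothetical input
feats, encoded_labels = get_feats(seqs, pad=True, train=True)
# feats[0]: word vectors padded to max_seq_len with zero vectors
# encoded_labels[0]: the matching one-hot label vectors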
Example #4
def train_seq2seq(trainx, trainy, num_nodes=100, vec_labels=False, loss_function="cosine_proximity", num_epochs=10):
    trainx = numpy.array(trainx)
    print("trainx shape: ", str(trainx.shape))
    trainy = numpy.array(trainy)
    print("trainy shape: ", str(trainy.shape))
    input_dim = trainx.shape[-1]
    output_dim = trainy.shape[-1]
    input_seq_len = trainx.shape[1]
    output_seq_len = trainy.shape[1]

    # Create decoder target data
    trainy_target = []
    zero_lab = data_util.zero_vec(output_dim)
    if not vec_labels:
        zero_lab = encode_labels([['O']])[0][0]
    print("zero_lab shape: ", str(numpy.asarray(zero_lab)))
    for i in range(trainy.shape[0]):
        row = trainy[i].tolist()
        new_row = row[1:]
        new_row.append(zero_lab)
        trainy_target.append(new_row)
    trainy_target = numpy.asarray(trainy_target)

    print("trainy_target shape: ", str(trainy_target.shape))

    # Set up the encoder
    latent_dim = num_nodes
    dropout = 0.1
    encoder_inputs = Input(shape=(None, input_dim))  # variable-length input sequences
    encoder = LSTM(latent_dim, return_state=True)

    # Encoder-Decoder model
    encoder_outputs, state_h, state_c = encoder(encoder_inputs)
    encoder_states = [state_h, state_c]

    # Set up the decoder, using `encoder_states` as initial state.
    decoder_inputs = Input(shape=(None, output_dim))
    decoder_rnn = LSTM(latent_dim, return_sequences=True, return_state=True)
    decoder_outputs, d_state_h, d_state_c = decoder_rnn(decoder_inputs, initial_state=encoder_states)
    decoder_dense = Dense(output_dim, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    # Define the model that will turn
    # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.compile(optimizer='rmsprop', loss=loss_function)
    model.fit([trainx, trainy], trainy_target, epochs=num_epochs)

    # Normal RNN
    #rnn_out = GRU(latent_dim, return_sequences=False)(encoder_inputs)
    #dropout_out = Dropout(dropout)(rnn_out)
    #prediction = Dense(output_dim, activation='softmax')(dropout_out)
    #model = Model(inputs=encoder_inputs, outputs=prediction)
    #model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    #model.fit(trainx, trainy, nb_epoch=20)

    model.summary()
    model.save('seq2seq.model')

    # Create models for inference
    encoder_model = Model(encoder_inputs, encoder_states)

    decoder_state_input_h = Input(shape=(latent_dim,))
    decoder_state_input_c = Input(shape=(latent_dim,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    decoder_outputs, state_h, state_c = decoder_rnn(decoder_inputs, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

    return model, encoder_model, decoder_model, output_dim
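
A sketch of a train-then-decode round trip with the three models returned above; feats and encoded_labels are assumed to come from get_feats in Example #3:

model, encoder_model, decoder_model, out_dim = train_seq2seq(
    feats, encoded_labels, num_nodes=128, num_epochs=5)
x = numpy.asarray(feats[0:1])    # one padded input sequence, batch size 1
predicted = decode_sequence(encoder_model, decoder_model, x,
                            output_seq_len=len(encoded_labels[0]),
                            output_dim=out_dim)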
Example #5
def get_vec(word, model):
    dim = model.vector_size
    if word in model:  # vocabulary membership (gensim KeyedVectors-style)
        return model[word].tolist()
    else:
        return data_util3.zero_vec(dim)
Example #6
def get(word, model):
    dim = model.vector_size
    if word in model:  # vocabulary membership (gensim KeyedVectors-style)
        return list(model[word])
    else:
        return data_util.zero_vec(dim)
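
A small usage sketch for get, assuming a gensim KeyedVectors model (whose vector_size attribute and membership test match the calls above); the vector file path is illustrative:

from gensim.models import KeyedVectors
kv = KeyedVectors.load_word2vec_format('vectors.bin', binary=True)
vec = get('fever', kv)    # list of length kv.vector_size; zeros if out of vocabulary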