def create_embedding_model():
    text_inputs = Input(shape=(MAX_SEQ_LENGTH,), name="text_input")
    embedding_layer = Embedding(MAX_NB_WORDS + 1,
                                EMBEDDING_DIM,
                                embeddings_initializer=Constant(embedding_matrix),
                                input_length=MAX_SEQ_LENGTH,
                                trainable=False)
    x = embedding_layer(text_inputs)

    # convolution 1st layer
    x = Conv1D(128, 5, activation='relu')(x)
    x = BatchNormalization()(x)
    x = MaxPooling1D(5)(x)

    # convolution 2nd layer
    x = Conv1D(128, 5, activation='relu')(x)
    x = BatchNormalization()(x)
    x = MaxPooling1D(35)(x)
    x = Flatten()(x)

    embedding_input = Input(shape=(EMBEDDING_SIZE,), name="embedding_input")
    all_features = concatenate([x, embedding_input])

    # fully connected classifier on top of the concatenated features
    x = Dense(units=1000, activation='relu')(all_features)
    x = BatchNormalization()(x)
    x = Dropout(0.1)(x)
    x = Dense(units=1000, activation='relu')(x)
    outputs = Dense(units=len(labelEncoder.classes_), activation='softmax')(x)

    model = Model([text_inputs, embedding_input], outputs)
    model.compile(loss='categorical_crossentropy',
                  optimizer=keras.optimizers.Adam(learning_rate=0.001),
                  metrics=['accuracy'])
    return model
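A minimal usage sketch (not part of the original snippet): the model takes the padded token sequences and a precomputed document-embedding vector as a two-element input list, in the same order as the Input layers. X_text, X_doc_embedding and the one-hot label array y are assumed to be prepared elsewhere.

# Usage sketch with assumed, illustrative arrays:
#   X_text          -> int token ids, shape (n_samples, MAX_SEQ_LENGTH)
#   X_doc_embedding -> dense vectors, shape (n_samples, EMBEDDING_SIZE)
#   y               -> one-hot labels, shape (n_samples, len(labelEncoder.classes_))
model = create_embedding_model()
model.fit([X_text, X_doc_embedding], y,
          batch_size=64,
          epochs=10,
          validation_split=0.1)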
def create_model():
    embedding_layer = Embedding(
        MAX_NB_WORDS + 1,
        EMBEDDING_DIM,
        embeddings_initializer=Constant(embedding_matrix),
        input_length=MAX_SEQ_LENGTH,
        trainable=False)
    model = Sequential()
    model.add(embedding_layer)
    # convolution 1st layer
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(5))

    # convolution 2nd layer
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(35))

    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(len(labelEncoder.classes_), activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer=keras.optimizers.Adam(learning_rate=0.001),
                  metrics=['accuracy'])
    return model
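The Sequential variant is the single-input counterpart of create_embedding_model above and is trained on the padded token sequences alone. A minimal sketch under the same assumptions about the hypothetical X_text and y arrays:

model = create_model()
model.fit(X_text, y, batch_size=64, epochs=10, validation_split=0.1)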
Example #3
def build(self, input_shape):
    self.shape = input_shape
    # initialise the combination matrix close to the identity with small Gaussian noise
    init_mat = np.eye(self.njoints) + np.random.normal(0.0, 1e-2, (self.njoints, self.njoints))
    self.comb_matrix = self.add_weight(name='comb_matrix',
                                       shape=(self.njoints, self.njoints),
                                       initializer=Constant(init_mat),
                                       trainable=True)
    super(CombMatrix, self).build(input_shape)
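For context, a sketch of the custom layer this build() belongs to. The class name CombMatrix, the njoints attribute and the build() body come from the snippet above; the constructor and call() are assumptions, written as if the layer recombines the joint axis of a (batch, njoints, features) tensor with the learned matrix.

import numpy as np
import tensorflow as tf
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Layer


class CombMatrix(Layer):
    """Learns an njoints x njoints mixing matrix over the joint axis (sketch)."""

    def __init__(self, njoints, **kwargs):
        self.njoints = njoints
        super(CombMatrix, self).__init__(**kwargs)

    def build(self, input_shape):
        self.shape = input_shape
        # initialise close to the identity with small Gaussian noise
        init_mat = np.eye(self.njoints) + np.random.normal(0.0, 1e-2, (self.njoints, self.njoints))
        self.comb_matrix = self.add_weight(name='comb_matrix',
                                           shape=(self.njoints, self.njoints),
                                           initializer=Constant(init_mat),
                                           trainable=True)
        super(CombMatrix, self).build(input_shape)

    def call(self, inputs):
        # assumed input layout (batch, njoints, features): mix the joints with the
        # learned matrix, leaving the feature axis untouched
        return tf.einsum('ij,bjf->bif', self.comb_matrix, inputs)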
def create_model():
    sequence_input = Input(shape=(MAX_SEQ_LENGTH, ), dtype='int32')
    embedded_layer = Embedding(
        MAX_NB_WORDS + 1,
        EMBEDDING_DIM,
        embeddings_initializer=Constant(embedding_matrix),
        input_length=MAX_SEQ_LENGTH,
        trainable=False)
    embedded_sequences = embedded_layer(sequence_input)
    # convolution 1st layer
    x = Conv1D(128, 5, activation='relu')(embedded_sequences)
    x = BatchNormalization()(x)
    x = MaxPooling1D(5)(x)

    # convolution 2nd layer
    x = Conv1D(128, 5, activation='relu')(x)
    x = BatchNormalization()(x)
    x = MaxPooling1D(35)(x)

    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    category_output = Dense(len(label_mapping),
                            activation='softmax',
                            name="category")(x)
    journal_output = Dense(len(labelEncoder.classes_),
                           activation='softmax',
                           name="journal")(x)
    if_output = Dense(len(quartiles) + 1, activation='softmax', name="if")(x)
    model = Model(inputs=sequence_input,
                  outputs=[category_output, journal_output, if_output])
    model.compile(loss={
        'category': 'categorical_crossentropy',
        'journal': 'categorical_crossentropy',
        'if': 'categorical_crossentropy'
    },
                  optimizer=keras.optimizers.Adam(learning_rate=0.001),
                  metrics={
                      'category': 'accuracy',
                      'journal': 'accuracy',
                      'if': 'accuracy'
                  })

    return model
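Because the model has three named softmax heads, it is fitted with one label array per output, keyed by the output names. A minimal usage sketch with assumed, illustrative arrays (padded_sequences plus one-hot y_category, y_journal and y_if prepared elsewhere):

model = create_model()
model.fit(padded_sequences,
          {'category': y_category, 'journal': y_journal, 'if': y_if},
          batch_size=64,
          epochs=10,
          validation_split=0.1)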
Example #5
def train(choice, dirname, window):
    #  --------------------------------------------------------------------------------------------------------------------
    dimensionality = 50  # No need to adjust, unless you want to experiment with custom embeddings
    print("Dimensionality:", dimensionality)
    regex = re.compile(r"[+-.]?\d+[-.,\d+:]*(th|st|nd|rd)?")

    if choice == 'imm':
        base = '_imm'
    elif choice == 'prewin':
        base = '_prewin'
    else:
        raise ValueError("choice must be 'imm' or 'prewin'")
    style = 'test'
    mlmr_dir = dirname
    seq_length = window  # Adjust to 5 for PreWin and 5, 10, 50 for baseline results

    neg = load_from_pickle("{}/wiki_LOCATION_{}{}.pkl".format(
        mlmr_dir, style, base))
    pos = load_from_pickle("{}/wiki_INSTITUTE_{}{}.pkl".format(
        mlmr_dir, style, base))
    if path.exists("{}/wiki_EVENT_{}{}.pkl".format(mlmr_dir, style, base)):
        pos.extend(
            load_from_pickle("{}/wiki_EVENT_{}{}.pkl".format(
                mlmr_dir, style, base)))
    if path.exists("{}/wiki_TEAM_{}{}.pkl".format(mlmr_dir, style, base)):
        pos.extend(
            load_from_pickle("{}/wiki_TEAM_{}{}.pkl".format(
                mlmr_dir, style, base)))
    if path.exists("{}/wiki_ARTIFACT_{}{}.pkl".format(mlmr_dir, style, base)):
        pos.extend(
            load_from_pickle("{}/wiki_ARTIFACT_{}{}.pkl".format(
                mlmr_dir, style, base)))

    print("Sequence Length: 2 times ", seq_length)

    A = []
    dep_labels = {u"<u>"}
    for coll in [neg, pos]:
        for l in coll:
            A.append(l)
            dep_labels.update(set(l[1][-seq_length:] + l[3][:seq_length]))

    random.shuffle(A)

    X_L, D_L, X_R, D_R, Y = [], [], [], [], []
    for a in A:
        X_L.append(a[0][-seq_length:])
        D_L.append(a[1][-seq_length:])
        X_R.append(a[2][:seq_length])
        D_R.append(a[3][:seq_length])
        Y.append(a[4])

    print('No of training examples: ', len(X_L))
    dump_to_pickle("dep_labels.pkl", dep_labels)
    dep_labels = load_from_pickle("dep_labels.pkl")
    #  --------------------------------------------------------------------------------------------------------------------
    vocabulary = {u"<u>", u"0.0"}
    vocab_limit = 100000
    print('Vocabulary Size: ', vocab_limit)
    print("Building sequences...")

    count = 0
    vectors_glove = {u'<u>': np.ones(dimensionality)}
    # Please supply your own embeddings, see README.md for details
    for line in codecs.open("glove.6B.50d.txt", encoding="utf-8"):
        tokens = line.split()
        vocabulary.add(tokens[0])
        vectors_glove[tokens[0]] = [float(x) for x in tokens[1:]]
        count += 1
        if count >= vocab_limit:
            break

    vectors_glove[u"0.0"] = np.zeros(dimensionality)
    word_to_index = dict([(w, i) for i, w in enumerate(vocabulary)])
    dep_to_index = dict([(w, i) for i, w in enumerate(dep_labels)])

    for x_l, x_r, d_l, d_r in zip(X_L, X_R, D_L, D_R):
        for i, w in enumerate(x_l):
            if w != u"0.0":
                w = regex.sub(u"1", w)
            if w in word_to_index:
                x_l[i] = word_to_index[w]
            else:
                x_l[i] = word_to_index[u"<u>"]
        for i, w in enumerate(x_r):
            if w != u"0.0":
                w = regex.sub(u"1", w)
            if w in word_to_index:
                x_r[i] = word_to_index[w]
            else:
                x_r[i] = word_to_index[u"<u>"]
        for i, w in enumerate(d_l):
            arr = np.zeros(len(dep_labels))
            if w in dep_to_index:
                arr[dep_to_index[w]] = 1
            else:
                arr[dep_to_index[u"<u>"]] = 1
            d_l[i] = arr
        for i, w in enumerate(d_r):
            arr = np.zeros(len(dep_labels))
            if w in dep_to_index:
                arr[dep_to_index[w]] = 1
            else:
                arr[dep_to_index[u"<u>"]] = 1
            d_r[i] = arr

    X_L = np.asarray(X_L)
    X_R = np.asarray(X_R)
    D_L = np.asarray(D_L)
    D_R = np.asarray(D_R)
    Y = np.asarray(Y)

    # convert labels to one-hot format
    num_classes = Y.max() + 1
    one_hot = np.zeros((Y.size, num_classes))
    one_hot[np.arange(Y.size), Y] = 1
    Y = one_hot

    # embedding weight matrix shared by the left and right Embedding layers
    weights = np.zeros((len(vocabulary), dimensionality))
    for w in vocabulary:
        if w in vectors_glove:
            weights[word_to_index[w]] = vectors_glove[w]

    print(u"Done...")
    #  --------------------------------------------------------------------------------------------------------------------
    print(u'Building model...')
    first_input = Input(shape=(seq_length, ))
    a = Embedding(len(vocabulary),
                  dimensionality,
                  input_length=seq_length,
                  embeddings_initializer=Constant(weights))(first_input)
    b = LSTM(units=15)(a)
    first_output = Dropout(0.2)(b)
    model_left = Model(inputs=first_input, outputs=first_output)

    second_input = Input(shape=(seq_length, len(dep_labels)))
    a = TimeDistributed(Dense(units=15))(second_input)
    b = Dropout(0.2)(a)
    second_output = Flatten()(b)
    dep_left = Model(inputs=second_input, outputs=second_output)

    third_input = Input(shape=(seq_length, ))
    a = Embedding(len(vocabulary),
                  dimensionality,
                  input_length=seq_length,
                  embeddings_initializer=Constant(weights))(third_input)
    b = LSTM(units=15, go_backwards=True)(a)
    third_output = Dropout(0.2)(b)
    model_right = Model(inputs=third_input, outputs=third_output)

    fourth_input = Input(shape=(seq_length, len(dep_labels)))
    a = TimeDistributed(Dense(units=15))(fourth_input)
    b = Dropout(0.2)(a)
    fourth_output = Flatten()(b)
    dep_right = Model(inputs=fourth_input, outputs=fourth_output)

    a = concatenate([first_output, second_output, third_output, fourth_output])
    b = Dense(10)(a)
    c = Dense(num_classes, activation='softmax')(b)
    merged_model = Model(
        inputs=[first_input, second_input, third_input, fourth_input],
        outputs=c)
    merged_model.compile(loss='categorical_crossentropy',
                         optimizer='adagrad',
                         metrics=['accuracy'])
    print(u"Done...")
    #  --------------------------------------------------------------------------------------------------------------------
    checkpoint = ModelCheckpoint(filepath="lstm.hdf5", verbose=0)
    merged_model.fit([X_L, D_L, X_R, D_R],
                     Y,
                     batch_size=16,
                     epochs=5,
                     callbacks=[checkpoint],
                     verbose=0)
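An illustrative invocation (the directory name and window size are assumptions; the pickled wiki_*.pkl files and glove.6B.50d.txt must already be in place):

if __name__ == "__main__":
    train(choice='prewin',      # or 'imm' for the immediate-context baseline
          dirname='data/mlmr',  # hypothetical directory holding the wiki_*.pkl files
          window=5)             # tokens kept on each side of the entity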