def __init__(self, ratings_filename='../Data/ratings.npz', n_hidden_factors=10):
        """Load the ratings matrix and randomly initialise a latent-factor model.

        ratings_filename -- path to the saved sparse ratings matrix.
        n_hidden_factors -- dimensionality of the user/item factor vectors.
        """
        self.data = DataLoader.ratings_data(ratings_filename)
        n_users, n_items = self.data.shape
        self.n_users = n_users
        self.n_items = n_items
        self.n_hidden_factors = n_hidden_factors
        # Indices of the observed (non-zero) ratings.
        self.corpus_ix = self.data.nonzero()

        # NOTE: the order of np.random calls is preserved so runs with a
        # fixed global seed reproduce the original initialisation exactly.
        self.alpha = np.random.uniform()
        self.beta_user = np.random.rand(n_users)
        self.beta_item = np.random.rand(n_items)
        self.gamma_user = np.random.rand(n_users, n_hidden_factors)
        self.gamma_item = np.random.rand(n_items, n_hidden_factors)
        self.predicted_rating = np.zeros((n_users, n_items))
Beispiel #2
0
    def __init__(self,
                 ratings_filename='../Data/ratings.npz',
                 n_hidden_factors=10):
        """Load the ratings matrix and randomly initialise model parameters.

        ratings_filename -- path to the saved sparse ratings matrix.
        n_hidden_factors -- dimensionality of the latent factor vectors.
        """
        self.data = DataLoader.ratings_data(ratings_filename)
        self.n_users, self.n_items = self.data.shape
        self.n_hidden_factors = n_hidden_factors
        # Indices of the observed (non-zero) ratings.
        self.corpus_ix = self.data.nonzero()

        # Global bias, per-user/per-item biases, then latent factor matrices.
        self.alpha = np.random.uniform()
        self.beta_user = np.random.rand(self.n_users)
        self.beta_item = np.random.rand(self.n_items)
        self.gamma_user = np.random.rand(self.n_users, self.n_hidden_factors)
        self.gamma_item = np.random.rand(self.n_items, self.n_hidden_factors)
        self.predicted_rating = np.zeros((self.n_users, self.n_items))
Beispiel #3
0
    def __init__(self, reviews_filename='../Data/reviews.npz', n_topics=10):
        """Load the review corpus and randomly initialise the topic model.

        reviews_filename -- path to the saved sparse doc-term matrix.
        n_topics -- number of latent topics.
        """
        data = DataLoader.review_data(reviews_filename)
        self.n_docs, self.n_vocab = data.shape
        self.n_topics = n_topics

        # Row-normalise so each topic's word distribution sums to 1.
        self.phi = np.random.rand(n_topics, self.n_vocab)
        self.phi /= self.phi.sum(axis=1)[:, None]

        # Row-normalise so each document's topic distribution sums to 1.
        self.theta = np.random.rand(self.n_docs, n_topics)
        self.theta /= self.theta.sum(axis=1)[:, None]

        self.topic_frequencies = np.zeros((self.n_docs, self.n_topics))
        self.word_topic_frequencies = np.zeros((self.n_topics, self.n_vocab))
        self.backgroundwords = np.zeros(self.n_vocab)

        # z: per-document topic assignment for each word token (init 0);
        # reviews: per-document flat token lists.
        self.z = list()
        self.reviews = list()
        for doc_ix in xrange(self.n_docs):
            # flatten_bow presumably expands a bag-of-words row into a flat
            # token-index list -- TODO confirm against its definition.
            data_review = flatten_bow(data[doc_ix, :].toarray()[0])
            n_words = len(data_review)
            self.z.append(np.zeros(n_words, dtype=int))
            self.reviews.append(data_review)
            # Accumulate corpus-wide word counts (handles repeated indices).
            np.add.at(self.backgroundwords, data_review, 1.0)
        # Normalise counts into the background word distribution.
        self.backgroundwords /= np.sum(self.backgroundwords)
    def __init__(self, reviews_filename='../Data/reviews.npz', n_topics=10):
        """Load the review corpus and randomly initialise the topic model.

        reviews_filename -- path to the saved sparse doc-term matrix.
        n_topics -- number of latent topics.
        """
        data = DataLoader.review_data(reviews_filename)
        self.n_docs, self.n_vocab = data.shape
        self.n_topics = n_topics

        # Topic-word distribution: random, then row-normalised to sum to 1.
        phi = np.random.rand(n_topics, self.n_vocab)
        self.phi = phi / phi.sum(axis=1)[:, None]

        # Document-topic distribution: random, then row-normalised.
        theta = np.random.rand(self.n_docs, n_topics)
        self.theta = theta / theta.sum(axis=1)[:, None]

        self.topic_frequencies = np.zeros((self.n_docs, self.n_topics))
        self.word_topic_frequencies = np.zeros((self.n_topics, self.n_vocab))
        self.backgroundwords = np.zeros(self.n_vocab)

        self.z = []
        self.reviews = []
        for doc_ix in xrange(self.n_docs):
            review = flatten_bow(data[doc_ix, :].toarray()[0])
            self.z.append(np.zeros(len(review), dtype=int))
            self.reviews.append(review)
            # np.add.at handles repeated word indices within one review.
            np.add.at(self.backgroundwords, review, 1.0)
        self.backgroundwords /= np.sum(self.backgroundwords)
Beispiel #5
0
    x_char = []
    for sent in x_tmp2:
        padded_sent = sent
        pad = padsize - len(sent)
        for i in range(pad):
            padded_sent = np.vstack((zeroes, padded_sent))
        x_char.append(padded_sent)
    print('Padded until %s tokens.' % padsize)
    return x_char


"""
Preparing file
"""

train = DL('ner_3_train.ner')
test = DL('ner_3_test.ner')
# val = DL('id-ud-dev.pos')
# train.add('id-ud-dev.pos')
"""
Load pre-trained word embedding
"""

embeddings_index = {}
# WE_DIR = raw_input('Enter word embedding file name: ')
WE_DIR = 'polyglot.vec'

print 'Loading', WE_DIR, '...'
f = open(WE_DIR, 'r')
for line in f:
    values = line.split()
Beispiel #6
0
def activationPrompt(name):
    """Prompt the user to pick a Keras activation function by its number.

    name -- label for the layer being configured, shown in the prompt.
    Returns the chosen activation's string name.

    NOTE(review): relies on Python 2 `input()` evaluating the reply to an
    int; under Python 3 `choice` would be a str and `choice - 1` would raise.
    """
    print "List of Activation Functions\n" \
          "1. softmax\t\t2. elu\t\t3. selu\t\t4. softplus\t\t5. softsign\n" \
          "6. relu\t\t7. tanh\t\t8. sigmoid\t\t9. hard_sigmoid\t\t10. linear"
    choice = input('Enter type of activation function for ' + name + ': ')
    activations = ['softmax', 'elu', 'selu', 'softplus', 'softsign',
                   'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear']
    # Menu numbers are 1-based; the list is 0-based.
    return activations[choice - 1]


"""
Preparing file
"""
train_name = sys.argv[1]
train = DL(train_name)
# percentage = input('Enter percentage of data to take: ')
percentage = 0.9
seed = sys.argv[2]
# seed = input('Enter seed for slicing data: ')
train.slice(percentage, seed)
test_name = sys.argv[1]
test = DL(test_name)
test.antislice(percentage, seed)

"""
Load pre-trained word embedding
"""

embeddings_index = {}
# WE_DIR = raw_input('Enter word embedding file name: ')
    x_char = []
    for sent in x_tmp2:
        padded_sent = sent
        pad = padsize - len(sent)
        for i in range(pad):
            padded_sent = np.vstack((zeroes, padded_sent))
        x_char.append(padded_sent)
    print('Padded until %s tokens.' % padsize)
    return x_char


"""
Preparing file
"""

train = DL('ner_3_train.ner')
percentage = float(sys.argv[1])  # input('Enter percentage of data to take: ')
seed = int(sys.argv[2])  # input('Enter seed for slicing data: ')
train.slice(percentage, seed)
test = DL('ner_3_test.ner')
# val = DL('ner_3_train.ner')
# train.add('id-ud-dev.pos')
"""
Load pre-trained word embedding
"""

embeddings_index = {}
WE_DIR = 'polyglot.vec'

print 'Loading', WE_DIR, '...'
f = open(WE_DIR, 'r')
Beispiel #8
0
    def createModel(self, traindata, valdata, testdata, wordemb, charemb):
        """Build the word+char BiGRU-CRF sequence-labelling model.

        Loads the three datasets, builds word/char/label indices and
        embedding matrices, pads all inputs, and compiles self.model.
        traindata/valdata/testdata -- dataset file names for DL.
        wordemb/charemb -- pre-trained embedding file names.
        """
        self.train = DL(traindata)
        self.val = DL(valdata)
        self.test = DL(testdata)

        # Load pre-trained embedding
        embeddings_index, we_words = self.pretrainedEmbeddingLoader(wordemb)
        char_embeddings_index, ce_words = self.pretrainedEmbeddingLoader(
            charemb)

        # Create Word & Label Index
        self.char = DI(self.train.words + ce_words)
        self.word = DI([self.train.words, [we_words]])
        self.label = DI([self.train.labels])
        print 'Found', self.word.cnt - 1, 'unique words.'
        print 'Found', self.char.cnt - 1, 'unique chars.'
        print 'Found', self.label.cnt - 1, 'unique labels.'

        # Create word embedding matrix
        # EMBEDDING_DIM is taken from the length of the LAST vector read by
        # pretrainedEmbeddingLoader (self.coefs) -- overrides the class default.
        self.EMBEDDING_DIM = len(self.coefs)
        embedding_matrix = np.zeros(
            (len(self.word.index) + 1, int(self.EMBEDDING_DIM)))
        for wrd, i in self.word.index.items():
            embedding_vector = embeddings_index.get(wrd)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        # Create char embedding matrix
        char_embedding_matrix = np.zeros(
            (len(self.char.index) + 1, int(self.EMBEDDING_DIM)))
        for chars, i in self.char.index.items():
            embedding_vector = char_embeddings_index.get(chars)
            if embedding_vector is not None:
                char_embedding_matrix[i] = embedding_vector

        trimlen = self.padsize
        self.train.trim(trimlen)
        self.test.trim(trimlen)
        self.val.trim(trimlen)

        self.x_train = DM(self.train.words, self.word.index)
        self.x_test = DM(self.test.words, self.word.index)
        self.x_val = DM(self.val.words, self.word.index)
        print "Number of OOV:", len(self.x_test.oov_index)
        print "OOV word occurences:", self.x_test.oov
        print "Number of OOV (val):", len(self.x_val.oov_index)
        print "OOV word occurences (val):", self.x_val.oov
        padsize = self.padsize
        self.x_train.pad(padsize)
        self.x_test.pad(padsize)
        self.x_val.pad(padsize)
        print('Padded until %s tokens.' % padsize)

        self.y_train = DM(self.train.labels, self.label.index)
        self.y_test = DM(self.test.labels, self.label.index)
        self.y_val = DM(self.val.labels, self.label.index)

        self.y_train.pad(padsize)
        self.y_test.pad(padsize)
        self.y_val.pad(padsize)
        self.y_encoded = to_categorical(self.y_train.padded)
        self.y_val_enc = to_categorical(self.y_val.padded)

        # Converting char text data to int using index
        self.x_test_char = self.convertCharText2Int(self.test)
        self.x_train_char = self.convertCharText2Int(self.train)
        self.x_val_char = self.convertCharText2Int(self.val)

        # Create keras word model
        MAX_SEQUENCE_LENGTH = self.padsize
        embedding_layer = Embedding(len(self.word.index) + 1,
                                    self.EMBEDDING_DIM,
                                    weights=[embedding_matrix],
                                    input_length=MAX_SEQUENCE_LENGTH,
                                    mask_zero=self.mask)

        sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')

        embedded_sequences = embedding_layer(sequence_input)
        drop = self.dropout_embedding
        dropout = Dropout(rate=drop)(embedded_sequences)

        # Create keras char model
        # Flatten (batch, padsize, char_padsize, dim) into a per-word batch
        # so one GRU can run over every word's characters.
        def reshape_one(c):
            return K.reshape(c, (tf.shape(c)[0] * self.padsize,
                                 self.char_padsize, self.CHAR_EMBEDDING_DIM))

        # Inverse of reshape_one; doubled last dim when the char BiGRU
        # outputs are concatenated. NOTE(review): '/' is Python 2 integer
        # division here.
        def reshape_two(c):
            if merge_m_c == 'concat':
                return K.reshape(c,
                                 (tf.shape(c)[0] / self.padsize, self.padsize,
                                  self.CHAR_EMBEDDING_DIM * 2))
            else:
                return K.reshape(c, (tf.shape(c)[0] / self.padsize,
                                     self.padsize, self.CHAR_EMBEDDING_DIM))

        MAX_WORD_LENGTH = self.char_padsize

        embedding_layer_c = Embedding(len(self.char.index) + 1,
                                      self.EMBEDDING_DIM,
                                      weights=[char_embedding_matrix],
                                      input_length=MAX_WORD_LENGTH,
                                      mask_zero=self.mask)

        sequence_input_c = Input(shape=(
            self.padsize,
            MAX_WORD_LENGTH,
        ),
                                 dtype='int32')
        embedded_sequences_c = embedding_layer_c(sequence_input_c)
        dropout_c = Dropout(rate=drop)(embedded_sequences_c)

        rone = Lambda(reshape_one)(dropout_c)
        merge_m = 'concat'
        merge_m_c = merge_m
        dropout_gru = self.dropout_gru
        rec_dropout = dropout_gru
        gru_karakter = Bidirectional(GRU(self.CHAR_EMBEDDING_DIM,
                                         return_sequences=False,
                                         dropout=dropout_gru,
                                         recurrent_dropout=rec_dropout),
                                     merge_mode=merge_m,
                                     weights=None)(rone)

        rtwo = Lambda(reshape_two)(gru_karakter)

        # Combine word + char model
        merge_m = 'concat'
        merge = Concatenate()([dropout, rtwo])
        gru_kata = Bidirectional(GRU(self.EMBEDDING_DIM * 3,
                                     return_sequences=True,
                                     dropout=dropout_gru,
                                     recurrent_dropout=rec_dropout),
                                 merge_mode=merge_m,
                                 weights=None)(merge)

        crf = CRF(len(self.label.index) + 1, learn_mode='marginal')(gru_kata)
        self.model = Model(inputs=[sequence_input, sequence_input_c],
                           outputs=[crf])

        optimizer = self.optimizer
        loss = self.loss
        self.model.summary()
        self.model.compile(loss=loss, optimizer=optimizer, metrics=['acc'])
0
class SeqTagger:
    """Word+character BiGRU-CRF sequence tagger (Keras, Python 2).

    Class attributes below are hyperparameter defaults; EMBEDDING_DIM is
    overwritten at createModel time from the loaded embedding file.
    """
    mask = True  # mask pad (zeros) or not
    EMBEDDING_DIM = 64
    CHAR_EMBEDDING_DIM = 64
    padsize = 188
    char_padsize = 41
    dropout_embedding = 0.4
    dropout_gru = 0.5
    optimizer = 'adagrad'
    loss = 'poisson'
    patience = 3

    def __init__(self):
        """Initialise all state holders to empty placeholders."""
        self.coefs = []
        self.textinput = ''
        self.train = ''
        self.val = ''
        self.test = ''
        self.char = ''
        self.word = ''
        self.label = ''
        self.x_train = ''
        self.x_val = ''
        self.x_test = ''
        self.x_test_char = ''
        self.y_train = ''
        self.y_test = ''
        self.y_val = ''
        self.y_encoded = ''
        self.y_val_enc = ''
        self.x_test_char = ''
        self.x_train_char = ''
        self.x_val_char = ''
        self.model = {}
        self.results = []
        self.data = {}
        self.json_data = {}

    def createModel(self, traindata, valdata, testdata, wordemb, charemb):
        """Build and compile the word+char BiGRU-CRF model.

        traindata/valdata/testdata -- dataset file names for DL.
        wordemb/charemb -- pre-trained embedding file names.
        """
        self.train = DL(traindata)
        self.val = DL(valdata)
        self.test = DL(testdata)

        # Load pre-trained embedding
        embeddings_index, we_words = self.pretrainedEmbeddingLoader(wordemb)
        char_embeddings_index, ce_words = self.pretrainedEmbeddingLoader(
            charemb)

        # Create Word & Label Index
        self.char = DI(self.train.words + ce_words)
        self.word = DI([self.train.words, [we_words]])
        self.label = DI([self.train.labels])
        print 'Found', self.word.cnt - 1, 'unique words.'
        print 'Found', self.char.cnt - 1, 'unique chars.'
        print 'Found', self.label.cnt - 1, 'unique labels.'

        # Create word embedding matrix
        # Dimension comes from the last vector read (self.coefs).
        self.EMBEDDING_DIM = len(self.coefs)
        embedding_matrix = np.zeros(
            (len(self.word.index) + 1, int(self.EMBEDDING_DIM)))
        for wrd, i in self.word.index.items():
            embedding_vector = embeddings_index.get(wrd)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        # Create char embedding matrix
        char_embedding_matrix = np.zeros(
            (len(self.char.index) + 1, int(self.EMBEDDING_DIM)))
        for chars, i in self.char.index.items():
            embedding_vector = char_embeddings_index.get(chars)
            if embedding_vector is not None:
                char_embedding_matrix[i] = embedding_vector

        trimlen = self.padsize
        self.train.trim(trimlen)
        self.test.trim(trimlen)
        self.val.trim(trimlen)

        self.x_train = DM(self.train.words, self.word.index)
        self.x_test = DM(self.test.words, self.word.index)
        self.x_val = DM(self.val.words, self.word.index)
        print "Number of OOV:", len(self.x_test.oov_index)
        print "OOV word occurences:", self.x_test.oov
        print "Number of OOV (val):", len(self.x_val.oov_index)
        print "OOV word occurences (val):", self.x_val.oov
        padsize = self.padsize
        self.x_train.pad(padsize)
        self.x_test.pad(padsize)
        self.x_val.pad(padsize)
        print('Padded until %s tokens.' % padsize)

        self.y_train = DM(self.train.labels, self.label.index)
        self.y_test = DM(self.test.labels, self.label.index)
        self.y_val = DM(self.val.labels, self.label.index)

        self.y_train.pad(padsize)
        self.y_test.pad(padsize)
        self.y_val.pad(padsize)
        self.y_encoded = to_categorical(self.y_train.padded)
        self.y_val_enc = to_categorical(self.y_val.padded)

        # Converting char text data to int using index
        self.x_test_char = self.convertCharText2Int(self.test)
        self.x_train_char = self.convertCharText2Int(self.train)
        self.x_val_char = self.convertCharText2Int(self.val)

        # Create keras word model
        MAX_SEQUENCE_LENGTH = self.padsize
        embedding_layer = Embedding(len(self.word.index) + 1,
                                    self.EMBEDDING_DIM,
                                    weights=[embedding_matrix],
                                    input_length=MAX_SEQUENCE_LENGTH,
                                    mask_zero=self.mask)

        sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')

        embedded_sequences = embedding_layer(sequence_input)
        drop = self.dropout_embedding
        dropout = Dropout(rate=drop)(embedded_sequences)

        # Create keras char model
        # Collapse the word axis so a single GRU runs per-word over chars.
        def reshape_one(c):
            return K.reshape(c, (tf.shape(c)[0] * self.padsize,
                                 self.char_padsize, self.CHAR_EMBEDDING_DIM))

        # Inverse reshape; NOTE(review): '/' is Python 2 integer division.
        def reshape_two(c):
            if merge_m_c == 'concat':
                return K.reshape(c,
                                 (tf.shape(c)[0] / self.padsize, self.padsize,
                                  self.CHAR_EMBEDDING_DIM * 2))
            else:
                return K.reshape(c, (tf.shape(c)[0] / self.padsize,
                                     self.padsize, self.CHAR_EMBEDDING_DIM))

        MAX_WORD_LENGTH = self.char_padsize

        embedding_layer_c = Embedding(len(self.char.index) + 1,
                                      self.EMBEDDING_DIM,
                                      weights=[char_embedding_matrix],
                                      input_length=MAX_WORD_LENGTH,
                                      mask_zero=self.mask)

        sequence_input_c = Input(shape=(
            self.padsize,
            MAX_WORD_LENGTH,
        ),
                                 dtype='int32')
        embedded_sequences_c = embedding_layer_c(sequence_input_c)
        dropout_c = Dropout(rate=drop)(embedded_sequences_c)

        rone = Lambda(reshape_one)(dropout_c)
        merge_m = 'concat'
        merge_m_c = merge_m
        dropout_gru = self.dropout_gru
        rec_dropout = dropout_gru
        gru_karakter = Bidirectional(GRU(self.CHAR_EMBEDDING_DIM,
                                         return_sequences=False,
                                         dropout=dropout_gru,
                                         recurrent_dropout=rec_dropout),
                                     merge_mode=merge_m,
                                     weights=None)(rone)

        rtwo = Lambda(reshape_two)(gru_karakter)

        # Combine word + char model
        merge_m = 'concat'
        merge = Concatenate()([dropout, rtwo])
        gru_kata = Bidirectional(GRU(self.EMBEDDING_DIM * 3,
                                     return_sequences=True,
                                     dropout=dropout_gru,
                                     recurrent_dropout=rec_dropout),
                                 merge_mode=merge_m,
                                 weights=None)(merge)

        crf = CRF(len(self.label.index) + 1, learn_mode='marginal')(gru_kata)
        self.model = Model(inputs=[sequence_input, sequence_input_c],
                           outputs=[crf])

        optimizer = self.optimizer
        loss = self.loss
        self.model.summary()
        self.model.compile(loss=loss, optimizer=optimizer, metrics=['acc'])

    def trainFit(self):
        """Train the compiled model with early stopping on val_loss.

        NOTE(review): both validation_data and validation_split are passed;
        Keras ignores validation_split when validation_data is given --
        confirm which was intended.
        """
        val_data = ([np.array(self.x_val.padded),
                     np.array(self.x_val_char)], [np.array(self.y_val_enc)])
        callback = EarlyStopping(monitor='val_loss',
                                 patience=self.patience,
                                 verbose=0,
                                 mode='auto')
        self.model.fit(
            [np.array(self.x_train.padded),
             np.array(self.x_train_char)], [np.array(self.y_encoded)],
            validation_data=val_data,
            validation_split=0.1,
            epochs=100,
            batch_size=32,
            callbacks=[callback])

    def evaluate(self):
        """Predict on the test set and print classification report / F1."""
        self.results = []
        print "Computing..."
        raw_results = self.model.predict(
            [np.array(self.x_test.padded),
             np.array(self.x_test_char)])
        # Argmax over marginal label probabilities per token.
        for raw_result in raw_results:
            result = []
            for token in raw_result:
                value = np.argmax(token)
                result.append(value)
            self.results.append(result)

        label_index = range(1, len(self.label.index) + 1)
        label_names = []
        # Python-2-only: iteritems + tuple-parameter lambda.
        for key, value in sorted(self.label.index.iteritems(),
                                 key=lambda (k, v): (v, k)):
            label_names.append(key)

        # flatten list for sklearn evaluation
        y_true = [item for sublist in self.y_test.padded for item in sublist]
        y_pred = [item for sublist in self.results for item in sublist]
        print "Sklearn evaluation:"
        print classification_report(y_true,
                                    y_pred,
                                    labels=label_index,
                                    target_names=label_names)

        # label_index[1:] skips the first label -- presumably padding or the
        # 'O' tag; TODO confirm against the label index.
        f1_mac = f1_score(y_true,
                          y_pred,
                          labels=label_index[1:],
                          average='macro')
        f1_mic = f1_score(y_true,
                          y_pred,
                          labels=label_index[1:],
                          average='micro')
        f1 = max([f1_mac, f1_mic])
        print 'F-1 Score:'
        print f1

    def predict(self, text):
        """Tag raw input text and return a JSON {'words', 'labels'} string."""
        self.textinput = text
        self.test = DP(text)
        self.x_test = DM(self.test.words, self.word.index)
        print "Number of OOV:", len(self.x_test.oov_index)
        print "OOV word occurences:", self.x_test.oov

        self.x_test.pad(self.padsize)
        print('Padded until %s tokens.' % self.padsize)

        self.x_test_char = self.convertCharText2Int(self.test)

        self.results = []
        print "Computing..."
        print self.x_test.padded
        print self.x_test_char
        raw_results = self.model.predict(
            [np.array(self.x_test.padded),
             np.array(self.x_test_char)])
        for raw_result in raw_results:
            result = []
            for token in raw_result:
                value = np.argmax(token)
                result.append(value)
            self.results.append(result)

        # Reverse-map label ids to names; keys()/values() stay aligned as
        # long as the dict is not mutated in between.
        temp = self.results[0]
        li = self.label.index
        keys = li.keys()
        values = li.values()
        self.results = []
        start = False
        # Skip leading pad (0) tokens; once real labels start, a 0 inside
        # the sequence is rendered as '?'.
        for token in temp:
            if token != 0:
                start = True
            if start:
                if token == 0:
                    self.results.append('?')
                else:
                    self.results.append(keys[values.index(token)])

        print self.test.words[0]
        print self.results

        self.data = {'words': self.test.words[0], 'labels': self.results}
        self.json_data = json.dumps(self.data)
        return self.json_data

    def log(self):
        """Append the last prediction to log.csv.

        NOTE(review): self.w_name is never assigned anywhere in this class,
        so log() raises AttributeError unless it is set externally.
        """
        self.textoutput = ''
        for token in self.results:
            self.textoutput = self.textoutput + token + ' '
        rnow = datetime.now()
        logcsv = open('log.csv', 'a')
        writer = csv.writer(logcsv, delimiter=',')
        writer.writerow([
            'no',
            str(rnow.date()),
            str(rnow.time())[:-10], self.w_name, self.word.cnt - 1,
            self.char.cnt - 1, self.textinput,
            len(self.x_test.oov_index), self.textoutput
        ])
        logcsv.close()

    def convertCharText2Int(self, dataload):
        """Convert each sentence's words to padded char-index matrices.

        Side effect: grows self.char_padsize if a longer word is seen.
        Returns a list of (padsize x char_padsize) arrays.
        """
        x_tmp1 = []
        for sent in dataload.words:
            x_map = DM(sent, self.char.index, False)
            if x_map.padsize > self.char_padsize:
                self.char_padsize = x_map.padsize
            x_tmp1.append(x_map)

        # Pad every word to char_padsize characters.
        x_tmp2 = []
        for sent in x_tmp1:
            sent.pad(self.char_padsize)
            x_tmp2.append(sent.padded)
        print('Padded until %s chars.' % self.char_padsize)
        zeroes = []
        for i in range(self.char_padsize):
            zeroes.append(0)
        # Pad every sentence to padsize words by prepending zero rows.
        x_char = []
        for sent in x_tmp2:
            padded_sent = sent
            pad = self.padsize - len(sent)
            for i in range(pad):
                padded_sent = np.vstack((zeroes, padded_sent))
            x_char.append(padded_sent)
        print('Padded until %s tokens.' % self.padsize)
        return x_char

    def pretrainedEmbeddingLoader(self, filename):
        """Read a whitespace-separated embedding file.

        Returns (token -> vector dict, list of tokens). Side effect:
        self.coefs holds the LAST vector read; createModel uses its length
        as the embedding dimension.
        """
        embeddings_index = {}
        f = open(filename, 'r')
        for line in f:
            values = line.split()
            token = values[0]
            self.coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[token] = self.coefs
        f.close()
        embs = []
        for token in embeddings_index:
            embs.append(token)
        return embeddings_index, embs

    def saveModel(self, w_name):
        """Pickle each layer's weights to <w_name>_<i>.wgt files."""
        for i in range(len(self.model.layers)):
            with open(w_name + '_' + str(i) + '.wgt', 'wb') as fp:
                pickle.dump(self.model.layers[i].get_weights(), fp)

    def loadModel(self, w_name):
        """Load per-layer pickled weights into the already-built model."""
        m_layers_len = len(self.model.layers)
        for i in range(m_layers_len):
            with open(w_name + '_' + str(i) + ".wgt", "rb") as fp:
                w = pickle.load(fp)
                self.model.layers[i].set_weights(w)
Beispiel #10
0
    print "List of Activation Functions\n" \
          "1. softmax\t\t2. elu\t\t3. selu\t\t4. softplus\t\t5. softsign\n" \
          "6. relu\t\t7. tanh\t\t8. sigmoid\t\t9. hard_sigmoid\t\t10. linear"
    choice = input('Enter type of activation function for ' + name + ': ')
    activations = [
        'softmax', 'elu', 'selu', 'softplus', 'softsign', 'relu', 'tanh',
        'sigmoid', 'hard_sigmoid', 'linear'
    ]
    return activations[choice - 1]


"""
Preparing file
"""

train = DL('ner_2_train.ner')
percentage = input('Enter percentage of data to take: ')
seed = input('Enter seed for slicing data: ')
train.slice(percentage, seed)
test = DL('ner_2_test.ner')
"""
Load pre-trained word embedding
"""

embeddings_index = {}
WE_DIR = raw_input('Enter word embedding file name: ')
# WE_DIR = 'polyglot.vec'

print 'Loading', WE_DIR, '...'
f = open(WE_DIR, 'r')
for line in f:
Beispiel #11
0

def activationPrompt(name):
    """Prompt the user to pick a Keras activation function by its number.

    name -- label for the layer being configured, shown in the prompt.
    Returns the chosen activation's string name.

    NOTE(review): relies on Python 2 `input()` evaluating the reply to an
    int; under Python 3 `choice` would be a str and `choice-1` would raise.
    """
    print "List of Activation Functions\n" \
          "1. softmax\t\t2. elu\t\t3. selu\t\t4. softplus\t\t5. softsign\n" \
          "6. relu\t\t7. tanh\t\t8. sigmoid\t\t9. hard_sigmoid\t\t10. linear"
    choice = input('Enter type of activation function for ' + name + ': ')
    activations = ['softmax', 'elu', 'selu', 'softplus', 'softsign',
                   'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear']
    # Menu numbers are 1-based; the list is 0-based.
    return activations[choice-1]

"""
Preparing file
"""

train = DL('id-ud-train.pos')
test = DL('id-ud-test.pos')
train.add('id-ud-dev.pos')

"""
Load pre-trained word embedding
"""

embeddings_index = {}
WE_DIR = raw_input('Enter word embedding file name: ')
# WE_DIR = 'polyglot.vec'

print 'Loading', WE_DIR, '...'
f = open(WE_DIR, 'r')
for line in f:
    values = line.split()
Beispiel #12
0
        padded_sent = sent
        pad = padsize - len(sent)
        for i in range(pad):
            padded_sent = np.vstack((zeroes, padded_sent))
        x_char.append(padded_sent)
    print('Padded until %s tokens.' % padsize)
    return x_char


"""
Preparing file
"""

percentage = 0.9
seed = sys.argv[1]
train = DL('ner_3_train.ner')
train.slice(percentage, seed)
test = DL('ner_3_train.ner')
test.antislice(percentage, seed)
val = DL('ner_3_train.ner')
val.antislice(percentage, seed)
# train.add('id-ud-dev.pos')

"""
Load pre-trained word embedding
"""

embeddings_index = {}
# WE_DIR = raw_input('Enter word embedding file name: ')
WE_DIR = 'polyglot.vec'