Code Example #1
File: run.py Project: alexjane19/QASystemLSTM
def Update_Word_Embedding(languege):
    # languege: 0 = Persian, 1 = English
    if languege == 0:
        question_pre = load_data(pre_path_files + 'Persian/question_pre')
        answering_pre = load_data(pre_path_files + 'Persian/answering_pre')
        tokenizer, embedding_matrix = word_embed_meta_data(question_pre + answering_pre, siamese_config['EMBEDDING_DIM'])
        save_data(pre_path_files + 'Persian/tokenizer', tokenizer)
        save_data(pre_path_files + 'Persian/embedding_matrix', embedding_matrix)
    elif languege == 1:
        question_pre = load_data(pre_path_files + 'English/question_pre')
        answering_pre = load_data(pre_path_files + 'English/answering_pre')
        tokenizer, embedding_matrix = word_embed_meta_data(question_pre + answering_pre, siamese_config['EMBEDDING_DIM'])
        save_data(pre_path_files + 'English/tokenizer', tokenizer)
        save_data(pre_path_files + 'English/embedding_matrix', embedding_matrix)
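Every snippet on this page calls word_embed_meta_data to turn a text corpus into a fitted tokenizer plus an embedding matrix. Below is a minimal sketch of what such a helper typically does, assuming a Keras Tokenizer and a freshly trained gensim Word2Vec model; the name word_embed_meta_data_sketch, its signature, and the Word2Vec choice are illustrative assumptions, not the project's actual implementation.

import numpy as np
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer

def word_embed_meta_data_sketch(documents, embedding_dim):
    # Fit a word-index tokenizer on the whole corpus (e.g. questions + answers).
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(documents)
    # Train word vectors on the same corpus (gensim >= 4 uses vector_size).
    tokenized = [doc.lower().split() for doc in documents]
    w2v = Word2Vec(tokenized, vector_size=embedding_dim, min_count=1)
    # Build an index-aligned embedding matrix; words without a vector stay zero.
    embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
    for word, idx in tokenizer.word_index.items():
        if word in w2v.wv:
            embedding_matrix[idx] = w2v.wv[word]
    return tokenizer, embedding_matrix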
Code Example #2
File: run.py Project: alexjane19/QASystemLSTM
def Word_Embedding(languege='Persian'):
    file = ''
    if not os.path.exists(pre_path_files + languege + '/Word2Vec'):
        os.makedirs(pre_path_files + languege + '/Word2Vec')
        file = 'model-' + str(datetime.datetime.now().strftime("%Y-%m-%d-%I-%M")) + '.model'
    else:
        # reuse an existing Word2Vec model file (glob the files, not the bare directory)
        file = os.path.basename(glob.glob(pre_path_files + languege + '/Word2Vec/*.model')[0])
    question_pre = load_data(pre_path_files + languege + '/question_pre')
    # groupby without prior sorting only removes consecutive duplicates
    question_pre = [k for k, v in itertools.groupby(question_pre)]
    answering_pre = load_data(pre_path_files + languege + '/answering_pre')
    answering_pre = [k for k, v in itertools.groupby(answering_pre)]
    tokenizer, embedding_matrix = word_embed_meta_data(question_pre + answering_pre, siamese_config['EMBEDDING_DIM'],
                                                       pre_path_files + languege + '/Word2Vec/' + file)
    save_data(pre_path_files + languege + '/Word2Vec/tokenizer', tokenizer)
    save_data(pre_path_files + languege + '/Word2Vec/embedding_matrix', embedding_matrix)
Code Example #3
def fn():
    df = pd.read_csv('train_data.csv')
    sentences1 = list(df['sentences1'])
    sentences2 = list(df['sentences2'])
    is_similar = list(df['is_similar'])
    sentences1 = pre.preprocessing(sentences1)
    sentences2 = pre.preprocessing(sentences2)
    tokenizer, embedding_matrix = word_embed_meta_data(
        sentences1 + sentences2, siamese_config['EMBEDDING_DIM'])
    embedding_meta_data = {
        'tokenizer': tokenizer,
        'embedding_matrix': embedding_matrix
    }
    return tokenizer, embedding_matrix, embedding_meta_data
Code Example #4
def get_doc2vec_vectors_train_valid_split(trainingData):

    # split the dataset into training and validation datasets
    train_x, valid_x, train_y, valid_y = get_train_test_split_of_dataframe(
        trainingData, False)

    # label encode the target variable (fit on train only, then transform validation)
    encoder = preprocessing.LabelEncoder()
    train_y = encoder.fit_transform(train_y)
    valid_y = encoder.transform(valid_y)

    # use plain lists so "+" concatenates the corpora instead of adding Series element-wise
    sentences1 = list(train_x['Q1'])
    sentences2 = list(train_x['Q2'])
    is_similar = list(train_y)

    sentences1_validate = list(valid_x['Q1'])
    sentences2_validate = list(valid_x['Q2'])
    is_similar_validate = list(valid_y)

    tokenizer, embedding_matrix = word_embed_meta_data(
        sentences1 + sentences2, siamese_config['EMBEDDING_DIM'])

    embedding_meta_data = {
        'tokenizer': tokenizer,
        'embedding_matrix': embedding_matrix
    }

    ## creating sentence pairs
    sentences_pairs = [(x1, x2) for x1, x2 in zip(sentences1, sentences2)]
    del sentences1
    del sentences2

    sentences_pairs_validate = [
        (x1, x2) for x1, x2 in zip(sentences1_validate, sentences2_validate)
    ]
    del sentences1_validate
    del sentences2_validate

    test_data_x1, test_data_x2, leaks_test = create_test_data(
        tokenizer, sentences_pairs_validate,
        siamese_config['MAX_SEQUENCE_LENGTH'])
    test_data_x = [test_data_x1, test_data_x2, leaks_test]

    return sentences_pairs, is_similar, test_data_x, is_similar_validate, embedding_meta_data
Code Example #5
File: controller.py Project: aniketnk/InternKit

df = pd.read_csv('intelligent_component/sample_data.csv')
sentences1 = list(df['sentences1'])
sentences2 = list(df['sentences2'])
is_similar = list(df['is_similar'])
del df


####################################
######## Word Embedding ############
####################################


# create the word-embedding metadata (tokenizer + embedding matrix) for the corpus
tokenizer, embedding_matrix = word_embed_meta_data(sentences1 + sentences2, siamese_config['EMBEDDING_DIM'])

embedding_meta_data = {
	'tokenizer': tokenizer,
	'embedding_matrix': embedding_matrix
}

## creating sentence pairs
sentences_pair = [(x1, x2) for x1, x2 in zip(sentences1, sentences2)]
del sentences1
del sentences2


##########################
######## Training ########
##########################
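The snippet stops at the Training divider. In repositories that follow this siamese-LSTM layout, training usually continues along the lines below; the SiameseBiLSTM class, its constructor arguments, the module name, and train_model are assumptions inferred from the surrounding config keys, not code confirmed by this snippet.

# Hypothetical continuation (assumption): train a siamese BiLSTM on the sentence pairs.
from model import SiameseBiLSTM  # assumed module/class name

siamese = SiameseBiLSTM(siamese_config['EMBEDDING_DIM'],
                        siamese_config['MAX_SEQUENCE_LENGTH'],
                        siamese_config['NUMBER_LSTM'],
                        siamese_config['NUMBER_DENSE_UNITS'],
                        siamese_config['RATE_DROP_LSTM'],
                        siamese_config['RATE_DROP_DENSE'],
                        siamese_config['ACTIVATION_FUNCTION'],
                        siamese_config['VALIDATION_SPLIT'])
best_model_path = siamese.train_model(sentences_pair, is_similar, embedding_meta_data,
                                      model_save_directory='./checkpoints/')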
Code Example #6
    def Execute_Model(self):
        EMBEDDING_DIM = 50
        MAX_SEQUENCE_LENGTH = 10
        RATE_DROP_LSTM = 0.17
        RATE_DROP_DENSE = 0.25
        NUMBER_LSTM = 50
        NUMBER_DENSE_UNITS = 50
        ACTIVATION_FUNCTION = 'relu'
        VALIDATION_SPLIT = 0.1

        sentences1 = list(self.df['question1'].astype(str))
        sentences2 = list(self.df['question2'].astype(str))
        is_similar = list(self.df['is_duplicate'])
        tokenizer, embedding_matrix = word_embed_meta_data(
            sentences1 + sentences2, EMBEDDING_DIM)

        embedding_meta_data = {
            'tokenizer': tokenizer,
            'embedding_matrix': embedding_matrix
        }
        sentences_pair = [(x1, x2) for x1, x2 in zip(sentences1, sentences2)]
        nb_words = len(tokenizer.word_index) + 1
        embedding_layer = layers.Embedding(nb_words,
                                           EMBEDDING_DIM,  # must match the width of embedding_matrix
                                           weights=[embedding_matrix],
                                           input_length=MAX_SEQUENCE_LENGTH,
                                           trainable=False)

        lstm_layer = layers.Bidirectional(
            layers.LSTM(NUMBER_LSTM,
                        dropout=RATE_DROP_LSTM,
                        recurrent_dropout=RATE_DROP_LSTM))

        sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')
        embedded_sequences_1 = embedding_layer(sequence_1_input)
        left_output = lstm_layer(embedded_sequences_1)

        sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')
        embedded_sequences_2 = embedding_layer(sequence_2_input)
        right_output = lstm_layer(embedded_sequences_2)

        merged = layers.concatenate([left_output, right_output], axis=-1)
        merged = BatchNormalization()(merged)
        merged = layers.Dropout(0.1)(merged)
        merged = layers.Dense(128, activation='relu')(merged)
        merged = BatchNormalization()(merged)
        merged = layers.Dropout(0.1)(merged)
        predictions = layers.Dense(1, activation='sigmoid')(merged)

        model = Model([sequence_1_input, sequence_2_input], predictions)

        model.compile(loss='binary_crossentropy',
                      optimizer='nadam',
                      metrics=['acc'])
        model.summary()
        # create_train_dev_set also returns "leaks" features, which this two-input model does not use
        train_data_x1, train_data_x2, train_labels, leaks_train, \
        val_data_x1, val_data_x2, val_labels, leaks_val = create_train_dev_set(tokenizer, sentences_pair,
                                                                               is_similar, MAX_SEQUENCE_LENGTH,
                                                                               VALIDATION_SPLIT)
        callbacks = [
            keras.callbacks.TensorBoard(
                # raw string so the Windows backslashes are not treated as escape sequences
                log_dir=r'E:\workdirectory\Code Name Val Halen\DS Sup\DL\Chapter 15\logs',
                histogram_freq=1)
        ]

        self.history = model.fit([train_data_x1, train_data_x2],
                                 train_labels,
                                 validation_data=([val_data_x1,
                                                   val_data_x2], val_labels),
                                 epochs=200,
                                 batch_size=64,
                                 shuffle=True,
                                 callbacks=callbacks)
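Because Execute_Model stores the result of model.fit in self.history, the training curves can be inspected afterwards. A minimal sketch using matplotlib follows; the plot_history helper is an illustration, not part of the original class.

import matplotlib.pyplot as plt

def plot_history(history):
    # history.history holds the per-epoch metrics recorded by model.fit
    plt.plot(history.history['acc'], label='train accuracy')
    plt.plot(history.history['val_acc'], label='validation accuracy')
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.legend()
    plt.show()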
Code Example #7
# 3 negative examples
#model = load_model('./checkpoints/1560122132/lstm_50_50_0.10_0.25.h5')
# model without arguments
#model = load_model('./checkpoints/1559659364/lstm_50_50_0.17_0.25.h5')


df = pd.read_csv('../data/final_hate_dataset_test.csv', sep='\t')
hs_sentences = list(df['HS'])
cn_sentences = list(df['CN'])
cntype = list(df['CNtype'])
# remove the cn type
# cntype = ['none' for x in cntype]

top_n = 3

tokenizer, embedding_matrix = word_embed_meta_data(hs_sentences + cn_sentences,  siamese_config['EMBEDDING_DIM'])

guess = 0
for i in range(len(hs_sentences)):
    resp = cn_sentences[i]
    hs = hs_sentences[i]
    all_pairs = [(hs, cn) for cn in cn_sentences]
    test_data_x1, test_data_x2, leaks_test, cntypes = create_test_data(tokenizer, all_pairs, cntype, siamese_config['MAX_SEQUENCE_LENGTH'])

    preds = list(model.predict([test_data_x1, test_data_x2, leaks_test, cntypes], verbose=1).ravel())
    results = [(x, y, z) for (x, y), z in zip(all_pairs, preds)]
    results.sort(key=itemgetter(2), reverse=True)
    top = results[:top_n]
    responses = [x[1] for x in top]
    if resp in responses:
        # count how often the gold response appears among the top-n ranked candidates
        guess += 1
        print('Guess')
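With the counter incremented inside the loop, the result can be reported as a top-n accuracy after the loop finishes; a minimal follow-up sketch (assumed, not part of the original snippet):

top_n_accuracy = guess / len(hs_sentences)
print('top-{} accuracy: {:.3f}'.format(top_n, top_n_accuracy))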
Code Example #8
CONFIG.number_lstm_units = siamese_config['NUMBER_LSTM']
CONFIG.rate_drop_lstm = siamese_config['RATE_DROP_LSTM']
CONFIG.number_dense_units = siamese_config['NUMBER_DENSE_UNITS']
CONFIG.activation_function = siamese_config['ACTIVATION_FUNCTION']
CONFIG.rate_drop_dense = siamese_config['RATE_DROP_DENSE']
CONFIG.validation_split_ratio = siamese_config['VALIDATION_SPLIT']
CONFIG.max_nb_words = siamese_config['MAX_NB_WORDS']
CONFIG.filter_sizes = siamese_config['FILTER_SIZES']
CONFIG.num_filters = siamese_config['NUM_FILTERS']
####################################
######## Word Embedding ############
####################################

# creating word embedding meta data for word embedding
tokenizer, embedding_matrix, embedding_index = word_embed_meta_data(
    question["words"], CONFIG.embedding_dim, CONFIG.max_nb_words,
    'data/word_embed.txt')

embedding_meta_data = {
    'tokenizer': tokenizer,
    'embedding_matrix': embedding_matrix,
    'embedding_index': embedding_index
}

## creating sentence pairs
sentences_pair = [(x1, x2) for x1, x2 in zip(sentences1, sentences2)]
char_pair = [(x1, x2) for x1, x2 in zip(char1, char2)]
del sentences1
del sentences2

## get tfidf dictionary