def test_cnn_train(self):
    """Train a CNN Magpie model on DATA_DIR, save its artefacts and predict once.

    The label vocabulary is read from ``DATA_DIR + '.labels'`` (one label per
    line). After training, the word2vec model, the scaler and the Keras model
    are written under PROJECT_DIR, and the prediction for one sample sentence
    is printed.
    """
    print(PROJECT_DIR)
    print(DATA_DIR)

    # Collapse duplicate labels and sort them. Iterating a set of strings
    # follows per-process hash randomisation, so an unsorted set would hand
    # the labels to ``train`` in a different order on every run.
    # NOTE(review): presumably the saved model is later reloaded together
    # with a label list; a stable order keeps the two aligned - confirm.
    with io.open(DATA_DIR + '.labels', 'r') as f:
        labels = sorted({line.rstrip('\n') for line in f})

    # Run the model
    model = Magpie()
    model.train_word2vec(DATA_DIR, vec_dim=300)
    print("done2")
    print("done3")
    model.init_word_vectors(DATA_DIR, vec_dim=300)
    model.train(DATA_DIR, labels, nn_model='cnn', test_ratio=0.2, epochs=30)

    # Persist the three artefacts produced by training.
    path1 = PROJECT_DIR + '/here1.h5'
    path2 = PROJECT_DIR + '/embedinghere'
    path3 = PROJECT_DIR + '/scaler'
    model.save_word2vec_model(path2)
    model.save_scaler(path3, overwrite=True)
    model.save_model(path1)

    print("thuc hien test")

    # Do a simple prediction
    print(
        model.predict_from_text(
            'cho em hỏi về lịch khám của bác_sỹ đào việt_hằng và số điện_thoại'
        ))
def train_dl(save, vec_dim, epochs):
    """Train a Magpie model on the categories corpus and optionally save it.

    Args:
        save: when truthy, the word2vec model, the scaler and the Keras model
            are written below the project's ``data/save`` directory.
        vec_dim: passed to ``Magpie.init_word_vectors`` as ``vec_dim``.
        epochs: passed to ``Magpie.train`` as ``epochs``.

    Returns:
        The trained ``Magpie`` instance.

    Note:
        The label file is opened through the relative path
        ``data/categories.labels``, so it is resolved against the current
        working directory, unlike the corpus directory which is absolute.
    """
    corpus_dir = '/Users/sunxuan/Documents/PycharmProjects/ImpactPool/data/categories'

    model = Magpie()
    model.init_word_vectors(corpus_dir, vec_dim=vec_dim)

    # One label per line; surrounding whitespace is stripped.
    with open('data/categories.labels') as label_file:
        label_names = [line.strip() for line in label_file]

    model.train(corpus_dir, label_names, test_ratio=0.0, epochs=epochs)

    if not save:
        return model

    # Persist the embeddings, the scaler and the network weights.
    model.save_word2vec_model(
        '/Users/sunxuan/Documents/PycharmProjects/ImpactPool/data/save/embeddings/here'
    )
    model.save_scaler(
        '/Users/sunxuan/Documents/PycharmProjects/ImpactPool/data/save/scaler/here',
        overwrite=True)
    model.save_model(
        '/Users/sunxuan/Documents/PycharmProjects/ImpactPool/data/save/model/here.h5'
    )
    return model
def test_cnn_train(self):
    """Train a small CNN model and sanity-check one prediction.

    Checks that ``train`` returns something, that ``predict_from_text``
    yields exactly one entry per label, and that every entry pairs a label
    from the label file with a score between 0 and 1 inclusive.
    """
    # Load the label vocabulary, one label per line.
    label_path = DATA_DIR + '.labels'
    with io.open(label_path, 'r') as label_file:
        known_labels = set(line.rstrip('\n') for line in label_file)

    # Build and train the model.
    model = Magpie()
    model.init_word_vectors(DATA_DIR, vec_dim=100)
    history = model.train(
        DATA_DIR,
        known_labels,
        nn_model='cnn',
        test_ratio=0.3,
        epochs=3,
    )
    assert history is not None

    # Predict on a short sample sentence.
    predictions = model.predict_from_text("Black holes are cool!")
    assert len(predictions) == len(known_labels)

    # Every prediction must be a (known label, score in [0, 1]) pair.
    for label, score in predictions:
        assert label in known_labels
        assert 0 <= score <= 1
'1142', '1143', '1144', '1151', '1152', '1153', '1154', '1211', '1212', '1213', '1214', '1215', '1216', '1217', '1218', '1219', '1221', '1222', '1223', '1231', '1232', '1233', '1234', '1235', '1241', '1242', '1243', '1251', '1311', '1312', '1313', '1314', '1321', '1322', '1323', '1331', '1332', '1333', '1334', '1341', '1342', '1343', '1344', '1345', '1351', '1411', '1421', '1431', '1441', '15', '2111', '2112', '2113', '2114', '2115', '2116', '2117', '2121', '2122', '2123', '2124', '2131', '2132', '2133', '2134', '2141', '2142', '2143', '2144', '2145', '2146', '2147', '2148', '2149', '21410', '2151', '2152', '2153', '2154', '2155', '2156', '2161', '2162', '2163', '2164', '2165', '2166', '2167', '2168', '2171', '2172', '2173', '2174', '2175', '2176', '2177', '2178', '2179', '21710', '21711', '2181', '2182', '2183', '2184', '2185', '2186', '2187', '2188', '2191', '2192', '2193', '2194', '2195', '2196', '221', '222', '223', '224', '2311', '2312', '2313', '2314', '2315', '2316', '2321', '2322', '2323', '2324', '24', '31', '32', '33', '34', '41', '42', '43', '51', '52', '53', '54', '55', '56', '57', '58', '61', '7111', '7112', '7113', '7114', '7115', '7116', '7117', '7118', '7119', '71110', '71111', '7121', '7122', '7123', '7124', '7125', '7126', '7127', '7128', '7129', '7131', '7132', '7133', '7134', '7135', '7136', '7137', '7138', '7139', '71310', '71311', '71312', '7141', '7142', '7151', '721', '722', '723', '724', '7311', '7312', '7313', '7314', '7315', '7316', '7321', '7322', '7323', '7324', '7325', '7326', '7331', '7332', '7333', '7334', '7335', '7336', '734', '74' ] magpie.train('data/hep-categories', labels, test_ratio=0.2, epochs=20) #训练,20%数据作为测试数据,20轮 根据给定数据训练模型 ''' #保存训练后的模型文件 magpie.save_word2vec_model('save/embeddings/best', overwrite=True) magpie.save_scaler('save/scaler/best', overwrite=True) magpie.save_model('save/model/best.h5') '''
for WORD2VEC_CONTEXT in [4, 5, 6, 7, 8]: if os.path.exists('log/' + train_dir[-3:] + '_' + str(EMBEDDING_SIZE) + '_' + str(MIN_WORD_COUNT) + '_' + str(WORD2VEC_CONTEXT) + '.txt'): continue magpie.train_word2vec(train_dir, vec_dim=EMBEDDING_SIZE, MWC=MIN_WORD_COUNT, w2vc=WORD2VEC_CONTEXT) magpie.fit_scaler('C:\\magpie-master\\data\\hep-categories') magpie.train( 'C:\\magpie-master\\data\\hep-categories', labels, callbacks=[lossHistory], test_ratio=0.1, epochs=20, logdir='C:\\magpie-master\\log\\' + train_dir[-3:] + '_' + str(EMBEDDING_SIZE) + '_' + str(MIN_WORD_COUNT) + '_' + str(WORD2VEC_CONTEXT) + '.txt') # 训练,10%数据作为测试数据,20轮 lossHistory.loss_plot( 'epoch', 'C:\\magpie-master\\pic\\' + train_dir[-3:] + '_' + str(EMBEDDING_SIZE) + '_' + str(MIN_WORD_COUNT) + '_' + str(WORD2VEC_CONTEXT) + '.jpg') ''' magpie.save_word2vec_model( 'C:\\magpie-master\\save\\embeddings\\' + train_dir[-3:] + '_' + str(EMBEDDING_SIZE) + '_' + str( MIN_WORD_COUNT) + '_' + str(WORD2VEC_CONTEXT)) magpie.save_scaler( 'C:\\magpie-master\\save\\scaler\\' + train_dir[-3:] + '_' + str(EMBEDDING_SIZE) + '_' + str( MIN_WORD_COUNT) + '_' + str(WORD2VEC_CONTEXT))
file.write(label) print("Data generation finished.") address = "/home/ubuntu/toxic/magpie_data" #data_prep("/Users/wangergou/Downloads/kaggle/Toxic_Comment_Classification/Magpie/data/") data_prep(address) magpie = Magpie() print("Loading word vector... \n") magpie.train_word2vec(address, vec_dim=100) print("Initializing data... \n") magpie.init_word_vectors(address, vec_dim=100) labels = [ 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate' ] print("Training starts... \n") magpie.train(address, labels, test_ratio=0.2, epochs=30) magpie.save_model('/home/ubuntu/toxic/magpie_model.h5')
def train_magpie(labels):
    """Create a Magpie model and train it on the corpus at WRITE_SK_CAT_PATH.

    Args:
        labels: label vocabulary handed to ``Magpie.train``.

    Returns:
        The trained ``Magpie`` instance.
    """
    corpus_dir = WRITE_SK_CAT_PATH

    model = Magpie()
    # Word vectors are built from the same corpus that is used for training.
    model.init_word_vectors(corpus_dir, vec_dim=VEC_DIM)
    # 20% of the corpus is held out as test data.
    model.train(corpus_dir, labels, test_ratio=0.2, epochs=EPOCHS)
    return model
labels = getlabel('/home/ydm/ren/remote/multiLabel/data/labels.txt') # magpie = Magpie( # keras_model='/home/ydm/ren/remote/multiLabel/data/here.h5', # word2vec_model='/home/ydm/ren/remote/multiLabel/data/word2vec_mode', # scaler='/home/ydm/ren/remote/multiLabel/data/scaler', # labels=labels # ) magpie = Magpie() magpie.init_word_vectors( '/home/ydm/ren/remote/multiLabel/data/hep-categories', vec_dim=100) print(len(labels)) magpie.train('/home/ydm/ren/remote/multiLabel/data/hep-categories', labels, epochs=30, batch_size=128) magpie.save_word2vec_model( '/home/ydm/ren/remote/multiLabel/data/word2vec_mode_place') magpie.save_scaler('/home/ydm/ren/remote/multiLabel/data/scaler_place', overwrite=True) magpie.save_model('/home/ydm/ren/remote/multiLabel/data/model_place.h5') alltest = getlabel( '/home/ydm/ren/remote/multiLabel/data/allsents_test.txt') # alltest = [alltest] writes = open('/home/ydm/ren/remote/multiLabel/data/result_place.txt', 'w', encoding='utf-8') for sent in alltest:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Train a Magpie model on the emotion-categories corpus and save it.

@Author: njuselhx
@Time: 2021/1/21 7:01 PM
@File: train.py
@Software: PyCharm
"""
from magpie import Magpie

# NOTE: an earlier, now disabled, variant of this script trained a
# three-label model (military / travel / politics) on
# 'data/hep-categories-zh' for 100 epochs, saved it under 'save/*_zh' and
# printed a sample prediction. Only the emotion model below is active.

# Corpus directory used for both the word vectors and the training run.
EMOTION_DATA_DIR = 'data/emotion-categories'

# The nine emotion categories the classifier is trained on.
labels = ['满意', '喜悦', '乐观', '愤怒', '悲哀', '恐惧', '厌恶', '焦虑', '怀疑']

magpie = Magpie()
magpie.init_word_vectors(EMOTION_DATA_DIR, vec_dim=100)

# 20% of the corpus is held out as test data.
magpie.train(EMOTION_DATA_DIR, labels, test_ratio=0.2, epochs=2333)

# Persist the network, the embeddings and the scaler.
magpie.save_model('save/emotion_keras_model.h5')
magpie.save_word2vec_model('save/emotion_word2vec_model', overwrite=True)
magpie.save_scaler('save/emotion_scaler', overwrite=True)
"""Train Magpie on the HEP categories corpus and print one sample prediction."""
from magpie import Magpie

# Corpus directory used for both the word vectors and the training run.
HEP_DATA_DIR = 'data/hep-categories'

# Categories the classifier is trained to assign.
labels = [
    "Astrophysics",
    "Experiment-HEP",
    "Gravitation and Cosmology",
    "Phenomenology-HEP",
    "Theory-HEP",
]

magpie = Magpie()
magpie.init_word_vectors(HEP_DATA_DIR, vec_dim=100)

# 20% of the corpus is held out as test data.
magpie.train(HEP_DATA_DIR, labels, test_ratio=0.2, epochs=30)

prediction = magpie.predict_from_text('Stephen Hawking studies black holes')
print(prediction)
import os
import sys

# Extend the import path with the working directory and its parent before
# importing magpie (presumably so the package resolves when the script is
# started from a sub-folder of the checkout - confirm).
sys.path.append(os.path.realpath(os.getcwd()))
sys.path.append("..")
from magpie import Magpie

# Corpus directory shared by every training step below.
DATA_DIR = '../data/hep-categories'
VEC_DIM = 3

magpie = Magpie()

# Train a word2vec model.
magpie.train_word2vec(DATA_DIR, vec_dim=VEC_DIM)
# Fit the scaler.
magpie.fit_scaler(DATA_DIR)
# Initialise the word vectors.
magpie.init_word_vectors(DATA_DIR, vec_dim=VEC_DIM)

# All categories the classifier can assign.
labels = ['军事', '旅游', '政治']

# Train for 20 epochs, holding out 20% of the data as test data.
magpie.train(DATA_DIR, labels, test_ratio=0.2, epochs=20)

# Save the trained model files.
magpie.save_word2vec_model('../workspace/embeddings', overwrite=True)
magpie.save_scaler('../workspace/scaler', overwrite=True)
magpie.save_model('../workspace/model.h5')
def _remove_files_in(folder):
    """Delete every regular file directly inside *folder*, keeping sub-directories."""
    for entry in os.listdir(folder):
        entry_path = os.path.join(folder, entry)
        try:
            if os.path.isfile(entry_path):
                os.unlink(entry_path)
        except OSError as e:
            # Best effort: report the failure and carry on with the rest.
            print(e)


def Deep_learning(df, x_test, target):
    """Train a Magpie model on *df* and predict labels for *x_test*.

    The function rebuilds an on-disk corpus under the project's
    ``test_data`` directory: one ``<index>.txt`` file holding the ASCII-only
    text of column *target* and one ``<index>.lab`` file holding that row's
    labels, plus a ``categories.labels`` file listing every distinct label.
    A fresh model is trained on that corpus for 20 epochs and then applied
    to each text in *x_test*.

    Args:
        df: pandas DataFrame with a text column named *target* and a
            ``group_id`` column whose values are Python-literal strings that
            evaluate to an iterable of labels.
        x_test: texts to classify; passed through ``np.atleast_2d``.
        target: name of the text column, used for both *df* and the frame
            built from *x_test*.

    Returns:
        dict mapping each preprocessed test text to the list of labels whose
        predicted score is at least 0.25.
    """
    base_dir = '/Users/sunxuan/Documents/PycharmProjects/ImpactPool/test_data/'
    corpus_dir = base_dir + 'categories'
    labels_path = corpus_dir + '.labels'

    # Start from a clean slate: drop stale documents and the old label file.
    _remove_files_in(base_dir)
    _remove_files_in(corpus_dir + '/')

    lab_list = []
    for i in df.index:
        # NOTE(review): with a default integer index this guard never fires;
        # it is kept unchanged from the original - confirm whether it is needed.
        if i > len(df):
            break
        file_name = corpus_dir + '/' + str(i) + '.txt'
        lab_name = corpus_dir + '/' + str(i) + '.lab'

        # Non-ASCII characters are dropped from the text.
        title_data = df.at[i, target].encode('ascii', 'ignore').decode('ascii')
        with open(file_name, 'w') as the_file:
            the_file.write(title_data)

        # NOTE(review): eval() executes arbitrary code. If 'group_id' can come
        # from an untrusted source, switch to ast.literal_eval.
        row_labels = list(eval(df.at[i, 'group_id']))
        lab_list.extend(row_labels)
        # As before, rows without labels get no .lab file. The file is opened
        # once per row in write mode, so a stale file that survived the
        # cleanup is overwritten instead of appended to.
        if row_labels:
            with open(lab_name, 'w') as the_file:
                the_file.writelines(str(j) + '\n' for j in row_labels)

    # Write every distinct label once, with a single open of the label file.
    lab_set = list(set(lab_list))
    with open(labels_path, 'w') as the_file:
        the_file.writelines(str(label) + '\n' for label in lab_set)

    magpie = Magpie()
    magpie.init_word_vectors(corpus_dir, vec_dim=100)

    # Read the labels back from the file that was just written. The original
    # opened the CWD-relative 'test_data/categories.labels', which is only
    # the same file when the process runs from the project root.
    with open(labels_path) as f:
        labels = [line.strip() for line in f]
    magpie.train(corpus_dir, labels, test_ratio=0.0, epochs=20)

    results_dl = {}
    # The column is named after *target* because the loop below reads it via
    # `target`; the former hard-coded 'title' raised KeyError for any other
    # target name.
    # NOTE(review): np.atleast_2d turns a 1-D sequence of n items into a
    # single row of n columns, so this only matches one column when x_test
    # is a scalar or already shaped (n, 1) - confirm with callers.
    df_test = pd.DataFrame(np.atleast_2d(x_test), columns=[target])
    for i in df_test.index:
        title_data = df_test.at[i, target].encode('ascii', 'ignore').decode('ascii')
        title_data = preprocess(title_data)
        df_test.at[i, target] = title_data
        # Keep only labels whose predicted score reaches the 0.25 threshold.
        pre_label = [
            s[0] for s in magpie.predict_from_text(title_data) if s[1] >= 0.25
        ]
        results_dl[title_data] = pre_label
    return results_dl
train process """ magpie = Magpie() # magpie.train_word2vec('/Users/sunxuan/Documents/PycharmProjects/ImpactPool/data/categories', vec_dim=100) # magpie.fit_scaler('/Users/sunxuan/Documents/PycharmProjects/ImpactPool/data/categories') magpie.init_word_vectors( '/Users/sunxuan/Documents/PycharmProjects/ImpactPool/data/categories', vec_dim=100) with open('data/categories.labels') as f: labels = f.readlines() labels = [x.strip() for x in labels] magpie.train( '/Users/sunxuan/Documents/PycharmProjects/ImpactPool/data/categories', labels, test_ratio=0.0, epochs=30) # """ # Save model # """ # # magpie.save_word2vec_model('/Users/sunxuan/Documents/PycharmProjects/ImpactPool/data/save/embeddings/here') # magpie.save_scaler('/Users/sunxuan/Documents/PycharmProjects/ImpactPool/data/save/scaler/here', overwrite=True) # magpie.save_model('/Users/sunxuan/Documents/PycharmProjects/ImpactPool/data/save/model/here.h5') """ Reinitialize """ # with open('/Users/sunxuan/Documents/PycharmProjects/ImpactPool/data/categories.labels') as f:
Success = 'Success:' error = 'error:' magpie = Magpie() lossHistory = LossHistory() for EMBEDDING_SIZE in [250, 500]: for MIN_WORD_COUNT in [5, 10]: for WORD2VEC_CONTEXT in [5, 10]: magpie.train_word2vec(train_dir, vec_dim=EMBEDDING_SIZE, MWC=MIN_WORD_COUNT, w2vc=WORD2VEC_CONTEXT) magpie.fit_scaler('C:\\magpie-master\\data\\hep-categories') magpie.train('C:\\magpie-master\\data\\hep-categories', labels, callbacks=[lossHistory], test_ratio=0.1, epochs=20) # 训练,20%数据作为测试数据,20轮 lossHistory.loss_plot( 'epoch', 'C:\\magpie-master\\' + train_dir[-3:] + '_' + str(EMBEDDING_SIZE) + '_' + str(MIN_WORD_COUNT) + '_' + str(WORD2VEC_CONTEXT) + '.jpg') magpie.save_word2vec_model( 'C:\\magpie-master\\save\\embeddings\\' + train_dir[-3:] + '_' + str(EMBEDDING_SIZE) + '_' + str(MIN_WORD_COUNT) + '_' + str(WORD2VEC_CONTEXT)) magpie.save_scaler('C:\\magpie-master\\save\\scaler\\' + train_dir[-3:] + '_' + str(EMBEDDING_SIZE) + '_' + str(MIN_WORD_COUNT) + '_' + str(WORD2VEC_CONTEXT)) magpie.save_model('C:\\magpie-master\\save\\model\\' +
labels4 = sys.argv[9] labels = [ labels1, labels2, labels3, labels4 ] #print (labels) dirName = 'D:\\xampp\\htdocs\\mtlbl\\webpage\\admin\\models\\' + model_name os.mkdir(dirName) model_path = dirName + '\\' + model_name scaler_path = dirName + '\\scaler_' + model_name keras_path = dirName + '\\keras_'+ model_name + '.h5' #print (model_path) #print (keras_path) from magpie import Magpie magpie = Magpie() magpie.init_word_vectors(data, vec_dim=vec_num) magpie.train(data, labels, test_ratio= test_rat, epochs = ep) #more epoch = more understanding of vector and lower lose rate #magpie.predict_from_text('ECB to reveal bad loan hurdles for euro zone bank test') #test magpie.save_word2vec_model(model_path) magpie.save_scaler(scaler_path, overwrite=True) magpie.save_model(keras_path)
min_lr=0) ''' #调参 for optimizer in ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']: for BATCH_SIZE in [16, 32, 64, 128, 256]: print(optimizer+str(BATCH_SIZE)) magpie.train('data/hep-categories', labels, batch_size=BATCH_SIZE, callbacks=[checkpoint, reduceLROnPlateau], test_ratio=0.1, epochs=60, verbose=1, optimizer=optimizer, logdir='C:\\magpie-master\\trainlog\\' + optimizer + '_' + str(BATCH_SIZE) + '.txt' ) ''' #形成最终模型 magpie.train( 'data/hep-categories', labels, batch_size=16, callbacks=[checkpoint, reduceLROnPlateau], test_ratio=0.0, epochs=60, verbose=1, optimizer='Adam', ) magpie.save_word2vec_model('save/embeddings/best', overwrite=True) magpie.save_scaler('save/scaler/best', overwrite=True) magpie.save_model('save/model/best.h5')