def trainWord2VecModelTest():
    """Train a gensim word2vec model over all unlabeled medical-question
    sentences, cache it, and sanity-check the result.

    Side effects: writes the trained model to
    ``model_cache/gensim/med_qus-nolabel.vector`` under the configured root.
    """
    fileProcess.reLoadEncoding()

    # Root folder holding the unlabeled question corpus.
    trainDir = fileProcess.auto_config_root() + u'med_question_nolabel/'

    # Load every sentence to be fed into word2vec training.
    totalSentences = fileProcess.loadMedQuesSentences(trainDir)
    print('sentences num: {0}'.format(len(totalSentences)))

    # time.clock() was deprecated in Python 3.3 and removed in 3.8, and is
    # platform-inconsistent (CPU time on Unix, wall time on Windows);
    # time.time() gives the portable wall-clock duration we want to report.
    start_w2v = time.time()
    w2vModelPath = fileProcess.auto_config_root(
    ) + u'model_cache/gensim/med_qus-nolabel.vector'
    model = word2Vec.trainWord2VecModel(totalSentences, w2vModelPath)
    end_w2v = time.time()
    print('train gensim word2vec model finish, use time: {0}'.format(
        end_w2v - start_w2v))

    # NOTE(review): str.decode(), model[word] and model.vocab are Python-2 +
    # gensim<4 idioms; under Py3 / gensim>=4 this probe would need
    # model.wv[word] / model.wv.key_to_index — confirm the target environment
    # before porting.
    print('test word vector: {0}'.format(model['腰疼/v'.decode('utf-8')]))
    print('vocab size: {0}'.format(len(model.vocab)))
def testGetNpyData(lb_data=0, encode_type=1):
    """Load a previously cached experiment npz file.

    ``encode_type == 1`` selects the attention-sequence-expand encoder cache;
    any other value selects the basic encoder cache.

    Returns:
        (xy_data, input_shape) as produced by medQuesRec.loadExprimentNpzData.
    """
    if encode_type == 1:
        npzPath = fileProcess.auto_config_root(
        ) + u'exp_mid_data/npy_data/train_Att_NP{0}.npz'.format(lb_data)
    else:
        npzPath = fileProcess.auto_config_root(
        ) + u'exp_mid_data/npy_data/train_B_NP{0}.npz'.format(lb_data)

    xy_data, input_shape = medQuesRec.loadExprimentNpzData(npzPath)
    print('get the xy_data at {0}! input shape: {1}'.format(
        npzPath, input_shape))
    return xy_data, input_shape
def loadModelfromFileTest():
    """Load the cached gensim word2vec model and print the vector (and its
    size) for a sample query word."""
    # u'' prefix for consistency with every other path literal in this module
    # (avoids implicit str -> unicode promotion on Python 2).
    w2vModelPath = fileProcess.auto_config_root(
    ) + u'model_cache/gensim/med_qus-nolabel.vector'
    model = word2Vec.loadModelfromFile(w2vModelPath)
    # NOTE(review): byte-string query here vs. the utf-8-decoded query used in
    # trainWord2VecModelTest — confirm word2Vec.getWordVec normalizes encoding.
    queryWord = '腰疼/v'
    vector = word2Vec.getWordVec(model, queryWord)
    print('vector: {0}, \nvector_size: {1}'.format(vector, len(vector)))
def testLoadBasicData(lb_data=0):
    """Build the basic-encoder train/test matrices for split *lb_data*,
    persist them to an npz cache, and return them.

    Returns:
        (xy_data, input_shape) as produced by medQuesRec.loadGensimMatData.
    """
    root = fileProcess.auto_config_root()
    split_dir = root + u'exp_mid_data/train_test-' + data_scala
    trainFilePath = split_dir + u'/train{0}.txt'.format(lb_data)
    testFilePath = split_dir + u'/test{0}.txt'.format(lb_data)
    gensimW2VModelPath = root + u'model_cache/gensim/med_qus-nolabel.vector'
    npzPath = (root + u'exp_mid_data/npy_data/' + data_scala +
               u'/train_B_NP{0}.npz'.format(lb_data))

    xy_data, input_shape = medQuesRec.loadGensimMatData(
        (trainFilePath, testFilePath), gensimW2VModelPath, nb_classes=11)
    medQuesRec.storeExprimentNpzData(npzPath, xy_data)
    print('store the basic xy_data at {0}!'.format(npzPath))
    return xy_data, input_shape
def testLoadAttentionData(lb_data=0):
    """Build the attention-encoder train/test matrices for split *lb_data*,
    persist them to an npz cache, and return them.

    Returns:
        (xy_data, input_shape) as produced by
        medQuesRec.loadAttentionGensimMatData.
    """
    root = fileProcess.auto_config_root()
    split_dir = root + u'exp_mid_data/train_test-' + data_scala
    trainFilePath = split_dir + u'/train{0}.txt'.format(lb_data)
    testFilePath = split_dir + u'/test{0}.txt'.format(lb_data)
    gensimW2VModelPath = root + u'model_cache/gensim/med_qus-nolabel.vector'
    npzPath = (root + u'exp_mid_data/npy_data/' + data_scala +
               u'/train_Att_NP{0}.npz'.format(lb_data))

    # Only the training file is used to derive the MG attention indicators.
    xy_data, input_shape = medQuesRec.loadAttentionGensimMatData(
        (trainFilePath, testFilePath), gensimW2VModelPath, 11, trainFilePath)
    medQuesRec.storeExprimentNpzData(npzPath, xy_data)
    print('store the attention xy_data at {0}!'.format(npzPath))
    return xy_data, input_shape
def testTrainNetPred(xy_data, input_shape, name_net='CNNs_Net', lb_data=None):
    """Train network *name_net* on the (x, y) training pair in *xy_data* and
    return the serialized frame path plus the training metrics history.

    Returns:
        (frame_path, history_metrices)
    """
    frame_path = fileProcess.auto_config_root(
    ) + u'model_cache/keras/{0}1000-5000_{1}.json'.format(name_net, lb_data)
    x_train, y_train = xy_data[0], xy_data[1]
    model, history_metrices = medQuesRec.trainNetworkPredictor(
        x_train, y_train, input_shape,
        nb_classes=11, network=name_net, frame_path=frame_path)
    print(history_metrices)
    return frame_path, history_metrices
# -*- coding: UTF-8 -*- ''' Created on 2016年11月20日 @author: super ''' import random import time import warnings from interface import fileProcess, cacheIndex from interface.embedding import word2Vec # _totalDirPath_1 = fileProcess.auto_config_root() + u'med_question_1000each/' _totalDirPath_2_5 = fileProcess.auto_config_root() + u'med_question_2500each/' _totalDirPath_3_5 = fileProcess.auto_config_root() + u'med_question_3500each/' _totalDirPath_5 = fileProcess.auto_config_root() + u'med_question_5000each/' def prodRandomLabeledData(totalDirPath, writeFilePath_5=None): fileProcess.reLoadEncoding() # load all sentences to be trained totalSentences = fileProcess.loadMedQuesSentences(totalDirPath) med_qus_categories = cacheIndex.med_question_index.keys() # dirPath = [] # dirPath.extend(totalDirPath + category + '/' for category in med_qus_categories) start_label = time.clock()