# -*- coding: UTF-8 -*-
'''
Created on Nov 20, 2016

@author: super
'''

import random
import time
import warnings

from interface import fileProcess, cacheIndex
from interface.embedding import word2Vec

# _totalDirPath_1 = fileProcess.auto_config_root() + u'med_question_1000each/'
_totalDirPath_2_5 = fileProcess.auto_config_root() + u'med_question_2500each/'
_totalDirPath_3_5 = fileProcess.auto_config_root() + u'med_question_3500each/'
_totalDirPath_5 = fileProcess.auto_config_root() + u'med_question_5000each/'


def trainWord2VecModelTest():

    fileProcess.reLoadEncoding()

    # directory that holds all unlabeled training question files
    trainDir = fileProcess.auto_config_root() + u'med_question_nolabel/'
    #     med_qus_categories = cacheIndex.med_question_index.values()
    #     dirPath = []
    #     dirPath.extend(trainDir + category + u'/' for category in med_qus_categories)

    #     loadedFilesPath = fileProcess.listAllFilePathInDirectory(dirPath)
    #     print('files num: {0}'.format(len(loadedFilesPath)))

    # load all sentences to be trained
    totalSentences = fileProcess.loadMedQuesSentences(trainDir)
    print('sentences num: {0}'.format(len(totalSentences)))

    start_w2v = time.clock()
    w2vModelPath = fileProcess.auto_config_root() + u'model_cache/gensim/med_qus-nolabel.vector'
    model = word2Vec.trainWord2VecModel(totalSentences, w2vModelPath)
    end_w2v = time.clock()
    print('finished training the gensim word2vec model, elapsed time: {0}'.format(end_w2v - start_w2v))

    print('test word vector: {0}'.format(model['腰疼/v'.decode('utf-8')]))

    print('vocab size: {0}'.format(len(model.vocab)))
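# Example usage (sketch): run this once to train and cache the word2vec model at
# 'model_cache/gensim/med_qus-nolabel.vector'; loadModelfromFileTest and the data-loading
# tests below read the model back from that path.
#     trainWord2VecModelTest()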


def testGetNpyData(lb_data=0, encode_type=1):
    '''
    @param encode_type: when encode_type == 1, use the attention sequence expand encoder;
        otherwise use the basic encoder
    '''
    npzPath = fileProcess.auto_config_root() + u'exp_mid_data/npy_data/train_Att_NP{0}.npz'.format(lb_data)
    if encode_type != 1:
        npzPath = fileProcess.auto_config_root() + u'exp_mid_data/npy_data/train_B_NP{0}.npz'.format(lb_data)
    xy_data, input_shape = medQuesRec.loadExprimentNpzData(npzPath)
    print('loaded the xy_data from {0}! input shape: {1}'.format(npzPath, input_shape))

    return xy_data, input_shape
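
# Example usage (sketch, assuming the .npz caches already exist on disk):
#     xy_att, shape_att = testGetNpyData(lb_data=0, encode_type=1)     # attention-encoded data
#     xy_basic, shape_basic = testGetNpyData(lb_data=0, encode_type=0)  # basic-encoded data
# The caches are written by testLoadBasicData / testLoadAttentionData below
# (note those functions store them under a data_scala sub-folder).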


def loadModelfromFileTest():

    w2vModelPath = fileProcess.auto_config_root() + u'model_cache/gensim/med_qus-nolabel.vector'
    model = word2Vec.loadModelfromFile(w2vModelPath)

    queryWord = '腰疼/v'
    vector = word2Vec.getWordVec(model, queryWord)

    print('vector: {0}, \nvector_size: {1}'.format(vector, len(vector)))


def testLoadBasicData(lb_data=0):
    trainFilePath = (fileProcess.auto_config_root() + u'exp_mid_data/train_test-' +
                     data_scala + u'/train{0}.txt'.format(lb_data))
    testFilePath = (fileProcess.auto_config_root() + u'exp_mid_data/train_test-' +
                    data_scala + u'/test{0}.txt'.format(lb_data))
    gensimW2VModelPath = fileProcess.auto_config_root() + u'model_cache/gensim/med_qus-nolabel.vector'

    npzPath = (fileProcess.auto_config_root() + u'exp_mid_data/npy_data/' +
               data_scala + u'/train_B_NP{0}.npz'.format(lb_data))

    # build the basic word2vec feature matrices for this train/test split and cache them as .npz
    trainTestFileTuples = (trainFilePath, testFilePath)
    xy_data, input_shape = medQuesRec.loadGensimMatData(
        trainTestFileTuples, gensimW2VModelPath, nb_classes=11)
    medQuesRec.storeExprimentNpzData(npzPath, xy_data)
    print('stored the basic xy_data at {0}!'.format(npzPath))

    return xy_data, input_shape
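
# Example usage (sketch; data_scala is expected to be defined elsewhere in this module):
#     xy_data, input_shape = testLoadBasicData(lb_data=0)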


def testLoadAttentionData(lb_data=0):
    trainFilePath = (fileProcess.auto_config_root() + u'exp_mid_data/train_test-' +
                     data_scala + u'/train{0}.txt'.format(lb_data))
    testFilePath = (fileProcess.auto_config_root() + u'exp_mid_data/train_test-' +
                    data_scala + u'/test{0}.txt'.format(lb_data))
    gensimW2VModelPath = fileProcess.auto_config_root() + u'model_cache/gensim/med_qus-nolabel.vector'

    npzPath = (fileProcess.auto_config_root() + u'exp_mid_data/npy_data/' +
               data_scala + u'/train_Att_NP{0}.npz'.format(lb_data))

    trainTestFileTuples = (trainFilePath, testFilePath)
    # --- only use the training files to get the MG indicators --- #
    xy_data, input_shape = medQuesRec.loadAttentionGensimMatData(
        trainTestFileTuples, gensimW2VModelPath, 11, trainFilePath)

    medQuesRec.storeExprimentNpzData(npzPath, xy_data)
    print('stored the attention xy_data at {0}!'.format(npzPath))

    return xy_data, input_shape


def testTrainNetPred(xy_data, input_shape, name_net='CNNs_Net', lb_data=None):
    frame_path = (fileProcess.auto_config_root() +
                  u'model_cache/keras/{0}1000-5000_{1}.json'.format(name_net, lb_data))
    x_train = xy_data[0]
    y_train = xy_data[1]
    model, history_metrices = medQuesRec.trainNetworkPredictor(
        x_train,
        y_train,
        input_shape,
        nb_classes=11,
        network=name_net,
        frame_path=frame_path)
    print(history_metrices)

    return frame_path, history_metrices
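
# Typical experiment flow (sketch): prepare the matrices once, then train a predictor on them.
#     xy_data, input_shape = testLoadAttentionData(lb_data=0)
#     frame_path, history = testTrainNetPred(xy_data, input_shape, name_net='CNNs_Net', lb_data=0)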


def prodRandomLabeledData(totalDirPath, writeFilePath_5=None):

    fileProcess.reLoadEncoding()

    # load all sentences to be trained
    totalSentences = fileProcess.loadMedQuesSentences(totalDirPath)

    med_qus_categories = cacheIndex.med_question_index.keys()
    #     dirPath = []
    #     dirPath.extend(totalDirPath + category + '/' for category in med_qus_categories)
    start_label = time.clock()