# dependencies used throughout this module; fileProcess and cacheIndex are
# project-local helpers
import multiprocessing
import random
import time
import warnings

from gensim.models import Word2Vec

import cacheIndex
import fileProcess


def getWordVec(model, queryWord):
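    '''Return the embedding vector for queryWord (a UTF-8 byte string) from a trained gensim model.'''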
    # reload the default encoding for safety
    fileProcess.reLoadEncoding()
    
    vector = model[queryWord.decode('utf-8')]
    return vector


def trainWord2VecModelTest():
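    '''Train a word2vec model over all unlabeled medical question sentences and print sanity checks.'''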

    fileProcess.reLoadEncoding()

    # resolve the folder holding all unlabeled training files
    trainDir = fileProcess.auto_config_root() + u'med_question_nolabel/'

    # load all sentences to be trained
    totalSentences = fileProcess.loadMedQuesSentences(trainDir)
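    # totalSentences is assumed to be a list of token lists, one per question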
    print('sentences num: {0}'.format(len(totalSentences)))

    start_w2v = time.time()
    w2vModelPath = fileProcess.auto_config_root() + u'model_cache/gensim/med_qus-nolabel.vector'
    model = trainWord2VecModel(totalSentences, w2vModelPath)
    end_w2v = time.time()
    print('train gensim word2vec model finished, took {0}s'.format(end_w2v - start_w2v))

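    # probe the model with a sample segmented token ('腰疼/v' ~ "lower back pain")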
    print('test word vector: {0}'.format(model['腰疼/v'.decode('utf-8')]))

    print('vocab size: {0}'.format(len(model.vocab)))


def queryMostSimWords(model, wordStr, topN=20):
    '''
    Query the topN most similar words to wordStr.
    Returns a list of (word, similarity) pairs, ordered by descending similarity.
    '''
    fileProcess.reLoadEncoding()
        
    similarPairList = model.most_similar(wordStr.decode('utf-8'), topn=topN)
    return similarPairList
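

# Example usage (sketch; w2vModelPath as produced by trainWord2VecModel below):
#     model = Word2Vec.load(w2vModelPath)
#     for word, sim in queryMostSimWords(model, '腰疼/v', topN=10):
#         print(word, sim)
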
def splitTrainTestData(totalDataPath, trainTestDirPath, split_rate=10):
    '''
    Split a labeled-sentence file into split_rate cross-validation folds.

    @param totalDataPath: path of the file holding all labeled sentence lines
    @param trainTestDirPath: directory where the trainN.txt / testN.txt files are written
    @param split_rate: positive integer, the number of folds; must evenly divide the line count (default 10)
    '''

    fileProcess.reLoadEncoding()

    fr = open(totalDataPath, 'r')
    totalLines = fr.readlines()
    fr.close()

    start_split = time.time()
    nb_lines = len(totalLines)
    if nb_lines % split_rate != 0:
        warnings.warn('split_rate must evenly divide the number of lines!')
        return None

    span = nb_lines // split_rate  # lines per fold
    splitTrainTestTuples = []
    scan_p = 0
    for i in range(split_rate):
        print('fold {0}: train = [0, {1}) + [{2}, {3}), test = [{1}, {2})'.format(
            i, scan_p, scan_p + span, nb_lines))
        # train = everything outside the current fold; test = the current fold
        splitTrainTestTuples.append(
            (totalLines[0:scan_p] + totalLines[scan_p + span:nb_lines],
             totalLines[scan_p:scan_p + span]))
        scan_p += span
    end_split = time.time()
    print('finish splitting train and test data in {0}s'.format(end_split -
                                                                start_split))

    start_write = time.time()
    for i, (trainLines, testLines) in enumerate(splitTrainTestTuples):
        trainPath = trainTestDirPath + u'train{0}.txt'.format(i)
        testPath = trainTestDirPath + u'test{0}.txt'.format(i)

        fw_train = open(trainPath, 'w')
        fw_train.write(''.join(trainLines))
        fw_train.close()
        fw_test = open(testPath, 'w')
        fw_test.write(''.join(testLines))
        fw_test.close()
    end_write = time.time()
    print('finish writing split train/test data files in {0}s'.format(
        end_write - start_write))

    return splitTrainTestTuples


def trainWord2VecModel(sentences, modelPath,
                       Size=100,
                       Window=5,
                       MinCount=1,
                       Workers=multiprocessing.cpu_count()):
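    '''
    Train a gensim word2vec model over sentences (a list of token lists),
    save it to modelPath, and return the trained model.
    '''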
    # reload the default encoding for safety
    fileProcess.reLoadEncoding()
    
    # train word2vec model
    model = Word2Vec(sentences, 
                     size=Size, window=Window, min_count=MinCount, workers=Workers)
    #===========================================================================
    # save the word2vec model to disk, then precompute the L2-normalized
    # vectors (init_sims) used by similarity queries
    #===========================================================================
    model.save(modelPath)
    model.init_sims(replace=False)
    print('producing word2vec model ... ok! model stored in {0}'.format(modelPath))

    return model


def prodRandomLabeledData(totalDirPath, writeFilePath_5=None):
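    '''
    Attach a category label (from cacheIndex.med_question_index) to every
    sentence under totalDirPath, shuffle the result, and optionally write
    one labeled sentence per line to writeFilePath_5.
    '''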

    fileProcess.reLoadEncoding()

    # load all sentences to be trained
    totalSentences = fileProcess.loadMedQuesSentences(totalDirPath)

    med_qus_categories = cacheIndex.med_question_index.keys()
    start_label = time.time()
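    # build one class label per source file, relying on the same directory
    # traversal order that loadMedQuesSentences used above (assumption)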
    classes = []
    for category in med_qus_categories:
        cateDirPath = totalDirPath + category + '/'
        cateFilesPath = fileProcess.listAllFilePathInDirectory(cateDirPath)
        for i in range(len(cateFilesPath)):
            classes.append(cacheIndex.med_question_index[category])

    totalSentences_labeled = []
    for i in range(len(totalSentences)):
        words_str = '[' + ','.join(totalSentences[i]) + ']'
        sentence_labeled = words_str + str(classes[i])
        totalSentences_labeled.append(sentence_labeled)
    end_label = time.time()
    print('finish labeling in {0}s'.format(end_label - start_label))

    start_random = time.time()
    random.shuffle(totalSentences_labeled)
    end_random = time.time()
    print('finish shuffling data in {0}s'.format(end_random - start_random))

    if writeFilePath_5 is not None:
        fw = open(writeFilePath_5, 'w')
        fw.write('\n'.join(totalSentences_labeled))
        fw.close()

    return totalSentences_labeled
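

# Minimal driver sketch: the 'med_question/' and 'data_cache/' folder names
# below are illustrative assumptions, not paths confirmed by this module.
if __name__ == '__main__':
    root = fileProcess.auto_config_root()
    totalLabeledPath = root + u'data_cache/total_labeled.txt'
    prodRandomLabeledData(root + u'med_question/', totalLabeledPath)
    splitTrainTestData(totalLabeledPath, root + u'data_cache/', split_rate=10)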