def getWordVec(model, queryWord):
    """Look up the embedding vector for *queryWord* in a trained model.

    @param model: trained word2vec model supporting ``__getitem__``
    @param queryWord: UTF-8 encoded word (Python 2 byte string)
    @return: the word's embedding vector
    """
    # reload for safe
    fileProcess.reLoadEncoding()
    return model[queryWord.decode('utf-8')]
def trainWord2VecModelTest():
    """Smoke test: train a word2vec model over the unlabeled medical-question
    corpus and print basic diagnostics (sentence count, a sample vector,
    vocabulary size).

    Side effects: trains and saves a model under model_cache/gensim/.
    """
    fileProcess.reLoadEncoding()
    # load all file folder path
    trainDir = fileProcess.auto_config_root() + u'med_question_nolabel/'
    # load all sentences to be trained
    totalSentences = fileProcess.loadMedQuesSentences(trainDir)
    print('sentences num: {0}'.format(len(totalSentences)))
    # time.clock() was deprecated and removed in Python 3.8, and measured CPU
    # time on some platforms; time.time() gives the intended wall-clock span
    start_w2v = time.time()
    w2vModelPath = fileProcess.auto_config_root(
    ) + u'model_cache/gensim/med_qus-nolabel.vector'
    model = word2Vec.trainWord2VecModel(totalSentences, w2vModelPath)
    end_w2v = time.time()
    print(
        'train gensim word2vec model finish, use time: {0}'.format(end_w2v - start_w2v))
    print('test word vector: {0}'.format(model['腰疼/v'.decode('utf-8')]))
    print('vocab size: {0}'.format(len(model.vocab)))
def queryMostSimWords(model, wordStr, topN=20):
    '''
    MSimilar words basic query function
    return 2-dim List [0] is word [1] is double-prob
    '''
    fileProcess.reLoadEncoding()
    decodedWord = wordStr.decode('utf-8')
    similarPairList = model.most_similar(decodedWord, topn=topN)
    return similarPairList
def splitTrainTestData(totalDataPath, trainTestDirPath, split_rate=10):
    '''
    Produce k-fold train/test splits of a labeled-sentence file.

    @param totalDataPath: string of data path which has all sentence line with labels
    @param trainTestDirPath: directory path where train{i}.txt / test{i}.txt are written
    @param split_rate: positive integer which means number of split pieces, default is 10
    @return: list of (trainLines, testLines) tuples, or None when the line
             count is not divisible by split_rate
    '''
    fileProcess.reLoadEncoding()
    # 'with' guarantees the handle is closed even if readlines() raises
    with open(totalDataPath, 'r') as fr:
        totalLines = fr.readlines()

    # time.clock() was removed in Python 3.8; time.time() is the intended wall clock
    start_split = time.time()
    nb_lines = len(totalLines)
    if nb_lines % split_rate != 0:
        warnings.warn('split_rate must can divide number of lines!')
        return None
    # floor division: plain '/' yields a float under Python 3 and breaks slicing
    span = nb_lines // split_rate
    splitTrainTestTuples = []
    scan_p = 0
    for i in range(split_rate):
        # fold i: test = [scan_p, scan_p+span), train = everything else
        print('split from ' + str(0) + ' to ' + str(scan_p) + ', ' +
              str(scan_p + span) + ' to ' + str(nb_lines))
        trainPart = totalLines[0:scan_p] + totalLines[scan_p + span:nb_lines]
        testPart = totalLines[scan_p:scan_p + span]
        splitTrainTestTuples.append((trainPart, testPart))
        scan_p += span
    end_split = time.time()
    print('finish split train and test data in {0}s'.format(end_split -
                                                            start_split))

    start_write = time.time()
    for i in range(len(splitTrainTestTuples)):
        trainPath = trainTestDirPath + u'train{0}.txt'.format(i)
        testPath = trainTestDirPath + u'test{0}.txt'.format(i)
        # context managers close the files even when a write fails mid-way
        with open(trainPath, 'w') as fw_train:
            fw_train.write(''.join(splitTrainTestTuples[i][0]))
        with open(testPath, 'w') as fw_test:
            fw_test.write(''.join(splitTrainTestTuples[i][1]))
    end_write = time.time()
    print('finish produce split train test data file in {0}s'.format(
        end_write - start_write))

    return splitTrainTestTuples
def trainWord2VecModel(sentences, modelPath, Size=100, Window=5, MinCount=1,
                       Workers=multiprocessing.cpu_count()):
    """Train a gensim word2vec model over *sentences* and persist it.

    @param sentences: iterable of tokenized sentences to train on
    @param modelPath: path where the trained model is saved
    @param Size: embedding dimensionality
    @param Window: context window size
    @param MinCount: minimum word frequency to keep
    @param Workers: number of training worker threads
    @return: the trained model
    """
    # reload for safe
    fileProcess.reLoadEncoding()
    # train word2vec model
    model = Word2Vec(
        sentences,
        size=Size,
        window=Window,
        min_count=MinCount,
        workers=Workers)
    # save the model to disk first, then precompute the normalized vectors
    # used by similarity queries (keeping the raw vectors: replace=False)
    model.save(modelPath)
    model.init_sims(replace=False)
    print('producing word2vec model ... ok! model store in {0}'.format(modelPath))
    return model
def prodRandomLabeledData(totalDirPath, writeFilePath_5=None):
    """Attach a category label to every sentence, shuffle the result, and
    optionally write one labeled sentence per line to a file.

    @param totalDirPath: root directory holding one sub-directory per category
    @param writeFilePath_5: optional output path; when given, the shuffled
           labeled lines are written there joined by newlines
    @return: shuffled list of '[w1,w2,...]<label>' strings
    """
    fileProcess.reLoadEncoding()
    # load all sentences to be trained
    totalSentences = fileProcess.loadMedQuesSentences(totalDirPath)
    med_qus_categories = cacheIndex.med_question_index.keys()

    # time.clock() was removed in Python 3.8; time.time() is the intended wall clock
    start_label = time.time()
    classes = []
    for category in med_qus_categories:
        cateDirPath = totalDirPath + category + '/'
        cateFilesPath = fileProcess.listAllFilePathInDirectory(cateDirPath)
        # one label per file in this category
        # NOTE(review): assumes loadMedQuesSentences walks categories in the
        # same order as med_question_index.keys() — verify against fileProcess
        classes.extend(
            [cacheIndex.med_question_index[category]] * len(cateFilesPath))
    totalSentences_labeled = []
    for sentence, label in zip(totalSentences, classes):
        words_str = '[' + ','.join(sentence) + ']'
        totalSentences_labeled.append(words_str + str(label))
    end_label = time.time()
    print('finish give labels in {0}s'.format(end_label - start_label))

    start_random = time.time()
    random.shuffle(totalSentences_labeled)
    end_random = time.time()
    print('finish random data in {0}s'.format(end_random - start_random))

    # identity check is the correct test against the None sentinel
    if writeFilePath_5 is not None:
        # context manager closes the handle even if the write raises
        with open(writeFilePath_5, 'w') as fw:
            fw.write('\n'.join(totalSentences_labeled))

    return totalSentences_labeled