def getFeature(sentence, f_index):
    """Build a 0/1 feature vector for ``sentence``.

    Keywords are extracted with ``WordSeg.getKeyWords(sentence,
    kw_each_predict)``. The returned list has
    ``kw_each_label * num_of_labels`` entries, all initialised to 0; for
    every extracted keyword that is a key of ``f_index``, the entry at
    position ``f_index[keyword]`` is set to 1.

    Args:
        sentence: Text passed straight through to ``WordSeg.getKeyWords``.
        f_index: Mapping from keyword to its position in the feature vector.

    Returns:
        A list of ints (0 or 1) of length ``kw_each_label * num_of_labels``.
    """
    kw = WordSeg.getKeyWords(sentence, kw_each_predict)
    f_vector = [0] * kw_each_label * num_of_labels  # start from an all-zero vector

    # Mark every keyword of this text that belongs to the feature space.
    # The ``in`` operator is used instead of ``dict.has_key`` because
    # ``has_key`` does not exist on Python 3 dictionaries.
    for word in kw:
        if word in f_index:
            f_vector[f_index[word]] = 1
    return f_vector
def getFeatureSpace(train_data_dir, labels, sample_num=100):
    """Collect the keyword feature index and the training matrix.

    For each entry of ``labels`` this calls
    ``getAllText(train_data_dir, label, sample_num)``, extracts keywords
    from the first returned value with
    ``WordSeg.getKeyWords(..., kw_each_label)`` and remembers the second
    returned value. The keywords of all labels are concatenated in label
    order and mapped to their position in that concatenation (if a
    keyword occurs more than once, its last position wins).

    Returns:
        A tuple ``(f_index, y, X)`` where ``f_index`` is the
        keyword -> position dict and ``y, X`` are whatever
        ``getTrainMatrix`` returns for the collected texts and ``f_index``.
    """
    all_keywords = []      # keywords of every label, in label order
    per_label_texts = []   # one entry per label, as returned by getAllText

    for label in labels:
        merged_text, label_texts = getAllText(train_data_dir, label, sample_num)
        all_keywords.extend(WordSeg.getKeyWords(merged_text, kw_each_label))
        per_label_texts.append(label_texts)

    # Map each keyword to its index in the concatenated keyword list.
    f_index = {}
    for position, keyword in enumerate(all_keywords):
        f_index[keyword] = position

    y, X = getTrainMatrix(per_label_texts, f_index)
    return f_index, y, X
def getFeatureSpace(train_data_dir, labels, sample_num=100):
    """Build the feature index and training data for the given labels.

    Each label contributes the keywords that
    ``WordSeg.getKeyWords(alltext, kw_each_label)`` extracts from the
    first value returned by ``getAllText(train_data_dir, label,
    sample_num)``; the second returned value is kept as that label's
    training texts. ``f_index`` maps every keyword to its position in
    the combined keyword list (a repeated keyword keeps its last
    position).

    Returns:
        ``(f_index, y, X)``, with ``y`` and ``X`` produced by
        ``getTrainMatrix(train_set, f_index)``.
    """
    keyword_pool = []
    samples_by_label = []

    for current_label in labels:
        fetched = getAllText(train_data_dir, current_label, sample_num)
        combined_text, sample_texts = fetched
        label_keywords = WordSeg.getKeyWords(combined_text, kw_each_label)
        keyword_pool.extend(label_keywords)
        samples_by_label.append(sample_texts)

    # keyword -> index within the combined keyword list
    f_index = dict((word, idx) for idx, word in enumerate(keyword_pool))

    y, X = getTrainMatrix(samples_by_label, f_index)
    return f_index, y, X
# Write a Bunch object to disk.
def writebunchobj(path, bunchobj):
    """Pickle ``bunchobj`` into the file at ``path``.

    Args:
        path: Destination file path; the file is created or overwritten.
        bunchobj: Object to serialise with ``pickle.dump`` (protocol 0).
    """
    # pickle.dump writes to the file object, so it must be opened for
    # binary writing; the ``with`` block closes it even if dumping fails.
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj, protocol=0)


# Load the Bunch object holding the word vectors produced after segmentation.
path = "../file/train_word_bag/train_set.dat"  # where the word vectors are stored
bunch = readbunchobj(path)

# Build the Bunch object for the tf-idf vector space.
tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames, tdm=[], vocabulary={})

# Stop-word list.
stopword_path = "../file/train_word_bag/stoplist.txt"
stpwrdlst = wordSeg.readfile(stopword_path).splitlines()

# Initialise the vector space model with TfidfVectorizer.
vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
transformer = TfidfTransformer()  # this class computes the tf-idf weight of every term

# Convert the texts into a term matrix and keep the vocabulary separately.
tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
tfidfspace.vocabulary = vectorizer.vocabulary_

# Persist the bag of words.
space_path = "../file/train_word_bag/tfidfspace.dat"  # where the bag-of-words vectors are saved
writebunchobj(space_path, tfidfspace)
def getFeature(sentence):
    """Return the keywords extracted from ``sentence``.

    Delegates to ``WordSeg.getKeyWords`` with the module-level
    ``kw_each_predict`` as its second argument and returns the result
    unchanged.
    """
    keywords = WordSeg.getKeyWords(sentence, kw_each_predict)
    return keywords
def getFeature(sentence):
    """Extract the feature keywords of ``sentence``.

    The result is exactly what
    ``WordSeg.getKeyWords(sentence, kw_each_predict)`` returns.
    """
    extracted = WordSeg.getKeyWords(sentence, kw_each_predict)
    return extracted