def getFeature(sentence, f_index):
    # Extract keywords from the sentence and project them onto the binary
    # feature vector defined by f_index (keyword -> column index).
    kw = WordSeg.getKeyWords(sentence, kw_each_predict)
    f_vector = [0] * (kw_each_label * num_of_labels)  # zero vector over the feature space
    for i in kw:
        if i in f_index:  # dict.has_key() was removed in Python 3; use `in`
            f_vector[f_index[i]] = 1
    return f_vector
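A minimal call sketch (not part of the original listing): kw_each_predict, kw_each_label and num_of_labels are assumed to be module-level configuration globals, and WordSeg is assumed to be the project's own word-segmentation module.

# Hypothetical configuration; the real values live elsewhere in the project.
kw_each_predict = 20   # keywords extracted per input sentence (assumption)
kw_each_label = 50     # keywords kept per label (assumption)
num_of_labels = 4      # number of target classes (assumption)

f_index = {"price": 0, "shipping": 1}  # toy keyword -> column mapping
vec = getFeature("the shipping was fast and the price was fair", f_index)
print(sum(vec))  # how many extracted keywords fall inside the feature space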
Example #3
def getFeatureSpace(train_data_dir, labels, sample_num=100):
    kws = list()        # keywords accumulated across all labels
    train_set = list()
    for label in labels:
        alltext, texts = getAllText(train_data_dir, label, sample_num)
        kw = WordSeg.getKeyWords(alltext, kw_each_label)
        kws.extend(kw)
        train_set.append(texts)
    f_index = dict(zip(kws, range(len(kws))))  # keyword -> column index
    y, X = getTrainMatrix(train_set, f_index)
    return f_index, y, X
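A hedged sketch of how the returned triple might feed a classifier; LinearSVC, the directory layout, and the label names are illustrative assumptions, not part of the source.

from sklearn.svm import LinearSVC

labels = ["sports", "finance", "tech"]  # hypothetical label sub-folders
f_index, y, X = getFeatureSpace("../data/train", labels, sample_num=100)

clf = LinearSVC()  # any sklearn classifier with fit/predict would do
clf.fit(X, y)      # X holds one binary keyword row per training text
print(clf.predict([getFeature("a new document to classify", f_index)]))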
Example #5
import pickle

from sklearn.utils import Bunch
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

# Write a Bunch object to disk with pickle.
def writebunchobj(path, bunchobj):
    file_obj = open(path, "wb")  # must be opened for writing in binary mode, not "rb"
    pickle.dump(bunchobj, file_obj, protocol=0)
    file_obj.close()

# Read a pickled Bunch object back from disk (counterpart to writebunchobj;
# the original listing calls it below without showing its definition).
def readbunchobj(path):
    file_obj = open(path, "rb")
    bunch = pickle.load(file_obj)
    file_obj.close()
    return bunch

# Load the word-segmented training set saved as a Bunch object.
path = "../file/train_word_bag/train_set.dat"  # save path of the word vectors
bunch = readbunchobj(path)

# Build the tf-idf vector-space object.
tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label,
                   filenames=bunch.filenames, tdm=[], vocabulary={})

# Stop-word list (wordSeg is the project's own helper module).
stopword_path = "../file/train_word_bag/stoplist.txt"
stpwrdlst = wordSeg.readfile(stopword_path).splitlines()

# Initialize the vector-space model with TfidfVectorizer.
vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
transformer = TfidfTransformer()  # computes per-term tf-idf weights (instantiated but unused below)

# Convert the texts to a tf-idf term matrix and keep the vocabulary separately.
tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
tfidfspace.vocabulary = vectorizer.vocabulary_

# Persist the bag-of-words object.
space_path = "../file/train_word_bag/tfidfspace.dat"  # save path of the tf-idf space
writebunchobj(space_path, tfidfspace)
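A hedged follow-up sketch: reloading the persisted space and pinning a new TfidfVectorizer to its vocabulary keeps test documents in the same column space as the training matrix; the test text is a placeholder.

trained = readbunchobj("../file/train_word_bag/tfidfspace.dat")
test_vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True,
                                  max_df=0.5, vocabulary=trained.vocabulary)
test_tdm = test_vectorizer.fit_transform(["some segmented test text"])
print(test_tdm.shape)  # (1, len(trained.vocabulary))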
 
Example #6
def getFeature(sentence):
    # Variant without a feature index: return the raw keyword list
    # (the top kw_each_predict keywords) instead of a binary vector.
    return WordSeg.getKeyWords(sentence, kw_each_predict)
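A minimal call sketch; as above, kw_each_predict and WordSeg are assumed to come from the surrounding module, and the input sentence is a placeholder.

keywords = getFeature("an example sentence to classify")  # placeholder input
print(keywords)  # the top kw_each_predict keywords returned by WordSeg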