Example #1
def get_trainset():
	train = pd.read_csv('labeledTrainData.tsv', delimiter="\t", quoting=3)
	y_train = train['sentiment'][:10000]
	train_data = []
	#for i in range(0, len(train['review'])):
	for i in range(0, 10000):
		train_data.append(" ".join(review_to_wordlist(train['review'][i])))

	with open('train_data.pickle', 'wb') as f:
		pickle.dump(train_data, f)

	print "TrainSet Starting!\n"
	from sklearn.feature_extraction.text import TfidfVectorizer as TFIV
	tfv = TFIV(min_df=3, max_features=None, strip_accents='unicode', analyzer='word',
	           token_pattern=r'\w{1,}', ngram_range=(1, 2), use_idf=1, smooth_idf=1,
	           sublinear_tf=1, stop_words='english')
	X = tfv.fit_transform(train_data).todense()
	voc = tfv.vocabulary_
	#f = open('X.pickle', 'wb')
	#f.write(pickle.dumps(X))
	#f.close()
	#print 'TrainSet End!\n'

	del train_data

	return X, y_train, voc
Example #2
def feature_tfidf(train_words, test_words):
    """
    建立tfidf 特征向量
    :param words:
    :return:
    """
    # 初始化TFIV对象,去停用词,加2元语言模型
    tfv = TFIV(min_df=3,
               max_features=None,
               strip_accents='unicode',
               analyzer='word',
               token_pattern=r'\w{1,}',
               ngram_range=(1, 2),
               use_idf=1,
               smooth_idf=1,
               sublinear_tf=1,
               stop_words='english')
    # Combine the train and test sets so the TF-IDF vectorizer sees the whole corpus
    X_all = train_words + test_words
    len_train = len(train_words)
    # This step is a bit slow: fit the vectorizer
    tfv.fit(X_all)
    X_all = tfv.transform(X_all)
    # Split back into the train and test parts
    X = X_all[:len_train]
    X_test = X_all[len_train:]
    return X, X_test
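
A minimal usage sketch for feature_tfidf (not part of the original snippet): it assumes the function above and its TfidfVectorizer-as-TFIV import are in scope, and the toy reviews and labels below are purely illustrative, sized just large enough that min_df=3 leaves a non-empty vocabulary.

from sklearn.feature_extraction.text import TfidfVectorizer as TFIV  # alias assumed by feature_tfidf
from sklearn.naive_bayes import MultinomialNB

train_words = ["good movie good plot",
               "good acting bad plot",
               "bad movie bad acting",
               "good movie great acting"]
test_words = ["good movie bad acting"]
y_train = [1, 0, 0, 1]  # toy sentiment labels

X, X_test = feature_tfidf(train_words, test_words)  # sparse TF-IDF matrices for train and test
clf = MultinomialNB().fit(X, y_train)
print(clf.predict(X_test))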
Example #3
def kaggle_21feb_svm_max_df_filtering():
    tfv = TFIV(min_df=3,
               max_features=None,
               strip_accents='unicode',
               analyzer='word',
               token_pattern=r'\w{1,}',
               ngram_range=(1, 2),
               use_idf=1,
               smooth_idf=1,
               sublinear_tf=False,
               max_df=0.9)

    X_train, X_test, y_train, y_test = train_test_split(raw['x_raw'],
                                                        raw['y'],
                                                        test_size=0.2,
                                                        random_state=42)

    lsvc = LinearSVC(max_iter=100000)

    cp1 = Pipeline([('tfidf', tfv), ('lsvc', lsvc)])

    label = 'svm'

    scores = cross_val_score(cp1, X_train, y_train, cv=5, scoring='f1_micro')
    cp1.fit(X_train, y_train)
    mean_f1 = f1_score(y_test, cp1.predict(X_test), average='micro')
    print("Accuracy: %0.4f (+/- %0.4f) [%s] | Test: %0.4f" %
          (scores.mean(), scores.std(), label, mean_f1))

    write_submission('21feb-svm-max_df_filtering-sub.csv', raw['X_Ids'],
                     cp1.predict(raw['X_raw']))
Example #4
def pred(test_idx, test_doc, model, voc):
	test_data = []
	for i in range(len(test_doc)):
		test_data.append(" ".join(review_to_wordlist(test_doc[i])))
	# Note: with a fixed vocabulary, fit_transform still recomputes idf from the test data alone;
	# reusing the vectorizer fitted on the training set would keep the weights consistent with the model.
	tfv = TFIV(min_df=3, max_features=None, strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
			   ngram_range=(1, 2), use_idf=1, smooth_idf=1, sublinear_tf=1, stop_words='english', vocabulary=voc)
	X_test = tfv.fit_transform(test_data).todense()
	X_test = np.array(X_test)
	pred = model.predict(X_test)
	test_idx = np.array(test_idx)
	like_idx = list(test_idx[pred == 1])
	hate_idx = list(test_idx[pred == 0])
	return like_idx, hate_idx
Example #5
def train_2(labelList=(), maxLen=None, kFold=5):
    # Read the dataset with pandas
    all_data = pd.read_csv('../data/data_five_labels.csv')
    all_data.columns = ["disease","position","label","title","abstract","text"]
    # Convert the data into word lists
    x_data = []
    y_data = []
    # Label distribution: {'症状': 24697, '病因': 8472, '检查': 4191, '鉴别': 1105, '治疗': 25620, '饮食护理': 10522, '预防': 7246, '并发症': 2200}
    for i in range(0, len(all_data['title'])):
        if all_data["label"][i] not in labelList:#剔除数据量不足的标签
            x_data.append(all_data["title"][i])
            y_data.append(all_data["label"][i])
    stopwords = []
    # Build the stop-word list
    with open("../data/stopwords.txt","r",encoding="utf-8") as f:
        lines = f.readlines()
        for line in lines:
            stopwords.append(line.strip())
    # Initialize the TFIV object: remove stop words, use 1- to 3-gram features
    # (max_df=1.0 is a proportion; an integer 1 would drop every term appearing in more than one document)
    tfv = TFIV(min_df=1, max_df=1.0, max_features=maxLen, strip_accents='unicode', analyzer='word',
               token_pattern=r'\b[\u4e00-\u9fa5]\w+\b', stop_words=stopwords, ngram_range=(1, 3))
    x_all = []
    y_all = []
    jieba.load_userdict("../data/diseaseDic.txt")
    jieba.load_userdict("../data/symptomDic.txt")
    print("正在分词、去停...")
    i= 0
    for text in x_data:
        i+=1
        print("第%d篇文章..."%i)
        try:
            x_all.append(" ".join(jieba.cut(seg_sentence(text))))
            y_all.append(y_data[x_data.index(text)])
        except:
            pass
    print("初始标签长度:%d" % len(y_data))
    print("分词去停后标签长度:%d" %len(y_all))
    print("正在抽取特征词...")
    tfv.fit(x_all)
    feature_list = tfv.get_feature_names()
    # Write the feature terms to a txt file
    with open("../data/feature_names_3.txt", "w", encoding="utf-8") as f:
        for x in feature_list:
            f.write(x + "\n")
    print("正在将文档映射为向量...")
    X_all = tfv.transform(x_all)
    clf = svm.SVC(kernel='linear',C=1)
    print("训练...")
    scores = cross_val_score(clf,X_all,y_all,cv=kFold)
    return scores
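
A hedged usage sketch for train_2 (not from the original project): the two rarest labels from the distribution comment above are excluded, maxLen=30000 mirrors train_1 below and is only an illustrative choice, and the call assumes the ../data files referenced in the function are present.

scores = train_2(labelList=['鉴别', '并发症'], maxLen=30000, kFold=5)
print("mean CV accuracy: %.4f (+/- %.4f)" % (scores.mean(), scores.std()))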
Example #6
def train_1():
    # Read the dataset with pandas
    all_data = pd.read_csv('../data/articles.csv')
    all_data.columns = ["disease","position","label","title","abstract","text"]
    # Extract the sentence labels
    y_data = all_data['label']
    print("初始标签长度:%d" %len(y_data))
    # Convert the data into word lists
    x_data = []
    for i in range(0, len(all_data['title'])):
        x_data.append(all_data["title"][i])
    stopwords = []
    # Build the stop-word list
    with open("../data/stopwords.txt","r",encoding="utf-8") as f:
        lines = f.readlines()
        for line in lines:
            stopwords.append(line.strip())
    # Initialize the TFIV object: remove stop words, use 1- to 3-gram features
    # (max_df=1.0 is a proportion; an integer 1 would drop every term appearing in more than one document)
    tfv = TFIV(min_df=1, max_df=1.0, max_features=30000, strip_accents='unicode', analyzer='word',
               token_pattern=r'\b[\u4e00-\u9fa5]\w+\b', stop_words=stopwords, ngram_range=(1, 3))
    x_all = []
    y_all = []
    jieba.load_userdict("../data/diseaseDic.txt")
    jieba.load_userdict("../data/symptomDic.txt")
    print("正在分词、去停...")
    i= 0
    for text in x_data:
        i+=1
        print("第%d篇文章..."%i)
        try:
            x_all.append(" ".join(jieba.cut(seg_sentence(text))))
            y_all.append(y_data[x_data.index(text)])
        except:
            pass
    print("分词去停后标签长度:%d" %len(y_all))
    print("正在抽取特征词...")
    tfv.fit(x_all)
    feature_list = tfv.get_feature_names()
    # Write the feature terms to a txt file
    with open("../data/feature_names_3.txt", "w", encoding="utf-8") as f:
        for x in feature_list:
            f.write(x + "\n")
    print("正在将文档映射为向量...")
    X_all = tfv.transform(x_all)
    clf = svm.SVC(kernel='linear',C=1)
    print("训练...")
    scores = cross_val_score(clf,X_all,y_all,cv=5)
    print(scores)
Example #7
def data_tfidf2(x):
    tfv = TFIV(
        min_df=3,
        max_df=0.5,
        max_features=None,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 2),
        use_idf=1,
        smooth_idf=1,
        sublinear_tf=1,
    )
    tfv.fit(x)
    x = tfv.transform(x)
    return x
Example #8
def main(star_rating=1):
    # start = datetime.strptime(start_time, '%m/%d/%Y')
    # end = datetime.strptime(end_time, '%m/%d/%Y')

    review = data.review_body[data.star_rating == star_rating]

    stop_words = load_stopword()
    texts = [[
        word for word in line.strip().lower().split() if word not in stop_words
    ] for line in review]  # lowercase, split, and drop stop words

    temp = []
    for i in texts:
        temp.append(' '.join(i))
    print('Word preprocessing finished!')
    '''TF-IDF processing'''
    n_features = 1000  # limit the number of features
    tf_vectorizer = TFIV(strip_accents='unicode',
                         max_features=n_features,
                         stop_words='english',
                         max_df=0.5,
                         min_df=10)  # build the TF-IDF vectorizer

    tf = tf_vectorizer.fit_transform(temp)  # vectorize every document
    '''LDA'''
    from sklearn.decomposition import LatentDirichletAllocation  # import LDA
    n_topics = 5  # number of topics
    lda = LatentDirichletAllocation(n_components=n_topics,
                                    max_iter=50,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)  # build the LDA model
    lda.fit(tf)  # fit the model

    def print_top_words(model, feature_names, n_top_words):
        for topic_idx, topic in enumerate(model.components_):
            print("Topic #%d:" % topic_idx)
            print(" ".join([
                feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]
            ]))
        print()  # show the 20 top words for each topic

    n_top_words = 20

    tf_feature_names = tf_vectorizer.get_feature_names()  # get the names of the (up to) 1000 features
    print_top_words(lda, tf_feature_names, n_top_words)
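
A possible follow-up inside main (a sketch, not part of the original example): LatentDirichletAllocation also exposes transform, which maps each vectorized review to its topic mixture.

    doc_topics = lda.transform(tf)              # shape (n_reviews, n_topics); each row sums to 1
    dominant_topic = doc_topics.argmax(axis=1)  # most likely topic index per review
    print(doc_topics[:5])
    print(dominant_topic[:5])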
Example #9
def kaggle_21feb_ensemble_sub():
    tfv = TFIV(min_df=3,
               max_features=None,
               strip_accents='unicode',
               analyzer='word',
               token_pattern=r'\w{1,}',
               ngram_range=(1, 2),
               use_idf=1,
               smooth_idf=1,
               sublinear_tf=False,
               max_df=0.9)

    X_train, X_test, y_train, y_test = train_test_split(raw['x_raw'],
                                                        raw['y'],
                                                        test_size=0.2,
                                                        random_state=42)

    lr = LogisticRegression(solver='lbfgs')
    lsvc = LinearSVC()
    nb = MultinomialNB()

    cp1 = Pipeline([('tfidf', tfv), ('lsvc', lsvc)])
    cp2 = Pipeline([('tfidf', tfv), ('lr', lr)])
    cp3 = Pipeline([('tfidf', tfv), ('nb', nb)])

    # Hard voting: majority rules, Soft voting: weighted average probabilities
    eclf = VotingClassifier(estimators=[('lsvc', cp1), ('lr', cp2),
                                        ('nb', cp3)],
                            voting='hard')

    label = 'ensemble'

    scores = cross_val_score(eclf, X_train, y_train, cv=5, scoring='f1_micro')
    eclf.fit(X_train, y_train)
    mean_f1 = f1_score(y_test, eclf.predict(X_test), average='micro')
    print("Accuracy: %0.4f (+/- %0.4f) [%s] | Test: %0.4f" %
          (scores.mean(), scores.std(), label, mean_f1))

    write_submission('21feb-ensemble-sub.csv', raw['X_Ids'],
                     eclf.predict(raw['X_raw']))
Example #10
File: main.py Project: shuanzi/ML
    def tfidf(self, ngram=2):
        tfv = TFIV(min_df=3,
                   max_features=None,
                   strip_accents='unicode',
                   analyzer='word',
                   token_pattern=r'\w{1,}',
                   ngram_range=(1, ngram),
                   use_idf=1,
                   smooth_idf=1,
                   sublinear_tf=1,
                   stop_words='english')

        X_all = self.traindata + self.testdata
        lentrain = len(self.traindata)

        tfv.fit(X_all)
        X_all = tfv.transform(X_all)

        self.X = X_all[:lentrain]
        self.X_test = X_all[lentrain:]

        print("vectorization data size: ", self.X.shape)
        return self.X, self.y_train, self.X_test, self.y_test
Example #11
    def tfidf(self, ngram=2):
        tfv = TFIV(min_df=3,
                   max_features=None,
                   strip_accents='unicode',
                   analyzer='word',
                   token_pattern=r'\w{1,}',
                   ngram_range=(1, ngram),
                   use_idf=1,
                   smooth_idf=1,
                   sublinear_tf=1,
                   stop_words='english')

        # Combine both to fit the TFIDF vectorization.
        X_all = self.traindata + self.testdata
        lentrain = len(self.traindata)

        tfv.fit(X_all)  # This is the slow part!
        X_all = tfv.transform(X_all)

        self.X = X_all[:lentrain]  # Separate back into training and test sets.
        self.X_test = X_all[lentrain:]

        print("vectorization data size: ", self.X.shape)
        return self.X, self.y_train, self.X_test
Example #12
def train(maxFeature=None, minDf=0.0, maxDf=1.0, ngram=(1, 1), classifier=svm.SVC(kernel='linear')):
    # Read the training and test csv files with pandas
    train = pd.read_csv('../data/train_0.8.csv')
    test = pd.read_csv('../data/test_0.2.csv')
    train.columns = ["disease","position","label","title","abstract","text"]
    test.columns = ["disease","position","label","title","abstract","text"]
    # Extract the labels
    y_train = train['label']
    y_test = test['label']
    # Convert the training and test data into word lists
    train_data = []
    for i in range(0, len(train['title'])):
        train_data.append(train["title"][i]+str(train["abstract"][i])+str(train["text"][i]))
        # train_data.append(train["abstract"][i]+train['text'][i])
    test_data = []
    for i in range(0, len(test['title'])):
        test_data.append(test["title"][i]+str(test["abstract"][i])+str(test["text"][i]))
        # test_data.append(train["abstract"][i]+test['text'][i])


    stopwords = []
    with open("../data/stopwords.txt","r",encoding="utf-8") as f:
        lines = f.readlines()
        for line in lines:
            stopwords.append(line.strip())
    # with open("../data/diseaseDic.txt","r",encoding="utf-8") as f:
    #     lines = f.readlines();
    #     for line in lines:
    #         stopwords.append(line.strip())
    # with open("../data/symptomDic.txt","r",encoding="utf-8") as f:
    #     lines = f.readlines();
    #     for line in lines:
    #         stopwords.append(line.strip())
    # Initialize the TFIV object: remove stop words, n-gram range set by the ngram argument
    tfv = TFIV(min_df=minDf, max_df=maxDf, max_features=maxFeature, strip_accents='unicode', analyzer='word',
               token_pattern=r'\b[\u4e00-\u9fa5]\w+\b', ngram_range=ngram, stop_words=stopwords)
    # Combine the train and test sets for TF-IDF vectorization
    X_all = train_data + test_data
    tokenized_corpus = []
    jieba.load_userdict("../data/diseaseDic.txt")
    jieba.load_userdict("../data/symptomDic.txt")
    print("正在分词、去停...")
    for text in X_all:
        tokenized_corpus.append(" ".join(jieba.cut(seg_sentence(text))))
    len_train = len(train_data)

    # Vectorize all the data; this takes a while
    print("Extracting feature terms...")
    tfv.fit(tokenized_corpus)
    feature_list = tfv.get_feature_names()
    f = open("../data/feature_names_3.txt","w",encoding="utf-8")
    for x in feature_list:
        f.write(x+"\n")
    f.close()
    print("正在将文档映射为向量...")
    X_all = tfv.transform(tokenized_corpus)
    with open("../data/vectors.txt","w",encoding="utf-8") as file:
        file.write(str(X_all))
    # Split back into the train and test parts
    X = X_all[:len_train]
    X_test = X_all[len_train:]


    # Model evaluation helper: prints precision, recall, F1 and accuracy in turn
    def calculate_result(actual, pred):
        m_precision = metrics.precision_score(actual, pred,average=None)
        m_recall = metrics.recall_score(actual, pred,average=None)
        accuracy = metrics.accuracy_score(pred, actual)
        print('SVM accuracy on the test set:', accuracy)
        print('SVM precision on the test set:', m_precision)
        print('SVM recall on the test set:', m_recall)
        print('SVM F1 on the test set:', metrics.f1_score(actual, pred, average=None))
        return accuracy


    # Train the SVM classifier
    print("Training the classifier...")
    svclf = classifier
    svclf.fit(X, y_train)
    # # Save the model
    # joblib.dump(svclf,"P_svm_train_model.m")
    print("Predicting...")
    pred = svclf.predict(X_test)
    acc = calculate_result(y_test, pred)
    return acc
Example #13
print(len(labels))
print("File Reading Finished")


# In[105]:


print("Defining TFIDF Vectorizer")

tfIdfVec = TFIV(
                    min_df=3, # When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold
                    max_features=10000, # If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus.
                    strip_accents='unicode', # Remove accents during the preprocessing step. 'ascii' is a fast method that only works on characters that have an direct ASCII mapping.
                                             # 'unicode' is a slightly slower method that works on any characters.
                    analyzer='word', # Whether the feature should be made of word or character n-grams. Can be a callable.
                    token_pattern=r'\w{1,}', # Regular expression denoting what constitutes a "token", only used if analyzer == 'word'.
                    ngram_range=(1,5), # The lower and upper boundary of the range of n-values for different n-grams to be extracted.
                    use_idf=1, # Enable inverse-document-frequency reweighting.
                    smooth_idf=1, # Smooth idf weights by adding one to document frequencies.
                    sublinear_tf=1, # Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).
                    stop_words = 'english' # 'english' is currently the only supported string value.
                )


# In[106]:


print("Fitting")

tfIdfVec.fit(clean_data) # Learn vocabulary and idf from training set.
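
For intuition about what these settings learn, here is a small standalone sketch on a toy corpus (min_df lowered to 1 so the tiny corpus survives pruning; everything else follows the spirit of the vectorizer above, not its exact configuration):

from sklearn.feature_extraction.text import TfidfVectorizer as TFIV

toy_corpus = [
    "the movie was good",
    "the movie was bad",
    "a good movie with good acting",
]
toy_vec = TFIV(min_df=1, ngram_range=(1, 2), sublinear_tf=True, stop_words='english')
toy_matrix = toy_vec.fit_transform(toy_corpus)
print(sorted(toy_vec.vocabulary_))  # unigrams and bigrams kept after stop-word removal
print(toy_vec.idf_)                 # one idf weight per vocabulary entry
print(toy_matrix.shape)             # (3 documents, n_features)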
Example #14
    trainSetLabel.append(content[i])

for i in os.listdir('./test/'):
    words = []
    with open('./test/%s' % i, encoding='gb18030', errors='ignore') as file:
        mail = file.read()
    words = ' '.join(analyse.extract_tags(mail, topK=20))
    testList.append(words)

    i = sub(r'\.txt$', '', i)  # strip the .txt extension from the filename to get the id
    idList.append(i)

allData = trainList + testList
lenTrain = len(trainList)

ft = TFIV()
featureVector = ft.fit_transform(allData).toarray()

#Separate back into training and test sets.
trainSet = featureVector[:lenTrain]
testSet = featureVector[lenTrain:]

#Train naive Bayes model.
mnb = MultinomialNB()
MNBResult = mnb.fit(trainSet, trainSetLabel).predict(testSet)

outPut = pd.DataFrame(data={
    "classification": MNBResult,
    "id": idList,
    "content": testList
})
Example #15
    seg_list = jieba.cut(stri,use_paddle=True)
    unlabeled_text.append(' '.join(list(seg_list)))

for stri in test_origin_text:
    seg_list = jieba.cut(stri,use_paddle=True)
    test_text.append(' '.join(list(seg_list)))

train_label = []
for i in range(0,1000):
    train_label.append(" ".join(review_to_wordlist(train_origin['label'][i])))



# min_df=3 drops low-frequency terms; word-level analysis with ngram_range enabled; idf with smoothing (smooth_idf=1)
# and sublinear tf scaling (tf replaced by 1 + log(tf), sublinear_tf=1)
tfv = TFIV(min_df=3, strip_accents='unicode', analyzer='word', ngram_range=(1, 2),
           use_idf=1, smooth_idf=1, sublinear_tf=1)
 
# Note: the vectorizer is fitted on the training set only
tfv.fit(train_text)


X_all = train_text + unlabeled_text +test_text
len_train = len(train_text)
len_unlabeled = len(unlabeled_text)
X_all = tfv.transform(X_all)
 
 
# Split back into the train / unlabeled / test parts
# slices are half-open: [start, end)
train_X = X_all[:len_train]
unlabeled_X = X_all[len_train:len_train+len_unlabeled]
Example #16
def train(maxFeature=None, minDf=0.0, maxDf=1.0, ngram=(1, 1), classifier=svm.SVC(kernel='linear')):
    # Read the training and test csv files with pandas
    train = pd.read_csv('../data/train.csv')
    test = pd.read_csv('../data/test.csv')
    train.columns = ["disease","position","label","title","abstract","text"]
    test.columns = ["disease","position","label","title","abstract","text"]
    word_vec = {}
    with open("../data/word_vector.csv","r",encoding="utf-8") as f:
        for line in csv.reader(f):
            word_vec.update({line[0]:eval(line[1])})
    # Extract the labels
    y_train = train['label']
    y_test = test['label']
    # Convert the training and test data into word lists
    train_data = []
    for i in range(0, len(train['title'])):
        train_data.append(train["title"][i]+str(train["abstract"][i])+str(train["text"][i]))
        # train_data.append(train["abstract"][i]+train['text'][i])
    test_data = []
    for i in range(0, len(test['title'])):
        test_data.append(test["title"][i]+str(test["abstract"][i])+str(test["text"][i]))
        # test_data.append(train["abstract"][i]+test['text'][i])


    stopwords = []
    with open("../data/stopwords.txt","r",encoding="utf-8") as f:
        lines = f.readlines()
        for line in lines:
            stopwords.append(line.strip())
    # Initialize the TFIV object: remove stop words, n-gram range set by the ngram argument
    tfv = TFIV(min_df=minDf, max_df=maxDf, max_features=maxFeature, strip_accents='unicode', analyzer='word',
               token_pattern=r'\b[\u4e00-\u9fa5]\w+\b', ngram_range=ngram, stop_words=stopwords)
    # Combine the train and test sets for TF-IDF vectorization
    X_all = train_data + test_data
    tokenized_corpus = []
    # jieba.load_userdict("../data/diseaseDic.txt")
    # jieba.load_userdict("../data/symptomDic.txt")
    for text in X_all:
        tokenized_corpus.append(" ".join(jieba.cut(seg_sentence(text))))
    len_train = len(train_data)

    # Vectorize all the data; this takes a while
    tfv.fit(tokenized_corpus)
    feature_list = tfv.get_feature_names()
    f = open("../data/feature_names_2.txt","w",encoding="utf-8")
    for x in feature_list:
        f.write(x+"\n")
    f.close()
    model = gensim.models.KeyedVectors.load_word2vec_format("../data/sgns.wiki.bigram-char", encoding="utf-8")
    X_all = tfv.transform(tokenized_corpus)
    all = []
    for sentence in X_all:
        sentenceWeight = np.array([0] * 300)
        num = 0
        sentence = sentence.toarray().tolist()[0]
        for i, weight in enumerate(sentence):  # use the tf-idf weight to scale each word vector
            if weight != 0:
                num += 1
                word = feature_list[i]
                if word in word_vec:
                    wordMatrix = np.array(word_vec[word])
                else:
                    wordMatrix = np.array([0] * 300)
                sentenceWeight = np.add(sentenceWeight, weight * wordMatrix)
        if num != 0:
            all.append((sentenceWeight/num).tolist())
        else:
            all.append([0]*300)
    with open("../data/vectors.txt","w",encoding="utf-8") as file:
        file.write(str(X_all))
    # Split back into the train and test parts
    X = np.array(all[:len_train])
    X_test = np.array(all[len_train:])


    # Model evaluation helper: writes precision, recall, F1 and accuracy to a record file
    def calculate_result(actual, pred):
        m_precision = metrics.precision_score(actual, pred,average=None)
        m_recall = metrics.recall_score(actual, pred,average=None)
        accuracy = metrics.accuracy_score(pred, actual)
        with open("../data/record.txt","a",encoding="utf-8") as f:
            f.write('SVM分类模型在测试集上的准确率:'+str(accuracy)+'\n')
            f.write('SVM分类模型在测试集上的准确率:'+str(accuracy)+'\n')
            f.write('SVM分类模型在测试集上的P值:'+str(m_precision)+'\n')
            f.write('SVM分类模型在测试集上的R值:'+str(m_recall)+'\n')
            f.write('SVM分类模型在测试集上的F1值:'+str(metrics.f1_score(actual, pred,average=None))+'\n')
        return accuracy


    # Train the SVM classifier
    svclf = classifier
    svclf.fit(X, y_train)
    # # Save the model
    # joblib.dump(svclf,"P_svm_train_model.m")
    pred = svclf.predict(X_test)
    acc = calculate_result(y_test, pred)
    return acc
Example #17
def main():

    #Load labeled training data
    train = pd.read_csv('../../data/labeledtrainData.tsv',
                        header=0,
                        delimiter="\t",
                        quoting=3)

    #The column on which we will predict
    predCol = train['sentiment']

    pickle.dump(predCol, open("../../picks/predCol", "wb"))

    #List for storing cleaned up training data
    trainData = []

    #Loop counter
    numRevs = len(train['review'])

    for i in range(0, numRevs):

        if ((i + 1) % 2000 == 0):

            print("Train Review %d of %d\n" % (i + 1, numRevs))

        #Clean each review> Please look at the definition of the sentimentToWordlist function in the preproc.py script
        trainData.append(" ".join(
            preProc.sentimentToWordlist(train['review'][i])))

    #Load test data
    test = pd.read_csv('../../data/testData.tsv',
                       header=0,
                       delimiter="\t",
                       quoting=3)

    #List for storing cleaned up test data
    testdata = []

    #Loop counter
    numRevs = len(test['review'])

    for i in range(0, numRevs):

        if ((i + 1) % 2000 == 0):

            print("Test Review %d of %d\n" % (i + 1, numRevs))

        #Clean each review> Please look at the definition of the sentimentToWordlist function in the preproc.py script
        testdata.append(" ".join(preProc.sentimentToWordlist(
            test['review'][i])))

    #Define/build TfidfVectorizer
    print("Defining TFIDF Vectorizer")

    tfIdfVec = TFIV(
        min_df=3,  # When building the vocabulary, ignore terms with a document frequency strictly lower than this threshold
        max_features=3000,  # If not None, build a vocabulary that only considers the top max_features ordered by term frequency across the corpus
        strip_accents='unicode',  # Remove accents during preprocessing; 'unicode' works on any characters ('ascii' is faster but ASCII-only)
        analyzer='word',  # Whether the feature should be made of word or character n-grams; can be a callable
        token_pattern=r'\w{1,}',  # Regular expression denoting what constitutes a "token"; only used if analyzer == 'word'
        ngram_range=(1, 5),  # The lower and upper boundary of the range of n-values for n-grams to be extracted
        use_idf=1,  # Enable inverse-document-frequency reweighting
        smooth_idf=1,  # Smooth idf weights by adding one to document frequencies
        sublinear_tf=1,  # Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf)
        stop_words='english'  # 'english' is currently the only supported string value
    )

    pickle.dump(tfIdfVec, open("../../picks/vectorizer.pkl", "wb"))

    combineData = trainData + testdata  # Combine both to fit the TFIDF vectorization.

    trainLen = len(trainData)

    print("Fitting")

    tfIdfVec.fit(combineData)  # Learn vocabulary and idf from the combined train + test data.

    print("Transforming")

    combineData = tfIdfVec.transform(combineData)  # Transform documents to a document-term matrix using the vocabulary and document frequencies learned by fit.
    pickle.dump(combineData, open("../../picks/transformedData.pkl", "wb"))
    print("Fitting and transforming done")

    trainAfterFit = combineData[:trainLen]  # Separate back into training and test sets.
    pickle.dump(trainAfterFit, open("../../picks/fittedTrainData.pkl", "wb"))

    testAfterFit = combineData[trainLen:]
    pickle.dump(testAfterFit, open("../../picks/fittedTestData.pkl", "wb"))
Example #18
# coding=utf-8
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer as TFIV
from codes.new_model import seg_sentence
import jieba
# import scipy.sparse.csr.csr_matrix
tfv = TFIV(strip_accents='unicode',
           analyzer='word',
           token_pattern=r'\b[\u4e00-\u9fa5]\w+\b',
           stop_words=None)
tfv.fit([
    " ".join(jieba.cut("SVM分类模型在测试集上的准确率")),
    " ".join(jieba.cut("本文为博主原创文章,未经博主允许不得转载")),
    " ".join(jieba.cut("【word2vec实例2】加载模型"))
])
print(tfv.get_feature_names())
# print(tfv.transform([" ".join(jieba.cut("SVM分类模型在测试集上的准确率"))," ".join(jieba.cut("本文为博主原创文章,未经博主允许不得转载"))," ".join(jieba.cut("【word2vec实例2】加载模型"))]))
all = tfv.transform([
    " ".join(jieba.cut("SVM分类模型在测试集上的准确率")),
    " ".join(jieba.cut("本文为博主原创文章,未经博主允许不得转载")),
    " ".join(jieba.cut("【word2vec实例2】加载模型")), "", "测试"
])
print(all)
for x in all:
    x = x.toarray().tolist()[0]
    for idx, value in enumerate(x):
        if value != 0:
            print(idx)
# print(all[0])
# # print(all.toarray().tolist())
# # for x in all:
Example #19
test_data = []
for i in range(0, len(test['review'])):
    test_data.append(" ".join(review_to_wordlist(test['review'][i])))
print(train_data[0])  # with all this stuff...; a full 2,000 bytes
##################################################################
## Feature engineering: now that we have the data, we need features with real discriminative power
# For instance, the word2vec approach suggested on this problem's Kaggle page is one way to map text into a numeric space;
# here every movie review is turned into a TF-IDF (term frequency - inverse document frequency) vector.
# TF-IDF is a statistic that estimates how important a word (or n-gram) is to one document within a collection or corpus:
# a term's weight grows with how often it appears in the document, but shrinks with how often it appears across the corpus.
# On top of single words we also add a 2-gram language model (don't worry if n-grams are new to you, a follow-up post is coming).
# You could add 3-grams and 4-grams too, but a single machine runs out of memory, so 2-grams will have to do for now.
from sklearn.feature_extraction.text import TfidfVectorizer as TFIV
# Initialize the TFIV object: filter rare terms and add a 2-gram language model
tfv = TFIV(min_df=3,
           token_pattern=r'\w{1,}',
           ngram_range=(1, 2),
           sublinear_tf=1)  # still not entirely sure what this token_pattern does..
X_all, len_train = train_data + test_data, len(train_data)
print(len(X_all))  # 50000; train and test sets are combined for the TF-IDF vectorization
X_all = tfv.fit_transform(X_all)  # This step is a bit slow -- go grab a cup of tea; ./l2.py starts right after this step
X, x_test = X_all[:len_train], X_all[len_train:]  # split back into the train and test parts
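
To make the comment block above concrete, here is a hedged sketch of the weight these settings give a single term (scikit-learn's smooth idf plus the sublinear tf set above); the document frequency of 5000 is just an illustrative number.

import math

n_docs, df_t, tf_td = 50000, 5000, 2           # term appears twice in a review and in 5000 of the 50000 documents
idf = math.log((1 + n_docs) / (1 + df_t)) + 1  # smooth_idf: ln((1 + n) / (1 + df)) + 1
weight = (1 + math.log(tf_td)) * idf           # sublinear_tf: tf replaced by 1 + ln(tf)
print(round(weight, 3))                        # value before the final L2 row normalization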
##################################################################
## Multinomial naive Bayes vs. logistic regression
# Now that we have the features it's time to build models; the urge to tinker strikes again, so let's build both a naive Bayes and a logistic regression classifier and compare them.
# "Talk is cheap, I'll show you the code."
from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.model_selection import cross_val_score
import numpy as np
print("多项式贝叶斯分类器 20 折交叉验证得分: ",
      np.mean(cross_val_score(MNB(), X, y_train, cv=20,
Example #20
def train_3(labelList=(), maxLen=None):
    # Read the dataset with pandas
    all_data = pd.read_csv('../data/data_five_labels.csv')
    all_data.columns = ["disease","position","label","title","abstract","text"]
    # Convert the data into word lists
    x_data = []
    y_data = []
    # Label distribution: {'症状': 24697, '病因': 8472, '检查': 4191, '鉴别': 1105, '治疗': 25620, '饮食护理': 10522, '预防': 7246, '并发症': 2200}
    for i in range(0, len(all_data['title'])):
        if all_data["label"][i] in labelList:#剔除数据量不足的标签
            x_data.append(all_data["title"][i])
            y_data.append(all_data["label"][i])
    stopwords = []
    # Build the stop-word list
    with open("../data/stopwords.txt","r",encoding="utf-8") as f:
        lines = f.readlines()
        for line in lines:
            stopwords.append(line.strip())
    # Initialize the TFIV object: remove stop words, use 1- to 3-gram features
    # (max_df=1.0 is a proportion; an integer 1 would drop every term appearing in more than one document)
    tfv = TFIV(min_df=1, max_df=1.0, max_features=maxLen, strip_accents='unicode', analyzer='word',
               token_pattern=r'\b[\u4e00-\u9fa5]\w+\b', stop_words=stopwords, ngram_range=(1, 3))
    x_all = []
    y_all = []
    jieba.load_userdict("../data/diseaseDic.txt")
    jieba.load_userdict("../data/symptomDic.txt")
    print("正在分词、去停...")
    i= 0
    for text in x_data:
        i+=1
        print("第%d篇文章..."%i)
        try:
            x_all.append(" ".join(jieba.cut(seg_sentence(text))))
            y_all.append(y_data[x_data.index(text)])
        except:
            pass
    print("初始标签长度:%d" % len(y_data))
    print("分词去停后标签长度:%d" %len(y_all))
    print("正在抽取特征词...")
    tfv.fit(x_all)
    feature_list = tfv.get_feature_names()
    # Write the feature terms to a txt file
    with open("../data/feature_names_3.txt", "w", encoding="utf-8") as f:
        for x in feature_list:
            f.write(x + "\n")
    print("正在将文档映射为向量...")
    X_all = tfv.transform(x_all)
    X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, random_state=0)
    print("训练...")
    clf = svm.SVC(kernel='linear', C=1).fit(X_train,y_train)
    def calculate_result(actual, pred):
        m_precision = metrics.precision_score(actual, pred,average=None)
        m_recall = metrics.recall_score(actual, pred,average=None)
        accuracy = metrics.accuracy_score(pred, actual)
        print('SVM accuracy on the test set:', accuracy)
        print('SVM precision on the test set:', m_precision)
        print('SVM recall on the test set:', m_recall)
        print('SVM F1 on the test set:', metrics.f1_score(actual, pred, average=None))
        return accuracy

    pred = clf.predict(X_test)
    acc = calculate_result(y_test, pred)
    return acc
Example #21
    # .join() joins the strings in the sequence with the given separator
    print(i)

test_data = []
for i in range(0, len(test['review'])):
    print(i, len(test['review']))
    test_data.append("-".join(review_to_worldlist(test['review'][i])))
endTime = time.perf_counter()  # time.clock() was removed in Python 3.8
print(train_data[0])
print("%d s" % (endTime - startTime))
from sklearn.feature_extraction.text import TfidfVectorizer as TFIV
tfv = TFIV(min_df=3,
           max_features=None,
           strip_accents='unicode',
           analyzer='word',
           token_pattern=r'\w{1,}',
           ngram_range=(1, 2),
           use_idf=1,
           smooth_idf=1,
           sublinear_tf=1,
           stop_words='english')
X_all = train_data + test_data
len_train = len(train_data)

tfv.fit(X_all)
X_all = tfv.transform(X_all)
X = X_all[:len_train]  # split back into the train and test sets
X_test = X_all[len_train:]

from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation has been removed in modern scikit-learn
model_NB = MNB()
Example #22
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer as TFIV
from processing import data_no_features

raw = data_no_features()

tfv = TFIV(min_df=3,
           max_features=None,
           strip_accents='unicode',
           analyzer='word',
           token_pattern=r'\w{1,}',
           ngram_range=(1, 2),
           sublinear_tf=False,
           max_df=0.9)

X_train, X_test, y_train, y_test = train_test_split(raw['x_raw'],
                                                    raw['y'],
                                                    test_size=0.2,
                                                    random_state=42)

lr = LogisticRegression(solver='lbfgs')
lsvc = LinearSVC()
nb = MultinomialNB()
dt = DecisionTreeClassifier()

cp1 = Pipeline([('tfidf', tfv), ('lsvc', lsvc)])
Example #23
def train(maxFeature=None, minDf=0.0, maxDf=1.0, ngram=(1, 1), classifier=svm.SVC(kernel='linear'), index=None):
    # Read the training and test csv files with pandas
    train = pd.read_csv('../data/train.csv')
    test = pd.read_csv('../data/test.csv')
    train.columns = ["disease", "position", "label", "title", "abstract", "text"]
    test.columns = ["disease", "position", "label", "title", "abstract", "text"]
    word_vec = {}
    with open("../data/word_vector.csv", "r", encoding="utf-8") as f:
        for line in csv.reader(f):
            word_vec.update({line[0]: eval(line[1])})
    # Extract the labels
    y_train = train['label']
    y_test = test['label']
    # Convert the training and test data into word lists
    train_data = []
    for i in range(0, len(train['title'])):
        train_data.append(train["title"][i] + str(train["abstract"][i]) + str(train["text"][i]))
        # train_data.append(train["abstract"][i]+train['text'][i])
    test_data = []
    for i in range(0, len(test['title'])):
        test_data.append(test["title"][i] + str(test["abstract"][i]) + str(test["text"][i]))
        # test_data.append(train["abstract"][i]+test['text'][i])

    stopwords = []
    with open("../data/stopwords.txt", "r", encoding="utf-8") as f:
        lines = f.readlines()
        for line in lines:
            stopwords.append(line.strip())
    # Initialize the TFIV object: remove stop words, n-gram range set by the ngram argument
    tfv = TFIV(min_df=minDf, max_df=maxDf, max_features=maxFeature, strip_accents='unicode', analyzer='word',
               token_pattern=r'\b[\u4e00-\u9fa5]\w+\b', ngram_range=ngram,
               stop_words=stopwords)
    # Combine the train and test sets for TF-IDF vectorization
    X_all = train_data + test_data
    tokenized_corpus = []
    # jieba.load_userdict("../data/diseaseDic.txt")
    # jieba.load_userdict("../data/symptomDic.txt")
    print("正在分词、去停...")
    for text in X_all:
        tokenized_corpus.append(" ".join(jieba.cut(seg_sentence(text))))
    len_train = len(train_data)

    # Vectorize all the data; this takes a while
    print("Extracting feature terms...")
    tfv.fit(tokenized_corpus)
    feature_list = tfv.get_feature_names()
    f = open("../data/feature_names_2.txt", "w", encoding="utf-8")
    for x in feature_list:
        f.write(x + "\n")
    f.close()
    print("正在将文档映射为向量...")
    model = gensim.models.KeyedVectors.load_word2vec_format("../data/sgns.wiki.bigram-char", encoding="utf-8")
    X_all = tfv.transform(tokenized_corpus)
    ii = 0  # number of documents processed
    all = []
    maxLen = 0
    # f_weight = open("../data/file_weight_" + index + ".txt", "w", encoding="utf-8")
    for sentence in X_all:
        j = 0  # number of feature terms found in this document
        ii += 1
        print("article num is %d" % ii)
        sentenceWeight = np.array([])
        sentence = sentence.toarray().tolist()[0]
        for i, weight in enumerate(sentence):
            if weight != 0:
                word = feature_list[i]
                if word in word_vec:
                    j += 1
                    wordMatrix = np.array(word_vec[word])
                    # weight the word vector by its tf-idf score and append it to the document vector
                    sentenceWeight = np.concatenate([sentenceWeight, weight * wordMatrix])
        # Each document becomes a concatenation of weighted word vectors; documents have different
        # numbers of feature terms, so shorter ones are zero-padded to the maximum length below
        all.append(sentenceWeight.tolist())
        if maxLen < j:
            maxLen = j
    print("max len:",maxLen)
    for i in range(0, len(all)):
        if len(all[i]) < maxLen * 300:
            all[i] = np.pad(np.array(all[i]), (0, maxLen * 300 - len(all[i])), mode='constant', constant_values=(0, 0)).tolist()

    print("正在训练PCA降维模型...")
    pca = PCA(n_components=500)
    pca.fit(np.array(all))
    print("正在将文档矩阵降维...")
    all_data = pca.fit_transform(np.array(all)).tolist()
    with open("../data/vectors.txt", "w", encoding="utf-8") as file:
        file.write(str(X_all))
    # Split back into the train and test parts
    X = np.array(all_data[:len_train])
    X_test = np.array(all_data[len_train:])

    # Model evaluation helper: writes precision, recall, F1 and accuracy to a record file
    def calculate_result(actual, pred):
        m_precision = metrics.precision_score(actual, pred, average=None)
        m_recall = metrics.recall_score(actual, pred, average=None)
        accuracy = metrics.accuracy_score(pred, actual)
        with open("../data/record_pca.txt", "a", encoding="utf-8") as f:
            f.write('SVM分类模型在测试集上的准确率:' + str(accuracy) + '\n')
            f.write('SVM分类模型在测试集上的准确率:' + str(accuracy) + '\n')
            f.write('SVM分类模型在测试集上的P值:' + str(m_precision) + '\n')
            f.write('SVM分类模型在测试集上的R值:' + str(m_recall) + '\n')
            f.write('SVM分类模型在测试集上的F1值:' + str(metrics.f1_score(actual, pred, average=None)) + '\n')
        return accuracy

    # Train the SVM classifier
    print("Training the classifier...")
    svclf = classifier
    svclf.fit(X, y_train)
    # Save the model
    joblib.dump(svclf, "P_svm_train_model.m")
    print("正在预测...")
    pred = svclf.predict(X_test)
    acc = calculate_result(y_test, pred)
    return acc
Example #24
            df = pd.concat([df, pd.DataFrame([[txt, labels[l]]])], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
            pbar.update()
df.columns = ['article' , 'author']
feature_set = df.loc[:,:].values


### Step 2: Data split
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(df['article'], df['author'], test_size=0.2, random_state=42)

### Step 3: tf-idf (term frequency - inverse document frequency) processing

from sklearn.feature_extraction.text import TfidfVectorizer as TFIV

get_stop_words = TFIV(max_features=None,
        strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
        use_idf=1, smooth_idf=1, sublinear_tf=1,
        stop_words='english')

my_stop_words = set(get_stop_words.get_stop_words())
my_stop_words.add("link")

tfv = TFIV(max_features=None,
        strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
        use_idf=1, smooth_idf=1, sublinear_tf=1,
        stop_words=my_stop_words)

my_stop_words = tfv.get_stop_words()
tfid_train = tfv.fit_transform(features_train)
tfid_test = tfv.transform(features_test)
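
The snippet ends here; a hedged continuation (not from the original project) could train any classifier that accepts sparse input on tfid_train and check it on tfid_test, for example:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

clf = LogisticRegression(solver='lbfgs', max_iter=1000)
clf.fit(tfid_train, labels_train)
print(accuracy_score(labels_test, clf.predict(tfid_test)))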