"""Train a Doc2Vec model on the pre-trimmed review corpus and save it."""
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from utils import load_reviews
import time

# --- Load and tokenize the review corpus ---
start = time.time()
filePath = '../../corpus/100k/allTrimed.csv'
_, reviews = load_reviews(filePath)
# Whitespace tokenization: the corpus is assumed pre-segmented — TODO confirm.
reviews = [review.split() for review in reviews]
cost = time.time() - start
print(f'Loading reviews cost: {cost:.4f} Sec')

# --- Train and persist the Doc2Vec model ---
start = time.time()
# Each review becomes a TaggedDocument keyed by its corpus index.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(reviews)]
model = Doc2Vec(documents, vector_size=20, window=2, min_count=5, epochs=10)
# Doc2Vec.save_word2vec_format() was deprecated in gensim 3.x and removed
# in 4.x; export the word vectors through the KeyedVectors attribute.
model.wv.save_word2vec_format('./data/d2v.txt')
model.save('./data/d2v.model')
cost = time.time() - start
print(f'Training model cost: {cost:.4f} Sec')
"""Load, shuffle and vectorize the review corpus for a Naive Bayes model."""
import time

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from utils import load_reviews, data_suffle

csvFilePath = '../../corpus/100k/allTrimed.csv'
modelPath = './data/bayes.model'

# --- Load and shuffle the labelled corpus ---
time_start = time.time()
labels, reviews = load_reviews(csvFilePath)
labels, reviews = data_suffle(labels, reviews)

# Hold out 1/5 of the data as the test split.
# (The original comment said 1/4, but n = len // 5 — the code wins.)
n = len(labels) // 5
labels_train, reviews_train = labels[n:], reviews[n:]
labels_test, reviews_test = labels[:n], reviews[:n]
print(f'Load Corpus Cost {time.time() - time_start:.4f} Sec')
print(reviews[:5], type(reviews), type(reviews[0]))

# --- Fit the bag-of-words vectorizer on the training reviews ---
time_start = time.time()
vectorizer = CountVectorizer()
# np.str_ coerces each review to a string; reviews are presumably already
# str (see the print above) — TODO confirm and simplify to str(review).
vec_train = vectorizer.fit_transform(
    [np.str_(review) for review in reviews_train])
# Set up the Korean morphological analyzer.
tagger = Mecab()
# Compile the part-of-speech filter pattern, case-insensitively.
exp = re.compile(POS, re.IGNORECASE)
# Read the sentiment word dictionary into memory.
bag = utils.load_dictionary()

# Reuse a previously trained model when one exists on disk; otherwise
# train a fresh SVM from the sample reviews and persist it for next time.
try:
    with open("../Resources/models/model", "rb") as model_file:
        model = pickle.load(model_file)
except IOError:
    # No saved model — build one from the labelled training reviews.
    train_review = utils.load_reviews("../Resources/samples/train_data")
    # Turn raw reviews into feature vectors plus their labels.
    train_data, train_label = feature_data(tagger, exp, bag, train_review)
    model = SVM()
    model.train(train_data, train_label)
    # Cache the freshly trained model so future runs skip training.
    with open("../Resources/models/model", "wb") as model_file:
        pickle.dump(model, model_file)
else:
    print("use saved model..")

# Read the held-out test reviews for evaluation.
test_review = utils.load_reviews("../Resources/samples/test_data")
# Set up the Korean morphological analyzer.
tagger = Mecab()
# Compile the part-of-speech filter pattern, case-insensitively.
exp = re.compile(POS, re.IGNORECASE)
# Read the sentiment word dictionary into memory.
bag = utils.load_dictionary()

# Reuse a previously trained model when one exists on disk; otherwise
# train a fresh SVM from the sample reviews and persist it for next time.
try:
    with open("./models/model", "rb") as model_file:
        model = pickle.load(model_file)
except IOError:
    # No saved model — build one from the labelled training reviews.
    train_review = utils.load_reviews("./samples/train_data")
    # Turn raw reviews into feature vectors plus their labels.
    train_data, train_label = feature_data(tagger, exp, bag, train_review)
    model = SVM()
    model.train(train_data, train_label)
    # Cache the freshly trained model so future runs skip training.
    with open("./models/model", "wb") as model_file:
        pickle.dump(model, model_file)
else:
    print("use saved model..")

# NOTE(review): the test-review load is disabled in this variant —
# confirm a later (unseen) section supplies test_review before enabling.
# test_review = utils.load_reviews("./samples/test_data")