def get_train_data(self):
    """Fit the SVM pipeline on the VnExpress training corpus.

    Reads `filtered_data/vnexpress.csv` (UTF-8), trains on its
    `content` column against the `label` column, and returns the
    fitted classifier.
    """
    # BUG FIX: the original passed an open codecs handle to read_csv
    # and never closed it; `with` guarantees the file is released.
    with codecs.open('filtered_data/vnexpress.csv', 'r', 'utf-8') as f:
        df_train = pd.read_csv(f)
    model = SVMModel()
    clf = model.clf.fit(df_train["content"], df_train.label)
    return clf
def get_train_data(self): train_data = [] fr_train = open('generated_files/cleanDataTrainVi.txt') for line in iter(fr_train.readline, ''): string_feature = unicode(line.rstrip(), "utf-8") string_target = unicode(fr_train.readline().rstrip(), "utf-8)") train_data.append({ "feature": string_feature, "target": string_target }) df_train = pd.DataFrame(train_data) #test data test_data = [] accuracy = [] fr_test = open('generated_files/cleanDataTestVi.txt') for line in iter(fr_test.readline, ''): string_feature = unicode(line.rstrip(), "utf-8") string_target = unicode(fr_test.readline().rstrip(), "utf-8)") accuracy.append(string_target) test_data.append({ "feature": string_feature, "target": string_target }) df_test = pd.DataFrame(test_data) model = SVMModel() clf = model.clf.fit(df_train["feature"], df_train.target) predicted = clf.predict(df_test["feature"]) # Print predicted result h = 0 for i in range(1, len(accuracy)): if predicted[i - 1] == accuracy[i - 1]: h += 1 print '%f' % (h / float(i)) print clf.predict_proba(df_test["feature"]) while True: raw = raw_input("nhap gi do:") decoded = raw.decode("utf-8") test = [] test.append({"feature": decoded, "target": u'HOTEL'}) test_df = pd.DataFrame(test) print(clf.predict(test_df["feature"]))
def predict(self, input):
    """Predict the intent of *input* with the pickled SVM pipeline.

    Returns a tuple ``(predicted, probability)`` where *predicted* is
    the array of predicted labels for the single-row frame and
    *probability* is the class-probability vector of that row.
    """
    # Wrap the single utterance in a one-row frame; the "target" value
    # is a placeholder and never influences prediction.
    df_test = pd.DataFrame([{"feature": input, "target": "hoi_thoi_tiet"}])
    # Load the model trained and saved by train_data().
    # (Removed an unused `SVMModel()` instantiation; the classifier
    # comes entirely from the pickle.)
    filename = 'svm_model.sav'
    with open(filename, 'rb') as f:  # `with` closes the leaked handle
        clf = pickle.load(f)
    predicted = clf.predict(df_test["feature"])
    probability = clf.predict_proba(df_test["feature"])[0]
    return predicted, probability
def get_train_data(self): # train data train_data = [] train_data.append({ "feature": u"Hôm nay trời đẹp không ?", "target": "hoi_thoi_tiet" }) train_data.append({ "feature": u"Hôm nay thời tiết thế nào ?", "target": "hoi_thoi_tiet" }) train_data.append({ "feature": u"Hôm nay mưa không ?", "target": "hoi_thoi_tiet" }) train_data.append({"feature": u"Chào em gái", "target": "chao_hoi"}) train_data.append({"feature": u"Chào bạn", "target": "chao_hoi"}) train_data.append({"feature": u"Hello bạn", "target": "chao_hoi"}) train_data.append({"feature": u"Hi kimi", "target": "chao_hoi"}) train_data.append({"feature": u"Hi em", "target": "chao_hoi"}) df_train = pd.DataFrame(train_data) # test data test_data = [] test_data.append({ "feature": u"Nóng quá, liệu mưa không em ơi?", "target": "hoi_thoi_tiet" }) df_test = pd.DataFrame(test_data) # init model naive bayes model = SVMModel() clf = model.clf.fit(df_train["feature"], df_train.target) predicted = clf.predict(df_test["feature"]) # Print predicted result print predicted print clf.predict_proba(df_test["feature"])
def train_data(self):
    """Train the news-category SVM from the JSON corpora and pickle it.

    Each data file holds one JSON object per line with 'title', 'sapo'
    and 'text' fields; every file maps to exactly one category label.
    The fitted pipeline is written to 'svm_model.sav' for predict().
    """
    # (filename, label) pairs — each file is one category. The original
    # repeated the identical read/append code eight times.
    sources = [
        ('data/giaitri.json', 'giai_tri'),
        ('data/giaoduc.json', 'giao_duc'),
        ('data/kinhdoanh.json', 'kinh_doanh'),
        ('data/phapluat-tintuc.json', 'phap_luat_tin_tuc'),
        ('data/thegioi.json', 'the_gioi'),
        ('data/thethao.json', 'the_thao'),
        ('data/thoisu.json', 'thoi_su'),
        ('data/tuvan.json', 'tu_van'),
    ]
    train_data = []
    for path, target in sources:
        # `with` closes each handle (the originals were all leaked).
        with open(path, 'r', encoding='utf-8-sig') as fh:
            for line in fh:
                data = json.loads(line)
                title = str(data['title']).strip()
                sapo = str(data['sapo']).strip()
                text = str(data['text']).strip()
                # Title and sapo appear twice to up-weight them relative
                # to the body text, exactly as in the original.
                feature = ' '.join([title, sapo, title, sapo, text])
                train_data.append({"feature": feature, "target": target})

    df_train = pd.DataFrame(train_data)
    model = SVMModel()
    clf = model.clf.fit(df_train["feature"], df_train.target)

    # Persist the fitted pipeline.
    with open('svm_model.sav', 'wb') as out:
        pickle.dump(clf, out)
def get_train_data(self):
    """Train four SVM text classifiers over cleaned room names —
    master room type, view, bed type and bed — print each model's
    held-out accuracy, and pickle every fitted model next to this
    module.

    Removed dead code from the original: an unused `cm.Common()`
    instance, an unused crosstab, an unused `predict_proba` result and
    an unused trailing result frame.
    """
    clean = TextClassificationPredict.clean_text

    # Training rows plus a master file that is only printed for
    # inspection (presumably a reference list — TODO confirm).
    train_data = TextClassificationPredict.readCSV("people.csv")
    checkdata = TextClassificationPredict.readCSV("peoplemaster.csv")
    print(checkdata)

    df_train = pd.DataFrame(train_data)
    check_train = pd.DataFrame(checkdata)  # was misspelled `chectrain`
    df_train['category_id'] = df_train['master_room_type'].factorize()[0]
    df_train['room_name'] = df_train["room_name"].apply(clean)
    check_train['room_name'] = check_train["room_name"].apply(clean)

    here = os.path.abspath(os.path.dirname(__file__))

    def _fit_report_save(frame, target_col, tag, pkl_name):
        """Split 80/20, fit one SVM on room_name, print its accuracy
        under *tag* and pickle the model as *pkl_name*."""
        tr, te, _, te_labels = train_test_split(
            frame, frame[target_col], test_size=0.2, random_state=10)
        clf = SVMModel().clf.fit(tr["room_name"], tr[target_col])
        # room_name is cleaned again at predict time, mirroring the
        # original (a harmless double-clean on already-clean text).
        predicted = clf.predict(te['room_name'].apply(clean))
        print('%s %s' % (tag, accuracy_score(predicted, te_labels)))
        TextClassificationPredict.save_model(
            os.path.join(here, pkl_name), clf)

    # Rows labelled "Other" are excluded for the secondary attributes,
    # as in the original drop-by-index filtering.
    _fit_report_save(df_train, 'master_room_type',
                     'accuracy', 'x_transformer.pkl')
    _fit_report_save(df_train.drop(df_train[df_train['view'] == "Other"].index),
                     'view', 'accuracyView', 'x_transformerView.pkl')
    _fit_report_save(df_train.drop(df_train[df_train['bedType'] == "Other"].index),
                     'bedType', 'accuracyBedType', 'x_transformerBedType.pkl')
    # NOTE(review): the bed model's pickle was named "ViewBed" in the
    # original; the name is preserved so existing loaders keep working.
    _fit_report_save(df_train.drop(df_train[df_train['bed'] == "Other"].index),
                     'bed', 'accuracyBed', 'x_transformerViewBed.pkl')
def set_classification_model(self, type=SVM):
    """Select the classification backend.

    Only the SVM backend is implemented; any other *type* value is
    silently ignored and leaves ``self.model`` untouched.
    """
    if type != SVM:
        return
    self.model = SVMModel()
class Pipeline:
    """End-to-end sentence-pair classification pipeline.

    On construction it loads the train/test pair sets, picks the
    classifier, configures the feature extractors and extracts the
    features and labels of both sets. Call train_classification_model()
    then test_classification_model() / predict() afterwards.
    """

    def __init__(self):
        self.set_trainset_directory()
        self.set_testset_directory()
        self.set_classification_model()
        self.set_features()
        self.extract_train_features()
        self.extract_train_labels()
        self.extract_test_features()
        self.extract_test_labels()

    def set_trainset_directory(self, directory="../data/train_pairs.csv"):
        """Load the training sentence pairs from *directory*."""
        self.trainset = DataSet(directory=directory)
        self.trainset.read_data()

    def set_testset_directory(self, directory="../data/test_pairs.csv"):
        """Load the test sentence pairs from *directory*."""
        self.testset = DataSet(directory=directory)
        self.testset.read_data()

    def set_classification_model(self, type=SVM):
        """Choose the classifier backend; only SVM is implemented."""
        if type == SVM:
            self.model = SVMModel()

    def set_features(self, features=[SIMPLE_MATCHING, LAVENSHTEIN_DISTANCE,
                                     ROUGE_S, CONSECUTIVE_SUBSEQUENCE_MATCHING,
                                     TRI_GRAM_CHARACTER]):
        """Instantiate one extractor per requested feature id.

        The default list is read-only here, so the mutable-default
        pitfall does not bite; kept as-is for interface compatibility.
        """
        self.feature_extractors = []
        if SIMPLE_MATCHING in features:
            self.feature_extractors.append(SimpleMatchingExtractor())
        if LAVENSHTEIN_DISTANCE in features:
            self.feature_extractors.append(LavenshteinExtractor())
        if ROUGE_S in features:
            self.feature_extractors.append(RougeSExtractor())
        if CONSECUTIVE_SUBSEQUENCE_MATCHING in features:
            self.feature_extractors.append(ConsecutiveSubsequenceMatchingExtractor())
        if TRI_GRAM_CHARACTER in features:
            self.feature_extractors.append(TriGramCharacterExtractor())

    def extract_features(self, dataset: DataSet):
        """Return the feature matrix for *dataset*.

        For every sentence pair, each configured extractor is applied to
        the token, stem and lemma views of the pair (3 values per
        extractor per pair).
        """
        # Build the pre-processors once instead of once per pair, as the
        # original did inside the loop.
        tokenizer = WordTokenizePreProcessor()
        pos_tagger = PosTagPreProcessor()
        stemmer = StemPreProcessor()
        lemmatizer = LemmaPreProcessor()
        X = []
        for sentence_pair in dataset.sentence_pairs:
            x = []
            token_pair = ProcessSentencePair(
                tokenizer.transform(sentence_pair.text),
                tokenizer.transform(sentence_pair.hypothesis))
            pos_pair = ProcessSentencePair(
                pos_tagger.transform(sentence_pair.text, token_pair.text),
                pos_tagger.transform(sentence_pair.hypothesis, token_pair.hypothesis))
            stem_pair = ProcessSentencePair(
                stemmer.transform(sentence_pair.text, token_pair.text),
                stemmer.transform(sentence_pair.hypothesis, token_pair.hypothesis))
            # BUG FIX: the hypothesis side was lemmatized from the raw
            # tokens while the text side used the POS-tagged view; both
            # sides now use POS tags. Confirm LemmaPreProcessor indeed
            # expects the POS view on both sides.
            lemma_pair = ProcessSentencePair(
                lemmatizer.transform(sentence_pair.text, pos_pair.text),
                lemmatizer.transform(sentence_pair.hypothesis, pos_pair.hypothesis))
            for feature_extractor in self.feature_extractors:
                x.append(feature_extractor.transform(sentence_pair, token_pair))
                x.append(feature_extractor.transform(sentence_pair, stem_pair))
                x.append(feature_extractor.transform(sentence_pair, lemma_pair))
            X.append(x)
        return X

    def extract_labels(self, dataset: DataSet):
        """Return the gold label of every pair in *dataset*."""
        return [sentence_pair.label for sentence_pair in dataset.sentence_pairs]

    def extract_train_features(self):
        self.train_features = self.extract_features(dataset=self.trainset)

    def extract_train_labels(self):
        self.train_labels = self.extract_labels(dataset=self.trainset)

    def extract_test_features(self):
        self.test_features = self.extract_features(dataset=self.testset)

    def extract_test_labels(self):
        self.test_labels = self.extract_labels(dataset=self.testset)

    def train_classification_model(self):
        """Fit the model on the extracted training features."""
        self.model.fit(self.train_features, self.train_labels)

    def test_classification_model(self):
        """Predict on the test set and print its accuracy."""
        self.test_predicts = self.model.transform(self.test_features)
        print(accuracy_score(self.test_labels, self.test_predicts))

    def predict(self, dataset: DataSet):
        """Extract features for *dataset* and return model predictions."""
        X = self.extract_features(dataset=dataset)
        return self.model.transform(X)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Load and merge the two training corpora. Using read_csv's encoding=
# parameter instead of a never-closed codecs handle, and pd.concat
# instead of DataFrame.append (removed in pandas 2.0).
data = pd.read_csv('../train_data/data_dantri.csv', encoding='utf-8')
data = pd.concat(
    [data, pd.read_csv('../train_data/vnexpress.csv', encoding='utf-8')],
    ignore_index=True)

# Shuffle, then cap each label's sample count (cumcount <= 2000 keeps
# up to 2001 rows per label, matching the original's behavior).
data = data.loc[data.sample(frac=1).groupby('label').cumcount() <= 2000]

X, y = data.content, data.label
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

# Train the RBF-kernel SVM and report wall-clock training time.
start = time.time()
model = SVMModel('rbf')
clf = model.clf.fit(X_train, y_train)
print(time.time() - start)

# Persist the fitted model.
pkl_filename = "svm_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(clf, file)

# Evaluate on a held-out corpus from a third news site.
# NOTE(review): the original path lacked the '../' prefix used by the
# other two corpora; normalized to match — confirm working directory.
data = pd.read_csv('../train_data/vietnamnet.csv', encoding='utf-8')
X_test, y_test = data.content, data.label
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))