from sys import getsizeof

from fileReader import trainData, testData
from featureExtractor import extractor


def scope():
    # load training and test data
    td = trainData(threshold=20)
    testd = testData()
    label, raw = td.getLabelsAndrawData()

    # process data: build the extractor's token statistics from the raw tweets
    ext = extractor()
    ext.loadCacheFile()
    ext.highFrequencyTokens(label, raw)
    ext.extractEmoji(raw)
    ext.extractHashTags(raw)

    # produce fixed-length feature vectors for the training and test tweets
    data = ext.batchProduceFixFeatureVec(raw)
    tdata = ext.batchProduceFixFeatureVec(testd.getAllTweets())

    # release raw data and persist the extractor caches to free memory
    td.unloadData()
    ext.saveCacheFiles()
    #ext.unloadExt()
    del ext
    del raw
    print(getsizeof(data))
    print("clean trash...")
    #malloc_trim()

    return label, data, tdata
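# Minimal usage sketch for scope() (an illustrative assumption, not part of the
# original script): fit a classifier on the returned training features and
# predict labels for the held-out test tweets.
from sklearn import svm

label, data, tdata = scope()

clf = svm.SVC(gamma='scale')
clf.fit(data, label)            # train on the extracted feature vectors
test_pred = clf.predict(tdata)  # predict labels for the test tweets
print(test_pred[:10])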
from fileReader import trainData, testData
from featureExtractor import extractor
from sklearn import svm
import pickle

# create trainingData and feature extractor
train = trainData(threshold=10)
e = extractor()

# read training data
label, raw = train.getLabelsAndrawData()
data = e.batchToVector(raw, usr_flag=False)

# clear training data for memory saving
train.unloadData()

# create svm model
print("init model....")
clf = svm.SVC(gamma='scale', verbose=False)

# train model
print("training model.....")
clf.fit(data, label)
print("finished training!!!!")

# save model
print("saving model...")
with open("svmModel.pkl", 'wb') as f:
    pickle.dump(clf, f)
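# Inference sketch (assumed usage, not in the original script): reload the
# pickled SVM and score new tweets through the same feature pipeline. The
# tweet list below is purely illustrative.
import pickle

from featureExtractor import extractor

with open("svmModel.pkl", 'rb') as f:
    model = pickle.load(f)

e = extractor()
new_tweets = ["example tweet to classify"]  # hypothetical input
features = e.batchToVector(new_tweets, usr_flag=False)
print(model.predict(features))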
import re

import nltk
from nltk.stem import PorterStemmer
from fileReader import trainData
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC


def tokenize(texts):
    # strip URLs, tokenize, and keep Porter stems of tokens longer than 4 characters
    new_texts = []
    for text in texts:
        text = re.sub(
            r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
            "", text)
        tokens = nltk.word_tokenize(text)
        stems = []
        for item in tokens:
            if len(item) > 4:
                stems.append(PorterStemmer().stem(item))
        space = " "
        text = space.join(stems)
        new_texts.append(text)
    print('break')
    return new_texts


td = trainData()
label, rawData = td.getLabelsAndrawData()
X_train, X_test, y_train, y_test = train_test_split(rawData, label, test_size=0.25, random_state=32)

# build TF-IDF features from the stemmed training texts
t = tokenize(X_train)
vec = TfidfVectorizer(min_df=20, max_df=1000, stop_words='english')
X = vec.fit_transform(t).toarray()
print(X)
print("break")

clf = RandomForestClassifier(n_estimators=100, n_jobs=4, verbose=2)
#clf = LinearSVC(verbose=2)
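# Training/evaluation sketch (assumed continuation, not in the original
# fragment): fit the random forest on the TF-IDF matrix and score the
# held-out split using the same tokenizer and the fitted vectorizer.
from sklearn.metrics import accuracy_score

clf.fit(X, y_train)

X_test_vec = vec.transform(tokenize(X_test)).toarray()
y_pred = clf.predict(X_test_vec)
print(accuracy_score(y_test, y_pred))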
import pandas as pd
import numpy as np
from fileReader import trainData
from sklearn.model_selection import train_test_split
from featureExtractor import extractor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import GaussianNB

td = trainData(threshold=50)
label, raw = td.getLabelsAndrawData()

# process data
ext = extractor()
data = ext.batchProduceFixFeatureVec(raw)
td.unloadData()

X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.2, random_state=0)

#gnb = GaussianNB()
#gnb.fit(X_train, y_train)
#y_pred = gnb.predict(X_test)

# train model
regressor = RandomForestClassifier(n_estimators=50, criterion='entropy', min_samples_split=40, verbose=2)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# print output
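# Evaluation sketch (an assumed completion of the "# print output" step,
# using only the metrics already imported above): report the confusion
# matrix, per-class scores, and overall accuracy on the held-out split.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))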