def scope():
    """Build fixed-size feature vectors for the training and test tweet sets.

    Returns a tuple ``(label, data, tdata)``: the training labels, the
    feature vectors for the training tweets, and the feature vectors for
    the test tweets.  Large intermediates are released eagerly to keep
    peak memory down.
    """
    train = trainData(threshold=20)
    test = testData()
    label, raw = train.getLabelsAndrawData()

    # Process the raw tweets: collect vocabulary/emoji/hashtag statistics,
    # then produce fixed-length vectors for both data sets.
    ext = extractor()
    ext.loadCacheFile()
    ext.highFrequencyTokens(label, raw)
    ext.extractEmoji(raw)
    ext.extractHashTags(raw)
    data = ext.batchProduceFixFeatureVec(raw)
    tdata = ext.batchProduceFixFeatureVec(test.getAllTweets())

    # Free the heavyweight objects before returning.
    train.unloadData()
    ext.saveCacheFiles()
    # ext.unloadExt()
    del ext
    del raw
    # NOTE: getsizeof is shallow — it does not follow references into the
    # contained vectors, so this understates the real footprint.
    print(getsizeof(data))
    print("clean trash...")
    # malloc_trim()
    return label, data, tdata
# NOTE(review): the two statements below are the truncated tail of a function
# (it ended with ``return new_texts``) whose ``def`` line is outside this
# chunk.  A bare ``return`` cannot stand at module level, so the tail is
# preserved here as comments — restore it inside its original function:
#     print('break')
#     return new_texts

# --- Train and evaluate a classifier on TF-IDF features of raw tweets ---
td = trainData()
label, rawData = td.getLabelsAndrawData()

# Hold out 25% of the labelled tweets for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    rawData, label, test_size=0.25, random_state=32
)

t = tokenize(X_train)
vec = TfidfVectorizer(min_df=20, max_df=1000, stop_words='english')
X = vec.fit_transform(t).toarray()
print(X)
print("break")

clf = RandomForestClassifier(n_estimators=100, n_jobs=4, verbose=2)
# clf = LinearSVC(verbose=2)
# clf = MultinomialNB()
# clf = LogisticRegression(verbose=1,n_jobs=4,solver='sag')
clf.fit(X, y_train)

# NOTE(review): this result is never used afterwards and clobbers the
# trainData handle; kept only for behavioural parity with the original.
td = testData().getAllTweets()

# Vectorize the held-out tweets with the *fitted* vocabulary and evaluate.
t = tokenize(X_test)
test_x = vec.transform(t).toarray()
pre = clf.predict(test_x)
print(confusion_matrix(y_test, pre))
print(accuracy_score(y_test, pre))
import pickle


def predictUsingSVMModel(lines):
    """Classify each tweet in *lines* with the pickled SVM model.

    Writes ``output.csv`` with an ``Id,Predicted`` header followed by one
    ``<id>,<label>`` row per input tweet, printing progress as it goes.

    Parameters
    ----------
    lines : sequence of raw tweet strings to classify.
    """
    # Vectorize all of the test data once, up front.
    ext = extractor()
    vectors = ext.batchToVector(lines, usr_flag=False)

    # SECURITY NOTE: pickle.load executes arbitrary code from the file;
    # only load model files from a trusted source.
    with open("svmModel.pkl", 'rb') as model_file:
        model = pickle.load(model_file)

    total = len(vectors)
    # ``with`` guarantees the output file is closed even if predict()
    # raises (the original leaked the handle and shadowed the builtins
    # ``list`` and ``id``).
    with open("output.csv", 'w', encoding='utf-8') as output:
        output.write("Id,Predicted\n")
        for row_id, row in enumerate(vectors, start=1):
            ans = model.predict([row])
            output.write(str(row_id) + ',' + ans[0] + '\n')
            print("progress: " + str(row_id * 100 / total) + "%")


# load testdata
t = testData()
data = t.getAllTweets()
predictUsingSVMModel(data)