Example #1
from sys import getsizeof

from fileReader import trainData, testData
from featureExtractor import extractor


def scope():
    td = trainData(threshold=20)
    testd = testData()
    label, raw = td.getLabelsAndrawData()

    # process data: extract features from the raw tweets
    ext = extractor()

    ext.loadCacheFile()
    ext.highFrequencyTokens(label, raw)
    ext.extractEmoji(raw)
    ext.extractHashTags(raw)
    data = ext.batchProduceFixFeatureVec(raw)
    tdata = ext.batchProduceFixFeatureVec(testd.getAllTweets())
    td.unloadData()
    ext.saveCacheFiles()
    # ext.unloadExt()
    del ext
    del raw

    print(getsizeof(data))
    print("clean trash...")

    # malloc_trim()

    return label, data, tdata
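The commented-out malloc_trim() line hints at returning freed heap pages to the OS after the dels. A minimal sketch of how a caller might do that on Linux with glibc via ctypes; the libc name and its availability are assumptions, not part of the original example:

import ctypes
import gc

label, data, tdata = scope()

# Collect garbage first, then ask glibc to release free heap pages back
# to the OS. Assumes Linux with glibc exposing malloc_trim via libc.so.6.
gc.collect()
try:
    ctypes.CDLL("libc.so.6").malloc_trim(0)
except OSError:
    pass  # non-glibc platform; nothing to trim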
Example #2
from fileReader import trainData, testData
from featureExtractor import extractor
from sklearn import svm
import pickle

# create training-data reader and feature extractor
train = trainData(threshold=10)
e = extractor()

# read training data
label, raw = train.getLabelsAndrawData()

data = e.batchToVector(raw, usr_flag=False)

# clear training data to save memory
train.unloadData()

# create svm model
print("init model....")
clf = svm.SVC(gamma='scale', verbose=False)

print("training model.....")
# train model
clf.fit(data, label)

print("finished training!!!!")
# save model
print("saving model...")
with open("svmModel.pkl", 'wb') as f:
    pickle.dump(clf, f)
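For completeness, a minimal sketch of reloading the pickled classifier and scoring new input; it assumes the same extractor configuration used at training time, and raw_tweets is a hypothetical placeholder:

import pickle

from featureExtractor import extractor

with open("svmModel.pkl", 'rb') as f:
    clf = pickle.load(f)

# Features must come from the same pipeline that produced the training vectors.
e = extractor()
raw_tweets = ["an example tweet to classify"]  # hypothetical input
vectors = e.batchToVector(raw_tweets, usr_flag=False)
print(clf.predict(vectors))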
Example #3
import re

import nltk
from nltk.stem import PorterStemmer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from fileReader import trainData


def tokenize(texts):
    # Strip URLs, tokenize, and keep Porter stems of tokens longer than 4 chars.
    stemmer = PorterStemmer()
    new_texts = []
    for text in texts:
        text = re.sub(
            r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
            "", text)
        tokens = nltk.word_tokenize(text)
        stems = []
        for item in tokens:
            if len(item) > 4:
                stems.append(stemmer.stem(item))
        text = " ".join(stems)
        new_texts.append(text)
    return new_texts


td = trainData()
label, rawData = td.getLabelsAndrawData()

X_train, X_test, y_train, y_test = train_test_split(rawData,
                                                    label,
                                                    test_size=0.25,
                                                    random_state=32)

t = tokenize(X_train)
vec = TfidfVectorizer(min_df=20, max_df=1000, stop_words='english')
X = vec.fit_transform(t).toarray()
print(X)

clf = RandomForestClassifier(n_estimators=100, n_jobs=4, verbose=2)
#clf = LinearSVC(verbose=2)
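The snippet stops after constructing the classifier. A minimal sketch of the likely next steps, assuming the held-out split must pass through the same tokenize step and the already-fitted vectorizer (transform, not fit_transform):

from sklearn.metrics import classification_report

clf.fit(X, y_train)

# Vectorize the test split with the fitted TF-IDF vocabulary.
X_test_vec = vec.transform(tokenize(X_test)).toarray()
print(classification_report(y_test, clf.predict(X_test_vec)))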
Example #4
import pandas as pd
import numpy as np
from fileReader import trainData
from sklearn.model_selection import train_test_split
from featureExtractor import extractor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import GaussianNB

td = trainData(threshold=50)
label, raw = td.getLabelsAndrawData()

# process data: extract fixed-size feature vectors from the raw tweets
ext = extractor()
data = ext.batchProduceFixFeatureVec(raw)
td.unloadData()

X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.2, random_state=0)

#gnb = GaussianNB()
#gnb.fit(X_train, y_train)
#y_pred = gnb.predict(X_test)



# train a random forest classifier on the extracted features
clf = RandomForestClassifier(n_estimators=50, criterion='entropy', min_samples_split=40, verbose=2)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# print output
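The example cuts off at the final comment. Given the metrics imported at the top, the evaluation step was presumably along these lines (a sketch, not confirmed by the source):

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("accuracy:", accuracy_score(y_test, y_pred))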