Ejemplo n.º 1
0
def scope():
    """Build feature vectors for the training data and the test tweets.

    Loads the training set (``threshold=20``) and the test set, runs the
    extractor's preparation passes over the raw training data, and then
    converts both the raw training data and the test tweets into feature
    vectors. The extractor cache is loaded before the passes and saved once
    both sets have been vectorized.

    Returns:
        A ``(label, data, tdata)`` tuple: the training labels, the training
        feature vectors and the test feature vectors.
    """
    train_set = trainData(threshold=20)
    test_set = testData()
    label, raw_data = train_set.getLabelsAndrawData()

    # Prepare the extractor on the training data. The order of these passes
    # is kept as-is; presumably they populate state that the vectorization
    # below relies on -- confirm against the extractor implementation.
    feature_ext = extractor()
    feature_ext.loadCacheFile()
    feature_ext.highFrequencyTokens(label, raw_data)
    feature_ext.extractEmoji(raw_data)
    feature_ext.extractHashTags(raw_data)

    # Vectorize the training data first, then the test tweets.
    data = feature_ext.batchProduceFixFeatureVec(raw_data)
    tdata = feature_ext.batchProduceFixFeatureVec(test_set.getAllTweets())

    train_set.unloadData()
    feature_ext.saveCacheFiles()

    # Drop the local references to the extractor and the raw data; only the
    # labels and the two feature sets are needed from here on.
    del feature_ext, raw_data

    print(getsizeof(data))
    print("clean trash...")

    return label, data, tdata
Ejemplo n.º 2
0
        print('break')
    return new_texts


# Script: train a classifier on TF-IDF features of the labelled data and
# report its performance on a held-out split.

# Load the labelled training set.
td = trainData()
label, rawData = td.getLabelsAndrawData()

# Hold out 25% of the labelled data for evaluation; the fixed random_state
# keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(rawData,
                                                    label,
                                                    test_size=0.25,
                                                    random_state=32)

# Fit the vectorizer on the training split only, then convert the result to
# a dense array (presumably from a sparse matrix -- this can be large).
t = tokenize(X_train)
vec = TfidfVectorizer(min_df=20, max_df=1000, stop_words='english')
X = vec.fit_transform(t).toarray()
print(X)
print("break")

# Active classifier; the commented-out lines are alternatives kept for
# experimentation.
clf = RandomForestClassifier(n_estimators=100, n_jobs=4, verbose=2)
#clf = LinearSVC(verbose=2)
#clf = MultinomialNB()
#clf = LogisticRegression(verbose=1,n_jobs=4,solver='sag')
clf.fit(X, y_train)
# NOTE(review): the test tweets loaded here are never used below, and this
# rebinds `td`, which previously held the trainData object. Evaluation uses
# the held-out split (X_test / y_test) instead -- confirm whether this load
# is still needed.
td = testData().getAllTweets()
# Reuse the already-fitted vectorizer (transform, not fit_transform) so the
# held-out split is encoded with the training vocabulary.
t = tokenize(X_test)
test_x = vec.transform(t).toarray()
pre = clf.predict(test_x)

# Report results on the held-out split.
print(confusion_matrix(y_test, pre))
print(accuracy_score(y_test, pre))
Ejemplo n.º 3
0
import pickle


def predictUsingSVMModel(lines):
    """Classify ``lines`` with the pickled model and write ``output.csv``.

    The input is vectorized with ``extractor.batchToVector`` and every vector
    is passed, one at a time, to the model unpickled from ``svmModel.pkl``.
    The CSV starts with an ``Id,Predicted`` header followed by one row per
    input, with ids starting at 1. A progress percentage is printed after
    each row.

    Args:
        lines: Test inputs handed to ``extractor.batchToVector``.

    Raises:
        OSError: If ``output.csv`` cannot be written or ``svmModel.pkl``
            cannot be read.
    """
    # The context manager guarantees the CSV is flushed and closed, even if
    # vectorization, unpickling or a prediction raises part-way through.
    with open("output.csv", 'w', encoding='utf-8') as output:
        output.write("Id,Predicted\n")

        # Vectorize the test data.
        ext = extractor()
        vectors = ext.batchToVector(lines, usr_flag=False)

        # NOTE: pickle.load can execute arbitrary code embedded in the file;
        # only load a model file that comes from a trusted source.
        with open("svmModel.pkl", 'rb') as model_file:
            model = pickle.load(model_file)

        total = len(vectors)
        for row_id, vector in enumerate(vectors, start=1):
            prediction = model.predict([vector])
            # format() accepts any label type, whereas string concatenation
            # required the predicted label to already be a str.
            output.write("{},{}\n".format(row_id, prediction[0]))
            print("progress: {}%".format(row_id * 100 / total))


# Script entry: load every test tweet and write the model's predictions for
# them via predictUsingSVMModel.
# NOTE(review): these statements run at import time (no __main__ guard).
t = testData()
data = t.getAllTweets()

predictUsingSVMModel(data)