Ejemplo n.º 1
0
from sklearn.preprocessing import LabelEncoder
# Load the labelled data. Fields are separated by ',,,'; a separator longer
# than one character requires the Python parser engine. quoting=3 is
# csv.QUOTE_NONE, so quote characters are kept as ordinary text.
dataset = pd.read_csv(
    'LabelledData.txt',
    sep=',,,',
    header=None,
    quoting=3,
    engine='python',
)

# Column 1 is used as the target: strip surrounding whitespace from each
# label, then encode the distinct labels as integers.
label_column = dataset.iloc[:, 1]
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(label_column.str.strip())

# Cleaning the texts
# Column 0 is handed to DataHandler (defined elsewhere) and the output of its
# cleanStemmer() step becomes the corpus.
# NOTE: cleaner.cleanLemmatizer() is the alternative cleaning step that can be
# swapped in here instead of the stemmer.
cleaner = DataHandler(dataset.iloc[:, 0])
corpus = cleaner.cleanStemmer()

# Creating the Bag of Words model
# Fit the vectorizer on the cleaned corpus, then convert the transformed
# result to a dense array with toarray().
cv = CountVectorizer()
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

##############################################################
# Training the model
print("Training the model with train_set=80% & test_set=20%")

# Splitting the dataset into the Training set and Test set.
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20, so it is kept
# only as a fallback for legacy installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 20% of the samples for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)