Example #1
    def __init__(self):
        stopwords = ENGLISH_STOP_WORDS.union(['ect', 'hou', 'com', 'recipient'])
        self.vec = TfidfVectorizer(analyzer='word', stop_words=stopwords, max_df=0.3, min_df=2)
        self.emails = read_email_bodies()

        # Train on the given email data.
        self.train()
    def add_stop_words(self):
        if self.stop_words is None:
            self.stop_words = list(ENGLISH_STOP_WORDS)
            logging.info("using default stop words")

        else:
            words = self._split_on_spaces(self.stop_words)
            self.stop_words = list(ENGLISH_STOP_WORDS.union(words))
            logging.info("using custom stop words")
            logging.debug("stop words:%s" % self.stop_words)
def lda(text, n_features, n_topics, n_top_words):
    """Perform latent Dirichlet allocation on a collection of documents.

    text (iterable of str): the documents to model
    n_features (int): maximum vocabulary size for the count vectorizer
    n_topics (int): number of topics to fit
    n_top_words (int): number of top words to report per topic
    """
    # Extend the stop words: the word 'inapplicable' is an artifact of the questionnaire.
    stop_words = ENGLISH_STOP_WORDS.union(['inapplicable'])

    tf_vectorizer = CountVectorizer(max_df=0.85, min_df=0., max_features=n_features,
                                    stop_words=stop_words)
    tf = tf_vectorizer.fit_transform(text)
    model = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                      learning_method='online', learning_offset=50.,
                                      random_state=0)
    model.fit(tf)

    tf_feature_names = tf_vectorizer.get_feature_names()

    tops = get_top_words(model, tf_feature_names, n_top_words)

    return tops
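# A minimal usage sketch (an assumption, not part of the original file): a tiny
# hypothetical corpus of questionnaire answers; get_top_words is defined elsewhere
# in the original source.
sample_answers = [
    "the delivery was fast and the packaging was great",
    "support staff answered my question quickly",
    "inapplicable",
]
sample_topics = lda(sample_answers, n_features=50, n_topics=2, n_top_words=3)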
    def fit(self):
        pass

    def transform(self):
        matrix = self.vectorizer.transform(self.text)
        self.X = self.dim_reducer.transform(matrix.toarray())

    def predict(self):
        labels = self.model.predict(self.X)
        self.X = np.column_stack((self.X, labels))
        return labels

    def find_closest_beer_names(self):
        pass

    def recommend(self, user_input):
        pass


def no_number_preprocessor(tokens):
    # Lowercase the text and strip digit runs so numbers never become features.
    r = re.sub(r'\d+', '', tokens.lower())
    return r

stop_words = ENGLISH_STOP_WORDS.union({'king', 'german', 'brau', 'james',
    'brewery', 'company', 'brewing', 'house', 'bock', 'style', 'scotch', 'california',
    'oktoberfest', 'wee', 'special', 'english', 'american', 'hefeweizen', 'old', 'common',
    'gose', 'NUM'})
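# A minimal sketch (an assumption, not part of the original file): wire the digit-stripping
# preprocessor and the beer-specific stop words above into a tf-idf vectorizer.
# TfidfVectorizer is assumed to be imported from sklearn.feature_extraction.text.
example_names = ["Oktoberfest Marzen 2016", "90 Minute IPA"]  # hypothetical beer names
name_vec = TfidfVectorizer(preprocessor=no_number_preprocessor, stop_words=list(stop_words))
name_matrix = name_vec.fit_transform(example_names)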

if __name__ == '__main__':
    df = load_data()
Example #5
df_eng_all_hl = pd.DataFrame({
    "Headlines": eng_all_hl,
    "Date": all_headlines.iloc[:, 1],
    'Publisher': all_headlines.iloc[:, 2]
})

#Overview of the Sentiment
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

all_hl_lists = svd_headline_list + ex_headline_list + dn_headline_list + af_headline_list + metro_headline_list
joint_headlines = ','.join(eng_all_hl)

my_stop_words = ENGLISH_STOP_WORDS.union(
    ['sweden', 'swedish', 'new', 'best', 'want', 'does', 'dn'])

my_cloud = WordCloud(background_color='white',
                     stopwords=my_stop_words).generate(joint_headlines)
plt.imshow(my_cloud, interpolation='bilinear')
plt.axis("off")
plt.show()
'''
#Tokenizing
from nltk import word_tokenize
import nltk
nltk.download('punkt')
word_tokens = [word_tokenize(review) for review in df_eng_all_hl.headlines]
cleaned_tokens = [[word for word in item if word.isalpha()] for item in word_tokens]

list_cleaned_tokens = []
from nltk.stem.snowball import SnowballStemmer

from params import mlp_params, svm_params, log_params
import pandas as pd
import pickle
import nltk
import re
import eda

TOP_K_FEATURES = 20000

NEG_REGEX = re.compile(r"^(\w*?n't|no(t)?|never$)", re.I)
WORD_REGEX = re.compile(r"^[a-zA-Z_']+$", re.I)
STEMMER = SnowballStemmer('english')
STOP_WORDS = frozenset(
    ENGLISH_STOP_WORDS.union(['movie',
                              'film']).difference(['not', 'never', 'no']))
ALGS_METRICS = {}

models = [
    ('MNB', MultinomialNB(), None),
    ('LogReg', LogisticRegression(), log_params),
    ('SVM', LinearSVC(), svm_params),
    #    ('MLP', MLPClassifier(), mlp_params),
    ('DT', DecisionTreeClassifier(), None)
]
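# A minimal sketch (an assumption, not part of the original file) of how the
# (name, estimator, params) triples above could be consumed: grid-search the models
# that come with a parameter grid and record each score in ALGS_METRICS.
from sklearn.model_selection import GridSearchCV, cross_val_score


def evaluate_models(X, y):
    for name, estimator, params in models:
        if params is not None:
            search = GridSearchCV(estimator, params, scoring='accuracy', cv=5)
            search.fit(X, y)
            ALGS_METRICS[name] = search.best_score_
        else:
            ALGS_METRICS[name] = cross_val_score(estimator, X, y, cv=5).mean()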


def preprocess_raw_text(raw_review):
    """
    negates appropriate words
    removes stop words
Example #7
        ax.set_title("cluster = " + str(df.label), fontsize=16)
        ax.ticklabel_format(axis='x', style='sci', scilimits=(-2, 2))
        ax.barh(x, df.score, align='center', color='#7530FF')
        ax.set_yticks(x)
        ax.set_ylim([-1, x[-1] + 1])
        yticks = ax.set_yticklabels(df.features)
        plt.subplots_adjust(bottom=0.09,
                            right=0.97,
                            left=0.15,
                            top=0.95,
                            wspace=0.52)
    plt.show()


from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
stopwords = ENGLISH_STOP_WORDS.union(
    ['ect', 'hou', 'com', 'recipient', 'dell', 'hi', 'hello', 'nikitha'])
vect = TfidfVectorizer(analyzer='word',
                       stop_words=stopwords,
                       max_df=0.3,
                       min_df=2)

X = vect.fit_transform(df1.Body)
features = vect.get_feature_names()
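# top_mean_feats is not defined in this excerpt; a minimal compatible sketch
# (an assumption, not the author's exact implementation): average the tf-idf weights
# per term, optionally over a subset of rows, and return the top_n terms.
import numpy as np
import pandas as pd

def top_mean_feats(X, features, grp_ids=None, min_tfidf=0.1, top_n=25):
    D = X[grp_ids].toarray() if grp_ids is not None else X.toarray()
    D[D < min_tfidf] = 0                    # ignore very weak weights
    means = D.mean(axis=0)
    top = np.argsort(means)[::-1][:top_n]   # indices of the highest mean tf-idf scores
    return pd.DataFrame({'feature': [features[i] for i in top],
                         'mean_tfidf': means[top]})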

# Now we print the top terms across all documents.
print(top_mean_feats(X, features, None, 0.1, 20))

df1[1:10]

train = df1.sample(frac=0.8, random_state=200)
Example #8
def main():
    stop_words = set(STOPWORDS)
    stop_words.update(ENGLISH_STOP_WORDS)
    # Extra stop words that show up frequently in news text.
    extra_words = ["said", "say", "seen", "come", "end", "came", "year", "years", "new", "saying"]
    stop_words = stop_words.union(extra_words)

    df = pd.read_csv('train_set.csv', sep='\t')

    cat_politics = []
    cat_film = []
    cat_football = []
    cat_business = []
    cat_technology = []
    #store the content for each category
    for index in range(len(df.Category)):
        cat = df.Category[index]
        if cat == "Politics":
            cat_politics.append(df.Content[index])
        elif cat == "Film":
            cat_film.append(df.Content[index])
        elif cat == "Football":
            cat_football.append(df.Content[index])
        elif cat == "Business":
            cat_business.append(df.Content[index])
        elif cat == "Technology":
            cat_technology.append(df.Content[index])

    str_pol = ' '.join(cat_politics)
    str_fil = ' '.join(cat_film)
    str_foo = ' '.join(cat_football)
    str_bus = ' '.join(cat_business)
    str_tec = ' '.join(cat_technology)

    #produce wordcloud for each category
    cloud = WordCloud(background_color="white", mode = "RGB", stopwords = stop_words, width=1920, height=1080)
    w = cloud.generate(str_pol)
    plt.figure()
    plt.title("Politics")
    plt.imshow(w)
    plt.axis("off")
    plt.savefig('Politics.png')


    w = cloud.generate(str_fil)
    plt.figure()
    plt.title("Film")
    plt.imshow(w)
    plt.axis("off")
    plt.savefig('Film.png')

    w = cloud.generate(str_foo)
    plt.figure()
    plt.imshow(w)
    plt.title("Football")
    plt.axis("off")
    plt.savefig('Football.png')

    w = cloud.generate(str_bus)
    plt.figure()
    plt.imshow(w)
    plt.title("Business")
    plt.axis("off")
    plt.savefig('Business.png')

    w = cloud.generate(str_tec)
    plt.figure()
    plt.imshow(w)
    plt.title("Technology")
    plt.axis("off")
    plt.savefig('Technology.png')
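# The five plotting blocks above are nearly identical; a more compact variant
# (an assumption, not part of the original file), reusing WordCloud and pyplot
# as imported in the original script:
def save_wordclouds(category_texts, stop_words):
    cloud = WordCloud(background_color="white", mode="RGB", stopwords=stop_words,
                      width=1920, height=1080)
    for title, text in category_texts.items():
        plt.figure()
        plt.title(title)
        plt.imshow(cloud.generate(text))
        plt.axis("off")
        plt.savefig(title + '.png')
# e.g. save_wordclouds({"Politics": str_pol, "Film": str_fil}, stop_words) from inside main()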
Example #9
# Code used in part 2 of How I used machine learning to classify emails and turn them into insights.

from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd

from helpers import parse_into_emails
from query import EmailDataset

# Just like in part_1, read and preprocess emails
emails = pd.read_csv('split_emails.csv') 
email_df = pd.DataFrame(parse_into_emails(emails.message))
email_df.drop(email_df.query("body == '' | to == '' | from_ == ''").index, inplace=True)

stopwords = ENGLISH_STOP_WORDS.union(['ect', 'hou', 'com', 'recipient'])
vec = TfidfVectorizer(analyzer='word', stop_words=stopwords, max_df=0.3, min_df=2)
vec_train = vec.fit_transform(email_df.body)

# print out the vector of the first email
# print(vec_train[0:1])

# Find cosine similarity between the first email and all others.
cosine_sim = linear_kernel(vec_train[0:1], vec_train).flatten()
# print out the cosine similarities
# print(cosine_sim)

# Finding emails related to a query.
query = "john"

# Transform the query into the tf-idf vector space of the corpus.
vec_query = vec.transform([query])
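# A possible continuation (an assumption, not shown in this excerpt): rank all emails
# by cosine similarity to the query vector and inspect the closest matches.
query_sim = linear_kernel(vec_query, vec_train).flatten()
related_email_indices = query_sim.argsort()[:-11:-1]  # the 10 most similar emails
# print(email_df.body.iloc[related_email_indices])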

def casual_tokenizer(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    return tokens


def expandContractions(word):
    if word in c_dict.keys():
        return c_dict[word]
    else:
        return word


# Stop words: add more as needed, e.g. other news outlet names.
add_stop = ['said', 'say', '...', 'like', 'cnn', 'ad', 'bbc']
stop_words = ENGLISH_STOP_WORDS.union(add_stop)

punc = list(set(string.punctuation))


def process_text(text):
    # if isinstance(text, float):
    #     print(text)
    #     return
    text = casual_tokenizer(text)
    text = [each.lower() for each in text]
    text = [re.sub('[0-9]+', '', each) for each in text]
    text = [expandContractions(word) for word in text]
    stemmed_text = []
    # We tried using Snowball Stemmer and Lancaster Stemmer
    ps = LancasterStemmer()
classifiers = {
    "Naïve Bayes": MultinomialNB(),
    "Support Vector Machine": LinearSVC(),
    "Decision Tree": tree.DecisionTreeClassifier(),
    "Logistic Regression": LogisticRegression()
}
max_features = [200, 1000, 10000]
n_categories = {"Exc./Non-Exc": "nota", "1-5": "classe 2"}

n_grams = {"1": (1, 1), "1-3": (1, 3)}
stop_words = {
    "No": None,
    "Yes": ENGLISH_STOP_WORDS.union(get_stop_words('spanish')).union(
        get_stop_words('portuguese')),
}

graph_data = []
for category_key, category in n_categories.items():
    for max_feature in max_features:
        for gram_key, gram in n_grams.items():
            for stop_key, stop_word in stop_words.items():
                for class_key, classifier in classifiers.items():
                    count_vectorizer = CountVectorizer(
                        analyzer='word',
                        lowercase=True,
                        stop_words=stop_word,
                        ngram_range=gram,
                        max_features=max_feature)
def crossValidationRoc(df, method, n_components, category):
    # Fixed random state for reproducible SVC results.
    random_state = np.random.RandomState(0)
    classifier = svm.SVC(kernel='linear',
                         probability=True,
                         random_state=random_state)
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    all_tpr = []
    avgAccuracy = 0
    nFolds = 10
    kf = KFold(n_splits=nFolds)
    fold = 0
    my_additional_stop_words = [
        'said', 'th', 'month', 'much', 'thing', 'say', 'says'
    ]
    stop_words = ENGLISH_STOP_WORDS.union(my_additional_stop_words)
    count_vect = TfidfVectorizer(stop_words=stop_words)
    #count_vect = CountVectorizer(stop_words=stop_words)
    count_vect.fit(df['Content'] + df['Title'])
    svd = TruncatedSVD(n_components=n_components)
    svd.fit(count_vect.transform(df['Content'] + df['Title']))
    for train_index, test_index in kf.split(df):
        X_train_counts = count_vect.transform(df['Content'].iloc[train_index])
        X_train_counts = np.add(
            X_train_counts,
            count_vect.transform(df['Title'].iloc[train_index]) * 2)
        X_test_counts = count_vect.transform(df['Content'].iloc[test_index])
        X_test_counts = np.add(
            X_test_counts,
            count_vect.transform(df['Title'].iloc[test_index]) * 2)
        X_train_counts = svd.transform(X_train_counts)
        X_test_counts = svd.transform(X_test_counts)
        probas_ = classifier.fit(
            X_train_counts,
            df['Category'].iloc[train_index]).predict_proba(X_test_counts)
        # Compute ROC curve and area the curve
        test1 = label_binarize(
            df['Category'].iloc[test_index],
            classes=["Business", "Film", "Football", "Politics", "Technology"])
        fpr, tpr, thresholds = roc_curve(test1[:, categories_map[category]],
                                         probas_[:, categories_map[category]])
        mean_tpr += np.interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr,
                 tpr,
                 lw=1,
                 label='ROC fold %d (area = %0.2f)' % (fold, roc_auc))
        print "Fold " + str(fold)
        fold += 1

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

    mean_tpr /= nFolds
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr,
             mean_tpr,
             'k--',
             label='Mean ROC (area = %0.2f)' % mean_auc,
             lw=2)

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic ' + category)
    plt.legend(loc="lower right")
    plt.savefig('output/' + category + '_roc')
    plt.close()
    return avgAccuracy
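# A hypothetical driver (an assumption, not part of the original file): produce one ROC
# figure per category; categories_map is assumed to map category names to label columns.
def plot_all_rocs(df, n_components=100):
    for cat in ["Business", "Film", "Football", "Politics", "Technology"]:
        crossValidationRoc(df, method='SVD', n_components=n_components, category=cat)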
Example #13
def read_dataset(filename):  ## function to read the dataset in the correct format.
    with open(filename) as f:
        dataset = f.read().splitlines()
    X = [i.split('\t')[0] for i in dataset]
    y = [i.split('\t')[1] for i in dataset]
    y = [0 if x == 'democrat' else 1 for x in y]
    return X, y


Xtrain, ytrain = read_dataset('train_newline.txt')
Xtest, ytest = read_dataset('dev_newline.txt')

X = Xtrain + Xtest

stop_words_list = ENGLISH_STOP_WORDS.union([
    u'http', u'rt', u'amp', u'just', u'bit', u'ly', u'com', u'url', u'tinyurl',
    u'ow', u'twurl'
])  ## added these stop words
## after observing the top features in the different models

### Uni-gram Model:
# Note: the original passed input=X here, but CountVectorizer's `input` parameter only
# selects 'content'/'filename'/'file'; the data is supplied to fit_transform below instead.
dic = CountVectorizer(ngram_range=(1, 1),
                      analyzer='word',
                      stop_words=stop_words_list)

vecs = dic.fit_transform(X)

features = dic.get_feature_names()  ### feature names of the vocabulary, used later in the classifier

trainvecs = vecs[0:40000, :]
Example #14
    def __init__(self):
        # Build a list of stop words that I don't want to use as features. These are
        # mostly punctuation tokens, but more may be added down the road.
        my_stop_words = ['.', '(', ')', ' ', ' .', '..', ').', ' )', ' , ', ' ,']
        stop_words = ENGLISH_STOP_WORDS.union(my_stop_words)
        # Note: scikit-learn ignores stop_words when analyzer='char_wb', so the set built
        # above does not affect this character n-gram vectorizer.
        self.vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(1, 7), stop_words='english', min_df=1, max_df=1.0)
Example #15
# Drop emails with empty body, to or from_ columns.
email_df.drop(email_df.query("body == '' | to == '' | from_ == ''").index,
              inplace=True)

#Preview dataframe.
print("\nDataframe preview: \n", email_df.head())

#Print unique email addresses.
print("\nUnique FROM email addresses:", len(email_df.from_.unique()))
print("Unique TO email addresses:", len(email_df.to.unique()))

#Tokenize the bodies and convert them into a document-term matrix:

#Adding extra stop-words that appeared frequently in the dataset, but were not of interest.
stopwords = ENGLISH_STOP_WORDS.union(['ect', 'hou', 'com', 'recipient'])

#Vectorizer.
vect = TfidfVectorizer(analyzer='word',
                       stop_words=stopwords,
                       max_df=0.3,
                       min_df=2)
X = vect.fit_transform(email_df.body)
features = vect.get_feature_names()

#Print the top terms across all documents.
print("\nMost frequent terms in the dataset: \n",
      top_mean_feats(X, features, None, 0.1, 10))

#Data classification:
def main():

#------------------------------DATA----------------------------------

    train_data=pd.read_csv('train_set.csv',sep="\t")
    test_data=pd.read_csv('test_set.csv',sep="\t")
    train_data=train_data.drop('RowNum',axis=1)  #ignore rownum (drop returns a new frame)
    test_data=test_data.drop('RowNum',axis=1)

#------------------------------Processing----------------------------

    extra_words=["said","say","seen","come","end","came","year","years","new","saying"]		#extra stopwords
    stopwords=ENGLISH_STOP_WORDS.union(extra_words)
    tfidf=TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words=stopwords)		#convert to tf-idf
    tsvd=TruncatedSVD(n_components=200, algorithm='randomized', random_state=42)		#set dimensions

    set(train_data['Category'])		#check categories
    le=preprocessing.LabelEncoder()	#set labels
    le.fit(train_data["Category"])	#fit them to the number of our categories
    y_train=le.transform(train_data["Category"])	#transform categories
    set(y_train)

    count_vectorizer=CountVectorizer(stop_words=stopwords)	#set stopwords for vectorizer
    X_trainNoLSI=count_vectorizer.fit_transform(train_data['Content'])		#vectorize out data
    tsvd.fit(X_trainNoLSI)				#truncate data
    X_train=tsvd.transform(X_trainNoLSI)		#store them

    test_noLSI=count_vectorizer.transform(test_data['Content'])		#test data
    test=tsvd.transform(test_noLSI)

    k_fold = KFold(n_splits=10)				#10 fold validation

#--------------------------------SVM---------------------------------

    clf=svm.SVC(kernel='rbf', C=100, gamma='auto')		#algorithm for application
    clf.fit(X_train, y_train)
    y_pred=clf.predict(test)

#--------------------------------SVM_scores--------------------------
    print "SVM scores:"

    SVMprecs=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='precision_micro')
    svm_prec=SVMprecs.mean()
    print "precision:" ,svm_prec

    SVMrecs=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='recall_micro')
    svm_rec=SVMrecs.mean()
    print "recall:" ,svm_rec

    SVMfms=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='f1_micro')
    svm_fm=SVMfms.mean()
    print "F-measure:" ,svm_fm

    SVMaccs=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='accuracy')
    svm_acc=SVMaccs.mean()
    print "accuracy:" ,svm_acc

#---------------------------------RF---------------------------------

    clf=RandomForestClassifier(max_depth=6,random_state=1)
    clf.fit(X_train,y_train)
    y_pred=clf.predict(test)

#---------------------------------RF_scores--------------------------

    print "RF scores:"

    RFprecs=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='precision_micro')
    rf_prec=RFprecs.mean()
    print "precision:" ,rf_prec

    RFrecs=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='recall_micro')
    rf_rec=RFrecs.mean()
    print "recall:" ,rf_rec

    RFfms=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='f1_micro')
    rf_fm=RFfms.mean()
    print "F-measure:" ,rf_fm

    RFaccs=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='accuracy')
    rf_acc=RFaccs.mean()
    print "accuracy:" ,rf_acc

#----------------------------------MNB--------------------------------

    clf=MultinomialNB()
    clf.fit(X_trainNoLSI,y_train)
    y_pred=clf.predict(test_noLSI)

#----------------------------------MNB_scores-------------------------

    print "MNB scores:"

    MNBprecs=cross_val_score(clf, X_trainNoLSI, y_train, cv=k_fold, scoring='precision_micro')
    mnb_prec=MNBprecs.mean()
    print "precision:" ,mnb_prec

    MNBrecs=cross_val_score(clf, X_trainNoLSI, y_train, cv=k_fold, scoring='recall_micro')
    mnb_rec=MNBrecs.mean()
    print "recall:" ,mnb_rec

    MNBfms=cross_val_score(clf, X_trainNoLSI, y_train, cv=k_fold, scoring='f1_micro')
    mnb_fm=MNBfms.mean()
    print "F-measure:" ,mnb_fm

    MNBaccs=cross_val_score(clf, X_trainNoLSI, y_train, cv=k_fold, scoring='accuracy')
    mnb_acc=MNBaccs.mean()
    print "accuracy:" ,mnb_acc

#-----------------------------------K-Nearest_Neighbor------------------

    clf=knn.myKNN(10)			# K=10,check knn_functions.py(imported)
    clf.fit(X_train, y_train)
    y_pred=clf.predict(test)

#---------------------------------KNN_scores--------------------------

    print "KNN scores:"

    KNNprecs=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='precision_micro')
    knn_prec=KNNprecs.mean()
    print "precision:" ,knn_prec

    KNNrecs=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='recall_micro')
    knn_rec=KNNrecs.mean()
    print "recall:" ,knn_rec

    KNNfms=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='f1_micro')
    knn_fm=KNNfms.mean()
    print "F-measure:" ,knn_fm

    KNNaccs=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='accuracy')
    knn_acc=KNNaccs.mean()
    print "accuracy:" ,knn_acc

#----------------------------------------------------------------------
#                                   My Method
#----------------------------------------------------------------------
    #our method
    #strip punctuation from the data
    test_data['Content']=test_data['Content'].str.replace(r'[^\w\s]', '')
    train_data['Content']=train_data['Content'].str.replace(r'[^\w\s]', '')
    #convert multiple spaces to one
    test_data['Content']=test_data['Content'].str.replace(r'\s+', ' ')
    train_data['Content']=train_data['Content'].str.replace(r'\s+', ' ')

    #same process as before
    set(train_data['Category'])
    le=preprocessing.LabelEncoder()
    le.fit(train_data["Category"])
    y_train=le.transform(train_data["Category"])
    set(y_train)

    X_train=count_vectorizer.fit_transform(train_data['Content'])

    test=count_vectorizer.transform(test_data['Content'])
    #usage of MNB
    max=0.0
    maxi=0.0
    i=0.01
    #search for the best smoothing parameter(alpha)
    while i<1.0:
        clf=MultinomialNB(alpha=i)
        clf.fit(X_train,y_train)
        y_pred=clf.predict(test)
        myprecs=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='precision_micro')
        my_prec=myprecs.mean()
        if my_prec>max:
            max=my_prec
            maxi=i
        i+=0.01
    print "My Method scores:"

    clf=MultinomialNB(alpha=maxi, fit_prior=True)
    clf.fit(X_train,y_train)
    the_pred=clf.predict(test)

    print "precision:" ,max

    myrecs=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='recall_micro')
    my_rec=myrecs.mean()
    print "recall:" ,my_rec

    myfms=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='f1_micro')
    my_fm=myfms.mean()
    print "F-measure:" ,my_fm

    myaccs=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='accuracy')
    my_acc=myaccs.mean()
    print "accuracy:" ,my_acc

#------------------------------------CSV---------------------------------
    #my method csv
    output='testSet_categories.csv'
    predicted=le.inverse_transform(the_pred)
    testingfile=pd.DataFrame({'ID': test_data['Id'], 'Predicted_Category': list(predicted)}, columns=['ID', 'Predicted_Category'])
    testingfile.to_csv(output,encoding='utf-8',index=False,sep='\t')
    #results csv
    output='EvaluationMetric_10fold.csv'
    d={'StatisticMeasure': ['Accuracy','Precision','Recall','F-Measure'],
       'Naive Bayes': [mnb_acc,mnb_prec,mnb_rec,mnb_fm],
       'Random Forest': [rf_acc,rf_prec,rf_rec,rf_fm],
       'SVM': [svm_acc,svm_prec,svm_rec,svm_fm],
       'KNN': [knn_acc,knn_prec,knn_rec,knn_fm],
       'My Method': [my_acc,max,my_rec,my_fm]}
    df=pd.DataFrame(data=d,columns=['StatisticMeasure','Naive Bayes','Random Forest','SVM','KNN','My Method'])
    df.to_csv(output,encoding='utf-8',index=False,sep='|')
Example #17
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
import nltk, string

rm_pun = dict((ord(char), None) for char in string.punctuation)
stmr = nltk.stem.porter.PorterStemmer()


def nml(t):
    # remove punctuation, stem
    tk = nltk.word_tokenize(t.lower().translate(rm_pun))
    r = [stmr.stem(i) for i in tk]
    # print("\nnormalize")
    # for w in re:
    #     print(w)
    return r


lst = [
    "cnnbrk", '’', '“', 'https…', 'htt…', 'h…', 's', 't', "cnnbrk…", "”", "…",
    "wo…", "”", "…", "w…", "a…", "m…", "i…", "t…", "‘", "an…", "g…", "d…",
    "to…", "p…", "o…", "is…", "in…", "wh…", "c…", "⁦…", "so…", "y…", "and…",
    "मे", "तो", "से", "be…", "re…", "are…", "as…", "no…", "r…", "ft…", "they…",
    "—", "not…", "f…", "l…", "e…", "it…", "u…", "b…", "n…", "tr…", "we…"
]
stpw = ENGLISH_STOP_WORDS.union(stopwords.words('english')).union(lst)
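# A small illustration (an assumption, not part of the original file): normalize a
# hypothetical tweet with nml() and drop anything in the combined stop-word set.
example_tokens = [t for t in nml("Breaking news from CNN at https t co") if t not in stpw]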
Example #18
print(len(have_cancel), 'records have "cancel*" in them')

canceled_cats = Counter([i['category'] for i in have_cancel])

sorted(canceled_cats.items(), key=itemgetter(1), reverse=True)[0:10]


# #Set up the vectorisers and classifiers
# The per-record text data is fairly sparse and the vocabulary is quite big overall, so it's worth trying different vectorisers. 

# In[102]:

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as ESW

ESW = ESW.union({'cancelled', 'canceled'})

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.cross_validation import cross_val_score, KFold

def my_tokeniser(string):
    '''
    This can be changed to result in more sophisticated word detection.
    For now, it just splits up into alpha-only chunks, strips numbers.
    Preserves hyphenated and apostrophed words but ignores other punct.
    Gets rid of single-char stuff.
    '''
    pattern = re.compile(r"[A-Za-z0-9\-']*[^\W]")
    return [i for i in re.findall(pattern, string) if not i.isnumeric() and len(i) > 1]
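# A minimal sketch (an assumption, not part of the original notebook): combine the
# custom tokeniser and the extended ESW stop words with a Naive Bayes model in a Pipeline.
text_clf = Pipeline([
    ('vec', TfidfVectorizer(tokenizer=my_tokeniser, stop_words=list(ESW))),
    ('nb', MultinomialNB()),
])
# scores = cross_val_score(text_clf, texts, labels, cv=5)  # texts/labels are hypothetical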
Example #19
from nltk.stem.wordnet import WordNetLemmatizer
import re
import numpy as np


def testSet_categoriesCSV(predicted_categories, ids):
    d = {'ID': pd.Series(ids), 'Predicted_Category': pd.Series(predicted_categories)}
    df = pd.DataFrame(d)
    df.to_csv('Produced_Files/testSet_categories.csv', sep='\t', index=False, columns=['ID', 'Predicted_Category'])


size = 10000
components = 160
my_additional_stop_words = ['people', 'said', 'did', 'say', 'says', 'year', 'day', 'just', 'good', 'come', 'make',
                            'going', 'having', 'like', 'need', 'given', 'got']
vectorizer = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS.union(my_additional_stop_words))
le = preprocessing.LabelEncoder()
lsi_model = TruncatedSVD(n_components=components)
ps = PorterStemmer()
lmtzr = WordNetLemmatizer()

clf = svm.SVC(kernel='rbf', C=1, gamma=1)
# clf = SGDClassifier()

# ----------------------------------------------------------------------------------------------------------------------
# ----------------------------------------------------TRAIN-------------------------------------------------------------

dataset = pd.read_csv('../datasets/project_1/train_set.csv', sep="\t")
#dataset = dataset[0:size]
le.fit(dataset["Category"])
y = le.transform(dataset["Category"])
Example #20
 def add_stop_words(self):
     if self.stop_words is not None:
         words = self._split_on_spaces(self.stop_words)
         self.stop_words = ENGLISH_STOP_WORDS.union(words)
Example #21
                 'trade':9}
data = []
target = []
docs = reuters.fileids()
for doc in docs:
    # Check if the document is only related to 1 class and that class is in category_dict
    if len(reuters.categories(doc)) == 1 and reuters.categories(doc)[0] in category_dict:
        data.append(" ".join(reuters.words(doc))) # Text of the document
        target.append(category_dict[reuters.categories(doc)[0]]) # Index for the class
print("Dataset REUTERS loaded...")

# Pre-process the dataset
print("Pre-processing the dataset...")
stemmer = PorterStemmer() # Define the type of stemmer to use
additional_stop_words = []
stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)
stop_words = set([stemmer.stem(word) for word in stop_words]) # Stem the stop words for larger detection
processed_data = []
id_to_delete = []
for i, doc in enumerate(data):
    tokenized_doc = list(simple_preprocess(doc, deacc=True, min_len=2))
    stemmed_doc = []
    for word in tokenized_doc:
        stemmed_word = stemmer.stem(word)
        if stemmed_word not in stop_words:
            stemmed_doc.append(stemmed_word)
    #[stemmer.stem(word) for word in tokenized_doc if word not in stop_words]
    if stemmed_doc == []: # Empty document after pre-processing: to be removed
        id_to_delete.append(i)
    else:
        processed_data.append(stemmed_doc)
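# A natural next step (an assumption, not shown in this excerpt): drop the labels of the
# documents that became empty during pre-processing so data and targets stay aligned.
deleted_ids = set(id_to_delete)
target = [label for i, label in enumerate(target) if i not in deleted_ids]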