Example 1
  def __init__(self):
    stopwords = ENGLISH_STOP_WORDS.union(['ect', 'hou', 'com', 'recipient'])
    self.vec = TfidfVectorizer(analyzer='word', stop_words=stopwords, max_df=0.3, min_df=2)
    self.emails = read_email_bodies() 

    # train on the given email data.
    self.train()
Example 2
    def add_stop_words(self):
        if self.stop_words is None:
            self.stop_words = list(ENGLISH_STOP_WORDS)
            logging.info("using default stop words")

        else:
            words = self._split_on_spaces(self.stop_words)
            self.stop_words = list(ENGLISH_STOP_WORDS.union(words))
            logging.info("using custom stop words")
            logging.debug("stop words:%s" % self.stop_words)
Example 3
def lda(text, n_features, n_topics, n_top_words):
	""" perform latent dirichlet allocation

	input (array): an array of strings
	"""
	# add to stop words 
	# the word inapplicable is a result of the questionnaire
	stop_words = ENGLISH_STOP_WORDS.union(['inapplicable'])

	tf_vectorizer = CountVectorizer(max_df=0.85, min_df=0., max_features=n_features, 
		stop_words=stop_words)
	tf = tf_vectorizer.fit_transform(text)
	model = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
	 learning_method='online', learning_offset=50., random_state=0)
	model.fit(tf)

	tf_feature_names = tf_vectorizer.get_feature_names()

	tops = get_top_words(model, tf_feature_names, n_top_words)

	return tops 
Example 4
    def top_n_words_heatmap(self, n=10, ngrams=(1, 1), by_decade=True):
        """Produces a heatmap for top N words or their combinations over the years"""
        df = self.data.copy()
        df.reset_index(drop=True, inplace=True)
        sw = ENGLISH_STOP_WORDS.union(CSW)
        c = CountVectorizer(stop_words=sw, ngram_range=ngrams)

        m = c.fit_transform(df.lyrics)

        rev = {it: ind for ind, it in c.vocabulary_.items()}

        words = []
        for oi, m0 in enumerate(m):
            words.append({oi: {rev[ind]: it for ind, it in enumerate(m0.toarray().reshape(-1)) if it > 0}})

        words_df = [pd.DataFrame(word) for word in words]

        total_words = pd.concat(words_df, axis=1, sort=False)

        trans = total_words.T
        if by_decade:
            trans["_year"] = df["decade"].fillna(0)
        else:
            trans["_year"] = df["year"].fillna(0)

        trans["_year"] = pd.to_numeric(trans["_year"].fillna(0), downcast="integer")

        words_by_year = trans.groupby("_year").sum()

        top10 = {y: z.nlargest(n) for y, z in words_by_year.iterrows()}

        top10_df = pd.concat(top10, axis=1, sort=False)

        top10_df["sum"] = top10_df.sum(axis=1)
        top10_df.sort_values(by="sum", ascending=False, inplace=True)
        top10_df.drop("sum", axis=1, inplace=True)

        fig = plt.figure(figsize=(15, 10))
        sns.heatmap(top10_df.T, square=True, cmap="YlGnBu", cbar_kws={"orientation":"horizontal"})
Example 5
def labelClustersWKeywords(labels, myReader, num_clusters):
    top_features_list = []
    print(myReader)

    for cluster in range(num_clusters):
        indices = [index for index, clusterNum in enumerate(labels) if clusterNum == cluster] # indices of documents in cluster
        clusterCorpus = [doc_dict['negative_feedback'] for (docnum, doc_dict) in myReader.iter_docs() if docnum in indices] # documents in cluster

        custom_stop_words = ENGLISH_STOP_WORDS.union(["firefox"])
        vectorizer = TfidfVectorizer(stop_words=custom_stop_words)
        X_tf = vectorizer.fit_transform(clusterCorpus)
        response = vectorizer.transform(clusterCorpus)
        feature_names = vectorizer.get_feature_names()

        top_n = 5
        feature_name_occurences = np.nonzero(response.toarray())[1]
        most_common_n = collections.Counter(feature_name_occurences).most_common(top_n)
        top_features = [feature_names[feature[0]] for feature in most_common_n]
        top_features_list.append(top_features)

    feature_names_df = pd.DataFrame(top_features_list, columns=['1', '2', '3', '4', '5'])
    return feature_names_df
Example 6
def main():
    # same as in the classification part
    df = pd.read_csv("datasets/train_set.csv", sep="\t")
    with open('extraStopWords.json', 'r') as extraStopWords:
        extraStopWords = json.load(extraStopWords)
    stop_words = ENGLISH_STOP_WORDS.union(extraStopWords)
    count_vect = CountVectorizer(stop_words=stop_words)
    X_train_counts = count_vect.fit_transform(df.Content)

    # load the test data
    df2 = pd.read_csv("datasets/test_set.csv", sep="\t")
    X_test_counts = count_vect.transform(df2.Content)

    # linear SVM because it was the best
    clf_cv = MultinomialNB().fit(X_train_counts, np.array(df.Category))
    y_pred = clf_cv.predict(X_test_counts)

    f = open("testSet_categories.csv", "w")
    f.write("ID\tPredicted_Category\n")
    i = 0
    for pred in y_pred:
        f.write(str(df2.Id[i]) + "\t" + pred + "\n")
        i += 1
    f.close()
Example 7
def main():
    
    # read the csv into pandas, then build the vectorizer and apply truncated SVD to reduce the dimensionality
    with open('extraStopWords.json','r') as extraStopWords:
        extraStopWords = json.load(extraStopWords)
    stopWords = ENGLISH_STOP_WORDS.union(extraStopWords)
 
    df = pd.read_csv("datasets/train_set.csv", sep="\t")
    
    count_vect = CountVectorizer(stop_words=stopWords)
    X_train_counts = count_vect.fit_transform(df.Content)
    svd = TruncatedSVD(n_components = 60)
    svd.fit(X_train_counts)
    # create the object to do the cross validation
    kf = KFold(n_splits = 10)
    #fold = 0


    # the counters we need:
    # accumulate the result on every iteration and divide the total by 10 at the end
    # 0 is for SVM
    # 1 for Naive Bayes
    # 2 for Random Forest
    # 3 for KNN
    class_list = [Metrics_for_Class() for i in range(0,4)]
    
    # the categories
    categories = ["Technology", "Football", "Film", "Business","Politics"]
    # keep information for the ROC plot
    folist = []
    tlist = []
    plist = []
    filist = []
    blist = []
    # split the data here
    
    for train_index, test_index in kf.split(df.Content):
        
        # only transform here, not fit, because a new fit would lose information
        X_train_counts3 = count_vect.transform(np.array(df.Content)[train_index])
        X_train_counts2 = svd.transform(X_train_counts3)

        # same as above
        X_test_counts3 = count_vect.transform(np.array(df.Content)[test_index])
        X_test_counts2 = svd.transform(X_test_counts3)
        
        #SVM
        if sys.argv[1] == "SVM":
            #print("SVM STARTED")
            place = 0
            #parameters = {'kernel':('linear', 'rbf')}
            svr = svm.SVC(kernel = "linear")
            svr.fit(X_train_counts2, np.array(df.Category)[train_index])
            y_pred = svr.predict(X_test_counts2)
            y_true = np.array(df.Category)[test_index]
            class_list[0].rec += recall_score(y_true,y_pred,average = "macro")
            class_list[0].acc += accuracy_score(y_true,y_pred)
            class_list[0].prec += precision_score(y_true,y_pred,average = "macro")
            class_list[0].fl_sc += f1_score(y_true, y_pred,average = "macro")

        # Naive Bayes
        elif sys.argv[1] == "NAYVE":
            #print("NAYVE_STARTED")
            place = 1
            clf_cv =  MultinomialNB().fit(X_train_counts3, np.array(df.Category)[train_index])
            y_pred = clf_cv.predict(X_test_counts3)
            y_true = np.array(df.Category)[test_index]
            class_list[1].rec += recall_score(y_true,y_pred,average = "macro")
            class_list[1].acc += accuracy_score(y_true,y_pred)
            class_list[1].prec += precision_score(y_true,y_pred,average = "macro")
            class_list[1].fl_sc += f1_score(y_true, y_pred,average = "macro")



        #RandomForest
        elif sys.argv[1] == "RANDOM_FOREST":
            #print("RANDOM_FOREST_STARTED")
            place = 2
            clf_rf = RandomForestClassifier(n_estimators=10).fit(X_train_counts2, np.array(df.Category)[train_index])
            y_pred = clf_rf.predict(X_test_counts2)
            y_true = np.array(df.Category)[test_index]
            class_list[2].rec += recall_score(y_true,y_pred,average = "macro")
            class_list[2].acc += accuracy_score(y_true,y_pred)
            class_list[2].prec += precision_score(y_true,y_pred,average = "macro")
            class_list[2].fl_sc += f1_score(y_true, y_pred,average = "macro")

        #KNN
        elif sys.argv[1] == "KNN":
            place = 3
            
            K = 7
            clf_kn = knn_classifier(K).fit(X_train_counts2,np.array(df.Category)[train_index])

            y_pred = clf_kn.predict(X_test_counts2,X_train_counts2,K)
            y_true = np.array(df.Category)[test_index]
            class_list[3].rec += recall_score(y_true,y_pred,average = "macro")
            class_list[3].acc += accuracy_score(y_true,y_pred)
            class_list[3].prec += precision_score(y_true,y_pred,average = "macro")
            class_list[3].fl_sc += f1_score(y_true, y_pred,average = "macro")

    # compute the averages
    class_list[place].rec = float(class_list[place].rec) / 10
    class_list[place].acc = float(class_list[place].acc) / 10
    class_list[place].prec = float(class_list[place].prec) / 10
    class_list[place].fl_sc = float(class_list[place].fl_sc) / 10
    #class_list[place].roc_auc = float(class_list[place].roc_auc) / 10

    # output the results
    f = open("EvaluationMetric_" + sys.argv[1] + ".csv", "w")
    f.write("Statistic_Metrics\t")
    if sys.argv[1] == "SVM":
        f.write("SVM")
    elif sys.argv[1] == "NAYVE":
        f.write("Naive Bayes")
    elif sys.argv[1] == "RANDOM_FOREST":
        f.write("Random Forest")
    elif sys.argv[1] == "KNN":
        f.write("KNN")
    f.write("\n")
    
    # write to the csv
    f.write("Accuracy\t")
    f.write(str(class_list[place].acc) + "\n")
    f.write("Presicion\t")
    f.write(str(class_list[place].prec) + "\n")
    f.write("Recall\t")
    f.write(str(class_list[place].rec) + "\n")
    f.write("F_Measure\t")
    f.write(str(class_list[place].fl_sc) + "\n")
    f.close()
Example 8
                   color='#eeeeee',
                   zorder=1)

    ax.set_xlabel(xlabel, labelpad=20, weight='bold', size=12)
    ax.set_ylabel(ylabel, labelpad=20, weight='bold', size=12)
    ax.set_title(title)
    ax.xaxis.set_major_formatter(StrMethodFormatter('{x:,g}'))


#Run when you want to shuffle and remake the CSV file.
# load_slice_dataframes()

df_kaggle_reviews = pd.read_csv('.\labeled_data.csv')
df_kaggle_reviews['reviews_stem'] = get_stemmed_text(
    preprocess_reviews(df_kaggle_reviews.reviews))
my_stop_words = get_stemmed_text(ENGLISH_STOP_WORDS.union(('and', 'or', 'if')))
# my_pattern = r'\b[^\d\W][^\d\W]+\b' #token_pattern=

# # TF-IDF frequencies of the words, i.e. how many times each word appears in the string
# vect_Tfid = TfidfVectorizer(ngram_range=(1, 2), max_features=100,
#                         stop_words=my_stop_words).fit(df_kaggle_reviews['reviews_stem'])
# X_txt_Tfid = vect_Tfid.transform(df_kaggle_reviews['reviews_stem'])
# df_Tfid = pd.DataFrame(X_txt_Tfid.toarray(), columns=vect_Tfid.get_feature_names())

# min_df (= 5): a word must appear in at least 5 documents to be counted as a feature

vect_BOW = CountVectorizer(ngram_range=(1, 2),
                           stop_words=my_stop_words,
                           max_features=2000,
                           binary=True).fit(df_kaggle_reviews.reviews_stem)
X_txt_BOW = vect_BOW.transform(df_kaggle_reviews.reviews_stem)
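The min_df note above is not wired into the active code; a minimal sketch of applying it to the same bag-of-words setup might look like this (vect_BOW_min_df and X_txt_BOW_min_df are hypothetical names, not part of the original script):

vect_BOW_min_df = CountVectorizer(ngram_range=(1, 2),
                                  stop_words=my_stop_words,
                                  min_df=5,  # keep only terms that appear in at least 5 reviews
                                  binary=True).fit(df_kaggle_reviews.reviews_stem)
X_txt_BOW_min_df = vect_BOW_min_df.transform(df_kaggle_reviews.reviews_stem)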
Example 9
###############################################################################
_20news = fetch_20newsgroups(subset="all")
print("Dataset 20NEWS loaded...")
data = _20news.data
target = _20news.target
###############################################################################
# Pre-process the dataset
###############################################################################
print("Pre-processing the dataset...")
stemmer = PorterStemmer()  # Define the type of stemmer to use
additional_stop_words = [
    'edu', 'com', 'gov', 'ca', 'mit', 'uk', 'subject', 'lines', 'organization',
    'writes', 'msg', 'article', 'university', 'does', 'posting', 'thanks',
    'don', 'know', 'help', 'use', 'copy'
]
stop_words = ENGLISH_STOP_WORDS.union(additional_stop_words)
stop_words = set([stemmer.stem(word) for word in stop_words])  # Stem the stop words so they match the stemmed tokens
processed_data = []
id_to_delete = []
for i, doc in enumerate(data):
    tokenized_doc = list(simple_preprocess(doc, deacc=True, min_len=2))
    stemmed_doc = []
    for word in tokenized_doc:
        stemmed_word = stemmer.stem(word)
        if stemmed_word not in stop_words:
            stemmed_doc.append(stemmed_word)
    #[stemmer.stem(word) for word in tokenized_doc if word not in stop_words]
    if stemmed_doc == []:  # Empty document after pre-processing: to be removed
        id_to_delete.append(i)
    else:
Example 10
 def __init__(self):
     # Build a list of stop words that I don't want to use as features. These are often '.' but maybe other ones down the road
     my_stop_words = ['.', '(', ')', ' ', ' .', '..', ').', ' )', ' , ', ' ,']
     stop_words = ENGLISH_STOP_WORDS.union(my_stop_words)
     # Pass the combined list; note that stop_words only take effect when analyzer='word'.
     self.vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(1, 7), stop_words=list(stop_words), min_df=1, max_df=1.0)
Example 11
with open("additional_stopwords.txt", "r") as textfile:
    additional_stopwords = textfile.read().split('\n')

stopwords_list = list(string.punctuation)
stopwords_list += ['....', '...', '..', '.....', 'im']
stopwords_list += stopwords.words('english')  # nltk
stopwords_list += get_stop_words('en')  # stop words
stopwords_list = list(set(stopwords_list))
stopwords_list = [w for w in stopwords_list if w != 'not']  # keep 'not' for negation

print('Length of standard stopwords list: {}'.format(len(stopwords_list)))

ext_stopwords_list = list(stopwords_list)
ext_stopwords_list += spacy.lang.en.stop_words.STOP_WORDS  # spacy
ext_stopwords_list += additional_stopwords
ext_stopwords_list = list(ENGLISH_STOP_WORDS.union(ext_stopwords_list))  # sklearn stopwords
ext_stopwords_list = list(set(ext_stopwords_list))
ext_stopwords_list = [w for w in ext_stopwords_list if w != 'not']

print('Length of extended stopwords list: {}'.format(len(ext_stopwords_list)))

with open('vocab_20k.txt', 'r', encoding="utf8") as f:
    extended_vocab_20k = f.read().splitlines()

with open('contractions.txt', 'r') as f:
    contractions = eval(f.read())


def find_review_errors(df):
    """
    List unique identity key for missing and duplicate reviews,
Example 12
def main():

#------------------------------DATA----------------------------------

    train_data=pd.read_csv('train_set.csv',sep="\t")
    test_data=pd.read_csv('test_set.csv',sep="\t")
    train_data = train_data.drop('RowNum', axis=1)		#ignore rownum
    test_data = test_data.drop('RowNum', axis=1)

#------------------------------Processing----------------------------

    extra_words=["said","say","seen","come","end","came","year","years","new","saying"]		#extra stopwords
    stopwords=ENGLISH_STOP_WORDS.union(extra_words)
    tfidf=TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words=stopwords)		#convert to tf-idf
    tsvd=TruncatedSVD(n_components=200, algorithm='randomized', random_state=42)		#set dimensions

    set(train_data['Category'])		#check categories
    le=preprocessing.LabelEncoder()	#set labels
    le.fit(train_data["Category"])	#fit them to the number of our categories
    y_train=le.transform(train_data["Category"])	#transform categories
    set(y_train)

    count_vectorizer=CountVectorizer(stop_words=stopwords)	#set stopwords for vectorizer
    X_trainNoLSI=count_vectorizer.fit_transform(train_data['Content'])		#vectorize out data
    tsvd.fit(X_trainNoLSI)				#truncate data
    X_train=tsvd.transform(X_trainNoLSI)		#store them

    test_noLSI=count_vectorizer.transform(test_data['Content'])		#test data
    test=tsvd.transform(test_noLSI)

    k_fold = KFold(n_splits=10)				#10 fold validation

#--------------------------------SVM---------------------------------

    clf=svm.SVC(kernel='rbf', C=100, gamma='auto')		#algorithm for application
    clf.fit(X_train, y_train)
    y_pred=clf.predict(test)

#--------------------------------SVM_scores--------------------------
    print "SVM scores:"

    SVMprecs=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='precision_micro')
    svm_prec=SVMprecs.mean()
    print "precision:" ,svm_prec

    SVMrecs=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='recall_micro')
    svm_rec=SVMrecs.mean()
    print "recall:" ,svm_rec

    SVMfms=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='f1_micro')
    svm_fm=SVMfms.mean()
    print "F-measure:" ,svm_fm

    SVMaccs=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='accuracy')
    svm_acc=SVMaccs.mean()
    print "accuracy:" ,svm_acc

#---------------------------------RF---------------------------------

    clf=RandomForestClassifier(max_depth=6,random_state=1)
    clf.fit(X_train,y_train)
    y_pred=clf.predict(test)

#---------------------------------RF_scores--------------------------

    print "RF scores:"

    RFprecs=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='precision_micro')
    rf_prec=RFprecs.mean()
    print "precision:" ,rf_prec

    RFrecs=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='recall_micro')
    rf_rec=RFrecs.mean()
    print "recall:" ,rf_rec

    RFfms=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='f1_micro')
    rf_fm=RFfms.mean()
    print "F-measure:" ,rf_fm

    RFaccs=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='accuracy')
    rf_acc=RFaccs.mean()
    print "accuracy:" ,rf_acc

#----------------------------------MNB--------------------------------

    clf=MultinomialNB()
    clf.fit(X_trainNoLSI,y_train)
    y_pred=clf.predict(test_noLSI)

#----------------------------------MNB_scores-------------------------

    print "MNB scores:"

    MNBprecs=cross_val_score(clf, X_trainNoLSI, y_train, cv=k_fold, scoring='precision_micro')
    mnb_prec=MNBprecs.mean()
    print "precision:" ,mnb_prec

    MNBrecs=cross_val_score(clf, X_trainNoLSI, y_train, cv=k_fold, scoring='recall_micro')
    mnb_rec=MNBrecs.mean()
    print "recall:" ,mnb_rec

    MNBfms=cross_val_score(clf, X_trainNoLSI, y_train, cv=k_fold, scoring='f1_micro')
    mnb_fm=MNBfms.mean()
    print "F-measure:" ,mnb_fm

    MNBaccs=cross_val_score(clf, X_trainNoLSI, y_train, cv=k_fold, scoring='accuracy')
    mnb_acc=MNBaccs.mean()
    print "accuracy:" ,mnb_acc

#-----------------------------------K-Nearest_Neighbor------------------

    clf=knn.myKNN(10)			# K=10,check knn_functions.py(imported)
    clf.fit(X_train, y_train)
    y_pred=clf.predict(test)

#---------------------------------KNN_scores--------------------------

    print "KNN scores:"

    KNNprecs=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='precision_micro')
    knn_prec=KNNprecs.mean()
    print "precision:" ,knn_prec

    KNNrecs=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='recall_micro')
    knn_rec=KNNrecs.mean()
    print "recall:" ,knn_rec

    KNNfms=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='f1_micro')
    knn_fm=KNNfms.mean()
    print "F-measure:" ,knn_fm

    KNNaccs=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='accuracy')
    knn_acc=KNNaccs.mean()
    print "accuracy:" ,knn_acc

#----------------------------------------------------------------------
#                                   My Method
#----------------------------------------------------------------------
    #our method
    #strip punctuation from the data
    test_data['Content']=test_data['Content'].str.replace('[^\w\s]', '')
    train_data['Content']=train_data['Content'].str.replace('[^\w\s]', '')
    #convert multiple spaces to one
    test_data['Content']=test_data['Content'].str.replace('\s+', ' ')
    train_data['Content']=train_data['Content'].str.replace('\s+', ' ')

    #same process as before
    set(train_data['Category'])
    le=preprocessing.LabelEncoder()
    le.fit(train_data["Category"])
    y_train=le.transform(train_data["Category"])
    set(y_train)

    X_train=count_vectorizer.fit_transform(train_data['Content'])

    test=count_vectorizer.transform(test_data['Content'])
    #usage of MNB
    max=0.0
    maxi=0.0
    i=0.01
    #search for the best smoothing parameter(alpha)
    while i<1.0:
        clf=MultinomialNB(alpha=i)
        clf.fit(X_train,y_train)
        y_pred=clf.predict(test)
        myprecs=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='precision_micro')
        my_prec=myprecs.mean()
        if my_prec>max:
            max=my_prec
            maxi=i
        i+=0.01
    print "My Method scores:"

    clf=MultinomialNB(alpha=maxi, fit_prior=True)
    clf.fit(X_train,y_train)
    the_pred=clf.predict(test)

    print "precision:" ,max

    myrecs=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='recall_micro')
    my_rec=myrecs.mean()
    print "recall:" ,my_rec

    myfms=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='f1_micro')
    my_fm=myfms.mean()
    print "F-measure:" ,my_fm

    myaccs=cross_val_score(clf, X_train, y_train, cv=k_fold, scoring='accuracy')
    my_acc=myaccs.mean()
    print "accuracy:" ,my_acc

#------------------------------------CSV---------------------------------
    #my method csv
    output='testSet_categories.csv'
    predicted=le.inverse_transform(the_pred)
    testingfile=pd.DataFrame({'ID': test_data['Id'], 'Predicted_Category': list(predicted)}, columns=['ID', 'Predicted_Category'])
    testingfile.to_csv(output,encoding='utf-8',index=False,sep='\t')
    #results csv
    output='EvaluationMetric_10fold.csv'
    d={'StatisticMeasure': ['Accuracy','Precision','Recall','F-Measure'],'Naive Bayes':[mnb_acc,mnb_prec,mnb_rec,mnb_fm],'Random Forest':[rf_acc,rf_prec,rf_rec,rf_fm],'SVM': [svm_acc,svm_prec,svm_rec,svm_fm],'KNN': [knn_acc,knn_prec,knn_rec,knn_fm] ,'My Method': [my_acc,max,my_rec,my_fm]}
    df=pd.DataFrame(data=d,columns=['StatisticMeasure','Naive Bayes','Random Forest','SVM','KNN','My Method'])
    df.to_csv(output,encoding='utf-8',index=False,sep='|')
Example 13
    # Lower text and use translation table to remove all punctuation and digits
    text = text.lower().translate(t_table)
    # Best Stemmer for this dataset (Tested)
    stemmer = PorterStemmer()
#     stemmer = SnowballStemmer("english")
#     stemmer = LancasterStemmer()
    stems = [stemmer.stem(word.strip()) for word in text.split()]
    return stems


print('Creating stop words (NLTK & SKLEARN) ...')
# English stop words from NLTK
nltk_stop_words = stopwords.words('english')
# Combine stop words from all the stop word lists
stop_words = ENGLISH_STOP_WORDS.union(nltk_stop_words)

ngram = 2
min_df = 5
# Using idf
print('[PorterStemmer] Converting text documents to numerical feature vectors.... aka vectorizing...')
print('Ngrams = %i' % ngram)
print('min_df = %i' % min_df)

tfidf_vec = TfidfVectorizer(tokenizer=tokenizer, norm='l2', ngram_range=(1, ngram), sublinear_tf=True,
                            min_df=min_df, stop_words=stop_words)

# Fit the vectorizer on the combined train/test abstract data
tfidf_vec.fit(abstract_df.values)
# Transform training and test data set to numerical feature vectors
X_train_tfidf = tfidf_vec.transform(train_df['Abstract'].values)
Example 14
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

corpus = []

# stopwords_set = set(stopwords.words('english'))
# TODO: for this dataset an empty stopwords list gives the highest CV score
# stopwords_set = set()

# TODO: this list, with the negation words removed, scores higher than the nltk 'english' list
# TODO: but the dataset is too small, and the empty stopwords set is correct on only 1 or 2 more reviews
# TODO: so it is probably better to use the general approach with common stopwords
# TODO: the negation word parts should be kept, since ngrams of 2 words are used
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
stopwords_set = ENGLISH_STOP_WORDS.difference(
    ['not', 'no', 'nor', 'none', 'never', 'nothing', 'very'])

stemmer = PorterStemmer()
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    review = [
        stemmer.stem(word) for word in review if word not in stopwords_set
    ]
    review = ' '.join(review)
    corpus.append(review)

# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500, ngram_range=(1, 2))
Example 15
## Ingest ML papers data ##
###########################

data_fname = '../../data/papers.csv'
data = pd.read_csv(data_fname)
data.dropna(subset=['full_text'], inplace=True)

# %%

####################################################################
## Ingest and incorporate custom data science specific stop words ##
####################################################################

stopwords_fname = '../../data/ml_stopwords.csv'
add_stop_words = pd.read_csv(stopwords_fname)
new_stop_word_list = ENGLISH_STOP_WORDS.union(add_stop_words.Stopword.values)

# %%

###############################################
## Ingest custom keyword lists for ML topics ##
###############################################

# Load all custom keyword lists for ML topics to use as topic priors in LDA
topic_priors_dir = os.fsencode('../../data/topic_priors/')
topic_priors_df_list = []
base_weight = 100
for f in os.listdir(topic_priors_dir):
    fname = '../../data/topic_priors/' + os.fsdecode(f)
    topic_name = os.fsdecode(f).split('.')[0]
    topic_words = pd.read_csv(open(fname))
Example 16

def get_random_class_labels(num=8):
    return np.random.choice(class_labels_all, num, replace=False)


# ========================= STOP WORDS ========================= #

useless_words = set([
    'postgres', 'big', 'panda', 'using', 'scikit', 'sklearn', 'apache',
    'spark', 'lambda', 's3', 'does', 'looking', 'help', 'new', 'data',
    'science', 'scientist', 'machine', 'learning', 'use', 'need', 'engineer',
    'engineering'
])

custom_stop_words = ENGLISH_STOP_WORDS.union(useless_words).union(
    set(class_labels_all))


def load_sqlite(database, query=None, class_labels=None):

    try:
        connection = sqlite3.connect(database)
    except Exception as e:
        print(f"The error '{e}' occurred connecting")

    placeholders = ','.join('?' for label in class_labels)

    ### FIX ###
    # this query needs to be explicitly given in each notebook
    # to allow for different databases
    subreddit_query = """
Example 17
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD 
from sklearn.preprocessing import normalize 
from make_csv import *

emails = pd.read_csv('email_dataset.csv')
dataframe = pd.DataFrame(parse_into_emails(emails.message))
dataframe.drop(dataframe.query("body == '' | to == '' | from_ == ''").index, inplace=True)
stopwords = ENGLISH_STOP_WORDS.union([ 'hou', 'com', 'recipient'])
vect = TfidfVectorizer(analyzer='word', stop_words=stopwords, max_df=0.6, min_df=2)

X = vect.fit_transform(dataframe.body)
features = vect.get_feature_names()

n_clusters = 3
clf = KMeans(n_clusters=n_clusters,max_iter=100,init='k-means++', n_init=1)
labels = clf.fit_predict(X)  # cluster the TF-IDF vectors built above
clusters = {}
n = 0
for item in labels:
    if item in clusters:
        clusters[item].append(row_dict[n])
    else:
        clusters[item] = [row_dict[n]]
Example 18
print(len(have_cancel), 'records have "cancel*" in them')

canceled_cats = Counter([i['category'] for i in have_cancel])

sorted(canceled_cats.items(), key=itemgetter(1), reverse=True)[0:10]


# #Set up the vectorisers and classifiers
# The per-record text data is fairly sparse and the vocabulary is quite big overall, so it's worth trying different vectorisers. 

# In[102]:

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as ESW

ESW = ESW.union({'cancelled', 'canceled'})

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.cross_validation import cross_val_score, KFold

def my_tokeniser(string):
    '''
    This can be changed to result in more sophisticated word detection.
    For now, it just splits up into alpha-only chunks, strips numbers.
    Preserves hyphenated and apostrophed words but ignores other punct.
    Gets rid of single-char stuff.
    '''
    pattern = re.compile("[A-Za-z0-9\-']*[^\W]")
    return [i for i in re.findall(pattern, string) if i.isnumeric() == False and len(i) > 1]
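The snippet cuts off before the vectorisers are actually compared; a minimal sketch of such a comparison, reusing my_tokeniser and ESW from above, follows (compare_vectorisers, texts and labels are hypothetical names, and sklearn.model_selection is used here instead of the older cross_validation module):

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

def compare_vectorisers(texts, labels):
    # Try a plain count vectoriser and a TF-IDF vectoriser with the same
    # tokeniser and extended stop-word list defined above.
    candidates = {
        'counts': CountVectorizer(tokenizer=my_tokeniser, stop_words=list(ESW)),
        'tfidf': TfidfVectorizer(tokenizer=my_tokeniser, stop_words=list(ESW)),
    }
    for name, vectoriser in candidates.items():
        pipe = Pipeline([('vec', vectoriser), ('clf', MultinomialNB())])
        scores = cross_val_score(pipe, texts, labels, cv=5, scoring='f1_macro')
        print(name, scores.mean())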
Example 19
# Code used in part 2 of How I used machine learning to classify emails and turn them into insights.

from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd

from helpers import parse_into_emails
from query import EmailDataset

# Just like in part_1, read and preprocess emails
emails = pd.read_csv('split_emails.csv') 
email_df = pd.DataFrame(parse_into_emails(emails.message))
email_df.drop(email_df.query("body == '' | to == '' | from_ == ''").index, inplace=True)

stopwords = ENGLISH_STOP_WORDS.union(['ect', 'hou', 'com', 'recipient'])
vec = TfidfVectorizer(analyzer='word', stop_words=stopwords, max_df=0.3, min_df=2)
vec_train = vec.fit_transform(email_df.body)

# print out the vector of the first email
# print(vec_train[0:1])

# Find cosine similarity between the first email and all others.
cosine_sim = linear_kernel(vec_train[0:1], vec_train).flatten()
# print out the cosine similarities
# print(cosine_sim)

# Finding emails related to a query.
query = "john"

# Transform the query into a vector in the same TF-IDF space.
vec_query = vec.transform([query])
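The example ends with the transformed query; a minimal sketch (an assumption, not shown in this excerpt) of using that vector to rank the emails with the same cosine-similarity trick:

# Rank all emails by cosine similarity to the query vector.
query_sim = linear_kernel(vec_query, vec_train).flatten()
related_email_indices = query_sim.argsort()[::-1][:10]  # positions of the 10 most similar emails
print(email_df.body.iloc[related_email_indices])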
Example 20
# Import the vectorizer and default English stop words list
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

# Define the stop words
my_stop_words = ENGLISH_STOP_WORDS.union(
    ['airline', 'airlines', '@', 'am', 'pm'])

# Build and fit the vectorizers
vect1 = CountVectorizer(stop_words=my_stop_words)
vect2 = CountVectorizer(stop_words=ENGLISH_STOP_WORDS)
vect1.fit(tweets.text)
vect2.fit(tweets.negative_reason)

# Print the last 15 features from the first, and all from second vectorizer
print(vect1.get_feature_names()[-15:])
print(vect2.get_feature_names())
Example 21
    input2 = np.column_stack(
        (input_texts.reshape(-1, 1), target_texts.reshape(-1, 1)))
    df = pd.DataFrame(input2, columns=['Body', 'target'])

    mail_df = df.copy()
    # mail_df.drop(emails.query(
    #     "Body == '' | To == '' | 'Sender Email' == ''"
    # ).index, inplace=True)

    mail_df = mail_df[mail_df['Body'].isnull() == False]
    '''
    no stop words
    vect = TfidfVectorizer(stop_words='english', max_df=0.50, min_df=2)
    X = vect.fit_transform(mail_df.Body)
    '''
    stopwords = ENGLISH_STOP_WORDS.union(
        ['ect', 'hou', 'com', 'recipient', 'tom', 'mary', 'don'])
    vect = TfidfVectorizer(analyzer='word',
                           stop_words=stopwords,
                           max_df=0.3,
                           min_df=2)
    X = vect.fit_transform(mail_df.Body)

    def pca_scatter():
        X_dense = X.todense()
        coords = PCA(n_components=2).fit_transform(X_dense)

        plt.scatter(coords[:, 0], coords[:, 1], c='m')
        plt.show()

    def top_tfidf_feats(row, features, top_n=20):
        topn_ids = np.argsort(row)[::-1][:top_n]
Example 22
def main():
    stop_words = set(STOPWORDS)
    stop_words.update(ENGLISH_STOP_WORDS)
    #extra stop words
    extra_words=["said","say","seen","come","end","came","year","years","new","saying"]
    stop_words = stop_words.union(extra_words)

    df = pd.read_csv('/kaggle/input/question1/train.csv')
#     df=df.head(n=1000)



    cat_business = []
    cat_entertainment = []
    cat_health = []
    cat_technology = []
    #store the content for each category
    for index in range(len(df.Label)):
        cat = df.Label[index]
        if cat == "Business":
            cat_business.append(df.Content[index])
        elif cat == "Entertainment":
            cat_entertainment.append(df.Content[index])
        elif cat == "Health":
            cat_health.append(df.Content[index])
        elif cat == "Technology":
            cat_technology.append(df.Content[index])

    str_bus = ''.join(cat_business)
    str_ent = ''.join(cat_entertainment)
    str_hea = ''.join(cat_health)
    str_tec = ''.join(cat_technology)

    #produce wordcloud for each category
    cloud = WordCloud(stopwords=stop_words)

    w = cloud.generate(str_bus)
    plt.figure()
    plt.imshow(w)
    plt.title("Business")
    plt.axis("off")
    plt.savefig('/kaggle/working/Business.png')

    w = cloud.generate(str_ent)
    plt.figure()
    plt.imshow(w)
    plt.title("Entertainment")
    plt.axis("off")
    plt.savefig('/kaggle/working/Entertainment.png')

    w = cloud.generate(str_hea)
    plt.figure()
    plt.title("Health")
    plt.imshow(w)
    plt.axis("off")
    plt.savefig('/kaggle/working/Health.png')

    w = cloud.generate(str_tec)
    plt.figure()
    plt.imshow(w)
    plt.title("Technology")
    plt.axis("off")
    plt.savefig('/kaggle/working/Technology.png')
Example 23
from nltk.stem import WordNetLemmatizer


class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]


#vect = CountVectorizer(tokenizer=LemmaTokenizer())

#print('test')
# it was working correctly up to this point
my_stopwords = ENGLISH_STOP_WORDS.union(['@', '<br />'])
#porter = PorterStemmer()

vect = CountVectorizer(max_features=1000,
                       ngram_range=(1, 3),
                       stop_words=my_stopwords,
                       tokenizer=LemmaTokenizer())
#vect.fit(df_train.text)
X = vect.fit_transform(df_train.text)
X_test = vect.transform(df_test.text)  # reuse the vocabulary fitted on the training data

# Transform to an array
my_array = X.toarray()
my_array_test = X_test.toarray()
# Transform back to a dataframe, assign column names
X_df = pd.DataFrame(my_array, columns=vect.get_feature_names())
Example 24
globalDataNum = 100
train_data = pd.read_csv('train_set.csv', sep="\t")
test_data = pd.read_csv('test_set.csv', sep="\t")
train_data = train_data[0:globalDataNum]
test_data = test_data[0:globalDataNum]
categories = train_data.Category
ids = train_data.Id
compons = [2, 3, 5, 7, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100]
myData = train_data['Content'] + 5 * train_data['Title']  # weight the title by repeating it 5 times
#for question #4 - initialization
metrics_all = [[0 for x in range(5)] for y in range(4)]

#adding English stopwords
eng_stop_words = ENGLISH_STOP_WORDS
myStopWords = {'yes', 'just', "don't", 'didn'}
eng_stop_words = ENGLISH_STOP_WORDS.union(myStopWords)

#
#set(categories)
le = preprocessing.LabelEncoder()
le.fit(categories)
y = le.transform(categories)
set(y)
set(le.inverse_transform(y))
count_vectorizer = CountVectorizer(stop_words=eng_stop_words)
#count_vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000,min_df=2,stop_words=eng_stop_words,use_idf=True)
X = count_vectorizer.fit_transform(myData)
"""
svd = TruncatedSVD(100)
lsa = make_pipeline(svd, Normalizer(copy=False))
X_train_lsa = lsa.fit_transform(X)
Example 25
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import json
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud

with open('extraStopWords.json','r') as extraStopWords:
	extraStopWords = json.load(extraStopWords)
stopWords = ENGLISH_STOP_WORDS.union(extraStopWords)

categories = ['Politics','Film','Football','Business','Technology']

df = pd.read_csv('./datasets/train_set.csv', sep='\t')

for category in categories:
    print("Creating word cloud for: " + category + "." )
    c_df =  df[(df['Category']==category)]
    content = ' '.join(c_df['Title'].iloc[i] + ' ' + c_df['Content'].iloc[i] for i in range(len(c_df)))
    wordcloud = WordCloud(background_color="white", stopwords=stopWords).generate(content)
    plt.imsave('WordCloud_For:_'+category+'_.png', wordcloud)
    
print("Done!")
Example 26
import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, CountVectorizer
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.cluster import KMeans
#import matplotlib.pyplot as plt

# pd.options.display.max_columns = 30

stop_words = ENGLISH_STOP_WORDS.union({'king','german','brau','james',\
'brewery','company','brewing','house','bock','style','scotch','california','oktoberfest',\
'wee','special','english','american','hefeweizen','old','common','gose'})

scaler = StandardScaler()

class TopicModeler(object):
    """
    Topic Modeler
    ---------
    model : type of sklearn model to use (currently set up for LDA  but could be extended to NMF and others)
    vectorizer : sklearn vectorizer (currently set up for CountVectorizer but could be extended to use TfIDF)
    distance_func : sklearn.pairwise function for determining distance between documents
    """
    def __init__(self, model, vectorizer, distance_func=cosine_distances):
        self.model = model
        self.text = None
        self.names = None
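The class body is truncated here; a minimal instantiation sketch based only on the constructor signature shown above (lda_model, beer_vectorizer and topic_modeler are hypothetical names, and the LDA and vectorizer settings are assumptions):

# Wire the pieces together: LDA as the model, a stop-word-aware CountVectorizer,
# and cosine distance between documents.
lda_model = LatentDirichletAllocation(n_components=10, random_state=42)  # n_components is n_topics in older sklearn
beer_vectorizer = CountVectorizer(stop_words=list(stop_words))
topic_modeler = TopicModeler(lda_model, beer_vectorizer, distance_func=cosine_distances)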
Example 27
        continue
    for spam_file in files:
        print('file: {}'.format(spam_file))
        spam_df = pd.read_csv(os.path.join(subdir, spam_file), encoding='latin-1')
        spam_df.dropna(inplace=True)
        spam_df['label'] = 1
        df = pd.concat([spam_df, ham_df])
        del spam_df

        df = df.sample(frac=1)

        additional_stop_words = ['enron','vince','louise','attached','2000','2001','2002','2003','2004','2005','2006','2007','2008','2009','2010','2011','2012','2013','2014','2015','2016']
        additional_stop_words += ['koi8', 'http', 'windows', 'utf', 'nbsp', 'bruceg']
        more_words_file = open(os.path.join(os.getcwd(),'Results','features','removed_features_list.dmp'))
        more_words = [word.strip() for word in more_words_file.readlines()]  # strip newlines so the words match tokens
        more_words_file.close()
        additional_stop_words += more_words
        start_time = time.time()
        #vectorizer = CountVectorizer(stop_words=ENGLISH_STOP_WORDS.union(additional_stop_words), ngram_range=(1, 2))
        vectorizer = CountVectorizer(stop_words=ENGLISH_STOP_WORDS.union(additional_stop_words))
        y_train = df['label']
        X_train =  vectorizer.fit_transform(df['content'])
        del df 
        classifier = LogisticRegression()
        classifier.fit(X_train, y_train)
        print('Training time = {}'.format(time.time() - start_time))
        del X_train, y_train
        
        # print('Informative features')
        show_most_informative_features(vectorizer, classifier, spam_file)
Example 28
import re
import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.externals import joblib
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, CountVectorizer
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
from sklearn.model_selection import GridSearchCV, train_test_split
from debate_cleaning import *
from sklearn.feature_extraction.text import TfidfVectorizer

pd.options.display.max_columns = 40

stop_words = ENGLISH_STOP_WORDS.union({'redirects', 'mentions', 'locations','ve','know','don','way','think','going','just',
                                        'said','got','like','need','say','ll','america','want','sure','make','come','right',
                                        'let','did','look', 'actually','lot','does','people','fact','time','president',
                                        'country','united','states','american','trump'})

class TopicModeler(object):

    def __init__(self, model, vectorizer, distance_func=cosine_distances):
        self.model = model
        self.text = None
        self.titles = None
        self.vectorizer = vectorizer
        self.feature_names = None
        self.doc_probs = None
        self.distance_func = distance_func
        self.word_vec = None

    def set_feature_names(self):
Example 29
        b = get_article(records[num])
        val.append(b)
    df = build_df(val)

    #TODO stem/lemmatize
    stop_words = set()
    stop_words = stop_words.union({'house','just', 'like', 'did', 'time', 'saw', 'right', 'left', 
                                            'road', 'county', 'year', 'road','said', 'area', 'nt',
                                            'woods', 'heard', '2009', '2012', '2011', '2013', '2009', 'km',
                                            '07', '09', 'didnt', 'got', 'went', 'know'})
    stop_words = stop_words.union(set(df['year']))
    stop_words = stop_words.union(set(df['season']))
    stop_words = stop_words.union(set(df['month']))
    stop_words = stop_words.union(set(df['state']))
    stop_words = stop_words.union(set(df['county']))
    stop_words = ENGLISH_STOP_WORDS.union(stop_words)
    
    lemmer=WordNetLemmatizer()
    
    # tokenized = [word_tokenize(content.lower()) for content in corpus]
    # docs = [[word for word in words if word not in stop_words] for words in tokenized]
   
    # corp_lem = [wordnet.lemmatize(word) for word in word_tokenize(corpus.lower())]
    n_corp =[]
    for i in corpus:
        n_corp.append(re.sub('[^A-Za-z0-9]+', ' ', i).lower())
    corp=[]
    for i in n_corp:
        corp.append(lemmer.lemmatize(i))
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    X = vectorizer.fit_transform(corp)  # vectorize the cleaned, lemmatized corpus built above
Example 30
 def _remove_stopwords(self):
     """删除关键词中的停止词"""
     stop_words = sklearn_stopwords.union(nltk_stopwords.words('english'))
     self.terms = self.terms[~(self.terms['name'].isin(stop_words))]
     self.paper_term = self.paper_term[self.paper_term['term_id'].isin(self.terms.index)]
Example 31
skip_files = ['2012_spam.csv', '2013_spam.csv', 'ham.csv', 'ham_latin.csv']

for subdir, dirs, files in os.walk(yearly_spam_folder):
    if dirs != []:
        continue
    for spam_file in files:
        if spam_file not in skip_files:
            eprint('file: {}'.format(spam_file))
            spam_df = pd.read_csv(os.path.join(subdir, spam_file), encoding='latin-1')
            spam_df.dropna(inplace=True)
            spam_df['label'] = 1
            df = pd.concat([spam_df, ham_df])
            del spam_df

            df = df.sample(frac=1)

            additional_stop_words = ['enron','vince','louise','attached','2000','2001','2002','2003','2004','2005','2006','2007','2008','2009','2010','2011','2012','2013','2014','2015','2016']
            additional_stop_words += ['koi8', 'http', 'windows', 'utf', 'nbsp', 'bruceg']
            start_time = time.time()
            vectorizer = CountVectorizer(stop_words=ENGLISH_STOP_WORDS.union(additional_stop_words), ngram_range=(1, 2))
            y_train = df['label']
            X_train =  vectorizer.fit_transform(df['content'])
            del df 
            classifier = LogisticRegression()
            classifier.fit(X_train, y_train)
            eprint('Training time = {}'.format(time.time() - start_time))
            del X_train, y_train
            
            # print('Informative features')
            show_most_informative_features(vectorizer, classifier, spam_file)
Example 32
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from time import time
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import numpy as np

#add some basic stopwords
stopwords = set()
my_words = ["said", "say", "says"]

stopwords = ENGLISH_STOP_WORDS.union(my_words)

print "RandomForests..."
#load the datasets
train_data = pd.read_csv('./datasets/train_set.csv', sep="\t")

X = train_data["Content"]

#Transform Category from strings to numbers from 0-4
le = preprocessing.LabelEncoder()
le.fit(train_data["Category"])
y = le.transform(train_data["Category"])

#Split the train set by preserving the percentage of samples for each class.
n_folds = 10
folds = StratifiedKFold(n_splits=n_folds)
Example 33
    def crossValidation(self, model_tag):
        scores = []
        f1_scores = []
        skf = StratifiedKFold(n_splits=5, shuffle=True)
        additional_stop_words = [
            'https', 'http', 'amp', 'com', 'reddit', 'www'
        ]
        for train_index, test_index in skf.split(self.train_x, self.train_y):
            if model_tag == 0:
                model = Pipeline([
                    ("ngram",
                     CountVectorizer(stop_words=ENGLISH_STOP_WORDS.union(
                         additional_stop_words),
                                     ngram_range=(1, 3),
                                     min_df=2,
                                     max_features=1000)),
                    ("tfidf", TfidfTransformer()), ("clf", MultinomialNB())
                ])
            elif model_tag == 1:
                model = Pipeline([
                    ("ngram",
                     CountVectorizer(stop_words=ENGLISH_STOP_WORDS.union(
                         additional_stop_words),
                                     ngram_range=(1, 3),
                                     min_df=2,
                                     max_features=1000)),
                    ("tfidf", TfidfTransformer()),
                    ("clf", SVC(C=1.0, gamma='scale', kernel='linear'))
                ])
            elif model_tag == 2:
                model = Pipeline([
                    ("ngram",
                     CountVectorizer(stop_words=ENGLISH_STOP_WORDS.union(
                         additional_stop_words),
                                     ngram_range=(1, 3),
                                     min_df=2,
                                     max_features=1000)),
                    ("tfidf", TfidfTransformer()),
                    ("clf", AdaBoostClassifier())
                ])
            elif model_tag == 3:
                model = Pipeline([
                    ("ngram",
                     CountVectorizer(stop_words=ENGLISH_STOP_WORDS.union(
                         additional_stop_words),
                                     ngram_range=(1, 3),
                                     min_df=2,
                                     max_features=1000)),
                    ("tfidf", TfidfTransformer()),
                    ("clf",
                     SGDClassifier(loss='hinge',
                                   penalty='l1',
                                   alpha=1e-3,
                                   max_iter=1000,
                                   tol=1e-3))
                ])
            # SVC(C=1.0, kernel='linear', gamma='auto')
            # LinearSVC(penalty='l1', dual= False, max_iter=1000)
            # SVC(C=1.0, kernel='sigmoid', gamma='scale')
            model.fit(self.train_x[train_index], self.train_y[train_index])
            predicted = model.predict(self.train_x[test_index])
            scores.append(accuracy_score(predicted, self.train_y[test_index]))
            f1_scores.append(f1_score(self.train_y[test_index], predicted))

            # Start: This part is used for error analysis
            # count = 0
            # error_text = []
            # for k in range(len(predicted)):
            #     if predicted[k] != self.train_y[test_index][k]:
            #         count += 1
            #         error_text.append((k, self.train_x[test_index][k]))
            #     if count == 10:
            #         break
            #
            # for error in error_text:
            #     print(error)
            # End

            # Start: This part is used for top 20 feature output
            #self.print_top20(model.named_steps['ngram'],model.named_steps['clf'])
            features_matrix = model.named_steps['ngram'].fit_transform(
                self.train_x[train_index], self.train_y[train_index])
            top20_best = SelectKBest(chi2, k=20)
            top20_best.fit_transform(features_matrix,
                                     self.train_y[train_index])
            feature_names = model.named_steps['ngram'].get_feature_names()

            top20_index = [
                i for i, x in enumerate(top20_best.get_support()) if x
            ]
            # print("Top 20 features : %s" % (", ".join(feature_names[i] for i in top20_index)))
            # End

        print("\nF1-measure: ", mean(f1_scores))
        return (scores)
Example 34
def removeStopwordsNLTKSklearn(document):
    # dumb wrapper for just showing the union method
    stopwords = sklearn_stop_words.union(
        nltk.corpus.stopwords.words('english'))
    return removeStopwords(stopwords, document)
Example 35
 def get_stop_words(self):
     # Read a comma-separated list of names and add them to the sklearn stop words.
     with open('names.txt') as f:
         names = [x.strip() for x in f.read().split(',')]
     stop_words = ENGLISH_STOP_WORDS.union(names)
     return stop_words
Example 36
          "%22&page=1"
SUS_DEV_URL = "https://www.federalregister.gov/api/v1/documents.json?conditions%5Bagencies%" \
              "5D%5B%5D=commerce-department&conditions%5Bagencies%5D%5B%5D=defense-department" \
              "&conditions%5Bagencies%5D%5B%5D=national-aeronautics-and-space-administration" \
              "&conditions%5Bagencies%5D%5B%5D=health-and-human-services-department" \
              "&conditions%5Bagencies%5D%5B%5D=transportation-department" \
              "&conditions%5Bterm%5D=%22sustainable+development%22&page=1"
IOT_PATH = "files/data/iot.pkl"
SUS_DEV_PATH = "files/data/sus_dev.pkl"

# LDADE
TOKEN_PATTERN = re.compile(r"(?u)\b[a-zA-Z]{2}[a-zA-Z]+\b")
ITERATIONS = 100
ALPHA = None
BETA = None
STOP_WORDS = ENGLISH_STOP_WORDS.union(['software', 'engineering'])
N_TOPICS = 10
RANDOM_STATE = 1
AGENCY_MAP = {
    'Transportation Department': 'DOT',
    'Federal Transit Administration': 'FTA',
    'Commerce Department': 'DOC',
    'International Trade Administration': 'ITA',
    'Economic Development Administration': 'EDA',
    'National Oceanic and Atmospheric Administration': 'NOAA',
    'Federal Highway Administration': 'FHWA',
    'Interior Department': 'DOI',
    'Fish and Wildlife Service': 'FWS',
    'Defense Department': 'DOD',
    'Navy Department': 'USN',
    'Health and Human Services Department': 'HHS',
Example 37
 def add_stop_words(self):
     if self.stop_words is not None:
         words = self._split_on_spaces(self.stop_words)
         self.stop_words = ENGLISH_STOP_WORDS.union(words)