def get_feats(times_of_day, data_set):
    """Reads the gold-standard text file for each time of day and turns each tweet into a bag of words."""
    print("\n##### Reading {} files...".format(data_set))
    feats = list()
    stemmer = SnowballStemmer("dutch")  # defined but not applied below
    stop_words = set(stopwords.words('dutch'))
    tokenizer = TweetTokenizer(preserve_case=False)
    for item in times_of_day:
        c = 0
        with open(item + '/' + data_set + '.txt') as tweets_txt:
            for line in tweets_txt:
                c += 1
                tokens = tokenizer.tokenize(line)
                # Drop stopwords and purely numeric tokens
                filtered_tokens = [w for w in tokens
                                   if w not in stop_words and not onlyDigits(w)]
                #chars = [w for i in filtered_tokens for w in i]
                #bigrams = ngrams(filtered_tokens, 2)
                bag = bag_of_words(filtered_tokens)
                feats.append((bag, item))
                #if c == 5000:  # de-comment to cap the number of tweets per category during tests
                #    break
        print("{} {} tweets read".format(c, item))
    # return high_information(feats, times_of_day)
    return feats
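# A minimal usage sketch, assuming hypothetical time-of-day folders 'morning'
# and 'evening' (each holding a train.txt with one tweet per line) and that
# high_information and onlyDigits come from the same helper module the
# function above relies on:
categories = ['morning', 'evening']
train_feats = get_feats(categories, 'train')
high_info_feats = high_information(train_feats, categories)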
def read_files():
    feats = list()
    with open('OnionOrNot.csv', 'r', encoding='UTF-8') as csvfile:
        for line in csvfile:
            # Each line is assumed to end in ",<label>\n"; strip that suffix
            # to get the text and read the label from the fixed position
            data = line[:-3].lower()
            category = line[-2]
            if category == '0' or category == '1':
                tokens = word_tokenize(data)
                # Optional preprocessing steps, currently disabled:
                # stemming:
                #   stemmer = SnowballStemmer("english")
                #   tokens = [stemmer.stem(t) for t in tokens]
                # punctuation removal:
                #   tokens = [t for t in tokens if t not in set(string.punctuation)]
                # stopword removal:
                #   tokens = bag_of_non_stopwords(tokens)
                feats.append((bag_of_words(tokens), category))
    return feats
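# A minimal training sketch on the features returned above, assuming nltk is
# installed and OnionOrNot.csv sits in the working directory:
import random
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy

feats = read_files()
random.shuffle(feats)  # shuffle before splitting so both sets mix labels
cutoff = int(len(feats) * 0.75)
nb_classifier = NaiveBayesClassifier.train(feats[:cutoff])
print(accuracy(nb_classifier, feats[cutoff:]))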
def main():
    '''
    Entry point of the 'chitragoopt' executable script (defined in setup.py).
    Loads a pickled Naive Bayes classifier and classifies the text passed as
    the first command-line argument.
    '''
    lfeats = label_feats_from_corpus(movie_reviews)
    train_feats, test_feats = split_label_feats(lfeats, split=0.75)
    # nb_classifier = NaiveBayesClassifier.train(train_feats)
    print(sys.argv[1].split())
    negfeat = bag_of_words(sys.argv[1].split())
    # Pickles must be opened in binary mode
    with open('my_classifier.pickle', 'rb') as f:
        nb_classifier = pickle.load(f)
    print(accuracy(nb_classifier, test_feats))
    print(nb_classifier.classify(negfeat))
def read_files(categories, stopwords):
    feats = list()
    print("\n##### Reading files...")
    for category in categories:
        files = get_filenames_in_folder(category)
        num_tweets = 0
        for filename in files:
            with open("{}/{}".format(category, filename), encoding='Latin-1') as csvfile:
                csv_reader = csv.reader(csvfile, delimiter=',', quotechar='|')
                for tweet in csv_reader:
                    tweet_body = tweet[4]
                    tokens = word_tokenize(tweet_body)
                    # Lowercase all text, then filter out stopwords and punctuation
                    clean = [t.lower() for t in tokens
                             if t.lower() not in stopwords and t.lower() not in punct]
                    bag = bag_of_words(clean)
                    feats.append((bag, category))
                    num_tweets += 1
        print("  Category {}, amount of tweets = {}".format(category, num_tweets))
    print("  Total, %i tweets read" % (len(feats)))
    return feats
def read_files(categories, stopwords):
    train_feats = []
    test_feats = []
    for category in categories:
        files = get_filenames_in_folder('Raad/' + category)
        feats = []
        for f in files:
            with open('Raad/' + category + '/' + f, 'r', errors="ignore") as infile:
                data = infile.read().lower()
            # Replace punctuation and newlines with spaces
            for ch in '!"(),?:.\\;':
                data = data.replace(ch, " ")
            data = data.replace('\n', " ")
            # Remove stopwords as whole words (re.escape guards regex metacharacters)
            for word in stopwords:
                pattern = re.compile(r'\b{}\b'.format(re.escape(word)))
                data = pattern.sub(' ', data)
            tokens = word_tokenize(data)
            bag = bag_of_words(tokens)
            feats.append((bag, category))
        # 90/10 split per category, so both sets contain every category
        split = 0.9
        cutoff = int(len(feats) * split)
        train, test = feats[:cutoff], feats[cutoff:]
        train_feats = train_feats + train
        test_feats = test_feats + test
        #print("  Category %s, %i files read" % (category, len(feats)))
    return train_feats, test_feats
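# Usage sketch: the function above already returns a per-category 90/10 split,
# so the sets can be fed straight to a classifier. The category names are
# made up for illustration; a Raad/<category>/ folder layout is assumed:
from nltk.corpus import stopwords as nltk_stopwords
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy

train_feats, test_feats = read_files(['motie', 'amendement'],
                                     set(nltk_stopwords.words('dutch')))
classifier = NaiveBayesClassifier.train(train_feats)
print(accuracy(classifier, test_feats))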
def bigram(feats):
    bigrams = []
    for feat in feats:
        # bag_of_bigrams_words returns the unigrams plus bigram tuples;
        # strip the unigrams so only the bigrams remain
        bigramsandwords = bag_of_bigrams_words(list(feat[0].keys()))
        bigram_feats = bag_of_words_not_in_set(bigramsandwords, list(feat[0].keys()))
        cleanbigram = [' '.join((a, b)) for a, b in bigram_feats]
        bigrams.append(bag_of_words(cleanbigram))
    return bigrams
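# Quick sketch of what bigram() produces, assuming the featx helpers
# (bag_of_words, bag_of_bigrams_words, bag_of_words_not_in_set) are imported:
feats = [(bag_of_words(['the', 'quick', 'brown', 'fox']), 'pos')]
print(bigram(feats))
# e.g. [{'the quick': True, 'quick brown': True, 'brown fox': True}]
# (key order may vary with the bigram scoring function)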
def NaiveBayes(review_set, csvFilePathNltk, chIndex):
    print('[' + time.strftime("%H:%M:%S") + ']$ Starting NLTK Naive Bayes Classifier Sentiment Analyzer...')
    lfeats = label_feats_from_corpus(movie_reviews)
    train_feats, test_feats = split_label_feats(lfeats, split=0.75)
    # 10-fold cross-validation. sklearn.model_selection.KFold replaces the
    # removed cross_validation module; each fold trains on the selected
    # indices, and only the classifier from the final fold is kept.
    from sklearn.model_selection import KFold
    cv = KFold(n_splits=10, shuffle=True, random_state=None)
    for traincv, evalcv in cv.split(train_feats):
        nb_classifier = NaiveBayesClassifier.train([train_feats[i] for i in traincv])
    with open("Classifier-CV.pickle", "wb") as save_classifier:
        pickle.dump(nb_classifier, save_classifier)
    print('[' + time.strftime("%H:%M:%S") + ']$ Generating NLTK sentiment analysis based on ' + bookList[chIndex - 1] + '\'s ' + str(len(review_set)) + ' review(s).')
    sentiment_set = []
    for review in review_set:
        # Strip the module-level stopword list (sw) from the review text
        filtered_review = review
        for word in sw:
            filtered_review = filtered_review.replace(" " + word + " ", " ")
        diff_sw = len(filtered_review) / len(review)  # fraction of text kept
        feats = bag_of_words(word_tokenize(filtered_review))
        sentiment = nb_classifier.classify(feats)
        probs = nb_classifier.prob_classify(feats)
        pos_prob = round(probs.prob('pos'), 4)
        neg_prob = round(probs.prob('neg'), 4)
        neu_prob = round(pos_prob - neg_prob, 4)
        pct_red = round(100 - (100 * diff_sw), 2)  # percent of text removed as stopwords
        sentiment_set.append([sentiment, pos_prob, neg_prob, review[:-1]])
    if chIndex != 10:
        print('[' + time.strftime("%H:%M:%S") + ']$ Writing NLTK results file (csv) at ' + csvFilePathNltk)
        generateCSV(sentiment_set, csvFilePathNltk)
        print('[' + time.strftime("%H:%M:%S") + ']$ Results file (csv) successfully generated at ' + csvFilePathNltk)
    else:
        print(sentiment, pos_prob, neg_prob, neu_prob, filtered_review, pct_red)
    print('[' + time.strftime("%H:%M:%S") + ']$ Finished NLTK Naive Bayes Classifier Sentiment Analyzer!')
def read_files(categories):
    feats = list()
    print("\n##### Reading files...")
    for category in categories:
        files = get_filenames_in_folder('Volkskrant/' + category)
        num_files = 0
        for f in files:
            # In Python 3, pass the encoding to open() instead of calling
            # .decode() on the result of read()
            with open('Volkskrant/' + category + '/' + f, 'r', encoding='utf-8') as infile:
                data = infile.read()
            tokens = word_tokenize(data)
            bag = bag_of_words(tokens)
            feats.append((bag, category))
            #print(len(tokens))
            num_files += 1
            # De-comment the next two lines for quick tests: loads only N
            # documents per category instead of the whole collection, so it runs faster.
            # if num_files >= 50:
            #     break
        print("  Category %s, %i files read" % (category, num_files))
    return feats
def read_files():
    feats = list()
    print("\n##### Reading files...")
    with open('final_songdata2.pickle', 'rb') as pickle_file:
        song_data = pickle.load(pickle_file)
    files = get_filenames_in_folder('lyric_files')
    num_files = 0
    for f in files:
        with open('lyric_files/' + f, 'r') as lyric_file:
            data = lyric_file.read()
        song_id = int(f.split(".")[0])
        songwriter = song_data[song_id][2]
        # Remove all punctuation: \w+ keeps only word-character runs
        customTokenizer = RegexpTokenizer(r'\w+')
        tokens = customTokenizer.tokenize(data)
        # Lowercase everything
        tokens = [t.lower() for t in tokens]
        bag = bag_of_words(tokens)
        feats.append((bag, songwriter))
        num_files += 1
    return feats
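# Sketch: inspect which lyric words most strongly signal a songwriter,
# assuming the lyric_files/ folder and final_songdata2.pickle used above exist:
import random
from nltk.classify import NaiveBayesClassifier

feats = read_files()
random.shuffle(feats)
classifier = NaiveBayesClassifier.train(feats[:int(len(feats) * 0.9)])
classifier.show_most_informative_features(10)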
def read_files(categories):
    feats = list()
    print("\n##### Reading files...")
    for category in categories:
        files = get_filenames_in_folder('Volkskrant/' + category)
        num_files = 0
        for f in files:
            with open('Volkskrant/' + category + '/' + f, 'r', encoding='UTF-8') as infile:
                data = infile.read()
            tokens = word_tokenize(data)
            tokens = [token.lower() for token in tokens]
            # Keep only alphabetic tokens; this also drops the punctuation and
            # digit tokens. (Removing items from a list while iterating over
            # it skips elements, so a comprehension is used instead.)
            tokens = [token for token in tokens if token.isalpha()]
            bag = bag_of_words(tokens)
            feats.append((bag, category))
            #print(len(tokens))
            num_files += 1
            # De-comment the next two lines for quick tests: loads only N
            # documents per category instead of the whole collection, so it runs faster.
            # if num_files >= 50:
            #     break
        print("  Category %s, %i files read" % (category, num_files))
    print("  Total, %i files read" % (len(feats)))
    return feats
lfeats = label_feats_from_corpus(movie_reviews)
print(lfeats.keys())
# dict_keys(['neg', 'pos'])
train_feats, test_feats = split_label_feats(lfeats, split=0.75)
print(len(train_feats))
print(len(test_feats))
nb_classifier = NaiveBayesClassifier.train(train_feats)
print(nb_classifier.labels())
# ['neg', 'pos']
negfeat = bag_of_words(['the', 'plot', 'was', 'ludicrous'])
print(nb_classifier.classify(negfeat))
posfeat = bag_of_words(['kate', 'winslet', 'is', 'accessible'])
print(nb_classifier.classify(posfeat))
print(accuracy(nb_classifier, test_feats))
probs = nb_classifier.prob_classify(test_feats[0][0])
print(probs.samples())
print(probs.max())
print(probs.prob('pos'))
print(probs.prob('neg'))
def read_files(pre_processing=False, ner_tags=False, add_ngrams=False, train_data=True):
    """
    Reads the train or test split of the Dutch Abusive Language Corpus and
    returns bag-of-words features plus optional NER and n-gram sets.
    """
    feats = list()
    print("\n##### Reading files...")
    stop_words = set(stopwords.words('dutch'))
    # load in ner tagger
    ner_list = list()
    if ner_tags:
        nlp = spacy.load("nl_core_news_sm")
    # ngrams list
    ngrams_list = list()
    # load train data
    if train_data:
        file = "Dutch_Abusive_Language_Corpus_Train.tsv"
    # load test data
    else:
        file = "Dutch_Abusive_Language_Corpus_Test.tsv"
    with open(file, "r", encoding="utf-8") as f:
        data = f.readlines()
    print("Loaded", str(len(data)), "tweets")
    for line in data:
        line_split = line.split("\t")
        tweet_text = line_split[1].strip().lower()
        tweet_label = line_split[-1].strip()
        tokens = word_tokenize(tweet_text)
        print(tweet_label, line_split[-2].strip(), tweet_text)  # debug print of each tweet
        if tweet_label != "NA":
            # get ner tags for text
            if ner_tags:
                ner_set = set()
                parsed_string = nlp(tweet_text)
                for token in parsed_string:
                    if token.ent_type_ != "":
                        ner_set.add(token.ent_type_)
                ner_list.append(ner_set)
            # Perform pre-processing on the tweet
            if pre_processing:
                # lower and strip tokens
                tokens = [token.lower().strip() for token in tokens]
                # remove stopwords DEPRECATED
                # tokens = [token for token in tokens if token not in stop_words]
            # get n-grams for text
            if add_ngrams:
                ngrams_set = set()
                for grams in ngrams(tokens, 2):
                    ngrams_set.add(" ".join(grams))
                ngrams_list.append(ngrams_set)
            # Turn tokens into a bag of words
            bag = bag_of_words(tokens)
            feats.append((bag, tweet_label))
    print("Using", str(len(feats)), "tweets")
    return feats, ner_list, ngrams_list
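# Sketch of how the extra NER / n-gram features could be folded into the bags
# before training. The update step is an assumption; the function above only
# returns the three lists side by side, aligned per non-NA tweet when the
# corresponding flags are enabled:
feats, ner_list, ngrams_list = read_files(pre_processing=True,
                                          ner_tags=True, add_ngrams=True)
for (bag, label), ners, grams in zip(feats, ner_list, ngrams_list):
    bag.update({tag: True for tag in ners})
    bag.update({gram: True for gram in grams})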
# !pip install nltk
import nltk
# nltk.download('movie_reviews')
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
from featx import bag_of_words, label_feats_from_corpus, split_label_feats

lfeats = label_feats_from_corpus(movie_reviews)
train_feats, test_feats = split_label_feats(lfeats, split=0.75)

nb_classifier = NaiveBayesClassifier.train(train_feats)
print(nb_classifier.labels())

review1 = bag_of_words(['the', 'plot', 'was', 'ludicrous'])
print(nb_classifier.classify(review1))
review2 = bag_of_words(['kate', 'winslet', 'is', 'accessible'])
print(nb_classifier.classify(review2))

print(accuracy(nb_classifier, test_feats))