def train(self):
    # with open('pos_tweet.csv', encoding="utf8", mode='r') as pos_tweet:
    with open('datasets/pos_tweet.csv', mode='r') as pos_tweet:
        pos = csv.DictReader(pos_tweet, delimiter=',')
        for ptweet in pos:
            self.pos_tweets.append(
                (bag_of_words(ptweet['tweet']), 'positive'))

    # with open('neg_tweet.csv', encoding="utf8", mode='r') as neg_tweet:
    with open('datasets/neg_tweet.csv', mode='r') as neg_tweet:
        neg = csv.DictReader(neg_tweet, delimiter=',')
        for ntweet in neg:
            self.neg_tweets.append(
                (bag_of_words(ntweet['tweet']), 'negative'))

    # with open('neg_tweet.csv', encoding="utf8", mode='r') as neu_tweet:
    # with open('neg_tweet.csv', mode='r') as neu_tweet:
    #     neu = csv.DictReader(neu_tweet, delimiter=',')
    #     for neutweet in neu:
    #         self.neu_tweets.append((bag_of_words(neutweet['tweet']), 'neutral'))

    shuffle(self.pos_tweets)
    shuffle(self.neg_tweets)
    # shuffle(self.neu_tweets)

    self.all_train_set = self.pos_tweets + self.neg_tweets  # + self.neu_tweets
    trained = NaiveBayesClassifier.train(self.all_train_set)
    return trained
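
# A minimal sketch of the bag_of_words helper the trainer above relies on (it
# is not shown in the snippet). NLTK's NaiveBayesClassifier.train expects
# (featureset, label) pairs where the featureset is a dict, so a common shape
# is {word: True}. The tokenization below is an assumption, not the original.
def bag_of_words(tweet):
    # Lowercase the tweet, split on whitespace, and mark each token as present.
    return {word: True for word in tweet.lower().split()}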
def main(argv):
    f = False
    fs = ''
    ds = ''
    sample_size = 0
    method = None
    try:
        opts, args = getopt.getopt(argv, "hf", ["sample=", "fs=", "ds="])
    except getopt.GetoptError:
        print_help()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print_help()
            sys.exit()
        elif opt == '--ds':
            ds = arg
        elif opt == '--sample':
            sample_size = int(arg)
        elif opt == '--fs':
            fs = arg
        elif opt == '-f':
            f = True
    if ds == '1' and not f:
        dsb.generate_concat(sample_size)
    elif ds == '2' and not f:
        dsb.generate_one_question_per_line(sample_size)
    ds = 1 if ds == '' else ds
    if fs == '1':
        fe.bag_of_words(int(ds))
    elif ds and fs == '2':
        fe.word2vec(int(ds))
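
# Hypothetical invocations of the CLI above (the script name prepare.py and the
# entry-point guard below are assumptions; only main(argv) appears in the snippet):
#   python prepare.py --ds 1 --sample 1000 --fs 1   # concat dataset, then bag-of-words
#   python prepare.py --ds 2 --fs 2                 # one question per line, then word2vec
#   python prepare.py -f --fs 1                     # skip generation, extract features only
if __name__ == '__main__':
    main(sys.argv[1:])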
def test_bag_of_words(self):
    test_corpus = [
        'the sky is blue.',
        'sky is blue and sky is beautiful.',
        'the beautiful sky is so blue.',
        'i love blue cheese.'
    ]
    features = [
        Counter({'blue': 1, 'sky': 1}),
        Counter({'sky': 2, 'blue': 1, 'beautiful': 1}),
        Counter({'beautiful': 1, 'blue': 1, 'sky': 1}),
        Counter({'blue': 1})
    ]
    feature_names = set(['blue', 'beautiful', 'sky'])
    self.assertEqual(bag_of_words(test_corpus, num_feats=3),
                     (features, feature_names))
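
# A sketch of a bag_of_words(corpus, num_feats) that would satisfy the test
# above: tokenize, drop stopwords, keep the num_feats most frequent words as
# the feature set, and return one Counter per document plus the feature names.
# The regex tokenizer and the stopword list are assumptions; the expected
# output implies that words like 'the' and 'is' are filtered out.
import re
from collections import Counter

STOPWORDS = {'the', 'is', 'so', 'and', 'i'}

def bag_of_words(corpus, num_feats):
    # Tokenize each document and remove stopwords.
    docs = [[w for w in re.findall(r'[a-z]+', text.lower()) if w not in STOPWORDS]
            for text in corpus]
    # Keep only the num_feats most frequent words across the whole corpus.
    totals = Counter(w for doc in docs for w in doc)
    feature_names = {w for w, _ in totals.most_common(num_feats)}
    # Count the retained features per document.
    features = [Counter(w for w in doc if w in feature_names) for doc in docs]
    return features, feature_names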
def text_classification(mode):
    words_inputs, word_outputs = read_from_csv('reviews_mixed.csv')
    vocabulary = tokenize_sentences(words_inputs)
    bag = []
    for each in words_inputs:
        bag.append(bag_of_words(each, vocabulary))
    bag_train_input, bag_train_output, bag_test_input, bag_test_output = split_data(bag, word_outputs)

    if mode == 'kmeans':
        words_km = kmeans(k=2, similarity=jaccard_similarity, max_iterations=50)
        words_km.fit(bag_train_input)
        print(len(words_km.clusters[0]))
        print(len(words_km.clusters[1]))
        dictionary = {'first cluster': [], 'second cluster': []}
        for i in range(len(bag_test_input)):
            if words_km.centroids[0] == words_km.predictJaccard(bag_test_input[i]):
                dictionary['first cluster'].append(bag_test_output[i])
            else:
                dictionary['second cluster'].append(bag_test_output[i])
        print('FIRST CLUSTER :')
        for each in dictionary['first cluster']:
            print(each)
        print('SECOND CLUSTER :')
        for each in dictionary['second cluster']:
            print(each)
        print('Dunn index : ' + str(words_km.dunn_index()))

    if mode == 'log':
        classifier = MyLogisticRegression()
        numeric_output = [0 if bag_train_output[i] == 'negative' else 1
                          for i in range(len(bag_train_output))]
        classifier.fit(bag_train_input, numeric_output)
        accuracy = 0
        for i in range(len(bag_test_input)):
            if classifier.predictOneSample(bag_test_input[i]) > 0.5:
                computed = 'positive'
            else:
                computed = 'negative'
            real = bag_test_output[i]  # true label from the held-out test split
            if real == computed:
                accuracy += 1
                print("computed : " + computed + " real : " + real)
            else:
                print("computed : " + computed + " real : " + real + " WRONG")
        error = 1 - (accuracy / len(bag_test_input))
        print("error : " + str(error))
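
# One plausible jaccard_similarity for the k-means above, assuming each sample
# is a bag-of-words count vector over the shared vocabulary (a guess at the
# project's representation; the original helper is not shown). This is the
# generalized Jaccard index: sum of element-wise minima over sum of maxima.
def jaccard_similarity(a, b):
    num = sum(min(x, y) for x, y in zip(a, b))
    den = sum(max(x, y) for x, y in zip(a, b))
    return num / den if den else 0.0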
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()

    print("Reading data...")
    titles, bodies, tags_sets, _ = da.read_data(args.data)
    tags = [list(t)[0] for t in tags_sets]
    X_train, X_test, y_train, y_test = evaluation.cross_validation(zip(titles, bodies), tags)
    X_train_t, X_train_b = zip(*X_train)

    print("Generating features...")
    if args.feat == "bow":
        X, extractor = fe.bag_of_words(X_train_t, X_train_b)
    elif args.feat == "tfidf":
        X, extractor = fe.tfidf(X_train_t, X_train_b)
    elif args.feat == "bigram":
        X, extractor = fe.ngrams(X_train_t, X_train_b, n_upper=2)
    else:
        X, extractor = fe.ngrams(X_train_t, X_train_b, n_upper=3)

    print("Train...")
    if args.classifier == "knn":
        classifier = KNeighborsClassifier(n_neighbors=3)
    elif args.classifier == "log-reg":
        classifier = LogisticRegression(C=1e5)
    elif args.classifier == "dec-tree":
        classifier = DecisionTreeClassifier()
    else:
        classifier = MultinomialNB()  # assumed fallback, mirroring the companion snippet below
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()

    print("Reading data...")
    titles, bodies, tags_sets, _ = da.read_data(args.data, args.maxRows)
    tags = [list(t)[0] for t in tags_sets]
    X_train, X_test, y_train, y_test = evaluation.cross_validation(zip(titles, bodies), tags)
    X_train_t, X_train_b = zip(*X_train)

    print("Generating features...")
    if args.feat == "bow":
        X, extractor = fe.bag_of_words(X_train_t, X_train_b)
    elif args.feat == "tfidf":
        X, extractor = fe.tfidf(X_train_t, X_train_b)
    elif args.feat == "bigram":
        X, extractor = fe.ngrams(X_train_t, X_train_b, n_upper=2)
    else:
        X, extractor = fe.ngrams(X_train_t, X_train_b, n_upper=3)

    print("Train...")
    if args.classifier == "naive":
        classifier = MultinomialNB()
    classifier.fit(X, y_train)

    print("Test...")
    predictions = [classifier.predict(extractor.transform(t, b))[0] for t, b in X_test]
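
# A sketch of the feature-extractor contract both pipelines above assume:
# fe.bag_of_words(titles, bodies) returns a training matrix X plus an extractor
# whose transform(title, body) vectorizes a single unseen example, matching the
# extractor.transform(t, b) call in the test loop. The wrapper below is a guess
# built on sklearn's CountVectorizer, not the project's actual fe module.
from sklearn.feature_extraction.text import CountVectorizer

class BowExtractor(object):
    def __init__(self, titles, bodies):
        # Fit one vocabulary over title and body text joined per example.
        self.vectorizer = CountVectorizer()
        self.X = self.vectorizer.fit_transform(
            [t + ' ' + b for t, b in zip(titles, bodies)])

    def transform(self, title, body):
        # Vectorize a single (title, body) pair against the fitted vocabulary.
        return self.vectorizer.transform([title + ' ' + body])

def bag_of_words(titles, bodies):
    extractor = BowExtractor(titles, bodies)
    return extractor.X, extractor

# With the predictions list from the snippet above, a plain accuracy check
# (the metric choice is an assumption) would be:
#   correct = sum(1 for p, y in zip(predictions, y_test) if p == y)
#   print("Accuracy: %.3f" % (correct / float(len(y_test))))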
def get_tweet_sentiment(self, tweet):
    '''Utility function to classify the sentiment of a passed tweet.'''
    tweet_set = bag_of_words(self.clean_tweet(tweet))
    return self.classifier.classify(tweet_set)
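
# Hypothetical usage of the method above (the class name and wiring are
# assumptions; only train() and get_tweet_sentiment() appear in the snippets):
#   analyzer = TweetSentimentAnalyzer()
#   analyzer.classifier = analyzer.train()
#   print(analyzer.get_tweet_sentiment("I love this! Best day ever."))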