def train(self, training_documents, feature_extractor):
    logger.info('Creating training dataset, documents size {}'.format(
        len(training_documents)))
    training_set = nltk.classify.util.apply_features(
        feature_extractor.extract, training_documents)
    logger.info('Training classifier')
    options = ['-no-cv']
    self.classifier = nltk.WekaClassifier.train(
        'weka.model', training_set, 'log_regression', options)
def process_corpus_with_cross_validation(self, total_evaluation,
                                         metrics_table):
    if self.fold_number is None:
        folds = range(self.n_folds)
    else:
        folds = [self.fold_number]
    pos_fold_size = int(self.corpus_size * self.prop_of_pos / self.n_folds)
    neg_fold_size = int(self.corpus_size * self.prop_of_neg / self.n_folds)
    for i in folds:
        pos_test_start = pos_fold_size * i
        pos_test_end = pos_test_start + pos_fold_size
        neg_test_start = neg_fold_size * i
        neg_test_end = neg_test_start + neg_fold_size
        logger.info('Fold {}/{}, pos: {}..{}, neg: {}..{}'.format(
            i + 1, self.n_folds, pos_test_start, pos_test_end,
            neg_test_start, neg_test_end))
        training_documents = [(c, 'pos') for c in
                              self.pos_comments[:pos_test_start] +
                              self.pos_comments[pos_test_end:]]
        training_documents.extend([(c, 'neg') for c in
                                   self.neg_comments[:neg_test_start] +
                                   self.neg_comments[neg_test_end:]])
        if self.out_of_domain_test:
            # always test with the same set in every iteration
            pos_test_set = self.pos_comments_dom2[:pos_fold_size]
            neg_test_set = self.neg_comments_dom2[:neg_fold_size]
        else:
            pos_test_set = self.pos_comments[pos_test_start:pos_test_end]
            neg_test_set = self.neg_comments[neg_test_start:neg_test_end]
        evaluation = self.process_fold(training_documents, pos_test_set,
                                       neg_test_set)
        total_evaluation.update(evaluation)
        self.add_metrics(metrics_table, i, evaluation)
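# Worked example of the fold arithmetic above, with illustrative values:
# corpus_size=1000, prop_of_pos=0.5 and n_folds=5 give pos_fold_size =
# int(1000 * 0.5 / 5) = 100, so fold i=2 tests on pos_comments[200:300]
# and trains on pos_comments[:200] + pos_comments[300:].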
def process_corpus(self):
    evaluation = Evaluation('pos', 'neg')
    pos_size = int(self.corpus_size * self.prop_of_pos)
    neg_size = int(self.corpus_size * self.prop_of_neg)
    self.corpus = self.pos_comments[:pos_size] + self.neg_comments[:neg_size]
    self.pos_hits = self.hits(self.pos_words)
    self.neg_hits = self.hits(self.neg_words)
    pos_test_size = int(pos_size * 0.2)
    neg_test_size = int(neg_size * 0.2)
    pos_test_corpus = self.pos_comments[:pos_test_size]
    neg_test_corpus = self.neg_comments[:neg_test_size]
    tagged_pos_test_corpus = self.tag_test_corpus(pos_test_corpus)
    tagged_neg_test_corpus = self.tag_test_corpus(neg_test_corpus)
    evaluation = self.classify_corpus(pos_test_corpus,
                                      tagged_pos_test_corpus, 'pos',
                                      evaluation)
    evaluation = self.classify_corpus(neg_test_corpus,
                                      tagged_neg_test_corpus, 'neg',
                                      evaluation)
    logger.info('Total TestSet Size: {} - Avg Accuracy: {}'.format(
        evaluation.get_cases(), evaluation.get_accuracy_avg()))
    metrics_table = self.build_metrics_table()
    self.add_metrics(metrics_table, 'Total', evaluation)
    print metrics_table
    return evaluation
def train(self, training_documents, feature_extractor):
    logger.info('Creating training dataset, documents size {}'.format(
        len(training_documents)))
    training_set = nltk.classify.util.apply_features(
        feature_extractor.extract, training_documents)
    logger.info('Training classifier')
    # self.classifier = nltk.MaxentClassifier.train(
    #     training_set, algorithm='megam', explicit=True, bernoulli=True,
    #     model='multiclass')
    self.classifier = nltk.MaxentClassifier.train(training_set,
                                                  algorithm='megam')
    self.classifier.show_most_informative_features()
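# Note: the 'megam' algorithm shells out to the external megam binary, so
# NLTK must be able to locate it. If it is not on the PATH, something like
# the following is needed before train() is called (the path shown is only
# an assumption):
#   nltk.config_megam('/usr/local/bin/megam')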
def train(self, training_documents, feature_extractor):
    logger.info('Creating training dataset, documents size {}'.format(
        len(training_documents)))
    training_set = nltk.classify.util.apply_features(
        feature_extractor.extract, training_documents)
    logger.info('Training classifier')
    self.classifier = nltk.SvmClassifier.train(training_set)
    self.classifier.show_most_informative_features()
def preprocess_corpus(self):
    logger.info('Preprocessing corpus')
    processor = self.build_preprocessor()
    pos_size = int(self.corpus_size * self.prop_of_pos)
    neg_size = int(self.corpus_size * self.prop_of_neg)
    self.pos_comments = processor.process(self.pos_comments[:pos_size])
    self.neg_comments = processor.process(self.neg_comments[:neg_size])
    self.pos_comments_dom2 = processor.process(
        self.pos_comments_dom2[:pos_size])
    self.neg_comments_dom2 = processor.process(
        self.neg_comments_dom2[:neg_size])
def classify_comments(self, test_fold, test_comments):
    self.classifier = self.get_classifier(test_fold)
    # show_most_informative_features() prints directly and returns None,
    # so call it rather than logging its return value
    self.classifier.show_most_informative_features()
    evaluation = Evaluation('pos', 'neg')
    for comment, expected_klass in test_comments:
        klass = self.classifier.classify(comment)
        # Uncomment to inspect misclassified comments:
        # if klass != expected_klass:
        #     print 'expected class: %s, class: %s, comment: %s' % (
        #         expected_klass, klass, ' '.join(comment))
        evaluation.add(expected_klass, klass)
    return evaluation
def process_corpus(self):
    total_evaluation = Evaluation('pos', 'neg')
    metrics_table = self.build_metrics_table()
    if self.cross_validation:
        self.process_corpus_with_cross_validation(total_evaluation,
                                                  metrics_table)
    else:
        self.process_corpus_with_holdout_validation(total_evaluation)
    self.add_metrics(metrics_table, 'Total', total_evaluation)
    logger.info('Total TestSet Size: {} - Avg Accuracy: {}'.format(
        total_evaluation.get_cases(), total_evaluation.get_accuracy_avg()))
    print metrics_table
    return total_evaluation
def get_bag_of_words(self, training_documents):
    bag_of_words_freq = FreqDist()
    for w in sum([d[0] for d in training_documents], []):
        bag_of_words_freq.inc(w)
    # Adjective-only runs keep far fewer tokens, so lower the cutoff
    min_freq = 4 if self.adjectives else 10
    bag_of_words = filter(lambda x: bag_of_words_freq[x] > min_freq,
                          bag_of_words_freq.keys())
    bag_of_words = bag_of_words[:3000]
    logger.info('bag of words size: {}'.format(len(bag_of_words)))
    return bag_of_words
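# A minimal standalone sketch of the same frequency cutoff using only the
# standard library, for environments where FreqDist.inc() is unavailable
# (it was removed in NLTK 3). The name and defaults here are illustrative.
from collections import Counter

def build_bag_of_words(training_documents, min_freq=10, max_size=3000):
    # training_documents: list of (token_list, label) pairs
    freq = Counter(w for doc, _ in training_documents for w in doc)
    frequent = [w for w, c in freq.items() if c > min_freq]
    return frequent[:max_size]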
def process(self, corpus):
    # '|' is used below as the document separator for FreeLing, so strip
    # any pre-existing '|' from the text first
    full_text = ''
    for doc in corpus:
        doc_text = ' '.join(doc)
        doc_text = doc_text.replace('|', '')
        full_text += doc_text + '|\n'
    freeling_docs = self.freeling_processor.process_text(full_text)
    processed_corpus = []
    for i, doc in enumerate(corpus):
        logger.info('original doc: [' + ' '.join(doc) + ']')
        logger.info('processed doc: [' + ' '.join(
            "u'" + x.word + "'" for x in freeling_docs[i]) + ']')
        processed_corpus.append([self.extract_feature(term)
                                 for term in freeling_docs[i]
                                 if self.filter(term)])
    return processed_corpus
def _classify_using_weka(self, test_comments, feature_extractor):
    test_set = nltk.classify.util.apply_features(feature_extractor.extract,
                                                 test_comments)
    temp_dir = tempfile.mkdtemp()
    self.test_filename = os.path.join(temp_dir, 'test.arff')
    logger.info('Writing Test WEKA File: ' + self.test_filename)
    self._write_ARFF_file(self.test_filename, test_set)
    # '-p 0' makes WEKA output its predictions (0 = no extra attributes)
    cmd = [self.javaclass, '-t', self.train_filename,
           '-T', self.test_filename, '-p', '0']
    logger.info('Executing WEKA: ' + str(cmd))
    config_java(options='-Xmx2000M')
    (stdout, stderr) = java(cmd, classpath=weka_classpath,
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return self.parse_weka_output(stdout.split('\n'))
def load_corpus(self):
    self.pos_comments = self.load_comments(
        os.path.join(base_path, 'data', 'output', 'pos'))
    logger.info('Positive dataset loaded, size: {}'.format(
        len(self.pos_comments)))
    self.neg_comments = self.load_comments(
        os.path.join(base_path, 'data', 'output', 'neg'))
    logger.info('Negative dataset loaded, size: {}'.format(
        len(self.neg_comments)))
    if self.out_of_domain_test:
        self.pos_comments_dom2 = self.load_comments(
            os.path.join(base_path, 'data2', 'output', 'pos'))
        logger.info('Positive dataset 2 loaded, size: {}'.format(
            len(self.pos_comments_dom2)))
        self.neg_comments_dom2 = self.load_comments(
            os.path.join(base_path, 'data2', 'output', 'neg'))
        logger.info('Negative dataset 2 loaded, size: {}'.format(
            len(self.neg_comments_dom2)))
def train(self, training_documents, feature_extractor):
    logger.info('Creating training dataset, documents size {}'.format(
        len(training_documents)))
    # apply_features builds the feature sets lazily; materialize them
    # eagerly instead:
    # training_set = nltk.classify.util.apply_features(
    #     feature_extractor.extract, training_documents)
    training_set = []
    for document, label in training_documents:
        features = feature_extractor.extract(document)
        training_set.append((features, label))
    logger.info('Building classifier')
    if self.algorithm == 'nb':
        self.classifier = SklearnClassifier(MultinomialNB(), dtype=bool)
    elif self.algorithm == 'maxent':
        self.classifier = SklearnClassifier(LogisticRegression(),
                                            dtype=numpy.float64)
    elif self.algorithm == 'svm':
        self.classifier = SklearnClassifier(LinearSVC())
    elif self.algorithm == 'tree':
        # optimized version of the CART algorithm
        self.classifier = SklearnClassifier(DecisionTreeClassifier(),
                                            sparse=False)
        # Uncomment to export the fitted tree as a PDF:
        # dot_data = StringIO.StringIO()
        # tree.export_graphviz(self.classifier._clf, dot_data,
        #                      feature_names=self.classifier._feature_index.keys())
        # graph = pydot.graph_from_dot_data(dot_data.getvalue())
        # graph.write_pdf("test_export_graphvix.pdf")
    logger.info('Training classifier')
    self.classifier.train(training_set)
def train(self, training_documents, feature_extractor):
    logger.info('Creating training dataset, documents size {}'.format(
        len(training_documents)))
    training_set = nltk.classify.util.apply_features(
        feature_extractor.extract, training_documents)
    logger.info('Extracting features')
    self.features_names = sorted(feature_extractor.extract_features_names())
    temp_dir = tempfile.mkdtemp()
    self.train_filename = os.path.join(temp_dir, 'train.arff')
    logger.info('Writing Training WEKA File: ' + self.train_filename)
    self._write_ARFF_file(self.train_filename, training_set)
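# For reference, _write_ARFF_file is expected to emit WEKA's standard ARFF
# layout; roughly (attribute names depend on the extracted features, so
# this is only illustrative):
#   @RELATION comments
#   @ATTRIBUTE contains_bueno numeric
#   ...
#   @ATTRIBUTE class {pos,neg}
#   @DATA
#   1,0,...,pos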
def process_fold(self, training_documents, pos_test_comments,
                 neg_test_comments):
    feature_extractor = self.build_feature_extractor(training_documents)
    logger.info('Feature extractor: {}'.format(str(feature_extractor)))
    self.train(training_documents, feature_extractor)
    logger.info('Classifying')
    test_comments = zip(pos_test_comments, ['pos'] * len(pos_test_comments))
    test_comments.extend(zip(neg_test_comments,
                             ['neg'] * len(neg_test_comments)))
    evaluation = self.classify_comments(test_comments, feature_extractor)
    logger.info('TestSet Size: {} - Accuracy: {}'.format(
        evaluation.get_cases(), evaluation.get_accuracy_avg()))
    return evaluation
def train(self, training_documents, feature_extractor):
    logger.info('Creating training dataset, documents size {}'.format(
        len(training_documents)))
    labelsets = []
    feature_index = {}
    labels = ['pos', 'neg']
    label_index = {'pos': 0, 'neg': 1}
    # Dense boolean feature matrix: one row per document
    X = np.zeros((len(training_documents),
                  feature_extractor.get_features_size()), dtype=bool)
    for i, (document, label) in enumerate(training_documents):
        features = feature_extractor.extract(document)
        labelsets.append(label)
        for f, v in features.iteritems():
            if f not in feature_index:
                feature_index[f] = len(feature_index)
            X[i, feature_index[f]] = bool(v)
    logger.info('Building classifier')
    if self.algorithm == 'nb':
        self.classifier = OptSklearnClassifier(MultinomialNB(), dtype=bool)
    elif self.algorithm == 'maxent':
        self.classifier = OptSklearnClassifier(LogisticRegression(),
                                               dtype=np.float64)
    elif self.algorithm == 'svm':
        self.classifier = OptSklearnClassifier(LinearSVC(), sparse=False)
    elif self.algorithm == 'tree':
        # optimized version of the CART algorithm
        self.classifier = OptSklearnClassifier(DecisionTreeClassifier(),
                                               sparse=False)
        # Uncomment to export the fitted tree as a PDF:
        # dot_data = StringIO.StringIO()
        # tree.export_graphviz(self.classifier._clf, dot_data,
        #                      feature_names=self.classifier._feature_index.keys())
        # graph = pydot.graph_from_dot_data(dot_data.getvalue())
        # graph.write_pdf("test_export_graphvix.pdf")
    logger.info('Training classifier')
    self.classifier.train(labelsets, feature_index, labels, label_index, X)
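# If the dense boolean matrix above outgrows memory, a scipy.sparse matrix
# is a common substitute. A minimal sketch, assuming the same
# feature_extractor interface as train() above:
from scipy.sparse import lil_matrix

def build_sparse_matrix(training_documents, feature_extractor):
    # lil_matrix supports efficient incremental assignment like X[i, j] = v
    X = lil_matrix((len(training_documents),
                    feature_extractor.get_features_size()), dtype=bool)
    feature_index = {}
    for i, (document, _) in enumerate(training_documents):
        for f, v in feature_extractor.extract(document).items():
            index = feature_index.setdefault(f, len(feature_index))
            X[i, index] = bool(v)
    # Convert to CSR before handing the matrix to scikit-learn estimators
    return X.tocsr(), feature_index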
group.add_argument('-turney', help='Classification using Turney algorithm',
                   action='store_true')
parser.add_argument('-f', help='Number of folds for supervised algorithms '
                    'using k-fold cross validation. If this parameter is '
                    'not provided then holdout validation is performed.',
                    type=int)
parser.add_argument('-fn', help='Fold number for supervised algorithms '
                    'using k-fold cross validation', type=int)
parser.add_argument('-s', help='Corpus size', type=int)
parser.add_argument('-od', help='Out of domain testing', action='store_true')
parser.add_argument('-u', help='Use top training unigrams as feature '
                    'extractor', action='store_true')
parser.add_argument('-wf', help='Use top training unigram frequencies as '
                    'feature extractor', action='store_true')
parser.add_argument('-docbi', help='Use document bigrams as feature '
                    'extractor', action='store_true')
parser.add_argument('-bi', help='Use top training bigrams as feature '
                    'extractor', action='store_true')
parser.add_argument('-sw', help='Remove stop words', action='store_true')
parser.add_argument('-wl', help='Filter words by minimum length', type=int)
parser.add_argument('-dc', help='Remove duplicated characters',
                    action='store_true')
parser.add_argument('-neg', help='Preprocess negations', action='store_true')
parser.add_argument('-stem', help='Use stemmed words', action='store_true')
parser.add_argument('-lc', help='Transform chars to lower case',
                    action='store_true')
parser.add_argument('-punct', help='Remove punctuation marks',
                    action='store_true')
parser.add_argument('-acc', help='Remove Spanish accents',
                    action='store_true')
parser.add_argument('-lemma', help='Use lemmatized words',
                    action='store_true')
parser.add_argument('-adj', help='Use just adjectives', action='store_true')
parser.add_argument('-allprepro', help='Apply all preprocessors',
                    action='store_true')
parser.add_argument('-pp', help='Proportion of positive comments for '
                    'unbalanced experiments', type=Decimal, default=0.5)
args = parser.parse_args()
logger.info('Starting Sentiment Analysis Process. Params: ' + str(args))
main(args.nb, args.weka, args.megam, args.svmlight, args.sklearn,
     args.turney, args.f, args.s, args.fn, args.sw, args.u, args.wf,
     args.docbi, args.bi, args.wl, args.dc, args.neg, args.stem, args.lc,
     args.punct, args.acc, args.lemma, args.adj, args.allprepro, args.od,
     args.pp)
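# Illustrative invocation (the script name is an assumption; the flags are
# the ones defined above): 10-fold cross validation over 2000 comments with
# scikit-learn, unigram features, stop-word removal and lowercasing:
#   python sentiment_analysis.py -sklearn -f 10 -s 2000 -u -sw -lc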