    def train(self, training_documents, feature_extractor):
        logger.info('Creating training dataset, documents size {}'.format(len(training_documents)))
        training_set = nltk.classify.util.apply_features(feature_extractor.extract, training_documents)
        logger.info('Training classifier')
        # '-no-cv' asks the WEKA wrapper to skip WEKA's own cross validation;
        # folds are handled by the surrounding experiment code
        options = ['-no-cv']
        self.classifier = nltk.WekaClassifier.train('weka.model', training_set, 'log_regression', options)
Example #2
    def process_corpus_with_cross_validation(self, total_evaluation, metrics_table):
        folds = []

        if self.fold_number is None:
            folds = range(self.n_folds)
        else:
            folds.append(self.fold_number)

        pos_fold_size = int(self.corpus_size*self.prop_of_pos / self.n_folds)
        neg_fold_size = int(self.corpus_size*self.prop_of_neg / self.n_folds)
        for i in folds:
            pos_test_start = pos_fold_size * i
            pos_test_end = pos_test_start + pos_fold_size

            neg_test_start = neg_fold_size * i
            neg_test_end = neg_test_start + neg_fold_size

            logger.info('Fold {}/{}, pos: {}..{}, neg: {}..{}'.format(i + 1, self.n_folds, pos_test_start, pos_test_end, neg_test_start, neg_test_end))

            training_documents = [(c, 'pos') for c in self.pos_comments[:pos_test_start] + self.pos_comments[pos_test_end:]]
            training_documents.extend([(c, 'neg') for c in self.neg_comments[:neg_test_start] + self.neg_comments[neg_test_end:]])

            if self.out_of_domain_test:
                # always test against the same out-of-domain set on every fold
                pos_test_set = self.pos_comments_dom2[:pos_fold_size]
                neg_test_set = self.neg_comments_dom2[:neg_fold_size]
            else:
                pos_test_set = self.pos_comments[pos_test_start:pos_test_end]
                neg_test_set = self.neg_comments[neg_test_start:neg_test_end]

            evaluation = self.process_fold(training_documents, pos_test_set, neg_test_set)
            total_evaluation.update(evaluation)

            self.add_metrics(metrics_table, i, evaluation)
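A standalone sketch of the slicing scheme above: fold i is held out for testing while the remaining slices form the training set. The names here (fold_slices, comments) are illustrative, not from the original class.

def fold_slices(comments, n_folds):
    # fold i occupies comments[start:end]; everything else is training data
    fold_size = len(comments) // n_folds
    for i in range(n_folds):
        start, end = fold_size * i, fold_size * (i + 1)
        yield comments[:start] + comments[end:], comments[start:end]

for training, test in fold_slices(list(range(10)), 5):
    print(training, test)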
Example #3
    def process_corpus(self):
        evaluation = Evaluation('pos', 'neg')

        pos_size = int(self.corpus_size * self.prop_of_pos)
        neg_size = int(self.corpus_size * self.prop_of_neg)

        self.corpus = self.pos_comments[:pos_size] + self.neg_comments[:neg_size]

        self.pos_hits = self.hits(self.pos_words)
        self.neg_hits = self.hits(self.neg_words)

        pos_test_size = int(pos_size * 0.2)
        neg_test_size = int(neg_size * 0.2)
        pos_test_corpus = self.pos_comments[:pos_test_size]
        neg_test_corpus = self.neg_comments[:neg_test_size]

        tagged_pos_test_corpus = self.tag_test_corpus(pos_test_corpus)
        tagged_neg_test_corpus = self.tag_test_corpus(neg_test_corpus)

        evaluation = self.classify_corpus(pos_test_corpus,
                                          tagged_pos_test_corpus, 'pos',
                                          evaluation)
        evaluation = self.classify_corpus(neg_test_corpus,
                                          tagged_neg_test_corpus, 'neg',
                                          evaluation)

        logger.info('Total TestSet Size: {} - Avg Accuracy: {}'.format(
            evaluation.get_cases(), evaluation.get_accuracy_avg()))

        metrics_table = self.build_metrics_table()
        self.add_metrics(metrics_table, 'Total', evaluation)
        print metrics_table

        return evaluation
Example #5
    def train(self, training_documents, feature_extractor):
        logger.info('Creating training dataset, documents size {}'.format(len(training_documents)))
        training_set = nltk.classify.util.apply_features(feature_extractor.extract, training_documents)
        logger.info('Training classifier')
        # self.classifier = nltk.MaxentClassifier.train(training_set, algorithm='megam', explicit=True, bernoulli=True, model='multiclass')
        self.classifier = nltk.MaxentClassifier.train(training_set, algorithm='megam')
        self.classifier.show_most_informative_features()
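The 'megam' algorithm shells out to an external megam binary that NLTK must be able to locate. A minimal setup sketch; the binary path below is an assumption for illustration, not part of the original project:

from nltk.classify.megam import config_megam

# Tell NLTK where megam lives before calling
# MaxentClassifier.train(..., algorithm='megam')
config_megam('/usr/local/bin/megam')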
Example #6
    def train(self, training_documents, feature_extractor):
        logger.info('Creating training dataset, documents size {}'.format(
            len(training_documents)))
        training_set = nltk.classify.util.apply_features(
            feature_extractor.extract, training_documents)
        logger.info('Training classifier')
        # nltk.SvmClassifier exists in NLTK 2.x only; NLTK 3 removed it in
        # favour of the scikit-learn bridge (see the sketch below)
        self.classifier = nltk.SvmClassifier.train(training_set)
        self.classifier.show_most_informative_features()
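Under NLTK 3, the closest replacement for the removed SvmClassifier is the scikit-learn bridge. A sketch, not the original code:

from nltk.classify import SklearnClassifier
from sklearn.svm import LinearSVC

# drop-in style replacement: train() accepts the same (features, label) pairs
classifier = SklearnClassifier(LinearSVC())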
Example #8
    def preprocess_corpus(self):
        logger.info("Preprocessing corpus")
   
        processor = self.build_preprocessor()

        self.pos_comments = processor.process(self.pos_comments[:int(self.corpus_size*self.prop_of_pos)])
        self.neg_comments = processor.process(self.neg_comments[:int(self.corpus_size*self.prop_of_neg)])

        self.pos_comments_dom2 = processor.process(self.pos_comments_dom2[:int(self.corpus_size*self.prop_of_pos)])
        self.neg_comments_dom2 = processor.process(self.neg_comments_dom2[:int(self.corpus_size*self.prop_of_neg)])
Example #9
    def classify_comments(self, test_fold, test_comments):
        self.classifier = self.get_classifier(test_fold)
        # show_most_informative_features() prints directly and returns None,
        # so it is called on its own rather than passed through logger.info
        self.classifier.show_most_informative_features()

        evaluation = Evaluation('pos', 'neg')
        for comment, expected_klass in test_comments:
            klass = self.classifier.classify(comment)
            # if klass != expected_klass:
            #     print 'expected class: %s, class: %s, comment: %s' % (expected_klass, klass, " ".join(comment))
            evaluation.add(expected_klass, klass)
        return evaluation
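Evaluation is project-specific and not included in these excerpts. A hypothetical minimal stand-in covering only the calls used here (add, update, get_cases, get_accuracy_avg):

class Evaluation(object):
    # Hypothetical stand-in; the project's real Evaluation class likely
    # tracks per-class precision/recall as well.
    def __init__(self, *klasses):
        self.klasses = klasses
        self.cases = 0
        self.hits = 0

    def add(self, expected, predicted):
        self.cases += 1
        if expected == predicted:
            self.hits += 1

    def update(self, other):
        self.cases += other.cases
        self.hits += other.hits

    def get_cases(self):
        return self.cases

    def get_accuracy_avg(self):
        return float(self.hits) / self.cases if self.cases else 0.0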
Example #10
    def process_corpus(self):
        total_evaluation = Evaluation('pos', 'neg')
        metrics_table = self.build_metrics_table()

        if self.cross_validation:
            self.process_corpus_with_cross_validation(total_evaluation, metrics_table)
        else:
            self.process_corpus_with_holdout_validation(total_evaluation)

        self.add_metrics(metrics_table, 'Total', total_evaluation)

        logger.info('Total TestSet Size: {} - Avg Accuracy: {}'.format(total_evaluation.get_cases(), total_evaluation.get_accuracy_avg()))
        print metrics_table
        return total_evaluation
Example #11
    def get_bag_of_words(self, training_documents):
        # FreqDist.inc() is the NLTK 2.x API; NLTK 3 would use freq[w] += 1
        bag_of_words_freq = FreqDist()
        for document, _label in training_documents:
            for w in document:
                bag_of_words_freq.inc(w)

        min_freq = 10
        if self.adjectives:
            min_freq = 4

        # keys() of an NLTK 2 FreqDist is sorted by decreasing frequency,
        # so the slice below keeps the most frequent surviving words
        bag_of_words = filter(lambda x: bag_of_words_freq[x] > min_freq, bag_of_words_freq.keys())

        bag_of_words = bag_of_words[:3000]
        logger.info('bag of words size: {}'.format(len(bag_of_words)))

        return bag_of_words
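For reference, a standalone NLTK 3 equivalent of the counting and filtering above; documents is an assumed list of (tokens, label) pairs, and the threshold and cap mirror the original:

from nltk import FreqDist

def bag_of_words_nltk3(documents, min_freq=10, cap=3000):
    # FreqDist accepts an iterable of tokens; inc() no longer exists in NLTK 3
    freq = FreqDist(w for tokens, _label in documents for w in tokens)
    # most_common() replaces the frequency-sorted keys() of the NLTK 2 FreqDist
    words = [w for w, count in freq.most_common() if count > min_freq]
    return words[:cap]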
Example #12
    def process(self, corpus):
        # Join each document into a single line, stripping '|' from the text
        # and appending it as a delimiter, presumably so the Freeling wrapper
        # can split its output back into one token list per document.
        full_text = ''
        for doc in corpus:
            doc_text = ' '.join(doc)
            doc_text = doc_text.replace('|', '')
            full_text += doc_text + '|\n'

        freeling_docs = self.freeling_processor.process_text(full_text)

        processed_corpus = []
        for i, doc in enumerate(corpus):
            logger.info("original doc: [" + ' '.join(doc) + "]")
            logger.info("processed doc: [" + ' '.join("u'" + x.word + "'" for x in freeling_docs[i]) + "]")
            processed_corpus.append([self.extract_feature(term) for term in freeling_docs[i]
                                     if self.filter(term)])
        return processed_corpus
Example #13
    def _classify_using_weka(self, test_comments, feature_extractor):
        test_set = nltk.classify.util.apply_features(feature_extractor.extract, test_comments)

        temp_dir = tempfile.mkdtemp()
        self.test_filename = os.path.join(temp_dir, 'test.arff')

        logger.info('Writing Test WEKA File: ' + self.test_filename)
        self._write_ARFF_file(self.test_filename, test_set)

        # '-t' train file, '-T' test file; '-p 0' prints one prediction per
        # test instance, appending none of the input attributes
        cmd = [self.javaclass, '-t', self.train_filename, '-T', self.test_filename] + ['-p', '0']

        logger.info('Executing WEKA: ' + str(cmd))

        # config_java/java are NLTK's JVM helpers; give WEKA a 2 GB heap
        config_java(options='-Xmx2000M')
        (stdout, stderr) = java(cmd, classpath=weka_classpath,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)

        return self.parse_weka_output(stdout.split('\n'))
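This method only runs with supporting imports along these lines; weka_classpath pointing at the WEKA jar is an assumption of this sketch:

import os
import subprocess
import tempfile

import nltk.classify.util
# NLTK's thin wrappers for launching a JVM, used by _classify_using_weka above
from nltk.internals import config_java, java

# assumed location of the WEKA jar; adjust to the local installation
weka_classpath = '/opt/weka/weka.jar'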
Example #14
    def load_corpus(self):
        self.pos_comments = self.load_comments(os.path.join(base_path, 'data', 'output', 'pos'))
        logger.info('Positive dataset loaded, size: {}'.format(len(self.pos_comments)))

        self.neg_comments = self.load_comments(os.path.join(base_path, 'data', 'output', 'neg'))
        logger.info('Negative dataset loaded, size: {}'.format(len(self.neg_comments)))

        if self.out_of_domain_test:
            self.pos_comments_dom2 = self.load_comments(os.path.join(base_path, 'data2', 'output', 'pos')) 
            logger.info('Positive dataset 2 loaded, size: {}'.format(len(self.pos_comments_dom2)))

            self.neg_comments_dom2 = self.load_comments(os.path.join(base_path, 'data2', 'output', 'neg'))
            logger.info('Negative dataset 2 loaded, size: {}'.format(len(self.neg_comments_dom2)))
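load_comments is not shown in these excerpts. A plausible minimal version, assuming one tokenized comment per file under the given directory (entirely hypothetical):

import os

def load_comments(path):
    # Hypothetical reconstruction: each file holds one comment; return it
    # as a list of whitespace-separated tokens.
    comments = []
    for name in sorted(os.listdir(path)):
        with open(os.path.join(path, name)) as f:
            comments.append(f.read().split())
    return comments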
Example #15
    def train(self, training_documents, feature_extractor):
        logger.info('Creating training dataset, documents size {}'.format(
            len(training_documents)))
        #training_set = nltk.classify.util.apply_features(feature_extractor.extract, training_documents)

        training_set = []
        for document, label in training_documents:
            features = feature_extractor.extract(document)
            training_set.append((features, label))

        logger.info('Building classifier')
        if self.algorithm == 'nb':
            self.classifier = SklearnClassifier(MultinomialNB(), dtype=bool)
        elif self.algorithm == 'maxent':
            self.classifier = SklearnClassifier(LogisticRegression(),
                                                dtype=numpy.float64)
        elif self.algorithm == 'svm':
            self.classifier = SklearnClassifier(LinearSVC())
        elif self.algorithm == 'tree':
            self.classifier = SklearnClassifier(
                DecisionTreeClassifier(),
                sparse=False)  #optimized version of the CART algorithm
            #dot_data = StringIO.StringIO()
            #tree.export_graphviz(self.classifier._clf, dot_data, feature_names=self.classifier._feature_index.keys())
            #graph = pydot.graph_from_dot_data(dot_data.getvalue())
            #graph.write_pdf("test_export_graphvix.pdf")

        logger.info('Training classifier')
        self.classifier.train(training_set)
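A self-contained toy run of the same NLTK/scikit-learn bridge, showing the (feature dict, label) pairs the loop above builds; the data is invented for illustration:

from nltk.classify import SklearnClassifier
from sklearn.linear_model import LogisticRegression

# invented toy training pairs in the same (features, label) shape
training_set = [({'bueno': True, 'excelente': True}, 'pos'),
                ({'malo': True, 'horrible': True}, 'neg')]

classifier = SklearnClassifier(LogisticRegression())
classifier.train(training_set)
print(classifier.classify({'bueno': True}))  # expected: 'pos'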
Example #18
    def train(self, training_documents, feature_extractor):
        logger.info('Creating training dataset, documents size {}'.format(len(training_documents)))
        training_set = nltk.classify.util.apply_features(feature_extractor.extract, training_documents)

        logger.info('Extracting features')
        self.features_names = sorted(feature_extractor.extract_features_names())

        temp_dir = tempfile.mkdtemp()
        self.train_filename = os.path.join(temp_dir, 'train.arff')

        logger.info('Writing Training WEKA File: ' + self.train_filename)
        self._write_ARFF_file(self.train_filename, training_set)
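_write_ARFF_file is not included in these excerpts. A hypothetical minimal writer for boolean features, to show the shape of the files WEKA is given (the real implementation may differ):

def write_arff(filename, feature_names, featuresets):
    # Hypothetical sketch: one boolean @ATTRIBUTE per feature, plus the class
    with open(filename, 'w') as f:
        f.write('@RELATION comments\n')
        for name in feature_names:
            f.write('@ATTRIBUTE "{}" {{True, False}}\n'.format(name))
        f.write('@ATTRIBUTE class {pos, neg}\n@DATA\n')
        for features, label in featuresets:
            row = ['True' if features.get(name) else 'False' for name in feature_names]
            f.write(','.join(row + [label]) + '\n')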
Example #20
    def process_fold(self, training_documents, pos_test_comments, neg_test_comments):
        feature_extractor = self.build_feature_extractor(training_documents)

        logger.info('Feature extractor: {}'.format(str(feature_extractor)))
        self.train(training_documents, feature_extractor)

        logger.info('Classifying')

        # zip() returns a list under Python 2, which makes extend() valid;
        # Python 3 would need list(zip(...)) here
        test_comments = zip(pos_test_comments, ['pos'] * len(pos_test_comments))
        test_comments.extend(zip(neg_test_comments, ['neg'] * len(neg_test_comments)))

        evaluation = self.classify_comments(test_comments, feature_extractor)

        logger.info('TestSet Size: {} - Accuracy: {}'.format(evaluation.get_cases(), evaluation.get_accuracy_avg()))

        return evaluation
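A version-agnostic way to build the same labeled test list, shown standalone with invented sample data:

pos_test_comments = [['muy', 'bueno'], ['excelente']]  # invented samples
neg_test_comments = [['muy', 'malo']]

# list comprehensions work identically on Python 2 and 3
test_comments = [(c, 'pos') for c in pos_test_comments]
test_comments += [(c, 'neg') for c in neg_test_comments]
print(test_comments)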
Example #21
    def train(self, training_documents, feature_extractor):
        logger.info('Creating training dataset, documents size {}'.format(len(training_documents)))

        featuresets = []
        labelsets = []
        feature_index = {}
        labels = ['pos', 'neg']
        label_index = {'neg': 1, 'pos': 0}
        # Preallocate the dense boolean design matrix; get_features_size()
        # must bound the number of distinct features indexed below
        X = np.zeros((len(training_documents), feature_extractor.get_features_size()), dtype=bool)
        for i, (document, label) in enumerate(training_documents):
            features = feature_extractor.extract(document)
            featuresets.append(features)
            labelsets.append(label)

            # iteritems() is Python 2; features maps feature name -> value
            for f, v in features.iteritems():
                if f not in feature_index:
                    feature_index[f] = len(feature_index)

                X[i, feature_index[f]] = bool(v)
                
        logger.info('Building classifier')
        if self.algorithm == 'nb':
            self.classifier = OptSklearnClassifier(MultinomialNB(), dtype=bool)
        elif self.algorithm == 'maxent':
            self.classifier = OptSklearnClassifier(LogisticRegression(), dtype=np.float64)
        elif self.algorithm == 'svm':
            self.classifier = OptSklearnClassifier(LinearSVC(), sparse=False)
        elif self.algorithm == 'tree':
            self.classifier = OptSklearnClassifier(DecisionTreeClassifier(), sparse=False) #optimized version of the CART algorithm
            #dot_data = StringIO.StringIO() 
            #tree.export_graphviz(self.classifier._clf, dot_data, feature_names=self.classifier._feature_index.keys())        
            #graph = pydot.graph_from_dot_data(dot_data.getvalue()) 
            #graph.write_pdf("test_export_graphvix.pdf")
        
        logger.info('Training classifier')
        self.classifier.train(labelsets, feature_index, labels, label_index, X)
Example #23
    group.add_argument('-turney', help='Classification using Turney algorithm', action='store_true')

    parser.add_argument('-f', help='Number of folds for supervised algorithms using k-fold cross validation. If this parameter is not provided then holdout validation is performed.', type=int)
    parser.add_argument('-fn', help='Fold number for supervised algorithms using k-fold cross validation', type=int)
    parser.add_argument('-s', help='Corpus size', type=int)
    parser.add_argument('-od', help='Out of domain testing', action='store_true')
    parser.add_argument('-u', help='Use top training unigrams as feature extractor', action='store_true')
    parser.add_argument('-wf', help='Use top training unigrams frequency as feature extractor', action='store_true')
    parser.add_argument('-docbi', help='Use document bigrams as feature extractor', action='store_true')
    parser.add_argument('-bi', help='Use top training bigrams as feature extractor', action='store_true')
    parser.add_argument('-sw', help='Remove stop words', action='store_true')
    parser.add_argument('-wl', help='Filter words by minimum length', type=int)
    parser.add_argument('-dc', help='Remove duplicated characters', action='store_true')
    parser.add_argument('-neg', help='Preprocess negations', action='store_true')
    parser.add_argument('-stem', help='Use stemmed words', action='store_true')
    parser.add_argument('-lc', help='Transform chars to lower case', action='store_true')
    parser.add_argument('-punct', help='Remove punctuation marks', action='store_true')
    parser.add_argument('-acc', help='Remove spanish accents', action='store_true')
    parser.add_argument('-lemma', help='Use lemmatized words', action='store_true')
    parser.add_argument('-adj', help='Use just adjectives', action='store_true')
    parser.add_argument('-allprepro', help='Apply all preprocessors', action='store_true')
    parser.add_argument('-pp', help='Proportion of positive comments for unbalanced experiences', type=Decimal, default=0.5)
    args = parser.parse_args()
    
    logger.info('Starting Sentiment Analysis Process. Params: ' + str(args))
    main(args.nb, args.weka, args.megam, args.svmlight, args.sklearn, args.turney,
        args.f, args.s, args.fn, args.sw, args.u, args.wf, args.docbi, args.bi, 
        args.wl, args.dc, args.neg, args.stem, args.lc, args.punct, args.acc,
        args.lemma, args.adj, args.allprepro, args.od, args.pp)
