Example #1
0
 def get_corpus(self):
     """
     Read and populate the corpus, returning its list view.

     On the first call the corpus is loaded from ``<pathname>.csv`` stored
     inside ``<pathname>.csv.zip`` under the corpus directory; an IOError
     while reading (e.g. missing archive) leaves the corpus empty. The list
     view and the classifier are rebuilt whenever the corpus timestamp is
     newer than the view's timestamp.

     :returns: list copy of the corpus values
     """
     if self.corpus is None:
         self.corpus = Dictionary()
         self.corpus.updated = time.time()
         try:
             directory = Paths.get_root_dir(*CORPUS_PATH)
             infile = '{name}.csv'.format(name=self.pathname)
             path = join(directory,
                         '{name}.csv.zip'.format(name=self.pathname))
             # Let ZipFile open the path itself: a file object passed in by
             # the caller is never closed by ZipFile, which leaked the handle.
             with ZipFile(path, 'r') as archive:
                 stream = StringIO(archive.read(infile))
                 for row in csv.reader(stream):
                     # 0: category, 1: doc, 2: hash
                     self.corpus[row[2]] = (row[0], row[1])
         except IOError:
             # Best effort: without a readable archive the corpus stays empty.
             pass
     if self.corpus_shadow is None or self.corpus_shadow.updated < self.corpus.updated:
         self.corpus_shadow = List(self.corpus.values())
         self.corpus_shadow.updated = time.time()
         self.classifier = MaxEntClassifier(
             self.corpus_shadow,
             feature_extractor=extractor_base)  # or NaiveBayesClassifier
     return self.corpus_shadow
Example #2
0
class TestMaxEntClassifier(unittest.TestCase):
    """Checks for MaxEntClassifier built from the module-level ``train_set``."""

    def setUp(self):
        # Every test gets its own freshly constructed classifier.
        classifier = MaxEntClassifier(train_set)
        self.classifier = classifier

    def test_classify(self):
        # The sample sentence must be labelled 'positive' ...
        label = self.classifier.classify("I feel happy this morning")
        assert_equal(label, 'positive')
        # ... and the classifier must hold as many samples as it was given.
        stored = len(self.classifier.train_set)
        assert_equal(stored, len(train_set))

    def test_prob_classify(self):
        # The most likely label is 'positive' and it outweighs 'negative'.
        distribution = self.classifier.prob_classify("I feel happy this morning")
        assert_equal(distribution.max(), 'positive')
        positive = distribution.prob("positive")
        negative = distribution.prob("negative")
        assert_true(positive > negative)
Example #3
0
class TestMaxEntClassifier(unittest.TestCase):
    """Checks for MaxEntClassifier built from the module-level ``train_set``."""

    def setUp(self):
        # Every test gets its own freshly constructed classifier.
        classifier = MaxEntClassifier(train_set)
        self.classifier = classifier

    def test_classify(self):
        # The sample sentence must be labelled 'positive' ...
        label = self.classifier.classify("I feel happy this morning")
        assert_equal(label, 'positive')
        # ... and the classifier must hold as many samples as it was given.
        stored = len(self.classifier.train_set)
        assert_equal(stored, len(train_set))

    def test_prob_classify(self):
        # The most likely label is 'positive' and it outweighs 'negative'.
        distribution = self.classifier.prob_classify("I feel happy this morning")
        assert_equal(distribution.max(), 'positive')
        positive = distribution.prob("positive")
        negative = distribution.prob("negative")
        assert_true(positive > negative)
Example #4
0
class _Classifier:
    """Thin wrapper that forwards to a single underlying text classifier."""

    def __init__(self, data):
        """Create the wrapped MaxEntClassifier from ``data``."""
        # Alternative backend: NaiveBayesClassifier(data)
        backend = MaxEntClassifier(data)
        self._classifier = backend

    def update(self, data):
        """
        Feed additional samples to the wrapped classifier.

        data needs to be an iterable (list or tuple) of iterables;
        each inner iterable needs to be in the format ('str', 'label')
        """
        self._classifier.update(data)

    def probability(self, text):
        """Return the wrapped classifier's ``prob_classify`` result for ``text``."""
        distribution = self._classifier.prob_classify(text)
        return distribution

    def labels(self):
        """Return whatever the wrapped classifier's ``labels()`` reports."""
        known = self._classifier.labels()
        return known
class _Classifier:
    """Thin wrapper that forwards to a single underlying text classifier."""

    def __init__(self, data):
        """Create the wrapped MaxEntClassifier from ``data``."""
        # Alternative backend: NaiveBayesClassifier(data)
        backend = MaxEntClassifier(data)
        self._classifier = backend

    def update(self, data):
        """
        Feed additional samples to the wrapped classifier.

        data needs to be an iterable (list or tuple) of iterables;
        each inner iterable needs to be in the format ('str', 'label')
        """
        self._classifier.update(data)

    def probability(self, text):
        """Return the wrapped classifier's ``prob_classify`` result for ``text``."""
        distribution = self._classifier.prob_classify(text)
        return distribution

    def labels(self):
        """Return whatever the wrapped classifier's ``labels()`` reports."""
        known = self._classifier.labels()
        return known
Example #6
0
def add_global_hook():
    """
    Build a hook that publishes trained classifiers and a test set.

    Both classifiers are trained once, when this function is called. The
    returned callable stores the shared bundle on ``web.ctx.globals`` and
    then returns the result of calling the handler it was given.

    :returns: callable taking a zero-argument ``handler``
    """
    bank = TweetBank(50)
    training_data, held_out = bank.data_set()

    # Keyed by display name; Naive Bayes is trained first, then Maxent.
    classifiers = {
        "Naive Bayes": NaiveBayesClassifier(training_data),
        "Maxent": MaxEntClassifier(training_data),
    }
    shared = web.storage({
        "classifier_dictionary": classifiers,
        "test_set": held_out,
    })

    def _wrapper(handler):
        # Expose the shared bundle before the handler runs.
        web.ctx.globals = shared
        return handler()

    return _wrapper
Example #7
0
 def __init__(self, data):
     """Create the wrapped MaxEntClassifier from ``data``."""
     # Alternative backend: NaiveBayesClassifier(data)
     backend = MaxEntClassifier(data)
     self._classifier = backend
Example #8
0
 def setUp(self):
     # Every test gets its own classifier built from ``train_set``.
     classifier = MaxEntClassifier(train_set)
     self.classifier = classifier
Example #9
0
from textblob.classifiers import MaxEntClassifier



# Build a maximum-entropy classifier from the CSV training file.
with open('data/train-toy.csv', 'r') as fp:
    cl = MaxEntClassifier(fp, format="csv")

# Report accuracy on the CSV test file. The call form of print with a single
# argument produces the same output on Python 2 and Python 3, whereas the
# former print statement was a SyntaxError on Python 3.
with open('data/test-toy.csv', 'r') as gp:
    print(cl.accuracy(gp, format="csv"))

Example #10
0
print(len(words), len(tags))

# Split the first 1000 (word, tag) pairs: indices 0-799 go to the training
# set, indices 800-999 to the test set.
for i in range(1000):
    temp = (words[i], tags[i])
    if i < 800:
        train.append(temp)
    else:
        test.append(temp)
print(train)
print(test)

# Train one model of each kind on the same training data.
naive = NaiveBayesClassifier(train)
dtc = DecisionTreeClassifier(train)
mec = MaxEntClassifier(train)

print("NaiveBayesClassifier Accuracy: {0}".format(naive.accuracy(test)))
print("DecisionTreeClassifier Accuracy: {0}".format(dtc.accuracy(test)))
print("MaxEntClassifier Accuracy: {0}".format(mec.accuracy(test)))

# Reuse the Naive Bayes model built above rather than training an identical
# second one from the same data.
cl = naive
print("NaiveBayesClassifier Accuracy: {0}".format(cl.accuracy(test)))
for i in range(len(test)):
    # Classify only the text of the sample. Passing the whole (word, tag)
    # pair, as before, handed the gold label to the feature extractor as if
    # it were part of the document.
    tag = cl.classify(test[i][0])
    pred_tags.append(tag)
    if tag == test_tags[i]:
        count += 1
print(len(pred_tags), len(test_tags))
print(count)
Example #11
0
def search_department(job, train):
    """
    Train a MaxEntClassifier on ``train`` and return the top label for ``job``.

    The most probable label is also printed before being returned. A new
    classifier is built on every call.

    :param job: text to classify
    :param train: training data handed to MaxEntClassifier
    :returns: the label with the highest probability
    """
    classifier = MaxEntClassifier(train)
    best_label = classifier.prob_classify(job).max()
    print(best_label)
    return best_label
Example #12
0
#     trains.append(train[i])

# ``trains`` is an alias of the full ``train`` list.
trains = train

# Train and score the single classifier selected by ``choice``.
if choice == "1":
    print("\n#NaiveBayesClassifier")
    cl1 = NaiveBayesClassifier(trains)
    print("Classifier: Naive Bayes -- Accuracy: ", cl1.accuracy(test), "\n")
elif choice == "2":
    print("\n#DecisionTreeClassifier")
    cl2 = DecisionTreeClassifier(trains)
    print("Classifier: Decision Tree -- Accuracy: ", cl2.accuracy(test), "\n")
elif choice == "3":
    print("\n#MaxEntClassifier")
    cl3 = MaxEntClassifier(trains)
    print("Classifier: Maximum Entropy -- Accuracy: ", cl3.accuracy(test),
          "\n")
elif choice == "4":
    print("\n#NLTKClassifier")
    cl4 = NLTKClassifier(trains)
    print("Classifier: NLTK -- Accuracy: ", cl4.accuracy(test), "\n")
else:
    print("Bad input!")

# Word/count pairs ordered by ascending count, so the most repeated words
# (most important properties) end up last.
totalDictPosSorted = sorted(totalDictPos.items(), key=lambda pair: pair[1])
totalDictNegSorted = sorted(totalDictNeg.items(), key=lambda pair: pair[1])
Example #13
0
class FileCorpus(BaseCorpus):
    """
    Corpus stored in a compressed (zipped) CSV text file.

    The original author estimates that a corpus of 2**15 (32768) documents
    should remain usable with this class; beyond that, a dedicated CFFI
    module would have to be developed.

    In memory the corpus maps a document signature to a 2-tuple; ``train``
    and ``retrain`` store that tuple as ``(document, category)``. On disk
    each CSV row holds the two tuple items followed by the signature.
    """

    # Attributes
    corpus = None  # Dictionary (a dict that can carry extra attributes), keyed by signature
    corpus_shadow = None  # List copy of the values (the NLTK classifiers work on lists)
    classifier = None  # NLTK classifier, (re)built in get_corpus

    @staticmethod
    def _restore_signature(raw):
        """Turn a signature read from CSV back into the int ``train`` produced."""
        try:
            return int(raw)
        except ValueError:
            # Not numeric: keep the stored text as the key, as before.
            return raw

    # Getter
    def get_corpus(self):
        """
        Read and populate the corpus, returning its list view.

        On the first call the corpus is loaded from ``<pathname>.csv`` stored
        inside ``<pathname>.csv.zip`` under the corpus directory; an IOError
        while reading (e.g. missing archive) leaves the corpus empty. The
        list view and the classifier are rebuilt whenever the corpus
        timestamp is newer than the view's timestamp.

        :returns: list copy of the corpus values
        """
        if self.corpus is None:
            self.corpus = Dictionary()
            self.corpus.updated = time.time()
            try:
                directory = Paths.get_root_dir(*CORPUS_PATH)
                infile = '{name}.csv'.format(name=self.pathname)
                path = join(directory,
                            '{name}.csv.zip'.format(name=self.pathname))
                # Let ZipFile open the path itself: a file object passed in
                # by the caller is never closed by ZipFile (handle leak).
                with ZipFile(path, 'r') as archive:
                    stream = StringIO(archive.read(infile))
                    for row in csv.reader(stream):
                        if len(row) < 2:
                            continue  # blank or truncated line
                        if len(row) > 2:
                            signature = self._restore_signature(row[2])
                        else:
                            # Rows written by the earlier save() carried no
                            # signature column; recompute it the way train()
                            # does, from the first item.
                            signature = hash(row[0])
                        # NOTE(review): an older comment here described the
                        # columns as "category, doc, hash", while train() and
                        # retrain() keep (document, category) - confirm the
                        # column order of any pre-existing corpus files.
                        self.corpus[signature] = (row[0], row[1])
            except IOError:
                # Best effort: without a readable archive the corpus stays empty.
                pass
        if self.corpus_shadow is None or self.corpus_shadow.updated < self.corpus.updated:
            self.corpus_shadow = List(self.corpus.values())
            self.corpus_shadow.updated = time.time()
            self.classifier = MaxEntClassifier(
                self.corpus_shadow,
                feature_extractor=extractor_base)  # or NaiveBayesClassifier
        return self.corpus_shadow

    def classify(self, document):
        """
        Return the most probable category for a document.

        :rtype: str
        """
        self.get_corpus()
        return self.classifier.classify(document)

    def classify_prob(self, document):
        """
        Return the per-category probabilities for a document.

        :rtype: nltk.probability.DictionaryProbDist
        """
        self.get_corpus()
        return self.classifier.prob_classify(document)

    # Actions
    def save(self):
        """
        Write the corpus to disk.

        :rtype: bool
        :returns: True if the corpus was saved, False on IOError
        """
        # Make sure the corpus is loaded. Writing from self.corpus (rather
        # than the list view) guarantees entries added by train() since the
        # last get_corpus() are included, and gives access to the signatures.
        self.get_corpus()
        directory = Paths.get_root_dir(*CORPUS_PATH)
        infile = '{name}.csv'.format(name=self.pathname)
        path = join(directory, '{name}.csv.zip'.format(name=self.pathname))
        # Write the CSV into the zip archive
        try:
            with ZipFile(path, 'w', ZIP_DEFLATED) as archive:
                stream = StringIO()
                writer = csv.writer(stream, delimiter=",", encoding='utf-8')
                for signature, entry in self.corpus.items():
                    # Third column is the signature get_corpus() reads back.
                    writer.writerow((entry[0], entry[1], signature))
                archive.writestr(infile, stream.getvalue())
            return True
        except IOError:
            return False

    def train(self, document, category):
        """
        File a document under a category.

        The document is normalised with ``format_base`` and offered to the
        ``analyzer_default_format`` listeners before being stored.

        :returns: signature of the document (``hash`` of the final text)
        :rtype: long
        """
        self.get_corpus()
        document = format_base(document)
        document_shadow = [document]
        analyzer_default_format.send(
            FileCorpus, document_shadow, category
        )  # A list is passed so that listeners can modify it in place
        document = "".join(document_shadow)
        signature = hash(document)
        self.corpus[signature] = (document, category)
        self.corpus.updated = time.time()
        return signature

    def retrain(self, signature, category):
        """
        Change the category of an already classified document.

        A signature that is not in the corpus makes the lookup fail
        (presumably with KeyError, if Dictionary behaves like dict).

        :param signature: hash of the document to reclassify
        :param category: new category of the document
        :returns: True if the category changed, False if it was already set
        """
        self.get_corpus()
        document, current = self.corpus[signature]
        if current == category:
            return False
        self.corpus[signature] = (document, category)
        self.corpus.updated = time.time()
        return True

    def untrain(self, signature):
        """
        Remove a document from the corpus.

        :param signature: hash of the document
        :returns: True if a document was removed, False if it was unknown
        """
        self.get_corpus()
        extracted = self.corpus.pop(signature, None)
        if extracted is None:
            # Nothing changed: leave the timestamp alone so the classifier
            # is not rebuilt for no reason.
            return False
        self.corpus.updated = time.time()
        return True

    # Overrides
    def __init__(self, pathname, *args, **kwargs):
        """
        Initialise the corpus with its file name.

        :param pathname: file name of the corpus, without directory or extension
        """
        # NOTE(review): extra args/kwargs are accepted but unused, and the
        # base initialiser is not invoked - confirm BaseCorpus needs neither.
        self.pathname = pathname
Example #14
0
from textblob.classifiers import MaxEntClassifier

# Build a maximum-entropy classifier from the CSV training file.
with open('data/train-toy.csv', 'r') as fp:
    cl = MaxEntClassifier(fp, format="csv")

# Report accuracy on the CSV test file. The call form of print with a single
# argument produces the same output on Python 2 and Python 3, whereas the
# former print statement was a SyntaxError on Python 3.
with open('data/test-toy.csv', 'r') as gp:
    print(cl.accuracy(gp, format="csv"))
# Decision tree: one run per pre-processing level. Each run prints its
# heading, builds the model, then hands it to classify_review.
for _heading, _dataset in (
        ('Before pre-processing \n', training_array),
        ('\n After removing stop-words \n', training_array_without_sw),
        ('\n After stemming \n', training_array_stemmed_without_sw)):
    print(_heading)
    cl = DecisionTreeClassifier(_dataset)
    classify_review(cl)

# Naive Bayes: same three runs under their own banner.
print('\n ************ NaiveBayesClassifier ********************\n')
for _heading, _dataset in (
        ('Before pre-processing\n', training_array),
        ('\n After removing stop-words \n', training_array_without_sw),
        ('\n After stemming \n', training_array_stemmed_without_sw)):
    print(_heading)
    cl = NaiveBayesClassifier(_dataset)
    classify_review(cl)

# Maximum entropy: unlike the sections above, the first model is built
# before its heading is printed; that order is kept as is.
print('\n ************ MaxEntClassifier ********************\n')
cl = MaxEntClassifier(training_array)
print('Before pre-processing\n')
classify_review(cl)
for _heading, _dataset in (
        ('\n After removing stop-words \n', training_array_without_sw),
        ('\n After stemming \n', training_array_stemmed_without_sw)):
    print(_heading)
    cl = MaxEntClassifier(_dataset)
    classify_review(cl)
 def __init__(self, data):
     """Create the wrapped MaxEntClassifier from ``data``."""
     # Alternative backend: NaiveBayesClassifier(data)
     backend = MaxEntClassifier(data)
     self._classifier = backend