Ejemplo n.º 1
0
    def run(self):
        """Train and evaluate one classifier per entry of ``self.allgrams``.

        With ``self.usedev`` falsy, each classifier is trained, ``self.stdout``
        is switched on and the classifier is evaluated once; nothing is
        stored.

        With ``self.usedev`` truthy, each trained classifier is evaluated for
        every weight in ``self.allweights`` combined with every
        (negative, positive) pair of thresholds drawn from
        ``self.allthresholds``.  Each five-value evaluation result is appended
        to ``self.results`` and, if ``self.csvout`` is set, ``flushToCSV`` is
        called once at the end.
        """
        if self.usedev:
            for grams in self.allgrams:
                classifier = NaiveBayesClassifier(self.rawfname, grams=grams)
                classifier.trainClassifier()

                for weight in self.allweights:
                    classifier.setWeight(weight)

                    # Full grid over (negative, positive) threshold pairs.
                    for neg_threshold in self.allthresholds:
                        for pos_threshold in self.allthresholds:
                            classifier.setThresholds(neg=neg_threshold,
                                                     pos=pos_threshold)
                            (cinfo, accpos, accneg,
                             accall, corrall) = self.evaluate(classifier)
                            self.results.append(
                                [cinfo, accpos, accneg, accall, corrall])

            if self.csvout:
                self.flushToCSV()
            return

        # Plain mode: one evaluation per classifier, results are not kept.
        for grams in self.allgrams:
            classifier = NaiveBayesClassifier(self.rawfname, grams=grams)
            classifier.trainClassifier()
            self.stdout = True
            self.evaluate(classifier)
Ejemplo n.º 2
0
def assignment_e_naivebayes_2():
    """Replicate Example 13.1 (pages 241 and 242 in the textbook).

    Builds a "china" corpus of three documents and a "not china" corpus of
    one document, trains a naive Bayes classifier on their "body" fields and
    checks the ranking and approximate probabilities reported for the buffer
    "Chinese Chinese Chinese Tokyo Japan".
    """
    # These are really language-specific functions, so using them for a
    # language identifier is a huge simplification.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    matches = []

    # Callback for receiving results. Received scores are log-probabilities.
    def match_collector(match: dict):
        matches.append(match)
        print("*** WINNER", match["score"], match["category"])

    china_bodies = (
        "Chinese Beijing Chinese",
        "Chinese Chinese Shanghai",
        "Chinese Macao",
    )
    china = InMemoryCorpus()
    for document_id, body in enumerate(china_bodies):
        china.add_document(InMemoryDocument(document_id, {"body": body}))

    not_china = InMemoryCorpus()
    not_china.add_document(
        InMemoryDocument(0, {"body": "Tokyo Japan Chinese"}))

    classifier = NaiveBayesClassifier(
        {"china": china, "not china": not_china},
        ["body"], normalizer, tokenizer)

    buffer = "Chinese Chinese Chinese Tokyo Japan"
    print(buffer)
    matches.clear()
    classifier.classify(buffer, match_collector)

    assert len(matches) == 2
    assert matches[0]["category"] == "china"
    assert matches[1]["category"] == "not china"
    # (expected probability, absolute tolerance) per ranked match.
    expectations = ((0.0003, 0.00001), (0.0001, 0.00005))
    for match, (probability, tolerance) in zip(matches, expectations):
        assert math.isclose(math.exp(match["score"]), probability,
                            abs_tol=tolerance)
    def run(self):
        """Train a classifier for the first entry of ``self.allgrams`` and
        return the result of ``self.evaluate`` for it.

        ``self.stdout`` is set to False before the evaluation.

        NOTE(review): the ``return`` in the first loop is unconditional, so
        that loop exits during its first iteration.  The weight/threshold
        sweep further down can only run if the first iteration over
        ``self.allgrams`` yields nothing.  This looks unintended (a guard
        such as a dev-mode flag may have been removed) -- TODO confirm the
        intended control flow before relying on ``self.results``.
        """
        
        for grams in self.allgrams:
            c = NaiveBayesClassifier(self.rawfname,
                                     grams=grams)
            c.trainClassifier()
            self.stdout = False

            # Leaves the method on the first iteration.
            return self.evaluate(c)
        
            
        # Only reached when the loop above did not execute its body at all.
        for grams in self.allgrams:
            c = NaiveBayesClassifier(self.rawfname,
                                     grams=grams)
            c.trainClassifier()
            
            for w in self.allweights:
                c.setWeight(w)                                
        
                # Every (negative, positive) pair from self.allthresholds.
                for t1 in self.allthresholds:
                    for t2 in self.allthresholds:
                        c.setThresholds(neg=t1, pos=t2)
                        cinfo, accpos, accneg, accall, corrall = self.evaluate(c)
                        self.results.append([cinfo, accpos, accneg, accall, corrall])
Ejemplo n.º 4
0
def main():
    """Train a naive Bayes language identifier and start an interactive loop.

    One corpus per language code is loaded from ``<data_path>/<code>.txt``,
    a classifier is built over the "body" field, and ``simple_repl`` is
    started with an evaluator that returns every match reported by the
    classifier for the entered text.
    """
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from naivebayesclassifier import NaiveBayesClassifier

    print("Initializing naive Bayes classifier from news corpora...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    languages = ["en", "no", "da", "de"]

    training_set = {}
    for language in languages:
        corpus_file = os.path.join(data_path, f"{language}.txt")
        training_set[language] = InMemoryCorpus(corpus_file)

    classifier = NaiveBayesClassifier(training_set, ["body"],
                                      normalizer, tokenizer)
    print(f"Enter some text and classify it into {languages}.")
    print("Returned scores are log-probabilities.")

    def evaluator(text):
        # Collect every match handed to the callback, in callback order.
        matches = []
        classifier.classify(text, matches.append)
        return matches

    simple_repl("text", evaluator)
Ejemplo n.º 5
0
 def test_language_detection_trained_on_some_news_corpora(self):
     """Train on the en/no/da/de corpora and verify the top category
     reported for one sample sentence per language."""
     import os.path
     from corpus import InMemoryCorpus
     from naivebayesclassifier import NaiveBayesClassifier

     training_set = {}
     for language in ["en", "no", "da", "de"]:
         corpus_file = os.path.join(data_path, f"{language}.txt")
         training_set[language] = InMemoryCorpus(corpus_file)

     classifier = NaiveBayesClassifier(training_set, ["body"],
                                       self._normalizer, self._tokenizer)

     # (text to classify, expected top category)
     samples = [
         ("Vil det riktige språket identifiseres? Dette er bokmål.", "no"),
         ("I don't believe that the number of tokens exceeds a billion.",
          "en"),
         ("De danske drenge drikker snaps!", "da"),
         ("Der Kriminalpolizei! Haben sie angst?", "de"),
     ]
     for text, expected in samples:
         self._classify_buffer_and_verify_top_categories(
             text, classifier, [expected])
Ejemplo n.º 6
0
class NaiveBayesClassifierTest(unittest.TestCase):
    """Measures the error rate of a saved classifier on a directory tree of
    labelled test documents."""

    def test_predict(self):
        """Classify every file under ``naive_test_data/`` and report errors.

        The expected category of a file is the last ``/``-separated component
        of the directory that contains it.  Files or directories whose name
        starts with ``.`` are skipped.  Each misclassification is printed
        together with the running error percentage, followed by a summary
        line.

        Returns:
            The fraction of misclassified files (``errors / total``).  The
            return value is kept for existing callers; unittest itself
            ignores it.

        Raises:
            AssertionError: if no test file was found, instead of the
                ZeroDivisionError the summary would otherwise trigger.
        """
        # Text mode with an explicit encoding decodes while reading, and the
        # context manager closes the handle deterministically.
        with open("stopwords.dic", 'r', encoding='utf-8') as stopwords_file:
            STOP_WORDS = set(line.strip() for line in stopwords_file)

        def tokenize(text):
            # Best effort: a tokenizer failure is printed and yields no
            # tokens rather than aborting the whole run.
            try:
                seg_list = jieba.cut(text, cut_all=False)
                return {x.strip() for x in seg_list if x not in STOP_WORDS}
            except Exception as e:
                print(e)
                return []

        classifier = NaiveBayesClassifier(tokenizer=tokenize)
        # classifier.fit(u'naive_train_data')
        # classifier.dump('naive_classifier.dat')
        classifier.load('naive_classifier.dat')
        classifier.reduce(400)
        start = time()
        total = 0
        errors = 0
        for root, dirs, files in os.walk(u'naive_test_data/', topdown=True):
            for name in files:
                if root.startswith('.') or name.startswith('.'):
                    continue
                category = root.split('/')[-1]
                with open(os.path.join(root, name), 'r',
                          encoding='utf-8') as document:
                    text = document.read()
                predict = classifier.predict(text)
                total += 1
                if category != predict:
                    errors += 1
                    print('predict: %s, actual: %s, errors percentage: %0.2f' % (
                        predict, category, 100 * errors / total))

        # Without this guard an empty test directory surfaces as a bare
        # ZeroDivisionError in the summary below.
        self.assertGreater(
            total, 0, 'no test documents found under naive_test_data/')
        print('testing completed, total: %d, errors: %d, error rate:%0.2f, costs: %0.2f' % (
            total, errors, 100 * errors / total, time() - start))
        return errors / total
Ejemplo n.º 7
0
def assignment_e_naivebayes_1():
    """Train a naive Bayes language identifier and check four samples.

    One corpus per language code is loaded from ``data/<code>.txt``.  For
    each sample text the first match reported by the classifier must carry
    the expected language as its category.
    """
    # These are really language-specific functions, so using them for a
    # language identifier is a huge simplification.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    matches = []

    # Callback for receiving results. Received scores are log-probabilities.
    def match_collector(match: dict):
        matches.append(match)
        print("*** WINNER", match["score"], match["category"])

    # Use this as the training set for our language identifier.
    print("LOADING...")
    training_set = {}
    for code in ["en", "no", "da", "de"]:
        training_set[code] = InMemoryCorpus(f"data/{code}.txt")

    # Assess probabilities from the training set.
    print("TRAINING...")
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer,
                                      tokenizer)

    # Classify some previously unseen text fragments.
    print("CLASSIFYING...")
    samples = (
        ("Mon tro om det riktige språket identifiseres? Dette er norsk bokmål, forøvrig.",
         "no"),
        ("I don't believe that the number of tokens exceeds a billion.", "en"),
        ("De danske drenge drikker snaps!", "da"),
        ("Der Kriminalpolizei! Haben sie angst?", "de"),
    )
    for buffer, expected in samples:
        print(buffer)
        # Start from an empty list so index 0 is this buffer's first match.
        matches.clear()
        classifier.classify(buffer, match_collector)
        assert matches[0]["category"] == expected
Ejemplo n.º 8
0
 def test_china_example_from_textbook(self):
     """Train on a three-document "china" corpus and a one-document
     "not china" corpus, then verify the order, categories and approximate
     probabilities of the two matches for a five-token buffer."""
     import math
     from corpus import InMemoryDocument, InMemoryCorpus
     from naivebayesclassifier import NaiveBayesClassifier

     china = InMemoryCorpus()
     china_bodies = [
         "Chinese Beijing Chinese",
         "Chinese Chinese Shanghai",
         "Chinese Macao",
     ]
     for document_id, body in enumerate(china_bodies):
         china.add_document(InMemoryDocument(document_id, {"body": body}))

     not_china = InMemoryCorpus()
     not_china.add_document(
         InMemoryDocument(0, {"body": "Tokyo Japan Chinese"}))

     classifier = NaiveBayesClassifier(
         {"china": china, "not china": not_china},
         ["body"], self._normalizer, self._tokenizer)

     matches = []
     classifier.classify("Chinese Chinese Chinese Tokyo Japan",
                         matches.append)

     self.assertEqual(len(matches), 2)
     # Scores are exponentiated before comparison with the probabilities.
     expected = [("china", 0.0003), ("not china", 0.0001)]
     for match, (category, probability) in zip(matches, expected):
         self.assertEqual(match["category"], category)
         self.assertAlmostEqual(math.exp(match["score"]), probability,
                                places=4)
Ejemplo n.º 9
0
from maxentclassifier import MaximumEntropyClassifier
from naivebayesclassifier import NaiveBayesClassifier
import random
import csv

# File name handed to both classifiers below.
fname = 'training.csv'


# Naive Bayes classifier built with grams=[1, 2].  Thresholds and weight are
# set before trainClassifier() is called.
nb = NaiveBayesClassifier(fname, grams=[1, 2])
nb.setThresholds(neg=1.0, pos=20.0)
nb.setWeight(0.000000000005)
nb.trainClassifier()
# Maximum entropy classifier built from the same file with default settings.
ment = MaximumEntropyClassifier(fname)
ment.trainClassifier()
# Both trained classifiers, naive Bayes first.
classifiers = [nb, ment]

def csvdata_to_list(data):
    """Return a new list holding every row produced by iterating *data*.

    *data* may be any iterable (for example a ``csv.reader``); it is consumed
    once, and the rows themselves are not copied.
    """
    return list(data)

def search(text,data):
    # Collects rows of `data` whose first field contains `text`.
    # The first field is lowercased before the substring test but `text` is
    # not, so a `text` containing uppercase characters cannot match.
    # NOTE(review): the visible code never increments `i` and has no return
    # statement; the function presumably continues beyond this excerpt --
    # TODO confirm against the full source.
    output = []
    i=0
    for d in data:

        if d[0].lower().find(text) != -1:

            # Start a new result entry and store the matching first field.
            output.append([])
            output[i].append(d[0])
Ejemplo n.º 10
0
    processed = re.sub(r'—', r"-", line)
    processed = re.sub(r'([^\w\s\'])', r' \1 ', line)
    processed = processed.lower()

    return (processed.split())


#End def

# Command line: two positional file names, training set then test set.
parser = argparse.ArgumentParser()
parser.add_argument('train', help='The filename that points to training set.')
parser.add_argument('test', help='The filename that points to test set.')
args = parser.parse_args()

# Train our classifier
nbc = NaiveBayesClassifier(featurizer, classer, (AGREE_CLASS, DISAGREE_CLASS))
# newline='' leaves line-ending handling to the csv module, as its
# documentation requires, so quoted fields with embedded newlines parse
# correctly.
with open(args.train, 'r', encoding='UTF-8', newline='') as csv_train:
    train_reader = csv.reader(csv_train, delimiter=',')
    # Discard the first row (presumably a header); the default keeps an
    # empty file from raising StopIteration.
    next(train_reader, None)

    for row in train_reader:
        rating = float(row[1])
        # Rows whose rating lies in [-1, 1) are not used for training.
        if -1 <= rating < 1:
            continue
        nbc.add_sample(row)
#End with
nbc.smooth()

# Counters that start out empty here.
false_counts = Counter()
true_counts = Counter()
real_counts = Counter()
Ejemplo n.º 11
0
def assignment_e():
    """Exercise the naive Bayes classifier in two scenarios.

    First a language identifier is trained on ``data/<code>.txt`` corpora for
    en/no/da/de and four sample texts must be ranked with the expected
    language first.  Then, for demonstration purposes, Example 13.1 on pages
    241 and 242 in the textbook is replicated and the reported ranking and
    approximate probabilities are checked.
    """
    # These are really language-specific functions, so using them for a
    # language identifier is a huge simplification.
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    matches = []

    # Callback for receiving results. Received scores are log-probabilities.
    def match_collector(match: dict):
        matches.append(match)
        print("*** WINNER", match["score"], match["category"])

    def run_classifier(model, buffer):
        # Echo the buffer, drop earlier matches, then collect the new ones.
        print(buffer)
        matches.clear()
        model.classify(buffer, match_collector)

    # Use this as the training set for our language identifier.
    print("LOADING...")
    training_set = {}
    for code in ["en", "no", "da", "de"]:
        training_set[code] = InMemoryCorpus(f"data/{code}.txt")

    # Assess probabilities from the training set.
    print("TRAINING...")
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer,
                                      tokenizer)

    # Classify some previously unseen text fragments.
    print("CLASSIFYING...")
    samples = (
        ("Mon tro om det riktige språket identifiseres? Dette er norsk bokmål, forøvrig.",
         "no"),
        ("I don't believe that the number of tokens exceeds a billion.", "en"),
        ("De danske drenge drikker snaps!", "da"),
        ("Der Kriminalpolizei! Haben sie angst?", "de"),
    )
    for text, expected in samples:
        run_classifier(classifier, text)
        assert matches[0]["category"] == expected

    # Textbook example: three "china" documents, one "not china" document.
    china = InMemoryCorpus()
    china_bodies = (
        "Chinese Beijing Chinese",
        "Chinese Chinese Shanghai",
        "Chinese Macao",
    )
    for document_id, body in enumerate(china_bodies):
        china.add_document(InMemoryDocument(document_id, {"body": body}))

    not_china = InMemoryCorpus()
    not_china.add_document(
        InMemoryDocument(0, {"body": "Tokyo Japan Chinese"}))

    training_set = {"china": china, "not china": not_china}
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer,
                                      tokenizer)
    run_classifier(classifier, "Chinese Chinese Chinese Tokyo Japan")

    assert len(matches) == 2
    assert matches[0]["category"] == "china"
    assert matches[1]["category"] == "not china"
    # (expected probability, absolute tolerance) per ranked match.
    expectations = ((0.0003, 0.00001), (0.0001, 0.00005))
    for match, (probability, tolerance) in zip(matches, expectations):
        assert math.isclose(math.exp(match["score"]), probability,
                            abs_tol=tolerance)