Example #1
    def __call__(self, text):
        context = self.context
        request = self.request
        response = request.response
        catalog = context.portal_catalog

        bayesFilter = api.portal.get_registry_record(
            'i8d.content.browser.coverSetting.ICoverSetting.bayesFilter')

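        # Each line of the bayesFilter registry record becomes one training
        # sample for the single 'hasKey' category.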
        trainingSet = []
        for line in bayesFilter.split('\n'):
            trainingSet.append({
                'category': 'hasKey',
                'text': safe_unicode(line)
            })

        trainer = Trainer(tokenizer)
        for record in trainingSet:
            trainer.train(record['text'], record['category'])
        classifier = Classifier(trainer.data, tokenizer)

        result = classifier.classify(safe_unicode(text))

        return result
Example #2
    def getKeywords(self, html):

        text = self.getHtml2Text(html)
#        print text
        text = self.zhsJieba(text)

        # fetch the registry record
        reg = api.portal.get_registry_record('mingjing.content.browser.mjnetSetting.IMJNetSetting.catDict')
        trainSet = []
        for item in reg:
            key = item.split('|||')[0]
            for line in reg[item].split('\n'):
                zhsString = self.zhsJieba(line)
                trainSet.append({'category': key, 'text': zhsString})

        # classify the article with a simple naive Bayes classifier
        newsTrainer = Trainer(tokenizer)
        for news in trainSet:
            newsTrainer.train(news['text'].encode('utf-8'), news['category'])
        newsClassifier = Classifier(newsTrainer.data, tokenizer)
        classification = newsClassifier.classify(text)
        print classification
#        import pdb; pdb.set_trace()
        if classification[0][1] == 0.0:
            classification.insert(0, (u'n99', 0.0))
        result = []
        for item in classification:
            result.append(item[0])
        return result
    def classifyNonClusteredJira(self):
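        """Assign each non-clustered Jira issue to one of the initial clusters.

        Rows whose label is already in constantsObj.INITIAL_CLUSTERS are used as
        training data; every remaining row is classified into one of those
        clusters and logged to nonClusteredJirasAfterClusteringFile.
        """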
        columnName = 'C'
        for index, row in self.df.iterrows():
            clusterName = row['Labels']
            keyWords = row['KeyWords']

            if (clusterName in constantsObj.INITIAL_CLUSTERS):
                self.issueSet.append(({
                    "class": row['Labels'],
                    "sentence": keyWords
                }))

        for issue in self.issueSet:
            self.jiraTrainer.train(issue['sentence'], issue['class'])

        jiraClassifier = Classifier(self.jiraTrainer.data, tokenizer)

        for index, row in self.df.iterrows():
            clusterName = row['Labels']
            keyWords = row['KeyWords']
            if (clusterName not in constantsObj.INITIAL_CLUSTERS):
                identifiedCluster = jiraClassifier.classify(
                    row['KeyWords'])[0][0]
                self.issueSet.append(({
                    "class": identifiedCluster,
                    "sentence": keyWords
                }))
                self.nonClusteredJirasAfterClusteringFile.write(
                    "%s --- %s\n" % (keyWords, identifiedCluster))
                '''writeIndex = columnName + str(index-2)
                self.activeWorkSheet[writeIndex] = identifiedCluster'''

        self.nonClusteredJirasAfterClusteringFile.close()
        return self.issueSet
    def classifyNewJiraToOneOfTheClusters(self, inputTrainingData, inputJira):
        for item in inputTrainingData:
            self.jiraTrainer.train(item['sentence'], item['class'])
        jiraClassifier = Classifier(self.jiraTrainer.data, tokenizer)
        clusterForInputJira = jiraClassifier.classify(inputJira)

        return clusterForInputJira
    def train(self):
        """Train on base and FB data"""

        with open('res/data/base_data.csv', 'r') as csv_file:

            reader = csv.reader(csv_file)
            i = 0
            for line in reader:

                i += 1

                line_split = line
                read_dict = {}

                if i == 1 or len(line_split) <= 2 or len(line_split[0]) == 0:
                    continue

                read_dict['class'] = line_split[2].strip()
                # Accounting for our inconsistency in Spreadsheet
                if read_dict["class"] == "Real":
                    read_dict['text'] = line_split[6].strip()
                else:
                    read_dict['text'] = line_split[5].strip()

                print(read_dict)

                self.training_data.append(read_dict)

        print('---->>>>>><<<<<<<-------')

        with open('res/data/fb_data.csv', 'r') as csv_file:

            reader = csv.reader(csv_file)
            i = 0
            for line in reader:

                i += 1

                line_split = line
                read_dict = {}

                if i == 1 or len(line_split) <= 2:
                    continue

                read_dict['class'] = line_split[2].strip()
                read_dict['text'] = line_split[5].strip()

                print(read_dict)

                self.training_data.append(read_dict)

        #print training_data
        for data in self.training_data:
            self.newsTrainer.train(data['text'], data['class'])

        self.newsClassifier = Classifier(
            self.newsTrainer.data,
            tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))
def classify(filename, size):
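    """Split the data chronologically, train a URL classifier on the training
    set, and print how many malicious URLs in the testing set were caught or
    missed."""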

    trainingSet, testingSet = make_chronological_sets.create_sets(
        filename, size)

    trainer = Trainer(tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))

    for sample in trainingSet:
        trainer.train(sample['url'], sample['result'])

    classifier = Classifier(
        trainer.data, tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))

    mal_mal = 0
    mal_clean = 0
    clean_clean = 0
    clean_mal = 0

    for sample in testingSet:

        predicted = classifier.classify(sample['url'])[0][0]
        actual = sample['result']

        if predicted == 'malicious' and actual == 'malicious':
            mal_mal += 1
        elif predicted == 'malicious' and actual == 'clean':
            mal_clean += 1
        elif predicted == 'clean' and actual == 'clean':
            clean_clean += 1
        elif predicted == 'clean' and actual == 'malicious':
            clean_mal += 1

    prop_caught = float(mal_mal) / float(mal_mal + clean_mal)
    prop_missed = float(clean_mal) / float(mal_mal + clean_mal)

    ## Stuff to get proportions:

    # size = float(size)

    # mal_mal = float(mal_mal)/size
    # mal_clean = float(mal_clean)/size
    # clean_mal = float(clean_mal)/size
    # clean_clean = float(clean_clean)/size

    ## Confusion matrix stuff:

    # confusionMatrix = [['Actually malicious', mal_mal, clean_mal], ['Actually clean', mal_clean, clean_clean]]

    # print tabulate(confusionMatrix, headers=['', 'Predicted malicious', 'Predicted clean'])

    print "Total: " + str(mal_mal + mal_clean + clean_mal + clean_clean)
    print "Malware: " + str(mal_mal + clean_mal)
    print "Clean: " + str(mal_clean + clean_clean)
    print "Caught: " + str(mal_mal) + " (" + "{:.1%}".format(prop_caught) + ")"
    print "Missed: " + str(clean_mal) + " (" + "{:.1%}".format(
        prop_missed) + ")"
Example #7
    def train(self):
        """Train on base and FB data"""

        # Run through each training example in data interface and
        # feed them into model
        for data_point in self.data_interface.arr:
            data_class = data_point[2].strip()  # Class is "Credibility"
            data_text = data_point[4].strip()  # Text is "Content URL"
            self.newsTrainer.train(data_text, data_class)

        self.newsClassifier = Classifier(self.newsTrainer.data, \
            tokenizer.Tokenizer(stop_words = [], signs_to_remove = ["?!#%&"]))
def classify(filename, size, url, result):

    trainingSet = make_training_set.create_set(filename, size)

    trainer = Trainer(tokenizer.Tokenizer(stop_words = [], signs_to_remove = [""]))

    for sample in trainingSet:
        trainer.train(sample['url'], sample['result'])

    classifier = Classifier(trainer.data, tokenizer.Tokenizer(stop_words = [], signs_to_remove = [""]))

    print "Expected: " + result
    print classifier.classify(url)
Example #9
    def __init__(self, journal=None):
        """Classifer initialization.

        Parameters:
            journal_file (str): Journal file string to import.
        """
        self._tknizer = tokenizer.Tokenizer(signs_to_remove=['?!%.'])
        self._trainer = Trainer(self._tknizer)
        if journal is not None:
            journal_data = train_journal(journal)

            for group in journal_data:
                # 0: Allocation account.
                # 1: List of transactions.
                # 2: Greatest common multiple of values in transactions.
                for transaction in group[1]:
                    # 0: Transaction payee string.
                    # 1: Allocation account.
                    self._trainer.train(transaction[0], transaction[1])

            self._classifier = BayesClassifier(
                self._trainer.data,
                self._tknizer
            )
        else:
            self._classifier = None
class DomainModel:

    training_data = []
    newsTrainer = Trainer(tokenizer.Tokenizer(stop_words = [], signs_to_remove = ["?!#%&"]))
    newsClassifier = None

    def __init__(self):
        self.train()

    # TODO: Train on FB data too
    def train(self):
        with open('src/URL.csv', 'r') as csv_file:
            reader = csv_file.readlines()
            for line in reader:
                read_dict = {}
                line_split = line.split(',')
                if len(line_split) < 2 or len(line_split[0]) == 0:
                    continue
                read_dict['text'] = line_split[0].strip()
                read_dict['class'] = line_split[1].strip()

                self.training_data.append(read_dict)

        #print training_data
        for data in self.training_data:
            self.newsTrainer.train(data['text'], data['class'])

        self.newsClassifier = Classifier(self.newsTrainer.data, tokenizer.Tokenizer(stop_words = [], signs_to_remove = ["?!#%&"]))

    def classify(self, unknownInstance):
        classification = self.newsClassifier.classify(unknownInstance)
        return classification
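# A minimal usage sketch for DomainModel (assumes 'src/URL.csv' exists and the
# naiveBayesClassifier imports are in scope; the example URL is made up):
#
#     model = DomainModel()   # __init__ trains on src/URL.csv
#     print(model.classify("http://example.com/free-prize"))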
Example #11
  def train_spam_texts():
    # Reading dataset file
    dataset_lang = "ru"
    dataset_file = codecs.open(os.path.abspath(os.curdir) + "/data/assets/spam_texts.json", "r", "utf_8_sig")
    dataset_data = json.load(dataset_file)

    # Preparing adverts spam dataset
    prepared_dataset = []
    for idx, item in enumerate(dataset_data[dataset_lang]["adverts"]):
      prepared_dataset.append({
        "text": item["text"],
        "category": "adverts"
      })
    
    # Training
    # (Will be replaced by another library soon)
    advertsTrainer = Trainer(tokenizer)
    for one_dataset_item in prepared_dataset:
      advertsTrainer.train(one_dataset_item["text"], one_dataset_item["category"])
    adverts_classifier = Classifier(advertsTrainer.data, tokenizer)

    # Usage
    # classification = adverts_classifier.classify("рассылка")
    # category_chance = classification[0][1]
    # print(category_chance)
def classificationNB(index):
    '''
    Train the naive Bayes classifier and classify the data.
    The naiveBayesClassifier package is used:
    https://github.com/muatik/naive-bayes-classifier
    '''
    # Initial training set from file
    trainset = []
    f = open('E:\\databases\\trainset.txt', 'r')
    for line in f:
        if len(line.strip()) == 0:
            continue
        line = line.strip().split()
        assert len(line) == 22
        trainset.append({
            'text': '%08d' % int(line[(index + 1) * 2]),
            'category': line[(index + 1) * 2 + 1]
        })
    pass  # for line in f
    f.close()

    # Train the classifier
    trainer = Trainer(tokenizer)
    for case in trainset:
        trainer.train(case['text'], case['category'])
    classifier = Classifier(trainer.data, tokenizer)

    # Classification for each of the rest sets
    for i in range(10):
        if index == i:
            continue
        print '%-2d ~ %-2d' % (index, i)
        # Read cases from the file and classify each case
        f = open('E:\\databases\\classification%02d.txt' % (i + 1), 'r')
        results = []
        count = 0
        for line in f:
            count += 1
            line = line.strip()
            if len(line) == 0:
                continue
            if count == 1:  # the first line -- title
                header = 'CAT%02d' % (index + 1)
                assert header not in line
                results.append('%s\t%s' % (line, header))
                continue
            pass  # if count == 1
            case = line.split()
            assert len(case) >= 4
            clf = classifier.classify(case[2])
            results.append('%s\t%s' % (line, clf))
        pass  # for line in f
        f.close()

        # Save the results back to the file
        f = open('E:\\databases\\classification%02d.txt' % (i + 1), 'w')
        for re in results:
            f.write('%s\n' % re)
        f.close()
    pass  # for i in range(10)
    def train(self):
        with open('src/URL.csv', 'r') as csv_file:
            reader = csv_file.readlines()
            for line in reader:
                read_dict = {}
                line_split = line.split(',')
                if len(line_split) < 2 or len(line_split[0]) == 0:
                    continue
                read_dict['text'] = line_split[0].strip()
                read_dict['class'] = line_split[1].strip()

                self.training_data.append(read_dict)

        #print training_data
        for data in self.training_data:
            self.newsTrainer.train(data['text'], data['class'])

        self.newsClassifier = Classifier(self.newsTrainer.data, tokenizer.Tokenizer(stop_words = [], signs_to_remove = ["?!#%&"]))
def classify(filename, size):
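    """Split the data into balanced training/testing sets, train a URL
    classifier, and print a confusion matrix with the counts divided by
    `size`."""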

    trainingSet, testingSet = make_balanced_sets.create_sets(filename, size)

    trainer = Trainer(tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))

    for sample in trainingSet:
        trainer.train(sample['url'], sample['result'])

    classifier = Classifier(
        trainer.data, tokenizer.Tokenizer(stop_words=[], signs_to_remove=[""]))

    mal_mal = 0
    mal_clean = 0
    clean_clean = 0
    clean_mal = 0

    for sample in testingSet:

        predicted = classifier.classify(sample['url'])[0][0]
        actual = sample['result']

        if predicted == 'malicious' and actual == 'malicious':
            mal_mal += 1
        elif predicted == 'malicious' and actual == 'clean':
            mal_clean += 1
        elif predicted == 'clean' and actual == 'clean':
            clean_clean += 1
        elif predicted == 'clean' and actual == 'malicious':
            clean_mal += 1

    size = float(size)

    mal_mal = float(mal_mal) / size
    mal_clean = float(mal_clean) / size
    clean_mal = float(clean_mal) / size
    clean_clean = float(clean_clean) / size

    confusionMatrix = [[mal_mal, clean_mal], [mal_clean, clean_clean]]

    pprint(confusionMatrix)
    print "Accuracy: " + str(mal_mal + clean_clean)
    print "False positives (predicted clean when malicious): " + str(clean_mal)
    print "False negatives (predicted malicious when clean): " + str(mal_clean)
Example #15
def get_classer():
    newsTrainer = Trainer(tokenizer)

    for news in newsSet:
        newsTrainer.train(news['text'], news['category'])

    # When you have sufficient trained data, you are almost done and can start to use
    # a classifier.
    newsClassifier = Classifier(newsTrainer.data, tokenizer)
    return newsClassifier
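# A minimal usage sketch for get_classer (newsSet is assumed to be defined at
# module level; the input string is only illustrative):
#
#     classifier = get_classer()
#     print(classifier.classify("Obama is"))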
Example #16
 def neyronka(self, _str):
     newsTrainer = Trainer(tokenizer)
     with open('o', 'rt', encoding='utf8') as csvfile:
          newsSet = []
          for i in csvfile.readlines():
              if i == '\n':
                  continue
              theme, text = i.split('***')
              newsSet.append({'text': text.strip(), 'category': str(theme)})
          for news in newsSet:
              newsTrainer.train(news['text'], news['category'])
          newsClassifier = Classifier(newsTrainer.data, tokenizer)
          classification = newsClassifier.classify(_str)
          return sorted(classification, key=lambda x: -x[1])
Example #17
def train_classifier(newsData_train):

    data_process = Data_process()

    for data in newsData_train:
        data_process.final_process(data['text'], data['category'])

    newsClassifier = Classifier(data_process, data_process.tokenizer)

    return newsClassifier
def tweet_classification(unknownInstance):
    newsTrainer = Trainer(tokenizer)
    with open("train.txt") as f:
        for line in f:
            parts = line.split(' ', 1)
            newsTrainer.train(parts[1], parts[0])
    newsClassifier = Classifier(newsTrainer.data, tokenizer)
    # Now you have a classifier which you can try on text whose category is
    # not yet known.
    classification = newsClassifier.classify(unknownInstance)
    # The classification variable holds the possible categories sorted by
    # their probability value.
    ans = dict()
    for i in range(3):
        if classification[0][1] != 0.0:
            ans[classification[i][0]] = classification[i][1] / classification[0][1]
            #print classification
    #print ans
    return ans
Example #19
    def init(cls, lang='tr', namesCollection=NamesCollection, classifier=None):

        cls.lang = lang
        cls.namesCollection = namesCollection

        if classifier:
            cls.classifier = classifier
        else:
            cls.classifier = Classifier(CachedModel.get(lang), tokenizer)

        cls.initialized = True
Example #20
class NaiveBayesClassifier:
    def __init__(self):
        jieba.set_dictionary('dict.big.txt')
        self.articleTrainer = Trainer(tokenizer)

    def train(self):
        # Training
        articles = article.create_articles_from_file("data/HatePoliticsdata.json")
        p_train = articles[0:3001]
        p_test = articles[3001:3031]

        for a in p_train:
            doc = a.body
            #seg_list = jieba.lcut(doc, cut_all=False)
            seg_list = jieba.analyse.extract_tags(doc)
            doc = " ".join(seg_list)
            self.articleTrainer.train(doc, 'politics')

        articles = article.create_articles_from_file("data/Gossipingdata.json")
        g_train = articles[0:3000]
        g_test = articles[3001:3301]

        for a in g_train:
            doc = a.body
            #seg_list = jieba.lcut(doc, cut_all=False)
            seg_list = jieba.analyse.extract_tags(doc)
            doc = " ".join(seg_list)
            self.articleTrainer.train(doc, 'gossiping')
        f = open('data/docCountOfClasses.json', 'w', -1, 'utf-8')
        f.write(json.dumps(self.articleTrainer.data.docCountOfClasses))
        f.close()
        f = open('data/frequencies.json', 'w', -1, 'utf-8')
        f.write(json.dumps(self.articleTrainer.data.frequencies))
        f.close()
        

    def classify(self, article):
        self.data = TrainedData()
        f = open('data/docCountOfClasses.json', 'r', -1, 'utf-8')
        self.data.docCountOfClasses = json.load(f)
        f.close()
        f = open('data/frequencies.json', 'r', -1, 'utf-8')
        self.data.frequencies = json.load(f)
        f.close()
        #Testing
        self.articleClassifier = Classifier(self.data, tokenizer)
        doc = article.body
        #seg_list = jieba.lcut(doc, cut_all=False)
        seg_list = jieba.analyse.extract_tags(doc)
        doc = " ".join(seg_list)
        classification = self.articleClassifier.classify(doc)
        return classification[0][0]
def determine(sentence):
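    """Train on the Fact/Decision pairs in data.csv and return the integer
    parts (as strings) of the scores for the two decisions as [true, false]."""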
    newsTrainer = Trainer(tokenizer)
    newsSet = []

    with open('data.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            newsSet.append({'fact': row['Fact'], 'decision': row['Decision']})

    for news in newsSet:
        newsTrainer.train(news['fact'], news['decision'])

    newsClassifier = Classifier(newsTrainer.data, tokenizer)
    classification = newsClassifier.classify(sentence)
    # False
    false = classification[0][1]
    false = str(false).split('.')[0]
    # True
    true = classification[1][1]
    true = str(true).split('.')[0]
    data = [true, false]
    return data
Example #22
def post_logfile():
    if request.method == 'GET':
        log_file = request.args['symptom']
        print(log_file)
        diseaseclassifier = Trainer(tokenizer)  #STARTS CLASSIFIER
        with open("Dataset.csv", "r") as file:  #OPENS DATASET
            for line in file:  #FOR EACH LINE
                lines = line.split(",")  #PARSE CSV <DISEASE> <SYMPTOM>
                diseaseclassifier.train(lines[1], lines[0])  #TRAINING
        diseaseclassifier = Classifier(diseaseclassifier.data, tokenizer)
        classification = diseaseclassifier.classify(log_file)  #CLASSIFY INPUT
        print(classification)

        return json.dumps(dict(classification))
Example #23
    def update(self, text, category):
        """Update training data with new examples.

        Adds new data to the trainer then generates a new classifier. Can be
        useful for updating on the fly if performing an interactive data import.

        Parameters:
            text (str): New text to classify.
            category (str): Classification of `text`.
        """
        self._trainer.train(text, category)
        self._classifier = BayesClassifier(
            self._trainer.data,
            self._tknizer
        )
Example #24
def create_naive_bayes_classifier(training_examples, training_annotations):
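    """Map each raw annotation through `categories`, train on the
    (example, category) pairs, and return the resulting Classifier."""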
    print("creating naive bayes classifier")
    annotations = [categories[x] for x in training_annotations]

    news_trainer = Trainer(
        tokenizer.Tokenizer(stop_words=[],
                            signs_to_remove=[tokenization_string]))
    for example, annotation in zip(training_examples, annotations):
        news_trainer.train(example, annotation)
    classifier = Classifier(
        news_trainer.data,
        tokenizer.Tokenizer(stop_words=[],
                            signs_to_remove=[tokenization_string]))
    print("\t->done")
    return classifier
Example #25
 def classify(self, article):
     self.data = TrainedData()
     f = open('data/docCountOfClasses.json', 'r', -1, 'utf-8')
     self.data.docCountOfClasses = json.load(f)
     f.close()
     f = open('data/frequencies.json', 'r', -1, 'utf-8')
     self.data.frequencies = json.load(f)
     f.close()
     #Testing
     self.articleClassifier = Classifier(self.data, tokenizer)
     doc = article.body
     #seg_list = jieba.lcut(doc, cut_all=False)
     seg_list = jieba.analyse.extract_tags(doc)
     doc = " ".join(seg_list)
     classification = self.articleClassifier.classify(doc)
     return classification[0][0]
Example #26
def post_logfile():
    if request.method == 'POST':
        log_file = request.args['symptom']
        print(log_file)
        diseaseclassifier = Trainer(tokenizer)  #STARTS CLASSIFIER
        with open("Dataset.csv", "r") as file:  #OPENS DATASET
            for line in file:  #FOR EACH LINE
                lines = line.split(",")  #PARSE CSV <DISEASE> <SYMPTOM>
                diseaseclassifier.train(lines[1], lines[0])  #TRAINING
        diseaseclassifier = Classifier(diseaseclassifier.data, tokenizer)
        classification = diseaseclassifier.classify(log_file)  #CLASSIFY INPUT
        print(classification)
        result = []
        for item in classification:
            obj = CustomType(item[0], item[1])
            result.append(json.loads(obj.toJSON()))
        # return json.dumps(OrderedDict(classification))
        return json.dumps(result, indent=4)
Example #27
def create_nbc_nb_classifier(training_dataset):
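    """Tokenize every training example into a set of tokens, train on the
    (tokens, annotation) pairs, and return both the Trainer and a Classifier
    built from its data."""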
    training_examples, training_annotations = training_dataset
    # training_annotations = [int(not bool(annotation)) for annotation in training_annotations]
    parsed_training_examples = [
        set(tokenize(example)) for example in training_examples
    ]

    tr = Trainer(
        tokenizer.Tokenizer(stop_words=[],
                            signs_to_remove=[tokenization_string]))
    for example, annotation in zip(parsed_training_examples,
                                   training_annotations):
        tr.train(example, annotation)

    print("number of tokens seen: %s" % len(tr.data.frequencies.keys()))
    return tr, Classifier(
        tr.data,
        tokenizer.Tokenizer(stop_words=[],
                            signs_to_remove=[tokenization_string]))
Example #28
class DomainModel:

    data_interface = []
    newsTrainer = Trainer(
        tokenizer.Tokenizer(stop_words=[], signs_to_remove=["?!#%&"]))
    newsClassifier = None

    def __init__(self, data_interface):
        """
        Constructor:
        Store data interface on creation,
        Don't train yet, let parent decide when
        """

        if not isinstance(data_interface, Data):
            raise ValueError(
                "Data is not properly interfaced through class Data")

        self.data_interface = data_interface

    def train(self):
        """Train on base and FB data"""

        # Run through each training example in data interface and
        # feed them into model
        for data_point in self.data_interface.arr:
            data_class = data_point[2].strip()  # Class is "Credibility"
            data_text = data_point[4].strip()  # Text is "Content URL"
            self.newsTrainer.train(data_text, data_class)

        self.newsClassifier = Classifier(self.newsTrainer.data, \
            tokenizer.Tokenizer(stop_words = [], signs_to_remove = ["?!#%&"]))

    def classify(self, unknownInstance):
        classification = self.newsClassifier.classify(unknownInstance)
        return classification
documentTrainer = Trainer(tokenizer)

documentSet = []


def getTextBasedOnDocumentID(documentID):
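    """Return the quoted text field for a document id of the form
    '<prefix>_<n>', read from line n + 2 of document_set.csv."""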
    ID = int(documentID.split('_')[1])
    line = linecache.getline('../2.document_set/document_set.csv', ID + 2)
    text = line.split(',"')[1]
    return text


for i in range(0, len(traincsv)):
    documentSet.append({
        'text': getTextBasedOnDocumentID(traincsv[i][0]),
        'category': traincsv[i][1]
    })

for documents in documentSet:
    documentTrainer.train(documents['text'], documents['category'])

newsClassifier = Classifier(documentTrainer.data, tokenizer)

for i in range(0, len(testcsv)):
    data = getTextBasedOnDocumentID(testcsv[i][0])
    classification = newsClassifier.classify(data)
    testcsv[i][1] = int(classification[0][0])
df = pd.DataFrame(testcsv)
df.to_csv("../5.evaluation_file/predicted_cat.csv", index=False)
#np.savetxt("./5.evaluation_file/predicted_cat.csv", testcsv,header="document_id,category" ,delimiter=",")
import json, os, sys, re
from naiveBayesClassifier import tokenizer
from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier
##IMPORTS
'''
Usage:

    python GuessDisease.py "symptomA symptomB symptomC"
Example INPUT:
    python GuessDisease.py "agitation exhaustion vomit"
Example OUTPUT:

    {
    "disease": "influenza"
    }


'''

##SETTING UP
diseaseclassifier = Trainer(tokenizer)  #STARTS CLASSIFIER
with open("Dataset.csv", "r") as file:  #OPENS DATASET
    for line in file:  #FOR EACH LINE
        lines = line.split(",")  #PARSE CSV <DISEASE> <SYMPTOM>
        diseaseclassifier.train(lines[1], lines[0])  #TRAINING
diseaseclassifier = Classifier(diseaseclassifier.data, tokenizer)
classification = diseaseclassifier.classify(sys.argv[1])  #CLASSIFY INPUT
print(classification[0])  #PRINT CLASSIFICATION
Example #31
tosTrainer = Trainer(tokenizer)

def get_corp(read_file):
    with open(read_file, "r") as r:
        corpus = []
        for line in r:
            tabsep = line.decode('utf-8').strip().split('\t')
            a = {}
            a['text'] = tabsep[0]
            a['rating'] = tabsep[1]
            corpus.append(a)
        return corpus

# get the corpus from a training set - using copyright clauses here as an example (a subset of the csv generated by the getpointsdata.py script)
tosSet = get_corp("tosdr.org/copyrighttrainset.txt")

# You need to train the system passing each text one by one to the trainer module.
for corpi in tosSet:
    tosTrainer.train(corpi['text'], corpi['rating'])

# When you have sufficient trained data, you are almost done and can start to use a classifier.
tosClassifier = Classifier(tosTrainer.data, tokenizer)

# Now you have a classifier which you can try on policy-clause text whose rating is not yet known. The example here is drawn from the test set.
unknownInstance = "You are free to choose your own copyright license for your content in your account settings: Public Domain Creative Commons non commercial or free licenses but also classic copyright if you wish so."
classification = tosClassifier.classify(unknownInstance)

# the classification variable holds the possible categories sorted by their probability value
print classification
Example #32
from naiveBayesClassifier.classifier import Classifier

sentimentTrainer = Trainer(tokenizer)

# Get the training dataset.
with open('training.csv', 'r') as f:
    data = f.read()
trainset = data.splitlines()

for line in trainset:
    pos1 = line.find(',"')
    pos2 = line.find('",', pos1)
    if pos1 == -1:
        pos1 = line.find(',')
        pos2 = line.find(',', pos1 + 1)
        comment = line[pos1 + 1:pos2]
        sentiment = line[pos2 + 1:]
    else:
        comment = line[pos1 + 2:pos2 - 2]
        sentiment = line[pos2 + 2:]
    sentimentTrainer.train(comment, sentiment)

# Use the classifier.
sentimentClassifier = Classifier(sentimentTrainer.data, tokenizer)

# Classify an unknown review.
unknownInstance = "I don't like the app. It crashes everytime."
classification = sentimentClassifier.classify(unknownInstance)

print classification
You want to train a system with these pre-categorized/pre-classified
texts, so it makes sense to call this data your training set.
"""
from naiveBayesClassifier import tokenizer
from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier

newsTrainer = Trainer(tokenizer)

# You need to train the system passing each text one by one to the trainer module.
newsSet =[
    {'text': 'not to eat too much is not enough to lose weight', 'category': 'health'},
    {'text': 'Russia try to invade Ukraine', 'category': 'politics'},
    {'text': 'do not neglect exercise', 'category': 'health'},
    {'text': 'Syria is the main issue, Obama says', 'category': 'politics'},
    {'text': 'eat to lose weight', 'category': 'health'},
    {'text': 'you should not eat much', 'category': 'health'}
]
for news in newsSet:
    newsTrainer.train(news['text'], news['category'])

# When you have sufficient trained data, you are almost done and can start to use
# a classifier.
newsClassifier = Classifier(newsTrainer.data, tokenizer)

# Now you have a classifier which you can try on news text whose category is
# not yet known.
classification = newsClassifier.classify("Obama is")

# the classification variable holds the detected categories sorted
print(classification)
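# The result is a list of (category, score) pairs covering the categories seen
# in training, e.g. something like [('politics', ...), ('health', ...)].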
    doc = " ".join(seg_list)
    articleTrainer.train(doc, 'politics')

articles = article.create_articles_from_file("data/Gossipingdata.json")
g_train = articles[0:3000]
g_test = articles[3001:3301]

for a in g_train:
    doc = a.body
    #seg_list = jieba.lcut(doc, cut_all=False)
    seg_list = jieba.analyse.extract_tags(doc)
    doc = " ".join(seg_list)
    articleTrainer.train(doc, 'gossiping')

#Testing
articleClassifier = Classifier(articleTrainer.data, tokenizer)
p_gossiping = 0
p_politics = 0
g_gossiping = 0
g_politics = 0

for a in p_test:
    doc = a.body
    #seg_list = jieba.lcut(doc, cut_all=False)
    seg_list = jieba.analyse.extract_tags(doc)
    doc = " ".join(seg_list)
    classification = articleClassifier.classify(doc)
    if classification[0][0] == 'gossiping':
        p_gossiping += 1
    else:
        p_politics += 1
def classify(input):
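    """Train separate grade/load/lecture classifiers from data.txt, reduce the
    input to content words via Twitter POS tagging, and return the three
    classification results."""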
    twitter = Twitter()

    f = open("data.txt", "r")

    data = json.loads(f.read())

    gradeTrainer = Trainer(tokenizer)
    loadTrainer = Trainer(tokenizer)
    lectureTrainer = Trainer(tokenizer)

    print("Training grade ...")
    for subject in data:
        if subject["grade"] != "?":
            review = subject["comment"].replace('.', '\n').split("\n")
            for li in review:
                if len(li.strip()) != 0:
                    gradeTrainer.train(li, subject["grade"])

    print("Training load ...")
    for subject in data:
        if subject["load"] != "?":
            review = subject["comment"].replace('.', '\n').split("\n")
            for li in review:
                if len(li.strip()) != 0:
                    loadTrainer.train(li, subject["load"])

    print("Training lecture ...")
    for subject in data:
        if subject["lecture"] != "?":
            review = subject["comment"].replace('.', '\n').split("\n")
            for li in review:
                if len(li.strip()) != 0:
                    lectureTrainer.train(li, subject["lecture"])

    gradeClassifier = Classifier(gradeTrainer.data, tokenizer)
    loadClassifier = Classifier(loadTrainer.data, tokenizer)
    lectureClassifier = Classifier(lectureTrainer.data, tokenizer)

    input = u"" + input
    classify_input = []

    for element in twitter.pos(input):
        if element[1] in ('Noun', 'Verb', 'Adjective', 'Adverb',
                          'Exclamation', 'Alpha', 'KoreanParticle'):
            classify_input.append(element[0])

    text = " ".join(classify_input)

    print(text)

    gradeClassification = gradeClassifier.classify(text)
    loadClassification = loadClassifier.classify(text)
    lectureClassification = lectureClassifier.classify(text)

    print(
        "\n________________________________________GRADE________________________________________\n"
    )
    print(gradeClassification)
    print(
        "\n________________________________________LOAD_________________________________________\n"
    )
    print(loadClassification)
    print(
        "\n________________________________________LECTURE______________________________________\n"
    )
    print(lectureClassification)

    return gradeClassification, loadClassification, lectureClassification
Example #36
    def get(self):
        try:
            print "  "
            print "TestClassifier start"
            print "  "
            # pasar  los stop words a lista desde el file
            with open("stop_words.txt", "r") as ins:
                array = []
                for line in ins:
                    array.append((line.rstrip('\n')).decode('unicode-escape'))
            #print array
            newsTrainer = Trainer(
                tokenizer.Tokenizer(stop_words=array,
                                    signs_to_remove=["?!#%&_"]))

            hoy = date.today()

            query = News3.query(News3.date == hoy,
                                News3.news_from.IN([
                                    "uy_press",
                                ]), News3.category == "Política")

            # You need to train the system passing each text one by one to the trainer module.
            #newsSet =[
            #    {'text': 'not to eat too much is not enough to lose weight', 'category': 'health'},
            #    {'text': 'Russia try to invade Ukraine', 'category': 'politics'},
            #    {'text': 'do not neglect exercise', 'category': 'health'},
            #    {'text': 'Syria is the main issue, Obama says', 'category': 'politics'},
            #    {'text': 'eat to lose weight', 'category': 'health'},
            #    {'text': 'you should not eat much', 'category': 'health'}
            #]

            query2 = News3.query(News3.date == hoy,
                                 News3.news_from == "uy_press",
                                 News3.category == "deportes")

            query4 = News3.query(News3.date == hoy,
                                 News3.news_from == "uy_press",
                                 News3.category == "salud")

            #for news in newsSet:
            #    newsTrainer.train(news['text'], news['category'])
            c = 0
            #print query
            for i in query:
                print "  "
                print i.category
                newsTrainer.train(i.html, 'politica')
                #if c == 10: break
                c += 1

            #for i in query2:
            #	newsTrainer.train(i.html, 'deportes')
            #raise Exception('I know Python!')

            #for i in query4:
            #	newsTrainer.train(i.html, 'salud')

            # When you have sufficient trained data, you are almost done and can start to use
            # a classifier.

            # Now you have a classifier which you can try on news text whose
            # category is not yet known.
            query3 = News3.query(
                News3.date == hoy,
                News3.news_from.IN([
                    "el_pais",
                ]),
                News3.id.IN([0]),
            )

            ###
            newsClassifier = Classifier(
                newsTrainer.data,
                tokenizer.Tokenizer(stop_words=array,
                                    signs_to_remove=["?!#%&"]))
            #print unknownInstance
            classification = newsClassifier.classify(
                "Vidalín: No quiero que me llamen para saber qué tramite hay que hacer para poner un prostíbulo"
            )

            # the classification variable holds the detected categories sorted
            print " classification "
            print(classification)
        except:
            print traceback.format_exc()
def article_keywords(article):
    keys = Keywords.objects.get(article=article)
    print keys
    l = [k.keyword for k in keys.keywords.all()]
    print " ".join(l)
    keyset = {'keyword': " ".join(l)}
    return keyset


if __name__ == '__main__':
    print "Starting testing of Bayes Classifer"
    labeled_articles = [
        (a, a.relevant)
        for a in Article.objects.all()[:(len(Article.objects.all()))]
    ]
    print labeled_articles
    featuresets = []
    for (article, relevant) in labeled_articles:
        r = article_keywords(article)
        featuresets.append((r, relevant))
    print featuresets
    train_set, test_set = featuresets[:(len(featuresets))], featuresets[(
        len(featuresets) - 2):]
    print train_set
    newsTrainer = Trainer(tokenizer)
    for f in train_set:
        newsTrainer.train(f[0]['keyword'], f[1])
    newsClassifier = Classifier(newsTrainer.data, tokenizer)
    url = raw_input("Enter the url: ")
    testurl(url, newsClassifier)