Exemple #1
0
    def __init__(self, stopWords=True):
        """Set up the empty counters and containers used for NB training/testing.

        stopWords -- when True, stop words are skipped while counting terms.
        """
        # Per-class term counts: __wordCountByTermClass[class][term] -> frequency
        self.__wordCountByTermClass = collections.defaultdict(int)
        # Total number of token occurrences per class
        self.__wordCountByClass = collections.defaultdict(int)

        self.__numDocs = -1                                  # N (documents seen)
        self.__classes = list()                              # known class labels
        self.__prior = collections.defaultdict(int)          # P(c) = Nc / N
        self.__N = collections.defaultdict(int)              # documents per class
        self.__vocabulary = collections.defaultdict(int)     # term -> corpus frequency
        self.__totalNumberOfTokens = 0
        self.__condProb = collections.defaultdict(int)       # condProb[class][term]
        # Per-class evaluation metrics, filled in at test time
        self.__precision = collections.defaultdict(int)
        self.__recall = collections.defaultdict(int)
        self.__f1 = collections.defaultdict(int)
        self.__usingStopWordsList = stopWords
        self.__stopWords = StopWords()
Exemple #2
0
"""
# Open the backing-store connection and grab a handle to the processed
# collection.  NOTE(review): `base` is a project-local module; its import is
# not visible in this chunk -- confirm it is in scope here.
cliente = base.iniciar_conexao()
con_colecao = base.iniciar_colecao(cliente, "colecao_processada")


def load_base():
    """Fetch every document from the "colecao_completa" collection."""
    return base.carrega_colecao_completo("brumadinhoinflux",
                                         "colecao_completa")


# print('spaCy Version: %s' % (spacy.__version__))
# Load the Portuguese spaCy pipelines used downstream.
spacy_nlp = spacy.load('pt')
nlp = spacy.load("pt_core_news_sm")
spacy_stopwords = spacy.lang.pt.stop_words.STOP_WORDS
set_stop = stopWords.load_stop_words()
# Combine both stop-word bases.  BUG FIX: set.union() returns a NEW set and
# does not modify set_stop in place -- the original discarded the result,
# leaving spaCy's stop words unused (compare the correctly assigned form
# used elsewhere in this file).
set_stop = set_stop.union(spacy_stopwords)

# Load the full text collection once at import time.
all_textos = load_base()


def to_int_str(data):
    """Coerce *data* to an integer and return its decimal string form."""
    value = int(data)
    return str(value)


def remover_acentos(txt):
    """Strip accents/diacritics from *txt*, keeping only ASCII characters."""
    decomposed = normalize('NFKD', txt)
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    return ascii_bytes.decode('ASCII')


def common_words(tokens):
"""
import ast
import re
# import base
import string
from collections import Counter
from unicodedata import normalize

import emoji
import spacy
import stopWords.StopWords as stopWords

# print('spaCy Version: %s' % (spacy.__version__))
# Portuguese spaCy pipeline used for downstream NLP processing.
nlp = spacy.load("pt_core_news_sm")
spacy_stopwords = spacy.lang.pt.stop_words.STOP_WORDS
set_stop = stopWords.load_stop_words()
# load adjectives (they are merged into the stop-word set below)
set_adjetivos = stopWords.load_stop_words("adjetivos.txt")
# combine the stop-word bases (set.union returns a new set, hence the rebinds)
set_stop = set_stop.union(spacy_stopwords)
set_stop = set_stop.union(set_adjetivos)


def remove_emoji(text):
    """Return *text* with every emoji removed.

    NOTE(review): emoji.get_emoji_regexp() was removed in emoji 2.0 --
    confirm the pinned version, or migrate to emoji.replace_emoji().
    """
    emoji_pattern = emoji.get_emoji_regexp()
    return emoji_pattern.sub(u'', text)


def to_int_str(data):
    """Convert *data* to an integer and render it as a string."""
    as_int = int(data)
    return str(as_int)
import math
import heapq
import subprocess
from stopWords import StopWords
from Stemmer import Stemmer

# Python 2 hack: reload(sys) re-exposes setdefaultencoding(), then force
# UTF-8 as the default codec for implicit str<->unicode coercion.
reload(sys)
sys.setdefaultencoding('utf8')

argv = sys.argv

# Location and encoding of the wiki XML dump to index.
PATH_WIKI_XML = './'
FILENAME_WIKI = './wiki-search-small.xml'
ENCODING = "utf-8"

# Project-local stop-word list and an English stemmer.
stop_words = StopWords()
stop_words.readStopWords()
stemmer = Stemmer('english')
# Frequency tables and document titles -- presumably filled while parsing
# the dump (filling code is not visible in this chunk; TODO confirm).
freq = {}
doc_freq = {}
titles = {}
# Matches [[Category:...]] wiki tags; group(1) captures the category payload.
category_detection = re.compile(u"\[\[Category:(.*?)\]\]", re.M)
file_cntr = 0
file_step = 1000

def getCategories(text):
    """Extract category names from [[Category:...]] tags in wiki markup.

    NOTE(review): this definition appears truncated in this chunk -- `temp`
    is computed but never used and no return statement is visible; recover
    the rest of the body before relying on it.
    """
    cate = []
    # re.finditer always returns an iterator (never None), so this guard is
    # always truthy; iterating the result directly would be equivalent.
    matches = re.finditer(category_detection, text)
    if matches:
        for match in matches:
            temp = match.group(1).split("|")
Exemple #5
0
class NB:
    """Multinomial Naive Bayes text classifier (Python 2 code).

    Instances are dicts mapping term -> frequency, with two reserved keys:
    key 0 holds the class label and key 1 holds the category name.
    """

    def __init__(self, stopWords=True):
        """Initialise counters.  stopWords: skip stop words when True."""
        # wordCountByTermClass[class][term] -> term frequency inside class
        self.__wordCountByTermClass = collections.defaultdict(lambda: 0)
        # wordCountByClass[class] -> total number of tokens in that class
        self.__wordCountByClass = collections.defaultdict(lambda: 0)

        self.__numDocs = -1                                     # N
        self.__classes = list()                                 # class labels
        self.__prior = collections.defaultdict(lambda: 0)       # Nc / N
        self.__N = collections.defaultdict(lambda: 0)           # docs per class
        self.__vocabulary = collections.defaultdict(lambda: 0)  # term -> corpus freq
        self.__totalNumberOfTokens = 0
        self.__condProb = collections.defaultdict(lambda: 0)    # condProb[class][term]
        self.__precision = collections.defaultdict(lambda: 0)
        self.__recall = collections.defaultdict(lambda: 0)
        self.__f1 = collections.defaultdict(lambda: 0)
        self.__usingStopWordsList = stopWords
        self.__stopWords = StopWords()

    def __getAllStatistics(self, data):
        """Accumulate per-class and global term statistics from *data*."""
        logging.info("Calculating statistics...")
        for featureVector in data:
            className = featureVector.pop(0)     # remove temporarily
            categoryName = featureVector.pop(1)  # remove temporarily

            if len(featureVector) == 0:  # Empty category found...
                logging.warning("No pages were found for category %s", categoryName)
                continue

            # Add new class to the set of classes, if necessary
            if className not in self.__classes:
                self.__classes.append(className)
                self.__wordCountByTermClass[className] = collections.defaultdict(lambda: 0)

            # increment number of documents of this class
            self.__N[className] += 1

            for (value, freq) in featureVector.iteritems():
                if self.__usingStopWordsList and self.__stopWords.isStopWord(value):
                    continue

                freq = int(freq)
                self.__wordCountByTermClass[className][value] += freq
                self.__wordCountByClass[className] += freq
                self.__vocabulary[value] += freq
                self.__totalNumberOfTokens += freq

            # restore the reserved keys removed above
            featureVector[0] = className
            featureVector[1] = categoryName

        # remove empty elements
        data = [elem for elem in data if len(elem) > 0]

        # Get the number of valid documents found
        self.__numDocs = len(data)
        if self.__numDocs == 0:
            logging.error("Empty training set! Aborting...")
            print "ERROR - empty training set!"
            # BUG FIX: exit with a non-zero status on a fatal error
            sys.exit(1)

    def trainClassifier(self, data):
        """Estimate priors and Laplace-smoothed conditional probabilities."""
        logging.info("Training NB...")
        self.__getAllStatistics(data)
        logging.info("Calculated statistics. Using %d categories.", self.__numDocs)

        for c in self.__classes:
            # BUG FIX: force float division -- under Python 2 (without
            # "from __future__ import division") these were integer
            # divisions, collapsing priors/probabilities to 0 and making
            # math.log(0) blow up at classification time.
            self.__prior[c] = float(self.__N[c]) / self.__numDocs
            # BUG FIX: the original default factory,
            #   lambda: 1 / (self.__wordCountByClass[c] + len(self.__vocabulary)),
            # was a late-binding closure over the loop variable `c`: every
            # class's defaultdict ended up using the LAST class's denominator
            # for unseen terms.  Bind the smoothed default value eagerly.
            denominator = float(self.__wordCountByClass[c] + len(self.__vocabulary))
            defaultProb = 1.0 / denominator
            self.__condProb[c] = collections.defaultdict(lambda p=defaultProb: p)

            logging.debug("Class %s - Prior %f", c, self.__prior[c])
            for v in self.__vocabulary.keys():
                # Laplace (add-one) smoothing
                self.__condProb[c][v] = (self.__wordCountByTermClass[c][v] + 1) / denominator
                logging.debug("Class %s - Word %s Conditional Probability --> %f", c, v, self.__condProb[c][v])

    def testInBatch(self, instances):
        """Classify *instances* and report per-class precision/recall/F1."""
        logging.info("Testing in Batch mode...")

        correctMap = collections.defaultdict(lambda: 0)
        numberOfInstances = collections.defaultdict(lambda: 0)
        predictedMap = collections.defaultdict(lambda: 0)

        for instance in instances:
            groundTruth = instance[0]
            categoryName = instance[1]

            predicted = self.testClassifier(instance)

            numberOfInstances[groundTruth] += 1
            predictedMap[predicted] += 1

            if predicted == groundTruth:
                correctMap[groundTruth] += 1

        logging.info("Result details by class...")
        for c in self.__classes:

            print "------- For Class", c,
            logging.info("----- For class %s", c)

            try:
                # float() guards against Python 2 integer division
                self.__precision[c] = float(correctMap[c]) / predictedMap[c]
            except ZeroDivisionError:
                self.__precision[c] = 0.0
                logging.warning("PredictedMap[%s] = 0.0. This class was never predicted!", c)

            print "precision =", self.__precision[c],
            logging.info("-- Precision %f", self.__precision[c])

            try:
                self.__recall[c] = float(correctMap[c]) / numberOfInstances[c]
            except ZeroDivisionError:
                self.__recall[c] = 0.0
                logging.warning("Recall[%s] = 0.0 --- numberOfInstances of this class is zero!", c)

            print "recall =", self.__recall[c]
            logging.info("-- Recall %f", self.__recall[c])

            try:
                self.__f1[c] = (2 * self.__precision[c] * self.__recall[c]) / (self.__precision[c] + self.__recall[c])
            except ZeroDivisionError:
                self.__f1[c] = 0.0
                logging.warning("F1[%s] = 0.0 --- precision + recall = 0!", c)

            print "F1 =", self.__f1[c]
            logging.info("-- F1 %f", self.__f1[c])

    def testClassifier(self, instance):
        """Return the most likely class for one instance (term -> freq dict)."""
        score = collections.defaultdict(lambda: 0)
        classInformation = instance.pop(0)  # remove class information temporarily
        categoryName = instance.pop(1)      # remove category name temporarily

        for c in self.__classes:
            # work in log-space to avoid floating-point underflow
            score[c] = math.log(self.__prior[c])
            for (word, freq) in instance.iteritems():
                if self.__usingStopWordsList and self.__stopWords.isStopWord(word):
                    continue
                score[c] += (int(freq) * math.log(self.__condProb[c][word]))

            logging.debug("Predicted score for class %s => %f", c, score[c])

        # restore the reserved keys removed above
        instance[0] = classInformation
        instance[1] = categoryName
        predicted = max(score, key=score.get)

        logging.info("Tested instance \'%s\' --- Class predicted %s --- ground truth %s", categoryName, predicted, classInformation)
        return predicted