""" cliente = base.iniciar_conexao() con_colecao = base.iniciar_colecao(cliente, "colecao_processada") def load_base(): all_textos = base.carrega_colecao_completo("brumadinhoinflux", "colecao_completa") return all_textos # print('spaCy Version: %s' % (spacy.__version__)) spacy_nlp = spacy.load('pt') nlp = spacy.load("pt_core_news_sm") spacy_stopwords = spacy.lang.pt.stop_words.STOP_WORDS set_stop = stopWords.load_stop_words() # combina as duas bases de stopWords set_stop.union(spacy_stopwords) all_textos = load_base() def to_int_str(data): return str(int(data)) def remover_acentos(txt): return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII') def common_words(tokens):
""" import ast import re # import base import string from collections import Counter from unicodedata import normalize import emoji import spacy import stopWords.StopWords as stopWords # print('spaCy Version: %s' % (spacy.__version__)) nlp = spacy.load("pt_core_news_sm") spacy_stopwords = spacy.lang.pt.stop_words.STOP_WORDS set_stop = stopWords.load_stop_words() # carrega adjetivos set_adjetivos = stopWords.load_stop_words("adjetivos.txt") # combina as duas bases de stopWords set_stop = set_stop.union(spacy_stopwords) set_stop = set_stop.union(set_adjetivos) def remove_emoji(text): """remove emoji de uma string.""" return emoji.get_emoji_regexp().sub(u'', text) def to_int_str(data): """converte para inteiro.""" return str(int(data))
import heapq
import math
import re
import subprocess
import sys

from stopWords import StopWords
from Stemmer import Stemmer

argv = sys.argv

PATH_WIKI_XML = './'
FILENAME_WIKI = './wiki-search-small.xml'
ENCODING = "utf-8"

stop_words = StopWords()
stop_words.readStopWords()
stemmer = Stemmer('english')

freq = {}
doc_freq = {}
titles = {}

category_detection = re.compile(r"\[\[Category:(.*?)\]\]", re.M)

file_cntr = 0
file_step = 1000


def getCategories(text):
    """Extract category names from wiki markup such as [[Category:Name|sort key]]."""
    cate = []
    for match in re.finditer(category_detection, text):
        temp = match.group(1).split("|")
        # assumption: keep only the category name, discarding any "|" sort key
        cate.append(temp[0].strip())
    return cate
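
# A small demonstration of the category regex and getCategories() on
# hypothetical wiki markup; it relies on the assumed completion above,
# which keeps only the name before any "|" sort key.
if __name__ == "__main__":
    sample = "Intro text. [[Category:Physics]] body [[Category:History|sort key]]"
    print(getCategories(sample))  # -> ['Physics', 'History']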
import collections
import logging
import math
import sys

from stopWords import StopWords


class NB:

    def __init__(self, stopWords=True):
        self.__wordCountByTermClass = collections.defaultdict(lambda: 0)  # number of times each term occurs in each class
        self.__wordCountByClass = collections.defaultdict(lambda: 0)  # sum (number of terms each class has)
        self.__numDocs = -1  # N
        self.__classes = list()  # class labels
        self.__prior = collections.defaultdict(lambda: 0)  # Nc / N
        self.__N = collections.defaultdict(lambda: 0)  # number of documents each class has
        # self.__vocabulary = list()
        self.__vocabulary = collections.defaultdict(lambda: 0)  # word -> frequency in the whole training set
        self.__totalNumberOfTokens = 0
        self.__condProb = collections.defaultdict(lambda: 0)  # condProb[class][term]
        self.__precision = collections.defaultdict(lambda: 0)
        self.__recall = collections.defaultdict(lambda: 0)
        self.__f1 = collections.defaultdict(lambda: 0)
        self.__usingStopWordsList = stopWords
        self.__stopWords = StopWords()

    def __getAllStatistics(self, data):
        logging.info("Calculating statistics...")
        for featureVector in data:
            className = featureVector.pop(0)     # remove temporarily
            categoryName = featureVector.pop(1)  # remove temporarily
            if len(featureVector) == 0:
                # Empty category found...
                logging.warning("No pages were found for category %s", categoryName)
                continue
            # Add new class to the set of classes, if necessary
            if className not in self.__classes:
                self.__classes.append(className)
                self.__wordCountByTermClass[className] = collections.defaultdict(lambda: 0)
            # Increment the number of documents of this class
            self.__N[className] += 1
            for (value, freq) in featureVector.items():
                if self.__usingStopWordsList and self.__stopWords.isStopWord(value):
                    continue
                freq = int(freq)
                self.__wordCountByTermClass[className][value] += freq
                self.__wordCountByClass[className] += freq
                # if value not in self.__vocabulary:
                #     self.__vocabulary.append(value)
                self.__vocabulary[value] += freq
                self.__totalNumberOfTokens += freq
            featureVector[0] = className
            featureVector[1] = categoryName
        # Remove empty elements
        data = [elem for elem in data if len(elem) > 0]
        # Get the number of valid documents found
        self.__numDocs = len(data)
        if self.__numDocs == 0:
            logging.error("Empty training set! Aborting...")
            print("ERROR - empty training set!")
            sys.exit(1)

    def trainClassifier(self, data):
        logging.info("Training NB...")
        self.__getAllStatistics(data)
        logging.info("Calculated statistics. Using %d documents.", self.__numDocs)
        for c in self.__classes:
            self.__prior[c] = self.__N[c] / self.__numDocs
            # Laplace-smoothed probability for terms never seen with this class;
            # bind the value now so the factory does not capture the loop variable.
            unseenProb = 1 / (self.__wordCountByClass[c] + len(self.__vocabulary))
            self.__condProb[c] = collections.defaultdict(lambda p=unseenProb: p)
            logging.debug("Class %s - Prior %f", c, self.__prior[c])
            for v in self.__vocabulary.keys():
                self.__condProb[c][v] = (self.__wordCountByTermClass[c][v] + 1) / (self.__wordCountByClass[c] + len(self.__vocabulary))
                logging.debug("Class %s - Word %s Conditional Probability --> %f", c, v, self.__condProb[c][v])
        # logging.info("Most common words:")
        # for (w, k) in sorted(self.__vocabulary.items(), key=operator.itemgetter(1), reverse=True):
        #     if k / self.__totalNumberOfTokens > 0.005:
        #         logging.info("(more than 1%% of all tokens) --- Item %s - frequency %d - %f of tokens", w, k, k / self.__totalNumberOfTokens)
        # sys.exit(0)

    def testInBatch(self, instances):
        logging.info("Testing in batch mode...")
        correctMap = collections.defaultdict(lambda: 0)
        numberOfInstances = collections.defaultdict(lambda: 0)
        predictedMap = collections.defaultdict(lambda: 0)
        for instance in instances:
            groundTruth = instance[0]
            categoryName = instance[1]
            predicted = self.testClassifier(instance)
            numberOfInstances[groundTruth] += 1
            predictedMap[predicted] += 1
            if predicted == groundTruth:
                correctMap[groundTruth] += 1
        logging.info("Result details by class...")
        for c in self.__classes:
            print("------- For Class", c, end=" ")
            logging.info("----- For class %s", c)
            try:
                self.__precision[c] = correctMap[c] / predictedMap[c]
            except ZeroDivisionError:
                self.__precision[c] = 0.0
                logging.warning("predictedMap[%s] = 0.0. This class was never predicted!", c)
            print("precision =", self.__precision[c], end=" ")
            logging.info("-- Precision %f", self.__precision[c])
            try:
                self.__recall[c] = correctMap[c] / numberOfInstances[c]
            except ZeroDivisionError:
                self.__recall[c] = 0.0
                logging.warning("Recall[%s] = 0.0 --- numberOfInstances of this class is zero!", c)
            print("recall =", self.__recall[c])
            logging.info("-- Recall %f", self.__recall[c])
            try:
                self.__f1[c] = (2 * self.__precision[c] * self.__recall[c]) / (self.__precision[c] + self.__recall[c])
            except ZeroDivisionError:
                self.__f1[c] = 0.0
                logging.warning("F1[%s] = 0.0 --- precision + recall = 0!", c)
            print("F1 =", self.__f1[c])
            logging.info("-- F1 %f", self.__f1[c])

    def testClassifier(self, instance):
        score = collections.defaultdict(lambda: 0)
        classInformation = instance.pop(0)  # remove class information temporarily
        categoryName = instance.pop(1)      # remove category name temporarily
        # logging.debug("Category %s", categoryName)
        # logging.debug(instance)
        for c in self.__classes:
            score[c] = math.log(self.__prior[c])
            for (word, freq) in instance.items():
                if self.__usingStopWordsList and self.__stopWords.isStopWord(word):
                    continue
                score[c] += int(freq) * math.log(self.__condProb[c][word])
            logging.debug("Predicted score for class %s => %f", c, score[c])
        instance[0] = classInformation
        instance[1] = categoryName
        predicted = max(score, key=score.get)
        logging.info("Tested instance '%s' --- class predicted %s --- ground truth %s", categoryName, predicted, classInformation)
        return predicted
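
# A minimal, hypothetical round trip with the NB class above. Feature
# vectors are dicts keyed by 0 (class label), 1 (category name), and
# term -> frequency (as strings) otherwise, matching what
# __getAllStatistics() and testClassifier() expect. Labels and terms
# here are invented; stop-word filtering is disabled so the sketch does
# not depend on the contents of the StopWords list.
if __name__ == "__main__":
    train = [
        {0: "sports", 1: "Football", "goal": "3", "match": "2"},
        {0: "politics", 1: "Elections", "vote": "4", "senate": "1"},
    ]
    nb = NB(stopWords=False)
    nb.trainClassifier(train)
    test = {0: "sports", 1: "Basketball", "goal": "1", "match": "1"}
    print(nb.testClassifier(test))  # expected: "sports"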