def train(tagger: BayesClassifier, iter: int, measGen: MeasurementGenerator,
          classGen: ClassAssign, Z: int, V: int = 10,
          delta: float = .05) -> List[float]:
    # Performs 'iter' iterations of vFold testing (default 'V' is ten) with the 'tagger' classifier
    # on 'Z' samples generated by 'measGen' and 'classGen'. After each vFold validation, appends
    # the expected gain (computed with the gain matrix attached to the tagger) and then optimizes
    # the tagger by the 'delta' parameter (default .05).
    # The tagger is optimized in place; returns the list of expected gain values from each iteration.
    expectedGain = []
    for _ in range(iter):
        # Generates measurements and their true classes
        samples = measGen.genMeas(Z)
        values = classGen.assign(samples)
        # Shuffles while preserving measurement/class pairings
        samplesSh, valuesSh = shuffle(samples, values)
        # Performs the vFold test and builds a normalized confusion matrix from its results
        results = vFold(samplesSh, valuesSh, V, tagger)
        matrix = normConMatrix(genConMatrix(valuesSh, results, tagger.range))
        # Appends this iteration's expected gain
        expectedGain.append(calcExpGain(matrix, tagger.eGain))
        # Sets the class priors estimated over the whole data set.
        tagger.priorUpdate(calcClassProb(valuesSh, tagger.range))
        # Updates the tagger's conditionals
        tagger.optimize(delta, samplesSh, valuesSh)
    return expectedGain
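
# Hedged sketch of the 'calcClassProb' helper used by train above (its real
# implementation in the bayes module is not shown in these snippets); the
# assumption is that class priors are estimated as the relative frequencies of
# the observed tags over the K class values.
def calcClassProb(tags, K):
    # Relative frequency of each class value 0..K-1 in 'tags'.
    counts = [0] * K
    for tag in tags:
        counts[tag] += 1
    total = max(len(tags), 1)
    return tuple(count / total for count in counts)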
def main():
    from random import seed
    from probability import genProbs
    #### Unit test for the training function
    # 'seed' is imported so a fixed seed can be set for repeatable testing.
    for _ in range(10):
        dimen = (4, 3, 6, 5, 6)
        classValues = 5
        measures = MeasurementGenerator(dimen)
        classes = ClassAssign(dimen, classValues)
        conds = [list(genProbs(measures.range)) for _ in range(classValues)]
        egain = [[2, 0, 0, 0, 1],
                 [3, 4, 0, 2, 2],
                 [2, 2, 5, 1, 1],
                 [2, 2, 3, 4, 1],
                 [0, 1, -3, 2, 3]]
        classifier = BayesClassifier(None, conds, eGain=egain)
        # Concern: supplying similar priors may affect the results, even though vFold updates them.
        y = train(classifier, 20, measures, classes, 6000, delta=.0005)
        z = [y[i] - y[i - 1] for i in range(1, len(y))]
        print(y)
        print()
        print(z)
        # Average of the negative deltas, to check whether they are just floating-point noise.
        q = [i for i in z if i < 0]
        q = sum(q) / max(len(q), 1)
        print(q)
        print()
        # Quick smoke test of assignment on fresh measures.
        x = measures.genMeas(20)
        p = classes.assign(x)
        l = classifier.assign(x)
import numpy as np

# 'Graph' and 'BayesClassifier' are assumed to come from the project's own
# graph and bayes modules (their imports are not part of this snippet).


def build_bayes_graph(img, labels, sigma=1e2, kappa=2):
    """ Build a graph from the 4-neighborhood of pixels.
        Foreground and background are determined from labels
        (1 for foreground, 0 for background) and are modeled
        with naive Bayes classifiers. """

    m, n = img.shape[:2]

    # RGB vector version (one pixel per row), cast to float so the
    # normalization below is not truncated for integer images
    vim = img.reshape((-1, 3)).astype(float)

    # RGB for foreground and background
    foreground = img[labels == 1].reshape((-1, 3))
    background = img[labels == 0].reshape((-1, 3))
    train_data = [foreground, background]

    # train naive Bayes classifier
    bc = BayesClassifier()
    bc.train(train_data)

    # get probabilities for all pixels
    bc_labels, prob = bc.classify(vim)
    prob_fg, prob_bg = prob[0], prob[1]
    print(np.max(prob_fg), np.max(prob_bg))

    # create graph with m*n+2 nodes
    gr = Graph()
    gr.add_node(range(m * n + 2))
    source = m * n      # second to last node is the source
    sink = m * n + 1    # last node is the sink

    # normalize pixel vectors
    for i in range(vim.shape[0]):
        vim[i] = vim[i] / np.linalg.norm(vim[i])

    # go through all nodes and add edges
    for i in range(m * n):
        # print(i)  # debug output, disabled

        # add edge from source
        gr.add_edge((source, i), (prob_fg[i] / (prob_fg[i] + prob_bg[i])))
        # add edge to sink
        gr.add_edge((i, sink), (prob_bg[i] / (prob_fg[i] + prob_bg[i])))

        # add edges to neighbors
        if i % n != 0:  # left exists
            edge_wt = kappa * \
                np.exp(-1.0 * sum((vim[i] - vim[i - 1])**2) / sigma)
            gr.add_edge((i, i - 1), edge_wt)
        if (i + 1) % n != 0:  # right exists
            edge_wt = kappa * \
                np.exp(-1.0 * sum((vim[i] - vim[i + 1])**2) / sigma)
            gr.add_edge((i, i + 1), edge_wt)
        if i // n != 0:  # up exists
            edge_wt = kappa * \
                np.exp(-1.0 * sum((vim[i] - vim[i - n])**2) / sigma)
            gr.add_edge((i, i - n), edge_wt)
        if i // n != m - 1:  # down exists
            edge_wt = kappa * \
                np.exp(-1.0 * sum((vim[i] - vim[i + n])**2) / sigma)
            gr.add_edge((i, i + n), edge_wt)

    gr.build_flow(source, sink)
    return gr
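
# Minimal usage sketch for build_bayes_graph, assuming a synthetic RGB image
# and a hand-marked label map; the returned graph is meant to be fed to the
# project's own min-cut routine, which is not shown in this snippet.
if __name__ == '__main__':
    import numpy as np

    rng = np.random.default_rng(0)
    img = rng.random((30, 40, 3))                # stand-in RGB image
    labels = np.zeros(img.shape[:2], dtype=int)  # 0 = background everywhere...
    labels[5:15, 5:15] = 1                       # ...except a marked foreground patch
    gr = build_bayes_graph(img, labels, sigma=1e2, kappa=2)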
def vFold(meas: Tuple[int, ...], tags: Tuple[int, ...], V: int,
          classifier: BayesClassifier) -> Tuple[int, ...]:
    # Performs a round of V-fold validation on measurements 'meas' and their respective real classes 'tags'.
    # Runs V tests with the classifier, updating its priors from each training partition.
    # Returns the flat tuple of assigned classes for all test partitions, in fold order.
    # (Callers build and normalize the confusion matrix themselves; the matrix computed
    # below is kept only for the legacy normalized-matrix return, now commented out.)
    results = []
    # Creates folds
    measFold = partition(meas, V)
    tagsFold = partition(tags, V)
    for v in range(V):
        # Assigns testing and training partitions
        trainTags = [tag for i in range(V) if i != v for tag in tagsFold[i]]
        testMeas = measFold[v]
        # Updates the classifier with the priors of the training partition
        trainProb = calcClassProb(trainTags, classifier.range)
        classifier.priorUpdate(trainProb)
        results.append(classifier.assign(testMeas))
    # Unfolds the per-fold tuples into one flat tuple
    results = tuple(i for tpl in results for i in tpl)
    matrix = genConMatrix(tags, results, classifier.range)
    return results  # normConMatrix(matrix)
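
# Hedged sketch of the 'partition' helper used above (the real implementation
# is not shown in these snippets); the only behavior assumed here is what the
# partition test further down requires: the sequence is split into V
# consecutive folds and concatenating the folds reproduces the original order.
def partition(seq, V):
    # Split 'seq' into V consecutive folds of near-equal size, preserving order.
    base, rem = divmod(len(seq), V)
    folds, start = [], 0
    for v in range(V):
        size = base + (1 if v < rem else 0)
        folds.append(tuple(seq[start:start + size]))
        start += size
    return folds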
def main():
    from bayes import calcClassProb
    priors = (.6, .4)
    conds = [[.12 / .6, .18 / .6, .3 / .6], [.2 / .4, .16 / .4, .04 / .4]]
    gain = ((1, 0), (0, 2))

    ### Tests for biasCCP
    classifier = BayesClassifier(priors, conds, eGain=gain)
    # Classification should be 1, 1, 0. The update should alter 0|1, 1|1, and 2|0.
    newCCP = biasCCP(classifier, .05)
    # print(newCCP)  # Uncomment to see whether the CCP conforms to predictions. Should bias 2|0 and raise 0|1 and 1|1.

    ### Tests for biasMeasGenerator
    generator = MeasurementGenerator((2, 2))
    # print(generator.cmlProbs)
    prev = generator.cmlProbs[1] - generator.cmlProbs[0]
    # Give some conditionals that bias one measure
    conds = ((.1, .7, .1, .1), (.25, .25, .25, .25))
    # and feed biased priors
    priors = (.7, .3)
    biasMeasGenerator(generator, priors, conds)
    # print(generator.cmlProbs)
    now = generator.cmlProbs[1] - generator.cmlProbs[0]
    # Should now show bias towards the second value.
    assert now > prev

    generator = MeasurementGenerator((2, 2))
    # print(generator.cmlProbs)
    prev1 = generator.cmlProbs[1] - generator.cmlProbs[0]
    prev2 = generator.cmlProbs[-1] - generator.cmlProbs[-2]
    # Give some conditionals that bias the second and last measures this time
    conds = ((.1, .5, .1, .3), (.25, .25, .25, .25))
    # and feed biased priors
    priors = (.7, .3)
    biasMeasGenerator(generator, priors, conds)
    # print(generator.cmlProbs)
    now1 = generator.cmlProbs[1] - generator.cmlProbs[0]
    now2 = generator.cmlProbs[-1] - generator.cmlProbs[-2]
    # Should now show bias towards both biased values.
    assert now1 > prev1, (prev1, now1)
    assert now2 > prev2, (prev2, now2)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("data",
                        help="File of measure-class pairs to test.",
                        type=str)
    parser.add_argument("dimen",
                        help="Tuple representing the measure space.",
                        type=str)
    parser.add_argument(
        "priors",
        help="File designating prior probabilities of classes.",
        type=str)
    parser.add_argument(
        "conditionals",
        help="File designating class conditional probabilities.",
        type=str)
    parser.add_argument(
        "--eGain",
        "-e",
        help="Economic gain matrix for the data. If not provided, the identity matrix is assumed.",
        type=str)
    parser.add_argument(
        "--vFolds",
        "-v",
        help="Number of v-fold partitions for testing. If not provided, all data is used for testing.",
        type=int)
    args = parser.parse_args()

    # Reading data
    dimen = eval(args.dimen)
    measures, tags = reader.readData(args.data, dimen)
    priors = reader.readPriors(args.priors)
    conds = reader.readCCP(args.conditionals)
    e = False
    if args.eGain:
        e = reader.readGain(args.eGain)
    classifier = BayesClassifier(priors, conds, eGain=e)
    expGain = test(classifier, measures, tags, V=args.vFolds)
    print("The expected gain for the data is: {}".format(expGain))
def main():
    #### Testing shuffle
    # Checks that shuffle does not return the same ordering.
    sampleSize = random.randint(1, 1000)
    # Samples unique value pairs. (Makes them easier to track; misassignment of identical values would not alter the data.)
    meas = tuple(random.sample(range(1, 10 * sampleSize), sampleSize))
    tags = tuple(random.sample(range(1, 10 * sampleSize), sampleSize))
    measSh, tagsSh = shuffle(meas, tags)
    # May rarely fail for very small samples where the shuffle happens to return the original order.
    assert (measSh, tagsSh) != (meas, tags)
    # Checks that the original pairs are still present
    for i in range(sampleSize):
        # Finds where the measurement was shuffled to
        indexSh = measSh.index(meas[i])
        # Checks that this new index pairs with the original tag value.
        assert tags[i] == tagsSh[indexSh]

    ### Testing vFold
    # Generates a trivial case to make sure all parts are working.
    # This should have perfect accuracy.
    meas = (0, 1, 2, 2, 1, 2, 2, 1, 1, 0, 2, 2)
    tags = (2, 1, 0, 0, 1, 0, 0, 1, 1, 2, 0, 0)
    cp = (.5, .2, .3)
    ccp = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
    tagger = BayesClassifier(cp, ccp)
    test = vFold(meas, tags, 5, tagger)
    print(test)

    ### Testing partition
    # Partition should retain order
    for i in range(3, 1000):
        l = [_ for _ in range(random.randint(3, 1000))]
        k = partition(l, 3)
        m = tuple(j for i in k for j in i)
        assert tuple(l) == m, m
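
# Hedged sketch of the 'shuffle' helper exercised above (the real
# implementation is not shown); the tests only require that the joint order
# changes while each measurement keeps its original tag.
import random


def shuffle(meas, tags):
    # Shuffle the paired sequences together, preserving measurement/tag pairings.
    order = list(range(len(meas)))
    random.shuffle(order)
    return (tuple(meas[i] for i in order), tuple(tags[i] for i in order))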
def test(classifier: BayesClassifier, measures: Tuple[int, ...],
         tags: Tuple[int, ...], V=0) -> float:
    # Testing protocol for measurements. Shuffles measures and tags while retaining
    # pairings and then classifies with the Bayes classifier. Returns the expected gain
    # of the testing. If V is provided, performs V-fold testing on the data with V partitions.
    K = classifier.range
    e = classifier.eGain
    # Shuffles tags and measures
    measures, tags = shuffle(measures, tags)
    # Sees if we are v-folding or just testing over the shuffled measures directly.
    if V:
        results = vFold(measures, tags, V, classifier)
    else:
        results = classifier.assign(measures)
    matrix = genConMatrix(tags, results, K)
    normMatrix = normConMatrix(matrix)
    return calcExpGain(normMatrix, e)
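
# Hedged sketches of the confusion-matrix helpers referenced throughout these
# snippets ('genConMatrix', 'normConMatrix', and 'calcExpGain' are not shown
# here). Assumptions: rows index the true class, columns the assigned class,
# the normalized matrix holds joint frequencies that sum to one, and the
# expected gain is the element-wise product of that matrix with the economic
# gain matrix; the real conventions may differ.
def genConMatrix(tags, results, K):
    # Count how often true class i was assigned class j.
    matrix = [[0] * K for _ in range(K)]
    for true, assigned in zip(tags, results):
        matrix[true][assigned] += 1
    return matrix


def normConMatrix(matrix):
    # Normalize the counts by the total number of samples.
    total = sum(sum(row) for row in matrix)
    return [[cell / max(total, 1) for cell in row] for row in matrix]


def calcExpGain(normMatrix, eGain):
    # Expected gain: joint frequency times economic gain, summed over all cells.
    return sum(normMatrix[i][j] * eGain[i][j]
               for i in range(len(normMatrix))
               for j in range(len(normMatrix[i])))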
def biasCCP(classifier: BayesClassifier, delta: float) -> List[List[float]]:
    # Destructively updates the CCP values in the classifier. Classes are generated from the classifier
    # for all measurements d in the measurement space M. Each P(d|c) is increased by delta towards the
    # classifier's selection, and each class row is re-normalized over the measurement space.
    # Returns the conditional matrix. No longer used in the training function but preserved
    # as a legacy update.
    # Classifies for all d in measurement space M
    M = classifier.spaceSize
    K = classifier.range
    bayesValues = classifier.assign(range(M))
    conds = classifier.cond
    # Note: this alters the original classifier. Done for speed and memory efficiency.
    # Updates conditionals
    for measure in range(M):
        tag = bayesValues[measure]
        conds[tag][measure] += delta
    # Normalizes each class row of the conditionals
    for val in range(K):
        sums = sum(conds[val])
        conds[val] = [prb / sums for prb in conds[val]]
    return conds
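
# A small usage sketch for biasCCP (the priors, conditionals, and check below
# are hypothetical, not part of the original tests): after the update, each
# class row of the returned conditional matrix should still sum to one.
if __name__ == '__main__':
    classifier = BayesClassifier((.6, .4),
                                 [[.2, .3, .5], [.5, .4, .1]],
                                 eGain=((1, 0), (0, 2)))
    newCCP = biasCCP(classifier, .05)
    assert all(abs(sum(row) - 1.0) < 1e-9 for row in newCCP)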
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("samples",
                        help="Number of measurement samples to generate.",
                        type=int)
    parser.add_argument("dimen", help="Measurement space.", type=str)
    parser.add_argument("classes", help="Number of classes.", type=int)
    parser.add_argument("seed",
                        help="Random seed for experiment duplication.",
                        type=int)
    parser.add_argument(
        "--vfolds",
        "-v",
        default=10,
        help="Number of v-fold partitions of the testing data for v-fold testing. Default is 10.",
        type=int)
    parser.add_argument(
        "--optimization",
        "-o",
        default=0.0,
        help="Specify if iterative improvement of the class conditional probability values should be taken.",
        type=float)
    parser.add_argument("--iteration",
                        "-t",
                        default=10,
                        help="Number of iterations for the conditional update.",
                        type=int)
    parser.add_argument(
        "--identity",
        "-i",
        action="store_true",
        default=False,
        help="Specify if the economic gain matrix should be the identity.")
    args = parser.parse_args()

    # Prompts for reader friendliness
    print("Generating testing data for seed {}".format(args.seed))
    # Sets seed
    seed(args.seed)
    # Assigns values
    dimen = eval(args.dimen)
    # Calculates size of the measurement domain
    M = 1
    for N in dimen:
        M *= N
    K = args.classes
    V = args.vfolds
    Z = args.samples
    print("Dimensions of Measurement Space: {}".format(dimen))
    print("Number of Samples: {}".format(Z))
    print("Classes: {}".format(K))
    # Checks that the given limits are even feasible memory-wise.
    if config.computeLimit(M, K):
        print("Possible measurements exceed memory capabilities.")
        sys.exit()

    print("Generating {0}x{0} Gain Matrix. Identity Matrix: {1}".format(
        K, args.identity))
    gain = genGain(K, identity=args.identity)
    print("{}x{} Economic Gain Matrix Generated".format(
        len(gain), len(gain[0])))

    # Generates measures
    print("Generating {} Measure-Value pairs.".format(Z))
    print("Generating measures.")
    generator = MeasurementGenerator(dimen)
    measures = generator.genMeas(Z)
    assigner = ClassAssign(dimen, K)
    tags = assigner.assign(measures)
    print("{} measures and {} values generated.".format(
        len(measures), len(tags)))

    ## Generates classifier.
    print(
        "Generating class conditional probabilities for {} classes and {} possible measures."
        .format(K, M))
    conditionals = genCCP(K, dimen)
    print(
        "Class conditional probabilities generated for {} classes and {} possible measures"
        .format(len(conditionals), len(conditionals[0])))
    classifier = BayesClassifier(
        None, conditionals,
        eGain=gain)  # No priors given since vFold always assigns them.

    print("Testing classifier. V-fold factor: {}".format(V))
    measures, tags = shuffle(measures, tags)
    results = vFold(measures, tags, V, classifier)
    matrix = genConMatrix(tags, results, K)
    norm = normConMatrix(matrix)
    expGain = calcExpGain(norm, classifier.eGain)
    #expGain = test(classifier, measures, tags, V=V)
    print("The expected gain for the given data is: {}".format(expGain))

    #### Here we will work on updating
    if args.optimization:
        print(
            "Fitting data for improved performance. Improvement factor {} used over {} iterations."
            .format(args.optimization, args.iteration))
        gains = []
        # Sets the priors generated from this measurement set as the permanent priors.
        priors = calcClassProb(tags, K)
        classifier.priorUpdate(priors)
        for i in range(args.iteration):
            # print(priors)
            classifier.optimize(args.optimization, measures, tags)
            classifier, measures, tags = fitData(classifier, generator, Z,
                                                 args.optimization)
            measures, tags = shuffle(measures, tags)
            results = vFold(measures, tags, V, classifier)
            matrix = genConMatrix(tags, results, K)
            norm = normConMatrix(matrix)
            expGain = calcExpGain(norm, classifier.eGain)
            #expGain = test(classifier, measures, tags, V=V)
            gains.append(expGain)
            print("Expected Gain from iteration {} is {}".format(
                i + 1, expGain))
        print("The expected gain for fitted data after {} iterations is: {}".
              format(args.iteration, gains[-1]))

    # Writes all data to files
    print("Writing to file.")
    reader.writeData(measures, tags, dimen)
    reader.writePriors(classifier.prior)
    reader.writeGain(gain)
    reader.writeCCP(classifier.cond)
    print("Done.")
import readline, sys, pandas, numpy as np

from bayes import BayesClassifier
from data import FEATURES

if len(sys.argv) != 2:
    print("Invalid usage. Try ppd <filename>.")
    sys.exit(1)

# load the data
FNAME = str(sys.argv[1])
df = pandas.read_csv(FNAME)
df = df.dropna()

# load the classifier
model = BayesClassifier()

# classify the file
predictions = model.model.predict(df[FEATURES].values)
prediction_vals = model.model.predict_proba(df[FEATURES].values)
rev_zone_dic = {v: k for k, v in model.zone_dic.items()}

# scale the class probabilities to percentages
prediction_vals = np.array(
    list(map(lambda pv: list(map(lambda v: v * 100, pv)), prediction_vals)))

output = np.column_stack((df['ID'].values, prediction_vals))
df = pandas.DataFrame(output)
df.columns = ["ID"] + [v for k, v in rev_zone_dic.items()]
df.to_csv("output.csv")
print("Results in ./output.csv")
def tokenize(line):
    # Header reconstructed for this fragment (assumption): the original snippet
    # begins mid-function with 'line_split' already defined from the raw line.
    line_split = line.lower().split()
    msg_type = line_split[0]
    words = re.findall("[a-z0-9']+", " ".join(line_split[1:]))
    return msg_type, set(words)


if __name__ == '__main__':
    with open('data/data.txt', 'r') as f:
        messages = []
        for line in f:
            msg_type, words = tokenize(line)
            messages.append({'msg_type': msg_type, 'words': words})

    # 75/25 train/test split
    training_set = messages[:int(len(messages) * 0.75)]
    testing_set = messages[int(len(messages) * 0.75):]

    bayes = BayesClassifier()
    bayes.train(training_set)
    classified = bayes.classify(testing_set)

    true_positive = len([
        1 for message in classified
        if message['msg_type'] == 'spam' and message['prob_spam'] > 0.5
    ])
    false_positive = len([
        1 for message in classified
        if message['msg_type'] == 'ham' and message['prob_spam'] > 0.5
    ])
    true_negative = len([
        1 for message in classified
        if message['msg_type'] == 'ham' and message['prob_spam'] <= 0.5
    ])
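
    # Hedged continuation sketch (not part of the original fragment, which ends
    # at true_negative): the false-negative count and summary metrics below
    # follow the same pattern as the counts above.
    false_negative = len([
        1 for message in classified
        if message['msg_type'] == 'spam' and message['prob_spam'] <= 0.5
    ])
    total = max(len(classified), 1)
    accuracy = (true_positive + true_negative) / total
    precision = true_positive / max(true_positive + false_positive, 1)
    recall = true_positive / max(true_positive + false_negative, 1)
    print("accuracy: {:.3f}, precision: {:.3f}, recall: {:.3f}".format(
        accuracy, precision, recall))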
import readline

from bayes import BayesClassifier


def _rev_lookup(val, mydict):
    # Reverse lookup: return the key in 'mydict' whose value is 'val'.
    return list(mydict.keys())[list(mydict.values()).index(val)]


while True:
    FEATURES = ["d15N", "d13C", "d2H", "d18O"]
    model = BayesClassifier()
    model_input = []
    for f in FEATURES:
        model_input.append(float(input("{f}: ".format(f=f))))
    y_pred, y_pred_prob = model.predict(model_input)
    print("Model predicts {y_pred} with {conf}% confidence.".format(
        y_pred=_rev_lookup(y_pred, model.zone_dic),
        conf=round(y_pred_prob[0][y_pred][0] * 100, 4)))
    print(model.zone_dic)
    print([ypp * 100 for ypp in y_pred_prob])
    except:
        description = news['summary']
    description = re.sub('<.*>', '', description)
    slug = slugify(title)
    collection_obj = CollectionMapping(tablename)
    collection_obj.load_json({
        'site': site,
        'slug': slug,
        'name': title,
        'description': description,
        'link': link,
    })


if __name__ == "__main__":
    # delete old news
    CollectionMapping('news_news').delete_all()
    CollectionMapping('news_category').delete_all()

    # fetch news feed
    for tablename, url_dict in news_dict.items():
        for site, url in url_dict.items():
            parse_data(tablename, site, url)

    # add training data
    category_set = TrainClassifier.train_classifier(news_training_dict)
    category_dict = dict([(slugify(category), 1)
                          for category in news_training_dict.keys()])

    # classify each document
    for news in CollectionMapping('news_news').objects.all():
        bayes_obj = BayesClassifier(category_set)
        # returns each obj with category_list attribute ordered according to their score
        obj = bayes_obj.find_posterior("%s %s" % (news.name, news.description))
        news.update(category_list=map(lambda category: category[0],
                                      obj.category_list))
        for category, score in obj.category_list:
            category_dict[category] += 1

    category_obj = CollectionMapping('news_category')
    category_obj.save(type="education", category=category_dict)
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import sys
import os

from preprocess import Preprocessor
from features import FeatureSelector
from bayes import BayesClassifier

if __name__ == '__main__':
    train_file = sys.argv[1]
    test_file = sys.argv[2]

    pr = Preprocessor()
    pr.build_vocabulary_and_categories(train_file)

    fs = FeatureSelector(train_file, ck=500)
    fs.select_features()

    bc = BayesClassifier(train_file, test_file, model='bernoulli')
    bc.train()
    bc.test()
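
# Example invocation of this script (the script file name and corpus file
# names below are hypothetical; the script simply takes a training file and a
# test file as positional arguments):
#
#   python classify.py train_corpus.txt test_corpus.txt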