Example 1
def train(tagger: BayesClassifier,
          iter: int,
          measGen: MeasurementGenerator,
          classGen: ClassAssign,
          Z: int,
          V: int = 10,
          delta: float = .05) -> List[float]:
    # Performs 'iter' iterations of vFold testing (default 'V' is ten) with the 'tagger' classifier
    # on 'Z' samples generated by 'measGen' and 'classGen'. After each vFold validation, appends the
    # expected gain and then optimizes the tagger with the 'delta' parameter (default .05).
    # Mutates 'tagger' in place and returns the list of expected gain values, one per iteration.
    expectedGain = []

    for _ in range(iter):
        # Generates measurements
        samples = measGen.genMeas(Z)
        values = classGen.assign(samples)

        # Shuffles values
        samplesSh, valuesSh = shuffle(samples, values)

        # Performs the vFold test and builds the normalized confusion matrix.
        results = vFold(samplesSh, valuesSh, V, tagger)
        matrix = normConMatrix(genConMatrix(valuesSh, results, tagger.range))
        # Appends the expected gain for this iteration.
        expectedGain.append(calcExpGain(matrix, tagger.eGain))

        # Gives class probability over whole data set.
        tagger.priorUpdate(calcClassProb(valuesSh, tagger.range))

        # Updates tagger
        tagger.optimize(delta, samplesSh, valuesSh)

    return expectedGain
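The pairing-preserving 'shuffle' helper used by train (and exercised again in Example 7) is not included in these listings. A minimal sketch of what the code appears to assume; the project's actual implementation may differ:

from random import sample
from typing import Sequence, Tuple


def shuffle(meas: Sequence, tags: Sequence) -> Tuple[tuple, tuple]:
    # Draws one random permutation of the indices and applies it to both
    # sequences, so every measurement stays paired with its original tag.
    order = sample(range(len(meas)), len(meas))
    return tuple(meas[i] for i in order), tuple(tags[i] for i in order)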
Example 2
def main():
    from random import seed
    from probability import genProbs
    #### Unit test for the training function
    # Initializes the seed so repeated runs are reproducible.
    seed(0)  # the specific seed value is arbitrary
    for _ in range(10):
        dimen = (4, 3, 6, 5, 6)
        classValues = 5

        measures = MeasurementGenerator(dimen)
        classes = ClassAssign(dimen, classValues)

        conds = [list(genProbs(measures.range)) for _ in range(classValues)]
        egain = [[2, 0, 0, 0, 1], [3, 4, 0, 2, 2], [2, 2, 5, 1, 1],
                 [2, 2, 3, 4, 1], [0, 1, -3, 2, 3]]
        classifier = BayesClassifier(
            None, conds, eGain=egain
        )  # Concern: supplying similar priors could skew results, even though vFold reassigns them.
        y = train(classifier, 20, measures, classes, 6000, delta=.0005)
        z = [y[i] - y[i - 1] for i in range(1, len(y))]
        # Computes the average negative step between iterations to check whether dips are just floating-point noise.
        print(y)
        print()
        print(z)
        q = [i for i in z if i < 0]
        q = sum(q) / max(len(q), 1)
        print(q)
        print()
    x = measures.genMeas(20)

    p = classes.assign(x)
    l = classifier.assign(x)
Example 3
def build_bayes_graph(img, labels, sigma=1e2, kappa=2):
    """ Build a graph from 4-neighborhood of pixels.
        Foreground and background is determined from
        labels (1 for foreground, 0 for background)
        and is modeled with naive Bayes classifiers."""
    m, n = img.shape[:2]
    # RGB vector version (one pixel per row)
    vim = img.reshape((-1, 3))
    # RGB for foreground and background
    foreground = img[labels == 1].reshape((-1, 3))
    background = img[labels == 0].reshape((-1, 3))
    train_data = [foreground, background]
    # train naive Bayes classifier
    bc = BayesClassifier()
    bc.train(train_data)
    # get probabilities for all pixels
    bc_labels, prob = bc.classify(vim)
    prob_fg, prob_bg = prob[0], prob[1]
    print(np.amax(prob_fg), np.max(prob_bg))
    # create graph with m*n+2 nodes
    gr = Graph()
    gr.add_node(range(m * n + 2))
    source = m * n  # second to last is source
    sink = m * n + 1  # last node is sink
    # normalize
    for i in range(vim.shape[0]):
        vim[i] = vim[i] / np.linalg.norm(vim[i])
    # go through all nodes and add edges
    for i in range(m * n):
        print(i)
        # add edge from source
        gr.add_edge((source, i), (prob_fg[i] / (prob_fg[i] + prob_bg[i])))
        # add edge to sink
        gr.add_edge((i, sink), (prob_bg[i] / (prob_fg[i] + prob_bg[i])))
        # add edges to neighbors
        if i % n != 0:  # left exists
            edge_wt = kappa * \
                np.exp(-1.0 * sum((vim[i] - vim[i - 1])**2) / sigma)
            gr.add_edge((i, i - 1), edge_wt)
        if (i + 1) % n != 0:  # right exists
            edge_wt = kappa * \
                np.exp(-1.0 * sum((vim[i] - vim[i + 1])**2) / sigma)
            gr.add_edge((i, i + 1), edge_wt)
        if i // n != 0:  # up exists
            edge_wt = kappa * \
                np.exp(-1.0 * sum((vim[i] - vim[i - n])**2) / sigma)
            gr.add_edge((i, i - n), edge_wt)
        if i // n != m - 1:  # down exists
            edge_wt = kappa * \
                np.exp(-1.0 * sum((vim[i] - vim[i + n])**2) / sigma)
            gr.add_edge((i, i + n), edge_wt)
    gr.build_flow(source, sink)
    return gr
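A hedged usage sketch for build_bayes_graph: the image, its dtype, and the seed rectangle below are illustrative assumptions, not part of the original code. The image should be float-valued, since the pixel vectors are normalized in place.

import numpy as np

# Stand-in float RGB image (m x n x 3) and a label map of the same height/width.
img = np.random.rand(30, 40, 3)
labels = np.zeros(img.shape[:2], dtype=int)   # 0 = background (per the docstring)
labels[10:20, 15:30] = 1                      # mark a rectangle as foreground seed

gr = build_bayes_graph(img, labels, sigma=1e2, kappa=2)  # returns the flow graph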
Example 4
def vFold(meas: Tuple[int, ...], tags: Tuple[int, ...], V: int,
          classifier: BayesClassifier) -> Tuple[int, ...]:
    # Performs a round of V-fold validation on measurements 'meas' and their real classes 'tags',
    # running V tests with 'classifier'. Returns the flat tuple of assigned classes across all
    # folds; callers build the confusion matrix from it.
    results = []

    measFold = partition(meas, V)
    tagsFold = partition(tags, V)

    for v in range(V):
        # Assigns fold 'v' for testing and the remaining folds for training.
        trainTags = [tag for i in range(V) if i != v for tag in tagsFold[i]]
        testMeas = measFold[v]

        # Updates with new probability values
        trainProb = calcClassProb(trainTags, classifier.range)
        classifier.priorUpdate(trainProb)

        results.append(classifier.assign(testMeas))
    # Flattens the per-fold result tuples into one tuple of assignments.
    results = tuple(i for tpl in results for i in tpl)
    return results  # Callers compute the confusion matrix, e.g. normConMatrix(genConMatrix(tags, results, classifier.range)).
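'partition' and 'calcClassProb' are further helpers these listings call but never define. Minimal sketches consistent with how they are used here (order-preserving folds per the partition test in Example 7, empirical class priors in 'train'); they are assumptions, not the project's own code:

from typing import List, Sequence


def partition(data: Sequence, V: int) -> List[tuple]:
    # Splits 'data' into V contiguous folds while preserving order; the first
    # len(data) % V folds receive one extra element.
    size, extra = divmod(len(data), V)
    folds, start = [], 0
    for v in range(V):
        end = start + size + (1 if v < extra else 0)
        folds.append(tuple(data[start:end]))
        start = end
    return folds


def calcClassProb(tags: Sequence[int], K: int) -> List[float]:
    # Empirical prior: the fraction of samples observed for each of the K classes.
    counts = [0] * K
    for tag in tags:
        counts[tag] += 1
    return [count / len(tags) for count in counts]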
Example 5
def main():
    from bayes import calcClassProb

    priors = (.6, .4)
    conds = [[.12 / .6, .18 / .6, .3 / .6], [.2 / .4, .16 / .4, .04 / .4]]
    gain = ((1, 0), (0, 2))

    ### Tests for biasCCP
    classifier = BayesClassifier(priors, conds, eGain=gain)
    # Classification should be 1,1,0. The update should alter 0|1, 1|1, and 2|0.
    newCCP = biasCCP(classifier, .05)
    # print(newCCP) # Uncomment to check that the ccp conforms to predictions: should bias 2|0 and raise 0|1 and 1|1.

    ### Tests for biasMeasGenerator
    generator = MeasurementGenerator((2, 2))
    # print(generator.cmlProbs)
    prev = generator.cmlProbs[1] - generator.cmlProbs[0]

    # Let's give some conditionals that bias one measure
    conds = ((.1, .7, .1, .1), (.25, .25, .25, .25))
    # And feed biased priors
    priors = (.7, .3)

    biasMeasGenerator(generator, priors, conds)
    # print(generator.cmlProbs)
    now = generator.cmlProbs[1] - generator.cmlProbs[0]
    # Should show bias towards second value now.
    assert now > prev

    generator = MeasurementGenerator((2, 2))
    # print(generator.cmlProbs)
    prev1 = generator.cmlProbs[1] - generator.cmlProbs[0]
    prev2 = generator.cmlProbs[-1] - generator.cmlProbs[-2]

    # Let's give some conditionals that bias one measure
    conds = ((.1, .5, .1, .3), (.25, .25, .25, .25))
    # And feed biased priors
    priors = (.7, .3)

    biasMeasGenerator(generator, priors, conds)
    # print(generator.cmlProbs)
    now1 = generator.cmlProbs[1] - generator.cmlProbs[0]
    now2 = generator.cmlProbs[-1] - generator.cmlProbs[-2]

    # Should show bias towards second value now.
    assert now1 > prev1, (prev1, now1)
    assert now2 > prev2, (prev2, now2)
Example 6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("data",
                        help="File of measure class pairs to test.",
                        type=str)
    parser.add_argument("dimen",
                        help="Tuple representing measure space.",
                        type=str)
    parser.add_argument(
        "priors",
        help="File designating prior probabilities of classes.",
        type=str)
    parser.add_argument(
        "conditionals",
        help="File designating class conditional probabilities.",
        type=str)
    parser.add_argument(
        "--eGain",
        "-e",
        help=
        "Economic gain matrix for data. If not provided assumes identity matrix.",
        type=str)
    parser.add_argument(
        "--vFolds",
        "-v",
        help=
        "Number of v-fold partitions for testing. If not provided, assumes all data is for testing.",
        type=int)
    args = parser.parse_args()

    # Reading data
    dimen = eval(args.dimen)
    measures, tags = reader.readData(args.data, dimen)
    priors = reader.readPriors(args.priors)
    conds = reader.readCCP(args.conditionals)
    e = False
    if args.eGain:
        e = reader.readGain(args.eGain)

    classifier = BayesClassifier(priors, conds, eGain=e)

    expGain = test(classifier, measures, tags, V=args.vFolds)

    print("The expected gain for the data is: {}".format(expGain))
Example 7
def main():
    #### Testing Shuffle
    # Checks that shuffle does not return same values.
    sampleSize = random.randint(10, 1000)  # at least 10, so a random shuffle is vanishingly unlikely to equal the original

    # Samples unique value pairs (easier to track: with duplicate values, a misassignment would go undetected).
    meas = tuple(random.sample(range(1, 10 * sampleSize), sampleSize))
    tags = tuple(random.sample(range(1, 10 * sampleSize), sampleSize))

    measSh, tagsSh = shuffle(meas, tags)
    assert (measSh, tagsSh) != (meas, tags)

    # Checks that original pairs are present
    for i in range(sampleSize):

        # Finds where the measurement was shuffled to
        indexSh = measSh.index(meas[i])

        # Checks that this new index pairs with original tag value.
        assert tags[i] == tagsSh[indexSh]


    ### Testing vFold
    # Generates trivial matrix to make sure all parts are working.
    # This should have perfect accuracy.
    meas = (0, 1, 2, 2, 1, 2, 2, 1, 1, 0, 2, 2)
    tags = (2, 1, 0, 0, 1, 0, 0, 1, 1, 2, 0, 0)
    cp = (.5, .2, .3)
    ccp = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
    tagger = BayesClassifier(cp, ccp)

    test = vFold(meas, tags, 5, tagger)
    print(test)

    ### Testing partition
    ### Partition should retain order
    for i in range(3, 1000):
        l = [_ for _ in range(random.randint(3, 1000))]
        k = partition(l, 3)
        m = tuple(j for i in k for j in i)
        assert tuple(l) == m, m
Example 8
def test(classifier: BayesClassifier,
         measures: Tuple[int, ...],
         tags: Tuple[int, ...],
         V=0) -> float:
    # Testing protocol for measurements. Shuffles measures and tags while retaining
    # pairings, then classifies with the Bayes classifier. Returns the expected gain
    # of the test. If V is provided, performs V-fold testing over V partitions.

    # Shuffles tags and measures
    K = classifier.range
    e = classifier.eGain
    measures, tags = shuffle(measures, tags)

    # Determines whether we are v-folding or just testing the shuffled data directly.
    if V:
        results = vFold(measures, tags, V, classifier)
        normMatrix = normConMatrix(genConMatrix(tags, results, K))
    else:
        results = classifier.assign(measures)
        matrix = genConMatrix(tags, results, K)
        normMatrix = normConMatrix(matrix)
    return calcExpGain(normMatrix, e)
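The confusion-matrix helpers that 'test' relies on are likewise not shown. Sketches inferred from the call sites in these examples (genConMatrix counting (true, assigned) pairs, normConMatrix converting counts to joint probabilities, calcExpGain weighting them by the economic gain matrix); treat them as assumptions rather than the original implementations:

from typing import List, Sequence


def genConMatrix(tags: Sequence[int], results: Sequence[int], K: int) -> List[List[int]]:
    # matrix[true][assigned] counts how often true class 'true' received class 'assigned'.
    matrix = [[0] * K for _ in range(K)]
    for true, assigned in zip(tags, results):
        matrix[true][assigned] += 1
    return matrix


def normConMatrix(matrix: List[List[int]]) -> List[List[float]]:
    # Normalizes counts into joint probabilities over the whole test set.
    total = sum(sum(row) for row in matrix)
    return [[count / total for count in row] for row in matrix]


def calcExpGain(normMatrix: List[List[float]], eGain) -> float:
    # Expected economic gain: sum of P(true, assigned) * gain(true, assigned).
    return sum(normMatrix[i][j] * eGain[i][j]
               for i in range(len(normMatrix))
               for j in range(len(normMatrix[i])))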
Example 9
def biasCCP(classifier: BayesClassifier, delta: float) -> List[List[float]]:
    # Destructively updates the CCP values in 'classifier'. Classes are generated from the classifier
    # for every measurement d in the measurement space M. Each P(d|c) for the selected class c is
    # increased by 'delta', and each class's conditionals are then renormalized over the measurement
    # space. Returns the conditional matrix; the return value is unused but preserved for the legacy update.

    # Classifies for all d in measurement space M
    M = classifier.spaceSize
    K = classifier.range
    bayesValues = classifier.assign(range(M))
    conds = classifier.cond  # Note: this aliases the classifier's conditionals, so the update below mutates the original classifier (done for speed and memory efficiency).

    # Updates conditionals
    for measure in range(M):
        tag = bayesValues[measure]
        conds[tag][measure] += delta

    # Normalizes conditionals
    for val in range(K):
        sums = sum(conds[val])
        conds[val] = [prb / sums for prb in conds[val]]

    return conds
Example 10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("samples",
                        help="Number of measurement samples to generate.",
                        type=int)
    parser.add_argument("dimen", help="Measurement space.", type=str)
    parser.add_argument("classes", help="Number of classes.", type=int)
    parser.add_argument("seed",
                        help="Random seed for experiement duplication.",
                        type=int)
    parser.add_argument(
        "--vfolds",
        "-v",
        default=10,
        help=
        "Number of v-folds to partition testing data for v-folds testing. Default is 10.",
        type=int)
    parser.add_argument(
        "--optimization",
        "-o",
        default=0.0,
        help=
        "Specify if iterative improvement of class conditional probability values should be taken.",
        type=float)
    parser.add_argument("--iteration",
                        "-t",
                        default=10,
                        help="Number of iterations for conditional update.",
                        type=int)
    parser.add_argument(
        "--identity",
        "-i",
        action="store_true",
        default=False,
        help="Specify if economic gain matrix should be identity.")
    args = parser.parse_args()

    # Prompts for reader friendliness
    print("Generating testing data for seed {}".format(args.seed))

    # Sets seed
    seed(args.seed)

    # Assigns values
    dimen = eval(args.dimen)
    # Calculates size of domain
    M = 1
    for N in dimen:
        M *= N
    K = args.classes
    V = args.vfolds
    Z = args.samples
    print("Dimensions of Measurement Space: {}".format(dimen))
    print("Number of Samples: {}".format(Z))
    print("Classes: {}".format(K))

    # Checks that this is even possible to calculate.
    if config.computeLimit(M, K):
        print("Possible measurements exceed memory capabilities.")
        sys.exit()

    print("Generating {0}x{0} Gain Matrix. Identity Matrix: {1}".format(
        K, args.identity))

    gain = genGain(K, identity=args.identity)
    print("{}x{} Economic Gain Matrix Generated".format(
        len(gain), len(gain[0])))

    # Generates measures
    print("Generating {} Measure-Value pairs.".format(Z))
    print("Generating measures.")
    generator = MeasurementGenerator(dimen)
    measures = generator.genMeas(Z)

    assigner = ClassAssign(dimen, K)
    tags = assigner.assign(measures)
    print("{} measures and {} values generated.".format(
        len(measures), len(tags)))

    ## Generates classifier.
    print(
        "Generating class conditional probabilities for {} classes and {} possible measures."
        .format(K, M))

    conditionals = genCCP(K, dimen)
    print(
        "Class conditional probabilities generated for {} classes and {} possible measures"
        .format(len(conditionals), len(conditionals[0])))

    classifier = BayesClassifier(
        None, conditionals,
        eGain=gain)  # No priors given since vFold always assigns.

    print("Testing classifier. V-fold factor: {}".format(V))
    measures, tags = shuffle(measures, tags)
    results = vFold(measures, tags, V, classifier)
    matrix = genConMatrix(tags, results, K)
    norm = normConMatrix(matrix)
    expGain = calcExpGain(norm, classifier.eGain)
    #expGain = test(classifier, measures, tags, V=V)

    print("The expected gain for the given data is: {}".format(expGain))

    #### Here we will work on updating
    if args.optimization:
        print(
            "Fitting data for improved performance. Improvement factor {} used over {} iterations."
            .format(args.optimization, args.iteration))
        gains = []
        # Going to set priors generated from this measurement set as permanent priors.
        priors = calcClassProb(tags, K)
        classifier.priorUpdate(priors)
        for i in range(args.iteration):
            # print(priors)
            classifier.optimize(args.optimization, measures, tags)

            classifier, measures, tags = fitData(classifier, generator, Z,
                                                 args.optimization)

            measures, tags = shuffle(measures, tags)
            results = vFold(measures, tags, V, classifier)
            matrix = genConMatrix(tags, results, K)
            norm = normConMatrix(matrix)
            expGain = calcExpGain(norm, classifier.eGain)
            #expGain = test(classifier, measures, tags, V=V)
            gains.append(expGain)
            print("Expected Gain from iteration {} is {}".format(
                i + 1, expGain))
        print("The expected gain for fitted data after {} iterations is: {}".
              format(args.iteration, gains[-1]))

    # Writes all data to files
    print("Writing to file.")
    reader.writeData(measures, tags, dimen)
    reader.writePriors(classifier.prior)
    reader.writeGain(gain)
    reader.writeCCP(classifier.cond)
    print("Done.")
Example 11
import readline, sys, pandas, numpy as np

from bayes import BayesClassifier
from data import FEATURES

if len(sys.argv) != 2:
    print("Invalid usage. Try ppd <filename>.")
    sys.exit(1)

#load the data
FNAME = str(sys.argv[1])
df = pandas.read_csv(FNAME)
df = df.dropna()

#load the classifier
model = BayesClassifier()

#classify the file
predictions = model.model.predict(df[FEATURES].values)
prediction_vals = model.model.predict_proba(df[FEATURES].values)
rev_zone_dic = {v: k for k, v in model.zone_dic.items()}
prediction_vals = np.asarray(prediction_vals) * 100  # convert probabilities to percentages
output = np.column_stack((df['ID'].values, prediction_vals))
df = pandas.DataFrame(output)
df.columns = ["ID"] + [v for k, v in rev_zone_dic.items()]
df.to_csv("output.csv")
print("Results in ./output.csv")
Example 12
    msg_type = line_split[0]
    words = re.findall("[a-z0-9']+", " ".join(line_split[1:]))
    return msg_type, set(words)


if __name__ == '__main__':
    with open('data/data.txt', 'r') as f:
        messages = []
        for line in f:
            msg_type, words = tokenize(line)
            messages.append({'msg_type': msg_type, 'words': words})

        training_set = messages[:int(len(messages) * 0.75)]
        testing_set = messages[int(len(messages) * 0.75):]

        bayes = BayesClassifier()
        bayes.train(training_set)
        classified = bayes.classify(testing_set)

        true_positive = len([
            1 for message in classified
            if message['msg_type'] == 'spam' and message['prob_spam'] > 0.5
        ])
        false_positive = len([
            1 for message in classified
            if message['msg_type'] == 'ham' and message['prob_spam'] > 0.5
        ])
        true_negative = len([
            1 for message in classified
            if message['msg_type'] == 'ham' and message['prob_spam'] <= 0.5
        ])
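        # (Sketch, not part of the original snippet: the listing stops after the
        # first three counts. A false-negative count and the usual precision /
        # recall summary would complete the evaluation; the field names below
        # simply mirror the ones used above.)
        false_negative = len([
            1 for message in classified
            if message['msg_type'] == 'spam' and message['prob_spam'] <= 0.5
        ])
        precision = true_positive / max(true_positive + false_positive, 1)
        recall = true_positive / max(true_positive + false_negative, 1)
        print('precision: {:.3f}, recall: {:.3f}'.format(precision, recall))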
Example 13
import readline

from bayes import BayesClassifier


def _rev_lookup(val, mydict):
    return (list(mydict.keys())[list(mydict.values()).index(val)])


while True:
    FEATURES = ["d15N", "d13C", "d2H", "d18O"]
    model = BayesClassifier()
    model_input = []
    for f in FEATURES:
        model_input.append(float(input("{f}: ".format(f=f))))
    y_pred, y_pred_prob = model.predict(model_input)
    print(""
          "Model predicts {y_pred} with {conf}% confidence."
          "".format(y_pred=_rev_lookup(y_pred, model.zone_dic),
                    conf=round(y_pred_prob[0][y_pred][0] * 100, 4)))
    print(model.zone_dic)
    print([ypp * 100 for ypp in y_pred_prob])
Example 14
            except:
                description = news['summary']
            description = re.sub('<.*?>', '', description)  # non-greedy: strips individual HTML tags
            slug = slugify(title)
            collection_obj = CollectionMapping(tablename)
            collection_obj.load_json({'site': site, 'slug': slug, 'name': title,
                                      'description': description, 'link': link})

if __name__ == "__main__":

    # delete old news
    CollectionMapping('news_news').delete_all()
    CollectionMapping('news_category').delete_all()
    # fetch news feed
    for tablename, url_dict in news_dict.items():
        for site,url in url_dict.items():
            parse_data(tablename,site,url)
    # add training data
    category_set=TrainClassifier.train_classifier(news_training_dict)
    category_dict=dict([(slugify(category),1) for category in news_training_dict.keys()])
    # classify each document 
    for news in CollectionMapping('news_news').objects.all():
        bayes_obj=BayesClassifier(category_set)
        # returns each obj with category_list attribute ordered according to their score
        obj=bayes_obj.find_posterior("%s %s"%(news.name,news.description))
        news.update(category_list=map(lambda category:category[0],obj.category_list))
        for category,score in obj.category_list:category_dict[category]+=1
    category_obj=CollectionMapping('news_category')
    category_obj.save(type="education",category=category_dict)

Example 15
#!/usr/bin/env python
#-*- encoding:utf-8 -*-

import sys, os

from preprocess import Preprocessor
from features import FeatureSelector
from bayes import BayesClassifier

if __name__ == '__main__':
    train_file = sys.argv[1]
    test_file = sys.argv[2]

    pr = Preprocessor()
    pr.build_vocabulary_and_categories(train_file)

    fs = FeatureSelector(train_file, ck=500)
    fs.select_features()

    bc = BayesClassifier(train_file, test_file, model='bernoulli')
    bc.train()
    bc.test()