Beispiel #1
0
def evaluate(X_train, X_test, y_train, y_test, n_trials=4):
    """Benchmark Jubatus and scikit-learn linear classifiers on one split.

    For each of ``n_trials`` rounds the training set is reshuffled, a new
    classifier is created for every method, trained, and scored with
    ``accuracy_score`` on the test set. Every individual score is printed
    as ``<score>\\t<method>`` as soon as it is available.

    Args:
        X_train: Training documents; they are passed to ``TfidfVectorizer``
            and wrapped in ``Datum`` objects under the ``'message'`` key
            (presumably raw text -- confirm against the caller).
        X_test: Test documents, handled the same way as ``X_train``.
        y_train: Labels aligned with ``X_train``.
        y_test: Labels aligned with ``X_test``.
        n_trials: Number of shuffle/train/score rounds to average over.

    Returns:
        dict mapping each method name to its mean test accuracy.
    """
    jubatus_methods = ['perceptron', 'PA', 'PA1', 'PA2', 'CW', 'AROW', 'NHERD']
    sklearn_methods = [
        'Perceptron(sk)', 'PA1(sk)', 'PA2(sk)', 'LSVM(sk)', 'LR(sk)'
    ]
    results = dict.fromkeys(jubatus_methods + sklearn_methods, 0)
    vectorizer = TfidfVectorizer()

    # The test set never changes, so wrap it in Datum objects only once
    # instead of once per method and trial.
    test_data = [Datum({'message': xi}) for xi in X_test]

    for _ in range(n_trials):
        X_train, y_train = shuffle(X_train, y_train, random_state=42)
        vec_X_train = vectorizer.fit_transform(X_train)
        vec_X_test = vectorizer.transform(X_test)

        # The labelled training pairs depend only on this trial's shuffle,
        # not on the method, so build them once per trial.
        train_data = [(yi, Datum({'message': xi}))
                      for (xi, yi) in zip(X_train, y_train)]

        for method in jubatus_methods:
            clf = linear_classifier(method=method)
            clf.train(train_data)
            predictions = clf.classify(test_data)
            # Each prediction is a list of scored labels; keep the best one.
            y_pred = [
                max(pred, key=lambda x: x.score).label for pred in predictions
            ]
            test_score = accuracy_score(y_test, y_pred)
            print('{0:.3f}\t{1}'.format(test_score, method))
            results[method] += test_score

        for method in sklearn_methods:
            clf = sklearn_linear_classifier(method=method)
            clf.fit(vec_X_train, y_train)
            test_score = accuracy_score(y_test, clf.predict(vec_X_test))
            print('{0:.3f}\t{1}'.format(test_score, method))
            results[method] += test_score

    # Turn the accumulated sums into per-method averages.
    results = {k: v / n_trials for k, v in results.items()}
    return results
Beispiel #2
0
 def test_get_nearest_center(self):
     """Push 100 indexed points and check the nearest center is a Datum."""
     points = (IndexedPoint(str(n), Datum({"nkey1": n, "nkey2": -n}))
               for n in range(100))
     for point in points:
         # Each point is sent in its own push call.
         self.cli.push([point])
     query = Datum({"nkey1": 2.0, "nkey2": 1.0})
     self.assertIsInstance(self.cli.get_nearest_center(query), Datum)
    def test_loadsave(self):
        """Round-trip a trained classifier through ``save``/``load`` on disk.

        The model file is removed before and after the test so that a stale
        file from an earlier run cannot influence the outcome.
        """
        x = Classifier(CONFIG)
        x.train([
            LabeledDatum('Y', Datum({'x': 'y'})),
            LabeledDatum('N', Datum({'x': 'n'})),
        ])
        path = '/tmp/127.0.0.1_0_classifier_hoge.jubatus'

        def _remove_model():
            # Best-effort cleanup: a missing file is fine, but only
            # filesystem errors are ignored so unrelated bugs still surface.
            try:
                os.remove(path)
            except OSError:
                pass

        _remove_model()
        try:
            # save() reports the written file per server id.
            self.assertEqual({'127.0.0.1_0': path}, x.save('hoge'))
            self.assertTrue(os.path.isfile(path))

            # A brand-new instance must classify correctly after load().
            x = Classifier(CONFIG)
            self.assertTrue(x.load('hoge'))
            y = x.classify([Datum({'x': 'y'}), Datum({'x': 'n'})])
            self.assertEqual(['Y', 'N'], [
                sorted(z, key=lambda r: r.score, reverse=True)[0].label
                for z in y
            ])
        finally:
            _remove_model()
    def test(self):
        """Exercise Weight.update/calc_weight and a save/load round trip."""
        weight = Weight(CONFIG)
        datum = Datum({'n0': 1, 'n1': 2, 'n2': 3, 't0': 'hello world'})
        # Both calls are made up front; each result gets the same checks.
        for features in (weight.update(datum), weight.calc_weight(datum)):
            self.assertEqual(5, len(features))
            for feature in features:
                self.assertIsInstance(feature, Feature)
            by_key = {f.key: f.value for f in features}
            self.assertEqual(5, len(by_key))
            self.assertEqual(1.0, by_key['n0@num'])
            self.assertAlmostEqual(math.log(2), by_key['n1@log'])
            self.assertEqual(1.0, by_key['n2@str$3'])

        for text in ('hello world', 'foo bar', 'hello'):
            weight.update(Datum({'t1': text}))
        probe = Datum({'t1': 'foo bar hello world hoge'})
        before = {f.key: f.value for f in weight.calc_weight(probe)}

        # A fresh instance loaded from the serialized bytes must report the
        # same config and compute the same weights.
        blob = weight.save_bytes()
        weight = Weight(CONFIG)
        weight.load_bytes(blob)
        self.assertEqual(CONFIG, json.loads(weight.get_config()))
        after = {f.key: f.value for f in weight.calc_weight(probe)}
        self.assertEqual(before, after)
    def test_num(self):
        """Train on numeric features, then check labels, clear and reload."""
        clf = Classifier(CONFIG)
        trained = clf.train([
            ('Y', Datum({'x': 1})),
            ('N', Datum({'x': -1})),
        ])
        self.assertEqual(2, trained)

        def _test_classify(model):
            # The top-scored label must be 'Y' for x=1 and 'N' for x=-1,
            # and each label must have been seen exactly once.
            results = model.classify([Datum({'x': 1}), Datum({'x': -1})])
            top_labels = [
                sorted(r, key=lambda e: e.score, reverse=True)[0].label
                for r in results
            ]
            self.assertEqual(['Y', 'N'], top_labels)
            self.assertEqual(model.get_labels(), {'N': 1, 'Y': 1})

        _test_classify(clf)
        snapshot = clf.save_bytes()

        # After clear() the label table is empty and can be rebuilt by hand.
        clf.clear()
        self.assertEqual({}, clf.get_labels())
        for label in ('Y', 'N'):
            clf.set_label(label)
        self.assertEqual({'N': 0, 'Y': 0}, clf.get_labels())
        clf.delete_label(u'Y')
        self.assertEqual({'N': 0}, clf.get_labels())

        # Loading the earlier snapshot into a new instance restores the model.
        clf = Classifier(CONFIG)
        clf.load_bytes(snapshot)
        _test_classify(clf)
        self.assertEqual(CONFIG, json.loads(clf.get_config()))
Beispiel #6
0
 def test_get_nearest_members_light(self):
     """Push 100 indexed points and check the light nearest-members result type."""
     for n in range(100):
         point = IndexedPoint(str(n), Datum({"nkey1": n, "nkey2": -n}))
         self.cli.push([point])
     query = Datum({"nkey1": 2.0, "nkey2": 1.0})
     members = self.cli.get_nearest_members_light(query)
     self.assertIsInstance(members[0], WeightedIndex)
Beispiel #7
0
def test():
    """Walk through the Recommender API and check result types and basics."""
    def _valid_result(row):
        # A completed/decoded row must be a Datum carrying both 'x' and 'y'.
        assert isinstance(row, Datum)
        nums = dict(row.num_values)
        assert nums.get('x') is not None and nums.get('y') is not None

    rec = Recommender(RECOMMENDER_CONFIG)
    rows = [
        ('0', {'x': 0.9, 'y': 4.9}),
        ('1', {'x': 1, 'y': 5}),
        ('2', {'x': 1.15, 'y': 5.15}),
        ('3', {'x': 1.2, 'y': 5.1}),
        ('4', {'x': 1.05}),  # 'y' is missing on purpose; it gets completed below
    ]
    for row_id, values in rows:
        rec.update_row(row_id, Datum(values))

    _valid_result(rec.complete_row_from_datum(Datum({'x': 1.1})))
    _valid_result(rec.complete_row_from_id('4'))

    similar = rec.similar_row_from_id('2', 3)
    assert isinstance(similar, list)
    assert isinstance(similar[0], IdWithScore)
    similar = rec.similar_row_from_datum(Datum({'y': 5.05}), 3)
    assert isinstance(similar, list)
    assert isinstance(similar[0], IdWithScore)

    _valid_result(rec.decode_row('0'))
    assert set(rec.get_all_rows()) == {str(i) for i in range(5)}

    similarity = rec.calc_similarity(Datum({'x': 1}), Datum({'y': 5}))
    assert isinstance(similarity, float)
    norm = rec.calc_l2norm(Datum({'x': 1, 'y': 5}))
    assert isinstance(norm, float)

    rec.clear()
    assert len(rec.get_all_rows()) == 0
    assert json.loads(rec.get_config())

    # Only verify that dump/load run without raising.
    model = rec.dump()
    rec.load(model)
 def _test_classify(x):
     """Check that ``x`` ranks 'Y' then 'N' and counts one sample per label.

     NOTE(review): ``self`` is a free variable here, so this helper presumably
     lives inside a test method -- confirm against the enclosing scope.
     """
     results = x.classify([Datum({'x': 1}), Datum({'x': -1})])
     top_labels = [
         sorted(r, key=lambda e: e.score, reverse=True)[0].label
         for r in results
     ]
     self.assertEqual(['Y', 'N'], top_labels)
     self.assertEqual(x.get_labels(), {'N': 1, 'Y': 1})
Beispiel #9
0
    def test_add_string(self):
        """add_string must match dict construction for plain and u() strings."""
        def _packed_via_add_string(key, value):
            # Build an empty Datum, add one string pair, return its packed form.
            datum = Datum()
            datum.add_string(key, value)
            return datum.to_msgpack()

        self.assertEqual(Datum({'key': 'value'}).to_msgpack(),
                         _packed_via_add_string('key', 'value'))

        self.assertEqual(Datum({'key': 'value'}).to_msgpack(),
                         _packed_via_add_string(u('key'), u('value')))
Beispiel #10
0
    def test(self):
        """Train a regression model, estimate, and check a save/load round trip."""
        reg = Regression(CONFIG)
        samples = [
            ScoredDatum(score, Datum({'x': value}))
            for score, value in (
                (0.0, 1.0), (1.0, 2.0), (2.0, 4.0), (3.0, 8.0), (4.0, 16.0),
            )
        ]
        self.assertEqual(5, reg.train(samples))

        def _queries():
            # Fresh Datum objects for every estimate() call.
            return [Datum({'x': 32.0}), Datum({'x': 1.5})]

        ret = reg.estimate(_queries())
        self.assertEqual(2, len(ret))
        self.assertTrue(8.0 <= ret[0] < 9.0)
        self.assertTrue(0.0 <= ret[1] < 1.0)
        self.assertEqual(CONFIG, json.loads(reg.get_config()))

        # A new instance restored from the bytes must give identical estimates.
        blob = reg.save_bytes()
        reg = Regression(CONFIG)
        reg.load_bytes(blob)
        self.assertEqual(ret, reg.estimate(_queries()))
Beispiel #11
0
    def test(self):
        """Train, estimate, round-trip via bytes, and inspect get_status()."""
        reg = Regression(CONFIG)
        training_pairs = (
            (0.0, 1.0), (1.0, 2.0), (2.0, 4.0), (3.0, 8.0), (4.0, 16.0),
        )
        trained = reg.train([
            ScoredDatum(score, Datum({'x': value}))
            for score, value in training_pairs
        ])
        self.assertEqual(5, trained)

        ret = reg.estimate([Datum({'x': 32.0}), Datum({'x': 1.5})])
        self.assertEqual(2, len(ret))
        first, second = ret[0], ret[1]
        self.assertTrue(8.0 <= first < 9.0)
        self.assertTrue(0.0 <= second < 1.0)
        self.assertEqual(CONFIG, json.loads(reg.get_config()))

        # A new instance restored from the bytes must give identical estimates.
        blob = reg.save_bytes()
        reg = Regression(CONFIG)
        reg.load_bytes(blob)
        restored = reg.estimate([Datum({'x': 32.0}), Datum({'x': 1.5})])
        self.assertEqual(ret, restored)

        # The status is a dict with the single key 'embedded' holding a dict.
        status = reg.get_status()
        self.assertIsInstance(status, dict)
        self.assertEqual(len(status), 1)
        self.assertEqual(list(status)[0], 'embedded')
        self.assertIsInstance(status['embedded'], dict)
Beispiel #12
0
def test_classifier_str():
    """String features: the top-scored label must match the trained label."""
    clf = Classifier(CLASSIFIER_CONFIG)
    clf.train([
        (u'Y', Datum({'x': u'y'})),
        (u'N', Datum({'x': u'n'})),
    ])
    results = clf.classify([Datum({'x': 'y'}), Datum({'x': 'n'})])
    # For every query keep the label of the highest-scored estimate.
    top_labels = [
        sorted(r, key=lambda e: e.score, reverse=True)[0].label
        for r in results
    ]
    assert top_labels == ['Y', 'N']
 def test_str(self):
     """Train on two string samples and check the top label for each query."""
     clf = Classifier(CONFIG)
     trained = clf.train([
         ('Y', Datum({'x': 'y'})),
         ('N', Datum({'x': 'n'})),
     ])
     self.assertEqual(2, trained)
     results = clf.classify([Datum({'x': 'y'}), Datum({'x': 'n'})])
     # For every query keep the label of the highest-scored estimate.
     top_labels = [
         sorted(r, key=lambda e: e.score, reverse=True)[0].label
         for r in results
     ]
     self.assertEqual(['Y', 'N'], top_labels)
 def test_types(self):
     """LabeledDatum input must work and classify must yield EstimateResult."""
     clf = Classifier(CONFIG)
     samples = [
         LabeledDatum(label, Datum({'x': text}))
         for label, text in (('Y', 'y'), ('N', 'n'))
     ]
     clf.train(samples)
     results = clf.classify([Datum({'x': 'y'}), Datum({'x': 'n'})])
     self.assertIsInstance(results[0][0], EstimateResult)
     # For every query keep the label of the highest-scored estimate.
     top_labels = [
         sorted(r, key=lambda e: e.score, reverse=True)[0].label
         for r in results
     ]
     self.assertEqual(['Y', 'N'], top_labels)
Beispiel #15
0
def predict(client):
    """Classify three given names and output '<predicted label> <name>' lines."""
    # predict the last shogun
    data = [Datum({'name': name}) for name in ('慶喜', '義昭', '守時')]
    for datum in data:
        ranked = client.classify([datum])[0]
        # get the predicted shogun name
        best = max(ranked, key=lambda r: r.score)
        # The given name is read back from the datum's first string value.
        given_name = datum.string_values[0][1]
        _output('{0} {1}\n'.format(best.label, given_name))
Beispiel #16
0
def test():
    """Walk through the Clustering API: push, centers, members, nearest lookups."""
    clustering = Clustering(CLUSTERING_CONFIG)
    assert clustering.get_revision() == 0
    points = [Datum({'x': value}) for value in (1.0, 0.9, 1.1, 5.0, 4.9, 5.1)]
    assert clustering.push(points)
    assert clustering.get_revision() == 1

    centers = clustering.get_k_center()
    assert isinstance(centers, list)
    assert len(centers) == 2
    assert isinstance(centers[0], Datum)

    members = clustering.get_core_members()
    assert isinstance(members, list)
    assert len(members) == 2
    assert isinstance(members[0], list)
    assert isinstance(members[0][0], WeightedDatum)

    # The center nearest to 1.05 must lie within the low group of points.
    nearest = clustering.get_nearest_center(Datum({'x': 1.05}))
    assert isinstance(nearest, Datum)
    assert 0.9 <= nearest.num_values[0][1] <= 1.1

    neighbours = clustering.get_nearest_members(Datum({'x': 1.05}))
    assert isinstance(neighbours, list)
    assert isinstance(neighbours[0], WeightedDatum)
    assert json.loads(clustering.get_config())

    # Only verify that dump/load run without raising.
    model = clustering.dump()
    clustering.load(model)
Beispiel #17
0
def predict(client):
    """Classify three given names and write '<label> <name>' lines to stdout.

    NOTE(review): the name is written as UTF-8 encoded bytes, which presumably
    targets Python 2 (a text-mode Python 3 stdout rejects bytes) -- confirm.
    """
    # predict the last shogun
    data = [Datum({'name': name}) for name in (u'慶喜', u'義昭', u'守時')]
    emit = sys.stdout.write
    for datum in data:
        ranked = client.classify([datum])
        # get the predicted shogun name
        emit(max(ranked[0], key=lambda r: r.score).label)
        emit(' ')
        emit(datum.string_values[0][1].encode('utf-8'))
        emit('\n')
Beispiel #18
0
 def test_decode_row(self):
     """A stored row must decode back to the same string and numeric values."""
     row_id = "decode_row"
     self.cli.clear_row(row_id)
     stored = Datum({"skey1": "val1", "skey2": "val2", "nkey1": 1.0, "nkey2": 2.0})
     self.cli.update_row(row_id, stored)
     decoded = self.cli.decode_row(row_id)
     # Compare through JSON so both sides are reduced to the same plain form.
     for attr in ("string_values", "num_values"):
         self.assertEqual(json.dumps(getattr(stored, attr)),
                          json.dumps(getattr(decoded, attr)))
Beispiel #19
0
 def test_pack(self):
     """Datum.to_msgpack must pack like the equivalent hand-written structure."""
     # Strings, numbers (the int age shows up as a float) and an empty third slot.
     expected = msgpack.packb(([['name', 'Taro']], [['age', 20.0]], []))
     datum = Datum({'name': 'Taro', 'age': 20})
     self.assertEqual(expected, msgpack.packb(datum.to_msgpack()))
Beispiel #20
0
 def test_get_k_center(self):
     """After pushing 100 points, get_k_center must return 10 Datum centers."""
     for n in range(100):
         point = IndexedPoint(str(n), Datum({"nkey1": n, "nkey2": -n}))
         self.cli.push([point])
     centers = self.cli.get_k_center()
     self.assertEqual(len(centers), 10)
     self.assertIsInstance(centers[0], Datum)
Beispiel #21
0
 def test_get_core_members_light(self):
     """After pushing 100 points, expect 10 groups of WeightedIndex members."""
     for n in range(100):
         point = IndexedPoint(str(n), Datum({"nkey1": n, "nkey2": -n}))
         self.cli.push([point])
     groups = self.cli.get_core_members_light()
     self.assertEqual(len(groups), 10)
     self.assertIsInstance(groups[0][0], WeightedIndex)
Beispiel #22
0
    def predict(self, client):
        """Classify every 'Value' read via preMongo and return the last label.

        Each entry of ``preMongo().getDic()`` contributes one Datum built
        from its ``'Value'`` field. All of them are classified one by one,
        but only the label of the last one is returned.

        Raises:
            KeyError: If ``getDic()`` yields no entries, because no result
                was ever stored.
        """
        records = preMongo().getDic()
        data = [Datum({'Value': records[key]['Value']}) for key in records]

        predict_result = {}
        for datum in data:
            res = client.classify([datum])
            # Read the value back from the datum's numeric values
            # (assumes 'Value' is numeric -- TODO confirm).
            value_text = str(datum.num_values[0][1])
            label = max(res[0], key=lambda e: e.score).label
            predict_result['Result'] = label
            predict_result['Value'] = value_text

        return predict_result['Result']
Beispiel #23
0
def add_data(num=10):
    """Generate ``num`` points, add each to the anomaly client and print its score."""
    points = generate_data(num)
    client = jubatus.Anomaly(HOST, PORT_P, NAME)
    for point in points:
        # The first three components become the x/y/z features.
        datum = Datum({"x": point[0], "y": point[1], "z": point[2]})
        added = client.add(datum)
        print('Added {0}, score = {1}'.format(added.id, added.score))
Beispiel #24
0
    def on_status(self, status):
        """Train the classifier with a geotagged tweet from a known location.

        Statuses without text or usable coordinates, and statuses that fall
        outside every entry of ``self.locations``, are ignored. Otherwise the
        tweet text with its hashtags removed is sent as one training sample
        labelled with the matching location's name, and echoed to stdout.
        """
        if not (hasattr(status, 'text') and hasattr(status, 'coordinates')):
            return
        if not status.coordinates or 'coordinates' not in status.coordinates:
            return

        # Pick the first configured location that contains the point.
        point = status.coordinates['coordinates']
        loc = next(
            (candidate for candidate in self.locations
             if candidate.is_inside(point[0], point[1])),
            None,
        )
        if not loc:
            # Unknown location
            return

        detagged_text = remove_hashtags_from_tweet(
            status.text, status.entities['hashtags'])

        # Create datum for Jubatus and send it as training data
        self.classifier.train([(loc.name, Datum({'text': detagged_text}))])

        # Print trained tweet
        print_green(loc.name, ' ')
        print(detagged_text)
Beispiel #25
0
def make_datum(row, args):
    """Build a Datum holding the row's title and, optionally, its abstract.

    ``row`` must unpack into exactly six fields; only the first (title) and
    the last (abstract) are used. The abstract is added only when
    ``args.abstract`` is truthy.
    """
    title, _authors, _groups, _keywords, _topics, abstract = row
    datum = Datum()
    datum.add_string("title", title)
    if args.abstract:
        datum.add_string("abstract", abstract)
    return datum
Beispiel #26
0
def train_and_predict(client, file):
    """Train on every CHB/JPT sample except one, then classify the held-out one.

    The TSV at ``file`` has a header row followed by one sample per line.
    Only lines whose characters 8-10 are ``'CHB'`` or ``'JPT'`` are used.
    In the first column, the first seven characters are compared against
    the module-level ``exclude`` to pick the held-out sample, and
    characters 8-10 serve as the label. All remaining columns are parsed as
    floats and keyed by the corresponding header names.

    The other samples are fed to ``client.train`` one at a time, in a freshly
    shuffled order for each of the module-level ``number_of_epoch`` epochs.
    Finally the held-out sample is classified and the outcome is printed as
    one tab-separated line.

    Args:
        client: Classifier client exposing ``train`` and ``classify``.
        file: Path of the TSV file to read.

    Raises:
        ValueError: If no sample matching ``exclude`` was encountered
            (this previously surfaced as a ``NameError``).
    """
    with open(file) as tsv:
        # rstrip("\n") rather than [:-1]: a final line without a trailing
        # newline must not lose its last character.
        header = tsv.readline().rstrip("\n").split("\t")
        input_data = [line for line in tsv if line[8:11] in ('CHB', 'JPT')]

    order = list(range(len(input_data)))
    held_out = None
    answer = None

    for _ in range(number_of_epoch):
        # One shuffle already yields a uniformly random order; the original
        # shuffled three times in a row, which adds nothing.
        random.shuffle(order)
        for i in order:
            fields = input_data[i].rstrip("\n").split("\t")
            features = {
                header[j]: float(fields[j]) for j in range(1, len(fields))
            }
            label = fields[0][8:11]
            if fields[0][0:7] == exclude:
                # Set the excluded sample aside for the final prediction.
                answer = label
                held_out = Datum(features)
            else:
                client.train([(label, Datum(features))])

    if held_out is None:
        raise ValueError(
            'no sample matching {0!r} found in {1!r}'.format(exclude, file))

    result = client.classify([held_out])
    predicted = max(result[0], key=lambda x: x.score).label
    if answer == predicted:
        print('correct', end="\t")
    else:
        print('wrong', end="\t")
    print(answer, predicted, result, sep="\t")
    def test(self):
        """Exercise NearestNeighbor queries, a bytes round trip and get_status()."""
        nn = NearestNeighbor(CONFIG)
        rows = [
            ("a0", 0), ("a1", 0.25), ("a2", 0.5), ("a3", 1),
            ("b0", 10), ("b1", 10.25), ("b2", 10.5), ("b3", 11),
        ]
        for row_id, value in rows:
            self.assertTrue(nn.set_row(row_id, Datum({'x': value})))

        def _check_prefix(expected, hits):
            # Every hit must be an IdWithScore whose id starts with `expected`.
            for hit in hits:
                self.assertIsInstance(hit, IdWithScore)
                self.assertTrue(hit.id.startswith(expected))

        # Rows near the "a" group must only return "a" ids ...
        hits = nn.neighbor_row_from_id("a0", 3)
        self.assertEqual(3, len(hits))
        _check_prefix('a', hits)

        hits = nn.neighbor_row_from_datum(Datum({'x': 0.25}), 3)
        self.assertEqual(3, len(hits))
        _check_prefix('a', hits)

        # ... and rows near the "b" group only "b" ids.
        hits = nn.similar_row_from_id("b3", 3)
        self.assertEqual(3, len(hits))
        _check_prefix('b', hits)

        hits = nn.similar_row_from_datum(Datum({'x': 11}), 3)
        self.assertEqual(3, len(hits))
        _check_prefix('b', hits)

        all_ids = {row_id for row_id, _ in rows}
        self.assertEqual(all_ids, set(nn.get_all_rows()))
        self.assertEqual(CONFIG, json.loads(nn.get_config()))
        blob = nn.save_bytes()

        # A new instance restored from the bytes must know the same rows.
        nn = NearestNeighbor(CONFIG)
        nn.load_bytes(blob)
        self.assertEqual(all_ids, set(nn.get_all_rows()))

        # The status is a dict with the single key 'embedded' holding a dict.
        status = nn.get_status()
        self.assertIsInstance(status, dict)
        self.assertEqual(len(status), 1)
        self.assertEqual(list(status)[0], 'embedded')
        self.assertIsInstance(status['embedded'], dict)
Beispiel #28
0
    def test_add_bulk(self):
        """add_bulk and fit must both register the five rows under ids '0'..'4'."""
        expected_ids = ['0', '1', '2', '3', '4']
        model = Anomaly(CONFIG)
        data = [
            Datum({'x': value})
            for value in (0.0999, 0.1, -0.1009, -0.101, 0.1011)
        ]
        ret = model.add_bulk(data)
        self.assertEqual(expected_ids, ret)
        self.assertEqual(set(ret), set(model.get_all_rows()))

        # Feed the same values through fit() on a fresh instance.
        model = Anomaly(CONFIG)
        column = [[datum.num_values[0][1]] for datum in data]
        model.fit(np.array(column))
        # NOTE(review): `ret` still holds the add_bulk() result from above, so
        # this first check cannot fail here -- confirm whether that is intended.
        self.assertEqual(expected_ids, ret)
        self.assertEqual(set(ret), set(model.get_all_rows()))
Beispiel #29
0
    def test(self):
        """Exercise the Clustering API, a bytes round trip and get_status()."""
        clustering = Clustering(CONFIG)
        self.assertEqual(0, clustering.get_revision())
        points = [
            IndexedPoint(point_id, Datum({'x': value}))
            for point_id, value in (
                ('a', 1.0), ('b', 0.9), ('c', 1.1),
                ('d', 5.0), ('e', 4.9), ('f', 5.1),
            )
        ]
        self.assertTrue(clustering.push(points))
        self.assertEqual(1, clustering.get_revision())

        centers = clustering.get_k_center()
        self.assertIsInstance(centers, list)
        self.assertEqual(2, len(centers))
        self.assertIsInstance(centers[0], Datum)

        members = clustering.get_core_members()
        self.assertIsInstance(members, list)
        self.assertEqual(2, len(members))
        self.assertIsInstance(members[0], list)
        self.assertIsInstance(members[0][0], WeightedDatum)

        # The center nearest to 1.05 must lie within the low group of points.
        query = Datum({'x': 1.05})
        nearest = clustering.get_nearest_center(query)
        self.assertIsInstance(nearest, Datum)
        self.assertTrue(0.9 <= nearest.num_values[0][1] <= 1.1)

        neighbours = clustering.get_nearest_members(Datum({'x': 1.05}))
        self.assertIsInstance(neighbours, list)
        self.assertIsInstance(neighbours[0], WeightedDatum)

        # The "light" variants return indices instead of full datums.
        light_groups = clustering.get_core_members_light()
        self.assertIsInstance(light_groups, list)
        self.assertIsInstance(light_groups[0], list)
        self.assertIsInstance(light_groups[0][0], WeightedIndex)

        light_neighbours = clustering.get_nearest_members_light(
            Datum({'x': 1.05}))
        self.assertIsInstance(light_neighbours, list)
        self.assertIsInstance(light_neighbours[0], WeightedIndex)

        # A new instance restored from the bytes keeps config, revision
        # and the number of centers.
        blob = clustering.save_bytes()
        clustering = Clustering(CONFIG)
        clustering.load_bytes(blob)

        self.assertEqual(CONFIG, json.loads(clustering.get_config()))
        self.assertEqual(1, clustering.get_revision())
        self.assertEqual(len(centers), len(clustering.get_k_center()))

        # The status is a dict with the single key 'embedded' holding a dict.
        status = clustering.get_status()
        self.assertIsInstance(status, dict)
        self.assertEqual(len(status), 1)
        self.assertEqual(list(status)[0], 'embedded')
        self.assertIsInstance(status['embedded'], dict)
def predict_min(l):
    """Classify one article and return ``(label, score)`` of its lowest score.

    ``l`` must provide the ``'article'`` and ``'HeadLine'`` entries; the
    module-level ``client`` performs the classification.
    """
    datum = Datum({
        u'article': l['article'],
        u'HeadLine': l['HeadLine'],
    })
    candidates = client.classify([datum])[0]
    # Deliberately pick the minimum, not the maximum, score.
    lowest = min(candidates, key=lambda c: c.score)
    return lowest.label, lowest.score