Beispiel #1
0
def make_datum(row, args):
    """Build a Datum from one six-field record.

    ``row`` must unpack into exactly six values, in the order title,
    authors, groups, keywords, topics, abstract. The title is always
    stored under the "title" key; the abstract is stored under the
    "abstract" key only when ``args.abstract`` is truthy. The remaining
    four fields are unpacked but not used.
    """
    title, _authors, _groups, _keywords, _topics, abstract = row
    datum = Datum()
    datum.add_string("title", title)
    if not args.abstract:
        return datum
    datum.add_string("abstract", abstract)
    return datum
 def test_str(self):
     """str() of a Datum lists its string, number and binary values."""
     d = Datum()
     d.add_string('name', 'john')
     d.add_number('age', 20)
     d.add_binary('image', '0101')
     # assertEqual instead of the deprecated assertEquals alias, which
     # no longer exists in Python 3.12+.
     self.assertEqual('datum{string_values: [[\'name\', \'john\']], num_values: [[\'age\', 20.0]], binary_values: [[\'image\', \'0101\']]}',
                      str(d))
 def test_str(self):
     """str() of a Datum lists its string, number and binary values.

     The binary value may be rendered either as '0101' or as b'0101';
     both renderings are accepted.
     """
     datum = Datum()
     datum.add_string('name', 'john')
     datum.add_number('age', 20)
     datum.add_binary('image', b('0101'))
     accepted = (
         'datum{string_values: [[\'name\', \'john\']], num_values: [[\'age\', 20.0]], binary_values: [[\'image\', \'0101\']]}',
         'datum{string_values: [[\'name\', \'john\']], num_values: [[\'age\', 20.0]], binary_values: [[\'image\', b\'0101\']]}',
     )
     rendered = str(datum)
     self.assertTrue(rendered in accepted)
Beispiel #4
0
    def test_add_string(self):
        """add_string() matches the msgpack form of the dict constructor.

        Checked once with plain string literals and once with the key and
        value passed through the ``u`` helper.
        """
        plain = Datum()
        plain.add_string('key', 'value')
        expected = Datum({'key': 'value'}).to_msgpack()
        self.assertEqual(expected, plain.to_msgpack())

        wrapped = Datum()
        wrapped.add_string(u('key'), u('value'))
        expected = Datum({'key': 'value'}).to_msgpack()
        self.assertEqual(expected, wrapped.to_msgpack())
Beispiel #5
0
def make_datum_binary(title=None, text=None, picture=None):
    """Build a Datum from whichever of the optional fields are truthy.

    ``title`` and ``text`` are stored as strings under the "title" and
    "text" keys; ``picture`` is stored as binary data under "img".
    Falsy arguments (None, empty values) are skipped.
    """
    datum = Datum()
    for field, content in (("title", title), ("text", text)):
        if content:
            datum.add_string(field, content)
    if not picture:
        return datum
    datum.add_binary("img", picture)
    return datum
    def test_add_string(self):
        """add_string() matches the msgpack form of the dict constructor.

        Checked once with plain string literals and once with
        ``u''``-prefixed literals for both key and value.
        """
        # assertEqual instead of the deprecated assertEquals alias, which
        # no longer exists in Python 3.12+.
        d = Datum()
        d.add_string('key', 'value')
        self.assertEqual(Datum({'key': 'value'}).to_msgpack(),
                         d.to_msgpack())

        d = Datum()
        d.add_string(u'key', u'value')
        self.assertEqual(Datum({'key': 'value'}).to_msgpack(),
                         d.to_msgpack())
Beispiel #7
0
def make_datum(title=None, text=None, picture=None):
    """Build a Datum from whichever of the optional fields are truthy.

    ``title`` and ``text`` are stored as strings under the "title" and
    "text" keys. ``picture`` is treated as a file path: the file is
    opened in binary mode and its full contents are stored under "img".
    Falsy arguments (None, empty values) are skipped.
    """
    datum = Datum()
    for field, content in (("title", title), ("text", text)):
        if content:
            datum.add_string(field, content)
    if not picture:
        return datum
    with open(picture, "rb") as image_file:
        datum.add_binary("img", image_file.read())
    return datum
Beispiel #8
0
 def test_str(self):
     """str() of a Datum lists its string, number and binary values.

     The binary value may be rendered either as '0101' or as b'0101';
     both renderings are accepted.
     """
     datum = Datum()
     datum.add_string('name', 'john')
     datum.add_number('age', 20)
     datum.add_binary('image', b('0101'))
     without_prefix = 'datum{string_values: [[\'name\', \'john\']], num_values: [[\'age\', 20.0]], binary_values: [[\'image\', \'0101\']]}'
     with_prefix = 'datum{string_values: [[\'name\', \'john\']], num_values: [[\'age\', 20.0]], binary_values: [[\'image\', b\'0101\']]}'
     rendered = str(datum)
     self.assertTrue(rendered in (without_prefix, with_prefix))
Beispiel #9
0
  def convert(self, args):
    """Turn a flat ``key, value, key, value, ...`` sequence into a Datum.

    For each pair the value is first added as a number via ``float()``;
    if that attempt raises ValueError the value is added as a string
    instead.

    Returns the tuple ``(len(args), datum)``.

    Raises ValueError when ``args`` holds an odd number of items, i.e.
    the last key has no value.
    """
    count = len(args)
    if count % 2:
      raise ValueError('value for the last datum key ({0}) is missing'.format(args[count - 1]))

    datum = Datum()
    # Step through the keys, which sit at the even positions.
    for pos in range(0, count, 2):
      name = args[pos]
      raw = args[pos + 1]
      try:
        datum.add_number(name, float(raw))
      except ValueError:
        datum.add_string(name, raw)
    return (count, datum)
Beispiel #10
0
    def convert(self, args):
        """Turn a flat ``key, value, key, value, ...`` sequence into a Datum.

        For each pair the value is first added as a number via
        ``float()``; if that attempt raises ValueError the value is added
        as a string instead.

        Returns the tuple ``(len(args), datum)``.

        Raises ValueError when ``args`` holds an odd number of items,
        i.e. the last key has no value.
        """
        total = len(args)
        if total % 2:
            message = 'value for the last datum key ({0}) is missing'
            raise ValueError(message.format(args[total - 1]))

        datum = Datum()
        for pair_index in range(total // 2):
            key_pos = pair_index * 2
            name = args[key_pos]
            raw = args[key_pos + 1]
            try:
                datum.add_number(name, float(raw))
            except ValueError:
                datum.add_string(name, raw)
        return (total, datum)
Beispiel #11
0
    # 2. prepare training data
    with open('../kddcup.data_10_percent.txt', mode='r') as file:
        for line in file:
            duration, protocol_type, service, flag, src_bytes, dst_bytes, land, wrong_fragment, urgent, hot, num_failed_logins, logged_in, num_compromised, root_shell, su_attempted, num_root, num_file_creations, num_shells, num_access_files, num_outbound_cmds, is_host_login, is_guest_login, count, srv_count, serror_rate, srv_serror_rate, rerror_rate, srv_rerror_rate, same_srv_rate, diff_srv_rate, srv_diff_host_rate, dst_host_count, dst_host_srv_count, dst_host_same_srv_rate, dst_host_diff_srv_rate, dst_host_same_src_port_rate, dst_host_srv_diff_host_rate, dst_host_serror_rate, dst_host_srv_serror_rate, dst_host_rerror_rate, dst_host_srv_rerror_rate, label = line[:-1].split(",")

            datum = Datum()
            for (k, v) in [
                    ["protocol_type", protocol_type],
                    ["service", service],
                    ["flag", flag],
                    ["land", land],
                    ["logged_in", logged_in],
                    ["is_host_login", is_host_login],
                    ["is_guest_login", is_guest_login],
                    ]:
                datum.add_string(k, v)

            for (k, v) in [
                    ["duration",float(duration)],
                    ["src_bytes", float(src_bytes)],
                    ["dst_bytes", float(dst_bytes)],
                    ["wrong_fragment", float(wrong_fragment)],
                    ["urgent", float(urgent)],
                    ["hot", float(hot)],
                    ["num_failed_logins", float(num_failed_logins)],
                    ["num_compromised", float(num_compromised)],
                    ["root_shell", float(root_shell)],
                    ["su_attempted", float(su_attempted)],
                    ["num_root", float(num_root)],
                    ["num_file_creations", float(num_file_creations)],
                    ["num_shells", float(num_shells)],
Beispiel #12
0
import sys, json
from jubatus.clustering import client
from jubatus.clustering import types
from jubatus.common import Datum

NAME = "clustering_compounds"
if __name__ == '__main__':
    # Push every compound in the SMILES file to the clustering service,
    # then print the core members of the first four clusters.
    clustering = client.Clustering("127.0.0.1", 9199, NAME)

    # A context manager closes the input file deterministically instead
    # of leaving it to garbage collection.
    with open("../../bench_data/demo4096.smi") as smiles_file:
        for line in smiles_file:
            # Each line holds two space-separated fields; only the first
            # (the SMILES string) is used. The second is bound to a
            # throwaway name rather than `id`, which shadows the builtin.
            smiles, _compound_id = line.split(" ")
            datum = Datum()
            datum.add_string("SMILES", smiles)
            clustering.push([datum])
    # center_list is only referenced by the commented-out debug output
    # that follows this script.
    center_list = clustering.get_k_center()
    members = clustering.get_core_members()
    # NOTE(review): the cluster count 4 is hard-coded; presumably it
    # matches the server configuration -- confirm.
    for i in range(0, 4):
        for j, member in enumerate(members[i]):
            # The parenthesized single-argument form prints the same text
            # under the Python 2 print statement and the Python 3 function.
            print("%d, %d, %s " % (i, j, member))
#    print "%s \n" % center_list[4]
#    for i in range(len(center_list)):
#        print "%s \n" % center_list[i]
Beispiel #13
0
def make_datum():
    """Return a Datum holding one string, one number and one binary value.

    The entries are 'string-key' -> 'str', 'number-key' -> 1.0 and
    'binary-key' -> b'bin', added in that order.
    """
    datum = Datum()
    datum.add_string('string-key', 'str')
    datum.add_number('number-key', 1.0)
    datum.add_binary('binary-key', b'bin')
    return datum
import json, sys
import jubatus
from jubatus.common import Datum
# Load a JSON list of news feeds (path given as the first command-line
# argument), register each one as a row in the recommender, then print the
# rows most similar to the last feed that was registered.
headlines = {}
#keys = ["HeadLine", "DateLine", "Language", "DateId", "NewsItemId", "article", "Genre1", "Genre2"]
# Default to no results so the final loop is a no-op for an empty feed list.
res = []
with open(sys.argv[1], "r") as f:
    client = jubatus.Recommender("127.0.0.1", 9199, "hoge", 0)
    # The `encoding` keyword of json.load was removed in Python 3.9 and
    # raises TypeError there, so it is no longer passed.
    feeds = json.load(f)
    for feed in feeds:
        d = Datum()
        headlines[feed["NewsItemId"]] = feed["HeadLine"]
        for key in list(feed.keys()):
            try:
                if key == "article":
                    # The article value is joined with spaces into a
                    # single string.
                    d.add_string(key, " ".join(feed[key]))
                elif key == "NewsItemId":
                    # The item id becomes the row id, not a feature.
                    article_id = feed[key].encode('utf-8')
                else:
                    # NOTE(review): values are encoded to UTF-8 bytes
                    # before add_string; confirm this is what the
                    # installed client expects.
                    d.add_string(key, feed[key].encode('utf-8'))
            except (TypeError, AttributeError):
                # Print the raw value. The previous TypeError handler
                # re-ran " ".join(feed[key]), which could raise the same
                # TypeError again from inside the handler.
                print("ignore", key, feed[key])
        client.update_row(article_id, d)
    # Only query when at least one feed was processed; otherwise
    # article_id is undefined.
    if feeds:
        res = client.similar_row_from_id(article_id, 10)
    client.save("jubatus_hackathon")
for r in res:
    print(r.id, r.score, headlines[r.id])
Beispiel #15
0
import sys, json
from jubatus.clustering import client
from jubatus.clustering import types
from jubatus.common import Datum

NAME = "clustering_compounds"
if __name__ == '__main__':
    # Query the clustering service for the center nearest to one
    # hard-coded SMILES string and print the result.
    clustering = client.Clustering("127.0.0.1", 9199, NAME)

    datum = Datum()
    datum.add_string("SMILES", "cccccccc")
    # The parenthesized single-argument form prints the same text under
    # the Python 2 print statement and the Python 3 print function.
    print(clustering.get_nearest_center(datum))
#    print "%s \n" % center_list[4]
#    for i in range(len(center_list)):
#        print "%s \n" % center_list[i]
from jubatus.common import Datum
import jubatus

client = jubatus.Weight("127.0.0.1", 9199, "")

# Assemble one datum: the two numeric features first, then the two
# string features.
d = Datum()
for key, value in (("user/age", 25), ("user/income", 1000)):
    d.add_number(key, value)
for key, value in (("user/name", "Loren"), ("message", "Hello")):
    d.add_string(key, value)

# Pass the datum to calc_weight and print whatever it returns.
res = client.calc_weight(d)
print(res)
Beispiel #17
0
    with open('../kddcup.data_10_percent.txt', mode='r') as file:
        for line in file:
            duration, protocol_type, service, flag, src_bytes, dst_bytes, land, wrong_fragment, urgent, hot, num_failed_logins, logged_in, num_compromised, root_shell, su_attempted, num_root, num_file_creations, num_shells, num_access_files, num_outbound_cmds, is_host_login, is_guest_login, count, srv_count, serror_rate, srv_serror_rate, rerror_rate, srv_rerror_rate, same_srv_rate, diff_srv_rate, srv_diff_host_rate, dst_host_count, dst_host_srv_count, dst_host_same_srv_rate, dst_host_diff_srv_rate, dst_host_same_src_port_rate, dst_host_srv_diff_host_rate, dst_host_serror_rate, dst_host_srv_serror_rate, dst_host_rerror_rate, dst_host_srv_rerror_rate, label = line[:-1].split(
                ",")

            datum = Datum()
            for (k, v) in [
                ["protocol_type", protocol_type],
                ["service", service],
                ["flag", flag],
                ["land", land],
                ["logged_in", logged_in],
                ["is_host_login", is_host_login],
                ["is_guest_login", is_guest_login],
            ]:
                datum.add_string(k, v)

            for (k, v) in [
                ["duration", float(duration)],
                ["src_bytes", float(src_bytes)],
                ["dst_bytes", float(dst_bytes)],
                ["wrong_fragment", float(wrong_fragment)],
                ["urgent", float(urgent)],
                ["hot", float(hot)],
                ["num_failed_logins",
                 float(num_failed_logins)],
                ["num_compromised", float(num_compromised)],
                ["root_shell", float(root_shell)],
                ["su_attempted", float(su_attempted)],
                ["num_root", float(num_root)],
                ["num_file_creations",