def make_datum(row, args):
    """Build a Datum from one six-field record.

    ``row`` must unpack into exactly six values (title, authors, groups,
    keywords, topics, abstract).  Only the title is always stored; the
    abstract is stored as well when ``args.abstract`` is truthy.
    """
    # The middle four fields are unpacked only to enforce the record width.
    title, _authors, _groups, _keywords, _topics, abstract = row

    datum = Datum()
    datum.add_string("title", title)
    if args.abstract:
        datum.add_string("abstract", abstract)
    return datum
def test_str(self):
    """Verify ``str()`` of a Datum holding one string, one number and one binary value."""
    d = Datum()
    d.add_string('name', 'john')
    d.add_number('age', 20)
    d.add_binary('image', '0101')
    expected = ("datum{string_values: [['name', 'john']], "
                "num_values: [['age', 20.0]], "
                "binary_values: [['image', '0101']]}")
    # assertEqual: the assertEquals alias is deprecated and was removed in
    # Python 3.12.
    self.assertEqual(expected, str(d))
def test_str(self):
    """Verify ``str()`` of a Datum holding one string, one number and one binary value."""
    d = Datum()
    d.add_string('name', 'john')
    d.add_number('age', 20)
    d.add_binary('image', b('0101'))

    # Two renderings of the binary value are accepted: with and without a
    # b'' prefix (presumably the Python 2 vs Python 3 bytes repr).
    accepted = (
        "datum{string_values: [['name', 'john']], "
        "num_values: [['age', 20.0]], "
        "binary_values: [['image', '0101']]}",
        "datum{string_values: [['name', 'john']], "
        "num_values: [['age', 20.0]], "
        "binary_values: [['image', b'0101']]}",
    )
    # assertIn reports the actual string on failure, unlike assertTrue on a
    # pre-computed boolean.
    self.assertIn(str(d), accepted)
def test_add_string(self):
    """add_string must serialize identically to the dict constructor.

    Checked once with plain string literals and once with ``u()``-wrapped
    strings.
    """
    def check_matches_dict_form(key, value):
        built = Datum()
        built.add_string(key, value)
        self.assertEqual(Datum({'key': 'value'}).to_msgpack(),
                         built.to_msgpack())

    check_matches_dict_form('key', 'value')
    check_matches_dict_form(u('key'), u('value'))
def make_datum_binary(title=None, text=None, picture=None):
    """Build a Datum from optional title, text and in-memory picture data.

    Each argument is added only when it is truthy; ``picture`` is stored
    as-is under the binary key ``"img"``.
    """
    datum = Datum()
    for field_name, field_value in (("title", title), ("text", text)):
        if field_value:
            datum.add_string(field_name, field_value)
    if picture:
        datum.add_binary("img", picture)
    return datum
def test_add_string(self):
    """add_string must serialize identically to the dict constructor.

    Checked once with plain string literals and once with ``u''`` literals.
    """
    # assertEqual: the assertEquals alias is deprecated and was removed in
    # Python 3.12.
    d = Datum()
    d.add_string('key', 'value')
    self.assertEqual(Datum({'key': 'value'}).to_msgpack(), d.to_msgpack())

    d = Datum()
    d.add_string(u'key', u'value')
    self.assertEqual(Datum({'key': 'value'}).to_msgpack(), d.to_msgpack())
def make_datum(title=None, text=None, picture=None):
    """Build a Datum from optional title, text and a picture file path.

    Each argument is used only when it is truthy.  ``picture`` is a path:
    the file is read in binary mode and its contents are stored under the
    binary key ``"img"``.
    """
    datum = Datum()

    if title:
        datum.add_string("title", title)

    if text:
        datum.add_string("text", text)

    if picture:
        with open(picture, "rb") as image_file:
            image_bytes = image_file.read()
        datum.add_binary("img", image_bytes)

    return datum
def test_str(self):
    """Verify ``str()`` of a Datum holding one string, one number and one binary value."""
    d = Datum()
    d.add_string('name', 'john')
    d.add_number('age', 20)
    d.add_binary('image', b('0101'))
    s = str(d)

    prefix = ("datum{string_values: [['name', 'john']], "
              "num_values: [['age', 20.0]], "
              "binary_values: [['image', ")
    # Two renderings of the binary value are accepted: with and without a
    # b'' prefix (presumably the Python 2 vs Python 3 bytes repr).
    accepted = [prefix + "'0101']]}", prefix + "b'0101']]}"]
    # assertIn shows the offending string on failure; assertTrue on a
    # boolean expression would only report "False is not true".
    self.assertIn(s, accepted)
def convert(self, args):
    """Turn a flat ``[key, value, key, value, ...]`` sequence into a Datum.

    A value is stored as a number when the ``float()`` conversion and
    ``add_number`` succeed; if a ``ValueError`` is raised it is stored as a
    string instead.

    Returns a ``(len(args), datum)`` tuple.
    Raises ``ValueError`` when ``args`` has an odd number of items, i.e. the
    last key has no value.
    """
    total = len(args)
    if total % 2:
        raise ValueError(
            'value for the last datum key ({0}) is missing'.format(args[-1]))

    datum = Datum()
    # Even positions are keys, odd positions are their values.
    for feat_key, feat_val in zip(args[0::2], args[1::2]):
        try:
            datum.add_number(feat_key, float(feat_val))
        except ValueError:
            datum.add_string(feat_key, feat_val)
    return (total, datum)
def convert(self, args):
    """Build a Datum from alternating key/value items in ``args``.

    Each value is first tried as a number (``float()`` + ``add_number``);
    on ``ValueError`` it falls back to ``add_string``.

    Returns ``(len(args), datum)``.
    Raises ``ValueError`` if the final key has no matching value.
    """
    count = len(args)
    if count % 2 != 0:
        message = 'value for the last datum key ({0}) is missing'
        raise ValueError(message.format(args[count - 1]))

    result = Datum()
    for pos in range(0, count, 2):
        name = args[pos]
        raw = args[pos + 1]
        try:
            result.add_number(name, float(raw))
        except ValueError:
            result.add_string(name, raw)
    return (count, result)
# 2. prepare training data with open('../kddcup.data_10_percent.txt', mode='r') as file: for line in file: duration, protocol_type, service, flag, src_bytes, dst_bytes, land, wrong_fragment, urgent, hot, num_failed_logins, logged_in, num_compromised, root_shell, su_attempted, num_root, num_file_creations, num_shells, num_access_files, num_outbound_cmds, is_host_login, is_guest_login, count, srv_count, serror_rate, srv_serror_rate, rerror_rate, srv_rerror_rate, same_srv_rate, diff_srv_rate, srv_diff_host_rate, dst_host_count, dst_host_srv_count, dst_host_same_srv_rate, dst_host_diff_srv_rate, dst_host_same_src_port_rate, dst_host_srv_diff_host_rate, dst_host_serror_rate, dst_host_srv_serror_rate, dst_host_rerror_rate, dst_host_srv_rerror_rate, label = line[:-1].split(",") datum = Datum() for (k, v) in [ ["protocol_type", protocol_type], ["service", service], ["flag", flag], ["land", land], ["logged_in", logged_in], ["is_host_login", is_host_login], ["is_guest_login", is_guest_login], ]: datum.add_string(k, v) for (k, v) in [ ["duration",float(duration)], ["src_bytes", float(src_bytes)], ["dst_bytes", float(dst_bytes)], ["wrong_fragment", float(wrong_fragment)], ["urgent", float(urgent)], ["hot", float(hot)], ["num_failed_logins", float(num_failed_logins)], ["num_compromised", float(num_compromised)], ["root_shell", float(root_shell)], ["su_attempted", float(su_attempted)], ["num_root", float(num_root)], ["num_file_creations", float(num_file_creations)], ["num_shells", float(num_shells)],
import sys, json
from jubatus.clustering import client
from jubatus.clustering import types
from jubatus.common import Datum

NAME = "clustering_compounds"

if __name__ == '__main__':
    # Push every SMILES string from the benchmark file to the clustering
    # server, then print the core members of the first four clusters.
    clustering = client.Clustering("127.0.0.1", 9199, NAME)

    # "with" guarantees the data file is closed even if a push fails.
    with open("../../bench_data/demo4096.smi") as smi_file:
        for line in smi_file:
            # Each line is "<SMILES> <identifier>"; only the SMILES part is
            # used (the identifier still carries the trailing newline).
            smiles, _compound_id = line.split(" ")
            datum = Datum()
            datum.add_string("SMILES", smiles)
            clustering.push([datum])

    center_list = clustering.get_k_center()
    members = clustering.get_core_members()

    for i in range(0, 4):
        for j, member in enumerate(members[i]):
            # Parenthesized single-argument print works on Python 2 and 3.
            print("%d, %d, %s " % (i, j, member))

    # Optional debug output of the cluster centers:
    # print("%s \n" % center_list[4])
    # for i in range(len(center_list)):
    #     print("%s \n" % center_list[i])
def make_datum():
    """Return a fixed sample Datum with one string, one number and one binary value."""
    sample = Datum()
    sample.add_string('string-key', 'str')
    sample.add_number('number-key', 1.0)
    sample.add_binary('binary-key', b'bin')
    return sample
import json, sys
import jubatus
from jubatus.common import Datum

# Maps NewsItemId -> HeadLine so results can be printed with their titles.
headlines = {}
#keys = ["HeadLine", "DateLine", "Language", "DateId", "NewsItemId", "article", "Genre1", "Genre2"]

with open(sys.argv[1], "r") as f:
    client = jubatus.Recommender("127.0.0.1", 9199, "hoge", 0)
    # json.load() no longer accepts an "encoding" keyword (removed in
    # Python 3.9); UTF-8 was already the default where it was honoured.
    feeds = json.load(f)

    article_id = None
    for feed in feeds:
        d = Datum()
        keys = list(feed.keys())
        headlines[feed["NewsItemId"]] = feed["HeadLine"]
        for key in keys:
            try:
                if key == "article":
                    # The article body is a sequence of strings.
                    d.add_string(key, " ".join(feed[key]))
                elif key == "NewsItemId":
                    # The id becomes the row key, not a feature.
                    article_id = feed[key].encode('utf-8')
                else:
                    # Every other field, including "HeadLine", is stored
                    # as a UTF-8 encoded string feature.
                    d.add_string(key, feed[key].encode('utf-8'))
            except TypeError:
                print("ignore", key, " ".join(feed[key]))
            except AttributeError:
                print("ignore", key, feed[key])
        client.update_row(article_id, d)

    # Query neighbours of the last inserted article. An empty feed file has
    # no article to query, so skip the lookup instead of failing on an
    # undefined name.
    if article_id is not None:
        res = client.similar_row_from_id(article_id, 10)
    else:
        res = []
    client.save("jubatus_hackathon")
    for r in res:
        print(r.id, r.score, headlines[r.id])
import sys, json
from jubatus.clustering import client
from jubatus.clustering import types
from jubatus.common import Datum

NAME = "clustering_compounds"

if __name__ == '__main__':
    # Ask the clustering server for the center nearest to one sample
    # SMILES string and print it.
    clustering = client.Clustering("127.0.0.1", 9199, NAME)

    datum = Datum()
    datum.add_string("SMILES", "cccccccc")

    # Parenthesized single-argument print works on Python 2 and 3; the bare
    # print statement is a syntax error on Python 3.
    print(clustering.get_nearest_center(datum))

    # Optional debug output of the cluster centers:
    # print("%s \n" % center_list[4])
    # for i in range(len(center_list)):
    #     print("%s \n" % center_list[i])
from jubatus.common import Datum
import jubatus

# Send one sample datum to the weight server and print the result of
# calc_weight for it.
client = jubatus.Weight("127.0.0.1", 9199, "")

d = Datum()
for _key, _value in (("user/age", 25), ("user/income", 1000)):
    d.add_number(_key, _value)
for _key, _value in (("user/name", "Loren"), ("message", "Hello")):
    d.add_string(_key, _value)

res = client.calc_weight(d)
print(res)
with open('../kddcup.data_10_percent.txt', mode='r') as file: for line in file: duration, protocol_type, service, flag, src_bytes, dst_bytes, land, wrong_fragment, urgent, hot, num_failed_logins, logged_in, num_compromised, root_shell, su_attempted, num_root, num_file_creations, num_shells, num_access_files, num_outbound_cmds, is_host_login, is_guest_login, count, srv_count, serror_rate, srv_serror_rate, rerror_rate, srv_rerror_rate, same_srv_rate, diff_srv_rate, srv_diff_host_rate, dst_host_count, dst_host_srv_count, dst_host_same_srv_rate, dst_host_diff_srv_rate, dst_host_same_src_port_rate, dst_host_srv_diff_host_rate, dst_host_serror_rate, dst_host_srv_serror_rate, dst_host_rerror_rate, dst_host_srv_rerror_rate, label = line[:-1].split( ",") datum = Datum() for (k, v) in [ ["protocol_type", protocol_type], ["service", service], ["flag", flag], ["land", land], ["logged_in", logged_in], ["is_host_login", is_host_login], ["is_guest_login", is_guest_login], ]: datum.add_string(k, v) for (k, v) in [ ["duration", float(duration)], ["src_bytes", float(src_bytes)], ["dst_bytes", float(dst_bytes)], ["wrong_fragment", float(wrong_fragment)], ["urgent", float(urgent)], ["hot", float(hot)], ["num_failed_logins", float(num_failed_logins)], ["num_compromised", float(num_compromised)], ["root_shell", float(root_shell)], ["su_attempted", float(su_attempted)], ["num_root", float(num_root)], ["num_file_creations",