def _weight_feature(self, msg):
    """Return the weighted feature score of `msg`.

    Runs feature extraction on the message, then accumulates the dot
    product of extracted feature values with the learned weights
    (self.feature_name / self.w are parallel lists). Features absent
    from the message contribute nothing to the score.
    """
    Feature.extract(msg)
    score = 0.0
    # Walk names and weights in lockstep instead of indexing by position.
    for name, weight in zip(self.feature_name, self.w):
        if name in msg.feature:
            score += msg.feature[name] * weight
    return score
def run_test_case():
    """Load pickled test messages, extract features for each, and print
    a human-readable dump of every message's feature vector.
    """
    # `with` guarantees the handle is closed even if loading fails.
    with open('analysis/case.pickle') as fp:
        message_list = Serialize.loads(fp.read())
    for m in message_list:
        Feature.extract(m)
        print("====")
        print(m)
        print("----")
        # Sort feature names so the dump order is deterministic.
        for f in sorted(m.feature):
            print("%s: %.5f" % (f, m.feature[f]))
        print("====")
def msg2X(self, samples):
    '''
    Convert messages to data matrix format.

    samples: dict of messages, keyed by id; each value is a message
        object with a `msg_id` and (after extraction) a `feature` dict.
    X: A dict mapping msg_id -> list of feature values, ordered by
        self.feature_name. See explanation of _G()
    '''
    X = {}
    for m in samples.values():
        Feature.extract(m)
        # Feature values in the fixed order of self.feature_name.
        X[m.msg_id] = [m.feature[name] for name in self.feature_name]
    return X
def extract_all():
    """Load the pickled workspace, run feature extraction on every
    message in it, and write the workspace back in place, printing the
    elapsed wall-clock time of each phase.
    """
    import time
    begin = time.time()
    # `with` guarantees the handle is closed even if loading fails.
    with open('analysis/workspace.pickle') as fp:
        message = Serialize.loads(fp.read())
    print("Load finish. Time elapsed: %.3f" % (time.time() - begin))

    begin = time.time()
    for m in message['message_list']:
        Feature.extract(m)
    print("Feature extraction finish. Time elapsed: %.3f" % (time.time() - begin))

    begin = time.time()
    # `with` ensures the rewritten pickle is fully flushed and closed.
    with open('analysis/workspace.pickle', 'w') as fp:
        fp.write(Serialize.dumps(message))
    print("Dump finish. Time elapsed: %.3f" % (time.time() - begin))
def __init__(self, samples, order, init_weight, learner):
    """Initialize the trainer state.

    samples: dict of messages (id -> message) used for training.
    order: ordering information passed to the kendall initializer.
    init_weight: dict of feature name -> initial weight, or None to
        derive both the feature set and the initial weights from the
        samples (deprecated path).
    learner: learning strategy object, stored as-is.
    """
    super(AutoWeight, self).__init__()
    self.samples = samples
    self.order = order
    self.learner = learner
    # Single branch on init_weight instead of testing it twice; the
    # assignment order is preserved: feature_name must be set before
    # msg2X, which reads self.feature_name.
    if init_weight is None:
        # Use one of the samples keys as sets of features to be trained.
        # This is deprecated. Whenever possible, please init your features
        # with weight in 'weights.json'
        m = samples.values()[0]
        Feature.extract(m)
        self.feature_name = m.feature.keys()
        self.X = self.msg2X(samples)
        self.w = self.initw(
            self.init_weight_kendall(self.feature_name, self.samples,
                                     self.order))
    else:
        self.feature_name = init_weight.keys()
        self.X = self.msg2X(samples)
        self.w = self.initw(init_weight)
def __init__(self, samples, order, init_weight, learner):
    """Set up AutoWeight: record inputs, fix the trained feature set,
    build the data matrix, and seed the weight vector.

    When `init_weight` is None, the feature set and initial weights are
    derived from the samples themselves (deprecated); otherwise both
    come from the provided name -> weight mapping.
    """
    super(AutoWeight, self).__init__()
    self.samples = samples
    self.order = order
    self.learner = learner
    derive_from_samples = init_weight is None
    if derive_from_samples:
        # Deprecated fallback: take the feature set from an arbitrary
        # sample. Prefer initializing features with weights from
        # 'weights.json' whenever possible.
        probe = samples.values()[0]
        Feature.extract(probe)
        self.feature_name = probe.feature.keys()
    else:
        self.feature_name = init_weight.keys()
    self.X = self.msg2X(samples)
    if derive_from_samples:
        seed = self.init_weight_kendall(self.feature_name, self.samples,
                                        self.order)
        self.w = self.initw(seed)
    else:
        self.w = self.initw(init_weight)
def export_arff(message_list, ds_name, fn_arff):
    '''
    Export message_list to Weka's arff file.

    message_list: messages to export; only single-tagged messages are
        written (multi-tagged ones are skipped for simplicity).
    ds_name: the name of data set. Shown in first line of arff file.
    fn_arff: path of the .arff file to write.
    '''
    # `with` guarantees the mapping file is closed after reading.
    with open('tag_mapping.json') as fp:
        all_tags = json.loads(fp.read())
    # Reverse mapping: tag value -> tag name; 0 means "no tag".
    all_tags_r = dict((v, k) for (k, v) in all_tags.items())
    all_tags_r[0] = "null"
    with open(fn_arff, 'w') as fp:
        fp.write("@relation %s\n\n" % (ds_name))
        fn = []
        # Write schema
        fp.write("@attribute id numeric\n")
        for fe in Feature.feature_extractors:
            for (f, t) in fe.schema.items():
                fp.write("@attribute %s %s\n" % (f, t))
                fn.append(f)
        fp.write("@attribute class {%s}\n" % (",".join(all_tags.keys())))
        # Write data
        fp.write("\n\n@data\n")
        for m in message_list:
            # Ignore multi tagged messages for simplicity
            if len(m.tags) == 1:
                i = str(m.msg_id)
                t = all_tags_r[list(m.tags.keys())[0]]
                Feature.extract(m)
                fields = [str(m.feature[f]) for f in fn]
                fields.insert(0, i)
                fields.append(t)
                fp.write(",".join(fields) + "\n")