# Mark test attribute values that match the attribute's "yes" label.
for j in range(classL.numAttr):
    currAttrY = classL.attrList[j].yesLabel
    if currAttrY == testDataList[i][j]:
        testDataArray[i][j] = 1

#print(trainDataArray)
#print(trainResultsArray)
#print(trainDataArray.sum(axis=0))
#print(testDataArray)
#print(testResultsArray)

# Start training the tree.
numLabelY = np.sum(trainResultsArray)  # number of Yes labels
numLabelN = numTrainData - numLabelY   # number of No labels

trainTree = Tree(trainDataArray, trainResultsArray, classL, testDataArray, testResultsArray)

# First output
#print('[%d+/%d-]' % (numLabelY, numLabelN))
trainTree.setLabelDist([numLabelY, numLabelN])

numAttrYArray = trainDataArray.sum(axis=0)         # number of Yes (1's) per attribute
pAttrYArray = numAttrYArray / float(numTrainData)  # probability of Yes per attribute
pLabelY = numLabelY / float(numTrainData)          # probability of Yes in the label

# Calculate the entropy of each attribute.
HAttrArray = np.zeros([classL.numAttr])
for i in range(classL.numAttr):
    HAttrArray[i] = H(pAttrYArray[i])

# Calculate the entropy of the label.
HLabel = H(pLabelY)
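# For reference, a minimal sketch of the binary entropy helper H assumed by the
# snippet above -- the project's actual definition may differ (e.g. log base or
# handling of the degenerate points).
import numpy as np

def H(p):
    """Binary entropy (in bits) of a Bernoulli(p) variable; 0 at p = 0 or p = 1."""
    if p <= 0.0 or p >= 1.0:
        return 0.0
    return -p * np.log2(p) - (1.0 - p) * np.log2(1.0 - p)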
# Imports implied by the code below; the numpy aliases (np_sum, REAL, etc.)
# follow gensim's word2vec conventions -- adjust to the project's actual layout.
import math
import operator
from copy import deepcopy

import numpy
from numpy import random, uint32, zeros, dot, exp, isnan, sum as np_sum, float32 as REAL
from scipy.spatial import distance


def test_model(topN, model, test_seq, last_action, t_size, context_window, alpha, beta,
               update_model, cbow_mean, freq_tree, update_one_time):
    all_actions_vocab = model.vocab
    top_accuracy = numpy.zeros(topN)
    current_action = last_action

    for i in range(0, len(test_seq)):
        top = []
        next_action = test_seq[i]
        if i != 0:
            current_action = test_seq[i - 1]

        context_names = []
        context_vecs = []
        test_sub_seq = []

        if i == 0:
            # The only available context is the last action of the training sequence.
            if model.context_labeling == True:
                labeled_last_action = "LabCon_" + str(last_action) + "_" + str(i)
                context_names.append(labeled_last_action)
                if labeled_last_action in model.new_vocab_syn0:
                    context_vecs.append(model.new_vocab_syn0[labeled_last_action])
                else:
                    context_vecs.append(model.syn0[model.vocab[labeled_last_action].index])
            else:
                context_names.append(last_action)
                if last_action in model.new_vocab_syn0:
                    context_vecs.append(model.new_vocab_syn0[last_action])
                else:
                    context_vecs.append(model.syn0[model.vocab[last_action].index])
            test_sub_seq.append(last_action)
        else:
            # Collect up to context_window preceding actions from the test sequence.
            for j in range(0, context_window):
                index = i - j - 1
                if index >= 0:
                    if model.context_labeling == True:
                        labeled_action = "LabCon_" + str(test_seq[index]) + "_" + str(j)
                        context_names.append(labeled_action)
                        if labeled_action in model.new_vocab_syn0:
                            context_vecs.append(model.new_vocab_syn0[labeled_action])
                        else:
                            context_vecs.append(model.syn0[model.vocab[labeled_action].index])
                    else:
                        context_names.append(test_seq[index])
                        if test_seq[index] in model.new_vocab_syn0:
                            context_vecs.append(model.new_vocab_syn0[test_seq[index]])
                        else:
                            context_vecs.append(model.syn0[model.vocab[test_seq[index]].index])
                    test_sub_seq.append(test_seq[index])
            # Pad the window with the last training action while the test prefix is short.
            if i < context_window and i != 0:
                if model.context_labeling == True:
                    labeled_last_action = "LabCon_" + str(last_action) + "_" + str(i)
                    context_names.append(labeled_last_action)
                    if labeled_last_action in model.new_vocab_syn0:
                        context_vecs.append(model.new_vocab_syn0[labeled_last_action])
                    else:
                        context_vecs.append(model.syn0[model.vocab[labeled_last_action].index])
                else:
                    context_names.append(last_action)
                    if last_action in model.new_vocab_syn0:
                        context_vecs.append(model.new_vocab_syn0[last_action])
                    else:
                        context_vecs.append(model.syn0[model.vocab[last_action].index])
                test_sub_seq.append(last_action)

        # Restore chronological order and keep copies before prediction mutates the lists.
        context_names.reverse()
        context_vecs.reverse()
        test_sub_seq.reverse()
        context_names_origin = deepcopy(context_names)
        context_vecs_origin = deepcopy(context_vecs)
        test_sub_seq_origin = deepcopy(test_sub_seq)

        (top, sorted_sim) = prediction_fcbow(topN, model, t_size, context_names,
                                             test_sub_seq, cbow_mean, freq_tree, beta)

        if next_action not in all_actions_vocab:
            # New action: give it randomly initialized input vectors and zero output vectors.
            if next_action not in model.new_vocab_syn0:
                if model.context_labeling == True:
                    for j in range(0, context_window):
                        labeled_action = "LabCon_" + str(next_action) + "_" + str(j)
                        random.seed(uint32(model.hashfxn(labeled_action + str(model.seed))))
                        new_vec = (random.rand(model.layer1_size) - 0.5) / model.layer1_size
                        model.add_new_vocab_syn0(labeled_action, new_vec)
                        new_vec = zeros((1, model.layer1_size), dtype=REAL)
                        model.add_new_vocab_syn1(labeled_action, new_vec)
                random.seed(uint32(model.hashfxn(next_action + str(model.seed))))
                new_vec = (random.rand(model.layer1_size) - 0.5) / model.layer1_size
                model.add_new_vocab_syn0(next_action, new_vec)
                new_vec = zeros((1, model.layer1_size), dtype=REAL)
                model.add_new_vocab_syn1(next_action, new_vec)

        if update_model == True:
            if update_one_time == False:
                # Rebuild the context in reverse order before updating.
                context_names = []
                c_i = len(context_names_origin) - 1
                while len(context_names) < len(context_names_origin):
                    context_names.append(context_names_origin[c_i])
                    c_i = c_i - 1
                model = update_fcbow_model(model, next_action, context_names, alpha, cbow_mean)
            else:
                model = update_fcbow_model(model, next_action, context_names_origin, alpha, cbow_mean)

        freq_tree = Tree.updateTree(freq_tree, test_sub_seq_origin, next_action)

        # Top-N accuracy: a hit at rank t counts for every cutoff k >= t.
        for t in range(0, topN):
            if top[t] == next_action:
                for k in range(t, topN):
                    top_accuracy[k] = top_accuracy[k] + 1
                break

    return top_accuracy
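# Illustration of the top-N bookkeeping at the end of test_model: a correct
# prediction at rank t (0-based) is a hit for every cutoff N >= t + 1. The
# action names below are made up for the example.
import numpy as np

top_accuracy = np.zeros(3)
top = ["open_door", "turn_on_light", "make_coffee"]  # hypothetical ranked predictions
next_action = "turn_on_light"                        # hypothetical true next action
for t, predicted in enumerate(top):
    if predicted == next_action:
        top_accuracy[t:] += 1  # credit top-(t+1) through top-N in one step
        break
print(top_accuracy)  # [0. 1. 1.] -- a rank-2 hit counts toward top-2 and top-3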
def run_model(topN, input_test_data_folder, training_dataset_size, test_user_names, num_tests,
              sample_input_file, num_features, min_word_count, num_workers, context, downsampling,
              alpha_training, alpha_test, beta, cbow_mean, update_model, tree_weight_gap,
              context_labeling, build_vocab_total, num_validation_set=0, update_one_time=False):
    is_sample_test = False
    if sample_input_file != "":
        is_sample_test = True

    means = []    # mean accuracies
    std = []      # standard deviations
    medians = []  # medians
    num_users = len(test_user_names)
    for i in range(0, topN):
        means.append([])
        std.append([])
        medians.append([])

    for t_size in training_dataset_size:
        folder_size = t_size

        # Array to store accuracy results
        test_results_accuracy = [[0 for x in range(num_tests * num_users)] for x in range(topN)]
        result_i = 0

        for user in test_user_names:
            for test_i in range(1, num_tests + 1):
                # Array for top-N accuracies
                top_accuracy = numpy.zeros(topN)

                if is_sample_test == True:
                    input_data_file = sample_input_file
                else:
                    input_data_file = (input_test_data_folder + str(folder_size) + "/"
                                       + str(user) + "-" + str(test_i) + ".csv")

                training_seq_list = []
                test_seq_list = []
                total_seq_list = []

                # Extract training and test sequence data
                (training_seq, test_seq) = io.extract_seq_from_csv(input_data_file)

                # If num_validation_set > 0, carve a validation set out of the
                # training data and use it as the test sequence.
                if num_validation_set > 0:
                    test_seq = training_seq[(t_size - num_validation_set):t_size]
                    training_seq = training_seq[0:(t_size - num_validation_set)]

                training_seq_list.append(training_seq)
                test_seq_list.append(test_seq)
                total_seq_list.append(training_seq)
                total_seq_list.append(test_seq)
                num_training_seq = len(training_seq)
                num_test_seq = len(test_seq)

                # Initialize a model
                model = init_model(training_seq_list, num_features, min_word_count, num_workers,
                                   context, downsampling, cbow_mean, context_labeling, alpha_training)

                if context_labeling == True:
                    # Generate position-labeled copies ("LabCon_<action>_<position>")
                    # of every action so each context slot gets its own vector.
                    labeled_actions = []
                    set_action = list(set(training_seq))
                    for a in set_action:
                        for label in range(context):
                            labeled_a = "LabCon_" + str(a) + "_" + str(label)
                            labeled_actions.append(labeled_a)
                    if build_vocab_total == True:
                        set_action = list(set(total_seq_list[1]))
                        for a in set_action:
                            for label in range(context):
                                labeled_a = "LabCon_" + str(a) + "_" + str(label)
                                labeled_actions.append(labeled_a)
                        total_seq_list.append(labeled_actions)
                    else:
                        training_seq_list.append(labeled_actions)

                # Build CBOW vocabulary
                if build_vocab_total == True:
                    # Build vocabulary with training and test datasets
                    model.build_vocab(total_seq_list)
                else:
                    # Build vocabulary with training dataset only
                    model.build_vocab(training_seq_list)

                # Train the model, then zero out any NaN weights.
                model.train([training_seq])
                indices_nan = isnan(model.syn0)
                model.syn0[indices_nan] = 0.0
                indices_nan = isnan(model.syn1)
                model.syn1[indices_nan] = 0.0

                # Build the initial trie that stores pattern frequencies.
                freq_tree = Tree.buildFreqTree(training_seq, context)

                # If you don't plan to train the model any further, calling
                # init_sims will make the model much more memory-efficient.
                #model.init_sims(replace=True)
                #model.save(model_name)

                # Run predictions with the model
                top_accuracy = test_model(topN, model, test_seq, training_seq[num_training_seq - 1],
                                          t_size, context, alpha_test, beta, update_model,
                                          cbow_mean, freq_tree, update_one_time)
                model.init_sims(replace=True)

                for t in range(0, topN):
                    test_results_accuracy[t][result_i] = float(top_accuracy[t]) / float(num_test_seq)
                result_i = result_i + 1

        for t in range(0, topN):
            means[t].append(numpy.mean(test_results_accuracy[t]))
            std[t].append(numpy.std(test_results_accuracy[t]))
            medians[t].append(numpy.median(test_results_accuracy[t]))

    return (means, std, medians)
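# A minimal invocation sketch for orientation. The folder layout
# "<input_test_data_folder><t_size>/<user>-<test_i>.csv" is implied by how
# run_model builds input_data_file; every concrete value below is an
# illustrative assumption, not taken from the source.
(means, std, medians) = run_model(
    topN=5,
    input_test_data_folder="data/",
    training_dataset_size=[100, 200],
    test_user_names=["userA", "userB"],
    num_tests=3,
    sample_input_file="",  # empty string -> read the per-user CSV files
    num_features=50, min_word_count=1, num_workers=1,
    context=3, downsampling=0.0,
    alpha_training=0.025, alpha_test=0.01, beta=0.5,
    cbow_mean=1, update_model=True, tree_weight_gap=0,
    context_labeling=True, build_vocab_total=False)
print(means[4])  # mean top-5 accuracy, one entry per training-set size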
def prediction_fcbow(topN, model, t_size, context_names, test_sub_seq, cbow_mean, freq_tree, beta):
    top = []
    score_dict = {}
    freq_dict = {}
    sim_dict = {}

    # Look up each context vector, preferring vectors added after training.
    context_vecs = []
    for name in context_names:
        if name in model.new_vocab_syn0:
            context_vecs.append(model.new_vocab_syn0[name])
        else:
            context_vecs.append(model.syn0[model.vocab[name].index])

    all_actions_vocab = model.vocab

    # Back-off loop: score with the full context first, then repeatedly drop the
    # oldest action and accumulate frequency/similarity over the shorter windows.
    j = -1
    while len(context_vecs) >= 1:
        j = j + 1
        if j > 0:
            del context_vecs[0]
            del test_sub_seq[0]
            if len(context_vecs) == 0:
                break

        l1 = np_sum(context_vecs, axis=0)  # 1 x layer1_size
        if context_vecs and cbow_mean:
            l1 /= len(context_vecs)

        # Score actions that were added after training (outside the original vocabulary).
        new_actions = list(model.new_vocab_syn1.keys())
        for new_action in new_actions:
            if "LabCon_" in new_action:
                continue
            l2a = model.new_vocab_syn1[new_action]
            fa = 1. / (1. + exp(-dot(l1, l2a.T)))  # propagate hidden -> output
            sim = fa

            freq = Tree.returnFreq(freq_tree, test_sub_seq, new_action)
            freq_log = 0
            if freq > 0:
                freq_log = max(0, math.log(freq))
            if freq_log > 0:
                if new_action in freq_dict:
                    freq_dict[new_action] += freq_log
                else:
                    freq_dict[new_action] = freq_log
            if new_action in sim_dict:
                sim_dict[new_action] += max(0, sim)
            else:
                sim_dict[new_action] = max(0, sim)

        # Score actions from the trained vocabulary via their hierarchical-softmax codes.
        for action in all_actions_vocab:
            if "LabCon_" in action:
                continue
            action_obj = all_actions_vocab[action]
            l2a = deepcopy(model.syn1[action_obj.point])  # 2d matrix, codelen x layer1_size
            u = dot(l1, l2a.T)
            fa = 1. / (1. + exp(u))  # propagate hidden -> output
            # Similarity decays with the squared distance between the predicted
            # bit probabilities and the action's Huffman code.
            dist = distance.sqeuclidean(action_obj.code, fa)
            sim = 1. / (1. + dist)

            freq = Tree.returnFreq(freq_tree, test_sub_seq, action)
            freq_log = 0
            if freq > 0:
                freq_log = max(0, math.log(freq))
            if freq_log > 0:
                if action in freq_dict:
                    freq_dict[action] += freq_log
                else:
                    freq_dict[action] = freq_log
            if action in sim_dict:
                sim_dict[action] += max(0, sim)
            else:
                sim_dict[action] = max(0, sim)

    # Normalize both signals to sum to 1, then blend them with weight beta.
    total_sum_freq = numpy.sum(list(freq_dict.values()))
    total_sum_sim = numpy.sum(list(sim_dict.values()))
    for f_i in freq_dict:
        if total_sum_freq != 0:
            freq_dict[f_i] = freq_dict[f_i] / float(total_sum_freq)
    for s_i in sim_dict:
        if total_sum_sim != 0:
            sim_dict[s_i] = sim_dict[s_i] / float(total_sum_sim)

    for action in sim_dict:
        if action in freq_dict:
            score_dict[action] = (beta * freq_dict[action]) + ((1.0 - beta) * sim_dict[action])
        else:
            score_dict[action] = (1.0 - beta) * sim_dict[action]

    sorted_score_list = sorted(score_dict.items(), key=operator.itemgetter(1), reverse=True)

    for i in range(0, topN):
        top.append(sorted_score_list[i][0])

    return (top, sorted_score_list)
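# Tiny worked example of the beta blend at the end of prediction_fcbow, using
# made-up actions and already-normalized scores.
beta = 0.5
freq_hat = {"make_coffee": 0.75, "open_fridge": 0.25}  # hypothetical normalized trie frequencies
sim_hat = {"make_coffee": 0.40, "open_fridge": 0.60}   # hypothetical normalized model similarities

score = {a: beta * freq_hat.get(a, 0.0) + (1.0 - beta) * sim_hat[a] for a in sim_hat}
ranked = sorted(score.items(), key=lambda kv: kv[1], reverse=True)
print(ranked)  # [('make_coffee', 0.575), ('open_fridge', 0.425)]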